From f872f5400cc01373d8e29d9c7a5296ccfaf4ccf3 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@amacapital.net>
Date: Tue, 29 Dec 2015 20:12:19 -0800
Subject: mm: Add a vm_special_mapping.fault() method

Requiring special mappings to give a list of struct pages is
inflexible: it prevents sane use of IO memory in a special
mapping, it's inefficient (it requires arch code to initialize a
list of struct pages, and it requires the mm core to walk the
entire list just to figure out how long it is), and it prevents
arch code from doing anything fancy when a special mapping fault
occurs.

Add a .fault method as an alternative to filling in a .pages
array.

Looks-OK-to: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/a26d1677c0bc7e774c33f469451a78ca31e9e6af.1451446564.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/mm_types.h | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index f8d1492a114f..c88e48a3c155 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -568,10 +568,26 @@ static inline void clear_tlb_flush_pending(struct mm_struct *mm)
 }
 #endif
 
-struct vm_special_mapping
-{
-	const char *name;
+struct vm_fault;
+
+struct vm_special_mapping {
+	const char *name;	/* The name, e.g. "[vdso]". */
+
+	/*
+	 * If .fault is not provided, this points to a
+	 * NULL-terminated array of pages that back the special mapping.
+	 *
+	 * This must not be NULL unless .fault is provided.
+	 */
 	struct page **pages;
+
+	/*
+	 * If non-NULL, then this is called to resolve page faults
+	 * on the special mapping.  If used, .pages is not checked.
+	 */
+	int (*fault)(const struct vm_special_mapping *sm,
+		     struct vm_area_struct *vma,
+		     struct vm_fault *vmf);
 };
 
 enum tlb_flush_reason {
-- 
cgit 


From 1745cbc5d0dee0749a6bc0ea8e872c5db0074061 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Tue, 29 Dec 2015 20:12:20 -0800
Subject: mm: Add vm_insert_pfn_prot()

The x86 vvar vma contains pages with differing cacheability
flags.  x86 currently implements this by manually inserting all
the ptes using (io_)remap_pfn_range when the vma is set up.

x86 wants to move to using .fault with VM_FAULT_NOPAGE to set up
the mappings as needed.  The correct API to use to insert a pfn
in .fault is vm_insert_pfn(), but vm_insert_pfn() can't override the
vma's cache mode, and the HPET page in particular needs to be
uncached despite the fact that the rest of the VMA is cached.

Add vm_insert_pfn_prot() to support varying cacheability within
the same non-COW VMA in a more sane manner.

x86 could alternatively use multiple VMAs, but that's messy,
would break CRIU, and would create unnecessary VMAs that would
waste memory.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/d2938d1eb37be7a5e4f86182db646551f11e45aa.1451446564.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/mm.h |  2 ++
 mm/memory.c        | 25 +++++++++++++++++++++++--
 2 files changed, 25 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 00bad7793788..87ef1d7730ba 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2080,6 +2080,8 @@ int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
 int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
 int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 			unsigned long pfn);
+int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
+			unsigned long pfn, pgprot_t pgprot);
 int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
 			unsigned long pfn);
 int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);
diff --git a/mm/memory.c b/mm/memory.c
index c387430f06c3..a29f0b90fc56 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1563,9 +1563,30 @@ out:
  */
 int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 			unsigned long pfn)
+{
+	return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
+}
+EXPORT_SYMBOL(vm_insert_pfn);
+
+/**
+ * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot
+ * @vma: user vma to map to
+ * @addr: target user address of this page
+ * @pfn: source kernel pfn
+ * @pgprot: pgprot flags for the inserted page
+ *
+ * This is exactly like vm_insert_pfn, except that it allows drivers
+ * to override pgprot on a per-page basis.
+ *
+ * This only makes sense for IO mappings, and it makes no sense for
+ * cow mappings.  In general, using multiple vmas is preferable;
+ * vm_insert_pfn_prot should only be used if using multiple VMAs is
+ * impractical.
+ */
+int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
+			unsigned long pfn, pgprot_t pgprot)
 {
 	int ret;
-	pgprot_t pgprot = vma->vm_page_prot;
 	/*
 	 * Technically, architectures with pte_special can avoid all these
 	 * restrictions (same for remap_pfn_range).  However we would like
@@ -1587,7 +1608,7 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 
 	return ret;
 }
-EXPORT_SYMBOL(vm_insert_pfn);
+EXPORT_SYMBOL(vm_insert_pfn_prot);
 
 int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
 			unsigned long pfn)
-- 
cgit 


From dd42ac8f02aea32661756554aace2095f7181d34 Mon Sep 17 00:00:00 2001
From: Alexander Kuleshov <kuleshovmail@gmail.com>
Date: Fri, 16 Oct 2015 15:20:53 +0600
Subject: clockevents: Rename last parameter of clocks_calc_mult_shift() to
 maxsec

The last parameter of clocks_calc_mult_shift() was renamed from minsec to
maxsec in commit 5fdade95 ("time: Rename misnamed minsec argument of
clocks_calc_mult_shift()"). Rename the corresponding parameter of
clockevents_calc_mult_shift() to match.

Signed-off-by: Alexander Kuleshov <kuleshovmail@gmail.com>
Link: http://lkml.kernel.org/r/1444987253-11018-1-git-send-email-kuleshovmail@gmail.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/clockchips.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index bdcf358dfce2..0d442e34c349 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -190,9 +190,9 @@ extern void clockevents_config_and_register(struct clock_event_device *dev,
 extern int clockevents_update_freq(struct clock_event_device *ce, u32 freq);
 
 static inline void
-clockevents_calc_mult_shift(struct clock_event_device *ce, u32 freq, u32 minsec)
+clockevents_calc_mult_shift(struct clock_event_device *ce, u32 freq, u32 maxsec)
 {
-	return clocks_calc_mult_shift(&ce->mult, &ce->shift, NSEC_PER_SEC, freq, minsec);
+	return clocks_calc_mult_shift(&ce->mult, &ce->shift, NSEC_PER_SEC, freq, maxsec);
 }
 
 extern void clockevents_suspend(void);
-- 
cgit 


From 9c808765e88efb6fa6af7e2206ef89512f1840a7 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Fri, 15 Jan 2016 17:41:08 +0000
Subject: hrtimer: Add support for CLOCK_MONOTONIC_RAW

The KVM/ARM timer implementation arms a hrtimer when a vcpu is
blocked (usually because it is waiting for an interrupt)
while its timer is going to kick in the future.

It is essential that this timer doesn't get adjusted, or the
guest will end up being woken-up at the wrong time (NTP running
on the host seems to confuse the hell out of some guests).

In order to allow this, let's add CLOCK_MONOTONIC_RAW support
to hrtimer (it is so far only supported for posix timers). It also
has the (limited) benefit of fixing de0421d53bfb ("mac80211_hwsim:
shuffle code to prepare for dynamic radios"), which already uses
this functionality without realizing it wasn't implemented (just being
lucky...).

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: Tomasz Nowicki <tn@semihalf.com>
Cc: Christoffer Dall <christoffer.dall@linaro.org>
Link: http://lkml.kernel.org/r/1452879670-16133-2-git-send-email-marc.zyngier@arm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h |  1 +
 kernel/time/hrtimer.c   | 11 ++++++++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 76dd4f0da5ca..a6d64af5e73f 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -151,6 +151,7 @@ enum  hrtimer_base_type {
 	HRTIMER_BASE_REALTIME,
 	HRTIMER_BASE_BOOTTIME,
 	HRTIMER_BASE_TAI,
+	HRTIMER_BASE_MONOTONIC_RAW,
 	HRTIMER_MAX_CLOCK_BASES,
 };
 
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 435b8850dd80..a125f222fee2 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -90,12 +90,18 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 			.clockid = CLOCK_TAI,
 			.get_time = &ktime_get_clocktai,
 		},
+		{
+			.index = HRTIMER_BASE_MONOTONIC_RAW,
+			.clockid = CLOCK_MONOTONIC_RAW,
+			.get_time = &ktime_get_raw,
+		},
 	}
 };
 
 static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
 	[CLOCK_REALTIME]	= HRTIMER_BASE_REALTIME,
 	[CLOCK_MONOTONIC]	= HRTIMER_BASE_MONOTONIC,
+	[CLOCK_MONOTONIC_RAW]	= HRTIMER_BASE_MONOTONIC_RAW,
 	[CLOCK_BOOTTIME]	= HRTIMER_BASE_BOOTTIME,
 	[CLOCK_TAI]		= HRTIMER_BASE_TAI,
 };
@@ -1268,7 +1274,10 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
 		if (!(active & 0x01))
 			continue;
 
-		basenow = ktime_add(now, base->offset);
+		if (unlikely(base->index == HRTIMER_BASE_MONOTONIC_RAW))
+			basenow = ktime_get_raw();
+		else
+			basenow = ktime_add(now, base->offset);
 
 		while ((node = timerqueue_getnext(&base->active))) {
 			struct hrtimer *timer;
-- 
cgit 


From ad315455d396a1cbcb2f9fdd687b7e1b26b789e7 Mon Sep 17 00:00:00 2001
From: Boqun Feng <boqun.feng@gmail.com>
Date: Tue, 29 Dec 2015 12:18:46 +0800
Subject: sparse: Add __private to privatize members of structs

In the C programming language, we don't have an easy way to privatize a
member of a structure. However in kernel, sometimes there is a need to
privatize a member in case of potential bugs or misuses.

Fortunately, the noderef attribute of sparse is a way to privatize a
member, as by defining a member as noderef, the address-of operator on
the member will produce a noderef pointer to that member, and if anyone
wants to dereference that kind of pointers to read or modify the member,
sparse will yell.

Based on this, the __private modifier and the related ACCESS_PRIVATE()
operation are introduced, which can help detect unintended public uses of
private members of structs. Here is an example of sparse's output when it
detects an unintended public use:

| kernel/rcu/tree.c:4453:25: warning: incorrect type in argument 1 (different modifiers)
| kernel/rcu/tree.c:4453:25:    expected struct raw_spinlock [usertype] *lock
| kernel/rcu/tree.c:4453:25:    got struct raw_spinlock [noderef] *<noident>

Also, this patch improves compiler.h a little bit by adding comments for
"#else" and "#endif".

Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/compiler.h | 12 ++++++++----
 scripts/checkpatch.pl    |  3 ++-
 2 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 00b042c49ccd..c845356952bb 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -20,12 +20,14 @@
 # define __pmem		__attribute__((noderef, address_space(5)))
 #ifdef CONFIG_SPARSE_RCU_POINTER
 # define __rcu		__attribute__((noderef, address_space(4)))
-#else
+#else /* CONFIG_SPARSE_RCU_POINTER */
 # define __rcu
-#endif
+#endif /* CONFIG_SPARSE_RCU_POINTER */
+# define __private	__attribute__((noderef))
 extern void __chk_user_ptr(const volatile void __user *);
 extern void __chk_io_ptr(const volatile void __iomem *);
-#else
+# define ACCESS_PRIVATE(p, member) (*((typeof((p)->member) __force *) &(p)->member))
+#else /* __CHECKER__ */
 # define __user
 # define __kernel
 # define __safe
@@ -44,7 +46,9 @@ extern void __chk_io_ptr(const volatile void __iomem *);
 # define __percpu
 # define __rcu
 # define __pmem
-#endif
+# define __private
+# define ACCESS_PRIVATE(p, member) ((p)->member)
+#endif /* __CHECKER__ */
 
 /* Indirect macros required for expanded argument pasting, eg. __LINE__. */
 #define ___PASTE(a,b) a##b
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 0147c91fa549..874132b26d23 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -269,7 +269,8 @@ our $Sparse	= qr{
 			__init_refok|
 			__kprobes|
 			__ref|
-			__rcu
+			__rcu|
+			__private
 		}x;
 our $InitAttributePrefix = qr{__(?:mem|cpu|dev|net_|)};
 our $InitAttributeData = qr{$InitAttributePrefix(?:initdata\b)};
-- 
cgit 


From b354286effa52da6cb1b1f16604d41ff81b8c445 Mon Sep 17 00:00:00 2001
From: Boqun Feng <boqun.feng@gmail.com>
Date: Tue, 29 Dec 2015 12:18:48 +0800
Subject: irq: Privatize irq_common_data::state_use_accessors

irq_common_data::state_use_accessors is not designed for public use.
Therefore make it private so that people who write code accessing it
directly will get blamed by sparse. Also #undef the macro
__irqd_to_state after it has been used in header files, so that the macro can't be
misused.

Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/irq.h    | 6 ++++--
 kernel/irq/internals.h | 4 ++++
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 3c1c96786248..cd14cd4a22b4 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -137,7 +137,7 @@ struct irq_domain;
  * @msi_desc:		MSI descriptor
  */
 struct irq_common_data {
-	unsigned int		state_use_accessors;
+	unsigned int		__private state_use_accessors;
 #ifdef CONFIG_NUMA
 	unsigned int		node;
 #endif
@@ -208,7 +208,7 @@ enum {
 	IRQD_FORWARDED_TO_VCPU		= (1 << 20),
 };
 
-#define __irqd_to_state(d)		((d)->common->state_use_accessors)
+#define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
 
 static inline bool irqd_is_setaffinity_pending(struct irq_data *d)
 {
@@ -299,6 +299,8 @@ static inline void irqd_clr_forwarded_to_vcpu(struct irq_data *d)
 	__irqd_to_state(d) &= ~IRQD_FORWARDED_TO_VCPU;
 }
 
+#undef __irqd_to_state
+
 static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
 {
 	return d->hwirq;
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index fcab63c66905..3d182932d2d1 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -160,6 +160,8 @@ irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags)
 	__irq_put_desc_unlock(desc, flags, false);
 }
 
+#define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
+
 /*
  * Manipulation functions for irq_data.state
  */
@@ -188,6 +190,8 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
 	return __irqd_to_state(d) & mask;
 }
 
+#undef __irqd_to_state
+
 static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc)
 {
 	__this_cpu_inc(*desc->kstat_irqs);
-- 
cgit 


From 9de630c4f264dec48e61edd871cb98b8e1b58250 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 8 Jan 2016 07:43:50 -0800
Subject: rcu: Document unique-name limitation for DEFINE_STATIC_SRCU()

SRCU uses per-CPU variables, and DEFINE_STATIC_SRCU() uses a static
per-CPU variable.  However, per-CPU variables have significant
restrictions, for example, names of per-CPU variables must be globally
unique, even if declared static.  These restrictions carry over to
DEFINE_STATIC_SRCU(), and this commit therefore documents these
restrictions.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Reported-by: kbuild test robot <fengguang.wu@intel.com>
Suggested-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Tejun Heo <tj@kernel.org>
---
 include/linux/srcu.h | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index f5f80c5643ac..dc8eb63c6568 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -99,8 +99,23 @@ void process_srcu(struct work_struct *work);
 	}
 
 /*
- * define and init a srcu struct at build time.
- * dont't call init_srcu_struct() nor cleanup_srcu_struct() on it.
+ * Define and initialize a srcu struct at build time.
+ * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it.
+ *
+ * Note that although DEFINE_STATIC_SRCU() hides the name from other
+ * files, the per-CPU variable rules nevertheless require that the
+ * chosen name be globally unique.  These rules also prohibit use of
+ * DEFINE_STATIC_SRCU() within a function.  If these rules are too
+ * restrictive, declare the srcu_struct manually.  For example, in
+ * each file:
+ *
+ *	static struct srcu_struct my_srcu;
+ *
+ * Then, before the first use of each my_srcu, manually initialize it:
+ *
+ *	init_srcu_struct(&my_srcu);
+ *
+ * See include/linux/percpu-defs.h for the rules on per-CPU variables.
  */
 #define __DEFINE_SRCU(name, is_static)					\
 	static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
-- 
cgit 


From 3500efae4410454522697c94c23fc40323c0cee9 Mon Sep 17 00:00:00 2001
From: Yang Shi <yang.shi@linaro.org>
Date: Mon, 19 Oct 2015 14:45:02 -0700
Subject: rcu: Remove rcu_user_hooks_switch

Because there are neither uses nor intended uses for the
rcu_user_hooks_switch() function that was originally intended
for nohz use, this commit removes it.

Signed-off-by: Yang Shi <yang.shi@linaro.org>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcupdate.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 14e6f47ee16f..b5d48bd56e3f 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -360,8 +360,6 @@ void rcu_user_exit(void);
 #else
 static inline void rcu_user_enter(void) { }
 static inline void rcu_user_exit(void) { }
-static inline void rcu_user_hooks_switch(struct task_struct *prev,
-					 struct task_struct *next) { }
 #endif /* CONFIG_NO_HZ_FULL */
 
 #ifdef CONFIG_RCU_NOCB_CPU
-- 
cgit 


From 0abefbaab4edbcec637e00fefcdeccb52797fe4f Mon Sep 17 00:00:00 2001
From: Qais Yousef <qais.yousef@imgtec.com>
Date: Tue, 8 Dec 2015 13:20:12 +0000
Subject: genirq: Add new IPI irqdomain flags

These flags will be used to identify an IPI domain. We have two flavours of
IPI implementations:

IRQ_DOMAIN_FLAG_IPI_PER_CPU: Each CPU has its own virq and hwirq
IRQ_DOMAIN_FLAG_IPI_SINGLE : A single virq and hwirq for all CPUs

Signed-off-by: Qais Yousef <qais.yousef@imgtec.com>
Cc: <jason@lakedaemon.net>
Cc: <marc.zyngier@arm.com>
Cc: <jiang.liu@linux.intel.com>
Cc: <ralf@linux-mips.org>
Cc: <linux-mips@linux-mips.org>
Cc: <lisa.parratt@imgtec.com>
Cc: Qais Yousef <qsyousef@gmail.com>
Link: http://lkml.kernel.org/r/1449580830-23652-2-git-send-email-qais.yousef@imgtec.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irqdomain.h | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 04579d9fbce4..9bb0a9cfc1c4 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -172,6 +172,12 @@ enum {
 	/* Core calls alloc/free recursive through the domain hierarchy. */
 	IRQ_DOMAIN_FLAG_AUTO_RECURSIVE	= (1 << 1),
 
+	/* Irq domain is an IPI domain with virq per cpu */
+	IRQ_DOMAIN_FLAG_IPI_PER_CPU	= (1 << 2),
+
+	/* Irq domain is an IPI domain with single virq */
+	IRQ_DOMAIN_FLAG_IPI_SINGLE	= (1 << 3),
+
 	/*
 	 * Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved
 	 * for implementation specific purposes and ignored by the
@@ -400,6 +406,22 @@ static inline bool irq_domain_is_hierarchy(struct irq_domain *domain)
 {
 	return domain->flags & IRQ_DOMAIN_FLAG_HIERARCHY;
 }
+
+static inline bool irq_domain_is_ipi(struct irq_domain *domain)
+{
+	return domain->flags &
+		(IRQ_DOMAIN_FLAG_IPI_PER_CPU | IRQ_DOMAIN_FLAG_IPI_SINGLE);
+}
+
+static inline bool irq_domain_is_ipi_per_cpu(struct irq_domain *domain)
+{
+	return domain->flags & IRQ_DOMAIN_FLAG_IPI_PER_CPU;
+}
+
+static inline bool irq_domain_is_ipi_single(struct irq_domain *domain)
+{
+	return domain->flags & IRQ_DOMAIN_FLAG_IPI_SINGLE;
+}
 #else	/* CONFIG_IRQ_DOMAIN_HIERARCHY */
 static inline void irq_domain_activate_irq(struct irq_data *data) { }
 static inline void irq_domain_deactivate_irq(struct irq_data *data) { }
@@ -413,6 +435,21 @@ static inline bool irq_domain_is_hierarchy(struct irq_domain *domain)
 {
 	return false;
 }
+
+static inline bool irq_domain_is_ipi(struct irq_domain *domain)
+{
+	return false;
+}
+
+static inline bool irq_domain_is_ipi_per_cpu(struct irq_domain *domain)
+{
+	return false;
+}
+
+static inline bool irq_domain_is_ipi_single(struct irq_domain *domain)
+{
+	return false;
+}
 #endif	/* CONFIG_IRQ_DOMAIN_HIERARCHY */
 
 #else /* CONFIG_IRQ_DOMAIN */
-- 
cgit 


From 29d5c8db26ad54592436508819ac617119306f96 Mon Sep 17 00:00:00 2001
From: Qais Yousef <qais.yousef@imgtec.com>
Date: Tue, 8 Dec 2015 13:20:13 +0000
Subject: genirq: Add DOMAIN_BUS_IPI

We need a way to search and match IPI domains.

Using the new enum we can use irq_find_matching_host() to do that.

Signed-off-by: Qais Yousef <qais.yousef@imgtec.com>
Cc: <jason@lakedaemon.net>
Cc: <marc.zyngier@arm.com>
Cc: <jiang.liu@linux.intel.com>
Cc: <ralf@linux-mips.org>
Cc: <linux-mips@linux-mips.org>
Cc: <lisa.parratt@imgtec.com>
Cc: Qais Yousef <qsyousef@gmail.com>
Link: http://lkml.kernel.org/r/1449580830-23652-3-git-send-email-qais.yousef@imgtec.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irqdomain.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 9bb0a9cfc1c4..130e1c3117c3 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -74,6 +74,7 @@ enum irq_domain_bus_token {
 	DOMAIN_BUS_PCI_MSI,
 	DOMAIN_BUS_PLATFORM_MSI,
 	DOMAIN_BUS_NEXUS,
+	DOMAIN_BUS_IPI,
 };
 
 /**
-- 
cgit 


From 955bfe5912e7839abcc83694f06867535487404b Mon Sep 17 00:00:00 2001
From: Qais Yousef <qais.yousef@imgtec.com>
Date: Tue, 8 Dec 2015 13:20:17 +0000
Subject: genirq: Add an extra comment about the use of affinity in
 irq_common_data

Affinity will have a dual meaning depending on the type of the irq. If it is
a normal irq, it'll have the standard affinity meaning.

If it is an IPI, it will hold the mask of the cpus to which an IPI can be
sent.

Signed-off-by: Qais Yousef <qais.yousef@imgtec.com>
Cc: <jason@lakedaemon.net>
Cc: <marc.zyngier@arm.com>
Cc: <jiang.liu@linux.intel.com>
Cc: <ralf@linux-mips.org>
Cc: <linux-mips@linux-mips.org>
Cc: <lisa.parratt@imgtec.com>
Cc: Qais Yousef <qsyousef@gmail.com>
Link: http://lkml.kernel.org/r/1449580830-23652-7-git-send-email-qais.yousef@imgtec.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 3c1c96786248..0817afd0d719 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -133,7 +133,9 @@ struct irq_domain;
  *			Use accessor functions to deal with it
  * @node:		node index useful for balancing
  * @handler_data:	per-IRQ data for the irq_chip methods
- * @affinity:		IRQ affinity on SMP
+ * @affinity:		IRQ affinity on SMP. If this is an IPI
+ *			related irq, then this is the mask of the
+ *			CPUs to which an IPI can be sent.
  * @msi_desc:		MSI descriptor
  */
 struct irq_common_data {
-- 
cgit 


From f256c9a0c54820ffef21b126f8226be2bece3dd7 Mon Sep 17 00:00:00 2001
From: Qais Yousef <qais.yousef@imgtec.com>
Date: Tue, 8 Dec 2015 13:20:16 +0000
Subject: genirq: Add ipi_offset to irq_common_data

IPIs are always assumed to be consecutively allocated, hence virqs and hwirqs
can be inferred by using CPU id as an offset. But the first cpu doesn't always
have to start at offset 0. ipi_offset stores the position of the first cpu so
that we can easily calculate the virq or hwirq of an IPI associated with a
specific cpu.

Signed-off-by: Qais Yousef <qais.yousef@imgtec.com>
Cc: <jason@lakedaemon.net>
Cc: <marc.zyngier@arm.com>
Cc: <jiang.liu@linux.intel.com>
Cc: <ralf@linux-mips.org>
Cc: <linux-mips@linux-mips.org>
Cc: <lisa.parratt@imgtec.com>
Cc: Qais Yousef <qsyousef@gmail.com>
Link: http://lkml.kernel.org/r/1449580830-23652-6-git-send-email-qais.yousef@imgtec.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 0817afd0d719..a32b47fbf874 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -137,6 +137,7 @@ struct irq_domain;
  *			related irq, then this is the mask of the
  *			CPUs to which an IPI can be sent.
  * @msi_desc:		MSI descriptor
+ * @ipi_offset:		Offset of first IPI target cpu in @affinity. Optional.
  */
 struct irq_common_data {
 	unsigned int		state_use_accessors;
@@ -146,6 +147,9 @@ struct irq_common_data {
 	void			*handler_data;
 	struct msi_desc		*msi_desc;
 	cpumask_var_t		affinity;
+#ifdef CONFIG_GENERIC_IRQ_IPI
+	unsigned int		ipi_offset;
+#endif
 };
 
 /**
-- 
cgit 


From ac0a0cd266d1f21236d5975ca6bced9b377a2a6a Mon Sep 17 00:00:00 2001
From: Qais Yousef <qais.yousef@imgtec.com>
Date: Tue, 8 Dec 2015 13:20:18 +0000
Subject: genirq: Make irq_domain_alloc_descs() non static

We will need to use this function to implement irq_reserve_ipi() later. So
make it non static and move the prototype to irqdomain.h to allow using it
outside irqdomain.c

Signed-off-by: Qais Yousef <qais.yousef@imgtec.com>
Cc: <jason@lakedaemon.net>
Cc: <marc.zyngier@arm.com>
Cc: <jiang.liu@linux.intel.com>
Cc: <ralf@linux-mips.org>
Cc: <linux-mips@linux-mips.org>
Cc: <lisa.parratt@imgtec.com>
Cc: Qais Yousef <qsyousef@gmail.com>
Link: http://lkml.kernel.org/r/1449580830-23652-8-git-send-email-qais.yousef@imgtec.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irqdomain.h | 2 ++
 kernel/irq/irqdomain.c    | 6 ++----
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 130e1c3117c3..c466cc17b8e9 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -213,6 +213,8 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
 extern struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode,
 						   enum irq_domain_bus_token bus_token);
 extern void irq_set_default_host(struct irq_domain *host);
+extern int irq_domain_alloc_descs(int virq, unsigned int nr_irqs,
+				  irq_hw_number_t hwirq, int node);
 
 static inline struct fwnode_handle *of_node_to_fwnode(struct device_node *node)
 {
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 3e56d2f03e24..86811541f073 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -23,8 +23,6 @@ static DEFINE_MUTEX(irq_domain_mutex);
 static DEFINE_MUTEX(revmap_trees_mutex);
 static struct irq_domain *irq_default_domain;
 
-static int irq_domain_alloc_descs(int virq, unsigned int nr_irqs,
-				  irq_hw_number_t hwirq, int node);
 static void irq_domain_check_hierarchy(struct irq_domain *domain);
 
 struct irqchip_fwid {
@@ -840,8 +838,8 @@ const struct irq_domain_ops irq_domain_simple_ops = {
 };
 EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
 
-static int irq_domain_alloc_descs(int virq, unsigned int cnt,
-				  irq_hw_number_t hwirq, int node)
+int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq,
+			   int node)
 {
 	unsigned int hint;
 
-- 
cgit 


From d17bf24e695290d3fe7943aca52ab48098a10653 Mon Sep 17 00:00:00 2001
From: Qais Yousef <qais.yousef@imgtec.com>
Date: Tue, 8 Dec 2015 13:20:19 +0000
Subject: genirq: Add a new generic IPI reservation code to irq core

Add a generic mechanism to dynamically allocate an IPI. Depending on the
underlying implementation this creates either a single Linux irq or a
consecutive range of Linux irqs. The Linux irq is used later to send IPIs to
other CPUs.

[ tglx: Massaged the code and removed the 'consecutive mask' restriction for
  	the single IRQ case ]

Signed-off-by: Qais Yousef <qais.yousef@imgtec.com>
Cc: <jason@lakedaemon.net>
Cc: <marc.zyngier@arm.com>
Cc: <jiang.liu@linux.intel.com>
Cc: <ralf@linux-mips.org>
Cc: <linux-mips@linux-mips.org>
Cc: <lisa.parratt@imgtec.com>
Cc: Qais Yousef <qsyousef@gmail.com>
Link: http://lkml.kernel.org/r/1449580830-23652-9-git-send-email-qais.yousef@imgtec.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h       |   3 +
 include/linux/irqdomain.h |   5 ++
 kernel/irq/Makefile       |   1 +
 kernel/irq/ipi.c          | 137 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 146 insertions(+)
 create mode 100644 kernel/irq/ipi.c

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index a32b47fbf874..95f4f66f95f3 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -940,4 +940,7 @@ static inline u32 irq_reg_readl(struct irq_chip_generic *gc,
 		return readl(gc->reg_base + reg_offset);
 }
 
+/* Contrary to Linux irqs, for hardware irqs the irq number 0 is valid */
+#define INVALID_HWIRQ	(~0UL)
+
 #endif /* _LINUX_IRQ_H */
diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index c466cc17b8e9..ed48594e96d2 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -344,6 +344,11 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d, struct device_node *ctrlr,
 			const u32 *intspec, unsigned int intsize,
 			irq_hw_number_t *out_hwirq, unsigned int *out_type);
 
+/* IPI functions */
+unsigned int irq_reserve_ipi(struct irq_domain *domain,
+			     const struct cpumask *dest);
+void irq_destroy_ipi(unsigned int irq);
+
 /* V2 interfaces to support hierarchy IRQ domains. */
 extern struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
 						unsigned int virq);
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 2fc9cbdf35b6..2ee42e95a3ce 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -8,3 +8,4 @@ obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
 obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
 obj-$(CONFIG_PM_SLEEP) += pm.o
 obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
+obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
new file mode 100644
index 000000000000..340af273429c
--- /dev/null
+++ b/kernel/irq/ipi.c
@@ -0,0 +1,137 @@
+/*
+ * linux/kernel/irq/ipi.c
+ *
+ * Copyright (C) 2015 Imagination Technologies Ltd
+ * Author: Qais Yousef <qais.yousef@imgtec.com>
+ *
+ * This file contains driver APIs to the IPI subsystem.
+ */
+
+#define pr_fmt(fmt) "genirq/ipi: " fmt
+
+#include <linux/irqdomain.h>
+#include <linux/irq.h>
+
+/**
+ * irq_reserve_ipi() - Setup an IPI to destination cpumask
+ * @domain:	IPI domain
+ * @dest:	cpumask of cpus which can receive the IPI
+ *
+ * Allocate a virq that can be used to send IPI to any CPU in dest mask.
+ *
+ * Returns the Linux irq number on success and 0 on failure.
+ */
+unsigned int irq_reserve_ipi(struct irq_domain *domain,
+			     const struct cpumask *dest)
+{
+	unsigned int nr_irqs, offset;
+	struct irq_data *data;
+	int virq, i;
+
+	if (!domain ||!irq_domain_is_ipi(domain)) {
+		pr_warn("Reservation on a non IPI domain\n");
+		return 0;
+	}
+
+	if (!cpumask_subset(dest, cpu_possible_mask)) {
+		pr_warn("Reservation is not in possible_cpu_mask\n");
+		return 0;
+	}
+
+	nr_irqs = cpumask_weight(dest);
+	if (!nr_irqs) {
+		pr_warn("Reservation for empty destination mask\n");
+		return 0;
+	}
+
+	if (irq_domain_is_ipi_single(domain)) {
+		/*
+		 * If the underlying implementation uses a single HW irq on
+		 * all cpus then we only need a single Linux irq number for
+		 * it. We have no restrictions vs. the destination mask. The
+		 * underlying implementation can deal with holes nicely.
+		 */
+		nr_irqs = 1;
+		offset = 0;
+	} else {
+		unsigned int next;
+
+		/*
+		 * The IPI requires a separate HW irq on each CPU. We require
+		 * that the destination mask is consecutive. If an
+		 * implementation needs to support holes, it can reserve
+		 * several IPI ranges.
+		 */
+		offset = cpumask_first(dest);
+		/*
+		 * Find a hole and if found look for another set bit after the
+		 * hole. For now we don't support this scenario.
+		 */
+		next = cpumask_next_zero(offset, dest);
+		if (next < nr_cpu_ids)
+			next = cpumask_next(next, dest);
+		if (next < nr_cpu_ids) {
+			pr_warn("Destination mask has holes\n");
+			return 0;
+		}
+	}
+
+	virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE);
+	if (virq <= 0) {
+		pr_warn("Can't reserve IPI, failed to alloc descs\n");
+		return 0;
+	}
+
+	virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE,
+				       (void *) dest, true);
+
+	if (virq <= 0) {
+		pr_warn("Can't reserve IPI, failed to alloc hw irqs\n");
+		goto free_descs;
+	}
+
+	for (i = 0; i < nr_irqs; i++) {
+		data = irq_get_irq_data(virq + i);
+		cpumask_copy(data->common->affinity, dest);
+		data->common->ipi_offset = offset;
+	}
+	return virq;
+
+free_descs:
+	irq_free_descs(virq, nr_irqs);
+	return 0;
+}
+
+/**
+ * irq_destroy_ipi() - unreserve an IPI that was previously allocated
+ * @irq:	linux irq number to be destroyed
+ *
+ * Return the IPIs allocated with irq_reserve_ipi() to the system, destroying
+ * all virqs associated with them. A zero or unknown @irq is silently ignored.
+ */
+void irq_destroy_ipi(unsigned int irq)
+{
+	struct irq_data *data = irq_get_irq_data(irq);
+	struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL;
+	struct irq_domain *domain;
+	unsigned int nr_irqs;
+
+	if (!irq || !data || !ipimask)
+		return;
+
+	domain = data->domain;
+	if (WARN_ON(domain == NULL))
+		return;
+
+	if (!irq_domain_is_ipi(domain)) {
+		pr_warn("Trying to destroy a non IPI domain!\n");
+		return;
+	}
+
+	if (irq_domain_is_ipi_per_cpu(domain))
+		nr_irqs = cpumask_weight(ipimask);
+	else
+		nr_irqs = 1;
+
+	irq_domain_free_irqs(irq, nr_irqs);
+}
-- 
cgit 


From f9bce791ae2a1a10a965b30427f5507c1a77669f Mon Sep 17 00:00:00 2001
From: Qais Yousef <qais.yousef@imgtec.com>
Date: Tue, 8 Dec 2015 13:20:20 +0000
Subject: genirq: Add a new function to get IPI reverse mapping

When dealing with coprocessors we need to find out the actual hwirq values to
pass on to the firmware so that it knows what it needs to use to receive IPIs
from and send IPIs to Linux cpus.

[ tglx: Fixed the single hwirq IPI case. The hardware irq number does not
  	change due to the cpu number ]

Signed-off-by: Qais Yousef <qais.yousef@imgtec.com>
Cc: <jason@lakedaemon.net>
Cc: <marc.zyngier@arm.com>
Cc: <jiang.liu@linux.intel.com>
Cc: <ralf@linux-mips.org>
Cc: <linux-mips@linux-mips.org>
Cc: <lisa.parratt@imgtec.com>
Cc: Qais Yousef <qsyousef@gmail.com>
Link: http://lkml.kernel.org/r/1449580830-23652-10-git-send-email-qais.yousef@imgtec.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h |  1 +
 kernel/irq/ipi.c    | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 95f4f66f95f3..10273dce058a 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -942,5 +942,6 @@ static inline u32 irq_reg_readl(struct irq_chip_generic *gc,
 
 /* Contrary to Linux irqs, for hardware irqs the irq number 0 is valid */
 #define INVALID_HWIRQ	(~0UL)
+irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu);
 
 #endif /* _LINUX_IRQ_H */
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index 340af273429c..6f34f2930bc0 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -135,3 +135,37 @@ void irq_destroy_ipi(unsigned int irq)
 
 	irq_domain_free_irqs(irq, nr_irqs);
 }
+
+/**
+ * ipi_get_hwirq - Get the hwirq associated with an IPI to a cpu
+ * @irq:	linux irq number
+ * @cpu:	the target cpu, valid ids are 0 .. nr_cpu_ids - 1
+ *
+ * When dealing with coprocessors IPI, we need to inform the coprocessor of
+ * the hwirq it needs to use to receive and send IPIs.
+ *
+ * Returns hwirq value on success and INVALID_HWIRQ on failure.
+ */
+irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu)
+{
+	struct irq_data *data = irq_get_irq_data(irq);
+	struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL;
+
+	if (!data || !ipimask || cpu >= nr_cpu_ids)
+		return INVALID_HWIRQ;
+
+	if (!cpumask_test_cpu(cpu, ipimask))
+		return INVALID_HWIRQ;
+
+	/*
+	 * Get the real hardware irq number if the underlying implementation
+	 * uses a seperate irq per cpu. If the underlying implementation uses
+	 * a single hardware irq for all cpus then the IPI send mechanism
+	 * needs to take care of this.
+	 */
+	if (irq_domain_is_ipi_per_cpu(data->domain))
+		data = irq_get_irq_data(irq + cpu - data->common->ipi_offset);
+
+	return data ? irqd_to_hwirq(data) : INVALID_HWIRQ;
+}
+EXPORT_SYMBOL_GPL(ipi_get_hwirq);
-- 
cgit 


From 34dc1ae101018dbb50e1d04e88aa89052802a7db Mon Sep 17 00:00:00 2001
From: Qais Yousef <qais.yousef@imgtec.com>
Date: Tue, 8 Dec 2015 13:20:21 +0000
Subject: genirq: Add send_ipi callbacks to irq_chip

Introduce the new callbacks which can be used by the core code to implement a
generic IPI send mechanism.

Signed-off-by: Qais Yousef <qais.yousef@imgtec.com>
Cc: <jason@lakedaemon.net>
Cc: <marc.zyngier@arm.com>
Cc: <jiang.liu@linux.intel.com>
Cc: <ralf@linux-mips.org>
Cc: <linux-mips@linux-mips.org>
Cc: <lisa.parratt@imgtec.com>
Cc: Qais Yousef <qsyousef@gmail.com>
Link: http://lkml.kernel.org/r/1449580830-23652-11-git-send-email-qais.yousef@imgtec.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 10273dce058a..3b3a5b817469 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -347,6 +347,8 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
  * @irq_get_irqchip_state:	return the internal state of an interrupt
  * @irq_set_irqchip_state:	set the internal state of a interrupt
  * @irq_set_vcpu_affinity:	optional to target a vCPU in a virtual machine
+ * @ipi_send_single:	send a single IPI to destination cpus
+ * @ipi_send_mask:	send an IPI to destination cpus in cpumask
  * @flags:		chip specific flags
  */
 struct irq_chip {
@@ -391,6 +393,9 @@ struct irq_chip {
 
 	int		(*irq_set_vcpu_affinity)(struct irq_data *data, void *vcpu_info);
 
+	void		(*ipi_send_single)(struct irq_data *data, unsigned int cpu);
+	void		(*ipi_send_mask)(struct irq_data *data, const struct cpumask *dest);
+
 	unsigned long	flags;
 };
 
-- 
cgit 


From 3b8e29a82dd16c1f2061e0b955a71cd36eeb061b Mon Sep 17 00:00:00 2001
From: Qais Yousef <qais.yousef@imgtec.com>
Date: Tue, 8 Dec 2015 13:20:22 +0000
Subject: genirq: Implement ipi_send_mask/single()

Add APIs to send IPIs from driver and arch code.

We have different functions because we allow architecture code to cache the
irq descriptor to avoid lookups. Driver code has to use the irq number and is
subject to more restrictive checks.

[ tglx: Polish the implementation ]

Signed-off-by: Qais Yousef <qais.yousef@imgtec.com>
Cc: <jason@lakedaemon.net>
Cc: <marc.zyngier@arm.com>
Cc: <jiang.liu@linux.intel.com>
Cc: <ralf@linux-mips.org>
Cc: <linux-mips@linux-mips.org>
Cc: <lisa.parratt@imgtec.com>
Cc: Qais Yousef <qsyousef@gmail.com>
Link: http://lkml.kernel.org/r/1449580830-23652-12-git-send-email-qais.yousef@imgtec.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h |   4 ++
 kernel/irq/ipi.c    | 157 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 160 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 3b3a5b817469..d5ebd94822d2 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -948,5 +948,9 @@ static inline u32 irq_reg_readl(struct irq_chip_generic *gc,
 /* Contrary to Linux irqs, for hardware irqs the irq number 0 is valid */
 #define INVALID_HWIRQ	(~0UL)
 irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu);
+int __ipi_send_single(struct irq_desc *desc, unsigned int cpu);
+int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest);
+int ipi_send_single(unsigned int virq, unsigned int cpu);
+int ipi_send_mask(unsigned int virq, const struct cpumask *dest);
 
 #endif /* _LINUX_IRQ_H */
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index 6f34f2930bc0..c37f34b00a11 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -161,7 +161,7 @@ irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu)
 	 * Get the real hardware irq number if the underlying implementation
 	 * uses a seperate irq per cpu. If the underlying implementation uses
 	 * a single hardware irq for all cpus then the IPI send mechanism
-	 * needs to take care of this.
+	 * needs to take care of the cpu destinations.
 	 */
 	if (irq_domain_is_ipi_per_cpu(data->domain))
 		data = irq_get_irq_data(irq + cpu - data->common->ipi_offset);
@@ -169,3 +169,158 @@ irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu)
 	return data ? irqd_to_hwirq(data) : INVALID_HWIRQ;
 }
 EXPORT_SYMBOL_GPL(ipi_get_hwirq);
+
+static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data,
+			   const struct cpumask *dest, unsigned int cpu)
+{
+	struct cpumask *ipimask = irq_data_get_affinity_mask(data);
+
+	if (!chip || !ipimask)
+		return -EINVAL;
+
+	if (!chip->ipi_send_single && !chip->ipi_send_mask)
+		return -EINVAL;	/* need at least one send callback */
+
+	if (cpu >= nr_cpu_ids)	/* valid cpu ids are 0 .. nr_cpu_ids - 1 */
+		return -EINVAL;
+
+	if (dest) {
+		if (!cpumask_subset(dest, ipimask))
+			return -EINVAL;
+	} else {
+		if (!cpumask_test_cpu(cpu, ipimask))
+			return -EINVAL;
+	}
+	return 0;
+}
+
+/**
+ * __ipi_send_single - send an IPI to a target Linux SMP CPU
+ * @desc:	pointer to irq_desc of the IRQ
+ * @cpu:	destination CPU, must be in the destination mask passed to
+ *		irq_reserve_ipi()
+ *
+ * This function is for architecture or core code to speed up IPI sending. Not
+ * usable from driver code.
+ *
+ * Returns zero on success and negative error number on failure.
+ */
+int __ipi_send_single(struct irq_desc *desc, unsigned int cpu)
+{
+	struct irq_data *data = irq_desc_get_irq_data(desc);
+	struct irq_chip *chip = irq_data_get_irq_chip(data);
+
+#ifdef DEBUG
+	/*
+	 * Minimise the overhead by omitting the checks for Linux SMP IPIs.
+	 * Since the callers should be arch or core code which is generally
+	 * trusted, only check for errors when debugging.
+	 */
+	if (WARN_ON_ONCE(ipi_send_verify(chip, data, NULL, cpu)))
+		return -EINVAL;
+#endif
+	if (!chip->ipi_send_single) {
+		chip->ipi_send_mask(data, cpumask_of(cpu));
+		return 0;
+	}
+
+	/* FIXME: Store this information in irqdata flags */
+	if (irq_domain_is_ipi_per_cpu(data->domain) &&
+	    cpu != data->common->ipi_offset) {
+		/* use the correct data for that cpu */
+		unsigned irq = data->irq + cpu - data->common->ipi_offset;
+
+		data = irq_get_irq_data(irq);
+	}
+	chip->ipi_send_single(data, cpu);
+	return 0;
+}
+
+/**
+ * __ipi_send_mask - send an IPI to target Linux SMP CPU(s)
+ * @desc:	pointer to irq_desc of the IRQ
+ * @dest:	dest CPU(s), must be a subset of the mask passed to
+ *		irq_reserve_ipi()
+ *
+ * This function is for architecture or core code to speed up IPI sending. Not
+ * usable from driver code.
+ *
+ * Returns zero on success and negative error number on failure.
+ */
+int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest)
+{
+	struct irq_data *data = irq_desc_get_irq_data(desc);
+	struct irq_chip *chip = irq_data_get_irq_chip(data);
+	unsigned int cpu;
+
+#ifdef DEBUG
+	/*
+	 * Minimise the overhead by omitting the checks for Linux SMP IPIs.
+	 * Since the callers should be arch or core code which is generally
+	 * trusted, only check for errors when debugging.
+	 */
+	if (WARN_ON_ONCE(ipi_send_verify(chip, data, dest, 0)))
+		return -EINVAL;
+#endif
+	if (chip->ipi_send_mask) {
+		chip->ipi_send_mask(data, dest);
+		return 0;
+	}
+
+	if (irq_domain_is_ipi_per_cpu(data->domain)) {
+		unsigned int base = data->irq;
+
+		for_each_cpu(cpu, dest) {
+			unsigned irq = base + cpu - data->common->ipi_offset;
+
+			data = irq_get_irq_data(irq);
+			chip->ipi_send_single(data, cpu);
+		}
+	} else {
+		for_each_cpu(cpu, dest)
+			chip->ipi_send_single(data, cpu);
+	}
+	return 0;
+}
+
+/**
+ * ipi_send_single - Send an IPI to a single CPU
+ * @virq:	linux irq number from irq_reserve_ipi()
+ * @cpu:	destination CPU, must be in the destination mask passed to
+ *		irq_reserve_ipi()
+ *
+ * Returns zero on success and negative error number on failure.
+ */
+int ipi_send_single(unsigned int virq, unsigned int cpu)
+{
+	struct irq_desc *desc = irq_to_desc(virq);
+	struct irq_data *data = desc ? irq_desc_get_irq_data(desc) : NULL;
+	struct irq_chip *chip = data ? irq_data_get_irq_chip(data) : NULL;
+
+	if (WARN_ON_ONCE(ipi_send_verify(chip, data, NULL, cpu)))
+		return -EINVAL;
+
+	return __ipi_send_single(desc, cpu);
+}
+EXPORT_SYMBOL_GPL(ipi_send_single);
+
+/**
+ * ipi_send_mask - Send an IPI to target CPU(s)
+ * @virq:	linux irq number from irq_reserve_ipi()
+ * @dest:	dest CPU(s), must be a subset of the mask passed to
+ *		irq_reserve_ipi()
+ *
+ * Returns zero on success and negative error number on failure.
+ */
+int ipi_send_mask(unsigned int virq, const struct cpumask *dest)
+{
+	struct irq_desc *desc = irq_to_desc(virq);
+	struct irq_data *data = desc ? irq_desc_get_irq_data(desc) : NULL;
+	struct irq_chip *chip = data ? irq_data_get_irq_chip(data) : NULL;
+
+	if (WARN_ON_ONCE(ipi_send_verify(chip, data, dest, 0)))
+		return -EINVAL;
+
+	return __ipi_send_mask(desc, dest);
+}
+EXPORT_SYMBOL_GPL(ipi_send_mask);
-- 
cgit 


From bb11cff327e54179c13446c4022ed4ed7d4871c7 Mon Sep 17 00:00:00 2001
From: Qais Yousef <qais.yousef@imgtec.com>
Date: Tue, 8 Dec 2015 13:20:28 +0000
Subject: MIPS: Make smp CMP, CPS and MT use the new generic IPI functions

This commit does several things to avoid breaking bisectability.

	1- Remove IPI init code from irqchip/mips-gic
	2- Implement the new irqchip->send_ipi() in irqchip/mips-gic
	3- Select GENERIC_IRQ_IPI Kconfig symbol for MIPS_GIC
	4- Change MIPS SMP to use the generic IPI implementation

Only the SMP variants that use GIC were converted as it's the only irqchip that
will have the support for generic IPI for now.

Signed-off-by: Qais Yousef <qais.yousef@imgtec.com>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Cc: <jason@lakedaemon.net>
Cc: <marc.zyngier@arm.com>
Cc: <jiang.liu@linux.intel.com>
Cc: <linux-mips@linux-mips.org>
Cc: <lisa.parratt@imgtec.com>
Cc: Qais Yousef <qsyousef@gmail.com>
Link: http://lkml.kernel.org/r/1449580830-23652-18-git-send-email-qais.yousef@imgtec.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/mips/include/asm/smp-ops.h  |  5 ++-
 arch/mips/kernel/smp-cmp.c       |  4 +-
 arch/mips/kernel/smp-cps.c       |  4 +-
 arch/mips/kernel/smp-mt.c        |  2 +-
 drivers/irqchip/Kconfig          |  1 +
 drivers/irqchip/irq-mips-gic.c   | 86 +++-------------------------------------
 include/linux/irqchip/mips-gic.h |  3 --
 7 files changed, 14 insertions(+), 91 deletions(-)

(limited to 'include/linux')

diff --git a/arch/mips/include/asm/smp-ops.h b/arch/mips/include/asm/smp-ops.h
index 6ba1fb8b11e2..db7c322f057f 100644
--- a/arch/mips/include/asm/smp-ops.h
+++ b/arch/mips/include/asm/smp-ops.h
@@ -44,8 +44,9 @@ static inline void plat_smp_setup(void)
 	mp_ops->smp_setup();
 }
 
-extern void gic_send_ipi_single(int cpu, unsigned int action);
-extern void gic_send_ipi_mask(const struct cpumask *mask, unsigned int action);
+extern void mips_smp_send_ipi_single(int cpu, unsigned int action);
+extern void mips_smp_send_ipi_mask(const struct cpumask *mask,
+				      unsigned int action);
 
 #else /* !CONFIG_SMP */
 
diff --git a/arch/mips/kernel/smp-cmp.c b/arch/mips/kernel/smp-cmp.c
index d5e0f949dc48..76923349b4fe 100644
--- a/arch/mips/kernel/smp-cmp.c
+++ b/arch/mips/kernel/smp-cmp.c
@@ -149,8 +149,8 @@ void __init cmp_prepare_cpus(unsigned int max_cpus)
 }
 
 struct plat_smp_ops cmp_smp_ops = {
-	.send_ipi_single	= gic_send_ipi_single,
-	.send_ipi_mask		= gic_send_ipi_mask,
+	.send_ipi_single	= mips_smp_send_ipi_single,
+	.send_ipi_mask		= mips_smp_send_ipi_mask,
 	.init_secondary		= cmp_init_secondary,
 	.smp_finish		= cmp_smp_finish,
 	.boot_secondary		= cmp_boot_secondary,
diff --git a/arch/mips/kernel/smp-cps.c b/arch/mips/kernel/smp-cps.c
index 2ad4e4c96d61..253e1409338c 100644
--- a/arch/mips/kernel/smp-cps.c
+++ b/arch/mips/kernel/smp-cps.c
@@ -472,8 +472,8 @@ static struct plat_smp_ops cps_smp_ops = {
 	.boot_secondary		= cps_boot_secondary,
 	.init_secondary		= cps_init_secondary,
 	.smp_finish		= cps_smp_finish,
-	.send_ipi_single	= gic_send_ipi_single,
-	.send_ipi_mask		= gic_send_ipi_mask,
+	.send_ipi_single	= mips_smp_send_ipi_single,
+	.send_ipi_mask		= mips_smp_send_ipi_mask,
 #ifdef CONFIG_HOTPLUG_CPU
 	.cpu_disable		= cps_cpu_disable,
 	.cpu_die		= cps_cpu_die,
diff --git a/arch/mips/kernel/smp-mt.c b/arch/mips/kernel/smp-mt.c
index 86311a164ef1..4f9570a57e8d 100644
--- a/arch/mips/kernel/smp-mt.c
+++ b/arch/mips/kernel/smp-mt.c
@@ -121,7 +121,7 @@ static void vsmp_send_ipi_single(int cpu, unsigned int action)
 
 #ifdef CONFIG_MIPS_GIC
 	if (gic_present) {
-		gic_send_ipi_single(cpu, action);
+		mips_smp_send_ipi_single(cpu, action);
 		return;
 	}
 #endif
diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig
index 71e648adc3fd..00bbec6eca0b 100644
--- a/drivers/irqchip/Kconfig
+++ b/drivers/irqchip/Kconfig
@@ -209,6 +209,7 @@ config KEYSTONE_IRQ
 
 config MIPS_GIC
 	bool
+	select GENERIC_IRQ_IPI
 	select IRQ_DOMAIN_HIERARCHY
 	select MIPS_CM
 
diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c
index 83395bf834c8..37831a557bcb 100644
--- a/drivers/irqchip/irq-mips-gic.c
+++ b/drivers/irqchip/irq-mips-gic.c
@@ -280,9 +280,11 @@ static void gic_bind_eic_interrupt(int irq, int set)
 		  GIC_VPE_EIC_SS(irq), set);
 }
 
-void gic_send_ipi(unsigned int intr)
+static void gic_send_ipi(struct irq_data *d, unsigned int cpu)
 {
-	gic_write(GIC_REG(SHARED, GIC_SH_WEDGE), GIC_SH_WEDGE_SET(intr));
+	irq_hw_number_t hwirq = GIC_HWIRQ_TO_SHARED(irqd_to_hwirq(d));
+
+	gic_write(GIC_REG(SHARED, GIC_SH_WEDGE), GIC_SH_WEDGE_SET(hwirq));
 }
 
 int gic_get_c0_compare_int(void)
@@ -495,6 +497,7 @@ static struct irq_chip gic_edge_irq_controller = {
 #ifdef CONFIG_SMP
 	.irq_set_affinity	=	gic_set_affinity,
 #endif
+	.ipi_send_single	=	gic_send_ipi,
 };
 
 static void gic_handle_local_int(bool chained)
@@ -588,83 +591,6 @@ static void gic_irq_dispatch(struct irq_desc *desc)
 	gic_handle_shared_int(true);
 }
 
-#ifdef CONFIG_MIPS_GIC_IPI
-static int gic_resched_int_base;
-static int gic_call_int_base;
-
-unsigned int plat_ipi_resched_int_xlate(unsigned int cpu)
-{
-	return gic_resched_int_base + cpu;
-}
-
-unsigned int plat_ipi_call_int_xlate(unsigned int cpu)
-{
-	return gic_call_int_base + cpu;
-}
-
-static irqreturn_t ipi_resched_interrupt(int irq, void *dev_id)
-{
-	scheduler_ipi();
-
-	return IRQ_HANDLED;
-}
-
-static irqreturn_t ipi_call_interrupt(int irq, void *dev_id)
-{
-	generic_smp_call_function_interrupt();
-
-	return IRQ_HANDLED;
-}
-
-static struct irqaction irq_resched = {
-	.handler	= ipi_resched_interrupt,
-	.flags		= IRQF_PERCPU,
-	.name		= "IPI resched"
-};
-
-static struct irqaction irq_call = {
-	.handler	= ipi_call_interrupt,
-	.flags		= IRQF_PERCPU,
-	.name		= "IPI call"
-};
-
-static __init void gic_ipi_init_one(unsigned int intr, int cpu,
-				    struct irqaction *action)
-{
-	int virq = irq_create_mapping(gic_irq_domain,
-				      GIC_SHARED_TO_HWIRQ(intr));
-	int i;
-
-	gic_map_to_vpe(intr, mips_cm_vp_id(cpu));
-	for (i = 0; i < NR_CPUS; i++)
-		clear_bit(intr, pcpu_masks[i].pcpu_mask);
-	set_bit(intr, pcpu_masks[cpu].pcpu_mask);
-
-	irq_set_irq_type(virq, IRQ_TYPE_EDGE_RISING);
-
-	irq_set_handler(virq, handle_percpu_irq);
-	setup_irq(virq, action);
-}
-
-static __init void gic_ipi_init(void)
-{
-	int i;
-
-	/* Use last 2 * NR_CPUS interrupts as IPIs */
-	gic_resched_int_base = gic_shared_intrs - nr_cpu_ids;
-	gic_call_int_base = gic_resched_int_base - nr_cpu_ids;
-
-	for (i = 0; i < nr_cpu_ids; i++) {
-		gic_ipi_init_one(gic_call_int_base + i, i, &irq_call);
-		gic_ipi_init_one(gic_resched_int_base + i, i, &irq_resched);
-	}
-}
-#else
-static inline void gic_ipi_init(void)
-{
-}
-#endif
-
 static void __init gic_basic_init(void)
 {
 	unsigned int i;
@@ -1105,8 +1031,6 @@ static void __init __gic_init(unsigned long gic_base_addr,
 	bitmap_set(ipi_resrv, gic_shared_intrs - 2 * gic_vpes, 2 * gic_vpes);
 
 	gic_basic_init();
-
-	gic_ipi_init();
 }
 
 void __init gic_init(unsigned long gic_base_addr,
diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h
index ce824db48d64..80f89e4a29ac 100644
--- a/include/linux/irqchip/mips-gic.h
+++ b/include/linux/irqchip/mips-gic.h
@@ -261,9 +261,6 @@ extern void gic_write_compare(cycle_t cnt);
 extern void gic_write_cpu_compare(cycle_t cnt, int cpu);
 extern void gic_start_count(void);
 extern void gic_stop_count(void);
-extern void gic_send_ipi(unsigned int intr);
-extern unsigned int plat_ipi_call_int_xlate(unsigned int);
-extern unsigned int plat_ipi_resched_int_xlate(unsigned int);
 extern int gic_get_c0_compare_int(void);
 extern int gic_get_c0_perfcount_int(void);
 extern int gic_get_c0_fdc_int(void);
-- 
cgit 


From 7aca0c07207385cca76025cc85231519935722b9 Mon Sep 17 00:00:00 2001
From: Alexander Kuleshov <kuleshovmail@gmail.com>
Date: Fri, 26 Feb 2016 19:14:13 -0800
Subject: clocksource: Introduce clocksource_freq2mult()

The clocksource_khz2mult() and clocksource_hz2mult() share similar
code which calculates a mult from the given frequency. Both implementations
differ only in the value of the frequency. This patch introduces the
clocksource_freq2mult() helper with a generic implementation of the
mult calculation to prevent code duplication.

Signed-off-by: Alexander Kuleshov <kuleshovmail@gmail.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Link: http://lkml.kernel.org/r/1456542854-22104-2-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/clocksource.h | 45 +++++++++++++++++++--------------------------
 1 file changed, 19 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 6013021a3b39..a307bf62974f 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -118,6 +118,23 @@ struct clocksource {
 /* simplify initialization of mask field */
 #define CLOCKSOURCE_MASK(bits) (cycle_t)((bits) < 64 ? ((1ULL<<(bits))-1) : -1)
 
+static inline u32 clocksource_freq2mult(u32 freq, u32 shift_constant, u64 from)
+{
+	/*  freq = cyc/from
+	 *  mult/2^shift  = ns/cyc
+	 *  mult = ns/cyc * 2^shift
+	 *  mult = from/freq * 2^shift
+	 *  mult = from * 2^shift / freq
+	 *  mult = (from<<shift) / freq
+	 */
+	u64 tmp = ((u64)from) << shift_constant;
+
+	tmp += freq/2; /* round for do_div */
+	do_div(tmp, freq);
+
+	return (u32)tmp;
+}
+
 /**
  * clocksource_khz2mult - calculates mult from khz and shift
  * @khz:		Clocksource frequency in KHz
@@ -128,19 +145,7 @@ struct clocksource {
  */
 static inline u32 clocksource_khz2mult(u32 khz, u32 shift_constant)
 {
-	/*  khz = cyc/(Million ns)
-	 *  mult/2^shift  = ns/cyc
-	 *  mult = ns/cyc * 2^shift
-	 *  mult = 1Million/khz * 2^shift
-	 *  mult = 1000000 * 2^shift / khz
-	 *  mult = (1000000<<shift) / khz
-	 */
-	u64 tmp = ((u64)1000000) << shift_constant;
-
-	tmp += khz/2; /* round for do_div */
-	do_div(tmp, khz);
-
-	return (u32)tmp;
+	return clocksource_freq2mult(khz, shift_constant, NSEC_PER_MSEC);
 }
 
 /**
@@ -154,19 +159,7 @@ static inline u32 clocksource_khz2mult(u32 khz, u32 shift_constant)
  */
 static inline u32 clocksource_hz2mult(u32 hz, u32 shift_constant)
 {
-	/*  hz = cyc/(Billion ns)
-	 *  mult/2^shift  = ns/cyc
-	 *  mult = ns/cyc * 2^shift
-	 *  mult = 1Billion/hz * 2^shift
-	 *  mult = 1000000000 * 2^shift / hz
-	 *  mult = (1000000000<<shift) / hz
-	 */
-	u64 tmp = ((u64)1000000000) << shift_constant;
-
-	tmp += hz/2; /* round for do_div */
-	do_div(tmp, hz);
-
-	return (u32)tmp;
+	return clocksource_freq2mult(hz, shift_constant, NSEC_PER_SEC);
 }
 
 /**
-- 
cgit 


From 090e77c391dd983c8945b8e2e16d09f378d2e334 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 26 Feb 2016 18:43:23 +0000
Subject: cpu/hotplug: Restructure FROZEN state handling

There are only a few callbacks which really care about FROZEN
vs. !FROZEN. No need to have extra states for this.

Publish the frozen state in an extra variable which is updated under
the hotplug lock and let the interested users deal with it w/o
imposing those extra state checks on everyone.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arch@vger.kernel.org
Cc: Rik van Riel <riel@redhat.com>
Cc: Rafael Wysocki <rafael.j.wysocki@intel.com>
Cc: "Srivatsa S. Bhat" <srivatsa@mit.edu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Turner <pjt@google.com>
Link: http://lkml.kernel.org/r/20160226182340.334912357@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/cpu.h |  2 ++
 kernel/cpu.c        | 69 ++++++++++++++++++++++-------------------------------
 2 files changed, 31 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index d2ca8c38f9c4..f2fb54938ee6 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -118,6 +118,7 @@ enum {
 
 
 #ifdef CONFIG_SMP
+extern bool cpuhp_tasks_frozen;
 /* Need to know about CPUs going up/down? */
 #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE)
 #define cpu_notifier(fn, pri) {					\
@@ -177,6 +178,7 @@ extern void cpu_maps_update_done(void);
 #define cpu_notifier_register_done	cpu_maps_update_done
 
 #else	/* CONFIG_SMP */
+#define cpuhp_tasks_frozen	0
 
 #define cpu_notifier(fn, pri)	do { (void)(fn); } while (0)
 #define __cpu_notifier(fn, pri)	do { (void)(fn); } while (0)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 5b9d39633ce9..41a6cb85c0af 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -29,6 +29,8 @@
 #ifdef CONFIG_SMP
 /* Serializes the updates to cpu_online_mask, cpu_present_mask */
 static DEFINE_MUTEX(cpu_add_remove_lock);
+bool cpuhp_tasks_frozen;
+EXPORT_SYMBOL_GPL(cpuhp_tasks_frozen);
 
 /*
  * The following two APIs (cpu_maps_update_begin/done) must be used when
@@ -207,27 +209,30 @@ int __register_cpu_notifier(struct notifier_block *nb)
 	return raw_notifier_chain_register(&cpu_chain, nb);
 }
 
-static int __cpu_notify(unsigned long val, void *v, int nr_to_call,
+static int __cpu_notify(unsigned long val, unsigned int cpu, int nr_to_call,
 			int *nr_calls)
 {
+	unsigned long mod = cpuhp_tasks_frozen ? CPU_TASKS_FROZEN : 0;
+	void *hcpu = (void *)(long)cpu;
+
 	int ret;
 
-	ret = __raw_notifier_call_chain(&cpu_chain, val, v, nr_to_call,
+	ret = __raw_notifier_call_chain(&cpu_chain, val | mod, hcpu, nr_to_call,
 					nr_calls);
 
 	return notifier_to_errno(ret);
 }
 
-static int cpu_notify(unsigned long val, void *v)
+static int cpu_notify(unsigned long val, unsigned int cpu)
 {
-	return __cpu_notify(val, v, -1, NULL);
+	return __cpu_notify(val, cpu, -1, NULL);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
 
-static void cpu_notify_nofail(unsigned long val, void *v)
+static void cpu_notify_nofail(unsigned long val, unsigned int cpu)
 {
-	BUG_ON(cpu_notify(val, v));
+	BUG_ON(cpu_notify(val, cpu));
 }
 EXPORT_SYMBOL(register_cpu_notifier);
 EXPORT_SYMBOL(__register_cpu_notifier);
@@ -311,27 +316,21 @@ static inline void check_for_tasks(int dead_cpu)
 	read_unlock(&tasklist_lock);
 }
 
-struct take_cpu_down_param {
-	unsigned long mod;
-	void *hcpu;
-};
-
 /* Take this CPU down. */
 static int take_cpu_down(void *_param)
 {
-	struct take_cpu_down_param *param = _param;
-	int err;
+	int err, cpu = smp_processor_id();
 
 	/* Ensure this CPU doesn't handle any more interrupts. */
 	err = __cpu_disable();
 	if (err < 0)
 		return err;
 
-	cpu_notify(CPU_DYING | param->mod, param->hcpu);
+	cpu_notify(CPU_DYING, cpu);
 	/* Give up timekeeping duties */
 	tick_handover_do_timer();
 	/* Park the stopper thread */
-	stop_machine_park((long)param->hcpu);
+	stop_machine_park(cpu);
 	return 0;
 }
 
@@ -339,12 +338,6 @@ static int take_cpu_down(void *_param)
 static int _cpu_down(unsigned int cpu, int tasks_frozen)
 {
 	int err, nr_calls = 0;
-	void *hcpu = (void *)(long)cpu;
-	unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
-	struct take_cpu_down_param tcd_param = {
-		.mod = mod,
-		.hcpu = hcpu,
-	};
 
 	if (num_online_cpus() == 1)
 		return -EBUSY;
@@ -354,10 +347,12 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
 
 	cpu_hotplug_begin();
 
-	err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
+	cpuhp_tasks_frozen = tasks_frozen;
+
+	err = __cpu_notify(CPU_DOWN_PREPARE, cpu, -1, &nr_calls);
 	if (err) {
 		nr_calls--;
-		__cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
+		__cpu_notify(CPU_DOWN_FAILED, cpu, nr_calls, NULL);
 		pr_warn("%s: attempt to take down CPU %u failed\n",
 			__func__, cpu);
 		goto out_release;
@@ -389,10 +384,10 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
 	/*
 	 * So now all preempt/rcu users must observe !cpu_active().
 	 */
-	err = stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
+	err = stop_machine(take_cpu_down, NULL, cpumask_of(cpu));
 	if (err) {
 		/* CPU didn't die: tell everyone.  Can't complain. */
-		cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
+		cpu_notify_nofail(CPU_DOWN_FAILED, cpu);
 		irq_unlock_sparse();
 		goto out_release;
 	}
@@ -419,14 +414,14 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
 
 	/* CPU is completely dead: tell everyone.  Too late to complain. */
 	tick_cleanup_dead_cpu(cpu);
-	cpu_notify_nofail(CPU_DEAD | mod, hcpu);
+	cpu_notify_nofail(CPU_DEAD, cpu);
 
 	check_for_tasks(cpu);
 
 out_release:
 	cpu_hotplug_done();
 	if (!err)
-		cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu);
+		cpu_notify_nofail(CPU_POST_DEAD, cpu);
 	return err;
 }
 
@@ -485,10 +480,8 @@ void smpboot_thread_init(void)
 /* Requires cpu_add_remove_lock to be held */
 static int _cpu_up(unsigned int cpu, int tasks_frozen)
 {
-	int ret, nr_calls = 0;
-	void *hcpu = (void *)(long)cpu;
-	unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
 	struct task_struct *idle;
+	int ret, nr_calls = 0;
 
 	cpu_hotplug_begin();
 
@@ -507,7 +500,9 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen)
 	if (ret)
 		goto out;
 
-	ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
+	cpuhp_tasks_frozen = tasks_frozen;
+
+	ret = __cpu_notify(CPU_UP_PREPARE, cpu, -1, &nr_calls);
 	if (ret) {
 		nr_calls--;
 		pr_warn("%s: attempt to bring up CPU %u failed\n",
@@ -523,11 +518,11 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen)
 	BUG_ON(!cpu_online(cpu));
 
 	/* Now call notifier in preparation. */
-	cpu_notify(CPU_ONLINE | mod, hcpu);
+	cpu_notify(CPU_ONLINE, cpu);
 
 out_notify:
 	if (ret != 0)
-		__cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
+		__cpu_notify(CPU_UP_CANCELED, cpu, nr_calls, NULL);
 out:
 	cpu_hotplug_done();
 
@@ -719,13 +714,7 @@ core_initcall(cpu_hotplug_pm_sync_init);
  */
 void notify_cpu_starting(unsigned int cpu)
 {
-	unsigned long val = CPU_STARTING;
-
-#ifdef CONFIG_PM_SLEEP_SMP
-	if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus))
-		val = CPU_STARTING_FROZEN;
-#endif /* CONFIG_PM_SLEEP_SMP */
-	cpu_notify(val, (void *)(long)cpu);
+	cpu_notify(CPU_STARTING, cpu);
 }
 
 #endif /* CONFIG_SMP */
-- 
cgit 


From cff7d378d3fdbb53db9b6e2578b14855f401cd41 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 26 Feb 2016 18:43:28 +0000
Subject: cpu/hotplug: Convert to a state machine for the control processor

Move the split out steps into a callback array and let the cpu_up/down
code iterate through the array functions. For now most of the
callbacks are asymmetric to resemble the current hotplug maze.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arch@vger.kernel.org
Cc: Rik van Riel <riel@redhat.com>
Cc: Rafael Wysocki <rafael.j.wysocki@intel.com>
Cc: "Srivatsa S. Bhat" <srivatsa@mit.edu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Turner <pjt@google.com>
Link: http://lkml.kernel.org/r/20160226182340.671816690@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/cpu.h        |   9 +-
 include/linux/cpuhotplug.h |  13 +++
 init/main.c                |  15 +---
 kernel/cpu.c               | 202 +++++++++++++++++++++++++++++++++++++++------
 4 files changed, 194 insertions(+), 45 deletions(-)
 create mode 100644 include/linux/cpuhotplug.h

(limited to 'include/linux')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index f2fb54938ee6..78989f20420f 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -16,6 +16,7 @@
 #include <linux/node.h>
 #include <linux/compiler.h>
 #include <linux/cpumask.h>
+#include <linux/cpuhotplug.h>
 
 struct device;
 struct device_node;
@@ -27,6 +28,9 @@ struct cpu {
 	struct device dev;
 };
 
+extern void boot_cpu_init(void);
+extern void boot_cpu_state_init(void);
+
 extern int register_cpu(struct cpu *cpu, int num);
 extern struct device *get_cpu_device(unsigned cpu);
 extern bool cpu_is_hotpluggable(unsigned cpu);
@@ -267,11 +271,6 @@ static inline int disable_nonboot_cpus(void) { return 0; }
 static inline void enable_nonboot_cpus(void) {}
 #endif /* !CONFIG_PM_SLEEP_SMP */
 
-enum cpuhp_state {
-	CPUHP_OFFLINE,
-	CPUHP_ONLINE,
-};
-
 void cpu_startup_entry(enum cpuhp_state state);
 
 void cpu_idle_poll_ctrl(bool enable);
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
new file mode 100644
index 000000000000..d55c9e64acd7
--- /dev/null
+++ b/include/linux/cpuhotplug.h
@@ -0,0 +1,13 @@
+#ifndef __CPUHOTPLUG_H
+#define __CPUHOTPLUG_H
+
+enum cpuhp_state {
+	CPUHP_OFFLINE,
+	CPUHP_CREATE_THREADS,
+	CPUHP_NOTIFY_PREPARE,
+	CPUHP_BRINGUP_CPU,
+	CPUHP_NOTIFY_ONLINE,
+	CPUHP_ONLINE,
+};
+
+#endif
diff --git a/init/main.c b/init/main.c
index 58c9e374704b..c2ea72362ee3 100644
--- a/init/main.c
+++ b/init/main.c
@@ -452,20 +452,6 @@ void __init parse_early_param(void)
 	done = 1;
 }
 
-/*
- *	Activate the first processor.
- */
-
-static void __init boot_cpu_init(void)
-{
-	int cpu = smp_processor_id();
-	/* Mark the boot cpu "present", "online" etc for SMP and UP case */
-	set_cpu_online(cpu, true);
-	set_cpu_active(cpu, true);
-	set_cpu_present(cpu, true);
-	set_cpu_possible(cpu, true);
-}
-
 void __init __weak smp_setup_processor_id(void)
 {
 }
@@ -530,6 +516,7 @@ asmlinkage __visible void __init start_kernel(void)
 	setup_command_line(command_line);
 	setup_nr_cpu_ids();
 	setup_per_cpu_areas();
+	boot_cpu_state_init();
 	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
 
 	build_all_zonelists(NULL, NULL);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 0b5d2596f3ec..301851974b8d 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -22,10 +22,64 @@
 #include <linux/lockdep.h>
 #include <linux/tick.h>
 #include <linux/irq.h>
+
 #include <trace/events/power.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/cpuhp.h>
 
 #include "smpboot.h"
 
+/**
+ * cpuhp_cpu_state - Per cpu hotplug state storage
+ * @state:	The current cpu state
+ * @target:	The target state
+ */
+struct cpuhp_cpu_state {
+	enum cpuhp_state	state;
+	enum cpuhp_state	target;
+};
+
+static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state);
+
+/**
+ * cpuhp_step - Hotplug state machine step
+ * @name:	Name of the step
+ * @startup:	Startup function of the step
+ * @teardown:	Teardown function of the step
+ * @skip_onerr:	Do not invoke the functions on error rollback
+ *		Will go away once the notifiers	are gone
+ */
+struct cpuhp_step {
+	const char	*name;
+	int		(*startup)(unsigned int cpu);
+	int		(*teardown)(unsigned int cpu);
+	bool		skip_onerr;
+};
+
+static struct cpuhp_step cpuhp_bp_states[];
+
+/**
+ * cpuhp_invoke_callback _ Invoke the callbacks for a given state
+ * @cpu:	The cpu for which the callback should be invoked
+ * @step:	The step in the state machine
+ * @cb:		The callback function to invoke
+ *
+ * Called from cpu hotplug and from the state register machinery
+ */
+static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state step,
+				 int (*cb)(unsigned int))
+{
+	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+	int ret = 0;
+
+	if (cb) {
+		trace_cpuhp_enter(cpu, st->target, step, cb);
+		ret = cb(cpu);
+		trace_cpuhp_exit(cpu, st->state, step, ret);
+	}
+	return ret;
+}
+
 #ifdef CONFIG_SMP
 /* Serializes the updates to cpu_online_mask, cpu_present_mask */
 static DEFINE_MUTEX(cpu_add_remove_lock);
@@ -454,10 +508,29 @@ static int notify_dead(unsigned int cpu)
 	return 0;
 }
 
+#else
+#define notify_down_prepare	NULL
+#define takedown_cpu		NULL
+#define notify_dead		NULL
+#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st)
+{
+	for (st->state++; st->state < st->target; st->state++) {
+		struct cpuhp_step *step = cpuhp_bp_states + st->state;
+
+		if (!step->skip_onerr)
+			cpuhp_invoke_callback(cpu, st->state, step->startup);
+	}
+}
+
 /* Requires cpu_add_remove_lock to be held */
 static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 {
-	int err;
+	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+	int prev_state, ret = 0;
+	bool hasdied = false;
 
 	if (num_online_cpus() == 1)
 		return -EBUSY;
@@ -469,20 +542,25 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 
 	cpuhp_tasks_frozen = tasks_frozen;
 
-	err = notify_down_prepare(cpu);
-	if (err)
-		goto out_release;
-	err = takedown_cpu(cpu);
-	if (err)
-		goto out_release;
+	prev_state = st->state;
+	st->target = CPUHP_OFFLINE;
+	for (; st->state > st->target; st->state--) {
+		struct cpuhp_step *step = cpuhp_bp_states + st->state;
 
-	notify_dead(cpu);
+		ret = cpuhp_invoke_callback(cpu, st->state, step->teardown);
+		if (ret) {
+			st->target = prev_state;
+			undo_cpu_down(cpu, st);
+			break;
+		}
+	}
+	hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE;
 
-out_release:
 	cpu_hotplug_done();
-	if (!err)
+	/* This post dead nonsense must die */
+	if (!ret && hasdied)
 		cpu_notify_nofail(CPU_POST_DEAD, cpu);
-	return err;
+	return ret;
 }
 
 int cpu_down(unsigned int cpu)
@@ -537,11 +615,22 @@ void smpboot_thread_init(void)
 	register_cpu_notifier(&smpboot_thread_notifier);
 }
 
+static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
+{
+	for (st->state--; st->state > st->target; st->state--) {
+		struct cpuhp_step *step = cpuhp_bp_states + st->state;
+
+		if (!step->skip_onerr)
+			cpuhp_invoke_callback(cpu, st->state, step->teardown);
+	}
+}
+
 /* Requires cpu_add_remove_lock to be held */
 static int _cpu_up(unsigned int cpu, int tasks_frozen)
 {
+	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
 	struct task_struct *idle;
-	int ret;
+	int prev_state, ret = 0;
 
 	cpu_hotplug_begin();
 
@@ -550,6 +639,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen)
 		goto out;
 	}
 
+	/* Let it fail before we try to bring the cpu up */
 	idle = idle_thread_get(cpu);
 	if (IS_ERR(idle)) {
 		ret = PTR_ERR(idle);
@@ -558,22 +648,22 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen)
 
 	cpuhp_tasks_frozen = tasks_frozen;
 
-	ret = smpboot_create_threads(cpu);
-	if (ret)
-		goto out;
-
-	ret = notify_prepare(cpu);
-	if (ret)
-		goto out;
-
-	ret = bringup_cpu(cpu);
-	if (ret)
-		goto out;
-
-	notify_online(cpu);
+	prev_state = st->state;
+	st->target = CPUHP_ONLINE;
+	while (st->state < st->target) {
+		struct cpuhp_step *step;
+
+		st->state++;
+		step = cpuhp_bp_states + st->state;
+		ret = cpuhp_invoke_callback(cpu, st->state, step->startup);
+		if (ret) {
+			st->target = prev_state;
+			undo_cpu_up(cpu, st);
+			break;
+		}
+	}
 out:
 	cpu_hotplug_done();
-
 	return ret;
 }
 
@@ -767,6 +857,44 @@ void notify_cpu_starting(unsigned int cpu)
 
 #endif /* CONFIG_SMP */
 
+/* Boot processor state steps */
+static struct cpuhp_step cpuhp_bp_states[] = {
+	[CPUHP_OFFLINE] = {
+		.name			= "offline",
+		.startup		= NULL,
+		.teardown		= NULL,
+	},
+#ifdef CONFIG_SMP
+	[CPUHP_CREATE_THREADS]= {
+		.name			= "threads:create",
+		.startup		= smpboot_create_threads,
+		.teardown		= NULL,
+	},
+	[CPUHP_NOTIFY_PREPARE] = {
+		.name			= "notify:prepare",
+		.startup		= notify_prepare,
+		.teardown		= notify_dead,
+		.skip_onerr		= true,
+	},
+	[CPUHP_BRINGUP_CPU] = {
+		.name			= "cpu:bringup",
+		.startup		= bringup_cpu,
+		.teardown		= takedown_cpu,
+		.skip_onerr		= true,
+	},
+	[CPUHP_NOTIFY_ONLINE] = {
+		.name			= "notify:online",
+		.startup		= notify_online,
+		.teardown		= notify_down_prepare,
+	},
+#endif
+	[CPUHP_ONLINE] = {
+		.name			= "online",
+		.startup		= NULL,
+		.teardown		= NULL,
+	},
+};
+
 /*
  * cpu_bit_bitmap[] is a special, "compressed" data structure that
  * represents all NR_CPUS bits binary values of 1<<nr.
@@ -826,3 +954,25 @@ void init_cpu_online(const struct cpumask *src)
 {
 	cpumask_copy(&__cpu_online_mask, src);
 }
+
+/*
+ * Activate the first processor.
+ */
+void __init boot_cpu_init(void)
+{
+	int cpu = smp_processor_id();
+
+	/* Mark the boot cpu "present", "online" etc for SMP and UP case */
+	set_cpu_online(cpu, true);
+	set_cpu_active(cpu, true);
+	set_cpu_present(cpu, true);
+	set_cpu_possible(cpu, true);
+}
+
+/*
+ * Must be called _AFTER_ setting up the per_cpu areas
+ */
+void __init boot_cpu_state_init(void)
+{
+	per_cpu_ptr(&cpuhp_state, smp_processor_id())->state = CPUHP_ONLINE;
+}
-- 
cgit 


From 4baa0afc6719cbf36a1e08551484a641926b3fd1 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 26 Feb 2016 18:43:29 +0000
Subject: cpu/hotplug: Convert the hotplugged cpu work to a state machine

Move the functions which need to run on the hotplugged processor into
a state machine array and let the code iterate through these functions.

In a later step, this will grow synchronization points between the
control processor and the hotplugged processor, so we can move the
various architecture implementations of the synchronizations to the
core.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arch@vger.kernel.org
Cc: Rik van Riel <riel@redhat.com>
Cc: Rafael Wysocki <rafael.j.wysocki@intel.com>
Cc: "Srivatsa S. Bhat" <srivatsa@mit.edu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Turner <pjt@google.com>
Link: http://lkml.kernel.org/r/20160226182340.770651526@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/cpuhotplug.h |  4 +++
 kernel/cpu.c               | 81 +++++++++++++++++++++++++++++++++++++---------
 2 files changed, 70 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index d55c9e64acd7..d9303cca83d3 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -6,6 +6,10 @@ enum cpuhp_state {
 	CPUHP_CREATE_THREADS,
 	CPUHP_NOTIFY_PREPARE,
 	CPUHP_BRINGUP_CPU,
+	CPUHP_AP_OFFLINE,
+	CPUHP_AP_NOTIFY_STARTING,
+	CPUHP_AP_ONLINE,
+	CPUHP_TEARDOWN_CPU,
 	CPUHP_NOTIFY_ONLINE,
 	CPUHP_ONLINE,
 };
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 301851974b8d..797723e81756 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -57,6 +57,7 @@ struct cpuhp_step {
 };
 
 static struct cpuhp_step cpuhp_bp_states[];
+static struct cpuhp_step cpuhp_ap_states[];
 
 /**
  * cpuhp_invoke_callback _ Invoke the callbacks for a given state
@@ -304,6 +305,12 @@ static int notify_online(unsigned int cpu)
 	return 0;
 }
 
+static int notify_starting(unsigned int cpu)
+{
+	cpu_notify(CPU_STARTING, cpu);
+	return 0;
+}
+
 static int bringup_cpu(unsigned int cpu)
 {
 	struct task_struct *idle = idle_thread_get(cpu);
@@ -421,9 +428,17 @@ static int notify_down_prepare(unsigned int cpu)
 	return err;
 }
 
+static int notify_dying(unsigned int cpu)
+{
+	cpu_notify(CPU_DYING, cpu);
+	return 0;
+}
+
 /* Take this CPU down. */
 static int take_cpu_down(void *_param)
 {
+	struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
+	enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE);
 	int err, cpu = smp_processor_id();
 
 	/* Ensure this CPU doesn't handle any more interrupts. */
@@ -431,7 +446,12 @@ static int take_cpu_down(void *_param)
 	if (err < 0)
 		return err;
 
-	cpu_notify(CPU_DYING, cpu);
+	/* Invoke the former CPU_DYING callbacks */
+	for (; st->state > target; st->state--) {
+		struct cpuhp_step *step = cpuhp_ap_states + st->state;
+
+		cpuhp_invoke_callback(cpu, st->state, step->teardown);
+	}
 	/* Give up timekeeping duties */
 	tick_handover_do_timer();
 	/* Park the stopper thread */
@@ -512,6 +532,7 @@ static int notify_dead(unsigned int cpu)
 #define notify_down_prepare	NULL
 #define takedown_cpu		NULL
 #define notify_dead		NULL
+#define notify_dying		NULL
 #endif
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -615,6 +636,28 @@ void smpboot_thread_init(void)
 	register_cpu_notifier(&smpboot_thread_notifier);
 }
 
+/**
+ * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
+ * @cpu: cpu that just started
+ *
+ * This function calls the cpu_chain notifiers with CPU_STARTING.
+ * It must be called by the arch code on the new cpu, before the new cpu
+ * enables interrupts and before the "boot" cpu returns from __cpu_up().
+ */
+void notify_cpu_starting(unsigned int cpu)
+{
+	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+	enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
+
+	while (st->state < target) {
+		struct cpuhp_step *step;
+
+		st->state++;
+		step = cpuhp_ap_states + st->state;
+		cpuhp_invoke_callback(cpu, st->state, step->startup);
+	}
+}
+
 static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
 {
 	for (st->state--; st->state > st->target; st->state--) {
@@ -842,19 +885,6 @@ core_initcall(cpu_hotplug_pm_sync_init);
 
 #endif /* CONFIG_PM_SLEEP_SMP */
 
-/**
- * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
- * @cpu: cpu that just started
- *
- * This function calls the cpu_chain notifiers with CPU_STARTING.
- * It must be called by the arch code on the new cpu, before the new cpu
- * enables interrupts and before the "boot" cpu returns from __cpu_up().
- */
-void notify_cpu_starting(unsigned int cpu)
-{
-	cpu_notify(CPU_STARTING, cpu);
-}
-
 #endif /* CONFIG_SMP */
 
 /* Boot processor state steps */
@@ -879,8 +909,12 @@ static struct cpuhp_step cpuhp_bp_states[] = {
 	[CPUHP_BRINGUP_CPU] = {
 		.name			= "cpu:bringup",
 		.startup		= bringup_cpu,
+		.teardown		= NULL,
+	},
+	[CPUHP_TEARDOWN_CPU] = {
+		.name			= "cpu:teardown",
+		.startup		= NULL,
 		.teardown		= takedown_cpu,
-		.skip_onerr		= true,
 	},
 	[CPUHP_NOTIFY_ONLINE] = {
 		.name			= "notify:online",
@@ -895,6 +929,23 @@ static struct cpuhp_step cpuhp_bp_states[] = {
 	},
 };
 
+/* Application processor state steps */
+static struct cpuhp_step cpuhp_ap_states[] = {
+#ifdef CONFIG_SMP
+	[CPUHP_AP_NOTIFY_STARTING] = {
+		.name			= "notify:starting",
+		.startup		= notify_starting,
+		.teardown		= notify_dying,
+		.skip_onerr		= true,
+	},
+#endif
+	[CPUHP_ONLINE] = {
+		.name			= "online",
+		.startup		= NULL,
+		.teardown		= NULL,
+	},
+};
+
 /*
  * cpu_bit_bitmap[] is a special, "compressed" data structure that
  * represents all NR_CPUS bits binary values of 1<<nr.
-- 
cgit 


From 5b7aa87e0482be768486e0c2277aa4122487eb9d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 26 Feb 2016 18:43:33 +0000
Subject: cpu/hotplug: Implement setup/removal interface

Implement functions which allow setting up and removing hotplug state
callbacks.

The default behaviour for setup is to call the startup function for this state
for (or on) all cpus which have a hotplug state >= the installed state.

The default behaviour for removal is to call the teardown function for this
state for (or on) all cpus which have a hotplug state >= the installed state.

This includes rollback to the previous state in case of failure.

A special state is CPUHP_ONLINE_DYN. It's for dynamically registering a hotplug
callback pair. This is for drivers which have no dependencies, to avoid the
need to allocate CPUHP states for each of them.

For both setup and remove, helper functions are provided which prevent the
core from issuing the callbacks. This simplifies the conversion of existing
hotplug notifiers.

[ Dynamic registering implemented by Sebastian Siewior ]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arch@vger.kernel.org
Cc: Rik van Riel <riel@redhat.com>
Cc: Rafael Wysocki <rafael.j.wysocki@intel.com>
Cc: "Srivatsa S. Bhat" <srivatsa@mit.edu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Turner <pjt@google.com>
Link: http://lkml.kernel.org/r/20160226182341.103464877@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/cpuhotplug.h |  67 ++++++++++++++
 kernel/cpu.c               | 224 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 291 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index d9303cca83d3..29935261b26d 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -11,7 +11,74 @@ enum cpuhp_state {
 	CPUHP_AP_ONLINE,
 	CPUHP_TEARDOWN_CPU,
 	CPUHP_NOTIFY_ONLINE,
+	CPUHP_ONLINE_DYN,
+	CPUHP_ONLINE_DYN_END		= CPUHP_ONLINE_DYN + 30,
 	CPUHP_ONLINE,
 };
 
+int __cpuhp_setup_state(enum cpuhp_state state,	const char *name, bool invoke,
+			int (*startup)(unsigned int cpu),
+			int (*teardown)(unsigned int cpu));
+
+/**
+ * cpuhp_setup_state - Setup hotplug state callbacks with calling the callbacks
+ * @state:	The state for which the calls are installed
+ * @name:	Name of the callback (will be used in debug output)
+ * @startup:	startup callback function
+ * @teardown:	teardown callback function
+ *
+ * Installs the callback functions and invokes the startup callback on
+ * the present cpus which have already reached the @state.
+ */
+static inline int cpuhp_setup_state(enum cpuhp_state state,
+				    const char *name,
+				    int (*startup)(unsigned int cpu),
+				    int (*teardown)(unsigned int cpu))
+{
+	return __cpuhp_setup_state(state, name, true, startup, teardown);
+}
+
+/**
+ * cpuhp_setup_state_nocalls - Setup hotplug state callbacks without calling the
+ *			       callbacks
+ * @state:	The state for which the calls are installed
+ * @name:	Name of the callback.
+ * @startup:	startup callback function
+ * @teardown:	teardown callback function
+ *
+ * Same as @cpuhp_setup_state except that no calls are invoked
+ * during installation of this callback. NOP if SMP=n or HOTPLUG_CPU=n.
+ */
+static inline int cpuhp_setup_state_nocalls(enum cpuhp_state state,
+					    const char *name,
+					    int (*startup)(unsigned int cpu),
+					    int (*teardown)(unsigned int cpu))
+{
+	return __cpuhp_setup_state(state, name, false, startup, teardown);
+}
+
+void __cpuhp_remove_state(enum cpuhp_state state, bool invoke);
+
+/**
+ * cpuhp_remove_state - Remove hotplug state callbacks and invoke the teardown
+ * @state:	The state for which the calls are removed
+ *
+ * Removes the callback functions and invokes the teardown callback on
+ * the present cpus which have already reached the @state.
+ */
+static inline void cpuhp_remove_state(enum cpuhp_state state)
+{
+	__cpuhp_remove_state(state, true);
+}
+
+/**
+ * cpuhp_remove_state_nocalls - Remove hotplug state callbacks without invoking
+ *				teardown
+ * @state:	The state for which the calls are removed
+ */
+static inline void cpuhp_remove_state_nocalls(enum cpuhp_state state)
+{
+	__cpuhp_remove_state(state, false);
+}
+
 #endif
diff --git a/kernel/cpu.c b/kernel/cpu.c
index be9335da82f1..b5eacb9587af 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -973,6 +973,14 @@ static struct cpuhp_step cpuhp_ap_states[] = {
 	},
 };
 
+/* Sanity check for callbacks */
+static int cpuhp_cb_check(enum cpuhp_state state)
+{
+	if (state <= CPUHP_OFFLINE || state >= CPUHP_ONLINE)
+		return -EINVAL;
+	return 0;
+}
+
 static bool cpuhp_is_ap_state(enum cpuhp_state state)
 {
 	return (state > CPUHP_AP_OFFLINE && state < CPUHP_AP_ONLINE);
@@ -986,6 +994,222 @@ static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
 	return sp + state;
 }
 
+static void cpuhp_store_callbacks(enum cpuhp_state state,
+				  const char *name,
+				  int (*startup)(unsigned int cpu),
+				  int (*teardown)(unsigned int cpu))
+{
+	/* (Un)Install the callbacks for further cpu hotplug operations */
+	struct cpuhp_step *sp;
+
+	mutex_lock(&cpuhp_state_mutex);
+	sp = cpuhp_get_step(state);
+	sp->startup = startup;
+	sp->teardown = teardown;
+	sp->name = name;
+	mutex_unlock(&cpuhp_state_mutex);
+}
+
+static void *cpuhp_get_teardown_cb(enum cpuhp_state state)
+{
+	return cpuhp_get_step(state)->teardown;
+}
+
+/* Helper function to run callback on the target cpu */
+static void cpuhp_on_cpu_cb(void *__cb)
+{
+	int (*cb)(unsigned int cpu) = __cb;
+
+	BUG_ON(cb(smp_processor_id()));
+}
+
+/*
+ * Call the startup/teardown function for a step either on the AP or
+ * on the current CPU.
+ */
+static int cpuhp_issue_call(int cpu, enum cpuhp_state state,
+			    int (*cb)(unsigned int), bool bringup)
+{
+	int ret;
+
+	if (!cb)
+		return 0;
+
+	/*
+	 * This invokes the callback directly for now. In a later step we
+	 * convert that to use cpuhp_invoke_callback().
+	 */
+	if (cpuhp_is_ap_state(state)) {
+		/*
+		 * Note, that a function called on the AP is not
+		 * allowed to fail.
+		 */
+		if (cpu_online(cpu))
+			smp_call_function_single(cpu, cpuhp_on_cpu_cb, cb, 1);
+		return 0;
+	}
+
+	/*
+	 * The non AP bound callbacks can fail on bringup. On teardown
+	 * e.g. module removal we crash for now.
+	 */
+	ret = cb(cpu);
+	BUG_ON(ret && !bringup);
+	return ret;
+}
+
+/*
+ * Called from __cpuhp_setup_state on a recoverable failure.
+ *
+ * Note: The teardown callbacks for rollback are not allowed to fail!
+ */
+static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state,
+				   int (*teardown)(unsigned int cpu))
+{
+	int cpu;
+
+	if (!teardown)
+		return;
+
+	/* Roll back the already executed steps on the other cpus */
+	for_each_present_cpu(cpu) {
+		struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+		int cpustate = st->state;
+
+		if (cpu >= failedcpu)
+			break;
+
+		/* Did we invoke the startup call on that cpu ? */
+		if (cpustate >= state)
+			cpuhp_issue_call(cpu, state, teardown, false);
+	}
+}
+
+/*
+ * Returns a free slot for dynamic assignment of the Online state. The states
+ * are protected by the cpuhp_state_mutex and an empty slot is identified
+ * by having no name assigned.
+ */
+static int cpuhp_reserve_state(enum cpuhp_state state)
+{
+	enum cpuhp_state i;
+
+	mutex_lock(&cpuhp_state_mutex);
+	for (i = CPUHP_ONLINE_DYN; i <= CPUHP_ONLINE_DYN_END; i++) {
+		if (cpuhp_bp_states[i].name)
+			continue;
+
+		cpuhp_bp_states[i].name = "Reserved";
+		mutex_unlock(&cpuhp_state_mutex);
+		return i;
+	}
+	mutex_unlock(&cpuhp_state_mutex);
+	WARN(1, "No more dynamic states available for CPU hotplug\n");
+	return -ENOSPC;
+}
+
+/**
+ * __cpuhp_setup_state - Setup the callbacks for an hotplug machine state
+ * @state:	The state to setup
+ * @invoke:	If true, the startup function is invoked for cpus where
+ *		cpu state >= @state
+ * @startup:	startup callback function
+ * @teardown:	teardown callback function
+ *
+ * Returns 0 if successful, otherwise a proper error code
+ */
+int __cpuhp_setup_state(enum cpuhp_state state,
+			const char *name, bool invoke,
+			int (*startup)(unsigned int cpu),
+			int (*teardown)(unsigned int cpu))
+{
+	int cpu, ret = 0;
+	int dyn_state = 0;
+
+	if (cpuhp_cb_check(state) || !name)
+		return -EINVAL;
+
+	get_online_cpus();
+
+	/* currently assignments for the ONLINE state are possible */
+	if (state == CPUHP_ONLINE_DYN) {
+		dyn_state = 1;
+		ret = cpuhp_reserve_state(state);
+		if (ret < 0)
+			goto out;
+		state = ret;
+	}
+
+	cpuhp_store_callbacks(state, name, startup, teardown);
+
+	if (!invoke || !startup)
+		goto out;
+
+	/*
+	 * Try to call the startup callback for each present cpu
+	 * depending on the hotplug state of the cpu.
+	 */
+	for_each_present_cpu(cpu) {
+		struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+		int cpustate = st->state;
+
+		if (cpustate < state)
+			continue;
+
+		ret = cpuhp_issue_call(cpu, state, startup, true);
+		if (ret) {
+			cpuhp_rollback_install(cpu, state, teardown);
+			cpuhp_store_callbacks(state, NULL, NULL, NULL);
+			goto out;
+		}
+	}
+out:
+	put_online_cpus();
+	if (!ret && dyn_state)
+		return state;
+	return ret;
+}
+EXPORT_SYMBOL(__cpuhp_setup_state);
+
+/**
+ * __cpuhp_remove_state - Remove the callbacks for an hotplug machine state
+ * @state:	The state to remove
+ * @invoke:	If true, the teardown function is invoked for cpus where
+ *		cpu state >= @state
+ *
+ * The teardown callback is currently not allowed to fail. Think
+ * about module removal!
+ */
+void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
+{
+	int (*teardown)(unsigned int cpu) = cpuhp_get_teardown_cb(state);
+	int cpu;
+
+	BUG_ON(cpuhp_cb_check(state));
+
+	get_online_cpus();
+
+	if (!invoke || !teardown)
+		goto remove;
+
+	/*
+	 * Call the teardown callback for each present cpu depending
+	 * on the hotplug state of the cpu. This function is not
+	 * allowed to fail currently!
+	 */
+	for_each_present_cpu(cpu) {
+		struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+		int cpustate = st->state;
+
+		if (cpustate >= state)
+			cpuhp_issue_call(cpu, state, teardown, false);
+	}
+remove:
+	cpuhp_store_callbacks(state, NULL, NULL, NULL);
+	put_online_cpus();
+}
+EXPORT_SYMBOL(__cpuhp_remove_state);
+
 #if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU)
 static ssize_t show_cpuhp_state(struct device *dev,
 				struct device_attribute *attr, char *buf)
-- 
cgit 


From 949338e35131c551f7bf54f48a2e3a227af6721b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 26 Feb 2016 18:43:35 +0000
Subject: cpu/hotplug: Move scheduler cpu_online notifier to hotplug core

Move the scheduler cpu online notifier part to the hotplug core. This is
anyway the highest priority callback and we need that functionality right now
for the next changes.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arch@vger.kernel.org
Cc: Rik van Riel <riel@redhat.com>
Cc: Rafael Wysocki <rafael.j.wysocki@intel.com>
Cc: "Srivatsa S. Bhat" <srivatsa@mit.edu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Turner <pjt@google.com>
Link: http://lkml.kernel.org/r/20160226182341.200791046@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/cpuhotplug.h |  1 +
 kernel/cpu.c               | 18 ++++++++++++++++++
 kernel/sched/core.c        | 10 ----------
 3 files changed, 19 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 29935261b26d..2f2e5d9711c4 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -10,6 +10,7 @@ enum cpuhp_state {
 	CPUHP_AP_NOTIFY_STARTING,
 	CPUHP_AP_ONLINE,
 	CPUHP_TEARDOWN_CPU,
+	CPUHP_CPU_SET_ACTIVE,
 	CPUHP_NOTIFY_ONLINE,
 	CPUHP_ONLINE_DYN,
 	CPUHP_ONLINE_DYN_END		= CPUHP_ONLINE_DYN + 30,
diff --git a/kernel/cpu.c b/kernel/cpu.c
index b5eacb9587af..65e34d34ca93 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -666,6 +666,19 @@ void notify_cpu_starting(unsigned int cpu)
 	}
 }
 
+/*
+ * Called from the idle task. We need to set active here, so we can kick off
+ * the stopper thread.
+ */
+static int cpuhp_set_cpu_active(unsigned int cpu)
+{
+	/* The cpu is marked online, set it active now */
+	set_cpu_active(cpu, true);
+	/* Unpark the stopper thread */
+	stop_machine_unpark(cpu);
+	return 0;
+}
+
 static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
 {
 	for (st->state--; st->state > st->target; st->state--) {
@@ -941,6 +954,11 @@ static struct cpuhp_step cpuhp_bp_states[] = {
 		.teardown		= takedown_cpu,
 		.cant_stop		= true,
 	},
+	[CPUHP_CPU_SET_ACTIVE] = {
+		.name			= "cpu:active",
+		.startup		= cpuhp_set_cpu_active,
+		.teardown		= NULL,
+	},
 	[CPUHP_NOTIFY_ONLINE] = {
 		.name			= "notify:online",
 		.startup		= notify_online,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9503d590e5ef..626646396ca0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5692,16 +5692,6 @@ static int sched_cpu_active(struct notifier_block *nfb,
 		set_cpu_rq_start_time();
 		return NOTIFY_OK;
 
-	case CPU_ONLINE:
-		/*
-		 * At this point a starting CPU has marked itself as online via
-		 * set_cpu_online(). But it might not yet have marked itself
-		 * as active, which is essential from here on.
-		 */
-		set_cpu_active(cpu, true);
-		stop_machine_unpark(cpu);
-		return NOTIFY_OK;
-
 	case CPU_DOWN_FAILED:
 		set_cpu_active(cpu, true);
 		return NOTIFY_OK;
-- 
cgit 


From 931ef163309ee955611f287dc65248b39a65fc9d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 26 Feb 2016 18:43:36 +0000
Subject: cpu/hotplug: Unpark smpboot threads from the state machine

Handle the smpboot threads in the state machine.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arch@vger.kernel.org
Cc: Rik van Riel <riel@redhat.com>
Cc: Rafael Wysocki <rafael.j.wysocki@intel.com>
Cc: "Srivatsa S. Bhat" <srivatsa@mit.edu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Turner <pjt@google.com>
Link: http://lkml.kernel.org/r/20160226182341.295777684@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/cpu.h        |  7 +------
 include/linux/cpuhotplug.h |  1 +
 init/main.c                |  1 -
 kernel/cpu.c               | 39 +++++----------------------------------
 kernel/smpboot.c           |  6 ++++--
 kernel/smpboot.h           |  4 ++--
 6 files changed, 13 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 78989f20420f..83f35767016d 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -78,7 +78,7 @@ enum {
 	/* migration should happen before other stuff but after perf */
 	CPU_PRI_PERF		= 20,
 	CPU_PRI_MIGRATION	= 10,
-	CPU_PRI_SMPBOOT		= 9,
+
 	/* bring up workqueues before normal notifiers and down after */
 	CPU_PRI_WORKQUEUE_UP	= 5,
 	CPU_PRI_WORKQUEUE_DOWN	= -5,
@@ -172,7 +172,6 @@ static inline void __unregister_cpu_notifier(struct notifier_block *nb)
 }
 #endif
 
-void smpboot_thread_init(void);
 int cpu_up(unsigned int cpu);
 void notify_cpu_starting(unsigned int cpu);
 extern void cpu_maps_update_begin(void);
@@ -221,10 +220,6 @@ static inline void cpu_notifier_register_done(void)
 {
 }
 
-static inline void smpboot_thread_init(void)
-{
-}
-
 #endif /* CONFIG_SMP */
 extern struct bus_type cpu_subsys;
 
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 2f2e5d9711c4..38679106fddd 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -11,6 +11,7 @@ enum cpuhp_state {
 	CPUHP_AP_ONLINE,
 	CPUHP_TEARDOWN_CPU,
 	CPUHP_CPU_SET_ACTIVE,
+	CPUHP_SMPBOOT_THREADS,
 	CPUHP_NOTIFY_ONLINE,
 	CPUHP_ONLINE_DYN,
 	CPUHP_ONLINE_DYN_END		= CPUHP_ONLINE_DYN + 30,
diff --git a/init/main.c b/init/main.c
index c2ea72362ee3..55563fd36be3 100644
--- a/init/main.c
+++ b/init/main.c
@@ -388,7 +388,6 @@ static noinline void __init_refok rest_init(void)
 	int pid;
 
 	rcu_scheduler_starting();
-	smpboot_thread_init();
 	/*
 	 * We need to spawn init first so that it obtains pid 1, however
 	 * the init task will end up wanting to create kthreads, which, if
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 65e34d34ca93..3ec86bc414b7 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -481,8 +481,6 @@ static int takedown_cpu(unsigned int cpu)
 	else
 		synchronize_rcu();
 
-	smpboot_park_threads(cpu);
-
 	/*
 	 * Prevent irq alloc/free while the dying cpu reorganizes the
 	 * interrupt affinities.
@@ -612,38 +610,6 @@ int cpu_down(unsigned int cpu)
 EXPORT_SYMBOL(cpu_down);
 #endif /*CONFIG_HOTPLUG_CPU*/
 
-/*
- * Unpark per-CPU smpboot kthreads at CPU-online time.
- */
-static int smpboot_thread_call(struct notifier_block *nfb,
-			       unsigned long action, void *hcpu)
-{
-	int cpu = (long)hcpu;
-
-	switch (action & ~CPU_TASKS_FROZEN) {
-
-	case CPU_DOWN_FAILED:
-	case CPU_ONLINE:
-		smpboot_unpark_threads(cpu);
-		break;
-
-	default:
-		break;
-	}
-
-	return NOTIFY_OK;
-}
-
-static struct notifier_block smpboot_thread_notifier = {
-	.notifier_call = smpboot_thread_call,
-	.priority = CPU_PRI_SMPBOOT,
-};
-
-void smpboot_thread_init(void)
-{
-	register_cpu_notifier(&smpboot_thread_notifier);
-}
-
 /**
  * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
  * @cpu: cpu that just started
@@ -959,6 +925,11 @@ static struct cpuhp_step cpuhp_bp_states[] = {
 		.startup		= cpuhp_set_cpu_active,
 		.teardown		= NULL,
 	},
+	[CPUHP_SMPBOOT_THREADS] = {
+		.name			= "smpboot:threads",
+		.startup		= smpboot_unpark_threads,
+		.teardown		= smpboot_park_threads,
+	},
 	[CPUHP_NOTIFY_ONLINE] = {
 		.name			= "notify:online",
 		.startup		= notify_online,
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index d264f59bff56..13bc43d1fb22 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -226,7 +226,7 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp
 		kthread_unpark(tsk);
 }
 
-void smpboot_unpark_threads(unsigned int cpu)
+int smpboot_unpark_threads(unsigned int cpu)
 {
 	struct smp_hotplug_thread *cur;
 
@@ -235,6 +235,7 @@ void smpboot_unpark_threads(unsigned int cpu)
 		if (cpumask_test_cpu(cpu, cur->cpumask))
 			smpboot_unpark_thread(cur, cpu);
 	mutex_unlock(&smpboot_threads_lock);
+	return 0;
 }
 
 static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
@@ -245,7 +246,7 @@ static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
 		kthread_park(tsk);
 }
 
-void smpboot_park_threads(unsigned int cpu)
+int smpboot_park_threads(unsigned int cpu)
 {
 	struct smp_hotplug_thread *cur;
 
@@ -253,6 +254,7 @@ void smpboot_park_threads(unsigned int cpu)
 	list_for_each_entry_reverse(cur, &hotplug_threads, list)
 		smpboot_park_thread(cur, cpu);
 	mutex_unlock(&smpboot_threads_lock);
+	return 0;
 }
 
 static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
diff --git a/kernel/smpboot.h b/kernel/smpboot.h
index 72415a0eb955..6b5f02017be3 100644
--- a/kernel/smpboot.h
+++ b/kernel/smpboot.h
@@ -14,7 +14,7 @@ static inline void idle_threads_init(void) { }
 #endif
 
 int smpboot_create_threads(unsigned int cpu);
-void smpboot_park_threads(unsigned int cpu);
-void smpboot_unpark_threads(unsigned int cpu);
+int smpboot_park_threads(unsigned int cpu);
+int smpboot_unpark_threads(unsigned int cpu);
 
 #endif
-- 
cgit 


From 1cf4f629d9d246519a1e76c021806f2a51ddba4d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 26 Feb 2016 18:43:39 +0000
Subject: cpu/hotplug: Move online calls to hotplugged cpu

Let the hotplugged cpu invoke the setup/teardown callbacks
(CPU_ONLINE/CPU_DOWN_PREPARE) itself.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arch@vger.kernel.org
Cc: Rik van Riel <riel@redhat.com>
Cc: Rafael Wysocki <rafael.j.wysocki@intel.com>
Cc: "Srivatsa S. Bhat" <srivatsa@mit.edu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Turner <pjt@google.com>
Link: http://lkml.kernel.org/r/20160226182341.536364371@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/cpuhotplug.h |  10 ++--
 kernel/cpu.c               | 144 ++++++++++++++++++++++++++++++---------------
 2 files changed, 102 insertions(+), 52 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 38679106fddd..8a715bb1e192 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -11,10 +11,12 @@ enum cpuhp_state {
 	CPUHP_AP_ONLINE,
 	CPUHP_TEARDOWN_CPU,
 	CPUHP_CPU_SET_ACTIVE,
-	CPUHP_SMPBOOT_THREADS,
-	CPUHP_NOTIFY_ONLINE,
-	CPUHP_ONLINE_DYN,
-	CPUHP_ONLINE_DYN_END		= CPUHP_ONLINE_DYN + 30,
+	CPUHP_KICK_AP_THREAD,
+	CPUHP_BP_ONLINE,
+	CPUHP_AP_SMPBOOT_THREADS,
+	CPUHP_AP_NOTIFY_ONLINE,
+	CPUHP_AP_ONLINE_DYN,
+	CPUHP_AP_ONLINE_DYN_END		= CPUHP_AP_ONLINE_DYN + 30,
 	CPUHP_ONLINE,
 };
 
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 9048c33689ac..e220e565ea98 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -429,7 +429,7 @@ static int cpuhp_should_run(unsigned int cpu)
 /* Execute the teardown callbacks. Used to be CPU_DOWN_PREPARE */
 static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st)
 {
-	enum cpuhp_state target = max((int)st->target, CPUHP_AP_ONLINE);
+	enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU);
 
 	return cpuhp_down_callbacks(cpu, st, cpuhp_ap_states, target);
 }
@@ -469,6 +469,9 @@ static void cpuhp_thread_fun(unsigned int cpu)
 			ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb);
 		}
 	} else {
+		/* Cannot happen .... */
+		BUG_ON(st->state < CPUHP_KICK_AP_THREAD);
+
 		/* Regular hotplug work */
 		if (st->state < st->target)
 			ret = cpuhp_ap_online(cpu, st);
@@ -502,12 +505,8 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state,
 }
 
 /* Regular hotplug invocation of the AP hotplug thread */
-static int cpuhp_kick_ap_work(unsigned int cpu)
+static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st)
 {
-	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
-	enum cpuhp_state state = st->state;
-
-	trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work);
 	st->result = 0;
 	st->cb = NULL;
 	/*
@@ -517,6 +516,15 @@ static int cpuhp_kick_ap_work(unsigned int cpu)
 	smp_mb();
 	st->should_run = true;
 	wake_up_process(st->thread);
+}
+
+static int cpuhp_kick_ap_work(unsigned int cpu)
+{
+	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+	enum cpuhp_state state = st->state;
+
+	trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work);
+	__cpuhp_kick_ap_work(st);
 	wait_for_completion(&st->done);
 	trace_cpuhp_exit(cpu, st->state, state, st->result);
 	return st->result;
@@ -688,6 +696,9 @@ static int takedown_cpu(unsigned int cpu)
 	else
 		synchronize_rcu();
 
+	/* Park the hotplug thread */
+	kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread);
+
 	/*
 	 * Prevent irq alloc/free while the dying cpu reorganizes the
 	 * interrupt affinities.
@@ -765,10 +776,34 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
 
 	prev_state = st->state;
 	st->target = target;
+	/*
+	 * If the current CPU state is in the range of the AP hotplug thread,
+	 * then we need to kick the thread.
+	 */
+	if (st->state >= CPUHP_KICK_AP_THREAD) {
+		ret = cpuhp_kick_ap_work(cpu);
+		/*
+		 * The AP side has done the error rollback already. Just
+		 * return the error code..
+		 */
+		if (ret)
+			goto out;
+
+		/*
+		 * We might have stopped still in the range of the AP hotplug
+		 * thread. Nothing to do anymore.
+		 */
+		if (st->state >= CPUHP_KICK_AP_THREAD)
+			goto out;
+	}
+	/*
+	 * The AP brought itself down below CPUHP_KICK_AP_THREAD. So we need
+	 * to do the further cleanups.
+	 */
 	ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target);
 
 	hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE;
-
+out:
 	cpu_hotplug_done();
 	/* This post dead nonsense must die */
 	if (!ret && hasdied)
@@ -828,10 +863,13 @@ void notify_cpu_starting(unsigned int cpu)
  */
 static int cpuhp_set_cpu_active(unsigned int cpu)
 {
+	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+
 	/* The cpu is marked online, set it active now */
 	set_cpu_active(cpu, true);
-	/* Unpark the stopper thread */
+	/* Unpark the stopper thread and the hotplug thread */
 	stop_machine_unpark(cpu);
+	kthread_unpark(st->thread);
 	return 0;
 }
 
@@ -868,6 +906,26 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
 	cpuhp_tasks_frozen = tasks_frozen;
 
 	st->target = target;
+	/*
+	 * If the current CPU state is in the range of the AP hotplug thread,
+	 * then we need to kick the thread once more.
+	 */
+	if (st->state >= CPUHP_KICK_AP_THREAD) {
+		ret = cpuhp_kick_ap_work(cpu);
+		/*
+		 * The AP side has done the error rollback already. Just
+		 * return the error code..
+		 */
+		if (ret)
+			goto out;
+	}
+
+	/*
+	 * Try to reach the target state. We max out on the BP at
+	 * CPUHP_KICK_AP_THREAD. After that the AP hotplug thread is
+	 * responsible for bringing it up to the target state.
+	 */
+	target = min((int)target, CPUHP_KICK_AP_THREAD);
 	ret = cpuhp_up_callbacks(cpu, st, cpuhp_bp_states, target);
 out:
 	cpu_hotplug_done();
@@ -1093,19 +1151,13 @@ static struct cpuhp_step cpuhp_bp_states[] = {
 		.startup		= cpuhp_set_cpu_active,
 		.teardown		= NULL,
 	},
-	[CPUHP_SMPBOOT_THREADS] = {
-		.name			= "smpboot:threads",
-		.startup		= smpboot_unpark_threads,
-		.teardown		= smpboot_park_threads,
-	},
-	[CPUHP_NOTIFY_ONLINE] = {
-		.name			= "notify:online",
-		.startup		= notify_online,
-		.teardown		= notify_down_prepare,
-		.cant_stop		= true,
+	[CPUHP_KICK_AP_THREAD] = {
+		.name			= "cpuhp:kickthread",
+		.startup		= cpuhp_kick_ap_work,
+		.teardown		= cpuhp_kick_ap_work,
 	},
 #endif
-	[CPUHP_ONLINE] = {
+	[CPUHP_BP_ONLINE] = {
 		.name			= "online",
 		.startup		= NULL,
 		.teardown		= NULL,
@@ -1122,6 +1174,16 @@ static struct cpuhp_step cpuhp_ap_states[] = {
 		.skip_onerr		= true,
 		.cant_stop		= true,
 	},
+	[CPUHP_AP_SMPBOOT_THREADS] = {
+		.name			= "smpboot:threads",
+		.startup		= smpboot_unpark_threads,
+		.teardown		= smpboot_park_threads,
+	},
+	[CPUHP_AP_NOTIFY_ONLINE] = {
+		.name			= "notify:online",
+		.startup		= notify_online,
+		.teardown		= notify_down_prepare,
+	},
 #endif
 	[CPUHP_ONLINE] = {
 		.name			= "online",
@@ -1140,7 +1202,9 @@ static int cpuhp_cb_check(enum cpuhp_state state)
 
 static bool cpuhp_is_ap_state(enum cpuhp_state state)
 {
-	return (state >= CPUHP_AP_OFFLINE && state <= CPUHP_AP_ONLINE);
+	if (state >= CPUHP_AP_OFFLINE && state <= CPUHP_AP_ONLINE)
+		return true;
+	return state > CPUHP_BP_ONLINE;
 }
 
 static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
@@ -1172,14 +1236,6 @@ static void *cpuhp_get_teardown_cb(enum cpuhp_state state)
 	return cpuhp_get_step(state)->teardown;
 }
 
-/* Helper function to run callback on the target cpu */
-static void cpuhp_on_cpu_cb(void *__cb)
-{
-	int (*cb)(unsigned int cpu) = __cb;
-
-	BUG_ON(cb(smp_processor_id()));
-}
-
 /*
  * Call the startup/teardown function for a step either on the AP or
  * on the current CPU.
@@ -1191,26 +1247,18 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state,
 
 	if (!cb)
 		return 0;
-
-	/*
-	 * This invokes the callback directly for now. In a later step we
-	 * convert that to use cpuhp_invoke_callback().
-	 */
-	if (cpuhp_is_ap_state(state)) {
-		/*
-		 * Note, that a function called on the AP is not
-		 * allowed to fail.
-		 */
-		if (cpu_online(cpu))
-			smp_call_function_single(cpu, cpuhp_on_cpu_cb, cb, 1);
-		return 0;
-	}
-
 	/*
 	 * The non AP bound callbacks can fail on bringup. On teardown
 	 * e.g. module removal we crash for now.
 	 */
-	ret = cb(cpu);
+#ifdef CONFIG_SMP
+	if (cpuhp_is_ap_state(state))
+		ret = cpuhp_invoke_ap_callback(cpu, state, cb);
+	else
+		ret = cpuhp_invoke_callback(cpu, state, cb);
+#else
+	ret = cpuhp_invoke_callback(cpu, state, cb);
+#endif
 	BUG_ON(ret && !bringup);
 	return ret;
 }
@@ -1252,11 +1300,11 @@ static int cpuhp_reserve_state(enum cpuhp_state state)
 	enum cpuhp_state i;
 
 	mutex_lock(&cpuhp_state_mutex);
-	for (i = CPUHP_ONLINE_DYN; i <= CPUHP_ONLINE_DYN_END; i++) {
-		if (cpuhp_bp_states[i].name)
+	for (i = CPUHP_AP_ONLINE_DYN; i <= CPUHP_AP_ONLINE_DYN_END; i++) {
+		if (cpuhp_ap_states[i].name)
 			continue;
 
-		cpuhp_bp_states[i].name = "Reserved";
+		cpuhp_ap_states[i].name = "Reserved";
 		mutex_unlock(&cpuhp_state_mutex);
 		return i;
 	}
@@ -1289,7 +1337,7 @@ int __cpuhp_setup_state(enum cpuhp_state state,
 	get_online_cpus();
 
 	/* currently assignments for the ONLINE state are possible */
-	if (state == CPUHP_ONLINE_DYN) {
+	if (state == CPUHP_AP_ONLINE_DYN) {
 		dyn_state = 1;
 		ret = cpuhp_reserve_state(state);
 		if (ret < 0)
-- 
cgit 


From fc6d73d67436e7784758a831227bd019547a3f73 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 26 Feb 2016 18:43:40 +0000
Subject: arch/hotplug: Call into idle with a proper state

Let the non-boot cpus call into idle with the corresponding hotplug state, so
the hotplug core can handle the further bringup. That's a first step towards
converting the boot side of the hotplugged cpus to do all the synchronization
with the other side through the state machine. For now it'll only start the
hotplug thread and kick the full bringup of the cpu.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arch@vger.kernel.org
Cc: Rik van Riel <riel@redhat.com>
Cc: Rafael Wysocki <rafael.j.wysocki@intel.com>
Cc: "Srivatsa S. Bhat" <srivatsa@mit.edu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Turner <pjt@google.com>
Link: http://lkml.kernel.org/r/20160226182341.614102639@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/alpha/kernel/smp.c         | 2 +-
 arch/arc/kernel/smp.c           | 2 +-
 arch/arm/kernel/smp.c           | 2 +-
 arch/arm64/kernel/smp.c         | 2 +-
 arch/blackfin/mach-common/smp.c | 2 +-
 arch/hexagon/kernel/smp.c       | 2 +-
 arch/ia64/kernel/smpboot.c      | 2 +-
 arch/m32r/kernel/smpboot.c      | 2 +-
 arch/metag/kernel/smp.c         | 2 +-
 arch/mips/kernel/smp.c          | 2 +-
 arch/mn10300/kernel/smp.c       | 2 +-
 arch/parisc/kernel/smp.c        | 2 +-
 arch/powerpc/kernel/smp.c       | 2 +-
 arch/s390/kernel/smp.c          | 2 +-
 arch/sh/kernel/smp.c            | 2 +-
 arch/sparc/kernel/smp_32.c      | 2 +-
 arch/sparc/kernel/smp_64.c      | 2 +-
 arch/tile/kernel/smpboot.c      | 2 +-
 arch/x86/kernel/smpboot.c       | 2 +-
 arch/x86/xen/smp.c              | 2 +-
 arch/xtensa/kernel/smp.c        | 2 +-
 include/linux/cpuhotplug.h      | 1 +
 22 files changed, 22 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
index 2f24447fef92..46bf263c3153 100644
--- a/arch/alpha/kernel/smp.c
+++ b/arch/alpha/kernel/smp.c
@@ -168,7 +168,7 @@ smp_callin(void)
 	      cpuid, current, current->active_mm));
 
 	preempt_disable();
-	cpu_startup_entry(CPUHP_ONLINE);
+	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
 
 /* Wait until hwrpb->txrdy is clear for cpu.  Return -1 on timeout.  */
diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c
index 424e937da5c8..4cb3add77c75 100644
--- a/arch/arc/kernel/smp.c
+++ b/arch/arc/kernel/smp.c
@@ -142,7 +142,7 @@ void start_kernel_secondary(void)
 
 	local_irq_enable();
 	preempt_disable();
-	cpu_startup_entry(CPUHP_ONLINE);
+	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
 
 /*
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 37312f6749f3..baee70267f29 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -409,7 +409,7 @@ asmlinkage void secondary_start_kernel(void)
 	/*
 	 * OK, it's off to the idle thread for us
 	 */
-	cpu_startup_entry(CPUHP_ONLINE);
+	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
 
 void __init smp_cpus_done(unsigned int max_cpus)
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index b1adc51b2c2e..460765799c64 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -195,7 +195,7 @@ asmlinkage void secondary_start_kernel(void)
 	/*
 	 * OK, it's off to the idle thread for us
 	 */
-	cpu_startup_entry(CPUHP_ONLINE);
+	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c
index 0030e21cfceb..23c4ef5f8bdc 100644
--- a/arch/blackfin/mach-common/smp.c
+++ b/arch/blackfin/mach-common/smp.c
@@ -333,7 +333,7 @@ void secondary_start_kernel(void)
 
 	/* We are done with local CPU inits, unblock the boot CPU. */
 	set_cpu_online(cpu, true);
-	cpu_startup_entry(CPUHP_ONLINE);
+	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
 
 void __init smp_prepare_boot_cpu(void)
diff --git a/arch/hexagon/kernel/smp.c b/arch/hexagon/kernel/smp.c
index ff759f26b96a..983bae7d2665 100644
--- a/arch/hexagon/kernel/smp.c
+++ b/arch/hexagon/kernel/smp.c
@@ -180,7 +180,7 @@ void start_secondary(void)
 
 	local_irq_enable();
 
-	cpu_startup_entry(CPUHP_ONLINE);
+	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
 
 
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index 0e76fad27975..74fe317477e6 100644
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -454,7 +454,7 @@ start_secondary (void *unused)
 	preempt_disable();
 	smp_callin();
 
-	cpu_startup_entry(CPUHP_ONLINE);
+	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 	return 0;
 }
 
diff --git a/arch/m32r/kernel/smpboot.c b/arch/m32r/kernel/smpboot.c
index a468467542f4..f98d2f6519d6 100644
--- a/arch/m32r/kernel/smpboot.c
+++ b/arch/m32r/kernel/smpboot.c
@@ -432,7 +432,7 @@ int __init start_secondary(void *unused)
 	 */
 	local_flush_tlb_all();
 
-	cpu_startup_entry(CPUHP_ONLINE);
+	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 	return 0;
 }
 
diff --git a/arch/metag/kernel/smp.c b/arch/metag/kernel/smp.c
index c3c6f0864881..bad13232de51 100644
--- a/arch/metag/kernel/smp.c
+++ b/arch/metag/kernel/smp.c
@@ -396,7 +396,7 @@ asmlinkage void secondary_start_kernel(void)
 	/*
 	 * OK, it's off to the idle thread for us
 	 */
-	cpu_startup_entry(CPUHP_ONLINE);
+	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
 
 void __init smp_cpus_done(unsigned int max_cpus)
diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c
index bd4385a8e6e8..f2112a8ddf15 100644
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -191,7 +191,7 @@ asmlinkage void start_secondary(void)
 	WARN_ON_ONCE(!irqs_disabled());
 	mp_ops->smp_finish();
 
-	cpu_startup_entry(CPUHP_ONLINE);
+	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
 
 static void stop_this_cpu(void *dummy)
diff --git a/arch/mn10300/kernel/smp.c b/arch/mn10300/kernel/smp.c
index f984193718b1..426173c4b0b9 100644
--- a/arch/mn10300/kernel/smp.c
+++ b/arch/mn10300/kernel/smp.c
@@ -675,7 +675,7 @@ int __init start_secondary(void *unused)
 #ifdef CONFIG_GENERIC_CLOCKEVENTS
 	init_clockevents();
 #endif
-	cpu_startup_entry(CPUHP_ONLINE);
+	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 	return 0;
 }
 
diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c
index 52e85973a283..c2a9cc55a62f 100644
--- a/arch/parisc/kernel/smp.c
+++ b/arch/parisc/kernel/smp.c
@@ -305,7 +305,7 @@ void __init smp_callin(void)
 
 	local_irq_enable();  /* Interrupts have been off until now */
 
-	cpu_startup_entry(CPUHP_ONLINE);
+	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 
 	/* NOTREACHED */
 	panic("smp_callin() AAAAaaaaahhhh....\n");
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index ec9ec2058d2d..cc13d4c83291 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -727,7 +727,7 @@ void start_secondary(void *unused)
 
 	local_irq_enable();
 
-	cpu_startup_entry(CPUHP_ONLINE);
+	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 
 	BUG();
 }
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 3c65a8eae34d..40a6b4f9c36c 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -798,7 +798,7 @@ static void smp_start_secondary(void *cpuvoid)
 	set_cpu_online(smp_processor_id(), true);
 	inc_irq_stat(CPU_RST);
 	local_irq_enable();
-	cpu_startup_entry(CPUHP_ONLINE);
+	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
 
 /* Upping and downing of CPUs */
diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c
index de6be008fc01..13f633add29a 100644
--- a/arch/sh/kernel/smp.c
+++ b/arch/sh/kernel/smp.c
@@ -203,7 +203,7 @@ asmlinkage void start_secondary(void)
 	set_cpu_online(cpu, true);
 	per_cpu(cpu_state, cpu) = CPU_ONLINE;
 
-	cpu_startup_entry(CPUHP_ONLINE);
+	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
 
 extern struct {
diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c
index b3a5d81b20f0..fb30e7c6a5b1 100644
--- a/arch/sparc/kernel/smp_32.c
+++ b/arch/sparc/kernel/smp_32.c
@@ -364,7 +364,7 @@ static void sparc_start_secondary(void *arg)
 	local_irq_enable();
 
 	wmb();
-	cpu_startup_entry(CPUHP_ONLINE);
+	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 
 	/* We should never reach here! */
 	BUG();
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index 19cd08d18672..8a6151a628ce 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -134,7 +134,7 @@ void smp_callin(void)
 
 	local_irq_enable();
 
-	cpu_startup_entry(CPUHP_ONLINE);
+	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
 
 void cpu_panic(void)
diff --git a/arch/tile/kernel/smpboot.c b/arch/tile/kernel/smpboot.c
index 20d52a98e171..6c0abaacec33 100644
--- a/arch/tile/kernel/smpboot.c
+++ b/arch/tile/kernel/smpboot.c
@@ -208,7 +208,7 @@ void online_secondary(void)
 	/* Set up tile-timer clock-event device on this cpu */
 	setup_tile_timer();
 
-	cpu_startup_entry(CPUHP_ONLINE);
+	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
 
 int __cpu_up(unsigned int cpu, struct task_struct *tidle)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 24d57f77b3c1..293b22a7ab02 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -248,7 +248,7 @@ static void notrace start_secondary(void *unused)
 	x86_cpuinit.setup_percpu_clockev();
 
 	wmb();
-	cpu_startup_entry(CPUHP_ONLINE);
+	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
 
 void __init smp_store_boot_cpu_info(void)
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 3f4ebf0261f2..3c6d17fd423a 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -112,7 +112,7 @@ asmlinkage __visible void cpu_bringup_and_idle(int cpu)
 		xen_pvh_secondary_vcpu_init(cpu);
 #endif
 	cpu_bringup();
-	cpu_startup_entry(CPUHP_ONLINE);
+	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
 
 static void xen_smp_intr_free(unsigned int cpu)
diff --git a/arch/xtensa/kernel/smp.c b/arch/xtensa/kernel/smp.c
index 4d02e38514f5..fc4ad21a5ed4 100644
--- a/arch/xtensa/kernel/smp.c
+++ b/arch/xtensa/kernel/smp.c
@@ -157,7 +157,7 @@ void secondary_start_kernel(void)
 
 	complete(&cpu_running);
 
-	cpu_startup_entry(CPUHP_ONLINE);
+	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
 
 static void mx_cpu_start(void *p)
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 8a715bb1e192..4aa263adc536 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -13,6 +13,7 @@ enum cpuhp_state {
 	CPUHP_CPU_SET_ACTIVE,
 	CPUHP_KICK_AP_THREAD,
 	CPUHP_BP_ONLINE,
+	CPUHP_AP_ONLINE_IDLE,
 	CPUHP_AP_SMPBOOT_THREADS,
 	CPUHP_AP_NOTIFY_ONLINE,
 	CPUHP_AP_ONLINE_DYN,
-- 
cgit 


From 8df3e07e7f21f2ed8d001e6fabf9505946b438aa Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 26 Feb 2016 18:43:41 +0000
Subject: cpu/hotplug: Let upcoming cpu bring itself fully up

Let the upcoming cpu kick the hotplug thread and let itself complete the
bringup. That way the control side can just wait for the completion or, later
when we have made the hotplug machinery async, not care at all.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arch@vger.kernel.org
Cc: Rik van Riel <riel@redhat.com>
Cc: Rafael Wysocki <rafael.j.wysocki@intel.com>
Cc: "Srivatsa S. Bhat" <srivatsa@mit.edu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Turner <pjt@google.com>
Link: http://lkml.kernel.org/r/20160226182341.697655464@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/cpuhotplug.h |  9 ++++---
 kernel/cpu.c               | 66 ++++++++++++++++++++++++++--------------------
 kernel/sched/idle.c        |  2 ++
 3 files changed, 45 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 4aa263adc536..ad5d7fcb0130 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -10,9 +10,6 @@ enum cpuhp_state {
 	CPUHP_AP_NOTIFY_STARTING,
 	CPUHP_AP_ONLINE,
 	CPUHP_TEARDOWN_CPU,
-	CPUHP_CPU_SET_ACTIVE,
-	CPUHP_KICK_AP_THREAD,
-	CPUHP_BP_ONLINE,
 	CPUHP_AP_ONLINE_IDLE,
 	CPUHP_AP_SMPBOOT_THREADS,
 	CPUHP_AP_NOTIFY_ONLINE,
@@ -86,4 +83,10 @@ static inline void cpuhp_remove_state_nocalls(enum cpuhp_state state)
 	__cpuhp_remove_state(state, false);
 }
 
+#ifdef CONFIG_SMP
+void cpuhp_online_idle(enum cpuhp_state state);
+#else
+static inline void cpuhp_online_idle(enum cpuhp_state state) { }
+#endif
+
 #endif
diff --git a/kernel/cpu.c b/kernel/cpu.c
index e220e565ea98..f1f880fac832 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -329,6 +329,14 @@ static int notify_starting(unsigned int cpu)
 	return 0;
 }
 
+static int bringup_wait_for_ap(unsigned int cpu)
+{
+	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+
+	wait_for_completion(&st->done);
+	return st->result;
+}
+
 static int bringup_cpu(unsigned int cpu)
 {
 	struct task_struct *idle = idle_thread_get(cpu);
@@ -340,8 +348,9 @@ static int bringup_cpu(unsigned int cpu)
 		cpu_notify(CPU_UP_CANCELED, cpu);
 		return ret;
 	}
+	ret = bringup_wait_for_ap(cpu);
 	BUG_ON(!cpu_online(cpu));
-	return 0;
+	return ret;
 }
 
 /*
@@ -470,7 +479,7 @@ static void cpuhp_thread_fun(unsigned int cpu)
 		}
 	} else {
 		/* Cannot happen .... */
-		BUG_ON(st->state < CPUHP_KICK_AP_THREAD);
+		BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE);
 
 		/* Regular hotplug work */
 		if (st->state < st->target)
@@ -780,7 +789,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
 	 * If the current CPU state is in the range of the AP hotplug thread,
 	 * then we need to kick the thread.
 	 */
-	if (st->state >= CPUHP_KICK_AP_THREAD) {
+	if (st->state > CPUHP_TEARDOWN_CPU) {
 		ret = cpuhp_kick_ap_work(cpu);
 		/*
 		 * The AP side has done the error rollback already. Just
@@ -793,11 +802,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
 		 * We might have stopped still in the range of the AP hotplug
 		 * thread. Nothing to do anymore.
 		 */
-		if (st->state >= CPUHP_KICK_AP_THREAD)
+		if (st->state > CPUHP_TEARDOWN_CPU)
 			goto out;
 	}
 	/*
-	 * The AP brought itself down below CPUHP_KICK_AP_THREAD. So we need
+	 * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need
 	 * to do the further cleanups.
 	 */
 	ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target);
@@ -859,18 +868,32 @@ void notify_cpu_starting(unsigned int cpu)
 
 /*
  * Called from the idle task. We need to set active here, so we can kick off
- * the stopper thread.
+ * the stopper thread and unpark the smpboot threads. If the target state is
+ * beyond CPUHP_AP_ONLINE_IDLE we kick cpuhp thread and let it bring up the
+ * cpu further.
  */
-static int cpuhp_set_cpu_active(unsigned int cpu)
+void cpuhp_online_idle(enum cpuhp_state state)
 {
-	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+	struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
+	unsigned int cpu = smp_processor_id();
+
+	/* Happens for the boot cpu */
+	if (state != CPUHP_AP_ONLINE_IDLE)
+		return;
+
+	st->state = CPUHP_AP_ONLINE_IDLE;
 
 	/* The cpu is marked online, set it active now */
 	set_cpu_active(cpu, true);
-	/* Unpark the stopper thread and the hotplug thread */
+	/* Unpark the stopper thread and the hotplug thread of this cpu */
 	stop_machine_unpark(cpu);
 	kthread_unpark(st->thread);
-	return 0;
+
+	/* Should we go further up ? */
+	if (st->target > CPUHP_AP_ONLINE_IDLE)
+		__cpuhp_kick_ap_work(st);
+	else
+		complete(&st->done);
 }
 
 /* Requires cpu_add_remove_lock to be held */
@@ -910,7 +933,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
 	 * If the current CPU state is in the range of the AP hotplug thread,
 	 * then we need to kick the thread once more.
 	 */
-	if (st->state >= CPUHP_KICK_AP_THREAD) {
+	if (st->state > CPUHP_BRINGUP_CPU) {
 		ret = cpuhp_kick_ap_work(cpu);
 		/*
 		 * The AP side has done the error rollback already. Just
@@ -922,10 +945,10 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
 
 	/*
 	 * Try to reach the target state. We max out on the BP at
-	 * CPUHP_KICK_AP_THREAD. After that the AP hotplug thread is
+	 * CPUHP_BRINGUP_CPU. After that the AP hotplug thread is
 	 * responsible for bringing it up to the target state.
 	 */
-	target = min((int)target, CPUHP_KICK_AP_THREAD);
+	target = min((int)target, CPUHP_BRINGUP_CPU);
 	ret = cpuhp_up_callbacks(cpu, st, cpuhp_bp_states, target);
 out:
 	cpu_hotplug_done();
@@ -1146,22 +1169,7 @@ static struct cpuhp_step cpuhp_bp_states[] = {
 		.teardown		= takedown_cpu,
 		.cant_stop		= true,
 	},
-	[CPUHP_CPU_SET_ACTIVE] = {
-		.name			= "cpu:active",
-		.startup		= cpuhp_set_cpu_active,
-		.teardown		= NULL,
-	},
-	[CPUHP_KICK_AP_THREAD] = {
-		.name			= "cpuhp:kickthread",
-		.startup		= cpuhp_kick_ap_work,
-		.teardown		= cpuhp_kick_ap_work,
-	},
 #endif
-	[CPUHP_BP_ONLINE] = {
-		.name			= "online",
-		.startup		= NULL,
-		.teardown		= NULL,
-	},
 };
 
 /* Application processor state steps */
@@ -1204,7 +1212,7 @@ static bool cpuhp_is_ap_state(enum cpuhp_state state)
 {
 	if (state >= CPUHP_AP_OFFLINE && state <= CPUHP_AP_ONLINE)
 		return true;
-	return state > CPUHP_BP_ONLINE;
+	return state > CPUHP_BRINGUP_CPU;
 }
 
 static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 544a7133cbd1..a4b9813afc96 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -4,6 +4,7 @@
 #include <linux/sched.h>
 #include <linux/cpu.h>
 #include <linux/cpuidle.h>
+#include <linux/cpuhotplug.h>
 #include <linux/tick.h>
 #include <linux/mm.h>
 #include <linux/stackprotector.h>
@@ -291,5 +292,6 @@ void cpu_startup_entry(enum cpuhp_state state)
 	boot_init_stack_canary();
 #endif
 	arch_cpu_idle_prepare();
+	cpuhp_online_idle(state);
 	cpu_idle_loop();
 }
-- 
cgit 


From e69aab13117efc1987620090e539b4ebeb33a04c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 26 Feb 2016 18:43:43 +0000
Subject: cpu/hotplug: Make wait for dead cpu completion based

Kill the busy spinning on the control side and just wait for the hotplugged
cpu to tell that it reached the dead state.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arch@vger.kernel.org
Cc: Rik van Riel <riel@redhat.com>
Cc: Rafael Wysocki <rafael.j.wysocki@intel.com>
Cc: "Srivatsa S. Bhat" <srivatsa@mit.edu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Turner <pjt@google.com>
Link: http://lkml.kernel.org/r/20160226182341.776157858@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/cpu.h        |  5 +++--
 include/linux/cpuhotplug.h |  1 +
 kernel/cpu.c               | 16 ++++++++++++----
 kernel/sched/idle.c        |  5 +----
 4 files changed, 17 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 83f35767016d..91a48d1b4ca0 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -276,14 +276,15 @@ void arch_cpu_idle_enter(void);
 void arch_cpu_idle_exit(void);
 void arch_cpu_idle_dead(void);
 
-DECLARE_PER_CPU(bool, cpu_dead_idle);
-
 int cpu_report_state(int cpu);
 int cpu_check_up_prepare(int cpu);
 void cpu_set_state_online(int cpu);
 #ifdef CONFIG_HOTPLUG_CPU
 bool cpu_wait_death(unsigned int cpu, int seconds);
 bool cpu_report_death(void);
+void cpuhp_report_idle_dead(void);
+#else
+static inline void cpuhp_report_idle_dead(void) { }
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
 
 #endif /* _LINUX_CPU_H_ */
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index ad5d7fcb0130..5d68e15e46b7 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -6,6 +6,7 @@ enum cpuhp_state {
 	CPUHP_CREATE_THREADS,
 	CPUHP_NOTIFY_PREPARE,
 	CPUHP_BRINGUP_CPU,
+	CPUHP_AP_IDLE_DEAD,
 	CPUHP_AP_OFFLINE,
 	CPUHP_AP_NOTIFY_STARTING,
 	CPUHP_AP_ONLINE,
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f1f880fac832..0e8c07f2566e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -688,6 +688,7 @@ static int take_cpu_down(void *_param)
 
 static int takedown_cpu(unsigned int cpu)
 {
+	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
 	int err;
 
 	/*
@@ -733,10 +734,8 @@ static int takedown_cpu(unsigned int cpu)
 	 *
 	 * Wait for the stop thread to go away.
 	 */
-	while (!per_cpu(cpu_dead_idle, cpu))
-		cpu_relax();
-	smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */
-	per_cpu(cpu_dead_idle, cpu) = false;
+	wait_for_completion(&st->done);
+	BUG_ON(st->state != CPUHP_AP_IDLE_DEAD);
 
 	/* Interrupts are moved away from the dying cpu, reenable alloc/free */
 	irq_unlock_sparse();
@@ -756,6 +755,15 @@ static int notify_dead(unsigned int cpu)
 	return 0;
 }
 
+void cpuhp_report_idle_dead(void)
+{
+	struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
+
+	BUG_ON(st->state != CPUHP_AP_OFFLINE);
+	st->state = CPUHP_AP_IDLE_DEAD;
+	complete(&st->done);
+}
+
 #else
 #define notify_down_prepare	NULL
 #define takedown_cpu		NULL
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index a4b9813afc96..8abbe89e9114 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -194,8 +194,6 @@ exit_idle:
 	rcu_idle_exit();
 }
 
-DEFINE_PER_CPU(bool, cpu_dead_idle);
-
 /*
  * Generic idle loop implementation
  *
@@ -224,8 +222,7 @@ static void cpu_idle_loop(void)
 			if (cpu_is_offline(smp_processor_id())) {
 				rcu_cpu_notify(NULL, CPU_DYING_IDLE,
 					       (void *)(long)smp_processor_id());
-				smp_mb(); /* all activity before dead. */
-				this_cpu_write(cpu_dead_idle, true);
+				cpuhp_report_idle_dead();
 				arch_cpu_idle_dead();
 			}
 
-- 
cgit 


From 27d50c7eeb0f03c3d3ca72aac4d2dd487ca1f3f0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 26 Feb 2016 18:43:44 +0000
Subject: rcu: Make CPU_DYING_IDLE an explicit call

Make the RCU CPU_DYING_IDLE callback an explicit function call, so it gets
invoked at the proper place.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arch@vger.kernel.org
Cc: Rik van Riel <riel@redhat.com>
Cc: Rafael Wysocki <rafael.j.wysocki@intel.com>
Cc: "Srivatsa S. Bhat" <srivatsa@mit.edu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Turner <pjt@google.com>
Link: http://lkml.kernel.org/r/20160226182341.870167933@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/cpu.h      |  4 +--
 include/linux/notifier.h |  2 ++
 include/linux/rcupdate.h |  4 +--
 kernel/cpu.c             |  1 +
 kernel/rcu/tree.c        | 70 +++++++++++++++++++++++++-----------------------
 kernel/sched/idle.c      |  2 --
 6 files changed, 42 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 91a48d1b4ca0..f9b1fab4388a 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -101,9 +101,7 @@ enum {
 					* Called on the new cpu, just before
 					* enabling interrupts. Must not sleep,
 					* must not fail */
-#define CPU_DYING_IDLE		0x000B /* CPU (unsigned)v dying, reached
-					* idle loop. */
-#define CPU_BROKEN		0x000C /* CPU (unsigned)v did not die properly,
+#define CPU_BROKEN		0x000B /* CPU (unsigned)v did not die properly,
 					* perhaps due to preemption. */
 
 /* Used for CPU hotplug events occurring while tasks are frozen due to a suspend
diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index d14a4c362465..4149868de4e6 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -47,6 +47,8 @@
  * runtime initialization.
  */
 
+struct notifier_block;
+
 typedef	int (*notifier_fn_t)(struct notifier_block *nb,
 			unsigned long action, void *data);
 
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 14e6f47ee16f..fc46fe3ea259 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -332,9 +332,7 @@ void rcu_init(void);
 void rcu_sched_qs(void);
 void rcu_bh_qs(void);
 void rcu_check_callbacks(int user);
-struct notifier_block;
-int rcu_cpu_notify(struct notifier_block *self,
-		   unsigned long action, void *hcpu);
+void rcu_report_dead(unsigned int cpu);
 
 #ifndef CONFIG_TINY_RCU
 void rcu_end_inkernel_boot(void);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 0e8c07f2566e..ff8059b76a85 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -762,6 +762,7 @@ void cpuhp_report_idle_dead(void)
 	BUG_ON(st->state != CPUHP_AP_OFFLINE);
 	st->state = CPUHP_AP_IDLE_DEAD;
 	complete(&st->done);
+	rcu_report_dead(smp_processor_id());
 }
 
 #else
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index e41dd4131f7a..85b41341272e 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2606,28 +2606,6 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
 	}
 }
 
-/*
- * The CPU is exiting the idle loop into the arch_cpu_idle_dead()
- * function.  We now remove it from the rcu_node tree's ->qsmaskinit
- * bit masks.
- */
-static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
-{
-	unsigned long flags;
-	unsigned long mask;
-	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
-	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
-
-	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
-		return;
-
-	/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
-	mask = rdp->grpmask;
-	raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
-	rnp->qsmaskinitnext &= ~mask;
-	raw_spin_unlock_irqrestore(&rnp->lock, flags);
-}
-
 /*
  * The CPU has been completely removed, and some other CPU is reporting
  * this fact from process context.  Do the remainder of the cleanup,
@@ -4247,6 +4225,43 @@ static void rcu_prepare_cpu(int cpu)
 		rcu_init_percpu_data(cpu, rsp);
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * The CPU is exiting the idle loop into the arch_cpu_idle_dead()
+ * function.  We now remove it from the rcu_node tree's ->qsmaskinit
+ * bit masks.
+ */
+static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
+{
+	unsigned long flags;
+	unsigned long mask;
+	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
+
+	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+		return;
+
+	/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
+	mask = rdp->grpmask;
+	raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
+	rnp->qsmaskinitnext &= ~mask;
+	raw_spin_unlock_irqrestore(&rnp->lock, flags);
+}
+
+void rcu_report_dead(unsigned int cpu)
+{
+	struct rcu_state *rsp;
+
+	/* QS for any half-done expedited RCU-sched GP. */
+	preempt_disable();
+	rcu_report_exp_rdp(&rcu_sched_state,
+			   this_cpu_ptr(rcu_sched_state.rda), true);
+	preempt_enable();
+	for_each_rcu_flavor(rsp)
+		rcu_cleanup_dying_idle_cpu(cpu, rsp);
+}
+#endif
+
 /*
  * Handle CPU online/offline notification events.
  */
@@ -4278,17 +4293,6 @@ int rcu_cpu_notify(struct notifier_block *self,
 		for_each_rcu_flavor(rsp)
 			rcu_cleanup_dying_cpu(rsp);
 		break;
-	case CPU_DYING_IDLE:
-		/* QS for any half-done expedited RCU-sched GP. */
-		preempt_disable();
-		rcu_report_exp_rdp(&rcu_sched_state,
-				   this_cpu_ptr(rcu_sched_state.rda), true);
-		preempt_enable();
-
-		for_each_rcu_flavor(rsp) {
-			rcu_cleanup_dying_idle_cpu(cpu, rsp);
-		}
-		break;
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
 	case CPU_UP_CANCELED:
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 8abbe89e9114..bd12c6c714ec 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -220,8 +220,6 @@ static void cpu_idle_loop(void)
 			rmb();
 
 			if (cpu_is_offline(smp_processor_id())) {
-				rcu_cpu_notify(NULL, CPU_DYING_IDLE,
-					       (void *)(long)smp_processor_id());
 				cpuhp_report_idle_dead();
 				arch_cpu_idle_dead();
 			}
-- 
cgit 


From 9da0f49c8767cc0ef6101cb21156cf4380ed50dd Mon Sep 17 00:00:00 2001
From: "Christopher S. Hall" <christopher.s.hall@intel.com>
Date: Mon, 22 Feb 2016 03:15:20 -0800
Subject: time: Add timekeeping snapshot code capturing system time and counter

In the current timekeeping code there isn't any interface to
atomically capture the current relationship between the system counter
and system time. ktime_get_snapshot() returns this triple (counter,
monotonic raw, realtime) in the system_time_snapshot struct.

Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Moved structure definitions around to clean things up,
 fixed cycles_t/cycle_t confusion.]
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/timekeeping.h | 18 ++++++++++++++++++
 kernel/time/timekeeping.c   | 30 ++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index ec89d846324c..7817591af46f 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -266,6 +266,24 @@ extern void timekeeping_inject_sleeptime64(struct timespec64 *delta);
 extern void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw,
 				        struct timespec64 *ts_real);
 
+/*
+ * struct system_time_snapshot - simultaneous raw/real time capture with
+ *	counter value
+ * @cycles:	Clocksource counter value to produce the system times
+ * @real:	Realtime system time
+ * @raw:	Monotonic raw system time
+ */
+struct system_time_snapshot {
+	cycle_t		cycles;
+	ktime_t		real;
+	ktime_t		raw;
+};
+
+/*
+ * Simultaneously snapshot realtime and monotonic raw clocks
+ */
+extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot);
+
 /*
  * Persistent clock related interfaces
  */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 4243d28177ac..89b4695bd083 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -874,6 +874,36 @@ time64_t __ktime_get_real_seconds(void)
 	return tk->xtime_sec;
 }
 
+/**
+ * ktime_get_snapshot - snapshots the realtime/monotonic raw clocks with counter
+ * @systime_snapshot:	pointer to struct receiving the system time snapshot
+ */
+void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
+{
+	struct timekeeper *tk = &tk_core.timekeeper;
+	unsigned long seq;
+	ktime_t base_raw;
+	ktime_t base_real;
+	s64 nsec_raw;
+	s64 nsec_real;
+	cycle_t now;
+
+	do {
+		seq = read_seqcount_begin(&tk_core.seq);
+
+		now = tk->tkr_mono.read(tk->tkr_mono.clock);
+		base_real = ktime_add(tk->tkr_mono.base,
+				      tk_core.timekeeper.offs_real);
+		base_raw = tk->tkr_raw.base;
+		nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now);
+		nsec_raw  = timekeeping_cycles_to_ns(&tk->tkr_raw, now);
+	} while (read_seqcount_retry(&tk_core.seq, seq));
+
+	systime_snapshot->cycles = now;
+	systime_snapshot->real = ktime_add_ns(base_real, nsec_real);
+	systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw);
+}
+EXPORT_SYMBOL_GPL(ktime_get_snapshot);
 
 #ifdef CONFIG_NTP_PPS
 
-- 
cgit 


From ba26621e63ce6dc481d90ab9f6902e058d4ea39a Mon Sep 17 00:00:00 2001
From: "Christopher S. Hall" <christopher.s.hall@intel.com>
Date: Mon, 22 Feb 2016 03:15:21 -0800
Subject: time: Remove duplicated code in ktime_get_raw_and_real()

The code in ktime_get_snapshot() is a superset of the code in
ktime_get_raw_and_real(). Further, ktime_get_raw_and_real() is
called only by the PPS code, pps_get_ts(). Consolidate the
pps_get_ts() code into a single function calling ktime_get_snapshot()
and eliminate ktime_get_raw_and_real(). A side effect of this is that
the raw and real results of pps_get_ts() correspond to exactly the
same clock cycle. Previously these values represented separate reads
of the system clock.

Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/pps_kernel.h | 17 ++++++-----------
 kernel/time/timekeeping.c  | 40 ++--------------------------------------
 2 files changed, 8 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pps_kernel.h b/include/linux/pps_kernel.h
index 54bf1484d41f..35ac903956c7 100644
--- a/include/linux/pps_kernel.h
+++ b/include/linux/pps_kernel.h
@@ -111,22 +111,17 @@ static inline void timespec_to_pps_ktime(struct pps_ktime *kt,
 	kt->nsec = ts.tv_nsec;
 }
 
-#ifdef CONFIG_NTP_PPS
-
 static inline void pps_get_ts(struct pps_event_time *ts)
 {
-	ktime_get_raw_and_real_ts64(&ts->ts_raw, &ts->ts_real);
-}
+	struct system_time_snapshot snap;
 
-#else /* CONFIG_NTP_PPS */
-
-static inline void pps_get_ts(struct pps_event_time *ts)
-{
-	ktime_get_real_ts64(&ts->ts_real);
+	ktime_get_snapshot(&snap);
+	ts->ts_real = ktime_to_timespec64(snap.real);
+#ifdef CONFIG_NTP_PPS
+	ts->ts_raw = ktime_to_timespec64(snap.raw);
+#endif
 }
 
-#endif /* CONFIG_NTP_PPS */
-
 /* Subtract known time delay from PPS event time(s) */
 static inline void pps_sub_ts(struct pps_event_time *ts, struct timespec64 delta)
 {
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 89b4695bd083..af19a49d5223 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -888,6 +888,8 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
 	s64 nsec_real;
 	cycle_t now;
 
+	WARN_ON_ONCE(timekeeping_suspended);
+
 	do {
 		seq = read_seqcount_begin(&tk_core.seq);
 
@@ -905,44 +907,6 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
 }
 EXPORT_SYMBOL_GPL(ktime_get_snapshot);
 
-#ifdef CONFIG_NTP_PPS
-
-/**
- * ktime_get_raw_and_real_ts64 - get day and raw monotonic time in timespec format
- * @ts_raw:	pointer to the timespec to be set to raw monotonic time
- * @ts_real:	pointer to the timespec to be set to the time of day
- *
- * This function reads both the time of day and raw monotonic time at the
- * same time atomically and stores the resulting timestamps in timespec
- * format.
- */
-void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, struct timespec64 *ts_real)
-{
-	struct timekeeper *tk = &tk_core.timekeeper;
-	unsigned long seq;
-	s64 nsecs_raw, nsecs_real;
-
-	WARN_ON_ONCE(timekeeping_suspended);
-
-	do {
-		seq = read_seqcount_begin(&tk_core.seq);
-
-		*ts_raw = tk->raw_time;
-		ts_real->tv_sec = tk->xtime_sec;
-		ts_real->tv_nsec = 0;
-
-		nsecs_raw  = timekeeping_get_ns(&tk->tkr_raw);
-		nsecs_real = timekeeping_get_ns(&tk->tkr_mono);
-
-	} while (read_seqcount_retry(&tk_core.seq, seq));
-
-	timespec64_add_ns(ts_raw, nsecs_raw);
-	timespec64_add_ns(ts_real, nsecs_real);
-}
-EXPORT_SYMBOL(ktime_get_raw_and_real_ts64);
-
-#endif /* CONFIG_NTP_PPS */
-
 /**
  * do_gettimeofday - Returns the time of day in a timeval
  * @tv:		pointer to the timeval to be set
-- 
cgit 


From 8006c24595cab106bcb9da12d35e32e14ff492df Mon Sep 17 00:00:00 2001
From: "Christopher S. Hall" <christopher.s.hall@intel.com>
Date: Mon, 22 Feb 2016 03:15:22 -0800
Subject: time: Add driver cross timestamp interface for higher precision time
 synchronization

ACKNOWLEDGMENT: cross timestamp code was developed by Thomas Gleixner
<tglx@linutronix.de>. It has changed considerably and any mistakes are
mine.

The precision with which events on multiple networked systems can be
synchronized using, as an example, PTP (IEEE 1588, 802.1AS) is limited
by the precision of the cross timestamps between the system clock and
the device (timestamp) clock. Precision here is the degree of
simultaneity when capturing the cross timestamp.

Currently the PTP cross timestamp is captured in software using the
PTP device driver ioctl PTP_SYS_OFFSET. Reads of the device clock are
interleaved with reads of the realtime clock. At best, the precision
of this cross timestamp is on the order of several microseconds due to
software latencies. Sub-microsecond precision is required for
industrial control and some media applications. To achieve this level
of precision hardware supported cross timestamping is needed.

The function get_device_system_crosststamp() allows device drivers
to return a cross timestamp with system time properly scaled to
nanoseconds.  The realtime value is needed to discipline that clock
using PTP and the monotonic raw value is used for applications that
don't require a "real" time, but need an unadjusted clock time.  The
get_device_system_crosststamp() code calls back into the driver to
ensure that the system counter is within the current timekeeping
update interval.

Modern Intel hardware provides an Always Running Timer (ART) which is
exactly related to TSC through a known frequency ratio. The ART is
routed to devices on the system and is used to precisely and
simultaneously capture the device clock with the ART.

Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Reworked to remove extra structures and simplify calling]
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/timekeeping.h | 35 ++++++++++++++++++++++++++++
 kernel/time/timekeeping.c   | 56 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 91 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index 7817591af46f..4a2ca65fc778 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -279,6 +279,41 @@ struct system_time_snapshot {
 	ktime_t		raw;
 };
 
+/*
+ * struct system_device_crosststamp - system/device cross-timestamp
+ *	(syncronized capture)
+ * @device:		Device time
+ * @sys_realtime:	Realtime simultaneous with device time
+ * @sys_monoraw:	Monotonic raw simultaneous with device time
+ */
+struct system_device_crosststamp {
+	ktime_t device;
+	ktime_t sys_realtime;
+	ktime_t sys_monoraw;
+};
+
+/*
+ * struct system_counterval_t - system counter value with the pointer to the
+ *	corresponding clocksource
+ * @cycles:	System counter value
+ * @cs:		Clocksource corresponding to system counter value. Used by
+ *	timekeeping code to verify comparibility of two cycle values
+ */
+struct system_counterval_t {
+	cycle_t			cycles;
+	struct clocksource	*cs;
+};
+
+/*
+ * Get cross timestamp between system clock and device clock
+ */
+extern int get_device_system_crosststamp(
+			int (*get_time_fn)(ktime_t *device_time,
+				struct system_counterval_t *system_counterval,
+				void *ctx),
+			void *ctx,
+			struct system_device_crosststamp *xtstamp);
+
 /*
  * Simultaneously snapshot realtime and monotonic raw clocks
  */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index af19a49d5223..dba595cdb200 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -907,6 +907,62 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
 }
 EXPORT_SYMBOL_GPL(ktime_get_snapshot);
 
+/**
+ * get_device_system_crosststamp - Synchronously capture system/device timestamp
+ * @sync_devicetime:	Callback to get simultaneous device time and
+ *	system counter from the device driver
+ * @xtstamp:		Receives simultaneously captured system and device time
+ *
+ * Reads a timestamp from a device and correlates it to system time
+ */
+int get_device_system_crosststamp(int (*get_time_fn)
+				  (ktime_t *device_time,
+				   struct system_counterval_t *sys_counterval,
+				   void *ctx),
+				  void *ctx,
+				  struct system_device_crosststamp *xtstamp)
+{
+	struct system_counterval_t system_counterval;
+	struct timekeeper *tk = &tk_core.timekeeper;
+	ktime_t base_real, base_raw;
+	s64 nsec_real, nsec_raw;
+	unsigned long seq;
+	int ret;
+
+	do {
+		seq = read_seqcount_begin(&tk_core.seq);
+		/*
+		 * Try to synchronously capture device time and a system
+		 * counter value calling back into the device driver
+		 */
+		ret = get_time_fn(&xtstamp->device, &system_counterval, ctx);
+		if (ret)
+			return ret;
+
+		/*
+		 * Verify that the clocksource associated with the captured
+		 * system counter value is the same as the currently installed
+		 * timekeeper clocksource
+		 */
+		if (tk->tkr_mono.clock != system_counterval.cs)
+			return -ENODEV;
+
+		base_real = ktime_add(tk->tkr_mono.base,
+				      tk_core.timekeeper.offs_real);
+		base_raw = tk->tkr_raw.base;
+
+		nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono,
+						     system_counterval.cycles);
+		nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw,
+						    system_counterval.cycles);
+	} while (read_seqcount_retry(&tk_core.seq, seq));
+
+	xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real);
+	xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(get_device_system_crosststamp);
+
 /**
  * do_gettimeofday - Returns the time of day in a timeval
  * @tv:		pointer to the timeval to be set
-- 
cgit 


From 2c756feb18d9ec258dbb3a3d11c47e28820690d7 Mon Sep 17 00:00:00 2001
From: "Christopher S. Hall" <christopher.s.hall@intel.com>
Date: Mon, 22 Feb 2016 03:15:23 -0800
Subject: time: Add history to cross timestamp interface supporting slower
 devices

Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.

In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).

From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as an intermediate
timebase. The transforms such an application would perform are:

    System Clock <-> Audio clock
    System Clock <-> Network Device Clock [<-> PTP Master Clock]

Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART, audio device clock).  The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which, to ensure uninterrupted audio
processing, communicates with and responds to the host only once every
millisecond. As a result, it takes up to a millisecond for the DSP to
receive a request; then the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operations occur
on a millisecond cadence.  This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.

Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_device_system_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.

When an accessor function is used, a history argument *must* be
provided. The history is initialized using ktime_get_snapshot(), which
must be called before the counter values are read.

Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/timekeeper_internal.h |   2 +
 include/linux/timekeeping.h         |   5 ++
 kernel/time/timekeeping.c           | 171 +++++++++++++++++++++++++++++++++++-
 3 files changed, 177 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index 25247220b4b7..e88005459035 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -50,6 +50,7 @@ struct tk_read_base {
  * @offs_tai:		Offset clock monotonic -> clock tai
  * @tai_offset:		The current UTC to TAI offset in seconds
  * @clock_was_set_seq:	The sequence number of clock was set events
+ * @cs_was_changed_seq:	The sequence number of clocksource change events
  * @next_leap_ktime:	CLOCK_MONOTONIC time value of a pending leap-second
  * @raw_time:		Monotonic raw base time in timespec64 format
  * @cycle_interval:	Number of clock cycles in one NTP interval
@@ -91,6 +92,7 @@ struct timekeeper {
 	ktime_t			offs_tai;
 	s32			tai_offset;
 	unsigned int		clock_was_set_seq;
+	u8			cs_was_changed_seq;
 	ktime_t			next_leap_ktime;
 	struct timespec64	raw_time;
 
diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index 4a2ca65fc778..96f37bee3bc1 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -272,11 +272,15 @@ extern void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw,
  * @cycles:	Clocksource counter value to produce the system times
  * @real:	Realtime system time
  * @raw:	Monotonic raw system time
+ * @clock_was_set_seq:	The sequence number of clock was set events
+ * @cs_was_changed_seq:	The sequence number of clocksource change events
  */
 struct system_time_snapshot {
 	cycle_t		cycles;
 	ktime_t		real;
 	ktime_t		raw;
+	unsigned int	clock_was_set_seq;
+	u8		cs_was_changed_seq;
 };
 
 /*
@@ -312,6 +316,7 @@ extern int get_device_system_crosststamp(
 				struct system_counterval_t *system_counterval,
 				void *ctx),
 			void *ctx,
+			struct system_time_snapshot *history,
 			struct system_device_crosststamp *xtstamp);
 
 /*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index dba595cdb200..931b0b1a71e9 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -233,6 +233,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
 	u64 tmp, ntpinterval;
 	struct clocksource *old_clock;
 
+	++tk->cs_was_changed_seq;
 	old_clock = tk->tkr_mono.clock;
 	tk->tkr_mono.clock = clock;
 	tk->tkr_mono.read = clock->read;
@@ -894,6 +895,8 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
 		seq = read_seqcount_begin(&tk_core.seq);
 
 		now = tk->tkr_mono.read(tk->tkr_mono.clock);
+		systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq;
+		systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq;
 		base_real = ktime_add(tk->tkr_mono.base,
 				      tk_core.timekeeper.offs_real);
 		base_raw = tk->tkr_raw.base;
@@ -907,10 +910,123 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
 }
 EXPORT_SYMBOL_GPL(ktime_get_snapshot);
 
+/* Scale base by mult/div checking for overflow */
+static int scale64_check_overflow(u64 mult, u64 div, u64 *base)
+{
+	u64 tmp, rem;
+
+	tmp = div64_u64_rem(*base, div, &rem);
+
+	if (((int)sizeof(u64)*8 - fls64(mult) < fls64(tmp)) ||
+	    ((int)sizeof(u64)*8 - fls64(mult) < fls64(rem)))
+		return -EOVERFLOW;
+	tmp *= mult;
+	rem *= mult;
+
+	do_div(rem, div);
+	*base = tmp + rem;
+	return 0;
+}
+
+/**
+ * adjust_historical_crosststamp - adjust crosstimestamp previous to current interval
+ * @history:			Snapshot representing start of history
+ * @partial_history_cycles:	Cycle offset into history (fractional part)
+ * @total_history_cycles:	Total history length in cycles
+ * @discontinuity:		True indicates clock was set on history period
+ * @ts:				Cross timestamp that should be adjusted using
+ *	partial/total ratio
+ *
+ * Helper function used by get_device_system_crosststamp() to correct the
+ * crosstimestamp corresponding to the start of the current interval to the
+ * system counter value (timestamp point) provided by the driver. The
+ * total_history_* quantities are the total history starting at the provided
+ * reference point and ending at the start of the current interval. The cycle
+ * count between the driver timestamp point and the start of the current
+ * interval is partial_history_cycles.
+ */
+static int adjust_historical_crosststamp(struct system_time_snapshot *history,
+					 cycle_t partial_history_cycles,
+					 cycle_t total_history_cycles,
+					 bool discontinuity,
+					 struct system_device_crosststamp *ts)
+{
+	struct timekeeper *tk = &tk_core.timekeeper;
+	u64 corr_raw, corr_real;
+	bool interp_forward;
+	int ret;
+
+	if (total_history_cycles == 0 || partial_history_cycles == 0)
+		return 0;
+
+	/* Interpolate shortest distance from beginning or end of history */
+	interp_forward = partial_history_cycles > total_history_cycles/2 ?
+		true : false;
+	partial_history_cycles = interp_forward ?
+		total_history_cycles - partial_history_cycles :
+		partial_history_cycles;
+
+	/*
+	 * Scale the monotonic raw time delta by:
+	 *	partial_history_cycles / total_history_cycles
+	 */
+	corr_raw = (u64)ktime_to_ns(
+		ktime_sub(ts->sys_monoraw, history->raw));
+	ret = scale64_check_overflow(partial_history_cycles,
+				     total_history_cycles, &corr_raw);
+	if (ret)
+		return ret;
+
+	/*
+	 * If there is a discontinuity in the history, scale monotonic raw
+	 *	correction by:
+	 *	mult(real)/mult(raw) yielding the realtime correction
+	 * Otherwise, calculate the realtime correction similar to monotonic
+	 *	raw calculation
+	 */
+	if (discontinuity) {
+		corr_real = mul_u64_u32_div
+			(corr_raw, tk->tkr_mono.mult, tk->tkr_raw.mult);
+	} else {
+		corr_real = (u64)ktime_to_ns(
+			ktime_sub(ts->sys_realtime, history->real));
+		ret = scale64_check_overflow(partial_history_cycles,
+					     total_history_cycles, &corr_real);
+		if (ret)
+			return ret;
+	}
+
+	/* Fixup monotonic raw and realtime values */
+	if (interp_forward) {
+		ts->sys_monoraw = ktime_add_ns(history->raw, corr_raw);
+		ts->sys_realtime = ktime_add_ns(history->real, corr_real);
+	} else {
+		ts->sys_monoraw = ktime_sub_ns(ts->sys_monoraw, corr_raw);
+		ts->sys_realtime = ktime_sub_ns(ts->sys_realtime, corr_real);
+	}
+
+	return 0;
+}
+
+/*
+ * cycle_between - true if test occurs chronologically between before and after
+ */
+static bool cycle_between(cycle_t before, cycle_t test, cycle_t after)
+{
+	if (test > before && test < after)
+		return true;
+	if (test < before && before > after)
+		return true;
+	return false;
+}
+
 /**
  * get_device_system_crosststamp - Synchronously capture system/device timestamp
- * @sync_devicetime:	Callback to get simultaneous device time and
+ * @get_time_fn:	Callback to get simultaneous device time and
  *	system counter from the device driver
+ * @ctx:		Context passed to get_time_fn()
+ * @history_begin:	Historical reference point used to interpolate system
+ *	time when counter provided by the driver is before the current interval
  * @xtstamp:		Receives simultaneously captured system and device time
  *
  * Reads a timestamp from a device and correlates it to system time
@@ -920,13 +1036,18 @@ int get_device_system_crosststamp(int (*get_time_fn)
 				   struct system_counterval_t *sys_counterval,
 				   void *ctx),
 				  void *ctx,
+				  struct system_time_snapshot *history_begin,
 				  struct system_device_crosststamp *xtstamp)
 {
 	struct system_counterval_t system_counterval;
 	struct timekeeper *tk = &tk_core.timekeeper;
+	cycle_t cycles, now, interval_start;
+	unsigned int clock_was_set_seq;
 	ktime_t base_real, base_raw;
 	s64 nsec_real, nsec_raw;
+	u8 cs_was_changed_seq;
 	unsigned long seq;
+	bool do_interp;
 	int ret;
 
 	do {
@@ -946,6 +1067,22 @@ int get_device_system_crosststamp(int (*get_time_fn)
 		 */
 		if (tk->tkr_mono.clock != system_counterval.cs)
 			return -ENODEV;
+		cycles = system_counterval.cycles;
+
+		/*
+		 * Check whether the system counter value provided by the
+		 * device driver is on the current timekeeping interval.
+		 */
+		now = tk->tkr_mono.read(tk->tkr_mono.clock);
+		interval_start = tk->tkr_mono.cycle_last;
+		if (!cycle_between(interval_start, cycles, now)) {
+			clock_was_set_seq = tk->clock_was_set_seq;
+			cs_was_changed_seq = tk->cs_was_changed_seq;
+			cycles = interval_start;
+			do_interp = true;
+		} else {
+			do_interp = false;
+		}
 
 		base_real = ktime_add(tk->tkr_mono.base,
 				      tk_core.timekeeper.offs_real);
@@ -959,6 +1096,38 @@ int get_device_system_crosststamp(int (*get_time_fn)
 
 	xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real);
 	xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw);
+
+	/*
+	 * Interpolate if necessary, adjusting back from the start of the
+	 * current interval
+	 */
+	if (do_interp) {
+		cycle_t partial_history_cycles, total_history_cycles;
+		bool discontinuity;
+
+		/*
+		 * Check that the counter value occurs after the provided
+		 * history reference and that the history doesn't cross a
+		 * clocksource change
+		 */
+		if (!history_begin ||
+		    !cycle_between(history_begin->cycles,
+				   system_counterval.cycles, cycles) ||
+		    history_begin->cs_was_changed_seq != cs_was_changed_seq)
+			return -EINVAL;
+		partial_history_cycles = cycles - system_counterval.cycles;
+		total_history_cycles = cycles - history_begin->cycles;
+		discontinuity =
+			history_begin->clock_was_set_seq != clock_was_set_seq;
+
+		ret = adjust_historical_crosststamp(history_begin,
+						    partial_history_cycles,
+						    total_history_cycles,
+						    discontinuity, xtstamp);
+		if (ret)
+			return ret;
+	}
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(get_device_system_crosststamp);
-- 
cgit 


From 82e88ff1ea948d83125a8aaa7c9809f03ccc500f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 3 Mar 2016 11:11:12 +0100
Subject: hrtimer: Revert CLOCK_MONOTONIC_RAW support

Revert commits:
a6e707ddbdf1: KVM: arm/arm64: timer: Switch to CLOCK_MONOTONIC_RAW
9006a01829a5: hrtimer: Catch illegal clockids
9c808765e88e: hrtimer: Add support for CLOCK_MONOTONIC_RAW

Marc found out that there are fundamental issues with that patch series
because __hrtimer_get_next_event() and hrtimer_forward() need support for
CLOCK_MONOTONIC_RAW. Nothing which is easily fixed, so revert the whole lot.

Reported-by: Marc Zyngier <marc.zyngier@arm.com>
Link: http://lkml.kernel.org/r/56D6CEF0.8060607@arm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h   |  1 -
 kernel/time/hrtimer.c     | 18 ++----------------
 virt/kvm/arm/arch_timer.c |  4 ++--
 3 files changed, 4 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index a6d64af5e73f..76dd4f0da5ca 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -151,7 +151,6 @@ enum  hrtimer_base_type {
 	HRTIMER_BASE_REALTIME,
 	HRTIMER_BASE_BOOTTIME,
 	HRTIMER_BASE_TAI,
-	HRTIMER_BASE_MONOTONIC_RAW,
 	HRTIMER_MAX_CLOCK_BASES,
 };
 
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index cb0fe70f3c51..435b8850dd80 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -90,30 +90,19 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 			.clockid = CLOCK_TAI,
 			.get_time = &ktime_get_clocktai,
 		},
-		{
-			.index = HRTIMER_BASE_MONOTONIC_RAW,
-			.clockid = CLOCK_MONOTONIC_RAW,
-			.get_time = &ktime_get_raw,
-		},
 	}
 };
 
 static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
-	/* Make sure we catch unsupported clockids */
-	[0 ... MAX_CLOCKS - 1]	= HRTIMER_MAX_CLOCK_BASES,
-
 	[CLOCK_REALTIME]	= HRTIMER_BASE_REALTIME,
 	[CLOCK_MONOTONIC]	= HRTIMER_BASE_MONOTONIC,
-	[CLOCK_MONOTONIC_RAW]	= HRTIMER_BASE_MONOTONIC_RAW,
 	[CLOCK_BOOTTIME]	= HRTIMER_BASE_BOOTTIME,
 	[CLOCK_TAI]		= HRTIMER_BASE_TAI,
 };
 
 static inline int hrtimer_clockid_to_base(clockid_t clock_id)
 {
-	int base = hrtimer_clock_to_base_table[clock_id];
-	BUG_ON(base == HRTIMER_MAX_CLOCK_BASES);
-	return base;
+	return hrtimer_clock_to_base_table[clock_id];
 }
 
 /*
@@ -1279,10 +1268,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
 		if (!(active & 0x01))
 			continue;
 
-		if (unlikely(base->index == HRTIMER_BASE_MONOTONIC_RAW))
-			basenow = ktime_get_raw();
-		else
-			basenow = ktime_add(now, base->offset);
+		basenow = ktime_add(now, base->offset);
 
 		while ((node = timerqueue_getnext(&base->active))) {
 			struct hrtimer *timer;
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index 97c58153f923..69bca185c471 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -48,7 +48,7 @@ static bool timer_is_armed(struct arch_timer_cpu *timer)
 static void timer_arm(struct arch_timer_cpu *timer, u64 ns)
 {
 	timer->armed = true;
-	hrtimer_start(&timer->timer, ktime_add_ns(ktime_get_raw(), ns),
+	hrtimer_start(&timer->timer, ktime_add_ns(ktime_get(), ns),
 		      HRTIMER_MODE_ABS);
 }
 
@@ -308,7 +308,7 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
 	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
 
 	INIT_WORK(&timer->expired, kvm_timer_inject_irq_work);
-	hrtimer_init(&timer->timer, CLOCK_MONOTONIC_RAW, HRTIMER_MODE_ABS);
+	hrtimer_init(&timer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 	timer->timer.function = kvm_timer_expire;
 }
 
-- 
cgit 


From 719f1aa4a67199a3c4c68a03f94e5ec44d9d5f82 Mon Sep 17 00:00:00 2001
From: "Christopher S. Hall" <christopher.s.hall@intel.com>
Date: Mon, 22 Feb 2016 03:15:25 -0800
Subject: ptp: Add PTP_SYS_OFFSET_PRECISE for driver crosstimestamping

Currently, network/system cross-timestamping is performed in the
PTP_SYS_OFFSET ioctl. The PTP clock driver reads gettimeofday() and
the gettime64() callback provided by the driver. The cross-timestamp
is best effort where the latency between the capture of system time
(getnstimeofday()) and the device time (driver callback) may be
significant.

The getcrosststamp() callback and corresponding PTP_SYS_OFFSET_PRECISE
ioctl allow the driver to perform this device/system correlation when,
for example, cross timestamp hardware is available. Modern Intel
systems can do this for onboard Ethernet controllers using the ART
counter. There is virtually zero latency between captures of the ART
and network device clock.

The capabilities ioctl (PTP_CLOCK_GETCAPS) is augmented, allowing
applications to query whether or not drivers implement the
getcrosststamp callback, which provides more precise cross timestamping.

Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Acked-by: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Commit subject tweaks]
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 Documentation/ptp/testptp.c      |  6 ++++--
 drivers/ptp/ptp_chardev.c        | 27 +++++++++++++++++++++++++++
 include/linux/ptp_clock_kernel.h |  8 ++++++++
 include/uapi/linux/ptp_clock.h   | 13 ++++++++++++-
 4 files changed, 51 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ptp/testptp.c b/Documentation/ptp/testptp.c
index 6c6247aaa7b9..d99012f41602 100644
--- a/Documentation/ptp/testptp.c
+++ b/Documentation/ptp/testptp.c
@@ -277,13 +277,15 @@ int main(int argc, char *argv[])
 			       "  %d external time stamp channels\n"
 			       "  %d programmable periodic signals\n"
 			       "  %d pulse per second\n"
-			       "  %d programmable pins\n",
+			       "  %d programmable pins\n"
+			       "  %d cross timestamping\n",
 			       caps.max_adj,
 			       caps.n_alarm,
 			       caps.n_ext_ts,
 			       caps.n_per_out,
 			       caps.pps,
-			       caps.n_pins);
+			       caps.n_pins,
+			       caps.cross_timestamping);
 		}
 	}
 
diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c
index da7bae991552..579fd65299a0 100644
--- a/drivers/ptp/ptp_chardev.c
+++ b/drivers/ptp/ptp_chardev.c
@@ -22,6 +22,7 @@
 #include <linux/poll.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/timekeeping.h>
 
 #include "ptp_private.h"
 
@@ -120,11 +121,13 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
 	struct ptp_clock_caps caps;
 	struct ptp_clock_request req;
 	struct ptp_sys_offset *sysoff = NULL;
+	struct ptp_sys_offset_precise precise_offset;
 	struct ptp_pin_desc pd;
 	struct ptp_clock *ptp = container_of(pc, struct ptp_clock, clock);
 	struct ptp_clock_info *ops = ptp->info;
 	struct ptp_clock_time *pct;
 	struct timespec64 ts;
+	struct system_device_crosststamp xtstamp;
 	int enable, err = 0;
 	unsigned int i, pin_index;
 
@@ -138,6 +141,7 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
 		caps.n_per_out = ptp->info->n_per_out;
 		caps.pps = ptp->info->pps;
 		caps.n_pins = ptp->info->n_pins;
+		caps.cross_timestamping = ptp->info->getcrosststamp != NULL;
 		if (copy_to_user((void __user *)arg, &caps, sizeof(caps)))
 			err = -EFAULT;
 		break;
@@ -180,6 +184,29 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
 		err = ops->enable(ops, &req, enable);
 		break;
 
+	case PTP_SYS_OFFSET_PRECISE:
+		if (!ptp->info->getcrosststamp) {
+			err = -EOPNOTSUPP;
+			break;
+		}
+		err = ptp->info->getcrosststamp(ptp->info, &xtstamp);
+		if (err)
+			break;
+
+		ts = ktime_to_timespec64(xtstamp.device);
+		precise_offset.device.sec = ts.tv_sec;
+		precise_offset.device.nsec = ts.tv_nsec;
+		ts = ktime_to_timespec64(xtstamp.sys_realtime);
+		precise_offset.sys_realtime.sec = ts.tv_sec;
+		precise_offset.sys_realtime.nsec = ts.tv_nsec;
+		ts = ktime_to_timespec64(xtstamp.sys_monoraw);
+		precise_offset.sys_monoraw.sec = ts.tv_sec;
+		precise_offset.sys_monoraw.nsec = ts.tv_nsec;
+		if (copy_to_user((void __user *)arg, &precise_offset,
+				 sizeof(precise_offset)))
+			err = -EFAULT;
+		break;
+
 	case PTP_SYS_OFFSET:
 		sysoff = kmalloc(sizeof(*sysoff), GFP_KERNEL);
 		if (!sysoff) {
diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h
index b8b73066d137..6b15e168148a 100644
--- a/include/linux/ptp_clock_kernel.h
+++ b/include/linux/ptp_clock_kernel.h
@@ -38,6 +38,7 @@ struct ptp_clock_request {
 	};
 };
 
+struct system_device_crosststamp;
 /**
  * struct ptp_clock_info - decribes a PTP hardware clock
  *
@@ -67,6 +68,11 @@ struct ptp_clock_request {
  * @gettime64:  Reads the current time from the hardware clock.
  *              parameter ts: Holds the result.
  *
+ * @getcrosststamp:  Reads the current time from the hardware clock and
+ *                   system clock simultaneously.
+ *                   parameter cts: Contains timestamp (device,system) pair,
+ *                   where system time is realtime and monotonic.
+ *
  * @settime64:  Set the current time on the hardware clock.
  *              parameter ts: Time value to set.
  *
@@ -105,6 +111,8 @@ struct ptp_clock_info {
 	int (*adjfreq)(struct ptp_clock_info *ptp, s32 delta);
 	int (*adjtime)(struct ptp_clock_info *ptp, s64 delta);
 	int (*gettime64)(struct ptp_clock_info *ptp, struct timespec64 *ts);
+	int (*getcrosststamp)(struct ptp_clock_info *ptp,
+			      struct system_device_crosststamp *cts);
 	int (*settime64)(struct ptp_clock_info *p, const struct timespec64 *ts);
 	int (*enable)(struct ptp_clock_info *ptp,
 		      struct ptp_clock_request *request, int on);
diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h
index f0b7bfe5da92..ac6dded80ffa 100644
--- a/include/uapi/linux/ptp_clock.h
+++ b/include/uapi/linux/ptp_clock.h
@@ -51,7 +51,9 @@ struct ptp_clock_caps {
 	int n_per_out; /* Number of programmable periodic signals. */
 	int pps;       /* Whether the clock supports a PPS callback. */
 	int n_pins;    /* Number of input/output pins. */
-	int rsv[14];   /* Reserved for future use. */
+	/* Whether the clock supports precise system-device cross timestamps */
+	int cross_timestamping;
+	int rsv[13];   /* Reserved for future use. */
 };
 
 struct ptp_extts_request {
@@ -81,6 +83,13 @@ struct ptp_sys_offset {
 	struct ptp_clock_time ts[2 * PTP_MAX_SAMPLES + 1];
 };
 
+struct ptp_sys_offset_precise {
+	struct ptp_clock_time device;
+	struct ptp_clock_time sys_realtime;
+	struct ptp_clock_time sys_monoraw;
+	unsigned int rsv[4];    /* Reserved for future use. */
+};
+
 enum ptp_pin_function {
 	PTP_PF_NONE,
 	PTP_PF_EXTTS,
@@ -124,6 +133,8 @@ struct ptp_pin_desc {
 #define PTP_SYS_OFFSET     _IOW(PTP_CLK_MAGIC, 5, struct ptp_sys_offset)
 #define PTP_PIN_GETFUNC    _IOWR(PTP_CLK_MAGIC, 6, struct ptp_pin_desc)
 #define PTP_PIN_SETFUNC    _IOW(PTP_CLK_MAGIC, 7, struct ptp_pin_desc)
+#define PTP_SYS_OFFSET_PRECISE \
+	_IOWR(PTP_CLK_MAGIC, 8, struct ptp_sys_offset_precise)
 
 struct ptp_extts_event {
 	struct ptp_clock_time t; /* Time event occured. */
-- 
cgit