Diffstat (limited to 'arch/x86/include/asm')
63 files changed, 943 insertions, 528 deletions
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 4dd1f2d770af..aeac434c9feb 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -9,3 +9,4 @@ generic-y += cputime.h  generic-y += dma-contiguous.h  generic-y += early_ioremap.h  generic-y += mcs_spinlock.h +generic-y += mm-arch-hooks.h diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index c8393634ca0c..ebf6d5e5668c 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -313,7 +313,6 @@ struct apic {  	/* wakeup_secondary_cpu */  	int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip); -	bool wait_for_init_deassert;  	void (*inquire_remote_apic)(int apicid);  	/* apic ops */ @@ -378,7 +377,6 @@ extern struct apic *__apicdrivers[], *__apicdrivers_end[];   * APIC functionality to boot other CPUs - only used on SMP:   */  #ifdef CONFIG_SMP -extern atomic_t init_deasserted;  extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip);  #endif diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h index 9686c3d9ff73..259a7c1ef709 100644 --- a/arch/x86/include/asm/arch_hweight.h +++ b/arch/x86/include/asm/arch_hweight.h @@ -21,7 +21,7 @@   * ARCH_HWEIGHT_CFLAGS in <arch/x86/Kconfig> for the respective   * compiler switches.   */ -static inline unsigned int __arch_hweight32(unsigned int w) +static __always_inline unsigned int __arch_hweight32(unsigned int w)  {  	unsigned int res = 0; @@ -42,20 +42,23 @@ static inline unsigned int __arch_hweight8(unsigned int w)  	return __arch_hweight32(w & 0xff);  } +#ifdef CONFIG_X86_32  static inline unsigned long __arch_hweight64(__u64 w)  { -	unsigned long res = 0; - -#ifdef CONFIG_X86_32  	return  __arch_hweight32((u32)w) +  		__arch_hweight32((u32)(w >> 32)); +}  #else +static __always_inline unsigned long __arch_hweight64(__u64 w) +{ +	unsigned long res = 0; +  	asm (ALTERNATIVE("call __sw_hweight64", POPCNT64, X86_FEATURE_POPCNT)  		     : "="REG_OUT (res)  		     : REG_IN (w)); -#endif /* CONFIG_X86_32 */  	return res;  } +#endif /* CONFIG_X86_32 */  #endif diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index e9168955c42f..fb52aa644aab 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -182,6 +182,21 @@ static inline int atomic_xchg(atomic_t *v, int new)  	return xchg(&v->counter, new);  } +#define ATOMIC_OP(op)							\ +static inline void atomic_##op(int i, atomic_t *v)			\ +{									\ +	asm volatile(LOCK_PREFIX #op"l %1,%0"				\ +			: "+m" (v->counter)				\ +			: "ir" (i)					\ +			: "memory");					\ +} + +ATOMIC_OP(and) +ATOMIC_OP(or) +ATOMIC_OP(xor) + +#undef ATOMIC_OP +  /**   * __atomic_add_unless - add unless the number is already a given value   * @v: pointer of type atomic_t @@ -219,16 +234,6 @@ static __always_inline short int atomic_inc_short(short int *v)  	return *v;  } -/* These are x86-specific, used by some header files */ -#define atomic_clear_mask(mask, addr)				\ -	asm volatile(LOCK_PREFIX "andl %0,%1"			\ -		     : : "r" (~(mask)), "m" (*(addr)) : "memory") - -#define atomic_set_mask(mask, addr)				\ -	asm volatile(LOCK_PREFIX "orl %0,%1"			\ -		     : : "r" ((unsigned)(mask)), "m" (*(addr))	\ -		     : "memory") -  #ifdef CONFIG_X86_32  # include <asm/atomic64_32.h>  #else diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index b154de75c90c..a11c30b77fb5 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ 
b/arch/x86/include/asm/atomic64_32.h @@ -313,4 +313,18 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v)  #undef alternative_atomic64  #undef __alternative_atomic64 +#define ATOMIC64_OP(op, c_op)						\ +static inline void atomic64_##op(long long i, atomic64_t *v)		\ +{									\ +	long long old, c = 0;						\ +	while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c)		\ +		c = old;						\ +} + +ATOMIC64_OP(and, &) +ATOMIC64_OP(or, |) +ATOMIC64_OP(xor, ^) + +#undef ATOMIC64_OP +  #endif /* _ASM_X86_ATOMIC64_32_H */ diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index b965f9e03f2a..50e33eff58de 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -220,4 +220,19 @@ static inline long atomic64_dec_if_positive(atomic64_t *v)  	return dec;  } +#define ATOMIC64_OP(op)							\ +static inline void atomic64_##op(long i, atomic64_t *v)			\ +{									\ +	asm volatile(LOCK_PREFIX #op"q %1,%0"				\ +			: "+m" (v->counter)				\ +			: "er" (i)					\ +			: "memory");					\ +} + +ATOMIC64_OP(and) +ATOMIC64_OP(or) +ATOMIC64_OP(xor) + +#undef ATOMIC64_OP +  #endif /* _ASM_X86_ATOMIC64_64_H */ diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index e51a8f803f55..0681d2532527 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -57,12 +57,12 @@  do {									\  	compiletime_assert_atomic_type(*p);				\  	smp_mb();							\ -	ACCESS_ONCE(*p) = (v);						\ +	WRITE_ONCE(*p, v);						\  } while (0)  #define smp_load_acquire(p)						\  ({									\ -	typeof(*p) ___p1 = ACCESS_ONCE(*p);				\ +	typeof(*p) ___p1 = READ_ONCE(*p);				\  	compiletime_assert_atomic_type(*p);				\  	smp_mb();							\  	___p1;								\ @@ -74,12 +74,12 @@ do {									\  do {									\  	compiletime_assert_atomic_type(*p);				\  	barrier();							\ -	ACCESS_ONCE(*p) = (v);						\ +	WRITE_ONCE(*p, v);						\  } while (0)  #define smp_load_acquire(p)						\  ({									\ -	typeof(*p) ___p1 = ACCESS_ONCE(*p);				\ +	typeof(*p) ___p1 = READ_ONCE(*p);				\  	compiletime_assert_atomic_type(*p);				\  	barrier();							\  	___p1;								\ @@ -91,15 +91,4 @@ do {									\  #define smp_mb__before_atomic()	barrier()  #define smp_mb__after_atomic()	barrier() -/* - * Stop RDTSC speculation. This is needed when you need to use RDTSC - * (or get_cycles or vread that possibly accesses the TSC) in a defined - * code region. 
- */ -static __always_inline void rdtsc_barrier(void) -{ -	alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, -			  "lfence", X86_FEATURE_LFENCE_RDTSC); -} -  #endif /* _ASM_X86_BARRIER_H */ diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 9bf3ea14b9f0..e63aa38e85fb 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -89,6 +89,8 @@ int set_pages_rw(struct page *page, int numpages);  void clflush_cache_range(void *addr, unsigned int size); +#define mmio_flush_range(addr, size) clflush_cache_range(addr, size) +  #ifdef CONFIG_DEBUG_RODATA  void mark_rodata_ro(void);  extern const int rodata_test_data; @@ -109,75 +111,4 @@ static inline int rodata_test(void)  }  #endif -#ifdef ARCH_HAS_NOCACHE_UACCESS - -/** - * arch_memcpy_to_pmem - copy data to persistent memory - * @dst: destination buffer for the copy - * @src: source buffer for the copy - * @n: length of the copy in bytes - * - * Copy data to persistent memory media via non-temporal stores so that - * a subsequent arch_wmb_pmem() can flush cpu and memory controller - * write buffers to guarantee durability. - */ -static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src, -		size_t n) -{ -	int unwritten; - -	/* -	 * We are copying between two kernel buffers, if -	 * __copy_from_user_inatomic_nocache() returns an error (page -	 * fault) we would have already reported a general protection fault -	 * before the WARN+BUG. -	 */ -	unwritten = __copy_from_user_inatomic_nocache((void __force *) dst, -			(void __user *) src, n); -	if (WARN(unwritten, "%s: fault copying %p <- %p unwritten: %d\n", -				__func__, dst, src, unwritten)) -		BUG(); -} - -/** - * arch_wmb_pmem - synchronize writes to persistent memory - * - * After a series of arch_memcpy_to_pmem() operations this drains data - * from cpu write buffers and any platform (memory controller) buffers - * to ensure that written data is durable on persistent memory media. - */ -static inline void arch_wmb_pmem(void) -{ -	/* -	 * wmb() to 'sfence' all previous writes such that they are -	 * architecturally visible to 'pcommit'.  Note, that we've -	 * already arranged for pmem writes to avoid the cache via -	 * arch_memcpy_to_pmem(). -	 */ -	wmb(); -	pcommit_sfence(); -} - -static inline bool __arch_has_wmb_pmem(void) -{ -#ifdef CONFIG_X86_64 -	/* -	 * We require that wmb() be an 'sfence', that is only guaranteed on -	 * 64-bit builds -	 */ -	return static_cpu_has(X86_FEATURE_PCOMMIT); -#else -	return false; -#endif -} -#else /* ARCH_HAS_NOCACHE_UACCESS i.e. 
ARCH=um */ -extern void arch_memcpy_to_pmem(void __pmem *dst, const void *src, size_t n); -extern void arch_wmb_pmem(void); - -static inline bool __arch_has_wmb_pmem(void) -{ -	return false; -} -#endif -  #endif /* _ASM_X86_CACHEFLUSH_H */ diff --git a/arch/x86/include/asm/context_tracking.h b/arch/x86/include/asm/context_tracking.h deleted file mode 100644 index 1fe49704b146..000000000000 --- a/arch/x86/include/asm/context_tracking.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef _ASM_X86_CONTEXT_TRACKING_H -#define _ASM_X86_CONTEXT_TRACKING_H - -#ifdef CONFIG_CONTEXT_TRACKING -# define SCHEDULE_USER call schedule_user -#else -# define SCHEDULE_USER call schedule -#endif - -#endif diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 3d6606fb97d0..477fc28050e4 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -119,6 +119,7 @@  #define X86_FEATURE_TM2		( 4*32+ 8) /* Thermal Monitor 2 */  #define X86_FEATURE_SSSE3	( 4*32+ 9) /* Supplemental SSE-3 */  #define X86_FEATURE_CID		( 4*32+10) /* Context ID */ +#define X86_FEATURE_SDBG	( 4*32+11) /* Silicon Debug */  #define X86_FEATURE_FMA		( 4*32+12) /* Fused multiply-add */  #define X86_FEATURE_CX16	( 4*32+13) /* CMPXCHG16B */  #define X86_FEATURE_XTPR	( 4*32+14) /* Send Task Priority Messages */ @@ -176,6 +177,7 @@  #define X86_FEATURE_PERFCTR_NB  ( 6*32+24) /* NB performance counter extensions */  #define X86_FEATURE_BPEXT	(6*32+26) /* data breakpoint extension */  #define X86_FEATURE_PERFCTR_L2	( 6*32+28) /* L2 performance counter extensions */ +#define X86_FEATURE_MWAITX	( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */  /*   * Auxiliary flags: Linux defined - For features scattered in various diff --git a/arch/x86/include/asm/delay.h b/arch/x86/include/asm/delay.h index 9b3b4f2754c7..36a760bda462 100644 --- a/arch/x86/include/asm/delay.h +++ b/arch/x86/include/asm/delay.h @@ -4,5 +4,6 @@  #include <asm-generic/delay.h>  void use_tsc_delay(void); +void use_mwaitx_delay(void);  #endif /* _ASM_X86_DELAY_H */ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index a0bf89fd2647..4e10d73cf018 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -280,21 +280,6 @@ static inline void clear_LDT(void)  	set_ldt(NULL, 0);  } -/* - * load one particular LDT into the current CPU - */ -static inline void load_LDT_nolock(mm_context_t *pc) -{ -	set_ldt(pc->ldt, pc->size); -} - -static inline void load_LDT(mm_context_t *pc) -{ -	preempt_disable(); -	load_LDT_nolock(pc); -	preempt_enable(); -} -  static inline unsigned long get_desc_base(const struct desc_struct *desc)  {  	return (unsigned)(desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24)); diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 1f5b7287d1ad..953b7263f844 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -12,7 +12,6 @@  #include <linux/dma-attrs.h>  #include <asm/io.h>  #include <asm/swiotlb.h> -#include <asm-generic/dma-coherent.h>  #include <linux/dma-contiguous.h>  #ifdef CONFIG_ISA @@ -41,24 +40,13 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)  #endif  } -#include <asm-generic/dma-mapping-common.h> - -/* Make sure we keep the same behaviour */ -static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) -{ -	struct dma_map_ops *ops = get_dma_ops(dev); -	debug_dma_mapping_error(dev, dma_addr); -	if (ops->mapping_error) -		return 
ops->mapping_error(dev, dma_addr); - -	return (dma_addr == DMA_ERROR_CODE); -} - -#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) -#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) +bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp); +#define arch_dma_alloc_attrs arch_dma_alloc_attrs +#define HAVE_ARCH_DMA_SUPPORTED 1  extern int dma_supported(struct device *hwdev, u64 mask); -extern int dma_set_mask(struct device *dev, u64 mask); + +#include <asm-generic/dma-mapping-common.h>  extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,  					dma_addr_t *dma_addr, gfp_t flag, @@ -125,16 +113,4 @@ static inline gfp_t dma_alloc_coherent_gfp_flags(struct device *dev, gfp_t gfp)         return gfp;  } -#define dma_alloc_coherent(d,s,h,f)	dma_alloc_attrs(d,s,h,f,NULL) - -void * -dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, -		gfp_t gfp, struct dma_attrs *attrs); - -#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL) - -void dma_free_attrs(struct device *dev, size_t size, -		    void *vaddr, dma_addr_t bus, -		    struct dma_attrs *attrs); -  #endif diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index f161c189c27b..141c561f4664 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -78,7 +78,7 @@ typedef struct user_fxsr_struct elf_fpxregset_t;  #ifdef CONFIG_X86_64  extern unsigned int vdso64_enabled;  #endif -#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)  extern unsigned int vdso32_enabled;  #endif @@ -187,8 +187,8 @@ static inline void elf_common_init(struct thread_struct *t,  #define	COMPAT_ELF_PLAT_INIT(regs, load_addr)		\  	elf_common_init(&current->thread, regs, __USER_DS) -void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp); -#define compat_start_thread start_thread_ia32 +void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp); +#define compat_start_thread compat_start_thread  void set_personality_ia32(bool);  #define COMPAT_SET_PERSONALITY(ex)			\ @@ -344,14 +344,9 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm,   */  static inline int mmap_is_ia32(void)  { -#ifdef CONFIG_X86_32 -	return 1; -#endif -#ifdef CONFIG_IA32_EMULATION -	if (test_thread_flag(TIF_ADDR32)) -		return 1; -#endif -	return 0; +	return config_enabled(CONFIG_X86_32) || +	       (config_enabled(CONFIG_COMPAT) && +		test_thread_flag(TIF_ADDR32));  }  /* Do not change the values. See get_align_mask() */ diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 0637826292de..c49c5173158e 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -189,6 +189,7 @@ union fpregs_state {  	struct fxregs_state		fxsave;  	struct swregs_state		soft;  	struct xregs_state		xsave; +	u8 __padding[PAGE_SIZE];  };  /* @@ -198,40 +199,6 @@ union fpregs_state {   */  struct fpu {  	/* -	 * @state: -	 * -	 * In-memory copy of all FPU registers that we save/restore -	 * over context switches. If the task is using the FPU then -	 * the registers in the FPU are more recent than this state -	 * copy. If the task context-switches away then they get -	 * saved here and represent the FPU state.
-	 * -	 * After context switches there may be a (short) time period -	 * during which the in-FPU hardware registers are unchanged -	 * and still perfectly match this state, if the tasks -	 * scheduled afterwards are not using the FPU. -	 * -	 * This is the 'lazy restore' window of optimization, which -	 * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'. -	 * -	 * We detect whether a subsequent task uses the FPU via setting -	 * CR0::TS to 1, which causes any FPU use to raise a #NM fault. -	 * -	 * During this window, if the task gets scheduled again, we -	 * might be able to skip having to do a restore from this -	 * memory buffer to the hardware registers - at the cost of -	 * incurring the overhead of #NM fault traps. -	 * -	 * Note that on modern CPUs that support the XSAVEOPT (or other -	 * optimized XSAVE instructions), we don't use #NM traps anymore, -	 * as the hardware can track whether FPU registers need saving -	 * or not. On such CPUs we activate the non-lazy ('eagerfpu') -	 * logic, which unconditionally saves/restores all FPU state -	 * across context switches. (if FPU state exists.) -	 */ -	union fpregs_state		state; - -	/*  	 * @last_cpu:  	 *  	 * Records the last CPU on which this context was loaded into @@ -288,6 +255,43 @@ struct fpu {  	 * deal with bursty apps that only use the FPU for a short time:  	 */  	unsigned char			counter; +	/* +	 * @state: +	 * +	 * In-memory copy of all FPU registers that we save/restore +	 * over context switches. If the task is using the FPU then +	 * the registers in the FPU are more recent than this state +	 * copy. If the task context-switches away then they get +	 * saved here and represent the FPU state. +	 * +	 * After context switches there may be a (short) time period +	 * during which the in-FPU hardware registers are unchanged +	 * and still perfectly match this state, if the tasks +	 * scheduled afterwards are not using the FPU. +	 * +	 * This is the 'lazy restore' window of optimization, which +	 * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'. +	 * +	 * We detect whether a subsequent task uses the FPU via setting +	 * CR0::TS to 1, which causes any FPU use to raise a #NM fault. +	 * +	 * During this window, if the task gets scheduled again, we +	 * might be able to skip having to do a restore from this +	 * memory buffer to the hardware registers - at the cost of +	 * incurring the overhead of #NM fault traps. +	 * +	 * Note that on modern CPUs that support the XSAVEOPT (or other +	 * optimized XSAVE instructions), we don't use #NM traps anymore, +	 * as the hardware can track whether FPU registers need saving +	 * or not. On such CPUs we activate the non-lazy ('eagerfpu') +	 * logic, which unconditionally saves/restores all FPU state +	 * across context switches. (if FPU state exists.) +	 */ +	union fpregs_state		state; +	/* +	 * WARNING: 'state' is dynamically-sized.  Do not put +	 * anything after it here. 
+	 */  };  #endif /* _ASM_X86_FPU_H */ diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index f45acad3c4b6..24938852db30 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -3,9 +3,9 @@  #ifdef CONFIG_FUNCTION_TRACER  #ifdef CC_USING_FENTRY -# define MCOUNT_ADDR		((long)(__fentry__)) +# define MCOUNT_ADDR		((unsigned long)(__fentry__))  #else -# define MCOUNT_ADDR		((long)(mcount)) +# define MCOUNT_ADDR		((unsigned long)(mcount))  #endif  #define MCOUNT_INSN_SIZE	5 /* sizeof mcount call */ diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 6615032e19c8..1e3408e88604 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -182,10 +182,10 @@ extern char irq_entries_start[];  #define trace_irq_entries_start irq_entries_start  #endif -#define VECTOR_UNDEFINED	(-1) -#define VECTOR_RETRIGGERED	(-2) +#define VECTOR_UNUSED		NULL +#define VECTOR_RETRIGGERED	((void *)~0UL) -typedef int vector_irq_t[NR_VECTORS]; +typedef struct irq_desc* vector_irq_t[NR_VECTORS];  DECLARE_PER_CPU(vector_irq_t, vector_irq);  #endif /* !ASSEMBLY_ */ diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index d0e8e0141041..28019765442e 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -22,15 +22,6 @@ struct ucontext_ia32 {  	compat_sigset_t	  uc_sigmask;	/* mask last for extensibility */  }; -struct ucontext_x32 { -	unsigned int	  uc_flags; -	unsigned int 	  uc_link; -	compat_stack_t	  uc_stack; -	unsigned int	  uc__pad0;     /* needed for alignment */ -	struct sigcontext uc_mcontext;  /* the 64-bit sigcontext type */ -	compat_sigset_t	  uc_sigmask;	/* mask last for extensibility */ -}; -  /* This matches struct stat64 in glibc2.2, hence the absolutely   * insane amounts of padding around dev_t's.   
*/ diff --git a/arch/x86/include/asm/intel_pmc_ipc.h b/arch/x86/include/asm/intel_pmc_ipc.h index 200ec2e7821d..cd0310e186f4 100644 --- a/arch/x86/include/asm/intel_pmc_ipc.h +++ b/arch/x86/include/asm/intel_pmc_ipc.h @@ -25,36 +25,9 @@  #if IS_ENABLED(CONFIG_INTEL_PMC_IPC) -/* - * intel_pmc_ipc_simple_command - * @cmd: command - * @sub: sub type - */  int intel_pmc_ipc_simple_command(int cmd, int sub); - -/* - * intel_pmc_ipc_raw_cmd - * @cmd: command - * @sub: sub type - * @in: input data - * @inlen: input length in bytes - * @out: output data - * @outlen: output length in dwords - * @sptr: data writing to SPTR register - * @dptr: data writing to DPTR register - */  int intel_pmc_ipc_raw_cmd(u32 cmd, u32 sub, u8 *in, u32 inlen,  		u32 *out, u32 outlen, u32 dptr, u32 sptr); - -/* - * intel_pmc_ipc_command - * @cmd: command - * @sub: sub type - * @in: input data - * @inlen: input length in bytes - * @out: output data - * @outlen: output length in dwords - */  int intel_pmc_ipc_command(u32 cmd, u32 sub, u8 *in, u32 inlen,  		u32 *out, u32 outlen); diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index cc9c61bc1abe..de25aad07853 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -180,6 +180,8 @@ static inline unsigned int isa_virt_to_bus(volatile void *address)   */  extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);  extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size); +#define ioremap_uc ioremap_uc +  extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);  extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,  				unsigned long prot_val); @@ -248,12 +250,6 @@ static inline void flush_write_buffers(void)  #endif  } -static inline void __pmem *arch_memremap_pmem(resource_size_t offset, -	unsigned long size) -{ -	return (void __force __pmem *) ioremap_cache(offset, size); -} -  #endif /* __KERNEL__ */  extern void native_io_delay(void); diff --git a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h index 57995f0596a6..b72ad0faa6c5 100644 --- a/arch/x86/include/asm/iosf_mbi.h +++ b/arch/x86/include/asm/iosf_mbi.h @@ -52,20 +52,20 @@  /* Quark available units */  #define QRK_MBI_UNIT_HBA	0x00 -#define QRK_MBI_UNIT_HB	0x03 +#define QRK_MBI_UNIT_HB		0x03  #define QRK_MBI_UNIT_RMU	0x04 -#define QRK_MBI_UNIT_MM	0x05 +#define QRK_MBI_UNIT_MM		0x05  #define QRK_MBI_UNIT_MMESRAM	0x05  #define QRK_MBI_UNIT_SOC	0x31  /* Quark read/write opcodes */  #define QRK_MBI_HBA_READ	0x10  #define QRK_MBI_HBA_WRITE	0x11 -#define QRK_MBI_HB_READ	0x10 +#define QRK_MBI_HB_READ		0x10  #define QRK_MBI_HB_WRITE	0x11  #define QRK_MBI_RMU_READ	0x10  #define QRK_MBI_RMU_WRITE	0x11 -#define QRK_MBI_MM_READ	0x10 +#define QRK_MBI_MM_READ		0x10  #define QRK_MBI_MM_WRITE	0x11  #define QRK_MBI_MMESRAM_READ	0x12  #define QRK_MBI_MMESRAM_WRITE	0x13 diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 8008d06581c7..881b4768644a 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -36,7 +36,9 @@ extern void kvm_set_posted_intr_wakeup_handler(void (*handler)(void));  extern void (*x86_platform_ipi_callback)(void);  extern void native_init_IRQ(void); -extern bool handle_irq(unsigned irq, struct pt_regs *regs); + +struct irq_desc; +extern bool handle_irq(struct irq_desc *desc, struct pt_regs *regs);  extern __visible unsigned int do_IRQ(struct pt_regs *regs); diff --git a/arch/x86/include/asm/irq_vectors.h 
b/arch/x86/include/asm/irq_vectors.h index 4c2d2eb2060a..6ca9fd6234e1 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -117,16 +117,6 @@  #define FPU_IRQ				  13 -#define	FIRST_VM86_IRQ			   3 -#define LAST_VM86_IRQ			  15 - -#ifndef __ASSEMBLY__ -static inline int invalid_vm86_irq(int irq) -{ -	return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ; -} -#endif -  /*   * Size the maximum number of interrupts.   * diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index a4c1cf7e93f8..5daeca3d0f9e 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -16,15 +16,32 @@  # define STATIC_KEY_INIT_NOP GENERIC_NOP5_ATOMIC  #endif -static __always_inline bool arch_static_branch(struct static_key *key) +static __always_inline bool arch_static_branch(struct static_key *key, bool branch)  {  	asm_volatile_goto("1:"  		".byte " __stringify(STATIC_KEY_INIT_NOP) "\n\t"  		".pushsection __jump_table,  \"aw\" \n\t"  		_ASM_ALIGN "\n\t" -		_ASM_PTR "1b, %l[l_yes], %c0 \n\t" +		_ASM_PTR "1b, %l[l_yes], %c0 + %c1 \n\t"  		".popsection \n\t" -		: :  "i" (key) : : l_yes); +		: :  "i" (key), "i" (branch) : : l_yes); + +	return false; +l_yes: +	return true; +} + +static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) +{ +	asm_volatile_goto("1:" +		".byte 0xe9\n\t .long %l[l_yes] - 2f\n\t" +		"2:\n\t" +		".pushsection __jump_table,  \"aw\" \n\t" +		_ASM_ALIGN "\n\t" +		_ASM_PTR "1b, %l[l_yes], %c0 + %c1 \n\t" +		".popsection \n\t" +		: :  "i" (key), "i" (branch) : : l_yes); +  	return false;  l_yes:  	return true; diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h index 74a2a8dc9908..1410b567ecde 100644 --- a/arch/x86/include/asm/kasan.h +++ b/arch/x86/include/asm/kasan.h @@ -1,6 +1,9 @@  #ifndef _ASM_X86_KASAN_H  #define _ASM_X86_KASAN_H +#include <linux/const.h> +#define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) +  /*   * Compiler uses shadow offset assuming that addresses start   * from 0. Kernel addresses don't start from 0, so shadow diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index 32ce71375b21..b130d59406fb 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -29,7 +29,7 @@ extern void show_trace(struct task_struct *t, struct pt_regs *regs,  extern void __show_regs(struct pt_regs *regs, int all);  extern unsigned long oops_begin(void);  extern void oops_end(unsigned long, struct pt_regs *, int signr); -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE  extern int in_crash_kexec;  #else  /* no crash dump is ever in progress if no crash kernel can be kexec'd */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2a7f5d782c33..c12e845f59e6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -252,6 +252,11 @@ struct kvm_pio_request {  	int size;  }; +struct rsvd_bits_validate { +	u64 rsvd_bits_mask[2][4]; +	u64 bad_mt_xwr; +}; +  /*   * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level   * 32-bit).  The kvm_mmu structure abstracts the details of the current mmu @@ -289,8 +294,15 @@ struct kvm_mmu {  	u64 *pae_root;  	u64 *lm_root; -	u64 rsvd_bits_mask[2][4]; -	u64 bad_mt_xwr; + +	/* +	 * check zero bits on shadow page table entries, these +	 * bits include not only hardware reserved bits but also +	 * the bits spte never used. 
+	 */ +	struct rsvd_bits_validate shadow_zero_check; + +	struct rsvd_bits_validate guest_rsvd_check;  	/*  	 * Bitmap: bit set = last pte in walk @@ -358,6 +370,11 @@ struct kvm_mtrr {  	struct list_head head;  }; +/* Hyper-V per vcpu emulation context */ +struct kvm_vcpu_hv { +	u64 hv_vapic; +}; +  struct kvm_vcpu_arch {  	/*  	 * rip and regs accesses must go through @@ -514,8 +531,7 @@ struct kvm_vcpu_arch {  	/* used for guest single stepping over the given code position */  	unsigned long singlestep_rip; -	/* fields used by HYPER-V emulation */ -	u64 hv_vapic; +	struct kvm_vcpu_hv hyperv;  	cpumask_var_t wbinvd_dirty_mask; @@ -586,6 +602,17 @@ struct kvm_apic_map {  	struct kvm_lapic *logical_map[16][16];  }; +/* Hyper-V emulation context */ +struct kvm_hv { +	u64 hv_guest_os_id; +	u64 hv_hypercall; +	u64 hv_tsc_page; + +	/* Hyper-v based guest crash (NT kernel bugcheck) parameters */ +	u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS]; +	u64 hv_crash_ctl; +}; +  struct kvm_arch {  	unsigned int n_used_mmu_pages;  	unsigned int n_requested_mmu_pages; @@ -604,6 +631,8 @@ struct kvm_arch {  	bool iommu_noncoherent;  #define __KVM_HAVE_ARCH_NONCOHERENT_DMA  	atomic_t noncoherent_dma_count; +#define __KVM_HAVE_ARCH_ASSIGNED_DEVICE +	atomic_t assigned_device_count;  	struct kvm_pic *vpic;  	struct kvm_ioapic *vioapic;  	struct kvm_pit *vpit; @@ -643,16 +672,14 @@ struct kvm_arch {  	/* reads protected by irq_srcu, writes by irq_lock */  	struct hlist_head mask_notifier_list; -	/* fields used by HYPER-V emulation */ -	u64 hv_guest_os_id; -	u64 hv_hypercall; -	u64 hv_tsc_page; +	struct kvm_hv hyperv;  	#ifdef CONFIG_KVM_MMU_AUDIT  	int audit_point;  	#endif  	bool boot_vcpu_runs_old_kvmclock; +	u32 bsp_vcpu_id;  	u64 disabled_quirks;  }; @@ -1201,5 +1228,7 @@ int __x86_set_memory_region(struct kvm *kvm,  			    const struct kvm_userspace_memory_region *mem);  int x86_set_memory_region(struct kvm *kvm,  			  const struct kvm_userspace_memory_region *mem); +bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); +bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);  #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/math_emu.h b/arch/x86/include/asm/math_emu.h index 031f6266f425..0d9b14f60d2c 100644 --- a/arch/x86/include/asm/math_emu.h +++ b/arch/x86/include/asm/math_emu.h @@ -2,7 +2,6 @@  #define _ASM_X86_MATH_EMU_H  #include <asm/ptrace.h> -#include <asm/vm86.h>  /* This structure matches the layout of the data saved to the stack     following a device-not-present interrupt, part of it saved @@ -10,9 +9,6 @@     */  struct math_emu_info {  	long ___orig_eip; -	union { -		struct pt_regs *regs; -		struct kernel_vm86_regs *vm86; -	}; +	struct pt_regs *regs;  };  #endif /* _ASM_X86_MATH_EMU_H */ diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 982dfc3679ad..2dbc0bf2b9f3 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -151,10 +151,12 @@ extern int mce_p5_enabled;  #ifdef CONFIG_X86_MCE  int mcheck_init(void);  void mcheck_cpu_init(struct cpuinfo_x86 *c); +void mcheck_cpu_clear(struct cpuinfo_x86 *c);  void mcheck_vendor_init_severity(void);  #else  static inline int mcheck_init(void) { return 0; }  static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {} +static inline void mcheck_cpu_clear(struct cpuinfo_x86 *c) {}  static inline void mcheck_vendor_init_severity(void) {}  #endif @@ -181,20 +183,18 @@ DECLARE_PER_CPU(struct device *, mce_device);  #ifdef CONFIG_X86_MCE_INTEL  void mce_intel_feature_init(struct cpuinfo_x86 *c); +void 
mce_intel_feature_clear(struct cpuinfo_x86 *c);  void cmci_clear(void);  void cmci_reenable(void);  void cmci_rediscover(void);  void cmci_recheck(void); -void lmce_clear(void); -void lmce_enable(void);  #else  static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { } +static inline void mce_intel_feature_clear(struct cpuinfo_x86 *c) { }  static inline void cmci_clear(void) {}  static inline void cmci_reenable(void) {}  static inline void cmci_rediscover(void) {}  static inline void cmci_recheck(void) {} -static inline void lmce_clear(void) {} -static inline void lmce_enable(void) {}  #endif  #ifdef CONFIG_X86_MCE_AMD diff --git a/arch/x86/include/asm/mm-arch-hooks.h b/arch/x86/include/asm/mm-arch-hooks.h deleted file mode 100644 index 4e881a342236..000000000000 --- a/arch/x86/include/asm/mm-arch-hooks.h +++ /dev/null @@ -1,15 +0,0 @@ -/* - * Architecture specific mm hooks - * - * Copyright (C) 2015, IBM Corporation - * Author: Laurent Dufour <[email protected]> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#ifndef _ASM_X86_MM_ARCH_HOOKS_H -#define _ASM_X86_MM_ARCH_HOOKS_H - -#endif /* _ASM_X86_MM_ARCH_HOOKS_H */ diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 09b9620a73b4..55234d5e7160 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -9,8 +9,9 @@   * we put the segment information here.   */  typedef struct { -	void *ldt; -	int size; +#ifdef CONFIG_MODIFY_LDT_SYSCALL +	struct ldt_struct *ldt; +#endif  #ifdef CONFIG_X86_64  	/* True if mm supports a task running in 32 bit compatibility mode. */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 5e8daee7c5c9..379cd3658799 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -23,7 +23,7 @@ extern struct static_key rdpmc_always_available;  static inline void load_mm_cr4(struct mm_struct *mm)  { -	if (static_key_true(&rdpmc_always_available) || +	if (static_key_false(&rdpmc_always_available) ||  	    atomic_read(&mm->context.perf_rdpmc_allowed))  		cr4_set_bits(X86_CR4_PCE);  	else @@ -33,12 +33,68 @@ static inline void load_mm_cr4(struct mm_struct *mm)  static inline void load_mm_cr4(struct mm_struct *mm) {}  #endif +#ifdef CONFIG_MODIFY_LDT_SYSCALL +/* + * ldt_structs can be allocated, used, and freed, but they are never + * modified while live. + */ +struct ldt_struct { +	/* +	 * Xen requires page-aligned LDTs with special permissions.  This is +	 * needed to prevent us from installing evil descriptors such as +	 * call gates.  On native, we could merge the ldt_struct and LDT +	 * allocations, but it's not worth trying to optimize. +	 */ +	struct desc_struct *entries; +	int size; +}; +  /*   * Used for LDT copy/destruction.   
*/  int init_new_context(struct task_struct *tsk, struct mm_struct *mm);  void destroy_context(struct mm_struct *mm); +#else	/* CONFIG_MODIFY_LDT_SYSCALL */ +static inline int init_new_context(struct task_struct *tsk, +				   struct mm_struct *mm) +{ +	return 0; +} +static inline void destroy_context(struct mm_struct *mm) {} +#endif +static inline void load_mm_ldt(struct mm_struct *mm) +{ +#ifdef CONFIG_MODIFY_LDT_SYSCALL +	struct ldt_struct *ldt; + +	/* lockless_dereference synchronizes with smp_store_release */ +	ldt = lockless_dereference(mm->context.ldt); + +	/* +	 * Any change to mm->context.ldt is followed by an IPI to all +	 * CPUs with the mm active.  The LDT will not be freed until +	 * after the IPI is handled by all such CPUs.  This means that, +	 * if the ldt_struct changes before we return, the values we see +	 * will be safe, and the new values will be loaded before we run +	 * any user code. +	 * +	 * NB: don't try to convert this to use RCU without extreme care. +	 * We would still need IRQs off, because we don't want to change +	 * the local LDT after an IPI loaded a newer value than the one +	 * that we can see. +	 */ + +	if (unlikely(ldt)) +		set_ldt(ldt->entries, ldt->size); +	else +		clear_LDT(); +#else +	clear_LDT(); +#endif + +	DEBUG_LOCKS_WARN_ON(preemptible()); +}  static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)  { @@ -70,6 +126,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,  		/* Load per-mm CR4 state */  		load_mm_cr4(next); +#ifdef CONFIG_MODIFY_LDT_SYSCALL  		/*  		 * Load the LDT, if the LDT is different.  		 * @@ -78,12 +135,13 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,  		 * was called and then modify_ldt changed  		 * prev->context.ldt but suppressed an IPI to this CPU.  		 * In this case, prev->context.ldt != NULL, because we -		 * never free an LDT while the mm still exists.  That -		 * means that next->context.ldt != prev->context.ldt, -		 * because mms never share an LDT. +		 * never set context.ldt to NULL while the mm still +		 * exists.  That means that next->context.ldt != +		 * prev->context.ldt, because mms never share an LDT.  		 
*/  		if (unlikely(prev->context.ldt != next->context.ldt)) -			load_LDT_nolock(&next->context); +			load_mm_ldt(next); +#endif  	}  #ifdef CONFIG_SMP  	  else { @@ -106,7 +164,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,  			load_cr3(next->pgd);  			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);  			load_mm_cr4(next); -			load_LDT_nolock(&next->context); +			load_mm_ldt(next);  		}  	}  #endif diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index c163215abb9a..aaf59b7da98a 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -7,6 +7,7 @@  struct ms_hyperv_info {  	u32 features; +	u32 misc_features;  	u32 hints;  }; @@ -20,4 +21,8 @@ void hyperv_vector_handler(struct pt_regs *regs);  void hv_setup_vmbus_irq(void (*handler)(void));  void hv_remove_vmbus_irq(void); +void hv_setup_kexec_handler(void (*handler)(void)); +void hv_remove_kexec_handler(void); +void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs)); +void hv_remove_crash_handler(void);  #endif diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 9ebc3d009373..c1c0a1c14344 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -73,6 +73,12 @@  #define MSR_LBR_CORE_FROM		0x00000040  #define MSR_LBR_CORE_TO			0x00000060 +#define MSR_LBR_INFO_0			0x00000dc0 /* ... 0xddf for _31 */ +#define LBR_INFO_MISPRED		BIT_ULL(63) +#define LBR_INFO_IN_TX			BIT_ULL(62) +#define LBR_INFO_ABORT			BIT_ULL(61) +#define LBR_INFO_CYCLES			0xffff +  #define MSR_IA32_PEBS_ENABLE		0x000003f1  #define MSR_IA32_DS_AREA		0x00000600  #define MSR_IA32_PERF_CAPABILITIES	0x00000345 @@ -80,13 +86,21 @@  #define MSR_IA32_RTIT_CTL		0x00000570  #define RTIT_CTL_TRACEEN		BIT(0) +#define RTIT_CTL_CYCLEACC		BIT(1)  #define RTIT_CTL_OS			BIT(2)  #define RTIT_CTL_USR			BIT(3)  #define RTIT_CTL_CR3EN			BIT(7)  #define RTIT_CTL_TOPA			BIT(8) +#define RTIT_CTL_MTC_EN			BIT(9)  #define RTIT_CTL_TSC_EN			BIT(10)  #define RTIT_CTL_DISRETC		BIT(11)  #define RTIT_CTL_BRANCH_EN		BIT(13) +#define RTIT_CTL_MTC_RANGE_OFFSET	14 +#define RTIT_CTL_MTC_RANGE		(0x0full << RTIT_CTL_MTC_RANGE_OFFSET) +#define RTIT_CTL_CYC_THRESH_OFFSET	19 +#define RTIT_CTL_CYC_THRESH		(0x0full << RTIT_CTL_CYC_THRESH_OFFSET) +#define RTIT_CTL_PSB_FREQ_OFFSET	24 +#define RTIT_CTL_PSB_FREQ      		(0x0full << RTIT_CTL_PSB_FREQ_OFFSET)  #define MSR_IA32_RTIT_STATUS		0x00000571  #define RTIT_STATUS_CONTEXTEN		BIT(1)  #define RTIT_STATUS_TRIGGEREN		BIT(2) @@ -170,6 +184,12 @@  #define MSR_PP1_ENERGY_STATUS		0x00000641  #define MSR_PP1_POLICY			0x00000642 +#define MSR_CONFIG_TDP_NOMINAL		0x00000648 +#define MSR_CONFIG_TDP_LEVEL_1		0x00000649 +#define MSR_CONFIG_TDP_LEVEL_2		0x0000064A +#define MSR_CONFIG_TDP_CONTROL		0x0000064B +#define MSR_TURBO_ACTIVATION_RATIO	0x0000064C +  #define MSR_PKG_WEIGHTED_CORE_C0_RES	0x00000658  #define MSR_PKG_ANY_CORE_C0_RES		0x00000659  #define MSR_PKG_ANY_GFXE_C0_RES		0x0000065A diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index e6a707eb5081..77d8b284e4a7 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -47,14 +47,13 @@ static inline unsigned long long native_read_tscp(unsigned int *aux)   * it means rax *or* rdx.   
*/  #ifdef CONFIG_X86_64 -#define DECLARE_ARGS(val, low, high)	unsigned low, high -#define EAX_EDX_VAL(val, low, high)	((low) | ((u64)(high) << 32)) -#define EAX_EDX_ARGS(val, low, high)	"a" (low), "d" (high) +/* Using 64-bit values saves one instruction clearing the high half of low */ +#define DECLARE_ARGS(val, low, high)	unsigned long low, high +#define EAX_EDX_VAL(val, low, high)	((low) | (high) << 32)  #define EAX_EDX_RET(val, low, high)	"=a" (low), "=d" (high)  #else  #define DECLARE_ARGS(val, low, high)	unsigned long long val  #define EAX_EDX_VAL(val, low, high)	(val) -#define EAX_EDX_ARGS(val, low, high)	"A" (val)  #define EAX_EDX_RET(val, low, high)	"=A" (val)  #endif @@ -106,12 +105,19 @@ notrace static inline int native_write_msr_safe(unsigned int msr,  	return err;  } -extern unsigned long long native_read_tsc(void); -  extern int rdmsr_safe_regs(u32 regs[8]);  extern int wrmsr_safe_regs(u32 regs[8]); -static __always_inline unsigned long long __native_read_tsc(void) +/** + * rdtsc() - returns the current TSC without ordering constraints + * + * rdtsc() returns the result of RDTSC as a 64-bit integer.  The + * only ordering constraint it supplies is the ordering implied by + * "asm volatile": it will put the RDTSC in the place you expect.  The + * CPU can and will speculatively execute that RDTSC, though, so the + * results can be non-monotonic if compared on different CPUs. + */ +static __always_inline unsigned long long rdtsc(void)  {  	DECLARE_ARGS(val, low, high); @@ -120,6 +126,35 @@ static __always_inline unsigned long long __native_read_tsc(void)  	return EAX_EDX_VAL(val, low, high);  } +/** + * rdtsc_ordered() - read the current TSC in program order + * + * rdtsc_ordered() returns the result of RDTSC as a 64-bit integer. + * It is ordered like a load to a global in-memory counter.  It should + * be impossible to observe non-monotonic rdtsc_unordered() behavior + * across multiple CPUs as long as the TSC is synced. + */ +static __always_inline unsigned long long rdtsc_ordered(void) +{ +	/* +	 * The RDTSC instruction is not ordered relative to memory +	 * access.  The Intel SDM and the AMD APM are both vague on this +	 * point, but empirically an RDTSC instruction can be +	 * speculatively executed before prior loads.  An RDTSC +	 * immediately after an appropriate barrier appears to be +	 * ordered as a normal load, that is, it provides the same +	 * ordering guarantees as reading from a global memory location +	 * that some other imaginary CPU is updating continuously with a +	 * time stamp. 
+	 */ +	alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, +			  "lfence", X86_FEATURE_LFENCE_RDTSC); +	return rdtsc(); +} + +/* Deprecated, keep it for a cycle for easier merging: */ +#define rdtscll(now)	do { (now) = rdtsc_ordered(); } while (0) +  static inline unsigned long long native_read_pmc(int counter)  {  	DECLARE_ARGS(val, low, high); @@ -153,8 +188,10 @@ static inline void wrmsr(unsigned msr, unsigned low, unsigned high)  #define rdmsrl(msr, val)			\  	((val) = native_read_msr((msr))) -#define wrmsrl(msr, val)						\ -	native_write_msr((msr), (u32)((u64)(val)), (u32)((u64)(val) >> 32)) +static inline void wrmsrl(unsigned msr, u64 val) +{ +	native_write_msr(msr, (u32)val, (u32)(val >> 32)); +}  /* wrmsr with exception handling */  static inline int wrmsr_safe(unsigned msr, unsigned low, unsigned high) @@ -180,12 +217,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)  	return err;  } -#define rdtscl(low)						\ -	((low) = (u32)__native_read_tsc()) - -#define rdtscll(val)						\ -	((val) = __native_read_tsc()) -  #define rdpmc(counter, low, high)			\  do {							\  	u64 _l = native_read_pmc((counter));		\ @@ -195,15 +226,6 @@ do {							\  #define rdpmcl(counter, val) ((val) = native_read_pmc(counter)) -#define rdtscp(low, high, aux)					\ -do {                                                            \ -	unsigned long long _val = native_read_tscp(&(aux));     \ -	(low) = (u32)_val;                                      \ -	(high) = (u32)(_val >> 32);                             \ -} while (0) - -#define rdtscpll(val, aux) (val) = native_read_tscp(&(aux)) -  #endif	/* !CONFIG_PARAVIRT */  /* diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 653dfa7662e1..c70689b5e5aa 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -14,6 +14,9 @@  #define CPUID5_ECX_INTERRUPT_BREAK	0x2  #define MWAIT_ECX_INTERRUPT_BREAK	0x1 +#define MWAITX_ECX_TIMER_ENABLE		BIT(1) +#define MWAITX_MAX_LOOPS		((u32)-1) +#define MWAITX_DISABLE_CSTATES		0xf  static inline void __monitor(const void *eax, unsigned long ecx,  			     unsigned long edx) @@ -23,6 +26,14 @@ static inline void __monitor(const void *eax, unsigned long ecx,  		     :: "a" (eax), "c" (ecx), "d"(edx));  } +static inline void __monitorx(const void *eax, unsigned long ecx, +			      unsigned long edx) +{ +	/* "monitorx %eax, %ecx, %edx;" */ +	asm volatile(".byte 0x0f, 0x01, 0xfa;" +		     :: "a" (eax), "c" (ecx), "d"(edx)); +} +  static inline void __mwait(unsigned long eax, unsigned long ecx)  {  	/* "mwait %eax, %ecx;" */ @@ -30,6 +41,40 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)  		     :: "a" (eax), "c" (ecx));  } +/* + * MWAITX allows for a timer expiration to get the core out a wait state in + * addition to the default MWAIT exit condition of a store appearing at a + * monitored virtual address. + * + * Registers: + * + * MWAITX ECX[1]: enable timer if set + * MWAITX EBX[31:0]: max wait time expressed in SW P0 clocks. The software P0 + * frequency is the same as the TSC frequency. 
+ * + * Below is a comparison between MWAIT and MWAITX on AMD processors: + * + *                 MWAIT                           MWAITX + * opcode          0f 01 c9           |            0f 01 fb + * ECX[0]                  value of RFLAGS.IF seen by instruction + * ECX[1]          unused/#GP if set  |            enable timer if set + * ECX[31:2]                     unused/#GP if set + * EAX                           unused (reserve for hint) + * EBX[31:0]       unused             |            max wait time (P0 clocks) + * + *                 MONITOR                         MONITORX + * opcode          0f 01 c8           |            0f 01 fa + * EAX                     (logical) address to monitor + * ECX                     #GP if not zero + */ +static inline void __mwaitx(unsigned long eax, unsigned long ebx, +			    unsigned long ecx) +{ +	/* "mwaitx %eax, %ebx, %ecx;" */ +	asm volatile(".byte 0x0f, 0x01, 0xfb;" +		     :: "a" (eax), "b" (ebx), "c" (ecx)); +} +  static inline void __sti_mwait(unsigned long eax, unsigned long ecx)  {  	trace_hardirqs_on(); diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index d143bfad45d7..10d0596433f8 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -153,7 +153,11 @@ do {						\  	val = paravirt_read_msr(msr, &_err);	\  } while (0) -#define wrmsrl(msr, val)	wrmsr(msr, (u32)((u64)(val)), ((u64)(val))>>32) +static inline void wrmsrl(unsigned msr, u64 val) +{ +	wrmsr(msr, (u32)val, (u32)(val>>32)); +} +  #define wrmsr_safe(msr, a, b)	paravirt_write_msr(msr, a, b)  /* rdmsr with exception handling */ @@ -174,19 +178,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)  	return err;  } -static inline u64 paravirt_read_tsc(void) -{ -	return PVOP_CALL0(u64, pv_cpu_ops.read_tsc); -} - -#define rdtscl(low)				\ -do {						\ -	u64 _l = paravirt_read_tsc();		\ -	low = (int)_l;				\ -} while (0) - -#define rdtscll(val) (val = paravirt_read_tsc()) -  static inline unsigned long long paravirt_sched_clock(void)  {  	return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock); @@ -215,27 +206,6 @@ do {						\  #define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter)) -static inline unsigned long long paravirt_rdtscp(unsigned int *aux) -{ -	return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux); -} - -#define rdtscp(low, high, aux)				\ -do {							\ -	int __aux;					\ -	unsigned long __val = paravirt_rdtscp(&__aux);	\ -	(low) = (u32)__val;				\ -	(high) = (u32)(__val >> 32);			\ -	(aux) = __aux;					\ -} while (0) - -#define rdtscpll(val, aux)				\ -do {							\ -	unsigned long __aux; 				\ -	val = paravirt_rdtscp(&__aux);			\ -	(aux) = __aux;					\ -} while (0) -  static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)  {  	PVOP_VCALL2(pv_cpu_ops.alloc_ldt, ldt, entries); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index a6b8f9fadb06..ce029e4fa7c6 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -156,9 +156,7 @@ struct pv_cpu_ops {  	u64 (*read_msr)(unsigned int msr, int *err);  	int (*write_msr)(unsigned int msr, unsigned low, unsigned high); -	u64 (*read_tsc)(void);  	u64 (*read_pmc)(int counter); -	unsigned long long (*read_tscp)(unsigned int *aux);  #ifdef CONFIG_X86_32  	/* diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 164e3f8d3c3d..fa1195dae425 100644 --- a/arch/x86/include/asm/pci_x86.h +++ 
b/arch/x86/include/asm/pci_x86.h @@ -93,8 +93,6 @@ extern raw_spinlock_t pci_config_lock;  extern int (*pcibios_enable_irq)(struct pci_dev *dev);  extern void (*pcibios_disable_irq)(struct pci_dev *dev); -extern bool mp_should_keep_irq(struct device *dev); -  struct pci_raw_ops {  	int (*read)(unsigned int domain, unsigned int bus, unsigned int devfn,  						int reg, int len, u32 *val); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index dc0f6ed35b08..7bcb861a04e5 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -159,6 +159,13 @@ struct x86_pmu_capability {   */  #define INTEL_PMC_IDX_FIXED_BTS				(INTEL_PMC_IDX_FIXED + 16) +#define GLOBAL_STATUS_COND_CHG				BIT_ULL(63) +#define GLOBAL_STATUS_BUFFER_OVF			BIT_ULL(62) +#define GLOBAL_STATUS_UNC_OVF				BIT_ULL(61) +#define GLOBAL_STATUS_ASIF				BIT_ULL(60) +#define GLOBAL_STATUS_COUNTERS_FROZEN			BIT_ULL(59) +#define GLOBAL_STATUS_LBRS_FROZEN			BIT_ULL(58) +  /*   * IBS cpuid feature detection   */ diff --git a/arch/x86/include/asm/pmc_atom.h b/arch/x86/include/asm/pmc_atom.h index bc0fc0866553..aa8744c77c6d 100644 --- a/arch/x86/include/asm/pmc_atom.h +++ b/arch/x86/include/asm/pmc_atom.h @@ -18,6 +18,8 @@  /* ValleyView Power Control Unit PCI Device ID */  #define	PCI_DEVICE_ID_VLV_PMC	0x0F1C +/* CherryTrail Power Control Unit PCI Device ID */ +#define	PCI_DEVICE_ID_CHT_PMC	0x229C  /* PMC Memory mapped IO registers */  #define	PMC_BASE_ADDR_OFFSET	0x44 @@ -29,6 +31,10 @@  #define	PMC_FUNC_DIS		0x34  #define	PMC_FUNC_DIS_2		0x38 +/* CHT specific bits in FUNC_DIS2 register */ +#define	BIT_FD_GMM		BIT(3) +#define	BIT_FD_ISH		BIT(4) +  /* S0ix wake event control */  #define	PMC_S0IX_WAKE_EN	0x3C @@ -75,6 +81,21 @@  #define PMC_PSS_BIT_USB			BIT(16)  #define PMC_PSS_BIT_USB_SUS		BIT(17) +/* CHT specific bits in PSS register */ +#define	PMC_PSS_BIT_CHT_UFS		BIT(7) +#define	PMC_PSS_BIT_CHT_UXD		BIT(11) +#define	PMC_PSS_BIT_CHT_UXD_FD		BIT(12) +#define	PMC_PSS_BIT_CHT_UX_ENG		BIT(15) +#define	PMC_PSS_BIT_CHT_USB_SUS		BIT(16) +#define	PMC_PSS_BIT_CHT_GMM		BIT(17) +#define	PMC_PSS_BIT_CHT_ISH		BIT(18) +#define	PMC_PSS_BIT_CHT_DFX_MASTER	BIT(26) +#define	PMC_PSS_BIT_CHT_DFX_CLUSTER1	BIT(27) +#define	PMC_PSS_BIT_CHT_DFX_CLUSTER2	BIT(28) +#define	PMC_PSS_BIT_CHT_DFX_CLUSTER3	BIT(29) +#define	PMC_PSS_BIT_CHT_DFX_CLUSTER4	BIT(30) +#define	PMC_PSS_BIT_CHT_DFX_CLUSTER5	BIT(31) +  /* These registers reflect D3 status of functions */  #define	PMC_D3_STS_0		0xA0 @@ -117,6 +138,10 @@  #define	BIT_USH_SS_PHY		BIT(2)  #define	BIT_DFX			BIT(3) +/* CHT specific bits in PMC_D3_STS_1 register */ +#define	BIT_STS_GMM		BIT(1) +#define	BIT_STS_ISH		BIT(2) +  /* PMC I/O Registers */  #define	ACPI_BASE_ADDR_OFFSET	0x40  #define	ACPI_BASE_ADDR_MASK	0xFFFFFE00 @@ -126,4 +151,8 @@  #define	SLEEP_TYPE_MASK		0xFFFFECFF  #define	SLEEP_TYPE_S5		0x1C00  #define	SLEEP_ENABLE		0x2000 + +extern int pmc_atom_read(int offset, u32 *value); +extern int pmc_atom_write(int offset, u32 value); +  #endif /* PMC_ATOM_H */ diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h new file mode 100644 index 000000000000..d8ce3ec816ab --- /dev/null +++ b/arch/x86/include/asm/pmem.h @@ -0,0 +1,153 @@ +/* + * Copyright(c) 2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + * General Public License for more details. + */ +#ifndef __ASM_X86_PMEM_H__ +#define __ASM_X86_PMEM_H__ + +#include <linux/uaccess.h> +#include <asm/cacheflush.h> +#include <asm/cpufeature.h> +#include <asm/special_insns.h> + +#ifdef CONFIG_ARCH_HAS_PMEM_API +/** + * arch_memcpy_to_pmem - copy data to persistent memory + * @dst: destination buffer for the copy + * @src: source buffer for the copy + * @n: length of the copy in bytes + * + * Copy data to persistent memory media via non-temporal stores so that + * a subsequent arch_wmb_pmem() can flush cpu and memory controller + * write buffers to guarantee durability. + */ +static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src, +		size_t n) +{ +	int unwritten; + +	/* +	 * We are copying between two kernel buffers, if +	 * __copy_from_user_inatomic_nocache() returns an error (page +	 * fault) we would have already reported a general protection fault +	 * before the WARN+BUG. +	 */ +	unwritten = __copy_from_user_inatomic_nocache((void __force *) dst, +			(void __user *) src, n); +	if (WARN(unwritten, "%s: fault copying %p <- %p unwritten: %d\n", +				__func__, dst, src, unwritten)) +		BUG(); +} + +/** + * arch_wmb_pmem - synchronize writes to persistent memory + * + * After a series of arch_memcpy_to_pmem() operations this drains data + * from cpu write buffers and any platform (memory controller) buffers + * to ensure that written data is durable on persistent memory media. + */ +static inline void arch_wmb_pmem(void) +{ +	/* +	 * wmb() to 'sfence' all previous writes such that they are +	 * architecturally visible to 'pcommit'.  Note, that we've +	 * already arranged for pmem writes to avoid the cache via +	 * arch_memcpy_to_pmem(). +	 */ +	wmb(); +	pcommit_sfence(); +} + +/** + * __arch_wb_cache_pmem - write back a cache range with CLWB + * @vaddr:	virtual start address + * @size:	number of bytes to write back + * + * Write back a cache range using the CLWB (cache line write back) + * instruction.  This function requires explicit ordering with an + * arch_wmb_pmem() call.  This API is internal to the x86 PMEM implementation. + */ +static inline void __arch_wb_cache_pmem(void *vaddr, size_t size) +{ +	u16 x86_clflush_size = boot_cpu_data.x86_clflush_size; +	unsigned long clflush_mask = x86_clflush_size - 1; +	void *vend = vaddr + size; +	void *p; + +	for (p = (void *)((unsigned long)vaddr & ~clflush_mask); +	     p < vend; p += x86_clflush_size) +		clwb(p); +} + +/* + * copy_from_iter_nocache() on x86 only uses non-temporal stores for iovec + * iterators, so for other types (bvec & kvec) we must do a cache write-back. + */ +static inline bool __iter_needs_pmem_wb(struct iov_iter *i) +{ +	return iter_is_iovec(i) == false; +} + +/** + * arch_copy_from_iter_pmem - copy data from an iterator to PMEM + * @addr:	PMEM destination address + * @bytes:	number of bytes to copy + * @i:		iterator with source data + * + * Copy data from the iterator 'i' to the PMEM buffer starting at 'addr'. + * This function requires explicit ordering with an arch_wmb_pmem() call. 
+ */ +static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes, +		struct iov_iter *i) +{ +	void *vaddr = (void __force *)addr; +	size_t len; + +	/* TODO: skip the write-back by always using non-temporal stores */ +	len = copy_from_iter_nocache(vaddr, bytes, i); + +	if (__iter_needs_pmem_wb(i)) +		__arch_wb_cache_pmem(vaddr, bytes); + +	return len; +} + +/** + * arch_clear_pmem - zero a PMEM memory range + * @addr:	virtual start address + * @size:	number of bytes to zero + * + * Write zeros into the memory range starting at 'addr' for 'size' bytes. + * This function requires explicit ordering with an arch_wmb_pmem() call. + */ +static inline void arch_clear_pmem(void __pmem *addr, size_t size) +{ +	void *vaddr = (void __force *)addr; + +	/* TODO: implement the zeroing via non-temporal writes */ +	if (size == PAGE_SIZE && ((unsigned long)vaddr & ~PAGE_MASK) == 0) +		clear_page(vaddr); +	else +		memset(vaddr, 0, size); + +	__arch_wb_cache_pmem(vaddr, size); +} + +static inline bool __arch_has_wmb_pmem(void) +{ +	/* +	 * We require that wmb() be an 'sfence', that is only guaranteed on +	 * 64-bit builds +	 */ +	return static_cpu_has(X86_FEATURE_PCOMMIT); +} +#endif /* CONFIG_ARCH_HAS_PMEM_API */ +#endif /* __ASM_X86_PMEM_H__ */ diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index dca71714f860..b12f81022a6b 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -90,9 +90,9 @@ static __always_inline bool __preempt_count_dec_and_test(void)  /*   * Returns true when we need to resched and can (barring IRQ state).   */ -static __always_inline bool should_resched(void) +static __always_inline bool should_resched(int preempt_offset)  { -	return unlikely(!raw_cpu_read_4(__preempt_count)); +	return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);  }  #ifdef CONFIG_PREEMPT diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 43e6519df0d5..19577dd325fa 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -6,8 +6,8 @@  /* Forward declaration, a strange C thing */  struct task_struct;  struct mm_struct; +struct vm86; -#include <asm/vm86.h>  #include <asm/math_emu.h>  #include <asm/segment.h>  #include <asm/types.h> @@ -390,9 +390,6 @@ struct thread_struct {  #endif  	unsigned long		gs; -	/* Floating point and extended processor state */ -	struct fpu		fpu; -  	/* Save middle states of ptrace breakpoints */  	struct perf_event	*ptrace_bps[HBP_NUM];  	/* Debug status used for traps, single steps, etc... */ @@ -403,21 +400,22 @@ struct thread_struct {  	unsigned long		cr2;  	unsigned long		trap_nr;  	unsigned long		error_code; -#ifdef CONFIG_X86_32 +#ifdef CONFIG_VM86  	/* Virtual 86 mode info */ -	struct vm86_struct __user *vm86_info; -	unsigned long		screen_bitmap; -	unsigned long		v86flags; -	unsigned long		v86mask; -	unsigned long		saved_sp0; -	unsigned int		saved_fs; -	unsigned int		saved_gs; +	struct vm86		*vm86;  #endif  	/* IO permissions: */  	unsigned long		*io_bitmap_ptr;  	unsigned long		iopl;  	/* Max allowed port in the bitmap, in bytes: */  	unsigned		io_bitmap_max; + +	/* Floating point and extended processor state */ +	struct fpu		fpu; +	/* +	 * WARNING: 'fpu' is dynamically-sized.  It *MUST* be at +	 * the end. +	 */  };  /* @@ -647,14 +645,6 @@ static inline void update_debugctlmsr(unsigned long debugctlmsr)  extern void set_task_blockstep(struct task_struct *task, bool on); -/* - * from system description table in BIOS. 
Mostly for MCA use, but - * others may find it useful: - */ -extern unsigned int		machine_id; -extern unsigned int		machine_submodel_id; -extern unsigned int		BIOS_revision; -  /* Boot loader type from the setup header: */  extern int			bootloader_type;  extern int			bootloader_version; @@ -716,7 +706,6 @@ static inline void spin_lock_prefetch(const void *x)  #define INIT_THREAD  {							  \  	.sp0			= TOP_OF_INIT_STACK,			  \ -	.vm86_info		= NULL,					  \  	.sysenter_cs		= __KERNEL_CS,				  \  	.io_bitmap_ptr		= NULL,					  \  } diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 5fabf1362942..6271281f947d 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -88,7 +88,6 @@ extern long syscall_trace_enter_phase2(struct pt_regs *, u32 arch,  				       unsigned long phase1_result);  extern long syscall_trace_enter(struct pt_regs *); -extern void syscall_trace_leave(struct pt_regs *);  static inline unsigned long regs_return_value(struct pt_regs *regs)  { diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 628954ceede1..7a6bed5c08bc 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -62,7 +62,7 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)  static __always_inline  u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src)  { -	u64 delta = __native_read_tsc() - src->tsc_timestamp; +	u64 delta = rdtsc_ordered() - src->tsc_timestamp;  	return pvclock_scale_delta(delta, src->tsc_to_system_mul,  				   src->tsc_shift);  } @@ -76,13 +76,7 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,  	u8 ret_flags;  	version = src->version; -	/* Note: emulated platforms which do not advertise SSE2 support -	 * result in kvmclock not using the necessary RDTSC barriers. -	 * Without barriers, it is possible that RDTSC instruction reads from -	 * the time stamp counter outside rdtsc_barrier protected section -	 * below, resulting in violation of monotonicity. -	 */ -	rdtsc_barrier(); +  	offset = pvclock_get_nsec_offset(src);  	ret = src->system_time + offset;  	ret_flags = src->flags; diff --git a/arch/x86/include/asm/qrwlock.h b/arch/x86/include/asm/qrwlock.h index ae0e241e228b..c537cbb038a7 100644 --- a/arch/x86/include/asm/qrwlock.h +++ b/arch/x86/include/asm/qrwlock.h @@ -2,16 +2,6 @@  #define _ASM_X86_QRWLOCK_H  #include <asm-generic/qrwlock_types.h> - -#ifndef CONFIG_X86_PPRO_FENCE -#define queue_write_unlock queue_write_unlock -static inline void queue_write_unlock(struct qrwlock *lock) -{ -        barrier(); -        ACCESS_ONCE(*(u8 *)&lock->cnts) = 0; -} -#endif -  #include <asm-generic/qrwlock.h>  #endif /* _ASM_X86_QRWLOCK_H */ diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h index 6fe6b182c998..9dfce4e0417d 100644 --- a/arch/x86/include/asm/sigcontext.h +++ b/arch/x86/include/asm/sigcontext.h @@ -57,9 +57,9 @@ struct sigcontext {  	unsigned long ip;  	unsigned long flags;  	unsigned short cs; -	unsigned short __pad2;	/* Was called gs, but was always zero. */ -	unsigned short __pad1;	/* Was called fs, but was always zero. 
*/ -	unsigned short ss; +	unsigned short gs; +	unsigned short fs; +	unsigned short __pad0;  	unsigned long err;  	unsigned long trapno;  	unsigned long oldmask; diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h index 7c7c27c97daa..1f3175bb994e 100644 --- a/arch/x86/include/asm/sigframe.h +++ b/arch/x86/include/asm/sigframe.h @@ -4,6 +4,7 @@  #include <asm/sigcontext.h>  #include <asm/siginfo.h>  #include <asm/ucontext.h> +#include <linux/compat.h>  #ifdef CONFIG_X86_32  #define sigframe_ia32		sigframe @@ -69,6 +70,15 @@ struct rt_sigframe {  #ifdef CONFIG_X86_X32_ABI +struct ucontext_x32 { +	unsigned int	  uc_flags; +	unsigned int 	  uc_link; +	compat_stack_t	  uc_stack; +	unsigned int	  uc__pad0;     /* needed for alignment */ +	struct sigcontext uc_mcontext;  /* the 64-bit sigcontext type */ +	compat_sigset_t	  uc_sigmask;	/* mask last for extensibility */ +}; +  struct rt_sigframe_x32 {  	u64 pretcode;  	struct ucontext_x32 uc; diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index 31eab867e6d3..c481be78fcf1 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -30,7 +30,7 @@ typedef sigset_t compat_sigset_t;  #endif /* __ASSEMBLY__ */  #include <uapi/asm/signal.h>  #ifndef __ASSEMBLY__ -extern void do_notify_resume(struct pt_regs *, void *, __u32); +extern void do_signal(struct pt_regs *regs);  #define __ARCH_HAS_SA_RESTORER diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index c2e00bb2a136..58505f01962f 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -72,7 +72,7 @@ static __always_inline void boot_init_stack_canary(void)  	 * on during the bootup the random pool has true entropy too.  	 */  	get_random_bytes(&canary, sizeof(canary)); -	tsc = __native_read_tsc(); +	tsc = rdtsc();  	canary += tsc + (tsc << 32UL);  	current->stack_canary = canary; diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 751bf4b7bf11..d7f3b3b78ac3 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -79,12 +79,12 @@ do {									\  #else /* CONFIG_X86_32 */  /* frame pointer must be last for get_wchan */ -#define SAVE_CONTEXT    "pushq %%rbp ; movq %%rsi,%%rbp\n\t" -#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp\t" +#define SAVE_CONTEXT    "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t"  #define __EXTRA_CLOBBER  \  	, "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \ -	  "r12", "r13", "r14", "r15", "flags" +	  "r12", "r13", "r14", "r15"  #ifdef CONFIG_CC_STACKPROTECTOR  #define __switch_canary							  \ @@ -100,11 +100,7 @@ do {									\  #define __switch_canary_iparam  #endif	/* CC_STACKPROTECTOR */ -/* - * There is no need to save or restore flags, because flags are always - * clean in kernel mode, with the possible exception of IOPL.  Kernel IOPL - * has no effect. 
- */ +/* Save restore flags to clear handle leaking NT */  #define switch_to(prev, next, last) \  	asm volatile(SAVE_CONTEXT					  \  	     "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */	  \ diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 592a6a672e07..91dfcafe27a6 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -37,6 +37,7 @@ asmlinkage long sys_get_thread_area(struct user_desc __user *);  asmlinkage unsigned long sys_sigreturn(void);  /* kernel/vm86_32.c */ +struct vm86_struct;  asmlinkage long sys_vm86old(struct vm86_struct __user *);  asmlinkage long sys_vm86(unsigned long, unsigned long); diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 225ee545e1a0..8afdc3e44247 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -27,14 +27,17 @@   * Without this offset, that can result in a page fault.  (We are   * careful that, in this case, the value we read doesn't matter.)   * - * In vm86 mode, the hardware frame is much longer still, but we neither - * access the extra members from NMI context, nor do we write such a - * frame at sp0 at all. + * In vm86 mode, the hardware frame is much longer still, so add 16 + * bytes to make room for the real-mode segments.   *   * x86_64 has a fixed-length stack frame.   */  #ifdef CONFIG_X86_32 -# define TOP_OF_KERNEL_STACK_PADDING 8 +# ifdef CONFIG_VM86 +#  define TOP_OF_KERNEL_STACK_PADDING 16 +# else +#  define TOP_OF_KERNEL_STACK_PADDING 8 +# endif  #else  # define TOP_OF_KERNEL_STACK_PADDING 0  #endif @@ -140,27 +143,11 @@ struct thread_info {  	 _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT |	\  	 _TIF_NOHZ) -/* work to do in syscall_trace_leave() */ -#define _TIF_WORK_SYSCALL_EXIT	\ -	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP |	\ -	 _TIF_SYSCALL_TRACEPOINT | _TIF_NOHZ) - -/* work to do on interrupt/exception return */ -#define _TIF_WORK_MASK							\ -	(0x0000FFFF &							\ -	 ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|			\ -	   _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU)) -  /* work to do on any return to user space */  #define _TIF_ALLWORK_MASK						\  	((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT |	\  	_TIF_NOHZ) -/* Only used for 64 bit */ -#define _TIF_DO_NOTIFY_MASK						\ -	(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME |				\ -	 _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE) -  /* flags to check in __switch_to() */  #define _TIF_WORK_CTXSW							\  	(_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index cd791948b286..6df2029405a3 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -261,6 +261,12 @@ static inline void reset_lazy_tlbstate(void)  #endif	/* SMP */ +/* Not inlined due to inc_irq_stat not being defined yet */ +#define flush_tlb_local() {		\ +	inc_irq_stat(irq_tlb_count);	\ +	local_flush_tlb();		\ +} +  #ifndef CONFIG_PARAVIRT  #define flush_tlb_others(mask, mm, start, end)	\  	native_flush_tlb_others(mask, mm, start, end) diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index c5380bea2a36..c3496619740a 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -112,8 +112,8 @@ asmlinkage void smp_threshold_interrupt(void);  asmlinkage void smp_deferred_error_interrupt(void);  #endif -extern enum ctx_state ist_enter(struct pt_regs *regs); -extern void ist_exit(struct pt_regs *regs, enum ctx_state 
prev_state); +extern void ist_enter(struct pt_regs *regs); +extern void ist_exit(struct pt_regs *regs);  extern void ist_begin_non_atomic(struct pt_regs *regs);  extern void ist_end_non_atomic(void); diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 94605c0e9cee..6d7c5479bcea 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -21,28 +21,12 @@ extern void disable_TSC(void);  static inline cycles_t get_cycles(void)  { -	unsigned long long ret = 0; -  #ifndef CONFIG_X86_TSC  	if (!cpu_has_tsc)  		return 0;  #endif -	rdtscll(ret); - -	return ret; -} -static __always_inline cycles_t vget_cycles(void) -{ -	/* -	 * We only do VDSOs on TSC capable CPUs, so this shouldn't -	 * access boot_cpu_data (which is not VDSO-safe): -	 */ -#ifndef CONFIG_X86_TSC -	if (!cpu_has_tsc) -		return 0; -#endif -	return (cycles_t)__native_read_tsc(); +	return rdtsc();  }  extern void tsc_init(void); @@ -51,6 +35,7 @@ extern int unsynchronized_tsc(void);  extern int check_tsc_unstable(void);  extern int check_tsc_disabled(void);  extern unsigned long native_calibrate_tsc(void); +extern unsigned long long native_sched_clock_from_tsc(u64 tsc);  extern int tsc_clocksource_reliable; diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h index 1d8de3f3feca..1e491f3af317 100644 --- a/arch/x86/include/asm/vm86.h +++ b/arch/x86/include/asm/vm86.h @@ -1,7 +1,6 @@  #ifndef _ASM_X86_VM86_H  #define _ASM_X86_VM86_H -  #include <asm/ptrace.h>  #include <uapi/asm/vm86.h> @@ -28,43 +27,49 @@ struct kernel_vm86_regs {  	unsigned short gs, __gsh;  }; -struct kernel_vm86_struct { -	struct kernel_vm86_regs regs; -/* - * the below part remains on the kernel stack while we are in VM86 mode. - * 'tss.esp0' then contains the address of VM86_TSS_ESP0 below, and when we - * get forced back from VM86, the CPU and "SAVE_ALL" will restore the above - * 'struct kernel_vm86_regs' with the then actual values. - * Therefore, pt_regs in fact points to a complete 'kernel_vm86_struct' - * in kernelspace, hence we need not reget the data from userspace. - */ -#define VM86_TSS_ESP0 flags +struct vm86 { +	struct vm86plus_struct __user *user_vm86; +	struct pt_regs regs32; +	unsigned long veflags; +	unsigned long veflags_mask; +	unsigned long saved_sp0; +  	unsigned long flags;  	unsigned long screen_bitmap;  	unsigned long cpu_type;  	struct revectored_struct int_revectored;  	struct revectored_struct int21_revectored;  	struct vm86plus_info_struct vm86plus; -	struct pt_regs *regs32;   /* here we save the pointer to the old regs */ -/* - * The below is not part of the structure, but the stack layout continues - * this way. In front of 'return-eip' may be some data, depending on - * compilation, so we don't rely on this and save the pointer to 'oldregs' - * in 'regs32' above. 
- * However, with GCC-2.7.2 and the current CFLAGS you see exactly this: - -	long return-eip;        from call to vm86() -	struct pt_regs oldregs;  user space registers as saved by syscall - */  };  #ifdef CONFIG_VM86  void handle_vm86_fault(struct kernel_vm86_regs *, long);  int handle_vm86_trap(struct kernel_vm86_regs *, long, int); -struct pt_regs *save_v86_state(struct kernel_vm86_regs *); +void save_v86_state(struct kernel_vm86_regs *, int);  struct task_struct; + +#define free_vm86(t) do {				\ +	struct thread_struct *__t = (t);		\ +	if (__t->vm86 != NULL) {			\ +		kfree(__t->vm86);			\ +		__t->vm86 = NULL;			\ +	}						\ +} while (0) + +/* + * Support for VM86 programs to request interrupts for + * real mode hardware drivers: + */ +#define FIRST_VM86_IRQ		 3 +#define LAST_VM86_IRQ		15 + +static inline int invalid_vm86_irq(int irq) +{ +	return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ; +} +  void release_vm86_irqs(struct task_struct *);  #else @@ -77,6 +82,10 @@ static inline int handle_vm86_trap(struct kernel_vm86_regs *a, long b, int c)  	return 0;  } +static inline void save_v86_state(struct kernel_vm86_regs *a, int b) { } + +#define free_vm86(t) do { } while(0) +  #endif /* CONFIG_VM86 */  #endif /* _ASM_X86_VM86_H */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index da772edd19ab..448b7ca61aee 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -47,6 +47,7 @@  #define CPU_BASED_MOV_DR_EXITING                0x00800000  #define CPU_BASED_UNCOND_IO_EXITING             0x01000000  #define CPU_BASED_USE_IO_BITMAPS                0x02000000 +#define CPU_BASED_MONITOR_TRAP_FLAG             0x08000000  #define CPU_BASED_USE_MSR_BITMAPS               0x10000000  #define CPU_BASED_MONITOR_EXITING               0x20000000  #define CPU_BASED_PAUSE_EXITING                 0x40000000 @@ -367,29 +368,29 @@ enum vmcs_field {  #define TYPE_PHYSICAL_APIC_EVENT        (10 << 12)  #define TYPE_PHYSICAL_APIC_INST         (15 << 12) -/* segment AR */ -#define SEGMENT_AR_L_MASK (1 << 13) - -#define AR_TYPE_ACCESSES_MASK 1 -#define AR_TYPE_READABLE_MASK (1 << 1) -#define AR_TYPE_WRITEABLE_MASK (1 << 2) -#define AR_TYPE_CODE_MASK (1 << 3) -#define AR_TYPE_MASK 0x0f -#define AR_TYPE_BUSY_64_TSS 11 -#define AR_TYPE_BUSY_32_TSS 11 -#define AR_TYPE_BUSY_16_TSS 3 -#define AR_TYPE_LDT 2 - -#define AR_UNUSABLE_MASK (1 << 16) -#define AR_S_MASK (1 << 4) -#define AR_P_MASK (1 << 7) -#define AR_L_MASK (1 << 13) -#define AR_DB_MASK (1 << 14) -#define AR_G_MASK (1 << 15) -#define AR_DPL_SHIFT 5 -#define AR_DPL(ar) (((ar) >> AR_DPL_SHIFT) & 3) - -#define AR_RESERVD_MASK 0xfffe0f00 +/* segment AR in VMCS -- these are different from what LAR reports */ +#define VMX_SEGMENT_AR_L_MASK (1 << 13) + +#define VMX_AR_TYPE_ACCESSES_MASK 1 +#define VMX_AR_TYPE_READABLE_MASK (1 << 1) +#define VMX_AR_TYPE_WRITEABLE_MASK (1 << 2) +#define VMX_AR_TYPE_CODE_MASK (1 << 3) +#define VMX_AR_TYPE_MASK 0x0f +#define VMX_AR_TYPE_BUSY_64_TSS 11 +#define VMX_AR_TYPE_BUSY_32_TSS 11 +#define VMX_AR_TYPE_BUSY_16_TSS 3 +#define VMX_AR_TYPE_LDT 2 + +#define VMX_AR_UNUSABLE_MASK (1 << 16) +#define VMX_AR_S_MASK (1 << 4) +#define VMX_AR_P_MASK (1 << 7) +#define VMX_AR_L_MASK (1 << 13) +#define VMX_AR_DB_MASK (1 << 14) +#define VMX_AR_G_MASK (1 << 15) +#define VMX_AR_DPL_SHIFT 5 +#define VMX_AR_DPL(ar) (((ar) >> VMX_AR_DPL_SHIFT) & 3) + +#define VMX_AR_RESERVD_MASK 0xfffe0f00  #define TSS_PRIVATE_MEMSLOT			(KVM_USER_MEM_SLOTS + 0)  #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT	(KVM_USER_MEM_SLOTS + 1) 
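The new pmem helpers introduced by the asm/pmem.h hunk earlier in this series are meant to be used in pairs: one or more arch_memcpy_to_pmem() (or arch_clear_pmem()) calls push data toward media with non-temporal stores, and a single arch_wmb_pmem() then drains CPU and memory-controller write buffers to make those writes durable. A minimal usage sketch of that pattern follows; the caller example_pmem_write() and its signature are illustrative assumptions, not part of these patches.

#include <asm/pmem.h>

/* Hypothetical caller, shown only to illustrate the copy-then-drain pattern. */
static void example_pmem_write(void __pmem *dst, const void *rec, size_t len)
{
	/* non-temporal copy: data bypasses the CPU cache on its way to media */
	arch_memcpy_to_pmem(dst, rec, len);

	/* sfence + pcommit: make the preceding stores durable on media */
	arch_wmb_pmem();
}

Callers would typically batch several copies before a single arch_wmb_pmem(), matching the kernel-doc above ("after a series of arch_memcpy_to_pmem() operations"); __arch_has_wmb_pmem() gates the whole scheme on X86_FEATURE_PCOMMIT.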
diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h index 608a79d5a466..e6911caf5bbf 100644 --- a/arch/x86/include/asm/xen/events.h +++ b/arch/x86/include/asm/xen/events.h @@ -20,4 +20,15 @@ static inline int xen_irqs_disabled(struct pt_regs *regs)  /* No need for a barrier -- XCHG is a barrier on x86. */  #define xchg_xen_ulong(ptr, val) xchg((ptr), (val)) +extern int xen_have_vector_callback; + +/* + * Events delivered via platform PCI interrupts are always + * routed to vcpu 0 and hence cannot be rebound. + */ +static inline bool xen_support_evtchn_rebind(void) +{ +	return (!xen_hvm_domain() || xen_have_vector_callback); +} +  #endif /* _ASM_X86_XEN_EVENTS_H */ diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index ca08a27b90b3..83aea8055119 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -465,6 +465,12 @@ HYPERVISOR_tmem_op(  	return _hypercall1(int, tmem_op, op);  } +static inline int +HYPERVISOR_xenpmu_op(unsigned int op, void *arg) +{ +	return _hypercall2(int, xenpmu_op, op, arg); +} +  static inline void  MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)  { diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index 3400dbaec3c3..62ca03ef5c65 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -3,12 +3,38 @@   *   * Guest OS interface to x86 Xen.   * - * Copyright (c) 2004, K A Fraser + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2004-2006, K A Fraser   */  #ifndef _ASM_X86_XEN_INTERFACE_H  #define _ASM_X86_XEN_INTERFACE_H +/* + * XEN_GUEST_HANDLE represents a guest pointer, when passed as a field + * in a struct in memory. + * XEN_GUEST_HANDLE_PARAM represent a guest pointer, when passed as an + * hypercall argument. + * XEN_GUEST_HANDLE_PARAM and XEN_GUEST_HANDLE are the same on X86 but + * they might not be on other architectures. + */  #ifdef __XEN__  #define __DEFINE_GUEST_HANDLE(name, type) \      typedef struct { type *p; } __guest_handle_ ## name @@ -88,13 +114,16 @@ DEFINE_GUEST_HANDLE(xen_ulong_t);   * start of the GDT because some stupid OSes export hard-coded selector values   * in their ABI. These hard-coded values are always near the start of the GDT,   * so Xen places itself out of the way, at the far end of the GDT. 
+ * + * NB The LDT is set using the MMUEXT_SET_LDT op of HYPERVISOR_mmuext_op   */  #define FIRST_RESERVED_GDT_PAGE  14  #define FIRST_RESERVED_GDT_BYTE  (FIRST_RESERVED_GDT_PAGE * 4096)  #define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)  /* - * Send an array of these to HYPERVISOR_set_trap_table() + * Send an array of these to HYPERVISOR_set_trap_table(). + * Terminate the array with a sentinel entry, with traps[].address==0.   * The privilege level specifies which modes may enter a trap via a software   * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate   * privilege levels as follows: @@ -118,10 +147,41 @@ struct trap_info {  DEFINE_GUEST_HANDLE_STRUCT(trap_info);  struct arch_shared_info { -    unsigned long max_pfn;                  /* max pfn that appears in table */ -    /* Frame containing list of mfns containing list of mfns containing p2m. */ -    unsigned long pfn_to_mfn_frame_list_list; -    unsigned long nmi_reason; +	/* +	 * Number of valid entries in the p2m table(s) anchored at +	 * pfn_to_mfn_frame_list_list and/or p2m_vaddr. +	 */ +	unsigned long max_pfn; +	/* +	 * Frame containing list of mfns containing list of mfns containing p2m. +	 * A value of 0 indicates it has not yet been set up, ~0 indicates it +	 * has been set to invalid e.g. due to the p2m being too large for the +	 * 3-level p2m tree. In this case the linear mapper p2m list anchored +	 * at p2m_vaddr is to be used. +	 */ +	xen_pfn_t pfn_to_mfn_frame_list_list; +	unsigned long nmi_reason; +	/* +	 * Following three fields are valid if p2m_cr3 contains a value +	 * different from 0. +	 * p2m_cr3 is the root of the address space where p2m_vaddr is valid. +	 * p2m_cr3 is in the same format as a cr3 value in the vcpu register +	 * state and holds the folded machine frame number (via xen_pfn_to_cr3) +	 * of a L3 or L4 page table. +	 * p2m_vaddr holds the virtual address of the linear p2m list. All +	 * entries in the range [0...max_pfn[ are accessible via this pointer. +	 * p2m_generation will be incremented by the guest before and after each +	 * change of the mappings of the p2m list. p2m_generation starts at 0 +	 * and a value with the least significant bit set indicates that a +	 * mapping update is in progress. This allows guest external software +	 * (e.g. in Dom0) to verify that read mappings are consistent and +	 * whether they have changed since the last check. +	 * Modifying a p2m element in the linear p2m list is allowed via an +	 * atomic write only. +	 */ +	unsigned long p2m_cr3;		/* cr3 value of the p2m address space */ +	unsigned long p2m_vaddr;	/* virtual address of the p2m list */ +	unsigned long p2m_generation;	/* generation count of p2m mapping */  };  #endif	/* !__ASSEMBLY__ */ @@ -137,13 +197,31 @@ struct arch_shared_info {  /*   * The following is all CPU context. Note that the fpu_ctxt block is filled   * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used. + * + * Also note that when calling DOMCTL_setvcpucontext and VCPU_initialise + * for HVM and PVH guests, not all information in this structure is updated: + * + * - For HVM guests, the structures read include: fpu_ctxt (if + * VGCT_I387_VALID is set), flags, user_regs, debugreg[*] + * + * - PVH guests are the same as HVM guests, but additionally use ctrlreg[3] to + * set cr3. All other fields not used should be set to 0.   */  struct vcpu_guest_context {      /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. 
*/      struct { char x[512]; } fpu_ctxt;       /* User-level FPU registers     */ -#define VGCF_I387_VALID (1<<0) -#define VGCF_HVM_GUEST  (1<<1) -#define VGCF_IN_KERNEL  (1<<2) +#define VGCF_I387_VALID                (1<<0) +#define VGCF_IN_KERNEL                 (1<<2) +#define _VGCF_i387_valid               0 +#define VGCF_i387_valid                (1<<_VGCF_i387_valid) +#define _VGCF_in_kernel                2 +#define VGCF_in_kernel                 (1<<_VGCF_in_kernel) +#define _VGCF_failsafe_disables_events 3 +#define VGCF_failsafe_disables_events  (1<<_VGCF_failsafe_disables_events) +#define _VGCF_syscall_disables_events  4 +#define VGCF_syscall_disables_events   (1<<_VGCF_syscall_disables_events) +#define _VGCF_online                   5 +#define VGCF_online                    (1<<_VGCF_online)      unsigned long flags;                    /* VGCF_* flags                 */      struct cpu_user_regs user_regs;         /* User-level CPU registers     */      struct trap_info trap_ctxt[256];        /* Virtual IDT                  */ @@ -172,6 +250,129 @@ struct vcpu_guest_context {  #endif  };  DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context); + +/* AMD PMU registers and structures */ +struct xen_pmu_amd_ctxt { +	/* +	 * Offsets to counter and control MSRs (relative to xen_pmu_arch.c.amd). +	 * For PV(H) guests these fields are RO. +	 */ +	uint32_t counters; +	uint32_t ctrls; + +	/* Counter MSRs */ +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L +	uint64_t regs[]; +#elif defined(__GNUC__) +	uint64_t regs[0]; +#endif +}; + +/* Intel PMU registers and structures */ +struct xen_pmu_cntr_pair { +	uint64_t counter; +	uint64_t control; +}; + +struct xen_pmu_intel_ctxt { +	/* +	 * Offsets to fixed and architectural counter MSRs (relative to +	 * xen_pmu_arch.c.intel). +	 * For PV(H) guests these fields are RO. +	 */ +	uint32_t fixed_counters; +	uint32_t arch_counters; + +	/* PMU registers */ +	uint64_t global_ctrl; +	uint64_t global_ovf_ctrl; +	uint64_t global_status; +	uint64_t fixed_ctrl; +	uint64_t ds_area; +	uint64_t pebs_enable; +	uint64_t debugctl; + +	/* Fixed and architectural counter MSRs */ +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L +	uint64_t regs[]; +#elif defined(__GNUC__) +	uint64_t regs[0]; +#endif +}; + +/* Sampled domain's registers */ +struct xen_pmu_regs { +	uint64_t ip; +	uint64_t sp; +	uint64_t flags; +	uint16_t cs; +	uint16_t ss; +	uint8_t cpl; +	uint8_t pad[3]; +}; + +/* PMU flags */ +#define PMU_CACHED	   (1<<0) /* PMU MSRs are cached in the context */ +#define PMU_SAMPLE_USER	   (1<<1) /* Sample is from user or kernel mode */ +#define PMU_SAMPLE_REAL	   (1<<2) /* Sample is from realmode */ +#define PMU_SAMPLE_PV	   (1<<3) /* Sample from a PV guest */ + +/* + * Architecture-specific information describing state of the processor at + * the time of PMU interrupt. + * Fields of this structure marked as RW for guest should only be written by + * the guest when PMU_CACHED bit in pmu_flags is set (which is done by the + * hypervisor during PMU interrupt). Hypervisor will read updated data in + * XENPMU_flush hypercall and clear PMU_CACHED bit. + */ +struct xen_pmu_arch { +	union { +		/* +		 * Processor's registers at the time of interrupt. +		 * WO for hypervisor, RO for guests. 
+		 */ +		struct xen_pmu_regs regs; +		/* +		 * Padding for adding new registers to xen_pmu_regs in +		 * the future +		 */ +#define XENPMU_REGS_PAD_SZ  64 +		uint8_t pad[XENPMU_REGS_PAD_SZ]; +	} r; + +	/* WO for hypervisor, RO for guest */ +	uint64_t pmu_flags; + +	/* +	 * APIC LVTPC register. +	 * RW for both hypervisor and guest. +	 * Only APIC_LVT_MASKED bit is loaded by the hypervisor into hardware +	 * during XENPMU_flush or XENPMU_lvtpc_set. +	 */ +	union { +		uint32_t lapic_lvtpc; +		uint64_t pad; +	} l; + +	/* +	 * Vendor-specific PMU registers. +	 * RW for both hypervisor and guest (see exceptions above). +	 * Guest's updates to this field are verified and then loaded by the +	 * hypervisor into hardware during XENPMU_flush +	 */ +	union { +		struct xen_pmu_amd_ctxt amd; +		struct xen_pmu_intel_ctxt intel; + +		/* +		 * Padding for contexts (fixed parts only, does not include +		 * MSR banks that are specified by offsets) +		 */ +#define XENPMU_CTXT_PAD_SZ  128 +		uint8_t pad[XENPMU_CTXT_PAD_SZ]; +	} c; +}; +  #endif	/* !__ASSEMBLY__ */  /* diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index c44a5d53e464..0679e11d2cf7 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -35,9 +35,7 @@ typedef struct xpaddr {  #define FOREIGN_FRAME(m)	((m) | FOREIGN_FRAME_BIT)  #define IDENTITY_FRAME(m)	((m) | IDENTITY_FRAME_BIT) -/* Maximum amount of memory we can handle in a domain in pages */ -#define MAX_DOMAIN_PAGES						\ -    ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE)) +#define P2M_PER_PAGE		(PAGE_SIZE / sizeof(unsigned long))  extern unsigned long *machine_to_phys_mapping;  extern unsigned long  machine_to_phys_nr; @@ -48,8 +46,8 @@ extern unsigned long  xen_max_p2m_pfn;  extern unsigned long get_phys_to_machine(unsigned long pfn);  extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);  extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); -extern unsigned long set_phys_range_identity(unsigned long pfn_s, -					     unsigned long pfn_e); +extern unsigned long __init set_phys_range_identity(unsigned long pfn_s, +						    unsigned long pfn_e);  extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,  				   struct gnttab_map_grant_ref *kmap_ops, @@ -103,6 +101,11 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn)  {  	unsigned long mfn; +	/* +	 * Some x86 code are still using pfn_to_mfn instead of +	 * pfn_to_mfn. This will have to be removed when we figured +	 * out which call. +	 */  	if (xen_feature(XENFEAT_auto_translated_physmap))  		return pfn; @@ -149,6 +152,11 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)  {  	unsigned long pfn; +	/* +	 * Some x86 code are still using mfn_to_pfn instead of +	 * gfn_to_pfn. This will have to be removed when we figure +	 * out which call. 
+	 */  	if (xen_feature(XENFEAT_auto_translated_physmap))  		return mfn; @@ -178,6 +186,27 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine)  	return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset);  } +/* Pseudo-physical <-> Guest conversion */ +static inline unsigned long pfn_to_gfn(unsigned long pfn) +{ +	if (xen_feature(XENFEAT_auto_translated_physmap)) +		return pfn; +	else +		return pfn_to_mfn(pfn); +} + +static inline unsigned long gfn_to_pfn(unsigned long gfn) +{ +	if (xen_feature(XENFEAT_auto_translated_physmap)) +		return gfn; +	else +		return mfn_to_pfn(gfn); +} + +/* Pseudo-physical <-> Bus conversion */ +#define pfn_to_bfn(pfn)		pfn_to_gfn(pfn) +#define bfn_to_pfn(bfn)		gfn_to_pfn(bfn) +  /*   * We detect special mappings in one of two ways:   *  1. If the MFN is an I/O page then Xen will set the m2p entry @@ -198,7 +227,7 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine)   *      require. In all the cases we care about, the FOREIGN_FRAME bit is   *      masked (e.g., pfn_to_mfn()) so behaviour there is correct.   */ -static inline unsigned long mfn_to_local_pfn(unsigned long mfn) +static inline unsigned long bfn_to_local_pfn(unsigned long mfn)  {  	unsigned long pfn; @@ -217,6 +246,10 @@ static inline unsigned long mfn_to_local_pfn(unsigned long mfn)  #define virt_to_mfn(v)		(pfn_to_mfn(virt_to_pfn(v)))  #define mfn_to_virt(m)		(__va(mfn_to_pfn(m) << PAGE_SHIFT)) +/* VIRT <-> GUEST conversion */ +#define virt_to_gfn(v)		(pfn_to_gfn(virt_to_pfn(v))) +#define gfn_to_virt(g)		(__va(gfn_to_pfn(g) << PAGE_SHIFT)) +  static inline unsigned long pte_mfn(pte_t pte)  {  	return (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT; @@ -264,7 +297,7 @@ void make_lowmem_page_readwrite(void *vaddr);  static inline bool xen_arch_need_swiotlb(struct device *dev,  					 unsigned long pfn, -					 unsigned long mfn) +					 unsigned long bfn)  {  	return false;  }  |
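The gfn conversion helpers added to asm/xen/page.h above let common code stop open-coding the XENFEAT_auto_translated_physmap check: on PV guests pfn_to_gfn() resolves through the p2m to a machine frame, while on auto-translated (HVM/PVH) guests it is the identity. The sketch below shows the intended call pattern; example_share_page() is a hypothetical caller, and the gnttab_grant_foreign_access() call is used only to illustrate a place where the hypervisor interface expects a guest frame number.

#include <xen/grant_table.h>
#include <xen/page.h>

/* Hypothetical caller: share one kernel page with another domain. */
static int example_share_page(domid_t peer, void *page, int readonly)
{
	/* PFN on auto-translated guests, MFN on PV -- the caller no longer cares */
	unsigned long gfn = virt_to_gfn(page);

	/* returns a grant reference on success, a negative errno on failure */
	return gnttab_grant_foreign_access(peer, gfn, readonly);
}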