diff options
Diffstat (limited to 'arch/x86')
265 files changed, 6841 insertions, 7374 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 32a1918e1b88..0896008f7509 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -45,11 +45,15 @@ config X86 select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_EFFICIENT_UNALIGNED_ACCESS select USER_STACKTRACE_SUPPORT + select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_DMA_API_DEBUG select HAVE_KERNEL_GZIP select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_LZMA + select HAVE_KERNEL_LZO select HAVE_HW_BREAKPOINT + select PERF_EVENTS + select ANON_INODES select HAVE_ARCH_KMEMCHECK select HAVE_USER_RETURN_NOTIFIER @@ -986,12 +990,6 @@ config X86_CPUID with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to /dev/cpu/31/cpuid. -config X86_CPU_DEBUG - tristate "/sys/kernel/debug/x86/cpu/* - CPU Debug support" - ---help--- - If you select this option, this will provide various x86 CPUs - information through debugfs. - choice prompt "High Memory Support" default HIGHMEM4G if !X86_NUMAQ @@ -1244,6 +1242,11 @@ config ARCH_MEMORY_PROBE def_bool X86_64 depends on MEMORY_HOTPLUG +config ILLEGAL_POINTER_VALUE + hex + default 0 if X86_32 + default 0xdead000000000000 if X86_64 + source "mm/Kconfig" config HIGHPTE @@ -2012,18 +2015,9 @@ config SCx200HR_TIMER processor goes idle (as is done by the scheduler). The other workaround is idle=poll boot option. -config GEODE_MFGPT_TIMER - def_bool y - prompt "Geode Multi-Function General Purpose Timer (MFGPT) events" - depends on MGEODE_LX && GENERIC_TIME && GENERIC_CLOCKEVENTS - ---help--- - This driver provides a clock event source based on the MFGPT - timer(s) in the CS5535 and CS5536 companion chip for the geode. - MFGPTs have a better resolution and max interval than the - generic PIT, and are suitable for use as high-res timers. - config OLPC bool "One Laptop Per Child support" + select GPIOLIB default n ---help--- Add support for detecting the unique features of the OLPC diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 08e442bc3ab9..a19829374e6a 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -319,7 +319,7 @@ config X86_L1_CACHE_SHIFT config X86_XADD def_bool y - depends on X86_32 && !M386 + depends on X86_64 || !M386 config X86_PPRO_FENCE bool "PentiumPro memory ordering errata workaround" @@ -396,7 +396,7 @@ config X86_TSC config X86_CMPXCHG64 def_bool y - depends on !M386 && !M486 + depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM # this should be set for all -march=.. options where the compiler # generates cmov. diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 731318e5ac1d..bc01e3ebfeb2 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -187,8 +187,8 @@ config HAVE_MMIOTRACE_SUPPORT def_bool y config X86_DECODER_SELFTEST - bool "x86 instruction decoder selftest" - depends on DEBUG_KERNEL + bool "x86 instruction decoder selftest" + depends on DEBUG_KERNEL && KPROBES ---help--- Perform x86 instruction decoder selftests at build time. This option is useful for checking the sanity of x86 instruction diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 78b32be55e9e..0a43dc515e4c 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -135,9 +135,7 @@ drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/ # suspend and hibernation support drivers-$(CONFIG_PM) += arch/x86/power/ -ifeq ($(CONFIG_X86_32),y) drivers-$(CONFIG_FB) += arch/x86/video/ -endif #### # boot loader support. Several targets are kept for legacy purposes diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index f8ed0658404c..fbb47daf2459 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -4,11 +4,12 @@ # create a compressed vmlinux image from the original vmlinux # -targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma head_$(BITS).o misc.o piggy.o +targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.lzo head_$(BITS).o misc.o piggy.o KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 KBUILD_CFLAGS += -fno-strict-aliasing -fPIC KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING +cflags-$(CONFIG_X86_32) := -march=i386 cflags-$(CONFIG_X86_64) := -mcmodel=small KBUILD_CFLAGS += $(cflags-y) KBUILD_CFLAGS += $(call cc-option,-ffreestanding) @@ -48,10 +49,13 @@ $(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE $(call if_changed,bzip2) $(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE $(call if_changed,lzma) +$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE + $(call if_changed,lzo) suffix-$(CONFIG_KERNEL_GZIP) := gz suffix-$(CONFIG_KERNEL_BZIP2) := bz2 suffix-$(CONFIG_KERNEL_LZMA) := lzma +suffix-$(CONFIG_KERNEL_LZO) := lzo quiet_cmd_mkpiggy = MKPIGGY $@ cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false ) diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 842b2a36174a..51e240779a44 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -19,11 +19,6 @@ #define _ASM_X86_DESC_H 1 #endif -#ifdef CONFIG_X86_64 -#define _LINUX_STRING_H_ 1 -#define __LINUX_BITMAP_H 1 -#endif - #include <linux/linkage.h> #include <linux/screen_info.h> #include <linux/elf.h> @@ -131,8 +126,8 @@ static void error(char *m); static struct boot_params *real_mode; /* Pointer to real-mode data */ static int quiet; -static void *memset(void *s, int c, unsigned n); -void *memcpy(void *dest, const void *src, unsigned n); +void *memset(void *s, int c, size_t n); +void *memcpy(void *dest, const void *src, size_t n); static void __putstr(int, const char *); #define putstr(__x) __putstr(0, __x) @@ -162,6 +157,10 @@ static int lines, cols; #include "../../../../lib/decompress_unlzma.c" #endif +#ifdef CONFIG_KERNEL_LZO +#include "../../../../lib/decompress_unlzo.c" +#endif + static void scroll(void) { int i; @@ -181,11 +180,9 @@ static void __putstr(int error, const char *s) return; #endif -#ifdef CONFIG_X86_32 if (real_mode->screen_info.orig_video_mode == 0 && lines == 0 && cols == 0) return; -#endif x = real_mode->screen_info.orig_x; y = real_mode->screen_info.orig_y; @@ -219,7 +216,7 @@ static void __putstr(int error, const char *s) outb(0xff & (pos >> 1), vidport+1); } -static void *memset(void *s, int c, unsigned n) +void *memset(void *s, int c, size_t n) { int i; char *ss = s; @@ -229,7 +226,7 @@ static void *memset(void *s, int c, unsigned n) return s; } -void *memcpy(void *dest, const void *src, unsigned n) +void *memcpy(void *dest, const void *src, size_t n) { int i; const char *s = src; diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c index bbeb0c3fbd90..89bbf4e4d05d 100644 --- a/arch/x86/boot/compressed/relocs.c +++ b/arch/x86/boot/compressed/relocs.c @@ -9,6 +9,9 @@ #include <byteswap.h> #define USE_BSD #include <endian.h> +#include <regex.h> + +static void die(char *fmt, ...); #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) static Elf32_Ehdr ehdr; @@ -30,25 +33,47 @@ static struct section *secs; * the address for which it has been compiled. Don't warn user about * absolute relocations present w.r.t these symbols. */ -static const char* safe_abs_relocs[] = { - "xen_irq_disable_direct_reloc", - "xen_save_fl_direct_reloc", -}; +static const char abs_sym_regex[] = + "^(xen_irq_disable_direct_reloc$|" + "xen_save_fl_direct_reloc$|" + "VDSO|" + "__crc_)"; +static regex_t abs_sym_regex_c; +static int is_abs_reloc(const char *sym_name) +{ + return !regexec(&abs_sym_regex_c, sym_name, 0, NULL, 0); +} -static int is_safe_abs_reloc(const char* sym_name) +/* + * These symbols are known to be relative, even if the linker marks them + * as absolute (typically defined outside any section in the linker script.) + */ +static const char rel_sym_regex[] = + "^_end$"; +static regex_t rel_sym_regex_c; +static int is_rel_reloc(const char *sym_name) { - int i; + return !regexec(&rel_sym_regex_c, sym_name, 0, NULL, 0); +} - for (i = 0; i < ARRAY_SIZE(safe_abs_relocs); i++) { - if (!strcmp(sym_name, safe_abs_relocs[i])) - /* Match found */ - return 1; - } - if (strncmp(sym_name, "VDSO", 4) == 0) - return 1; - if (strncmp(sym_name, "__crc_", 6) == 0) - return 1; - return 0; +static void regex_init(void) +{ + char errbuf[128]; + int err; + + err = regcomp(&abs_sym_regex_c, abs_sym_regex, + REG_EXTENDED|REG_NOSUB); + if (err) { + regerror(err, &abs_sym_regex_c, errbuf, sizeof errbuf); + die("%s", errbuf); + } + + err = regcomp(&rel_sym_regex_c, rel_sym_regex, + REG_EXTENDED|REG_NOSUB); + if (err) { + regerror(err, &rel_sym_regex_c, errbuf, sizeof errbuf); + die("%s", errbuf); + } } static void die(char *fmt, ...) @@ -131,7 +156,7 @@ static const char *rel_type(unsigned type) #undef REL_TYPE }; const char *name = "unknown type rel type name"; - if (type < ARRAY_SIZE(type_name)) { + if (type < ARRAY_SIZE(type_name) && type_name[type]) { name = type_name[type]; } return name; @@ -448,7 +473,7 @@ static void print_absolute_relocs(void) * Before warning check if this absolute symbol * relocation is harmless. */ - if (is_safe_abs_reloc(name)) + if (is_abs_reloc(name) || is_rel_reloc(name)) continue; if (!printed) { @@ -501,21 +526,26 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym)) sym = &sh_symtab[ELF32_R_SYM(rel->r_info)]; r_type = ELF32_R_TYPE(rel->r_info); /* Don't visit relocations to absolute symbols */ - if (sym->st_shndx == SHN_ABS) { + if (sym->st_shndx == SHN_ABS && + !is_rel_reloc(sym_name(sym_strtab, sym))) { continue; } - if (r_type == R_386_NONE || r_type == R_386_PC32) { + switch (r_type) { + case R_386_NONE: + case R_386_PC32: /* * NONE can be ignored and and PC relative * relocations don't need to be adjusted. */ - } - else if (r_type == R_386_32) { + break; + case R_386_32: /* Visit relocations that need to be adjusted */ visit(rel, sym); - } - else { - die("Unsupported relocation type: %d\n", r_type); + break; + default: + die("Unsupported relocation type: %s (%d)\n", + rel_type(r_type), r_type); + break; } } } @@ -571,16 +601,15 @@ static void emit_relocs(int as_text) } else { unsigned char buf[4]; - buf[0] = buf[1] = buf[2] = buf[3] = 0; /* Print a stop */ - printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]); + fwrite("\0\0\0\0", 4, 1, stdout); /* Now print each relocation */ for (i = 0; i < reloc_count; i++) { buf[0] = (relocs[i] >> 0) & 0xff; buf[1] = (relocs[i] >> 8) & 0xff; buf[2] = (relocs[i] >> 16) & 0xff; buf[3] = (relocs[i] >> 24) & 0xff; - printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]); + fwrite(buf, 4, 1, stdout); } } } @@ -598,6 +627,8 @@ int main(int argc, char **argv) FILE *fp; int i; + regex_init(); + show_absolute_syms = 0; show_absolute_relocs = 0; as_text = 0; diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index b31cc54b4641..93e689f4bd86 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -16,7 +16,7 @@ */ #include <asm/segment.h> -#include <linux/utsrelease.h> +#include <generated/utsrelease.h> #include <asm/boot.h> #include <asm/e820.h> #include <asm/page_types.h> diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c index 8ef60f20b371..919257f526f2 100644 --- a/arch/x86/boot/mkcpustr.c +++ b/arch/x86/boot/mkcpustr.c @@ -22,7 +22,7 @@ int main(void) int i, j; const char *str; - printf("static const char x86_cap_strs[] = \n"); + printf("static const char x86_cap_strs[] =\n"); for (i = 0; i < NCAPINTS; i++) { for (j = 0; j < 32; j++) { diff --git a/arch/x86/boot/version.c b/arch/x86/boot/version.c index 2723d9b5ce43..2b15aa488ffb 100644 --- a/arch/x86/boot/version.c +++ b/arch/x86/boot/version.c @@ -13,8 +13,8 @@ */ #include "boot.h" -#include <linux/utsrelease.h> -#include <linux/compile.h> +#include <generated/utsrelease.h> +#include <generated/compile.h> const char kernel_version[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") " diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c index 819caa1f2008..ed7aeff786b2 100644 --- a/arch/x86/boot/video-vga.c +++ b/arch/x86/boot/video-vga.c @@ -42,22 +42,15 @@ static u8 vga_set_basic_mode(void) { struct biosregs ireg, oreg; u16 ax; - u8 rows; u8 mode; initregs(&ireg); + /* Query current mode */ ax = 0x0f00; intcall(0x10, &ireg, &oreg); mode = oreg.al; - set_fs(0); - rows = rdfs8(0x484); /* rows minus one */ - - if ((oreg.ax == 0x5003 || oreg.ax == 0x5007) && - (rows == 0 || rows == 24)) - return mode; - if (mode != 3 && mode != 7) mode = 3; diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c index f767164cd5df..43eda284d27f 100644 --- a/arch/x86/boot/video.c +++ b/arch/x86/boot/video.c @@ -298,11 +298,18 @@ static void restore_screen(void) } /* Restore cursor position */ + if (saved.curx >= xs) + saved.curx = xs-1; + if (saved.cury >= ys) + saved.cury = ys-1; + initregs(&ireg); ireg.ah = 0x02; /* Set cursor position */ ireg.dh = saved.cury; ireg.dl = saved.curx; intcall(0x10, &ireg, NULL); + + store_cursor_position(); } void set_video(void) diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 2a4d073d2cf1..9046e4af66ce 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -297,7 +297,7 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs) * size limits imposed on them by creating programs with large * arrays in the data or bss. */ - rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; + rlim = rlimit(RLIMIT_DATA); if (rlim >= RLIM_INFINITY) rlim = ~0; if (ex.a_data + ex.a_bss > rlim) @@ -308,14 +308,15 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (retval) return retval; - regs->cs = __USER32_CS; - regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = - regs->r13 = regs->r14 = regs->r15 = 0; - /* OK, This is the point of no return */ set_personality(PER_LINUX); set_thread_flag(TIF_IA32); - clear_thread_flag(TIF_ABI_PENDING); + + setup_new_exec(bprm); + + regs->cs = __USER32_CS; + regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = + regs->r13 = regs->r14 = regs->r15 = 0; current->mm->end_code = ex.a_text + (current->mm->start_code = N_TXTADDR(ex)); diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 4eefdca9832b..53147ad85b96 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -696,7 +696,7 @@ ia32_sys_call_table: .quad quiet_ni_syscall /* streams2 */ .quad stub32_vfork /* 190 */ .quad compat_sys_getrlimit - .quad sys32_mmap2 + .quad sys_mmap_pgoff .quad sys32_truncate64 .quad sys32_ftruncate64 .quad sys32_stat64 /* 195 */ diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index df82c0e48ded..422572c77923 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -155,9 +155,6 @@ struct mmap_arg_struct { asmlinkage long sys32_mmap(struct mmap_arg_struct __user *arg) { struct mmap_arg_struct a; - struct file *file = NULL; - unsigned long retval; - struct mm_struct *mm ; if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; @@ -165,22 +162,8 @@ asmlinkage long sys32_mmap(struct mmap_arg_struct __user *arg) if (a.offset & ~PAGE_MASK) return -EINVAL; - if (!(a.flags & MAP_ANONYMOUS)) { - file = fget(a.fd); - if (!file) - return -EBADF; - } - - mm = current->mm; - down_write(&mm->mmap_sem); - retval = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags, + return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset>>PAGE_SHIFT); - if (file) - fput(file); - - up_write(&mm->mmap_sem); - - return retval; } asmlinkage long sys32_mprotect(unsigned long start, size_t len, @@ -483,30 +466,6 @@ asmlinkage long sys32_sendfile(int out_fd, int in_fd, return ret; } -asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - struct mm_struct *mm = current->mm; - unsigned long error; - struct file *file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - return -EBADF; - } - - down_write(&mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(&mm->mmap_sem); - - if (file) - fput(file); - return error; -} - asmlinkage long sys32_olduname(struct oldold_utsname __user *name) { char *arch = "x86_64"; diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 60d2b2db0bc5..56f462cf22d2 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -142,6 +142,32 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate) return max_cstate; } +static inline bool arch_has_acpi_pdc(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + return (c->x86_vendor == X86_VENDOR_INTEL || + c->x86_vendor == X86_VENDOR_CENTAUR); +} + +static inline void arch_acpi_set_pdc_bits(u32 *buf) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + + buf[2] |= ACPI_PDC_C_CAPABILITY_SMP; + + if (cpu_has(c, X86_FEATURE_EST)) + buf[2] |= ACPI_PDC_EST_CAPABILITY_SWSMP; + + if (cpu_has(c, X86_FEATURE_ACPI)) + buf[2] |= ACPI_PDC_T_FFH; + + /* + * If mwait/monitor is unsupported, C2/C3_FFH will be disabled + */ + if (!cpu_has(c, X86_FEATURE_MWAIT)) + buf[2] &= ~(ACPI_PDC_C_C2C3_FFH); +} + #else /* !CONFIG_ACPI */ #define acpi_lapic 0 diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 69b74a7b877f..f1e253ceba4b 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -65,12 +65,17 @@ extern void alternatives_smp_module_add(struct module *mod, char *name, void *text, void *text_end); extern void alternatives_smp_module_del(struct module *mod); extern void alternatives_smp_switch(int smp); +extern int alternatives_text_reserved(void *start, void *end); #else static inline void alternatives_smp_module_add(struct module *mod, char *name, void *locks, void *locks_end, void *text, void *text_end) {} static inline void alternatives_smp_module_del(struct module *mod) {} static inline void alternatives_smp_switch(int smp) {} +static inline int alternatives_text_reserved(void *start, void *end) +{ + return 0; +} #endif /* CONFIG_SMP */ /* alternative assembly primitive: */ @@ -125,11 +130,16 @@ static inline void alternatives_smp_switch(int smp) {} asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \ : output : "i" (0), ## input) +/* Like alternative_io, but for replacing a direct call with another one. */ +#define alternative_call(oldfunc, newfunc, feature, output, input...) \ + asm volatile (ALTERNATIVE("call %P[old]", "call %P[new]", feature) \ + : output : [old] "i" (oldfunc), [new] "i" (newfunc), ## input) + /* * use this macro(s) if you need more than one output parameter * in alternative_io */ -#define ASM_OUTPUT2(a, b) a, b +#define ASM_OUTPUT2(a...) a struct paravirt_patch_site; #ifdef CONFIG_PARAVIRT diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h index 84786fb9a23b..d2544f1d705d 100644 --- a/arch/x86/include/asm/amd_iommu_proto.h +++ b/arch/x86/include/asm/amd_iommu_proto.h @@ -28,7 +28,10 @@ extern void amd_iommu_flush_all_domains(void); extern void amd_iommu_flush_all_devices(void); extern void amd_iommu_apply_erratum_63(u16 devid); extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu); - +extern int amd_iommu_init_devices(void); +extern void amd_iommu_uninit_devices(void); +extern void amd_iommu_init_notifier(void); +extern void amd_iommu_init_api(void); #ifndef CONFIG_AMD_IOMMU_STATS static inline void amd_iommu_stats_init(void) { } diff --git a/arch/x86/include/asm/asm-offsets.h b/arch/x86/include/asm/asm-offsets.h new file mode 100644 index 000000000000..d370ee36a182 --- /dev/null +++ b/arch/x86/include/asm/asm-offsets.h @@ -0,0 +1 @@ +#include <generated/asm-offsets.h> diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 4e1b8873c474..8f8217b9bdac 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -1,5 +1,300 @@ +#ifndef _ASM_X86_ATOMIC_H +#define _ASM_X86_ATOMIC_H + +#include <linux/compiler.h> +#include <linux/types.h> +#include <asm/processor.h> +#include <asm/alternative.h> +#include <asm/cmpxchg.h> + +/* + * Atomic operations that C can't guarantee us. Useful for + * resource counting etc.. + */ + +#define ATOMIC_INIT(i) { (i) } + +/** + * atomic_read - read atomic variable + * @v: pointer of type atomic_t + * + * Atomically reads the value of @v. + */ +static inline int atomic_read(const atomic_t *v) +{ + return v->counter; +} + +/** + * atomic_set - set atomic variable + * @v: pointer of type atomic_t + * @i: required value + * + * Atomically sets the value of @v to @i. + */ +static inline void atomic_set(atomic_t *v, int i) +{ + v->counter = i; +} + +/** + * atomic_add - add integer to atomic variable + * @i: integer value to add + * @v: pointer of type atomic_t + * + * Atomically adds @i to @v. + */ +static inline void atomic_add(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "addl %1,%0" + : "+m" (v->counter) + : "ir" (i)); +} + +/** + * atomic_sub - subtract integer from atomic variable + * @i: integer value to subtract + * @v: pointer of type atomic_t + * + * Atomically subtracts @i from @v. + */ +static inline void atomic_sub(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "subl %1,%0" + : "+m" (v->counter) + : "ir" (i)); +} + +/** + * atomic_sub_and_test - subtract value from variable and test result + * @i: integer value to subtract + * @v: pointer of type atomic_t + * + * Atomically subtracts @i from @v and returns + * true if the result is zero, or false for all + * other cases. + */ +static inline int atomic_sub_and_test(int i, atomic_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "subl %2,%0; sete %1" + : "+m" (v->counter), "=qm" (c) + : "ir" (i) : "memory"); + return c; +} + +/** + * atomic_inc - increment atomic variable + * @v: pointer of type atomic_t + * + * Atomically increments @v by 1. + */ +static inline void atomic_inc(atomic_t *v) +{ + asm volatile(LOCK_PREFIX "incl %0" + : "+m" (v->counter)); +} + +/** + * atomic_dec - decrement atomic variable + * @v: pointer of type atomic_t + * + * Atomically decrements @v by 1. + */ +static inline void atomic_dec(atomic_t *v) +{ + asm volatile(LOCK_PREFIX "decl %0" + : "+m" (v->counter)); +} + +/** + * atomic_dec_and_test - decrement and test + * @v: pointer of type atomic_t + * + * Atomically decrements @v by 1 and + * returns true if the result is 0, or false for all other + * cases. + */ +static inline int atomic_dec_and_test(atomic_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "decl %0; sete %1" + : "+m" (v->counter), "=qm" (c) + : : "memory"); + return c != 0; +} + +/** + * atomic_inc_and_test - increment and test + * @v: pointer of type atomic_t + * + * Atomically increments @v by 1 + * and returns true if the result is zero, or false for all + * other cases. + */ +static inline int atomic_inc_and_test(atomic_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "incl %0; sete %1" + : "+m" (v->counter), "=qm" (c) + : : "memory"); + return c != 0; +} + +/** + * atomic_add_negative - add and test if negative + * @i: integer value to add + * @v: pointer of type atomic_t + * + * Atomically adds @i to @v and returns true + * if the result is negative, or false when + * result is greater than or equal to zero. + */ +static inline int atomic_add_negative(int i, atomic_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "addl %2,%0; sets %1" + : "+m" (v->counter), "=qm" (c) + : "ir" (i) : "memory"); + return c; +} + +/** + * atomic_add_return - add integer and return + * @i: integer value to add + * @v: pointer of type atomic_t + * + * Atomically adds @i to @v and returns @i + @v + */ +static inline int atomic_add_return(int i, atomic_t *v) +{ + int __i; +#ifdef CONFIG_M386 + unsigned long flags; + if (unlikely(boot_cpu_data.x86 <= 3)) + goto no_xadd; +#endif + /* Modern 486+ processor */ + __i = i; + asm volatile(LOCK_PREFIX "xaddl %0, %1" + : "+r" (i), "+m" (v->counter) + : : "memory"); + return i + __i; + +#ifdef CONFIG_M386 +no_xadd: /* Legacy 386 processor */ + raw_local_irq_save(flags); + __i = atomic_read(v); + atomic_set(v, i + __i); + raw_local_irq_restore(flags); + return i + __i; +#endif +} + +/** + * atomic_sub_return - subtract integer and return + * @v: pointer of type atomic_t + * @i: integer value to subtract + * + * Atomically subtracts @i from @v and returns @v - @i + */ +static inline int atomic_sub_return(int i, atomic_t *v) +{ + return atomic_add_return(-i, v); +} + +#define atomic_inc_return(v) (atomic_add_return(1, v)) +#define atomic_dec_return(v) (atomic_sub_return(1, v)) + +static inline int atomic_cmpxchg(atomic_t *v, int old, int new) +{ + return cmpxchg(&v->counter, old, new); +} + +static inline int atomic_xchg(atomic_t *v, int new) +{ + return xchg(&v->counter, new); +} + +/** + * atomic_add_unless - add unless the number is already a given value + * @v: pointer of type atomic_t + * @a: the amount to add to v... + * @u: ...unless v is equal to u. + * + * Atomically adds @a to @v, so long as @v was not already @u. + * Returns non-zero if @v was not @u, and zero otherwise. + */ +static inline int atomic_add_unless(atomic_t *v, int a, int u) +{ + int c, old; + c = atomic_read(v); + for (;;) { + if (unlikely(c == (u))) + break; + old = atomic_cmpxchg((v), c, c + (a)); + if (likely(old == c)) + break; + c = old; + } + return c != (u); +} + +#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) + +/** + * atomic_inc_short - increment of a short integer + * @v: pointer to type int + * + * Atomically adds 1 to @v + * Returns the new value of @u + */ +static inline short int atomic_inc_short(short int *v) +{ + asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v)); + return *v; +} + +#ifdef CONFIG_X86_64 +/** + * atomic_or_long - OR of two long integers + * @v1: pointer to type unsigned long + * @v2: pointer to type unsigned long + * + * Atomically ORs @v1 and @v2 + * Returns the result of the OR + */ +static inline void atomic_or_long(unsigned long *v1, unsigned long v2) +{ + asm(LOCK_PREFIX "orq %1, %0" : "+m" (*v1) : "r" (v2)); +} +#endif + +/* These are x86-specific, used by some header files */ +#define atomic_clear_mask(mask, addr) \ + asm volatile(LOCK_PREFIX "andl %0,%1" \ + : : "r" (~(mask)), "m" (*(addr)) : "memory") + +#define atomic_set_mask(mask, addr) \ + asm volatile(LOCK_PREFIX "orl %0,%1" \ + : : "r" ((unsigned)(mask)), "m" (*(addr)) \ + : "memory") + +/* Atomic operations are already serializing on x86 */ +#define smp_mb__before_atomic_dec() barrier() +#define smp_mb__after_atomic_dec() barrier() +#define smp_mb__before_atomic_inc() barrier() +#define smp_mb__after_atomic_inc() barrier() + #ifdef CONFIG_X86_32 -# include "atomic_32.h" +# include "atomic64_32.h" #else -# include "atomic_64.h" +# include "atomic64_64.h" #endif + +#include <asm-generic/atomic-long.h> +#endif /* _ASM_X86_ATOMIC_H */ diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h new file mode 100644 index 000000000000..03027bf28de5 --- /dev/null +++ b/arch/x86/include/asm/atomic64_32.h @@ -0,0 +1,160 @@ +#ifndef _ASM_X86_ATOMIC64_32_H +#define _ASM_X86_ATOMIC64_32_H + +#include <linux/compiler.h> +#include <linux/types.h> +#include <asm/processor.h> +//#include <asm/cmpxchg.h> + +/* An 64bit atomic type */ + +typedef struct { + u64 __aligned(8) counter; +} atomic64_t; + +#define ATOMIC64_INIT(val) { (val) } + +extern u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val); + +/** + * atomic64_xchg - xchg atomic64 variable + * @ptr: pointer to type atomic64_t + * @new_val: value to assign + * + * Atomically xchgs the value of @ptr to @new_val and returns + * the old value. + */ +extern u64 atomic64_xchg(atomic64_t *ptr, u64 new_val); + +/** + * atomic64_set - set atomic64 variable + * @ptr: pointer to type atomic64_t + * @new_val: value to assign + * + * Atomically sets the value of @ptr to @new_val. + */ +extern void atomic64_set(atomic64_t *ptr, u64 new_val); + +/** + * atomic64_read - read atomic64 variable + * @ptr: pointer to type atomic64_t + * + * Atomically reads the value of @ptr and returns it. + */ +static inline u64 atomic64_read(atomic64_t *ptr) +{ + u64 res; + + /* + * Note, we inline this atomic64_t primitive because + * it only clobbers EAX/EDX and leaves the others + * untouched. We also (somewhat subtly) rely on the + * fact that cmpxchg8b returns the current 64-bit value + * of the memory location we are touching: + */ + asm volatile( + "mov %%ebx, %%eax\n\t" + "mov %%ecx, %%edx\n\t" + LOCK_PREFIX "cmpxchg8b %1\n" + : "=&A" (res) + : "m" (*ptr) + ); + + return res; +} + +extern u64 atomic64_read(atomic64_t *ptr); + +/** + * atomic64_add_return - add and return + * @delta: integer value to add + * @ptr: pointer to type atomic64_t + * + * Atomically adds @delta to @ptr and returns @delta + *@ptr + */ +extern u64 atomic64_add_return(u64 delta, atomic64_t *ptr); + +/* + * Other variants with different arithmetic operators: + */ +extern u64 atomic64_sub_return(u64 delta, atomic64_t *ptr); +extern u64 atomic64_inc_return(atomic64_t *ptr); +extern u64 atomic64_dec_return(atomic64_t *ptr); + +/** + * atomic64_add - add integer to atomic64 variable + * @delta: integer value to add + * @ptr: pointer to type atomic64_t + * + * Atomically adds @delta to @ptr. + */ +extern void atomic64_add(u64 delta, atomic64_t *ptr); + +/** + * atomic64_sub - subtract the atomic64 variable + * @delta: integer value to subtract + * @ptr: pointer to type atomic64_t + * + * Atomically subtracts @delta from @ptr. + */ +extern void atomic64_sub(u64 delta, atomic64_t *ptr); + +/** + * atomic64_sub_and_test - subtract value from variable and test result + * @delta: integer value to subtract + * @ptr: pointer to type atomic64_t + * + * Atomically subtracts @delta from @ptr and returns + * true if the result is zero, or false for all + * other cases. + */ +extern int atomic64_sub_and_test(u64 delta, atomic64_t *ptr); + +/** + * atomic64_inc - increment atomic64 variable + * @ptr: pointer to type atomic64_t + * + * Atomically increments @ptr by 1. + */ +extern void atomic64_inc(atomic64_t *ptr); + +/** + * atomic64_dec - decrement atomic64 variable + * @ptr: pointer to type atomic64_t + * + * Atomically decrements @ptr by 1. + */ +extern void atomic64_dec(atomic64_t *ptr); + +/** + * atomic64_dec_and_test - decrement and test + * @ptr: pointer to type atomic64_t + * + * Atomically decrements @ptr by 1 and + * returns true if the result is 0, or false for all other + * cases. + */ +extern int atomic64_dec_and_test(atomic64_t *ptr); + +/** + * atomic64_inc_and_test - increment and test + * @ptr: pointer to type atomic64_t + * + * Atomically increments @ptr by 1 + * and returns true if the result is zero, or false for all + * other cases. + */ +extern int atomic64_inc_and_test(atomic64_t *ptr); + +/** + * atomic64_add_negative - add and test if negative + * @delta: integer value to add + * @ptr: pointer to type atomic64_t + * + * Atomically adds @delta to @ptr and returns true + * if the result is negative, or false when + * result is greater than or equal to zero. + */ +extern int atomic64_add_negative(u64 delta, atomic64_t *ptr); + +#endif /* _ASM_X86_ATOMIC64_32_H */ diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h new file mode 100644 index 000000000000..51c5b4056929 --- /dev/null +++ b/arch/x86/include/asm/atomic64_64.h @@ -0,0 +1,224 @@ +#ifndef _ASM_X86_ATOMIC64_64_H +#define _ASM_X86_ATOMIC64_64_H + +#include <linux/types.h> +#include <asm/alternative.h> +#include <asm/cmpxchg.h> + +/* The 64-bit atomic type */ + +#define ATOMIC64_INIT(i) { (i) } + +/** + * atomic64_read - read atomic64 variable + * @v: pointer of type atomic64_t + * + * Atomically reads the value of @v. + * Doesn't imply a read memory barrier. + */ +static inline long atomic64_read(const atomic64_t *v) +{ + return v->counter; +} + +/** + * atomic64_set - set atomic64 variable + * @v: pointer to type atomic64_t + * @i: required value + * + * Atomically sets the value of @v to @i. + */ +static inline void atomic64_set(atomic64_t *v, long i) +{ + v->counter = i; +} + +/** + * atomic64_add - add integer to atomic64 variable + * @i: integer value to add + * @v: pointer to type atomic64_t + * + * Atomically adds @i to @v. + */ +static inline void atomic64_add(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "addq %1,%0" + : "=m" (v->counter) + : "er" (i), "m" (v->counter)); +} + +/** + * atomic64_sub - subtract the atomic64 variable + * @i: integer value to subtract + * @v: pointer to type atomic64_t + * + * Atomically subtracts @i from @v. + */ +static inline void atomic64_sub(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "subq %1,%0" + : "=m" (v->counter) + : "er" (i), "m" (v->counter)); +} + +/** + * atomic64_sub_and_test - subtract value from variable and test result + * @i: integer value to subtract + * @v: pointer to type atomic64_t + * + * Atomically subtracts @i from @v and returns + * true if the result is zero, or false for all + * other cases. + */ +static inline int atomic64_sub_and_test(long i, atomic64_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "subq %2,%0; sete %1" + : "=m" (v->counter), "=qm" (c) + : "er" (i), "m" (v->counter) : "memory"); + return c; +} + +/** + * atomic64_inc - increment atomic64 variable + * @v: pointer to type atomic64_t + * + * Atomically increments @v by 1. + */ +static inline void atomic64_inc(atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "incq %0" + : "=m" (v->counter) + : "m" (v->counter)); +} + +/** + * atomic64_dec - decrement atomic64 variable + * @v: pointer to type atomic64_t + * + * Atomically decrements @v by 1. + */ +static inline void atomic64_dec(atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "decq %0" + : "=m" (v->counter) + : "m" (v->counter)); +} + +/** + * atomic64_dec_and_test - decrement and test + * @v: pointer to type atomic64_t + * + * Atomically decrements @v by 1 and + * returns true if the result is 0, or false for all other + * cases. + */ +static inline int atomic64_dec_and_test(atomic64_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "decq %0; sete %1" + : "=m" (v->counter), "=qm" (c) + : "m" (v->counter) : "memory"); + return c != 0; +} + +/** + * atomic64_inc_and_test - increment and test + * @v: pointer to type atomic64_t + * + * Atomically increments @v by 1 + * and returns true if the result is zero, or false for all + * other cases. + */ +static inline int atomic64_inc_and_test(atomic64_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "incq %0; sete %1" + : "=m" (v->counter), "=qm" (c) + : "m" (v->counter) : "memory"); + return c != 0; +} + +/** + * atomic64_add_negative - add and test if negative + * @i: integer value to add + * @v: pointer to type atomic64_t + * + * Atomically adds @i to @v and returns true + * if the result is negative, or false when + * result is greater than or equal to zero. + */ +static inline int atomic64_add_negative(long i, atomic64_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "addq %2,%0; sets %1" + : "=m" (v->counter), "=qm" (c) + : "er" (i), "m" (v->counter) : "memory"); + return c; +} + +/** + * atomic64_add_return - add and return + * @i: integer value to add + * @v: pointer to type atomic64_t + * + * Atomically adds @i to @v and returns @i + @v + */ +static inline long atomic64_add_return(long i, atomic64_t *v) +{ + long __i = i; + asm volatile(LOCK_PREFIX "xaddq %0, %1;" + : "+r" (i), "+m" (v->counter) + : : "memory"); + return i + __i; +} + +static inline long atomic64_sub_return(long i, atomic64_t *v) +{ + return atomic64_add_return(-i, v); +} + +#define atomic64_inc_return(v) (atomic64_add_return(1, (v))) +#define atomic64_dec_return(v) (atomic64_sub_return(1, (v))) + +static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new) +{ + return cmpxchg(&v->counter, old, new); +} + +static inline long atomic64_xchg(atomic64_t *v, long new) +{ + return xchg(&v->counter, new); +} + +/** + * atomic64_add_unless - add unless the number is a given value + * @v: pointer of type atomic64_t + * @a: the amount to add to v... + * @u: ...unless v is equal to u. + * + * Atomically adds @a to @v, so long as it was not @u. + * Returns non-zero if @v was not @u, and zero otherwise. + */ +static inline int atomic64_add_unless(atomic64_t *v, long a, long u) +{ + long c, old; + c = atomic64_read(v); + for (;;) { + if (unlikely(c == (u))) + break; + old = atomic64_cmpxchg((v), c, c + (a)); + if (likely(old == c)) + break; + c = old; + } + return c != (u); +} + +#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0) + +#endif /* _ASM_X86_ATOMIC64_64_H */ diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h deleted file mode 100644 index dc5a667ff791..000000000000 --- a/arch/x86/include/asm/atomic_32.h +++ /dev/null @@ -1,415 +0,0 @@ -#ifndef _ASM_X86_ATOMIC_32_H -#define _ASM_X86_ATOMIC_32_H - -#include <linux/compiler.h> -#include <linux/types.h> -#include <asm/processor.h> -#include <asm/cmpxchg.h> - -/* - * Atomic operations that C can't guarantee us. Useful for - * resource counting etc.. - */ - -#define ATOMIC_INIT(i) { (i) } - -/** - * atomic_read - read atomic variable - * @v: pointer of type atomic_t - * - * Atomically reads the value of @v. - */ -static inline int atomic_read(const atomic_t *v) -{ - return v->counter; -} - -/** - * atomic_set - set atomic variable - * @v: pointer of type atomic_t - * @i: required value - * - * Atomically sets the value of @v to @i. - */ -static inline void atomic_set(atomic_t *v, int i) -{ - v->counter = i; -} - -/** - * atomic_add - add integer to atomic variable - * @i: integer value to add - * @v: pointer of type atomic_t - * - * Atomically adds @i to @v. - */ -static inline void atomic_add(int i, atomic_t *v) -{ - asm volatile(LOCK_PREFIX "addl %1,%0" - : "+m" (v->counter) - : "ir" (i)); -} - -/** - * atomic_sub - subtract integer from atomic variable - * @i: integer value to subtract - * @v: pointer of type atomic_t - * - * Atomically subtracts @i from @v. - */ -static inline void atomic_sub(int i, atomic_t *v) -{ - asm volatile(LOCK_PREFIX "subl %1,%0" - : "+m" (v->counter) - : "ir" (i)); -} - -/** - * atomic_sub_and_test - subtract value from variable and test result - * @i: integer value to subtract - * @v: pointer of type atomic_t - * - * Atomically subtracts @i from @v and returns - * true if the result is zero, or false for all - * other cases. - */ -static inline int atomic_sub_and_test(int i, atomic_t *v) -{ - unsigned char c; - - asm volatile(LOCK_PREFIX "subl %2,%0; sete %1" - : "+m" (v->counter), "=qm" (c) - : "ir" (i) : "memory"); - return c; -} - -/** - * atomic_inc - increment atomic variable - * @v: pointer of type atomic_t - * - * Atomically increments @v by 1. - */ -static inline void atomic_inc(atomic_t *v) -{ - asm volatile(LOCK_PREFIX "incl %0" - : "+m" (v->counter)); -} - -/** - * atomic_dec - decrement atomic variable - * @v: pointer of type atomic_t - * - * Atomically decrements @v by 1. - */ -static inline void atomic_dec(atomic_t *v) -{ - asm volatile(LOCK_PREFIX "decl %0" - : "+m" (v->counter)); -} - -/** - * atomic_dec_and_test - decrement and test - * @v: pointer of type atomic_t - * - * Atomically decrements @v by 1 and - * returns true if the result is 0, or false for all other - * cases. - */ -static inline int atomic_dec_and_test(atomic_t *v) -{ - unsigned char c; - - asm volatile(LOCK_PREFIX "decl %0; sete %1" - : "+m" (v->counter), "=qm" (c) - : : "memory"); - return c != 0; -} - -/** - * atomic_inc_and_test - increment and test - * @v: pointer of type atomic_t - * - * Atomically increments @v by 1 - * and returns true if the result is zero, or false for all - * other cases. - */ -static inline int atomic_inc_and_test(atomic_t *v) -{ - unsigned char c; - - asm volatile(LOCK_PREFIX "incl %0; sete %1" - : "+m" (v->counter), "=qm" (c) - : : "memory"); - return c != 0; -} - -/** - * atomic_add_negative - add and test if negative - * @v: pointer of type atomic_t - * @i: integer value to add - * - * Atomically adds @i to @v and returns true - * if the result is negative, or false when - * result is greater than or equal to zero. - */ -static inline int atomic_add_negative(int i, atomic_t *v) -{ - unsigned char c; - - asm volatile(LOCK_PREFIX "addl %2,%0; sets %1" - : "+m" (v->counter), "=qm" (c) - : "ir" (i) : "memory"); - return c; -} - -/** - * atomic_add_return - add integer and return - * @v: pointer of type atomic_t - * @i: integer value to add - * - * Atomically adds @i to @v and returns @i + @v - */ -static inline int atomic_add_return(int i, atomic_t *v) -{ - int __i; -#ifdef CONFIG_M386 - unsigned long flags; - if (unlikely(boot_cpu_data.x86 <= 3)) - goto no_xadd; -#endif - /* Modern 486+ processor */ - __i = i; - asm volatile(LOCK_PREFIX "xaddl %0, %1" - : "+r" (i), "+m" (v->counter) - : : "memory"); - return i + __i; - -#ifdef CONFIG_M386 -no_xadd: /* Legacy 386 processor */ - local_irq_save(flags); - __i = atomic_read(v); - atomic_set(v, i + __i); - local_irq_restore(flags); - return i + __i; -#endif -} - -/** - * atomic_sub_return - subtract integer and return - * @v: pointer of type atomic_t - * @i: integer value to subtract - * - * Atomically subtracts @i from @v and returns @v - @i - */ -static inline int atomic_sub_return(int i, atomic_t *v) -{ - return atomic_add_return(-i, v); -} - -static inline int atomic_cmpxchg(atomic_t *v, int old, int new) -{ - return cmpxchg(&v->counter, old, new); -} - -static inline int atomic_xchg(atomic_t *v, int new) -{ - return xchg(&v->counter, new); -} - -/** - * atomic_add_unless - add unless the number is already a given value - * @v: pointer of type atomic_t - * @a: the amount to add to v... - * @u: ...unless v is equal to u. - * - * Atomically adds @a to @v, so long as @v was not already @u. - * Returns non-zero if @v was not @u, and zero otherwise. - */ -static inline int atomic_add_unless(atomic_t *v, int a, int u) -{ - int c, old; - c = atomic_read(v); - for (;;) { - if (unlikely(c == (u))) - break; - old = atomic_cmpxchg((v), c, c + (a)); - if (likely(old == c)) - break; - c = old; - } - return c != (u); -} - -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) - -#define atomic_inc_return(v) (atomic_add_return(1, v)) -#define atomic_dec_return(v) (atomic_sub_return(1, v)) - -/* These are x86-specific, used by some header files */ -#define atomic_clear_mask(mask, addr) \ - asm volatile(LOCK_PREFIX "andl %0,%1" \ - : : "r" (~(mask)), "m" (*(addr)) : "memory") - -#define atomic_set_mask(mask, addr) \ - asm volatile(LOCK_PREFIX "orl %0,%1" \ - : : "r" (mask), "m" (*(addr)) : "memory") - -/* Atomic operations are already serializing on x86 */ -#define smp_mb__before_atomic_dec() barrier() -#define smp_mb__after_atomic_dec() barrier() -#define smp_mb__before_atomic_inc() barrier() -#define smp_mb__after_atomic_inc() barrier() - -/* An 64bit atomic type */ - -typedef struct { - u64 __aligned(8) counter; -} atomic64_t; - -#define ATOMIC64_INIT(val) { (val) } - -extern u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val); - -/** - * atomic64_xchg - xchg atomic64 variable - * @ptr: pointer to type atomic64_t - * @new_val: value to assign - * - * Atomically xchgs the value of @ptr to @new_val and returns - * the old value. - */ -extern u64 atomic64_xchg(atomic64_t *ptr, u64 new_val); - -/** - * atomic64_set - set atomic64 variable - * @ptr: pointer to type atomic64_t - * @new_val: value to assign - * - * Atomically sets the value of @ptr to @new_val. - */ -extern void atomic64_set(atomic64_t *ptr, u64 new_val); - -/** - * atomic64_read - read atomic64 variable - * @ptr: pointer to type atomic64_t - * - * Atomically reads the value of @ptr and returns it. - */ -static inline u64 atomic64_read(atomic64_t *ptr) -{ - u64 res; - - /* - * Note, we inline this atomic64_t primitive because - * it only clobbers EAX/EDX and leaves the others - * untouched. We also (somewhat subtly) rely on the - * fact that cmpxchg8b returns the current 64-bit value - * of the memory location we are touching: - */ - asm volatile( - "mov %%ebx, %%eax\n\t" - "mov %%ecx, %%edx\n\t" - LOCK_PREFIX "cmpxchg8b %1\n" - : "=&A" (res) - : "m" (*ptr) - ); - - return res; -} - -extern u64 atomic64_read(atomic64_t *ptr); - -/** - * atomic64_add_return - add and return - * @delta: integer value to add - * @ptr: pointer to type atomic64_t - * - * Atomically adds @delta to @ptr and returns @delta + *@ptr - */ -extern u64 atomic64_add_return(u64 delta, atomic64_t *ptr); - -/* - * Other variants with different arithmetic operators: - */ -extern u64 atomic64_sub_return(u64 delta, atomic64_t *ptr); -extern u64 atomic64_inc_return(atomic64_t *ptr); -extern u64 atomic64_dec_return(atomic64_t *ptr); - -/** - * atomic64_add - add integer to atomic64 variable - * @delta: integer value to add - * @ptr: pointer to type atomic64_t - * - * Atomically adds @delta to @ptr. - */ -extern void atomic64_add(u64 delta, atomic64_t *ptr); - -/** - * atomic64_sub - subtract the atomic64 variable - * @delta: integer value to subtract - * @ptr: pointer to type atomic64_t - * - * Atomically subtracts @delta from @ptr. - */ -extern void atomic64_sub(u64 delta, atomic64_t *ptr); - -/** - * atomic64_sub_and_test - subtract value from variable and test result - * @delta: integer value to subtract - * @ptr: pointer to type atomic64_t - * - * Atomically subtracts @delta from @ptr and returns - * true if the result is zero, or false for all - * other cases. - */ -extern int atomic64_sub_and_test(u64 delta, atomic64_t *ptr); - -/** - * atomic64_inc - increment atomic64 variable - * @ptr: pointer to type atomic64_t - * - * Atomically increments @ptr by 1. - */ -extern void atomic64_inc(atomic64_t *ptr); - -/** - * atomic64_dec - decrement atomic64 variable - * @ptr: pointer to type atomic64_t - * - * Atomically decrements @ptr by 1. - */ -extern void atomic64_dec(atomic64_t *ptr); - -/** - * atomic64_dec_and_test - decrement and test - * @ptr: pointer to type atomic64_t - * - * Atomically decrements @ptr by 1 and - * returns true if the result is 0, or false for all other - * cases. - */ -extern int atomic64_dec_and_test(atomic64_t *ptr); - -/** - * atomic64_inc_and_test - increment and test - * @ptr: pointer to type atomic64_t - * - * Atomically increments @ptr by 1 - * and returns true if the result is zero, or false for all - * other cases. - */ -extern int atomic64_inc_and_test(atomic64_t *ptr); - -/** - * atomic64_add_negative - add and test if negative - * @delta: integer value to add - * @ptr: pointer to type atomic64_t - * - * Atomically adds @delta to @ptr and returns true - * if the result is negative, or false when - * result is greater than or equal to zero. - */ -extern int atomic64_add_negative(u64 delta, atomic64_t *ptr); - -#include <asm-generic/atomic-long.h> -#endif /* _ASM_X86_ATOMIC_32_H */ diff --git a/arch/x86/include/asm/atomic_64.h b/arch/x86/include/asm/atomic_64.h deleted file mode 100644 index d605dc268e79..000000000000 --- a/arch/x86/include/asm/atomic_64.h +++ /dev/null @@ -1,485 +0,0 @@ -#ifndef _ASM_X86_ATOMIC_64_H -#define _ASM_X86_ATOMIC_64_H - -#include <linux/types.h> -#include <asm/alternative.h> -#include <asm/cmpxchg.h> - -/* - * Atomic operations that C can't guarantee us. Useful for - * resource counting etc.. - */ - -#define ATOMIC_INIT(i) { (i) } - -/** - * atomic_read - read atomic variable - * @v: pointer of type atomic_t - * - * Atomically reads the value of @v. - */ -static inline int atomic_read(const atomic_t *v) -{ - return v->counter; -} - -/** - * atomic_set - set atomic variable - * @v: pointer of type atomic_t - * @i: required value - * - * Atomically sets the value of @v to @i. - */ -static inline void atomic_set(atomic_t *v, int i) -{ - v->counter = i; -} - -/** - * atomic_add - add integer to atomic variable - * @i: integer value to add - * @v: pointer of type atomic_t - * - * Atomically adds @i to @v. - */ -static inline void atomic_add(int i, atomic_t *v) -{ - asm volatile(LOCK_PREFIX "addl %1,%0" - : "=m" (v->counter) - : "ir" (i), "m" (v->counter)); -} - -/** - * atomic_sub - subtract the atomic variable - * @i: integer value to subtract - * @v: pointer of type atomic_t - * - * Atomically subtracts @i from @v. - */ -static inline void atomic_sub(int i, atomic_t *v) -{ - asm volatile(LOCK_PREFIX "subl %1,%0" - : "=m" (v->counter) - : "ir" (i), "m" (v->counter)); -} - -/** - * atomic_sub_and_test - subtract value from variable and test result - * @i: integer value to subtract - * @v: pointer of type atomic_t - * - * Atomically subtracts @i from @v and returns - * true if the result is zero, or false for all - * other cases. - */ -static inline int atomic_sub_and_test(int i, atomic_t *v) -{ - unsigned char c; - - asm volatile(LOCK_PREFIX "subl %2,%0; sete %1" - : "=m" (v->counter), "=qm" (c) - : "ir" (i), "m" (v->counter) : "memory"); - return c; -} - -/** - * atomic_inc - increment atomic variable - * @v: pointer of type atomic_t - * - * Atomically increments @v by 1. - */ -static inline void atomic_inc(atomic_t *v) -{ - asm volatile(LOCK_PREFIX "incl %0" - : "=m" (v->counter) - : "m" (v->counter)); -} - -/** - * atomic_dec - decrement atomic variable - * @v: pointer of type atomic_t - * - * Atomically decrements @v by 1. - */ -static inline void atomic_dec(atomic_t *v) -{ - asm volatile(LOCK_PREFIX "decl %0" - : "=m" (v->counter) - : "m" (v->counter)); -} - -/** - * atomic_dec_and_test - decrement and test - * @v: pointer of type atomic_t - * - * Atomically decrements @v by 1 and - * returns true if the result is 0, or false for all other - * cases. - */ -static inline int atomic_dec_and_test(atomic_t *v) -{ - unsigned char c; - - asm volatile(LOCK_PREFIX "decl %0; sete %1" - : "=m" (v->counter), "=qm" (c) - : "m" (v->counter) : "memory"); - return c != 0; -} - -/** - * atomic_inc_and_test - increment and test - * @v: pointer of type atomic_t - * - * Atomically increments @v by 1 - * and returns true if the result is zero, or false for all - * other cases. - */ -static inline int atomic_inc_and_test(atomic_t *v) -{ - unsigned char c; - - asm volatile(LOCK_PREFIX "incl %0; sete %1" - : "=m" (v->counter), "=qm" (c) - : "m" (v->counter) : "memory"); - return c != 0; -} - -/** - * atomic_add_negative - add and test if negative - * @i: integer value to add - * @v: pointer of type atomic_t - * - * Atomically adds @i to @v and returns true - * if the result is negative, or false when - * result is greater than or equal to zero. - */ -static inline int atomic_add_negative(int i, atomic_t *v) -{ - unsigned char c; - - asm volatile(LOCK_PREFIX "addl %2,%0; sets %1" - : "=m" (v->counter), "=qm" (c) - : "ir" (i), "m" (v->counter) : "memory"); - return c; -} - -/** - * atomic_add_return - add and return - * @i: integer value to add - * @v: pointer of type atomic_t - * - * Atomically adds @i to @v and returns @i + @v - */ -static inline int atomic_add_return(int i, atomic_t *v) -{ - int __i = i; - asm volatile(LOCK_PREFIX "xaddl %0, %1" - : "+r" (i), "+m" (v->counter) - : : "memory"); - return i + __i; -} - -static inline int atomic_sub_return(int i, atomic_t *v) -{ - return atomic_add_return(-i, v); -} - -#define atomic_inc_return(v) (atomic_add_return(1, v)) -#define atomic_dec_return(v) (atomic_sub_return(1, v)) - -/* The 64-bit atomic type */ - -#define ATOMIC64_INIT(i) { (i) } - -/** - * atomic64_read - read atomic64 variable - * @v: pointer of type atomic64_t - * - * Atomically reads the value of @v. - * Doesn't imply a read memory barrier. - */ -static inline long atomic64_read(const atomic64_t *v) -{ - return v->counter; -} - -/** - * atomic64_set - set atomic64 variable - * @v: pointer to type atomic64_t - * @i: required value - * - * Atomically sets the value of @v to @i. - */ -static inline void atomic64_set(atomic64_t *v, long i) -{ - v->counter = i; -} - -/** - * atomic64_add - add integer to atomic64 variable - * @i: integer value to add - * @v: pointer to type atomic64_t - * - * Atomically adds @i to @v. - */ -static inline void atomic64_add(long i, atomic64_t *v) -{ - asm volatile(LOCK_PREFIX "addq %1,%0" - : "=m" (v->counter) - : "er" (i), "m" (v->counter)); -} - -/** - * atomic64_sub - subtract the atomic64 variable - * @i: integer value to subtract - * @v: pointer to type atomic64_t - * - * Atomically subtracts @i from @v. - */ -static inline void atomic64_sub(long i, atomic64_t *v) -{ - asm volatile(LOCK_PREFIX "subq %1,%0" - : "=m" (v->counter) - : "er" (i), "m" (v->counter)); -} - -/** - * atomic64_sub_and_test - subtract value from variable and test result - * @i: integer value to subtract - * @v: pointer to type atomic64_t - * - * Atomically subtracts @i from @v and returns - * true if the result is zero, or false for all - * other cases. - */ -static inline int atomic64_sub_and_test(long i, atomic64_t *v) -{ - unsigned char c; - - asm volatile(LOCK_PREFIX "subq %2,%0; sete %1" - : "=m" (v->counter), "=qm" (c) - : "er" (i), "m" (v->counter) : "memory"); - return c; -} - -/** - * atomic64_inc - increment atomic64 variable - * @v: pointer to type atomic64_t - * - * Atomically increments @v by 1. - */ -static inline void atomic64_inc(atomic64_t *v) -{ - asm volatile(LOCK_PREFIX "incq %0" - : "=m" (v->counter) - : "m" (v->counter)); -} - -/** - * atomic64_dec - decrement atomic64 variable - * @v: pointer to type atomic64_t - * - * Atomically decrements @v by 1. - */ -static inline void atomic64_dec(atomic64_t *v) -{ - asm volatile(LOCK_PREFIX "decq %0" - : "=m" (v->counter) - : "m" (v->counter)); -} - -/** - * atomic64_dec_and_test - decrement and test - * @v: pointer to type atomic64_t - * - * Atomically decrements @v by 1 and - * returns true if the result is 0, or false for all other - * cases. - */ -static inline int atomic64_dec_and_test(atomic64_t *v) -{ - unsigned char c; - - asm volatile(LOCK_PREFIX "decq %0; sete %1" - : "=m" (v->counter), "=qm" (c) - : "m" (v->counter) : "memory"); - return c != 0; -} - -/** - * atomic64_inc_and_test - increment and test - * @v: pointer to type atomic64_t - * - * Atomically increments @v by 1 - * and returns true if the result is zero, or false for all - * other cases. - */ -static inline int atomic64_inc_and_test(atomic64_t *v) -{ - unsigned char c; - - asm volatile(LOCK_PREFIX "incq %0; sete %1" - : "=m" (v->counter), "=qm" (c) - : "m" (v->counter) : "memory"); - return c != 0; -} - -/** - * atomic64_add_negative - add and test if negative - * @i: integer value to add - * @v: pointer to type atomic64_t - * - * Atomically adds @i to @v and returns true - * if the result is negative, or false when - * result is greater than or equal to zero. - */ -static inline int atomic64_add_negative(long i, atomic64_t *v) -{ - unsigned char c; - - asm volatile(LOCK_PREFIX "addq %2,%0; sets %1" - : "=m" (v->counter), "=qm" (c) - : "er" (i), "m" (v->counter) : "memory"); - return c; -} - -/** - * atomic64_add_return - add and return - * @i: integer value to add - * @v: pointer to type atomic64_t - * - * Atomically adds @i to @v and returns @i + @v - */ -static inline long atomic64_add_return(long i, atomic64_t *v) -{ - long __i = i; - asm volatile(LOCK_PREFIX "xaddq %0, %1;" - : "+r" (i), "+m" (v->counter) - : : "memory"); - return i + __i; -} - -static inline long atomic64_sub_return(long i, atomic64_t *v) -{ - return atomic64_add_return(-i, v); -} - -#define atomic64_inc_return(v) (atomic64_add_return(1, (v))) -#define atomic64_dec_return(v) (atomic64_sub_return(1, (v))) - -static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new) -{ - return cmpxchg(&v->counter, old, new); -} - -static inline long atomic64_xchg(atomic64_t *v, long new) -{ - return xchg(&v->counter, new); -} - -static inline long atomic_cmpxchg(atomic_t *v, int old, int new) -{ - return cmpxchg(&v->counter, old, new); -} - -static inline long atomic_xchg(atomic_t *v, int new) -{ - return xchg(&v->counter, new); -} - -/** - * atomic_add_unless - add unless the number is a given value - * @v: pointer of type atomic_t - * @a: the amount to add to v... - * @u: ...unless v is equal to u. - * - * Atomically adds @a to @v, so long as it was not @u. - * Returns non-zero if @v was not @u, and zero otherwise. - */ -static inline int atomic_add_unless(atomic_t *v, int a, int u) -{ - int c, old; - c = atomic_read(v); - for (;;) { - if (unlikely(c == (u))) - break; - old = atomic_cmpxchg((v), c, c + (a)); - if (likely(old == c)) - break; - c = old; - } - return c != (u); -} - -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) - -/** - * atomic64_add_unless - add unless the number is a given value - * @v: pointer of type atomic64_t - * @a: the amount to add to v... - * @u: ...unless v is equal to u. - * - * Atomically adds @a to @v, so long as it was not @u. - * Returns non-zero if @v was not @u, and zero otherwise. - */ -static inline int atomic64_add_unless(atomic64_t *v, long a, long u) -{ - long c, old; - c = atomic64_read(v); - for (;;) { - if (unlikely(c == (u))) - break; - old = atomic64_cmpxchg((v), c, c + (a)); - if (likely(old == c)) - break; - c = old; - } - return c != (u); -} - -/** - * atomic_inc_short - increment of a short integer - * @v: pointer to type int - * - * Atomically adds 1 to @v - * Returns the new value of @u - */ -static inline short int atomic_inc_short(short int *v) -{ - asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v)); - return *v; -} - -/** - * atomic_or_long - OR of two long integers - * @v1: pointer to type unsigned long - * @v2: pointer to type unsigned long - * - * Atomically ORs @v1 and @v2 - * Returns the result of the OR - */ -static inline void atomic_or_long(unsigned long *v1, unsigned long v2) -{ - asm(LOCK_PREFIX "orq %1, %0" : "+m" (*v1) : "r" (v2)); -} - -#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0) - -/* These are x86-specific, used by some header files */ -#define atomic_clear_mask(mask, addr) \ - asm volatile(LOCK_PREFIX "andl %0,%1" \ - : : "r" (~(mask)), "m" (*(addr)) : "memory") - -#define atomic_set_mask(mask, addr) \ - asm volatile(LOCK_PREFIX "orl %0,%1" \ - : : "r" ((unsigned)(mask)), "m" (*(addr)) \ - : "memory") - -/* Atomic operations are already serializing on x86 */ -#define smp_mb__before_atomic_dec() barrier() -#define smp_mb__after_atomic_dec() barrier() -#define smp_mb__before_atomic_inc() barrier() -#define smp_mb__after_atomic_inc() barrier() - -#include <asm-generic/atomic-long.h> -#endif /* _ASM_X86_ATOMIC_64_H */ diff --git a/arch/x86/include/asm/cpu_debug.h b/arch/x86/include/asm/cpu_debug.h deleted file mode 100644 index d96c1ee3a95c..000000000000 --- a/arch/x86/include/asm/cpu_debug.h +++ /dev/null @@ -1,127 +0,0 @@ -#ifndef _ASM_X86_CPU_DEBUG_H -#define _ASM_X86_CPU_DEBUG_H - -/* - * CPU x86 architecture debug - * - * Copyright(C) 2009 Jaswinder Singh Rajput - */ - -/* Register flags */ -enum cpu_debug_bit { -/* Model Specific Registers (MSRs) */ - CPU_MC_BIT, /* Machine Check */ - CPU_MONITOR_BIT, /* Monitor */ - CPU_TIME_BIT, /* Time */ - CPU_PMC_BIT, /* Performance Monitor */ - CPU_PLATFORM_BIT, /* Platform */ - CPU_APIC_BIT, /* APIC */ - CPU_POWERON_BIT, /* Power-on */ - CPU_CONTROL_BIT, /* Control */ - CPU_FEATURES_BIT, /* Features control */ - CPU_LBRANCH_BIT, /* Last Branch */ - CPU_BIOS_BIT, /* BIOS */ - CPU_FREQ_BIT, /* Frequency */ - CPU_MTTR_BIT, /* MTRR */ - CPU_PERF_BIT, /* Performance */ - CPU_CACHE_BIT, /* Cache */ - CPU_SYSENTER_BIT, /* Sysenter */ - CPU_THERM_BIT, /* Thermal */ - CPU_MISC_BIT, /* Miscellaneous */ - CPU_DEBUG_BIT, /* Debug */ - CPU_PAT_BIT, /* PAT */ - CPU_VMX_BIT, /* VMX */ - CPU_CALL_BIT, /* System Call */ - CPU_BASE_BIT, /* BASE Address */ - CPU_VER_BIT, /* Version ID */ - CPU_CONF_BIT, /* Configuration */ - CPU_SMM_BIT, /* System mgmt mode */ - CPU_SVM_BIT, /*Secure Virtual Machine*/ - CPU_OSVM_BIT, /* OS-Visible Workaround*/ -/* Standard Registers */ - CPU_TSS_BIT, /* Task Stack Segment */ - CPU_CR_BIT, /* Control Registers */ - CPU_DT_BIT, /* Descriptor Table */ -/* End of Registers flags */ - CPU_REG_ALL_BIT, /* Select all Registers */ -}; - -#define CPU_REG_ALL (~0) /* Select all Registers */ - -#define CPU_MC (1 << CPU_MC_BIT) -#define CPU_MONITOR (1 << CPU_MONITOR_BIT) -#define CPU_TIME (1 << CPU_TIME_BIT) -#define CPU_PMC (1 << CPU_PMC_BIT) -#define CPU_PLATFORM (1 << CPU_PLATFORM_BIT) -#define CPU_APIC (1 << CPU_APIC_BIT) -#define CPU_POWERON (1 << CPU_POWERON_BIT) -#define CPU_CONTROL (1 << CPU_CONTROL_BIT) -#define CPU_FEATURES (1 << CPU_FEATURES_BIT) -#define CPU_LBRANCH (1 << CPU_LBRANCH_BIT) -#define CPU_BIOS (1 << CPU_BIOS_BIT) -#define CPU_FREQ (1 << CPU_FREQ_BIT) -#define CPU_MTRR (1 << CPU_MTTR_BIT) -#define CPU_PERF (1 << CPU_PERF_BIT) -#define CPU_CACHE (1 << CPU_CACHE_BIT) -#define CPU_SYSENTER (1 << CPU_SYSENTER_BIT) -#define CPU_THERM (1 << CPU_THERM_BIT) -#define CPU_MISC (1 << CPU_MISC_BIT) -#define CPU_DEBUG (1 << CPU_DEBUG_BIT) -#define CPU_PAT (1 << CPU_PAT_BIT) -#define CPU_VMX (1 << CPU_VMX_BIT) -#define CPU_CALL (1 << CPU_CALL_BIT) -#define CPU_BASE (1 << CPU_BASE_BIT) -#define CPU_VER (1 << CPU_VER_BIT) -#define CPU_CONF (1 << CPU_CONF_BIT) -#define CPU_SMM (1 << CPU_SMM_BIT) -#define CPU_SVM (1 << CPU_SVM_BIT) -#define CPU_OSVM (1 << CPU_OSVM_BIT) -#define CPU_TSS (1 << CPU_TSS_BIT) -#define CPU_CR (1 << CPU_CR_BIT) -#define CPU_DT (1 << CPU_DT_BIT) - -/* Register file flags */ -enum cpu_file_bit { - CPU_INDEX_BIT, /* index */ - CPU_VALUE_BIT, /* value */ -}; - -#define CPU_FILE_VALUE (1 << CPU_VALUE_BIT) - -#define MAX_CPU_FILES 512 - -struct cpu_private { - unsigned cpu; - unsigned type; - unsigned reg; - unsigned file; -}; - -struct cpu_debug_base { - char *name; /* Register name */ - unsigned flag; /* Register flag */ - unsigned write; /* Register write flag */ -}; - -/* - * Currently it looks similar to cpu_debug_base but once we add more files - * cpu_file_base will go in different direction - */ -struct cpu_file_base { - char *name; /* Register file name */ - unsigned flag; /* Register file flag */ - unsigned write; /* Register write flag */ -}; - -struct cpu_cpuX_base { - struct dentry *dentry; /* Register dentry */ - int init; /* Register index file */ -}; - -struct cpu_debug_range { - unsigned min; /* Register range min */ - unsigned max; /* Register range max */ - unsigned flag; /* Supported flags */ -}; - -#endif /* _ASM_X86_CPU_DEBUG_H */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 613700f27a4a..0cd82d068613 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -153,6 +153,7 @@ #define X86_FEATURE_SSE5 (6*32+11) /* SSE-5 */ #define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */ #define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */ +#define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */ /* * Auxiliary flags: Linux defined - For features scattered in various @@ -167,6 +168,10 @@ #define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */ #define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */ #define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */ +#define X86_FEATURE_NPT (8*32+5) /* AMD Nested Page Table support */ +#define X86_FEATURE_LBRV (8*32+6) /* AMD LBR Virtualization support */ +#define X86_FEATURE_SVML (8*32+7) /* "svm_lock" AMD SVM locking MSR */ +#define X86_FEATURE_NRIPS (8*32+8) /* "nrip_save" AMD SVM next_rip save */ #if defined(__KERNEL__) && !defined(__ASSEMBLY__) diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 8240f76b531e..b81002f23614 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -14,6 +14,9 @@ which debugging register was responsible for the trap. The other bits are either reserved or not of interest to us. */ +/* Define reserved bits in DR6 which are always set to 1 */ +#define DR6_RESERVED (0xFFFF0FF0) + #define DR_TRAP0 (0x1) /* db0 */ #define DR_TRAP1 (0x2) /* db1 */ #define DR_TRAP2 (0x4) /* db2 */ diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index 9d6684849fd9..278441f39856 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h @@ -12,9 +12,9 @@ #include <linux/types.h> /* - * FIXME: Acessing the desc_struct through its fields is more elegant, + * FIXME: Accessing the desc_struct through its fields is more elegant, * and should be the one valid thing to do. However, a lot of open code - * still touches the a and b acessors, and doing this allow us to do it + * still touches the a and b accessors, and doing this allow us to do it * incrementally. We keep the signature as a struct, rather than an union, * so we can get rid of it transparently in the future -- glommer */ diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 0f6c02f3b7d4..ac91eed21061 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -67,7 +67,7 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) if (!dev->dma_mask) return 0; - return addr + size <= *dev->dma_mask; + return addr + size - 1 <= *dev->dma_mask; } static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 8a024babe5e6..f2ad2163109d 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -170,10 +170,7 @@ static inline void elf_common_init(struct thread_struct *t, } #define ELF_PLAT_INIT(_r, load_addr) \ -do { \ - elf_common_init(¤t->thread, _r, 0); \ - clear_thread_flag(TIF_IA32); \ -} while (0) + elf_common_init(¤t->thread, _r, 0) #define COMPAT_ELF_PLAT_INIT(regs, load_addr) \ elf_common_init(¤t->thread, regs, __USER_DS) @@ -181,14 +178,8 @@ do { \ void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp); #define compat_start_thread start_thread_ia32 -#define COMPAT_SET_PERSONALITY(ex) \ -do { \ - if (test_thread_flag(TIF_IA32)) \ - clear_thread_flag(TIF_ABI_PENDING); \ - else \ - set_thread_flag(TIF_ABI_PENDING); \ - current->personality |= force_personality32; \ -} while (0) +void set_personality_ia32(void); +#define COMPAT_SET_PERSONALITY(ex) set_personality_ia32() #define COMPAT_ELF_PLATFORM ("i686") @@ -239,7 +230,6 @@ extern int force_personality32; #endif /* !CONFIG_X86_32 */ #define CORE_DUMP_USE_REGSET -#define USE_ELF_CORE_DUMP #define ELF_EXEC_PAGESIZE 4096 /* This is the location that an ET_DYN program is loaded if exec'ed. Typical diff --git a/arch/x86/include/asm/fb.h b/arch/x86/include/asm/fb.h index 53018464aea6..2519d0679d99 100644 --- a/arch/x86/include/asm/fb.h +++ b/arch/x86/include/asm/fb.h @@ -12,10 +12,6 @@ static inline void fb_pgprotect(struct file *file, struct vm_area_struct *vma, pgprot_val(vma->vm_page_prot) |= _PAGE_PCD; } -#ifdef CONFIG_X86_32 extern int fb_is_primary_device(struct fb_info *info); -#else -static inline int fb_is_primary_device(struct fb_info *info) { return 0; } -#endif #endif /* _ASM_X86_FB_H */ diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 14f9890eb495..635f03bb4995 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -118,14 +118,20 @@ enum fixed_addresses { * 256 temporary boot-time mappings, used by early_ioremap(), * before ioremap() is functional. * - * We round it up to the next 256 pages boundary so that we - * can have a single pgd entry and a single pte table: + * If necessary we round it up to the next 256 pages boundary so + * that we can have a single pgd entry and a single pte table: */ #define NR_FIX_BTMAPS 64 #define FIX_BTMAPS_SLOTS 4 - FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 - - (__end_of_permanent_fixed_addresses & 255), - FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1, +#define TOTAL_FIX_BTMAPS (NR_FIX_BTMAPS * FIX_BTMAPS_SLOTS) + FIX_BTMAP_END = + (__end_of_permanent_fixed_addresses ^ + (__end_of_permanent_fixed_addresses + TOTAL_FIX_BTMAPS - 1)) & + -PTRS_PER_PTE + ? __end_of_permanent_fixed_addresses + TOTAL_FIX_BTMAPS - + (__end_of_permanent_fixed_addresses & (TOTAL_FIX_BTMAPS - 1)) + : __end_of_permanent_fixed_addresses, + FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1, #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT FIX_OHCI1394_BASE, #endif diff --git a/arch/x86/include/asm/geode.h b/arch/x86/include/asm/geode.h index ad3c2ed75481..7cd73552a4e8 100644 --- a/arch/x86/include/asm/geode.h +++ b/arch/x86/include/asm/geode.h @@ -12,160 +12,7 @@ #include <asm/processor.h> #include <linux/io.h> - -/* Generic southbridge functions */ - -#define GEODE_DEV_PMS 0 -#define GEODE_DEV_ACPI 1 -#define GEODE_DEV_GPIO 2 -#define GEODE_DEV_MFGPT 3 - -extern int geode_get_dev_base(unsigned int dev); - -/* Useful macros */ -#define geode_pms_base() geode_get_dev_base(GEODE_DEV_PMS) -#define geode_acpi_base() geode_get_dev_base(GEODE_DEV_ACPI) -#define geode_gpio_base() geode_get_dev_base(GEODE_DEV_GPIO) -#define geode_mfgpt_base() geode_get_dev_base(GEODE_DEV_MFGPT) - -/* MSRS */ - -#define MSR_GLIU_P2D_RO0 0x10000029 - -#define MSR_LX_GLD_MSR_CONFIG 0x48002001 -#define MSR_LX_MSR_PADSEL 0x48002011 /* NOT 0x48000011; the data - * sheet has the wrong value */ -#define MSR_GLCP_SYS_RSTPLL 0x4C000014 -#define MSR_GLCP_DOTPLL 0x4C000015 - -#define MSR_LBAR_SMB 0x5140000B -#define MSR_LBAR_GPIO 0x5140000C -#define MSR_LBAR_MFGPT 0x5140000D -#define MSR_LBAR_ACPI 0x5140000E -#define MSR_LBAR_PMS 0x5140000F - -#define MSR_DIVIL_SOFT_RESET 0x51400017 - -#define MSR_PIC_YSEL_LOW 0x51400020 -#define MSR_PIC_YSEL_HIGH 0x51400021 -#define MSR_PIC_ZSEL_LOW 0x51400022 -#define MSR_PIC_ZSEL_HIGH 0x51400023 -#define MSR_PIC_IRQM_LPC 0x51400025 - -#define MSR_MFGPT_IRQ 0x51400028 -#define MSR_MFGPT_NR 0x51400029 -#define MSR_MFGPT_SETUP 0x5140002B - -#define MSR_LX_SPARE_MSR 0x80000011 /* DC-specific */ - -#define MSR_GX_GLD_MSR_CONFIG 0xC0002001 -#define MSR_GX_MSR_PADSEL 0xC0002011 - -/* Resource Sizes */ - -#define LBAR_GPIO_SIZE 0xFF -#define LBAR_MFGPT_SIZE 0x40 -#define LBAR_ACPI_SIZE 0x40 -#define LBAR_PMS_SIZE 0x80 - -/* ACPI registers (PMS block) */ - -/* - * PM1_EN is only valid when VSA is enabled for 16 bit reads. - * When VSA is not enabled, *always* read both PM1_STS and PM1_EN - * with a 32 bit read at offset 0x0 - */ - -#define PM1_STS 0x00 -#define PM1_EN 0x02 -#define PM1_CNT 0x08 -#define PM2_CNT 0x0C -#define PM_TMR 0x10 -#define PM_GPE0_STS 0x18 -#define PM_GPE0_EN 0x1C - -/* PMC registers (PMS block) */ - -#define PM_SSD 0x00 -#define PM_SCXA 0x04 -#define PM_SCYA 0x08 -#define PM_OUT_SLPCTL 0x0C -#define PM_SCLK 0x10 -#define PM_SED 0x1 -#define PM_SCXD 0x18 -#define PM_SCYD 0x1C -#define PM_IN_SLPCTL 0x20 -#define PM_WKD 0x30 -#define PM_WKXD 0x34 -#define PM_RD 0x38 -#define PM_WKXA 0x3C -#define PM_FSD 0x40 -#define PM_TSD 0x44 -#define PM_PSD 0x48 -#define PM_NWKD 0x4C -#define PM_AWKD 0x50 -#define PM_SSC 0x54 - -/* VSA2 magic values */ - -#define VSA_VRC_INDEX 0xAC1C -#define VSA_VRC_DATA 0xAC1E -#define VSA_VR_UNLOCK 0xFC53 /* unlock virtual register */ -#define VSA_VR_SIGNATURE 0x0003 -#define VSA_VR_MEM_SIZE 0x0200 -#define AMD_VSA_SIG 0x4132 /* signature is ascii 'VSA2' */ -#define GSW_VSA_SIG 0x534d /* General Software signature */ -/* GPIO */ - -#define GPIO_OUTPUT_VAL 0x00 -#define GPIO_OUTPUT_ENABLE 0x04 -#define GPIO_OUTPUT_OPEN_DRAIN 0x08 -#define GPIO_OUTPUT_INVERT 0x0C -#define GPIO_OUTPUT_AUX1 0x10 -#define GPIO_OUTPUT_AUX2 0x14 -#define GPIO_PULL_UP 0x18 -#define GPIO_PULL_DOWN 0x1C -#define GPIO_INPUT_ENABLE 0x20 -#define GPIO_INPUT_INVERT 0x24 -#define GPIO_INPUT_FILTER 0x28 -#define GPIO_INPUT_EVENT_COUNT 0x2C -#define GPIO_READ_BACK 0x30 -#define GPIO_INPUT_AUX1 0x34 -#define GPIO_EVENTS_ENABLE 0x38 -#define GPIO_LOCK_ENABLE 0x3C -#define GPIO_POSITIVE_EDGE_EN 0x40 -#define GPIO_NEGATIVE_EDGE_EN 0x44 -#define GPIO_POSITIVE_EDGE_STS 0x48 -#define GPIO_NEGATIVE_EDGE_STS 0x4C - -#define GPIO_MAP_X 0xE0 -#define GPIO_MAP_Y 0xE4 -#define GPIO_MAP_Z 0xE8 -#define GPIO_MAP_W 0xEC - -static inline u32 geode_gpio(unsigned int nr) -{ - BUG_ON(nr > 28); - return 1 << nr; -} - -extern void geode_gpio_set(u32, unsigned int); -extern void geode_gpio_clear(u32, unsigned int); -extern int geode_gpio_isset(u32, unsigned int); -extern void geode_gpio_setup_event(unsigned int, int, int); -extern void geode_gpio_set_irq(unsigned int, unsigned int); - -static inline void geode_gpio_event_irq(unsigned int gpio, int pair) -{ - geode_gpio_setup_event(gpio, pair, 0); -} - -static inline void geode_gpio_event_pme(unsigned int gpio, int pair) -{ - geode_gpio_setup_event(gpio, pair, 1); -} - -/* Specific geode tests */ +#include <linux/cs5535.h> static inline int is_geode_gx(void) { @@ -186,68 +33,4 @@ static inline int is_geode(void) return (is_geode_gx() || is_geode_lx()); } -#ifdef CONFIG_MGEODE_LX -extern int geode_has_vsa2(void); -#else -static inline int geode_has_vsa2(void) -{ - return 0; -} -#endif - -/* MFGPTs */ - -#define MFGPT_MAX_TIMERS 8 -#define MFGPT_TIMER_ANY (-1) - -#define MFGPT_DOMAIN_WORKING 1 -#define MFGPT_DOMAIN_STANDBY 2 -#define MFGPT_DOMAIN_ANY (MFGPT_DOMAIN_WORKING | MFGPT_DOMAIN_STANDBY) - -#define MFGPT_CMP1 0 -#define MFGPT_CMP2 1 - -#define MFGPT_EVENT_IRQ 0 -#define MFGPT_EVENT_NMI 1 -#define MFGPT_EVENT_RESET 3 - -#define MFGPT_REG_CMP1 0 -#define MFGPT_REG_CMP2 2 -#define MFGPT_REG_COUNTER 4 -#define MFGPT_REG_SETUP 6 - -#define MFGPT_SETUP_CNTEN (1 << 15) -#define MFGPT_SETUP_CMP2 (1 << 14) -#define MFGPT_SETUP_CMP1 (1 << 13) -#define MFGPT_SETUP_SETUP (1 << 12) -#define MFGPT_SETUP_STOPEN (1 << 11) -#define MFGPT_SETUP_EXTEN (1 << 10) -#define MFGPT_SETUP_REVEN (1 << 5) -#define MFGPT_SETUP_CLKSEL (1 << 4) - -static inline void geode_mfgpt_write(int timer, u16 reg, u16 value) -{ - u32 base = geode_get_dev_base(GEODE_DEV_MFGPT); - outw(value, base + reg + (timer * 8)); -} - -static inline u16 geode_mfgpt_read(int timer, u16 reg) -{ - u32 base = geode_get_dev_base(GEODE_DEV_MFGPT); - return inw(base + reg + (timer * 8)); -} - -extern int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable); -extern int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable); -extern int geode_mfgpt_alloc_timer(int timer, int domain); - -#define geode_mfgpt_setup_irq(t, c, i) geode_mfgpt_set_irq((t), (c), (i), 1) -#define geode_mfgpt_release_irq(t, c, i) geode_mfgpt_set_irq((t), (c), (i), 0) - -#ifdef CONFIG_GEODE_MFGPT_TIMER -extern int __init mfgpt_timer_setup(void); -#else -static inline int mfgpt_timer_setup(void) { return 0; } -#endif - #endif /* _ASM_X86_GEODE_H */ diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index 5d89fd2a3690..1d5c08a1bdfd 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -67,6 +67,7 @@ extern unsigned long hpet_address; extern unsigned long force_hpet_address; extern u8 hpet_blockid; extern int hpet_force_user; +extern u8 hpet_msi_disable; extern int is_hpet_enabled(void); extern int hpet_enable(void); extern void hpet_disable(void); diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 08c48a81841f..eeac829a0f44 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -103,7 +103,8 @@ extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *); extern void send_cleanup_vector(struct irq_cfg *); struct irq_desc; -extern unsigned int set_desc_affinity(struct irq_desc *, const struct cpumask *); +extern unsigned int set_desc_affinity(struct irq_desc *, const struct cpumask *, + unsigned int *dest_id); extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin, struct io_apic_irq_attr *irq_attr); extern void setup_ioapic_dest(void); diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index ebfb8a9e11f7..da2930924501 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -33,8 +33,16 @@ extern void init_thread_xstate(void); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); extern user_regset_active_fn fpregs_active, xfpregs_active; -extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get; -extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set; +extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, + xstateregs_get; +extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set, + xstateregs_set; + +/* + * xstateregs_active == fpregs_active. Please refer to the comment + * at the definition of fpregs_active. + */ +#define xstateregs_active fpregs_active extern struct _fpx_sw_bytes fx_sw_reserved; #ifdef CONFIG_IA32_EMULATION diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 73739322b6d0..a1dcfa3ab17d 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -1,8 +1,42 @@ #ifndef _ASM_X86_IO_H #define _ASM_X86_IO_H +/* + * This file contains the definitions for the x86 IO instructions + * inb/inw/inl/outb/outw/outl and the "string versions" of the same + * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing" + * versions of the single-IO instructions (inb_p/inw_p/..). + * + * This file is not meant to be obfuscating: it's just complicated + * to (a) handle it all in a way that makes gcc able to optimize it + * as well as possible and (b) trying to avoid writing the same thing + * over and over again with slight variations and possibly making a + * mistake somewhere. + */ + +/* + * Thanks to James van Artsdalen for a better timing-fix than + * the two short jumps: using outb's to a nonexistent port seems + * to guarantee better timings even on fast machines. + * + * On the other hand, I'd like to be sure of a non-existent port: + * I feel a bit unsafe about using 0x80 (should be safe, though) + * + * Linus + */ + + /* + * Bit simplified and optimized by Jan Hubicka + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999. + * + * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added, + * isa_read[wl] and isa_write[wl] fixed + * - Arnaldo Carvalho de Melo <acme@conectiva.com.br> + */ + #define ARCH_HAS_IOREMAP_WC +#include <linux/string.h> #include <linux/compiler.h> #include <asm-generic/int-ll64.h> #include <asm/page.h> @@ -173,11 +207,126 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) extern void iounmap(volatile void __iomem *addr); -#ifdef CONFIG_X86_32 -# include "io_32.h" +#ifdef __KERNEL__ + +#include <asm-generic/iomap.h> + +#include <linux/vmalloc.h> + +/* + * Convert a virtual cached pointer to an uncached pointer + */ +#define xlate_dev_kmem_ptr(p) p + +static inline void +memset_io(volatile void __iomem *addr, unsigned char val, size_t count) +{ + memset((void __force *)addr, val, count); +} + +static inline void +memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count) +{ + memcpy(dst, (const void __force *)src, count); +} + +static inline void +memcpy_toio(volatile void __iomem *dst, const void *src, size_t count) +{ + memcpy((void __force *)dst, src, count); +} + +/* + * ISA space is 'always mapped' on a typical x86 system, no need to + * explicitly ioremap() it. The fact that the ISA IO space is mapped + * to PAGE_OFFSET is pure coincidence - it does not mean ISA values + * are physical addresses. The following constant pointer can be + * used as the IO-area pointer (it can be iounmapped as well, so the + * analogy with PCI is quite large): + */ +#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET)) + +/* + * Cache management + * + * This needed for two cases + * 1. Out of order aware processors + * 2. Accidentally out of order processors (PPro errata #51) + */ + +static inline void flush_write_buffers(void) +{ +#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE) + asm volatile("lock; addl $0,0(%%esp)": : :"memory"); +#endif +} + +#endif /* __KERNEL__ */ + +extern void native_io_delay(void); + +extern int io_delay_type; +extern void io_delay_init(void); + +#if defined(CONFIG_PARAVIRT) +#include <asm/paravirt.h> #else -# include "io_64.h" + +static inline void slow_down_io(void) +{ + native_io_delay(); +#ifdef REALLY_SLOW_IO + native_io_delay(); + native_io_delay(); + native_io_delay(); #endif +} + +#endif + +#define BUILDIO(bwl, bw, type) \ +static inline void out##bwl(unsigned type value, int port) \ +{ \ + asm volatile("out" #bwl " %" #bw "0, %w1" \ + : : "a"(value), "Nd"(port)); \ +} \ + \ +static inline unsigned type in##bwl(int port) \ +{ \ + unsigned type value; \ + asm volatile("in" #bwl " %w1, %" #bw "0" \ + : "=a"(value) : "Nd"(port)); \ + return value; \ +} \ + \ +static inline void out##bwl##_p(unsigned type value, int port) \ +{ \ + out##bwl(value, port); \ + slow_down_io(); \ +} \ + \ +static inline unsigned type in##bwl##_p(int port) \ +{ \ + unsigned type value = in##bwl(port); \ + slow_down_io(); \ + return value; \ +} \ + \ +static inline void outs##bwl(int port, const void *addr, unsigned long count) \ +{ \ + asm volatile("rep; outs" #bwl \ + : "+S"(addr), "+c"(count) : "d"(port)); \ +} \ + \ +static inline void ins##bwl(int port, void *addr, unsigned long count) \ +{ \ + asm volatile("rep; ins" #bwl \ + : "+D"(addr), "+c"(count) : "d"(port)); \ +} + +BUILDIO(b, b, char) +BUILDIO(w, w, short) +BUILDIO(l, , int) extern void *xlate_dev_mem_ptr(unsigned long phys); extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr); diff --git a/arch/x86/include/asm/io_32.h b/arch/x86/include/asm/io_32.h deleted file mode 100644 index a299900f5920..000000000000 --- a/arch/x86/include/asm/io_32.h +++ /dev/null @@ -1,196 +0,0 @@ -#ifndef _ASM_X86_IO_32_H -#define _ASM_X86_IO_32_H - -#include <linux/string.h> -#include <linux/compiler.h> - -/* - * This file contains the definitions for the x86 IO instructions - * inb/inw/inl/outb/outw/outl and the "string versions" of the same - * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing" - * versions of the single-IO instructions (inb_p/inw_p/..). - * - * This file is not meant to be obfuscating: it's just complicated - * to (a) handle it all in a way that makes gcc able to optimize it - * as well as possible and (b) trying to avoid writing the same thing - * over and over again with slight variations and possibly making a - * mistake somewhere. - */ - -/* - * Thanks to James van Artsdalen for a better timing-fix than - * the two short jumps: using outb's to a nonexistent port seems - * to guarantee better timings even on fast machines. - * - * On the other hand, I'd like to be sure of a non-existent port: - * I feel a bit unsafe about using 0x80 (should be safe, though) - * - * Linus - */ - - /* - * Bit simplified and optimized by Jan Hubicka - * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999. - * - * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added, - * isa_read[wl] and isa_write[wl] fixed - * - Arnaldo Carvalho de Melo <acme@conectiva.com.br> - */ - -#define XQUAD_PORTIO_BASE 0xfe400000 -#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */ - -#ifdef __KERNEL__ - -#include <asm-generic/iomap.h> - -#include <linux/vmalloc.h> - -/* - * Convert a virtual cached pointer to an uncached pointer - */ -#define xlate_dev_kmem_ptr(p) p - -static inline void -memset_io(volatile void __iomem *addr, unsigned char val, int count) -{ - memset((void __force *)addr, val, count); -} - -static inline void -memcpy_fromio(void *dst, const volatile void __iomem *src, int count) -{ - __memcpy(dst, (const void __force *)src, count); -} - -static inline void -memcpy_toio(volatile void __iomem *dst, const void *src, int count) -{ - __memcpy((void __force *)dst, src, count); -} - -/* - * ISA space is 'always mapped' on a typical x86 system, no need to - * explicitly ioremap() it. The fact that the ISA IO space is mapped - * to PAGE_OFFSET is pure coincidence - it does not mean ISA values - * are physical addresses. The following constant pointer can be - * used as the IO-area pointer (it can be iounmapped as well, so the - * analogy with PCI is quite large): - */ -#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET)) - -/* - * Cache management - * - * This needed for two cases - * 1. Out of order aware processors - * 2. Accidentally out of order processors (PPro errata #51) - */ - -#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE) - -static inline void flush_write_buffers(void) -{ - asm volatile("lock; addl $0,0(%%esp)": : :"memory"); -} - -#else - -#define flush_write_buffers() do { } while (0) - -#endif - -#endif /* __KERNEL__ */ - -extern void native_io_delay(void); - -extern int io_delay_type; -extern void io_delay_init(void); - -#if defined(CONFIG_PARAVIRT) -#include <asm/paravirt.h> -#else - -static inline void slow_down_io(void) -{ - native_io_delay(); -#ifdef REALLY_SLOW_IO - native_io_delay(); - native_io_delay(); - native_io_delay(); -#endif -} - -#endif - -#define __BUILDIO(bwl, bw, type) \ -static inline void out##bwl(unsigned type value, int port) \ -{ \ - out##bwl##_local(value, port); \ -} \ - \ -static inline unsigned type in##bwl(int port) \ -{ \ - return in##bwl##_local(port); \ -} - -#define BUILDIO(bwl, bw, type) \ -static inline void out##bwl##_local(unsigned type value, int port) \ -{ \ - asm volatile("out" #bwl " %" #bw "0, %w1" \ - : : "a"(value), "Nd"(port)); \ -} \ - \ -static inline unsigned type in##bwl##_local(int port) \ -{ \ - unsigned type value; \ - asm volatile("in" #bwl " %w1, %" #bw "0" \ - : "=a"(value) : "Nd"(port)); \ - return value; \ -} \ - \ -static inline void out##bwl##_local_p(unsigned type value, int port) \ -{ \ - out##bwl##_local(value, port); \ - slow_down_io(); \ -} \ - \ -static inline unsigned type in##bwl##_local_p(int port) \ -{ \ - unsigned type value = in##bwl##_local(port); \ - slow_down_io(); \ - return value; \ -} \ - \ -__BUILDIO(bwl, bw, type) \ - \ -static inline void out##bwl##_p(unsigned type value, int port) \ -{ \ - out##bwl(value, port); \ - slow_down_io(); \ -} \ - \ -static inline unsigned type in##bwl##_p(int port) \ -{ \ - unsigned type value = in##bwl(port); \ - slow_down_io(); \ - return value; \ -} \ - \ -static inline void outs##bwl(int port, const void *addr, unsigned long count) \ -{ \ - asm volatile("rep; outs" #bwl \ - : "+S"(addr), "+c"(count) : "d"(port)); \ -} \ - \ -static inline void ins##bwl(int port, void *addr, unsigned long count) \ -{ \ - asm volatile("rep; ins" #bwl \ - : "+D"(addr), "+c"(count) : "d"(port)); \ -} - -BUILDIO(b, b, char) -BUILDIO(w, w, short) -BUILDIO(l, , int) - -#endif /* _ASM_X86_IO_32_H */ diff --git a/arch/x86/include/asm/io_64.h b/arch/x86/include/asm/io_64.h deleted file mode 100644 index 244067893af4..000000000000 --- a/arch/x86/include/asm/io_64.h +++ /dev/null @@ -1,181 +0,0 @@ -#ifndef _ASM_X86_IO_64_H -#define _ASM_X86_IO_64_H - - -/* - * This file contains the definitions for the x86 IO instructions - * inb/inw/inl/outb/outw/outl and the "string versions" of the same - * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing" - * versions of the single-IO instructions (inb_p/inw_p/..). - * - * This file is not meant to be obfuscating: it's just complicated - * to (a) handle it all in a way that makes gcc able to optimize it - * as well as possible and (b) trying to avoid writing the same thing - * over and over again with slight variations and possibly making a - * mistake somewhere. - */ - -/* - * Thanks to James van Artsdalen for a better timing-fix than - * the two short jumps: using outb's to a nonexistent port seems - * to guarantee better timings even on fast machines. - * - * On the other hand, I'd like to be sure of a non-existent port: - * I feel a bit unsafe about using 0x80 (should be safe, though) - * - * Linus - */ - - /* - * Bit simplified and optimized by Jan Hubicka - * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999. - * - * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added, - * isa_read[wl] and isa_write[wl] fixed - * - Arnaldo Carvalho de Melo <acme@conectiva.com.br> - */ - -extern void native_io_delay(void); - -extern int io_delay_type; -extern void io_delay_init(void); - -#if defined(CONFIG_PARAVIRT) -#include <asm/paravirt.h> -#else - -static inline void slow_down_io(void) -{ - native_io_delay(); -#ifdef REALLY_SLOW_IO - native_io_delay(); - native_io_delay(); - native_io_delay(); -#endif -} -#endif - -/* - * Talk about misusing macros.. - */ -#define __OUT1(s, x) \ -static inline void out##s(unsigned x value, unsigned short port) { - -#define __OUT2(s, s1, s2) \ -asm volatile ("out" #s " %" s1 "0,%" s2 "1" - -#ifndef REALLY_SLOW_IO -#define REALLY_SLOW_IO -#define UNSET_REALLY_SLOW_IO -#endif - -#define __OUT(s, s1, x) \ - __OUT1(s, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \ - } \ - __OUT1(s##_p, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \ - slow_down_io(); \ -} - -#define __IN1(s) \ -static inline RETURN_TYPE in##s(unsigned short port) \ -{ \ - RETURN_TYPE _v; - -#define __IN2(s, s1, s2) \ - asm volatile ("in" #s " %" s2 "1,%" s1 "0" - -#define __IN(s, s1, i...) \ - __IN1(s) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \ - return _v; \ - } \ - __IN1(s##_p) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \ - slow_down_io(); \ - return _v; } - -#ifdef UNSET_REALLY_SLOW_IO -#undef REALLY_SLOW_IO -#endif - -#define __INS(s) \ -static inline void ins##s(unsigned short port, void *addr, \ - unsigned long count) \ -{ \ - asm volatile ("rep ; ins" #s \ - : "=D" (addr), "=c" (count) \ - : "d" (port), "0" (addr), "1" (count)); \ -} - -#define __OUTS(s) \ -static inline void outs##s(unsigned short port, const void *addr, \ - unsigned long count) \ -{ \ - asm volatile ("rep ; outs" #s \ - : "=S" (addr), "=c" (count) \ - : "d" (port), "0" (addr), "1" (count)); \ -} - -#define RETURN_TYPE unsigned char -__IN(b, "") -#undef RETURN_TYPE -#define RETURN_TYPE unsigned short -__IN(w, "") -#undef RETURN_TYPE -#define RETURN_TYPE unsigned int -__IN(l, "") -#undef RETURN_TYPE - -__OUT(b, "b", char) -__OUT(w, "w", short) -__OUT(l, , int) - -__INS(b) -__INS(w) -__INS(l) - -__OUTS(b) -__OUTS(w) -__OUTS(l) - -#if defined(__KERNEL__) && defined(__x86_64__) - -#include <linux/vmalloc.h> - -#include <asm-generic/iomap.h> - -void __memcpy_fromio(void *, unsigned long, unsigned); -void __memcpy_toio(unsigned long, const void *, unsigned); - -static inline void memcpy_fromio(void *to, const volatile void __iomem *from, - unsigned len) -{ - __memcpy_fromio(to, (unsigned long)from, len); -} - -static inline void memcpy_toio(volatile void __iomem *to, const void *from, - unsigned len) -{ - __memcpy_toio((unsigned long)to, from, len); -} - -void memset_io(volatile void __iomem *a, int b, size_t c); - -/* - * ISA space is 'always mapped' on a typical x86 system, no need to - * explicitly ioremap() it. The fact that the ISA IO space is mapped - * to PAGE_OFFSET is pure coincidence - it does not mean ISA values - * are physical addresses. The following constant pointer can be - * used as the IO-area pointer (it can be iounmapped as well, so the - * analogy with PCI is quite large): - */ -#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET)) - -#define flush_write_buffers() - -/* - * Convert a virtual cached pointer to an uncached pointer - */ -#define xlate_dev_kmem_ptr(p) p - -#endif /* __KERNEL__ */ - -#endif /* _ASM_X86_IO_64_H */ diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 6a635bd39867..4611f085cd43 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -113,7 +113,7 @@ */ #define LOCAL_PENDING_VECTOR 0xec -#define UV_BAU_MESSAGE 0xec +#define UV_BAU_MESSAGE 0xea /* * Self IPI vector for machine checks diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index 950df434763f..f46b79f6c16c 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -254,6 +254,10 @@ struct kvm_reinject_control { __u8 reserved[31]; }; +/* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */ +#define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001 +#define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002 + /* for KVM_GET/SET_VCPU_EVENTS */ struct kvm_vcpu_events { struct { diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 858baa061cfc..6c3fdd631ed3 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -108,10 +108,11 @@ struct mce_log { #define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9) #define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0) -extern struct atomic_notifier_head x86_mce_decoder_chain; #ifdef __KERNEL__ +extern struct atomic_notifier_head x86_mce_decoder_chain; + #include <linux/percpu.h> #include <linux/init.h> #include <asm/atomic.h> diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index c24ca9a56458..ef51b501e22a 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -12,8 +12,6 @@ struct device; enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND }; struct microcode_ops { - void (*init)(struct device *device); - void (*fini)(void); enum ucode_state (*request_microcode_user) (int cpu, const void __user *buf, size_t size); diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h index ede6998bd92c..91df7c51806c 100644 --- a/arch/x86/include/asm/mmzone_32.h +++ b/arch/x86/include/asm/mmzone_32.h @@ -47,7 +47,7 @@ static inline void resume_map_numa_kva(pgd_t *pgd) {} /* * generic node memory support, the following assumptions apply: * - * 1) memory comes in 64Mb contigious chunks which are either present or not + * 1) memory comes in 64Mb contiguous chunks which are either present or not * 2) we will not have more than 64Gb in total * * for now assume that 64Gb is max amount of RAM for whole system diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h index a29f48c2a322..288b96f815a6 100644 --- a/arch/x86/include/asm/mmzone_64.h +++ b/arch/x86/include/asm/mmzone_64.h @@ -39,11 +39,5 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ NODE_DATA(nid)->node_spanned_pages) - -#ifdef CONFIG_NUMA_EMU -#define FAKE_NODE_MIN_SIZE (64 * 1024 * 1024) -#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) -#endif - #endif #endif /* _ASM_X86_MMZONE_64_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 4ffe09b2ad75..1cd58cdbc03f 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -12,6 +12,7 @@ #define MSR_FS_BASE 0xc0000100 /* 64bit FS base */ #define MSR_GS_BASE 0xc0000101 /* 64bit GS base */ #define MSR_KERNEL_GS_BASE 0xc0000102 /* SwapGS GS shadow */ +#define MSR_TSC_AUX 0xc0000103 /* Auxiliary TSC */ /* EFER bits: */ #define _EFER_SCE 0 /* SYSCALL/SYSRET */ @@ -123,6 +124,7 @@ #define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2 #define FAM10H_MMIO_CONF_BASE_MASK 0xfffffff #define FAM10H_MMIO_CONF_BASE_SHIFT 20 +#define MSR_FAM10H_NODE_ID 0xc001100c /* K8 MSRs */ #define MSR_K8_TOP_MEM1 0xc001001a diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 5bef931f8b14..c5bc4c2d33f5 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -27,6 +27,18 @@ struct msr { }; }; +struct msr_info { + u32 msr_no; + struct msr reg; + struct msr *msrs; + int err; +}; + +struct msr_regs_info { + u32 *regs; + int err; +}; + static inline unsigned long long native_read_tscp(unsigned int *aux) { unsigned long low, high; @@ -240,9 +252,12 @@ do { \ #define checking_wrmsrl(msr, val) wrmsr_safe((msr), (u32)(val), \ (u32)((val) >> 32)) -#define write_tsc(val1, val2) wrmsr(0x10, (val1), (val2)) +#define write_tsc(val1, val2) wrmsr(MSR_IA32_TSC, (val1), (val2)) + +#define write_rdtscp_aux(val) wrmsr(MSR_TSC_AUX, (val), 0) -#define write_rdtscp_aux(val) wrmsr(0xc0000103, (val), 0) +struct msr *msrs_alloc(void); +void msrs_free(struct msr *msrs); #ifdef CONFIG_SMP int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 139d4c1a33a7..93da9c3f3341 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -19,7 +19,6 @@ extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); extern int check_nmi_watchdog(void); extern int nmi_watchdog_enabled; extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); -extern int avail_to_resrv_perfctr_nmi(unsigned int); extern int reserve_perfctr_nmi(unsigned int); extern void release_perfctr_nmi(unsigned int); extern int reserve_evntsel_nmi(unsigned int); diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index c4ae822e415f..823e070e7c26 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -36,6 +36,11 @@ extern void __cpuinit numa_set_node(int cpu, int node); extern void __cpuinit numa_clear_node(int cpu); extern void __cpuinit numa_add_cpu(int cpu); extern void __cpuinit numa_remove_cpu(int cpu); + +#ifdef CONFIG_NUMA_EMU +#define FAKE_NODE_MIN_SIZE ((u64)64 << 20) +#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) +#endif /* CONFIG_NUMA_EMU */ #else static inline void init_cpu_to_node(void) { } static inline void numa_set_node(int cpu, int node) { } diff --git a/arch/x86/include/asm/numaq.h b/arch/x86/include/asm/numaq.h index 9f0a5f5d29ec..13370b95ea94 100644 --- a/arch/x86/include/asm/numaq.h +++ b/arch/x86/include/asm/numaq.h @@ -33,6 +33,10 @@ extern int get_memcfg_numaq(void); extern void *xquad_portio; +#define XQUAD_PORTIO_BASE 0xfe400000 +#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */ +#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port) + /* * SYS_CFG_DATA_PRIV_ADDR, struct eachquadmem, and struct sys_cfg_data are the */ diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h index 834a30295fab..3a57385d9fa7 100644 --- a/arch/x86/include/asm/olpc.h +++ b/arch/x86/include/asm/olpc.h @@ -120,7 +120,7 @@ extern int olpc_ec_mask_unset(uint8_t bits); /* GPIO assignments */ -#define OLPC_GPIO_MIC_AC geode_gpio(1) +#define OLPC_GPIO_MIC_AC 1 #define OLPC_GPIO_DCON_IRQ geode_gpio(7) #define OLPC_GPIO_THRM_ALRM geode_gpio(10) #define OLPC_GPIO_SMB_CLK geode_gpio(14) diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 642fe34b36a2..a667f24c7254 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -40,7 +40,6 @@ #ifndef __ASSEMBLY__ -extern int page_is_ram(unsigned long pagenr); extern int devmem_is_allowed(unsigned long pagenr); extern unsigned long max_low_pfn_mapped; diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index efb38994859c..dd59a85a918f 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -731,34 +731,34 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) -static inline int __raw_spin_is_locked(struct raw_spinlock *lock) +static inline int arch_spin_is_locked(struct arch_spinlock *lock) { return PVOP_CALL1(int, pv_lock_ops.spin_is_locked, lock); } -static inline int __raw_spin_is_contended(struct raw_spinlock *lock) +static inline int arch_spin_is_contended(struct arch_spinlock *lock) { return PVOP_CALL1(int, pv_lock_ops.spin_is_contended, lock); } -#define __raw_spin_is_contended __raw_spin_is_contended +#define arch_spin_is_contended arch_spin_is_contended -static __always_inline void __raw_spin_lock(struct raw_spinlock *lock) +static __always_inline void arch_spin_lock(struct arch_spinlock *lock) { PVOP_VCALL1(pv_lock_ops.spin_lock, lock); } -static __always_inline void __raw_spin_lock_flags(struct raw_spinlock *lock, +static __always_inline void arch_spin_lock_flags(struct arch_spinlock *lock, unsigned long flags) { PVOP_VCALL2(pv_lock_ops.spin_lock_flags, lock, flags); } -static __always_inline int __raw_spin_trylock(struct raw_spinlock *lock) +static __always_inline int arch_spin_trylock(struct arch_spinlock *lock) { return PVOP_CALL1(int, pv_lock_ops.spin_trylock, lock); } -static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock) +static __always_inline void arch_spin_unlock(struct arch_spinlock *lock) { PVOP_VCALL1(pv_lock_ops.spin_unlock, lock); } diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 9357473c8da0..b1e70d51e40c 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -318,14 +318,14 @@ struct pv_mmu_ops { phys_addr_t phys, pgprot_t flags); }; -struct raw_spinlock; +struct arch_spinlock; struct pv_lock_ops { - int (*spin_is_locked)(struct raw_spinlock *lock); - int (*spin_is_contended)(struct raw_spinlock *lock); - void (*spin_lock)(struct raw_spinlock *lock); - void (*spin_lock_flags)(struct raw_spinlock *lock, unsigned long flags); - int (*spin_trylock)(struct raw_spinlock *lock); - void (*spin_unlock)(struct raw_spinlock *lock); + int (*spin_is_locked)(struct arch_spinlock *lock); + int (*spin_is_contended)(struct arch_spinlock *lock); + void (*spin_lock)(struct arch_spinlock *lock); + void (*spin_lock_flags)(struct arch_spinlock *lock, unsigned long flags); + int (*spin_trylock)(struct arch_spinlock *lock); + void (*spin_unlock)(struct arch_spinlock *lock); }; /* This contains all the paravirt structures: we get a convenient diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index b399988eee3a..05b58ccb2e82 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -29,6 +29,7 @@ #define PCI_CHECK_ENABLE_AMD_MMCONF 0x20000 #define PCI_HAS_IO_ECS 0x40000 #define PCI_NOASSIGN_ROMS 0x80000 +#define PCI_ROOT_NO_CRS 0x100000 extern unsigned int pci_probe; extern unsigned long pirq_table_addr; @@ -118,11 +119,27 @@ extern int __init pcibios_init(void); /* pci-mmconfig.c */ +/* "PCI MMCONFIG %04x [bus %02x-%02x]" */ +#define PCI_MMCFG_RESOURCE_NAME_LEN (22 + 4 + 2 + 2) + +struct pci_mmcfg_region { + struct list_head list; + struct resource res; + u64 address; + char __iomem *virt; + u16 segment; + u8 start_bus; + u8 end_bus; + char name[PCI_MMCFG_RESOURCE_NAME_LEN]; +}; + extern int __init pci_mmcfg_arch_init(void); extern void __init pci_mmcfg_arch_free(void); +extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus); + +extern struct list_head pci_mmcfg_list; -extern struct acpi_mcfg_allocation *pci_mmcfg_config; -extern int pci_mmcfg_config_num; +#define PCI_MMCFG_BUS_OFFSET(bus) ((bus) << 20) /* * AMD Fam10h CPUs are buggy, and cannot access MMIO config space diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index b65a36defeb7..0c44196b78ac 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -74,31 +74,31 @@ extern void __bad_percpu_size(void); #define percpu_to_op(op, var, val) \ do { \ - typedef typeof(var) T__; \ + typedef typeof(var) pto_T__; \ if (0) { \ - T__ tmp__; \ - tmp__ = (val); \ + pto_T__ pto_tmp__; \ + pto_tmp__ = (val); \ } \ switch (sizeof(var)) { \ case 1: \ asm(op "b %1,"__percpu_arg(0) \ : "+m" (var) \ - : "qi" ((T__)(val))); \ + : "qi" ((pto_T__)(val))); \ break; \ case 2: \ asm(op "w %1,"__percpu_arg(0) \ : "+m" (var) \ - : "ri" ((T__)(val))); \ + : "ri" ((pto_T__)(val))); \ break; \ case 4: \ asm(op "l %1,"__percpu_arg(0) \ : "+m" (var) \ - : "ri" ((T__)(val))); \ + : "ri" ((pto_T__)(val))); \ break; \ case 8: \ asm(op "q %1,"__percpu_arg(0) \ : "+m" (var) \ - : "re" ((T__)(val))); \ + : "re" ((pto_T__)(val))); \ break; \ default: __bad_percpu_size(); \ } \ @@ -106,31 +106,31 @@ do { \ #define percpu_from_op(op, var, constraint) \ ({ \ - typeof(var) ret__; \ + typeof(var) pfo_ret__; \ switch (sizeof(var)) { \ case 1: \ asm(op "b "__percpu_arg(1)",%0" \ - : "=q" (ret__) \ + : "=q" (pfo_ret__) \ : constraint); \ break; \ case 2: \ asm(op "w "__percpu_arg(1)",%0" \ - : "=r" (ret__) \ + : "=r" (pfo_ret__) \ : constraint); \ break; \ case 4: \ asm(op "l "__percpu_arg(1)",%0" \ - : "=r" (ret__) \ + : "=r" (pfo_ret__) \ : constraint); \ break; \ case 8: \ asm(op "q "__percpu_arg(1)",%0" \ - : "=r" (ret__) \ + : "=r" (pfo_ret__) \ : constraint); \ break; \ default: __bad_percpu_size(); \ } \ - ret__; \ + pfo_ret__; \ }) /* @@ -153,6 +153,84 @@ do { \ #define percpu_or(var, val) percpu_to_op("or", per_cpu__##var, val) #define percpu_xor(var, val) percpu_to_op("xor", per_cpu__##var, val) +#define __this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define __this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define __this_cpu_read_4(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) + +#define __this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) +#define __this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) +#define __this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val) +#define __this_cpu_add_1(pcp, val) percpu_to_op("add", (pcp), val) +#define __this_cpu_add_2(pcp, val) percpu_to_op("add", (pcp), val) +#define __this_cpu_add_4(pcp, val) percpu_to_op("add", (pcp), val) +#define __this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) +#define __this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) +#define __this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) +#define __this_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val) +#define __this_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val) +#define __this_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val) +#define __this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val) +#define __this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) +#define __this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) + +#define this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define this_cpu_read_4(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) +#define this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) +#define this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val) +#define this_cpu_add_1(pcp, val) percpu_to_op("add", (pcp), val) +#define this_cpu_add_2(pcp, val) percpu_to_op("add", (pcp), val) +#define this_cpu_add_4(pcp, val) percpu_to_op("add", (pcp), val) +#define this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) +#define this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) +#define this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) +#define this_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val) +#define this_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val) +#define this_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val) +#define this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val) +#define this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) +#define this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) + +#define irqsafe_cpu_add_1(pcp, val) percpu_to_op("add", (pcp), val) +#define irqsafe_cpu_add_2(pcp, val) percpu_to_op("add", (pcp), val) +#define irqsafe_cpu_add_4(pcp, val) percpu_to_op("add", (pcp), val) +#define irqsafe_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) +#define irqsafe_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) +#define irqsafe_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) +#define irqsafe_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val) +#define irqsafe_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val) +#define irqsafe_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val) +#define irqsafe_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val) +#define irqsafe_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) +#define irqsafe_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) + +/* + * Per cpu atomic 64 bit operations are only available under 64 bit. + * 32 bit must fall back to generic operations. + */ +#ifdef CONFIG_X86_64 +#define __this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define __this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) +#define __this_cpu_add_8(pcp, val) percpu_to_op("add", (pcp), val) +#define __this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) +#define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) +#define __this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) + +#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) +#define this_cpu_add_8(pcp, val) percpu_to_op("add", (pcp), val) +#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) +#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) +#define this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) + +#define irqsafe_cpu_add_8(pcp, val) percpu_to_op("add", (pcp), val) +#define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) +#define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) +#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) + +#endif + /* This is not atomic against other CPUs -- CPU preemption needs to be off */ #define x86_test_and_clear_bit_percpu(bit, var) \ ({ \ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 8d9f8548a870..befd172c82ad 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -19,6 +19,7 @@ #define MSR_ARCH_PERFMON_EVENTSEL1 0x187 #define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) +#define ARCH_PERFMON_EVENTSEL_ANY (1 << 21) #define ARCH_PERFMON_EVENTSEL_INT (1 << 20) #define ARCH_PERFMON_EVENTSEL_OS (1 << 17) #define ARCH_PERFMON_EVENTSEL_USR (1 << 16) @@ -26,7 +27,14 @@ /* * Includes eventsel and unit mask as well: */ -#define ARCH_PERFMON_EVENT_MASK 0xffff + + +#define INTEL_ARCH_EVTSEL_MASK 0x000000FFULL +#define INTEL_ARCH_UNIT_MASK 0x0000FF00ULL +#define INTEL_ARCH_EDGE_MASK 0x00040000ULL +#define INTEL_ARCH_INV_MASK 0x00800000ULL +#define INTEL_ARCH_CNT_MASK 0xFF000000ULL +#define INTEL_ARCH_EVENT_MASK (INTEL_ARCH_UNIT_MASK|INTEL_ARCH_EVTSEL_MASK) /* * filter mask to validate fixed counter events. @@ -37,7 +45,12 @@ * The other filters are supported by fixed counters. * The any-thread option is supported starting with v3. */ -#define ARCH_PERFMON_EVENT_FILTER_MASK 0xff840000 +#define INTEL_ARCH_FIXED_MASK \ + (INTEL_ARCH_CNT_MASK| \ + INTEL_ARCH_INV_MASK| \ + INTEL_ARCH_EDGE_MASK|\ + INTEL_ARCH_UNIT_MASK|\ + INTEL_ARCH_EVTSEL_MASK) #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index 0e8c2a0fd922..271de94c3810 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -23,6 +23,11 @@ static inline void paravirt_release_pud(unsigned long pfn) {} #endif /* + * Flags to use when allocating a user page table page. + */ +extern gfp_t __userpte_alloc_gfp; + +/* * Allocate and free page tables. */ extern pgd_t *pgd_alloc(struct mm_struct *); diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 6f8ec1c37e0a..b753ea59703a 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -181,7 +181,7 @@ static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { /* ecx is often an input as well as an output. */ - asm("cpuid" + asm volatile("cpuid" : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), @@ -450,6 +450,8 @@ struct thread_struct { struct perf_event *ptrace_bps[HBP_NUM]; /* Debug status used for traps, single steps, etc... */ unsigned long debugreg6; + /* Keep track of the exact dr7 value set by the user */ + unsigned long ptrace_dr7; /* Fault info: */ unsigned long cr2; unsigned long trap_no; diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 3d11fd0f44c5..20102808b191 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -274,10 +274,6 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, return 0; } -/* Get Nth argument at function call */ -extern unsigned long regs_get_argument_nth(struct pt_regs *regs, - unsigned int n); - /* * These are defined as per linux/ptrace.h, which see. */ @@ -292,6 +288,8 @@ extern void user_enable_block_step(struct task_struct *); #define arch_has_block_step() (boot_cpu_data.x86 >= 6) #endif +#define ARCH_HAS_USER_SINGLE_STEP_INFO + struct user_desc; extern int do_get_thread_area(struct task_struct *p, int idx, struct user_desc __user *info); diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index ca7517d33776..606ede126972 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -41,6 +41,7 @@ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/lockdep.h> +#include <asm/asm.h> struct rwsem_waiter; @@ -55,17 +56,28 @@ extern asmregparm struct rw_semaphore * /* * the semaphore definition + * + * The bias values and the counter type limits the number of + * potential readers/writers to 32767 for 32 bits and 2147483647 + * for 64 bits. */ -#define RWSEM_UNLOCKED_VALUE 0x00000000 -#define RWSEM_ACTIVE_BIAS 0x00000001 -#define RWSEM_ACTIVE_MASK 0x0000ffff -#define RWSEM_WAITING_BIAS (-0x00010000) +#ifdef CONFIG_X86_64 +# define RWSEM_ACTIVE_MASK 0xffffffffL +#else +# define RWSEM_ACTIVE_MASK 0x0000ffffL +#endif + +#define RWSEM_UNLOCKED_VALUE 0x00000000L +#define RWSEM_ACTIVE_BIAS 0x00000001L +#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1) #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) +typedef signed long rwsem_count_t; + struct rw_semaphore { - signed long count; + rwsem_count_t count; spinlock_t wait_lock; struct list_head wait_list; #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -105,7 +117,7 @@ do { \ static inline void __down_read(struct rw_semaphore *sem) { asm volatile("# beginning down_read\n\t" - LOCK_PREFIX " incl (%%eax)\n\t" + LOCK_PREFIX _ASM_INC "(%1)\n\t" /* adds 0x00000001, returns the old value */ " jns 1f\n" " call call_rwsem_down_read_failed\n" @@ -121,14 +133,14 @@ static inline void __down_read(struct rw_semaphore *sem) */ static inline int __down_read_trylock(struct rw_semaphore *sem) { - __s32 result, tmp; + rwsem_count_t result, tmp; asm volatile("# beginning __down_read_trylock\n\t" - " movl %0,%1\n\t" + " mov %0,%1\n\t" "1:\n\t" - " movl %1,%2\n\t" - " addl %3,%2\n\t" + " mov %1,%2\n\t" + " add %3,%2\n\t" " jle 2f\n\t" - LOCK_PREFIX " cmpxchgl %2,%0\n\t" + LOCK_PREFIX " cmpxchg %2,%0\n\t" " jnz 1b\n\t" "2:\n\t" "# ending __down_read_trylock\n\t" @@ -143,13 +155,13 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) */ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) { - int tmp; + rwsem_count_t tmp; tmp = RWSEM_ACTIVE_WRITE_BIAS; asm volatile("# beginning down_write\n\t" - LOCK_PREFIX " xadd %%edx,(%%eax)\n\t" + LOCK_PREFIX " xadd %1,(%2)\n\t" /* subtract 0x0000ffff, returns the old value */ - " testl %%edx,%%edx\n\t" + " test %1,%1\n\t" /* was the count 0 before? */ " jz 1f\n" " call call_rwsem_down_write_failed\n" @@ -170,9 +182,9 @@ static inline void __down_write(struct rw_semaphore *sem) */ static inline int __down_write_trylock(struct rw_semaphore *sem) { - signed long ret = cmpxchg(&sem->count, - RWSEM_UNLOCKED_VALUE, - RWSEM_ACTIVE_WRITE_BIAS); + rwsem_count_t ret = cmpxchg(&sem->count, + RWSEM_UNLOCKED_VALUE, + RWSEM_ACTIVE_WRITE_BIAS); if (ret == RWSEM_UNLOCKED_VALUE) return 1; return 0; @@ -183,9 +195,9 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) */ static inline void __up_read(struct rw_semaphore *sem) { - __s32 tmp = -RWSEM_ACTIVE_READ_BIAS; + rwsem_count_t tmp = -RWSEM_ACTIVE_READ_BIAS; asm volatile("# beginning __up_read\n\t" - LOCK_PREFIX " xadd %%edx,(%%eax)\n\t" + LOCK_PREFIX " xadd %1,(%2)\n\t" /* subtracts 1, returns the old value */ " jns 1f\n\t" " call call_rwsem_wake\n" @@ -201,18 +213,18 @@ static inline void __up_read(struct rw_semaphore *sem) */ static inline void __up_write(struct rw_semaphore *sem) { + rwsem_count_t tmp; asm volatile("# beginning __up_write\n\t" - " movl %2,%%edx\n\t" - LOCK_PREFIX " xaddl %%edx,(%%eax)\n\t" + LOCK_PREFIX " xadd %1,(%2)\n\t" /* tries to transition 0xffff0001 -> 0x00000000 */ " jz 1f\n" " call call_rwsem_wake\n" "1:\n\t" "# ending __up_write\n" - : "+m" (sem->count) - : "a" (sem), "i" (-RWSEM_ACTIVE_WRITE_BIAS) - : "memory", "cc", "edx"); + : "+m" (sem->count), "=d" (tmp) + : "a" (sem), "1" (-RWSEM_ACTIVE_WRITE_BIAS) + : "memory", "cc"); } /* @@ -221,33 +233,38 @@ static inline void __up_write(struct rw_semaphore *sem) static inline void __downgrade_write(struct rw_semaphore *sem) { asm volatile("# beginning __downgrade_write\n\t" - LOCK_PREFIX " addl %2,(%%eax)\n\t" - /* transitions 0xZZZZ0001 -> 0xYYYY0001 */ + LOCK_PREFIX _ASM_ADD "%2,(%1)\n\t" + /* + * transitions 0xZZZZ0001 -> 0xYYYY0001 (i386) + * 0xZZZZZZZZ00000001 -> 0xYYYYYYYY00000001 (x86_64) + */ " jns 1f\n\t" " call call_rwsem_downgrade_wake\n" "1:\n\t" "# ending __downgrade_write\n" : "+m" (sem->count) - : "a" (sem), "i" (-RWSEM_WAITING_BIAS) + : "a" (sem), "er" (-RWSEM_WAITING_BIAS) : "memory", "cc"); } /* * implement atomic add functionality */ -static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) +static inline void rwsem_atomic_add(rwsem_count_t delta, + struct rw_semaphore *sem) { - asm volatile(LOCK_PREFIX "addl %1,%0" + asm volatile(LOCK_PREFIX _ASM_ADD "%1,%0" : "+m" (sem->count) - : "ir" (delta)); + : "er" (delta)); } /* * implement exchange and add functionality */ -static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) +static inline rwsem_count_t rwsem_atomic_update(rwsem_count_t delta, + struct rw_semaphore *sem) { - int tmp = delta; + rwsem_count_t tmp = delta; asm volatile(LOCK_PREFIX "xadd %0,%1" : "+r" (tmp), "+m" (sem->count) diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h index 72e5a4491661..04459d25e66e 100644 --- a/arch/x86/include/asm/sigcontext.h +++ b/arch/x86/include/asm/sigcontext.h @@ -124,7 +124,7 @@ struct sigcontext { * fpstate is really (struct _fpstate *) or (struct _xstate *) * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end - * of extended memory layout. See comments at the defintion of + * of extended memory layout. See comments at the definition of * (struct _fpx_sw_bytes) */ void __user *fpstate; /* zero when no FPU/extended context */ @@ -219,7 +219,7 @@ struct sigcontext { * fpstate is really (struct _fpstate *) or (struct _xstate *) * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end - * of extended memory layout. See comments at the defintion of + * of extended memory layout. See comments at the definition of * (struct _fpx_sw_bytes) */ void __user *fpstate; /* zero when no FPU/extended context */ diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 1e796782cd7b..4cfc90824068 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -135,6 +135,8 @@ int native_cpu_disable(void); void native_cpu_die(unsigned int cpu); void native_play_dead(void); void play_dead_common(void); +void wbinvd_on_cpu(int cpu); +int wbinvd_on_all_cpus(void); void native_send_call_func_ipi(const struct cpumask *mask); void native_send_call_func_single_ipi(int cpu); @@ -147,6 +149,13 @@ static inline int num_booting_cpus(void) { return cpumask_weight(cpu_callout_mask); } +#else /* !CONFIG_SMP */ +#define wbinvd_on_cpu(cpu) wbinvd() +static inline int wbinvd_on_all_cpus(void) +{ + wbinvd(); + return 0; +} #endif /* CONFIG_SMP */ extern unsigned disabled_cpus __cpuinitdata; diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 4e77853321db..3089f70c0c52 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -58,7 +58,7 @@ #if (NR_CPUS < 256) #define TICKET_SHIFT 8 -static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock) +static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock) { short inc = 0x0100; @@ -77,7 +77,7 @@ static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock) : "memory", "cc"); } -static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock) +static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock) { int tmp, new; @@ -96,7 +96,7 @@ static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock) return tmp; } -static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock) +static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) { asm volatile(UNLOCK_LOCK_PREFIX "incb %0" : "+m" (lock->slock) @@ -106,7 +106,7 @@ static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock) #else #define TICKET_SHIFT 16 -static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock) +static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock) { int inc = 0x00010000; int tmp; @@ -127,7 +127,7 @@ static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock) : "memory", "cc"); } -static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock) +static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock) { int tmp; int new; @@ -149,7 +149,7 @@ static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock) return tmp; } -static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock) +static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) { asm volatile(UNLOCK_LOCK_PREFIX "incw %0" : "+m" (lock->slock) @@ -158,14 +158,14 @@ static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock) } #endif -static inline int __ticket_spin_is_locked(raw_spinlock_t *lock) +static inline int __ticket_spin_is_locked(arch_spinlock_t *lock) { int tmp = ACCESS_ONCE(lock->slock); return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1)); } -static inline int __ticket_spin_is_contended(raw_spinlock_t *lock) +static inline int __ticket_spin_is_contended(arch_spinlock_t *lock) { int tmp = ACCESS_ONCE(lock->slock); @@ -174,43 +174,43 @@ static inline int __ticket_spin_is_contended(raw_spinlock_t *lock) #ifndef CONFIG_PARAVIRT_SPINLOCKS -static inline int __raw_spin_is_locked(raw_spinlock_t *lock) +static inline int arch_spin_is_locked(arch_spinlock_t *lock) { return __ticket_spin_is_locked(lock); } -static inline int __raw_spin_is_contended(raw_spinlock_t *lock) +static inline int arch_spin_is_contended(arch_spinlock_t *lock) { return __ticket_spin_is_contended(lock); } -#define __raw_spin_is_contended __raw_spin_is_contended +#define arch_spin_is_contended arch_spin_is_contended -static __always_inline void __raw_spin_lock(raw_spinlock_t *lock) +static __always_inline void arch_spin_lock(arch_spinlock_t *lock) { __ticket_spin_lock(lock); } -static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock) +static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) { return __ticket_spin_trylock(lock); } -static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock) +static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) { __ticket_spin_unlock(lock); } -static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock, +static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) { - __raw_spin_lock(lock); + arch_spin_lock(lock); } #endif /* CONFIG_PARAVIRT_SPINLOCKS */ -static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock) +static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) { - while (__raw_spin_is_locked(lock)) + while (arch_spin_is_locked(lock)) cpu_relax(); } @@ -232,7 +232,7 @@ static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock) * read_can_lock - would read_trylock() succeed? * @lock: the rwlock in question. */ -static inline int __raw_read_can_lock(raw_rwlock_t *lock) +static inline int arch_read_can_lock(arch_rwlock_t *lock) { return (int)(lock)->lock > 0; } @@ -241,12 +241,12 @@ static inline int __raw_read_can_lock(raw_rwlock_t *lock) * write_can_lock - would write_trylock() succeed? * @lock: the rwlock in question. */ -static inline int __raw_write_can_lock(raw_rwlock_t *lock) +static inline int arch_write_can_lock(arch_rwlock_t *lock) { return (lock)->lock == RW_LOCK_BIAS; } -static inline void __raw_read_lock(raw_rwlock_t *rw) +static inline void arch_read_lock(arch_rwlock_t *rw) { asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t" "jns 1f\n" @@ -255,7 +255,7 @@ static inline void __raw_read_lock(raw_rwlock_t *rw) ::LOCK_PTR_REG (rw) : "memory"); } -static inline void __raw_write_lock(raw_rwlock_t *rw) +static inline void arch_write_lock(arch_rwlock_t *rw) { asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t" "jz 1f\n" @@ -264,7 +264,7 @@ static inline void __raw_write_lock(raw_rwlock_t *rw) ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory"); } -static inline int __raw_read_trylock(raw_rwlock_t *lock) +static inline int arch_read_trylock(arch_rwlock_t *lock) { atomic_t *count = (atomic_t *)lock; @@ -274,7 +274,7 @@ static inline int __raw_read_trylock(raw_rwlock_t *lock) return 0; } -static inline int __raw_write_trylock(raw_rwlock_t *lock) +static inline int arch_write_trylock(arch_rwlock_t *lock) { atomic_t *count = (atomic_t *)lock; @@ -284,23 +284,23 @@ static inline int __raw_write_trylock(raw_rwlock_t *lock) return 0; } -static inline void __raw_read_unlock(raw_rwlock_t *rw) +static inline void arch_read_unlock(arch_rwlock_t *rw) { asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory"); } -static inline void __raw_write_unlock(raw_rwlock_t *rw) +static inline void arch_write_unlock(arch_rwlock_t *rw) { asm volatile(LOCK_PREFIX "addl %1, %0" : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory"); } -#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) -#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) +#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) +#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) -#define _raw_spin_relax(lock) cpu_relax() -#define _raw_read_relax(lock) cpu_relax() -#define _raw_write_relax(lock) cpu_relax() +#define arch_spin_relax(lock) cpu_relax() +#define arch_read_relax(lock) cpu_relax() +#define arch_write_relax(lock) cpu_relax() /* The {read|write|spin}_lock() on x86 are full memory barriers. */ static inline void smp_mb__after_lock(void) { } diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h index 845f81c87091..dcb48b2edc11 100644 --- a/arch/x86/include/asm/spinlock_types.h +++ b/arch/x86/include/asm/spinlock_types.h @@ -5,16 +5,16 @@ # error "please don't include this file directly" #endif -typedef struct raw_spinlock { +typedef struct arch_spinlock { unsigned int slock; -} raw_spinlock_t; +} arch_spinlock_t; -#define __RAW_SPIN_LOCK_UNLOCKED { 0 } +#define __ARCH_SPIN_LOCK_UNLOCKED { 0 } typedef struct { unsigned int lock; -} raw_rwlock_t; +} arch_rwlock_t; -#define __RAW_RW_LOCK_UNLOCKED { RW_LOCK_BIAS } +#define __ARCH_RW_LOCK_UNLOCKED { RW_LOCK_BIAS } #endif /* _ASM_X86_SPINLOCK_TYPES_H */ diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index cf86a5e73815..4dab78edbad9 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -3,7 +3,28 @@ extern int kstack_depth_to_print; -int x86_is_stack_id(int id, char *name); +struct thread_info; +struct stacktrace_ops; + +typedef unsigned long (*walk_stack_t)(struct thread_info *tinfo, + unsigned long *stack, + unsigned long bp, + const struct stacktrace_ops *ops, + void *data, + unsigned long *end, + int *graph); + +extern unsigned long +print_context_stack(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end, int *graph); + +extern unsigned long +print_context_stack_bp(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end, int *graph); /* Generic stack tracer with callbacks */ @@ -14,6 +35,7 @@ struct stacktrace_ops { void (*address)(void *data, unsigned long address, int reliable); /* On negative return stop dumping */ int (*stack)(void *data, char *name); + walk_stack_t walk_stack; }; void dump_trace(struct task_struct *tsk, struct pt_regs *regs, diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h index 87ffcb12a1b8..8085277e1b8b 100644 --- a/arch/x86/include/asm/swiotlb.h +++ b/arch/x86/include/asm/swiotlb.h @@ -5,13 +5,17 @@ #ifdef CONFIG_SWIOTLB extern int swiotlb; -extern int pci_swiotlb_init(void); +extern int __init pci_swiotlb_detect(void); +extern void __init pci_swiotlb_init(void); #else #define swiotlb 0 -static inline int pci_swiotlb_init(void) +static inline int pci_swiotlb_detect(void) { return 0; } +static inline void pci_swiotlb_init(void) +{ +} #endif static inline void dma_mark_clean(void *addr, size_t size) {} diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h index 9af9decb38c3..d5f69045c100 100644 --- a/arch/x86/include/asm/sys_ia32.h +++ b/arch/x86/include/asm/sys_ia32.h @@ -30,7 +30,6 @@ struct mmap_arg_struct; asmlinkage long sys32_mmap(struct mmap_arg_struct __user *); asmlinkage long sys32_mprotect(unsigned long, size_t, unsigned long); -asmlinkage long sys32_pipe(int __user *); struct sigaction32; struct old_sigaction32; asmlinkage long sys32_rt_sigaction(int, struct sigaction32 __user *, @@ -57,9 +56,6 @@ asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32); asmlinkage long sys32_personality(unsigned long); asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32); -asmlinkage long sys32_mmap2(unsigned long, unsigned long, unsigned long, - unsigned long, unsigned long, unsigned long); - struct oldold_utsname; struct old_utsname; asmlinkage long sys32_olduname(struct oldold_utsname __user *); diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 8d33bc5462d1..c4a348f7bd43 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -16,6 +16,8 @@ #include <linux/sched.h> #include <linux/err.h> +extern const unsigned long sys_call_table[]; + /* * Only the low 32 bits of orig_ax are meaningful, so we return int. * This importantly ignores the high bits on 64-bit, so comparisons diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 372b76edd63f..8868b9420b0e 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -18,16 +18,24 @@ /* Common in X86_32 and X86_64 */ /* kernel/ioport.c */ asmlinkage long sys_ioperm(unsigned long, unsigned long, int); +long sys_iopl(unsigned int, struct pt_regs *); /* kernel/process.c */ int sys_fork(struct pt_regs *); int sys_vfork(struct pt_regs *); +long sys_execve(char __user *, char __user * __user *, + char __user * __user *, struct pt_regs *); +long sys_clone(unsigned long, unsigned long, void __user *, + void __user *, struct pt_regs *); /* kernel/ldt.c */ asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); /* kernel/signal.c */ long sys_rt_sigreturn(struct pt_regs *); +long sys_sigaltstack(const stack_t __user *, stack_t __user *, + struct pt_regs *); + /* kernel/tls.c */ asmlinkage int sys_set_thread_area(struct user_desc __user *); @@ -35,18 +43,11 @@ asmlinkage int sys_get_thread_area(struct user_desc __user *); /* X86_32 only */ #ifdef CONFIG_X86_32 -/* kernel/ioport.c */ -long sys_iopl(struct pt_regs *); - -/* kernel/process_32.c */ -int sys_clone(struct pt_regs *); -int sys_execve(struct pt_regs *); /* kernel/signal.c */ asmlinkage int sys_sigsuspend(int, int, old_sigset_t); asmlinkage int sys_sigaction(int, const struct old_sigaction __user *, struct old_sigaction __user *); -int sys_sigaltstack(struct pt_regs *); unsigned long sys_sigreturn(struct pt_regs *); /* kernel/sys_i386_32.c */ @@ -55,8 +56,6 @@ struct sel_arg_struct; struct oldold_utsname; struct old_utsname; -asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long, - unsigned long, unsigned long, unsigned long); asmlinkage int old_mmap(struct mmap_arg_struct __user *); asmlinkage int old_select(struct sel_arg_struct __user *); asmlinkage int sys_ipc(uint, int, int, int, void __user *, long); @@ -64,28 +63,15 @@ asmlinkage int sys_uname(struct old_utsname __user *); asmlinkage int sys_olduname(struct oldold_utsname __user *); /* kernel/vm86_32.c */ -int sys_vm86old(struct pt_regs *); -int sys_vm86(struct pt_regs *); +int sys_vm86old(struct vm86_struct __user *, struct pt_regs *); +int sys_vm86(unsigned long, unsigned long, struct pt_regs *); #else /* CONFIG_X86_32 */ /* X86_64 only */ -/* kernel/ioport.c */ -asmlinkage long sys_iopl(unsigned int, struct pt_regs *); - /* kernel/process_64.c */ -asmlinkage long sys_clone(unsigned long, unsigned long, - void __user *, void __user *, - struct pt_regs *); -asmlinkage long sys_execve(char __user *, char __user * __user *, - char __user * __user *, - struct pt_regs *); long sys_arch_prctl(int, unsigned long); -/* kernel/signal.c */ -asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *, - struct pt_regs *); - /* kernel/sys_x86_64.c */ struct new_utsname; diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index 022a84386de8..e04740f7a0bb 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -11,9 +11,9 @@ #include <linux/irqflags.h> /* entries in ARCH_DLINFO: */ -#ifdef CONFIG_IA32_EMULATION +#if defined(CONFIG_IA32_EMULATION) || !defined(CONFIG_X86_64) # define AT_VECTOR_SIZE_ARCH 2 -#else +#else /* else it's non-compat x86-64 */ # define AT_VECTOR_SIZE_ARCH 1 #endif @@ -23,6 +23,7 @@ struct task_struct *__switch_to(struct task_struct *prev, struct tss_struct; void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, struct tss_struct *tss); +extern void show_regs_common(void); #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 375c917c37d2..e0d28901e969 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -87,7 +87,6 @@ struct thread_info { #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* 32bit process */ #define TIF_FORK 18 /* ret_from_fork */ -#define TIF_ABI_PENDING 19 #define TIF_MEMDIE 20 #define TIF_DEBUG 21 /* uses debug registers */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ @@ -112,7 +111,6 @@ struct thread_info { #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) #define _TIF_FORK (1 << TIF_FORK) -#define _TIF_ABI_PENDING (1 << TIF_ABI_PENDING) #define _TIF_DEBUG (1 << TIF_DEBUG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_FREEZE (1 << TIF_FREEZE) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 40e37b10c6c0..c5087d796587 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -35,11 +35,16 @@ # endif #endif -/* Node not present */ -#define NUMA_NO_NODE (-1) +/* + * to preserve the visibility of NUMA_NO_NODE definition, + * moved to there from here. May be used independent of + * CONFIG_NUMA. + */ +#include <linux/numa.h> #ifdef CONFIG_NUMA #include <linux/cpumask.h> + #include <asm/mpspec.h> #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h index 90f06c25221d..cb507bb05d79 100644 --- a/arch/x86/include/asm/trampoline.h +++ b/arch/x86/include/asm/trampoline.h @@ -16,7 +16,6 @@ extern unsigned long initial_code; extern unsigned long initial_gs; #define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE) -#define TRAMPOLINE_BASE 0x6000 extern unsigned long setup_trampoline(void); extern void __init reserve_trampoline_memory(void); diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 0c9825e97f36..088d09fb1615 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -205,14 +205,13 @@ static inline unsigned long __must_check copy_from_user(void *to, unsigned long n) { int sz = __compiletime_object_size(to); - int ret = -EFAULT; if (likely(sz == -1 || sz >= n)) - ret = _copy_from_user(to, from, n); + n = _copy_from_user(to, from, n); else copy_from_user_overflow(); - return ret; + return n; } long __must_check strncpy_from_user(char *dst, const char __user *src, diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 46324c6a4f6e..316708d5af92 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -8,6 +8,8 @@ #include <linux/errno.h> #include <linux/prefetch.h> #include <linux/lockdep.h> +#include <asm/alternative.h> +#include <asm/cpufeature.h> #include <asm/page.h> /* @@ -16,7 +18,24 @@ /* Handles exceptions in both to and from, but doesn't do access_ok */ __must_check unsigned long -copy_user_generic(void *to, const void *from, unsigned len); +copy_user_generic_string(void *to, const void *from, unsigned len); +__must_check unsigned long +copy_user_generic_unrolled(void *to, const void *from, unsigned len); + +static __always_inline __must_check unsigned long +copy_user_generic(void *to, const void *from, unsigned len) +{ + unsigned ret; + + alternative_call(copy_user_generic_unrolled, + copy_user_generic_string, + X86_FEATURE_REP_GOOD, + ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from), + "=d" (len)), + "1" (to), "2" (from), "3" (len) + : "memory", "rcx", "r8", "r9", "r10", "r11"); + return ret; +} __must_check unsigned long _copy_to_user(void __user *to, const void *from, unsigned len); @@ -30,16 +49,15 @@ static inline unsigned long __must_check copy_from_user(void *to, unsigned long n) { int sz = __compiletime_object_size(to); - int ret = -EFAULT; might_fault(); if (likely(sz == -1 || sz >= n)) - ret = _copy_from_user(to, from, n); + n = _copy_from_user(to, from, n); #ifdef CONFIG_DEBUG_VM else WARN(1, "Buffer overflow detected!\n"); #endif - return ret; + return n; } static __always_inline __must_check diff --git a/arch/x86/include/asm/user.h b/arch/x86/include/asm/user.h index 999873b22e7f..24532c7da3d6 100644 --- a/arch/x86/include/asm/user.h +++ b/arch/x86/include/asm/user.h @@ -1,5 +1,63 @@ +#ifndef _ASM_X86_USER_H +#define _ASM_X86_USER_H + #ifdef CONFIG_X86_32 # include "user_32.h" #else # include "user_64.h" #endif + +#include <asm/types.h> + +struct user_ymmh_regs { + /* 16 * 16 bytes for each YMMH-reg */ + __u32 ymmh_space[64]; +}; + +struct user_xsave_hdr { + __u64 xstate_bv; + __u64 reserved1[2]; + __u64 reserved2[5]; +}; + +/* + * The structure layout of user_xstateregs, used for exporting the + * extended register state through ptrace and core-dump (NT_X86_XSTATE note) + * interfaces will be same as the memory layout of xsave used by the processor + * (except for the bytes 464..511, which can be used by the software) and hence + * the size of this structure varies depending on the features supported by the + * processor and OS. The size of the structure that users need to use can be + * obtained by doing: + * cpuid_count(0xd, 0, &eax, &ptrace_xstateregs_struct_size, &ecx, &edx); + * i.e., cpuid.(eax=0xd,ecx=0).ebx will be the size that user (debuggers, etc.) + * need to use. + * + * For now, only the first 8 bytes of the software usable bytes[464..471] will + * be used and will be set to OS enabled xstate mask (which is same as the + * 64bit mask returned by the xgetbv's xCR0). Users (analyzing core dump + * remotely, etc.) can use this mask as well as the mask saved in the + * xstate_hdr bytes and interpret what states the processor/OS supports + * and what states are in modified/initialized conditions for the + * particular process/thread. + * + * Also when the user modifies certain state FP/SSE/etc through the + * ptrace interface, they must ensure that the xsave_hdr.xstate_bv + * bytes[512..519] of the memory layout are updated correspondingly. + * i.e., for example when FP state is modified to a non-init state, + * xsave_hdr.xstate_bv's bit 0 must be set to '1', when SSE is modified to + * non-init state, xsave_hdr.xstate_bv's bit 1 must to be set to '1', etc. + */ +#define USER_XSTATE_FX_SW_WORDS 6 +#define USER_XSTATE_XCR0_WORD 0 + +struct user_xstateregs { + struct { + __u64 fpx_space[58]; + __u64 xstate_fx_sw[USER_XSTATE_FX_SW_WORDS]; + } i387; + struct user_xsave_hdr xsave_hdr; + struct user_ymmh_regs ymmh; + /* further processor state extensions go here */ +}; + +#endif /* _ASM_X86_USER_H */ diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index 7ed17ff502b9..71605c7d5c5c 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -18,8 +18,8 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * - * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. - * Copyright (c) Russ Anderson + * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) Russ Anderson <rja@sgi.com> */ #include <linux/rtc.h> @@ -36,7 +36,8 @@ enum uv_bios_cmd { UV_BIOS_WATCHLIST_ALLOC, UV_BIOS_WATCHLIST_FREE, UV_BIOS_MEMPROTECT, - UV_BIOS_GET_PARTITION_ADDR + UV_BIOS_GET_PARTITION_ADDR, + UV_BIOS_SET_LEGACY_VGA_TARGET }; /* @@ -76,15 +77,6 @@ union partition_info_u { }; }; -union uv_watchlist_u { - u64 val; - struct { - u64 blade : 16, - size : 32, - filler : 16; - }; -}; - enum uv_memprotect { UV_MEMPROT_RESTRICT_ACCESS, UV_MEMPROT_ALLOW_AMO, @@ -98,13 +90,14 @@ extern s64 uv_bios_call(enum uv_bios_cmd, u64, u64, u64, u64, u64); extern s64 uv_bios_call_irqsave(enum uv_bios_cmd, u64, u64, u64, u64, u64); extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64); -extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *); +extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *, long *); extern s64 uv_bios_freq_base(u64, u64 *); -extern int uv_bios_mq_watchlist_alloc(int, unsigned long, unsigned int, +extern int uv_bios_mq_watchlist_alloc(unsigned long, unsigned int, unsigned long *); extern int uv_bios_mq_watchlist_free(int, int); extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect); extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *); +extern int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus); extern void uv_bios_init(void); @@ -113,6 +106,7 @@ extern int uv_type; extern long sn_partition_id; extern long sn_coherency_id; extern long sn_region_size; +extern long system_serial_number; #define partition_coherence_id() (sn_coherency_id) extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */ diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h index c0a01b5d985b..3bb9491b7659 100644 --- a/arch/x86/include/asm/uv/uv.h +++ b/arch/x86/include/asm/uv/uv.h @@ -11,6 +11,7 @@ struct mm_struct; extern enum uv_system_type get_uv_system_type(void); extern int is_uv_system(void); extern void uv_cpu_init(void); +extern void uv_nmi_init(void); extern void uv_system_init(void); extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 80e2984f521c..b414d2b401f6 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -55,7 +55,7 @@ #define DESC_STATUS_SOURCE_TIMEOUT 3 /* - * source side threshholds at which message retries print a warning + * source side thresholds at which message retries print a warning */ #define SOURCE_TIMEOUT_LIMIT 20 #define DESTINATION_TIMEOUT_LIMIT 20 diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index d1414af98559..14cc74ba5d23 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -31,20 +31,20 @@ * contiguous (although various IO spaces may punch holes in * it).. * - * N - Number of bits in the node portion of a socket physical - * address. + * N - Number of bits in the node portion of a socket physical + * address. * - * NASID - network ID of a router, Mbrick or Cbrick. Nasid values of - * routers always have low bit of 1, C/MBricks have low bit - * equal to 0. Most addressing macros that target UV hub chips - * right shift the NASID by 1 to exclude the always-zero bit. - * NASIDs contain up to 15 bits. + * NASID - network ID of a router, Mbrick or Cbrick. Nasid values of + * routers always have low bit of 1, C/MBricks have low bit + * equal to 0. Most addressing macros that target UV hub chips + * right shift the NASID by 1 to exclude the always-zero bit. + * NASIDs contain up to 15 bits. * * GNODE - NASID right shifted by 1 bit. Most mmrs contain gnodes instead * of nasids. * - * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant - * of the nasid for socket usage. + * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant + * of the nasid for socket usage. * * * NumaLink Global Physical Address Format: @@ -71,12 +71,12 @@ * * * APICID format - * NOTE!!!!!! This is the current format of the APICID. However, code - * should assume that this will change in the future. Use functions - * in this file for all APICID bit manipulations and conversion. + * NOTE!!!!!! This is the current format of the APICID. However, code + * should assume that this will change in the future. Use functions + * in this file for all APICID bit manipulations and conversion. * - * 1111110000000000 - * 5432109876543210 + * 1111110000000000 + * 5432109876543210 * pppppppppplc0cch * sssssssssss * @@ -89,9 +89,9 @@ * Note: Processor only supports 12 bits in the APICID register. The ACPI * tables hold all 16 bits. Software needs to be aware of this. * - * Unless otherwise specified, all references to APICID refer to - * the FULL value contained in ACPI tables, not the subset in the - * processor APICID register. + * Unless otherwise specified, all references to APICID refer to + * the FULL value contained in ACPI tables, not the subset in the + * processor APICID register. */ @@ -151,16 +151,16 @@ struct uv_hub_info_s { }; DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); -#define uv_hub_info (&__get_cpu_var(__uv_hub_info)) +#define uv_hub_info (&__get_cpu_var(__uv_hub_info)) #define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu)) /* * Local & Global MMR space macros. - * Note: macros are intended to be used ONLY by inline functions - * in this file - not by other kernel code. - * n - NASID (full 15-bit global nasid) - * g - GNODE (full 15-bit global nasid, right shifted 1) - * p - PNODE (local part of nsids, right shifted 1) + * Note: macros are intended to be used ONLY by inline functions + * in this file - not by other kernel code. + * n - NASID (full 15-bit global nasid) + * g - GNODE (full 15-bit global nasid, right shifted 1) + * p - PNODE (local part of nsids, right shifted 1) */ #define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask) #define UV_PNODE_TO_GNODE(p) ((p) |uv_hub_info->gnode_extra) @@ -172,6 +172,8 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); #define UV_LOCAL_MMR_SIZE (64UL * 1024 * 1024) #define UV_GLOBAL_MMR32_SIZE (64UL * 1024 * 1024) +#define UV_GLOBAL_GRU_MMR_BASE 0x4000000 + #define UV_GLOBAL_MMR32_PNODE_SHIFT 15 #define UV_GLOBAL_MMR64_PNODE_SHIFT 26 @@ -213,8 +215,8 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); /* * Macros for converting between kernel virtual addresses, socket local physical * addresses, and UV global physical addresses. - * Note: use the standard __pa() & __va() macros for converting - * between socket virtual and socket physical addresses. + * Note: use the standard __pa() & __va() macros for converting + * between socket virtual and socket physical addresses. */ /* socket phys RAM --> UV global physical address */ @@ -232,6 +234,26 @@ static inline unsigned long uv_gpa(void *v) return uv_soc_phys_ram_to_gpa(__pa(v)); } +/* Top two bits indicate the requested address is in MMR space. */ +static inline int +uv_gpa_in_mmr_space(unsigned long gpa) +{ + return (gpa >> 62) == 0x3UL; +} + +/* UV global physical address --> socket phys RAM */ +static inline unsigned long uv_gpa_to_soc_phys_ram(unsigned long gpa) +{ + unsigned long paddr = gpa & uv_hub_info->gpa_mask; + unsigned long remap_base = uv_hub_info->lowmem_remap_base; + unsigned long remap_top = uv_hub_info->lowmem_remap_top; + + if (paddr >= remap_base && paddr < remap_base + remap_top) + paddr -= remap_base; + return paddr; +} + + /* gnode -> pnode */ static inline unsigned long uv_gpa_to_gnode(unsigned long gpa) { @@ -265,21 +287,18 @@ static inline int uv_apicid_to_pnode(int apicid) * Access global MMRs using the low memory MMR32 space. This region supports * faster MMR access but not all MMRs are accessible in this space. */ -static inline unsigned long *uv_global_mmr32_address(int pnode, - unsigned long offset) +static inline unsigned long *uv_global_mmr32_address(int pnode, unsigned long offset) { return __va(UV_GLOBAL_MMR32_BASE | UV_GLOBAL_MMR32_PNODE_BITS(pnode) | offset); } -static inline void uv_write_global_mmr32(int pnode, unsigned long offset, - unsigned long val) +static inline void uv_write_global_mmr32(int pnode, unsigned long offset, unsigned long val) { writeq(val, uv_global_mmr32_address(pnode, offset)); } -static inline unsigned long uv_read_global_mmr32(int pnode, - unsigned long offset) +static inline unsigned long uv_read_global_mmr32(int pnode, unsigned long offset) { return readq(uv_global_mmr32_address(pnode, offset)); } @@ -288,26 +307,43 @@ static inline unsigned long uv_read_global_mmr32(int pnode, * Access Global MMR space using the MMR space located at the top of physical * memory. */ -static inline unsigned long *uv_global_mmr64_address(int pnode, - unsigned long offset) +static inline unsigned long *uv_global_mmr64_address(int pnode, unsigned long offset) { return __va(UV_GLOBAL_MMR64_BASE | UV_GLOBAL_MMR64_PNODE_BITS(pnode) | offset); } -static inline void uv_write_global_mmr64(int pnode, unsigned long offset, - unsigned long val) +static inline void uv_write_global_mmr64(int pnode, unsigned long offset, unsigned long val) { writeq(val, uv_global_mmr64_address(pnode, offset)); } -static inline unsigned long uv_read_global_mmr64(int pnode, - unsigned long offset) +static inline unsigned long uv_read_global_mmr64(int pnode, unsigned long offset) { return readq(uv_global_mmr64_address(pnode, offset)); } /* + * Global MMR space addresses when referenced by the GRU. (GRU does + * NOT use socket addressing). + */ +static inline unsigned long uv_global_gru_mmr_address(int pnode, unsigned long offset) +{ + return UV_GLOBAL_GRU_MMR_BASE | offset | + ((unsigned long)pnode << uv_hub_info->m_val); +} + +static inline void uv_write_global_mmr8(int pnode, unsigned long offset, unsigned char val) +{ + writeb(val, uv_global_mmr64_address(pnode, offset)); +} + +static inline unsigned char uv_read_global_mmr8(int pnode, unsigned long offset) +{ + return readb(uv_global_mmr64_address(pnode, offset)); +} + +/* * Access hub local MMRs. Faster than using global space but only local MMRs * are accessible. */ @@ -426,14 +462,28 @@ static inline void uv_set_scir_bits(unsigned char value) } } +static inline unsigned long uv_scir_offset(int apicid) +{ + return SCIR_LOCAL_MMR_BASE | (apicid & 0x3f); +} + static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value) { if (uv_cpu_hub_info(cpu)->scir.state != value) { + uv_write_global_mmr8(uv_cpu_to_pnode(cpu), + uv_cpu_hub_info(cpu)->scir.offset, value); uv_cpu_hub_info(cpu)->scir.state = value; - uv_write_local_mmr8(uv_cpu_hub_info(cpu)->scir.offset, value); } } +static unsigned long uv_hub_ipi_value(int apicid, int vector, int mode) +{ + return (1UL << UVH_IPI_INT_SEND_SHFT) | + ((apicid) << UVH_IPI_INT_APIC_ID_SHFT) | + (mode << UVH_IPI_INT_DELIVERY_MODE_SHFT) | + (vector << UVH_IPI_INT_VECTOR_SHFT); +} + static inline void uv_hub_send_ipi(int pnode, int apicid, int vector) { unsigned long val; @@ -442,12 +492,21 @@ static inline void uv_hub_send_ipi(int pnode, int apicid, int vector) if (vector == NMI_VECTOR) dmode = dest_NMI; - val = (1UL << UVH_IPI_INT_SEND_SHFT) | - ((apicid) << UVH_IPI_INT_APIC_ID_SHFT) | - (dmode << UVH_IPI_INT_DELIVERY_MODE_SHFT) | - (vector << UVH_IPI_INT_VECTOR_SHFT); + val = uv_hub_ipi_value(apicid, vector, dmode); uv_write_global_mmr64(pnode, UVH_IPI_INT, val); } +/* + * Get the minimum revision number of the hub chips within the partition. + * 1 - initial rev 1.0 silicon + * 2 - rev 2.0 production silicon + */ +static inline int uv_get_min_hub_revision_id(void) +{ + extern int uv_min_hub_revision_id; + + return uv_min_hub_revision_id; +} + #endif /* CONFIG_X86_64 */ #endif /* _ASM_X86_UV_UV_HUB_H */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index ea0e8ea15e15..60cc35269083 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -126,6 +126,7 @@ struct x86_cpuinit_ops { * @get_wallclock: get time from HW clock like RTC etc. * @set_wallclock: set time back to HW clock * @is_untracked_pat_range exclude from PAT logic + * @nmi_init enable NMI on cpus */ struct x86_platform_ops { unsigned long (*calibrate_tsc)(void); @@ -133,6 +134,7 @@ struct x86_platform_ops { int (*set_wallclock)(unsigned long nowtime); void (*iommu_shutdown)(void); bool (*is_untracked_pat_range)(u64 start, u64 end); + void (*nmi_init)(void); }; extern struct x86_init_ops x86_init; diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index d5b7e90c0edf..396ff4cc8ed4 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -37,31 +37,4 @@ extern struct shared_info *HYPERVISOR_shared_info; extern struct start_info *xen_start_info; -enum xen_domain_type { - XEN_NATIVE, /* running on bare hardware */ - XEN_PV_DOMAIN, /* running in a PV domain */ - XEN_HVM_DOMAIN, /* running in a Xen hvm domain */ -}; - -#ifdef CONFIG_XEN -extern enum xen_domain_type xen_domain_type; -#else -#define xen_domain_type XEN_NATIVE -#endif - -#define xen_domain() (xen_domain_type != XEN_NATIVE) -#define xen_pv_domain() (xen_domain() && \ - xen_domain_type == XEN_PV_DOMAIN) -#define xen_hvm_domain() (xen_domain() && \ - xen_domain_type == XEN_HVM_DOMAIN) - -#ifdef CONFIG_XEN_DOM0 -#include <xen/interface/xen.h> - -#define xen_initial_domain() (xen_pv_domain() && \ - xen_start_info->flags & SIF_INITDOMAIN) -#else /* !CONFIG_XEN_DOM0 */ -#define xen_initial_domain() (0) -#endif /* CONFIG_XEN_DOM0 */ - #endif /* _ASM_X86_XEN_HYPERVISOR_H */ diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 727acc152344..ddc04ccad03b 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -27,9 +27,11 @@ extern unsigned int xstate_size; extern u64 pcntxt_mask; extern struct xsave_struct *init_xstate_buf; +extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void xsave_cntxt_init(void); extern void xsave_init(void); +extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); extern int init_fpu(struct task_struct *child); extern int check_for_xstate(struct i387_fxsave_struct __user *buf, void __user *fpstate, diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 4f2e66e29ecc..d87f09bc5a52 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -89,7 +89,6 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_K8_NB) += k8.o -obj-$(CONFIG_MGEODE_LX) += geode_32.o mfgpt_32.o obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile index fd5ca97a2ad5..6f35260bb3ef 100644 --- a/arch/x86/kernel/acpi/Makefile +++ b/arch/x86/kernel/acpi/Makefile @@ -4,7 +4,7 @@ obj-$(CONFIG_ACPI) += boot.o obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_rm.o wakeup_$(BITS).o ifneq ($(CONFIG_ACPI_PROCESSOR),) -obj-y += cstate.o processor.o +obj-y += cstate.o endif $(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 87eee07da21f..f95703098f8d 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -49,6 +49,7 @@ EXPORT_SYMBOL(acpi_disabled); #ifdef CONFIG_X86_64 # include <asm/proto.h> +# include <asm/numa_64.h> #endif /* X86 */ #define BAD_MADT_ENTRY(entry, end) ( \ @@ -482,6 +483,25 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) */ #ifdef CONFIG_ACPI_HOTPLUG_CPU +static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +{ +#ifdef CONFIG_ACPI_NUMA + int nid; + + nid = acpi_get_node(handle); + if (nid == -1 || !node_online(nid)) + return; +#ifdef CONFIG_X86_64 + apicid_to_node[physid] = nid; + numa_set_node(cpu, nid); +#else /* CONFIG_X86_32 */ + apicid_2_node[physid] = nid; + cpu_to_node_map[cpu] = nid; +#endif + +#endif +} + static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu) { struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; @@ -540,6 +560,7 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu) } cpu = cpumask_first(new_map); + acpi_map_cpu2node(handle, cpu, physid); *pcpu = cpu; retval = 0; @@ -1123,7 +1144,7 @@ static int __init acpi_parse_madt_ioapic_entries(void) if (!acpi_sci_override_gsi) acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0); - /* Fill in identity legacy mapings where no override */ + /* Fill in identity legacy mappings where no override */ mp_config_acpi_legacy_irqs(); count = @@ -1185,9 +1206,6 @@ static void __init acpi_process_madt(void) if (!error) { acpi_lapic = 1; -#ifdef CONFIG_X86_BIGSMP - generic_bigsmp_probe(); -#endif /* * Parse MADT IO-APIC entries */ @@ -1197,8 +1215,6 @@ static void __init acpi_process_madt(void) acpi_ioapic = 1; smp_found_config = 1; - if (apic->setup_apic_routing) - apic->setup_apic_routing(); } } if (error == -EINVAL) { @@ -1349,14 +1365,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { }, { .callback = force_acpi_ht, - .ident = "ASUS P2B-DS", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), - DMI_MATCH(DMI_BOARD_NAME, "P2B-DS"), - }, - }, - { - .callback = force_acpi_ht, .ident = "ASUS CUR-DLS", .matches = { DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), @@ -1529,16 +1537,10 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = { * if acpi_blacklisted() acpi_disabled = 1; * acpi_irq_model=... * ... - * - * return value: (currently ignored) - * 0: success - * !0: failure */ -int __init acpi_boot_table_init(void) +void __init acpi_boot_table_init(void) { - int error; - dmi_check_system(acpi_dmi_table); /* @@ -1546,15 +1548,14 @@ int __init acpi_boot_table_init(void) * One exception: acpi=ht continues far enough to enumerate LAPICs */ if (acpi_disabled && !acpi_ht) - return 1; + return; /* * Initialize the ACPI boot-time table parser. */ - error = acpi_table_init(); - if (error) { + if (acpi_table_init()) { disable_acpi(); - return error; + return; } acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf); @@ -1562,18 +1563,15 @@ int __init acpi_boot_table_init(void) /* * blacklist may disable ACPI entirely */ - error = acpi_blacklisted(); - if (error) { + if (acpi_blacklisted()) { if (acpi_force) { printk(KERN_WARNING PREFIX "acpi=force override\n"); } else { printk(KERN_WARNING PREFIX "Disabling ACPI support\n"); disable_acpi(); - return error; + return; } } - - return 0; } int __init early_acpi_boot_init(void) diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 59cdfa4686b2..2e837f5080fe 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -48,7 +48,7 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags, * P4, Core and beyond CPUs */ if (c->x86_vendor == X86_VENDOR_INTEL && - (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 14))) + (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 0x0f))) flags->bm_control = 0; } EXPORT_SYMBOL(acpi_processor_power_init_bm_check); diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c deleted file mode 100644 index d85d1b2432ba..000000000000 --- a/arch/x86/kernel/acpi/processor.c +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (C) 2005 Intel Corporation - * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> - * - Added _PDC for platforms with Intel CPUs - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/acpi.h> - -#include <acpi/processor.h> -#include <asm/acpi.h> - -static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c) -{ - struct acpi_object_list *obj_list; - union acpi_object *obj; - u32 *buf; - - /* allocate and initialize pdc. It will be used later. */ - obj_list = kmalloc(sizeof(struct acpi_object_list), GFP_KERNEL); - if (!obj_list) { - printk(KERN_ERR "Memory allocation error\n"); - return; - } - - obj = kmalloc(sizeof(union acpi_object), GFP_KERNEL); - if (!obj) { - printk(KERN_ERR "Memory allocation error\n"); - kfree(obj_list); - return; - } - - buf = kmalloc(12, GFP_KERNEL); - if (!buf) { - printk(KERN_ERR "Memory allocation error\n"); - kfree(obj); - kfree(obj_list); - return; - } - - buf[0] = ACPI_PDC_REVISION_ID; - buf[1] = 1; - buf[2] = ACPI_PDC_C_CAPABILITY_SMP; - - /* - * The default of PDC_SMP_T_SWCOORD bit is set for intel x86 cpu so - * that OSPM is capable of native ACPI throttling software - * coordination using BIOS supplied _TSD info. - */ - buf[2] |= ACPI_PDC_SMP_T_SWCOORD; - if (cpu_has(c, X86_FEATURE_EST)) - buf[2] |= ACPI_PDC_EST_CAPABILITY_SWSMP; - - if (cpu_has(c, X86_FEATURE_ACPI)) - buf[2] |= ACPI_PDC_T_FFH; - - /* - * If mwait/monitor is unsupported, C2/C3_FFH will be disabled - */ - if (!cpu_has(c, X86_FEATURE_MWAIT)) - buf[2] &= ~(ACPI_PDC_C_C2C3_FFH); - - obj->type = ACPI_TYPE_BUFFER; - obj->buffer.length = 12; - obj->buffer.pointer = (u8 *) buf; - obj_list->count = 1; - obj_list->pointer = obj; - pr->pdc = obj_list; - - return; -} - - -/* Initialize _PDC data based on the CPU vendor */ -void arch_acpi_processor_init_pdc(struct acpi_processor *pr) -{ - struct cpuinfo_x86 *c = &cpu_data(pr->id); - - pr->pdc = NULL; - if (c->x86_vendor == X86_VENDOR_INTEL || - c->x86_vendor == X86_VENDOR_CENTAUR) - init_intel_pdc(pr, c); - - return; -} - -EXPORT_SYMBOL(arch_acpi_processor_init_pdc); - -void arch_acpi_processor_cleanup_pdc(struct acpi_processor *pr) -{ - if (pr->pdc) { - kfree(pr->pdc->pointer->buffer.pointer); - kfree(pr->pdc->pointer); - kfree(pr->pdc); - pr->pdc = NULL; - } -} - -EXPORT_SYMBOL(arch_acpi_processor_cleanup_pdc); diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 82e508677b91..f9961034e557 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -162,6 +162,8 @@ static int __init acpi_sleep_setup(char *str) #endif if (strncmp(str, "old_ordering", 12) == 0) acpi_old_suspend_ordering(); + if (strncmp(str, "sci_force_enable", 16) == 0) + acpi_set_sci_en_on_resume(); str = strchr(str, ','); if (str != NULL) str += strspn(str, ", \t"); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index de7353c0ce9c..e6ea0342c8f8 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -205,7 +205,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start, struct alt_instr *end) { struct alt_instr *a; - char insnbuf[MAX_PATCH_LEN]; + u8 insnbuf[MAX_PATCH_LEN]; DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); for (a = start; a < end; a++) { @@ -223,6 +223,8 @@ void __init_or_module apply_alternatives(struct alt_instr *start, } #endif memcpy(insnbuf, a->replacement, a->replacementlen); + if (*insnbuf == 0xe8 && a->replacementlen == 5) + *(s32 *)(insnbuf + 1) += a->replacement - a->instr; add_nops(insnbuf + a->replacementlen, a->instrlen - a->replacementlen); text_poke_early(instr, insnbuf, a->instrlen); @@ -390,6 +392,24 @@ void alternatives_smp_switch(int smp) mutex_unlock(&smp_alt); } +/* Return 1 if the address range is reserved for smp-alternatives */ +int alternatives_text_reserved(void *start, void *end) +{ + struct smp_alt_module *mod; + u8 **ptr; + u8 *text_start = start; + u8 *text_end = end; + + list_for_each_entry(mod, &smp_alt_modules, next) { + if (mod->text > text_end || mod->text_end < text_start) + continue; + for (ptr = mod->locks; ptr < mod->locks_end; ptr++) + if (text_start <= *ptr && text_end >= *ptr) + return 1; + } + + return 0; +} #endif #ifdef CONFIG_PARAVIRT diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 32fb09102a13..adb0ba025702 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -19,7 +19,7 @@ #include <linux/pci.h> #include <linux/gfp.h> -#include <linux/bitops.h> +#include <linux/bitmap.h> #include <linux/debugfs.h> #include <linux/scatterlist.h> #include <linux/dma-mapping.h> @@ -166,6 +166,43 @@ static void iommu_uninit_device(struct device *dev) { kfree(dev->archdata.iommu); } + +void __init amd_iommu_uninit_devices(void) +{ + struct pci_dev *pdev = NULL; + + for_each_pci_dev(pdev) { + + if (!check_device(&pdev->dev)) + continue; + + iommu_uninit_device(&pdev->dev); + } +} + +int __init amd_iommu_init_devices(void) +{ + struct pci_dev *pdev = NULL; + int ret = 0; + + for_each_pci_dev(pdev) { + + if (!check_device(&pdev->dev)) + continue; + + ret = iommu_init_device(&pdev->dev); + if (ret) + goto out_free; + } + + return 0; + +out_free: + + amd_iommu_uninit_devices(); + + return ret; +} #ifdef CONFIG_AMD_IOMMU_STATS /* @@ -943,7 +980,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom, { int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; struct amd_iommu *iommu; - int i; + unsigned long i; #ifdef CONFIG_IOMMU_STRESS populate = false; @@ -1125,7 +1162,7 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom, address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT; - iommu_area_free(range->bitmap, address, pages); + bitmap_clear(range->bitmap, address, pages); } @@ -1452,11 +1489,14 @@ static void __detach_device(struct device *dev) { struct iommu_dev_data *dev_data = get_dev_data(dev); struct iommu_dev_data *alias_data; + struct protection_domain *domain; unsigned long flags; BUG_ON(!dev_data->domain); - spin_lock_irqsave(&dev_data->domain->lock, flags); + domain = dev_data->domain; + + spin_lock_irqsave(&domain->lock, flags); if (dev_data->alias != dev) { alias_data = get_dev_data(dev_data->alias); @@ -1467,13 +1507,15 @@ static void __detach_device(struct device *dev) if (atomic_dec_and_test(&dev_data->bind)) do_detach(dev); - spin_unlock_irqrestore(&dev_data->domain->lock, flags); + spin_unlock_irqrestore(&domain->lock, flags); /* * If we run in passthrough mode the device must be assigned to the - * passthrough domain if it is detached from any other domain + * passthrough domain if it is detached from any other domain. + * Make sure we can deassign from the pt_domain itself. */ - if (iommu_pass_through && dev_data->domain == NULL) + if (iommu_pass_through && + (dev_data->domain == NULL && domain != pt_domain)) __attach_device(dev, pt_domain); } @@ -1587,6 +1629,11 @@ static struct notifier_block device_nb = { .notifier_call = device_change_notifier, }; +void amd_iommu_init_notifier(void) +{ + bus_register_notifier(&pci_bus_type, &device_nb); +} + /***************************************************************************** * * The next functions belong to the dma_ops mapping/unmapping code. @@ -1783,7 +1830,7 @@ retry: goto out; /* - * aperture was sucessfully enlarged by 128 MB, try + * aperture was successfully enlarged by 128 MB, try * allocation again */ goto retry; @@ -2145,8 +2192,6 @@ static void prealloc_protection_domains(void) if (!check_device(&dev->dev)) continue; - iommu_init_device(&dev->dev); - /* Is there already any domain for it? */ if (domain_for_device(&dev->dev)) continue; @@ -2178,6 +2223,12 @@ static struct dma_map_ops amd_iommu_dma_ops = { /* * The function which clues the AMD IOMMU driver into dma_ops. */ + +void __init amd_iommu_init_api(void) +{ + register_iommu(&amd_iommu_ops); +} + int __init amd_iommu_init_dma_ops(void) { struct amd_iommu *iommu; @@ -2213,10 +2264,6 @@ int __init amd_iommu_init_dma_ops(void) /* Make the driver finally visible to the drivers */ dma_ops = &amd_iommu_dma_ops; - register_iommu(&amd_iommu_ops); - - bus_register_notifier(&pci_bus_type, &device_nb); - amd_iommu_stats_init(); return 0; @@ -2490,7 +2537,7 @@ int __init amd_iommu_init_passthrough(void) struct pci_dev *dev = NULL; u16 devid; - /* allocate passthroug domain */ + /* allocate passthrough domain */ pt_domain = protection_domain_alloc(); if (!pt_domain) return -ENOMEM; diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 7ffc39965233..9dc91b431470 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -138,6 +138,11 @@ int amd_iommus_present; bool amd_iommu_np_cache __read_mostly; /* + * Set to true if ACPI table parsing and hardware intialization went properly + */ +static bool amd_iommu_initialized; + +/* * List of protection domains - used during resume */ LIST_HEAD(amd_iommu_pd_list); @@ -929,6 +934,8 @@ static int __init init_iommu_all(struct acpi_table_header *table) } WARN_ON(p != end); + amd_iommu_initialized = true; + return 0; } @@ -1263,6 +1270,9 @@ static int __init amd_iommu_init(void) if (acpi_table_parse("IVRS", init_iommu_all) != 0) goto free; + if (!amd_iommu_initialized) + goto free; + if (acpi_table_parse("IVRS", init_memory_definitions) != 0) goto free; @@ -1274,13 +1284,22 @@ static int __init amd_iommu_init(void) if (ret) goto free; + ret = amd_iommu_init_devices(); + if (ret) + goto free; + if (iommu_pass_through) ret = amd_iommu_init_passthrough(); else ret = amd_iommu_init_dma_ops(); + if (ret) goto free; + amd_iommu_init_api(); + + amd_iommu_init_notifier(); + enable_iommus(); if (iommu_pass_through) @@ -1296,6 +1315,9 @@ out: return ret; free: + + amd_iommu_uninit_devices(); + free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, get_order(MAX_DOMAIN_ID/8)); @@ -1336,6 +1358,9 @@ void __init amd_iommu_detect(void) iommu_detected = 1; amd_iommu_detected = 1; x86_init.iommu.iommu_init = amd_iommu_init; + + /* Make sure ACS will be enabled */ + pci_request_acs(); } } diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index e0dfb6856aa2..3704997e8b25 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -280,7 +280,8 @@ void __init early_gart_iommu_check(void) * or BIOS forget to put that in reserved. * try to update e820 to make that region as reserved. */ - int i, fix, slot; + u32 agp_aper_base = 0, agp_aper_order = 0; + int i, fix, slot, valid_agp = 0; u32 ctl; u32 aper_size = 0, aper_order = 0, last_aper_order = 0; u64 aper_base = 0, last_aper_base = 0; @@ -290,6 +291,8 @@ void __init early_gart_iommu_check(void) return; /* This is mostly duplicate of iommu_hole_init */ + agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp); + fix = 0; for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { int bus; @@ -342,10 +345,10 @@ void __init early_gart_iommu_check(void) } } - if (!fix) + if (valid_agp) return; - /* different nodes have different setting, disable them all at first*/ + /* disable them all at first */ for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { int bus; int dev_base, dev_limit; @@ -458,8 +461,6 @@ out: if (aper_alloc) { /* Got the aperture from the AGP bridge */ - } else if (!valid_agp) { - /* Do nothing */ } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) || force_iommu || valid_agp || diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index efb2b9cd132c..6e29b2a77aa8 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -61,12 +61,6 @@ unsigned int boot_cpu_physical_apicid = -1U; /* * The highest APIC ID seen during enumeration. - * - * On AMD, this determines the messaging protocol we can use: if all APIC IDs - * are in the 0 ... 7 range, then we can use logical addressing which - * has some performance advantages (better broadcasting). - * - * If there's an APIC ID above 8, we use physical addressing. */ unsigned int max_physical_apicid; @@ -587,7 +581,7 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) res = (((u64)(*deltatsc)) * pm_100ms); do_div(res, deltapm); apic_printk(APIC_VERBOSE, "TSC delta adjusted to " - "PM-Timer: %lu (%ld) \n", + "PM-Timer: %lu (%ld)\n", (unsigned long)res, *deltatsc); *deltatsc = (long)res; } @@ -1341,7 +1335,7 @@ void enable_x2apic(void) rdmsr(MSR_IA32_APICBASE, msr, msr2); if (!(msr & X2APIC_ENABLE)) { - pr_info("Enabling x2apic\n"); + printk_once(KERN_INFO "Enabling x2apic\n"); wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); } } @@ -1647,9 +1641,7 @@ int __init APIC_init_uniprocessor(void) #endif enable_IR_x2apic(); -#ifdef CONFIG_X86_64 default_setup_apic_routing(); -#endif verify_local_APIC(); connect_bsp_APIC(); @@ -1897,18 +1889,6 @@ void __cpuinit generic_processor_info(int apicid, int version) if (apicid > max_physical_apicid) max_physical_apicid = apicid; -#ifdef CONFIG_X86_32 - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - if (num_processors > 8) - def_to_bigsmp = 1; - break; - case X86_VENDOR_AMD: - if (max_physical_apicid >= 8) - def_to_bigsmp = 1; - } -#endif - #if defined(CONFIG_SMP) || defined(CONFIG_X86_64) early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid; diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index d0c99abc26c3..e3c3d820c325 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -240,6 +240,11 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) printk(KERN_DEBUG "system APIC only can use physical flat"); return 1; } + + if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "EXA", 3)) { + printk(KERN_DEBUG "IBM Summit detected, will use apic physical"); + return 1; + } #endif return 0; @@ -306,10 +311,7 @@ physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask, if (cpumask_test_cpu(cpu, cpu_online_mask)) break; } - if (cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_apicid, cpu); - - return BAD_APICID; + return per_cpu(x86_cpu_to_apicid, cpu); } struct apic apic_physflat = { diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index d9acc3bee0f4..e31b9ffe25f5 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -127,7 +127,7 @@ static u32 noop_apic_read(u32 reg) static void noop_apic_write(u32 reg, u32 v) { - WARN_ON_ONCE((cpu_has_apic || !disable_apic)); + WARN_ON_ONCE(cpu_has_apic && !disable_apic); } struct apic apic_noop = { diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 38dcecfa5818..cb804c5091b9 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -131,10 +131,7 @@ static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask, if (cpumask_test_cpu(cpu, cpu_online_mask)) break; } - if (cpu < nr_cpu_ids) - return bigsmp_cpu_to_logical_apicid(cpu); - - return BAD_APICID; + return bigsmp_cpu_to_logical_apicid(cpu); } static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index e85f8fb7f8e7..dd2b5f264643 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -27,6 +27,9 @@ * * http://www.unisys.com */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/notifier.h> #include <linux/spinlock.h> #include <linux/cpumask.h> @@ -223,9 +226,9 @@ static int parse_unisys_oem(char *oemptr) mip_addr = val; mip = (struct mip_reg *)val; mip_reg = __va(mip); - pr_debug("es7000_mipcfg: host_reg = 0x%lx \n", + pr_debug("host_reg = 0x%lx\n", (unsigned long)host_reg); - pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n", + pr_debug("mip_reg = 0x%lx\n", (unsigned long)mip_reg); success++; break; @@ -401,7 +404,7 @@ static void es7000_enable_apic_mode(void) if (!es7000_plat) return; - printk(KERN_INFO "ES7000: Enabling APIC mode.\n"); + pr_info("Enabling APIC mode.\n"); memset(&es7000_mip_reg, 0, sizeof(struct mip_reg)); es7000_mip_reg.off_0x00 = MIP_SW_APIC; es7000_mip_reg.off_0x38 = MIP_VALID; @@ -514,8 +517,7 @@ static void es7000_setup_apic_routing(void) { int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id()); - printk(KERN_INFO - "Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n", + pr_info("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n", (apic_version[apic] == 0x14) ? "Physical Cluster" : "Logical Cluster", nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d5d498fbee4b..6bdd2c7ead75 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1647,7 +1647,7 @@ __apicdebuginit(void) print_IO_APIC(void) printk(KERN_DEBUG ".... IRQ redirection table:\n"); printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" - " Stat Dmod Deli Vect: \n"); + " Stat Dmod Deli Vect:\n"); for (i = 0; i <= reg_01.bits.entries; i++) { struct IO_APIC_route_entry entry; @@ -2276,26 +2276,28 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq /* * Either sets desc->affinity to a valid value, and returns - * ->cpu_mask_to_apicid of that, or returns BAD_APICID and + * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and * leaves desc->affinity untouched. */ unsigned int -set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) +set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask, + unsigned int *dest_id) { struct irq_cfg *cfg; unsigned int irq; if (!cpumask_intersects(mask, cpu_online_mask)) - return BAD_APICID; + return -1; irq = desc->irq; cfg = desc->chip_data; if (assign_irq_vector(irq, cfg, mask)) - return BAD_APICID; + return -1; cpumask_copy(desc->affinity, mask); - return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); + *dest_id = apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); + return 0; } static int @@ -2311,12 +2313,11 @@ set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) cfg = desc->chip_data; spin_lock_irqsave(&ioapic_lock, flags); - dest = set_desc_affinity(desc, mask); - if (dest != BAD_APICID) { + ret = set_desc_affinity(desc, mask, &dest); + if (!ret) { /* Only the high 8 bits are valid. */ dest = SET_APIC_LOGICAL_ID(dest); __target_IO_APIC_irq(irq, dest, cfg); - ret = 0; } spin_unlock_irqrestore(&ioapic_lock, flags); @@ -2431,7 +2432,14 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) continue; cfg = irq_cfg(irq); - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); + + /* + * Check if the irq migration is in progress. If so, we + * haven't received the cleanup request yet for this irq. + */ + if (cfg->move_in_progress) + goto unlock; if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) goto unlock; @@ -2450,7 +2458,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) } __get_cpu_var(vector_irq)[vector] = -1; unlock: - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); } irq_exit(); @@ -3351,8 +3359,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) struct msi_msg msg; unsigned int dest; - dest = set_desc_affinity(desc, mask); - if (dest == BAD_APICID) + if (set_desc_affinity(desc, mask, &dest)) return -1; cfg = desc->chip_data; @@ -3384,8 +3391,7 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) if (get_irte(irq, &irte)) return -1; - dest = set_desc_affinity(desc, mask); - if (dest == BAD_APICID) + if (set_desc_affinity(desc, mask, &dest)) return -1; irte.vector = cfg->vector; @@ -3567,8 +3573,7 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) struct msi_msg msg; unsigned int dest; - dest = set_desc_affinity(desc, mask); - if (dest == BAD_APICID) + if (set_desc_affinity(desc, mask, &dest)) return -1; cfg = desc->chip_data; @@ -3623,8 +3628,7 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) struct msi_msg msg; unsigned int dest; - dest = set_desc_affinity(desc, mask); - if (dest == BAD_APICID) + if (set_desc_affinity(desc, mask, &dest)) return -1; cfg = desc->chip_data; @@ -3730,8 +3734,7 @@ static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) struct irq_cfg *cfg; unsigned int dest; - dest = set_desc_affinity(desc, mask); - if (dest == BAD_APICID) + if (set_desc_affinity(desc, mask, &dest)) return -1; cfg = desc->chip_data; diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index 6389432a9dbf..0159a69396cb 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -361,7 +361,7 @@ void stop_apic_nmi_watchdog(void *unused) */ static DEFINE_PER_CPU(unsigned, last_irq_sum); -static DEFINE_PER_CPU(local_t, alert_counter); +static DEFINE_PER_CPU(long, alert_counter); static DEFINE_PER_CPU(int, nmi_touch); void touch_nmi_watchdog(void) @@ -438,8 +438,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... */ - local_inc(&__get_cpu_var(alert_counter)); - if (local_read(&__get_cpu_var(alert_counter)) == 5 * nmi_hz) + __this_cpu_inc(per_cpu_var(alert_counter)); + if (__this_cpu_read(per_cpu_var(alert_counter)) == 5 * nmi_hz) /* * die_nmi will return ONLY if NOTIFY_STOP happens.. */ @@ -447,7 +447,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) regs, panic_on_timeout); } else { __get_cpu_var(last_irq_sum) = sum; - local_set(&__get_cpu_var(alert_counter), 0); + __this_cpu_write(per_cpu_var(alert_counter), 0); } /* see if the nmi watchdog went off */ diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 98c4665f251c..47dd856708e5 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -225,7 +225,7 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc) mpc_record = 0; printk(KERN_INFO - "Found an OEM MPC table at %8p - parsing it ... \n", oemtable); + "Found an OEM MPC table at %8p - parsing it...\n", oemtable); if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) { printk(KERN_WARNING diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 1a6559f6768c..99d2fe016084 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -52,7 +52,32 @@ static int __init print_ipi_mode(void) } late_initcall(print_ipi_mode); -void default_setup_apic_routing(void) +void __init default_setup_apic_routing(void) +{ + int version = apic_version[boot_cpu_physical_apicid]; + + if (num_possible_cpus() > 8) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + if (!APIC_XAPIC(version)) { + def_to_bigsmp = 0; + break; + } + /* If P4 and above fall through */ + case X86_VENDOR_AMD: + def_to_bigsmp = 1; + } + } + +#ifdef CONFIG_X86_BIGSMP + generic_bigsmp_probe(); +#endif + + if (apic->setup_apic_routing) + apic->setup_apic_routing(); +} + +static void setup_apic_flat_routing(void) { #ifdef CONFIG_X86_IO_APIC printk(KERN_INFO @@ -103,7 +128,7 @@ struct apic apic_default = { .init_apic_ldr = default_init_apic_ldr, .ioapic_phys_id_map = default_ioapic_phys_id_map, - .setup_apic_routing = default_setup_apic_routing, + .setup_apic_routing = setup_apic_flat_routing, .multi_timer_check = NULL, .apicid_to_node = default_apicid_to_node, .cpu_to_logical_apicid = default_cpu_to_logical_apicid, diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index c4cbd3080c1c..83e9be4778e2 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -67,17 +67,8 @@ void __init default_setup_apic_routing(void) } #endif - if (apic == &apic_flat) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - if (num_processors > 8) - apic = &apic_physflat; - break; - case X86_VENDOR_AMD: - if (max_physical_apicid >= 8) - apic = &apic_physflat; - } - } + if (apic == &apic_flat && num_possible_cpus() > 8) + apic = &apic_physflat; printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index a5371ec36776..cf69c59f4910 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -148,10 +148,7 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, break; } - if (cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_logical_apicid, cpu); - - return BAD_APICID; + return per_cpu(x86_cpu_to_logical_apicid, cpu); } static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x) diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index a8989aadc99a..8972f38c5ced 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -146,10 +146,7 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, break; } - if (cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_apicid, cpu); - - return BAD_APICID; + return per_cpu(x86_cpu_to_apicid, cpu); } static unsigned int x2apic_phys_get_apic_id(unsigned long x) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index b684bb303cbf..3740c8a4eae7 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -5,7 +5,7 @@ * * SGI UV APIC functions (note: not an Intel compatible APIC) * - * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2009 Silicon Graphics, Inc. All rights reserved. */ #include <linux/cpumask.h> #include <linux/hardirq.h> @@ -20,6 +20,8 @@ #include <linux/cpu.h> #include <linux/init.h> #include <linux/io.h> +#include <linux/pci.h> +#include <linux/kdebug.h> #include <asm/uv/uv_mmrs.h> #include <asm/uv/uv_hub.h> @@ -34,8 +36,13 @@ DEFINE_PER_CPU(int, x2apic_extra_bits); +#define PR_DEVEL(fmt, args...) pr_devel("%s: " fmt, __func__, args) + static enum uv_system_type uv_system_type; static u64 gru_start_paddr, gru_end_paddr; +int uv_min_hub_revision_id; +EXPORT_SYMBOL_GPL(uv_min_hub_revision_id); +static DEFINE_SPINLOCK(uv_nmi_lock); static inline bool is_GRU_range(u64 start, u64 end) { @@ -55,20 +62,28 @@ static int early_get_nodeid(void) mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr)); node_id.v = *mmr; early_iounmap(mmr, sizeof(*mmr)); + + /* Currently, all blades have same revision number */ + uv_min_hub_revision_id = node_id.s.revision; + return node_id.s.node_id; } static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { + int nodeid; + if (!strcmp(oem_id, "SGI")) { + nodeid = early_get_nodeid(); x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; + x86_platform.nmi_init = uv_nmi_init; if (!strcmp(oem_table_id, "UVL")) uv_system_type = UV_LEGACY_APIC; else if (!strcmp(oem_table_id, "UVX")) uv_system_type = UV_X2APIC; else if (!strcmp(oem_table_id, "UVH")) { __get_cpu_var(x2apic_extra_bits) = - early_get_nodeid() << (UV_APIC_PNODE_SHIFT - 1); + nodeid << (UV_APIC_PNODE_SHIFT - 1); uv_system_type = UV_NON_UNIQUE_APIC; return 1; } @@ -225,10 +240,7 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, if (cpumask_test_cpu(cpu, cpu_online_mask)) break; } - if (cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_apicid, cpu); - - return BAD_APICID; + return per_cpu(x86_cpu_to_apicid, cpu); } static unsigned int x2apic_get_apic_id(unsigned long x) @@ -377,13 +389,13 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) enum map_type {map_wb, map_uc}; -static __init void map_high(char *id, unsigned long base, int shift, - int max_pnode, enum map_type map_type) +static __init void map_high(char *id, unsigned long base, int pshift, + int bshift, int max_pnode, enum map_type map_type) { unsigned long bytes, paddr; - paddr = base << shift; - bytes = (1UL << shift) * (max_pnode + 1); + paddr = base << pshift; + bytes = (1UL << bshift) * (max_pnode + 1); printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, paddr + bytes); if (map_type == map_uc) @@ -399,7 +411,7 @@ static __init void map_gru_high(int max_pnode) gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); if (gru.s.enable) { - map_high("GRU", gru.s.base, shift, max_pnode, map_wb); + map_high("GRU", gru.s.base, shift, shift, max_pnode, map_wb); gru_start_paddr = ((u64)gru.s.base << shift); gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1); @@ -413,7 +425,7 @@ static __init void map_mmr_high(int max_pnode) mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); if (mmr.s.enable) - map_high("MMR", mmr.s.base, shift, max_pnode, map_uc); + map_high("MMR", mmr.s.base, shift, shift, max_pnode, map_uc); } static __init void map_mmioh_high(int max_pnode) @@ -423,7 +435,8 @@ static __init void map_mmioh_high(int max_pnode) mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); if (mmioh.s.enable) - map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc); + map_high("MMIOH", mmioh.s.base, shift, mmioh.s.m_io, + max_pnode, map_uc); } static __init void map_low_mmrs(void) @@ -475,7 +488,7 @@ static void uv_heartbeat(unsigned long ignored) static void __cpuinit uv_heartbeat_enable(int cpu) { - if (!uv_cpu_hub_info(cpu)->scir.enabled) { + while (!uv_cpu_hub_info(cpu)->scir.enabled) { struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer; uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); @@ -483,11 +496,10 @@ static void __cpuinit uv_heartbeat_enable(int cpu) timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; add_timer_on(timer, cpu); uv_cpu_hub_info(cpu)->scir.enabled = 1; - } - /* check boot cpu */ - if (!uv_cpu_hub_info(0)->scir.enabled) - uv_heartbeat_enable(0); + /* also ensure that boot cpu is enabled */ + cpu = 0; + } } #ifdef CONFIG_HOTPLUG_CPU @@ -546,6 +558,30 @@ late_initcall(uv_init_heartbeat); #endif /* !CONFIG_HOTPLUG_CPU */ +/* Direct Legacy VGA I/O traffic to designated IOH */ +int uv_set_vga_state(struct pci_dev *pdev, bool decode, + unsigned int command_bits, bool change_bridge) +{ + int domain, bus, rc; + + PR_DEVEL("devfn %x decode %d cmd %x chg_brdg %d\n", + pdev->devfn, decode, command_bits, change_bridge); + + if (!change_bridge) + return 0; + + if ((command_bits & PCI_COMMAND_IO) == 0) + return 0; + + domain = pci_domain_nr(pdev->bus); + bus = pdev->bus->number; + + rc = uv_bios_set_legacy_vga_target(decode, domain, bus); + PR_DEVEL("vga decode %d %x:%x, rc: %d\n", decode, domain, bus, rc); + + return rc; +} + /* * Called on each cpu to initialize the per_cpu UV data area. * FIXME: hotplug not supported yet @@ -562,6 +598,46 @@ void __cpuinit uv_cpu_init(void) set_x2apic_extra_bits(uv_hub_info->pnode); } +/* + * When NMI is received, print a stack trace. + */ +int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) +{ + if (reason != DIE_NMI_IPI) + return NOTIFY_OK; + /* + * Use a lock so only one cpu prints at a time + * to prevent intermixed output. + */ + spin_lock(&uv_nmi_lock); + pr_info("NMI stack dump cpu %u:\n", smp_processor_id()); + dump_stack(); + spin_unlock(&uv_nmi_lock); + + return NOTIFY_STOP; +} + +static struct notifier_block uv_dump_stack_nmi_nb = { + .notifier_call = uv_handle_nmi +}; + +void uv_register_nmi_notifier(void) +{ + if (register_die_notifier(&uv_dump_stack_nmi_nb)) + printk(KERN_WARNING "UV NMI handler failed to register\n"); +} + +void uv_nmi_init(void) +{ + unsigned int value; + + /* + * Unmask NMI on all cpus + */ + value = apic_read(APIC_LVT1) | APIC_DM_NMI; + value &= ~APIC_LVT_MASKED; + apic_write(APIC_LVT1, value); +} void __init uv_system_init(void) { @@ -627,13 +703,15 @@ void __init uv_system_init(void) } uv_bios_init(); - uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, - &sn_coherency_id, &sn_region_size); + uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id, + &sn_region_size, &system_serial_number); uv_rtc_init(); for_each_present_cpu(cpu) { + int apicid = per_cpu(x86_cpu_to_apicid, cpu); + nid = cpu_to_node(cpu); - pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu)); + pnode = uv_apicid_to_pnode(apicid); blade = boot_pnode_to_blade(pnode); lcpu = uv_blade_info[blade].nr_possible_cpus; uv_blade_info[blade].nr_possible_cpus++; @@ -654,15 +732,13 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; - uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; + uv_cpu_hub_info(cpu)->scir.offset = uv_scir_offset(apicid); uv_node_to_blade[nid] = blade; uv_cpu_to_blade[cpu] = blade; max_pnode = max(pnode, max_pnode); - printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, " - "lcpu %d, blade %d\n", - cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid, - lcpu, blade); + printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, lcpu %d, blade %d\n", + cpu, apicid, pnode, nid, lcpu, blade); } /* Add blade/pnode info for nodes without cpus */ @@ -683,5 +759,9 @@ void __init uv_system_init(void) uv_cpu_init(); uv_scir_register_cpu_notifier(); + uv_register_nmi_notifier(); proc_mkdir("sgi_uv", NULL); + + /* register Legacy VGA I/O redirection handler */ + pci_register_set_vga_state(uv_set_vga_state); } diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index b5b6b23bce53..031aa887b0eb 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1992,8 +1992,8 @@ static int __init apm_is_horked_d850md(const struct dmi_system_id *d) apm_info.disabled = 1; printk(KERN_INFO "%s machine detected. " "Disabling APM.\n", d->ident); - printk(KERN_INFO "This bug is fixed in bios P15 which is available for \n"); - printk(KERN_INFO "download from support.intel.com \n"); + printk(KERN_INFO "This bug is fixed in bios P15 which is available for\n"); + printk(KERN_INFO "download from support.intel.com\n"); } return 0; } diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index 63a88e1f987d..8bc57baaa9ad 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -15,8 +15,8 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * - * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. - * Copyright (c) Russ Anderson + * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) Russ Anderson <rja@sgi.com> */ #include <linux/efi.h> @@ -30,6 +30,7 @@ static struct uv_systab uv_systab; s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) { struct uv_systab *tab = &uv_systab; + s64 ret; if (!tab->function) /* @@ -37,9 +38,11 @@ s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) */ return BIOS_STATUS_UNIMPLEMENTED; - return efi_call6((void *)__va(tab->function), - (u64)which, a1, a2, a3, a4, a5); + ret = efi_call6((void *)__va(tab->function), (u64)which, + a1, a2, a3, a4, a5); + return ret; } +EXPORT_SYMBOL_GPL(uv_bios_call); s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) @@ -73,11 +76,14 @@ long sn_coherency_id; EXPORT_SYMBOL_GPL(sn_coherency_id); long sn_region_size; EXPORT_SYMBOL_GPL(sn_region_size); +long system_serial_number; +EXPORT_SYMBOL_GPL(system_serial_number); int uv_type; +EXPORT_SYMBOL_GPL(uv_type); s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, - long *region) + long *region, long *ssn) { s64 ret; u64 v0, v1; @@ -97,25 +103,24 @@ s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, *coher = part.coherence_id; if (region) *region = part.region_size; + if (ssn) + *ssn = v1; return ret; } +EXPORT_SYMBOL_GPL(uv_bios_get_sn_info); int -uv_bios_mq_watchlist_alloc(int blade, unsigned long addr, unsigned int mq_size, +uv_bios_mq_watchlist_alloc(unsigned long addr, unsigned int mq_size, unsigned long *intr_mmr_offset) { - union uv_watchlist_u size_blade; u64 watchlist; s64 ret; - size_blade.size = mq_size; - size_blade.blade = blade; - /* * bios returns watchlist number or negative error number. */ ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr, - size_blade.val, (u64)intr_mmr_offset, + mq_size, (u64)intr_mmr_offset, (u64)&watchlist, 0); if (ret < BIOS_STATUS_SUCCESS) return ret; @@ -158,6 +163,25 @@ s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second) } EXPORT_SYMBOL_GPL(uv_bios_freq_base); +/* + * uv_bios_set_legacy_vga_target - Set Legacy VGA I/O Target + * @decode: true to enable target, false to disable target + * @domain: PCI domain number + * @bus: PCI bus number + * + * Returns: + * 0: Success + * -EINVAL: Invalid domain or bus number + * -ENOSYS: Capability not available + * -EBUSY: Legacy VGA I/O cannot be retargeted at this time + */ +int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus) +{ + return uv_bios_call(UV_BIOS_SET_LEGACY_VGA_TARGET, + (u64)decode, (u64)domain, (u64)bus, 0, 0); +} +EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target); + #ifdef CONFIG_EFI void uv_bios_init(void) @@ -189,4 +213,3 @@ void uv_bios_init(void) void uv_bios_init(void) { } #endif - diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 1d2cb383410e..c202b62f3671 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -19,8 +19,6 @@ obj-y += vmware.o hypervisor.o sched.o obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o obj-$(CONFIG_X86_64) += bugs_64.o -obj-$(CONFIG_X86_CPU_DEBUG) += cpu_debug.o - obj-$(CONFIG_CPU_SUP_INTEL) += intel.o obj-$(CONFIG_CPU_SUP_AMD) += amd.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index c965e5212714..97ad79cdf688 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -32,6 +32,10 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 }, + { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a }, + { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a }, + { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a }, + { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a }, { 0, 0, 0, 0 } }; @@ -74,6 +78,7 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) unsigned int eax, ebx, ecx, edx, sub_index; unsigned int ht_mask_width, core_plus_mask_width; unsigned int core_select_mask, core_level_siblings; + static bool printed; if (c->cpuid_level < 0xb) return; @@ -127,12 +132,14 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) c->x86_max_cores = (core_level_siblings / smp_num_siblings); - - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", - c->phys_proc_id); - if (c->x86_max_cores > 1) - printk(KERN_INFO "CPU: Processor Core ID: %d\n", - c->cpu_core_id); + if (!printed) { + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", + c->phys_proc_id); + if (c->x86_max_cores > 1) + printk(KERN_INFO "CPU: Processor Core ID: %d\n", + c->cpu_core_id); + printed = 1; + } return; #endif } diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 7128b3799cec..e485825130d2 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -254,59 +254,36 @@ static int __cpuinit nearby_node(int apicid) /* * Fixup core topology information for AMD multi-node processors. - * Assumption 1: Number of cores in each internal node is the same. - * Assumption 2: Mixed systems with both single-node and dual-node - * processors are not supported. + * Assumption: Number of cores in each internal node is the same. */ #ifdef CONFIG_X86_HT static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c) { -#ifdef CONFIG_PCI - u32 t, cpn; - u8 n, n_id; + unsigned long long value; + u32 nodes, cores_per_node; int cpu = smp_processor_id(); + if (!cpu_has(c, X86_FEATURE_NODEID_MSR)) + return; + /* fixup topology information only once for a core */ if (cpu_has(c, X86_FEATURE_AMD_DCM)) return; - /* check for multi-node processor on boot cpu */ - t = read_pci_config(0, 24, 3, 0xe8); - if (!(t & (1 << 29))) + rdmsrl(MSR_FAM10H_NODE_ID, value); + + nodes = ((value >> 3) & 7) + 1; + if (nodes == 1) return; set_cpu_cap(c, X86_FEATURE_AMD_DCM); + cores_per_node = c->x86_max_cores / nodes; - /* cores per node: each internal node has half the number of cores */ - cpn = c->x86_max_cores >> 1; - - /* even-numbered NB_id of this dual-node processor */ - n = c->phys_proc_id << 1; - - /* - * determine internal node id and assign cores fifty-fifty to - * each node of the dual-node processor - */ - t = read_pci_config(0, 24 + n, 3, 0xe8); - n = (t>>30) & 0x3; - if (n == 0) { - if (c->cpu_core_id < cpn) - n_id = 0; - else - n_id = 1; - } else { - if (c->cpu_core_id < cpn) - n_id = 1; - else - n_id = 0; - } - - /* compute entire NodeID, use llc_shared_map to store sibling info */ - per_cpu(cpu_llc_id, cpu) = (c->phys_proc_id << 1) + n_id; + /* store NodeID, use llc_shared_map to store sibling info */ + per_cpu(cpu_llc_id, cpu) = value & 7; - /* fixup core id to be in range from 0 to cpn */ - c->cpu_core_id = c->cpu_core_id % cpn; -#endif + /* fixup core id to be in range from 0 to (cores_per_node - 1) */ + c->cpu_core_id = c->cpu_core_id % cores_per_node; } #endif @@ -375,8 +352,6 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) node = nearby_node(apicid); } numa_set_node(cpu, node); - - printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node); #endif } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c1afa990a6c8..4868e4a951ee 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -427,6 +427,7 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_HT u32 eax, ebx, ecx, edx; int index_msb, core_bits; + static bool printed; if (!cpu_has(c, X86_FEATURE_HT)) return; @@ -442,7 +443,7 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) smp_num_siblings = (ebx & 0xff0000) >> 16; if (smp_num_siblings == 1) { - printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); + printk_once(KERN_INFO "CPU0: Hyper-Threading is disabled\n"); goto out; } @@ -469,11 +470,12 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) ((1 << core_bits) - 1); out: - if ((c->x86_max_cores * smp_num_siblings) > 1) { + if (!printed && (c->x86_max_cores * smp_num_siblings) > 1) { printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id); printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id); + printed = 1; } #endif } @@ -1093,7 +1095,7 @@ static void clear_all_debug_regs(void) void __cpuinit cpu_init(void) { - struct orig_ist *orig_ist; + struct orig_ist *oist; struct task_struct *me; struct tss_struct *t; unsigned long v; @@ -1102,7 +1104,7 @@ void __cpuinit cpu_init(void) cpu = stack_smp_processor_id(); t = &per_cpu(init_tss, cpu); - orig_ist = &per_cpu(orig_ist, cpu); + oist = &per_cpu(orig_ist, cpu); #ifdef CONFIG_NUMA if (cpu != 0 && percpu_read(node_number) == 0 && @@ -1115,7 +1117,7 @@ void __cpuinit cpu_init(void) if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) panic("CPU#%d already initialized!\n", cpu); - printk(KERN_INFO "Initializing CPU#%d\n", cpu); + pr_debug("Initializing CPU#%d\n", cpu); clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); @@ -1143,12 +1145,12 @@ void __cpuinit cpu_init(void) /* * set up and load the per-CPU TSS */ - if (!orig_ist->ist[0]) { + if (!oist->ist[0]) { char *estacks = per_cpu(exception_stacks, cpu); for (v = 0; v < N_EXCEPTION_STACKS; v++) { estacks += exception_stack_sizes[v]; - orig_ist->ist[v] = t->x86_tss.ist[v] = + oist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; } } diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c deleted file mode 100644 index dca325c03999..000000000000 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ /dev/null @@ -1,688 +0,0 @@ -/* - * CPU x86 architecture debug code - * - * Copyright(C) 2009 Jaswinder Singh Rajput - * - * For licencing details see kernel-base/COPYING - */ - -#include <linux/interrupt.h> -#include <linux/compiler.h> -#include <linux/seq_file.h> -#include <linux/debugfs.h> -#include <linux/kprobes.h> -#include <linux/uaccess.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/percpu.h> -#include <linux/signal.h> -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/smp.h> - -#include <asm/cpu_debug.h> -#include <asm/paravirt.h> -#include <asm/system.h> -#include <asm/traps.h> -#include <asm/apic.h> -#include <asm/desc.h> - -static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpu_arr); -static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], priv_arr); -static DEFINE_PER_CPU(int, cpu_priv_count); - -static DEFINE_MUTEX(cpu_debug_lock); - -static struct dentry *cpu_debugfs_dir; - -static struct cpu_debug_base cpu_base[] = { - { "mc", CPU_MC, 0 }, - { "monitor", CPU_MONITOR, 0 }, - { "time", CPU_TIME, 0 }, - { "pmc", CPU_PMC, 1 }, - { "platform", CPU_PLATFORM, 0 }, - { "apic", CPU_APIC, 0 }, - { "poweron", CPU_POWERON, 0 }, - { "control", CPU_CONTROL, 0 }, - { "features", CPU_FEATURES, 0 }, - { "lastbranch", CPU_LBRANCH, 0 }, - { "bios", CPU_BIOS, 0 }, - { "freq", CPU_FREQ, 0 }, - { "mtrr", CPU_MTRR, 0 }, - { "perf", CPU_PERF, 0 }, - { "cache", CPU_CACHE, 0 }, - { "sysenter", CPU_SYSENTER, 0 }, - { "therm", CPU_THERM, 0 }, - { "misc", CPU_MISC, 0 }, - { "debug", CPU_DEBUG, 0 }, - { "pat", CPU_PAT, 0 }, - { "vmx", CPU_VMX, 0 }, - { "call", CPU_CALL, 0 }, - { "base", CPU_BASE, 0 }, - { "ver", CPU_VER, 0 }, - { "conf", CPU_CONF, 0 }, - { "smm", CPU_SMM, 0 }, - { "svm", CPU_SVM, 0 }, - { "osvm", CPU_OSVM, 0 }, - { "tss", CPU_TSS, 0 }, - { "cr", CPU_CR, 0 }, - { "dt", CPU_DT, 0 }, - { "registers", CPU_REG_ALL, 0 }, -}; - -static struct cpu_file_base cpu_file[] = { - { "index", CPU_REG_ALL, 0 }, - { "value", CPU_REG_ALL, 1 }, -}; - -/* CPU Registers Range */ -static struct cpu_debug_range cpu_reg_range[] = { - { 0x00000000, 0x00000001, CPU_MC, }, - { 0x00000006, 0x00000007, CPU_MONITOR, }, - { 0x00000010, 0x00000010, CPU_TIME, }, - { 0x00000011, 0x00000013, CPU_PMC, }, - { 0x00000017, 0x00000017, CPU_PLATFORM, }, - { 0x0000001B, 0x0000001B, CPU_APIC, }, - { 0x0000002A, 0x0000002B, CPU_POWERON, }, - { 0x0000002C, 0x0000002C, CPU_FREQ, }, - { 0x0000003A, 0x0000003A, CPU_CONTROL, }, - { 0x00000040, 0x00000047, CPU_LBRANCH, }, - { 0x00000060, 0x00000067, CPU_LBRANCH, }, - { 0x00000079, 0x00000079, CPU_BIOS, }, - { 0x00000088, 0x0000008A, CPU_CACHE, }, - { 0x0000008B, 0x0000008B, CPU_BIOS, }, - { 0x0000009B, 0x0000009B, CPU_MONITOR, }, - { 0x000000C1, 0x000000C4, CPU_PMC, }, - { 0x000000CD, 0x000000CD, CPU_FREQ, }, - { 0x000000E7, 0x000000E8, CPU_PERF, }, - { 0x000000FE, 0x000000FE, CPU_MTRR, }, - - { 0x00000116, 0x0000011E, CPU_CACHE, }, - { 0x00000174, 0x00000176, CPU_SYSENTER, }, - { 0x00000179, 0x0000017B, CPU_MC, }, - { 0x00000186, 0x00000189, CPU_PMC, }, - { 0x00000198, 0x00000199, CPU_PERF, }, - { 0x0000019A, 0x0000019A, CPU_TIME, }, - { 0x0000019B, 0x0000019D, CPU_THERM, }, - { 0x000001A0, 0x000001A0, CPU_MISC, }, - { 0x000001C9, 0x000001C9, CPU_LBRANCH, }, - { 0x000001D7, 0x000001D8, CPU_LBRANCH, }, - { 0x000001D9, 0x000001D9, CPU_DEBUG, }, - { 0x000001DA, 0x000001E0, CPU_LBRANCH, }, - - { 0x00000200, 0x0000020F, CPU_MTRR, }, - { 0x00000250, 0x00000250, CPU_MTRR, }, - { 0x00000258, 0x00000259, CPU_MTRR, }, - { 0x00000268, 0x0000026F, CPU_MTRR, }, - { 0x00000277, 0x00000277, CPU_PAT, }, - { 0x000002FF, 0x000002FF, CPU_MTRR, }, - - { 0x00000300, 0x00000311, CPU_PMC, }, - { 0x00000345, 0x00000345, CPU_PMC, }, - { 0x00000360, 0x00000371, CPU_PMC, }, - { 0x0000038D, 0x00000390, CPU_PMC, }, - { 0x000003A0, 0x000003BE, CPU_PMC, }, - { 0x000003C0, 0x000003CD, CPU_PMC, }, - { 0x000003E0, 0x000003E1, CPU_PMC, }, - { 0x000003F0, 0x000003F2, CPU_PMC, }, - - { 0x00000400, 0x00000417, CPU_MC, }, - { 0x00000480, 0x0000048B, CPU_VMX, }, - - { 0x00000600, 0x00000600, CPU_DEBUG, }, - { 0x00000680, 0x0000068F, CPU_LBRANCH, }, - { 0x000006C0, 0x000006CF, CPU_LBRANCH, }, - - { 0x000107CC, 0x000107D3, CPU_PMC, }, - - { 0xC0000080, 0xC0000080, CPU_FEATURES, }, - { 0xC0000081, 0xC0000084, CPU_CALL, }, - { 0xC0000100, 0xC0000102, CPU_BASE, }, - { 0xC0000103, 0xC0000103, CPU_TIME, }, - - { 0xC0010000, 0xC0010007, CPU_PMC, }, - { 0xC0010010, 0xC0010010, CPU_CONF, }, - { 0xC0010015, 0xC0010015, CPU_CONF, }, - { 0xC0010016, 0xC001001A, CPU_MTRR, }, - { 0xC001001D, 0xC001001D, CPU_MTRR, }, - { 0xC001001F, 0xC001001F, CPU_CONF, }, - { 0xC0010030, 0xC0010035, CPU_BIOS, }, - { 0xC0010044, 0xC0010048, CPU_MC, }, - { 0xC0010050, 0xC0010056, CPU_SMM, }, - { 0xC0010058, 0xC0010058, CPU_CONF, }, - { 0xC0010060, 0xC0010060, CPU_CACHE, }, - { 0xC0010061, 0xC0010068, CPU_SMM, }, - { 0xC0010069, 0xC001006B, CPU_SMM, }, - { 0xC0010070, 0xC0010071, CPU_SMM, }, - { 0xC0010111, 0xC0010113, CPU_SMM, }, - { 0xC0010114, 0xC0010118, CPU_SVM, }, - { 0xC0010140, 0xC0010141, CPU_OSVM, }, - { 0xC0011022, 0xC0011023, CPU_CONF, }, -}; - -static int is_typeflag_valid(unsigned cpu, unsigned flag) -{ - int i; - - /* Standard Registers should be always valid */ - if (flag >= CPU_TSS) - return 1; - - for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { - if (cpu_reg_range[i].flag == flag) - return 1; - } - - /* Invalid */ - return 0; -} - -static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max, - int index, unsigned flag) -{ - if (cpu_reg_range[index].flag == flag) { - *min = cpu_reg_range[index].min; - *max = cpu_reg_range[index].max; - } else - *max = 0; - - return *max; -} - -/* This function can also be called with seq = NULL for printk */ -static void print_cpu_data(struct seq_file *seq, unsigned type, - u32 low, u32 high) -{ - struct cpu_private *priv; - u64 val = high; - - if (seq) { - priv = seq->private; - if (priv->file) { - val = (val << 32) | low; - seq_printf(seq, "0x%llx\n", val); - } else - seq_printf(seq, " %08x: %08x_%08x\n", - type, high, low); - } else - printk(KERN_INFO " %08x: %08x_%08x\n", type, high, low); -} - -/* This function can also be called with seq = NULL for printk */ -static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag) -{ - unsigned msr, msr_min, msr_max; - struct cpu_private *priv; - u32 low, high; - int i; - - if (seq) { - priv = seq->private; - if (priv->file) { - if (!rdmsr_safe_on_cpu(priv->cpu, priv->reg, - &low, &high)) - print_cpu_data(seq, priv->reg, low, high); - return; - } - } - - for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { - if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag)) - continue; - - for (msr = msr_min; msr <= msr_max; msr++) { - if (rdmsr_safe_on_cpu(cpu, msr, &low, &high)) - continue; - print_cpu_data(seq, msr, low, high); - } - } -} - -static void print_tss(void *arg) -{ - struct pt_regs *regs = task_pt_regs(current); - struct seq_file *seq = arg; - unsigned int seg; - - seq_printf(seq, " RAX\t: %016lx\n", regs->ax); - seq_printf(seq, " RBX\t: %016lx\n", regs->bx); - seq_printf(seq, " RCX\t: %016lx\n", regs->cx); - seq_printf(seq, " RDX\t: %016lx\n", regs->dx); - - seq_printf(seq, " RSI\t: %016lx\n", regs->si); - seq_printf(seq, " RDI\t: %016lx\n", regs->di); - seq_printf(seq, " RBP\t: %016lx\n", regs->bp); - seq_printf(seq, " ESP\t: %016lx\n", regs->sp); - -#ifdef CONFIG_X86_64 - seq_printf(seq, " R08\t: %016lx\n", regs->r8); - seq_printf(seq, " R09\t: %016lx\n", regs->r9); - seq_printf(seq, " R10\t: %016lx\n", regs->r10); - seq_printf(seq, " R11\t: %016lx\n", regs->r11); - seq_printf(seq, " R12\t: %016lx\n", regs->r12); - seq_printf(seq, " R13\t: %016lx\n", regs->r13); - seq_printf(seq, " R14\t: %016lx\n", regs->r14); - seq_printf(seq, " R15\t: %016lx\n", regs->r15); -#endif - - asm("movl %%cs,%0" : "=r" (seg)); - seq_printf(seq, " CS\t: %04x\n", seg); - asm("movl %%ds,%0" : "=r" (seg)); - seq_printf(seq, " DS\t: %04x\n", seg); - seq_printf(seq, " SS\t: %04lx\n", regs->ss & 0xffff); - asm("movl %%es,%0" : "=r" (seg)); - seq_printf(seq, " ES\t: %04x\n", seg); - asm("movl %%fs,%0" : "=r" (seg)); - seq_printf(seq, " FS\t: %04x\n", seg); - asm("movl %%gs,%0" : "=r" (seg)); - seq_printf(seq, " GS\t: %04x\n", seg); - - seq_printf(seq, " EFLAGS\t: %016lx\n", regs->flags); - - seq_printf(seq, " EIP\t: %016lx\n", regs->ip); -} - -static void print_cr(void *arg) -{ - struct seq_file *seq = arg; - - seq_printf(seq, " cr0\t: %016lx\n", read_cr0()); - seq_printf(seq, " cr2\t: %016lx\n", read_cr2()); - seq_printf(seq, " cr3\t: %016lx\n", read_cr3()); - seq_printf(seq, " cr4\t: %016lx\n", read_cr4_safe()); -#ifdef CONFIG_X86_64 - seq_printf(seq, " cr8\t: %016lx\n", read_cr8()); -#endif -} - -static void print_desc_ptr(char *str, struct seq_file *seq, struct desc_ptr dt) -{ - seq_printf(seq, " %s\t: %016llx\n", str, (u64)(dt.address | dt.size)); -} - -static void print_dt(void *seq) -{ - struct desc_ptr dt; - unsigned long ldt; - - /* IDT */ - store_idt((struct desc_ptr *)&dt); - print_desc_ptr("IDT", seq, dt); - - /* GDT */ - store_gdt((struct desc_ptr *)&dt); - print_desc_ptr("GDT", seq, dt); - - /* LDT */ - store_ldt(ldt); - seq_printf(seq, " LDT\t: %016lx\n", ldt); - - /* TR */ - store_tr(ldt); - seq_printf(seq, " TR\t: %016lx\n", ldt); -} - -static void print_dr(void *arg) -{ - struct seq_file *seq = arg; - unsigned long dr; - int i; - - for (i = 0; i < 8; i++) { - /* Ignore db4, db5 */ - if ((i == 4) || (i == 5)) - continue; - get_debugreg(dr, i); - seq_printf(seq, " dr%d\t: %016lx\n", i, dr); - } - - seq_printf(seq, "\n MSR\t:\n"); -} - -static void print_apic(void *arg) -{ - struct seq_file *seq = arg; - -#ifdef CONFIG_X86_LOCAL_APIC - seq_printf(seq, " LAPIC\t:\n"); - seq_printf(seq, " ID\t\t: %08x\n", apic_read(APIC_ID) >> 24); - seq_printf(seq, " LVR\t\t: %08x\n", apic_read(APIC_LVR)); - seq_printf(seq, " TASKPRI\t: %08x\n", apic_read(APIC_TASKPRI)); - seq_printf(seq, " ARBPRI\t\t: %08x\n", apic_read(APIC_ARBPRI)); - seq_printf(seq, " PROCPRI\t: %08x\n", apic_read(APIC_PROCPRI)); - seq_printf(seq, " LDR\t\t: %08x\n", apic_read(APIC_LDR)); - seq_printf(seq, " DFR\t\t: %08x\n", apic_read(APIC_DFR)); - seq_printf(seq, " SPIV\t\t: %08x\n", apic_read(APIC_SPIV)); - seq_printf(seq, " ISR\t\t: %08x\n", apic_read(APIC_ISR)); - seq_printf(seq, " ESR\t\t: %08x\n", apic_read(APIC_ESR)); - seq_printf(seq, " ICR\t\t: %08x\n", apic_read(APIC_ICR)); - seq_printf(seq, " ICR2\t\t: %08x\n", apic_read(APIC_ICR2)); - seq_printf(seq, " LVTT\t\t: %08x\n", apic_read(APIC_LVTT)); - seq_printf(seq, " LVTTHMR\t: %08x\n", apic_read(APIC_LVTTHMR)); - seq_printf(seq, " LVTPC\t\t: %08x\n", apic_read(APIC_LVTPC)); - seq_printf(seq, " LVT0\t\t: %08x\n", apic_read(APIC_LVT0)); - seq_printf(seq, " LVT1\t\t: %08x\n", apic_read(APIC_LVT1)); - seq_printf(seq, " LVTERR\t\t: %08x\n", apic_read(APIC_LVTERR)); - seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT)); - seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT)); - seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR)); - if (boot_cpu_has(X86_FEATURE_EXTAPIC)) { - unsigned int i, v, maxeilvt; - - v = apic_read(APIC_EFEAT); - maxeilvt = (v >> 16) & 0xff; - seq_printf(seq, " EFEAT\t\t: %08x\n", v); - seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL)); - - for (i = 0; i < maxeilvt; i++) { - v = apic_read(APIC_EILVTn(i)); - seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v); - } - } -#endif /* CONFIG_X86_LOCAL_APIC */ - seq_printf(seq, "\n MSR\t:\n"); -} - -static int cpu_seq_show(struct seq_file *seq, void *v) -{ - struct cpu_private *priv = seq->private; - - if (priv == NULL) - return -EINVAL; - - switch (cpu_base[priv->type].flag) { - case CPU_TSS: - smp_call_function_single(priv->cpu, print_tss, seq, 1); - break; - case CPU_CR: - smp_call_function_single(priv->cpu, print_cr, seq, 1); - break; - case CPU_DT: - smp_call_function_single(priv->cpu, print_dt, seq, 1); - break; - case CPU_DEBUG: - if (priv->file == CPU_INDEX_BIT) - smp_call_function_single(priv->cpu, print_dr, seq, 1); - print_msr(seq, priv->cpu, cpu_base[priv->type].flag); - break; - case CPU_APIC: - if (priv->file == CPU_INDEX_BIT) - smp_call_function_single(priv->cpu, print_apic, seq, 1); - print_msr(seq, priv->cpu, cpu_base[priv->type].flag); - break; - - default: - print_msr(seq, priv->cpu, cpu_base[priv->type].flag); - break; - } - seq_printf(seq, "\n"); - - return 0; -} - -static void *cpu_seq_start(struct seq_file *seq, loff_t *pos) -{ - if (*pos == 0) /* One time is enough ;-) */ - return seq; - - return NULL; -} - -static void *cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - (*pos)++; - - return cpu_seq_start(seq, pos); -} - -static void cpu_seq_stop(struct seq_file *seq, void *v) -{ -} - -static const struct seq_operations cpu_seq_ops = { - .start = cpu_seq_start, - .next = cpu_seq_next, - .stop = cpu_seq_stop, - .show = cpu_seq_show, -}; - -static int cpu_seq_open(struct inode *inode, struct file *file) -{ - struct cpu_private *priv = inode->i_private; - struct seq_file *seq; - int err; - - err = seq_open(file, &cpu_seq_ops); - if (!err) { - seq = file->private_data; - seq->private = priv; - } - - return err; -} - -static int write_msr(struct cpu_private *priv, u64 val) -{ - u32 low, high; - - high = (val >> 32) & 0xffffffff; - low = val & 0xffffffff; - - if (!wrmsr_safe_on_cpu(priv->cpu, priv->reg, low, high)) - return 0; - - return -EPERM; -} - -static int write_cpu_register(struct cpu_private *priv, const char *buf) -{ - int ret = -EPERM; - u64 val; - - ret = strict_strtoull(buf, 0, &val); - if (ret < 0) - return ret; - - /* Supporting only MSRs */ - if (priv->type < CPU_TSS_BIT) - return write_msr(priv, val); - - return ret; -} - -static ssize_t cpu_write(struct file *file, const char __user *ubuf, - size_t count, loff_t *off) -{ - struct seq_file *seq = file->private_data; - struct cpu_private *priv = seq->private; - char buf[19]; - - if ((priv == NULL) || (count >= sizeof(buf))) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, count)) - return -EFAULT; - - buf[count] = 0; - - if ((cpu_base[priv->type].write) && (cpu_file[priv->file].write)) - if (!write_cpu_register(priv, buf)) - return count; - - return -EACCES; -} - -static const struct file_operations cpu_fops = { - .owner = THIS_MODULE, - .open = cpu_seq_open, - .read = seq_read, - .write = cpu_write, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg, - unsigned file, struct dentry *dentry) -{ - struct cpu_private *priv = NULL; - - /* Already intialized */ - if (file == CPU_INDEX_BIT) - if (per_cpu(cpu_arr[type].init, cpu)) - return 0; - - priv = kzalloc(sizeof(*priv), GFP_KERNEL); - if (priv == NULL) - return -ENOMEM; - - priv->cpu = cpu; - priv->type = type; - priv->reg = reg; - priv->file = file; - mutex_lock(&cpu_debug_lock); - per_cpu(priv_arr[type], cpu) = priv; - per_cpu(cpu_priv_count, cpu)++; - mutex_unlock(&cpu_debug_lock); - - if (file) - debugfs_create_file(cpu_file[file].name, S_IRUGO, - dentry, (void *)priv, &cpu_fops); - else { - debugfs_create_file(cpu_base[type].name, S_IRUGO, - per_cpu(cpu_arr[type].dentry, cpu), - (void *)priv, &cpu_fops); - mutex_lock(&cpu_debug_lock); - per_cpu(cpu_arr[type].init, cpu) = 1; - mutex_unlock(&cpu_debug_lock); - } - - return 0; -} - -static int cpu_init_regfiles(unsigned cpu, unsigned int type, unsigned reg, - struct dentry *dentry) -{ - unsigned file; - int err = 0; - - for (file = 0; file < ARRAY_SIZE(cpu_file); file++) { - err = cpu_create_file(cpu, type, reg, file, dentry); - if (err) - return err; - } - - return err; -} - -static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry) -{ - struct dentry *cpu_dentry = NULL; - unsigned reg, reg_min, reg_max; - int i, err = 0; - char reg_dir[12]; - u32 low, high; - - for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { - if (!get_cpu_range(cpu, ®_min, ®_max, i, - cpu_base[type].flag)) - continue; - - for (reg = reg_min; reg <= reg_max; reg++) { - if (rdmsr_safe_on_cpu(cpu, reg, &low, &high)) - continue; - - sprintf(reg_dir, "0x%x", reg); - cpu_dentry = debugfs_create_dir(reg_dir, dentry); - err = cpu_init_regfiles(cpu, type, reg, cpu_dentry); - if (err) - return err; - } - } - - return err; -} - -static int cpu_init_allreg(unsigned cpu, struct dentry *dentry) -{ - struct dentry *cpu_dentry = NULL; - unsigned type; - int err = 0; - - for (type = 0; type < ARRAY_SIZE(cpu_base) - 1; type++) { - if (!is_typeflag_valid(cpu, cpu_base[type].flag)) - continue; - cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry); - per_cpu(cpu_arr[type].dentry, cpu) = cpu_dentry; - - if (type < CPU_TSS_BIT) - err = cpu_init_msr(cpu, type, cpu_dentry); - else - err = cpu_create_file(cpu, type, 0, CPU_INDEX_BIT, - cpu_dentry); - if (err) - return err; - } - - return err; -} - -static int cpu_init_cpu(void) -{ - struct dentry *cpu_dentry = NULL; - struct cpuinfo_x86 *cpui; - char cpu_dir[12]; - unsigned cpu; - int err = 0; - - for (cpu = 0; cpu < nr_cpu_ids; cpu++) { - cpui = &cpu_data(cpu); - if (!cpu_has(cpui, X86_FEATURE_MSR)) - continue; - - sprintf(cpu_dir, "cpu%d", cpu); - cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir); - err = cpu_init_allreg(cpu, cpu_dentry); - - pr_info("cpu%d(%d) debug files %d\n", - cpu, nr_cpu_ids, per_cpu(cpu_priv_count, cpu)); - if (per_cpu(cpu_priv_count, cpu) > MAX_CPU_FILES) { - pr_err("Register files count %d exceeds limit %d\n", - per_cpu(cpu_priv_count, cpu), MAX_CPU_FILES); - per_cpu(cpu_priv_count, cpu) = MAX_CPU_FILES; - err = -ENFILE; - } - if (err) - return err; - } - - return err; -} - -static int __init cpu_debug_init(void) -{ - cpu_debugfs_dir = debugfs_create_dir("cpu", arch_debugfs_dir); - - return cpu_init_cpu(); -} - -static void __exit cpu_debug_exit(void) -{ - int i, cpu; - - if (cpu_debugfs_dir) - debugfs_remove_recursive(cpu_debugfs_dir); - - for (cpu = 0; cpu < nr_cpu_ids; cpu++) - for (i = 0; i < per_cpu(cpu_priv_count, cpu); i++) - kfree(per_cpu(priv_arr[i], cpu)); -} - -module_init(cpu_debug_init); -module_exit(cpu_debug_exit); - -MODULE_AUTHOR("Jaswinder Singh Rajput"); -MODULE_DESCRIPTION("CPU Debug module"); -MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 8b581d3905cb..1b1920fa7c80 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -68,9 +68,9 @@ struct acpi_cpufreq_data { unsigned int cpu_feature; }; -static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); +static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data); -static DEFINE_PER_CPU(struct aperfmperf, old_perf); +static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf); /* acpi_perf_data is a pointer to percpu data. */ static struct acpi_processor_performance *acpi_perf_data; @@ -190,9 +190,11 @@ static void do_drv_write(void *_cmd) static void drv_read(struct drv_cmd *cmd) { + int err; cmd->val = 0; - smp_call_function_single(cpumask_any(cmd->mask), do_drv_read, cmd, 1); + err = smp_call_function_any(cmd->mask, do_drv_read, cmd, 1); + WARN_ON_ONCE(err); /* smp_call_function_any() was buggy? */ } static void drv_write(struct drv_cmd *cmd) @@ -214,14 +216,14 @@ static u32 get_cur_val(const struct cpumask *mask) if (unlikely(cpumask_empty(mask))) return 0; - switch (per_cpu(drv_data, cpumask_first(mask))->cpu_feature) { + switch (per_cpu(acfreq_data, cpumask_first(mask))->cpu_feature) { case SYSTEM_INTEL_MSR_CAPABLE: cmd.type = SYSTEM_INTEL_MSR_CAPABLE; cmd.addr.msr.reg = MSR_IA32_PERF_STATUS; break; case SYSTEM_IO_CAPABLE: cmd.type = SYSTEM_IO_CAPABLE; - perf = per_cpu(drv_data, cpumask_first(mask))->acpi_data; + perf = per_cpu(acfreq_data, cpumask_first(mask))->acpi_data; cmd.addr.io.port = perf->control_register.address; cmd.addr.io.bit_width = perf->control_register.bit_width; break; @@ -268,8 +270,8 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy, if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1)) return 0; - ratio = calc_aperfmperf_ratio(&per_cpu(old_perf, cpu), &perf); - per_cpu(old_perf, cpu) = perf; + ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf); + per_cpu(acfreq_old_perf, cpu) = perf; retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT; @@ -278,7 +280,7 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy, static unsigned int get_cur_freq_on_cpu(unsigned int cpu) { - struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu); + struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu); unsigned int freq; unsigned int cached_freq; @@ -322,7 +324,7 @@ static unsigned int check_freqs(const struct cpumask *mask, unsigned int freq, static int acpi_cpufreq_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { - struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); + struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); struct acpi_processor_performance *perf; struct cpufreq_freqs freqs; struct drv_cmd cmd; @@ -416,7 +418,7 @@ out: static int acpi_cpufreq_verify(struct cpufreq_policy *policy) { - struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); + struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); dprintk("acpi_cpufreq_verify\n"); @@ -574,7 +576,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) return -ENOMEM; data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu); - per_cpu(drv_data, cpu) = data; + per_cpu(acfreq_data, cpu) = data; if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS; @@ -725,20 +727,20 @@ err_unreg: acpi_processor_unregister_performance(perf, cpu); err_free: kfree(data); - per_cpu(drv_data, cpu) = NULL; + per_cpu(acfreq_data, cpu) = NULL; return result; } static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) { - struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); + struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); dprintk("acpi_cpufreq_cpu_exit\n"); if (data) { cpufreq_frequency_table_put_attr(policy->cpu); - per_cpu(drv_data, policy->cpu) = NULL; + per_cpu(acfreq_data, policy->cpu) = NULL; acpi_processor_unregister_performance(data->acpi_data, policy->cpu); kfree(data); @@ -749,7 +751,7 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) static int acpi_cpufreq_resume(struct cpufreq_policy *policy) { - struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); + struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); dprintk("acpi_cpufreq_resume\n"); @@ -764,14 +766,15 @@ static struct freq_attr *acpi_cpufreq_attr[] = { }; static struct cpufreq_driver acpi_cpufreq_driver = { - .verify = acpi_cpufreq_verify, - .target = acpi_cpufreq_target, - .init = acpi_cpufreq_cpu_init, - .exit = acpi_cpufreq_cpu_exit, - .resume = acpi_cpufreq_resume, - .name = "acpi-cpufreq", - .owner = THIS_MODULE, - .attr = acpi_cpufreq_attr, + .verify = acpi_cpufreq_verify, + .target = acpi_cpufreq_target, + .bios_limit = acpi_processor_get_bios_limit, + .init = acpi_cpufreq_cpu_init, + .exit = acpi_cpufreq_cpu_exit, + .resume = acpi_cpufreq_resume, + .name = "acpi-cpufreq", + .owner = THIS_MODULE, + .attr = acpi_cpufreq_attr, }; static int __init acpi_cpufreq_init(void) diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c index cabd2fa3fc93..7e7eea4f8261 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c @@ -885,7 +885,7 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy) /* Find ACPI data for processor */ acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, - ACPI_UINT32_MAX, &longhaul_walk_callback, + ACPI_UINT32_MAX, &longhaul_walk_callback, NULL, NULL, (void *)&pr); /* Check ACPI support for C3 state */ diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c index f10dea409f40..cb01dac267d3 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c @@ -164,7 +164,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy) } /* cpuinfo and default policy values */ - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; + policy->cpuinfo.transition_latency = 200000; policy->cur = busfreq * max_multiplier; result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c index d47c775eb0ab..9a97116f89e5 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c @@ -714,14 +714,17 @@ static struct freq_attr *powernow_table_attr[] = { }; static struct cpufreq_driver powernow_driver = { - .verify = powernow_verify, - .target = powernow_target, - .get = powernow_get, - .init = powernow_cpu_init, - .exit = powernow_cpu_exit, - .name = "powernow-k7", - .owner = THIS_MODULE, - .attr = powernow_table_attr, + .verify = powernow_verify, + .target = powernow_target, + .get = powernow_get, +#ifdef CONFIG_X86_POWERNOW_K7_ACPI + .bios_limit = acpi_processor_get_bios_limit, +#endif + .init = powernow_cpu_init, + .exit = powernow_cpu_exit, + .name = "powernow-k7", + .owner = THIS_MODULE, + .attr = powernow_table_attr, }; static int __init powernow_init(void) diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 3f12dabeab52..6e44519960c8 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1118,7 +1118,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation) { - cpumask_t oldmask; + cpumask_var_t oldmask; struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); u32 checkfid; u32 checkvid; @@ -1131,9 +1131,13 @@ static int powernowk8_target(struct cpufreq_policy *pol, checkfid = data->currfid; checkvid = data->currvid; - /* only run on specific CPU from here on */ - oldmask = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); + /* only run on specific CPU from here on. */ + /* This is poor form: use a workqueue or smp_call_function_single */ + if (!alloc_cpumask_var(&oldmask, GFP_KERNEL)) + return -ENOMEM; + + cpumask_copy(oldmask, tsk_cpus_allowed(current)); + set_cpus_allowed_ptr(current, cpumask_of(pol->cpu)); if (smp_processor_id() != pol->cpu) { printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); @@ -1193,7 +1197,8 @@ static int powernowk8_target(struct cpufreq_policy *pol, ret = 0; err_out: - set_cpus_allowed_ptr(current, &oldmask); + set_cpus_allowed_ptr(current, oldmask); + free_cpumask_var(oldmask); return ret; } @@ -1351,6 +1356,7 @@ static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol) kfree(data->powernow_table); kfree(data); + per_cpu(powernow_data, pol->cpu) = NULL; return 0; } @@ -1370,7 +1376,7 @@ static unsigned int powernowk8_get(unsigned int cpu) int err; if (!data) - return -EINVAL; + return 0; smp_call_function_single(cpu, query_values_on_cpu, &err, true); if (err) @@ -1393,14 +1399,15 @@ static struct freq_attr *powernow_k8_attr[] = { }; static struct cpufreq_driver cpufreq_amd64_driver = { - .verify = powernowk8_verify, - .target = powernowk8_target, - .init = powernowk8_cpu_init, - .exit = __devexit_p(powernowk8_cpu_exit), - .get = powernowk8_get, - .name = "powernow-k8", - .owner = THIS_MODULE, - .attr = powernow_k8_attr, + .verify = powernowk8_verify, + .target = powernowk8_target, + .bios_limit = acpi_processor_get_bios_limit, + .init = powernowk8_cpu_init, + .exit = __devexit_p(powernowk8_cpu_exit), + .get = powernowk8_get, + .name = "powernow-k8", + .owner = THIS_MODULE, + .attr = powernow_k8_attr, }; /* driver entry point for init */ diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 3ae5a7a3a500..2ce8e0b5cc54 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -39,7 +39,7 @@ static struct pci_dev *speedstep_chipset_dev; /* speedstep_processor */ -static unsigned int speedstep_processor; +static enum speedstep_processor speedstep_processor; static u32 pmbase; diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c index f4c290b8482f..ad0083abfa23 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c @@ -34,7 +34,7 @@ static int relaxed_check; * GET PROCESSOR CORE SPEED IN KHZ * *********************************************************************/ -static unsigned int pentium3_get_frequency(unsigned int processor) +static unsigned int pentium3_get_frequency(enum speedstep_processor processor) { /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */ struct { @@ -227,7 +227,7 @@ static unsigned int pentium4_get_frequency(void) /* Warning: may get called from smp_call_function_single. */ -unsigned int speedstep_get_frequency(unsigned int processor) +unsigned int speedstep_get_frequency(enum speedstep_processor processor) { switch (processor) { case SPEEDSTEP_CPU_PCORE: @@ -380,7 +380,7 @@ EXPORT_SYMBOL_GPL(speedstep_detect_processor); * DETECT SPEEDSTEP SPEEDS * *********************************************************************/ -unsigned int speedstep_get_freqs(unsigned int processor, +unsigned int speedstep_get_freqs(enum speedstep_processor processor, unsigned int *low_speed, unsigned int *high_speed, unsigned int *transition_latency, diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h index 2b6c04e5a304..70d9cea1219d 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h @@ -11,18 +11,18 @@ /* processors */ - -#define SPEEDSTEP_CPU_PIII_C_EARLY 0x00000001 /* Coppermine core */ -#define SPEEDSTEP_CPU_PIII_C 0x00000002 /* Coppermine core */ -#define SPEEDSTEP_CPU_PIII_T 0x00000003 /* Tualatin core */ -#define SPEEDSTEP_CPU_P4M 0x00000004 /* P4-M */ - +enum speedstep_processor { + SPEEDSTEP_CPU_PIII_C_EARLY = 0x00000001, /* Coppermine core */ + SPEEDSTEP_CPU_PIII_C = 0x00000002, /* Coppermine core */ + SPEEDSTEP_CPU_PIII_T = 0x00000003, /* Tualatin core */ + SPEEDSTEP_CPU_P4M = 0x00000004, /* P4-M */ /* the following processors are not speedstep-capable and are not auto-detected * in speedstep_detect_processor(). However, their speed can be detected using * the speedstep_get_frequency() call. */ -#define SPEEDSTEP_CPU_PM 0xFFFFFF03 /* Pentium M */ -#define SPEEDSTEP_CPU_P4D 0xFFFFFF04 /* desktop P4 */ -#define SPEEDSTEP_CPU_PCORE 0xFFFFFF05 /* Core */ + SPEEDSTEP_CPU_PM = 0xFFFFFF03, /* Pentium M */ + SPEEDSTEP_CPU_P4D = 0xFFFFFF04, /* desktop P4 */ + SPEEDSTEP_CPU_PCORE = 0xFFFFFF05, /* Core */ +}; /* speedstep states -- only two of them */ @@ -31,10 +31,10 @@ /* detect a speedstep-capable processor */ -extern unsigned int speedstep_detect_processor (void); +extern enum speedstep_processor speedstep_detect_processor(void); /* detect the current speed (in khz) of the processor */ -extern unsigned int speedstep_get_frequency(unsigned int processor); +extern unsigned int speedstep_get_frequency(enum speedstep_processor processor); /* detect the low and high speeds of the processor. The callback @@ -42,7 +42,7 @@ extern unsigned int speedstep_get_frequency(unsigned int processor); * SPEEDSTEP_LOW; the second argument is zero so that no * cpufreq_notify_transition calls are initiated. */ -extern unsigned int speedstep_get_freqs(unsigned int processor, +extern unsigned int speedstep_get_freqs(enum speedstep_processor processor, unsigned int *low_speed, unsigned int *high_speed, unsigned int *transition_latency, diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c index befea088e4f5..04d73c114e49 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c @@ -35,7 +35,7 @@ static int smi_cmd; static unsigned int smi_sig; /* info about the processor */ -static unsigned int speedstep_processor; +static enum speedstep_processor speedstep_processor; /* * There are only two frequency states for each processor. Values diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index c900b73f9224..879666f4d871 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -70,7 +70,6 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); - set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE); sched_clock_stable = 1; } @@ -270,8 +269,6 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) node = cpu_to_node(cpu); } numa_set_node(cpu, node); - - printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node); #endif } diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 6c40f6b5b340..eddb1bdd1b8f 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -18,6 +18,7 @@ #include <asm/processor.h> #include <linux/smp.h> #include <asm/k8.h> +#include <asm/smp.h> #define LVL_1_INST 1 #define LVL_1_DATA 2 @@ -31,6 +32,8 @@ struct _cache_table { short size; }; +#define MB(x) ((x) * 1024) + /* All the cache descriptor types we care about (no TLB or trace cache entries) */ @@ -44,9 +47,9 @@ static const struct _cache_table __cpuinitconst cache_table[] = { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */ { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */ { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x23, LVL_3, 1024 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x25, LVL_3, 2048 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x29, LVL_3, 4096 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x25, LVL_3, MB(2) }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x29, LVL_3, MB(4) }, /* 8-way set assoc, sectored cache, 64 byte line size */ { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */ { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */ { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */ @@ -59,16 +62,16 @@ static const struct _cache_table __cpuinitconst cache_table[] = { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */ { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */ { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */ - { 0x44, LVL_2, 1024 }, /* 4-way set assoc, 32 byte line size */ - { 0x45, LVL_2, 2048 }, /* 4-way set assoc, 32 byte line size */ - { 0x46, LVL_3, 4096 }, /* 4-way set assoc, 64 byte line size */ - { 0x47, LVL_3, 8192 }, /* 8-way set assoc, 64 byte line size */ - { 0x49, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */ - { 0x4a, LVL_3, 6144 }, /* 12-way set assoc, 64 byte line size */ - { 0x4b, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */ - { 0x4c, LVL_3, 12288 }, /* 12-way set assoc, 64 byte line size */ - { 0x4d, LVL_3, 16384 }, /* 16-way set assoc, 64 byte line size */ - { 0x4e, LVL_2, 6144 }, /* 24-way set assoc, 64 byte line size */ + { 0x44, LVL_2, MB(1) }, /* 4-way set assoc, 32 byte line size */ + { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */ + { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */ + { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */ + { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ + { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */ + { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ + { 0x4c, LVL_3, MB(12) }, /* 12-way set assoc, 64 byte line size */ + { 0x4d, LVL_3, MB(16) }, /* 16-way set assoc, 64 byte line size */ + { 0x4e, LVL_2, MB(6) }, /* 24-way set assoc, 64 byte line size */ { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */ { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */ { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */ @@ -77,34 +80,34 @@ static const struct _cache_table __cpuinitconst cache_table[] = { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */ { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */ { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */ - { 0x78, LVL_2, 1024 }, /* 4-way set assoc, 64 byte line size */ - { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7c, LVL_2, 1024 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7d, LVL_2, 2048 }, /* 8-way set assoc, 64 byte line size */ - { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */ - { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */ - { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */ - { 0x84, LVL_2, 1024 }, /* 8-way set assoc, 32 byte line size */ - { 0x85, LVL_2, 2048 }, /* 8-way set assoc, 32 byte line size */ - { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */ - { 0x87, LVL_2, 1024 }, /* 8-way set assoc, 64 byte line size */ - { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */ - { 0xd1, LVL_3, 1024 }, /* 4-way set assoc, 64 byte line size */ - { 0xd2, LVL_3, 2048 }, /* 4-way set assoc, 64 byte line size */ - { 0xd6, LVL_3, 1024 }, /* 8-way set assoc, 64 byte line size */ - { 0xd7, LVL_3, 2048 }, /* 8-way set assoc, 64 byte line size */ - { 0xd8, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ - { 0xdc, LVL_3, 2048 }, /* 12-way set assoc, 64 byte line size */ - { 0xdd, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ - { 0xde, LVL_3, 8192 }, /* 12-way set assoc, 64 byte line size */ - { 0xe2, LVL_3, 2048 }, /* 16-way set assoc, 64 byte line size */ - { 0xe3, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */ - { 0xe4, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */ - { 0xea, LVL_3, 12288 }, /* 24-way set assoc, 64 byte line size */ - { 0xeb, LVL_3, 18432 }, /* 24-way set assoc, 64 byte line size */ - { 0xec, LVL_3, 24576 }, /* 24-way set assoc, 64 byte line size */ + { 0x78, LVL_2, MB(1) }, /* 4-way set assoc, 64 byte line size */ + { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */ + { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */ + { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */ + { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */ + { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */ + { 0x85, LVL_2, MB(2) }, /* 8-way set assoc, 32 byte line size */ + { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */ + { 0x87, LVL_2, MB(1) }, /* 8-way set assoc, 64 byte line size */ + { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */ + { 0xd1, LVL_3, MB(1) }, /* 4-way set assoc, 64 byte line size */ + { 0xd2, LVL_3, MB(2) }, /* 4-way set assoc, 64 byte line size */ + { 0xd6, LVL_3, MB(1) }, /* 8-way set assoc, 64 byte line size */ + { 0xd7, LVL_3, MB(2) }, /* 8-way set assoc, 64 byte line size */ + { 0xd8, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */ + { 0xdc, LVL_3, MB(2) }, /* 12-way set assoc, 64 byte line size */ + { 0xdd, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */ + { 0xde, LVL_3, MB(8) }, /* 12-way set assoc, 64 byte line size */ + { 0xe2, LVL_3, MB(2) }, /* 16-way set assoc, 64 byte line size */ + { 0xe3, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ + { 0xe4, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ + { 0xea, LVL_3, MB(12) }, /* 24-way set assoc, 64 byte line size */ + { 0xeb, LVL_3, MB(18) }, /* 24-way set assoc, 64 byte line size */ + { 0xec, LVL_3, MB(24) }, /* 24-way set assoc, 64 byte line size */ { 0x00, 0, 0} }; @@ -150,7 +153,8 @@ struct _cpuid4_info { union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; unsigned long size; - unsigned long can_disable; + bool can_disable; + unsigned int l3_indices; DECLARE_BITMAP(shared_cpu_map, NR_CPUS); }; @@ -160,7 +164,8 @@ struct _cpuid4_info_regs { union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; unsigned long size; - unsigned long can_disable; + bool can_disable; + unsigned int l3_indices; }; unsigned short num_cache_leaves; @@ -290,6 +295,36 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, (ebx->split.ways_of_associativity + 1) - 1; } +struct _cache_attr { + struct attribute attr; + ssize_t (*show)(struct _cpuid4_info *, char *); + ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count); +}; + +#ifdef CONFIG_CPU_SUP_AMD +static unsigned int __cpuinit amd_calc_l3_indices(void) +{ + /* + * We're called over smp_call_function_single() and therefore + * are on the correct cpu. + */ + int cpu = smp_processor_id(); + int node = cpu_to_node(cpu); + struct pci_dev *dev = node_to_k8_nb_misc(node); + unsigned int sc0, sc1, sc2, sc3; + u32 val = 0; + + pci_read_config_dword(dev, 0x1C4, &val); + + /* calculate subcache sizes */ + sc0 = !(val & BIT(0)); + sc1 = !(val & BIT(4)); + sc2 = !(val & BIT(8)) + !(val & BIT(9)); + sc3 = !(val & BIT(12)) + !(val & BIT(13)); + + return (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1; +} + static void __cpuinit amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) { @@ -299,13 +334,104 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) if (boot_cpu_data.x86 == 0x11) return; - /* see erratum #382 */ - if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8)) + /* see errata #382 and #388 */ + if ((boot_cpu_data.x86 == 0x10) && + ((boot_cpu_data.x86_model < 0x8) || + (boot_cpu_data.x86_mask < 0x1))) return; - this_leaf->can_disable = 1; + this_leaf->can_disable = true; + this_leaf->l3_indices = amd_calc_l3_indices(); +} + +static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, + unsigned int index) +{ + int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); + int node = amd_get_nb_id(cpu); + struct pci_dev *dev = node_to_k8_nb_misc(node); + unsigned int reg = 0; + + if (!this_leaf->can_disable) + return -EINVAL; + + if (!dev) + return -EINVAL; + + pci_read_config_dword(dev, 0x1BC + index * 4, ®); + return sprintf(buf, "0x%08x\n", reg); +} + +#define SHOW_CACHE_DISABLE(index) \ +static ssize_t \ +show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \ +{ \ + return show_cache_disable(this_leaf, buf, index); \ +} +SHOW_CACHE_DISABLE(0) +SHOW_CACHE_DISABLE(1) + +static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, + const char *buf, size_t count, unsigned int index) +{ + int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); + int node = amd_get_nb_id(cpu); + struct pci_dev *dev = node_to_k8_nb_misc(node); + unsigned long val = 0; + +#define SUBCACHE_MASK (3UL << 20) +#define SUBCACHE_INDEX 0xfff + + if (!this_leaf->can_disable) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!dev) + return -EINVAL; + + if (strict_strtoul(buf, 10, &val) < 0) + return -EINVAL; + + /* do not allow writes outside of allowed bits */ + if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || + ((val & SUBCACHE_INDEX) > this_leaf->l3_indices)) + return -EINVAL; + + val |= BIT(30); + pci_write_config_dword(dev, 0x1BC + index * 4, val); + /* + * We need to WBINVD on a core on the node containing the L3 cache which + * indices we disable therefore a simple wbinvd() is not sufficient. + */ + wbinvd_on_cpu(cpu); + pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31)); + return count; } +#define STORE_CACHE_DISABLE(index) \ +static ssize_t \ +store_cache_disable_##index(struct _cpuid4_info *this_leaf, \ + const char *buf, size_t count) \ +{ \ + return store_cache_disable(this_leaf, buf, count, index); \ +} +STORE_CACHE_DISABLE(0) +STORE_CACHE_DISABLE(1) + +static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, + show_cache_disable_0, store_cache_disable_0); +static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, + show_cache_disable_1, store_cache_disable_1); + +#else /* CONFIG_CPU_SUP_AMD */ +static void __cpuinit +amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) +{ +}; +#endif /* CONFIG_CPU_SUP_AMD */ + static int __cpuinit cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf) @@ -499,26 +625,27 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) #ifdef CONFIG_SYSFS /* pointer to _cpuid4_info array (for each cache leaf) */ -static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info); -#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y])) +static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info); +#define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y])) #ifdef CONFIG_SMP static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) { struct _cpuid4_info *this_leaf, *sibling_leaf; unsigned long num_threads_sharing; - int index_msb, i; + int index_msb, i, sibling; struct cpuinfo_x86 *c = &cpu_data(cpu); if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { - struct cpuinfo_x86 *d; - for_each_online_cpu(i) { - if (!per_cpu(cpuid4_info, i)) + for_each_cpu(i, c->llc_shared_map) { + if (!per_cpu(ici_cpuid4_info, i)) continue; - d = &cpu_data(i); this_leaf = CPUID4_INFO_IDX(i, index); - cpumask_copy(to_cpumask(this_leaf->shared_cpu_map), - d->llc_shared_map); + for_each_cpu(sibling, c->llc_shared_map) { + if (!cpu_online(sibling)) + continue; + set_bit(sibling, this_leaf->shared_cpu_map); + } } return; } @@ -535,7 +662,7 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) c->apicid >> index_msb) { cpumask_set_cpu(i, to_cpumask(this_leaf->shared_cpu_map)); - if (i != cpu && per_cpu(cpuid4_info, i)) { + if (i != cpu && per_cpu(ici_cpuid4_info, i)) { sibling_leaf = CPUID4_INFO_IDX(i, index); cpumask_set_cpu(cpu, to_cpumask( @@ -574,8 +701,8 @@ static void __cpuinit free_cache_attributes(unsigned int cpu) for (i = 0; i < num_cache_leaves; i++) cache_remove_shared_cpu_map(cpu, i); - kfree(per_cpu(cpuid4_info, cpu)); - per_cpu(cpuid4_info, cpu) = NULL; + kfree(per_cpu(ici_cpuid4_info, cpu)); + per_cpu(ici_cpuid4_info, cpu) = NULL; } static int @@ -614,15 +741,15 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu) if (num_cache_leaves == 0) return -ENOENT; - per_cpu(cpuid4_info, cpu) = kzalloc( + per_cpu(ici_cpuid4_info, cpu) = kzalloc( sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); - if (per_cpu(cpuid4_info, cpu) == NULL) + if (per_cpu(ici_cpuid4_info, cpu) == NULL) return -ENOMEM; smp_call_function_single(cpu, get_cpu_leaves, &retval, true); if (retval) { - kfree(per_cpu(cpuid4_info, cpu)); - per_cpu(cpuid4_info, cpu) = NULL; + kfree(per_cpu(ici_cpuid4_info, cpu)); + per_cpu(ici_cpuid4_info, cpu) = NULL; } return retval; @@ -634,7 +761,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu) extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */ /* pointer to kobject for cpuX/cache */ -static DEFINE_PER_CPU(struct kobject *, cache_kobject); +static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject); struct _index_kobject { struct kobject kobj; @@ -643,8 +770,8 @@ struct _index_kobject { }; /* pointer to array of kobjects for cpuX/cache/indexY */ -static DEFINE_PER_CPU(struct _index_kobject *, index_kobject); -#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y])) +static DEFINE_PER_CPU(struct _index_kobject *, ici_index_kobject); +#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y])) #define show_one_plus(file_name, object, val) \ static ssize_t show_##file_name \ @@ -710,82 +837,6 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) #define to_object(k) container_of(k, struct _index_kobject, kobj) #define to_attr(a) container_of(a, struct _cache_attr, attr) -static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, - unsigned int index) -{ - int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - int node = cpu_to_node(cpu); - struct pci_dev *dev = node_to_k8_nb_misc(node); - unsigned int reg = 0; - - if (!this_leaf->can_disable) - return -EINVAL; - - if (!dev) - return -EINVAL; - - pci_read_config_dword(dev, 0x1BC + index * 4, ®); - return sprintf(buf, "%x\n", reg); -} - -#define SHOW_CACHE_DISABLE(index) \ -static ssize_t \ -show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \ -{ \ - return show_cache_disable(this_leaf, buf, index); \ -} -SHOW_CACHE_DISABLE(0) -SHOW_CACHE_DISABLE(1) - -static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, - const char *buf, size_t count, unsigned int index) -{ - int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - int node = cpu_to_node(cpu); - struct pci_dev *dev = node_to_k8_nb_misc(node); - unsigned long val = 0; - unsigned int scrubber = 0; - - if (!this_leaf->can_disable) - return -EINVAL; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (!dev) - return -EINVAL; - - if (strict_strtoul(buf, 10, &val) < 0) - return -EINVAL; - - val |= 0xc0000000; - - pci_read_config_dword(dev, 0x58, &scrubber); - scrubber &= ~0x1f000000; - pci_write_config_dword(dev, 0x58, scrubber); - - pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000); - wbinvd(); - pci_write_config_dword(dev, 0x1BC + index * 4, val); - return count; -} - -#define STORE_CACHE_DISABLE(index) \ -static ssize_t \ -store_cache_disable_##index(struct _cpuid4_info *this_leaf, \ - const char *buf, size_t count) \ -{ \ - return store_cache_disable(this_leaf, buf, count, index); \ -} -STORE_CACHE_DISABLE(0) -STORE_CACHE_DISABLE(1) - -struct _cache_attr { - struct attribute attr; - ssize_t (*show)(struct _cpuid4_info *, char *); - ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count); -}; - #define define_one_ro(_name) \ static struct _cache_attr _name = \ __ATTR(_name, 0444, show_##_name, NULL) @@ -800,23 +851,28 @@ define_one_ro(size); define_one_ro(shared_cpu_map); define_one_ro(shared_cpu_list); -static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, - show_cache_disable_0, store_cache_disable_0); -static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, - show_cache_disable_1, store_cache_disable_1); +#define DEFAULT_SYSFS_CACHE_ATTRS \ + &type.attr, \ + &level.attr, \ + &coherency_line_size.attr, \ + &physical_line_partition.attr, \ + &ways_of_associativity.attr, \ + &number_of_sets.attr, \ + &size.attr, \ + &shared_cpu_map.attr, \ + &shared_cpu_list.attr static struct attribute *default_attrs[] = { - &type.attr, - &level.attr, - &coherency_line_size.attr, - &physical_line_partition.attr, - &ways_of_associativity.attr, - &number_of_sets.attr, - &size.attr, - &shared_cpu_map.attr, - &shared_cpu_list.attr, + DEFAULT_SYSFS_CACHE_ATTRS, + NULL +}; + +static struct attribute *default_l3_attrs[] = { + DEFAULT_SYSFS_CACHE_ATTRS, +#ifdef CONFIG_CPU_SUP_AMD &cache_disable_0.attr, &cache_disable_1.attr, +#endif NULL }; @@ -863,10 +919,10 @@ static struct kobj_type ktype_percpu_entry = { static void __cpuinit cpuid4_cache_sysfs_exit(unsigned int cpu) { - kfree(per_cpu(cache_kobject, cpu)); - kfree(per_cpu(index_kobject, cpu)); - per_cpu(cache_kobject, cpu) = NULL; - per_cpu(index_kobject, cpu) = NULL; + kfree(per_cpu(ici_cache_kobject, cpu)); + kfree(per_cpu(ici_index_kobject, cpu)); + per_cpu(ici_cache_kobject, cpu) = NULL; + per_cpu(ici_index_kobject, cpu) = NULL; free_cache_attributes(cpu); } @@ -882,14 +938,14 @@ static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu) return err; /* Allocate all required memory */ - per_cpu(cache_kobject, cpu) = + per_cpu(ici_cache_kobject, cpu) = kzalloc(sizeof(struct kobject), GFP_KERNEL); - if (unlikely(per_cpu(cache_kobject, cpu) == NULL)) + if (unlikely(per_cpu(ici_cache_kobject, cpu) == NULL)) goto err_out; - per_cpu(index_kobject, cpu) = kzalloc( + per_cpu(ici_index_kobject, cpu) = kzalloc( sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL); - if (unlikely(per_cpu(index_kobject, cpu) == NULL)) + if (unlikely(per_cpu(ici_index_kobject, cpu) == NULL)) goto err_out; return 0; @@ -907,13 +963,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) unsigned int cpu = sys_dev->id; unsigned long i, j; struct _index_kobject *this_object; + struct _cpuid4_info *this_leaf; int retval; retval = cpuid4_cache_sysfs_init(cpu); if (unlikely(retval < 0)) return retval; - retval = kobject_init_and_add(per_cpu(cache_kobject, cpu), + retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu), &ktype_percpu_entry, &sys_dev->kobj, "%s", "cache"); if (retval < 0) { @@ -925,14 +982,22 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) this_object = INDEX_KOBJECT_PTR(cpu, i); this_object->cpu = cpu; this_object->index = i; + + this_leaf = CPUID4_INFO_IDX(cpu, i); + + if (this_leaf->can_disable) + ktype_cache.default_attrs = default_l3_attrs; + else + ktype_cache.default_attrs = default_attrs; + retval = kobject_init_and_add(&(this_object->kobj), &ktype_cache, - per_cpu(cache_kobject, cpu), + per_cpu(ici_cache_kobject, cpu), "index%1lu", i); if (unlikely(retval)) { for (j = 0; j < i; j++) kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj)); - kobject_put(per_cpu(cache_kobject, cpu)); + kobject_put(per_cpu(ici_cache_kobject, cpu)); cpuid4_cache_sysfs_exit(cpu); return retval; } @@ -940,7 +1005,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) } cpumask_set_cpu(cpu, to_cpumask(cache_dev_map)); - kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD); + kobject_uevent(per_cpu(ici_cache_kobject, cpu), KOBJ_ADD); return 0; } @@ -949,7 +1014,7 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) unsigned int cpu = sys_dev->id; unsigned long i; - if (per_cpu(cpuid4_info, cpu) == NULL) + if (per_cpu(ici_cpuid4_info, cpu) == NULL) return; if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map))) return; @@ -957,7 +1022,7 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) for (i = 0; i < num_cache_leaves; i++) kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj)); - kobject_put(per_cpu(cache_kobject, cpu)); + kobject_put(per_cpu(ici_cache_kobject, cpu)); cpuid4_cache_sysfs_exit(cpu); } diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 472763d92098..73734baa50f2 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -74,7 +74,7 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs) m->finished = 0; } -static cpumask_t mce_inject_cpumask; +static cpumask_var_t mce_inject_cpumask; static int mce_raise_notify(struct notifier_block *self, unsigned long val, void *data) @@ -82,9 +82,9 @@ static int mce_raise_notify(struct notifier_block *self, struct die_args *args = (struct die_args *)data; int cpu = smp_processor_id(); struct mce *m = &__get_cpu_var(injectm); - if (val != DIE_NMI_IPI || !cpu_isset(cpu, mce_inject_cpumask)) + if (val != DIE_NMI_IPI || !cpumask_test_cpu(cpu, mce_inject_cpumask)) return NOTIFY_DONE; - cpu_clear(cpu, mce_inject_cpumask); + cpumask_clear_cpu(cpu, mce_inject_cpumask); if (m->inject_flags & MCJ_EXCEPTION) raise_exception(m, args->regs); else if (m->status) @@ -148,22 +148,22 @@ static void raise_mce(struct mce *m) unsigned long start; int cpu; get_online_cpus(); - mce_inject_cpumask = cpu_online_map; - cpu_clear(get_cpu(), mce_inject_cpumask); + cpumask_copy(mce_inject_cpumask, cpu_online_mask); + cpumask_clear_cpu(get_cpu(), mce_inject_cpumask); for_each_online_cpu(cpu) { struct mce *mcpu = &per_cpu(injectm, cpu); if (!mcpu->finished || MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) - cpu_clear(cpu, mce_inject_cpumask); + cpumask_clear_cpu(cpu, mce_inject_cpumask); } - if (!cpus_empty(mce_inject_cpumask)) - apic->send_IPI_mask(&mce_inject_cpumask, NMI_VECTOR); + if (!cpumask_empty(mce_inject_cpumask)) + apic->send_IPI_mask(mce_inject_cpumask, NMI_VECTOR); start = jiffies; - while (!cpus_empty(mce_inject_cpumask)) { + while (!cpumask_empty(mce_inject_cpumask)) { if (!time_before(jiffies, start + 2*HZ)) { printk(KERN_ERR "Timeout waiting for mce inject NMI %lx\n", - *cpus_addr(mce_inject_cpumask)); + *cpumask_bits(mce_inject_cpumask)); break; } cpu_relax(); @@ -210,6 +210,8 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf, static int inject_init(void) { + if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL)) + return -ENOMEM; printk(KERN_INFO "Machine check injector initialized\n"); mce_chrdev_ops.write = mce_write; register_die_notifier(&mce_raise_nb); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index d7ebf25d10ed..a8aacd4b513c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1388,13 +1388,14 @@ static void __mcheck_cpu_init_timer(void) struct timer_list *t = &__get_cpu_var(mce_timer); int *n = &__get_cpu_var(mce_next_interval); + setup_timer(t, mce_start_timer, smp_processor_id()); + if (mce_ignore_ce) return; *n = check_interval * HZ; if (!*n) return; - setup_timer(t, mce_start_timer, smp_processor_id()); t->expires = round_jiffies(jiffies + *n); add_timer_on(t, smp_processor_id()); } @@ -1928,7 +1929,7 @@ error2: sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr); error: while (--i >= 0) - sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); + sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); sysdev_unregister(&per_cpu(mce_dev, cpu)); diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 4fef985fc221..81c499eceb21 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -256,6 +256,16 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) ack_APIC_irq(); } +/* Thermal monitoring depends on APIC, ACPI and clock modulation */ +static int intel_thermal_supported(struct cpuinfo_x86 *c) +{ + if (!cpu_has_apic) + return 0; + if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) + return 0; + return 1; +} + void __init mcheck_intel_therm_init(void) { /* @@ -263,8 +273,7 @@ void __init mcheck_intel_therm_init(void) * LVT value on BSP and use that value to restore APs' thermal LVT * entry BIOS programmed later */ - if (cpu_has(&boot_cpu_data, X86_FEATURE_ACPI) && - cpu_has(&boot_cpu_data, X86_FEATURE_ACC)) + if (intel_thermal_supported(&boot_cpu_data)) lvtthmr_init = apic_read(APIC_LVTTHMR); } @@ -274,8 +283,7 @@ void intel_init_thermal(struct cpuinfo_x86 *c) int tm2 = 0; u32 l, h; - /* Thermal monitoring depends on ACPI and clock modulation*/ - if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) + if (!intel_thermal_supported(c)) return; /* @@ -339,8 +347,8 @@ void intel_init_thermal(struct cpuinfo_x86 *c) l = apic_read(APIC_LVTTHMR); apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", - cpu, tm2 ? "TM2" : "TM1"); + printk_once(KERN_INFO "CPU0: Thermal monitoring enabled (%s)\n", + tm2 ? "TM2" : "TM1"); /* enable thermal throttle processing */ atomic_set(&therm_throt_en, 1); diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile index f4361b56f8e9..ad9e5ed81181 100644 --- a/arch/x86/kernel/cpu/mtrr/Makefile +++ b/arch/x86/kernel/cpu/mtrr/Makefile @@ -1,3 +1,3 @@ -obj-y := main.o if.o generic.o state.o cleanup.o +obj-y := main.o if.o generic.o cleanup.o obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c index 33af14110dfd..92ba9cd31c9a 100644 --- a/arch/x86/kernel/cpu/mtrr/amd.c +++ b/arch/x86/kernel/cpu/mtrr/amd.c @@ -108,7 +108,7 @@ amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type) return 0; } -static struct mtrr_ops amd_mtrr_ops = { +static const struct mtrr_ops amd_mtrr_ops = { .vendor = X86_VENDOR_AMD, .set = amd_set_mtrr, .get = amd_get_mtrr, diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c index de89f14eff3a..316fe3e60a97 100644 --- a/arch/x86/kernel/cpu/mtrr/centaur.c +++ b/arch/x86/kernel/cpu/mtrr/centaur.c @@ -110,7 +110,7 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t return 0; } -static struct mtrr_ops centaur_mtrr_ops = { +static const struct mtrr_ops centaur_mtrr_ops = { .vendor = X86_VENDOR_CENTAUR, .set = centaur_set_mcr, .get = centaur_get_mcr, diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index 228d982ce09c..68a3343e5798 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -265,7 +265,7 @@ static void cyrix_set_all(void) post_set(); } -static struct mtrr_ops cyrix_mtrr_ops = { +static const struct mtrr_ops cyrix_mtrr_ops = { .vendor = X86_VENDOR_CYRIX, .set_all = cyrix_set_all, .set = cyrix_set_arr, diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 55da0c5f68dd..9aa5dc76ff4a 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -464,7 +464,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, tmp |= ~((1<<(hi - 1)) - 1); if (tmp != mask_lo) { - WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n"); + printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n"); mask_lo = tmp; } } @@ -570,7 +570,7 @@ static unsigned long set_mtrr_state(void) static unsigned long cr4; -static DEFINE_SPINLOCK(set_atomicity_lock); +static DEFINE_RAW_SPINLOCK(set_atomicity_lock); /* * Since we are disabling the cache don't allow any interrupts, @@ -590,7 +590,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) * changes to the way the kernel boots */ - spin_lock(&set_atomicity_lock); + raw_spin_lock(&set_atomicity_lock); /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ cr0 = read_cr0() | X86_CR0_CD; @@ -627,7 +627,7 @@ static void post_set(void) __releases(set_atomicity_lock) /* Restore value of CR4 */ if (cpu_has_pge) write_cr4(cr4); - spin_unlock(&set_atomicity_lock); + raw_spin_unlock(&set_atomicity_lock); } static void generic_set_all(void) @@ -752,7 +752,7 @@ int positive_have_wrcomb(void) /* * Generic structure... */ -struct mtrr_ops generic_mtrr_ops = { +const struct mtrr_ops generic_mtrr_ops = { .use_intel_if = 1, .set_all = generic_set_all, .get = generic_get_mtrr, diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index 3c1b12d461d1..e006e56f699c 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -4,6 +4,7 @@ #include <linux/proc_fs.h> #include <linux/module.h> #include <linux/ctype.h> +#include <linux/string.h> #include <linux/init.h> #define LINE_SIZE 80 @@ -133,8 +134,7 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) return -EINVAL; base = simple_strtoull(line + 5, &ptr, 0); - while (isspace(*ptr)) - ptr++; + ptr = skip_spaces(ptr); if (strncmp(ptr, "size=", 5)) return -EINVAL; @@ -142,14 +142,11 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) size = simple_strtoull(ptr + 5, &ptr, 0); if ((base & 0xfff) || (size & 0xfff)) return -EINVAL; - while (isspace(*ptr)) - ptr++; + ptr = skip_spaces(ptr); if (strncmp(ptr, "type=", 5)) return -EINVAL; - ptr += 5; - while (isspace(*ptr)) - ptr++; + ptr = skip_spaces(ptr + 5); for (i = 0; i < MTRR_NUM_TYPES; ++i) { if (strcmp(ptr, mtrr_strings[i])) diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 84e83de54575..fe4622e8c837 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -60,14 +60,14 @@ static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; static bool mtrr_aps_delayed_init; -static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; +static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; -struct mtrr_ops *mtrr_if; +const struct mtrr_ops *mtrr_if; static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type); -void set_mtrr_ops(struct mtrr_ops *ops) +void set_mtrr_ops(const struct mtrr_ops *ops) { if (ops->vendor && ops->vendor < X86_VENDOR_NUM) mtrr_ops[ops->vendor] = ops; diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index a501dee9a87a..df5e41f31a27 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -32,7 +32,7 @@ extern int generic_get_free_region(unsigned long base, unsigned long size, extern int generic_validate_add_page(unsigned long base, unsigned long size, unsigned int type); -extern struct mtrr_ops generic_mtrr_ops; +extern const struct mtrr_ops generic_mtrr_ops; extern int positive_have_wrcomb(void); @@ -53,10 +53,10 @@ void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); void get_mtrr_state(void); -extern void set_mtrr_ops(struct mtrr_ops *ops); +extern void set_mtrr_ops(const struct mtrr_ops *ops); extern u64 size_or_mask, size_and_mask; -extern struct mtrr_ops *mtrr_if; +extern const struct mtrr_ops *mtrr_if; #define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) #define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c deleted file mode 100644 index dfc80b4e6b0d..000000000000 --- a/arch/x86/kernel/cpu/mtrr/state.c +++ /dev/null @@ -1,94 +0,0 @@ -#include <linux/init.h> -#include <linux/io.h> -#include <linux/mm.h> - -#include <asm/processor-cyrix.h> -#include <asm/processor-flags.h> -#include <asm/mtrr.h> -#include <asm/msr.h> - -#include "mtrr.h" - -/* Put the processor into a state where MTRRs can be safely set */ -void set_mtrr_prepare_save(struct set_mtrr_context *ctxt) -{ - unsigned int cr0; - - /* Disable interrupts locally */ - local_irq_save(ctxt->flags); - - if (use_intel() || is_cpu(CYRIX)) { - - /* Save value of CR4 and clear Page Global Enable (bit 7) */ - if (cpu_has_pge) { - ctxt->cr4val = read_cr4(); - write_cr4(ctxt->cr4val & ~X86_CR4_PGE); - } - - /* - * Disable and flush caches. Note that wbinvd flushes the TLBs - * as a side-effect - */ - cr0 = read_cr0() | X86_CR0_CD; - wbinvd(); - write_cr0(cr0); - wbinvd(); - - if (use_intel()) { - /* Save MTRR state */ - rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); - } else { - /* - * Cyrix ARRs - - * everything else were excluded at the top - */ - ctxt->ccr3 = getCx86(CX86_CCR3); - } - } -} - -void set_mtrr_cache_disable(struct set_mtrr_context *ctxt) -{ - if (use_intel()) { - /* Disable MTRRs, and set the default type to uncached */ - mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL, - ctxt->deftype_hi); - } else { - if (is_cpu(CYRIX)) { - /* Cyrix ARRs - everything else were excluded at the top */ - setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10); - } - } -} - -/* Restore the processor after a set_mtrr_prepare */ -void set_mtrr_done(struct set_mtrr_context *ctxt) -{ - if (use_intel() || is_cpu(CYRIX)) { - - /* Flush caches and TLBs */ - wbinvd(); - - /* Restore MTRRdefType */ - if (use_intel()) { - /* Intel (P6) standard MTRRs */ - mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, - ctxt->deftype_hi); - } else { - /* - * Cyrix ARRs - - * everything else was excluded at the top - */ - setCx86(CX86_CCR3, ctxt->ccr3); - } - - /* Enable caches */ - write_cr0(read_cr0() & 0xbfffffff); - - /* Restore value of CR4 */ - if (cpu_has_pge) - write_cr4(ctxt->cr4val); - } - /* Re-enable interrupts locally (if enabled previously) */ - local_irq_restore(ctxt->flags); -} diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index c1bbed1021d9..641ccb9dddbc 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -7,6 +7,7 @@ * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> + * Copyright (C) 2009 Google, Inc., Stephane Eranian * * For licencing details see kernel-base/COPYING */ @@ -22,6 +23,7 @@ #include <linux/uaccess.h> #include <linux/highmem.h> #include <linux/cpu.h> +#include <linux/bitops.h> #include <asm/apic.h> #include <asm/stacktrace.h> @@ -68,26 +70,59 @@ struct debug_store { u64 pebs_event_reset[MAX_PEBS_EVENTS]; }; +struct event_constraint { + union { + unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + u64 idxmsk64[1]; + }; + int code; + int cmask; + int weight; +}; + +struct amd_nb { + int nb_id; /* NorthBridge id */ + int refcnt; /* reference count */ + struct perf_event *owners[X86_PMC_IDX_MAX]; + struct event_constraint event_constraints[X86_PMC_IDX_MAX]; +}; + struct cpu_hw_events { - struct perf_event *events[X86_PMC_IDX_MAX]; - unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; int enabled; struct debug_store *ds; -}; -struct event_constraint { - unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - int code; + int n_events; + int n_added; + int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ + u64 tags[X86_PMC_IDX_MAX]; + struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ + struct amd_nb *amd_nb; }; -#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) } -#define EVENT_CONSTRAINT_END { .code = 0, .idxmsk[0] = 0 } +#define __EVENT_CONSTRAINT(c, n, m, w) {\ + { .idxmsk64[0] = (n) }, \ + .code = (c), \ + .cmask = (m), \ + .weight = (w), \ +} + +#define EVENT_CONSTRAINT(c, n, m) \ + __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n)) -#define for_each_event_constraint(e, c) \ - for ((e) = (c); (e)->idxmsk[0]; (e)++) +#define INTEL_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK) +#define FIXED_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, INTEL_ARCH_FIXED_MASK) + +#define EVENT_CONSTRAINT_END \ + EVENT_CONSTRAINT(0, 0, 0) + +#define for_each_event_constraint(e, c) \ + for ((e) = (c); (e)->cmask; (e)++) /* * struct x86_pmu - generic x86 pmu @@ -114,8 +149,14 @@ struct x86_pmu { u64 intel_ctrl; void (*enable_bts)(u64 config); void (*disable_bts)(void); - int (*get_event_idx)(struct cpu_hw_events *cpuc, - struct hw_perf_event *hwc); + + struct event_constraint * + (*get_event_constraints)(struct cpu_hw_events *cpuc, + struct perf_event *event); + + void (*put_event_constraints)(struct cpu_hw_events *cpuc, + struct perf_event *event); + struct event_constraint *event_constraints; }; static struct x86_pmu x86_pmu __read_mostly; @@ -124,111 +165,8 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; -static const struct event_constraint *event_constraints; - -/* - * Not sure about some of these - */ -static const u64 p6_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, - [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, -}; - -static u64 p6_pmu_event_map(int hw_event) -{ - return p6_perfmon_event_map[hw_event]; -} - -/* - * Event setting that is specified not to count anything. - * We use this to effectively disable a counter. - * - * L2_RQSTS with 0 MESI unit mask. - */ -#define P6_NOP_EVENT 0x0000002EULL - -static u64 p6_pmu_raw_event(u64 hw_event) -{ -#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL -#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL -#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL -#define P6_EVNTSEL_INV_MASK 0x00800000ULL -#define P6_EVNTSEL_REG_MASK 0xFF000000ULL - -#define P6_EVNTSEL_MASK \ - (P6_EVNTSEL_EVENT_MASK | \ - P6_EVNTSEL_UNIT_MASK | \ - P6_EVNTSEL_EDGE_MASK | \ - P6_EVNTSEL_INV_MASK | \ - P6_EVNTSEL_REG_MASK) - - return hw_event & P6_EVNTSEL_MASK; -} - -static const struct event_constraint intel_p6_event_constraints[] = -{ - EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ - EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ - EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */ - EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ - EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ - EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ - EVENT_CONSTRAINT_END -}; - -/* - * Intel PerfMon v3. Used on Core2 and later. - */ -static const u64 intel_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, - [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, -}; - -static const struct event_constraint intel_core_event_constraints[] = -{ - EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ - EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ - EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ - EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ - EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ - EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */ - EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ - EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */ - EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */ - EVENT_CONSTRAINT_END -}; - -static const struct event_constraint intel_nehalem_event_constraints[] = -{ - EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ - EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ - EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ - EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */ - EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */ - EVENT_CONSTRAINT(0x4c, 0x3), /* LOAD_HIT_PRE */ - EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ - EVENT_CONSTRAINT(0x52, 0x3), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */ - EVENT_CONSTRAINT(0x53, 0x3), /* L1D_CACHE_LOCK_FB_HIT */ - EVENT_CONSTRAINT(0xc5, 0x3), /* CACHE_LOCK_CYCLES */ - EVENT_CONSTRAINT_END -}; - -static u64 intel_pmu_event_map(int hw_event) -{ - return intel_perfmon_event_map[hw_event]; -} +static int x86_perf_event_set_period(struct perf_event *event, + struct hw_perf_event *hwc, int idx); /* * Generalized hw caching related hw_event table, filled @@ -245,424 +183,6 @@ static u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; -static __initconst u64 nehalem_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ - [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ - [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ - [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ - [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ - [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ - [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ - [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static __initconst u64 core2_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ - [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ - [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */ - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ - [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ - [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ - [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static __initconst u64 atom_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ - [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ - [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ - [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static u64 intel_pmu_raw_event(u64 hw_event) -{ -#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL -#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL -#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL -#define CORE_EVNTSEL_INV_MASK 0x00800000ULL -#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL - -#define CORE_EVNTSEL_MASK \ - (CORE_EVNTSEL_EVENT_MASK | \ - CORE_EVNTSEL_UNIT_MASK | \ - CORE_EVNTSEL_EDGE_MASK | \ - CORE_EVNTSEL_INV_MASK | \ - CORE_EVNTSEL_REG_MASK) - - return hw_event & CORE_EVNTSEL_MASK; -} - -static __initconst u64 amd_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */ - [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */ - [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */ - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */ - [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ - [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */ - [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -/* - * AMD Performance Monitor K7 and later. - */ -static const u64 amd_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, - [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, -}; - -static u64 amd_pmu_event_map(int hw_event) -{ - return amd_perfmon_event_map[hw_event]; -} - -static u64 amd_pmu_raw_event(u64 hw_event) -{ -#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL -#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL -#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL -#define K7_EVNTSEL_INV_MASK 0x000800000ULL -#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL - -#define K7_EVNTSEL_MASK \ - (K7_EVNTSEL_EVENT_MASK | \ - K7_EVNTSEL_UNIT_MASK | \ - K7_EVNTSEL_EDGE_MASK | \ - K7_EVNTSEL_INV_MASK | \ - K7_EVNTSEL_REG_MASK) - - return hw_event & K7_EVNTSEL_MASK; -} - /* * Propagate event elapsed time into the generic event. * Can only be executed on the CPU where the event is active. @@ -914,42 +434,6 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr) return 0; } -static void intel_pmu_enable_bts(u64 config) -{ - unsigned long debugctlmsr; - - debugctlmsr = get_debugctlmsr(); - - debugctlmsr |= X86_DEBUGCTL_TR; - debugctlmsr |= X86_DEBUGCTL_BTS; - debugctlmsr |= X86_DEBUGCTL_BTINT; - - if (!(config & ARCH_PERFMON_EVENTSEL_OS)) - debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS; - - if (!(config & ARCH_PERFMON_EVENTSEL_USR)) - debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR; - - update_debugctlmsr(debugctlmsr); -} - -static void intel_pmu_disable_bts(void) -{ - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - unsigned long debugctlmsr; - - if (!cpuc->ds) - return; - - debugctlmsr = get_debugctlmsr(); - - debugctlmsr &= - ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT | - X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR); - - update_debugctlmsr(debugctlmsr); -} - /* * Setup the hardware configuration for a given attr_type */ @@ -988,6 +472,8 @@ static int __hw_perf_event_init(struct perf_event *event) hwc->config = ARCH_PERFMON_EVENTSEL_INT; hwc->idx = -1; + hwc->last_cpu = -1; + hwc->last_tag = ~0ULL; /* * Count user and OS events unless requested not to. @@ -1056,216 +542,323 @@ static int __hw_perf_event_init(struct perf_event *event) return 0; } -static void p6_pmu_disable_all(void) +static void x86_pmu_disable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - u64 val; - - if (!cpuc->enabled) - return; + int idx; - cpuc->enabled = 0; - barrier(); + for (idx = 0; idx < x86_pmu.num_events; idx++) { + u64 val; - /* p6 only has one enable register */ - rdmsrl(MSR_P6_EVNTSEL0, val); - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_P6_EVNTSEL0, val); + if (!test_bit(idx, cpuc->active_mask)) + continue; + rdmsrl(x86_pmu.eventsel + idx, val); + if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) + continue; + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(x86_pmu.eventsel + idx, val); + } } -static void intel_pmu_disable_all(void) +void hw_perf_disable(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + if (!x86_pmu_initialized()) + return; + if (!cpuc->enabled) return; + cpuc->n_added = 0; cpuc->enabled = 0; barrier(); - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - - if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) - intel_pmu_disable_bts(); + x86_pmu.disable_all(); } -static void amd_pmu_disable_all(void) +static void x86_pmu_enable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx; - if (!cpuc->enabled) - return; - - cpuc->enabled = 0; - /* - * ensure we write the disable before we start disabling the - * events proper, so that amd_pmu_enable_event() does the - * right thing. - */ - barrier(); - for (idx = 0; idx < x86_pmu.num_events; idx++) { + struct perf_event *event = cpuc->events[idx]; u64 val; if (!test_bit(idx, cpuc->active_mask)) continue; - rdmsrl(MSR_K7_EVNTSEL0 + idx, val); - if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) - continue; - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + + val = event->hw.config; + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(x86_pmu.eventsel + idx, val); } } -void hw_perf_disable(void) +static const struct pmu pmu; + +static inline int is_x86_event(struct perf_event *event) { - if (!x86_pmu_initialized()) - return; - return x86_pmu.disable_all(); + return event->pmu == &pmu; } -static void p6_pmu_enable_all(void) +static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - unsigned long val; + struct event_constraint *c, *constraints[X86_PMC_IDX_MAX]; + unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + int i, j, w, wmax, num = 0; + struct hw_perf_event *hwc; - if (cpuc->enabled) - return; + bitmap_zero(used_mask, X86_PMC_IDX_MAX); - cpuc->enabled = 1; - barrier(); + for (i = 0; i < n; i++) { + constraints[i] = + x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); + } - /* p6 only has one enable register */ - rdmsrl(MSR_P6_EVNTSEL0, val); - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_P6_EVNTSEL0, val); -} + /* + * fastpath, try to reuse previous register + */ + for (i = 0; i < n; i++) { + hwc = &cpuc->event_list[i]->hw; + c = constraints[i]; -static void intel_pmu_enable_all(void) -{ - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + /* never assigned */ + if (hwc->idx == -1) + break; - if (cpuc->enabled) - return; + /* constraint still honored */ + if (!test_bit(hwc->idx, c->idxmsk)) + break; - cpuc->enabled = 1; - barrier(); + /* not already used */ + if (test_bit(hwc->idx, used_mask)) + break; - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + set_bit(hwc->idx, used_mask); + if (assign) + assign[i] = hwc->idx; + } + if (i == n) + goto done; - if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { - struct perf_event *event = - cpuc->events[X86_PMC_IDX_FIXED_BTS]; + /* + * begin slow path + */ + + bitmap_zero(used_mask, X86_PMC_IDX_MAX); + + /* + * weight = number of possible counters + * + * 1 = most constrained, only works on one counter + * wmax = least constrained, works on any counter + * + * assign events to counters starting with most + * constrained events. + */ + wmax = x86_pmu.num_events; + + /* + * when fixed event counters are present, + * wmax is incremented by 1 to account + * for one more choice + */ + if (x86_pmu.num_events_fixed) + wmax++; - if (WARN_ON_ONCE(!event)) - return; + for (w = 1, num = n; num && w <= wmax; w++) { + /* for each event */ + for (i = 0; num && i < n; i++) { + c = constraints[i]; + hwc = &cpuc->event_list[i]->hw; - intel_pmu_enable_bts(event->hw.config); + if (c->weight != w) + continue; + + for_each_bit(j, c->idxmsk, X86_PMC_IDX_MAX) { + if (!test_bit(j, used_mask)) + break; + } + + if (j == X86_PMC_IDX_MAX) + break; + + set_bit(j, used_mask); + + if (assign) + assign[i] = j; + num--; + } + } +done: + /* + * scheduling failed or is just a simulation, + * free resources if necessary + */ + if (!assign || num) { + for (i = 0; i < n; i++) { + if (x86_pmu.put_event_constraints) + x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]); + } } + return num ? -ENOSPC : 0; } -static void amd_pmu_enable_all(void) +/* + * dogrp: true if must collect siblings events (group) + * returns total number of events and error code + */ +static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - int idx; + struct perf_event *event; + int n, max_count; - if (cpuc->enabled) - return; + max_count = x86_pmu.num_events + x86_pmu.num_events_fixed; - cpuc->enabled = 1; - barrier(); + /* current number of events already accepted */ + n = cpuc->n_events; - for (idx = 0; idx < x86_pmu.num_events; idx++) { - struct perf_event *event = cpuc->events[idx]; - u64 val; + if (is_x86_event(leader)) { + if (n >= max_count) + return -ENOSPC; + cpuc->event_list[n] = leader; + n++; + } + if (!dogrp) + return n; - if (!test_bit(idx, cpuc->active_mask)) + list_for_each_entry(event, &leader->sibling_list, group_entry) { + if (!is_x86_event(event) || + event->state <= PERF_EVENT_STATE_OFF) continue; - val = event->hw.config; - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_K7_EVNTSEL0 + idx, val); - } -} + if (n >= max_count) + return -ENOSPC; -void hw_perf_enable(void) -{ - if (!x86_pmu_initialized()) - return; - x86_pmu.enable_all(); + cpuc->event_list[n] = event; + n++; + } + return n; } -static inline u64 intel_pmu_get_status(void) +static inline void x86_assign_hw_event(struct perf_event *event, + struct cpu_hw_events *cpuc, int i) { - u64 status; + struct hw_perf_event *hwc = &event->hw; - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + hwc->idx = cpuc->assign[i]; + hwc->last_cpu = smp_processor_id(); + hwc->last_tag = ++cpuc->tags[i]; - return status; + if (hwc->idx == X86_PMC_IDX_FIXED_BTS) { + hwc->config_base = 0; + hwc->event_base = 0; + } else if (hwc->idx >= X86_PMC_IDX_FIXED) { + hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; + /* + * We set it so that event_base + idx in wrmsr/rdmsr maps to + * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: + */ + hwc->event_base = + MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; + } else { + hwc->config_base = x86_pmu.eventsel; + hwc->event_base = x86_pmu.perfctr; + } } -static inline void intel_pmu_ack_status(u64 ack) +static inline int match_prev_assignment(struct hw_perf_event *hwc, + struct cpu_hw_events *cpuc, + int i) { - wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); + return hwc->idx == cpuc->assign[i] && + hwc->last_cpu == smp_processor_id() && + hwc->last_tag == cpuc->tags[i]; } -static inline void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) -{ - (void)checking_wrmsrl(hwc->config_base + idx, - hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); -} +static void x86_pmu_stop(struct perf_event *event); -static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx) +void hw_perf_enable(void) { - (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); -} + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct perf_event *event; + struct hw_perf_event *hwc; + int i; -static inline void -intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx) -{ - int idx = __idx - X86_PMC_IDX_FIXED; - u64 ctrl_val, mask; + if (!x86_pmu_initialized()) + return; - mask = 0xfULL << (idx * 4); + if (cpuc->enabled) + return; - rdmsrl(hwc->config_base, ctrl_val); - ctrl_val &= ~mask; - (void)checking_wrmsrl(hwc->config_base, ctrl_val); -} + if (cpuc->n_added) { + /* + * apply assignment obtained either from + * hw_perf_group_sched_in() or x86_pmu_enable() + * + * step1: save events moving to new counters + * step2: reprogram moved events into new counters + */ + for (i = 0; i < cpuc->n_events; i++) { -static inline void -p6_pmu_disable_event(struct hw_perf_event *hwc, int idx) -{ - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - u64 val = P6_NOP_EVENT; + event = cpuc->event_list[i]; + hwc = &event->hw; - if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + /* + * we can avoid reprogramming counter if: + * - assigned same counter as last time + * - running on same CPU as last time + * - no other event has used the counter since + */ + if (hwc->idx == -1 || + match_prev_assignment(hwc, cpuc, i)) + continue; - (void)checking_wrmsrl(hwc->config_base + idx, val); -} + x86_pmu_stop(event); -static inline void -intel_pmu_disable_event(struct hw_perf_event *hwc, int idx) -{ - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { - intel_pmu_disable_bts(); - return; - } + hwc->idx = -1; + } - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { - intel_pmu_disable_fixed(hwc, idx); - return; + for (i = 0; i < cpuc->n_events; i++) { + + event = cpuc->event_list[i]; + hwc = &event->hw; + + if (hwc->idx == -1) { + x86_assign_hw_event(event, cpuc, i); + x86_perf_event_set_period(event, hwc, hwc->idx); + } + /* + * need to mark as active because x86_pmu_disable() + * clear active_mask and events[] yet it preserves + * idx + */ + set_bit(hwc->idx, cpuc->active_mask); + cpuc->events[hwc->idx] = event; + + x86_pmu.enable(hwc, hwc->idx); + perf_event_update_userpage(event); + } + cpuc->n_added = 0; + perf_events_lapic_init(); } - x86_pmu_disable_event(hwc, idx); + cpuc->enabled = 1; + barrier(); + + x86_pmu.enable_all(); } -static inline void -amd_pmu_disable_event(struct hw_perf_event *hwc, int idx) +static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) +{ + (void)checking_wrmsrl(hwc->config_base + idx, + hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); +} + +static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx) { - x86_pmu_disable_event(hwc, idx); + (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); } static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); @@ -1286,7 +879,7 @@ x86_perf_event_set_period(struct perf_event *event, return 0; /* - * If we are way outside a reasoable range then just skip forward: + * If we are way outside a reasonable range then just skip forward: */ if (unlikely(left <= -period)) { left = period; @@ -1326,213 +919,60 @@ x86_perf_event_set_period(struct perf_event *event, return ret; } -static inline void -intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx) -{ - int idx = __idx - X86_PMC_IDX_FIXED; - u64 ctrl_val, bits, mask; - int err; - - /* - * Enable IRQ generation (0x8), - * and enable ring-3 counting (0x2) and ring-0 counting (0x1) - * if requested: - */ - bits = 0x8ULL; - if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) - bits |= 0x2; - if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) - bits |= 0x1; - bits <<= (idx * 4); - mask = 0xfULL << (idx * 4); - - rdmsrl(hwc->config_base, ctrl_val); - ctrl_val &= ~mask; - ctrl_val |= bits; - err = checking_wrmsrl(hwc->config_base, ctrl_val); -} - -static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx) +static void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - u64 val; - - val = hwc->config; if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - - (void)checking_wrmsrl(hwc->config_base + idx, val); + __x86_pmu_enable_event(hwc, idx); } - -static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx) -{ - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { - if (!__get_cpu_var(cpu_hw_events).enabled) - return; - - intel_pmu_enable_bts(hwc->config); - return; - } - - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { - intel_pmu_enable_fixed(hwc, idx); - return; - } - - x86_pmu_enable_event(hwc, idx); -} - -static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx) +/* + * activate a single event + * + * The event is added to the group of enabled events + * but only if it can be scehduled with existing events. + * + * Called with PMU disabled. If successful and return value 1, + * then guaranteed to call perf_enable() and hw_perf_enable() + */ +static int x86_pmu_enable(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc; + int assign[X86_PMC_IDX_MAX]; + int n, n0, ret; - if (cpuc->enabled) - x86_pmu_enable_event(hwc, idx); -} - -static int fixed_mode_idx(struct hw_perf_event *hwc) -{ - unsigned int hw_event; - - hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK; - - if (unlikely((hw_event == - x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && - (hwc->sample_period == 1))) - return X86_PMC_IDX_FIXED_BTS; + hwc = &event->hw; - if (!x86_pmu.num_events_fixed) - return -1; + n0 = cpuc->n_events; + n = collect_events(cpuc, event, false); + if (n < 0) + return n; + ret = x86_schedule_events(cpuc, n, assign); + if (ret) + return ret; /* - * fixed counters do not take all possible filters + * copy new assignment, now we know it is possible + * will be used by hw_perf_enable() */ - if (hwc->config & ARCH_PERFMON_EVENT_FILTER_MASK) - return -1; + memcpy(cpuc->assign, assign, n*sizeof(int)); - if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) - return X86_PMC_IDX_FIXED_INSTRUCTIONS; - if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) - return X86_PMC_IDX_FIXED_CPU_CYCLES; - if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) - return X86_PMC_IDX_FIXED_BUS_CYCLES; - - return -1; -} + cpuc->n_events = n; + cpuc->n_added = n - n0; -/* - * generic counter allocator: get next free counter - */ -static int -gen_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) -{ - int idx; - - idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events); - return idx == x86_pmu.num_events ? -1 : idx; -} - -/* - * intel-specific counter allocator: check event constraints - */ -static int -intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) -{ - const struct event_constraint *event_constraint; - int i, code; - - if (!event_constraints) - goto skip; - - code = hwc->config & CORE_EVNTSEL_EVENT_MASK; - - for_each_event_constraint(event_constraint, event_constraints) { - if (code == event_constraint->code) { - for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) { - if (!test_and_set_bit(i, cpuc->used_mask)) - return i; - } - return -1; - } - } -skip: - return gen_get_event_idx(cpuc, hwc); -} - -static int -x86_schedule_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) -{ - int idx; - - idx = fixed_mode_idx(hwc); - if (idx == X86_PMC_IDX_FIXED_BTS) { - /* BTS is already occupied. */ - if (test_and_set_bit(idx, cpuc->used_mask)) - return -EAGAIN; - - hwc->config_base = 0; - hwc->event_base = 0; - hwc->idx = idx; - } else if (idx >= 0) { - /* - * Try to get the fixed event, if that is already taken - * then try to get a generic event: - */ - if (test_and_set_bit(idx, cpuc->used_mask)) - goto try_generic; - - hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; - /* - * We set it so that event_base + idx in wrmsr/rdmsr maps to - * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: - */ - hwc->event_base = - MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; - hwc->idx = idx; - } else { - idx = hwc->idx; - /* Try to get the previous generic event again */ - if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) { -try_generic: - idx = x86_pmu.get_event_idx(cpuc, hwc); - if (idx == -1) - return -EAGAIN; - - set_bit(idx, cpuc->used_mask); - hwc->idx = idx; - } - hwc->config_base = x86_pmu.eventsel; - hwc->event_base = x86_pmu.perfctr; - } - - return idx; + return 0; } -/* - * Find a PMC slot for the freshly enabled / scheduled in event: - */ -static int x86_pmu_enable(struct perf_event *event) +static int x86_pmu_start(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - int idx; - idx = x86_schedule_event(cpuc, hwc); - if (idx < 0) - return idx; + if (hwc->idx == -1) + return -EAGAIN; - perf_events_lapic_init(); - - x86_pmu.disable(hwc, idx); - - cpuc->events[idx] = event; - set_bit(idx, cpuc->active_mask); - - x86_perf_event_set_period(event, hwc, idx); - x86_pmu.enable(hwc, idx); - - perf_event_update_userpage(event); + x86_perf_event_set_period(event, hwc, hwc->idx); + x86_pmu.enable(hwc, hwc->idx); return 0; } @@ -1576,7 +1016,7 @@ void perf_event_print_debug(void) pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); } - pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); + pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); for (idx = 0; idx < x86_pmu.num_events; idx++) { rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); @@ -1600,66 +1040,7 @@ void perf_event_print_debug(void) local_irq_restore(flags); } -static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc) -{ - struct debug_store *ds = cpuc->ds; - struct bts_record { - u64 from; - u64 to; - u64 flags; - }; - struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS]; - struct bts_record *at, *top; - struct perf_output_handle handle; - struct perf_event_header header; - struct perf_sample_data data; - struct pt_regs regs; - - if (!event) - return; - - if (!ds) - return; - - at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; - top = (struct bts_record *)(unsigned long)ds->bts_index; - - if (top <= at) - return; - - ds->bts_index = ds->bts_buffer_base; - - - data.period = event->hw.last_period; - data.addr = 0; - regs.ip = 0; - - /* - * Prepare a generic sample, i.e. fill in the invariant fields. - * We will overwrite the from and to address before we output - * the sample. - */ - perf_prepare_sample(&header, &data, event, ®s); - - if (perf_output_begin(&handle, event, - header.size * (top - at), 1, 1)) - return; - - for (; at < top; at++) { - data.ip = at->from; - data.addr = at->to; - - perf_output_sample(&handle, &header, &data, event); - } - - perf_output_end(&handle); - - /* There's new data available. */ - event->hw.interrupts++; - event->pending_kill = POLL_IN; -} - -static void x86_pmu_disable(struct perf_event *event) +static void x86_pmu_stop(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; @@ -1673,181 +1054,38 @@ static void x86_pmu_disable(struct perf_event *event) x86_pmu.disable(hwc, idx); /* - * Make sure the cleared pointer becomes visible before we - * (potentially) free the event: - */ - barrier(); - - /* * Drain the remaining delta count out of a event * that we are disabling: */ x86_perf_event_update(event, hwc, idx); - /* Drain the remaining BTS records. */ - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) - intel_pmu_drain_bts_buffer(cpuc); - cpuc->events[idx] = NULL; - clear_bit(idx, cpuc->used_mask); - - perf_event_update_userpage(event); -} - -/* - * Save and restart an expired event. Called by NMI contexts, - * so it has to be careful about preempting normal event ops: - */ -static int intel_pmu_save_and_restart(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - int idx = hwc->idx; - int ret; - - x86_perf_event_update(event, hwc, idx); - ret = x86_perf_event_set_period(event, hwc, idx); - - if (event->state == PERF_EVENT_STATE_ACTIVE) - intel_pmu_enable_event(hwc, idx); - - return ret; -} - -static void intel_pmu_reset(void) -{ - struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds; - unsigned long flags; - int idx; - - if (!x86_pmu.num_events) - return; - - local_irq_save(flags); - - printk("clearing PMU state on CPU#%d\n", smp_processor_id()); - - for (idx = 0; idx < x86_pmu.num_events; idx++) { - checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); - checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); - } - for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { - checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); - } - if (ds) - ds->bts_index = ds->bts_buffer_base; - - local_irq_restore(flags); } -static int p6_pmu_handle_irq(struct pt_regs *regs) -{ - struct perf_sample_data data; - struct cpu_hw_events *cpuc; - struct perf_event *event; - struct hw_perf_event *hwc; - int idx, handled = 0; - u64 val; - - data.addr = 0; - - cpuc = &__get_cpu_var(cpu_hw_events); - - for (idx = 0; idx < x86_pmu.num_events; idx++) { - if (!test_bit(idx, cpuc->active_mask)) - continue; - - event = cpuc->events[idx]; - hwc = &event->hw; - - val = x86_perf_event_update(event, hwc, idx); - if (val & (1ULL << (x86_pmu.event_bits - 1))) - continue; - - /* - * event overflow - */ - handled = 1; - data.period = event->hw.last_period; - - if (!x86_perf_event_set_period(event, hwc, idx)) - continue; - - if (perf_event_overflow(event, 1, &data, regs)) - p6_pmu_disable_event(hwc, idx); - } - - if (handled) - inc_irq_stat(apic_perf_irqs); - - return handled; -} - -/* - * This handler is triggered by the local APIC, so the APIC IRQ handling - * rules apply: - */ -static int intel_pmu_handle_irq(struct pt_regs *regs) +static void x86_pmu_disable(struct perf_event *event) { - struct perf_sample_data data; - struct cpu_hw_events *cpuc; - int bit, loops; - u64 ack, status; - - data.addr = 0; - - cpuc = &__get_cpu_var(cpu_hw_events); - - perf_disable(); - intel_pmu_drain_bts_buffer(cpuc); - status = intel_pmu_get_status(); - if (!status) { - perf_enable(); - return 0; - } - - loops = 0; -again: - if (++loops > 100) { - WARN_ONCE(1, "perfevents: irq loop stuck!\n"); - perf_event_print_debug(); - intel_pmu_reset(); - perf_enable(); - return 1; - } + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int i; - inc_irq_stat(apic_perf_irqs); - ack = status; - for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { - struct perf_event *event = cpuc->events[bit]; + x86_pmu_stop(event); - clear_bit(bit, (unsigned long *) &status); - if (!test_bit(bit, cpuc->active_mask)) - continue; + for (i = 0; i < cpuc->n_events; i++) { + if (event == cpuc->event_list[i]) { - if (!intel_pmu_save_and_restart(event)) - continue; + if (x86_pmu.put_event_constraints) + x86_pmu.put_event_constraints(cpuc, event); - data.period = event->hw.last_period; + while (++i < cpuc->n_events) + cpuc->event_list[i-1] = cpuc->event_list[i]; - if (perf_event_overflow(event, 1, &data, regs)) - intel_pmu_disable_event(&event->hw, bit); + --cpuc->n_events; + break; + } } - - intel_pmu_ack_status(ack); - - /* - * Repeat if there is more work to be done: - */ - status = intel_pmu_get_status(); - if (status) - goto again; - - perf_enable(); - - return 1; + perf_event_update_userpage(event); } -static int amd_pmu_handle_irq(struct pt_regs *regs) +static int x86_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; struct cpu_hw_events *cpuc; @@ -1857,6 +1095,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) u64 val; data.addr = 0; + data.raw = NULL; cpuc = &__get_cpu_var(cpu_hw_events); @@ -1881,7 +1120,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) continue; if (perf_event_overflow(event, 1, &data, regs)) - amd_pmu_disable_event(hwc, idx); + x86_pmu.disable(hwc, idx); } if (handled) @@ -1964,199 +1203,146 @@ static __read_mostly struct notifier_block perf_event_nmi_notifier = { .priority = 1 }; -static __initconst struct x86_pmu p6_pmu = { - .name = "p6", - .handle_irq = p6_pmu_handle_irq, - .disable_all = p6_pmu_disable_all, - .enable_all = p6_pmu_enable_all, - .enable = p6_pmu_enable_event, - .disable = p6_pmu_disable_event, - .eventsel = MSR_P6_EVNTSEL0, - .perfctr = MSR_P6_PERFCTR0, - .event_map = p6_pmu_event_map, - .raw_event = p6_pmu_raw_event, - .max_events = ARRAY_SIZE(p6_perfmon_event_map), - .apic = 1, - .max_period = (1ULL << 31) - 1, - .version = 0, - .num_events = 2, - /* - * Events have 40 bits implemented. However they are designed such - * that bits [32-39] are sign extensions of bit 31. As such the - * effective width of a event for P6-like PMU is 32 bits only. - * - * See IA-32 Intel Architecture Software developer manual Vol 3B - */ - .event_bits = 32, - .event_mask = (1ULL << 32) - 1, - .get_event_idx = intel_get_event_idx, -}; +static struct event_constraint unconstrained; +static struct event_constraint emptyconstraint; -static __initconst struct x86_pmu intel_pmu = { - .name = "Intel", - .handle_irq = intel_pmu_handle_irq, - .disable_all = intel_pmu_disable_all, - .enable_all = intel_pmu_enable_all, - .enable = intel_pmu_enable_event, - .disable = intel_pmu_disable_event, - .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, - .perfctr = MSR_ARCH_PERFMON_PERFCTR0, - .event_map = intel_pmu_event_map, - .raw_event = intel_pmu_raw_event, - .max_events = ARRAY_SIZE(intel_perfmon_event_map), - .apic = 1, - /* - * Intel PMCs cannot be accessed sanely above 32 bit width, - * so we install an artificial 1<<31 period regardless of - * the generic event period: - */ - .max_period = (1ULL << 31) - 1, - .enable_bts = intel_pmu_enable_bts, - .disable_bts = intel_pmu_disable_bts, - .get_event_idx = intel_get_event_idx, -}; +static struct event_constraint * +x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct event_constraint *c; -static __initconst struct x86_pmu amd_pmu = { - .name = "AMD", - .handle_irq = amd_pmu_handle_irq, - .disable_all = amd_pmu_disable_all, - .enable_all = amd_pmu_enable_all, - .enable = amd_pmu_enable_event, - .disable = amd_pmu_disable_event, - .eventsel = MSR_K7_EVNTSEL0, - .perfctr = MSR_K7_PERFCTR0, - .event_map = amd_pmu_event_map, - .raw_event = amd_pmu_raw_event, - .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_events = 4, - .event_bits = 48, - .event_mask = (1ULL << 48) - 1, - .apic = 1, - /* use highest bit to detect overflow */ - .max_period = (1ULL << 47) - 1, - .get_event_idx = gen_get_event_idx, -}; + if (x86_pmu.event_constraints) { + for_each_event_constraint(c, x86_pmu.event_constraints) { + if ((event->hw.config & c->cmask) == c->code) + return c; + } + } -static __init int p6_pmu_init(void) + return &unconstrained; +} + +static int x86_event_sched_in(struct perf_event *event, + struct perf_cpu_context *cpuctx) { - switch (boot_cpu_data.x86_model) { - case 1: - case 3: /* Pentium Pro */ - case 5: - case 6: /* Pentium II */ - case 7: - case 8: - case 11: /* Pentium III */ - event_constraints = intel_p6_event_constraints; - break; - case 9: - case 13: - /* Pentium M */ - event_constraints = intel_p6_event_constraints; - break; - default: - pr_cont("unsupported p6 CPU model %d ", - boot_cpu_data.x86_model); - return -ENODEV; - } + int ret = 0; - x86_pmu = p6_pmu; + event->state = PERF_EVENT_STATE_ACTIVE; + event->oncpu = smp_processor_id(); + event->tstamp_running += event->ctx->time - event->tstamp_stopped; - if (!cpu_has_apic) { - pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); - pr_info("no hardware sampling interrupt available.\n"); - x86_pmu.apic = 0; - } + if (!is_x86_event(event)) + ret = event->pmu->enable(event); - return 0; + if (!ret && !is_software_event(event)) + cpuctx->active_oncpu++; + + if (!ret && event->attr.exclusive) + cpuctx->exclusive = 1; + + return ret; } -static __init int intel_pmu_init(void) +static void x86_event_sched_out(struct perf_event *event, + struct perf_cpu_context *cpuctx) { - union cpuid10_edx edx; - union cpuid10_eax eax; - unsigned int unused; - unsigned int ebx; - int version; - - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - /* check for P6 processor family */ - if (boot_cpu_data.x86 == 6) { - return p6_pmu_init(); - } else { - return -ENODEV; - } - } + event->state = PERF_EVENT_STATE_INACTIVE; + event->oncpu = -1; - /* - * Check whether the Architectural PerfMon supports - * Branch Misses Retired hw_event or not. - */ - cpuid(10, &eax.full, &ebx, &unused, &edx.full); - if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) - return -ENODEV; + if (!is_x86_event(event)) + event->pmu->disable(event); - version = eax.split.version_id; - if (version < 2) - return -ENODEV; + event->tstamp_running -= event->ctx->time - event->tstamp_stopped; - x86_pmu = intel_pmu; - x86_pmu.version = version; - x86_pmu.num_events = eax.split.num_events; - x86_pmu.event_bits = eax.split.bit_width; - x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1; + if (!is_software_event(event)) + cpuctx->active_oncpu--; - /* - * Quirk: v2 perfmon does not report fixed-purpose events, so - * assume at least 3 events: - */ - x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3); + if (event->attr.exclusive || !cpuctx->active_oncpu) + cpuctx->exclusive = 0; +} +/* + * Called to enable a whole group of events. + * Returns 1 if the group was enabled, or -EAGAIN if it could not be. + * Assumes the caller has disabled interrupts and has + * frozen the PMU with hw_perf_save_disable. + * + * called with PMU disabled. If successful and return value 1, + * then guaranteed to call perf_enable() and hw_perf_enable() + */ +int hw_perf_group_sched_in(struct perf_event *leader, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct perf_event *sub; + int assign[X86_PMC_IDX_MAX]; + int n0, n1, ret; + + /* n0 = total number of events */ + n0 = collect_events(cpuc, leader, true); + if (n0 < 0) + return n0; + + ret = x86_schedule_events(cpuc, n0, assign); + if (ret) + return ret; + + ret = x86_event_sched_in(leader, cpuctx); + if (ret) + return ret; + + n1 = 1; + list_for_each_entry(sub, &leader->sibling_list, group_entry) { + if (sub->state > PERF_EVENT_STATE_OFF) { + ret = x86_event_sched_in(sub, cpuctx); + if (ret) + goto undo; + ++n1; + } + } /* - * Install the hw-cache-events table: + * copy new assignment, now we know it is possible + * will be used by hw_perf_enable() */ - switch (boot_cpu_data.x86_model) { - case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ - case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ - case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ - case 29: /* six-core 45 nm xeon "Dunnington" */ - memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - pr_cont("Core2 events, "); - event_constraints = intel_core_event_constraints; - break; - default: - case 26: - memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); + memcpy(cpuc->assign, assign, n0*sizeof(int)); - event_constraints = intel_nehalem_event_constraints; - pr_cont("Nehalem/Corei7 events, "); - break; - case 28: - memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); + cpuc->n_events = n0; + cpuc->n_added = n1; + ctx->nr_active += n1; - pr_cont("Atom events, "); - break; + /* + * 1 means successful and events are active + * This is not quite true because we defer + * actual activation until hw_perf_enable() but + * this way we* ensure caller won't try to enable + * individual events + */ + return 1; +undo: + x86_event_sched_out(leader, cpuctx); + n0 = 1; + list_for_each_entry(sub, &leader->sibling_list, group_entry) { + if (sub->state == PERF_EVENT_STATE_ACTIVE) { + x86_event_sched_out(sub, cpuctx); + if (++n0 == n1) + break; + } } - return 0; + return ret; } -static __init int amd_pmu_init(void) -{ - /* Performance-monitoring supported from K7 and later: */ - if (boot_cpu_data.x86 < 6) - return -ENODEV; +#include "perf_event_amd.c" +#include "perf_event_p6.c" +#include "perf_event_intel.c" - x86_pmu = amd_pmu; - - /* Events are common for all AMDs */ - memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); +static void __init pmu_check_apic(void) +{ + if (cpu_has_apic) + return; - return 0; + x86_pmu.apic = 0; + pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); + pr_info("no hardware sampling interrupt available.\n"); } void __init init_hw_perf_events(void) @@ -2180,6 +1366,8 @@ void __init init_hw_perf_events(void) return; } + pmu_check_apic(); + pr_cont("%s PMU driver.\n", x86_pmu.name); if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) { @@ -2203,6 +1391,10 @@ void __init init_hw_perf_events(void) perf_events_lapic_init(); register_die_notifier(&perf_event_nmi_notifier); + unconstrained = (struct event_constraint) + __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1, + 0, x86_pmu.num_events); + pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.event_bits); pr_info("... generic registers: %d\n", x86_pmu.num_events); @@ -2220,50 +1412,79 @@ static inline void x86_pmu_read(struct perf_event *event) static const struct pmu pmu = { .enable = x86_pmu_enable, .disable = x86_pmu_disable, + .start = x86_pmu_start, + .stop = x86_pmu_stop, .read = x86_pmu_read, .unthrottle = x86_pmu_unthrottle, }; -static int -validate_event(struct cpu_hw_events *cpuc, struct perf_event *event) -{ - struct hw_perf_event fake_event = event->hw; - - if (event->pmu && event->pmu != &pmu) - return 0; - - return x86_schedule_event(cpuc, &fake_event) >= 0; -} - +/* + * validate a single event group + * + * validation include: + * - check events are compatible which each other + * - events do not compete for the same counter + * - number of events <= number of counters + * + * validation ensures the group can be loaded onto the + * PMU if it was the only group available. + */ static int validate_group(struct perf_event *event) { - struct perf_event *sibling, *leader = event->group_leader; - struct cpu_hw_events fake_pmu; + struct perf_event *leader = event->group_leader; + struct cpu_hw_events *fake_cpuc; + int ret, n; - memset(&fake_pmu, 0, sizeof(fake_pmu)); + ret = -ENOMEM; + fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO); + if (!fake_cpuc) + goto out; - if (!validate_event(&fake_pmu, leader)) - return -ENOSPC; + /* + * the event is not yet connected with its + * siblings therefore we must first collect + * existing siblings, then add the new event + * before we can simulate the scheduling + */ + ret = -ENOSPC; + n = collect_events(fake_cpuc, leader, true); + if (n < 0) + goto out_free; - list_for_each_entry(sibling, &leader->sibling_list, group_entry) { - if (!validate_event(&fake_pmu, sibling)) - return -ENOSPC; - } + fake_cpuc->n_events = n; + n = collect_events(fake_cpuc, event, false); + if (n < 0) + goto out_free; - if (!validate_event(&fake_pmu, event)) - return -ENOSPC; + fake_cpuc->n_events = n; - return 0; + ret = x86_schedule_events(fake_cpuc, n, NULL); + +out_free: + kfree(fake_cpuc); +out: + return ret; } const struct pmu *hw_perf_event_init(struct perf_event *event) { + const struct pmu *tmp; int err; err = __hw_perf_event_init(event); if (!err) { + /* + * we temporarily connect event to its pmu + * such that validate_group() can classify + * it as an x86 event using is_x86_event() + */ + tmp = event->pmu; + event->pmu = &pmu; + if (event->group_leader != event) err = validate_group(event); + + event->pmu = tmp; } if (err) { if (event->destroy) @@ -2287,7 +1508,6 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip) static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); -static DEFINE_PER_CPU(int, in_nmi_frame); static void @@ -2303,9 +1523,6 @@ static void backtrace_warning(void *data, char *msg) static int backtrace_stack(void *data, char *name) { - per_cpu(in_nmi_frame, smp_processor_id()) = - x86_is_stack_id(NMI_STACK, name); - return 0; } @@ -2313,9 +1530,6 @@ static void backtrace_address(void *data, unsigned long addr, int reliable) { struct perf_callchain_entry *entry = data; - if (per_cpu(in_nmi_frame, smp_processor_id())) - return; - if (reliable) callchain_store(entry, addr); } @@ -2325,6 +1539,7 @@ static const struct stacktrace_ops backtrace_ops = { .warning_symbol = backtrace_warning_symbol, .stack = backtrace_stack, .address = backtrace_address, + .walk_stack = print_context_stack_bp, }; #include "../dumpstack.h" @@ -2335,7 +1550,7 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) callchain_store(entry, PERF_CONTEXT_KERNEL); callchain_store(entry, regs->ip); - dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); + dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); } /* @@ -2421,9 +1636,6 @@ perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry) is_user = user_mode(regs); - if (!current || current->pid == 0) - return; - if (is_user && current->state != TASK_RUNNING) return; @@ -2453,4 +1665,25 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) void hw_perf_event_setup_online(int cpu) { init_debug_store_on_cpu(cpu); + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + amd_pmu_cpu_online(cpu); + break; + default: + return; + } +} + +void hw_perf_event_setup_offline(int cpu) +{ + init_debug_store_on_cpu(cpu); + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + amd_pmu_cpu_offline(cpu); + break; + default: + return; + } } diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c new file mode 100644 index 000000000000..8f3dbfda3c4f --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -0,0 +1,416 @@ +#ifdef CONFIG_CPU_SUP_AMD + +static DEFINE_RAW_SPINLOCK(amd_nb_lock); + +static __initconst u64 amd_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ + [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */ + [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */ + [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */ + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */ + [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ + [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ + [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */ + [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +/* + * AMD Performance Monitor K7 and later. + */ +static const u64 amd_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, +}; + +static u64 amd_pmu_event_map(int hw_event) +{ + return amd_perfmon_event_map[hw_event]; +} + +static u64 amd_pmu_raw_event(u64 hw_event) +{ +#define K7_EVNTSEL_EVENT_MASK 0xF000000FFULL +#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL +#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL +#define K7_EVNTSEL_INV_MASK 0x000800000ULL +#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL + +#define K7_EVNTSEL_MASK \ + (K7_EVNTSEL_EVENT_MASK | \ + K7_EVNTSEL_UNIT_MASK | \ + K7_EVNTSEL_EDGE_MASK | \ + K7_EVNTSEL_INV_MASK | \ + K7_EVNTSEL_REG_MASK) + + return hw_event & K7_EVNTSEL_MASK; +} + +/* + * AMD64 events are detected based on their event codes. + */ +static inline int amd_is_nb_event(struct hw_perf_event *hwc) +{ + return (hwc->config & 0xe0) == 0xe0; +} + +static void amd_put_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct amd_nb *nb = cpuc->amd_nb; + int i; + + /* + * only care about NB events + */ + if (!(nb && amd_is_nb_event(hwc))) + return; + + /* + * need to scan whole list because event may not have + * been assigned during scheduling + * + * no race condition possible because event can only + * be removed on one CPU at a time AND PMU is disabled + * when we come here + */ + for (i = 0; i < x86_pmu.num_events; i++) { + if (nb->owners[i] == event) { + cmpxchg(nb->owners+i, event, NULL); + break; + } + } +} + + /* + * AMD64 NorthBridge events need special treatment because + * counter access needs to be synchronized across all cores + * of a package. Refer to BKDG section 3.12 + * + * NB events are events measuring L3 cache, Hypertransport + * traffic. They are identified by an event code >= 0xe00. + * They measure events on the NorthBride which is shared + * by all cores on a package. NB events are counted on a + * shared set of counters. When a NB event is programmed + * in a counter, the data actually comes from a shared + * counter. Thus, access to those counters needs to be + * synchronized. + * + * We implement the synchronization such that no two cores + * can be measuring NB events using the same counters. Thus, + * we maintain a per-NB allocation table. The available slot + * is propagated using the event_constraint structure. + * + * We provide only one choice for each NB event based on + * the fact that only NB events have restrictions. Consequently, + * if a counter is available, there is a guarantee the NB event + * will be assigned to it. If no slot is available, an empty + * constraint is returned and scheduling will eventually fail + * for this event. + * + * Note that all cores attached the same NB compete for the same + * counters to host NB events, this is why we use atomic ops. Some + * multi-chip CPUs may have more than one NB. + * + * Given that resources are allocated (cmpxchg), they must be + * eventually freed for others to use. This is accomplished by + * calling amd_put_event_constraints(). + * + * Non NB events are not impacted by this restriction. + */ +static struct event_constraint * +amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct amd_nb *nb = cpuc->amd_nb; + struct perf_event *old = NULL; + int max = x86_pmu.num_events; + int i, j, k = -1; + + /* + * if not NB event or no NB, then no constraints + */ + if (!(nb && amd_is_nb_event(hwc))) + return &unconstrained; + + /* + * detect if already present, if so reuse + * + * cannot merge with actual allocation + * because of possible holes + * + * event can already be present yet not assigned (in hwc->idx) + * because of successive calls to x86_schedule_events() from + * hw_perf_group_sched_in() without hw_perf_enable() + */ + for (i = 0; i < max; i++) { + /* + * keep track of first free slot + */ + if (k == -1 && !nb->owners[i]) + k = i; + + /* already present, reuse */ + if (nb->owners[i] == event) + goto done; + } + /* + * not present, so grab a new slot + * starting either at: + */ + if (hwc->idx != -1) { + /* previous assignment */ + i = hwc->idx; + } else if (k != -1) { + /* start from free slot found */ + i = k; + } else { + /* + * event not found, no slot found in + * first pass, try again from the + * beginning + */ + i = 0; + } + j = i; + do { + old = cmpxchg(nb->owners+i, NULL, event); + if (!old) + break; + if (++i == max) + i = 0; + } while (i != j); +done: + if (!old) + return &nb->event_constraints[i]; + + return &emptyconstraint; +} + +static __initconst struct x86_pmu amd_pmu = { + .name = "AMD", + .handle_irq = x86_pmu_handle_irq, + .disable_all = x86_pmu_disable_all, + .enable_all = x86_pmu_enable_all, + .enable = x86_pmu_enable_event, + .disable = x86_pmu_disable_event, + .eventsel = MSR_K7_EVNTSEL0, + .perfctr = MSR_K7_PERFCTR0, + .event_map = amd_pmu_event_map, + .raw_event = amd_pmu_raw_event, + .max_events = ARRAY_SIZE(amd_perfmon_event_map), + .num_events = 4, + .event_bits = 48, + .event_mask = (1ULL << 48) - 1, + .apic = 1, + /* use highest bit to detect overflow */ + .max_period = (1ULL << 47) - 1, + .get_event_constraints = amd_get_event_constraints, + .put_event_constraints = amd_put_event_constraints +}; + +static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) +{ + struct amd_nb *nb; + int i; + + nb = kmalloc(sizeof(struct amd_nb), GFP_KERNEL); + if (!nb) + return NULL; + + memset(nb, 0, sizeof(*nb)); + nb->nb_id = nb_id; + + /* + * initialize all possible NB constraints + */ + for (i = 0; i < x86_pmu.num_events; i++) { + set_bit(i, nb->event_constraints[i].idxmsk); + nb->event_constraints[i].weight = 1; + } + return nb; +} + +static void amd_pmu_cpu_online(int cpu) +{ + struct cpu_hw_events *cpu1, *cpu2; + struct amd_nb *nb = NULL; + int i, nb_id; + + if (boot_cpu_data.x86_max_cores < 2) + return; + + /* + * function may be called too early in the + * boot process, in which case nb_id is bogus + */ + nb_id = amd_get_nb_id(cpu); + if (nb_id == BAD_APICID) + return; + + cpu1 = &per_cpu(cpu_hw_events, cpu); + cpu1->amd_nb = NULL; + + raw_spin_lock(&amd_nb_lock); + + for_each_online_cpu(i) { + cpu2 = &per_cpu(cpu_hw_events, i); + nb = cpu2->amd_nb; + if (!nb) + continue; + if (nb->nb_id == nb_id) + goto found; + } + + nb = amd_alloc_nb(cpu, nb_id); + if (!nb) { + pr_err("perf_events: failed NB allocation for CPU%d\n", cpu); + raw_spin_unlock(&amd_nb_lock); + return; + } +found: + nb->refcnt++; + cpu1->amd_nb = nb; + + raw_spin_unlock(&amd_nb_lock); +} + +static void amd_pmu_cpu_offline(int cpu) +{ + struct cpu_hw_events *cpuhw; + + if (boot_cpu_data.x86_max_cores < 2) + return; + + cpuhw = &per_cpu(cpu_hw_events, cpu); + + raw_spin_lock(&amd_nb_lock); + + if (--cpuhw->amd_nb->refcnt == 0) + kfree(cpuhw->amd_nb); + + cpuhw->amd_nb = NULL; + + raw_spin_unlock(&amd_nb_lock); +} + +static __init int amd_pmu_init(void) +{ + /* Performance-monitoring supported from K7 and later: */ + if (boot_cpu_data.x86 < 6) + return -ENODEV; + + x86_pmu = amd_pmu; + + /* Events are common for all AMDs */ + memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + /* + * explicitly initialize the boot cpu, other cpus will get + * the cpu hotplug callbacks from smp_init() + */ + amd_pmu_cpu_online(smp_processor_id()); + return 0; +} + +#else /* CONFIG_CPU_SUP_AMD */ + +static int amd_pmu_init(void) +{ + return 0; +} + +static void amd_pmu_cpu_online(int cpu) +{ +} + +static void amd_pmu_cpu_offline(int cpu) +{ +} + +#endif diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c new file mode 100644 index 000000000000..cf6590cf4a5f --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -0,0 +1,971 @@ +#ifdef CONFIG_CPU_SUP_INTEL + +/* + * Intel PerfMon v3. Used on Core2 and later. + */ +static const u64 intel_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, +}; + +static struct event_constraint intel_core_event_constraints[] = +{ + INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ + INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_core2_event_constraints[] = +{ + FIXED_EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ + FIXED_EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ + INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ + INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */ + INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ + INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */ + INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_nehalem_event_constraints[] = +{ + FIXED_EVENT_CONSTRAINT(0xc0, (0xf|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ + FIXED_EVENT_CONSTRAINT(0x3c, (0xf|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ + INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ + INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ + INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ + INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */ + INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */ + INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */ + INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ + INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_westmere_event_constraints[] = +{ + FIXED_EVENT_CONSTRAINT(0xc0, (0xf|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ + FIXED_EVENT_CONSTRAINT(0x3c, (0xf|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ + INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ + INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */ + INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_gen_event_constraints[] = +{ + FIXED_EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ + FIXED_EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ + EVENT_CONSTRAINT_END +}; + +static u64 intel_pmu_event_map(int hw_event) +{ + return intel_perfmon_event_map[hw_event]; +} + +static __initconst u64 westmere_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ + [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ + [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ + [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ + [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ + [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static __initconst u64 nehalem_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ + [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ + [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ + [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ + [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ + [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static __initconst u64 core2_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ + [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ + [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static __initconst u64 atom_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static u64 intel_pmu_raw_event(u64 hw_event) +{ +#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL +#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL +#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL +#define CORE_EVNTSEL_INV_MASK 0x00800000ULL +#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL + +#define CORE_EVNTSEL_MASK \ + (INTEL_ARCH_EVTSEL_MASK | \ + INTEL_ARCH_UNIT_MASK | \ + INTEL_ARCH_EDGE_MASK | \ + INTEL_ARCH_INV_MASK | \ + INTEL_ARCH_CNT_MASK) + + return hw_event & CORE_EVNTSEL_MASK; +} + +static void intel_pmu_enable_bts(u64 config) +{ + unsigned long debugctlmsr; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr |= X86_DEBUGCTL_TR; + debugctlmsr |= X86_DEBUGCTL_BTS; + debugctlmsr |= X86_DEBUGCTL_BTINT; + + if (!(config & ARCH_PERFMON_EVENTSEL_OS)) + debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS; + + if (!(config & ARCH_PERFMON_EVENTSEL_USR)) + debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR; + + update_debugctlmsr(debugctlmsr); +} + +static void intel_pmu_disable_bts(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + unsigned long debugctlmsr; + + if (!cpuc->ds) + return; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr &= + ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT | + X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR); + + update_debugctlmsr(debugctlmsr); +} + +static void intel_pmu_disable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); + + if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) + intel_pmu_disable_bts(); +} + +static void intel_pmu_enable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + + if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { + struct perf_event *event = + cpuc->events[X86_PMC_IDX_FIXED_BTS]; + + if (WARN_ON_ONCE(!event)) + return; + + intel_pmu_enable_bts(event->hw.config); + } +} + +static inline u64 intel_pmu_get_status(void) +{ + u64 status; + + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + + return status; +} + +static inline void intel_pmu_ack_status(u64 ack) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); +} + +static inline void +intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx) +{ + int idx = __idx - X86_PMC_IDX_FIXED; + u64 ctrl_val, mask; + + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + (void)checking_wrmsrl(hwc->config_base, ctrl_val); +} + +static void intel_pmu_drain_bts_buffer(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct debug_store *ds = cpuc->ds; + struct bts_record { + u64 from; + u64 to; + u64 flags; + }; + struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS]; + struct bts_record *at, *top; + struct perf_output_handle handle; + struct perf_event_header header; + struct perf_sample_data data; + struct pt_regs regs; + + if (!event) + return; + + if (!ds) + return; + + at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; + top = (struct bts_record *)(unsigned long)ds->bts_index; + + if (top <= at) + return; + + ds->bts_index = ds->bts_buffer_base; + + + data.period = event->hw.last_period; + data.addr = 0; + data.raw = NULL; + regs.ip = 0; + + /* + * Prepare a generic sample, i.e. fill in the invariant fields. + * We will overwrite the from and to address before we output + * the sample. + */ + perf_prepare_sample(&header, &data, event, ®s); + + if (perf_output_begin(&handle, event, + header.size * (top - at), 1, 1)) + return; + + for (; at < top; at++) { + data.ip = at->from; + data.addr = at->to; + + perf_output_sample(&handle, &header, &data, event); + } + + perf_output_end(&handle); + + /* There's new data available. */ + event->hw.interrupts++; + event->pending_kill = POLL_IN; +} + +static inline void +intel_pmu_disable_event(struct hw_perf_event *hwc, int idx) +{ + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + intel_pmu_disable_bts(); + intel_pmu_drain_bts_buffer(); + return; + } + + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_disable_fixed(hwc, idx); + return; + } + + x86_pmu_disable_event(hwc, idx); +} + +static inline void +intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx) +{ + int idx = __idx - X86_PMC_IDX_FIXED; + u64 ctrl_val, bits, mask; + int err; + + /* + * Enable IRQ generation (0x8), + * and enable ring-3 counting (0x2) and ring-0 counting (0x1) + * if requested: + */ + bits = 0x8ULL; + if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) + bits |= 0x2; + if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) + bits |= 0x1; + + /* + * ANY bit is supported in v3 and up + */ + if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY) + bits |= 0x4; + + bits <<= (idx * 4); + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + ctrl_val |= bits; + err = checking_wrmsrl(hwc->config_base, ctrl_val); +} + +static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx) +{ + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + if (!__get_cpu_var(cpu_hw_events).enabled) + return; + + intel_pmu_enable_bts(hwc->config); + return; + } + + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_enable_fixed(hwc, idx); + return; + } + + __x86_pmu_enable_event(hwc, idx); +} + +/* + * Save and restart an expired event. Called by NMI contexts, + * so it has to be careful about preempting normal event ops: + */ +static int intel_pmu_save_and_restart(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + int ret; + + x86_perf_event_update(event, hwc, idx); + ret = x86_perf_event_set_period(event, hwc, idx); + + return ret; +} + +static void intel_pmu_reset(void) +{ + struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds; + unsigned long flags; + int idx; + + if (!x86_pmu.num_events) + return; + + local_irq_save(flags); + + printk("clearing PMU state on CPU#%d\n", smp_processor_id()); + + for (idx = 0; idx < x86_pmu.num_events; idx++) { + checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); + checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); + } + for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { + checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + } + if (ds) + ds->bts_index = ds->bts_buffer_base; + + local_irq_restore(flags); +} + +/* + * This handler is triggered by the local APIC, so the APIC IRQ handling + * rules apply: + */ +static int intel_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + int bit, loops; + u64 ack, status; + + data.addr = 0; + data.raw = NULL; + + cpuc = &__get_cpu_var(cpu_hw_events); + + perf_disable(); + intel_pmu_drain_bts_buffer(); + status = intel_pmu_get_status(); + if (!status) { + perf_enable(); + return 0; + } + + loops = 0; +again: + if (++loops > 100) { + WARN_ONCE(1, "perfevents: irq loop stuck!\n"); + perf_event_print_debug(); + intel_pmu_reset(); + perf_enable(); + return 1; + } + + inc_irq_stat(apic_perf_irqs); + ack = status; + for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { + struct perf_event *event = cpuc->events[bit]; + + clear_bit(bit, (unsigned long *) &status); + if (!test_bit(bit, cpuc->active_mask)) + continue; + + if (!intel_pmu_save_and_restart(event)) + continue; + + data.period = event->hw.last_period; + + if (perf_event_overflow(event, 1, &data, regs)) + intel_pmu_disable_event(&event->hw, bit); + } + + intel_pmu_ack_status(ack); + + /* + * Repeat if there is more work to be done: + */ + status = intel_pmu_get_status(); + if (status) + goto again; + + perf_enable(); + + return 1; +} + +static struct event_constraint bts_constraint = + EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0); + +static struct event_constraint * +intel_special_constraints(struct perf_event *event) +{ + unsigned int hw_event; + + hw_event = event->hw.config & INTEL_ARCH_EVENT_MASK; + + if (unlikely((hw_event == + x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && + (event->hw.sample_period == 1))) { + + return &bts_constraint; + } + return NULL; +} + +static struct event_constraint * +intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct event_constraint *c; + + c = intel_special_constraints(event); + if (c) + return c; + + return x86_get_event_constraints(cpuc, event); +} + +static __initconst struct x86_pmu core_pmu = { + .name = "core", + .handle_irq = x86_pmu_handle_irq, + .disable_all = x86_pmu_disable_all, + .enable_all = x86_pmu_enable_all, + .enable = x86_pmu_enable_event, + .disable = x86_pmu_disable_event, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = intel_pmu_event_map, + .raw_event = intel_pmu_raw_event, + .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, + /* + * Intel PMCs cannot be accessed sanely above 32 bit width, + * so we install an artificial 1<<31 period regardless of + * the generic event period: + */ + .max_period = (1ULL << 31) - 1, + .get_event_constraints = intel_get_event_constraints, + .event_constraints = intel_core_event_constraints, +}; + +static __initconst struct x86_pmu intel_pmu = { + .name = "Intel", + .handle_irq = intel_pmu_handle_irq, + .disable_all = intel_pmu_disable_all, + .enable_all = intel_pmu_enable_all, + .enable = intel_pmu_enable_event, + .disable = intel_pmu_disable_event, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = intel_pmu_event_map, + .raw_event = intel_pmu_raw_event, + .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, + /* + * Intel PMCs cannot be accessed sanely above 32 bit width, + * so we install an artificial 1<<31 period regardless of + * the generic event period: + */ + .max_period = (1ULL << 31) - 1, + .enable_bts = intel_pmu_enable_bts, + .disable_bts = intel_pmu_disable_bts, + .get_event_constraints = intel_get_event_constraints +}; + +static __init int intel_pmu_init(void) +{ + union cpuid10_edx edx; + union cpuid10_eax eax; + unsigned int unused; + unsigned int ebx; + int version; + + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + /* check for P6 processor family */ + if (boot_cpu_data.x86 == 6) { + return p6_pmu_init(); + } else { + return -ENODEV; + } + } + + /* + * Check whether the Architectural PerfMon supports + * Branch Misses Retired hw_event or not. + */ + cpuid(10, &eax.full, &ebx, &unused, &edx.full); + if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) + return -ENODEV; + + version = eax.split.version_id; + if (version < 2) + x86_pmu = core_pmu; + else + x86_pmu = intel_pmu; + + x86_pmu.version = version; + x86_pmu.num_events = eax.split.num_events; + x86_pmu.event_bits = eax.split.bit_width; + x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1; + + /* + * Quirk: v2 perfmon does not report fixed-purpose events, so + * assume at least 3 events: + */ + if (version > 1) + x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3); + + /* + * Install the hw-cache-events table: + */ + switch (boot_cpu_data.x86_model) { + case 14: /* 65 nm core solo/duo, "Yonah" */ + pr_cont("Core events, "); + break; + + case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ + case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ + case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ + case 29: /* six-core 45 nm xeon "Dunnington" */ + memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + x86_pmu.event_constraints = intel_core2_event_constraints; + pr_cont("Core2 events, "); + break; + + case 26: /* 45 nm nehalem, "Bloomfield" */ + case 30: /* 45 nm nehalem, "Lynnfield" */ + memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + x86_pmu.event_constraints = intel_nehalem_event_constraints; + pr_cont("Nehalem/Corei7 events, "); + break; + case 28: + memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + x86_pmu.event_constraints = intel_gen_event_constraints; + pr_cont("Atom events, "); + break; + + case 37: /* 32 nm nehalem, "Clarkdale" */ + case 44: /* 32 nm nehalem, "Gulftown" */ + memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + x86_pmu.event_constraints = intel_westmere_event_constraints; + pr_cont("Westmere events, "); + break; + default: + /* + * default constraints for v2 and up + */ + x86_pmu.event_constraints = intel_gen_event_constraints; + pr_cont("generic architected perfmon, "); + } + return 0; +} + +#else /* CONFIG_CPU_SUP_INTEL */ + +static int intel_pmu_init(void) +{ + return 0; +} + +#endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c new file mode 100644 index 000000000000..1ca5ba078afd --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -0,0 +1,157 @@ +#ifdef CONFIG_CPU_SUP_INTEL + +/* + * Not sure about some of these + */ +static const u64 p6_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, +}; + +static u64 p6_pmu_event_map(int hw_event) +{ + return p6_perfmon_event_map[hw_event]; +} + +/* + * Event setting that is specified not to count anything. + * We use this to effectively disable a counter. + * + * L2_RQSTS with 0 MESI unit mask. + */ +#define P6_NOP_EVENT 0x0000002EULL + +static u64 p6_pmu_raw_event(u64 hw_event) +{ +#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL +#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL +#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL +#define P6_EVNTSEL_INV_MASK 0x00800000ULL +#define P6_EVNTSEL_REG_MASK 0xFF000000ULL + +#define P6_EVNTSEL_MASK \ + (P6_EVNTSEL_EVENT_MASK | \ + P6_EVNTSEL_UNIT_MASK | \ + P6_EVNTSEL_EDGE_MASK | \ + P6_EVNTSEL_INV_MASK | \ + P6_EVNTSEL_REG_MASK) + + return hw_event & P6_EVNTSEL_MASK; +} + +static struct event_constraint p6_event_constraints[] = +{ + INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ + INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ + INTEL_EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + EVENT_CONSTRAINT_END +}; + +static void p6_pmu_disable_all(void) +{ + u64 val; + + /* p6 only has one enable register */ + rdmsrl(MSR_P6_EVNTSEL0, val); + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_P6_EVNTSEL0, val); +} + +static void p6_pmu_enable_all(void) +{ + unsigned long val; + + /* p6 only has one enable register */ + rdmsrl(MSR_P6_EVNTSEL0, val); + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_P6_EVNTSEL0, val); +} + +static inline void +p6_pmu_disable_event(struct hw_perf_event *hwc, int idx) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + u64 val = P6_NOP_EVENT; + + if (cpuc->enabled) + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + + (void)checking_wrmsrl(hwc->config_base + idx, val); +} + +static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + u64 val; + + val = hwc->config; + if (cpuc->enabled) + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + + (void)checking_wrmsrl(hwc->config_base + idx, val); +} + +static __initconst struct x86_pmu p6_pmu = { + .name = "p6", + .handle_irq = x86_pmu_handle_irq, + .disable_all = p6_pmu_disable_all, + .enable_all = p6_pmu_enable_all, + .enable = p6_pmu_enable_event, + .disable = p6_pmu_disable_event, + .eventsel = MSR_P6_EVNTSEL0, + .perfctr = MSR_P6_PERFCTR0, + .event_map = p6_pmu_event_map, + .raw_event = p6_pmu_raw_event, + .max_events = ARRAY_SIZE(p6_perfmon_event_map), + .apic = 1, + .max_period = (1ULL << 31) - 1, + .version = 0, + .num_events = 2, + /* + * Events have 40 bits implemented. However they are designed such + * that bits [32-39] are sign extensions of bit 31. As such the + * effective width of a event for P6-like PMU is 32 bits only. + * + * See IA-32 Intel Architecture Software developer manual Vol 3B + */ + .event_bits = 32, + .event_mask = (1ULL << 32) - 1, + .get_event_constraints = x86_get_event_constraints, + .event_constraints = p6_event_constraints, +}; + +static __init int p6_pmu_init(void) +{ + switch (boot_cpu_data.x86_model) { + case 1: + case 3: /* Pentium Pro */ + case 5: + case 6: /* Pentium II */ + case 7: + case 8: + case 11: /* Pentium III */ + case 9: + case 13: + /* Pentium M */ + break; + default: + pr_cont("unsupported p6 CPU model %d ", + boot_cpu_data.x86_model); + return -ENODEV; + } + + x86_pmu = p6_pmu; + + return 0; +} + +#endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 898df9719afb..74f4e85a5727 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -115,17 +115,6 @@ int avail_to_resrv_perfctr_nmi_bit(unsigned int counter) return !test_bit(counter, perfctr_nmi_owner); } - -/* checks the an msr for availability */ -int avail_to_resrv_perfctr_nmi(unsigned int msr) -{ - unsigned int counter; - - counter = nmi_perfctr_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - return !test_bit(counter, perfctr_nmi_owner); -} EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); int reserve_perfctr_nmi(unsigned int msr) diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 7ef24a796992..83e5e628de73 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -187,7 +187,8 @@ static int __init cpuid_init(void) int i, err = 0; i = 0; - if (register_chrdev(CPUID_MAJOR, "cpu/cpuid", &cpuid_fops)) { + if (__register_chrdev(CPUID_MAJOR, 0, NR_CPUS, + "cpu/cpuid", &cpuid_fops)) { printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n", CPUID_MAJOR); err = -EBUSY; @@ -216,7 +217,7 @@ out_class: } class_destroy(cpuid_class); out_chrdev: - unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); + __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); out: return err; } @@ -228,7 +229,7 @@ static void __exit cpuid_exit(void) for_each_online_cpu(cpu) cpuid_device_destroy(cpu); class_destroy(cpuid_class); - unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); + __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); } diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index ef42a038f1a6..1c47390dd0e5 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -265,13 +265,13 @@ struct ds_context { int cpu; }; -static DEFINE_PER_CPU(struct ds_context *, cpu_context); +static DEFINE_PER_CPU(struct ds_context *, cpu_ds_context); static struct ds_context *ds_get_context(struct task_struct *task, int cpu) { struct ds_context **p_context = - (task ? &task->thread.ds_ctx : &per_cpu(cpu_context, cpu)); + (task ? &task->thread.ds_ctx : &per_cpu(cpu_ds_context, cpu)); struct ds_context *context = NULL; struct ds_context *new_context = NULL; diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index b8ce165dde5d..6d817554780a 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -109,6 +109,32 @@ print_context_stack(struct thread_info *tinfo, } return bp; } +EXPORT_SYMBOL_GPL(print_context_stack); + +unsigned long +print_context_stack_bp(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end, int *graph) +{ + struct stack_frame *frame = (struct stack_frame *)bp; + unsigned long *ret_addr = &frame->return_address; + + while (valid_stack_ptr(tinfo, ret_addr, sizeof(*ret_addr), end)) { + unsigned long addr = *ret_addr; + + if (!__kernel_text_address(addr)) + break; + + ops->address(data, addr, 1); + frame = frame->next_frame; + ret_addr = &frame->return_address; + print_ftrace_graph_addr(addr, data, ops, tinfo, graph); + } + + return (unsigned long)frame; +} +EXPORT_SYMBOL_GPL(print_context_stack_bp); static void @@ -141,10 +167,11 @@ static void print_trace_address(void *data, unsigned long addr, int reliable) } static const struct stacktrace_ops print_trace_ops = { - .warning = print_trace_warning, - .warning_symbol = print_trace_warning_symbol, - .stack = print_trace_stack, - .address = print_trace_address, + .warning = print_trace_warning, + .warning_symbol = print_trace_warning_symbol, + .stack = print_trace_stack, + .address = print_trace_address, + .walk_stack = print_context_stack, }; void @@ -188,7 +215,7 @@ void dump_stack(void) } EXPORT_SYMBOL(dump_stack); -static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; +static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED; static int die_owner = -1; static unsigned int die_nest_count; @@ -207,11 +234,11 @@ unsigned __kprobes long oops_begin(void) /* racy, but better than risking deadlock. */ raw_local_irq_save(flags); cpu = smp_processor_id(); - if (!__raw_spin_trylock(&die_lock)) { + if (!arch_spin_trylock(&die_lock)) { if (cpu == die_owner) /* nested oops. should stop eventually */; else - __raw_spin_lock(&die_lock); + arch_spin_lock(&die_lock); } die_nest_count++; die_owner = cpu; @@ -231,7 +258,7 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) die_nest_count--; if (!die_nest_count) /* Nest count reaches zero, release the lock. */ - __raw_spin_unlock(&die_lock); + arch_spin_unlock(&die_lock); raw_local_irq_restore(flags); oops_exit(); diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h index 81086c227ab7..4fd1420faffa 100644 --- a/arch/x86/kernel/dumpstack.h +++ b/arch/x86/kernel/dumpstack.h @@ -14,12 +14,6 @@ #define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) #endif -extern unsigned long -print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - unsigned long *end, int *graph); - extern void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, char *log_lvl); diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index e0ed4c7abb62..11540a189d93 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -18,11 +18,6 @@ #include "dumpstack.h" -/* Just a stub for now */ -int x86_is_stack_id(int id, char *name) -{ - return 0; -} void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, @@ -58,7 +53,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, context = (struct thread_info *) ((unsigned long)stack & (~(THREAD_SIZE - 1))); - bp = print_context_stack(context, stack, bp, ops, data, NULL, &graph); + bp = ops->walk_stack(context, stack, bp, ops, data, NULL, &graph); stack = (unsigned long *)context->previous_esp; if (!stack) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 8e740934bd1f..dce99abb4496 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -33,11 +33,6 @@ static char x86_stack_ids[][8] = { #endif }; -int x86_is_stack_id(int id, char *name) -{ - return x86_stack_ids[id - 1] == name; -} - static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, unsigned *usedp, char **idp) { @@ -103,6 +98,35 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, return NULL; } +static inline int +in_irq_stack(unsigned long *stack, unsigned long *irq_stack, + unsigned long *irq_stack_end) +{ + return (stack >= irq_stack && stack < irq_stack_end); +} + +/* + * We are returning from the irq stack and go to the previous one. + * If the previous stack is also in the irq stack, then bp in the first + * frame of the irq stack points to the previous, interrupted one. + * Otherwise we have another level of indirection: We first save + * the bp of the previous stack, then we switch the stack to the irq one + * and save a new bp that links to the previous one. + * (See save_args()) + */ +static inline unsigned long +fixup_bp_irq_link(unsigned long bp, unsigned long *stack, + unsigned long *irq_stack, unsigned long *irq_stack_end) +{ +#ifdef CONFIG_FRAME_POINTER + struct stack_frame *frame = (struct stack_frame *)bp; + + if (!in_irq_stack(stack, irq_stack, irq_stack_end)) + return (unsigned long)frame->next_frame; +#endif + return bp; +} + /* * x86-64 can have up to three kernel stacks: * process stack @@ -159,8 +183,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (ops->stack(data, id) < 0) break; - bp = print_context_stack(tinfo, stack, bp, ops, - data, estack_end, &graph); + bp = ops->walk_stack(tinfo, stack, bp, ops, + data, estack_end, &graph); ops->stack(data, "<EOE>"); /* * We link to the next stack via the @@ -175,7 +199,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, irq_stack = irq_stack_end - (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack); - if (stack >= irq_stack && stack < irq_stack_end) { + if (in_irq_stack(stack, irq_stack, irq_stack_end)) { if (ops->stack(data, "IRQ") < 0) break; bp = print_context_stack(tinfo, stack, bp, @@ -186,6 +210,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, * pointer (index -1 to end) in the IRQ stack: */ stack = (unsigned long *) (irq_stack_end[-1]); + bp = fixup_bp_irq_link(bp, stack, irq_stack, + irq_stack_end); irq_stack_end = NULL; ops->stack(data, "EOI"); continue; @@ -260,6 +286,7 @@ void show_registers(struct pt_regs *regs) sp = regs->sp; printk("CPU %d ", cpu); + print_modules(); __show_regs(regs, 1); printk("Process %s (pid: %d, threadinfo %p, task %p)\n", cur->comm, cur->pid, task_thread_info(cur), cur); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index d17d482a04f4..a966b753e496 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -517,11 +517,19 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, int checktype) { int i; + u64 end; u64 real_removed_size = 0; if (size > (ULLONG_MAX - start)) size = ULLONG_MAX - start; + end = start + size; + printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ", + (unsigned long long) start, + (unsigned long long) end); + e820_print_type(old_type); + printk(KERN_CONT "\n"); + for (i = 0; i < e820.nr_map; i++) { struct e820entry *ei = &e820.map[i]; u64 final_start, final_end; @@ -724,7 +732,7 @@ core_initcall(e820_mark_nvs_memory); /* * Early reserved memory areas. */ -#define MAX_EARLY_RES 20 +#define MAX_EARLY_RES 32 struct early_res { u64 start, end; @@ -732,7 +740,16 @@ struct early_res { char overlap_ok; }; static struct early_res early_res[MAX_EARLY_RES] __initdata = { - { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */ + { 0, PAGE_SIZE, "BIOS data page", 1 }, /* BIOS data page */ +#if defined(CONFIG_X86_32) && defined(CONFIG_X86_TRAMPOLINE) + /* + * But first pinch a few for the stack/trampoline stuff + * FIXME: Don't need the extra page at 4K, but need to fix + * trampoline before removing it. (see the GDT stuff) + */ + { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE", 1 }, +#endif + {} }; diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index cdcfb122f256..c2fa9b8b497e 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -362,7 +362,7 @@ void __init efi_init(void) printk(KERN_ERR PFX "Could not map the firmware vendor!\n"); early_iounmap(tmp, 2); - printk(KERN_INFO "EFI v%u.%.02u by %s \n", + printk(KERN_INFO "EFI v%u.%.02u by %s\n", efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff, vendor); diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 50b9c220e121..44a8e0dc6737 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -725,22 +725,61 @@ END(syscall_badsys) /* * System calls that need a pt_regs pointer. */ -#define PTREGSCALL(name) \ +#define PTREGSCALL0(name) \ ALIGN; \ ptregs_##name: \ leal 4(%esp),%eax; \ jmp sys_##name; -PTREGSCALL(iopl) -PTREGSCALL(fork) -PTREGSCALL(clone) -PTREGSCALL(vfork) -PTREGSCALL(execve) -PTREGSCALL(sigaltstack) -PTREGSCALL(sigreturn) -PTREGSCALL(rt_sigreturn) -PTREGSCALL(vm86) -PTREGSCALL(vm86old) +#define PTREGSCALL1(name) \ + ALIGN; \ +ptregs_##name: \ + leal 4(%esp),%edx; \ + movl (PT_EBX+4)(%esp),%eax; \ + jmp sys_##name; + +#define PTREGSCALL2(name) \ + ALIGN; \ +ptregs_##name: \ + leal 4(%esp),%ecx; \ + movl (PT_ECX+4)(%esp),%edx; \ + movl (PT_EBX+4)(%esp),%eax; \ + jmp sys_##name; + +#define PTREGSCALL3(name) \ + ALIGN; \ +ptregs_##name: \ + leal 4(%esp),%eax; \ + pushl %eax; \ + movl PT_EDX(%eax),%ecx; \ + movl PT_ECX(%eax),%edx; \ + movl PT_EBX(%eax),%eax; \ + call sys_##name; \ + addl $4,%esp; \ + ret + +PTREGSCALL1(iopl) +PTREGSCALL0(fork) +PTREGSCALL0(vfork) +PTREGSCALL3(execve) +PTREGSCALL2(sigaltstack) +PTREGSCALL0(sigreturn) +PTREGSCALL0(rt_sigreturn) +PTREGSCALL2(vm86) +PTREGSCALL1(vm86old) + +/* Clone is an oddball. The 4th arg is in %edi */ + ALIGN; +ptregs_clone: + leal 4(%esp),%eax + pushl %eax + pushl PT_EDI(%eax) + movl PT_EDX(%eax),%ecx + movl PT_ECX(%eax),%edx + movl PT_EBX(%eax),%eax + call sys_clone + addl $8,%esp + ret .macro FIXUP_ESPFIX_STACK /* @@ -1008,12 +1047,8 @@ END(spurious_interrupt_bug) ENTRY(kernel_thread_helper) pushl $0 # fake return address for unwinder CFI_STARTPROC - movl %edx,%eax - push %edx - CFI_ADJUST_CFA_OFFSET 4 - call *%ebx - push %eax - CFI_ADJUST_CFA_OFFSET 4 + movl %edi,%eax + call *%esi call do_exit ud2 # padding for call trace CFI_ENDPROC diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 63bca794c8f9..0697ff139837 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1076,10 +1076,10 @@ ENTRY(\sym) TRACE_IRQS_OFF movq %rsp,%rdi /* pt_regs pointer */ xorl %esi,%esi /* no error code */ - PER_CPU(init_tss, %rbp) - subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp) + PER_CPU(init_tss, %r12) + subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) call \do_sym - addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp) + addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) jmp paranoid_exit /* %ebx: no swapgs flag */ CFI_ENDPROC END(\sym) @@ -1166,63 +1166,20 @@ bad_gs: jmp 2b .previous -/* - * Create a kernel thread. - * - * C extern interface: - * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) - * - * asm input arguments: - * rdi: fn, rsi: arg, rdx: flags - */ -ENTRY(kernel_thread) - CFI_STARTPROC - FAKE_STACK_FRAME $child_rip - SAVE_ALL - - # rdi: flags, rsi: usp, rdx: will be &pt_regs - movq %rdx,%rdi - orq kernel_thread_flags(%rip),%rdi - movq $-1, %rsi - movq %rsp, %rdx - - xorl %r8d,%r8d - xorl %r9d,%r9d - - # clone now - call do_fork - movq %rax,RAX(%rsp) - xorl %edi,%edi - - /* - * It isn't worth to check for reschedule here, - * so internally to the x86_64 port you can rely on kernel_thread() - * not to reschedule the child before returning, this avoids the need - * of hacks for example to fork off the per-CPU idle tasks. - * [Hopefully no generic code relies on the reschedule -AK] - */ - RESTORE_ALL - UNFAKE_STACK_FRAME - ret - CFI_ENDPROC -END(kernel_thread) - -ENTRY(child_rip) +ENTRY(kernel_thread_helper) pushq $0 # fake return address CFI_STARTPROC /* * Here we are in the child and the registers are set as they were * at kernel_thread() invocation in the parent. */ - movq %rdi, %rax - movq %rsi, %rdi - call *%rax + call *%rsi # exit mov %eax, %edi call do_exit ud2 # padding for call trace CFI_ENDPROC -END(child_rip) +END(kernel_thread_helper) /* * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 309689245431..cd37469b54ee 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -30,14 +30,32 @@ #ifdef CONFIG_DYNAMIC_FTRACE +/* + * modifying_code is set to notify NMIs that they need to use + * memory barriers when entering or exiting. But we don't want + * to burden NMIs with unnecessary memory barriers when code + * modification is not being done (which is most of the time). + * + * A mutex is already held when ftrace_arch_code_modify_prepare + * and post_process are called. No locks need to be taken here. + * + * Stop machine will make sure currently running NMIs are done + * and new NMIs will see the updated variable before we need + * to worry about NMIs doing memory barriers. + */ +static int modifying_code __read_mostly; +static DEFINE_PER_CPU(int, save_modifying_code); + int ftrace_arch_code_modify_prepare(void) { set_kernel_text_rw(); + modifying_code = 1; return 0; } int ftrace_arch_code_modify_post_process(void) { + modifying_code = 0; set_kernel_text_ro(); return 0; } @@ -149,6 +167,11 @@ static void ftrace_mod_code(void) void ftrace_nmi_enter(void) { + __get_cpu_var(save_modifying_code) = modifying_code; + + if (!__get_cpu_var(save_modifying_code)) + return; + if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { smp_rmb(); ftrace_mod_code(); @@ -160,6 +183,9 @@ void ftrace_nmi_enter(void) void ftrace_nmi_exit(void) { + if (!__get_cpu_var(save_modifying_code)) + return; + /* Finish all executions before clearing nmi_running */ smp_mb(); atomic_dec(&nmi_running); @@ -484,13 +510,3 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, } } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - -#ifdef CONFIG_FTRACE_SYSCALLS - -extern unsigned long *sys_call_table; - -unsigned long __init arch_syscall_addr(int nr) -{ - return (unsigned long)(&sys_call_table)[nr]; -} -#endif diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c deleted file mode 100644 index 9b08e852fd1a..000000000000 --- a/arch/x86/kernel/geode_32.c +++ /dev/null @@ -1,196 +0,0 @@ -/* - * AMD Geode southbridge support code - * Copyright (C) 2006, Advanced Micro Devices, Inc. - * Copyright (C) 2007, Andres Salomon <dilinger@debian.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public License - * as published by the Free Software Foundation. - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/ioport.h> -#include <linux/io.h> -#include <asm/msr.h> -#include <asm/geode.h> - -static struct { - char *name; - u32 msr; - int size; - u32 base; -} lbars[] = { - { "geode-pms", MSR_LBAR_PMS, LBAR_PMS_SIZE, 0 }, - { "geode-acpi", MSR_LBAR_ACPI, LBAR_ACPI_SIZE, 0 }, - { "geode-gpio", MSR_LBAR_GPIO, LBAR_GPIO_SIZE, 0 }, - { "geode-mfgpt", MSR_LBAR_MFGPT, LBAR_MFGPT_SIZE, 0 } -}; - -static void __init init_lbars(void) -{ - u32 lo, hi; - int i; - - for (i = 0; i < ARRAY_SIZE(lbars); i++) { - rdmsr(lbars[i].msr, lo, hi); - if (hi & 0x01) - lbars[i].base = lo & 0x0000ffff; - - if (lbars[i].base == 0) - printk(KERN_ERR "geode: Couldn't initialize '%s'\n", - lbars[i].name); - } -} - -int geode_get_dev_base(unsigned int dev) -{ - BUG_ON(dev >= ARRAY_SIZE(lbars)); - return lbars[dev].base; -} -EXPORT_SYMBOL_GPL(geode_get_dev_base); - -/* === GPIO API === */ - -void geode_gpio_set(u32 gpio, unsigned int reg) -{ - u32 base = geode_get_dev_base(GEODE_DEV_GPIO); - - if (!base) - return; - - /* low bank register */ - if (gpio & 0xFFFF) - outl(gpio & 0xFFFF, base + reg); - /* high bank register */ - gpio >>= 16; - if (gpio) - outl(gpio, base + 0x80 + reg); -} -EXPORT_SYMBOL_GPL(geode_gpio_set); - -void geode_gpio_clear(u32 gpio, unsigned int reg) -{ - u32 base = geode_get_dev_base(GEODE_DEV_GPIO); - - if (!base) - return; - - /* low bank register */ - if (gpio & 0xFFFF) - outl((gpio & 0xFFFF) << 16, base + reg); - /* high bank register */ - gpio &= (0xFFFF << 16); - if (gpio) - outl(gpio, base + 0x80 + reg); -} -EXPORT_SYMBOL_GPL(geode_gpio_clear); - -int geode_gpio_isset(u32 gpio, unsigned int reg) -{ - u32 base = geode_get_dev_base(GEODE_DEV_GPIO); - u32 val; - - if (!base) - return 0; - - /* low bank register */ - if (gpio & 0xFFFF) { - val = inl(base + reg) & (gpio & 0xFFFF); - if ((gpio & 0xFFFF) == val) - return 1; - } - /* high bank register */ - gpio >>= 16; - if (gpio) { - val = inl(base + 0x80 + reg) & gpio; - if (gpio == val) - return 1; - } - return 0; -} -EXPORT_SYMBOL_GPL(geode_gpio_isset); - -void geode_gpio_set_irq(unsigned int group, unsigned int irq) -{ - u32 lo, hi; - - if (group > 7 || irq > 15) - return; - - rdmsr(MSR_PIC_ZSEL_HIGH, lo, hi); - - lo &= ~(0xF << (group * 4)); - lo |= (irq & 0xF) << (group * 4); - - wrmsr(MSR_PIC_ZSEL_HIGH, lo, hi); -} -EXPORT_SYMBOL_GPL(geode_gpio_set_irq); - -void geode_gpio_setup_event(unsigned int gpio, int pair, int pme) -{ - u32 base = geode_get_dev_base(GEODE_DEV_GPIO); - u32 offset, shift, val; - - if (gpio >= 24) - offset = GPIO_MAP_W; - else if (gpio >= 16) - offset = GPIO_MAP_Z; - else if (gpio >= 8) - offset = GPIO_MAP_Y; - else - offset = GPIO_MAP_X; - - shift = (gpio % 8) * 4; - - val = inl(base + offset); - - /* Clear whatever was there before */ - val &= ~(0xF << shift); - - /* And set the new value */ - - val |= ((pair & 7) << shift); - - /* Set the PME bit if this is a PME event */ - - if (pme) - val |= (1 << (shift + 3)); - - outl(val, base + offset); -} -EXPORT_SYMBOL_GPL(geode_gpio_setup_event); - -int geode_has_vsa2(void) -{ - static int has_vsa2 = -1; - - if (has_vsa2 == -1) { - u16 val; - - /* - * The VSA has virtual registers that we can query for a - * signature. - */ - outw(VSA_VR_UNLOCK, VSA_VRC_INDEX); - outw(VSA_VR_SIGNATURE, VSA_VRC_INDEX); - - val = inw(VSA_VRC_DATA); - has_vsa2 = (val == AMD_VSA_SIG || val == GSW_VSA_SIG); - } - - return has_vsa2; -} -EXPORT_SYMBOL_GPL(geode_has_vsa2); - -static int __init geode_southbridge_init(void) -{ - if (!is_geode()) - return -ENODEV; - - init_lbars(); - (void) mfgpt_timer_setup(); - return 0; -} - -postcore_initcall(geode_southbridge_init); diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 4f8e2507e8f3..5051b94c9069 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -29,8 +29,6 @@ static void __init i386_default_early_setup(void) void __init i386_start_kernel(void) { - reserve_trampoline_memory(); - reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 0b06cd778fd9..b5a9896ca1e7 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -98,8 +98,6 @@ void __init x86_64_start_reservations(char *real_mode_data) { copy_bootdata(__va(real_mode_data)); - reserve_trampoline_memory(); - reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ba6e65884603..ad80a1c718c6 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -34,6 +34,8 @@ */ unsigned long hpet_address; u8 hpet_blockid; /* OS timer block num */ +u8 hpet_msi_disable; + #ifdef CONFIG_PCI_MSI static unsigned long hpet_num_timers; #endif @@ -596,6 +598,9 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) unsigned int num_timers_used = 0; int i; + if (hpet_msi_disable) + return; + if (boot_cpu_has(X86_FEATURE_ARAT)) return; id = hpet_readl(HPET_ID); @@ -928,6 +933,9 @@ static __init int hpet_late_init(void) hpet_reserve_platform_timers(hpet_readl(HPET_ID)); hpet_print_config(); + if (hpet_msi_disable) + return 0; + if (boot_cpu_has(X86_FEATURE_ARAT)) return 0; diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index d42f65ac4927..dca2802c666f 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -212,25 +212,6 @@ static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len) return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); } -/* - * Store a breakpoint's encoded address, length, and type. - */ -static int arch_store_info(struct perf_event *bp) -{ - struct arch_hw_breakpoint *info = counter_arch_bp(bp); - /* - * For kernel-addresses, either the address or symbol name can be - * specified. - */ - if (info->name) - info->address = (unsigned long) - kallsyms_lookup_name(info->name); - if (info->address) - return 0; - - return -EINVAL; -} - int arch_bp_generic_fields(int x86_len, int x86_type, int *gen_len, int *gen_type) { @@ -362,11 +343,13 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp, return ret; } - if (bp->callback) - ret = arch_store_info(bp); - - if (ret < 0) - return ret; + /* + * For kernel-addresses, either the address or symbol name can be + * specified. + */ + if (info->name) + info->address = (unsigned long) + kallsyms_lookup_name(info->name); /* * Check that the low-order bits of the address are appropriate * for the alignment implied by len. @@ -503,8 +486,6 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) rcu_read_lock(); bp = per_cpu(bp_per_reg[i], cpu); - if (bp) - rc = NOTIFY_DONE; /* * Reset the 'i'th TRAP bit in dr6 to denote completion of * exception handling @@ -519,11 +500,17 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) break; } - (bp->callback)(bp, args->regs); + perf_bp_event(bp, args->regs); rcu_read_unlock(); } - if (dr6 & (~DR_TRAP_BITS)) + /* + * Further processing in do_debug() is needed for a) user-space + * breakpoints (to generate signals) and b) when the system has + * taken exception due to multiple causes + */ + if ((current->thread.debugreg6 & DR_TRAP_BITS) || + (dr6 & (~DR_TRAP_BITS))) rc = NOTIFY_DONE; set_debugreg(dr7, 7); diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index f2f8540a7f3d..c01a2b846d47 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -164,6 +164,11 @@ int init_fpu(struct task_struct *tsk) return 0; } +/* + * The xstateregs_active() routine is the same as the fpregs_active() routine, + * as the "regset->n" for the xstate regset will be updated based on the feature + * capabilites supported by the xsave. + */ int fpregs_active(struct task_struct *target, const struct user_regset *regset) { return tsk_used_math(target) ? regset->n : 0; @@ -204,8 +209,6 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; - set_stopped_child_used_math(target); - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &target->thread.xstate->fxsave, 0, -1); @@ -224,6 +227,68 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, return ret; } +int xstateregs_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + int ret; + + if (!cpu_has_xsave) + return -ENODEV; + + ret = init_fpu(target); + if (ret) + return ret; + + /* + * Copy the 48bytes defined by the software first into the xstate + * memory layout in the thread struct, so that we can copy the entire + * xstateregs to the user using one user_regset_copyout(). + */ + memcpy(&target->thread.xstate->fxsave.sw_reserved, + xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); + + /* + * Copy the xstate memory layout. + */ + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.xstate->xsave, 0, -1); + return ret; +} + +int xstateregs_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + struct xsave_hdr_struct *xsave_hdr; + + if (!cpu_has_xsave) + return -ENODEV; + + ret = init_fpu(target); + if (ret) + return ret; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.xstate->xsave, 0, -1); + + /* + * mxcsr reserved bits must be masked to zero for security reasons. + */ + target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; + + xsave_hdr = &target->thread.xstate->xsave.xsave_hdr; + + xsave_hdr->xstate_bv &= pcntxt_mask; + /* + * These bits must be zero. + */ + xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0; + + return ret; +} + #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION /* @@ -404,8 +469,6 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; - set_stopped_child_used_math(target); - if (!HAVE_HWFP) return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 99c4d308f16b..8eec0ec59af2 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -103,9 +103,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) * on system-call entry - see also fork() and the signal handling * code. */ -static int do_iopl(unsigned int level, struct pt_regs *regs) +long sys_iopl(unsigned int level, struct pt_regs *regs) { unsigned int old = (regs->flags >> 12) & 3; + struct thread_struct *t = ¤t->thread; if (level > 3) return -EINVAL; @@ -115,29 +116,8 @@ static int do_iopl(unsigned int level, struct pt_regs *regs) return -EPERM; } regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12); - - return 0; -} - -#ifdef CONFIG_X86_32 -long sys_iopl(struct pt_regs *regs) -{ - unsigned int level = regs->bx; - struct thread_struct *t = ¤t->thread; - int rc; - - rc = do_iopl(level, regs); - if (rc < 0) - goto out; - t->iopl = level << 12; set_iopl_mask(t->iopl); -out: - return rc; -} -#else -asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) -{ - return do_iopl(level, regs); + + return 0; } -#endif diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 664bcb7384ac..91fd0c70a18a 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -149,7 +149,7 @@ int show_interrupts(struct seq_file *p, void *v) if (!desc) return 0; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); for_each_online_cpu(j) any_count |= kstat_irqs_cpu(i, j); action = desc->action; @@ -170,7 +170,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); out: - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } @@ -294,12 +294,12 @@ void fixup_irqs(void) continue; /* interrupt's are disabled at this point */ - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); affinity = desc->affinity; if (!irq_has_action(irq) || cpumask_equal(affinity, cpu_online_mask)) { - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); continue; } @@ -326,7 +326,7 @@ void fixup_irqs(void) if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask) desc->chip->unmask(irq); - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); if (break_affinity && set_affinity) printk("Broke affinity for irq %i\n", irq); @@ -356,10 +356,10 @@ void fixup_irqs(void) irq = __get_cpu_var(vector_irq)[vector]; desc = irq_to_desc(irq); - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); if (desc->chip->retrigger) desc->chip->retrigger(irq); - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); } } } diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 20a5b3689463..bfba6019d762 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -42,6 +42,7 @@ #include <linux/init.h> #include <linux/smp.h> #include <linux/nmi.h> +#include <linux/hw_breakpoint.h> #include <asm/debugreg.h> #include <asm/apicdef.h> @@ -86,9 +87,15 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) gdb_regs[GDB_DS] = regs->ds; gdb_regs[GDB_ES] = regs->es; gdb_regs[GDB_CS] = regs->cs; - gdb_regs[GDB_SS] = __KERNEL_DS; gdb_regs[GDB_FS] = 0xFFFF; gdb_regs[GDB_GS] = 0xFFFF; + if (user_mode_vm(regs)) { + gdb_regs[GDB_SS] = regs->ss; + gdb_regs[GDB_SP] = regs->sp; + } else { + gdb_regs[GDB_SS] = __KERNEL_DS; + gdb_regs[GDB_SP] = kernel_stack_pointer(regs); + } #else gdb_regs[GDB_R8] = regs->r8; gdb_regs[GDB_R9] = regs->r9; @@ -101,8 +108,8 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) gdb_regs32[GDB_PS] = regs->flags; gdb_regs32[GDB_CS] = regs->cs; gdb_regs32[GDB_SS] = regs->ss; -#endif gdb_regs[GDB_SP] = kernel_stack_pointer(regs); +#endif } /** @@ -198,41 +205,81 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) static struct hw_breakpoint { unsigned enabled; - unsigned type; - unsigned len; unsigned long addr; + int len; + int type; + struct perf_event **pev; } breakinfo[4]; static void kgdb_correct_hw_break(void) { - unsigned long dr7; - int correctit = 0; - int breakbit; int breakno; - get_debugreg(dr7, 7); for (breakno = 0; breakno < 4; breakno++) { - breakbit = 2 << (breakno << 1); - if (!(dr7 & breakbit) && breakinfo[breakno].enabled) { - correctit = 1; - dr7 |= breakbit; - dr7 &= ~(0xf0000 << (breakno << 2)); - dr7 |= ((breakinfo[breakno].len << 2) | - breakinfo[breakno].type) << - ((breakno << 2) + 16); - if (breakno >= 0 && breakno <= 3) - set_debugreg(breakinfo[breakno].addr, breakno); - - } else { - if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { - correctit = 1; - dr7 &= ~breakbit; - dr7 &= ~(0xf0000 << (breakno << 2)); - } - } + struct perf_event *bp; + struct arch_hw_breakpoint *info; + int val; + int cpu = raw_smp_processor_id(); + if (!breakinfo[breakno].enabled) + continue; + bp = *per_cpu_ptr(breakinfo[breakno].pev, cpu); + info = counter_arch_bp(bp); + if (bp->attr.disabled != 1) + continue; + bp->attr.bp_addr = breakinfo[breakno].addr; + bp->attr.bp_len = breakinfo[breakno].len; + bp->attr.bp_type = breakinfo[breakno].type; + info->address = breakinfo[breakno].addr; + info->len = breakinfo[breakno].len; + info->type = breakinfo[breakno].type; + val = arch_install_hw_breakpoint(bp); + if (!val) + bp->attr.disabled = 0; } - if (correctit) - set_debugreg(dr7, 7); + hw_breakpoint_restore(); +} + +static int hw_break_reserve_slot(int breakno) +{ + int cpu; + int cnt = 0; + struct perf_event **pevent; + + for_each_online_cpu(cpu) { + cnt++; + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + if (dbg_reserve_bp_slot(*pevent)) + goto fail; + } + + return 0; + +fail: + for_each_online_cpu(cpu) { + cnt--; + if (!cnt) + break; + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + dbg_release_bp_slot(*pevent); + } + return -1; +} + +static int hw_break_release_slot(int breakno) +{ + struct perf_event **pevent; + int cpu; + + for_each_online_cpu(cpu) { + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + if (dbg_release_bp_slot(*pevent)) + /* + * The debugger is responisble for handing the retry on + * remove failure. + */ + return -1; + } + return 0; } static int @@ -246,6 +293,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) if (i == 4) return -1; + if (hw_break_release_slot(i)) { + printk(KERN_ERR "Cannot remove hw breakpoint at %lx\n", addr); + return -1; + } breakinfo[i].enabled = 0; return 0; @@ -254,15 +305,23 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) static void kgdb_remove_all_hw_break(void) { int i; + int cpu = raw_smp_processor_id(); + struct perf_event *bp; - for (i = 0; i < 4; i++) - memset(&breakinfo[i], 0, sizeof(struct hw_breakpoint)); + for (i = 0; i < 4; i++) { + if (!breakinfo[i].enabled) + continue; + bp = *per_cpu_ptr(breakinfo[i].pev, cpu); + if (bp->attr.disabled == 1) + continue; + arch_uninstall_hw_breakpoint(bp); + bp->attr.disabled = 1; + } } static int kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) { - unsigned type; int i; for (i = 0; i < 4; i++) @@ -273,27 +332,42 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) switch (bptype) { case BP_HARDWARE_BREAKPOINT: - type = 0; - len = 1; + len = 1; + breakinfo[i].type = X86_BREAKPOINT_EXECUTE; break; case BP_WRITE_WATCHPOINT: - type = 1; + breakinfo[i].type = X86_BREAKPOINT_WRITE; break; case BP_ACCESS_WATCHPOINT: - type = 3; + breakinfo[i].type = X86_BREAKPOINT_RW; break; default: return -1; } - - if (len == 1 || len == 2 || len == 4) - breakinfo[i].len = len - 1; - else + switch (len) { + case 1: + breakinfo[i].len = X86_BREAKPOINT_LEN_1; + break; + case 2: + breakinfo[i].len = X86_BREAKPOINT_LEN_2; + break; + case 4: + breakinfo[i].len = X86_BREAKPOINT_LEN_4; + break; +#ifdef CONFIG_X86_64 + case 8: + breakinfo[i].len = X86_BREAKPOINT_LEN_8; + break; +#endif + default: return -1; - - breakinfo[i].enabled = 1; + } breakinfo[i].addr = addr; - breakinfo[i].type = type; + if (hw_break_reserve_slot(i)) { + breakinfo[i].addr = 0; + return -1; + } + breakinfo[i].enabled = 1; return 0; } @@ -308,8 +382,21 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) */ void kgdb_disable_hw_debug(struct pt_regs *regs) { + int i; + int cpu = raw_smp_processor_id(); + struct perf_event *bp; + /* Disable hardware debugging while we are in kgdb: */ set_debugreg(0UL, 7); + for (i = 0; i < 4; i++) { + if (!breakinfo[i].enabled) + continue; + bp = *per_cpu_ptr(breakinfo[i].pev, cpu); + if (bp->attr.disabled == 1) + continue; + arch_uninstall_hw_breakpoint(bp); + bp->attr.disabled = 1; + } } /** @@ -373,7 +460,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, struct pt_regs *linux_regs) { unsigned long addr; - unsigned long dr6; char *ptr; int newPC; @@ -395,25 +481,10 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, /* set the trace bit if we're stepping */ if (remcomInBuffer[0] == 's') { linux_regs->flags |= X86_EFLAGS_TF; - kgdb_single_step = 1; atomic_set(&kgdb_cpu_doing_single_step, raw_smp_processor_id()); } - get_debugreg(dr6, 6); - if (!(dr6 & 0x4000)) { - int breakno; - - for (breakno = 0; breakno < 4; breakno++) { - if (dr6 & (1 << breakno) && - breakinfo[breakno].type == 0) { - /* Set restore flag: */ - linux_regs->flags |= X86_EFLAGS_RF; - break; - } - } - } - set_debugreg(0UL, 6); kgdb_correct_hw_break(); return 0; @@ -481,8 +552,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd) break; case DIE_DEBUG: - if (atomic_read(&kgdb_cpu_doing_single_step) == - raw_smp_processor_id()) { + if (atomic_read(&kgdb_cpu_doing_single_step) != -1) { if (user_mode(regs)) return single_step_cont(regs, args); break; @@ -535,7 +605,42 @@ static struct notifier_block kgdb_notifier = { */ int kgdb_arch_init(void) { - return register_die_notifier(&kgdb_notifier); + int i, cpu; + int ret; + struct perf_event_attr attr; + struct perf_event **pevent; + + ret = register_die_notifier(&kgdb_notifier); + if (ret != 0) + return ret; + /* + * Pre-allocate the hw breakpoint structions in the non-atomic + * portion of kgdb because this operation requires mutexs to + * complete. + */ + attr.bp_addr = (unsigned long)kgdb_arch_init; + attr.type = PERF_TYPE_BREAKPOINT; + attr.bp_len = HW_BREAKPOINT_LEN_1; + attr.bp_type = HW_BREAKPOINT_W; + attr.disabled = 1; + for (i = 0; i < 4; i++) { + breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); + if (IS_ERR(breakinfo[i].pev)) { + printk(KERN_ERR "kgdb: Could not allocate hw breakpoints\n"); + breakinfo[i].pev = NULL; + kgdb_arch_exit(); + return -1; + } + for_each_online_cpu(cpu) { + pevent = per_cpu_ptr(breakinfo[i].pev, cpu); + pevent[0]->hw.sample_period = 1; + if (pevent[0]->destroy != NULL) { + pevent[0]->destroy = NULL; + release_bp_slot(*pevent); + } + } + } + return ret; } /** @@ -546,6 +651,13 @@ int kgdb_arch_init(void) */ void kgdb_arch_exit(void) { + int i; + for (i = 0; i < 4; i++) { + if (breakinfo[i].pev) { + unregister_wide_hw_breakpoint(breakinfo[i].pev); + breakinfo[i].pev = NULL; + } + } unregister_die_notifier(&kgdb_notifier); } diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 1f3186ce213c..5de9f4a9c3fd 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -337,6 +337,9 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p) int __kprobes arch_prepare_kprobe(struct kprobe *p) { + if (alternatives_text_reserved(p->addr, p->addr)) + return -EINVAL; + if (!can_probe((unsigned long)p->addr)) return -EILSEQ; /* insn: must be on special executable page on x86. */ @@ -429,7 +432,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) { -#if !defined(CONFIG_PREEMPT) || defined(CONFIG_FREEZER) +#if !defined(CONFIG_PREEMPT) if (p->ainsn.boostable == 1 && !p->post_handler) { /* Boost up -- we can execute copied instructions directly */ reset_current_kprobe(); @@ -481,7 +484,7 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, /* * Interrupts are disabled on entry as trap3 is an interrupt gate and they - * remain disabled thorough out this function. + * remain disabled throughout this function. */ static int __kprobes kprobe_handler(struct pt_regs *regs) { @@ -818,7 +821,7 @@ no_change: /* * Interrupts are disabled on entry as trap1 is an interrupt gate and they - * remain disabled thoroughout this function. + * remain disabled throughout this function. */ static int __kprobes post_kprobe_handler(struct pt_regs *regs) { diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c deleted file mode 100644 index 2a62d843f015..000000000000 --- a/arch/x86/kernel/mfgpt_32.c +++ /dev/null @@ -1,410 +0,0 @@ -/* - * Driver/API for AMD Geode Multi-Function General Purpose Timers (MFGPT) - * - * Copyright (C) 2006, Advanced Micro Devices, Inc. - * Copyright (C) 2007, Andres Salomon <dilinger@debian.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public License - * as published by the Free Software Foundation. - * - * The MFGPTs are documented in AMD Geode CS5536 Companion Device Data Book. - */ - -/* - * We are using the 32.768kHz input clock - it's the only one that has the - * ranges we find desirable. The following table lists the suitable - * divisors and the associated Hz, minimum interval and the maximum interval: - * - * Divisor Hz Min Delta (s) Max Delta (s) - * 1 32768 .00048828125 2.000 - * 2 16384 .0009765625 4.000 - * 4 8192 .001953125 8.000 - * 8 4096 .00390625 16.000 - * 16 2048 .0078125 32.000 - * 32 1024 .015625 64.000 - * 64 512 .03125 128.000 - * 128 256 .0625 256.000 - * 256 128 .125 512.000 - */ - -#include <linux/kernel.h> -#include <linux/interrupt.h> -#include <linux/module.h> -#include <asm/geode.h> - -#define MFGPT_DEFAULT_IRQ 7 - -static struct mfgpt_timer_t { - unsigned int avail:1; -} mfgpt_timers[MFGPT_MAX_TIMERS]; - -/* Selected from the table above */ - -#define MFGPT_DIVISOR 16 -#define MFGPT_SCALE 4 /* divisor = 2^(scale) */ -#define MFGPT_HZ (32768 / MFGPT_DIVISOR) -#define MFGPT_PERIODIC (MFGPT_HZ / HZ) - -/* Allow for disabling of MFGPTs */ -static int disable; -static int __init mfgpt_disable(char *s) -{ - disable = 1; - return 1; -} -__setup("nomfgpt", mfgpt_disable); - -/* Reset the MFGPT timers. This is required by some broken BIOSes which already - * do the same and leave the system in an unstable state. TinyBIOS 0.98 is - * affected at least (0.99 is OK with MFGPT workaround left to off). - */ -static int __init mfgpt_fix(char *s) -{ - u32 val, dummy; - - /* The following udocumented bit resets the MFGPT timers */ - val = 0xFF; dummy = 0; - wrmsr(MSR_MFGPT_SETUP, val, dummy); - return 1; -} -__setup("mfgptfix", mfgpt_fix); - -/* - * Check whether any MFGPTs are available for the kernel to use. In most - * cases, firmware that uses AMD's VSA code will claim all timers during - * bootup; we certainly don't want to take them if they're already in use. - * In other cases (such as with VSAless OpenFirmware), the system firmware - * leaves timers available for us to use. - */ - - -static int timers = -1; - -static void geode_mfgpt_detect(void) -{ - int i; - u16 val; - - timers = 0; - - if (disable) { - printk(KERN_INFO "geode-mfgpt: MFGPT support is disabled\n"); - goto done; - } - - if (!geode_get_dev_base(GEODE_DEV_MFGPT)) { - printk(KERN_INFO "geode-mfgpt: MFGPT LBAR is not set up\n"); - goto done; - } - - for (i = 0; i < MFGPT_MAX_TIMERS; i++) { - val = geode_mfgpt_read(i, MFGPT_REG_SETUP); - if (!(val & MFGPT_SETUP_SETUP)) { - mfgpt_timers[i].avail = 1; - timers++; - } - } - -done: - printk(KERN_INFO "geode-mfgpt: %d MFGPT timers available.\n", timers); -} - -int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable) -{ - u32 msr, mask, value, dummy; - int shift = (cmp == MFGPT_CMP1) ? 0 : 8; - - if (timer < 0 || timer >= MFGPT_MAX_TIMERS) - return -EIO; - - /* - * The register maps for these are described in sections 6.17.1.x of - * the AMD Geode CS5536 Companion Device Data Book. - */ - switch (event) { - case MFGPT_EVENT_RESET: - /* - * XXX: According to the docs, we cannot reset timers above - * 6; that is, resets for 7 and 8 will be ignored. Is this - * a problem? -dilinger - */ - msr = MSR_MFGPT_NR; - mask = 1 << (timer + 24); - break; - - case MFGPT_EVENT_NMI: - msr = MSR_MFGPT_NR; - mask = 1 << (timer + shift); - break; - - case MFGPT_EVENT_IRQ: - msr = MSR_MFGPT_IRQ; - mask = 1 << (timer + shift); - break; - - default: - return -EIO; - } - - rdmsr(msr, value, dummy); - - if (enable) - value |= mask; - else - value &= ~mask; - - wrmsr(msr, value, dummy); - return 0; -} -EXPORT_SYMBOL_GPL(geode_mfgpt_toggle_event); - -int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable) -{ - u32 zsel, lpc, dummy; - int shift; - - if (timer < 0 || timer >= MFGPT_MAX_TIMERS) - return -EIO; - - /* - * Unfortunately, MFGPTs come in pairs sharing their IRQ lines. If VSA - * is using the same CMP of the timer's Siamese twin, the IRQ is set to - * 2, and we mustn't use nor change it. - * XXX: Likewise, 2 Linux drivers might clash if the 2nd overwrites the - * IRQ of the 1st. This can only happen if forcing an IRQ, calling this - * with *irq==0 is safe. Currently there _are_ no 2 drivers. - */ - rdmsr(MSR_PIC_ZSEL_LOW, zsel, dummy); - shift = ((cmp == MFGPT_CMP1 ? 0 : 4) + timer % 4) * 4; - if (((zsel >> shift) & 0xF) == 2) - return -EIO; - - /* Choose IRQ: if none supplied, keep IRQ already set or use default */ - if (!*irq) - *irq = (zsel >> shift) & 0xF; - if (!*irq) - *irq = MFGPT_DEFAULT_IRQ; - - /* Can't use IRQ if it's 0 (=disabled), 2, or routed to LPC */ - if (*irq < 1 || *irq == 2 || *irq > 15) - return -EIO; - rdmsr(MSR_PIC_IRQM_LPC, lpc, dummy); - if (lpc & (1 << *irq)) - return -EIO; - - /* All chosen and checked - go for it */ - if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable)) - return -EIO; - if (enable) { - zsel = (zsel & ~(0xF << shift)) | (*irq << shift); - wrmsr(MSR_PIC_ZSEL_LOW, zsel, dummy); - } - - return 0; -} - -static int mfgpt_get(int timer) -{ - mfgpt_timers[timer].avail = 0; - printk(KERN_INFO "geode-mfgpt: Registered timer %d\n", timer); - return timer; -} - -int geode_mfgpt_alloc_timer(int timer, int domain) -{ - int i; - - if (timers == -1) { - /* timers haven't been detected yet */ - geode_mfgpt_detect(); - } - - if (!timers) - return -1; - - if (timer >= MFGPT_MAX_TIMERS) - return -1; - - if (timer < 0) { - /* Try to find an available timer */ - for (i = 0; i < MFGPT_MAX_TIMERS; i++) { - if (mfgpt_timers[i].avail) - return mfgpt_get(i); - - if (i == 5 && domain == MFGPT_DOMAIN_WORKING) - break; - } - } else { - /* If they requested a specific timer, try to honor that */ - if (mfgpt_timers[timer].avail) - return mfgpt_get(timer); - } - - /* No timers available - too bad */ - return -1; -} -EXPORT_SYMBOL_GPL(geode_mfgpt_alloc_timer); - - -#ifdef CONFIG_GEODE_MFGPT_TIMER - -/* - * The MFPGT timers on the CS5536 provide us with suitable timers to use - * as clock event sources - not as good as a HPET or APIC, but certainly - * better than the PIT. This isn't a general purpose MFGPT driver, but - * a simplified one designed specifically to act as a clock event source. - * For full details about the MFGPT, please consult the CS5536 data sheet. - */ - -#include <linux/clocksource.h> -#include <linux/clockchips.h> - -static unsigned int mfgpt_tick_mode = CLOCK_EVT_MODE_SHUTDOWN; -static u16 mfgpt_event_clock; - -static int irq; -static int __init mfgpt_setup(char *str) -{ - get_option(&str, &irq); - return 1; -} -__setup("mfgpt_irq=", mfgpt_setup); - -static void mfgpt_disable_timer(u16 clock) -{ - /* avoid races by clearing CMP1 and CMP2 unconditionally */ - geode_mfgpt_write(clock, MFGPT_REG_SETUP, (u16) ~MFGPT_SETUP_CNTEN | - MFGPT_SETUP_CMP1 | MFGPT_SETUP_CMP2); -} - -static int mfgpt_next_event(unsigned long, struct clock_event_device *); -static void mfgpt_set_mode(enum clock_event_mode, struct clock_event_device *); - -static struct clock_event_device mfgpt_clockevent = { - .name = "mfgpt-timer", - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, - .set_mode = mfgpt_set_mode, - .set_next_event = mfgpt_next_event, - .rating = 250, - .cpumask = cpu_all_mask, - .shift = 32 -}; - -static void mfgpt_start_timer(u16 delta) -{ - geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_CMP2, (u16) delta); - geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_COUNTER, 0); - - geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP, - MFGPT_SETUP_CNTEN | MFGPT_SETUP_CMP2); -} - -static void mfgpt_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt) -{ - mfgpt_disable_timer(mfgpt_event_clock); - - if (mode == CLOCK_EVT_MODE_PERIODIC) - mfgpt_start_timer(MFGPT_PERIODIC); - - mfgpt_tick_mode = mode; -} - -static int mfgpt_next_event(unsigned long delta, struct clock_event_device *evt) -{ - mfgpt_start_timer(delta); - return 0; -} - -static irqreturn_t mfgpt_tick(int irq, void *dev_id) -{ - u16 val = geode_mfgpt_read(mfgpt_event_clock, MFGPT_REG_SETUP); - - /* See if the interrupt was for us */ - if (!(val & (MFGPT_SETUP_SETUP | MFGPT_SETUP_CMP2 | MFGPT_SETUP_CMP1))) - return IRQ_NONE; - - /* Turn off the clock (and clear the event) */ - mfgpt_disable_timer(mfgpt_event_clock); - - if (mfgpt_tick_mode == CLOCK_EVT_MODE_SHUTDOWN) - return IRQ_HANDLED; - - /* Clear the counter */ - geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_COUNTER, 0); - - /* Restart the clock in periodic mode */ - - if (mfgpt_tick_mode == CLOCK_EVT_MODE_PERIODIC) { - geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP, - MFGPT_SETUP_CNTEN | MFGPT_SETUP_CMP2); - } - - mfgpt_clockevent.event_handler(&mfgpt_clockevent); - return IRQ_HANDLED; -} - -static struct irqaction mfgptirq = { - .handler = mfgpt_tick, - .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER, - .name = "mfgpt-timer" -}; - -int __init mfgpt_timer_setup(void) -{ - int timer, ret; - u16 val; - - timer = geode_mfgpt_alloc_timer(MFGPT_TIMER_ANY, MFGPT_DOMAIN_WORKING); - if (timer < 0) { - printk(KERN_ERR - "mfgpt-timer: Could not allocate a MFPGT timer\n"); - return -ENODEV; - } - - mfgpt_event_clock = timer; - - /* Set up the IRQ on the MFGPT side */ - if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, &irq)) { - printk(KERN_ERR "mfgpt-timer: Could not set up IRQ %d\n", irq); - return -EIO; - } - - /* And register it with the kernel */ - ret = setup_irq(irq, &mfgptirq); - - if (ret) { - printk(KERN_ERR - "mfgpt-timer: Unable to set up the interrupt.\n"); - goto err; - } - - /* Set the clock scale and enable the event mode for CMP2 */ - val = MFGPT_SCALE | (3 << 8); - - geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP, val); - - /* Set up the clock event */ - mfgpt_clockevent.mult = div_sc(MFGPT_HZ, NSEC_PER_SEC, - mfgpt_clockevent.shift); - mfgpt_clockevent.min_delta_ns = clockevent_delta2ns(0xF, - &mfgpt_clockevent); - mfgpt_clockevent.max_delta_ns = clockevent_delta2ns(0xFFFE, - &mfgpt_clockevent); - - printk(KERN_INFO - "mfgpt-timer: Registering MFGPT timer %d as a clock event, using IRQ %d\n", - timer, irq); - clockevents_register_device(&mfgpt_clockevent); - - return 0; - -err: - geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, &irq); - printk(KERN_ERR - "mfgpt-timer: Unable to set up the MFGPT clock source\n"); - return -EIO; -} - -#endif diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 63123d902103..e1af7c055c7d 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -13,6 +13,9 @@ * Licensed under the terms of the GNU General Public * License version 2. See file COPYING for details. */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/firmware.h> #include <linux/pci_ids.h> #include <linux/uaccess.h> @@ -33,9 +36,6 @@ MODULE_LICENSE("GPL v2"); #define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 #define UCODE_UCODE_TYPE 0x00000001 -const struct firmware *firmware; -static int supported_cpu; - struct equiv_cpu_entry { u32 installed_cpu; u32 fixed_errata_mask; @@ -74,14 +74,17 @@ static struct equiv_cpu_entry *equiv_cpu_table; static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { + struct cpuinfo_x86 *c = &cpu_data(cpu); u32 dummy; - if (!supported_cpu) - return -1; - memset(csig, 0, sizeof(*csig)); + if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { + pr_warning("microcode: CPU%d: AMD CPU family 0x%x not " + "supported\n", cpu, c->x86); + return -1; + } rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); - pr_info("microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev); + pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev); return 0; } @@ -111,8 +114,8 @@ static int get_matching_microcode(int cpu, void *mc, int rev) /* ucode might be chipset specific -- currently we don't support this */ if (mc_header->nb_dev_id || mc_header->sb_dev_id) { - pr_err(KERN_ERR "microcode: CPU%d: loading of chipset " - "specific code not yet supported\n", cpu); + pr_err("CPU%d: loading of chipset specific code not yet supported\n", + cpu); return 0; } @@ -141,12 +144,12 @@ static int apply_microcode_amd(int cpu) /* check current patch id and patch's id for match */ if (rev != mc_amd->hdr.patch_id) { - pr_err("microcode: CPU%d: update failed " - "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); + pr_err("CPU%d: update failed (for patch_level=0x%x)\n", + cpu, mc_amd->hdr.patch_id); return -1; } - pr_info("microcode: CPU%d: updated (new patch_level=0x%x)\n", cpu, rev); + pr_info("CPU%d: updated (new patch_level=0x%x)\n", cpu, rev); uci->cpu_sig.rev = rev; return 0; @@ -169,15 +172,14 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) return NULL; if (section_hdr[0] != UCODE_UCODE_TYPE) { - pr_err("microcode: error: invalid type field in " - "container file section header\n"); + pr_err("error: invalid type field in container file section header\n"); return NULL; } total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); if (total_size > size || total_size > UCODE_MAX_SIZE) { - pr_err("microcode: error: size mismatch\n"); + pr_err("error: size mismatch\n"); return NULL; } @@ -206,14 +208,13 @@ static int install_equiv_cpu_table(const u8 *buf) size = buf_pos[2]; if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { - pr_err("microcode: error: invalid type field in " - "container file section header\n"); + pr_err("error: invalid type field in container file section header\n"); return 0; } equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); if (!equiv_cpu_table) { - pr_err("microcode: failed to allocate equivalent CPU table\n"); + pr_err("failed to allocate equivalent CPU table\n"); return 0; } @@ -246,7 +247,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) offset = install_equiv_cpu_table(ucode_ptr); if (!offset) { - pr_err("microcode: failed to create equivalent cpu table\n"); + pr_err("failed to create equivalent cpu table\n"); return UCODE_ERROR; } @@ -277,8 +278,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) if (!leftover) { vfree(uci->mc); uci->mc = new_mc; - pr_debug("microcode: CPU%d found a matching microcode " - "update with version 0x%x (current=0x%x)\n", + pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", cpu, new_rev, uci->cpu_sig.rev); } else { vfree(new_mc); @@ -294,27 +294,32 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) static enum ucode_state request_microcode_fw(int cpu, struct device *device) { + const char *fw_name = "amd-ucode/microcode_amd.bin"; + const struct firmware *firmware; enum ucode_state ret; - if (firmware == NULL) + if (request_firmware(&firmware, fw_name, device)) { + printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); return UCODE_NFOUND; + } if (*(u32 *)firmware->data != UCODE_MAGIC) { - pr_err("microcode: invalid UCODE_MAGIC (0x%08x)\n", + pr_err("invalid UCODE_MAGIC (0x%08x)\n", *(u32 *)firmware->data); return UCODE_ERROR; } ret = generic_load_microcode(cpu, firmware->data, firmware->size); + release_firmware(firmware); + return ret; } static enum ucode_state request_microcode_user(int cpu, const void __user *buf, size_t size) { - pr_info("microcode: AMD microcode update via " - "/dev/cpu/microcode not supported\n"); + pr_info("AMD microcode update via /dev/cpu/microcode not supported\n"); return UCODE_ERROR; } @@ -326,32 +331,7 @@ static void microcode_fini_cpu_amd(int cpu) uci->mc = NULL; } -void init_microcode_amd(struct device *device) -{ - const char *fw_name = "amd-ucode/microcode_amd.bin"; - struct cpuinfo_x86 *c = &boot_cpu_data; - - WARN_ON(c->x86_vendor != X86_VENDOR_AMD); - - if (c->x86 < 0x10) { - pr_warning("microcode: AMD CPU family 0x%x not supported\n", - c->x86); - return; - } - supported_cpu = 1; - - if (request_firmware(&firmware, fw_name, device)) - pr_err("microcode: failed to load file %s\n", fw_name); -} - -void fini_microcode_amd(void) -{ - release_firmware(firmware); -} - static struct microcode_ops microcode_amd_ops = { - .init = init_microcode_amd, - .fini = fini_microcode_amd, .request_microcode_user = request_microcode_user, .request_microcode_fw = request_microcode_fw, .collect_cpu_info = collect_cpu_info_amd, diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index e68aae397869..cceb5bc3c3c2 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -70,6 +70,9 @@ * Fix sigmatch() macro to handle old CPUs with pf == 0. * Thanks to Stuart Swales for pointing out this bug. */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/platform_device.h> #include <linux/miscdevice.h> #include <linux/capability.h> @@ -209,7 +212,7 @@ static ssize_t microcode_write(struct file *file, const char __user *buf, ssize_t ret = -EINVAL; if ((len >> PAGE_SHIFT) > totalram_pages) { - pr_err("microcode: too much data (max %ld pages)\n", totalram_pages); + pr_err("too much data (max %ld pages)\n", totalram_pages); return ret; } @@ -244,7 +247,7 @@ static int __init microcode_dev_init(void) error = misc_register(µcode_dev); if (error) { - pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR); + pr_err("can't misc_register on minor=%d\n", MICROCODE_MINOR); return error; } @@ -359,7 +362,7 @@ static enum ucode_state microcode_resume_cpu(int cpu) if (!uci->mc) return UCODE_NFOUND; - pr_debug("microcode: CPU%d updated upon resume\n", cpu); + pr_debug("CPU%d updated upon resume\n", cpu); apply_microcode_on_target(cpu); return UCODE_OK; @@ -379,7 +382,7 @@ static enum ucode_state microcode_init_cpu(int cpu) ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev); if (ustate == UCODE_OK) { - pr_debug("microcode: CPU%d updated upon init\n", cpu); + pr_debug("CPU%d updated upon init\n", cpu); apply_microcode_on_target(cpu); } @@ -391,7 +394,7 @@ static enum ucode_state microcode_update_cpu(int cpu) struct ucode_cpu_info *uci = ucode_cpu_info + cpu; enum ucode_state ustate; - if (uci->valid && uci->mc) + if (uci->valid) ustate = microcode_resume_cpu(cpu); else ustate = microcode_init_cpu(cpu); @@ -406,7 +409,7 @@ static int mc_sysdev_add(struct sys_device *sys_dev) if (!cpu_online(cpu)) return 0; - pr_debug("microcode: CPU%d added\n", cpu); + pr_debug("CPU%d added\n", cpu); err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); if (err) @@ -425,7 +428,7 @@ static int mc_sysdev_remove(struct sys_device *sys_dev) if (!cpu_online(cpu)) return 0; - pr_debug("microcode: CPU%d removed\n", cpu); + pr_debug("CPU%d removed\n", cpu); microcode_fini_cpu(cpu); sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); return 0; @@ -473,15 +476,15 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) microcode_update_cpu(cpu); case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: - pr_debug("microcode: CPU%d added\n", cpu); + pr_debug("CPU%d added\n", cpu); if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) - pr_err("microcode: Failed to create group for CPU%d\n", cpu); + pr_err("Failed to create group for CPU%d\n", cpu); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: /* Suspend is in progress, only remove the interface */ sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); - pr_debug("microcode: CPU%d removed\n", cpu); + pr_debug("CPU%d removed\n", cpu); break; case CPU_DEAD: case CPU_UP_CANCELED_FROZEN: @@ -507,7 +510,7 @@ static int __init microcode_init(void) microcode_ops = init_amd_microcode(); if (!microcode_ops) { - pr_err("microcode: no support for this CPU vendor\n"); + pr_err("no support for this CPU vendor\n"); return -ENODEV; } @@ -518,9 +521,6 @@ static int __init microcode_init(void) return PTR_ERR(microcode_pdev); } - if (microcode_ops->init) - microcode_ops->init(µcode_pdev->dev); - get_online_cpus(); mutex_lock(µcode_mutex); @@ -541,8 +541,7 @@ static int __init microcode_init(void) register_hotcpu_notifier(&mc_cpu_notifier); pr_info("Microcode Update Driver: v" MICROCODE_VERSION - " <tigran@aivazian.fsnet.co.uk>," - " Peter Oruba\n"); + " <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n"); return 0; } @@ -564,9 +563,6 @@ static void __exit microcode_exit(void) platform_device_unregister(microcode_pdev); - if (microcode_ops->fini) - microcode_ops->fini(); - microcode_ops = NULL; pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 0d334ddd0a96..85a343e28937 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -70,6 +70,9 @@ * Fix sigmatch() macro to handle old CPUs with pf == 0. * Thanks to Stuart Swales for pointing out this bug. */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/firmware.h> #include <linux/uaccess.h> #include <linux/kernel.h> @@ -146,8 +149,7 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || cpu_has(c, X86_FEATURE_IA64)) { - printk(KERN_ERR "microcode: CPU%d not a capable Intel " - "processor\n", cpu_num); + pr_err("CPU%d not a capable Intel processor\n", cpu_num); return -1; } @@ -165,8 +167,8 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); - printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", - cpu_num, csig->sig, csig->pf, csig->rev); + pr_info("CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", + cpu_num, csig->sig, csig->pf, csig->rev); return 0; } @@ -194,28 +196,24 @@ static int microcode_sanity_check(void *mc) data_size = get_datasize(mc_header); if (data_size + MC_HEADER_SIZE > total_size) { - printk(KERN_ERR "microcode: error! " - "Bad data size in microcode data file\n"); + pr_err("error! Bad data size in microcode data file\n"); return -EINVAL; } if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { - printk(KERN_ERR "microcode: error! " - "Unknown microcode update format\n"); + pr_err("error! Unknown microcode update format\n"); return -EINVAL; } ext_table_size = total_size - (MC_HEADER_SIZE + data_size); if (ext_table_size) { if ((ext_table_size < EXT_HEADER_SIZE) || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { - printk(KERN_ERR "microcode: error! " - "Small exttable size in microcode data file\n"); + pr_err("error! Small exttable size in microcode data file\n"); return -EINVAL; } ext_header = mc + MC_HEADER_SIZE + data_size; if (ext_table_size != exttable_size(ext_header)) { - printk(KERN_ERR "microcode: error! " - "Bad exttable size in microcode data file\n"); + pr_err("error! Bad exttable size in microcode data file\n"); return -EFAULT; } ext_sigcount = ext_header->count; @@ -230,8 +228,7 @@ static int microcode_sanity_check(void *mc) while (i--) ext_table_sum += ext_tablep[i]; if (ext_table_sum) { - printk(KERN_WARNING "microcode: aborting, " - "bad extended signature table checksum\n"); + pr_warning("aborting, bad extended signature table checksum\n"); return -EINVAL; } } @@ -242,7 +239,7 @@ static int microcode_sanity_check(void *mc) while (i--) orig_sum += ((int *)mc)[i]; if (orig_sum) { - printk(KERN_ERR "microcode: aborting, bad checksum\n"); + pr_err("aborting, bad checksum\n"); return -EINVAL; } if (!ext_table_size) @@ -255,7 +252,7 @@ static int microcode_sanity_check(void *mc) - (mc_header->sig + mc_header->pf + mc_header->cksum) + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); if (sum) { - printk(KERN_ERR "microcode: aborting, bad checksum\n"); + pr_err("aborting, bad checksum\n"); return -EINVAL; } } @@ -327,13 +324,11 @@ static int apply_microcode(int cpu) rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); if (val[1] != mc_intel->hdr.rev) { - printk(KERN_ERR "microcode: CPU%d update " - "to revision 0x%x failed\n", - cpu_num, mc_intel->hdr.rev); + pr_err("CPU%d update to revision 0x%x failed\n", + cpu_num, mc_intel->hdr.rev); return -1; } - printk(KERN_INFO "microcode: CPU%d updated to revision " - "0x%x, date = %04x-%02x-%02x \n", + pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n", cpu_num, val[1], mc_intel->hdr.date & 0xffff, mc_intel->hdr.date >> 24, @@ -362,8 +357,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, mc_size = get_totalsize(&mc_header); if (!mc_size || mc_size > leftover) { - printk(KERN_ERR "microcode: error!" - "Bad data in microcode data file\n"); + pr_err("error! Bad data in microcode data file\n"); break; } @@ -405,9 +399,8 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, vfree(uci->mc); uci->mc = (struct microcode_intel *)new_mc; - pr_debug("microcode: CPU%d found a matching microcode update with" - " version 0x%x (current=0x%x)\n", - cpu, new_rev, uci->cpu_sig.rev); + pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", + cpu, new_rev, uci->cpu_sig.rev); out: return state; } @@ -429,7 +422,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device) c->x86, c->x86_model, c->x86_mask); if (request_firmware(&firmware, name, device)) { - pr_debug("microcode: data file %s load failed\n", name); + pr_debug("data file %s load failed\n", name); return UCODE_NFOUND; } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 35a57c963df9..a2c1edd2d3ac 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -359,13 +359,6 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) x86_init.mpparse.mpc_record(1); } -#ifdef CONFIG_X86_BIGSMP - generic_bigsmp_probe(); -#endif - - if (apic->setup_apic_routing) - apic->setup_apic_routing(); - if (!num_processors) printk(KERN_ERR "MPTABLE: no processors registered!\n"); return num_processors; @@ -945,9 +938,6 @@ void __init early_reserve_e820_mpc_new(void) { if (enable_update_mptable && alloc_mptable) { u64 startt = 0; -#ifdef CONFIG_X86_TRAMPOLINE - startt = TRAMPOLINE_BASE; -#endif mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4); } } diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 553449951b84..206735ac8cbd 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -172,11 +172,10 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg) static int msr_open(struct inode *inode, struct file *file) { - unsigned int cpu = iminor(file->f_path.dentry->d_inode); - struct cpuinfo_x86 *c = &cpu_data(cpu); + unsigned int cpu; + struct cpuinfo_x86 *c; cpu = iminor(file->f_path.dentry->d_inode); - if (cpu >= nr_cpu_ids || !cpu_online(cpu)) return -ENXIO; /* No such CPU */ @@ -247,7 +246,7 @@ static int __init msr_init(void) int i, err = 0; i = 0; - if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) { + if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops)) { printk(KERN_ERR "msr: unable to get major %d for msr\n", MSR_MAJOR); err = -EBUSY; @@ -275,7 +274,7 @@ out_class: msr_device_destroy(i); class_destroy(msr_class); out_chrdev: - unregister_chrdev(MSR_MAJOR, "cpu/msr"); + __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); out: return err; } @@ -286,7 +285,7 @@ static void __exit msr_exit(void) for_each_online_cpu(cpu) msr_device_destroy(cpu); class_destroy(msr_class); - unregister_chrdev(MSR_MAJOR, "cpu/msr"); + __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); unregister_hotcpu_notifier(&msr_class_cpu_notifier); } diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 4006c522adc7..9d1d263f786f 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -212,7 +212,7 @@ static int __init olpc_init(void) unsigned char *romsig; /* The ioremap check is dangerous; limit what we run it on */ - if (!is_geode() || geode_has_vsa2()) + if (!is_geode() || cs5535_has_vsa2()) return 0; spin_lock_init(&ec_lock); @@ -244,7 +244,7 @@ static int __init olpc_init(void) (unsigned char *) &olpc_platform_info.ecver, 1); /* check to see if the VSA exists */ - if (geode_has_vsa2()) + if (cs5535_has_vsa2()) olpc_platform_info.flags |= OLPC_F_VSA; printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n", diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 3a7c5a44082e..676b8c77a976 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -8,9 +8,9 @@ #include <asm/paravirt.h> static inline void -default_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags) +default_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) { - __raw_spin_lock(lock); + arch_spin_lock(lock); } struct pv_lock_ops pv_lock_ops = { diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index c563e4c8ff39..2bbde6078143 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -31,7 +31,7 @@ #include <linux/string.h> #include <linux/crash_dump.h> #include <linux/dma-mapping.h> -#include <linux/bitops.h> +#include <linux/bitmap.h> #include <linux/pci_ids.h> #include <linux/pci.h> #include <linux/delay.h> @@ -212,7 +212,7 @@ static void iommu_range_reserve(struct iommu_table *tbl, spin_lock_irqsave(&tbl->it_lock, flags); - iommu_area_reserve(tbl->it_map, index, npages); + bitmap_set(tbl->it_map, index, npages); spin_unlock_irqrestore(&tbl->it_lock, flags); } @@ -303,7 +303,7 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, spin_lock_irqsave(&tbl->it_lock, flags); - iommu_area_free(tbl->it_map, entry, npages); + bitmap_clear(tbl->it_map, entry, npages); spin_unlock_irqrestore(&tbl->it_lock, flags); } diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index afcc58b69c7c..75e14e21f61a 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -124,8 +124,8 @@ void __init pci_iommu_alloc(void) /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); #endif - if (pci_swiotlb_init()) - return; + if (pci_swiotlb_detect()) + goto out; gart_iommu_hole_init(); @@ -135,6 +135,8 @@ void __init pci_iommu_alloc(void) /* needs to be called after gart_iommu_hole_init */ amd_iommu_detect(); +out: + pci_swiotlb_init(); } void *dma_generic_alloc_coherent(struct device *dev, size_t size, diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index e6a0d402f171..34de53b46f87 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -23,7 +23,7 @@ #include <linux/module.h> #include <linux/topology.h> #include <linux/interrupt.h> -#include <linux/bitops.h> +#include <linux/bitmap.h> #include <linux/kdebug.h> #include <linux/scatterlist.h> #include <linux/iommu-helper.h> @@ -126,7 +126,7 @@ static void free_iommu(unsigned long offset, int size) unsigned long flags; spin_lock_irqsave(&iommu_bitmap_lock, flags); - iommu_area_free(iommu_gart_bitmap, offset, size); + bitmap_clear(iommu_gart_bitmap, offset, size); if (offset >= next_bit) next_bit = offset + size; spin_unlock_irqrestore(&iommu_bitmap_lock, flags); @@ -710,7 +710,8 @@ static void gart_iommu_shutdown(void) struct pci_dev *dev; int i; - if (no_agp) + /* don't shutdown it if there is AGP installed */ + if (!no_agp) return; for (i = 0; i < num_k8_northbridges; i++) { @@ -791,7 +792,7 @@ int __init gart_iommu_init(void) * Out of IOMMU space handling. * Reserve some invalid pages at the beginning of the GART. */ - iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES); + bitmap_set(iommu_gart_bitmap, 0, EMERGENCY_PAGES); pr_info("PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", iommu_size >> 20); diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index e3c0a66b9e77..7d2829dde20e 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -43,12 +43,12 @@ static struct dma_map_ops swiotlb_dma_ops = { }; /* - * pci_swiotlb_init - initialize swiotlb if necessary + * pci_swiotlb_detect - set swiotlb to 1 if necessary * * This returns non-zero if we are forced to use swiotlb (by the boot * option). */ -int __init pci_swiotlb_init(void) +int __init pci_swiotlb_detect(void) { int use_swiotlb = swiotlb | swiotlb_force; @@ -60,10 +60,13 @@ int __init pci_swiotlb_init(void) if (swiotlb_force) swiotlb = 1; + return use_swiotlb; +} + +void __init pci_swiotlb_init(void) +{ if (swiotlb) { swiotlb_init(0); dma_ops = &swiotlb_dma_ops; } - - return use_swiotlb; } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 5e2ba634ea15..02d678065d7d 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -10,6 +10,8 @@ #include <linux/clockchips.h> #include <linux/random.h> #include <linux/user-return-notifier.h> +#include <linux/dmi.h> +#include <linux/utsname.h> #include <trace/events/power.h> #include <linux/hw_breakpoint.h> #include <asm/system.h> @@ -90,22 +92,36 @@ void exit_thread(void) } } +void show_regs(struct pt_regs *regs) +{ + show_registers(regs); + show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), + regs->bp); +} + +void show_regs_common(void) +{ + const char *board, *product; + + board = dmi_get_system_info(DMI_BOARD_NAME); + if (!board) + board = ""; + product = dmi_get_system_info(DMI_PRODUCT_NAME); + if (!product) + product = ""; + + printk(KERN_CONT "\n"); + printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n", + current->pid, current->comm, print_tainted(), + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version, board, product); +} + void flush_thread(void) { struct task_struct *tsk = current; -#ifdef CONFIG_X86_64 - if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) { - clear_tsk_thread_flag(tsk, TIF_ABI_PENDING); - if (test_tsk_thread_flag(tsk, TIF_IA32)) { - clear_tsk_thread_flag(tsk, TIF_IA32); - } else { - set_tsk_thread_flag(tsk, TIF_IA32); - current_thread_info()->status |= TS_COMPAT; - } - } -#endif - flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); /* @@ -234,6 +250,78 @@ int sys_vfork(struct pt_regs *regs) NULL, NULL); } +long +sys_clone(unsigned long clone_flags, unsigned long newsp, + void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) +{ + if (!newsp) + newsp = regs->sp; + return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); +} + +/* + * This gets run with %si containing the + * function to call, and %di containing + * the "args". + */ +extern void kernel_thread_helper(void); + +/* + * Create a kernel thread + */ +int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) +{ + struct pt_regs regs; + + memset(®s, 0, sizeof(regs)); + + regs.si = (unsigned long) fn; + regs.di = (unsigned long) arg; + +#ifdef CONFIG_X86_32 + regs.ds = __USER_DS; + regs.es = __USER_DS; + regs.fs = __KERNEL_PERCPU; + regs.gs = __KERNEL_STACK_CANARY; +#else + regs.ss = __KERNEL_DS; +#endif + + regs.orig_ax = -1; + regs.ip = (unsigned long) kernel_thread_helper; + regs.cs = __KERNEL_CS | get_kernel_rpl(); + regs.flags = X86_EFLAGS_IF | 0x2; + + /* Ok, create the new process.. */ + return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); +} +EXPORT_SYMBOL(kernel_thread); + +/* + * sys_execve() executes a new program. + */ +long sys_execve(char __user *name, char __user * __user *argv, + char __user * __user *envp, struct pt_regs *regs) +{ + long error; + char *filename; + + filename = getname(name); + error = PTR_ERR(filename); + if (IS_ERR(filename)) + return error; + error = do_execve(filename, argv, envp, regs); + +#ifdef CONFIG_X86_32 + if (error == 0) { + /* Make sure we don't return using sysenter.. */ + set_thread_flag(TIF_IRET); + } +#endif + + putname(filename); + return error; +} /* * Idle related variables and functions diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 075580b35682..f6c62667e30c 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -23,7 +23,6 @@ #include <linux/vmalloc.h> #include <linux/user.h> #include <linux/interrupt.h> -#include <linux/utsname.h> #include <linux/delay.h> #include <linux/reboot.h> #include <linux/init.h> @@ -35,7 +34,6 @@ #include <linux/tick.h> #include <linux/percpu.h> #include <linux/prctl.h> -#include <linux/dmi.h> #include <linux/ftrace.h> #include <linux/uaccess.h> #include <linux/io.h> @@ -128,7 +126,6 @@ void __show_regs(struct pt_regs *regs, int all) unsigned long d0, d1, d2, d3, d6, d7; unsigned long sp; unsigned short ss, gs; - const char *board; if (user_mode_vm(regs)) { sp = regs->sp; @@ -140,27 +137,18 @@ void __show_regs(struct pt_regs *regs, int all) savesegment(gs, gs); } - printk("\n"); + show_regs_common(); - board = dmi_get_system_info(DMI_PRODUCT_NAME); - if (!board) - board = ""; - printk("Pid: %d, comm: %s %s (%s %.*s) %s\n", - task_pid_nr(current), current->comm, - print_tainted(), init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version, board); - - printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", + printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", (u16)regs->cs, regs->ip, regs->flags, smp_processor_id()); print_symbol("EIP is at %s\n", regs->ip); - printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", + printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", regs->ax, regs->bx, regs->cx, regs->dx); - printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", + printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", regs->si, regs->di, regs->bp, sp); - printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", + printk(KERN_DEFAULT " DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss); if (!all) @@ -170,61 +158,22 @@ void __show_regs(struct pt_regs *regs, int all) cr2 = read_cr2(); cr3 = read_cr3(); cr4 = read_cr4_safe(); - printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", + printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); get_debugreg(d0, 0); get_debugreg(d1, 1); get_debugreg(d2, 2); get_debugreg(d3, 3); - printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", + printk(KERN_DEFAULT "DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", d0, d1, d2, d3); get_debugreg(d6, 6); get_debugreg(d7, 7); - printk("DR6: %08lx DR7: %08lx\n", + printk(KERN_DEFAULT "DR6: %08lx DR7: %08lx\n", d6, d7); } -void show_regs(struct pt_regs *regs) -{ - show_registers(regs); - show_trace(NULL, regs, ®s->sp, regs->bp); -} - -/* - * This gets run with %bx containing the - * function to call, and %dx containing - * the "args". - */ -extern void kernel_thread_helper(void); - -/* - * Create a kernel thread - */ -int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) -{ - struct pt_regs regs; - - memset(®s, 0, sizeof(regs)); - - regs.bx = (unsigned long) fn; - regs.dx = (unsigned long) arg; - - regs.ds = __USER_DS; - regs.es = __USER_DS; - regs.fs = __KERNEL_PERCPU; - regs.gs = __KERNEL_STACK_CANARY; - regs.orig_ax = -1; - regs.ip = (unsigned long) kernel_thread_helper; - regs.cs = __KERNEL_CS | get_kernel_rpl(); - regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; - - /* Ok, create the new process.. */ - return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); -} -EXPORT_SYMBOL(kernel_thread); - void release_thread(struct task_struct *dead_task) { BUG_ON(dead_task->mm); @@ -436,46 +385,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) return prev_p; } -int sys_clone(struct pt_regs *regs) -{ - unsigned long clone_flags; - unsigned long newsp; - int __user *parent_tidptr, *child_tidptr; - - clone_flags = regs->bx; - newsp = regs->cx; - parent_tidptr = (int __user *)regs->dx; - child_tidptr = (int __user *)regs->di; - if (!newsp) - newsp = regs->sp; - return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr); -} - -/* - * sys_execve() executes a new program. - */ -int sys_execve(struct pt_regs *regs) -{ - int error; - char *filename; - - filename = getname((char __user *) regs->bx); - error = PTR_ERR(filename); - if (IS_ERR(filename)) - goto out; - error = do_execve(filename, - (char __user * __user *) regs->cx, - (char __user * __user *) regs->dx, - regs); - if (error == 0) { - /* Make sure we don't return using sysenter.. */ - set_thread_flag(TIF_IRET); - } - putname(filename); -out: - return error; -} - #define top_esp (THREAD_SIZE - sizeof(unsigned long)) #define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c95c8f4e790a..dc9690b4c4cc 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -26,7 +26,6 @@ #include <linux/slab.h> #include <linux/user.h> #include <linux/interrupt.h> -#include <linux/utsname.h> #include <linux/delay.h> #include <linux/module.h> #include <linux/ptrace.h> @@ -38,7 +37,6 @@ #include <linux/uaccess.h> #include <linux/io.h> #include <linux/ftrace.h> -#include <linux/dmi.h> #include <asm/pgtable.h> #include <asm/system.h> @@ -59,8 +57,6 @@ asmlinkage extern void ret_from_fork(void); DEFINE_PER_CPU(unsigned long, old_rsp); static DEFINE_PER_CPU(unsigned char, is_idle); -unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; - static ATOMIC_NOTIFIER_HEAD(idle_notifier); void idle_notifier_register(struct notifier_block *n) @@ -163,31 +159,21 @@ void __show_regs(struct pt_regs *regs, int all) unsigned long d0, d1, d2, d3, d6, d7; unsigned int fsindex, gsindex; unsigned int ds, cs, es; - const char *board; - - printk("\n"); - print_modules(); - board = dmi_get_system_info(DMI_PRODUCT_NAME); - if (!board) - board = ""; - printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version, board); - printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); + + show_regs_common(); + printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); printk_address(regs->ip, 1); - printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, + printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp, regs->flags); - printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n", + printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n", regs->ax, regs->bx, regs->cx); - printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n", + printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n", regs->dx, regs->si, regs->di); - printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n", + printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n", regs->bp, regs->r8, regs->r9); - printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n", + printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n", regs->r10, regs->r11, regs->r12); - printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n", + printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n", regs->r13, regs->r14, regs->r15); asm("movl %%ds,%0" : "=r" (ds)); @@ -208,27 +194,21 @@ void __show_regs(struct pt_regs *regs, int all) cr3 = read_cr3(); cr4 = read_cr4(); - printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", + printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", fs, fsindex, gs, gsindex, shadowgs); - printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, + printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); - printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, + printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); get_debugreg(d0, 0); get_debugreg(d1, 1); get_debugreg(d2, 2); - printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); + printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); get_debugreg(d3, 3); get_debugreg(d6, 6); get_debugreg(d7, 7); - printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); -} - -void show_regs(struct pt_regs *regs) -{ - show_registers(regs); - show_trace(NULL, regs, (void *)(regs + 1), regs->bp); + printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); } void release_thread(struct task_struct *dead_task) @@ -285,8 +265,9 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, *childregs = *regs; childregs->ax = 0; - childregs->sp = sp; - if (sp == ~0UL) + if (user_mode(regs)) + childregs->sp = sp; + else childregs->sp = (unsigned long)childregs; p->thread.sp = (unsigned long) childregs; @@ -520,25 +501,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) return prev_p; } -/* - * sys_execve() executes a new program. - */ -asmlinkage -long sys_execve(char __user *name, char __user * __user *argv, - char __user * __user *envp, struct pt_regs *regs) -{ - long error; - char *filename; - - filename = getname(name); - error = PTR_ERR(filename); - if (IS_ERR(filename)) - return error; - error = do_execve(filename, argv, envp, regs); - putname(filename); - return error; -} - void set_personality_64bit(void) { /* inherit personality from parent */ @@ -553,13 +515,16 @@ void set_personality_64bit(void) current->personality &= ~READ_IMPLIES_EXEC; } -asmlinkage long -sys_clone(unsigned long clone_flags, unsigned long newsp, - void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) +void set_personality_ia32(void) { - if (!newsp) - newsp = regs->sp; - return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); + /* inherit personality from parent */ + + /* Make sure to be in 32bit mode */ + set_thread_flag(TIF_IA32); + current->personality |= force_personality32; + + /* Prepare the first "return" to user space */ + current_thread_info()->status |= TS_COMPAT; } unsigned long get_wchan(struct task_struct *p) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 04d182a7cfdb..2d96aab82a48 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -48,6 +48,7 @@ enum x86_regset { REGSET_FP, REGSET_XFP, REGSET_IOPERM64 = REGSET_XFP, + REGSET_XSTATE, REGSET_TLS, REGSET_IOPERM32, }; @@ -140,30 +141,6 @@ static const int arg_offs_table[] = { #endif }; -/** - * regs_get_argument_nth() - get Nth argument at function call - * @regs: pt_regs which contains registers at function entry. - * @n: argument number. - * - * regs_get_argument_nth() returns @n th argument of a function call. - * Since usually the kernel stack will be changed right after function entry, - * you must use this at function entry. If the @n th entry is NOT in the - * kernel stack or pt_regs, this returns 0. - */ -unsigned long regs_get_argument_nth(struct pt_regs *regs, unsigned int n) -{ - if (n < ARRAY_SIZE(arg_offs_table)) - return *(unsigned long *)((char *)regs + arg_offs_table[n]); - else { - /* - * The typical case: arg n is on the stack. - * (Note: stack[0] = return address, so skip it) - */ - n -= ARRAY_SIZE(arg_offs_table); - return regs_get_kernel_stack_nth(regs, 1 + n); - } -} - /* * does not yet catch signals sent when the child dies. * in exit.c or in signal.c. @@ -509,14 +486,14 @@ static int genregs_get(struct task_struct *target, { if (kbuf) { unsigned long *k = kbuf; - while (count > 0) { + while (count >= sizeof(*k)) { *k++ = getreg(target, pos); count -= sizeof(*k); pos += sizeof(*k); } } else { unsigned long __user *u = ubuf; - while (count > 0) { + while (count >= sizeof(*u)) { if (__put_user(getreg(target, pos), u++)) return -EFAULT; count -= sizeof(*u); @@ -535,14 +512,14 @@ static int genregs_set(struct task_struct *target, int ret = 0; if (kbuf) { const unsigned long *k = kbuf; - while (count > 0 && !ret) { + while (count >= sizeof(*k) && !ret) { ret = putreg(target, pos, *k++); count -= sizeof(*k); pos += sizeof(*k); } } else { const unsigned long __user *u = ubuf; - while (count > 0 && !ret) { + while (count >= sizeof(*u) && !ret) { unsigned long word; ret = __get_user(word, u++); if (ret) @@ -555,7 +532,9 @@ static int genregs_set(struct task_struct *target, return ret; } -static void ptrace_triggered(struct perf_event *bp, void *data) +static void ptrace_triggered(struct perf_event *bp, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) { int i; struct thread_struct *thread = &(current->thread); @@ -593,13 +572,13 @@ static unsigned long ptrace_get_dr7(struct perf_event *bp[]) return dr7; } -static struct perf_event * +static int ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, struct task_struct *tsk, int disabled) { int err; int gen_len, gen_type; - DEFINE_BREAKPOINT_ATTR(attr); + struct perf_event_attr attr; /* * We shoud have at least an inactive breakpoint at this @@ -607,18 +586,18 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, * written the address register first */ if (!bp) - return ERR_PTR(-EINVAL); + return -EINVAL; err = arch_bp_generic_fields(len, type, &gen_len, &gen_type); if (err) - return ERR_PTR(err); + return err; attr = bp->attr; attr.bp_len = gen_len; attr.bp_type = gen_type; attr.disabled = disabled; - return modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk); + return modify_user_hw_breakpoint(bp, &attr); } /* @@ -656,28 +635,17 @@ restore: if (!second_pass) continue; - thread->ptrace_bps[i] = NULL; - bp = ptrace_modify_breakpoint(bp, len, type, + rc = ptrace_modify_breakpoint(bp, len, type, tsk, 1); - if (IS_ERR(bp)) { - rc = PTR_ERR(bp); - thread->ptrace_bps[i] = NULL; + if (rc) break; - } - thread->ptrace_bps[i] = bp; } continue; } - bp = ptrace_modify_breakpoint(bp, len, type, tsk, 0); - - /* Incorrect bp, or we have a bug in bp API */ - if (IS_ERR(bp)) { - rc = PTR_ERR(bp); - thread->ptrace_bps[i] = NULL; + rc = ptrace_modify_breakpoint(bp, len, type, tsk, 0); + if (rc) break; - } - thread->ptrace_bps[i] = bp; } /* * Make a second pass to free the remaining unused breakpoints @@ -711,7 +679,7 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) } else if (n == 6) { val = thread->debugreg6; } else if (n == 7) { - val = ptrace_get_dr7(thread->ptrace_bps); + val = thread->ptrace_dr7; } return val; } @@ -721,9 +689,10 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, { struct perf_event *bp; struct thread_struct *t = &tsk->thread; - DEFINE_BREAKPOINT_ATTR(attr); + struct perf_event_attr attr; if (!t->ptrace_bps[nr]) { + hw_breakpoint_init(&attr); /* * Put stub len and type to register (reserve) an inactive but * correct bp @@ -734,26 +703,32 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, attr.disabled = 1; bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk); + + /* + * CHECKME: the previous code returned -EIO if the addr wasn't + * a valid task virtual addr. The new one will return -EINVAL in + * this case. + * -EINVAL may be what we want for in-kernel breakpoints users, + * but -EIO looks better for ptrace, since we refuse a register + * writing for the user. And anyway this is the previous + * behaviour. + */ + if (IS_ERR(bp)) + return PTR_ERR(bp); + + t->ptrace_bps[nr] = bp; } else { + int err; + bp = t->ptrace_bps[nr]; - t->ptrace_bps[nr] = NULL; attr = bp->attr; attr.bp_addr = addr; - bp = modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk); + err = modify_user_hw_breakpoint(bp, &attr); + if (err) + return err; } - /* - * CHECKME: the previous code returned -EIO if the addr wasn't a - * valid task virtual addr. The new one will return -EINVAL in this - * case. - * -EINVAL may be what we want for in-kernel breakpoints users, but - * -EIO looks better for ptrace, since we refuse a register writing - * for the user. And anyway this is the previous behaviour. - */ - if (IS_ERR(bp)) - return PTR_ERR(bp); - t->ptrace_bps[nr] = bp; return 0; } @@ -780,8 +755,11 @@ int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) return rc; } /* All that's left is DR7 */ - if (n == 7) + if (n == 7) { rc = ptrace_write_dr7(tsk, val); + if (!rc) + thread->ptrace_dr7 = val; + } ret_path: return rc; @@ -1460,14 +1438,14 @@ static int genregs32_get(struct task_struct *target, { if (kbuf) { compat_ulong_t *k = kbuf; - while (count > 0) { + while (count >= sizeof(*k)) { getreg32(target, pos, k++); count -= sizeof(*k); pos += sizeof(*k); } } else { compat_ulong_t __user *u = ubuf; - while (count > 0) { + while (count >= sizeof(*u)) { compat_ulong_t word; getreg32(target, pos, &word); if (__put_user(word, u++)) @@ -1488,14 +1466,14 @@ static int genregs32_set(struct task_struct *target, int ret = 0; if (kbuf) { const compat_ulong_t *k = kbuf; - while (count > 0 && !ret) { + while (count >= sizeof(*k) && !ret) { ret = putreg32(target, pos, *k++); count -= sizeof(*k); pos += sizeof(*k); } } else { const compat_ulong_t __user *u = ubuf; - while (count > 0 && !ret) { + while (count >= sizeof(*u) && !ret) { compat_ulong_t word; ret = __get_user(word, u++); if (ret) @@ -1586,7 +1564,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, #ifdef CONFIG_X86_64 -static const struct user_regset x86_64_regsets[] = { +static struct user_regset x86_64_regsets[] __read_mostly = { [REGSET_GENERAL] = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct) / sizeof(long), @@ -1599,6 +1577,12 @@ static const struct user_regset x86_64_regsets[] = { .size = sizeof(long), .align = sizeof(long), .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set }, + [REGSET_XSTATE] = { + .core_note_type = NT_X86_XSTATE, + .size = sizeof(u64), .align = sizeof(u64), + .active = xstateregs_active, .get = xstateregs_get, + .set = xstateregs_set + }, [REGSET_IOPERM64] = { .core_note_type = NT_386_IOPERM, .n = IO_BITMAP_LONGS, @@ -1624,7 +1608,7 @@ static const struct user_regset_view user_x86_64_view = { #endif /* CONFIG_X86_64 */ #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION -static const struct user_regset x86_32_regsets[] = { +static struct user_regset x86_32_regsets[] __read_mostly = { [REGSET_GENERAL] = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct32) / sizeof(u32), @@ -1643,6 +1627,12 @@ static const struct user_regset x86_32_regsets[] = { .size = sizeof(u32), .align = sizeof(u32), .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set }, + [REGSET_XSTATE] = { + .core_note_type = NT_X86_XSTATE, + .size = sizeof(u64), .align = sizeof(u64), + .active = xstateregs_active, .get = xstateregs_get, + .set = xstateregs_set + }, [REGSET_TLS] = { .core_note_type = NT_386_TLS, .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN, @@ -1665,6 +1655,23 @@ static const struct user_regset_view user_x86_32_view = { }; #endif +/* + * This represents bytes 464..511 in the memory layout exported through + * the REGSET_XSTATE interface. + */ +u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; + +void update_regset_xstate_info(unsigned int size, u64 xstate_mask) +{ +#ifdef CONFIG_X86_64 + x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64); +#endif +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + x86_32_regsets[REGSET_XSTATE].n = size / sizeof(u64); +#endif + xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask; +} + const struct user_regset_view *task_user_regset_view(struct task_struct *task) { #ifdef CONFIG_IA32_EMULATION @@ -1678,21 +1685,33 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task) #endif } -void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, - int error_code, int si_code) +static void fill_sigtrap_info(struct task_struct *tsk, + struct pt_regs *regs, + int error_code, int si_code, + struct siginfo *info) { - struct siginfo info; - tsk->thread.trap_no = 1; tsk->thread.error_code = error_code; - memset(&info, 0, sizeof(info)); - info.si_signo = SIGTRAP; - info.si_code = si_code; + memset(info, 0, sizeof(*info)); + info->si_signo = SIGTRAP; + info->si_code = si_code; + info->si_addr = user_mode_vm(regs) ? (void __user *)regs->ip : NULL; +} - /* User-mode ip? */ - info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL; +void user_single_step_siginfo(struct task_struct *tsk, + struct pt_regs *regs, + struct siginfo *info) +{ + fill_sigtrap_info(tsk, regs, 0, TRAP_BRKPT, info); +} +void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, + int error_code, int si_code) +{ + struct siginfo info; + + fill_sigtrap_info(tsk, regs, error_code, si_code, &info); /* Send us the fake SIGTRAP */ force_sig_info(SIGTRAP, &info, tsk); } @@ -1757,29 +1776,22 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs) asmregparm void syscall_trace_leave(struct pt_regs *regs) { + bool step; + if (unlikely(current->audit_context)) audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->ax); - if (test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall_exit(regs, 0); - /* * If TIF_SYSCALL_EMU is set, we only get here because of * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). * We already reported this syscall instruction in - * syscall_trace_enter(), so don't do any more now. - */ - if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) - return; - - /* - * If we are single-stepping, synthesize a trap to follow the - * system call instruction. + * syscall_trace_enter(). */ - if (test_thread_flag(TIF_SINGLESTEP) && - tracehook_consider_fatal_signal(current, SIGTRAP)) - send_sigtrap(current, regs, 0, TRAP_BRKPT); + step = unlikely(test_thread_flag(TIF_SINGLESTEP)) && + !test_thread_flag(TIF_SYSCALL_EMU); + if (step || test_thread_flag(TIF_SYSCALL_TRACE)) + tracehook_report_syscall_exit(regs, step); } diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 18093d7498f0..12e9feaa2f7a 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -491,6 +491,19 @@ void force_hpet_resume(void) break; } } + +/* + * HPET MSI on some boards (ATI SB700/SB800) has side effect on + * floppy DMA. Disable HPET MSI on such platforms. + */ +static void force_disable_hpet_msi(struct pci_dev *unused) +{ + hpet_msi_disable = 1; +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, + force_disable_hpet_msi); + #endif #if defined(CONFIG_PCI) && defined(CONFIG_NUMA) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 2b97fc5b124e..704bddcdf64d 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -203,6 +203,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "0T656F"), }, }, + { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G*/ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 760", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 760"), + DMI_MATCH(DMI_BOARD_NAME, "0G919G"), + }, + }, { /* Handle problems with rebooting on Dell 2400's */ .callback = set_bios_reboot, .ident = "Dell PowerEdge 2400", @@ -259,6 +268,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"), }, }, + { /* Handle problems with rebooting on ASUS P4S800 */ + .callback = set_bios_reboot, + .ident = "ASUS P4S800", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), + DMI_MATCH(DMI_BOARD_NAME, "P4S800"), + }, + }, { } }; diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c index 201eab63b05f..fda313ebbb03 100644 --- a/arch/x86/kernel/reboot_fixups_32.c +++ b/arch/x86/kernel/reboot_fixups_32.c @@ -12,7 +12,7 @@ #include <linux/interrupt.h> #include <asm/reboot_fixups.h> #include <asm/msr.h> -#include <asm/geode.h> +#include <linux/cs5535.h> static void cs5530a_warm_reset(struct pci_dev *dev) { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 946a311a25c9..cb42109a55b4 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -73,6 +73,7 @@ #include <asm/mtrr.h> #include <asm/apic.h> +#include <asm/trampoline.h> #include <asm/e820.h> #include <asm/mpspec.h> #include <asm/setup.h> @@ -120,7 +121,9 @@ unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; +#ifdef CONFIG_DMI RESERVE_BRK(dmi_alloc, 65536); +#endif unsigned int boot_cpu_id __read_mostly; @@ -641,23 +644,48 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"), }, }, - { /* - * AMI BIOS with low memory corruption was found on Intel DG45ID board. - * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will + * AMI BIOS with low memory corruption was found on Intel DG45ID and + * DG45FC boards. + * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will * match only DMI_BOARD_NAME and see if there is more bad products * with this vendor. */ + { .callback = dmi_low_memory_corruption, .ident = "AMI BIOS", .matches = { DMI_MATCH(DMI_BOARD_NAME, "DG45ID"), }, }, + { + .callback = dmi_low_memory_corruption, + .ident = "AMI BIOS", + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "DG45FC"), + }, + }, #endif {} }; +static void __init trim_bios_range(void) +{ + /* + * A special case is the first 4Kb of memory; + * This is a BIOS owned area, not kernel ram, but generally + * not listed as such in the E820 table. + */ + e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED); + /* + * special case: Some BIOSen report the PC BIOS + * area (640->1Mb) as ram even though it is not. + * take them out. + */ + e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); +} + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -821,7 +849,7 @@ void __init setup_arch(char **cmdline_p) insert_resource(&iomem_resource, &data_resource); insert_resource(&iomem_resource, &bss_resource); - + trim_bios_range(); #ifdef CONFIG_X86_32 if (ppro_with_ram_bug()) { e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM, @@ -875,6 +903,13 @@ void __init setup_arch(char **cmdline_p) reserve_brk(); + /* + * Find and reserve possible boot-time SMP configuration: + */ + find_smp_config(); + + reserve_trampoline_memory(); + #ifdef CONFIG_ACPI_SLEEP /* * Reserve low memory region for sleep support. @@ -921,11 +956,6 @@ void __init setup_arch(char **cmdline_p) early_acpi_boot_init(); - /* - * Find and reserve possible boot-time SMP configuration: - */ - find_smp_config(); - #ifdef CONFIG_ACPI_NUMA /* * Parse SRAT to discover nodes. diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index d559af913e1f..35abcb8b00e9 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> @@ -20,9 +22,9 @@ #include <asm/stackprotector.h> #ifdef CONFIG_DEBUG_PER_CPU_MAPS -# define DBG(x...) printk(KERN_DEBUG x) +# define DBG(fmt, ...) pr_dbg(fmt, ##__VA_ARGS__) #else -# define DBG(x...) +# define DBG(fmt, ...) do { if (0) pr_dbg(fmt, ##__VA_ARGS__); } while (0) #endif DEFINE_PER_CPU(int, cpu_number); @@ -116,8 +118,8 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, } else { ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node), size, align, goal); - pr_debug("per cpu data for cpu%d %lu bytes on node%d at " - "%016lx\n", cpu, size, node, __pa(ptr)); + pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n", + cpu, size, node, __pa(ptr)); } return ptr; #else @@ -198,8 +200,7 @@ void __init setup_per_cpu_areas(void) pcpu_cpu_distance, pcpu_fc_alloc, pcpu_fc_free); if (rc < 0) - pr_warning("PERCPU: %s allocator failed (%d), " - "falling back to page size\n", + pr_warning("%s allocator failed (%d), falling back to page size\n", pcpu_fc_names[pcpu_chosen_fc], rc); } if (rc < 0) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 74fe6d86dc5d..4fd173cd8e57 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -545,22 +545,12 @@ sys_sigaction(int sig, const struct old_sigaction __user *act, } #endif /* CONFIG_X86_32 */ -#ifdef CONFIG_X86_32 -int sys_sigaltstack(struct pt_regs *regs) -{ - const stack_t __user *uss = (const stack_t __user *)regs->bx; - stack_t __user *uoss = (stack_t __user *)regs->cx; - - return do_sigaltstack(uss, uoss, regs->sp); -} -#else /* !CONFIG_X86_32 */ -asmlinkage long +long sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, struct pt_regs *regs) { return do_sigaltstack(uss, uoss, regs->sp); } -#endif /* CONFIG_X86_32 */ /* * Do a signal return; undo the signal stack. diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 324f2a44c221..9b4401115ea1 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -320,6 +320,7 @@ notrace static void __cpuinit start_secondary(void *unused) unlock_vector_lock(); ipi_call_unlock(); per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; + x86_platform.nmi_init(); /* enable local interrupts */ local_irq_enable(); @@ -671,6 +672,26 @@ static void __cpuinit do_fork_idle(struct work_struct *work) complete(&c_idle->done); } +/* reduce the number of lines printed when booting a large cpu count system */ +static void __cpuinit announce_cpu(int cpu, int apicid) +{ + static int current_node = -1; + int node = cpu_to_node(cpu); + + if (system_state == SYSTEM_BOOTING) { + if (node != current_node) { + if (current_node > (-1)) + pr_cont(" Ok.\n"); + current_node = node; + pr_info("Booting Node %3d, Processors ", node); + } + pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " Ok.\n" : ""); + return; + } else + pr_info("Booting Node %d Processor %d APIC 0x%x\n", + node, cpu, apicid); +} + /* * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad * (ie clustered apic addressing mode), this is a LOGICAL apic ID. @@ -687,7 +708,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), }; - INIT_WORK(&c_idle.work, do_fork_idle); + INIT_WORK_ON_STACK(&c_idle.work, do_fork_idle); alternatives_smp_switch(1); @@ -713,6 +734,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) if (IS_ERR(c_idle.idle)) { printk("failed fork for CPU %d\n", cpu); + destroy_work_on_stack(&c_idle.work); return PTR_ERR(c_idle.idle); } @@ -736,9 +758,8 @@ do_rest: /* start_ip had better be page-aligned! */ start_ip = setup_trampoline(); - /* So we see what's up */ - printk(KERN_INFO "Booting processor %d APIC 0x%x ip 0x%lx\n", - cpu, apicid, start_ip); + /* So we see what's up */ + announce_cpu(cpu, apicid); /* * This grunge runs the startup process for @@ -787,21 +808,17 @@ do_rest: udelay(100); } - if (cpumask_test_cpu(cpu, cpu_callin_mask)) { - /* number CPUs logically, starting from 1 (BSP is 0) */ - pr_debug("OK.\n"); - printk(KERN_INFO "CPU%d: ", cpu); - print_cpu_info(&cpu_data(cpu)); - pr_debug("CPU has booted.\n"); - } else { + if (cpumask_test_cpu(cpu, cpu_callin_mask)) + pr_debug("CPU%d: has booted.\n", cpu); + else { boot_error = 1; if (*((volatile unsigned char *)trampoline_base) == 0xA5) /* trampoline started but...? */ - printk(KERN_ERR "Stuck ??\n"); + pr_err("CPU%d: Stuck ??\n", cpu); else /* trampoline code not run */ - printk(KERN_ERR "Not responding.\n"); + pr_err("CPU%d: Not responding.\n", cpu); if (apic->inquire_remote_apic) apic->inquire_remote_apic(apicid); } @@ -831,6 +848,7 @@ do_rest: smpboot_restore_warm_reset_vector(); } + destroy_work_on_stack(&c_idle.work); return boot_error; } @@ -1066,9 +1084,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) set_cpu_sibling_map(0); enable_IR_x2apic(); -#ifdef CONFIG_X86_64 default_setup_apic_routing(); -#endif if (smp_sanity_check(max_cpus) < 0) { printk(KERN_INFO "SMP disabled\n"); @@ -1291,14 +1307,16 @@ void native_cpu_die(unsigned int cpu) for (i = 0; i < 10; i++) { /* They ack this in play_dead by setting CPU_DEAD */ if (per_cpu(cpu_state, cpu) == CPU_DEAD) { - printk(KERN_INFO "CPU %d is now offline\n", cpu); + if (system_state == SYSTEM_RUNNING) + pr_info("CPU %u is now offline\n", cpu); + if (1 == num_online_cpus()) alternatives_smp_switch(0); return; } msleep(100); } - printk(KERN_ERR "CPU %u didn't die...\n", cpu); + pr_err("CPU %u didn't die...\n", cpu); } void play_dead_common(void) diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index c3eb207181fe..922eefbb3f6c 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -53,17 +53,19 @@ save_stack_address_nosched(void *data, unsigned long addr, int reliable) } static const struct stacktrace_ops save_stack_ops = { - .warning = save_stack_warning, - .warning_symbol = save_stack_warning_symbol, - .stack = save_stack_stack, - .address = save_stack_address, + .warning = save_stack_warning, + .warning_symbol = save_stack_warning_symbol, + .stack = save_stack_stack, + .address = save_stack_address, + .walk_stack = print_context_stack, }; static const struct stacktrace_ops save_stack_ops_nosched = { - .warning = save_stack_warning, - .warning_symbol = save_stack_warning_symbol, - .stack = save_stack_stack, - .address = save_stack_address_nosched, + .warning = save_stack_warning, + .warning_symbol = save_stack_warning_symbol, + .stack = save_stack_stack, + .address = save_stack_address_nosched, + .walk_stack = print_context_stack, }; /* diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c index 1884a8d12bfa..dee1ff7cba58 100644 --- a/arch/x86/kernel/sys_i386_32.c +++ b/arch/x86/kernel/sys_i386_32.c @@ -24,31 +24,6 @@ #include <asm/syscalls.h> -asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - int error = -EBADF; - struct file *file = NULL; - struct mm_struct *mm = current->mm; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(&mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(&mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - /* * Perform the select(nd, in, out, ex, tv) and mmap() system * calls. Linux/i386 didn't use to be able to handle more than @@ -77,7 +52,7 @@ asmlinkage int old_mmap(struct mmap_arg_struct __user *arg) if (a.offset & ~PAGE_MASK) goto out; - err = sys_mmap2(a.addr, a.len, a.prot, a.flags, + err = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); out: return err; diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 45e00eb09c3a..8aa2057efd12 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -23,26 +23,11 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, unsigned long, fd, unsigned long, off) { long error; - struct file *file; - error = -EINVAL; if (off & ~PAGE_MASK) goto out; - error = -EBADF; - file = NULL; - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); + error = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); out: return error; } diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 70c2125d55b9..15228b5d3eb7 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -191,7 +191,7 @@ ENTRY(sys_call_table) .long sys_ni_syscall /* reserved for streams2 */ .long ptregs_vfork /* 190 */ .long sys_getrlimit - .long sys_mmap2 + .long sys_mmap_pgoff .long sys_truncate64 .long sys_ftruncate64 .long sys_stat64 /* 195 */ diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index cd022121cab6..c652ef62742d 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -12,21 +12,19 @@ #endif /* ready for x86_64 and x86 */ -unsigned char *__trampinitdata trampoline_base = __va(TRAMPOLINE_BASE); +unsigned char *__trampinitdata trampoline_base; void __init reserve_trampoline_memory(void) { -#ifdef CONFIG_X86_32 - /* - * But first pinch a few for the stack/trampoline stuff - * FIXME: Don't need the extra page at 4K, but need to fix - * trampoline before removing it. (see the GDT stuff) - */ - reserve_early(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE"); -#endif + unsigned long mem; + /* Has to be in very low memory so we can execute real-mode AP code. */ - reserve_early(TRAMPOLINE_BASE, TRAMPOLINE_BASE + TRAMPOLINE_SIZE, - "TRAMPOLINE"); + mem = find_e820_area(0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE); + if (mem == -1L) + panic("Cannot allocate trampoline\n"); + + trampoline_base = __va(mem); + reserve_early(mem, mem + TRAMPOLINE_SIZE, "TRAMPOLINE"); } /* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 33399176512a..1168e4454188 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -534,6 +534,9 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) get_debugreg(dr6, 6); + /* Filter out all the reserved bits which are preset to 1 */ + dr6 &= ~DR6_RESERVED; + /* Catch kmemcheck conditions first of all! */ if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) return; diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index cd982f48e23e..23066ecf12fa 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -763,6 +763,7 @@ void mark_tsc_unstable(char *reason) { if (!tsc_unstable) { tsc_unstable = 1; + sched_clock_stable = 0; printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); /* Change only the rating, when not registered */ if (clocksource_tsc.mult) @@ -805,7 +806,7 @@ static void __init check_system_tsc_reliable(void) unsigned long res_low, res_high; rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); - /* Geode_LX - the OLPC CPU has a possibly a very reliable TSC */ + /* Geode_LX - the OLPC CPU has a very reliable TSC */ if (res_low & RTSC_SUSP) tsc_clocksource_reliable = 1; #endif diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index eed156851f5d..0aa5fed8b9e6 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -33,7 +33,7 @@ static __cpuinitdata atomic_t stop_count; * we want to have the fastest, inlined, non-debug version * of a critical section, to be able to prove TSC time-warps: */ -static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; +static __cpuinitdata arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED; static __cpuinitdata cycles_t last_tsc; static __cpuinitdata cycles_t max_warp; @@ -62,13 +62,13 @@ static __cpuinit void check_tsc_warp(void) * previous TSC that was measured (possibly on * another CPU) and update the previous TSC timestamp. */ - __raw_spin_lock(&sync_lock); + arch_spin_lock(&sync_lock); prev = last_tsc; rdtsc_barrier(); now = get_cycles(); rdtsc_barrier(); last_tsc = now; - __raw_spin_unlock(&sync_lock); + arch_spin_unlock(&sync_lock); /* * Be nice every now and then (and also check whether @@ -87,10 +87,10 @@ static __cpuinit void check_tsc_warp(void) * we saw a time-warp of the TSC going backwards: */ if (unlikely(prev > now)) { - __raw_spin_lock(&sync_lock); + arch_spin_lock(&sync_lock); max_warp = max(max_warp, prev - now); nr_warps++; - __raw_spin_unlock(&sync_lock); + arch_spin_unlock(&sync_lock); } } WARN(!(now-start), diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c index 61d805df4c91..ece73d8e3240 100644 --- a/arch/x86/kernel/uv_irq.c +++ b/arch/x86/kernel/uv_irq.c @@ -215,8 +215,7 @@ static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask) unsigned long mmr_offset; unsigned mmr_pnode; - dest = set_desc_affinity(desc, mask); - if (dest == BAD_APICID) + if (set_desc_affinity(desc, mask, &dest)) return -1; mmr_value = 0; diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c index 36afb98675a4..309c70fb7759 100644 --- a/arch/x86/kernel/uv_sysfs.c +++ b/arch/x86/kernel/uv_sysfs.c @@ -54,19 +54,19 @@ static int __init sgi_uv_sysfs_init(void) if (!sgi_uv_kobj) sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj); if (!sgi_uv_kobj) { - printk(KERN_WARNING "kobject_create_and_add sgi_uv failed \n"); + printk(KERN_WARNING "kobject_create_and_add sgi_uv failed\n"); return -EINVAL; } ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr); if (ret) { - printk(KERN_WARNING "sysfs_create_file partition_id failed \n"); + printk(KERN_WARNING "sysfs_create_file partition_id failed\n"); return ret; } ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr); if (ret) { - printk(KERN_WARNING "sysfs_create_file coherence_id failed \n"); + printk(KERN_WARNING "sysfs_create_file coherence_id failed\n"); return ret; } diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c index 3c84aa001c11..2b75ef638dbc 100644 --- a/arch/x86/kernel/uv_time.c +++ b/arch/x86/kernel/uv_time.c @@ -282,10 +282,21 @@ static int uv_rtc_unset_timer(int cpu, int force) /* * Read the RTC. + * + * Starting with HUB rev 2.0, the UV RTC register is replicated across all + * cachelines of it's own page. This allows faster simultaneous reads + * from a given socket. */ static cycle_t uv_read_rtc(struct clocksource *cs) { - return (cycle_t)uv_read_local_mmr(UVH_RTC); + unsigned long offset; + + if (uv_get_min_hub_revision_id() == 1) + offset = 0; + else + offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE; + + return (cycle_t)uv_read_local_mmr(UVH_RTC | offset); } /* diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 9c4e62539058..5ffb5622f793 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -197,9 +197,8 @@ out: static int do_vm86_irq_handling(int subfunction, int irqnumber); static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk); -int sys_vm86old(struct pt_regs *regs) +int sys_vm86old(struct vm86_struct __user *v86, struct pt_regs *regs) { - struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs->bx; struct kernel_vm86_struct info; /* declare this _on top_, * this avoids wasting of stack space. * This remains on the stack until we @@ -227,7 +226,7 @@ out: } -int sys_vm86(struct pt_regs *regs) +int sys_vm86(unsigned long cmd, unsigned long arg, struct pt_regs *regs) { struct kernel_vm86_struct info; /* declare this _on top_, * this avoids wasting of stack space. @@ -239,12 +238,12 @@ int sys_vm86(struct pt_regs *regs) struct vm86plus_struct __user *v86; tsk = current; - switch (regs->bx) { + switch (cmd) { case VM86_REQUEST_IRQ: case VM86_FREE_IRQ: case VM86_GET_IRQ_BITS: case VM86_GET_AND_RESET_IRQ: - ret = do_vm86_irq_handling(regs->bx, (int)regs->cx); + ret = do_vm86_irq_handling(cmd, (int)arg); goto out; case VM86_PLUS_INSTALL_CHECK: /* @@ -261,7 +260,7 @@ int sys_vm86(struct pt_regs *regs) ret = -EPERM; if (tsk->thread.saved_sp0) goto out; - v86 = (struct vm86plus_struct __user *)regs->cx; + v86 = (struct vm86plus_struct __user *)arg; tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, offsetof(struct kernel_vm86_struct, regs32) - sizeof(info.regs)); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index f3f2104408d9..f92a0da608cb 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -319,9 +319,7 @@ SECTIONS __brk_limit = .; } - .end : AT(ADDR(.end) - LOAD_OFFSET) { - _end = .; - } + _end = .; STABS_DEBUG DWARF_DEBUG diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index a1029769b6f2..693920b22496 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -17,8 +17,6 @@ EXPORT_SYMBOL(mcount); #endif -EXPORT_SYMBOL(kernel_thread); - EXPORT_SYMBOL(__get_user_1); EXPORT_SYMBOL(__get_user_2); EXPORT_SYMBOL(__get_user_4); @@ -28,7 +26,8 @@ EXPORT_SYMBOL(__put_user_2); EXPORT_SYMBOL(__put_user_4); EXPORT_SYMBOL(__put_user_8); -EXPORT_SYMBOL(copy_user_generic); +EXPORT_SYMBOL(copy_user_generic_string); +EXPORT_SYMBOL(copy_user_generic_unrolled); EXPORT_SYMBOL(__copy_user_nocache); EXPORT_SYMBOL(_copy_from_user); EXPORT_SYMBOL(_copy_to_user); @@ -56,4 +55,6 @@ EXPORT_SYMBOL(__memcpy); EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(init_level4_pgt); -EXPORT_SYMBOL(load_gs_index); +#ifndef CONFIG_PARAVIRT +EXPORT_SYMBOL(native_load_gs_index); +#endif diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index ccd179dec36e..ee5746c94628 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -76,10 +76,13 @@ struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { .setup_percpu_clockev = setup_secondary_APIC_clock, }; +static void default_nmi_init(void) { }; + struct x86_platform_ops x86_platform = { .calibrate_tsc = native_calibrate_tsc, .get_wallclock = mach_get_cmos_time, .set_wallclock = mach_set_rtc_mmss, .iommu_shutdown = iommu_shutdown_noop, .is_untracked_pat_range = is_ISA_range, + .nmi_init = default_nmi_init }; diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index c5ee17e8c6d9..782c3a362ec6 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -337,6 +337,7 @@ void __ref xsave_cntxt_init(void) cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); xstate_size = ebx; + update_regset_xstate_info(xstate_size, pcntxt_mask); prepare_fx_sw_frame(); setup_xstate_init(); diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index fab7440c9bb2..15578f180e59 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -29,6 +29,8 @@ * Based on QEMU and Xen. */ +#define pr_fmt(fmt) "pit: " fmt + #include <linux/kvm_host.h> #include "irq.h" @@ -262,7 +264,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) static void destroy_pit_timer(struct kvm_timer *pt) { - pr_debug("pit: execute del timer!\n"); + pr_debug("execute del timer!\n"); hrtimer_cancel(&pt->timer); } @@ -284,7 +286,7 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); - pr_debug("pit: create pit timer, interval is %llu nsec\n", interval); + pr_debug("create pit timer, interval is %llu nsec\n", interval); /* TODO The new value only affected after the retriggered */ hrtimer_cancel(&pt->timer); @@ -309,7 +311,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) WARN_ON(!mutex_is_locked(&ps->lock)); - pr_debug("pit: load_count val is %d, channel is %d\n", val, channel); + pr_debug("load_count val is %d, channel is %d\n", val, channel); /* * The largest possible initial count is 0; this is equivalent @@ -395,8 +397,8 @@ static int pit_ioport_write(struct kvm_io_device *this, mutex_lock(&pit_state->lock); if (val != 0) - pr_debug("pit: write addr is 0x%x, len is %d, val is 0x%x\n", - (unsigned int)addr, len, val); + pr_debug("write addr is 0x%x, len is %d, val is 0x%x\n", + (unsigned int)addr, len, val); if (addr == 3) { channel = val >> 6; @@ -465,6 +467,9 @@ static int pit_ioport_read(struct kvm_io_device *this, return -EOPNOTSUPP; addr &= KVM_PIT_CHANNEL_MASK; + if (addr == 3) + return 0; + s = &pit_state->channels[addr]; mutex_lock(&pit_state->lock); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index cd60c0bd1b32..ba8c045da782 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -373,6 +373,12 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (unlikely(!apic_enabled(apic))) break; + if (trig_mode) { + apic_debug("level trig mode for vector %d", vector); + apic_set_vector(vector, apic->regs + APIC_TMR); + } else + apic_clear_vector(vector, apic->regs + APIC_TMR); + result = !apic_test_and_set_irr(vector, apic); trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector, !result); @@ -383,11 +389,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, break; } - if (trig_mode) { - apic_debug("level trig mode for vector %d", vector); - apic_set_vector(vector, apic->regs + APIC_TMR); - } else - apic_clear_vector(vector, apic->regs + APIC_TMR); kvm_vcpu_kick(vcpu); break; @@ -1150,6 +1151,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) hrtimer_cancel(&apic->lapic_timer.timer); update_divide_count(apic); start_apic_timer(apic); + apic->irr_pending = true; } void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 4c3e5b2314cb..89a49fb46a27 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -477,7 +477,7 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn) addr = gfn_to_hva(kvm, gfn); if (kvm_is_error_hva(addr)) - return page_size; + return PT_PAGE_TABLE_LEVEL; down_read(¤t->mm->mmap_sem); vma = find_vma(current->mm, addr); @@ -515,11 +515,9 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) if (host_level == PT_PAGE_TABLE_LEVEL) return host_level; - for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) { - + for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) break; - } return level - 1; } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a6017132fba8..ede2131a9225 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -150,7 +150,9 @@ walk: walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; - kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)); + if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) + goto not_present; + trace_kvm_mmu_paging_element(pte, walker->level); if (!is_present_gpte(pte)) @@ -455,8 +457,6 @@ out_unlock: static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) { struct kvm_shadow_walk_iterator iterator; - pt_element_t gpte; - gpa_t pte_gpa = -1; int level; u64 *sptep; int need_flush = 0; @@ -470,10 +470,6 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) if (level == PT_PAGE_TABLE_LEVEL || ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) || ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); - - pte_gpa = (sp->gfn << PAGE_SHIFT); - pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); if (is_shadow_present_pte(*sptep)) { rmap_remove(vcpu->kvm, sptep); @@ -492,18 +488,6 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) if (need_flush) kvm_flush_remote_tlbs(vcpu->kvm); spin_unlock(&vcpu->kvm->mmu_lock); - - if (pte_gpa == -1) - return; - if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, - sizeof(pt_element_t))) - return; - if (is_present_gpte(gpte) && (gpte & PT_ACCESSED_MASK)) { - if (mmu_topup_memory_caches(vcpu)) - return; - kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte, - sizeof(pt_element_t), 0); - } } static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 3de0b37ec038..1d9b33843c80 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -316,7 +316,7 @@ static void svm_hardware_disable(void *garbage) static int svm_hardware_enable(void *garbage) { - struct svm_cpu_data *svm_data; + struct svm_cpu_data *sd; uint64_t efer; struct descriptor_table gdt_descr; struct desc_struct *gdt; @@ -331,63 +331,61 @@ static int svm_hardware_enable(void *garbage) me); return -EINVAL; } - svm_data = per_cpu(svm_data, me); + sd = per_cpu(svm_data, me); - if (!svm_data) { + if (!sd) { printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n", me); return -EINVAL; } - svm_data->asid_generation = 1; - svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; - svm_data->next_asid = svm_data->max_asid + 1; + sd->asid_generation = 1; + sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; + sd->next_asid = sd->max_asid + 1; kvm_get_gdt(&gdt_descr); gdt = (struct desc_struct *)gdt_descr.base; - svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); + sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); wrmsrl(MSR_EFER, efer | EFER_SVME); - wrmsrl(MSR_VM_HSAVE_PA, - page_to_pfn(svm_data->save_area) << PAGE_SHIFT); + wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT); return 0; } static void svm_cpu_uninit(int cpu) { - struct svm_cpu_data *svm_data - = per_cpu(svm_data, raw_smp_processor_id()); + struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id()); - if (!svm_data) + if (!sd) return; per_cpu(svm_data, raw_smp_processor_id()) = NULL; - __free_page(svm_data->save_area); - kfree(svm_data); + __free_page(sd->save_area); + kfree(sd); } static int svm_cpu_init(int cpu) { - struct svm_cpu_data *svm_data; + struct svm_cpu_data *sd; int r; - svm_data = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL); - if (!svm_data) + sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL); + if (!sd) return -ENOMEM; - svm_data->cpu = cpu; - svm_data->save_area = alloc_page(GFP_KERNEL); + sd->cpu = cpu; + sd->save_area = alloc_page(GFP_KERNEL); r = -ENOMEM; - if (!svm_data->save_area) + if (!sd->save_area) goto err_1; - per_cpu(svm_data, cpu) = svm_data; + per_cpu(svm_data, cpu) = sd; return 0; err_1: - kfree(svm_data); + kfree(sd); return r; } @@ -1092,16 +1090,16 @@ static void save_host_msrs(struct kvm_vcpu *vcpu) #endif } -static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data) +static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) { - if (svm_data->next_asid > svm_data->max_asid) { - ++svm_data->asid_generation; - svm_data->next_asid = 1; + if (sd->next_asid > sd->max_asid) { + ++sd->asid_generation; + sd->next_asid = 1; svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; } - svm->asid_generation = svm_data->asid_generation; - svm->vmcb->control.asid = svm_data->next_asid++; + svm->asid_generation = sd->asid_generation; + svm->vmcb->control.asid = sd->next_asid++; } static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) @@ -2429,8 +2427,8 @@ static void reload_tss(struct kvm_vcpu *vcpu) { int cpu = raw_smp_processor_id(); - struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); - svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */ + struct svm_cpu_data *sd = per_cpu(svm_data, cpu); + sd->tss_desc->type = 9; /* available 32/64-bit TSS */ load_TR_desc(); } @@ -2438,12 +2436,12 @@ static void pre_svm_run(struct vcpu_svm *svm) { int cpu = raw_smp_processor_id(); - struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); + struct svm_cpu_data *sd = per_cpu(svm_data, cpu); svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; /* FIXME: handle wraparound of asid_generation */ - if (svm->asid_generation != svm_data->asid_generation) - new_asid(svm, svm_data); + if (svm->asid_generation != sd->asid_generation) + new_asid(svm, sd); } static void svm_inject_nmi(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9d068966fb2a..a1e1bc9d412d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -670,7 +670,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { static int version; struct pvclock_wall_clock wc; - struct timespec now, sys, boot; + struct timespec boot; if (!wall_clock) return; @@ -685,9 +685,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) * wall clock specified here. guest system time equals host * system time for us, thus we must fill in host boot time here. */ - now = current_kernel_time(); - ktime_get_ts(&sys); - boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys)); + getboottime(&boot); wc.sec = boot.tv_sec; wc.nsec = boot.tv_nsec; @@ -762,6 +760,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) local_irq_save(flags); kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); ktime_get_ts(&ts); + monotonic_to_bootbased(&ts); local_irq_restore(flags); /* With all the info we got, fill in the values */ @@ -1913,7 +1912,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, events->sipi_vector = vcpu->arch.sipi_vector; - events->flags = 0; + events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING + | KVM_VCPUEVENT_VALID_SIPI_VECTOR); vcpu_put(vcpu); } @@ -1921,7 +1921,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { - if (events->flags) + if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING + | KVM_VCPUEVENT_VALID_SIPI_VECTOR)) return -EINVAL; vcpu_load(vcpu); @@ -1938,10 +1939,12 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, kvm_pic_clear_isr_ack(vcpu->kvm); vcpu->arch.nmi_injected = events->nmi.injected; - vcpu->arch.nmi_pending = events->nmi.pending; + if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) + vcpu->arch.nmi_pending = events->nmi.pending; kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked); - vcpu->arch.sipi_vector = events->sipi_vector; + if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) + vcpu->arch.sipi_vector = events->sipi_vector; vcpu_put(vcpu); @@ -5068,12 +5071,13 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) GFP_KERNEL); if (!vcpu->arch.mce_banks) { r = -ENOMEM; - goto fail_mmu_destroy; + goto fail_free_lapic; } vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; return 0; - +fail_free_lapic: + kvm_free_lapic(vcpu); fail_mmu_destroy: kvm_mmu_destroy(vcpu); fail_free_pio_data: @@ -5084,6 +5088,7 @@ fail: void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { + kfree(vcpu->arch.mce_banks); kvm_free_lapic(vcpu); down_read(&vcpu->kvm->slots_lock); kvm_mmu_destroy(vcpu); diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index a2d6472895fb..419386c24b82 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -5,7 +5,7 @@ inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt quiet_cmd_inat_tables = GEN $@ - cmd_inat_tables = $(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@ + cmd_inat_tables = $(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@ || rm -f $@ $(obj)/inat-tables.c: $(inat_tables_script) $(inat_tables_maps) $(call cmd,inat_tables) @@ -14,15 +14,15 @@ $(obj)/inat.o: $(obj)/inat-tables.c clean-files := inat-tables.c -obj-$(CONFIG_SMP) := msr.o +obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o lib-y := delay.o lib-y += thunk_$(BITS).o lib-y += usercopy_$(BITS).o getuser.o putuser.o lib-y += memcpy_$(BITS).o -lib-y += insn.o inat.o +lib-$(CONFIG_KPROBES) += insn.o inat.o -obj-y += msr-reg.o msr-reg-export.o +obj-y += msr.o msr-reg.o msr-reg-export.o ifeq ($(CONFIG_X86_32),y) obj-y += atomic64_32.o @@ -34,9 +34,10 @@ ifneq ($(CONFIG_X86_CMPXCHG64),y) endif lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o else - obj-y += io_64.o iomap_copy_64.o + obj-y += iomap_copy_64.o lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o lib-y += thunk_64.o clear_page_64.o copy_page_64.o lib-y += memmove_64.o memset_64.o lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o + lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o endif diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c new file mode 100644 index 000000000000..a3c668875038 --- /dev/null +++ b/arch/x86/lib/cache-smp.c @@ -0,0 +1,19 @@ +#include <linux/smp.h> +#include <linux/module.h> + +static void __wbinvd(void *dummy) +{ + wbinvd(); +} + +void wbinvd_on_cpu(int cpu) +{ + smp_call_function_single(cpu, __wbinvd, NULL, 1); +} +EXPORT_SYMBOL(wbinvd_on_cpu); + +int wbinvd_on_all_cpus(void) +{ + return on_each_cpu(__wbinvd, NULL, 1); +} +EXPORT_SYMBOL(wbinvd_on_all_cpus); diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index cf889d4e076a..71100c98e337 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -90,12 +90,6 @@ ENTRY(_copy_from_user) CFI_ENDPROC ENDPROC(_copy_from_user) -ENTRY(copy_user_generic) - CFI_STARTPROC - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string - CFI_ENDPROC -ENDPROC(copy_user_generic) - .section .fixup,"ax" /* must zero dest */ ENTRY(bad_from_user) diff --git a/arch/x86/lib/io_64.c b/arch/x86/lib/io_64.c deleted file mode 100644 index 3f1eb59b5f08..000000000000 --- a/arch/x86/lib/io_64.c +++ /dev/null @@ -1,25 +0,0 @@ -#include <linux/string.h> -#include <linux/module.h> -#include <asm/io.h> - -void __memcpy_toio(unsigned long dst, const void *src, unsigned len) -{ - __inline_memcpy((void *)dst, src, len); -} -EXPORT_SYMBOL(__memcpy_toio); - -void __memcpy_fromio(void *dst, unsigned long src, unsigned len) -{ - __inline_memcpy(dst, (const void *)src, len); -} -EXPORT_SYMBOL(__memcpy_fromio); - -void memset_io(volatile void __iomem *a, int b, size_t c) -{ - /* - * TODO: memset can mangle the IO patterns quite a bit. - * perhaps it would be better to use a dumb one: - */ - memset((void *)a, b, c); -} -EXPORT_SYMBOL(memset_io); diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index ad5441ed1b57..f82e884928af 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -20,12 +20,11 @@ /* * memcpy_c() - fast string ops (REP MOVSQ) based variant. * - * Calls to this get patched into the kernel image via the + * This gets patched over the unrolled variant (below) via the * alternative instructions framework: */ - ALIGN -memcpy_c: - CFI_STARTPROC + .section .altinstr_replacement, "ax", @progbits +.Lmemcpy_c: movq %rdi, %rax movl %edx, %ecx @@ -35,8 +34,8 @@ memcpy_c: movl %edx, %ecx rep movsb ret - CFI_ENDPROC -ENDPROC(memcpy_c) +.Lmemcpy_e: + .previous ENTRY(__memcpy) ENTRY(memcpy) @@ -128,16 +127,10 @@ ENDPROC(__memcpy) * It is also a lot simpler. Use this when possible: */ - .section .altinstr_replacement, "ax" -1: .byte 0xeb /* jmp <disp8> */ - .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */ -2: - .previous - .section .altinstructions, "a" .align 8 .quad memcpy - .quad 1b + .quad .Lmemcpy_c .byte X86_FEATURE_REP_GOOD /* @@ -145,6 +138,6 @@ ENDPROC(__memcpy) * so it is silly to overwrite itself with nops - reboot is the * only outcome... */ - .byte 2b - 1b - .byte 2b - 1b + .byte .Lmemcpy_e - .Lmemcpy_c + .byte .Lmemcpy_e - .Lmemcpy_c .previous diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 2c5948116bd2..e88d3b81644a 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -12,9 +12,8 @@ * * rax original destination */ - ALIGN -memset_c: - CFI_STARTPROC + .section .altinstr_replacement, "ax", @progbits +.Lmemset_c: movq %rdi,%r9 movl %edx,%r8d andl $7,%r8d @@ -29,8 +28,8 @@ memset_c: rep stosb movq %r9,%rax ret - CFI_ENDPROC -ENDPROC(memset_c) +.Lmemset_e: + .previous ENTRY(memset) ENTRY(__memset) @@ -118,16 +117,11 @@ ENDPROC(__memset) #include <asm/cpufeature.h> - .section .altinstr_replacement,"ax" -1: .byte 0xeb /* jmp <disp8> */ - .byte (memset_c - memset) - (2f - 1b) /* offset */ -2: - .previous .section .altinstructions,"a" .align 8 .quad memset - .quad 1b + .quad .Lmemset_c .byte X86_FEATURE_REP_GOOD .byte .Lfinal - memset - .byte 2b - 1b + .byte .Lmemset_e - .Lmemset_c .previous diff --git a/arch/x86/lib/msr-smp.c b/arch/x86/lib/msr-smp.c new file mode 100644 index 000000000000..a6b1b86d2253 --- /dev/null +++ b/arch/x86/lib/msr-smp.c @@ -0,0 +1,204 @@ +#include <linux/module.h> +#include <linux/preempt.h> +#include <linux/smp.h> +#include <asm/msr.h> + +static void __rdmsr_on_cpu(void *info) +{ + struct msr_info *rv = info; + struct msr *reg; + int this_cpu = raw_smp_processor_id(); + + if (rv->msrs) + reg = per_cpu_ptr(rv->msrs, this_cpu); + else + reg = &rv->reg; + + rdmsr(rv->msr_no, reg->l, reg->h); +} + +static void __wrmsr_on_cpu(void *info) +{ + struct msr_info *rv = info; + struct msr *reg; + int this_cpu = raw_smp_processor_id(); + + if (rv->msrs) + reg = per_cpu_ptr(rv->msrs, this_cpu); + else + reg = &rv->reg; + + wrmsr(rv->msr_no, reg->l, reg->h); +} + +int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) +{ + int err; + struct msr_info rv; + + memset(&rv, 0, sizeof(rv)); + + rv.msr_no = msr_no; + err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1); + *l = rv.reg.l; + *h = rv.reg.h; + + return err; +} +EXPORT_SYMBOL(rdmsr_on_cpu); + +int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) +{ + int err; + struct msr_info rv; + + memset(&rv, 0, sizeof(rv)); + + rv.msr_no = msr_no; + rv.reg.l = l; + rv.reg.h = h; + err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1); + + return err; +} +EXPORT_SYMBOL(wrmsr_on_cpu); + +static void __rwmsr_on_cpus(const struct cpumask *mask, u32 msr_no, + struct msr *msrs, + void (*msr_func) (void *info)) +{ + struct msr_info rv; + int this_cpu; + + memset(&rv, 0, sizeof(rv)); + + rv.msrs = msrs; + rv.msr_no = msr_no; + + this_cpu = get_cpu(); + + if (cpumask_test_cpu(this_cpu, mask)) + msr_func(&rv); + + smp_call_function_many(mask, msr_func, &rv, 1); + put_cpu(); +} + +/* rdmsr on a bunch of CPUs + * + * @mask: which CPUs + * @msr_no: which MSR + * @msrs: array of MSR values + * + */ +void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs) +{ + __rwmsr_on_cpus(mask, msr_no, msrs, __rdmsr_on_cpu); +} +EXPORT_SYMBOL(rdmsr_on_cpus); + +/* + * wrmsr on a bunch of CPUs + * + * @mask: which CPUs + * @msr_no: which MSR + * @msrs: array of MSR values + * + */ +void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs) +{ + __rwmsr_on_cpus(mask, msr_no, msrs, __wrmsr_on_cpu); +} +EXPORT_SYMBOL(wrmsr_on_cpus); + +/* These "safe" variants are slower and should be used when the target MSR + may not actually exist. */ +static void __rdmsr_safe_on_cpu(void *info) +{ + struct msr_info *rv = info; + + rv->err = rdmsr_safe(rv->msr_no, &rv->reg.l, &rv->reg.h); +} + +static void __wrmsr_safe_on_cpu(void *info) +{ + struct msr_info *rv = info; + + rv->err = wrmsr_safe(rv->msr_no, rv->reg.l, rv->reg.h); +} + +int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) +{ + int err; + struct msr_info rv; + + memset(&rv, 0, sizeof(rv)); + + rv.msr_no = msr_no; + err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1); + *l = rv.reg.l; + *h = rv.reg.h; + + return err ? err : rv.err; +} +EXPORT_SYMBOL(rdmsr_safe_on_cpu); + +int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) +{ + int err; + struct msr_info rv; + + memset(&rv, 0, sizeof(rv)); + + rv.msr_no = msr_no; + rv.reg.l = l; + rv.reg.h = h; + err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1); + + return err ? err : rv.err; +} +EXPORT_SYMBOL(wrmsr_safe_on_cpu); + +/* + * These variants are significantly slower, but allows control over + * the entire 32-bit GPR set. + */ +static void __rdmsr_safe_regs_on_cpu(void *info) +{ + struct msr_regs_info *rv = info; + + rv->err = rdmsr_safe_regs(rv->regs); +} + +static void __wrmsr_safe_regs_on_cpu(void *info) +{ + struct msr_regs_info *rv = info; + + rv->err = wrmsr_safe_regs(rv->regs); +} + +int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs) +{ + int err; + struct msr_regs_info rv; + + rv.regs = regs; + rv.err = -EIO; + err = smp_call_function_single(cpu, __rdmsr_safe_regs_on_cpu, &rv, 1); + + return err ? err : rv.err; +} +EXPORT_SYMBOL(rdmsr_safe_regs_on_cpu); + +int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs) +{ + int err; + struct msr_regs_info rv; + + rv.regs = regs; + rv.err = -EIO; + err = smp_call_function_single(cpu, __wrmsr_safe_regs_on_cpu, &rv, 1); + + return err ? err : rv.err; +} +EXPORT_SYMBOL(wrmsr_safe_regs_on_cpu); diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c index 41628b104b9e..8f8eebdca7d4 100644 --- a/arch/x86/lib/msr.c +++ b/arch/x86/lib/msr.c @@ -1,218 +1,23 @@ #include <linux/module.h> #include <linux/preempt.h> -#include <linux/smp.h> #include <asm/msr.h> -struct msr_info { - u32 msr_no; - struct msr reg; - struct msr *msrs; - int off; - int err; -}; - -static void __rdmsr_on_cpu(void *info) -{ - struct msr_info *rv = info; - struct msr *reg; - int this_cpu = raw_smp_processor_id(); - - if (rv->msrs) - reg = &rv->msrs[this_cpu - rv->off]; - else - reg = &rv->reg; - - rdmsr(rv->msr_no, reg->l, reg->h); -} - -static void __wrmsr_on_cpu(void *info) -{ - struct msr_info *rv = info; - struct msr *reg; - int this_cpu = raw_smp_processor_id(); - - if (rv->msrs) - reg = &rv->msrs[this_cpu - rv->off]; - else - reg = &rv->reg; - - wrmsr(rv->msr_no, reg->l, reg->h); -} - -int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) -{ - int err; - struct msr_info rv; - - memset(&rv, 0, sizeof(rv)); - - rv.msr_no = msr_no; - err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1); - *l = rv.reg.l; - *h = rv.reg.h; - - return err; -} -EXPORT_SYMBOL(rdmsr_on_cpu); - -int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) -{ - int err; - struct msr_info rv; - - memset(&rv, 0, sizeof(rv)); - - rv.msr_no = msr_no; - rv.reg.l = l; - rv.reg.h = h; - err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1); - - return err; -} -EXPORT_SYMBOL(wrmsr_on_cpu); - -static void __rwmsr_on_cpus(const struct cpumask *mask, u32 msr_no, - struct msr *msrs, - void (*msr_func) (void *info)) -{ - struct msr_info rv; - int this_cpu; - - memset(&rv, 0, sizeof(rv)); - - rv.off = cpumask_first(mask); - rv.msrs = msrs; - rv.msr_no = msr_no; - - this_cpu = get_cpu(); - - if (cpumask_test_cpu(this_cpu, mask)) - msr_func(&rv); - - smp_call_function_many(mask, msr_func, &rv, 1); - put_cpu(); -} - -/* rdmsr on a bunch of CPUs - * - * @mask: which CPUs - * @msr_no: which MSR - * @msrs: array of MSR values - * - */ -void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs) -{ - __rwmsr_on_cpus(mask, msr_no, msrs, __rdmsr_on_cpu); -} -EXPORT_SYMBOL(rdmsr_on_cpus); - -/* - * wrmsr on a bunch of CPUs - * - * @mask: which CPUs - * @msr_no: which MSR - * @msrs: array of MSR values - * - */ -void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs) +struct msr *msrs_alloc(void) { - __rwmsr_on_cpus(mask, msr_no, msrs, __wrmsr_on_cpu); -} -EXPORT_SYMBOL(wrmsr_on_cpus); - -/* These "safe" variants are slower and should be used when the target MSR - may not actually exist. */ -static void __rdmsr_safe_on_cpu(void *info) -{ - struct msr_info *rv = info; - - rv->err = rdmsr_safe(rv->msr_no, &rv->reg.l, &rv->reg.h); -} - -static void __wrmsr_safe_on_cpu(void *info) -{ - struct msr_info *rv = info; - - rv->err = wrmsr_safe(rv->msr_no, rv->reg.l, rv->reg.h); -} - -int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) -{ - int err; - struct msr_info rv; - - memset(&rv, 0, sizeof(rv)); - - rv.msr_no = msr_no; - err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1); - *l = rv.reg.l; - *h = rv.reg.h; - - return err ? err : rv.err; -} -EXPORT_SYMBOL(rdmsr_safe_on_cpu); - -int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) -{ - int err; - struct msr_info rv; + struct msr *msrs = NULL; - memset(&rv, 0, sizeof(rv)); + msrs = alloc_percpu(struct msr); + if (!msrs) { + pr_warning("%s: error allocating msrs\n", __func__); + return NULL; + } - rv.msr_no = msr_no; - rv.reg.l = l; - rv.reg.h = h; - err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1); - - return err ? err : rv.err; + return msrs; } -EXPORT_SYMBOL(wrmsr_safe_on_cpu); - -/* - * These variants are significantly slower, but allows control over - * the entire 32-bit GPR set. - */ -struct msr_regs_info { - u32 *regs; - int err; -}; +EXPORT_SYMBOL(msrs_alloc); -static void __rdmsr_safe_regs_on_cpu(void *info) +void msrs_free(struct msr *msrs) { - struct msr_regs_info *rv = info; - - rv->err = rdmsr_safe_regs(rv->regs); -} - -static void __wrmsr_safe_regs_on_cpu(void *info) -{ - struct msr_regs_info *rv = info; - - rv->err = wrmsr_safe_regs(rv->regs); -} - -int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs) -{ - int err; - struct msr_regs_info rv; - - rv.regs = regs; - rv.err = -EIO; - err = smp_call_function_single(cpu, __rdmsr_safe_regs_on_cpu, &rv, 1); - - return err ? err : rv.err; -} -EXPORT_SYMBOL(rdmsr_safe_regs_on_cpu); - -int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs) -{ - int err; - struct msr_regs_info rv; - - rv.regs = regs; - rv.err = -EIO; - err = smp_call_function_single(cpu, __wrmsr_safe_regs_on_cpu, &rv, 1); - - return err ? err : rv.err; + free_percpu(msrs); } -EXPORT_SYMBOL(wrmsr_safe_regs_on_cpu); +EXPORT_SYMBOL(msrs_free); diff --git a/arch/x86/lib/rwsem_64.S b/arch/x86/lib/rwsem_64.S new file mode 100644 index 000000000000..15acecf0d7aa --- /dev/null +++ b/arch/x86/lib/rwsem_64.S @@ -0,0 +1,81 @@ +/* + * x86-64 rwsem wrappers + * + * This interfaces the inline asm code to the slow-path + * C routines. We need to save the call-clobbered regs + * that the asm does not mark as clobbered, and move the + * argument from %rax to %rdi. + * + * NOTE! We don't need to save %rax, because the functions + * will always return the semaphore pointer in %rax (which + * is also the input argument to these helpers) + * + * The following can clobber %rdx because the asm clobbers it: + * call_rwsem_down_write_failed + * call_rwsem_wake + * but %rdi, %rsi, %rcx, %r8-r11 always need saving. + */ + +#include <linux/linkage.h> +#include <asm/rwlock.h> +#include <asm/alternative-asm.h> +#include <asm/frame.h> +#include <asm/dwarf2.h> + +#define save_common_regs \ + pushq %rdi; \ + pushq %rsi; \ + pushq %rcx; \ + pushq %r8; \ + pushq %r9; \ + pushq %r10; \ + pushq %r11 + +#define restore_common_regs \ + popq %r11; \ + popq %r10; \ + popq %r9; \ + popq %r8; \ + popq %rcx; \ + popq %rsi; \ + popq %rdi + +/* Fix up special calling conventions */ +ENTRY(call_rwsem_down_read_failed) + save_common_regs + pushq %rdx + movq %rax,%rdi + call rwsem_down_read_failed + popq %rdx + restore_common_regs + ret + ENDPROC(call_rwsem_down_read_failed) + +ENTRY(call_rwsem_down_write_failed) + save_common_regs + movq %rax,%rdi + call rwsem_down_write_failed + restore_common_regs + ret + ENDPROC(call_rwsem_down_write_failed) + +ENTRY(call_rwsem_wake) + decw %dx /* do nothing if still outstanding active readers */ + jnz 1f + save_common_regs + movq %rax,%rdi + call rwsem_wake + restore_common_regs +1: ret + ENDPROC(call_rwsem_wake) + +/* Fix up special calling conventions */ +ENTRY(call_rwsem_downgrade_wake) + save_common_regs + pushq %rdx + movq %rax,%rdi + call rwsem_downgrade_wake + popq %rdx + restore_common_regs + ret + ENDPROC(call_rwsem_downgrade_wake) diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 71da1bca13cb..738e6593799d 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -18,7 +18,7 @@ static inline pte_t gup_get_pte(pte_t *ptep) #else /* * With get_user_pages_fast, we walk down the pagetables without taking - * any locks. For this we would like to load the pointers atoimcally, + * any locks. For this we would like to load the pointers atomically, * but that is not possible (without expensive cmpxchg8b) on PAE. What * we do have is the guarantee that a pte will only either go from not * present to present, or present to not present or both -- it will not diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index d406c5239019..e71c5cbc8f35 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -266,16 +266,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, if (!after_bootmem) find_early_table_space(end, use_pse, use_gbpages); -#ifdef CONFIG_X86_32 - for (i = 0; i < nr_range; i++) - kernel_physical_mapping_init(mr[i].start, mr[i].end, - mr[i].page_size_mask); - ret = end; -#else /* CONFIG_X86_64 */ for (i = 0; i < nr_range; i++) ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, mr[i].page_size_mask); -#endif #ifdef CONFIG_X86_32 early_ioremap_page_table_range_init(); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index c973f8e2a6cf..2226f2c70ea3 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -241,6 +241,7 @@ kernel_physical_mapping_init(unsigned long start, unsigned long page_size_mask) { int use_pse = page_size_mask == (1<<PG_LEVEL_2M); + unsigned long last_map_addr = end; unsigned long start_pfn, end_pfn; pgd_t *pgd_base = swapper_pg_dir; int pgd_idx, pmd_idx, pte_ofs; @@ -341,9 +342,10 @@ repeat: prot = PAGE_KERNEL_EXEC; pages_4k++; - if (mapping_iter == 1) + if (mapping_iter == 1) { set_pte(pte, pfn_pte(pfn, init_prot)); - else + last_map_addr = (pfn << PAGE_SHIFT) + PAGE_SIZE; + } else set_pte(pte, pfn_pte(pfn, prot)); } } @@ -368,7 +370,7 @@ repeat: mapping_iter = 2; goto repeat; } - return 0; + return last_map_addr; } pte_t *kmap_pte; @@ -892,8 +894,7 @@ void __init mem_init(void) reservedpages << (PAGE_SHIFT-10), datasize >> 10, initsize >> 10, - (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) - ); + totalhigh_pages << (PAGE_SHIFT-10)); printk(KERN_INFO "virtual kernel memory layout:\n" " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5198b9bb34ef..69ddfbd91135 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -49,6 +49,7 @@ #include <asm/numa.h> #include <asm/cacheflush.h> #include <asm/init.h> +#include <linux/bootmem.h> static unsigned long dma_reserve __initdata; @@ -616,6 +617,21 @@ void __init paging_init(void) */ #ifdef CONFIG_MEMORY_HOTPLUG /* + * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need + * updating. + */ +static void update_end_of_memory_vars(u64 start, u64 size) +{ + unsigned long end_pfn = PFN_UP(start + size); + + if (end_pfn > max_pfn) { + max_pfn = end_pfn; + max_low_pfn = end_pfn; + high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; + } +} + +/* * Memory is added always to NORMAL zone. This means you will never get * additional DMA/DMA32 memory. */ @@ -634,6 +650,9 @@ int arch_add_memory(int nid, u64 start, u64 size) ret = __add_pages(nid, zone, start_pfn, nr_pages); WARN_ON_ONCE(ret); + /* update max_pfn, max_low_pfn and high_memory */ + update_end_of_memory_vars(start, size); + return ret; } EXPORT_SYMBOL_GPL(arch_add_memory); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index c246d259822d..5eb1ba74a3a9 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -24,43 +24,6 @@ #include "physaddr.h" -int page_is_ram(unsigned long pagenr) -{ - resource_size_t addr, end; - int i; - - /* - * A special case is the first 4Kb of memory; - * This is a BIOS owned area, not kernel ram, but generally - * not listed as such in the E820 table. - */ - if (pagenr == 0) - return 0; - - /* - * Second special case: Some BIOSen report the PC BIOS - * area (640->1Mb) as ram even though it is not. - */ - if (pagenr >= (BIOS_BEGIN >> PAGE_SHIFT) && - pagenr < (BIOS_END >> PAGE_SHIFT)) - return 0; - - for (i = 0; i < e820.nr_map; i++) { - /* - * Not usable memory: - */ - if (e820.map[i].type != E820_RAM) - continue; - addr = (e820.map[i].addr + PAGE_SIZE-1) >> PAGE_SHIFT; - end = (e820.map[i].addr + e820.map[i].size) >> PAGE_SHIFT; - - - if ((pagenr >= addr) && (pagenr < end)) - return 1; - } - return 0; -} - /* * Fix up the linear direct mapping of the kernel to avoid cache attribute * conflicts. @@ -422,6 +385,10 @@ void __init early_ioremap_init(void) * The boot-ioremap range spans multiple pmds, for which * we are not prepared: */ +#define __FIXADDR_TOP (-PAGE_SIZE) + BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT) + != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT)); +#undef __FIXADDR_TOP if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) { WARN_ON(1); printk(KERN_WARNING "pmd %p != %p\n", diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c index 4901d0dafda6..af3b6c8a436f 100644 --- a/arch/x86/mm/kmemcheck/error.c +++ b/arch/x86/mm/kmemcheck/error.c @@ -106,26 +106,25 @@ void kmemcheck_error_recall(void) switch (e->type) { case KMEMCHECK_ERROR_INVALID_ACCESS: - printk(KERN_ERR "WARNING: kmemcheck: Caught %d-bit read " - "from %s memory (%p)\n", + printk(KERN_WARNING "WARNING: kmemcheck: Caught %d-bit read from %s memory (%p)\n", 8 * e->size, e->state < ARRAY_SIZE(desc) ? desc[e->state] : "(invalid shadow state)", (void *) e->address); - printk(KERN_INFO); + printk(KERN_WARNING); for (i = 0; i < SHADOW_COPY_SIZE; ++i) - printk("%02x", e->memory_copy[i]); - printk("\n"); + printk(KERN_CONT "%02x", e->memory_copy[i]); + printk(KERN_CONT "\n"); - printk(KERN_INFO); + printk(KERN_WARNING); for (i = 0; i < SHADOW_COPY_SIZE; ++i) { if (e->shadow_copy[i] < ARRAY_SIZE(short_desc)) - printk(" %c", short_desc[e->shadow_copy[i]]); + printk(KERN_CONT " %c", short_desc[e->shadow_copy[i]]); else - printk(" ?"); + printk(KERN_CONT " ?"); } - printk("\n"); - printk(KERN_INFO "%*c\n", 2 + 2 + printk(KERN_CONT "\n"); + printk(KERN_WARNING "%*c\n", 2 + 2 * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^'); break; case KMEMCHECK_ERROR_BUG: diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c index 8cc183344140..b3b531a4f8e5 100644 --- a/arch/x86/mm/kmemcheck/kmemcheck.c +++ b/arch/x86/mm/kmemcheck/kmemcheck.c @@ -337,7 +337,7 @@ bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size) if (!shadow) return true; - status = kmemcheck_shadow_test(shadow, size); + status = kmemcheck_shadow_test_all(shadow, size); return status == KMEMCHECK_SHADOW_INITIALIZED; } diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c index 3f66b82076a3..aec124214d97 100644 --- a/arch/x86/mm/kmemcheck/shadow.c +++ b/arch/x86/mm/kmemcheck/shadow.c @@ -125,12 +125,12 @@ void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n) enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size) { +#ifdef CONFIG_KMEMCHECK_PARTIAL_OK uint8_t *x; unsigned int i; x = shadow; -#ifdef CONFIG_KMEMCHECK_PARTIAL_OK /* * Make sure _some_ bytes are initialized. Gcc frequently generates * code to access neighboring bytes. @@ -139,13 +139,25 @@ enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size) if (x[i] == KMEMCHECK_SHADOW_INITIALIZED) return x[i]; } + + return x[0]; #else + return kmemcheck_shadow_test_all(shadow, size); +#endif +} + +enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, unsigned int size) +{ + uint8_t *x; + unsigned int i; + + x = shadow; + /* All bytes must be initialized. */ for (i = 0; i < size; ++i) { if (x[i] != KMEMCHECK_SHADOW_INITIALIZED) return x[i]; } -#endif return x[0]; } diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h index af46d9ab9d86..ff0b2f70fbcb 100644 --- a/arch/x86/mm/kmemcheck/shadow.h +++ b/arch/x86/mm/kmemcheck/shadow.h @@ -11,6 +11,8 @@ enum kmemcheck_shadow { void *kmemcheck_shadow_lookup(unsigned long address); enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size); +enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, + unsigned int size); void kmemcheck_shadow_set(void *shadow, unsigned int size); #endif diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 11a4ad4d6253..536fb6823366 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -5,6 +5,8 @@ * 2008 Pekka Paalanen <pq@iki.fi> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/list.h> #include <linux/rculist.h> #include <linux/spinlock.h> @@ -136,7 +138,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear) pte_t *pte = lookup_address(f->page, &level); if (!pte) { - pr_err("kmmio: no pte for page 0x%08lx\n", f->page); + pr_err("no pte for page 0x%08lx\n", f->page); return -1; } @@ -148,7 +150,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear) clear_pte_presence(pte, clear, &f->old_presence); break; default: - pr_err("kmmio: unexpected page level 0x%x.\n", level); + pr_err("unexpected page level 0x%x.\n", level); return -1; } @@ -170,13 +172,14 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear) static int arm_kmmio_fault_page(struct kmmio_fault_page *f) { int ret; - WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n"); + WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n")); if (f->armed) { - pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n", - f->page, f->count, !!f->old_presence); + pr_warning("double-arm: page 0x%08lx, ref %d, old %d\n", + f->page, f->count, !!f->old_presence); } ret = clear_page_presence(f, true); - WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page); + WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming 0x%08lx failed.\n"), + f->page); f->armed = true; return ret; } @@ -203,7 +206,7 @@ static void disarm_kmmio_fault_page(struct kmmio_fault_page *f) */ /* * Interrupts are disabled on entry as trap3 is an interrupt gate - * and they remain disabled thorough out this function. + * and they remain disabled throughout this function. */ int kmmio_handler(struct pt_regs *regs, unsigned long addr) { @@ -240,24 +243,21 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) * condition needs handling by do_page_fault(), the * page really not being present is the most common. */ - pr_debug("kmmio: secondary hit for 0x%08lx CPU %d.\n", - addr, smp_processor_id()); + pr_debug("secondary hit for 0x%08lx CPU %d.\n", + addr, smp_processor_id()); if (!faultpage->old_presence) - pr_info("kmmio: unexpected secondary hit for " - "address 0x%08lx on CPU %d.\n", addr, - smp_processor_id()); + pr_info("unexpected secondary hit for address 0x%08lx on CPU %d.\n", + addr, smp_processor_id()); } else { /* * Prevent overwriting already in-flight context. * This should not happen, let's hope disarming at * least prevents a panic. */ - pr_emerg("kmmio: recursive probe hit on CPU %d, " - "for address 0x%08lx. Ignoring.\n", - smp_processor_id(), addr); - pr_emerg("kmmio: previous hit was at 0x%08lx.\n", - ctx->addr); + pr_emerg("recursive probe hit on CPU %d, for address 0x%08lx. Ignoring.\n", + smp_processor_id(), addr); + pr_emerg("previous hit was at 0x%08lx.\n", ctx->addr); disarm_kmmio_fault_page(faultpage); } goto no_kmmio_ctx; @@ -302,7 +302,7 @@ no_kmmio: /* * Interrupts are disabled on entry as trap1 is an interrupt gate - * and they remain disabled thorough out this function. + * and they remain disabled throughout this function. * This must always get called as the pair to kmmio_handler(). */ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) @@ -316,8 +316,8 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) * something external causing them (f.e. using a debugger while * mmio tracing enabled), or erroneous behaviour */ - pr_warning("kmmio: unexpected debug trap on CPU %d.\n", - smp_processor_id()); + pr_warning("unexpected debug trap on CPU %d.\n", + smp_processor_id()); goto out; } @@ -425,7 +425,7 @@ int register_kmmio_probe(struct kmmio_probe *p) list_add_rcu(&p->list, &kmmio_probes); while (size < size_lim) { if (add_kmmio_fault_page(p->addr + size)) - pr_err("kmmio: Unable to set page fault.\n"); + pr_err("Unable to set page fault.\n"); size += PAGE_SIZE; } out: @@ -490,7 +490,7 @@ static void remove_kmmio_fault_pages(struct rcu_head *head) * 2. remove_kmmio_fault_pages() * Remove the pages from kmmio_page_table. * 3. rcu_free_kmmio_fault_pages() - * Actally free the kmmio_fault_page structs as with RCU. + * Actually free the kmmio_fault_page structs as with RCU. */ void unregister_kmmio_probe(struct kmmio_probe *p) { @@ -511,7 +511,7 @@ void unregister_kmmio_probe(struct kmmio_probe *p) drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC); if (!drelease) { - pr_crit("kmmio: leaking kmmio_fault_page objects.\n"); + pr_crit("leaking kmmio_fault_page objects.\n"); return; } drelease->release_list = release_list; @@ -538,14 +538,15 @@ static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args) { struct die_args *arg = args; + unsigned long* dr6_p = (unsigned long *)ERR_PTR(arg->err); - if (val == DIE_DEBUG && (arg->err & DR_STEP)) - if (post_kmmio_handler(arg->err, arg->regs) == 1) { + if (val == DIE_DEBUG && (*dr6_p & DR_STEP)) + if (post_kmmio_handler(*dr6_p, arg->regs) == 1) { /* * Reset the BS bit in dr6 (pointed by args->err) to * denote completion of processing */ - (*(unsigned long *)ERR_PTR(arg->err)) &= ~DR_STEP; + *dr6_p &= ~DR_STEP; return NOTIFY_STOP; } diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index c8191defc38a..1dab5194fd9d 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -71,7 +71,7 @@ static int mmap_is_legacy(void) if (current->personality & ADDR_COMPAT_LAYOUT) return 1; - if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) + if (rlimit(RLIMIT_STACK) == RLIM_INFINITY) return 1; return sysctl_legacy_va_layout; @@ -96,7 +96,7 @@ static unsigned long mmap_rnd(void) static unsigned long mmap_base(void) { - unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; + unsigned long gap = rlimit(RLIMIT_STACK); if (gap < MIN_GAP) gap = MIN_GAP; diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 132772a8ec57..34a3291ca103 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -19,6 +19,9 @@ * * Derived from the read-mod example from relay-examples by Tom Zanussi. */ + +#define pr_fmt(fmt) "mmiotrace: " fmt + #define DEBUG 1 #include <linux/module.h> @@ -36,8 +39,6 @@ #include "pf_in.h" -#define NAME "mmiotrace: " - struct trap_reason { unsigned long addr; unsigned long ip; @@ -96,17 +97,18 @@ static void print_pte(unsigned long address) pte_t *pte = lookup_address(address, &level); if (!pte) { - pr_err(NAME "Error in %s: no pte for page 0x%08lx\n", - __func__, address); + pr_err("Error in %s: no pte for page 0x%08lx\n", + __func__, address); return; } if (level == PG_LEVEL_2M) { - pr_emerg(NAME "4MB pages are not currently supported: " - "0x%08lx\n", address); + pr_emerg("4MB pages are not currently supported: 0x%08lx\n", + address); BUG(); } - pr_info(NAME "pte for 0x%lx: 0x%llx 0x%llx\n", address, + pr_info("pte for 0x%lx: 0x%llx 0x%llx\n", + address, (unsigned long long)pte_val(*pte), (unsigned long long)pte_val(*pte) & _PAGE_PRESENT); } @@ -118,22 +120,21 @@ static void print_pte(unsigned long address) static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) { const struct trap_reason *my_reason = &get_cpu_var(pf_reason); - pr_emerg(NAME "unexpected fault for address: 0x%08lx, " - "last fault for address: 0x%08lx\n", - addr, my_reason->addr); + pr_emerg("unexpected fault for address: 0x%08lx, last fault for address: 0x%08lx\n", + addr, my_reason->addr); print_pte(addr); print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip); print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip); #ifdef __i386__ pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", - regs->ax, regs->bx, regs->cx, regs->dx); + regs->ax, regs->bx, regs->cx, regs->dx); pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", - regs->si, regs->di, regs->bp, regs->sp); + regs->si, regs->di, regs->bp, regs->sp); #else pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n", - regs->ax, regs->cx, regs->dx); + regs->ax, regs->cx, regs->dx); pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n", - regs->si, regs->di, regs->bp, regs->sp); + regs->si, regs->di, regs->bp, regs->sp); #endif put_cpu_var(pf_reason); BUG(); @@ -213,7 +214,7 @@ static void post(struct kmmio_probe *p, unsigned long condition, /* this should always return the active_trace count to 0 */ my_reason->active_traces--; if (my_reason->active_traces) { - pr_emerg(NAME "unexpected post handler"); + pr_emerg("unexpected post handler"); BUG(); } @@ -244,7 +245,7 @@ static void ioremap_trace_core(resource_size_t offset, unsigned long size, }; if (!trace) { - pr_err(NAME "kmalloc failed in ioremap\n"); + pr_err("kmalloc failed in ioremap\n"); return; } @@ -282,8 +283,8 @@ void mmiotrace_ioremap(resource_size_t offset, unsigned long size, if (!is_enabled()) /* recheck and proper locking in *_core() */ return; - pr_debug(NAME "ioremap_*(0x%llx, 0x%lx) = %p\n", - (unsigned long long)offset, size, addr); + pr_debug("ioremap_*(0x%llx, 0x%lx) = %p\n", + (unsigned long long)offset, size, addr); if ((filter_offset) && (offset != filter_offset)) return; ioremap_trace_core(offset, size, addr); @@ -301,7 +302,7 @@ static void iounmap_trace_core(volatile void __iomem *addr) struct remap_trace *tmp; struct remap_trace *found_trace = NULL; - pr_debug(NAME "Unmapping %p.\n", addr); + pr_debug("Unmapping %p.\n", addr); spin_lock_irq(&trace_lock); if (!is_enabled()) @@ -363,9 +364,8 @@ static void clear_trace_list(void) * Caller also ensures is_enabled() cannot change. */ list_for_each_entry(trace, &trace_list, list) { - pr_notice(NAME "purging non-iounmapped " - "trace @0x%08lx, size 0x%lx.\n", - trace->probe.addr, trace->probe.len); + pr_notice("purging non-iounmapped trace @0x%08lx, size 0x%lx.\n", + trace->probe.addr, trace->probe.len); if (!nommiotrace) unregister_kmmio_probe(&trace->probe); } @@ -387,7 +387,7 @@ static void enter_uniprocessor(void) if (downed_cpus == NULL && !alloc_cpumask_var(&downed_cpus, GFP_KERNEL)) { - pr_notice(NAME "Failed to allocate mask\n"); + pr_notice("Failed to allocate mask\n"); goto out; } @@ -395,20 +395,19 @@ static void enter_uniprocessor(void) cpumask_copy(downed_cpus, cpu_online_mask); cpumask_clear_cpu(cpumask_first(cpu_online_mask), downed_cpus); if (num_online_cpus() > 1) - pr_notice(NAME "Disabling non-boot CPUs...\n"); + pr_notice("Disabling non-boot CPUs...\n"); put_online_cpus(); for_each_cpu(cpu, downed_cpus) { err = cpu_down(cpu); if (!err) - pr_info(NAME "CPU%d is down.\n", cpu); + pr_info("CPU%d is down.\n", cpu); else - pr_err(NAME "Error taking CPU%d down: %d\n", cpu, err); + pr_err("Error taking CPU%d down: %d\n", cpu, err); } out: if (num_online_cpus() > 1) - pr_warning(NAME "multiple CPUs still online, " - "may miss events.\n"); + pr_warning("multiple CPUs still online, may miss events.\n"); } /* __ref because leave_uniprocessor calls cpu_up which is __cpuinit, @@ -420,13 +419,13 @@ static void __ref leave_uniprocessor(void) if (downed_cpus == NULL || cpumask_weight(downed_cpus) == 0) return; - pr_notice(NAME "Re-enabling CPUs...\n"); + pr_notice("Re-enabling CPUs...\n"); for_each_cpu(cpu, downed_cpus) { err = cpu_up(cpu); if (!err) - pr_info(NAME "enabled CPU%d.\n", cpu); + pr_info("enabled CPU%d.\n", cpu); else - pr_err(NAME "cannot re-enable CPU%d: %d\n", cpu, err); + pr_err("cannot re-enable CPU%d: %d\n", cpu, err); } } @@ -434,8 +433,8 @@ static void __ref leave_uniprocessor(void) static void enter_uniprocessor(void) { if (num_online_cpus() > 1) - pr_warning(NAME "multiple CPUs are online, may miss events. " - "Suggest booting with maxcpus=1 kernel argument.\n"); + pr_warning("multiple CPUs are online, may miss events. " + "Suggest booting with maxcpus=1 kernel argument.\n"); } static void leave_uniprocessor(void) @@ -450,13 +449,13 @@ void enable_mmiotrace(void) goto out; if (nommiotrace) - pr_info(NAME "MMIO tracing disabled.\n"); + pr_info("MMIO tracing disabled.\n"); kmmio_init(); enter_uniprocessor(); spin_lock_irq(&trace_lock); atomic_inc(&mmiotrace_enabled); spin_unlock_irq(&trace_lock); - pr_info(NAME "enabled.\n"); + pr_info("enabled.\n"); out: mutex_unlock(&mmiotrace_mutex); } @@ -475,7 +474,7 @@ void disable_mmiotrace(void) clear_trace_list(); /* guarantees: no more kmmio callbacks */ leave_uniprocessor(); kmmio_cleanup(); - pr_info(NAME "disabled.\n"); + pr_info("disabled.\n"); out: mutex_unlock(&mmiotrace_mutex); } diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 83bbc70d11bb..3307ea8bd43a 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -427,7 +427,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, * Calculate the number of big nodes that can be allocated as a result * of consolidating the remainder. */ - big = ((size & ~FAKE_NODE_MIN_HASH_MASK) & nr_nodes) / + big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / FAKE_NODE_MIN_SIZE; size &= FAKE_NODE_MIN_HASH_MASK; @@ -502,77 +502,99 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, } /* - * Splits num_nodes nodes up equally starting at node_start. The return value - * is the number of nodes split up and addr is adjusted to be at the end of the - * last node allocated. + * Returns the end address of a node so that there is at least `size' amount of + * non-reserved memory or `max_addr' is reached. */ -static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start, - int num_nodes) +static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) { - unsigned int big; - u64 size; - int i; - - if (num_nodes <= 0) - return -1; - if (num_nodes > MAX_NUMNODES) - num_nodes = MAX_NUMNODES; - size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) / - num_nodes; - /* - * Calculate the number of big nodes that can be allocated as a result - * of consolidating the leftovers. - */ - big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) / - FAKE_NODE_MIN_SIZE; - - /* Round down to nearest FAKE_NODE_MIN_SIZE. */ - size &= FAKE_NODE_MIN_HASH_MASK; - if (!size) { - printk(KERN_ERR "Not enough memory for each node. " - "NUMA emulation disabled.\n"); - return -1; - } - - for (i = node_start; i < num_nodes + node_start; i++) { - u64 end = *addr + size; + u64 end = start + size; - if (i < big) - end += FAKE_NODE_MIN_SIZE; - /* - * The final node can have the remaining system RAM. Other - * nodes receive roughly the same amount of available pages. - */ - if (i == num_nodes + node_start - 1) + while (end - start - e820_hole_size(start, end) < size) { + end += FAKE_NODE_MIN_SIZE; + if (end > max_addr) { end = max_addr; - else - while (end - *addr - e820_hole_size(*addr, end) < - size) { - end += FAKE_NODE_MIN_SIZE; - if (end > max_addr) { - end = max_addr; - break; - } - } - if (setup_node_range(i, addr, end - *addr, max_addr) < 0) break; + } } - return i - node_start + 1; + return end; } /* - * Splits the remaining system RAM into chunks of size. The remaining memory is - * always assigned to a final node and can be asymmetric. Returns the number of - * nodes split. + * Sets up fake nodes of `size' interleaved over physical nodes ranging from + * `addr' to `max_addr'. The return value is the number of nodes allocated. */ -static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start, - u64 size) +static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) { - int i = node_start; - size = (size << 20) & FAKE_NODE_MIN_HASH_MASK; - while (!setup_node_range(i++, addr, size, max_addr)) - ; - return i - node_start; + nodemask_t physnode_mask = NODE_MASK_NONE; + u64 min_size; + int ret = 0; + int i; + + if (!size) + return -1; + /* + * The limit on emulated nodes is MAX_NUMNODES, so the size per node is + * increased accordingly if the requested size is too small. This + * creates a uniform distribution of node sizes across the entire + * machine (but not necessarily over physical nodes). + */ + min_size = (max_addr - addr - e820_hole_size(addr, max_addr)) / + MAX_NUMNODES; + min_size = max(min_size, FAKE_NODE_MIN_SIZE); + if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) + min_size = (min_size + FAKE_NODE_MIN_SIZE) & + FAKE_NODE_MIN_HASH_MASK; + if (size < min_size) { + pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", + size >> 20, min_size >> 20); + size = min_size; + } + size &= FAKE_NODE_MIN_HASH_MASK; + + for (i = 0; i < MAX_NUMNODES; i++) + if (physnodes[i].start != physnodes[i].end) + node_set(i, physnode_mask); + /* + * Fill physical nodes with fake nodes of size until there is no memory + * left on any of them. + */ + while (nodes_weight(physnode_mask)) { + for_each_node_mask(i, physnode_mask) { + u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT; + u64 end; + + end = find_end_of_node(physnodes[i].start, + physnodes[i].end, size); + /* + * If there won't be at least FAKE_NODE_MIN_SIZE of + * non-reserved memory in ZONE_DMA32 for the next node, + * this one must extend to the boundary. + */ + if (end < dma32_end && dma32_end - end - + e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + end = dma32_end; + + /* + * If there won't be enough non-reserved memory for the + * next node, this one must extend to the end of the + * physical node. + */ + if (physnodes[i].end - end - + e820_hole_size(end, physnodes[i].end) < size) + end = physnodes[i].end; + + /* + * Setup the fake node that will be allocated as bootmem + * later. If setup_node_range() returns non-zero, there + * is no more memory available on this physical node. + */ + if (setup_node_range(ret++, &physnodes[i].start, + end - physnodes[i].start, + physnodes[i].end) < 0) + node_clear(i, physnode_mask); + } + } + return ret; } /* @@ -582,87 +604,32 @@ static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start, static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn, int acpi, int k8) { - u64 size, addr = start_pfn << PAGE_SHIFT; + u64 addr = start_pfn << PAGE_SHIFT; u64 max_addr = last_pfn << PAGE_SHIFT; - int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; int num_phys_nodes; + int num_nodes; + int i; num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8); /* - * If the numa=fake command-line is just a single number N, split the - * system RAM into N fake nodes. + * If the numa=fake command-line contains a 'M' or 'G', it represents + * the fixed node size. Otherwise, if it is just a single number N, + * split the system RAM into N fake nodes. */ - if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { - long n = simple_strtol(cmdline, NULL, 0); - - num_nodes = split_nodes_interleave(addr, max_addr, - num_phys_nodes, n); - if (num_nodes < 0) - return num_nodes; - goto out; - } + if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) { + u64 size; - /* Parse the command line. */ - for (coeff_flag = 0; ; cmdline++) { - if (*cmdline && isdigit(*cmdline)) { - num = num * 10 + *cmdline - '0'; - continue; - } - if (*cmdline == '*') { - if (num > 0) - coeff = num; - coeff_flag = 1; - } - if (!*cmdline || *cmdline == ',') { - if (!coeff_flag) - coeff = 1; - /* - * Round down to the nearest FAKE_NODE_MIN_SIZE. - * Command-line coefficients are in megabytes. - */ - size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK; - if (size) - for (i = 0; i < coeff; i++, num_nodes++) - if (setup_node_range(num_nodes, &addr, - size, max_addr) < 0) - goto done; - if (!*cmdline) - break; - coeff_flag = 0; - coeff = -1; - } - num = 0; - } -done: - if (!num_nodes) - return -1; - /* Fill remainder of system RAM, if appropriate. */ - if (addr < max_addr) { - if (coeff_flag && coeff < 0) { - /* Split remaining nodes into num-sized chunks */ - num_nodes += split_nodes_by_size(&addr, max_addr, - num_nodes, num); - goto out; - } - switch (*(cmdline - 1)) { - case '*': - /* Split remaining nodes into coeff chunks */ - if (coeff <= 0) - break; - num_nodes += split_nodes_equally(&addr, max_addr, - num_nodes, coeff); - break; - case ',': - /* Do not allocate remaining system RAM */ - break; - default: - /* Give one final node */ - setup_node_range(num_nodes, &addr, max_addr - addr, - max_addr); - num_nodes++; - } + size = memparse(cmdline, &cmdline); + num_nodes = split_nodes_size_interleave(addr, max_addr, size); + } else { + unsigned long n; + + n = simple_strtoul(cmdline, NULL, 0); + num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n); } -out: + + if (num_nodes < 0) + return num_nodes; memnode_shift = compute_hash_shift(nodes, num_nodes, NULL); if (memnode_shift < 0) { memnode_shift = 0; diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 66b55d6e69ed..ae9648eb1c7f 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -704,9 +704,8 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, if (!range_is_allowed(pfn, size)) return 0; - if (file->f_flags & O_SYNC) { + if (file->f_flags & O_DSYNC) flags = _PAGE_CACHE_UC_MINUS; - } #ifdef CONFIG_X86_32 /* diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index ed34f5e35999..c9ba9deafe83 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -6,6 +6,14 @@ #define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO +#ifdef CONFIG_HIGHPTE +#define PGALLOC_USER_GFP __GFP_HIGHMEM +#else +#define PGALLOC_USER_GFP 0 +#endif + +gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP; + pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { return (pte_t *)__get_free_page(PGALLOC_GFP); @@ -15,16 +23,29 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) { struct page *pte; -#ifdef CONFIG_HIGHPTE - pte = alloc_pages(PGALLOC_GFP | __GFP_HIGHMEM, 0); -#else - pte = alloc_pages(PGALLOC_GFP, 0); -#endif + pte = alloc_pages(__userpte_alloc_gfp, 0); if (pte) pgtable_page_ctor(pte); return pte; } +static int __init setup_userpte(char *arg) +{ + if (!arg) + return -EINVAL; + + /* + * "userpte=nohigh" disables allocation of user pagetables in + * high memory. + */ + if (strcmp(arg, "nohigh") == 0) + __userpte_alloc_gfp &= ~__GFP_HIGHMEM; + else + return -EINVAL; + return 0; +} +early_param("userpte", setup_userpte); + void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { pgtable_page_dtor(pte); diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c index 6f8aa33031c7..9324f13492d5 100644 --- a/arch/x86/mm/srat_32.c +++ b/arch/x86/mm/srat_32.c @@ -267,6 +267,8 @@ int __init get_memcfg_from_srat(void) e820_register_active_regions(chunk->nid, chunk->start_pfn, min(chunk->end_pfn, max_pfn)); } + /* for out of order entries in SRAT */ + sort_node_map(); for_each_online_node(nid) { unsigned long start = node_start_pfn[nid]; diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index d89075489664..28c68762648f 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -229,9 +229,11 @@ update_nodes_add(int node, unsigned long start, unsigned long end) printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n"); } - if (changed) + if (changed) { + node_set(node, cpu_nodes_parsed); printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end); + } } /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ @@ -317,7 +319,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes) unsigned long s = nodes[i].start >> PAGE_SHIFT; unsigned long e = nodes[i].end >> PAGE_SHIFT; pxmram += e - s; - pxmram -= absent_pages_in_range(s, e); + pxmram -= __absent_pages_in_range(i, s, e); if ((long)pxmram < 0) pxmram = 0; } @@ -373,6 +375,8 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) for_each_node_mask(i, nodes_parsed) e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, nodes[i].end >> PAGE_SHIFT); + /* for out of order entries in SRAT */ + sort_node_map(); if (!nodes_cover_memory(nodes)) { bad_srat(); return -1; diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 65b58e4b0b8b..426f3a1a64d3 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -41,7 +41,7 @@ union smp_flush_state { struct { struct mm_struct *flush_mm; unsigned long flush_va; - spinlock_t tlbstate_lock; + raw_spinlock_t tlbstate_lock; DECLARE_BITMAP(flush_cpumask, NR_CPUS); }; char pad[INTERNODE_CACHE_BYTES]; @@ -181,7 +181,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is * probably not worth checking this for a cache-hot lock. */ - spin_lock(&f->tlbstate_lock); + raw_spin_lock(&f->tlbstate_lock); f->flush_mm = mm; f->flush_va = va; @@ -199,7 +199,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, f->flush_mm = NULL; f->flush_va = 0; - spin_unlock(&f->tlbstate_lock); + raw_spin_unlock(&f->tlbstate_lock); } void native_flush_tlb_others(const struct cpumask *cpumask, @@ -223,7 +223,7 @@ static int __cpuinit init_smp_flush(void) int i; for (i = 0; i < ARRAY_SIZE(flush_state); i++) - spin_lock_init(&flush_state[i].tlbstate_lock); + raw_spin_lock_init(&flush_state[i].tlbstate_lock); return 0; } diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index 044897be021f..3855096c59b8 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -41,10 +41,11 @@ static void backtrace_address(void *data, unsigned long addr, int reliable) } static struct stacktrace_ops backtrace_ops = { - .warning = backtrace_warning, - .warning_symbol = backtrace_warning_symbol, - .stack = backtrace_stack, - .address = backtrace_address, + .warning = backtrace_warning, + .warning_symbol = backtrace_warning_symbol, + .stack = backtrace_stack, + .address = backtrace_address, + .walk_stack = print_context_stack, }; struct frame_head { diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index cb88b1a0bd5f..2c505ee71014 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -159,7 +159,7 @@ static int nmi_setup_mux(void) for_each_possible_cpu(i) { per_cpu(cpu_msrs, i).multiplex = - kmalloc(multiplex_size, GFP_KERNEL); + kzalloc(multiplex_size, GFP_KERNEL); if (!per_cpu(cpu_msrs, i).multiplex) return 0; } @@ -179,7 +179,6 @@ static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) if (counter_config[i].enabled) { multiplex[i].saved = -(u64)counter_config[i].count; } else { - multiplex[i].addr = 0; multiplex[i].saved = 0; } } @@ -189,25 +188,27 @@ static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs) { + struct op_msr *counters = msrs->counters; struct op_msr *multiplex = msrs->multiplex; int i; for (i = 0; i < model->num_counters; ++i) { int virt = op_x86_phys_to_virt(i); - if (multiplex[virt].addr) - rdmsrl(multiplex[virt].addr, multiplex[virt].saved); + if (counters[i].addr) + rdmsrl(counters[i].addr, multiplex[virt].saved); } } static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs) { + struct op_msr *counters = msrs->counters; struct op_msr *multiplex = msrs->multiplex; int i; for (i = 0; i < model->num_counters; ++i) { int virt = op_x86_phys_to_virt(i); - if (multiplex[virt].addr) - wrmsrl(multiplex[virt].addr, multiplex[virt].saved); + if (counters[i].addr) + wrmsrl(counters[i].addr, multiplex[virt].saved); } } @@ -222,7 +223,7 @@ static void nmi_cpu_switch(void *dummy) /* move to next set */ si += model->num_counters; - if ((si > model->num_virt_counters) || (counter_config[si].count == 0)) + if ((si >= model->num_virt_counters) || (counter_config[si].count == 0)) per_cpu(switch_index, cpu) = 0; else per_cpu(switch_index, cpu) = si; @@ -303,11 +304,11 @@ static int allocate_msrs(void) int i; for_each_possible_cpu(i) { - per_cpu(cpu_msrs, i).counters = kmalloc(counters_size, + per_cpu(cpu_msrs, i).counters = kzalloc(counters_size, GFP_KERNEL); if (!per_cpu(cpu_msrs, i).counters) return 0; - per_cpu(cpu_msrs, i).controls = kmalloc(controls_size, + per_cpu(cpu_msrs, i).controls = kzalloc(controls_size, GFP_KERNEL); if (!per_cpu(cpu_msrs, i).controls) return 0; @@ -598,6 +599,7 @@ static int __init ppro_init(char **cpu_type) case 15: case 23: *cpu_type = "i386/core_2"; break; + case 0x2e: case 26: spec = &op_arch_perfmon_spec; *cpu_type = "i386/core_i7"; diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 39686c29f03a..6a58256dce9f 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -22,6 +22,9 @@ #include <asm/ptrace.h> #include <asm/msr.h> #include <asm/nmi.h> +#include <asm/apic.h> +#include <asm/processor.h> +#include <asm/cpufeature.h> #include "op_x86_model.h" #include "op_counter.h" @@ -43,15 +46,13 @@ static unsigned long reset_value[NUM_VIRT_COUNTERS]; -#ifdef CONFIG_OPROFILE_IBS - /* IbsFetchCtl bits/masks */ #define IBS_FETCH_RAND_EN (1ULL<<57) #define IBS_FETCH_VAL (1ULL<<49) #define IBS_FETCH_ENABLE (1ULL<<48) #define IBS_FETCH_CNT_MASK 0xFFFF0000ULL -/*IbsOpCtl bits */ +/* IbsOpCtl bits */ #define IBS_OP_CNT_CTL (1ULL<<19) #define IBS_OP_VAL (1ULL<<18) #define IBS_OP_ENABLE (1ULL<<17) @@ -59,7 +60,7 @@ static unsigned long reset_value[NUM_VIRT_COUNTERS]; #define IBS_FETCH_SIZE 6 #define IBS_OP_SIZE 12 -static int has_ibs; /* AMD Family10h and later */ +static u32 ibs_caps; struct op_ibs_config { unsigned long op_enabled; @@ -71,24 +72,52 @@ struct op_ibs_config { }; static struct op_ibs_config ibs_config; +static u64 ibs_op_ctl; -#endif +/* + * IBS cpuid feature detection + */ -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX +#define IBS_CPUID_FEATURES 0x8000001b + +/* + * Same bit mask as for IBS cpuid feature flags (Fn8000_001B_EAX), but + * bit 0 is used to indicate the existence of IBS. + */ +#define IBS_CAPS_AVAIL (1LL<<0) +#define IBS_CAPS_RDWROPCNT (1LL<<3) +#define IBS_CAPS_OPCNT (1LL<<4) + +/* + * IBS randomization macros + */ +#define IBS_RANDOM_BITS 12 +#define IBS_RANDOM_MASK ((1ULL << IBS_RANDOM_BITS) - 1) +#define IBS_RANDOM_MAXCNT_OFFSET (1ULL << (IBS_RANDOM_BITS - 5)) -static void op_mux_fill_in_addresses(struct op_msrs * const msrs) +static u32 get_ibs_caps(void) { - int i; + u32 ibs_caps; + unsigned int max_level; - for (i = 0; i < NUM_VIRT_COUNTERS; i++) { - int hw_counter = op_x86_virt_to_phys(i); - if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) - msrs->multiplex[i].addr = MSR_K7_PERFCTR0 + hw_counter; - else - msrs->multiplex[i].addr = 0; - } + if (!boot_cpu_has(X86_FEATURE_IBS)) + return 0; + + /* check IBS cpuid feature flags */ + max_level = cpuid_eax(0x80000000); + if (max_level < IBS_CPUID_FEATURES) + return IBS_CAPS_AVAIL; + + ibs_caps = cpuid_eax(IBS_CPUID_FEATURES); + if (!(ibs_caps & IBS_CAPS_AVAIL)) + /* cpuid flags not valid */ + return IBS_CAPS_AVAIL; + + return ibs_caps; } +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, struct op_msrs const * const msrs) { @@ -98,7 +127,7 @@ static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, /* enable active counters */ for (i = 0; i < NUM_COUNTERS; ++i) { int virt = op_x86_phys_to_virt(i); - if (!counter_config[virt].enabled) + if (!reset_value[virt]) continue; rdmsrl(msrs->controls[i].addr, val); val &= model->reserved; @@ -107,10 +136,6 @@ static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, } } -#else - -static inline void op_mux_fill_in_addresses(struct op_msrs * const msrs) { } - #endif /* functions for op_amd_spec */ @@ -122,18 +147,12 @@ static void op_amd_fill_in_addresses(struct op_msrs * const msrs) for (i = 0; i < NUM_COUNTERS; i++) { if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; - else - msrs->counters[i].addr = 0; } for (i = 0; i < NUM_CONTROLS; i++) { if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; - else - msrs->controls[i].addr = 0; } - - op_mux_fill_in_addresses(msrs); } static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, @@ -144,7 +163,8 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, /* setup reset_value */ for (i = 0; i < NUM_VIRT_COUNTERS; ++i) { - if (counter_config[i].enabled) + if (counter_config[i].enabled + && msrs->counters[op_x86_virt_to_phys(i)].addr) reset_value[i] = counter_config[i].count; else reset_value[i] = 0; @@ -152,9 +172,18 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, /* clear all counters */ for (i = 0; i < NUM_CONTROLS; ++i) { - if (unlikely(!msrs->controls[i].addr)) + if (unlikely(!msrs->controls[i].addr)) { + if (counter_config[i].enabled && !smp_processor_id()) + /* + * counter is reserved, this is on all + * cpus, so report only for cpu #0 + */ + op_x86_warn_reserved(i); continue; + } rdmsrl(msrs->controls[i].addr, val); + if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) + op_x86_warn_in_use(i); val &= model->reserved; wrmsrl(msrs->controls[i].addr, val); } @@ -169,9 +198,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, /* enable active counters */ for (i = 0; i < NUM_COUNTERS; ++i) { int virt = op_x86_phys_to_virt(i); - if (!counter_config[virt].enabled) - continue; - if (!msrs->counters[i].addr) + if (!reset_value[virt]) continue; /* setup counter registers */ @@ -185,7 +212,60 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, } } -#ifdef CONFIG_OPROFILE_IBS +/* + * 16-bit Linear Feedback Shift Register (LFSR) + * + * 16 14 13 11 + * Feedback polynomial = X + X + X + X + 1 + */ +static unsigned int lfsr_random(void) +{ + static unsigned int lfsr_value = 0xF00D; + unsigned int bit; + + /* Compute next bit to shift in */ + bit = ((lfsr_value >> 0) ^ + (lfsr_value >> 2) ^ + (lfsr_value >> 3) ^ + (lfsr_value >> 5)) & 0x0001; + + /* Advance to next register value */ + lfsr_value = (lfsr_value >> 1) | (bit << 15); + + return lfsr_value; +} + +/* + * IBS software randomization + * + * The IBS periodic op counter is randomized in software. The lower 12 + * bits of the 20 bit counter are randomized. IbsOpCurCnt is + * initialized with a 12 bit random value. + */ +static inline u64 op_amd_randomize_ibs_op(u64 val) +{ + unsigned int random = lfsr_random(); + + if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) + /* + * Work around if the hw can not write to IbsOpCurCnt + * + * Randomize the lower 8 bits of the 16 bit + * IbsOpMaxCnt [15:0] value in the range of -128 to + * +127 by adding/subtracting an offset to the + * maximum count (IbsOpMaxCnt). + * + * To avoid over or underflows and protect upper bits + * starting at bit 16, the initial value for + * IbsOpMaxCnt must fit in the range from 0x0081 to + * 0xff80. + */ + val += (s8)(random >> 4); + else + val |= (u64)(random & IBS_RANDOM_MASK) << 32; + + return val; +} static inline void op_amd_handle_ibs(struct pt_regs * const regs, @@ -194,7 +274,7 @@ op_amd_handle_ibs(struct pt_regs * const regs, u64 val, ctl; struct op_entry entry; - if (!has_ibs) + if (!ibs_caps) return; if (ibs_config.fetch_enabled) { @@ -236,8 +316,7 @@ op_amd_handle_ibs(struct pt_regs * const regs, oprofile_write_commit(&entry); /* reenable the IRQ */ - ctl &= ~IBS_OP_VAL & 0xFFFFFFFF; - ctl |= IBS_OP_ENABLE; + ctl = op_amd_randomize_ibs_op(ibs_op_ctl); wrmsrl(MSR_AMD64_IBSOPCTL, ctl); } } @@ -246,41 +325,57 @@ op_amd_handle_ibs(struct pt_regs * const regs, static inline void op_amd_start_ibs(void) { u64 val; - if (has_ibs && ibs_config.fetch_enabled) { + + if (!ibs_caps) + return; + + if (ibs_config.fetch_enabled) { val = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0; val |= IBS_FETCH_ENABLE; wrmsrl(MSR_AMD64_IBSFETCHCTL, val); } - if (has_ibs && ibs_config.op_enabled) { - val = (ibs_config.max_cnt_op >> 4) & 0xFFFF; - val |= ibs_config.dispatched_ops ? IBS_OP_CNT_CTL : 0; - val |= IBS_OP_ENABLE; + if (ibs_config.op_enabled) { + ibs_op_ctl = ibs_config.max_cnt_op >> 4; + if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) { + /* + * IbsOpCurCnt not supported. See + * op_amd_randomize_ibs_op() for details. + */ + ibs_op_ctl = clamp(ibs_op_ctl, 0x0081ULL, 0xFF80ULL); + } else { + /* + * The start value is randomized with a + * positive offset, we need to compensate it + * with the half of the randomized range. Also + * avoid underflows. + */ + ibs_op_ctl = min(ibs_op_ctl + IBS_RANDOM_MAXCNT_OFFSET, + 0xFFFFULL); + } + if (ibs_caps & IBS_CAPS_OPCNT && ibs_config.dispatched_ops) + ibs_op_ctl |= IBS_OP_CNT_CTL; + ibs_op_ctl |= IBS_OP_ENABLE; + val = op_amd_randomize_ibs_op(ibs_op_ctl); wrmsrl(MSR_AMD64_IBSOPCTL, val); } } static void op_amd_stop_ibs(void) { - if (has_ibs && ibs_config.fetch_enabled) + if (!ibs_caps) + return; + + if (ibs_config.fetch_enabled) /* clear max count and enable */ wrmsrl(MSR_AMD64_IBSFETCHCTL, 0); - if (has_ibs && ibs_config.op_enabled) + if (ibs_config.op_enabled) /* clear max count and enable */ wrmsrl(MSR_AMD64_IBSOPCTL, 0); } -#else - -static inline void op_amd_handle_ibs(struct pt_regs * const regs, - struct op_msrs const * const msrs) { } -static inline void op_amd_start_ibs(void) { } -static inline void op_amd_stop_ibs(void) { } - -#endif - static int op_amd_check_ctrs(struct pt_regs * const regs, struct op_msrs const * const msrs) { @@ -355,8 +450,6 @@ static void op_amd_shutdown(struct op_msrs const * const msrs) } } -#ifdef CONFIG_OPROFILE_IBS - static u8 ibs_eilvt_off; static inline void apic_init_ibs_nmi_per_cpu(void *arg) @@ -405,45 +498,36 @@ static int init_ibs_nmi(void) return 1; } -#ifdef CONFIG_NUMA - /* Sanity check */ - /* Works only for 64bit with proper numa implementation. */ - if (nodes != num_possible_nodes()) { - printk(KERN_DEBUG "Failed to setup CPU node(s) for IBS, " - "found: %d, expected %d", - nodes, num_possible_nodes()); - return 1; - } -#endif return 0; } /* uninitialize the APIC for the IBS interrupts if needed */ static void clear_ibs_nmi(void) { - if (has_ibs) + if (ibs_caps) on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); } /* initialize the APIC for the IBS interrupts if available */ static void ibs_init(void) { - has_ibs = boot_cpu_has(X86_FEATURE_IBS); + ibs_caps = get_ibs_caps(); - if (!has_ibs) + if (!ibs_caps) return; if (init_ibs_nmi()) { - has_ibs = 0; + ibs_caps = 0; return; } - printk(KERN_INFO "oprofile: AMD IBS detected\n"); + printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", + (unsigned)ibs_caps); } static void ibs_exit(void) { - if (!has_ibs) + if (!ibs_caps) return; clear_ibs_nmi(); @@ -463,7 +547,7 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root) if (ret) return ret; - if (!has_ibs) + if (!ibs_caps) return ret; /* model specific files */ @@ -473,7 +557,7 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root) ibs_config.fetch_enabled = 0; ibs_config.max_cnt_op = 250000; ibs_config.op_enabled = 0; - ibs_config.dispatched_ops = 1; + ibs_config.dispatched_ops = 0; dir = oprofilefs_mkdir(sb, root, "ibs_fetch"); oprofilefs_create_ulong(sb, dir, "enable", @@ -488,8 +572,9 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root) &ibs_config.op_enabled); oprofilefs_create_ulong(sb, dir, "max_count", &ibs_config.max_cnt_op); - oprofilefs_create_ulong(sb, dir, "dispatched_ops", - &ibs_config.dispatched_ops); + if (ibs_caps & IBS_CAPS_OPCNT) + oprofilefs_create_ulong(sb, dir, "dispatched_ops", + &ibs_config.dispatched_ops); return 0; } @@ -507,19 +592,6 @@ static void op_amd_exit(void) ibs_exit(); } -#else - -/* no IBS support */ - -static int op_amd_init(struct oprofile_operations *ops) -{ - return 0; -} - -static void op_amd_exit(void) {} - -#endif /* CONFIG_OPROFILE_IBS */ - struct op_x86_model_spec op_amd_spec = { .num_counters = NUM_COUNTERS, .num_controls = NUM_CONTROLS, diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index ac6b354becdf..e6a160a4684a 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -394,12 +394,6 @@ static void p4_fill_in_addresses(struct op_msrs * const msrs) setup_num_counters(); stag = get_stagger(); - /* initialize some registers */ - for (i = 0; i < num_counters; ++i) - msrs->counters[i].addr = 0; - for (i = 0; i < num_controls; ++i) - msrs->controls[i].addr = 0; - /* the counter & cccr registers we pay attention to */ for (i = 0; i < num_counters; ++i) { addr = p4_counters[VIRT_CTR(stag, i)].counter_address; diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 8eb05878554c..5d1727ba409e 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -37,15 +37,11 @@ static void ppro_fill_in_addresses(struct op_msrs * const msrs) for (i = 0; i < num_counters; i++) { if (reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i)) msrs->counters[i].addr = MSR_P6_PERFCTR0 + i; - else - msrs->counters[i].addr = 0; } for (i = 0; i < num_counters; i++) { if (reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i; - else - msrs->controls[i].addr = 0; } } @@ -57,7 +53,7 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model, int i; if (!reset_value) { - reset_value = kmalloc(sizeof(reset_value[0]) * num_counters, + reset_value = kzalloc(sizeof(reset_value[0]) * num_counters, GFP_ATOMIC); if (!reset_value) return; @@ -82,9 +78,18 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model, /* clear all counters */ for (i = 0; i < num_counters; ++i) { - if (unlikely(!msrs->controls[i].addr)) + if (unlikely(!msrs->controls[i].addr)) { + if (counter_config[i].enabled && !smp_processor_id()) + /* + * counter is reserved, this is on all + * cpus, so report only for cpu #0 + */ + op_x86_warn_reserved(i); continue; + } rdmsrl(msrs->controls[i].addr, val); + if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) + op_x86_warn_in_use(i); val &= model->reserved; wrmsrl(msrs->controls[i].addr, val); } diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 7b8e75d16081..ff82a755edd4 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -57,6 +57,26 @@ struct op_x86_model_spec { struct op_counter_config; +static inline void op_x86_warn_in_use(int counter) +{ + /* + * The warning indicates an already running counter. If + * oprofile doesn't collect data, then try using a different + * performance counter on your platform to monitor the desired + * event. Delete counter #%d from the desired event by editing + * the /usr/share/oprofile/%s/<cpu>/events file. If the event + * cannot be monitored by any other counter, contact your + * hardware or BIOS vendor. + */ + pr_warning("oprofile: counter #%d on cpu #%d may already be used\n", + counter, smp_processor_id()); +} + +static inline void op_x86_warn_reserved(int counter) +{ + pr_warning("oprofile: counter #%d is already reserved\n", counter); +} + extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, struct op_counter_config *counter_config); extern int op_x86_phys_to_virt(int phys); diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index d49202e740ea..39fba37f702f 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -15,3 +15,8 @@ obj-$(CONFIG_X86_NUMAQ) += numaq_32.o obj-y += common.o early.o obj-y += amd_bus.o +obj-$(CONFIG_X86_64) += bus_numa.o + +ifeq ($(CONFIG_PCI_DEBUG),y) +EXTRA_CFLAGS += -DDEBUG +endif diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 1014eb4bfc37..5f11ff6f5389 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -7,6 +7,7 @@ #include <asm/pci_x86.h> struct pci_root_info { + struct acpi_device *bridge; char *name; unsigned int res_num; struct resource *res; @@ -14,6 +15,51 @@ struct pci_root_info { int busnum; }; +static bool pci_use_crs = true; + +static int __init set_use_crs(const struct dmi_system_id *id) +{ + pci_use_crs = true; + return 0; +} + +static const struct dmi_system_id pci_use_crs_table[] __initconst = { + /* http://bugzilla.kernel.org/show_bug.cgi?id=14183 */ + { + .callback = set_use_crs, + .ident = "IBM System x3800", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "IBM"), + DMI_MATCH(DMI_PRODUCT_NAME, "x3800"), + }, + }, + {} +}; + +void __init pci_acpi_crs_quirks(void) +{ + int year; + + if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) && year < 2008) + pci_use_crs = false; + + dmi_check_system(pci_use_crs_table); + + /* + * If the user specifies "pci=use_crs" or "pci=nocrs" explicitly, that + * takes precedence over anything we figured out above. + */ + if (pci_probe & PCI_ROOT_NO_CRS) + pci_use_crs = false; + else if (pci_probe & PCI_USE__CRS) + pci_use_crs = true; + + printk(KERN_INFO "PCI: %s host bridge windows from ACPI; " + "if necessary, use \"pci=%s\" and report a bug\n", + pci_use_crs ? "Using" : "Ignoring", + pci_use_crs ? "nocrs" : "use_crs"); +} + static acpi_status resource_to_addr(struct acpi_resource *resource, struct acpi_resource_address64 *addr) @@ -44,18 +90,28 @@ count_resource(struct acpi_resource *acpi_res, void *data) return AE_OK; } -static int -bus_has_transparent_bridge(struct pci_bus *bus) +static void +align_resource(struct acpi_device *bridge, struct resource *res) { - struct pci_dev *dev; + int align = (res->flags & IORESOURCE_MEM) ? 16 : 4; - list_for_each_entry(dev, &bus->devices, bus_list) { - u16 class = dev->class >> 8; - - if (class == PCI_CLASS_BRIDGE_PCI && dev->transparent) - return true; + /* + * Host bridge windows are not BARs, but the decoders on the PCI side + * that claim this address space have starting alignment and length + * constraints, so fix any obvious BIOS goofs. + */ + if (!IS_ALIGNED(res->start, align)) { + dev_printk(KERN_DEBUG, &bridge->dev, + "host bridge window %pR invalid; " + "aligning start to %d-byte boundary\n", res, align); + res->start &= ~(align - 1); + } + if (!IS_ALIGNED(res->end + 1, align)) { + dev_printk(KERN_DEBUG, &bridge->dev, + "host bridge window %pR invalid; " + "aligning end to %d-byte boundary\n", res, align); + res->end = ALIGN(res->end, align) - 1; } - return false; } static acpi_status @@ -67,12 +123,8 @@ setup_resource(struct acpi_resource *acpi_res, void *data) acpi_status status; unsigned long flags; struct resource *root; - int max_root_bus_resources = PCI_BUS_NUM_RESOURCES; u64 start, end; - if (bus_has_transparent_bridge(info->bus)) - max_root_bus_resources -= 3; - status = resource_to_addr(acpi_res, &addr); if (!ACPI_SUCCESS(status)) return AE_OK; @@ -90,14 +142,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data) start = addr.minimum + addr.translation_offset; end = start + addr.address_length - 1; - if (info->res_num >= max_root_bus_resources) { - printk(KERN_WARNING "PCI: Failed to allocate 0x%lx-0x%lx " - "from %s for %s due to _CRS returning more than " - "%d resource descriptors\n", (unsigned long) start, - (unsigned long) end, root->name, info->name, - max_root_bus_resources); - return AE_OK; - } res = &info->res[info->res_num]; res->name = info->name; @@ -105,14 +149,28 @@ setup_resource(struct acpi_resource *acpi_res, void *data) res->start = start; res->end = end; res->child = NULL; + align_resource(info->bridge, res); + + if (!pci_use_crs) { + dev_printk(KERN_DEBUG, &info->bridge->dev, + "host bridge window %pR (ignored)\n", res); + return AE_OK; + } if (insert_resource(root, res)) { - printk(KERN_ERR "PCI: Failed to allocate 0x%lx-0x%lx " - "from %s for %s\n", (unsigned long) res->start, - (unsigned long) res->end, root->name, info->name); + dev_err(&info->bridge->dev, + "can't allocate host bridge window %pR\n", res); } else { - info->bus->resource[info->res_num] = res; + pci_bus_add_resource(info->bus, res, 0); info->res_num++; + if (addr.translation_offset) + dev_info(&info->bridge->dev, "host bridge window %pR " + "(PCI address [%#llx-%#llx])\n", + res, res->start - addr.translation_offset, + res->end - addr.translation_offset); + else + dev_info(&info->bridge->dev, + "host bridge window %pR\n", res); } return AE_OK; } @@ -124,6 +182,10 @@ get_current_resources(struct acpi_device *device, int busnum, struct pci_root_info info; size_t size; + if (pci_use_crs) + pci_bus_remove_resources(bus); + + info.bridge = device; info.bus = bus; info.res_num = 0; acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource, @@ -163,8 +225,9 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do #endif if (domain && !pci_domains_supported) { - printk(KERN_WARNING "PCI: Multiple domains not supported " - "(dom %d, bus %d)\n", domain, busnum); + printk(KERN_WARNING "pci_bus %04x:%02x: " + "ignored (multiple domains not supported)\n", + domain, busnum); return NULL; } @@ -188,7 +251,8 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do */ sd = kzalloc(sizeof(*sd), GFP_KERNEL); if (!sd) { - printk(KERN_ERR "PCI: OOM, not probing PCI bus %02x\n", busnum); + printk(KERN_WARNING "pci_bus %04x:%02x: " + "ignored (out of memory)\n", domain, busnum); return NULL; } @@ -209,9 +273,7 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do } else { bus = pci_create_bus(NULL, busnum, &pci_root_ops, sd); if (bus) { - if (pci_probe & PCI_USE__CRS) - get_current_resources(device, busnum, domain, - bus); + get_current_resources(device, busnum, domain, bus); bus->subordinate = pci_scan_child_bus(bus); } } diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index 572ee9782f2a..95ecbd495955 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -6,10 +6,10 @@ #ifdef CONFIG_X86_64 #include <asm/pci-direct.h> -#include <asm/mpspec.h> -#include <linux/cpumask.h> #endif +#include "bus_numa.h" + /* * This discovers the pcibus <-> node mapping on AMD K8. * also get peer root bus resource for io,mmio @@ -17,67 +17,6 @@ #ifdef CONFIG_X86_64 -/* - * sub bus (transparent) will use entres from 3 to store extra from root, - * so need to make sure have enought slot there, increase PCI_BUS_NUM_RESOURCES? - */ -#define RES_NUM 16 -struct pci_root_info { - char name[12]; - unsigned int res_num; - struct resource res[RES_NUM]; - int bus_min; - int bus_max; - int node; - int link; -}; - -/* 4 at this time, it may become to 32 */ -#define PCI_ROOT_NR 4 -static int pci_root_num; -static struct pci_root_info pci_root_info[PCI_ROOT_NR]; - -void x86_pci_root_bus_res_quirks(struct pci_bus *b) -{ - int i; - int j; - struct pci_root_info *info; - - /* don't go for it if _CRS is used already */ - if (b->resource[0] != &ioport_resource || - b->resource[1] != &iomem_resource) - return; - - /* if only one root bus, don't need to anything */ - if (pci_root_num < 2) - return; - - for (i = 0; i < pci_root_num; i++) { - if (pci_root_info[i].bus_min == b->number) - break; - } - - if (i == pci_root_num) - return; - - printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n", - b->number); - - info = &pci_root_info[i]; - for (j = 0; j < info->res_num; j++) { - struct resource *res; - struct resource *root; - - res = &info->res[j]; - b->resource[j] = res; - if (res->flags & IORESOURCE_IO) - root = &ioport_resource; - else - root = &iomem_resource; - insert_resource(root, res); - } -} - #define RANGE_NUM 16 struct res_range { @@ -130,52 +69,6 @@ static void __init update_range(struct res_range *range, size_t start, } } -static void __init update_res(struct pci_root_info *info, size_t start, - size_t end, unsigned long flags, int merge) -{ - int i; - struct resource *res; - - if (!merge) - goto addit; - - /* try to merge it with old one */ - for (i = 0; i < info->res_num; i++) { - size_t final_start, final_end; - size_t common_start, common_end; - - res = &info->res[i]; - if (res->flags != flags) - continue; - - common_start = max((size_t)res->start, start); - common_end = min((size_t)res->end, end); - if (common_start > common_end + 1) - continue; - - final_start = min((size_t)res->start, start); - final_end = max((size_t)res->end, end); - - res->start = final_start; - res->end = final_end; - return; - } - -addit: - - /* need to add that */ - if (info->res_num >= RES_NUM) - return; - - res = &info->res[info->res_num]; - res->name = info->name; - res->flags = flags; - res->start = start; - res->end = end; - res->child = NULL; - info->res_num++; -} - struct pci_hostbridge_probe { u32 bus; u32 slot; @@ -230,7 +123,6 @@ static int __init early_fill_mp_bus_info(void) int j; unsigned bus; unsigned slot; - int found; int node; int link; int def_node; @@ -247,7 +139,7 @@ static int __init early_fill_mp_bus_info(void) if (!early_pci_allowed()) return -1; - found = 0; + found_all_numa_early = 0; for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { u32 id; u16 device; @@ -261,12 +153,12 @@ static int __init early_fill_mp_bus_info(void) device = (id>>16) & 0xffff; if (pci_probes[i].vendor == vendor && pci_probes[i].device == device) { - found = 1; + found_all_numa_early = 1; break; } } - if (!found) + if (!found_all_numa_early) return 0; pci_root_num = 0; @@ -488,7 +380,7 @@ static int __init early_fill_mp_bus_info(void) info = &pci_root_info[i]; res_num = info->res_num; busnum = info->bus_min; - printk(KERN_DEBUG "bus: [%02x,%02x] on node %x link %x\n", + printk(KERN_DEBUG "bus: [%02x, %02x] on node %x link %x\n", info->bus_min, info->bus_max, info->node, info->link); for (j = 0; j < res_num; j++) { res = &info->res[j]; diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c new file mode 100644 index 000000000000..12d54ff3654d --- /dev/null +++ b/arch/x86/pci/bus_numa.c @@ -0,0 +1,102 @@ +#include <linux/init.h> +#include <linux/pci.h> + +#include "bus_numa.h" + +int pci_root_num; +struct pci_root_info pci_root_info[PCI_ROOT_NR]; +int found_all_numa_early; + +void x86_pci_root_bus_res_quirks(struct pci_bus *b) +{ + int i; + int j; + struct pci_root_info *info; + + /* don't go for it if _CRS is used already */ + if (b->resource[0] != &ioport_resource || + b->resource[1] != &iomem_resource) + return; + + if (!pci_root_num) + return; + + /* for amd, if only one root bus, don't need to do anything */ + if (pci_root_num < 2 && found_all_numa_early) + return; + + for (i = 0; i < pci_root_num; i++) { + if (pci_root_info[i].bus_min == b->number) + break; + } + + if (i == pci_root_num) + return; + + printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n", + b->number); + + pci_bus_remove_resources(b); + info = &pci_root_info[i]; + for (j = 0; j < info->res_num; j++) { + struct resource *res; + struct resource *root; + + res = &info->res[j]; + pci_bus_add_resource(b, res, 0); + if (res->flags & IORESOURCE_IO) + root = &ioport_resource; + else + root = &iomem_resource; + insert_resource(root, res); + } +} + +void __devinit update_res(struct pci_root_info *info, size_t start, + size_t end, unsigned long flags, int merge) +{ + int i; + struct resource *res; + + if (start > end) + return; + + if (!merge) + goto addit; + + /* try to merge it with old one */ + for (i = 0; i < info->res_num; i++) { + size_t final_start, final_end; + size_t common_start, common_end; + + res = &info->res[i]; + if (res->flags != flags) + continue; + + common_start = max((size_t)res->start, start); + common_end = min((size_t)res->end, end); + if (common_start > common_end + 1) + continue; + + final_start = min((size_t)res->start, start); + final_end = max((size_t)res->end, end); + + res->start = final_start; + res->end = final_end; + return; + } + +addit: + + /* need to add that */ + if (info->res_num >= RES_NUM) + return; + + res = &info->res[info->res_num]; + res->name = info->name; + res->flags = flags; + res->start = start; + res->end = end; + res->child = NULL; + info->res_num++; +} diff --git a/arch/x86/pci/bus_numa.h b/arch/x86/pci/bus_numa.h new file mode 100644 index 000000000000..731b64ee8d84 --- /dev/null +++ b/arch/x86/pci/bus_numa.h @@ -0,0 +1,26 @@ +#ifdef CONFIG_X86_64 + +/* + * sub bus (transparent) will use entres from 3 to store extra from + * root, so need to make sure we have enough slot there. + */ +#define RES_NUM 16 +struct pci_root_info { + char name[12]; + unsigned int res_num; + struct resource res[RES_NUM]; + int bus_min; + int bus_max; + int node; + int link; +}; + +/* 4 at this time, it may become to 32 */ +#define PCI_ROOT_NR 4 +extern int pci_root_num; +extern struct pci_root_info pci_root_info[PCI_ROOT_NR]; +extern int found_all_numa_early; + +extern void update_res(struct pci_root_info *info, size_t start, + size_t end, unsigned long flags, int merge); +#endif diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 1331fcf26143..3736176acaab 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -410,8 +410,6 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum) return bus; } -extern u8 pci_cache_line_size; - int __init pcibios_init(void) { struct cpuinfo_x86 *c = &boot_cpu_data; @@ -422,15 +420,19 @@ int __init pcibios_init(void) } /* - * Assume PCI cacheline size of 32 bytes for all x86s except K7/K8 - * and P4. It's also good for 386/486s (which actually have 16) + * Set PCI cacheline size to that of the CPU if the CPU has reported it. + * (For older CPUs that don't support cpuid, we se it to 32 bytes + * It's also good for 386/486s (which actually have 16) * as quite a few PCI devices do not support smaller values. */ - pci_cache_line_size = 32 >> 2; - if (c->x86 >= 6 && c->x86_vendor == X86_VENDOR_AMD) - pci_cache_line_size = 64 >> 2; /* K7 & K8 */ - else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL) - pci_cache_line_size = 128 >> 2; /* P4 */ + if (c->x86_clflush_size > 0) { + pci_dfl_cache_line_size = c->x86_clflush_size >> 2; + printk(KERN_DEBUG "PCI: pci_cache_line_size set to %d bytes\n", + pci_dfl_cache_line_size << 2); + } else { + pci_dfl_cache_line_size = 32 >> 2; + printk(KERN_DEBUG "PCI: Unknown cacheline size. Setting to 32 bytes\n"); + } pcibios_resource_survey(); @@ -518,6 +520,9 @@ char * __devinit pcibios_setup(char *str) } else if (!strcmp(str, "use_crs")) { pci_probe |= PCI_USE__CRS; return NULL; + } else if (!strcmp(str, "nocrs")) { + pci_probe |= PCI_ROOT_NO_CRS; + return NULL; } else if (!strcmp(str, "earlydump")) { pci_early_dump_regs = 1; return NULL; diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c index aaf26ae58cd5..d1067d539bee 100644 --- a/arch/x86/pci/early.c +++ b/arch/x86/pci/early.c @@ -12,8 +12,6 @@ u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset) u32 v; outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); v = inl(0xcfc); - if (v != 0xffffffff) - pr_debug("%x reading 4 from %x: %x\n", slot, offset, v); return v; } @@ -22,7 +20,6 @@ u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset) u8 v; outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); v = inb(0xcfc + (offset&3)); - pr_debug("%x reading 1 from %x: %x\n", slot, offset, v); return v; } @@ -31,28 +28,24 @@ u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset) u16 v; outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); v = inw(0xcfc + (offset&2)); - pr_debug("%x reading 2 from %x: %x\n", slot, offset, v); return v; } void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset, u32 val) { - pr_debug("%x writing to %x: %x\n", slot, offset, val); outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); outl(val, 0xcfc); } void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val) { - pr_debug("%x writing to %x: %x\n", slot, offset, val); outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); outb(val, 0xcfc + (offset&3)); } void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val) { - pr_debug("%x writing to %x: %x\n", slot, offset, val); outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); outw(val, 0xcfc + (offset&2)); } diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index b22d13b0c71d..5a8fbf8d4cac 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -60,22 +60,20 @@ skip_isa_ioresource_align(struct pci_dev *dev) { * but we want to try to avoid allocating at 0x2900-0x2bff * which might have be mirrored at 0x0100-0x03ff.. */ -void -pcibios_align_resource(void *data, struct resource *res, +resource_size_t +pcibios_align_resource(void *data, const struct resource *res, resource_size_t size, resource_size_t align) { struct pci_dev *dev = data; + resource_size_t start = res->start; if (res->flags & IORESOURCE_IO) { - resource_size_t start = res->start; - if (skip_isa_ioresource_align(dev)) - return; - if (start & 0x300) { + return start; + if (start & 0x300) start = (start + 0x3ff) & ~0x3ff; - res->start = start; - } } + return start; } EXPORT_SYMBOL(pcibios_align_resource); @@ -129,7 +127,9 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list) continue; if (!r->start || pci_claim_resource(dev, idx) < 0) { - dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx); + dev_info(&dev->dev, + "can't reserve window %pR\n", + r); /* * Something is wrong with the region. * Invalidate the resource to prevent @@ -144,16 +144,29 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list) } } +struct pci_check_idx_range { + int start; + int end; +}; + static void __init pcibios_allocate_resources(int pass) { struct pci_dev *dev = NULL; - int idx, disabled; + int idx, disabled, i; u16 command; struct resource *r; + struct pci_check_idx_range idx_range[] = { + { PCI_STD_RESOURCES, PCI_STD_RESOURCE_END }, +#ifdef CONFIG_PCI_IOV + { PCI_IOV_RESOURCES, PCI_IOV_RESOURCE_END }, +#endif + }; + for_each_pci_dev(dev) { pci_read_config_word(dev, PCI_COMMAND, &command); - for (idx = 0; idx < PCI_ROM_RESOURCE; idx++) { + for (i = 0; i < ARRAY_SIZE(idx_range); i++) + for (idx = idx_range[i].start; idx <= idx_range[i].end; idx++) { r = &dev->resource[idx]; if (r->parent) /* Already allocated */ continue; @@ -164,12 +177,12 @@ static void __init pcibios_allocate_resources(int pass) else disabled = !(command & PCI_COMMAND_MEMORY); if (pass == disabled) { - dev_dbg(&dev->dev, "resource %#08llx-%#08llx (f=%lx, d=%d, p=%d)\n", - (unsigned long long) r->start, - (unsigned long long) r->end, - r->flags, disabled, pass); + dev_dbg(&dev->dev, + "BAR %d: reserving %pr (d=%d, p=%d)\n", + idx, r, disabled, pass); if (pci_claim_resource(dev, idx) < 0) { - dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx); + dev_info(&dev->dev, + "can't reserve %pR\n", r); /* We'll assign a new address later */ r->end -= r->start; r->start = 0; @@ -182,7 +195,7 @@ static void __init pcibios_allocate_resources(int pass) /* Turn the ROM off, leave the resource region, * but keep it unregistered. */ u32 reg; - dev_dbg(&dev->dev, "disabling ROM\n"); + dev_dbg(&dev->dev, "disabling ROM %pR\n", r); r->flags &= ~IORESOURCE_ROM_ENABLE; pci_read_config_dword(dev, dev->rom_base_reg, ®); @@ -282,6 +295,15 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, return -EINVAL; prot = pgprot_val(vma->vm_page_prot); + + /* + * Return error if pat is not enabled and write_combine is requested. + * Caller can followup with UC MINUS request and add a WC mtrr if there + * is a free mtrr slot. + */ + if (!pat_enabled && write_combine) + return -EINVAL; + if (pat_enabled && write_combine) prot |= _PAGE_CACHE_WC; else if (pat_enabled || boot_cpu_data.x86 > 3) diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 0696d506c4ad..b02f6d8ac922 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -590,6 +590,8 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route case PCI_DEVICE_ID_INTEL_ICH10_1: case PCI_DEVICE_ID_INTEL_ICH10_2: case PCI_DEVICE_ID_INTEL_ICH10_3: + case PCI_DEVICE_ID_INTEL_CPT_LPC1: + case PCI_DEVICE_ID_INTEL_CPT_LPC2: r->name = "PIIX/ICH"; r->get = pirq_piix_get; r->set = pirq_piix_set; diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 602c172d3bd5..8f3f9a50b1e0 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -15,48 +15,98 @@ #include <linux/acpi.h> #include <linux/sfi_acpi.h> #include <linux/bitmap.h> -#include <linux/sort.h> +#include <linux/dmi.h> #include <asm/e820.h> #include <asm/pci_x86.h> #include <asm/acpi.h> #define PREFIX "PCI: " -/* aperture is up to 256MB but BIOS may reserve less */ -#define MMCONFIG_APER_MIN (2 * 1024*1024) -#define MMCONFIG_APER_MAX (256 * 1024*1024) - /* Indicate if the mmcfg resources have been placed into the resource table. */ static int __initdata pci_mmcfg_resources_inserted; -static __init int extend_mmcfg(int num) +LIST_HEAD(pci_mmcfg_list); + +static __init void pci_mmconfig_remove(struct pci_mmcfg_region *cfg) { - struct acpi_mcfg_allocation *new; - int new_num = pci_mmcfg_config_num + num; + if (cfg->res.parent) + release_resource(&cfg->res); + list_del(&cfg->list); + kfree(cfg); +} - new = kzalloc(sizeof(pci_mmcfg_config[0]) * new_num, GFP_KERNEL); - if (!new) - return -1; +static __init void free_all_mmcfg(void) +{ + struct pci_mmcfg_region *cfg, *tmp; + + pci_mmcfg_arch_free(); + list_for_each_entry_safe(cfg, tmp, &pci_mmcfg_list, list) + pci_mmconfig_remove(cfg); +} - if (pci_mmcfg_config) { - memcpy(new, pci_mmcfg_config, - sizeof(pci_mmcfg_config[0]) * new_num); - kfree(pci_mmcfg_config); +static __init void list_add_sorted(struct pci_mmcfg_region *new) +{ + struct pci_mmcfg_region *cfg; + + /* keep list sorted by segment and starting bus number */ + list_for_each_entry(cfg, &pci_mmcfg_list, list) { + if (cfg->segment > new->segment || + (cfg->segment == new->segment && + cfg->start_bus >= new->start_bus)) { + list_add_tail(&new->list, &cfg->list); + return; + } } - pci_mmcfg_config = new; + list_add_tail(&new->list, &pci_mmcfg_list); +} - return 0; +static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start, + int end, u64 addr) +{ + struct pci_mmcfg_region *new; + int num_buses; + struct resource *res; + + if (addr == 0) + return NULL; + + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return NULL; + + new->address = addr; + new->segment = segment; + new->start_bus = start; + new->end_bus = end; + + list_add_sorted(new); + + num_buses = end - start + 1; + res = &new->res; + res->start = addr + PCI_MMCFG_BUS_OFFSET(start); + res->end = addr + PCI_MMCFG_BUS_OFFSET(num_buses) - 1; + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + snprintf(new->name, PCI_MMCFG_RESOURCE_NAME_LEN, + "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end); + res->name = new->name; + + printk(KERN_INFO PREFIX "MMCONFIG for domain %04x [bus %02x-%02x] at " + "%pR (base %#lx)\n", segment, start, end, &new->res, + (unsigned long) addr); + + return new; } -static __init void fill_one_mmcfg(u64 addr, int segment, int start, int end) +struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus) { - int i = pci_mmcfg_config_num; + struct pci_mmcfg_region *cfg; - pci_mmcfg_config_num++; - pci_mmcfg_config[i].address = addr; - pci_mmcfg_config[i].pci_segment = segment; - pci_mmcfg_config[i].start_bus_number = start; - pci_mmcfg_config[i].end_bus_number = end; + list_for_each_entry(cfg, &pci_mmcfg_list, list) + if (cfg->segment == segment && + cfg->start_bus <= bus && bus <= cfg->end_bus) + return cfg; + + return NULL; } static const char __init *pci_mmcfg_e7520(void) @@ -68,11 +118,9 @@ static const char __init *pci_mmcfg_e7520(void) if (win == 0x0000 || win == 0xf000) return NULL; - if (extend_mmcfg(1) == -1) + if (pci_mmconfig_add(0, 0, 255, win << 16) == NULL) return NULL; - fill_one_mmcfg(win << 16, 0, 0, 255); - return "Intel Corporation E7520 Memory Controller Hub"; } @@ -114,11 +162,9 @@ static const char __init *pci_mmcfg_intel_945(void) if ((pciexbar & mask) >= 0xf0000000U) return NULL; - if (extend_mmcfg(1) == -1) + if (pci_mmconfig_add(0, 0, (len >> 20) - 1, pciexbar & mask) == NULL) return NULL; - fill_one_mmcfg(pciexbar & mask, 0, 0, (len >> 20) - 1); - return "Intel Corporation 945G/GZ/P/PL Express Memory Controller Hub"; } @@ -127,7 +173,7 @@ static const char __init *pci_mmcfg_amd_fam10h(void) u32 low, high, address; u64 base, msr; int i; - unsigned segnbits = 0, busnbits; + unsigned segnbits = 0, busnbits, end_bus; if (!(pci_probe & PCI_CHECK_ENABLE_AMD_MMCONF)) return NULL; @@ -161,11 +207,13 @@ static const char __init *pci_mmcfg_amd_fam10h(void) busnbits = 8; } - if (extend_mmcfg(1 << segnbits) == -1) - return NULL; - + end_bus = (1 << busnbits) - 1; for (i = 0; i < (1 << segnbits); i++) - fill_one_mmcfg(base + (1<<28) * i, i, 0, (1 << busnbits) - 1); + if (pci_mmconfig_add(i, 0, end_bus, + base + (1<<28) * i) == NULL) { + free_all_mmcfg(); + return NULL; + } return "AMD Family 10h NB"; } @@ -190,7 +238,7 @@ static const char __init *pci_mmcfg_nvidia_mcp55(void) /* * do check if amd fam10h already took over */ - if (!acpi_disabled || pci_mmcfg_config_num || mcp55_checked) + if (!acpi_disabled || !list_empty(&pci_mmcfg_list) || mcp55_checked) return NULL; mcp55_checked = true; @@ -213,16 +261,14 @@ static const char __init *pci_mmcfg_nvidia_mcp55(void) if (!(extcfg & extcfg_enable_mask)) continue; - if (extend_mmcfg(1) == -1) - continue; - size_index = (extcfg & extcfg_size_mask) >> extcfg_size_shift; base = extcfg & extcfg_base_mask[size_index]; /* base could > 4G */ base <<= extcfg_base_lshift; start = (extcfg & extcfg_start_mask) >> extcfg_start_shift; end = start + extcfg_sizebus[size_index] - 1; - fill_one_mmcfg(base, 0, start, end); + if (pci_mmconfig_add(0, start, end, base) == NULL) + continue; mcp55_mmconf_found++; } @@ -253,45 +299,22 @@ static struct pci_mmcfg_hostbridge_probe pci_mmcfg_probes[] __initdata = { 0x0369, pci_mmcfg_nvidia_mcp55 }, }; -static int __init cmp_mmcfg(const void *x1, const void *x2) -{ - const typeof(pci_mmcfg_config[0]) *m1 = x1; - const typeof(pci_mmcfg_config[0]) *m2 = x2; - int start1, start2; - - start1 = m1->start_bus_number; - start2 = m2->start_bus_number; - - return start1 - start2; -} - static void __init pci_mmcfg_check_end_bus_number(void) { - int i; - typeof(pci_mmcfg_config[0]) *cfg, *cfgx; - - /* sort them at first */ - sort(pci_mmcfg_config, pci_mmcfg_config_num, - sizeof(pci_mmcfg_config[0]), cmp_mmcfg, NULL); - - /* last one*/ - if (pci_mmcfg_config_num > 0) { - i = pci_mmcfg_config_num - 1; - cfg = &pci_mmcfg_config[i]; - if (cfg->end_bus_number < cfg->start_bus_number) - cfg->end_bus_number = 255; - } + struct pci_mmcfg_region *cfg, *cfgx; - /* don't overlap please */ - for (i = 0; i < pci_mmcfg_config_num - 1; i++) { - cfg = &pci_mmcfg_config[i]; - cfgx = &pci_mmcfg_config[i+1]; + /* Fixup overlaps */ + list_for_each_entry(cfg, &pci_mmcfg_list, list) { + if (cfg->end_bus < cfg->start_bus) + cfg->end_bus = 255; - if (cfg->end_bus_number < cfg->start_bus_number) - cfg->end_bus_number = 255; + /* Don't access the list head ! */ + if (cfg->list.next == &pci_mmcfg_list) + break; - if (cfg->end_bus_number >= cfgx->start_bus_number) - cfg->end_bus_number = cfgx->start_bus_number - 1; + cfgx = list_entry(cfg->list.next, typeof(*cfg), list); + if (cfg->end_bus >= cfgx->start_bus) + cfg->end_bus = cfgx->start_bus - 1; } } @@ -306,8 +329,7 @@ static int __init pci_mmcfg_check_hostbridge(void) if (!raw_pci_ops) return 0; - pci_mmcfg_config_num = 0; - pci_mmcfg_config = NULL; + free_all_mmcfg(); for (i = 0; i < ARRAY_SIZE(pci_mmcfg_probes); i++) { bus = pci_mmcfg_probes[i].bus; @@ -322,45 +344,22 @@ static int __init pci_mmcfg_check_hostbridge(void) name = pci_mmcfg_probes[i].probe(); if (name) - printk(KERN_INFO "PCI: Found %s with MMCONFIG support.\n", + printk(KERN_INFO PREFIX "%s with MMCONFIG support\n", name); } /* some end_bus_number is crazy, fix it */ pci_mmcfg_check_end_bus_number(); - return pci_mmcfg_config_num != 0; + return !list_empty(&pci_mmcfg_list); } static void __init pci_mmcfg_insert_resources(void) { -#define PCI_MMCFG_RESOURCE_NAME_LEN 24 - int i; - struct resource *res; - char *names; - unsigned num_buses; - - res = kcalloc(PCI_MMCFG_RESOURCE_NAME_LEN + sizeof(*res), - pci_mmcfg_config_num, GFP_KERNEL); - if (!res) { - printk(KERN_ERR "PCI: Unable to allocate MMCONFIG resources\n"); - return; - } + struct pci_mmcfg_region *cfg; - names = (void *)&res[pci_mmcfg_config_num]; - for (i = 0; i < pci_mmcfg_config_num; i++, res++) { - struct acpi_mcfg_allocation *cfg = &pci_mmcfg_config[i]; - num_buses = cfg->end_bus_number - cfg->start_bus_number + 1; - res->name = names; - snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN, - "PCI MMCONFIG %u [%02x-%02x]", cfg->pci_segment, - cfg->start_bus_number, cfg->end_bus_number); - res->start = cfg->address + (cfg->start_bus_number << 20); - res->end = res->start + (num_buses << 20) - 1; - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; - insert_resource(&iomem_resource, res); - names += PCI_MMCFG_RESOURCE_NAME_LEN; - } + list_for_each_entry(cfg, &pci_mmcfg_list, list) + insert_resource(&iomem_resource, &cfg->res); /* Mark that the resources have been inserted. */ pci_mmcfg_resources_inserted = 1; @@ -437,11 +436,12 @@ static int __init is_acpi_reserved(u64 start, u64 end, unsigned not_used) typedef int (*check_reserved_t)(u64 start, u64 end, unsigned type); static int __init is_mmconf_reserved(check_reserved_t is_reserved, - u64 addr, u64 size, int i, - typeof(pci_mmcfg_config[0]) *cfg, int with_e820) + struct pci_mmcfg_region *cfg, int with_e820) { + u64 addr = cfg->res.start; + u64 size = resource_size(&cfg->res); u64 old_size = size; - int valid = 0; + int valid = 0, num_buses; while (!is_reserved(addr, addr + size, E820_RESERVED)) { size >>= 1; @@ -450,19 +450,25 @@ static int __init is_mmconf_reserved(check_reserved_t is_reserved, } if (size >= (16UL<<20) || size == old_size) { - printk(KERN_NOTICE - "PCI: MCFG area at %Lx reserved in %s\n", - addr, with_e820?"E820":"ACPI motherboard resources"); + printk(KERN_INFO PREFIX "MMCONFIG at %pR reserved in %s\n", + &cfg->res, + with_e820 ? "E820" : "ACPI motherboard resources"); valid = 1; if (old_size != size) { - /* update end_bus_number */ - cfg->end_bus_number = cfg->start_bus_number + ((size>>20) - 1); - printk(KERN_NOTICE "PCI: updated MCFG configuration %d: base %lx " - "segment %hu buses %u - %u\n", - i, (unsigned long)cfg->address, cfg->pci_segment, - (unsigned int)cfg->start_bus_number, - (unsigned int)cfg->end_bus_number); + /* update end_bus */ + cfg->end_bus = cfg->start_bus + ((size>>20) - 1); + num_buses = cfg->end_bus - cfg->start_bus + 1; + cfg->res.end = cfg->res.start + + PCI_MMCFG_BUS_OFFSET(num_buses) - 1; + snprintf(cfg->name, PCI_MMCFG_RESOURCE_NAME_LEN, + "PCI MMCONFIG %04x [bus %02x-%02x]", + cfg->segment, cfg->start_bus, cfg->end_bus); + printk(KERN_INFO PREFIX + "MMCONFIG for %04x [bus%02x-%02x] " + "at %pR (base %#lx) (size reduced!)\n", + cfg->segment, cfg->start_bus, cfg->end_bus, + &cfg->res, (unsigned long) cfg->address); } } @@ -471,45 +477,26 @@ static int __init is_mmconf_reserved(check_reserved_t is_reserved, static void __init pci_mmcfg_reject_broken(int early) { - typeof(pci_mmcfg_config[0]) *cfg; - int i; + struct pci_mmcfg_region *cfg; - if ((pci_mmcfg_config_num == 0) || - (pci_mmcfg_config == NULL) || - (pci_mmcfg_config[0].address == 0)) - return; - - for (i = 0; i < pci_mmcfg_config_num; i++) { + list_for_each_entry(cfg, &pci_mmcfg_list, list) { int valid = 0; - u64 addr, size; - - cfg = &pci_mmcfg_config[i]; - addr = cfg->start_bus_number; - addr <<= 20; - addr += cfg->address; - size = cfg->end_bus_number + 1 - cfg->start_bus_number; - size <<= 20; - printk(KERN_NOTICE "PCI: MCFG configuration %d: base %lx " - "segment %hu buses %u - %u\n", - i, (unsigned long)cfg->address, cfg->pci_segment, - (unsigned int)cfg->start_bus_number, - (unsigned int)cfg->end_bus_number); if (!early && !acpi_disabled) - valid = is_mmconf_reserved(is_acpi_reserved, addr, size, i, cfg, 0); + valid = is_mmconf_reserved(is_acpi_reserved, cfg, 0); if (valid) continue; if (!early) - printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %Lx is not" - " reserved in ACPI motherboard resources\n", - cfg->address); + printk(KERN_ERR FW_BUG PREFIX + "MMCONFIG at %pR not reserved in " + "ACPI motherboard resources\n", &cfg->res); /* Don't try to do this check unless configuration type 1 is available. how about type 2 ?*/ if (raw_pci_ops) - valid = is_mmconf_reserved(e820_all_mapped, addr, size, i, cfg, 1); + valid = is_mmconf_reserved(e820_all_mapped, cfg, 1); if (!valid) goto reject; @@ -518,34 +505,41 @@ static void __init pci_mmcfg_reject_broken(int early) return; reject: - printk(KERN_INFO "PCI: Not using MMCONFIG.\n"); - pci_mmcfg_arch_free(); - kfree(pci_mmcfg_config); - pci_mmcfg_config = NULL; - pci_mmcfg_config_num = 0; + printk(KERN_INFO PREFIX "not using MMCONFIG\n"); + free_all_mmcfg(); } static int __initdata known_bridge; -static int acpi_mcfg_64bit_base_addr __initdata = FALSE; +static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg, + struct acpi_mcfg_allocation *cfg) +{ + int year; -/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */ -struct acpi_mcfg_allocation *pci_mmcfg_config; -int pci_mmcfg_config_num; + if (cfg->address < 0xFFFFFFFF) + return 0; -static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg) -{ if (!strcmp(mcfg->header.oem_id, "SGI")) - acpi_mcfg_64bit_base_addr = TRUE; + return 0; - return 0; + if (mcfg->header.revision >= 1) { + if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) && + year >= 2010) + return 0; + } + + printk(KERN_ERR PREFIX "MCFG region for %04x [bus %02x-%02x] at %#llx " + "is above 4GB, ignored\n", cfg->pci_segment, + cfg->start_bus_number, cfg->end_bus_number, cfg->address); + return -EINVAL; } static int __init pci_parse_mcfg(struct acpi_table_header *header) { struct acpi_table_mcfg *mcfg; + struct acpi_mcfg_allocation *cfg_table, *cfg; unsigned long i; - int config_size; + int entries; if (!header) return -EINVAL; @@ -553,38 +547,33 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header) mcfg = (struct acpi_table_mcfg *)header; /* how many config structures do we have */ - pci_mmcfg_config_num = 0; + free_all_mmcfg(); + entries = 0; i = header->length - sizeof(struct acpi_table_mcfg); while (i >= sizeof(struct acpi_mcfg_allocation)) { - ++pci_mmcfg_config_num; + entries++; i -= sizeof(struct acpi_mcfg_allocation); }; - if (pci_mmcfg_config_num == 0) { + if (entries == 0) { printk(KERN_ERR PREFIX "MMCONFIG has no entries\n"); return -ENODEV; } - config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config); - pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL); - if (!pci_mmcfg_config) { - printk(KERN_WARNING PREFIX - "No memory for MCFG config tables\n"); - return -ENOMEM; - } - - memcpy(pci_mmcfg_config, &mcfg[1], config_size); - - acpi_mcfg_oem_check(mcfg); - - for (i = 0; i < pci_mmcfg_config_num; ++i) { - if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) && - !acpi_mcfg_64bit_base_addr) { - printk(KERN_ERR PREFIX - "MMCONFIG not in low 4GB of memory\n"); - kfree(pci_mmcfg_config); - pci_mmcfg_config_num = 0; + cfg_table = (struct acpi_mcfg_allocation *) &mcfg[1]; + for (i = 0; i < entries; i++) { + cfg = &cfg_table[i]; + if (acpi_mcfg_check_entry(mcfg, cfg)) { + free_all_mmcfg(); return -ENODEV; } + + if (pci_mmconfig_add(cfg->pci_segment, cfg->start_bus_number, + cfg->end_bus_number, cfg->address) == NULL) { + printk(KERN_WARNING PREFIX + "no memory for MCFG entries\n"); + free_all_mmcfg(); + return -ENOMEM; + } } return 0; @@ -614,9 +603,7 @@ static void __init __pci_mmcfg_init(int early) pci_mmcfg_reject_broken(early); - if ((pci_mmcfg_config_num == 0) || - (pci_mmcfg_config == NULL) || - (pci_mmcfg_config[0].address == 0)) + if (list_empty(&pci_mmcfg_list)) return; if (pci_mmcfg_arch_init()) @@ -648,9 +635,7 @@ static int __init pci_mmcfg_late_insert_resources(void) */ if ((pci_mmcfg_resources_inserted == 1) || (pci_probe & PCI_PROBE_MMCONF) == 0 || - (pci_mmcfg_config_num == 0) || - (pci_mmcfg_config == NULL) || - (pci_mmcfg_config[0].address == 0)) + list_empty(&pci_mmcfg_list)) return 1; /* diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c index f10a7e94a84c..90d5fd476ed4 100644 --- a/arch/x86/pci/mmconfig_32.c +++ b/arch/x86/pci/mmconfig_32.c @@ -27,18 +27,10 @@ static int mmcfg_last_accessed_cpu; */ static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn) { - struct acpi_mcfg_allocation *cfg; - int cfg_num; - - for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) { - cfg = &pci_mmcfg_config[cfg_num]; - if (cfg->pci_segment == seg && - (cfg->start_bus_number <= bus) && - (cfg->end_bus_number >= bus)) - return cfg->address; - } + struct pci_mmcfg_region *cfg = pci_mmconfig_lookup(seg, bus); - /* Fall back to type 0 */ + if (cfg) + return cfg->address; return 0; } @@ -47,7 +39,7 @@ static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn) */ static void pci_exp_set_dev_base(unsigned int base, int bus, int devfn) { - u32 dev_base = base | (bus << 20) | (devfn << 12); + u32 dev_base = base | PCI_MMCFG_BUS_OFFSET(bus) | (devfn << 12); int cpu = smp_processor_id(); if (dev_base != mmcfg_last_accessed_device || cpu != mmcfg_last_accessed_cpu) { diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c index 94349f8b2f96..e783841bd1d7 100644 --- a/arch/x86/pci/mmconfig_64.c +++ b/arch/x86/pci/mmconfig_64.c @@ -12,38 +12,15 @@ #include <asm/e820.h> #include <asm/pci_x86.h> -/* Static virtual mapping of the MMCONFIG aperture */ -struct mmcfg_virt { - struct acpi_mcfg_allocation *cfg; - char __iomem *virt; -}; -static struct mmcfg_virt *pci_mmcfg_virt; - -static char __iomem *get_virt(unsigned int seg, unsigned bus) -{ - struct acpi_mcfg_allocation *cfg; - int cfg_num; - - for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) { - cfg = pci_mmcfg_virt[cfg_num].cfg; - if (cfg->pci_segment == seg && - (cfg->start_bus_number <= bus) && - (cfg->end_bus_number >= bus)) - return pci_mmcfg_virt[cfg_num].virt; - } - - /* Fall back to type 0 */ - return NULL; -} +#define PREFIX "PCI: " static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn) { - char __iomem *addr; + struct pci_mmcfg_region *cfg = pci_mmconfig_lookup(seg, bus); - addr = get_virt(seg, bus); - if (!addr) - return NULL; - return addr + ((bus << 20) | (devfn << 12)); + if (cfg && cfg->virt) + return cfg->virt + (PCI_MMCFG_BUS_OFFSET(bus) | (devfn << 12)); + return NULL; } static int pci_mmcfg_read(unsigned int seg, unsigned int bus, @@ -109,42 +86,30 @@ static struct pci_raw_ops pci_mmcfg = { .write = pci_mmcfg_write, }; -static void __iomem * __init mcfg_ioremap(struct acpi_mcfg_allocation *cfg) +static void __iomem * __init mcfg_ioremap(struct pci_mmcfg_region *cfg) { void __iomem *addr; u64 start, size; + int num_buses; - start = cfg->start_bus_number; - start <<= 20; - start += cfg->address; - size = cfg->end_bus_number + 1 - cfg->start_bus_number; - size <<= 20; + start = cfg->address + PCI_MMCFG_BUS_OFFSET(cfg->start_bus); + num_buses = cfg->end_bus - cfg->start_bus + 1; + size = PCI_MMCFG_BUS_OFFSET(num_buses); addr = ioremap_nocache(start, size); - if (addr) { - printk(KERN_INFO "PCI: Using MMCONFIG at %Lx - %Lx\n", - start, start + size - 1); - addr -= cfg->start_bus_number << 20; - } + if (addr) + addr -= PCI_MMCFG_BUS_OFFSET(cfg->start_bus); return addr; } int __init pci_mmcfg_arch_init(void) { - int i; - pci_mmcfg_virt = kzalloc(sizeof(*pci_mmcfg_virt) * - pci_mmcfg_config_num, GFP_KERNEL); - if (pci_mmcfg_virt == NULL) { - printk(KERN_ERR "PCI: Can not allocate memory for mmconfig structures\n"); - return 0; - } + struct pci_mmcfg_region *cfg; - for (i = 0; i < pci_mmcfg_config_num; ++i) { - pci_mmcfg_virt[i].cfg = &pci_mmcfg_config[i]; - pci_mmcfg_virt[i].virt = mcfg_ioremap(&pci_mmcfg_config[i]); - if (!pci_mmcfg_virt[i].virt) { - printk(KERN_ERR "PCI: Cannot map mmconfig aperture for " - "segment %d\n", - pci_mmcfg_config[i].pci_segment); + list_for_each_entry(cfg, &pci_mmcfg_list, list) { + cfg->virt = mcfg_ioremap(cfg); + if (!cfg->virt) { + printk(KERN_ERR PREFIX "can't map MMCONFIG at %pR\n", + &cfg->res); pci_mmcfg_arch_free(); return 0; } @@ -155,19 +120,12 @@ int __init pci_mmcfg_arch_init(void) void __init pci_mmcfg_arch_free(void) { - int i; - - if (pci_mmcfg_virt == NULL) - return; + struct pci_mmcfg_region *cfg; - for (i = 0; i < pci_mmcfg_config_num; ++i) { - if (pci_mmcfg_virt[i].virt) { - iounmap(pci_mmcfg_virt[i].virt + (pci_mmcfg_virt[i].cfg->start_bus_number << 20)); - pci_mmcfg_virt[i].virt = NULL; - pci_mmcfg_virt[i].cfg = NULL; + list_for_each_entry(cfg, &pci_mmcfg_list, list) { + if (cfg->virt) { + iounmap(cfg->virt + PCI_MMCFG_BUS_OFFSET(cfg->start_bus)); + cfg->virt = NULL; } } - - kfree(pci_mmcfg_virt); - pci_mmcfg_virt = NULL; } diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c index 8eb295e116f6..8884a1c1ada6 100644 --- a/arch/x86/pci/numaq_32.c +++ b/arch/x86/pci/numaq_32.c @@ -8,9 +8,7 @@ #include <asm/apic.h> #include <asm/mpspec.h> #include <asm/pci_x86.h> - -#define XQUAD_PORTIO_BASE 0xfe400000 -#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */ +#include <asm/numaq.h> #define BUS2QUAD(global) (mp_bus_id_to_node[global]) @@ -18,8 +16,6 @@ #define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local]) -#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port) - #define PCI_CONF1_MQ_ADDRESS(bus, devfn, reg) \ (0x80000000 | (BUS2LOCAL(bus) << 16) | (devfn << 8) | (reg & ~3)) diff --git a/arch/x86/tools/chkobjdump.awk b/arch/x86/tools/chkobjdump.awk index 0d13cd9fdcff..fd1ab80be0de 100644 --- a/arch/x86/tools/chkobjdump.awk +++ b/arch/x86/tools/chkobjdump.awk @@ -8,14 +8,24 @@ BEGIN { od_sver = 19; } -/^GNU/ { - split($4, ver, "."); +/^GNU objdump/ { + verstr = "" + for (i = 3; i <= NF; i++) + if (match($(i), "^[0-9]")) { + verstr = $(i); + break; + } + if (verstr == "") { + printf("Warning: Failed to find objdump version number.\n"); + exit 0; + } + split(verstr, ver, "."); if (ver[1] > od_ver || (ver[1] == od_ver && ver[2] >= od_sver)) { exit 1; } else { printf("Warning: objdump version %s is older than %d.%d\n", - $4, od_ver, od_sver); + verstr, od_ver, od_sver); print("Warning: Skipping posttest."); # Logic is inverted, because we just skip test without error. exit 0; diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk index e34e92a28eb6..eaf11f52fc0b 100644 --- a/arch/x86/tools/gen-insn-attr-x86.awk +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -6,8 +6,6 @@ # Awk implementation sanity check function check_awk_implement() { - if (!match("abc", "[[:lower:]]+")) - return "Your awk doesn't support charactor-class." if (sprintf("%x", 0) != "0") return "Your awk has a printf-format problem." return "" @@ -44,12 +42,12 @@ BEGIN { delete gtable delete atable - opnd_expr = "^[[:alpha:]/]" + opnd_expr = "^[A-Za-z/]" ext_expr = "^\\(" sep_expr = "^\\|$" - group_expr = "^Grp[[:alnum:]]+" + group_expr = "^Grp[0-9A-Za-z]+" - imm_expr = "^[IJAO][[:lower:]]" + imm_expr = "^[IJAO][a-z]" imm_flag["Ib"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" imm_flag["Jb"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" imm_flag["Iw"] = "INAT_MAKE_IMM(INAT_IMM_WORD)" @@ -62,7 +60,7 @@ BEGIN { imm_flag["Ob"] = "INAT_MOFFSET" imm_flag["Ov"] = "INAT_MOFFSET" - modrm_expr = "^([CDEGMNPQRSUVW/][[:lower:]]+|NTA|T[012])" + modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])" force64_expr = "\\([df]64\\)" rex_expr = "^REX(\\.[XRWB]+)*" fpu_expr = "^ESC" # TODO @@ -226,12 +224,12 @@ function add_flags(old,new) { } # convert operands to flags. -function convert_operands(opnd, i,imm,mod) +function convert_operands(count,opnd, i,j,imm,mod) { imm = null mod = null - for (i in opnd) { - i = opnd[i] + for (j = 1; j <= count; j++) { + i = opnd[j] if (match(i, imm_expr) == 1) { if (!imm_flag[i]) semantic_error("Unknown imm opnd: " i) @@ -282,8 +280,8 @@ function convert_operands(opnd, i,imm,mod) # parse one opcode if (match($i, opnd_expr)) { opnd = $i - split($(i++), opnds, ",") - flags = convert_operands(opnds) + count = split($(i++), opnds, ",") + flags = convert_operands(count, opnds) } if (match($i, ext_expr)) ext = $(i++) diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c index d8214dc03fa7..13403fc95a96 100644 --- a/arch/x86/tools/test_get_len.c +++ b/arch/x86/tools/test_get_len.c @@ -43,7 +43,7 @@ static int x86_64; static void usage(void) { fprintf(stderr, "Usage: objdump -d a.out | awk -f distill.awk |" - " %s [-y|-n] [-v] \n", prog); + " %s [-y|-n] [-v]\n", prog); fprintf(stderr, "\t-y 64bit mode\n"); fprintf(stderr, "\t-n 32bit mode\n"); fprintf(stderr, "\t-v verbose mode\n"); @@ -69,7 +69,7 @@ static void dump_field(FILE *fp, const char *name, const char *indent, static void dump_insn(FILE *fp, struct insn *insn) { - fprintf(fp, "Instruction = { \n"); + fprintf(fp, "Instruction = {\n"); dump_field(fp, "prefixes", "\t", &insn->prefixes); dump_field(fp, "rex_prefix", "\t", &insn->rex_prefix); dump_field(fp, "vex_prefix", "\t", &insn->vex_prefix); @@ -113,7 +113,7 @@ int main(int argc, char **argv) char line[BUFSIZE], sym[BUFSIZE] = "<unknown>"; unsigned char insn_buf[16]; struct insn insn; - int insns = 0, c; + int insns = 0; int warnings = 0; parse_args(argc, argv); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c462cea8ef09..36daccb68642 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -27,7 +27,9 @@ #include <linux/page-flags.h> #include <linux/highmem.h> #include <linux/console.h> +#include <linux/pci.h> +#include <xen/xen.h> #include <xen/interface/xen.h> #include <xen/interface/version.h> #include <xen/interface/physdev.h> @@ -138,24 +140,23 @@ static void xen_vcpu_setup(int cpu) */ void xen_vcpu_restore(void) { - if (have_vcpu_info_placement) { - int cpu; + int cpu; - for_each_online_cpu(cpu) { - bool other_cpu = (cpu != smp_processor_id()); + for_each_online_cpu(cpu) { + bool other_cpu = (cpu != smp_processor_id()); - if (other_cpu && - HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL)) - BUG(); + if (other_cpu && + HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL)) + BUG(); - xen_vcpu_setup(cpu); + xen_setup_runstate_info(cpu); - if (other_cpu && - HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL)) - BUG(); - } + if (have_vcpu_info_placement) + xen_vcpu_setup(cpu); - BUG_ON(!have_vcpu_info_placement); + if (other_cpu && + HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL)) + BUG(); } } @@ -1150,9 +1151,13 @@ asmlinkage void __init xen_start_kernel(void) /* keep using Xen gdt for now; no urgent need to change it */ +#ifdef CONFIG_X86_32 pv_info.kernel_rpl = 1; if (xen_feature(XENFEAT_supervisor_mode_kernel)) pv_info.kernel_rpl = 0; +#else + pv_info.kernel_rpl = 0; +#endif /* set the limit of our address space */ xen_reserve_top(); @@ -1176,10 +1181,16 @@ asmlinkage void __init xen_start_kernel(void) add_preferred_console("xenboot", 0, NULL); add_preferred_console("tty", 0, NULL); add_preferred_console("hvc", 0, NULL); + } else { + /* Make sure ACS will be enabled */ + pci_request_acs(); } + xen_raw_console_write("about to get started...\n"); + xen_setup_runstate_info(0); + /* Start the world */ #ifdef CONFIG_X86_32 i386_start_kernel(); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3bf7b1d250ce..bf4cd6bfe959 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -185,7 +185,7 @@ static inline unsigned p2m_index(unsigned long pfn) } /* Build the parallel p2m_top_mfn structures */ -static void __init xen_build_mfn_list_list(void) +void xen_build_mfn_list_list(void) { unsigned pfn, idx; diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 738da0cb0d8b..563d20504988 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -35,10 +35,10 @@ cpumask_var_t xen_cpu_initialized_map; -static DEFINE_PER_CPU(int, resched_irq); -static DEFINE_PER_CPU(int, callfunc_irq); -static DEFINE_PER_CPU(int, callfuncsingle_irq); -static DEFINE_PER_CPU(int, debug_irq) = -1; +static DEFINE_PER_CPU(int, xen_resched_irq); +static DEFINE_PER_CPU(int, xen_callfunc_irq); +static DEFINE_PER_CPU(int, xen_callfuncsingle_irq); +static DEFINE_PER_CPU(int, xen_debug_irq) = -1; static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); @@ -103,7 +103,7 @@ static int xen_smp_intr_init(unsigned int cpu) NULL); if (rc < 0) goto fail; - per_cpu(resched_irq, cpu) = rc; + per_cpu(xen_resched_irq, cpu) = rc; callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu); rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR, @@ -114,7 +114,7 @@ static int xen_smp_intr_init(unsigned int cpu) NULL); if (rc < 0) goto fail; - per_cpu(callfunc_irq, cpu) = rc; + per_cpu(xen_callfunc_irq, cpu) = rc; debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu); rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt, @@ -122,7 +122,7 @@ static int xen_smp_intr_init(unsigned int cpu) debug_name, NULL); if (rc < 0) goto fail; - per_cpu(debug_irq, cpu) = rc; + per_cpu(xen_debug_irq, cpu) = rc; callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu); rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR, @@ -133,19 +133,20 @@ static int xen_smp_intr_init(unsigned int cpu) NULL); if (rc < 0) goto fail; - per_cpu(callfuncsingle_irq, cpu) = rc; + per_cpu(xen_callfuncsingle_irq, cpu) = rc; return 0; fail: - if (per_cpu(resched_irq, cpu) >= 0) - unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); - if (per_cpu(callfunc_irq, cpu) >= 0) - unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); - if (per_cpu(debug_irq, cpu) >= 0) - unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL); - if (per_cpu(callfuncsingle_irq, cpu) >= 0) - unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL); + if (per_cpu(xen_resched_irq, cpu) >= 0) + unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL); + if (per_cpu(xen_callfunc_irq, cpu) >= 0) + unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL); + if (per_cpu(xen_debug_irq, cpu) >= 0) + unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL); + if (per_cpu(xen_callfuncsingle_irq, cpu) >= 0) + unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), + NULL); return rc; } @@ -295,6 +296,7 @@ static int __cpuinit xen_cpu_up(unsigned int cpu) (unsigned long)task_stack_page(idle) - KERNEL_STACK_OFFSET + THREAD_SIZE; #endif + xen_setup_runstate_info(cpu); xen_setup_timer(cpu); xen_init_lock_cpu(cpu); @@ -348,10 +350,10 @@ static void xen_cpu_die(unsigned int cpu) current->state = TASK_UNINTERRUPTIBLE; schedule_timeout(HZ/10); } - unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); - unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); - unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL); - unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL); xen_uninit_lock_cpu(cpu); xen_teardown_timer(cpu); diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 36a5141108df..24ded31b5aec 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -120,14 +120,14 @@ struct xen_spinlock { unsigned short spinners; /* count of waiting cpus */ }; -static int xen_spin_is_locked(struct raw_spinlock *lock) +static int xen_spin_is_locked(struct arch_spinlock *lock) { struct xen_spinlock *xl = (struct xen_spinlock *)lock; return xl->lock != 0; } -static int xen_spin_is_contended(struct raw_spinlock *lock) +static int xen_spin_is_contended(struct arch_spinlock *lock) { struct xen_spinlock *xl = (struct xen_spinlock *)lock; @@ -136,7 +136,7 @@ static int xen_spin_is_contended(struct raw_spinlock *lock) return xl->spinners != 0; } -static int xen_spin_trylock(struct raw_spinlock *lock) +static int xen_spin_trylock(struct arch_spinlock *lock) { struct xen_spinlock *xl = (struct xen_spinlock *)lock; u8 old = 1; @@ -181,7 +181,7 @@ static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock __get_cpu_var(lock_spinners) = prev; } -static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enable) +static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enable) { struct xen_spinlock *xl = (struct xen_spinlock *)lock; struct xen_spinlock *prev; @@ -254,7 +254,7 @@ out: return ret; } -static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable) +static inline void __xen_spin_lock(struct arch_spinlock *lock, bool irq_enable) { struct xen_spinlock *xl = (struct xen_spinlock *)lock; unsigned timeout; @@ -291,12 +291,12 @@ static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable) spin_time_accum_total(start_spin); } -static void xen_spin_lock(struct raw_spinlock *lock) +static void xen_spin_lock(struct arch_spinlock *lock) { __xen_spin_lock(lock, false); } -static void xen_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags) +static void xen_spin_lock_flags(struct arch_spinlock *lock, unsigned long flags) { __xen_spin_lock(lock, !raw_irqs_disabled_flags(flags)); } @@ -317,7 +317,7 @@ static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl) } } -static void xen_spin_unlock(struct raw_spinlock *lock) +static void xen_spin_unlock(struct arch_spinlock *lock) { struct xen_spinlock *xl = (struct xen_spinlock *)lock; diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 95be7b434724..987267f79bf5 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -1,4 +1,5 @@ #include <linux/types.h> +#include <linux/clockchips.h> #include <xen/interface/xen.h> #include <xen/grant_table.h> @@ -27,6 +28,8 @@ void xen_pre_suspend(void) void xen_post_suspend(int suspend_cancelled) { + xen_build_mfn_list_list(); + xen_setup_shared_info(); if (suspend_cancelled) { @@ -44,7 +47,19 @@ void xen_post_suspend(int suspend_cancelled) } +static void xen_vcpu_notify_restore(void *data) +{ + unsigned long reason = (unsigned long)data; + + /* Boot processor notified via generic timekeeping_resume() */ + if ( smp_processor_id() == 0) + return; + + clockevents_notify(reason, NULL); +} + void xen_arch_resume(void) { - /* nothing */ + smp_call_function(xen_vcpu_notify_restore, + (void *)CLOCK_EVT_NOTIFY_RESUME, 1); } diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 0a5aa44299a5..0d3f07cd1b5f 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -31,14 +31,14 @@ #define NS_PER_TICK (1000000000LL / HZ) /* runstate info updated by Xen */ -static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); +static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate); /* snapshots of runstate info */ -static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot); +static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot); /* unused ns of stolen and blocked time */ -static DEFINE_PER_CPU(u64, residual_stolen); -static DEFINE_PER_CPU(u64, residual_blocked); +static DEFINE_PER_CPU(u64, xen_residual_stolen); +static DEFINE_PER_CPU(u64, xen_residual_blocked); /* return an consistent snapshot of 64-bit time/counter value */ static u64 get64(const u64 *p) @@ -79,7 +79,7 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res) BUG_ON(preemptible()); - state = &__get_cpu_var(runstate); + state = &__get_cpu_var(xen_runstate); /* * The runstate info is always updated by the hypervisor on @@ -97,14 +97,14 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res) /* return true when a vcpu could run but has no real cpu to run on */ bool xen_vcpu_stolen(int vcpu) { - return per_cpu(runstate, vcpu).state == RUNSTATE_runnable; + return per_cpu(xen_runstate, vcpu).state == RUNSTATE_runnable; } -static void setup_runstate_info(int cpu) +void xen_setup_runstate_info(int cpu) { struct vcpu_register_runstate_memory_area area; - area.addr.v = &per_cpu(runstate, cpu); + area.addr.v = &per_cpu(xen_runstate, cpu); if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area)) @@ -122,7 +122,7 @@ static void do_stolen_accounting(void) WARN_ON(state.state != RUNSTATE_running); - snap = &__get_cpu_var(runstate_snapshot); + snap = &__get_cpu_var(xen_runstate_snapshot); /* work out how much time the VCPU has not been runn*ing* */ blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked]; @@ -133,24 +133,24 @@ static void do_stolen_accounting(void) /* Add the appropriate number of ticks of stolen time, including any left-overs from last time. */ - stolen = runnable + offline + __get_cpu_var(residual_stolen); + stolen = runnable + offline + __get_cpu_var(xen_residual_stolen); if (stolen < 0) stolen = 0; ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen); - __get_cpu_var(residual_stolen) = stolen; + __get_cpu_var(xen_residual_stolen) = stolen; account_steal_ticks(ticks); /* Add the appropriate number of ticks of blocked time, including any left-overs from last time. */ - blocked += __get_cpu_var(residual_blocked); + blocked += __get_cpu_var(xen_residual_blocked); if (blocked < 0) blocked = 0; ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked); - __get_cpu_var(residual_blocked) = blocked; + __get_cpu_var(xen_residual_blocked) = blocked; account_idle_ticks(ticks); } @@ -434,7 +434,7 @@ void xen_setup_timer(int cpu) name = "<timer kasprintf failed>"; irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt, - IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER, name, NULL); evt = &per_cpu(xen_clock_events, cpu); @@ -442,8 +442,6 @@ void xen_setup_timer(int cpu) evt->cpumask = cpumask_of(cpu); evt->irq = irq; - - setup_runstate_info(cpu); } void xen_teardown_timer(int cpu) @@ -494,6 +492,7 @@ __init void xen_time_init(void) setup_force_cpu_cap(X86_FEATURE_TSC); + xen_setup_runstate_info(cpu); xen_setup_timer(cpu); xen_setup_cpu_clockevents(); } diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 02f496a8dbaa..53adefda4275 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -96,7 +96,7 @@ ENTRY(xen_sysret32) pushq $__USER32_CS pushq %rcx - pushq $VGCF_in_syscall + pushq $0 1: jmp hypercall_iret ENDPATCH(xen_sysret32) RELOC(xen_sysret32, 1b+1) @@ -151,7 +151,7 @@ ENTRY(xen_syscall32_target) ENTRY(xen_sysenter_target) lea 16(%rsp), %rsp /* strip %rcx, %r11 */ mov $-ENOSYS, %rax - pushq $VGCF_in_syscall + pushq $0 jmp hypercall_iret ENDPROC(xen_syscall32_target) ENDPROC(xen_sysenter_target) diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 355fa6b99c9c..f9153a300bce 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -25,6 +25,7 @@ extern struct shared_info *HYPERVISOR_shared_info; void xen_setup_mfn_list_list(void); void xen_setup_shared_info(void); +void xen_build_mfn_list_list(void); void xen_setup_machphys_mapping(void); pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); void xen_ident_map_ISA(void); @@ -41,6 +42,7 @@ void __init xen_build_dynamic_phys_to_machine(void); void xen_init_irq_ops(void); void xen_setup_timer(int cpu); +void xen_setup_runstate_info(int cpu); void xen_teardown_timer(int cpu); cycle_t xen_clocksource_read(void); void xen_setup_cpu_clockevents(void); |