Diffstat (limited to 'arch/x86/kernel')
123 files changed, 5392 insertions, 4838 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 430d5b24af7b..4f2e66e29ecc 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -31,8 +31,8 @@ GCOV_PROFILE_paravirt.o := n obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o -obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o -obj-y += setup.o i8259.o irqinit.o +obj-y += time.o ioport.o ldt.o dumpstack.o +obj-y += setup.o x86_init.o i8259.o irqinit.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o @@ -40,7 +40,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o -obj-y += alternative.o i8253.o pci-nommu.o +obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o obj-y += tsc.o io_delay.o rtc.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o @@ -52,9 +52,11 @@ obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o obj-$(CONFIG_X86_32) += tls.o obj-$(CONFIG_IA32_EMULATION) += tls.o obj-y += step.o +obj-$(CONFIG_INTEL_TXT) += tboot.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += cpu/ obj-y += acpi/ +obj-$(CONFIG_SFI) += sfi.o obj-y += reboot.o obj-$(CONFIG_MCA) += mca_32.o obj-$(CONFIG_X86_MSR) += msr.o @@ -104,6 +106,7 @@ obj-$(CONFIG_SCx200) += scx200.o scx200-y += scx200_32.o obj-$(CONFIG_OLPC) += olpc.o +obj-$(CONFIG_X86_MRST) += mrst.o microcode-y := microcode_core.o microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 8c44c232efcb..59cdfa4686b2 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -48,7 +48,7 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags, * P4, Core and beyond CPUs */ if (c->x86_vendor == X86_VENDOR_INTEL && - (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 14))) + (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 14))) flags->bm_control = 0; } EXPORT_SYMBOL(acpi_processor_power_init_bm_check); diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c index d296f4a195c9..d85d1b2432ba 100644 --- a/arch/x86/kernel/acpi/processor.c +++ b/arch/x86/kernel/acpi/processor.c @@ -79,7 +79,8 @@ void arch_acpi_processor_init_pdc(struct acpi_processor *pr) struct cpuinfo_x86 *c = &cpu_data(pr->id); pr->pdc = NULL; - if (c->x86_vendor == X86_VENDOR_INTEL) + if (c->x86_vendor == X86_VENDOR_INTEL || + c->x86_vendor == X86_VENDOR_CENTAUR) init_intel_pdc(pr, c); return; diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S index 7da00b799cda..060fff8f5c5b 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S +++ b/arch/x86/kernel/acpi/realmode/wakeup.lds.S @@ -57,5 +57,8 @@ SECTIONS *(.note*) } + /* + * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility: + */ . = ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!"); } diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 98f230f6a28d..32fb09102a13 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. 
* Author: Joerg Roedel <joerg.roedel@amd.com> * Leo Duran <leo.duran@amd.com> * @@ -28,6 +28,7 @@ #include <asm/proto.h> #include <asm/iommu.h> #include <asm/gart.h> +#include <asm/amd_iommu_proto.h> #include <asm/amd_iommu_types.h> #include <asm/amd_iommu.h> @@ -56,20 +57,115 @@ struct iommu_cmd { u32 data[4]; }; -static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, - struct unity_map_entry *e); -static struct dma_ops_domain *find_protection_domain(u16 devid); -static u64 *alloc_pte(struct protection_domain *domain, - unsigned long address, int end_lvl, - u64 **pte_page, gfp_t gfp); -static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, - unsigned long start_page, - unsigned int pages); static void reset_iommu_command_buffer(struct amd_iommu *iommu); -static u64 *fetch_pte(struct protection_domain *domain, - unsigned long address, int map_size); static void update_domain(struct protection_domain *domain); +/**************************************************************************** + * + * Helper functions + * + ****************************************************************************/ + +static inline u16 get_device_id(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + return calc_devid(pdev->bus->number, pdev->devfn); +} + +static struct iommu_dev_data *get_dev_data(struct device *dev) +{ + return dev->archdata.iommu; +} + +/* + * In this function the list of preallocated protection domains is traversed to + * find the domain for a specific device + */ +static struct dma_ops_domain *find_protection_domain(u16 devid) +{ + struct dma_ops_domain *entry, *ret = NULL; + unsigned long flags; + u16 alias = amd_iommu_alias_table[devid]; + + if (list_empty(&iommu_pd_list)) + return NULL; + + spin_lock_irqsave(&iommu_pd_list_lock, flags); + + list_for_each_entry(entry, &iommu_pd_list, list) { + if (entry->target_dev == devid || + entry->target_dev == alias) { + ret = entry; + break; + } + } + + spin_unlock_irqrestore(&iommu_pd_list_lock, flags); + + return ret; +} + +/* + * This function checks if the driver got a valid device from the caller to + * avoid dereferencing invalid pointers. + */ +static bool check_device(struct device *dev) +{ + u16 devid; + + if (!dev || !dev->dma_mask) + return false; + + /* No device or no PCI device */ + if (!dev || dev->bus != &pci_bus_type) + return false; + + devid = get_device_id(dev); + + /* Out of our scope? 
*/ + if (devid > amd_iommu_last_bdf) + return false; + + if (amd_iommu_rlookup_table[devid] == NULL) + return false; + + return true; +} + +static int iommu_init_device(struct device *dev) +{ + struct iommu_dev_data *dev_data; + struct pci_dev *pdev; + u16 devid, alias; + + if (dev->archdata.iommu) + return 0; + + dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL); + if (!dev_data) + return -ENOMEM; + + dev_data->dev = dev; + + devid = get_device_id(dev); + alias = amd_iommu_alias_table[devid]; + pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff); + if (pdev) + dev_data->alias = &pdev->dev; + + atomic_set(&dev_data->bind, 0); + + dev->archdata.iommu = dev_data; + + + return 0; +} + +static void iommu_uninit_device(struct device *dev) +{ + kfree(dev->archdata.iommu); +} #ifdef CONFIG_AMD_IOMMU_STATS /* @@ -90,7 +186,6 @@ DECLARE_STATS_COUNTER(alloced_io_mem); DECLARE_STATS_COUNTER(total_map_requests); static struct dentry *stats_dir; -static struct dentry *de_isolate; static struct dentry *de_fflush; static void amd_iommu_stats_add(struct __iommu_counter *cnt) @@ -108,9 +203,6 @@ static void amd_iommu_stats_init(void) if (stats_dir == NULL) return; - de_isolate = debugfs_create_bool("isolation", 0444, stats_dir, - (u32 *)&amd_iommu_isolate); - de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir, (u32 *)&amd_iommu_unmap_flush); @@ -130,12 +222,6 @@ static void amd_iommu_stats_init(void) #endif -/* returns !0 if the IOMMU is caching non-present entries in its TLB */ -static int iommu_has_npcache(struct amd_iommu *iommu) -{ - return iommu->cap & (1UL << IOMMU_CAP_NPCACHE); -} - /**************************************************************************** * * Interrupt handling functions @@ -199,6 +285,7 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt) break; case EVENT_TYPE_ILL_CMD: printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); + iommu->reset_in_progress = true; reset_iommu_command_buffer(iommu); dump_command(address); break; @@ -321,11 +408,8 @@ static void __iommu_wait_for_completion(struct amd_iommu *iommu) status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); - if (unlikely(i == EXIT_LOOP_COUNT)) { - spin_unlock(&iommu->lock); - reset_iommu_command_buffer(iommu); - spin_lock(&iommu->lock); - } + if (unlikely(i == EXIT_LOOP_COUNT)) + iommu->reset_in_progress = true; } /* @@ -372,26 +456,46 @@ static int iommu_completion_wait(struct amd_iommu *iommu) out: spin_unlock_irqrestore(&iommu->lock, flags); + if (iommu->reset_in_progress) + reset_iommu_command_buffer(iommu); + return 0; } +static void iommu_flush_complete(struct protection_domain *domain) +{ + int i; + + for (i = 0; i < amd_iommus_present; ++i) { + if (!domain->dev_iommu[i]) + continue; + + /* + * Devices of this domain are behind this IOMMU + * We need to wait for completion of all commands. 
+ */ + iommu_completion_wait(amd_iommus[i]); + } +} + /* * Command send function for invalidating a device table entry */ -static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) +static int iommu_flush_device(struct device *dev) { + struct amd_iommu *iommu; struct iommu_cmd cmd; - int ret; + u16 devid; - BUG_ON(iommu == NULL); + devid = get_device_id(dev); + iommu = amd_iommu_rlookup_table[devid]; + /* Build command */ memset(&cmd, 0, sizeof(cmd)); CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); cmd.data[0] = devid; - ret = iommu_queue_command(iommu, &cmd); - - return ret; + return iommu_queue_command(iommu, &cmd); } static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, @@ -430,11 +534,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, * It invalidates a single PTE if the range to flush is within a single * page. Otherwise it flushes the whole TLB of the IOMMU. */ -static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, - u64 address, size_t size) +static void __iommu_flush_pages(struct protection_domain *domain, + u64 address, size_t size, int pde) { - int s = 0; - unsigned pages = iommu_num_pages(address, size, PAGE_SIZE); + int s = 0, i; + unsigned long pages = iommu_num_pages(address, size, PAGE_SIZE); address &= PAGE_MASK; @@ -447,142 +551,212 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, s = 1; } - iommu_queue_inv_iommu_pages(iommu, address, domid, 0, s); - return 0; + for (i = 0; i < amd_iommus_present; ++i) { + if (!domain->dev_iommu[i]) + continue; + + /* + * Devices of this domain are behind this IOMMU + * We need a TLB flush + */ + iommu_queue_inv_iommu_pages(amd_iommus[i], address, + domain->id, pde, s); + } + + return; } -/* Flush the whole IO/TLB for a given protection domain */ -static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid) +static void iommu_flush_pages(struct protection_domain *domain, + u64 address, size_t size) { - u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; - - INC_STATS_COUNTER(domain_flush_single); + __iommu_flush_pages(domain, address, size, 0); +} - iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1); +/* Flush the whole IO/TLB for a given protection domain */ +static void iommu_flush_tlb(struct protection_domain *domain) +{ + __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0); } /* Flush the whole IO/TLB for a given protection domain - including PDE */ -static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid) +static void iommu_flush_tlb_pde(struct protection_domain *domain) { - u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; - - INC_STATS_COUNTER(domain_flush_single); - - iommu_queue_inv_iommu_pages(iommu, address, domid, 1, 1); + __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1); } + /* - * This function flushes one domain on one IOMMU + * This function flushes the DTEs for all devices in domain */ -static void flush_domain_on_iommu(struct amd_iommu *iommu, u16 domid) +static void iommu_flush_domain_devices(struct protection_domain *domain) { - struct iommu_cmd cmd; + struct iommu_dev_data *dev_data; unsigned long flags; - __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, - domid, 1, 1); + spin_lock_irqsave(&domain->lock, flags); - spin_lock_irqsave(&iommu->lock, flags); - __iommu_queue_command(iommu, &cmd); - __iommu_completion_wait(iommu); - __iommu_wait_for_completion(iommu); - spin_unlock_irqrestore(&iommu->lock, flags); + list_for_each_entry(dev_data, &domain->dev_list, list) + 
iommu_flush_device(dev_data->dev); + + spin_unlock_irqrestore(&domain->lock, flags); } -static void flush_all_domains_on_iommu(struct amd_iommu *iommu) +static void iommu_flush_all_domain_devices(void) { - int i; + struct protection_domain *domain; + unsigned long flags; - for (i = 1; i < MAX_DOMAIN_ID; ++i) { - if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) - continue; - flush_domain_on_iommu(iommu, i); + spin_lock_irqsave(&amd_iommu_pd_lock, flags); + + list_for_each_entry(domain, &amd_iommu_pd_list, list) { + iommu_flush_domain_devices(domain); + iommu_flush_complete(domain); } + spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); +} + +void amd_iommu_flush_all_devices(void) +{ + iommu_flush_all_domain_devices(); } /* - * This function is used to flush the IO/TLB for a given protection domain - * on every IOMMU in the system + * This function uses heavy locking and may disable irqs for some time. But + * this is no issue because it is only called during resume. */ -static void iommu_flush_domain(u16 domid) +void amd_iommu_flush_all_domains(void) { - struct amd_iommu *iommu; + struct protection_domain *domain; + unsigned long flags; - INC_STATS_COUNTER(domain_flush_all); + spin_lock_irqsave(&amd_iommu_pd_lock, flags); - for_each_iommu(iommu) - flush_domain_on_iommu(iommu, domid); + list_for_each_entry(domain, &amd_iommu_pd_list, list) { + spin_lock(&domain->lock); + iommu_flush_tlb_pde(domain); + iommu_flush_complete(domain); + spin_unlock(&domain->lock); + } + + spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); } -void amd_iommu_flush_all_domains(void) +static void reset_iommu_command_buffer(struct amd_iommu *iommu) { - struct amd_iommu *iommu; + pr_err("AMD-Vi: Resetting IOMMU command buffer\n"); - for_each_iommu(iommu) - flush_all_domains_on_iommu(iommu); + if (iommu->reset_in_progress) + panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n"); + + amd_iommu_reset_cmd_buffer(iommu); + amd_iommu_flush_all_devices(); + amd_iommu_flush_all_domains(); + + iommu->reset_in_progress = false; } -static void flush_all_devices_for_iommu(struct amd_iommu *iommu) +/**************************************************************************** + * + * The functions below are used the create the page table mappings for + * unity mapped regions. + * + ****************************************************************************/ + +/* + * This function is used to add another level to an IO page table. Adding + * another level increases the size of the address space by 9 bits to a size up + * to 64 bits. 
+ */ +static bool increase_address_space(struct protection_domain *domain, + gfp_t gfp) { - int i; + u64 *pte; - for (i = 0; i <= amd_iommu_last_bdf; ++i) { - if (iommu != amd_iommu_rlookup_table[i]) - continue; + if (domain->mode == PAGE_MODE_6_LEVEL) + /* address space already 64 bit large */ + return false; - iommu_queue_inv_dev_entry(iommu, i); - iommu_completion_wait(iommu); - } + pte = (void *)get_zeroed_page(gfp); + if (!pte) + return false; + + *pte = PM_LEVEL_PDE(domain->mode, + virt_to_phys(domain->pt_root)); + domain->pt_root = pte; + domain->mode += 1; + domain->updated = true; + + return true; } -static void flush_devices_by_domain(struct protection_domain *domain) +static u64 *alloc_pte(struct protection_domain *domain, + unsigned long address, + int end_lvl, + u64 **pte_page, + gfp_t gfp) { - struct amd_iommu *iommu; - int i; + u64 *pte, *page; + int level; - for (i = 0; i <= amd_iommu_last_bdf; ++i) { - if ((domain == NULL && amd_iommu_pd_table[i] == NULL) || - (amd_iommu_pd_table[i] != domain)) - continue; + while (address > PM_LEVEL_SIZE(domain->mode)) + increase_address_space(domain, gfp); - iommu = amd_iommu_rlookup_table[i]; - if (!iommu) - continue; + level = domain->mode - 1; + pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; + + while (level > end_lvl) { + if (!IOMMU_PTE_PRESENT(*pte)) { + page = (u64 *)get_zeroed_page(gfp); + if (!page) + return NULL; + *pte = PM_LEVEL_PDE(level, virt_to_phys(page)); + } - iommu_queue_inv_dev_entry(iommu, i); - iommu_completion_wait(iommu); + level -= 1; + + pte = IOMMU_PTE_PAGE(*pte); + + if (pte_page && level == end_lvl) + *pte_page = pte; + + pte = &pte[PM_LEVEL_INDEX(level, address)]; } + + return pte; } -static void reset_iommu_command_buffer(struct amd_iommu *iommu) +/* + * This function checks if there is a PTE for a given dma address. If + * there is one, it returns the pointer to it. + */ +static u64 *fetch_pte(struct protection_domain *domain, + unsigned long address, int map_size) { - pr_err("AMD-Vi: Resetting IOMMU command buffer\n"); + int level; + u64 *pte; - if (iommu->reset_in_progress) - panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n"); + level = domain->mode - 1; + pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; - iommu->reset_in_progress = true; + while (level > map_size) { + if (!IOMMU_PTE_PRESENT(*pte)) + return NULL; - amd_iommu_reset_cmd_buffer(iommu); - flush_all_devices_for_iommu(iommu); - flush_all_domains_on_iommu(iommu); + level -= 1; - iommu->reset_in_progress = false; -} + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[PM_LEVEL_INDEX(level, address)]; -void amd_iommu_flush_all_devices(void) -{ - flush_devices_by_domain(NULL); -} + if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) { + pte = NULL; + break; + } + } -/**************************************************************************** - * - * The functions below are used the create the page table mappings for - * unity mapped regions. - * - ****************************************************************************/ + return pte; +} /* * Generic mapping functions. It maps a physical address into a DMA @@ -654,28 +828,6 @@ static int iommu_for_unity_map(struct amd_iommu *iommu, } /* - * Init the unity mappings for a specific IOMMU in the system - * - * Basically iterates over all unity mapping entries and applies them to - * the default domain DMA of that IOMMU if necessary. 
- */ -static int iommu_init_unity_mappings(struct amd_iommu *iommu) -{ - struct unity_map_entry *entry; - int ret; - - list_for_each_entry(entry, &amd_iommu_unity_map, list) { - if (!iommu_for_unity_map(iommu, entry)) - continue; - ret = dma_ops_unity_map(iommu->default_dom, entry); - if (ret) - return ret; - } - - return 0; -} - -/* * This function actually applies the mapping to the page table of the * dma_ops domain. */ @@ -704,6 +856,28 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, } /* + * Init the unity mappings for a specific IOMMU in the system + * + * Basically iterates over all unity mapping entries and applies them to + * the default domain DMA of that IOMMU if necessary. + */ +static int iommu_init_unity_mappings(struct amd_iommu *iommu) +{ + struct unity_map_entry *entry; + int ret; + + list_for_each_entry(entry, &amd_iommu_unity_map, list) { + if (!iommu_for_unity_map(iommu, entry)) + continue; + ret = dma_ops_unity_map(iommu->default_dom, entry); + if (ret) + return ret; + } + + return 0; +} + +/* * Inits the unity mappings required for a specific device */ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, @@ -740,34 +914,23 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, */ /* - * This function checks if there is a PTE for a given dma address. If - * there is one, it returns the pointer to it. + * Used to reserve address ranges in the aperture (e.g. for exclusion + * ranges. */ -static u64 *fetch_pte(struct protection_domain *domain, - unsigned long address, int map_size) +static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, + unsigned long start_page, + unsigned int pages) { - int level; - u64 *pte; - - level = domain->mode - 1; - pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; - - while (level > map_size) { - if (!IOMMU_PTE_PRESENT(*pte)) - return NULL; - - level -= 1; + unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT; - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[PM_LEVEL_INDEX(level, address)]; + if (start_page + pages > last_page) + pages = last_page - start_page; - if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) { - pte = NULL; - break; - } + for (i = start_page; i < start_page + pages; ++i) { + int index = i / APERTURE_RANGE_PAGES; + int page = i % APERTURE_RANGE_PAGES; + __set_bit(page, dom->aperture[index]->bitmap); } - - return pte; } /* @@ -775,11 +938,11 @@ static u64 *fetch_pte(struct protection_domain *domain, * aperture in case of dma_ops domain allocation or address allocation * failure. 
*/ -static int alloc_new_range(struct amd_iommu *iommu, - struct dma_ops_domain *dma_dom, +static int alloc_new_range(struct dma_ops_domain *dma_dom, bool populate, gfp_t gfp) { int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; + struct amd_iommu *iommu; int i; #ifdef CONFIG_IOMMU_STRESS @@ -819,14 +982,17 @@ static int alloc_new_range(struct amd_iommu *iommu, dma_dom->aperture_size += APERTURE_RANGE_SIZE; /* Intialize the exclusion range if necessary */ - if (iommu->exclusion_start && - iommu->exclusion_start >= dma_dom->aperture[index]->offset && - iommu->exclusion_start < dma_dom->aperture_size) { - unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; - int pages = iommu_num_pages(iommu->exclusion_start, - iommu->exclusion_length, - PAGE_SIZE); - dma_ops_reserve_addresses(dma_dom, startpage, pages); + for_each_iommu(iommu) { + if (iommu->exclusion_start && + iommu->exclusion_start >= dma_dom->aperture[index]->offset + && iommu->exclusion_start < dma_dom->aperture_size) { + unsigned long startpage; + int pages = iommu_num_pages(iommu->exclusion_start, + iommu->exclusion_length, + PAGE_SIZE); + startpage = iommu->exclusion_start >> PAGE_SHIFT; + dma_ops_reserve_addresses(dma_dom, startpage, pages); + } } /* @@ -928,7 +1094,7 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev, } if (unlikely(address == -1)) - address = bad_dma_address; + address = DMA_ERROR_CODE; WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); @@ -973,6 +1139,31 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom, * ****************************************************************************/ +/* + * This function adds a protection domain to the global protection domain list + */ +static void add_domain_to_list(struct protection_domain *domain) +{ + unsigned long flags; + + spin_lock_irqsave(&amd_iommu_pd_lock, flags); + list_add(&domain->list, &amd_iommu_pd_list); + spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); +} + +/* + * This function removes a protection domain to the global + * protection domain list + */ +static void del_domain_from_list(struct protection_domain *domain) +{ + unsigned long flags; + + spin_lock_irqsave(&amd_iommu_pd_lock, flags); + list_del(&domain->list); + spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); +} + static u16 domain_id_alloc(void) { unsigned long flags; @@ -1000,26 +1191,6 @@ static void domain_id_free(int id) write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); } -/* - * Used to reserve address ranges in the aperture (e.g. for exclusion - * ranges. 
- */ -static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, - unsigned long start_page, - unsigned int pages) -{ - unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT; - - if (start_page + pages > last_page) - pages = last_page - start_page; - - for (i = start_page; i < start_page + pages; ++i) { - int index = i / APERTURE_RANGE_PAGES; - int page = i % APERTURE_RANGE_PAGES; - __set_bit(page, dom->aperture[index]->bitmap); - } -} - static void free_pagetable(struct protection_domain *domain) { int i, j; @@ -1061,6 +1232,8 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) if (!dom) return; + del_domain_from_list(&dom->domain); + free_pagetable(&dom->domain); for (i = 0; i < APERTURE_MAX_RANGES; ++i) { @@ -1078,7 +1251,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) * It also intializes the page table and the address allocator data * structures required for the dma_ops interface */ -static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) +static struct dma_ops_domain *dma_ops_domain_alloc(void) { struct dma_ops_domain *dma_dom; @@ -1091,6 +1264,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) dma_dom->domain.id = domain_id_alloc(); if (dma_dom->domain.id == 0) goto free_dma_dom; + INIT_LIST_HEAD(&dma_dom->domain.dev_list); dma_dom->domain.mode = PAGE_MODE_2_LEVEL; dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); dma_dom->domain.flags = PD_DMA_OPS_MASK; @@ -1101,7 +1275,9 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) dma_dom->need_flush = false; dma_dom->target_dev = 0xffff; - if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL)) + add_domain_to_list(&dma_dom->domain); + + if (alloc_new_range(dma_dom, true, GFP_KERNEL)) goto free_dma_dom; /* @@ -1129,22 +1305,6 @@ static bool dma_ops_domain(struct protection_domain *domain) return domain->flags & PD_DMA_OPS_MASK; } -/* - * Find out the protection domain structure for a given PCI device. This - * will give us the pointer to the page table root for example. 
- */ -static struct protection_domain *domain_for_device(u16 devid) -{ - struct protection_domain *dom; - unsigned long flags; - - read_lock_irqsave(&amd_iommu_devtable_lock, flags); - dom = amd_iommu_pd_table[devid]; - read_unlock_irqrestore(&amd_iommu_devtable_lock, flags); - - return dom; -} - static void set_dte_entry(u16 devid, struct protection_domain *domain) { u64 pte_root = virt_to_phys(domain->pt_root); @@ -1156,42 +1316,123 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain) amd_iommu_dev_table[devid].data[2] = domain->id; amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); +} + +static void clear_dte_entry(u16 devid) +{ + /* remove entry from the device table seen by the hardware */ + amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV; + amd_iommu_dev_table[devid].data[1] = 0; + amd_iommu_dev_table[devid].data[2] = 0; - amd_iommu_pd_table[devid] = domain; + amd_iommu_apply_erratum_63(devid); +} + +static void do_attach(struct device *dev, struct protection_domain *domain) +{ + struct iommu_dev_data *dev_data; + struct amd_iommu *iommu; + u16 devid; + + devid = get_device_id(dev); + iommu = amd_iommu_rlookup_table[devid]; + dev_data = get_dev_data(dev); + + /* Update data structures */ + dev_data->domain = domain; + list_add(&dev_data->list, &domain->dev_list); + set_dte_entry(devid, domain); + + /* Do reference counting */ + domain->dev_iommu[iommu->index] += 1; + domain->dev_cnt += 1; + + /* Flush the DTE entry */ + iommu_flush_device(dev); +} + +static void do_detach(struct device *dev) +{ + struct iommu_dev_data *dev_data; + struct amd_iommu *iommu; + u16 devid; + + devid = get_device_id(dev); + iommu = amd_iommu_rlookup_table[devid]; + dev_data = get_dev_data(dev); + + /* decrease reference counters */ + dev_data->domain->dev_iommu[iommu->index] -= 1; + dev_data->domain->dev_cnt -= 1; + + /* Update data structures */ + dev_data->domain = NULL; + list_del(&dev_data->list); + clear_dte_entry(devid); + + /* Flush the DTE entry */ + iommu_flush_device(dev); } /* * If a device is not yet associated with a domain, this function does * assigns it visible for the hardware */ -static void __attach_device(struct amd_iommu *iommu, - struct protection_domain *domain, - u16 devid) +static int __attach_device(struct device *dev, + struct protection_domain *domain) { + struct iommu_dev_data *dev_data, *alias_data; + + dev_data = get_dev_data(dev); + alias_data = get_dev_data(dev_data->alias); + + if (!alias_data) + return -EINVAL; + /* lock domain */ spin_lock(&domain->lock); - /* update DTE entry */ - set_dte_entry(devid, domain); + /* Some sanity checks */ + if (alias_data->domain != NULL && + alias_data->domain != domain) + return -EBUSY; + + if (dev_data->domain != NULL && + dev_data->domain != domain) + return -EBUSY; - domain->dev_cnt += 1; + /* Do real assignment */ + if (dev_data->alias != dev) { + alias_data = get_dev_data(dev_data->alias); + if (alias_data->domain == NULL) + do_attach(dev_data->alias, domain); + + atomic_inc(&alias_data->bind); + } + + if (dev_data->domain == NULL) + do_attach(dev, domain); + + atomic_inc(&dev_data->bind); /* ready */ spin_unlock(&domain->lock); + + return 0; } /* * If a device is not yet associated with a domain, this function does * assigns it visible for the hardware */ -static void attach_device(struct amd_iommu *iommu, - struct protection_domain *domain, - u16 devid) +static int attach_device(struct device *dev, + struct 
protection_domain *domain) { unsigned long flags; + int ret; write_lock_irqsave(&amd_iommu_devtable_lock, flags); - __attach_device(iommu, domain, devid); + ret = __attach_device(dev, domain); write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); /* @@ -1199,96 +1440,125 @@ static void attach_device(struct amd_iommu *iommu, * left the caches in the IOMMU dirty. So we have to flush * here to evict all dirty stuff. */ - iommu_queue_inv_dev_entry(iommu, devid); - iommu_flush_tlb_pde(iommu, domain->id); + iommu_flush_tlb_pde(domain); + + return ret; } /* * Removes a device from a protection domain (unlocked) */ -static void __detach_device(struct protection_domain *domain, u16 devid) +static void __detach_device(struct device *dev) { + struct iommu_dev_data *dev_data = get_dev_data(dev); + struct iommu_dev_data *alias_data; + unsigned long flags; - /* lock domain */ - spin_lock(&domain->lock); + BUG_ON(!dev_data->domain); - /* remove domain from the lookup table */ - amd_iommu_pd_table[devid] = NULL; + spin_lock_irqsave(&dev_data->domain->lock, flags); - /* remove entry from the device table seen by the hardware */ - amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV; - amd_iommu_dev_table[devid].data[1] = 0; - amd_iommu_dev_table[devid].data[2] = 0; + if (dev_data->alias != dev) { + alias_data = get_dev_data(dev_data->alias); + if (atomic_dec_and_test(&alias_data->bind)) + do_detach(dev_data->alias); + } - /* decrease reference counter */ - domain->dev_cnt -= 1; + if (atomic_dec_and_test(&dev_data->bind)) + do_detach(dev); - /* ready */ - spin_unlock(&domain->lock); + spin_unlock_irqrestore(&dev_data->domain->lock, flags); /* * If we run in passthrough mode the device must be assigned to the * passthrough domain if it is detached from any other domain */ - if (iommu_pass_through) { - struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; - __attach_device(iommu, pt_domain, devid); - } + if (iommu_pass_through && dev_data->domain == NULL) + __attach_device(dev, pt_domain); } /* * Removes a device from a protection domain (with devtable_lock held) */ -static void detach_device(struct protection_domain *domain, u16 devid) +static void detach_device(struct device *dev) { unsigned long flags; /* lock device table */ write_lock_irqsave(&amd_iommu_devtable_lock, flags); - __detach_device(domain, devid); + __detach_device(dev); write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); } +/* + * Find out the protection domain structure for a given PCI device. This + * will give us the pointer to the page table root for example. 
+ */ +static struct protection_domain *domain_for_device(struct device *dev) +{ + struct protection_domain *dom; + struct iommu_dev_data *dev_data, *alias_data; + unsigned long flags; + u16 devid, alias; + + devid = get_device_id(dev); + alias = amd_iommu_alias_table[devid]; + dev_data = get_dev_data(dev); + alias_data = get_dev_data(dev_data->alias); + if (!alias_data) + return NULL; + + read_lock_irqsave(&amd_iommu_devtable_lock, flags); + dom = dev_data->domain; + if (dom == NULL && + alias_data->domain != NULL) { + __attach_device(dev, alias_data->domain); + dom = alias_data->domain; + } + + read_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + + return dom; +} + static int device_change_notifier(struct notifier_block *nb, unsigned long action, void *data) { struct device *dev = data; - struct pci_dev *pdev = to_pci_dev(dev); - u16 devid = calc_devid(pdev->bus->number, pdev->devfn); + u16 devid; struct protection_domain *domain; struct dma_ops_domain *dma_domain; struct amd_iommu *iommu; unsigned long flags; - if (devid > amd_iommu_last_bdf) - goto out; - - devid = amd_iommu_alias_table[devid]; - - iommu = amd_iommu_rlookup_table[devid]; - if (iommu == NULL) - goto out; - - domain = domain_for_device(devid); + if (!check_device(dev)) + return 0; - if (domain && !dma_ops_domain(domain)) - WARN_ONCE(1, "AMD IOMMU WARNING: device %s already bound " - "to a non-dma-ops domain\n", dev_name(dev)); + devid = get_device_id(dev); + iommu = amd_iommu_rlookup_table[devid]; switch (action) { case BUS_NOTIFY_UNBOUND_DRIVER: + + domain = domain_for_device(dev); + if (!domain) goto out; if (iommu_pass_through) break; - detach_device(domain, devid); + detach_device(dev); break; case BUS_NOTIFY_ADD_DEVICE: + + iommu_init_device(dev); + + domain = domain_for_device(dev); + /* allocate a protection domain if a device is added */ dma_domain = find_protection_domain(devid); if (dma_domain) goto out; - dma_domain = dma_ops_domain_alloc(iommu); + dma_domain = dma_ops_domain_alloc(); if (!dma_domain) goto out; dma_domain->target_dev = devid; @@ -1298,11 +1568,15 @@ static int device_change_notifier(struct notifier_block *nb, spin_unlock_irqrestore(&iommu_pd_list_lock, flags); break; + case BUS_NOTIFY_DEL_DEVICE: + + iommu_uninit_device(dev); + default: goto out; } - iommu_queue_inv_dev_entry(iommu, devid); + iommu_flush_device(dev); iommu_completion_wait(iommu); out: @@ -1320,106 +1594,46 @@ static struct notifier_block device_nb = { *****************************************************************************/ /* - * This function checks if the driver got a valid device from the caller to - * avoid dereferencing invalid pointers. - */ -static bool check_device(struct device *dev) -{ - if (!dev || !dev->dma_mask) - return false; - - return true; -} - -/* - * In this function the list of preallocated protection domains is traversed to - * find the domain for a specific device - */ -static struct dma_ops_domain *find_protection_domain(u16 devid) -{ - struct dma_ops_domain *entry, *ret = NULL; - unsigned long flags; - - if (list_empty(&iommu_pd_list)) - return NULL; - - spin_lock_irqsave(&iommu_pd_list_lock, flags); - - list_for_each_entry(entry, &iommu_pd_list, list) { - if (entry->target_dev == devid) { - ret = entry; - break; - } - } - - spin_unlock_irqrestore(&iommu_pd_list_lock, flags); - - return ret; -} - -/* * In the dma_ops path we only have the struct device. This function * finds the corresponding IOMMU, the protection domain and the * requestor id for a given device. 
* If the device is not yet associated with a domain this is also done * in this function. */ -static int get_device_resources(struct device *dev, - struct amd_iommu **iommu, - struct protection_domain **domain, - u16 *bdf) +static struct protection_domain *get_domain(struct device *dev) { + struct protection_domain *domain; struct dma_ops_domain *dma_dom; - struct pci_dev *pcidev; - u16 _bdf; - - *iommu = NULL; - *domain = NULL; - *bdf = 0xffff; - - if (dev->bus != &pci_bus_type) - return 0; + u16 devid = get_device_id(dev); - pcidev = to_pci_dev(dev); - _bdf = calc_devid(pcidev->bus->number, pcidev->devfn); - - /* device not translated by any IOMMU in the system? */ - if (_bdf > amd_iommu_last_bdf) - return 0; + if (!check_device(dev)) + return ERR_PTR(-EINVAL); - *bdf = amd_iommu_alias_table[_bdf]; + domain = domain_for_device(dev); + if (domain != NULL && !dma_ops_domain(domain)) + return ERR_PTR(-EBUSY); - *iommu = amd_iommu_rlookup_table[*bdf]; - if (*iommu == NULL) - return 0; - *domain = domain_for_device(*bdf); - if (*domain == NULL) { - dma_dom = find_protection_domain(*bdf); - if (!dma_dom) - dma_dom = (*iommu)->default_dom; - *domain = &dma_dom->domain; - attach_device(*iommu, *domain, *bdf); - DUMP_printk("Using protection domain %d for device %s\n", - (*domain)->id, dev_name(dev)); - } + if (domain != NULL) + return domain; - if (domain_for_device(_bdf) == NULL) - attach_device(*iommu, *domain, _bdf); + /* Device not bount yet - bind it */ + dma_dom = find_protection_domain(devid); + if (!dma_dom) + dma_dom = amd_iommu_rlookup_table[devid]->default_dom; + attach_device(dev, &dma_dom->domain); + DUMP_printk("Using protection domain %d for device %s\n", + dma_dom->domain.id, dev_name(dev)); - return 1; + return &dma_dom->domain; } static void update_device_table(struct protection_domain *domain) { - unsigned long flags; - int i; + struct iommu_dev_data *dev_data; - for (i = 0; i <= amd_iommu_last_bdf; ++i) { - if (amd_iommu_pd_table[i] != domain) - continue; - write_lock_irqsave(&amd_iommu_devtable_lock, flags); - set_dte_entry(i, domain); - write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + list_for_each_entry(dev_data, &domain->dev_list, list) { + u16 devid = get_device_id(dev_data->dev); + set_dte_entry(devid, domain); } } @@ -1429,76 +1643,13 @@ static void update_domain(struct protection_domain *domain) return; update_device_table(domain); - flush_devices_by_domain(domain); - iommu_flush_domain(domain->id); + iommu_flush_domain_devices(domain); + iommu_flush_tlb_pde(domain); domain->updated = false; } /* - * This function is used to add another level to an IO page table. Adding - * another level increases the size of the address space by 9 bits to a size up - * to 64 bits. 
- */ -static bool increase_address_space(struct protection_domain *domain, - gfp_t gfp) -{ - u64 *pte; - - if (domain->mode == PAGE_MODE_6_LEVEL) - /* address space already 64 bit large */ - return false; - - pte = (void *)get_zeroed_page(gfp); - if (!pte) - return false; - - *pte = PM_LEVEL_PDE(domain->mode, - virt_to_phys(domain->pt_root)); - domain->pt_root = pte; - domain->mode += 1; - domain->updated = true; - - return true; -} - -static u64 *alloc_pte(struct protection_domain *domain, - unsigned long address, - int end_lvl, - u64 **pte_page, - gfp_t gfp) -{ - u64 *pte, *page; - int level; - - while (address > PM_LEVEL_SIZE(domain->mode)) - increase_address_space(domain, gfp); - - level = domain->mode - 1; - pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; - - while (level > end_lvl) { - if (!IOMMU_PTE_PRESENT(*pte)) { - page = (u64 *)get_zeroed_page(gfp); - if (!page) - return NULL; - *pte = PM_LEVEL_PDE(level, virt_to_phys(page)); - } - - level -= 1; - - pte = IOMMU_PTE_PAGE(*pte); - - if (pte_page && level == end_lvl) - *pte_page = pte; - - pte = &pte[PM_LEVEL_INDEX(level, address)]; - } - - return pte; -} - -/* * This function fetches the PTE for a given address in the aperture */ static u64* dma_ops_get_pte(struct dma_ops_domain *dom, @@ -1528,8 +1679,7 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom, * This is the generic map function. It maps one 4kb page at paddr to * the given address in the DMA address space for the domain. */ -static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, - struct dma_ops_domain *dom, +static dma_addr_t dma_ops_domain_map(struct dma_ops_domain *dom, unsigned long address, phys_addr_t paddr, int direction) @@ -1542,7 +1692,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, pte = dma_ops_get_pte(dom, address); if (!pte) - return bad_dma_address; + return DMA_ERROR_CODE; __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; @@ -1563,8 +1713,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, /* * The generic unmapping function for on page in the DMA address space. */ -static void dma_ops_domain_unmap(struct amd_iommu *iommu, - struct dma_ops_domain *dom, +static void dma_ops_domain_unmap(struct dma_ops_domain *dom, unsigned long address) { struct aperture_range *aperture; @@ -1595,7 +1744,6 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu, * Must be called with the domain lock held. 
*/ static dma_addr_t __map_single(struct device *dev, - struct amd_iommu *iommu, struct dma_ops_domain *dma_dom, phys_addr_t paddr, size_t size, @@ -1623,7 +1771,7 @@ static dma_addr_t __map_single(struct device *dev, retry: address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, dma_mask); - if (unlikely(address == bad_dma_address)) { + if (unlikely(address == DMA_ERROR_CODE)) { /* * setting next_address here will let the address * allocator only scan the new allocated range in the @@ -1631,7 +1779,7 @@ retry: */ dma_dom->next_address = dma_dom->aperture_size; - if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC)) + if (alloc_new_range(dma_dom, false, GFP_ATOMIC)) goto out; /* @@ -1643,8 +1791,8 @@ retry: start = address; for (i = 0; i < pages; ++i) { - ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); - if (ret == bad_dma_address) + ret = dma_ops_domain_map(dma_dom, start, paddr, dir); + if (ret == DMA_ERROR_CODE) goto out_unmap; paddr += PAGE_SIZE; @@ -1655,10 +1803,10 @@ retry: ADD_STATS_COUNTER(alloced_io_mem, size); if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { - iommu_flush_tlb(iommu, dma_dom->domain.id); + iommu_flush_tlb(&dma_dom->domain); dma_dom->need_flush = false; - } else if (unlikely(iommu_has_npcache(iommu))) - iommu_flush_pages(iommu, dma_dom->domain.id, address, size); + } else if (unlikely(amd_iommu_np_cache)) + iommu_flush_pages(&dma_dom->domain, address, size); out: return address; @@ -1667,20 +1815,19 @@ out_unmap: for (--i; i >= 0; --i) { start -= PAGE_SIZE; - dma_ops_domain_unmap(iommu, dma_dom, start); + dma_ops_domain_unmap(dma_dom, start); } dma_ops_free_addresses(dma_dom, address, pages); - return bad_dma_address; + return DMA_ERROR_CODE; } /* * Does the reverse of the __map_single function. 
Must be called with * the domain lock held too */ -static void __unmap_single(struct amd_iommu *iommu, - struct dma_ops_domain *dma_dom, +static void __unmap_single(struct dma_ops_domain *dma_dom, dma_addr_t dma_addr, size_t size, int dir) @@ -1688,7 +1835,7 @@ static void __unmap_single(struct amd_iommu *iommu, dma_addr_t i, start; unsigned int pages; - if ((dma_addr == bad_dma_address) || + if ((dma_addr == DMA_ERROR_CODE) || (dma_addr + size > dma_dom->aperture_size)) return; @@ -1697,7 +1844,7 @@ static void __unmap_single(struct amd_iommu *iommu, start = dma_addr; for (i = 0; i < pages; ++i) { - dma_ops_domain_unmap(iommu, dma_dom, start); + dma_ops_domain_unmap(dma_dom, start); start += PAGE_SIZE; } @@ -1706,7 +1853,7 @@ static void __unmap_single(struct amd_iommu *iommu, dma_ops_free_addresses(dma_dom, dma_addr, pages); if (amd_iommu_unmap_flush || dma_dom->need_flush) { - iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size); + iommu_flush_pages(&dma_dom->domain, dma_addr, size); dma_dom->need_flush = false; } } @@ -1720,36 +1867,29 @@ static dma_addr_t map_page(struct device *dev, struct page *page, struct dma_attrs *attrs) { unsigned long flags; - struct amd_iommu *iommu; struct protection_domain *domain; - u16 devid; dma_addr_t addr; u64 dma_mask; phys_addr_t paddr = page_to_phys(page) + offset; INC_STATS_COUNTER(cnt_map_single); - if (!check_device(dev)) - return bad_dma_address; - - dma_mask = *dev->dma_mask; - - get_device_resources(dev, &iommu, &domain, &devid); - - if (iommu == NULL || domain == NULL) - /* device not handled by any AMD IOMMU */ + domain = get_domain(dev); + if (PTR_ERR(domain) == -EINVAL) return (dma_addr_t)paddr; + else if (IS_ERR(domain)) + return DMA_ERROR_CODE; - if (!dma_ops_domain(domain)) - return bad_dma_address; + dma_mask = *dev->dma_mask; spin_lock_irqsave(&domain->lock, flags); - addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false, + + addr = __map_single(dev, domain->priv, paddr, size, dir, false, dma_mask); - if (addr == bad_dma_address) + if (addr == DMA_ERROR_CODE) goto out; - iommu_completion_wait(iommu); + iommu_flush_complete(domain); out: spin_unlock_irqrestore(&domain->lock, flags); @@ -1764,25 +1904,19 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir, struct dma_attrs *attrs) { unsigned long flags; - struct amd_iommu *iommu; struct protection_domain *domain; - u16 devid; INC_STATS_COUNTER(cnt_unmap_single); - if (!check_device(dev) || - !get_device_resources(dev, &iommu, &domain, &devid)) - /* device not handled by any AMD IOMMU */ - return; - - if (!dma_ops_domain(domain)) + domain = get_domain(dev); + if (IS_ERR(domain)) return; spin_lock_irqsave(&domain->lock, flags); - __unmap_single(iommu, domain->priv, dma_addr, size, dir); + __unmap_single(domain->priv, dma_addr, size, dir); - iommu_completion_wait(iommu); + iommu_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); } @@ -1814,9 +1948,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, struct dma_attrs *attrs) { unsigned long flags; - struct amd_iommu *iommu; struct protection_domain *domain; - u16 devid; int i; struct scatterlist *s; phys_addr_t paddr; @@ -1825,25 +1957,20 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, INC_STATS_COUNTER(cnt_map_sg); - if (!check_device(dev)) + domain = get_domain(dev); + if (PTR_ERR(domain) == -EINVAL) + return map_sg_no_iommu(dev, sglist, nelems, dir); + else if (IS_ERR(domain)) return 0; dma_mask = 
*dev->dma_mask; - get_device_resources(dev, &iommu, &domain, &devid); - - if (!iommu || !domain) - return map_sg_no_iommu(dev, sglist, nelems, dir); - - if (!dma_ops_domain(domain)) - return 0; - spin_lock_irqsave(&domain->lock, flags); for_each_sg(sglist, s, nelems, i) { paddr = sg_phys(s); - s->dma_address = __map_single(dev, iommu, domain->priv, + s->dma_address = __map_single(dev, domain->priv, paddr, s->length, dir, false, dma_mask); @@ -1854,7 +1981,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, goto unmap; } - iommu_completion_wait(iommu); + iommu_flush_complete(domain); out: spin_unlock_irqrestore(&domain->lock, flags); @@ -1863,7 +1990,7 @@ out: unmap: for_each_sg(sglist, s, mapped_elems, i) { if (s->dma_address) - __unmap_single(iommu, domain->priv, s->dma_address, + __unmap_single(domain->priv, s->dma_address, s->dma_length, dir); s->dma_address = s->dma_length = 0; } @@ -1882,30 +2009,25 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, struct dma_attrs *attrs) { unsigned long flags; - struct amd_iommu *iommu; struct protection_domain *domain; struct scatterlist *s; - u16 devid; int i; INC_STATS_COUNTER(cnt_unmap_sg); - if (!check_device(dev) || - !get_device_resources(dev, &iommu, &domain, &devid)) - return; - - if (!dma_ops_domain(domain)) + domain = get_domain(dev); + if (IS_ERR(domain)) return; spin_lock_irqsave(&domain->lock, flags); for_each_sg(sglist, s, nelems, i) { - __unmap_single(iommu, domain->priv, s->dma_address, + __unmap_single(domain->priv, s->dma_address, s->dma_length, dir); s->dma_address = s->dma_length = 0; } - iommu_completion_wait(iommu); + iommu_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); } @@ -1918,49 +2040,44 @@ static void *alloc_coherent(struct device *dev, size_t size, { unsigned long flags; void *virt_addr; - struct amd_iommu *iommu; struct protection_domain *domain; - u16 devid; phys_addr_t paddr; u64 dma_mask = dev->coherent_dma_mask; INC_STATS_COUNTER(cnt_alloc_coherent); - if (!check_device(dev)) + domain = get_domain(dev); + if (PTR_ERR(domain) == -EINVAL) { + virt_addr = (void *)__get_free_pages(flag, get_order(size)); + *dma_addr = __pa(virt_addr); + return virt_addr; + } else if (IS_ERR(domain)) return NULL; - if (!get_device_resources(dev, &iommu, &domain, &devid)) - flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + dma_mask = dev->coherent_dma_mask; + flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + flag |= __GFP_ZERO; - flag |= __GFP_ZERO; virt_addr = (void *)__get_free_pages(flag, get_order(size)); if (!virt_addr) return NULL; paddr = virt_to_phys(virt_addr); - if (!iommu || !domain) { - *dma_addr = (dma_addr_t)paddr; - return virt_addr; - } - - if (!dma_ops_domain(domain)) - goto out_free; - if (!dma_mask) dma_mask = *dev->dma_mask; spin_lock_irqsave(&domain->lock, flags); - *dma_addr = __map_single(dev, iommu, domain->priv, paddr, + *dma_addr = __map_single(dev, domain->priv, paddr, size, DMA_BIDIRECTIONAL, true, dma_mask); - if (*dma_addr == bad_dma_address) { + if (*dma_addr == DMA_ERROR_CODE) { spin_unlock_irqrestore(&domain->lock, flags); goto out_free; } - iommu_completion_wait(iommu); + iommu_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); @@ -1980,28 +2097,19 @@ static void free_coherent(struct device *dev, size_t size, void *virt_addr, dma_addr_t dma_addr) { unsigned long flags; - struct amd_iommu *iommu; struct protection_domain *domain; - u16 devid; INC_STATS_COUNTER(cnt_free_coherent); - if (!check_device(dev)) - return; 
- - get_device_resources(dev, &iommu, &domain, &devid); - - if (!iommu || !domain) - goto free_mem; - - if (!dma_ops_domain(domain)) + domain = get_domain(dev); + if (IS_ERR(domain)) goto free_mem; spin_lock_irqsave(&domain->lock, flags); - __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); + __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); - iommu_completion_wait(iommu); + iommu_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); @@ -2015,22 +2123,7 @@ free_mem: */ static int amd_iommu_dma_supported(struct device *dev, u64 mask) { - u16 bdf; - struct pci_dev *pcidev; - - /* No device or no PCI device */ - if (!dev || dev->bus != &pci_bus_type) - return 0; - - pcidev = to_pci_dev(dev); - - bdf = calc_devid(pcidev->bus->number, pcidev->devfn); - - /* Out of our scope? */ - if (bdf > amd_iommu_last_bdf) - return 0; - - return 1; + return check_device(dev); } /* @@ -2044,25 +2137,30 @@ static void prealloc_protection_domains(void) { struct pci_dev *dev = NULL; struct dma_ops_domain *dma_dom; - struct amd_iommu *iommu; u16 devid; while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - devid = calc_devid(dev->bus->number, dev->devfn); - if (devid > amd_iommu_last_bdf) - continue; - devid = amd_iommu_alias_table[devid]; - if (domain_for_device(devid)) + + /* Do we handle this device? */ + if (!check_device(&dev->dev)) continue; - iommu = amd_iommu_rlookup_table[devid]; - if (!iommu) + + iommu_init_device(&dev->dev); + + /* Is there already any domain for it? */ + if (domain_for_device(&dev->dev)) continue; - dma_dom = dma_ops_domain_alloc(iommu); + + devid = get_device_id(&dev->dev); + + dma_dom = dma_ops_domain_alloc(); if (!dma_dom) continue; init_unity_mappings_for_device(dma_dom, devid); dma_dom->target_dev = devid; + attach_device(&dev->dev, &dma_dom->domain); + list_add_tail(&dma_dom->list, &iommu_pd_list); } } @@ -2091,7 +2189,7 @@ int __init amd_iommu_init_dma_ops(void) * protection domain will be assigned to the default one. */ for_each_iommu(iommu) { - iommu->default_dom = dma_ops_domain_alloc(iommu); + iommu->default_dom = dma_ops_domain_alloc(); if (iommu->default_dom == NULL) return -ENOMEM; iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; @@ -2101,15 +2199,12 @@ int __init amd_iommu_init_dma_ops(void) } /* - * If device isolation is enabled, pre-allocate the protection - * domains for each device. + * Pre-allocate the protection domains for each device. 
*/ - if (amd_iommu_isolate) - prealloc_protection_domains(); + prealloc_protection_domains(); iommu_detected = 1; - force_iommu = 1; - bad_dma_address = 0; + swiotlb = 0; #ifdef CONFIG_GART_IOMMU gart_iommu_aperture_disabled = 1; gart_iommu_aperture = 0; @@ -2148,14 +2243,17 @@ free_domains: static void cleanup_domain(struct protection_domain *domain) { + struct iommu_dev_data *dev_data, *next; unsigned long flags; - u16 devid; write_lock_irqsave(&amd_iommu_devtable_lock, flags); - for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) - if (amd_iommu_pd_table[devid] == domain) - __detach_device(domain, devid); + list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) { + struct device *dev = dev_data->dev; + + do_detach(dev); + atomic_set(&dev_data->bind, 0); + } write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); } @@ -2165,6 +2263,8 @@ static void protection_domain_free(struct protection_domain *domain) if (!domain) return; + del_domain_from_list(domain); + if (domain->id) domain_id_free(domain->id); @@ -2183,6 +2283,9 @@ static struct protection_domain *protection_domain_alloc(void) domain->id = domain_id_alloc(); if (!domain->id) goto out_err; + INIT_LIST_HEAD(&domain->dev_list); + + add_domain_to_list(domain); return domain; @@ -2239,26 +2342,23 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom) static void amd_iommu_detach_device(struct iommu_domain *dom, struct device *dev) { - struct protection_domain *domain = dom->priv; + struct iommu_dev_data *dev_data = dev->archdata.iommu; struct amd_iommu *iommu; - struct pci_dev *pdev; u16 devid; - if (dev->bus != &pci_bus_type) + if (!check_device(dev)) return; - pdev = to_pci_dev(dev); - - devid = calc_devid(pdev->bus->number, pdev->devfn); + devid = get_device_id(dev); - if (devid > 0) - detach_device(domain, devid); + if (dev_data->domain != NULL) + detach_device(dev); iommu = amd_iommu_rlookup_table[devid]; if (!iommu) return; - iommu_queue_inv_dev_entry(iommu, devid); + iommu_flush_device(dev); iommu_completion_wait(iommu); } @@ -2266,35 +2366,30 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev) { struct protection_domain *domain = dom->priv; - struct protection_domain *old_domain; + struct iommu_dev_data *dev_data; struct amd_iommu *iommu; - struct pci_dev *pdev; + int ret; u16 devid; - if (dev->bus != &pci_bus_type) + if (!check_device(dev)) return -EINVAL; - pdev = to_pci_dev(dev); + dev_data = dev->archdata.iommu; - devid = calc_devid(pdev->bus->number, pdev->devfn); - - if (devid >= amd_iommu_last_bdf || - devid != amd_iommu_alias_table[devid]) - return -EINVAL; + devid = get_device_id(dev); iommu = amd_iommu_rlookup_table[devid]; if (!iommu) return -EINVAL; - old_domain = domain_for_device(devid); - if (old_domain) - detach_device(old_domain, devid); + if (dev_data->domain) + detach_device(dev); - attach_device(iommu, domain, devid); + ret = attach_device(dev, domain); iommu_completion_wait(iommu); - return 0; + return ret; } static int amd_iommu_map_range(struct iommu_domain *dom, @@ -2340,7 +2435,7 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom, iova += PAGE_SIZE; } - iommu_flush_domain(domain->id); + iommu_flush_tlb_pde(domain); } static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, @@ -2391,8 +2486,9 @@ static struct iommu_ops amd_iommu_ops = { int __init amd_iommu_init_passthrough(void) { + struct amd_iommu *iommu; struct pci_dev *dev = NULL; - u16 devid, devid2; + u16 devid; /* allocate passthroug domain */ pt_domain = 
protection_domain_alloc(); @@ -2402,20 +2498,17 @@ int __init amd_iommu_init_passthrough(void) pt_domain->mode |= PAGE_MODE_NONE; while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - struct amd_iommu *iommu; - devid = calc_devid(dev->bus->number, dev->devfn); - if (devid > amd_iommu_last_bdf) + if (!check_device(&dev->dev)) continue; - devid2 = amd_iommu_alias_table[devid]; + devid = get_device_id(&dev->dev); - iommu = amd_iommu_rlookup_table[devid2]; + iommu = amd_iommu_rlookup_table[devid]; if (!iommu) continue; - __attach_device(iommu, pt_domain, devid); - __attach_device(iommu, pt_domain, devid2); + attach_device(&dev->dev, pt_domain); } pr_info("AMD-Vi: Initialized for Passthrough Mode\n"); diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index b4b61d462dcc..7ffc39965233 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. * Author: Joerg Roedel <joerg.roedel@amd.com> * Leo Duran <leo.duran@amd.com> * @@ -25,10 +25,12 @@ #include <linux/interrupt.h> #include <linux/msi.h> #include <asm/pci-direct.h> +#include <asm/amd_iommu_proto.h> #include <asm/amd_iommu_types.h> #include <asm/amd_iommu.h> #include <asm/iommu.h> #include <asm/gart.h> +#include <asm/x86_init.h> /* * definitions for the ACPI scanning code @@ -123,18 +125,24 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have to handle */ LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings we find in ACPI */ -#ifdef CONFIG_IOMMU_STRESS -bool amd_iommu_isolate = false; -#else -bool amd_iommu_isolate = true; /* if true, device isolation is - enabled */ -#endif - bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the system */ +/* Array to assign indices to IOMMUs*/ +struct amd_iommu *amd_iommus[MAX_IOMMUS]; +int amd_iommus_present; + +/* IOMMUs have a non-present cache? */ +bool amd_iommu_np_cache __read_mostly; + +/* + * List of protection domains - used during resume + */ +LIST_HEAD(amd_iommu_pd_list); +spinlock_t amd_iommu_pd_lock; + /* * Pointer to the device table which is shared by all AMD IOMMUs * it is indexed by the PCI device id or the HT unit id and contains @@ -157,12 +165,6 @@ u16 *amd_iommu_alias_table; struct amd_iommu **amd_iommu_rlookup_table; /* - * The pd table (protection domain table) is used to find the protection domain - * data structure a device belongs to. Indexed with the PCI device id too. - */ -struct protection_domain **amd_iommu_pd_table; - -/* * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap * to know which ones are already in use. 
*/ @@ -240,7 +242,7 @@ static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit) writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); } -static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) +static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit) { u32 ctrl; @@ -519,6 +521,26 @@ static void set_dev_entry_bit(u16 devid, u8 bit) amd_iommu_dev_table[devid].data[i] |= (1 << _bit); } +static int get_dev_entry_bit(u16 devid, u8 bit) +{ + int i = (bit >> 5) & 0x07; + int _bit = bit & 0x1f; + + return (amd_iommu_dev_table[devid].data[i] & (1 << _bit)) >> _bit; +} + + +void amd_iommu_apply_erratum_63(u16 devid) +{ + int sysmgt; + + sysmgt = get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1) | + (get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2) << 1); + + if (sysmgt == 0x01) + set_dev_entry_bit(devid, DEV_ENTRY_IW); +} + /* Writes the specific IOMMU for a device into the rlookup table */ static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) { @@ -547,6 +569,8 @@ static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu, if (flags & ACPI_DEVFLAG_LINT1) set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS); + amd_iommu_apply_erratum_63(devid); + set_iommu_for_device(iommu, devid); } @@ -816,7 +840,18 @@ static void __init free_iommu_all(void) static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) { spin_lock_init(&iommu->lock); + + /* Add IOMMU to internal data structures */ list_add_tail(&iommu->list, &amd_iommu_list); + iommu->index = amd_iommus_present++; + + if (unlikely(iommu->index >= MAX_IOMMUS)) { + WARN(1, "AMD-Vi: System has more IOMMUs than supported by this driver\n"); + return -ENOSYS; + } + + /* Index is fine - add IOMMU to the array */ + amd_iommus[iommu->index] = iommu; /* * Copy data from ACPI table entry to the iommu struct @@ -846,6 +881,9 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) init_iommu_from_acpi(iommu, h); init_iommu_devices(iommu); + if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE)) + amd_iommu_np_cache = true; + return pci_enable_device(iommu->dev); } @@ -903,7 +941,7 @@ static int __init init_iommu_all(struct acpi_table_header *table) * ****************************************************************************/ -static int __init iommu_setup_msi(struct amd_iommu *iommu) +static int iommu_setup_msi(struct amd_iommu *iommu) { int r; @@ -1154,19 +1192,10 @@ static struct sys_device device_amd_iommu = { * functions. Finally it prints some information about AMD IOMMUs and * the driver state and enables the hardware. */ -int __init amd_iommu_init(void) +static int __init amd_iommu_init(void) { int i, ret = 0; - - if (no_iommu) { - printk(KERN_INFO "AMD-Vi disabled by kernel command line\n"); - return 0; - } - - if (!amd_iommu_detected) - return -ENODEV; - /* * First parse ACPI tables to find the largest Bus/Dev/Func * we need to handle. 
Upon this information the shared data @@ -1203,15 +1232,6 @@ int __init amd_iommu_init(void) if (amd_iommu_rlookup_table == NULL) goto free; - /* - * Protection Domain table - maps devices to protection domains - * This table has the same size as the rlookup_table - */ - amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - get_order(rlookup_table_size)); - if (amd_iommu_pd_table == NULL) - goto free; - amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages( GFP_KERNEL | __GFP_ZERO, get_order(MAX_DOMAIN_ID/8)); @@ -1233,6 +1253,8 @@ int __init amd_iommu_init(void) */ amd_iommu_pd_alloc_bitmap[0] = 1; + spin_lock_init(&amd_iommu_pd_lock); + /* * now the data structures are allocated and basically initialized * start the real acpi table scan @@ -1264,17 +1286,12 @@ int __init amd_iommu_init(void) if (iommu_pass_through) goto out; - printk(KERN_INFO "AMD-Vi: device isolation "); - if (amd_iommu_isolate) - printk("enabled\n"); - else - printk("disabled\n"); - if (amd_iommu_unmap_flush) printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n"); else printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n"); + x86_platform.iommu_shutdown = disable_iommus; out: return ret; @@ -1282,9 +1299,6 @@ free: free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, get_order(MAX_DOMAIN_ID/8)); - free_pages((unsigned long)amd_iommu_pd_table, - get_order(rlookup_table_size)); - free_pages((unsigned long)amd_iommu_rlookup_table, get_order(rlookup_table_size)); @@ -1301,11 +1315,6 @@ free: goto out; } -void amd_iommu_shutdown(void) -{ - disable_iommus(); -} - /**************************************************************************** * * Early detect code. This code runs at IOMMU detection time in the DMA @@ -1320,16 +1329,13 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table) void __init amd_iommu_detect(void) { - if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture)) + if (no_iommu || (iommu_detected && !gart_iommu_aperture)) return; if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { iommu_detected = 1; amd_iommu_detected = 1; -#ifdef CONFIG_GART_IOMMU - gart_iommu_aperture_disabled = 1; - gart_iommu_aperture = 0; -#endif + x86_init.iommu.iommu_init = amd_iommu_init; } } @@ -1350,10 +1356,6 @@ static int __init parse_amd_iommu_dump(char *str) static int __init parse_amd_iommu_options(char *str) { for (; *str; ++str) { - if (strncmp(str, "isolate", 7) == 0) - amd_iommu_isolate = true; - if (strncmp(str, "share", 5) == 0) - amd_iommu_isolate = false; if (strncmp(str, "fullflush", 9) == 0) amd_iommu_unmap_flush = true; } diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 128111d8ffe0..e0dfb6856aa2 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -28,6 +28,7 @@ #include <asm/pci-direct.h> #include <asm/dma.h> #include <asm/k8.h> +#include <asm/x86_init.h> int gart_iommu_aperture; int gart_iommu_aperture_disabled __initdata; @@ -400,6 +401,7 @@ void __init gart_iommu_hole_init(void) iommu_detected = 1; gart_iommu_aperture = 1; + x86_init.iommu.iommu_init = gart_iommu_init; aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7; aper_size = (32 * 1024 * 1024) << aper_order; @@ -456,7 +458,7 @@ out: if (aper_alloc) { /* Got the aperture from the AGP bridge */ - } else if (swiotlb && !valid_agp) { + } else if (!valid_agp) { /* Do nothing */ } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) || force_iommu || diff --git a/arch/x86/kernel/apic/Makefile 
b/arch/x86/kernel/apic/Makefile index da7b7b9f8bd8..565c1bfc507d 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -2,7 +2,7 @@ # Makefile for local APIC drivers and for the IO-APIC code # -obj-$(CONFIG_X86_LOCAL_APIC) += apic.o probe_$(BITS).o ipi.o nmi.o +obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SMP) += ipi.o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 159740decc41..ad8c75b9e453 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -14,7 +14,7 @@ * Mikael Pettersson : PM converted to driver model. */ -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <linux/kernel_stat.h> #include <linux/mc146818rtc.h> #include <linux/acpi_pmtmr.h> @@ -35,7 +35,8 @@ #include <linux/smp.h> #include <linux/mm.h> -#include <asm/perf_counter.h> +#include <asm/perf_event.h> +#include <asm/x86_init.h> #include <asm/pgalloc.h> #include <asm/atomic.h> #include <asm/mpspec.h> @@ -61,7 +62,7 @@ unsigned int boot_cpu_physical_apicid = -1U; /* * The highest APIC ID seen during enumeration. * - * This determines the messaging protocol we can use: if all APIC IDs + * On AMD, this determines the messaging protocol we can use: if all APIC IDs * are in the 0 ... 7 range, then we can use logical addressing which * has some performance advantages (better broadcasting). * @@ -240,28 +241,13 @@ static int modern_apic(void) } /* - * bare function to substitute write operation - * and it's _that_ fast :) - */ -static void native_apic_write_dummy(u32 reg, u32 v) -{ - WARN_ON_ONCE((cpu_has_apic || !disable_apic)); -} - -static u32 native_apic_read_dummy(u32 reg) -{ - WARN_ON_ONCE((cpu_has_apic && !disable_apic)); - return 0; -} - -/* - * right after this call apic->write/read doesn't do anything - * note that there is no restore operation it works one way + * right after this call apic become NOOP driven + * so apic->write/read doesn't do anything */ void apic_disable(void) { - apic->read = native_apic_read_dummy; - apic->write = native_apic_write_dummy; + pr_info("APIC: switched to apic NOOP\n"); + apic = &apic_noop; } void native_apic_wait_icr_idle(void) @@ -458,7 +444,7 @@ static void lapic_timer_setup(enum clock_event_mode mode, v = apic_read(APIC_LVTT); v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); apic_write(APIC_LVTT, v); - apic_write(APIC_TMICT, 0xffffffff); + apic_write(APIC_TMICT, 0); break; case CLOCK_EVT_MODE_RESUME: /* Nothing to do here */ @@ -978,7 +964,7 @@ void lapic_shutdown(void) { unsigned long flags; - if (!cpu_has_apic) + if (!cpu_has_apic && !apic_from_smp_config()) return; local_irq_save(flags); @@ -1188,7 +1174,7 @@ void __cpuinit setup_local_APIC(void) apic_write(APIC_ESR, 0); } #endif - perf_counters_lapic_init(); + perf_events_lapic_init(); preempt_disable(); @@ -1196,8 +1182,7 @@ void __cpuinit setup_local_APIC(void) * Double-check whether this APIC is really registered. * This is meaningless in clustered apic mode, so we skip it. 
*/ - if (!apic->apic_id_registered()) - BUG(); + BUG_ON(!apic->apic_id_registered()); /* * Intel recommends to set DFR, LDR and TPR before enabling @@ -1392,14 +1377,11 @@ void __init enable_IR_x2apic(void) unsigned long flags; struct IO_APIC_route_entry **ioapic_entries = NULL; int ret, x2apic_enabled = 0; - int dmar_table_init_ret = 0; + int dmar_table_init_ret; -#ifdef CONFIG_INTR_REMAP dmar_table_init_ret = dmar_table_init(); - if (dmar_table_init_ret) - pr_debug("dmar_table_init() failed with %d:\n", - dmar_table_init_ret); -#endif + if (dmar_table_init_ret && !x2apic_supported()) + return; ioapic_entries = alloc_ioapic_entries(); if (!ioapic_entries) { @@ -1709,7 +1691,7 @@ int __init APIC_init_uniprocessor(void) localise_nmi_watchdog(); #endif - setup_boot_clock(); + x86_init.timers.setup_percpu_clockev(); #ifdef CONFIG_X86_64 check_nmi_watchdog(); #endif @@ -1916,24 +1898,14 @@ void __cpuinit generic_processor_info(int apicid, int version) max_physical_apicid = apicid; #ifdef CONFIG_X86_32 - /* - * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y - * but we need to work other dependencies like SMP_SUSPEND etc - * before this can be done without some confusion. - * if (CPU_HOTPLUG_ENABLED || num_processors > 8) - * - Ashok Raj <ashok.raj@intel.com> - */ - if (max_physical_apicid >= 8) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - if (!APIC_XAPIC(version)) { - def_to_bigsmp = 0; - break; - } - /* If P4 and above fall through */ - case X86_VENDOR_AMD: + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + if (num_processors > 8) + def_to_bigsmp = 1; + break; + case X86_VENDOR_AMD: + if (max_physical_apicid >= 8) def_to_bigsmp = 1; - } } #endif diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c new file mode 100644 index 000000000000..d9acc3bee0f4 --- /dev/null +++ b/arch/x86/kernel/apic/apic_noop.c @@ -0,0 +1,200 @@ +/* + * NOOP APIC driver. + * + * Does almost nothing and should be substituted by a real apic driver via + * probe routine. + * + * Though in case if apic is disabled (for some reason) we try + * to not uglify the caller's code and allow to call (some) apic routines + * like self-ipi, etc... 
+ */ + +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/ctype.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <asm/fixmap.h> +#include <asm/mpspec.h> +#include <asm/apicdef.h> +#include <asm/apic.h> +#include <asm/setup.h> + +#include <linux/smp.h> +#include <asm/ipi.h> + +#include <linux/interrupt.h> +#include <asm/acpi.h> +#include <asm/e820.h> + +static void noop_init_apic_ldr(void) { } +static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { } +static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { } +static void noop_send_IPI_allbutself(int vector) { } +static void noop_send_IPI_all(int vector) { } +static void noop_send_IPI_self(int vector) { } +static void noop_apic_wait_icr_idle(void) { } +static void noop_apic_icr_write(u32 low, u32 id) { } + +static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip) +{ + return -1; +} + +static u32 noop_safe_apic_wait_icr_idle(void) +{ + return 0; +} + +static u64 noop_apic_icr_read(void) +{ + return 0; +} + +static int noop_cpu_to_logical_apicid(int cpu) +{ + return 0; +} + +static int noop_phys_pkg_id(int cpuid_apic, int index_msb) +{ + return 0; +} + +static unsigned int noop_get_apic_id(unsigned long x) +{ + return 0; +} + +static int noop_probe(void) +{ + /* + * NOOP apic should not ever be + * enabled via probe routine + */ + return 0; +} + +static int noop_apic_id_registered(void) +{ + /* + * if we would be really "pedantic" + * we should pass read_apic_id() here + * but since NOOP suppose APIC ID = 0 + * lets save a few cycles + */ + return physid_isset(0, phys_cpu_present_map); +} + +static const struct cpumask *noop_target_cpus(void) +{ + /* only BSP here */ + return cpumask_of(0); +} + +static unsigned long noop_check_apicid_used(physid_mask_t *map, int apicid) +{ + return physid_isset(apicid, *map); +} + +static unsigned long noop_check_apicid_present(int bit) +{ + return physid_isset(bit, phys_cpu_present_map); +} + +static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask) +{ + if (cpu != 0) + pr_warning("APIC: Vector allocated for non-BSP cpu\n"); + cpumask_clear(retmask); + cpumask_set_cpu(cpu, retmask); +} + +int noop_apicid_to_node(int logical_apicid) +{ + /* we're always on node 0 */ + return 0; +} + +static u32 noop_apic_read(u32 reg) +{ + WARN_ON_ONCE((cpu_has_apic && !disable_apic)); + return 0; +} + +static void noop_apic_write(u32 reg, u32 v) +{ + WARN_ON_ONCE((cpu_has_apic || !disable_apic)); +} + +struct apic apic_noop = { + .name = "noop", + .probe = noop_probe, + .acpi_madt_oem_check = NULL, + + .apic_id_registered = noop_apic_id_registered, + + .irq_delivery_mode = dest_LowestPrio, + /* logical delivery broadcast to all CPUs: */ + .irq_dest_mode = 1, + + .target_cpus = noop_target_cpus, + .disable_esr = 0, + .dest_logical = APIC_DEST_LOGICAL, + .check_apicid_used = noop_check_apicid_used, + .check_apicid_present = noop_check_apicid_present, + + .vector_allocation_domain = noop_vector_allocation_domain, + .init_apic_ldr = noop_init_apic_ldr, + + .ioapic_phys_id_map = default_ioapic_phys_id_map, + .setup_apic_routing = NULL, + .multi_timer_check = NULL, + .apicid_to_node = noop_apicid_to_node, + + .cpu_to_logical_apicid = noop_cpu_to_logical_apicid, + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .apicid_to_cpu_present = physid_set_mask_of_physid, + + .setup_portio_remap = NULL, + 
.check_phys_apicid_present = default_check_phys_apicid_present, + .enable_apic_mode = NULL, + + .phys_pkg_id = noop_phys_pkg_id, + + .mps_oem_check = NULL, + + .get_apic_id = noop_get_apic_id, + .set_apic_id = NULL, + .apic_id_mask = 0x0F << 24, + + .cpu_mask_to_apicid = default_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + + .send_IPI_mask = noop_send_IPI_mask, + .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself, + .send_IPI_allbutself = noop_send_IPI_allbutself, + .send_IPI_all = noop_send_IPI_all, + .send_IPI_self = noop_send_IPI_self, + + .wakeup_secondary_cpu = noop_wakeup_secondary_cpu, + + /* should be safe */ + .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, + .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, + + .wait_for_init_deassert = NULL, + + .smp_callin_clear_local_apic = NULL, + .inquire_remote_apic = NULL, + + .read = noop_apic_read, + .write = noop_apic_write, + .icr_read = noop_apic_icr_read, + .icr_write = noop_apic_icr_write, + .wait_icr_idle = noop_apic_wait_icr_idle, + .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle, +}; diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 676cdac385c0..38dcecfa5818 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -35,7 +35,7 @@ static const struct cpumask *bigsmp_target_cpus(void) #endif } -static unsigned long bigsmp_check_apicid_used(physid_mask_t bitmap, int apicid) +static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid) { return 0; } @@ -93,11 +93,6 @@ static int bigsmp_cpu_present_to_apicid(int mps_cpu) return BAD_APICID; } -static physid_mask_t bigsmp_apicid_to_cpu_present(int phys_apicid) -{ - return physid_mask_of_physid(phys_apicid); -} - /* Mapping from cpu number to logical apicid */ static inline int bigsmp_cpu_to_logical_apicid(int cpu) { @@ -106,13 +101,13 @@ static inline int bigsmp_cpu_to_logical_apicid(int cpu) return cpu_physical_id(cpu); } -static physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map) +static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) { /* For clustered we don't have a good way to do this yet - hack */ - return physids_promote(0xFFL); + physids_promote(0xFFL, retmap); } -static int bigsmp_check_phys_apicid_present(int boot_cpu_physical_apicid) +static int bigsmp_check_phys_apicid_present(int phys_apicid) { return 1; } @@ -230,7 +225,7 @@ struct apic apic_bigsmp = { .apicid_to_node = bigsmp_apicid_to_node, .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid, .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, - .apicid_to_cpu_present = bigsmp_apicid_to_cpu_present, + .apicid_to_cpu_present = physid_set_mask_of_physid, .setup_portio_remap = NULL, .check_phys_apicid_present = bigsmp_check_phys_apicid_present, .enable_apic_mode = NULL, diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 89174f847b49..e85f8fb7f8e7 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -466,11 +466,11 @@ static const struct cpumask *es7000_target_cpus(void) return cpumask_of(smp_processor_id()); } -static unsigned long -es7000_check_apicid_used(physid_mask_t bitmap, int apicid) +static unsigned long es7000_check_apicid_used(physid_mask_t *map, int apicid) { return 0; } + static unsigned long es7000_check_apicid_present(int bit) { return physid_isset(bit, phys_cpu_present_map); @@ -539,14 +539,10 @@ static int es7000_cpu_present_to_apicid(int mps_cpu) 
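The bigsmp_32.c changes above, and the matching es7000/numaq/summit conversions that follow, switch helpers such as check_apicid_used, ioapic_phys_id_map and apicid_to_cpu_present from returning a physid_mask_t by value to filling a caller-provided mask through a pointer (physids_promote(0xFFL, retmap), physid_set_mask_of_physid). A minimal userspace model of that out-parameter pattern, using toy types and names rather than the kernel's physid_mask_t API, might look like this:

#include <stdio.h>
#include <string.h>

/* Toy stand-in for a wide CPU-ID bitmap (the kernel's physid_mask_t is wider still). */
#define TOY_WORDS 4
struct toy_physid_mask {
        unsigned long mask[TOY_WORDS];
};

/* Old style: build the mask locally and return the whole struct by value. */
static struct toy_physid_mask toy_mask_of_physid_byval(int physid)
{
        struct toy_physid_mask m;

        memset(&m, 0, sizeof(m));
        m.mask[physid / (8 * (int)sizeof(unsigned long))] |=
                1UL << (physid % (8 * (int)sizeof(unsigned long)));
        return m;
}

/* New style: fill a caller-provided mask through a pointer, as the converted helpers do. */
static void toy_set_mask_of_physid(int physid, struct toy_physid_mask *retmap)
{
        memset(retmap, 0, sizeof(*retmap));
        retmap->mask[physid / (8 * (int)sizeof(unsigned long))] |=
                1UL << (physid % (8 * (int)sizeof(unsigned long)));
}

int main(void)
{
        struct toy_physid_mask byval = toy_mask_of_physid_byval(5);
        struct toy_physid_mask byptr;

        toy_set_mask_of_physid(5, &byptr);
        printf("identical: %d\n", memcmp(&byval, &byptr, sizeof(byval)) == 0);
        return 0;
}

The by-pointer form avoids copying a large struct through the return value and lets one generic helper (physid_set_mask_of_physid) replace several per-subarch copies.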
static int cpu_id; -static physid_mask_t es7000_apicid_to_cpu_present(int phys_apicid) +static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap) { - physid_mask_t mask; - - mask = physid_mask_of_physid(cpu_id); + physid_set_mask_of_physid(cpu_id, retmap); ++cpu_id; - - return mask; } /* Mapping from cpu number to logical apicid */ @@ -561,10 +557,10 @@ static int es7000_cpu_to_logical_apicid(int cpu) #endif } -static physid_mask_t es7000_ioapic_phys_id_map(physid_mask_t phys_map) +static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) { /* For clustered we don't have a good way to do this yet - hack */ - return physids_promote(0xff); + physids_promote(0xFFL, retmap); } static int es7000_check_phys_apicid_present(int cpu_physical_apicid) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3c8f9e75d038..c0b4468683f9 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -60,8 +60,6 @@ #include <asm/irq_remapping.h> #include <asm/hpet.h> #include <asm/hw_irq.h> -#include <asm/uv/uv_hub.h> -#include <asm/uv/uv_irq.h> #include <asm/apic.h> @@ -96,6 +94,11 @@ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; /* # of MP IRQ source entries */ int mp_irq_entries; +/* Number of legacy interrupts */ +static int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY; +/* GSI interrupts */ +static int nr_irqs_gsi = NR_IRQS_LEGACY; + #if defined (CONFIG_MCA) || defined (CONFIG_EISA) int mp_bus_id_to_type[MAX_MP_BUSSES]; #endif @@ -135,20 +138,6 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node) return pin; } -/* - * This is performance-critical, we want to do it O(1) - * - * Most irqs are mapped 1:1 with pins. - */ -struct irq_cfg { - struct irq_pin_list *irq_2_pin; - cpumask_var_t domain; - cpumask_var_t old_domain; - unsigned move_cleanup_count; - u8 vector; - u8 move_in_progress : 1; -}; - /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. 
*/ #ifdef CONFIG_SPARSE_IRQ static struct irq_cfg irq_cfgx[] = { @@ -173,6 +162,12 @@ static struct irq_cfg irq_cfgx[NR_IRQS] = { [15] = { .vector = IRQ15_VECTOR, }, }; +void __init io_apic_disable_legacy(void) +{ + nr_legacy_irqs = 0; + nr_irqs_gsi = 0; +} + int __init arch_early_irq_init(void) { struct irq_cfg *cfg; @@ -190,7 +185,7 @@ int __init arch_early_irq_init(void) desc->chip_data = &cfg[i]; zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); - if (i < NR_IRQS_LEGACY) + if (i < nr_legacy_irqs) cpumask_setall(cfg[i].domain); } @@ -198,7 +193,7 @@ int __init arch_early_irq_init(void) } #ifdef CONFIG_SPARSE_IRQ -static struct irq_cfg *irq_cfg(unsigned int irq) +struct irq_cfg *irq_cfg(unsigned int irq) { struct irq_cfg *cfg = NULL; struct irq_desc *desc; @@ -216,17 +211,14 @@ static struct irq_cfg *get_one_free_irq_cfg(int node) cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); if (cfg) { - if (!alloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) { + if (!zalloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) { kfree(cfg); cfg = NULL; - } else if (!alloc_cpumask_var_node(&cfg->old_domain, + } else if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_ATOMIC, node)) { free_cpumask_var(cfg->domain); kfree(cfg); cfg = NULL; - } else { - cpumask_clear(cfg->domain); - cpumask_clear(cfg->old_domain); } } @@ -353,7 +345,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) /* end for move_irq_desc */ #else -static struct irq_cfg *irq_cfg(unsigned int irq) +struct irq_cfg *irq_cfg(unsigned int irq) { return irq < nr_irqs ? irq_cfgx + irq : NULL; } @@ -547,23 +539,41 @@ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, add_pin_to_irq_node(cfg, node, newapic, newpin); } +static void __io_apic_modify_irq(struct irq_pin_list *entry, + int mask_and, int mask_or, + void (*final)(struct irq_pin_list *entry)) +{ + unsigned int reg, pin; + + pin = entry->pin; + reg = io_apic_read(entry->apic, 0x10 + pin * 2); + reg &= mask_and; + reg |= mask_or; + io_apic_modify(entry->apic, 0x10 + pin * 2, reg); + if (final) + final(entry); +} + static void io_apic_modify_irq(struct irq_cfg *cfg, int mask_and, int mask_or, void (*final)(struct irq_pin_list *entry)) { - int pin; struct irq_pin_list *entry; - for_each_irq_pin(entry, cfg->irq_2_pin) { - unsigned int reg; - pin = entry->pin; - reg = io_apic_read(entry->apic, 0x10 + pin * 2); - reg &= mask_and; - reg |= mask_or; - io_apic_modify(entry->apic, 0x10 + pin * 2, reg); - if (final) - final(entry); - } + for_each_irq_pin(entry, cfg->irq_2_pin) + __io_apic_modify_irq(entry, mask_and, mask_or, final); +} + +static void __mask_and_edge_IO_APIC_irq(struct irq_pin_list *entry) +{ + __io_apic_modify_irq(entry, ~IO_APIC_REDIR_LEVEL_TRIGGER, + IO_APIC_REDIR_MASKED, NULL); +} + +static void __unmask_and_level_IO_APIC_irq(struct irq_pin_list *entry) +{ + __io_apic_modify_irq(entry, ~IO_APIC_REDIR_MASKED, + IO_APIC_REDIR_LEVEL_TRIGGER, NULL); } static void __unmask_IO_APIC_irq(struct irq_cfg *cfg) @@ -587,18 +597,6 @@ static void __mask_IO_APIC_irq(struct irq_cfg *cfg) io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); } -static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg) -{ - io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER, - IO_APIC_REDIR_MASKED, NULL); -} - -static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg) -{ - io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, - IO_APIC_REDIR_LEVEL_TRIGGER, NULL); -} - 
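The io_apic.c hunks above factor the per-pin update out of io_apic_modify_irq() into __io_apic_modify_irq(), which performs one read-modify-write of a single redirection entry using a mask_and/mask_or pair, and rebuild the mask+edge and unmask+level helpers on top of it (the old per-cfg versions are the ones removed just above). A self-contained userspace model of that update pattern, with a plain array and toy constants standing in for the real io_apic_read()/io_apic_modify() accessors and register layout:

#include <stdio.h>

#define TOY_REDIR_MASKED        (1u << 16)   /* toy "interrupt masked" bit      */
#define TOY_REDIR_LEVEL_TRIGGER (1u << 15)   /* toy "level triggered" bit       */

/* Toy redirection "registers", one 32-bit word per pin. */
static unsigned int toy_redir[4];

/* Read-modify-write of one entry: keep only the bits in mask_and, then set mask_or. */
static void toy_modify_pin(int pin, unsigned int mask_and, unsigned int mask_or)
{
        unsigned int reg = toy_redir[pin];   /* read            */
        reg &= mask_and;                     /* clear bits      */
        reg |= mask_or;                      /* set bits        */
        toy_redir[pin] = reg;                /* write back      */
}

/* Mask the pin and force edge mode, mirroring the new __mask_and_edge helper. */
static void toy_mask_and_edge(int pin)
{
        toy_modify_pin(pin, ~TOY_REDIR_LEVEL_TRIGGER, TOY_REDIR_MASKED);
}

/* Unmask the pin and restore level mode, mirroring the new __unmask_and_level helper. */
static void toy_unmask_and_level(int pin)
{
        toy_modify_pin(pin, ~TOY_REDIR_MASKED, TOY_REDIR_LEVEL_TRIGGER);
}

int main(void)
{
        toy_redir[2] = TOY_REDIR_LEVEL_TRIGGER;   /* level-triggered, unmasked */

        toy_mask_and_edge(2);
        printf("after mask+edge:    0x%08x\n", toy_redir[2]);

        toy_unmask_and_level(2);
        printf("after unmask+level: 0x%08x\n", toy_redir[2]);
        return 0;
}

Splitting the helper this way is what later lets the EOI simulation for pre-0x20 IO-APICs operate on a single pin entry instead of looping over the whole irq_cfg.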
static void mask_IO_APIC_irq_desc(struct irq_desc *desc) { struct irq_cfg *cfg = desc->chip_data; @@ -867,7 +865,7 @@ static int __init find_isa_irq_apic(int irq, int type) */ static int EISA_ELCR(unsigned int irq) { - if (irq < NR_IRQS_LEGACY) { + if (irq < nr_legacy_irqs) { unsigned int port = 0x4d0 + (irq >> 3); return (inb(port) >> (irq & 7)) & 1; } @@ -1169,7 +1167,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) int cpu, err; cpumask_var_t tmp_mask; - if ((cfg->move_in_progress) || cfg->move_cleanup_count) + if (cfg->move_in_progress) return -EBUSY; if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) @@ -1229,8 +1227,7 @@ next: return err; } -static int -assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) +int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) { int err; unsigned long flags; @@ -1464,7 +1461,7 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq } ioapic_register_intr(irq, desc, trigger); - if (irq < NR_IRQS_LEGACY) + if (irq < nr_legacy_irqs) disable_8259A_irq(irq); ioapic_write_entry(apic_id, pin, entry); @@ -1591,9 +1588,6 @@ __apicdebuginit(void) print_IO_APIC(void) struct irq_desc *desc; unsigned int irq; - if (apic_verbosity == APIC_QUIET) - return; - printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); for (i = 0; i < nr_ioapics; i++) printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", @@ -1700,9 +1694,6 @@ __apicdebuginit(void) print_APIC_field(int base) { int i; - if (apic_verbosity == APIC_QUIET) - return; - printk(KERN_DEBUG); for (i = 0; i < 8; i++) @@ -1716,9 +1707,6 @@ __apicdebuginit(void) print_local_APIC(void *dummy) unsigned int i, v, ver, maxlvt; u64 icr; - if (apic_verbosity == APIC_QUIET) - return; - printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", smp_processor_id(), hard_smp_processor_id()); v = apic_read(APIC_ID); @@ -1816,13 +1804,19 @@ __apicdebuginit(void) print_local_APIC(void *dummy) printk("\n"); } -__apicdebuginit(void) print_all_local_APICs(void) +__apicdebuginit(void) print_local_APICs(int maxcpu) { int cpu; + if (!maxcpu) + return; + preempt_disable(); - for_each_online_cpu(cpu) + for_each_online_cpu(cpu) { + if (cpu >= maxcpu) + break; smp_call_function_single(cpu, print_local_APIC, NULL, 1); + } preempt_enable(); } @@ -1831,7 +1825,7 @@ __apicdebuginit(void) print_PIC(void) unsigned int v; unsigned long flags; - if (apic_verbosity == APIC_QUIET) + if (!nr_legacy_irqs) return; printk(KERN_DEBUG "\nprinting PIC contents\n"); @@ -1858,21 +1852,41 @@ __apicdebuginit(void) print_PIC(void) printk(KERN_DEBUG "... 
PIC ELCR: %04x\n", v); } -__apicdebuginit(int) print_all_ICs(void) +static int __initdata show_lapic = 1; +static __init int setup_show_lapic(char *arg) +{ + int num = -1; + + if (strcmp(arg, "all") == 0) { + show_lapic = CONFIG_NR_CPUS; + } else { + get_option(&arg, &num); + if (num >= 0) + show_lapic = num; + } + + return 1; +} +__setup("show_lapic=", setup_show_lapic); + +__apicdebuginit(int) print_ICs(void) { + if (apic_verbosity == APIC_QUIET) + return 0; + print_PIC(); /* don't print out if apic is not there */ - if (!cpu_has_apic || disable_apic) + if (!cpu_has_apic && !apic_from_smp_config()) return 0; - print_all_local_APICs(); + print_local_APICs(show_lapic); print_IO_APIC(); return 0; } -fs_initcall(print_all_ICs); +fs_initcall(print_ICs); /* Where if anywhere is the i8259 connect in external int mode */ @@ -1894,6 +1908,10 @@ void __init enable_IO_APIC(void) spin_unlock_irqrestore(&ioapic_lock, flags); nr_ioapic_registers[apic] = reg_01.bits.entries+1; } + + if (!nr_legacy_irqs) + return; + for(apic = 0; apic < nr_ioapics; apic++) { int pin; /* See if any of the pins is in ExtINT mode */ @@ -1948,6 +1966,9 @@ void disable_IO_APIC(void) */ clear_IO_APIC(); + if (!nr_legacy_irqs) + return; + /* * If the i8259 is routed through an IOAPIC * Put that IOAPIC in virtual wire mode @@ -1981,7 +2002,7 @@ void disable_IO_APIC(void) /* * Use virtual wire A mode when interrupt remapping is enabled. */ - if (cpu_has_apic) + if (cpu_has_apic || apic_from_smp_config()) disconnect_bsp_APIC(!intr_remapping_enabled && ioapic_i8259.pin != -1); } @@ -1994,7 +2015,7 @@ void disable_IO_APIC(void) * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 */ -static void __init setup_ioapic_ids_from_mpc(void) +void __init setup_ioapic_ids_from_mpc(void) { union IO_APIC_reg_00 reg_00; physid_mask_t phys_id_present_map; @@ -2003,9 +2024,8 @@ static void __init setup_ioapic_ids_from_mpc(void) unsigned char old_id; unsigned long flags; - if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids()) + if (acpi_ioapic) return; - /* * Don't check I/O APIC IDs for xAPIC systems. They have * no meaning without the serial APIC bus. @@ -2017,7 +2037,7 @@ static void __init setup_ioapic_ids_from_mpc(void) * This is broken; anything with a real cpu count has to * circumvent this idiocy regardless. */ - phys_id_present_map = apic->ioapic_phys_id_map(phys_cpu_present_map); + apic->ioapic_phys_id_map(&phys_cpu_present_map, &phys_id_present_map); /* * Set the IOAPIC ID to the value stored in the MPC table. @@ -2044,7 +2064,7 @@ static void __init setup_ioapic_ids_from_mpc(void) * system must have a unique ID or we get lots of nice * 'stuck on smp_invalidate_needed IPI wait' messages. 
*/ - if (apic->check_apicid_used(phys_id_present_map, + if (apic->check_apicid_used(&phys_id_present_map, mp_ioapics[apic_id].apicid)) { printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", apic_id, mp_ioapics[apic_id].apicid); @@ -2059,7 +2079,7 @@ static void __init setup_ioapic_ids_from_mpc(void) mp_ioapics[apic_id].apicid = i; } else { physid_mask_t tmp; - tmp = apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid); + apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid, &tmp); apic_printk(APIC_VERBOSE, "Setting %d in the " "phys_id_present_map\n", mp_ioapics[apic_id].apicid); @@ -2179,7 +2199,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq) struct irq_cfg *cfg; spin_lock_irqsave(&ioapic_lock, flags); - if (irq < NR_IRQS_LEGACY) { + if (irq < nr_legacy_irqs) { disable_8259A_irq(irq); if (i8259A_irq_pending(irq)) was_pending = 1; @@ -2214,20 +2234,16 @@ static int ioapic_retrigger_irq(unsigned int irq) */ #ifdef CONFIG_SMP -static void send_cleanup_vector(struct irq_cfg *cfg) +void send_cleanup_vector(struct irq_cfg *cfg) { cpumask_var_t cleanup_mask; if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { unsigned int i; - cfg->move_cleanup_count = 0; - for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) - cfg->move_cleanup_count++; for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); } else { cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); - cfg->move_cleanup_count = cpumask_weight(cleanup_mask); apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); free_cpumask_var(cleanup_mask); } @@ -2258,15 +2274,12 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq } } -static int -assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); - /* * Either sets desc->affinity to a valid value, and returns * ->cpu_mask_to_apicid of that, or returns BAD_APICID and * leaves desc->affinity untouched. 
*/ -static unsigned int +unsigned int set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; @@ -2419,8 +2432,6 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) cfg = irq_cfg(irq); spin_lock(&desc->lock); - if (!cfg->move_cleanup_count) - goto unlock; if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) goto unlock; @@ -2438,7 +2449,6 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) goto unlock; } __get_cpu_var(vector_irq)[vector] = -1; - cfg->move_cleanup_count--; unlock: spin_unlock(&desc->lock); } @@ -2446,21 +2456,33 @@ unlock: irq_exit(); } -static void irq_complete_move(struct irq_desc **descp) +static void __irq_complete_move(struct irq_desc **descp, unsigned vector) { struct irq_desc *desc = *descp; struct irq_cfg *cfg = desc->chip_data; - unsigned vector, me; + unsigned me; if (likely(!cfg->move_in_progress)) return; - vector = ~get_irq_regs()->orig_ax; me = smp_processor_id(); if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) send_cleanup_vector(cfg); } + +static void irq_complete_move(struct irq_desc **descp) +{ + __irq_complete_move(descp, ~get_irq_regs()->orig_ax); +} + +void irq_force_complete_move(int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg = desc->chip_data; + + __irq_complete_move(&desc, cfg->vector); +} #else static inline void irq_complete_move(struct irq_desc **descp) {} #endif @@ -2476,6 +2498,59 @@ static void ack_apic_edge(unsigned int irq) atomic_t irq_mis_count; +/* + * IO-APIC versions below 0x20 don't support EOI register. + * For the record, here is the information about various versions: + * 0Xh 82489DX + * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant + * 2Xh I/O(x)APIC which is PCI 2.2 Compliant + * 30h-FFh Reserved + * + * Some of the Intel ICH Specs (ICH2 to ICH5) documents the io-apic + * version as 0x2. This is an error with documentation and these ICH chips + * use io-apic's of version 0x20. + * + * For IO-APIC's with EOI register, we use that to do an explicit EOI. + * Otherwise, we simulate the EOI message manually by changing the trigger + * mode to edge and then back to level, with RTE being masked during this. +*/ +static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) +{ + struct irq_pin_list *entry; + + for_each_irq_pin(entry, cfg->irq_2_pin) { + if (mp_ioapics[entry->apic].apicver >= 0x20) { + /* + * Intr-remapping uses pin number as the virtual vector + * in the RTE. Actual vector is programmed in + * intr-remapping table entry. Hence for the io-apic + * EOI we use the pin number. + */ + if (irq_remapped(irq)) + io_apic_eoi(entry->apic, entry->pin); + else + io_apic_eoi(entry->apic, cfg->vector); + } else { + __mask_and_edge_IO_APIC_irq(entry); + __unmask_and_level_IO_APIC_irq(entry); + } + } +} + +static void eoi_ioapic_irq(struct irq_desc *desc) +{ + struct irq_cfg *cfg; + unsigned long flags; + unsigned int irq; + + irq = desc->irq; + cfg = desc->chip_data; + + spin_lock_irqsave(&ioapic_lock, flags); + __eoi_ioapic_irq(irq, cfg); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + static void ack_apic_level(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -2511,6 +2586,19 @@ static void ack_apic_level(unsigned int irq) * level-triggered interrupt. We mask the source for the time of the * operation to prevent an edge-triggered interrupt escaping meanwhile. * The idea is from Manfred Spraul. 
--macro + * + * Also in the case when cpu goes offline, fixup_irqs() will forward + * any unhandled interrupt on the offlined cpu to the new cpu + * destination that is handling the corresponding interrupt. This + * interrupt forwarding is done via IPI's. Hence, in this case also + * level-triggered io-apic interrupt will be seen as an edge + * interrupt in the IRR. And we can't rely on the cpu's EOI + * to be broadcasted to the IO-APIC's which will clear the remoteIRR + * corresponding to the level-triggered interrupt. Hence on IO-APIC's + * supporting EOI register, we do an explicit EOI to clear the + * remote IRR and on IO-APIC's which don't have an EOI register, + * we use the above logic (mask+edge followed by unmask+level) from + * Manfred Spraul to clear the remote IRR. */ cfg = desc->chip_data; i = cfg->vector; @@ -2522,6 +2610,19 @@ static void ack_apic_level(unsigned int irq) */ ack_APIC_irq(); + /* + * Tail end of clearing remote IRR bit (either by delivering the EOI + * message via io-apic EOI register write or simulating it using + * mask+edge followed by unnask+level logic) manually when the + * level triggered interrupt is seen as the edge triggered interrupt + * at the cpu. + */ + if (!(v & (1 << (i & 0x1f)))) { + atomic_inc(&irq_mis_count); + + eoi_ioapic_irq(desc); + } + /* Now we can move and renable the irq */ if (unlikely(do_unmask_irq)) { /* Only migrate the irq if the ack has been received. @@ -2555,41 +2656,9 @@ static void ack_apic_level(unsigned int irq) move_masked_irq(irq); unmask_IO_APIC_irq_desc(desc); } - - /* Tail end of version 0x11 I/O APIC bug workaround */ - if (!(v & (1 << (i & 0x1f)))) { - atomic_inc(&irq_mis_count); - spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(cfg); - __unmask_and_level_IO_APIC_irq(cfg); - spin_unlock(&ioapic_lock); - } } #ifdef CONFIG_INTR_REMAP -static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) -{ - struct irq_pin_list *entry; - - for_each_irq_pin(entry, cfg->irq_2_pin) - io_apic_eoi(entry->apic, entry->pin); -} - -static void -eoi_ioapic_irq(struct irq_desc *desc) -{ - struct irq_cfg *cfg; - unsigned long flags; - unsigned int irq; - - irq = desc->irq; - cfg = desc->chip_data; - - spin_lock_irqsave(&ioapic_lock, flags); - __eoi_ioapic_irq(irq, cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - static void ir_ack_apic_edge(unsigned int irq) { ack_APIC_irq(); @@ -2657,7 +2726,7 @@ static inline void init_IO_APIC_traps(void) * so default to an old-fashioned 8259 * interrupt if we can.. */ - if (irq < NR_IRQS_LEGACY) + if (irq < nr_legacy_irqs) make_8259A_irq(irq); else /* Strange. Oh, well.. */ @@ -2993,7 +3062,7 @@ out: * the I/O APIC in all cases now. No actual device should request * it anyway. --macro */ -#define PIC_IRQS (1 << PIC_CASCADE_IR) +#define PIC_IRQS (1UL << PIC_CASCADE_IR) void __init setup_IO_APIC(void) { @@ -3001,21 +3070,19 @@ void __init setup_IO_APIC(void) /* * calling enable_IO_APIC() is moved to setup_local_APIC for BP */ - - io_apic_irqs = ~PIC_IRQS; + io_apic_irqs = nr_legacy_irqs ? ~PIC_IRQS : ~0UL; apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); /* * Set up IO-APIC IRQ routing. 
*/ -#ifdef CONFIG_X86_32 - if (!acpi_ioapic) - setup_ioapic_ids_from_mpc(); -#endif + x86_init.mpparse.setup_ioapic_ids(); + sync_Arb_IDs(); setup_IO_APIC_irqs(); init_IO_APIC_traps(); - check_timer(); + if (nr_legacy_irqs) + check_timer(); } /* @@ -3116,7 +3183,6 @@ static int __init ioapic_init_sysfs(void) device_initcall(ioapic_init_sysfs); -static int nr_irqs_gsi = NR_IRQS_LEGACY; /* * Dynamic irq allocate and deallocation */ @@ -3146,6 +3212,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node) continue; desc_new = move_irq_desc(desc_new, node); + cfg_new = desc_new->chip_data; if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) irq = new; @@ -3697,75 +3764,6 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) } #endif /* CONFIG_HT_IRQ */ -#ifdef CONFIG_X86_UV -/* - * Re-target the irq to the specified CPU and enable the specified MMR located - * on the specified blade to allow the sending of MSIs to the specified CPU. - */ -int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, - unsigned long mmr_offset) -{ - const struct cpumask *eligible_cpu = cpumask_of(cpu); - struct irq_cfg *cfg; - int mmr_pnode; - unsigned long mmr_value; - struct uv_IO_APIC_route_entry *entry; - unsigned long flags; - int err; - - BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); - - cfg = irq_cfg(irq); - - err = assign_irq_vector(irq, cfg, eligible_cpu); - if (err != 0) - return err; - - spin_lock_irqsave(&vector_lock, flags); - set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, - irq_name); - spin_unlock_irqrestore(&vector_lock, flags); - - mmr_value = 0; - entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - entry->vector = cfg->vector; - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->polarity = 0; - entry->trigger = 0; - entry->mask = 0; - entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); - - mmr_pnode = uv_blade_to_pnode(mmr_blade); - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); - - if (cfg->move_in_progress) - send_cleanup_vector(cfg); - - return irq; -} - -/* - * Disable the specified MMR located on the specified blade so that MSIs are - * longer allowed to be sent. 
- */ -void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset) -{ - unsigned long mmr_value; - struct uv_IO_APIC_route_entry *entry; - int mmr_pnode; - - BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); - - mmr_value = 0; - entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - entry->mask = 1; - - mmr_pnode = uv_blade_to_pnode(mmr_blade); - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); -} -#endif /* CONFIG_X86_64 */ - int __init io_apic_get_redir_entries (int ioapic) { union IO_APIC_reg_01 reg_01; @@ -3856,7 +3854,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq, /* * IRQs < 16 are already in the irq_2_pin[] map */ - if (irq >= NR_IRQS_LEGACY) { + if (irq >= nr_legacy_irqs) { cfg = desc->chip_data; if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) { printk(KERN_INFO "can not add pin %d for irq %d\n", @@ -3933,7 +3931,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) */ if (physids_empty(apic_id_map)) - apic_id_map = apic->ioapic_phys_id_map(phys_cpu_present_map); + apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map); spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(ioapic, 0); @@ -3949,10 +3947,10 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) * Every APIC in a system must have a unique ID or we get lots of nice * 'stuck on smp_invalidate_needed IPI wait' messages. */ - if (apic->check_apicid_used(apic_id_map, apic_id)) { + if (apic->check_apicid_used(&apic_id_map, apic_id)) { for (i = 0; i < get_physical_broadcast(); i++) { - if (!apic->check_apicid_used(apic_id_map, i)) + if (!apic->check_apicid_used(&apic_id_map, i)) break; } @@ -3965,7 +3963,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) apic_id = i; } - tmp = apic->apicid_to_cpu_present(apic_id); + apic->apicid_to_cpu_present(apic_id, &tmp); physids_or(apic_id_map, apic_id_map, tmp); if (reg_00.bits.ID != apic_id) { @@ -4095,7 +4093,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics) for (i = 0; i < nr_ioapics; i++) { res[i].name = mem; res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; - sprintf(mem, "IOAPIC %u", i); + snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i); mem += IOAPIC_RESOURCE_NAME_SIZE; } @@ -4129,18 +4127,17 @@ void __init ioapic_init_mappings(void) #ifdef CONFIG_X86_32 fake_ioapic_page: #endif - ioapic_phys = (unsigned long) - alloc_bootmem_pages(PAGE_SIZE); + ioapic_phys = (unsigned long)alloc_bootmem_pages(PAGE_SIZE); ioapic_phys = __pa(ioapic_phys); } set_fixmap_nocache(idx, ioapic_phys); - apic_printk(APIC_VERBOSE, - "mapped IOAPIC to %08lx (%08lx)\n", - __fix_to_virt(idx), ioapic_phys); + apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n", + __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK), + ioapic_phys); idx++; ioapic_res->start = ioapic_phys; - ioapic_res->end = ioapic_phys + (4 * 1024) - 1; + ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1; ioapic_res++; } } diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index db7220220d09..7ff61d6a188a 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -66,7 +66,7 @@ static inline unsigned int get_nmi_count(int cpu) static inline int mce_in_progress(void) { -#if defined(CONFIG_X86_NEW_MCE) +#if defined(CONFIG_X86_MCE) return atomic_read(&mce_entry) > 0; #endif return 0; @@ -508,14 +508,14 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) /* * proc handler for /proc/sys/kernel/nmi */ -int proc_nmi_enabled(struct 
ctl_table *table, int write, struct file *file, +int proc_nmi_enabled(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { int old_state; nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; old_state = nmi_watchdog_enabled; - proc_dointvec(table, write, file, buffer, length, ppos); + proc_dointvec(table, write, buffer, length, ppos); if (!!old_state == !!nmi_watchdog_enabled) return 0; diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index ca96e68f0d23..07cdbdcd7a92 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -66,7 +66,6 @@ struct mpc_trans { unsigned short trans_reserved; }; -/* x86_quirks member */ static int mpc_record; static struct mpc_trans *translation_table[MAX_MPC_ENTRY]; @@ -130,10 +129,9 @@ void __cpuinit numaq_tsc_disable(void) } } -static int __init numaq_pre_time_init(void) +static void __init numaq_tsc_init(void) { numaq_tsc_disable(); - return 0; } static inline int generate_logical_apicid(int quad, int phys_apicid) @@ -177,6 +175,19 @@ static void mpc_oem_pci_bus(struct mpc_bus *m) quad_local_to_mp_bus_id[quad][local] = m->busid; } +/* + * Called from mpparse code. + * mode = 0: prescan + * mode = 1: one mpc entry scanned + */ +static void numaq_mpc_record(unsigned int mode) +{ + if (!mode) + mpc_record = 0; + else + mpc_record++; +} + static void __init MP_translation_info(struct mpc_trans *m) { printk(KERN_INFO @@ -206,9 +217,9 @@ static int __init mpf_checksum(unsigned char *mp, int len) /* * Read/parse the MPC oem tables */ -static void __init - smp_read_mpc_oem(struct mpc_oemtable *oemtable, unsigned short oemsize) +static void __init smp_read_mpc_oem(struct mpc_table *mpc) { + struct mpc_oemtable *oemtable = (void *)(long)mpc->oemptr; int count = sizeof(*oemtable); /* the header size */ unsigned char *oemptr = ((unsigned char *)oemtable) + count; @@ -250,29 +261,6 @@ static void __init } } -static int __init numaq_setup_ioapic_ids(void) -{ - /* so can skip it */ - return 1; -} - -static struct x86_quirks numaq_x86_quirks __initdata = { - .arch_pre_time_init = numaq_pre_time_init, - .arch_time_init = NULL, - .arch_pre_intr_init = NULL, - .arch_memory_setup = NULL, - .arch_intr_init = NULL, - .arch_trap_init = NULL, - .mach_get_smp_config = NULL, - .mach_find_smp_config = NULL, - .mpc_record = &mpc_record, - .mpc_apic_id = mpc_apic_id, - .mpc_oem_bus_info = mpc_oem_bus_info, - .mpc_oem_pci_bus = mpc_oem_pci_bus, - .smp_read_mpc_oem = smp_read_mpc_oem, - .setup_ioapic_ids = numaq_setup_ioapic_ids, -}; - static __init void early_check_numaq(void) { /* @@ -286,8 +274,15 @@ static __init void early_check_numaq(void) if (smp_found_config) early_get_smp_config(); - if (found_numaq) - x86_quirks = &numaq_x86_quirks; + if (found_numaq) { + x86_init.mpparse.mpc_record = numaq_mpc_record; + x86_init.mpparse.setup_ioapic_ids = x86_init_noop; + x86_init.mpparse.mpc_apic_id = mpc_apic_id; + x86_init.mpparse.smp_read_mpc_oem = smp_read_mpc_oem; + x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus; + x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info; + x86_init.timers.tsc_pre_init = numaq_tsc_init; + } } int __init get_memcfg_numaq(void) @@ -339,10 +334,9 @@ static inline const struct cpumask *numaq_target_cpus(void) return cpu_all_mask; } -static inline unsigned long -numaq_check_apicid_used(physid_mask_t bitmap, int apicid) +static unsigned long numaq_check_apicid_used(physid_mask_t *map, int apicid) { - return physid_isset(apicid, bitmap); + return 
physid_isset(apicid, *map); } static inline unsigned long numaq_check_apicid_present(int bit) @@ -376,10 +370,10 @@ static inline int numaq_multi_timer_check(int apic, int irq) return apic != 0 && irq == 0; } -static inline physid_mask_t numaq_ioapic_phys_id_map(physid_mask_t phys_map) +static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) { /* We don't have a good way to do this yet - hack */ - return physids_promote(0xFUL); + return physids_promote(0xFUL, retmap); } static inline int numaq_cpu_to_logical_apicid(int cpu) @@ -407,18 +401,18 @@ static inline int numaq_apicid_to_node(int logical_apicid) return logical_apicid >> 4; } -static inline physid_mask_t numaq_apicid_to_cpu_present(int logical_apicid) +static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap) { int node = numaq_apicid_to_node(logical_apicid); int cpu = __ffs(logical_apicid & 0xf); - return physid_mask_of_physid(cpu + 4*node); + physid_set_mask_of_physid(cpu + 4*node, retmap); } /* Where the IO area was mapped on multiquad, always 0 otherwise */ void *xquad_portio; -static inline int numaq_check_phys_apicid_present(int boot_cpu_physical_apicid) +static inline int numaq_check_phys_apicid_present(int phys_apicid) { return 1; } diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 0c0182cc947d..1a6559f6768c 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -108,7 +108,7 @@ struct apic apic_default = { .apicid_to_node = default_apicid_to_node, .cpu_to_logical_apicid = default_cpu_to_logical_apicid, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .apicid_to_cpu_present = default_apicid_to_cpu_present, + .apicid_to_cpu_present = physid_set_mask_of_physid, .setup_portio_remap = NULL, .check_phys_apicid_present = default_check_phys_apicid_present, .enable_apic_mode = NULL, diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 65edc180fc82..c4cbd3080c1c 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -64,16 +64,23 @@ void __init default_setup_apic_routing(void) apic = &apic_x2apic_phys; else apic = &apic_x2apic_cluster; - printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); } #endif if (apic == &apic_flat) { - if (max_physical_apicid >= 8) - apic = &apic_physflat; - printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + if (num_processors > 8) + apic = &apic_physflat; + break; + case X86_VENDOR_AMD: + if (max_physical_apicid >= 8) + apic = &apic_physflat; + } } + printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); + if (is_vsmp_box()) { /* need to update phys_pkg_id */ apic->phys_pkg_id = apicid_phys_pkg_id; diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index eafdfbd1ea95..9b419263d90d 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -183,7 +183,7 @@ static const struct cpumask *summit_target_cpus(void) return cpumask_of(0); } -static unsigned long summit_check_apicid_used(physid_mask_t bitmap, int apicid) +static unsigned long summit_check_apicid_used(physid_mask_t *map, int apicid) { return 0; } @@ -261,18 +261,18 @@ static int summit_cpu_present_to_apicid(int mps_cpu) return BAD_APICID; } -static physid_mask_t summit_ioapic_phys_id_map(physid_mask_t phys_id_map) +static void summit_ioapic_phys_id_map(physid_mask_t *phys_id_map, physid_mask_t *retmap) { /* 
For clustered we don't have a good way to do this yet - hack */ - return physids_promote(0x0F); + physids_promote(0x0FL, retmap); } -static physid_mask_t summit_apicid_to_cpu_present(int apicid) +static void summit_apicid_to_cpu_present(int apicid, physid_mask_t *retmap) { - return physid_mask_of_physid(0); + physid_set_mask_of_physid(0, retmap); } -static int summit_check_phys_apicid_present(int boot_cpu_physical_apicid) +static int summit_check_phys_apicid_present(int physical_apicid) { return 1; } diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 601159374e87..130c4b934877 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -352,14 +352,14 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) { alias.v = uv_read_local_mmr(redir_addrs[i].alias); - if (alias.s.base == 0) { + if (alias.s.enable && alias.s.base == 0) { *size = (1UL << alias.s.m_alias); redirect.v = uv_read_local_mmr(redir_addrs[i].redirect); *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT; return; } } - BUG(); + *base = *size = 0; } enum map_type {map_wb, map_uc}; @@ -389,6 +389,16 @@ static __init void map_gru_high(int max_pnode) map_high("GRU", gru.s.base, shift, max_pnode, map_wb); } +static __init void map_mmr_high(int max_pnode) +{ + union uvh_rh_gam_mmr_overlay_config_mmr_u mmr; + int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT; + + mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); + if (mmr.s.enable) + map_high("MMR", mmr.s.base, shift, max_pnode, map_uc); +} + static __init void map_mmioh_high(int max_pnode) { union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; @@ -399,6 +409,12 @@ static __init void map_mmioh_high(int max_pnode) map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc); } +static __init void map_low_mmrs(void) +{ + init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE); + init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE); +} + static __init void uv_rtc_init(void) { long status; @@ -540,6 +556,8 @@ void __init uv_system_init(void) unsigned long mmr_base, present, paddr; unsigned short pnode_mask; + map_low_mmrs(); + m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); m_val = m_n_config.s.m_skt; n_val = m_n_config.s.n_skt; @@ -609,12 +627,12 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; uv_cpu_hub_info(cpu)->m_val = m_val; - uv_cpu_hub_info(cpu)->n_val = m_val; + uv_cpu_hub_info(cpu)->n_val = n_val; uv_cpu_hub_info(cpu)->numa_blade_id = blade; uv_cpu_hub_info(cpu)->blade_processor_id = lcpu; uv_cpu_hub_info(cpu)->pnode = pnode; uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; - uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; + uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1; uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; @@ -643,6 +661,7 @@ void __init uv_system_init(void) } map_gru_high(max_pnode); + map_mmr_high(max_pnode); map_mmioh_high(max_pnode); uv_cpu_init(); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 151ace69a5aa..b5b6b23bce53 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -204,7 +204,6 @@ #include <linux/module.h> #include <linux/poll.h> -#include <linux/smp_lock.h> #include <linux/types.h> 
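In the x2apic_uv_x.c hunk above, the gpa_mask computation is widened from (1 << (m_val + n_val)) - 1 to (1UL << (m_val + n_val)) - 1. Once m_val + n_val reaches 32, shifting a plain 32-bit int is undefined behavior, whereas unsigned long is 64 bits wide on an x86_64 build. A standalone illustration, with made-up m_val/n_val values and assuming a 64-bit unsigned long:

#include <stdio.h>

int main(void)
{
        unsigned int  m_val = 17, n_val = 20;   /* illustrative values only, sum > 31 */
        unsigned long mask;

        /*
         * Broken form: the constant 1 is a 32-bit int, so shifting it by 37
         * is undefined behavior.
         *
         *      mask = (1 << (m_val + n_val)) - 1;
         */

        /* Fixed form: 1UL is as wide as unsigned long (64 bits on x86_64). */
        mask = (1UL << (m_val + n_val)) - 1;

        printf("mask = 0x%lx\n", mask);
        return 0;
}

The same reasoning applies to the PIC_IRQS change in io_apic.c earlier in this section, where the cascade bit constant becomes 1UL so that io_apic_irqs stays a well-defined unsigned long mask.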
#include <linux/stddef.h> #include <linux/timer.h> @@ -403,6 +402,7 @@ static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); static struct apm_user *user_list; static DEFINE_SPINLOCK(user_list_lock); +static DEFINE_MUTEX(apm_mutex); /* * Set up a segment that references the real mode segment 0x40 @@ -1531,7 +1531,7 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg) return -EPERM; switch (cmd) { case APM_IOC_STANDBY: - lock_kernel(); + mutex_lock(&apm_mutex); if (as->standbys_read > 0) { as->standbys_read--; as->standbys_pending--; @@ -1540,10 +1540,10 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg) queue_event(APM_USER_STANDBY, as); if (standbys_pending <= 0) standby(); - unlock_kernel(); + mutex_unlock(&apm_mutex); break; case APM_IOC_SUSPEND: - lock_kernel(); + mutex_lock(&apm_mutex); if (as->suspends_read > 0) { as->suspends_read--; as->suspends_pending--; @@ -1552,13 +1552,14 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg) queue_event(APM_USER_SUSPEND, as); if (suspends_pending <= 0) { ret = suspend(1); + mutex_unlock(&apm_mutex); } else { as->suspend_wait = 1; + mutex_unlock(&apm_mutex); wait_event_interruptible(apm_suspend_waitqueue, as->suspend_wait == 0); ret = as->suspend_result; } - unlock_kernel(); return ret; default: return -ENOTTY; @@ -1608,12 +1609,10 @@ static int do_open(struct inode *inode, struct file *filp) { struct apm_user *as; - lock_kernel(); as = kmalloc(sizeof(*as), GFP_KERNEL); if (as == NULL) { printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n", sizeof(*as)); - unlock_kernel(); return -ENOMEM; } as->magic = APM_BIOS_MAGIC; @@ -1635,7 +1634,6 @@ static int do_open(struct inode *inode, struct file *filp) user_list = as; spin_unlock(&user_list_lock); filp->private_data = as; - unlock_kernel(); return 0; } diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index c1f253dac155..1d2cb383410e 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -5,6 +5,7 @@ # Don't trace early stages of a secondary CPU boot ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_common.o = -pg +CFLAGS_REMOVE_perf_event.o = -pg endif # Make sure load_percpu_segment has no stackprotector @@ -13,7 +14,7 @@ CFLAGS_common.o := $(nostackp) obj-y := intel_cacheinfo.o addon_cpuid_features.o obj-y += proc.o capflags.o powerflags.o common.o -obj-y += vmware.o hypervisor.o +obj-y += vmware.o hypervisor.o sched.o obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o obj-$(CONFIG_X86_64) += bugs_64.o @@ -27,7 +28,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o +obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 22a47c82f3c0..c910a716a71c 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -184,7 +184,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) * approved Athlon */ WARN_ONCE(1, "WARNING: This combination of AMD" - "processors is not suitable for SMP.\n"); + " processors is not suitable for SMP.\n"); if (!test_taint(TAINT_UNSAFE_SMP)) add_taint(TAINT_UNSAFE_SMP); @@ -333,6 +333,16 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) #endif } +int amd_get_nb_id(int cpu) +{ + int id = 0; +#ifdef CONFIG_SMP + id = per_cpu(cpu_llc_id, cpu); +#endif + return id; 
+} +EXPORT_SYMBOL_GPL(amd_get_nb_id); + static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) { #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2055fc2b2e6b..9053be5d95cd 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -13,7 +13,7 @@ #include <linux/io.h> #include <asm/stackprotector.h> -#include <asm/perf_counter.h> +#include <asm/perf_event.h> #include <asm/mmu_context.h> #include <asm/hypervisor.h> #include <asm/processor.h> @@ -34,7 +34,6 @@ #include <asm/mce.h> #include <asm/msr.h> #include <asm/pat.h> -#include <linux/smp.h> #ifdef CONFIG_X86_LOCAL_APIC #include <asm/uv/uv.h> @@ -838,10 +837,8 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; } -#ifdef CONFIG_X86_MCE /* Init Machine Check Exception if available. */ - mcheck_init(c); -#endif + mcheck_cpu_init(c); select_idle_routine(c); @@ -870,7 +867,7 @@ void __init identify_boot_cpu(void) #else vgetcpu_set_mode(); #endif - init_hw_perf_counters(); + init_hw_perf_events(); } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c index 6b2a52dd0403..dca325c03999 100644 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ b/arch/x86/kernel/cpu/cpu_debug.c @@ -30,8 +30,8 @@ #include <asm/apic.h> #include <asm/desc.h> -static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]); -static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); +static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpu_arr); +static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], priv_arr); static DEFINE_PER_CPU(int, cpu_priv_count); static DEFINE_MUTEX(cpu_debug_lock); diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index ae9b503220ca..8b581d3905cb 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -33,7 +33,7 @@ #include <linux/cpufreq.h> #include <linux/compiler.h> #include <linux/dmi.h> -#include <trace/power.h> +#include <trace/events/power.h> #include <linux/acpi.h> #include <linux/io.h> @@ -60,7 +60,6 @@ enum { }; #define INTEL_MSR_RANGE (0xffff) -#define CPUID_6_ECX_APERFMPERF_CAPABILITY (0x1) struct acpi_cpufreq_data { struct acpi_processor_performance *acpi_data; @@ -71,13 +70,7 @@ struct acpi_cpufreq_data { static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); -struct acpi_msr_data { - u64 saved_aperf, saved_mperf; -}; - -static DEFINE_PER_CPU(struct acpi_msr_data, msr_data); - -DEFINE_TRACE(power_mark); +static DEFINE_PER_CPU(struct aperfmperf, old_perf); /* acpi_perf_data is a pointer to percpu data. 
*/ static struct acpi_processor_performance *acpi_perf_data; @@ -244,23 +237,12 @@ static u32 get_cur_val(const struct cpumask *mask) return cmd.val; } -struct perf_pair { - union { - struct { - u32 lo; - u32 hi; - } split; - u64 whole; - } aperf, mperf; -}; - /* Called via smp_call_function_single(), on the target CPU */ static void read_measured_perf_ctrs(void *_cur) { - struct perf_pair *cur = _cur; + struct aperfmperf *am = _cur; - rdmsr(MSR_IA32_APERF, cur->aperf.split.lo, cur->aperf.split.hi); - rdmsr(MSR_IA32_MPERF, cur->mperf.split.lo, cur->mperf.split.hi); + get_aperfmperf(am); } /* @@ -279,63 +261,17 @@ static void read_measured_perf_ctrs(void *_cur) static unsigned int get_measured_perf(struct cpufreq_policy *policy, unsigned int cpu) { - struct perf_pair readin, cur; - unsigned int perf_percent; + struct aperfmperf perf; + unsigned long ratio; unsigned int retval; - if (smp_call_function_single(cpu, read_measured_perf_ctrs, &readin, 1)) + if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1)) return 0; - cur.aperf.whole = readin.aperf.whole - - per_cpu(msr_data, cpu).saved_aperf; - cur.mperf.whole = readin.mperf.whole - - per_cpu(msr_data, cpu).saved_mperf; - per_cpu(msr_data, cpu).saved_aperf = readin.aperf.whole; - per_cpu(msr_data, cpu).saved_mperf = readin.mperf.whole; - -#ifdef __i386__ - /* - * We dont want to do 64 bit divide with 32 bit kernel - * Get an approximate value. Return failure in case we cannot get - * an approximate value. - */ - if (unlikely(cur.aperf.split.hi || cur.mperf.split.hi)) { - int shift_count; - u32 h; + ratio = calc_aperfmperf_ratio(&per_cpu(old_perf, cpu), &perf); + per_cpu(old_perf, cpu) = perf; - h = max_t(u32, cur.aperf.split.hi, cur.mperf.split.hi); - shift_count = fls(h); - - cur.aperf.whole >>= shift_count; - cur.mperf.whole >>= shift_count; - } - - if (((unsigned long)(-1) / 100) < cur.aperf.split.lo) { - int shift_count = 7; - cur.aperf.split.lo >>= shift_count; - cur.mperf.split.lo >>= shift_count; - } - - if (cur.aperf.split.lo && cur.mperf.split.lo) - perf_percent = (cur.aperf.split.lo * 100) / cur.mperf.split.lo; - else - perf_percent = 0; - -#else - if (unlikely(((unsigned long)(-1) / 100) < cur.aperf.whole)) { - int shift_count = 7; - cur.aperf.whole >>= shift_count; - cur.mperf.whole >>= shift_count; - } - - if (cur.aperf.whole && cur.mperf.whole) - perf_percent = (cur.aperf.whole * 100) / cur.mperf.whole; - else - perf_percent = 0; - -#endif - - retval = (policy->cpuinfo.max_freq * perf_percent) / 100; + retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT; return retval; } @@ -394,7 +330,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, unsigned int next_perf_state = 0; /* Index into perf table */ unsigned int i; int result = 0; - struct power_trace it; dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); @@ -426,7 +361,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } } - trace_power_mark(&it, POWER_PSTATE, next_perf_state); + trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency); switch (data->cpu_feature) { case SYSTEM_INTEL_MSR_CAPABLE: @@ -588,6 +523,27 @@ static const struct dmi_system_id sw_any_bug_dmi_table[] = { }, { } }; + +static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c) +{ + /* Intel Xeon Processor 7100 Series Specification Update + * http://www.intel.com/Assets/PDF/specupdate/314554.pdf + * AL30: A Machine Check Exception (MCE) Occurring during an + * Enhanced Intel SpeedStep Technology Ratio Change May Cause 
+ * Both Processor Cores to Lock Up. */ + if (c->x86_vendor == X86_VENDOR_INTEL) { + if ((c->x86 == 15) && + (c->x86_model == 6) && + (c->x86_mask == 8)) { + printk(KERN_INFO "acpi-cpufreq: Intel(R) " + "Xeon(R) 7100 Errata AL30, processors may " + "lock up on frequency changes: disabling " + "acpi-cpufreq.\n"); + return -ENODEV; + } + } + return 0; +} #endif static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) @@ -599,9 +555,20 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) unsigned int result = 0; struct cpuinfo_x86 *c = &cpu_data(policy->cpu); struct acpi_processor_performance *perf; +#ifdef CONFIG_SMP + static int blacklisted; +#endif dprintk("acpi_cpufreq_cpu_init\n"); +#ifdef CONFIG_SMP + if (blacklisted) + return blacklisted; + blacklisted = acpi_cpufreq_blacklist(c); + if (blacklisted) + return blacklisted; +#endif + data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL); if (!data) return -ENOMEM; @@ -731,12 +698,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) acpi_processor_notify_smm(THIS_MODULE); /* Check for APERF/MPERF support in hardware */ - if (c->x86_vendor == X86_VENDOR_INTEL && c->cpuid_level >= 6) { - unsigned int ecx; - ecx = cpuid_ecx(6); - if (ecx & CPUID_6_ECX_APERFMPERF_CAPABILITY) - acpi_cpufreq_driver.getavg = get_measured_perf; - } + if (cpu_has(c, X86_FEATURE_APERFMPERF)) + acpi_cpufreq_driver.getavg = get_measured_perf; dprintk("CPU%u - ACPI performance management activated.\n", cpu); for (i = 0; i < perf->state_count; i++) diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c index ce2ed3e4aad9..cabd2fa3fc93 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c @@ -813,7 +813,7 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy) memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr)); break; case 1 ... 
15: - longhaul_version = TYPE_LONGHAUL_V1; + longhaul_version = TYPE_LONGHAUL_V2; if (c->x86_mask < 8) { cpu_model = CPU_SAMUEL2; cpuname = "C3 'Samuel 2' [C5B]"; diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 2a50ef891000..3f12dabeab52 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -605,9 +605,10 @@ static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, return 0; } -static void invalidate_entry(struct powernow_k8_data *data, unsigned int entry) +static void invalidate_entry(struct cpufreq_frequency_table *powernow_table, + unsigned int entry) { - data->powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID; + powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID; } static void print_basics(struct powernow_k8_data *data) @@ -854,6 +855,10 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) goto err_out; } + /* fill in data */ + data->numps = data->acpi_data.state_count; + powernow_k8_acpi_pst_values(data, 0); + if (cpu_family == CPU_HW_PSTATE) ret_val = fill_powernow_table_pstate(data, powernow_table); else @@ -866,11 +871,8 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) powernow_table[data->acpi_data.state_count].index = 0; data->powernow_table = powernow_table; - /* fill in data */ - data->numps = data->acpi_data.state_count; if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) print_basics(data); - powernow_k8_acpi_pst_values(data, 0); /* notify BIOS that we exist */ acpi_processor_notify_smm(THIS_MODULE); @@ -914,13 +916,13 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, "bad value %d.\n", i, index); printk(KERN_ERR PFX "Please report to BIOS " "manufacturer\n"); - invalidate_entry(data, i); + invalidate_entry(powernow_table, i); continue; } rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi); if (!(hi & HW_PSTATE_VALID_MASK)) { dprintk("invalid pstate %d, ignoring\n", index); - invalidate_entry(data, i); + invalidate_entry(powernow_table, i); continue; } @@ -941,7 +943,6 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table) { int i; - int cntlofreq = 0; for (i = 0; i < data->acpi_data.state_count; i++) { u32 fid; @@ -970,7 +971,7 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, /* verify frequency is OK */ if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) { dprintk("invalid freq %u kHz, ignoring\n", freq); - invalidate_entry(data, i); + invalidate_entry(powernow_table, i); continue; } @@ -978,38 +979,17 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, * BIOSs are using "off" to indicate invalid */ if (vid == VID_OFF) { dprintk("invalid vid %u, ignoring\n", vid); - invalidate_entry(data, i); + invalidate_entry(powernow_table, i); continue; } - /* verify only 1 entry from the lo frequency table */ - if (fid < HI_FID_TABLE_BOTTOM) { - if (cntlofreq) { - /* if both entries are the same, - * ignore this one ... */ - if ((freq != powernow_table[cntlofreq].frequency) || - (index != powernow_table[cntlofreq].index)) { - printk(KERN_ERR PFX - "Too many lo freq table " - "entries\n"); - return 1; - } - - dprintk("double low frequency table entry, " - "ignoring it.\n"); - invalidate_entry(data, i); - continue; - } else - cntlofreq = i; - } - if (freq != (data->acpi_data.states[i].core_frequency * 1000)) { printk(KERN_INFO PFX "invalid freq entries " "%u kHz vs. 
%u kHz\n", freq, (unsigned int) (data->acpi_data.states[i].core_frequency * 1000)); - invalidate_entry(data, i); + invalidate_entry(powernow_table, i); continue; } } @@ -1042,7 +1022,7 @@ static int get_transition_latency(struct powernow_k8_data *data) * set it to 1 to avoid problems in the future. * For all others it's a BIOS bug. */ - if (!boot_cpu_data.x86 == 0x11) + if (boot_cpu_data.x86 != 0x11) printk(KERN_ERR FW_WARN PFX "Invalid zero transition " "latency\n"); max_latency = 1; diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 6911e91fb4f6..3ae5a7a3a500 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -232,28 +232,23 @@ static unsigned int speedstep_detect_chipset(void) return 0; } -struct get_freq_data { - unsigned int speed; - unsigned int processor; -}; - -static void get_freq_data(void *_data) +static void get_freq_data(void *_speed) { - struct get_freq_data *data = _data; + unsigned int *speed = _speed; - data->speed = speedstep_get_frequency(data->processor); + *speed = speedstep_get_frequency(speedstep_processor); } static unsigned int speedstep_get(unsigned int cpu) { - struct get_freq_data data = { .processor = cpu }; + unsigned int speed; /* You're supposed to ensure CPU is online. */ - if (smp_call_function_single(cpu, get_freq_data, &data, 1) != 0) + if (smp_call_function_single(cpu, get_freq_data, &speed, 1) != 0) BUG(); - dprintk("detected %u kHz as current frequency\n", data.speed); - return data.speed; + dprintk("detected %u kHz as current frequency\n", speed); + return speed; } /** diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 93ba8eeb100a..08be922de33a 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -34,13 +34,6 @@ detect_hypervisor_vendor(struct cpuinfo_x86 *c) c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE; } -unsigned long get_hypervisor_tsc_freq(void) -{ - if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) - return vmware_get_tsc_khz(); - return 0; -} - static inline void __cpuinit hypervisor_set_feature_bits(struct cpuinfo_x86 *c) { @@ -55,3 +48,10 @@ void __cpuinit init_hypervisor(struct cpuinfo_x86 *c) detect_hypervisor_vendor(c); hypervisor_set_feature_bits(c); } + +void __init init_hypervisor_platform(void) +{ + init_hypervisor(&boot_cpu_data); + if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) + vmware_platform_setup(); +} diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 80a722a071b5..40e1835b35e8 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -350,6 +350,12 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); } + if (c->cpuid_level > 6) { + unsigned ecx = cpuid_ecx(6); + if (ecx & 0x01) + set_cpu_cap(c, X86_FEATURE_APERFMPERF); + } + if (cpu_has_xmm2) set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); if (cpu_has_ds) { diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 188a1ca5ad2b..4ac6d48fe11b 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,11 +1,8 @@ -obj-y = mce.o +obj-y = mce.o mce-severity.o -obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o -obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o -obj-$(CONFIG_X86_MCE_NONFATAL) += 
non-fatal.o obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c deleted file mode 100644 index b945d5dbc609..000000000000 --- a/arch/x86/kernel/cpu/mcheck/k7.c +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Athlon specific Machine Check Exception Reporting - * (C) Copyright 2002 Dave Jones <davej@redhat.com> - */ -#include <linux/interrupt.h> -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/smp.h> - -#include <asm/processor.h> -#include <asm/system.h> -#include <asm/mce.h> -#include <asm/msr.h> - -/* Machine Check Handler For AMD Athlon/Duron: */ -static void k7_machine_check(struct pt_regs *regs, long error_code) -{ - u32 alow, ahigh, high, low; - u32 mcgstl, mcgsth; - int recover = 1; - int i; - - rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); - if (mcgstl & (1<<0)) /* Recoverable ? */ - recover = 0; - - printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", - smp_processor_id(), mcgsth, mcgstl); - - for (i = 1; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high & (1<<31)) { - char misc[20]; - char addr[24]; - - misc[0] = '\0'; - addr[0] = '\0'; - - if (high & (1<<29)) - recover |= 1; - if (high & (1<<25)) - recover |= 2; - high &= ~(1<<31); - - if (high & (1<<27)) { - rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); - snprintf(misc, 20, "[%08x%08x]", ahigh, alow); - } - if (high & (1<<26)) { - rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - snprintf(addr, 24, " at %08x%08x", ahigh, alow); - } - - printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", - smp_processor_id(), i, high, low, misc, addr); - - /* Clear it: */ - wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); - /* Serialize: */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } - } - - if (recover & 2) - panic("CPU context corrupt"); - if (recover & 1) - panic("Unable to continue"); - - printk(KERN_EMERG "Attempting to continue.\n"); - - mcgstl &= ~(1<<2); - wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); -} - - -/* AMD K7 machine check is Intel like: */ -void amd_mcheck_init(struct cpuinfo_x86 *c) -{ - u32 l, h; - int i; - - if (!cpu_has(c, X86_FEATURE_MCE)) - return; - - machine_check_vector = k7_machine_check; - /* Make sure the vector pointer is visible before we enable MCEs: */ - wmb(); - - printk(KERN_INFO "Intel machine check architecture supported.\n"); - - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<8)) /* Control register present ? 
*/ - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - nr_mce_banks = l & 0xff; - - /* - * Clear status for MC index 0 separately, we don't touch CTL, - * as some K7 Athlons cause spurious MCEs when its enabled: - */ - if (boot_cpu_data.x86 == 6) { - wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0); - i = 1; - } else - i = 0; - - for (; i < nr_mce_banks; i++) { - wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); - wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); - } - - set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", - smp_processor_id()); -} diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index a3a235a53f09..472763d92098 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -18,7 +18,12 @@ #include <linux/string.h> #include <linux/fs.h> #include <linux/smp.h> +#include <linux/notifier.h> +#include <linux/kdebug.h> +#include <linux/cpu.h> +#include <linux/sched.h> #include <asm/mce.h> +#include <asm/apic.h> /* Update fake mce registers on current CPU. */ static void inject_mce(struct mce *m) @@ -39,44 +44,142 @@ static void inject_mce(struct mce *m) i->finished = 1; } -struct delayed_mce { - struct timer_list timer; - struct mce m; -}; +static void raise_poll(struct mce *m) +{ + unsigned long flags; + mce_banks_t b; -/* Inject mce on current CPU */ -static void raise_mce(unsigned long data) + memset(&b, 0xff, sizeof(mce_banks_t)); + local_irq_save(flags); + machine_check_poll(0, &b); + local_irq_restore(flags); + m->finished = 0; +} + +static void raise_exception(struct mce *m, struct pt_regs *pregs) { - struct delayed_mce *dm = (struct delayed_mce *)data; - struct mce *m = &dm->m; - int cpu = m->extcpu; + struct pt_regs regs; + unsigned long flags; - inject_mce(m); - if (m->status & MCI_STATUS_UC) { - struct pt_regs regs; + if (!pregs) { memset(®s, 0, sizeof(struct pt_regs)); regs.ip = m->ip; regs.cs = m->cs; + pregs = ®s; + } + /* in mcheck exeception handler, irq will be disabled */ + local_irq_save(flags); + do_machine_check(pregs, 0); + local_irq_restore(flags); + m->finished = 0; +} + +static cpumask_t mce_inject_cpumask; + +static int mce_raise_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct die_args *args = (struct die_args *)data; + int cpu = smp_processor_id(); + struct mce *m = &__get_cpu_var(injectm); + if (val != DIE_NMI_IPI || !cpu_isset(cpu, mce_inject_cpumask)) + return NOTIFY_DONE; + cpu_clear(cpu, mce_inject_cpumask); + if (m->inject_flags & MCJ_EXCEPTION) + raise_exception(m, args->regs); + else if (m->status) + raise_poll(m); + return NOTIFY_STOP; +} + +static struct notifier_block mce_raise_nb = { + .notifier_call = mce_raise_notify, + .priority = 1000, +}; + +/* Inject mce on current CPU */ +static int raise_local(void) +{ + struct mce *m = &__get_cpu_var(injectm); + int context = MCJ_CTX(m->inject_flags); + int ret = 0; + int cpu = m->extcpu; + + if (m->inject_flags & MCJ_EXCEPTION) { printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu); - do_machine_check(®s, 0); + switch (context) { + case MCJ_CTX_IRQ: + /* + * Could do more to fake interrupts like + * calling irq_enter, but the necessary + * machinery isn't exported currently. 
+ */ + /*FALL THROUGH*/ + case MCJ_CTX_PROCESS: + raise_exception(m, NULL); + break; + default: + printk(KERN_INFO "Invalid MCE context\n"); + ret = -EINVAL; + } printk(KERN_INFO "MCE exception done on CPU %d\n", cpu); - } else { - mce_banks_t b; - memset(&b, 0xff, sizeof(mce_banks_t)); + } else if (m->status) { printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); - machine_check_poll(0, &b); + raise_poll(m); mce_notify_irq(); - printk(KERN_INFO "Finished machine check poll on CPU %d\n", - cpu); - } - kfree(dm); + printk(KERN_INFO "Machine check poll done on CPU %d\n", cpu); + } else + m->finished = 0; + + return ret; +} + +static void raise_mce(struct mce *m) +{ + int context = MCJ_CTX(m->inject_flags); + + inject_mce(m); + + if (context == MCJ_CTX_RANDOM) + return; + +#ifdef CONFIG_X86_LOCAL_APIC + if (m->inject_flags & MCJ_NMI_BROADCAST) { + unsigned long start; + int cpu; + get_online_cpus(); + mce_inject_cpumask = cpu_online_map; + cpu_clear(get_cpu(), mce_inject_cpumask); + for_each_online_cpu(cpu) { + struct mce *mcpu = &per_cpu(injectm, cpu); + if (!mcpu->finished || + MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) + cpu_clear(cpu, mce_inject_cpumask); + } + if (!cpus_empty(mce_inject_cpumask)) + apic->send_IPI_mask(&mce_inject_cpumask, NMI_VECTOR); + start = jiffies; + while (!cpus_empty(mce_inject_cpumask)) { + if (!time_before(jiffies, start + 2*HZ)) { + printk(KERN_ERR + "Timeout waiting for mce inject NMI %lx\n", + *cpus_addr(mce_inject_cpumask)); + break; + } + cpu_relax(); + } + raise_local(); + put_cpu(); + put_online_cpus(); + } else +#endif + raise_local(); } /* Error injection interface */ static ssize_t mce_write(struct file *filp, const char __user *ubuf, size_t usize, loff_t *off) { - struct delayed_mce *dm; struct mce m; if (!capable(CAP_SYS_ADMIN)) @@ -96,19 +199,12 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf, if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu)) return -EINVAL; - dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL); - if (!dm) - return -ENOMEM; - /* * Need to give user space some time to set everything up, * so do it a jiffie or two later everywhere. - * Should we use a hrtimer here for better synchronization? */ - memcpy(&dm->m, &m, sizeof(struct mce)); - setup_timer(&dm->timer, raise_mce, (unsigned long)dm); - dm->timer.expires = jiffies + 2; - add_timer_on(&dm->timer, m.extcpu); + schedule_timeout(2); + raise_mce(&m); return usize; } @@ -116,6 +212,7 @@ static int inject_init(void) { printk(KERN_INFO "Machine check injector initialized\n"); mce_chrdev_ops.write = mce_write; + register_die_notifier(&mce_raise_nb); return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 54dcb8ff12e5..32996f9fab67 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -1,3 +1,4 @@ +#include <linux/sysdev.h> #include <asm/mce.h> enum severity_level { @@ -10,6 +11,20 @@ enum severity_level { MCE_PANIC_SEVERITY, }; +#define ATTR_LEN 16 + +/* One object for each MCE bank, shared by all CPUs */ +struct mce_bank { + u64 ctl; /* subevents to enable */ + unsigned char init; /* initialise bank? 
*/ + struct sysdev_attribute attr; /* sysdev attribute */ + char attrname[ATTR_LEN]; /* attribute name */ +}; + int mce_severity(struct mce *a, int tolerant, char **msg); +struct dentry *mce_get_debugfs_dir(void); extern int mce_ser; + +extern struct mce_bank *mce_banks; + diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index ff0807f97056..8a85dd1b1aa1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -139,6 +139,7 @@ int mce_severity(struct mce *a, int tolerant, char **msg) } } +#ifdef CONFIG_DEBUG_FS static void *s_start(struct seq_file *f, loff_t *pos) { if (*pos >= ARRAY_SIZE(severities)) @@ -197,7 +198,7 @@ static int __init severities_debugfs_init(void) { struct dentry *dmce = NULL, *fseverities_coverage = NULL; - dmce = debugfs_create_dir("mce", NULL); + dmce = mce_get_debugfs_dir(); if (dmce == NULL) goto err_out; fseverities_coverage = debugfs_create_file("severities-coverage", @@ -209,10 +210,7 @@ static int __init severities_debugfs_init(void) return 0; err_out: - if (fseverities_coverage) - debugfs_remove(fseverities_coverage); - if (dmce) - debugfs_remove(dmce); return -ENOMEM; } late_initcall(severities_debugfs_init); +#endif diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 01213048f62f..0bcaa3875863 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -34,6 +34,7 @@ #include <linux/smp.h> #include <linux/fs.h> #include <linux/mm.h> +#include <linux/debugfs.h> #include <asm/processor.h> #include <asm/hw_irq.h> @@ -45,21 +46,11 @@ #include "mce-internal.h" -/* Handle unconfigured int18 (should never happen) */ -static void unexpected_machine_check(struct pt_regs *regs, long error_code) -{ - printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", - smp_processor_id()); -} - -/* Call the installed machine check handler for this CPU setup. */ -void (*machine_check_vector)(struct pt_regs *, long error_code) = - unexpected_machine_check; +#define CREATE_TRACE_POINTS +#include <trace/events/mce.h> int mce_disabled __read_mostly; -#ifdef CONFIG_X86_NEW_MCE - #define MISC_MCELOG_MINOR 227 #define SPINUNIT 100 /* 100ns */ @@ -77,7 +68,6 @@ DEFINE_PER_CPU(unsigned, mce_exception_count); */ static int tolerant __read_mostly = 1; static int banks __read_mostly; -static u64 *bank __read_mostly; static int rip_msr __read_mostly; static int mce_bootlog __read_mostly = -1; static int monarch_timeout __read_mostly = -1; @@ -87,28 +77,43 @@ int mce_cmci_disabled __read_mostly; int mce_ignore_ce __read_mostly; int mce_ser __read_mostly; +struct mce_bank *mce_banks __read_mostly; + /* User mode helper program triggered by machine check event */ static unsigned long mce_need_notify; static char mce_helper[128]; static char *mce_helper_argv[2] = { mce_helper, NULL }; -static unsigned long dont_init_banks; - static DECLARE_WAIT_QUEUE_HEAD(mce_wait); static DEFINE_PER_CPU(struct mce, mces_seen); static int cpu_missing; +/* + * CPU/chipset specific EDAC code can register a notifier call here to print + * MCE errors in a human-readable form. 
+ */ +ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); +EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); + +static int default_decode_mce(struct notifier_block *nb, unsigned long val, + void *data) +{ + pr_emerg("No human readable MCE decoding support on this CPU type.\n"); + pr_emerg("Run the message through 'mcelog --ascii' to decode.\n"); + + return NOTIFY_STOP; +} + +static struct notifier_block mce_dec_nb = { + .notifier_call = default_decode_mce, + .priority = -1, +}; /* MCA banks polled by the period polling timer for corrected events */ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL }; -static inline int skip_bank_init(int i) -{ - return i < BITS_PER_LONG && test_bit(i, &dont_init_banks); -} - static DEFINE_PER_CPU(struct work_struct, mce_work); /* Do initial initialization of a struct mce */ @@ -147,6 +152,9 @@ void mce_log(struct mce *mce) { unsigned next, entry; + /* Emit the trace record: */ + trace_mce_record(mce); + mce->finished = 0; wmb(); for (;;) { @@ -185,47 +193,58 @@ void mce_log(struct mce *mce) static void print_mce(struct mce *m) { - printk(KERN_EMERG - "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", + pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", m->extcpu, m->mcgstatus, m->bank, m->status); + if (m->ip) { - printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", - !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", - m->cs, m->ip); + pr_emerg("RIP%s %02x:<%016Lx> ", + !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", + m->cs, m->ip); + if (m->cs == __KERNEL_CS) print_symbol("{%s}", m->ip); - printk(KERN_CONT "\n"); + pr_cont("\n"); } - printk(KERN_EMERG "TSC %llx ", m->tsc); + + pr_emerg("TSC %llx ", m->tsc); if (m->addr) - printk(KERN_CONT "ADDR %llx ", m->addr); + pr_cont("ADDR %llx ", m->addr); if (m->misc) - printk(KERN_CONT "MISC %llx ", m->misc); - printk(KERN_CONT "\n"); - printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", - m->cpuvendor, m->cpuid, m->time, m->socketid, - m->apicid); + pr_cont("MISC %llx ", m->misc); + + pr_cont("\n"); + pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", + m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); + + /* + * Print out human-readable details about the MCE error, + * (if the CPU has an implementation for that) + */ + atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); } static void print_mce_head(void) { - printk(KERN_EMERG "\nHARDWARE ERROR\n"); + pr_emerg("\nHARDWARE ERROR\n"); } static void print_mce_tail(void) { - printk(KERN_EMERG "This is not a software problem!\n" - "Run through mcelog --ascii to decode and contact your hardware vendor\n"); + pr_emerg("This is not a software problem!\n"); } #define PANIC_TIMEOUT 5 /* 5 seconds */ static atomic_t mce_paniced; +static int fake_panic; +static atomic_t mce_fake_paniced; + /* Panic in progress. 
Enable interrupts and wait for final IPI */ static void wait_for_panic(void) { long timeout = PANIC_TIMEOUT*USEC_PER_SEC; + preempt_disable(); local_irq_enable(); while (timeout-- > 0) @@ -239,15 +258,21 @@ static void mce_panic(char *msg, struct mce *final, char *exp) { int i; - /* - * Make sure only one CPU runs in machine check panic - */ - if (atomic_add_return(1, &mce_paniced) > 1) - wait_for_panic(); - barrier(); + if (!fake_panic) { + /* + * Make sure only one CPU runs in machine check panic + */ + if (atomic_inc_return(&mce_paniced) > 1) + wait_for_panic(); + barrier(); - bust_spinlocks(1); - console_verbose(); + bust_spinlocks(1); + console_verbose(); + } else { + /* Don't log too much for fake panic */ + if (atomic_inc_return(&mce_fake_paniced) > 1) + return; + } print_mce_head(); /* First print corrected ones that are still unlogged */ for (i = 0; i < MCE_LOG_LEN; i++) { @@ -274,9 +299,12 @@ static void mce_panic(char *msg, struct mce *final, char *exp) print_mce_tail(); if (exp) printk(KERN_EMERG "Machine check: %s\n", exp); - if (panic_timeout == 0) - panic_timeout = mce_panic_timeout; - panic(msg); + if (!fake_panic) { + if (panic_timeout == 0) + panic_timeout = mce_panic_timeout; + panic(msg); + } else + printk(KERN_EMERG "Fake kernel panic: %s\n", msg); } /* Support code for software error injection */ @@ -284,13 +312,14 @@ static void mce_panic(char *msg, struct mce *final, char *exp) static int msr_to_offset(u32 msr) { unsigned bank = __get_cpu_var(injectm.bank); + if (msr == rip_msr) return offsetof(struct mce, ip); - if (msr == MSR_IA32_MC0_STATUS + bank*4) + if (msr == MSR_IA32_MCx_STATUS(bank)) return offsetof(struct mce, status); - if (msr == MSR_IA32_MC0_ADDR + bank*4) + if (msr == MSR_IA32_MCx_ADDR(bank)) return offsetof(struct mce, addr); - if (msr == MSR_IA32_MC0_MISC + bank*4) + if (msr == MSR_IA32_MCx_MISC(bank)) return offsetof(struct mce, misc); if (msr == MSR_IA32_MCG_STATUS) return offsetof(struct mce, mcgstatus); @@ -301,13 +330,25 @@ static int msr_to_offset(u32 msr) static u64 mce_rdmsrl(u32 msr) { u64 v; + if (__get_cpu_var(injectm).finished) { int offset = msr_to_offset(msr); + if (offset < 0) return 0; return *(u64 *)((char *)&__get_cpu_var(injectm) + offset); } - rdmsrl(msr, v); + + if (rdmsrl_safe(msr, &v)) { + WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr); + /* + * Return zero in case the access faulted. This should + * not happen normally but can happen if the CPU does + * something weird, or if the code is buggy. 
+ */ + v = 0; + } + return v; } @@ -315,6 +356,7 @@ static void mce_wrmsrl(u32 msr, u64 v) { if (__get_cpu_var(injectm).finished) { int offset = msr_to_offset(msr); + if (offset >= 0) *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v; return; @@ -411,7 +453,7 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) m->ip = mce_rdmsrl(rip_msr); } -#ifdef CONFIG_X86_LOCAL_APIC +#ifdef CONFIG_X86_LOCAL_APIC /* * Called after interrupts have been reenabled again * when a MCE happened during an interrupts off region @@ -495,7 +537,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); for (i = 0; i < banks; i++) { - if (!bank[i] || !test_bit(i, *b)) + if (!mce_banks[i].ctl || !test_bit(i, *b)) continue; m.misc = 0; @@ -504,7 +546,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m.tsc = 0; barrier(); - m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); + m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); if (!(m.status & MCI_STATUS_VAL)) continue; @@ -519,9 +561,9 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) continue; if (m.status & MCI_STATUS_MISCV) - m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); + m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); if (m.status & MCI_STATUS_ADDRV) - m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); + m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); if (!(flags & MCP_TIMESTAMP)) m.tsc = 0; @@ -537,7 +579,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) /* * Clear state for this bank. */ - mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); } /* @@ -558,7 +600,7 @@ static int mce_no_way_out(struct mce *m, char **msg) int i; for (i = 0; i < banks; i++) { - m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); + m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) return 1; } @@ -618,7 +660,7 @@ out: * This way we prevent any potential data corruption in a unrecoverable case * and also makes sure always all CPU's errors are examined. * - * Also this detects the case of an machine check event coming from outer + * Also this detects the case of a machine check event coming from outer * space (not detected by any CPUs) In this case some external agent wants * us to shut down, so panic too. * @@ -671,7 +713,7 @@ static void mce_reign(void) * No machine check event found. Must be some external * source or one CPU is hung. Panic. */ - if (!m && tolerant < 3) + if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3) mce_panic("Machine check from unknown source", NULL, NULL); /* @@ -705,7 +747,7 @@ static int mce_start(int *no_way_out) * global_nwo should be updated before mce_callin */ smp_wmb(); - order = atomic_add_return(1, &mce_callin); + order = atomic_inc_return(&mce_callin); /* * Wait for everyone. 
@@ -842,7 +884,7 @@ static void mce_clear_state(unsigned long *toclear) for (i = 0; i < banks; i++) { if (test_bit(i, toclear)) - mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); } } @@ -895,11 +937,11 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_setup(&m); m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); - no_way_out = mce_no_way_out(&m, &msg); - final = &__get_cpu_var(mces_seen); *final = m; + no_way_out = mce_no_way_out(&m, &msg); + barrier(); /* @@ -916,14 +958,14 @@ void do_machine_check(struct pt_regs *regs, long error_code) order = mce_start(&no_way_out); for (i = 0; i < banks; i++) { __clear_bit(i, toclear); - if (!bank[i]) + if (!mce_banks[i].ctl) continue; m.misc = 0; m.addr = 0; m.bank = i; - m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); + m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); if ((m.status & MCI_STATUS_VAL) == 0) continue; @@ -964,9 +1006,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) kill_it = 1; if (m.status & MCI_STATUS_MISCV) - m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); + m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); if (m.status & MCI_STATUS_ADDRV) - m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); + m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); /* * Action optional error. Queue address for later processing. @@ -1091,10 +1133,10 @@ void mce_log_therm_throt_event(__u64 status) */ static int check_interval = 5 * 60; /* 5 minutes */ -static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ +static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); -static void mcheck_timer(unsigned long data) +static void mce_start_timer(unsigned long data) { struct timer_list *t = &per_cpu(mce_timer, data); int *n; @@ -1110,7 +1152,7 @@ static void mcheck_timer(unsigned long data) * Alert userspace if needed. If we logged an MCE, reduce the * polling interval, otherwise increase the polling interval. */ - n = &__get_cpu_var(next_interval); + n = &__get_cpu_var(mce_next_interval); if (mce_notify_irq()) *n = max(*n/2, HZ/100); else @@ -1159,10 +1201,26 @@ int mce_notify_irq(void) } EXPORT_SYMBOL_GPL(mce_notify_irq); +static int __cpuinit __mcheck_cpu_mce_banks_init(void) +{ + int i; + + mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL); + if (!mce_banks) + return -ENOMEM; + for (i = 0; i < banks; i++) { + struct mce_bank *b = &mce_banks[i]; + + b->ctl = -1ULL; + b->init = 1; + } + return 0; +} + /* * Initialize Machine Checks for a CPU. */ -static int mce_cap_init(void) +static int __cpuinit __mcheck_cpu_cap_init(void) { unsigned b; u64 cap; @@ -1170,7 +1228,8 @@ static int mce_cap_init(void) rdmsrl(MSR_IA32_MCG_CAP, cap); b = cap & MCG_BANKCNT_MASK; - printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); + if (!banks) + printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); if (b > MAX_NR_BANKS) { printk(KERN_WARNING @@ -1182,11 +1241,11 @@ static int mce_cap_init(void) /* Don't support asymmetric configurations today */ WARN_ON(banks != 0 && b != banks); banks = b; - if (!bank) { - bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); - if (!bank) - return -ENOMEM; - memset(bank, 0xff, banks * sizeof(u64)); + if (!mce_banks) { + int err = __mcheck_cpu_mce_banks_init(); + + if (err) + return err; } /* Use accurate RIP reporting if available. 
*/ @@ -1199,7 +1258,7 @@ static int mce_cap_init(void) return 0; } -static void mce_init(void) +static void __mcheck_cpu_init_generic(void) { mce_banks_t all_banks; u64 cap; @@ -1218,15 +1277,17 @@ static void mce_init(void) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); for (i = 0; i < banks; i++) { - if (skip_bank_init(i)) + struct mce_bank *b = &mce_banks[i]; + + if (!b->init) continue; - wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); + wrmsrl(MSR_IA32_MCx_STATUS(i), 0); } } /* Add per CPU specific workarounds here */ -static int mce_cpu_quirks(struct cpuinfo_x86 *c) +static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) { if (c->x86_vendor == X86_VENDOR_UNKNOWN) { pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); @@ -1241,7 +1302,7 @@ static int mce_cpu_quirks(struct cpuinfo_x86 *c) * trips off incorrectly with the IOMMU & 3ware * & Cerberus: */ - clear_bit(10, (unsigned long *)&bank[4]); + clear_bit(10, (unsigned long *)&mce_banks[4].ctl); } if (c->x86 <= 17 && mce_bootlog < 0) { /* @@ -1255,7 +1316,7 @@ static int mce_cpu_quirks(struct cpuinfo_x86 *c) * by default. */ if (c->x86 == 6 && banks > 0) - bank[0] = 0; + mce_banks[0].ctl = 0; } if (c->x86_vendor == X86_VENDOR_INTEL) { @@ -1268,8 +1329,8 @@ static int mce_cpu_quirks(struct cpuinfo_x86 *c) * valid event later, merely don't write CTL0. */ - if (c->x86 == 6 && c->x86_model < 0x1A) - __set_bit(0, &dont_init_banks); + if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0) + mce_banks[0].init = 0; /* * All newer Intel systems support MCE broadcasting. Enable @@ -1294,7 +1355,7 @@ static int mce_cpu_quirks(struct cpuinfo_x86 *c) return 0; } -static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) +static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) { if (c->x86 != 5) return; @@ -1308,7 +1369,7 @@ static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) } } -static void mce_cpu_features(struct cpuinfo_x86 *c) +static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) { switch (c->x86_vendor) { case X86_VENDOR_INTEL: @@ -1322,10 +1383,10 @@ static void mce_cpu_features(struct cpuinfo_x86 *c) } } -static void mce_init_timer(void) +static void __mcheck_cpu_init_timer(void) { struct timer_list *t = &__get_cpu_var(mce_timer); - int *n = &__get_cpu_var(next_interval); + int *n = &__get_cpu_var(mce_next_interval); if (mce_ignore_ce) return; @@ -1333,36 +1394,48 @@ static void mce_init_timer(void) *n = check_interval * HZ; if (!*n) return; - setup_timer(t, mcheck_timer, smp_processor_id()); + setup_timer(t, mce_start_timer, smp_processor_id()); t->expires = round_jiffies(jiffies + *n); add_timer_on(t, smp_processor_id()); } +/* Handle unconfigured int18 (should never happen) */ +static void unexpected_machine_check(struct pt_regs *regs, long error_code) +{ + printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", + smp_processor_id()); +} + +/* Call the installed machine check handler for this CPU setup. */ +void (*machine_check_vector)(struct pt_regs *, long error_code) = + unexpected_machine_check; + /* * Called for each booted CPU to set up machine checks. 
* Must be called with preempt off: */ -void __cpuinit mcheck_init(struct cpuinfo_x86 *c) +void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) { if (mce_disabled) return; - mce_ancient_init(c); + __mcheck_cpu_ancient_init(c); if (!mce_available(c)) return; - if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) { + if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) { mce_disabled = 1; return; } machine_check_vector = do_machine_check; - mce_init(); - mce_cpu_features(c); - mce_init_timer(); + __mcheck_cpu_init_generic(); + __mcheck_cpu_init_vendor(c); + __mcheck_cpu_init_timer(); INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); + } /* @@ -1551,8 +1624,10 @@ static struct miscdevice mce_log_device = { */ static int __init mcheck_enable(char *str) { - if (*str == 0) + if (*str == 0) { enable_p5_mce(); + return 1; + } if (*str == '=') str++; if (!strcmp(str, "off")) @@ -1580,6 +1655,15 @@ static int __init mcheck_enable(char *str) } __setup("mce", mcheck_enable); +int __init mcheck_init(void) +{ + atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb); + + mcheck_intel_therm_init(); + + return 0; +} + /* * Sysfs support */ @@ -1588,25 +1672,27 @@ __setup("mce", mcheck_enable); * Disable machine checks on suspend and shutdown. We can't really handle * them later. */ -static int mce_disable(void) +static int mce_disable_error_reporting(void) { int i; for (i = 0; i < banks; i++) { - if (!skip_bank_init(i)) - wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + struct mce_bank *b = &mce_banks[i]; + + if (b->init) + wrmsrl(MSR_IA32_MCx_CTL(i), 0); } return 0; } static int mce_suspend(struct sys_device *dev, pm_message_t state) { - return mce_disable(); + return mce_disable_error_reporting(); } static int mce_shutdown(struct sys_device *dev) { - return mce_disable(); + return mce_disable_error_reporting(); } /* @@ -1616,8 +1702,8 @@ static int mce_shutdown(struct sys_device *dev) */ static int mce_resume(struct sys_device *dev) { - mce_init(); - mce_cpu_features(&current_cpu_data); + __mcheck_cpu_init_generic(); + __mcheck_cpu_init_vendor(&current_cpu_data); return 0; } @@ -1627,8 +1713,8 @@ static void mce_cpu_restart(void *data) del_timer_sync(&__get_cpu_var(mce_timer)); if (!mce_available(&current_cpu_data)) return; - mce_init(); - mce_init_timer(); + __mcheck_cpu_init_generic(); + __mcheck_cpu_init_timer(); } /* Reinit MCEs after user configuration changes */ @@ -1654,7 +1740,7 @@ static void mce_enable_ce(void *all) cmci_reenable(); cmci_recheck(); if (all) - mce_init_timer(); + __mcheck_cpu_init_timer(); } static struct sysdev_class mce_sysclass = { @@ -1669,14 +1755,15 @@ DEFINE_PER_CPU(struct sys_device, mce_dev); __cpuinitdata void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); -static struct sysdev_attribute *bank_attrs; +static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr) +{ + return container_of(attr, struct mce_bank, attr); +} static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, char *buf) { - u64 b = bank[attr - bank_attrs]; - - return sprintf(buf, "%llx\n", b); + return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); } static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, @@ -1687,7 +1774,7 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, if (strict_strtoull(buf, 0, &new) < 0) return -EINVAL; - bank[attr - bank_attrs] = new; + attr_to_bank(attr)->ctl = new; mce_restart(); return size; @@ -1829,7 +1916,7 @@ static __cpuinit int mce_create_device(unsigned int cpu) }
for (j = 0; j < banks; j++) { err = sysdev_create_file(&per_cpu(mce_dev, cpu), - &bank_attrs[j]); + &mce_banks[j].attr); if (err) goto error2; } @@ -1838,10 +1925,10 @@ return 0; error2: while (--j >= 0) - sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[j]); + sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr); error: while (--i >= 0) - sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); + sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); sysdev_unregister(&per_cpu(mce_dev, cpu)); @@ -1859,29 +1946,32 @@ static __cpuinit void mce_remove_device(unsigned int cpu) sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); for (i = 0; i < banks; i++) - sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); + sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); sysdev_unregister(&per_cpu(mce_dev, cpu)); cpumask_clear_cpu(cpu, mce_dev_initialized); } /* Make sure there are no machine checks on offlined CPUs. */ -static void mce_disable_cpu(void *h) +static void __cpuinit mce_disable_cpu(void *h) { unsigned long action = *(unsigned long *)h; int i; if (!mce_available(&current_cpu_data)) return; + if (!(action & CPU_TASKS_FROZEN)) cmci_clear(); for (i = 0; i < banks; i++) { - if (!skip_bank_init(i)) - wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + struct mce_bank *b = &mce_banks[i]; + + if (b->init) + wrmsrl(MSR_IA32_MCx_CTL(i), 0); } } -static void mce_reenable_cpu(void *h) +static void __cpuinit mce_reenable_cpu(void *h) { unsigned long action = *(unsigned long *)h; int i; @@ -1892,8 +1982,10 @@ static void mce_reenable_cpu(void *h) if (!(action & CPU_TASKS_FROZEN)) cmci_reenable(); for (i = 0; i < banks; i++) { - if (!skip_bank_init(i)) - wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); + struct mce_bank *b = &mce_banks[i]; + + if (b->init) + wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); } } @@ -1925,7 +2017,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: t->expires = round_jiffies(jiffies + - __get_cpu_var(next_interval)); + __get_cpu_var(mce_next_interval)); add_timer_on(t, cpu); smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); break; @@ -1941,38 +2033,24 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = { .notifier_call = mce_cpu_callback, }; -static __init int mce_init_banks(void) +static __init void mce_init_banks(void) { int i; - bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, - GFP_KERNEL); - if (!bank_attrs) - return -ENOMEM; - for (i = 0; i < banks; i++) { - struct sysdev_attribute *a = &bank_attrs[i]; + struct mce_bank *b = &mce_banks[i]; + struct sysdev_attribute *a = &b->attr; - a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); - if (!a->attr.name) - goto nomem; + a->attr.name = b->attrname; + snprintf(b->attrname, ATTR_LEN, "bank%d", i); a->attr.mode = 0644; a->show = show_bank; a->store = set_bank; } - return 0; - -nomem: - while (--i >= 0) - kfree(bank_attrs[i].attr.name); - kfree(bank_attrs); - bank_attrs = NULL; - - return -ENOMEM; } -static __init int mce_init_device(void) +static __init int mcheck_init_device(void) { int err; int i = 0; @@ -1982,9 +2060,7 @@ static __init int mce_init_device(void) zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); - err = mce_init_banks(); - if (err) - return err; + mce_init_banks(); err = sysdev_class_register(&mce_sysclass); if (err) @@ -2002,59 +2078,67 @@ static __init int mce_init_device(void) return err; }
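As a side note, the show_bank()/set_bank() rework in the hunks above drops the parallel bank_attrs[] array and instead recovers the owning struct mce_bank from the embedded sysdev_attribute via container_of(). The same idiom can be sketched in plain userspace C; the names struct bank, struct attribute and show_ctl below are invented for illustration and do not appear in the patch.

#include <stdio.h>
#include <stddef.h>

/* Same idea as the kernel's container_of(): recover the address of the
 * enclosing structure from a pointer to one of its members. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct attribute {			/* stand-in for struct sysdev_attribute */
	const char *name;
};

struct bank {				/* stand-in for struct mce_bank */
	unsigned long long ctl;
	struct attribute attr;		/* embedded, so no parallel array is needed */
};

static unsigned long long show_ctl(struct attribute *a)
{
	/* Map the attribute pointer straight back to its enclosing bank,
	 * instead of indexing with (attr - bank_attrs). */
	struct bank *b = container_of(a, struct bank, attr);

	return b->ctl;
}

int main(void)
{
	struct bank b = { .ctl = ~0ULL, .attr = { "bank0" } };

	printf("%s: ctl=%llx\n", b.attr.name, show_ctl(&b.attr));
	return 0;
}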
-device_initcall(mce_init_device); - -#else /* CONFIG_X86_OLD_MCE: */ +device_initcall(mcheck_init_device); -int nr_mce_banks; -EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ +/* + * Old style boot options parsing. Only for compatibility. + */ +static int __init mcheck_disable(char *str) +{ + mce_disabled = 1; + return 1; +} +__setup("nomce", mcheck_disable); -/* This has to be run for each processor */ -void mcheck_init(struct cpuinfo_x86 *c) +#ifdef CONFIG_DEBUG_FS +struct dentry *mce_get_debugfs_dir(void) { - if (mce_disabled) - return; + static struct dentry *dmce; - switch (c->x86_vendor) { - case X86_VENDOR_AMD: - amd_mcheck_init(c); - break; + if (!dmce) + dmce = debugfs_create_dir("mce", NULL); - case X86_VENDOR_INTEL: - if (c->x86 == 5) - intel_p5_mcheck_init(c); - if (c->x86 == 6) - intel_p6_mcheck_init(c); - if (c->x86 == 15) - intel_p4_mcheck_init(c); - break; + return dmce; +} - case X86_VENDOR_CENTAUR: - if (c->x86 == 5) - winchip_mcheck_init(c); - break; +static void mce_reset(void) +{ + cpu_missing = 0; + atomic_set(&mce_fake_paniced, 0); + atomic_set(&mce_executing, 0); + atomic_set(&mce_callin, 0); + atomic_set(&global_nwo, 0); +} - default: - break; - } - printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); +static int fake_panic_get(void *data, u64 *val) +{ + *val = fake_panic; + return 0; } -static int __init mcheck_enable(char *str) +static int fake_panic_set(void *data, u64 val) { - mce_p5_enabled = 1; - return 1; + mce_reset(); + fake_panic = val; + return 0; } -__setup("mce", mcheck_enable); -#endif /* CONFIG_X86_OLD_MCE */ +DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get, + fake_panic_set, "%llu\n"); -/* - * Old style boot options parsing. Only for compatibility. - */ -static int __init mcheck_disable(char *str) +static int __init mcheck_debugfs_init(void) { - mce_disabled = 1; - return 1; + struct dentry *dmce, *ffake_panic; + + dmce = mce_get_debugfs_dir(); + if (!dmce) + return -ENOMEM; + ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL, + &fake_panic_fops); + if (!ffake_panic) + return -ENOMEM; + + return 0; } -__setup("nomce", mcheck_disable); +late_initcall(mcheck_debugfs_init); +#endif diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 1fecba404fd8..83a3d1f4efca 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -69,7 +69,7 @@ struct threshold_bank { struct threshold_block *blocks; cpumask_var_t cpus; }; -static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); +static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); #ifdef CONFIG_SMP static unsigned char shared_bank[NR_BANKS] = { @@ -489,8 +489,9 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) int i, err = 0; struct threshold_bank *b = NULL; char name[32]; +#ifdef CONFIG_SMP struct cpuinfo_x86 *c = &cpu_data(cpu); - +#endif sprintf(name, "threshold_bank%i", bank); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index e1acec0f7a32..7c785634af2b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -8,6 +8,7 @@ #include <linux/init.h> #include <linux/interrupt.h> #include <linux/percpu.h> +#include <linux/sched.h> #include <asm/apic.h> #include <asm/processor.h> #include <asm/msr.h> @@ -90,7 +91,7 @@ static void cmci_discover(int banks, int boot) if (test_bit(i, owned)) continue; - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); 
+ rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Already owned by someone else? */ if (val & CMCI_EN) { @@ -101,8 +102,8 @@ static void cmci_discover(int banks, int boot) } val |= CMCI_EN | CMCI_THRESHOLD; - wrmsrl(MSR_IA32_MC0_CTL2 + i, val); - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + wrmsrl(MSR_IA32_MCx_CTL2(i), val); + rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Did the enable bit stick? -- the bank supports CMCI */ if (val & CMCI_EN) { @@ -152,9 +153,9 @@ void cmci_clear(void) if (!test_bit(i, __get_cpu_var(mce_banks_owned))) continue; /* Disable CMCI */ - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + rdmsrl(MSR_IA32_MCx_CTL2(i), val); val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); - wrmsrl(MSR_IA32_MC0_CTL2 + i, val); + wrmsrl(MSR_IA32_MCx_CTL2(i), val); __clear_bit(i, __get_cpu_var(mce_banks_owned)); } spin_unlock_irqrestore(&cmci_discover_lock, flags); diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c deleted file mode 100644 index f5f2d6f71fb6..000000000000 --- a/arch/x86/kernel/cpu/mcheck/non-fatal.c +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Non Fatal Machine Check Exception Reporting - * - * (C) Copyright 2002 Dave Jones. <davej@redhat.com> - * - * This file contains routines to check for non-fatal MCEs every 15s - * - */ -#include <linux/interrupt.h> -#include <linux/workqueue.h> -#include <linux/jiffies.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/smp.h> - -#include <asm/processor.h> -#include <asm/system.h> -#include <asm/mce.h> -#include <asm/msr.h> - -static int firstbank; - -#define MCE_RATE (15*HZ) /* timer rate is 15s */ - -static void mce_checkregs(void *info) -{ - u32 low, high; - int i; - - for (i = firstbank; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - - if (!(high & (1<<31))) - continue; - - printk(KERN_INFO "MCE: The hardware reports a non fatal, " - "correctable incident occurred on CPU %d.\n", - smp_processor_id()); - - printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low); - - /* - * Scrub the error so we don't pick it up in MCE_RATE - * seconds time: - */ - wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); - - /* Serialize: */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } -} - -static void mce_work_fn(struct work_struct *work); -static DECLARE_DELAYED_WORK(mce_work, mce_work_fn); - -static void mce_work_fn(struct work_struct *work) -{ - on_each_cpu(mce_checkregs, NULL, 1); - schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); -} - -static int __init init_nonfatal_mce_checker(void) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - - /* Check for MCE support */ - if (!cpu_has(c, X86_FEATURE_MCE)) - return -ENODEV; - - /* Check for PPro style MCA */ - if (!cpu_has(c, X86_FEATURE_MCA)) - return -ENODEV; - - /* Some Athlons misbehave when we frob bank 0 */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 == 6) - firstbank = 1; - else - firstbank = 0; - - /* - * Check for non-fatal errors every MCE_RATE s - */ - schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); - printk(KERN_INFO "Machine check exception polling timer started.\n"); - - return 0; -} -module_init(init_nonfatal_mce_checker); - -MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c deleted file mode 100644 index 4482aea9aa2e..000000000000 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ /dev/null @@ -1,163 +0,0 @@ -/* - * P4 specific Machine Check Exception Reporting - */ -#include <linux/kernel.h> 
-#include <linux/types.h> -#include <linux/init.h> -#include <linux/smp.h> - -#include <asm/processor.h> -#include <asm/mce.h> -#include <asm/msr.h> - -/* as supported by the P4/Xeon family */ -struct intel_mce_extended_msrs { - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; - u32 esi; - u32 edi; - u32 ebp; - u32 esp; - u32 eflags; - u32 eip; - /* u32 *reserved[]; */ -}; - -static int mce_num_extended_msrs; - -/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ -static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) -{ - u32 h; - - rdmsr(MSR_IA32_MCG_EAX, r->eax, h); - rdmsr(MSR_IA32_MCG_EBX, r->ebx, h); - rdmsr(MSR_IA32_MCG_ECX, r->ecx, h); - rdmsr(MSR_IA32_MCG_EDX, r->edx, h); - rdmsr(MSR_IA32_MCG_ESI, r->esi, h); - rdmsr(MSR_IA32_MCG_EDI, r->edi, h); - rdmsr(MSR_IA32_MCG_EBP, r->ebp, h); - rdmsr(MSR_IA32_MCG_ESP, r->esp, h); - rdmsr(MSR_IA32_MCG_EFLAGS, r->eflags, h); - rdmsr(MSR_IA32_MCG_EIP, r->eip, h); -} - -static void intel_machine_check(struct pt_regs *regs, long error_code) -{ - u32 alow, ahigh, high, low; - u32 mcgstl, mcgsth; - int recover = 1; - int i; - - rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); - if (mcgstl & (1<<0)) /* Recoverable ? */ - recover = 0; - - printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", - smp_processor_id(), mcgsth, mcgstl); - - if (mce_num_extended_msrs > 0) { - struct intel_mce_extended_msrs dbg; - - intel_get_extended_msrs(&dbg); - - printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n" - "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n" - "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", - smp_processor_id(), dbg.eip, dbg.eflags, - dbg.eax, dbg.ebx, dbg.ecx, dbg.edx, - dbg.esi, dbg.edi, dbg.ebp, dbg.esp); - } - - for (i = 0; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high & (1<<31)) { - char misc[20]; - char addr[24]; - - misc[0] = addr[0] = '\0'; - if (high & (1<<29)) - recover |= 1; - if (high & (1<<25)) - recover |= 2; - high &= ~(1<<31); - if (high & (1<<27)) { - rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); - snprintf(misc, 20, "[%08x%08x]", ahigh, alow); - } - if (high & (1<<26)) { - rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - snprintf(addr, 24, " at %08x%08x", ahigh, alow); - } - printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", - smp_processor_id(), i, high, low, misc, addr); - } - } - - if (recover & 2) - panic("CPU context corrupt"); - if (recover & 1) - panic("Unable to continue"); - - printk(KERN_EMERG "Attempting to continue.\n"); - - /* - * Do not clear the MSR_IA32_MCi_STATUS if the error is not - * recoverable/continuable.This will allow BIOS to look at the MSRs - * for errors if the OS could not log the error. - */ - for (i = 0; i < nr_mce_banks; i++) { - u32 msr; - msr = MSR_IA32_MC0_STATUS+i*4; - rdmsr(msr, low, high); - if (high&(1<<31)) { - /* Clear it */ - wrmsr(msr, 0UL, 0UL); - /* Serialize */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } - } - mcgstl &= ~(1<<2); - wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); -} - -void intel_p4_mcheck_init(struct cpuinfo_x86 *c) -{ - u32 l, h; - int i; - - machine_check_vector = intel_machine_check; - wmb(); - - printk(KERN_INFO "Intel machine check architecture supported.\n"); - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<8)) /* Control register present ? 
*/ - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - nr_mce_banks = l & 0xff; - - for (i = 0; i < nr_mce_banks; i++) { - wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); - wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); - } - - set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", - smp_processor_id()); - - /* Check for P4/Xeon extended MCE MSRs */ - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<9)) {/* MCG_EXT_P */ - mce_num_extended_msrs = (l >> 16) & 0xff; - printk(KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)" - " available\n", - smp_processor_id(), mce_num_extended_msrs); - -#ifdef CONFIG_X86_MCE_P4THERMAL - /* Check for P4/Xeon Thermal monitor */ - intel_init_thermal(c); -#endif - } -} diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c deleted file mode 100644 index 01e4f8178183..000000000000 --- a/arch/x86/kernel/cpu/mcheck/p6.c +++ /dev/null @@ -1,127 +0,0 @@ -/* - * P6 specific Machine Check Exception Reporting - * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> - */ -#include <linux/interrupt.h> -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/smp.h> - -#include <asm/processor.h> -#include <asm/system.h> -#include <asm/mce.h> -#include <asm/msr.h> - -/* Machine Check Handler For PII/PIII */ -static void intel_machine_check(struct pt_regs *regs, long error_code) -{ - u32 alow, ahigh, high, low; - u32 mcgstl, mcgsth; - int recover = 1; - int i; - - rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); - if (mcgstl & (1<<0)) /* Recoverable ? */ - recover = 0; - - printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", - smp_processor_id(), mcgsth, mcgstl); - - for (i = 0; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high & (1<<31)) { - char misc[20]; - char addr[24]; - - misc[0] = '\0'; - addr[0] = '\0'; - - if (high & (1<<29)) - recover |= 1; - if (high & (1<<25)) - recover |= 2; - high &= ~(1<<31); - - if (high & (1<<27)) { - rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); - snprintf(misc, 20, "[%08x%08x]", ahigh, alow); - } - if (high & (1<<26)) { - rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - snprintf(addr, 24, " at %08x%08x", ahigh, alow); - } - - printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", - smp_processor_id(), i, high, low, misc, addr); - } - } - - if (recover & 2) - panic("CPU context corrupt"); - if (recover & 1) - panic("Unable to continue"); - - printk(KERN_EMERG "Attempting to continue.\n"); - /* - * Do not clear the MSR_IA32_MCi_STATUS if the error is not - * recoverable/continuable.This will allow BIOS to look at the MSRs - * for errors if the OS could not log the error: - */ - for (i = 0; i < nr_mce_banks; i++) { - unsigned int msr; - - msr = MSR_IA32_MC0_STATUS+i*4; - rdmsr(msr, low, high); - if (high & (1<<31)) { - /* Clear it: */ - wrmsr(msr, 0UL, 0UL); - /* Serialize: */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } - } - mcgstl &= ~(1<<2); - wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); -} - -/* Set up machine check reporting for processors with Intel style MCE: */ -void intel_p6_mcheck_init(struct cpuinfo_x86 *c) -{ - u32 l, h; - int i; - - /* Check for MCE support */ - if (!cpu_has(c, X86_FEATURE_MCE)) - return; - - /* Check for PPro style MCA */ - if (!cpu_has(c, X86_FEATURE_MCA)) - return; - - /* Ok machine check is available */ - machine_check_vector = intel_machine_check; - /* Make sure the vector pointer is visible before we enable MCEs: */ - wmb(); - - printk(KERN_INFO "Intel machine check 
architecture supported.\n"); - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<8)) /* Control register present ? */ - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - nr_mce_banks = l & 0xff; - - /* - * Following the example in IA-32 SDM Vol 3: - * - MC0_CTL should not be written - * - Status registers on all banks should be cleared on reset - */ - for (i = 1; i < nr_mce_banks; i++) - wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); - - for (i = 0; i < nr_mce_banks; i++) - wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); - - set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", - smp_processor_id()); -} diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 5957a93e5173..4fef985fc221 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -34,20 +34,33 @@ /* How long to wait between reporting thermal events */ #define CHECK_INTERVAL (300 * HZ) -static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; -static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); -static DEFINE_PER_CPU(bool, thermal_throttle_active); +/* + * Current thermal throttling state: + */ +struct thermal_state { + bool is_throttled; + + u64 next_check; + unsigned long throttle_count; + unsigned long last_throttle_count; +}; + +static DEFINE_PER_CPU(struct thermal_state, thermal_state); + +static atomic_t therm_throt_en = ATOMIC_INIT(0); -static atomic_t therm_throt_en = ATOMIC_INIT(0); +static u32 lvtthmr_init __read_mostly; #ifdef CONFIG_SYSFS #define define_therm_throt_sysdev_one_ro(_name) \ static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) #define define_therm_throt_sysdev_show_func(name) \ -static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ - struct sysdev_attribute *attr, \ - char *buf) \ + \ +static ssize_t therm_throt_sysdev_show_##name( \ + struct sys_device *dev, \ + struct sysdev_attribute *attr, \ + char *buf) \ { \ unsigned int cpu = dev->id; \ ssize_t ret; \ @@ -55,7 +68,7 @@ static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ preempt_disable(); /* CPU hotplug */ \ if (cpu_online(cpu)) \ ret = sprintf(buf, "%lu\n", \ - per_cpu(thermal_throttle_##name, cpu)); \ + per_cpu(thermal_state, cpu).name); \ else \ ret = 0; \ preempt_enable(); \ @@ -63,11 +76,11 @@ static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ return ret; \ } -define_therm_throt_sysdev_show_func(count); -define_therm_throt_sysdev_one_ro(count); +define_therm_throt_sysdev_show_func(throttle_count); +define_therm_throt_sysdev_one_ro(throttle_count); static struct attribute *thermal_throttle_attrs[] = { - &attr_count.attr, + &attr_throttle_count.attr, NULL }; @@ -93,33 +106,39 @@ static struct attribute_group thermal_throttle_attr_group = { * 1 : Event should be logged further, and a message has been * printed to the syslog. 
*/ -static int therm_throt_process(int curr) +static int therm_throt_process(bool is_throttled) { - unsigned int cpu = smp_processor_id(); - __u64 tmp_jiffs = get_jiffies_64(); - bool was_throttled = __get_cpu_var(thermal_throttle_active); - bool is_throttled = __get_cpu_var(thermal_throttle_active) = curr; + struct thermal_state *state; + unsigned int this_cpu; + bool was_throttled; + u64 now; + + this_cpu = smp_processor_id(); + now = get_jiffies_64(); + state = &per_cpu(thermal_state, this_cpu); + + was_throttled = state->is_throttled; + state->is_throttled = is_throttled; if (is_throttled) - __get_cpu_var(thermal_throttle_count)++; + state->throttle_count++; - if (!(was_throttled ^ is_throttled) && - time_before64(tmp_jiffs, __get_cpu_var(next_check))) + if (time_before64(now, state->next_check) && + state->throttle_count != state->last_throttle_count) return 0; - __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL; + state->next_check = now + CHECK_INTERVAL; + state->last_throttle_count = state->throttle_count; /* if we just entered the thermal event */ if (is_throttled) { - printk(KERN_CRIT "CPU%d: Temperature above threshold, " - "cpu clock throttled (total events = %lu)\n", - cpu, __get_cpu_var(thermal_throttle_count)); + printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count); add_taint(TAINT_MACHINE_CHECK); return 1; } if (was_throttled) { - printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu); + printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu); return 1; } @@ -213,7 +232,7 @@ static void intel_thermal_interrupt(void) __u64 msr_val; rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT)) + if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0)) mce_log_therm_throt_event(msr_val); } @@ -237,6 +256,18 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) ack_APIC_irq(); } +void __init mcheck_intel_therm_init(void) +{ + /* + * This function is only called on boot CPU. Save the init thermal + * LVT value on BSP and use that value to restore APs' thermal LVT + * entry BIOS programmed later + */ + if (cpu_has(&boot_cpu_data, X86_FEATURE_ACPI) && + cpu_has(&boot_cpu_data, X86_FEATURE_ACC)) + lvtthmr_init = apic_read(APIC_LVTTHMR); +} + void intel_init_thermal(struct cpuinfo_x86 *c) { unsigned int cpu = smp_processor_id(); @@ -253,16 +284,26 @@ void intel_init_thermal(struct cpuinfo_x86 *c) * since it might be delivered via SMI already: */ rdmsr(MSR_IA32_MISC_ENABLE, l, h); - h = apic_read(APIC_LVTTHMR); + + /* + * The initial value of thermal LVT entries on all APs always reads + * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI + * sequence to them and LVT registers are reset to 0s except for + * the mask bits which are set to 1s when APs receive INIT IPI. 
+ * Always restore the value that BIOS has programmed on AP based on + * BSP's info we saved since BIOS is always setting the same value + * for all threads/cores + */ + apic_write(APIC_LVTTHMR, lvtthmr_init); + + h = lvtthmr_init; + if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", cpu); return; } - if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) - tm2 = 1; - /* Check whether a vector already exists */ if (h & APIC_VECTOR_MASK) { printk(KERN_DEBUG @@ -271,6 +312,16 @@ void intel_init_thermal(struct cpuinfo_x86 *c) return; } + /* early Pentium M models use different method for enabling TM2 */ + if (cpu_has(c, X86_FEATURE_TM2)) { + if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) { + rdmsr(MSR_THERM2_CTL, l, h); + if (l & MSR_THERM2_CTL_TM_SELECT) + tm2 = 1; + } else if (l & MSR_IA32_MISC_ENABLE_TM2) + tm2 = 1; + } + /* We'll mask the thermal vector in the lapic till we're ready: */ h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; apic_write(APIC_LVTTHMR, h); diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 315738c74aad..73c86db5acbe 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -846,7 +846,7 @@ int __init mtrr_cleanup(unsigned address_bits) sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); range_sums = sum_ranges(range, nr_range); - printk(KERN_INFO "total RAM coverred: %ldM\n", + printk(KERN_INFO "total RAM covered: %ldM\n", range_sums >> (20 - PAGE_SHIFT)); if (mtrr_chunk_size && mtrr_gran_size) { diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index 08b6ea4c62b4..3c1b12d461d1 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -96,17 +96,24 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) unsigned long long base, size; char *ptr; char line[LINE_SIZE]; + int length; size_t linelen; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!len) - return -EINVAL; memset(line, 0, LINE_SIZE); - if (len > LINE_SIZE) - len = LINE_SIZE; - if (copy_from_user(line, buf, len - 1)) + + length = len; + length--; + + if (length > LINE_SIZE - 1) + length = LINE_SIZE - 1; + + if (length < 0) + return -EINVAL; + + if (copy_from_user(line, buf, length)) return -EFAULT; linelen = strlen(line); @@ -126,8 +133,8 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) return -EINVAL; base = simple_strtoull(line + 5, &ptr, 0); - for (; isspace(*ptr); ++ptr) - ; + while (isspace(*ptr)) + ptr++; if (strncmp(ptr, "size=", 5)) return -EINVAL; @@ -135,14 +142,14 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) size = simple_strtoull(ptr + 5, &ptr, 0); if ((base & 0xfff) || (size & 0xfff)) return -EINVAL; - for (; isspace(*ptr); ++ptr) - ; + while (isspace(*ptr)) + ptr++; if (strncmp(ptr, "type=", 5)) return -EINVAL; ptr += 5; - for (; isspace(*ptr); ++ptr) - ; + while (isspace(*ptr)) + ptr++; for (i = 0; i < MTRR_NUM_TYPES; ++i) { if (strcmp(ptr, mtrr_strings[i])) diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 7af0f88a4163..84e83de54575 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -58,6 +58,7 @@ unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; +static bool mtrr_aps_delayed_init; static struct mtrr_ops 
*mtrr_ops[X86_VENDOR_NUM]; @@ -163,7 +164,10 @@ static void ipi_handler(void *info) if (data->smp_reg != ~0U) { mtrr_if->set(data->smp_reg, data->smp_base, data->smp_size, data->smp_type); - } else { + } else if (mtrr_aps_delayed_init) { + /* + * Initialize the MTRRs inaddition to the synchronisation. + */ mtrr_if->set_all(); } @@ -265,6 +269,8 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ */ if (reg != ~0U) mtrr_if->set(reg, base, size, type); + else if (!mtrr_aps_delayed_init) + mtrr_if->set_all(); /* Wait for the others */ while (atomic_read(&data.count)) @@ -721,9 +727,7 @@ void __init mtrr_bp_init(void) void mtrr_ap_init(void) { - unsigned long flags; - - if (!mtrr_if || !use_intel()) + if (!use_intel() || mtrr_aps_delayed_init) return; /* * Ideally we should hold mtrr_mutex here to avoid mtrr entries @@ -738,11 +742,7 @@ void mtrr_ap_init(void) * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug * lock to prevent mtrr entry changes */ - local_irq_save(flags); - - mtrr_if->set_all(); - - local_irq_restore(flags); + set_mtrr(~0U, 0, 0, 0); } /** @@ -753,6 +753,34 @@ void mtrr_save_state(void) smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1); } +void set_mtrr_aps_delayed_init(void) +{ + if (!use_intel()) + return; + + mtrr_aps_delayed_init = true; +} + +/* + * MTRR initialization for all AP's + */ +void mtrr_aps_init(void) +{ + if (!use_intel()) + return; + + set_mtrr(~0U, 0, 0, 0); + mtrr_aps_delayed_init = false; +} + +void mtrr_bp_restore(void) +{ + if (!use_intel()) + return; + + mtrr_if->set_all(); +} + static int __init mtrr_init_finialize(void) { if (!mtrr_if) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_event.c index f9cd0849bd42..c1bbed1021d9 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1,5 +1,5 @@ /* - * Performance counter x86 architecture code + * Performance events x86 architecture code * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar @@ -11,7 +11,7 @@ * For licencing details see kernel-base/COPYING */ -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <linux/capability.h> #include <linux/notifier.h> #include <linux/hardirq.h> @@ -27,19 +27,19 @@ #include <asm/stacktrace.h> #include <asm/nmi.h> -static u64 perf_counter_mask __read_mostly; +static u64 perf_event_mask __read_mostly; -/* The maximal number of PEBS counters: */ -#define MAX_PEBS_COUNTERS 4 +/* The maximal number of PEBS events: */ +#define MAX_PEBS_EVENTS 4 /* The size of a BTS record in bytes: */ #define BTS_RECORD_SIZE 24 /* The size of a per-cpu BTS buffer in bytes: */ -#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 1024) +#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 2048) /* The BTS overflow threshold in bytes from the end of the buffer: */ -#define BTS_OVFL_TH (BTS_RECORD_SIZE * 64) +#define BTS_OVFL_TH (BTS_RECORD_SIZE * 128) /* @@ -65,11 +65,11 @@ struct debug_store { u64 pebs_index; u64 pebs_absolute_maximum; u64 pebs_interrupt_threshold; - u64 pebs_counter_reset[MAX_PEBS_COUNTERS]; + u64 pebs_event_reset[MAX_PEBS_EVENTS]; }; -struct cpu_hw_counters { - struct perf_counter *counters[X86_PMC_IDX_MAX]; +struct cpu_hw_events { + struct perf_event *events[X86_PMC_IDX_MAX]; unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; @@ -77,6 +77,18 @@ struct cpu_hw_counters { struct debug_store *ds; }; +struct 
event_constraint { + unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + int code; +}; + +#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) } +#define EVENT_CONSTRAINT_END { .code = 0, .idxmsk[0] = 0 } + +#define for_each_event_constraint(e, c) \ + for ((e) = (c); (e)->idxmsk[0]; (e)++) + + /* * struct x86_pmu - generic x86 pmu */ @@ -86,30 +98,34 @@ struct x86_pmu { int (*handle_irq)(struct pt_regs *); void (*disable_all)(void); void (*enable_all)(void); - void (*enable)(struct hw_perf_counter *, int); - void (*disable)(struct hw_perf_counter *, int); + void (*enable)(struct hw_perf_event *, int); + void (*disable)(struct hw_perf_event *, int); unsigned eventsel; unsigned perfctr; u64 (*event_map)(int); u64 (*raw_event)(u64); int max_events; - int num_counters; - int num_counters_fixed; - int counter_bits; - u64 counter_mask; + int num_events; + int num_events_fixed; + int event_bits; + u64 event_mask; int apic; u64 max_period; u64 intel_ctrl; void (*enable_bts)(u64 config); void (*disable_bts)(void); + int (*get_event_idx)(struct cpu_hw_events *cpuc, + struct hw_perf_event *hwc); }; static struct x86_pmu x86_pmu __read_mostly; -static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { +static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; +static const struct event_constraint *event_constraints; + /* * Not sure about some of these */ @@ -124,37 +140,47 @@ static const u64 p6_perfmon_event_map[] = [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, }; -static u64 p6_pmu_event_map(int event) +static u64 p6_pmu_event_map(int hw_event) { - return p6_perfmon_event_map[event]; + return p6_perfmon_event_map[hw_event]; } /* - * Counter setting that is specified not to count anything. + * Event setting that is specified not to count anything. * We use this to effectively disable a counter. * * L2_RQSTS with 0 MESI unit mask. */ -#define P6_NOP_COUNTER 0x0000002EULL +#define P6_NOP_EVENT 0x0000002EULL -static u64 p6_pmu_raw_event(u64 event) +static u64 p6_pmu_raw_event(u64 hw_event) { #define P6_EVNTSEL_EVENT_MASK 0x000000FFULL #define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL #define P6_EVNTSEL_EDGE_MASK 0x00040000ULL #define P6_EVNTSEL_INV_MASK 0x00800000ULL -#define P6_EVNTSEL_COUNTER_MASK 0xFF000000ULL +#define P6_EVNTSEL_REG_MASK 0xFF000000ULL #define P6_EVNTSEL_MASK \ (P6_EVNTSEL_EVENT_MASK | \ P6_EVNTSEL_UNIT_MASK | \ P6_EVNTSEL_EDGE_MASK | \ P6_EVNTSEL_INV_MASK | \ - P6_EVNTSEL_COUNTER_MASK) + P6_EVNTSEL_REG_MASK) - return event & P6_EVNTSEL_MASK; + return hw_event & P6_EVNTSEL_MASK; } +static const struct event_constraint intel_p6_event_constraints[] = +{ + EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ + EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ + EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */ + EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + EVENT_CONSTRAINT_END +}; /* * Intel PerfMon v3. Used on Core2 and later. 
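(A note on the constraint scheme introduced above: each `EVENT_CONSTRAINT(code, mask)` entry says that the event with that select code may only be scheduled on the counter indices whose bits are set in the mask, and the new `get_event_idx` callback consults this table before falling back to a first-free-counter search — see `intel_get_event_idx()` and `gen_get_event_idx()` later in this diff. The stand-alone sketch below illustrates that allocation logic only; the counter count, the table entries and the helper names are invented for the example, and it deliberately leaves out the kernel's per-CPU state and atomic bit operations.)

```c
/*
 * Simplified user-space sketch of constraint-aware counter allocation.
 * Not the kernel code: the table and NUM_COUNTERS are example values.
 */
#include <stdio.h>

#define NUM_COUNTERS 4

struct event_constraint {
	unsigned long idxmsk;	/* counter indices this event may use */
	int code;		/* event select code */
};

#define EVENT_CONSTRAINT(c, m)	{ .code = (c), .idxmsk = (m) }
#define EVENT_CONSTRAINT_END	{ .code = 0, .idxmsk = 0 }

static const struct event_constraint constraints[] = {
	EVENT_CONSTRAINT(0x12, 0x2),	/* MUL: counter 1 only */
	EVENT_CONSTRAINT(0x14, 0x1),	/* CYCLES_DIV_BUSY: counter 0 only */
	EVENT_CONSTRAINT_END
};

/* Generic fallback: first free counter, no constraints. */
static int gen_get_event_idx(unsigned long *used_mask)
{
	int i;

	for (i = 0; i < NUM_COUNTERS; i++)
		if (!(*used_mask & (1UL << i)))
			return i;
	return -1;
}

/* Constraint-aware allocation: honour idxmsk if the code is constrained. */
static int get_event_idx(unsigned long *used_mask, int code)
{
	const struct event_constraint *c;
	int i;

	for (c = constraints; c->idxmsk; c++) {
		if (c->code != code)
			continue;
		for (i = 0; i < NUM_COUNTERS; i++) {
			if ((c->idxmsk & (1UL << i)) &&
			    !(*used_mask & (1UL << i))) {
				*used_mask |= 1UL << i;
				return i;
			}
		}
		return -1;	/* all allowed slots already busy */
	}

	i = gen_get_event_idx(used_mask);
	if (i >= 0)
		*used_mask |= 1UL << i;
	return i;
}

int main(void)
{
	unsigned long used = 0;

	printf("MUL           -> counter %d\n", get_event_idx(&used, 0x12));
	printf("CYCLES_DIV    -> counter %d\n", get_event_idx(&used, 0x14));
	printf("unconstrained -> counter %d\n", get_event_idx(&used, 0xc0));
	return 0;
}
```

In the kernel itself the used-slot bookkeeping lives in `cpuc->used_mask` and is manipulated with `test_and_set_bit()` from `x86_schedule_event()`; the sketch flattens that into a plain bitmask for readability.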
@@ -170,16 +196,45 @@ static const u64 intel_perfmon_event_map[] = [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, }; -static u64 intel_pmu_event_map(int event) +static const struct event_constraint intel_core_event_constraints[] = +{ + EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ + EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ + EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */ + EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ + EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */ + EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */ + EVENT_CONSTRAINT_END +}; + +static const struct event_constraint intel_nehalem_event_constraints[] = +{ + EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ + EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ + EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ + EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */ + EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */ + EVENT_CONSTRAINT(0x4c, 0x3), /* LOAD_HIT_PRE */ + EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ + EVENT_CONSTRAINT(0x52, 0x3), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */ + EVENT_CONSTRAINT(0x53, 0x3), /* L1D_CACHE_LOCK_FB_HIT */ + EVENT_CONSTRAINT(0xc5, 0x3), /* CACHE_LOCK_CYCLES */ + EVENT_CONSTRAINT_END +}; + +static u64 intel_pmu_event_map(int hw_event) { - return intel_perfmon_event_map[event]; + return intel_perfmon_event_map[hw_event]; } /* - * Generalized hw caching related event table, filled + * Generalized hw caching related hw_event table, filled * in on a per model basis. A value of 0 means - * 'not supported', -1 means 'event makes no sense on - * this CPU', any other value means the raw event + * 'not supported', -1 means 'hw_event makes no sense on + * this CPU', any other value means the raw hw_event * ID. 
*/ @@ -190,7 +245,7 @@ static u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; -static const u64 nehalem_hw_cache_event_ids +static __initconst u64 nehalem_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = @@ -281,7 +336,7 @@ static const u64 nehalem_hw_cache_event_ids }, }; -static const u64 core2_hw_cache_event_ids +static __initconst u64 core2_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = @@ -372,7 +427,7 @@ static const u64 core2_hw_cache_event_ids }, }; -static const u64 atom_hw_cache_event_ids +static __initconst u64 atom_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = @@ -463,25 +518,25 @@ static const u64 atom_hw_cache_event_ids }, }; -static u64 intel_pmu_raw_event(u64 event) +static u64 intel_pmu_raw_event(u64 hw_event) { #define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL #define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL #define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL #define CORE_EVNTSEL_INV_MASK 0x00800000ULL -#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL +#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL #define CORE_EVNTSEL_MASK \ (CORE_EVNTSEL_EVENT_MASK | \ CORE_EVNTSEL_UNIT_MASK | \ CORE_EVNTSEL_EDGE_MASK | \ CORE_EVNTSEL_INV_MASK | \ - CORE_EVNTSEL_COUNTER_MASK) + CORE_EVNTSEL_REG_MASK) - return event & CORE_EVNTSEL_MASK; + return hw_event & CORE_EVNTSEL_MASK; } -static const u64 amd_hw_cache_event_ids +static __initconst u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = @@ -585,39 +640,39 @@ static const u64 amd_perfmon_event_map[] = [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, }; -static u64 amd_pmu_event_map(int event) +static u64 amd_pmu_event_map(int hw_event) { - return amd_perfmon_event_map[event]; + return amd_perfmon_event_map[hw_event]; } -static u64 amd_pmu_raw_event(u64 event) +static u64 amd_pmu_raw_event(u64 hw_event) { #define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL #define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL #define K7_EVNTSEL_EDGE_MASK 0x000040000ULL #define K7_EVNTSEL_INV_MASK 0x000800000ULL -#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL +#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL #define K7_EVNTSEL_MASK \ (K7_EVNTSEL_EVENT_MASK | \ K7_EVNTSEL_UNIT_MASK | \ K7_EVNTSEL_EDGE_MASK | \ K7_EVNTSEL_INV_MASK | \ - K7_EVNTSEL_COUNTER_MASK) + K7_EVNTSEL_REG_MASK) - return event & K7_EVNTSEL_MASK; + return hw_event & K7_EVNTSEL_MASK; } /* - * Propagate counter elapsed time into the generic counter. - * Can only be executed on the CPU where the counter is active. + * Propagate event elapsed time into the generic event. + * Can only be executed on the CPU where the event is active. * Returns the delta events processed. */ static u64 -x86_perf_counter_update(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) +x86_perf_event_update(struct perf_event *event, + struct hw_perf_event *hwc, int idx) { - int shift = 64 - x86_pmu.counter_bits; + int shift = 64 - x86_pmu.event_bits; u64 prev_raw_count, new_raw_count; s64 delta; @@ -625,15 +680,15 @@ x86_perf_counter_update(struct perf_counter *counter, return 0; /* - * Careful: an NMI might modify the previous counter value. + * Careful: an NMI might modify the previous event value. 
* * Our tactic to handle this is to first atomically read and * exchange a new raw count - then add that new-prev delta - * count to the generic counter atomically: + * count to the generic event atomically: */ again: prev_raw_count = atomic64_read(&hwc->prev_count); - rdmsrl(hwc->counter_base + idx, new_raw_count); + rdmsrl(hwc->event_base + idx, new_raw_count); if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) @@ -642,7 +697,7 @@ again: /* * Now we have the new raw value and have updated the prev * timestamp already. We can now calculate the elapsed delta - * (counter-)time and add that to the generic counter. + * (event-)time and add that to the generic event. * * Careful, not all hw sign-extends above the physical width * of the count. @@ -650,13 +705,13 @@ again: delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; - atomic64_add(delta, &counter->count); + atomic64_add(delta, &event->count); atomic64_sub(delta, &hwc->period_left); return new_raw_count; } -static atomic_t active_counters; +static atomic_t active_events; static DEFINE_MUTEX(pmc_reserve_mutex); static bool reserve_pmc_hardware(void) @@ -667,12 +722,12 @@ static bool reserve_pmc_hardware(void) if (nmi_watchdog == NMI_LOCAL_APIC) disable_lapic_nmi_watchdog(); - for (i = 0; i < x86_pmu.num_counters; i++) { + for (i = 0; i < x86_pmu.num_events; i++) { if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) goto perfctr_fail; } - for (i = 0; i < x86_pmu.num_counters; i++) { + for (i = 0; i < x86_pmu.num_events; i++) { if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) goto eventsel_fail; } @@ -685,7 +740,7 @@ eventsel_fail: for (i--; i >= 0; i--) release_evntsel_nmi(x86_pmu.eventsel + i); - i = x86_pmu.num_counters; + i = x86_pmu.num_events; perfctr_fail: for (i--; i >= 0; i--) @@ -703,7 +758,7 @@ static void release_pmc_hardware(void) #ifdef CONFIG_X86_LOCAL_APIC int i; - for (i = 0; i < x86_pmu.num_counters; i++) { + for (i = 0; i < x86_pmu.num_events; i++) { release_perfctr_nmi(x86_pmu.perfctr + i); release_evntsel_nmi(x86_pmu.eventsel + i); } @@ -720,7 +775,7 @@ static inline bool bts_available(void) static inline void init_debug_store_on_cpu(int cpu) { - struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds; + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; if (!ds) return; @@ -732,7 +787,7 @@ static inline void init_debug_store_on_cpu(int cpu) static inline void fini_debug_store_on_cpu(int cpu) { - if (!per_cpu(cpu_hw_counters, cpu).ds) + if (!per_cpu(cpu_hw_events, cpu).ds) return; wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); @@ -751,12 +806,12 @@ static void release_bts_hardware(void) fini_debug_store_on_cpu(cpu); for_each_possible_cpu(cpu) { - struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds; + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; if (!ds) continue; - per_cpu(cpu_hw_counters, cpu).ds = NULL; + per_cpu(cpu_hw_events, cpu).ds = NULL; kfree((void *)(unsigned long)ds->bts_buffer_base); kfree(ds); @@ -796,7 +851,7 @@ static int reserve_bts_hardware(void) ds->bts_interrupt_threshold = ds->bts_absolute_maximum - BTS_OVFL_TH; - per_cpu(cpu_hw_counters, cpu).ds = ds; + per_cpu(cpu_hw_events, cpu).ds = ds; err = 0; } @@ -812,9 +867,9 @@ static int reserve_bts_hardware(void) return err; } -static void hw_perf_counter_destroy(struct perf_counter *counter) +static void hw_perf_event_destroy(struct perf_event *event) { - if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) { + if (atomic_dec_and_mutex_lock(&active_events, 
&pmc_reserve_mutex)) { release_pmc_hardware(); release_bts_hardware(); mutex_unlock(&pmc_reserve_mutex); @@ -827,7 +882,7 @@ static inline int x86_pmu_initialized(void) } static inline int -set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr) +set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr) { unsigned int cache_type, cache_op, cache_result; u64 config, val; @@ -880,7 +935,7 @@ static void intel_pmu_enable_bts(u64 config) static void intel_pmu_disable_bts(void) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); unsigned long debugctlmsr; if (!cpuc->ds) @@ -898,10 +953,10 @@ static void intel_pmu_disable_bts(void) /* * Setup the hardware configuration for a given attr_type */ -static int __hw_perf_counter_init(struct perf_counter *counter) +static int __hw_perf_event_init(struct perf_event *event) { - struct perf_counter_attr *attr = &counter->attr; - struct hw_perf_counter *hwc = &counter->hw; + struct perf_event_attr *attr = &event->attr; + struct hw_perf_event *hwc = &event->hw; u64 config; int err; @@ -909,27 +964,31 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return -ENODEV; err = 0; - if (!atomic_inc_not_zero(&active_counters)) { + if (!atomic_inc_not_zero(&active_events)) { mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&active_counters) == 0) { + if (atomic_read(&active_events) == 0) { if (!reserve_pmc_hardware()) err = -EBUSY; else err = reserve_bts_hardware(); } if (!err) - atomic_inc(&active_counters); + atomic_inc(&active_events); mutex_unlock(&pmc_reserve_mutex); } if (err) return err; + event->destroy = hw_perf_event_destroy; + /* * Generate PMC IRQs: * (keep 'enabled' bit clear for now) */ hwc->config = ARCH_PERFMON_EVENTSEL_INT; + hwc->idx = -1; + /* * Count user and OS events unless requested not to. 
*/ @@ -946,17 +1005,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * If we have a PMU initialized but no APIC * interrupts, we cannot sample hardware - * counters (user-space has to fall back and - * sample via a hrtimer based software counter): + * events (user-space has to fall back and + * sample via a hrtimer based software event): */ if (!x86_pmu.apic) return -EOPNOTSUPP; } - counter->destroy = hw_perf_counter_destroy; - /* - * Raw event type provide the config in the event structure + * Raw hw_event type provide the config in the hw_event structure */ if (attr->type == PERF_TYPE_RAW) { hwc->config |= x86_pmu.raw_event(attr->config); @@ -1001,7 +1058,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) static void p6_pmu_disable_all(void) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); u64 val; if (!cpuc->enabled) @@ -1018,7 +1075,7 @@ static void p6_pmu_disable_all(void) static void intel_pmu_disable_all(void) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); if (!cpuc->enabled) return; @@ -1034,7 +1091,7 @@ static void intel_pmu_disable_all(void) static void amd_pmu_disable_all(void) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx; if (!cpuc->enabled) @@ -1043,12 +1100,12 @@ static void amd_pmu_disable_all(void) cpuc->enabled = 0; /* * ensure we write the disable before we start disabling the - * counters proper, so that amd_pmu_enable_counter() does the + * events proper, so that amd_pmu_enable_event() does the * right thing. */ barrier(); - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for (idx = 0; idx < x86_pmu.num_events; idx++) { u64 val; if (!test_bit(idx, cpuc->active_mask)) @@ -1070,7 +1127,7 @@ void hw_perf_disable(void) static void p6_pmu_enable_all(void) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); unsigned long val; if (cpuc->enabled) @@ -1087,7 +1144,7 @@ static void p6_pmu_enable_all(void) static void intel_pmu_enable_all(void) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); if (cpuc->enabled) return; @@ -1098,19 +1155,19 @@ static void intel_pmu_enable_all(void) wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { - struct perf_counter *counter = - cpuc->counters[X86_PMC_IDX_FIXED_BTS]; + struct perf_event *event = + cpuc->events[X86_PMC_IDX_FIXED_BTS]; - if (WARN_ON_ONCE(!counter)) + if (WARN_ON_ONCE(!event)) return; - intel_pmu_enable_bts(counter->hw.config); + intel_pmu_enable_bts(event->hw.config); } } static void amd_pmu_enable_all(void) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx; if (cpuc->enabled) @@ -1119,14 +1176,14 @@ static void amd_pmu_enable_all(void) cpuc->enabled = 1; barrier(); - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - struct perf_counter *counter = cpuc->counters[idx]; + for (idx = 0; idx < x86_pmu.num_events; idx++) { + struct perf_event *event = cpuc->events[idx]; u64 val; if (!test_bit(idx, cpuc->active_mask)) continue; - val = counter->hw.config; + val = event->hw.config; val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 
wrmsrl(MSR_K7_EVNTSEL0 + idx, val); } @@ -1153,19 +1210,19 @@ static inline void intel_pmu_ack_status(u64 ack) wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); } -static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +static inline void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) { (void)checking_wrmsrl(hwc->config_base + idx, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); } -static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx) { (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); } static inline void -intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) +intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, mask; @@ -1178,10 +1235,10 @@ intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) } static inline void -p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +p6_pmu_disable_event(struct hw_perf_event *hwc, int idx) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - u64 val = P6_NOP_COUNTER; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + u64 val = P6_NOP_EVENT; if (cpuc->enabled) val |= ARCH_PERFMON_EVENTSEL0_ENABLE; @@ -1190,7 +1247,7 @@ p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) } static inline void -intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +intel_pmu_disable_event(struct hw_perf_event *hwc, int idx) { if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { intel_pmu_disable_bts(); @@ -1202,24 +1259,24 @@ intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) return; } - x86_pmu_disable_counter(hwc, idx); + x86_pmu_disable_event(hwc, idx); } static inline void -amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +amd_pmu_disable_event(struct hw_perf_event *hwc, int idx) { - x86_pmu_disable_counter(hwc, idx); + x86_pmu_disable_event(hwc, idx); } -static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); +static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); /* * Set the next IRQ period, based on the hwc->period_left value. 
- * To be called with the counter disabled in hw: + * To be called with the event disabled in hw: */ static int -x86_perf_counter_set_period(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) +x86_perf_event_set_period(struct perf_event *event, + struct hw_perf_event *hwc, int idx) { s64 left = atomic64_read(&hwc->period_left); s64 period = hwc->sample_period; @@ -1245,7 +1302,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, ret = 1; } /* - * Quirk: certain CPUs dont like it if just 1 event is left: + * Quirk: certain CPUs dont like it if just 1 hw_event is left: */ if (unlikely(left < 2)) left = 2; @@ -1253,24 +1310,24 @@ x86_perf_counter_set_period(struct perf_counter *counter, if (left > x86_pmu.max_period) left = x86_pmu.max_period; - per_cpu(prev_left[idx], smp_processor_id()) = left; + per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; /* - * The hw counter starts counting from this counter offset, + * The hw event starts counting from this event offset, * mark it to be able to extra future deltas: */ atomic64_set(&hwc->prev_count, (u64)-left); - err = checking_wrmsrl(hwc->counter_base + idx, - (u64)(-left) & x86_pmu.counter_mask); + err = checking_wrmsrl(hwc->event_base + idx, + (u64)(-left) & x86_pmu.event_mask); - perf_counter_update_userpage(counter); + perf_event_update_userpage(event); return ret; } static inline void -intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) +intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, bits, mask; @@ -1295,9 +1352,9 @@ intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) err = checking_wrmsrl(hwc->config_base, ctrl_val); } -static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); u64 val; val = hwc->config; @@ -1308,10 +1365,10 @@ static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) } -static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx) { if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { - if (!__get_cpu_var(cpu_hw_counters).enabled) + if (!__get_cpu_var(cpu_hw_events).enabled) return; intel_pmu_enable_bts(hwc->config); @@ -1323,134 +1380,189 @@ static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) return; } - x86_pmu_enable_counter(hwc, idx); + x86_pmu_enable_event(hwc, idx); } -static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); if (cpuc->enabled) - x86_pmu_enable_counter(hwc, idx); + x86_pmu_enable_event(hwc, idx); } -static int -fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) +static int fixed_mode_idx(struct hw_perf_event *hwc) { - unsigned int event; + unsigned int hw_event; - event = hwc->config & ARCH_PERFMON_EVENT_MASK; + hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK; - if (unlikely((event == + if (unlikely((hw_event == x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && (hwc->sample_period == 1))) return X86_PMC_IDX_FIXED_BTS; - if (!x86_pmu.num_counters_fixed) + if (!x86_pmu.num_events_fixed) return -1; - if (unlikely(event 
== x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) + /* + * fixed counters do not take all possible filters + */ + if (hwc->config & ARCH_PERFMON_EVENT_FILTER_MASK) + return -1; + + if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; - if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) + if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) return X86_PMC_IDX_FIXED_CPU_CYCLES; - if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) + if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) return X86_PMC_IDX_FIXED_BUS_CYCLES; return -1; } /* - * Find a PMC slot for the freshly enabled / scheduled in counter: + * generic counter allocator: get next free counter */ -static int x86_pmu_enable(struct perf_counter *counter) +static int +gen_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - struct hw_perf_counter *hwc = &counter->hw; int idx; - idx = fixed_mode_idx(counter, hwc); + idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events); + return idx == x86_pmu.num_events ? -1 : idx; +} + +/* + * intel-specific counter allocator: check event constraints + */ +static int +intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) +{ + const struct event_constraint *event_constraint; + int i, code; + + if (!event_constraints) + goto skip; + + code = hwc->config & CORE_EVNTSEL_EVENT_MASK; + + for_each_event_constraint(event_constraint, event_constraints) { + if (code == event_constraint->code) { + for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) { + if (!test_and_set_bit(i, cpuc->used_mask)) + return i; + } + return -1; + } + } +skip: + return gen_get_event_idx(cpuc, hwc); +} + +static int +x86_schedule_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) +{ + int idx; + + idx = fixed_mode_idx(hwc); if (idx == X86_PMC_IDX_FIXED_BTS) { /* BTS is already occupied. */ if (test_and_set_bit(idx, cpuc->used_mask)) return -EAGAIN; hwc->config_base = 0; - hwc->counter_base = 0; + hwc->event_base = 0; hwc->idx = idx; } else if (idx >= 0) { /* - * Try to get the fixed counter, if that is already taken - * then try to get a generic counter: + * Try to get the fixed event, if that is already taken + * then try to get a generic event: */ if (test_and_set_bit(idx, cpuc->used_mask)) goto try_generic; hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; /* - * We set it so that counter_base + idx in wrmsr/rdmsr maps to + * We set it so that event_base + idx in wrmsr/rdmsr maps to * MSR_ARCH_PERFMON_FIXED_CTR0 ... 
CTR2: */ - hwc->counter_base = + hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; hwc->idx = idx; } else { idx = hwc->idx; - /* Try to get the previous generic counter again */ - if (test_and_set_bit(idx, cpuc->used_mask)) { + /* Try to get the previous generic event again */ + if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) { try_generic: - idx = find_first_zero_bit(cpuc->used_mask, - x86_pmu.num_counters); - if (idx == x86_pmu.num_counters) + idx = x86_pmu.get_event_idx(cpuc, hwc); + if (idx == -1) return -EAGAIN; set_bit(idx, cpuc->used_mask); hwc->idx = idx; } - hwc->config_base = x86_pmu.eventsel; - hwc->counter_base = x86_pmu.perfctr; + hwc->config_base = x86_pmu.eventsel; + hwc->event_base = x86_pmu.perfctr; } - perf_counters_lapic_init(); + return idx; +} + +/* + * Find a PMC slot for the freshly enabled / scheduled in event: + */ +static int x86_pmu_enable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + int idx; + + idx = x86_schedule_event(cpuc, hwc); + if (idx < 0) + return idx; + + perf_events_lapic_init(); x86_pmu.disable(hwc, idx); - cpuc->counters[idx] = counter; + cpuc->events[idx] = event; set_bit(idx, cpuc->active_mask); - x86_perf_counter_set_period(counter, hwc, idx); + x86_perf_event_set_period(event, hwc, idx); x86_pmu.enable(hwc, idx); - perf_counter_update_userpage(counter); + perf_event_update_userpage(event); return 0; } -static void x86_pmu_unthrottle(struct perf_counter *counter) +static void x86_pmu_unthrottle(struct perf_event *event) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - struct hw_perf_counter *hwc = &counter->hw; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || - cpuc->counters[hwc->idx] != counter)) + cpuc->events[hwc->idx] != event)) return; x86_pmu.enable(hwc, hwc->idx); } -void perf_counter_print_debug(void) +void perf_event_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; - struct cpu_hw_counters *cpuc; + struct cpu_hw_events *cpuc; unsigned long flags; int cpu, idx; - if (!x86_pmu.num_counters) + if (!x86_pmu.num_events) return; local_irq_save(flags); cpu = smp_processor_id(); - cpuc = &per_cpu(cpu_hw_counters, cpu); + cpuc = &per_cpu(cpu_hw_events, cpu); if (x86_pmu.version >= 2) { rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); @@ -1466,11 +1578,11 @@ void perf_counter_print_debug(void) } pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for (idx = 0; idx < x86_pmu.num_events; idx++) { rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); rdmsrl(x86_pmu.perfctr + idx, pmc_count); - prev_left = per_cpu(prev_left[idx], cpu); + prev_left = per_cpu(pmc_prev_left[idx], cpu); pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); @@ -1479,7 +1591,7 @@ void perf_counter_print_debug(void) pr_info("CPU#%d: gen-PMC%d left: %016llx\n", cpu, idx, prev_left); } - for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { + for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", @@ -1488,8 +1600,7 @@ void perf_counter_print_debug(void) local_irq_restore(flags); } -static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc, - struct perf_sample_data *data) +static void intel_pmu_drain_bts_buffer(struct cpu_hw_events 
*cpuc) { struct debug_store *ds = cpuc->ds; struct bts_record { @@ -1497,11 +1608,14 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc, u64 to; u64 flags; }; - struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS]; - unsigned long orig_ip = data->regs->ip; + struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS]; struct bts_record *at, *top; + struct perf_output_handle handle; + struct perf_event_header header; + struct perf_sample_data data; + struct pt_regs regs; - if (!counter) + if (!event) return; if (!ds) @@ -1510,26 +1624,45 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc, at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; top = (struct bts_record *)(unsigned long)ds->bts_index; + if (top <= at) + return; + ds->bts_index = ds->bts_buffer_base; + + data.period = event->hw.last_period; + data.addr = 0; + regs.ip = 0; + + /* + * Prepare a generic sample, i.e. fill in the invariant fields. + * We will overwrite the from and to address before we output + * the sample. + */ + perf_prepare_sample(&header, &data, event, ®s); + + if (perf_output_begin(&handle, event, + header.size * (top - at), 1, 1)) + return; + for (; at < top; at++) { - data->regs->ip = at->from; - data->addr = at->to; + data.ip = at->from; + data.addr = at->to; - perf_counter_output(counter, 1, data); + perf_output_sample(&handle, &header, &data, event); } - data->regs->ip = orig_ip; - data->addr = 0; + perf_output_end(&handle); /* There's new data available. */ - counter->pending_kill = POLL_IN; + event->hw.interrupts++; + event->pending_kill = POLL_IN; } -static void x86_pmu_disable(struct perf_counter *counter) +static void x86_pmu_disable(struct perf_event *event) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - struct hw_perf_counter *hwc = &counter->hw; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; /* @@ -1541,67 +1674,63 @@ static void x86_pmu_disable(struct perf_counter *counter) /* * Make sure the cleared pointer becomes visible before we - * (potentially) free the counter: + * (potentially) free the event: */ barrier(); /* - * Drain the remaining delta count out of a counter + * Drain the remaining delta count out of a event * that we are disabling: */ - x86_perf_counter_update(counter, hwc, idx); + x86_perf_event_update(event, hwc, idx); /* Drain the remaining BTS records. */ - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { - struct perf_sample_data data; - struct pt_regs regs; + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) + intel_pmu_drain_bts_buffer(cpuc); - data.regs = ®s; - intel_pmu_drain_bts_buffer(cpuc, &data); - } - cpuc->counters[idx] = NULL; + cpuc->events[idx] = NULL; clear_bit(idx, cpuc->used_mask); - perf_counter_update_userpage(counter); + perf_event_update_userpage(event); } /* - * Save and restart an expired counter. Called by NMI contexts, - * so it has to be careful about preempting normal counter ops: + * Save and restart an expired event. 
Called by NMI contexts, + * so it has to be careful about preempting normal event ops: */ -static int intel_pmu_save_and_restart(struct perf_counter *counter) +static int intel_pmu_save_and_restart(struct perf_event *event) { - struct hw_perf_counter *hwc = &counter->hw; + struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; int ret; - x86_perf_counter_update(counter, hwc, idx); - ret = x86_perf_counter_set_period(counter, hwc, idx); + x86_perf_event_update(event, hwc, idx); + ret = x86_perf_event_set_period(event, hwc, idx); - if (counter->state == PERF_COUNTER_STATE_ACTIVE) - intel_pmu_enable_counter(hwc, idx); + if (event->state == PERF_EVENT_STATE_ACTIVE) + intel_pmu_enable_event(hwc, idx); return ret; } static void intel_pmu_reset(void) { - struct debug_store *ds = __get_cpu_var(cpu_hw_counters).ds; + struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds; unsigned long flags; int idx; - if (!x86_pmu.num_counters) + if (!x86_pmu.num_events) return; local_irq_save(flags); printk("clearing PMU state on CPU#%d\n", smp_processor_id()); - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for (idx = 0; idx < x86_pmu.num_events; idx++) { checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); } - for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { + for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); } if (ds) @@ -1613,39 +1742,38 @@ static void intel_pmu_reset(void) static int p6_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; - struct cpu_hw_counters *cpuc; - struct perf_counter *counter; - struct hw_perf_counter *hwc; + struct cpu_hw_events *cpuc; + struct perf_event *event; + struct hw_perf_event *hwc; int idx, handled = 0; u64 val; - data.regs = regs; data.addr = 0; - cpuc = &__get_cpu_var(cpu_hw_counters); + cpuc = &__get_cpu_var(cpu_hw_events); - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for (idx = 0; idx < x86_pmu.num_events; idx++) { if (!test_bit(idx, cpuc->active_mask)) continue; - counter = cpuc->counters[idx]; - hwc = &counter->hw; + event = cpuc->events[idx]; + hwc = &event->hw; - val = x86_perf_counter_update(counter, hwc, idx); - if (val & (1ULL << (x86_pmu.counter_bits - 1))) + val = x86_perf_event_update(event, hwc, idx); + if (val & (1ULL << (x86_pmu.event_bits - 1))) continue; /* - * counter overflow + * event overflow */ handled = 1; - data.period = counter->hw.last_period; + data.period = event->hw.last_period; - if (!x86_perf_counter_set_period(counter, hwc, idx)) + if (!x86_perf_event_set_period(event, hwc, idx)) continue; - if (perf_counter_overflow(counter, 1, &data)) - p6_pmu_disable_counter(hwc, idx); + if (perf_event_overflow(event, 1, &data, regs)) + p6_pmu_disable_event(hwc, idx); } if (handled) @@ -1661,17 +1789,16 @@ static int p6_pmu_handle_irq(struct pt_regs *regs) static int intel_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; - struct cpu_hw_counters *cpuc; + struct cpu_hw_events *cpuc; int bit, loops; u64 ack, status; - data.regs = regs; data.addr = 0; - cpuc = &__get_cpu_var(cpu_hw_counters); + cpuc = &__get_cpu_var(cpu_hw_events); perf_disable(); - intel_pmu_drain_bts_buffer(cpuc, &data); + intel_pmu_drain_bts_buffer(cpuc); status = intel_pmu_get_status(); if (!status) { perf_enable(); @@ -1681,8 +1808,8 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) loops = 0; again: if (++loops > 100) { - WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); - perf_counter_print_debug(); + 
WARN_ONCE(1, "perfevents: irq loop stuck!\n"); + perf_event_print_debug(); intel_pmu_reset(); perf_enable(); return 1; @@ -1691,19 +1818,19 @@ again: inc_irq_stat(apic_perf_irqs); ack = status; for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { - struct perf_counter *counter = cpuc->counters[bit]; + struct perf_event *event = cpuc->events[bit]; clear_bit(bit, (unsigned long *) &status); if (!test_bit(bit, cpuc->active_mask)) continue; - if (!intel_pmu_save_and_restart(counter)) + if (!intel_pmu_save_and_restart(event)) continue; - data.period = counter->hw.last_period; + data.period = event->hw.last_period; - if (perf_counter_overflow(counter, 1, &data)) - intel_pmu_disable_counter(&counter->hw, bit); + if (perf_event_overflow(event, 1, &data, regs)) + intel_pmu_disable_event(&event->hw, bit); } intel_pmu_ack_status(ack); @@ -1723,39 +1850,38 @@ again: static int amd_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; - struct cpu_hw_counters *cpuc; - struct perf_counter *counter; - struct hw_perf_counter *hwc; + struct cpu_hw_events *cpuc; + struct perf_event *event; + struct hw_perf_event *hwc; int idx, handled = 0; u64 val; - data.regs = regs; data.addr = 0; - cpuc = &__get_cpu_var(cpu_hw_counters); + cpuc = &__get_cpu_var(cpu_hw_events); - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for (idx = 0; idx < x86_pmu.num_events; idx++) { if (!test_bit(idx, cpuc->active_mask)) continue; - counter = cpuc->counters[idx]; - hwc = &counter->hw; + event = cpuc->events[idx]; + hwc = &event->hw; - val = x86_perf_counter_update(counter, hwc, idx); - if (val & (1ULL << (x86_pmu.counter_bits - 1))) + val = x86_perf_event_update(event, hwc, idx); + if (val & (1ULL << (x86_pmu.event_bits - 1))) continue; /* - * counter overflow + * event overflow */ handled = 1; - data.period = counter->hw.last_period; + data.period = event->hw.last_period; - if (!x86_perf_counter_set_period(counter, hwc, idx)) + if (!x86_perf_event_set_period(event, hwc, idx)) continue; - if (perf_counter_overflow(counter, 1, &data)) - amd_pmu_disable_counter(hwc, idx); + if (perf_event_overflow(event, 1, &data, regs)) + amd_pmu_disable_event(hwc, idx); } if (handled) @@ -1769,18 +1895,21 @@ void smp_perf_pending_interrupt(struct pt_regs *regs) irq_enter(); ack_APIC_irq(); inc_irq_stat(apic_pending_irqs); - perf_counter_do_pending(); + perf_event_do_pending(); irq_exit(); } -void set_perf_counter_pending(void) +void set_perf_event_pending(void) { #ifdef CONFIG_X86_LOCAL_APIC + if (!x86_pmu.apic || !x86_pmu_initialized()) + return; + apic->send_IPI_self(LOCAL_PENDING_VECTOR); #endif } -void perf_counters_lapic_init(void) +void perf_events_lapic_init(void) { #ifdef CONFIG_X86_LOCAL_APIC if (!x86_pmu.apic || !x86_pmu_initialized()) @@ -1794,13 +1923,13 @@ void perf_counters_lapic_init(void) } static int __kprobes -perf_counter_nmi_handler(struct notifier_block *self, +perf_event_nmi_handler(struct notifier_block *self, unsigned long cmd, void *__args) { struct die_args *args = __args; struct pt_regs *regs; - if (!atomic_read(&active_counters)) + if (!atomic_read(&active_events)) return NOTIFY_DONE; switch (cmd) { @@ -1819,7 +1948,7 @@ perf_counter_nmi_handler(struct notifier_block *self, #endif /* * Can't rely on the handled return value to say it was our NMI, two - * counters could trigger 'simultaneously' raising two back-to-back NMIs. + * events could trigger 'simultaneously' raising two back-to-back NMIs. * * If the first NMI handles both, the latter will be empty and daze * the CPU. 
@@ -1829,19 +1958,19 @@ perf_counter_nmi_handler(struct notifier_block *self, return NOTIFY_STOP; } -static __read_mostly struct notifier_block perf_counter_nmi_notifier = { - .notifier_call = perf_counter_nmi_handler, +static __read_mostly struct notifier_block perf_event_nmi_notifier = { + .notifier_call = perf_event_nmi_handler, .next = NULL, .priority = 1 }; -static struct x86_pmu p6_pmu = { +static __initconst struct x86_pmu p6_pmu = { .name = "p6", .handle_irq = p6_pmu_handle_irq, .disable_all = p6_pmu_disable_all, .enable_all = p6_pmu_enable_all, - .enable = p6_pmu_enable_counter, - .disable = p6_pmu_disable_counter, + .enable = p6_pmu_enable_event, + .disable = p6_pmu_disable_event, .eventsel = MSR_P6_EVNTSEL0, .perfctr = MSR_P6_PERFCTR0, .event_map = p6_pmu_event_map, @@ -1850,25 +1979,26 @@ static struct x86_pmu p6_pmu = { .apic = 1, .max_period = (1ULL << 31) - 1, .version = 0, - .num_counters = 2, + .num_events = 2, /* - * Counters have 40 bits implemented. However they are designed such + * Events have 40 bits implemented. However they are designed such * that bits [32-39] are sign extensions of bit 31. As such the - * effective width of a counter for P6-like PMU is 32 bits only. + * effective width of a event for P6-like PMU is 32 bits only. * * See IA-32 Intel Architecture Software developer manual Vol 3B */ - .counter_bits = 32, - .counter_mask = (1ULL << 32) - 1, + .event_bits = 32, + .event_mask = (1ULL << 32) - 1, + .get_event_idx = intel_get_event_idx, }; -static struct x86_pmu intel_pmu = { +static __initconst struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, .disable_all = intel_pmu_disable_all, .enable_all = intel_pmu_enable_all, - .enable = intel_pmu_enable_counter, - .disable = intel_pmu_disable_counter, + .enable = intel_pmu_enable_event, + .disable = intel_pmu_disable_event, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, .perfctr = MSR_ARCH_PERFMON_PERFCTR0, .event_map = intel_pmu_event_map, @@ -1878,34 +2008,36 @@ static struct x86_pmu intel_pmu = { /* * Intel PMCs cannot be accessed sanely above 32 bit width, * so we install an artificial 1<<31 period regardless of - * the generic counter period: + * the generic event period: */ .max_period = (1ULL << 31) - 1, .enable_bts = intel_pmu_enable_bts, .disable_bts = intel_pmu_disable_bts, + .get_event_idx = intel_get_event_idx, }; -static struct x86_pmu amd_pmu = { +static __initconst struct x86_pmu amd_pmu = { .name = "AMD", .handle_irq = amd_pmu_handle_irq, .disable_all = amd_pmu_disable_all, .enable_all = amd_pmu_enable_all, - .enable = amd_pmu_enable_counter, - .disable = amd_pmu_disable_counter, + .enable = amd_pmu_enable_event, + .disable = amd_pmu_disable_event, .eventsel = MSR_K7_EVNTSEL0, .perfctr = MSR_K7_PERFCTR0, .event_map = amd_pmu_event_map, .raw_event = amd_pmu_raw_event, .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_counters = 4, - .counter_bits = 48, - .counter_mask = (1ULL << 48) - 1, + .num_events = 4, + .event_bits = 48, + .event_mask = (1ULL << 48) - 1, .apic = 1, /* use highest bit to detect overflow */ .max_period = (1ULL << 47) - 1, + .get_event_idx = gen_get_event_idx, }; -static int p6_pmu_init(void) +static __init int p6_pmu_init(void) { switch (boot_cpu_data.x86_model) { case 1: @@ -1915,10 +2047,12 @@ static int p6_pmu_init(void) case 7: case 8: case 11: /* Pentium III */ + event_constraints = intel_p6_event_constraints; break; case 9: case 13: /* Pentium M */ + event_constraints = intel_p6_event_constraints; break; default: pr_cont("unsupported p6 CPU 
model %d ", @@ -1937,7 +2071,7 @@ static int p6_pmu_init(void) return 0; } -static int intel_pmu_init(void) +static __init int intel_pmu_init(void) { union cpuid10_edx edx; union cpuid10_eax eax; @@ -1956,7 +2090,7 @@ static int intel_pmu_init(void) /* * Check whether the Architectural PerfMon supports - * Branch Misses Retired Event or not. + * Branch Misses Retired hw_event or not. */ cpuid(10, &eax.full, &ebx, &unused, &edx.full); if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) @@ -1968,15 +2102,15 @@ static int intel_pmu_init(void) x86_pmu = intel_pmu; x86_pmu.version = version; - x86_pmu.num_counters = eax.split.num_counters; - x86_pmu.counter_bits = eax.split.bit_width; - x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; + x86_pmu.num_events = eax.split.num_events; + x86_pmu.event_bits = eax.split.bit_width; + x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1; /* - * Quirk: v2 perfmon does not report fixed-purpose counters, so - * assume at least 3 counters: + * Quirk: v2 perfmon does not report fixed-purpose events, so + * assume at least 3 events: */ - x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); + x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3); /* * Install the hw-cache-events table: @@ -1990,12 +2124,14 @@ static int intel_pmu_init(void) sizeof(hw_cache_event_ids)); pr_cont("Core2 events, "); + event_constraints = intel_core_event_constraints; break; default: case 26: memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + event_constraints = intel_nehalem_event_constraints; pr_cont("Nehalem/Corei7 events, "); break; case 28: @@ -2008,7 +2144,7 @@ static int intel_pmu_init(void) return 0; } -static int amd_pmu_init(void) +static __init int amd_pmu_init(void) { /* Performance-monitoring supported from K7 and later: */ if (boot_cpu_data.x86 < 6) @@ -2023,11 +2159,11 @@ static int amd_pmu_init(void) return 0; } -void __init init_hw_perf_counters(void) +void __init init_hw_perf_events(void) { int err; - pr_info("Performance Counters: "); + pr_info("Performance Events: "); switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: @@ -2040,45 +2176,45 @@ void __init init_hw_perf_counters(void) return; } if (err != 0) { - pr_cont("no PMU driver, software counters only.\n"); + pr_cont("no PMU driver, software events only.\n"); return; } pr_cont("%s PMU driver.\n", x86_pmu.name); - if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { - WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", - x86_pmu.num_counters, X86_PMC_MAX_GENERIC); - x86_pmu.num_counters = X86_PMC_MAX_GENERIC; + if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) { + WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", + x86_pmu.num_events, X86_PMC_MAX_GENERIC); + x86_pmu.num_events = X86_PMC_MAX_GENERIC; } - perf_counter_mask = (1 << x86_pmu.num_counters) - 1; - perf_max_counters = x86_pmu.num_counters; + perf_event_mask = (1 << x86_pmu.num_events) - 1; + perf_max_events = x86_pmu.num_events; - if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { - WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", - x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); - x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; + if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) { + WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", + x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED); + x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED; } - perf_counter_mask |= - ((1LL << x86_pmu.num_counters_fixed)-1) << 
X86_PMC_IDX_FIXED; - x86_pmu.intel_ctrl = perf_counter_mask; + perf_event_mask |= + ((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED; + x86_pmu.intel_ctrl = perf_event_mask; - perf_counters_lapic_init(); - register_die_notifier(&perf_counter_nmi_notifier); + perf_events_lapic_init(); + register_die_notifier(&perf_event_nmi_notifier); - pr_info("... version: %d\n", x86_pmu.version); - pr_info("... bit width: %d\n", x86_pmu.counter_bits); - pr_info("... generic counters: %d\n", x86_pmu.num_counters); - pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask); - pr_info("... max period: %016Lx\n", x86_pmu.max_period); - pr_info("... fixed-purpose counters: %d\n", x86_pmu.num_counters_fixed); - pr_info("... counter mask: %016Lx\n", perf_counter_mask); + pr_info("... version: %d\n", x86_pmu.version); + pr_info("... bit width: %d\n", x86_pmu.event_bits); + pr_info("... generic registers: %d\n", x86_pmu.num_events); + pr_info("... value mask: %016Lx\n", x86_pmu.event_mask); + pr_info("... max period: %016Lx\n", x86_pmu.max_period); + pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed); + pr_info("... event mask: %016Lx\n", perf_event_mask); } -static inline void x86_pmu_read(struct perf_counter *counter) +static inline void x86_pmu_read(struct perf_event *event) { - x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); + x86_perf_event_update(event, &event->hw, event->hw.idx); } static const struct pmu pmu = { @@ -2088,13 +2224,52 @@ static const struct pmu pmu = { .unthrottle = x86_pmu_unthrottle, }; -const struct pmu *hw_perf_counter_init(struct perf_counter *counter) +static int +validate_event(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct hw_perf_event fake_event = event->hw; + + if (event->pmu && event->pmu != &pmu) + return 0; + + return x86_schedule_event(cpuc, &fake_event) >= 0; +} + +static int validate_group(struct perf_event *event) +{ + struct perf_event *sibling, *leader = event->group_leader; + struct cpu_hw_events fake_pmu; + + memset(&fake_pmu, 0, sizeof(fake_pmu)); + + if (!validate_event(&fake_pmu, leader)) + return -ENOSPC; + + list_for_each_entry(sibling, &leader->sibling_list, group_entry) { + if (!validate_event(&fake_pmu, sibling)) + return -ENOSPC; + } + + if (!validate_event(&fake_pmu, event)) + return -ENOSPC; + + return 0; +} + +const struct pmu *hw_perf_event_init(struct perf_event *event) { int err; - err = __hw_perf_counter_init(counter); - if (err) + err = __hw_perf_event_init(event); + if (!err) { + if (event->group_leader != event) + err = validate_group(event); + } + if (err) { + if (event->destroy) + event->destroy(event); return ERR_PTR(err); + } return &pmu; } @@ -2110,8 +2285,8 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip) entry->ip[entry->nr++] = ip; } -static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); -static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); static DEFINE_PER_CPU(int, in_nmi_frame); @@ -2264,9 +2439,9 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) struct perf_callchain_entry *entry; if (in_nmi()) - entry = &__get_cpu_var(nmi_entry); + entry = &__get_cpu_var(pmc_nmi_entry); else - entry = &__get_cpu_var(irq_entry); + entry = &__get_cpu_var(pmc_irq_entry); entry->nr = 0; @@ -2275,7 +2450,7 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return entry; } -void 
hw_perf_counter_setup_online(int cpu) +void hw_perf_event_setup_online(int cpu) { init_debug_store_on_cpu(cpu); } diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 392bea43b890..898df9719afb 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -20,7 +20,7 @@ #include <linux/kprobes.h> #include <asm/apic.h> -#include <asm/perf_counter.h> +#include <asm/perf_event.h> struct nmi_watchdog_ctlblk { unsigned int cccr_msr; @@ -712,7 +712,7 @@ static void probe_nmi_watchdog(void) switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 && - boot_cpu_data.x86 != 16) + boot_cpu_data.x86 != 16 && boot_cpu_data.x86 != 17) return; wd_ops = &k7_wd_ops; break; diff --git a/arch/x86/kernel/cpu/sched.c b/arch/x86/kernel/cpu/sched.c new file mode 100644 index 000000000000..a640ae5ad201 --- /dev/null +++ b/arch/x86/kernel/cpu/sched.c @@ -0,0 +1,55 @@ +#include <linux/sched.h> +#include <linux/math64.h> +#include <linux/percpu.h> +#include <linux/irqflags.h> + +#include <asm/cpufeature.h> +#include <asm/processor.h> + +#ifdef CONFIG_SMP + +static DEFINE_PER_CPU(struct aperfmperf, old_perf_sched); + +static unsigned long scale_aperfmperf(void) +{ + struct aperfmperf val, *old = &__get_cpu_var(old_perf_sched); + unsigned long ratio, flags; + + local_irq_save(flags); + get_aperfmperf(&val); + local_irq_restore(flags); + + ratio = calc_aperfmperf_ratio(old, &val); + *old = val; + + return ratio; +} + +unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu) +{ + /* + * do aperf/mperf on the cpu level because it includes things + * like turbo mode, which are relevant to full cores. + */ + if (boot_cpu_has(X86_FEATURE_APERFMPERF)) + return scale_aperfmperf(); + + /* + * maybe have something cpufreq here + */ + + return default_scale_freq_power(sd, cpu); +} + +unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu) +{ + /* + * aperf/mperf already includes the smt gain + */ + if (boot_cpu_has(X86_FEATURE_APERFMPERF)) + return SCHED_LOAD_SCALE; + + return default_scale_smt_power(sd, cpu); +} + +#endif diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index bc24f514ec93..1cbed97b59cf 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -24,6 +24,7 @@ #include <linux/dmi.h> #include <asm/div64.h> #include <asm/vmware.h> +#include <asm/x86_init.h> #define CPUID_VMWARE_INFO_LEAF 0x40000000 #define VMWARE_HYPERVISOR_MAGIC 0x564D5868 @@ -47,21 +48,35 @@ static inline int __vmware_platform(void) return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC; } -static unsigned long __vmware_get_tsc_khz(void) +static unsigned long vmware_get_tsc_khz(void) { uint64_t tsc_hz; uint32_t eax, ebx, ecx, edx; VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); - if (ebx == UINT_MAX) - return 0; tsc_hz = eax | (((uint64_t)ebx) << 32); do_div(tsc_hz, 1000); BUG_ON(tsc_hz >> 32); + printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n", + (unsigned long) tsc_hz / 1000, + (unsigned long) tsc_hz % 1000); return tsc_hz; } +void __init vmware_platform_setup(void) +{ + uint32_t eax, ebx, ecx, edx; + + VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); + + if (ebx != UINT_MAX) + x86_platform.calibrate_tsc = vmware_get_tsc_khz; + else + printk(KERN_WARNING + "Failed to get TSC freq from the hypervisor\n"); +} + /* * While checking the dmi string infomation, just checking the product * serial key should be enough, as this will always 
have a VMware @@ -87,12 +102,6 @@ int vmware_platform(void) return 0; } -unsigned long vmware_get_tsc_khz(void) -{ - BUG_ON(!vmware_platform()); - return __vmware_get_tsc_khz(); -} - /* * VMware hypervisor takes care of exporting a reliable TSC to the guest. * Still, due to timing difference when running on virtual cpus, the TSC can diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 48e8e6558b26..7ef24a796992 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -177,7 +177,7 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier = .notifier_call = cpuid_class_cpu_callback, }; -static char *cpuid_nodename(struct device *dev) +static char *cpuid_devnode(struct device *dev, mode_t *mode) { return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); } @@ -198,7 +198,7 @@ static int __init cpuid_init(void) err = PTR_ERR(cpuid_class); goto out_chrdev; } - cpuid_class->nodename = cpuid_nodename; + cpuid_class->devnode = cpuid_devnode; for_each_online_cpu(i) { err = cpuid_device_create(i); if (err != 0) diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 5e409dc298a4..a4849c10a77e 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -27,8 +27,7 @@ #include <asm/cpu.h> #include <asm/reboot.h> #include <asm/virtext.h> -#include <asm/iommu.h> - +#include <asm/x86_init.h> #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) @@ -106,7 +105,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) #endif #ifdef CONFIG_X86_64 - pci_iommu_shutdown(); + x86_platform.iommu_shutdown(); #endif crash_save_cpu(regs, safe_smp_processor_id()); diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index f7cdb3b457aa..cd97ce18c29d 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -16,6 +16,22 @@ static void *kdump_buf_page; /* Stores the physical address of elf header of crash image. */ unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; +static inline bool is_crashed_pfn_valid(unsigned long pfn) +{ +#ifndef CONFIG_X86_PAE + /* + * non-PAE kdump kernel executed from a PAE one will crop high pte + * bits and poke unwanted space counting again from address 0, we + * don't want that. pte must fit into unsigned long. In fact the + * test checks high 12 bits for being zero (pfn will be shifted left + * by PAGE_SHIFT). 
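As an aside on the crash_dump_32.c hunk above: is_crashed_pfn_valid() rejects PFNs that cannot be represented in a non-PAE 32-bit pte by round-tripping them through pfn_pte()/pte_pfn(). A minimal stand-alone sketch of the equivalent arithmetic check, assuming 4K pages and a 32-bit pte word (illustrative only, not kernel code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12          /* assumed 4K pages, as on i386 */
#define PTE_BITS   32          /* non-PAE pte is a 32-bit word */

/* A pfn is representable only if pfn << PAGE_SHIFT fits in the pte. */
static bool pfn_fits_nonpae_pte(uint64_t pfn)
{
	return (pfn >> (PTE_BITS - PAGE_SHIFT)) == 0;
}

int main(void)
{
	/* 0xfffff is the last pfn below 4GB; 0x100000 would overflow. */
	printf("%d %d\n", pfn_fits_nonpae_pte(0xfffff),
			  pfn_fits_nonpae_pte(0x100000));
	return 0;
}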
+ */ + return pte_pfn(pfn_pte(pfn, __pgprot(0))) == pfn; +#else + return true; +#endif +} + /** * copy_oldmem_page - copy one page from "oldmem" * @pfn: page frame number to be copied @@ -41,6 +57,9 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!csize) return 0; + if (!is_crashed_pfn_valid(pfn)) + return -EFAULT; + vaddr = kmap_atomic_pfn(pfn, KM_PTE0); if (!userbuf) { diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 2d8a371d4339..b8ce165dde5d 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -268,11 +268,12 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) show_registers(regs); #ifdef CONFIG_X86_32 - sp = (unsigned long) (®s->sp); - savesegment(ss, ss); - if (user_mode(regs)) { + if (user_mode_vm(regs)) { sp = regs->sp; ss = regs->ss & 0xffff; + } else { + sp = kernel_stack_pointer(regs); + savesegment(ss, ss); } printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); print_symbol("%s", regs->ip); diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index bca5fba91c9e..f7dd2a7c3bf4 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -5,7 +5,6 @@ #include <linux/kallsyms.h> #include <linux/kprobes.h> #include <linux/uaccess.h> -#include <linux/utsname.h> #include <linux/hardirq.h> #include <linux/kdebug.h> #include <linux/module.h> diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 54b0a3276766..a071e6be177e 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -5,7 +5,6 @@ #include <linux/kallsyms.h> #include <linux/kprobes.h> #include <linux/uaccess.h> -#include <linux/utsname.h> #include <linux/hardirq.h> #include <linux/kdebug.h> #include <linux/module.h> diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 147005a1cc3c..d17d482a04f4 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1331,7 +1331,7 @@ void __init e820_reserve_resources(void) struct resource *res; u64 end; - res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); + res = alloc_bootmem(sizeof(struct resource) * e820.nr_map); e820_res = res; for (i = 0; i < e820.nr_map; i++) { end = e820.map[i].addr + e820.map[i].size - 1; @@ -1378,8 +1378,8 @@ static unsigned long ram_alignment(resource_size_t pos) if (mb < 16) return 1024*1024; - /* To 32MB for anything above that */ - return 32*1024*1024; + /* To 64MB for anything above that */ + return 64*1024*1024; } #define MAX_RESOURCE_SIZE ((resource_size_t)-1) @@ -1455,28 +1455,11 @@ char *__init default_machine_specific_memory_setup(void) return who; } -char *__init __attribute__((weak)) machine_specific_memory_setup(void) -{ - if (x86_quirks->arch_memory_setup) { - char *who = x86_quirks->arch_memory_setup(); - - if (who) - return who; - } - return default_machine_specific_memory_setup(); -} - -/* Overridden in paravirt.c if CONFIG_PARAVIRT */ -char * __init __attribute__((weak)) memory_setup(void) -{ - return machine_specific_memory_setup(); -} - void __init setup_memory_map(void) { char *who; - who = memory_setup(); + who = x86_init.resources.memory_setup(); memcpy(&e820_saved, &e820, sizeof(struct e820map)); printk(KERN_INFO "BIOS-provided physical RAM map:\n"); e820_print_map(who); diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 335f049d110f..b9c830c12b4a 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -160,721 +160,6 @@ static struct console early_serial_console = 
{ .index = -1, }; -#ifdef CONFIG_EARLY_PRINTK_DBGP - -static struct ehci_caps __iomem *ehci_caps; -static struct ehci_regs __iomem *ehci_regs; -static struct ehci_dbg_port __iomem *ehci_debug; -static unsigned int dbgp_endpoint_out; - -struct ehci_dev { - u32 bus; - u32 slot; - u32 func; -}; - -static struct ehci_dev ehci_dev; - -#define USB_DEBUG_DEVNUM 127 - -#define DBGP_DATA_TOGGLE 0x8800 - -static inline u32 dbgp_pid_update(u32 x, u32 tok) -{ - return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff); -} - -static inline u32 dbgp_len_update(u32 x, u32 len) -{ - return (x & ~0x0f) | (len & 0x0f); -} - -/* - * USB Packet IDs (PIDs) - */ - -/* token */ -#define USB_PID_OUT 0xe1 -#define USB_PID_IN 0x69 -#define USB_PID_SOF 0xa5 -#define USB_PID_SETUP 0x2d -/* handshake */ -#define USB_PID_ACK 0xd2 -#define USB_PID_NAK 0x5a -#define USB_PID_STALL 0x1e -#define USB_PID_NYET 0x96 -/* data */ -#define USB_PID_DATA0 0xc3 -#define USB_PID_DATA1 0x4b -#define USB_PID_DATA2 0x87 -#define USB_PID_MDATA 0x0f -/* Special */ -#define USB_PID_PREAMBLE 0x3c -#define USB_PID_ERR 0x3c -#define USB_PID_SPLIT 0x78 -#define USB_PID_PING 0xb4 -#define USB_PID_UNDEF_0 0xf0 - -#define USB_PID_DATA_TOGGLE 0x88 -#define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE) - -#define PCI_CAP_ID_EHCI_DEBUG 0xa - -#define HUB_ROOT_RESET_TIME 50 /* times are in msec */ -#define HUB_SHORT_RESET_TIME 10 -#define HUB_LONG_RESET_TIME 200 -#define HUB_RESET_TIMEOUT 500 - -#define DBGP_MAX_PACKET 8 - -static int dbgp_wait_until_complete(void) -{ - u32 ctrl; - int loop = 0x100000; - - do { - ctrl = readl(&ehci_debug->control); - /* Stop when the transaction is finished */ - if (ctrl & DBGP_DONE) - break; - } while (--loop > 0); - - if (!loop) - return -1; - - /* - * Now that we have observed the completed transaction, - * clear the done bit. - */ - writel(ctrl | DBGP_DONE, &ehci_debug->control); - return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl); -} - -static void __init dbgp_mdelay(int ms) -{ - int i; - - while (ms--) { - for (i = 0; i < 1000; i++) - outb(0x1, 0x80); - } -} - -static void dbgp_breath(void) -{ - /* Sleep to give the debug port a chance to breathe */ -} - -static int dbgp_wait_until_done(unsigned ctrl) -{ - u32 pids, lpid; - int ret; - int loop = 3; - -retry: - writel(ctrl | DBGP_GO, &ehci_debug->control); - ret = dbgp_wait_until_complete(); - pids = readl(&ehci_debug->pids); - lpid = DBGP_PID_GET(pids); - - if (ret < 0) - return ret; - - /* - * If the port is getting full or it has dropped data - * start pacing ourselves, not necessary but it's friendly. 
- */ - if ((lpid == USB_PID_NAK) || (lpid == USB_PID_NYET)) - dbgp_breath(); - - /* If I get a NACK reissue the transmission */ - if (lpid == USB_PID_NAK) { - if (--loop > 0) - goto retry; - } - - return ret; -} - -static void dbgp_set_data(const void *buf, int size) -{ - const unsigned char *bytes = buf; - u32 lo, hi; - int i; - - lo = hi = 0; - for (i = 0; i < 4 && i < size; i++) - lo |= bytes[i] << (8*i); - for (; i < 8 && i < size; i++) - hi |= bytes[i] << (8*(i - 4)); - writel(lo, &ehci_debug->data03); - writel(hi, &ehci_debug->data47); -} - -static void __init dbgp_get_data(void *buf, int size) -{ - unsigned char *bytes = buf; - u32 lo, hi; - int i; - - lo = readl(&ehci_debug->data03); - hi = readl(&ehci_debug->data47); - for (i = 0; i < 4 && i < size; i++) - bytes[i] = (lo >> (8*i)) & 0xff; - for (; i < 8 && i < size; i++) - bytes[i] = (hi >> (8*(i - 4))) & 0xff; -} - -static int dbgp_bulk_write(unsigned devnum, unsigned endpoint, - const char *bytes, int size) -{ - u32 pids, addr, ctrl; - int ret; - - if (size > DBGP_MAX_PACKET) - return -1; - - addr = DBGP_EPADDR(devnum, endpoint); - - pids = readl(&ehci_debug->pids); - pids = dbgp_pid_update(pids, USB_PID_OUT); - - ctrl = readl(&ehci_debug->control); - ctrl = dbgp_len_update(ctrl, size); - ctrl |= DBGP_OUT; - ctrl |= DBGP_GO; - - dbgp_set_data(bytes, size); - writel(addr, &ehci_debug->address); - writel(pids, &ehci_debug->pids); - - ret = dbgp_wait_until_done(ctrl); - if (ret < 0) - return ret; - - return ret; -} - -static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data, - int size) -{ - u32 pids, addr, ctrl; - int ret; - - if (size > DBGP_MAX_PACKET) - return -1; - - addr = DBGP_EPADDR(devnum, endpoint); - - pids = readl(&ehci_debug->pids); - pids = dbgp_pid_update(pids, USB_PID_IN); - - ctrl = readl(&ehci_debug->control); - ctrl = dbgp_len_update(ctrl, size); - ctrl &= ~DBGP_OUT; - ctrl |= DBGP_GO; - - writel(addr, &ehci_debug->address); - writel(pids, &ehci_debug->pids); - ret = dbgp_wait_until_done(ctrl); - if (ret < 0) - return ret; - - if (size > ret) - size = ret; - dbgp_get_data(data, size); - return ret; -} - -static int __init dbgp_control_msg(unsigned devnum, int requesttype, - int request, int value, int index, void *data, int size) -{ - u32 pids, addr, ctrl; - struct usb_ctrlrequest req; - int read; - int ret; - - read = (requesttype & USB_DIR_IN) != 0; - if (size > (read ? 
DBGP_MAX_PACKET:0)) - return -1; - - /* Compute the control message */ - req.bRequestType = requesttype; - req.bRequest = request; - req.wValue = cpu_to_le16(value); - req.wIndex = cpu_to_le16(index); - req.wLength = cpu_to_le16(size); - - pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP); - addr = DBGP_EPADDR(devnum, 0); - - ctrl = readl(&ehci_debug->control); - ctrl = dbgp_len_update(ctrl, sizeof(req)); - ctrl |= DBGP_OUT; - ctrl |= DBGP_GO; - - /* Send the setup message */ - dbgp_set_data(&req, sizeof(req)); - writel(addr, &ehci_debug->address); - writel(pids, &ehci_debug->pids); - ret = dbgp_wait_until_done(ctrl); - if (ret < 0) - return ret; - - /* Read the result */ - return dbgp_bulk_read(devnum, 0, data, size); -} - - -/* Find a PCI capability */ -static u32 __init find_cap(u32 num, u32 slot, u32 func, int cap) -{ - u8 pos; - int bytes; - - if (!(read_pci_config_16(num, slot, func, PCI_STATUS) & - PCI_STATUS_CAP_LIST)) - return 0; - - pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST); - for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { - u8 id; - - pos &= ~3; - id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID); - if (id == 0xff) - break; - if (id == cap) - return pos; - - pos = read_pci_config_byte(num, slot, func, - pos+PCI_CAP_LIST_NEXT); - } - return 0; -} - -static u32 __init __find_dbgp(u32 bus, u32 slot, u32 func) -{ - u32 class; - - class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION); - if ((class >> 8) != PCI_CLASS_SERIAL_USB_EHCI) - return 0; - - return find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG); -} - -static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc) -{ - u32 bus, slot, func; - - for (bus = 0; bus < 256; bus++) { - for (slot = 0; slot < 32; slot++) { - for (func = 0; func < 8; func++) { - unsigned cap; - - cap = __find_dbgp(bus, slot, func); - - if (!cap) - continue; - if (ehci_num-- != 0) - continue; - *rbus = bus; - *rslot = slot; - *rfunc = func; - return cap; - } - } - } - return 0; -} - -static int __init ehci_reset_port(int port) -{ - u32 portsc; - u32 delay_time, delay; - int loop; - - /* Reset the usb debug port */ - portsc = readl(&ehci_regs->port_status[port - 1]); - portsc &= ~PORT_PE; - portsc |= PORT_RESET; - writel(portsc, &ehci_regs->port_status[port - 1]); - - delay = HUB_ROOT_RESET_TIME; - for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT; - delay_time += delay) { - dbgp_mdelay(delay); - - portsc = readl(&ehci_regs->port_status[port - 1]); - if (portsc & PORT_RESET) { - /* force reset to complete */ - loop = 2; - writel(portsc & ~(PORT_RWC_BITS | PORT_RESET), - &ehci_regs->port_status[port - 1]); - do { - portsc = readl(&ehci_regs->port_status[port-1]); - } while ((portsc & PORT_RESET) && (--loop > 0)); - } - - /* Device went away? */ - if (!(portsc & PORT_CONNECT)) - return -ENOTCONN; - - /* bomb out completely if something weird happend */ - if ((portsc & PORT_CSC)) - return -EINVAL; - - /* If we've finished resetting, then break out of the loop */ - if (!(portsc & PORT_RESET) && (portsc & PORT_PE)) - return 0; - } - return -EBUSY; -} - -static int __init ehci_wait_for_port(int port) -{ - u32 status; - int ret, reps; - - for (reps = 0; reps < 3; reps++) { - dbgp_mdelay(100); - status = readl(&ehci_regs->status); - if (status & STS_PCD) { - ret = ehci_reset_port(port); - if (ret == 0) - return 0; - } - } - return -ENOTCONN; -} - -#ifdef DBGP_DEBUG -# define dbgp_printk early_printk -#else -static inline void dbgp_printk(const char *fmt, ...) 
{ } -#endif - -typedef void (*set_debug_port_t)(int port); - -static void __init default_set_debug_port(int port) -{ -} - -static set_debug_port_t __initdata set_debug_port = default_set_debug_port; - -static void __init nvidia_set_debug_port(int port) -{ - u32 dword; - dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, - 0x74); - dword &= ~(0x0f<<12); - dword |= ((port & 0x0f)<<12); - write_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 0x74, - dword); - dbgp_printk("set debug port to %d\n", port); -} - -static void __init detect_set_debug_port(void) -{ - u32 vendorid; - - vendorid = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, - 0x00); - - if ((vendorid & 0xffff) == 0x10de) { - dbgp_printk("using nvidia set_debug_port\n"); - set_debug_port = nvidia_set_debug_port; - } -} - -static int __init ehci_setup(void) -{ - struct usb_debug_descriptor dbgp_desc; - u32 cmd, ctrl, status, portsc, hcs_params; - u32 debug_port, new_debug_port = 0, n_ports; - u32 devnum; - int ret, i; - int loop; - int port_map_tried; - int playtimes = 3; - -try_next_time: - port_map_tried = 0; - -try_next_port: - - hcs_params = readl(&ehci_caps->hcs_params); - debug_port = HCS_DEBUG_PORT(hcs_params); - n_ports = HCS_N_PORTS(hcs_params); - - dbgp_printk("debug_port: %d\n", debug_port); - dbgp_printk("n_ports: %d\n", n_ports); - - for (i = 1; i <= n_ports; i++) { - portsc = readl(&ehci_regs->port_status[i-1]); - dbgp_printk("portstatus%d: %08x\n", i, portsc); - } - - if (port_map_tried && (new_debug_port != debug_port)) { - if (--playtimes) { - set_debug_port(new_debug_port); - goto try_next_time; - } - return -1; - } - - loop = 10; - /* Reset the EHCI controller */ - cmd = readl(&ehci_regs->command); - cmd |= CMD_RESET; - writel(cmd, &ehci_regs->command); - do { - cmd = readl(&ehci_regs->command); - } while ((cmd & CMD_RESET) && (--loop > 0)); - - if (!loop) { - dbgp_printk("can not reset ehci\n"); - return -1; - } - dbgp_printk("ehci reset done\n"); - - /* Claim ownership, but do not enable yet */ - ctrl = readl(&ehci_debug->control); - ctrl |= DBGP_OWNER; - ctrl &= ~(DBGP_ENABLED | DBGP_INUSE); - writel(ctrl, &ehci_debug->control); - - /* Start the ehci running */ - cmd = readl(&ehci_regs->command); - cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET); - cmd |= CMD_RUN; - writel(cmd, &ehci_regs->command); - - /* Ensure everything is routed to the EHCI */ - writel(FLAG_CF, &ehci_regs->configured_flag); - - /* Wait until the controller is no longer halted */ - loop = 10; - do { - status = readl(&ehci_regs->status); - } while ((status & STS_HALT) && (--loop > 0)); - - if (!loop) { - dbgp_printk("ehci can be started\n"); - return -1; - } - dbgp_printk("ehci started\n"); - - /* Wait for a device to show up in the debug port */ - ret = ehci_wait_for_port(debug_port); - if (ret < 0) { - dbgp_printk("No device found in debug port\n"); - goto next_debug_port; - } - dbgp_printk("ehci wait for port done\n"); - - /* Enable the debug port */ - ctrl = readl(&ehci_debug->control); - ctrl |= DBGP_CLAIM; - writel(ctrl, &ehci_debug->control); - ctrl = readl(&ehci_debug->control); - if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) { - dbgp_printk("No device in debug port\n"); - writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control); - goto err; - } - dbgp_printk("debug ported enabled\n"); - - /* Completely transfer the debug device to the debug controller */ - portsc = readl(&ehci_regs->port_status[debug_port - 1]); - portsc &= ~PORT_PE; - writel(portsc, &ehci_regs->port_status[debug_port - 1]); - - 
dbgp_mdelay(100); - - /* Find the debug device and make it device number 127 */ - for (devnum = 0; devnum <= 127; devnum++) { - ret = dbgp_control_msg(devnum, - USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE, - USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0, - &dbgp_desc, sizeof(dbgp_desc)); - if (ret > 0) - break; - } - if (devnum > 127) { - dbgp_printk("Could not find attached debug device\n"); - goto err; - } - if (ret < 0) { - dbgp_printk("Attached device is not a debug device\n"); - goto err; - } - dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint; - - /* Move the device to 127 if it isn't already there */ - if (devnum != USB_DEBUG_DEVNUM) { - ret = dbgp_control_msg(devnum, - USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE, - USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0); - if (ret < 0) { - dbgp_printk("Could not move attached device to %d\n", - USB_DEBUG_DEVNUM); - goto err; - } - devnum = USB_DEBUG_DEVNUM; - dbgp_printk("debug device renamed to 127\n"); - } - - /* Enable the debug interface */ - ret = dbgp_control_msg(USB_DEBUG_DEVNUM, - USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE, - USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0); - if (ret < 0) { - dbgp_printk(" Could not enable the debug device\n"); - goto err; - } - dbgp_printk("debug interface enabled\n"); - - /* Perform a small write to get the even/odd data state in sync - */ - ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1); - if (ret < 0) { - dbgp_printk("dbgp_bulk_write failed: %d\n", ret); - goto err; - } - dbgp_printk("small write doned\n"); - - return 0; -err: - /* Things didn't work so remove my claim */ - ctrl = readl(&ehci_debug->control); - ctrl &= ~(DBGP_CLAIM | DBGP_OUT); - writel(ctrl, &ehci_debug->control); - return -1; - -next_debug_port: - port_map_tried |= (1<<(debug_port - 1)); - new_debug_port = ((debug_port-1+1)%n_ports) + 1; - if (port_map_tried != ((1<<n_ports) - 1)) { - set_debug_port(new_debug_port); - goto try_next_port; - } - if (--playtimes) { - set_debug_port(new_debug_port); - goto try_next_time; - } - - return -1; -} - -static int __init early_dbgp_init(char *s) -{ - u32 debug_port, bar, offset; - u32 bus, slot, func, cap; - void __iomem *ehci_bar; - u32 dbgp_num; - u32 bar_val; - char *e; - int ret; - u8 byte; - - if (!early_pci_allowed()) - return -1; - - dbgp_num = 0; - if (*s) - dbgp_num = simple_strtoul(s, &e, 10); - dbgp_printk("dbgp_num: %d\n", dbgp_num); - - cap = find_dbgp(dbgp_num, &bus, &slot, &func); - if (!cap) - return -1; - - dbgp_printk("Found EHCI debug port on %02x:%02x.%1x\n", bus, slot, - func); - - debug_port = read_pci_config(bus, slot, func, cap); - bar = (debug_port >> 29) & 0x7; - bar = (bar * 4) + 0xc; - offset = (debug_port >> 16) & 0xfff; - dbgp_printk("bar: %02x offset: %03x\n", bar, offset); - if (bar != PCI_BASE_ADDRESS_0) { - dbgp_printk("only debug ports on bar 1 handled.\n"); - - return -1; - } - - bar_val = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0); - dbgp_printk("bar_val: %02x offset: %03x\n", bar_val, offset); - if (bar_val & ~PCI_BASE_ADDRESS_MEM_MASK) { - dbgp_printk("only simple 32bit mmio bars supported\n"); - - return -1; - } - - /* double check if the mem space is enabled */ - byte = read_pci_config_byte(bus, slot, func, 0x04); - if (!(byte & 0x2)) { - byte |= 0x02; - write_pci_config_byte(bus, slot, func, 0x04, byte); - dbgp_printk("mmio for ehci enabled\n"); - } - - /* - * FIXME I don't have the bar size so just guess PAGE_SIZE is more - * than enough. 1K is the biggest I have seen. 
- */ - set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK); - ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE); - ehci_bar += bar_val & ~PAGE_MASK; - dbgp_printk("ehci_bar: %p\n", ehci_bar); - - ehci_caps = ehci_bar; - ehci_regs = ehci_bar + HC_LENGTH(readl(&ehci_caps->hc_capbase)); - ehci_debug = ehci_bar + offset; - ehci_dev.bus = bus; - ehci_dev.slot = slot; - ehci_dev.func = func; - - detect_set_debug_port(); - - ret = ehci_setup(); - if (ret < 0) { - dbgp_printk("ehci_setup failed\n"); - ehci_debug = NULL; - - return -1; - } - - return 0; -} - -static void early_dbgp_write(struct console *con, const char *str, u32 n) -{ - int chunk, ret; - - if (!ehci_debug) - return; - while (n > 0) { - chunk = n; - if (chunk > DBGP_MAX_PACKET) - chunk = DBGP_MAX_PACKET; - ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, - dbgp_endpoint_out, str, chunk); - str += chunk; - n -= chunk; - } -} - -static struct console early_dbgp_console = { - .name = "earlydbg", - .write = early_dbgp_write, - .flags = CON_PRINTBUFFER, - .index = -1, -}; -#endif - /* Direct interface for emergencies */ static struct console *early_console = &early_vga_console; static int __initdata early_console_initialized; @@ -891,10 +176,24 @@ asmlinkage void early_printk(const char *fmt, ...) va_end(ap); } +static inline void early_console_register(struct console *con, int keep_early) +{ + if (early_console->index != -1) { + printk(KERN_CRIT "ERROR: earlyprintk= %s already used\n", + con->name); + return; + } + early_console = con; + if (keep_early) + early_console->flags &= ~CON_BOOT; + else + early_console->flags |= CON_BOOT; + register_console(early_console); +} static int __init setup_early_printk(char *buf) { - int keep_early; + int keep; if (!buf) return 0; @@ -903,42 +202,37 @@ static int __init setup_early_printk(char *buf) return 0; early_console_initialized = 1; - keep_early = (strstr(buf, "keep") != NULL); - - if (!strncmp(buf, "serial", 6)) { - early_serial_init(buf + 6); - early_console = &early_serial_console; - } else if (!strncmp(buf, "ttyS", 4)) { - early_serial_init(buf); - early_console = &early_serial_console; - } else if (!strncmp(buf, "vga", 3) - && boot_params.screen_info.orig_video_isVGA == 1) { - max_xpos = boot_params.screen_info.orig_video_cols; - max_ypos = boot_params.screen_info.orig_video_lines; - current_ypos = boot_params.screen_info.orig_y; - early_console = &early_vga_console; + keep = (strstr(buf, "keep") != NULL); + + while (*buf != '\0') { + if (!strncmp(buf, "serial", 6)) { + buf += 6; + early_serial_init(buf); + early_console_register(&early_serial_console, keep); + if (!strncmp(buf, ",ttyS", 5)) + buf += 5; + } + if (!strncmp(buf, "ttyS", 4)) { + early_serial_init(buf + 4); + early_console_register(&early_serial_console, keep); + } + if (!strncmp(buf, "vga", 3) && + boot_params.screen_info.orig_video_isVGA == 1) { + max_xpos = boot_params.screen_info.orig_video_cols; + max_ypos = boot_params.screen_info.orig_video_lines; + current_ypos = boot_params.screen_info.orig_y; + early_console_register(&early_vga_console, keep); + } #ifdef CONFIG_EARLY_PRINTK_DBGP - } else if (!strncmp(buf, "dbgp", 4)) { - if (early_dbgp_init(buf+4) < 0) - return 0; - early_console = &early_dbgp_console; - /* - * usb subsys will reset ehci controller, so don't keep - * that early console - */ - keep_early = 0; + if (!strncmp(buf, "dbgp", 4) && !early_dbgp_init(buf + 4)) + early_console_register(&early_dbgp_console, keep); #endif #ifdef CONFIG_HVC_XEN - } else if (!strncmp(buf, "xen", 3)) { - early_console = 
&xenboot_console; + if (!strncmp(buf, "xen", 3)) + early_console_register(&xenboot_console, keep); #endif + buf++; } - - if (keep_early) - early_console->flags &= ~CON_BOOT; - else - early_console->flags |= CON_BOOT; - register_console(early_console); return 0; } diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index fe26ba3e3451..cdcfb122f256 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -42,6 +42,7 @@ #include <asm/time.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> +#include <asm/x86_init.h> #define EFI_DEBUG 1 #define PFX "EFI: " @@ -453,6 +454,11 @@ void __init efi_init(void) if (add_efi_memmap) do_add_efi_memmap(); +#ifdef CONFIG_X86_32 + x86_platform.get_wallclock = efi_get_time; + x86_platform.set_wallclock = efi_set_rtc_mmss; +#endif + /* Setup for EFI runtime service */ reboot_type = BOOT_EFI; diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index c097e7d607c6..50b9c220e121 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -334,6 +334,10 @@ ENTRY(ret_from_fork) END(ret_from_fork) /* + * Interrupt exit functions should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" +/* * Return to user mode is not as complex as all this looks, * but we want the default path for a system call return to * go as quickly as possible which is why some of this is @@ -383,6 +387,10 @@ need_resched: END(resume_kernel) #endif CFI_ENDPROC +/* + * End of kprobes section + */ + .popsection /* SYSENTER_RETURN points to after the "sysenter" instruction in the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ @@ -513,6 +521,10 @@ sysexit_audit: PTGS_TO_GS_EX ENDPROC(ia32_sysenter_target) +/* + * syscall stub including irq exit should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" # system call handler stub ENTRY(system_call) RING0_INT_FRAME # can't unwind into user space anyway @@ -705,6 +717,10 @@ syscall_badsys: jmp resume_userspace END(syscall_badsys) CFI_ENDPROC +/* + * End of kprobes section + */ + .popsection /* * System calls that need a pt_regs pointer. 
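The efi.c and vmware.c hunks above both plug platform-specific callbacks into an ops structure (x86_platform) that is pre-filled with defaults. A simplified, self-contained illustration of that pattern, with hypothetical names that are not the kernel's definitions:

#include <stdio.h>

struct platform_ops {
	unsigned long (*calibrate_tsc)(void);
};

static unsigned long native_calibrate_tsc(void)
{
	return 2000000;	/* pretend 2 GHz was measured locally */
}

static unsigned long hv_calibrate_tsc(void)
{
	return 2400000;	/* pretend the hypervisor reported 2.4 GHz */
}

/* Defaults are installed statically ... */
static struct platform_ops platform = {
	.calibrate_tsc = native_calibrate_tsc,
};

/* ... and platform-detection code overrides individual members. */
static void hv_platform_setup(int running_on_hv)
{
	if (running_on_hv)
		platform.calibrate_tsc = hv_calibrate_tsc;
}

int main(void)
{
	hv_platform_setup(1);
	printf("tsc: %lu kHz\n", platform.calibrate_tsc());
	return 0;
}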
@@ -814,6 +830,10 @@ common_interrupt: ENDPROC(common_interrupt) CFI_ENDPROC +/* + * Irq entries should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" #define BUILD_INTERRUPT3(name, nr, fn) \ ENTRY(name) \ RING0_INT_FRAME; \ @@ -980,6 +1000,10 @@ ENTRY(spurious_interrupt_bug) jmp error_code CFI_ENDPROC END(spurious_interrupt_bug) +/* + * End of kprobes section + */ + .popsection ENTRY(kernel_thread_helper) pushl $0 # fake return address for unwinder @@ -1185,17 +1209,14 @@ END(ftrace_graph_caller) .globl return_to_handler return_to_handler: - pushl $0 pushl %eax - pushl %ecx pushl %edx movl %ebp, %eax call ftrace_return_to_handler - movl %eax, 0xc(%esp) + movl %eax, %ecx popl %edx - popl %ecx popl %eax - ret + jmp *%ecx #endif .section .rodata,"a" diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c251be745107..4deb8fc849dd 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -146,7 +146,7 @@ ENTRY(ftrace_graph_caller) END(ftrace_graph_caller) GLOBAL(return_to_handler) - subq $80, %rsp + subq $24, %rsp /* Save the return values */ movq %rax, (%rsp) @@ -155,11 +155,11 @@ GLOBAL(return_to_handler) call ftrace_return_to_handler - movq %rax, 72(%rsp) + movq %rax, %rdi movq 8(%rsp), %rdx movq (%rsp), %rax - addq $72, %rsp - retq + addq $24, %rsp + jmp *%rdi #endif @@ -536,20 +536,13 @@ sysret_signal: bt $TIF_SYSCALL_AUDIT,%edx jc sysret_audit #endif - /* edx: work flags (arg3) */ - leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 - xorl %esi,%esi # oldset -> arg2 - SAVE_REST - FIXUP_TOP_OF_STACK %r11 - call do_notify_resume - RESTORE_TOP_OF_STACK %r11 - RESTORE_REST - movl $_TIF_WORK_MASK,%edi - /* Use IRET because user could have changed frame. This - works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - jmp int_with_check + /* + * We have a signal, or exit tracing or single-step. + * These all wind up with the iret return path anyway, + * so just join that path right now. + */ + FIXUP_TOP_OF_STACK %r11, -ARGOFFSET + jmp int_check_syscall_exit_work badsys: movq $-ENOSYS,RAX-ARGOFFSET(%rsp) @@ -654,6 +647,7 @@ int_careful: int_very_careful: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) +int_check_syscall_exit_work: SAVE_REST /* Check for syscall exit trace */ testl $_TIF_WORK_SYSCALL_EXIT,%edx @@ -809,6 +803,10 @@ END(interrupt) call \func .endm +/* + * Interrupt entry/exit should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" /* * The interrupt stubs push (~vector+0x80) onto the stack and * then jump to common_interrupt. @@ -947,6 +945,10 @@ ENTRY(retint_kernel) CFI_ENDPROC END(common_interrupt) +/* + * End of kprobes section + */ + .popsection /* * APIC interrupts. 
@@ -1021,7 +1023,7 @@ apicinterrupt ERROR_APIC_VECTOR \ apicinterrupt SPURIOUS_APIC_VECTOR \ spurious_interrupt smp_spurious_interrupt -#ifdef CONFIG_PERF_COUNTERS +#ifdef CONFIG_PERF_EVENTS apicinterrupt LOCAL_PENDING_VECTOR \ perf_pending_interrupt smp_perf_pending_interrupt #endif @@ -1497,12 +1499,17 @@ error_kernelspace: leaq irq_return(%rip),%rcx cmpq %rcx,RIP+8(%rsp) je error_swapgs - movl %ecx,%ecx /* zero extend */ - cmpq %rcx,RIP+8(%rsp) - je error_swapgs + movl %ecx,%eax /* zero extend */ + cmpq %rax,RIP+8(%rsp) + je bstep_iret cmpq $gs_change,RIP+8(%rsp) je error_swapgs jmp error_sti + +bstep_iret: + /* Fix truncated RIP */ + movq %rcx,RIP+8(%rsp) + jmp error_swapgs END(error_entry) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 9dbb527e1652..5a1b9758fd62 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -9,6 +9,8 @@ * the dangers of modifying code on the run. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/spinlock.h> #include <linux/hardirq.h> #include <linux/uaccess.h> @@ -336,15 +338,15 @@ int __init ftrace_dyn_arch_init(void *data) switch (faulted) { case 0: - pr_info("ftrace: converting mcount calls to 0f 1f 44 00 00\n"); + pr_info("converting mcount calls to 0f 1f 44 00 00\n"); memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE); break; case 1: - pr_info("ftrace: converting mcount calls to 66 66 66 66 90\n"); + pr_info("converting mcount calls to 66 66 66 66 90\n"); memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE); break; case 2: - pr_info("ftrace: converting mcount calls to jmp . + 5\n"); + pr_info("converting mcount calls to jmp . + 5\n"); memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE); break; } @@ -468,82 +470,10 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, #ifdef CONFIG_FTRACE_SYSCALLS -extern unsigned long __start_syscalls_metadata[]; -extern unsigned long __stop_syscalls_metadata[]; extern unsigned long *sys_call_table; -static struct syscall_metadata **syscalls_metadata; - -static struct syscall_metadata *find_syscall_meta(unsigned long *syscall) -{ - struct syscall_metadata *start; - struct syscall_metadata *stop; - char str[KSYM_SYMBOL_LEN]; - - - start = (struct syscall_metadata *)__start_syscalls_metadata; - stop = (struct syscall_metadata *)__stop_syscalls_metadata; - kallsyms_lookup((unsigned long) syscall, NULL, NULL, NULL, str); - - for ( ; start < stop; start++) { - if (start->name && !strcmp(start->name, str)) - return start; - } - return NULL; -} - -struct syscall_metadata *syscall_nr_to_meta(int nr) -{ - if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) - return NULL; - - return syscalls_metadata[nr]; -} - -int syscall_name_to_nr(char *name) +unsigned long __init arch_syscall_addr(int nr) { - int i; - - if (!syscalls_metadata) - return -1; - - for (i = 0; i < NR_syscalls; i++) { - if (syscalls_metadata[i]) { - if (!strcmp(syscalls_metadata[i]->name, name)) - return i; - } - } - return -1; -} - -void set_syscall_enter_id(int num, int id) -{ - syscalls_metadata[num]->enter_id = id; -} - -void set_syscall_exit_id(int num, int id) -{ - syscalls_metadata[num]->exit_id = id; -} - -static int __init arch_init_ftrace_syscalls(void) -{ - int i; - struct syscall_metadata *meta; - unsigned long **psys_syscall_table = &sys_call_table; - - syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * - NR_syscalls, GFP_KERNEL); - if (!syscalls_metadata) { - WARN_ON(1); - return -ENOMEM; - } - - for (i = 0; i < NR_syscalls; i++) { - meta = 
find_syscall_meta(psys_syscall_table[i]); - syscalls_metadata[i] = meta; - } - return 0; + return (unsigned long)(&sys_call_table)[nr]; } -arch_initcall(arch_init_ftrace_syscalls); #endif diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 3f8579f8d42c..4f8e2507e8f3 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -11,8 +11,21 @@ #include <asm/setup.h> #include <asm/sections.h> #include <asm/e820.h> -#include <asm/bios_ebda.h> +#include <asm/page.h> #include <asm/trampoline.h> +#include <asm/apic.h> +#include <asm/io_apic.h> +#include <asm/bios_ebda.h> + +static void __init i386_default_early_setup(void) +{ + /* Initilize 32bit specific setup functions */ + x86_init.resources.probe_roms = probe_roms; + x86_init.resources.reserve_resources = i386_reserve_resources; + x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; + + reserve_ebda_region(); +} void __init i386_start_kernel(void) { @@ -29,7 +42,16 @@ void __init i386_start_kernel(void) reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); } #endif - reserve_ebda_region(); + + /* Call the subarch specific early setup function */ + switch (boot_params.hdr.hardware_subarch) { + case X86_SUBARCH_MRST: + x86_mrst_early_setup(); + break; + default: + i386_default_early_setup(); + break; + } /* * At this point everything still needed from the boot loader diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 70eaa852c732..0b06cd778fd9 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -23,8 +23,8 @@ #include <asm/sections.h> #include <asm/kdebug.h> #include <asm/e820.h> -#include <asm/bios_ebda.h> #include <asm/trampoline.h> +#include <asm/bios_ebda.h> static void __init zap_identity_mappings(void) { diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 7ffec6b3b331..050c278481b1 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -79,7 +79,7 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE) * any particular GDT layout, because we load our own as soon as we * can. */ -.section .text.head,"ax",@progbits +__HEAD ENTRY(startup_32) /* test KEEP_SEGMENTS flag to see if the bootloader is asking us to not reload segments */ @@ -157,6 +157,7 @@ subarch_entries: .long default_entry /* normal x86/PC */ .long lguest_entry /* lguest hypervisor */ .long xen_entry /* Xen hypervisor */ + .long default_entry /* Moorestown MID */ num_subarch_entries = (. - subarch_entries) / 4 .previous #endif /* CONFIG_PARAVIRT */ @@ -607,7 +608,7 @@ ENTRY(initial_code) /* * BSS section */ -.section ".bss.page_aligned","wa" +__PAGE_ALIGNED_BSS .align PAGE_SIZE_asm #ifdef CONFIG_X86_PAE swapper_pg_pmd: @@ -625,7 +626,7 @@ ENTRY(empty_zero_page) * This starts the data section. */ #ifdef CONFIG_X86_PAE -.section ".data.page_aligned","wa" +__PAGE_ALIGNED_DATA /* Page-aligned for the benefit of paravirt? */ .align PAGE_SIZE_asm ENTRY(swapper_pg_dir) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index fa54f78e2a05..22db86a37643 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -40,7 +40,7 @@ L4_START_KERNEL = pgd_index(__START_KERNEL_map) L3_START_KERNEL = pud_index(__START_KERNEL_map) .text - .section .text.head + __HEAD .code64 .globl startup_64 startup_64: @@ -212,8 +212,8 @@ ENTRY(secondary_startup_64) */ lgdt early_gdt_descr(%rip) - /* set up data segments. 
actually 0 would do too */ - movl $__KERNEL_DS,%eax + /* set up data segments */ + xorl %eax,%eax movl %eax,%ds movl %eax,%ss movl %eax,%es @@ -418,7 +418,7 @@ ENTRY(phys_base) ENTRY(idt_table) .skip IDT_ENTRIES * 16 - .section .bss.page_aligned, "aw", @nobits + __PAGE_ALIGNED_BSS .align PAGE_SIZE ENTRY(empty_zero_page) .skip PAGE_SIZE diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c new file mode 100644 index 000000000000..d42f65ac4927 --- /dev/null +++ b/arch/x86/kernel/hw_breakpoint.c @@ -0,0 +1,555 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) 2007 Alan Stern + * Copyright (C) 2009 IBM Corporation + * Copyright (C) 2009 Frederic Weisbecker <fweisbec@gmail.com> + * + * Authors: Alan Stern <stern@rowland.harvard.edu> + * K.Prasad <prasad@linux.vnet.ibm.com> + * Frederic Weisbecker <fweisbec@gmail.com> + */ + +/* + * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility, + * using the CPU's debug registers. + */ + +#include <linux/perf_event.h> +#include <linux/hw_breakpoint.h> +#include <linux/irqflags.h> +#include <linux/notifier.h> +#include <linux/kallsyms.h> +#include <linux/kprobes.h> +#include <linux/percpu.h> +#include <linux/kdebug.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/smp.h> + +#include <asm/hw_breakpoint.h> +#include <asm/processor.h> +#include <asm/debugreg.h> + +/* Per cpu debug control register value */ +DEFINE_PER_CPU(unsigned long, cpu_dr7); +EXPORT_PER_CPU_SYMBOL(cpu_dr7); + +/* Per cpu debug address registers values */ +static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]); + +/* + * Stores the breakpoints currently in use on each breakpoint address + * register for each cpus + */ +static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]); + + +static inline unsigned long +__encode_dr7(int drnum, unsigned int len, unsigned int type) +{ + unsigned long bp_info; + + bp_info = (len | type) & 0xf; + bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE); + bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)); + + return bp_info; +} + +/* + * Encode the length, type, Exact, and Enable bits for a particular breakpoint + * as stored in debug register 7. + */ +unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type) +{ + return __encode_dr7(drnum, len, type) | DR_GLOBAL_SLOWDOWN; +} + +/* + * Decode the length and type bits for a particular breakpoint as + * stored in debug register 7. Return the "enabled" status. 
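A stand-alone sketch of the DR7 packing done by __encode_dr7() in the hw_breakpoint.c hunk above. The constants mirror the usual x86 debug-register layout (per-breakpoint control nibbles starting at bit 16, one enable-bit pair per breakpoint) but are defined locally so the example compiles outside the kernel; treat the exact values as assumptions:

#include <stdio.h>

#define DR_CONTROL_SHIFT   16
#define DR_CONTROL_SIZE    4
#define DR_ENABLE_SIZE     2
#define DR_GLOBAL_ENABLE   0x2

static unsigned long encode_dr7_sketch(int drnum, unsigned int len,
				       unsigned int type)
{
	/* low nibble: 2 length bits above 2 read/write bits */
	unsigned long bp_info = (len | type) & 0xf;

	bp_info <<= DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE;
	bp_info |= (unsigned long)DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE);

	return bp_info;
}

int main(void)
{
	/* len bits 0xc (4 bytes), type 0x1 (write) on debug register 1 */
	printf("dr7 bits: %#lx\n", encode_dr7_sketch(1, 0xc, 0x1));
	return 0;
}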
+ */ +int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type) +{ + int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE); + + *len = (bp_info & 0xc) | 0x40; + *type = (bp_info & 0x3) | 0x80; + + return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3; +} + +/* + * Install a perf counter breakpoint. + * + * We seek a free debug address register and use it for this + * breakpoint. Eventually we enable it in the debug control register. + * + * Atomic: we hold the counter->ctx->lock and we only handle variables + * and registers local to this cpu. + */ +int arch_install_hw_breakpoint(struct perf_event *bp) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + unsigned long *dr7; + int i; + + for (i = 0; i < HBP_NUM; i++) { + struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]); + + if (!*slot) { + *slot = bp; + break; + } + } + + if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot")) + return -EBUSY; + + set_debugreg(info->address, i); + __get_cpu_var(cpu_debugreg[i]) = info->address; + + dr7 = &__get_cpu_var(cpu_dr7); + *dr7 |= encode_dr7(i, info->len, info->type); + + set_debugreg(*dr7, 7); + + return 0; +} + +/* + * Uninstall the breakpoint contained in the given counter. + * + * First we search the debug address register it uses and then we disable + * it. + * + * Atomic: we hold the counter->ctx->lock and we only handle variables + * and registers local to this cpu. + */ +void arch_uninstall_hw_breakpoint(struct perf_event *bp) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + unsigned long *dr7; + int i; + + for (i = 0; i < HBP_NUM; i++) { + struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]); + + if (*slot == bp) { + *slot = NULL; + break; + } + } + + if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot")) + return; + + dr7 = &__get_cpu_var(cpu_dr7); + *dr7 &= ~__encode_dr7(i, info->len, info->type); + + set_debugreg(*dr7, 7); +} + +static int get_hbp_len(u8 hbp_len) +{ + unsigned int len_in_bytes = 0; + + switch (hbp_len) { + case X86_BREAKPOINT_LEN_1: + len_in_bytes = 1; + break; + case X86_BREAKPOINT_LEN_2: + len_in_bytes = 2; + break; + case X86_BREAKPOINT_LEN_4: + len_in_bytes = 4; + break; +#ifdef CONFIG_X86_64 + case X86_BREAKPOINT_LEN_8: + len_in_bytes = 8; + break; +#endif + } + return len_in_bytes; +} + +/* + * Check for virtual address in user space. + */ +int arch_check_va_in_userspace(unsigned long va, u8 hbp_len) +{ + unsigned int len; + + len = get_hbp_len(hbp_len); + + return (va <= TASK_SIZE - len); +} + +/* + * Check for virtual address in kernel space. + */ +static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len) +{ + unsigned int len; + + len = get_hbp_len(hbp_len); + + return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); +} + +/* + * Store a breakpoint's encoded address, length, and type. + */ +static int arch_store_info(struct perf_event *bp) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + /* + * For kernel-addresses, either the address or symbol name can be + * specified. 
+ */ + if (info->name) + info->address = (unsigned long) + kallsyms_lookup_name(info->name); + if (info->address) + return 0; + + return -EINVAL; +} + +int arch_bp_generic_fields(int x86_len, int x86_type, + int *gen_len, int *gen_type) +{ + /* Len */ + switch (x86_len) { + case X86_BREAKPOINT_LEN_1: + *gen_len = HW_BREAKPOINT_LEN_1; + break; + case X86_BREAKPOINT_LEN_2: + *gen_len = HW_BREAKPOINT_LEN_2; + break; + case X86_BREAKPOINT_LEN_4: + *gen_len = HW_BREAKPOINT_LEN_4; + break; +#ifdef CONFIG_X86_64 + case X86_BREAKPOINT_LEN_8: + *gen_len = HW_BREAKPOINT_LEN_8; + break; +#endif + default: + return -EINVAL; + } + + /* Type */ + switch (x86_type) { + case X86_BREAKPOINT_EXECUTE: + *gen_type = HW_BREAKPOINT_X; + break; + case X86_BREAKPOINT_WRITE: + *gen_type = HW_BREAKPOINT_W; + break; + case X86_BREAKPOINT_RW: + *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R; + break; + default: + return -EINVAL; + } + + return 0; +} + + +static int arch_build_bp_info(struct perf_event *bp) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + + info->address = bp->attr.bp_addr; + + /* Len */ + switch (bp->attr.bp_len) { + case HW_BREAKPOINT_LEN_1: + info->len = X86_BREAKPOINT_LEN_1; + break; + case HW_BREAKPOINT_LEN_2: + info->len = X86_BREAKPOINT_LEN_2; + break; + case HW_BREAKPOINT_LEN_4: + info->len = X86_BREAKPOINT_LEN_4; + break; +#ifdef CONFIG_X86_64 + case HW_BREAKPOINT_LEN_8: + info->len = X86_BREAKPOINT_LEN_8; + break; +#endif + default: + return -EINVAL; + } + + /* Type */ + switch (bp->attr.bp_type) { + case HW_BREAKPOINT_W: + info->type = X86_BREAKPOINT_WRITE; + break; + case HW_BREAKPOINT_W | HW_BREAKPOINT_R: + info->type = X86_BREAKPOINT_RW; + break; + case HW_BREAKPOINT_X: + info->type = X86_BREAKPOINT_EXECUTE; + break; + default: + return -EINVAL; + } + + return 0; +} +/* + * Validate the arch-specific HW Breakpoint register settings + */ +int arch_validate_hwbkpt_settings(struct perf_event *bp, + struct task_struct *tsk) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + unsigned int align; + int ret; + + + ret = arch_build_bp_info(bp); + if (ret) + return ret; + + ret = -EINVAL; + + if (info->type == X86_BREAKPOINT_EXECUTE) + /* + * Ptrace-refactoring code + * For now, we'll allow instruction breakpoint only for user-space + * addresses + */ + if ((!arch_check_va_in_userspace(info->address, info->len)) && + info->len != X86_BREAKPOINT_EXECUTE) + return ret; + + switch (info->len) { + case X86_BREAKPOINT_LEN_1: + align = 0; + break; + case X86_BREAKPOINT_LEN_2: + align = 1; + break; + case X86_BREAKPOINT_LEN_4: + align = 3; + break; +#ifdef CONFIG_X86_64 + case X86_BREAKPOINT_LEN_8: + align = 7; + break; +#endif + default: + return ret; + } + + if (bp->callback) + ret = arch_store_info(bp); + + if (ret < 0) + return ret; + /* + * Check that the low-order bits of the address are appropriate + * for the alignment implied by len. + */ + if (info->address & align) + return -EINVAL; + + /* Check that the virtual address is in the proper range */ + if (tsk) { + if (!arch_check_va_in_userspace(info->address, info->len)) + return -EFAULT; + } else { + if (!arch_check_va_in_kernelspace(info->address, info->len)) + return -EFAULT; + } + + return 0; +} + +/* + * Dump the debug register contents to the user. + * We can't dump our per cpu values because it + * may contain cpu wide breakpoint, something that + * doesn't belong to the current task. 
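The validation path above (arch_validate_hwbkpt_settings) insists that a breakpoint of length len starts on a len-aligned address, using "address & align" with align == len - 1. A minimal stand-alone illustration of that rule, independent of the kernel types:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* len must be 1, 2, 4 or 8; align is then 0, 1, 3 or 7. */
static bool bp_addr_aligned(uint64_t address, unsigned int len)
{
	unsigned int align = len - 1;

	return (address & align) == 0;
}

int main(void)
{
	printf("%d %d\n", bp_addr_aligned(0x1000, 4),	/* aligned    */
			  bp_addr_aligned(0x1002, 4));	/* misaligned */
	return 0;
}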
+ *
+ * TODO: include non-ptrace user breakpoints (perf)
+ */
+void aout_dump_debugregs(struct user *dump)
+{
+ int i;
+ int dr7 = 0;
+ struct perf_event *bp;
+ struct arch_hw_breakpoint *info;
+ struct thread_struct *thread = &current->thread;
+
+ for (i = 0; i < HBP_NUM; i++) {
+ bp = thread->ptrace_bps[i];
+
+ if (bp && !bp->attr.disabled) {
+ dump->u_debugreg[i] = bp->attr.bp_addr;
+ info = counter_arch_bp(bp);
+ dr7 |= encode_dr7(i, info->len, info->type);
+ } else {
+ dump->u_debugreg[i] = 0;
+ }
+ }
+
+ dump->u_debugreg[4] = 0;
+ dump->u_debugreg[5] = 0;
+ dump->u_debugreg[6] = current->thread.debugreg6;
+
+ dump->u_debugreg[7] = dr7;
+}
+EXPORT_SYMBOL_GPL(aout_dump_debugregs);
+
+/*
+ * Release the user breakpoints used by ptrace
+ */
+void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
+{
+ int i;
+ struct thread_struct *t = &tsk->thread;
+
+ for (i = 0; i < HBP_NUM; i++) {
+ unregister_hw_breakpoint(t->ptrace_bps[i]);
+ t->ptrace_bps[i] = NULL;
+ }
+}
+
+void hw_breakpoint_restore(void)
+{
+ set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0);
+ set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1);
+ set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2);
+ set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3);
+ set_debugreg(current->thread.debugreg6, 6);
+ set_debugreg(__get_cpu_var(cpu_dr7), 7);
+}
+EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
+
+/*
+ * Handle debug exception notifications.
+ *
+ * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below.
+ *
+ * NOTIFY_DONE returned if one of the following conditions is true.
+ * i) When the causative address is from user-space and the exception
+ * is a valid one, i.e. not triggered as a result of lazy debug register
+ * switching
+ * ii) When there are more bits than trap<n> set in DR6 register (such
+ * as BD, BS or BT) indicating that more than one debug condition is
+ * met and requires some more action in do_debug().
+ *
+ * NOTIFY_STOP returned for all other cases
+ *
+ */
+static int __kprobes hw_breakpoint_handler(struct die_args *args)
+{
+ int i, cpu, rc = NOTIFY_STOP;
+ struct perf_event *bp;
+ unsigned long dr7, dr6;
+ unsigned long *dr6_p;
+
+ /* The DR6 value is pointed by args->err */
+ dr6_p = (unsigned long *)ERR_PTR(args->err);
+ dr6 = *dr6_p;
+
+ /* Do an early return if no trap bits are set in DR6 */
+ if ((dr6 & DR_TRAP_BITS) == 0)
+ return NOTIFY_DONE;
+
+ get_debugreg(dr7, 7);
+ /* Disable breakpoints during exception handling */
+ set_debugreg(0UL, 7);
+ /*
+ * Assert that local interrupts are disabled
+ * Reset the DRn bits in the virtualized register value.
+ * The ptrace trigger routine will add in whatever is needed.
+ */
+ current->thread.debugreg6 &= ~DR_TRAP_BITS;
+ cpu = get_cpu();
+
+ /* Handle all the breakpoints that were triggered */
+ for (i = 0; i < HBP_NUM; ++i) {
+ if (likely(!(dr6 & (DR_TRAP0 << i))))
+ continue;
+
+ /*
+ * The counter may be concurrently released but that can only
+ * occur from a call_rcu() path. We can then safely fetch
+ * the breakpoint, use its callback, touch its counter
+ * while we are in an rcu_read_lock() path.
+ */
+ rcu_read_lock();
+
+ bp = per_cpu(bp_per_reg[i], cpu);
+ if (bp)
+ rc = NOTIFY_DONE;
+ /*
+ * Reset the 'i'th TRAP bit in dr6 to denote completion of
+ * exception handling
+ */
+ (*dr6_p) &= ~(DR_TRAP0 << i);
+ /*
+ * bp can be NULL due to lazy debug register switching
+ * or due to concurrent perf counter removing.
+ */ + if (!bp) { + rcu_read_unlock(); + break; + } + + (bp->callback)(bp, args->regs); + + rcu_read_unlock(); + } + if (dr6 & (~DR_TRAP_BITS)) + rc = NOTIFY_DONE; + + set_debugreg(dr7, 7); + put_cpu(); + + return rc; +} + +/* + * Handle debug exception notifications. + */ +int __kprobes hw_breakpoint_exceptions_notify( + struct notifier_block *unused, unsigned long val, void *data) +{ + if (val != DIE_DEBUG) + return NOTIFY_DONE; + + return hw_breakpoint_handler(data); +} + +void hw_breakpoint_pmu_read(struct perf_event *bp) +{ + /* TODO */ +} + +void hw_breakpoint_pmu_unthrottle(struct perf_event *bp) +{ + /* TODO */ +} diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index 43cec6bdda63..9c3bd4a2050e 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -10,6 +10,16 @@ EXPORT_SYMBOL(mcount); #endif +/* + * Note, this is a prototype to get at the symbol for + * the export, but dont use it from C code, it is used + * by assembly code and is not using C calling convention! + */ +#ifndef CONFIG_X86_CMPXCHG64 +extern void cmpxchg8b_emu(void); +EXPORT_SYMBOL(cmpxchg8b_emu); +#endif + /* Networking helper routines. */ EXPORT_SYMBOL(csum_partial_copy_generic); diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 5cf36c053ac4..23c167925a5c 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -19,12 +19,6 @@ DEFINE_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); -#ifdef CONFIG_X86_32 -static void pit_disable_clocksource(void); -#else -static inline void pit_disable_clocksource(void) { } -#endif - /* * HPET replaces the PIT, when enabled. So we need to know, which of * the two timers is used @@ -57,12 +51,10 @@ static void init_pit_timer(enum clock_event_mode mode, outb_pit(0, PIT_CH0); outb_pit(0, PIT_CH0); } - pit_disable_clocksource(); break; case CLOCK_EVT_MODE_ONESHOT: /* One shot setup */ - pit_disable_clocksource(); outb_pit(0x38, PIT_MODE); break; @@ -200,17 +192,6 @@ static struct clocksource pit_cs = { .shift = 20, }; -static void pit_disable_clocksource(void) -{ - /* - * Use mult to check whether it is registered or not - */ - if (pit_cs.mult) { - clocksource_unregister(&pit_cs); - pit_cs.mult = 0; - } -} - static int __init init_pit_clocksource(void) { /* diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index 270ff83efc11..3a54dcb9cd0e 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -20,9 +20,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); * way process stacks are handled. This is done by having a special * "init_task" linker map entry.. */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; /* * Initial task structure. 
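The DR7 bit layout that the encode_dr7()/decode_dr7() helpers in the hw_breakpoint.c hunks above rely on is easy to exercise in isolation: each breakpoint slot owns a 2-bit local/global enable pair in the low byte and a 4-bit type+length field starting at bit 16 (decode_dr7() additionally ORs in the 0x40/0x80 tags used by the X86_BREAKPOINT_* encodings). The following is a minimal user-space sketch, not part of this patch; the helper name dr7_encode_slot() is made up for illustration and the constant values are assumed to match <asm/debugreg.h>.

#include <stdio.h>

/*
 * Illustration only: conventional x86 DR7 layout, values assumed to
 * match <asm/debugreg.h>.  Bits 0-7 hold a local/global enable pair
 * per breakpoint slot (2 bits each); bits 16-31 hold a type+length
 * nibble per slot (4 bits each).
 */
#define DR_CONTROL_SHIFT 16
#define DR_CONTROL_SIZE  4
#define DR_ENABLE_SIZE   2

/* Hypothetical helper in the spirit of encode_dr7(): merge one slot into DR7 */
static unsigned long dr7_encode_slot(int bpnum, unsigned int len, unsigned int type)
{
        unsigned long bp_info = (len | type) & 0xf;

        /* place the type+len nibble in this slot's control field */
        bp_info <<= DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE;

        /* set both enable bits (local and global) for this slot */
        return bp_info | (0x3UL << (bpnum * DR_ENABLE_SIZE));
}

int main(void)
{
        /* slot 1, 4-byte write watchpoint: len field 0xc, type field 0x1 */
        unsigned long dr7 = dr7_encode_slot(1, 0xc, 0x1);

        printf("dr7 = %#lx\n", dr7);    /* prints dr7 = 0xd0000c */
        return 0;
}

Compiled with any C compiler this prints dr7 = 0xd0000c: the type/len nibble 0xd lands in bits 20-23 (slot 1) and the enable pair in bits 2-3, the same fields decode_dr7() above reads back with its 0xf control mask and & 0x3 enable mask. The kernel's own encode_dr7() decides which of the two enable bits to set and may add further control bits, so the exact value it produces can differ from this sketch.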
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index b0cdde6932f5..fee6cc2b2079 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -63,10 +63,10 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); seq_printf(p, " Spurious interrupts\n"); - seq_printf(p, "%*s: ", prec, "CNT"); + seq_printf(p, "%*s: ", prec, "PMI"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); - seq_printf(p, " Performance counter interrupts\n"); + seq_printf(p, " Performance monitoring interrupts\n"); seq_printf(p, "%*s: ", prec, "PND"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); @@ -92,19 +92,19 @@ static int show_other_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); seq_printf(p, " TLB shootdowns\n"); #endif -#ifdef CONFIG_X86_MCE +#ifdef CONFIG_X86_THERMAL_VECTOR seq_printf(p, "%*s: ", prec, "TRM"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); seq_printf(p, " Thermal event interrupts\n"); -# ifdef CONFIG_X86_MCE_THRESHOLD +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD seq_printf(p, "%*s: ", prec, "THR"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); seq_printf(p, " Threshold APIC interrupts\n"); -# endif #endif -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE seq_printf(p, "%*s: ", prec, "MCE"); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); @@ -194,13 +194,13 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->irq_call_count; sum += irq_stats(cpu)->irq_tlb_count; #endif -#ifdef CONFIG_X86_MCE +#ifdef CONFIG_X86_THERMAL_VECTOR sum += irq_stats(cpu)->irq_thermal_count; -# ifdef CONFIG_X86_MCE_THRESHOLD +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD sum += irq_stats(cpu)->irq_threshold_count; -# endif #endif -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE sum += per_cpu(mce_exception_count, cpu); sum += per_cpu(mce_poll_count, cpu); #endif @@ -274,3 +274,93 @@ void smp_generic_interrupt(struct pt_regs *regs) } EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); + +#ifdef CONFIG_HOTPLUG_CPU +/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ +void fixup_irqs(void) +{ + unsigned int irq, vector; + static int warned; + struct irq_desc *desc; + + for_each_irq_desc(irq, desc) { + int break_affinity = 0; + int set_affinity = 1; + const struct cpumask *affinity; + + if (!desc) + continue; + if (irq == 2) + continue; + + /* interrupt's are disabled at this point */ + spin_lock(&desc->lock); + + affinity = desc->affinity; + if (!irq_has_action(irq) || + cpumask_equal(affinity, cpu_online_mask)) { + spin_unlock(&desc->lock); + continue; + } + + /* + * Complete the irq move. This cpu is going down and for + * non intr-remapping case, we can't wait till this interrupt + * arrives at this cpu before completing the irq move. 
+ */ + irq_force_complete_move(irq); + + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { + break_affinity = 1; + affinity = cpu_all_mask; + } + + if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->mask) + desc->chip->mask(irq); + + if (desc->chip->set_affinity) + desc->chip->set_affinity(irq, affinity); + else if (!(warned++)) + set_affinity = 0; + + if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask) + desc->chip->unmask(irq); + + spin_unlock(&desc->lock); + + if (break_affinity && set_affinity) + printk("Broke affinity for irq %i\n", irq); + else if (!set_affinity) + printk("Cannot set affinity for irq %i\n", irq); + } + + /* + * We can remove mdelay() and then send spuriuous interrupts to + * new cpu targets for all the irqs that were handled previously by + * this cpu. While it works, I have seen spurious interrupt messages + * (nothing wrong but still...). + * + * So for now, retain mdelay(1) and check the IRR and then send those + * interrupts to new targets as this cpu is already offlined... + */ + mdelay(1); + + for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { + unsigned int irr; + + if (__get_cpu_var(vector_irq)[vector] < 0) + continue; + + irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); + if (irr & (1 << (vector % 32))) { + irq = __get_cpu_var(vector_irq)[vector]; + + desc = irq_to_desc(irq); + spin_lock(&desc->lock); + if (desc->chip->retrigger) + desc->chip->retrigger(irq); + spin_unlock(&desc->lock); + } + } +} +#endif diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 7d35d0fe2329..10709f29d166 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -211,48 +211,3 @@ bool handle_irq(unsigned irq, struct pt_regs *regs) return true; } - -#ifdef CONFIG_HOTPLUG_CPU - -/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ -void fixup_irqs(void) -{ - unsigned int irq; - struct irq_desc *desc; - - for_each_irq_desc(irq, desc) { - const struct cpumask *affinity; - - if (!desc) - continue; - if (irq == 2) - continue; - - affinity = desc->affinity; - if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { - printk("Breaking affinity for irq %i\n", irq); - affinity = cpu_all_mask; - } - if (desc->chip->set_affinity) - desc->chip->set_affinity(irq, affinity); - else if (desc->action) - printk_once("Cannot set affinity for irq %i\n", irq); - } - -#if 0 - barrier(); - /* Ingo Molnar says: "after the IO-APIC masks have been redirected - [note the nop - the interrupt-enable boundary on x86 is two - instructions from sti] - to flush out pending hardirqs and - IPIs. After this point nothing is supposed to reach this CPU." */ - __asm__ __volatile__("sti; nop; cli"); - barrier(); -#else - /* That doesn't seem sufficient. Give it 1ms. */ - local_irq_enable(); - mdelay(1); - local_irq_disable(); -#endif -} -#endif - diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 977d8b43a0dd..acf8fbf8fbda 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -62,64 +62,6 @@ bool handle_irq(unsigned irq, struct pt_regs *regs) return true; } -#ifdef CONFIG_HOTPLUG_CPU -/* A cpu has been removed from cpu_online_mask. Reset irq affinities. 
*/ -void fixup_irqs(void) -{ - unsigned int irq; - static int warned; - struct irq_desc *desc; - - for_each_irq_desc(irq, desc) { - int break_affinity = 0; - int set_affinity = 1; - const struct cpumask *affinity; - - if (!desc) - continue; - if (irq == 2) - continue; - - /* interrupt's are disabled at this point */ - spin_lock(&desc->lock); - - affinity = desc->affinity; - if (!irq_has_action(irq) || - cpumask_equal(affinity, cpu_online_mask)) { - spin_unlock(&desc->lock); - continue; - } - - if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { - break_affinity = 1; - affinity = cpu_all_mask; - } - - if (desc->chip->mask) - desc->chip->mask(irq); - - if (desc->chip->set_affinity) - desc->chip->set_affinity(irq, affinity); - else if (!(warned++)) - set_affinity = 0; - - if (desc->chip->unmask) - desc->chip->unmask(irq); - - spin_unlock(&desc->lock); - - if (break_affinity && set_affinity) - printk("Broke affinity for irq %i\n", irq); - else if (!set_affinity) - printk("Cannot set affinity for irq %i\n", irq); - } - - /* That doesn't seem sufficient. Give it 1ms. */ - local_irq_enable(); - mdelay(1); - local_irq_disable(); -} -#endif extern void call_softirq(void); diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 92b7703d3d58..40f30773fb29 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -116,7 +116,7 @@ int vector_used_by_percpu_irq(unsigned int vector) return 0; } -static void __init init_ISA_irqs(void) +void __init init_ISA_irqs(void) { int i; @@ -140,8 +140,10 @@ static void __init init_ISA_irqs(void) } } -/* Overridden in paravirt.c */ -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); +void __init init_IRQ(void) +{ + x86_init.irqs.intr_init(); +} static void __init smp_intr_init(void) { @@ -190,7 +192,7 @@ static void __init apic_intr_init(void) #ifdef CONFIG_X86_MCE_THRESHOLD alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); #endif -#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC) +#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC) alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt); #endif @@ -206,39 +208,19 @@ static void __init apic_intr_init(void) alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); /* Performance monitoring interrupts: */ -# ifdef CONFIG_PERF_COUNTERS +# ifdef CONFIG_PERF_EVENTS alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); # endif #endif } -/** - * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors - * - * Description: - * Perform any necessary interrupt initialisation prior to setting up - * the "ordinary" interrupt call gates. For legacy reasons, the ISA - * interrupts should be initialised here if the machine emulates a PC - * in any way. - **/ -static void __init x86_quirk_pre_intr_init(void) -{ -#ifdef CONFIG_X86_32 - if (x86_quirks->arch_pre_intr_init) { - if (x86_quirks->arch_pre_intr_init()) - return; - } -#endif - init_ISA_irqs(); -} - void __init native_init_IRQ(void) { int i; /* Execute any quirks before the call gates are initialised: */ - x86_quirk_pre_intr_init(); + x86_init.irqs.pre_vector_init(); apic_intr_init(); @@ -258,12 +240,6 @@ void __init native_init_IRQ(void) #ifdef CONFIG_X86_32 /* - * Call quirks after call gates are initialised (usually add in - * the architecture specific gates): - */ - x86_quirk_intr_init(); - - /* * External FPU? Set up irq13 if so, for * original braindamaged IBM FERR coupling. 
 */
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 8d82a77a3f3b..20a5b3689463 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -43,6 +43,7 @@
 #include <linux/smp.h>
 #include <linux/nmi.h>
+#include <asm/debugreg.h>
 #include <asm/apicdef.h>
 #include <asm/system.h>
@@ -88,7 +89,6 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
 gdb_regs[GDB_SS] = __KERNEL_DS;
 gdb_regs[GDB_FS] = 0xFFFF;
 gdb_regs[GDB_GS] = 0xFFFF;
- gdb_regs[GDB_SP] = (int)&regs->sp;
 #else
 gdb_regs[GDB_R8] = regs->r8;
 gdb_regs[GDB_R9] = regs->r9;
@@ -101,8 +101,8 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
 gdb_regs32[GDB_PS] = regs->flags;
 gdb_regs32[GDB_CS] = regs->cs;
 gdb_regs32[GDB_SS] = regs->ss;
- gdb_regs[GDB_SP] = regs->sp;
 #endif
+ gdb_regs[GDB_SP] = kernel_stack_pointer(regs);
 }
 /**
@@ -434,6 +434,11 @@ single_step_cont(struct pt_regs *regs, struct die_args *args)
 "resuming...\n");
 kgdb_arch_handle_exception(args->trapnr, args->signr, args->err, "c", "", regs);
+ /*
+ * Reset the BS bit in dr6 (pointed by args->err) to
+ * denote completion of processing
+ */
+ (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
 return NOTIFY_STOP;
 }
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 7b5169d2b000..1f3186ce213c 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -48,31 +48,22 @@
 #include <linux/preempt.h>
 #include <linux/module.h>
 #include <linux/kdebug.h>
+#include <linux/kallsyms.h>
 #include <asm/cacheflush.h>
 #include <asm/desc.h>
 #include <asm/pgtable.h>
 #include <asm/uaccess.h>
 #include <asm/alternative.h>
+#include <asm/insn.h>
+#include <asm/debugreg.h>
 void jprobe_return_end(void);
 DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
 DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
-#ifdef CONFIG_X86_64
-#define stack_addr(regs) ((unsigned long *)regs->sp)
-#else
-/*
- * "&regs->sp" looks wrong, but it's correct for x86_32. x86_32 CPUs
- * don't save the ss and esp registers if the CPU is already in kernel
- * mode when it traps. So for kprobes, regs->sp and regs->ss are not
- * the [nonexistent] saved stack pointer and ss register, but rather
- * the top 8 bytes of the pre-int3 stack. So &regs->sp happens to
- * point to the top of the pre-int3 stack.
- */
-#define stack_addr(regs) ((unsigned long *)&regs->sp)
-#endif
+#define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs))
 #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
@@ -106,50 +97,6 @@ static const u32 twobyte_is_boostable[256 / 32] = {
 /* ----------------------------------------------- */
 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
 };
-static const u32 onebyte_has_modrm[256 / 32] = {
- /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
- /* ----------------------------------------------- */
- W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */
- W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */
- W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */
- W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */
- W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
- W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
- W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */
- W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */
- W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
- W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */
- W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */
- W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */
- W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */
- W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
- W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */
- W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) /* f0 */
- /* ----------------------------------------------- */
- /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
-};
-static const u32 twobyte_has_modrm[256 / 32] = {
- /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
- /* ----------------------------------------------- */
- W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */
- W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */
- W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */
- W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */
- W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */
- W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */
- W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */
- W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */
- W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */
- W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */
- W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */
- W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */
- W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */
- W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */
- W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */
- W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* ff */
- /* ----------------------------------------------- */
- /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
-};
 #undef W
 struct kretprobe_blackpoint kretprobe_blacklist[] = {
@@ -244,6 +191,75 @@ retry:
 }
 }
+/* Recover the probed instruction at addr for further analysis. */
+static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
+{
+ struct kprobe *kp;
+ kp = get_kprobe((void *)addr);
+ if (!kp)
+ return -EINVAL;
+
+ /*
+ * Basically, kp->ainsn.insn has an original instruction.
+ * However, RIP-relative instruction can not do single-stepping + * at different place, fix_riprel() tweaks the displacement of + * that instruction. In that case, we can't recover the instruction + * from the kp->ainsn.insn. + * + * On the other hand, kp->opcode has a copy of the first byte of + * the probed instruction, which is overwritten by int3. And + * the instruction at kp->addr is not modified by kprobes except + * for the first byte, we can recover the original instruction + * from it and kp->opcode. + */ + memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + buf[0] = kp->opcode; + return 0; +} + +/* Dummy buffers for kallsyms_lookup */ +static char __dummy_buf[KSYM_NAME_LEN]; + +/* Check if paddr is at an instruction boundary */ +static int __kprobes can_probe(unsigned long paddr) +{ + int ret; + unsigned long addr, offset = 0; + struct insn insn; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + + if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf)) + return 0; + + /* Decode instructions */ + addr = paddr - offset; + while (addr < paddr) { + kernel_insn_init(&insn, (void *)addr); + insn_get_opcode(&insn); + + /* + * Check if the instruction has been modified by another + * kprobe, in which case we replace the breakpoint by the + * original instruction in our buffer. + */ + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { + ret = recover_probed_instruction(buf, addr); + if (ret) + /* + * Another debugging subsystem might insert + * this breakpoint. In that case, we can't + * recover it. + */ + return 0; + kernel_insn_init(&insn, buf); + } + insn_get_length(&insn); + addr += insn.length; + } + + return (addr == paddr); +} + /* * Returns non-zero if opcode modifies the interrupt flag. */ @@ -277,68 +293,30 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) static void __kprobes fix_riprel(struct kprobe *p) { #ifdef CONFIG_X86_64 - u8 *insn = p->ainsn.insn; - s64 disp; - int need_modrm; - - /* Skip legacy instruction prefixes. */ - while (1) { - switch (*insn) { - case 0x66: - case 0x67: - case 0x2e: - case 0x3e: - case 0x26: - case 0x64: - case 0x65: - case 0x36: - case 0xf0: - case 0xf3: - case 0xf2: - ++insn; - continue; - } - break; - } + struct insn insn; + kernel_insn_init(&insn, p->ainsn.insn); - /* Skip REX instruction prefix. */ - if (is_REX_prefix(insn)) - ++insn; - - if (*insn == 0x0f) { - /* Two-byte opcode. */ - ++insn; - need_modrm = test_bit(*insn, - (unsigned long *)twobyte_has_modrm); - } else - /* One-byte opcode. */ - need_modrm = test_bit(*insn, - (unsigned long *)onebyte_has_modrm); - - if (need_modrm) { - u8 modrm = *++insn; - if ((modrm & 0xc7) == 0x05) { - /* %rip+disp32 addressing mode */ - /* Displacement follows ModRM byte. */ - ++insn; - /* - * The copied instruction uses the %rip-relative - * addressing mode. Adjust the displacement for the - * difference between the original location of this - * instruction and the location of the copy that will - * actually be run. The tricky bit here is making sure - * that the sign extension happens correctly in this - * calculation, since we need a signed 32-bit result to - * be sign-extended to 64 bits when it's added to the - * %rip value and yield the same 64-bit result that the - * sign-extension of the original signed 32-bit - * displacement would have given. - */ - disp = (u8 *) p->addr + *((s32 *) insn) - - (u8 *) p->ainsn.insn; - BUG_ON((s64) (s32) disp != disp); /* Sanity check. 
*/ - *(s32 *)insn = (s32) disp; - } + if (insn_rip_relative(&insn)) { + s64 newdisp; + u8 *disp; + insn_get_displacement(&insn); + /* + * The copied instruction uses the %rip-relative addressing + * mode. Adjust the displacement for the difference between + * the original location of this instruction and the location + * of the copy that will actually be run. The tricky bit here + * is making sure that the sign extension happens correctly in + * this calculation, since we need a signed 32-bit result to + * be sign-extended to 64 bits when it's added to the %rip + * value and yield the same 64-bit result that the sign- + * extension of the original signed 32-bit displacement would + * have given. + */ + newdisp = (u8 *) p->addr + (s64) insn.displacement.value - + (u8 *) p->ainsn.insn; + BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */ + disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn); + *(s32 *) disp = (s32) newdisp; } #endif } @@ -359,6 +337,8 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p) int __kprobes arch_prepare_kprobe(struct kprobe *p) { + if (!can_probe((unsigned long)p->addr)) + return -EILSEQ; /* insn: must be on special executable page on x86. */ p->ainsn.insn = get_insn_slot(); if (!p->ainsn.insn) @@ -472,17 +452,6 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, { switch (kcb->kprobe_status) { case KPROBE_HIT_SSDONE: -#ifdef CONFIG_X86_64 - /* TODO: Provide re-entrancy from post_kprobes_handler() and - * avoid exception stack corruption while single-stepping on - * the instruction of the new probe. - */ - arch_disarm_kprobe(p); - regs->ip = (unsigned long)p->addr; - reset_current_kprobe(); - preempt_enable_no_resched(); - break; -#endif case KPROBE_HIT_ACTIVE: save_previous_kprobe(kcb); set_current_kprobe(p, regs, kcb); @@ -491,18 +460,16 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, kcb->kprobe_status = KPROBE_REENTER; break; case KPROBE_HIT_SS: - if (p == kprobe_running()) { - regs->flags &= ~X86_EFLAGS_TF; - regs->flags |= kcb->kprobe_saved_flags; - return 0; - } else { - /* A probe has been hit in the codepath leading up - * to, or just after, single-stepping of a probed - * instruction. This entire codepath should strictly - * reside in .kprobes.text section. Raise a warning - * to highlight this peculiar case. - */ - } + /* A probe has been hit in the codepath leading up to, or just + * after, single-stepping of a probed instruction. This entire + * codepath should strictly reside in .kprobes.text section. + * Raise a BUG or we'll continue in an endless reentering loop + * and eventually a stack overflow. 
+ */ + printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n", + p->addr); + dump_kprobe(p); + BUG(); default: /* impossible cases */ WARN_ON(1); @@ -967,8 +934,14 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, ret = NOTIFY_STOP; break; case DIE_DEBUG: - if (post_kprobe_handler(args->regs)) + if (post_kprobe_handler(args->regs)) { + /* + * Reset the BS bit in dr6 (pointed by args->err) to + * denote completion of processing + */ + (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP; ret = NOTIFY_STOP; + } break; case DIE_GPF: /* diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index c664d515f613..63b0ec8d3d4a 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -34,7 +34,6 @@ struct kvm_para_state { u8 mmu_queue[MMU_QUEUE_SIZE]; int mmu_queue_len; - enum paravirt_lazy_mode mode; }; static DEFINE_PER_CPU(struct kvm_para_state, para_state); @@ -77,7 +76,7 @@ static void kvm_deferred_mmu_op(void *buffer, int len) { struct kvm_para_state *state = kvm_para_state(); - if (state->mode != PARAVIRT_LAZY_MMU) { + if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) { kvm_mmu_op(buffer, len); return; } @@ -185,10 +184,7 @@ static void kvm_release_pt(unsigned long pfn) static void kvm_enter_lazy_mmu(void) { - struct kvm_para_state *state = kvm_para_state(); - paravirt_enter_lazy_mmu(); - state->mode = paravirt_get_lazy_mode(); } static void kvm_leave_lazy_mmu(void) @@ -197,7 +193,6 @@ static void kvm_leave_lazy_mmu(void) mmu_queue_flush(state); paravirt_leave_lazy_mmu(); - state->mode = paravirt_get_lazy_mode(); } static void __init paravirt_ops_setup(void) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 223af43f1526..feaeb0d3aa4f 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -22,6 +22,8 @@ #include <asm/msr.h> #include <asm/apic.h> #include <linux/percpu.h> + +#include <asm/x86_init.h> #include <asm/reboot.h> #define KVM_SCALE 22 @@ -50,8 +52,8 @@ static unsigned long kvm_get_wallclock(void) struct timespec ts; int low, high; - low = (int)__pa(&wall_clock); - high = ((u64)__pa(&wall_clock) >> 32); + low = (int)__pa_symbol(&wall_clock); + high = ((u64)__pa_symbol(&wall_clock) >> 32); native_write_msr(MSR_KVM_WALL_CLOCK, low, high); vcpu_time = &get_cpu_var(hv_clock); @@ -182,12 +184,13 @@ void __init kvmclock_init(void) if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { if (kvm_register_clock("boot clock")) return; - pv_time_ops.get_wallclock = kvm_get_wallclock; - pv_time_ops.set_wallclock = kvm_set_wallclock; pv_time_ops.sched_clock = kvm_clock_read; - pv_time_ops.get_tsc_khz = kvm_get_tsc_khz; + x86_platform.calibrate_tsc = kvm_get_tsc_khz; + x86_platform.get_wallclock = kvm_get_wallclock; + x86_platform.set_wallclock = kvm_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC - pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; + x86_cpuinit.setup_percpu_clockev = + kvm_setup_secondary_clock; #endif #ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 71f1d99a635d..ec6ef60cbd17 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -67,8 +67,8 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) #ifdef CONFIG_SMP preempt_disable(); load_LDT(pc); - if (!cpus_equal(current->mm->cpu_vm_mask, - cpumask_of_cpu(smp_processor_id()))) + if (!cpumask_equal(mm_cpumask(current->mm), + cpumask_of(smp_processor_id()))) smp_call_function(flush_ldt, current->mm, 1); 
preempt_enable(); #else diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index c1c429d00130..c843f8406da2 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -25,6 +25,7 @@ #include <asm/desc.h> #include <asm/system.h> #include <asm/cacheflush.h> +#include <asm/debugreg.h> static void set_idt(void *newidt, __u16 limit) { @@ -202,6 +203,7 @@ void machine_kexec(struct kimage *image) /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); + hw_breakpoint_disable(); if (image->preserve_context) { #ifdef CONFIG_X86_IO_APIC diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 84c3bf209e98..4a8bb82248ae 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -18,6 +18,7 @@ #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> +#include <asm/debugreg.h> static int init_one_level2_page(struct kimage *image, pgd_t *pgd, unsigned long addr) @@ -282,6 +283,7 @@ void machine_kexec(struct kimage *image) /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); + hw_breakpoint_disable(); if (image->preserve_context) { #ifdef CONFIG_X86_IO_APIC diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 366baa179913..f4c538b681ca 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -317,6 +317,12 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device) return UCODE_NFOUND; } + if (*(u32 *)firmware->data != UCODE_MAGIC) { + printk(KERN_ERR "microcode: invalid UCODE_MAGIC (0x%08x)\n", + *(u32 *)firmware->data); + return UCODE_ERROR; + } + ret = generic_load_microcode(cpu, firmware->data, firmware->size); release_firmware(firmware); diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 9371448290ac..2bcad3926edb 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -73,7 +73,6 @@ #include <linux/platform_device.h> #include <linux/miscdevice.h> #include <linux/capability.h> -#include <linux/smp_lock.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/mutex.h> @@ -201,7 +200,6 @@ static int do_microcode_update(const void __user *buf, size_t size) static int microcode_open(struct inode *unused1, struct file *unused2) { - cycle_kernel_lock(); return capable(CAP_SYS_RAWIO) ? 
0 : -EPERM;
 }
@@ -210,8 +208,8 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
 {
 ssize_t ret = -EINVAL;
- if ((len >> PAGE_SHIFT) > num_physpages) {
- pr_err("microcode: too much data (max %ld pages)\n", num_physpages);
+ if ((len >> PAGE_SHIFT) > totalram_pages) {
+ pr_err("microcode: too much data (max %ld pages)\n", totalram_pages);
 return ret;
 }
@@ -236,7 +234,7 @@ static const struct file_operations microcode_fops = {
 static struct miscdevice microcode_dev = {
 .minor = MICROCODE_MINOR,
 .name = "microcode",
- .devnode = "cpu/microcode",
+ .nodename = "cpu/microcode",
 .fops = &microcode_fops,
 };
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index fcd513bf2846..5be95ef4ffec 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -45,6 +45,11 @@ static int __init mpf_checksum(unsigned char *mp, int len)
 return sum & 0xFF;
 }
+int __init default_mpc_apic_id(struct mpc_cpu *m)
+{
+ return m->apicid;
+}
+
 static void __init MP_processor_info(struct mpc_cpu *m)
 {
 int apicid;
@@ -55,10 +60,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
 return;
 }
- if (x86_quirks->mpc_apic_id)
- apicid = x86_quirks->mpc_apic_id(m);
- else
- apicid = m->apicid;
+ apicid = x86_init.mpparse.mpc_apic_id(m);
 if (m->cpuflag & CPU_BOOTPROCESSOR) {
 bootup_cpu = " (Bootup-CPU)";
@@ -70,16 +72,18 @@ static void __init MP_processor_info(struct mpc_cpu *m)
 }
 #ifdef CONFIG_X86_IO_APIC
-static void __init MP_bus_info(struct mpc_bus *m)
+void __init default_mpc_oem_bus_info(struct mpc_bus *m, char *str)
 {
- char str[7];
 memcpy(str, m->bustype, 6);
 str[6] = 0;
+ apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str);
+}
- if (x86_quirks->mpc_oem_bus_info)
- x86_quirks->mpc_oem_bus_info(m, str);
- else
- apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str);
+static void __init MP_bus_info(struct mpc_bus *m)
+{
+ char str[7];
+
+ x86_init.mpparse.mpc_oem_bus_info(m, str);
 #if MAX_MP_BUSSES < 256
 if (m->busid >= MAX_MP_BUSSES) {
@@ -96,8 +100,8 @@ static void __init MP_bus_info(struct mpc_bus *m)
 mp_bus_id_to_type[m->busid] = MP_BUS_ISA;
 #endif
 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
- if (x86_quirks->mpc_oem_pci_bus)
- x86_quirks->mpc_oem_pci_bus(m);
+ if (x86_init.mpparse.mpc_oem_pci_bus)
+ x86_init.mpparse.mpc_oem_pci_bus(m);
 clear_bit(m->busid, mp_bus_not_pci);
 #if defined(CONFIG_EISA) || defined(CONFIG_MCA)
@@ -291,6 +295,8 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
 1, mpc, mpc->length, 1);
 }
+void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { }
+
 static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
 {
 char str[16];
@@ -312,16 +318,13 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
 if (early)
 return 1;
- if (mpc->oemptr && x86_quirks->smp_read_mpc_oem) {
- struct mpc_oemtable *oem_table = (void *)(long)mpc->oemptr;
- x86_quirks->smp_read_mpc_oem(oem_table, mpc->oemsize);
- }
+ if (mpc->oemptr)
+ x86_init.mpparse.smp_read_mpc_oem(mpc);
 /*
 * Now process the configuration blocks.
*/ - if (x86_quirks->mpc_record) - *x86_quirks->mpc_record = 0; + x86_init.mpparse.mpc_record(0); while (count < mpc->length) { switch (*mpt) { @@ -353,8 +356,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) count = mpc->length; break; } - if (x86_quirks->mpc_record) - (*x86_quirks->mpc_record)++; + x86_init.mpparse.mpc_record(1); } #ifdef CONFIG_X86_BIGSMP @@ -608,7 +610,7 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) /* * Scan the memory blocks for an SMP configuration block. */ -static void __init __get_smp_config(unsigned int early) +void __init default_get_smp_config(unsigned int early) { struct mpf_intel *mpf = mpf_found; @@ -625,11 +627,6 @@ static void __init __get_smp_config(unsigned int early) if (acpi_lapic && acpi_ioapic) return; - if (x86_quirks->mach_get_smp_config) { - if (x86_quirks->mach_get_smp_config(early)) - return; - } - printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->specification); #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) @@ -670,16 +667,6 @@ static void __init __get_smp_config(unsigned int early) */ } -void __init early_get_smp_config(void) -{ - __get_smp_config(1); -} - -void __init get_smp_config(void) -{ - __get_smp_config(0); -} - static void __init smp_reserve_bootmem(struct mpf_intel *mpf) { unsigned long size = get_mpc_size(mpf->physptr); @@ -745,14 +732,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, return 0; } -static void __init __find_smp_config(unsigned int reserve) +void __init default_find_smp_config(unsigned int reserve) { unsigned int address; - if (x86_quirks->mach_find_smp_config) { - if (x86_quirks->mach_find_smp_config(reserve)) - return; - } /* * FIXME: Linux assumes you have 640K of base ram.. * this continues the error... @@ -787,16 +770,6 @@ static void __init __find_smp_config(unsigned int reserve) smp_scan_config(address, 0x400, reserve); } -void __init early_find_smp_config(void) -{ - __find_smp_config(0); -} - -void __init find_smp_config(void) -{ - __find_smp_config(1); -} - #ifdef CONFIG_X86_IO_APIC static u8 __initdata irq_used[MAX_IRQ_SOURCES]; diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c new file mode 100644 index 000000000000..3b7078abc871 --- /dev/null +++ b/arch/x86/kernel/mrst.c @@ -0,0 +1,24 @@ +/* + * mrst.c: Intel Moorestown platform specific setup code + * + * (C) Copyright 2008 Intel Corporation + * Author: Jacob Pan (jacob.jun.pan@intel.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#include <linux/init.h> + +#include <asm/setup.h> + +/* + * Moorestown specific x86_init function overrides and early setup + * calls. 
+ */ +void __init x86_mrst_early_setup(void) +{ + x86_init.resources.probe_roms = x86_init_noop; + x86_init.resources.reserve_resources = x86_init_noop; +} diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index c00610963238..553449951b84 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -237,7 +237,7 @@ static struct notifier_block __refdata msr_class_cpu_notifier = { .notifier_call = msr_class_cpu_callback, }; -static char *msr_nodename(struct device *dev) +static char *msr_devnode(struct device *dev, mode_t *mode) { return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); } @@ -258,7 +258,7 @@ static int __init msr_init(void) err = PTR_ERR(msr_class); goto out_chrdev; } - msr_class->nodename = msr_nodename; + msr_class->devnode = msr_devnode; for_each_online_cpu(i) { err = msr_device_create(i); if (err != 0) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index f5b0b4a01fb2..1b1739d16310 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -54,17 +54,12 @@ u64 _paravirt_ident_64(u64 x) return x; } -static void __init default_banner(void) +void __init default_banner(void) { printk(KERN_INFO "Booting paravirtualized kernel on %s\n", pv_info.name); } -char *memory_setup(void) -{ - return pv_init_ops.memory_setup(); -} - /* Simple instruction patching code. */ #define DEF_NATIVE(ops, name, code) \ extern const char start_##ops##_##name[], end_##ops##_##name[]; \ @@ -188,11 +183,6 @@ unsigned paravirt_patch_insns(void *insnbuf, unsigned len, return insn_len; } -void init_IRQ(void) -{ - pv_irq_ops.init_IRQ(); -} - static void native_flush_tlb(void) { __native_flush_tlb(); @@ -218,13 +208,6 @@ extern void native_irq_enable_sysexit(void); extern void native_usergs_sysret32(void); extern void native_usergs_sysret64(void); -static int __init print_banner(void) -{ - pv_init_ops.banner(); - return 0; -} -core_initcall(print_banner); - static struct resource reserve_ioports = { .start = 0, .end = IO_SPACE_LIMIT, @@ -320,21 +303,13 @@ struct pv_info pv_info = { struct pv_init_ops pv_init_ops = { .patch = native_patch, - .banner = default_banner, - .arch_setup = paravirt_nop, - .memory_setup = machine_specific_memory_setup, }; struct pv_time_ops pv_time_ops = { - .time_init = hpet_time_init, - .get_wallclock = native_get_wallclock, - .set_wallclock = native_set_wallclock, .sched_clock = native_sched_clock, - .get_tsc_khz = native_calibrate_tsc, }; struct pv_irq_ops pv_irq_ops = { - .init_IRQ = native_init_IRQ, .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl), .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), @@ -409,8 +384,6 @@ struct pv_cpu_ops pv_cpu_ops = { struct pv_apic_ops pv_apic_ops = { #ifdef CONFIG_X86_LOCAL_APIC - .setup_boot_clock = setup_boot_APIC_clock, - .setup_secondary_clock = setup_secondary_APIC_clock, .startup_ipi_hook = paravirt_nop, #endif }; @@ -424,13 +397,6 @@ struct pv_apic_ops pv_apic_ops = { #endif struct pv_mmu_ops pv_mmu_ops = { -#ifndef CONFIG_X86_64 - .pagetable_setup_start = native_pagetable_setup_start, - .pagetable_setup_done = native_pagetable_setup_done, -#else - .pagetable_setup_start = paravirt_nop, - .pagetable_setup_done = paravirt_nop, -#endif .read_cr2 = native_read_cr2, .write_cr2 = native_write_cr2, diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 971a3bec47a8..c563e4c8ff39 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -46,6 +46,7 @@ #include 
<asm/dma.h> #include <asm/rio.h> #include <asm/bios_ebda.h> +#include <asm/x86_init.h> #ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT int use_calgary __read_mostly = 1; @@ -244,7 +245,7 @@ static unsigned long iommu_range_alloc(struct device *dev, if (panic_on_overflow) panic("Calgary: fix the allocator.\n"); else - return bad_dma_address; + return DMA_ERROR_CODE; } } @@ -260,12 +261,15 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, void *vaddr, unsigned int npages, int direction) { unsigned long entry; - dma_addr_t ret = bad_dma_address; + dma_addr_t ret; entry = iommu_range_alloc(dev, tbl, npages); - if (unlikely(entry == bad_dma_address)) - goto error; + if (unlikely(entry == DMA_ERROR_CODE)) { + printk(KERN_WARNING "Calgary: failed to allocate %u pages in " + "iommu %p\n", npages, tbl); + return DMA_ERROR_CODE; + } /* set the return dma address */ ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK); @@ -273,13 +277,7 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, /* put the TCEs in the HW table */ tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK, direction); - return ret; - -error: - printk(KERN_WARNING "Calgary: failed to allocate %u pages in " - "iommu %p\n", npages, tbl); - return bad_dma_address; } static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, @@ -290,8 +288,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, unsigned long flags; /* were we called with bad_dma_address? */ - badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); - if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { + badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE); + if (unlikely((dma_addr >= DMA_ERROR_CODE) && (dma_addr < badend))) { WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA " "address 0x%Lx\n", dma_addr); return; @@ -318,13 +316,15 @@ static inline struct iommu_table *find_iommu_table(struct device *dev) pdev = to_pci_dev(dev); + /* search up the device tree for an iommu */ pbus = pdev->bus; - - /* is the device behind a bridge? 
Look for the root bus */ - while (pbus->parent) + do { + tbl = pci_iommu(pbus); + if (tbl && tbl->it_busno == pbus->number) + break; + tbl = NULL; pbus = pbus->parent; - - tbl = pci_iommu(pbus); + } while (pbus); BUG_ON(tbl && (tbl->it_busno != pbus->number)); @@ -373,7 +373,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE); entry = iommu_range_alloc(dev, tbl, npages); - if (entry == bad_dma_address) { + if (entry == DMA_ERROR_CODE) { /* makes sure unmap knows to stop */ s->dma_length = 0; goto error; @@ -391,7 +391,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, error: calgary_unmap_sg(dev, sg, nelems, dir, NULL); for_each_sg(sg, s, nelems, i) { - sg->dma_address = bad_dma_address; + sg->dma_address = DMA_ERROR_CODE; sg->dma_length = 0; } return 0; @@ -446,7 +446,7 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size, /* set up tces to cover the allocated range */ mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); - if (mapping == bad_dma_address) + if (mapping == DMA_ERROR_CODE) goto free; *dma_handle = mapping; return ret; @@ -727,7 +727,7 @@ static void __init calgary_reserve_regions(struct pci_dev *dev) struct iommu_table *tbl = pci_iommu(dev->bus); /* reserve EMERGENCY_PAGES from bad_dma_address and up */ - iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES); + iommu_range_reserve(tbl, DMA_ERROR_CODE, EMERGENCY_PAGES); /* avoid the BIOS/VGA first 640KB-1MB region */ /* for CalIOC2 - avoid the entire first MB */ @@ -1344,6 +1344,23 @@ static void __init get_tce_space_from_tar(void) return; } +static int __init calgary_iommu_init(void) +{ + int ret; + + /* ok, we're trying to use Calgary - let's roll */ + printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n"); + + ret = calgary_init(); + if (ret) { + printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " + "falling back to no_iommu\n", ret); + return ret; + } + + return 0; +} + void __init detect_calgary(void) { int bus; @@ -1357,7 +1374,7 @@ void __init detect_calgary(void) * if the user specified iommu=off or iommu=soft or we found * another HW IOMMU already, bail out. */ - if (swiotlb || no_iommu || iommu_detected) + if (no_iommu || iommu_detected) return; if (!use_calgary) @@ -1442,9 +1459,7 @@ void __init detect_calgary(void) printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n", specified_table_size); - /* swiotlb for devices that aren't behind the Calgary. 
*/ - if (max_pfn > MAX_DMA32_PFN) - swiotlb = 1; + x86_init.iommu.iommu_init = calgary_iommu_init; } return; @@ -1457,35 +1472,6 @@ cleanup: } } -int __init calgary_iommu_init(void) -{ - int ret; - - if (no_iommu || (swiotlb && !calgary_detected)) - return -ENODEV; - - if (!calgary_detected) - return -ENODEV; - - /* ok, we're trying to use Calgary - let's roll */ - printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n"); - - ret = calgary_init(); - if (ret) { - printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " - "falling back to no_iommu\n", ret); - return ret; - } - - force_iommu = 1; - bad_dma_address = 0x0; - /* dma_ops is set to swiotlb or nommu */ - if (!dma_ops) - dma_ops = &nommu_dma_ops; - - return 0; -} - static int __init calgary_parse_options(char *p) { unsigned int bridge; diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index d71c8655905b..afcc58b69c7c 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -11,10 +11,11 @@ #include <asm/gart.h> #include <asm/calgary.h> #include <asm/amd_iommu.h> +#include <asm/x86_init.h> static int forbid_dac __read_mostly; -struct dma_map_ops *dma_ops; +struct dma_map_ops *dma_ops = &nommu_dma_ops; EXPORT_SYMBOL(dma_ops); static int iommu_sac_force __read_mostly; @@ -35,22 +36,17 @@ int iommu_detected __read_mostly = 0; /* * This variable becomes 1 if iommu=pt is passed on the kernel command line. - * If this variable is 1, IOMMU implementations do no DMA ranslation for + * If this variable is 1, IOMMU implementations do no DMA translation for * devices and allow every device to access to whole physical memory. This is * useful if a user want to use an IOMMU only for KVM device assignment to * guests and not for driver dma translation. */ int iommu_pass_through __read_mostly; -dma_addr_t bad_dma_address __read_mostly = 0; -EXPORT_SYMBOL(bad_dma_address); - -/* Dummy device used for NULL arguments (normally ISA). Better would - be probably a smaller DMA mask, but this is bug-to-bug compatible - to older i386. */ +/* Dummy device used for NULL arguments (normally ISA). 
*/ struct device x86_dma_fallback_dev = { .init_name = "fallback device", - .coherent_dma_mask = DMA_BIT_MASK(32), + .coherent_dma_mask = ISA_DMA_BIT_MASK, .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask, }; EXPORT_SYMBOL(x86_dma_fallback_dev); @@ -128,20 +124,17 @@ void __init pci_iommu_alloc(void) /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); #endif + if (pci_swiotlb_init()) + return; - /* - * The order of these functions is important for - * fall-back/fail-over reasons - */ gart_iommu_hole_init(); detect_calgary(); detect_intel_iommu(); + /* needs to be called after gart_iommu_hole_init */ amd_iommu_detect(); - - pci_swiotlb_init(); } void *dma_generic_alloc_coherent(struct device *dev, size_t size, @@ -216,7 +209,7 @@ static __init int iommu_setup(char *p) if (!strncmp(p, "allowdac", 8)) forbid_dac = 0; if (!strncmp(p, "nodac", 5)) - forbid_dac = -1; + forbid_dac = 1; if (!strncmp(p, "usedac", 6)) { forbid_dac = -1; return 1; @@ -225,10 +218,8 @@ static __init int iommu_setup(char *p) if (!strncmp(p, "soft", 4)) swiotlb = 1; #endif - if (!strncmp(p, "pt", 2)) { + if (!strncmp(p, "pt", 2)) iommu_pass_through = 1; - return 1; - } gart_parse_options(p); @@ -293,27 +284,19 @@ static int __init pci_iommu_init(void) #ifdef CONFIG_PCI dma_debug_add_bus(&pci_bus_type); #endif + x86_init.iommu.iommu_init(); - calgary_iommu_init(); + if (swiotlb) { + printk(KERN_INFO "PCI-DMA: " + "Using software bounce buffering for IO (SWIOTLB)\n"); + swiotlb_print_info(); + } else + swiotlb_free(); - intel_iommu_init(); - - amd_iommu_init(); - - gart_iommu_init(); - - no_iommu_init(); return 0; } - -void pci_iommu_shutdown(void) -{ - gart_iommu_shutdown(); - - amd_iommu_shutdown(); -} /* Must execute after PCI subsystem */ -fs_initcall(pci_iommu_init); +rootfs_initcall(pci_iommu_init); #ifdef CONFIG_PCI /* Many VIA bridges seem to corrupt data for DAC. Disable it here */ diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 98a827ee9ed7..e6a0d402f171 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -16,6 +16,7 @@ #include <linux/agp_backend.h> #include <linux/init.h> #include <linux/mm.h> +#include <linux/sched.h> #include <linux/string.h> #include <linux/spinlock.h> #include <linux/pci.h> @@ -38,6 +39,7 @@ #include <asm/swiotlb.h> #include <asm/dma.h> #include <asm/k8.h> +#include <asm/x86_init.h> static unsigned long iommu_bus_base; /* GART remapping area (physical) */ static unsigned long iommu_size; /* size of remapping area bytes */ @@ -45,6 +47,8 @@ static unsigned long iommu_pages; /* .. and in pages */ static u32 *iommu_gatt_base; /* Remapping table */ +static dma_addr_t bad_dma_addr; + /* * If this is disabled the IOMMU will use an optimized flushing strategy * of only flushing when an mapping is reused. 
With it true the GART is @@ -91,7 +95,7 @@ static unsigned long alloc_iommu(struct device *dev, int size, base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), PAGE_SIZE) >> PAGE_SHIFT; - boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1, + boundary_size = ALIGN((u64)dma_get_seg_boundary(dev) + 1, PAGE_SIZE) >> PAGE_SHIFT; spin_lock_irqsave(&iommu_bitmap_lock, flags); @@ -215,7 +219,7 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, if (panic_on_overflow) panic("dma_map_area overflow %lu bytes\n", size); iommu_full(dev, size, dir); - return bad_dma_address; + return bad_dma_addr; } for (i = 0; i < npages; i++) { @@ -293,7 +297,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, int i; #ifdef CONFIG_IOMMU_DEBUG - printk(KERN_DEBUG "dma_map_sg overflow\n"); + pr_debug("dma_map_sg overflow\n"); #endif for_each_sg(sg, s, nents, i) { @@ -301,7 +305,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, if (nonforced_iommu(dev, addr, s->length)) { addr = dma_map_area(dev, addr, s->length, dir, 0); - if (addr == bad_dma_address) { + if (addr == bad_dma_addr) { if (i > 0) gart_unmap_sg(dev, sg, i, dir, NULL); nents = 0; @@ -388,12 +392,14 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, if (!dev) dev = &x86_dma_fallback_dev; - out = 0; - start = 0; - start_sg = sgmap = sg; - seg_size = 0; - max_seg_size = dma_get_max_seg_size(dev); - ps = NULL; /* shut up gcc */ + out = 0; + start = 0; + start_sg = sg; + sgmap = sg; + seg_size = 0; + max_seg_size = dma_get_max_seg_size(dev); + ps = NULL; /* shut up gcc */ + for_each_sg(sg, s, nents, i) { dma_addr_t addr = sg_phys(s); @@ -416,11 +422,12 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, sgmap, pages, need) < 0) goto error; out++; - seg_size = 0; - sgmap = sg_next(sgmap); - pages = 0; - start = i; - start_sg = s; + + seg_size = 0; + sgmap = sg_next(sgmap); + pages = 0; + start = i; + start_sg = s; } } @@ -454,7 +461,7 @@ error: iommu_full(dev, pages << PAGE_SHIFT, dir); for_each_sg(sg, s, nents, i) - s->dma_address = bad_dma_address; + s->dma_address = bad_dma_addr; return 0; } @@ -478,7 +485,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, DMA_BIDIRECTIONAL, align_mask); flush_gart(); - if (paddr != bad_dma_address) { + if (paddr != bad_dma_addr) { *dma_addr = paddr; return page_address(page); } @@ -498,6 +505,11 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr, free_pages((unsigned long)vaddr, get_order(size)); } +static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ + return (dma_addr == bad_dma_addr); +} + static int no_agp; static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) @@ -514,7 +526,7 @@ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) iommu_size -= round_up(a, PMD_PAGE_SIZE) - a; if (iommu_size < 64*1024*1024) { - printk(KERN_WARNING + pr_warning( "PCI-DMA: Warning: Small IOMMU %luMB." 
" Consider increasing the AGP aperture in BIOS\n", iommu_size >> 20); @@ -569,28 +581,32 @@ void set_up_gart_resume(u32 aper_order, u32 aper_alloc) aperture_alloc = aper_alloc; } -static int gart_resume(struct sys_device *dev) +static void gart_fixup_northbridges(struct sys_device *dev) { - printk(KERN_INFO "PCI-DMA: Resuming GART IOMMU\n"); + int i; - if (fix_up_north_bridges) { - int i; + if (!fix_up_north_bridges) + return; - printk(KERN_INFO "PCI-DMA: Restoring GART aperture settings\n"); + pr_info("PCI-DMA: Restoring GART aperture settings\n"); - for (i = 0; i < num_k8_northbridges; i++) { - struct pci_dev *dev = k8_northbridges[i]; + for (i = 0; i < num_k8_northbridges; i++) { + struct pci_dev *dev = k8_northbridges[i]; - /* - * Don't enable translations just yet. That is the next - * step. Restore the pre-suspend aperture settings. - */ - pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, - aperture_order << 1); - pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, - aperture_alloc >> 25); - } + /* + * Don't enable translations just yet. That is the next + * step. Restore the pre-suspend aperture settings. + */ + pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1); + pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25); } +} + +static int gart_resume(struct sys_device *dev) +{ + pr_info("PCI-DMA: Resuming GART IOMMU\n"); + + gart_fixup_northbridges(dev); enable_gart_translations(); @@ -603,15 +619,14 @@ static int gart_suspend(struct sys_device *dev, pm_message_t state) } static struct sysdev_class gart_sysdev_class = { - .name = "gart", - .suspend = gart_suspend, - .resume = gart_resume, + .name = "gart", + .suspend = gart_suspend, + .resume = gart_resume, }; static struct sys_device device_gart = { - .id = 0, - .cls = &gart_sysdev_class, + .cls = &gart_sysdev_class, }; /* @@ -626,7 +641,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info) void *gatt; int i, error; - printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); + pr_info("PCI-DMA: Disabling AGP.\n"); + aper_size = aper_base = info->aper_size = 0; dev = NULL; for (i = 0; i < num_k8_northbridges; i++) { @@ -644,6 +660,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) } if (!aper_base) goto nommu; + info->aper_base = aper_base; info->aper_size = aper_size >> 20; @@ -666,14 +683,14 @@ static __init int init_k8_gatt(struct agp_kern_info *info) flush_gart(); - printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", + pr_info("PCI-DMA: aperture base @ %x size %u KB\n", aper_base, aper_size>>10); return 0; nommu: /* Should not happen anymore */ - printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n" + pr_warning("PCI-DMA: More than 4GB of RAM and no IOMMU\n" "falling back to iommu=soft.\n"); return -1; } @@ -685,14 +702,15 @@ static struct dma_map_ops gart_dma_ops = { .unmap_page = gart_unmap_page, .alloc_coherent = gart_alloc_coherent, .free_coherent = gart_free_coherent, + .mapping_error = gart_mapping_error, }; -void gart_iommu_shutdown(void) +static void gart_iommu_shutdown(void) { struct pci_dev *dev; int i; - if (no_agp && (dma_ops != &gart_dma_ops)) + if (no_agp) return; for (i = 0; i < num_k8_northbridges; i++) { @@ -707,7 +725,7 @@ void gart_iommu_shutdown(void) } } -void __init gart_iommu_init(void) +int __init gart_iommu_init(void) { struct agp_kern_info info; unsigned long iommu_start; @@ -717,7 +735,7 @@ void __init gart_iommu_init(void) long i; if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) - return; + return 0; #ifndef 
CONFIG_AGP_AMD64 no_agp = 1; @@ -729,35 +747,28 @@ void __init gart_iommu_init(void) (agp_copy_info(agp_bridge, &info) < 0); #endif - if (swiotlb) - return; - - /* Did we detect a different HW IOMMU? */ - if (iommu_detected && !gart_iommu_aperture) - return; - if (no_iommu || (!force_iommu && max_pfn <= MAX_DMA32_PFN) || !gart_iommu_aperture || (no_agp && init_k8_gatt(&info) < 0)) { if (max_pfn > MAX_DMA32_PFN) { - printk(KERN_WARNING "More than 4GB of memory " - "but GART IOMMU not available.\n"); - printk(KERN_WARNING "falling back to iommu=soft.\n"); + pr_warning("More than 4GB of memory but GART IOMMU not available.\n"); + pr_warning("falling back to iommu=soft.\n"); } - return; + return 0; } /* need to map that range */ - aper_size = info.aper_size << 20; - aper_base = info.aper_base; - end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); + aper_size = info.aper_size << 20; + aper_base = info.aper_base; + end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); + if (end_pfn > max_low_pfn_mapped) { start_pfn = (aper_base>>PAGE_SHIFT); init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); } - printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); + pr_info("PCI-DMA: using GART IOMMU.\n"); iommu_size = check_iommu_size(info.aper_base, aper_size); iommu_pages = iommu_size >> PAGE_SHIFT; @@ -772,8 +783,7 @@ void __init gart_iommu_init(void) ret = dma_debug_resize_entries(iommu_pages); if (ret) - printk(KERN_DEBUG - "PCI-DMA: Cannot trace all the entries\n"); + pr_debug("PCI-DMA: Cannot trace all the entries\n"); } #endif @@ -783,15 +793,14 @@ void __init gart_iommu_init(void) */ iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES); - agp_memory_reserved = iommu_size; - printk(KERN_INFO - "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", + pr_info("PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", iommu_size >> 20); - iommu_start = aper_size - iommu_size; - iommu_bus_base = info.aper_base + iommu_start; - bad_dma_address = iommu_bus_base; - iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); + agp_memory_reserved = iommu_size; + iommu_start = aper_size - iommu_size; + iommu_bus_base = info.aper_base + iommu_start; + bad_dma_addr = iommu_bus_base; + iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); /* * Unmap the IOMMU part of the GART. The alias of the page is @@ -813,7 +822,7 @@ void __init gart_iommu_init(void) * the pages as Not-Present: */ wbinvd(); - + /* * Now all caches are flushed and we can safely enable * GART hardware. 
Doing it early leaves the possibility @@ -837,6 +846,10 @@ void __init gart_iommu_init(void) flush_gart(); dma_ops = &gart_dma_ops; + x86_platform.iommu_shutdown = gart_iommu_shutdown; + swiotlb = 0; + + return 0; } void __init gart_parse_options(char *p) @@ -855,7 +868,7 @@ void __init gart_parse_options(char *p) #endif if (isdigit(*p) && get_option(&p, &arg)) iommu_size = arg; - if (!strncmp(p, "fullflush", 8)) + if (!strncmp(p, "fullflush", 9)) iommu_fullflush = 1; if (!strncmp(p, "nofullflush", 11)) iommu_fullflush = 0; diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index a3933d4330cd..22be12b60a8f 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -33,7 +33,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page, dma_addr_t bus = page_to_phys(page) + offset; WARN_ON(size == 0); if (!check_addr("map_single", dev, bus, size)) - return bad_dma_address; + return DMA_ERROR_CODE; flush_write_buffers(); return bus; } @@ -103,12 +103,3 @@ struct dma_map_ops nommu_dma_ops = { .sync_sg_for_device = nommu_sync_sg_for_device, .is_phys = 1, }; - -void __init no_iommu_init(void) -{ - if (dma_ops) - return; - - force_iommu = 0; /* no HW IOMMU */ - dma_ops = &nommu_dma_ops; -} diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index e8a35016115f..e3c0a66b9e77 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -42,19 +42,28 @@ static struct dma_map_ops swiotlb_dma_ops = { .dma_supported = NULL, }; -void __init pci_swiotlb_init(void) +/* + * pci_swiotlb_init - initialize swiotlb if necessary + * + * This returns non-zero if we are forced to use swiotlb (by the boot + * option). + */ +int __init pci_swiotlb_init(void) { + int use_swiotlb = swiotlb | swiotlb_force; + /* don't initialize swiotlb if iommu=off (no_iommu=1) */ #ifdef CONFIG_X86_64 - if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) || - iommu_pass_through) - swiotlb = 1; + if (!no_iommu && max_pfn > MAX_DMA32_PFN) + swiotlb = 1; #endif if (swiotlb_force) swiotlb = 1; + if (swiotlb) { - printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); - swiotlb_init(); + swiotlb_init(0); dma_ops = &swiotlb_dma_ops; } + + return use_swiotlb; } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 071166a4ba83..744508e7cfdd 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -9,7 +9,8 @@ #include <linux/pm.h> #include <linux/clockchips.h> #include <linux/random.h> -#include <trace/power.h> +#include <trace/events/power.h> +#include <linux/hw_breakpoint.h> #include <asm/system.h> #include <asm/apic.h> #include <asm/syscalls.h> @@ -17,6 +18,7 @@ #include <asm/uaccess.h> #include <asm/i387.h> #include <asm/ds.h> +#include <asm/debugreg.h> unsigned long idle_halt; EXPORT_SYMBOL(idle_halt); @@ -25,9 +27,6 @@ EXPORT_SYMBOL(idle_nomwait); struct kmem_cache *task_xstate_cachep; -DEFINE_TRACE(power_start); -DEFINE_TRACE(power_end); - int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { *dst = *src; @@ -106,14 +105,7 @@ void flush_thread(void) } #endif - clear_tsk_thread_flag(tsk, TIF_DEBUG); - - tsk->thread.debugreg0 = 0; - tsk->thread.debugreg1 = 0; - tsk->thread.debugreg2 = 0; - tsk->thread.debugreg3 = 0; - tsk->thread.debugreg6 = 0; - tsk->thread.debugreg7 = 0; + flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); /* * Forget coprocessor state.. 
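[Editor's note on the bad_dma_address -> bad_dma_addr / .mapping_error changes in the GART hunks above: with gart_mapping_error() wired into gart_dma_ops, callers are expected to test returned handles through the generic dma_mapping_error() helper rather than comparing against an IOMMU-private sentinel. A minimal driver-side sketch; the function and buffer names are made up for illustration.]

#include <linux/dma-mapping.h>
#include <linux/device.h>

/*
 * Hypothetical driver-side check: map a buffer for device output and let the
 * generic helper consult the dma_map_ops ->mapping_error hook (for GART this
 * is gart_mapping_error(), which compares against bad_dma_addr).
 */
static int example_map_for_device(struct device *dev, void *buf, size_t len,
                                  dma_addr_t *handle)
{
        *handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
        if (dma_mapping_error(dev, *handle))
                return -ENOMEM; /* never hand a failed mapping to the hardware */

        return 0;
}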
@@ -195,16 +187,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, else if (next->debugctlmsr != prev->debugctlmsr) update_debugctlmsr(next->debugctlmsr); - if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { - set_debugreg(next->debugreg0, 0); - set_debugreg(next->debugreg1, 1); - set_debugreg(next->debugreg2, 2); - set_debugreg(next->debugreg3, 3); - /* no 4 and 5 */ - set_debugreg(next->debugreg6, 6); - set_debugreg(next->debugreg7, 7); - } - if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ test_tsk_thread_flag(next_p, TIF_NOTSC)) { /* prev and next are different */ @@ -299,9 +281,7 @@ static inline int hlt_use_halt(void) void default_idle(void) { if (hlt_use_halt()) { - struct power_trace it; - - trace_power_start(&it, POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1); current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we @@ -314,7 +294,6 @@ void default_idle(void) else local_irq_enable(); current_thread_info()->status |= TS_POLLING; - trace_power_end(&it); } else { local_irq_enable(); /* loop is done by the caller */ @@ -372,9 +351,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { - struct power_trace it; - - trace_power_start(&it, POWER_CSTATE, (ax>>4)+1); + trace_power_start(POWER_CSTATE, (ax>>4)+1); if (!need_resched()) { if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); @@ -384,15 +361,13 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) if (!need_resched()) __mwait(ax, cx); } - trace_power_end(&it); } /* Default MONITOR/MWAIT with no hints, used for default C1 state */ static void mwait_idle(void) { - struct power_trace it; if (!need_resched()) { - trace_power_start(&it, POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1); if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); @@ -402,7 +377,6 @@ static void mwait_idle(void) __sti_mwait(0, 0); else local_irq_enable(); - trace_power_end(&it); } else local_irq_enable(); } @@ -414,13 +388,11 @@ static void mwait_idle(void) */ static void poll_idle(void) { - struct power_trace it; - - trace_power_start(&it, POWER_CSTATE, 0); + trace_power_start(POWER_CSTATE, 0); local_irq_enable(); while (!need_resched()) cpu_relax(); - trace_power_end(&it); + trace_power_end(0); } /* @@ -568,10 +540,8 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) void __init init_c1e_mask(void) { /* If we're using c1e_idle, we need to allocate c1e_mask. 
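[Editor's note: the init_c1e_mask() change that follows, like the smpboot.c conversion further down, replaces an alloc_cpumask_var() + cpumask_clear() pair with zalloc_cpumask_var(), which hands back an already-zeroed mask. A minimal sketch of the equivalence; the helper name is invented.]

#include <linux/cpumask.h>
#include <linux/gfp.h>

/* Hypothetical helper showing the before/after of the conversion below. */
static int example_alloc_mask(cpumask_var_t *mask)
{
        /*
         * Old pattern:
         *      if (alloc_cpumask_var(mask, GFP_KERNEL))
         *              cpumask_clear(*mask);
         *
         * New pattern: allocation and zeroing in one call.
         */
        if (!zalloc_cpumask_var(mask, GFP_KERNEL))
                return -ENOMEM;

        return 0;
}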
*/ - if (pm_idle == c1e_idle) { - alloc_cpumask_var(&c1e_mask, GFP_KERNEL); - cpumask_clear(c1e_mask); - } + if (pm_idle == c1e_idle) + zalloc_cpumask_var(&c1e_mask, GFP_KERNEL); } static int __init idle_setup(char *str) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 4cf79567cdab..540140284f60 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -58,6 +58,7 @@ #include <asm/idle.h> #include <asm/syscalls.h> #include <asm/ds.h> +#include <asm/debugreg.h> asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -134,7 +135,7 @@ void __show_regs(struct pt_regs *regs, int all) ss = regs->ss & 0xffff; gs = get_user_gs(regs); } else { - sp = (unsigned long) (®s->sp); + sp = kernel_stack_pointer(regs); savesegment(ss, ss); savesegment(gs, gs); } @@ -259,7 +260,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, task_user_gs(p) = get_user_gs(regs); + p->thread.io_bitmap_ptr = NULL; tsk = current; + err = -ENOMEM; + + memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); + if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, IO_BITMAP_BYTES, GFP_KERNEL); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ad535b683170..70cf15873f3d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -52,6 +52,7 @@ #include <asm/idle.h> #include <asm/syscalls.h> #include <asm/ds.h> +#include <asm/debugreg.h> asmlinkage extern void ret_from_fork(void); @@ -297,12 +298,16 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, p->thread.fs = me->thread.fs; p->thread.gs = me->thread.gs; + p->thread.io_bitmap_ptr = NULL; savesegment(gs, p->thread.gsindex); savesegment(fs, p->thread.fsindex); savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); + err = -ENOMEM; + memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); + if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); if (!p->thread.io_bitmap_ptr) { @@ -341,6 +346,7 @@ out: kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } + return err; } @@ -495,6 +501,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ if (preload_fpu) __math_state_restore(); + return prev_p; } @@ -664,3 +671,8 @@ long sys_arch_prctl(int code, unsigned long addr) return do_arch_prctl(current, code, addr); } +unsigned long KSTK_ESP(struct task_struct *task) +{ + return (test_tsk_thread_flag(task, TIF_IA32)) ? 
+ (task_pt_regs(task)->sp) : ((task)->thread.usersp); +} diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 8d7d5c9c1be3..04d182a7cfdb 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -22,6 +22,8 @@ #include <linux/seccomp.h> #include <linux/signal.h> #include <linux/workqueue.h> +#include <linux/perf_event.h> +#include <linux/hw_breakpoint.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -34,6 +36,7 @@ #include <asm/prctl.h> #include <asm/proto.h> #include <asm/ds.h> +#include <asm/hw_breakpoint.h> #include "tls.h" @@ -49,6 +52,118 @@ enum x86_regset { REGSET_IOPERM32, }; +struct pt_regs_offset { + const char *name; + int offset; +}; + +#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)} +#define REG_OFFSET_END {.name = NULL, .offset = 0} + +static const struct pt_regs_offset regoffset_table[] = { +#ifdef CONFIG_X86_64 + REG_OFFSET_NAME(r15), + REG_OFFSET_NAME(r14), + REG_OFFSET_NAME(r13), + REG_OFFSET_NAME(r12), + REG_OFFSET_NAME(r11), + REG_OFFSET_NAME(r10), + REG_OFFSET_NAME(r9), + REG_OFFSET_NAME(r8), +#endif + REG_OFFSET_NAME(bx), + REG_OFFSET_NAME(cx), + REG_OFFSET_NAME(dx), + REG_OFFSET_NAME(si), + REG_OFFSET_NAME(di), + REG_OFFSET_NAME(bp), + REG_OFFSET_NAME(ax), +#ifdef CONFIG_X86_32 + REG_OFFSET_NAME(ds), + REG_OFFSET_NAME(es), + REG_OFFSET_NAME(fs), + REG_OFFSET_NAME(gs), +#endif + REG_OFFSET_NAME(orig_ax), + REG_OFFSET_NAME(ip), + REG_OFFSET_NAME(cs), + REG_OFFSET_NAME(flags), + REG_OFFSET_NAME(sp), + REG_OFFSET_NAME(ss), + REG_OFFSET_END, +}; + +/** + * regs_query_register_offset() - query register offset from its name + * @name: the name of a register + * + * regs_query_register_offset() returns the offset of a register in struct + * pt_regs from its name. If the name is invalid, this returns -EINVAL; + */ +int regs_query_register_offset(const char *name) +{ + const struct pt_regs_offset *roff; + for (roff = regoffset_table; roff->name != NULL; roff++) + if (!strcmp(roff->name, name)) + return roff->offset; + return -EINVAL; +} + +/** + * regs_query_register_name() - query register name from its offset + * @offset: the offset of a register in struct pt_regs. + * + * regs_query_register_name() returns the name of a register from its + * offset in struct pt_regs. If the @offset is invalid, this returns NULL; + */ +const char *regs_query_register_name(unsigned int offset) +{ + const struct pt_regs_offset *roff; + for (roff = regoffset_table; roff->name != NULL; roff++) + if (roff->offset == offset) + return roff->name; + return NULL; +} + +static const int arg_offs_table[] = { +#ifdef CONFIG_X86_32 + [0] = offsetof(struct pt_regs, ax), + [1] = offsetof(struct pt_regs, dx), + [2] = offsetof(struct pt_regs, cx) +#else /* CONFIG_X86_64 */ + [0] = offsetof(struct pt_regs, di), + [1] = offsetof(struct pt_regs, si), + [2] = offsetof(struct pt_regs, dx), + [3] = offsetof(struct pt_regs, cx), + [4] = offsetof(struct pt_regs, r8), + [5] = offsetof(struct pt_regs, r9) +#endif +}; + +/** + * regs_get_argument_nth() - get Nth argument at function call + * @regs: pt_regs which contains registers at function entry. + * @n: argument number. + * + * regs_get_argument_nth() returns @n th argument of a function call. + * Since usually the kernel stack will be changed right after function entry, + * you must use this at function entry. If the @n th entry is NOT in the + * kernel stack or pt_regs, this returns 0. 
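[Editor's note: the pt_regs accessors added above (regs_query_register_offset(), regs_get_argument_nth(), and friends) are aimed at in-kernel consumers such as kprobes-based tracers. A hedged usage sketch follows; the probe target and handler names are invented, and, as the comment above notes, argument access is only meaningful at function entry.]

#include <linux/kprobes.h>
#include <linux/ptrace.h>
#include <linux/kernel.h>

/* Hypothetical kprobe pre-handler: read the probed function's first argument
 * and look up where "sp" lives inside struct pt_regs. */
static int example_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
        unsigned long arg0 = regs_get_argument_nth(regs, 0);
        int sp_offset = regs_query_register_offset("sp");

        pr_debug("%s: arg0=%#lx (sp is at pt_regs offset %d)\n",
                 p->symbol_name, arg0, sp_offset);
        return 0;
}

static struct kprobe example_kp = {
        .symbol_name    = "do_fork",    /* example target only */
        .pre_handler    = example_pre_handler,
};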
+ */ +unsigned long regs_get_argument_nth(struct pt_regs *regs, unsigned int n) +{ + if (n < ARRAY_SIZE(arg_offs_table)) + return *(unsigned long *)((char *)regs + arg_offs_table[n]); + else { + /* + * The typical case: arg n is on the stack. + * (Note: stack[0] = return address, so skip it) + */ + n -= ARRAY_SIZE(arg_offs_table); + return regs_get_kernel_stack_nth(regs, 1 + n); + } +} + /* * does not yet catch signals sent when the child dies. * in exit.c or in signal.c. @@ -137,11 +252,6 @@ static int set_segment_reg(struct task_struct *task, return 0; } -static unsigned long debugreg_addr_limit(struct task_struct *task) -{ - return TASK_SIZE - 3; -} - #else /* CONFIG_X86_64 */ #define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT) @@ -266,15 +376,6 @@ static int set_segment_reg(struct task_struct *task, return 0; } -static unsigned long debugreg_addr_limit(struct task_struct *task) -{ -#ifdef CONFIG_IA32_EMULATION - if (test_tsk_thread_flag(task, TIF_IA32)) - return IA32_PAGE_OFFSET - 3; -#endif - return TASK_SIZE_MAX - 7; -} - #endif /* CONFIG_X86_32 */ static unsigned long get_flags(struct task_struct *task) @@ -325,16 +426,6 @@ static int putreg(struct task_struct *child, return set_flags(child, value); #ifdef CONFIG_X86_64 - /* - * Orig_ax is really just a flag with small positive and - * negative values, so make sure to always sign-extend it - * from 32 bits so that it works correctly regardless of - * whether we come from a 32-bit environment or not. - */ - case offsetof(struct user_regs_struct, orig_ax): - value = (long) (s32) value; - break; - case offsetof(struct user_regs_struct,fs_base): if (value >= TASK_SIZE_OF(child)) return -EIO; @@ -464,99 +555,239 @@ static int genregs_set(struct task_struct *target, return ret; } +static void ptrace_triggered(struct perf_event *bp, void *data) +{ + int i; + struct thread_struct *thread = &(current->thread); + + /* + * Store in the virtual DR6 register the fact that the breakpoint + * was hit so the thread's debugger will see it. + */ + for (i = 0; i < HBP_NUM; i++) { + if (thread->ptrace_bps[i] == bp) + break; + } + + thread->debugreg6 |= (DR_TRAP0 << i); +} + /* - * This function is trivial and will be inlined by the compiler. - * Having it separates the implementation details of debug - * registers from the interface details of ptrace. + * Walk through every ptrace breakpoints for this thread and + * build the dr7 value on top of their attributes. + * */ -static unsigned long ptrace_get_debugreg(struct task_struct *child, int n) +static unsigned long ptrace_get_dr7(struct perf_event *bp[]) { - switch (n) { - case 0: return child->thread.debugreg0; - case 1: return child->thread.debugreg1; - case 2: return child->thread.debugreg2; - case 3: return child->thread.debugreg3; - case 6: return child->thread.debugreg6; - case 7: return child->thread.debugreg7; + int i; + int dr7 = 0; + struct arch_hw_breakpoint *info; + + for (i = 0; i < HBP_NUM; i++) { + if (bp[i] && !bp[i]->attr.disabled) { + info = counter_arch_bp(bp[i]); + dr7 |= encode_dr7(i, info->len, info->type); + } } - return 0; + + return dr7; } -static int ptrace_set_debugreg(struct task_struct *child, - int n, unsigned long data) +static struct perf_event * +ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, + struct task_struct *tsk, int disabled) { - int i; + int err; + int gen_len, gen_type; + DEFINE_BREAKPOINT_ATTR(attr); - if (unlikely(n == 4 || n == 5)) - return -EIO; + /* + * We shoud have at least an inactive breakpoint at this + * slot. 
It means the user is writing dr7 without having + * written the address register first + */ + if (!bp) + return ERR_PTR(-EINVAL); - if (n < 4 && unlikely(data >= debugreg_addr_limit(child))) - return -EIO; + err = arch_bp_generic_fields(len, type, &gen_len, &gen_type); + if (err) + return ERR_PTR(err); - switch (n) { - case 0: child->thread.debugreg0 = data; break; - case 1: child->thread.debugreg1 = data; break; - case 2: child->thread.debugreg2 = data; break; - case 3: child->thread.debugreg3 = data; break; + attr = bp->attr; + attr.bp_len = gen_len; + attr.bp_type = gen_type; + attr.disabled = disabled; - case 6: - if ((data & ~0xffffffffUL) != 0) - return -EIO; - child->thread.debugreg6 = data; - break; + return modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk); +} + +/* + * Handle ptrace writes to debug register 7. + */ +static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) +{ + struct thread_struct *thread = &(tsk->thread); + unsigned long old_dr7; + int i, orig_ret = 0, rc = 0; + int enabled, second_pass = 0; + unsigned len, type; + struct perf_event *bp; + + data &= ~DR_CONTROL_RESERVED; + old_dr7 = ptrace_get_dr7(thread->ptrace_bps); +restore: + /* + * Loop through all the hardware breakpoints, making the + * appropriate changes to each. + */ + for (i = 0; i < HBP_NUM; i++) { + enabled = decode_dr7(data, i, &len, &type); + bp = thread->ptrace_bps[i]; + + if (!enabled) { + if (bp) { + /* + * Don't unregister the breakpoints right-away, + * unless all register_user_hw_breakpoint() + * requests have succeeded. This prevents + * any window of opportunity for debug + * register grabbing by other users. + */ + if (!second_pass) + continue; + + thread->ptrace_bps[i] = NULL; + bp = ptrace_modify_breakpoint(bp, len, type, + tsk, 1); + if (IS_ERR(bp)) { + rc = PTR_ERR(bp); + thread->ptrace_bps[i] = NULL; + break; + } + thread->ptrace_bps[i] = bp; + } + continue; + } - case 7: + bp = ptrace_modify_breakpoint(bp, len, type, tsk, 0); + + /* Incorrect bp, or we have a bug in bp API */ + if (IS_ERR(bp)) { + rc = PTR_ERR(bp); + thread->ptrace_bps[i] = NULL; + break; + } + thread->ptrace_bps[i] = bp; + } + /* + * Make a second pass to free the remaining unused breakpoints + * or to restore the original breakpoints if an error occurred. + */ + if (!second_pass) { + second_pass = 1; + if (rc < 0) { + orig_ret = rc; + data = old_dr7; + } + goto restore; + } + return ((orig_ret < 0) ? orig_ret : rc); +} + +/* + * Handle PTRACE_PEEKUSR calls for the debug register area. + */ +static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) +{ + struct thread_struct *thread = &(tsk->thread); + unsigned long val = 0; + + if (n < HBP_NUM) { + struct perf_event *bp; + bp = thread->ptrace_bps[n]; + if (!bp) + return 0; + val = bp->hw.info.address; + } else if (n == 6) { + val = thread->debugreg6; + } else if (n == 7) { + val = ptrace_get_dr7(thread->ptrace_bps); + } + return val; +} + +static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, + unsigned long addr) +{ + struct perf_event *bp; + struct thread_struct *t = &tsk->thread; + DEFINE_BREAKPOINT_ATTR(attr); + + if (!t->ptrace_bps[nr]) { /* - * Sanity-check data. Take one half-byte at once with - * check = (val >> (16 + 4*i)) & 0xf. It contains the - * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits - * 2 and 3 are LENi. Given a list of invalid values, - * we do mask |= 1 << invalid_value, so that - * (mask >> check) & 1 is a correct test for invalid - * values. 
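[Editor's note, for reference while reading the removed sanity check above and the new encode_dr7()/decode_dr7() users: DR7 gives each of the four breakpoints a local-enable bit at position 2n and a four-bit field at 16 + 4n holding R/Wn (low two bits) and LENn (high two bits). An illustrative encoder -- not the kernel's encode_dr7(), just the register layout being reasoned about.]

/* Illustration only: build a DR7 image enabling breakpoint slot n with the
 * given raw R/W (type) and LEN fields, per the layout described above. */
static unsigned long example_dr7_for_slot(int n, unsigned int rw, unsigned int len)
{
        unsigned long dr7 = 0;

        dr7 |= 1UL << (n * 2);                          /* Ln: local enable bit */
        dr7 |= (unsigned long)((len << 2) | rw) << (16 + n * 4);
                                                        /* LENn | R/Wn nibble   */
        return dr7;
}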
- * - * R/Wi contains the type of the breakpoint / - * watchpoint, LENi contains the length of the watched - * data in the watchpoint case. - * - * The invalid values are: - * - LENi == 0x10 (undefined), so mask |= 0x0f00. [32-bit] - * - R/Wi == 0x10 (break on I/O reads or writes), so - * mask |= 0x4444. - * - R/Wi == 0x00 && LENi != 0x00, so we have mask |= - * 0x1110. - * - * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54. - * - * See the Intel Manual "System Programming Guide", - * 15.2.4 - * - * Note that LENi == 0x10 is defined on x86_64 in long - * mode (i.e. even for 32-bit userspace software, but - * 64-bit kernel), so the x86_64 mask value is 0x5454. - * See the AMD manual no. 24593 (AMD64 System Programming) + * Put stub len and type to register (reserve) an inactive but + * correct bp */ -#ifdef CONFIG_X86_32 -#define DR7_MASK 0x5f54 -#else -#define DR7_MASK 0x5554 -#endif - data &= ~DR_CONTROL_RESERVED; - for (i = 0; i < 4; i++) - if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1) - return -EIO; - child->thread.debugreg7 = data; - if (data) - set_tsk_thread_flag(child, TIF_DEBUG); - else - clear_tsk_thread_flag(child, TIF_DEBUG); - break; + attr.bp_addr = addr; + attr.bp_len = HW_BREAKPOINT_LEN_1; + attr.bp_type = HW_BREAKPOINT_W; + attr.disabled = 1; + + bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk); + } else { + bp = t->ptrace_bps[nr]; + t->ptrace_bps[nr] = NULL; + + attr = bp->attr; + attr.bp_addr = addr; + bp = modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk); } + /* + * CHECKME: the previous code returned -EIO if the addr wasn't a + * valid task virtual addr. The new one will return -EINVAL in this + * case. + * -EINVAL may be what we want for in-kernel breakpoints users, but + * -EIO looks better for ptrace, since we refuse a register writing + * for the user. And anyway this is the previous behaviour. + */ + if (IS_ERR(bp)) + return PTR_ERR(bp); + + t->ptrace_bps[nr] = bp; return 0; } /* + * Handle PTRACE_POKEUSR calls for the debug register area. + */ +int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) +{ + struct thread_struct *thread = &(tsk->thread); + int rc = 0; + + /* There are no DR4 or DR5 registers */ + if (n == 4 || n == 5) + return -EIO; + + if (n == 6) { + thread->debugreg6 = val; + goto ret_path; + } + if (n < HBP_NUM) { + rc = ptrace_set_breakpoint_addr(tsk, n, val); + if (rc) + return rc; + } + /* All that's left is DR7 */ + if (n == 7) + rc = ptrace_write_dr7(tsk, val); + +ret_path: + return rc; +} + +/* * These access the current or another (stopped) task's io permission * bitmap for debugging or core dump. */ @@ -1126,10 +1357,15 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value) case offsetof(struct user32, regs.orig_eax): /* - * Sign-extend the value so that orig_eax = -1 - * causes (long)orig_ax < 0 tests to fire correctly. + * A 32-bit debugger setting orig_eax means to restore + * the state of the task restarting a 32-bit syscall. + * Make sure we interpret the -ERESTART* codes correctly + * in case the task is not actually still sitting at the + * exit from a 32-bit syscall with TS_COMPAT still set. 
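[Editor's note, stepping back from the orig_eax change to the debug-register paths above: the user-visible interface serviced by ptrace_set_breakpoint_addr()/ptrace_write_dr7() is unchanged -- a debugger still pokes the u_debugreg slots. A hypothetical debugger-side sketch that arms DR0 as a one-byte write watchpoint; error handling is trimmed and the helper name is made up.]

#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>
#include <stddef.h>

/* Hypothetical tracer helper: watch one byte at 'addr' for writes in the
 * stopped tracee 'pid' using DR0, then enable it through DR7. */
static int example_set_write_watchpoint(pid_t pid, unsigned long addr)
{
        /* L0 = 1 (bit 0), R/W0 = 01b "data writes" (bits 16-17), LEN0 = 00b "1 byte" */
        unsigned long dr7 = (1UL << 0) | (1UL << 16);

        if (ptrace(PTRACE_POKEUSER, pid,
                   (void *)offsetof(struct user, u_debugreg[0]), (void *)addr) < 0)
                return -1;

        return ptrace(PTRACE_POKEUSER, pid,
                      (void *)offsetof(struct user, u_debugreg[7]), (void *)dr7);
}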
*/ - regs->orig_ax = (long) (s32) value; + regs->orig_ax = value; + if (syscall_get_nr(child, regs) >= 0) + task_thread_info(child)->status |= TS_COMPAT; break; case offsetof(struct user32, regs.eflags): diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index af71d06624bf..6c3b2c6fd772 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -508,7 +508,7 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev) pci_read_config_dword(nb_ht, 0x60, &val); set_dev_node(&dev->dev, val & 7); - pci_dev_put(dev); + pci_dev_put(nb_ht); } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index a06e8d101844..2b97fc5b124e 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -4,6 +4,8 @@ #include <linux/pm.h> #include <linux/efi.h> #include <linux/dmi.h> +#include <linux/sched.h> +#include <linux/tboot.h> #include <acpi/reboot.h> #include <asm/io.h> #include <asm/apic.h> @@ -21,7 +23,7 @@ # include <linux/ctype.h> # include <linux/mc146818rtc.h> #else -# include <asm/iommu.h> +# include <asm/x86_init.h> #endif /* @@ -434,6 +436,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"), }, }, + { /* Handle problems with rebooting on Apple Macmini3,1 */ + .callback = set_pci_reboot, + .ident = "Apple Macmini3,1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"), + }, + }, { } }; @@ -508,6 +518,8 @@ static void native_machine_emergency_restart(void) if (reboot_emergency) emergency_vmx_disable_all(); + tboot_shutdown(TB_SHUTDOWN_REBOOT); + /* Tell the BIOS if we want cold or warm reboot */ *((unsigned short *)__va(0x472)) = reboot_mode; @@ -610,7 +622,7 @@ void native_machine_shutdown(void) #endif #ifdef CONFIG_X86_64 - pci_iommu_shutdown(); + x86_platform.iommu_shutdown(); #endif } @@ -634,6 +646,8 @@ static void native_machine_halt(void) /* stop other cpus and apics */ machine_shutdown(); + tboot_shutdown(TB_SHUTDOWN_HALT); + /* stop this cpu */ stop_this_cpu(NULL); } @@ -645,6 +659,8 @@ static void native_machine_power_off(void) machine_shutdown(); pm_power_off(); } + /* a fallback in case there is no PM info available */ + tboot_shutdown(TB_SHUTDOWN_HALT); } struct machine_ops machine_ops = { diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 5d465b207e72..1cfbbfc3ae26 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -8,6 +8,7 @@ #include <linux/pnp.h> #include <asm/vsyscall.h> +#include <asm/x86_init.h> #include <asm/time.h> #ifdef CONFIG_X86_32 @@ -165,33 +166,29 @@ void rtc_cmos_write(unsigned char val, unsigned char addr) } EXPORT_SYMBOL(rtc_cmos_write); -static int set_rtc_mmss(unsigned long nowtime) +int update_persistent_clock(struct timespec now) { unsigned long flags; int retval; spin_lock_irqsave(&rtc_lock, flags); - retval = set_wallclock(nowtime); + retval = x86_platform.set_wallclock(now.tv_sec); spin_unlock_irqrestore(&rtc_lock, flags); return retval; } /* not static: needed by APM */ -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { unsigned long retval, flags; spin_lock_irqsave(&rtc_lock, flags); - retval = get_wallclock(); + retval = x86_platform.get_wallclock(); spin_unlock_irqrestore(&rtc_lock, flags); - return retval; -} - -int update_persistent_clock(struct timespec now) -{ - return set_rtc_mmss(now.tv_sec); + ts->tv_sec = retval; + ts->tv_nsec = 0; } unsigned long 
long native_read_tsc(void) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 63f32d220ef2..82e88cdda9bc 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -27,6 +27,7 @@ #include <linux/screen_info.h> #include <linux/ioport.h> #include <linux/acpi.h> +#include <linux/sfi.h> #include <linux/apm_bios.h> #include <linux/initrd.h> #include <linux/bootmem.h> @@ -66,6 +67,7 @@ #include <linux/percpu.h> #include <linux/crash_dump.h> +#include <linux/tboot.h> #include <video/edid.h> @@ -107,10 +109,7 @@ #ifdef CONFIG_X86_64 #include <asm/numa_64.h> #endif - -#ifndef ARCH_SETUP -#define ARCH_SETUP -#endif +#include <asm/mce.h> /* * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. @@ -133,9 +132,9 @@ int default_cpu_present_to_apicid(int mps_cpu) return __default_cpu_present_to_apicid(mps_cpu); } -int default_check_phys_apicid_present(int boot_cpu_physical_apicid) +int default_check_phys_apicid_present(int phys_apicid) { - return __default_check_phys_apicid_present(boot_cpu_physical_apicid); + return __default_check_phys_apicid_present(phys_apicid); } #endif @@ -171,13 +170,6 @@ static struct resource bss_resource = { #ifdef CONFIG_X86_32 -static struct resource video_ram_resource = { - .name = "Video RAM area", - .start = 0xa0000, - .end = 0xbffff, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - /* cpu data as detected by the assembly code in head.S */ struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1}; /* common cpu data for all cpus */ @@ -256,7 +248,7 @@ EXPORT_SYMBOL(edd); * from boot_params into a safe place. * */ -static inline void copy_edd(void) +static inline void __init copy_edd(void) { memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer, sizeof(edd.mbr_signature)); @@ -265,7 +257,7 @@ static inline void copy_edd(void) edd.edd_info_nr = boot_params.eddbuf_entries; } #else -static inline void copy_edd(void) +static inline void __init copy_edd(void) { } #endif @@ -605,7 +597,7 @@ static struct resource standard_io_resources[] = { .flags = IORESOURCE_BUSY | IORESOURCE_IO } }; -static void __init reserve_standard_io_resources(void) +void __init reserve_standard_io_resources(void) { int i; @@ -637,10 +629,6 @@ static int __init setup_elfcorehdr(char *arg) early_param("elfcorehdr", setup_elfcorehdr); #endif -static struct x86_quirks default_x86_quirks __initdata; - -struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; - #ifdef CONFIG_X86_RESERVE_LOW_64K static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) { @@ -673,6 +661,13 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { }, }, { + .callback = dmi_low_memory_corruption, + .ident = "Phoenix/MSC BIOS", + .matches = { + DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"), + }, + }, + { /* * AMI BIOS with low memory corruption was found on Intel DG45ID board. * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will @@ -757,7 +752,7 @@ void __init setup_arch(char **cmdline_p) } #endif - ARCH_SETUP + x86_init.oem.arch_setup(); setup_memory_map(); parse_setup_data(); @@ -796,6 +791,16 @@ void __init setup_arch(char **cmdline_p) strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; +#ifdef CONFIG_X86_64 + /* + * Must call this twice: Once just to detect whether hardware doesn't + * support NX (so that the early EHCI debug console setup can safely + * call set_fixmap(), and then again after parsing early parameters to + * honor the respective command line option. 
+ */ + check_efer(); +#endif + parse_early_param(); #ifdef CONFIG_X86_64 @@ -833,11 +838,9 @@ void __init setup_arch(char **cmdline_p) * VMware detection requires dmi to be available, so this * needs to be done after dmi_scan_machine, for the BP. */ - init_hypervisor(&boot_cpu_data); + init_hypervisor_platform(); -#ifdef CONFIG_X86_32 - probe_roms(); -#endif + x86_init.resources.probe_roms(); /* after parse_early_param, so could debug it */ insert_resource(&iomem_resource, &code_resource); @@ -972,10 +975,11 @@ void __init setup_arch(char **cmdline_p) kvmclock_init(); #endif - paravirt_pagetable_setup_start(swapper_pg_dir); + x86_init.paging.pagetable_setup_start(swapper_pg_dir); paging_init(); - paravirt_pagetable_setup_done(swapper_pg_dir); - paravirt_post_allocator_init(); + x86_init.paging.pagetable_setup_done(swapper_pg_dir); + + tboot_probe(); #ifdef CONFIG_X86_64 map_vsyscall(); @@ -990,13 +994,13 @@ void __init setup_arch(char **cmdline_p) */ acpi_boot_init(); -#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS) + sfi_init(); + /* * get boot-time SMP configuration: */ if (smp_found_config) get_smp_config(); -#endif prefill_possible_map(); @@ -1015,10 +1019,7 @@ void __init setup_arch(char **cmdline_p) e820_reserve_resources(); e820_mark_nosave_regions(max_low_pfn); -#ifdef CONFIG_X86_32 - request_resource(&iomem_resource, &video_ram_resource); -#endif - reserve_standard_io_resources(); + x86_init.resources.reserve_resources(); e820_setup_gap(); @@ -1030,78 +1031,24 @@ void __init setup_arch(char **cmdline_p) conswitchp = &dummy_con; #endif #endif -} + x86_init.oem.banner(); -#ifdef CONFIG_X86_32 - -/** - * x86_quirk_intr_init - post gate setup interrupt initialisation - * - * Description: - * Fill in any interrupts that may have been left out by the general - * init_IRQ() routine. interrupts having to do with the machine rather - * than the devices on the I/O bus (like APIC interrupts in intel MP - * systems) are started here. - **/ -void __init x86_quirk_intr_init(void) -{ - if (x86_quirks->arch_intr_init) { - if (x86_quirks->arch_intr_init()) - return; - } + mcheck_init(); } -/** - * x86_quirk_trap_init - initialise system specific traps - * - * Description: - * Called as the final act of trap_init(). Used in VISWS to initialise - * the various board specific APIC traps. - **/ -void __init x86_quirk_trap_init(void) -{ - if (x86_quirks->arch_trap_init) { - if (x86_quirks->arch_trap_init()) - return; - } -} +#ifdef CONFIG_X86_32 -static struct irqaction irq0 = { - .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, - .name = "timer" +static struct resource video_ram_resource = { + .name = "Video RAM area", + .start = 0xa0000, + .end = 0xbffff, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM }; -/** - * x86_quirk_pre_time_init - do any specific initialisations before. - * - **/ -void __init x86_quirk_pre_time_init(void) +void __init i386_reserve_resources(void) { - if (x86_quirks->arch_pre_time_init) - x86_quirks->arch_pre_time_init(); + request_resource(&iomem_resource, &video_ram_resource); + reserve_standard_io_resources(); } -/** - * x86_quirk_time_init - do any specific initialisations for the system timer. 
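[Editor's note: the setup_arch() changes above replace the ARCH_SETUP macro and the x86_quirks callbacks being deleted here with the x86_init / x86_platform operation tables (x86_init.oem.arch_setup, x86_init.resources.reserve_resources, x86_platform.get_wallclock, and so on). A hedged sketch of how a sub-architecture would now hook in; the platform name and functions are invented, and only struct members visible in this diff are assumed.]

#include <asm/x86_init.h>
#include <asm/setup.h>

/* Hypothetical board support: override the default hooks instead of patching
 * ARCH_SETUP or filling in an x86_quirks structure. */
static void __init myboard_arch_setup(void)
{
        /* early, board-specific fixups formerly done from ARCH_SETUP */
}

static unsigned long myboard_get_wallclock(void)
{
        return 0;       /* read the board's RTC here */
}

void __init myboard_early_setup(void)
{
        x86_init.oem.arch_setup         = myboard_arch_setup;
        x86_platform.get_wallclock      = myboard_get_wallclock;
}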
- * - * Description: - * Must plug the system timer interrupt source at HZ into the IRQ listed - * in irq_vectors.h:TIMER_IRQ - **/ -void __init x86_quirk_time_init(void) -{ - if (x86_quirks->arch_time_init) { - /* - * A nonzero return code does not mean failure, it means - * that the architecture quirk does not want any - * generic (timer) setup to be performed after this: - */ - if (x86_quirks->arch_time_init()) - return; - } - - irq0.mask = cpumask_of_cpu(0); - setup_irq(0, &irq0); -} #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 07d81916f212..d559af913e1f 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -55,6 +55,7 @@ EXPORT_SYMBOL(__per_cpu_offset); #define PERCPU_FIRST_CHUNK_RESERVE 0 #endif +#ifdef CONFIG_X86_32 /** * pcpu_need_numa - determine percpu allocation needs to consider NUMA * @@ -83,6 +84,7 @@ static bool __init pcpu_need_numa(void) #endif return false; } +#endif /** * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu @@ -124,308 +126,35 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, } /* - * Large page remap allocator - * - * This allocator uses PMD page as unit. A PMD page is allocated for - * each cpu and each is remapped into vmalloc area using PMD mapping. - * As PMD page is quite large, only part of it is used for the first - * chunk. Unused part is returned to the bootmem allocator. - * - * So, the PMD pages are mapped twice - once to the physical mapping - * and to the vmalloc area for the first percpu chunk. The double - * mapping does add one more PMD TLB entry pressure but still is much - * better than only using 4k mappings while still being NUMA friendly. + * Helpers for first chunk memory allocation */ -#ifdef CONFIG_NEED_MULTIPLE_NODES -struct pcpul_ent { - unsigned int cpu; - void *ptr; -}; - -static size_t pcpul_size; -static struct pcpul_ent *pcpul_map; -static struct vm_struct pcpul_vm; - -static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) +static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) { - size_t off = (size_t)pageno << PAGE_SHIFT; - - if (off >= pcpul_size) - return NULL; - - return virt_to_page(pcpul_map[cpu].ptr + off); + return pcpu_alloc_bootmem(cpu, size, align); } -static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) +static void __init pcpu_fc_free(void *ptr, size_t size) { - size_t map_size, dyn_size; - unsigned int cpu; - int i, j; - ssize_t ret; - - if (!chosen) { - size_t vm_size = VMALLOC_END - VMALLOC_START; - size_t tot_size = nr_cpu_ids * PMD_SIZE; - - /* on non-NUMA, embedding is better */ - if (!pcpu_need_numa()) - return -EINVAL; - - /* don't consume more than 20% of vmalloc area */ - if (tot_size > vm_size / 5) { - pr_info("PERCPU: too large chunk size %zuMB for " - "large page remap\n", tot_size >> 20); - return -EINVAL; - } - } - - /* need PSE */ - if (!cpu_has_pse) { - pr_warning("PERCPU: lpage allocator requires PSE\n"); - return -EINVAL; - } - - /* - * Currently supports only single page. Supporting multiple - * pages won't be too difficult if it ever becomes necessary. 
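[Editor's note: the allocator-specific code being removed in this setup_percpu.c hunk, and the generic pcpu_embed_first_chunk()/pcpu_page_first_chunk() calls that replace it further down, only change how the first per-CPU chunk is laid out; consumers are untouched. For context, a minimal sketch of the access pattern that keeps working either way; the variable and function names are invented.]

#include <linux/percpu.h>
#include <linux/cpumask.h>

/* Hypothetical per-CPU counter: once setup_per_cpu_areas() has published
 * per_cpu_offset() for every CPU, per_cpu() resolves through it regardless
 * of which first-chunk allocator was picked at boot. */
static DEFINE_PER_CPU(unsigned long, example_hits);

static unsigned long example_total_hits(void)
{
        unsigned long sum = 0;
        int cpu;

        for_each_possible_cpu(cpu)
                sum += per_cpu(example_hits, cpu);

        return sum;
}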
- */ - pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + - PERCPU_DYNAMIC_RESERVE); - if (pcpul_size > PMD_SIZE) { - pr_warning("PERCPU: static data is larger than large page, " - "can't use large page\n"); - return -EINVAL; - } - dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; - - /* allocate pointer array and alloc large pages */ - map_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpul_map[0])); - pcpul_map = alloc_bootmem(map_size); - - for_each_possible_cpu(cpu) { - pcpul_map[cpu].cpu = cpu; - pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE, - PMD_SIZE); - if (!pcpul_map[cpu].ptr) { - pr_warning("PERCPU: failed to allocate large page " - "for cpu%u\n", cpu); - goto enomem; - } - - /* - * Only use pcpul_size bytes and give back the rest. - * - * Ingo: The 2MB up-rounding bootmem is needed to make - * sure the partial 2MB page is still fully RAM - it's - * not well-specified to have a PAT-incompatible area - * (unmapped RAM, device memory, etc.) in that hole. - */ - free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size), - PMD_SIZE - pcpul_size); - - memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size); - } - - /* allocate address and map */ - pcpul_vm.flags = VM_ALLOC; - pcpul_vm.size = nr_cpu_ids * PMD_SIZE; - vm_area_register_early(&pcpul_vm, PMD_SIZE); - - for_each_possible_cpu(cpu) { - pmd_t *pmd, pmd_v; - - pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr + - cpu * PMD_SIZE); - pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)), - PAGE_KERNEL_LARGE); - set_pmd(pmd, pmd_v); - } - - /* we're ready, commit */ - pr_info("PERCPU: Remapped at %p with large pages, static data " - "%zu bytes\n", pcpul_vm.addr, static_size); - - ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, dyn_size, - PMD_SIZE, pcpul_vm.addr, NULL); - - /* sort pcpul_map array for pcpu_lpage_remapped() */ - for (i = 0; i < nr_cpu_ids - 1; i++) - for (j = i + 1; j < nr_cpu_ids; j++) - if (pcpul_map[i].ptr > pcpul_map[j].ptr) { - struct pcpul_ent tmp = pcpul_map[i]; - pcpul_map[i] = pcpul_map[j]; - pcpul_map[j] = tmp; - } - - return ret; - -enomem: - for_each_possible_cpu(cpu) - if (pcpul_map[cpu].ptr) - free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size); - free_bootmem(__pa(pcpul_map), map_size); - return -ENOMEM; + free_bootmem(__pa(ptr), size); } -/** - * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area - * @kaddr: the kernel address in question - * - * Determine whether @kaddr falls in the pcpul recycled area. This is - * used by pageattr to detect VM aliases and break up the pcpu PMD - * mapping such that the same physical page is not mapped under - * different attributes. - * - * The recycled area is always at the tail of a partially used PMD - * page. - * - * RETURNS: - * Address of corresponding remapped pcpu address if match is found; - * otherwise, NULL. - */ -void *pcpu_lpage_remapped(void *kaddr) +static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) { - void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK); - unsigned long offset = (unsigned long)kaddr & ~PMD_MASK; - int left = 0, right = nr_cpu_ids - 1; - int pos; - - /* pcpul in use at all? 
*/ - if (!pcpul_map) - return NULL; - - /* okay, perform binary search */ - while (left <= right) { - pos = (left + right) / 2; - - if (pcpul_map[pos].ptr < pmd_addr) - left = pos + 1; - else if (pcpul_map[pos].ptr > pmd_addr) - right = pos - 1; - else { - /* it shouldn't be in the area for the first chunk */ - WARN_ON(offset < pcpul_size); - - return pcpul_vm.addr + - pcpul_map[pos].cpu * PMD_SIZE + offset; - } - } - - return NULL; -} +#ifdef CONFIG_NEED_MULTIPLE_NODES + if (early_cpu_to_node(from) == early_cpu_to_node(to)) + return LOCAL_DISTANCE; + else + return REMOTE_DISTANCE; #else -static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) -{ - return -EINVAL; -} + return LOCAL_DISTANCE; #endif - -/* - * Embedding allocator - * - * The first chunk is sized to just contain the static area plus - * module and dynamic reserves and embedded into linear physical - * mapping so that it can use PMD mapping without additional TLB - * pressure. - */ -static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) -{ - size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; - - /* - * If large page isn't supported, there's no benefit in doing - * this. Also, embedding allocation doesn't play well with - * NUMA. - */ - if (!chosen && (!cpu_has_pse || pcpu_need_numa())) - return -EINVAL; - - return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, - reserve - PERCPU_FIRST_CHUNK_RESERVE, -1); } -/* - * 4k page allocator - * - * This is the basic allocator. Static percpu area is allocated - * page-by-page and most of initialization is done by the generic - * setup function. - */ -static struct page **pcpu4k_pages __initdata; -static int pcpu4k_nr_static_pages __initdata; - -static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) -{ - if (pageno < pcpu4k_nr_static_pages) - return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno]; - return NULL; -} - -static void __init pcpu4k_populate_pte(unsigned long addr) +static void __init pcpup_populate_pte(unsigned long addr) { populate_extra_pte(addr); } -static ssize_t __init setup_pcpu_4k(size_t static_size) -{ - size_t pages_size; - unsigned int cpu; - int i, j; - ssize_t ret; - - pcpu4k_nr_static_pages = PFN_UP(static_size); - - /* unaligned allocations can't be freed, round up to page size */ - pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * nr_cpu_ids - * sizeof(pcpu4k_pages[0])); - pcpu4k_pages = alloc_bootmem(pages_size); - - /* allocate and copy */ - j = 0; - for_each_possible_cpu(cpu) - for (i = 0; i < pcpu4k_nr_static_pages; i++) { - void *ptr; - - ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); - if (!ptr) { - pr_warning("PERCPU: failed to allocate " - "4k page for cpu%u\n", cpu); - goto enomem; - } - - memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); - pcpu4k_pages[j++] = virt_to_page(ptr); - } - - /* we're ready, commit */ - pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", - pcpu4k_nr_static_pages, static_size); - - ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, -1, - -1, NULL, pcpu4k_populate_pte); - goto out_free_ar; - -enomem: - while (--j >= 0) - free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE); - ret = -ENOMEM; -out_free_ar: - free_bootmem(__pa(pcpu4k_pages), pages_size); - return ret; -} - -/* for explicit first chunk allocator selection */ -static char pcpu_chosen_alloc[16] __initdata; - -static int __init percpu_alloc_setup(char *str) -{ - strncpy(pcpu_chosen_alloc, str, 
sizeof(pcpu_chosen_alloc) - 1); - return 0; -} -early_param("percpu_alloc", percpu_alloc_setup); - static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -441,52 +170,49 @@ static inline void setup_percpu_segment(int cpu) void __init setup_per_cpu_areas(void) { - size_t static_size = __per_cpu_end - __per_cpu_start; unsigned int cpu; unsigned long delta; - size_t pcpu_unit_size; - ssize_t ret; + int rc; pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); /* - * Allocate percpu area. If PSE is supported, try to make use - * of large page mappings. Please read comments on top of - * each allocator for details. + * Allocate percpu area. Embedding allocator is our favorite; + * however, on NUMA configurations, it can result in very + * sparse unit mapping and vmalloc area isn't spacious enough + * on 32bit. Use page in that case. */ - ret = -EINVAL; - if (strlen(pcpu_chosen_alloc)) { - if (strcmp(pcpu_chosen_alloc, "4k")) { - if (!strcmp(pcpu_chosen_alloc, "lpage")) - ret = setup_pcpu_lpage(static_size, true); - else if (!strcmp(pcpu_chosen_alloc, "embed")) - ret = setup_pcpu_embed(static_size, true); - else - pr_warning("PERCPU: unknown allocator %s " - "specified\n", pcpu_chosen_alloc); - if (ret < 0) - pr_warning("PERCPU: %s allocator failed (%zd), " - "falling back to 4k\n", - pcpu_chosen_alloc, ret); - } - } else { - ret = setup_pcpu_lpage(static_size, false); - if (ret < 0) - ret = setup_pcpu_embed(static_size, false); +#ifdef CONFIG_X86_32 + if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa()) + pcpu_chosen_fc = PCPU_FC_PAGE; +#endif + rc = -EINVAL; + if (pcpu_chosen_fc != PCPU_FC_PAGE) { + const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE; + const size_t dyn_size = PERCPU_MODULE_RESERVE + + PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE; + + rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, + dyn_size, atom_size, + pcpu_cpu_distance, + pcpu_fc_alloc, pcpu_fc_free); + if (rc < 0) + pr_warning("PERCPU: %s allocator failed (%d), " + "falling back to page size\n", + pcpu_fc_names[pcpu_chosen_fc], rc); } - if (ret < 0) - ret = setup_pcpu_4k(static_size); - if (ret < 0) - panic("cannot allocate static percpu area (%zu bytes, err=%zd)", - static_size, ret); - - pcpu_unit_size = ret; + if (rc < 0) + rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, + pcpu_fc_alloc, pcpu_fc_free, + pcpup_populate_pte); + if (rc < 0) + panic("cannot initialize percpu area (err=%d)", rc); /* alrighty, percpu areas up and running */ delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) { - per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; + per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu]; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c new file mode 100644 index 000000000000..34e099382651 --- /dev/null +++ b/arch/x86/kernel/sfi.c @@ -0,0 +1,122 @@ +/* + * sfi.c - x86 architecture SFI support. + * + * Copyright (c) 2009, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
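[Editor's note: the new sfi.c below registers parsers for the fixed-format SFI tables with sfi_table_parse() and walks their entries with SFI_GET_NUM_ENTRIES(). A hedged sketch of an additional consumer following the same shape, to be wired up after sfi_init(); only the helpers, table signature and entry fields actually used below are assumed, and the function names are invented.]

#include <linux/sfi.h>
#include <linux/init.h>
#include <linux/kernel.h>

/* Hypothetical extra consumer, shaped like sfi_parse_ioapic() below: walk the
 * APIC table and log each I/O APIC base address without registering anything. */
static int __init example_dump_ioapics(struct sfi_table_header *table)
{
        struct sfi_table_simple *sb = (struct sfi_table_simple *)table;
        struct sfi_apic_table_entry *pentry;
        int i, num;

        num = SFI_GET_NUM_ENTRIES(sb, struct sfi_apic_table_entry);
        pentry = (struct sfi_apic_table_entry *)sb->pentry;

        for (i = 0; i < num; i++, pentry++)
                pr_info("IOAPIC[%d] at 0x%llx\n",
                        i, (unsigned long long)pentry->phys_addr);

        return 0;
}

static int __init example_sfi_dump(void)
{
        /* call this from an initcall of your choice, after sfi_init() has run */
        return sfi_table_parse(SFI_SIG_APIC, NULL, NULL, example_dump_ioapics);
}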
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#define KMSG_COMPONENT "SFI" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/acpi.h> +#include <linux/init.h> +#include <linux/sfi.h> +#include <linux/io.h> + +#include <asm/io_apic.h> +#include <asm/mpspec.h> +#include <asm/setup.h> +#include <asm/apic.h> + +#ifdef CONFIG_X86_LOCAL_APIC +static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; + +void __init mp_sfi_register_lapic_address(unsigned long address) +{ + mp_lapic_addr = address; + + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); + if (boot_cpu_physical_apicid == -1U) + boot_cpu_physical_apicid = read_apic_id(); + + pr_info("Boot CPU = %d\n", boot_cpu_physical_apicid); +} + +/* All CPUs enumerated by SFI must be present and enabled */ +void __cpuinit mp_sfi_register_lapic(u8 id) +{ + if (MAX_APICS - id <= 0) { + pr_warning("Processor #%d invalid (max %d)\n", + id, MAX_APICS); + return; + } + + pr_info("registering lapic[%d]\n", id); + + generic_processor_info(id, GET_APIC_VERSION(apic_read(APIC_LVR))); +} + +static int __init sfi_parse_cpus(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_cpu_table_entry *pentry; + int i; + int cpu_num; + + sb = (struct sfi_table_simple *)table; + cpu_num = SFI_GET_NUM_ENTRIES(sb, struct sfi_cpu_table_entry); + pentry = (struct sfi_cpu_table_entry *)sb->pentry; + + for (i = 0; i < cpu_num; i++) { + mp_sfi_register_lapic(pentry->apic_id); + pentry++; + } + + smp_found_config = 1; + return 0; +} +#endif /* CONFIG_X86_LOCAL_APIC */ + +#ifdef CONFIG_X86_IO_APIC +static u32 gsi_base; + +static int __init sfi_parse_ioapic(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_apic_table_entry *pentry; + int i, num; + + sb = (struct sfi_table_simple *)table; + num = SFI_GET_NUM_ENTRIES(sb, struct sfi_apic_table_entry); + pentry = (struct sfi_apic_table_entry *)sb->pentry; + + for (i = 0; i < num; i++) { + mp_register_ioapic(i, pentry->phys_addr, gsi_base); + gsi_base += io_apic_get_redir_entries(i); + pentry++; + } + + WARN(pic_mode, KERN_WARNING + "SFI: pic_mod shouldn't be 1 when IOAPIC table is present\n"); + pic_mode = 0; + return 0; +} +#endif /* CONFIG_X86_IO_APIC */ + +/* + * sfi_platform_init(): register lapics & io-apics + */ +int __init sfi_platform_init(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + mp_sfi_register_lapic_address(sfi_lapic_addr); + sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus); +#endif +#ifdef CONFIG_X86_IO_APIC + sfi_table_parse(SFI_SIG_APIC, NULL, NULL, sfi_parse_ioapic); +#endif + return 0; +} diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 81e58238c4ce..fbf3b07c8567 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -799,15 +799,6 @@ static void do_signal(struct pt_regs *regs) signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { - /* - * Re-enable any watchpoints before delivering the - * signal to user space. The processor register will - * have been cleared if the watchpoint triggered - * inside the kernel. 
- */ - if (current->thread.debugreg7) - set_debugreg(current->thread.debugreg7, 7); - /* Whee! Actually deliver the signal. */ if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { /* @@ -856,7 +847,7 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE /* notify userspace of pending MCEs */ if (thread_info_flags & _TIF_MCE_NOTIFY) mce_notify_process(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c36cc1452cdc..324f2a44c221 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -47,6 +47,7 @@ #include <linux/bootmem.h> #include <linux/err.h> #include <linux/nmi.h> +#include <linux/tboot.h> #include <asm/acpi.h> #include <asm/desc.h> @@ -323,7 +324,7 @@ notrace static void __cpuinit start_secondary(void *unused) /* enable local interrupts */ local_irq_enable(); - setup_secondary_clock(); + x86_cpuinit.setup_percpu_clockev(); wmb(); cpu_idle(); @@ -1058,12 +1059,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) #endif current_thread_info()->cpu = 0; /* needed? */ for_each_possible_cpu(i) { - alloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); - alloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); - alloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL); - cpumask_clear(per_cpu(cpu_core_map, i)); - cpumask_clear(per_cpu(cpu_sibling_map, i)); - cpumask_clear(cpu_data(i).llc_shared_map); + zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); + zalloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL); } set_cpu_sibling_map(0); @@ -1113,13 +1111,26 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) printk(KERN_INFO "CPU%d: ", 0); print_cpu_info(&cpu_data(0)); - setup_boot_clock(); + x86_init.timers.setup_percpu_clockev(); if (is_uv_system()) uv_system_init(); + + set_mtrr_aps_delayed_init(); out: preempt_enable(); } + +void arch_enable_nonboot_cpus_begin(void) +{ + set_mtrr_aps_delayed_init(); +} + +void arch_enable_nonboot_cpus_end(void) +{ + mtrr_aps_init(); +} + /* * Early setup to make printk work. */ @@ -1141,6 +1152,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus) setup_ioapic_dest(); #endif check_nmi_watchdog(); + mtrr_aps_init(); } static int __initdata setup_possible_cpus = -1; @@ -1238,16 +1250,7 @@ static void __ref remove_cpu_from_maps(int cpu) void cpu_disable_common(void) { int cpu = smp_processor_id(); - /* - * HACK: - * Allow any queued timer interrupts to get serviced - * This is only a temporary solution until we cleanup - * fixup_irqs as we do for IA64. 
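[Editor's note: the smpboot.c changes here route per-CPU clockevent setup through function pointers -- x86_init.timers.setup_percpu_clockev for the boot CPU and x86_cpuinit.setup_percpu_clockev for secondaries, called from start_secondary(). A hedged sketch of a platform or paravirt override, assuming both members are assignable as the surrounding code suggests; the platform and function names are invented.]

#include <asm/x86_init.h>

/* Hypothetical platform clockevent hook: program a non-APIC per-CPU timer
 * on whichever CPU this runs on. */
static void myplat_setup_percpu_clockev(void)
{
        /* set up this CPU's tick source here */
}

static void __init myplat_time_init(void)
{
        /* boot-CPU path (native_smp_prepare_cpus) and AP path (start_secondary) */
        x86_init.timers.setup_percpu_clockev    = myplat_setup_percpu_clockev;
        x86_cpuinit.setup_percpu_clockev        = myplat_setup_percpu_clockev;
}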
- */ - local_irq_enable(); - mdelay(1); - local_irq_disable(); remove_siblinginfo(cpu); /* It's now safe to remove this processor from the online map */ @@ -1318,6 +1321,7 @@ void play_dead_common(void) void native_play_dead(void) { play_dead_common(); + tboot_shutdown(TB_SHUTDOWN_WFS); wbinvd_halt(); } diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index d51321ddafda..0157cd26d7cc 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -335,4 +335,4 @@ ENTRY(sys_call_table) .long sys_preadv .long sys_pwritev .long sys_rt_tgsigqueueinfo /* 335 */ - .long sys_perf_counter_open + .long sys_perf_event_open diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c new file mode 100644 index 000000000000..86c9f91b48ae --- /dev/null +++ b/arch/x86/kernel/tboot.c @@ -0,0 +1,447 @@ +/* + * tboot.c: main implementation of helper functions used by kernel for + * runtime support of Intel(R) Trusted Execution Technology + * + * Copyright (c) 2006-2009, Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include <linux/dma_remapping.h> +#include <linux/init_task.h> +#include <linux/spinlock.h> +#include <linux/delay.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/dmar.h> +#include <linux/cpu.h> +#include <linux/pfn.h> +#include <linux/mm.h> +#include <linux/tboot.h> + +#include <asm/trampoline.h> +#include <asm/processor.h> +#include <asm/bootparam.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/fixmap.h> +#include <asm/proto.h> +#include <asm/setup.h> +#include <asm/e820.h> +#include <asm/io.h> + +#include "acpi/realmode/wakeup.h" + +/* Global pointer to shared data; NULL means no measured launch. */ +struct tboot *tboot __read_mostly; + +/* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */ +#define AP_WAIT_TIMEOUT 1 + +#undef pr_fmt +#define pr_fmt(fmt) "tboot: " fmt + +static u8 tboot_uuid[16] __initdata = TBOOT_UUID; + +void __init tboot_probe(void) +{ + /* Look for valid page-aligned address for shared page. */ + if (!boot_params.tboot_addr) + return; + /* + * also verify that it is mapped as we expect it before calling + * set_fixmap(), to reduce chance of garbage value causing crash + */ + if (!e820_any_mapped(boot_params.tboot_addr, + boot_params.tboot_addr, E820_RESERVED)) { + pr_warning("non-0 tboot_addr but it is not of type E820_RESERVED\n"); + return; + } + + /* only a natively booted kernel should be using TXT */ + if (paravirt_enabled()) { + pr_warning("non-0 tboot_addr but pv_ops is enabled\n"); + return; + } + + /* Map and check for tboot UUID. 
*/ + set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr); + tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE); + if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) { + pr_warning("tboot at 0x%llx is invalid\n", + boot_params.tboot_addr); + tboot = NULL; + return; + } + if (tboot->version < 5) { + pr_warning("tboot version is invalid: %u\n", tboot->version); + tboot = NULL; + return; + } + + pr_info("found shared page at phys addr 0x%llx:\n", + boot_params.tboot_addr); + pr_debug("version: %d\n", tboot->version); + pr_debug("log_addr: 0x%08x\n", tboot->log_addr); + pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry); + pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base); + pr_debug("tboot_size: 0x%x\n", tboot->tboot_size); +} + +static pgd_t *tboot_pg_dir; +static struct mm_struct tboot_mm = { + .mm_rb = RB_ROOT, + .pgd = swapper_pg_dir, + .mm_users = ATOMIC_INIT(2), + .mm_count = ATOMIC_INIT(1), + .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), + .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), + .mmlist = LIST_HEAD_INIT(init_mm.mmlist), + .cpu_vm_mask = CPU_MASK_ALL, +}; + +static inline void switch_to_tboot_pt(void) +{ + write_cr3(virt_to_phys(tboot_pg_dir)); +} + +static int map_tboot_page(unsigned long vaddr, unsigned long pfn, + pgprot_t prot) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset(&tboot_mm, vaddr); + pud = pud_alloc(&tboot_mm, pgd, vaddr); + if (!pud) + return -1; + pmd = pmd_alloc(&tboot_mm, pud, vaddr); + if (!pmd) + return -1; + pte = pte_alloc_map(&tboot_mm, pmd, vaddr); + if (!pte) + return -1; + set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); + pte_unmap(pte); + return 0; +} + +static int map_tboot_pages(unsigned long vaddr, unsigned long start_pfn, + unsigned long nr) +{ + /* Reuse the original kernel mapping */ + tboot_pg_dir = pgd_alloc(&tboot_mm); + if (!tboot_pg_dir) + return -1; + + for (; nr > 0; nr--, vaddr += PAGE_SIZE, start_pfn++) { + if (map_tboot_page(vaddr, start_pfn, PAGE_KERNEL_EXEC)) + return -1; + } + + return 0; +} + +static void tboot_create_trampoline(void) +{ + u32 map_base, map_size; + + /* Create identity map for tboot shutdown code. */ + map_base = PFN_DOWN(tboot->tboot_base); + map_size = PFN_UP(tboot->tboot_size); + if (map_tboot_pages(map_base << PAGE_SHIFT, map_base, map_size)) + panic("tboot: Error mapping tboot pages (mfns) @ 0x%x, 0x%x\n", + map_base, map_size); +} + +#ifdef CONFIG_ACPI_SLEEP + +static void add_mac_region(phys_addr_t start, unsigned long size) +{ + struct tboot_mac_region *mr; + phys_addr_t end = start + size; + + if (start && size) { + mr = &tboot->mac_regions[tboot->num_mac_regions++]; + mr->start = round_down(start, PAGE_SIZE); + mr->size = round_up(end, PAGE_SIZE) - mr->start; + } +} + +static int tboot_setup_sleep(void) +{ + tboot->num_mac_regions = 0; + + /* S3 resume code */ + add_mac_region(acpi_wakeup_address, WAKEUP_SIZE); + +#ifdef CONFIG_X86_TRAMPOLINE + /* AP trampoline code */ + add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE); +#endif + + /* kernel code + data + bss */ + add_mac_region(virt_to_phys(_text), _end - _text); + + tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address; + + return 0; +} + +#else /* no CONFIG_ACPI_SLEEP */ + +static int tboot_setup_sleep(void) +{ + /* S3 shutdown requested, but S3 not supported by the kernel... 
*/ + BUG(); + return -1; +} + +#endif + +void tboot_shutdown(u32 shutdown_type) +{ + void (*shutdown)(void); + + if (!tboot_enabled()) + return; + + /* + * if we're being called before the 1:1 mapping is set up then just + * return and let the normal shutdown happen; this should only be + * due to very early panic() + */ + if (!tboot_pg_dir) + return; + + /* if this is S3 then set regions to MAC */ + if (shutdown_type == TB_SHUTDOWN_S3) + if (tboot_setup_sleep()) + return; + + tboot->shutdown_type = shutdown_type; + + switch_to_tboot_pt(); + + shutdown = (void(*)(void))(unsigned long)tboot->shutdown_entry; + shutdown(); + + /* should not reach here */ + while (1) + halt(); +} + +static void tboot_copy_fadt(const struct acpi_table_fadt *fadt) +{ +#define TB_COPY_GAS(tbg, g) \ + tbg.space_id = g.space_id; \ + tbg.bit_width = g.bit_width; \ + tbg.bit_offset = g.bit_offset; \ + tbg.access_width = g.access_width; \ + tbg.address = g.address; + + TB_COPY_GAS(tboot->acpi_sinfo.pm1a_cnt_blk, fadt->xpm1a_control_block); + TB_COPY_GAS(tboot->acpi_sinfo.pm1b_cnt_blk, fadt->xpm1b_control_block); + TB_COPY_GAS(tboot->acpi_sinfo.pm1a_evt_blk, fadt->xpm1a_event_block); + TB_COPY_GAS(tboot->acpi_sinfo.pm1b_evt_blk, fadt->xpm1b_event_block); + + /* + * We need phys addr of waking vector, but can't use virt_to_phys() on + * &acpi_gbl_FACS because it is ioremap'ed, so calc from FACS phys + * addr. + */ + tboot->acpi_sinfo.wakeup_vector = fadt->facs + + offsetof(struct acpi_table_facs, firmware_waking_vector); +} + +void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control) +{ + static u32 acpi_shutdown_map[ACPI_S_STATE_COUNT] = { + /* S0,1,2: */ -1, -1, -1, + /* S3: */ TB_SHUTDOWN_S3, + /* S4: */ TB_SHUTDOWN_S4, + /* S5: */ TB_SHUTDOWN_S5 }; + + if (!tboot_enabled()) + return; + + tboot_copy_fadt(&acpi_gbl_FADT); + tboot->acpi_sinfo.pm1a_cnt_val = pm1a_control; + tboot->acpi_sinfo.pm1b_cnt_val = pm1b_control; + /* we always use the 32b wakeup vector */ + tboot->acpi_sinfo.vector_width = 32; + + if (sleep_state >= ACPI_S_STATE_COUNT || + acpi_shutdown_map[sleep_state] == -1) { + pr_warning("unsupported sleep state 0x%x\n", sleep_state); + return; + } + + tboot_shutdown(acpi_shutdown_map[sleep_state]); +} + +static atomic_t ap_wfs_count; + +static int tboot_wait_for_aps(int num_aps) +{ + unsigned long timeout; + + timeout = AP_WAIT_TIMEOUT*HZ; + while (atomic_read((atomic_t *)&tboot->num_in_wfs) != num_aps && + timeout) { + mdelay(1); + timeout--; + } + + if (timeout) + pr_warning("tboot wait for APs timeout\n"); + + return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps); +} + +static int __cpuinit tboot_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action) { + case CPU_DYING: + atomic_inc(&ap_wfs_count); + if (num_online_cpus() == 1) + if (tboot_wait_for_aps(atomic_read(&ap_wfs_count))) + return NOTIFY_BAD; + break; + } + return NOTIFY_OK; +} + +static struct notifier_block tboot_cpu_notifier __cpuinitdata = +{ + .notifier_call = tboot_cpu_callback, +}; + +static __init int tboot_late_init(void) +{ + if (!tboot_enabled()) + return 0; + + tboot_create_trampoline(); + + atomic_set(&ap_wfs_count, 0); + register_hotcpu_notifier(&tboot_cpu_notifier); + return 0; +} + +late_initcall(tboot_late_init); + +/* + * TXT configuration registers (offsets from TXT_{PUB, PRIV}_CONFIG_REGS_BASE) + */ + +#define TXT_PUB_CONFIG_REGS_BASE 0xfed30000 +#define TXT_PRIV_CONFIG_REGS_BASE 0xfed20000 + +/* # pages for each config regs space - used by fixmap */ +#define 
NR_TXT_CONFIG_PAGES ((TXT_PUB_CONFIG_REGS_BASE - \ + TXT_PRIV_CONFIG_REGS_BASE) >> PAGE_SHIFT) + +/* offsets from pub/priv config space */ +#define TXTCR_HEAP_BASE 0x0300 +#define TXTCR_HEAP_SIZE 0x0308 + +#define SHA1_SIZE 20 + +struct sha1_hash { + u8 hash[SHA1_SIZE]; +}; + +struct sinit_mle_data { + u32 version; /* currently 6 */ + struct sha1_hash bios_acm_id; + u32 edx_senter_flags; + u64 mseg_valid; + struct sha1_hash sinit_hash; + struct sha1_hash mle_hash; + struct sha1_hash stm_hash; + struct sha1_hash lcp_policy_hash; + u32 lcp_policy_control; + u32 rlp_wakeup_addr; + u32 reserved; + u32 num_mdrs; + u32 mdrs_off; + u32 num_vtd_dmars; + u32 vtd_dmars_off; +} __packed; + +struct acpi_table_header *tboot_get_dmar_table(struct acpi_table_header *dmar_tbl) +{ + void *heap_base, *heap_ptr, *config; + + if (!tboot_enabled()) + return dmar_tbl; + + /* + * ACPI tables may not be DMA protected by tboot, so use DMAR copy + * SINIT saved in SinitMleData in TXT heap (which is DMA protected) + */ + + /* map config space in order to get heap addr */ + config = ioremap(TXT_PUB_CONFIG_REGS_BASE, NR_TXT_CONFIG_PAGES * + PAGE_SIZE); + if (!config) + return NULL; + + /* now map TXT heap */ + heap_base = ioremap(*(u64 *)(config + TXTCR_HEAP_BASE), + *(u64 *)(config + TXTCR_HEAP_SIZE)); + iounmap(config); + if (!heap_base) + return NULL; + + /* walk heap to SinitMleData */ + /* skip BiosData */ + heap_ptr = heap_base + *(u64 *)heap_base; + /* skip OsMleData */ + heap_ptr += *(u64 *)heap_ptr; + /* skip OsSinitData */ + heap_ptr += *(u64 *)heap_ptr; + /* now points to SinitMleDataSize; set to SinitMleData */ + heap_ptr += sizeof(u64); + /* get addr of DMAR table */ + dmar_tbl = (struct acpi_table_header *)(heap_ptr + + ((struct sinit_mle_data *)heap_ptr)->vtd_dmars_off - + sizeof(u64)); + + /* don't unmap heap because dmar.c needs access to this */ + + return dmar_tbl; +} + +int tboot_force_iommu(void) +{ + if (!tboot_enabled()) + return 0; + + if (no_iommu || swiotlb || dmar_disabled) + pr_warning("Forcing Intel-IOMMU to enabled\n"); + + dmar_disabled = 0; +#ifdef CONFIG_SWIOTLB + swiotlb = 0; +#endif + no_iommu = 0; + + return 1; +} diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c new file mode 100644 index 000000000000..be2573448ed9 --- /dev/null +++ b/arch/x86/kernel/time.c @@ -0,0 +1,121 @@ +/* + * Copyright (c) 1991,1992,1995 Linus Torvalds + * Copyright (c) 1994 Alan Modra + * Copyright (c) 1995 Markus Kuhn + * Copyright (c) 1996 Ingo Molnar + * Copyright (c) 1998 Andrea Arcangeli + * Copyright (c) 2002,2006 Vojtech Pavlik + * Copyright (c) 2003 Andi Kleen + * + */ + +#include <linux/clockchips.h> +#include <linux/interrupt.h> +#include <linux/time.h> +#include <linux/mca.h> + +#include <asm/vsyscall.h> +#include <asm/x86_init.h> +#include <asm/i8259.h> +#include <asm/i8253.h> +#include <asm/timer.h> +#include <asm/hpet.h> +#include <asm/time.h> + +#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) +int timer_ack; +#endif + +#ifdef CONFIG_X86_64 +volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; +#endif + +unsigned long profile_pc(struct pt_regs *regs) +{ + unsigned long pc = instruction_pointer(regs); + + if (!user_mode_vm(regs) && in_lock_functions(pc)) { +#ifdef CONFIG_FRAME_POINTER + return *(unsigned long *)(regs->bp + sizeof(long)); +#else + unsigned long *sp = + (unsigned long *)kernel_stack_pointer(regs); + /* + * Return address is either directly at stack pointer + * or above a saved flags. Eflags has bits 22-31 zero, + * kernel addresses don't. 
+ */ + if (sp[0] >> 22) + return sp[0]; + if (sp[1] >> 22) + return sp[1]; +#endif + } + return pc; +} +EXPORT_SYMBOL(profile_pc); + +/* + * Default timer interrupt handler for PIT/HPET + */ +static irqreturn_t timer_interrupt(int irq, void *dev_id) +{ + /* Keep nmi watchdog up to date */ + inc_irq_stat(irq0_irqs); + + /* Optimized out for !IO_APIC and x86_64 */ + if (timer_ack) { + /* + * Subtle, when I/O APICs are used we have to ack timer IRQ + * manually to deassert NMI lines for the watchdog if run + * on an 82489DX-based system. + */ + spin_lock(&i8259A_lock); + outb(0x0c, PIC_MASTER_OCW3); + /* Ack the IRQ; AEOI will end it automatically. */ + inb(PIC_MASTER_POLL); + spin_unlock(&i8259A_lock); + } + + global_clock_event->event_handler(global_clock_event); + + /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ + if (MCA_bus) + outb_p(inb_p(0x61)| 0x80, 0x61); + + return IRQ_HANDLED; +} + +static struct irqaction irq0 = { + .handler = timer_interrupt, + .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, + .name = "timer" +}; + +void __init setup_default_timer_irq(void) +{ + setup_irq(0, &irq0); +} + +/* Default timer init function */ +void __init hpet_time_init(void) +{ + if (!hpet_enable()) + setup_pit_timer(); + setup_default_timer_irq(); +} + +static __init void x86_late_time_init(void) +{ + x86_init.timers.timer_init(); + tsc_init(); +} + +/* + * Initialize TSC and delay the periodic timer init to + * late x86_late_time_init() so ioremap works. + */ +void __init time_init(void) +{ + late_time_init = x86_late_time_init; +} diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c deleted file mode 100644 index 5c5d87f0b2e1..000000000000 --- a/arch/x86/kernel/time_32.c +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (C) 1991, 1992, 1995 Linus Torvalds - * - * This file contains the PC-specific time handling details: - * reading the RTC at bootup, etc.. - * 1994-07-02 Alan Modra - * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime - * 1995-03-26 Markus Kuhn - * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887 - * precision CMOS clock update - * 1996-05-03 Ingo Molnar - * fixed time warps in do_[slow|fast]_gettimeoffset() - * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 - * "A Kernel Model for Precision Timekeeping" by Dave Mills - * 1998-09-05 (Various) - * More robust do_fast_gettimeoffset() algorithm implemented - * (works with APM, Cyrix 6x86MX and Centaur C6), - * monotonic gettimeofday() with fast_get_timeoffset(), - * drift-proof precision TSC calibration on boot - * (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D. - * Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>; - * ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>). - * 1998-12-16 Andrea Arcangeli - * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy - * because was not accounting lost_ticks. - * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli - * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to - * serialize accesses to xtime/lost_ticks). 
- */ - -#include <linux/init.h> -#include <linux/interrupt.h> -#include <linux/time.h> -#include <linux/mca.h> - -#include <asm/setup.h> -#include <asm/hpet.h> -#include <asm/time.h> -#include <asm/timer.h> - -#include <asm/do_timer.h> - -int timer_ack; - -unsigned long profile_pc(struct pt_regs *regs) -{ - unsigned long pc = instruction_pointer(regs); - -#ifdef CONFIG_SMP - if (!user_mode_vm(regs) && in_lock_functions(pc)) { -#ifdef CONFIG_FRAME_POINTER - return *(unsigned long *)(regs->bp + sizeof(long)); -#else - unsigned long *sp = (unsigned long *)&regs->sp; - - /* Return address is either directly at stack pointer - or above a saved flags. Eflags has bits 22-31 zero, - kernel addresses don't. */ - if (sp[0] >> 22) - return sp[0]; - if (sp[1] >> 22) - return sp[1]; -#endif - } -#endif - return pc; -} -EXPORT_SYMBOL(profile_pc); - -/* - * This is the same as the above, except we _also_ save the current - * Time Stamp Counter value at the time of the timer interrupt, so that - * we later on can estimate the time of day more exactly. - */ -irqreturn_t timer_interrupt(int irq, void *dev_id) -{ - /* Keep nmi watchdog up to date */ - inc_irq_stat(irq0_irqs); - -#ifdef CONFIG_X86_IO_APIC - if (timer_ack) { - /* - * Subtle, when I/O APICs are used we have to ack timer IRQ - * manually to deassert NMI lines for the watchdog if run - * on an 82489DX-based system. - */ - spin_lock(&i8259A_lock); - outb(0x0c, PIC_MASTER_OCW3); - /* Ack the IRQ; AEOI will end it automatically. */ - inb(PIC_MASTER_POLL); - spin_unlock(&i8259A_lock); - } -#endif - - do_timer_interrupt_hook(); - -#ifdef CONFIG_MCA - if (MCA_bus) { - /* The PS/2 uses level-triggered interrupts. You can't - turn them off, nor would you want to (any attempt to - enable edge-triggered interrupts usually gets intercepted by a - special hardware circuit). Hence we have to acknowledge - the timer interrupt. Through some incredibly stupid - design idea, the reset for IRQ 0 is done by setting the - high bit of the PPI port B (0x61). Note that some PS/2s, - notably the 55SX, work fine if this is removed. */ - - u8 irq_v = inb_p(0x61); /* read the current state */ - outb_p(irq_v | 0x80, 0x61); /* reset the IRQ */ - } -#endif - - return IRQ_HANDLED; -} - -/* Duplicate of time_init() below, with hpet_enable part added */ -void __init hpet_time_init(void) -{ - if (!hpet_enable()) - setup_pit_timer(); - x86_quirk_time_init(); -} - -/* - * This is called directly from init code; we must delay timer setup in the - * HPET case as we can't make the decision to turn on HPET this early in the - * boot process. - * - * The chosen time_init function will usually be hpet_time_init, above, but - * in the case of virtual hardware, an alternative function may be substituted. - */ -void __init time_init(void) -{ - x86_quirk_pre_time_init(); - tsc_init(); - late_time_init = choose_time_init(); -} diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c deleted file mode 100644 index 5ba343e61844..000000000000 --- a/arch/x86/kernel/time_64.c +++ /dev/null @@ -1,135 +0,0 @@ -/* - * "High Precision Event Timer" based timekeeping.
- * - * Copyright (c) 1991,1992,1995 Linus Torvalds - * Copyright (c) 1994 Alan Modra - * Copyright (c) 1995 Markus Kuhn - * Copyright (c) 1996 Ingo Molnar - * Copyright (c) 1998 Andrea Arcangeli - * Copyright (c) 2002,2006 Vojtech Pavlik - * Copyright (c) 2003 Andi Kleen - * RTC support code taken from arch/i386/kernel/timers/time_hpet.c - */ - -#include <linux/clockchips.h> -#include <linux/init.h> -#include <linux/interrupt.h> -#include <linux/module.h> -#include <linux/time.h> -#include <linux/mca.h> -#include <linux/nmi.h> - -#include <asm/i8253.h> -#include <asm/hpet.h> -#include <asm/vgtod.h> -#include <asm/time.h> -#include <asm/timer.h> - -volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; - -unsigned long profile_pc(struct pt_regs *regs) -{ - unsigned long pc = instruction_pointer(regs); - - /* Assume the lock function has either no stack frame or a copy - of flags from PUSHF - Eflags always has bits 22 and up cleared unlike kernel addresses. */ - if (!user_mode_vm(regs) && in_lock_functions(pc)) { -#ifdef CONFIG_FRAME_POINTER - return *(unsigned long *)(regs->bp + sizeof(long)); -#else - unsigned long *sp = (unsigned long *)regs->sp; - if (sp[0] >> 22) - return sp[0]; - if (sp[1] >> 22) - return sp[1]; -#endif - } - return pc; -} -EXPORT_SYMBOL(profile_pc); - -static irqreturn_t timer_interrupt(int irq, void *dev_id) -{ - inc_irq_stat(irq0_irqs); - - global_clock_event->event_handler(global_clock_event); - -#ifdef CONFIG_MCA - if (MCA_bus) { - u8 irq_v = inb_p(0x61); /* read the current state */ - outb_p(irq_v|0x80, 0x61); /* reset the IRQ */ - } -#endif - - return IRQ_HANDLED; -} - -/* calibrate_cpu is used on systems with fixed rate TSCs to determine - * processor frequency */ -#define TICK_COUNT 100000000 -unsigned long __init calibrate_cpu(void) -{ - int tsc_start, tsc_now; - int i, no_ctr_free; - unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; - unsigned long flags; - - for (i = 0; i < 4; i++) - if (avail_to_resrv_perfctr_nmi_bit(i)) - break; - no_ctr_free = (i == 4); - if (no_ctr_free) { - WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... 
" - "cpu_khz value may be incorrect.\n"); - i = 3; - rdmsrl(MSR_K7_EVNTSEL3, evntsel3); - wrmsrl(MSR_K7_EVNTSEL3, 0); - rdmsrl(MSR_K7_PERFCTR3, pmc3); - } else { - reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); - reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } - local_irq_save(flags); - /* start measuring cycles, incrementing from 0 */ - wrmsrl(MSR_K7_PERFCTR0 + i, 0); - wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); - rdtscl(tsc_start); - do { - rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); - tsc_now = get_cycles(); - } while ((tsc_now - tsc_start) < TICK_COUNT); - - local_irq_restore(flags); - if (no_ctr_free) { - wrmsrl(MSR_K7_EVNTSEL3, 0); - wrmsrl(MSR_K7_PERFCTR3, pmc3); - wrmsrl(MSR_K7_EVNTSEL3, evntsel3); - } else { - release_perfctr_nmi(MSR_K7_PERFCTR0 + i); - release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } - - return pmc_now * tsc_khz / (tsc_now - tsc_start); -} - -static struct irqaction irq0 = { - .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING | IRQF_TIMER, - .name = "timer" -}; - -void __init hpet_time_init(void) -{ - if (!hpet_enable()) - setup_pit_timer(); - - setup_irq(0, &irq0); -} - -void __init time_init(void) -{ - tsc_init(); - - late_time_init = choose_time_init(); -} diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 503c1f2e8835..1740c85e24bb 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -23,8 +23,6 @@ static struct bau_control **uv_bau_table_bases __read_mostly; static int uv_bau_retry_limit __read_mostly; -/* position of pnode (which is nasid>>1): */ -static int uv_nshift __read_mostly; /* base pnode in this partition */ static int uv_partition_base_pnode __read_mostly; @@ -723,7 +721,7 @@ uv_activation_descriptor_init(int node, int pnode) BUG_ON(!adp); pa = uv_gpa(adp); /* need the real nasid*/ - n = pa >> uv_nshift; + n = uv_gpa_to_pnode(pa); m = pa & uv_mmask; uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, @@ -778,7 +776,7 @@ uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp) * need the pnode of where the memory was really allocated */ pa = uv_gpa(pqp); - pn = pa >> uv_nshift; + pn = uv_gpa_to_pnode(pa); uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | @@ -843,8 +841,7 @@ static int __init uv_bau_init(void) GFP_KERNEL, cpu_to_node(cur_cpu)); uv_bau_retry_limit = 1; - uv_nshift = uv_hub_info->n_val; - uv_mmask = (1UL << uv_hub_info->n_val) - 1; + uv_mmask = (1UL << uv_hub_info->m_val) - 1; nblades = uv_num_possible_blades(); uv_bau_table_bases = (struct bau_control **) diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index 808031a5ba19..cd022121cab6 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -3,8 +3,16 @@ #include <asm/trampoline.h> #include <asm/e820.h> +#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP) +#define __trampinit +#define __trampinitdata +#else +#define __trampinit __cpuinit +#define __trampinitdata __cpuinitdata +#endif + /* ready for x86_64 and x86 */ -unsigned char *trampoline_base = __va(TRAMPOLINE_BASE); +unsigned char *__trampinitdata trampoline_base = __va(TRAMPOLINE_BASE); void __init reserve_trampoline_memory(void) { @@ -26,7 +34,7 @@ void __init reserve_trampoline_memory(void) * bootstrap into the page concerned. The caller * has made sure it's suitably aligned. 
*/ -unsigned long setup_trampoline(void) +unsigned long __trampinit setup_trampoline(void) { memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); return virt_to_phys(trampoline_base); diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S index 66d874e5404c..8508237e8e43 100644 --- a/arch/x86/kernel/trampoline_32.S +++ b/arch/x86/kernel/trampoline_32.S @@ -28,16 +28,12 @@ */ #include <linux/linkage.h> +#include <linux/init.h> #include <asm/segment.h> #include <asm/page_types.h> /* We can free up trampoline after bootup if cpu hotplug is not supported. */ -#ifndef CONFIG_HOTPLUG_CPU -.section ".cpuinit.data","aw",@progbits -#else -.section .rodata,"a",@progbits -#endif - +__CPUINITRODATA .code16 ENTRY(trampoline_data) diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S index cddfb8d386b9..3af2dff58b21 100644 --- a/arch/x86/kernel/trampoline_64.S +++ b/arch/x86/kernel/trampoline_64.S @@ -25,14 +25,19 @@ */ #include <linux/linkage.h> +#include <linux/init.h> #include <asm/pgtable_types.h> #include <asm/page_types.h> #include <asm/msr.h> #include <asm/segment.h> #include <asm/processor-flags.h> +#ifdef CONFIG_ACPI_SLEEP .section .rodata, "a", @progbits - +#else +/* We can free up the trampoline after bootup if cpu hotplug is not supported. */ +__CPUINITRODATA +#endif .code16 ENTRY(trampoline_data) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 83264922a878..33399176512a 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -14,7 +14,6 @@ #include <linux/spinlock.h> #include <linux/kprobes.h> #include <linux/uaccess.h> -#include <linux/utsname.h> #include <linux/kdebug.h> #include <linux/kernel.h> #include <linux/module.h> @@ -59,12 +58,12 @@ #include <asm/mach_traps.h> #ifdef CONFIG_X86_64 +#include <asm/x86_init.h> #include <asm/pgalloc.h> #include <asm/proto.h> #else #include <asm/processor-flags.h> #include <asm/setup.h> -#include <asm/traps.h> asmlinkage int system_call(void); @@ -73,11 +72,9 @@ char ignore_fpu_irq; /* * The IDT has to be page-aligned to simplify the Pentium - * F0 0F bug workaround.. We have a special link segment - * for this. + * F0 0F bug workaround. */ -gate_desc idt_table[NR_VECTORS] - __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; +gate_desc idt_table[NR_VECTORS] __page_aligned_data = { { { { 0, 0 } } }, }; #endif DECLARE_BITMAP(used_vectors, NR_VECTORS); @@ -532,77 +529,56 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; - unsigned long condition; + unsigned long dr6; int si_code; - get_debugreg(condition, 6); + get_debugreg(dr6, 6); /* Catch kmemcheck conditions first of all! */ - if (condition & DR_STEP && kmemcheck_trap(regs)) + if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) return; + /* DR6 may or may not be cleared by the CPU */ + set_debugreg(0, 6); /* * The processor cleared BTF, so don't mark that we need it set. 
*/ clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); tsk->thread.debugctlmsr = 0; - if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, - SIGTRAP) == NOTIFY_STOP) + /* Store the virtualized DR6 value */ + tsk->thread.debugreg6 = dr6; + + if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code, + SIGTRAP) == NOTIFY_STOP) return; /* It's safe to allow irq's after DR6 has been saved */ preempt_conditional_sti(regs); - /* Mask out spurious debug traps due to lazy DR7 setting */ - if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { - if (!tsk->thread.debugreg7) - goto clear_dr7; + if (regs->flags & X86_VM_MASK) { + handle_vm86_trap((struct kernel_vm86_regs *) regs, + error_code, 1); + return; } -#ifdef CONFIG_X86_32 - if (regs->flags & X86_VM_MASK) - goto debug_vm86; -#endif - - /* Save debug status register where ptrace can see it */ - tsk->thread.debugreg6 = condition; - /* - * Single-stepping through TF: make sure we ignore any events in - * kernel space (but re-enable TF when returning to user mode). + * Single-stepping through system calls: ignore any exceptions in + * kernel space, but re-enable TF when returning to user mode. + * + * We already checked v86 mode above, so we can check for kernel mode + * by just checking the CPL of CS. */ - if (condition & DR_STEP) { - if (!user_mode(regs)) - goto clear_TF_reenable; + if ((dr6 & DR_STEP) && !user_mode(regs)) { + tsk->thread.debugreg6 &= ~DR_STEP; + set_tsk_thread_flag(tsk, TIF_SINGLESTEP); + regs->flags &= ~X86_EFLAGS_TF; } - - si_code = get_si_code(condition); - /* Ok, finally something we can handle */ - send_sigtrap(tsk, regs, error_code, si_code); - - /* - * Disable additional traps. They'll be re-enabled when - * the signal is delivered. - */ -clear_dr7: - set_debugreg(0, 7); + si_code = get_si_code(tsk->thread.debugreg6); + if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS)) + send_sigtrap(tsk, regs, error_code, si_code); preempt_conditional_cli(regs); - return; -#ifdef CONFIG_X86_32 -debug_vm86: - /* reenable preemption: handle_vm86_trap() might sleep */ - dec_preempt_count(); - handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); - conditional_cli(regs); - return; -#endif - -clear_TF_reenable: - set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - regs->flags &= ~X86_EFLAGS_TF; - preempt_conditional_cli(regs); return; } @@ -972,7 +948,5 @@ void __init trap_init(void) */ cpu_init(); -#ifdef CONFIG_X86_32 - x86_quirk_trap_init(); -#endif + x86_init.irqs.trap_init(); } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 71f4368b357e..cd982f48e23e 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -17,6 +17,8 @@ #include <asm/time.h> #include <asm/delay.h> #include <asm/hypervisor.h> +#include <asm/nmi.h> +#include <asm/x86_init.h> unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -400,15 +402,9 @@ unsigned long native_calibrate_tsc(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; - unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz; + unsigned long flags, latch, ms, fast_calibrate; int hpet = is_hpet_enabled(), i, loopmin; - hv_tsc_khz = get_hypervisor_tsc_freq(); - if (hv_tsc_khz) { - printk(KERN_INFO "TSC: Frequency read from the hypervisor\n"); - return hv_tsc_khz; - } - local_irq_save(flags); fast_calibrate = quick_pit_calibrate(); local_irq_restore(flags); @@ -566,7 +562,7 @@ int recalibrate_cpu_khz(void) unsigned long cpu_khz_old = cpu_khz; if 
(cpu_has_tsc) { - tsc_khz = calibrate_tsc(); + tsc_khz = x86_platform.calibrate_tsc(); cpu_khz = tsc_khz; cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy, @@ -670,7 +666,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || (val == CPUFREQ_RESUMECHANGE)) { - *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); + *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); if (!(freq->flags & CPUFREQ_CONST_LOOPS)) @@ -744,10 +740,16 @@ static cycle_t __vsyscall_fn vread_tsc(void) } #endif +static void resume_tsc(void) +{ + clocksource_tsc.cycle_last = 0; +} + static struct clocksource clocksource_tsc = { .name = "tsc", .rating = 300, .read = read_tsc, + .resume = resume_tsc, .mask = CLOCKSOURCE_MASK(64), .shift = 22, .flags = CLOCK_SOURCE_IS_CONTINUOUS | @@ -761,12 +763,14 @@ void mark_tsc_unstable(char *reason) { if (!tsc_unstable) { tsc_unstable = 1; - printk("Marking TSC unstable due to %s\n", reason); + printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); /* Change only the rating, when not registered */ if (clocksource_tsc.mult) - clocksource_change_rating(&clocksource_tsc, 0); - else + clocksource_mark_unstable(&clocksource_tsc); + else { + clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE; clocksource_tsc.rating = 0; + } } } @@ -852,15 +856,71 @@ static void __init init_tsc_clocksource(void) clocksource_register(&clocksource_tsc); } +#ifdef CONFIG_X86_64 +/* + * calibrate_cpu is used on systems with fixed rate TSCs to determine + * processor frequency + */ +#define TICK_COUNT 100000000 +static unsigned long __init calibrate_cpu(void) +{ + int tsc_start, tsc_now; + int i, no_ctr_free; + unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; + unsigned long flags; + + for (i = 0; i < 4; i++) + if (avail_to_resrv_perfctr_nmi_bit(i)) + break; + no_ctr_free = (i == 4); + if (no_ctr_free) { + WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... 
" + "cpu_khz value may be incorrect.\n"); + i = 3; + rdmsrl(MSR_K7_EVNTSEL3, evntsel3); + wrmsrl(MSR_K7_EVNTSEL3, 0); + rdmsrl(MSR_K7_PERFCTR3, pmc3); + } else { + reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); + reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } + local_irq_save(flags); + /* start measuring cycles, incrementing from 0 */ + wrmsrl(MSR_K7_PERFCTR0 + i, 0); + wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); + rdtscl(tsc_start); + do { + rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); + tsc_now = get_cycles(); + } while ((tsc_now - tsc_start) < TICK_COUNT); + + local_irq_restore(flags); + if (no_ctr_free) { + wrmsrl(MSR_K7_EVNTSEL3, 0); + wrmsrl(MSR_K7_PERFCTR3, pmc3); + wrmsrl(MSR_K7_EVNTSEL3, evntsel3); + } else { + release_perfctr_nmi(MSR_K7_PERFCTR0 + i); + release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } + + return pmc_now * tsc_khz / (tsc_now - tsc_start); +} +#else +static inline unsigned long calibrate_cpu(void) { return cpu_khz; } +#endif + void __init tsc_init(void) { u64 lpj; int cpu; + x86_init.timers.tsc_pre_init(); + if (!cpu_has_tsc) return; - tsc_khz = calibrate_tsc(); + tsc_khz = x86_platform.calibrate_tsc(); cpu_khz = tsc_khz; if (!tsc_khz) { @@ -868,11 +928,9 @@ void __init tsc_init(void) return; } -#ifdef CONFIG_X86_64 if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) cpu_khz = calibrate_cpu(); -#endif printk("Detected %lu.%03lu MHz processor.\n", (unsigned long)cpu_khz / 1000, diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 027b5b498993..f37930954d15 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -114,7 +114,7 @@ void __cpuinit check_tsc_sync_source(int cpu) return; if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { - pr_info("Skipping synchronization checks as TSC is reliable.\n"); + printk_once(KERN_INFO "Skipping synchronization checks as TSC is reliable.\n"); return; } diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c index aeef529917e4..61d805df4c91 100644 --- a/arch/x86/kernel/uv_irq.c +++ b/arch/x86/kernel/uv_irq.c @@ -9,10 +9,25 @@ */ #include <linux/module.h> +#include <linux/rbtree.h> #include <linux/irq.h> #include <asm/apic.h> #include <asm/uv/uv_irq.h> +#include <asm/uv/uv_hub.h> + +/* MMR offset and pnode of hub sourcing interrupts for a given irq */ +struct uv_irq_2_mmr_pnode{ + struct rb_node list; + unsigned long offset; + int pnode; + int irq; +}; + +static spinlock_t uv_irq_lock; +static struct rb_root uv_irq_root; + +static int uv_set_irq_affinity(unsigned int, const struct cpumask *); static void uv_noop(unsigned int irq) { @@ -39,25 +54,214 @@ struct irq_chip uv_irq_chip = { .unmask = uv_noop, .eoi = uv_ack_apic, .end = uv_noop, + .set_affinity = uv_set_irq_affinity, }; /* + * Add offset and pnode information of the hub sourcing interrupts to the + * rb tree for a specific irq. 
+ */ +static int uv_set_irq_2_mmr_info(int irq, unsigned long offset, unsigned blade) +{ + struct rb_node **link = &uv_irq_root.rb_node; + struct rb_node *parent = NULL; + struct uv_irq_2_mmr_pnode *n; + struct uv_irq_2_mmr_pnode *e; + unsigned long irqflags; + + n = kmalloc_node(sizeof(struct uv_irq_2_mmr_pnode), GFP_KERNEL, + uv_blade_to_memory_nid(blade)); + if (!n) + return -ENOMEM; + + n->irq = irq; + n->offset = offset; + n->pnode = uv_blade_to_pnode(blade); + spin_lock_irqsave(&uv_irq_lock, irqflags); + /* Find the right place in the rbtree: */ + while (*link) { + parent = *link; + e = rb_entry(parent, struct uv_irq_2_mmr_pnode, list); + + if (unlikely(irq == e->irq)) { + /* irq entry exists */ + e->pnode = uv_blade_to_pnode(blade); + e->offset = offset; + spin_unlock_irqrestore(&uv_irq_lock, irqflags); + kfree(n); + return 0; + } + + if (irq < e->irq) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + + /* Insert the node into the rbtree. */ + rb_link_node(&n->list, parent, link); + rb_insert_color(&n->list, &uv_irq_root); + + spin_unlock_irqrestore(&uv_irq_lock, irqflags); + return 0; +} + +/* Retrieve offset and pnode information from the rb tree for a specific irq */ +int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode) +{ + struct uv_irq_2_mmr_pnode *e; + struct rb_node *n; + unsigned long irqflags; + + spin_lock_irqsave(&uv_irq_lock, irqflags); + n = uv_irq_root.rb_node; + while (n) { + e = rb_entry(n, struct uv_irq_2_mmr_pnode, list); + + if (e->irq == irq) { + *offset = e->offset; + *pnode = e->pnode; + spin_unlock_irqrestore(&uv_irq_lock, irqflags); + return 0; + } + + if (irq < e->irq) + n = n->rb_left; + else + n = n->rb_right; + } + spin_unlock_irqrestore(&uv_irq_lock, irqflags); + return -1; +} + +/* + * Re-target the irq to the specified CPU and enable the specified MMR located + * on the specified blade to allow the sending of MSIs to the specified CPU. + */ +static int +arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, + unsigned long mmr_offset, int restrict) +{ + const struct cpumask *eligible_cpu = cpumask_of(cpu); + struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg; + int mmr_pnode; + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + int err; + + BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != + sizeof(unsigned long)); + + cfg = irq_cfg(irq); + + err = assign_irq_vector(irq, cfg, eligible_cpu); + if (err != 0) + return err; + + if (restrict == UV_AFFINITY_CPU) + desc->status |= IRQ_NO_BALANCING; + else + desc->status |= IRQ_MOVE_PCNTXT; + + set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, + irq_name); + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + entry->vector = cfg->vector; + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->polarity = 0; + entry->trigger = 0; + entry->mask = 0; + entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); + + mmr_pnode = uv_blade_to_pnode(mmr_blade); + uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); + + if (cfg->move_in_progress) + send_cleanup_vector(cfg); + + return irq; +} + +/* + * Disable the specified MMR located on the specified blade so that MSIs are + * longer allowed to be sent. 
+ */ +static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset) +{ + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + + BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != + sizeof(unsigned long)); + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + entry->mask = 1; + + uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); +} + +static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg = desc->chip_data; + unsigned int dest; + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + unsigned long mmr_offset; + unsigned mmr_pnode; + + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) + return -1; + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + + entry->vector = cfg->vector; + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->polarity = 0; + entry->trigger = 0; + entry->mask = 0; + entry->dest = dest; + + /* Get previously stored MMR and pnode of hub sourcing interrupts */ + if (uv_irq_2_mmr_info(irq, &mmr_offset, &mmr_pnode)) + return -1; + + uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); + + if (cfg->move_in_progress) + send_cleanup_vector(cfg); + + return 0; +} + +/* * Set up a mapping of an available irq and vector, and enable the specified * MMR that defines the MSI that is to be sent to the specified CPU when an * interrupt is raised. */ int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, - unsigned long mmr_offset) + unsigned long mmr_offset, int restrict) { - int irq; - int ret; + int irq, ret; + + irq = create_irq_nr(NR_IRQS_LEGACY, uv_blade_to_memory_nid(mmr_blade)); - irq = create_irq(); if (irq <= 0) return -EBUSY; - ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset); - if (ret != irq) + ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset, + restrict); + if (ret == irq) + uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade); + else destroy_irq(irq); return ret; @@ -71,9 +275,28 @@ EXPORT_SYMBOL_GPL(uv_setup_irq); * * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq(). 
*/ -void uv_teardown_irq(unsigned int irq, int mmr_blade, unsigned long mmr_offset) +void uv_teardown_irq(unsigned int irq) { - arch_disable_uv_irq(mmr_blade, mmr_offset); + struct uv_irq_2_mmr_pnode *e; + struct rb_node *n; + unsigned long irqflags; + + spin_lock_irqsave(&uv_irq_lock, irqflags); + n = uv_irq_root.rb_node; + while (n) { + e = rb_entry(n, struct uv_irq_2_mmr_pnode, list); + if (e->irq == irq) { + arch_disable_uv_irq(e->pnode, e->offset); + rb_erase(n, &uv_irq_root); + kfree(e); + break; + } + if (irq < e->irq) + n = n->rb_left; + else + n = n->rb_right; + } + spin_unlock_irqrestore(&uv_irq_lock, irqflags); destroy_irq(irq); } EXPORT_SYMBOL_GPL(uv_teardown_irq); diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 31ffc24eec4d..abda6f53e71e 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -30,6 +30,7 @@ #include <asm/setup.h> #include <asm/apic.h> #include <asm/e820.h> +#include <asm/time.h> #include <asm/io.h> #include <linux/kernel_stat.h> @@ -53,7 +54,7 @@ int is_visws_box(void) return visws_board_type >= 0; } -static int __init visws_time_init(void) +static void __init visws_time_init(void) { printk(KERN_INFO "Starting Cobalt Timer system clock\n"); @@ -66,21 +67,13 @@ static int __init visws_time_init(void) /* Enable (unmask) the timer interrupt */ co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK); - /* - * Zero return means the generic timer setup code will set up - * the standard vector: - */ - return 0; + setup_default_timer_irq(); } -static int __init visws_pre_intr_init(void) +/* Replaces the default init_ISA_irqs in the generic setup */ +static void __init visws_pre_intr_init(void) { init_VISWS_APIC_irqs(); - - /* - * We dont want ISA irqs to be set up by the generic code: - */ - return 1; } /* Quirk for machine specific memory setup. 
*/ @@ -156,12 +149,8 @@ static void visws_machine_power_off(void) outl(PIIX_SPECIAL_STOP, 0xCFC); } -static int __init visws_get_smp_config(unsigned int early) +static void __init visws_get_smp_config(unsigned int early) { - /* - * Prevent MP-table parsing by the generic code: - */ - return 1; } /* @@ -194,7 +183,7 @@ static void __init MP_processor_info(struct mpc_cpu *m) return; } - apic_cpus = apic->apicid_to_cpu_present(m->apicid); + apic->apicid_to_cpu_present(m->apicid, &apic_cpus); physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus); /* * Validate version @@ -208,7 +197,7 @@ static void __init MP_processor_info(struct mpc_cpu *m) apic_version[m->apicid] = ver; } -static int __init visws_find_smp_config(unsigned int reserve) +static void __init visws_find_smp_config(unsigned int reserve) { struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS); unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); @@ -230,21 +219,9 @@ static int __init visws_find_smp_config(unsigned int reserve) MP_processor_info(mp++); mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - - return 1; } -static int visws_trap_init(void); - -static struct x86_quirks visws_x86_quirks __initdata = { - .arch_time_init = visws_time_init, - .arch_pre_intr_init = visws_pre_intr_init, - .arch_memory_setup = visws_memory_setup, - .arch_intr_init = NULL, - .arch_trap_init = visws_trap_init, - .mach_get_smp_config = visws_get_smp_config, - .mach_find_smp_config = visws_find_smp_config, -}; +static void visws_trap_init(void); void __init visws_early_detect(void) { @@ -257,11 +234,14 @@ void __init visws_early_detect(void) return; /* - * Install special quirks for timer, interrupt and memory setup: - * Fall back to generic behavior for traps: - * Override generic MP-table parsing: + * Override the default platform setup functions */ - x86_quirks = &visws_x86_quirks; + x86_init.resources.memory_setup = visws_memory_setup; + x86_init.mpparse.get_smp_config = visws_get_smp_config; + x86_init.mpparse.find_smp_config = visws_find_smp_config; + x86_init.irqs.pre_vector_init = visws_pre_intr_init; + x86_init.irqs.trap_init = visws_trap_init; + x86_init.timers.timer_init = visws_time_init; /* * Install reboot quirks: @@ -400,12 +380,10 @@ static __init void cobalt_init(void) co_apic_read(CO_APIC_ID)); } -static int __init visws_trap_init(void) +static void __init visws_trap_init(void) { lithium_init(); cobalt_init(); - - return 1; } /* @@ -508,7 +486,7 @@ static void end_cobalt_irq(unsigned int irq) } static struct irq_chip cobalt_irq_type = { - .typename = "Cobalt-APIC", + .name = "Cobalt-APIC", .startup = startup_cobalt_irq, .shutdown = disable_cobalt_irq, .enable = enable_cobalt_irq, @@ -545,7 +523,7 @@ static void end_piix4_master_irq(unsigned int irq) } static struct irq_chip piix4_master_irq_type = { - .typename = "PIIX4-master", + .name = "PIIX4-master", .startup = startup_piix4_master_irq, .ack = ack_cobalt_irq, .end = end_piix4_master_irq, @@ -553,7 +531,7 @@ static struct irq_chip piix4_master_irq_type = { static struct irq_chip piix4_virtual_irq_type = { - .typename = "PIIX4-virtual", + .name = "PIIX4-virtual", .shutdown = disable_8259A_irq, .enable = enable_8259A_irq, .disable = disable_8259A_irq, diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 95a7289e4b0c..d430e4c30193 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -648,7 +648,7 @@ static inline int __init activate_vmi(void) pv_info.paravirt_enabled = 1; pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK; - pv_info.name = 
"vmi"; + pv_info.name = "vmi [deprecated]"; pv_init_ops.patch = vmi_patch; @@ -817,15 +817,15 @@ static inline int __init activate_vmi(void) vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm); vmi_timer_ops.cancel_alarm = vmi_get_function(VMI_CALL_CancelAlarm); - pv_time_ops.time_init = vmi_time_init; - pv_time_ops.get_wallclock = vmi_get_wallclock; - pv_time_ops.set_wallclock = vmi_set_wallclock; + x86_init.timers.timer_init = vmi_time_init; #ifdef CONFIG_X86_LOCAL_APIC - pv_apic_ops.setup_boot_clock = vmi_time_bsp_init; - pv_apic_ops.setup_secondary_clock = vmi_time_ap_init; + x86_init.timers.setup_percpu_clockev = vmi_time_bsp_init; + x86_cpuinit.setup_percpu_clockev = vmi_time_ap_init; #endif pv_time_ops.sched_clock = vmi_sched_clock; - pv_time_ops.get_tsc_khz = vmi_tsc_khz; + x86_platform.calibrate_tsc = vmi_tsc_khz; + x86_platform.get_wallclock = vmi_get_wallclock; + x86_platform.set_wallclock = vmi_set_wallclock; /* We have true wallclock functions; disable CMOS clock sync */ no_sync_cmos_clock = 1; diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index 2b3eb82efeeb..611b9e2360d3 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -68,7 +68,7 @@ unsigned long long vmi_sched_clock(void) return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE)); } -/* paravirt_ops.get_tsc_khz = vmi_tsc_khz */ +/* x86_platform.calibrate_tsc = vmi_tsc_khz */ unsigned long vmi_tsc_khz(void) { unsigned long long khz; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 9fc178255c04..3c68fe2d46cf 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -45,9 +45,9 @@ PHDRS { text PT_LOAD FLAGS(5); /* R_E */ data PT_LOAD FLAGS(7); /* RWE */ #ifdef CONFIG_X86_64 - user PT_LOAD FLAGS(7); /* RWE */ + user PT_LOAD FLAGS(5); /* R_E */ #ifdef CONFIG_SMP - percpu PT_LOAD FLAGS(7); /* RWE */ + percpu PT_LOAD FLAGS(6); /* RW_ */ #endif init PT_LOAD FLAGS(7); /* RWE */ #endif @@ -65,17 +65,11 @@ SECTIONS #endif /* Text and read-only data */ - - /* bootstrapping code */ - .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { - _text = .; - *(.text.head) - } :text = 0x9090 - - /* The rest of the text */ .text : AT(ADDR(.text) - LOAD_OFFSET) { + _text = .; + /* bootstrapping code */ + HEAD_TEXT #ifdef CONFIG_X86_32 - /* not really needed, already page aligned */ . = ALIGN(PAGE_SIZE); *(.text.page_aligned) #endif @@ -94,13 +88,7 @@ SECTIONS NOTES :text :note - /* Exception table */ - . = ALIGN(16); - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { - __start___ex_table = .; - *(__ex_table) - __stop___ex_table = .; - } :text = 0x9090 + EXCEPTION_TABLE(16) :text = 0x9090 RO_DATA(PAGE_SIZE) @@ -118,7 +106,6 @@ SECTIONS #endif PAGE_ALIGNED_DATA(PAGE_SIZE) - *(.data.idt) CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES) @@ -135,24 +122,21 @@ SECTIONS #ifdef CONFIG_X86_64 #define VSYSCALL_ADDR (-10*1024*1024) -#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data) + SIZEOF(.data) + \ - PAGE_SIZE - 1) & ~(PAGE_SIZE - 1)) -#define VSYSCALL_VIRT_ADDR ((ADDR(.data) + SIZEOF(.data) + \ - PAGE_SIZE - 1) & ~(PAGE_SIZE - 1)) -#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) +#define VLOAD_OFFSET (VSYSCALL_ADDR - __vsyscall_0 + LOAD_OFFSET) #define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) -#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) +#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0) #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) + . = ALIGN(4096); + __vsyscall_0 = .; + . 
= VSYSCALL_ADDR; - .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { + .vsyscall_0 : AT(VLOAD(.vsyscall_0)) { *(.vsyscall_0) } :user - __vsyscall_0 = VSYSCALL_VIRT_ADDR; - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) @@ -192,11 +176,9 @@ SECTIONS *(.vsyscall_3) } - . = VSYSCALL_VIRT_ADDR + PAGE_SIZE; + . = __vsyscall_0 + PAGE_SIZE; #undef VSYSCALL_ADDR -#undef VSYSCALL_PHYS_ADDR -#undef VSYSCALL_VIRT_ADDR #undef VLOAD_OFFSET #undef VLOAD #undef VVIRT_OFFSET @@ -219,36 +201,12 @@ SECTIONS PERCPU_VADDR(0, :percpu) #endif - .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { - _sinittext = .; - INIT_TEXT - _einittext = .; - } + INIT_TEXT_SECTION(PAGE_SIZE) #ifdef CONFIG_X86_64 :init #endif - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { - INIT_DATA - } - - . = ALIGN(16); - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { - __setup_start = .; - *(.init.setup) - __setup_end = .; - } - .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { - __initcall_start = .; - INITCALLS - __initcall_end = .; - } - - .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { - __con_initcall_start = .; - *(.con_initcall.init) - __con_initcall_end = .; - } + INIT_DATA_SECTION(16) .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { __x86_cpu_dev_start = .; @@ -256,8 +214,6 @@ SECTIONS __x86_cpu_dev_end = .; } - SECURITY_INIT - . = ALIGN(8); .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { __parainstructions = .; @@ -288,15 +244,6 @@ SECTIONS EXIT_DATA } -#ifdef CONFIG_BLK_DEV_INITRD - . = ALIGN(PAGE_SIZE); - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { - __initramfs_start = .; - *(.init.ramfs) - __initramfs_end = .; - } -#endif - #if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) PERCPU(PAGE_SIZE) #endif @@ -348,19 +295,19 @@ SECTIONS _end = .; } - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.eh_frame) - *(.discard) - } - STABS_DEBUG DWARF_DEBUG + + /* Sections to be discarded */ + DISCARDS + /DISCARD/ : { *(.eh_frame) } } #ifdef CONFIG_X86_32 +/* + * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility: + */ . 
= ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), "kernel image bigger than KERNEL_IMAGE_SIZE"); #else diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 25ee06a80aad..8cb4974ff599 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -87,6 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; + vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } @@ -227,19 +228,11 @@ static long __vsyscall(3) venosys_1(void) } #ifdef CONFIG_SYSCTL - -static int -vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return proc_dointvec(ctl, write, filp, buffer, lenp, ppos); -} - static ctl_table kernel_table2[] = { { .procname = "vsyscall64", .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = vsyscall_sysctl_change }, + .proc_handler = proc_dointvec }, {} }; diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 3909e3ba5ce3..a1029769b6f2 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -30,9 +30,8 @@ EXPORT_SYMBOL(__put_user_8); EXPORT_SYMBOL(copy_user_generic); EXPORT_SYMBOL(__copy_user_nocache); -EXPORT_SYMBOL(copy_from_user); -EXPORT_SYMBOL(copy_to_user); -EXPORT_SYMBOL(__copy_from_user_inatomic); +EXPORT_SYMBOL(_copy_from_user); +EXPORT_SYMBOL(_copy_to_user); EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c new file mode 100644 index 000000000000..d11c5ff7c65e --- /dev/null +++ b/arch/x86/kernel/x86_init.c @@ -0,0 +1,83 @@ +/* + * Copyright (C) 2009 Thomas Gleixner <tglx@linutronix.de> + * + * For licencing details see kernel-base/COPYING + */ +#include <linux/init.h> + +#include <asm/bios_ebda.h> +#include <asm/paravirt.h> +#include <asm/mpspec.h> +#include <asm/setup.h> +#include <asm/apic.h> +#include <asm/e820.h> +#include <asm/time.h> +#include <asm/irq.h> +#include <asm/tsc.h> +#include <asm/iommu.h> + +void __cpuinit x86_init_noop(void) { } +void __init x86_init_uint_noop(unsigned int unused) { } +void __init x86_init_pgd_noop(pgd_t *unused) { } +int __init iommu_init_noop(void) { return 0; } +void iommu_shutdown_noop(void) { } + +/* + * The platform setup functions are preset with the default functions + * for standard PC hardware. 
+ */ +struct x86_init_ops x86_init __initdata = { + + .resources = { + .probe_roms = x86_init_noop, + .reserve_resources = reserve_standard_io_resources, + .memory_setup = default_machine_specific_memory_setup, + }, + + .mpparse = { + .mpc_record = x86_init_uint_noop, + .setup_ioapic_ids = x86_init_noop, + .mpc_apic_id = default_mpc_apic_id, + .smp_read_mpc_oem = default_smp_read_mpc_oem, + .mpc_oem_bus_info = default_mpc_oem_bus_info, + .find_smp_config = default_find_smp_config, + .get_smp_config = default_get_smp_config, + }, + + .irqs = { + .pre_vector_init = init_ISA_irqs, + .intr_init = native_init_IRQ, + .trap_init = x86_init_noop, + }, + + .oem = { + .arch_setup = x86_init_noop, + .banner = default_banner, + }, + + .paging = { + .pagetable_setup_start = native_pagetable_setup_start, + .pagetable_setup_done = native_pagetable_setup_done, + }, + + .timers = { + .setup_percpu_clockev = setup_boot_APIC_clock, + .tsc_pre_init = x86_init_noop, + .timer_init = hpet_time_init, + }, + + .iommu = { + .iommu_init = iommu_init_noop, + }, +}; + +struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { + .setup_percpu_clockev = setup_secondary_APIC_clock, +}; + +struct x86_platform_ops x86_platform = { + .calibrate_tsc = native_calibrate_tsc, + .get_wallclock = mach_get_cmos_time, + .set_wallclock = mach_set_rtc_mmss, + .iommu_shutdown = iommu_shutdown_noop, +}; |
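The conversion above removes the old x86_quirks and pv_time_ops indirections; boards and hypervisors now overwrite the x86_init / x86_platform hooks directly at early-detection time, as visws_early_detect() and activate_vmi() do in this series. A minimal sketch of that override pattern, assuming hypothetical my_board_* functions that are not part of this patch (only the hook assignments mirror the code above):

#include <linux/init.h>
#include <asm/x86_init.h>
#include <asm/time.h>

/* Hypothetical board support code -- illustrates the override pattern only. */
static void __init my_board_timer_init(void)
{
	/* Program the board timer here, then install the IRQ0 handler. */
	setup_default_timer_irq();
}

static unsigned long my_board_calibrate_tsc(void)
{
	/* A real port would read the frequency from firmware; value is made up. */
	return 1000000;	/* kHz */
}

void __init my_board_early_detect(void)
{
	/* Override only the hooks that differ from standard PC behaviour. */
	x86_init.timers.timer_init = my_board_timer_init;
	x86_platform.calibrate_tsc = my_board_calibrate_tsc;
}

Hooks that are not overridden keep the standard PC defaults preset in x86_init, x86_cpuinit and x86_platform above.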