diff options
Diffstat (limited to 'arch/x86_64/kernel')
-rw-r--r-- | arch/x86_64/kernel/Makefile | 4 | ||||
-rw-r--r-- | arch/x86_64/kernel/apic.c | 5 | ||||
-rw-r--r-- | arch/x86_64/kernel/early-quirks.c | 4 | ||||
-rw-r--r-- | arch/x86_64/kernel/hpet.c | 511 | ||||
-rw-r--r-- | arch/x86_64/kernel/i8259.c | 1 | ||||
-rw-r--r-- | arch/x86_64/kernel/io_apic.c | 4 | ||||
-rw-r--r-- | arch/x86_64/kernel/pmtimer.c | 58 | ||||
-rw-r--r-- | arch/x86_64/kernel/smpboot.c | 231 | ||||
-rw-r--r-- | arch/x86_64/kernel/time.c | 961 | ||||
-rw-r--r-- | arch/x86_64/kernel/tsc.c | 226 | ||||
-rw-r--r-- | arch/x86_64/kernel/tsc_sync.c | 187 | ||||
-rw-r--r-- | arch/x86_64/kernel/vmlinux.lds.S | 28 | ||||
-rw-r--r-- | arch/x86_64/kernel/vsyscall.c | 121 |
13 files changed, 1040 insertions, 1301 deletions
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index ae399458024b..bb47e86f3d02 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -8,7 +8,7 @@ obj-y := process.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ x8664_ksyms.o i387.o syscall.o vsyscall.o \ setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \ - pci-dma.o pci-nommu.o alternative.o + pci-dma.o pci-nommu.o alternative.o hpet.o tsc.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_X86_MCE) += mce.o therm_throt.o @@ -19,7 +19,7 @@ obj-$(CONFIG_ACPI) += acpi/ obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_X86_CPUID) += cpuid.o -obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o +obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o tsc_sync.o obj-y += apic.o nmi.o obj-y += io_apic.o mpparse.o \ genapic.o genapic_cluster.o genapic_flat.o diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 124b2d27b4ac..723417d924c0 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -37,6 +37,7 @@ #include <asm/idle.h> #include <asm/proto.h> #include <asm/timex.h> +#include <asm/hpet.h> #include <asm/apic.h> int apic_mapped; @@ -763,7 +764,7 @@ static void setup_APIC_timer(unsigned int clocks) local_irq_save(flags); /* wait for irq slice */ - if (vxtime.hpet_address && hpet_use_timer) { + if (hpet_address && hpet_use_timer) { int trigger = hpet_readl(HPET_T0_CMP); while (hpet_readl(HPET_COUNTER) >= trigger) /* do nothing */ ; @@ -785,7 +786,7 @@ static void setup_APIC_timer(unsigned int clocks) /* Turn off PIT interrupt if we use APIC timer as main timer. Only works with the PM timer right now TBD fix it for HPET too. */ - if (vxtime.mode == VXTIME_PMTMR && + if ((pmtmr_ioport != 0) && smp_processor_id() == boot_cpu_id && apic_runs_main_timer == 1 && !cpu_isset(boot_cpu_id, timer_interrupt_broadcast_ipi_mask)) { diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c index bd30d138113f..8047ea8c2ab2 100644 --- a/arch/x86_64/kernel/early-quirks.c +++ b/arch/x86_64/kernel/early-quirks.c @@ -53,7 +53,9 @@ static void nvidia_bugs(void) return; nvidia_hpet_detected = 0; - acpi_table_parse(ACPI_SIG_HPET, nvidia_hpet_check); + if (acpi_table_parse(ACPI_SIG_HPET, nvidia_hpet_check)) + return; + if (nvidia_hpet_detected == 0) { acpi_skip_timer_override = 1; printk(KERN_INFO "Nvidia board " diff --git a/arch/x86_64/kernel/hpet.c b/arch/x86_64/kernel/hpet.c new file mode 100644 index 000000000000..65a0edd71a17 --- /dev/null +++ b/arch/x86_64/kernel/hpet.c @@ -0,0 +1,511 @@ +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/mc146818rtc.h> +#include <linux/time.h> +#include <linux/clocksource.h> +#include <linux/ioport.h> +#include <linux/acpi.h> +#include <linux/hpet.h> +#include <asm/pgtable.h> +#include <asm/vsyscall.h> +#include <asm/timex.h> +#include <asm/hpet.h> + +int nohpet __initdata; + +unsigned long hpet_address; +unsigned long hpet_period; /* fsecs / HPET clock */ +unsigned long hpet_tick; /* HPET clocks / interrupt */ + +int hpet_use_timer; /* Use counter of hpet for time keeping, + * otherwise PIT + */ + +#ifdef CONFIG_HPET +static __init int late_hpet_init(void) +{ + struct hpet_data hd; + unsigned int ntimer; + + if (!hpet_address) + return 0; + + memset(&hd, 0, sizeof(hd)); + + ntimer = hpet_readl(HPET_ID); + ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; + ntimer++; + + /* + * Register with driver. + * Timer0 and Timer1 is used by platform. + */ + hd.hd_phys_address = hpet_address; + hd.hd_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE); + hd.hd_nirqs = ntimer; + hd.hd_flags = HPET_DATA_PLATFORM; + hpet_reserve_timer(&hd, 0); +#ifdef CONFIG_HPET_EMULATE_RTC + hpet_reserve_timer(&hd, 1); +#endif + hd.hd_irq[0] = HPET_LEGACY_8254; + hd.hd_irq[1] = HPET_LEGACY_RTC; + if (ntimer > 2) { + struct hpet *hpet; + struct hpet_timer *timer; + int i; + + hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE); + timer = &hpet->hpet_timers[2]; + for (i = 2; i < ntimer; timer++, i++) + hd.hd_irq[i] = (timer->hpet_config & + Tn_INT_ROUTE_CNF_MASK) >> + Tn_INT_ROUTE_CNF_SHIFT; + + } + + hpet_alloc(&hd); + return 0; +} +fs_initcall(late_hpet_init); +#endif + +int hpet_timer_stop_set_go(unsigned long tick) +{ + unsigned int cfg; + +/* + * Stop the timers and reset the main counter. + */ + + cfg = hpet_readl(HPET_CFG); + cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY); + hpet_writel(cfg, HPET_CFG); + hpet_writel(0, HPET_COUNTER); + hpet_writel(0, HPET_COUNTER + 4); + +/* + * Set up timer 0, as periodic with first interrupt to happen at hpet_tick, + * and period also hpet_tick. + */ + if (hpet_use_timer) { + hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | + HPET_TN_32BIT, HPET_T0_CFG); + hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */ + hpet_writel(hpet_tick, HPET_T0_CMP); /* period */ + cfg |= HPET_CFG_LEGACY; + } +/* + * Go! + */ + + cfg |= HPET_CFG_ENABLE; + hpet_writel(cfg, HPET_CFG); + + return 0; +} + +int hpet_arch_init(void) +{ + unsigned int id; + + if (!hpet_address) + return -1; + set_fixmap_nocache(FIX_HPET_BASE, hpet_address); + __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); + +/* + * Read the period, compute tick and quotient. + */ + + id = hpet_readl(HPET_ID); + + if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER)) + return -1; + + hpet_period = hpet_readl(HPET_PERIOD); + if (hpet_period < 100000 || hpet_period > 100000000) + return -1; + + hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period; + + hpet_use_timer = (id & HPET_ID_LEGSUP); + + return hpet_timer_stop_set_go(hpet_tick); +} + +int hpet_reenable(void) +{ + return hpet_timer_stop_set_go(hpet_tick); +} + +/* + * calibrate_tsc() calibrates the processor TSC in a very simple way, comparing + * it to the HPET timer of known frequency. + */ + +#define TICK_COUNT 100000000 +#define TICK_MIN 5000 + +/* + * Some platforms take periodic SMI interrupts with 5ms duration. Make sure none + * occurs between the reads of the hpet & TSC. + */ +static void __init read_hpet_tsc(int *hpet, int *tsc) +{ + int tsc1, tsc2, hpet1; + + do { + tsc1 = get_cycles_sync(); + hpet1 = hpet_readl(HPET_COUNTER); + tsc2 = get_cycles_sync(); + } while (tsc2 - tsc1 > TICK_MIN); + *hpet = hpet1; + *tsc = tsc2; +} + +unsigned int __init hpet_calibrate_tsc(void) +{ + int tsc_start, hpet_start; + int tsc_now, hpet_now; + unsigned long flags; + + local_irq_save(flags); + + read_hpet_tsc(&hpet_start, &tsc_start); + + do { + local_irq_disable(); + read_hpet_tsc(&hpet_now, &tsc_now); + local_irq_restore(flags); + } while ((tsc_now - tsc_start) < TICK_COUNT && + (hpet_now - hpet_start) < TICK_COUNT); + + return (tsc_now - tsc_start) * 1000000000L + / ((hpet_now - hpet_start) * hpet_period / 1000); +} + +#ifdef CONFIG_HPET_EMULATE_RTC +/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET + * is enabled, we support RTC interrupt functionality in software. + * RTC has 3 kinds of interrupts: + * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock + * is updated + * 2) Alarm Interrupt - generate an interrupt at a specific time of day + * 3) Periodic Interrupt - generate periodic interrupt, with frequencies + * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2) + * (1) and (2) above are implemented using polling at a frequency of + * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt + * overhead. (DEFAULT_RTC_INT_FREQ) + * For (3), we use interrupts at 64Hz or user specified periodic + * frequency, whichever is higher. + */ +#include <linux/rtc.h> + +#define DEFAULT_RTC_INT_FREQ 64 +#define RTC_NUM_INTS 1 + +static unsigned long UIE_on; +static unsigned long prev_update_sec; + +static unsigned long AIE_on; +static struct rtc_time alarm_time; + +static unsigned long PIE_on; +static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ; +static unsigned long PIE_count; + +static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */ +static unsigned int hpet_t1_cmp; /* cached comparator register */ + +int is_hpet_enabled(void) +{ + return hpet_address != 0; +} + +/* + * Timer 1 for RTC, we do not use periodic interrupt feature, + * even if HPET supports periodic interrupts on Timer 1. + * The reason being, to set up a periodic interrupt in HPET, we need to + * stop the main counter. And if we do that everytime someone diables/enables + * RTC, we will have adverse effect on main kernel timer running on Timer 0. + * So, for the time being, simulate the periodic interrupt in software. + * + * hpet_rtc_timer_init() is called for the first time and during subsequent + * interuppts reinit happens through hpet_rtc_timer_reinit(). + */ +int hpet_rtc_timer_init(void) +{ + unsigned int cfg, cnt; + unsigned long flags; + + if (!is_hpet_enabled()) + return 0; + /* + * Set the counter 1 and enable the interrupts. + */ + if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) + hpet_rtc_int_freq = PIE_freq; + else + hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; + + local_irq_save(flags); + + cnt = hpet_readl(HPET_COUNTER); + cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq); + hpet_writel(cnt, HPET_T1_CMP); + hpet_t1_cmp = cnt; + + cfg = hpet_readl(HPET_T1_CFG); + cfg &= ~HPET_TN_PERIODIC; + cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; + hpet_writel(cfg, HPET_T1_CFG); + + local_irq_restore(flags); + + return 1; +} + +static void hpet_rtc_timer_reinit(void) +{ + unsigned int cfg, cnt, ticks_per_int, lost_ints; + + if (unlikely(!(PIE_on | AIE_on | UIE_on))) { + cfg = hpet_readl(HPET_T1_CFG); + cfg &= ~HPET_TN_ENABLE; + hpet_writel(cfg, HPET_T1_CFG); + return; + } + + if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) + hpet_rtc_int_freq = PIE_freq; + else + hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; + + /* It is more accurate to use the comparator value than current count.*/ + ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq; + hpet_t1_cmp += ticks_per_int; + hpet_writel(hpet_t1_cmp, HPET_T1_CMP); + + /* + * If the interrupt handler was delayed too long, the write above tries + * to schedule the next interrupt in the past and the hardware would + * not interrupt until the counter had wrapped around. + * So we have to check that the comparator wasn't set to a past time. + */ + cnt = hpet_readl(HPET_COUNTER); + if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) { + lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1; + /* Make sure that, even with the time needed to execute + * this code, the next scheduled interrupt has been moved + * back to the future: */ + lost_ints++; + + hpet_t1_cmp += lost_ints * ticks_per_int; + hpet_writel(hpet_t1_cmp, HPET_T1_CMP); + + if (PIE_on) + PIE_count += lost_ints; + + if (printk_ratelimit()) + printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", + hpet_rtc_int_freq); + } +} + +/* + * The functions below are called from rtc driver. + * Return 0 if HPET is not being used. + * Otherwise do the necessary changes and return 1. + */ +int hpet_mask_rtc_irq_bit(unsigned long bit_mask) +{ + if (!is_hpet_enabled()) + return 0; + + if (bit_mask & RTC_UIE) + UIE_on = 0; + if (bit_mask & RTC_PIE) + PIE_on = 0; + if (bit_mask & RTC_AIE) + AIE_on = 0; + + return 1; +} + +int hpet_set_rtc_irq_bit(unsigned long bit_mask) +{ + int timer_init_reqd = 0; + + if (!is_hpet_enabled()) + return 0; + + if (!(PIE_on | AIE_on | UIE_on)) + timer_init_reqd = 1; + + if (bit_mask & RTC_UIE) { + UIE_on = 1; + } + if (bit_mask & RTC_PIE) { + PIE_on = 1; + PIE_count = 0; + } + if (bit_mask & RTC_AIE) { + AIE_on = 1; + } + + if (timer_init_reqd) + hpet_rtc_timer_init(); + + return 1; +} + +int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec) +{ + if (!is_hpet_enabled()) + return 0; + + alarm_time.tm_hour = hrs; + alarm_time.tm_min = min; + alarm_time.tm_sec = sec; + + return 1; +} + +int hpet_set_periodic_freq(unsigned long freq) +{ + if (!is_hpet_enabled()) + return 0; + + PIE_freq = freq; + PIE_count = 0; + + return 1; +} + +int hpet_rtc_dropped_irq(void) +{ + if (!is_hpet_enabled()) + return 0; + + return 1; +} + +irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + struct rtc_time curr_time; + unsigned long rtc_int_flag = 0; + int call_rtc_interrupt = 0; + + hpet_rtc_timer_reinit(); + + if (UIE_on | AIE_on) { + rtc_get_rtc_time(&curr_time); + } + if (UIE_on) { + if (curr_time.tm_sec != prev_update_sec) { + /* Set update int info, call real rtc int routine */ + call_rtc_interrupt = 1; + rtc_int_flag = RTC_UF; + prev_update_sec = curr_time.tm_sec; + } + } + if (PIE_on) { + PIE_count++; + if (PIE_count >= hpet_rtc_int_freq/PIE_freq) { + /* Set periodic int info, call real rtc int routine */ + call_rtc_interrupt = 1; + rtc_int_flag |= RTC_PF; + PIE_count = 0; + } + } + if (AIE_on) { + if ((curr_time.tm_sec == alarm_time.tm_sec) && + (curr_time.tm_min == alarm_time.tm_min) && + (curr_time.tm_hour == alarm_time.tm_hour)) { + /* Set alarm int info, call real rtc int routine */ + call_rtc_interrupt = 1; + rtc_int_flag |= RTC_AF; + } + } + if (call_rtc_interrupt) { + rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); + rtc_interrupt(rtc_int_flag, dev_id); + } + return IRQ_HANDLED; +} +#endif + +static int __init nohpet_setup(char *s) +{ + nohpet = 1; + return 1; +} + +__setup("nohpet", nohpet_setup); + +#define HPET_MASK 0xFFFFFFFF +#define HPET_SHIFT 22 + +/* FSEC = 10^-15 NSEC = 10^-9 */ +#define FSEC_PER_NSEC 1000000 + +static void *hpet_ptr; + +static cycle_t read_hpet(void) +{ + return (cycle_t)readl(hpet_ptr); +} + +static cycle_t __vsyscall_fn vread_hpet(void) +{ + return readl((void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0); +} + +struct clocksource clocksource_hpet = { + .name = "hpet", + .rating = 250, + .read = read_hpet, + .mask = (cycle_t)HPET_MASK, + .mult = 0, /* set below */ + .shift = HPET_SHIFT, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .vread = vread_hpet, +}; + +static int __init init_hpet_clocksource(void) +{ + unsigned long hpet_period; + void __iomem *hpet_base; + u64 tmp; + + if (!hpet_address) + return -ENODEV; + + /* calculate the hpet address: */ + hpet_base = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); + hpet_ptr = hpet_base + HPET_COUNTER; + + /* calculate the frequency: */ + hpet_period = readl(hpet_base + HPET_PERIOD); + + /* + * hpet period is in femto seconds per cycle + * so we need to convert this to ns/cyc units + * aproximated by mult/2^shift + * + * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift + * fsec/cyc * 1ns/1000000fsec * 2^shift = mult + * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult + * (fsec/cyc << shift)/1000000 = mult + * (hpet_period << shift)/FSEC_PER_NSEC = mult + */ + tmp = (u64)hpet_period << HPET_SHIFT; + do_div(tmp, FSEC_PER_NSEC); + clocksource_hpet.mult = (u32)tmp; + + return clocksource_register(&clocksource_hpet); +} + +module_init(init_hpet_clocksource); diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c index d73c79e821f1..01e2cf0bdeb1 100644 --- a/arch/x86_64/kernel/i8259.c +++ b/arch/x86_64/kernel/i8259.c @@ -103,6 +103,7 @@ static void mask_and_ack_8259A(unsigned int); static struct irq_chip i8259A_chip = { .name = "XT-PIC", .mask = disable_8259A_irq, + .disable = disable_8259A_irq, .unmask = enable_8259A_irq, .mask_ack = mask_and_ack_8259A, }; diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 566e64d966c4..950682f35766 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -810,11 +810,9 @@ static void ioapic_register_intr(int irq, int vector, unsigned long trigger) trigger == IOAPIC_LEVEL) set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_fasteoi_irq, "fasteoi"); - else { - irq_desc[irq].status |= IRQ_DELAYED_DISABLE; + else set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_edge_irq, "edge"); - } } static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq) { diff --git a/arch/x86_64/kernel/pmtimer.c b/arch/x86_64/kernel/pmtimer.c index 7554458dc9cb..ae8f91214f15 100644 --- a/arch/x86_64/kernel/pmtimer.c +++ b/arch/x86_64/kernel/pmtimer.c @@ -24,15 +24,6 @@ #include <asm/msr.h> #include <asm/vsyscall.h> -/* The I/O port the PMTMR resides at. - * The location is detected during setup_arch(), - * in arch/i386/kernel/acpi/boot.c */ -u32 pmtmr_ioport __read_mostly; - -/* value of the Power timer at last timer interrupt */ -static u32 offset_delay; -static u32 last_pmtmr_tick; - #define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ static inline u32 cyc2us(u32 cycles) @@ -48,38 +39,6 @@ static inline u32 cyc2us(u32 cycles) return (cycles >> 10); } -int pmtimer_mark_offset(void) -{ - static int first_run = 1; - unsigned long tsc; - u32 lost; - - u32 tick = inl(pmtmr_ioport); - u32 delta; - - delta = cyc2us((tick - last_pmtmr_tick) & ACPI_PM_MASK); - - last_pmtmr_tick = tick; - monotonic_base += delta * NSEC_PER_USEC; - - delta += offset_delay; - - lost = delta / (USEC_PER_SEC / HZ); - offset_delay = delta % (USEC_PER_SEC / HZ); - - rdtscll(tsc); - vxtime.last_tsc = tsc - offset_delay * (u64)cpu_khz / 1000; - - /* don't calculate delay for first run, - or if we've got less then a tick */ - if (first_run || (lost < 1)) { - first_run = 0; - offset_delay = 0; - } - - return lost - 1; -} - static unsigned pmtimer_wait_tick(void) { u32 a, b; @@ -101,23 +60,6 @@ void pmtimer_wait(unsigned us) } while (cyc2us(b - a) < us); } -void pmtimer_resume(void) -{ - last_pmtmr_tick = inl(pmtmr_ioport); -} - -unsigned int do_gettimeoffset_pm(void) -{ - u32 now, offset, delta = 0; - - offset = last_pmtmr_tick; - now = inl(pmtmr_ioport); - delta = (now - offset) & ACPI_PM_MASK; - - return offset_delay + cyc2us(delta); -} - - static int __init nopmtimer_setup(char *s) { pmtmr_ioport = 0; diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index daf19332f0dd..35443729aad8 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -148,217 +148,6 @@ static void __cpuinit smp_store_cpu_info(int id) print_cpu_info(c); } -/* - * New Funky TSC sync algorithm borrowed from IA64. - * Main advantage is that it doesn't reset the TSCs fully and - * in general looks more robust and it works better than my earlier - * attempts. I believe it was written by David Mosberger. Some minor - * adjustments for x86-64 by me -AK - * - * Original comment reproduced below. - * - * Synchronize TSC of the current (slave) CPU with the TSC of the - * MASTER CPU (normally the time-keeper CPU). We use a closed loop to - * eliminate the possibility of unaccounted-for errors (such as - * getting a machine check in the middle of a calibration step). The - * basic idea is for the slave to ask the master what itc value it has - * and to read its own itc before and after the master responds. Each - * iteration gives us three timestamps: - * - * slave master - * - * t0 ---\ - * ---\ - * ---> - * tm - * /--- - * /--- - * t1 <--- - * - * - * The goal is to adjust the slave's TSC such that tm falls exactly - * half-way between t0 and t1. If we achieve this, the clocks are - * synchronized provided the interconnect between the slave and the - * master is symmetric. Even if the interconnect were asymmetric, we - * would still know that the synchronization error is smaller than the - * roundtrip latency (t0 - t1). - * - * When the interconnect is quiet and symmetric, this lets us - * synchronize the TSC to within one or two cycles. However, we can - * only *guarantee* that the synchronization is accurate to within a - * round-trip time, which is typically in the range of several hundred - * cycles (e.g., ~500 cycles). In practice, this means that the TSCs - * are usually almost perfectly synchronized, but we shouldn't assume - * that the accuracy is much better than half a micro second or so. - * - * [there are other errors like the latency of RDTSC and of the - * WRMSR. These can also account to hundreds of cycles. So it's - * probably worse. It claims 153 cycles error on a dual Opteron, - * but I suspect the numbers are actually somewhat worse -AK] - */ - -#define MASTER 0 -#define SLAVE (SMP_CACHE_BYTES/8) - -/* Intentionally don't use cpu_relax() while TSC synchronization - because we don't want to go into funky power save modi or cause - hypervisors to schedule us away. Going to sleep would likely affect - latency and low latency is the primary objective here. -AK */ -#define no_cpu_relax() barrier() - -static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock); -static volatile __cpuinitdata unsigned long go[SLAVE + 1]; -static int notscsync __cpuinitdata; - -#undef DEBUG_TSC_SYNC - -#define NUM_ROUNDS 64 /* magic value */ -#define NUM_ITERS 5 /* likewise */ - -/* Callback on boot CPU */ -static __cpuinit void sync_master(void *arg) -{ - unsigned long flags, i; - - go[MASTER] = 0; - - local_irq_save(flags); - { - for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) { - while (!go[MASTER]) - no_cpu_relax(); - go[MASTER] = 0; - rdtscll(go[SLAVE]); - } - } - local_irq_restore(flags); -} - -/* - * Return the number of cycles by which our tsc differs from the tsc - * on the master (time-keeper) CPU. A positive number indicates our - * tsc is ahead of the master, negative that it is behind. - */ -static inline long -get_delta(long *rt, long *master) -{ - unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0; - unsigned long tcenter, t0, t1, tm; - int i; - - for (i = 0; i < NUM_ITERS; ++i) { - rdtscll(t0); - go[MASTER] = 1; - while (!(tm = go[SLAVE])) - no_cpu_relax(); - go[SLAVE] = 0; - rdtscll(t1); - - if (t1 - t0 < best_t1 - best_t0) - best_t0 = t0, best_t1 = t1, best_tm = tm; - } - - *rt = best_t1 - best_t0; - *master = best_tm - best_t0; - - /* average best_t0 and best_t1 without overflow: */ - tcenter = (best_t0/2 + best_t1/2); - if (best_t0 % 2 + best_t1 % 2 == 2) - ++tcenter; - return tcenter - best_tm; -} - -static __cpuinit void sync_tsc(unsigned int master) -{ - int i, done = 0; - long delta, adj, adjust_latency = 0; - unsigned long flags, rt, master_time_stamp, bound; -#ifdef DEBUG_TSC_SYNC - static struct syncdebug { - long rt; /* roundtrip time */ - long master; /* master's timestamp */ - long diff; /* difference between midpoint and master's timestamp */ - long lat; /* estimate of tsc adjustment latency */ - } t[NUM_ROUNDS] __cpuinitdata; -#endif - - printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", - smp_processor_id(), master); - - go[MASTER] = 1; - - /* It is dangerous to broadcast IPI as cpus are coming up, - * as they may not be ready to accept them. So since - * we only need to send the ipi to the boot cpu direct - * the message, and avoid the race. - */ - smp_call_function_single(master, sync_master, NULL, 1, 0); - - while (go[MASTER]) /* wait for master to be ready */ - no_cpu_relax(); - - spin_lock_irqsave(&tsc_sync_lock, flags); - { - for (i = 0; i < NUM_ROUNDS; ++i) { - delta = get_delta(&rt, &master_time_stamp); - if (delta == 0) { - done = 1; /* let's lock on to this... */ - bound = rt; - } - - if (!done) { - unsigned long t; - if (i > 0) { - adjust_latency += -delta; - adj = -delta + adjust_latency/4; - } else - adj = -delta; - - rdtscll(t); - wrmsrl(MSR_IA32_TSC, t + adj); - } -#ifdef DEBUG_TSC_SYNC - t[i].rt = rt; - t[i].master = master_time_stamp; - t[i].diff = delta; - t[i].lat = adjust_latency/4; -#endif - } - } - spin_unlock_irqrestore(&tsc_sync_lock, flags); - -#ifdef DEBUG_TSC_SYNC - for (i = 0; i < NUM_ROUNDS; ++i) - printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n", - t[i].rt, t[i].master, t[i].diff, t[i].lat); -#endif - - printk(KERN_INFO - "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, " - "maxerr %lu cycles)\n", - smp_processor_id(), master, delta, rt); -} - -static void __cpuinit tsc_sync_wait(void) -{ - /* - * When the CPU has synchronized TSCs assume the BIOS - * or the hardware already synced. Otherwise we could - * mess up a possible perfect synchronization with a - * not-quite-perfect algorithm. - */ - if (notscsync || !cpu_has_tsc || !unsynchronized_tsc()) - return; - sync_tsc(0); -} - -static __init int notscsync_setup(char *s) -{ - notscsync = 1; - return 1; -} -__setup("notscsync", notscsync_setup); - static atomic_t init_deasserted __cpuinitdata; /* @@ -546,6 +335,11 @@ void __cpuinit start_secondary(void) /* otherwise gcc will move up the smp_processor_id before the cpu_init */ barrier(); + /* + * Check TSC sync first: + */ + check_tsc_sync_target(); + Dprintk("cpu %d: setting up apic clock\n", smp_processor_id()); setup_secondary_APIC_clock(); @@ -565,14 +359,6 @@ void __cpuinit start_secondary(void) */ set_cpu_sibling_map(smp_processor_id()); - /* - * Wait for TSC sync to not schedule things before. - * We still process interrupts, which could see an inconsistent - * time in that window unfortunately. - * Do this here because TSC sync has global unprotected state. - */ - tsc_sync_wait(); - /* * We need to hold call_lock, so there is no inconsistency * between the time smp_call_function() determines number of @@ -592,6 +378,7 @@ void __cpuinit start_secondary(void) cpu_set(smp_processor_id(), cpu_online_map); per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; spin_unlock(&vector_lock); + unlock_ipi_call_lock(); cpu_idle(); @@ -1168,6 +955,11 @@ int __cpuinit __cpu_up(unsigned int cpu) /* Unleash the CPU! */ Dprintk("waiting for cpu %d\n", cpu); + /* + * Make sure and check TSC sync: + */ + check_tsc_sync_source(cpu); + while (!cpu_isset(cpu, cpu_online_map)) cpu_relax(); @@ -1190,7 +982,6 @@ void __init smp_cpus_done(unsigned int max_cpus) smp_cleanup_boot(); setup_ioapic_dest(); check_nmi_watchdog(); - time_init_gtod(); } #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 3cc6886f1fb7..a87c51705620 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -42,10 +42,8 @@ #include <linux/cpufreq.h> #include <linux/hpet.h> #include <asm/apic.h> +#include <asm/hpet.h> -#ifdef CONFIG_CPU_FREQ -static void cpufreq_delayed_get(void); -#endif extern void i8254_timer_resume(void); extern int using_apic_timer; @@ -55,128 +53,7 @@ DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); DEFINE_SPINLOCK(i8253_lock); -int nohpet __initdata = 0; -static int notsc __initdata = 0; - -#define USEC_PER_TICK (USEC_PER_SEC / HZ) -#define NSEC_PER_TICK (NSEC_PER_SEC / HZ) -#define FSEC_PER_TICK (FSEC_PER_SEC / HZ) - -#define NS_SCALE 10 /* 2^10, carefully chosen */ -#define US_SCALE 32 /* 2^32, arbitralrily chosen */ - -unsigned int cpu_khz; /* TSC clocks / usec, not used here */ -EXPORT_SYMBOL(cpu_khz); -static unsigned long hpet_period; /* fsecs / HPET clock */ -unsigned long hpet_tick; /* HPET clocks / interrupt */ -int hpet_use_timer; /* Use counter of hpet for time keeping, otherwise PIT */ -unsigned long vxtime_hz = PIT_TICK_RATE; -int report_lost_ticks; /* command line option */ -unsigned long long monotonic_base; - -struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */ - volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; -struct timespec __xtime __section_xtime; -struct timezone __sys_tz __section_sys_tz; - -/* - * do_gettimeoffset() returns microseconds since last timer interrupt was - * triggered by hardware. A memory read of HPET is slower than a register read - * of TSC, but much more reliable. It's also synchronized to the timer - * interrupt. Note that do_gettimeoffset() may return more than hpet_tick, if a - * timer interrupt has happened already, but vxtime.trigger wasn't updated yet. - * This is not a problem, because jiffies hasn't updated either. They are bound - * together by xtime_lock. - */ - -static inline unsigned int do_gettimeoffset_tsc(void) -{ - unsigned long t; - unsigned long x; - t = get_cycles_sync(); - if (t < vxtime.last_tsc) - t = vxtime.last_tsc; /* hack */ - x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> US_SCALE; - return x; -} - -static inline unsigned int do_gettimeoffset_hpet(void) -{ - /* cap counter read to one tick to avoid inconsistencies */ - unsigned long counter = hpet_readl(HPET_COUNTER) - vxtime.last; - return (min(counter,hpet_tick) * vxtime.quot) >> US_SCALE; -} - -unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc; - -/* - * This version of gettimeofday() has microsecond resolution and better than - * microsecond precision, as we're using at least a 10 MHz (usually 14.31818 - * MHz) HPET timer. - */ - -void do_gettimeofday(struct timeval *tv) -{ - unsigned long seq; - unsigned int sec, usec; - - do { - seq = read_seqbegin(&xtime_lock); - - sec = xtime.tv_sec; - usec = xtime.tv_nsec / NSEC_PER_USEC; - - /* i386 does some correction here to keep the clock - monotonous even when ntpd is fixing drift. - But they didn't work for me, there is a non monotonic - clock anyways with ntp. - I dropped all corrections now until a real solution can - be found. Note when you fix it here you need to do the same - in arch/x86_64/kernel/vsyscall.c and export all needed - variables in vmlinux.lds. -AK */ - usec += do_gettimeoffset(); - - } while (read_seqretry(&xtime_lock, seq)); - - tv->tv_sec = sec + usec / USEC_PER_SEC; - tv->tv_usec = usec % USEC_PER_SEC; -} - -EXPORT_SYMBOL(do_gettimeofday); - -/* - * settimeofday() first undoes the correction that gettimeofday would do - * on the time, and then saves it. This is ugly, but has been like this for - * ages already. - */ - -int do_settimeofday(struct timespec *tv) -{ - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - - nsec -= do_gettimeoffset() * NSEC_PER_USEC; - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - ntp_clear(); - - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); unsigned long profile_pc(struct pt_regs *regs) { @@ -267,84 +144,9 @@ static void set_rtc_mmss(unsigned long nowtime) } -/* monotonic_clock(): returns # of nanoseconds passed since time_init() - * Note: This function is required to return accurate - * time even in the absence of multiple timer ticks. - */ -static inline unsigned long long cycles_2_ns(unsigned long long cyc); -unsigned long long monotonic_clock(void) -{ - unsigned long seq; - u32 last_offset, this_offset, offset; - unsigned long long base; - - if (vxtime.mode == VXTIME_HPET) { - do { - seq = read_seqbegin(&xtime_lock); - - last_offset = vxtime.last; - base = monotonic_base; - this_offset = hpet_readl(HPET_COUNTER); - } while (read_seqretry(&xtime_lock, seq)); - offset = (this_offset - last_offset); - offset *= NSEC_PER_TICK / hpet_tick; - } else { - do { - seq = read_seqbegin(&xtime_lock); - - last_offset = vxtime.last_tsc; - base = monotonic_base; - } while (read_seqretry(&xtime_lock, seq)); - this_offset = get_cycles_sync(); - offset = cycles_2_ns(this_offset - last_offset); - } - return base + offset; -} -EXPORT_SYMBOL(monotonic_clock); - -static noinline void handle_lost_ticks(int lost) -{ - static long lost_count; - static int warned; - if (report_lost_ticks) { - printk(KERN_WARNING "time.c: Lost %d timer tick(s)! ", lost); - print_symbol("rip %s)\n", get_irq_regs()->rip); - } - - if (lost_count == 1000 && !warned) { - printk(KERN_WARNING "warning: many lost ticks.\n" - KERN_WARNING "Your time source seems to be instable or " - "some driver is hogging interupts\n"); - print_symbol("rip %s\n", get_irq_regs()->rip); - if (vxtime.mode == VXTIME_TSC && vxtime.hpet_address) { - printk(KERN_WARNING "Falling back to HPET\n"); - if (hpet_use_timer) - vxtime.last = hpet_readl(HPET_T0_CMP) - - hpet_tick; - else - vxtime.last = hpet_readl(HPET_COUNTER); - vxtime.mode = VXTIME_HPET; - do_gettimeoffset = do_gettimeoffset_hpet; - } - /* else should fall back to PIT, but code missing. */ - warned = 1; - } else - lost_count++; - -#ifdef CONFIG_CPU_FREQ - /* In some cases the CPU can change frequency without us noticing - Give cpufreq a change to catch up. */ - if ((lost_count+1) % 25 == 0) - cpufreq_delayed_get(); -#endif -} - void main_timer_handler(void) { static unsigned long rtc_update = 0; - unsigned long tsc; - int delay = 0, offset = 0, lost = 0; - /* * Here we are in the timer irq handler. We have irqs locally disabled (so we * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running @@ -354,72 +156,11 @@ void main_timer_handler(void) write_seqlock(&xtime_lock); - if (vxtime.hpet_address) - offset = hpet_readl(HPET_COUNTER); - - if (hpet_use_timer) { - /* if we're using the hpet timer functionality, - * we can more accurately know the counter value - * when the timer interrupt occured. - */ - offset = hpet_readl(HPET_T0_CMP) - hpet_tick; - delay = hpet_readl(HPET_COUNTER) - offset; - } else if (!pmtmr_ioport) { - spin_lock(&i8253_lock); - outb_p(0x00, 0x43); - delay = inb_p(0x40); - delay |= inb(0x40) << 8; - spin_unlock(&i8253_lock); - delay = LATCH - 1 - delay; - } - - tsc = get_cycles_sync(); - - if (vxtime.mode == VXTIME_HPET) { - if (offset - vxtime.last > hpet_tick) { - lost = (offset - vxtime.last) / hpet_tick - 1; - } - - monotonic_base += - (offset - vxtime.last) * NSEC_PER_TICK / hpet_tick; - - vxtime.last = offset; -#ifdef CONFIG_X86_PM_TIMER - } else if (vxtime.mode == VXTIME_PMTMR) { - lost = pmtimer_mark_offset(); -#endif - } else { - offset = (((tsc - vxtime.last_tsc) * - vxtime.tsc_quot) >> US_SCALE) - USEC_PER_TICK; - - if (offset < 0) - offset = 0; - - if (offset > USEC_PER_TICK) { - lost = offset / USEC_PER_TICK; - offset %= USEC_PER_TICK; - } - - monotonic_base += cycles_2_ns(tsc - vxtime.last_tsc); - - vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot; - - if ((((tsc - vxtime.last_tsc) * - vxtime.tsc_quot) >> US_SCALE) < offset) - vxtime.last_tsc = tsc - - (((long) offset << US_SCALE) / vxtime.tsc_quot) - 1; - } - - if (lost > 0) - handle_lost_ticks(lost); - else - lost = 0; - /* * Do the timer stuff. */ - do_timer(lost + 1); + do_timer(1); #ifndef CONFIG_SMP update_process_times(user_mode(get_irq_regs())); #endif @@ -460,40 +201,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static unsigned int cyc2ns_scale __read_mostly; - -static inline void set_cyc2ns_scale(unsigned long cpu_khz) -{ - cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / cpu_khz; -} - -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return (cyc * cyc2ns_scale) >> NS_SCALE; -} - -unsigned long long sched_clock(void) -{ - unsigned long a = 0; - -#if 0 - /* Don't do a HPET read here. Using TSC always is much faster - and HPET may not be mapped yet when the scheduler first runs. - Disadvantage is a small drift between CPUs in some configurations, - but that should be tolerable. */ - if (__vxtime.mode == VXTIME_HPET) - return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> US_SCALE; -#endif - - /* Could do CPU core sync here. Opteron can execute rdtsc speculatively, - which means it is not completely exact and may not be monotonous between - CPUs. But the errors should be too small to matter for scheduling - purposes. */ - - rdtscll(a); - return cycles_2_ns(a); -} - static unsigned long get_cmos_time(void) { unsigned int year, mon, day, hour, min, sec; @@ -545,164 +252,6 @@ static unsigned long get_cmos_time(void) return mktime(year, mon, day, hour, min, sec); } -#ifdef CONFIG_CPU_FREQ - -/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency - changes. - - RED-PEN: On SMP we assume all CPUs run with the same frequency. It's - not that important because current Opteron setups do not support - scaling on SMP anyroads. - - Should fix up last_tsc too. Currently gettimeofday in the - first tick after the change will be slightly wrong. */ - -#include <linux/workqueue.h> - -static unsigned int cpufreq_delayed_issched = 0; -static unsigned int cpufreq_init = 0; -static struct work_struct cpufreq_delayed_get_work; - -static void handle_cpufreq_delayed_get(struct work_struct *v) -{ - unsigned int cpu; - for_each_online_cpu(cpu) { - cpufreq_get(cpu); - } - cpufreq_delayed_issched = 0; -} - -/* if we notice lost ticks, schedule a call to cpufreq_get() as it tries - * to verify the CPU frequency the timing core thinks the CPU is running - * at is still correct. - */ -static void cpufreq_delayed_get(void) -{ - static int warned; - if (cpufreq_init && !cpufreq_delayed_issched) { - cpufreq_delayed_issched = 1; - if (!warned) { - warned = 1; - printk(KERN_DEBUG - "Losing some ticks... checking if CPU frequency changed.\n"); - } - schedule_work(&cpufreq_delayed_get_work); - } -} - -static unsigned int ref_freq = 0; -static unsigned long loops_per_jiffy_ref = 0; - -static unsigned long cpu_khz_ref = 0; - -static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, - void *data) -{ - struct cpufreq_freqs *freq = data; - unsigned long *lpj, dummy; - - if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC)) - return 0; - - lpj = &dummy; - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) -#ifdef CONFIG_SMP - lpj = &cpu_data[freq->cpu].loops_per_jiffy; -#else - lpj = &boot_cpu_data.loops_per_jiffy; -#endif - - if (!ref_freq) { - ref_freq = freq->old; - loops_per_jiffy_ref = *lpj; - cpu_khz_ref = cpu_khz; - } - if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || - (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || - (val == CPUFREQ_RESUMECHANGE)) { - *lpj = - cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); - - cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; - } - - set_cyc2ns_scale(cpu_khz_ref); - - return 0; -} - -static struct notifier_block time_cpufreq_notifier_block = { - .notifier_call = time_cpufreq_notifier -}; - -static int __init cpufreq_tsc(void) -{ - INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get); - if (!cpufreq_register_notifier(&time_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER)) - cpufreq_init = 1; - return 0; -} - -core_initcall(cpufreq_tsc); - -#endif - -/* - * calibrate_tsc() calibrates the processor TSC in a very simple way, comparing - * it to the HPET timer of known frequency. - */ - -#define TICK_COUNT 100000000 -#define TICK_MIN 5000 -#define MAX_READ_RETRIES 5 - -/* - * Some platforms take periodic SMI interrupts with 5ms duration. Make sure none - * occurs between the reads of the hpet & TSC. - */ -static void __init read_hpet_tsc(int *hpet, int *tsc) -{ - int tsc1, tsc2, hpet1, retries = 0; - static int msg; - - do { - tsc1 = get_cycles_sync(); - hpet1 = hpet_readl(HPET_COUNTER); - tsc2 = get_cycles_sync(); - } while (tsc2 - tsc1 > TICK_MIN && retries++ < MAX_READ_RETRIES); - if (retries >= MAX_READ_RETRIES && !msg++) - printk(KERN_WARNING - "hpet.c: exceeded max retries to read HPET & TSC\n"); - *hpet = hpet1; - *tsc = tsc2; -} - - -static unsigned int __init hpet_calibrate_tsc(void) -{ - int tsc_start, hpet_start; - int tsc_now, hpet_now; - unsigned long flags; - - local_irq_save(flags); - local_irq_disable(); - - read_hpet_tsc(&hpet_start, &tsc_start); - - do { - local_irq_disable(); - read_hpet_tsc(&hpet_now, &tsc_now); - local_irq_restore(flags); - } while ((tsc_now - tsc_start) < TICK_COUNT && - (hpet_now - hpet_start) < TICK_COUNT); - - return (tsc_now - tsc_start) * 1000000000L - / ((hpet_now - hpet_start) * hpet_period / 1000); -} - /* * pit_calibrate_tsc() uses the speaker output (channel 2) of @@ -733,124 +282,6 @@ static unsigned int __init pit_calibrate_tsc(void) return (end - start) / 50; } -#ifdef CONFIG_HPET -static __init int late_hpet_init(void) -{ - struct hpet_data hd; - unsigned int ntimer; - - if (!vxtime.hpet_address) - return 0; - - memset(&hd, 0, sizeof (hd)); - - ntimer = hpet_readl(HPET_ID); - ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; - ntimer++; - - /* - * Register with driver. - * Timer0 and Timer1 is used by platform. - */ - hd.hd_phys_address = vxtime.hpet_address; - hd.hd_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE); - hd.hd_nirqs = ntimer; - hd.hd_flags = HPET_DATA_PLATFORM; - hpet_reserve_timer(&hd, 0); -#ifdef CONFIG_HPET_EMULATE_RTC - hpet_reserve_timer(&hd, 1); -#endif - hd.hd_irq[0] = HPET_LEGACY_8254; - hd.hd_irq[1] = HPET_LEGACY_RTC; - if (ntimer > 2) { - struct hpet *hpet; - struct hpet_timer *timer; - int i; - - hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE); - timer = &hpet->hpet_timers[2]; - for (i = 2; i < ntimer; timer++, i++) - hd.hd_irq[i] = (timer->hpet_config & - Tn_INT_ROUTE_CNF_MASK) >> - Tn_INT_ROUTE_CNF_SHIFT; - - } - - hpet_alloc(&hd); - return 0; -} -fs_initcall(late_hpet_init); -#endif - -static int hpet_timer_stop_set_go(unsigned long tick) -{ - unsigned int cfg; - -/* - * Stop the timers and reset the main counter. - */ - - cfg = hpet_readl(HPET_CFG); - cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY); - hpet_writel(cfg, HPET_CFG); - hpet_writel(0, HPET_COUNTER); - hpet_writel(0, HPET_COUNTER + 4); - -/* - * Set up timer 0, as periodic with first interrupt to happen at hpet_tick, - * and period also hpet_tick. - */ - if (hpet_use_timer) { - hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | - HPET_TN_32BIT, HPET_T0_CFG); - hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */ - hpet_writel(hpet_tick, HPET_T0_CMP); /* period */ - cfg |= HPET_CFG_LEGACY; - } -/* - * Go! - */ - - cfg |= HPET_CFG_ENABLE; - hpet_writel(cfg, HPET_CFG); - - return 0; -} - -static int hpet_init(void) -{ - unsigned int id; - - if (!vxtime.hpet_address) - return -1; - set_fixmap_nocache(FIX_HPET_BASE, vxtime.hpet_address); - __set_fixmap(VSYSCALL_HPET, vxtime.hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); - -/* - * Read the period, compute tick and quotient. - */ - - id = hpet_readl(HPET_ID); - - if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER)) - return -1; - - hpet_period = hpet_readl(HPET_PERIOD); - if (hpet_period < 100000 || hpet_period > 100000000) - return -1; - - hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period; - - hpet_use_timer = (id & HPET_ID_LEGSUP); - - return hpet_timer_stop_set_go(hpet_tick); -} - -static int hpet_reenable(void) -{ - return hpet_timer_stop_set_go(hpet_tick); -} - #define PIT_MODE 0x43 #define PIT_CH0 0x40 @@ -878,7 +309,7 @@ void __init pit_stop_interrupt(void) void __init stop_timer_interrupt(void) { char *name; - if (vxtime.hpet_address) { + if (hpet_address) { name = "HPET"; hpet_timer_stop_set_go(0); } else { @@ -888,12 +319,6 @@ void __init stop_timer_interrupt(void) printk(KERN_INFO "timer: %s interrupt stopped.\n", name); } -int __init time_setup(char *str) -{ - report_lost_ticks = 1; - return 1; -} - static struct irqaction irq0 = { timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL }; @@ -901,124 +326,41 @@ static struct irqaction irq0 = { void __init time_init(void) { if (nohpet) - vxtime.hpet_address = 0; - + hpet_address = 0; xtime.tv_sec = get_cmos_time(); xtime.tv_nsec = 0; set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); - if (!hpet_init()) - vxtime_hz = (FSEC_PER_SEC + hpet_period / 2) / hpet_period; - else - vxtime.hpet_address = 0; + if (hpet_arch_init()) + hpet_address = 0; if (hpet_use_timer) { /* set tick_nsec to use the proper rate for HPET */ tick_nsec = TICK_NSEC_HPET; cpu_khz = hpet_calibrate_tsc(); timename = "HPET"; -#ifdef CONFIG_X86_PM_TIMER - } else if (pmtmr_ioport && !vxtime.hpet_address) { - vxtime_hz = PM_TIMER_FREQUENCY; - timename = "PM"; - pit_init(); - cpu_khz = pit_calibrate_tsc(); -#endif } else { pit_init(); cpu_khz = pit_calibrate_tsc(); timename = "PIT"; } - vxtime.mode = VXTIME_TSC; - vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz; - vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; - vxtime.last_tsc = get_cycles_sync(); - set_cyc2ns_scale(cpu_khz); - setup_irq(0, &irq0); - -#ifndef CONFIG_SMP - time_init_gtod(); -#endif -} - -/* - * Make an educated guess if the TSC is trustworthy and synchronized - * over all CPUs. - */ -__cpuinit int unsynchronized_tsc(void) -{ -#ifdef CONFIG_SMP - if (apic_is_clustered_box()) - return 1; -#endif - /* Most intel systems have synchronized TSCs except for - multi node systems */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { -#ifdef CONFIG_ACPI - /* But TSC doesn't tick in C3 so don't use it there */ - if (acpi_gbl_FADT.header.length > 0 && acpi_gbl_FADT.C3latency < 1000) - return 1; -#endif - return 0; - } - - /* Assume multi socket systems are not synchronized */ - return num_present_cpus() > 1; -} - -/* - * Decide what mode gettimeofday should use. - */ -void time_init_gtod(void) -{ - char *timetype; - if (unsynchronized_tsc()) - notsc = 1; + mark_tsc_unstable(); - if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) + if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) vgetcpu_mode = VGETCPU_RDTSCP; else vgetcpu_mode = VGETCPU_LSL; - if (vxtime.hpet_address && notsc) { - timetype = hpet_use_timer ? "HPET" : "PIT/HPET"; - if (hpet_use_timer) - vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; - else - vxtime.last = hpet_readl(HPET_COUNTER); - vxtime.mode = VXTIME_HPET; - do_gettimeoffset = do_gettimeoffset_hpet; -#ifdef CONFIG_X86_PM_TIMER - /* Using PM for gettimeofday is quite slow, but we have no other - choice because the TSC is too unreliable on some systems. */ - } else if (pmtmr_ioport && !vxtime.hpet_address && notsc) { - timetype = "PM"; - do_gettimeoffset = do_gettimeoffset_pm; - vxtime.mode = VXTIME_PMTMR; - sysctl_vsyscall = 0; - printk(KERN_INFO "Disabling vsyscall due to use of PM timer\n"); -#endif - } else { - timetype = hpet_use_timer ? "HPET/TSC" : "PIT/TSC"; - vxtime.mode = VXTIME_TSC; - } - - printk(KERN_INFO "time.c: Using %ld.%06ld MHz WALL %s GTOD %s timer.\n", - vxtime_hz / 1000000, vxtime_hz % 1000000, timename, timetype); + set_cyc2ns_scale(cpu_khz); printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000); - vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz; - vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; - vxtime.last_tsc = get_cycles_sync(); - - set_cyc2ns_scale(cpu_khz); + setup_irq(0, &irq0); } -__setup("report_lost_ticks", time_setup); static long clock_cmos_diff; static unsigned long sleep_start; @@ -1055,7 +397,7 @@ static int timer_resume(struct sys_device *dev) sleep_length = 0; ctime = sleep_start; } - if (vxtime.hpet_address) + if (hpet_address) hpet_reenable(); else i8254_timer_resume(); @@ -1064,20 +406,8 @@ static int timer_resume(struct sys_device *dev) write_seqlock_irqsave(&xtime_lock,flags); xtime.tv_sec = sec; xtime.tv_nsec = 0; - if (vxtime.mode == VXTIME_HPET) { - if (hpet_use_timer) - vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; - else - vxtime.last = hpet_readl(HPET_COUNTER); -#ifdef CONFIG_X86_PM_TIMER - } else if (vxtime.mode == VXTIME_PMTMR) { - pmtimer_resume(); -#endif - } else - vxtime.last_tsc = get_cycles_sync(); - write_sequnlock_irqrestore(&xtime_lock,flags); jiffies += sleep_length; - monotonic_base += sleep_length * (NSEC_PER_SEC/HZ); + write_sequnlock_irqrestore(&xtime_lock,flags); touch_softlockup_watchdog(); return 0; } @@ -1103,270 +433,3 @@ static int time_init_device(void) } device_initcall(time_init_device); - -#ifdef CONFIG_HPET_EMULATE_RTC -/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET - * is enabled, we support RTC interrupt functionality in software. - * RTC has 3 kinds of interrupts: - * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock - * is updated - * 2) Alarm Interrupt - generate an interrupt at a specific time of day - * 3) Periodic Interrupt - generate periodic interrupt, with frequencies - * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2) - * (1) and (2) above are implemented using polling at a frequency of - * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt - * overhead. (DEFAULT_RTC_INT_FREQ) - * For (3), we use interrupts at 64Hz or user specified periodic - * frequency, whichever is higher. - */ -#include <linux/rtc.h> - -#define DEFAULT_RTC_INT_FREQ 64 -#define RTC_NUM_INTS 1 - -static unsigned long UIE_on; -static unsigned long prev_update_sec; - -static unsigned long AIE_on; -static struct rtc_time alarm_time; - -static unsigned long PIE_on; -static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ; -static unsigned long PIE_count; - -static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */ -static unsigned int hpet_t1_cmp; /* cached comparator register */ - -int is_hpet_enabled(void) -{ - return vxtime.hpet_address != 0; -} - -/* - * Timer 1 for RTC, we do not use periodic interrupt feature, - * even if HPET supports periodic interrupts on Timer 1. - * The reason being, to set up a periodic interrupt in HPET, we need to - * stop the main counter. And if we do that everytime someone diables/enables - * RTC, we will have adverse effect on main kernel timer running on Timer 0. - * So, for the time being, simulate the periodic interrupt in software. - * - * hpet_rtc_timer_init() is called for the first time and during subsequent - * interuppts reinit happens through hpet_rtc_timer_reinit(). - */ -int hpet_rtc_timer_init(void) -{ - unsigned int cfg, cnt; - unsigned long flags; - - if (!is_hpet_enabled()) - return 0; - /* - * Set the counter 1 and enable the interrupts. - */ - if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) - hpet_rtc_int_freq = PIE_freq; - else - hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; - - local_irq_save(flags); - - cnt = hpet_readl(HPET_COUNTER); - cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq); - hpet_writel(cnt, HPET_T1_CMP); - hpet_t1_cmp = cnt; - - cfg = hpet_readl(HPET_T1_CFG); - cfg &= ~HPET_TN_PERIODIC; - cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; - hpet_writel(cfg, HPET_T1_CFG); - - local_irq_restore(flags); - - return 1; -} - -static void hpet_rtc_timer_reinit(void) -{ - unsigned int cfg, cnt, ticks_per_int, lost_ints; - - if (unlikely(!(PIE_on | AIE_on | UIE_on))) { - cfg = hpet_readl(HPET_T1_CFG); - cfg &= ~HPET_TN_ENABLE; - hpet_writel(cfg, HPET_T1_CFG); - return; - } - - if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) - hpet_rtc_int_freq = PIE_freq; - else - hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; - - /* It is more accurate to use the comparator value than current count.*/ - ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq; - hpet_t1_cmp += ticks_per_int; - hpet_writel(hpet_t1_cmp, HPET_T1_CMP); - - /* - * If the interrupt handler was delayed too long, the write above tries - * to schedule the next interrupt in the past and the hardware would - * not interrupt until the counter had wrapped around. - * So we have to check that the comparator wasn't set to a past time. - */ - cnt = hpet_readl(HPET_COUNTER); - if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) { - lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1; - /* Make sure that, even with the time needed to execute - * this code, the next scheduled interrupt has been moved - * back to the future: */ - lost_ints++; - - hpet_t1_cmp += lost_ints * ticks_per_int; - hpet_writel(hpet_t1_cmp, HPET_T1_CMP); - - if (PIE_on) - PIE_count += lost_ints; - - if (printk_ratelimit()) - printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", - hpet_rtc_int_freq); - } -} - -/* - * The functions below are called from rtc driver. - * Return 0 if HPET is not being used. - * Otherwise do the necessary changes and return 1. - */ -int hpet_mask_rtc_irq_bit(unsigned long bit_mask) -{ - if (!is_hpet_enabled()) - return 0; - - if (bit_mask & RTC_UIE) - UIE_on = 0; - if (bit_mask & RTC_PIE) - PIE_on = 0; - if (bit_mask & RTC_AIE) - AIE_on = 0; - - return 1; -} - -int hpet_set_rtc_irq_bit(unsigned long bit_mask) -{ - int timer_init_reqd = 0; - - if (!is_hpet_enabled()) - return 0; - - if (!(PIE_on | AIE_on | UIE_on)) - timer_init_reqd = 1; - - if (bit_mask & RTC_UIE) { - UIE_on = 1; - } - if (bit_mask & RTC_PIE) { - PIE_on = 1; - PIE_count = 0; - } - if (bit_mask & RTC_AIE) { - AIE_on = 1; - } - - if (timer_init_reqd) - hpet_rtc_timer_init(); - - return 1; -} - -int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec) -{ - if (!is_hpet_enabled()) - return 0; - - alarm_time.tm_hour = hrs; - alarm_time.tm_min = min; - alarm_time.tm_sec = sec; - - return 1; -} - -int hpet_set_periodic_freq(unsigned long freq) -{ - if (!is_hpet_enabled()) - return 0; - - PIE_freq = freq; - PIE_count = 0; - - return 1; -} - -int hpet_rtc_dropped_irq(void) -{ - if (!is_hpet_enabled()) - return 0; - - return 1; -} - -irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) -{ - struct rtc_time curr_time; - unsigned long rtc_int_flag = 0; - int call_rtc_interrupt = 0; - - hpet_rtc_timer_reinit(); - - if (UIE_on | AIE_on) { - rtc_get_rtc_time(&curr_time); - } - if (UIE_on) { - if (curr_time.tm_sec != prev_update_sec) { - /* Set update int info, call real rtc int routine */ - call_rtc_interrupt = 1; - rtc_int_flag = RTC_UF; - prev_update_sec = curr_time.tm_sec; - } - } - if (PIE_on) { - PIE_count++; - if (PIE_count >= hpet_rtc_int_freq/PIE_freq) { - /* Set periodic int info, call real rtc int routine */ - call_rtc_interrupt = 1; - rtc_int_flag |= RTC_PF; - PIE_count = 0; - } - } - if (AIE_on) { - if ((curr_time.tm_sec == alarm_time.tm_sec) && - (curr_time.tm_min == alarm_time.tm_min) && - (curr_time.tm_hour == alarm_time.tm_hour)) { - /* Set alarm int info, call real rtc int routine */ - call_rtc_interrupt = 1; - rtc_int_flag |= RTC_AF; - } - } - if (call_rtc_interrupt) { - rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); - rtc_interrupt(rtc_int_flag, dev_id); - } - return IRQ_HANDLED; -} -#endif - -static int __init nohpet_setup(char *s) -{ - nohpet = 1; - return 1; -} - -__setup("nohpet", nohpet_setup); - -int __init notsc_setup(char *s) -{ - notsc = 1; - return 1; -} - -__setup("notsc", notsc_setup); diff --git a/arch/x86_64/kernel/tsc.c b/arch/x86_64/kernel/tsc.c new file mode 100644 index 000000000000..895831865019 --- /dev/null +++ b/arch/x86_64/kernel/tsc.c @@ -0,0 +1,226 @@ +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/interrupt.h> +#include <linux/init.h> +#include <linux/clocksource.h> +#include <linux/time.h> +#include <linux/acpi.h> +#include <linux/cpufreq.h> + +#include <asm/timex.h> + +static int notsc __initdata = 0; + +unsigned int cpu_khz; /* TSC clocks / usec, not used here */ +EXPORT_SYMBOL(cpu_khz); + +static unsigned int cyc2ns_scale __read_mostly; + +void set_cyc2ns_scale(unsigned long khz) +{ + cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / khz; +} + +static unsigned long long cycles_2_ns(unsigned long long cyc) +{ + return (cyc * cyc2ns_scale) >> NS_SCALE; +} + +unsigned long long sched_clock(void) +{ + unsigned long a = 0; + + /* Could do CPU core sync here. Opteron can execute rdtsc speculatively, + * which means it is not completely exact and may not be monotonous + * between CPUs. But the errors should be too small to matter for + * scheduling purposes. + */ + + rdtscll(a); + return cycles_2_ns(a); +} + +static int tsc_unstable; + +static inline int check_tsc_unstable(void) +{ + return tsc_unstable; +} +#ifdef CONFIG_CPU_FREQ + +/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency + * changes. + * + * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's + * not that important because current Opteron setups do not support + * scaling on SMP anyroads. + * + * Should fix up last_tsc too. Currently gettimeofday in the + * first tick after the change will be slightly wrong. + */ + +#include <linux/workqueue.h> + +static unsigned int cpufreq_delayed_issched = 0; +static unsigned int cpufreq_init = 0; +static struct work_struct cpufreq_delayed_get_work; + +static void handle_cpufreq_delayed_get(struct work_struct *v) +{ + unsigned int cpu; + for_each_online_cpu(cpu) { + cpufreq_get(cpu); + } + cpufreq_delayed_issched = 0; +} + +static unsigned int ref_freq = 0; +static unsigned long loops_per_jiffy_ref = 0; + +static unsigned long cpu_khz_ref = 0; + +static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_freqs *freq = data; + unsigned long *lpj, dummy; + + if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC)) + return 0; + + lpj = &dummy; + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) +#ifdef CONFIG_SMP + lpj = &cpu_data[freq->cpu].loops_per_jiffy; +#else + lpj = &boot_cpu_data.loops_per_jiffy; +#endif + + if (!ref_freq) { + ref_freq = freq->old; + loops_per_jiffy_ref = *lpj; + cpu_khz_ref = cpu_khz; + } + if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || + (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || + (val == CPUFREQ_RESUMECHANGE)) { + *lpj = + cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); + + cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) + mark_tsc_unstable(); + } + + set_cyc2ns_scale(cpu_khz_ref); + + return 0; +} + +static struct notifier_block time_cpufreq_notifier_block = { + .notifier_call = time_cpufreq_notifier +}; + +static int __init cpufreq_tsc(void) +{ + INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get); + if (!cpufreq_register_notifier(&time_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER)) + cpufreq_init = 1; + return 0; +} + +core_initcall(cpufreq_tsc); + +#endif + +static int tsc_unstable = 0; + +/* + * Make an educated guess if the TSC is trustworthy and synchronized + * over all CPUs. + */ +__cpuinit int unsynchronized_tsc(void) +{ + if (tsc_unstable) + return 1; + +#ifdef CONFIG_SMP + if (apic_is_clustered_box()) + return 1; +#endif + /* Most intel systems have synchronized TSCs except for + multi node systems */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { +#ifdef CONFIG_ACPI + /* But TSC doesn't tick in C3 so don't use it there */ + if (acpi_gbl_FADT.header.length > 0 && acpi_gbl_FADT.C3latency < 1000) + return 1; +#endif + return 0; + } + + /* Assume multi socket systems are not synchronized */ + return num_present_cpus() > 1; +} + +int __init notsc_setup(char *s) +{ + notsc = 1; + return 1; +} + +__setup("notsc", notsc_setup); + + +/* clock source code: */ +static cycle_t read_tsc(void) +{ + cycle_t ret = (cycle_t)get_cycles_sync(); + return ret; +} + +static cycle_t __vsyscall_fn vread_tsc(void) +{ + cycle_t ret = (cycle_t)get_cycles_sync(); + return ret; +} + +static struct clocksource clocksource_tsc = { + .name = "tsc", + .rating = 300, + .read = read_tsc, + .mask = CLOCKSOURCE_MASK(64), + .shift = 22, + .flags = CLOCK_SOURCE_IS_CONTINUOUS | + CLOCK_SOURCE_MUST_VERIFY, + .vread = vread_tsc, +}; + +void mark_tsc_unstable(void) +{ + if (!tsc_unstable) { + tsc_unstable = 1; + /* Change only the rating, when not registered */ + if (clocksource_tsc.mult) + clocksource_change_rating(&clocksource_tsc, 0); + else + clocksource_tsc.rating = 0; + } +} +EXPORT_SYMBOL_GPL(mark_tsc_unstable); + +static int __init init_tsc_clocksource(void) +{ + if (!notsc) { + clocksource_tsc.mult = clocksource_khz2mult(cpu_khz, + clocksource_tsc.shift); + if (check_tsc_unstable()) + clocksource_tsc.rating = 0; + + return clocksource_register(&clocksource_tsc); + } + return 0; +} + +module_init(init_tsc_clocksource); diff --git a/arch/x86_64/kernel/tsc_sync.c b/arch/x86_64/kernel/tsc_sync.c new file mode 100644 index 000000000000..014f0db45dfa --- /dev/null +++ b/arch/x86_64/kernel/tsc_sync.c @@ -0,0 +1,187 @@ +/* + * arch/x86_64/kernel/tsc_sync.c: check TSC synchronization. + * + * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar + * + * We check whether all boot CPUs have their TSC's synchronized, + * print a warning if not and turn off the TSC clock-source. + * + * The warp-check is point-to-point between two CPUs, the CPU + * initiating the bootup is the 'source CPU', the freshly booting + * CPU is the 'target CPU'. + * + * Only two CPUs may participate - they can enter in any order. + * ( The serial nature of the boot logic and the CPU hotplug lock + * protects against more than 2 CPUs entering this code. ) + */ +#include <linux/spinlock.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/smp.h> +#include <linux/nmi.h> +#include <asm/tsc.h> + +/* + * Entry/exit counters that make sure that both CPUs + * run the measurement code at once: + */ +static __cpuinitdata atomic_t start_count; +static __cpuinitdata atomic_t stop_count; + +/* + * We use a raw spinlock in this exceptional case, because + * we want to have the fastest, inlined, non-debug version + * of a critical section, to be able to prove TSC time-warps: + */ +static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; +static __cpuinitdata cycles_t last_tsc; +static __cpuinitdata cycles_t max_warp; +static __cpuinitdata int nr_warps; + +/* + * TSC-warp measurement loop running on both CPUs: + */ +static __cpuinit void check_tsc_warp(void) +{ + cycles_t start, now, prev, end; + int i; + + start = get_cycles_sync(); + /* + * The measurement runs for 20 msecs: + */ + end = start + cpu_khz * 20ULL; + now = start; + + for (i = 0; ; i++) { + /* + * We take the global lock, measure TSC, save the + * previous TSC that was measured (possibly on + * another CPU) and update the previous TSC timestamp. + */ + __raw_spin_lock(&sync_lock); + prev = last_tsc; + now = get_cycles_sync(); + last_tsc = now; + __raw_spin_unlock(&sync_lock); + + /* + * Be nice every now and then (and also check whether + * measurement is done [we also insert a 100 million + * loops safety exit, so we dont lock up in case the + * TSC readout is totally broken]): + */ + if (unlikely(!(i & 7))) { + if (now > end || i > 100000000) + break; + cpu_relax(); + touch_nmi_watchdog(); + } + /* + * Outside the critical section we can now see whether + * we saw a time-warp of the TSC going backwards: + */ + if (unlikely(prev > now)) { + __raw_spin_lock(&sync_lock); + max_warp = max(max_warp, prev - now); + nr_warps++; + __raw_spin_unlock(&sync_lock); + } + + } +} + +/* + * Source CPU calls into this - it waits for the freshly booted + * target CPU to arrive and then starts the measurement: + */ +void __cpuinit check_tsc_sync_source(int cpu) +{ + int cpus = 2; + + /* + * No need to check if we already know that the TSC is not + * synchronized: + */ + if (unsynchronized_tsc()) + return; + + printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:", + smp_processor_id(), cpu); + + /* + * Reset it - in case this is a second bootup: + */ + atomic_set(&stop_count, 0); + + /* + * Wait for the target to arrive: + */ + while (atomic_read(&start_count) != cpus-1) + cpu_relax(); + /* + * Trigger the target to continue into the measurement too: + */ + atomic_inc(&start_count); + + check_tsc_warp(); + + while (atomic_read(&stop_count) != cpus-1) + cpu_relax(); + + /* + * Reset it - just in case we boot another CPU later: + */ + atomic_set(&start_count, 0); + + if (nr_warps) { + printk("\n"); + printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs," + " turning off TSC clock.\n", max_warp); + mark_tsc_unstable(); + nr_warps = 0; + max_warp = 0; + last_tsc = 0; + } else { + printk(" passed.\n"); + } + + /* + * Let the target continue with the bootup: + */ + atomic_inc(&stop_count); +} + +/* + * Freshly booted CPUs call into this: + */ +void __cpuinit check_tsc_sync_target(void) +{ + int cpus = 2; + + if (unsynchronized_tsc()) + return; + + /* + * Register this CPU's participation and wait for the + * source CPU to start the measurement: + */ + atomic_inc(&start_count); + while (atomic_read(&start_count) != cpus) + cpu_relax(); + + check_tsc_warp(); + + /* + * Ok, we are done: + */ + atomic_inc(&stop_count); + + /* + * Wait for the source CPU to print stuff: + */ + while (atomic_read(&stop_count) != cpus) + cpu_relax(); +} +#undef NR_LOOPS + diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index c360c4225244..b73212c0a550 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -88,31 +88,25 @@ SECTIONS __vsyscall_0 = VSYSCALL_VIRT_ADDR; . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .xtime_lock : AT(VLOAD(.xtime_lock)) { *(.xtime_lock) } - xtime_lock = VVIRT(.xtime_lock); - - .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) } - vxtime = VVIRT(.vxtime); + .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) } + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) + { *(.vsyscall_gtod_data) } + vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) } vgetcpu_mode = VVIRT(.vgetcpu_mode); - .sys_tz : AT(VLOAD(.sys_tz)) { *(.sys_tz) } - sys_tz = VVIRT(.sys_tz); - - .sysctl_vsyscall : AT(VLOAD(.sysctl_vsyscall)) { *(.sysctl_vsyscall) } - sysctl_vsyscall = VVIRT(.sysctl_vsyscall); - - .xtime : AT(VLOAD(.xtime)) { *(.xtime) } - xtime = VVIRT(.xtime); - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } jiffies = VVIRT(.jiffies); - .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { *(.vsyscall_1) } - .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { *(.vsyscall_2) } - .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { *(.vsyscall_3) } + .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) + { *(.vsyscall_1) } + .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) + { *(.vsyscall_2) } + .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) + { *(.vsyscall_3) } . = VSYSCALL_VIRT_ADDR + 4096; diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index 313dc6ad780b..180ff919eaf9 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -26,6 +26,7 @@ #include <linux/seqlock.h> #include <linux/jiffies.h> #include <linux/sysctl.h> +#include <linux/clocksource.h> #include <linux/getcpu.h> #include <linux/cpu.h> #include <linux/smp.h> @@ -34,6 +35,7 @@ #include <asm/vsyscall.h> #include <asm/pgtable.h> #include <asm/page.h> +#include <asm/unistd.h> #include <asm/fixmap.h> #include <asm/errno.h> #include <asm/io.h> @@ -44,56 +46,41 @@ #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) #define __syscall_clobber "r11","rcx","memory" -int __sysctl_vsyscall __section_sysctl_vsyscall = 1; -seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED; +struct vsyscall_gtod_data_t { + seqlock_t lock; + int sysctl_enabled; + struct timeval wall_time_tv; + struct timezone sys_tz; + cycle_t offset_base; + struct clocksource clock; +}; int __vgetcpu_mode __section_vgetcpu_mode; -#include <asm/unistd.h> - -static __always_inline void timeval_normalize(struct timeval * tv) +struct vsyscall_gtod_data_t __vsyscall_gtod_data __section_vsyscall_gtod_data = { - time_t __sec; - - __sec = tv->tv_usec / 1000000; - if (__sec) { - tv->tv_usec %= 1000000; - tv->tv_sec += __sec; - } -} + .lock = SEQLOCK_UNLOCKED, + .sysctl_enabled = 1, +}; -static __always_inline void do_vgettimeofday(struct timeval * tv) +void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) { - long sequence, t; - unsigned long sec, usec; - - do { - sequence = read_seqbegin(&__xtime_lock); - - sec = __xtime.tv_sec; - usec = __xtime.tv_nsec / 1000; - - if (__vxtime.mode != VXTIME_HPET) { - t = get_cycles_sync(); - if (t < __vxtime.last_tsc) - t = __vxtime.last_tsc; - usec += ((t - __vxtime.last_tsc) * - __vxtime.tsc_quot) >> 32; - /* See comment in x86_64 do_gettimeofday. */ - } else { - usec += ((readl((void __iomem *) - fix_to_virt(VSYSCALL_HPET) + 0xf0) - - __vxtime.last) * __vxtime.quot) >> 32; - } - } while (read_seqretry(&__xtime_lock, sequence)); - - tv->tv_sec = sec + usec / 1000000; - tv->tv_usec = usec % 1000000; + unsigned long flags; + + write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); + /* copy vsyscall data */ + vsyscall_gtod_data.clock = *clock; + vsyscall_gtod_data.wall_time_tv.tv_sec = wall_time->tv_sec; + vsyscall_gtod_data.wall_time_tv.tv_usec = wall_time->tv_nsec/1000; + vsyscall_gtod_data.sys_tz = sys_tz; + write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } -/* RED-PEN may want to readd seq locking, but then the variable should be write-once. */ +/* RED-PEN may want to readd seq locking, but then the variable should be + * write-once. + */ static __always_inline void do_get_tz(struct timezone * tz) { - *tz = __sys_tz; + *tz = __vsyscall_gtod_data.sys_tz; } static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) @@ -101,7 +88,8 @@ static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) int ret; asm volatile("vsysc2: syscall" : "=a" (ret) - : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber ); + : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) + : __syscall_clobber ); return ret; } @@ -114,10 +102,44 @@ static __always_inline long time_syscall(long *t) return secs; } +static __always_inline void do_vgettimeofday(struct timeval * tv) +{ + cycle_t now, base, mask, cycle_delta; + unsigned long seq, mult, shift, nsec_delta; + cycle_t (*vread)(void); + do { + seq = read_seqbegin(&__vsyscall_gtod_data.lock); + + vread = __vsyscall_gtod_data.clock.vread; + if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) { + gettimeofday(tv,0); + return; + } + now = vread(); + base = __vsyscall_gtod_data.clock.cycle_last; + mask = __vsyscall_gtod_data.clock.mask; + mult = __vsyscall_gtod_data.clock.mult; + shift = __vsyscall_gtod_data.clock.shift; + + *tv = __vsyscall_gtod_data.wall_time_tv; + + } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); + + /* calculate interval: */ + cycle_delta = (now - base) & mask; + /* convert to nsecs: */ + nsec_delta = (cycle_delta * mult) >> shift; + + /* convert to usecs and add to timespec: */ + tv->tv_usec += nsec_delta / NSEC_PER_USEC; + while (tv->tv_usec > USEC_PER_SEC) { + tv->tv_sec += 1; + tv->tv_usec -= USEC_PER_SEC; + } +} + int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) { - if (!__sysctl_vsyscall) - return gettimeofday(tv,tz); if (tv) do_vgettimeofday(tv); if (tz) @@ -129,11 +151,11 @@ int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) * unlikely */ time_t __vsyscall(1) vtime(time_t *t) { - if (!__sysctl_vsyscall) + if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) return time_syscall(t); else if (t) - *t = __xtime.tv_sec; - return __xtime.tv_sec; + *t = __vsyscall_gtod_data.wall_time_tv.tv_sec; + return __vsyscall_gtod_data.wall_time_tv.tv_sec; } /* Fast way to get current CPU and node. @@ -210,7 +232,7 @@ static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, ret = -ENOMEM; goto out; } - if (!sysctl_vsyscall) { + if (!vsyscall_gtod_data.sysctl_enabled) { writew(SYSCALL, map1); writew(SYSCALL, map2); } else { @@ -232,7 +254,8 @@ static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen, static ctl_table kernel_table2[] = { { .ctl_name = 99, .procname = "vsyscall64", - .data = &sysctl_vsyscall, .maxlen = sizeof(int), .mode = 0644, + .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), + .mode = 0644, .strategy = vsyscall_sysctl_nostrat, .proc_handler = vsyscall_sysctl_change }, {} |