From 8d165db10772f238103c3e8f955c54145e5c07f3 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Sun, 10 May 2009 13:37:36 +0000 Subject: powerpc: Improve decrementer accuracy I have been looking at sources of OS jitter and notice that after a long NO_HZ idle period we wakeup too early: relative time (us) event timer irq exit 999946.405 timer irq entry 4.835 timer irq exit 21.685 timer irq entry 3.540 timer (tick_sched_timer) entry Here we slept for just under a second then took a timer interrupt that did nothing. 21.685 us later we wake up again and do the work. We set a rather low shift value of 16 for the decrementer clockevent, which I think is causing this issue. On this box we have a 207MHz decrementer and see: clockevent: decrementer mult[3501] shift[16] cpu[0] For calculations of large intervals this mult/shift combination could be off by a significant amount. I notice the sparc code has a loop that iterates to find a mult/shift combination that maximises the shift value while keeping mult under 32bit. With the patch below we get: clockevent: decrementer mult[35015c20] shift[32] cpu[15] And we no longer see the spurious wakeups. Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/time.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/kernel/time.c') diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 48571ac56fb7..bee1443da763 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -109,7 +109,7 @@ static void decrementer_set_mode(enum clock_event_mode mode, static struct clock_event_device decrementer_clockevent = { .name = "decrementer", .rating = 200, - .shift = 16, + .shift = 0, /* To be filled in */ .mult = 0, /* To be filled in */ .irq = 0, .set_next_event = decrementer_set_next_event, @@ -843,6 +843,22 @@ static void decrementer_set_mode(enum clock_event_mode mode, decrementer_set_next_event(DECREMENTER_MAX, dev); } +static void __init setup_clockevent_multiplier(unsigned long hz) +{ + u64 mult, shift = 32; + + while (1) { + mult = div_sc(hz, NSEC_PER_SEC, shift); + if (mult && (mult >> 32UL) == 0UL) + break; + + shift--; + } + + decrementer_clockevent.shift = shift; + decrementer_clockevent.mult = mult; +} + static void register_decrementer_clockevent(int cpu) { struct clock_event_device *dec = &per_cpu(decrementers, cpu).event; @@ -860,8 +876,7 @@ static void __init init_decrementer_clockevent(void) { int cpu = smp_processor_id(); - decrementer_clockevent.mult = div_sc(ppc_tb_freq, NSEC_PER_SEC, - decrementer_clockevent.shift); + setup_clockevent_multiplier(ppc_tb_freq); decrementer_clockevent.max_delta_ns = clockevent_delta2ns(DECREMENTER_MAX, &decrementer_clockevent); decrementer_clockevent.min_delta_ns = -- cgit v1.2.3 From 177996e6e20f15004d6757d9b859f57d181ef443 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 9 Jun 2009 21:12:00 +0000 Subject: powerpc: Don't do generic calibrate_delay() Currently we are wasting time calling the generic calibrate_delay() function. We don't need it since our implementation of __delay() is based on the CPU timebase. So instead, we use our own small implementation that initializes loops_per_jiffy to something sensible to make the few users like spinlock debug be happy Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/time.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/powerpc/kernel/time.c') diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index bee1443da763..15391c2ab013 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include @@ -1143,6 +1144,15 @@ void div128_by_32(u64 dividend_high, u64 dividend_low, } +/* We don't need to calibrate delay, we use the CPU timebase for that */ +void calibrate_delay(void) +{ + /* Some generic code (such as spinlock debug) use loops_per_jiffy + * as the number of __delay(1) in a jiffy, so make it so + */ + loops_per_jiffy = tb_ticks_per_jiffy; +} + static int __init rtc_init(void) { struct platform_device *pdev; -- cgit v1.2.3 From 105988c015943e77092a6568bc5fb7e386df6ccd Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 17 Jun 2009 21:50:04 +1000 Subject: perf_counter: powerpc: Enable use of software counters on 32-bit powerpc This enables the perf_counter subsystem on 32-bit powerpc. Since we don't have any support for hardware counters on 32-bit powerpc yet, only software counters can be used. Besides selecting HAVE_PERF_COUNTERS for 32-bit powerpc as well as 64-bit, the main thing this does is add an implementation of set_perf_counter_pending(). This needs to arrange for perf_counter_do_pending() to be called when interrupts are enabled. Rather than add code to local_irq_restore as 64-bit does, the 32-bit set_perf_counter_pending() generates an interrupt by setting the decrementer to 1 so that a decrementer interrupt will become pending in 1 or 2 timebase ticks (if a decrementer interrupt isn't already pending). When interrupts are enabled, timer_interrupt() will be called, and some new code in there calls perf_counter_do_pending(). We use a per-cpu array of flags to indicate whether we need to call perf_counter_do_pending() or not. This introduces a couple of new Kconfig symbols: PPC_HAVE_PMU_SUPPORT, which is selected by processor families for which we have hardware PMU support (currently only PPC64), and PPC_PERF_CTRS, which enables the powerpc-specific perf_counter back-end. Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: linuxppc-dev@ozlabs.org Cc: benh@kernel.crashing.org LKML-Reference: <19000.55404.103840.393470@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/time.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'arch/powerpc/kernel/time.c') diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 15391c2ab013..eae4511ceeac 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include @@ -525,6 +526,26 @@ void __init iSeries_time_init_early(void) } #endif /* CONFIG_PPC_ISERIES */ +#if defined(CONFIG_PERF_COUNTERS) && defined(CONFIG_PPC32) +DEFINE_PER_CPU(u8, perf_counter_pending); + +void set_perf_counter_pending(void) +{ + get_cpu_var(perf_counter_pending) = 1; + set_dec(1); + put_cpu_var(perf_counter_pending); +} + +#define test_perf_counter_pending() __get_cpu_var(perf_counter_pending) +#define clear_perf_counter_pending() __get_cpu_var(perf_counter_pending) = 0 + +#else /* CONFIG_PERF_COUNTERS && CONFIG_PPC32 */ + +#define test_perf_counter_pending() 0 +#define clear_perf_counter_pending() + +#endif /* CONFIG_PERF_COUNTERS && CONFIG_PPC32 */ + /* * For iSeries shared processors, we have to let the hypervisor * set the hardware decrementer. We set a virtual decrementer @@ -551,6 +572,10 @@ void timer_interrupt(struct pt_regs * regs) set_dec(DECREMENTER_MAX); #ifdef CONFIG_PPC32 + if (test_perf_counter_pending()) { + clear_perf_counter_pending(); + perf_counter_do_pending(); + } if (atomic_read(&ppc_n_lost_interrupts) != 0) do_IRQ(regs); #endif -- cgit v1.2.3