From 3565184ed0c1ea46bea5b792da5f72a83c43e49b Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 13 May 2013 18:56:06 +0100 Subject: x86: Increase precision of x86_platform.get/set_wallclock() All the virtualized platforms (KVM, lguest and Xen) have persistent wallclocks that have more than one second of precision. read_persistent_wallclock() and update_persistent_wallclock() allow for nanosecond precision but their implementation on x86 with x86_platform.get/set_wallclock() only allows for one second precision. This means guests may see a wallclock time that is off by up to 1 second. Make set_wallclock() and get_wallclock() take a struct timespec parameter (which allows for nanosecond precision) so KVM and Xen guests may start with a more accurate wallclock time and a Xen dom0 can maintain a more accurate wallclock for guests. Signed-off-by: David Vrabel Signed-off-by: John Stultz --- arch/x86/xen/time.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'arch/x86/xen/time.c') diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 3d88bfdf9e1c..a1947ac2da82 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -191,32 +191,25 @@ static void xen_read_wallclock(struct timespec *ts) put_cpu_var(xen_vcpu); } -static unsigned long xen_get_wallclock(void) +static void xen_get_wallclock(struct timespec *now) { - struct timespec ts; - - xen_read_wallclock(&ts); - return ts.tv_sec; + xen_read_wallclock(now); } -static int xen_set_wallclock(unsigned long now) +static int xen_set_wallclock(const struct timespec *now) { struct xen_platform_op op; - int rc; /* do nothing for domU */ if (!xen_initial_domain()) return -1; op.cmd = XENPF_settime; - op.u.settime.secs = now; - op.u.settime.nsecs = 0; + op.u.settime.secs = now->tv_sec; + op.u.settime.nsecs = now->tv_nsec; op.u.settime.system_time = xen_clocksource_read(); - rc = HYPERVISOR_dom0_op(&op); - WARN(rc != 0, "XENPF_settime failed: now=%ld\n", now); - - return rc; + return HYPERVISOR_dom0_op(&op); } static struct clocksource xen_clocksource __read_mostly = { -- cgit v1.2.3 From 31620a198cf6891dfdf5477607621da9aa092380 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 4 Jun 2013 17:06:36 -0400 Subject: xen/time: Encapsulate the struct clock_event_device in another structure. We don't do any code movement. We just encapsulate the struct clock_event_device in a new structure which contains said structure and a pointer to a char *name. The 'name' will be used in 'xen/time: Don't leak interrupt name when offlining'. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/time.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/x86/xen/time.c') diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 3d88bfdf9e1c..5190687ca569 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -377,11 +377,16 @@ static const struct clock_event_device xen_vcpuop_clockevent = { static const struct clock_event_device *xen_clockevent = &xen_timerop_clockevent; -static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events) = { .irq = -1 }; + +struct xen_clock_event_device { + struct clock_event_device evt; + char *name; +}; +static DEFINE_PER_CPU(struct xen_clock_event_device, xen_clock_events) = { .evt.irq = -1 }; static irqreturn_t xen_timer_interrupt(int irq, void *dev_id) { - struct clock_event_device *evt = &__get_cpu_var(xen_clock_events); + struct clock_event_device *evt = &__get_cpu_var(xen_clock_events).evt; irqreturn_t ret; ret = IRQ_NONE; @@ -401,7 +406,7 @@ void xen_setup_timer(int cpu) struct clock_event_device *evt; int irq; - evt = &per_cpu(xen_clock_events, cpu); + evt = &per_cpu(xen_clock_events, cpu).evt; WARN(evt->irq >= 0, "IRQ%d for CPU%d is already allocated\n", evt->irq, cpu); printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu); @@ -426,7 +431,7 @@ void xen_teardown_timer(int cpu) { struct clock_event_device *evt; BUG_ON(cpu == 0); - evt = &per_cpu(xen_clock_events, cpu); + evt = &per_cpu(xen_clock_events, cpu).evt; unbind_from_irqhandler(evt->irq, NULL); evt->irq = -1; } @@ -435,7 +440,7 @@ void xen_setup_cpu_clockevents(void) { BUG_ON(preemptible()); - clockevents_register_device(&__get_cpu_var(xen_clock_events)); + clockevents_register_device(&__get_cpu_var(xen_clock_events).evt); } void xen_timer_resume(void) -- cgit v1.2.3 From c9d76a24a28917c1ef6833f40c4ceff2e81b3ebb Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 4 Jun 2013 17:09:36 -0400 Subject: xen/time: Don't leak interrupt name when offlining. When the user does: echo 0 > /sys/devices/system/cpu/cpu1/online echo 1 > /sys/devices/system/cpu/cpu1/online kmemleak reports: kmemleak: 7 new suspected memory leaks (see /sys/kernel/debug/kmemleak) One of the leaks is from xen/time: unreferenced object 0xffff88003fa51280 (size 32): comm "swapper/0", pid 1, jiffies 4294667339 (age 1027.789s) hex dump (first 32 bytes): 74 69 6d 65 72 31 00 00 00 00 00 00 00 00 00 00 timer1.......... 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kmemleak_alloc+0x21/0x50 [] __kmalloc_track_caller+0xec/0x2a0 [] kvasprintf+0x5b/0x90 [] kasprintf+0x38/0x40 [] xen_setup_timer+0x51/0xf0 [] xen_cpu_up+0x5f/0x3e8 [] _cpu_up+0xd1/0x14b [] cpu_up+0xd9/0xec [] smp_init+0x4b/0xa3 [] kernel_init_freeable+0xdb/0x1e6 [] kernel_init+0x9/0xf0 [] ret_from_fork+0x7c/0xb0 [] 0xffffffffffffffff This patch fixes it by stashing away the 'name' in the per-cpu data structure and freeing it when offlining the CPU. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/time.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86/xen/time.c') diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 5190687ca569..011f1bf85765 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -402,7 +403,7 @@ static irqreturn_t xen_timer_interrupt(int irq, void *dev_id) void xen_setup_timer(int cpu) { - const char *name; + char *name; struct clock_event_device *evt; int irq; @@ -425,6 +426,7 @@ void xen_setup_timer(int cpu) evt->cpumask = cpumask_of(cpu); evt->irq = irq; + per_cpu(xen_clock_events, cpu).name = name; } void xen_teardown_timer(int cpu) @@ -434,6 +436,8 @@ void xen_teardown_timer(int cpu) evt = &per_cpu(xen_clock_events, cpu).evt; unbind_from_irqhandler(evt->irq, NULL); evt->irq = -1; + kfree(per_cpu(xen_clock_events, cpu).name); + per_cpu(xen_clock_events, cpu).name = NULL; } void xen_setup_cpu_clockevents(void) -- cgit v1.2.3 From a05e2c371fbe73403793d126ceab93787cb4afd4 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 4 Jun 2013 17:11:52 -0400 Subject: xen/time: Check that the per_cpu data structure has data before freeing. We don't check whether the per_cpu data structure has actually been freed in the past. This checks it and if it has been freed in the past then just continues on without double-freeing. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/time.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86/xen/time.c') diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 011f1bf85765..6a56ae092994 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -434,10 +434,13 @@ void xen_teardown_timer(int cpu) struct clock_event_device *evt; BUG_ON(cpu == 0); evt = &per_cpu(xen_clock_events, cpu).evt; - unbind_from_irqhandler(evt->irq, NULL); - evt->irq = -1; - kfree(per_cpu(xen_clock_events, cpu).name); - per_cpu(xen_clock_events, cpu).name = NULL; + + if (evt->irq >= 0) { + unbind_from_irqhandler(evt->irq, NULL); + evt->irq = -1; + kfree(per_cpu(xen_clock_events, cpu).name); + per_cpu(xen_clock_events, cpu).name = NULL; + } } void xen_setup_cpu_clockevents(void) -- cgit v1.2.3 From 09e99da766a6a701eb4d72004872d1144291d53b Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 4 Jun 2013 17:13:29 -0400 Subject: xen/time: Free onlined per-cpu data structure if we want to online it again. If the per-cpu time data structure has been onlined already and we are trying to online it again, then free the previous copy before blindly over-writting it. A developer naturally should not call this function multiple times but just in case. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/time.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'arch/x86/xen/time.c') diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 6a56ae092994..aec0b14b6d76 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -401,6 +401,20 @@ static irqreturn_t xen_timer_interrupt(int irq, void *dev_id) return ret; } +void xen_teardown_timer(int cpu) +{ + struct clock_event_device *evt; + BUG_ON(cpu == 0); + evt = &per_cpu(xen_clock_events, cpu).evt; + + if (evt->irq >= 0) { + unbind_from_irqhandler(evt->irq, NULL); + evt->irq = -1; + kfree(per_cpu(xen_clock_events, cpu).name); + per_cpu(xen_clock_events, cpu).name = NULL; + } +} + void xen_setup_timer(int cpu) { char *name; @@ -409,6 +423,8 @@ void xen_setup_timer(int cpu) evt = &per_cpu(xen_clock_events, cpu).evt; WARN(evt->irq >= 0, "IRQ%d for CPU%d is already allocated\n", evt->irq, cpu); + if (evt->irq >= 0) + xen_teardown_timer(cpu); printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu); @@ -429,19 +445,6 @@ void xen_setup_timer(int cpu) per_cpu(xen_clock_events, cpu).name = name; } -void xen_teardown_timer(int cpu) -{ - struct clock_event_device *evt; - BUG_ON(cpu == 0); - evt = &per_cpu(xen_clock_events, cpu).evt; - - if (evt->irq >= 0) { - unbind_from_irqhandler(evt->irq, NULL); - evt->irq = -1; - kfree(per_cpu(xen_clock_events, cpu).name); - per_cpu(xen_clock_events, cpu).name = NULL; - } -} void xen_setup_cpu_clockevents(void) { -- cgit v1.2.3 From 0b0c002c340e78173789f8afaa508070d838cf3d Mon Sep 17 00:00:00 2001 From: Laszlo Ersek Date: Tue, 18 Oct 2011 22:42:59 +0200 Subject: xen/time: remove blocked time accounting from xen "clockchip" ... because the "clock_event_device framework" already accounts for idle time through the "event_handler" function pointer in xen_timer_interrupt(). The patch is intended as the completion of [1]. It should fix the double idle times seen in PV guests' /proc/stat [2]. It should be orthogonal to stolen time accounting (the removed code seems to be isolated). The approach may be completely misguided. [1] https://lkml.org/lkml/2011/10/6/10 [2] http://lists.xensource.com/archives/html/xen-devel/2010-08/msg01068.html John took the time to retest this patch on top of v3.10 and reported: "idle time is correctly incremented for pv and hvm for the normal case, nohz=off and nohz=idle." so lets put this patch in. CC: stable@vger.kernel.org Signed-off-by: Laszlo Ersek Signed-off-by: John Haxby Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/time.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'arch/x86/xen/time.c') diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index aec0b14b6d76..a690868be837 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -37,9 +37,8 @@ static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate); /* snapshots of runstate info */ static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot); -/* unused ns of stolen and blocked time */ +/* unused ns of stolen time */ static DEFINE_PER_CPU(u64, xen_residual_stolen); -static DEFINE_PER_CPU(u64, xen_residual_blocked); /* return an consistent snapshot of 64-bit time/counter value */ static u64 get64(const u64 *p) @@ -116,7 +115,7 @@ static void do_stolen_accounting(void) { struct vcpu_runstate_info state; struct vcpu_runstate_info *snap; - s64 blocked, runnable, offline, stolen; + s64 runnable, offline, stolen; cputime_t ticks; get_runstate_snapshot(&state); @@ -126,7 +125,6 @@ static void do_stolen_accounting(void) snap = &__get_cpu_var(xen_runstate_snapshot); /* work out how much time the VCPU has not been runn*ing* */ - blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked]; runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable]; offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline]; @@ -142,17 +140,6 @@ static void do_stolen_accounting(void) ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen); __this_cpu_write(xen_residual_stolen, stolen); account_steal_ticks(ticks); - - /* Add the appropriate number of ticks of blocked time, - including any left-overs from last time. */ - blocked += __this_cpu_read(xen_residual_blocked); - - if (blocked < 0) - blocked = 0; - - ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked); - __this_cpu_write(xen_residual_blocked, blocked); - account_idle_ticks(ticks); } /* Get the TSC speed from Xen */ -- cgit v1.2.3 From 5584880e44e49c587059801faa2a9f7d22619c48 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 27 Jun 2013 11:35:47 +0100 Subject: x86: xen: Sync the wallclock when the system time is set Currently the Xen wallclock is only updated every 11 minutes if NTP is synchronized to its clock source (using the sync_cmos_clock() work). If a guest is started before NTP is synchronized it may see an incorrect wallclock time. Use the pvclock_gtod notifier chain to receive a notification when the system time has changed and update the wallclock to match. This chain is called on every timer tick and we want to avoid an extra (expensive) hypercall on every tick. Because dom0 has historically never provided a very accurate wallclock and guests do not expect one, we can do this simply: the wallclock is only updated if the clock was set. Signed-off-by: David Vrabel Cc: Konrad Rzeszutek Wilk Cc: John Stultz Cc: Link: http://lkml.kernel.org/r/1372329348-20841-5-git-send-email-david.vrabel@citrix.com Signed-off-by: Thomas Gleixner --- arch/x86/xen/time.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'arch/x86/xen/time.c') diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index a1947ac2da82..3364850d23e6 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -212,6 +213,30 @@ static int xen_set_wallclock(const struct timespec *now) return HYPERVISOR_dom0_op(&op); } +static int xen_pvclock_gtod_notify(struct notifier_block *nb, unsigned long was_set, + void *priv) +{ + struct timespec now; + struct xen_platform_op op; + + if (!was_set) + return NOTIFY_OK; + + now = __current_kernel_time(); + + op.cmd = XENPF_settime; + op.u.settime.secs = now.tv_sec; + op.u.settime.nsecs = now.tv_nsec; + op.u.settime.system_time = xen_clocksource_read(); + + (void)HYPERVISOR_dom0_op(&op); + return NOTIFY_OK; +} + +static struct notifier_block xen_pvclock_gtod_notifier = { + .notifier_call = xen_pvclock_gtod_notify, +}; + static struct clocksource xen_clocksource __read_mostly = { .name = "xen", .rating = 400, @@ -473,6 +498,9 @@ static void __init xen_time_init(void) xen_setup_runstate_info(cpu); xen_setup_timer(cpu); xen_setup_cpu_clockevents(); + + if (xen_initial_domain()) + pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier); } void __init xen_init_time_ops(void) -- cgit v1.2.3 From 47433b8c9d7480a3eebd99df38e857ce85a37cee Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 27 Jun 2013 11:35:48 +0100 Subject: x86: xen: Sync the CMOS RTC as well as the Xen wallclock Adjustments to Xen's persistent clock via update_persistent_clock() don't actually persist, as the Xen wallclock is a software only clock and modifications to it do not modify the underlying CMOS RTC. The x86_platform.set_wallclock hook is there to keep the hardware RTC synchronized. On a guest this is pointless. On Dom0 we can use the native implementaion which actually updates the hardware RTC, but we still need to keep the software emulation of RTC for the guests up to date. The subscription to the pvclock_notifier allows us to emulate this easily. The notifier is called at every tick and when the clock was set. Right now we only use that notifier when the clock was set, but due to the fact that it is called periodically from the timekeeping update code, we can utilize it to emulate the NTP driven drift compensation of update_persistant_clock() for the Xen wall (software) clock. Add a 11 minutes periodic update to the pvclock_gtod notifier callback to achieve that. The static variable 'next' which maintains that 11 minutes update cycle is protected by the core code serialization so there is no need to add a Xen specific serialization mechanism. [ tglx: Massaged changelog and added a few comments ] Signed-off-by: David Vrabel Cc: Konrad Rzeszutek Wilk Cc: John Stultz Cc: Link: http://lkml.kernel.org/r/1372329348-20841-6-git-send-email-david.vrabel@citrix.com Signed-off-by: Thomas Gleixner --- arch/x86/xen/time.c | 45 ++++++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 19 deletions(-) (limited to 'arch/x86/xen/time.c') diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 3364850d23e6..7a5671b4fec6 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -199,37 +199,42 @@ static void xen_get_wallclock(struct timespec *now) static int xen_set_wallclock(const struct timespec *now) { - struct xen_platform_op op; - - /* do nothing for domU */ - if (!xen_initial_domain()) - return -1; - - op.cmd = XENPF_settime; - op.u.settime.secs = now->tv_sec; - op.u.settime.nsecs = now->tv_nsec; - op.u.settime.system_time = xen_clocksource_read(); - - return HYPERVISOR_dom0_op(&op); + return -1; } -static int xen_pvclock_gtod_notify(struct notifier_block *nb, unsigned long was_set, - void *priv) +static int xen_pvclock_gtod_notify(struct notifier_block *nb, + unsigned long was_set, void *priv) { - struct timespec now; - struct xen_platform_op op; + /* Protected by the calling core code serialization */ + static struct timespec next_sync; - if (!was_set) - return NOTIFY_OK; + struct xen_platform_op op; + struct timespec now; now = __current_kernel_time(); + /* + * We only take the expensive HV call when the clock was set + * or when the 11 minutes RTC synchronization time elapsed. + */ + if (!was_set && timespec_compare(&now, &next_sync) < 0) + return NOTIFY_OK; + op.cmd = XENPF_settime; op.u.settime.secs = now.tv_sec; op.u.settime.nsecs = now.tv_nsec; op.u.settime.system_time = xen_clocksource_read(); (void)HYPERVISOR_dom0_op(&op); + + /* + * Move the next drift compensation time 11 minutes + * ahead. That's emulating the sync_cmos_clock() update for + * the hardware RTC. + */ + next_sync = now; + next_sync.tv_sec += 11 * 60; + return NOTIFY_OK; } @@ -513,7 +518,9 @@ void __init xen_init_time_ops(void) x86_platform.calibrate_tsc = xen_tsc_khz; x86_platform.get_wallclock = xen_get_wallclock; - x86_platform.set_wallclock = xen_set_wallclock; + /* Dom0 uses the native method to set the hardware RTC. */ + if (!xen_initial_domain()) + x86_platform.set_wallclock = xen_set_wallclock; } #ifdef CONFIG_XEN_PVHVM -- cgit v1.2.3