From 0061d53daf26ff713ab43ab84ae5c44b5edbefa9 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 9 May 2013 20:21:41 -0300 Subject: KVM: x86: limit difference between kvmclock updates kvmclock updates which are isolated to a given vcpu, such as vcpu->cpu migration, should not allow system_timestamp from the rest of the vcpus to remain static. Otherwise ntp frequency correction applies to one vcpu's system_timestamp but not the others. So in those cases, request a kvmclock update for all vcpus. The worst case for a remote vcpu to update its kvmclock is then bounded by maximum nohz sleep latency. Signed-off-by: Marcelo Tosatti Signed-off-by: Gleb Natapov --- arch/x86/kvm/x86.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 094b5d96ab14..8d28810a5f88 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1588,6 +1588,30 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) return 0; } +/* + * kvmclock updates which are isolated to a given vcpu, such as + * vcpu->cpu migration, should not allow system_timestamp from + * the rest of the vcpus to remain static. Otherwise ntp frequency + * correction applies to one vcpu's system_timestamp but not + * the others. + * + * So in those cases, request a kvmclock update for all vcpus. + * The worst case for a remote vcpu to update its kvmclock + * is then bounded by maximum nohz sleep latency. + */ + +static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) +{ + int i; + struct kvm *kvm = v->kvm; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) { + set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + kvm_vcpu_kick(vcpu); + } +} + static bool msr_mtrr_valid(unsigned msr) { switch (msr) { @@ -1985,7 +2009,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) kvmclock_reset(vcpu); vcpu->arch.time = data; - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); /* we verify if the enable bit is set... */ if (!(data & 1)) @@ -2702,7 +2726,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) * kvmclock on vcpu->cpu migration */ if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); if (vcpu->cpu != cpu) kvm_migrate_timers(vcpu); vcpu->cpu = cpu; @@ -5703,6 +5727,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) __kvm_migrate_timers(vcpu); if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu)) kvm_gen_update_masterclock(vcpu->kvm); + if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu)) + kvm_gen_kvmclock_update(vcpu); if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { r = kvm_guest_time_update(vcpu); if (unlikely(r)) -- cgit v1.2.3 From 758ccc89b83cc15d575204091c1a1fec306245cb Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 31 May 2013 08:36:20 +0800 Subject: KVM: x86: drop calling kvm_mmu_zap_all in emulator_fix_hypercall Quote Gleb's mail: | Back then kvm->lock protected memslot access so code like: | | mutex_lock(&vcpu->kvm->lock); | kvm_mmu_zap_all(vcpu->kvm); | mutex_unlock(&vcpu->kvm->lock); | | which is what 7aa81cc0 does was enough to guaranty that no vcpu will | run while code is patched. This is no longer the case and | mutex_lock(&vcpu->kvm->lock); is gone from that code path long time ago, | so now kvm_mmu_zap_all() there is useless and the code is incorrect. So we drop it and it will be fixed later Signed-off-by: Xiao Guangrong Reviewed-by: Marcelo Tosatti Signed-off-by: Gleb Natapov --- arch/x86/kvm/x86.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8d28810a5f88..6739b1d4ce7c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5523,13 +5523,6 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) char instruction[3]; unsigned long rip = kvm_rip_read(vcpu); - /* - * Blow out the MMU to ensure that no other VCPU has an active mapping - * to ensure that the updated hypercall appears atomically across all - * VCPUs. - */ - kvm_mmu_zap_all(vcpu->kvm); - kvm_x86_ops->patch_hypercall(vcpu, instruction); return emulator_write_emulated(ctxt, rip, instruction, 3, NULL); -- cgit v1.2.3 From a2ae162265e88bf5490ce54fd5f2d430d6d992b7 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 31 May 2013 08:36:21 +0800 Subject: KVM: MMU: drop unnecessary kvm_reload_remote_mmus It is the responsibility of kvm_mmu_zap_all that keeps the consistent of mmu and tlbs. And it is also unnecessary after zap all mmio sptes since no mmio spte exists on root shadow page and it can not be cached into tlb Signed-off-by: Xiao Guangrong Reviewed-by: Marcelo Tosatti Signed-off-by: Gleb Natapov --- arch/x86/kvm/x86.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6739b1d4ce7c..3758ff910d1f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7060,16 +7060,13 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, * If memory slot is created, or moved, we need to clear all * mmio sptes. */ - if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { + if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) kvm_mmu_zap_mmio_sptes(kvm); - kvm_reload_remote_mmus(kvm); - } } void kvm_arch_flush_shadow_all(struct kvm *kvm) { kvm_mmu_zap_all(kvm); - kvm_reload_remote_mmus(kvm); } void kvm_arch_flush_shadow_memslot(struct kvm *kvm, -- cgit v1.2.3 From 6ca18b6950f8dee29361722f28f69847724b276f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 31 May 2013 08:36:23 +0800 Subject: KVM: x86: use the fast way to invalidate all pages Replace kvm_mmu_zap_all by kvm_mmu_invalidate_zap_all_pages Signed-off-by: Xiao Guangrong Reviewed-by: Marcelo Tosatti Signed-off-by: Gleb Natapov --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3758ff910d1f..15e10f7e68ac 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7066,13 +7066,13 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, void kvm_arch_flush_shadow_all(struct kvm *kvm) { - kvm_mmu_zap_all(kvm); + kvm_mmu_invalidate_zap_all_pages(kvm); } void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { - kvm_arch_flush_shadow_all(kvm); + kvm_mmu_invalidate_zap_all_pages(kvm); } int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 365c886860c4ba670d245e762b23987c912c129a Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 31 May 2013 08:36:29 +0800 Subject: KVM: MMU: reclaim the zapped-obsolete page first As Marcelo pointed out that | "(retention of large number of pages while zapping) | can be fatal, it can lead to OOM and host crash" We introduce a list, kvm->arch.zapped_obsolete_pages, to link all the pages which are deleted from the mmu cache but not actually freed. When page reclaiming is needed, we always zap this kind of pages first. Signed-off-by: Xiao Guangrong Reviewed-by: Marcelo Tosatti Signed-off-by: Gleb Natapov --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 15e10f7e68ac..6402951d5f3b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6832,6 +6832,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) return -EINVAL; INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ -- cgit v1.2.3 From 8915aa27d5efbb9185357175b0acf884325565f9 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 11 Jun 2013 23:31:12 -0300 Subject: KVM: x86: handle idiv overflow at kvm_write_tsc Its possible that idivl overflows (due to large delta stored in usdiff, valid scenario). Create an exception handler to catch the overflow exception (division by zero is protected by vcpu->arch.virtual_tsc_khz check), and interpret it accordingly (delta is larger than USEC_PER_SEC). Fixes https://bugzilla.redhat.com/show_bug.cgi?id=969644 Signed-off-by: Marcelo Tosatti Signed-off-by: Gleb Natapov --- arch/x86/kvm/x86.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6402951d5f3b..737c804b310c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1194,20 +1194,37 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) elapsed = ns - kvm->arch.last_tsc_nsec; if (vcpu->arch.virtual_tsc_khz) { + int faulted = 0; + /* n.b - signed multiplication and division required */ usdiff = data - kvm->arch.last_tsc_write; #ifdef CONFIG_X86_64 usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz; #else /* do_div() only does unsigned */ - asm("idivl %2; xor %%edx, %%edx" - : "=A"(usdiff) - : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz)); + asm("1: idivl %[divisor]\n" + "2: xor %%edx, %%edx\n" + " movl $0, %[faulted]\n" + "3:\n" + ".section .fixup,\"ax\"\n" + "4: movl $1, %[faulted]\n" + " jmp 3b\n" + ".previous\n" + + _ASM_EXTABLE(1b, 4b) + + : "=A"(usdiff), [faulted] "=r" (faulted) + : "A"(usdiff * 1000), [divisor] "rm"(vcpu->arch.virtual_tsc_khz)); + #endif do_div(elapsed, 1000); usdiff -= elapsed; if (usdiff < 0) usdiff = -usdiff; + + /* idivl overflow => difference is larger than USEC_PER_SEC */ + if (faulted) + usdiff = USEC_PER_SEC; } else usdiff = USEC_PER_SEC; /* disable TSC match window below */ -- cgit v1.2.3 From 885032b91042288f98d3888c2aaf3a108d348d5c Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 7 Jun 2013 16:51:23 +0800 Subject: KVM: MMU: retain more available bits on mmio spte Let mmio spte only use bit62 and bit63 on upper 32 bits, then bit 52 ~ bit 61 can be used for other purposes Signed-off-by: Xiao Guangrong Reviewed-by: Gleb Natapov Reviewed-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 737c804b310c..15cf34d4ae95 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5280,7 +5280,13 @@ static void kvm_set_mmio_spte_mask(void) * Set the reserved bits and the present bit of an paging-structure * entry to generate page fault with PFER.RSV = 1. */ - mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr; + /* Mask the reserved physical address bits. */ + mask = ((1ull << (51 - maxphyaddr + 1)) - 1) << maxphyaddr; + + /* Bit 62 is always reserved for 32bit host. */ + mask |= 0x3ull << 62; + + /* Set the present bit. */ mask |= 1ull; #ifdef CONFIG_X86_64 -- cgit v1.2.3 From f8f559422b6c6a05469dfde614b67789b6142cb5 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 7 Jun 2013 16:51:26 +0800 Subject: KVM: MMU: fast invalidate all mmio sptes This patch tries to introduce a very simple and scale way to invalidate all mmio sptes - it need not walk any shadow pages and hold mmu-lock KVM maintains a global mmio valid generation-number which is stored in kvm->memslots.generation and every mmio spte stores the current global generation-number into his available bits when it is created When KVM need zap all mmio sptes, it just simply increase the global generation-number. When guests do mmio access, KVM intercepts a MMIO #PF then it walks the shadow page table and get the mmio spte. If the generation-number on the spte does not equal the global generation-number, it will go to the normal #PF handler to update the mmio spte Since 19 bits are used to store generation-number on mmio spte, we zap all mmio sptes when the number is round Signed-off-by: Xiao Guangrong Reviewed-by: Gleb Natapov Reviewed-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 15cf34d4ae95..aac5ffcc8f8d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7084,8 +7084,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, * If memory slot is created, or moved, we need to clear all * mmio sptes. */ - if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) - kvm_mmu_zap_mmio_sptes(kvm); + kvm_mmu_invalidate_mmio_sptes(kvm); } void kvm_arch_flush_shadow_all(struct kvm *kvm) -- cgit v1.2.3 From 489223edf29bc08f84e581c9495a2b42c9d52f08 Mon Sep 17 00:00:00 2001 From: Yoshihiro YUNOMAE Date: Wed, 12 Jun 2013 16:43:44 +0900 Subject: kvm: Add a tracepoint write_tsc_offset Add a tracepoint write_tsc_offset for tracing TSC offset change. We want to merge ftrace's trace data of guest OSs and the host OS using TSC for timestamp in chronological order. We need "TSC offset" values for each guest when merge those because the TSC value on a guest is always the host TSC plus guest's TSC offset. If we get the TSC offset values, we can calculate the host TSC value for each guest events from the TSC offset and the event TSC value. The host TSC values of the guest events are used when we want to merge trace data of guests and the host in chronological order. (Note: the trace_clock of both the host and the guest must be set x86-tsc in this case) This tracepoint also records vcpu_id which can be used to merge trace data for SMP guests. A merge tool will read TSC offset for each vcpu, then the tool converts guest TSC values to host TSC values for each vcpu. TSC offset is stored in the VMCS by vmx_write_tsc_offset() or vmx_adjust_tsc_offset(). KVM executes the former function when a guest boots. The latter function is executed when kvm clock is updated. Only host can read TSC offset value from VMCS, so a host needs to output TSC offset value when TSC offset is changed. Since the TSC offset is not often changed, it could be overwritten by other frequent events while tracing. To avoid that, I recommend to use a special instance for getting this event: 1. set a instance before booting a guest # cd /sys/kernel/debug/tracing/instances # mkdir tsc_offset # cd tsc_offset # echo x86-tsc > trace_clock # echo 1 > events/kvm/kvm_write_tsc_offset/enable 2. boot a guest Signed-off-by: Yoshihiro YUNOMAE Cc: Joerg Roedel Cc: Marcelo Tosatti Cc: Gleb Natapov Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Acked-by: Marcelo Tosatti Signed-off-by: Gleb Natapov --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index aac5ffcc8f8d..7d71c0fb11de 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7303,3 +7303,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); -- cgit v1.2.3