From 0061d53daf26ff713ab43ab84ae5c44b5edbefa9 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Thu, 9 May 2013 20:21:41 -0300
Subject: KVM: x86: limit difference between kvmclock updates

kvmclock updates which are isolated to a given vcpu, such as vcpu->cpu
migration, should not allow system_timestamp from the rest of the vcpus
to remain static. Otherwise ntp frequency correction applies to one
vcpu's system_timestamp but not the others.

So in those cases, request a kvmclock update for all vcpus. The worst
case for a remote vcpu to update its kvmclock is then bounded by maximum
nohz sleep latency.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/x86.c | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 094b5d96ab14..8d28810a5f88 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1588,6 +1588,30 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	return 0;
 }
 
+/*
+ * kvmclock updates which are isolated to a given vcpu, such as
+ * vcpu->cpu migration, should not allow system_timestamp from
+ * the rest of the vcpus to remain static. Otherwise ntp frequency
+ * correction applies to one vcpu's system_timestamp but not
+ * the others.
+ *
+ * So in those cases, request a kvmclock update for all vcpus.
+ * The worst case for a remote vcpu to update its kvmclock
+ * is then bounded by maximum nohz sleep latency.
+ */
+
+static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
+{
+	int i;
+	struct kvm *kvm = v->kvm;
+	struct kvm_vcpu *vcpu;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+		kvm_vcpu_kick(vcpu);
+	}
+}
+
 static bool msr_mtrr_valid(unsigned msr)
 {
 	switch (msr) {
@@ -1985,7 +2009,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		kvmclock_reset(vcpu);
 
 		vcpu->arch.time = data;
-		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+		kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
 
 		/* we verify if the enable bit is set... */
 		if (!(data & 1))
@@ -2702,7 +2726,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		 * kvmclock on vcpu->cpu migration
 		 */
 		if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
-			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+			kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
 		if (vcpu->cpu != cpu)
 			kvm_migrate_timers(vcpu);
 		vcpu->cpu = cpu;
@@ -5703,6 +5727,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			__kvm_migrate_timers(vcpu);
 		if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
 			kvm_gen_update_masterclock(vcpu->kvm);
+		if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
+			kvm_gen_kvmclock_update(vcpu);
 		if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
 			r = kvm_guest_time_update(vcpu);
 			if (unlikely(r))
-- 
cgit v1.2.3


From 758ccc89b83cc15d575204091c1a1fec306245cb Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Fri, 31 May 2013 08:36:20 +0800
Subject: KVM: x86: drop calling kvm_mmu_zap_all in emulator_fix_hypercall

Quote Gleb's mail:

| Back then kvm->lock protected memslot access so code like:
|
| mutex_lock(&vcpu->kvm->lock);
| kvm_mmu_zap_all(vcpu->kvm);
| mutex_unlock(&vcpu->kvm->lock);
|
| which is what 7aa81cc0 does was enough to guaranty that no vcpu will
| run while code is patched. This is no longer the case and
| mutex_lock(&vcpu->kvm->lock); is gone from that code path long time ago,
| so now kvm_mmu_zap_all() there is useless and the code is incorrect.

So we drop it and it will be fixed later

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Reviewed-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/x86.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8d28810a5f88..6739b1d4ce7c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5523,13 +5523,6 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
 	char instruction[3];
 	unsigned long rip = kvm_rip_read(vcpu);
 
-	/*
-	 * Blow out the MMU to ensure that no other VCPU has an active mapping
-	 * to ensure that the updated hypercall appears atomically across all
-	 * VCPUs.
-	 */
-	kvm_mmu_zap_all(vcpu->kvm);
-
 	kvm_x86_ops->patch_hypercall(vcpu, instruction);
 
 	return emulator_write_emulated(ctxt, rip, instruction, 3, NULL);
-- 
cgit v1.2.3


From a2ae162265e88bf5490ce54fd5f2d430d6d992b7 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Fri, 31 May 2013 08:36:21 +0800
Subject: KVM: MMU: drop unnecessary kvm_reload_remote_mmus

It is the responsibility of kvm_mmu_zap_all that keeps the
consistent of mmu and tlbs. And it is also unnecessary after
zap all mmio sptes since no mmio spte exists on root shadow
page and it can not be cached into tlb

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Reviewed-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/x86.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6739b1d4ce7c..3758ff910d1f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7060,16 +7060,13 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 	 * If memory slot is created, or moved, we need to clear all
 	 * mmio sptes.
 	 */
-	if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
+	if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE))
 		kvm_mmu_zap_mmio_sptes(kvm);
-		kvm_reload_remote_mmus(kvm);
-	}
 }
 
 void kvm_arch_flush_shadow_all(struct kvm *kvm)
 {
 	kvm_mmu_zap_all(kvm);
-	kvm_reload_remote_mmus(kvm);
 }
 
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
-- 
cgit v1.2.3


From 6ca18b6950f8dee29361722f28f69847724b276f Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Fri, 31 May 2013 08:36:23 +0800
Subject: KVM: x86: use the fast way to invalidate all pages

Replace kvm_mmu_zap_all by kvm_mmu_invalidate_zap_all_pages

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Reviewed-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/x86.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3758ff910d1f..15e10f7e68ac 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7066,13 +7066,13 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 
 void kvm_arch_flush_shadow_all(struct kvm *kvm)
 {
-	kvm_mmu_zap_all(kvm);
+	kvm_mmu_invalidate_zap_all_pages(kvm);
 }
 
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 				   struct kvm_memory_slot *slot)
 {
-	kvm_arch_flush_shadow_all(kvm);
+	kvm_mmu_invalidate_zap_all_pages(kvm);
 }
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
-- 
cgit v1.2.3


From 365c886860c4ba670d245e762b23987c912c129a Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Fri, 31 May 2013 08:36:29 +0800
Subject: KVM: MMU: reclaim the zapped-obsolete page first

As Marcelo pointed out that
| "(retention of large number of pages while zapping)
| can be fatal, it can lead to OOM and host crash"

We introduce a list, kvm->arch.zapped_obsolete_pages, to link all
the pages which are deleted from the mmu cache but not actually
freed. When page reclaiming is needed, we always zap this kind of
pages first.

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Reviewed-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/x86.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 15e10f7e68ac..6402951d5f3b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6832,6 +6832,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 		return -EINVAL;
 
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+	INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
 	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 
 	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
-- 
cgit v1.2.3


From 8915aa27d5efbb9185357175b0acf884325565f9 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 11 Jun 2013 23:31:12 -0300
Subject: KVM: x86: handle idiv overflow at kvm_write_tsc

Its possible that idivl overflows (due to large delta stored in usdiff,
valid scenario).

Create an exception handler to catch the overflow exception (division by zero
is protected by vcpu->arch.virtual_tsc_khz check), and interpret it accordingly
(delta is larger than USEC_PER_SEC).

Fixes https://bugzilla.redhat.com/show_bug.cgi?id=969644

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/x86.c | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6402951d5f3b..737c804b310c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1194,20 +1194,37 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 	elapsed = ns - kvm->arch.last_tsc_nsec;
 
 	if (vcpu->arch.virtual_tsc_khz) {
+		int faulted = 0;
+
 		/* n.b - signed multiplication and division required */
 		usdiff = data - kvm->arch.last_tsc_write;
 #ifdef CONFIG_X86_64
 		usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
 #else
 		/* do_div() only does unsigned */
-		asm("idivl %2; xor %%edx, %%edx"
-		: "=A"(usdiff)
-		: "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz));
+		asm("1: idivl %[divisor]\n"
+		    "2: xor %%edx, %%edx\n"
+		    "   movl $0, %[faulted]\n"
+		    "3:\n"
+		    ".section .fixup,\"ax\"\n"
+		    "4: movl $1, %[faulted]\n"
+		    "   jmp  3b\n"
+		    ".previous\n"
+
+		_ASM_EXTABLE(1b, 4b)
+
+		: "=A"(usdiff), [faulted] "=r" (faulted)
+		: "A"(usdiff * 1000), [divisor] "rm"(vcpu->arch.virtual_tsc_khz));
+
 #endif
 		do_div(elapsed, 1000);
 		usdiff -= elapsed;
 		if (usdiff < 0)
 			usdiff = -usdiff;
+
+		/* idivl overflow => difference is larger than USEC_PER_SEC */
+		if (faulted)
+			usdiff = USEC_PER_SEC;
 	} else
 		usdiff = USEC_PER_SEC; /* disable TSC match window below */
 
-- 
cgit v1.2.3


From 885032b91042288f98d3888c2aaf3a108d348d5c Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Fri, 7 Jun 2013 16:51:23 +0800
Subject: KVM: MMU: retain more available bits on mmio spte

Let mmio spte only use bit62 and bit63 on upper 32 bits, then bit 52 ~ bit 61
can be used for other purposes

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Reviewed-by: Gleb Natapov <gleb@redhat.com>
Reviewed-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 737c804b310c..15cf34d4ae95 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5280,7 +5280,13 @@ static void kvm_set_mmio_spte_mask(void)
 	 * Set the reserved bits and the present bit of an paging-structure
 	 * entry to generate page fault with PFER.RSV = 1.
 	 */
-	mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr;
+	 /* Mask the reserved physical address bits. */
+	mask = ((1ull << (51 - maxphyaddr + 1)) - 1) << maxphyaddr;
+
+	/* Bit 62 is always reserved for 32bit host. */
+	mask |= 0x3ull << 62;
+
+	/* Set the present bit. */
 	mask |= 1ull;
 
 #ifdef CONFIG_X86_64
-- 
cgit v1.2.3


From f8f559422b6c6a05469dfde614b67789b6142cb5 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Fri, 7 Jun 2013 16:51:26 +0800
Subject: KVM: MMU: fast invalidate all mmio sptes

This patch tries to introduce a very simple and scale way to invalidate
all mmio sptes - it need not walk any shadow pages and hold mmu-lock

KVM maintains a global mmio valid generation-number which is stored in
kvm->memslots.generation and every mmio spte stores the current global
generation-number into his available bits when it is created

When KVM need zap all mmio sptes, it just simply increase the global
generation-number. When guests do mmio access, KVM intercepts a MMIO #PF
then it walks the shadow page table and get the mmio spte. If the
generation-number on the spte does not equal the global generation-number,
it will go to the normal #PF handler to update the mmio spte

Since 19 bits are used to store generation-number on mmio spte, we zap all
mmio sptes when the number is round

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Reviewed-by: Gleb Natapov <gleb@redhat.com>
Reviewed-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 15cf34d4ae95..aac5ffcc8f8d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7084,8 +7084,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 	 * If memory slot is created, or moved, we need to clear all
 	 * mmio sptes.
 	 */
-	if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE))
-		kvm_mmu_zap_mmio_sptes(kvm);
+	kvm_mmu_invalidate_mmio_sptes(kvm);
 }
 
 void kvm_arch_flush_shadow_all(struct kvm *kvm)
-- 
cgit v1.2.3


From 489223edf29bc08f84e581c9495a2b42c9d52f08 Mon Sep 17 00:00:00 2001
From: Yoshihiro YUNOMAE <yoshihiro.yunomae.ez@hitachi.com>
Date: Wed, 12 Jun 2013 16:43:44 +0900
Subject: kvm: Add a tracepoint write_tsc_offset

Add a tracepoint write_tsc_offset for tracing TSC offset change.
We want to merge ftrace's trace data of guest OSs and the host OS using
TSC for timestamp in chronological order. We need "TSC offset" values for
each guest when merge those because the TSC value on a guest is always the
host TSC plus guest's TSC offset. If we get the TSC offset values, we can
calculate the host TSC value for each guest events from the TSC offset and
the event TSC value. The host TSC values of the guest events are used when we
want to merge trace data of guests and the host in chronological order.
(Note: the trace_clock of both the host and the guest must be set x86-tsc in
this case)

This tracepoint also records vcpu_id which can be used to merge trace data for
SMP guests. A merge tool will read TSC offset for each vcpu, then the tool
converts guest TSC values to host TSC values for each vcpu.

TSC offset is stored in the VMCS by vmx_write_tsc_offset() or
vmx_adjust_tsc_offset(). KVM executes the former function when a guest boots.
The latter function is executed when kvm clock is updated. Only host can read
TSC offset value from VMCS, so a host needs to output TSC offset value
when TSC offset is changed.

Since the TSC offset is not often changed, it could be overwritten by other
frequent events while tracing. To avoid that, I recommend to use a special
instance for getting this event:

1. set a instance before booting a guest
 # cd /sys/kernel/debug/tracing/instances
 # mkdir tsc_offset
 # cd tsc_offset
 # echo x86-tsc > trace_clock
 # echo 1 > events/kvm/kvm_write_tsc_offset/enable

2. boot a guest

Signed-off-by: Yoshihiro YUNOMAE <yoshihiro.yunomae.ez@hitachi.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Acked-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/x86.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index aac5ffcc8f8d..7d71c0fb11de 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7303,3 +7303,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
-- 
cgit v1.2.3