Avi Kivity | 16 Nov 18:59

[PATCH 0/3] KVM: Save/resume support

The following patchset adds the missing bits that allow kvm virtual
machines to be suspended and resumed, with appropriate
userspace support.

The changes are:
 - expose the pending interrupt bitmap to userspace
 - tsc save/restore
 - msr save/restore

--

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys - and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
Avi Kivity | 16 Nov 19:04

[PATCH 3/3] KVM: Expose MSRs to userspace

From: Uri Lublin <uril <at> qumranet.com>

Expose the model-specific registers to userspace.  Primarily useful for
save/resume.

Signed-off-by: Uri Lublin <uril <at> qumranet.com>
Signed-off-by: Avi Kivity <avi <at> qumranet.com>

Index: linux-2.6/drivers/kvm/kvm.h
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm.h
+++ linux-2.6/drivers/kvm/kvm.h
@@ -106,11 +106,7 @@ struct vmcs {
 	char data[0];
 };

-struct vmx_msr_entry {
-	u32 index;
-	u32 reserved;
-	u64 data;
-};
+#define vmx_msr_entry kvm_msr_entry

 struct kvm_vcpu;

Index: linux-2.6/drivers/kvm/kvm_main.c
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm_main.c
+++ linux-2.6/drivers/kvm/kvm_main.c
@@ -2298,21 +2298,22 @@ static int handle_cpuid(struct kvm_vcpu 
 	return 0;
 }

-static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+/*
+ * Reads an msr value (of 'msr_index') into 'pdata'.
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+static int get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 {
-	u32 ecx = vcpu->regs[VCPU_REGS_RCX];
-	struct vmx_msr_entry *msr = find_msr_entry(vcpu, ecx);
 	u64 data;
+	struct vmx_msr_entry *msr;

-#ifdef KVM_DEBUG
-	if (guest_cpl() != 0) {
-		vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
-		inject_gp(vcpu);
-		return 1;
+	if (!pdata) {
+		printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
+		return -EINVAL;
 	}
-#endif

-	switch (ecx) {
+	switch (msr_index) {
 #ifdef __x86_64__
 	case MSR_FS_BASE:
 		data = vmcs_readl(GUEST_FS_BASE);
@@ -2351,11 +2352,25 @@ static int handle_rdmsr(struct kvm_vcpu 
 		data = vcpu->apic_base;
 		break;
 	default:
-		if (msr) {
-			data = msr->data;
-			break;
+		msr = find_msr_entry(vcpu, msr_index);
+		if (!msr) {
+			printk(KERN_ERR "kvm: unhandled rdmsr: %x\n", msr_index);
+			return 1;
 		}
-		printk(KERN_ERR "kvm: unhandled rdmsr: %x\n", ecx);
+		data = msr->data;
+		break;
+	}
+
+	*pdata = data;
+	return 0;
+}
+
+static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	u32 ecx = vcpu->regs[VCPU_REGS_RCX];
+	u64 data;
+
+	if (get_msr(vcpu, ecx, &data)) {
 		inject_gp(vcpu);
 		return 1;
 	}
@@ -2401,22 +2416,16 @@ static void set_efer(struct kvm_vcpu *vc

 #endif

-static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+
+/*
+ * Writes msr value into into the appropriate "register".
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+static int set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 {
-	u32 ecx = vcpu->regs[VCPU_REGS_RCX];
 	struct vmx_msr_entry *msr;
-	u64 data = (vcpu->regs[VCPU_REGS_RAX] & -1u)
-		| ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32);
-
-#ifdef KVM_DEBUG
-	if (guest_cpl() != 0) {
-		vcpu_printf(vcpu, "%s: not supervisor\n", __FUNCTION__);
-		inject_gp(vcpu);
-		return 1;
-	}
-#endif
-
-	switch (ecx) {
+	switch (msr_index) {
 #ifdef __x86_64__
 	case MSR_FS_BASE:
 		vmcs_writel(GUEST_FS_BASE, data);
@@ -2437,7 +2446,7 @@ static int handle_wrmsr(struct kvm_vcpu 
 #ifdef __x86_64
 	case MSR_EFER:
 		set_efer(vcpu, data);
-		return 1;
+		break;
 	case MSR_IA32_MC0_STATUS:
 		printk(KERN_WARNING "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n"
 			    , __FUNCTION__, data);
@@ -2455,16 +2464,31 @@ static int handle_wrmsr(struct kvm_vcpu 
 		vcpu->apic_base = data;
 		break;
 	default:
-		msr = find_msr_entry(vcpu, ecx);
-		if (msr) {
-			msr->data = data;
-			break;
+		msr = find_msr_entry(vcpu, msr_index);
+		if (!msr) {
+			printk(KERN_ERR "kvm: unhandled wrmsr: 0x%x\n", msr_index);
+			return 1;
 		}
-		printk(KERN_ERR "kvm: unhandled wrmsr: %x\n", ecx);
+		msr->data = data;
+		break;
+	}
+
+	return 0;
+}
+
+static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	u32 ecx = vcpu->regs[VCPU_REGS_RCX];
+	u64 data = (vcpu->regs[VCPU_REGS_RAX] & -1u)
+		| ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32);
+
+	if (set_msr(vcpu, ecx, data) != 0) {
 		inject_gp(vcpu);
 		return 1;
 	}
-	skip_emulated_instruction(vcpu);
+
+	if (ecx != MSR_EFER)
+		skip_emulated_instruction(vcpu);
 	return 1;
 }

@@ -3121,6 +3145,125 @@ static int kvm_dev_ioctl_set_sregs(struc
 }

 /*
+ * List of msr numbers which we expose to userspace through KVM_GET_MSRS
+ * and KVM_SET_MSRS.
+ */
+static u32 msrs_to_save[] = {
+	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+	MSR_K6_STAR,
+#ifdef __x86_64__
+	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
+#endif
+	MSR_IA32_TIME_STAMP_COUNTER,
+};
+
+static int kvm_dev_ioctl_get_msrs(struct kvm *kvm, struct kvm_msrs *msrs)
+{
+	struct kvm_vcpu *vcpu;
+	struct kvm_msr_entry *entry, *entries;
+	int rc;
+	u32 size, num_entries, i;
+
+	if (msrs->vcpu < 0 || msrs->vcpu >= KVM_MAX_VCPUS)
+		return -EINVAL;
+
+	num_entries = ARRAY_SIZE(msrs_to_save);
+	if (msrs->nmsrs < num_entries) {
+		/* inform actual number of entries */
+		msrs->nmsrs = num_entries;
+		return -EINVAL;
+	}
+
+	vcpu = vcpu_load(kvm, msrs->vcpu);
+	if (!vcpu)
+		return -ENOENT;
+
+	msrs->nmsrs = num_entries; /* update to actual number of entries */
+	size = msrs->nmsrs * sizeof(struct kvm_msr_entry);
+	rc = -E2BIG;
+	if (size > 4096)
+		goto out_vcpu;
+
+	rc = -ENOMEM;
+	entries = vmalloc(size);
+	if (entries == NULL)
+		goto out_vcpu;
+
+	memset(entries, 0, size);
+	rc = -EINVAL;
+	for (i=0; i<num_entries; i++) {
+		entry = &entries[i];
+		entry->index = msrs_to_save[i];
+		if (get_msr(vcpu, entry->index, &entry->data))
+			goto out_free;
+	}
+
+	rc = -EFAULT;
+	if (copy_to_user(msrs->entries, entries, size))
+		goto out_free;
+
+	rc = 0;
+out_free:
+	vfree(entries);
+
+out_vcpu:
+	vcpu_put(vcpu);
+
+	return rc;
+}
+
+static int kvm_dev_ioctl_set_msrs(struct kvm *kvm, struct kvm_msrs *msrs)
+{
+	struct kvm_vcpu *vcpu;
+	struct kvm_msr_entry *entry, *entries;
+	int rc;
+	u32 size, num_entries, i;
+
+	if (msrs->vcpu < 0 || msrs->vcpu >= KVM_MAX_VCPUS)
+		return -EINVAL;
+
+	num_entries = ARRAY_SIZE(msrs_to_save);
+	if (msrs->nmsrs < num_entries) {
+		msrs->nmsrs = num_entries; /* inform actual size */
+		return -EINVAL;
+	}
+
+	vcpu = vcpu_load(kvm, msrs->vcpu);
+	if (!vcpu)
+		return -ENOENT;
+
+	size = msrs->nmsrs * sizeof(struct kvm_msr_entry);
+	rc = -E2BIG;
+	if (size > 4096)
+		goto out_vcpu;
+
+	rc = -ENOMEM;
+	entries = vmalloc(size);
+	if (entries == NULL)
+		goto out_vcpu;
+
+	rc = -EFAULT;
+	if (copy_from_user(entries, msrs->entries, size))
+		goto out_free;
+
+	rc = -EINVAL;
+	for (i=0; i<num_entries; i++) {
+		entry = &entries[i];
+		if (set_msr(vcpu, entry->index,  entry->data))
+			goto out_free;
+	}
+
+	rc = 0;
+out_free:
+	vfree(entries);
+
+out_vcpu:
+	vcpu_put(vcpu);
+
+	return rc;
+}
+
+/*
  * Translate a guest virtual address to a guest physical address.
  */
 static int kvm_dev_ioctl_translate(struct kvm *kvm, struct kvm_translation *tr)
@@ -3361,6 +3504,33 @@ static long kvm_dev_ioctl(struct file *f
 			goto out;
 		break;
 	}
+	case KVM_GET_MSRS: {
+		struct kvm_msrs kvm_msrs;
+
+		r = -EFAULT;
+		if (copy_from_user(&kvm_msrs, (void *)arg, sizeof kvm_msrs))
+			goto out;
+		r = kvm_dev_ioctl_get_msrs(kvm, &kvm_msrs);
+		if (r)
+			goto out;
+		r = -EFAULT;
+		if (copy_to_user((void *)arg, &kvm_msrs, sizeof kvm_msrs))
+			goto out;
+		r = 0;
+		break;
+	}
+	case KVM_SET_MSRS: {
+		struct kvm_msrs kvm_msrs;
+
+		r = -EFAULT;
+		if (copy_from_user(&kvm_msrs, (void *)arg, sizeof kvm_msrs))
+			goto out;
+		r = kvm_dev_ioctl_set_msrs(kvm, &kvm_msrs);
+		if (r)
+			goto out;
+		r = 0;
+		break;
+	}
 	default:
 		;
 	}
Index: linux-2.6/include/linux/kvm.h
===================================================================
--- linux-2.6.orig/include/linux/kvm.h
+++ linux-2.6/include/linux/kvm.h
@@ -141,6 +141,23 @@ struct kvm_sregs {
 	__u64 interrupt_bitmap[KVM_IRQ_BITMAP_SIZE(__u64)];
 };

+struct kvm_msr_entry {
+	__u32 index;
+	__u32 reserved;
+	__u64 data;
+};
+
+/* for KVM_GET_MSRS and KVM_SET_MSRS */
+struct kvm_msrs {
+	__u32 vcpu;
+	__u32 nmsrs; /* number of msrs in entries */
+
+	union {
+		struct kvm_msr_entry __user *entries;
+		__u64 padding;
+	};
+};
+
 /* for KVM_TRANSLATE */
 struct kvm_translation {
 	/* in */
@@ -200,5 +217,7 @@ struct kvm_dirty_log {
 #define KVM_SET_MEMORY_REGION     _IOW(KVMIO, 10, struct kvm_memory_region)
 #define KVM_CREATE_VCPU           _IOW(KVMIO, 11, int /* vcpu_slot */)
 #define KVM_GET_DIRTY_LOG         _IOW(KVMIO, 12, struct kvm_dirty_log)
+#define KVM_GET_MSRS              _IOWR(KVMIO, 13, struct kvm_msrs)
+#define KVM_SET_MSRS              _IOW(KVMIO, 14, struct kvm_msrs)

 #endif

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys - and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
Andrew Morton | 17 Nov 02:02

Re: [PATCH 3/3] KVM: Expose MSRs to userspace

On Thu, 16 Nov 2006 18:04:22 -0000
Avi Kivity <avi <at> qumranet.com> wrote:

> +static int kvm_dev_ioctl_set_msrs(struct kvm *kvm, struct kvm_msrs *msrs)
> +{
> +	struct kvm_vcpu *vcpu;
> +	struct kvm_msr_entry *entry, *entries;
> +	int rc;
> +	u32 size, num_entries, i;
> +
> +	if (msrs->vcpu < 0 || msrs->vcpu >= KVM_MAX_VCPUS)
> +		return -EINVAL;
> +
> +	num_entries = ARRAY_SIZE(msrs_to_save);
> +	if (msrs->nmsrs < num_entries) {
> +		msrs->nmsrs = num_entries; /* inform actual size */
> +		return -EINVAL;
> +	}
> +
> +	vcpu = vcpu_load(kvm, msrs->vcpu);
> +	if (!vcpu)
> +		return -ENOENT;
> +
> +	size = msrs->nmsrs * sizeof(struct kvm_msr_entry);
> +	rc = -E2BIG;
> +	if (size > 4096)
> +		goto out_vcpu;

Classic mutiplicative overflow bug.  Only msrs->nmsrs doesn't get used
again, so there is no bug here.  Yet.

> +	rc = -ENOMEM;
> +	entries = vmalloc(size);
> +	if (entries == NULL)
> +		goto out_vcpu;
> +
> +	rc = -EFAULT;
> +	if (copy_from_user(entries, msrs->entries, size))
> +		goto out_free;
> +
> +	rc = -EINVAL;
> +	for (i=0; i<num_entries; i++) {
> +		entry = &entries[i];
> +		if (set_msr(vcpu, entry->index,  entry->data))
> +			goto out_free;
> +	}
> +
> +	rc = 0;
> +out_free:
> +	vfree(entries);
> +
> +out_vcpu:
> +	vcpu_put(vcpu);
> +
> +	return rc;
> +}

This function returns no indication of how many msrs it actually did set. 
Should it?
Avi Kivity | 17 Nov 08:20

Re: [PATCH 3/3] KVM: Expose MSRs to userspace

Andrew Morton wrote:
> On Thu, 16 Nov 2006 18:04:22 -0000
> Avi Kivity <avi <at> qumranet.com> wrote:
>
>   
>> +static int kvm_dev_ioctl_set_msrs(struct kvm *kvm, struct kvm_msrs *msrs)
>> +{
>> +	struct kvm_vcpu *vcpu;
>> +	struct kvm_msr_entry *entry, *entries;
>> +	int rc;
>> +	u32 size, num_entries, i;
>> +
>> +	if (msrs->vcpu < 0 || msrs->vcpu >= KVM_MAX_VCPUS)
>> +		return -EINVAL;
>> +
>> +	num_entries = ARRAY_SIZE(msrs_to_save);
>> +	if (msrs->nmsrs < num_entries) {
>> +		msrs->nmsrs = num_entries; /* inform actual size */
>> +		return -EINVAL;
>> +	}
>> +
>> +	vcpu = vcpu_load(kvm, msrs->vcpu);
>> +	if (!vcpu)
>> +		return -ENOENT;
>> +
>> +	size = msrs->nmsrs * sizeof(struct kvm_msr_entry);
>> +	rc = -E2BIG;
>> +	if (size > 4096)
>> +		goto out_vcpu;
>>     
>
> Classic mutiplicative overflow bug.  

Right, will fix.  The 4096 limit is arbitrary anyway, and can be 
replaced by an arbitrary limit on nmsrs.

> Only msrs->nmsrs doesn't get used
> again, so there is no bug here.  Yet.
>
>   

But why isn't it used again?  Looks like the kernel is forcing the user 
to send at least num_entries for no good reason, and ignoring any 
entries beyond num_entries.

>> +	rc = -ENOMEM;
>> +	entries = vmalloc(size);
>> +	if (entries == NULL)
>> +		goto out_vcpu;
>> +
>> +	rc = -EFAULT;
>> +	if (copy_from_user(entries, msrs->entries, size))
>> +		goto out_free;
>> +
>> +	rc = -EINVAL;
>> +	for (i=0; i<num_entries; i++) {
>> +		entry = &entries[i];
>> +		if (set_msr(vcpu, entry->index,  entry->data))
>> +			goto out_free;
>> +	}
>> +
>> +	rc = 0;
>> +out_free:
>> +	vfree(entries);
>> +
>> +out_vcpu:
>> +	vcpu_put(vcpu);
>> +
>> +	return rc;
>> +}
>>     
>
> This function returns no indication of how many msrs it actually did set. 
> Should it?
>   

It can't hurt.  Is returning the number of msrs set in the return code 
(ala short write) acceptable, or do I need to make this a read/write ioctl?

--

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys - and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
Andrew Morton | 17 Nov 09:15

Re: [PATCH 3/3] KVM: Expose MSRs to userspace

On Fri, 17 Nov 2006 09:20:49 +0200
Avi Kivity <avi <at> qumranet.com> wrote:


> >> +out_vcpu: > >> + vcpu_put(vcpu); > >> + > >> + return rc; > >> +} > >> > > > > This function returns no indication of how many msrs it actually did set. > > Should it? > > > > It can't hurt. Is returning the number of msrs set in the return code > (ala short write) acceptable, or do I need to make this a read/write ioctl? >
I'd have thought that you'd just copy the number written into msrs->nmsrs via msrs->nmsrs = num_entries; like kvm_dev_ioctl_set_msrs() does. Dunno... ------------------------------------------------------------------------- Take Surveys. Earn Cash. Influence the Future of IT Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys - and earn cash http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
Avi Kivity | 17 Nov 09:17

Re: [PATCH 3/3] KVM: Expose MSRs to userspace


Andrew Morton wrote: > On Fri, 17 Nov 2006 09:20:49 +0200 > Avi Kivity <avi <at> qumranet.com> wrote: > > >>>> +out_vcpu: >>>> + vcpu_put(vcpu); >>>> + >>>> + return rc; >>>> +} >>>> >>>> >>> This function returns no indication of how many msrs it actually did set. >>> Should it? >>> >>> >> It can't hurt. Is returning the number of msrs set in the return code >> (ala short write) acceptable, or do I need to make this a read/write ioctl? >> >> > > I'd have thought that you'd just copy the number written into msrs->nmsrs via > > msrs->nmsrs = num_entries; > > like kvm_dev_ioctl_set_msrs() does. Dunno... >
It means making it an _IOWR() ioctl, but no matter. -- -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. ------------------------------------------------------------------------- Take Surveys. Earn Cash. Influence the Future of IT Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys - and earn cash http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
Arnd Bergmann | 16 Nov 20:08
Picon
Gravatar

Re: [PATCH 3/3] KVM: Expose MSRs to userspace


On Thursday 16 November 2006 19:04, Avi Kivity wrote: > +struct kvm_msr_entry { > +       __u32 index; > +       __u32 reserved; > +       __u64 data; > +}; > + > +/* for KVM_GET_MSRS and KVM_SET_MSRS */ > +struct kvm_msrs { > +       __u32 vcpu; > +       __u32 nmsrs; /* number of msrs in entries */ > + > +       union { > +               struct kvm_msr_entry __user *entries; > +               __u64 padding; > +       }; > +};
ioctl interfaces with pointers in them are generally a bad idea, though you handle most of the points against them fine here (endianess doesn't matter, padding is correct). Still, it might be better not to set a bad example. Is accessing the MSRs actually performance critical? If not, you could define the ioctl to take only a single entry argument. A possible alternative could also be to have a variable length argument like below, but that creates other problems: +struct kvm_msrs { + __u32 vcpu; + __u32 nmsrs; /* number of msrs in entries */ + struct kvm_msr_entry entries[0]; /* followed by actual msrs */ +}; This would mean that you can't tell the transfer size from the ioctl number, but you can't do that in your code either, because you do two separate transfers. Arnd <>< ------------------------------------------------------------------------- Take Surveys. Earn Cash. Influence the Future of IT Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys - and earn cash http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
Avi Kivity | 16 Nov 20:17

Re: [kvm-devel] [PATCH 3/3] KVM: Expose MSRs to userspace

Arnd Bergmann wrote:
> On Thursday 16 November 2006 19:04, Avi Kivity wrote:
>   
>> +struct kvm_msr_entry {
>> +       __u32 index;
>> +       __u32 reserved;
>> +       __u64 data;
>> +};
>> +
>> +/* for KVM_GET_MSRS and KVM_SET_MSRS */
>> +struct kvm_msrs {
>> +       __u32 vcpu;
>> +       __u32 nmsrs; /* number of msrs in entries */
>> +
>> +       union {
>> +               struct kvm_msr_entry __user *entries;
>> +               __u64 padding;
>> +       };
>> +};
>>     
>
> ioctl interfaces with pointers in them are generally a bad idea,
> though you handle most of the points against them fine here
> (endianess doesn't matter, padding is correct).
>
> Still, it might be better not to set a bad example. Is accessing
> the MSRs actually performance critical? If not, you could
> define the ioctl to take only a single entry argument.
>
>   

But then you can't dynamically determine which MSRs are available.

And no, reading/setting MSRs isn't performance critical for the current 
use cases.

> A possible alternative could also be to have a variable length
> argument like below, but that creates other problems:
>
> +struct kvm_msrs {
> +       __u32 vcpu;
> +       __u32 nmsrs; /* number of msrs in entries */
> +       struct kvm_msr_entry entries[0]; /* followed by actual msrs */
> +};
>
> This would mean that you can't tell the transfer size from the
> ioctl number, but you can't do that in your code either, because
> you do two separate transfers.
>
>   

Heh.  That was the original implementation by Uri.  I felt that was 
wrong because _IOW() encodes the size in the ioctl number, bit the 
actual size is different.

> 	Arnd <><
>   

--

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.

Christoph Hellwig | 17 Nov 09:06
Favicon

Re: [PATCH 3/3] KVM: Expose MSRs to userspace


On Thu, Nov 16, 2006 at 09:17:18PM +0200, Avi Kivity wrote: > Heh. That was the original implementation by Uri. I felt that was > wrong because _IOW() encodes the size in the ioctl number, bit the > actual size is different.
That really shouldn't be a problem. After all the pointer approach doesn't encode the transfered size either. Given that the variable sized array gives a much cleaner interface you should use it. ------------------------------------------------------------------------- Take Surveys. Earn Cash. Influence the Future of IT Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys - and earn cash http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
Avi Kivity | 16 Nov 19:03

[PATCH 2/3] KVM: Add time stamp counter msr and accessors

From: Uri Lublin <uril <at> qumranet.com>

Add a couple of accessors for the time stamp counter, and expose the tsc msr.

Signed-off-by: Uri Lublin <uril <at> qumranet.com>
Signed-off-by: Avi Kivity <avi <at> qumranet.com>

Index: linux-2.6/drivers/kvm/kvm_main.c
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm_main.c
+++ linux-2.6/drivers/kvm/kvm_main.c
@@ -121,6 +121,7 @@ static const u32 vmx_msr_index[] = {
 #define TSS_REDIRECTION_SIZE (256 / 8)
 #define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)

+#define MSR_IA32_TIME_STAMP_COUNTER		0x010
 #define MSR_IA32_FEATURE_CONTROL 		0x03a
 #define MSR_IA32_VMX_BASIC_MSR   		0x480
 #define MSR_IA32_VMX_PINBASED_CTLS_MSR		0x481
@@ -716,6 +717,31 @@ static void inject_gp(struct kvm_vcpu *v
 		     INTR_INFO_VALID_MASK);
 }

+/*
+ * reads and returns guest's timestamp counter "register"
+ * guest_tsc = host_tsc + tsc_offset    -- 21.3
+ */
+static u64 guest_read_tsc(void)
+{
+	u64 host_tsc, tsc_offset;
+
+	rdtscll(host_tsc);
+	tsc_offset = vmcs_read64(TSC_OFFSET);
+	return host_tsc + tsc_offset;
+}
+
+/*
+ * writes 'guest_tsc' into guest's timestamp counter "register"
+ * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
+ */
+static void guest_write_tsc(u64 guest_tsc)
+{
+	u64 host_tsc;
+
+	rdtscll(host_tsc);
+	vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
+}
+
 static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 {
 	if (vcpu->rmode.active)
@@ -1177,7 +1204,6 @@ static int kvm_vcpu_setup(struct kvm_vcp
 	struct descriptor_table dt;
 	int i;
 	int ret;
-	u64 tsc;
 	int nr_good_msrs;

 
@@ -1247,8 +1273,7 @@ static int kvm_vcpu_setup(struct kvm_vcp
 	vmcs_write64(IO_BITMAP_A, 0);
 	vmcs_write64(IO_BITMAP_B, 0);

-	rdtscll(tsc);
-	vmcs_write64(TSC_OFFSET, -tsc);
+	guest_write_tsc(0);

 	vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */

@@ -2297,6 +2322,9 @@ static int handle_rdmsr(struct kvm_vcpu 
 		data = vmcs_readl(GUEST_GS_BASE);
 		break;
 #endif
+	case MSR_IA32_TIME_STAMP_COUNTER:
+		data = guest_read_tsc();
+		break;
 	case MSR_IA32_SYSENTER_CS:
 		data = vmcs_read32(GUEST_SYSENTER_CS);
 		break;
@@ -2374,8 +2402,6 @@ static void set_efer(struct kvm_vcpu *vc

 #endif

-#define MSR_IA32_TIME_STAMP_COUNTER 0x10
-
 static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	u32 ecx = vcpu->regs[VCPU_REGS_RCX];
@@ -2419,10 +2445,7 @@ static int handle_wrmsr(struct kvm_vcpu 
 		break;
 #endif
 	case MSR_IA32_TIME_STAMP_COUNTER: {
-		u64 tsc;
-
-		rdtscll(tsc);
-		vmcs_write64(TSC_OFFSET, data - tsc);
+		guest_write_tsc(data);
 		break;
 	}
 	case MSR_IA32_UCODE_REV:

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys - and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
Avi Kivity | 16 Nov 19:02

[PATCH 1/3] KVM: Expose interrupt bitmap

From: Uri Lublin <uril <at> qumranet.com>

Expose all not-yet-delivered interrupts to userspace.  This allows a guest
to be saved and resumed even if some interrupts are yet pending.

Signed-off-by: Uri Lublin <uril <at> qumranet.com>
Signed-off-by: Avi Kivity <avi <at> qumranet.com>

Index: linux-2.6/drivers/kvm/kvm.h
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm.h
+++ linux-2.6/drivers/kvm/kvm.h
@@ -13,6 +13,7 @@
 #include <linux/mm.h>

 #include "vmx.h"
+#include <linux/kvm.h>

 #define CR0_PE_MASK (1ULL << 0)
 #define CR0_TS_MASK (1ULL << 3)
@@ -164,7 +165,7 @@ struct kvm_vcpu {
 	int   cpu;
 	int   launched;
 	unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
-#define NR_IRQ_WORDS (256 / BITS_PER_LONG)
+#define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long)
 	unsigned long irq_pending[NR_IRQ_WORDS];
 	unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */
 	unsigned long rip;      /* needs vcpu_load_rsp_rip() */
Index: linux-2.6/drivers/kvm/kvm_main.c
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm_main.c
+++ linux-2.6/drivers/kvm/kvm_main.c
@@ -3000,7 +3000,8 @@ static int kvm_dev_ioctl_get_sregs(struc
 	sregs->efer = vcpu->shadow_efer;
 	sregs->apic_base = vcpu->apic_base;

-	sregs->pending_int = vcpu->irq_summary != 0;
+	memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
+	       sizeof sregs->interrupt_bitmap);

 	vcpu_put(vcpu);

@@ -3034,6 +3035,7 @@ static int kvm_dev_ioctl_set_sregs(struc
 {
 	struct kvm_vcpu *vcpu;
 	int mmu_reset_needed = 0;
+	int i;

 	if (sregs->vcpu < 0 || sregs->vcpu >= KVM_MAX_VCPUS)
 		return -EINVAL;
@@ -3083,6 +3085,14 @@ static int kvm_dev_ioctl_set_sregs(struc

 	if (mmu_reset_needed)
 		kvm_mmu_reset_context(vcpu);
+
+	memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
+	       sizeof vcpu->irq_pending);
+	vcpu->irq_summary = 0;
+	for (i = 0; i < NR_IRQ_WORDS; ++i)
+		if (vcpu->irq_pending[i])
+			__set_bit(i, &vcpu->irq_summary);
+
 	vcpu_put(vcpu);

 	return 0;
Index: linux-2.6/include/linux/kvm.h
===================================================================
--- linux-2.6.orig/include/linux/kvm.h
+++ linux-2.6/include/linux/kvm.h
@@ -11,6 +11,15 @@
 #include <asm/types.h>
 #include <linux/ioctl.h>

+/*
+ * Architectural interrupt line count, and the size of the bitmap needed
+ * to hold them.
+ */
+#define KVM_NR_INTERRUPTS 256
+#define KVM_IRQ_BITMAP_SIZE_BYTES    ((KVM_NR_INTERRUPTS + 7) / 8)
+#define KVM_IRQ_BITMAP_SIZE(type)    (KVM_IRQ_BITMAP_SIZE_BYTES / sizeof(type))
+
+
 /* for KVM_CREATE_MEMORY_REGION */
 struct kvm_memory_region {
 	__u32 slot;
@@ -129,10 +138,7 @@ struct kvm_sregs {
 	__u64 cr0, cr2, cr3, cr4, cr8;
 	__u64 efer;
 	__u64 apic_base;
-
-	/* out (KVM_GET_SREGS) */
-	__u32 pending_int;
-	__u32 padding2;
+	__u64 interrupt_bitmap[KVM_IRQ_BITMAP_SIZE(__u64)];
 };

 /* for KVM_TRANSLATE */

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys - and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

Gmane