Sign Up
Log In
Log In
or
Sign Up
Places
All Projects
Status Monitor
Collapse sidebar
home:lafenghu
xen
24887-x86-vmce-sr.patch
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
File 24887-x86-vmce-sr.patch of Package xen
References: bnc#745367 # HG changeset patch # User Jan Beulich <jbeulich@suse.com> # Date 1330070874 -3600 # Node ID 35d855e520038bf5814f53d105dc4adf58a670f2 # Parent e80b0bb4470b944a5b52a91c0ec85a1d65d18c55 x86/vMCE: save/restore MCA capabilities This allows migration to a host with less MCA banks than the source host had, while without this patch accesses to the excess banks' MSRs caused #GP-s in the guest after migration (and it depended on the guest kernel whether this would be fatal). A fundamental question is whether we should also save/restore MCG_CTL and MCi_CTL, as the HVM save record would better be defined to the complete state that needs saving from the beginning (I'm unsure whether the save/restore logic allows for future extension of an existing record). Of course, this change is expected to make migration from new to older Xen impossible (again I'm unsure what the save/restore logic does with records it doesn't even know about). The (trivial) tools side change may seem unrelated, but the code should have been that way from the beginning to allow the hypervisor to look at currently unused ext_vcpucontext fields without risking to read garbage when those fields get a meaning assigned in the future. This isn't being enforced here - should it be? (Obviously, for backwards compatibility, the hypervisor must assume these fields to be clear only when the extended context's size exceeds the old original one.) A future addition to this change might be to allow configuration of the number of banks and other MCA capabilities for a guest before it starts (i.e. to not inherits the values seen on the first host it runs on). Signed-off-by: Jan Beulich <jbeulich@suse.com> Acked-by: Keir Fraser <keir@xen.org> --- a/tools/libxc/xc_domain_save.c +++ b/tools/libxc/xc_domain_save.c @@ -1766,6 +1766,7 @@ int xc_domain_save(xc_interface *xch, in domctl.cmd = XEN_DOMCTL_get_ext_vcpucontext; domctl.domain = dom; + memset(&domctl.u, 0, sizeof(domctl.u)); domctl.u.ext_vcpucontext.vcpu = i; if ( xc_domctl(xch, &domctl) < 0 ) { --- a/tools/misc/xen-hvmctx.c +++ b/tools/misc/xen-hvmctx.c @@ -375,6 +375,13 @@ static void dump_viridian(void) (unsigned long long) p.guest_os_id); } +static void dump_vmce_vcpu(void) +{ + HVM_SAVE_TYPE(VMCE_VCPU) p; + READ(p); + printf(" VMCE_VCPU: caps %" PRIx64 "\n", p.caps); +} + int main(int argc, char **argv) { int entry, domid; @@ -440,6 +447,7 @@ int main(int argc, char **argv) case HVM_SAVE_CODE(PMTIMER): dump_pmtimer(); break; case HVM_SAVE_CODE(MTRR): dump_mtrr(); break; case HVM_SAVE_CODE(VIRIDIAN): dump_viridian(); break; + case HVM_SAVE_CODE(VMCE_VCPU): dump_vmce_vcpu(); break; case HVM_SAVE_CODE(END): break; default: printf(" ** Don't understand type %u: skipping\n", --- a/xen/arch/x86/cpu/mcheck/mce.h +++ b/xen/arch/x86/cpu/mcheck/mce.h @@ -3,6 +3,7 @@ #define _MCE_H #include <xen/init.h> +#include <xen/sched.h> #include <xen/smp.h> #include <asm/types.h> #include <asm/traps.h> @@ -54,8 +55,8 @@ int unmmap_broken_page(struct domain *d, u64 mce_cap_init(void); extern unsigned int firstbank; -int intel_mce_rdmsr(uint32_t msr, uint64_t *val); -int intel_mce_wrmsr(uint32_t msr, uint64_t val); +int intel_mce_rdmsr(const struct vcpu *, uint32_t msr, uint64_t *val); +int intel_mce_wrmsr(struct vcpu *, uint32_t msr, uint64_t val); int mce_available(struct cpuinfo_x86 *c); unsigned int mce_firstbank(struct cpuinfo_x86 *c); @@ -169,18 +170,20 @@ int vmce_domain_inject(struct mcinfo_ban extern int vmce_init(struct cpuinfo_x86 *c); -static inline int mce_vendor_bank_msr(uint32_t msr) +static inline int mce_vendor_bank_msr(const struct vcpu *v, uint32_t msr) { if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - msr >= MSR_IA32_MC0_CTL2 && msr < (MSR_IA32_MC0_CTL2 + nr_mce_banks) ) + msr >= MSR_IA32_MC0_CTL2 && + msr < MSR_IA32_MCx_CTL2(v->arch.mcg_cap & MCG_CAP_COUNT) ) return 1; return 0; } -static inline int mce_bank_msr(uint32_t msr) +static inline int mce_bank_msr(const struct vcpu *v, uint32_t msr) { - if ( (msr >= MSR_IA32_MC0_CTL && msr < MSR_IA32_MCx_CTL(nr_mce_banks)) || - mce_vendor_bank_msr(msr) ) + if ( (msr >= MSR_IA32_MC0_CTL && + msr < MSR_IA32_MCx_CTL(v->arch.mcg_cap & MCG_CAP_COUNT)) || + mce_vendor_bank_msr(v, msr) ) return 1; return 0; } --- a/xen/arch/x86/cpu/mcheck/mce_intel.c +++ b/xen/arch/x86/cpu/mcheck/mce_intel.c @@ -1322,11 +1322,12 @@ enum mcheck_type intel_mcheck_init(struc } /* intel specific MCA MSR */ -int intel_mce_wrmsr(uint32_t msr, uint64_t val) +int intel_mce_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val) { int ret = 0; - if (msr >= MSR_IA32_MC0_CTL2 && msr < (MSR_IA32_MC0_CTL2 + nr_mce_banks)) + if ( msr >= MSR_IA32_MC0_CTL2 && + msr < MSR_IA32_MCx_CTL2(v->arch.mcg_cap & MCG_CAP_COUNT) ) { mce_printk(MCE_QUIET, "We have disabled CMCI capability, " "Guest should not write this MSR!\n"); @@ -1336,11 +1337,12 @@ int intel_mce_wrmsr(uint32_t msr, uint64 return ret; } -int intel_mce_rdmsr(uint32_t msr, uint64_t *val) +int intel_mce_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val) { int ret = 0; - if (msr >= MSR_IA32_MC0_CTL2 && msr < (MSR_IA32_MC0_CTL2 + nr_mce_banks)) + if ( msr >= MSR_IA32_MC0_CTL2 && + msr < MSR_IA32_MCx_CTL2(v->arch.mcg_cap & MCG_CAP_COUNT) ) { mce_printk(MCE_QUIET, "We have disabled CMCI capability, " "Guest should not read this MSR!\n"); --- a/xen/arch/x86/cpu/mcheck/vmce.c +++ b/xen/arch/x86/cpu/mcheck/vmce.c @@ -10,6 +10,7 @@ #include <xen/delay.h> #include <xen/smp.h> #include <xen/mm.h> +#include <xen/hvm/save.h> #include <asm/processor.h> #include <public/sysctl.h> #include <asm/system.h> @@ -42,7 +43,6 @@ int vmce_init_msr(struct domain *d) nr_mce_banks * sizeof(*dom_vmce(d)->mci_ctl)); dom_vmce(d)->mcg_status = 0x0; - dom_vmce(d)->mcg_cap = g_mcg_cap; dom_vmce(d)->mcg_ctl = ~(uint64_t)0x0; dom_vmce(d)->nr_injection = 0; @@ -61,21 +61,41 @@ void vmce_destroy_msr(struct domain *d) dom_vmce(d) = NULL; } -static int bank_mce_rdmsr(struct domain *d, uint32_t msr, uint64_t *val) +void vmce_init_vcpu(struct vcpu *v) { - int bank, ret = 1; - struct domain_mca_msrs *vmce = dom_vmce(d); + v->arch.mcg_cap = g_mcg_cap; +} + +int vmce_restore_vcpu(struct vcpu *v, uint64_t caps) +{ + if ( caps & ~g_mcg_cap & ~MCG_CAP_COUNT & ~MCG_CTL_P ) + { + dprintk(XENLOG_G_ERR, "%s restore: unsupported MCA capabilities" + " %#" PRIx64 " for d%d:v%u (supported: %#Lx)\n", + is_hvm_vcpu(v) ? "HVM" : "PV", caps, v->domain->domain_id, + v->vcpu_id, g_mcg_cap & ~MCG_CAP_COUNT); + return -EPERM; + } + + v->arch.mcg_cap = caps; + return 0; +} + +static int bank_mce_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val) +{ + int ret = 1; + unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4; + struct domain_mca_msrs *vmce = dom_vmce(v->domain); struct bank_entry *entry; - bank = (msr - MSR_IA32_MC0_CTL) / 4; - if ( bank >= nr_mce_banks ) - return -1; + *val = 0; switch ( msr & (MSR_IA32_MC0_CTL | 3) ) { case MSR_IA32_MC0_CTL: - *val = vmce->mci_ctl[bank] & - (h_mci_ctrl ? h_mci_ctrl[bank] : ~0UL); + if ( bank < nr_mce_banks ) + *val = vmce->mci_ctl[bank] & + (h_mci_ctrl ? h_mci_ctrl[bank] : ~0UL); mce_printk(MCE_VERBOSE, "MCE: rdmsr MC%u_CTL 0x%"PRIx64"\n", bank, *val); break; @@ -126,7 +146,7 @@ static int bank_mce_rdmsr(struct domain switch ( boot_cpu_data.x86_vendor ) { case X86_VENDOR_INTEL: - ret = intel_mce_rdmsr(msr, val); + ret = intel_mce_rdmsr(v, msr, val); break; default: ret = 0; @@ -145,13 +165,13 @@ static int bank_mce_rdmsr(struct domain */ int vmce_rdmsr(uint32_t msr, uint64_t *val) { - struct domain *d = current->domain; - struct domain_mca_msrs *vmce = dom_vmce(d); + const struct vcpu *cur = current; + struct domain_mca_msrs *vmce = dom_vmce(cur->domain); int ret = 1; *val = 0; - spin_lock(&dom_vmce(d)->lock); + spin_lock(&vmce->lock); switch ( msr ) { @@ -162,39 +182,38 @@ int vmce_rdmsr(uint32_t msr, uint64_t *v "MCE: rdmsr MCG_STATUS 0x%"PRIx64"\n", *val); break; case MSR_IA32_MCG_CAP: - *val = vmce->mcg_cap; + *val = cur->arch.mcg_cap; mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CAP 0x%"PRIx64"\n", *val); break; case MSR_IA32_MCG_CTL: /* Always 0 if no CTL support */ - *val = vmce->mcg_ctl & h_mcg_ctl; + if ( cur->arch.mcg_cap & MCG_CTL_P ) + *val = vmce->mcg_ctl & h_mcg_ctl; mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CTL 0x%"PRIx64"\n", *val); break; default: - ret = mce_bank_msr(msr) ? bank_mce_rdmsr(d, msr, val) : 0; + ret = mce_bank_msr(cur, msr) ? bank_mce_rdmsr(cur, msr, val) : 0; break; } - spin_unlock(&dom_vmce(d)->lock); + spin_unlock(&vmce->lock); return ret; } -static int bank_mce_wrmsr(struct domain *d, u32 msr, u64 val) +static int bank_mce_wrmsr(struct vcpu *v, u32 msr, u64 val) { - int bank, ret = 1; - struct domain_mca_msrs *vmce = dom_vmce(d); + int ret = 1; + unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4; + struct domain_mca_msrs *vmce = dom_vmce(v->domain); struct bank_entry *entry = NULL; - bank = (msr - MSR_IA32_MC0_CTL) / 4; - if ( bank >= nr_mce_banks ) - return -EINVAL; - switch ( msr & (MSR_IA32_MC0_CTL | 3) ) { case MSR_IA32_MC0_CTL: - vmce->mci_ctl[bank] = val; + if ( bank < nr_mce_banks ) + vmce->mci_ctl[bank] = val; break; case MSR_IA32_MC0_STATUS: /* Give the first entry of the list, it corresponds to current @@ -202,9 +221,9 @@ static int bank_mce_wrmsr(struct domain * the guest, this node will be deleted. * Only error bank is written. Non-error banks simply return. */ - if ( !list_empty(&dom_vmce(d)->impact_header) ) + if ( !list_empty(&vmce->impact_header) ) { - entry = list_entry(dom_vmce(d)->impact_header.next, + entry = list_entry(vmce->impact_header.next, struct bank_entry, list); if ( entry->bank == bank ) entry->mci_status = val; @@ -228,7 +247,7 @@ static int bank_mce_wrmsr(struct domain switch ( boot_cpu_data.x86_vendor ) { case X86_VENDOR_INTEL: - ret = intel_mce_wrmsr(msr, val); + ret = intel_mce_wrmsr(v, msr, val); break; default: ret = 0; @@ -247,9 +266,9 @@ static int bank_mce_wrmsr(struct domain */ int vmce_wrmsr(u32 msr, u64 val) { - struct domain *d = current->domain; + struct vcpu *cur = current; struct bank_entry *entry = NULL; - struct domain_mca_msrs *vmce = dom_vmce(d); + struct domain_mca_msrs *vmce = dom_vmce(cur->domain); int ret = 1; if ( !g_mcg_cap ) @@ -266,7 +285,7 @@ int vmce_wrmsr(u32 msr, u64 val) vmce->mcg_status = val; mce_printk(MCE_VERBOSE, "MCE: wrmsr MCG_STATUS %"PRIx64"\n", val); /* For HVM guest, this is the point for deleting vMCE injection node */ - if ( d->is_hvm && (vmce->nr_injection > 0) ) + if ( is_hvm_vcpu(cur) && (vmce->nr_injection > 0) ) { vmce->nr_injection--; /* Should be 0 */ if ( !list_empty(&vmce->impact_header) ) @@ -293,7 +312,7 @@ int vmce_wrmsr(u32 msr, u64 val) ret = -1; break; default: - ret = mce_bank_msr(msr) ? bank_mce_wrmsr(d, msr, val) : 0; + ret = mce_bank_msr(cur, msr) ? bank_mce_wrmsr(cur, msr, val) : 0; break; } @@ -301,6 +320,46 @@ int vmce_wrmsr(u32 msr, u64 val) return ret; } +static int vmce_save_vcpu_ctxt(struct domain *d, hvm_domain_context_t *h) +{ + struct vcpu *v; + int err = 0; + + for_each_vcpu( d, v ) { + struct hvm_vmce_vcpu ctxt = { + .caps = v->arch.mcg_cap + }; + + err = hvm_save_entry(VMCE_VCPU, v->vcpu_id, h, &ctxt); + if ( err ) + break; + } + + return err; +} + +static int vmce_load_vcpu_ctxt(struct domain *d, hvm_domain_context_t *h) +{ + unsigned int vcpuid = hvm_load_instance(h); + struct vcpu *v; + struct hvm_vmce_vcpu ctxt; + int err; + + if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL ) + { + dprintk(XENLOG_G_ERR, "HVM restore: dom%d has no vcpu%u\n", + d->domain_id, vcpuid); + err = -EINVAL; + } + else + err = hvm_load_entry(VMCE_VCPU, h, &ctxt); + + return err ?: vmce_restore_vcpu(v, ctxt.caps); +} + +HVM_REGISTER_SAVE_RESTORE(VMCE_VCPU, vmce_save_vcpu_ctxt, + vmce_load_vcpu_ctxt, 1, HVMSR_PER_VCPU); + int inject_vmce(struct domain *d) { int cpu = smp_processor_id(); --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -376,6 +376,8 @@ int vcpu_initialise(struct vcpu *v) if ( (rc = xsave_alloc_save_area(v)) != 0 ) return rc; + vmce_init_vcpu(v); + if ( is_hvm_domain(d) ) { if ( (rc = hvm_vcpu_initialise(v)) != 0 ) --- a/xen/arch/x86/domctl.c +++ b/xen/arch/x86/domctl.c @@ -1137,11 +1137,12 @@ long arch_do_domctl( evc->syscall32_callback_eip = 0; evc->syscall32_disables_events = 0; #endif + evc->mcg_cap = v->arch.mcg_cap; } else { ret = -EINVAL; - if ( evc->size != sizeof(*evc) ) + if ( evc->size < offsetof(typeof(*evc), mcg_cap) ) goto ext_vcpucontext_out; #ifdef __x86_64__ fixup_guest_code_selector(d, evc->sysenter_callback_cs); @@ -1160,6 +1161,9 @@ long arch_do_domctl( evc->syscall32_callback_eip ) goto ext_vcpucontext_out; #endif + if ( evc->size >= offsetof(typeof(*evc), mcg_cap) + + sizeof(evc->mcg_cap) ) + ret = vmce_restore_vcpu(v, evc->mcg_cap); } ret = 0; --- a/xen/include/asm-x86/domain.h +++ b/xen/include/asm-x86/domain.h @@ -425,6 +425,8 @@ struct arch_vcpu */ uint64_t xcr0_accum; + uint64_t mcg_cap; + /* Current LDT details. */ unsigned long shadow_ldt_mapcnt; spinlock_t shadow_ldt_lock; --- a/xen/include/asm-x86/mce.h +++ b/xen/include/asm-x86/mce.h @@ -16,7 +16,6 @@ struct bank_entry { struct domain_mca_msrs { /* Guest should not change below values after DOM boot up */ - uint64_t mcg_cap; uint64_t mcg_ctl; uint64_t mcg_status; uint64_t *mci_ctl; @@ -28,6 +27,8 @@ struct domain_mca_msrs /* Guest vMCE MSRs virtualization */ extern int vmce_init_msr(struct domain *d); extern void vmce_destroy_msr(struct domain *d); +extern void vmce_init_vcpu(struct vcpu *); +extern int vmce_restore_vcpu(struct vcpu *, uint64_t caps); extern int vmce_wrmsr(uint32_t msr, uint64_t val); extern int vmce_rdmsr(uint32_t msr, uint64_t *val); #endif --- a/xen/include/asm-x86/msr-index.h +++ b/xen/include/asm-x86/msr-index.h @@ -101,48 +101,40 @@ #define MSR_AMD64_MC0_MASK 0xc0010044 #define MSR_IA32_MC1_CTL 0x00000404 -#define MSR_IA32_MC1_CTL2 0x00000281 #define MSR_IA32_MC1_STATUS 0x00000405 #define MSR_IA32_MC1_ADDR 0x00000406 #define MSR_IA32_MC1_MISC 0x00000407 #define MSR_IA32_MC2_CTL 0x00000408 -#define MSR_IA32_MC2_CTL2 0x00000282 #define MSR_IA32_MC2_STATUS 0x00000409 #define MSR_IA32_MC2_ADDR 0x0000040A #define MSR_IA32_MC2_MISC 0x0000040B -#define MSR_IA32_MC3_CTL2 0x00000283 #define MSR_IA32_MC3_CTL 0x0000040C #define MSR_IA32_MC3_STATUS 0x0000040D #define MSR_IA32_MC3_ADDR 0x0000040E #define MSR_IA32_MC3_MISC 0x0000040F -#define MSR_IA32_MC4_CTL2 0x00000284 #define MSR_IA32_MC4_CTL 0x00000410 #define MSR_IA32_MC4_STATUS 0x00000411 #define MSR_IA32_MC4_ADDR 0x00000412 #define MSR_IA32_MC4_MISC 0x00000413 -#define MSR_IA32_MC5_CTL2 0x00000285 #define MSR_IA32_MC5_CTL 0x00000414 #define MSR_IA32_MC5_STATUS 0x00000415 #define MSR_IA32_MC5_ADDR 0x00000416 #define MSR_IA32_MC5_MISC 0x00000417 -#define MSR_IA32_MC6_CTL2 0x00000286 #define MSR_IA32_MC6_CTL 0x00000418 #define MSR_IA32_MC6_STATUS 0x00000419 #define MSR_IA32_MC6_ADDR 0x0000041A #define MSR_IA32_MC6_MISC 0x0000041B -#define MSR_IA32_MC7_CTL2 0x00000287 #define MSR_IA32_MC7_CTL 0x0000041C #define MSR_IA32_MC7_STATUS 0x0000041D #define MSR_IA32_MC7_ADDR 0x0000041E #define MSR_IA32_MC7_MISC 0x0000041F -#define MSR_IA32_MC8_CTL2 0x00000288 #define MSR_IA32_MC8_CTL 0x00000420 #define MSR_IA32_MC8_STATUS 0x00000421 #define MSR_IA32_MC8_ADDR 0x00000422 @@ -152,6 +144,7 @@ #define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x)) #define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x)) #define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x)) +#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) #define MSR_AMD64_MCx_MASK(x) (MSR_AMD64_MC0_MASK + (x)) --- a/xen/include/public/arch-x86/hvm/save.h +++ b/xen/include/public/arch-x86/hvm/save.h @@ -580,9 +580,15 @@ struct hvm_hw_cpu_xsave { #define CPU_XSAVE_CODE 16 +struct hvm_vmce_vcpu { + uint64_t caps; +}; + +DECLARE_HVM_SAVE_TYPE(VMCE_VCPU, 18, struct hvm_vmce_vcpu); + /* * Largest type-code in use */ -#define HVM_SAVE_CODE_MAX 16 +#define HVM_SAVE_CODE_MAX 18 #endif /* __XEN_PUBLIC_HVM_SAVE_X86_H__ */ --- a/xen/include/public/domctl.h +++ b/xen/include/public/domctl.h @@ -558,7 +558,7 @@ struct xen_domctl_ext_vcpucontext { uint32_t vcpu; /* * SET: Size of struct (IN) - * GET: Size of struct (OUT) + * GET: Size of struct (OUT, up to 128 bytes) */ uint32_t size; #if defined(__i386__) || defined(__x86_64__) @@ -570,6 +570,7 @@ struct xen_domctl_ext_vcpucontext { uint16_t sysenter_callback_cs; uint8_t syscall32_disables_events; uint8_t sysenter_disables_events; + uint64_aligned_t mcg_cap; #endif }; typedef struct xen_domctl_ext_vcpucontext xen_domctl_ext_vcpucontext_t;
Locations
Projects
Search
Status Monitor
Help
OpenBuildService.org
Documentation
API Documentation
Code of Conduct
Contact
Support
@OBShq
Terms
openSUSE Build Service is sponsored by
The Open Build Service is an
openSUSE project
.
Sign Up
Log In
Places
Places
All Projects
Status Monitor