From 58219c1ab32be51e23f672ec52d2c6a86b0489d8 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 1 Oct 2015 12:26:50 +0200 Subject: [PATCH 001/123] tools lib traceevent: update KVM plugin The format of the role word has changed through the years and the plugin was never updated; some VMX exit reasons were missing too. Signed-off-by: Paolo Bonzini --- tools/lib/traceevent/plugin_kvm.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/tools/lib/traceevent/plugin_kvm.c b/tools/lib/traceevent/plugin_kvm.c index 88fe83dff7cd..18536f756577 100644 --- a/tools/lib/traceevent/plugin_kvm.c +++ b/tools/lib/traceevent/plugin_kvm.c @@ -124,7 +124,10 @@ static const char *disassemble(unsigned char *insn, int len, uint64_t rip, _ER(WBINVD, 54) \ _ER(XSETBV, 55) \ _ER(APIC_WRITE, 56) \ - _ER(INVPCID, 58) + _ER(INVPCID, 58) \ + _ER(PML_FULL, 62) \ + _ER(XSAVES, 63) \ + _ER(XRSTORS, 64) #define SVM_EXIT_REASONS \ _ER(EXIT_READ_CR0, 0x000) \ @@ -352,15 +355,18 @@ static int kvm_nested_vmexit_handler(struct trace_seq *s, struct pevent_record * union kvm_mmu_page_role { unsigned word; struct { - unsigned glevels:4; unsigned level:4; + unsigned cr4_pae:1; unsigned quadrant:2; - unsigned pad_for_nice_hex_output:6; unsigned direct:1; unsigned access:3; unsigned invalid:1; - unsigned cr4_pge:1; unsigned nxe:1; + unsigned cr0_wp:1; + unsigned smep_and_not_wp:1; + unsigned smap_and_not_wp:1; + unsigned pad_for_nice_hex_output:8; + unsigned smm:8; }; }; @@ -385,15 +391,18 @@ static int kvm_mmu_print_role(struct trace_seq *s, struct pevent_record *record, if (pevent_is_file_bigendian(event->pevent) == pevent_is_host_bigendian(event->pevent)) { - trace_seq_printf(s, "%u/%u q%u%s %s%s %spge %snxe", + trace_seq_printf(s, "%u q%u%s %s%s %spae %snxe %swp%s%s%s", role.level, - role.glevels, role.quadrant, role.direct ? " direct" : "", access_str[role.access], role.invalid ? " invalid" : "", - role.cr4_pge ? "" : "!", - role.nxe ? "" : "!"); + role.cr4_pae ? "" : "!", + role.nxe ? "" : "!", + role.cr0_wp ? "" : "!", + role.smep_and_not_wp ? " smep" : "", + role.smap_and_not_wp ? " smap" : "", + role.smm ? " smm" : ""); } else trace_seq_printf(s, "WORD: %08x", role.word); From bdaffe1d93e7eddbcc71d074a5d49eba7fe1c765 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 29 Jul 2015 15:03:06 +0200 Subject: [PATCH 002/123] KVM: x86: set TMR when the interrupt is accepted Do not compute TMR in advance. Instead, set the TMR just before the interrupt is accepted into the IRR. This limits the coupling between IOAPIC and LAPIC. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/ioapic.c | 9 ++------- arch/x86/kvm/ioapic.h | 3 +-- arch/x86/kvm/lapic.c | 19 ++++++++++--------- arch/x86/kvm/lapic.h | 1 - arch/x86/kvm/x86.c | 5 +---- 5 files changed, 14 insertions(+), 23 deletions(-) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 856f79105bb5..eaf4ec38d980 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -246,8 +246,7 @@ static void update_handled_vectors(struct kvm_ioapic *ioapic) smp_wmb(); } -void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap, - u32 *tmr) +void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; union kvm_ioapic_redirect_entry *e; @@ -260,13 +259,9 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap, kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) || index == RTC_GSI) { if (kvm_apic_match_dest(vcpu, NULL, 0, - e->fields.dest_id, e->fields.dest_mode)) { + e->fields.dest_id, e->fields.dest_mode)) __set_bit(e->fields.vector, (unsigned long *)eoi_exit_bitmap); - if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG) - __set_bit(e->fields.vector, - (unsigned long *)tmr); - } } } spin_unlock(&ioapic->lock); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index ca0b0b4e6256..3dbd0e2aac4e 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -120,7 +120,6 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, unsigned long *dest_map); int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); -void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap, - u32 *tmr); +void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); #endif diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 8d9013c5e1ee..5693dd9fc163 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -551,15 +551,6 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); } -void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - int i; - - for (i = 0; i < 8; i++) - apic_set_reg(apic, APIC_TMR + 0x10 * i, tmr[i]); -} - static void apic_update_ppr(struct kvm_lapic *apic) { u32 tpr, isrv, ppr, old_ppr; @@ -781,6 +772,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, case APIC_DM_LOWEST: vcpu->arch.apic_arb_prio++; case APIC_DM_FIXED: + if (unlikely(trig_mode && !level)) + break; + /* FIXME add logic for vcpu on reset */ if (unlikely(!apic_enabled(apic))) break; @@ -790,6 +784,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (dest_map) __set_bit(vcpu->vcpu_id, dest_map); + if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) { + if (trig_mode) + apic_set_vector(vector, apic->regs + APIC_TMR); + else + apic_clear_vector(vector, apic->regs + APIC_TMR); + } + if (kvm_x86_ops->deliver_posted_interrupt) kvm_x86_ops->deliver_posted_interrupt(vcpu, vector); else { diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 764037991d26..eb46d6bcaa75 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -57,7 +57,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); void kvm_apic_set_version(struct kvm_vcpu *vcpu); -void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr); void __kvm_apic_update_irr(u32 *pir, void *regs); void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 92511d4b7236..c1ed74ebc502 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6144,17 +6144,14 @@ static void process_smi(struct kvm_vcpu *vcpu) static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) { u64 eoi_exit_bitmap[4]; - u32 tmr[8]; if (!kvm_apic_hw_enabled(vcpu->arch.apic)) return; memset(eoi_exit_bitmap, 0, 32); - memset(tmr, 0, 32); - kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap, tmr); + kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap); kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); - kvm_apic_update_tmr(vcpu, tmr); } static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu) From 3bb345f387dd26beb097cf776e342bc0d96d805a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 29 Jul 2015 10:43:18 +0200 Subject: [PATCH 003/123] KVM: x86: store IOAPIC-handled vectors in each VCPU We can reuse the algorithm that computes the EOI exit bitmap to figure out which vectors are handled by the IOAPIC. The only difference between the two is for edge-triggered interrupts other than IRQ8 that have no notifiers active; however, the IOAPIC does not have to do anything special for these interrupts anyway. This again limits the interactions between the IOAPIC and the LAPIC, making it easier to move the former to userspace. Inspired by a patch from Steve Rutherford. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/ioapic.c | 18 ++---------------- arch/x86/kvm/ioapic.h | 8 -------- arch/x86/kvm/lapic.c | 10 ++++++++-- arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/vmx.c | 3 ++- arch/x86/kvm/x86.c | 8 +++----- 7 files changed, 18 insertions(+), 34 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2beee0382088..33609c2c743b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -396,6 +396,7 @@ struct kvm_vcpu_arch { u64 efer; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ + u64 eoi_exit_bitmap[4]; unsigned long apic_attention; int32_t apic_arb_prio; int mp_state; @@ -822,7 +823,7 @@ struct kvm_x86_ops { int (*vm_has_apicv)(struct kvm *kvm); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm *kvm, int isr); - void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); + void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu); void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index eaf4ec38d980..2dcda0f188ba 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -233,19 +233,6 @@ static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr) } -static void update_handled_vectors(struct kvm_ioapic *ioapic) -{ - DECLARE_BITMAP(handled_vectors, 256); - int i; - - memset(handled_vectors, 0, sizeof(handled_vectors)); - for (i = 0; i < IOAPIC_NUM_PINS; ++i) - __set_bit(ioapic->redirtbl[i].fields.vector, handled_vectors); - memcpy(ioapic->handled_vectors, handled_vectors, - sizeof(handled_vectors)); - smp_wmb(); -} - void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; @@ -310,7 +297,6 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) e->bits |= (u32) val; e->fields.remote_irr = 0; } - update_handled_vectors(ioapic); mask_after = e->fields.mask; if (mask_before != mask_after) kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after); @@ -594,7 +580,6 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic) ioapic->id = 0; memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS); rtc_irq_eoi_tracking_reset(ioapic); - update_handled_vectors(ioapic); } static const struct kvm_io_device_ops ioapic_mmio_ops = { @@ -623,8 +608,10 @@ int kvm_ioapic_init(struct kvm *kvm) if (ret < 0) { kvm->arch.vioapic = NULL; kfree(ioapic); + return ret; } + kvm_vcpu_request_scan_ioapic(kvm); return ret; } @@ -661,7 +648,6 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); ioapic->irr = 0; ioapic->irr_delivered = 0; - update_handled_vectors(ioapic); kvm_vcpu_request_scan_ioapic(kvm); kvm_ioapic_inject_all(ioapic, state->irr); spin_unlock(&ioapic->lock); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 3dbd0e2aac4e..bf36d66a1951 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -73,7 +73,6 @@ struct kvm_ioapic { struct kvm *kvm; void (*ack_notifier)(void *opaque, int irq); spinlock_t lock; - DECLARE_BITMAP(handled_vectors, 256); struct rtc_status rtc_status; struct delayed_work eoi_inject; u32 irq_eoi[IOAPIC_NUM_PINS]; @@ -98,13 +97,6 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) return kvm->arch.vioapic; } -static inline bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector) -{ - struct kvm_ioapic *ioapic = kvm->arch.vioapic; - smp_rmb(); - return test_bit(vector, ioapic->handled_vectors); -} - void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 5693dd9fc163..4c30fb0a48a1 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -869,14 +869,20 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; } +static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector) +{ + return test_bit(vector, (ulong *)apic->vcpu->arch.eoi_exit_bitmap); +} + static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) { - if (kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { + if (kvm_ioapic_handles_vector(apic, vector)) { int trigger_mode; if (apic_test_vector(vector, apic->regs + APIC_TMR)) trigger_mode = IOAPIC_LEVEL_TRIG; else trigger_mode = IOAPIC_EDGE_TRIG; + kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode); } } @@ -1923,7 +1929,7 @@ static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu, /* Cache not set: could be safe but we don't bother. */ apic->highest_isr_cache == -1 || /* Need EOI to update ioapic. */ - kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) { + kvm_ioapic_handles_vector(apic, apic->highest_isr_cache)) { /* * PV EOI was disabled by apic_sync_pv_eoi_from_guest * so we need not do anything here. diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 2f9ed1ff0632..09582081276e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3664,7 +3664,7 @@ static int svm_vm_has_apicv(struct kvm *kvm) return 0; } -static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu) { return; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 06ef4908ba61..9381b1d34313 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8043,8 +8043,9 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) } } -static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu) { + u64 *eoi_exit_bitmap = vcpu->arch.eoi_exit_bitmap; if (!vmx_vm_has_apicv(vcpu->kvm)) return; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c1ed74ebc502..c4adb8847b23 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6143,15 +6143,13 @@ static void process_smi(struct kvm_vcpu *vcpu) static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) { - u64 eoi_exit_bitmap[4]; - if (!kvm_apic_hw_enabled(vcpu->arch.apic)) return; - memset(eoi_exit_bitmap, 0, 32); + memset(vcpu->arch.eoi_exit_bitmap, 0, 256 / 8); - kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap); - kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); + kvm_ioapic_scan_entry(vcpu, vcpu->arch.eoi_exit_bitmap); + kvm_x86_ops->load_eoi_exitmap(vcpu); } static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu) From d50ab6c1a2b24e12d3012d7beb343eba5b94a6ca Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 29 Jul 2015 11:49:59 +0200 Subject: [PATCH 004/123] KVM: x86: replace vm_has_apicv hook with cpu_uses_apicv This will avoid an unnecessary trip to ->kvm and from there to the VPIC. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/irq.c | 2 +- arch/x86/kvm/lapic.c | 4 ++-- arch/x86/kvm/lapic.h | 4 ++-- arch/x86/kvm/svm.c | 4 ++-- arch/x86/kvm/vmx.c | 8 +++++++- 6 files changed, 15 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 33609c2c743b..a0ef289d5a86 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -820,7 +820,7 @@ struct kvm_x86_ops { void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); - int (*vm_has_apicv)(struct kvm *kvm); + int (*cpu_uses_apicv)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm *kvm, int isr); void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index a1ec6a50a05a..c0dad893dc59 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -63,7 +63,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) if (kvm_cpu_has_extint(v)) return 1; - if (kvm_apic_vid_enabled(v->kvm)) + if (kvm_vcpu_apic_vid_enabled(v)) return 0; return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4c30fb0a48a1..c568d69c7060 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -390,7 +390,7 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) vcpu = apic->vcpu; - if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) { + if (unlikely(kvm_vcpu_apic_vid_enabled(vcpu))) { /* try to update RVI */ apic_clear_vector(vec, apic->regs + APIC_IRR); kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -1622,7 +1622,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); } - apic->irr_pending = kvm_apic_vid_enabled(vcpu->kvm); + apic->irr_pending = kvm_vcpu_apic_vid_enabled(vcpu); apic->isr_count = kvm_x86_ops->hwapic_isr_update ? 1 : 0; apic->highest_isr_cache = -1; update_divide_count(apic); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index eb46d6bcaa75..7259d272416f 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -143,9 +143,9 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic) return apic->vcpu->arch.apic_base & X2APIC_ENABLE; } -static inline bool kvm_apic_vid_enabled(struct kvm *kvm) +static inline bool kvm_vcpu_apic_vid_enabled(struct kvm_vcpu *vcpu) { - return kvm_x86_ops->vm_has_apicv(kvm); + return kvm_x86_ops->cpu_uses_apicv(vcpu); } static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 09582081276e..9a37e487aca0 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3659,7 +3659,7 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) return; } -static int svm_vm_has_apicv(struct kvm *kvm) +static int svm_cpu_uses_apicv(struct kvm_vcpu *vcpu) { return 0; } @@ -4425,7 +4425,7 @@ static struct kvm_x86_ops svm_x86_ops = { .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode, - .vm_has_apicv = svm_vm_has_apicv, + .cpu_uses_apicv = svm_cpu_uses_apicv, .load_eoi_exitmap = svm_load_eoi_exitmap, .sync_pir_to_irr = svm_sync_pir_to_irr, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9381b1d34313..b8a90c4f676f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -810,6 +810,7 @@ static void kvm_cpu_vmxoff(void); static bool vmx_mpx_supported(void); static bool vmx_xsaves_supported(void); static int vmx_vm_has_apicv(struct kvm *kvm); +static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -4337,6 +4338,11 @@ static int vmx_vm_has_apicv(struct kvm *kvm) return enable_apicv && irqchip_in_kernel(kvm); } +static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu) +{ + return vmx_vm_has_apicv(vcpu->kvm); +} + static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -10362,7 +10368,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .update_cr8_intercept = update_cr8_intercept, .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode, .set_apic_access_page_addr = vmx_set_apic_access_page_addr, - .vm_has_apicv = vmx_vm_has_apicv, + .cpu_uses_apicv = vmx_cpu_uses_apicv, .load_eoi_exitmap = vmx_load_eoi_exitmap, .hwapic_irr_update = vmx_hwapic_irr_update, .hwapic_isr_update = vmx_hwapic_isr_update, From 35754c987f252e859bfa390a6816e85563afe79d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 29 Jul 2015 12:05:37 +0200 Subject: [PATCH 005/123] KVM: x86: introduce lapic_in_kernel Avoid pointer chasing and memory barriers, and simplify the code when split irqchip (LAPIC in kernel, IOAPIC/PIC in userspace) is introduced. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/irq.c | 6 +++--- arch/x86/kvm/irq.h | 8 ++++++++ arch/x86/kvm/lapic.c | 4 ++-- arch/x86/kvm/mmu.c | 2 +- arch/x86/kvm/svm.c | 4 ++-- arch/x86/kvm/vmx.c | 46 +++++++++++++++++++------------------------- arch/x86/kvm/x86.c | 18 ++++++++--------- 7 files changed, 45 insertions(+), 43 deletions(-) diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index c0dad893dc59..b653ae202c8e 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -57,7 +57,7 @@ static int kvm_cpu_has_extint(struct kvm_vcpu *v) */ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) { - if (!irqchip_in_kernel(v->kvm)) + if (!lapic_in_kernel(v)) return v->arch.interrupt.pending; if (kvm_cpu_has_extint(v)) @@ -75,7 +75,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) */ int kvm_cpu_has_interrupt(struct kvm_vcpu *v) { - if (!irqchip_in_kernel(v->kvm)) + if (!lapic_in_kernel(v)) return v->arch.interrupt.pending; if (kvm_cpu_has_extint(v)) @@ -103,7 +103,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) { int vector; - if (!irqchip_in_kernel(v->kvm)) + if (!lapic_in_kernel(v)) return v->arch.interrupt.nr; vector = kvm_cpu_get_extint(v); diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 3d782a2c336a..9e6e7e04de98 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -92,6 +92,14 @@ static inline int irqchip_in_kernel(struct kvm *kvm) return vpic != NULL; } +static inline int lapic_in_kernel(struct kvm_vcpu *vcpu) +{ + /* Same as irqchip_in_kernel(vcpu->kvm), but with less + * pointer chasing and no unnecessary memory barriers. + */ + return vcpu->arch.apic != NULL; +} + void kvm_pic_reset(struct kvm_kpic_state *s); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c568d69c7060..c4bcc86d6dc4 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1985,7 +1985,7 @@ int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) struct kvm_lapic *apic = vcpu->arch.apic; u32 reg = (msr - APIC_BASE_MSR) << 4; - if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) + if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic)) return 1; if (reg == APIC_ICR2) @@ -2002,7 +2002,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) struct kvm_lapic *apic = vcpu->arch.apic; u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0; - if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) + if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic)) return 1; if (reg == APIC_DFR || reg == APIC_ICR2) { diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ff606f507913..c3f39aa9b9cb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3427,7 +3427,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) static bool can_do_async_pf(struct kvm_vcpu *vcpu) { - if (unlikely(!irqchip_in_kernel(vcpu->kvm) || + if (unlikely(!lapic_in_kernel(vcpu) || kvm_event_needs_reinjection(vcpu))) return false; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9a37e487aca0..34c089023827 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3060,7 +3060,7 @@ static int cr8_write_interception(struct vcpu_svm *svm) u8 cr8_prev = kvm_get_cr8(&svm->vcpu); /* instruction emulation calls kvm_set_cr8() */ r = cr_interception(svm); - if (irqchip_in_kernel(svm->vcpu.kvm)) + if (lapic_in_kernel(&svm->vcpu)) return r; if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) return r; @@ -3305,7 +3305,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm) * If the user space waits to inject interrupts, exit as soon as * possible */ - if (!irqchip_in_kernel(svm->vcpu.kvm) && + if (!lapic_in_kernel(&svm->vcpu) && kvm_run->request_interrupt_window && !kvm_cpu_has_interrupt(&svm->vcpu)) { kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b8a90c4f676f..4729b7f6e5f2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -809,7 +809,6 @@ static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); static bool vmx_mpx_supported(void); static bool vmx_xsaves_supported(void); -static int vmx_vm_has_apicv(struct kvm *kvm); static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, @@ -947,9 +946,9 @@ static inline bool cpu_has_vmx_tpr_shadow(void) return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; } -static inline bool vm_need_tpr_shadow(struct kvm *kvm) +static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu) { - return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)); + return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu); } static inline bool cpu_has_secondary_exec_ctrls(void) @@ -1063,9 +1062,9 @@ static inline bool cpu_has_vmx_ple(void) SECONDARY_EXEC_PAUSE_LOOP_EXITING; } -static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm) +static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) { - return flexpriority_enabled && irqchip_in_kernel(kvm); + return flexpriority_enabled && lapic_in_kernel(vcpu); } static inline bool cpu_has_vmx_vpid(void) @@ -2378,7 +2377,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | PIN_BASED_VMX_PREEMPTION_TIMER; - if (vmx_vm_has_apicv(vmx->vcpu.kvm)) + if (vmx_cpu_uses_apicv(&vmx->vcpu)) vmx->nested.nested_vmx_pinbased_ctls_high |= PIN_BASED_POSTED_INTR; @@ -4333,14 +4332,9 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 msr) msr, MSR_TYPE_W); } -static int vmx_vm_has_apicv(struct kvm *kvm) -{ - return enable_apicv && irqchip_in_kernel(kvm); -} - static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu) { - return vmx_vm_has_apicv(vcpu->kvm); + return enable_apicv && lapic_in_kernel(vcpu); } static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) @@ -4520,7 +4514,7 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) { u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; - if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) + if (!vmx_cpu_uses_apicv(&vmx->vcpu)) pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; return pin_based_exec_ctrl; } @@ -4532,7 +4526,7 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) exec_control &= ~CPU_BASED_MOV_DR_EXITING; - if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { + if (!cpu_need_tpr_shadow(&vmx->vcpu)) { exec_control &= ~CPU_BASED_TPR_SHADOW; #ifdef CONFIG_X86_64 exec_control |= CPU_BASED_CR8_STORE_EXITING | @@ -4549,7 +4543,7 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; - if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) + if (!cpu_need_virtualize_apic_accesses(&vmx->vcpu)) exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; if (vmx->vpid == 0) exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; @@ -4563,7 +4557,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; if (!ple_gap) exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; - if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) + if (!vmx_cpu_uses_apicv(&vmx->vcpu)) exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; @@ -4624,7 +4618,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmx_secondary_exec_control(vmx)); } - if (vmx_vm_has_apicv(vmx->vcpu.kvm)) { + if (vmx_cpu_uses_apicv(&vmx->vcpu)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); vmcs_write64(EOI_EXIT_BITMAP1, 0); vmcs_write64(EOI_EXIT_BITMAP2, 0); @@ -4768,7 +4762,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (cpu_has_vmx_tpr_shadow() && !init_event) { vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); - if (vm_need_tpr_shadow(vcpu->kvm)) + if (cpu_need_tpr_shadow(vcpu)) vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, __pa(vcpu->arch.apic->regs)); vmcs_write32(TPR_THRESHOLD, 0); @@ -4776,7 +4770,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - if (vmx_vm_has_apicv(vcpu->kvm)) + if (vmx_cpu_uses_apicv(vcpu)) memset(&vmx->pi_desc, 0, sizeof(struct pi_desc)); if (vmx->vpid != 0) @@ -5316,7 +5310,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) u8 cr8 = (u8)val; err = kvm_set_cr8(vcpu, cr8); kvm_complete_insn_gp(vcpu, err); - if (irqchip_in_kernel(vcpu->kvm)) + if (lapic_in_kernel(vcpu)) return 1; if (cr8_prev <= cr8) return 1; @@ -5535,7 +5529,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu) * If the user space waits to inject interrupts, exit as soon as * possible */ - if (!irqchip_in_kernel(vcpu->kvm) && + if (!lapic_in_kernel(vcpu) && vcpu->run->request_interrupt_window && !kvm_cpu_has_interrupt(vcpu)) { vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; @@ -7944,10 +7938,10 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) * apicv */ if (!cpu_has_vmx_virtualize_x2apic_mode() || - !vmx_vm_has_apicv(vcpu->kvm)) + !vmx_cpu_uses_apicv(vcpu)) return; - if (!vm_need_tpr_shadow(vcpu->kvm)) + if (!cpu_need_tpr_shadow(vcpu)) return; sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); @@ -8052,7 +8046,7 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu) { u64 *eoi_exit_bitmap = vcpu->arch.eoi_exit_bitmap; - if (!vmx_vm_has_apicv(vcpu->kvm)) + if (!vmx_cpu_uses_apicv(vcpu)) return; vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); @@ -8551,7 +8545,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) put_cpu(); if (err) goto free_vmcs; - if (vm_need_virtualize_apic_accesses(kvm)) { + if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { err = alloc_apic_access_page(kvm); if (err) goto free_vmcs; @@ -9344,7 +9338,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(vmx->nested.apic_access_page)); } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) && - (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) { + cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; kvm_vcpu_reload_apic_access_page(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c4adb8847b23..88fb3bf2ae6d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -788,7 +788,7 @@ int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) { if (cr8 & CR8_RESERVED_BITS) return 1; - if (irqchip_in_kernel(vcpu->kvm)) + if (lapic_in_kernel(vcpu)) kvm_lapic_set_tpr(vcpu, cr8); else vcpu->arch.cr8 = cr8; @@ -798,7 +798,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr8); unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) { - if (irqchip_in_kernel(vcpu->kvm)) + if (lapic_in_kernel(vcpu)) return kvm_lapic_get_cr8(vcpu); else return vcpu->arch.cr8; @@ -3175,7 +3175,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_vapic_addr va; r = -EINVAL; - if (!irqchip_in_kernel(vcpu->kvm)) + if (!lapic_in_kernel(vcpu)) goto out; r = -EFAULT; if (copy_from_user(&va, argp, sizeof va)) @@ -5666,7 +5666,7 @@ void kvm_arch_exit(void) int kvm_vcpu_halt(struct kvm_vcpu *vcpu) { ++vcpu->stat.halt_exits; - if (irqchip_in_kernel(vcpu->kvm)) { + if (lapic_in_kernel(vcpu)) { vcpu->arch.mp_state = KVM_MP_STATE_HALTED; return 1; } else { @@ -6162,7 +6162,7 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) { struct page *page = NULL; - if (!irqchip_in_kernel(vcpu->kvm)) + if (!lapic_in_kernel(vcpu)) return; if (!kvm_x86_ops->set_apic_access_page_addr) @@ -6200,7 +6200,7 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, static int vcpu_enter_guest(struct kvm_vcpu *vcpu) { int r; - bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && + bool req_int_win = !lapic_in_kernel(vcpu) && vcpu->run->request_interrupt_window; bool req_immediate_exit = false; @@ -6597,7 +6597,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } /* re-sync apic's tpr */ - if (!irqchip_in_kernel(vcpu->kvm)) { + if (!lapic_in_kernel(vcpu)) { if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) { r = -EINVAL; goto out; @@ -7297,7 +7297,7 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) { - return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL); + return irqchip_in_kernel(vcpu->kvm) == lapic_in_kernel(vcpu); } struct static_key kvm_no_apic_vcpu __read_mostly; @@ -7391,7 +7391,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) kvm_mmu_destroy(vcpu); srcu_read_unlock(&vcpu->kvm->srcu, idx); free_page((unsigned long)vcpu->arch.pio_data); - if (!irqchip_in_kernel(vcpu->kvm)) + if (!lapic_in_kernel(vcpu)) static_key_slow_dec(&kvm_no_apic_vcpu); } From 4ca7dd8ce4b24e18f94eed90e80c6eb80fb48c9a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 30 Jul 2015 10:32:16 +0200 Subject: [PATCH 006/123] KVM: x86: unify handling of interrupt window The interrupt window is currently checked twice, once in vmx.c/svm.c and once in dm_request_for_irq_injection. The only difference is the extra check for kvm_arch_interrupt_allowed in dm_request_for_irq_injection, and the different return value (EINTR/KVM_EXIT_INTR for vmx.c/svm.c vs. 0/KVM_EXIT_IRQ_WINDOW_OPEN for dm_request_for_irq_injection). However, dm_request_for_irq_injection is basically dead code! Revive it by removing the checks in vmx.c and svm.c's vmexit handlers, and fixing the returned values for the dm_request_for_irq_injection case. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 13 ------------- arch/x86/kvm/vmx.c | 11 ----------- arch/x86/kvm/x86.c | 4 ++-- 3 files changed, 2 insertions(+), 26 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 34c089023827..54a86183e5d3 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3294,24 +3294,11 @@ static int msr_interception(struct vcpu_svm *svm) static int interrupt_window_interception(struct vcpu_svm *svm) { - struct kvm_run *kvm_run = svm->vcpu.run; - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); svm_clear_vintr(svm); svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; mark_dirty(svm->vmcb, VMCB_INTR); ++svm->vcpu.stat.irq_window_exits; - /* - * If the user space waits to inject interrupts, exit as soon as - * possible - */ - if (!lapic_in_kernel(&svm->vcpu) && - kvm_run->request_interrupt_window && - !kvm_cpu_has_interrupt(&svm->vcpu)) { - kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; - return 0; - } - return 1; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4729b7f6e5f2..6ee0dc69675b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5524,17 +5524,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); ++vcpu->stat.irq_window_exits; - - /* - * If the user space waits to inject interrupts, exit as soon as - * possible - */ - if (!lapic_in_kernel(vcpu) && - vcpu->run->request_interrupt_window && - !kvm_cpu_has_interrupt(vcpu)) { - vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; - return 0; - } return 1; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 88fb3bf2ae6d..28c98c8f9d1c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6469,8 +6469,8 @@ static int vcpu_run(struct kvm_vcpu *vcpu) kvm_inject_pending_timer_irqs(vcpu); if (dm_request_for_irq_injection(vcpu)) { - r = -EINTR; - vcpu->run->exit_reason = KVM_EXIT_INTR; + r = 0; + vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; ++vcpu->stat.request_irq_exits; break; } From 49df6397edfc5a8ba8ca813b51fb9729d8e94b40 Mon Sep 17 00:00:00 2001 From: Steve Rutherford Date: Wed, 29 Jul 2015 23:21:40 -0700 Subject: [PATCH 007/123] KVM: x86: Split the APIC from the rest of IRQCHIP. First patch in a series which enables the relocation of the PIC/IOAPIC to userspace. Adds capability KVM_CAP_SPLIT_IRQCHIP; KVM_CAP_SPLIT_IRQCHIP enables the construction of LAPICs without the rest of the irqchip. Compile tested for x86. Signed-off-by: Steve Rutherford Suggested-by: Andrew Honig Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 17 +++++++++++++++++ arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/i8254.c | 4 +++- arch/x86/kvm/ioapic.h | 8 ++++++++ arch/x86/kvm/irq.h | 11 ++++++++++- arch/x86/kvm/irq_comm.c | 9 ++++++++- arch/x86/kvm/lapic.c | 6 ++++-- arch/x86/kvm/x86.c | 23 +++++++++++++++++++++-- include/linux/kvm_host.h | 1 + include/uapi/linux/kvm.h | 1 + 10 files changed, 75 insertions(+), 7 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index d9ecceea5a02..43e0816d0de1 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3627,6 +3627,23 @@ struct { KVM handlers should exit to userspace with rc = -EREMOTE. +7.5 KVM_CAP_SPLIT_IRQCHIP + +Architectures: x86 +Parameters: None +Returns: 0 on success, -1 on error + +Create a local apic for each processor in the kernel. This can be used +instead of KVM_CREATE_IRQCHIP if the userspace VMM wishes to emulate the +IOAPIC and PIC (and also the PIT, even though this has to be enabled +separately). + +This supersedes KVM_CREATE_IRQCHIP, creating only local APICs, but no in kernel +IOAPIC or PIC. This also enables in kernel routing of interrupt requests. + +Fails if VCPU has already been created, or if the irqchip is already in the +kernel (i.e. KVM_CREATE_IRQCHIP has already been called). + 8. Other capabilities. ---------------------- diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a0ef289d5a86..befcf555bddc 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -684,6 +684,8 @@ struct kvm_arch { u32 bsp_vcpu_id; u64 disabled_quirks; + + bool irqchip_split; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index f90952f64e79..08116ff227cc 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -35,6 +35,7 @@ #include #include +#include "ioapic.h" #include "irq.h" #include "i8254.h" #include "x86.h" @@ -333,7 +334,8 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; s64 interval; - if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) + if (!ioapic_in_kernel(kvm) || + ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) return; interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index bf36d66a1951..a8842c0dee73 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -97,6 +97,14 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) return kvm->arch.vioapic; } +static inline int ioapic_in_kernel(struct kvm *kvm) +{ + int ret; + + ret = (ioapic_irqchip(kvm) != NULL); + return ret; +} + void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode); diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 9e6e7e04de98..2f9703dcd913 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -83,13 +83,22 @@ static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) return kvm->arch.vpic; } +static inline int irqchip_split(struct kvm *kvm) +{ + return kvm->arch.irqchip_split; +} + static inline int irqchip_in_kernel(struct kvm *kvm) { struct kvm_pic *vpic = pic_irqchip(kvm); + bool ret; + + ret = (vpic != NULL); + ret |= irqchip_split(kvm); /* Read vpic before kvm->irq_routing. */ smp_rmb(); - return vpic != NULL; + return ret; } static inline int lapic_in_kernel(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 9efff9e5b58c..67f6b62a6814 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -208,7 +208,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) goto unlock; } clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); - if (!irqchip_in_kernel(kvm)) + if (!ioapic_in_kernel(kvm)) goto unlock; kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id); @@ -328,3 +328,10 @@ int kvm_setup_default_irq_routing(struct kvm *kvm) return kvm_set_irq_routing(kvm, default_routing, ARRAY_SIZE(default_routing), 0); } + +static const struct kvm_irq_routing_entry empty_routing[] = {}; + +int kvm_setup_empty_irq_routing(struct kvm *kvm) +{ + return kvm_set_irq_routing(kvm, empty_routing, 0, 0); +} diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c4bcc86d6dc4..e05946c36b87 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -209,7 +209,8 @@ out: if (old) kfree_rcu(old, rcu); - kvm_vcpu_request_scan_ioapic(kvm); + if (ioapic_in_kernel(kvm)) + kvm_vcpu_request_scan_ioapic(kvm); } static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) @@ -1845,7 +1846,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); kvm_make_request(KVM_REQ_EVENT, vcpu); - kvm_rtc_eoi_tracking_restore_one(vcpu); + if (ioapic_in_kernel(vcpu->kvm)) + kvm_rtc_eoi_tracking_restore_one(vcpu); } void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 28c98c8f9d1c..f720774a4797 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2448,6 +2448,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ENABLE_CAP_VM: case KVM_CAP_DISABLE_QUIRKS: case KVM_CAP_SET_BOOT_CPU_ID: + case KVM_CAP_SPLIT_IRQCHIP: #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT case KVM_CAP_ASSIGN_DEV_IRQ: case KVM_CAP_PCI_2_3: @@ -3555,6 +3556,24 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, kvm->arch.disabled_quirks = cap->args[0]; r = 0; break; + case KVM_CAP_SPLIT_IRQCHIP: { + mutex_lock(&kvm->lock); + r = -EEXIST; + if (irqchip_in_kernel(kvm)) + goto split_irqchip_unlock; + if (atomic_read(&kvm->online_vcpus)) + goto split_irqchip_unlock; + r = kvm_setup_empty_irq_routing(kvm); + if (r) + goto split_irqchip_unlock; + /* Pairs with irqchip_in_kernel. */ + smp_wmb(); + kvm->arch.irqchip_split = true; + r = 0; +split_irqchip_unlock: + mutex_unlock(&kvm->lock); + break; + } default: r = -EINVAL; break; @@ -3668,7 +3687,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } r = -ENXIO; - if (!irqchip_in_kernel(kvm)) + if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) goto get_irqchip_out; r = kvm_vm_ioctl_get_irqchip(kvm, chip); if (r) @@ -3692,7 +3711,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } r = -ENXIO; - if (!irqchip_in_kernel(kvm)) + if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) goto set_irqchip_out; r = kvm_vm_ioctl_set_irqchip(kvm, chip); if (r) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1bef9e21e725..354f147647ab 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1002,6 +1002,7 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq) #endif int kvm_setup_default_irq_routing(struct kvm *kvm); +int kvm_setup_empty_irq_routing(struct kvm *kvm); int kvm_set_irq_routing(struct kvm *kvm, const struct kvm_irq_routing_entry *entries, unsigned nr, diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index a9256f0331ae..ed00f8fc9ea2 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -824,6 +824,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_MULTI_ADDRESS_SPACE 118 #define KVM_CAP_GUEST_DEBUG_HW_BPS 119 #define KVM_CAP_GUEST_DEBUG_HW_WPS 120 +#define KVM_CAP_SPLIT_IRQCHIP 121 #ifdef KVM_CAP_IRQ_ROUTING From 7543a635aa09eb138b2cbf60ac3ff19503ae6954 Mon Sep 17 00:00:00 2001 From: Steve Rutherford Date: Wed, 29 Jul 2015 23:21:41 -0700 Subject: [PATCH 008/123] KVM: x86: Add KVM exit for IOAPIC EOIs Adds KVM_EXIT_IOAPIC_EOI which allows the kernel to EOI level-triggered IOAPIC interrupts. Uses a per VCPU exit bitmap to decide whether or not the IOAPIC needs to be informed (which is identical to the EOI_EXIT_BITMAP field used by modern x86 processors, but can also be used to elide kvm IOAPIC EOI exits on older processors). [Note: A prototype using ResampleFDs found that decoupling the EOI from the VCPU's thread made it possible for the VCPU to not see a recent EOI after reentering the guest. This does not match real hardware.] Compile tested for Intel x86. Signed-off-by: Steve Rutherford Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 12 ++++++++++++ arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/lapic.c | 24 +++++++++++++++++------- arch/x86/kvm/x86.c | 11 +++++++++++ include/linux/kvm_host.h | 2 +- include/uapi/linux/kvm.h | 5 +++++ 6 files changed, 48 insertions(+), 8 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 43e0816d0de1..0d14bf5db534 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3309,6 +3309,18 @@ Valid values for 'type' are: to ignore the request, or to gather VM memory core dump and/or reset/shutdown of the VM. + /* KVM_EXIT_IOAPIC_EOI */ + struct { + __u8 vector; + } eoi; + +Indicates that the VCPU's in-kernel local APIC received an EOI for a +level-triggered IOAPIC interrupt. This exit only triggers when the +IOAPIC is implemented in userspace (i.e. KVM_CAP_SPLIT_IRQCHIP is enabled); +the userspace IOAPIC should process the EOI and retrigger the interrupt if +it is still asserted. Vector is the LAPIC interrupt vector for which the +EOI was received. + /* Fix the size of the union. */ char padding[256]; }; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index befcf555bddc..af09fa1d1be7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -574,6 +574,8 @@ struct kvm_vcpu_arch { struct { bool pv_unhalted; } pv; + + int pending_ioapic_eoi; }; struct kvm_lpage_info { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e05946c36b87..ef70f6f3a37a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -877,15 +877,25 @@ static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector) static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) { - if (kvm_ioapic_handles_vector(apic, vector)) { - int trigger_mode; - if (apic_test_vector(vector, apic->regs + APIC_TMR)) - trigger_mode = IOAPIC_LEVEL_TRIG; - else - trigger_mode = IOAPIC_EDGE_TRIG; + int trigger_mode; - kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode); + /* Eoi the ioapic only if the ioapic doesn't own the vector. */ + if (!kvm_ioapic_handles_vector(apic, vector)) + return; + + /* Request a KVM exit to inform the userspace IOAPIC. */ + if (irqchip_split(apic->vcpu->kvm)) { + apic->vcpu->arch.pending_ioapic_eoi = vector; + kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu); + return; } + + if (apic_test_vector(vector, apic->regs + APIC_TMR)) + trigger_mode = IOAPIC_LEVEL_TRIG; + else + trigger_mode = IOAPIC_EDGE_TRIG; + + kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode); } static int apic_set_eoi(struct kvm_lapic *apic) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f720774a4797..27429daa054e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6271,6 +6271,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_pmu_handle_event(vcpu); if (kvm_check_request(KVM_REQ_PMI, vcpu)) kvm_pmu_deliver_pmi(vcpu); + if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) { + BUG_ON(vcpu->arch.pending_ioapic_eoi > 255); + if (test_bit(vcpu->arch.pending_ioapic_eoi, + (void *) vcpu->arch.eoi_exit_bitmap)) { + vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI; + vcpu->run->eoi.vector = + vcpu->arch.pending_ioapic_eoi; + r = 0; + goto out; + } + } if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu)) vcpu_scan_ioapic(vcpu); if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu)) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 354f147647ab..3b33215ed447 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -140,6 +140,7 @@ static inline bool is_error_page(struct page *page) #define KVM_REQ_APIC_PAGE_RELOAD 25 #define KVM_REQ_SMI 26 #define KVM_REQ_HV_CRASH 27 +#define KVM_REQ_IOAPIC_EOI_EXIT 28 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 @@ -1146,4 +1147,3 @@ static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) } #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ #endif - diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index ed00f8fc9ea2..12e3afbf0f47 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -183,6 +183,7 @@ struct kvm_s390_skeys { #define KVM_EXIT_EPR 23 #define KVM_EXIT_SYSTEM_EVENT 24 #define KVM_EXIT_S390_STSI 25 +#define KVM_EXIT_IOAPIC_EOI 26 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -333,6 +334,10 @@ struct kvm_run { __u8 sel1; __u16 sel2; } s390_stsi; + /* KVM_EXIT_IOAPIC_EOI */ + struct { + __u8 vector; + } eoi; /* Fix the size of the union. */ char padding[256]; }; From b053b2aef25d00773fa6762dcd4b7f5c9c42d171 Mon Sep 17 00:00:00 2001 From: Steve Rutherford Date: Wed, 29 Jul 2015 23:32:35 -0700 Subject: [PATCH 009/123] KVM: x86: Add EOI exit bitmap inference In order to support a userspace IOAPIC interacting with an in kernel APIC, the EOI exit bitmaps need to be configurable. If the IOAPIC is in userspace (i.e. the irqchip has been split), the EOI exit bitmaps will be set whenever the GSI Routes are configured. In particular, for the low MSI routes are reservable for userspace IOAPICs. For these MSI routes, the EOI Exit bit corresponding to the destination vector of the route will be set for the destination VCPU. The intention is for the userspace IOAPICs to use the reservable MSI routes to inject interrupts into the guest. This is a slight abuse of the notion of an MSI Route, given that MSIs classically bypass the IOAPIC. It might be worthwhile to add an additional route type to improve clarity. Compile tested for Intel x86. Signed-off-by: Steve Rutherford Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 9 ++++--- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/ioapic.h | 2 ++ arch/x86/kvm/irq_comm.c | 42 +++++++++++++++++++++++++++++++ arch/x86/kvm/lapic.c | 3 +-- arch/x86/kvm/x86.c | 9 ++++++- include/linux/kvm_host.h | 16 ++++++++++++ virt/kvm/irqchip.c | 12 ++------- 8 files changed, 78 insertions(+), 16 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 0d14bf5db534..89e71648d748 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3642,7 +3642,7 @@ KVM handlers should exit to userspace with rc = -EREMOTE. 7.5 KVM_CAP_SPLIT_IRQCHIP Architectures: x86 -Parameters: None +Parameters: args[0] - number of routes reserved for userspace IOAPICs Returns: 0 on success, -1 on error Create a local apic for each processor in the kernel. This can be used @@ -3650,8 +3650,11 @@ instead of KVM_CREATE_IRQCHIP if the userspace VMM wishes to emulate the IOAPIC and PIC (and also the PIT, even though this has to be enabled separately). -This supersedes KVM_CREATE_IRQCHIP, creating only local APICs, but no in kernel -IOAPIC or PIC. This also enables in kernel routing of interrupt requests. +This capability also enables in kernel routing of interrupt requests; +when KVM_CAP_SPLIT_IRQCHIP only routes of KVM_IRQ_ROUTING_MSI type are +used in the IRQ routing table. The first args[0] MSI routes are reserved +for the IOAPIC pins. Whenever the LAPIC receives an EOI for these routes, +a KVM_EXIT_IOAPIC_EOI vmexit will be reported to userspace. Fails if VCPU has already been created, or if the irqchip is already in the kernel (i.e. KVM_CREATE_IRQCHIP has already been called). diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index af09fa1d1be7..7a5f9debbcd8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -688,6 +688,7 @@ struct kvm_arch { u64 disabled_quirks; bool irqchip_split; + u8 nr_reserved_ioapic_pins; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index a8842c0dee73..084617d37c74 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -9,6 +9,7 @@ struct kvm; struct kvm_vcpu; #define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS +#define MAX_NR_RESERVED_IOAPIC_PINS KVM_MAX_IRQ_ROUTES #define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */ #define IOAPIC_EDGE_TRIG 0 #define IOAPIC_LEVEL_TRIG 1 @@ -121,5 +122,6 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); +void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); #endif diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 67f6b62a6814..177460998bb0 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -335,3 +335,45 @@ int kvm_setup_empty_irq_routing(struct kvm *kvm) { return kvm_set_irq_routing(kvm, empty_routing, 0, 0); } + +void kvm_arch_irq_routing_update(struct kvm *kvm) +{ + if (ioapic_in_kernel(kvm) || !irqchip_in_kernel(kvm)) + return; + kvm_make_scan_ioapic_request(kvm); +} + +void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_kernel_irq_routing_entry *entry; + struct kvm_irq_routing_table *table; + u32 i, nr_ioapic_pins; + int idx; + + /* kvm->irq_routing must be read after clearing + * KVM_SCAN_IOAPIC. */ + smp_mb(); + idx = srcu_read_lock(&kvm->irq_srcu); + table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); + nr_ioapic_pins = min_t(u32, table->nr_rt_entries, + kvm->arch.nr_reserved_ioapic_pins); + for (i = 0; i < nr_ioapic_pins; ++i) { + hlist_for_each_entry(entry, &table->map[i], link) { + u32 dest_id, dest_mode; + + if (entry->type != KVM_IRQ_ROUTING_MSI) + continue; + dest_id = (entry->msi.address_lo >> 12) & 0xff; + dest_mode = (entry->msi.address_lo >> 2) & 0x1; + if (kvm_apic_match_dest(vcpu, NULL, 0, dest_id, + dest_mode)) { + u32 vector = entry->msi.data & 0xff; + + __set_bit(vector, + (unsigned long *) eoi_exit_bitmap); + } + } + } + srcu_read_unlock(&kvm->irq_srcu, idx); +} diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ef70f6f3a37a..2f4c0d0cbe0a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -209,8 +209,7 @@ out: if (old) kfree_rcu(old, rcu); - if (ioapic_in_kernel(kvm)) - kvm_vcpu_request_scan_ioapic(kvm); + kvm_make_scan_ioapic_request(kvm); } static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 27429daa054e..4aeed2086c5e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3558,6 +3558,9 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, break; case KVM_CAP_SPLIT_IRQCHIP: { mutex_lock(&kvm->lock); + r = -EINVAL; + if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS) + goto split_irqchip_unlock; r = -EEXIST; if (irqchip_in_kernel(kvm)) goto split_irqchip_unlock; @@ -3569,6 +3572,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, /* Pairs with irqchip_in_kernel. */ smp_wmb(); kvm->arch.irqchip_split = true; + kvm->arch.nr_reserved_ioapic_pins = cap->args[0]; r = 0; split_irqchip_unlock: mutex_unlock(&kvm->lock); @@ -6167,7 +6171,10 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) memset(vcpu->arch.eoi_exit_bitmap, 0, 256 / 8); - kvm_ioapic_scan_entry(vcpu, vcpu->arch.eoi_exit_bitmap); + if (irqchip_split(vcpu->kvm)) + kvm_scan_ioapic_routes(vcpu, vcpu->arch.eoi_exit_bitmap); + else + kvm_ioapic_scan_entry(vcpu, vcpu->arch.eoi_exit_bitmap); kvm_x86_ops->load_eoi_exitmap(vcpu); } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3b33215ed447..d754f2d826e9 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -330,6 +330,18 @@ struct kvm_kernel_irq_routing_entry { struct hlist_node link; }; +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING +struct kvm_irq_routing_table { + int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS]; + u32 nr_rt_entries; + /* + * Array indexed by gsi. Each entry contains list of irq chips + * the gsi is connected to. + */ + struct hlist_head map[0]; +}; +#endif + #ifndef KVM_PRIVATE_MEM_SLOTS #define KVM_PRIVATE_MEM_SLOTS 0 #endif @@ -456,10 +468,14 @@ void vcpu_put(struct kvm_vcpu *vcpu); #ifdef __KVM_HAVE_IOAPIC void kvm_vcpu_request_scan_ioapic(struct kvm *kvm); +void kvm_arch_irq_routing_update(struct kvm *kvm); #else static inline void kvm_vcpu_request_scan_ioapic(struct kvm *kvm) { } +static inline void kvm_arch_irq_routing_update(struct kvm *kvm) +{ +} #endif #ifdef CONFIG_HAVE_KVM_IRQFD diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c index d7ea8e20dae4..716a1c4db528 100644 --- a/virt/kvm/irqchip.c +++ b/virt/kvm/irqchip.c @@ -31,16 +31,6 @@ #include #include "irq.h" -struct kvm_irq_routing_table { - int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS]; - u32 nr_rt_entries; - /* - * Array indexed by gsi. Each entry contains list of irq chips - * the gsi is connected to. - */ - struct hlist_head map[0]; -}; - int kvm_irq_map_gsi(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *entries, int gsi) { @@ -231,6 +221,8 @@ int kvm_set_irq_routing(struct kvm *kvm, kvm_irq_routing_update(kvm); mutex_unlock(&kvm->irq_lock); + kvm_arch_irq_routing_update(kvm); + synchronize_srcu_expedited(&kvm->irq_srcu); new = old; From 1c1a9ce973a7863dd46767226bce2a5f12d48bc6 Mon Sep 17 00:00:00 2001 From: Steve Rutherford Date: Thu, 30 Jul 2015 11:27:16 +0200 Subject: [PATCH 010/123] KVM: x86: Add support for local interrupt requests from userspace In order to enable userspace PIC support, the userspace PIC needs to be able to inject local interrupts even when the APICs are in the kernel. KVM_INTERRUPT now supports sending local interrupts to an APIC when APICs are in the kernel. The ready_for_interrupt_request flag is now only set when the CPU/APIC will immediately accept and inject an interrupt (i.e. APIC has not masked the PIC). When the PIC wishes to initiate an INTA cycle with, say, CPU0, it kicks CPU0 out of the guest, and renedezvous with CPU0 once it arrives in userspace. When the CPU/APIC unmasks the PIC, a KVM_EXIT_IRQ_WINDOW_OPEN is triggered, so that userspace has a chance to inject a PIC interrupt if it had been pending. Overall, this design can lead to a small number of spurious userspace renedezvous. In particular, whenever the PIC transistions from low to high while it is masked and whenever the PIC becomes unmasked while it is low. Note: this does not buffer more than one local interrupt in the kernel, so the VMM needs to enter the guest in order to complete interrupt injection before injecting an additional interrupt. Compiles for x86. Can pass the KVM Unit Tests. Signed-off-by: Steve Rutherford Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 14 ++++++++--- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/irq.c | 32 ++++++++++++++++++----- arch/x86/kvm/irq.h | 8 ++++++ arch/x86/kvm/x86.c | 42 ++++++++++++++++++++++++------- 5 files changed, 78 insertions(+), 19 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 89e71648d748..e3e9c41721a2 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -401,10 +401,9 @@ Capability: basic Architectures: x86, ppc, mips Type: vcpu ioctl Parameters: struct kvm_interrupt (in) -Returns: 0 on success, -1 on error +Returns: 0 on success, negative on failure. -Queues a hardware interrupt vector to be injected. This is only -useful if in-kernel local APIC or equivalent is not used. +Queues a hardware interrupt vector to be injected. /* for KVM_INTERRUPT */ struct kvm_interrupt { @@ -414,7 +413,14 @@ struct kvm_interrupt { X86: -Note 'irq' is an interrupt vector, not an interrupt pin or line. +Returns: 0 on success, + -EEXIST if an interrupt is already enqueued + -EINVAL the the irq number is invalid + -ENXIO if the PIC is in the kernel + -EFAULT if the pointer is invalid + +Note 'irq' is an interrupt vector, not an interrupt pin or line. This +ioctl is useful if the in-kernel PIC is not used. PPC: diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7a5f9debbcd8..76a5b30979b3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -576,6 +576,7 @@ struct kvm_vcpu_arch { } pv; int pending_ioapic_eoi; + int pending_external_vector; }; struct kvm_lpage_info { diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index b653ae202c8e..097060e33bd6 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -37,15 +37,28 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL(kvm_cpu_has_pending_timer); +/* + * check if there is a pending userspace external interrupt + */ +static int pending_userspace_extint(struct kvm_vcpu *v) +{ + return v->arch.pending_external_vector != -1; +} + /* * check if there is pending interrupt from * non-APIC source without intack. */ static int kvm_cpu_has_extint(struct kvm_vcpu *v) { - if (kvm_apic_accept_pic_intr(v)) - return pic_irqchip(v->kvm)->output; /* PIC */ - else + u8 accept = kvm_apic_accept_pic_intr(v); + + if (accept) { + if (irqchip_split(v->kvm)) + return pending_userspace_extint(v); + else + return pic_irqchip(v->kvm)->output; + } else return 0; } @@ -91,9 +104,16 @@ EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); */ static int kvm_cpu_get_extint(struct kvm_vcpu *v) { - if (kvm_cpu_has_extint(v)) - return kvm_pic_read_irq(v->kvm); /* PIC */ - return -1; + if (kvm_cpu_has_extint(v)) { + if (irqchip_split(v->kvm)) { + int vector = v->arch.pending_external_vector; + + v->arch.pending_external_vector = -1; + return vector; + } else + return kvm_pic_read_irq(v->kvm); /* PIC */ + } else + return -1; } /* diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 2f9703dcd913..ae5c78f2337d 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -83,6 +83,14 @@ static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) return kvm->arch.vpic; } +static inline int pic_in_kernel(struct kvm *kvm) +{ + int ret; + + ret = (pic_irqchip(kvm) != NULL); + return ret; +} + static inline int irqchip_split(struct kvm *kvm) { return kvm->arch.irqchip_split; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4aeed2086c5e..8b97c54edebc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2662,12 +2662,24 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, { if (irq->irq >= KVM_NR_INTERRUPTS) return -EINVAL; - if (irqchip_in_kernel(vcpu->kvm)) + + if (!irqchip_in_kernel(vcpu->kvm)) { + kvm_queue_interrupt(vcpu, irq->irq, false); + kvm_make_request(KVM_REQ_EVENT, vcpu); + return 0; + } + + /* + * With in-kernel LAPIC, we only use this to inject EXTINT, so + * fail for in-kernel 8259. + */ + if (pic_in_kernel(vcpu->kvm)) return -ENXIO; - kvm_queue_interrupt(vcpu, irq->irq, false); - kvm_make_request(KVM_REQ_EVENT, vcpu); + if (vcpu->arch.pending_external_vector != -1) + return -EEXIST; + vcpu->arch.pending_external_vector = irq->irq; return 0; } @@ -5796,9 +5808,15 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) */ static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) { - return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && - vcpu->run->request_interrupt_window && - kvm_arch_interrupt_allowed(vcpu)); + if (!vcpu->run->request_interrupt_window || pic_in_kernel(vcpu->kvm)) + return false; + + if (kvm_cpu_has_interrupt(vcpu)) + return false; + + return (irqchip_split(vcpu->kvm) + ? kvm_apic_accept_pic_intr(vcpu) + : kvm_arch_interrupt_allowed(vcpu)); } static void post_kvm_run_save(struct kvm_vcpu *vcpu) @@ -5809,13 +5827,17 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0; kvm_run->cr8 = kvm_get_cr8(vcpu); kvm_run->apic_base = kvm_get_apic_base(vcpu); - if (irqchip_in_kernel(vcpu->kvm)) - kvm_run->ready_for_interrupt_injection = 1; - else + if (!irqchip_in_kernel(vcpu->kvm)) kvm_run->ready_for_interrupt_injection = kvm_arch_interrupt_allowed(vcpu) && !kvm_cpu_has_interrupt(vcpu) && !kvm_event_needs_reinjection(vcpu); + else if (!pic_in_kernel(vcpu->kvm)) + kvm_run->ready_for_interrupt_injection = + kvm_apic_accept_pic_intr(vcpu) && + !kvm_cpu_has_interrupt(vcpu); + else + kvm_run->ready_for_interrupt_injection = 1; } static void update_cr8_intercept(struct kvm_vcpu *vcpu) @@ -7403,6 +7425,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) kvm_async_pf_hash_reset(vcpu); kvm_pmu_init(vcpu); + vcpu->arch.pending_external_vector = -1; + return 0; fail_free_mce_banks: From d3febddde9c7a959dbb189a700e937db50fad4d6 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 25 Aug 2015 17:05:46 +0800 Subject: [PATCH 011/123] kvm: use kmalloc() instead of kzalloc() during iodev register/unregister All fields of kvm_io_range were initialized or copied explicitly afterwards. So switch to use kmalloc(). Cc: Gleb Natapov Cc: Paolo Bonzini Cc: Michael S. Tsirkin Signed-off-by: Jason Wang Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8db1d9361993..23116dcb2129 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3341,7 +3341,7 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1) return -ENOSPC; - new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count + 1) * + new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count + 1) * sizeof(struct kvm_io_range)), GFP_KERNEL); if (!new_bus) return -ENOMEM; @@ -3373,7 +3373,7 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, if (r) return r; - new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count - 1) * + new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) * sizeof(struct kvm_io_range)), GFP_KERNEL); if (!new_bus) return -ENOMEM; From 931c33b178b091cced2a6b3f57f04655f8ff5207 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 15 Sep 2015 14:41:58 +0800 Subject: [PATCH 012/123] kvm: add tracepoint for fast mmio Cc: Gleb Natapov Cc: Paolo Bonzini Signed-off-by: Jason Wang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/trace.h | 18 ++++++++++++++++++ arch/x86/kvm/vmx.c | 1 + arch/x86/kvm/x86.c | 1 + 3 files changed, 20 insertions(+) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 4eae7c35ddf5..ce4abe333c39 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -128,6 +128,24 @@ TRACE_EVENT(kvm_pio, __entry->count > 1 ? "(...)" : "") ); +/* + * Tracepoint for fast mmio. + */ +TRACE_EVENT(kvm_fast_mmio, + TP_PROTO(u64 gpa), + TP_ARGS(gpa), + + TP_STRUCT__entry( + __field(u64, gpa) + ), + + TP_fast_assign( + __entry->gpa = gpa; + ), + + TP_printk("fast mmio at gpa 0x%llx", __entry->gpa) +); + /* * Tracepoint for cpuid. */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6ee0dc69675b..324b09ff1def 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5756,6 +5756,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { skip_emulated_instruction(vcpu); + trace_kvm_fast_mmio(gpa); return 1; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8b97c54edebc..b90ad01c617d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8070,6 +8070,7 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); From e9ea5069d9e569c32ab913c39467df32e056b3a7 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 15 Sep 2015 14:41:59 +0800 Subject: [PATCH 013/123] kvm: add capability for any-length ioeventfds Cc: Gleb Natapov Cc: Paolo Bonzini Signed-off-by: Jason Wang Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 6 +++++- include/uapi/linux/kvm.h | 1 + virt/kvm/eventfd.c | 4 +--- virt/kvm/kvm_main.c | 1 + 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index e3e9c41721a2..34cc068e81ea 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1604,7 +1604,7 @@ provided event instead of triggering an exit. struct kvm_ioeventfd { __u64 datamatch; __u64 addr; /* legal pio/mmio address */ - __u32 len; /* 1, 2, 4, or 8 bytes */ + __u32 len; /* 0, 1, 2, 4, or 8 bytes */ __s32 fd; __u32 flags; __u8 pad[36]; @@ -1627,6 +1627,10 @@ to the registered address is equal to datamatch in struct kvm_ioeventfd. For virtio-ccw devices, addr contains the subchannel id and datamatch the virtqueue index. +With KVM_CAP_IOEVENTFD_ANY_LENGTH, a zero length ioeventfd is allowed, and +the kernel will ignore the length of guest write and may get a faster vmexit. +The speedup may only apply to specific architectures, but the ioeventfd will +work anyway. 4.60 KVM_DIRTY_TLB diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 12e3afbf0f47..03f3618612aa 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -830,6 +830,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_GUEST_DEBUG_HW_BPS 119 #define KVM_CAP_GUEST_DEBUG_HW_WPS 120 #define KVM_CAP_SPLIT_IRQCHIP 121 +#define KVM_CAP_IOEVENTFD_ANY_LENGTH 122 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 79db45336e3a..ac89299b8699 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -914,9 +914,7 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) return -EINVAL; /* ioeventfd with no length can't be combined with DATAMATCH */ - if (!args->len && - args->flags & (KVM_IOEVENTFD_FLAG_PIO | - KVM_IOEVENTFD_FLAG_DATAMATCH)) + if (!args->len && (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)) return -EINVAL; ret = kvm_assign_ioeventfd_idx(kvm, bus_idx, args); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 23116dcb2129..afd7ae6aec65 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2718,6 +2718,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_IRQFD: case KVM_CAP_IRQFD_RESAMPLE: #endif + case KVM_CAP_IOEVENTFD_ANY_LENGTH: case KVM_CAP_CHECK_EXTENSION_VM: return 1; #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING From e516cebb4f2b2057a5a421fea589079502acfff6 Mon Sep 17 00:00:00 2001 From: Andrey Smetanin Date: Wed, 16 Sep 2015 12:29:48 +0300 Subject: [PATCH 014/123] kvm/x86: Hyper-V HV_X64_MSR_RESET msr HV_X64_MSR_RESET msr is used by Hyper-V based Windows guest to reset guest VM by hypervisor. Necessary to support loading of winhv.sys in guest, which in turn is required to support Windows VMBus. Signed-off-by: Andrey Smetanin Reviewed-by: Roman Kagan Signed-off-by: Denis V. Lunev CC: Paolo Bonzini CC: Gleb Natapov Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/hyperv.h | 3 +++ arch/x86/kvm/hyperv.c | 10 ++++++++++ arch/x86/kvm/x86.c | 7 +++++++ include/linux/kvm_host.h | 1 + 4 files changed, 21 insertions(+) diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index f0412c50c47b..dab584bf7ddf 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -153,6 +153,9 @@ /* MSR used to provide vcpu index */ #define HV_X64_MSR_VP_INDEX 0x40000002 +/* MSR used to reset the guest OS. */ +#define HV_X64_MSR_RESET 0x40000003 + /* MSR used to read the per-partition time reference counter */ #define HV_X64_MSR_TIME_REF_COUNT 0x40000020 diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index a8160d2ae362..0ad11a232474 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -41,6 +41,7 @@ static bool kvm_hv_msr_partition_wide(u32 msr) case HV_X64_MSR_TIME_REF_COUNT: case HV_X64_MSR_CRASH_CTL: case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: + case HV_X64_MSR_RESET: r = true; break; } @@ -163,6 +164,12 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, data); case HV_X64_MSR_CRASH_CTL: return kvm_hv_msr_set_crash_ctl(vcpu, data, host); + case HV_X64_MSR_RESET: + if (data == 1) { + vcpu_debug(vcpu, "hyper-v reset requested\n"); + kvm_make_request(KVM_REQ_HV_RESET, vcpu); + } + break; default: vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", msr, data); @@ -241,6 +248,9 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) pdata); case HV_X64_MSR_CRASH_CTL: return kvm_hv_msr_get_crash_ctl(vcpu, pdata); + case HV_X64_MSR_RESET: + data = 0; + break; default: vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b90ad01c617d..297b36fa3b80 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -952,6 +952,7 @@ static u32 emulated_msrs[] = { HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, + HV_X64_MSR_RESET, HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, MSR_KVM_PV_EOI_EN, @@ -6321,6 +6322,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) r = 0; goto out; } + if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) { + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET; + r = 0; + goto out; + } } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d754f2d826e9..cd0ba2e931e1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -141,6 +141,7 @@ static inline bool is_error_page(struct page *page) #define KVM_REQ_SMI 26 #define KVM_REQ_HV_CRASH 27 #define KVM_REQ_IOAPIC_EOI_EXIT 28 +#define KVM_REQ_HV_RESET 29 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 From 11c4b1ca719eaaa5ca6fe0e80bb009f3f2012fd2 Mon Sep 17 00:00:00 2001 From: Andrey Smetanin Date: Wed, 16 Sep 2015 12:29:49 +0300 Subject: [PATCH 015/123] kvm/x86: Hyper-V HV_X64_MSR_VP_INDEX export for QEMU. Insert Hyper-V HV_X64_MSR_VP_INDEX into msr's emulated list, so QEMU can set Hyper-V features cpuid HV_X64_MSR_VP_INDEX_AVAILABLE bit correctly. KVM emulation part is in place already. Necessary to support loading of winhv.sys in guest, which in turn is required to support Windows VMBus. Signed-off-by: Andrey Smetanin Reviewed-by: Roman Kagan Signed-off-by: Denis V. Lunev CC: Paolo Bonzini CC: Gleb Natapov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 297b36fa3b80..716b4610791c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -953,6 +953,7 @@ static u32 emulated_msrs[] = { HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, HV_X64_MSR_RESET, + HV_X64_MSR_VP_INDEX, HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, MSR_KVM_PV_EOI_EN, From 9eec50b8bbe1535c440a1ee88c1958f78fc55957 Mon Sep 17 00:00:00 2001 From: Andrey Smetanin Date: Wed, 16 Sep 2015 12:29:50 +0300 Subject: [PATCH 016/123] kvm/x86: Hyper-V HV_X64_MSR_VP_RUNTIME support HV_X64_MSR_VP_RUNTIME msr used by guest to get "the time the virtual processor consumes running guest code, and the time the associated logical processor spends running hypervisor code on behalf of that guest." Calculation of this time is performed by task_cputime_adjusted() for vcpu task. Necessary to support loading of winhv.sys in guest, which in turn is required to support Windows VMBus. Signed-off-by: Andrey Smetanin Reviewed-by: Roman Kagan Signed-off-by: Denis V. Lunev CC: Paolo Bonzini CC: Gleb Natapov Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/include/uapi/asm/hyperv.h | 3 +++ arch/x86/kvm/hyperv.c | 21 +++++++++++++++++++-- arch/x86/kvm/x86.c | 1 + kernel/sched/cputime.c | 2 ++ 5 files changed, 26 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 76a5b30979b3..d064cb2e19e8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -374,6 +374,7 @@ struct kvm_mtrr { /* Hyper-V per vcpu emulation context */ struct kvm_vcpu_hv { u64 hv_vapic; + s64 runtime_offset; }; struct kvm_vcpu_arch { diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index dab584bf7ddf..2677a0aac2cc 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -156,6 +156,9 @@ /* MSR used to reset the guest OS. */ #define HV_X64_MSR_RESET 0x40000003 +/* MSR used to provide vcpu runtime in 100ns units */ +#define HV_X64_MSR_VP_RUNTIME 0x40000010 + /* MSR used to read the per-partition time reference counter */ #define HV_X64_MSR_TIME_REF_COUNT 0x40000020 diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 0ad11a232474..62cf8c915e95 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -178,7 +178,16 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, return 0; } -static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) +/* Calculate cpu time spent by current task in 100ns units */ +static u64 current_task_runtime_100ns(void) +{ + cputime_t utime, stime; + + task_cputime_adjusted(current, &utime, &stime); + return div_u64(cputime_to_nsecs(utime + stime), 100); +} + +static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; @@ -212,6 +221,11 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); case HV_X64_MSR_TPR: return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); + case HV_X64_MSR_VP_RUNTIME: + if (!host) + return 1; + hv->runtime_offset = data - current_task_runtime_100ns(); + break; default: vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", msr, data); @@ -287,6 +301,9 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case HV_X64_MSR_APIC_ASSIST_PAGE: data = hv->hv_vapic; break; + case HV_X64_MSR_VP_RUNTIME: + data = current_task_runtime_100ns() + hv->runtime_offset; + break; default: vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; @@ -305,7 +322,7 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) mutex_unlock(&vcpu->kvm->lock); return r; } else - return kvm_hv_set_msr(vcpu, msr, data); + return kvm_hv_set_msr(vcpu, msr, data, host); } int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 716b4610791c..185fc1619c99 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -954,6 +954,7 @@ static u32 emulated_msrs[] = { HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, HV_X64_MSR_RESET, HV_X64_MSR_VP_INDEX, + HV_X64_MSR_VP_RUNTIME, HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, MSR_KVM_PV_EOI_EN, diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 8cbc3db671df..26a54461bf59 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -444,6 +444,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) *ut = p->utime; *st = p->stime; } +EXPORT_SYMBOL_GPL(task_cputime_adjusted); void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { @@ -652,6 +653,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) task_cputime(p, &cputime.utime, &cputime.stime); cputime_adjust(&cputime, &p->prev_cputime, ut, st); } +EXPORT_SYMBOL_GPL(task_cputime_adjusted); void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { From d6a858d13e5d02b97daab5ba0648c704ae3f9517 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 28 Sep 2015 11:58:14 +0200 Subject: [PATCH 017/123] KVM: vmx: disable posted interrupts if no local APIC Uniprocessor 32-bit randconfigs can disable the local APIC, and posted interrupts require reserving a vector on the LAPIC, so they are incompatible. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 324b09ff1def..0f15e2382109 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -983,7 +983,8 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void) static inline bool cpu_has_vmx_posted_intr(void) { - return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; + return IS_ENABLED(CONFIG_X86_LOCAL_APIC) && + vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; } static inline bool cpu_has_vmx_apicv(void) From eb1c31b46800a476644cd63ab3bc7ef918b0a917 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Sep 2015 14:05:50 +0800 Subject: [PATCH 018/123] KVM: x86: allow guest to use cflushopt and clwb Pass these CPU features to guest to enable them in guest They are needed by nvdimm drivers Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 2fbea2544f24..962fc7d7e0d4 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -348,7 +348,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | - F(AVX512CD); + F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB); /* cpuid 0xD.1.eax */ const u32 kvm_supported_word10_x86_features = From 8b3e34e46aca9b6d349b331cd9cf71ccbdc91b2e Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Sep 2015 14:05:51 +0800 Subject: [PATCH 019/123] KVM: x86: add pcommit support Pass PCOMMIT CPU feature to guest to enable PCOMMIT instruction Currently we do not catch pcommit instruction for L1 guest and allow L1 to catch this instruction for L2 if, as required by the spec, L1 can enumerate the PCOMMIT instruction via CPUID: | IA32_VMX_PROCBASED_CTLS2[53] (which enumerates support for the | 1-setting of PCOMMIT exiting) is always the same as | CPUID.07H:EBX.PCOMMIT[bit 22]. Thus, software can set PCOMMIT exiting | to 1 if and only if the PCOMMIT instruction is enumerated via CPUID The spec can be found at https://software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 2 +- arch/x86/include/uapi/asm/vmx.h | 4 +++- arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/cpuid.h | 8 ++++++++ arch/x86/kvm/vmx.c | 34 ++++++++++++++++++++++++++++----- 5 files changed, 42 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 448b7ca61aee..d25f32ac9c5c 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -72,7 +72,7 @@ #define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 #define SECONDARY_EXEC_ENABLE_PML 0x00020000 #define SECONDARY_EXEC_XSAVES 0x00100000 - +#define SECONDARY_EXEC_PCOMMIT 0x00200000 #define PIN_BASED_EXT_INTR_MASK 0x00000001 #define PIN_BASED_NMI_EXITING 0x00000008 diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 37fee272618f..5b15d94a33f8 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -78,6 +78,7 @@ #define EXIT_REASON_PML_FULL 62 #define EXIT_REASON_XSAVES 63 #define EXIT_REASON_XRSTORS 64 +#define EXIT_REASON_PCOMMIT 65 #define VMX_EXIT_REASONS \ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ @@ -126,7 +127,8 @@ { EXIT_REASON_INVVPID, "INVVPID" }, \ { EXIT_REASON_INVPCID, "INVPCID" }, \ { EXIT_REASON_XSAVES, "XSAVES" }, \ - { EXIT_REASON_XRSTORS, "XRSTORS" } + { EXIT_REASON_XRSTORS, "XRSTORS" }, \ + { EXIT_REASON_PCOMMIT, "PCOMMIT" } #define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 #define VMX_ABORT_LOAD_HOST_MSR_FAIL 4 diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 962fc7d7e0d4..faeb0b3bb343 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -348,7 +348,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | - F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB); + F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(PCOMMIT); /* cpuid 0xD.1.eax */ const u32 kvm_supported_word10_x86_features = diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index dd05b9cef6ae..aed7bfeeaf0d 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -133,4 +133,12 @@ static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu) best = kvm_find_cpuid_entry(vcpu, 7, 0); return best && (best->ebx & bit(X86_FEATURE_MPX)); } + +static inline bool guest_cpuid_has_pcommit(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_PCOMMIT)); +} #endif diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0f15e2382109..691010604144 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2475,7 +2475,8 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_WBINVD_EXITING | - SECONDARY_EXEC_XSAVES; + SECONDARY_EXEC_XSAVES | + SECONDARY_EXEC_PCOMMIT; if (enable_ept) { /* nested EPT: emulate EPT also to L1 */ @@ -3016,7 +3017,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_SHADOW_VMCS | SECONDARY_EXEC_XSAVES | - SECONDARY_EXEC_ENABLE_PML; + SECONDARY_EXEC_ENABLE_PML | + SECONDARY_EXEC_PCOMMIT; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -4571,6 +4573,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) /* PML is enabled/disabled in creating/destorying vcpu */ exec_control &= ~SECONDARY_EXEC_ENABLE_PML; + /* Currently, we allow L1 guest to directly run pcommit instruction. */ + exec_control &= ~SECONDARY_EXEC_PCOMMIT; + return exec_control; } @@ -4614,10 +4619,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); - if (cpu_has_secondary_exec_ctrls()) { + if (cpu_has_secondary_exec_ctrls()) vmcs_write32(SECONDARY_VM_EXEC_CONTROL, vmx_secondary_exec_control(vmx)); - } if (vmx_cpu_uses_apicv(&vmx->vcpu)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); @@ -7212,6 +7216,13 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) return 1; } +static int handle_pcommit(struct kvm_vcpu *vcpu) +{ + /* we never catch pcommit instruct for L1 guest. */ + WARN_ON(1); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -7262,6 +7273,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_XSAVES] = handle_xsaves, [EXIT_REASON_XRSTORS] = handle_xrstors, [EXIT_REASON_PML_FULL] = handle_pml_full, + [EXIT_REASON_PCOMMIT] = handle_pcommit, }; static const int kvm_vmx_max_exit_handlers = @@ -7563,6 +7575,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) * the XSS exit bitmap in vmcs12. */ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); + case EXIT_REASON_PCOMMIT: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_PCOMMIT); default: return true; } @@ -8697,6 +8711,15 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) if (best) best->ebx &= ~bit(X86_FEATURE_INVPCID); } + + if (static_cpu_has(X86_FEATURE_PCOMMIT) && nested) { + if (guest_cpuid_has_pcommit(vcpu)) + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_PCOMMIT; + else + vmx->nested.nested_vmx_secondary_ctls_high &= + ~SECONDARY_EXEC_PCOMMIT; + } } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) @@ -9310,7 +9333,8 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | - SECONDARY_EXEC_APIC_REGISTER_VIRT); + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_PCOMMIT); if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) exec_control |= vmcs12->secondary_vm_exec_control; From e2821620c0908593e6cb40f79c7e78b31921ed73 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Sep 2015 14:05:52 +0800 Subject: [PATCH 020/123] KVM: VMX: drop rdtscp_enabled check in prepare_vmcs02() SECONDARY_EXEC_RDTSCP set for L2 guest comes from vmcs12 Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 691010604144..8f4c13808232 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -9327,8 +9327,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (cpu_has_secondary_exec_ctrls()) { exec_control = vmx_secondary_exec_control(vmx); - if (!vmx->rdtscp_enabled) - exec_control &= ~SECONDARY_EXEC_RDTSCP; + /* Take the following fields only from vmcs12 */ exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_RDTSCP | From f36201e5f44b55faf44ddc820929548e51ec841b Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Sep 2015 14:05:53 +0800 Subject: [PATCH 021/123] KVM: VMX: simplify rdtscp handling in vmx_cpuid_update() if vmx_rdtscp_supported() is true SECONDARY_EXEC_RDTSCP must have already been set in current vmcs by vmx_secondary_exec_control() Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8f4c13808232..d53f57aad5eb 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8677,16 +8677,15 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) vmx->rdtscp_enabled = false; if (vmx_rdtscp_supported()) { exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - if (exec_control & SECONDARY_EXEC_RDTSCP) { - best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - if (best && (best->edx & bit(X86_FEATURE_RDTSCP))) - vmx->rdtscp_enabled = true; - else { - exec_control &= ~SECONDARY_EXEC_RDTSCP; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - exec_control); - } + best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + if (best && (best->edx & bit(X86_FEATURE_RDTSCP))) + vmx->rdtscp_enabled = true; + else { + exec_control &= ~SECONDARY_EXEC_RDTSCP; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + exec_control); } + if (nested && !vmx->rdtscp_enabled) vmx->nested.nested_vmx_secondary_ctls_high &= ~SECONDARY_EXEC_RDTSCP; From 29541bb8f4a18b2939dea137315e1803b4617c8d Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Sep 2015 14:05:54 +0800 Subject: [PATCH 022/123] KVM: VMX: simplify invpcid handling in vmx_cpuid_update() If vmx_invpcid_supported() is true, second execution control filed must be supported and SECONDARY_EXEC_ENABLE_INVPCID must have already been set in current vmcs by vmx_secondary_exec_control() If vmx_invpcid_supported() is false, no need to clear SECONDARY_EXEC_ENABLE_INVPCID Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d53f57aad5eb..a5b26a940df2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8694,19 +8694,12 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) /* Exposing INVPCID only when PCID is exposed */ best = kvm_find_cpuid_entry(vcpu, 0x7, 0); if (vmx_invpcid_supported() && - best && (best->ebx & bit(X86_FEATURE_INVPCID)) && - guest_cpuid_has_pcid(vcpu)) { + (!best || !(best->ebx & bit(X86_FEATURE_INVPCID)) || + !guest_cpuid_has_pcid(vcpu))) { exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control |= SECONDARY_EXEC_ENABLE_INVPCID; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - exec_control); - } else { - if (cpu_has_secondary_exec_ctrls()) { - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - exec_control); - } + exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + if (best) best->ebx &= ~bit(X86_FEATURE_INVPCID); } From 8b97265a1516819d54c2d24b3874489a52f269d4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 15 Sep 2015 17:34:42 +0200 Subject: [PATCH 023/123] KVM: VMX: align vmx->nested.nested_vmx_secondary_ctls_high to vmx->rdtscp_enabled The SECONDARY_EXEC_RDTSCP must be available iff RDTSCP is enabled in the guest. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a5b26a940df2..dad471f8ca57 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8686,9 +8686,14 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) exec_control); } - if (nested && !vmx->rdtscp_enabled) - vmx->nested.nested_vmx_secondary_ctls_high &= - ~SECONDARY_EXEC_RDTSCP; + if (nested) { + if (vmx->rdtscp_enabled) + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_RDTSCP; + else + vmx->nested.nested_vmx_secondary_ctls_high &= + ~SECONDARY_EXEC_RDTSCP; + } } /* Exposing INVPCID only when PCID is exposed */ From feda805fe7c4ed9cf78158e73b1218752e3b4314 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Sep 2015 14:05:55 +0800 Subject: [PATCH 024/123] KVM: VMX: unify SECONDARY_VM_EXEC_CONTROL update Unify the update in vmx_cpuid_update() Signed-off-by: Xiao Guangrong [Rewrite to use vmcs_set_secondary_exec_control. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index dad471f8ca57..a7a0ed6906fe 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8668,23 +8668,38 @@ static int vmx_get_lpage_level(void) return PT_PDPE_LEVEL; } +static void vmcs_set_secondary_exec_control(u32 new_ctl) +{ + /* + * These bits in the secondary execution controls field + * are dynamic, the others are mostly based on the hypervisor + * architecture and the guest's CPUID. Do not touch the + * dynamic bits. + */ + u32 mask = + SECONDARY_EXEC_SHADOW_VMCS | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + + u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + (new_ctl & ~mask) | (cur_ctl & mask)); +} + static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 exec_control; + u32 secondary_exec_ctl = vmx_secondary_exec_control(vmx); vmx->rdtscp_enabled = false; if (vmx_rdtscp_supported()) { - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); if (best && (best->edx & bit(X86_FEATURE_RDTSCP))) vmx->rdtscp_enabled = true; - else { - exec_control &= ~SECONDARY_EXEC_RDTSCP; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - exec_control); - } + else + secondary_exec_ctl &= ~SECONDARY_EXEC_RDTSCP; if (nested) { if (vmx->rdtscp_enabled) @@ -8701,14 +8716,14 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) if (vmx_invpcid_supported() && (!best || !(best->ebx & bit(X86_FEATURE_INVPCID)) || !guest_cpuid_has_pcid(vcpu))) { - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + secondary_exec_ctl &= ~SECONDARY_EXEC_ENABLE_INVPCID; if (best) best->ebx &= ~bit(X86_FEATURE_INVPCID); } + vmcs_set_secondary_exec_control(secondary_exec_ctl); + if (static_cpu_has(X86_FEATURE_PCOMMIT) && nested) { if (guest_cpuid_has_pcommit(vcpu)) vmx->nested.nested_vmx_secondary_ctls_high |= From 7ec362964d6cc228ea68572fdd6d75a59f8b40fa Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Sep 2015 14:05:56 +0800 Subject: [PATCH 025/123] KVM: VMX: clean up bit operation on SECONDARY_VM_EXEC_CONTROL Use vmcs_set_bits() and vmcs_clear_bits() to clean up the code Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a7a0ed6906fe..2937cf136df6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6636,7 +6636,6 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) { - u32 exec_control; if (vmx->nested.current_vmptr == -1ull) return; @@ -6649,9 +6648,8 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) they were modified */ copy_shadow_to_vmcs12(vmx); vmx->nested.sync_shadow_vmcs = false; - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_SHADOW_VMCS); vmcs_write64(VMCS_LINK_POINTER, -1ull); } vmx->nested.posted_intr_nv = -1; @@ -7047,7 +7045,6 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); gpa_t vmptr; - u32 exec_control; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -7079,9 +7076,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) vmx->nested.current_vmcs12 = new_vmcs12; vmx->nested.current_vmcs12_page = page; if (enable_shadow_vmcs) { - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control |= SECONDARY_EXEC_SHADOW_VMCS; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_SHADOW_VMCS); vmcs_write64(VMCS_LINK_POINTER, __pa(vmx->nested.current_shadow_vmcs)); vmx->nested.sync_shadow_vmcs = true; @@ -7591,7 +7587,6 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) static int vmx_enable_pml(struct vcpu_vmx *vmx) { struct page *pml_pg; - u32 exec_control; pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!pml_pg) @@ -7602,24 +7597,18 @@ static int vmx_enable_pml(struct vcpu_vmx *vmx) vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control |= SECONDARY_EXEC_ENABLE_PML; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_ENABLE_PML); return 0; } static void vmx_disable_pml(struct vcpu_vmx *vmx) { - u32 exec_control; - ASSERT(vmx->pml_pg); __free_page(vmx->pml_pg); vmx->pml_pg = NULL; - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control &= ~SECONDARY_EXEC_ENABLE_PML; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_ENABLE_PML); } static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) From 1cea0ce68ed76490ffa64a9e2a7a40104efe9352 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Sep 2015 14:05:57 +0800 Subject: [PATCH 026/123] KVM: VMX: drop rdtscp_enabled field Check cpuid bit instead of it Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.h | 8 ++++++++ arch/x86/kvm/vmx.c | 17 ++++++----------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index aed7bfeeaf0d..d434ee952b00 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -141,4 +141,12 @@ static inline bool guest_cpuid_has_pcommit(struct kvm_vcpu *vcpu) best = kvm_find_cpuid_entry(vcpu, 7, 0); return best && (best->ebx & bit(X86_FEATURE_PCOMMIT)); } + +static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + return best && (best->edx & bit(X86_FEATURE_RDTSCP)); +} #endif diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2937cf136df6..6f3f97f6e248 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -532,8 +532,6 @@ struct vcpu_vmx { s64 vnmi_blocked_time; u32 exit_reason; - bool rdtscp_enabled; - /* Posted interrupt descriptor */ struct pi_desc pi_desc; @@ -2208,7 +2206,7 @@ static void setup_msrs(struct vcpu_vmx *vmx) if (index >= 0) move_msr_up(vmx, index, save_nmsrs++); index = __find_msr_index(vmx, MSR_TSC_AUX); - if (index >= 0 && vmx->rdtscp_enabled) + if (index >= 0 && guest_cpuid_has_rdtscp(&vmx->vcpu)) move_msr_up(vmx, index, save_nmsrs++); /* * MSR_STAR is only needed on long mode guests, and only @@ -2675,7 +2673,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.ia32_xss; break; case MSR_TSC_AUX: - if (!to_vmx(vcpu)->rdtscp_enabled) + if (!guest_cpuid_has_rdtscp(vcpu)) return 1; /* Otherwise falls through */ default: @@ -2781,7 +2779,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) clear_atomic_switch_msr(vmx, MSR_IA32_XSS); break; case MSR_TSC_AUX: - if (!vmx->rdtscp_enabled) + if (!guest_cpuid_has_rdtscp(vcpu)) return 1; /* Check reserved bit, higher 32 bits should be zero */ if ((data >> 32) != 0) @@ -8682,16 +8680,13 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); u32 secondary_exec_ctl = vmx_secondary_exec_control(vmx); - vmx->rdtscp_enabled = false; if (vmx_rdtscp_supported()) { - best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - if (best && (best->edx & bit(X86_FEATURE_RDTSCP))) - vmx->rdtscp_enabled = true; - else + bool rdtscp_enabled = guest_cpuid_has_rdtscp(vcpu); + if (!rdtscp_enabled) secondary_exec_ctl &= ~SECONDARY_EXEC_RDTSCP; if (nested) { - if (vmx->rdtscp_enabled) + if (rdtscp_enabled) vmx->nested.nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_RDTSCP; else From 72c930dcfc2b49404ee9e20f6c868402e9c71166 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Fri, 18 Sep 2015 17:54:29 +0200 Subject: [PATCH 027/123] x86: kvmclock: abolish PVCLOCK_COUNTS_FROM_ZERO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Newer KVM won't be exposing PVCLOCK_COUNTS_FROM_ZERO anymore. The purpose of that flags was to start counting system time from 0 when the KVM clock has been initialized. We can achieve the same by selecting one read as the initial point. A simple subtraction will work unless the KVM clock count overflows earlier (has smaller width) than scheduler's cycle count. We should be safe till x86_128. Because PVCLOCK_COUNTS_FROM_ZERO was enabled only on new hypervisors, setting sched clock as stable based on PVCLOCK_TSC_STABLE_BIT might regress on older ones. I presume we don't need to change kvm_clock_read instead of introducing kvm_sched_clock_read. A problem could arise in case sched_clock is expected to return the same value as get_cycles, but we should have merged those clocks in that case. Signed-off-by: Radim Krčmář Acked-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvmclock.c | 46 +++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 2c7aafa70702..2bd81e302427 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -32,6 +32,7 @@ static int kvmclock = 1; static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; +static cycle_t kvm_sched_clock_offset; static int parse_no_kvmclock(char *arg) { @@ -92,6 +93,29 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs) return kvm_clock_read(); } +static cycle_t kvm_sched_clock_read(void) +{ + return kvm_clock_read() - kvm_sched_clock_offset; +} + +static inline void kvm_sched_clock_init(bool stable) +{ + if (!stable) { + pv_time_ops.sched_clock = kvm_clock_read; + return; + } + + kvm_sched_clock_offset = kvm_clock_read(); + pv_time_ops.sched_clock = kvm_sched_clock_read; + set_sched_clock_stable(); + + printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n", + kvm_sched_clock_offset); + + BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) > + sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time)); +} + /* * If we don't do that, there is the possibility that the guest * will calibrate under heavy load - thus, getting a lower lpj - @@ -248,7 +272,17 @@ void __init kvmclock_init(void) memblock_free(mem, size); return; } - pv_time_ops.sched_clock = kvm_clock_read; + + if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) + pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); + + cpu = get_cpu(); + vcpu_time = &hv_clock[cpu].pvti; + flags = pvclock_read_flags(vcpu_time); + + kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT); + put_cpu(); + x86_platform.calibrate_tsc = kvm_get_tsc_khz; x86_platform.get_wallclock = kvm_get_wallclock; x86_platform.set_wallclock = kvm_set_wallclock; @@ -265,16 +299,6 @@ void __init kvmclock_init(void) kvm_get_preset_lpj(); clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); pv_info.name = "KVM"; - - if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) - pvclock_set_flags(~0); - - cpu = get_cpu(); - vcpu_time = &hv_clock[cpu].pvti; - flags = pvclock_read_flags(vcpu_time); - if (flags & PVCLOCK_COUNTS_FROM_ZERO) - set_sched_clock_stable(); - put_cpu(); } int __init kvm_setup_vsyscall_timeinfo(void) From 18cd52c4d9bfbd081fb643021ba8d048475cd82a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 18 Sep 2015 18:04:17 +0200 Subject: [PATCH 028/123] irq_remapping: move structs outside #ifdef This is friendlier to clients of the code, who are going to prepare vcpu_data structs unconditionally, even if CONFIG_IRQ_REMAP is not defined. Reviewed-by: Thomas Gleixner Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/irq_remapping.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 046c7fb1ca43..a210eba2727c 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -33,6 +33,11 @@ enum irq_remap_cap { IRQ_POSTING_CAP = 0, }; +struct vcpu_data { + u64 pi_desc_addr; /* Physical address of PI Descriptor */ + u32 vector; /* Guest vector of the interrupt */ +}; + #ifdef CONFIG_IRQ_REMAP extern bool irq_remapping_cap(enum irq_remap_cap cap); @@ -58,11 +63,6 @@ static inline struct irq_domain *arch_get_ir_parent_domain(void) return x86_vector_domain; } -struct vcpu_data { - u64 pi_desc_addr; /* Physical address of PI Descriptor */ - u32 vector; /* Guest vector of the interrupt */ -}; - #else /* CONFIG_IRQ_REMAP */ static inline bool irq_remapping_cap(enum irq_remap_cap cap) { return 0; } From f73f8173126ba68eb1c42bd9a234a51d78576ca6 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 18 Sep 2015 22:29:39 +0800 Subject: [PATCH 029/123] virt: IRQ bypass manager When a physical I/O device is assigned to a virtual machine through facilities like VFIO and KVM, the interrupt for the device generally bounces through the host system before being injected into the VM. However, hardware technologies exist that often allow the host to be bypassed for some of these scenarios. Intel Posted Interrupts allow the specified physical edge interrupts to be directly injected into a guest when delivered to a physical processor while the vCPU is running. ARM IRQ Forwarding allows forwarded physical interrupts to be directly deactivated by the guest. The IRQ bypass manager here is meant to provide the shim to connect interrupt producers, generally the host physical device driver, with interrupt consumers, generally the hypervisor, in order to configure these bypass mechanism. To do this, we base the connection on a shared, opaque token. For KVM-VFIO this is expected to be an eventfd_ctx since this is the connection we already use to connect an eventfd to an irqfd on the in-kernel path. When a producer and consumer with matching tokens is found, callbacks via both registered participants allow the bypass facilities to be automatically enabled. Signed-off-by: Alex Williamson Reviewed-by: Eric Auger Tested-by: Eric Auger Tested-by: Feng Wu Signed-off-by: Feng Wu Signed-off-by: Paolo Bonzini --- MAINTAINERS | 7 ++ include/linux/irqbypass.h | 90 +++++++++++++ virt/lib/Kconfig | 2 + virt/lib/Makefile | 1 + virt/lib/irqbypass.c | 257 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 357 insertions(+) create mode 100644 include/linux/irqbypass.h create mode 100644 virt/lib/Kconfig create mode 100644 virt/lib/Makefile create mode 100644 virt/lib/irqbypass.c diff --git a/MAINTAINERS b/MAINTAINERS index 797236befd27..3b738cb8bdeb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11177,6 +11177,13 @@ L: netdev@vger.kernel.org S: Maintained F: drivers/net/ethernet/via/via-velocity.* +VIRT LIB +M: Alex Williamson +M: Paolo Bonzini +L: kvm@vger.kernel.org +S: Supported +F: virt/lib/ + VIVID VIRTUAL VIDEO DRIVER M: Hans Verkuil L: linux-media@vger.kernel.org diff --git a/include/linux/irqbypass.h b/include/linux/irqbypass.h new file mode 100644 index 000000000000..1551b5b2f4c2 --- /dev/null +++ b/include/linux/irqbypass.h @@ -0,0 +1,90 @@ +/* + * IRQ offload/bypass manager + * + * Copyright (C) 2015 Red Hat, Inc. + * Copyright (c) 2015 Linaro Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef IRQBYPASS_H +#define IRQBYPASS_H + +#include + +struct irq_bypass_consumer; + +/* + * Theory of operation + * + * The IRQ bypass manager is a simple set of lists and callbacks that allows + * IRQ producers (ex. physical interrupt sources) to be matched to IRQ + * consumers (ex. virtualization hardware that allows IRQ bypass or offload) + * via a shared token (ex. eventfd_ctx). Producers and consumers register + * independently. When a token match is found, the optional @stop callback + * will be called for each participant. The pair will then be connected via + * the @add_* callbacks, and finally the optional @start callback will allow + * any final coordination. When either participant is unregistered, the + * process is repeated using the @del_* callbacks in place of the @add_* + * callbacks. Match tokens must be unique per producer/consumer, 1:N pairings + * are not supported. + */ + +/** + * struct irq_bypass_producer - IRQ bypass producer definition + * @node: IRQ bypass manager private list management + * @token: opaque token to match between producer and consumer + * @irq: Linux IRQ number for the producer device + * @add_consumer: Connect the IRQ producer to an IRQ consumer (optional) + * @del_consumer: Disconnect the IRQ producer from an IRQ consumer (optional) + * @stop: Perform any quiesce operations necessary prior to add/del (optional) + * @start: Perform any startup operations necessary after add/del (optional) + * + * The IRQ bypass producer structure represents an interrupt source for + * participation in possible host bypass, for instance an interrupt vector + * for a physical device assigned to a VM. + */ +struct irq_bypass_producer { + struct list_head node; + void *token; + int irq; + int (*add_consumer)(struct irq_bypass_producer *, + struct irq_bypass_consumer *); + void (*del_consumer)(struct irq_bypass_producer *, + struct irq_bypass_consumer *); + void (*stop)(struct irq_bypass_producer *); + void (*start)(struct irq_bypass_producer *); +}; + +/** + * struct irq_bypass_consumer - IRQ bypass consumer definition + * @node: IRQ bypass manager private list management + * @token: opaque token to match between producer and consumer + * @add_producer: Connect the IRQ consumer to an IRQ producer + * @del_producer: Disconnect the IRQ consumer from an IRQ producer + * @stop: Perform any quiesce operations necessary prior to add/del (optional) + * @start: Perform any startup operations necessary after add/del (optional) + * + * The IRQ bypass consumer structure represents an interrupt sink for + * participation in possible host bypass, for instance a hypervisor may + * support offloads to allow bypassing the host entirely or offload + * portions of the interrupt handling to the VM. + */ +struct irq_bypass_consumer { + struct list_head node; + void *token; + int (*add_producer)(struct irq_bypass_consumer *, + struct irq_bypass_producer *); + void (*del_producer)(struct irq_bypass_consumer *, + struct irq_bypass_producer *); + void (*stop)(struct irq_bypass_consumer *); + void (*start)(struct irq_bypass_consumer *); +}; + +int irq_bypass_register_producer(struct irq_bypass_producer *); +void irq_bypass_unregister_producer(struct irq_bypass_producer *); +int irq_bypass_register_consumer(struct irq_bypass_consumer *); +void irq_bypass_unregister_consumer(struct irq_bypass_consumer *); + +#endif /* IRQBYPASS_H */ diff --git a/virt/lib/Kconfig b/virt/lib/Kconfig new file mode 100644 index 000000000000..89a414f815d2 --- /dev/null +++ b/virt/lib/Kconfig @@ -0,0 +1,2 @@ +config IRQ_BYPASS_MANAGER + tristate diff --git a/virt/lib/Makefile b/virt/lib/Makefile new file mode 100644 index 000000000000..901228d1ffbc --- /dev/null +++ b/virt/lib/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_IRQ_BYPASS_MANAGER) += irqbypass.o diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c new file mode 100644 index 000000000000..09a03b5a21ff --- /dev/null +++ b/virt/lib/irqbypass.c @@ -0,0 +1,257 @@ +/* + * IRQ offload/bypass manager + * + * Copyright (C) 2015 Red Hat, Inc. + * Copyright (c) 2015 Linaro Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Various virtualization hardware acceleration techniques allow bypassing or + * offloading interrupts received from devices around the host kernel. Posted + * Interrupts on Intel VT-d systems can allow interrupts to be received + * directly by a virtual machine. ARM IRQ Forwarding allows forwarded physical + * interrupts to be directly deactivated by the guest. This manager allows + * interrupt producers and consumers to find each other to enable this sort of + * bypass. + */ + +#include +#include +#include +#include + +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("IRQ bypass manager utility module"); + +static LIST_HEAD(producers); +static LIST_HEAD(consumers); +static DEFINE_MUTEX(lock); + +/* @lock must be held when calling connect */ +static int __connect(struct irq_bypass_producer *prod, + struct irq_bypass_consumer *cons) +{ + int ret = 0; + + if (prod->stop) + prod->stop(prod); + if (cons->stop) + cons->stop(cons); + + if (prod->add_consumer) + ret = prod->add_consumer(prod, cons); + + if (!ret) { + ret = cons->add_producer(cons, prod); + if (ret && prod->del_consumer) + prod->del_consumer(prod, cons); + } + + if (cons->start) + cons->start(cons); + if (prod->start) + prod->start(prod); + + return ret; +} + +/* @lock must be held when calling disconnect */ +static void __disconnect(struct irq_bypass_producer *prod, + struct irq_bypass_consumer *cons) +{ + if (prod->stop) + prod->stop(prod); + if (cons->stop) + cons->stop(cons); + + cons->del_producer(cons, prod); + + if (prod->del_consumer) + prod->del_consumer(prod, cons); + + if (cons->start) + cons->start(cons); + if (prod->start) + prod->start(prod); +} + +/** + * irq_bypass_register_producer - register IRQ bypass producer + * @producer: pointer to producer structure + * + * Add the provided IRQ producer to the list of producers and connect + * with any matching token found on the IRQ consumers list. + */ +int irq_bypass_register_producer(struct irq_bypass_producer *producer) +{ + struct irq_bypass_producer *tmp; + struct irq_bypass_consumer *consumer; + + might_sleep(); + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + mutex_lock(&lock); + + list_for_each_entry(tmp, &producers, node) { + if (tmp->token == producer->token) { + mutex_unlock(&lock); + module_put(THIS_MODULE); + return -EBUSY; + } + } + + list_for_each_entry(consumer, &consumers, node) { + if (consumer->token == producer->token) { + int ret = __connect(producer, consumer); + if (ret) { + mutex_unlock(&lock); + module_put(THIS_MODULE); + return ret; + } + break; + } + } + + list_add(&producer->node, &producers); + + mutex_unlock(&lock); + + return 0; +} +EXPORT_SYMBOL_GPL(irq_bypass_register_producer); + +/** + * irq_bypass_unregister_producer - unregister IRQ bypass producer + * @producer: pointer to producer structure + * + * Remove a previously registered IRQ producer from the list of producers + * and disconnect it from any connected IRQ consumer. + */ +void irq_bypass_unregister_producer(struct irq_bypass_producer *producer) +{ + struct irq_bypass_producer *tmp; + struct irq_bypass_consumer *consumer; + + might_sleep(); + + if (!try_module_get(THIS_MODULE)) + return; /* nothing in the list anyway */ + + mutex_lock(&lock); + + list_for_each_entry(tmp, &producers, node) { + if (tmp->token != producer->token) + continue; + + list_for_each_entry(consumer, &consumers, node) { + if (consumer->token == producer->token) { + __disconnect(producer, consumer); + break; + } + } + + list_del(&producer->node); + module_put(THIS_MODULE); + break; + } + + mutex_unlock(&lock); + + module_put(THIS_MODULE); +} +EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer); + +/** + * irq_bypass_register_consumer - register IRQ bypass consumer + * @consumer: pointer to consumer structure + * + * Add the provided IRQ consumer to the list of consumers and connect + * with any matching token found on the IRQ producer list. + */ +int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer) +{ + struct irq_bypass_consumer *tmp; + struct irq_bypass_producer *producer; + + if (!consumer->add_producer || !consumer->del_producer) + return -EINVAL; + + might_sleep(); + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + mutex_lock(&lock); + + list_for_each_entry(tmp, &consumers, node) { + if (tmp->token == consumer->token) { + mutex_unlock(&lock); + module_put(THIS_MODULE); + return -EBUSY; + } + } + + list_for_each_entry(producer, &producers, node) { + if (producer->token == consumer->token) { + int ret = __connect(producer, consumer); + if (ret) { + mutex_unlock(&lock); + module_put(THIS_MODULE); + return ret; + } + break; + } + } + + list_add(&consumer->node, &consumers); + + mutex_unlock(&lock); + + return 0; +} +EXPORT_SYMBOL_GPL(irq_bypass_register_consumer); + +/** + * irq_bypass_unregister_consumer - unregister IRQ bypass consumer + * @consumer: pointer to consumer structure + * + * Remove a previously registered IRQ consumer from the list of consumers + * and disconnect it from any connected IRQ producer. + */ +void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer) +{ + struct irq_bypass_consumer *tmp; + struct irq_bypass_producer *producer; + + might_sleep(); + + if (!try_module_get(THIS_MODULE)) + return; /* nothing in the list anyway */ + + mutex_lock(&lock); + + list_for_each_entry(tmp, &consumers, node) { + if (tmp->token != consumer->token) + continue; + + list_for_each_entry(producer, &producers, node) { + if (producer->token == consumer->token) { + __disconnect(producer, consumer); + break; + } + } + + list_del(&consumer->node); + module_put(THIS_MODULE); + break; + } + + mutex_unlock(&lock); + + module_put(THIS_MODULE); +} +EXPORT_SYMBOL_GPL(irq_bypass_unregister_consumer); From 37d9fe4783ffcaddcc4afe67626691e62c5ab30e Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Tue, 22 Sep 2015 16:47:29 +0800 Subject: [PATCH 030/123] virt: Add virt directory to the top Makefile We need to build files in virt/lib/, which are now used by KVM and VFIO, so add virt directory to the top Makefile. Signed-off-by: Feng Wu Acked-by: Michal Marek Signed-off-by: Paolo Bonzini --- Makefile | 10 ++++++---- virt/Makefile | 1 + 2 files changed, 7 insertions(+), 4 deletions(-) create mode 100644 virt/Makefile diff --git a/Makefile b/Makefile index 1d341eba143d..7c6c9769bd1c 100644 --- a/Makefile +++ b/Makefile @@ -550,6 +550,7 @@ drivers-y := drivers/ sound/ firmware/ net-y := net/ libs-y := lib/ core-y := usr/ +virt-y := virt/ endif # KBUILD_EXTMOD ifeq ($(dot-config),1) @@ -882,10 +883,10 @@ core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ block/ vmlinux-dirs := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \ $(core-y) $(core-m) $(drivers-y) $(drivers-m) \ - $(net-y) $(net-m) $(libs-y) $(libs-m))) + $(net-y) $(net-m) $(libs-y) $(libs-m) $(virt-y))) vmlinux-alldirs := $(sort $(vmlinux-dirs) $(patsubst %/,%,$(filter %/, \ - $(init-) $(core-) $(drivers-) $(net-) $(libs-)))) + $(init-) $(core-) $(drivers-) $(net-) $(libs-) $(virt-)))) init-y := $(patsubst %/, %/built-in.o, $(init-y)) core-y := $(patsubst %/, %/built-in.o, $(core-y)) @@ -894,14 +895,15 @@ net-y := $(patsubst %/, %/built-in.o, $(net-y)) libs-y1 := $(patsubst %/, %/lib.a, $(libs-y)) libs-y2 := $(patsubst %/, %/built-in.o, $(libs-y)) libs-y := $(libs-y1) $(libs-y2) +virt-y := $(patsubst %/, %/built-in.o, $(virt-y)) # Externally visible symbols (used by link-vmlinux.sh) export KBUILD_VMLINUX_INIT := $(head-y) $(init-y) -export KBUILD_VMLINUX_MAIN := $(core-y) $(libs-y) $(drivers-y) $(net-y) +export KBUILD_VMLINUX_MAIN := $(core-y) $(libs-y) $(drivers-y) $(net-y) $(virt-y) export KBUILD_LDS := arch/$(SRCARCH)/kernel/vmlinux.lds export LDFLAGS_vmlinux # used by scripts/pacmage/Makefile -export KBUILD_ALLDIRS := $(sort $(filter-out arch/%,$(vmlinux-alldirs)) arch Documentation include samples scripts tools virt) +export KBUILD_ALLDIRS := $(sort $(filter-out arch/%,$(vmlinux-alldirs)) arch Documentation include samples scripts tools) vmlinux-deps := $(KBUILD_LDS) $(KBUILD_VMLINUX_INIT) $(KBUILD_VMLINUX_MAIN) diff --git a/virt/Makefile b/virt/Makefile new file mode 100644 index 000000000000..be783472ac81 --- /dev/null +++ b/virt/Makefile @@ -0,0 +1 @@ +obj-y += lib/ From 166c9775f1f8b8f00ad1db0fa5c8fc74059d965d Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Fri, 18 Sep 2015 22:29:42 +0800 Subject: [PATCH 031/123] KVM: create kvm_irqfd.h Move _irqfd_resampler and _irqfd struct declarations in a new public header: kvm_irqfd.h. They are respectively renamed into kvm_kernel_irqfd_resampler and kvm_kernel_irqfd. Those datatypes will be used by architecture specific code, in the context of IRQ bypass manager integration. Signed-off-by: Eric Auger Signed-off-by: Feng Wu Reviewed-by: Alex Williamson Signed-off-by: Paolo Bonzini --- include/linux/kvm_irqfd.h | 69 ++++++++++++++++++++++++++++ virt/kvm/eventfd.c | 95 ++++++++++----------------------------- 2 files changed, 92 insertions(+), 72 deletions(-) create mode 100644 include/linux/kvm_irqfd.h diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h new file mode 100644 index 000000000000..f926b39a26b6 --- /dev/null +++ b/include/linux/kvm_irqfd.h @@ -0,0 +1,69 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * irqfd: Allows an fd to be used to inject an interrupt to the guest + * Credit goes to Avi Kivity for the original idea. + */ + +#ifndef __LINUX_KVM_IRQFD_H +#define __LINUX_KVM_IRQFD_H + +#include +#include + +/* + * Resampling irqfds are a special variety of irqfds used to emulate + * level triggered interrupts. The interrupt is asserted on eventfd + * trigger. On acknowledgment through the irq ack notifier, the + * interrupt is de-asserted and userspace is notified through the + * resamplefd. All resamplers on the same gsi are de-asserted + * together, so we don't need to track the state of each individual + * user. We can also therefore share the same irq source ID. + */ +struct kvm_kernel_irqfd_resampler { + struct kvm *kvm; + /* + * List of resampling struct _irqfd objects sharing this gsi. + * RCU list modified under kvm->irqfds.resampler_lock + */ + struct list_head list; + struct kvm_irq_ack_notifier notifier; + /* + * Entry in list of kvm->irqfd.resampler_list. Use for sharing + * resamplers among irqfds on the same gsi. + * Accessed and modified under kvm->irqfds.resampler_lock + */ + struct list_head link; +}; + +struct kvm_kernel_irqfd { + /* Used for MSI fast-path */ + struct kvm *kvm; + wait_queue_t wait; + /* Update side is protected by irqfds.lock */ + struct kvm_kernel_irq_routing_entry irq_entry; + seqcount_t irq_entry_sc; + /* Used for level IRQ fast-path */ + int gsi; + struct work_struct inject; + /* The resampler used by this irqfd (resampler-only) */ + struct kvm_kernel_irqfd_resampler *resampler; + /* Eventfd notified on resample (resampler-only) */ + struct eventfd_ctx *resamplefd; + /* Entry in list of irqfds for a resampler (resampler-only) */ + struct list_head resampler_link; + /* Used for setup/shutdown */ + struct eventfd_ctx *eventfd; + struct list_head list; + poll_table pt; + struct work_struct shutdown; +}; + +#endif /* __LINUX_KVM_IRQFD_H */ diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index ac89299b8699..413f5a6b61ba 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -39,68 +40,14 @@ #include #ifdef CONFIG_HAVE_KVM_IRQFD -/* - * -------------------------------------------------------------------- - * irqfd: Allows an fd to be used to inject an interrupt to the guest - * - * Credit goes to Avi Kivity for the original idea. - * -------------------------------------------------------------------- - */ - -/* - * Resampling irqfds are a special variety of irqfds used to emulate - * level triggered interrupts. The interrupt is asserted on eventfd - * trigger. On acknowledgement through the irq ack notifier, the - * interrupt is de-asserted and userspace is notified through the - * resamplefd. All resamplers on the same gsi are de-asserted - * together, so we don't need to track the state of each individual - * user. We can also therefore share the same irq source ID. - */ -struct _irqfd_resampler { - struct kvm *kvm; - /* - * List of resampling struct _irqfd objects sharing this gsi. - * RCU list modified under kvm->irqfds.resampler_lock - */ - struct list_head list; - struct kvm_irq_ack_notifier notifier; - /* - * Entry in list of kvm->irqfd.resampler_list. Use for sharing - * resamplers among irqfds on the same gsi. - * Accessed and modified under kvm->irqfds.resampler_lock - */ - struct list_head link; -}; - -struct _irqfd { - /* Used for MSI fast-path */ - struct kvm *kvm; - wait_queue_t wait; - /* Update side is protected by irqfds.lock */ - struct kvm_kernel_irq_routing_entry irq_entry; - seqcount_t irq_entry_sc; - /* Used for level IRQ fast-path */ - int gsi; - struct work_struct inject; - /* The resampler used by this irqfd (resampler-only) */ - struct _irqfd_resampler *resampler; - /* Eventfd notified on resample (resampler-only) */ - struct eventfd_ctx *resamplefd; - /* Entry in list of irqfds for a resampler (resampler-only) */ - struct list_head resampler_link; - /* Used for setup/shutdown */ - struct eventfd_ctx *eventfd; - struct list_head list; - poll_table pt; - struct work_struct shutdown; -}; static struct workqueue_struct *irqfd_cleanup_wq; static void irqfd_inject(struct work_struct *work) { - struct _irqfd *irqfd = container_of(work, struct _irqfd, inject); + struct kvm_kernel_irqfd *irqfd = + container_of(work, struct kvm_kernel_irqfd, inject); struct kvm *kvm = irqfd->kvm; if (!irqfd->resampler) { @@ -121,12 +68,13 @@ irqfd_inject(struct work_struct *work) static void irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian) { - struct _irqfd_resampler *resampler; + struct kvm_kernel_irqfd_resampler *resampler; struct kvm *kvm; - struct _irqfd *irqfd; + struct kvm_kernel_irqfd *irqfd; int idx; - resampler = container_of(kian, struct _irqfd_resampler, notifier); + resampler = container_of(kian, + struct kvm_kernel_irqfd_resampler, notifier); kvm = resampler->kvm; kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, @@ -141,9 +89,9 @@ irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian) } static void -irqfd_resampler_shutdown(struct _irqfd *irqfd) +irqfd_resampler_shutdown(struct kvm_kernel_irqfd *irqfd) { - struct _irqfd_resampler *resampler = irqfd->resampler; + struct kvm_kernel_irqfd_resampler *resampler = irqfd->resampler; struct kvm *kvm = resampler->kvm; mutex_lock(&kvm->irqfds.resampler_lock); @@ -168,7 +116,8 @@ irqfd_resampler_shutdown(struct _irqfd *irqfd) static void irqfd_shutdown(struct work_struct *work) { - struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown); + struct kvm_kernel_irqfd *irqfd = + container_of(work, struct kvm_kernel_irqfd, shutdown); u64 cnt; /* @@ -198,7 +147,7 @@ irqfd_shutdown(struct work_struct *work) /* assumes kvm->irqfds.lock is held */ static bool -irqfd_is_active(struct _irqfd *irqfd) +irqfd_is_active(struct kvm_kernel_irqfd *irqfd) { return list_empty(&irqfd->list) ? false : true; } @@ -209,7 +158,7 @@ irqfd_is_active(struct _irqfd *irqfd) * assumes kvm->irqfds.lock is held */ static void -irqfd_deactivate(struct _irqfd *irqfd) +irqfd_deactivate(struct kvm_kernel_irqfd *irqfd) { BUG_ON(!irqfd_is_active(irqfd)); @@ -224,7 +173,8 @@ irqfd_deactivate(struct _irqfd *irqfd) static int irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) { - struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait); + struct kvm_kernel_irqfd *irqfd = + container_of(wait, struct kvm_kernel_irqfd, wait); unsigned long flags = (unsigned long)key; struct kvm_kernel_irq_routing_entry irq; struct kvm *kvm = irqfd->kvm; @@ -274,12 +224,13 @@ static void irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, poll_table *pt) { - struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt); + struct kvm_kernel_irqfd *irqfd = + container_of(pt, struct kvm_kernel_irqfd, pt); add_wait_queue(wqh, &irqfd->wait); } /* Must be called under irqfds.lock */ -static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd) +static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd) { struct kvm_kernel_irq_routing_entry *e; struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; @@ -304,7 +255,7 @@ static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd) static int kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) { - struct _irqfd *irqfd, *tmp; + struct kvm_kernel_irqfd *irqfd, *tmp; struct fd f; struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL; int ret; @@ -340,7 +291,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) irqfd->eventfd = eventfd; if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) { - struct _irqfd_resampler *resampler; + struct kvm_kernel_irqfd_resampler *resampler; resamplefd = eventfd_ctx_fdget(args->resamplefd); if (IS_ERR(resamplefd)) { @@ -525,7 +476,7 @@ kvm_eventfd_init(struct kvm *kvm) static int kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args) { - struct _irqfd *irqfd, *tmp; + struct kvm_kernel_irqfd *irqfd, *tmp; struct eventfd_ctx *eventfd; eventfd = eventfd_ctx_fdget(args->fd); @@ -581,7 +532,7 @@ kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) void kvm_irqfd_release(struct kvm *kvm) { - struct _irqfd *irqfd, *tmp; + struct kvm_kernel_irqfd *irqfd, *tmp; spin_lock_irq(&kvm->irqfds.lock); @@ -604,7 +555,7 @@ kvm_irqfd_release(struct kvm *kvm) */ void kvm_irq_routing_update(struct kvm *kvm) { - struct _irqfd *irqfd; + struct kvm_kernel_irqfd *irqfd; spin_lock_irq(&kvm->irqfds.lock); From 1a02b27035f82091d51ecafcb9ccaac1f31d4eb2 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Fri, 18 Sep 2015 22:29:43 +0800 Subject: [PATCH 032/123] KVM: introduce kvm_arch functions for IRQ bypass This patch introduces - kvm_arch_irq_bypass_add_producer - kvm_arch_irq_bypass_del_producer - kvm_arch_irq_bypass_stop - kvm_arch_irq_bypass_start They make possible to specialize the KVM IRQ bypass consumer in case CONFIG_KVM_HAVE_IRQ_BYPASS is set. Signed-off-by: Eric Auger [Add weak implementations of the callbacks. - Feng] Signed-off-by: Feng Wu Reviewed-by: Alex Williamson Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 10 ++++++++++ virt/kvm/Kconfig | 3 +++ virt/kvm/eventfd.c | 12 ++++++++++++ 3 files changed, 25 insertions(+) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index cd0ba2e931e1..b8543b02b7bc 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -1163,4 +1164,13 @@ static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) { } #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ + +#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS +int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *, + struct irq_bypass_producer *); +void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *, + struct irq_bypass_producer *); +void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *); +void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *); +#endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */ #endif diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index e2c876d5a03b..9f8014dda2cf 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -47,3 +47,6 @@ config KVM_GENERIC_DIRTYLOG_READ_PROTECT config KVM_COMPAT def_bool y depends on COMPAT && !S390 + +config HAVE_KVM_IRQ_BYPASS + bool diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 413f5a6b61ba..c4f7abec4261 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -252,6 +252,18 @@ static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd) write_seqcount_end(&irqfd->irq_entry_sc); } +#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS +void __attribute__((weak)) kvm_arch_irq_bypass_stop( + struct irq_bypass_consumer *cons) +{ +} + +void __attribute__((weak)) kvm_arch_irq_bypass_start( + struct irq_bypass_consumer *cons) +{ +} +#endif + static int kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) { From 9016cfb577a15abd6a7990890ccf6bf1edf04d31 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Fri, 18 Sep 2015 22:29:44 +0800 Subject: [PATCH 033/123] KVM: eventfd: add irq bypass consumer management This patch adds the registration/unregistration of an irq_bypass_consumer on irqfd assignment/deassignment. Signed-off-by: Eric Auger Signed-off-by: Feng Wu Reviewed-by: Alex Williamson Signed-off-by: Paolo Bonzini --- include/linux/kvm_irqfd.h | 2 ++ virt/kvm/eventfd.c | 15 +++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h index f926b39a26b6..0c1de05098c8 100644 --- a/include/linux/kvm_irqfd.h +++ b/include/linux/kvm_irqfd.h @@ -64,6 +64,8 @@ struct kvm_kernel_irqfd { struct list_head list; poll_table pt; struct work_struct shutdown; + struct irq_bypass_consumer consumer; + struct irq_bypass_producer *producer; }; #endif /* __LINUX_KVM_IRQFD_H */ diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index c4f7abec4261..7df356d8f1fd 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -140,6 +141,9 @@ irqfd_shutdown(struct work_struct *work) /* * It is now safe to release the object's resources */ +#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS + irq_bypass_unregister_consumer(&irqfd->consumer); +#endif eventfd_ctx_put(irqfd->eventfd); kfree(irqfd); } @@ -391,6 +395,17 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) * we might race against the POLLHUP */ fdput(f); +#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS + irqfd->consumer.token = (void *)irqfd->eventfd; + irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer; + irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer; + irqfd->consumer.stop = kvm_arch_irq_bypass_stop; + irqfd->consumer.start = kvm_arch_irq_bypass_start; + ret = irq_bypass_register_consumer(&irqfd->consumer); + if (ret) + pr_info("irq bypass consumer (token %p) registration fails: %d\n", + irqfd->consumer.token, ret); +#endif return 0; From f70c20aaf141adb715a2d750c55154073b02a9c3 Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:53 +0800 Subject: [PATCH 034/123] KVM: Add an arch specific hooks in 'struct kvm_kernel_irqfd' This patch adds an arch specific hooks 'arch_update' in 'struct kvm_kernel_irqfd'. On Intel side, it is used to update the IRTE when VT-d posted-interrupts is used. Signed-off-by: Feng Wu Reviewed-by: Alex Williamson Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 ++ virt/kvm/eventfd.c | 19 ++++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b8543b02b7bc..5c3f4538807f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1172,5 +1172,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *, struct irq_bypass_producer *); void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *); void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *); +int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set); #endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */ #endif diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 7df356d8f1fd..b637965746bb 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -266,6 +266,13 @@ void __attribute__((weak)) kvm_arch_irq_bypass_start( struct irq_bypass_consumer *cons) { } + +int __attribute__((weak)) kvm_arch_update_irqfd_routing( + struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set) +{ + return 0; +} #endif static int @@ -586,9 +593,19 @@ void kvm_irq_routing_update(struct kvm *kvm) spin_lock_irq(&kvm->irqfds.lock); - list_for_each_entry(irqfd, &kvm->irqfds.items, list) + list_for_each_entry(irqfd, &kvm->irqfds.items, list) { irqfd_update(kvm, irqfd); +#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS + if (irqfd->producer) { + int ret = kvm_arch_update_irqfd_routing( + irqfd->kvm, irqfd->producer->irq, + irqfd->gsi, 1); + WARN_ON(ret); + } +#endif + } + spin_unlock_irq(&kvm->irqfds.lock); } From 6ef1522f7ecc063317dfb5ca63da6e47130a4c50 Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:45 +0800 Subject: [PATCH 035/123] KVM: Extend struct pi_desc for VT-d Posted-Interrupts Extend struct pi_desc for VT-d Posted-Interrupts. Signed-off-by: Feng Wu Reviewed-by: Alex Williamson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6f3f97f6e248..4ebdccd882c8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -446,8 +446,24 @@ struct nested_vmx { /* Posted-Interrupt Descriptor */ struct pi_desc { u32 pir[8]; /* Posted interrupt requested */ - u32 control; /* bit 0 of control is outstanding notification bit */ - u32 rsvd[7]; + union { + struct { + /* bit 256 - Outstanding Notification */ + u16 on : 1, + /* bit 257 - Suppress Notification */ + sn : 1, + /* bit 271:258 - Reserved */ + rsvd_1 : 14; + /* bit 279:272 - Notification Vector */ + u8 nv; + /* bit 287:280 - Reserved */ + u8 rsvd_2; + /* bit 319:288 - Notification Destination */ + u32 ndst; + }; + u64 control; + }; + u32 rsvd[6]; } __aligned(64); static bool pi_test_and_set_on(struct pi_desc *pi_desc) From ebbfc765369690cf8fc512615e6b83ec1702f8ac Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:46 +0800 Subject: [PATCH 036/123] KVM: Add some helper functions for Posted-Interrupts This patch adds some helper functions to manipulate the Posted-Interrupts Descriptor. Signed-off-by: Feng Wu Reviewed-by: Paolo Bonzini Reviewed-by: Alex Williamson [Make the new functions inline. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4ebdccd882c8..4739a52874c2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -443,6 +443,8 @@ struct nested_vmx { }; #define POSTED_INTR_ON 0 +#define POSTED_INTR_SN 1 + /* Posted-Interrupt Descriptor */ struct pi_desc { u32 pir[8]; /* Posted interrupt requested */ @@ -483,6 +485,30 @@ static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); } +static inline void pi_clear_sn(struct pi_desc *pi_desc) +{ + return clear_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline void pi_set_sn(struct pi_desc *pi_desc) +{ + return set_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline int pi_test_on(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline int pi_test_sn(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; From 8feb4a04dc756002f78df0026e58118669de4851 Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:47 +0800 Subject: [PATCH 037/123] KVM: Define a new interface kvm_intr_is_single_vcpu() This patch defines a new interface kvm_intr_is_single_vcpu(), which can returns whether the interrupt is for single-CPU or not. It is used by VT-d PI, since now we only support single-CPU interrupts, For lowest-priority interrupts, if user configures it via /proc/irq or uses irqbalance to make it single-CPU, we can use PI to deliver the interrupts to it. Full functionality of lowest-priority support will be added later. Signed-off-by: Feng Wu Reviewed-by: Alex Williamson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 ++ arch/x86/kvm/irq_comm.c | 27 +++++++++++++++ arch/x86/kvm/lapic.c | 59 +++++++++++++++++++++++++++++++++ arch/x86/kvm/lapic.h | 2 ++ 4 files changed, 91 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d064cb2e19e8..ba4e5673e604 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1241,4 +1241,7 @@ int x86_set_memory_region(struct kvm *kvm, bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); +bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu); + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 177460998bb0..39f833f8132e 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -297,6 +297,33 @@ out: return r; } +bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu) +{ + int i, r = 0; + struct kvm_vcpu *vcpu; + + if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu)) + return true; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!kvm_apic_present(vcpu)) + continue; + + if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand, + irq->dest_id, irq->dest_mode)) + continue; + + if (++r == 2) + return false; + + *dest_vcpu = vcpu; + } + + return r == 1; +} +EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu); + #define IOAPIC_ROUTING_ENTRY(irq) \ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 2f4c0d0cbe0a..944b38a56929 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -755,6 +755,65 @@ out: return ret; } +bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu) +{ + struct kvm_apic_map *map; + bool ret = false; + struct kvm_lapic *dst = NULL; + + if (irq->shorthand) + return false; + + rcu_read_lock(); + map = rcu_dereference(kvm->arch.apic_map); + + if (!map) + goto out; + + if (irq->dest_mode == APIC_DEST_PHYSICAL) { + if (irq->dest_id == 0xFF) + goto out; + + if (irq->dest_id >= ARRAY_SIZE(map->phys_map)) + goto out; + + dst = map->phys_map[irq->dest_id]; + if (dst && kvm_apic_present(dst->vcpu)) + *dest_vcpu = dst->vcpu; + else + goto out; + } else { + u16 cid; + unsigned long bitmap = 1; + int i, r = 0; + + if (!kvm_apic_logical_map_valid(map)) + goto out; + + apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap); + + if (cid >= ARRAY_SIZE(map->logical_map)) + goto out; + + for_each_set_bit(i, &bitmap, 16) { + dst = map->logical_map[cid][i]; + if (++r == 2) + goto out; + } + + if (dst && kvm_apic_present(dst->vcpu)) + *dest_vcpu = dst->vcpu; + else + goto out; + } + + ret = true; +out: + rcu_read_unlock(); + return ret; +} + /* * Add a pending IRQ into lapic. * Return 1 if successfully added and 0 if discarded. diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 7259d272416f..fde8e35d5850 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -168,4 +168,6 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); void wait_lapic_expire(struct kvm_vcpu *vcpu); +bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu); #endif From d84f1e0755ba1e87d20d1f90c1e6eb0cbbc0af9d Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:49 +0800 Subject: [PATCH 038/123] KVM: make kvm_set_msi_irq() public Make kvm_set_msi_irq() public, we can use this function outside. Signed-off-by: Feng Wu Reviewed-by: Paolo Bonzini Reviewed-by: Alex Williamson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 4 ++++ arch/x86/kvm/irq_comm.c | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ba4e5673e604..79fffbbe2348 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -176,6 +176,8 @@ enum { */ #define KVM_APIC_PV_EOI_PENDING 1 +struct kvm_kernel_irq_routing_entry; + /* * We don't want allocation failures within the mmu code, so we preallocate * enough memory for a single page fault in a cache. @@ -1244,4 +1246,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu); +void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm_lapic_irq *irq); #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 39f833f8132e..c89228980230 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -91,8 +91,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, return r; } -static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, - struct kvm_lapic_irq *irq) +void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm_lapic_irq *irq) { trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data); @@ -108,6 +108,7 @@ static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, irq->level = 1; irq->shorthand = 0; } +EXPORT_SYMBOL_GPL(kvm_set_msi_irq); int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) From 6d7425f109d2629d1f4b4b146eca8e43701bf966 Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:50 +0800 Subject: [PATCH 039/123] vfio: Register/unregister irq_bypass_producer This patch adds the registration/unregistration of an irq_bypass_producer for MSI/MSIx on vfio pci devices. Acked-by: Alex Williamson Signed-off-by: Feng Wu Signed-off-by: Paolo Bonzini --- drivers/vfio/Kconfig | 1 + drivers/vfio/pci/Kconfig | 1 + drivers/vfio/pci/vfio_pci_intrs.c | 9 +++++++++ drivers/vfio/pci/vfio_pci_private.h | 2 ++ 4 files changed, 13 insertions(+) diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index 454017928ed0..850d86ca685b 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -33,3 +33,4 @@ menuconfig VFIO source "drivers/vfio/pci/Kconfig" source "drivers/vfio/platform/Kconfig" +source "virt/lib/Kconfig" diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 579d83bf5358..02912f180c6d 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -2,6 +2,7 @@ config VFIO_PCI tristate "VFIO support for PCI devices" depends on VFIO && PCI && EVENTFD select VFIO_VIRQFD + select IRQ_BYPASS_MANAGER help Support for the PCI VFIO bus driver. This is required to make use of PCI drivers using the VFIO framework. diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 1f577b4ac126..3b3ba15558b7 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -319,6 +319,7 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev, if (vdev->ctx[vector].trigger) { free_irq(irq, vdev->ctx[vector].trigger); + irq_bypass_unregister_producer(&vdev->ctx[vector].producer); kfree(vdev->ctx[vector].name); eventfd_ctx_put(vdev->ctx[vector].trigger); vdev->ctx[vector].trigger = NULL; @@ -360,6 +361,14 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev, return ret; } + vdev->ctx[vector].producer.token = trigger; + vdev->ctx[vector].producer.irq = irq; + ret = irq_bypass_register_producer(&vdev->ctx[vector].producer); + if (unlikely(ret)) + dev_info(&pdev->dev, + "irq bypass producer (token %p) registration fails: %d\n", + vdev->ctx[vector].producer.token, ret); + vdev->ctx[vector].trigger = trigger; return 0; diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index ae0e1b4c1711..0e7394f8f69b 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -13,6 +13,7 @@ #include #include +#include #ifndef VFIO_PCI_PRIVATE_H #define VFIO_PCI_PRIVATE_H @@ -29,6 +30,7 @@ struct vfio_pci_irq_ctx { struct virqfd *mask; char *name; bool masked; + struct irq_bypass_producer producer; }; struct vfio_pci_device { From efc644048ecde54f016011fe10110addd0de348f Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:51 +0800 Subject: [PATCH 040/123] KVM: x86: Update IRTE for posted-interrupts This patch adds the routine to update IRTE for posted-interrupts when guest changes the interrupt configuration. Signed-off-by: Feng Wu Reviewed-by: Alex Williamson Signed-off-by: Fengguang Wu [Squashed in automatically generated patch from the build robot "KVM: x86: vcpu_to_pi_desc() can be static" - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 ++ arch/x86/kvm/trace.h | 33 +++++++++++++ arch/x86/kvm/vmx.c | 83 +++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 2 + 4 files changed, 121 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 79fffbbe2348..a4067ecd4376 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -897,6 +897,9 @@ struct kvm_x86_ops { gfn_t offset, unsigned long mask); /* pmu operations of sub-arch */ const struct kvm_pmu_ops *pmu_ops; + + int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set); }; struct kvm_arch_async_pf { diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index ce4abe333c39..120302511802 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -992,6 +992,39 @@ TRACE_EVENT(kvm_enter_smm, __entry->smbase) ); +/* + * Tracepoint for VT-d posted-interrupts. + */ +TRACE_EVENT(kvm_pi_irte_update, + TP_PROTO(unsigned int vcpu_id, unsigned int gsi, + unsigned int gvec, u64 pi_desc_addr, bool set), + TP_ARGS(vcpu_id, gsi, gvec, pi_desc_addr, set), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + __field( unsigned int, gsi ) + __field( unsigned int, gvec ) + __field( u64, pi_desc_addr ) + __field( bool, set ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->gsi = gsi; + __entry->gvec = gvec; + __entry->pi_desc_addr = pi_desc_addr; + __entry->set = set; + ), + + TP_printk("VT-d PI is %s for this irq, vcpu %u, gsi: 0x%x, " + "gvec: 0x%x, pi_desc_addr: 0x%llx", + __entry->set ? "enabled and being updated" : "disabled", + __entry->vcpu_id, + __entry->gsi, + __entry->gvec, + __entry->pi_desc_addr) +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4739a52874c2..cb26b93a2cd7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -45,6 +45,7 @@ #include #include #include +#include #include "trace.h" #include "pmu.h" @@ -603,6 +604,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_vmx, vcpu); } +static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) +{ + return &(to_vmx(vcpu)->pi_desc); +} + #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) #define FIELD(number, name) [number] = VMCS12_OFFSET(name) #define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \ @@ -10345,6 +10351,81 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); } +/* + * vmx_update_pi_irte - set IRTE for Posted-Interrupts + * + * @kvm: kvm + * @host_irq: host irq of the interrupt + * @guest_irq: gsi of the interrupt + * @set: set or unset PI + * returns 0 on success, < 0 on failure + */ +static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set) +{ + struct kvm_kernel_irq_routing_entry *e; + struct kvm_irq_routing_table *irq_rt; + struct kvm_lapic_irq irq; + struct kvm_vcpu *vcpu; + struct vcpu_data vcpu_info; + int idx, ret = -EINVAL; + + if (!kvm_arch_has_assigned_device(kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return 0; + + idx = srcu_read_lock(&kvm->irq_srcu); + irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); + BUG_ON(guest_irq >= irq_rt->nr_rt_entries); + + hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { + if (e->type != KVM_IRQ_ROUTING_MSI) + continue; + /* + * VT-d PI cannot support posting multicast/broadcast + * interrupts to a vCPU, we still use interrupt remapping + * for these kind of interrupts. + * + * For lowest-priority interrupts, we only support + * those with single CPU as the destination, e.g. user + * configures the interrupts via /proc/irq or uses + * irqbalance to make the interrupts single-CPU. + * + * We will support full lowest-priority interrupt later. + */ + + kvm_set_msi_irq(e, &irq); + if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) + continue; + + vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); + vcpu_info.vector = irq.vector; + + trace_kvm_pi_irte_update(vcpu->vcpu_id, e->gsi, + vcpu_info.vector, vcpu_info.pi_desc_addr, set); + + if (set) + ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); + else { + /* suppress notification event before unposting */ + pi_set_sn(vcpu_to_pi_desc(vcpu)); + ret = irq_set_vcpu_affinity(host_irq, NULL); + pi_clear_sn(vcpu_to_pi_desc(vcpu)); + } + + if (ret < 0) { + printk(KERN_INFO "%s: failed to update PI IRTE\n", + __func__); + goto out; + } + } + + ret = 0; +out: + srcu_read_unlock(&kvm->irq_srcu, idx); + return ret; +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -10462,6 +10543,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, .pmu_ops = &intel_pmu_ops, + + .update_pi_irte = vmx_update_pi_irte, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 185fc1619c99..3978ace4ddbc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -64,6 +64,7 @@ #include /* Ugh! */ #include #include +#include #define MAX_IO_MSRS 256 #define KVM_MAX_MCE_BANKS 32 @@ -8094,3 +8095,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update); From 87276880065246ce49ec571130d3d1e4a22e5604 Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:40 +0800 Subject: [PATCH 041/123] KVM: x86: select IRQ_BYPASS_MANAGER Select IRQ_BYPASS_MANAGER for x86 when CONFIG_KVM is set Signed-off-by: Feng Wu Reviewed-by: Alex Williamson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/Kconfig | 2 ++ arch/x86/kvm/x86.c | 53 +++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a4067ecd4376..15664994b6f3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -24,6 +24,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index d8a1d56276e1..639a6e34500c 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -28,6 +28,8 @@ config KVM select ANON_INODES select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD + select IRQ_BYPASS_MANAGER + select HAVE_KVM_IRQ_BYPASS select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_EVENTFD select KVM_APIC_ARCHITECTURE diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3978ace4ddbc..b8425a769c0a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -51,6 +51,8 @@ #include #include #include +#include +#include #include #define CREATE_TRACE_POINTS @@ -8079,6 +8081,57 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); +int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + + if (kvm_x86_ops->update_pi_irte) { + irqfd->producer = prod; + return kvm_x86_ops->update_pi_irte(irqfd->kvm, + prod->irq, irqfd->gsi, 1); + } + + return -EINVAL; +} + +void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + int ret; + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + + if (!kvm_x86_ops->update_pi_irte) { + WARN_ON(irqfd->producer != NULL); + return; + } + + WARN_ON(irqfd->producer != prod); + irqfd->producer = NULL; + + /* + * When producer of consumer is unregistered, we change back to + * remapped mode, so we can re-use the current implementation + * when the irq is masked/disabed or the consumer side (KVM + * int this case doesn't want to receive the interrupts. + */ + ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0); + if (ret) + printk(KERN_INFO "irq bypass consumer (token %p) unregistration" + " fails: %d\n", irqfd->consumer.token, ret); +} + +int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set) +{ + if (!kvm_x86_ops->update_pi_irte) + return -EINVAL; + + return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set); +} + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); From 28b835d60fcc2498e717cf5e6f0c3691c24546f7 Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:54 +0800 Subject: [PATCH 042/123] KVM: Update Posted-Interrupts Descriptor when vCPU is preempted This patch updates the Posted-Interrupts Descriptor when vCPU is preempted. sched out: - Set 'SN' to suppress furture non-urgent interrupts posted for the vCPU. sched in: - Clear 'SN' - Change NDST if vCPU is scheduled to a different CPU - Set 'NV' to POSTED_INTR_VECTOR Signed-off-by: Feng Wu [Include asm/cpu.h to fix !CONFIG_SMP compilation. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index cb26b93a2cd7..99f5c61954ea 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -35,6 +35,7 @@ #include "kvm_cache_regs.h" #include "x86.h" +#include #include #include #include @@ -1942,6 +1943,52 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) preempt_enable(); } +static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; + unsigned int dest; + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return; + + do { + old.control = new.control = pi_desc->control; + + /* + * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there + * are two possible cases: + * 1. After running 'pre_block', context switch + * happened. For this case, 'sn' was set in + * vmx_vcpu_put(), so we need to clear it here. + * 2. After running 'pre_block', we were blocked, + * and woken up by some other guy. For this case, + * we don't need to do anything, 'pi_post_block' + * will do everything for us. However, we cannot + * check whether it is case #1 or case #2 here + * (maybe, not needed), so we also clear sn here, + * I think it is not a big deal. + */ + if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) { + if (vcpu->cpu != cpu) { + dest = cpu_physical_id(cpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + } + + /* set 'NV' to 'notification vector' */ + new.nv = POSTED_INTR_VECTOR; + } + + /* Allow posting non-urgent interrupts */ + new.sn = 0; + } while (cmpxchg(&pi_desc->control, old.control, + new.control) != old.control); +} /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. @@ -1992,10 +2039,27 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ vmx->loaded_vmcs->cpu = cpu; } + + vmx_vcpu_pi_load(vcpu, cpu); +} + +static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return; + + /* Set SN when the vCPU is preempted */ + if (vcpu->preempted) + pi_set_sn(pi_desc); } static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { + vmx_vcpu_pi_put(vcpu); + __vmx_load_host_state(to_vmx(vcpu)); if (!vmm_exclusive) { __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs); @@ -4427,6 +4491,22 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) { #ifdef CONFIG_SMP if (vcpu->mode == IN_GUEST_MODE) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * Currently, we don't support urgent interrupt, + * all interrupts are recognized as non-urgent + * interrupt, so we cannot post interrupts when + * 'SN' is set. + * + * If the vcpu is in guest mode, it means it is + * running instead of being scheduled out and + * waiting in the run queue, and that's the only + * case when 'SN' is set currently, warning if + * 'SN' is set. + */ + WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc)); + apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), POSTED_INTR_VECTOR); return true; From bf9f6ac8d74969690df1485b33b7c238ca9f2269 Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:55 +0800 Subject: [PATCH 043/123] KVM: Update Posted-Interrupts Descriptor when vCPU is blocked This patch updates the Posted-Interrupts Descriptor when vCPU is blocked. pre-block: - Add the vCPU to the blocked per-CPU list - Set 'NV' to POSTED_INTR_WAKEUP_VECTOR post-block: - Remove the vCPU from the per-CPU list Signed-off-by: Feng Wu [Concentrate invocation of pre/post-block hooks to vcpu_block. - Paolo] Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/locking.txt | 12 ++ arch/x86/include/asm/kvm_host.h | 11 ++ arch/x86/kvm/vmx.c | 153 ++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 34 ++++-- include/linux/kvm_host.h | 3 + virt/kvm/kvm_main.c | 3 + 6 files changed, 206 insertions(+), 10 deletions(-) diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt index d68af4dc3006..19f94a6b9bb0 100644 --- a/Documentation/virtual/kvm/locking.txt +++ b/Documentation/virtual/kvm/locking.txt @@ -166,3 +166,15 @@ Comment: The srcu read lock must be held while accessing memslots (e.g. MMIO/PIO address->device structure mapping (kvm->buses). The srcu index can be stored in kvm_vcpu->srcu_idx per vcpu if it is needed by multiple functions. + +Name: blocked_vcpu_on_cpu_lock +Type: spinlock_t +Arch: x86 +Protects: blocked_vcpu_on_cpu +Comment: This is a per-CPU lock and it is used for VT-d posted-interrupts. + When VT-d posted-interrupts is supported and the VM has assigned + devices, we put the blocked vCPU on the list blocked_vcpu_on_cpu + protected by blocked_vcpu_on_cpu_lock, when VT-d hardware issues + wakeup notification event since external interrupts from the + assigned devices happens, we will find the vCPU on the list to + wakeup. diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 15664994b6f3..cdbdb559ecd2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -899,6 +899,17 @@ struct kvm_x86_ops { /* pmu operations of sub-arch */ const struct kvm_pmu_ops *pmu_ops; + /* + * Architecture specific hooks for vCPU blocking due to + * HLT instruction. + * Returns for .pre_block(): + * - 0 means continue to block the vCPU. + * - 1 means we cannot block the vCPU since some event + * happens during this period, such as, 'ON' bit in + * posted-interrupts descriptor is set. + */ + int (*pre_block)(struct kvm_vcpu *vcpu); + void (*post_block)(struct kvm_vcpu *vcpu); int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); }; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 99f5c61954ea..c5c22831aee2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -878,6 +878,13 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs); static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); static DEFINE_PER_CPU(struct desc_ptr, host_gdt); +/* + * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we + * can find which vCPU should be waken up. + */ +static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); +static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); + static unsigned long *vmx_io_bitmap_a; static unsigned long *vmx_io_bitmap_b; static unsigned long *vmx_msr_bitmap_legacy; @@ -2986,6 +2993,8 @@ static int hardware_enable(void) return -EBUSY; INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); + INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); + spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); /* * Now we can enable the vmclear operation in kdump @@ -6045,6 +6054,25 @@ static void update_ple_window_actual_max(void) ple_window_grow, INT_MIN); } +/* + * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. + */ +static void wakeup_handler(void) +{ + struct kvm_vcpu *vcpu; + int cpu = smp_processor_id(); + + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), + blocked_vcpu_list) { + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (pi_test_on(pi_desc) == 1) + kvm_vcpu_kick(vcpu); + } + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); +} + static __init int hardware_setup(void) { int r = -ENOMEM, i, msr; @@ -6231,6 +6259,8 @@ static __init int hardware_setup(void) kvm_x86_ops->enable_log_dirty_pt_masked = NULL; } + kvm_set_posted_intr_wakeup_handler(wakeup_handler); + return alloc_kvm_area(); out8: @@ -10431,6 +10461,126 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); } +/* + * This routine does the following things for vCPU which is going + * to be blocked if VT-d PI is enabled. + * - Store the vCPU to the wakeup list, so when interrupts happen + * we can find the right vCPU to wake up. + * - Change the Posted-interrupt descriptor as below: + * 'NDST' <-- vcpu->pre_pcpu + * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR + * - If 'ON' is set during this process, which means at least one + * interrupt is posted for this vCPU, we cannot block it, in + * this case, return 1, otherwise, return 0. + * + */ +static int vmx_pre_block(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + unsigned int dest; + struct pi_desc old, new; + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return 0; + + vcpu->pre_pcpu = vcpu->cpu; + spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + list_add_tail(&vcpu->blocked_vcpu_list, + &per_cpu(blocked_vcpu_on_cpu, + vcpu->pre_pcpu)); + spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + + do { + old.control = new.control = pi_desc->control; + + /* + * We should not block the vCPU if + * an interrupt is posted for it. + */ + if (pi_test_on(pi_desc) == 1) { + spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + list_del(&vcpu->blocked_vcpu_list); + spin_unlock_irqrestore( + &per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + vcpu->pre_pcpu = -1; + + return 1; + } + + WARN((pi_desc->sn == 1), + "Warning: SN field of posted-interrupts " + "is set before blocking\n"); + + /* + * Since vCPU can be preempted during this process, + * vcpu->cpu could be different with pre_pcpu, we + * need to set pre_pcpu as the destination of wakeup + * notification event, then we can find the right vCPU + * to wakeup in wakeup handler if interrupts happen + * when the vCPU is in blocked state. + */ + dest = cpu_physical_id(vcpu->pre_pcpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + /* set 'NV' to 'wakeup vector' */ + new.nv = POSTED_INTR_WAKEUP_VECTOR; + } while (cmpxchg(&pi_desc->control, old.control, + new.control) != old.control); + + return 0; +} + +static void vmx_post_block(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; + unsigned int dest; + unsigned long flags; + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return; + + do { + old.control = new.control = pi_desc->control; + + dest = cpu_physical_id(vcpu->cpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + /* Allow posting non-urgent interrupts */ + new.sn = 0; + + /* set 'NV' to 'notification vector' */ + new.nv = POSTED_INTR_VECTOR; + } while (cmpxchg(&pi_desc->control, old.control, + new.control) != old.control); + + if(vcpu->pre_pcpu != -1) { + spin_lock_irqsave( + &per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + list_del(&vcpu->blocked_vcpu_list); + spin_unlock_irqrestore( + &per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + vcpu->pre_pcpu = -1; + } +} + /* * vmx_update_pi_irte - set IRTE for Posted-Interrupts * @@ -10622,6 +10772,9 @@ static struct kvm_x86_ops vmx_x86_ops = { .flush_log_dirty = vmx_flush_log_dirty, .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, + .pre_block = vmx_pre_block, + .post_block = vmx_post_block, + .pmu_ops = &intel_pmu_ops, .update_pi_irte = vmx_update_pi_irte, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b8425a769c0a..2d2c9bb0d6d6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6335,6 +6335,20 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } } + /* + * KVM_REQ_EVENT is not set when posted interrupts are set by + * VT-d hardware, so we have to update RVI unconditionally. + */ + if (kvm_lapic_enabled(vcpu)) { + /* + * Update architecture specific hints for APIC + * virtual interrupt delivery. + */ + if (kvm_x86_ops->hwapic_irr_update) + kvm_x86_ops->hwapic_irr_update(vcpu, + kvm_lapic_find_highest_irr(vcpu)); + } + if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { kvm_apic_accept_events(vcpu); if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { @@ -6351,13 +6365,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_ops->enable_irq_window(vcpu); if (kvm_lapic_enabled(vcpu)) { - /* - * Update architecture specific hints for APIC - * virtual interrupt delivery. - */ - if (kvm_x86_ops->hwapic_irr_update) - kvm_x86_ops->hwapic_irr_update(vcpu, - kvm_lapic_find_highest_irr(vcpu)); update_cr8_intercept(vcpu); kvm_lapic_sync_to_vapic(vcpu); } @@ -6493,10 +6500,15 @@ out: static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) { - if (!kvm_arch_vcpu_runnable(vcpu)) { + if (!kvm_arch_vcpu_runnable(vcpu) && + (!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) { srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + + if (kvm_x86_ops->post_block) + kvm_x86_ops->post_block(vcpu); + if (!kvm_check_request(KVM_REQ_UNHALT, vcpu)) return 1; } @@ -6528,10 +6540,12 @@ static int vcpu_run(struct kvm_vcpu *vcpu) for (;;) { if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && - !vcpu->arch.apf.halted) + !vcpu->arch.apf.halted) { r = vcpu_enter_guest(vcpu); - else + } else { r = vcpu_block(kvm, vcpu); + } + if (r <= 0) break; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5c3f4538807f..9596a2f0977b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -234,6 +234,9 @@ struct kvm_vcpu { unsigned long requests; unsigned long guest_debug; + int pre_pcpu; + struct list_head blocked_vcpu_list; + struct mutex mutex; struct kvm_run *run; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index afd7ae6aec65..a75502c93c3e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -230,6 +230,9 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) init_waitqueue_head(&vcpu->wq); kvm_async_pf_vcpu_init(vcpu); + vcpu->pre_pcpu = -1; + INIT_LIST_HEAD(&vcpu->blocked_vcpu_list); + page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) { r = -ENOMEM; From b7d2063177a584cb1afb06fc0ed6c48b576f3e75 Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:56 +0800 Subject: [PATCH 044/123] iommu/vt-d: Add a command line parameter for VT-d posted-interrupts Enable VT-d Posted-Interrtups and add a command line parameter for it. Signed-off-by: Feng Wu Reviewed-by: Paolo Bonzini Reviewed-by: Alex Williamson Acked-by: Joerg Roedel Signed-off-by: Paolo Bonzini --- Documentation/kernel-parameters.txt | 1 + drivers/iommu/irq_remapping.c | 12 ++++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 22a4b687ea5b..73588fcaff8c 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1553,6 +1553,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. nosid disable Source ID checking no_x2apic_optout BIOS x2APIC opt-out request will be ignored + nopost disable Interrupt Posting iomem= Disable strict checking of access to MMIO memory strict regions from userspace. diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c index 913455a5fd40..8adaaeae3268 100644 --- a/drivers/iommu/irq_remapping.c +++ b/drivers/iommu/irq_remapping.c @@ -22,7 +22,7 @@ int irq_remap_broken; int disable_sourceid_checking; int no_x2apic_optout; -int disable_irq_post = 1; +int disable_irq_post = 0; static int disable_irq_remap; static struct irq_remap_ops *remap_ops; @@ -58,14 +58,18 @@ static __init int setup_irqremap(char *str) return -EINVAL; while (*str) { - if (!strncmp(str, "on", 2)) + if (!strncmp(str, "on", 2)) { disable_irq_remap = 0; - else if (!strncmp(str, "off", 3)) + disable_irq_post = 0; + } else if (!strncmp(str, "off", 3)) { disable_irq_remap = 1; - else if (!strncmp(str, "nosid", 5)) + disable_irq_post = 1; + } else if (!strncmp(str, "nosid", 5)) disable_sourceid_checking = 1; else if (!strncmp(str, "no_x2apic_optout", 16)) no_x2apic_optout = 1; + else if (!strncmp(str, "nopost", 6)) + disable_irq_post = 1; str += strcspn(str, ","); while (*str == ',') From f59922b47e0a202386c8e8dcf9f0235b8a028ae0 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 16 Sep 2015 12:14:52 +0200 Subject: [PATCH 045/123] KVM: s390: remove unused variable in __inject_vm the float int structure is no longer used in __inject_vm. Acked-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 5c2c169395c3..ab9f525aa7cd 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -1390,12 +1390,9 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type) static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) { - struct kvm_s390_float_interrupt *fi; u64 type = READ_ONCE(inti->type); int rc; - fi = &kvm->arch.float_int; - switch (type) { case KVM_S390_MCHK: rc = __inject_float_mchk(kvm, inti); From fee0e0fdb2b9c221a3621bede722aa9f9c9f0d39 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 28 Sep 2015 13:32:38 +0200 Subject: [PATCH 046/123] KVM: s390: disabled wait cares about machine checks, not PER We don't care about program event recording irqs (synchronous program irqs) but asynchronous irqs when checking for disabled wait. Machine checks were missing. Let's directly switch to the functions we have for that purpose instead of testing once again for magic bits. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index ab9f525aa7cd..1058240b3db3 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -51,11 +51,9 @@ static int psw_mchk_disabled(struct kvm_vcpu *vcpu) static int psw_interrupts_disabled(struct kvm_vcpu *vcpu) { - if ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER) || - (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_IO) || - (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_EXT)) - return 0; - return 1; + return psw_extint_disabled(vcpu) && + psw_ioint_disabled(vcpu) && + psw_mchk_disabled(vcpu); } static int ckc_interrupts_enabled(struct kvm_vcpu *vcpu) From 5f94c58ed0a6db016528d8555f1b655ad354f7bb Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 28 Sep 2015 14:27:51 +0200 Subject: [PATCH 047/123] KVM: s390: set interception requests for all floating irqs No need to separate pending and floating irqs when setting interception requests. Let's do it for all equally. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 1058240b3db3..4f05520efbae 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -107,14 +107,10 @@ static inline u8 int_word_to_isc(u32 int_word) return (int_word & 0x38000000) >> 27; } -static inline unsigned long pending_floating_irqs(struct kvm_vcpu *vcpu) +static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu) { - return vcpu->kvm->arch.float_int.pending_irqs; -} - -static inline unsigned long pending_local_irqs(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.local_int.pending_irqs; + return vcpu->kvm->arch.float_int.pending_irqs | + vcpu->arch.local_int.pending_irqs; } static unsigned long disable_iscs(struct kvm_vcpu *vcpu, @@ -133,8 +129,7 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu) { unsigned long active_mask; - active_mask = pending_local_irqs(vcpu); - active_mask |= pending_floating_irqs(vcpu); + active_mask = pending_irqs(vcpu); if (!active_mask) return 0; @@ -202,7 +197,7 @@ static void __set_cpuflag(struct kvm_vcpu *vcpu, u32 flag) static void set_intercept_indicators_io(struct kvm_vcpu *vcpu) { - if (!(pending_floating_irqs(vcpu) & IRQ_PEND_IO_MASK)) + if (!(pending_irqs(vcpu) & IRQ_PEND_IO_MASK)) return; else if (psw_ioint_disabled(vcpu)) __set_cpuflag(vcpu, CPUSTAT_IO_INT); @@ -212,7 +207,7 @@ static void set_intercept_indicators_io(struct kvm_vcpu *vcpu) static void set_intercept_indicators_ext(struct kvm_vcpu *vcpu) { - if (!(pending_local_irqs(vcpu) & IRQ_PEND_EXT_MASK)) + if (!(pending_irqs(vcpu) & IRQ_PEND_EXT_MASK)) return; if (psw_extint_disabled(vcpu)) __set_cpuflag(vcpu, CPUSTAT_EXT_INT); @@ -222,7 +217,7 @@ static void set_intercept_indicators_ext(struct kvm_vcpu *vcpu) static void set_intercept_indicators_mchk(struct kvm_vcpu *vcpu) { - if (!(pending_local_irqs(vcpu) & IRQ_PEND_MCHK_MASK)) + if (!(pending_irqs(vcpu) & IRQ_PEND_MCHK_MASK)) return; if (psw_mchk_disabled(vcpu)) vcpu->arch.sie_block->ictl |= ICTL_LPSW; From 118b862b153190f92415ece4cb97a896929c5ab8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 23 Sep 2015 12:25:15 +0200 Subject: [PATCH 048/123] KVM: s390: kvm_arch_vcpu_runnable already cares about timer interrupts We can remove that double check. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 4f05520efbae..1260f8c18df9 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -839,7 +839,7 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu) vcpu->stat.exit_wait_state++; /* fast path */ - if (kvm_cpu_has_pending_timer(vcpu) || kvm_arch_vcpu_runnable(vcpu)) + if (kvm_arch_vcpu_runnable(vcpu)) return 0; if (psw_interrupts_disabled(vcpu)) { From 4d32ad6becf0baf09f38707f0aff42c0f4367a99 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 6 May 2015 13:51:29 +0200 Subject: [PATCH 049/123] KVM: s390: drop out early in kvm_s390_has_irq() Let's get rid of the local variable and exit directly if we found any pending interrupt. This is not only faster, but also better readable. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 1260f8c18df9..10a0e8beb9e1 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -808,23 +808,21 @@ int kvm_s390_ext_call_pending(struct kvm_vcpu *vcpu) int kvm_s390_vcpu_has_irq(struct kvm_vcpu *vcpu, int exclude_stop) { - int rc; + if (deliverable_irqs(vcpu)) + return 1; - rc = !!deliverable_irqs(vcpu); - - if (!rc && kvm_cpu_has_pending_timer(vcpu)) - rc = 1; + if (kvm_cpu_has_pending_timer(vcpu)) + return 1; /* external call pending and deliverable */ - if (!rc && kvm_s390_ext_call_pending(vcpu) && + if (kvm_s390_ext_call_pending(vcpu) && !psw_extint_disabled(vcpu) && (vcpu->arch.sie_block->gcr[0] & 0x2000ul)) - rc = 1; + return 1; - if (!rc && !exclude_stop && kvm_s390_is_stop_irq_pending(vcpu)) - rc = 1; - - return rc; + if (!exclude_stop && kvm_s390_is_stop_irq_pending(vcpu)) + return 1; + return 0; } int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) From 66933b78e3204057bfc26343afcd0d463c0e8e55 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 20 Nov 2014 13:49:32 +0100 Subject: [PATCH 050/123] KVM: s390: simplify in-kernel program irq injection The main reason to keep program injection in kernel separated until now was that we were able to do some checking, if really only the owning thread injects program interrupts (via waitqueue_active(li->wq)). This BUG_ON was never triggered and the chances of really hitting it, if another thread injected a program irq to another vcpu, were very small. Let's drop this check and turn kvm_s390_inject_program_int() and kvm_s390_inject_prog_irq() into simple inline functions that makes use of kvm_s390_inject_vcpu(). __must_check can be dropped as they are implicitely given by kvm_s390_inject_vcpu(), to avoid ugly long function prototypes. Reviewed-by: Jens Freimann Acked-by: Cornelia Huck Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 28 ---------------------------- arch/s390/kvm/kvm-s390.h | 24 ++++++++++++++++++++---- 2 files changed, 20 insertions(+), 32 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 10a0e8beb9e1..f603bacf6ac9 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -977,34 +977,6 @@ static int __inject_prog(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) return 0; } -int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code) -{ - struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - struct kvm_s390_irq irq; - - spin_lock(&li->lock); - irq.u.pgm.code = code; - __inject_prog(vcpu, &irq); - BUG_ON(waitqueue_active(li->wq)); - spin_unlock(&li->lock); - return 0; -} - -int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu, - struct kvm_s390_pgm_info *pgm_info) -{ - struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - struct kvm_s390_irq irq; - int rc; - - spin_lock(&li->lock); - irq.u.pgm = *pgm_info; - rc = __inject_prog(vcpu, &irq); - BUG_ON(waitqueue_active(li->wq)); - spin_unlock(&li->lock); - return rc; -} - static int __inject_pfault_init(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index c446aabf60d3..3a368d2a6114 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -175,6 +175,7 @@ static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm) return kvm->arch.user_cpu_state_ctrl != 0; } +/* implemented in interrupt.c */ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu); enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer); @@ -185,7 +186,25 @@ int __must_check kvm_s390_inject_vm(struct kvm *kvm, struct kvm_s390_interrupt *s390int); int __must_check kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq); -int __must_check kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code); +static inline int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu, + struct kvm_s390_pgm_info *pgm_info) +{ + struct kvm_s390_irq irq = { + .type = KVM_S390_PROGRAM_INT, + .u.pgm = *pgm_info, + }; + + return kvm_s390_inject_vcpu(vcpu, &irq); +} +static inline int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code) +{ + struct kvm_s390_irq irq = { + .type = KVM_S390_PROGRAM_INT, + .u.pgm.code = code, + }; + + return kvm_s390_inject_vcpu(vcpu, &irq); +} struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, u64 isc_mask, u32 schid); int kvm_s390_reinject_io_int(struct kvm *kvm, @@ -231,9 +250,6 @@ extern unsigned long kvm_s390_fac_list_mask[]; /* implemented in diag.c */ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu); -/* implemented in interrupt.c */ -int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu, - struct kvm_s390_pgm_info *pgm_info); static inline void kvm_s390_vcpu_block_all(struct kvm *kvm) { From 238293b14d9b1f5689e2aa68710000b0f25aa612 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 4 May 2015 12:38:48 +0200 Subject: [PATCH 051/123] KVM: s390: correctly handle injection of pgm irqs and per events PER events can always co-exist with other program interrupts. For now, we always overwrite all program interrupt parameters when injecting any type of program interrupt. Let's handle that correctly by only overwriting the relevant portion of the program interrupt parameters. Therefore we can now inject PER events and ordinary program interrupts concurrently, resulting in no loss of program interrupts. This will especially by helpful when manually detecting PER events later - as both types might be triggered during one SIE exit. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index f603bacf6ac9..a8be542b9cb0 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -972,7 +972,26 @@ static int __inject_prog(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_PROGRAM_INT, irq->u.pgm.code, 0); - li->irq.pgm = irq->u.pgm; + if (irq->u.pgm.code == PGM_PER) { + li->irq.pgm.code |= PGM_PER; + /* only modify PER related information */ + li->irq.pgm.per_address = irq->u.pgm.per_address; + li->irq.pgm.per_code = irq->u.pgm.per_code; + li->irq.pgm.per_atmid = irq->u.pgm.per_atmid; + li->irq.pgm.per_access_id = irq->u.pgm.per_access_id; + } else if (!(irq->u.pgm.code & PGM_PER)) { + li->irq.pgm.code = (li->irq.pgm.code & PGM_PER) | + irq->u.pgm.code; + /* only modify non-PER information */ + li->irq.pgm.trans_exc_code = irq->u.pgm.trans_exc_code; + li->irq.pgm.mon_code = irq->u.pgm.mon_code; + li->irq.pgm.data_exc_code = irq->u.pgm.data_exc_code; + li->irq.pgm.mon_class_nr = irq->u.pgm.mon_class_nr; + li->irq.pgm.exc_access_id = irq->u.pgm.exc_access_id; + li->irq.pgm.op_access_id = irq->u.pgm.op_access_id; + } else { + li->irq.pgm = irq->u.pgm; + } set_bit(IRQ_PEND_PROG, &li->pending_irqs); return 0; } From 5a3d883a59b3fe8dc8775c7a79200a5b11a6761e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 29 Sep 2015 16:27:24 +0200 Subject: [PATCH 052/123] KVM: s390: switch to get_tod_clock() and fix STP sync races Nobody except early.c makes use of store_tod_clock() to handle the cc. So if we would get a cc != 0, we would be in more trouble. Let's replace all users with get_tod_clock(). Returning a cc on an ioctl sounded strange either way. We can now also easily move the get_tod_clock() call into the preempt_disable() section. This is in fact necessary to make the STP sync work as expected. Otherwise the host TOD could change and we would end up with a wrong epoch calculation. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 18 ++++-------------- arch/s390/kvm/priv.c | 8 ++------ 2 files changed, 6 insertions(+), 20 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 0a67c40eece9..a0907795f31d 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -523,19 +523,14 @@ static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr) { struct kvm_vcpu *cur_vcpu; unsigned int vcpu_idx; - u64 host_tod, gtod; - int r; + u64 gtod; if (copy_from_user(>od, (void __user *)attr->addr, sizeof(gtod))) return -EFAULT; - r = store_tod_clock(&host_tod); - if (r) - return r; - mutex_lock(&kvm->lock); preempt_disable(); - kvm->arch.epoch = gtod - host_tod; + kvm->arch.epoch = gtod - get_tod_clock(); kvm_s390_vcpu_block_all(kvm); kvm_for_each_vcpu(vcpu_idx, cur_vcpu, kvm) cur_vcpu->arch.sie_block->epoch = kvm->arch.epoch; @@ -581,15 +576,10 @@ static int kvm_s390_get_tod_high(struct kvm *kvm, struct kvm_device_attr *attr) static int kvm_s390_get_tod_low(struct kvm *kvm, struct kvm_device_attr *attr) { - u64 host_tod, gtod; - int r; - - r = store_tod_clock(&host_tod); - if (r) - return r; + u64 gtod; preempt_disable(); - gtod = host_tod + kvm->arch.epoch; + gtod = get_tod_clock() + kvm->arch.epoch; preempt_enable(); if (copy_to_user((void __user *)attr->addr, >od, sizeof(gtod))) return -EFAULT; diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 4d21dc4d1a84..b253de5b8945 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -34,7 +34,7 @@ static int handle_set_clock(struct kvm_vcpu *vcpu) { struct kvm_vcpu *cpup; - s64 hostclk, val; + s64 val; int i, rc; ar_t ar; u64 op2; @@ -49,15 +49,11 @@ static int handle_set_clock(struct kvm_vcpu *vcpu) if (rc) return kvm_s390_inject_prog_cond(vcpu, rc); - if (store_tod_clock(&hostclk)) { - kvm_s390_set_psw_cc(vcpu, 3); - return 0; - } VCPU_EVENT(vcpu, 3, "SCK: setting guest TOD to 0x%llx", val); - val = (val - hostclk) & ~0x3fUL; mutex_lock(&vcpu->kvm->lock); preempt_disable(); + val = (val - get_tod_clock()) & ~0x3fUL; kvm_for_each_vcpu(i, cpup, vcpu->kvm) cpup->arch.sie_block->epoch = val; preempt_enable(); From 25ed16759660cdfccd4a3cb7d30cce8a797b542a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 12 May 2015 09:49:14 +0200 Subject: [PATCH 053/123] KVM: s390: factor out and fix setting of guest TOD clock Let's move that whole logic into one function. We now always use unsigned values when calculating the epoch (to avoid over/underflow defined). Also, we always have to get all VCPUs out of SIE before doing the update to avoid running differing VCPUs with different TODs. Acked-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 28 +++++++++++++++++----------- arch/s390/kvm/kvm-s390.h | 1 + arch/s390/kvm/priv.c | 15 +++------------ 3 files changed, 21 insertions(+), 23 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index a0907795f31d..87bd602f326c 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -521,22 +521,12 @@ static int kvm_s390_set_tod_high(struct kvm *kvm, struct kvm_device_attr *attr) static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr) { - struct kvm_vcpu *cur_vcpu; - unsigned int vcpu_idx; u64 gtod; if (copy_from_user(>od, (void __user *)attr->addr, sizeof(gtod))) return -EFAULT; - mutex_lock(&kvm->lock); - preempt_disable(); - kvm->arch.epoch = gtod - get_tod_clock(); - kvm_s390_vcpu_block_all(kvm); - kvm_for_each_vcpu(vcpu_idx, cur_vcpu, kvm) - cur_vcpu->arch.sie_block->epoch = kvm->arch.epoch; - kvm_s390_vcpu_unblock_all(kvm); - preempt_enable(); - mutex_unlock(&kvm->lock); + kvm_s390_set_tod_clock(kvm, gtod); VM_EVENT(kvm, 3, "SET: TOD base: 0x%llx\n", gtod); return 0; } @@ -1906,6 +1896,22 @@ retry: return 0; } +void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod) +{ + struct kvm_vcpu *vcpu; + int i; + + mutex_lock(&kvm->lock); + preempt_disable(); + kvm->arch.epoch = tod - get_tod_clock(); + kvm_s390_vcpu_block_all(kvm); + kvm_for_each_vcpu(i, vcpu, kvm) + vcpu->arch.sie_block->epoch = kvm->arch.epoch; + kvm_s390_vcpu_unblock_all(kvm); + preempt_enable(); + mutex_unlock(&kvm->lock); +} + /** * kvm_arch_fault_in_page - fault-in guest page if necessary * @vcpu: The corresponding virtual cpu diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 3a368d2a6114..cc15ea3a150e 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -231,6 +231,7 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu); /* implemented in kvm-s390.c */ +void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod); long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable); int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr); int kvm_s390_store_adtl_status_unloaded(struct kvm_vcpu *vcpu, diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index b253de5b8945..77191b85ea7a 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -33,11 +33,9 @@ /* Handle SCK (SET CLOCK) interception */ static int handle_set_clock(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *cpup; - s64 val; - int i, rc; + int rc; ar_t ar; - u64 op2; + u64 op2, val; if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); @@ -50,14 +48,7 @@ static int handle_set_clock(struct kvm_vcpu *vcpu) return kvm_s390_inject_prog_cond(vcpu, rc); VCPU_EVENT(vcpu, 3, "SCK: setting guest TOD to 0x%llx", val); - - mutex_lock(&vcpu->kvm->lock); - preempt_disable(); - val = (val - get_tod_clock()) & ~0x3fUL; - kvm_for_each_vcpu(i, cpup, vcpu->kvm) - cpup->arch.sie_block->epoch = val; - preempt_enable(); - mutex_unlock(&vcpu->kvm->lock); + kvm_s390_set_tod_clock(vcpu->kvm, val); kvm_s390_set_psw_cc(vcpu, 0); return 0; From 60417fcc2b0235dfe3dcd589c56dbe3ea1a64c54 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 29 Sep 2015 16:20:36 +0200 Subject: [PATCH 054/123] KVM: s390: factor out reading of the guest TOD clock Let's factor this out and always use get_tod_clock_fast() when reading the guest TOD. STORE CLOCK FAST does not do serialization and, therefore, might result in some fuzziness between different processors in a way that subsequent calls on different CPUs might have time stamps that are earlier. This semantics is fine though for all KVM use cases. To make it obvious that the new function has STORE CLOCK FAST semantics we name it kvm_s390_get_tod_clock_fast. With this patch, we only have a handful of places were we have to care about STP sync (using preempt_disable() logic). Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 15 +++------------ arch/s390/kvm/kvm-s390.c | 4 +--- arch/s390/kvm/kvm-s390.h | 10 ++++++++++ 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index a8be542b9cb0..373e32346d68 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -69,13 +69,8 @@ static int ckc_interrupts_enabled(struct kvm_vcpu *vcpu) static int ckc_irq_pending(struct kvm_vcpu *vcpu) { - preempt_disable(); - if (!(vcpu->arch.sie_block->ckc < - get_tod_clock_fast() + vcpu->arch.sie_block->epoch)) { - preempt_enable(); + if (vcpu->arch.sie_block->ckc >= kvm_s390_get_tod_clock_fast(vcpu->kvm)) return 0; - } - preempt_enable(); return ckc_interrupts_enabled(vcpu); } @@ -851,9 +846,7 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu) goto no_timer; } - preempt_disable(); - now = get_tod_clock_fast() + vcpu->arch.sie_block->epoch; - preempt_enable(); + now = kvm_s390_get_tod_clock_fast(vcpu->kvm); sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now); /* underflow */ @@ -892,9 +885,7 @@ enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer) u64 now, sltime; vcpu = container_of(timer, struct kvm_vcpu, arch.ckc_timer); - preempt_disable(); - now = get_tod_clock_fast() + vcpu->arch.sie_block->epoch; - preempt_enable(); + now = kvm_s390_get_tod_clock_fast(vcpu->kvm); sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now); /* diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 87bd602f326c..618c85411a51 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -568,9 +568,7 @@ static int kvm_s390_get_tod_low(struct kvm *kvm, struct kvm_device_attr *attr) { u64 gtod; - preempt_disable(); - gtod = get_tod_clock() + kvm->arch.epoch; - preempt_enable(); + gtod = kvm_s390_get_tod_clock_fast(kvm); if (copy_to_user((void __user *)attr->addr, >od, sizeof(gtod))) return -EFAULT; VM_EVENT(kvm, 3, "QUERY: TOD base: 0x%llx\n", gtod); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index cc15ea3a150e..1e70e00d3c5e 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -271,6 +271,16 @@ static inline void kvm_s390_vcpu_unblock_all(struct kvm *kvm) kvm_s390_vcpu_unblock(vcpu); } +static inline u64 kvm_s390_get_tod_clock_fast(struct kvm *kvm) +{ + u64 rc; + + preempt_disable(); + rc = get_tod_clock_fast() + kvm->arch.epoch; + preempt_enable(); + return rc; +} + /** * kvm_s390_inject_prog_cond - conditionally inject a program check * @vcpu: virtual cpu From c77f3fab441c3e466b4c3601a475fc31ce156b06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Thu, 8 Oct 2015 20:23:33 +0200 Subject: [PATCH 055/123] kvm: x86: set KVM_REQ_EVENT when updating IRR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After moving PIR to IRR, the interrupt needs to be delivered manually. Reported-by: Paolo Bonzini Cc: Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 944b38a56929..168b8759bd73 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -348,6 +348,8 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) struct kvm_lapic *apic = vcpu->arch.apic; __kvm_apic_update_irr(pir, apic->regs); + + kvm_make_request(KVM_REQ_EVENT, vcpu); } EXPORT_SYMBOL_GPL(kvm_apic_update_irr); From db2bdcbbbd32e5500b822d5e74ef8b5bd777e687 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Thu, 8 Oct 2015 20:23:34 +0200 Subject: [PATCH 056/123] KVM: x86: fix edge EOI and IOAPIC reconfig race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KVM uses eoi_exit_bitmap to track vectors that need an action on EOI. The problem is that IOAPIC can be reconfigured while an interrupt with old configuration is pending and eoi_exit_bitmap only remembers the newest configuration; thus EOI from the pending interrupt is not recognized. (Reconfiguration is not a problem for level interrupts, because IOAPIC sends interrupt with the new configuration.) For an edge interrupt with ACK notifiers, like i8254 timer; things can happen in this order 1) IOAPIC inject a vector from i8254 2) guest reconfigures that vector's VCPU and therefore eoi_exit_bitmap on original VCPU gets cleared 3) guest's handler for the vector does EOI 4) KVM's EOI handler doesn't pass that vector to IOAPIC because it is not in that VCPU's eoi_exit_bitmap 5) i8254 stops working A simple solution is to set the IOAPIC vector in eoi_exit_bitmap if the vector is in PIR/IRR/ISR. This creates an unwanted situation if the vector is reused by a non-IOAPIC source, but I think it is so rare that we don't want to make the solution more sophisticated. The simple solution also doesn't work if we are reconfiguring the vector. (Shouldn't happen in the wild and I'd rather fix users of ACK notifiers instead of working around that.) The are no races because ioapic injection and reconfig are locked. Fixes: b053b2aef25d ("KVM: x86: Add EOI exit bitmap inference") [Before b053b2aef25d, this bug happened only with APICv.] Fixes: c7c9c56ca26f ("x86, apicv: add virtual interrupt delivery support") Cc: Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/ioapic.c | 4 +++- arch/x86/kvm/x86.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 2dcda0f188ba..88d0a92d3f94 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -246,7 +246,9 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) || index == RTC_GSI) { if (kvm_apic_match_dest(vcpu, NULL, 0, - e->fields.dest_id, e->fields.dest_mode)) + e->fields.dest_id, e->fields.dest_mode) || + (e->fields.trig_mode == IOAPIC_EDGE_TRIG && + kvm_apic_pending_eoi(vcpu, e->fields.vector))) __set_bit(e->fields.vector, (unsigned long *)eoi_exit_bitmap); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e28954d2698a..e33aebbf189e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6201,8 +6201,10 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) if (irqchip_split(vcpu->kvm)) kvm_scan_ioapic_routes(vcpu, vcpu->arch.eoi_exit_bitmap); - else + else { + kvm_x86_ops->sync_pir_to_irr(vcpu); kvm_ioapic_scan_entry(vcpu, vcpu->arch.eoi_exit_bitmap); + } kvm_x86_ops->load_eoi_exitmap(vcpu); } From 13db77347db175a68edd58a79963cdf5cb3a9607 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Thu, 8 Oct 2015 20:30:00 +0200 Subject: [PATCH 057/123] KVM: x86: don't notify userspace IOAPIC on edge EOI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On real hardware, edge-triggered interrupts don't set a bit in TMR, which means that IOAPIC isn't notified on EOI. Do the same here. Staying in guest/kernel mode after edge EOI is what we want for most devices. If some bugs could be nicely worked around with edge EOI notifications, we should invest in a better interface. Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/irq_comm.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index c89228980230..8f4499c7ffc1 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -389,13 +389,15 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) for (i = 0; i < nr_ioapic_pins; ++i) { hlist_for_each_entry(entry, &table->map[i], link) { u32 dest_id, dest_mode; + bool level; if (entry->type != KVM_IRQ_ROUTING_MSI) continue; dest_id = (entry->msi.address_lo >> 12) & 0xff; dest_mode = (entry->msi.address_lo >> 2) & 0x1; - if (kvm_apic_match_dest(vcpu, NULL, 0, dest_id, - dest_mode)) { + level = entry->msi.data & MSI_DATA_TRIGGER_LEVEL; + if (level && kvm_apic_match_dest(vcpu, NULL, 0, + dest_id, dest_mode)) { u32 vector = entry->msi.data & 0xff; __set_bit(vector, From 6003a4201077da41427dae5e71bb2f31dfdc0c10 Mon Sep 17 00:00:00 2001 From: Kosuke Tatsukawa Date: Fri, 9 Oct 2015 12:21:55 +0000 Subject: [PATCH 058/123] kvm: fix waitqueue_active without memory barrier in virt/kvm/async_pf.c async_pf_execute() seems to be missing a memory barrier which might cause the waker to not notice the waiter and miss sending a wake_up as in the following figure. async_pf_execute kvm_vcpu_block ------------------------------------------------------------------------ spin_lock(&vcpu->async_pf.lock); if (waitqueue_active(&vcpu->wq)) /* The CPU might reorder the test for the waitqueue up here, before prior writes complete */ prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); /*if (kvm_vcpu_check_block(vcpu) < 0) */ /*if (kvm_arch_vcpu_runnable(vcpu)) { */ ... return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && !vcpu->arch.apf.halted) || !list_empty_careful(&vcpu->async_pf.done) ... return 0; list_add_tail(&apf->link, &vcpu->async_pf.done); spin_unlock(&vcpu->async_pf.lock); waited = true; schedule(); ------------------------------------------------------------------------ The attached patch adds the missing memory barrier. I found this issue when I was looking through the linux source code for places calling waitqueue_active() before wake_up*(), but without preceding memory barriers, after sending a patch to fix a similar issue in drivers/tty/n_tty.c (Details about the original issue can be found here: https://lkml.org/lkml/2015/9/28/849). Signed-off-by: Kosuke Tatsukawa Signed-off-by: Paolo Bonzini --- virt/kvm/async_pf.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 44660aee335f..77d42be6970e 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -94,6 +94,10 @@ static void async_pf_execute(struct work_struct *work) trace_kvm_async_pf_completed(addr, gva); + /* + * This memory barrier pairs with prepare_to_wait's set_current_state() + */ + smp_mb(); if (waitqueue_active(&vcpu->wq)) wake_up_interruptible(&vcpu->wq); From 991e7a0eedf12721f5561d7a1a54d248dc290bc0 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 16 Sep 2015 17:30:05 +0800 Subject: [PATCH 059/123] KVM: VMX: adjust interface to allocate/free_vpid Adjust allocate/free_vid so that they can be reused for the nested vpid. Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8eeba6ac5914..1452271e98ae 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4268,29 +4268,28 @@ static int alloc_identity_pagetable(struct kvm *kvm) return r; } -static void allocate_vpid(struct vcpu_vmx *vmx) +static int allocate_vpid(void) { int vpid; - vmx->vpid = 0; if (!enable_vpid) - return; + return 0; spin_lock(&vmx_vpid_lock); vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); - if (vpid < VMX_NR_VPIDS) { - vmx->vpid = vpid; + if (vpid < VMX_NR_VPIDS) __set_bit(vpid, vmx_vpid_bitmap); - } + else + vpid = 0; spin_unlock(&vmx_vpid_lock); + return vpid; } -static void free_vpid(struct vcpu_vmx *vmx) +static void free_vpid(int vpid) { - if (!enable_vpid) + if (!enable_vpid || vpid == 0) return; spin_lock(&vmx_vpid_lock); - if (vmx->vpid != 0) - __clear_bit(vmx->vpid, vmx_vpid_bitmap); + __clear_bit(vpid, vmx_vpid_bitmap); spin_unlock(&vmx_vpid_lock); } @@ -8629,7 +8628,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) if (enable_pml) vmx_disable_pml(vmx); - free_vpid(vmx); + free_vpid(vmx->vpid); leave_guest_mode(vcpu); vmx_load_vmcs01(vcpu); free_nested(vmx); @@ -8648,7 +8647,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (!vmx) return ERR_PTR(-ENOMEM); - allocate_vpid(vmx); + vmx->vpid = allocate_vpid(); err = kvm_vcpu_init(&vmx->vcpu, kvm, id); if (err) @@ -8724,7 +8723,7 @@ free_msrs: uninit_vcpu: kvm_vcpu_uninit(&vmx->vcpu); free_vcpu: - free_vpid(vmx); + free_vpid(vmx->vpid); kmem_cache_free(kvm_vcpu_cache, vmx); return ERR_PTR(err); } From dd5f5341a3a684c8850f8a8ccbaa70e196518e76 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 23 Sep 2015 18:26:57 +0800 Subject: [PATCH 060/123] KVM: VMX: introduce __vmx_flush_tlb to handle specific vpid Introduce __vmx_flush_tlb() to handle specific vpid. Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1452271e98ae..e1d94f81a28d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1392,13 +1392,13 @@ static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) __loaded_vmcs_clear, loaded_vmcs, 1); } -static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) +static inline void vpid_sync_vcpu_single(int vpid) { - if (vmx->vpid == 0) + if (vpid == 0) return; if (cpu_has_vmx_invvpid_single()) - __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); + __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0); } static inline void vpid_sync_vcpu_global(void) @@ -1407,10 +1407,10 @@ static inline void vpid_sync_vcpu_global(void) __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); } -static inline void vpid_sync_context(struct vcpu_vmx *vmx) +static inline void vpid_sync_context(int vpid) { if (cpu_has_vmx_invvpid_single()) - vpid_sync_vcpu_single(vmx); + vpid_sync_vcpu_single(vpid); else vpid_sync_vcpu_global(); } @@ -3563,9 +3563,9 @@ static void exit_lmode(struct kvm_vcpu *vcpu) #endif -static void vmx_flush_tlb(struct kvm_vcpu *vcpu) +static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid) { - vpid_sync_context(to_vmx(vcpu)); + vpid_sync_context(vpid); if (enable_ept) { if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; @@ -3573,6 +3573,11 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu) } } +static void vmx_flush_tlb(struct kvm_vcpu *vcpu) +{ + __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid); +} + static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) { ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; @@ -4915,7 +4920,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx_fpu_activate(vcpu); update_exception_bitmap(vcpu); - vpid_sync_context(vmx); + vpid_sync_context(vmx->vpid); } /* From 6a14c2222419daa7ecebc7f566841b9b4d63b397 Mon Sep 17 00:00:00 2001 From: Tudor Laurentiu Date: Wed, 23 Sep 2015 18:06:22 +0300 Subject: [PATCH 061/123] powerpc/e6500: add TMCFG0 register definition The register is not currently used in the base kernel but will be in a forthcoming kvm patch. Signed-off-by: Laurentiu Tudor Acked-by: Scott Wood Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/reg_booke.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h index 16547efa2d5a..2fef74b474f0 100644 --- a/arch/powerpc/include/asm/reg_booke.h +++ b/arch/powerpc/include/asm/reg_booke.h @@ -742,6 +742,12 @@ #define MMUBE1_VBE4 0x00000002 #define MMUBE1_VBE5 0x00000001 +#define TMRN_TMCFG0 16 /* Thread Management Configuration Register 0 */ +#define TMRN_TMCFG0_NPRIBITS 0x003f0000 /* Bits of thread priority */ +#define TMRN_TMCFG0_NPRIBITS_SHIFT 16 +#define TMRN_TMCFG0_NATHRD 0x00003f00 /* Number of active threads */ +#define TMRN_TMCFG0_NATHRD_SHIFT 8 +#define TMRN_TMCFG0_NTHRD 0x0000003f /* Number of threads */ #define TMRN_IMSR0 0x120 /* Initial MSR Register 0 (e6500) */ #define TMRN_IMSR1 0x121 /* Initial MSR Register 1 (e6500) */ #define TMRN_INIA0 0x140 /* Next Instruction Address Register 0 */ From d4cd4f9586f87a5fc828b4c4698aa4faf56c96fc Mon Sep 17 00:00:00 2001 From: Andrzej Hajda Date: Thu, 24 Sep 2015 16:00:23 +0200 Subject: [PATCH 062/123] KVM: PPC: e500: fix handling local_sid_lookup result The function can return negative value. The problem has been detected using proposed semantic patch scripts/coccinelle/tests/assign_signed_to_unsigned.cocci [1]. [1]: http://permalink.gmane.org/gmane.linux.kernel/2046107 Signed-off-by: Andrzej Hajda Acked-by: Scott Wood Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/e500.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index b29ce752c7d6..32fdab57d604 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -237,7 +237,8 @@ void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500, struct kvm_book3e_206_tlb_entry *gtlbe) { struct vcpu_id_table *idt = vcpu_e500->idt; - unsigned int pr, tid, ts, pid; + unsigned int pr, tid, ts; + int pid; u32 val, eaddr; unsigned long flags; From 2daab50e17997424af57d1ab14042a51e9a368ab Mon Sep 17 00:00:00 2001 From: Tudor Laurentiu Date: Fri, 25 Sep 2015 18:02:23 +0300 Subject: [PATCH 063/123] KVM: PPC: e500: Emulate TMCFG0 TMRN register Emulate TMCFG0 TMRN register exposing one HW thread per vcpu. Signed-off-by: Mihai Caraman [Laurentiu.Tudor@freescale.com: rebased on latest kernel, use define instead of hardcoded value, moved code in own function] Signed-off-by: Laurentiu Tudor Acked-by: Scott Wood Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/disassemble.h | 5 +++++ arch/powerpc/kvm/e500_emulate.c | 19 +++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/arch/powerpc/include/asm/disassemble.h b/arch/powerpc/include/asm/disassemble.h index 6330a61b875a..4852e849128b 100644 --- a/arch/powerpc/include/asm/disassemble.h +++ b/arch/powerpc/include/asm/disassemble.h @@ -42,6 +42,11 @@ static inline unsigned int get_dcrn(u32 inst) return ((inst >> 16) & 0x1f) | ((inst >> 6) & 0x3e0); } +static inline unsigned int get_tmrn(u32 inst) +{ + return ((inst >> 16) & 0x1f) | ((inst >> 6) & 0x3e0); +} + static inline unsigned int get_rt(u32 inst) { return (inst >> 21) & 0x1f; diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index ce7291c79f6c..990db69a1d0b 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "booke.h" #include "e500.h" @@ -22,6 +23,7 @@ #define XOP_DCBTLS 166 #define XOP_MSGSND 206 #define XOP_MSGCLR 238 +#define XOP_MFTMR 366 #define XOP_TLBIVAX 786 #define XOP_TLBSX 914 #define XOP_TLBRE 946 @@ -113,6 +115,19 @@ static int kvmppc_e500_emul_dcbtls(struct kvm_vcpu *vcpu) return EMULATE_DONE; } +static int kvmppc_e500_emul_mftmr(struct kvm_vcpu *vcpu, unsigned int inst, + int rt) +{ + /* Expose one thread per vcpu */ + if (get_tmrn(inst) == TMRN_TMCFG0) { + kvmppc_set_gpr(vcpu, rt, + 1 | (1 << TMRN_TMCFG0_NATHRD_SHIFT)); + return EMULATE_DONE; + } + + return EMULATE_FAIL; +} + int kvmppc_core_emulate_op_e500(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int inst, int *advance) { @@ -165,6 +180,10 @@ int kvmppc_core_emulate_op_e500(struct kvm_run *run, struct kvm_vcpu *vcpu, emulated = kvmppc_e500_emul_tlbivax(vcpu, ea); break; + case XOP_MFTMR: + emulated = kvmppc_e500_emul_mftmr(vcpu, inst, rt); + break; + case XOP_EHPRIV: emulated = kvmppc_e500_emul_ehpriv(run, vcpu, inst, advance); From 224f363246c3668452ec0ab5a0ff51824822f3fd Mon Sep 17 00:00:00 2001 From: Tudor Laurentiu Date: Thu, 1 Oct 2015 15:58:03 +0300 Subject: [PATCH 064/123] KVM: PPC: e500: fix couple of shift operations on 64 bits Fix couple of cases where we shift left a 32-bit value thus might get truncated results on 64-bit targets. Signed-off-by: Laurentiu Tudor Suggested-by: Scott Wood Acked-by: Scott Wood Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/e500_mmu_host.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index 4d33e199edcc..5e2102c19586 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -406,7 +406,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) { unsigned long gfn_start, gfn_end; - tsize_pages = 1 << (tsize - 2); + tsize_pages = 1UL << (tsize - 2); gfn_start = gfn & ~(tsize_pages - 1); gfn_end = gfn_start + tsize_pages; @@ -447,7 +447,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, } if (likely(!pfnmap)) { - tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT); + tsize_pages = 1UL << (tsize + 10 - PAGE_SHIFT); pfn = gfn_to_pfn_memslot(slot, gfn); if (is_error_noslot_pfn(pfn)) { if (printk_ratelimit()) From 966d713e86db1916a0b45a324a935d009737316e Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Mon, 23 Mar 2015 22:24:45 +0530 Subject: [PATCH 065/123] KVM: PPC: Book3S HV: Deliver machine check with MSR(RI=0) to guest as MCE For the machine check interrupt that happens while we are in the guest, kvm layer attempts the recovery, and then delivers the machine check interrupt directly to the guest if recovery fails. On successful recovery we go back to normal functioning of the guest. But there can be cases where a machine check interrupt can happen with MSR(RI=0) while we are in the guest. This means MC interrupt is unrecoverable and we have to deliver a machine check to the guest since the machine check interrupt might have trashed valid values in SRR0/1. The current implementation do not handle this case, causing guest to crash with Bad kernel stack pointer instead of machine check oops message. [26281.490060] Bad kernel stack pointer 3fff9ccce5b0 at c00000000000490c [26281.490434] Oops: Bad kernel stack pointer, sig: 6 [#1] [26281.490472] SMP NR_CPUS=2048 NUMA pSeries This patch fixes this issue by checking MSR(RI=0) in KVM layer and forwarding unrecoverable interrupt to guest which then panics with proper machine check Oops message. Signed-off-by: Mahesh Salgaonkar Acked-by: Paul Mackerras Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index b98889e9851d..fb8d29650612 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -2377,7 +2377,6 @@ machine_check_realmode: mr r3, r9 /* get vcpu pointer */ bl kvmppc_realmode_machine_check nop - cmpdi r3, 0 /* Did we handle MCE ? */ ld r9, HSTATE_KVM_VCPU(r13) li r12, BOOK3S_INTERRUPT_MACHINE_CHECK /* @@ -2390,13 +2389,18 @@ machine_check_realmode: * The old code used to return to host for unhandled errors which * was causing guest to hang with soft lockups inside guest and * makes it difficult to recover guest instance. + * + * if we receive machine check with MSR(RI=0) then deliver it to + * guest as machine check causing guest to crash. */ - ld r10, VCPU_PC(r9) ld r11, VCPU_MSR(r9) + andi. r10, r11, MSR_RI /* check for unrecoverable exception */ + beq 1f /* Deliver a machine check to guest */ + ld r10, VCPU_PC(r9) + cmpdi r3, 0 /* Did we handle MCE ? */ bne 2f /* Continue guest execution. */ /* If not, deliver a machine check. SRR0/1 are already set */ - li r10, BOOK3S_INTERRUPT_MACHINE_CHECK - ld r11, VCPU_MSR(r9) +1: li r10, BOOK3S_INTERRUPT_MACHINE_CHECK bl kvmppc_msr_interrupt 2: b fast_interrupt_c_return From 99b83ac893b84ed1a62ad6d1f2b6cc32026b9e85 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 13 Oct 2015 09:12:21 -0700 Subject: [PATCH 066/123] KVM: nVMX: emulate the INVVPID instruction Add the INVVPID instruction emulation. Reviewed-by: Wincy Van Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 1 + arch/x86/kvm/vmx.c | 61 +++++++++++++++++++++++++++++++++++++- 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index d25f32ac9c5c..aa336ff3e03e 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -416,6 +416,7 @@ enum vmcs_field { #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) +#define VMX_VPID_INVVPID_BIT (1ull << 0) /* (32 - 32) */ #define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */ #define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e1d94f81a28d..9b6ab311d0bd 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -442,6 +442,7 @@ struct nested_vmx { u32 nested_vmx_misc_low; u32 nested_vmx_misc_high; u32 nested_vmx_ept_caps; + u32 nested_vmx_vpid_caps; }; #define POSTED_INTR_ON 0 @@ -2612,6 +2613,8 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) } else vmx->nested.nested_vmx_ept_caps = 0; + vmx->nested.nested_vmx_vpid_caps = 0; + if (enable_unrestricted_guest) vmx->nested.nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_UNRESTRICTED_GUEST; @@ -7329,7 +7332,63 @@ static int handle_invept(struct kvm_vcpu *vcpu) static int handle_invvpid(struct kvm_vcpu *vcpu) { - kvm_queue_exception(vcpu, UD_VECTOR); + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 vmx_instruction_info; + unsigned long type, types; + gva_t gva; + struct x86_exception e; + int vpid; + + if (!(vmx->nested.nested_vmx_secondary_ctls_high & + SECONDARY_EXEC_ENABLE_VPID) || + !(vmx->nested.nested_vmx_vpid_caps & VMX_VPID_INVVPID_BIT)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); + + types = (vmx->nested.nested_vmx_vpid_caps >> 8) & 0x7; + + if (!(types & (1UL << type))) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return 1; + } + + /* according to the intel vmx instruction reference, the memory + * operand is read even if it isn't needed (e.g., for type==global) + */ + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + vmx_instruction_info, false, &gva)) + return 1; + if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vpid, + sizeof(u32), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + + switch (type) { + case VMX_VPID_EXTENT_ALL_CONTEXT: + if (get_vmcs12(vcpu)->virtual_processor_id == 0) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return 1; + } + vmx_flush_tlb(vcpu); + nested_vmx_succeed(vcpu); + break; + default: + /* Trap single context invalidation invvpid calls */ + BUG_ON(1); + break; + } + + skip_emulated_instruction(vcpu); return 1; } From 5c614b3583e7b6dab0c86356fa36c2bcbb8322a0 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 13 Oct 2015 09:18:36 -0700 Subject: [PATCH 067/123] KVM: nVMX: nested VPID emulation VPID is used to tag address space and avoid a TLB flush. Currently L0 use the same VPID to run L1 and all its guests. KVM flushes VPID when switching between L1 and L2. This patch advertises VPID to the L1 hypervisor, then address space of L1 and L2 can be separately treated and avoid TLB flush when swithing between L1 and L2. For each nested vmentry, if vpid12 is changed, reuse shadow vpid w/ an invvpid. Performance: run lmbench on L2 w/ 3.5 kernel. Context switching - times in microseconds - smaller is better ------------------------------------------------------------------------- Host OS 2p/0K 2p/16K 2p/64K 8p/16K 8p/64K 16p/16K 16p/64K ctxsw ctxsw ctxsw ctxsw ctxsw ctxsw ctxsw --------- ------------- ------ ------ ------ ------ ------ ------- ------- kernel Linux 3.5.0-1 1.2200 1.3700 1.4500 4.7800 2.3300 5.60000 2.88000 nested VPID kernel Linux 3.5.0-1 1.2600 1.4300 1.5600 12.7 12.9 3.49000 7.46000 vanilla Reviewed-by: Jan Kiszka Reviewed-by: Wincy Van Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 39 ++++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9b6ab311d0bd..bea552204aa2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -426,6 +426,9 @@ struct nested_vmx { /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ u64 vmcs01_debugctl; + u16 vpid02; + u16 last_vpid; + u32 nested_vmx_procbased_ctls_low; u32 nested_vmx_procbased_ctls_high; u32 nested_vmx_true_procbased_ctls_low; @@ -1213,6 +1216,11 @@ static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); } +static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID); +} + static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12) { return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT); @@ -2590,6 +2598,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_WBINVD_EXITING | @@ -6818,6 +6827,7 @@ static void free_nested(struct vcpu_vmx *vmx) return; vmx->nested.vmxon = false; + free_vpid(vmx->nested.vpid02); nested_release_vmcs12(vmx); if (enable_shadow_vmcs) free_vmcs(vmx->nested.current_shadow_vmcs); @@ -7379,7 +7389,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); return 1; } - vmx_flush_tlb(vcpu); + __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); nested_vmx_succeed(vcpu); break; default: @@ -8759,8 +8769,10 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto free_vmcs; } - if (nested) + if (nested) { nested_vmx_setup_ctls_msrs(vmx); + vmx->nested.vpid02 = allocate_vpid(); + } vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = -1ull; @@ -8781,6 +8793,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) return &vmx->vcpu; free_vmcs: + free_vpid(vmx->nested.vpid02); free_loaded_vmcs(vmx->loaded_vmcs); free_msrs: kfree(vmx->guest_msrs); @@ -9665,12 +9678,24 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (enable_vpid) { /* - * Trivially support vpid by letting L2s share their parent - * L1's vpid. TODO: move to a more elaborate solution, giving - * each L2 its own vpid and exposing the vpid feature to L1. + * There is no direct mapping between vpid02 and vpid12, the + * vpid02 is per-vCPU for L0 and reused while the value of + * vpid12 is changed w/ one invvpid during nested vmentry. + * The vpid12 is allocated by L1 for L2, so it will not + * influence global bitmap(for vpid01 and vpid02 allocation) + * even if spawn a lot of nested vCPUs. */ - vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); - vmx_flush_tlb(vcpu); + if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) { + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); + if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) { + vmx->nested.last_vpid = vmcs12->virtual_processor_id; + __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); + } + } else { + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); + vmx_flush_tlb(vcpu); + } + } if (nested_cpu_has_ept(vmcs12)) { From 089d7b6ec5151ad06a2cd524bc0580d311b641ad Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 13 Oct 2015 09:18:37 -0700 Subject: [PATCH 068/123] KVM: nVMX: expose VPID capability to L1 Expose VPID capability to L1. For nested guests, we don't do anything specific for single context invalidation. Hence, only advertise support for global context invalidation. The major benefit of nested VPID comes from having separate vpids when switching between L1 and L2, and also when L2's vCPUs not sched in/out on L1. Reviewed-by: Wincy Van Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bea552204aa2..15bff51d492a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2622,7 +2622,11 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) } else vmx->nested.nested_vmx_ept_caps = 0; - vmx->nested.nested_vmx_vpid_caps = 0; + if (enable_vpid) + vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT | + VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; + else + vmx->nested.nested_vmx_vpid_caps = 0; if (enable_unrestricted_guest) vmx->nested.nested_vmx_secondary_ctls_high |= @@ -2739,7 +2743,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) break; case MSR_IA32_VMX_EPT_VPID_CAP: /* Currently, no nested vpid support */ - *pdata = vmx->nested.nested_vmx_ept_caps; + *pdata = vmx->nested.nested_vmx_ept_caps | + ((u64)vmx->nested.nested_vmx_vpid_caps << 32); break; default: return 1; From 951f9fd74f2d826fff1d84a8ec34b491517dc15d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 23 Sep 2015 10:34:26 +0200 Subject: [PATCH 069/123] KVM: x86: manually unroll bad_mt_xwr loop The loop is computing one of two constants, it can be simpler to write everything inline. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c3f39aa9b9cb..b8482c0b75b2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3706,7 +3706,7 @@ static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, int maxphyaddr, bool execonly) { - int pte; + u64 bad_mt_xwr; rsvd_check->rsvd_bits_mask[0][3] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); @@ -3724,14 +3724,16 @@ __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20); rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; - for (pte = 0; pte < 64; pte++) { - int rwx_bits = pte & 7; - int mt = pte >> 3; - if (mt == 0x2 || mt == 0x3 || mt == 0x7 || - rwx_bits == 0x2 || rwx_bits == 0x6 || - (rwx_bits == 0x4 && !execonly)) - rsvd_check->bad_mt_xwr |= (1ull << pte); + bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */ + bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */ + bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */ + bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */ + bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */ + if (!execonly) { + /* bits 0..2 must not be 100 unless VMX capabilities allow it */ + bad_mt_xwr |= REPEAT_BYTE(1ull << 4); } + rsvd_check->bad_mt_xwr = bad_mt_xwr; } static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, From 6092d3d3e6db983048469d424a8f2221915a8dd3 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 14 Oct 2015 15:10:54 +0200 Subject: [PATCH 070/123] kvm: svm: Only propagate next_rip when guest supports it Currently we always write the next_rip of the shadow vmcb to the guests vmcb when we emulate a vmexit. This could confuse the guest when its cpuid indicated no support for the next_rip feature. Fix this by only propagating next_rip if the guest actually supports it. Cc: Bandan Das Cc: Dirk Mueller Tested-By: Dirk Mueller Signed-off-by: Joerg Roedel Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.h | 21 +++++++++++++++++++++ arch/x86/kvm/svm.c | 11 ++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index d434ee952b00..06332cb7e7d1 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -149,4 +149,25 @@ static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu) best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); return best && (best->edx & bit(X86_FEATURE_RDTSCP)); } + +/* + * NRIPS is provided through cpuidfn 0x8000000a.edx bit 3 + */ +#define BIT_NRIPS 3 + +static inline bool guest_cpuid_has_nrips(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x8000000a, 0); + + /* + * NRIPS is a scattered cpuid feature, so we can't use + * X86_FEATURE_NRIPS here (X86_FEATURE_NRIPS would be bit + * position 8, not 3). + */ + return best && (best->edx & bit(BIT_NRIPS)); +} +#undef BIT_NRIPS + #endif diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 54a86183e5d3..cd8659cfc632 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -159,6 +159,9 @@ struct vcpu_svm { u32 apf_reason; u64 tsc_ratio; + + /* cached guest cpuid flags for faster access */ + bool nrips_enabled : 1; }; static DEFINE_PER_CPU(u64, current_tsc_ratio); @@ -2365,7 +2368,9 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; - nested_vmcb->control.next_rip = vmcb->control.next_rip; + + if (svm->nrips_enabled) + nested_vmcb->control.next_rip = vmcb->control.next_rip; /* * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have @@ -4085,6 +4090,10 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) static void svm_cpuid_update(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); + + /* Update nrips enabled cache */ + svm->nrips_enabled = !!guest_cpuid_has_nrips(&svm->vcpu); } static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) From cd1872f028556dc0e8424e58413c0268c159383b Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Fri, 16 Oct 2015 17:04:13 +0900 Subject: [PATCH 071/123] KVM: x86: MMU: Make force_pt_level bool This will be passed to a function later. Signed-off-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 8 ++++---- arch/x86/kvm/paging_tmpl.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b8482c0b75b2..2262728863de 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2962,7 +2962,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, { int r; int level; - int force_pt_level; + bool force_pt_level; pfn_t pfn; unsigned long mmu_seq; bool map_writable, write = error_code & PFERR_WRITE_MASK; @@ -3476,7 +3476,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, pfn_t pfn; int r; int level; - int force_pt_level; + bool force_pt_level; gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; int write = error_code & PFERR_WRITE_MASK; @@ -3497,9 +3497,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (mapping_level_dirty_bitmap(vcpu, gfn) || !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL)) - force_pt_level = 1; + force_pt_level = true; else - force_pt_level = 0; + force_pt_level = false; if (likely(!force_pt_level)) { level = mapping_level(vcpu, gfn); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 736e6ab8784d..07f1a4ede637 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -698,7 +698,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int r; pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; - int force_pt_level; + bool force_pt_level; unsigned long mmu_seq; bool map_writable, is_self_change_mapping; @@ -747,7 +747,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn) || is_self_change_mapping; else - force_pt_level = 1; + force_pt_level = true; if (!force_pt_level) { level = min(walker.level, mapping_level(vcpu, walker.gfn)); walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); From 5ed5c5c8fdbab889837c9223fc6f4bdaa830879c Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Fri, 16 Oct 2015 17:05:13 +0900 Subject: [PATCH 072/123] KVM: x86: MMU: Simplify force_pt_level calculation code in FNAME(page_fault)() As a bonus, an extra memory slot search can be eliminated when is_self_change_mapping is true. Signed-off-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/paging_tmpl.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 07f1a4ede637..8ebc3a5560ce 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -743,15 +743,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); - if (walker.level >= PT_DIRECTORY_LEVEL) - force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn) - || is_self_change_mapping; - else + if (walker.level >= PT_DIRECTORY_LEVEL && !is_self_change_mapping) { + force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn); + if (!force_pt_level) { + level = min(walker.level, mapping_level(vcpu, walker.gfn)); + walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); + } + } else force_pt_level = true; - if (!force_pt_level) { - level = min(walker.level, mapping_level(vcpu, walker.gfn)); - walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); - } mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); From fd136902187838bcae3a572f41cb703553dd63b8 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Fri, 16 Oct 2015 17:06:02 +0900 Subject: [PATCH 073/123] KVM: x86: MMU: Move mapping_level_dirty_bitmap() call in mapping_level() This is necessary to eliminate an extra memory slot search later. Signed-off-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 29 ++++++++++++++--------------- arch/x86/kvm/paging_tmpl.h | 6 +++--- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 2262728863de..890cd694c9a2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -870,10 +870,16 @@ static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true); } -static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) +static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, + bool *force_pt_level) { int host_level, level, max_level; + if (likely(!*force_pt_level)) + *force_pt_level = mapping_level_dirty_bitmap(vcpu, large_gfn); + if (unlikely(*force_pt_level)) + return PT_PAGE_TABLE_LEVEL; + host_level = host_mapping_level(vcpu->kvm, large_gfn); if (host_level == PT_PAGE_TABLE_LEVEL) @@ -2962,14 +2968,13 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, { int r; int level; - bool force_pt_level; + bool force_pt_level = false; pfn_t pfn; unsigned long mmu_seq; bool map_writable, write = error_code & PFERR_WRITE_MASK; - force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); + level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { - level = mapping_level(vcpu, gfn); /* * This path builds a PAE pagetable - so we can map * 2mb pages at maximum. Therefore check if the level @@ -2979,8 +2984,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, level = PT_DIRECTORY_LEVEL; gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - } else - level = PT_PAGE_TABLE_LEVEL; + } if (fast_page_fault(vcpu, v, level, error_code)) return 0; @@ -3495,20 +3499,15 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (r) return r; - if (mapping_level_dirty_bitmap(vcpu, gfn) || - !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL)) - force_pt_level = true; - else - force_pt_level = false; - + force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn, + PT_DIRECTORY_LEVEL); + level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { - level = mapping_level(vcpu, gfn); if (level > PT_DIRECTORY_LEVEL && !check_hugepage_cache_consistency(vcpu, gfn, level)) level = PT_DIRECTORY_LEVEL; gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - } else - level = PT_PAGE_TABLE_LEVEL; + } if (fast_page_fault(vcpu, gpa, level, error_code)) return 0; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 8ebc3a5560ce..bf39d0f3efa9 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -744,9 +744,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); if (walker.level >= PT_DIRECTORY_LEVEL && !is_self_change_mapping) { - force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn); - if (!force_pt_level) { - level = min(walker.level, mapping_level(vcpu, walker.gfn)); + level = mapping_level(vcpu, walker.gfn, &force_pt_level); + if (likely(!force_pt_level)) { + level = min(walker.level, level); walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); } } else From d8aacf5df86a961923a2c9c547d341d64a9d9f5d Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Fri, 16 Oct 2015 17:07:01 +0900 Subject: [PATCH 074/123] KVM: x86: MMU: Remove mapping_level_dirty_bitmap() Now that it has only one caller, and its name is not so helpful for readers, remove it. The new memslot_valid_for_gpte() function makes it possible to share the common code between gfn_to_memslot_dirty_bitmap() and mapping_level(). Signed-off-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 890cd694c9a2..09833b04fc97 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -851,6 +851,17 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn) return ret; } +static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot, + bool no_dirty_log) +{ + if (!slot || slot->flags & KVM_MEMSLOT_INVALID) + return false; + if (no_dirty_log && slot->dirty_bitmap) + return false; + + return true; +} + static struct kvm_memory_slot * gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) @@ -858,25 +869,22 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_memory_slot *slot; slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - if (!slot || slot->flags & KVM_MEMSLOT_INVALID || - (no_dirty_log && slot->dirty_bitmap)) + if (!memslot_valid_for_gpte(slot, no_dirty_log)) slot = NULL; return slot; } -static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) -{ - return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true); -} - static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, bool *force_pt_level) { int host_level, level, max_level; + struct kvm_memory_slot *slot; + + slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn); if (likely(!*force_pt_level)) - *force_pt_level = mapping_level_dirty_bitmap(vcpu, large_gfn); + *force_pt_level = !memslot_valid_for_gpte(slot, true); if (unlikely(*force_pt_level)) return PT_PAGE_TABLE_LEVEL; From 5225fdf8c8bea4418f69875804584c89a27c170e Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Fri, 16 Oct 2015 17:08:03 +0900 Subject: [PATCH 075/123] KVM: x86: MMU: Eliminate an extra memory slot search in mapping_level() Calling kvm_vcpu_gfn_to_memslot() twice in mapping_level() should be avoided since getting a slot by binary search may not be negligible, especially for virtual machines with many memory slots. Signed-off-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 09833b04fc97..dd2a7c6ec2ca 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -818,14 +818,11 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm->arch.indirect_shadow_pages--; } -static int has_wrprotected_page(struct kvm_vcpu *vcpu, - gfn_t gfn, - int level) +static int __has_wrprotected_page(gfn_t gfn, int level, + struct kvm_memory_slot *slot) { - struct kvm_memory_slot *slot; struct kvm_lpage_info *linfo; - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); if (slot) { linfo = lpage_info_slot(gfn, slot, level); return linfo->write_count; @@ -834,6 +831,14 @@ static int has_wrprotected_page(struct kvm_vcpu *vcpu, return 1; } +static int has_wrprotected_page(struct kvm_vcpu *vcpu, gfn_t gfn, int level) +{ + struct kvm_memory_slot *slot; + + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + return __has_wrprotected_page(gfn, level, slot); +} + static int host_mapping_level(struct kvm *kvm, gfn_t gfn) { unsigned long page_size; @@ -896,7 +901,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, max_level = min(kvm_x86_ops->get_lpage_level(), host_level); for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) - if (has_wrprotected_page(vcpu, large_gfn, level)) + if (__has_wrprotected_page(large_gfn, level, slot)) break; return level - 1; From 7cae2bedcbd4680b155999655e49c27b9cf020fa Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 14 Oct 2015 19:33:09 -0300 Subject: [PATCH 076/123] KVM: x86: move steal time initialization to vcpu entry time As reported at https://bugs.launchpad.net/qemu/+bug/1494350, it is possible to have vcpu->arch.st.last_steal initialized from a thread other than vcpu thread, say the iothread, via KVM_SET_MSRS. Which can cause an overflow later (when subtracting from vcpu threads sched_info.run_delay). To avoid that, move steal time accumulation to vcpu entry time, before copying steal time data to guest. Signed-off-by: Marcelo Tosatti Reviewed-by: David Matlack Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e33aebbf189e..9e9c226cb79d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1903,6 +1903,8 @@ static void accumulate_steal_time(struct kvm_vcpu *vcpu) static void record_steal_time(struct kvm_vcpu *vcpu) { + accumulate_steal_time(vcpu); + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; @@ -2053,12 +2055,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!(data & KVM_MSR_ENABLED)) break; - vcpu->arch.st.last_steal = current->sched_info.run_delay; - - preempt_disable(); - accumulate_steal_time(vcpu); - preempt_enable(); - kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); break; @@ -2634,7 +2630,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vcpu->cpu = cpu; } - accumulate_steal_time(vcpu); kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); } From 351dc6477cd35136ce4668401b1b1332a62908a8 Mon Sep 17 00:00:00 2001 From: Andrey Smetanin Date: Fri, 16 Oct 2015 10:07:45 +0300 Subject: [PATCH 077/123] kvm/eventfd: avoid loop inside irqfd_update() The loop(for) inside irqfd_update() is unnecessary because any other value for irq_entry.type will just trigger schedule_work(&irqfd->inject) in irqfd_wakeup. Signed-off-by: Andrey Smetanin Reviewed-by: Roman Kagan Signed-off-by: Denis V. Lunev CC: Vitaly Kuznetsov CC: "K. Y. Srinivasan" CC: Gleb Natapov CC: Paolo Bonzini Signed-off-by: Paolo Bonzini --- virt/kvm/eventfd.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index b637965746bb..518421e65b0d 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -238,20 +238,17 @@ static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd) { struct kvm_kernel_irq_routing_entry *e; struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; - int i, n_entries; + int n_entries; n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi); write_seqcount_begin(&irqfd->irq_entry_sc); - irqfd->irq_entry.type = 0; - e = entries; - for (i = 0; i < n_entries; ++i, ++e) { - /* Only fast-path MSI. */ - if (e->type == KVM_IRQ_ROUTING_MSI) - irqfd->irq_entry = *e; - } + if (n_entries == 1) + irqfd->irq_entry = *e; + else + irqfd->irq_entry.type = 0; write_seqcount_end(&irqfd->irq_entry_sc); } From ba1aefcd6db5536d3eb3ca3ce7bd6786960140ea Mon Sep 17 00:00:00 2001 From: Andrey Smetanin Date: Fri, 16 Oct 2015 10:07:46 +0300 Subject: [PATCH 078/123] kvm/eventfd: factor out kvm_notify_acked_gsi() Factor out kvm_notify_acked_gsi() helper to iterate over EOI listeners and notify those matching the given gsi. It will be reused in the upcoming Hyper-V SynIC implementation. Signed-off-by: Andrey Smetanin Reviewed-by: Roman Kagan Signed-off-by: Denis V. Lunev CC: Vitaly Kuznetsov CC: "K. Y. Srinivasan" CC: Gleb Natapov CC: Paolo Bonzini Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 + virt/kvm/eventfd.c | 16 +++++++++++----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9596a2f0977b..b66861c297c4 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -829,6 +829,7 @@ int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level) int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm, int irq_source_id, int level, bool line_status); bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin); +void kvm_notify_acked_gsi(struct kvm *kvm, int gsi); void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); void kvm_register_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian); diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 518421e65b0d..f6b986a41823 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -451,9 +451,18 @@ bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin) } EXPORT_SYMBOL_GPL(kvm_irq_has_notifier); -void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) +void kvm_notify_acked_gsi(struct kvm *kvm, int gsi) { struct kvm_irq_ack_notifier *kian; + + hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, + link) + if (kian->gsi == gsi) + kian->irq_acked(kian); +} + +void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) +{ int gsi, idx; trace_kvm_ack_irq(irqchip, pin); @@ -461,10 +470,7 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) idx = srcu_read_lock(&kvm->irq_srcu); gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin); if (gsi != -1) - hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, - link) - if (kian->gsi == gsi) - kian->irq_acked(kian); + kvm_notify_acked_gsi(kvm, gsi); srcu_read_unlock(&kvm->irq_srcu, idx); } From c9a5eccac1abf50649949f15754a7635f263a1ff Mon Sep 17 00:00:00 2001 From: Andrey Smetanin Date: Fri, 16 Oct 2015 10:07:47 +0300 Subject: [PATCH 079/123] kvm/eventfd: add arch-specific set_irq Allow for arch-specific interrupt types to be set. For that, add kvm_arch_set_irq() which takes interrupt type-specific action if it recognizes the interrupt type given, and -EWOULDBLOCK otherwise. The default implementation always returns -EWOULDBLOCK. Signed-off-by: Andrey Smetanin Reviewed-by: Roman Kagan Signed-off-by: Denis V. Lunev CC: Vitaly Kuznetsov CC: "K. Y. Srinivasan" CC: Gleb Natapov CC: Paolo Bonzini Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 4 ++++ virt/kvm/eventfd.c | 13 ++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b66861c297c4..eba9caebc9c1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -828,6 +828,10 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level); int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm, int irq_source_id, int level, bool line_status); + +int kvm_arch_set_irq(struct kvm_kernel_irq_routing_entry *irq, struct kvm *kvm, + int irq_source_id, int level, bool line_status); + bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin); void kvm_notify_acked_gsi(struct kvm *kvm, int gsi); void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index f6b986a41823..e29fd2640709 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -171,6 +171,15 @@ irqfd_deactivate(struct kvm_kernel_irqfd *irqfd) queue_work(irqfd_cleanup_wq, &irqfd->shutdown); } +int __attribute__((weak)) kvm_arch_set_irq( + struct kvm_kernel_irq_routing_entry *irq, + struct kvm *kvm, int irq_source_id, + int level, + bool line_status) +{ + return -EWOULDBLOCK; +} + /* * Called with wqh->lock held and interrupts disabled */ @@ -195,7 +204,9 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) if (irq.type == KVM_IRQ_ROUTING_MSI) kvm_set_msi(&irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1, false); - else + else if (kvm_arch_set_irq(&irq, kvm, + KVM_USERSPACE_IRQ_SOURCE_ID, 1, + false) == -EWOULDBLOCK) schedule_work(&irqfd->inject); srcu_read_unlock(&kvm->irq_srcu, idx); } From f33143d80907602deb1b96db42da93507ed03b31 Mon Sep 17 00:00:00 2001 From: Andrey Smetanin Date: Fri, 16 Oct 2015 10:07:48 +0300 Subject: [PATCH 080/123] kvm/irqchip: allow only multiple irqchip routes per GSI Any other irq routing types (MSI, S390_ADAPTER, upcoming Hyper-V SynIC) map one-to-one to GSI. Signed-off-by: Andrey Smetanin Reviewed-by: Roman Kagan Signed-off-by: Denis V. Lunev CC: Vitaly Kuznetsov CC: "K. Y. Srinivasan" CC: Gleb Natapov CC: Paolo Bonzini Signed-off-by: Paolo Bonzini --- virt/kvm/irqchip.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c index 716a1c4db528..f0b08a2a48ba 100644 --- a/virt/kvm/irqchip.c +++ b/virt/kvm/irqchip.c @@ -144,11 +144,11 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt, /* * Do not allow GSI to be mapped to the same irqchip more than once. - * Allow only one to one mapping between GSI and MSI. + * Allow only one to one mapping between GSI and non-irqchip routing. */ hlist_for_each_entry(ei, &rt->map[ue->gsi], link) - if (ei->type == KVM_IRQ_ROUTING_MSI || - ue->type == KVM_IRQ_ROUTING_MSI || + if (ei->type != KVM_IRQ_ROUTING_IRQCHIP || + ue->type != KVM_IRQ_ROUTING_IRQCHIP || ue->u.irqchip.irqchip == ei->irqchip.irqchip) return r; From 5690891bcec5fcfda38da974ffa5488e36a59811 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 19 Oct 2015 11:30:19 +0200 Subject: [PATCH 081/123] kvm: x86: zero EFER on INIT Not zeroing EFER means that a 32-bit firmware cannot enter paging mode without clearing EFER.LME first (which it should not know about). Yang Zhang from Intel confirmed that the manual is wrong and EFER is cleared to zero on INIT. Fixes: d28bc9dd25ce023270d2e039e7c98d38ecbf7758 Cc: stable@vger.kernel.org Cc: Yang Z Zhang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 11 +++++------ arch/x86/kvm/vmx.c | 3 +-- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index cd8659cfc632..f2c8e4917688 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1089,7 +1089,7 @@ static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) return target_tsc - tsc; } -static void init_vmcb(struct vcpu_svm *svm, bool init_event) +static void init_vmcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; @@ -1160,8 +1160,7 @@ static void init_vmcb(struct vcpu_svm *svm, bool init_event) init_sys_seg(&save->ldtr, SEG_TYPE_LDT); init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); - if (!init_event) - svm_set_efer(&svm->vcpu, 0); + svm_set_efer(&svm->vcpu, 0); save->dr6 = 0xffff0ff0; kvm_set_rflags(&svm->vcpu, 2); save->rip = 0x0000fff0; @@ -1215,7 +1214,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (kvm_vcpu_is_reset_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; } - init_vmcb(svm, init_event); + init_vmcb(svm); kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); kvm_register_write(vcpu, VCPU_REGS_RDX, eax); @@ -1271,7 +1270,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) clear_page(svm->vmcb); svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; svm->asid_generation = 0; - init_vmcb(svm, false); + init_vmcb(svm); svm_init_osvw(&svm->vcpu); @@ -1893,7 +1892,7 @@ static int shutdown_interception(struct vcpu_svm *svm) * so reinitialize it. */ clear_page(svm->vmcb); - init_vmcb(svm, false); + init_vmcb(svm); kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; return 0; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 15bff51d492a..4d0aa31a42ba 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4932,8 +4932,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx_set_cr0(vcpu, cr0); /* enter rmode */ vmx->vcpu.arch.cr0 = cr0; vmx_set_cr4(vcpu, 0); - if (!init_event) - vmx_set_efer(vcpu, 0); + vmx_set_efer(vcpu, 0); vmx_fpu_activate(vcpu); update_exception_bitmap(vcpu); From 8c85ac1c0a1b41299370857765bc0950666ed5d9 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 19 Oct 2015 15:13:29 +0900 Subject: [PATCH 082/123] KVM: x86: MMU: Initialize force_pt_level before calling mapping_level() Commit fd1369021878 ("KVM: x86: MMU: Move mapping_level_dirty_bitmap() call in mapping_level()") forgot to initialize force_pt_level to false in FNAME(page_fault)() before calling mapping_level() like nonpaging_map() does. This can sometimes result in forcing page table level mapping unnecessarily. Fix this and move the first *force_pt_level check in mapping_level() before kvm_vcpu_gfn_to_memslot() call to make it a bit clearer that the variable must be initialized before mapping_level() gets called. This change can also avoid calling kvm_vcpu_gfn_to_memslot() when !check_hugepage_cache_consistency() check in tdp_page_fault() forces page table level mapping. Signed-off-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 7 ++++--- arch/x86/kvm/paging_tmpl.h | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index dd2a7c6ec2ca..7d85bcae3332 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -886,10 +886,11 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, int host_level, level, max_level; struct kvm_memory_slot *slot; - slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn); + if (unlikely(*force_pt_level)) + return PT_PAGE_TABLE_LEVEL; - if (likely(!*force_pt_level)) - *force_pt_level = !memslot_valid_for_gpte(slot, true); + slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn); + *force_pt_level = !memslot_valid_for_gpte(slot, true); if (unlikely(*force_pt_level)) return PT_PAGE_TABLE_LEVEL; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index bf39d0f3efa9..b41faa91a6f9 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -698,7 +698,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int r; pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; - bool force_pt_level; + bool force_pt_level = false; unsigned long mmu_seq; bool map_writable, is_self_change_mapping; From 572abd563befd56bee918316c4f7ab144d2decf5 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 29 Sep 2015 17:15:45 +1000 Subject: [PATCH 083/123] KVM: PPC: Book3S HV: Don't fall back to smaller HPT size in allocation ioctl Currently the KVM_PPC_ALLOCATE_HTAB will try to allocate the requested size of HPT, and if that is not possible, then try to allocate smaller sizes (by factors of 2) until either a minimum is reached or the allocation succeeds. This is not ideal for userspace, particularly in migration scenarios, where the destination VM really does require the size requested. Also, the minimum HPT size of 256kB may be insufficient for the guest to run successfully. This removes the fallback to smaller sizes on allocation failure for the KVM_PPC_ALLOCATE_HTAB ioctl. The fallback still exists for the case where the HPT is allocated at the time the first VCPU is run, if no HPT has been allocated by ioctl by that time. Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 1f9c0a17f445..10722b1e38b5 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -70,7 +70,8 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) } /* Lastly try successively smaller sizes from the page allocator */ - while (!hpt && order > PPC_MIN_HPT_ORDER) { + /* Only do this if userspace didn't specify a size via ioctl */ + while (!hpt && order > PPC_MIN_HPT_ORDER && !htab_orderp) { hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT| __GFP_NOWARN, order - PAGE_SHIFT); if (!hpt) From c64dfe2af3de448f9afa2e542983015b31c1f91c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 18 May 2015 14:10:54 +1000 Subject: [PATCH 084/123] KVM: PPC: Book3S HV: Make H_REMOVE return correct HPTE value for absent HPTEs This fixes a bug where the old HPTE value returned by H_REMOVE has the valid bit clear if the HPTE was an absent HPTE, as happens for HPTEs for emulated MMIO pages and for RAM pages that have been paged out by the host. If the absent bit is set, we clear it and set the valid bit, because from the guest's point of view, the HPTE is valid. Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index c1df9bb1e413..97e7f8c853d8 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -470,6 +470,8 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, note_hpte_modification(kvm, rev); unlock_hpte(hpte, 0); + if (v & HPTE_V_ABSENT) + v = (v & ~HPTE_V_ABSENT) | HPTE_V_VALID; hpret[0] = v; hpret[1] = r; return H_SUCCESS; From bfec5c2cc0adad3b343281eb4f9b94222c5f594f Mon Sep 17 00:00:00 2001 From: Nikunj A Dadhania Date: Fri, 16 Oct 2015 10:27:53 +0530 Subject: [PATCH 085/123] KVM: PPC: Implement extension to report number of memslots QEMU assumes 32 memslots if this extension is not implemented. Although, current value of KVM_USER_MEM_SLOTS is 32, once KVM_USER_MEM_SLOTS changes QEMU would take a wrong value. Signed-off-by: Nikunj A Dadhania Reviewed-by: Thomas Huth Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/powerpc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 2e51289610e4..6fd2405c7f4a 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -559,6 +559,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) else r = num_online_cpus(); break; + case KVM_CAP_NR_MEMSLOTS: + r = KVM_USER_MEM_SLOTS; + break; case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; break; From 70aa3961a196ac32baf54032b2051bac9a941118 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 15 Oct 2015 11:29:58 +0530 Subject: [PATCH 086/123] KVM: PPC: Book3S HV: Handle H_DOORBELL on the guest exit path Currently a CPU running a guest can receive a H_DOORBELL in the following two cases: 1) When the CPU is napping due to CEDE or there not being a guest vcpu. 2) The CPU is running the guest vcpu. Case 1), the doorbell message is not cleared since we were waking up from nap. Hence when the EE bit gets set on transition from guest to host, the H_DOORBELL interrupt is delivered to the host and the corresponding handler is invoked. However in Case 2), the message gets cleared by the action of taking the H_DOORBELL interrupt. Since the CPU was running a guest, instead of invoking the doorbell handler, the code invokes the second-level interrupt handler to switch the context from the guest to the host. At this point the setting of the EE bit doesn't result in the CPU getting the doorbell interrupt since it has already been delivered once. So, the handler for this doorbell is never invoked! This causes softlockups if the missed DOORBELL was an IPI sent from a sibling subcore on the same CPU. This patch fixes it by explitly invoking the doorbell handler on the exit path if the exit reason is H_DOORBELL similar to the way an EXTERNAL interrupt is handled. Since this will also handle Case 1), we can unconditionally clear the doorbell message in kvmppc_check_wake_reason. Signed-off-by: Gautham R. Shenoy Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index fb8d29650612..b1dab8d1d885 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -150,6 +150,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL beq 11f + cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL + beq 15f /* Invoke the H_DOORBELL handler */ cmpwi cr2, r12, BOOK3S_INTERRUPT_HMI beq cr2, 14f /* HMI check */ @@ -174,6 +176,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) mtspr SPRN_HSRR1, r7 b hmi_exception_after_realmode +15: mtspr SPRN_HSRR0, r8 + mtspr SPRN_HSRR1, r7 + ba 0xe80 + kvmppc_primary_no_guest: /* We handle this much like a ceded vcpu */ /* put the HDEC into the DEC, since HDEC interrupts don't wake us */ @@ -2440,14 +2446,19 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) /* hypervisor doorbell */ 3: li r12, BOOK3S_INTERRUPT_H_DOORBELL + + /* + * Clear the doorbell as we will invoke the handler + * explicitly in the guest exit path. + */ + lis r6, (PPC_DBELL_SERVER << (63-36))@h + PPC_MSGCLR(6) /* see if it's a host IPI */ li r3, 1 lbz r0, HSTATE_HOST_IPI(r13) cmpwi r0, 0 bnelr - /* if not, clear it and return -1 */ - lis r6, (PPC_DBELL_SERVER << (63-36))@h - PPC_MSGCLR(6) + /* if not, return -1 */ li r3, -1 blr From 3217f7c25bca66eed9b07f0b8bfd1937169b0736 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Thu, 27 Aug 2015 16:41:15 +0200 Subject: [PATCH 087/123] KVM: Add kvm_arch_vcpu_{un}blocking callbacks Some times it is useful for architecture implementations of KVM to know when the VCPU thread is about to block or when it comes back from blocking (arm/arm64 needs to know this to properly implement timers, for example). Therefore provide a generic architecture callback function in line with what we do elsewhere for KVM generic-arch interactions. Reviewed-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_host.h | 3 +++ arch/arm64/include/asm/kvm_host.h | 3 +++ arch/mips/include/asm/kvm_host.h | 2 ++ arch/powerpc/include/asm/kvm_host.h | 2 ++ arch/s390/include/asm/kvm_host.h | 2 ++ arch/x86/include/asm/kvm_host.h | 3 +++ include/linux/kvm_host.h | 2 ++ virt/kvm/kvm_main.c | 3 +++ 8 files changed, 20 insertions(+) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index c4072d9f32c7..84da97901f1f 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -234,4 +234,7 @@ static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {} static inline void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) {} static inline void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index ed039688c221..e4f4d65f7d2b 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -255,4 +255,7 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu); void kvm_arm_clear_debug(struct kvm_vcpu *vcpu); void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu); +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 5a1a882e0a75..6ded8d347af9 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -847,5 +847,7 @@ static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) {} static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} #endif /* __MIPS_KVM_HOST_H__ */ diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 827a38d7a9db..c9f122d00920 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -718,5 +718,7 @@ static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslot static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_exit(void) {} +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} #endif /* __POWERPC_KVM_HOST_H__ */ diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 8ced426091e1..72a614c68ed8 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -644,5 +644,7 @@ static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslot static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) {} +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} #endif diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2beee0382088..b28f0f142ecb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1233,4 +1233,7 @@ int x86_set_memory_region(struct kvm *kvm, bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1bef9e21e725..4a86f5f072c0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -625,6 +625,8 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn); void kvm_vcpu_block(struct kvm_vcpu *vcpu); +void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu); +void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); int kvm_vcpu_yield_to(struct kvm_vcpu *target); void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8db1d9361993..7873d6daccb1 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2018,6 +2018,8 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) } while (single_task_running() && ktime_before(cur, stop)); } + kvm_arch_vcpu_blocking(vcpu); + for (;;) { prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); @@ -2031,6 +2033,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) finish_wait(&vcpu->wq, &wait); cur = ktime_get(); + kvm_arch_vcpu_unblocking(vcpu); out: block_ns = ktime_to_ns(cur) - ktime_to_ns(start); From d35268da66870d733ae763fd7f9b06a1f63f395e Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 25 Aug 2015 19:48:21 +0200 Subject: [PATCH 088/123] arm/arm64: KVM: arch_timer: Only schedule soft timer on vcpu_block We currently schedule a soft timer every time we exit the guest if the timer did not expire while running the guest. This is really not necessary, because the only work we do in the timer work function is to kick the vcpu. Kicking the vcpu does two things: (1) If the vpcu thread is on a waitqueue, make it runnable and remove it from the waitqueue. (2) If the vcpu is running on a different physical CPU from the one doing the kick, it sends a reschedule IPI. The second case cannot happen, because the soft timer is only ever scheduled when the vcpu is not running. The first case is only relevant when the vcpu thread is on a waitqueue, which is only the case when the vcpu thread has called kvm_vcpu_block(). Therefore, we only need to make sure a timer is scheduled for kvm_vcpu_block(), which we do by encapsulating all calls to kvm_vcpu_block() with kvm_timer_{un}schedule calls. Additionally, we only schedule a soft timer if the timer is enabled and unmasked, since it is useless otherwise. Note that theoretically userspace can use the SET_ONE_REG interface to change registers that should cause the timer to fire, even if the vcpu is blocked without a scheduled timer, but this case was not supported before this patch and we leave it for future work for now. Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_host.h | 3 - arch/arm/kvm/arm.c | 10 ++++ arch/arm64/include/asm/kvm_host.h | 3 - include/kvm/arm_arch_timer.h | 2 + virt/kvm/arm/arch_timer.c | 94 ++++++++++++++++++++----------- 5 files changed, 72 insertions(+), 40 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 84da97901f1f..c4072d9f32c7 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -234,7 +234,4 @@ static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {} static inline void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) {} static inline void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) {} -static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} -static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} - #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 78b286994577..7ed4d475d83a 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -271,6 +271,16 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) return kvm_timer_should_fire(vcpu); } +void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) +{ + kvm_timer_schedule(vcpu); +} + +void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) +{ + kvm_timer_unschedule(vcpu); +} + int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { /* Force users to call KVM_ARM_VCPU_INIT */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e4f4d65f7d2b..ed039688c221 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -255,7 +255,4 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu); void kvm_arm_clear_debug(struct kvm_vcpu *vcpu); void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu); -static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} -static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} - #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index e1e4d7c38dda..ef14cc1f1f26 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -71,5 +71,7 @@ u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); bool kvm_timer_should_fire(struct kvm_vcpu *vcpu); +void kvm_timer_schedule(struct kvm_vcpu *vcpu); +void kvm_timer_unschedule(struct kvm_vcpu *vcpu); #endif diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index b9d3a32cbc04..32095fbb5d7c 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -111,14 +111,21 @@ static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt) return HRTIMER_NORESTART; } +static bool kvm_timer_irq_can_fire(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + + return !(timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) && + (timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE) && + !kvm_vgic_get_phys_irq_active(timer->map); +} + bool kvm_timer_should_fire(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; cycle_t cval, now; - if ((timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) || - !(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE) || - kvm_vgic_get_phys_irq_active(timer->map)) + if (!kvm_timer_irq_can_fire(vcpu)) return false; cval = timer->cntv_cval; @@ -127,12 +134,57 @@ bool kvm_timer_should_fire(struct kvm_vcpu *vcpu) return cval <= now; } +/* + * Schedule the background timer before calling kvm_vcpu_block, so that this + * thread is removed from its waitqueue and made runnable when there's a timer + * interrupt to handle. + */ +void kvm_timer_schedule(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + u64 ns; + cycle_t cval, now; + + BUG_ON(timer_is_armed(timer)); + + /* + * No need to schedule a background timer if the guest timer has + * already expired, because kvm_vcpu_block will return before putting + * the thread to sleep. + */ + if (kvm_timer_should_fire(vcpu)) + return; + + /* + * If the timer is not capable of raising interrupts (disabled or + * masked), then there's no more work for us to do. + */ + if (!kvm_timer_irq_can_fire(vcpu)) + return; + + /* The timer has not yet expired, schedule a background timer */ + cval = timer->cntv_cval; + now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff; + + ns = cyclecounter_cyc2ns(timecounter->cc, + cval - now, + timecounter->mask, + &timecounter->frac); + timer_arm(timer, ns); +} + +void kvm_timer_unschedule(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + timer_disarm(timer); +} + /** * kvm_timer_flush_hwstate - prepare to move the virt timer to the cpu * @vcpu: The vcpu pointer * - * Disarm any pending soft timers, since the world-switch code will write the - * virtual timer state back to the physical CPU. + * Check if the virtual timer has expired while we were running in the host, + * and inject an interrupt if that was the case. */ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) { @@ -140,17 +192,6 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) bool phys_active; int ret; - /* - * We're about to run this vcpu again, so there is no need to - * keep the background timer running, as we're about to - * populate the CPU timer again. - */ - timer_disarm(timer); - - /* - * If the timer expired while we were not scheduled, now is the time - * to inject it. - */ if (kvm_timer_should_fire(vcpu)) kvm_timer_inject_irq(vcpu); @@ -176,32 +217,17 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) * kvm_timer_sync_hwstate - sync timer state from cpu * @vcpu: The vcpu pointer * - * Check if the virtual timer was armed and either schedule a corresponding - * soft timer or inject directly if already expired. + * Check if the virtual timer has expired while we were running in the guest, + * and inject an interrupt if that was the case. */ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - cycle_t cval, now; - u64 ns; BUG_ON(timer_is_armed(timer)); - if (kvm_timer_should_fire(vcpu)) { - /* - * Timer has already expired while we were not - * looking. Inject the interrupt and carry on. - */ + if (kvm_timer_should_fire(vcpu)) kvm_timer_inject_irq(vcpu); - return; - } - - cval = timer->cntv_cval; - now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff; - - ns = cyclecounter_cyc2ns(timecounter->cc, cval - now, timecounter->mask, - &timecounter->frac); - timer_arm(timer, ns); } int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, From 9103617df202d74e5c65f8af84a9aa727f812a06 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 25 Aug 2015 22:50:57 +0200 Subject: [PATCH 089/123] arm/arm64: KVM: vgic: Factor out level irq processing on guest exit Currently vgic_process_maintenance() processes dealing with a completed level-triggered interrupt directly, but we are soon going to reuse this logic for level-triggered mapped interrupts with the HW bit set, so move this logic into a separate static function. Probably the most scary part of this commit is convincing yourself that the current flow is safe compared to the old one. In the following I try to list the changes and why they are harmless: Move vgic_irq_clear_queued after kvm_notify_acked_irq: Harmless because the only potential effect of clearing the queued flag wrt. kvm_set_irq is that vgic_update_irq_pending does not set the pending bit on the emulated CPU interface or in the pending_on_cpu bitmask if the function is called with level=1. However, the point of kvm_notify_acked_irq is to call kvm_set_irq with level=0, and we set the queued flag again in __kvm_vgic_sync_hwstate later on if the level is stil high. Move vgic_set_lr before kvm_notify_acked_irq: Also, harmless because the LR are cpu-local operations and kvm_notify_acked only affects the dist Move vgic_dist_irq_clear_soft_pend after kvm_notify_acked_irq: Also harmless, because now we check the level state in the clear_soft_pend function and lower the pending bits if the level is low. Reviewed-by: Eric Auger Reviewed-by: Marc Zyngier Signed-off-by: Christoffer Dall --- virt/kvm/arm/vgic.c | 94 +++++++++++++++++++++++++++------------------ 1 file changed, 56 insertions(+), 38 deletions(-) diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 66c66165e712..367a180fb5ac 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -107,6 +107,7 @@ static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr); static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, struct vgic_lr lr_desc); static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu, int virt_irq); +static int compute_pending_for_cpu(struct kvm_vcpu *vcpu); static const struct vgic_ops *vgic_ops; static const struct vgic_params *vgic; @@ -357,6 +358,11 @@ static void vgic_dist_irq_clear_soft_pend(struct kvm_vcpu *vcpu, int irq) struct vgic_dist *dist = &vcpu->kvm->arch.vgic; vgic_bitmap_set_irq_val(&dist->irq_soft_pend, vcpu->vcpu_id, irq, 0); + if (!vgic_dist_irq_get_level(vcpu, irq)) { + vgic_dist_irq_clear_pending(vcpu, irq); + if (!compute_pending_for_cpu(vcpu)) + clear_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu); + } } static int vgic_dist_irq_is_pending(struct kvm_vcpu *vcpu, int irq) @@ -1338,12 +1344,56 @@ epilog: } } +static int process_level_irq(struct kvm_vcpu *vcpu, int lr, struct vgic_lr vlr) +{ + int level_pending = 0; + + vlr.state = 0; + vlr.hwirq = 0; + vgic_set_lr(vcpu, lr, vlr); + + /* + * If the IRQ was EOIed (called from vgic_process_maintenance) or it + * went from active to non-active (called from vgic_sync_hwirq) it was + * also ACKed and we we therefore assume we can clear the soft pending + * state (should it had been set) for this interrupt. + * + * Note: if the IRQ soft pending state was set after the IRQ was + * acked, it actually shouldn't be cleared, but we have no way of + * knowing that unless we start trapping ACKs when the soft-pending + * state is set. + */ + vgic_dist_irq_clear_soft_pend(vcpu, vlr.irq); + + /* + * Tell the gic to start sampling the line of this interrupt again. + */ + vgic_irq_clear_queued(vcpu, vlr.irq); + + /* Any additional pending interrupt? */ + if (vgic_dist_irq_get_level(vcpu, vlr.irq)) { + vgic_cpu_irq_set(vcpu, vlr.irq); + level_pending = 1; + } else { + vgic_dist_irq_clear_pending(vcpu, vlr.irq); + vgic_cpu_irq_clear(vcpu, vlr.irq); + } + + /* + * Despite being EOIed, the LR may not have + * been marked as empty. + */ + vgic_sync_lr_elrsr(vcpu, lr, vlr); + + return level_pending; +} + static bool vgic_process_maintenance(struct kvm_vcpu *vcpu) { u32 status = vgic_get_interrupt_status(vcpu); struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - bool level_pending = false; struct kvm *kvm = vcpu->kvm; + int level_pending = 0; kvm_debug("STATUS = %08x\n", status); @@ -1358,54 +1408,22 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu) for_each_set_bit(lr, eisr_ptr, vgic->nr_lr) { struct vgic_lr vlr = vgic_get_lr(vcpu, lr); + WARN_ON(vgic_irq_is_edge(vcpu, vlr.irq)); - - spin_lock(&dist->lock); - vgic_irq_clear_queued(vcpu, vlr.irq); WARN_ON(vlr.state & LR_STATE_MASK); - vlr.state = 0; - vgic_set_lr(vcpu, lr, vlr); - /* - * If the IRQ was EOIed it was also ACKed and we we - * therefore assume we can clear the soft pending - * state (should it had been set) for this interrupt. - * - * Note: if the IRQ soft pending state was set after - * the IRQ was acked, it actually shouldn't be - * cleared, but we have no way of knowing that unless - * we start trapping ACKs when the soft-pending state - * is set. - */ - vgic_dist_irq_clear_soft_pend(vcpu, vlr.irq); /* * kvm_notify_acked_irq calls kvm_set_irq() - * to reset the IRQ level. Need to release the - * lock for kvm_set_irq to grab it. + * to reset the IRQ level, which grabs the dist->lock + * so we call this before taking the dist->lock. */ - spin_unlock(&dist->lock); - kvm_notify_acked_irq(kvm, 0, vlr.irq - VGIC_NR_PRIVATE_IRQS); + spin_lock(&dist->lock); - - /* Any additional pending interrupt? */ - if (vgic_dist_irq_get_level(vcpu, vlr.irq)) { - vgic_cpu_irq_set(vcpu, vlr.irq); - level_pending = true; - } else { - vgic_dist_irq_clear_pending(vcpu, vlr.irq); - vgic_cpu_irq_clear(vcpu, vlr.irq); - } - + level_pending |= process_level_irq(vcpu, lr, vlr); spin_unlock(&dist->lock); - - /* - * Despite being EOIed, the LR may not have - * been marked as empty. - */ - vgic_sync_lr_elrsr(vcpu, lr, vlr); } } From 8bf9a701e103fd17dbdf0355e43ff5200b4823aa Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Sun, 30 Aug 2015 14:42:16 +0200 Subject: [PATCH 090/123] arm/arm64: KVM: Implement GICD_ICFGR as RO for PPIs The GICD_ICFGR allows the bits for the SGIs and PPIs to be read only. We currently simulate this behavior by writing a hardcoded value to the register for the SGIs and PPIs on every write of these bits to the register (ignoring what the guest actually wrote), and by writing the same value as the reset value to the register. This is a bit counter-intuitive, as the register is RO for these bits, and we can just implement it that way, allowing us to control the value of the bits purely in the reset code. Reviewed-by: Marc Zyngier Signed-off-by: Christoffer Dall --- virt/kvm/arm/vgic.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 367a180fb5ac..f8ca2e9d2f0b 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -690,10 +690,9 @@ bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio, vgic_reg_access(mmio, &val, offset, ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); if (mmio->is_write) { - if (offset < 8) { - *reg = ~0U; /* Force PPIs/SGIs to 1 */ + /* Ignore writes to read-only SGI and PPI bits */ + if (offset < 8) return false; - } val = vgic_cfg_compress(val); if (offset & 4) { From 54723bb37feac347a169359536f3dff122cabca3 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Sun, 30 Aug 2015 14:45:20 +0200 Subject: [PATCH 091/123] arm/arm64: KVM: Use appropriate define in VGIC reset code We currently initialize the SGIs to be enabled in the VGIC code, but we use the VGIC_NR_PPIS define for this purpose, instead of the the more natural VGIC_NR_SGIS. Change this slightly confusing use of the defines. Note: This should have no functional change, as both names are defined to the number 16. Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall --- virt/kvm/arm/vgic.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index f8ca2e9d2f0b..a44ecf9eca4e 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -2128,8 +2128,12 @@ int vgic_init(struct kvm *kvm) break; } - for (i = 0; i < dist->nr_irqs; i++) { - if (i < VGIC_NR_PPIS) + /* + * Enable all SGIs and configure all private IRQs as + * edge-triggered. + */ + for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) { + if (i < VGIC_NR_SGIS) vgic_bitmap_set_irq_val(&dist->irq_enabled, vcpu->vcpu_id, i, 1); if (i < VGIC_NR_PRIVATE_IRQS) From 4cf1bc4c7cbf35983e565ab491142af59f03bb22 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Sun, 30 Aug 2015 14:47:17 +0200 Subject: [PATCH 092/123] arm/arm64: KVM: Add forwarded physical interrupts documentation Forwarded physical interrupts on arm/arm64 is a tricky concept and the way we deal with them is not apparently easy to understand by reading various specs. Therefore, add a proper documentation file explaining the flow and rationale of the behavior of the vgic. Some of this text was contributed by Marc Zyngier and edited by me. Omissions and errors are all mine. Signed-off-by: Christoffer Dall --- .../virtual/kvm/arm/vgic-mapped-irqs.txt | 187 ++++++++++++++++++ 1 file changed, 187 insertions(+) create mode 100644 Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt diff --git a/Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt b/Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt new file mode 100644 index 000000000000..38bca2835278 --- /dev/null +++ b/Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt @@ -0,0 +1,187 @@ +KVM/ARM VGIC Forwarded Physical Interrupts +========================================== + +The KVM/ARM code implements software support for the ARM Generic +Interrupt Controller's (GIC's) hardware support for virtualization by +allowing software to inject virtual interrupts to a VM, which the guest +OS sees as regular interrupts. The code is famously known as the VGIC. + +Some of these virtual interrupts, however, correspond to physical +interrupts from real physical devices. One example could be the +architected timer, which itself supports virtualization, and therefore +lets a guest OS program the hardware device directly to raise an +interrupt at some point in time. When such an interrupt is raised, the +host OS initially handles the interrupt and must somehow signal this +event as a virtual interrupt to the guest. Another example could be a +passthrough device, where the physical interrupts are initially handled +by the host, but the device driver for the device lives in the guest OS +and KVM must therefore somehow inject a virtual interrupt on behalf of +the physical one to the guest OS. + +These virtual interrupts corresponding to a physical interrupt on the +host are called forwarded physical interrupts, but are also sometimes +referred to as 'virtualized physical interrupts' and 'mapped interrupts'. + +Forwarded physical interrupts are handled slightly differently compared +to virtual interrupts generated purely by a software emulated device. + + +The HW bit +---------- +Virtual interrupts are signalled to the guest by programming the List +Registers (LRs) on the GIC before running a VCPU. The LR is programmed +with the virtual IRQ number and the state of the interrupt (Pending, +Active, or Pending+Active). When the guest ACKs and EOIs a virtual +interrupt, the LR state moves from Pending to Active, and finally to +inactive. + +The LRs include an extra bit, called the HW bit. When this bit is set, +KVM must also program an additional field in the LR, the physical IRQ +number, to link the virtual with the physical IRQ. + +When the HW bit is set, KVM must EITHER set the Pending OR the Active +bit, never both at the same time. + +Setting the HW bit causes the hardware to deactivate the physical +interrupt on the physical distributor when the guest deactivates the +corresponding virtual interrupt. + + +Forwarded Physical Interrupts Life Cycle +---------------------------------------- + +The state of forwarded physical interrupts is managed in the following way: + + - The physical interrupt is acked by the host, and becomes active on + the physical distributor (*). + - KVM sets the LR.Pending bit, because this is the only way the GICV + interface is going to present it to the guest. + - LR.Pending will stay set as long as the guest has not acked the interrupt. + - LR.Pending transitions to LR.Active on the guest read of the IAR, as + expected. + - On guest EOI, the *physical distributor* active bit gets cleared, + but the LR.Active is left untouched (set). + - KVM clears the LR on VM exits when the physical distributor + active state has been cleared. + +(*): The host handling is slightly more complicated. For some forwarded +interrupts (shared), KVM directly sets the active state on the physical +distributor before entering the guest, because the interrupt is never actually +handled on the host (see details on the timer as an example below). For other +forwarded interrupts (non-shared) the host does not deactivate the interrupt +when the host ISR completes, but leaves the interrupt active until the guest +deactivates it. Leaving the interrupt active is allowed, because Linux +configures the physical GIC with EOIMode=1, which causes EOI operations to +perform a priority drop allowing the GIC to receive other interrupts of the +default priority. + + +Forwarded Edge and Level Triggered PPIs and SPIs +------------------------------------------------ +Forwarded physical interrupts injected should always be active on the +physical distributor when injected to a guest. + +Level-triggered interrupts will keep the interrupt line to the GIC +asserted, typically until the guest programs the device to deassert the +line. This means that the interrupt will remain pending on the physical +distributor until the guest has reprogrammed the device. Since we +always run the VM with interrupts enabled on the CPU, a pending +interrupt will exit the guest as soon as we switch into the guest, +preventing the guest from ever making progress as the process repeats +over and over. Therefore, the active state on the physical distributor +must be set when entering the guest, preventing the GIC from forwarding +the pending interrupt to the CPU. As soon as the guest deactivates the +interrupt, the physical line is sampled by the hardware again and the host +takes a new interrupt if and only if the physical line is still asserted. + +Edge-triggered interrupts do not exhibit the same problem with +preventing guest execution that level-triggered interrupts do. One +option is to not use HW bit at all, and inject edge-triggered interrupts +from a physical device as pure virtual interrupts. But that would +potentially slow down handling of the interrupt in the guest, because a +physical interrupt occurring in the middle of the guest ISR would +preempt the guest for the host to handle the interrupt. Additionally, +if you configure the system to handle interrupts on a separate physical +core from that running your VCPU, you still have to interrupt the VCPU +to queue the pending state onto the LR, even though the guest won't use +this information until the guest ISR completes. Therefore, the HW +bit should always be set for forwarded edge-triggered interrupts. With +the HW bit set, the virtual interrupt is injected and additional +physical interrupts occurring before the guest deactivates the interrupt +simply mark the state on the physical distributor as Pending+Active. As +soon as the guest deactivates the interrupt, the host takes another +interrupt if and only if there was a physical interrupt between injecting +the forwarded interrupt to the guest and the guest deactivating the +interrupt. + +Consequently, whenever we schedule a VCPU with one or more LRs with the +HW bit set, the interrupt must also be active on the physical +distributor. + + +Forwarded LPIs +-------------- +LPIs, introduced in GICv3, are always edge-triggered and do not have an +active state. They become pending when a device signal them, and as +soon as they are acked by the CPU, they are inactive again. + +It therefore doesn't make sense, and is not supported, to set the HW bit +for physical LPIs that are forwarded to a VM as virtual interrupts, +typically virtual SPIs. + +For LPIs, there is no other choice than to preempt the VCPU thread if +necessary, and queue the pending state onto the LR. + + +Putting It Together: The Architected Timer +------------------------------------------ +The architected timer is a device that signals interrupts with level +triggered semantics. The timer hardware is directly accessed by VCPUs +which program the timer to fire at some point in time. Each VCPU on a +system programs the timer to fire at different times, and therefore the +hardware is multiplexed between multiple VCPUs. This is implemented by +context-switching the timer state along with each VCPU thread. + +However, this means that a scenario like the following is entirely +possible, and in fact, typical: + +1. KVM runs the VCPU +2. The guest programs the time to fire in T+100 +3. The guest is idle and calls WFI (wait-for-interrupts) +4. The hardware traps to the host +5. KVM stores the timer state to memory and disables the hardware timer +6. KVM schedules a soft timer to fire in T+(100 - time since step 2) +7. KVM puts the VCPU thread to sleep (on a waitqueue) +8. The soft timer fires, waking up the VCPU thread +9. KVM reprograms the timer hardware with the VCPU's values +10. KVM marks the timer interrupt as active on the physical distributor +11. KVM injects a forwarded physical interrupt to the guest +12. KVM runs the VCPU + +Notice that KVM injects a forwarded physical interrupt in step 11 without +the corresponding interrupt having actually fired on the host. That is +exactly why we mark the timer interrupt as active in step 10, because +the active state on the physical distributor is part of the state +belonging to the timer hardware, which is context-switched along with +the VCPU thread. + +If the guest does not idle because it is busy, the flow looks like this +instead: + +1. KVM runs the VCPU +2. The guest programs the time to fire in T+100 +4. At T+100 the timer fires and a physical IRQ causes the VM to exit + (note that this initially only traps to EL2 and does not run the host ISR + until KVM has returned to the host). +5. With interrupts still disabled on the CPU coming back from the guest, KVM + stores the virtual timer state to memory and disables the virtual hw timer. +6. KVM looks at the timer state (in memory) and injects a forwarded physical + interrupt because it concludes the timer has expired. +7. KVM marks the timer interrupt as active on the physical distributor +7. KVM enables the timer, enables interrupts, and runs the VCPU + +Notice that again the forwarded physical interrupt is injected to the +guest without having actually been handled on the host. In this case it +is because the physical interrupt is never actually seen by the host because the +timer is disabled upon guest return, and the virtual forwarded interrupt is +injected on the KVM guest entry path. From 4b4b4512da2a844b8da2585609b67fae1ce4f4db Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Sun, 30 Aug 2015 15:01:27 +0200 Subject: [PATCH 093/123] arm/arm64: KVM: Rework the arch timer to use level-triggered semantics The arch timer currently uses edge-triggered semantics in the sense that the line is never sampled by the vgic and lowering the line from the timer to the vgic doesn't have any effect on the pending state of virtual interrupts in the vgic. This means that we do not support a guest with the otherwise valid behavior of (1) disable interrupts (2) enable the timer (3) disable the timer (4) enable interrupts. Such a guest would validly not expect to see any interrupts on real hardware, but will see interrupts on KVM. This patch fixes this shortcoming through the following series of changes. First, we change the flow of the timer/vgic sync/flush operations. Now the timer is always flushed/synced before the vgic, because the vgic samples the state of the timer output. This has the implication that we move the timer operations in to non-preempible sections, but that is fine after the previous commit getting rid of hrtimer schedules on every entry/exit. Second, we change the internal behavior of the timer, letting the timer keep track of its previous output state, and only lower/raise the line to the vgic when the state changes. Note that in theory this could have been accomplished more simply by signalling the vgic every time the state *potentially* changed, but we don't want to be hitting the vgic more often than necessary. Third, we get rid of the use of the map->active field in the vgic and instead simply set the interrupt as active on the physical distributor whenever the input to the GIC is asserted and conversely clear the physical active state when the input to the GIC is deasserted. Fourth, and finally, we now initialize the timer PPIs (and all the other unused PPIs for now), to be level-triggered, and modify the sync code to sample the line state on HW sync and re-inject a new interrupt if it is still pending at that time. Signed-off-by: Christoffer Dall --- arch/arm/kvm/arm.c | 11 ++-- include/kvm/arm_arch_timer.h | 2 +- include/kvm/arm_vgic.h | 3 -- virt/kvm/arm/arch_timer.c | 81 +++++++++++++++++++--------- virt/kvm/arm/vgic.c | 102 +++++++++-------------------------- 5 files changed, 91 insertions(+), 108 deletions(-) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 7ed4d475d83a..59125f48c707 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -561,9 +561,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) if (ret <= 0 || need_new_vmid_gen(vcpu->kvm)) { local_irq_enable(); + kvm_timer_sync_hwstate(vcpu); kvm_vgic_sync_hwstate(vcpu); preempt_enable(); - kvm_timer_sync_hwstate(vcpu); continue; } @@ -608,12 +608,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) kvm_guest_exit(); trace_kvm_exit(kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); + /* + * We must sync the timer state before the vgic state so that + * the vgic can properly sample the updated state of the + * interrupt line. + */ + kvm_timer_sync_hwstate(vcpu); + kvm_vgic_sync_hwstate(vcpu); preempt_enable(); - kvm_timer_sync_hwstate(vcpu); - ret = handle_exit(vcpu, run, ret); } diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index ef14cc1f1f26..1800227af9d6 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -51,7 +51,7 @@ struct arch_timer_cpu { bool armed; /* Timer IRQ */ - const struct kvm_irq_level *irq; + struct kvm_irq_level irq; /* VGIC mapping */ struct irq_phys_map *map; diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 4e14dac282bb..7bc5d0224ab0 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -159,7 +159,6 @@ struct irq_phys_map { u32 virt_irq; u32 phys_irq; u32 irq; - bool active; }; struct irq_phys_map_entry { @@ -354,8 +353,6 @@ int kvm_vgic_vcpu_active_irq(struct kvm_vcpu *vcpu); struct irq_phys_map *kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, int virt_irq, int irq); int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, struct irq_phys_map *map); -bool kvm_vgic_get_phys_irq_active(struct irq_phys_map *map); -void kvm_vgic_set_phys_irq_active(struct irq_phys_map *map, bool active); #define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel)) #define vgic_initialized(k) (!!((k)->arch.vgic.nr_cpus)) diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 32095fbb5d7c..523816d8c402 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -59,18 +59,6 @@ static void timer_disarm(struct arch_timer_cpu *timer) } } -static void kvm_timer_inject_irq(struct kvm_vcpu *vcpu) -{ - int ret; - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - - kvm_vgic_set_phys_irq_active(timer->map, true); - ret = kvm_vgic_inject_mapped_irq(vcpu->kvm, vcpu->vcpu_id, - timer->map, - timer->irq->level); - WARN_ON(ret); -} - static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) { struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id; @@ -116,8 +104,7 @@ static bool kvm_timer_irq_can_fire(struct kvm_vcpu *vcpu) struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; return !(timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) && - (timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE) && - !kvm_vgic_get_phys_irq_active(timer->map); + (timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE); } bool kvm_timer_should_fire(struct kvm_vcpu *vcpu) @@ -134,6 +121,41 @@ bool kvm_timer_should_fire(struct kvm_vcpu *vcpu) return cval <= now; } +static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level) +{ + int ret; + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + + BUG_ON(!vgic_initialized(vcpu->kvm)); + + timer->irq.level = new_level; + ret = kvm_vgic_inject_mapped_irq(vcpu->kvm, vcpu->vcpu_id, + timer->map, + timer->irq.level); + WARN_ON(ret); +} + +/* + * Check if there was a change in the timer state (should we raise or lower + * the line level to the GIC). + */ +static void kvm_timer_update_state(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + + /* + * If userspace modified the timer registers via SET_ONE_REG before + * the vgic was initialized, we mustn't set the timer->irq.level value + * because the guest would never see the interrupt. Instead wait + * until we call this function from kvm_timer_flush_hwstate. + */ + if (!vgic_initialized(vcpu->kvm)) + return; + + if (kvm_timer_should_fire(vcpu) != timer->irq.level) + kvm_timer_update_irq(vcpu, !timer->irq.level); +} + /* * Schedule the background timer before calling kvm_vcpu_block, so that this * thread is removed from its waitqueue and made runnable when there's a timer @@ -192,17 +214,20 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) bool phys_active; int ret; - if (kvm_timer_should_fire(vcpu)) - kvm_timer_inject_irq(vcpu); + kvm_timer_update_state(vcpu); /* - * We keep track of whether the edge-triggered interrupt has been - * signalled to the vgic/guest, and if so, we mask the interrupt and - * the physical distributor to prevent the timer from raising a - * physical interrupt whenever we run a guest, preventing forward - * VCPU progress. + * If we enter the guest with the virtual input level to the VGIC + * asserted, then we have already told the VGIC what we need to, and + * we don't need to exit from the guest until the guest deactivates + * the already injected interrupt, so therefore we should set the + * hardware active state to prevent unnecessary exits from the guest. + * + * Conversely, if the virtual input level is deasserted, then always + * clear the hardware active state to ensure that hardware interrupts + * from the timer triggers a guest exit. */ - if (kvm_vgic_get_phys_irq_active(timer->map)) + if (timer->irq.level) phys_active = true; else phys_active = false; @@ -226,8 +251,11 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) BUG_ON(timer_is_armed(timer)); - if (kvm_timer_should_fire(vcpu)) - kvm_timer_inject_irq(vcpu); + /* + * The guest could have modified the timer registers or the timer + * could have expired, update the timer state. + */ + kvm_timer_update_state(vcpu); } int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, @@ -242,7 +270,7 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, * kvm_vcpu_set_target(). To handle this, we determine * vcpu timer irq number when the vcpu is reset. */ - timer->irq = irq; + timer->irq.irq = irq->irq; /* * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8 @@ -251,6 +279,7 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, * the ARMv7 architecture. */ timer->cntv_ctl = 0; + kvm_timer_update_state(vcpu); /* * Tell the VGIC that the virtual interrupt is tied to a @@ -295,6 +324,8 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) default: return -1; } + + kvm_timer_update_state(vcpu); return 0; } diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index a44ecf9eca4e..3c2909c1bda3 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -537,34 +537,6 @@ bool vgic_handle_set_pending_reg(struct kvm *kvm, return false; } -/* - * If a mapped interrupt's state has been modified by the guest such that it - * is no longer active or pending, without it have gone through the sync path, - * then the map->active field must be cleared so the interrupt can be taken - * again. - */ -static void vgic_handle_clear_mapped_irq(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct list_head *root; - struct irq_phys_map_entry *entry; - struct irq_phys_map *map; - - rcu_read_lock(); - - /* Check for PPIs */ - root = &vgic_cpu->irq_phys_map_list; - list_for_each_entry_rcu(entry, root, entry) { - map = &entry->map; - - if (!vgic_dist_irq_is_pending(vcpu, map->virt_irq) && - !vgic_irq_is_active(vcpu, map->virt_irq)) - map->active = false; - } - - rcu_read_unlock(); -} - bool vgic_handle_clear_pending_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio, phys_addr_t offset, int vcpu_id) @@ -595,7 +567,6 @@ bool vgic_handle_clear_pending_reg(struct kvm *kvm, vcpu_id, offset); vgic_reg_access(mmio, reg, offset, mode); - vgic_handle_clear_mapped_irq(kvm_get_vcpu(kvm, vcpu_id)); vgic_update_state(kvm); return true; } @@ -633,7 +604,6 @@ bool vgic_handle_clear_active_reg(struct kvm *kvm, ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT); if (mmio->is_write) { - vgic_handle_clear_mapped_irq(kvm_get_vcpu(kvm, vcpu_id)); vgic_update_state(kvm); return true; } @@ -1443,29 +1413,37 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu) /* * Save the physical active state, and reset it to inactive. * - * Return 1 if HW interrupt went from active to inactive, and 0 otherwise. + * Return true if there's a pending level triggered interrupt line to queue. */ -static int vgic_sync_hwirq(struct kvm_vcpu *vcpu, struct vgic_lr vlr) +static bool vgic_sync_hwirq(struct kvm_vcpu *vcpu, int lr, struct vgic_lr vlr) { + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; struct irq_phys_map *map; + bool phys_active; + bool level_pending; int ret; if (!(vlr.state & LR_HW)) - return 0; + return false; map = vgic_irq_map_search(vcpu, vlr.irq); BUG_ON(!map); ret = irq_get_irqchip_state(map->irq, IRQCHIP_STATE_ACTIVE, - &map->active); + &phys_active); WARN_ON(ret); - if (map->active) + if (phys_active) return 0; - return 1; + /* Mapped edge-triggered interrupts not yet supported. */ + WARN_ON(vgic_irq_is_edge(vcpu, vlr.irq)); + spin_lock(&dist->lock); + level_pending = process_level_irq(vcpu, lr, vlr); + spin_unlock(&dist->lock); + return level_pending; } /* Sync back the VGIC state after a guest run */ @@ -1490,18 +1468,8 @@ static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) continue; vlr = vgic_get_lr(vcpu, lr); - if (vgic_sync_hwirq(vcpu, vlr)) { - /* - * So this is a HW interrupt that the guest - * EOI-ed. Clean the LR state and allow the - * interrupt to be sampled again. - */ - vlr.state = 0; - vlr.hwirq = 0; - vgic_set_lr(vcpu, lr, vlr); - vgic_irq_clear_queued(vcpu, vlr.irq); - set_bit(lr, elrsr_ptr); - } + if (vgic_sync_hwirq(vcpu, lr, vlr)) + level_pending = true; if (!test_bit(lr, elrsr_ptr)) continue; @@ -1880,30 +1848,6 @@ static void vgic_free_phys_irq_map_rcu(struct rcu_head *rcu) kfree(entry); } -/** - * kvm_vgic_get_phys_irq_active - Return the active state of a mapped IRQ - * - * Return the logical active state of a mapped interrupt. This doesn't - * necessarily reflects the current HW state. - */ -bool kvm_vgic_get_phys_irq_active(struct irq_phys_map *map) -{ - BUG_ON(!map); - return map->active; -} - -/** - * kvm_vgic_set_phys_irq_active - Set the active state of a mapped IRQ - * - * Set the logical active state of a mapped interrupt. This doesn't - * immediately affects the HW state. - */ -void kvm_vgic_set_phys_irq_active(struct irq_phys_map *map, bool active) -{ - BUG_ON(!map); - map->active = active; -} - /** * kvm_vgic_unmap_phys_irq - Remove a virtual to physical IRQ mapping * @vcpu: The VCPU pointer @@ -2129,17 +2073,23 @@ int vgic_init(struct kvm *kvm) } /* - * Enable all SGIs and configure all private IRQs as - * edge-triggered. + * Enable and configure all SGIs to be edge-triggere and + * configure all PPIs as level-triggered. */ for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) { - if (i < VGIC_NR_SGIS) + if (i < VGIC_NR_SGIS) { + /* SGIs */ vgic_bitmap_set_irq_val(&dist->irq_enabled, vcpu->vcpu_id, i, 1); - if (i < VGIC_NR_PRIVATE_IRQS) vgic_bitmap_set_irq_val(&dist->irq_cfg, vcpu->vcpu_id, i, VGIC_CFG_EDGE); + } else if (i < VGIC_NR_PRIVATE_IRQS) { + /* PPIs */ + vgic_bitmap_set_irq_val(&dist->irq_cfg, + vcpu->vcpu_id, i, + VGIC_CFG_LEVEL); + } } vgic_enable(vcpu); From 8fe2f19e6e6015911bdd4cfcdb23a32e146ba570 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Fri, 4 Sep 2015 21:25:12 +0200 Subject: [PATCH 094/123] arm/arm64: KVM: Support edge-triggered forwarded interrupts We mark edge-triggered interrupts with the HW bit set as queued to prevent the VGIC code from injecting LRs with both the Active and Pending bits set at the same time while also setting the HW bit, because the hardware does not support this. However, this means that we must also clear the queued flag when we sync back a LR where the state on the physical distributor went from active to inactive because the guest deactivated the interrupt. At this point we must also check if the interrupt is pending on the distributor, and tell the VGIC to queue it again if it is. Since these actions on the sync path are extremely close to those for level-triggered interrupts, rename process_level_irq to process_queued_irq, allowing it to cater for both cases. Signed-off-by: Christoffer Dall --- virt/kvm/arm/vgic.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 3c2909c1bda3..84abc6f38c1d 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -1313,13 +1313,10 @@ epilog: } } -static int process_level_irq(struct kvm_vcpu *vcpu, int lr, struct vgic_lr vlr) +static int process_queued_irq(struct kvm_vcpu *vcpu, + int lr, struct vgic_lr vlr) { - int level_pending = 0; - - vlr.state = 0; - vlr.hwirq = 0; - vgic_set_lr(vcpu, lr, vlr); + int pending = 0; /* * If the IRQ was EOIed (called from vgic_process_maintenance) or it @@ -1335,26 +1332,35 @@ static int process_level_irq(struct kvm_vcpu *vcpu, int lr, struct vgic_lr vlr) vgic_dist_irq_clear_soft_pend(vcpu, vlr.irq); /* - * Tell the gic to start sampling the line of this interrupt again. + * Tell the gic to start sampling this interrupt again. */ vgic_irq_clear_queued(vcpu, vlr.irq); /* Any additional pending interrupt? */ - if (vgic_dist_irq_get_level(vcpu, vlr.irq)) { - vgic_cpu_irq_set(vcpu, vlr.irq); - level_pending = 1; + if (vgic_irq_is_edge(vcpu, vlr.irq)) { + BUG_ON(!(vlr.state & LR_HW)); + pending = vgic_dist_irq_is_pending(vcpu, vlr.irq); } else { - vgic_dist_irq_clear_pending(vcpu, vlr.irq); - vgic_cpu_irq_clear(vcpu, vlr.irq); + if (vgic_dist_irq_get_level(vcpu, vlr.irq)) { + vgic_cpu_irq_set(vcpu, vlr.irq); + pending = 1; + } else { + vgic_dist_irq_clear_pending(vcpu, vlr.irq); + vgic_cpu_irq_clear(vcpu, vlr.irq); + } } /* * Despite being EOIed, the LR may not have * been marked as empty. */ + vlr.state = 0; + vlr.hwirq = 0; + vgic_set_lr(vcpu, lr, vlr); + vgic_sync_lr_elrsr(vcpu, lr, vlr); - return level_pending; + return pending; } static bool vgic_process_maintenance(struct kvm_vcpu *vcpu) @@ -1391,7 +1397,7 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu) vlr.irq - VGIC_NR_PRIVATE_IRQS); spin_lock(&dist->lock); - level_pending |= process_level_irq(vcpu, lr, vlr); + level_pending |= process_queued_irq(vcpu, lr, vlr); spin_unlock(&dist->lock); } } @@ -1413,7 +1419,7 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu) /* * Save the physical active state, and reset it to inactive. * - * Return true if there's a pending level triggered interrupt line to queue. + * Return true if there's a pending forwarded interrupt to queue. */ static bool vgic_sync_hwirq(struct kvm_vcpu *vcpu, int lr, struct vgic_lr vlr) { @@ -1438,10 +1444,8 @@ static bool vgic_sync_hwirq(struct kvm_vcpu *vcpu, int lr, struct vgic_lr vlr) if (phys_active) return 0; - /* Mapped edge-triggered interrupts not yet supported. */ - WARN_ON(vgic_irq_is_edge(vcpu, vlr.irq)); spin_lock(&dist->lock); - level_pending = process_level_irq(vcpu, lr, vlr); + level_pending = process_queued_irq(vcpu, lr, vlr); spin_unlock(&dist->lock); return level_pending; } From 75755c6d02df9e9b959b3066c12de5494907e3d9 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Fri, 9 Oct 2015 10:08:43 -0500 Subject: [PATCH 095/123] arm/arm64: KVM : Enable vhost device selection under KVM config menu vhost drivers provide guest VMs with better I/O performance and lower CPU utilization. This patch allows users to select vhost devices under KVM configuration menu on ARM. This makes vhost support on arm/arm64 on a par with other architectures (e.g. x86, ppc). Signed-off-by: Wei Huang Signed-off-by: Christoffer Dall --- arch/arm/kvm/Kconfig | 2 ++ arch/arm64/kvm/Kconfig | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 356970f3b25e..95a000515e43 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -46,4 +46,6 @@ config KVM_ARM_HOST ---help--- Provides host support for ARM processors. +source drivers/vhost/Kconfig + endif # VIRTUALIZATION diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 5c7e920e4861..38102f5d3cbb 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -41,4 +41,6 @@ config KVM_ARM_HOST ---help--- Provides host support for ARM processors. +source drivers/vhost/Kconfig + endif # VIRTUALIZATION From 3781528e3045e7c9cc7c4846e0f675b1f353655f Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Fri, 25 Sep 2015 23:41:14 +0200 Subject: [PATCH 096/123] KVM: arm/arm64: rename pause into power_off The kvm_vcpu_arch pause field is renamed into power_off to prepare for the introduction of a new pause field. Also vcpu_pause is renamed into vcpu_sleep since we will sleep until both power_off and pause are false. Signed-off-by: Eric Auger Reviewed-by: Christoffer Dall Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_host.h | 4 ++-- arch/arm/kvm/arm.c | 20 ++++++++++---------- arch/arm/kvm/psci.c | 10 +++++----- arch/arm64/include/asm/kvm_host.h | 4 ++-- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index c4072d9f32c7..107374f986fd 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -126,8 +126,8 @@ struct kvm_vcpu_arch { * here. */ - /* Don't run the guest on this vcpu */ - bool pause; + /* vcpu power-off state */ + bool power_off; /* IO related fields */ struct kvm_decode mmio_decode; diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 59125f48c707..9d2fb4772d8c 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -318,7 +318,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - if (vcpu->arch.pause) + if (vcpu->arch.power_off) mp_state->mp_state = KVM_MP_STATE_STOPPED; else mp_state->mp_state = KVM_MP_STATE_RUNNABLE; @@ -331,10 +331,10 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, { switch (mp_state->mp_state) { case KVM_MP_STATE_RUNNABLE: - vcpu->arch.pause = false; + vcpu->arch.power_off = false; break; case KVM_MP_STATE_STOPPED: - vcpu->arch.pause = true; + vcpu->arch.power_off = true; break; default: return -EINVAL; @@ -478,11 +478,11 @@ bool kvm_arch_intc_initialized(struct kvm *kvm) return vgic_initialized(kvm); } -static void vcpu_pause(struct kvm_vcpu *vcpu) +static void vcpu_sleep(struct kvm_vcpu *vcpu) { wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu); - wait_event_interruptible(*wq, !vcpu->arch.pause); + wait_event_interruptible(*wq, !vcpu->arch.power_off); } static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu) @@ -532,8 +532,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) update_vttbr(vcpu->kvm); - if (vcpu->arch.pause) - vcpu_pause(vcpu); + if (vcpu->arch.power_off) + vcpu_sleep(vcpu); /* * Disarming the background timer must be done in a @@ -780,12 +780,12 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, vcpu_reset_hcr(vcpu); /* - * Handle the "start in power-off" case by marking the VCPU as paused. + * Handle the "start in power-off" case. */ if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features)) - vcpu->arch.pause = true; + vcpu->arch.power_off = true; else - vcpu->arch.pause = false; + vcpu->arch.power_off = false; return 0; } diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c index ad6f6424f1d1..0b556968a6da 100644 --- a/arch/arm/kvm/psci.c +++ b/arch/arm/kvm/psci.c @@ -63,7 +63,7 @@ static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu) static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu) { - vcpu->arch.pause = true; + vcpu->arch.power_off = true; } static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) @@ -87,7 +87,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) */ if (!vcpu) return PSCI_RET_INVALID_PARAMS; - if (!vcpu->arch.pause) { + if (!vcpu->arch.power_off) { if (kvm_psci_version(source_vcpu) != KVM_ARM_PSCI_0_1) return PSCI_RET_ALREADY_ON; else @@ -115,7 +115,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) * the general puspose registers are undefined upon CPU_ON. */ *vcpu_reg(vcpu, 0) = context_id; - vcpu->arch.pause = false; + vcpu->arch.power_off = false; smp_mb(); /* Make sure the above is visible */ wq = kvm_arch_vcpu_wq(vcpu); @@ -153,7 +153,7 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu) mpidr = kvm_vcpu_get_mpidr_aff(tmp); if ((mpidr & target_affinity_mask) == target_affinity) { matching_cpus++; - if (!tmp->arch.pause) + if (!tmp->arch.power_off) return PSCI_0_2_AFFINITY_LEVEL_ON; } } @@ -179,7 +179,7 @@ static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type) * re-initialized. */ kvm_for_each_vcpu(i, tmp, vcpu->kvm) { - tmp->arch.pause = true; + tmp->arch.power_off = true; kvm_vcpu_kick(tmp); } diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index ed039688c221..4b4157b3b983 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -149,8 +149,8 @@ struct kvm_vcpu_arch { u32 mdscr_el1; } guest_debug_preserved; - /* Don't run the guest */ - bool pause; + /* vcpu power-off state */ + bool power_off; /* IO related fields */ struct kvm_decode mmio_decode; From 4f5f1dc03606e18986b874f899cf86b0a3e4f2a5 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Fri, 25 Sep 2015 23:41:15 +0200 Subject: [PATCH 097/123] KVM: arm/arm64: check power_off in kvm_arch_vcpu_runnable kvm_arch_vcpu_runnable now also checks whether the power_off flag is set. Signed-off-by: Eric Auger Reviewed-by: Christoffer Dall Signed-off-by: Christoffer Dall --- arch/arm/kvm/arm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 9d2fb4772d8c..d04deeb1d4e0 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -352,7 +352,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, */ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) { - return !!v->arch.irq_lines || kvm_vgic_vcpu_pending_irq(v); + return ((!!v->arch.irq_lines || kvm_vgic_vcpu_pending_irq(v)) + && !v->arch.power_off); } /* Just ensure a guest exit from a particular CPU */ From 101d3da09c953b08c814cd9a0b8605623d640ba0 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Fri, 25 Sep 2015 23:41:16 +0200 Subject: [PATCH 098/123] KVM: arm/arm64: check power_off in critical section before VCPU run In case a vcpu off PSCI call is called just after we executed the vcpu_sleep check, we can enter the guest although power_off is set. Let's check the power_off state in the critical section, just before entering the guest. Signed-off-by: Eric Auger Reported-by: Christoffer Dall Reviewed-by: Christoffer Dall Signed-off-by: Christoffer Dall --- arch/arm/kvm/arm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index d04deeb1d4e0..3b3384c37e24 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -560,7 +560,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) run->exit_reason = KVM_EXIT_INTR; } - if (ret <= 0 || need_new_vmid_gen(vcpu->kvm)) { + if (ret <= 0 || need_new_vmid_gen(vcpu->kvm) || + vcpu->arch.power_off) { local_irq_enable(); kvm_timer_sync_hwstate(vcpu); kvm_vgic_sync_hwstate(vcpu); From 3b92830ad41b2fe377e0765322e8aefd0ab8388d Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Fri, 25 Sep 2015 23:41:17 +0200 Subject: [PATCH 099/123] KVM: arm/arm64: implement kvm_arm_[halt,resume]_guest We introduce kvm_arm_halt_guest and resume functions. They will be used for IRQ forward state change. Halt is synchronous and prevents the guest from being re-entered. We use the same mechanism put in place for PSCI former pause, now renamed power_off. A new flag is introduced in arch vcpu state, pause, only meant to be used by those functions. Signed-off-by: Eric Auger Reviewed-by: Christoffer Dall Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_host.h | 3 +++ arch/arm/kvm/arm.c | 35 +++++++++++++++++++++++++++---- arch/arm64/include/asm/kvm_host.h | 3 +++ 3 files changed, 37 insertions(+), 4 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 107374f986fd..6692982c9b57 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -129,6 +129,9 @@ struct kvm_vcpu_arch { /* vcpu power-off state */ bool power_off; + /* Don't run the guest (internal implementation need) */ + bool pause; + /* IO related fields */ struct kvm_decode mmio_decode; diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 3b3384c37e24..ed1574724caf 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -353,7 +353,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) { return ((!!v->arch.irq_lines || kvm_vgic_vcpu_pending_irq(v)) - && !v->arch.power_off); + && !v->arch.power_off && !v->arch.pause); } /* Just ensure a guest exit from a particular CPU */ @@ -479,11 +479,38 @@ bool kvm_arch_intc_initialized(struct kvm *kvm) return vgic_initialized(kvm); } +static void kvm_arm_halt_guest(struct kvm *kvm) __maybe_unused; +static void kvm_arm_resume_guest(struct kvm *kvm) __maybe_unused; + +static void kvm_arm_halt_guest(struct kvm *kvm) +{ + int i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) + vcpu->arch.pause = true; + force_vm_exit(cpu_all_mask); +} + +static void kvm_arm_resume_guest(struct kvm *kvm) +{ + int i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) { + wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu); + + vcpu->arch.pause = false; + wake_up_interruptible(wq); + } +} + static void vcpu_sleep(struct kvm_vcpu *vcpu) { wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu); - wait_event_interruptible(*wq, !vcpu->arch.power_off); + wait_event_interruptible(*wq, ((!vcpu->arch.power_off) && + (!vcpu->arch.pause))); } static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu) @@ -533,7 +560,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) update_vttbr(vcpu->kvm); - if (vcpu->arch.power_off) + if (vcpu->arch.power_off || vcpu->arch.pause) vcpu_sleep(vcpu); /* @@ -561,7 +588,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) } if (ret <= 0 || need_new_vmid_gen(vcpu->kvm) || - vcpu->arch.power_off) { + vcpu->arch.power_off || vcpu->arch.pause) { local_irq_enable(); kvm_timer_sync_hwstate(vcpu); kvm_vgic_sync_hwstate(vcpu); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 4b4157b3b983..a35ce7266aac 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -152,6 +152,9 @@ struct kvm_vcpu_arch { /* vcpu power-off state */ bool power_off; + /* Don't run the guest (internal implementation need) */ + bool pause; + /* IO related fields */ struct kvm_decode mmio_decode; From 952105ab524e3fcc719349da5645ec71d9733547 Mon Sep 17 00:00:00 2001 From: Pavel Fedin Date: Tue, 13 Oct 2015 10:01:25 +0300 Subject: [PATCH 100/123] KVM: arm/arm64: Fix vGIC documentation Correct some old mistakes in the API documentation: 1. VCPU is identified by index (using kvm_get_vcpu() function), but "cpu id" can be mistaken for affinity ID. 2. Some error codes are wrong. [ Slightly tweaked some grammer and did some s/CPU index/vcpu_index/ in the descriptions. -Christoffer ] Signed-off-by: Pavel Fedin Signed-off-by: Christoffer Dall --- Documentation/virtual/kvm/devices/arm-vgic.txt | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/Documentation/virtual/kvm/devices/arm-vgic.txt b/Documentation/virtual/kvm/devices/arm-vgic.txt index 3fb905429e8a..59541d49e15c 100644 --- a/Documentation/virtual/kvm/devices/arm-vgic.txt +++ b/Documentation/virtual/kvm/devices/arm-vgic.txt @@ -44,28 +44,29 @@ Groups: Attributes: The attr field of kvm_device_attr encodes two values: bits: | 63 .... 40 | 39 .. 32 | 31 .... 0 | - values: | reserved | cpu id | offset | + values: | reserved | vcpu_index | offset | All distributor regs are (rw, 32-bit) The offset is relative to the "Distributor base address" as defined in the GICv2 specs. Getting or setting such a register has the same effect as - reading or writing the register on the actual hardware from the cpu - specified with cpu id field. Note that most distributor fields are not - banked, but return the same value regardless of the cpu id used to access - the register. + reading or writing the register on the actual hardware from the cpu whose + index is specified with the vcpu_index field. Note that most distributor + fields are not banked, but return the same value regardless of the + vcpu_index used to access the register. Limitations: - Priorities are not implemented, and registers are RAZ/WI - Currently only implemented for KVM_DEV_TYPE_ARM_VGIC_V2. Errors: - -ENODEV: Getting or setting this register is not yet supported + -ENXIO: Getting or setting this register is not yet supported -EBUSY: One or more VCPUs are running + -EINVAL: Invalid vcpu_index supplied KVM_DEV_ARM_VGIC_GRP_CPU_REGS Attributes: The attr field of kvm_device_attr encodes two values: bits: | 63 .... 40 | 39 .. 32 | 31 .... 0 | - values: | reserved | cpu id | offset | + values: | reserved | vcpu_index | offset | All CPU interface regs are (rw, 32-bit) @@ -91,8 +92,9 @@ Groups: - Priorities are not implemented, and registers are RAZ/WI - Currently only implemented for KVM_DEV_TYPE_ARM_VGIC_V2. Errors: - -ENODEV: Getting or setting this register is not yet supported + -ENXIO: Getting or setting this register is not yet supported -EBUSY: One or more VCPUs are running + -EINVAL: Invalid vcpu_index supplied KVM_DEV_ARM_VGIC_GRP_NR_IRQS Attributes: From b5905dc12ed4254f7e0aac62bab48f002181f639 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Sun, 30 Aug 2015 15:55:22 +0200 Subject: [PATCH 101/123] arm/arm64: KVM: Improve kvm_exit tracepoint The ARM architecture only saves the exit class to the HSR (ESR_EL2 for arm64) on synchronous exceptions, not on asynchronous exceptions like an IRQ. However, we only report the exception class on kvm_exit, which is confusing because an IRQ looks like it exited at some PC with the same reason as the previous exit. Add a lookup table for the exception index and prepend the kvm_exit tracepoint text with the exception type to clarify this situation. Also resolve the exception class (EC) to a human-friendly text version so the trace output becomes immediately usable for debugging this code. Cc: Wei Huang Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_arm.h | 20 ++++++++++++++++++++ arch/arm/kvm/arm.c | 2 +- arch/arm/kvm/trace.h | 10 +++++++--- arch/arm64/include/asm/kvm_arm.h | 16 ++++++++++++++++ 4 files changed, 44 insertions(+), 4 deletions(-) diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h index d995821f1698..dc641ddf0784 100644 --- a/arch/arm/include/asm/kvm_arm.h +++ b/arch/arm/include/asm/kvm_arm.h @@ -218,4 +218,24 @@ #define HSR_DABT_CM (1U << 8) #define HSR_DABT_EA (1U << 9) +#define kvm_arm_exception_type \ + {0, "RESET" }, \ + {1, "UNDEFINED" }, \ + {2, "SOFTWARE" }, \ + {3, "PREF_ABORT" }, \ + {4, "DATA_ABORT" }, \ + {5, "IRQ" }, \ + {6, "FIQ" }, \ + {7, "HVC" } + +#define HSRECN(x) { HSR_EC_##x, #x } + +#define kvm_arm_exception_class \ + HSRECN(UNKNOWN), HSRECN(WFI), HSRECN(CP15_32), HSRECN(CP15_64), \ + HSRECN(CP14_MR), HSRECN(CP14_LS), HSRECN(CP_0_13), HSRECN(CP10_ID), \ + HSRECN(JAZELLE), HSRECN(BXJ), HSRECN(CP14_64), HSRECN(SVC_HYP), \ + HSRECN(HVC), HSRECN(SMC), HSRECN(IABT), HSRECN(IABT_HYP), \ + HSRECN(DABT), HSRECN(DABT_HYP) + + #endif /* __ARM_KVM_ARM_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index ed1574724caf..eab83b2435b8 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -635,7 +635,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) * guest time. */ kvm_guest_exit(); - trace_kvm_exit(kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); + trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); /* * We must sync the timer state before the vgic state so that diff --git a/arch/arm/kvm/trace.h b/arch/arm/kvm/trace.h index 0ec35392d208..c25a88598eb0 100644 --- a/arch/arm/kvm/trace.h +++ b/arch/arm/kvm/trace.h @@ -25,21 +25,25 @@ TRACE_EVENT(kvm_entry, ); TRACE_EVENT(kvm_exit, - TP_PROTO(unsigned int exit_reason, unsigned long vcpu_pc), - TP_ARGS(exit_reason, vcpu_pc), + TP_PROTO(int idx, unsigned int exit_reason, unsigned long vcpu_pc), + TP_ARGS(idx, exit_reason, vcpu_pc), TP_STRUCT__entry( + __field( int, idx ) __field( unsigned int, exit_reason ) __field( unsigned long, vcpu_pc ) ), TP_fast_assign( + __entry->idx = idx; __entry->exit_reason = exit_reason; __entry->vcpu_pc = vcpu_pc; ), - TP_printk("HSR_EC: 0x%04x, PC: 0x%08lx", + TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%08lx", + __print_symbolic(__entry->idx, kvm_arm_exception_type), __entry->exit_reason, + __print_symbolic(__entry->exit_reason, kvm_arm_exception_class), __entry->vcpu_pc) ); diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 9694f2654593..5e6857b6bdc4 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -200,4 +200,20 @@ /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */ #define HPFAR_MASK (~UL(0xf)) +#define kvm_arm_exception_type \ + {0, "IRQ" }, \ + {1, "TRAP" } + +#define ECN(x) { ESR_ELx_EC_##x, #x } + +#define kvm_arm_exception_class \ + ECN(UNKNOWN), ECN(WFx), ECN(CP15_32), ECN(CP15_64), ECN(CP14_MR), \ + ECN(CP14_LS), ECN(FP_ASIMD), ECN(CP10_ID), ECN(CP14_64), ECN(SVC64), \ + ECN(HVC64), ECN(SMC64), ECN(SYS64), ECN(IMP_DEF), ECN(IABT_LOW), \ + ECN(IABT_CUR), ECN(PC_ALIGN), ECN(DABT_LOW), ECN(DABT_CUR), \ + ECN(SP_ALIGN), ECN(FP_EXC32), ECN(FP_EXC64), ECN(SERROR), \ + ECN(BREAKPT_LOW), ECN(BREAKPT_CUR), ECN(SOFTSTP_LOW), \ + ECN(SOFTSTP_CUR), ECN(WATCHPT_LOW), ECN(WATCHPT_CUR), \ + ECN(BKPT32), ECN(VECTOR32), ECN(BRK64) + #endif /* __ARM64_KVM_ARM_H__ */ From e21f09108754dfdfbb30e547f4edbd3b6884eedb Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Sun, 30 Aug 2015 13:57:20 +0200 Subject: [PATCH 102/123] arm/arm64: KVM: Add tracepoints for vgic and timer The VGIC and timer code for KVM arm/arm64 doesn't have any tracepoints or tracepoint infrastructure defined. Rewriting some of the timer code handling showed me how much we need this, so let's add these simple trace points once and for all and we can easily expand with additional trace points in these files as we go along. Cc: Wei Huang Signed-off-by: Christoffer Dall --- virt/kvm/arm/arch_timer.c | 4 +++ virt/kvm/arm/trace.h | 63 +++++++++++++++++++++++++++++++++++++++ virt/kvm/arm/vgic.c | 5 ++++ 3 files changed, 72 insertions(+) create mode 100644 virt/kvm/arm/trace.h diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 523816d8c402..21a0ab2d8919 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -28,6 +28,8 @@ #include #include +#include "trace.h" + static struct timecounter *timecounter; static struct workqueue_struct *wqueue; static unsigned int host_vtimer_irq; @@ -129,6 +131,8 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level) BUG_ON(!vgic_initialized(vcpu->kvm)); timer->irq.level = new_level; + trace_kvm_timer_update_irq(vcpu->vcpu_id, timer->map->virt_irq, + timer->irq.level); ret = kvm_vgic_inject_mapped_irq(vcpu->kvm, vcpu->vcpu_id, timer->map, timer->irq.level); diff --git a/virt/kvm/arm/trace.h b/virt/kvm/arm/trace.h new file mode 100644 index 000000000000..37d8b98867d5 --- /dev/null +++ b/virt/kvm/arm/trace.h @@ -0,0 +1,63 @@ +#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVM_H + +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm + +/* + * Tracepoints for vgic + */ +TRACE_EVENT(vgic_update_irq_pending, + TP_PROTO(unsigned long vcpu_id, __u32 irq, bool level), + TP_ARGS(vcpu_id, irq, level), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_id ) + __field( __u32, irq ) + __field( bool, level ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->irq = irq; + __entry->level = level; + ), + + TP_printk("VCPU: %ld, IRQ %d, level: %d", + __entry->vcpu_id, __entry->irq, __entry->level) +); + +/* + * Tracepoints for arch_timer + */ +TRACE_EVENT(kvm_timer_update_irq, + TP_PROTO(unsigned long vcpu_id, __u32 irq, int level), + TP_ARGS(vcpu_id, irq, level), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_id ) + __field( __u32, irq ) + __field( int, level ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->irq = irq; + __entry->level = level; + ), + + TP_printk("VCPU: %ld, IRQ %d, level %d", + __entry->vcpu_id, __entry->irq, __entry->level) +); + +#endif /* _TRACE_KVM_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../../virt/kvm/arm +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +/* This part must be outside protection */ +#include diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 84abc6f38c1d..d4669eb29b77 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -34,6 +34,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include "trace.h" + /* * How the whole thing works (courtesy of Christoffer Dall): * @@ -1574,6 +1577,8 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid, int enabled; bool ret = true, can_inject = true; + trace_vgic_update_irq_pending(cpuid, irq_num, level); + if (irq_num >= min(kvm->arch.vgic.nr_irqs, 1020)) return -EINVAL; From db85c55f1b01b155332058753854d897e965d67f Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 12 Oct 2015 15:04:50 +0100 Subject: [PATCH 103/123] arm64: kvm: restore EL1N SP for panic If we panic in hyp mode, we inject a call to panic() into the EL1N host kernel. If a guest context is active, we first attempt to restore the minimal amount of state necessary to execute the host kernel with restore_sysregs. However, the SP is restored as part of restore_common_regs, and so we may return to the host's panic() function with the SP of the guest. Any calculations based on the SP will be bogus, and any attempt to access the stack will result in recursive data aborts. When running Linux as a guest, the guest's EL1N SP is like to be some valid kernel address. In this case, the host kernel may use that region as a stack for panic(), corrupting it in the process. Avoid the problem by restoring the host SP prior to returning to the host. To prevent misleading backtraces in the host, the FP is zeroed at the same time. We don't need any of the other "common" registers in order to panic successfully. Signed-off-by: Mark Rutland Acked-by: Marc Zyngier Cc: Christoffer Dall Cc: Signed-off-by: Christoffer Dall --- arch/arm64/kvm/hyp.S | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S index e5836138ec42..1599701ef044 100644 --- a/arch/arm64/kvm/hyp.S +++ b/arch/arm64/kvm/hyp.S @@ -880,6 +880,14 @@ __kvm_hyp_panic: bl __restore_sysregs + /* + * Make sure we have a valid host stack, and don't leave junk in the + * frame pointer that will give us a misleading host stack unwinding. + */ + ldr x22, [x2, #CPU_GP_REG_OFFSET(CPU_SP_EL1)] + msr sp_el1, x22 + mov x29, xzr + 1: adr x0, __hyp_panic_str adr x1, 2f ldp x2, x3, [x1] From 5fdf876d30ce7c9b63045b3db8374912ab9b262c Mon Sep 17 00:00:00 2001 From: Michal Marek Date: Thu, 15 Oct 2015 22:16:28 +0200 Subject: [PATCH 104/123] KVM: arm: Do not indent the arguments of DECLARE_BITMAP Besides being a coding style issue, it confuses make tags: ctags: Warning: include/kvm/arm_vgic.h:307: null expansion of name pattern "\1" ctags: Warning: include/kvm/arm_vgic.h:308: null expansion of name pattern "\1" ctags: Warning: include/kvm/arm_vgic.h:309: null expansion of name pattern "\1" ctags: Warning: include/kvm/arm_vgic.h:317: null expansion of name pattern "\1" Cc: kvmarm@lists.cs.columbia.edu Signed-off-by: Michal Marek Signed-off-by: Christoffer Dall --- include/kvm/arm_vgic.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 7bc5d0224ab0..8065801a1847 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -299,9 +299,9 @@ struct vgic_cpu { u8 *vgic_irq_lr_map; /* Pending/active/both interrupts on this VCPU */ - DECLARE_BITMAP( pending_percpu, VGIC_NR_PRIVATE_IRQS); - DECLARE_BITMAP( active_percpu, VGIC_NR_PRIVATE_IRQS); - DECLARE_BITMAP( pend_act_percpu, VGIC_NR_PRIVATE_IRQS); + DECLARE_BITMAP(pending_percpu, VGIC_NR_PRIVATE_IRQS); + DECLARE_BITMAP(active_percpu, VGIC_NR_PRIVATE_IRQS); + DECLARE_BITMAP(pend_act_percpu, VGIC_NR_PRIVATE_IRQS); /* Pending/active/both shared interrupts, dynamically sized */ unsigned long *pending_shared; @@ -309,7 +309,7 @@ struct vgic_cpu { unsigned long *pend_act_shared; /* Bitmap of used/free list registers */ - DECLARE_BITMAP( lr_used, VGIC_V2_MAX_LRS); + DECLARE_BITMAP(lr_used, VGIC_V2_MAX_LRS); /* Number of list registers on this CPU */ int nr_lr; From c5c2c393468576bad6d10b2b5fefff8cd25df3f4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 26 Oct 2015 08:41:29 +0100 Subject: [PATCH 105/123] KVM: s390: SCA must not cross page boundaries We seemed to have missed a few corner cases in commit f6c137ff00a4 ("KVM: s390: randomize sca address"). The SCA has a maximum size of 2112 bytes. By setting the sca_offset to some unlucky numbers, we exceed the page. 0x7c0 (1984) -> Fits exactly 0x7d0 (2000) -> 16 bytes out 0x7e0 (2016) -> 32 bytes out 0x7f0 (2032) -> 48 bytes out One VCPU entry is 32 bytes long. For the last two cases, we actually write data to the other page. 1. The address of the VCPU. 2. Injection/delivery/clearing of SIGP externall calls via SIGP IF. Especially the 2. happens regularly. So this could produce two problems: 1. The guest losing/getting external calls. 2. Random memory overwrites in the host. So this problem happens on every 127 + 128 created VM with 64 VCPUs. Cc: stable@vger.kernel.org # v3.15+ Acked-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 618c85411a51..35596177fad2 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1098,7 +1098,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (!kvm->arch.sca) goto out_err; spin_lock(&kvm_lock); - sca_offset = (sca_offset + 16) & 0x7f0; + sca_offset += 16; + if (sca_offset + sizeof(struct sca_block) > PAGE_SIZE) + sca_offset = 0; kvm->arch.sca = (struct sca_block *) ((char *) kvm->arch.sca + sca_offset); spin_unlock(&kvm_lock); From 58c383c62e1a4379cee531b56e4293211f2d5ded Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Mon, 12 Oct 2015 13:27:29 +0200 Subject: [PATCH 106/123] KVM: s390: drop useless newline in debugging data the s390 debug feature does not need newlines. In fact it will result in empty lines. Get rid of 4 leftovers. Signed-off-by: Christian Borntraeger Acked-by: Cornelia Huck --- arch/s390/kvm/kvm-s390.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 35596177fad2..07a6aa896c12 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -514,7 +514,7 @@ static int kvm_s390_set_tod_high(struct kvm *kvm, struct kvm_device_attr *attr) if (gtod_high != 0) return -EINVAL; - VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x\n", gtod_high); + VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x", gtod_high); return 0; } @@ -527,7 +527,7 @@ static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr) return -EFAULT; kvm_s390_set_tod_clock(kvm, gtod); - VM_EVENT(kvm, 3, "SET: TOD base: 0x%llx\n", gtod); + VM_EVENT(kvm, 3, "SET: TOD base: 0x%llx", gtod); return 0; } @@ -559,7 +559,7 @@ static int kvm_s390_get_tod_high(struct kvm *kvm, struct kvm_device_attr *attr) if (copy_to_user((void __user *)attr->addr, >od_high, sizeof(gtod_high))) return -EFAULT; - VM_EVENT(kvm, 3, "QUERY: TOD extension: 0x%x\n", gtod_high); + VM_EVENT(kvm, 3, "QUERY: TOD extension: 0x%x", gtod_high); return 0; } @@ -571,7 +571,7 @@ static int kvm_s390_get_tod_low(struct kvm *kvm, struct kvm_device_attr *attr) gtod = kvm_s390_get_tod_clock_fast(kvm); if (copy_to_user((void __user *)attr->addr, >od, sizeof(gtod))) return -EFAULT; - VM_EVENT(kvm, 3, "QUERY: TOD base: 0x%llx\n", gtod); + VM_EVENT(kvm, 3, "QUERY: TOD base: 0x%llx", gtod); return 0; } From 46b708ea875f14f5496109df053624199f3aae87 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 23 Jul 2015 15:24:03 +0200 Subject: [PATCH 107/123] KVM: s390: use simple switch statement as multiplexer We currently do some magic shifting (by exploiting that exit codes are always a multiple of 4) and a table lookup to jump into the exit handlers. This causes some calculations and checks, just to do an potentially expensive function call. Changing that to a switch statement gives the compiler the chance to inline and dynamically decide between jump tables or inline compare and branches. In addition it makes the code more readable. bloat-o-meter gives me a small reduction in code size: add/remove: 0/7 grow/shrink: 1/1 up/down: 986/-1334 (-348) function old new delta kvm_handle_sie_intercept 72 1058 +986 handle_prog 704 696 -8 handle_noop 54 - -54 handle_partial_execution 60 - -60 intercept_funcs 120 - -120 handle_instruction 198 - -198 handle_validity 210 - -210 handle_stop 316 - -316 handle_external_interrupt 368 - -368 Right now my gcc does conditional branches instead of jump tables. The inlining seems to give us enough cycles as some micro-benchmarking shows minimal improvements, but still in noise. Signed-off-by: Christian Borntraeger Reviewed-by: Cornelia Huck --- arch/s390/kvm/intercept.c | 42 +++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 7365e8a46032..b4a5aa110cec 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -336,28 +336,28 @@ static int handle_partial_execution(struct kvm_vcpu *vcpu) return -EOPNOTSUPP; } -static const intercept_handler_t intercept_funcs[] = { - [0x00 >> 2] = handle_noop, - [0x04 >> 2] = handle_instruction, - [0x08 >> 2] = handle_prog, - [0x10 >> 2] = handle_noop, - [0x14 >> 2] = handle_external_interrupt, - [0x18 >> 2] = handle_noop, - [0x1C >> 2] = kvm_s390_handle_wait, - [0x20 >> 2] = handle_validity, - [0x28 >> 2] = handle_stop, - [0x38 >> 2] = handle_partial_execution, -}; - int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) { - intercept_handler_t func; - u8 code = vcpu->arch.sie_block->icptcode; - - if (code & 3 || (code >> 2) >= ARRAY_SIZE(intercept_funcs)) + switch (vcpu->arch.sie_block->icptcode) { + case 0x00: + case 0x10: + case 0x18: + return handle_noop(vcpu); + case 0x04: + return handle_instruction(vcpu); + case 0x08: + return handle_prog(vcpu); + case 0x14: + return handle_external_interrupt(vcpu); + case 0x1c: + return kvm_s390_handle_wait(vcpu); + case 0x20: + return handle_validity(vcpu); + case 0x28: + return handle_stop(vcpu); + case 0x38: + return handle_partial_execution(vcpu); + default: return -EOPNOTSUPP; - func = intercept_funcs[code >> 2]; - if (func) - return func(vcpu); - return -EOPNOTSUPP; + } } From c4cd4c168b81dad53e659d18cdae653bc0ec2384 Mon Sep 17 00:00:00 2001 From: Pavel Fedin Date: Tue, 27 Oct 2015 11:37:29 +0300 Subject: [PATCH 108/123] KVM: arm/arm64: Optimize away redundant LR tracking Currently we use vgic_irq_lr_map in order to track which LRs hold which IRQs, and lr_used bitmap in order to track which LRs are used or free. vgic_irq_lr_map is actually used only for piggy-back optimization, and can be easily replaced by iteration over lr_used. This is good because in future, when LPI support is introduced, number of IRQs will grow up to at least 16384, while numbers from 1024 to 8192 are never going to be used. This would be a huge memory waste. In its turn, lr_used is also completely redundant since ae705930fca6322600690df9dc1c7d0516145a93 ("arm/arm64: KVM: Keep elrsr/aisr in sync with software model"), because together with lr_used we also update elrsr. This allows to easily replace lr_used with elrsr, inverting all conditions (because in elrsr '1' means 'free'). Signed-off-by: Pavel Fedin Signed-off-by: Christoffer Dall --- include/kvm/arm_vgic.h | 6 ----- virt/kvm/arm/vgic-v2.c | 1 + virt/kvm/arm/vgic-v3.c | 1 + virt/kvm/arm/vgic.c | 53 ++++++++++++------------------------------ 4 files changed, 17 insertions(+), 44 deletions(-) diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 8065801a1847..3936bf802e1d 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -295,9 +295,6 @@ struct vgic_v3_cpu_if { }; struct vgic_cpu { - /* per IRQ to LR mapping */ - u8 *vgic_irq_lr_map; - /* Pending/active/both interrupts on this VCPU */ DECLARE_BITMAP(pending_percpu, VGIC_NR_PRIVATE_IRQS); DECLARE_BITMAP(active_percpu, VGIC_NR_PRIVATE_IRQS); @@ -308,9 +305,6 @@ struct vgic_cpu { unsigned long *active_shared; unsigned long *pend_act_shared; - /* Bitmap of used/free list registers */ - DECLARE_BITMAP(lr_used, VGIC_V2_MAX_LRS); - /* Number of list registers on this CPU */ int nr_lr; diff --git a/virt/kvm/arm/vgic-v2.c b/virt/kvm/arm/vgic-v2.c index 8d7b04db8471..c0f5d7fad9ea 100644 --- a/virt/kvm/arm/vgic-v2.c +++ b/virt/kvm/arm/vgic-v2.c @@ -158,6 +158,7 @@ static void vgic_v2_enable(struct kvm_vcpu *vcpu) * anyway. */ vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0; + vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr = ~0; /* Get the show on the road... */ vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN; diff --git a/virt/kvm/arm/vgic-v3.c b/virt/kvm/arm/vgic-v3.c index 7dd5d62f10a1..92003cb61a0a 100644 --- a/virt/kvm/arm/vgic-v3.c +++ b/virt/kvm/arm/vgic-v3.c @@ -193,6 +193,7 @@ static void vgic_v3_enable(struct kvm_vcpu *vcpu) * anyway. */ vgic_v3->vgic_vmcr = 0; + vgic_v3->vgic_elrsr = ~0; /* * If we are emulating a GICv3, we do it in an non-GICv2-compatible diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index d4669eb29b77..265a41035728 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -108,6 +108,7 @@ static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu); static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu); static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr); static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, struct vgic_lr lr_desc); +static u64 vgic_get_elrsr(struct kvm_vcpu *vcpu); static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu, int virt_irq); static int compute_pending_for_cpu(struct kvm_vcpu *vcpu); @@ -691,9 +692,11 @@ bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio, void vgic_unqueue_irqs(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + u64 elrsr = vgic_get_elrsr(vcpu); + unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr); int i; - for_each_set_bit(i, vgic_cpu->lr_used, vgic_cpu->nr_lr) { + for_each_clear_bit(i, elrsr_ptr, vgic_cpu->nr_lr) { struct vgic_lr lr = vgic_get_lr(vcpu, i); /* @@ -1098,7 +1101,6 @@ static inline void vgic_enable(struct kvm_vcpu *vcpu) static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu) { - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_lr vlr = vgic_get_lr(vcpu, lr_nr); /* @@ -1112,8 +1114,6 @@ static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu) vlr.state = 0; vgic_set_lr(vcpu, lr_nr, vlr); - clear_bit(lr_nr, vgic_cpu->lr_used); - vgic_cpu->vgic_irq_lr_map[irq] = LR_EMPTY; vgic_sync_lr_elrsr(vcpu, lr_nr, vlr); } @@ -1128,10 +1128,11 @@ static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu) */ static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu) { - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + u64 elrsr = vgic_get_elrsr(vcpu); + unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr); int lr; - for_each_set_bit(lr, vgic_cpu->lr_used, vgic->nr_lr) { + for_each_clear_bit(lr, elrsr_ptr, vgic->nr_lr) { struct vgic_lr vlr = vgic_get_lr(vcpu, lr); if (!vgic_irq_is_enabled(vcpu, vlr.irq)) { @@ -1188,8 +1189,9 @@ static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq, */ bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq) { - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + u64 elrsr = vgic_get_elrsr(vcpu); + unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr); struct vgic_lr vlr; int lr; @@ -1200,28 +1202,22 @@ bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq) kvm_debug("Queue IRQ%d\n", irq); - lr = vgic_cpu->vgic_irq_lr_map[irq]; - /* Do we have an active interrupt for the same CPUID? */ - if (lr != LR_EMPTY) { + for_each_clear_bit(lr, elrsr_ptr, vgic->nr_lr) { vlr = vgic_get_lr(vcpu, lr); - if (vlr.source == sgi_source_id) { + if (vlr.irq == irq && vlr.source == sgi_source_id) { kvm_debug("LR%d piggyback for IRQ%d\n", lr, vlr.irq); - BUG_ON(!test_bit(lr, vgic_cpu->lr_used)); vgic_queue_irq_to_lr(vcpu, irq, lr, vlr); return true; } } /* Try to use another LR for this interrupt */ - lr = find_first_zero_bit((unsigned long *)vgic_cpu->lr_used, - vgic->nr_lr); + lr = find_first_bit(elrsr_ptr, vgic->nr_lr); if (lr >= vgic->nr_lr) return false; kvm_debug("LR%d allocated for IRQ%d %x\n", lr, irq, sgi_source_id); - vgic_cpu->vgic_irq_lr_map[irq] = lr; - set_bit(lr, vgic_cpu->lr_used); vlr.irq = irq; vlr.source = sgi_source_id; @@ -1456,7 +1452,6 @@ static bool vgic_sync_hwirq(struct kvm_vcpu *vcpu, int lr, struct vgic_lr vlr) /* Sync back the VGIC state after a guest run */ static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) { - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_dist *dist = &vcpu->kvm->arch.vgic; u64 elrsr; unsigned long *elrsr_ptr; @@ -1469,22 +1464,10 @@ static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) /* Deal with HW interrupts, and clear mappings for empty LRs */ for (lr = 0; lr < vgic->nr_lr; lr++) { - struct vgic_lr vlr; - - if (!test_bit(lr, vgic_cpu->lr_used)) - continue; - - vlr = vgic_get_lr(vcpu, lr); - if (vgic_sync_hwirq(vcpu, lr, vlr)) - level_pending = true; - - if (!test_bit(lr, elrsr_ptr)) - continue; - - clear_bit(lr, vgic_cpu->lr_used); + struct vgic_lr vlr = vgic_get_lr(vcpu, lr); + level_pending |= vgic_sync_hwirq(vcpu, lr, vlr); BUG_ON(vlr.irq >= dist->nr_irqs); - vgic_cpu->vgic_irq_lr_map[vlr.irq] = LR_EMPTY; } /* Check if we still have something up our sleeve... */ @@ -1912,12 +1895,10 @@ void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) kfree(vgic_cpu->pending_shared); kfree(vgic_cpu->active_shared); kfree(vgic_cpu->pend_act_shared); - kfree(vgic_cpu->vgic_irq_lr_map); vgic_destroy_irq_phys_map(vcpu->kvm, &vgic_cpu->irq_phys_map_list); vgic_cpu->pending_shared = NULL; vgic_cpu->active_shared = NULL; vgic_cpu->pend_act_shared = NULL; - vgic_cpu->vgic_irq_lr_map = NULL; } static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs) @@ -1928,18 +1909,14 @@ static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs) vgic_cpu->pending_shared = kzalloc(sz, GFP_KERNEL); vgic_cpu->active_shared = kzalloc(sz, GFP_KERNEL); vgic_cpu->pend_act_shared = kzalloc(sz, GFP_KERNEL); - vgic_cpu->vgic_irq_lr_map = kmalloc(nr_irqs, GFP_KERNEL); if (!vgic_cpu->pending_shared || !vgic_cpu->active_shared - || !vgic_cpu->pend_act_shared - || !vgic_cpu->vgic_irq_lr_map) { + || !vgic_cpu->pend_act_shared) { kvm_vgic_vcpu_destroy(vcpu); return -ENOMEM; } - memset(vgic_cpu->vgic_irq_lr_map, LR_EMPTY, nr_irqs); - /* * Store the number of LRs per vcpu, so we don't have to go * all the way to the distributor structure to find out. Only From 212c76545dde8370ebde2a170e4f8e1ed8441dc0 Mon Sep 17 00:00:00 2001 From: Pavel Fedin Date: Tue, 27 Oct 2015 11:37:30 +0300 Subject: [PATCH 109/123] KVM: arm/arm64: Clean up vgic_retire_lr() and surroundings 1. Remove unnecessary 'irq' argument, because irq number can be retrieved from the LR. 2. Since cff9211eb1a1f58ce7f5a2d596b617928fd4be0e ("arm/arm64: KVM: Fix arch timer behavior for disabled interrupts ") LR_STATE_PENDING is queued back by vgic_retire_lr() itself. Also, it clears vlr.state itself. Therefore, we remove the same, now duplicated, check with all accompanying bit manipulations from vgic_unqueue_irqs(). 3. vgic_retire_lr() is always accompanied by vgic_irq_clear_queued(). Since it already does more than just clearing the LR, move vgic_irq_clear_queued() inside of it. Signed-off-by: Pavel Fedin Signed-off-by: Christoffer Dall --- virt/kvm/arm/vgic.c | 37 ++++++++++--------------------------- 1 file changed, 10 insertions(+), 27 deletions(-) diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 265a41035728..96e45f3da534 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -105,7 +105,7 @@ #include "vgic.h" static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu); -static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu); +static void vgic_retire_lr(int lr_nr, struct kvm_vcpu *vcpu); static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr); static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, struct vgic_lr lr_desc); static u64 vgic_get_elrsr(struct kvm_vcpu *vcpu); @@ -717,30 +717,14 @@ void vgic_unqueue_irqs(struct kvm_vcpu *vcpu) * interrupt then move the active state to the * distributor tracking bit. */ - if (lr.state & LR_STATE_ACTIVE) { + if (lr.state & LR_STATE_ACTIVE) vgic_irq_set_active(vcpu, lr.irq); - lr.state &= ~LR_STATE_ACTIVE; - } /* * Reestablish the pending state on the distributor and the - * CPU interface. It may have already been pending, but that - * is fine, then we are only setting a few bits that were - * already set. + * CPU interface and mark the LR as free for other use. */ - if (lr.state & LR_STATE_PENDING) { - vgic_dist_irq_set_pending(vcpu, lr.irq); - lr.state &= ~LR_STATE_PENDING; - } - - vgic_set_lr(vcpu, i, lr); - - /* - * Mark the LR as free for other use. - */ - BUG_ON(lr.state & LR_STATE_MASK); - vgic_retire_lr(i, lr.irq, vcpu); - vgic_irq_clear_queued(vcpu, lr.irq); + vgic_retire_lr(i, vcpu); /* Finally update the VGIC state. */ vgic_update_state(vcpu->kvm); @@ -1099,16 +1083,18 @@ static inline void vgic_enable(struct kvm_vcpu *vcpu) vgic_ops->enable(vcpu); } -static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu) +static void vgic_retire_lr(int lr_nr, struct kvm_vcpu *vcpu) { struct vgic_lr vlr = vgic_get_lr(vcpu, lr_nr); + vgic_irq_clear_queued(vcpu, vlr.irq); + /* * We must transfer the pending state back to the distributor before * retiring the LR, otherwise we may loose edge-triggered interrupts. */ if (vlr.state & LR_STATE_PENDING) { - vgic_dist_irq_set_pending(vcpu, irq); + vgic_dist_irq_set_pending(vcpu, vlr.irq); vlr.hwirq = 0; } @@ -1135,11 +1121,8 @@ static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu) for_each_clear_bit(lr, elrsr_ptr, vgic->nr_lr) { struct vgic_lr vlr = vgic_get_lr(vcpu, lr); - if (!vgic_irq_is_enabled(vcpu, vlr.irq)) { - vgic_retire_lr(lr, vlr.irq, vcpu); - if (vgic_irq_is_queued(vcpu, vlr.irq)) - vgic_irq_clear_queued(vcpu, vlr.irq); - } + if (!vgic_irq_is_enabled(vcpu, vlr.irq)) + vgic_retire_lr(lr, vcpu); } } From 26caea7693cb99833fe4ecc544c842289d6b3f69 Mon Sep 17 00:00:00 2001 From: Pavel Fedin Date: Tue, 27 Oct 2015 11:37:31 +0300 Subject: [PATCH 110/123] KVM: arm/arm64: Merge vgic_set_lr() and vgic_sync_lr_elrsr() Now we see that vgic_set_lr() and vgic_sync_lr_elrsr() are always used together. Merge them into one function, saving from second vgic_ops dereferencing every time. Signed-off-by: Pavel Fedin Signed-off-by: Christoffer Dall --- include/kvm/arm_vgic.h | 1 - virt/kvm/arm/vgic-v2.c | 5 ----- virt/kvm/arm/vgic-v3.c | 5 ----- virt/kvm/arm/vgic.c | 14 ++------------ 4 files changed, 2 insertions(+), 23 deletions(-) diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 3936bf802e1d..f62addc17dcf 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -112,7 +112,6 @@ struct vgic_vmcr { struct vgic_ops { struct vgic_lr (*get_lr)(const struct kvm_vcpu *, int); void (*set_lr)(struct kvm_vcpu *, int, struct vgic_lr); - void (*sync_lr_elrsr)(struct kvm_vcpu *, int, struct vgic_lr); u64 (*get_elrsr)(const struct kvm_vcpu *vcpu); u64 (*get_eisr)(const struct kvm_vcpu *vcpu); void (*clear_eisr)(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/arm/vgic-v2.c b/virt/kvm/arm/vgic-v2.c index c0f5d7fad9ea..ff02f08df74d 100644 --- a/virt/kvm/arm/vgic-v2.c +++ b/virt/kvm/arm/vgic-v2.c @@ -79,11 +79,7 @@ static void vgic_v2_set_lr(struct kvm_vcpu *vcpu, int lr, lr_val |= (lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT); vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = lr_val; -} -static void vgic_v2_sync_lr_elrsr(struct kvm_vcpu *vcpu, int lr, - struct vgic_lr lr_desc) -{ if (!(lr_desc.state & LR_STATE_MASK)) vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr |= (1ULL << lr); else @@ -167,7 +163,6 @@ static void vgic_v2_enable(struct kvm_vcpu *vcpu) static const struct vgic_ops vgic_v2_ops = { .get_lr = vgic_v2_get_lr, .set_lr = vgic_v2_set_lr, - .sync_lr_elrsr = vgic_v2_sync_lr_elrsr, .get_elrsr = vgic_v2_get_elrsr, .get_eisr = vgic_v2_get_eisr, .clear_eisr = vgic_v2_clear_eisr, diff --git a/virt/kvm/arm/vgic-v3.c b/virt/kvm/arm/vgic-v3.c index 92003cb61a0a..487d6357b7e7 100644 --- a/virt/kvm/arm/vgic-v3.c +++ b/virt/kvm/arm/vgic-v3.c @@ -112,11 +112,7 @@ static void vgic_v3_set_lr(struct kvm_vcpu *vcpu, int lr, } vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[LR_INDEX(lr)] = lr_val; -} -static void vgic_v3_sync_lr_elrsr(struct kvm_vcpu *vcpu, int lr, - struct vgic_lr lr_desc) -{ if (!(lr_desc.state & LR_STATE_MASK)) vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr |= (1U << lr); else @@ -212,7 +208,6 @@ static void vgic_v3_enable(struct kvm_vcpu *vcpu) static const struct vgic_ops vgic_v3_ops = { .get_lr = vgic_v3_get_lr, .set_lr = vgic_v3_set_lr, - .sync_lr_elrsr = vgic_v3_sync_lr_elrsr, .get_elrsr = vgic_v3_get_elrsr, .get_eisr = vgic_v3_get_eisr, .clear_eisr = vgic_v3_clear_eisr, diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 96e45f3da534..fe451d4885ae 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -1032,12 +1032,6 @@ static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, vgic_ops->set_lr(vcpu, lr, vlr); } -static void vgic_sync_lr_elrsr(struct kvm_vcpu *vcpu, int lr, - struct vgic_lr vlr) -{ - vgic_ops->sync_lr_elrsr(vcpu, lr, vlr); -} - static inline u64 vgic_get_elrsr(struct kvm_vcpu *vcpu) { return vgic_ops->get_elrsr(vcpu); @@ -1100,7 +1094,6 @@ static void vgic_retire_lr(int lr_nr, struct kvm_vcpu *vcpu) vlr.state = 0; vgic_set_lr(vcpu, lr_nr, vlr); - vgic_sync_lr_elrsr(vcpu, lr_nr, vlr); } /* @@ -1162,7 +1155,6 @@ static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq, } vgic_set_lr(vcpu, lr_nr, vlr); - vgic_sync_lr_elrsr(vcpu, lr_nr, vlr); } /* @@ -1340,8 +1332,6 @@ static int process_queued_irq(struct kvm_vcpu *vcpu, vlr.hwirq = 0; vgic_set_lr(vcpu, lr, vlr); - vgic_sync_lr_elrsr(vcpu, lr, vlr); - return pending; } @@ -1442,8 +1432,6 @@ static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) bool level_pending; level_pending = vgic_process_maintenance(vcpu); - elrsr = vgic_get_elrsr(vcpu); - elrsr_ptr = u64_to_bitmask(&elrsr); /* Deal with HW interrupts, and clear mappings for empty LRs */ for (lr = 0; lr < vgic->nr_lr; lr++) { @@ -1454,6 +1442,8 @@ static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) } /* Check if we still have something up our sleeve... */ + elrsr = vgic_get_elrsr(vcpu); + elrsr_ptr = u64_to_bitmask(&elrsr); pending = find_first_zero_bit(elrsr_ptr, vgic->nr_lr); if (level_pending || pending < vgic->nr_lr) set_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu); From 6956d8946d5d1cb2ac913caa8d4259a4d0e00c48 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 19 Oct 2015 04:37:18 -0600 Subject: [PATCH 111/123] KVM: don't pointlessly leave KVM_COMPAT=y in non-KVM configs The symbol was missing a KVM dependency. Signed-off-by: Jan Beulich Signed-off-by: Paolo Bonzini --- virt/kvm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 9f8014dda2cf..7a79b6853583 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -46,7 +46,7 @@ config KVM_GENERIC_DIRTYLOG_READ_PROTECT config KVM_COMPAT def_bool y - depends on COMPAT && !S390 + depends on KVM && COMPAT && !S390 config HAVE_KVM_IRQ_BYPASS bool From 2da29bccc5045ea10c70cb3a69be777768fd0b66 Mon Sep 17 00:00:00 2001 From: Saurabh Sengar Date: Fri, 30 Oct 2015 12:56:11 +0530 Subject: [PATCH 112/123] KVM: x86: removing unused variable removing unused variables, found by coccinelle Signed-off-by: Saurabh Sengar Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9e9c226cb79d..57b5f79aad08 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3438,41 +3438,35 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) { - int r = 0; - mutex_lock(&kvm->arch.vpit->pit_state.lock); memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); mutex_unlock(&kvm->arch.vpit->pit_state.lock); - return r; + return 0; } static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) { - int r = 0; - mutex_lock(&kvm->arch.vpit->pit_state.lock); memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0); mutex_unlock(&kvm->arch.vpit->pit_state.lock); - return r; + return 0; } static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) { - int r = 0; - mutex_lock(&kvm->arch.vpit->pit_state.lock); memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, sizeof(ps->channels)); ps->flags = kvm->arch.vpit->pit_state.flags; mutex_unlock(&kvm->arch.vpit->pit_state.lock); memset(&ps->reserved, 0, sizeof(ps->reserved)); - return r; + return 0; } static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) { - int r = 0, start = 0; + int start = 0; u32 prev_legacy, cur_legacy; mutex_lock(&kvm->arch.vpit->pit_state.lock); prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; @@ -3484,7 +3478,7 @@ static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) kvm->arch.vpit->pit_state.flags = ps->flags; kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start); mutex_unlock(&kvm->arch.vpit->pit_state.lock); - return r; + return 0; } static int kvm_vm_ioctl_reinject(struct kvm *kvm, From 7a036a6f670f63b32c5ee126425f9109271ca13f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Fri, 30 Oct 2015 16:36:24 +0100 Subject: [PATCH 113/123] KVM: x86: add read_phys to x86_emulate_ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We want to read the physical memory when emulating RSM. X86EMUL_IO_NEEDED is returned on all errors for consistency with other helpers. Signed-off-by: Radim Krčmář Tested-by: Laszlo Ersek Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_emulate.h | 10 ++++++++++ arch/x86/kvm/x86.c | 10 ++++++++++ 2 files changed, 20 insertions(+) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index e16466ec473c..e9cd7befcb76 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -111,6 +111,16 @@ struct x86_emulate_ops { unsigned int bytes, struct x86_exception *fault); + /* + * read_phys: Read bytes of standard (non-emulated/special) memory. + * Used for descriptor reading. + * @addr: [IN ] Physical address from which to read. + * @val: [OUT] Value read from memory. + * @bytes: [IN ] Number of bytes to read from memory. + */ + int (*read_phys)(struct x86_emulate_ctxt *ctxt, unsigned long addr, + void *val, unsigned int bytes); + /* * write_std: Write bytes of standard (non-emulated/special) memory. * Used for descriptor writing. diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 57b5f79aad08..a24bae0a83a2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4089,6 +4089,15 @@ static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt, return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); } +static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt, + unsigned long addr, void *val, unsigned int bytes) +{ + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes); + + return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE; +} + int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, @@ -4824,6 +4833,7 @@ static const struct x86_emulate_ops emulate_ops = { .write_gpr = emulator_write_gpr, .read_std = kvm_read_guest_virt_system, .write_std = kvm_write_guest_virt_system, + .read_phys = kvm_read_guest_phys_system, .fetch = kvm_fetch_guest_virt, .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, From f40606b147dd5b4678cedc877a71deb520ca507e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Fri, 30 Oct 2015 16:36:25 +0100 Subject: [PATCH 114/123] KVM: x86: handle SMBASE as physical address in RSM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GET_SMSTATE depends on real mode to ensure that smbase+offset is treated as a physical address, which has already caused a bug after shuffling the code. Enforce physical addressing. Signed-off-by: Radim Krčmář Reported-by: Laszlo Ersek Tested-by: Laszlo Ersek Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 9da95b9daf8d..b60fed56671b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2272,8 +2272,8 @@ static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt) #define GET_SMSTATE(type, smbase, offset) \ ({ \ type __val; \ - int r = ctxt->ops->read_std(ctxt, smbase + offset, &__val, \ - sizeof(__val), NULL); \ + int r = ctxt->ops->read_phys(ctxt, smbase + offset, &__val, \ + sizeof(__val)); \ if (r != X86EMUL_CONTINUE) \ return X86EMUL_UNHANDLEABLE; \ __val; \ @@ -2484,8 +2484,7 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) /* * Get back to real mode, to prepare a safe state in which to load - * CR0/CR3/CR4/EFER. Also this will ensure that addresses passed - * to read_std/write_std are not virtual. + * CR0/CR3/CR4/EFER. * * CR4.PCIDE must be zero, because it is a 64-bit mode only feature. */ From c75efa974e013640496620f26f0b532cb5cb17f9 Mon Sep 17 00:00:00 2001 From: Andrey Smetanin Date: Fri, 16 Oct 2015 10:07:50 +0300 Subject: [PATCH 115/123] drivers/hv: share Hyper-V SynIC constants with userspace Moved Hyper-V synic contants from guest Hyper-V drivers private header into x86 arch uapi Hyper-V header. Added Hyper-V synic msr's flags into x86 arch uapi Hyper-V header. Signed-off-by: Andrey Smetanin Reviewed-by: Roman Kagan Signed-off-by: Denis V. Lunev CC: Vitaly Kuznetsov CC: "K. Y. Srinivasan" CC: Gleb Natapov CC: Paolo Bonzini Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/hyperv.h | 12 ++++++++++++ drivers/hv/hyperv_vmbus.h | 5 ----- include/linux/hyperv.h | 1 + 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index 2677a0aac2cc..040d4083c24f 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -257,4 +257,16 @@ typedef struct _HV_REFERENCE_TSC_PAGE { __s64 tsc_offset; } HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE; +/* Define the number of synthetic interrupt sources. */ +#define HV_SYNIC_SINT_COUNT (16) +/* Define the expected SynIC version. */ +#define HV_SYNIC_VERSION_1 (0x1) + +#define HV_SYNIC_CONTROL_ENABLE (1ULL << 0) +#define HV_SYNIC_SIMP_ENABLE (1ULL << 0) +#define HV_SYNIC_SIEFP_ENABLE (1ULL << 0) +#define HV_SYNIC_SINT_MASKED (1ULL << 16) +#define HV_SYNIC_SINT_AUTO_EOI (1ULL << 17) +#define HV_SYNIC_SINT_VECTOR_MASK (0xFF) + #endif diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 3d70e36c918e..3782636562a1 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -63,9 +63,6 @@ enum hv_cpuid_function { /* Define version of the synthetic interrupt controller. */ #define HV_SYNIC_VERSION (1) -/* Define the expected SynIC version. */ -#define HV_SYNIC_VERSION_1 (0x1) - /* Define synthetic interrupt controller message constants. */ #define HV_MESSAGE_SIZE (256) #define HV_MESSAGE_PAYLOAD_BYTE_COUNT (240) @@ -105,8 +102,6 @@ enum hv_message_type { HVMSG_X64_LEGACY_FP_ERROR = 0x80010005 }; -/* Define the number of synthetic interrupt sources. */ -#define HV_SYNIC_SINT_COUNT (16) #define HV_SYNIC_STIMER_COUNT (4) /* Define invalid partition identifier. */ diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 54733d5b503e..8fdc17b84739 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -26,6 +26,7 @@ #define _HYPERV_H #include +#include #include #include From 0669a51015c58b1f036030743a0c0781eb63867f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Fri, 30 Oct 2015 15:48:20 +0100 Subject: [PATCH 116/123] KVM: x86: zero apic_arb_prio on reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BSP doesn't get INIT so its apic_arb_prio isn't zeroed after reboot. BSP won't get lowest priority interrupts until other VCPUs get enough interrupts to match their pre-reboot apic_arb_prio. That behavior doesn't fit into KVM's round-robin-like interpretation of lowest priority delivery ... userspace should KVM_SET_LAPIC on reset, so just zero apic_arb_prio there. Reported-by: Yuki Shibuya Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 168b8759bd73..ecd4ea1d28a8 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1918,6 +1918,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, kvm_make_request(KVM_REQ_EVENT, vcpu); if (ioapic_in_kernel(vcpu->kvm)) kvm_rtc_eoi_tracking_restore_one(vcpu); + + vcpu->arch.apic_arb_prio = 0; } void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) From b97e6de9c96cefaa02a6a7464731ea504b45e150 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 28 Oct 2015 19:16:47 +0100 Subject: [PATCH 117/123] KVM: x86: merge kvm_arch_set_irq with kvm_set_msi_inatomic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We do not want to do too much work in atomic context, in particular not walking all the VCPUs of the virtual machine. So we want to distinguish the architecture-specific injection function for irqfd from kvm_set_msi. Since it's still empty, reuse the newly added kvm_arch_set_irq and rename it to kvm_arch_set_irq_inatomic. Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/irq_comm.c | 14 ++++++++------ include/linux/kvm_host.h | 7 +++---- virt/kvm/eventfd.c | 11 ++++------- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 8f4499c7ffc1..75dc633c48dc 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -124,12 +124,16 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, } -static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm) +int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, + bool line_status) { struct kvm_lapic_irq irq; int r; + if (unlikely(e->type != KVM_IRQ_ROUTING_MSI)) + return -EWOULDBLOCK; + kvm_set_msi_irq(e, &irq); if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL)) @@ -165,10 +169,8 @@ int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level) idx = srcu_read_lock(&kvm->irq_srcu); if (kvm_irq_map_gsi(kvm, entries, irq) > 0) { e = &entries[0]; - if (likely(e->type == KVM_IRQ_ROUTING_MSI)) - ret = kvm_set_msi_inatomic(e, kvm); - else - ret = -EWOULDBLOCK; + ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id, + irq, level); } srcu_read_unlock(&kvm->irq_srcu, idx); return ret; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 87189a41d904..15c78f320678 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -830,10 +830,9 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level); int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm, int irq_source_id, int level, bool line_status); - -int kvm_arch_set_irq(struct kvm_kernel_irq_routing_entry *irq, struct kvm *kvm, - int irq_source_id, int level, bool line_status); - +int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, + int level, bool line_status); bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin); void kvm_notify_acked_gsi(struct kvm *kvm, int gsi); void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index e29fd2640709..46dbc0a7dfc1 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -171,7 +171,7 @@ irqfd_deactivate(struct kvm_kernel_irqfd *irqfd) queue_work(irqfd_cleanup_wq, &irqfd->shutdown); } -int __attribute__((weak)) kvm_arch_set_irq( +int __attribute__((weak)) kvm_arch_set_irq_inatomic( struct kvm_kernel_irq_routing_entry *irq, struct kvm *kvm, int irq_source_id, int level, @@ -201,12 +201,9 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) irq = irqfd->irq_entry; } while (read_seqcount_retry(&irqfd->irq_entry_sc, seq)); /* An event has been signaled, inject an interrupt */ - if (irq.type == KVM_IRQ_ROUTING_MSI) - kvm_set_msi(&irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1, - false); - else if (kvm_arch_set_irq(&irq, kvm, - KVM_USERSPACE_IRQ_SOURCE_ID, 1, - false) == -EWOULDBLOCK) + if (kvm_arch_set_irq_inatomic(&irq, kvm, + KVM_USERSPACE_IRQ_SOURCE_ID, 1, + false) == -EWOULDBLOCK) schedule_work(&irqfd->inject); srcu_read_unlock(&kvm->irq_srcu, idx); } From 7695405698d011c05a95288f1f3c0a3dd252dccc Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 28 Oct 2015 19:22:20 +0100 Subject: [PATCH 118/123] KVM: device assignment: remove pointless #ifdefs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The symbols are always defined. Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/assigned-dev.c | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c index d090ecf08809..1c17ee807ef7 100644 --- a/arch/x86/kvm/assigned-dev.c +++ b/arch/x86/kvm/assigned-dev.c @@ -131,7 +131,6 @@ static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id) return IRQ_HANDLED; } -#ifdef __KVM_HAVE_MSI static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id) { struct kvm_assigned_dev_kernel *assigned_dev = dev_id; @@ -150,9 +149,7 @@ static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id) return IRQ_HANDLED; } -#endif -#ifdef __KVM_HAVE_MSIX static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id) { struct kvm_assigned_dev_kernel *assigned_dev = dev_id; @@ -183,7 +180,6 @@ static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id) return IRQ_HANDLED; } -#endif /* Ack the irq line for an assigned device */ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) @@ -386,7 +382,6 @@ static int assigned_device_enable_host_intx(struct kvm *kvm, return 0; } -#ifdef __KVM_HAVE_MSI static int assigned_device_enable_host_msi(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev) { @@ -408,9 +403,7 @@ static int assigned_device_enable_host_msi(struct kvm *kvm, return 0; } -#endif -#ifdef __KVM_HAVE_MSIX static int assigned_device_enable_host_msix(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev) { @@ -443,8 +436,6 @@ err: return r; } -#endif - static int assigned_device_enable_guest_intx(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev, struct kvm_assigned_irq *irq) @@ -454,7 +445,6 @@ static int assigned_device_enable_guest_intx(struct kvm *kvm, return 0; } -#ifdef __KVM_HAVE_MSI static int assigned_device_enable_guest_msi(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev, struct kvm_assigned_irq *irq) @@ -463,9 +453,7 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm, dev->ack_notifier.gsi = -1; return 0; } -#endif -#ifdef __KVM_HAVE_MSIX static int assigned_device_enable_guest_msix(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev, struct kvm_assigned_irq *irq) @@ -474,7 +462,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm, dev->ack_notifier.gsi = -1; return 0; } -#endif static int assign_host_irq(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev, @@ -492,16 +479,12 @@ static int assign_host_irq(struct kvm *kvm, case KVM_DEV_IRQ_HOST_INTX: r = assigned_device_enable_host_intx(kvm, dev); break; -#ifdef __KVM_HAVE_MSI case KVM_DEV_IRQ_HOST_MSI: r = assigned_device_enable_host_msi(kvm, dev); break; -#endif -#ifdef __KVM_HAVE_MSIX case KVM_DEV_IRQ_HOST_MSIX: r = assigned_device_enable_host_msix(kvm, dev); break; -#endif default: r = -EINVAL; } @@ -534,16 +517,12 @@ static int assign_guest_irq(struct kvm *kvm, case KVM_DEV_IRQ_GUEST_INTX: r = assigned_device_enable_guest_intx(kvm, dev, irq); break; -#ifdef __KVM_HAVE_MSI case KVM_DEV_IRQ_GUEST_MSI: r = assigned_device_enable_guest_msi(kvm, dev, irq); break; -#endif -#ifdef __KVM_HAVE_MSIX case KVM_DEV_IRQ_GUEST_MSIX: r = assigned_device_enable_guest_msix(kvm, dev, irq); break; -#endif default: r = -EINVAL; } @@ -826,7 +805,6 @@ out: } -#ifdef __KVM_HAVE_MSIX static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, struct kvm_assigned_msix_nr *entry_nr) { @@ -906,7 +884,6 @@ msix_entry_out: return r; } -#endif static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm, struct kvm_assigned_pci_dev *assigned_dev) @@ -1012,7 +989,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, goto out; break; } -#ifdef __KVM_HAVE_MSIX case KVM_ASSIGN_SET_MSIX_NR: { struct kvm_assigned_msix_nr entry_nr; r = -EFAULT; @@ -1033,7 +1009,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, goto out; break; } -#endif case KVM_ASSIGN_SET_INTX_MASK: { struct kvm_assigned_pci_dev assigned_dev; From 8a22f234a81ab4d1de5d948c3478608f08a9b844 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 28 Oct 2015 18:52:02 +0100 Subject: [PATCH 119/123] KVM: x86: move kvm_set_irq_inatomic to legacy device assignment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function is not used outside device assignment, and kvm_arch_set_irq_inatomic has a different prototype. Move it here and make it static to avoid confusion. Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/assigned-dev.c | 37 +++++++++++++++++++++++++++++++++++++ arch/x86/kvm/irq_comm.c | 34 ---------------------------------- include/linux/kvm_host.h | 1 - 3 files changed, 37 insertions(+), 35 deletions(-) diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c index 1c17ee807ef7..9dc091acd5fb 100644 --- a/arch/x86/kvm/assigned-dev.c +++ b/arch/x86/kvm/assigned-dev.c @@ -21,6 +21,7 @@ #include #include "irq.h" #include "assigned-dev.h" +#include "trace/events/kvm.h" struct kvm_assigned_dev_kernel { struct kvm_irq_ack_notifier ack_notifier; @@ -131,6 +132,42 @@ static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id) return IRQ_HANDLED; } +/* + * Deliver an IRQ in an atomic context if we can, or return a failure, + * user can retry in a process context. + * Return value: + * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context. + * Other values - No need to retry. + */ +static int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, + int level) +{ + struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; + struct kvm_kernel_irq_routing_entry *e; + int ret = -EINVAL; + int idx; + + trace_kvm_set_irq(irq, level, irq_source_id); + + /* + * Injection into either PIC or IOAPIC might need to scan all CPUs, + * which would need to be retried from thread context; when same GSI + * is connected to both PIC and IOAPIC, we'd have to report a + * partial failure here. + * Since there's no easy way to do this, we only support injecting MSI + * which is limited to 1:1 GSI mapping. + */ + idx = srcu_read_lock(&kvm->irq_srcu); + if (kvm_irq_map_gsi(kvm, entries, irq) > 0) { + e = &entries[0]; + ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id, + irq, level); + } + srcu_read_unlock(&kvm->irq_srcu, idx); + return ret; +} + + static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id) { struct kvm_assigned_dev_kernel *assigned_dev = dev_id; diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 75dc633c48dc..84b96d319909 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -142,40 +142,6 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, return -EWOULDBLOCK; } -/* - * Deliver an IRQ in an atomic context if we can, or return a failure, - * user can retry in a process context. - * Return value: - * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context. - * Other values - No need to retry. - */ -int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level) -{ - struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; - struct kvm_kernel_irq_routing_entry *e; - int ret = -EINVAL; - int idx; - - trace_kvm_set_irq(irq, level, irq_source_id); - - /* - * Injection into either PIC or IOAPIC might need to scan all CPUs, - * which would need to be retried from thread context; when same GSI - * is connected to both PIC and IOAPIC, we'd have to report a - * partial failure here. - * Since there's no easy way to do this, we only support injecting MSI - * which is limited to 1:1 GSI mapping. - */ - idx = srcu_read_lock(&kvm->irq_srcu); - if (kvm_irq_map_gsi(kvm, entries, irq) > 0) { - e = &entries[0]; - ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id, - irq, level); - } - srcu_read_unlock(&kvm->irq_srcu, idx); - return ret; -} - int kvm_request_irq_source_id(struct kvm *kvm) { unsigned long *bitmap = &kvm->arch.irq_sources_bitmap; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 15c78f320678..242a6d2b53ff 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -827,7 +827,6 @@ int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin); int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, bool line_status); -int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level); int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm, int irq_source_id, int level, bool line_status); int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, From 656ec4a4928a3db7d16e5cb9bce351a478cfd3d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Mon, 2 Nov 2015 22:20:00 +0100 Subject: [PATCH 120/123] KVM: VMX: fix SMEP and SMAP without EPT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The comment in code had it mostly right, but we enable paging for emulated real mode regardless of EPT. Without EPT (which implies emulated real mode), secondary VCPUs won't start unless we disable SM[AE]P when the guest doesn't use paging. Signed-off-by: Radim Krčmář Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4d0aa31a42ba..2ac116417583 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3788,20 +3788,21 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!is_paging(vcpu)) { hw_cr4 &= ~X86_CR4_PAE; hw_cr4 |= X86_CR4_PSE; - /* - * SMEP/SMAP is disabled if CPU is in non-paging mode - * in hardware. However KVM always uses paging mode to - * emulate guest non-paging mode with TDP. - * To emulate this behavior, SMEP/SMAP needs to be - * manually disabled when guest switches to non-paging - * mode. - */ - hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP); } else if (!(cr4 & X86_CR4_PAE)) { hw_cr4 &= ~X86_CR4_PAE; } } + if (!enable_unrestricted_guest && !is_paging(vcpu)) + /* + * SMEP/SMAP is disabled if CPU is in non-paging mode in + * hardware. However KVM always uses paging mode without + * unrestricted guest. + * To emulate this behavior, SMEP/SMAP needs to be manually + * disabled when guest switches to non-paging mode. + */ + hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP); + vmcs_writel(CR4_READ_SHADOW, cr4); vmcs_writel(GUEST_CR4, hw_cr4); return 0; From 89651a3decbe03754f304a0b248f27eeb9a37937 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 3 Nov 2015 13:43:05 +0100 Subject: [PATCH 121/123] KVM: x86: allow RSM from 64-bit mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SDM says that exiting system management mode from 64-bit mode is invalid, but that would be too good to be true. But actually, most of the code is already there to support exiting from compat mode (EFER.LME=1, EFER.LMA=0). Getting all the way from 64-bit mode to real mode only requires clearing CS.L and CR4.PCIDE. Cc: stable@vger.kernel.org Fixes: 660a5d517aaab9187f93854425c4c63f4a09195c Tested-by: Laszlo Ersek Cc: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b60fed56671b..1505587d06e9 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2484,16 +2484,36 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) /* * Get back to real mode, to prepare a safe state in which to load - * CR0/CR3/CR4/EFER. - * - * CR4.PCIDE must be zero, because it is a 64-bit mode only feature. + * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU + * supports long mode. */ + cr4 = ctxt->ops->get_cr(ctxt, 4); + if (emulator_has_longmode(ctxt)) { + struct desc_struct cs_desc; + + /* Zero CR4.PCIDE before CR0.PG. */ + if (cr4 & X86_CR4_PCIDE) { + ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE); + cr4 &= ~X86_CR4_PCIDE; + } + + /* A 32-bit code segment is required to clear EFER.LMA. */ + memset(&cs_desc, 0, sizeof(cs_desc)); + cs_desc.type = 0xb; + cs_desc.s = cs_desc.g = cs_desc.p = 1; + ctxt->ops->set_segment(ctxt, 0, &cs_desc, 0, VCPU_SREG_CS); + } + + /* For the 64-bit case, this will clear EFER.LMA. */ cr0 = ctxt->ops->get_cr(ctxt, 0); if (cr0 & X86_CR0_PE) ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE)); - cr4 = ctxt->ops->get_cr(ctxt, 4); + + /* Now clear CR4.PAE (which must be done before clearing EFER.LME). */ if (cr4 & X86_CR4_PAE) ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE); + + /* And finally go back to 32-bit mode. */ efer = 0; ctxt->ops->set_msr(ctxt, MSR_EFER, efer); @@ -4454,7 +4474,7 @@ static const struct opcode twobyte_table[256] = { F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N, /* 0xA8 - 0xAF */ I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg), - II(No64 | EmulateOnUD | ImplicitOps, em_rsm, rsm), + II(EmulateOnUD | ImplicitOps, em_rsm, rsm), F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts), F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd), F(DstMem | SrcReg | Src2CL | ModRM, em_shrd), From 879ae1880449c88db11c1ebdaedc2da79b2fe73f Mon Sep 17 00:00:00 2001 From: Laszlo Ersek Date: Wed, 4 Nov 2015 12:54:41 +0100 Subject: [PATCH 122/123] KVM: x86: obey KVM_X86_QUIRK_CD_NW_CLEARED in kvm_set_cr0() Commit b18d5431acc7 ("KVM: x86: fix CR0.CD virtualization") was technically correct, but it broke OVMF guests by slowing down various parts of the firmware. Commit fb279950ba02 ("KVM: vmx: obey KVM_QUIRK_CD_NW_CLEARED") quirked the first function modified by b18d5431acc7, vmx_get_mt_mask(), for OVMF's sake. This restored the speed of the OVMF code that runs before PlatformPei (including the memory intensive LZMA decompression in SEC). This patch extends the quirk to the second function modified by b18d5431acc7, kvm_set_cr0(). It eliminates the intrusive slowdown that hits the EFI_MP_SERVICES_PROTOCOL implementation of edk2's UefiCpuPkg/CpuDxe -- which is built into OVMF --, when CpuDxe starts up all APs at once for initialization, in order to count them. We also carry over the kvm_arch_has_noncoherent_dma() sub-condition from the other half of the original commit b18d5431acc7. Fixes: b18d5431acc7a2fd22767925f3a6f597aa4bd29e Cc: stable@vger.kernel.org Cc: Jordan Justen Cc: Alex Williamson Reviewed-by: Xiao Guangrong Tested-by: Janusz Mocek Signed-off-by: Laszlo Ersek # Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a24bae0a83a2..30723a4122cd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -625,7 +625,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if ((cr0 ^ old_cr0) & update_bits) kvm_mmu_reset_context(vcpu); - if ((cr0 ^ old_cr0) & X86_CR0_CD) + if (((cr0 ^ old_cr0) & X86_CR0_CD) && + kvm_arch_has_noncoherent_dma(vcpu->kvm) && + !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL); return 0; From a3eaa8649e4c6a6afdafaa04b9114fb230617bb1 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 4 Nov 2015 13:46:05 +0800 Subject: [PATCH 123/123] KVM: VMX: Fix commit which broke PML I found PML was broken since below commit: commit feda805fe7c4ed9cf78158e73b1218752e3b4314 Author: Xiao Guangrong Date: Wed Sep 9 14:05:55 2015 +0800 KVM: VMX: unify SECONDARY_VM_EXEC_CONTROL update Unify the update in vmx_cpuid_update() Signed-off-by: Xiao Guangrong [Rewrite to use vmcs_set_secondary_exec_control. - Paolo] Signed-off-by: Paolo Bonzini The reason is in above commit vmx_cpuid_update calls vmx_secondary_exec_control, in which currently SECONDARY_EXEC_ENABLE_PML bit is cleared unconditionally (as PML is enabled in creating vcpu). Therefore if vcpu_cpuid_update is called after vcpu is created, PML will be disabled unexpectedly while log-dirty code still thinks PML is used. Fix this by clearing SECONDARY_EXEC_ENABLE_PML in vmx_secondary_exec_control only when PML is not supported or not enabled (!enable_pml). This is more reasonable as PML is currently either always enabled or disabled. With this explicit updating SECONDARY_EXEC_ENABLE_PML in vmx_enable{disable}_pml is not needed so also rename vmx_enable{disable}_pml to vmx_create{destroy}_pml_buffer. Fixes: feda805fe7c4ed9cf78158e73b1218752e3b4314 Signed-off-by: Kai Huang [While at it, change a wrong ASSERT to an "if". The condition can happen if creating the VCPU fails with ENOMEM. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2ac116417583..5eb56ed77c1f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4718,8 +4718,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) a current VMCS12 */ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; - /* PML is enabled/disabled in creating/destorying vcpu */ - exec_control &= ~SECONDARY_EXEC_ENABLE_PML; + + if (!enable_pml) + exec_control &= ~SECONDARY_EXEC_ENABLE_PML; /* Currently, we allow L1 guest to directly run pcommit instruction. */ exec_control &= ~SECONDARY_EXEC_PCOMMIT; @@ -7804,7 +7805,7 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) *info2 = vmcs_read32(VM_EXIT_INTR_INFO); } -static int vmx_enable_pml(struct vcpu_vmx *vmx) +static int vmx_create_pml_buffer(struct vcpu_vmx *vmx) { struct page *pml_pg; @@ -7817,18 +7818,15 @@ static int vmx_enable_pml(struct vcpu_vmx *vmx) vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); - vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_ENABLE_PML); - return 0; } -static void vmx_disable_pml(struct vcpu_vmx *vmx) +static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) { - ASSERT(vmx->pml_pg); - __free_page(vmx->pml_pg); - vmx->pml_pg = NULL; - - vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_ENABLE_PML); + if (vmx->pml_pg) { + __free_page(vmx->pml_pg); + vmx->pml_pg = NULL; + } } static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) @@ -8706,7 +8704,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); if (enable_pml) - vmx_disable_pml(vmx); + vmx_destroy_pml_buffer(vmx); free_vpid(vmx->vpid); leave_guest_mode(vcpu); vmx_load_vmcs01(vcpu); @@ -8790,7 +8788,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) * for the guest, etc. */ if (enable_pml) { - err = vmx_enable_pml(vmx); + err = vmx_create_pml_buffer(vmx); if (err) goto free_vmcs; }