From bca031e2c8aa22a978a2452bf959e27e9fa73dc7 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Thu, 18 Jul 2019 08:15:10 +0000 Subject: [PATCH 01/18] KVM: arm/arm64: Introduce kvm_pmu_vcpu_init() to setup PMU counter index We use "pmc->idx" and the "chained" bitmap to determine if the pmc is chained, in kvm_pmu_pmc_is_chained(). But idx might be uninitialized (and random) when we doing this decision, through a KVM_ARM_VCPU_INIT ioctl -> kvm_pmu_vcpu_reset(). And the test_bit() against this random idx will potentially hit a KASAN BUG [1]. In general, idx is the static property of a PMU counter that is not expected to be modified across resets, as suggested by Julien. It looks more reasonable if we can setup the PMU counter idx for a vcpu in its creation time. Introduce a new function - kvm_pmu_vcpu_init() for this basic setup. Oh, and the KASAN BUG will get fixed this way. [1] https://www.spinics.net/lists/kvm-arm/msg36700.html Fixes: 80f393a23be6 ("KVM: arm/arm64: Support chained PMU counters") Suggested-by: Andrew Murray Suggested-by: Julien Thierry Acked-by: Julien Thierry Signed-off-by: Zenghui Yu Signed-off-by: Marc Zyngier --- include/kvm/arm_pmu.h | 2 ++ virt/kvm/arm/arm.c | 2 ++ virt/kvm/arm/pmu.c | 18 +++++++++++++++--- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 16c769a7f979..6db030439e29 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -34,6 +34,7 @@ struct kvm_pmu { u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx); void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val); u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu); +void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu); void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu); void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu); void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val); @@ -71,6 +72,7 @@ static inline u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu) { return 0; } +static inline void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val) {} diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index f645c0fbf7ec..c704fa696184 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -340,6 +340,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) /* Set up the timer */ kvm_timer_vcpu_init(vcpu); + kvm_pmu_vcpu_init(vcpu); + kvm_arm_reset_debug_ptr(vcpu); return kvm_vgic_vcpu_init(vcpu); diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c index 3dd8238ed246..362a01886bab 100644 --- a/virt/kvm/arm/pmu.c +++ b/virt/kvm/arm/pmu.c @@ -214,6 +214,20 @@ static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc) kvm_pmu_release_perf_event(pmc); } +/** + * kvm_pmu_vcpu_init - assign pmu counter idx for cpu + * @vcpu: The vcpu pointer + * + */ +void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_pmu *pmu = &vcpu->arch.pmu; + + for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) + pmu->pmc[i].idx = i; +} + /** * kvm_pmu_vcpu_reset - reset pmu state for cpu * @vcpu: The vcpu pointer @@ -224,10 +238,8 @@ void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) int i; struct kvm_pmu *pmu = &vcpu->arch.pmu; - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { + for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) kvm_pmu_stop_counter(vcpu, &pmu->pmc[i]); - pmu->pmc[i].idx = i; - } bitmap_zero(vcpu->arch.pmu.chained, ARMV8_PMU_MAX_COUNTER_PAIRS); } From 3d584a3c85d6fe2cf878f220d4ad7145e7f89218 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Fri, 26 Jul 2019 13:27:05 +0200 Subject: [PATCH 02/18] arm64: KVM: regmap: Fix unexpected switch fall-through MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When fall-through warnings was enabled by default, commit d93512ef0f0e ("Makefile: Globally enable fall-through warning"), the following warnings was starting to show up: In file included from ../arch/arm64/include/asm/kvm_emulate.h:19, from ../arch/arm64/kvm/regmap.c:13: ../arch/arm64/kvm/regmap.c: In function ‘vcpu_write_spsr32’: ../arch/arm64/include/asm/kvm_hyp.h:31:3: warning: this statement may fall through [-Wimplicit-fallthrough=] asm volatile(ALTERNATIVE(__msr_s(r##nvh, "%x0"), \ ^~~ ../arch/arm64/include/asm/kvm_hyp.h:46:31: note: in expansion of macro ‘write_sysreg_elx’ #define write_sysreg_el1(v,r) write_sysreg_elx(v, r, _EL1, _EL12) ^~~~~~~~~~~~~~~~ ../arch/arm64/kvm/regmap.c:180:3: note: in expansion of macro ‘write_sysreg_el1’ write_sysreg_el1(v, SYS_SPSR); ^~~~~~~~~~~~~~~~ ../arch/arm64/kvm/regmap.c:181:2: note: here case KVM_SPSR_ABT: ^~~~ In file included from ../arch/arm64/include/asm/cputype.h:132, from ../arch/arm64/include/asm/cache.h:8, from ../include/linux/cache.h:6, from ../include/linux/printk.h:9, from ../include/linux/kernel.h:15, from ../include/asm-generic/bug.h:18, from ../arch/arm64/include/asm/bug.h:26, from ../include/linux/bug.h:5, from ../include/linux/mmdebug.h:5, from ../include/linux/mm.h:9, from ../arch/arm64/kvm/regmap.c:11: ../arch/arm64/include/asm/sysreg.h:837:2: warning: this statement may fall through [-Wimplicit-fallthrough=] asm volatile("msr " __stringify(r) ", %x0" \ ^~~ ../arch/arm64/kvm/regmap.c:182:3: note: in expansion of macro ‘write_sysreg’ write_sysreg(v, spsr_abt); ^~~~~~~~~~~~ ../arch/arm64/kvm/regmap.c:183:2: note: here case KVM_SPSR_UND: ^~~~ Rework to add a 'break;' in the swich-case since it didn't have that, leading to an interresting set of bugs. Cc: stable@vger.kernel.org # v4.17+ Fixes: a892819560c4 ("KVM: arm64: Prepare to handle deferred save/restore of 32-bit registers") Signed-off-by: Anders Roxell [maz: reworked commit message, fixed stable range] Signed-off-by: Marc Zyngier --- arch/arm64/kvm/regmap.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/kvm/regmap.c b/arch/arm64/kvm/regmap.c index 0d60e4f0af66..a900181e3867 100644 --- a/arch/arm64/kvm/regmap.c +++ b/arch/arm64/kvm/regmap.c @@ -178,13 +178,18 @@ void vcpu_write_spsr32(struct kvm_vcpu *vcpu, unsigned long v) switch (spsr_idx) { case KVM_SPSR_SVC: write_sysreg_el1(v, SYS_SPSR); + break; case KVM_SPSR_ABT: write_sysreg(v, spsr_abt); + break; case KVM_SPSR_UND: write_sysreg(v, spsr_und); + break; case KVM_SPSR_IRQ: write_sysreg(v, spsr_irq); + break; case KVM_SPSR_FIQ: write_sysreg(v, spsr_fiq); + break; } } From 1a8248c74c81a15a32dc3344fb5c622e19072791 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Fri, 26 Jul 2019 13:28:31 +0200 Subject: [PATCH 03/18] KVM: arm: vgic-v3: Mark expected switch fall-through MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When fall-through warnings was enabled by default the following warnings was starting to show up: ../virt/kvm/arm/hyp/vgic-v3-sr.c: In function ‘__vgic_v3_save_aprs’: ../virt/kvm/arm/hyp/vgic-v3-sr.c:351:24: warning: this statement may fall through [-Wimplicit-fallthrough=] cpu_if->vgic_ap0r[2] = __vgic_v3_read_ap0rn(2); ~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~ ../virt/kvm/arm/hyp/vgic-v3-sr.c:352:2: note: here case 6: ^~~~ ../virt/kvm/arm/hyp/vgic-v3-sr.c:353:24: warning: this statement may fall through [-Wimplicit-fallthrough=] cpu_if->vgic_ap0r[1] = __vgic_v3_read_ap0rn(1); ~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~ ../virt/kvm/arm/hyp/vgic-v3-sr.c:354:2: note: here default: ^~~~~~~ Rework so that the compiler doesn't warn about fall-through. Fixes: d93512ef0f0e ("Makefile: Globally enable fall-through warning") Signed-off-by: Anders Roxell Signed-off-by: Marc Zyngier --- virt/kvm/arm/hyp/vgic-v3-sr.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c index 254c5f190a3d..ccf1fde9836c 100644 --- a/virt/kvm/arm/hyp/vgic-v3-sr.c +++ b/virt/kvm/arm/hyp/vgic-v3-sr.c @@ -349,8 +349,10 @@ void __hyp_text __vgic_v3_save_aprs(struct kvm_vcpu *vcpu) case 7: cpu_if->vgic_ap0r[3] = __vgic_v3_read_ap0rn(3); cpu_if->vgic_ap0r[2] = __vgic_v3_read_ap0rn(2); + /* Fall through */ case 6: cpu_if->vgic_ap0r[1] = __vgic_v3_read_ap0rn(1); + /* Fall through */ default: cpu_if->vgic_ap0r[0] = __vgic_v3_read_ap0rn(0); } @@ -359,8 +361,10 @@ void __hyp_text __vgic_v3_save_aprs(struct kvm_vcpu *vcpu) case 7: cpu_if->vgic_ap1r[3] = __vgic_v3_read_ap1rn(3); cpu_if->vgic_ap1r[2] = __vgic_v3_read_ap1rn(2); + /* Fall through */ case 6: cpu_if->vgic_ap1r[1] = __vgic_v3_read_ap1rn(1); + /* Fall through */ default: cpu_if->vgic_ap1r[0] = __vgic_v3_read_ap1rn(0); } @@ -382,8 +386,10 @@ void __hyp_text __vgic_v3_restore_aprs(struct kvm_vcpu *vcpu) case 7: __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[3], 3); __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[2], 2); + /* Fall through */ case 6: __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[1], 1); + /* Fall through */ default: __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[0], 0); } @@ -392,8 +398,10 @@ void __hyp_text __vgic_v3_restore_aprs(struct kvm_vcpu *vcpu) case 7: __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[3], 3); __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[2], 2); + /* Fall through */ case 6: __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[1], 1); + /* Fall through */ default: __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[0], 0); } From 6701c619fa082e6660ecd7573fbad2177380c7cc Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Sat, 13 Jul 2019 04:40:54 +0000 Subject: [PATCH 04/18] KVM: arm64: Update kvm_arm_exception_class and esr_class_str for new EC We've added two ESR exception classes for new ARM hardware extensions: ESR_ELx_EC_PAC and ESR_ELx_EC_SVE, but failed to update the strings used in tracing and other debug. Let's update "kvm_arm_exception_class" for these two EC, which the new EC will be visible to user-space via kvm_exit trace events Also update to "esr_class_str" for ESR_ELx_EC_PAC, by which we can get more readable debug info. Cc: Marc Zyngier Cc: Catalin Marinas Cc: Will Deacon Cc: Mark Rutland Cc: Dave Martin Reviewed-by: James Morse Signed-off-by: Zenghui Yu Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_arm.h | 7 ++++--- arch/arm64/kernel/traps.c | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index a8b205e5c4a8..ddf9d762ac62 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -316,9 +316,10 @@ #define kvm_arm_exception_class \ ECN(UNKNOWN), ECN(WFx), ECN(CP15_32), ECN(CP15_64), ECN(CP14_MR), \ - ECN(CP14_LS), ECN(FP_ASIMD), ECN(CP10_ID), ECN(CP14_64), ECN(SVC64), \ - ECN(HVC64), ECN(SMC64), ECN(SYS64), ECN(IMP_DEF), ECN(IABT_LOW), \ - ECN(IABT_CUR), ECN(PC_ALIGN), ECN(DABT_LOW), ECN(DABT_CUR), \ + ECN(CP14_LS), ECN(FP_ASIMD), ECN(CP10_ID), ECN(PAC), ECN(CP14_64), \ + ECN(SVC64), ECN(HVC64), ECN(SMC64), ECN(SYS64), ECN(SVE), \ + ECN(IMP_DEF), ECN(IABT_LOW), ECN(IABT_CUR), \ + ECN(PC_ALIGN), ECN(DABT_LOW), ECN(DABT_CUR), \ ECN(SP_ALIGN), ECN(FP_EXC32), ECN(FP_EXC64), ECN(SERROR), \ ECN(BREAKPT_LOW), ECN(BREAKPT_CUR), ECN(SOFTSTP_LOW), \ ECN(SOFTSTP_CUR), ECN(WATCHPT_LOW), ECN(WATCHPT_CUR), \ diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 8c03456dade6..969e1565152b 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -734,6 +734,7 @@ static const char *esr_class_str[] = { [ESR_ELx_EC_CP14_LS] = "CP14 LDC/STC", [ESR_ELx_EC_FP_ASIMD] = "ASIMD", [ESR_ELx_EC_CP10_ID] = "CP10 MRC/VMRS", + [ESR_ELx_EC_PAC] = "PAC", [ESR_ELx_EC_CP14_64] = "CP14 MCRR/MRRC", [ESR_ELx_EC_ILL] = "PSTATE.IL", [ESR_ELx_EC_SVC32] = "SVC (AArch32)", From cdb2d3ee0436d74fa9092f2df46aaa6f9e03c969 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Fri, 26 Jul 2019 13:27:10 +0200 Subject: [PATCH 05/18] arm64: KVM: hyp: debug-sr: Mark expected switch fall-through MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When fall-through warnings was enabled by default the following warnings was starting to show up: ../arch/arm64/kvm/hyp/debug-sr.c: In function ‘__debug_save_state’: ../arch/arm64/kvm/hyp/debug-sr.c:20:19: warning: this statement may fall through [-Wimplicit-fallthrough=] case 15: ptr[15] = read_debug(reg, 15); \ ../arch/arm64/kvm/hyp/debug-sr.c:113:2: note: in expansion of macro ‘save_debug’ save_debug(dbg->dbg_bcr, dbgbcr, brps); ^~~~~~~~~~ ../arch/arm64/kvm/hyp/debug-sr.c:21:2: note: here case 14: ptr[14] = read_debug(reg, 14); \ ^~~~ ../arch/arm64/kvm/hyp/debug-sr.c:113:2: note: in expansion of macro ‘save_debug’ save_debug(dbg->dbg_bcr, dbgbcr, brps); ^~~~~~~~~~ ../arch/arm64/kvm/hyp/debug-sr.c:21:19: warning: this statement may fall through [-Wimplicit-fallthrough=] case 14: ptr[14] = read_debug(reg, 14); \ ../arch/arm64/kvm/hyp/debug-sr.c:113:2: note: in expansion of macro ‘save_debug’ save_debug(dbg->dbg_bcr, dbgbcr, brps); ^~~~~~~~~~ ../arch/arm64/kvm/hyp/debug-sr.c:22:2: note: here case 13: ptr[13] = read_debug(reg, 13); \ ^~~~ ../arch/arm64/kvm/hyp/debug-sr.c:113:2: note: in expansion of macro ‘save_debug’ save_debug(dbg->dbg_bcr, dbgbcr, brps); ^~~~~~~~~~ Rework to add a 'Fall through' comment where the compiler warned about fall-through, hence silencing the warning. Fixes: d93512ef0f0e ("Makefile: Globally enable fall-through warning") Signed-off-by: Anders Roxell [maz: fixed commit message] Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/debug-sr.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c index 26781da3ad3e..0fc9872a1467 100644 --- a/arch/arm64/kvm/hyp/debug-sr.c +++ b/arch/arm64/kvm/hyp/debug-sr.c @@ -18,40 +18,70 @@ #define save_debug(ptr,reg,nr) \ switch (nr) { \ case 15: ptr[15] = read_debug(reg, 15); \ + /* Fall through */ \ case 14: ptr[14] = read_debug(reg, 14); \ + /* Fall through */ \ case 13: ptr[13] = read_debug(reg, 13); \ + /* Fall through */ \ case 12: ptr[12] = read_debug(reg, 12); \ + /* Fall through */ \ case 11: ptr[11] = read_debug(reg, 11); \ + /* Fall through */ \ case 10: ptr[10] = read_debug(reg, 10); \ + /* Fall through */ \ case 9: ptr[9] = read_debug(reg, 9); \ + /* Fall through */ \ case 8: ptr[8] = read_debug(reg, 8); \ + /* Fall through */ \ case 7: ptr[7] = read_debug(reg, 7); \ + /* Fall through */ \ case 6: ptr[6] = read_debug(reg, 6); \ + /* Fall through */ \ case 5: ptr[5] = read_debug(reg, 5); \ + /* Fall through */ \ case 4: ptr[4] = read_debug(reg, 4); \ + /* Fall through */ \ case 3: ptr[3] = read_debug(reg, 3); \ + /* Fall through */ \ case 2: ptr[2] = read_debug(reg, 2); \ + /* Fall through */ \ case 1: ptr[1] = read_debug(reg, 1); \ + /* Fall through */ \ default: ptr[0] = read_debug(reg, 0); \ } #define restore_debug(ptr,reg,nr) \ switch (nr) { \ case 15: write_debug(ptr[15], reg, 15); \ + /* Fall through */ \ case 14: write_debug(ptr[14], reg, 14); \ + /* Fall through */ \ case 13: write_debug(ptr[13], reg, 13); \ + /* Fall through */ \ case 12: write_debug(ptr[12], reg, 12); \ + /* Fall through */ \ case 11: write_debug(ptr[11], reg, 11); \ + /* Fall through */ \ case 10: write_debug(ptr[10], reg, 10); \ + /* Fall through */ \ case 9: write_debug(ptr[9], reg, 9); \ + /* Fall through */ \ case 8: write_debug(ptr[8], reg, 8); \ + /* Fall through */ \ case 7: write_debug(ptr[7], reg, 7); \ + /* Fall through */ \ case 6: write_debug(ptr[6], reg, 6); \ + /* Fall through */ \ case 5: write_debug(ptr[5], reg, 5); \ + /* Fall through */ \ case 4: write_debug(ptr[4], reg, 4); \ + /* Fall through */ \ case 3: write_debug(ptr[3], reg, 3); \ + /* Fall through */ \ case 2: write_debug(ptr[2], reg, 2); \ + /* Fall through */ \ case 1: write_debug(ptr[1], reg, 1); \ + /* Fall through */ \ default: write_debug(ptr[0], reg, 0); \ } From a48d06f9b7cedbb8ad7804d1720168b7ee6a34e7 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 1 Aug 2019 11:30:13 +0800 Subject: [PATCH 06/18] KVM: LAPIC: Don't need to wakeup vCPU twice afer timer fire MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kvm_set_pending_timer() will take care to wake up the sleeping vCPU which has pending timer, don't need to check this in apic_timer_expired() again. Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0aa158657f20..685d17c11461 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1548,7 +1548,6 @@ static void kvm_apic_inject_pending_timer_irqs(struct kvm_lapic *apic) static void apic_timer_expired(struct kvm_lapic *apic) { struct kvm_vcpu *vcpu = apic->vcpu; - struct swait_queue_head *q = &vcpu->wq; struct kvm_timer *ktimer = &apic->lapic_timer; if (atomic_read(&apic->lapic_timer.pending)) @@ -1566,13 +1565,6 @@ static void apic_timer_expired(struct kvm_lapic *apic) atomic_inc(&apic->lapic_timer.pending); kvm_set_pending_timer(vcpu); - - /* - * For x86, the atomic_inc() is serialized, thus - * using swait_active() is safe. - */ - if (swait_active(q)) - swake_up_one(q); } static void start_sw_tscdeadline(struct kvm_lapic *apic) From 046ddeed0461b5d270470c253cbb321103d048b6 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 1 Aug 2019 11:30:14 +0800 Subject: [PATCH 07/18] KVM: Check preempted_in_kernel for involuntary preemption MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit preempted_in_kernel is updated in preempt_notifier when involuntary preemption ocurrs, it can be stale when the voluntarily preempted vCPUs are taken into account by kvm_vcpu_on_spin() loop. This patch lets it just check preempted_in_kernel for involuntary preemption. Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 887f3b0c2b60..ed061d8a457c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2508,7 +2508,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) continue; if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu)) continue; - if (yield_to_kernel_mode && !kvm_arch_vcpu_in_kernel(vcpu)) + if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode && + !kvm_arch_vcpu_in_kernel(vcpu)) continue; if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) continue; @@ -4205,7 +4206,7 @@ static void kvm_sched_in(struct preempt_notifier *pn, int cpu) { struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); - vcpu->preempted = false; + WRITE_ONCE(vcpu->preempted, false); WRITE_ONCE(vcpu->ready, false); kvm_arch_sched_in(vcpu, cpu); @@ -4219,7 +4220,7 @@ static void kvm_sched_out(struct preempt_notifier *pn, struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); if (current->state == TASK_RUNNING) { - vcpu->preempted = true; + WRITE_ONCE(vcpu->preempted, true); WRITE_ONCE(vcpu->ready, true); } kvm_arch_vcpu_put(vcpu); From 17e433b54393a6269acbcb792da97791fe1592d8 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 5 Aug 2019 10:03:19 +0800 Subject: [PATCH 08/18] KVM: Fix leak vCPU's VMCS value into other pCPU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After commit d73eb57b80b (KVM: Boost vCPUs that are delivering interrupts), a five years old bug is exposed. Running ebizzy benchmark in three 80 vCPUs VMs on one 80 pCPUs Skylake server, a lot of rcu_sched stall warning splatting in the VMs after stress testing: INFO: rcu_sched detected stalls on CPUs/tasks: { 4 41 57 62 77} (detected by 15, t=60004 jiffies, g=899, c=898, q=15073) Call Trace: flush_tlb_mm_range+0x68/0x140 tlb_flush_mmu.part.75+0x37/0xe0 tlb_finish_mmu+0x55/0x60 zap_page_range+0x142/0x190 SyS_madvise+0x3cd/0x9c0 system_call_fastpath+0x1c/0x21 swait_active() sustains to be true before finish_swait() is called in kvm_vcpu_block(), voluntarily preempted vCPUs are taken into account by kvm_vcpu_on_spin() loop greatly increases the probability condition kvm_arch_vcpu_runnable(vcpu) is checked and can be true, when APICv is enabled the yield-candidate vCPU's VMCS RVI field leaks(by vmx_sync_pir_to_irr()) into spinning-on-a-taken-lock vCPU's current VMCS. This patch fixes it by checking conservatively a subset of events. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Christian Borntraeger Cc: Marc Zyngier Cc: stable@vger.kernel.org Fixes: 98f4a1467 (KVM: add kvm_arch_vcpu_runnable() test to kvm_vcpu_on_spin() loop) Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/powerpc.c | 5 +++++ arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 6 ++++++ arch/x86/kvm/vmx/vmx.c | 6 ++++++ arch/x86/kvm/x86.c | 16 ++++++++++++++++ include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 25 ++++++++++++++++++++++++- 7 files changed, 59 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 0dba7eb24f92..3e34d5fa6708 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -50,6 +50,11 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) return !!(v->arch.pending_exceptions) || kvm_request_pending(v); } +bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) +{ + return kvm_arch_vcpu_runnable(vcpu); +} + bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) { return false; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e74f0711eaaf..fc046ca89d32 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1175,6 +1175,7 @@ struct kvm_x86_ops { int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); + bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu); int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, bool *expired); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 7eafc6907861..d685491fce4d 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5190,6 +5190,11 @@ static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) kvm_vcpu_wake_up(vcpu); } +static bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu) +{ + return false; +} + static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) { unsigned long flags; @@ -7314,6 +7319,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .pmu_ops = &amd_pmu_ops, .deliver_posted_interrupt = svm_deliver_avic_intr, + .dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt, .update_pi_irte = svm_update_pi_irte, .setup_mce = svm_setup_mce, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 074385c86c09..42ed3faa6af8 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6117,6 +6117,11 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) return max_irr; } +static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu) +{ + return pi_test_on(vcpu_to_pi_desc(vcpu)); +} + static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { if (!kvm_vcpu_apicv_active(vcpu)) @@ -7726,6 +7731,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, .sync_pir_to_irr = vmx_sync_pir_to_irr, .deliver_posted_interrupt = vmx_deliver_posted_interrupt, + .dy_apicv_has_pending_interrupt = vmx_dy_apicv_has_pending_interrupt, .set_tss_addr = vmx_set_tss_addr, .set_identity_map_addr = vmx_set_identity_map_addr, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c6d951cbd76c..93b0bd45ac73 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9698,6 +9698,22 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); } +bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) +{ + if (READ_ONCE(vcpu->arch.pv.pv_unhalted)) + return true; + + if (kvm_test_request(KVM_REQ_NMI, vcpu) || + kvm_test_request(KVM_REQ_SMI, vcpu) || + kvm_test_request(KVM_REQ_EVENT, vcpu)) + return true; + + if (vcpu->arch.apicv_active && kvm_x86_ops->dy_apicv_has_pending_interrupt(vcpu)) + return true; + + return false; +} + bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) { return vcpu->arch.preempted_in_kernel; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5c5b5867024c..9e4c2bb90297 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -872,6 +872,7 @@ int kvm_arch_check_processor_compat(void); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); +bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu); #ifndef __KVM_HAVE_ARCH_VM_ALLOC /* diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ed061d8a457c..1f05aeb9da27 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2477,6 +2477,29 @@ static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) #endif } +/* + * Unlike kvm_arch_vcpu_runnable, this function is called outside + * a vcpu_load/vcpu_put pair. However, for most architectures + * kvm_arch_vcpu_runnable does not require vcpu_load. + */ +bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) +{ + return kvm_arch_vcpu_runnable(vcpu); +} + +static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu) +{ + if (kvm_arch_dy_runnable(vcpu)) + return true; + +#ifdef CONFIG_KVM_ASYNC_PF + if (!list_empty_careful(&vcpu->async_pf.done)) + return true; +#endif + + return false; +} + void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) { struct kvm *kvm = me->kvm; @@ -2506,7 +2529,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) continue; if (vcpu == me) continue; - if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu)) + if (swait_active(&vcpu->wq) && !vcpu_dy_runnable(vcpu)) continue; if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode && !kvm_arch_vcpu_in_kernel(vcpu)) From 741cbbae0768b828be2d48331eb371a4f08bbea8 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sat, 3 Aug 2019 08:14:25 +0200 Subject: [PATCH 09/18] KVM: remove kvm_arch_has_vcpu_debugfs() There is no need for this function as all arches have to implement kvm_arch_create_vcpu_debugfs() no matter what. A #define symbol let us actually simplify the code. Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 10 ---------- arch/powerpc/kvm/powerpc.c | 10 ---------- arch/s390/kvm/kvm-s390.c | 10 ---------- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/debugfs.c | 5 ----- include/linux/kvm_host.h | 3 ++- virt/kvm/arm/arm.c | 5 ----- virt/kvm/kvm_main.c | 5 ++--- 8 files changed, 6 insertions(+), 44 deletions(-) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 2cfe839f0b3a..1109924560d8 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -150,16 +150,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) return 0; } -bool kvm_arch_has_vcpu_debugfs(void) -{ - return false; -} - -int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) -{ - return 0; -} - void kvm_mips_free_vcpus(struct kvm *kvm) { unsigned int i; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 3e34d5fa6708..3e566c2e6066 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -457,16 +457,6 @@ err_out: return -EINVAL; } -bool kvm_arch_has_vcpu_debugfs(void) -{ - return false; -} - -int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) -{ - return 0; -} - void kvm_arch_destroy_vm(struct kvm *kvm) { unsigned int i; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 3f520cd837fb..f329dcb3f44c 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2516,16 +2516,6 @@ out_err: return rc; } -bool kvm_arch_has_vcpu_debugfs(void) -{ - return false; -} - -int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) -{ - return 0; -} - void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { VCPU_EVENT(vcpu, 3, "%s", "free cpu"); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fc046ca89d32..e92725b2a46f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -35,6 +35,8 @@ #include #include +#define __KVM_HAVE_ARCH_VCPU_DEBUGFS + #define KVM_MAX_VCPUS 288 #define KVM_SOFT_MAX_VCPUS 240 #define KVM_MAX_VCPU_ID 1023 diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index 329361b69d5e..9bd93e0d5f63 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -8,11 +8,6 @@ #include #include "lapic.h" -bool kvm_arch_has_vcpu_debugfs(void) -{ - return true; -} - static int vcpu_get_timer_advance_ns(void *data, u64 *val) { struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9e4c2bb90297..8d34db3c8bc6 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -861,8 +861,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); -bool kvm_arch_has_vcpu_debugfs(void); +#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu); +#endif int kvm_arch_hardware_enable(void); void kvm_arch_hardware_disable(void); diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index acc43242a310..13f5a1aa6d79 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -144,11 +144,6 @@ out_fail_alloc: return ret; } -bool kvm_arch_has_vcpu_debugfs(void) -{ - return false; -} - int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) { return 0; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1f05aeb9da27..4afb1a234018 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2617,12 +2617,10 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu) static int kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) { +#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS char dir_name[ITOA_MAX_LEN * 2]; int ret; - if (!kvm_arch_has_vcpu_debugfs()) - return 0; - if (!debugfs_initialized()) return 0; @@ -2637,6 +2635,7 @@ static int kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) debugfs_remove_recursive(vcpu->debugfs_dentry); return ret; } +#endif return 0; } From 3e7093d045196b1016517631645e874fe903db7e Mon Sep 17 00:00:00 2001 From: Greg KH Date: Wed, 31 Jul 2019 20:56:20 +0200 Subject: [PATCH 10/18] KVM: no need to check return value of debugfs_create functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When calling debugfs functions, there is no need to ever check the return value. The function can work or not, but the code logic should never do something different based on this. Also, when doing this, change kvm_arch_create_vcpu_debugfs() to return void instead of an integer, as we should not care at all about if this function actually does anything or not. Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Cc: Signed-off-by: Greg Kroah-Hartman Signed-off-by: Paolo Bonzini --- arch/x86/kvm/debugfs.c | 41 +++++++++++++--------------------------- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 21 +++++--------------- 3 files changed, 19 insertions(+), 45 deletions(-) diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index 9bd93e0d5f63..018aebce33ff 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -43,37 +43,22 @@ static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_frac_fops, vcpu_get_tsc_scaling_frac_bits, NULL, "%llu\n"); -int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) +void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) { - struct dentry *ret; + debugfs_create_file("tsc-offset", 0444, vcpu->debugfs_dentry, vcpu, + &vcpu_tsc_offset_fops); - ret = debugfs_create_file("tsc-offset", 0444, - vcpu->debugfs_dentry, - vcpu, &vcpu_tsc_offset_fops); - if (!ret) - return -ENOMEM; - - if (lapic_in_kernel(vcpu)) { - ret = debugfs_create_file("lapic_timer_advance_ns", 0444, - vcpu->debugfs_dentry, - vcpu, &vcpu_timer_advance_ns_fops); - if (!ret) - return -ENOMEM; - } + if (lapic_in_kernel(vcpu)) + debugfs_create_file("lapic_timer_advance_ns", 0444, + vcpu->debugfs_dentry, vcpu, + &vcpu_timer_advance_ns_fops); if (kvm_has_tsc_control) { - ret = debugfs_create_file("tsc-scaling-ratio", 0444, - vcpu->debugfs_dentry, - vcpu, &vcpu_tsc_scaling_fops); - if (!ret) - return -ENOMEM; - ret = debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444, - vcpu->debugfs_dentry, - vcpu, &vcpu_tsc_scaling_frac_fops); - if (!ret) - return -ENOMEM; - + debugfs_create_file("tsc-scaling-ratio", 0444, + vcpu->debugfs_dentry, vcpu, + &vcpu_tsc_scaling_fops); + debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444, + vcpu->debugfs_dentry, vcpu, + &vcpu_tsc_scaling_frac_fops); } - - return 0; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8d34db3c8bc6..fcb46b3374c6 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -862,7 +862,7 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS -int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu); +void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu); #endif int kvm_arch_hardware_enable(void); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4afb1a234018..4feceaa03fb1 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2615,29 +2615,20 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu) return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC); } -static int kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) +static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) { #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS char dir_name[ITOA_MAX_LEN * 2]; - int ret; if (!debugfs_initialized()) - return 0; + return; snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id); vcpu->debugfs_dentry = debugfs_create_dir(dir_name, - vcpu->kvm->debugfs_dentry); - if (!vcpu->debugfs_dentry) - return -ENOMEM; + vcpu->kvm->debugfs_dentry); - ret = kvm_arch_create_vcpu_debugfs(vcpu); - if (ret < 0) { - debugfs_remove_recursive(vcpu->debugfs_dentry); - return ret; - } + kvm_arch_create_vcpu_debugfs(vcpu); #endif - - return 0; } /* @@ -2672,9 +2663,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) if (r) goto vcpu_destroy; - r = kvm_create_vcpu_debugfs(vcpu); - if (r) - goto vcpu_destroy; + kvm_create_vcpu_debugfs(vcpu); mutex_lock(&kvm->lock); if (kvm_get_vcpu_by_id(kvm, id)) { From 57b76bdb20ecb05c87f4e12b7ced66bc03a976c3 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 25 Jul 2019 12:59:40 +0200 Subject: [PATCH 11/18] x86: kvm: remove useless calls to kvm_para_available Most code in arch/x86/kernel/kvm.c is called through x86_hyper_kvm, and thus only runs if KVM has been detected. There is no need to check again for the CPUID base. Cc: Sergio Lopez Cc: Jan Kiszka Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 82caf01b63dd..edd2179ad2da 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -308,9 +308,6 @@ static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val) static void kvm_guest_cpu_init(void) { - if (!kvm_para_available()) - return; - if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); @@ -625,9 +622,6 @@ static void __init kvm_guest_init(void) { int i; - if (!kvm_para_available()) - return; - paravirt_ops_setup(); register_reboot_notifier(&kvm_pv_reboot_nb); for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) @@ -847,8 +841,6 @@ asm( */ void __init kvm_spinlock_init(void) { - if (!kvm_para_available()) - return; /* Does host kernel support KVM_FEATURE_PV_UNHALT? */ if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) return; From 5eeaf10eec394b28fad2c58f1f5c3a5da0e87d1c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 2 Aug 2019 10:28:32 +0100 Subject: [PATCH 12/18] KVM: arm/arm64: Sync ICH_VMCR_EL2 back when about to block Since commit commit 328e56647944 ("KVM: arm/arm64: vgic: Defer touching GICH_VMCR to vcpu_load/put"), we leave ICH_VMCR_EL2 (or its GICv2 equivalent) loaded as long as we can, only syncing it back when we're scheduled out. There is a small snag with that though: kvm_vgic_vcpu_pending_irq(), which is indirectly called from kvm_vcpu_check_block(), needs to evaluate the guest's view of ICC_PMR_EL1. At the point were we call kvm_vcpu_check_block(), the vcpu is still loaded, and whatever changes to PMR is not visible in memory until we do a vcpu_put(). Things go really south if the guest does the following: mov x0, #0 // or any small value masking interrupts msr ICC_PMR_EL1, x0 [vcpu preempted, then rescheduled, VMCR sampled] mov x0, #ff // allow all interrupts msr ICC_PMR_EL1, x0 wfi // traps to EL2, so samping of VMCR [interrupt arrives just after WFI] Here, the hypervisor's view of PMR is zero, while the guest has enabled its interrupts. kvm_vgic_vcpu_pending_irq() will then say that no interrupts are pending (despite an interrupt being received) and we'll block for no reason. If the guest doesn't have a periodic interrupt firing once it has blocked, it will stay there forever. To avoid this unfortuante situation, let's resync VMCR from kvm_arch_vcpu_blocking(), ensuring that a following kvm_vcpu_check_block() will observe the latest value of PMR. This has been found by booting an arm64 Linux guest with the pseudo NMI feature, and thus using interrupt priorities to mask interrupts instead of the usual PSTATE masking. Cc: stable@vger.kernel.org # 4.12 Fixes: 328e56647944 ("KVM: arm/arm64: vgic: Defer touching GICH_VMCR to vcpu_load/put") Signed-off-by: Marc Zyngier --- include/kvm/arm_vgic.h | 1 + virt/kvm/arm/arm.c | 11 +++++++++++ virt/kvm/arm/vgic/vgic-v2.c | 9 ++++++++- virt/kvm/arm/vgic/vgic-v3.c | 7 ++++++- virt/kvm/arm/vgic/vgic.c | 11 +++++++++++ virt/kvm/arm/vgic/vgic.h | 2 ++ 6 files changed, 39 insertions(+), 2 deletions(-) diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 46bbc949c20a..7a30524a80ee 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -350,6 +350,7 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu); void kvm_vgic_load(struct kvm_vcpu *vcpu); void kvm_vgic_put(struct kvm_vcpu *vcpu); +void kvm_vgic_vmcr_sync(struct kvm_vcpu *vcpu); #define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel)) #define vgic_initialized(k) ((k)->arch.vgic.initialized) diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index c704fa696184..482b20256fa8 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -323,6 +323,17 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) { + /* + * If we're about to block (most likely because we've just hit a + * WFI), we need to sync back the state of the GIC CPU interface + * so that we have the lastest PMR and group enables. This ensures + * that kvm_arch_vcpu_runnable has up-to-date data to decide + * whether we have pending interrupts. + */ + preempt_disable(); + kvm_vgic_vmcr_sync(vcpu); + preempt_enable(); + kvm_vgic_v4_enable_doorbell(vcpu); } diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c index 6dd5ad706c92..96aab77d0471 100644 --- a/virt/kvm/arm/vgic/vgic-v2.c +++ b/virt/kvm/arm/vgic/vgic-v2.c @@ -484,10 +484,17 @@ void vgic_v2_load(struct kvm_vcpu *vcpu) kvm_vgic_global_state.vctrl_base + GICH_APR); } -void vgic_v2_put(struct kvm_vcpu *vcpu) +void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu) { struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR); +} + +void vgic_v2_put(struct kvm_vcpu *vcpu) +{ + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; + + vgic_v2_vmcr_sync(vcpu); cpu_if->vgic_apr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_APR); } diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index c2c9ce009f63..0c653a1e5215 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -662,12 +662,17 @@ void vgic_v3_load(struct kvm_vcpu *vcpu) __vgic_v3_activate_traps(vcpu); } -void vgic_v3_put(struct kvm_vcpu *vcpu) +void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; if (likely(cpu_if->vgic_sre)) cpu_if->vgic_vmcr = kvm_call_hyp_ret(__vgic_v3_read_vmcr); +} + +void vgic_v3_put(struct kvm_vcpu *vcpu) +{ + vgic_v3_vmcr_sync(vcpu); kvm_call_hyp(__vgic_v3_save_aprs, vcpu); diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index 04786c8ec77e..13d4b38a94ec 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -919,6 +919,17 @@ void kvm_vgic_put(struct kvm_vcpu *vcpu) vgic_v3_put(vcpu); } +void kvm_vgic_vmcr_sync(struct kvm_vcpu *vcpu) +{ + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) + return; + + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_vmcr_sync(vcpu); + else + vgic_v3_vmcr_sync(vcpu); +} + int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 57205beaa981..11adbdac1d56 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -193,6 +193,7 @@ int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, void vgic_v2_init_lrs(void); void vgic_v2_load(struct kvm_vcpu *vcpu); void vgic_v2_put(struct kvm_vcpu *vcpu); +void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu); void vgic_v2_save_state(struct kvm_vcpu *vcpu); void vgic_v2_restore_state(struct kvm_vcpu *vcpu); @@ -223,6 +224,7 @@ bool vgic_v3_check_base(struct kvm *kvm); void vgic_v3_load(struct kvm_vcpu *vcpu); void vgic_v3_put(struct kvm_vcpu *vcpu); +void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu); bool vgic_has_its(struct kvm *kvm); int kvm_vgic_register_its_device(void); From 03fdfb2690099c19160a3f2c5b77db60b3afeded Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 5 Aug 2019 10:34:51 +0100 Subject: [PATCH 13/18] KVM: arm64: Don't write junk to sysregs on reset At the moment, the way we reset system registers is mildly insane: We write junk to them, call the reset functions, and then check that we have something else in them. The "fun" thing is that this can happen while the guest is running (PSCI, for example). If anything in KVM has to evaluate the state of a system register while junk is in there, bad thing may happen. Let's stop doing that. Instead, we track that we have called a reset function for that register, and assume that the reset function has done something. This requires fixing a couple of sysreg refinition in the trap table. In the end, the very need of this reset check is pretty dubious, as it doesn't check everything (a lot of the sysregs leave outside of the sys_regs[] array). It may well be axed in the near future. Tested-by: Zenghui Yu Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index f26e181d881c..2071260a275b 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -632,7 +632,7 @@ static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) */ val = ((pmcr & ~ARMV8_PMU_PMCR_MASK) | (ARMV8_PMU_PMCR_MASK & 0xdecafbad)) & (~ARMV8_PMU_PMCR_E); - __vcpu_sys_reg(vcpu, PMCR_EL0) = val; + __vcpu_sys_reg(vcpu, r->reg) = val; } static bool check_pmu_access_disabled(struct kvm_vcpu *vcpu, u64 flags) @@ -981,13 +981,13 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, /* Silly macro to expand the DBG{BCR,BVR,WVR,WCR}n_EL1 registers in one go */ #define DBG_BCR_BVR_WCR_WVR_EL1(n) \ { SYS_DESC(SYS_DBGBVRn_EL1(n)), \ - trap_bvr, reset_bvr, n, 0, get_bvr, set_bvr }, \ + trap_bvr, reset_bvr, 0, 0, get_bvr, set_bvr }, \ { SYS_DESC(SYS_DBGBCRn_EL1(n)), \ - trap_bcr, reset_bcr, n, 0, get_bcr, set_bcr }, \ + trap_bcr, reset_bcr, 0, 0, get_bcr, set_bcr }, \ { SYS_DESC(SYS_DBGWVRn_EL1(n)), \ - trap_wvr, reset_wvr, n, 0, get_wvr, set_wvr }, \ + trap_wvr, reset_wvr, 0, 0, get_wvr, set_wvr }, \ { SYS_DESC(SYS_DBGWCRn_EL1(n)), \ - trap_wcr, reset_wcr, n, 0, get_wcr, set_wcr } + trap_wcr, reset_wcr, 0, 0, get_wcr, set_wcr } /* Macro to expand the PMEVCNTRn_EL0 register */ #define PMU_PMEVCNTR_EL0(n) \ @@ -1540,7 +1540,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_CSSELR_EL1), access_csselr, reset_unknown, CSSELR_EL1 }, { SYS_DESC(SYS_CTR_EL0), access_ctr }, - { SYS_DESC(SYS_PMCR_EL0), access_pmcr, reset_pmcr, }, + { SYS_DESC(SYS_PMCR_EL0), access_pmcr, reset_pmcr, PMCR_EL0 }, { SYS_DESC(SYS_PMCNTENSET_EL0), access_pmcnten, reset_unknown, PMCNTENSET_EL0 }, { SYS_DESC(SYS_PMCNTENCLR_EL0), access_pmcnten, NULL, PMCNTENSET_EL0 }, { SYS_DESC(SYS_PMOVSCLR_EL0), access_pmovs, NULL, PMOVSSET_EL0 }, @@ -2254,13 +2254,19 @@ static int emulate_sys_reg(struct kvm_vcpu *vcpu, } static void reset_sys_reg_descs(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *table, size_t num) + const struct sys_reg_desc *table, size_t num, + unsigned long *bmap) { unsigned long i; for (i = 0; i < num; i++) - if (table[i].reset) + if (table[i].reset) { + int reg = table[i].reg; + table[i].reset(vcpu, &table[i]); + if (reg > 0 && reg < NR_SYS_REGS) + set_bit(reg, bmap); + } } /** @@ -2774,18 +2780,16 @@ void kvm_reset_sys_regs(struct kvm_vcpu *vcpu) { size_t num; const struct sys_reg_desc *table; - - /* Catch someone adding a register without putting in reset entry. */ - memset(&vcpu->arch.ctxt.sys_regs, 0x42, sizeof(vcpu->arch.ctxt.sys_regs)); + DECLARE_BITMAP(bmap, NR_SYS_REGS) = { 0, }; /* Generic chip reset first (so target could override). */ - reset_sys_reg_descs(vcpu, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); + reset_sys_reg_descs(vcpu, sys_reg_descs, ARRAY_SIZE(sys_reg_descs), bmap); table = get_target_table(vcpu->arch.target, true, &num); - reset_sys_reg_descs(vcpu, table, num); + reset_sys_reg_descs(vcpu, table, num, bmap); for (num = 1; num < NR_SYS_REGS; num++) { - if (WARN(__vcpu_sys_reg(vcpu, num) == 0x4242424242424242, + if (WARN(!test_bit(num, bmap), "Didn't reset __vcpu_sys_reg(%zi)\n", num)) break; } From c69509c70aa45a8c4954c88c629a64acf4ee4a36 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 5 Aug 2019 10:34:51 +0100 Subject: [PATCH 14/18] KVM: arm: Don't write junk to CP15 registers on reset At the moment, the way we reset CP15 registers is mildly insane: We write junk to them, call the reset functions, and then check that we have something else in them. The "fun" thing is that this can happen while the guest is running (PSCI, for example). If anything in KVM has to evaluate the state of a CP15 register while junk is in there, bad thing may happen. Let's stop doing that. Instead, we track that we have called a reset function for that register, and assume that the reset function has done something. In the end, the very need of this reset check is pretty dubious, as it doesn't check everything (a lot of the CP15 reg leave outside of the cp15_regs[] array). It may well be axed in the near future. Signed-off-by: Marc Zyngier --- arch/arm/kvm/coproc.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c index d2806bcff8bb..07745ee022a1 100644 --- a/arch/arm/kvm/coproc.c +++ b/arch/arm/kvm/coproc.c @@ -651,13 +651,22 @@ int kvm_handle_cp14_64(struct kvm_vcpu *vcpu, struct kvm_run *run) } static void reset_coproc_regs(struct kvm_vcpu *vcpu, - const struct coproc_reg *table, size_t num) + const struct coproc_reg *table, size_t num, + unsigned long *bmap) { unsigned long i; for (i = 0; i < num; i++) - if (table[i].reset) + if (table[i].reset) { + int reg = table[i].reg; + table[i].reset(vcpu, &table[i]); + if (reg > 0 && reg < NR_CP15_REGS) { + set_bit(reg, bmap); + if (table[i].is_64bit) + set_bit(reg + 1, bmap); + } + } } static struct coproc_params decode_32bit_hsr(struct kvm_vcpu *vcpu) @@ -1432,17 +1441,15 @@ void kvm_reset_coprocs(struct kvm_vcpu *vcpu) { size_t num; const struct coproc_reg *table; - - /* Catch someone adding a register without putting in reset entry. */ - memset(vcpu->arch.ctxt.cp15, 0x42, sizeof(vcpu->arch.ctxt.cp15)); + DECLARE_BITMAP(bmap, NR_CP15_REGS) = { 0, }; /* Generic chip reset first (so target could override). */ - reset_coproc_regs(vcpu, cp15_regs, ARRAY_SIZE(cp15_regs)); + reset_coproc_regs(vcpu, cp15_regs, ARRAY_SIZE(cp15_regs), bmap); table = get_target_table(vcpu->arch.target, &num); - reset_coproc_regs(vcpu, table, num); + reset_coproc_regs(vcpu, table, num, bmap); for (num = 1; num < NR_CP15_REGS; num++) - WARN(vcpu_cp15(vcpu, num) == 0x42424242, + WARN(!test_bit(num, bmap), "Didn't reset vcpu_cp15(vcpu, %zi)", num); } From 16e604a437c89751dc626c9e90cf88ba93c5be64 Mon Sep 17 00:00:00 2001 From: Alexandru Elisei Date: Wed, 7 Aug 2019 10:53:20 +0100 Subject: [PATCH 15/18] KVM: arm/arm64: vgic: Reevaluate level sensitive interrupts on enable A HW mapped level sensitive interrupt asserted by a device will not be put into the ap_list if it is disabled at the VGIC level. When it is enabled again, it will be inserted into the ap_list and written to a list register on guest entry regardless of the state of the device. We could argue that this can also happen on real hardware, when the command to enable the interrupt reached the GIC before the device had the chance to de-assert the interrupt signal; however, we emulate the distributor and redistributors in software and we can do better than that. Signed-off-by: Alexandru Elisei Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic-mmio.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c index 3ba7278fb533..44efc2ff863f 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ b/virt/kvm/arm/vgic/vgic-mmio.c @@ -113,6 +113,22 @@ void vgic_mmio_write_senable(struct kvm_vcpu *vcpu, struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); raw_spin_lock_irqsave(&irq->irq_lock, flags); + if (vgic_irq_is_mapped_level(irq)) { + bool was_high = irq->line_level; + + /* + * We need to update the state of the interrupt because + * the guest might have changed the state of the device + * while the interrupt was disabled at the VGIC level. + */ + irq->line_level = vgic_get_phys_line_level(irq); + /* + * Deactivate the physical interrupt so the GIC will let + * us know when it is asserted again. + */ + if (!irq->active && was_high && !irq->line_level) + vgic_irq_set_phys_active(irq, false); + } irq->enabled = true; vgic_queue_irq_unlock(vcpu->kvm, irq, flags); From 8f946da73aaa1c3f609bd14d1193f39afa6830c7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 5 Aug 2019 11:11:08 +0200 Subject: [PATCH 16/18] kvm: remove unnecessary PageReserved check The same check is already done in kvm_is_reserved_pfn. Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4feceaa03fb1..c6a91b044d8d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1855,8 +1855,7 @@ void kvm_set_pfn_dirty(kvm_pfn_t pfn) if (!kvm_is_reserved_pfn(pfn)) { struct page *page = pfn_to_page(pfn); - if (!PageReserved(page)) - SetPageDirty(page); + SetPageDirty(page); } } EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); From e2c26537ea7652d123ddb01ceb1aa0e2a5edb7ad Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Wed, 31 Jul 2019 16:28:51 +0200 Subject: [PATCH 17/18] KVM: selftests: Update gitignore file for latest changes The kvm_create_max_vcpus test has been moved to the main directory, and sync_regs_test is now available on s390x, too. Signed-off-by: Thomas Huth Acked-by: Shuah Khan Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 41266af0d3dc..b35da375530a 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -1,7 +1,7 @@ +/s390x/sync_regs_test /x86_64/cr4_cpuid_sync_test /x86_64/evmcs_test /x86_64/hyperv_cpuid -/x86_64/kvm_create_max_vcpus /x86_64/mmio_warning_test /x86_64/platform_info_test /x86_64/set_sregs_test @@ -13,3 +13,4 @@ /x86_64/vmx_tsc_adjust_test /clear_dirty_log_test /dirty_log_test +/kvm_create_max_vcpus From c096397c78f766db972f923433031f2dec01cae0 Mon Sep 17 00:00:00 2001 From: Naresh Kamboju Date: Wed, 7 Aug 2019 13:58:14 +0000 Subject: [PATCH 18/18] selftests: kvm: Adding config fragments selftests kvm test cases need pre-required kernel configs for the test to get pass. Signed-off-by: Naresh Kamboju Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/config | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 tools/testing/selftests/kvm/config diff --git a/tools/testing/selftests/kvm/config b/tools/testing/selftests/kvm/config new file mode 100644 index 000000000000..63ed533f73d6 --- /dev/null +++ b/tools/testing/selftests/kvm/config @@ -0,0 +1,3 @@ +CONFIG_KVM=y +CONFIG_KVM_INTEL=y +CONFIG_KVM_AMD=y