s390: A bunch of fixes and optimizations for interrupt and time

handling.
 
 PPC: Mostly bug fixes.
 
 ARM: No big features, but many small fixes and prerequisites including:
 - a number of fixes for the arch-timer
 - introducing proper level-triggered semantics for the arch-timers
 - a series of patches to synchronously halt a guest (prerequisite for
   IRQ forwarding)
 - some tracepoint improvements
 - a tweak for the EL2 panic handlers
 - some more VGIC cleanups getting rid of redundant state
 
 x86: quite a few changes:
 
 - support for VT-d posted interrupts (i.e. PCI devices can inject
 interrupts directly into vCPUs).  This introduces a new component (in
 virt/lib/) that connects VFIO and KVM together.  The same infrastructure
 will be used for ARM interrupt forwarding as well.
 
 - more Hyper-V features, though the main one Hyper-V synthetic interrupt
 controller will have to wait for 4.5.  These will let KVM expose Hyper-V
 devices.
 
 - nested virtualization now supports VPID (same as PCID but for vCPUs)
 which makes it quite a bit faster
 
 - for future hardware that supports NVDIMM, there is support for clflushopt,
 clwb, pcommit
 
 - support for "split irqchip", i.e. LAPIC in kernel + IOAPIC/PIC/PIT in
 userspace, which reduces the attack surface of the hypervisor
 
 - obligatory smattering of SMM fixes
 
 - on the guest side, stable scheduler clock support was rewritten to not
 require help from the hypervisor.
 -----BEGIN PGP SIGNATURE-----
 Version: GnuPG v2.0.22 (GNU/Linux)
 
 iQEcBAABAgAGBQJWO2IQAAoJEL/70l94x66D/K0H/3AovAgYmJQToZlimsktMk6a
 f2xhdIqfU5lIQQh5uNBCfL3o9o8H9Py1ym7aEw3fmztPHHJYc91oTatt2UEKhmEw
 VtZHp/dFHt3hwaIdXmjRPEXiYctraKCyrhaUYdWmUYkoKi7lW5OL5h+S7frG2U6u
 p/hFKnHRZfXHr6NSgIqvYkKqtnc+C0FWY696IZMzgCksOO8jB1xrxoSN3tANW3oJ
 PDV+4og0fN/Fr1capJUFEc/fejREHneANvlKrLaa8ht0qJQutoczNADUiSFLcMPG
 iHljXeDsv5eyjMtUuIL8+MPzcrIt/y4rY41ZPiKggxULrXc6H+JJL/e/zThZpXc=
 =iv2z
 -----END PGP SIGNATURE-----

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM updates from Paolo Bonzini:
 "First batch of KVM changes for 4.4.

  s390:
     A bunch of fixes and optimizations for interrupt and time handling.

  PPC:
     Mostly bug fixes.

  ARM:
     No big features, but many small fixes and prerequisites including:

      - a number of fixes for the arch-timer

      - introducing proper level-triggered semantics for the arch-timers

      - a series of patches to synchronously halt a guest (prerequisite
        for IRQ forwarding)

      - some tracepoint improvements

      - a tweak for the EL2 panic handlers

      - some more VGIC cleanups getting rid of redundant state

  x86:
     Quite a few changes:

      - support for VT-d posted interrupts (i.e. PCI devices can inject
        interrupts directly into vCPUs).  This introduces a new
        component (in virt/lib/) that connects VFIO and KVM together.
        The same infrastructure will be used for ARM interrupt
        forwarding as well.

      - more Hyper-V features, though the main one Hyper-V synthetic
        interrupt controller will have to wait for 4.5.  These will let
        KVM expose Hyper-V devices.

      - nested virtualization now supports VPID (same as PCID but for
        vCPUs) which makes it quite a bit faster

      - for future hardware that supports NVDIMM, there is support for
        clflushopt, clwb, pcommit

      - support for "split irqchip", i.e.  LAPIC in kernel +
        IOAPIC/PIC/PIT in userspace, which reduces the attack surface of
        the hypervisor

      - obligatory smattering of SMM fixes

      - on the guest side, stable scheduler clock support was rewritten
        to not require help from the hypervisor"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (123 commits)
  KVM: VMX: Fix commit which broke PML
  KVM: x86: obey KVM_X86_QUIRK_CD_NW_CLEARED in kvm_set_cr0()
  KVM: x86: allow RSM from 64-bit mode
  KVM: VMX: fix SMEP and SMAP without EPT
  KVM: x86: move kvm_set_irq_inatomic to legacy device assignment
  KVM: device assignment: remove pointless #ifdefs
  KVM: x86: merge kvm_arch_set_irq with kvm_set_msi_inatomic
  KVM: x86: zero apic_arb_prio on reset
  drivers/hv: share Hyper-V SynIC constants with userspace
  KVM: x86: handle SMBASE as physical address in RSM
  KVM: x86: add read_phys to x86_emulate_ops
  KVM: x86: removing unused variable
  KVM: don't pointlessly leave KVM_COMPAT=y in non-KVM configs
  KVM: arm/arm64: Merge vgic_set_lr() and vgic_sync_lr_elrsr()
  KVM: arm/arm64: Clean up vgic_retire_lr() and surroundings
  KVM: arm/arm64: Optimize away redundant LR tracking
  KVM: s390: use simple switch statement as multiplexer
  KVM: s390: drop useless newline in debugging data
  KVM: s390: SCA must not cross page boundaries
  KVM: arm: Do not indent the arguments of DECLARE_BITMAP
  ...
This commit is contained in:
Linus Torvalds 2015-11-05 16:26:26 -08:00
commit 933425fb00
89 changed files with 2956 additions and 1029 deletions

View File

@ -1585,6 +1585,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
nosid disable Source ID checking
no_x2apic_optout
BIOS x2APIC opt-out request will be ignored
nopost disable Interrupt Posting
iomem= Disable strict checking of access to MMIO memory
strict regions from userspace.

View File

@ -401,10 +401,9 @@ Capability: basic
Architectures: x86, ppc, mips
Type: vcpu ioctl
Parameters: struct kvm_interrupt (in)
Returns: 0 on success, -1 on error
Returns: 0 on success, negative on failure.
Queues a hardware interrupt vector to be injected. This is only
useful if in-kernel local APIC or equivalent is not used.
Queues a hardware interrupt vector to be injected.
/* for KVM_INTERRUPT */
struct kvm_interrupt {
@ -414,7 +413,14 @@ struct kvm_interrupt {
X86:
Note 'irq' is an interrupt vector, not an interrupt pin or line.
Returns: 0 on success,
-EEXIST if an interrupt is already enqueued
-EINVAL if the irq number is invalid
-ENXIO if the PIC is in the kernel
-EFAULT if the pointer is invalid
Note 'irq' is an interrupt vector, not an interrupt pin or line. This
ioctl is useful if the in-kernel PIC is not used.
PPC:
@ -1598,7 +1604,7 @@ provided event instead of triggering an exit.
struct kvm_ioeventfd {
__u64 datamatch;
__u64 addr; /* legal pio/mmio address */
__u32 len; /* 1, 2, 4, or 8 bytes */
__u32 len; /* 0, 1, 2, 4, or 8 bytes */
__s32 fd;
__u32 flags;
__u8 pad[36];
@ -1621,6 +1627,10 @@ to the registered address is equal to datamatch in struct kvm_ioeventfd.
For virtio-ccw devices, addr contains the subchannel id and datamatch the
virtqueue index.
With KVM_CAP_IOEVENTFD_ANY_LENGTH, a zero length ioeventfd is allowed, and
the kernel will ignore the length of guest write and may get a faster vmexit.
The speedup may only apply to specific architectures, but the ioeventfd will
work anyway.
4.60 KVM_DIRTY_TLB
@ -3309,6 +3319,18 @@ Valid values for 'type' are:
to ignore the request, or to gather VM memory core dump and/or
reset/shutdown of the VM.
/* KVM_EXIT_IOAPIC_EOI */
struct {
__u8 vector;
} eoi;
Indicates that the VCPU's in-kernel local APIC received an EOI for a
level-triggered IOAPIC interrupt. This exit only triggers when the
IOAPIC is implemented in userspace (i.e. KVM_CAP_SPLIT_IRQCHIP is enabled);
the userspace IOAPIC should process the EOI and retrigger the interrupt if
it is still asserted. Vector is the LAPIC interrupt vector for which the
EOI was received.
/* Fix the size of the union. */
char padding[256];
};
@ -3627,6 +3649,26 @@ struct {
KVM handlers should exit to userspace with rc = -EREMOTE.
7.5 KVM_CAP_SPLIT_IRQCHIP
Architectures: x86
Parameters: args[0] - number of routes reserved for userspace IOAPICs
Returns: 0 on success, -1 on error
Create a local apic for each processor in the kernel. This can be used
instead of KVM_CREATE_IRQCHIP if the userspace VMM wishes to emulate the
IOAPIC and PIC (and also the PIT, even though this has to be enabled
separately).
This capability also enables in kernel routing of interrupt requests;
when KVM_CAP_SPLIT_IRQCHIP is enabled, only routes of KVM_IRQ_ROUTING_MSI type are
used in the IRQ routing table. The first args[0] MSI routes are reserved
for the IOAPIC pins. Whenever the LAPIC receives an EOI for these routes,
a KVM_EXIT_IOAPIC_EOI vmexit will be reported to userspace.
Fails if VCPU has already been created, or if the irqchip is already in the
kernel (i.e. KVM_CREATE_IRQCHIP has already been called).
8. Other capabilities.
----------------------

View File

@ -0,0 +1,187 @@
KVM/ARM VGIC Forwarded Physical Interrupts
==========================================
The KVM/ARM code implements software support for the ARM Generic
Interrupt Controller's (GIC's) hardware support for virtualization by
allowing software to inject virtual interrupts to a VM, which the guest
OS sees as regular interrupts. The code is famously known as the VGIC.
Some of these virtual interrupts, however, correspond to physical
interrupts from real physical devices. One example could be the
architected timer, which itself supports virtualization, and therefore
lets a guest OS program the hardware device directly to raise an
interrupt at some point in time. When such an interrupt is raised, the
host OS initially handles the interrupt and must somehow signal this
event as a virtual interrupt to the guest. Another example could be a
passthrough device, where the physical interrupts are initially handled
by the host, but the device driver for the device lives in the guest OS
and KVM must therefore somehow inject a virtual interrupt on behalf of
the physical one to the guest OS.
These virtual interrupts corresponding to a physical interrupt on the
host are called forwarded physical interrupts, but are also sometimes
referred to as 'virtualized physical interrupts' and 'mapped interrupts'.
Forwarded physical interrupts are handled slightly differently compared
to virtual interrupts generated purely by a software emulated device.
The HW bit
----------
Virtual interrupts are signalled to the guest by programming the List
Registers (LRs) on the GIC before running a VCPU. The LR is programmed
with the virtual IRQ number and the state of the interrupt (Pending,
Active, or Pending+Active). When the guest ACKs and EOIs a virtual
interrupt, the LR state moves from Pending to Active, and finally to
inactive.
The LRs include an extra bit, called the HW bit. When this bit is set,
KVM must also program an additional field in the LR, the physical IRQ
number, to link the virtual with the physical IRQ.
When the HW bit is set, KVM must EITHER set the Pending OR the Active
bit, never both at the same time.
Setting the HW bit causes the hardware to deactivate the physical
interrupt on the physical distributor when the guest deactivates the
corresponding virtual interrupt.
Forwarded Physical Interrupts Life Cycle
----------------------------------------
The state of forwarded physical interrupts is managed in the following way:
- The physical interrupt is acked by the host, and becomes active on
the physical distributor (*).
- KVM sets the LR.Pending bit, because this is the only way the GICV
interface is going to present it to the guest.
- LR.Pending will stay set as long as the guest has not acked the interrupt.
- LR.Pending transitions to LR.Active on the guest read of the IAR, as
expected.
- On guest EOI, the *physical distributor* active bit gets cleared,
but the LR.Active is left untouched (set).
- KVM clears the LR on VM exits when the physical distributor
active state has been cleared.
(*): The host handling is slightly more complicated. For some forwarded
interrupts (shared), KVM directly sets the active state on the physical
distributor before entering the guest, because the interrupt is never actually
handled on the host (see details on the timer as an example below). For other
forwarded interrupts (non-shared) the host does not deactivate the interrupt
when the host ISR completes, but leaves the interrupt active until the guest
deactivates it. Leaving the interrupt active is allowed, because Linux
configures the physical GIC with EOIMode=1, which causes EOI operations to
perform a priority drop allowing the GIC to receive other interrupts of the
default priority.
Forwarded Edge and Level Triggered PPIs and SPIs
------------------------------------------------
Forwarded physical interrupts injected should always be active on the
physical distributor when injected to a guest.
Level-triggered interrupts will keep the interrupt line to the GIC
asserted, typically until the guest programs the device to deassert the
line. This means that the interrupt will remain pending on the physical
distributor until the guest has reprogrammed the device. Since we
always run the VM with interrupts enabled on the CPU, a pending
interrupt will exit the guest as soon as we switch into the guest,
preventing the guest from ever making progress as the process repeats
over and over. Therefore, the active state on the physical distributor
must be set when entering the guest, preventing the GIC from forwarding
the pending interrupt to the CPU. As soon as the guest deactivates the
interrupt, the physical line is sampled by the hardware again and the host
takes a new interrupt if and only if the physical line is still asserted.
Edge-triggered interrupts do not exhibit the same problem with
preventing guest execution that level-triggered interrupts do. One
option is to not use the HW bit at all, and inject edge-triggered interrupts
from a physical device as pure virtual interrupts. But that would
potentially slow down handling of the interrupt in the guest, because a
physical interrupt occurring in the middle of the guest ISR would
preempt the guest for the host to handle the interrupt. Additionally,
if you configure the system to handle interrupts on a separate physical
core from that running your VCPU, you still have to interrupt the VCPU
to queue the pending state onto the LR, even though the guest won't use
this information until the guest ISR completes. Therefore, the HW
bit should always be set for forwarded edge-triggered interrupts. With
the HW bit set, the virtual interrupt is injected and additional
physical interrupts occurring before the guest deactivates the interrupt
simply mark the state on the physical distributor as Pending+Active. As
soon as the guest deactivates the interrupt, the host takes another
interrupt if and only if there was a physical interrupt between injecting
the forwarded interrupt to the guest and the guest deactivating the
interrupt.
Consequently, whenever we schedule a VCPU with one or more LRs with the
HW bit set, the interrupt must also be active on the physical
distributor.
Forwarded LPIs
--------------
LPIs, introduced in GICv3, are always edge-triggered and do not have an
active state. They become pending when a device signals them, and as
soon as they are acked by the CPU, they are inactive again.
It therefore doesn't make sense, and is not supported, to set the HW bit
for physical LPIs that are forwarded to a VM as virtual interrupts,
typically virtual SPIs.
For LPIs, there is no other choice than to preempt the VCPU thread if
necessary, and queue the pending state onto the LR.
Putting It Together: The Architected Timer
------------------------------------------
The architected timer is a device that signals interrupts with level
triggered semantics. The timer hardware is directly accessed by VCPUs
which program the timer to fire at some point in time. Each VCPU on a
system programs the timer to fire at different times, and therefore the
hardware is multiplexed between multiple VCPUs. This is implemented by
context-switching the timer state along with each VCPU thread.
However, this means that a scenario like the following is entirely
possible, and in fact, typical:
1. KVM runs the VCPU
2. The guest programs the timer to fire in T+100
3. The guest is idle and calls WFI (wait-for-interrupts)
4. The hardware traps to the host
5. KVM stores the timer state to memory and disables the hardware timer
6. KVM schedules a soft timer to fire in T+(100 - time since step 2)
7. KVM puts the VCPU thread to sleep (on a waitqueue)
8. The soft timer fires, waking up the VCPU thread
9. KVM reprograms the timer hardware with the VCPU's values
10. KVM marks the timer interrupt as active on the physical distributor
11. KVM injects a forwarded physical interrupt to the guest
12. KVM runs the VCPU
Notice that KVM injects a forwarded physical interrupt in step 11 without
the corresponding interrupt having actually fired on the host. That is
exactly why we mark the timer interrupt as active in step 10, because
the active state on the physical distributor is part of the state
belonging to the timer hardware, which is context-switched along with
the VCPU thread.
If the guest does not idle because it is busy, the flow looks like this
instead:
1. KVM runs the VCPU
2. The guest programs the timer to fire in T+100
3. At T+100 the timer fires and a physical IRQ causes the VM to exit
(note that this initially only traps to EL2 and does not run the host ISR
until KVM has returned to the host).
4. With interrupts still disabled on the CPU coming back from the guest, KVM
stores the virtual timer state to memory and disables the virtual hw timer.
5. KVM looks at the timer state (in memory) and injects a forwarded physical
interrupt because it concludes the timer has expired.
6. KVM marks the timer interrupt as active on the physical distributor
7. KVM enables the timer, enables interrupts, and runs the VCPU
Notice that again the forwarded physical interrupt is injected to the
guest without having actually been handled on the host. In this case it
is because the physical interrupt is never actually seen by the host because the
timer is disabled upon guest return, and the virtual forwarded interrupt is
injected on the KVM guest entry path.

View File

@ -44,28 +44,29 @@ Groups:
Attributes:
The attr field of kvm_device_attr encodes two values:
bits: | 63 .... 40 | 39 .. 32 | 31 .... 0 |
values: | reserved | cpu id | offset |
values: | reserved | vcpu_index | offset |
All distributor regs are (rw, 32-bit)
The offset is relative to the "Distributor base address" as defined in the
GICv2 specs. Getting or setting such a register has the same effect as
reading or writing the register on the actual hardware from the cpu
specified with cpu id field. Note that most distributor fields are not
banked, but return the same value regardless of the cpu id used to access
the register.
reading or writing the register on the actual hardware from the cpu whose
index is specified with the vcpu_index field. Note that most distributor
fields are not banked, but return the same value regardless of the
vcpu_index used to access the register.
Limitations:
- Priorities are not implemented, and registers are RAZ/WI
- Currently only implemented for KVM_DEV_TYPE_ARM_VGIC_V2.
Errors:
-ENODEV: Getting or setting this register is not yet supported
-ENXIO: Getting or setting this register is not yet supported
-EBUSY: One or more VCPUs are running
-EINVAL: Invalid vcpu_index supplied
KVM_DEV_ARM_VGIC_GRP_CPU_REGS
Attributes:
The attr field of kvm_device_attr encodes two values:
bits: | 63 .... 40 | 39 .. 32 | 31 .... 0 |
values: | reserved | cpu id | offset |
values: | reserved | vcpu_index | offset |
All CPU interface regs are (rw, 32-bit)
@ -91,8 +92,9 @@ Groups:
- Priorities are not implemented, and registers are RAZ/WI
- Currently only implemented for KVM_DEV_TYPE_ARM_VGIC_V2.
Errors:
-ENODEV: Getting or setting this register is not yet supported
-ENXIO: Getting or setting this register is not yet supported
-EBUSY: One or more VCPUs are running
-EINVAL: Invalid vcpu_index supplied
KVM_DEV_ARM_VGIC_GRP_NR_IRQS
Attributes:

View File

@ -166,3 +166,15 @@ Comment: The srcu read lock must be held while accessing memslots (e.g.
MMIO/PIO address->device structure mapping (kvm->buses).
The srcu index can be stored in kvm_vcpu->srcu_idx per vcpu
if it is needed by multiple functions.
Name: blocked_vcpu_on_cpu_lock
Type: spinlock_t
Arch: x86
Protects: blocked_vcpu_on_cpu
Comment: This is a per-CPU lock and it is used for VT-d posted-interrupts.
When VT-d posted-interrupts is supported and the VM has assigned
devices, we put the blocked vCPU on the list blocked_vcpu_on_cpu
protected by blocked_vcpu_on_cpu_lock. When VT-d hardware issues
a wakeup notification event because external interrupts from the
assigned devices have arrived, we find the vCPU on the list and
wake it up.

View File

@ -11348,6 +11348,13 @@ L: netdev@vger.kernel.org
S: Maintained
F: drivers/net/ethernet/via/via-velocity.*
VIRT LIB
M: Alex Williamson <alex.williamson@redhat.com>
M: Paolo Bonzini <pbonzini@redhat.com>
L: kvm@vger.kernel.org
S: Supported
F: virt/lib/
VIVID VIRTUAL VIDEO DRIVER
M: Hans Verkuil <hverkuil@xs4all.nl>
L: linux-media@vger.kernel.org

View File

@ -550,6 +550,7 @@ drivers-y := drivers/ sound/ firmware/
net-y := net/
libs-y := lib/
core-y := usr/
virt-y := virt/
endif # KBUILD_EXTMOD
ifeq ($(dot-config),1)
@ -882,10 +883,10 @@ core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ block/
vmlinux-dirs := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \
$(core-y) $(core-m) $(drivers-y) $(drivers-m) \
$(net-y) $(net-m) $(libs-y) $(libs-m)))
$(net-y) $(net-m) $(libs-y) $(libs-m) $(virt-y)))
vmlinux-alldirs := $(sort $(vmlinux-dirs) $(patsubst %/,%,$(filter %/, \
$(init-) $(core-) $(drivers-) $(net-) $(libs-))))
$(init-) $(core-) $(drivers-) $(net-) $(libs-) $(virt-))))
init-y := $(patsubst %/, %/built-in.o, $(init-y))
core-y := $(patsubst %/, %/built-in.o, $(core-y))
@ -894,14 +895,15 @@ net-y := $(patsubst %/, %/built-in.o, $(net-y))
libs-y1 := $(patsubst %/, %/lib.a, $(libs-y))
libs-y2 := $(patsubst %/, %/built-in.o, $(libs-y))
libs-y := $(libs-y1) $(libs-y2)
virt-y := $(patsubst %/, %/built-in.o, $(virt-y))
# Externally visible symbols (used by link-vmlinux.sh)
export KBUILD_VMLINUX_INIT := $(head-y) $(init-y)
export KBUILD_VMLINUX_MAIN := $(core-y) $(libs-y) $(drivers-y) $(net-y)
export KBUILD_VMLINUX_MAIN := $(core-y) $(libs-y) $(drivers-y) $(net-y) $(virt-y)
export KBUILD_LDS := arch/$(SRCARCH)/kernel/vmlinux.lds
export LDFLAGS_vmlinux
# used by scripts/package/Makefile
export KBUILD_ALLDIRS := $(sort $(filter-out arch/%,$(vmlinux-alldirs)) arch Documentation include samples scripts tools virt)
export KBUILD_ALLDIRS := $(sort $(filter-out arch/%,$(vmlinux-alldirs)) arch Documentation include samples scripts tools)
vmlinux-deps := $(KBUILD_LDS) $(KBUILD_VMLINUX_INIT) $(KBUILD_VMLINUX_MAIN)

View File

@ -218,4 +218,24 @@
#define HSR_DABT_CM (1U << 8)
#define HSR_DABT_EA (1U << 9)
/*
 * Value/name pairs for the exception type index, written as a bare
 * initializer list so it can be passed to __print_symbolic() (the
 * kvm_exit tracepoint prints its 'idx' field through this table).
 */
#define kvm_arm_exception_type \
{0, "RESET" }, \
{1, "UNDEFINED" }, \
{2, "SOFTWARE" }, \
{3, "PREF_ABORT" }, \
{4, "DATA_ABORT" }, \
{5, "IRQ" }, \
{6, "FIQ" }, \
{7, "HVC" }
#define HSRECN(x) { HSR_EC_##x, #x }
#define kvm_arm_exception_class \
HSRECN(UNKNOWN), HSRECN(WFI), HSRECN(CP15_32), HSRECN(CP15_64), \
HSRECN(CP14_MR), HSRECN(CP14_LS), HSRECN(CP_0_13), HSRECN(CP10_ID), \
HSRECN(JAZELLE), HSRECN(BXJ), HSRECN(CP14_64), HSRECN(SVC_HYP), \
HSRECN(HVC), HSRECN(SMC), HSRECN(IABT), HSRECN(IABT_HYP), \
HSRECN(DABT), HSRECN(DABT_HYP)
#endif /* __ARM_KVM_ARM_H__ */

View File

@ -126,7 +126,10 @@ struct kvm_vcpu_arch {
* here.
*/
/* Don't run the guest on this vcpu */
/* vcpu power-off state */
bool power_off;
/* Don't run the guest (internal implementation need) */
bool pause;
/* IO related fields */

View File

@ -46,4 +46,6 @@ config KVM_ARM_HOST
---help---
Provides host support for ARM processors.
source drivers/vhost/Kconfig
endif # VIRTUALIZATION

View File

@ -271,6 +271,16 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
return kvm_timer_should_fire(vcpu);
}
/*
 * Arch hook for a VCPU that is about to block: hands the VCPU to
 * kvm_timer_schedule().
 * NOTE(review): presumably this arms the background (soft) timer so the
 * thread is woken when the guest timer expires - confirm against
 * kvm_timer_schedule().
 */
void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
{
kvm_timer_schedule(vcpu);
}
/*
 * Arch hook for a VCPU that has stopped blocking: hands the VCPU to
 * kvm_timer_unschedule(), the counterpart of the kvm_timer_schedule()
 * call made in kvm_arch_vcpu_blocking().
 */
void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
kvm_timer_unschedule(vcpu);
}
int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
{
/* Force users to call KVM_ARM_VCPU_INIT */
@ -308,7 +318,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
if (vcpu->arch.pause)
if (vcpu->arch.power_off)
mp_state->mp_state = KVM_MP_STATE_STOPPED;
else
mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
@ -321,10 +331,10 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
{
switch (mp_state->mp_state) {
case KVM_MP_STATE_RUNNABLE:
vcpu->arch.pause = false;
vcpu->arch.power_off = false;
break;
case KVM_MP_STATE_STOPPED:
vcpu->arch.pause = true;
vcpu->arch.power_off = true;
break;
default:
return -EINVAL;
@ -342,7 +352,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
*/
int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
{
return !!v->arch.irq_lines || kvm_vgic_vcpu_pending_irq(v);
return ((!!v->arch.irq_lines || kvm_vgic_vcpu_pending_irq(v))
&& !v->arch.power_off && !v->arch.pause);
}
/* Just ensure a guest exit from a particular CPU */
@ -468,11 +479,38 @@ bool kvm_arch_intc_initialized(struct kvm *kvm)
return vgic_initialized(kvm);
}
static void vcpu_pause(struct kvm_vcpu *vcpu)
static void kvm_arm_halt_guest(struct kvm *kvm) __maybe_unused;
static void kvm_arm_resume_guest(struct kvm *kvm) __maybe_unused;
/*
 * Halt a whole guest: set the 'pause' flag on every VCPU of @kvm, then
 * call force_vm_exit() for all CPUs.
 *
 * The run loop checks vcpu->arch.pause both before sleeping and right
 * before guest entry, so a VCPU with the flag set does not (re-)enter
 * the guest until the flag is cleared again.
 */
static void kvm_arm_halt_guest(struct kvm *kvm)
{
int i;
struct kvm_vcpu *vcpu;
/* Set the flag first so that VCPUs forced out below see it. */
kvm_for_each_vcpu(i, vcpu, kvm)
vcpu->arch.pause = true;
force_vm_exit(cpu_all_mask);
}
/*
 * Undo kvm_arm_halt_guest(): for every VCPU of @kvm, clear the 'pause'
 * flag and wake up anything sleeping on the VCPU's wait queue.
 *
 * vcpu_sleep() waits on that same wait queue until neither power_off
 * nor pause is set, so the flag is cleared before the wake-up.
 */
static void kvm_arm_resume_guest(struct kvm *kvm)
{
int i;
struct kvm_vcpu *vcpu;
kvm_for_each_vcpu(i, vcpu, kvm) {
wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
vcpu->arch.pause = false;
wake_up_interruptible(wq);
}
}
static void vcpu_sleep(struct kvm_vcpu *vcpu)
{
wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
wait_event_interruptible(*wq, !vcpu->arch.pause);
wait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
(!vcpu->arch.pause)));
}
static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu)
@ -522,8 +560,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
update_vttbr(vcpu->kvm);
if (vcpu->arch.pause)
vcpu_pause(vcpu);
if (vcpu->arch.power_off || vcpu->arch.pause)
vcpu_sleep(vcpu);
/*
* Disarming the background timer must be done in a
@ -549,11 +587,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
run->exit_reason = KVM_EXIT_INTR;
}
if (ret <= 0 || need_new_vmid_gen(vcpu->kvm)) {
if (ret <= 0 || need_new_vmid_gen(vcpu->kvm) ||
vcpu->arch.power_off || vcpu->arch.pause) {
local_irq_enable();
kvm_timer_sync_hwstate(vcpu);
kvm_vgic_sync_hwstate(vcpu);
preempt_enable();
kvm_timer_sync_hwstate(vcpu);
continue;
}
@ -596,14 +635,19 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
* guest time.
*/
kvm_guest_exit();
trace_kvm_exit(kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
/*
* We must sync the timer state before the vgic state so that
* the vgic can properly sample the updated state of the
* interrupt line.
*/
kvm_timer_sync_hwstate(vcpu);
kvm_vgic_sync_hwstate(vcpu);
preempt_enable();
kvm_timer_sync_hwstate(vcpu);
ret = handle_exit(vcpu, run, ret);
}
@ -765,12 +809,12 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
vcpu_reset_hcr(vcpu);
/*
* Handle the "start in power-off" case by marking the VCPU as paused.
* Handle the "start in power-off" case.
*/
if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features))
vcpu->arch.pause = true;
vcpu->arch.power_off = true;
else
vcpu->arch.pause = false;
vcpu->arch.power_off = false;
return 0;
}

View File

@ -63,7 +63,7 @@ static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu)
static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu)
{
vcpu->arch.pause = true;
vcpu->arch.power_off = true;
}
static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
@ -87,7 +87,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
*/
if (!vcpu)
return PSCI_RET_INVALID_PARAMS;
if (!vcpu->arch.pause) {
if (!vcpu->arch.power_off) {
if (kvm_psci_version(source_vcpu) != KVM_ARM_PSCI_0_1)
return PSCI_RET_ALREADY_ON;
else
@ -115,7 +115,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
* the general purpose registers are undefined upon CPU_ON.
*/
*vcpu_reg(vcpu, 0) = context_id;
vcpu->arch.pause = false;
vcpu->arch.power_off = false;
smp_mb(); /* Make sure the above is visible */
wq = kvm_arch_vcpu_wq(vcpu);
@ -153,7 +153,7 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu)
mpidr = kvm_vcpu_get_mpidr_aff(tmp);
if ((mpidr & target_affinity_mask) == target_affinity) {
matching_cpus++;
if (!tmp->arch.pause)
if (!tmp->arch.power_off)
return PSCI_0_2_AFFINITY_LEVEL_ON;
}
}
@ -179,7 +179,7 @@ static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type)
* re-initialized.
*/
kvm_for_each_vcpu(i, tmp, vcpu->kvm) {
tmp->arch.pause = true;
tmp->arch.power_off = true;
kvm_vcpu_kick(tmp);
}

View File

@ -25,21 +25,25 @@ TRACE_EVENT(kvm_entry,
);
TRACE_EVENT(kvm_exit,
TP_PROTO(unsigned int exit_reason, unsigned long vcpu_pc),
TP_ARGS(exit_reason, vcpu_pc),
TP_PROTO(int idx, unsigned int exit_reason, unsigned long vcpu_pc),
TP_ARGS(idx, exit_reason, vcpu_pc),
TP_STRUCT__entry(
__field( int, idx )
__field( unsigned int, exit_reason )
__field( unsigned long, vcpu_pc )
),
TP_fast_assign(
__entry->idx = idx;
__entry->exit_reason = exit_reason;
__entry->vcpu_pc = vcpu_pc;
),
TP_printk("HSR_EC: 0x%04x, PC: 0x%08lx",
TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%08lx",
__print_symbolic(__entry->idx, kvm_arm_exception_type),
__entry->exit_reason,
__print_symbolic(__entry->exit_reason, kvm_arm_exception_class),
__entry->vcpu_pc)
);

View File

@ -200,4 +200,20 @@
/* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */
#define HPFAR_MASK (~UL(0xf))
/*
 * Value/name pairs for the exception type index, written as a bare
 * initializer list.
 * NOTE(review): presumably consumed by __print_symbolic() in the kvm_exit
 * tracepoint, like the 32-bit ARM table of the same name - confirm.
 */
#define kvm_arm_exception_type \
{0, "IRQ" }, \
{1, "TRAP" }
#define ECN(x) { ESR_ELx_EC_##x, #x }
#define kvm_arm_exception_class \
ECN(UNKNOWN), ECN(WFx), ECN(CP15_32), ECN(CP15_64), ECN(CP14_MR), \
ECN(CP14_LS), ECN(FP_ASIMD), ECN(CP10_ID), ECN(CP14_64), ECN(SVC64), \
ECN(HVC64), ECN(SMC64), ECN(SYS64), ECN(IMP_DEF), ECN(IABT_LOW), \
ECN(IABT_CUR), ECN(PC_ALIGN), ECN(DABT_LOW), ECN(DABT_CUR), \
ECN(SP_ALIGN), ECN(FP_EXC32), ECN(FP_EXC64), ECN(SERROR), \
ECN(BREAKPT_LOW), ECN(BREAKPT_CUR), ECN(SOFTSTP_LOW), \
ECN(SOFTSTP_CUR), ECN(WATCHPT_LOW), ECN(WATCHPT_CUR), \
ECN(BKPT32), ECN(VECTOR32), ECN(BRK64)
#endif /* __ARM64_KVM_ARM_H__ */

View File

@ -149,7 +149,10 @@ struct kvm_vcpu_arch {
u32 mdscr_el1;
} guest_debug_preserved;
/* Don't run the guest */
/* vcpu power-off state */
bool power_off;
/* Don't run the guest (internal implementation need) */
bool pause;
/* IO related fields */

View File

@ -48,4 +48,6 @@ config KVM_ARM_HOST
---help---
Provides host support for ARM processors.
source drivers/vhost/Kconfig
endif # VIRTUALIZATION

View File

@ -880,6 +880,14 @@ __kvm_hyp_panic:
bl __restore_sysregs
/*
* Make sure we have a valid host stack, and don't leave junk in the
* frame pointer that will give us a misleading host stack unwinding.
*/
ldr x22, [x2, #CPU_GP_REG_OFFSET(CPU_SP_EL1)]
msr sp_el1, x22
mov x29, xzr
1: adr x0, __hyp_panic_str
adr x1, 2f
ldp x2, x3, [x1]

View File

@ -847,5 +847,7 @@ static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot) {}
static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
#endif /* __MIPS_KVM_HOST_H__ */

View File

@ -42,6 +42,11 @@ static inline unsigned int get_dcrn(u32 inst)
return ((inst >> 16) & 0x1f) | ((inst >> 6) & 0x3e0);
}
/*
 * Extract the 10-bit TMRN number from an instruction word.
 *
 * The field is stored as two 5-bit halves: instruction bits 16..20 hold
 * result bits 0..4 and instruction bits 11..15 hold result bits 5..9.
 */
static inline unsigned int get_tmrn(u32 inst)
{
	unsigned int low_half = (inst >> 16) & 0x1f;
	unsigned int high_half = (inst >> 6) & 0x3e0;

	return low_half | high_half;
}
static inline unsigned int get_rt(u32 inst)
{
return (inst >> 21) & 0x1f;

View File

@ -716,5 +716,7 @@ static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslot
static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
static inline void kvm_arch_exit(void) {}
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
#endif /* __POWERPC_KVM_HOST_H__ */

View File

@ -742,6 +742,12 @@
#define MMUBE1_VBE4 0x00000002
#define MMUBE1_VBE5 0x00000001
#define TMRN_TMCFG0 16 /* Thread Management Configuration Register 0 */
#define TMRN_TMCFG0_NPRIBITS 0x003f0000 /* Bits of thread priority */
#define TMRN_TMCFG0_NPRIBITS_SHIFT 16
#define TMRN_TMCFG0_NATHRD 0x00003f00 /* Number of active threads */
#define TMRN_TMCFG0_NATHRD_SHIFT 8
#define TMRN_TMCFG0_NTHRD 0x0000003f /* Number of threads */
#define TMRN_IMSR0 0x120 /* Initial MSR Register 0 (e6500) */
#define TMRN_IMSR1 0x121 /* Initial MSR Register 1 (e6500) */
#define TMRN_INIA0 0x140 /* Next Instruction Address Register 0 */

View File

@ -70,7 +70,8 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
}
/* Lastly try successively smaller sizes from the page allocator */
while (!hpt && order > PPC_MIN_HPT_ORDER) {
/* Only do this if userspace didn't specify a size via ioctl */
while (!hpt && order > PPC_MIN_HPT_ORDER && !htab_orderp) {
hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
__GFP_NOWARN, order - PAGE_SHIFT);
if (!hpt)

View File

@ -470,6 +470,8 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
note_hpte_modification(kvm, rev);
unlock_hpte(hpte, 0);
if (v & HPTE_V_ABSENT)
v = (v & ~HPTE_V_ABSENT) | HPTE_V_VALID;
hpret[0] = v;
hpret[1] = r;
return H_SUCCESS;

View File

@ -150,6 +150,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK
cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
beq 11f
cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL
beq 15f /* Invoke the H_DOORBELL handler */
cmpwi cr2, r12, BOOK3S_INTERRUPT_HMI
beq cr2, 14f /* HMI check */
@ -174,6 +176,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
mtspr SPRN_HSRR1, r7
b hmi_exception_after_realmode
15: mtspr SPRN_HSRR0, r8
mtspr SPRN_HSRR1, r7
ba 0xe80
kvmppc_primary_no_guest:
/* We handle this much like a ceded vcpu */
/* put the HDEC into the DEC, since HDEC interrupts don't wake us */
@ -2377,7 +2383,6 @@ machine_check_realmode:
mr r3, r9 /* get vcpu pointer */
bl kvmppc_realmode_machine_check
nop
cmpdi r3, 0 /* Did we handle MCE ? */
ld r9, HSTATE_KVM_VCPU(r13)
li r12, BOOK3S_INTERRUPT_MACHINE_CHECK
/*
@ -2390,13 +2395,18 @@ machine_check_realmode:
* The old code used to return to host for unhandled errors which
* was causing guest to hang with soft lockups inside guest and
* makes it difficult to recover guest instance.
*
* if we receive machine check with MSR(RI=0) then deliver it to
* guest as machine check causing guest to crash.
*/
ld r10, VCPU_PC(r9)
ld r11, VCPU_MSR(r9)
andi. r10, r11, MSR_RI /* check for unrecoverable exception */
beq 1f /* Deliver a machine check to guest */
ld r10, VCPU_PC(r9)
cmpdi r3, 0 /* Did we handle MCE ? */
bne 2f /* Continue guest execution. */
/* If not, deliver a machine check. SRR0/1 are already set */
li r10, BOOK3S_INTERRUPT_MACHINE_CHECK
ld r11, VCPU_MSR(r9)
1: li r10, BOOK3S_INTERRUPT_MACHINE_CHECK
bl kvmppc_msr_interrupt
2: b fast_interrupt_c_return
@ -2436,14 +2446,19 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
/* hypervisor doorbell */
3: li r12, BOOK3S_INTERRUPT_H_DOORBELL
/*
* Clear the doorbell as we will invoke the handler
* explicitly in the guest exit path.
*/
lis r6, (PPC_DBELL_SERVER << (63-36))@h
PPC_MSGCLR(6)
/* see if it's a host IPI */
li r3, 1
lbz r0, HSTATE_HOST_IPI(r13)
cmpwi r0, 0
bnelr
/* if not, clear it and return -1 */
lis r6, (PPC_DBELL_SERVER << (63-36))@h
PPC_MSGCLR(6)
/* if not, return -1 */
li r3, -1
blr

View File

@ -237,7 +237,8 @@ void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500,
struct kvm_book3e_206_tlb_entry *gtlbe)
{
struct vcpu_id_table *idt = vcpu_e500->idt;
unsigned int pr, tid, ts, pid;
unsigned int pr, tid, ts;
int pid;
u32 val, eaddr;
unsigned long flags;

View File

@ -15,6 +15,7 @@
#include <asm/kvm_ppc.h>
#include <asm/disassemble.h>
#include <asm/dbell.h>
#include <asm/reg_booke.h>
#include "booke.h"
#include "e500.h"
@ -22,6 +23,7 @@
#define XOP_DCBTLS 166
#define XOP_MSGSND 206
#define XOP_MSGCLR 238
#define XOP_MFTMR 366
#define XOP_TLBIVAX 786
#define XOP_TLBSX 914
#define XOP_TLBRE 946
@ -113,6 +115,19 @@ static int kvmppc_e500_emul_dcbtls(struct kvm_vcpu *vcpu)
return EMULATE_DONE;
}
/*
 * Emulate mftmr for the guest.
 *
 * Only TMCFG0 is handled: the value written to GPR @rt has the NTHRD
 * field set to 1 and the NATHRD field set to 1, i.e. one thread is
 * exposed per vcpu.  Any other TMR number makes the emulation fail.
 *
 * Returns EMULATE_DONE when the register was emulated, EMULATE_FAIL
 * otherwise.
 */
static int kvmppc_e500_emul_mftmr(struct kvm_vcpu *vcpu, unsigned int inst,
				  int rt)
{
	if (get_tmrn(inst) != TMRN_TMCFG0)
		return EMULATE_FAIL;

	/* Expose one thread per vcpu */
	kvmppc_set_gpr(vcpu, rt, 1 | (1 << TMRN_TMCFG0_NATHRD_SHIFT));
	return EMULATE_DONE;
}
int kvmppc_core_emulate_op_e500(struct kvm_run *run, struct kvm_vcpu *vcpu,
unsigned int inst, int *advance)
{
@ -165,6 +180,10 @@ int kvmppc_core_emulate_op_e500(struct kvm_run *run, struct kvm_vcpu *vcpu,
emulated = kvmppc_e500_emul_tlbivax(vcpu, ea);
break;
case XOP_MFTMR:
emulated = kvmppc_e500_emul_mftmr(vcpu, inst, rt);
break;
case XOP_EHPRIV:
emulated = kvmppc_e500_emul_ehpriv(run, vcpu, inst,
advance);

View File

@ -406,7 +406,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) {
unsigned long gfn_start, gfn_end;
tsize_pages = 1 << (tsize - 2);
tsize_pages = 1UL << (tsize - 2);
gfn_start = gfn & ~(tsize_pages - 1);
gfn_end = gfn_start + tsize_pages;
@ -447,7 +447,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
}
if (likely(!pfnmap)) {
tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT);
tsize_pages = 1UL << (tsize + 10 - PAGE_SHIFT);
pfn = gfn_to_pfn_memslot(slot, gfn);
if (is_error_noslot_pfn(pfn)) {
if (printk_ratelimit())

View File

@ -559,6 +559,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
else
r = num_online_cpus();
break;
case KVM_CAP_NR_MEMSLOTS:
r = KVM_USER_MEM_SLOTS;
break;
case KVM_CAP_MAX_VCPUS:
r = KVM_MAX_VCPUS;
break;

View File

@ -644,5 +644,7 @@ static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslot
static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot) {}
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
#endif

View File

@ -336,28 +336,28 @@ static int handle_partial_execution(struct kvm_vcpu *vcpu)
return -EOPNOTSUPP;
}
static const intercept_handler_t intercept_funcs[] = {
[0x00 >> 2] = handle_noop,
[0x04 >> 2] = handle_instruction,
[0x08 >> 2] = handle_prog,
[0x10 >> 2] = handle_noop,
[0x14 >> 2] = handle_external_interrupt,
[0x18 >> 2] = handle_noop,
[0x1C >> 2] = kvm_s390_handle_wait,
[0x20 >> 2] = handle_validity,
[0x28 >> 2] = handle_stop,
[0x38 >> 2] = handle_partial_execution,
};
int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
{
intercept_handler_t func;
u8 code = vcpu->arch.sie_block->icptcode;
if (code & 3 || (code >> 2) >= ARRAY_SIZE(intercept_funcs))
switch (vcpu->arch.sie_block->icptcode) {
case 0x00:
case 0x10:
case 0x18:
return handle_noop(vcpu);
case 0x04:
return handle_instruction(vcpu);
case 0x08:
return handle_prog(vcpu);
case 0x14:
return handle_external_interrupt(vcpu);
case 0x1c:
return kvm_s390_handle_wait(vcpu);
case 0x20:
return handle_validity(vcpu);
case 0x28:
return handle_stop(vcpu);
case 0x38:
return handle_partial_execution(vcpu);
default:
return -EOPNOTSUPP;
func = intercept_funcs[code >> 2];
if (func)
return func(vcpu);
return -EOPNOTSUPP;
}
}

View File

@ -51,11 +51,9 @@ static int psw_mchk_disabled(struct kvm_vcpu *vcpu)
static int psw_interrupts_disabled(struct kvm_vcpu *vcpu)
{
if ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER) ||
(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_IO) ||
(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_EXT))
return 0;
return 1;
return psw_extint_disabled(vcpu) &&
psw_ioint_disabled(vcpu) &&
psw_mchk_disabled(vcpu);
}
static int ckc_interrupts_enabled(struct kvm_vcpu *vcpu)
@ -71,13 +69,8 @@ static int ckc_interrupts_enabled(struct kvm_vcpu *vcpu)
static int ckc_irq_pending(struct kvm_vcpu *vcpu)
{
preempt_disable();
if (!(vcpu->arch.sie_block->ckc <
get_tod_clock_fast() + vcpu->arch.sie_block->epoch)) {
preempt_enable();
if (vcpu->arch.sie_block->ckc >= kvm_s390_get_tod_clock_fast(vcpu->kvm))
return 0;
}
preempt_enable();
return ckc_interrupts_enabled(vcpu);
}
@ -109,14 +102,10 @@ static inline u8 int_word_to_isc(u32 int_word)
return (int_word & 0x38000000) >> 27;
}
static inline unsigned long pending_floating_irqs(struct kvm_vcpu *vcpu)
static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu)
{
return vcpu->kvm->arch.float_int.pending_irqs;
}
static inline unsigned long pending_local_irqs(struct kvm_vcpu *vcpu)
{
return vcpu->arch.local_int.pending_irqs;
return vcpu->kvm->arch.float_int.pending_irqs |
vcpu->arch.local_int.pending_irqs;
}
static unsigned long disable_iscs(struct kvm_vcpu *vcpu,
@ -135,8 +124,7 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu)
{
unsigned long active_mask;
active_mask = pending_local_irqs(vcpu);
active_mask |= pending_floating_irqs(vcpu);
active_mask = pending_irqs(vcpu);
if (!active_mask)
return 0;
@ -204,7 +192,7 @@ static void __set_cpuflag(struct kvm_vcpu *vcpu, u32 flag)
static void set_intercept_indicators_io(struct kvm_vcpu *vcpu)
{
if (!(pending_floating_irqs(vcpu) & IRQ_PEND_IO_MASK))
if (!(pending_irqs(vcpu) & IRQ_PEND_IO_MASK))
return;
else if (psw_ioint_disabled(vcpu))
__set_cpuflag(vcpu, CPUSTAT_IO_INT);
@ -214,7 +202,7 @@ static void set_intercept_indicators_io(struct kvm_vcpu *vcpu)
static void set_intercept_indicators_ext(struct kvm_vcpu *vcpu)
{
if (!(pending_local_irqs(vcpu) & IRQ_PEND_EXT_MASK))
if (!(pending_irqs(vcpu) & IRQ_PEND_EXT_MASK))
return;
if (psw_extint_disabled(vcpu))
__set_cpuflag(vcpu, CPUSTAT_EXT_INT);
@ -224,7 +212,7 @@ static void set_intercept_indicators_ext(struct kvm_vcpu *vcpu)
static void set_intercept_indicators_mchk(struct kvm_vcpu *vcpu)
{
if (!(pending_local_irqs(vcpu) & IRQ_PEND_MCHK_MASK))
if (!(pending_irqs(vcpu) & IRQ_PEND_MCHK_MASK))
return;
if (psw_mchk_disabled(vcpu))
vcpu->arch.sie_block->ictl |= ICTL_LPSW;
@ -815,23 +803,21 @@ int kvm_s390_ext_call_pending(struct kvm_vcpu *vcpu)
int kvm_s390_vcpu_has_irq(struct kvm_vcpu *vcpu, int exclude_stop)
{
int rc;
if (deliverable_irqs(vcpu))
return 1;
rc = !!deliverable_irqs(vcpu);
if (!rc && kvm_cpu_has_pending_timer(vcpu))
rc = 1;
if (kvm_cpu_has_pending_timer(vcpu))
return 1;
/* external call pending and deliverable */
if (!rc && kvm_s390_ext_call_pending(vcpu) &&
if (kvm_s390_ext_call_pending(vcpu) &&
!psw_extint_disabled(vcpu) &&
(vcpu->arch.sie_block->gcr[0] & 0x2000ul))
rc = 1;
return 1;
if (!rc && !exclude_stop && kvm_s390_is_stop_irq_pending(vcpu))
rc = 1;
return rc;
if (!exclude_stop && kvm_s390_is_stop_irq_pending(vcpu))
return 1;
return 0;
}
int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
@ -846,7 +832,7 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
vcpu->stat.exit_wait_state++;
/* fast path */
if (kvm_cpu_has_pending_timer(vcpu) || kvm_arch_vcpu_runnable(vcpu))
if (kvm_arch_vcpu_runnable(vcpu))
return 0;
if (psw_interrupts_disabled(vcpu)) {
@ -860,9 +846,7 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
goto no_timer;
}
preempt_disable();
now = get_tod_clock_fast() + vcpu->arch.sie_block->epoch;
preempt_enable();
now = kvm_s390_get_tod_clock_fast(vcpu->kvm);
sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now);
/* underflow */
@ -901,9 +885,7 @@ enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer)
u64 now, sltime;
vcpu = container_of(timer, struct kvm_vcpu, arch.ckc_timer);
preempt_disable();
now = get_tod_clock_fast() + vcpu->arch.sie_block->epoch;
preempt_enable();
now = kvm_s390_get_tod_clock_fast(vcpu->kvm);
sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now);
/*
@ -981,39 +963,30 @@ static int __inject_prog(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_PROGRAM_INT,
irq->u.pgm.code, 0);
li->irq.pgm = irq->u.pgm;
if (irq->u.pgm.code == PGM_PER) {
li->irq.pgm.code |= PGM_PER;
/* only modify PER related information */
li->irq.pgm.per_address = irq->u.pgm.per_address;
li->irq.pgm.per_code = irq->u.pgm.per_code;
li->irq.pgm.per_atmid = irq->u.pgm.per_atmid;
li->irq.pgm.per_access_id = irq->u.pgm.per_access_id;
} else if (!(irq->u.pgm.code & PGM_PER)) {
li->irq.pgm.code = (li->irq.pgm.code & PGM_PER) |
irq->u.pgm.code;
/* only modify non-PER information */
li->irq.pgm.trans_exc_code = irq->u.pgm.trans_exc_code;
li->irq.pgm.mon_code = irq->u.pgm.mon_code;
li->irq.pgm.data_exc_code = irq->u.pgm.data_exc_code;
li->irq.pgm.mon_class_nr = irq->u.pgm.mon_class_nr;
li->irq.pgm.exc_access_id = irq->u.pgm.exc_access_id;
li->irq.pgm.op_access_id = irq->u.pgm.op_access_id;
} else {
li->irq.pgm = irq->u.pgm;
}
set_bit(IRQ_PEND_PROG, &li->pending_irqs);
return 0;
}
/*
 * kvm_s390_inject_program_int - inject a program interrupt given only a code
 * @vcpu: vcpu that receives the interrupt
 * @code: program interruption code
 *
 * Builds an interrupt description that carries just @code and hands it
 * to __inject_prog() under the vcpu's local interrupt lock.
 *
 * Always returns 0.
 */
int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code)
{
	struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
	/*
	 * Zero everything except the code.  __inject_prog() copies further
	 * fields out of irq.u.pgm, so leaving the rest of this stack
	 * variable uninitialized would put stack garbage into the pending
	 * program interrupt.
	 */
	struct kvm_s390_irq irq = {
		.type = KVM_S390_PROGRAM_INT,
		.u.pgm.code = code,
	};

	spin_lock(&li->lock);
	__inject_prog(vcpu, &irq);
	BUG_ON(waitqueue_active(li->wq));
	spin_unlock(&li->lock);
	return 0;
}
/*
 * kvm_s390_inject_prog_irq - inject a program interrupt with full pgm info
 * @vcpu: vcpu that receives the interrupt
 * @pgm_info: program interrupt details, copied into the injected irq
 *
 * Returns whatever __inject_prog() returns.
 */
int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu,
struct kvm_s390_pgm_info *pgm_info)
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
struct kvm_s390_irq irq;
int rc;
/* The injection happens under the vcpu's local interrupt lock. */
spin_lock(&li->lock);
irq.u.pgm = *pgm_info;
rc = __inject_prog(vcpu, &irq);
/* NOTE(review): presumably nobody may be sleeping on the vcpu wait queue here - confirm */
BUG_ON(waitqueue_active(li->wq));
spin_unlock(&li->lock);
return rc;
}
static int __inject_pfault_init(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
@ -1390,12 +1363,9 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type)
static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
{
struct kvm_s390_float_interrupt *fi;
u64 type = READ_ONCE(inti->type);
int rc;
fi = &kvm->arch.float_int;
switch (type) {
case KVM_S390_MCHK:
rc = __inject_float_mchk(kvm, inti);

View File

@ -514,35 +514,20 @@ static int kvm_s390_set_tod_high(struct kvm *kvm, struct kvm_device_attr *attr)
if (gtod_high != 0)
return -EINVAL;
VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x\n", gtod_high);
VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x", gtod_high);
return 0;
}
static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr)
{
struct kvm_vcpu *cur_vcpu;
unsigned int vcpu_idx;
u64 host_tod, gtod;
int r;
u64 gtod;
if (copy_from_user(&gtod, (void __user *)attr->addr, sizeof(gtod)))
return -EFAULT;
r = store_tod_clock(&host_tod);
if (r)
return r;
mutex_lock(&kvm->lock);
preempt_disable();
kvm->arch.epoch = gtod - host_tod;
kvm_s390_vcpu_block_all(kvm);
kvm_for_each_vcpu(vcpu_idx, cur_vcpu, kvm)
cur_vcpu->arch.sie_block->epoch = kvm->arch.epoch;
kvm_s390_vcpu_unblock_all(kvm);
preempt_enable();
mutex_unlock(&kvm->lock);
VM_EVENT(kvm, 3, "SET: TOD base: 0x%llx\n", gtod);
kvm_s390_set_tod_clock(kvm, gtod);
VM_EVENT(kvm, 3, "SET: TOD base: 0x%llx", gtod);
return 0;
}
@ -574,26 +559,19 @@ static int kvm_s390_get_tod_high(struct kvm *kvm, struct kvm_device_attr *attr)
if (copy_to_user((void __user *)attr->addr, &gtod_high,
sizeof(gtod_high)))
return -EFAULT;
VM_EVENT(kvm, 3, "QUERY: TOD extension: 0x%x\n", gtod_high);
VM_EVENT(kvm, 3, "QUERY: TOD extension: 0x%x", gtod_high);
return 0;
}
static int kvm_s390_get_tod_low(struct kvm *kvm, struct kvm_device_attr *attr)
{
u64 host_tod, gtod;
int r;
u64 gtod;
r = store_tod_clock(&host_tod);
if (r)
return r;
preempt_disable();
gtod = host_tod + kvm->arch.epoch;
preempt_enable();
gtod = kvm_s390_get_tod_clock_fast(kvm);
if (copy_to_user((void __user *)attr->addr, &gtod, sizeof(gtod)))
return -EFAULT;
VM_EVENT(kvm, 3, "QUERY: TOD base: 0x%llx\n", gtod);
VM_EVENT(kvm, 3, "QUERY: TOD base: 0x%llx", gtod);
return 0;
}
@ -1120,7 +1098,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
if (!kvm->arch.sca)
goto out_err;
spin_lock(&kvm_lock);
sca_offset = (sca_offset + 16) & 0x7f0;
sca_offset += 16;
if (sca_offset + sizeof(struct sca_block) > PAGE_SIZE)
sca_offset = 0;
kvm->arch.sca = (struct sca_block *) ((char *) kvm->arch.sca + sca_offset);
spin_unlock(&kvm_lock);
@ -1911,6 +1891,22 @@ retry:
return 0;
}
/*
 * kvm_s390_set_tod_clock - set the guest TOD clock of a VM
 * @kvm: VM whose clock is changed
 * @tod: new guest TOD value
 *
 * Stores the difference between @tod and the current host TOD clock as
 * the VM epoch and copies that epoch into the SIE block of every vcpu.
 * Takes kvm->lock for the duration of the update.
 */
void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod)
{
struct kvm_vcpu *vcpu;
int i;
mutex_lock(&kvm->lock);
preempt_disable();
/* epoch = requested guest TOD minus the host TOD read right now */
kvm->arch.epoch = tod - get_tod_clock();
/* All vcpus are blocked while their per-vcpu epoch copies are rewritten. */
kvm_s390_vcpu_block_all(kvm);
kvm_for_each_vcpu(i, vcpu, kvm)
vcpu->arch.sie_block->epoch = kvm->arch.epoch;
kvm_s390_vcpu_unblock_all(kvm);
preempt_enable();
mutex_unlock(&kvm->lock);
}
/**
* kvm_arch_fault_in_page - fault-in guest page if necessary
* @vcpu: The corresponding virtual cpu

View File

@ -175,6 +175,7 @@ static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm)
return kvm->arch.user_cpu_state_ctrl != 0;
}
/* implemented in interrupt.c */
int kvm_s390_handle_wait(struct kvm_vcpu *vcpu);
void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu);
enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer);
@ -185,7 +186,25 @@ int __must_check kvm_s390_inject_vm(struct kvm *kvm,
struct kvm_s390_interrupt *s390int);
int __must_check kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
struct kvm_s390_irq *irq);
int __must_check kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code);
static inline int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu,
struct kvm_s390_pgm_info *pgm_info)
{
struct kvm_s390_irq irq = {
.type = KVM_S390_PROGRAM_INT,
.u.pgm = *pgm_info,
};
return kvm_s390_inject_vcpu(vcpu, &irq);
}
/*
 * Inject a program interrupt that carries only an interruption code.
 *
 * All pgm info fields other than the code are zero.  Returns the result
 * of the underlying vcpu injection.
 */
static inline int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code)
{
	struct kvm_s390_pgm_info code_only = {
		.code = code,
	};

	return kvm_s390_inject_prog_irq(vcpu, &code_only);
}
struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
u64 isc_mask, u32 schid);
int kvm_s390_reinject_io_int(struct kvm *kvm,
@ -212,6 +231,7 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu);
/* implemented in kvm-s390.c */
void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod);
long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable);
int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr);
int kvm_s390_store_adtl_status_unloaded(struct kvm_vcpu *vcpu,
@ -231,9 +251,6 @@ extern unsigned long kvm_s390_fac_list_mask[];
/* implemented in diag.c */
int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
/* implemented in interrupt.c */
int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu,
struct kvm_s390_pgm_info *pgm_info);
static inline void kvm_s390_vcpu_block_all(struct kvm *kvm)
{
@ -254,6 +271,16 @@ static inline void kvm_s390_vcpu_unblock_all(struct kvm *kvm)
kvm_s390_vcpu_unblock(vcpu);
}
/*
 * Return the current guest TOD clock of @kvm: the host's fast TOD
 * reading plus the VM epoch.  The read and the addition are done with
 * preemption disabled.
 */
static inline u64 kvm_s390_get_tod_clock_fast(struct kvm *kvm)
{
	u64 guest_tod;

	preempt_disable();
	guest_tod = get_tod_clock_fast() + kvm->arch.epoch;
	preempt_enable();

	return guest_tod;
}
/**
* kvm_s390_inject_prog_cond - conditionally inject a program check
* @vcpu: virtual cpu

View File

@ -33,11 +33,9 @@
/* Handle SCK (SET CLOCK) interception */
static int handle_set_clock(struct kvm_vcpu *vcpu)
{
struct kvm_vcpu *cpup;
s64 hostclk, val;
int i, rc;
int rc;
ar_t ar;
u64 op2;
u64 op2, val;
if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
@ -49,19 +47,8 @@ static int handle_set_clock(struct kvm_vcpu *vcpu)
if (rc)
return kvm_s390_inject_prog_cond(vcpu, rc);
if (store_tod_clock(&hostclk)) {
kvm_s390_set_psw_cc(vcpu, 3);
return 0;
}
VCPU_EVENT(vcpu, 3, "SCK: setting guest TOD to 0x%llx", val);
val = (val - hostclk) & ~0x3fUL;
mutex_lock(&vcpu->kvm->lock);
preempt_disable();
kvm_for_each_vcpu(i, cpup, vcpu->kvm)
cpup->arch.sie_block->epoch = val;
preempt_enable();
mutex_unlock(&vcpu->kvm->lock);
kvm_s390_set_tod_clock(vcpu->kvm, val);
kvm_s390_set_psw_cc(vcpu, 0);
return 0;

View File

@ -33,6 +33,11 @@ enum irq_remap_cap {
IRQ_POSTING_CAP = 0,
};
/*
 * Describes one interrupt together with the posted-interrupt (PI)
 * descriptor it refers to.
 * NOTE(review): presumably consumed by the IRQ remapping code when
 * setting up interrupt posting - confirm against the callers.
 */
struct vcpu_data {
u64 pi_desc_addr; /* Physical address of PI Descriptor */
u32 vector; /* Guest vector of the interrupt */
};
#ifdef CONFIG_IRQ_REMAP
extern bool irq_remapping_cap(enum irq_remap_cap cap);
@ -58,11 +63,6 @@ static inline struct irq_domain *arch_get_ir_parent_domain(void)
return x86_vector_domain;
}
struct vcpu_data {
u64 pi_desc_addr; /* Physical address of PI Descriptor */
u32 vector; /* Guest vector of the interrupt */
};
#else /* CONFIG_IRQ_REMAP */
static inline bool irq_remapping_cap(enum irq_remap_cap cap) { return 0; }

View File

@ -111,6 +111,16 @@ struct x86_emulate_ops {
unsigned int bytes,
struct x86_exception *fault);
/*
* read_phys: Read bytes of standard (non-emulated/special) memory.
* Used for descriptor reading.
* @addr: [IN ] Physical address from which to read.
* @val: [OUT] Value read from memory.
* @bytes: [IN ] Number of bytes to read from memory.
*/
int (*read_phys)(struct x86_emulate_ctxt *ctxt, unsigned long addr,
void *val, unsigned int bytes);
/*
* write_std: Write bytes of standard (non-emulated/special) memory.
* Used for descriptor writing.

View File

@ -24,6 +24,7 @@
#include <linux/perf_event.h>
#include <linux/pvclock_gtod.h>
#include <linux/clocksource.h>
#include <linux/irqbypass.h>
#include <asm/pvclock-abi.h>
#include <asm/desc.h>
@ -176,6 +177,8 @@ enum {
*/
#define KVM_APIC_PV_EOI_PENDING 1
struct kvm_kernel_irq_routing_entry;
/*
* We don't want allocation failures within the mmu code, so we preallocate
* enough memory for a single page fault in a cache.
@ -374,6 +377,7 @@ struct kvm_mtrr {
/* Hyper-V per vcpu emulation context */
struct kvm_vcpu_hv {
u64 hv_vapic;
s64 runtime_offset;
};
struct kvm_vcpu_arch {
@ -396,6 +400,7 @@ struct kvm_vcpu_arch {
u64 efer;
u64 apic_base;
struct kvm_lapic *apic; /* kernel irqchip context */
u64 eoi_exit_bitmap[4];
unsigned long apic_attention;
int32_t apic_arb_prio;
int mp_state;
@ -573,6 +578,9 @@ struct kvm_vcpu_arch {
struct {
bool pv_unhalted;
} pv;
int pending_ioapic_eoi;
int pending_external_vector;
};
struct kvm_lpage_info {
@ -683,6 +691,9 @@ struct kvm_arch {
u32 bsp_vcpu_id;
u64 disabled_quirks;
bool irqchip_split;
u8 nr_reserved_ioapic_pins;
};
struct kvm_vm_stat {
@ -819,10 +830,10 @@ struct kvm_x86_ops {
void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
int (*vm_has_apicv)(struct kvm *kvm);
int (*cpu_uses_apicv)(struct kvm_vcpu *vcpu);
void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
void (*hwapic_isr_update)(struct kvm *kvm, int isr);
void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu);
void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
@ -887,6 +898,20 @@ struct kvm_x86_ops {
gfn_t offset, unsigned long mask);
/* pmu operations of sub-arch */
const struct kvm_pmu_ops *pmu_ops;
/*
* Architecture specific hooks for vCPU blocking due to
* HLT instruction.
* Returns for .pre_block():
* - 0 means continue to block the vCPU.
* - 1 means we cannot block the vCPU because some event
* happened in the meantime, e.g. the 'ON' bit in the
* posted-interrupts descriptor was set.
*/
int (*pre_block)(struct kvm_vcpu *vcpu);
void (*post_block)(struct kvm_vcpu *vcpu);
int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set);
};
struct kvm_arch_async_pf {
@ -1231,4 +1256,13 @@ int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size);
bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu);
bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_vcpu **dest_vcpu);
void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
struct kvm_lapic_irq *irq);
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
#endif /* _ASM_X86_KVM_HOST_H */

View File

@ -72,7 +72,7 @@
#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000
#define SECONDARY_EXEC_ENABLE_PML 0x00020000
#define SECONDARY_EXEC_XSAVES 0x00100000
#define SECONDARY_EXEC_PCOMMIT 0x00200000
#define PIN_BASED_EXT_INTR_MASK 0x00000001
#define PIN_BASED_NMI_EXITING 0x00000008
@ -416,6 +416,7 @@ enum vmcs_field {
#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
#define VMX_VPID_INVVPID_BIT (1ull << 0) /* (32 - 32) */
#define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */
#define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */

View File

@ -153,6 +153,12 @@
/* MSR used to provide vcpu index */
#define HV_X64_MSR_VP_INDEX 0x40000002
/* MSR used to reset the guest OS. */
#define HV_X64_MSR_RESET 0x40000003
/* MSR used to provide vcpu runtime in 100ns units */
#define HV_X64_MSR_VP_RUNTIME 0x40000010
/* MSR used to read the per-partition time reference counter */
#define HV_X64_MSR_TIME_REF_COUNT 0x40000020
@ -251,4 +257,16 @@ typedef struct _HV_REFERENCE_TSC_PAGE {
__s64 tsc_offset;
} HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE;
/* Define the number of synthetic interrupt sources. */
#define HV_SYNIC_SINT_COUNT (16)
/* Define the expected SynIC version. */
#define HV_SYNIC_VERSION_1 (0x1)
#define HV_SYNIC_CONTROL_ENABLE (1ULL << 0)
#define HV_SYNIC_SIMP_ENABLE (1ULL << 0)
#define HV_SYNIC_SIEFP_ENABLE (1ULL << 0)
#define HV_SYNIC_SINT_MASKED (1ULL << 16)
#define HV_SYNIC_SINT_AUTO_EOI (1ULL << 17)
#define HV_SYNIC_SINT_VECTOR_MASK (0xFF)
#endif

View File

@ -78,6 +78,7 @@
#define EXIT_REASON_PML_FULL 62
#define EXIT_REASON_XSAVES 63
#define EXIT_REASON_XRSTORS 64
#define EXIT_REASON_PCOMMIT 65
#define VMX_EXIT_REASONS \
{ EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
@ -126,7 +127,8 @@
{ EXIT_REASON_INVVPID, "INVVPID" }, \
{ EXIT_REASON_INVPCID, "INVPCID" }, \
{ EXIT_REASON_XSAVES, "XSAVES" }, \
{ EXIT_REASON_XRSTORS, "XRSTORS" }
{ EXIT_REASON_XRSTORS, "XRSTORS" }, \
{ EXIT_REASON_PCOMMIT, "PCOMMIT" }
#define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1
#define VMX_ABORT_LOAD_HOST_MSR_FAIL 4

View File

@ -32,6 +32,7 @@
static int kvmclock = 1;
static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
static cycle_t kvm_sched_clock_offset;
static int parse_no_kvmclock(char *arg)
{
@ -92,6 +93,29 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs)
return kvm_clock_read();
}
/*
 * Scheduler clock for the stable case: the kvmclock reading rebased by
 * the offset stored in kvm_sched_clock_offset.
 */
static cycle_t kvm_sched_clock_read(void)
{
	cycle_t now = kvm_clock_read();

	return now - kvm_sched_clock_offset;
}
/*
 * Install the paravirtual sched_clock callback.
 *
 * @stable: true when the clock may be treated as stable.
 *
 * In the unstable case the raw kvmclock reader is installed.  In the
 * stable case the current kvmclock value is recorded as an offset, the
 * offset-subtracting reader is installed and the scheduler clock is
 * marked stable.
 */
static inline void kvm_sched_clock_init(bool stable)
{
	if (stable) {
		kvm_sched_clock_offset = kvm_clock_read();
		pv_time_ops.sched_clock = kvm_sched_clock_read;
		set_sched_clock_stable();

		printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n",
		       kvm_sched_clock_offset);

		/* The offset must fit in the pvclock system_time field. */
		BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) >
			     sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time));
	} else {
		pv_time_ops.sched_clock = kvm_clock_read;
	}
}
/*
* If we don't do that, there is the possibility that the guest
* will calibrate under heavy load - thus, getting a lower lpj -
@ -248,7 +272,17 @@ void __init kvmclock_init(void)
memblock_free(mem, size);
return;
}
pv_time_ops.sched_clock = kvm_clock_read;
if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
cpu = get_cpu();
vcpu_time = &hv_clock[cpu].pvti;
flags = pvclock_read_flags(vcpu_time);
kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
put_cpu();
x86_platform.calibrate_tsc = kvm_get_tsc_khz;
x86_platform.get_wallclock = kvm_get_wallclock;
x86_platform.set_wallclock = kvm_set_wallclock;
@ -265,16 +299,6 @@ void __init kvmclock_init(void)
kvm_get_preset_lpj();
clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
pv_info.name = "KVM";
if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
pvclock_set_flags(~0);
cpu = get_cpu();
vcpu_time = &hv_clock[cpu].pvti;
flags = pvclock_read_flags(vcpu_time);
if (flags & PVCLOCK_COUNTS_FROM_ZERO)
set_sched_clock_stable();
put_cpu();
}
int __init kvm_setup_vsyscall_timeinfo(void)

View File

@ -28,6 +28,8 @@ config KVM
select ANON_INODES
select HAVE_KVM_IRQCHIP
select HAVE_KVM_IRQFD
select IRQ_BYPASS_MANAGER
select HAVE_KVM_IRQ_BYPASS
select HAVE_KVM_IRQ_ROUTING
select HAVE_KVM_EVENTFD
select KVM_APIC_ARCHITECTURE

View File

@ -21,6 +21,7 @@
#include <linux/fs.h>
#include "irq.h"
#include "assigned-dev.h"
#include "trace/events/kvm.h"
struct kvm_assigned_dev_kernel {
struct kvm_irq_ack_notifier ack_notifier;
@ -131,7 +132,42 @@ static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
return IRQ_HANDLED;
}
#ifdef __KVM_HAVE_MSI
/*
 * Try to deliver an IRQ from atomic context.
 *
 * Return value:
 *  -EWOULDBLOCK - delivery is not possible in atomic context; the caller
 *                 should retry from process context.
 *  Other values - no retry is needed.  -EINVAL is returned when the GSI
 *                 has no routing entry.
 */
static int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq,
				int level)
{
	struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
	int srcu_idx;
	int ret;

	trace_kvm_set_irq(irq, level, irq_source_id);

	/*
	 * Injection into either PIC or IOAPIC might need to scan all CPUs,
	 * which would need to be retried from thread context; when the same
	 * GSI is connected to both PIC and IOAPIC, we'd have to report a
	 * partial failure here.
	 * Since there's no easy way to do this, we only support injecting MSI,
	 * which is limited to a 1:1 GSI mapping, so only the first routing
	 * entry is used.
	 */
	srcu_idx = srcu_read_lock(&kvm->irq_srcu);
	if (kvm_irq_map_gsi(kvm, entries, irq) > 0)
		ret = kvm_arch_set_irq_inatomic(&entries[0], kvm,
						irq_source_id, irq, level);
	else
		ret = -EINVAL;
	srcu_read_unlock(&kvm->irq_srcu, srcu_idx);

	return ret;
}
static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id)
{
struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
@ -150,9 +186,7 @@ static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
return IRQ_HANDLED;
}
#endif
#ifdef __KVM_HAVE_MSIX
static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id)
{
struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
@ -183,7 +217,6 @@ static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
return IRQ_HANDLED;
}
#endif
/* Ack the irq line for an assigned device */
static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
@ -386,7 +419,6 @@ static int assigned_device_enable_host_intx(struct kvm *kvm,
return 0;
}
#ifdef __KVM_HAVE_MSI
static int assigned_device_enable_host_msi(struct kvm *kvm,
struct kvm_assigned_dev_kernel *dev)
{
@ -408,9 +440,7 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
return 0;
}
#endif
#ifdef __KVM_HAVE_MSIX
static int assigned_device_enable_host_msix(struct kvm *kvm,
struct kvm_assigned_dev_kernel *dev)
{
@ -443,8 +473,6 @@ err:
return r;
}
#endif
static int assigned_device_enable_guest_intx(struct kvm *kvm,
struct kvm_assigned_dev_kernel *dev,
struct kvm_assigned_irq *irq)
@ -454,7 +482,6 @@ static int assigned_device_enable_guest_intx(struct kvm *kvm,
return 0;
}
#ifdef __KVM_HAVE_MSI
static int assigned_device_enable_guest_msi(struct kvm *kvm,
struct kvm_assigned_dev_kernel *dev,
struct kvm_assigned_irq *irq)
@ -463,9 +490,7 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm,
dev->ack_notifier.gsi = -1;
return 0;
}
#endif
#ifdef __KVM_HAVE_MSIX
static int assigned_device_enable_guest_msix(struct kvm *kvm,
struct kvm_assigned_dev_kernel *dev,
struct kvm_assigned_irq *irq)
@ -474,7 +499,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm,
dev->ack_notifier.gsi = -1;
return 0;
}
#endif
static int assign_host_irq(struct kvm *kvm,
struct kvm_assigned_dev_kernel *dev,
@ -492,16 +516,12 @@ static int assign_host_irq(struct kvm *kvm,
case KVM_DEV_IRQ_HOST_INTX:
r = assigned_device_enable_host_intx(kvm, dev);
break;
#ifdef __KVM_HAVE_MSI
case KVM_DEV_IRQ_HOST_MSI:
r = assigned_device_enable_host_msi(kvm, dev);
break;
#endif
#ifdef __KVM_HAVE_MSIX
case KVM_DEV_IRQ_HOST_MSIX:
r = assigned_device_enable_host_msix(kvm, dev);
break;
#endif
default:
r = -EINVAL;
}
@ -534,16 +554,12 @@ static int assign_guest_irq(struct kvm *kvm,
case KVM_DEV_IRQ_GUEST_INTX:
r = assigned_device_enable_guest_intx(kvm, dev, irq);
break;
#ifdef __KVM_HAVE_MSI
case KVM_DEV_IRQ_GUEST_MSI:
r = assigned_device_enable_guest_msi(kvm, dev, irq);
break;
#endif
#ifdef __KVM_HAVE_MSIX
case KVM_DEV_IRQ_GUEST_MSIX:
r = assigned_device_enable_guest_msix(kvm, dev, irq);
break;
#endif
default:
r = -EINVAL;
}
@ -826,7 +842,6 @@ out:
}
#ifdef __KVM_HAVE_MSIX
static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
struct kvm_assigned_msix_nr *entry_nr)
{
@ -906,7 +921,6 @@ msix_entry_out:
return r;
}
#endif
static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
struct kvm_assigned_pci_dev *assigned_dev)
@ -1012,7 +1026,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
goto out;
break;
}
#ifdef __KVM_HAVE_MSIX
case KVM_ASSIGN_SET_MSIX_NR: {
struct kvm_assigned_msix_nr entry_nr;
r = -EFAULT;
@ -1033,7 +1046,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
goto out;
break;
}
#endif
case KVM_ASSIGN_SET_INTX_MASK: {
struct kvm_assigned_pci_dev assigned_dev;

View File

@ -348,7 +348,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
F(AVX512CD);
F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(PCOMMIT);
/* cpuid 0xD.1.eax */
const u32 kvm_supported_word10_x86_features =

View File

@ -133,4 +133,41 @@ static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu)
best = kvm_find_cpuid_entry(vcpu, 7, 0);
return best && (best->ebx & bit(X86_FEATURE_MPX));
}
static inline bool guest_cpuid_has_pcommit(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
best = kvm_find_cpuid_entry(vcpu, 7, 0);
return best && (best->ebx & bit(X86_FEATURE_PCOMMIT));
}
static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
return best && (best->edx & bit(X86_FEATURE_RDTSCP));
}
/*
* NRIPS is provided through cpuidfn 0x8000000a.edx bit 3
*/
#define BIT_NRIPS 3
static inline bool guest_cpuid_has_nrips(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
best = kvm_find_cpuid_entry(vcpu, 0x8000000a, 0);
/*
* NRIPS is a scattered cpuid feature, so we can't use
* X86_FEATURE_NRIPS here (X86_FEATURE_NRIPS would be bit
* position 8, not 3).
*/
return best && (best->edx & bit(BIT_NRIPS));
}
#undef BIT_NRIPS
#endif

View File

@ -2272,8 +2272,8 @@ static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)
#define GET_SMSTATE(type, smbase, offset) \
({ \
type __val; \
int r = ctxt->ops->read_std(ctxt, smbase + offset, &__val, \
sizeof(__val), NULL); \
int r = ctxt->ops->read_phys(ctxt, smbase + offset, &__val, \
sizeof(__val)); \
if (r != X86EMUL_CONTINUE) \
return X86EMUL_UNHANDLEABLE; \
__val; \
@ -2484,17 +2484,36 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
/*
* Get back to real mode, to prepare a safe state in which to load
* CR0/CR3/CR4/EFER. Also this will ensure that addresses passed
* to read_std/write_std are not virtual.
*
* CR4.PCIDE must be zero, because it is a 64-bit mode only feature.
* CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU
* supports long mode.
*/
cr4 = ctxt->ops->get_cr(ctxt, 4);
if (emulator_has_longmode(ctxt)) {
struct desc_struct cs_desc;
/* Zero CR4.PCIDE before CR0.PG. */
if (cr4 & X86_CR4_PCIDE) {
ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
cr4 &= ~X86_CR4_PCIDE;
}
/* A 32-bit code segment is required to clear EFER.LMA. */
memset(&cs_desc, 0, sizeof(cs_desc));
cs_desc.type = 0xb;
cs_desc.s = cs_desc.g = cs_desc.p = 1;
ctxt->ops->set_segment(ctxt, 0, &cs_desc, 0, VCPU_SREG_CS);
}
/* For the 64-bit case, this will clear EFER.LMA. */
cr0 = ctxt->ops->get_cr(ctxt, 0);
if (cr0 & X86_CR0_PE)
ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
cr4 = ctxt->ops->get_cr(ctxt, 4);
/* Now clear CR4.PAE (which must be done before clearing EFER.LME). */
if (cr4 & X86_CR4_PAE)
ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE);
/* And finally go back to 32-bit mode. */
efer = 0;
ctxt->ops->set_msr(ctxt, MSR_EFER, efer);
@ -4455,7 +4474,7 @@ static const struct opcode twobyte_table[256] = {
F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N,
/* 0xA8 - 0xAF */
I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
II(No64 | EmulateOnUD | ImplicitOps, em_rsm, rsm),
II(EmulateOnUD | ImplicitOps, em_rsm, rsm),
F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts),
F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd),
F(DstMem | SrcReg | Src2CL | ModRM, em_shrd),

View File

@ -41,6 +41,7 @@ static bool kvm_hv_msr_partition_wide(u32 msr)
case HV_X64_MSR_TIME_REF_COUNT:
case HV_X64_MSR_CRASH_CTL:
case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
case HV_X64_MSR_RESET:
r = true;
break;
}
@ -163,6 +164,12 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
data);
case HV_X64_MSR_CRASH_CTL:
return kvm_hv_msr_set_crash_ctl(vcpu, data, host);
case HV_X64_MSR_RESET:
if (data == 1) {
vcpu_debug(vcpu, "hyper-v reset requested\n");
kvm_make_request(KVM_REQ_HV_RESET, vcpu);
}
break;
default:
vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
msr, data);
@ -171,7 +178,16 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
return 0;
}
static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
/*
 * CPU time consumed so far by the current task, in 100ns units.  The user
 * and system components reported by task_cputime_adjusted() are summed,
 * converted to nanoseconds and divided by 100.
 */
static u64 current_task_runtime_100ns(void)
{
	cputime_t user, sys;

	task_cputime_adjusted(current, &user, &sys);

	return div_u64(cputime_to_nsecs(user + sys), 100);
}
static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
{
struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
@ -205,6 +221,11 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
case HV_X64_MSR_TPR:
return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
case HV_X64_MSR_VP_RUNTIME:
if (!host)
return 1;
hv->runtime_offset = data - current_task_runtime_100ns();
break;
default:
vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
msr, data);
@ -241,6 +262,9 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
pdata);
case HV_X64_MSR_CRASH_CTL:
return kvm_hv_msr_get_crash_ctl(vcpu, pdata);
case HV_X64_MSR_RESET:
data = 0;
break;
default:
vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
return 1;
@ -277,6 +301,9 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case HV_X64_MSR_APIC_ASSIST_PAGE:
data = hv->hv_vapic;
break;
case HV_X64_MSR_VP_RUNTIME:
data = current_task_runtime_100ns() + hv->runtime_offset;
break;
default:
vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
return 1;
@ -295,7 +322,7 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
mutex_unlock(&vcpu->kvm->lock);
return r;
} else
return kvm_hv_set_msr(vcpu, msr, data);
return kvm_hv_set_msr(vcpu, msr, data, host);
}
int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)

View File

@ -35,6 +35,7 @@
#include <linux/kvm_host.h>
#include <linux/slab.h>
#include "ioapic.h"
#include "irq.h"
#include "i8254.h"
#include "x86.h"
@ -333,7 +334,8 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
s64 interval;
if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
if (!ioapic_in_kernel(kvm) ||
ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
return;
interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);

View File

@ -233,21 +233,7 @@ static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr)
}
/*
 * Rebuild ioapic->handled_vectors from the redirection table: the bit for
 * the vector programmed into each of the IOAPIC_NUM_PINS entries is set,
 * every other bit ends up clear.
 */
static void update_handled_vectors(struct kvm_ioapic *ioapic)
{
/* Assemble the new bitmap in a local so the shared copy is replaced by a single memcpy(). */
DECLARE_BITMAP(handled_vectors, 256);
int i;
memset(handled_vectors, 0, sizeof(handled_vectors));
for (i = 0; i < IOAPIC_NUM_PINS; ++i)
__set_bit(ioapic->redirtbl[i].fields.vector, handled_vectors);
memcpy(ioapic->handled_vectors, handled_vectors,
sizeof(handled_vectors));
/*
 * Write barrier after publishing the bitmap; presumably pairs with the
 * smp_rmb() issued by readers before test_bit() - TODO confirm.
 */
smp_wmb();
}
void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap,
u32 *tmr)
void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
{
struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
union kvm_ioapic_redirect_entry *e;
@ -260,13 +246,11 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap,
kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) ||
index == RTC_GSI) {
if (kvm_apic_match_dest(vcpu, NULL, 0,
e->fields.dest_id, e->fields.dest_mode)) {
e->fields.dest_id, e->fields.dest_mode) ||
(e->fields.trig_mode == IOAPIC_EDGE_TRIG &&
kvm_apic_pending_eoi(vcpu, e->fields.vector)))
__set_bit(e->fields.vector,
(unsigned long *)eoi_exit_bitmap);
if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG)
__set_bit(e->fields.vector,
(unsigned long *)tmr);
}
}
}
spin_unlock(&ioapic->lock);
@ -315,7 +299,6 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
e->bits |= (u32) val;
e->fields.remote_irr = 0;
}
update_handled_vectors(ioapic);
mask_after = e->fields.mask;
if (mask_before != mask_after)
kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after);
@ -599,7 +582,6 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
ioapic->id = 0;
memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS);
rtc_irq_eoi_tracking_reset(ioapic);
update_handled_vectors(ioapic);
}
static const struct kvm_io_device_ops ioapic_mmio_ops = {
@ -628,8 +610,10 @@ int kvm_ioapic_init(struct kvm *kvm)
if (ret < 0) {
kvm->arch.vioapic = NULL;
kfree(ioapic);
return ret;
}
kvm_vcpu_request_scan_ioapic(kvm);
return ret;
}
@ -666,7 +650,6 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
ioapic->irr = 0;
ioapic->irr_delivered = 0;
update_handled_vectors(ioapic);
kvm_vcpu_request_scan_ioapic(kvm);
kvm_ioapic_inject_all(ioapic, state->irr);
spin_unlock(&ioapic->lock);

View File

@ -9,6 +9,7 @@ struct kvm;
struct kvm_vcpu;
#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS
#define MAX_NR_RESERVED_IOAPIC_PINS KVM_MAX_IRQ_ROUTES
#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
#define IOAPIC_EDGE_TRIG 0
#define IOAPIC_LEVEL_TRIG 1
@ -73,7 +74,6 @@ struct kvm_ioapic {
struct kvm *kvm;
void (*ack_notifier)(void *opaque, int irq);
spinlock_t lock;
DECLARE_BITMAP(handled_vectors, 256);
struct rtc_status rtc_status;
struct delayed_work eoi_inject;
u32 irq_eoi[IOAPIC_NUM_PINS];
@ -98,11 +98,12 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
return kvm->arch.vioapic;
}
static inline bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector)
static inline int ioapic_in_kernel(struct kvm *kvm)
{
struct kvm_ioapic *ioapic = kvm->arch.vioapic;
smp_rmb();
return test_bit(vector, ioapic->handled_vectors);
int ret;
ret = (ioapic_irqchip(kvm) != NULL);
return ret;
}
void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
@ -120,7 +121,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq, unsigned long *dest_map);
int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap,
u32 *tmr);
void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
#endif

View File

@ -37,15 +37,28 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
/*
 * Check whether a userspace-provided external interrupt is pending on
 * this vCPU.  Returns non-zero when v->arch.pending_external_vector holds
 * anything other than -1 (-1 is the "nothing pending" value).
 */
static int pending_userspace_extint(struct kvm_vcpu *v)
{
return v->arch.pending_external_vector != -1;
}
/*
* check if there is pending interrupt from
* non-APIC source without intack.
*/
static int kvm_cpu_has_extint(struct kvm_vcpu *v)
{
if (kvm_apic_accept_pic_intr(v))
return pic_irqchip(v->kvm)->output; /* PIC */
else
u8 accept = kvm_apic_accept_pic_intr(v);
if (accept) {
if (irqchip_split(v->kvm))
return pending_userspace_extint(v);
else
return pic_irqchip(v->kvm)->output;
} else
return 0;
}
@ -57,13 +70,13 @@ static int kvm_cpu_has_extint(struct kvm_vcpu *v)
*/
int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
{
if (!irqchip_in_kernel(v->kvm))
if (!lapic_in_kernel(v))
return v->arch.interrupt.pending;
if (kvm_cpu_has_extint(v))
return 1;
if (kvm_apic_vid_enabled(v->kvm))
if (kvm_vcpu_apic_vid_enabled(v))
return 0;
return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
@ -75,7 +88,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
*/
int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
{
if (!irqchip_in_kernel(v->kvm))
if (!lapic_in_kernel(v))
return v->arch.interrupt.pending;
if (kvm_cpu_has_extint(v))
@ -91,9 +104,16 @@ EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
*/
/*
 * Fetch the pending non-APIC (external) interrupt vector, if any.
 *
 * Returns -1 when kvm_cpu_has_extint() reports nothing pending.  With a
 * split irqchip the vector stored in v->arch.pending_external_vector is
 * consumed (the field is reset to -1) and returned; otherwise the vector
 * is read from the in-kernel PIC via kvm_pic_read_irq().
 */
static int kvm_cpu_get_extint(struct kvm_vcpu *v)
{
	if (!kvm_cpu_has_extint(v))
		return -1;

	if (irqchip_split(v->kvm)) {
		int vector = v->arch.pending_external_vector;

		/* Mark the userspace vector as delivered. */
		v->arch.pending_external_vector = -1;
		return vector;
	}

	return kvm_pic_read_irq(v->kvm); /* PIC */
}
/*
@ -103,7 +123,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
{
int vector;
if (!irqchip_in_kernel(v->kvm))
if (!lapic_in_kernel(v))
return v->arch.interrupt.nr;
vector = kvm_cpu_get_extint(v);

View File

@ -83,13 +83,38 @@ static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
return kvm->arch.vpic;
}
/* Non-zero when this VM has an in-kernel PIC, i.e. pic_irqchip() is set. */
static inline int pic_in_kernel(struct kvm *kvm)
{
	return pic_irqchip(kvm) != NULL;
}
/* Return the VM's kvm->arch.irqchip_split flag. */
static inline int irqchip_split(struct kvm *kvm)
{
return kvm->arch.irqchip_split;
}
/*
 * Non-zero when the VM has an in-kernel irqchip: either the in-kernel PIC
 * exists or the irqchip is split (irqchip_split()).
 */
static inline int irqchip_in_kernel(struct kvm *kvm)
{
	struct kvm_pic *vpic = pic_irqchip(kvm);
	bool ret;

	ret = (vpic != NULL);
	ret |= irqchip_split(kvm);

	/* Read vpic before kvm->irq_routing. */
	smp_rmb();
	return ret;
}
/* Non-zero when this vCPU has an APIC object, i.e. vcpu->arch.apic is set. */
static inline int lapic_in_kernel(struct kvm_vcpu *vcpu)
{
/* Same as irqchip_in_kernel(vcpu->kvm), but with less
* pointer chasing and no unnecessary memory barriers.
*/
return vcpu->arch.apic != NULL;
}
void kvm_pic_reset(struct kvm_kpic_state *s);

View File

@ -91,8 +91,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
return r;
}
static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
struct kvm_lapic_irq *irq)
void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
struct kvm_lapic_irq *irq)
{
trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
@ -108,6 +108,7 @@ static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
irq->level = 1;
irq->shorthand = 0;
}
EXPORT_SYMBOL_GPL(kvm_set_msi_irq);
int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level, bool line_status)
@ -123,12 +124,16 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
}
static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm)
int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level,
bool line_status)
{
struct kvm_lapic_irq irq;
int r;
if (unlikely(e->type != KVM_IRQ_ROUTING_MSI))
return -EWOULDBLOCK;
kvm_set_msi_irq(e, &irq);
if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
@ -137,42 +142,6 @@ static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e,
return -EWOULDBLOCK;
}
/*
 * Try to deliver an IRQ from atomic context; on failure the caller can
 * retry from process context.
 *
 * Return value:
 *  -EWOULDBLOCK - cannot be delivered atomically: retry in process context.
 *  Other values - no retry needed (-EINVAL if the GSI has no routing entry).
 */
int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level)
{
	struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
	int srcu_idx;
	int rc = -EINVAL;

	trace_kvm_set_irq(irq, level, irq_source_id);

	/*
	 * Injection into either PIC or IOAPIC might need to scan all CPUs,
	 * which would have to be retried from thread context; when the same
	 * GSI is connected to both PIC and IOAPIC, a partial failure would
	 * have to be reported.  There is no easy way to do that, so only MSI
	 * injection, which is limited to a 1:1 GSI mapping, is supported.
	 */
	srcu_idx = srcu_read_lock(&kvm->irq_srcu);
	if (kvm_irq_map_gsi(kvm, entries, irq) <= 0)
		goto unlock;

	if (likely(entries[0].type == KVM_IRQ_ROUTING_MSI))
		rc = kvm_set_msi_inatomic(&entries[0], kvm);
	else
		rc = -EWOULDBLOCK;

unlock:
	srcu_read_unlock(&kvm->irq_srcu, srcu_idx);
	return rc;
}
int kvm_request_irq_source_id(struct kvm *kvm)
{
unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
@ -208,7 +177,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
goto unlock;
}
clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
if (!irqchip_in_kernel(kvm))
if (!ioapic_in_kernel(kvm))
goto unlock;
kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id);
@ -297,6 +266,33 @@ out:
return r;
}
/*
 * Decide whether @irq targets exactly one vCPU.
 *
 * The map-based fast path is tried first.  If it cannot answer, every vCPU
 * with a present APIC is matched against the interrupt's shorthand,
 * destination id and destination mode.
 *
 * Returns true when exactly one vCPU matches; *dest_vcpu then points to
 * it.  Returns false when none or more than one matches; in the latter
 * case *dest_vcpu has already been set to the first match.
 */
bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
			     struct kvm_vcpu **dest_vcpu)
{
	struct kvm_vcpu *vcpu;
	int matches = 0;
	int i;

	if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu))
		return true;

	kvm_for_each_vcpu(i, vcpu, kvm) {
		if (!kvm_apic_present(vcpu) ||
		    !kvm_apic_match_dest(vcpu, NULL, irq->shorthand,
					 irq->dest_id, irq->dest_mode))
			continue;

		/* A second match means the destination is not unique. */
		matches++;
		if (matches == 2)
			return false;

		*dest_vcpu = vcpu;
	}

	return matches == 1;
}
EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu);
#define IOAPIC_ROUTING_ENTRY(irq) \
{ .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
.u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
@ -328,3 +324,54 @@ int kvm_setup_default_irq_routing(struct kvm *kvm)
return kvm_set_irq_routing(kvm, default_routing,
ARRAY_SIZE(default_routing), 0);
}
/* Zero-length routing table handed to kvm_set_irq_routing() below. */
static const struct kvm_irq_routing_entry empty_routing[] = {};
/*
 * Install a routing table with no entries for @kvm.  Returns whatever
 * kvm_set_irq_routing() returns.
 */
int kvm_setup_empty_irq_routing(struct kvm *kvm)
{
return kvm_set_irq_routing(kvm, empty_routing, 0, 0);
}
/*
 * Arch hook for IRQ routing changes.  An IOAPIC rescan is requested only
 * when an irqchip is in the kernel but the IOAPIC is not (as reported by
 * irqchip_in_kernel() and ioapic_in_kernel()).
 */
void kvm_arch_irq_routing_update(struct kvm *kvm)
{
	if (!ioapic_in_kernel(kvm) && irqchip_in_kernel(kvm))
		kvm_make_scan_ioapic_request(kvm);
}
/*
 * Walk the VM's IRQ routing table and set, in @eoi_exit_bitmap, the vector
 * of every level-triggered MSI route whose destination matches @vcpu.
 *
 * Only the first min(table->nr_rt_entries, nr_reserved_ioapic_pins) GSIs
 * are examined.  The table is read under the irq_srcu read lock.
 */
void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
{
struct kvm *kvm = vcpu->kvm;
struct kvm_kernel_irq_routing_entry *entry;
struct kvm_irq_routing_table *table;
u32 i, nr_ioapic_pins;
int idx;
/* kvm->irq_routing must be read after clearing
* KVM_SCAN_IOAPIC. */
smp_mb();
idx = srcu_read_lock(&kvm->irq_srcu);
table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
/* Never index past the routing table, nor past the reserved pin count. */
nr_ioapic_pins = min_t(u32, table->nr_rt_entries,
kvm->arch.nr_reserved_ioapic_pins);
for (i = 0; i < nr_ioapic_pins; ++i) {
hlist_for_each_entry(entry, &table->map[i], link) {
u32 dest_id, dest_mode;
bool level;
/* Only MSI routes are considered. */
if (entry->type != KVM_IRQ_ROUTING_MSI)
continue;
/* Decode the MSI: address_lo bits 12-19 are the destination id, bit 2 the destination mode. */
dest_id = (entry->msi.address_lo >> 12) & 0xff;
dest_mode = (entry->msi.address_lo >> 2) & 0x1;
level = entry->msi.data & MSI_DATA_TRIGGER_LEVEL;
if (level && kvm_apic_match_dest(vcpu, NULL, 0,
dest_id, dest_mode)) {
/* The vector is the low byte of the MSI data. */
u32 vector = entry->msi.data & 0xff;
__set_bit(vector,
(unsigned long *) eoi_exit_bitmap);
}
}
}
srcu_read_unlock(&kvm->irq_srcu, idx);
}

View File

@ -209,7 +209,7 @@ out:
if (old)
kfree_rcu(old, rcu);
kvm_vcpu_request_scan_ioapic(kvm);
kvm_make_scan_ioapic_request(kvm);
}
static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
@ -348,6 +348,8 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
struct kvm_lapic *apic = vcpu->arch.apic;
__kvm_apic_update_irr(pir, apic->regs);
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
@ -390,7 +392,7 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
vcpu = apic->vcpu;
if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) {
if (unlikely(kvm_vcpu_apic_vid_enabled(vcpu))) {
/* try to update RVI */
apic_clear_vector(vec, apic->regs + APIC_IRR);
kvm_make_request(KVM_REQ_EVENT, vcpu);
@ -551,15 +553,6 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
__clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
}
/*
 * Copy the eight 32-bit words of @tmr into the vCPU's APIC_TMR registers,
 * which are spaced 0x10 bytes apart starting at APIC_TMR.
 */
void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr)
{
	struct kvm_lapic *apic = vcpu->arch.apic;
	int reg_off = APIC_TMR;
	int word;

	for (word = 0; word < 8; word++, reg_off += 0x10)
		apic_set_reg(apic, reg_off, tmr[word]);
}
static void apic_update_ppr(struct kvm_lapic *apic)
{
u32 tpr, isrv, ppr, old_ppr;
@ -764,6 +757,65 @@ out:
return ret;
}
/*
 * Try to resolve @irq to exactly one destination vCPU using the APIC map
 * in kvm->arch.apic_map, read under rcu_read_lock().
 *
 * Returns true and stores the vCPU in *dest_vcpu on success.  Returns
 * false, leaving *dest_vcpu untouched, when:
 *  - the interrupt uses a destination shorthand,
 *  - no APIC map is available,
 *  - physical mode targets 0xFF or an id outside map->phys_map,
 *  - the logical map is not valid or the cluster id is out of range,
 *  - more than one bit is set in the logical destination bitmap, or
 *  - the selected APIC is missing or not present.
 */
bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_vcpu **dest_vcpu)
{
struct kvm_apic_map *map;
bool ret = false;
struct kvm_lapic *dst = NULL;
/* Shorthand destinations are never resolved here. */
if (irq->shorthand)
return false;
rcu_read_lock();
map = rcu_dereference(kvm->arch.apic_map);
if (!map)
goto out;
if (irq->dest_mode == APIC_DEST_PHYSICAL) {
/* 0xFF is rejected outright (presumably the physical broadcast id - confirm). */
if (irq->dest_id == 0xFF)
goto out;
if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
goto out;
dst = map->phys_map[irq->dest_id];
if (dst && kvm_apic_present(dst->vcpu))
*dest_vcpu = dst->vcpu;
else
goto out;
} else {
u16 cid;
unsigned long bitmap = 1;
int i, r = 0;
if (!kvm_apic_logical_map_valid(map))
goto out;
/*
 * apic_logical_id() reports the cluster id and a 16-bit mask through
 * u16 pointers; NOTE(review): the cast assumes the mask lands in the
 * low 16 bits of bitmap - confirm for the target byte order.
 */
apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap);
if (cid >= ARRAY_SIZE(map->logical_map))
goto out;
/* Give up as soon as a second set bit is seen. */
for_each_set_bit(i, &bitmap, 16) {
dst = map->logical_map[cid][i];
if (++r == 2)
goto out;
}
/* dst is still NULL here if no bit was set in the mask. */
if (dst && kvm_apic_present(dst->vcpu))
*dest_vcpu = dst->vcpu;
else
goto out;
}
ret = true;
out:
rcu_read_unlock();
return ret;
}
/*
* Add a pending IRQ into lapic.
* Return 1 if successfully added and 0 if discarded.
@ -781,6 +833,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
case APIC_DM_LOWEST:
vcpu->arch.apic_arb_prio++;
case APIC_DM_FIXED:
if (unlikely(trig_mode && !level))
break;
/* FIXME add logic for vcpu on reset */
if (unlikely(!apic_enabled(apic)))
break;
@ -790,6 +845,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
if (dest_map)
__set_bit(vcpu->vcpu_id, dest_map);
if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) {
if (trig_mode)
apic_set_vector(vector, apic->regs + APIC_TMR);
else
apic_clear_vector(vector, apic->regs + APIC_TMR);
}
if (kvm_x86_ops->deliver_posted_interrupt)
kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
else {
@ -868,16 +930,32 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
}
/*
 * Return true when @vector is set in the EOI-exit bitmap of the vCPU that
 * owns @apic (apic->vcpu->arch.eoi_exit_bitmap).
 */
static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector)
{
return test_bit(vector, (ulong *)apic->vcpu->arch.eoi_exit_bitmap);
}
/*
 * Forward an EOI for @vector to the IOAPIC, if the vector needs it.
 *
 * Nothing happens unless kvm_ioapic_handles_vector() reports the vector.
 * With a split irqchip the vector is recorded in pending_ioapic_eoi and
 * KVM_REQ_IOAPIC_EOI_EXIT is raised so userspace can handle the EOI.
 * Otherwise kvm_ioapic_update_eoi() is called with the trigger mode read
 * from the vector's APIC_TMR bit.
 */
static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
{
	int trigger_mode;

	/* EOI the ioapic only if the vector is in the EOI-exit bitmap. */
	if (!kvm_ioapic_handles_vector(apic, vector))
		return;

	/* Request a KVM exit to inform the userspace IOAPIC. */
	if (irqchip_split(apic->vcpu->kvm)) {
		apic->vcpu->arch.pending_ioapic_eoi = vector;
		kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu);
		return;
	}

	if (apic_test_vector(vector, apic->regs + APIC_TMR))
		trigger_mode = IOAPIC_LEVEL_TRIG;
	else
		trigger_mode = IOAPIC_EDGE_TRIG;

	kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode);
}
static int apic_set_eoi(struct kvm_lapic *apic)
@ -1615,7 +1693,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
}
apic->irr_pending = kvm_apic_vid_enabled(vcpu->kvm);
apic->irr_pending = kvm_vcpu_apic_vid_enabled(vcpu);
apic->isr_count = kvm_x86_ops->hwapic_isr_update ? 1 : 0;
apic->highest_isr_cache = -1;
update_divide_count(apic);
@ -1838,7 +1916,10 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
apic_find_highest_isr(apic));
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_rtc_eoi_tracking_restore_one(vcpu);
if (ioapic_in_kernel(vcpu->kvm))
kvm_rtc_eoi_tracking_restore_one(vcpu);
vcpu->arch.apic_arb_prio = 0;
}
void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
@ -1922,7 +2003,7 @@ static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu,
/* Cache not set: could be safe but we don't bother. */
apic->highest_isr_cache == -1 ||
/* Need EOI to update ioapic. */
kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) {
kvm_ioapic_handles_vector(apic, apic->highest_isr_cache)) {
/*
* PV EOI was disabled by apic_sync_pv_eoi_from_guest
* so we need not do anything here.
@ -1978,7 +2059,7 @@ int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
struct kvm_lapic *apic = vcpu->arch.apic;
u32 reg = (msr - APIC_BASE_MSR) << 4;
if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
return 1;
if (reg == APIC_ICR2)
@ -1995,7 +2076,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
struct kvm_lapic *apic = vcpu->arch.apic;
u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0;
if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
return 1;
if (reg == APIC_DFR || reg == APIC_ICR2) {

View File

@ -57,7 +57,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
void kvm_apic_set_version(struct kvm_vcpu *vcpu);
void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr);
void __kvm_apic_update_irr(u32 *pir, void *regs);
void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
@ -144,9 +143,9 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic)
return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
}
static inline bool kvm_apic_vid_enabled(struct kvm *kvm)
static inline bool kvm_vcpu_apic_vid_enabled(struct kvm_vcpu *vcpu)
{
return kvm_x86_ops->vm_has_apicv(kvm);
return kvm_x86_ops->cpu_uses_apicv(vcpu);
}
static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
@ -169,4 +168,6 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
void wait_lapic_expire(struct kvm_vcpu *vcpu);
bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_vcpu **dest_vcpu);
#endif

View File

@ -818,14 +818,11 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm->arch.indirect_shadow_pages--;
}
static int has_wrprotected_page(struct kvm_vcpu *vcpu,
gfn_t gfn,
int level)
static int __has_wrprotected_page(gfn_t gfn, int level,
struct kvm_memory_slot *slot)
{
struct kvm_memory_slot *slot;
struct kvm_lpage_info *linfo;
slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
if (slot) {
linfo = lpage_info_slot(gfn, slot, level);
return linfo->write_count;
@ -834,6 +831,14 @@ static int has_wrprotected_page(struct kvm_vcpu *vcpu,
return 1;
}
/*
 * vCPU-based front end for __has_wrprotected_page(): looks up the memslot
 * for @gfn with kvm_vcpu_gfn_to_memslot() and passes it along unchanged.
 */
static int has_wrprotected_page(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
{
	return __has_wrprotected_page(gfn, level,
				      kvm_vcpu_gfn_to_memslot(vcpu, gfn));
}
static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
{
unsigned long page_size;
@ -851,6 +856,17 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
return ret;
}
/*
 * A memslot is usable when it exists and is not flagged
 * KVM_MEMSLOT_INVALID.  If @no_dirty_log is set, the slot must also have
 * no dirty bitmap.
 */
static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot,
					  bool no_dirty_log)
{
	return slot &&
	       !(slot->flags & KVM_MEMSLOT_INVALID) &&
	       !(no_dirty_log && slot->dirty_bitmap);
}
static struct kvm_memory_slot *
gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
bool no_dirty_log)
@ -858,21 +874,25 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
struct kvm_memory_slot *slot;
slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
(no_dirty_log && slot->dirty_bitmap))
if (!memslot_valid_for_gpte(slot, no_dirty_log))
slot = NULL;
return slot;
}
/*
 * Return true when gfn_to_memslot_dirty_bitmap(), called with no_dirty_log
 * set, yields no slot for @large_gfn.
 */
static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
{
return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true);
}
static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
bool *force_pt_level)
{
int host_level, level, max_level;
struct kvm_memory_slot *slot;
if (unlikely(*force_pt_level))
return PT_PAGE_TABLE_LEVEL;
slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn);
*force_pt_level = !memslot_valid_for_gpte(slot, true);
if (unlikely(*force_pt_level))
return PT_PAGE_TABLE_LEVEL;
host_level = host_mapping_level(vcpu->kvm, large_gfn);
@ -882,7 +902,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
if (has_wrprotected_page(vcpu, large_gfn, level))
if (__has_wrprotected_page(large_gfn, level, slot))
break;
return level - 1;
@ -2962,14 +2982,13 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
{
int r;
int level;
int force_pt_level;
bool force_pt_level = false;
pfn_t pfn;
unsigned long mmu_seq;
bool map_writable, write = error_code & PFERR_WRITE_MASK;
force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
level = mapping_level(vcpu, gfn, &force_pt_level);
if (likely(!force_pt_level)) {
level = mapping_level(vcpu, gfn);
/*
* This path builds a PAE pagetable - so we can map
* 2mb pages at maximum. Therefore check if the level
@ -2979,8 +2998,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
level = PT_DIRECTORY_LEVEL;
gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
} else
level = PT_PAGE_TABLE_LEVEL;
}
if (fast_page_fault(vcpu, v, level, error_code))
return 0;
@ -3427,7 +3445,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
static bool can_do_async_pf(struct kvm_vcpu *vcpu)
{
if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
if (unlikely(!lapic_in_kernel(vcpu) ||
kvm_event_needs_reinjection(vcpu)))
return false;
@ -3476,7 +3494,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
pfn_t pfn;
int r;
int level;
int force_pt_level;
bool force_pt_level;
gfn_t gfn = gpa >> PAGE_SHIFT;
unsigned long mmu_seq;
int write = error_code & PFERR_WRITE_MASK;
@ -3495,20 +3513,15 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
if (r)
return r;
if (mapping_level_dirty_bitmap(vcpu, gfn) ||
!check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL))
force_pt_level = 1;
else
force_pt_level = 0;
force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn,
PT_DIRECTORY_LEVEL);
level = mapping_level(vcpu, gfn, &force_pt_level);
if (likely(!force_pt_level)) {
level = mapping_level(vcpu, gfn);
if (level > PT_DIRECTORY_LEVEL &&
!check_hugepage_cache_consistency(vcpu, gfn, level))
level = PT_DIRECTORY_LEVEL;
gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
} else
level = PT_PAGE_TABLE_LEVEL;
}
if (fast_page_fault(vcpu, gpa, level, error_code))
return 0;
@ -3706,7 +3719,7 @@ static void
__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
int maxphyaddr, bool execonly)
{
int pte;
u64 bad_mt_xwr;
rsvd_check->rsvd_bits_mask[0][3] =
rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
@ -3724,14 +3737,16 @@ __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
for (pte = 0; pte < 64; pte++) {
int rwx_bits = pte & 7;
int mt = pte >> 3;
if (mt == 0x2 || mt == 0x3 || mt == 0x7 ||
rwx_bits == 0x2 || rwx_bits == 0x6 ||
(rwx_bits == 0x4 && !execonly))
rsvd_check->bad_mt_xwr |= (1ull << pte);
bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */
bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */
bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */
bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */
bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */
if (!execonly) {
/* bits 0..2 must not be 100 unless VMX capabilities allow it */
bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
}
rsvd_check->bad_mt_xwr = bad_mt_xwr;
}
static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,

View File

@ -698,7 +698,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
int r;
pfn_t pfn;
int level = PT_PAGE_TABLE_LEVEL;
int force_pt_level;
bool force_pt_level = false;
unsigned long mmu_seq;
bool map_writable, is_self_change_mapping;
@ -743,15 +743,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
&walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
if (walker.level >= PT_DIRECTORY_LEVEL)
force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn)
|| is_self_change_mapping;
else
force_pt_level = 1;
if (!force_pt_level) {
level = min(walker.level, mapping_level(vcpu, walker.gfn));
walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
}
if (walker.level >= PT_DIRECTORY_LEVEL && !is_self_change_mapping) {
level = mapping_level(vcpu, walker.gfn, &force_pt_level);
if (likely(!force_pt_level)) {
level = min(walker.level, level);
walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
}
} else
force_pt_level = true;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();

View File

@ -159,6 +159,9 @@ struct vcpu_svm {
u32 apf_reason;
u64 tsc_ratio;
/* cached guest cpuid flags for faster access */
bool nrips_enabled : 1;
};
static DEFINE_PER_CPU(u64, current_tsc_ratio);
@ -1086,7 +1089,7 @@ static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
return target_tsc - tsc;
}
static void init_vmcb(struct vcpu_svm *svm, bool init_event)
static void init_vmcb(struct vcpu_svm *svm)
{
struct vmcb_control_area *control = &svm->vmcb->control;
struct vmcb_save_area *save = &svm->vmcb->save;
@ -1157,8 +1160,7 @@ static void init_vmcb(struct vcpu_svm *svm, bool init_event)
init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
if (!init_event)
svm_set_efer(&svm->vcpu, 0);
svm_set_efer(&svm->vcpu, 0);
save->dr6 = 0xffff0ff0;
kvm_set_rflags(&svm->vcpu, 2);
save->rip = 0x0000fff0;
@ -1212,7 +1214,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
}
init_vmcb(svm, init_event);
init_vmcb(svm);
kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
@ -1268,7 +1270,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
clear_page(svm->vmcb);
svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
svm->asid_generation = 0;
init_vmcb(svm, false);
init_vmcb(svm);
svm_init_osvw(&svm->vcpu);
@ -1890,7 +1892,7 @@ static int shutdown_interception(struct vcpu_svm *svm)
* so reinitialize it.
*/
clear_page(svm->vmcb);
init_vmcb(svm, false);
init_vmcb(svm);
kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
return 0;
@ -2365,7 +2367,9 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
nested_vmcb->control.next_rip = vmcb->control.next_rip;
if (svm->nrips_enabled)
nested_vmcb->control.next_rip = vmcb->control.next_rip;
/*
* If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
@ -3060,7 +3064,7 @@ static int cr8_write_interception(struct vcpu_svm *svm)
u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
/* instruction emulation calls kvm_set_cr8() */
r = cr_interception(svm);
if (irqchip_in_kernel(svm->vcpu.kvm))
if (lapic_in_kernel(&svm->vcpu))
return r;
if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
return r;
@ -3294,24 +3298,11 @@ static int msr_interception(struct vcpu_svm *svm)
static int interrupt_window_interception(struct vcpu_svm *svm)
{
struct kvm_run *kvm_run = svm->vcpu.run;
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
svm_clear_vintr(svm);
svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
mark_dirty(svm->vmcb, VMCB_INTR);
++svm->vcpu.stat.irq_window_exits;
/*
* If the user space waits to inject interrupts, exit as soon as
* possible
*/
if (!irqchip_in_kernel(svm->vcpu.kvm) &&
kvm_run->request_interrupt_window &&
!kvm_cpu_has_interrupt(&svm->vcpu)) {
kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
return 0;
}
return 1;
}
@ -3659,12 +3650,12 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
return;
}
static int svm_vm_has_apicv(struct kvm *kvm)
static int svm_cpu_uses_apicv(struct kvm_vcpu *vcpu)
{
return 0;
}
static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu)
{
return;
}
@ -4098,6 +4089,10 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
/*
 * Refresh the per-vcpu cached copy of guest CPUID-derived flags.
 *
 * Currently only caches whether the guest CPUID reports nrips, so that
 * other code can test svm->nrips_enabled instead of querying CPUID.
 */
static void svm_cpuid_update(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/* Normalize the query result to 0/1 before storing it in the 1-bit field. */
	svm->nrips_enabled = (guest_cpuid_has_nrips(&svm->vcpu) != 0);
}
static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@ -4425,7 +4420,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.enable_irq_window = enable_irq_window,
.update_cr8_intercept = update_cr8_intercept,
.set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
.vm_has_apicv = svm_vm_has_apicv,
.cpu_uses_apicv = svm_cpu_uses_apicv,
.load_eoi_exitmap = svm_load_eoi_exitmap,
.sync_pir_to_irr = svm_sync_pir_to_irr,

View File

@ -128,6 +128,24 @@ TRACE_EVENT(kvm_pio,
__entry->count > 1 ? "(...)" : "")
);
/*
 * Tracepoint for fast mmio.
 *
 * Takes a single u64 argument, gpa, stores it in the trace entry and
 * prints it in hexadecimal.
 */
TRACE_EVENT(kvm_fast_mmio,
TP_PROTO(u64 gpa),
TP_ARGS(gpa),
TP_STRUCT__entry(
__field(u64, gpa)
),
TP_fast_assign(
__entry->gpa = gpa;
),
TP_printk("fast mmio at gpa 0x%llx", __entry->gpa)
);
/*
* Tracepoint for cpuid.
*/
@ -974,6 +992,39 @@ TRACE_EVENT(kvm_enter_smm,
__entry->smbase)
);
/*
 * Tracepoint for VT-d posted-interrupts.
 *
 * Records the vcpu id, gsi, guest vector (gvec), pi_desc_addr and the
 * boolean "set" passed by the caller.  The printed message reads
 * "enabled and being updated" when set is true and "disabled" otherwise.
 */
TRACE_EVENT(kvm_pi_irte_update,
TP_PROTO(unsigned int vcpu_id, unsigned int gsi,
unsigned int gvec, u64 pi_desc_addr, bool set),
TP_ARGS(vcpu_id, gsi, gvec, pi_desc_addr, set),
TP_STRUCT__entry(
__field( unsigned int, vcpu_id )
__field( unsigned int, gsi )
__field( unsigned int, gvec )
__field( u64, pi_desc_addr )
__field( bool, set )
),
TP_fast_assign(
__entry->vcpu_id = vcpu_id;
__entry->gsi = gsi;
__entry->gvec = gvec;
__entry->pi_desc_addr = pi_desc_addr;
__entry->set = set;
),
TP_printk("VT-d PI is %s for this irq, vcpu %u, gsi: 0x%x, "
"gvec: 0x%x, pi_desc_addr: 0x%llx",
__entry->set ? "enabled and being updated" : "disabled",
__entry->vcpu_id,
__entry->gsi,
__entry->gvec,
__entry->pi_desc_addr)
);
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH

File diff suppressed because it is too large Load Diff

View File

@ -51,6 +51,8 @@
#include <linux/pci.h>
#include <linux/timekeeper_internal.h>
#include <linux/pvclock_gtod.h>
#include <linux/kvm_irqfd.h>
#include <linux/irqbypass.h>
#include <trace/events/kvm.h>
#define CREATE_TRACE_POINTS
@ -64,6 +66,7 @@
#include <asm/fpu/internal.h> /* Ugh! */
#include <asm/pvclock.h>
#include <asm/div64.h>
#include <asm/irq_remapping.h>
#define MAX_IO_MSRS 256
#define KVM_MAX_MCE_BANKS 32
@ -622,7 +625,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
if ((cr0 ^ old_cr0) & update_bits)
kvm_mmu_reset_context(vcpu);
if ((cr0 ^ old_cr0) & X86_CR0_CD)
if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
return 0;
@ -789,7 +794,7 @@ int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
if (cr8 & CR8_RESERVED_BITS)
return 1;
if (irqchip_in_kernel(vcpu->kvm))
if (lapic_in_kernel(vcpu))
kvm_lapic_set_tpr(vcpu, cr8);
else
vcpu->arch.cr8 = cr8;
@ -799,7 +804,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr8);
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
{
if (irqchip_in_kernel(vcpu->kvm))
if (lapic_in_kernel(vcpu))
return kvm_lapic_get_cr8(vcpu);
else
return vcpu->arch.cr8;
@ -953,6 +958,9 @@ static u32 emulated_msrs[] = {
HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
HV_X64_MSR_RESET,
HV_X64_MSR_VP_INDEX,
HV_X64_MSR_VP_RUNTIME,
HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
MSR_KVM_PV_EOI_EN,
@ -1898,6 +1906,8 @@ static void accumulate_steal_time(struct kvm_vcpu *vcpu)
static void record_steal_time(struct kvm_vcpu *vcpu)
{
accumulate_steal_time(vcpu);
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
@ -2048,12 +2058,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!(data & KVM_MSR_ENABLED))
break;
vcpu->arch.st.last_steal = current->sched_info.run_delay;
preempt_disable();
accumulate_steal_time(vcpu);
preempt_enable();
kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
break;
@ -2449,6 +2453,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ENABLE_CAP_VM:
case KVM_CAP_DISABLE_QUIRKS:
case KVM_CAP_SET_BOOT_CPU_ID:
case KVM_CAP_SPLIT_IRQCHIP:
#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
case KVM_CAP_ASSIGN_DEV_IRQ:
case KVM_CAP_PCI_2_3:
@ -2628,7 +2633,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
vcpu->cpu = cpu;
}
accumulate_steal_time(vcpu);
kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
}
@ -2662,12 +2666,24 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
{
if (irq->irq >= KVM_NR_INTERRUPTS)
return -EINVAL;
if (irqchip_in_kernel(vcpu->kvm))
if (!irqchip_in_kernel(vcpu->kvm)) {
kvm_queue_interrupt(vcpu, irq->irq, false);
kvm_make_request(KVM_REQ_EVENT, vcpu);
return 0;
}
/*
* With in-kernel LAPIC, we only use this to inject EXTINT, so
* fail for in-kernel 8259.
*/
if (pic_in_kernel(vcpu->kvm))
return -ENXIO;
kvm_queue_interrupt(vcpu, irq->irq, false);
kvm_make_request(KVM_REQ_EVENT, vcpu);
if (vcpu->arch.pending_external_vector != -1)
return -EEXIST;
vcpu->arch.pending_external_vector = irq->irq;
return 0;
}
@ -3176,7 +3192,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
struct kvm_vapic_addr va;
r = -EINVAL;
if (!irqchip_in_kernel(vcpu->kvm))
if (!lapic_in_kernel(vcpu))
goto out;
r = -EFAULT;
if (copy_from_user(&va, argp, sizeof va))
@ -3425,41 +3441,35 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
{
int r = 0;
mutex_lock(&kvm->arch.vpit->pit_state.lock);
memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
return r;
return 0;
}
static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
{
int r = 0;
mutex_lock(&kvm->arch.vpit->pit_state.lock);
memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
return r;
return 0;
}
static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
{
int r = 0;
mutex_lock(&kvm->arch.vpit->pit_state.lock);
memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
sizeof(ps->channels));
ps->flags = kvm->arch.vpit->pit_state.flags;
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
memset(&ps->reserved, 0, sizeof(ps->reserved));
return r;
return 0;
}
static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
{
int r = 0, start = 0;
int start = 0;
u32 prev_legacy, cur_legacy;
mutex_lock(&kvm->arch.vpit->pit_state.lock);
prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
@ -3471,7 +3481,7 @@ static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
kvm->arch.vpit->pit_state.flags = ps->flags;
kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
return r;
return 0;
}
static int kvm_vm_ioctl_reinject(struct kvm *kvm,
@ -3556,6 +3566,28 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
kvm->arch.disabled_quirks = cap->args[0];
r = 0;
break;
case KVM_CAP_SPLIT_IRQCHIP: {
mutex_lock(&kvm->lock);
r = -EINVAL;
if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS)
goto split_irqchip_unlock;
r = -EEXIST;
if (irqchip_in_kernel(kvm))
goto split_irqchip_unlock;
if (atomic_read(&kvm->online_vcpus))
goto split_irqchip_unlock;
r = kvm_setup_empty_irq_routing(kvm);
if (r)
goto split_irqchip_unlock;
/* Pairs with irqchip_in_kernel. */
smp_wmb();
kvm->arch.irqchip_split = true;
kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
r = 0;
split_irqchip_unlock:
mutex_unlock(&kvm->lock);
break;
}
default:
r = -EINVAL;
break;
@ -3669,7 +3701,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
}
r = -ENXIO;
if (!irqchip_in_kernel(kvm))
if (!irqchip_in_kernel(kvm) || irqchip_split(kvm))
goto get_irqchip_out;
r = kvm_vm_ioctl_get_irqchip(kvm, chip);
if (r)
@ -3693,7 +3725,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
}
r = -ENXIO;
if (!irqchip_in_kernel(kvm))
if (!irqchip_in_kernel(kvm) || irqchip_split(kvm))
goto set_irqchip_out;
r = kvm_vm_ioctl_set_irqchip(kvm, chip);
if (r)
@ -4060,6 +4092,15 @@ static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
}
/*
 * Emulator callback: read @bytes bytes from guest address @addr into @val
 * using kvm_vcpu_read_guest() on the vcpu that owns @ctxt.
 *
 * Returns X86EMUL_IO_NEEDED if the read reports a negative result,
 * X86EMUL_CONTINUE otherwise.
 */
static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt,
		unsigned long addr, void *val, unsigned int bytes)
{
	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);

	if (kvm_vcpu_read_guest(vcpu, addr, val, bytes) < 0)
		return X86EMUL_IO_NEEDED;

	return X86EMUL_CONTINUE;
}
int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
gva_t addr, void *val,
unsigned int bytes,
@ -4795,6 +4836,7 @@ static const struct x86_emulate_ops emulate_ops = {
.write_gpr = emulator_write_gpr,
.read_std = kvm_read_guest_virt_system,
.write_std = kvm_write_guest_virt_system,
.read_phys = kvm_read_guest_phys_system,
.fetch = kvm_fetch_guest_virt,
.read_emulated = emulator_read_emulated,
.write_emulated = emulator_write_emulated,
@ -5667,7 +5709,7 @@ void kvm_arch_exit(void)
int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
{
++vcpu->stat.halt_exits;
if (irqchip_in_kernel(vcpu->kvm)) {
if (lapic_in_kernel(vcpu)) {
vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
return 1;
} else {
@ -5774,9 +5816,15 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
*/
static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
{
return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
vcpu->run->request_interrupt_window &&
kvm_arch_interrupt_allowed(vcpu));
if (!vcpu->run->request_interrupt_window || pic_in_kernel(vcpu->kvm))
return false;
if (kvm_cpu_has_interrupt(vcpu))
return false;
return (irqchip_split(vcpu->kvm)
? kvm_apic_accept_pic_intr(vcpu)
: kvm_arch_interrupt_allowed(vcpu));
}
static void post_kvm_run_save(struct kvm_vcpu *vcpu)
@ -5787,13 +5835,17 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0;
kvm_run->cr8 = kvm_get_cr8(vcpu);
kvm_run->apic_base = kvm_get_apic_base(vcpu);
if (irqchip_in_kernel(vcpu->kvm))
kvm_run->ready_for_interrupt_injection = 1;
else
if (!irqchip_in_kernel(vcpu->kvm))
kvm_run->ready_for_interrupt_injection =
kvm_arch_interrupt_allowed(vcpu) &&
!kvm_cpu_has_interrupt(vcpu) &&
!kvm_event_needs_reinjection(vcpu);
else if (!pic_in_kernel(vcpu->kvm))
kvm_run->ready_for_interrupt_injection =
kvm_apic_accept_pic_intr(vcpu) &&
!kvm_cpu_has_interrupt(vcpu);
else
kvm_run->ready_for_interrupt_injection = 1;
}
static void update_cr8_intercept(struct kvm_vcpu *vcpu)
@ -6144,18 +6196,18 @@ static void process_smi(struct kvm_vcpu *vcpu)
static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
{
u64 eoi_exit_bitmap[4];
u32 tmr[8];
if (!kvm_apic_hw_enabled(vcpu->arch.apic))
return;
memset(eoi_exit_bitmap, 0, 32);
memset(tmr, 0, 32);
memset(vcpu->arch.eoi_exit_bitmap, 0, 256 / 8);
kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap, tmr);
kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
kvm_apic_update_tmr(vcpu, tmr);
if (irqchip_split(vcpu->kvm))
kvm_scan_ioapic_routes(vcpu, vcpu->arch.eoi_exit_bitmap);
else {
kvm_x86_ops->sync_pir_to_irr(vcpu);
kvm_ioapic_scan_entry(vcpu, vcpu->arch.eoi_exit_bitmap);
}
kvm_x86_ops->load_eoi_exitmap(vcpu);
}
static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
@ -6168,7 +6220,7 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
{
struct page *page = NULL;
if (!irqchip_in_kernel(vcpu->kvm))
if (!lapic_in_kernel(vcpu))
return;
if (!kvm_x86_ops->set_apic_access_page_addr)
@ -6206,7 +6258,7 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
int r;
bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
bool req_int_win = !lapic_in_kernel(vcpu) &&
vcpu->run->request_interrupt_window;
bool req_immediate_exit = false;
@ -6258,6 +6310,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_pmu_handle_event(vcpu);
if (kvm_check_request(KVM_REQ_PMI, vcpu))
kvm_pmu_deliver_pmi(vcpu);
if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
if (test_bit(vcpu->arch.pending_ioapic_eoi,
(void *) vcpu->arch.eoi_exit_bitmap)) {
vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI;
vcpu->run->eoi.vector =
vcpu->arch.pending_ioapic_eoi;
r = 0;
goto out;
}
}
if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
vcpu_scan_ioapic(vcpu);
if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
@ -6268,6 +6331,26 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
r = 0;
goto out;
}
if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
r = 0;
goto out;
}
}
/*
* KVM_REQ_EVENT is not set when posted interrupts are set by
* VT-d hardware, so we have to update RVI unconditionally.
*/
if (kvm_lapic_enabled(vcpu)) {
/*
* Update architecture specific hints for APIC
* virtual interrupt delivery.
*/
if (kvm_x86_ops->hwapic_irr_update)
kvm_x86_ops->hwapic_irr_update(vcpu,
kvm_lapic_find_highest_irr(vcpu));
}
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
@ -6286,13 +6369,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_x86_ops->enable_irq_window(vcpu);
if (kvm_lapic_enabled(vcpu)) {
/*
* Update architecture specific hints for APIC
* virtual interrupt delivery.
*/
if (kvm_x86_ops->hwapic_irr_update)
kvm_x86_ops->hwapic_irr_update(vcpu,
kvm_lapic_find_highest_irr(vcpu));
update_cr8_intercept(vcpu);
kvm_lapic_sync_to_vapic(vcpu);
}
@ -6428,10 +6504,15 @@ out:
static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
{
if (!kvm_arch_vcpu_runnable(vcpu)) {
if (!kvm_arch_vcpu_runnable(vcpu) &&
(!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_block(vcpu);
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
if (kvm_x86_ops->post_block)
kvm_x86_ops->post_block(vcpu);
if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
return 1;
}
@ -6468,10 +6549,12 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
for (;;) {
if (kvm_vcpu_running(vcpu))
if (kvm_vcpu_running(vcpu)) {
r = vcpu_enter_guest(vcpu);
else
} else {
r = vcpu_block(kvm, vcpu);
}
if (r <= 0)
break;
@ -6480,8 +6563,8 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
kvm_inject_pending_timer_irqs(vcpu);
if (dm_request_for_irq_injection(vcpu)) {
r = -EINTR;
vcpu->run->exit_reason = KVM_EXIT_INTR;
r = 0;
vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
++vcpu->stat.request_irq_exits;
break;
}
@ -6608,7 +6691,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
/* re-sync apic's tpr */
if (!irqchip_in_kernel(vcpu->kvm)) {
if (!lapic_in_kernel(vcpu)) {
if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
r = -EINVAL;
goto out;
@ -7308,7 +7391,7 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
{
return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
return irqchip_in_kernel(vcpu->kvm) == lapic_in_kernel(vcpu);
}
struct static_key kvm_no_apic_vcpu __read_mostly;
@ -7377,6 +7460,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
kvm_async_pf_hash_reset(vcpu);
kvm_pmu_init(vcpu);
vcpu->arch.pending_external_vector = -1;
return 0;
fail_free_mce_banks:
@ -7402,7 +7487,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
kvm_mmu_destroy(vcpu);
srcu_read_unlock(&vcpu->kvm->srcu, idx);
free_page((unsigned long)vcpu->arch.pio_data);
if (!irqchip_in_kernel(vcpu->kvm))
if (!lapic_in_kernel(vcpu))
static_key_slow_dec(&kvm_no_apic_vcpu);
}
@ -8029,7 +8114,59 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
}
EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
/*
 * Connect an IRQ bypass producer to the irqfd that embeds @cons.
 *
 * When the arch ops provide update_pi_irte, the producer is recorded in
 * the irqfd and update_pi_irte is invoked with set == 1 for the
 * producer's host irq and the irqfd's gsi; its result is returned.
 * Without update_pi_irte nothing is recorded and -EINVAL is returned.
 */
int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
				      struct irq_bypass_producer *prod)
{
	struct kvm_kernel_irqfd *irqfd;

	if (!kvm_x86_ops->update_pi_irte)
		return -EINVAL;

	irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer);
	irqfd->producer = prod;

	return kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq,
					   irqfd->gsi, 1);
}
/*
 * Disconnect an IRQ bypass producer from the irqfd that embeds @cons.
 *
 * If the arch ops do not provide update_pi_irte, no producer should ever
 * have been recorded; warn if one was and do nothing else.  Otherwise
 * clear the recorded producer and call update_pi_irte with set == 0,
 * logging (but not propagating) any failure.
 */
void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
				      struct irq_bypass_producer *prod)
{
	struct kvm_kernel_irqfd *irqfd =
		container_of(cons, struct kvm_kernel_irqfd, consumer);
	int err;

	if (!kvm_x86_ops->update_pi_irte) {
		WARN_ON(irqfd->producer != NULL);
		return;
	}

	WARN_ON(irqfd->producer != prod);
	irqfd->producer = NULL;

	/*
	 * When the producer of a consumer is unregistered, we change back
	 * to remapped mode, so we can re-use the current implementation
	 * when the irq is masked/disabled or the consumer side (KVM in
	 * this case) doesn't want to receive the interrupts.
	 */
	err = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
	if (!err)
		return;

	printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
	       " fails: %d\n", irqfd->consumer.token, err);
}
/*
 * Forward an irqfd routing change to the arch update_pi_irte hook.
 *
 * Returns the hook's result, or -EINVAL when the hook is not provided.
 */
int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
				  uint32_t guest_irq, bool set)
{
	if (kvm_x86_ops->update_pi_irte)
		return kvm_x86_ops->update_pi_irte(kvm, host_irq,
						   guest_irq, set);

	return -EINVAL;
}
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
@ -8044,3 +8181,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);

View File

@ -63,9 +63,6 @@ enum hv_cpuid_function {
/* Define version of the synthetic interrupt controller. */
#define HV_SYNIC_VERSION (1)
/* Define the expected SynIC version. */
#define HV_SYNIC_VERSION_1 (0x1)
/* Define synthetic interrupt controller message constants. */
#define HV_MESSAGE_SIZE (256)
#define HV_MESSAGE_PAYLOAD_BYTE_COUNT (240)
@ -105,8 +102,6 @@ enum hv_message_type {
HVMSG_X64_LEGACY_FP_ERROR = 0x80010005
};
/* Define the number of synthetic interrupt sources. */
#define HV_SYNIC_SINT_COUNT (16)
#define HV_SYNIC_STIMER_COUNT (4)
/* Define invalid partition identifier. */

View File

@ -22,7 +22,7 @@ int irq_remap_broken;
int disable_sourceid_checking;
int no_x2apic_optout;
int disable_irq_post = 1;
int disable_irq_post = 0;
static int disable_irq_remap;
static struct irq_remap_ops *remap_ops;
@ -58,14 +58,18 @@ static __init int setup_irqremap(char *str)
return -EINVAL;
while (*str) {
if (!strncmp(str, "on", 2))
if (!strncmp(str, "on", 2)) {
disable_irq_remap = 0;
else if (!strncmp(str, "off", 3))
disable_irq_post = 0;
} else if (!strncmp(str, "off", 3)) {
disable_irq_remap = 1;
else if (!strncmp(str, "nosid", 5))
disable_irq_post = 1;
} else if (!strncmp(str, "nosid", 5))
disable_sourceid_checking = 1;
else if (!strncmp(str, "no_x2apic_optout", 16))
no_x2apic_optout = 1;
else if (!strncmp(str, "nopost", 6))
disable_irq_post = 1;
str += strcspn(str, ",");
while (*str == ',')

View File

@ -33,3 +33,4 @@ menuconfig VFIO
source "drivers/vfio/pci/Kconfig"
source "drivers/vfio/platform/Kconfig"
source "virt/lib/Kconfig"

View File

@ -2,6 +2,7 @@ config VFIO_PCI
tristate "VFIO support for PCI devices"
depends on VFIO && PCI && EVENTFD
select VFIO_VIRQFD
select IRQ_BYPASS_MANAGER
help
Support for the PCI VFIO bus driver. This is required to make
use of PCI drivers using the VFIO framework.

View File

@ -319,6 +319,7 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev,
if (vdev->ctx[vector].trigger) {
free_irq(irq, vdev->ctx[vector].trigger);
irq_bypass_unregister_producer(&vdev->ctx[vector].producer);
kfree(vdev->ctx[vector].name);
eventfd_ctx_put(vdev->ctx[vector].trigger);
vdev->ctx[vector].trigger = NULL;
@ -360,6 +361,14 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev,
return ret;
}
vdev->ctx[vector].producer.token = trigger;
vdev->ctx[vector].producer.irq = irq;
ret = irq_bypass_register_producer(&vdev->ctx[vector].producer);
if (unlikely(ret))
dev_info(&pdev->dev,
"irq bypass producer (token %p) registration fails: %d\n",
vdev->ctx[vector].producer.token, ret);
vdev->ctx[vector].trigger = trigger;
return 0;

View File

@ -13,6 +13,7 @@
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/irqbypass.h>
#ifndef VFIO_PCI_PRIVATE_H
#define VFIO_PCI_PRIVATE_H
@ -29,6 +30,7 @@ struct vfio_pci_irq_ctx {
struct virqfd *mask;
char *name;
bool masked;
struct irq_bypass_producer producer;
};
struct vfio_pci_device {

View File

@ -51,7 +51,7 @@ struct arch_timer_cpu {
bool armed;
/* Timer IRQ */
const struct kvm_irq_level *irq;
struct kvm_irq_level irq;
/* VGIC mapping */
struct irq_phys_map *map;
@ -71,5 +71,7 @@ u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid);
int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value);
bool kvm_timer_should_fire(struct kvm_vcpu *vcpu);
void kvm_timer_schedule(struct kvm_vcpu *vcpu);
void kvm_timer_unschedule(struct kvm_vcpu *vcpu);
#endif

View File

@ -112,7 +112,6 @@ struct vgic_vmcr {
struct vgic_ops {
struct vgic_lr (*get_lr)(const struct kvm_vcpu *, int);
void (*set_lr)(struct kvm_vcpu *, int, struct vgic_lr);
void (*sync_lr_elrsr)(struct kvm_vcpu *, int, struct vgic_lr);
u64 (*get_elrsr)(const struct kvm_vcpu *vcpu);
u64 (*get_eisr)(const struct kvm_vcpu *vcpu);
void (*clear_eisr)(struct kvm_vcpu *vcpu);
@ -159,7 +158,6 @@ struct irq_phys_map {
u32 virt_irq;
u32 phys_irq;
u32 irq;
bool active;
};
struct irq_phys_map_entry {
@ -296,22 +294,16 @@ struct vgic_v3_cpu_if {
};
struct vgic_cpu {
/* per IRQ to LR mapping */
u8 *vgic_irq_lr_map;
/* Pending/active/both interrupts on this VCPU */
DECLARE_BITMAP( pending_percpu, VGIC_NR_PRIVATE_IRQS);
DECLARE_BITMAP( active_percpu, VGIC_NR_PRIVATE_IRQS);
DECLARE_BITMAP( pend_act_percpu, VGIC_NR_PRIVATE_IRQS);
DECLARE_BITMAP(pending_percpu, VGIC_NR_PRIVATE_IRQS);
DECLARE_BITMAP(active_percpu, VGIC_NR_PRIVATE_IRQS);
DECLARE_BITMAP(pend_act_percpu, VGIC_NR_PRIVATE_IRQS);
/* Pending/active/both shared interrupts, dynamically sized */
unsigned long *pending_shared;
unsigned long *active_shared;
unsigned long *pend_act_shared;
/* Bitmap of used/free list registers */
DECLARE_BITMAP( lr_used, VGIC_V2_MAX_LRS);
/* Number of list registers on this CPU */
int nr_lr;
@ -354,8 +346,6 @@ int kvm_vgic_vcpu_active_irq(struct kvm_vcpu *vcpu);
struct irq_phys_map *kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu,
int virt_irq, int irq);
int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, struct irq_phys_map *map);
bool kvm_vgic_get_phys_irq_active(struct irq_phys_map *map);
void kvm_vgic_set_phys_irq_active(struct irq_phys_map *map, bool active);
#define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel))
#define vgic_initialized(k) (!!((k)->arch.vgic.nr_cpus))

View File

@ -26,6 +26,7 @@
#define _HYPERV_H
#include <uapi/linux/hyperv.h>
#include <uapi/asm/hyperv.h>
#include <linux/types.h>
#include <linux/scatterlist.h>

90
include/linux/irqbypass.h Normal file
View File

@ -0,0 +1,90 @@
/*
* IRQ offload/bypass manager
*
* Copyright (C) 2015 Red Hat, Inc.
* Copyright (c) 2015 Linaro Ltd.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#ifndef IRQBYPASS_H
#define IRQBYPASS_H
#include <linux/list.h>
struct irq_bypass_consumer;
/*
* Theory of operation
*
* The IRQ bypass manager is a simple set of lists and callbacks that allows
* IRQ producers (ex. physical interrupt sources) to be matched to IRQ
* consumers (ex. virtualization hardware that allows IRQ bypass or offload)
* via a shared token (ex. eventfd_ctx). Producers and consumers register
* independently. When a token match is found, the optional @stop callback
* will be called for each participant. The pair will then be connected via
* the @add_* callbacks, and finally the optional @start callback will allow
* any final coordination. When either participant is unregistered, the
* process is repeated using the @del_* callbacks in place of the @add_*
* callbacks. Match tokens must be unique per producer/consumer, 1:N pairings
* are not supported.
*/
/**
* struct irq_bypass_producer - IRQ bypass producer definition
* @node: IRQ bypass manager private list management
* @token: opaque token to match between producer and consumer
* @irq: Linux IRQ number for the producer device
* @add_consumer: Connect the IRQ producer to an IRQ consumer (optional)
* @del_consumer: Disconnect the IRQ producer from an IRQ consumer (optional)
* @stop: Perform any quiesce operations necessary prior to add/del (optional)
* @start: Perform any startup operations necessary after add/del (optional)
*
* The IRQ bypass producer structure represents an interrupt source for
* participation in possible host bypass, for instance an interrupt vector
* for a physical device assigned to a VM.
*/
struct irq_bypass_producer {
	struct list_head node;	/* IRQ bypass manager private list management */
	void *token;		/* opaque match key shared with the consumer */
	int irq;		/* Linux IRQ number for the producer device */

	/* Optional hooks to connect to / disconnect from a consumer. */
	int (*add_consumer)(struct irq_bypass_producer *prod,
			    struct irq_bypass_consumer *cons);
	void (*del_consumer)(struct irq_bypass_producer *prod,
			     struct irq_bypass_consumer *cons);

	/* Optional quiesce (before add/del) and startup (after add/del) hooks. */
	void (*stop)(struct irq_bypass_producer *prod);
	void (*start)(struct irq_bypass_producer *prod);
};
/**
* struct irq_bypass_consumer - IRQ bypass consumer definition
* @node: IRQ bypass manager private list management
* @token: opaque token to match between producer and consumer
* @add_producer: Connect the IRQ consumer to an IRQ producer
* @del_producer: Disconnect the IRQ consumer from an IRQ producer
* @stop: Perform any quiesce operations necessary prior to add/del (optional)
* @start: Perform any startup operations necessary after add/del (optional)
*
* The IRQ bypass consumer structure represents an interrupt sink for
* participation in possible host bypass, for instance a hypervisor may
* support offloads to allow bypassing the host entirely or offload
* portions of the interrupt handling to the VM.
*/
struct irq_bypass_consumer {
	struct list_head node;	/* IRQ bypass manager private list management */
	void *token;		/* opaque match key shared with the producer */

	/* Hooks to connect to / disconnect from a producer. */
	int (*add_producer)(struct irq_bypass_consumer *cons,
			    struct irq_bypass_producer *prod);
	void (*del_producer)(struct irq_bypass_consumer *cons,
			     struct irq_bypass_producer *prod);

	/* Optional quiesce (before add/del) and startup (after add/del) hooks. */
	void (*stop)(struct irq_bypass_consumer *cons);
	void (*start)(struct irq_bypass_consumer *cons);
};
int irq_bypass_register_producer(struct irq_bypass_producer *);
void irq_bypass_unregister_producer(struct irq_bypass_producer *);
int irq_bypass_register_consumer(struct irq_bypass_consumer *);
void irq_bypass_unregister_consumer(struct irq_bypass_consumer *);
#endif /* IRQBYPASS_H */

View File

@ -24,6 +24,7 @@
#include <linux/err.h>
#include <linux/irqflags.h>
#include <linux/context_tracking.h>
#include <linux/irqbypass.h>
#include <asm/signal.h>
#include <linux/kvm.h>
@ -140,6 +141,8 @@ static inline bool is_error_page(struct page *page)
#define KVM_REQ_APIC_PAGE_RELOAD 25
#define KVM_REQ_SMI 26
#define KVM_REQ_HV_CRASH 27
#define KVM_REQ_IOAPIC_EOI_EXIT 28
#define KVM_REQ_HV_RESET 29
#define KVM_USERSPACE_IRQ_SOURCE_ID 0
#define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1
@ -231,6 +234,9 @@ struct kvm_vcpu {
unsigned long requests;
unsigned long guest_debug;
int pre_pcpu;
struct list_head blocked_vcpu_list;
struct mutex mutex;
struct kvm_run *run;
@ -329,6 +335,18 @@ struct kvm_kernel_irq_routing_entry {
struct hlist_node link;
};
#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
/*
 * IRQ routing table.
 *
 * The structure ends in a variable-length array, so it must be allocated
 * with room for the trailing entries (presumably nr_rt_entries of them --
 * TODO confirm against the allocation site).
 */
struct kvm_irq_routing_table {
	/* NOTE(review): looks like a per-(irqchip, pin) lookup -- confirm. */
	int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS];
	u32 nr_rt_entries;
	/*
	 * Array indexed by gsi. Each entry contains list of irq chips
	 * the gsi is connected to.
	 *
	 * Declared as a C99 flexible array member rather than a zero-length
	 * array; the layout and sizeof() of the structure are unchanged.
	 */
	struct hlist_head map[];
};
#endif
#ifndef KVM_PRIVATE_MEM_SLOTS
#define KVM_PRIVATE_MEM_SLOTS 0
#endif
@ -455,10 +473,14 @@ void vcpu_put(struct kvm_vcpu *vcpu);
#ifdef __KVM_HAVE_IOAPIC
void kvm_vcpu_request_scan_ioapic(struct kvm *kvm);
void kvm_arch_irq_routing_update(struct kvm *kvm);
#else
static inline void kvm_vcpu_request_scan_ioapic(struct kvm *kvm)
{
}
static inline void kvm_arch_irq_routing_update(struct kvm *kvm)
{
}
#endif
#ifdef CONFIG_HAVE_KVM_IRQFD
@ -625,6 +647,8 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn);
void kvm_vcpu_block(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu);
void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
int kvm_vcpu_yield_to(struct kvm_vcpu *target);
void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu);
@ -803,10 +827,13 @@ int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin);
int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
bool line_status);
int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level);
int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
int irq_source_id, int level, bool line_status);
int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id,
int level, bool line_status);
bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin);
void kvm_notify_acked_gsi(struct kvm *kvm, int gsi);
void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
void kvm_register_irq_ack_notifier(struct kvm *kvm,
struct kvm_irq_ack_notifier *kian);
@ -1002,6 +1029,7 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
#endif
int kvm_setup_default_irq_routing(struct kvm *kvm);
int kvm_setup_empty_irq_routing(struct kvm *kvm);
int kvm_set_irq_routing(struct kvm *kvm,
const struct kvm_irq_routing_entry *entries,
unsigned nr,
@ -1144,5 +1172,15 @@ static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
{
}
#endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
#endif
#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
/* Arch hooks with the same signatures as the irq_bypass_consumer callbacks. */
int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
				     struct irq_bypass_producer *prod);
void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
				      struct irq_bypass_producer *prod);
void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons);
void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons);

int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
				  uint32_t guest_irq, bool set);
#endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */
#endif

71
include/linux/kvm_irqfd.h Normal file
View File

@ -0,0 +1,71 @@
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* irqfd: Allows an fd to be used to inject an interrupt to the guest
* Credit goes to Avi Kivity for the original idea.
*/
#ifndef __LINUX_KVM_IRQFD_H
#define __LINUX_KVM_IRQFD_H
#include <linux/kvm_host.h>
#include <linux/poll.h>
/*
* Resampling irqfds are a special variety of irqfds used to emulate
* level triggered interrupts. The interrupt is asserted on eventfd
* trigger. On acknowledgment through the irq ack notifier, the
* interrupt is de-asserted and userspace is notified through the
* resamplefd. All resamplers on the same gsi are de-asserted
* together, so we don't need to track the state of each individual
* user. We can also therefore share the same irq source ID.
*/
struct kvm_kernel_irqfd_resampler {
struct kvm *kvm;
/*
* List of resampling struct kvm_kernel_irqfd objects sharing this gsi
* (linked through their resampler_link member).
* RCU list modified under kvm->irqfds.resampler_lock
*/
struct list_head list;
/* Ack notifier through which the acknowledgment described above arrives */
struct kvm_irq_ack_notifier notifier;
/*
* Entry in list of kvm->irqfds.resampler_list. Use for sharing
* resamplers among irqfds on the same gsi.
* Accessed and modified under kvm->irqfds.resampler_lock
*/
struct list_head link;
};
/*
* Kernel-side state of one irqfd, i.e. an fd that can be used to inject
* an interrupt to the guest.
*/
struct kvm_kernel_irqfd {
/* Used for MSI fast-path */
struct kvm *kvm;
wait_queue_t wait;
/* Update side is protected by irqfds.lock */
struct kvm_kernel_irq_routing_entry irq_entry;
seqcount_t irq_entry_sc;
/* Used for level IRQ fast-path */
int gsi;
struct work_struct inject;
/* The resampler used by this irqfd (resampler-only) */
struct kvm_kernel_irqfd_resampler *resampler;
/* Eventfd notified on resample (resampler-only) */
struct eventfd_ctx *resamplefd;
/* Entry in list of irqfds for a resampler (resampler-only) */
struct list_head resampler_link;
/* Used for setup/shutdown */
struct eventfd_ctx *eventfd;
struct list_head list;
poll_table pt;
struct work_struct shutdown;
/*
* IRQ bypass state: the consumer embedded in this irqfd and a pointer
* to a producer.
* NOTE(review): presumably the producer currently connected to this
* consumer -- confirm against the irqfd implementation.
*/
struct irq_bypass_consumer consumer;
struct irq_bypass_producer *producer;
};
#endif /* __LINUX_KVM_IRQFD_H */

View File

@ -183,6 +183,7 @@ struct kvm_s390_skeys {
#define KVM_EXIT_EPR 23
#define KVM_EXIT_SYSTEM_EVENT 24
#define KVM_EXIT_S390_STSI 25
#define KVM_EXIT_IOAPIC_EOI 26
/* For KVM_EXIT_INTERNAL_ERROR */
/* Emulate instruction failed. */
@ -333,6 +334,10 @@ struct kvm_run {
__u8 sel1;
__u16 sel2;
} s390_stsi;
/* KVM_EXIT_IOAPIC_EOI */
struct {
__u8 vector;
} eoi;
/* Fix the size of the union. */
char padding[256];
};
@ -824,6 +829,8 @@ struct kvm_ppc_smmu_info {
#define KVM_CAP_MULTI_ADDRESS_SPACE 118
#define KVM_CAP_GUEST_DEBUG_HW_BPS 119
#define KVM_CAP_GUEST_DEBUG_HW_WPS 120
#define KVM_CAP_SPLIT_IRQCHIP 121
#define KVM_CAP_IOEVENTFD_ANY_LENGTH 122
#ifdef KVM_CAP_IRQ_ROUTING

View File

@ -444,6 +444,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
*ut = p->utime;
*st = p->stime;
}
EXPORT_SYMBOL_GPL(task_cputime_adjusted);
void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
@ -652,6 +653,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
task_cputime(p, &cputime.utime, &cputime.stime);
cputime_adjust(&cputime, &p->prev_cputime, ut, st);
}
EXPORT_SYMBOL_GPL(task_cputime_adjusted);
void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{

1
virt/Makefile Normal file
View File

@ -0,0 +1 @@
obj-y += lib/

View File

@ -46,4 +46,7 @@ config KVM_GENERIC_DIRTYLOG_READ_PROTECT
config KVM_COMPAT
def_bool y
depends on COMPAT && !S390
depends on KVM && COMPAT && !S390
config HAVE_KVM_IRQ_BYPASS
bool

View File

@ -28,6 +28,8 @@
#include <kvm/arm_vgic.h>
#include <kvm/arm_arch_timer.h>
#include "trace.h"
static struct timecounter *timecounter;
static struct workqueue_struct *wqueue;
static unsigned int host_vtimer_irq;
@ -59,18 +61,6 @@ static void timer_disarm(struct arch_timer_cpu *timer)
}
}
/*
 * Mark the timer's mapped interrupt as logically active, then inject it
 * into the vgic at the level stored in timer->irq.  A failing injection
 * only triggers a warning.
 */
static void kvm_timer_inject_irq(struct kvm_vcpu *vcpu)
{
	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
	int err;

	kvm_vgic_set_phys_irq_active(timer->map, true);

	err = kvm_vgic_inject_mapped_irq(vcpu->kvm, vcpu->vcpu_id,
					 timer->map, timer->irq->level);
	WARN_ON(err);
}
static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
{
struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id;
@ -111,14 +101,20 @@ static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt)
return HRTIMER_NORESTART;
}
static bool kvm_timer_irq_can_fire(struct kvm_vcpu *vcpu)
{
struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
return !(timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) &&
(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE);
}
bool kvm_timer_should_fire(struct kvm_vcpu *vcpu)
{
struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
cycle_t cval, now;
if ((timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) ||
!(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE) ||
kvm_vgic_get_phys_irq_active(timer->map))
if (!kvm_timer_irq_can_fire(vcpu))
return false;
cval = timer->cntv_cval;
@ -127,12 +123,94 @@ bool kvm_timer_should_fire(struct kvm_vcpu *vcpu)
return cval <= now;
}
/*
 * Record @new_level as the timer's line level, emit the matching trace
 * event and forward the level to the vgic for the timer's mapped interrupt.
 *
 * Must only be called once the vgic is initialized (BUG otherwise).  A
 * failing injection only triggers a warning.
 */
static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level)
{
	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
	int err;

	BUG_ON(!vgic_initialized(vcpu->kvm));

	timer->irq.level = new_level;

	trace_kvm_timer_update_irq(vcpu->vcpu_id, timer->map->virt_irq,
				   timer->irq.level);

	err = kvm_vgic_inject_mapped_irq(vcpu->kvm, vcpu->vcpu_id,
					 timer->map, timer->irq.level);
	WARN_ON(err);
}
/*
 * Compare what the timer wants (should it fire?) with the line level we
 * last recorded, and flip the level towards the GIC when the two differ.
 */
static void kvm_timer_update_state(struct kvm_vcpu *vcpu)
{
	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;

	/*
	 * Userspace may have written the timer registers via SET_ONE_REG
	 * before the vgic was initialized.  Setting timer->irq.level in that
	 * case would mean the guest never sees the interrupt, so do nothing
	 * and let the call from kvm_timer_flush_hwstate pick the state up
	 * later.
	 */
	if (vgic_initialized(vcpu->kvm) &&
	    kvm_timer_should_fire(vcpu) != timer->irq.level)
		kvm_timer_update_irq(vcpu, !timer->irq.level);
}
/*
 * Arm the background timer ahead of kvm_vcpu_block, so that the blocked
 * thread is taken off its waitqueue and made runnable once there is a
 * timer interrupt to handle.  The background timer must not already be
 * armed (BUG otherwise).
 */
void kvm_timer_schedule(struct kvm_vcpu *vcpu)
{
	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
	cycle_t expiry, guest_now;
	u64 delta_ns;

	BUG_ON(timer_is_armed(timer));

	/*
	 * Nothing to arm in two cases:
	 *  - the guest timer has already expired, because kvm_vcpu_block
	 *    will return before putting the thread to sleep;
	 *  - the timer cannot raise interrupts (disabled or masked).
	 */
	if (kvm_timer_should_fire(vcpu) || !kvm_timer_irq_can_fire(vcpu))
		return;

	/* Not expired yet: sleep for the time left until the compare value. */
	expiry = timer->cntv_cval;
	guest_now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff;

	delta_ns = cyclecounter_cyc2ns(timecounter->cc, expiry - guest_now,
				       timecounter->mask, &timecounter->frac);
	timer_arm(timer, delta_ns);
}
/* Disarm the vcpu's background timer (see kvm_timer_schedule). */
void kvm_timer_unschedule(struct kvm_vcpu *vcpu)
{
	timer_disarm(&vcpu->arch.timer_cpu);
}
/**
* kvm_timer_flush_hwstate - prepare to move the virt timer to the cpu
* @vcpu: The vcpu pointer
*
* Disarm any pending soft timers, since the world-switch code will write the
* virtual timer state back to the physical CPU.
* Check if the virtual timer has expired while we were running in the host,
* and inject an interrupt if that was the case.
*/
void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
{
@ -140,28 +218,20 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
bool phys_active;
int ret;
/*
* We're about to run this vcpu again, so there is no need to
* keep the background timer running, as we're about to
* populate the CPU timer again.
*/
timer_disarm(timer);
kvm_timer_update_state(vcpu);
/*
* If the timer expired while we were not scheduled, now is the time
* to inject it.
* If we enter the guest with the virtual input level to the VGIC
* asserted, then we have already told the VGIC what we need to, and
* we don't need to exit from the guest until the guest deactivates
* the already injected interrupt, so therefore we should set the
* hardware active state to prevent unnecessary exits from the guest.
*
* Conversely, if the virtual input level is deasserted, then always
* clear the hardware active state to ensure that hardware interrupts
* from the timer trigger a guest exit.
*/
if (kvm_timer_should_fire(vcpu))
kvm_timer_inject_irq(vcpu);
/*
* We keep track of whether the edge-triggered interrupt has been
* signalled to the vgic/guest, and if so, we mask the interrupt and
* the physical distributor to prevent the timer from raising a
* physical interrupt whenever we run a guest, preventing forward
* VCPU progress.
*/
if (kvm_vgic_get_phys_irq_active(timer->map))
if (timer->irq.level)
phys_active = true;
else
phys_active = false;
@ -176,32 +246,20 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
* kvm_timer_sync_hwstate - sync timer state from cpu
* @vcpu: The vcpu pointer
*
* Check if the virtual timer was armed and either schedule a corresponding
* soft timer or inject directly if already expired.
* Check if the virtual timer has expired while we were running in the guest,
* and inject an interrupt if that was the case.
*/
void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
{
struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
cycle_t cval, now;
u64 ns;
BUG_ON(timer_is_armed(timer));
if (kvm_timer_should_fire(vcpu)) {
/*
* Timer has already expired while we were not
* looking. Inject the interrupt and carry on.
*/
kvm_timer_inject_irq(vcpu);
return;
}
cval = timer->cntv_cval;
now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff;
ns = cyclecounter_cyc2ns(timecounter->cc, cval - now, timecounter->mask,
&timecounter->frac);
timer_arm(timer, ns);
/*
* The guest could have modified the timer registers or the timer
* could have expired, update the timer state.
*/
kvm_timer_update_state(vcpu);
}
int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
@ -216,7 +274,7 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
* kvm_vcpu_set_target(). To handle this, we determine
* vcpu timer irq number when the vcpu is reset.
*/
timer->irq = irq;
timer->irq.irq = irq->irq;
/*
* The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8
@ -225,6 +283,7 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
* the ARMv7 architecture.
*/
timer->cntv_ctl = 0;
kvm_timer_update_state(vcpu);
/*
* Tell the VGIC that the virtual interrupt is tied to a
@ -269,6 +328,8 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
default:
return -1;
}
kvm_timer_update_state(vcpu);
return 0;
}

63
virt/kvm/arm/trace.h Normal file
View File

@ -0,0 +1,63 @@
#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_KVM_H
#include <linux/tracepoint.h>
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kvm
/*
 * Tracepoints for vgic
 */

/*
 * vgic_update_irq_pending: records the vcpu id, interrupt number and level
 * it is given.  vcpu_id and irq are unsigned, so they are printed with
 * unsigned conversions.
 */
TRACE_EVENT(vgic_update_irq_pending,
	TP_PROTO(unsigned long vcpu_id, __u32 irq, bool level),
	TP_ARGS(vcpu_id, irq, level),

	TP_STRUCT__entry(
		__field(	unsigned long,	vcpu_id	)
		__field(	__u32,		irq	)
		__field(	bool,		level	)
	),

	TP_fast_assign(
		__entry->vcpu_id	= vcpu_id;
		__entry->irq		= irq;
		__entry->level		= level;
	),

	TP_printk("VCPU: %lu, IRQ %u, level: %d",
		  __entry->vcpu_id, __entry->irq, __entry->level)
);
/*
 * Tracepoints for arch_timer
 */

/*
 * kvm_timer_update_irq: records the vcpu id, interrupt number and line
 * level it is given.  vcpu_id and irq are unsigned, so they are printed
 * with unsigned conversions.
 */
TRACE_EVENT(kvm_timer_update_irq,
	TP_PROTO(unsigned long vcpu_id, __u32 irq, int level),
	TP_ARGS(vcpu_id, irq, level),

	TP_STRUCT__entry(
		__field(	unsigned long,	vcpu_id	)
		__field(	__u32,		irq	)
		__field(	int,		level	)
	),

	TP_fast_assign(
		__entry->vcpu_id	= vcpu_id;
		__entry->irq		= irq;
		__entry->level		= level;
	),

	TP_printk("VCPU: %lu, IRQ %u, level %d",
		  __entry->vcpu_id, __entry->irq, __entry->level)
);
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH ../../../virt/kvm/arm
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE trace
/* This part must be outside protection */
#include <trace/define_trace.h>

View File

@ -79,11 +79,7 @@ static void vgic_v2_set_lr(struct kvm_vcpu *vcpu, int lr,
lr_val |= (lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT);
vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = lr_val;
}
static void vgic_v2_sync_lr_elrsr(struct kvm_vcpu *vcpu, int lr,
struct vgic_lr lr_desc)
{
if (!(lr_desc.state & LR_STATE_MASK))
vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr |= (1ULL << lr);
else
@ -158,6 +154,7 @@ static void vgic_v2_enable(struct kvm_vcpu *vcpu)
* anyway.
*/
vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0;
vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr = ~0;
/* Get the show on the road... */
vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN;
@ -166,7 +163,6 @@ static void vgic_v2_enable(struct kvm_vcpu *vcpu)
static const struct vgic_ops vgic_v2_ops = {
.get_lr = vgic_v2_get_lr,
.set_lr = vgic_v2_set_lr,
.sync_lr_elrsr = vgic_v2_sync_lr_elrsr,
.get_elrsr = vgic_v2_get_elrsr,
.get_eisr = vgic_v2_get_eisr,
.clear_eisr = vgic_v2_clear_eisr,

View File

@ -112,11 +112,7 @@ static void vgic_v3_set_lr(struct kvm_vcpu *vcpu, int lr,
}
vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[LR_INDEX(lr)] = lr_val;
}
static void vgic_v3_sync_lr_elrsr(struct kvm_vcpu *vcpu, int lr,
struct vgic_lr lr_desc)
{
if (!(lr_desc.state & LR_STATE_MASK))
vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr |= (1U << lr);
else
@ -193,6 +189,7 @@ static void vgic_v3_enable(struct kvm_vcpu *vcpu)
* anyway.
*/
vgic_v3->vgic_vmcr = 0;
vgic_v3->vgic_elrsr = ~0;
/*
* If we are emulating a GICv3, we do it in an non-GICv2-compatible
@ -211,7 +208,6 @@ static void vgic_v3_enable(struct kvm_vcpu *vcpu)
static const struct vgic_ops vgic_v3_ops = {
.get_lr = vgic_v3_get_lr,
.set_lr = vgic_v3_set_lr,
.sync_lr_elrsr = vgic_v3_sync_lr_elrsr,
.get_elrsr = vgic_v3_get_elrsr,
.get_eisr = vgic_v3_get_eisr,
.clear_eisr = vgic_v3_clear_eisr,

View File

@ -34,6 +34,9 @@
#include <asm/kvm.h>
#include <kvm/iodev.h>
#define CREATE_TRACE_POINTS
#include "trace.h"
/*
* How the whole thing works (courtesy of Christoffer Dall):
*
@ -102,11 +105,13 @@
#include "vgic.h"
static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu);
static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu);
static void vgic_retire_lr(int lr_nr, struct kvm_vcpu *vcpu);
static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr);
static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, struct vgic_lr lr_desc);
static u64 vgic_get_elrsr(struct kvm_vcpu *vcpu);
static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu,
int virt_irq);
static int compute_pending_for_cpu(struct kvm_vcpu *vcpu);
static const struct vgic_ops *vgic_ops;
static const struct vgic_params *vgic;
@ -357,6 +362,11 @@ static void vgic_dist_irq_clear_soft_pend(struct kvm_vcpu *vcpu, int irq)
struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
vgic_bitmap_set_irq_val(&dist->irq_soft_pend, vcpu->vcpu_id, irq, 0);
if (!vgic_dist_irq_get_level(vcpu, irq)) {
vgic_dist_irq_clear_pending(vcpu, irq);
if (!compute_pending_for_cpu(vcpu))
clear_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu);
}
}
static int vgic_dist_irq_is_pending(struct kvm_vcpu *vcpu, int irq)
@ -531,34 +541,6 @@ bool vgic_handle_set_pending_reg(struct kvm *kvm,
return false;
}
/*
* If a mapped interrupt's state has been modified by the guest such that it
* is no longer active or pending, without it have gone through the sync path,
* then the map->active field must be cleared so the interrupt can be taken
* again.
*/
static void vgic_handle_clear_mapped_irq(struct kvm_vcpu *vcpu)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
struct list_head *root;
struct irq_phys_map_entry *entry;
struct irq_phys_map *map;
rcu_read_lock();
/* Check for PPIs */
root = &vgic_cpu->irq_phys_map_list;
list_for_each_entry_rcu(entry, root, entry) {
map = &entry->map;
if (!vgic_dist_irq_is_pending(vcpu, map->virt_irq) &&
!vgic_irq_is_active(vcpu, map->virt_irq))
map->active = false;
}
rcu_read_unlock();
}
bool vgic_handle_clear_pending_reg(struct kvm *kvm,
struct kvm_exit_mmio *mmio,
phys_addr_t offset, int vcpu_id)
@ -589,7 +571,6 @@ bool vgic_handle_clear_pending_reg(struct kvm *kvm,
vcpu_id, offset);
vgic_reg_access(mmio, reg, offset, mode);
vgic_handle_clear_mapped_irq(kvm_get_vcpu(kvm, vcpu_id));
vgic_update_state(kvm);
return true;
}
@ -627,7 +608,6 @@ bool vgic_handle_clear_active_reg(struct kvm *kvm,
ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT);
if (mmio->is_write) {
vgic_handle_clear_mapped_irq(kvm_get_vcpu(kvm, vcpu_id));
vgic_update_state(kvm);
return true;
}
@ -684,10 +664,9 @@ bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio,
vgic_reg_access(mmio, &val, offset,
ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
if (mmio->is_write) {
if (offset < 8) {
*reg = ~0U; /* Force PPIs/SGIs to 1 */
/* Ignore writes to read-only SGI and PPI bits */
if (offset < 8)
return false;
}
val = vgic_cfg_compress(val);
if (offset & 4) {
@ -713,9 +692,11 @@ bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio,
void vgic_unqueue_irqs(struct kvm_vcpu *vcpu)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
u64 elrsr = vgic_get_elrsr(vcpu);
unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr);
int i;
for_each_set_bit(i, vgic_cpu->lr_used, vgic_cpu->nr_lr) {
for_each_clear_bit(i, elrsr_ptr, vgic_cpu->nr_lr) {
struct vgic_lr lr = vgic_get_lr(vcpu, i);
/*
@ -736,30 +717,14 @@ void vgic_unqueue_irqs(struct kvm_vcpu *vcpu)
* interrupt then move the active state to the
* distributor tracking bit.
*/
if (lr.state & LR_STATE_ACTIVE) {
if (lr.state & LR_STATE_ACTIVE)
vgic_irq_set_active(vcpu, lr.irq);
lr.state &= ~LR_STATE_ACTIVE;
}
/*
* Reestablish the pending state on the distributor and the
* CPU interface. It may have already been pending, but that
* is fine, then we are only setting a few bits that were
* already set.
* CPU interface and mark the LR as free for other use.
*/
if (lr.state & LR_STATE_PENDING) {
vgic_dist_irq_set_pending(vcpu, lr.irq);
lr.state &= ~LR_STATE_PENDING;
}
vgic_set_lr(vcpu, i, lr);
/*
* Mark the LR as free for other use.
*/
BUG_ON(lr.state & LR_STATE_MASK);
vgic_retire_lr(i, lr.irq, vcpu);
vgic_irq_clear_queued(vcpu, lr.irq);
vgic_retire_lr(i, vcpu);
/* Finally update the VGIC state. */
vgic_update_state(vcpu->kvm);
@ -1067,12 +1032,6 @@ static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr,
vgic_ops->set_lr(vcpu, lr, vlr);
}
static void vgic_sync_lr_elrsr(struct kvm_vcpu *vcpu, int lr,
struct vgic_lr vlr)
{
vgic_ops->sync_lr_elrsr(vcpu, lr, vlr);
}
static inline u64 vgic_get_elrsr(struct kvm_vcpu *vcpu)
{
return vgic_ops->get_elrsr(vcpu);
@ -1118,25 +1077,23 @@ static inline void vgic_enable(struct kvm_vcpu *vcpu)
vgic_ops->enable(vcpu);
}
static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu)
static void vgic_retire_lr(int lr_nr, struct kvm_vcpu *vcpu)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
struct vgic_lr vlr = vgic_get_lr(vcpu, lr_nr);
vgic_irq_clear_queued(vcpu, vlr.irq);
/*
* We must transfer the pending state back to the distributor before
* retiring the LR, otherwise we may lose edge-triggered interrupts.
*/
if (vlr.state & LR_STATE_PENDING) {
vgic_dist_irq_set_pending(vcpu, irq);
vgic_dist_irq_set_pending(vcpu, vlr.irq);
vlr.hwirq = 0;
}
vlr.state = 0;
vgic_set_lr(vcpu, lr_nr, vlr);
clear_bit(lr_nr, vgic_cpu->lr_used);
vgic_cpu->vgic_irq_lr_map[irq] = LR_EMPTY;
vgic_sync_lr_elrsr(vcpu, lr_nr, vlr);
}
/*
@ -1150,17 +1107,15 @@ static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu)
*/
static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
u64 elrsr = vgic_get_elrsr(vcpu);
unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr);
int lr;
for_each_set_bit(lr, vgic_cpu->lr_used, vgic->nr_lr) {
for_each_clear_bit(lr, elrsr_ptr, vgic->nr_lr) {
struct vgic_lr vlr = vgic_get_lr(vcpu, lr);
if (!vgic_irq_is_enabled(vcpu, vlr.irq)) {
vgic_retire_lr(lr, vlr.irq, vcpu);
if (vgic_irq_is_queued(vcpu, vlr.irq))
vgic_irq_clear_queued(vcpu, vlr.irq);
}
if (!vgic_irq_is_enabled(vcpu, vlr.irq))
vgic_retire_lr(lr, vcpu);
}
}
@ -1200,7 +1155,6 @@ static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq,
}
vgic_set_lr(vcpu, lr_nr, vlr);
vgic_sync_lr_elrsr(vcpu, lr_nr, vlr);
}
/*
@ -1210,8 +1164,9 @@ static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq,
*/
bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
u64 elrsr = vgic_get_elrsr(vcpu);
unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr);
struct vgic_lr vlr;
int lr;
@ -1222,28 +1177,22 @@ bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq)
kvm_debug("Queue IRQ%d\n", irq);
lr = vgic_cpu->vgic_irq_lr_map[irq];
/* Do we have an active interrupt for the same CPUID? */
if (lr != LR_EMPTY) {
for_each_clear_bit(lr, elrsr_ptr, vgic->nr_lr) {
vlr = vgic_get_lr(vcpu, lr);
if (vlr.source == sgi_source_id) {
if (vlr.irq == irq && vlr.source == sgi_source_id) {
kvm_debug("LR%d piggyback for IRQ%d\n", lr, vlr.irq);
BUG_ON(!test_bit(lr, vgic_cpu->lr_used));
vgic_queue_irq_to_lr(vcpu, irq, lr, vlr);
return true;
}
}
/* Try to use another LR for this interrupt */
lr = find_first_zero_bit((unsigned long *)vgic_cpu->lr_used,
vgic->nr_lr);
lr = find_first_bit(elrsr_ptr, vgic->nr_lr);
if (lr >= vgic->nr_lr)
return false;
kvm_debug("LR%d allocated for IRQ%d %x\n", lr, irq, sgi_source_id);
vgic_cpu->vgic_irq_lr_map[irq] = lr;
set_bit(lr, vgic_cpu->lr_used);
vlr.irq = irq;
vlr.source = sgi_source_id;
@ -1338,12 +1287,60 @@ epilog:
}
}
/*
 * Common handling for an interrupt whose list register is being retired:
 * clear its soft-pending state, let it be sampled again, work out whether
 * it is still pending and finally write back an empty LR.
 *
 * Returns non-zero if the interrupt is still pending.  The callers visible
 * in this file invoke it with dist->lock held.
 */
static int process_queued_irq(struct kvm_vcpu *vcpu,
			      int lr, struct vgic_lr vlr)
{
	int still_pending = 0;

	/*
	 * Whether the IRQ was EOIed (call from vgic_process_maintenance) or
	 * went from active to non-active (call from vgic_sync_hwirq), it was
	 * also ACKed, so we assume the soft pending state can be cleared
	 * (should it have been set) for this interrupt.
	 *
	 * Note: if the IRQ soft pending state was set after the IRQ was
	 * acked, it actually shouldn't be cleared, but we have no way of
	 * knowing that unless we start trapping ACKs when the soft-pending
	 * state is set.
	 */
	vgic_dist_irq_clear_soft_pend(vcpu, vlr.irq);

	/* Tell the gic to start sampling this interrupt again. */
	vgic_irq_clear_queued(vcpu, vlr.irq);

	/* Any additional pending interrupt? */
	if (vgic_irq_is_edge(vcpu, vlr.irq)) {
		BUG_ON(!(vlr.state & LR_HW));
		still_pending = vgic_dist_irq_is_pending(vcpu, vlr.irq);
	} else if (vgic_dist_irq_get_level(vcpu, vlr.irq)) {
		vgic_cpu_irq_set(vcpu, vlr.irq);
		still_pending = 1;
	} else {
		vgic_dist_irq_clear_pending(vcpu, vlr.irq);
		vgic_cpu_irq_clear(vcpu, vlr.irq);
	}

	/* Despite being EOIed, the LR may not have been marked as empty. */
	vlr.state = 0;
	vlr.hwirq = 0;
	vgic_set_lr(vcpu, lr, vlr);

	return still_pending;
}
static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
{
u32 status = vgic_get_interrupt_status(vcpu);
struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
bool level_pending = false;
struct kvm *kvm = vcpu->kvm;
int level_pending = 0;
kvm_debug("STATUS = %08x\n", status);
@ -1358,54 +1355,22 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
for_each_set_bit(lr, eisr_ptr, vgic->nr_lr) {
struct vgic_lr vlr = vgic_get_lr(vcpu, lr);
WARN_ON(vgic_irq_is_edge(vcpu, vlr.irq));
spin_lock(&dist->lock);
vgic_irq_clear_queued(vcpu, vlr.irq);
WARN_ON(vlr.state & LR_STATE_MASK);
vlr.state = 0;
vgic_set_lr(vcpu, lr, vlr);
/*
* If the IRQ was EOIed it was also ACKed and we we
* therefore assume we can clear the soft pending
* state (should it had been set) for this interrupt.
*
* Note: if the IRQ soft pending state was set after
* the IRQ was acked, it actually shouldn't be
* cleared, but we have no way of knowing that unless
* we start trapping ACKs when the soft-pending state
* is set.
*/
vgic_dist_irq_clear_soft_pend(vcpu, vlr.irq);
/*
* kvm_notify_acked_irq calls kvm_set_irq()
* to reset the IRQ level. Need to release the
* lock for kvm_set_irq to grab it.
* to reset the IRQ level, which grabs the dist->lock
* so we call this before taking the dist->lock.
*/
spin_unlock(&dist->lock);
kvm_notify_acked_irq(kvm, 0,
vlr.irq - VGIC_NR_PRIVATE_IRQS);
spin_lock(&dist->lock);
/* Any additional pending interrupt? */
if (vgic_dist_irq_get_level(vcpu, vlr.irq)) {
vgic_cpu_irq_set(vcpu, vlr.irq);
level_pending = true;
} else {
vgic_dist_irq_clear_pending(vcpu, vlr.irq);
vgic_cpu_irq_clear(vcpu, vlr.irq);
}
level_pending |= process_queued_irq(vcpu, lr, vlr);
spin_unlock(&dist->lock);
/*
* Despite being EOIed, the LR may not have
* been marked as empty.
*/
vgic_sync_lr_elrsr(vcpu, lr, vlr);
}
}
@ -1426,35 +1391,40 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
/*
* Save the physical active state, and reset it to inactive.
*
* Return 1 if HW interrupt went from active to inactive, and 0 otherwise.
* Return true if there's a pending forwarded interrupt to queue.
*/
static int vgic_sync_hwirq(struct kvm_vcpu *vcpu, struct vgic_lr vlr)
static bool vgic_sync_hwirq(struct kvm_vcpu *vcpu, int lr, struct vgic_lr vlr)
{
struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
struct irq_phys_map *map;
bool phys_active;
bool level_pending;
int ret;
if (!(vlr.state & LR_HW))
return 0;
return false;
map = vgic_irq_map_search(vcpu, vlr.irq);
BUG_ON(!map);
ret = irq_get_irqchip_state(map->irq,
IRQCHIP_STATE_ACTIVE,
&map->active);
&phys_active);
WARN_ON(ret);
if (map->active)
if (phys_active)
return 0;
return 1;
spin_lock(&dist->lock);
level_pending = process_queued_irq(vcpu, lr, vlr);
spin_unlock(&dist->lock);
return level_pending;
}
/* Sync back the VGIC state after a guest run */
static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
u64 elrsr;
unsigned long *elrsr_ptr;
@ -1462,40 +1432,18 @@ static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
bool level_pending;
level_pending = vgic_process_maintenance(vcpu);
elrsr = vgic_get_elrsr(vcpu);
elrsr_ptr = u64_to_bitmask(&elrsr);
/* Deal with HW interrupts, and clear mappings for empty LRs */
for (lr = 0; lr < vgic->nr_lr; lr++) {
struct vgic_lr vlr;
if (!test_bit(lr, vgic_cpu->lr_used))
continue;
vlr = vgic_get_lr(vcpu, lr);
if (vgic_sync_hwirq(vcpu, vlr)) {
/*
* So this is a HW interrupt that the guest
* EOI-ed. Clean the LR state and allow the
* interrupt to be sampled again.
*/
vlr.state = 0;
vlr.hwirq = 0;
vgic_set_lr(vcpu, lr, vlr);
vgic_irq_clear_queued(vcpu, vlr.irq);
set_bit(lr, elrsr_ptr);
}
if (!test_bit(lr, elrsr_ptr))
continue;
clear_bit(lr, vgic_cpu->lr_used);
struct vgic_lr vlr = vgic_get_lr(vcpu, lr);
level_pending |= vgic_sync_hwirq(vcpu, lr, vlr);
BUG_ON(vlr.irq >= dist->nr_irqs);
vgic_cpu->vgic_irq_lr_map[vlr.irq] = LR_EMPTY;
}
/* Check if we still have something up our sleeve... */
elrsr = vgic_get_elrsr(vcpu);
elrsr_ptr = u64_to_bitmask(&elrsr);
pending = find_first_zero_bit(elrsr_ptr, vgic->nr_lr);
if (level_pending || pending < vgic->nr_lr)
set_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu);
@ -1585,6 +1533,8 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
int enabled;
bool ret = true, can_inject = true;
trace_vgic_update_irq_pending(cpuid, irq_num, level);
if (irq_num >= min(kvm->arch.vgic.nr_irqs, 1020))
return -EINVAL;
@ -1863,30 +1813,6 @@ static void vgic_free_phys_irq_map_rcu(struct rcu_head *rcu)
kfree(entry);
}
/**
* kvm_vgic_get_phys_irq_active - Return the active state of a mapped IRQ
*
* Return the logical active state of a mapped interrupt. This doesn't
* necessarily reflects the current HW state.
*/
bool kvm_vgic_get_phys_irq_active(struct irq_phys_map *map)
{
BUG_ON(!map);
return map->active;
}
/**
 * kvm_vgic_set_phys_irq_active - Store the logical active state of a mapped IRQ
 * @map: the mapping to update; must not be NULL (BUG otherwise)
 * @active: the new logical active state
 *
 * Only the state stored in the mapping is updated.  This does not
 * immediately affect the HW state.
 */
void kvm_vgic_set_phys_irq_active(struct irq_phys_map *map, bool active)
{
	BUG_ON(map == NULL);

	map->active = active;
}
/**
* kvm_vgic_unmap_phys_irq - Remove a virtual to physical IRQ mapping
* @vcpu: The VCPU pointer
@ -1942,12 +1868,10 @@ void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
kfree(vgic_cpu->pending_shared);
kfree(vgic_cpu->active_shared);
kfree(vgic_cpu->pend_act_shared);
kfree(vgic_cpu->vgic_irq_lr_map);
vgic_destroy_irq_phys_map(vcpu->kvm, &vgic_cpu->irq_phys_map_list);
vgic_cpu->pending_shared = NULL;
vgic_cpu->active_shared = NULL;
vgic_cpu->pend_act_shared = NULL;
vgic_cpu->vgic_irq_lr_map = NULL;
}
static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs)
@ -1958,18 +1882,14 @@ static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs)
vgic_cpu->pending_shared = kzalloc(sz, GFP_KERNEL);
vgic_cpu->active_shared = kzalloc(sz, GFP_KERNEL);
vgic_cpu->pend_act_shared = kzalloc(sz, GFP_KERNEL);
vgic_cpu->vgic_irq_lr_map = kmalloc(nr_irqs, GFP_KERNEL);
if (!vgic_cpu->pending_shared
|| !vgic_cpu->active_shared
|| !vgic_cpu->pend_act_shared
|| !vgic_cpu->vgic_irq_lr_map) {
|| !vgic_cpu->pend_act_shared) {
kvm_vgic_vcpu_destroy(vcpu);
return -ENOMEM;
}
memset(vgic_cpu->vgic_irq_lr_map, LR_EMPTY, nr_irqs);
/*
* Store the number of LRs per vcpu, so we don't have to go
* all the way to the distributor structure to find out. Only
@ -2111,14 +2031,24 @@ int vgic_init(struct kvm *kvm)
break;
}
for (i = 0; i < dist->nr_irqs; i++) {
if (i < VGIC_NR_PPIS)
/*
* Enable and configure all SGIs to be edge-triggered and
* configure all PPIs as level-triggered.
*/
for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) {
if (i < VGIC_NR_SGIS) {
/* SGIs */
vgic_bitmap_set_irq_val(&dist->irq_enabled,
vcpu->vcpu_id, i, 1);
if (i < VGIC_NR_PRIVATE_IRQS)
vgic_bitmap_set_irq_val(&dist->irq_cfg,
vcpu->vcpu_id, i,
VGIC_CFG_EDGE);
} else if (i < VGIC_NR_PRIVATE_IRQS) {
/* PPIs */
vgic_bitmap_set_irq_val(&dist->irq_cfg,
vcpu->vcpu_id, i,
VGIC_CFG_LEVEL);
}
}
vgic_enable(vcpu);

View File

@ -94,6 +94,10 @@ static void async_pf_execute(struct work_struct *work)
trace_kvm_async_pf_completed(addr, gva);
/*
* This memory barrier pairs with prepare_to_wait's set_current_state()
*/
smp_mb();
if (waitqueue_active(&vcpu->wq))
wake_up_interruptible(&vcpu->wq);

View File

@ -23,6 +23,7 @@
#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/kvm_irqfd.h>
#include <linux/workqueue.h>
#include <linux/syscalls.h>
#include <linux/wait.h>
@ -34,73 +35,20 @@
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/seqlock.h>
#include <linux/irqbypass.h>
#include <trace/events/kvm.h>
#include <kvm/iodev.h>
#ifdef CONFIG_HAVE_KVM_IRQFD
/*
* --------------------------------------------------------------------
* irqfd: Allows an fd to be used to inject an interrupt to the guest
*
* Credit goes to Avi Kivity for the original idea.
* --------------------------------------------------------------------
*/
/*
* Resampling irqfds are a special variety of irqfds used to emulate
* level triggered interrupts. The interrupt is asserted on eventfd
* trigger. On acknowledgement through the irq ack notifier, the
* interrupt is de-asserted and userspace is notified through the
* resamplefd. All resamplers on the same gsi are de-asserted
* together, so we don't need to track the state of each individual
* user. We can also therefore share the same irq source ID.
*/
struct _irqfd_resampler {
struct kvm *kvm;
/*
* List of resampling struct _irqfd objects sharing this gsi.
* RCU list modified under kvm->irqfds.resampler_lock
*/
struct list_head list;
struct kvm_irq_ack_notifier notifier;
/*
* Entry in list of kvm->irqfd.resampler_list. Use for sharing
* resamplers among irqfds on the same gsi.
* Accessed and modified under kvm->irqfds.resampler_lock
*/
struct list_head link;
};
struct _irqfd {
/* Used for MSI fast-path */
struct kvm *kvm;
wait_queue_t wait;
/* Update side is protected by irqfds.lock */
struct kvm_kernel_irq_routing_entry irq_entry;
seqcount_t irq_entry_sc;
/* Used for level IRQ fast-path */
int gsi;
struct work_struct inject;
/* The resampler used by this irqfd (resampler-only) */
struct _irqfd_resampler *resampler;
/* Eventfd notified on resample (resampler-only) */
struct eventfd_ctx *resamplefd;
/* Entry in list of irqfds for a resampler (resampler-only) */
struct list_head resampler_link;
/* Used for setup/shutdown */
struct eventfd_ctx *eventfd;
struct list_head list;
poll_table pt;
struct work_struct shutdown;
};
static struct workqueue_struct *irqfd_cleanup_wq;
static void
irqfd_inject(struct work_struct *work)
{
struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
struct kvm_kernel_irqfd *irqfd =
container_of(work, struct kvm_kernel_irqfd, inject);
struct kvm *kvm = irqfd->kvm;
if (!irqfd->resampler) {
@ -121,12 +69,13 @@ irqfd_inject(struct work_struct *work)
static void
irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
{
struct _irqfd_resampler *resampler;
struct kvm_kernel_irqfd_resampler *resampler;
struct kvm *kvm;
struct _irqfd *irqfd;
struct kvm_kernel_irqfd *irqfd;
int idx;
resampler = container_of(kian, struct _irqfd_resampler, notifier);
resampler = container_of(kian,
struct kvm_kernel_irqfd_resampler, notifier);
kvm = resampler->kvm;
kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
@ -141,9 +90,9 @@ irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
}
static void
irqfd_resampler_shutdown(struct _irqfd *irqfd)
irqfd_resampler_shutdown(struct kvm_kernel_irqfd *irqfd)
{
struct _irqfd_resampler *resampler = irqfd->resampler;
struct kvm_kernel_irqfd_resampler *resampler = irqfd->resampler;
struct kvm *kvm = resampler->kvm;
mutex_lock(&kvm->irqfds.resampler_lock);
@ -168,7 +117,8 @@ irqfd_resampler_shutdown(struct _irqfd *irqfd)
static void
irqfd_shutdown(struct work_struct *work)
{
struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown);
struct kvm_kernel_irqfd *irqfd =
container_of(work, struct kvm_kernel_irqfd, shutdown);
u64 cnt;
/*
@ -191,6 +141,9 @@ irqfd_shutdown(struct work_struct *work)
/*
* It is now safe to release the object's resources
*/
#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
irq_bypass_unregister_consumer(&irqfd->consumer);
#endif
eventfd_ctx_put(irqfd->eventfd);
kfree(irqfd);
}
@ -198,7 +151,7 @@ irqfd_shutdown(struct work_struct *work)
/* assumes kvm->irqfds.lock is held */
static bool
irqfd_is_active(struct _irqfd *irqfd)
irqfd_is_active(struct kvm_kernel_irqfd *irqfd)
{
return list_empty(&irqfd->list) ? false : true;
}
@ -209,7 +162,7 @@ irqfd_is_active(struct _irqfd *irqfd)
* assumes kvm->irqfds.lock is held
*/
static void
irqfd_deactivate(struct _irqfd *irqfd)
irqfd_deactivate(struct kvm_kernel_irqfd *irqfd)
{
BUG_ON(!irqfd_is_active(irqfd));
@ -218,13 +171,23 @@ irqfd_deactivate(struct _irqfd *irqfd)
queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
}
/*
 * Weak fallback for injecting an interrupt from atomic context.
 *
 * This default does no injection at all and always reports -EWOULDBLOCK.
 * irqfd_wakeup() reacts to that value by deferring the injection to the
 * irqfd's "inject" work item.  Being weak, the definition is replaced at
 * link time by any non-weak definition of the same symbol.
 */
int __attribute__((weak))
kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *irq,
			  struct kvm *kvm, int irq_source_id, int level,
			  bool line_status)
{
	return -EWOULDBLOCK;
}
/*
* Called with wqh->lock held and interrupts disabled
*/
static int
irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
struct kvm_kernel_irqfd *irqfd =
container_of(wait, struct kvm_kernel_irqfd, wait);
unsigned long flags = (unsigned long)key;
struct kvm_kernel_irq_routing_entry irq;
struct kvm *kvm = irqfd->kvm;
@ -238,10 +201,9 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
irq = irqfd->irq_entry;
} while (read_seqcount_retry(&irqfd->irq_entry_sc, seq));
/* An event has been signaled, inject an interrupt */
if (irq.type == KVM_IRQ_ROUTING_MSI)
kvm_set_msi(&irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1,
false);
else
if (kvm_arch_set_irq_inatomic(&irq, kvm,
KVM_USERSPACE_IRQ_SOURCE_ID, 1,
false) == -EWOULDBLOCK)
schedule_work(&irqfd->inject);
srcu_read_unlock(&kvm->irq_srcu, idx);
}
@ -274,37 +236,54 @@ static void
irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
poll_table *pt)
{
struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt);
struct kvm_kernel_irqfd *irqfd =
container_of(pt, struct kvm_kernel_irqfd, pt);
add_wait_queue(wqh, &irqfd->wait);
}
/* Must be called under irqfds.lock */
static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd)
static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd)
{
struct kvm_kernel_irq_routing_entry *e;
struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
int i, n_entries;
int n_entries;
n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi);
write_seqcount_begin(&irqfd->irq_entry_sc);
irqfd->irq_entry.type = 0;
e = entries;
for (i = 0; i < n_entries; ++i, ++e) {
/* Only fast-path MSI. */
if (e->type == KVM_IRQ_ROUTING_MSI)
irqfd->irq_entry = *e;
}
if (n_entries == 1)
irqfd->irq_entry = *e;
else
irqfd->irq_entry.type = 0;
write_seqcount_end(&irqfd->irq_entry_sc);
}
#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
/*
 * Weak fallbacks for the IRQ bypass hooks.  Each one is replaced at link
 * time by any non-weak definition of the same symbol.
 */

/* Default "stop" callback installed on the irqfd's bypass consumer: no-op. */
void __attribute__((weak))
kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons)
{
}

/* Default "start" callback installed on the irqfd's bypass consumer: no-op. */
void __attribute__((weak))
kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons)
{
}

/*
 * Default routing-update hook: nothing to update, report success (0).
 * kvm_irq_routing_update() WARNs on any non-zero return value.
 */
int __attribute__((weak))
kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
			      uint32_t guest_irq, bool set)
{
	return 0;
}
#endif
static int
kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
{
struct _irqfd *irqfd, *tmp;
struct kvm_kernel_irqfd *irqfd, *tmp;
struct fd f;
struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
int ret;
@ -340,7 +319,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
irqfd->eventfd = eventfd;
if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) {
struct _irqfd_resampler *resampler;
struct kvm_kernel_irqfd_resampler *resampler;
resamplefd = eventfd_ctx_fdget(args->resamplefd);
if (IS_ERR(resamplefd)) {
@ -428,6 +407,17 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
* we might race against the POLLHUP
*/
fdput(f);
#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
irqfd->consumer.token = (void *)irqfd->eventfd;
irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer;
irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer;
irqfd->consumer.stop = kvm_arch_irq_bypass_stop;
irqfd->consumer.start = kvm_arch_irq_bypass_start;
ret = irq_bypass_register_consumer(&irqfd->consumer);
if (ret)
pr_info("irq bypass consumer (token %p) registration fails: %d\n",
irqfd->consumer.token, ret);
#endif
return 0;
@ -469,9 +459,18 @@ bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
}
EXPORT_SYMBOL_GPL(kvm_irq_has_notifier);
void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
void kvm_notify_acked_gsi(struct kvm *kvm, int gsi)
{
struct kvm_irq_ack_notifier *kian;
hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
link)
if (kian->gsi == gsi)
kian->irq_acked(kian);
}
void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
{
int gsi, idx;
trace_kvm_ack_irq(irqchip, pin);
@ -479,10 +478,7 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
idx = srcu_read_lock(&kvm->irq_srcu);
gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
if (gsi != -1)
hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
link)
if (kian->gsi == gsi)
kian->irq_acked(kian);
kvm_notify_acked_gsi(kvm, gsi);
srcu_read_unlock(&kvm->irq_srcu, idx);
}
@ -525,7 +521,7 @@ kvm_eventfd_init(struct kvm *kvm)
static int
kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
{
struct _irqfd *irqfd, *tmp;
struct kvm_kernel_irqfd *irqfd, *tmp;
struct eventfd_ctx *eventfd;
eventfd = eventfd_ctx_fdget(args->fd);
@ -581,7 +577,7 @@ kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
void
kvm_irqfd_release(struct kvm *kvm)
{
struct _irqfd *irqfd, *tmp;
struct kvm_kernel_irqfd *irqfd, *tmp;
spin_lock_irq(&kvm->irqfds.lock);
@ -604,13 +600,23 @@ kvm_irqfd_release(struct kvm *kvm)
*/
void kvm_irq_routing_update(struct kvm *kvm)
{
struct _irqfd *irqfd;
struct kvm_kernel_irqfd *irqfd;
spin_lock_irq(&kvm->irqfds.lock);
list_for_each_entry(irqfd, &kvm->irqfds.items, list)
list_for_each_entry(irqfd, &kvm->irqfds.items, list) {
irqfd_update(kvm, irqfd);
#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
if (irqfd->producer) {
int ret = kvm_arch_update_irqfd_routing(
irqfd->kvm, irqfd->producer->irq,
irqfd->gsi, 1);
WARN_ON(ret);
}
#endif
}
spin_unlock_irq(&kvm->irqfds.lock);
}
@ -914,9 +920,7 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
return -EINVAL;
/* ioeventfd with no length can't be combined with DATAMATCH */
if (!args->len &&
args->flags & (KVM_IOEVENTFD_FLAG_PIO |
KVM_IOEVENTFD_FLAG_DATAMATCH))
if (!args->len && (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH))
return -EINVAL;
ret = kvm_assign_ioeventfd_idx(kvm, bus_idx, args);

View File

@ -31,16 +31,6 @@
#include <trace/events/kvm.h>
#include "irq.h"
struct kvm_irq_routing_table {
int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS];
u32 nr_rt_entries;
/*
* Array indexed by gsi. Each entry contains list of irq chips
* the gsi is connected to.
*/
struct hlist_head map[0];
};
int kvm_irq_map_gsi(struct kvm *kvm,
struct kvm_kernel_irq_routing_entry *entries, int gsi)
{
@ -154,11 +144,11 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
/*
* Do not allow GSI to be mapped to the same irqchip more than once.
* Allow only one to one mapping between GSI and MSI.
* Allow only one to one mapping between GSI and non-irqchip routing.
*/
hlist_for_each_entry(ei, &rt->map[ue->gsi], link)
if (ei->type == KVM_IRQ_ROUTING_MSI ||
ue->type == KVM_IRQ_ROUTING_MSI ||
if (ei->type != KVM_IRQ_ROUTING_IRQCHIP ||
ue->type != KVM_IRQ_ROUTING_IRQCHIP ||
ue->u.irqchip.irqchip == ei->irqchip.irqchip)
return r;
@ -231,6 +221,8 @@ int kvm_set_irq_routing(struct kvm *kvm,
kvm_irq_routing_update(kvm);
mutex_unlock(&kvm->irq_lock);
kvm_arch_irq_routing_update(kvm);
synchronize_srcu_expedited(&kvm->irq_srcu);
new = old;

View File

@ -230,6 +230,9 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
init_waitqueue_head(&vcpu->wq);
kvm_async_pf_vcpu_init(vcpu);
vcpu->pre_pcpu = -1;
INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
page = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!page) {
r = -ENOMEM;
@ -2018,6 +2021,8 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
} while (single_task_running() && ktime_before(cur, stop));
}
kvm_arch_vcpu_blocking(vcpu);
for (;;) {
prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
@ -2031,6 +2036,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
finish_wait(&vcpu->wq, &wait);
cur = ktime_get();
kvm_arch_vcpu_unblocking(vcpu);
out:
block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
@ -2718,6 +2724,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
case KVM_CAP_IRQFD:
case KVM_CAP_IRQFD_RESAMPLE:
#endif
case KVM_CAP_IOEVENTFD_ANY_LENGTH:
case KVM_CAP_CHECK_EXTENSION_VM:
return 1;
#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
@ -3341,7 +3348,7 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
return -ENOSPC;
new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count + 1) *
new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count + 1) *
sizeof(struct kvm_io_range)), GFP_KERNEL);
if (!new_bus)
return -ENOMEM;
@ -3373,7 +3380,7 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
if (r)
return r;
new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count - 1) *
new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) *
sizeof(struct kvm_io_range)), GFP_KERNEL);
if (!new_bus)
return -ENOMEM;

2
virt/lib/Kconfig Normal file
View File

@ -0,0 +1,2 @@
config IRQ_BYPASS_MANAGER
tristate

1
virt/lib/Makefile Normal file
View File

@ -0,0 +1 @@
obj-$(CONFIG_IRQ_BYPASS_MANAGER) += irqbypass.o

257
virt/lib/irqbypass.c Normal file
View File

@ -0,0 +1,257 @@
/*
* IRQ offload/bypass manager
*
* Copyright (C) 2015 Red Hat, Inc.
* Copyright (c) 2015 Linaro Ltd.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* Various virtualization hardware acceleration techniques allow bypassing or
* offloading interrupts received from devices around the host kernel. Posted
* Interrupts on Intel VT-d systems can allow interrupts to be received
* directly by a virtual machine. ARM IRQ Forwarding allows forwarded physical
* interrupts to be directly deactivated by the guest. This manager allows
* interrupt producers and consumers to find each other to enable this sort of
* bypass.
*/
#include <linux/irqbypass.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/mutex.h>
MODULE_LICENSE("GPL v2");
MODULE_DESCRIPTION("IRQ bypass manager utility module");
static LIST_HEAD(producers);
static LIST_HEAD(consumers);
static DEFINE_MUTEX(lock);
/*
 * Pair a producer with a consumer.
 *
 * @lock must be held when calling connect.
 *
 * Returns 0 on success, or the first non-zero value returned by the
 * producer's add_consumer() or the consumer's add_producer() callback.
 */
static int __connect(struct irq_bypass_producer *prod,
		     struct irq_bypass_consumer *cons)
{
	int err;

	/* Pause both ends (producer first) while they are being wired up. */
	if (prod->stop)
		prod->stop(prod);
	if (cons->stop)
		cons->stop(cons);

	/* add_consumer() is optional; a producer without one counts as success. */
	err = prod->add_consumer ? prod->add_consumer(prod, cons) : 0;

	if (err == 0) {
		err = cons->add_producer(cons, prod);
		/* Consumer side refused: roll back the producer-side hookup. */
		if (err != 0 && prod->del_consumer)
			prod->del_consumer(prod, cons);
	}

	/* Resume in the opposite order, whether or not the pairing succeeded. */
	if (cons->start)
		cons->start(cons);
	if (prod->start)
		prod->start(prod);

	return err;
}
/*
 * Undo a producer/consumer pairing.
 *
 * @lock must be held when calling disconnect.
 *
 * The optional stop()/start() callbacks bracket the teardown; only the
 * consumer's del_producer() is called unconditionally (the register path
 * for consumers rejects a consumer that lacks it).
 */
static void __disconnect(struct irq_bypass_producer *prod,
struct irq_bypass_consumer *cons)
{
/* Pause both ends, producer first. */
if (prod->stop)
prod->stop(prod);
if (cons->stop)
cons->stop(cons);
/* Detach on the consumer side first, then on the producer side. */
cons->del_producer(cons, prod);
if (prod->del_consumer)
prod->del_consumer(prod, cons);
/* Resume in the opposite order of the pause: consumer, then producer. */
if (cons->start)
cons->start(cons);
if (prod->start)
prod->start(prod);
}
/**
* irq_bypass_register_producer - register IRQ bypass producer
* @producer: pointer to producer structure
*
* Add the provided IRQ producer to the list of producers and connect
* with any matching token found on the IRQ consumers list.
*/
int irq_bypass_register_producer(struct irq_bypass_producer *producer)
{
struct irq_bypass_producer *tmp;
struct irq_bypass_consumer *consumer;
might_sleep();
if (!try_module_get(THIS_MODULE))
return -ENODEV;
mutex_lock(&lock);
list_for_each_entry(tmp, &producers, node) {
if (tmp->token == producer->token) {
mutex_unlock(&lock);
module_put(THIS_MODULE);
return -EBUSY;
}
}
list_for_each_entry(consumer, &consumers, node) {
if (consumer->token == producer->token) {
int ret = __connect(producer, consumer);
if (ret) {
mutex_unlock(&lock);
module_put(THIS_MODULE);
return ret;
}
break;
}
}
list_add(&producer->node, &producers);
mutex_unlock(&lock);
return 0;
}
EXPORT_SYMBOL_GPL(irq_bypass_register_producer);
/**
 * irq_bypass_unregister_producer - unregister IRQ bypass producer
 * @producer: pointer to producer structure
 *
 * Remove a previously registered IRQ producer from the list of producers
 * and disconnect it from any connected IRQ consumer.
 *
 * Calling this for a producer that is not on the list (for instance
 * because its registration failed) is a no-op.
 */
void irq_bypass_unregister_producer(struct irq_bypass_producer *producer)
{
	struct irq_bypass_producer *tmp;
	struct irq_bypass_consumer *consumer;

	might_sleep();

	if (!try_module_get(THIS_MODULE))
		return; /* nothing in the list anyway */

	mutex_lock(&lock);

	list_for_each_entry(tmp, &producers, node) {
		/*
		 * Match the node itself, not just its token.  Otherwise a
		 * producer that was never registered but shares a token with
		 * a registered one would have its unlinked node passed to
		 * list_del() and would drop a module reference it never took.
		 */
		if (tmp != producer)
			continue;

		list_for_each_entry(consumer, &consumers, node) {
			if (consumer->token == producer->token) {
				__disconnect(producer, consumer);
				break;
			}
		}

		list_del(&producer->node);
		/* Drop the reference taken when the producer was registered. */
		module_put(THIS_MODULE);
		break;
	}

	mutex_unlock(&lock);

	/* Drop the temporary reference taken at the top of this function. */
	module_put(THIS_MODULE);
}
EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer);
/**
 * irq_bypass_register_consumer - register IRQ bypass consumer
 * @consumer: pointer to consumer structure
 *
 * Add the provided IRQ consumer to the list of consumers and connect
 * with any matching token found on the IRQ producer list.
 *
 * A module reference is taken for every successful registration and is
 * dropped again by irq_bypass_unregister_consumer().
 *
 * Return: 0 on success, -EINVAL if @consumer has no token or lacks the
 * mandatory add_producer/del_producer callbacks, -ENODEV if no reference
 * on this module could be taken, -EBUSY if @consumer or another consumer
 * with the same token is already registered, or the error reported while
 * connecting to the matching producer.
 */
int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer)
{
	struct irq_bypass_consumer *tmp;
	struct irq_bypass_producer *producer;
	int ret;

	/*
	 * Tokens are matched by pointer value, so a NULL token would pair
	 * with any producer that also has a NULL token.  Reject it, along
	 * with consumers missing the callbacks that are called
	 * unconditionally when connecting and disconnecting.
	 */
	if (!consumer->token ||
	    !consumer->add_producer || !consumer->del_producer)
		return -EINVAL;

	might_sleep();

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	mutex_lock(&lock);

	list_for_each_entry(tmp, &consumers, node) {
		/*
		 * Also refuse to add the very same node twice: a second
		 * list_add() of a linked node would corrupt the list.
		 */
		if (tmp->token == consumer->token || tmp == consumer) {
			ret = -EBUSY;
			goto out_err;
		}
	}

	list_for_each_entry(producer, &producers, node) {
		if (producer->token == consumer->token) {
			ret = __connect(producer, consumer);
			if (ret)
				goto out_err;
			break;
		}
	}

	list_add(&consumer->node, &consumers);

	mutex_unlock(&lock);

	return 0;

out_err:
	mutex_unlock(&lock);
	module_put(THIS_MODULE);
	return ret;
}
EXPORT_SYMBOL_GPL(irq_bypass_register_consumer);
/**
 * irq_bypass_unregister_consumer - unregister IRQ bypass consumer
 * @consumer: pointer to consumer structure
 *
 * Remove a previously registered IRQ consumer from the list of consumers
 * and disconnect it from any connected IRQ producer.
 *
 * Calling this for a consumer that is not on the list (for instance
 * because its registration failed) is a no-op.
 */
void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer)
{
	struct irq_bypass_consumer *tmp;
	struct irq_bypass_producer *producer;

	might_sleep();

	if (!try_module_get(THIS_MODULE))
		return; /* nothing in the list anyway */

	mutex_lock(&lock);

	list_for_each_entry(tmp, &consumers, node) {
		/*
		 * Match the node itself, not just its token.  Otherwise a
		 * consumer whose registration failed with -EBUSY (same token
		 * as a registered consumer) would have its unlinked node
		 * passed to list_del() and would drop a module reference it
		 * never took.
		 */
		if (tmp != consumer)
			continue;

		list_for_each_entry(producer, &producers, node) {
			if (producer->token == consumer->token) {
				__disconnect(producer, consumer);
				break;
			}
		}

		list_del(&consumer->node);
		/* Drop the reference taken when the consumer was registered. */
		module_put(THIS_MODULE);
		break;
	}

	mutex_unlock(&lock);

	/* Drop the temporary reference taken at the top of this function. */
	module_put(THIS_MODULE);
}
EXPORT_SYMBOL_GPL(irq_bypass_unregister_consumer);