mirror of
https://github.com/xemu-project/xemu.git
synced 2024-11-28 05:50:37 +00:00
f8d8a94400
On systems with a guest-visible IOMMU, adding a new memory region onto the PCI bus calls vfio_listener_region_add() for every DMA window. This installs a notifier for IOMMU memory regions. The notifier is supposed to be removed by vfio_listener_region_del(); however, in the case of a mixed PHB (emulated + VFIO devices), when the last VFIO device is unplugged and the container gets destroyed, all existing DMA windows stay alive together with their notifiers, which are on a linked list whose head was in the destroyed container. This unregisters the IOMMU memory region notifiers when a container is destroyed. Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
973 lines
29 KiB
C
973 lines
29 KiB
C
/*
 * generic functions used by VFIO devices
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Alex Williamson <alex.williamson@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Based on qemu-kvm device-assignment:
 *  Adapted for KVM by Qumranet.
 *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
 *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
 *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
 *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
 *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
 */
|
|
|
|
#include <sys/ioctl.h>
|
|
#include <sys/mman.h>
|
|
#include <linux/vfio.h>
|
|
|
|
#include "hw/vfio/vfio-common.h"
|
|
#include "hw/vfio/vfio.h"
|
|
#include "exec/address-spaces.h"
|
|
#include "exec/memory.h"
|
|
#include "hw/hw.h"
|
|
#include "qemu/error-report.h"
|
|
#include "sysemu/kvm.h"
|
|
#include "trace.h"
|
|
|
|
struct vfio_group_head vfio_group_list =
|
|
QLIST_HEAD_INITIALIZER(vfio_group_list);
|
|
struct vfio_as_head vfio_address_spaces =
|
|
QLIST_HEAD_INITIALIZER(vfio_address_spaces);
|
|
|
|
#ifdef CONFIG_KVM
/*
 * File descriptor of the KVM VFIO pseudo device, or -1 while it has not
 * been created yet.
 *
 * There is a single such pseudo device per KVM VM and, once created, it
 * lives for the life of the VM.  Closing the descriptor would only drop
 * our reference to it and the device's reference to kvm, so after
 * initialization the descriptor is only released on QEMU exit and is
 * re-used should another vfio device be attached before then.
 */
static int vfio_kvm_device_fd = -1;
#endif
|
|
|
|
/*
|
|
* Common VFIO interrupt disable
|
|
*/
|
|
/*
 * Send a VFIO_DEVICE_SET_IRQS request for IRQ index @index on the device
 * fd, using DATA_NONE | ACTION_TRIGGER with start 0 and count 0.
 *
 * The ioctl result is not checked.
 */
void vfio_disable_irqindex(VFIODevice *vbasedev, int index)
{
    struct vfio_irq_set disable_req = {
        .argsz = sizeof(disable_req),
        .index = index,
        .start = 0,
        .count = 0,
        .flags = VFIO_IRQ_SET_ACTION_TRIGGER | VFIO_IRQ_SET_DATA_NONE,
    };
    int dev_fd = vbasedev->fd;

    ioctl(dev_fd, VFIO_DEVICE_SET_IRQS, &disable_req);
}
|
|
|
|
/*
 * Send a VFIO_DEVICE_SET_IRQS request with ACTION_UNMASK for a single
 * interrupt (start 0, count 1) of IRQ index @index on the device fd.
 *
 * The ioctl result is not checked.
 */
void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index)
{
    struct vfio_irq_set unmask_req = {
        .argsz = sizeof(unmask_req),
        .index = index,
        .start = 0,
        .count = 1,
        .flags = VFIO_IRQ_SET_ACTION_UNMASK | VFIO_IRQ_SET_DATA_NONE,
    };
    int dev_fd = vbasedev->fd;

    ioctl(dev_fd, VFIO_DEVICE_SET_IRQS, &unmask_req);
}
|
|
|
|
/*
 * Send a VFIO_DEVICE_SET_IRQS request with ACTION_MASK for a single
 * interrupt (start 0, count 1) of IRQ index @index on the device fd.
 *
 * The ioctl result is not checked.
 */
void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index)
{
    struct vfio_irq_set mask_req = {
        .argsz = sizeof(mask_req),
        .index = index,
        .start = 0,
        .count = 1,
        .flags = VFIO_IRQ_SET_ACTION_MASK | VFIO_IRQ_SET_DATA_NONE,
    };
    int dev_fd = vbasedev->fd;

    ioctl(dev_fd, VFIO_DEVICE_SET_IRQS, &mask_req);
}
|
|
|
|
/*
|
|
* IO Port/MMIO - Beware of the endians, VFIO is always little endian
|
|
*/
|
|
/*
 * MemoryRegionOps write handler for a VFIO region.
 *
 * @opaque is the VFIORegion being accessed, @addr the offset inside it,
 * @data the value to store and @size the access width in bytes.  Widths
 * of 1, 2 and 4 bytes are handled; anything else is reported through
 * hw_error().  The value is converted to little endian and written to the
 * device fd at the region's file offset plus @addr.  A short or failed
 * write is only reported, not propagated.
 */
void vfio_region_write(void *opaque, hwaddr addr,
                       uint64_t data, unsigned size)
{
    VFIORegion *region = opaque;
    VFIODevice *vbasedev = region->vbasedev;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;

    /* Stage the value in device (little endian) byte order. */
    if (size == 1) {
        buf.byte = data;
    } else if (size == 2) {
        buf.word = cpu_to_le16(data);
    } else if (size == 4) {
        buf.dword = cpu_to_le32(data);
    } else {
        hw_error("vfio: unsupported write size, %d bytes", size);
    }

    if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
                     ",%d) failed: %m",
                     __func__, vbasedev->name, region->nr,
                     addr, data, size);
    }

    trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);

    /*
     * A read or write to a BAR always signals an INTx EOI.  This will
     * do nothing if not pending (including not in INTx mode).  We assume
     * that a BAR access is in response to an interrupt and that BAR
     * accesses will service the interrupt.  Unfortunately, we don't know
     * which access will service the interrupt, so we're potentially
     * getting quite a few host interrupts per guest interrupt.
     */
    vbasedev->ops->vfio_eoi(vbasedev);
}
|
|
|
|
uint64_t vfio_region_read(void *opaque,
|
|
hwaddr addr, unsigned size)
|
|
{
|
|
VFIORegion *region = opaque;
|
|
VFIODevice *vbasedev = region->vbasedev;
|
|
union {
|
|
uint8_t byte;
|
|
uint16_t word;
|
|
uint32_t dword;
|
|
uint64_t qword;
|
|
} buf;
|
|
uint64_t data = 0;
|
|
|
|
if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
|
|
error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
|
|
__func__, vbasedev->name, region->nr,
|
|
addr, size);
|
|
return (uint64_t)-1;
|
|
}
|
|
switch (size) {
|
|
case 1:
|
|
data = buf.byte;
|
|
break;
|
|
case 2:
|
|
data = le16_to_cpu(buf.word);
|
|
break;
|
|
case 4:
|
|
data = le32_to_cpu(buf.dword);
|
|
break;
|
|
default:
|
|
hw_error("vfio: unsupported read size, %d bytes", size);
|
|
break;
|
|
}
|
|
|
|
trace_vfio_region_read(vbasedev->name, region->nr, addr, size, data);
|
|
|
|
/* Same as write above */
|
|
vbasedev->ops->vfio_eoi(vbasedev);
|
|
|
|
return data;
|
|
}
|
|
|
|
const MemoryRegionOps vfio_region_ops = {
|
|
.read = vfio_region_read,
|
|
.write = vfio_region_write,
|
|
.endianness = DEVICE_LITTLE_ENDIAN,
|
|
};
|
|
|
|
/*
|
|
* DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
|
|
*/
|
|
static int vfio_dma_unmap(VFIOContainer *container,
|
|
hwaddr iova, ram_addr_t size)
|
|
{
|
|
struct vfio_iommu_type1_dma_unmap unmap = {
|
|
.argsz = sizeof(unmap),
|
|
.flags = 0,
|
|
.iova = iova,
|
|
.size = size,
|
|
};
|
|
|
|
if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
|
|
error_report("VFIO_UNMAP_DMA: %d", -errno);
|
|
return -errno;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
|
|
ram_addr_t size, void *vaddr, bool readonly)
|
|
{
|
|
struct vfio_iommu_type1_dma_map map = {
|
|
.argsz = sizeof(map),
|
|
.flags = VFIO_DMA_MAP_FLAG_READ,
|
|
.vaddr = (__u64)(uintptr_t)vaddr,
|
|
.iova = iova,
|
|
.size = size,
|
|
};
|
|
|
|
if (!readonly) {
|
|
map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
|
|
}
|
|
|
|
/*
|
|
* Try the mapping, if it fails with EBUSY, unmap the region and try
|
|
* again. This shouldn't be necessary, but we sometimes see it in
|
|
* the the VGA ROM space.
|
|
*/
|
|
if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
|
|
(errno == EBUSY && vfio_dma_unmap(container, iova, size) == 0 &&
|
|
ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
|
|
return 0;
|
|
}
|
|
|
|
error_report("VFIO_MAP_DMA: %d", -errno);
|
|
return -errno;
|
|
}
|
|
|
|
/*
 * Decide whether the memory listener should ignore @section.
 *
 * A section is skipped when its region is neither RAM nor an IOMMU
 * region, or when its start address has bit 63 set.
 */
static bool vfio_listener_skipped_section(MemoryRegionSection *section)
{
    if (!memory_region_is_ram(section->mr) &&
        !memory_region_is_iommu(section->mr)) {
        return true;
    }

    /*
     * Sizing an enabled 64-bit BAR can cause spurious mappings to
     * addresses in the upper part of the 64-bit address space.  These
     * are never accessed by the CPU and beyond the address width of
     * some IOMMU hardware.  TODO: VFIO should tell us the IOMMU width.
     */
    return (section->offset_within_address_space & (1ULL << 63)) != 0;
}
|
|
|
|
/*
 * IOMMU notifier callback: mirror one guest IOMMU TLB entry into VFIO.
 *
 * @n is the notifier embedded in a VFIOGuestIOMMU and @data the
 * IOMMUTLBEntry that changed.  An entry with any access permission is
 * mapped through vfio_dma_map(); an entry without permissions is unmapped
 * through vfio_dma_unmap().  Failures are reported but not propagated.
 */
static void vfio_iommu_map_notify(Notifier *n, void *data)
{
    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
    VFIOContainer *container = giommu->container;
    IOMMUTLBEntry *iotlb = data;
    hwaddr entry_size = iotlb->addr_mask + 1;
    hwaddr len = entry_size;
    hwaddr xlat;
    MemoryRegion *mr;
    void *vaddr;
    int ret;

    trace_vfio_iommu_map_notify(iotlb->iova,
                                iotlb->iova + iotlb->addr_mask);

    /*
     * The IOMMU TLB entry we have just covers translation through
     * this IOMMU to its immediate target.  We need to translate
     * it the rest of the way through to memory.
     */
    rcu_read_lock();
    mr = address_space_translate(&address_space_memory,
                                 iotlb->translated_addr,
                                 &xlat, &len, iotlb->perm & IOMMU_WO);
    if (!memory_region_is_ram(mr)) {
        error_report("iommu map to non memory area %"HWADDR_PRIx"",
                     xlat);
        goto out;
    }

    /*
     * Translation truncates length to the IOMMU page size,
     * check that it did not truncate too much.
     */
    if (len & iotlb->addr_mask) {
        error_report("iommu has granularity incompatible with target AS");
        goto out;
    }

    if ((iotlb->perm & IOMMU_RW) == IOMMU_NONE) {
        /* No access allowed any more: drop the mapping. */
        ret = vfio_dma_unmap(container, iotlb->iova, entry_size);
        if (ret) {
            error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx") = %d (%m)",
                         container, iotlb->iova,
                         entry_size, ret);
        }
    } else {
        /* Map read-only unless the entry allows writes to writable RAM. */
        vaddr = memory_region_get_ram_ptr(mr) + xlat;
        ret = vfio_dma_map(container, iotlb->iova,
                           entry_size, vaddr,
                           !(iotlb->perm & IOMMU_WO) || mr->readonly);
        if (ret) {
            error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx", %p) = %d (%m)",
                         container, iotlb->iova,
                         entry_size, vaddr, ret);
        }
    }

out:
    rcu_read_unlock();
}
|
|
|
|
static void vfio_listener_region_add(MemoryListener *listener,
|
|
MemoryRegionSection *section)
|
|
{
|
|
VFIOContainer *container = container_of(listener, VFIOContainer,
|
|
iommu_data.type1.listener);
|
|
hwaddr iova, end;
|
|
Int128 llend;
|
|
void *vaddr;
|
|
int ret;
|
|
|
|
if (vfio_listener_skipped_section(section)) {
|
|
trace_vfio_listener_region_add_skip(
|
|
section->offset_within_address_space,
|
|
section->offset_within_address_space +
|
|
int128_get64(int128_sub(section->size, int128_one())));
|
|
return;
|
|
}
|
|
|
|
if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
|
|
(section->offset_within_region & ~TARGET_PAGE_MASK))) {
|
|
error_report("%s received unaligned region", __func__);
|
|
return;
|
|
}
|
|
|
|
iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
|
|
llend = int128_make64(section->offset_within_address_space);
|
|
llend = int128_add(llend, section->size);
|
|
llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
|
|
|
|
if (int128_ge(int128_make64(iova), llend)) {
|
|
return;
|
|
}
|
|
|
|
memory_region_ref(section->mr);
|
|
|
|
if (memory_region_is_iommu(section->mr)) {
|
|
VFIOGuestIOMMU *giommu;
|
|
|
|
trace_vfio_listener_region_add_iommu(iova,
|
|
int128_get64(int128_sub(llend, int128_one())));
|
|
/*
|
|
* FIXME: We should do some checking to see if the
|
|
* capabilities of the host VFIO IOMMU are adequate to model
|
|
* the guest IOMMU
|
|
*
|
|
* FIXME: For VFIO iommu types which have KVM acceleration to
|
|
* avoid bouncing all map/unmaps through qemu this way, this
|
|
* would be the right place to wire that up (tell the KVM
|
|
* device emulation the VFIO iommu handles to use).
|
|
*/
|
|
/*
|
|
* This assumes that the guest IOMMU is empty of
|
|
* mappings at this point.
|
|
*
|
|
* One way of doing this is:
|
|
* 1. Avoid sharing IOMMUs between emulated devices or different
|
|
* IOMMU groups.
|
|
* 2. Implement VFIO_IOMMU_ENABLE in the host kernel to fail if
|
|
* there are some mappings in IOMMU.
|
|
*
|
|
* VFIO on SPAPR does that. Other IOMMU models may do that different,
|
|
* they must make sure there are no existing mappings or
|
|
* loop through existing mappings to map them into VFIO.
|
|
*/
|
|
giommu = g_malloc0(sizeof(*giommu));
|
|
giommu->iommu = section->mr;
|
|
giommu->container = container;
|
|
giommu->n.notify = vfio_iommu_map_notify;
|
|
QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
|
|
memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);
|
|
|
|
return;
|
|
}
|
|
|
|
/* Here we assume that memory_region_is_ram(section->mr)==true */
|
|
|
|
end = int128_get64(llend);
|
|
vaddr = memory_region_get_ram_ptr(section->mr) +
|
|
section->offset_within_region +
|
|
(iova - section->offset_within_address_space);
|
|
|
|
trace_vfio_listener_region_add_ram(iova, end - 1, vaddr);
|
|
|
|
ret = vfio_dma_map(container, iova, end - iova, vaddr, section->readonly);
|
|
if (ret) {
|
|
error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
|
|
"0x%"HWADDR_PRIx", %p) = %d (%m)",
|
|
container, iova, end - iova, vaddr, ret);
|
|
|
|
/*
|
|
* On the initfn path, store the first error in the container so we
|
|
* can gracefully fail. Runtime, there's not much we can do other
|
|
* than throw a hardware error.
|
|
*/
|
|
if (!container->iommu_data.type1.initialized) {
|
|
if (!container->iommu_data.type1.error) {
|
|
container->iommu_data.type1.error = ret;
|
|
}
|
|
} else {
|
|
hw_error("vfio: DMA mapping failed, unable to continue");
|
|
}
|
|
}
|
|
}
|
|
|
|
static void vfio_listener_region_del(MemoryListener *listener,
|
|
MemoryRegionSection *section)
|
|
{
|
|
VFIOContainer *container = container_of(listener, VFIOContainer,
|
|
iommu_data.type1.listener);
|
|
hwaddr iova, end;
|
|
int ret;
|
|
|
|
if (vfio_listener_skipped_section(section)) {
|
|
trace_vfio_listener_region_del_skip(
|
|
section->offset_within_address_space,
|
|
section->offset_within_address_space +
|
|
int128_get64(int128_sub(section->size, int128_one())));
|
|
return;
|
|
}
|
|
|
|
if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
|
|
(section->offset_within_region & ~TARGET_PAGE_MASK))) {
|
|
error_report("%s received unaligned region", __func__);
|
|
return;
|
|
}
|
|
|
|
if (memory_region_is_iommu(section->mr)) {
|
|
VFIOGuestIOMMU *giommu;
|
|
|
|
QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
|
|
if (giommu->iommu == section->mr) {
|
|
memory_region_unregister_iommu_notifier(&giommu->n);
|
|
QLIST_REMOVE(giommu, giommu_next);
|
|
g_free(giommu);
|
|
break;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* FIXME: We assume the one big unmap below is adequate to
|
|
* remove any individual page mappings in the IOMMU which
|
|
* might have been copied into VFIO. This works for a page table
|
|
* based IOMMU where a big unmap flattens a large range of IO-PTEs.
|
|
* That may not be true for all IOMMU types.
|
|
*/
|
|
}
|
|
|
|
iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
|
|
end = (section->offset_within_address_space + int128_get64(section->size)) &
|
|
TARGET_PAGE_MASK;
|
|
|
|
if (iova >= end) {
|
|
return;
|
|
}
|
|
|
|
trace_vfio_listener_region_del(iova, end - 1);
|
|
|
|
ret = vfio_dma_unmap(container, iova, end - iova);
|
|
memory_region_unref(section->mr);
|
|
if (ret) {
|
|
error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
|
|
"0x%"HWADDR_PRIx") = %d (%m)",
|
|
container, iova, end - iova, ret);
|
|
}
|
|
}
|
|
|
|
static const MemoryListener vfio_memory_listener = {
|
|
.region_add = vfio_listener_region_add,
|
|
.region_del = vfio_listener_region_del,
|
|
};
|
|
|
|
/* Unregister the memory listener embedded in @container. */
static void vfio_listener_release(VFIOContainer *container)
{
    MemoryListener *listener = &container->iommu_data.type1.listener;

    memory_listener_unregister(listener);
}
|
|
|
|
/*
 * Try to mmap() part of a VFIO region and expose it as a sub-region.
 *
 * If mmap is allowed for the device, @size is non-zero and the region
 * advertises VFIO_REGION_INFO_FLAG_MMAP, @size bytes of the device fd at
 * the region's file offset plus @offset are mapped (readable and/or
 * writable according to the region flags), stored in *@map and wrapped in
 * @submem as a RAM-pointer region.  Otherwise, or when mmap() fails,
 * @submem is initialised as a zero sized region.  In every case @submem
 * is added to @mem at @offset.
 *
 * Returns 0, or -errno if mmap() failed (with *@map set to NULL).
 */
int vfio_mmap_region(Object *obj, VFIORegion *region,
                     MemoryRegion *mem, MemoryRegion *submem,
                     void **map, size_t size, off_t offset,
                     const char *name)
{
    VFIODevice *vbasedev = region->vbasedev;
    bool mapped = false;
    int ret = 0;

    if (vbasedev->allow_mmap && size &&
        (region->flags & VFIO_REGION_INFO_FLAG_MMAP)) {
        int prot = 0;

        if (region->flags & VFIO_REGION_INFO_FLAG_READ) {
            prot |= PROT_READ;
        }

        if (region->flags & VFIO_REGION_INFO_FLAG_WRITE) {
            prot |= PROT_WRITE;
        }

        *map = mmap(NULL, size, prot, MAP_SHARED,
                    vbasedev->fd,
                    region->fd_offset + offset);
        if (*map == MAP_FAILED) {
            *map = NULL;
            ret = -errno;
        } else {
            mapped = true;
        }
    }

    if (mapped) {
        memory_region_init_ram_ptr(submem, obj, name, size, *map);
        memory_region_set_skip_dump(submem);
    } else {
        /* Create a zero sized sub-region to make cleanup easy. */
        memory_region_init(submem, obj, name, 0);
    }

    memory_region_add_subregion(mem, offset, submem);

    return ret;
}
|
|
|
|
void vfio_reset_handler(void *opaque)
|
|
{
|
|
VFIOGroup *group;
|
|
VFIODevice *vbasedev;
|
|
|
|
QLIST_FOREACH(group, &vfio_group_list, next) {
|
|
QLIST_FOREACH(vbasedev, &group->device_list, next) {
|
|
vbasedev->ops->vfio_compute_needs_reset(vbasedev);
|
|
}
|
|
}
|
|
|
|
QLIST_FOREACH(group, &vfio_group_list, next) {
|
|
QLIST_FOREACH(vbasedev, &group->device_list, next) {
|
|
if (vbasedev->needs_reset) {
|
|
vbasedev->ops->vfio_hot_reset_multi(vbasedev);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
 * Add @group to the KVM VFIO pseudo device.
 *
 * Does nothing without CONFIG_KVM or when KVM is not enabled.  The pseudo
 * device is created on first use and its descriptor cached in
 * vfio_kvm_device_fd.  Failures are reported but not propagated.
 */
static void vfio_kvm_device_add_group(VFIOGroup *group)
{
#ifdef CONFIG_KVM
    struct kvm_device_attr add_attr = {
        .group = KVM_DEV_VFIO_GROUP,
        .attr = KVM_DEV_VFIO_GROUP_ADD,
        .addr = (uint64_t)(unsigned long)&group->fd,
    };

    if (!kvm_enabled()) {
        return;
    }

    /* Lazily create the single pseudo device shared by all groups. */
    if (vfio_kvm_device_fd < 0) {
        struct kvm_create_device create = {
            .type = KVM_DEV_TYPE_VFIO,
        };

        if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &create)) {
            error_report("Failed to create KVM VFIO device: %m");
            return;
        }

        vfio_kvm_device_fd = create.fd;
    }

    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &add_attr)) {
        error_report("Failed to add group %d to KVM VFIO device: %m",
                     group->groupid);
    }
#endif
}
|
|
|
|
/*
 * Remove @group from the KVM VFIO pseudo device.
 *
 * Does nothing without CONFIG_KVM or when the pseudo device was never
 * created.  A failing ioctl is reported but not propagated.
 */
static void vfio_kvm_device_del_group(VFIOGroup *group)
{
#ifdef CONFIG_KVM
    struct kvm_device_attr del_attr = {
        .group = KVM_DEV_VFIO_GROUP,
        .attr = KVM_DEV_VFIO_GROUP_DEL,
        .addr = (uint64_t)(unsigned long)&group->fd,
    };

    if (vfio_kvm_device_fd >= 0 &&
        ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &del_attr)) {
        error_report("Failed to remove group %d from KVM VFIO device: %m",
                     group->groupid);
    }
#endif
}
|
|
|
|
/*
 * Return the VFIOAddressSpace tracking @as, creating and registering a
 * new one (with an empty container list) in vfio_address_spaces if none
 * exists yet.
 */
static VFIOAddressSpace *vfio_get_address_space(AddressSpace *as)
{
    VFIOAddressSpace *entry;
    VFIOAddressSpace *fresh;

    QLIST_FOREACH(entry, &vfio_address_spaces, list) {
        if (entry->as == as) {
            return entry;
        }
    }

    /* No suitable VFIOAddressSpace, create a new one */
    fresh = g_malloc0(sizeof(*fresh));
    fresh->as = as;
    QLIST_INIT(&fresh->containers);

    QLIST_INSERT_HEAD(&vfio_address_spaces, fresh, list);

    return fresh;
}
|
|
|
|
/*
 * Unlink and free @space once it no longer holds any container; a space
 * that still has containers is left untouched.
 */
static void vfio_put_address_space(VFIOAddressSpace *space)
{
    if (!QLIST_EMPTY(&space->containers)) {
        return;
    }

    QLIST_REMOVE(space, list);
    g_free(space);
}
|
|
|
|
/*
 * Attach @group to a VFIO container serving address space @as.
 *
 * The group is first offered to every container already attached to the
 * address space.  If none accepts it, a new container is opened on
 * /dev/vfio/vfio, an IOMMU model is selected (type1/type1v2 or sPAPR TCE)
 * and a memory listener is registered on the address space.
 *
 * Returns 0 on success or a negative errno-style value on failure.
 *
 * On every failure path errno is captured into @ret before the error is
 * reported, because the reporting call may itself change errno and the
 * function would then return a wrong code (possibly 0, i.e. success).
 * The "%m" in the messages still sees the original errno, since only an
 * assignment happens in between.
 */
static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
{
    VFIOContainer *container;
    int ret, fd;
    VFIOAddressSpace *space;

    space = vfio_get_address_space(as);

    /* Prefer sharing an existing container of this address space. */
    QLIST_FOREACH(container, &space->containers, next) {
        if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
            group->container = container;
            QLIST_INSERT_HEAD(&container->group_list, group, container_next);
            return 0;
        }
    }

    fd = qemu_open("/dev/vfio/vfio", O_RDWR);
    if (fd < 0) {
        ret = -errno;
        error_report("vfio: failed to open /dev/vfio/vfio: %m");
        goto put_space_exit;
    }

    ret = ioctl(fd, VFIO_GET_API_VERSION);
    if (ret != VFIO_API_VERSION) {
        error_report("vfio: supported vfio version: %d, "
                     "reported version: %d", VFIO_API_VERSION, ret);
        ret = -EINVAL;
        goto close_fd_exit;
    }

    container = g_malloc0(sizeof(*container));
    container->space = space;
    container->fd = fd;
    if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU) ||
        ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU)) {
        bool v2 = !!ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU);

        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
        if (ret) {
            ret = -errno;
            error_report("vfio: failed to set group container: %m");
            goto free_container_exit;
        }

        ret = ioctl(fd, VFIO_SET_IOMMU,
                    v2 ? VFIO_TYPE1v2_IOMMU : VFIO_TYPE1_IOMMU);
        if (ret) {
            ret = -errno;
            error_report("vfio: failed to set iommu for container: %m");
            goto free_container_exit;
        }

        container->iommu_data.type1.listener = vfio_memory_listener;
        container->iommu_data.release = vfio_listener_release;

        memory_listener_register(&container->iommu_data.type1.listener,
                                 container->space->as);

        /* The listener records its first mapping error in the container. */
        if (container->iommu_data.type1.error) {
            ret = container->iommu_data.type1.error;
            error_report("vfio: memory listener initialization failed for container");
            goto listener_release_exit;
        }

        container->iommu_data.type1.initialized = true;

    } else if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU)) {
        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
        if (ret) {
            ret = -errno;
            error_report("vfio: failed to set group container: %m");
            goto free_container_exit;
        }
        ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);
        if (ret) {
            ret = -errno;
            error_report("vfio: failed to set iommu for container: %m");
            goto free_container_exit;
        }

        /*
         * The host kernel code implementing VFIO_IOMMU_DISABLE is called
         * when container fd is closed so we do not call it explicitly
         * in this file.
         */
        ret = ioctl(fd, VFIO_IOMMU_ENABLE);
        if (ret) {
            ret = -errno;
            error_report("vfio: failed to enable container: %m");
            goto free_container_exit;
        }

        container->iommu_data.type1.listener = vfio_memory_listener;
        container->iommu_data.release = vfio_listener_release;

        memory_listener_register(&container->iommu_data.type1.listener,
                                 container->space->as);

    } else {
        error_report("vfio: No available IOMMU models");
        ret = -EINVAL;
        goto free_container_exit;
    }

    QLIST_INIT(&container->group_list);
    QLIST_INSERT_HEAD(&space->containers, container, next);

    group->container = container;
    QLIST_INSERT_HEAD(&container->group_list, group, container_next);

    return 0;
listener_release_exit:
    vfio_listener_release(container);

free_container_exit:
    g_free(container);

close_fd_exit:
    close(fd);

put_space_exit:
    vfio_put_address_space(space);

    return ret;
}
|
|
|
|
/*
 * Detach @group from its container.
 *
 * When the container is left without groups it is torn down as well: its
 * release hook (if any) is run, it is unlinked from the address space,
 * all remaining guest IOMMU notifiers are unregistered and freed, its fd
 * is closed, and the address space is released.
 */
static void vfio_disconnect_container(VFIOGroup *group)
{
    VFIOContainer *container = group->container;
    VFIOGuestIOMMU *giommu, *next_giommu;
    VFIOAddressSpace *space;

    if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
        error_report("vfio: error disconnecting group %d from container",
                     group->groupid);
    }

    QLIST_REMOVE(group, container_next);
    group->container = NULL;

    /* Other groups still use this container: nothing more to do. */
    if (!QLIST_EMPTY(&container->group_list)) {
        return;
    }

    space = container->space;

    if (container->iommu_data.release) {
        container->iommu_data.release(container);
    }
    QLIST_REMOVE(container, next);

    /* Drop notifiers whose list head lives in the dying container. */
    QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next,
                       next_giommu) {
        memory_region_unregister_iommu_notifier(&giommu->n);
        QLIST_REMOVE(giommu, giommu_next);
        g_free(giommu);
    }

    trace_vfio_disconnect_container(container->fd);
    close(container->fd);
    g_free(container);

    vfio_put_address_space(space);
}
|
|
|
|
/*
 * Look up or open VFIO group @groupid for use in address space @as.
 *
 * An already known group is returned as is when it lives in @as; using it
 * from a different address space is an error.  Otherwise /dev/vfio/<id>
 * is opened, the group is checked for viability, connected to a container
 * and added to vfio_group_list (registering the reset handler when the
 * list was empty).
 *
 * Returns the group, or NULL on failure.
 */
VFIOGroup *vfio_get_group(int groupid, AddressSpace *as)
{
    struct vfio_group_status status = { .argsz = sizeof(status) };
    char path[32];
    VFIOGroup *group;

    QLIST_FOREACH(group, &vfio_group_list, next) {
        if (group->groupid != groupid) {
            continue;
        }

        /* Found it.  Now is it already in the right context? */
        if (group->container->space->as != as) {
            error_report("vfio: group %d used in multiple address spaces",
                         group->groupid);
            return NULL;
        }
        return group;
    }

    group = g_malloc0(sizeof(*group));

    snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
    group->fd = qemu_open(path, O_RDWR);
    if (group->fd < 0) {
        error_report("vfio: error opening %s: %m", path);
        goto free_group_exit;
    }

    if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
        error_report("vfio: error getting group status: %m");
        goto close_fd_exit;
    }

    if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
        error_report("vfio: error, group %d is not viable, please ensure "
                     "all devices within the iommu_group are bound to their "
                     "vfio bus driver.", groupid);
        goto close_fd_exit;
    }

    group->groupid = groupid;
    QLIST_INIT(&group->device_list);

    if (vfio_connect_container(group, as)) {
        error_report("vfio: failed to setup container for group %d", groupid);
        goto close_fd_exit;
    }

    /* The first group brings the shared reset handler with it. */
    if (QLIST_EMPTY(&vfio_group_list)) {
        qemu_register_reset(vfio_reset_handler, NULL);
    }

    QLIST_INSERT_HEAD(&vfio_group_list, group, next);

    vfio_kvm_device_add_group(group);

    return group;

close_fd_exit:
    close(group->fd);

free_group_exit:
    g_free(group);

    return NULL;
}
|
|
|
|
/*
 * Release @group if it no longer has any device.
 *
 * A NULL group or a group that still has devices is left alone.
 * Otherwise the group is removed from the KVM VFIO device, disconnected
 * from its container, unlinked, closed and freed; the reset handler is
 * unregistered once no group remains.
 */
void vfio_put_group(VFIOGroup *group)
{
    if (!group) {
        return;
    }
    if (!QLIST_EMPTY(&group->device_list)) {
        return;
    }

    vfio_kvm_device_del_group(group);
    vfio_disconnect_container(group);
    QLIST_REMOVE(group, next);
    trace_vfio_put_group(group->fd);
    close(group->fd);
    g_free(group);

    /* The last group takes the shared reset handler with it. */
    if (QLIST_EMPTY(&vfio_group_list)) {
        qemu_unregister_reset(vfio_reset_handler, NULL);
    }
}
|
|
|
|
int vfio_get_device(VFIOGroup *group, const char *name,
|
|
VFIODevice *vbasedev)
|
|
{
|
|
struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
|
|
int ret, fd;
|
|
|
|
fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
|
|
if (fd < 0) {
|
|
error_report("vfio: error getting device %s from group %d: %m",
|
|
name, group->groupid);
|
|
error_printf("Verify all devices in group %d are bound to vfio-<bus> "
|
|
"or pci-stub and not already in use\n", group->groupid);
|
|
return fd;
|
|
}
|
|
|
|
ret = ioctl(fd, VFIO_DEVICE_GET_INFO, &dev_info);
|
|
if (ret) {
|
|
error_report("vfio: error getting device info: %m");
|
|
close(fd);
|
|
return ret;
|
|
}
|
|
|
|
vbasedev->fd = fd;
|
|
vbasedev->group = group;
|
|
QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);
|
|
|
|
vbasedev->num_irqs = dev_info.num_irqs;
|
|
vbasedev->num_regions = dev_info.num_regions;
|
|
vbasedev->flags = dev_info.flags;
|
|
|
|
trace_vfio_get_device(name, dev_info.flags, dev_info.num_regions,
|
|
dev_info.num_irqs);
|
|
|
|
vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Detach @vbasedev from its group and close its fd.  A device that has no
 * group is left untouched.
 */
void vfio_put_base_device(VFIODevice *vbasedev)
{
    if (vbasedev->group) {
        QLIST_REMOVE(vbasedev, next);
        vbasedev->group = NULL;
        trace_vfio_put_base_device(vbasedev->fd);
        close(vbasedev->fd);
    }
}
|
|
|
|
/*
 * Issue ioctl @req with argument @param on the container of group
 * @groupid in address space @as.
 *
 * The group is obtained with vfio_get_group() and handed back with
 * vfio_put_group() afterwards.
 *
 * Returns the ioctl result, or -1 if the group cannot be obtained or has
 * no container.
 */
static int vfio_container_do_ioctl(AddressSpace *as, int32_t groupid,
                                   int req, void *param)
{
    VFIOGroup *group = vfio_get_group(groupid, as);
    VFIOContainer *container;
    int rc = -1;

    if (!group) {
        error_report("vfio: group %d not registered", groupid);
        return rc;
    }

    container = group->container;
    if (container) {
        rc = ioctl(container->fd, req, param);
        if (rc < 0) {
            error_report("vfio: failed to ioctl %d to container: ret=%d, %s",
                         _IOC_NR(req) - VFIO_BASE, rc, strerror(errno));
        }
    }

    vfio_put_group(group);

    return rc;
}
|
|
|
|
/*
 * Forward a whitelisted ioctl to the container of group @groupid.
 *
 * Only VFIO_CHECK_EXTENSION, VFIO_IOMMU_SPAPR_TCE_GET_INFO and
 * VFIO_EEH_PE_OP are accepted; any other request is reported and
 * rejected with -1.  Accepted requests return the result of
 * vfio_container_do_ioctl().
 */
int vfio_container_ioctl(AddressSpace *as, int32_t groupid,
                         int req, void *param)
{
    bool permitted;

    /* We allow only certain ioctls to the container */
    switch (req) {
    case VFIO_CHECK_EXTENSION:
    case VFIO_IOMMU_SPAPR_TCE_GET_INFO:
    case VFIO_EEH_PE_OP:
        permitted = true;
        break;
    default:
        permitted = false;
        break;
    }

    if (!permitted) {
        /* Return an error on unknown requests */
        error_report("vfio: unsupported ioctl %X", req);
        return -1;
    }

    return vfio_container_do_ioctl(as, groupid, req, param);
}
|