From c02607aad2f9ed478eb288bcec1c00cd9df38b3c Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 29 Sep 2014 10:05:06 -0600 Subject: [PATCH 1/6] iommu: introduce domain attribute for nesting IOMMUs Some IOMMUs, such as the ARM SMMU, support two stages of translation. The idea behind such a scheme is to allow a guest operating system to use the IOMMU for DMA mappings in the first stage of translation, with the hypervisor then installing mappings in the second stage to provide isolation of the DMA to the physical range assigned to that virtual machine. In order to allow IOMMU domains to be used for second-stage translation, this patch adds a new iommu_attr (DOMAIN_ATTR_NESTING) for setting second-stage domains prior to device attach. The attribute can also be queried to see if a domain is actually making use of nesting. Acked-by: Joerg Roedel Signed-off-by: Will Deacon Signed-off-by: Alex Williamson --- include/linux/iommu.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 20f9a527922a..7b02bcc85b9e 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -80,6 +80,7 @@ enum iommu_attr { DOMAIN_ATTR_FSL_PAMU_STASH, DOMAIN_ATTR_FSL_PAMU_ENABLE, DOMAIN_ATTR_FSL_PAMUV1, + DOMAIN_ATTR_NESTING, /* two stages of translation */ DOMAIN_ATTR_MAX, }; From f5c9ecebaf2a2c9381973798e389cc019dd983e0 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 29 Sep 2014 10:06:19 -0600 Subject: [PATCH 2/6] vfio/iommu_type1: add new VFIO_TYPE1_NESTING_IOMMU IOMMU type VFIO allows devices to be safely handed off to userspace by putting them behind an IOMMU configured to ensure DMA and interrupt isolation. This enables userspace KVM clients, such as kvmtool and qemu, to further map the device into a virtual machine. With IOMMUs such as the ARM SMMU, it is then possible to provide SMMU translation services to the guest operating system, which are nested with the existing translation installed by VFIO. 
However, enabling this feature means that the IOMMU driver must be informed that the VFIO domain is being created for the purposes of nested translation. This patch adds a new IOMMU type (VFIO_TYPE1_NESTING_IOMMU) to the VFIO type-1 driver. The new IOMMU type acts identically to the VFIO_TYPE1v2_IOMMU type, but additionally sets the DOMAIN_ATTR_NESTING attribute on its IOMMU domains. Cc: Joerg Roedel Signed-off-by: Will Deacon Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 30 +++++++++++++++++++++++++----- include/uapi/linux/vfio.h | 3 +++ 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 0734fbe5b651..583ccdb2c58f 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -57,7 +57,8 @@ struct vfio_iommu { struct list_head domain_list; struct mutex lock; struct rb_root dma_list; - bool v2; + bool v2; + bool nesting; }; struct vfio_domain { @@ -705,6 +706,15 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, goto out_free; } + if (iommu->nesting) { + int attr = 1; + + ret = iommu_domain_set_attr(domain->domain, DOMAIN_ATTR_NESTING, + &attr); + if (ret) + goto out_domain; + } + ret = iommu_attach_group(domain->domain, iommu_group); if (ret) goto out_domain; @@ -819,17 +829,26 @@ static void *vfio_iommu_type1_open(unsigned long arg) { struct vfio_iommu *iommu; - if (arg != VFIO_TYPE1_IOMMU && arg != VFIO_TYPE1v2_IOMMU) - return ERR_PTR(-EINVAL); - iommu = kzalloc(sizeof(*iommu), GFP_KERNEL); if (!iommu) return ERR_PTR(-ENOMEM); + switch (arg) { + case VFIO_TYPE1_IOMMU: + break; + case VFIO_TYPE1_NESTING_IOMMU: + iommu->nesting = true; + case VFIO_TYPE1v2_IOMMU: + iommu->v2 = true; + break; + default: + kfree(iommu); + return ERR_PTR(-EINVAL); + } + INIT_LIST_HEAD(&iommu->domain_list); iommu->dma_list = RB_ROOT; mutex_init(&iommu->lock); - iommu->v2 = (arg == VFIO_TYPE1v2_IOMMU); return iommu; } @@ -885,6 +904,7 @@ static 
long vfio_iommu_type1_ioctl(void *iommu_data, switch (arg) { case VFIO_TYPE1_IOMMU: case VFIO_TYPE1v2_IOMMU: + case VFIO_TYPE1_NESTING_IOMMU: return 1; case VFIO_DMA_CC_IOMMU: if (!iommu) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 6612974c64bf..29715d27548f 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -33,6 +33,9 @@ /* Check if EEH is supported */ #define VFIO_EEH 5 +/* Two-stage IOMMU */ +#define VFIO_TYPE1_NESTING_IOMMU 6 /* Implies v2 */ + /* * The IOCTL interface is designed for extensibility by embedding the * structure length (argsz) and flags into structures passed between From 3b307ffe3faad60aeda0e9a4f661d5c1edbd761e Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Mon, 29 Sep 2014 10:13:46 -0600 Subject: [PATCH 3/6] PCI: Export MSI message relevant functions The patch exports 2 MSI message relevant functions, which will be used by VFIO PCI driver. The VFIO PCI driver would be built as a module. Signed-off-by: Gavin Shan Acked-by: Bjorn Helgaas Signed-off-by: Alex Williamson --- drivers/pci/msi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 5a40516444f3..e468d65eb238 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -312,6 +312,7 @@ void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg) __get_cached_msi_msg(entry, msg); } +EXPORT_SYMBOL_GPL(get_cached_msi_msg); void __write_msi_msg(struct msi_desc *entry, struct msi_msg *msg) { @@ -356,6 +357,7 @@ void write_msi_msg(unsigned int irq, struct msi_msg *msg) __write_msi_msg(entry, msg); } +EXPORT_SYMBOL_GPL(write_msi_msg); static void free_msi_irqs(struct pci_dev *dev) { From b8f02af096b1fc9fd46680cbe55214e477eb76d3 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Mon, 29 Sep 2014 10:16:24 -0600 Subject: [PATCH 4/6] vfio/pci: Restore MSIx message prior to enabling The MSIx vector table lives in device memory, which may be cleared as part of a backdoor device reset. 
This is the case on the IBM IPR HBA when the BIST is run on the device. When assigned to a QEMU guest, the guest driver does a pci_save_state(), issues a BIST, then does a pci_restore_state(). The BIST clears the MSIx vector table, but due to the way interrupts are configured the pci_restore_state() does not restore the vector table as expected. Eventually this results in an EEH error on Power platforms when the device attempts to signal an interrupt with the zero'd table entry. Fix the problem by restoring the host cached MSI message prior to enabling each vector. Reported-by: Wen Xiong Signed-off-by: Gavin Shan Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_intrs.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 9dd49c9839ac..553212f037c3 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -548,6 +549,20 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev, return PTR_ERR(trigger); } + /* + * The MSIx vector table resides in device memory which may be cleared + * via backdoor resets. We don't allow direct access to the vector + * table so even if a userspace driver attempts to save/restore around + * such a reset it would be unsuccessful. To avoid this, restore the + * cached value of the message prior to enabling. 
+ */ + if (msix) { + struct msi_msg msg; + + get_cached_msi_msg(irq, &msg); + write_msi_msg(irq, &msg); + } + ret = request_irq(irq, vfio_msihandler, 0, vdev->ctx[vector].name, trigger); if (ret) { From 0f905ce2b59c666ad48f240bfa2ab28b77f7f936 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Mon, 29 Sep 2014 10:31:51 -0600 Subject: [PATCH 5/6] drivers/vfio: Export vfio_spapr_iommu_eeh_ioctl() with GPL The function should have been exported with EXPORT_SYMBOL_GPL() as part of commit 92d18a6851fb ("drivers/vfio: Fix EEH build error"). Suggested-by: Alexey Kardashevskiy Signed-off-by: Gavin Shan Signed-off-by: Alex Williamson --- drivers/vfio/vfio_spapr_eeh.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/vfio_spapr_eeh.c b/drivers/vfio/vfio_spapr_eeh.c index 86dfceb9201f..5fa42db769ee 100644 --- a/drivers/vfio/vfio_spapr_eeh.c +++ b/drivers/vfio/vfio_spapr_eeh.c @@ -92,7 +92,7 @@ long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, return ret; } -EXPORT_SYMBOL(vfio_spapr_iommu_eeh_ioctl); +EXPORT_SYMBOL_GPL(vfio_spapr_iommu_eeh_ioctl); MODULE_VERSION(DRIVER_VERSION); MODULE_LICENSE("GPL v2"); From 93899a679fd6b2534b5c297d9316bae039ebcbe1 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 29 Sep 2014 17:18:39 -0600 Subject: [PATCH 6/6] vfio-pci: Fix remove path locking Locking both the remove() and release() path results in a deadlock that should have been obvious. To fix this we can get and hold the vfio_device reference as we evaluate whether to do a bus/slot reset. This will automatically block any remove() calls, allowing us to remove the explicit lock. Fixes 61d792562b53. 
Signed-off-by: Alex Williamson Cc: stable@vger.kernel.org [3.17] --- drivers/vfio/pci/vfio_pci.c | 138 +++++++++++++++--------------------- 1 file changed, 58 insertions(+), 80 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index f7825332a325..9558da3f06a0 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -876,15 +876,11 @@ static void vfio_pci_remove(struct pci_dev *pdev) { struct vfio_pci_device *vdev; - mutex_lock(&driver_lock); - vdev = vfio_del_group_dev(&pdev->dev); if (vdev) { iommu_group_put(pdev->dev.iommu_group); kfree(vdev); } - - mutex_unlock(&driver_lock); } static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, @@ -927,108 +923,90 @@ static struct pci_driver vfio_pci_driver = { .err_handler = &vfio_err_handlers, }; -/* - * Test whether a reset is necessary and possible. We mark devices as - * needs_reset when they are released, but don't have a function-local reset - * available. If any of these exist in the affected devices, we want to do - * a bus/slot reset. We also need all of the affected devices to be unused, - * so we abort if any device has a non-zero refcnt. driver_lock prevents a - * device from being opened during the scan or unbound from vfio-pci. - */ -static int vfio_pci_test_bus_reset(struct pci_dev *pdev, void *data) -{ - bool *needs_reset = data; - struct pci_driver *pci_drv = ACCESS_ONCE(pdev->driver); - int ret = -EBUSY; - - if (pci_drv == &vfio_pci_driver) { - struct vfio_device *device; - struct vfio_pci_device *vdev; - - device = vfio_device_get_from_dev(&pdev->dev); - if (!device) - return ret; - - vdev = vfio_device_data(device); - if (vdev) { - if (vdev->needs_reset) - *needs_reset = true; - - if (!vdev->refcnt) - ret = 0; - } - - vfio_device_put(device); - } - - /* - * TODO: vfio-core considers groups to be viable even if some devices - * are attached to known drivers, like pci-stub or pcieport. 
We can't - * freeze devices from being unbound to those drivers like we can - * here though, so it would be racy to test for them. We also can't - * use device_lock() to prevent changes as that would interfere with - * PCI-core taking device_lock during bus reset. For now, we require - * devices to be bound to vfio-pci to get a bus/slot reset on release. - */ - - return ret; -} - -/* Clear needs_reset on all affected devices after successful bus/slot reset */ -static int vfio_pci_clear_needs_reset(struct pci_dev *pdev, void *data) +struct vfio_devices { + struct vfio_device **devices; + int cur_index; + int max_index; +}; + +static int vfio_pci_get_devs(struct pci_dev *pdev, void *data) { + struct vfio_devices *devs = data; struct pci_driver *pci_drv = ACCESS_ONCE(pdev->driver); - if (pci_drv == &vfio_pci_driver) { - struct vfio_device *device; - struct vfio_pci_device *vdev; + if (pci_drv != &vfio_pci_driver) + return -EBUSY; - device = vfio_device_get_from_dev(&pdev->dev); - if (!device) - return 0; + if (devs->cur_index == devs->max_index) + return -ENOSPC; - vdev = vfio_device_data(device); - if (vdev) - vdev->needs_reset = false; - - vfio_device_put(device); - } + devs->devices[devs->cur_index] = vfio_device_get_from_dev(&pdev->dev); + if (!devs->devices[devs->cur_index]) + return -EINVAL; + devs->cur_index++; return 0; } /* * Attempt to do a bus/slot reset if there are devices affected by a reset for * this device that are needs_reset and all of the affected devices are unused - * (!refcnt). Callers of this function are required to hold driver_lock such - * that devices can not be unbound from vfio-pci or opened by a user while we - * test for and perform a bus/slot reset. + * (!refcnt). Callers are required to hold driver_lock when calling this to + * prevent device opens and concurrent bus reset attempts. We prevent device + * unbinds by acquiring and holding a reference to the vfio_device. 
+ * + * NB: vfio-core considers a group to be viable even if some devices are + * bound to drivers like pci-stub or pcieport. Here we require all devices + * to be bound to vfio_pci since that's the only way we can be sure they + * stay put. */ static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev) { + struct vfio_devices devs = { .cur_index = 0 }; + int i = 0, ret = -EINVAL; bool needs_reset = false, slot = false; - int ret; + struct vfio_pci_device *tmp; if (!pci_probe_reset_slot(vdev->pdev->slot)) slot = true; else if (pci_probe_reset_bus(vdev->pdev->bus)) return; + if (vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs, + &i, slot) || !i) + return; + + devs.max_index = i; + devs.devices = kcalloc(i, sizeof(struct vfio_device *), GFP_KERNEL); + if (!devs.devices) + return; + if (vfio_pci_for_each_slot_or_bus(vdev->pdev, - vfio_pci_test_bus_reset, - &needs_reset, slot) || !needs_reset) - return; + vfio_pci_get_devs, &devs, slot)) + goto put_devs; - if (slot) - ret = pci_try_reset_slot(vdev->pdev->slot); - else - ret = pci_try_reset_bus(vdev->pdev->bus); + for (i = 0; i < devs.cur_index; i++) { + tmp = vfio_device_data(devs.devices[i]); + if (tmp->needs_reset) + needs_reset = true; + if (tmp->refcnt) + goto put_devs; + } - if (ret) - return; + if (needs_reset) + ret = slot ? pci_try_reset_slot(vdev->pdev->slot) : + pci_try_reset_bus(vdev->pdev->bus); - vfio_pci_for_each_slot_or_bus(vdev->pdev, - vfio_pci_clear_needs_reset, NULL, slot); +put_devs: + for (i = 0; i < devs.cur_index; i++) { + if (!ret) { + tmp = vfio_device_data(devs.devices[i]); + tmp->needs_reset = false; + } + vfio_device_put(devs.devices[i]); + } + + kfree(devs.devices); } static void __exit vfio_pci_cleanup(void)