From 698feb5e13a2d763369909ce33f2bd7a7c1c11c0 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 7 Apr 2017 18:59:07 +0800 Subject: [PATCH 01/15] memory: add section range info for IOMMU notifier In this patch, IOMMUNotifier.{start|end} are introduced to store section information for a specific notifier. When notification occurs, we not only check the notification type (MAP|UNMAP), but also check whether the notified iova range overlaps with the range of specific IOMMU notifier, and skip those notifiers if not in the listened range. When removing an region, we need to make sure we removed the correct VFIOGuestIOMMU by checking the IOMMUNotifier.start address as well. This patch is solving the problem that vfio-pci devices receive duplicated UNMAP notification on x86 platform when vIOMMU is there. The issue is that x86 IOMMU has a (0, 2^64-1) IOMMU region, which is splitted by the (0xfee00000, 0xfeefffff) IRQ region. AFAIK this (splitted IOMMU region) is only happening on x86. This patch also helps vhost to leverage the new interface as well, so that vhost won't get duplicated cache flushes. In that sense, it's an slight performance improvement. Suggested-by: David Gibson Reviewed-by: Eric Auger Reviewed-by: Michael S. Tsirkin Acked-by: Alex Williamson Signed-off-by: Peter Xu Message-Id: <1491562755-23867-2-git-send-email-peterx@redhat.com> [ehabkost: included extra vhost_iommu_region_del() change from Peter Xu] Signed-off-by: Eduardo Habkost --- hw/vfio/common.c | 12 +++++++++--- hw/virtio/vhost.c | 13 ++++++++++--- include/exec/memory.h | 19 ++++++++++++++++++- memory.c | 9 +++++++++ 4 files changed, 46 insertions(+), 7 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index f3ba9b9007..6b33b9f55d 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -478,8 +478,13 @@ static void vfio_listener_region_add(MemoryListener *listener, giommu->iommu_offset = section->offset_within_address_space - section->offset_within_region; giommu->container = container; - giommu->n.notify = vfio_iommu_map_notify; - giommu->n.notifier_flags = IOMMU_NOTIFIER_ALL; + llend = int128_add(int128_make64(section->offset_within_region), + section->size); + llend = int128_sub(llend, int128_one()); + iommu_notifier_init(&giommu->n, vfio_iommu_map_notify, + IOMMU_NOTIFIER_ALL, + section->offset_within_region, + int128_get64(llend)); QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next); memory_region_register_iommu_notifier(giommu->iommu, &giommu->n); @@ -550,7 +555,8 @@ static void vfio_listener_region_del(MemoryListener *listener, VFIOGuestIOMMU *giommu; QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) { - if (giommu->iommu == section->mr) { + if (giommu->iommu == section->mr && + giommu->n.start == section->offset_within_region) { memory_region_unregister_iommu_notifier(giommu->iommu, &giommu->n); QLIST_REMOVE(giommu, giommu_next); diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 613494dcc2..0001e60b77 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -736,14 +736,20 @@ static void vhost_iommu_region_add(MemoryListener *listener, struct vhost_dev *dev = container_of(listener, struct vhost_dev, iommu_listener); struct vhost_iommu *iommu; + Int128 end; if (!memory_region_is_iommu(section->mr)) { return; } iommu = g_malloc0(sizeof(*iommu)); - iommu->n.notify = vhost_iommu_unmap_notify; - iommu->n.notifier_flags = IOMMU_NOTIFIER_UNMAP; + end = int128_add(int128_make64(section->offset_within_region), + section->size); + end = int128_sub(end, int128_one()); + iommu_notifier_init(&iommu->n, vhost_iommu_unmap_notify, + IOMMU_NOTIFIER_UNMAP, + section->offset_within_region, + int128_get64(end)); iommu->mr = section->mr; iommu->iommu_offset = section->offset_within_address_space - section->offset_within_region; @@ -765,7 +771,8 @@ static void vhost_iommu_region_del(MemoryListener *listener, } QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) { - if (iommu->mr == section->mr) { + if (iommu->mr == section->mr && + iommu->n.start == section->offset_within_region) { memory_region_unregister_iommu_notifier(iommu->mr, &iommu->n); QLIST_REMOVE(iommu, iommu_next); diff --git a/include/exec/memory.h b/include/exec/memory.h index f20b191793..0840c89489 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -77,13 +77,30 @@ typedef enum { #define IOMMU_NOTIFIER_ALL (IOMMU_NOTIFIER_MAP | IOMMU_NOTIFIER_UNMAP) +struct IOMMUNotifier; +typedef void (*IOMMUNotify)(struct IOMMUNotifier *notifier, + IOMMUTLBEntry *data); + struct IOMMUNotifier { - void (*notify)(struct IOMMUNotifier *notifier, IOMMUTLBEntry *data); + IOMMUNotify notify; IOMMUNotifierFlag notifier_flags; + /* Notify for address space range start <= addr <= end */ + hwaddr start; + hwaddr end; QLIST_ENTRY(IOMMUNotifier) node; }; typedef struct IOMMUNotifier IOMMUNotifier; +static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn, + IOMMUNotifierFlag flags, + hwaddr start, hwaddr end) +{ + n->notify = fn; + n->notifier_flags = flags; + n->start = start; + n->end = end; +} + /* New-style MMIO accessors can indicate that the transaction failed. * A zero (MEMTX_OK) response means success; anything else is a failure * of some kind. The memory subsystem will bitwise-OR together results diff --git a/memory.c b/memory.c index 4c95aaf39c..75ac595073 100644 --- a/memory.c +++ b/memory.c @@ -1606,6 +1606,7 @@ void memory_region_register_iommu_notifier(MemoryRegion *mr, /* We need to register for at least one bitfield */ assert(n->notifier_flags != IOMMU_NOTIFIER_NONE); + assert(n->start <= n->end); QLIST_INSERT_HEAD(&mr->iommu_notify, n, node); memory_region_update_iommu_notify_flags(mr); } @@ -1667,6 +1668,14 @@ void memory_region_notify_iommu(MemoryRegion *mr, } QLIST_FOREACH(iommu_notifier, &mr->iommu_notify, node) { + /* + * Skip the notification if the notification does not overlap + * with registered range. + */ + if (iommu_notifier->start > entry.iova + entry.addr_mask + 1 || + iommu_notifier->end < entry.iova) { + continue; + } if (iommu_notifier->notifier_flags & request_flags) { iommu_notifier->notify(iommu_notifier, &entry); } From 512fa40867e6118568756a81ddaf476a0fef0f32 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 7 Apr 2017 18:59:08 +0800 Subject: [PATCH 02/15] memory: provide IOMMU_NOTIFIER_FOREACH macro A new macro is provided to iterate all the IOMMU notifiers hooked under specific IOMMU memory region. Reviewed-by: David Gibson Reviewed-by: Eric Auger Reviewed-by: \"Michael S. Tsirkin\" Signed-off-by: Peter Xu Message-Id: <1491562755-23867-3-git-send-email-peterx@redhat.com> Signed-off-by: Eduardo Habkost --- include/exec/memory.h | 3 +++ memory.c | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/include/exec/memory.h b/include/exec/memory.h index 0840c89489..07e43daf20 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -239,6 +239,9 @@ struct MemoryRegion { IOMMUNotifierFlag iommu_notify_flags; }; +#define IOMMU_NOTIFIER_FOREACH(n, mr) \ + QLIST_FOREACH((n), &(mr)->iommu_notify, node) + /** * MemoryListener: callbacks structure for updates to the physical memory map * diff --git a/memory.c b/memory.c index 75ac595073..7496b3d3d5 100644 --- a/memory.c +++ b/memory.c @@ -1583,7 +1583,7 @@ static void memory_region_update_iommu_notify_flags(MemoryRegion *mr) IOMMUNotifierFlag flags = IOMMU_NOTIFIER_NONE; IOMMUNotifier *iommu_notifier; - QLIST_FOREACH(iommu_notifier, &mr->iommu_notify, node) { + IOMMU_NOTIFIER_FOREACH(iommu_notifier, mr) { flags |= iommu_notifier->notifier_flags; } @@ -1667,7 +1667,7 @@ void memory_region_notify_iommu(MemoryRegion *mr, request_flags = IOMMU_NOTIFIER_UNMAP; } - QLIST_FOREACH(iommu_notifier, &mr->iommu_notify, node) { + IOMMU_NOTIFIER_FOREACH(iommu_notifier, mr) { /* * Skip the notification if the notification does not overlap * with registered range. From de472e4a92f780d02b894001e004f4b4a350ec38 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 7 Apr 2017 18:59:09 +0800 Subject: [PATCH 03/15] memory: provide iommu_replay_all() This is an "global" version of existing memory_region_iommu_replay() - we announce the translations to all the registered notifiers, instead of a specific one. Reviewed-by: David Gibson Reviewed-by: \"Michael S. Tsirkin\" Signed-off-by: Peter Xu Message-Id: <1491562755-23867-4-git-send-email-peterx@redhat.com> Signed-off-by: Eduardo Habkost --- include/exec/memory.h | 8 ++++++++ memory.c | 9 +++++++++ 2 files changed, 17 insertions(+) diff --git a/include/exec/memory.h b/include/exec/memory.h index 07e43daf20..fb7dff3405 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -712,6 +712,14 @@ void memory_region_register_iommu_notifier(MemoryRegion *mr, void memory_region_iommu_replay(MemoryRegion *mr, IOMMUNotifier *n, bool is_write); +/** + * memory_region_iommu_replay_all: replay existing IOMMU translations + * to all the notifiers registered. + * + * @mr: the memory region to observe + */ +void memory_region_iommu_replay_all(MemoryRegion *mr); + /** * memory_region_unregister_iommu_notifier: unregister a notifier for * changes to IOMMU translation entries. diff --git a/memory.c b/memory.c index 7496b3d3d5..b4ed67b27a 100644 --- a/memory.c +++ b/memory.c @@ -1642,6 +1642,15 @@ void memory_region_iommu_replay(MemoryRegion *mr, IOMMUNotifier *n, } } +void memory_region_iommu_replay_all(MemoryRegion *mr) +{ + IOMMUNotifier *notifier; + + IOMMU_NOTIFIER_FOREACH(notifier, mr) { + memory_region_iommu_replay(mr, notifier, false); + } +} + void memory_region_unregister_iommu_notifier(MemoryRegion *mr, IOMMUNotifier *n) { From bd2bfa4c52e5f4dc6dbaa5be0521aedc31cb53d9 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 7 Apr 2017 18:59:10 +0800 Subject: [PATCH 04/15] memory: introduce memory_region_notify_one() Generalizing the notify logic in memory_region_notify_iommu() into a single function. This can be further used in customized replay() functions for IOMMUs. Reviewed-by: David Gibson Reviewed-by: Eric Auger Reviewed-by: \"Michael S. Tsirkin\" Signed-off-by: Peter Xu Message-Id: <1491562755-23867-5-git-send-email-peterx@redhat.com> Signed-off-by: Eduardo Habkost --- include/exec/memory.h | 15 +++++++++++++++ memory.c | 40 ++++++++++++++++++++++++---------------- 2 files changed, 39 insertions(+), 16 deletions(-) diff --git a/include/exec/memory.h b/include/exec/memory.h index fb7dff3405..055b3a8b23 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -687,6 +687,21 @@ uint64_t memory_region_iommu_get_min_page_size(MemoryRegion *mr); void memory_region_notify_iommu(MemoryRegion *mr, IOMMUTLBEntry entry); +/** + * memory_region_notify_one: notify a change in an IOMMU translation + * entry to a single notifier + * + * This works just like memory_region_notify_iommu(), but it only + * notifies a specific notifier, not all of them. + * + * @notifier: the notifier to be notified + * @entry: the new entry in the IOMMU translation table. The entry + * replaces all old entries for the same virtual I/O address range. + * Deleted entries have .@perm == 0. + */ +void memory_region_notify_one(IOMMUNotifier *notifier, + IOMMUTLBEntry *entry); + /** * memory_region_register_iommu_notifier: register a notifier for changes to * IOMMU translation entries. diff --git a/memory.c b/memory.c index b4ed67b27a..ded4bf13d6 100644 --- a/memory.c +++ b/memory.c @@ -1662,32 +1662,40 @@ void memory_region_unregister_iommu_notifier(MemoryRegion *mr, memory_region_update_iommu_notify_flags(mr); } -void memory_region_notify_iommu(MemoryRegion *mr, - IOMMUTLBEntry entry) +void memory_region_notify_one(IOMMUNotifier *notifier, + IOMMUTLBEntry *entry) { - IOMMUNotifier *iommu_notifier; IOMMUNotifierFlag request_flags; - assert(memory_region_is_iommu(mr)); + /* + * Skip the notification if the notification does not overlap + * with registered range. + */ + if (notifier->start > entry->iova + entry->addr_mask + 1 || + notifier->end < entry->iova) { + return; + } - if (entry.perm & IOMMU_RW) { + if (entry->perm & IOMMU_RW) { request_flags = IOMMU_NOTIFIER_MAP; } else { request_flags = IOMMU_NOTIFIER_UNMAP; } + if (notifier->notifier_flags & request_flags) { + notifier->notify(notifier, entry); + } +} + +void memory_region_notify_iommu(MemoryRegion *mr, + IOMMUTLBEntry entry) +{ + IOMMUNotifier *iommu_notifier; + + assert(memory_region_is_iommu(mr)); + IOMMU_NOTIFIER_FOREACH(iommu_notifier, mr) { - /* - * Skip the notification if the notification does not overlap - * with registered range. - */ - if (iommu_notifier->start > entry.iova + entry.addr_mask + 1 || - iommu_notifier->end < entry.iova) { - continue; - } - if (iommu_notifier->notifier_flags & request_flags) { - iommu_notifier->notify(iommu_notifier, &entry); - } + memory_region_notify_one(iommu_notifier, &entry); } } From faa362e3cc94bf739a89b457693e3fbd7a4b95c4 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 7 Apr 2017 18:59:11 +0800 Subject: [PATCH 05/15] memory: add MemoryRegionIOMMUOps.replay() callback Originally we have one memory_region_iommu_replay() function, which is the default behavior to replay the translations of the whole IOMMU region. However, on some platform like x86, we may want our own replay logic for IOMMU regions. This patch adds one more hook for IOMMUOps for the callback, and it'll override the default if set. Reviewed-by: David Gibson Reviewed-by: Eric Auger Reviewed-by: \"Michael S. Tsirkin\" Signed-off-by: Peter Xu Message-Id: <1491562755-23867-6-git-send-email-peterx@redhat.com> Signed-off-by: Eduardo Habkost --- include/exec/memory.h | 2 ++ memory.c | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/include/exec/memory.h b/include/exec/memory.h index 055b3a8b23..c0280b7064 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -191,6 +191,8 @@ struct MemoryRegionIOMMUOps { void (*notify_flag_changed)(MemoryRegion *iommu, IOMMUNotifierFlag old_flags, IOMMUNotifierFlag new_flags); + /* Set this up to provide customized IOMMU replay function */ + void (*replay)(MemoryRegion *iommu, IOMMUNotifier *notifier); }; typedef struct CoalescedMemoryRange CoalescedMemoryRange; diff --git a/memory.c b/memory.c index ded4bf13d6..b782d5bc5a 100644 --- a/memory.c +++ b/memory.c @@ -1626,6 +1626,12 @@ void memory_region_iommu_replay(MemoryRegion *mr, IOMMUNotifier *n, hwaddr addr, granularity; IOMMUTLBEntry iotlb; + /* If the IOMMU has its own replay callback, override */ + if (mr->iommu_ops->replay) { + mr->iommu_ops->replay(mr, n); + return; + } + granularity = memory_region_iommu_get_min_page_size(mr); for (addr = 0; addr < memory_region_size(mr); addr += granularity) { From 10315b9b28c28655ce64500281f4a028d0f8c5ff Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 7 Apr 2017 18:59:12 +0800 Subject: [PATCH 06/15] intel_iommu: use the correct memory region for device IOTLB notification We have a specific memory region for DMAR now, so it's wrong to trigger the notifier with the root region. Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Richard Henderson Cc: Eduardo Habkost Signed-off-by: Jason Wang Reviewed-by: Peter Xu Reviewed-by: \"Michael S. Tsirkin\" Signed-off-by: Peter Xu Message-Id: <1491562755-23867-7-git-send-email-peterx@redhat.com> Signed-off-by: Eduardo Habkost --- hw/i386/intel_iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 22d8226e43..2412df4182 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -1457,7 +1457,7 @@ static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s, entry.iova = addr; entry.perm = IOMMU_NONE; entry.translated_addr = 0; - memory_region_notify_iommu(entry.target_as->root, entry); + memory_region_notify_iommu(&vtd_dev_as->iommu, entry); done: return true; From f06a696dc958dd80f7eaf5be66fdefac77741ee0 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 7 Apr 2017 18:59:13 +0800 Subject: [PATCH 07/15] intel_iommu: provide its own replay() callback The default replay() don't work for VT-d since vt-d will have a huge default memory region which covers address range 0-(2^64-1). This will normally consumes a lot of time (which looks like a dead loop). The solution is simple - we don't walk over all the regions. Instead, we jump over the regions when we found that the page directories are empty. It'll greatly reduce the time to walk the whole region. To achieve this, we provided a page walk helper to do that, invoking corresponding hook function when we found an page we are interested in. vtd_page_walk_level() is the core logic for the page walking. It's interface is designed to suite further use case, e.g., to invalidate a range of addresses. Reviewed-by: Jason Wang Reviewed-by: David Gibson Reviewed-by: \"Michael S. Tsirkin\" Signed-off-by: Peter Xu Message-Id: <1491562755-23867-8-git-send-email-peterx@redhat.com> Signed-off-by: Eduardo Habkost --- hw/i386/intel_iommu.c | 182 ++++++++++++++++++++++++++++++++++++++++-- hw/i386/trace-events | 7 ++ include/exec/memory.h | 2 + 3 files changed, 186 insertions(+), 5 deletions(-) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 2412df4182..7af4e22958 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -595,6 +595,22 @@ static inline uint32_t vtd_get_agaw_from_context_entry(VTDContextEntry *ce) return 30 + (ce->hi & VTD_CONTEXT_ENTRY_AW) * 9; } +static inline uint64_t vtd_iova_limit(VTDContextEntry *ce) +{ + uint32_t ce_agaw = vtd_get_agaw_from_context_entry(ce); + return 1ULL << MIN(ce_agaw, VTD_MGAW); +} + +/* Return true if IOVA passes range check, otherwise false. */ +static inline bool vtd_iova_range_check(uint64_t iova, VTDContextEntry *ce) +{ + /* + * Check if @iova is above 2^X-1, where X is the minimum of MGAW + * in CAP_REG and AW in context-entry. + */ + return !(iova & ~(vtd_iova_limit(ce) - 1)); +} + static const uint64_t vtd_paging_entry_rsvd_field[] = { [0] = ~0ULL, /* For not large page */ @@ -630,13 +646,9 @@ static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write, uint32_t level = vtd_get_level_from_context_entry(ce); uint32_t offset; uint64_t slpte; - uint32_t ce_agaw = vtd_get_agaw_from_context_entry(ce); uint64_t access_right_check; - /* Check if @iova is above 2^X-1, where X is the minimum of MGAW - * in CAP_REG and AW in context-entry. - */ - if (iova & ~((1ULL << MIN(ce_agaw, VTD_MGAW)) - 1)) { + if (!vtd_iova_range_check(iova, ce)) { VTD_DPRINTF(GENERAL, "error: iova 0x%"PRIx64 " exceeds limits", iova); return -VTD_FR_ADDR_BEYOND_MGAW; } @@ -684,6 +696,134 @@ static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write, } } +typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private); + +/** + * vtd_page_walk_level - walk over specific level for IOVA range + * + * @addr: base GPA addr to start the walk + * @start: IOVA range start address + * @end: IOVA range end address (start <= addr < end) + * @hook_fn: hook func to be called when detected page + * @private: private data to be passed into hook func + * @read: whether parent level has read permission + * @write: whether parent level has write permission + * @notify_unmap: whether we should notify invalid entries + */ +static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, + uint64_t end, vtd_page_walk_hook hook_fn, + void *private, uint32_t level, + bool read, bool write, bool notify_unmap) +{ + bool read_cur, write_cur, entry_valid; + uint32_t offset; + uint64_t slpte; + uint64_t subpage_size, subpage_mask; + IOMMUTLBEntry entry; + uint64_t iova = start; + uint64_t iova_next; + int ret = 0; + + trace_vtd_page_walk_level(addr, level, start, end); + + subpage_size = 1ULL << vtd_slpt_level_shift(level); + subpage_mask = vtd_slpt_level_page_mask(level); + + while (iova < end) { + iova_next = (iova & subpage_mask) + subpage_size; + + offset = vtd_iova_level_offset(iova, level); + slpte = vtd_get_slpte(addr, offset); + + if (slpte == (uint64_t)-1) { + trace_vtd_page_walk_skip_read(iova, iova_next); + goto next; + } + + if (vtd_slpte_nonzero_rsvd(slpte, level)) { + trace_vtd_page_walk_skip_reserve(iova, iova_next); + goto next; + } + + /* Permissions are stacked with parents' */ + read_cur = read && (slpte & VTD_SL_R); + write_cur = write && (slpte & VTD_SL_W); + + /* + * As long as we have either read/write permission, this is a + * valid entry. The rule works for both page entries and page + * table entries. + */ + entry_valid = read_cur | write_cur; + + if (vtd_is_last_slpte(slpte, level)) { + entry.target_as = &address_space_memory; + entry.iova = iova & subpage_mask; + /* NOTE: this is only meaningful if entry_valid == true */ + entry.translated_addr = vtd_get_slpte_addr(slpte); + entry.addr_mask = ~subpage_mask; + entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur); + if (!entry_valid && !notify_unmap) { + trace_vtd_page_walk_skip_perm(iova, iova_next); + goto next; + } + trace_vtd_page_walk_one(level, entry.iova, entry.translated_addr, + entry.addr_mask, entry.perm); + if (hook_fn) { + ret = hook_fn(&entry, private); + if (ret < 0) { + return ret; + } + } + } else { + if (!entry_valid) { + trace_vtd_page_walk_skip_perm(iova, iova_next); + goto next; + } + ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte), iova, + MIN(iova_next, end), hook_fn, private, + level - 1, read_cur, write_cur, + notify_unmap); + if (ret < 0) { + return ret; + } + } + +next: + iova = iova_next; + } + + return 0; +} + +/** + * vtd_page_walk - walk specific IOVA range, and call the hook + * + * @ce: context entry to walk upon + * @start: IOVA address to start the walk + * @end: IOVA range end address (start <= addr < end) + * @hook_fn: the hook that to be called for each detected area + * @private: private data for the hook function + */ +static int vtd_page_walk(VTDContextEntry *ce, uint64_t start, uint64_t end, + vtd_page_walk_hook hook_fn, void *private) +{ + dma_addr_t addr = vtd_get_slpt_base_from_context(ce); + uint32_t level = vtd_get_level_from_context_entry(ce); + + if (!vtd_iova_range_check(start, ce)) { + return -VTD_FR_ADDR_BEYOND_MGAW; + } + + if (!vtd_iova_range_check(end, ce)) { + /* Fix end so that it reaches the maximum */ + end = vtd_iova_limit(ce); + } + + return vtd_page_walk_level(addr, start, end, hook_fn, private, + level, true, true, false); +} + /* Map a device to its corresponding domain (context-entry) */ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, uint8_t devfn, VTDContextEntry *ce) @@ -2402,6 +2542,37 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn) return vtd_dev_as; } +static int vtd_replay_hook(IOMMUTLBEntry *entry, void *private) +{ + memory_region_notify_one((IOMMUNotifier *)private, entry); + return 0; +} + +static void vtd_iommu_replay(MemoryRegion *mr, IOMMUNotifier *n) +{ + VTDAddressSpace *vtd_as = container_of(mr, VTDAddressSpace, iommu); + IntelIOMMUState *s = vtd_as->iommu_state; + uint8_t bus_n = pci_bus_num(vtd_as->bus); + VTDContextEntry ce; + + if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) { + /* + * Scanned a valid context entry, walk over the pages and + * notify when needed. + */ + trace_vtd_replay_ce_valid(bus_n, PCI_SLOT(vtd_as->devfn), + PCI_FUNC(vtd_as->devfn), + VTD_CONTEXT_ENTRY_DID(ce.hi), + ce.hi, ce.lo); + vtd_page_walk(&ce, 0, ~0ULL, vtd_replay_hook, (void *)n); + } else { + trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn), + PCI_FUNC(vtd_as->devfn)); + } + + return; +} + /* Do the initialization. It will also be called when reset, so pay * attention when adding new initialization stuff. */ @@ -2416,6 +2587,7 @@ static void vtd_init(IntelIOMMUState *s) s->iommu_ops.translate = vtd_iommu_translate; s->iommu_ops.notify_flag_changed = vtd_iommu_notify_flag_changed; + s->iommu_ops.replay = vtd_iommu_replay; s->root = 0; s->root_extended = false; s->dmar_enabled = false; diff --git a/hw/i386/trace-events b/hw/i386/trace-events index baed874a80..f725bca33e 100644 --- a/hw/i386/trace-events +++ b/hw/i386/trace-events @@ -30,6 +30,13 @@ vtd_iotlb_cc_hit(uint8_t bus, uint8_t devfn, uint64_t high, uint64_t low, uint32 vtd_iotlb_cc_update(uint8_t bus, uint8_t devfn, uint64_t high, uint64_t low, uint32_t gen1, uint32_t gen2) "IOTLB context update bus 0x%"PRIx8" devfn 0x%"PRIx8" high 0x%"PRIx64" low 0x%"PRIx64" gen %"PRIu32" -> gen %"PRIu32 vtd_iotlb_reset(const char *reason) "IOTLB reset (reason: %s)" vtd_fault_disabled(void) "Fault processing disabled for context entry" +vtd_replay_ce_valid(uint8_t bus, uint8_t dev, uint8_t fn, uint16_t domain, uint64_t hi, uint64_t lo) "replay valid context device %02"PRIx8":%02"PRIx8".%02"PRIx8" domain 0x%"PRIx16" hi 0x%"PRIx64" lo 0x%"PRIx64 +vtd_replay_ce_invalid(uint8_t bus, uint8_t dev, uint8_t fn) "replay invalid context device %02"PRIx8":%02"PRIx8".%02"PRIx8 +vtd_page_walk_level(uint64_t addr, uint32_t level, uint64_t start, uint64_t end) "walk (base=0x%"PRIx64", level=%"PRIu32") iova range 0x%"PRIx64" - 0x%"PRIx64 +vtd_page_walk_one(uint32_t level, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "detected page level 0x%"PRIx32" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d" +vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to unable to read" +vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to perm empty" +vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set" # hw/i386/amd_iommu.c amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" + offset 0x%"PRIx32 diff --git a/include/exec/memory.h b/include/exec/memory.h index c0280b7064..c4fc94d504 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -55,6 +55,8 @@ typedef enum { IOMMU_RW = 3, } IOMMUAccessFlags; +#define IOMMU_ACCESS_FLAG(r, w) (((r) ? IOMMU_RO : 0) | ((w) ? IOMMU_WO : 0)) + struct IOMMUTLBEntry { AddressSpace *target_as; hwaddr iova; From 558e0024a428a8f21605dc8aa026612ccc0f14cd Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 7 Apr 2017 18:59:14 +0800 Subject: [PATCH 08/15] intel_iommu: allow dynamic switch of IOMMU region This is preparation work to finally enabled dynamic switching ON/OFF for VT-d protection. The old VT-d codes is using static IOMMU address space, and that won't satisfy vfio-pci device listeners. Let me explain. vfio-pci devices depend on the memory region listener and IOMMU replay mechanism to make sure the device mapping is coherent with the guest even if there are domain switches. And there are two kinds of domain switches: (1) switch from domain A -> B (2) switch from domain A -> no domain (e.g., turn DMAR off) Case (1) is handled by the context entry invalidation handling by the VT-d replay logic. What the replay function should do here is to replay the existing page mappings in domain B. However for case (2), we don't want to replay any domain mappings - we just need the default GPA->HPA mappings (the address_space_memory mapping). And this patch helps on case (2) to build up the mapping automatically by leveraging the vfio-pci memory listeners. Another important thing that this patch does is to seperate IR (Interrupt Remapping) from DMAR (DMA Remapping). IR region should not depend on the DMAR region (like before this patch). It should be a standalone region, and it should be able to be activated without DMAR (which is a common behavior of Linux kernel - by default it enables IR while disabled DMAR). Reviewed-by: Jason Wang Reviewed-by: David Gibson Reviewed-by: \"Michael S. Tsirkin\" Signed-off-by: Peter Xu Message-Id: <1491562755-23867-9-git-send-email-peterx@redhat.com> Signed-off-by: Eduardo Habkost --- hw/i386/intel_iommu.c | 81 ++++++++++++++++++++++++++++++++--- hw/i386/trace-events | 2 +- include/hw/i386/intel_iommu.h | 2 + 3 files changed, 79 insertions(+), 6 deletions(-) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 7af4e22958..f7dec82066 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -1291,9 +1291,49 @@ static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s) vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS); } +static void vtd_switch_address_space(VTDAddressSpace *as) +{ + assert(as); + + trace_vtd_switch_address_space(pci_bus_num(as->bus), + VTD_PCI_SLOT(as->devfn), + VTD_PCI_FUNC(as->devfn), + as->iommu_state->dmar_enabled); + + /* Turn off first then on the other */ + if (as->iommu_state->dmar_enabled) { + memory_region_set_enabled(&as->sys_alias, false); + memory_region_set_enabled(&as->iommu, true); + } else { + memory_region_set_enabled(&as->iommu, false); + memory_region_set_enabled(&as->sys_alias, true); + } +} + +static void vtd_switch_address_space_all(IntelIOMMUState *s) +{ + GHashTableIter iter; + VTDBus *vtd_bus; + int i; + + g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); + while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) { + for (i = 0; i < X86_IOMMU_PCI_DEVFN_MAX; i++) { + if (!vtd_bus->dev_as[i]) { + continue; + } + vtd_switch_address_space(vtd_bus->dev_as[i]); + } + } +} + /* Handle Translation Enable/Disable */ static void vtd_handle_gcmd_te(IntelIOMMUState *s, bool en) { + if (s->dmar_enabled == en) { + return; + } + VTD_DPRINTF(CSR, "Translation Enable %s", (en ? "on" : "off")); if (en) { @@ -1308,6 +1348,8 @@ static void vtd_handle_gcmd_te(IntelIOMMUState *s, bool en) /* Ok - report back to driver */ vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_TES, 0); } + + vtd_switch_address_space_all(s); } /* Handle Interrupt Remap Enable/Disable */ @@ -2529,15 +2571,44 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn) vtd_dev_as->devfn = (uint8_t)devfn; vtd_dev_as->iommu_state = s; vtd_dev_as->context_cache_entry.context_cache_gen = 0; + + /* + * Memory region relationships looks like (Address range shows + * only lower 32 bits to make it short in length...): + * + * |-----------------+-------------------+----------| + * | Name | Address range | Priority | + * |-----------------+-------------------+----------+ + * | vtd_root | 00000000-ffffffff | 0 | + * | intel_iommu | 00000000-ffffffff | 1 | + * | vtd_sys_alias | 00000000-ffffffff | 1 | + * | intel_iommu_ir | fee00000-feefffff | 64 | + * |-----------------+-------------------+----------| + * + * We enable/disable DMAR by switching enablement for + * vtd_sys_alias and intel_iommu regions. IR region is always + * enabled. + */ memory_region_init_iommu(&vtd_dev_as->iommu, OBJECT(s), - &s->iommu_ops, "intel_iommu", UINT64_MAX); + &s->iommu_ops, "intel_iommu_dmar", + UINT64_MAX); + memory_region_init_alias(&vtd_dev_as->sys_alias, OBJECT(s), + "vtd_sys_alias", get_system_memory(), + 0, memory_region_size(get_system_memory())); memory_region_init_io(&vtd_dev_as->iommu_ir, OBJECT(s), &vtd_mem_ir_ops, s, "intel_iommu_ir", VTD_INTERRUPT_ADDR_SIZE); - memory_region_add_subregion(&vtd_dev_as->iommu, VTD_INTERRUPT_ADDR_FIRST, - &vtd_dev_as->iommu_ir); - address_space_init(&vtd_dev_as->as, - &vtd_dev_as->iommu, name); + memory_region_init(&vtd_dev_as->root, OBJECT(s), + "vtd_root", UINT64_MAX); + memory_region_add_subregion_overlap(&vtd_dev_as->root, + VTD_INTERRUPT_ADDR_FIRST, + &vtd_dev_as->iommu_ir, 64); + address_space_init(&vtd_dev_as->as, &vtd_dev_as->root, name); + memory_region_add_subregion_overlap(&vtd_dev_as->root, 0, + &vtd_dev_as->sys_alias, 1); + memory_region_add_subregion_overlap(&vtd_dev_as->root, 0, + &vtd_dev_as->iommu, 1); + vtd_switch_address_space(vtd_dev_as); } return vtd_dev_as; } diff --git a/hw/i386/trace-events b/hw/i386/trace-events index f725bca33e..3c3a16755f 100644 --- a/hw/i386/trace-events +++ b/hw/i386/trace-events @@ -4,7 +4,6 @@ x86_iommu_iec_notify(bool global, uint32_t index, uint32_t mask) "Notify IEC invalidation: global=%d index=%" PRIu32 " mask=%" PRIu32 # hw/i386/intel_iommu.c -vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)" vtd_inv_desc(const char *type, uint64_t hi, uint64_t lo) "invalidate desc type %s high 0x%"PRIx64" low 0x%"PRIx64 vtd_inv_desc_invalid(uint64_t hi, uint64_t lo) "invalid inv desc hi 0x%"PRIx64" lo 0x%"PRIx64 vtd_inv_desc_cc_domain(uint16_t domain) "context invalidate domain 0x%"PRIx16 @@ -37,6 +36,7 @@ vtd_page_walk_one(uint32_t level, uint64_t iova, uint64_t gpa, uint64_t mask, in vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to unable to read" vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to perm empty" vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set" +vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)" # hw/i386/amd_iommu.c amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" + offset 0x%"PRIx32 diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h index fe645aa93a..8f212a1198 100644 --- a/include/hw/i386/intel_iommu.h +++ b/include/hw/i386/intel_iommu.h @@ -83,6 +83,8 @@ struct VTDAddressSpace { uint8_t devfn; AddressSpace as; MemoryRegion iommu; + MemoryRegion root; + MemoryRegion sys_alias; MemoryRegion iommu_ir; /* Interrupt region: 0xfeeXXXXX */ IntelIOMMUState *iommu_state; VTDContextCacheEntry context_cache_entry; From dd4d607e40dcd2cb7646b510504880a70939d91b Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 7 Apr 2017 18:59:15 +0800 Subject: [PATCH 09/15] intel_iommu: enable remote IOTLB This patch is based on Aviv Ben-David ()'s patch upstream: "IOMMU: enable intel_iommu map and unmap notifiers" https://lists.gnu.org/archive/html/qemu-devel/2016-11/msg01453.html However I removed/fixed some content, and added my own codes. Instead of translate() every page for iotlb invalidations (which is slower), we walk the pages when needed and notify in a hook function. This patch enables vfio devices for VT-d emulation. And, since we already have vhost DMAR support via device-iotlb, a natural benefit that this patch brings is that vt-d enabled vhost can live even without ATS capability now. Though more tests are needed. Signed-off-by: Aviv Ben-David Reviewed-by: Jason Wang Reviewed-by: David Gibson Reviewed-by: \"Michael S. Tsirkin\" Signed-off-by: Peter Xu Message-Id: <1491562755-23867-10-git-send-email-peterx@redhat.com> Signed-off-by: Eduardo Habkost --- hw/i386/intel_iommu.c | 191 ++++++++++++++++++++++++++++++--- hw/i386/intel_iommu_internal.h | 1 + hw/i386/trace-events | 1 + include/hw/i386/intel_iommu.h | 8 ++ 4 files changed, 188 insertions(+), 13 deletions(-) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index f7dec82066..02f047c8e3 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -806,7 +806,8 @@ next: * @private: private data for the hook function */ static int vtd_page_walk(VTDContextEntry *ce, uint64_t start, uint64_t end, - vtd_page_walk_hook hook_fn, void *private) + vtd_page_walk_hook hook_fn, void *private, + bool notify_unmap) { dma_addr_t addr = vtd_get_slpt_base_from_context(ce); uint32_t level = vtd_get_level_from_context_entry(ce); @@ -821,7 +822,7 @@ static int vtd_page_walk(VTDContextEntry *ce, uint64_t start, uint64_t end, } return vtd_page_walk_level(addr, start, end, hook_fn, private, - level, true, true, false); + level, true, true, notify_unmap); } /* Map a device to its corresponding domain (context-entry) */ @@ -1038,6 +1039,15 @@ static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s) s->intr_root, s->intr_size); } +static void vtd_iommu_replay_all(IntelIOMMUState *s) +{ + IntelIOMMUNotifierNode *node; + + QLIST_FOREACH(node, &s->notifiers_list, next) { + memory_region_iommu_replay_all(&node->vtd_as->iommu); + } +} + static void vtd_context_global_invalidate(IntelIOMMUState *s) { trace_vtd_inv_desc_cc_global(); @@ -1045,6 +1055,14 @@ static void vtd_context_global_invalidate(IntelIOMMUState *s) if (s->context_cache_gen == VTD_CONTEXT_CACHE_GEN_MAX) { vtd_reset_context_cache(s); } + /* + * From VT-d spec 6.5.2.1, a global context entry invalidation + * should be followed by a IOTLB global invalidation, so we should + * be safe even without this. Hoewever, let's replay the region as + * well to be safer, and go back here when we need finer tunes for + * VT-d emulation codes. + */ + vtd_iommu_replay_all(s); } @@ -1111,6 +1129,16 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s, trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it), VTD_PCI_FUNC(devfn_it)); vtd_as->context_cache_entry.context_cache_gen = 0; + /* + * So a device is moving out of (or moving into) a + * domain, a replay() suites here to notify all the + * IOMMU_NOTIFIER_MAP registers about this change. + * This won't bring bad even if we have no such + * notifier registered - the IOMMU notification + * framework will skip MAP notifications if that + * happened. + */ + memory_region_iommu_replay_all(&vtd_as->iommu); } } } @@ -1152,12 +1180,53 @@ static void vtd_iotlb_global_invalidate(IntelIOMMUState *s) { trace_vtd_iotlb_reset("global invalidation recved"); vtd_reset_iotlb(s); + vtd_iommu_replay_all(s); } static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) { + IntelIOMMUNotifierNode *node; + VTDContextEntry ce; + VTDAddressSpace *vtd_as; + g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_domain, &domain_id); + + QLIST_FOREACH(node, &s->notifiers_list, next) { + vtd_as = node->vtd_as; + if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), + vtd_as->devfn, &ce) && + domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) { + memory_region_iommu_replay_all(&vtd_as->iommu); + } + } +} + +static int vtd_page_invalidate_notify_hook(IOMMUTLBEntry *entry, + void *private) +{ + memory_region_notify_iommu((MemoryRegion *)private, *entry); + return 0; +} + +static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, + uint16_t domain_id, hwaddr addr, + uint8_t am) +{ + IntelIOMMUNotifierNode *node; + VTDContextEntry ce; + int ret; + + QLIST_FOREACH(node, &(s->notifiers_list), next) { + VTDAddressSpace *vtd_as = node->vtd_as; + ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), + vtd_as->devfn, &ce); + if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) { + vtd_page_walk(&ce, addr, addr + (1 << am) * VTD_PAGE_SIZE, + vtd_page_invalidate_notify_hook, + (void *)&vtd_as->iommu, true); + } + } } static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id, @@ -1170,6 +1239,7 @@ static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id, info.addr = addr; info.mask = ~((1 << am) - 1); g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info); + vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am); } /* Flush IOTLB @@ -2187,15 +2257,33 @@ static void vtd_iommu_notify_flag_changed(MemoryRegion *iommu, IOMMUNotifierFlag new) { VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); + IntelIOMMUState *s = vtd_as->iommu_state; + IntelIOMMUNotifierNode *node = NULL; + IntelIOMMUNotifierNode *next_node = NULL; - if (new & IOMMU_NOTIFIER_MAP) { - error_report("Device at bus %s addr %02x.%d requires iommu " - "notifier which is currently not supported by " - "intel-iommu emulation", - vtd_as->bus->qbus.name, PCI_SLOT(vtd_as->devfn), - PCI_FUNC(vtd_as->devfn)); + if (!s->caching_mode && new & IOMMU_NOTIFIER_MAP) { + error_report("We need to set cache_mode=1 for intel-iommu to enable " + "device assignment with IOMMU protection."); exit(1); } + + if (old == IOMMU_NOTIFIER_NONE) { + node = g_malloc0(sizeof(*node)); + node->vtd_as = vtd_as; + QLIST_INSERT_HEAD(&s->notifiers_list, node, next); + return; + } + + /* update notifier node with new flags */ + QLIST_FOREACH_SAFE(node, &s->notifiers_list, next, next_node) { + if (node->vtd_as == vtd_as) { + if (new == IOMMU_NOTIFIER_NONE) { + QLIST_REMOVE(node, next); + g_free(node); + } + return; + } + } } static const VMStateDescription vtd_vmstate = { @@ -2613,6 +2701,74 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn) return vtd_dev_as; } +/* Unmap the whole range in the notifier's scope. */ +static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) +{ + IOMMUTLBEntry entry; + hwaddr size; + hwaddr start = n->start; + hwaddr end = n->end; + + /* + * Note: all the codes in this function has a assumption that IOVA + * bits are no more than VTD_MGAW bits (which is restricted by + * VT-d spec), otherwise we need to consider overflow of 64 bits. + */ + + if (end > VTD_ADDRESS_SIZE) { + /* + * Don't need to unmap regions that is bigger than the whole + * VT-d supported address space size + */ + end = VTD_ADDRESS_SIZE; + } + + assert(start <= end); + size = end - start; + + if (ctpop64(size) != 1) { + /* + * This size cannot format a correct mask. Let's enlarge it to + * suite the minimum available mask. + */ + int n = 64 - clz64(size); + if (n > VTD_MGAW) { + /* should not happen, but in case it happens, limit it */ + n = VTD_MGAW; + } + size = 1ULL << n; + } + + entry.target_as = &address_space_memory; + /* Adjust iova for the size */ + entry.iova = n->start & ~(size - 1); + /* This field is meaningless for unmap */ + entry.translated_addr = 0; + entry.perm = IOMMU_NONE; + entry.addr_mask = size - 1; + + trace_vtd_as_unmap_whole(pci_bus_num(as->bus), + VTD_PCI_SLOT(as->devfn), + VTD_PCI_FUNC(as->devfn), + entry.iova, size); + + memory_region_notify_one(n, &entry); +} + +static void vtd_address_space_unmap_all(IntelIOMMUState *s) +{ + IntelIOMMUNotifierNode *node; + VTDAddressSpace *vtd_as; + IOMMUNotifier *n; + + QLIST_FOREACH(node, &s->notifiers_list, next) { + vtd_as = node->vtd_as; + IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) { + vtd_address_space_unmap(vtd_as, n); + } + } +} + static int vtd_replay_hook(IOMMUTLBEntry *entry, void *private) { memory_region_notify_one((IOMMUNotifier *)private, entry); @@ -2626,16 +2782,19 @@ static void vtd_iommu_replay(MemoryRegion *mr, IOMMUNotifier *n) uint8_t bus_n = pci_bus_num(vtd_as->bus); VTDContextEntry ce; + /* + * The replay can be triggered by either a invalidation or a newly + * created entry. No matter what, we release existing mappings + * (it means flushing caches for UNMAP-only registers). + */ + vtd_address_space_unmap(vtd_as, n); + if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) { - /* - * Scanned a valid context entry, walk over the pages and - * notify when needed. - */ trace_vtd_replay_ce_valid(bus_n, PCI_SLOT(vtd_as->devfn), PCI_FUNC(vtd_as->devfn), VTD_CONTEXT_ENTRY_DID(ce.hi), ce.hi, ce.lo); - vtd_page_walk(&ce, 0, ~0ULL, vtd_replay_hook, (void *)n); + vtd_page_walk(&ce, 0, ~0ULL, vtd_replay_hook, (void *)n, false); } else { trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn), PCI_FUNC(vtd_as->devfn)); @@ -2754,6 +2913,11 @@ static void vtd_reset(DeviceState *dev) VTD_DPRINTF(GENERAL, ""); vtd_init(s); + + /* + * When device reset, throw away all mappings and external caches + */ + vtd_address_space_unmap_all(s); } static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) @@ -2817,6 +2981,7 @@ static void vtd_realize(DeviceState *dev, Error **errp) return; } + QLIST_INIT(&s->notifiers_list); memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num)); memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s, "intel_iommu", DMAR_REG_SIZE); diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h index 41041219ba..29d67075f4 100644 --- a/hw/i386/intel_iommu_internal.h +++ b/hw/i386/intel_iommu_internal.h @@ -197,6 +197,7 @@ #define VTD_DOMAIN_ID_MASK ((1UL << VTD_DOMAIN_ID_SHIFT) - 1) #define VTD_CAP_ND (((VTD_DOMAIN_ID_SHIFT - 4) / 2) & 7ULL) #define VTD_MGAW 39 /* Maximum Guest Address Width */ +#define VTD_ADDRESS_SIZE (1ULL << VTD_MGAW) #define VTD_CAP_MGAW (((VTD_MGAW - 1) & 0x3fULL) << 16) #define VTD_MAMV 18ULL #define VTD_CAP_MAMV (VTD_MAMV << 48) diff --git a/hw/i386/trace-events b/hw/i386/trace-events index 3c3a16755f..04a6980800 100644 --- a/hw/i386/trace-events +++ b/hw/i386/trace-events @@ -37,6 +37,7 @@ vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"P vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to perm empty" vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set" vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)" +vtd_as_unmap_whole(uint8_t bus, uint8_t slot, uint8_t fn, uint64_t iova, uint64_t size) "Device %02x:%02x.%x start 0x%"PRIx64" size 0x%"PRIx64 # hw/i386/amd_iommu.c amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" + offset 0x%"PRIx32 diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h index 8f212a1198..3e51876b75 100644 --- a/include/hw/i386/intel_iommu.h +++ b/include/hw/i386/intel_iommu.h @@ -63,6 +63,7 @@ typedef union VTD_IR_TableEntry VTD_IR_TableEntry; typedef union VTD_IR_MSIAddress VTD_IR_MSIAddress; typedef struct VTDIrq VTDIrq; typedef struct VTD_MSIMessage VTD_MSIMessage; +typedef struct IntelIOMMUNotifierNode IntelIOMMUNotifierNode; /* Context-Entry */ struct VTDContextEntry { @@ -249,6 +250,11 @@ struct VTD_MSIMessage { /* When IR is enabled, all MSI/MSI-X data bits should be zero */ #define VTD_IR_MSI_DATA (0) +struct IntelIOMMUNotifierNode { + VTDAddressSpace *vtd_as; + QLIST_ENTRY(IntelIOMMUNotifierNode) next; +}; + /* The iommu (DMAR) device state struct */ struct IntelIOMMUState { X86IOMMUState x86_iommu; @@ -286,6 +292,8 @@ struct IntelIOMMUState { MemoryRegionIOMMUOps iommu_ops; GHashTable *vtd_as_by_busptr; /* VTDBus objects indexed by PCIBus* reference */ VTDBus *vtd_as_by_bus_num[VTD_PCI_BUS_MAX]; /* VTDBus objects indexed by bus number */ + /* list of registered notifiers */ + QLIST_HEAD(, IntelIOMMUNotifierNode) notifiers_list; /* interrupt remapping */ bool intr_enabled; /* Whether guest enabled IR */ From 36cccb8c575b74a691f685911fbb0301af19f924 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Wed, 22 Feb 2017 16:26:47 -0300 Subject: [PATCH 10/15] qdev: Make "hotplugged" property read-only The "hotplugged" property is user visible, but it was never meant to be set by the user. There are probably multiple ways to break or crash device code by overriding the property. For example, we recently fixed a crash in rtc_set_memory() related to the property (commit 26ef65beab852caf2b1ef4976e3473f2d525164d). There has been some discussion about making management software use "hotplugged=on" on migration, to indicate devices that were hotplugged in the migration source. There were other suggestions to address this, like including the "hotplugged" field in the migration stream instead of requiring it to be set explicitly. Whatever solution we choose in the future, this patch disables setting "hotplugged" explicitly in the command-line by now, because the ability to set the property is unused, untested, and undocumented. Signed-off-by: Eduardo Habkost Message-Id: <20170222192647.19690-1-ehabkost@redhat.com> Reviewed-by: Eric Blake Reviewed-by: Markus Armbruster Signed-off-by: Eduardo Habkost --- hw/core/qdev.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/hw/core/qdev.c b/hw/core/qdev.c index 1e7fb33246..695d7c4216 100644 --- a/hw/core/qdev.c +++ b/hw/core/qdev.c @@ -1037,13 +1037,6 @@ static bool device_get_hotplugged(Object *obj, Error **err) return dev->hotplugged; } -static void device_set_hotplugged(Object *obj, bool value, Error **err) -{ - DeviceState *dev = DEVICE(obj); - - dev->hotplugged = value; -} - static void device_initfn(Object *obj) { DeviceState *dev = DEVICE(obj); @@ -1063,7 +1056,7 @@ static void device_initfn(Object *obj) object_property_add_bool(obj, "hotpluggable", device_get_hotpluggable, NULL, NULL); object_property_add_bool(obj, "hotplugged", - device_get_hotplugged, device_set_hotplugged, + device_get_hotplugged, NULL, &error_abort); class = object_get_class(OBJECT(dev)); From 991db247745c1de161d665f847fd28618212f2e8 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Tue, 28 Feb 2017 09:52:51 +0100 Subject: [PATCH 11/15] hw/core/null-machine: Print error message when using the -kernel parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the user currently tries to use the -kernel parameter, simply nothing happens, and the user might get confused that there is nothing loaded to memory, but also no error message has been issued. Since there is no real generic way to load a kernel on all CPU types (but on some targets, the generic loader can be used instead), issue an appropriate error message here now to avoid the possible confusion. Signed-off-by: Thomas Huth Message-Id: <1488271971-12624-1-git-send-email-thuth@redhat.com> Reviewed-by: Marcel Apfelbaum Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Eduardo Habkost Signed-off-by: Eduardo Habkost --- hw/core/null-machine.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hw/core/null-machine.c b/hw/core/null-machine.c index 27c8369b57..864832db34 100644 --- a/hw/core/null-machine.c +++ b/hw/core/null-machine.c @@ -40,6 +40,12 @@ static void machine_none_init(MachineState *mch) memory_region_allocate_system_memory(ram, NULL, "ram", mch->ram_size); memory_region_add_subregion(get_system_memory(), 0, ram); } + + if (mch->kernel_filename) { + error_report("The -kernel parameter is not supported " + "(use the generic 'loader' device instead)."); + exit(1); + } } static void machine_none_machine_init(MachineClass *mc) From 4728b5741040df8bd02a5ad8fd3c2beefae3dcb1 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 10 Mar 2017 21:09:29 +0800 Subject: [PATCH 12/15] hostmem: introduce host_memory_backend_mr_inited() We were checking this against memory region size of host memory backend's mr field to see whether the mr has been inited. This is efficient but less elegant. Let's make a helper for it to avoid confusions, along with some notes. Suggested-by: Peter Maydell Signed-off-by: Peter Xu Message-Id: <1489151370-15453-2-git-send-email-peterx@redhat.com> Reviewed-by: Eduardo Habkost Signed-off-by: Eduardo Habkost --- backends/hostmem.c | 9 +++++++++ include/sysemu/hostmem.h | 1 + 2 files changed, 10 insertions(+) diff --git a/backends/hostmem.c b/backends/hostmem.c index 89feb9ed75..d8faab4bed 100644 --- a/backends/hostmem.c +++ b/backends/hostmem.c @@ -237,6 +237,15 @@ static void host_memory_backend_init(Object *obj) backend->prealloc = mem_prealloc; } +bool host_memory_backend_mr_inited(HostMemoryBackend *backend) +{ + /* + * NOTE: We forbid zero-length memory backend, so here zero means + * "we haven't inited the backend memory region yet". + */ + return memory_region_size(&backend->mr) != 0; +} + MemoryRegion * host_memory_backend_get_memory(HostMemoryBackend *backend, Error **errp) { diff --git a/include/sysemu/hostmem.h b/include/sysemu/hostmem.h index ecae0cff19..ed6a437f4d 100644 --- a/include/sysemu/hostmem.h +++ b/include/sysemu/hostmem.h @@ -62,6 +62,7 @@ struct HostMemoryBackend { MemoryRegion mr; }; +bool host_memory_backend_mr_inited(HostMemoryBackend *backend); MemoryRegion *host_memory_backend_get_memory(HostMemoryBackend *backend, Error **errp); From 6f4c60e47fdb39848eb40804f54e53d8ccaa0f2e Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 10 Mar 2017 21:09:30 +0800 Subject: [PATCH 13/15] hostmem: use host_memory_backend_mr_inited() where proper Use the new interface to boost readability. Signed-off-by: Peter Xu Message-Id: <1489151370-15453-3-git-send-email-peterx@redhat.com> Reviewed-by: Eduardo Habkost Signed-off-by: Eduardo Habkost --- backends/hostmem-file.c | 6 +++--- backends/hostmem.c | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c index 42efb2f28a..fc4ef46d11 100644 --- a/backends/hostmem-file.c +++ b/backends/hostmem-file.c @@ -51,7 +51,7 @@ file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp) #ifndef CONFIG_LINUX error_setg(errp, "-mem-path not supported on this host"); #else - if (!memory_region_size(&backend->mr)) { + if (!host_memory_backend_mr_inited(backend)) { gchar *path; backend->force_prealloc = mem_prealloc; path = object_get_canonical_path(OBJECT(backend)); @@ -76,7 +76,7 @@ static void set_mem_path(Object *o, const char *str, Error **errp) HostMemoryBackend *backend = MEMORY_BACKEND(o); HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o); - if (memory_region_size(&backend->mr)) { + if (host_memory_backend_mr_inited(backend)) { error_setg(errp, "cannot change property value"); return; } @@ -96,7 +96,7 @@ static void file_memory_backend_set_share(Object *o, bool value, Error **errp) HostMemoryBackend *backend = MEMORY_BACKEND(o); HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o); - if (memory_region_size(&backend->mr)) { + if (host_memory_backend_mr_inited(backend)) { error_setg(errp, "cannot change property value"); return; } diff --git a/backends/hostmem.c b/backends/hostmem.c index d8faab4bed..4606b73849 100644 --- a/backends/hostmem.c +++ b/backends/hostmem.c @@ -45,7 +45,7 @@ host_memory_backend_set_size(Object *obj, Visitor *v, const char *name, Error *local_err = NULL; uint64_t value; - if (memory_region_size(&backend->mr)) { + if (host_memory_backend_mr_inited(backend)) { error_setg(&local_err, "cannot change property value"); goto out; } @@ -146,7 +146,7 @@ static void host_memory_backend_set_merge(Object *obj, bool value, Error **errp) { HostMemoryBackend *backend = MEMORY_BACKEND(obj); - if (!memory_region_size(&backend->mr)) { + if (!host_memory_backend_mr_inited(backend)) { backend->merge = value; return; } @@ -172,7 +172,7 @@ static void host_memory_backend_set_dump(Object *obj, bool value, Error **errp) { HostMemoryBackend *backend = MEMORY_BACKEND(obj); - if (!memory_region_size(&backend->mr)) { + if (!host_memory_backend_mr_inited(backend)) { backend->dump = value; return; } @@ -208,7 +208,7 @@ static void host_memory_backend_set_prealloc(Object *obj, bool value, } } - if (!memory_region_size(&backend->mr)) { + if (!host_memory_backend_mr_inited(backend)) { backend->prealloc = value; return; } @@ -249,7 +249,7 @@ bool host_memory_backend_mr_inited(HostMemoryBackend *backend) MemoryRegion * host_memory_backend_get_memory(HostMemoryBackend *backend, Error **errp) { - return memory_region_size(&backend->mr) ? &backend->mr : NULL; + return host_memory_backend_mr_inited(backend) ? &backend->mr : NULL; } void host_memory_backend_set_mapped(HostMemoryBackend *backend, bool mapped) From 606fd0e20600ad0756fdfd2165a105f576ce43a9 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 10 Mar 2017 22:05:49 +0200 Subject: [PATCH 14/15] qdev: Constify value passed to qdev_prop_set_macaddr The 'value' argument is not modified so this can be made const for code safeness. Signed-off-by: Krzysztof Kozlowski Message-Id: <20170310200550.13313-2-krzk@kernel.org> Signed-off-by: Eduardo Habkost --- hw/core/qdev-properties.c | 3 ++- include/hw/qdev-properties.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/hw/core/qdev-properties.c b/hw/core/qdev-properties.c index 6ab4265eb4..fa3617db2d 100644 --- a/hw/core/qdev-properties.c +++ b/hw/core/qdev-properties.c @@ -1010,7 +1010,8 @@ void qdev_prop_set_string(DeviceState *dev, const char *name, const char *value) object_property_set_str(OBJECT(dev), value, name, &error_abort); } -void qdev_prop_set_macaddr(DeviceState *dev, const char *name, uint8_t *value) +void qdev_prop_set_macaddr(DeviceState *dev, const char *name, + const uint8_t *value) { char str[2 * 6 + 5 + 1]; snprintf(str, sizeof(str), "%02x:%02x:%02x:%02x:%02x:%02x", diff --git a/include/hw/qdev-properties.h b/include/hw/qdev-properties.h index 7ac315331a..1d69fa7a8f 100644 --- a/include/hw/qdev-properties.h +++ b/include/hw/qdev-properties.h @@ -188,7 +188,8 @@ void qdev_prop_set_chr(DeviceState *dev, const char *name, Chardev *value); void qdev_prop_set_netdev(DeviceState *dev, const char *name, NetClientState *value); void qdev_prop_set_drive(DeviceState *dev, const char *name, BlockBackend *value, Error **errp); -void qdev_prop_set_macaddr(DeviceState *dev, const char *name, uint8_t *value); +void qdev_prop_set_macaddr(DeviceState *dev, const char *name, + const uint8_t *value); void qdev_prop_set_enum(DeviceState *dev, const char *name, int value); /* FIXME: Remove opaque pointer properties. */ void qdev_prop_set_ptr(DeviceState *dev, const char *name, void *value); From be9721f400f7e5395bb2a257c291557df8f3f833 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 10 Mar 2017 22:05:50 +0200 Subject: [PATCH 15/15] qdev: Constify local variable returned by blk_bs Inside qdev_prop_set_drive() the value returned by blk_bs() is passed only as pointer to const to bdrv_get_node_name() and pointed values is not modified in other places so this can be made const for code safeness. Signed-off-by: Krzysztof Kozlowski Message-Id: <20170310200550.13313-3-krzk@kernel.org> Signed-off-by: Eduardo Habkost --- hw/core/qdev-properties-system.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c index e885e650fb..79c2014135 100644 --- a/hw/core/qdev-properties-system.c +++ b/hw/core/qdev-properties-system.c @@ -409,7 +409,7 @@ void qdev_prop_set_drive(DeviceState *dev, const char *name, if (value) { ref = blk_name(value); if (!*ref) { - BlockDriverState *bs = blk_bs(value); + const BlockDriverState *bs = blk_bs(value); if (bs) { ref = bdrv_get_node_name(bs); }