From 30d4b180e20c081f435143f8bc211c66a930608a Mon Sep 17 00:00:00 2001 From: Tamas Lengyel Date: Mon, 31 Dec 2012 15:44:30 -0500 Subject: [PATCH 01/10] xen/privcmd: Relax access control in privcmd_ioctl_mmap In the privcmd Linux driver two checks in the functions privcmd_ioctl_mmap and privcmd_ioctl_mmap_batch are not needed as they are trying to enforce hypervisor-level access control. They should be removed as they break secondary control domains when performing dom0 disaggregation. Xen itself provides adequate security controls around these hypercalls and these checks prevent those controls from functioning as intended. Signed-off-by: Tamas K Lengyel Cc: Daniel De Graaf [v1: Fixed up the patch and commit description] Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/privcmd.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index b9d08987a5a5..f6316127f53f 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -199,9 +199,6 @@ static long privcmd_ioctl_mmap(void __user *udata) LIST_HEAD(pagelist); struct mmap_mfn_state state; - if (!xen_initial_domain()) - return -EPERM; - /* We only support privcmd_ioctl_mmap_batch for auto translated. */ if (xen_feature(XENFEAT_auto_translated_physmap)) return -ENOSYS; @@ -360,9 +357,6 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version) int *err_array = NULL; struct mmap_batch_state state; - if (!xen_initial_domain()) - return -EPERM; - switch (version) { case 1: if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch))) From 6337a23992826e68257fba51267cb6996439520d Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Thu, 22 Nov 2012 10:20:23 +0800 Subject: [PATCH 02/10] x86/xen : Fix the wrong check in pciback Fix the wrong check in pciback. Signed-off-by: Yang Zhang Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/xen-pciback/pciback.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/xen-pciback/pciback.h b/drivers/xen/xen-pciback/pciback.h index a7def010eba3..f72af87640e0 100644 --- a/drivers/xen/xen-pciback/pciback.h +++ b/drivers/xen/xen-pciback/pciback.h @@ -124,7 +124,7 @@ static inline int xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, static inline void xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev, struct pci_dev *dev) { - if (xen_pcibk_backend && xen_pcibk_backend->free) + if (xen_pcibk_backend && xen_pcibk_backend->release) return xen_pcibk_backend->release(pdev, dev); } From d0b4d64aadb9f4a90669848de9ef3819050a98cd Mon Sep 17 00:00:00 2001 From: Matt Wilson Date: Tue, 15 Jan 2013 13:21:27 +0000 Subject: [PATCH 03/10] xen/grant-table: correctly initialize grant table version 1 Commit 85ff6acb075a484780b3d763fdf41596d8fc0970 (xen/granttable: Grant tables V2 implementation) changed the GREFS_PER_GRANT_FRAME macro from a constant to a conditional expression. The expression depends on grant_table_version being appropriately set. Unfortunately, at init time grant_table_version will be 0. The GREFS_PER_GRANT_FRAME conditional expression checks for "grant_table_version == 1", and therefore returns the number of grant references per frame for v2. This causes gnttab_init() to allocate fewer pages for gnttab_list, as a frame can old half the number of v2 entries than v1 entries. After gnttab_resume() is called, grant_table_version is appropriately set. nr_init_grefs will then be miscalculated and gnttab_free_count will hold a value larger than the actual number of free gref entries. If a guest is heavily utilizing improperly initialized v1 grant tables, memory corruption can occur. One common manifestation is corruption of the vmalloc list, resulting in a poisoned pointer derefrence when accessing /proc/meminfo or /proc/vmallocinfo: [ 40.770064] BUG: unable to handle kernel paging request at 0000200200001407 [ 40.770083] IP: [] get_vmalloc_info+0x70/0x110 [ 40.770102] PGD 0 [ 40.770107] Oops: 0000 [#1] SMP [ 40.770114] CPU 10 This patch introduces a static variable, grefs_per_grant_frame, to cache the calculated value. gnttab_init() now calls gnttab_request_version() early so that grant_table_version and grefs_per_grant_frame can be appropriately set. A few BUG_ON()s have been added to prevent this type of bug from reoccurring in the future. Signed-off-by: Matt Wilson Reviewed-and-Tested-by: Steven Noonan Acked-by: Ian Campbell Cc: Konrad Rzeszutek Wilk Cc: Annie Li Cc: xen-devel@lists.xen.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org # v3.3 and newer Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/grant-table.c | 50 +++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index b91f14e83164..95ce9d02ceca 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -56,10 +56,6 @@ /* External tools reserve first few grant table entries. */ #define NR_RESERVED_ENTRIES 8 #define GNTTAB_LIST_END 0xffffffff -#define GREFS_PER_GRANT_FRAME \ -(grant_table_version == 1 ? \ -(PAGE_SIZE / sizeof(struct grant_entry_v1)) : \ -(PAGE_SIZE / sizeof(union grant_entry_v2))) static grant_ref_t **gnttab_list; static unsigned int nr_grant_frames; @@ -154,6 +150,7 @@ static struct gnttab_ops *gnttab_interface; static grant_status_t *grstatus; static int grant_table_version; +static int grefs_per_grant_frame; static struct gnttab_free_callback *gnttab_free_callback_list; @@ -767,12 +764,14 @@ static int grow_gnttab_list(unsigned int more_frames) unsigned int new_nr_grant_frames, extra_entries, i; unsigned int nr_glist_frames, new_nr_glist_frames; - new_nr_grant_frames = nr_grant_frames + more_frames; - extra_entries = more_frames * GREFS_PER_GRANT_FRAME; + BUG_ON(grefs_per_grant_frame == 0); - nr_glist_frames = (nr_grant_frames * GREFS_PER_GRANT_FRAME + RPP - 1) / RPP; + new_nr_grant_frames = nr_grant_frames + more_frames; + extra_entries = more_frames * grefs_per_grant_frame; + + nr_glist_frames = (nr_grant_frames * grefs_per_grant_frame + RPP - 1) / RPP; new_nr_glist_frames = - (new_nr_grant_frames * GREFS_PER_GRANT_FRAME + RPP - 1) / RPP; + (new_nr_grant_frames * grefs_per_grant_frame + RPP - 1) / RPP; for (i = nr_glist_frames; i < new_nr_glist_frames; i++) { gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_ATOMIC); if (!gnttab_list[i]) @@ -780,12 +779,12 @@ static int grow_gnttab_list(unsigned int more_frames) } - for (i = GREFS_PER_GRANT_FRAME * nr_grant_frames; - i < GREFS_PER_GRANT_FRAME * new_nr_grant_frames - 1; i++) + for (i = grefs_per_grant_frame * nr_grant_frames; + i < grefs_per_grant_frame * new_nr_grant_frames - 1; i++) gnttab_entry(i) = i + 1; gnttab_entry(i) = gnttab_free_head; - gnttab_free_head = GREFS_PER_GRANT_FRAME * nr_grant_frames; + gnttab_free_head = grefs_per_grant_frame * nr_grant_frames; gnttab_free_count += extra_entries; nr_grant_frames = new_nr_grant_frames; @@ -957,7 +956,8 @@ EXPORT_SYMBOL_GPL(gnttab_unmap_refs); static unsigned nr_status_frames(unsigned nr_grant_frames) { - return (nr_grant_frames * GREFS_PER_GRANT_FRAME + SPP - 1) / SPP; + BUG_ON(grefs_per_grant_frame == 0); + return (nr_grant_frames * grefs_per_grant_frame + SPP - 1) / SPP; } static int gnttab_map_frames_v1(xen_pfn_t *frames, unsigned int nr_gframes) @@ -1115,6 +1115,7 @@ static void gnttab_request_version(void) rc = HYPERVISOR_grant_table_op(GNTTABOP_set_version, &gsv, 1); if (rc == 0 && gsv.version == 2) { grant_table_version = 2; + grefs_per_grant_frame = PAGE_SIZE / sizeof(union grant_entry_v2); gnttab_interface = &gnttab_v2_ops; } else if (grant_table_version == 2) { /* @@ -1127,17 +1128,17 @@ static void gnttab_request_version(void) panic("we need grant tables version 2, but only version 1 is available"); } else { grant_table_version = 1; + grefs_per_grant_frame = PAGE_SIZE / sizeof(struct grant_entry_v1); gnttab_interface = &gnttab_v1_ops; } printk(KERN_INFO "Grant tables using version %d layout.\n", grant_table_version); } -int gnttab_resume(void) +static int gnttab_setup(void) { unsigned int max_nr_gframes; - gnttab_request_version(); max_nr_gframes = gnttab_max_grant_frames(); if (max_nr_gframes < nr_grant_frames) return -ENOSYS; @@ -1160,6 +1161,12 @@ int gnttab_resume(void) return 0; } +int gnttab_resume(void) +{ + gnttab_request_version(); + return gnttab_setup(); +} + int gnttab_suspend(void) { gnttab_interface->unmap_frames(); @@ -1171,9 +1178,10 @@ static int gnttab_expand(unsigned int req_entries) int rc; unsigned int cur, extra; + BUG_ON(grefs_per_grant_frame == 0); cur = nr_grant_frames; - extra = ((req_entries + (GREFS_PER_GRANT_FRAME-1)) / - GREFS_PER_GRANT_FRAME); + extra = ((req_entries + (grefs_per_grant_frame-1)) / + grefs_per_grant_frame); if (cur + extra > gnttab_max_grant_frames()) return -ENOSPC; @@ -1191,21 +1199,23 @@ int gnttab_init(void) unsigned int nr_init_grefs; int ret; + gnttab_request_version(); nr_grant_frames = 1; boot_max_nr_grant_frames = __max_nr_grant_frames(); /* Determine the maximum number of frames required for the * grant reference free list on the current hypervisor. */ + BUG_ON(grefs_per_grant_frame == 0); max_nr_glist_frames = (boot_max_nr_grant_frames * - GREFS_PER_GRANT_FRAME / RPP); + grefs_per_grant_frame / RPP); gnttab_list = kmalloc(max_nr_glist_frames * sizeof(grant_ref_t *), GFP_KERNEL); if (gnttab_list == NULL) return -ENOMEM; - nr_glist_frames = (nr_grant_frames * GREFS_PER_GRANT_FRAME + RPP - 1) / RPP; + nr_glist_frames = (nr_grant_frames * grefs_per_grant_frame + RPP - 1) / RPP; for (i = 0; i < nr_glist_frames; i++) { gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_KERNEL); if (gnttab_list[i] == NULL) { @@ -1214,12 +1224,12 @@ int gnttab_init(void) } } - if (gnttab_resume() < 0) { + if (gnttab_setup() < 0) { ret = -ENODEV; goto ini_nomem; } - nr_init_grefs = nr_grant_frames * GREFS_PER_GRANT_FRAME; + nr_init_grefs = nr_grant_frames * grefs_per_grant_frame; for (i = NR_RESERVED_ENTRIES; i < nr_init_grefs - 1; i++) gnttab_entry(i) = i + 1; From e5c702d3b268066dc70d619ecff06a08065f343f Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 15 Jan 2013 13:31:43 +0000 Subject: [PATCH 04/10] Xen: properly bound buffer access when parsing cpu/*/availability At the same time reduce the local buffers to 16 bytes each. Signed-off-by: Jan Beulich Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/cpu_hotplug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c index 4dcfced107f5..084041d42c9a 100644 --- a/drivers/xen/cpu_hotplug.c +++ b/drivers/xen/cpu_hotplug.c @@ -25,10 +25,10 @@ static void disable_hotplug_cpu(int cpu) static int vcpu_online(unsigned int cpu) { int err; - char dir[32], state[32]; + char dir[16], state[16]; sprintf(dir, "cpu/%u", cpu); - err = xenbus_scanf(XBT_NIL, dir, "availability", "%s", state); + err = xenbus_scanf(XBT_NIL, dir, "availability", "%15s", state); if (err != 1) { if (!xen_initial_domain()) printk(KERN_ERR "XENBUS: Unable to read cpu state\n"); From 99beae6cb8f4dd5dab81a370b79c3b1085848d89 Mon Sep 17 00:00:00 2001 From: Andres Lagar-Cavilla Date: Mon, 14 Jan 2013 22:35:40 -0500 Subject: [PATCH 05/10] xen/privcmd: Fix mmap batch ioctl. 1. If any individual mapping error happens, the V1 case will mark *all* operations as failed. Fixed. 2. The err_array was allocated with kcalloc, resulting in potentially O(n) page allocations. Refactor code to not use this array. Signed-off-by: Andres Lagar-Cavilla Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/privcmd.c | 83 ++++++++++++++++++++++++------------------- 1 file changed, 47 insertions(+), 36 deletions(-) diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 421375a9196a..ca2b00e9d558 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -258,11 +258,12 @@ struct mmap_batch_state { * -ENOENT if at least 1 -ENOENT has happened. */ int global_error; - /* An array for individual errors */ - int *err; + int version; /* User-space mfn array to store errors in the second pass for V1. */ xen_pfn_t __user *user_mfn; + /* User-space int array to store errors in the second pass for V2. */ + int __user *user_err; }; /* auto translated dom0 note: if domU being created is PV, then mfn is @@ -285,7 +286,19 @@ static int mmap_batch_fn(void *data, void *state) &cur_page); /* Store error code for second pass. */ - *(st->err++) = ret; + if (st->version == 1) { + if (ret < 0) { + /* + * V1 encodes the error codes in the 32bit top nibble of the + * mfn (with its known limitations vis-a-vis 64 bit callers). + */ + *mfnp |= (ret == -ENOENT) ? + PRIVCMD_MMAPBATCH_PAGED_ERROR : + PRIVCMD_MMAPBATCH_MFN_ERROR; + } + } else { /* st->version == 2 */ + *((int *) mfnp) = ret; + } /* And see if it affects the global_error. */ if (ret < 0) { @@ -302,20 +315,25 @@ static int mmap_batch_fn(void *data, void *state) return 0; } -static int mmap_return_errors_v1(void *data, void *state) +static int mmap_return_errors(void *data, void *state) { - xen_pfn_t *mfnp = data; struct mmap_batch_state *st = state; - int err = *(st->err++); - /* - * V1 encodes the error codes in the 32bit top nibble of the - * mfn (with its known limitations vis-a-vis 64 bit callers). - */ - *mfnp |= (err == -ENOENT) ? - PRIVCMD_MMAPBATCH_PAGED_ERROR : - PRIVCMD_MMAPBATCH_MFN_ERROR; - return __put_user(*mfnp, st->user_mfn++); + if (st->version == 1) { + xen_pfn_t mfnp = *((xen_pfn_t *) data); + if (mfnp & PRIVCMD_MMAPBATCH_MFN_ERROR) + return __put_user(mfnp, st->user_mfn++); + else + st->user_mfn++; + } else { /* st->version == 2 */ + int err = *((int *) data); + if (err) + return __put_user(err, st->user_err++); + else + st->user_err++; + } + + return 0; } /* Allocate pfns that are then mapped with gmfns from foreign domid. Update @@ -354,7 +372,6 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version) struct vm_area_struct *vma; unsigned long nr_pages; LIST_HEAD(pagelist); - int *err_array = NULL; struct mmap_batch_state state; switch (version) { @@ -390,10 +407,12 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version) goto out; } - err_array = kcalloc(m.num, sizeof(int), GFP_KERNEL); - if (err_array == NULL) { - ret = -ENOMEM; - goto out; + if (version == 2) { + /* Zero error array now to only copy back actual errors. */ + if (clear_user(m.err, sizeof(int) * m.num)) { + ret = -EFAULT; + goto out; + } } down_write(&mm->mmap_sem); @@ -421,7 +440,7 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version) state.va = m.addr; state.index = 0; state.global_error = 0; - state.err = err_array; + state.version = version; /* mmap_batch_fn guarantees ret == 0 */ BUG_ON(traverse_pages(m.num, sizeof(xen_pfn_t), @@ -429,21 +448,14 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version) up_write(&mm->mmap_sem); - if (version == 1) { - if (state.global_error) { - /* Write back errors in second pass. */ - state.user_mfn = (xen_pfn_t *)m.arr; - state.err = err_array; - ret = traverse_pages(m.num, sizeof(xen_pfn_t), - &pagelist, mmap_return_errors_v1, &state); - } else - ret = 0; - - } else if (version == 2) { - ret = __copy_to_user(m.err, err_array, m.num * sizeof(int)); - if (ret) - ret = -EFAULT; - } + if (state.global_error) { + /* Write back errors in second pass. */ + state.user_mfn = (xen_pfn_t *)m.arr; + state.user_err = m.err; + ret = traverse_pages(m.num, sizeof(xen_pfn_t), + &pagelist, mmap_return_errors, &state); + } else + ret = 0; /* If we have not had any EFAULT-like global errors then set the global * error to -ENOENT if necessary. */ @@ -451,7 +463,6 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version) ret = -ENOENT; out: - kfree(err_array); free_page_list(&pagelist); return ret; From 2512f298cb9886e06938e761c9e924c8448d9ab8 Mon Sep 17 00:00:00 2001 From: Daniel De Graaf Date: Wed, 2 Jan 2013 22:57:11 +0000 Subject: [PATCH 06/10] xen/gntdev: fix unsafe vma access In gntdev_ioctl_get_offset_for_vaddr, we need to hold mmap_sem while calling find_vma() to avoid potentially having the result freed out from under us. Similarly, the MMU notifier functions need to synchronize with gntdev_vma_close to avoid map->vma being freed during their iteration. Signed-off-by: Daniel De Graaf Reported-by: Al Viro Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/gntdev.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 2e22df2f7a3f..cca62cc8549b 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -378,7 +378,20 @@ static void gntdev_vma_close(struct vm_area_struct *vma) struct grant_map *map = vma->vm_private_data; pr_debug("gntdev_vma_close %p\n", vma); - map->vma = NULL; + if (use_ptemod) { + struct file *file = vma->vm_file; + struct gntdev_priv *priv = file->private_data; + /* It is possible that an mmu notifier could be running + * concurrently, so take priv->lock to ensure that the vma won't + * vanishing during the unmap_grant_pages call, since we will + * spin here until that completes. Such a concurrent call will + * not do any unmapping, since that has been done prior to + * closing the vma, but it may still iterate the unmap_ops list. + */ + spin_lock(&priv->lock); + map->vma = NULL; + spin_unlock(&priv->lock); + } vma->vm_private_data = NULL; gntdev_put_map(map); } @@ -579,25 +592,31 @@ static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv, struct ioctl_gntdev_get_offset_for_vaddr op; struct vm_area_struct *vma; struct grant_map *map; + int rv = -EINVAL; if (copy_from_user(&op, u, sizeof(op)) != 0) return -EFAULT; pr_debug("priv %p, offset for vaddr %lx\n", priv, (unsigned long)op.vaddr); + down_read(¤t->mm->mmap_sem); vma = find_vma(current->mm, op.vaddr); if (!vma || vma->vm_ops != &gntdev_vmops) - return -EINVAL; + goto out_unlock; map = vma->vm_private_data; if (!map) - return -EINVAL; + goto out_unlock; op.offset = map->index << PAGE_SHIFT; op.count = map->count; + rv = 0; - if (copy_to_user(u, &op, sizeof(op)) != 0) + out_unlock: + up_read(¤t->mm->mmap_sem); + + if (rv == 0 && copy_to_user(u, &op, sizeof(op)) != 0) return -EFAULT; - return 0; + return rv; } static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u) From 16a1d0225e22e4e273e6b60a21db95decde666c2 Mon Sep 17 00:00:00 2001 From: Daniel De Graaf Date: Wed, 2 Jan 2013 22:57:12 +0000 Subject: [PATCH 07/10] xen/gntdev: correctly unmap unlinked maps in mmu notifier If gntdev_ioctl_unmap_grant_ref is called on a range before unmapping it, the entry is removed from priv->maps and the later call to mn_invl_range_start won't find it to do the unmapping. Fix this by creating another list of freeable maps that the mmu notifier can search and use to unmap grants. Signed-off-by: Daniel De Graaf Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/gntdev.c | 92 ++++++++++++++++++++++++++++++-------------- 1 file changed, 63 insertions(+), 29 deletions(-) diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index cca62cc8549b..9be3e5e46d1f 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -56,10 +56,15 @@ MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped by " static atomic_t pages_mapped = ATOMIC_INIT(0); static int use_ptemod; +#define populate_freeable_maps use_ptemod struct gntdev_priv { + /* maps with visible offsets in the file descriptor */ struct list_head maps; - /* lock protects maps from concurrent changes */ + /* maps that are not visible; will be freed on munmap. + * Only populated if populate_freeable_maps == 1 */ + struct list_head freeable_maps; + /* lock protects maps and freeable_maps */ spinlock_t lock; struct mm_struct *mm; struct mmu_notifier mn; @@ -193,7 +198,7 @@ static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv, return NULL; } -static void gntdev_put_map(struct grant_map *map) +static void gntdev_put_map(struct gntdev_priv *priv, struct grant_map *map) { if (!map) return; @@ -208,6 +213,12 @@ static void gntdev_put_map(struct grant_map *map) evtchn_put(map->notify.event); } + if (populate_freeable_maps && priv) { + spin_lock(&priv->lock); + list_del(&map->next); + spin_unlock(&priv->lock); + } + if (map->pages && !use_ptemod) unmap_grant_pages(map, 0, map->count); gntdev_free_map(map); @@ -376,11 +387,11 @@ static void gntdev_vma_open(struct vm_area_struct *vma) static void gntdev_vma_close(struct vm_area_struct *vma) { struct grant_map *map = vma->vm_private_data; + struct file *file = vma->vm_file; + struct gntdev_priv *priv = file->private_data; pr_debug("gntdev_vma_close %p\n", vma); if (use_ptemod) { - struct file *file = vma->vm_file; - struct gntdev_priv *priv = file->private_data; /* It is possible that an mmu notifier could be running * concurrently, so take priv->lock to ensure that the vma won't * vanishing during the unmap_grant_pages call, since we will @@ -393,7 +404,7 @@ static void gntdev_vma_close(struct vm_area_struct *vma) spin_unlock(&priv->lock); } vma->vm_private_data = NULL; - gntdev_put_map(map); + gntdev_put_map(priv, map); } static struct vm_operations_struct gntdev_vmops = { @@ -403,33 +414,43 @@ static struct vm_operations_struct gntdev_vmops = { /* ------------------------------------------------------------------ */ +static void unmap_if_in_range(struct grant_map *map, + unsigned long start, unsigned long end) +{ + unsigned long mstart, mend; + int err; + + if (!map->vma) + return; + if (map->vma->vm_start >= end) + return; + if (map->vma->vm_end <= start) + return; + mstart = max(start, map->vma->vm_start); + mend = min(end, map->vma->vm_end); + pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n", + map->index, map->count, + map->vma->vm_start, map->vma->vm_end, + start, end, mstart, mend); + err = unmap_grant_pages(map, + (mstart - map->vma->vm_start) >> PAGE_SHIFT, + (mend - mstart) >> PAGE_SHIFT); + WARN_ON(err); +} + static void mn_invl_range_start(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end) { struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn); struct grant_map *map; - unsigned long mstart, mend; - int err; spin_lock(&priv->lock); list_for_each_entry(map, &priv->maps, next) { - if (!map->vma) - continue; - if (map->vma->vm_start >= end) - continue; - if (map->vma->vm_end <= start) - continue; - mstart = max(start, map->vma->vm_start); - mend = min(end, map->vma->vm_end); - pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n", - map->index, map->count, - map->vma->vm_start, map->vma->vm_end, - start, end, mstart, mend); - err = unmap_grant_pages(map, - (mstart - map->vma->vm_start) >> PAGE_SHIFT, - (mend - mstart) >> PAGE_SHIFT); - WARN_ON(err); + unmap_if_in_range(map, start, end); + } + list_for_each_entry(map, &priv->freeable_maps, next) { + unmap_if_in_range(map, start, end); } spin_unlock(&priv->lock); } @@ -458,6 +479,15 @@ static void mn_release(struct mmu_notifier *mn, err = unmap_grant_pages(map, /* offset */ 0, map->count); WARN_ON(err); } + list_for_each_entry(map, &priv->freeable_maps, next) { + if (!map->vma) + continue; + pr_debug("map %d+%d (%lx %lx)\n", + map->index, map->count, + map->vma->vm_start, map->vma->vm_end); + err = unmap_grant_pages(map, /* offset */ 0, map->count); + WARN_ON(err); + } spin_unlock(&priv->lock); } @@ -479,6 +509,7 @@ static int gntdev_open(struct inode *inode, struct file *flip) return -ENOMEM; INIT_LIST_HEAD(&priv->maps); + INIT_LIST_HEAD(&priv->freeable_maps); spin_lock_init(&priv->lock); if (use_ptemod) { @@ -513,8 +544,9 @@ static int gntdev_release(struct inode *inode, struct file *flip) while (!list_empty(&priv->maps)) { map = list_entry(priv->maps.next, struct grant_map, next); list_del(&map->next); - gntdev_put_map(map); + gntdev_put_map(NULL /* already removed */, map); } + WARN_ON(!list_empty(&priv->freeable_maps)); if (use_ptemod) mmu_notifier_unregister(&priv->mn, priv->mm); @@ -542,14 +574,14 @@ static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv, if (unlikely(atomic_add_return(op.count, &pages_mapped) > limit)) { pr_debug("can't map: over limit\n"); - gntdev_put_map(map); + gntdev_put_map(NULL, map); return err; } if (copy_from_user(map->grants, &u->refs, sizeof(map->grants[0]) * op.count) != 0) { - gntdev_put_map(map); - return err; + gntdev_put_map(NULL, map); + return -EFAULT; } spin_lock(&priv->lock); @@ -578,11 +610,13 @@ static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv, map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count); if (map) { list_del(&map->next); + if (populate_freeable_maps) + list_add_tail(&map->next, &priv->freeable_maps); err = 0; } spin_unlock(&priv->lock); if (map) - gntdev_put_map(map); + gntdev_put_map(priv, map); return err; } @@ -797,7 +831,7 @@ out_unlock_put: out_put_map: if (use_ptemod) map->vma = NULL; - gntdev_put_map(map); + gntdev_put_map(priv, map); return err; } From 1affa98d23c0188fdd4433512489be753a25bb23 Mon Sep 17 00:00:00 2001 From: Daniel De Graaf Date: Wed, 2 Jan 2013 17:57:13 -0500 Subject: [PATCH 08/10] xen/gntdev: remove erronous use of copy_to_user Since there is now a mapping of granted pages in kernel address space in both PV and HVM, use it for UNMAP_NOTIFY_CLEAR_BYTE instead of accessing memory via copy_to_user and triggering sleep-in-atomic warnings. Signed-off-by: Daniel De Graaf Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/gntdev.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 9be3e5e46d1f..3c8803feba26 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -312,17 +312,10 @@ static int __unmap_grant_pages(struct grant_map *map, int offset, int pages) if (map->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) { int pgno = (map->notify.addr >> PAGE_SHIFT); - if (pgno >= offset && pgno < offset + pages && use_ptemod) { - void __user *tmp = (void __user *) - map->vma->vm_start + map->notify.addr; - err = copy_to_user(tmp, &err, 1); - if (err) - return -EFAULT; - map->notify.flags &= ~UNMAP_NOTIFY_CLEAR_BYTE; - } else if (pgno >= offset && pgno < offset + pages) { - uint8_t *tmp = kmap(map->pages[pgno]); + if (pgno >= offset && pgno < offset + pages) { + /* No need for kmap, pages are in lowmem */ + uint8_t *tmp = pfn_to_kaddr(page_to_pfn(map->pages[pgno])); tmp[map->notify.addr & (PAGE_SIZE-1)] = 0; - kunmap(map->pages[pgno]); map->notify.flags &= ~UNMAP_NOTIFY_CLEAR_BYTE; } } From d55bf532d72b3cfdfe84e696ace995067324c96c Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 15 Jan 2013 22:40:26 -0500 Subject: [PATCH 09/10] Revert "xen/smp: Fix CPU online/offline bug triggering a BUG: scheduling while atomic." This reverts commit 41bd956de3dfdc3a43708fe2e0c8096c69064a1e. The fix is incorrect and not appropiate for the latest kernels. In fact it _causes_ the BUG: scheduling while atomic while doing vCPU hotplug. Suggested-by: Wei Liu Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/smp.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 4f7d2599b484..34bc4cee8887 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -432,13 +432,6 @@ static void __cpuinit xen_play_dead(void) /* used only with HOTPLUG_CPU */ play_dead_common(); HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); cpu_bringup(); - /* - * Balance out the preempt calls - as we are running in cpu_idle - * loop which has been called at bootup from cpu_bringup_and_idle. - * The cpucpu_bringup_and_idle called cpu_bringup which made a - * preempt_disable() So this preempt_enable will balance it out. - */ - preempt_enable(); } #else /* !CONFIG_HOTPLUG_CPU */ From 9174adbee4a9a49d0139f5d71969852b36720809 Mon Sep 17 00:00:00 2001 From: Andrew Cooper Date: Wed, 16 Jan 2013 12:00:55 +0000 Subject: [PATCH 10/10] xen: Fix stack corruption in xen_failsafe_callback for 32bit PVOPS guests. This fixes CVE-2013-0190 / XSA-40 There has been an error on the xen_failsafe_callback path for failed iret, which causes the stack pointer to be wrong when entering the iret_exc error path. This can result in the kernel crashing. In the classic kernel case, the relevant code looked a little like: popl %eax # Error code from hypervisor jz 5f addl $16,%esp jmp iret_exc # Hypervisor said iret fault 5: addl $16,%esp # Hypervisor said segment selector fault Here, there are two identical addls on either option of a branch which appears to have been optimised by hoisting it above the jz, and converting it to an lea, which leaves the flags register unaffected. In the PVOPS case, the code looks like: popl_cfi %eax # Error from the hypervisor lea 16(%esp),%esp # Add $16 before choosing fault path CFI_ADJUST_CFA_OFFSET -16 jz 5f addl $16,%esp # Incorrectly adjust %esp again jmp iret_exc It is possible unprivileged userspace applications to cause this behaviour, for example by loading an LDT code selector, then changing the code selector to be not-present. At this point, there is a race condition where it is possible for the hypervisor to return back to userspace from an interrupt, fault on its own iret, and inject a failsafe_callback into the kernel. This bug has been present since the introduction of Xen PVOPS support in commit 5ead97c84 (xen: Core Xen implementation), in 2.6.23. Signed-off-by: Frediano Ziglio Signed-off-by: Andrew Cooper Cc: stable@vger.kernel.org Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/kernel/entry_32.S | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 88b725aa1d52..cf8639b4dcf3 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1084,7 +1084,6 @@ ENTRY(xen_failsafe_callback) lea 16(%esp),%esp CFI_ADJUST_CFA_OFFSET -16 jz 5f - addl $16,%esp jmp iret_exc 5: pushl_cfi $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL