From 21b9e979501fdb5f6797193d70428a2b00bd5247 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 19 Jul 2017 14:55:26 -0700 Subject: [PATCH 001/855] device-dax: Kill dax_region ida Commit bbb3be170ac2 "device-dax: fix sysfs duplicate warnings" arranged for passing a dax instance-id to devm_create_dax_dev(), rather than generating one internally. Remove the dax_region ida and related code. Signed-off-by: Dan Williams --- drivers/dax/dax-private.h | 3 --- drivers/dax/device.c | 24 +++--------------------- 2 files changed, 3 insertions(+), 24 deletions(-) diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h index b6fc4f04636d..d1b36a42132f 100644 --- a/drivers/dax/dax-private.h +++ b/drivers/dax/dax-private.h @@ -28,7 +28,6 @@ */ struct dax_region { int id; - struct ida ida; void *base; struct kref kref; struct device *dev; @@ -42,7 +41,6 @@ struct dax_region { * @region - parent region * @dax_dev - core dax functionality * @dev - device core - * @id - child id in the region * @num_resources - number of physical address extents in this device * @res - array of physical address ranges */ @@ -50,7 +48,6 @@ struct dev_dax { struct dax_region *region; struct dax_device *dax_dev; struct device dev; - int id; int num_resources; struct resource res[0]; }; diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 948806e57cee..a5a670c1cd58 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -128,7 +128,6 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id, dax_region->pfn_flags = pfn_flags; kref_init(&dax_region->kref); dax_region->id = region_id; - ida_init(&dax_region->ida); dax_region->align = align; dax_region->dev = parent; dax_region->base = addr; @@ -582,8 +581,6 @@ static void dev_dax_release(struct device *dev) struct dax_region *dax_region = dev_dax->region; struct dax_device *dax_dev = dev_dax->dax_dev; - if (dev_dax->id >= 0) - ida_simple_remove(&dax_region->ida, dev_dax->id); dax_region_put(dax_region); put_dax(dax_dev); kfree(dev_dax); @@ -642,19 +639,7 @@ struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, } if (i < count) - goto err_id; - - if (id < 0) { - id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL); - dev_dax->id = id; - if (id < 0) { - rc = id; - goto err_id; - } - } else { - /* region provider owns @id lifetime */ - dev_dax->id = -1; - } + goto err; /* * No 'host' or dax_operations since there is no access to this @@ -663,7 +648,7 @@ struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, dax_dev = alloc_dax(dev_dax, NULL, NULL); if (!dax_dev) { rc = -ENOMEM; - goto err_dax; + goto err; } /* from here on we're committed to teardown via dax_dev_release() */ @@ -700,10 +685,7 @@ struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, return dev_dax; - err_dax: - if (dev_dax->id >= 0) - ida_simple_remove(&dax_region->ida, dev_dax->id); - err_id: + err: kfree(dev_dax); return ERR_PTR(rc); From 93694f9630b0ed29cda61df58e480dcb34ef52fd Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 19 Jul 2017 15:57:44 -0700 Subject: [PATCH 002/855] device-dax: Kill dax_region base Nothing consumes this attribute of a region and devres otherwise remembers the value for de-allocation purposes. 
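As a rough before/after sketch of the pmem call site (mirroring the pmem.c hunk below), the memremap'd address no longer needs to be threaded through just so the region can hold it, since devm_memremap_pages() already handed that mapping to devres for teardown:

    /* before: 'addr' passed only so alloc_dax_region() could stash it */
    dax_region = alloc_dax_region(dev, region_id, &res,
            le32_to_cpu(pfn_sb->align), addr, PFN_DEV|PFN_MAP);

    /* after: drop the unused argument */
    dax_region = alloc_dax_region(dev, region_id, &res,
            le32_to_cpu(pfn_sb->align), PFN_DEV|PFN_MAP);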
Signed-off-by: Dan Williams --- drivers/dax/dax-private.h | 2 -- drivers/dax/device-dax.h | 5 ++--- drivers/dax/device.c | 3 +-- drivers/dax/pmem.c | 2 +- 4 files changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h index d1b36a42132f..9b393c218fe4 100644 --- a/drivers/dax/dax-private.h +++ b/drivers/dax/dax-private.h @@ -19,7 +19,6 @@ /** * struct dax_region - mapping infrastructure for dax devices * @id: kernel-wide unique region for a memory range - * @base: linear address corresponding to @res * @kref: to pin while other agents have a need to do lookups * @dev: parent device backing this region * @align: allocation and mapping alignment for child dax devices @@ -28,7 +27,6 @@ */ struct dax_region { int id; - void *base; struct kref kref; struct device *dev; unsigned int align; diff --git a/drivers/dax/device-dax.h b/drivers/dax/device-dax.h index 688b051750bd..4f1c69e1b3a2 100644 --- a/drivers/dax/device-dax.h +++ b/drivers/dax/device-dax.h @@ -17,9 +17,8 @@ struct dev_dax; struct resource; struct dax_region; void dax_region_put(struct dax_region *dax_region); -struct dax_region *alloc_dax_region(struct device *parent, - int region_id, struct resource *res, unsigned int align, - void *addr, unsigned long flags); +struct dax_region *alloc_dax_region(struct device *parent, int region_id, + struct resource *res, unsigned int align, unsigned long flags); struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, int id, struct resource *res, int count); #endif /* __DEVICE_DAX_H__ */ diff --git a/drivers/dax/device.c b/drivers/dax/device.c index a5a670c1cd58..811c1015194c 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -100,7 +100,7 @@ static void dax_region_unregister(void *region) } struct dax_region *alloc_dax_region(struct device *parent, int region_id, - struct resource *res, unsigned int align, void *addr, + struct resource *res, unsigned int align, unsigned long pfn_flags) { struct dax_region *dax_region; @@ -130,7 +130,6 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id, dax_region->id = region_id; dax_region->align = align; dax_region->dev = parent; - dax_region->base = addr; if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) { kfree(dax_region); return NULL; diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c index 2c1f459c0c63..72a76105eb02 100644 --- a/drivers/dax/pmem.c +++ b/drivers/dax/pmem.c @@ -125,7 +125,7 @@ static int dax_pmem_probe(struct device *dev) return -EINVAL; dax_region = alloc_dax_region(dev, region_id, &res, - le32_to_cpu(pfn_sb->align), addr, PFN_DEV|PFN_MAP); + le32_to_cpu(pfn_sb->align), PFN_DEV|PFN_MAP); if (!dax_region) return -ENOMEM; From 753a0850e707e9a8c5861356222f9b9e4eba7945 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 14 Jul 2017 13:54:50 -0700 Subject: [PATCH 003/855] device-dax: Remove multi-resource infrastructure The multi-resource implementation anticipated discontiguous sub-division support. That has not yet materialized, delete the infrastructure and related code. 
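With a single backing resource per instance, the pgoff-to-physical translation collapses to one range check; a condensed sketch (the device.c and dax-dev.c hunks below carry the exact form):

    struct resource *res = &dev_dax->region->res;
    phys_addr_t phys = pgoff * PAGE_SIZE + res->start;

    /* the lone resource fully describes the instance */
    if (phys >= res->start && phys + size - 1 <= res->end)
        return phys;
    return -1;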
Signed-off-by: Dan Williams --- drivers/dax/dax-private.h | 4 --- drivers/dax/device-dax.h | 3 +-- drivers/dax/device.c | 49 ++++++---------------------------- drivers/dax/pmem.c | 3 +-- tools/testing/nvdimm/dax-dev.c | 16 +++-------- 5 files changed, 13 insertions(+), 62 deletions(-) diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h index 9b393c218fe4..dbd077653b5c 100644 --- a/drivers/dax/dax-private.h +++ b/drivers/dax/dax-private.h @@ -39,14 +39,10 @@ struct dax_region { * @region - parent region * @dax_dev - core dax functionality * @dev - device core - * @num_resources - number of physical address extents in this device - * @res - array of physical address ranges */ struct dev_dax { struct dax_region *region; struct dax_device *dax_dev; struct device dev; - int num_resources; - struct resource res[0]; }; #endif diff --git a/drivers/dax/device-dax.h b/drivers/dax/device-dax.h index 4f1c69e1b3a2..e9be99584b92 100644 --- a/drivers/dax/device-dax.h +++ b/drivers/dax/device-dax.h @@ -19,6 +19,5 @@ struct dax_region; void dax_region_put(struct dax_region *dax_region); struct dax_region *alloc_dax_region(struct device *parent, int region_id, struct resource *res, unsigned int align, unsigned long flags); -struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, - int id, struct resource *res, int count); +struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, int id); #endif /* __DEVICE_DAX_H__ */ diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 811c1015194c..db12e24b8005 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -151,11 +151,7 @@ static ssize_t size_show(struct device *dev, struct device_attribute *attr, char *buf) { struct dev_dax *dev_dax = to_dev_dax(dev); - unsigned long long size = 0; - int i; - - for (i = 0; i < dev_dax->num_resources; i++) - size += resource_size(&dev_dax->res[i]); + unsigned long long size = resource_size(&dev_dax->region->res); return sprintf(buf, "%llu\n", size); } @@ -224,21 +220,11 @@ static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, unsigned long size) { - struct resource *res; - /* gcc-4.6.3-nolibc for i386 complains that this is uninitialized */ - phys_addr_t uninitialized_var(phys); - int i; + struct resource *res = &dev_dax->region->res; + phys_addr_t phys; - for (i = 0; i < dev_dax->num_resources; i++) { - res = &dev_dax->res[i]; - phys = pgoff * PAGE_SIZE + res->start; - if (phys >= res->start && phys <= res->end) - break; - pgoff -= PHYS_PFN(resource_size(res)); - } - - if (i < dev_dax->num_resources) { - res = &dev_dax->res[i]; + phys = pgoff * PAGE_SIZE + res->start; + if (phys >= res->start && phys <= res->end) { if (phys + size - 1 <= res->end) return phys; } @@ -608,8 +594,7 @@ static void unregister_dev_dax(void *dev) put_device(dev); } -struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, - int id, struct resource *res, int count) +struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, int id) { struct device *parent = dax_region->dev; struct dax_device *dax_dev; @@ -617,29 +602,12 @@ struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, struct inode *inode; struct device *dev; struct cdev *cdev; - int rc, i; + int rc; - if (!count) - return ERR_PTR(-EINVAL); - - dev_dax = kzalloc(struct_size(dev_dax, res, count), GFP_KERNEL); + dev_dax = kzalloc(sizeof(*dev_dax), GFP_KERNEL); if (!dev_dax) return ERR_PTR(-ENOMEM); - for (i = 0; i < 
count; i++) { - if (!IS_ALIGNED(res[i].start, dax_region->align) - || !IS_ALIGNED(resource_size(&res[i]), - dax_region->align)) { - rc = -EINVAL; - break; - } - dev_dax->res[i].start = res[i].start; - dev_dax->res[i].end = res[i].end; - } - - if (i < count) - goto err; - /* * No 'host' or dax_operations since there is no access to this * device outside of mmap of the resulting character device. */ @@ -659,7 +627,6 @@ struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, cdev_init(cdev, &dax_fops); cdev->owner = parent->driver->owner; - dev_dax->num_resources = count; dev_dax->dax_dev = dax_dev; dev_dax->region = dax_region; kref_get(&dax_region->kref); diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c index 72a76105eb02..6c61a019f997 100644 --- a/drivers/dax/pmem.c +++ b/drivers/dax/pmem.c @@ -129,8 +129,7 @@ static int dax_pmem_probe(struct device *dev) if (!dax_region) return -ENOMEM; - /* TODO: support for subdividing a dax region... */ - dev_dax = devm_create_dev_dax(dax_region, id, &res, 1); + dev_dax = devm_create_dev_dax(dax_region, id); /* child dev_dax instances now own the lifetime of the dax_region */ dax_region_put(dax_region); diff --git a/tools/testing/nvdimm/dax-dev.c b/tools/testing/nvdimm/dax-dev.c index 36ee3d8797c3..f36e708265b8 100644 --- a/tools/testing/nvdimm/dax-dev.c +++ b/tools/testing/nvdimm/dax-dev.c @@ -17,20 +17,11 @@ phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, unsigned long size) { - struct resource *res; + struct resource *res = &dev_dax->region->res; phys_addr_t addr; - int i; - for (i = 0; i < dev_dax->num_resources; i++) { - res = &dev_dax->res[i]; - addr = pgoff * PAGE_SIZE + res->start; - if (addr >= res->start && addr <= res->end) - break; - pgoff -= PHYS_PFN(resource_size(res)); - } - - if (i < dev_dax->num_resources) { - res = &dev_dax->res[i]; + addr = pgoff * PAGE_SIZE + res->start; + if (addr >= res->start && addr <= res->end) { if (addr + size - 1 <= res->end) { if (get_nfit_res(addr)) { struct page *page; @@ -44,6 +35,5 @@ phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, return addr; } } - return -1; } From 51cf784c42d07fbd62cb604836a9270cf3361509 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 12 Jul 2017 17:58:21 -0700 Subject: [PATCH 004/855] device-dax: Start defining a dax bus model Towards eliminating the dax_class, move the dax-device-attribute enabling to a new bus.c file in the core. The amount of code thrash of subsequent patches is reduced as no logic changes are made, just pure code movement. A temporary export of unregister_dev_dax() and dax_attribute_groups is needed to preserve compilation, but those symbols become static again in a follow-on patch.
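For illustration, the stop-gap in bus.c is simply the moved definition plus an export (taken from the hunk below); the follow-on bus + driver model patch makes it static again once devm_create_dev_dax() also moves:

    const struct attribute_group *dax_attribute_groups[] = {
        &dev_dax_attribute_group,
        NULL,
    };
    EXPORT_SYMBOL_GPL(dax_attribute_groups);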
Signed-off-by: Dan Williams --- drivers/dax/Makefile | 1 + drivers/dax/bus.c | 174 +++++++++++++++++++++++++++++++++ drivers/dax/bus.h | 15 +++ drivers/dax/dax-private.h | 14 +++ drivers/dax/dax.h | 18 ---- drivers/dax/device-dax.h | 23 ----- drivers/dax/device.c | 185 +----------------------------------- drivers/dax/pmem.c | 2 +- drivers/dax/super.c | 1 + tools/testing/nvdimm/Kbuild | 1 + 10 files changed, 210 insertions(+), 224 deletions(-) create mode 100644 drivers/dax/bus.c create mode 100644 drivers/dax/bus.h delete mode 100644 drivers/dax/dax.h delete mode 100644 drivers/dax/device-dax.h diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile index 574286fac87c..658e6b9b1d74 100644 --- a/drivers/dax/Makefile +++ b/drivers/dax/Makefile @@ -4,5 +4,6 @@ obj-$(CONFIG_DEV_DAX) += device_dax.o obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o dax-y := super.o +dax-y += bus.o dax_pmem-y := pmem.o device_dax-y := device.o diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c new file mode 100644 index 000000000000..8a398e8e1956 --- /dev/null +++ b/drivers/dax/bus.c @@ -0,0 +1,174 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2017-2018 Intel Corporation. All rights reserved. */ +#include +#include +#include +#include "dax-private.h" +#include "bus.h" + +/* + * Rely on the fact that drvdata is set before the attributes are + * registered, and that the attributes are unregistered before drvdata + * is cleared to assume that drvdata is always valid. + */ +static ssize_t id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dax_region *dax_region = dev_get_drvdata(dev); + + return sprintf(buf, "%d\n", dax_region->id); +} +static DEVICE_ATTR_RO(id); + +static ssize_t region_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dax_region *dax_region = dev_get_drvdata(dev); + + return sprintf(buf, "%llu\n", (unsigned long long) + resource_size(&dax_region->res)); +} +static struct device_attribute dev_attr_region_size = __ATTR(size, 0444, + region_size_show, NULL); + +static ssize_t align_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dax_region *dax_region = dev_get_drvdata(dev); + + return sprintf(buf, "%u\n", dax_region->align); +} +static DEVICE_ATTR_RO(align); + +static struct attribute *dax_region_attributes[] = { + &dev_attr_region_size.attr, + &dev_attr_align.attr, + &dev_attr_id.attr, + NULL, +}; + +static const struct attribute_group dax_region_attribute_group = { + .name = "dax_region", + .attrs = dax_region_attributes, +}; + +static const struct attribute_group *dax_region_attribute_groups[] = { + &dax_region_attribute_group, + NULL, +}; + +static void dax_region_free(struct kref *kref) +{ + struct dax_region *dax_region; + + dax_region = container_of(kref, struct dax_region, kref); + kfree(dax_region); +} + +void dax_region_put(struct dax_region *dax_region) +{ + kref_put(&dax_region->kref, dax_region_free); +} +EXPORT_SYMBOL_GPL(dax_region_put); + +static void dax_region_unregister(void *region) +{ + struct dax_region *dax_region = region; + + sysfs_remove_groups(&dax_region->dev->kobj, + dax_region_attribute_groups); + dax_region_put(dax_region); +} + +struct dax_region *alloc_dax_region(struct device *parent, int region_id, + struct resource *res, unsigned int align, + unsigned long pfn_flags) +{ + struct dax_region *dax_region; + + /* + * The DAX core assumes that it can store its private data in + * parent->driver_data. 
This WARN is a reminder / safeguard for + * developers of device-dax drivers. + */ + if (dev_get_drvdata(parent)) { + dev_WARN(parent, "dax core failed to setup private data\n"); + return NULL; + } + + if (!IS_ALIGNED(res->start, align) + || !IS_ALIGNED(resource_size(res), align)) + return NULL; + + dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL); + if (!dax_region) + return NULL; + + dev_set_drvdata(parent, dax_region); + memcpy(&dax_region->res, res, sizeof(*res)); + dax_region->pfn_flags = pfn_flags; + kref_init(&dax_region->kref); + dax_region->id = region_id; + dax_region->align = align; + dax_region->dev = parent; + if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) { + kfree(dax_region); + return NULL; + } + + kref_get(&dax_region->kref); + if (devm_add_action_or_reset(parent, dax_region_unregister, dax_region)) + return NULL; + return dax_region; +} +EXPORT_SYMBOL_GPL(alloc_dax_region); + +static ssize_t size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + unsigned long long size = resource_size(&dev_dax->region->res); + + return sprintf(buf, "%llu\n", size); +} +static DEVICE_ATTR_RO(size); + +static struct attribute *dev_dax_attributes[] = { + &dev_attr_size.attr, + NULL, +}; + +static const struct attribute_group dev_dax_attribute_group = { + .attrs = dev_dax_attributes, +}; + +const struct attribute_group *dax_attribute_groups[] = { + &dev_dax_attribute_group, + NULL, +}; +EXPORT_SYMBOL_GPL(dax_attribute_groups); + +void kill_dev_dax(struct dev_dax *dev_dax) +{ + struct dax_device *dax_dev = dev_dax->dax_dev; + struct inode *inode = dax_inode(dax_dev); + + kill_dax(dax_dev); + unmap_mapping_range(inode->i_mapping, 0, 0, 1); +} +EXPORT_SYMBOL_GPL(kill_dev_dax); + +void unregister_dev_dax(void *dev) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + struct dax_device *dax_dev = dev_dax->dax_dev; + struct inode *inode = dax_inode(dax_dev); + struct cdev *cdev = inode->i_cdev; + + dev_dbg(dev, "trace\n"); + + kill_dev_dax(dev_dax); + cdev_device_del(cdev, dev); + put_device(dev); +} +EXPORT_SYMBOL_GPL(unregister_dev_dax); diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h new file mode 100644 index 000000000000..840865aa69e8 --- /dev/null +++ b/drivers/dax/bus.h @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016 - 2018 Intel Corporation. All rights reserved. 
*/ +#ifndef __DAX_BUS_H__ +#define __DAX_BUS_H__ +struct device; +struct dev_dax; +struct resource; +struct dax_device; +struct dax_region; +void dax_region_put(struct dax_region *dax_region); +struct dax_region *alloc_dax_region(struct device *parent, int region_id, + struct resource *res, unsigned int align, unsigned long flags); +struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, int id); +void kill_dev_dax(struct dev_dax *dev_dax); +#endif /* __DAX_BUS_H__ */ diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h index dbd077653b5c..620c3f4eefe7 100644 --- a/drivers/dax/dax-private.h +++ b/drivers/dax/dax-private.h @@ -16,6 +16,15 @@ #include #include +/* private routines between core files */ +struct dax_device; +struct dax_device *inode_dax(struct inode *inode); +struct inode *dax_inode(struct dax_device *dax_dev); + +/* temporary until devm_create_dax_dev moves to bus.c */ +extern const struct attribute_group *dax_attribute_groups[]; +void unregister_dev_dax(void *dev); + /** * struct dax_region - mapping infrastructure for dax devices * @id: kernel-wide unique region for a memory range @@ -45,4 +54,9 @@ struct dev_dax { struct dax_device *dax_dev; struct device dev; }; + +static inline struct dev_dax *to_dev_dax(struct device *dev) +{ + return container_of(dev, struct dev_dax, dev); +} #endif diff --git a/drivers/dax/dax.h b/drivers/dax/dax.h deleted file mode 100644 index f9e5feea742c..000000000000 --- a/drivers/dax/dax.h +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Copyright(c) 2016 - 2017 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ -#ifndef __DAX_H__ -#define __DAX_H__ -struct dax_device; -struct dax_device *inode_dax(struct inode *inode); -struct inode *dax_inode(struct dax_device *dax_dev); -#endif /* __DAX_H__ */ diff --git a/drivers/dax/device-dax.h b/drivers/dax/device-dax.h deleted file mode 100644 index e9be99584b92..000000000000 --- a/drivers/dax/device-dax.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright(c) 2016 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- */ -#ifndef __DEVICE_DAX_H__ -#define __DEVICE_DAX_H__ -struct device; -struct dev_dax; -struct resource; -struct dax_region; -void dax_region_put(struct dax_region *dax_region); -struct dax_region *alloc_dax_region(struct device *parent, int region_id, - struct resource *res, unsigned int align, unsigned long flags); -struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, int id); -#endif /* __DEVICE_DAX_H__ */ diff --git a/drivers/dax/device.c b/drivers/dax/device.c index db12e24b8005..1fc375783e0b 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -1,15 +1,5 @@ -/* - * Copyright(c) 2016 - 2017 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016-2018 Intel Corporation. All rights reserved. */ #include #include #include @@ -21,156 +11,10 @@ #include #include #include "dax-private.h" -#include "dax.h" +#include "bus.h" static struct class *dax_class; -/* - * Rely on the fact that drvdata is set before the attributes are - * registered, and that the attributes are unregistered before drvdata - * is cleared to assume that drvdata is always valid. - */ -static ssize_t id_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct dax_region *dax_region = dev_get_drvdata(dev); - - return sprintf(buf, "%d\n", dax_region->id); -} -static DEVICE_ATTR_RO(id); - -static ssize_t region_size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct dax_region *dax_region = dev_get_drvdata(dev); - - return sprintf(buf, "%llu\n", (unsigned long long) - resource_size(&dax_region->res)); -} -static struct device_attribute dev_attr_region_size = __ATTR(size, 0444, - region_size_show, NULL); - -static ssize_t align_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct dax_region *dax_region = dev_get_drvdata(dev); - - return sprintf(buf, "%u\n", dax_region->align); -} -static DEVICE_ATTR_RO(align); - -static struct attribute *dax_region_attributes[] = { - &dev_attr_region_size.attr, - &dev_attr_align.attr, - &dev_attr_id.attr, - NULL, -}; - -static const struct attribute_group dax_region_attribute_group = { - .name = "dax_region", - .attrs = dax_region_attributes, -}; - -static const struct attribute_group *dax_region_attribute_groups[] = { - &dax_region_attribute_group, - NULL, -}; - -static void dax_region_free(struct kref *kref) -{ - struct dax_region *dax_region; - - dax_region = container_of(kref, struct dax_region, kref); - kfree(dax_region); -} - -void dax_region_put(struct dax_region *dax_region) -{ - kref_put(&dax_region->kref, dax_region_free); -} -EXPORT_SYMBOL_GPL(dax_region_put); - -static void dax_region_unregister(void *region) -{ - struct dax_region *dax_region = region; - - sysfs_remove_groups(&dax_region->dev->kobj, - dax_region_attribute_groups); - dax_region_put(dax_region); -} - -struct dax_region *alloc_dax_region(struct device *parent, int region_id, - struct resource *res, unsigned int align, - unsigned long pfn_flags) -{ - struct dax_region *dax_region; - - /* - * The DAX core assumes that it 
can store its private data in - * parent->driver_data. This WARN is a reminder / safeguard for - * developers of device-dax drivers. - */ - if (dev_get_drvdata(parent)) { - dev_WARN(parent, "dax core failed to setup private data\n"); - return NULL; - } - - if (!IS_ALIGNED(res->start, align) - || !IS_ALIGNED(resource_size(res), align)) - return NULL; - - dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL); - if (!dax_region) - return NULL; - - dev_set_drvdata(parent, dax_region); - memcpy(&dax_region->res, res, sizeof(*res)); - dax_region->pfn_flags = pfn_flags; - kref_init(&dax_region->kref); - dax_region->id = region_id; - dax_region->align = align; - dax_region->dev = parent; - if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) { - kfree(dax_region); - return NULL; - } - - kref_get(&dax_region->kref); - if (devm_add_action_or_reset(parent, dax_region_unregister, dax_region)) - return NULL; - return dax_region; -} -EXPORT_SYMBOL_GPL(alloc_dax_region); - -static struct dev_dax *to_dev_dax(struct device *dev) -{ - return container_of(dev, struct dev_dax, dev); -} - -static ssize_t size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct dev_dax *dev_dax = to_dev_dax(dev); - unsigned long long size = resource_size(&dev_dax->region->res); - - return sprintf(buf, "%llu\n", size); -} -static DEVICE_ATTR_RO(size); - -static struct attribute *dev_dax_attributes[] = { - &dev_attr_size.attr, - NULL, -}; - -static const struct attribute_group dev_dax_attribute_group = { - .attrs = dev_dax_attributes, -}; - -static const struct attribute_group *dax_attribute_groups[] = { - &dev_dax_attribute_group, - NULL, -}; - static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, const char *func) { @@ -571,29 +415,6 @@ static void dev_dax_release(struct device *dev) kfree(dev_dax); } -static void kill_dev_dax(struct dev_dax *dev_dax) -{ - struct dax_device *dax_dev = dev_dax->dax_dev; - struct inode *inode = dax_inode(dax_dev); - - kill_dax(dax_dev); - unmap_mapping_range(inode->i_mapping, 0, 0, 1); -} - -static void unregister_dev_dax(void *dev) -{ - struct dev_dax *dev_dax = to_dev_dax(dev); - struct dax_device *dax_dev = dev_dax->dax_dev; - struct inode *inode = dax_inode(dax_dev); - struct cdev *cdev = inode->i_cdev; - - dev_dbg(dev, "trace\n"); - - kill_dev_dax(dev_dax); - cdev_device_del(cdev, dev); - put_device(dev); -} - struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, int id) { struct device *parent = dax_region->dev; diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c index 6c61a019f997..e1b50d8fd1f8 100644 --- a/drivers/dax/pmem.c +++ b/drivers/dax/pmem.c @@ -16,7 +16,7 @@ #include #include "../nvdimm/pfn.h" #include "../nvdimm/nd.h" -#include "device-dax.h" +#include "bus.h" struct dax_pmem { struct device *dev; diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 6e928f37d084..0ecc1a2cf1cc 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -22,6 +22,7 @@ #include #include #include +#include "dax-private.h" static dev_t dax_devt; DEFINE_STATIC_SRCU(dax_srcu); diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild index 10ddf223055b..c9b500a652d5 100644 --- a/tools/testing/nvdimm/Kbuild +++ b/tools/testing/nvdimm/Kbuild @@ -57,6 +57,7 @@ nd_e820-y := $(NVDIMM_SRC)/e820.o nd_e820-y += config_check.o dax-y := $(DAX_SRC)/super.o +dax-y += $(DAX_SRC)/bus.o dax-y += config_check.o device_dax-y := $(DAX_SRC)/device.o From 9567da0b408a2553d32ca83cba4f1fc5a8aad459 Mon Sep 17 00:00:00 2001 
From: Dan Williams Date: Wed, 12 Jul 2017 17:58:21 -0700 Subject: [PATCH 005/855] device-dax: Introduce bus + driver model In support of multiple device-dax instances per device-dax-region and allowing the 'kmem' driver to attach to dax-instances instead of the current device-node access, convert the dax sub-system from a class to a bus. Recall that the kmem driver takes reserved / special purpose memories and assigns them to be managed by the core-mm. Aside from the fact the device-dax instances are registered and probed on a bus, two other lifetime-management changes are made: 1/ Delay attaching a cdev until driver probe time 2/ A new run_dax() helper is introduced to allow restoring dax-operation after a kill_dax() event. So, at driver ->probe() time we run_dax() and at ->remove() time we kill_dax() and invalidate all mappings. Signed-off-by: Dan Williams --- drivers/dax/bus.c | 133 +++++++++++++++++++++++++++++++++++--- drivers/dax/bus.h | 16 +++++ drivers/dax/dax-private.h | 6 +- drivers/dax/device.c | 99 ++++++++++------------------ drivers/dax/super.c | 40 ++++++++---- 5 files changed, 205 insertions(+), 89 deletions(-) diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index 8a398e8e1956..0cff32102c4c 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -6,6 +6,33 @@ #include "dax-private.h" #include "bus.h" +static int dax_bus_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + /* + * We only ever expect to handle device-dax instances, i.e. the + * @type argument to MODULE_ALIAS_DAX_DEVICE() is always zero + */ + return add_uevent_var(env, "MODALIAS=" DAX_DEVICE_MODALIAS_FMT, 0); +} + +static int dax_bus_match(struct device *dev, struct device_driver *drv); + +static struct bus_type dax_bus_type = { + .name = "dax", + .uevent = dax_bus_uevent, + .match = dax_bus_match, +}; + +static int dax_bus_match(struct device *dev, struct device_driver *drv) +{ + /* + * The drivers that can register on the 'dax' bus are private to + * drivers/dax/ so any device and driver on the bus always + * match. 
+ */ + return 1; +} + /* * Rely on the fact that drvdata is set before the attributes are * registered, and that the attributes are unregistered before drvdata @@ -142,11 +169,10 @@ static const struct attribute_group dev_dax_attribute_group = { .attrs = dev_dax_attributes, }; -const struct attribute_group *dax_attribute_groups[] = { +static const struct attribute_group *dax_attribute_groups[] = { &dev_dax_attribute_group, NULL, }; -EXPORT_SYMBOL_GPL(dax_attribute_groups); void kill_dev_dax(struct dev_dax *dev_dax) { @@ -158,17 +184,108 @@ void kill_dev_dax(struct dev_dax *dev_dax) } EXPORT_SYMBOL_GPL(kill_dev_dax); -void unregister_dev_dax(void *dev) +static void dev_dax_release(struct device *dev) { struct dev_dax *dev_dax = to_dev_dax(dev); + struct dax_region *dax_region = dev_dax->region; struct dax_device *dax_dev = dev_dax->dax_dev; - struct inode *inode = dax_inode(dax_dev); - struct cdev *cdev = inode->i_cdev; - dev_dbg(dev, "trace\n"); + dax_region_put(dax_region); + put_dax(dax_dev); + kfree(dev_dax); +} + +static void unregister_dev_dax(void *dev) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + + dev_dbg(dev, "%s\n", __func__); kill_dev_dax(dev_dax); - cdev_device_del(cdev, dev); + device_del(dev); put_device(dev); } -EXPORT_SYMBOL_GPL(unregister_dev_dax); + +struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, int id) +{ + struct device *parent = dax_region->dev; + struct dax_device *dax_dev; + struct dev_dax *dev_dax; + struct inode *inode; + struct device *dev; + int rc = -ENOMEM; + + if (id < 0) + return ERR_PTR(-EINVAL); + + dev_dax = kzalloc(sizeof(*dev_dax), GFP_KERNEL); + if (!dev_dax) + return ERR_PTR(-ENOMEM); + + /* + * No 'host' or dax_operations since there is no access to this + * device outside of mmap of the resulting character device. 
+ */ + dax_dev = alloc_dax(dev_dax, NULL, NULL); + if (!dax_dev) + goto err; + + /* a device_dax instance is dead while the driver is not attached */ + kill_dax(dax_dev); + + /* from here on we're committed to teardown via dax_dev_release() */ + dev = &dev_dax->dev; + device_initialize(dev); + + dev_dax->dax_dev = dax_dev; + dev_dax->region = dax_region; + kref_get(&dax_region->kref); + + inode = dax_inode(dax_dev); + dev->devt = inode->i_rdev; + dev->bus = &dax_bus_type; + dev->parent = parent; + dev->groups = dax_attribute_groups; + dev->release = dev_dax_release; + dev_set_name(dev, "dax%d.%d", dax_region->id, id); + + rc = device_add(dev); + if (rc) { + kill_dev_dax(dev_dax); + put_device(dev); + return ERR_PTR(rc); + } + + rc = devm_add_action_or_reset(dax_region->dev, unregister_dev_dax, dev); + if (rc) + return ERR_PTR(rc); + + return dev_dax; + + err: + kfree(dev_dax); + + return ERR_PTR(rc); +} +EXPORT_SYMBOL_GPL(devm_create_dev_dax); + +int __dax_driver_register(struct device_driver *drv, + struct module *module, const char *mod_name) +{ + drv->owner = module; + drv->name = mod_name; + drv->mod_name = mod_name; + drv->bus = &dax_bus_type; + return driver_register(drv); +} +EXPORT_SYMBOL_GPL(__dax_driver_register); + +int __init dax_bus_init(void) +{ + return bus_register(&dax_bus_type); +} + +void __exit dax_bus_exit(void) +{ + bus_unregister(&dax_bus_type); +} diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h index 840865aa69e8..ea509504df3a 100644 --- a/drivers/dax/bus.h +++ b/drivers/dax/bus.h @@ -11,5 +11,21 @@ void dax_region_put(struct dax_region *dax_region); struct dax_region *alloc_dax_region(struct device *parent, int region_id, struct resource *res, unsigned int align, unsigned long flags); struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, int id); +int __dax_driver_register(struct device_driver *drv, + struct module *module, const char *mod_name); +#define dax_driver_register(driver) \ + __dax_driver_register(driver, THIS_MODULE, KBUILD_MODNAME) void kill_dev_dax(struct dev_dax *dev_dax); + +/* + * While run_dax() is potentially a generic operation that could be + * defined in include/linux/dax.h we don't want to grow any users + * outside of drivers/dax/ + */ +void run_dax(struct dax_device *dax_dev); + +#define MODULE_ALIAS_DAX_DEVICE(type) \ + MODULE_ALIAS("dax:t" __stringify(type) "*") +#define DAX_DEVICE_MODALIAS_FMT "dax:t%d" + #endif /* __DAX_BUS_H__ */ diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h index 620c3f4eefe7..c3a121700837 100644 --- a/drivers/dax/dax-private.h +++ b/drivers/dax/dax-private.h @@ -20,10 +20,8 @@ struct dax_device; struct dax_device *inode_dax(struct inode *inode); struct inode *dax_inode(struct dax_device *dax_dev); - -/* temporary until devm_create_dax_dev moves to bus.c */ -extern const struct attribute_group *dax_attribute_groups[]; -void unregister_dev_dax(void *dev); +int dax_bus_init(void); +void dax_bus_exit(void); /** * struct dax_region - mapping infrastructure for dax devices diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 1fc375783e0b..f55829404a24 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -13,8 +13,6 @@ #include "dax-private.h" #include "bus.h" -static struct class *dax_class; - static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, const char *func) { @@ -404,93 +402,64 @@ static const struct file_operations dax_fops = { .mmap_supported_flags = MAP_SYNC, }; -static void dev_dax_release(struct device *dev) +static void 
dev_dax_cdev_del(void *cdev) { - struct dev_dax *dev_dax = to_dev_dax(dev); - struct dax_region *dax_region = dev_dax->region; - struct dax_device *dax_dev = dev_dax->dax_dev; - - dax_region_put(dax_region); - put_dax(dax_dev); - kfree(dev_dax); + cdev_del(cdev); } -struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, int id) +static void dev_dax_kill(void *dev_dax) { - struct device *parent = dax_region->dev; - struct dax_device *dax_dev; - struct dev_dax *dev_dax; + kill_dev_dax(dev_dax); +} + +static int dev_dax_probe(struct device *dev) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + struct dax_device *dax_dev = dev_dax->dax_dev; struct inode *inode; - struct device *dev; struct cdev *cdev; int rc; - dev_dax = kzalloc(sizeof(*dev_dax), GFP_KERNEL); - if (!dev_dax) - return ERR_PTR(-ENOMEM); - - /* - * No 'host' or dax_operations since there is no access to this - * device outside of mmap of the resulting character device. - */ - dax_dev = alloc_dax(dev_dax, NULL, NULL); - if (!dax_dev) { - rc = -ENOMEM; - goto err; - } - - /* from here on we're committed to teardown via dax_dev_release() */ - dev = &dev_dax->dev; - device_initialize(dev); - inode = dax_inode(dax_dev); cdev = inode->i_cdev; cdev_init(cdev, &dax_fops); - cdev->owner = parent->driver->owner; - - dev_dax->dax_dev = dax_dev; - dev_dax->region = dax_region; - kref_get(&dax_region->kref); - - dev->devt = inode->i_rdev; - dev->class = dax_class; - dev->parent = parent; - dev->groups = dax_attribute_groups; - dev->release = dev_dax_release; - dev_set_name(dev, "dax%d.%d", dax_region->id, id); - - rc = cdev_device_add(cdev, dev); - if (rc) { - kill_dev_dax(dev_dax); - put_device(dev); - return ERR_PTR(rc); - } - - rc = devm_add_action_or_reset(dax_region->dev, unregister_dev_dax, dev); + cdev->owner = dev->driver->owner; + cdev_set_parent(cdev, &dev->kobj); + rc = cdev_add(cdev, dev->devt, 1); if (rc) - return ERR_PTR(rc); + return rc; - return dev_dax; + rc = devm_add_action_or_reset(dev, dev_dax_cdev_del, cdev); + if (rc) + return rc; - err: - kfree(dev_dax); - - return ERR_PTR(rc); + run_dax(dax_dev); + return devm_add_action_or_reset(dev, dev_dax_kill, dev_dax); } -EXPORT_SYMBOL_GPL(devm_create_dev_dax); + +static int dev_dax_remove(struct device *dev) +{ + /* all probe actions are unwound by devm */ + return 0; +} + +static struct device_driver device_dax_driver = { + .probe = dev_dax_probe, + .remove = dev_dax_remove, +}; static int __init dax_init(void) { - dax_class = class_create(THIS_MODULE, "dax"); - return PTR_ERR_OR_ZERO(dax_class); + return dax_driver_register(&device_dax_driver); } static void __exit dax_exit(void) { - class_destroy(dax_class); + driver_unregister(&device_dax_driver); } MODULE_AUTHOR("Intel Corporation"); MODULE_LICENSE("GPL v2"); -subsys_initcall(dax_init); +module_init(dax_init); module_exit(dax_exit); +MODULE_ALIAS_DAX_DEVICE(0); diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 0ecc1a2cf1cc..ccb22d8db3a2 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -366,11 +366,15 @@ void kill_dax(struct dax_device *dax_dev) spin_lock(&dax_host_lock); hlist_del_init(&dax_dev->list); spin_unlock(&dax_host_lock); - - dax_dev->private = NULL; } EXPORT_SYMBOL_GPL(kill_dax); +void run_dax(struct dax_device *dax_dev) +{ + set_bit(DAXDEV_ALIVE, &dax_dev->flags); +} +EXPORT_SYMBOL_GPL(run_dax); + static struct inode *dax_alloc_inode(struct super_block *sb) { struct dax_device *dax_dev; @@ -585,6 +589,8 @@ EXPORT_SYMBOL_GPL(dax_inode); void *dax_get_private(struct dax_device 
*dax_dev) { + if (!test_bit(DAXDEV_ALIVE, &dax_dev->flags)) + return NULL; return dax_dev->private; } EXPORT_SYMBOL_GPL(dax_get_private); @@ -598,7 +604,7 @@ static void init_once(void *_dax_dev) inode_init_once(inode); } -static int __dax_fs_init(void) +static int dax_fs_init(void) { int rc; @@ -630,35 +636,45 @@ static int __dax_fs_init(void) return rc; } -static void __dax_fs_exit(void) +static void dax_fs_exit(void) { kern_unmount(dax_mnt); unregister_filesystem(&dax_fs_type); kmem_cache_destroy(dax_cache); } -static int __init dax_fs_init(void) +static int __init dax_core_init(void) { int rc; - rc = __dax_fs_init(); + rc = dax_fs_init(); if (rc) return rc; rc = alloc_chrdev_region(&dax_devt, 0, MINORMASK+1, "dax"); if (rc) - __dax_fs_exit(); - return rc; + goto err_chrdev; + + rc = dax_bus_init(); + if (rc) + goto err_bus; + return 0; + +err_bus: + unregister_chrdev_region(dax_devt, MINORMASK+1); +err_chrdev: + dax_fs_exit(); + return 0; } -static void __exit dax_fs_exit(void) +static void __exit dax_core_exit(void) { unregister_chrdev_region(dax_devt, MINORMASK+1); ida_destroy(&dax_minor_ida); - __dax_fs_exit(); + dax_fs_exit(); } MODULE_AUTHOR("Intel Corporation"); MODULE_LICENSE("GPL v2"); -subsys_initcall(dax_fs_init); -module_exit(dax_fs_exit); +subsys_initcall(dax_core_init); +module_exit(dax_core_exit); From 89ec9f2cfa36cc5fca2fb445ed221bb9add7b536 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 29 Oct 2018 15:52:42 -0700 Subject: [PATCH 006/855] device-dax: Move resource pinning+mapping into the common driver Move the responsibility of calling devm_request_resource() and devm_memremap_pages() into the common device-dax driver. This is another preparatory step to allowing an alternate personality driver for a device-dax range. Signed-off-by: Dan Williams --- drivers/dax/bus.c | 6 ++- drivers/dax/bus.h | 3 +- drivers/dax/dax-private.h | 9 +++- drivers/dax/device.c | 61 ++++++++++++++++++++++++++ drivers/dax/pmem.c | 90 ++++++--------------------------------- 5 files changed, 90 insertions(+), 79 deletions(-) diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index 0cff32102c4c..69aae2cbd45f 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2017-2018 Intel Corporation. All rights reserved. */ +#include #include #include #include @@ -206,7 +207,8 @@ static void unregister_dev_dax(void *dev) put_device(dev); } -struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, int id) +struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, int id, + struct dev_pagemap *pgmap) { struct device *parent = dax_region->dev; struct dax_device *dax_dev; @@ -222,6 +224,8 @@ struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, int id) if (!dev_dax) return ERR_PTR(-ENOMEM); + memcpy(&dev_dax->pgmap, pgmap, sizeof(*pgmap)); + /* * No 'host' or dax_operations since there is no access to this * device outside of mmap of the resulting character device. 
diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h index ea509504df3a..e08e0c394983 100644 --- a/drivers/dax/bus.h +++ b/drivers/dax/bus.h @@ -10,7 +10,8 @@ struct dax_region; void dax_region_put(struct dax_region *dax_region); struct dax_region *alloc_dax_region(struct device *parent, int region_id, struct resource *res, unsigned int align, unsigned long flags); -struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, int id); +struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, int id, + struct dev_pagemap *pgmap); int __dax_driver_register(struct device_driver *drv, struct module *module, const char *mod_name); #define dax_driver_register(driver) \ diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h index c3a121700837..a82ce48f5884 100644 --- a/drivers/dax/dax-private.h +++ b/drivers/dax/dax-private.h @@ -42,15 +42,22 @@ struct dax_region { }; /** - * struct dev_dax - instance data for a subdivision of a dax region + * struct dev_dax - instance data for a subdivision of a dax region, and + * data while the device is activated in the driver. * @region - parent region * @dax_dev - core dax functionality * @dev - device core + * @pgmap - pgmap for memmap setup / lifetime (driver owned) + * @ref: pgmap reference count (driver owned) + * @cmp: @ref final put completion (driver owned) */ struct dev_dax { struct dax_region *region; struct dax_device *dax_dev; struct device dev; + struct dev_pagemap pgmap; + struct percpu_ref ref; + struct completion cmp; }; static inline struct dev_dax *to_dev_dax(struct device *dev) diff --git a/drivers/dax/device.c b/drivers/dax/device.c index f55829404a24..6ad964d7b077 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2016-2018 Intel Corporation. All rights reserved. 
*/ +#include #include #include #include @@ -13,6 +14,38 @@ #include "dax-private.h" #include "bus.h" +static struct dev_dax *ref_to_dev_dax(struct percpu_ref *ref) +{ + return container_of(ref, struct dev_dax, ref); +} + +static void dev_dax_percpu_release(struct percpu_ref *ref) +{ + struct dev_dax *dev_dax = ref_to_dev_dax(ref); + + dev_dbg(&dev_dax->dev, "%s\n", __func__); + complete(&dev_dax->cmp); +} + +static void dev_dax_percpu_exit(void *data) +{ + struct percpu_ref *ref = data; + struct dev_dax *dev_dax = ref_to_dev_dax(ref); + + dev_dbg(&dev_dax->dev, "%s\n", __func__); + wait_for_completion(&dev_dax->cmp); + percpu_ref_exit(ref); +} + +static void dev_dax_percpu_kill(struct percpu_ref *data) +{ + struct percpu_ref *ref = data; + struct dev_dax *dev_dax = ref_to_dev_dax(ref); + + dev_dbg(&dev_dax->dev, "%s\n", __func__); + percpu_ref_kill(ref); +} + static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, const char *func) { @@ -416,10 +449,38 @@ static int dev_dax_probe(struct device *dev) { struct dev_dax *dev_dax = to_dev_dax(dev); struct dax_device *dax_dev = dev_dax->dax_dev; + struct resource *res = &dev_dax->region->res; struct inode *inode; struct cdev *cdev; + void *addr; int rc; + /* 1:1 map region resource range to device-dax instance range */ + if (!devm_request_mem_region(dev, res->start, resource_size(res), + dev_name(dev))) { + dev_warn(dev, "could not reserve region %pR\n", res); + return -EBUSY; + } + + init_completion(&dev_dax->cmp); + rc = percpu_ref_init(&dev_dax->ref, dev_dax_percpu_release, 0, + GFP_KERNEL); + if (rc) + return rc; + + rc = devm_add_action_or_reset(dev, dev_dax_percpu_exit, &dev_dax->ref); + if (rc) + return rc; + + dev_dax->pgmap.ref = &dev_dax->ref; + dev_dax->pgmap.kill = dev_dax_percpu_kill; + addr = devm_memremap_pages(dev, &dev_dax->pgmap); + if (IS_ERR(addr)) { + devm_remove_action(dev, dev_dax_percpu_exit, &dev_dax->ref); + percpu_ref_exit(&dev_dax->ref); + return PTR_ERR(addr); + } + inode = dax_inode(dax_dev); cdev = inode->i_cdev; cdev_init(cdev, &dax_fops); diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c index e1b50d8fd1f8..d3cefa7868ac 100644 --- a/drivers/dax/pmem.c +++ b/drivers/dax/pmem.c @@ -18,54 +18,16 @@ #include "../nvdimm/nd.h" #include "bus.h" -struct dax_pmem { - struct device *dev; - struct percpu_ref ref; - struct dev_pagemap pgmap; - struct completion cmp; -}; - -static struct dax_pmem *to_dax_pmem(struct percpu_ref *ref) -{ - return container_of(ref, struct dax_pmem, ref); -} - -static void dax_pmem_percpu_release(struct percpu_ref *ref) -{ - struct dax_pmem *dax_pmem = to_dax_pmem(ref); - - dev_dbg(dax_pmem->dev, "trace\n"); - complete(&dax_pmem->cmp); -} - -static void dax_pmem_percpu_exit(void *data) -{ - struct percpu_ref *ref = data; - struct dax_pmem *dax_pmem = to_dax_pmem(ref); - - dev_dbg(dax_pmem->dev, "trace\n"); - wait_for_completion(&dax_pmem->cmp); - percpu_ref_exit(ref); -} - -static void dax_pmem_percpu_kill(struct percpu_ref *ref) -{ - struct dax_pmem *dax_pmem = to_dax_pmem(ref); - - dev_dbg(dax_pmem->dev, "trace\n"); - percpu_ref_kill(ref); -} - static int dax_pmem_probe(struct device *dev) { - void *addr; struct resource res; int rc, id, region_id; + resource_size_t offset; struct nd_pfn_sb *pfn_sb; struct dev_dax *dev_dax; - struct dax_pmem *dax_pmem; struct nd_namespace_io *nsio; struct dax_region *dax_region; + struct dev_pagemap pgmap = { 0 }; struct nd_namespace_common *ndns; struct nd_dax *nd_dax = to_nd_dax(dev); struct nd_pfn *nd_pfn = &nd_dax->nd_pfn; @@ -75,61 
+37,37 @@ static int dax_pmem_probe(struct device *dev) return PTR_ERR(ndns); nsio = to_nd_namespace_io(&ndns->dev); - dax_pmem = devm_kzalloc(dev, sizeof(*dax_pmem), GFP_KERNEL); - if (!dax_pmem) - return -ENOMEM; - /* parse the 'pfn' info block via ->rw_bytes */ rc = devm_nsio_enable(dev, nsio); if (rc) return rc; - rc = nvdimm_setup_pfn(nd_pfn, &dax_pmem->pgmap); + rc = nvdimm_setup_pfn(nd_pfn, &pgmap); if (rc) return rc; devm_nsio_disable(dev, nsio); - pfn_sb = nd_pfn->pfn_sb; - - if (!devm_request_mem_region(dev, nsio->res.start, - resource_size(&nsio->res), + /* reserve the metadata area, device-dax will reserve the data */ + pfn_sb = nd_pfn->pfn_sb; + offset = le64_to_cpu(pfn_sb->dataoff); + if (!devm_request_mem_region(dev, nsio->res.start, offset, dev_name(&ndns->dev))) { - dev_warn(dev, "could not reserve region %pR\n", &nsio->res); - return -EBUSY; - } - - dax_pmem->dev = dev; - init_completion(&dax_pmem->cmp); - rc = percpu_ref_init(&dax_pmem->ref, dax_pmem_percpu_release, 0, - GFP_KERNEL); - if (rc) - return rc; - - rc = devm_add_action(dev, dax_pmem_percpu_exit, &dax_pmem->ref); - if (rc) { - percpu_ref_exit(&dax_pmem->ref); - return rc; - } - - dax_pmem->pgmap.ref = &dax_pmem->ref; - dax_pmem->pgmap.kill = dax_pmem_percpu_kill; - addr = devm_memremap_pages(dev, &dax_pmem->pgmap); - if (IS_ERR(addr)) - return PTR_ERR(addr); - - /* adjust the dax_region resource to the start of data */ - memcpy(&res, &dax_pmem->pgmap.res, sizeof(res)); - res.start += le64_to_cpu(pfn_sb->dataoff); + dev_warn(dev, "could not reserve metadata\n"); + return -EBUSY; + } rc = sscanf(dev_name(&ndns->dev), "namespace%d.%d", ®ion_id, &id); if (rc != 2) return -EINVAL; + /* adjust the dax_region resource to the start of data */ + memcpy(&res, &pgmap.res, sizeof(res)); + res.start += offset; dax_region = alloc_dax_region(dev, region_id, &res, le32_to_cpu(pfn_sb->align), PFN_DEV|PFN_MAP); if (!dax_region) return -ENOMEM; - dev_dax = devm_create_dev_dax(dax_region, id); + dev_dax = devm_create_dev_dax(dax_region, id, &pgmap); /* child dev_dax instances now own the lifetime of the dax_region */ dax_region_put(dax_region); From d200781ef237a354d918ceff5cee350d88a93d42 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 7 Nov 2018 15:31:23 -0800 Subject: [PATCH 007/855] device-dax: Add support for a dax override driver Introduce the 'new_id' concept for enabling a custom device-driver attach policy for dax-bus drivers. The intended use is to have a mechanism for hot-plugging device-dax ranges into the page allocator on-demand. With this in place the default policy of using device-dax for performance differentiated memory can be overridden by user-space policy that can arrange for the memory range to be managed as 'System RAM' with user-defined NUMA and other performance attributes. Signed-off-by: Dan Williams --- drivers/dax/bus.c | 145 +++++++++++++++++++++++++++++++++++++++++-- drivers/dax/bus.h | 10 ++- drivers/dax/device.c | 11 ++-- 3 files changed, 156 insertions(+), 10 deletions(-) diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index 69aae2cbd45f..17af6fbc3be5 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -2,11 +2,21 @@ /* Copyright(c) 2017-2018 Intel Corporation. All rights reserved. 
*/ #include #include +#include +#include #include #include #include "dax-private.h" #include "bus.h" +static DEFINE_MUTEX(dax_bus_lock); + +#define DAX_NAME_LEN 30 +struct dax_id { + struct list_head list; + char dev_name[DAX_NAME_LEN]; +}; + static int dax_bus_uevent(struct device *dev, struct kobj_uevent_env *env) { /* @@ -16,22 +26,115 @@ static int dax_bus_uevent(struct device *dev, struct kobj_uevent_env *env) return add_uevent_var(env, "MODALIAS=" DAX_DEVICE_MODALIAS_FMT, 0); } +static struct dax_device_driver *to_dax_drv(struct device_driver *drv) +{ + return container_of(drv, struct dax_device_driver, drv); +} + +static struct dax_id *__dax_match_id(struct dax_device_driver *dax_drv, + const char *dev_name) +{ + struct dax_id *dax_id; + + lockdep_assert_held(&dax_bus_lock); + + list_for_each_entry(dax_id, &dax_drv->ids, list) + if (sysfs_streq(dax_id->dev_name, dev_name)) + return dax_id; + return NULL; +} + +static int dax_match_id(struct dax_device_driver *dax_drv, struct device *dev) +{ + int match; + + mutex_lock(&dax_bus_lock); + match = !!__dax_match_id(dax_drv, dev_name(dev)); + mutex_unlock(&dax_bus_lock); + + return match; +} + +static ssize_t do_id_store(struct device_driver *drv, const char *buf, + size_t count, bool add) +{ + struct dax_device_driver *dax_drv = to_dax_drv(drv); + unsigned int region_id, id; + char devname[DAX_NAME_LEN]; + struct dax_id *dax_id; + ssize_t rc = count; + int fields; + + fields = sscanf(buf, "dax%d.%d", ®ion_id, &id); + if (fields != 2) + return -EINVAL; + sprintf(devname, "dax%d.%d", region_id, id); + if (!sysfs_streq(buf, devname)) + return -EINVAL; + + mutex_lock(&dax_bus_lock); + dax_id = __dax_match_id(dax_drv, buf); + if (!dax_id) { + if (add) { + dax_id = kzalloc(sizeof(*dax_id), GFP_KERNEL); + if (dax_id) { + strncpy(dax_id->dev_name, buf, DAX_NAME_LEN); + list_add(&dax_id->list, &dax_drv->ids); + } else + rc = -ENOMEM; + } else + /* nothing to remove */; + } else if (!add) { + list_del(&dax_id->list); + kfree(dax_id); + } else + /* dax_id already added */; + mutex_unlock(&dax_bus_lock); + return rc; +} + +static ssize_t new_id_store(struct device_driver *drv, const char *buf, + size_t count) +{ + return do_id_store(drv, buf, count, true); +} +static DRIVER_ATTR_WO(new_id); + +static ssize_t remove_id_store(struct device_driver *drv, const char *buf, + size_t count) +{ + return do_id_store(drv, buf, count, false); +} +static DRIVER_ATTR_WO(remove_id); + +static struct attribute *dax_drv_attrs[] = { + &driver_attr_new_id.attr, + &driver_attr_remove_id.attr, + NULL, +}; +ATTRIBUTE_GROUPS(dax_drv); + static int dax_bus_match(struct device *dev, struct device_driver *drv); static struct bus_type dax_bus_type = { .name = "dax", .uevent = dax_bus_uevent, .match = dax_bus_match, + .drv_groups = dax_drv_groups, }; static int dax_bus_match(struct device *dev, struct device_driver *drv) { + struct dax_device_driver *dax_drv = to_dax_drv(drv); + /* - * The drivers that can register on the 'dax' bus are private to - * drivers/dax/ so any device and driver on the bus always - * match. + * All but the 'device-dax' driver, which has 'match_always' + * set, requires an exact id match. 
*/ - return 1; + if (dax_drv->match_always) + return 1; + + return dax_match_id(dax_drv, dev); } /* @@ -273,17 +376,49 @@ struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, int id, } EXPORT_SYMBOL_GPL(devm_create_dev_dax); -int __dax_driver_register(struct device_driver *drv, +static int match_always_count; + +int __dax_driver_register(struct dax_device_driver *dax_drv, struct module *module, const char *mod_name) { + struct device_driver *drv = &dax_drv->drv; + int rc = 0; + + INIT_LIST_HEAD(&dax_drv->ids); drv->owner = module; drv->name = mod_name; drv->mod_name = mod_name; drv->bus = &dax_bus_type; + + /* there can only be one default driver */ + mutex_lock(&dax_bus_lock); + match_always_count += dax_drv->match_always; + if (match_always_count > 1) { + match_always_count--; + WARN_ON(1); + rc = -EINVAL; + } + mutex_unlock(&dax_bus_lock); + if (rc) + return rc; return driver_register(drv); } EXPORT_SYMBOL_GPL(__dax_driver_register); +void dax_driver_unregister(struct dax_device_driver *dax_drv) +{ + struct dax_id *dax_id, *_id; + + mutex_lock(&dax_bus_lock); + match_always_count -= dax_drv->match_always; + list_for_each_entry_safe(dax_id, _id, &dax_drv->ids, list) { + list_del(&dax_id->list); + kfree(dax_id); + } + mutex_unlock(&dax_bus_lock); +} +EXPORT_SYMBOL_GPL(dax_driver_unregister); + int __init dax_bus_init(void) { return bus_register(&dax_bus_type); diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h index e08e0c394983..395ab812367c 100644 --- a/drivers/dax/bus.h +++ b/drivers/dax/bus.h @@ -12,10 +12,18 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id, struct resource *res, unsigned int align, unsigned long flags); struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, int id, struct dev_pagemap *pgmap); -int __dax_driver_register(struct device_driver *drv, + +struct dax_device_driver { + struct device_driver drv; + struct list_head ids; + int match_always; +}; + +int __dax_driver_register(struct dax_device_driver *dax_drv, struct module *module, const char *mod_name); #define dax_driver_register(driver) \ __dax_driver_register(driver, THIS_MODULE, KBUILD_MODNAME) +void dax_driver_unregister(struct dax_device_driver *dax_drv); void kill_dev_dax(struct dev_dax *dev_dax); /* diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 6ad964d7b077..ad3120395f7a 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -504,9 +504,12 @@ static int dev_dax_remove(struct device *dev) return 0; } -static struct device_driver device_dax_driver = { - .probe = dev_dax_probe, - .remove = dev_dax_remove, +static struct dax_device_driver device_dax_driver = { + .drv = { + .probe = dev_dax_probe, + .remove = dev_dax_remove, + }, + .match_always = 1, }; static int __init dax_init(void) @@ -516,7 +519,7 @@ static int __init dax_init(void) static void __exit dax_exit(void) { - driver_unregister(&device_dax_driver); + dax_driver_unregister(&device_dax_driver); } MODULE_AUTHOR("Intel Corporation"); From 730926c3b0998943654019f00296cf8e3b02277e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 16 Jul 2017 13:51:53 -0700 Subject: [PATCH 008/855] device-dax: Add /sys/class/dax backwards compatibility On the expectation that some environments may not upgrade libdaxctl (userspace component that depends on the /sys/class/dax hierarchy), provide a default / legacy dax_pmem_compat driver. The dax_pmem_compat driver implements the original /sys/class/dax sysfs layout rather than /sys/bus/dax. 
When userspace is upgraded it can blacklist this module and switch to the dax_pmem driver going forward. CONFIG_DEV_DAX_PMEM_COMPAT and supporting code will be deleted according to the dax_pmem entry in Documentation/ABI/obsolete/. Signed-off-by: Dan Williams --- Documentation/ABI/obsolete/sysfs-class-dax | 22 +++++++ drivers/dax/Kconfig | 12 +++- drivers/dax/Makefile | 4 +- drivers/dax/bus.c | 29 +++++++-- drivers/dax/bus.h | 26 +++++++- drivers/dax/device.c | 9 ++- drivers/dax/pmem/Makefile | 7 +++ drivers/dax/pmem/compat.c | 73 ++++++++++++++++++++++ drivers/dax/{pmem.c => pmem/core.c} | 51 +++++---------- drivers/dax/pmem/pmem.c | 40 ++++++++++++ tools/testing/nvdimm/Kbuild | 6 +- 11 files changed, 229 insertions(+), 50 deletions(-) create mode 100644 Documentation/ABI/obsolete/sysfs-class-dax create mode 100644 drivers/dax/pmem/Makefile create mode 100644 drivers/dax/pmem/compat.c rename drivers/dax/{pmem.c => pmem/core.c} (57%) create mode 100644 drivers/dax/pmem/pmem.c diff --git a/Documentation/ABI/obsolete/sysfs-class-dax b/Documentation/ABI/obsolete/sysfs-class-dax new file mode 100644 index 000000000000..2cb9fc5e8bd1 --- /dev/null +++ b/Documentation/ABI/obsolete/sysfs-class-dax @@ -0,0 +1,22 @@ +What: /sys/class/dax/ +Date: May, 2016 +KernelVersion: v4.7 +Contact: linux-nvdimm@lists.01.org +Description: Device DAX is the device-centric analogue of Filesystem + DAX (CONFIG_FS_DAX). It allows memory ranges to be + allocated and mapped without need of an intervening file + system. Device DAX is strict, precise and predictable. + Specifically this interface: + + 1/ Guarantees fault granularity with respect to a given + page size (pte, pmd, or pud) set at configuration time. + + 2/ Enforces deterministic behavior by being strict about + what fault scenarios are supported. + + The /sys/class/dax/ interface enumerates all the + device-dax instances in the system. The ABI is + deprecated and will be removed after 2020. It is + replaced with the DAX bus interface /sys/bus/dax/ where + device-dax instances can be found under + /sys/bus/dax/devices/ diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig index e0700bf4893a..6fc96f03920e 100644 --- a/drivers/dax/Kconfig +++ b/drivers/dax/Kconfig @@ -23,12 +23,22 @@ config DEV_DAX config DEV_DAX_PMEM tristate "PMEM DAX: direct access to persistent memory" depends on LIBNVDIMM && NVDIMM_DAX && DEV_DAX + depends on m # until we can kill DEV_DAX_PMEM_COMPAT default DEV_DAX help Support raw access to persistent memory. Note that this driver consumes memory ranges allocated and exported by the libnvdimm sub-system. - Say Y if unsure + Say M if unsure + +config DEV_DAX_PMEM_COMPAT + tristate "PMEM DAX: support the deprecated /sys/class/dax interface" + depends on DEV_DAX_PMEM + default DEV_DAX_PMEM + help + Older versions of the libdaxctl library expect to find all + device-dax instances under /sys/class/dax. If libdaxctl in + your distribution is older than v58 say M, otherwise say N. 
endif diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile index 658e6b9b1d74..233bbffccbe6 100644 --- a/drivers/dax/Makefile +++ b/drivers/dax/Makefile @@ -1,9 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_DAX) += dax.o obj-$(CONFIG_DEV_DAX) += device_dax.o -obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o dax-y := super.o dax-y += bus.o -dax_pmem-y := pmem.o device_dax-y := device.o + +obj-y += pmem/ diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index 17af6fbc3be5..568168500217 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -9,6 +9,8 @@ #include "dax-private.h" #include "bus.h" +static struct class *dax_class; + static DEFINE_MUTEX(dax_bus_lock); #define DAX_NAME_LEN 30 @@ -310,8 +312,8 @@ static void unregister_dev_dax(void *dev) put_device(dev); } -struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, int id, - struct dev_pagemap *pgmap) +struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id, + struct dev_pagemap *pgmap, enum dev_dax_subsys subsys) { struct device *parent = dax_region->dev; struct dax_device *dax_dev; @@ -350,7 +352,10 @@ struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, int id, inode = dax_inode(dax_dev); dev->devt = inode->i_rdev; - dev->bus = &dax_bus_type; + if (subsys == DEV_DAX_BUS) + dev->bus = &dax_bus_type; + else + dev->class = dax_class; dev->parent = parent; dev->groups = dax_attribute_groups; dev->release = dev_dax_release; @@ -374,7 +379,7 @@ struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, int id, return ERR_PTR(rc); } -EXPORT_SYMBOL_GPL(devm_create_dev_dax); +EXPORT_SYMBOL_GPL(__devm_create_dev_dax); static int match_always_count; @@ -407,6 +412,7 @@ EXPORT_SYMBOL_GPL(__dax_driver_register); void dax_driver_unregister(struct dax_device_driver *dax_drv) { + struct device_driver *drv = &dax_drv->drv; struct dax_id *dax_id, *_id; mutex_lock(&dax_bus_lock); @@ -416,15 +422,28 @@ void dax_driver_unregister(struct dax_device_driver *dax_drv) kfree(dax_id); } mutex_unlock(&dax_bus_lock); + driver_unregister(drv); } EXPORT_SYMBOL_GPL(dax_driver_unregister); int __init dax_bus_init(void) { - return bus_register(&dax_bus_type); + int rc; + + if (IS_ENABLED(CONFIG_DEV_DAX_PMEM_COMPAT)) { + dax_class = class_create(THIS_MODULE, "dax"); + if (IS_ERR(dax_class)) + return PTR_ERR(dax_class); + } + + rc = bus_register(&dax_bus_type); + if (rc) + class_destroy(dax_class); + return rc; } void __exit dax_bus_exit(void) { bus_unregister(&dax_bus_type); + class_destroy(dax_class); } diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h index 395ab812367c..ce977552ffb5 100644 --- a/drivers/dax/bus.h +++ b/drivers/dax/bus.h @@ -2,7 +2,8 @@ /* Copyright(c) 2016 - 2018 Intel Corporation. All rights reserved. 
*/ #ifndef __DAX_BUS_H__ #define __DAX_BUS_H__ -struct device; +#include + struct dev_dax; struct resource; struct dax_device; @@ -10,8 +11,23 @@ struct dax_region; void dax_region_put(struct dax_region *dax_region); struct dax_region *alloc_dax_region(struct device *parent, int region_id, struct resource *res, unsigned int align, unsigned long flags); -struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, int id, - struct dev_pagemap *pgmap); + +enum dev_dax_subsys { + DEV_DAX_BUS, + DEV_DAX_CLASS, +}; + +struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id, + struct dev_pagemap *pgmap, enum dev_dax_subsys subsys); + +static inline struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, + int id, struct dev_pagemap *pgmap) +{ + return __devm_create_dev_dax(dax_region, id, pgmap, DEV_DAX_BUS); +} + +/* to be deleted when DEV_DAX_CLASS is removed */ +struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys); struct dax_device_driver { struct device_driver drv; @@ -26,6 +42,10 @@ int __dax_driver_register(struct dax_device_driver *dax_drv, void dax_driver_unregister(struct dax_device_driver *dax_drv); void kill_dev_dax(struct dev_dax *dev_dax); +#if IS_ENABLED(CONFIG_DEV_DAX_PMEM_COMPAT) +int dev_dax_probe(struct device *dev); +#endif + /* * While run_dax() is potentially a generic operation that could be * defined in include/linux/dax.h we don't want to grow any users diff --git a/drivers/dax/device.c b/drivers/dax/device.c index ad3120395f7a..e428468ab661 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -445,7 +445,7 @@ static void dev_dax_kill(void *dev_dax) kill_dev_dax(dev_dax); } -static int dev_dax_probe(struct device *dev) +int dev_dax_probe(struct device *dev) { struct dev_dax *dev_dax = to_dev_dax(dev); struct dax_device *dax_dev = dev_dax->dax_dev; @@ -484,7 +484,11 @@ static int dev_dax_probe(struct device *dev) inode = dax_inode(dax_dev); cdev = inode->i_cdev; cdev_init(cdev, &dax_fops); - cdev->owner = dev->driver->owner; + if (dev->class) { + /* for the CONFIG_DEV_DAX_PMEM_COMPAT case */ + cdev->owner = dev->parent->driver->owner; + } else + cdev->owner = dev->driver->owner; cdev_set_parent(cdev, &dev->kobj); rc = cdev_add(cdev, dev->devt, 1); if (rc) @@ -497,6 +501,7 @@ static int dev_dax_probe(struct device *dev) run_dax(dax_dev); return devm_add_action_or_reset(dev, dev_dax_kill, dev_dax); } +EXPORT_SYMBOL_GPL(dev_dax_probe); static int dev_dax_remove(struct device *dev) { diff --git a/drivers/dax/pmem/Makefile b/drivers/dax/pmem/Makefile new file mode 100644 index 000000000000..e2e79bd3fdcf --- /dev/null +++ b/drivers/dax/pmem/Makefile @@ -0,0 +1,7 @@ +obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o +obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem_core.o +obj-$(CONFIG_DEV_DAX_PMEM_COMPAT) += dax_pmem_compat.o + +dax_pmem-y := pmem.o +dax_pmem_core-y := core.o +dax_pmem_compat-y := compat.o diff --git a/drivers/dax/pmem/compat.c b/drivers/dax/pmem/compat.c new file mode 100644 index 000000000000..d7b15e6f30c5 --- /dev/null +++ b/drivers/dax/pmem/compat.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016 - 2018 Intel Corporation. All rights reserved. 
*/ +#include +#include +#include +#include +#include +#include "../bus.h" + +/* we need the private definitions to implement compat suport */ +#include "../dax-private.h" + +static int dax_pmem_compat_probe(struct device *dev) +{ + struct dev_dax *dev_dax = __dax_pmem_probe(dev, DEV_DAX_CLASS); + int rc; + + if (IS_ERR(dev_dax)) + return PTR_ERR(dev_dax); + + if (!devres_open_group(&dev_dax->dev, dev_dax, GFP_KERNEL)) + return -ENOMEM; + + device_lock(&dev_dax->dev); + rc = dev_dax_probe(&dev_dax->dev); + device_unlock(&dev_dax->dev); + + devres_close_group(&dev_dax->dev, dev_dax); + if (rc) + devres_release_group(&dev_dax->dev, dev_dax); + + return rc; +} + +static int dax_pmem_compat_release(struct device *dev, void *data) +{ + device_lock(dev); + devres_release_group(dev, to_dev_dax(dev)); + device_unlock(dev); + + return 0; +} + +static int dax_pmem_compat_remove(struct device *dev) +{ + device_for_each_child(dev, NULL, dax_pmem_compat_release); + return 0; +} + +static struct nd_device_driver dax_pmem_compat_driver = { + .probe = dax_pmem_compat_probe, + .remove = dax_pmem_compat_remove, + .drv = { + .name = "dax_pmem_compat", + }, + .type = ND_DRIVER_DAX_PMEM, +}; + +static int __init dax_pmem_compat_init(void) +{ + return nd_driver_register(&dax_pmem_compat_driver); +} +module_init(dax_pmem_compat_init); + +static void __exit dax_pmem_compat_exit(void) +{ + driver_unregister(&dax_pmem_compat_driver.drv); +} +module_exit(dax_pmem_compat_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Intel Corporation"); +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_DAX_PMEM); diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem/core.c similarity index 57% rename from drivers/dax/pmem.c rename to drivers/dax/pmem/core.c index d3cefa7868ac..bdcff1b14e95 100644 --- a/drivers/dax/pmem.c +++ b/drivers/dax/pmem/core.c @@ -1,24 +1,13 @@ -/* - * Copyright(c) 2016 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ -#include +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016 - 2018 Intel Corporation. All rights reserved. 
*/ #include #include #include -#include "../nvdimm/pfn.h" -#include "../nvdimm/nd.h" -#include "bus.h" +#include "../../nvdimm/pfn.h" +#include "../../nvdimm/nd.h" +#include "../bus.h" -static int dax_pmem_probe(struct device *dev) +struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys) { struct resource res; int rc, id, region_id; @@ -34,16 +23,16 @@ static int dax_pmem_probe(struct device *dev) ndns = nvdimm_namespace_common_probe(dev); if (IS_ERR(ndns)) - return PTR_ERR(ndns); + return ERR_CAST(ndns); nsio = to_nd_namespace_io(&ndns->dev); /* parse the 'pfn' info block via ->rw_bytes */ rc = devm_nsio_enable(dev, nsio); if (rc) - return rc; + return ERR_PTR(rc); rc = nvdimm_setup_pfn(nd_pfn, &pgmap); if (rc) - return rc; + return ERR_PTR(rc); devm_nsio_disable(dev, nsio); /* reserve the metadata area, device-dax will reserve the data */ @@ -52,12 +41,12 @@ static int dax_pmem_probe(struct device *dev) if (!devm_request_mem_region(dev, nsio->res.start, offset, dev_name(&ndns->dev))) { dev_warn(dev, "could not reserve metadata\n"); - return -EBUSY; + return ERR_PTR(-EBUSY); } rc = sscanf(dev_name(&ndns->dev), "namespace%d.%d", ®ion_id, &id); if (rc != 2) - return -EINVAL; + return ERR_PTR(-EINVAL); /* adjust the dax_region resource to the start of data */ memcpy(&res, &pgmap.res, sizeof(res)); @@ -65,26 +54,16 @@ static int dax_pmem_probe(struct device *dev) dax_region = alloc_dax_region(dev, region_id, &res, le32_to_cpu(pfn_sb->align), PFN_DEV|PFN_MAP); if (!dax_region) - return -ENOMEM; + return ERR_PTR(-ENOMEM); - dev_dax = devm_create_dev_dax(dax_region, id, &pgmap); + dev_dax = __devm_create_dev_dax(dax_region, id, &pgmap, subsys); /* child dev_dax instances now own the lifetime of the dax_region */ dax_region_put(dax_region); - return PTR_ERR_OR_ZERO(dev_dax); + return dev_dax; } - -static struct nd_device_driver dax_pmem_driver = { - .probe = dax_pmem_probe, - .drv = { - .name = "dax_pmem", - }, - .type = ND_DRIVER_DAX_PMEM, -}; - -module_nd_driver(dax_pmem_driver); +EXPORT_SYMBOL_GPL(__dax_pmem_probe); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Intel Corporation"); -MODULE_ALIAS_ND_DEVICE(ND_DEVICE_DAX_PMEM); diff --git a/drivers/dax/pmem/pmem.c b/drivers/dax/pmem/pmem.c new file mode 100644 index 000000000000..0ae4238a0ef8 --- /dev/null +++ b/drivers/dax/pmem/pmem.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016 - 2018 Intel Corporation. All rights reserved. 
*/ +#include +#include +#include +#include +#include +#include "../bus.h" + +static int dax_pmem_probe(struct device *dev) +{ + return PTR_ERR_OR_ZERO(__dax_pmem_probe(dev, DEV_DAX_BUS)); +} + +static struct nd_device_driver dax_pmem_driver = { + .probe = dax_pmem_probe, + .drv = { + .name = "dax_pmem", + }, + .type = ND_DRIVER_DAX_PMEM, +}; + +static int __init dax_pmem_init(void) +{ + return nd_driver_register(&dax_pmem_driver); +} +module_init(dax_pmem_init); + +static void __exit dax_pmem_exit(void) +{ + driver_unregister(&dax_pmem_driver.drv); +} +module_exit(dax_pmem_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Intel Corporation"); +#if !IS_ENABLED(CONFIG_DEV_DAX_PMEM_COMPAT) +/* For compat builds, don't load this module by default */ +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_DAX_PMEM); +#endif diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild index c9b500a652d5..e1286d2cdfbf 100644 --- a/tools/testing/nvdimm/Kbuild +++ b/tools/testing/nvdimm/Kbuild @@ -35,6 +35,8 @@ obj-$(CONFIG_DAX) += dax.o endif obj-$(CONFIG_DEV_DAX) += device_dax.o obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o +obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem_core.o +obj-$(CONFIG_DEV_DAX_PMEM_COMPAT) += dax_pmem_compat.o nfit-y := $(ACPI_SRC)/core.o nfit-y += $(ACPI_SRC)/intel.o @@ -65,7 +67,9 @@ device_dax-y += dax-dev.o device_dax-y += device_dax_test.o device_dax-y += config_check.o -dax_pmem-y := $(DAX_SRC)/pmem.o +dax_pmem-y := $(DAX_SRC)/pmem/pmem.o +dax_pmem_core-y := $(DAX_SRC)/pmem/core.o +dax_pmem_compat-y := $(DAX_SRC)/pmem/compat.o dax_pmem-y += config_check.o libnvdimm-y := $(NVDIMM_SRC)/core.o From 8fc5c73554db0ac18c0c6ac5b2099ab917f83bdf Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 9 Nov 2018 12:43:07 -0800 Subject: [PATCH 009/855] acpi/nfit, device-dax: Identify differentiated memory with a unique numa-node MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Persistent memory, as described by the ACPI NFIT (NVDIMM Firmware Interface Table), is the first known instance of a memory range described by a unique "target" proximity domain. Where "initiator" and "target" proximity domains is an approach that the ACPI HMAT (Heterogeneous Memory Attributes Table) uses to described the unique performance properties of a memory range relative to a given initiator (e.g. CPU or DMA device). Currently the numa-node for a /dev/pmemX block-device or /dev/daxX.Y char-device follows the traditional notion of 'numa-node' where the attribute conveys the closest online numa-node. That numa-node attribute is useful for cpu-binding and memory-binding processes *near* the device. However, when the memory range backing a 'pmem', or 'dax' device is onlined (memory hot-add) the memory-only-numa-node representing that address needs to be differentiated from the set of online nodes. In other words, the numa-node association of the device depends on whether you can bind processes *near* the cpu-numa-node in the offline device-case, or bind process *on* the memory-range directly after the backing address range is onlined. Allow for the case that platform firmware describes persistent memory with a unique proximity domain, i.e. when it is distinct from the proximity of DRAM and CPUs that are on the same socket. Plumb the Linux numa-node translation of that proximity through the libnvdimm region device to namespaces that are in device-dax mode. 
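For illustration only, and not part of this patch: a sketch of how a downstream consumer (for example the kmem driver referenced below) might use the plumbed value. The function name and the fallback policy are assumptions; only dev_dax->target_node itself comes from this series.

/* hypothetical consumer: online a device-dax range under its target node */
static int example_online_dev_dax(struct dev_dax *dev_dax)
{
	struct resource *res = &dev_dax->pgmap.res;
	int nid = dev_dax->target_node;

	/* fall back to the traditional closest-online-node notion */
	if (nid == NUMA_NO_NODE)
		nid = dev_to_node(&dev_dax->dev);

	/* hand the range to the core-mm under the differentiated node */
	return add_memory(nid, res->start, resource_size(res));
}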
With this in place the proposed kmem driver [1] can optionally discover a unique numa-node number for the address range as it transitions the memory from an offline state managed by a device-driver to an online memory range managed by the core-mm. [1]: https://lore.kernel.org/lkml/20181022201317.8558C1D8@viggo.jf.intel.com Reported-by: Fan Du Cc: Michael Ellerman Cc: "Oliver O'Halloran" Cc: Dave Hansen Cc: Jérôme Glisse Reviewed-by: Yang Shi Signed-off-by: Dan Williams --- arch/powerpc/platforms/pseries/papr_scm.c | 1 + drivers/acpi/nfit/core.c | 8 ++++++-- drivers/acpi/numa.c | 1 + drivers/dax/bus.c | 4 +++- drivers/dax/bus.h | 3 ++- drivers/dax/dax-private.h | 4 ++++ drivers/dax/pmem/core.c | 4 +++- drivers/nvdimm/e820.c | 1 + drivers/nvdimm/nd.h | 2 +- drivers/nvdimm/of_pmem.c | 1 + drivers/nvdimm/region_devs.c | 1 + include/linux/acpi.h | 5 +++++ include/linux/libnvdimm.h | 1 + 13 files changed, 30 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index 7d6457ab5d34..8806ac822627 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -236,6 +236,7 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p) memset(&ndr_desc, 0, sizeof(ndr_desc)); ndr_desc.attr_groups = region_attr_groups; ndr_desc.numa_node = dev_to_node(&p->pdev->dev); + ndr_desc.target_node = ndr_desc.numa_node; ndr_desc.res = &p->res; ndr_desc.of_node = p->dn; ndr_desc.provider_data = p; diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index 011d3db19c80..475899974c70 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -2869,11 +2869,15 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, ndr_desc->res = &res; ndr_desc->provider_data = nfit_spa; ndr_desc->attr_groups = acpi_nfit_region_attribute_groups; - if (spa->flags & ACPI_NFIT_PROXIMITY_VALID) + if (spa->flags & ACPI_NFIT_PROXIMITY_VALID) { ndr_desc->numa_node = acpi_map_pxm_to_online_node( spa->proximity_domain); - else + ndr_desc->target_node = acpi_map_pxm_to_node( + spa->proximity_domain); + } else { ndr_desc->numa_node = NUMA_NO_NODE; + ndr_desc->target_node = NUMA_NO_NODE; + } /* * Persistence domain bits are hierarchical, if diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 274699463b4f..b9d86babb13a 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -84,6 +84,7 @@ int acpi_map_pxm_to_node(int pxm) return node; } +EXPORT_SYMBOL(acpi_map_pxm_to_node); /** * acpi_map_pxm_to_online_node - Map proximity ID to online node diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index 568168500217..c620ad52d7e5 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -214,7 +214,7 @@ static void dax_region_unregister(void *region) } struct dax_region *alloc_dax_region(struct device *parent, int region_id, - struct resource *res, unsigned int align, + struct resource *res, int target_node, unsigned int align, unsigned long pfn_flags) { struct dax_region *dax_region; @@ -244,6 +244,7 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id, dax_region->id = region_id; dax_region->align = align; dax_region->dev = parent; + dax_region->target_node = target_node; if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) { kfree(dax_region); return NULL; @@ -348,6 +349,7 @@ struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id, dev_dax->dax_dev = dax_dev; dev_dax->region = dax_region; + dev_dax->target_node = 
dax_region->target_node; kref_get(&dax_region->kref); inode = dax_inode(dax_dev); diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h index ce977552ffb5..8619e3299943 100644 --- a/drivers/dax/bus.h +++ b/drivers/dax/bus.h @@ -10,7 +10,8 @@ struct dax_device; struct dax_region; void dax_region_put(struct dax_region *dax_region); struct dax_region *alloc_dax_region(struct device *parent, int region_id, - struct resource *res, unsigned int align, unsigned long flags); + struct resource *res, int target_node, unsigned int align, + unsigned long flags); enum dev_dax_subsys { DEV_DAX_BUS, diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h index a82ce48f5884..a45612148ca0 100644 --- a/drivers/dax/dax-private.h +++ b/drivers/dax/dax-private.h @@ -26,6 +26,7 @@ void dax_bus_exit(void); /** * struct dax_region - mapping infrastructure for dax devices * @id: kernel-wide unique region for a memory range + * @target_node: effective numa node if this memory range is onlined * @kref: to pin while other agents have a need to do lookups * @dev: parent device backing this region * @align: allocation and mapping alignment for child dax devices @@ -34,6 +35,7 @@ void dax_bus_exit(void); */ struct dax_region { int id; + int target_node; struct kref kref; struct device *dev; unsigned int align; @@ -46,6 +48,7 @@ struct dax_region { * data while the device is activated in the driver. * @region - parent region * @dax_dev - core dax functionality + * @target_node: effective numa node if dev_dax memory range is onlined * @dev - device core * @pgmap - pgmap for memmap setup / lifetime (driver owned) * @ref: pgmap reference count (driver owned) @@ -54,6 +57,7 @@ struct dax_region { struct dev_dax { struct dax_region *region; struct dax_device *dax_dev; + int target_node; struct device dev; struct dev_pagemap pgmap; struct percpu_ref ref; diff --git a/drivers/dax/pmem/core.c b/drivers/dax/pmem/core.c index bdcff1b14e95..f71019ce0647 100644 --- a/drivers/dax/pmem/core.c +++ b/drivers/dax/pmem/core.c @@ -20,6 +20,7 @@ struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys) struct nd_namespace_common *ndns; struct nd_dax *nd_dax = to_nd_dax(dev); struct nd_pfn *nd_pfn = &nd_dax->nd_pfn; + struct nd_region *nd_region = to_nd_region(dev->parent); ndns = nvdimm_namespace_common_probe(dev); if (IS_ERR(ndns)) @@ -52,7 +53,8 @@ struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys) memcpy(&res, &pgmap.res, sizeof(res)); res.start += offset; dax_region = alloc_dax_region(dev, region_id, &res, - le32_to_cpu(pfn_sb->align), PFN_DEV|PFN_MAP); + nd_region->target_node, le32_to_cpu(pfn_sb->align), + PFN_DEV|PFN_MAP); if (!dax_region) return ERR_PTR(-ENOMEM); diff --git a/drivers/nvdimm/e820.c b/drivers/nvdimm/e820.c index 521eaf53a52a..36be9b619187 100644 --- a/drivers/nvdimm/e820.c +++ b/drivers/nvdimm/e820.c @@ -47,6 +47,7 @@ static int e820_register_one(struct resource *res, void *data) ndr_desc.res = res; ndr_desc.attr_groups = e820_pmem_region_attribute_groups; ndr_desc.numa_node = e820_range_to_nid(res->start); + ndr_desc.target_node = ndr_desc.numa_node; set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags); if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc)) return -ENXIO; diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index cfde992684e7..0b3d7595b3cb 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -153,7 +153,7 @@ struct nd_region { u16 ndr_mappings; u64 ndr_size; u64 ndr_start; - int id, num_lanes, ro, numa_node; + int id, num_lanes, ro, 
numa_node, target_node; void *provider_data; struct kernfs_node *bb_state; struct badblocks bb; diff --git a/drivers/nvdimm/of_pmem.c b/drivers/nvdimm/of_pmem.c index 0a701837dfc0..ecaaa27438e2 100644 --- a/drivers/nvdimm/of_pmem.c +++ b/drivers/nvdimm/of_pmem.c @@ -68,6 +68,7 @@ static int of_pmem_region_probe(struct platform_device *pdev) memset(&ndr_desc, 0, sizeof(ndr_desc)); ndr_desc.attr_groups = region_attr_groups; ndr_desc.numa_node = dev_to_node(&pdev->dev); + ndr_desc.target_node = ndr_desc.numa_node; ndr_desc.res = &pdev->resource[i]; ndr_desc.of_node = np; set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags); diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index e2818f94f292..caf2f3129ccd 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -1065,6 +1065,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, nd_region->flags = ndr_desc->flags; nd_region->ro = ro; nd_region->numa_node = ndr_desc->numa_node; + nd_region->target_node = ndr_desc->target_node; ida_init(&nd_region->ns_ida); ida_init(&nd_region->btt_ida); ida_init(&nd_region->pfn_ida); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 87715f20b69a..eddf2736e5a6 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -400,12 +400,17 @@ extern bool acpi_osi_is_win8(void); #ifdef CONFIG_ACPI_NUMA int acpi_map_pxm_to_online_node(int pxm); +int acpi_map_pxm_to_node(int pxm); int acpi_get_node(acpi_handle handle); #else static inline int acpi_map_pxm_to_online_node(int pxm) { return 0; } +static inline int acpi_map_pxm_to_node(int pxm) +{ + return 0; +} static inline int acpi_get_node(acpi_handle handle) { return 0; diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 5440f11b0907..56bc545ad3b2 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -128,6 +128,7 @@ struct nd_region_desc { void *provider_data; int num_lanes; int numa_node; + int target_node; unsigned long flags; struct device_node *of_node; }; From f5fd9fd4000984f19db689282054953981a50534 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 13 Jul 2018 18:00:18 +0300 Subject: [PATCH 010/855] drm/etnaviv: fix some off by one bugs The ->nr_signal is the supposed to be the number of elements in the ->signal array. There was one place where it was 5 but it was supposed to be 4. That looks like a copy and paste bug. There were also two checks that were off by one. 
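For reference, the rule both fixes apply, shown as a standalone sketch with made-up names: an index into an array of nr elements is valid only while it is strictly less than nr, so the rejection test must use '>='.

static const int vals[4] = { 10, 20, 30, 40 };

static int lookup(unsigned int idx)
{
	if (idx >= ARRAY_SIZE(vals))	/* '>' would let idx == 4 slip through */
		return -EINVAL;
	return vals[idx];
}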
Fixes: 9e2c2e273012 ("drm/etnaviv: add infrastructure to query perf counter") Signed-off-by: Dan Carpenter Reviewed-by: Christian Gmeiner Tested-by: Christian Gmeiner Signed-off-by: Lucas Stach --- drivers/gpu/drm/etnaviv/etnaviv_perfmon.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_perfmon.c b/drivers/gpu/drm/etnaviv/etnaviv_perfmon.c index 9980d81a26e3..4227a4006c34 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_perfmon.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_perfmon.c @@ -113,7 +113,7 @@ static const struct etnaviv_pm_domain doms_3d[] = { .name = "PE", .profile_read = VIVS_MC_PROFILE_PE_READ, .profile_config = VIVS_MC_PROFILE_CONFIG0, - .nr_signals = 5, + .nr_signals = 4, .signal = (const struct etnaviv_pm_signal[]) { { "PIXEL_COUNT_KILLED_BY_COLOR_PIPE", @@ -435,7 +435,7 @@ int etnaviv_pm_query_sig(struct etnaviv_gpu *gpu, dom = meta->domains + signal->domain; - if (signal->iter > dom->nr_signals) + if (signal->iter >= dom->nr_signals) return -EINVAL; sig = &dom->signal[signal->iter]; @@ -461,7 +461,7 @@ int etnaviv_pm_req_validate(const struct drm_etnaviv_gem_submit_pmr *r, dom = meta->domains + r->domain; - if (r->signal > dom->nr_signals) + if (r->signal >= dom->nr_signals) return -EINVAL; return 0; From 72ac64b84b2ecb5c980354a189018ede14d61035 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Fri, 23 Nov 2018 16:26:04 +0100 Subject: [PATCH 011/855] drm/etnaviv: move job context pointer to etnaviv_gem_submit The context isn't really related to the cmdbuf, but is a property of the job. This has been missed when moving to a properly refcounted etnaviv_gem_submit. Signed-off-by: Lucas Stach Reviewed-by: Christian Gmeiner --- drivers/gpu/drm/etnaviv/etnaviv_cmdbuf.h | 2 -- drivers/gpu/drm/etnaviv/etnaviv_gem.h | 1 + drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c | 2 +- drivers/gpu/drm/etnaviv/etnaviv_sched.c | 2 +- 4 files changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_cmdbuf.h b/drivers/gpu/drm/etnaviv/etnaviv_cmdbuf.h index acb68c698363..4d5d1a77eb2a 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_cmdbuf.h +++ b/drivers/gpu/drm/etnaviv/etnaviv_cmdbuf.h @@ -15,8 +15,6 @@ struct etnaviv_perfmon_request; struct etnaviv_cmdbuf { /* suballocator this cmdbuf is allocated from */ struct etnaviv_cmdbuf_suballoc *suballoc; - /* user context key, must be unique between all active users */ - struct etnaviv_file_private *ctx; /* cmdbuf properties */ int suballoc_offset; void *vaddr; diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.h b/drivers/gpu/drm/etnaviv/etnaviv_gem.h index 76079c2291f8..f0abb744ef95 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.h +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.h @@ -95,6 +95,7 @@ struct etnaviv_gem_submit_bo { struct etnaviv_gem_submit { struct drm_sched_job sched_job; struct kref refcount; + struct etnaviv_file_private *ctx; struct etnaviv_gpu *gpu; struct dma_fence *out_fence, *in_fence; int out_fence_id; diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c index 30875f8f2933..b2fe3446bfbc 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c @@ -506,7 +506,7 @@ int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data, if (ret) goto err_submit_objects; - submit->cmdbuf.ctx = file->driver_priv; + submit->ctx = file->driver_priv; submit->exec_state = args->exec_state; submit->flags = args->flags; diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c 
b/drivers/gpu/drm/etnaviv/etnaviv_sched.c index 49a6763693f1..7e4f550dde5b 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c @@ -150,7 +150,7 @@ int etnaviv_sched_push_job(struct drm_sched_entity *sched_entity, mutex_lock(&submit->gpu->fence_lock); ret = drm_sched_job_init(&submit->sched_job, sched_entity, - submit->cmdbuf.ctx); + submit->ctx); if (ret) goto out_unlock; From 31867b23d7d1ee3535136c6a410a6cf56f666bfc Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 28 Dec 2018 11:00:38 -0800 Subject: [PATCH 012/855] f2fs: wait on atomic writes to count F2FS_CP_WB_DATA Otherwise, we can get wrong counts incurring checkpoint hang. IO_W (CP: -24, Data: 24, Flush: ( 0 0 1), Discard: ( 0 0)) Thread A Thread B - f2fs_write_data_pages - __write_data_page - f2fs_submit_page_write - inc_page_count(F2FS_WB_DATA) type is F2FS_WB_DATA due to file is non-atomic one - f2fs_ioc_start_atomic_write - set_inode_flag(FI_ATOMIC_FILE) - f2fs_write_end_io - dec_page_count(F2FS_WB_CP_DATA) type is F2FS_WB_DATA due to file becomes atomic one Cc: Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index bba56b39dcc5..ae2b45e75847 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1750,10 +1750,12 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - if (!get_dirty_pages(inode)) - goto skip_flush; - - f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, + /* + * Should wait end_io to count F2FS_WB_CP_DATA correctly by + * f2fs_is_atomic_file. + */ + if (get_dirty_pages(inode)) + f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); @@ -1761,7 +1763,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); goto out; } -skip_flush: + set_inode_flag(inode, FI_ATOMIC_FILE); clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); From 7c77bf7de1574ac7a31a2b76f4927404307d13e7 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 1 Jan 2019 00:11:30 -0800 Subject: [PATCH 013/855] f2fs: don't access node/meta inode mapping after iput This fixes wrong access of address spaces of node and meta inodes after iput. 
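A minimal sketch of the hazard being closed, using the field names from the hunks below: after the final iput() the inode may be freed, so the cached pointer has to be cleared at put time and tested before the mapping is dereferenced.

	iput(sbi->node_inode);
	sbi->node_inode = NULL;		/* invalidate the cached pointer */

	/* ... later, e.g. in the stats code ... */
	if (sbi->node_inode)
		si->node_pages = NODE_MAPPING(sbi)->nrpages;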
Fixes: 60aa4d5536ab ("f2fs: fix use-after-free issue when accessing sbi->stat_info") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 19 ++++++++++++------- fs/f2fs/super.c | 5 +++++ 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index ebcc121920ba..503fde8349e6 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -96,8 +96,10 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->free_secs = free_sections(sbi); si->prefree_count = prefree_segments(sbi); si->dirty_count = dirty_segments(sbi); - si->node_pages = NODE_MAPPING(sbi)->nrpages; - si->meta_pages = META_MAPPING(sbi)->nrpages; + if (sbi->node_inode) + si->node_pages = NODE_MAPPING(sbi)->nrpages; + if (sbi->meta_inode) + si->meta_pages = META_MAPPING(sbi)->nrpages; si->nats = NM_I(sbi)->nat_cnt; si->dirty_nats = NM_I(sbi)->dirty_nat_cnt; si->sits = MAIN_SEGS(sbi); @@ -175,7 +177,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi) static void update_mem_info(struct f2fs_sb_info *sbi) { struct f2fs_stat_info *si = F2FS_STAT(sbi); - unsigned npages; int i; if (si->base_mem) @@ -258,10 +259,14 @@ get_cache: sizeof(struct extent_node); si->page_mem = 0; - npages = NODE_MAPPING(sbi)->nrpages; - si->page_mem += (unsigned long long)npages << PAGE_SHIFT; - npages = META_MAPPING(sbi)->nrpages; - si->page_mem += (unsigned long long)npages << PAGE_SHIFT; + if (sbi->node_inode) { + unsigned npages = NODE_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; + } + if (sbi->meta_inode) { + unsigned npages = META_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; + } } static int stat_show(struct seq_file *s, void *v) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index c46a1d4318d4..14f033e1ab42 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1075,7 +1075,10 @@ static void f2fs_put_super(struct super_block *sb) f2fs_bug_on(sbi, sbi->fsync_node_num); iput(sbi->node_inode); + sbi->node_inode = NULL; + iput(sbi->meta_inode); + sbi->meta_inode = NULL; /* * iput() can update stat information, if f2fs_write_checkpoint() @@ -3410,6 +3413,7 @@ free_node_inode: f2fs_release_ino_entry(sbi, true); truncate_inode_pages_final(NODE_MAPPING(sbi)); iput(sbi->node_inode); + sbi->node_inode = NULL; free_stats: f2fs_destroy_stats(sbi); free_nm: @@ -3422,6 +3426,7 @@ free_devices: free_meta_inode: make_bad_inode(sbi->meta_inode); iput(sbi->meta_inode); + sbi->meta_inode = NULL; free_io_dummy: mempool_destroy(sbi->write_io_dummy); free_percpu: From f365c6cc85b1d2348d73bf327258874fcc7ac161 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Tue, 1 Jan 2019 21:33:11 +0800 Subject: [PATCH 014/855] f2fs: change error code to -ENOMEM from -EINVAL The error case of failing allocating memory should return -ENOMEM. 
Signed-off-by: Chengguang Xu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 14f033e1ab42..bc32a1035f65 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -269,7 +269,7 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype, if (!qname) { f2fs_msg(sb, KERN_ERR, "Not enough memory for storing quotafile name"); - return -EINVAL; + return -ENOMEM; } if (F2FS_OPTION(sbi).s_qf_names[qtype]) { if (strcmp(F2FS_OPTION(sbi).s_qf_names[qtype], qname) == 0) From 539cd352239f6acae4a7997bf31edca5c9a69c81 Mon Sep 17 00:00:00 2001 From: Zhikang Zhang Date: Thu, 3 Jan 2019 20:06:38 +0800 Subject: [PATCH 015/855] f2fs: fix compile warnings: 'struct *' declared inside parameter list We meet these compile warnings below, which caused by missing declare structs: struct f2fs_io_info, struct extent, struct f2fs_sb_info. warning: 'struct f2fs_io_info' declared inside parameter list warning: 'struct extent_info' declared inside parameter list warning: 'struct f2fs_sb_info' declared inside parameter list Signed-off-by: Zhikang Zhang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/trace/events/f2fs.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 3ec73f17ee2a..8a28a2b1be74 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -149,6 +149,9 @@ TRACE_DEFINE_ENUM(CP_TRIMMED); { CP_SPEC_LOG_NUM, "log type is 2" }, \ { CP_RECOVER_DIR, "dir needs recovery" }) +struct f2fs_sb_info; +struct f2fs_io_info; +struct extent_info; struct victim_sel_policy; struct f2fs_map_blocks; From 8e11403876b1e29d290ea42bb20bf45ef927ef6e Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 4 Jan 2019 01:38:29 +0000 Subject: [PATCH 016/855] f2fs: remove set but not used variable 'err' Fixes gcc '-Wunused-but-set-variable' warning: fs/f2fs/data.c: In function 'f2fs_dio_submit_bio': fs/f2fs/data.c:2585:6: warning: variable 'err' set but not used [-Wunused-but-set-variable] Signed-off-by: YueHaibing Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f91d8630c9a2..4e402add2e60 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2582,14 +2582,11 @@ static void f2fs_dio_submit_bio(struct bio *bio, struct inode *inode, { struct f2fs_private_dio *dio; bool write = (bio_op(bio) == REQ_OP_WRITE); - int err; dio = f2fs_kzalloc(F2FS_I_SB(inode), sizeof(struct f2fs_private_dio), GFP_NOFS); - if (!dio) { - err = -ENOMEM; + if (!dio) goto out; - } dio->inode = inode; dio->orig_end_io = bio->bi_end_io; From 36c5733f9570e9052a6437012be8034085f25ab7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 4 Jan 2019 17:39:53 +0800 Subject: [PATCH 017/855] f2fs: check inject_rate validity during configuring Type of inject_rate is unsigned int, let's check new value's validity during configuring. 
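As an illustration of the kind of range check being added (variable names are assumptions, not the exact hunk): the sysfs store path parses into an unsigned long, so a value that does not fit the unsigned int field has to be rejected before the assignment silently truncates it.

	unsigned long t;
	int ret;

	ret = kstrtoul(buf, 0, &t);
	if (ret)
		return ret;
	if (t > UINT_MAX)		/* would be truncated by the assignment below */
		return -EINVAL;
	ffi->inject_rate = (unsigned int)t;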
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 0575edbe3ed6..02d4012a9183 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -222,6 +222,8 @@ out: #ifdef CONFIG_F2FS_FAULT_INJECTION if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) return -EINVAL; + if (a->struct_type == FAULT_INFO_RATE && t >= UINT_MAX) + return -EINVAL; #endif if (a->struct_type == RESERVED_BLOCKS) { spin_lock(&sbi->stat_lock); From 5d539245cb18afa8943cbf2be0cdc033e49f3a4a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 3 Jan 2019 17:19:08 -0800 Subject: [PATCH 018/855] f2fs: export FS_NOCOW_FL flag to user This exports pin_file status to user. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 ++- fs/f2fs/file.c | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 12fabd6735dd..9286ec381453 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2301,11 +2301,12 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) #define F2FS_EXTENTS_FL 0x00080000 /* Inode uses extents */ #define F2FS_EA_INODE_FL 0x00200000 /* Inode used for large EA */ #define F2FS_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ +#define F2FS_NOCOW_FL 0x00800000 /* Do not cow file */ #define F2FS_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ #define F2FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define F2FS_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ -#define F2FS_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */ +#define F2FS_FL_USER_VISIBLE 0x30CBDFFF /* User visible flags */ #define F2FS_FL_USER_MODIFIABLE 0x204BC0FF /* User modifiable flags */ /* Flags we can manipulate with through F2FS_IOC_FSSETXATTR */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ae2b45e75847..0d461321edfc 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1651,6 +1651,8 @@ static int f2fs_ioc_getflags(struct file *filp, unsigned long arg) flags |= F2FS_ENCRYPT_FL; if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) flags |= F2FS_INLINE_DATA_FL; + if (is_inode_flag_set(inode, FI_PIN_FILE)) + flags |= F2FS_NOCOW_FL; flags &= F2FS_FL_USER_VISIBLE; From c20e57b32d817d35271c4b205cb6ab80d8d93aeb Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 4 Jan 2019 14:26:18 +0100 Subject: [PATCH 019/855] f2fs: no need to check return value of debugfs_create functions When calling debugfs functions, there is no need to ever check the return value. The function can work or not, but the code logic should never do something different based on this. 
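For context, a sketch of the calling convention this relies on (simplified from the hunks below): the debugfs creation helpers may be called without checking their result, and handing a failed parent to a later call is safe, so the filesystem behaves the same whether or not debugfs is available.

	struct dentry *root;

	root = debugfs_create_dir("f2fs", NULL);
	/* no error check: a NULL or error-pointer parent is handled internally */
	debugfs_create_file("status", S_IRUGO, root, NULL, &stat_fops);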
Cc: Jaegeuk Kim Cc: Chao Yu Cc: linux-f2fs-devel@lists.sourceforge.net Signed-off-by: Greg Kroah-Hartman Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 20 +++----------------- fs/f2fs/f2fs.h | 4 ++-- fs/f2fs/super.c | 5 +---- 3 files changed, 6 insertions(+), 23 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 503fde8349e6..99e9a5c37b71 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -511,30 +511,16 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) kvfree(si); } -int __init f2fs_create_root_stats(void) +void __init f2fs_create_root_stats(void) { - struct dentry *file; - f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL); - if (!f2fs_debugfs_root) - return -ENOMEM; - file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, - NULL, &stat_fops); - if (!file) { - debugfs_remove(f2fs_debugfs_root); - f2fs_debugfs_root = NULL; - return -ENOMEM; - } - - return 0; + debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL, + &stat_fops); } void f2fs_destroy_root_stats(void) { - if (!f2fs_debugfs_root) - return; - debugfs_remove_recursive(f2fs_debugfs_root); f2fs_debugfs_root = NULL; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9286ec381453..7df41cd1eb35 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3329,7 +3329,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) int f2fs_build_stats(struct f2fs_sb_info *sbi); void f2fs_destroy_stats(struct f2fs_sb_info *sbi); -int __init f2fs_create_root_stats(void); +void __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else #define stat_inc_cp_count(si) do { } while (0) @@ -3367,7 +3367,7 @@ void f2fs_destroy_root_stats(void); static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } -static inline int __init f2fs_create_root_stats(void) { return 0; } +static inline void __init f2fs_create_root_stats(void) { } static inline void f2fs_destroy_root_stats(void) { } #endif diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index bc32a1035f65..ea514acede36 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3550,9 +3550,7 @@ static int __init init_f2fs_fs(void) err = register_filesystem(&f2fs_fs_type); if (err) goto free_shrinker; - err = f2fs_create_root_stats(); - if (err) - goto free_filesystem; + f2fs_create_root_stats(); err = f2fs_init_post_read_processing(); if (err) goto free_root_stats; @@ -3560,7 +3558,6 @@ static int __init init_f2fs_fs(void) free_root_stats: f2fs_destroy_root_stats(); -free_filesystem: unregister_filesystem(&f2fs_fs_type); free_shrinker: unregister_shrinker(&f2fs_shrinker_info); From bb8e82b64b4b93372a84c39e6fc1b17199e221ab Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 8 Jan 2019 12:50:01 -0800 Subject: [PATCH 020/855] etnaviv mailing list is moderated Signed-off-by: Matthew Wilcox Signed-off-by: Lucas Stach --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 32d444476a90..44888eb121d8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5141,7 +5141,7 @@ DRM DRIVERS FOR VIVANTE GPU IP M: Lucas Stach R: Russell King R: Christian Gmeiner -L: etnaviv@lists.freedesktop.org +L: etnaviv@lists.freedesktop.org (moderated for non-subscribers) L: dri-devel@lists.freedesktop.org S: Maintained F: drivers/gpu/drm/etnaviv/ From a20ca6744ba7e683a772bb5f1715e4662342be58 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 8 Jan 2019 12:43:56 -0800 Subject: [PATCH 021/855] 
Build etnaviv on non-ARM architectures I wanted to test-compile etnaviv on x86 after making a tree-wide change to it. Unfortunately, Kconfig has a bad dependency, so I couldn't. Signed-off-by: Matthew Wilcox Signed-off-by: Lucas Stach --- drivers/gpu/drm/etnaviv/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/etnaviv/Kconfig b/drivers/gpu/drm/etnaviv/Kconfig index 041a77e400d4..342591a1084e 100644 --- a/drivers/gpu/drm/etnaviv/Kconfig +++ b/drivers/gpu/drm/etnaviv/Kconfig @@ -2,7 +2,7 @@ config DRM_ETNAVIV tristate "ETNAVIV (DRM support for Vivante GPU IP cores)" depends on DRM - depends on ARCH_MXC || ARCH_DOVE || (ARM && COMPILE_TEST) + depends on ARCH_MXC || ARCH_DOVE || ARM || COMPILE_TEST depends on MMU select SHMEM select SYNC_FILE From 88ca66d8540ca26119b1428cddb96b37925bdf01 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 11 Jan 2019 09:49:30 +0100 Subject: [PATCH 022/855] x86/asm: Remove dead __GNUC__ conditionals The minimum supported gcc version is >= 4.6, so these can be removed. Signed-off-by: Rasmus Villemoes Signed-off-by: Borislav Petkov Cc: "H. Peter Anvin" Cc: Dan Williams Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: x86-ml Link: https://lkml.kernel.org/r/20190111084931.24601-1-linux@rasmusvillemoes.dk --- arch/x86/include/asm/bitops.h | 6 ------ arch/x86/include/asm/string_32.h | 20 -------------------- arch/x86/include/asm/string_64.h | 15 --------------- 3 files changed, 41 deletions(-) diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index ad7b210aa3f6..d153d570bb04 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -36,13 +36,7 @@ * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1). */ -#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1) -/* Technically wrong, but this avoids compilation errors on some gcc - versions. */ -#define BITOP_ADDR(x) "=m" (*(volatile long *) (x)) -#else #define BITOP_ADDR(x) "+m" (*(volatile long *) (x)) -#endif #define ADDR BITOP_ADDR(addr) diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h index 55d392c6bd29..2fd165f1cffa 100644 --- a/arch/x86/include/asm/string_32.h +++ b/arch/x86/include/asm/string_32.h @@ -179,14 +179,7 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len) * No 3D Now! */ -#if (__GNUC__ >= 4) #define memcpy(t, f, n) __builtin_memcpy(t, f, n) -#else -#define memcpy(t, f, n) \ - (__builtin_constant_p((n)) \ - ? __constant_memcpy((t), (f), (n)) \ - : __memcpy((t), (f), (n))) -#endif #endif #endif /* !CONFIG_FORTIFY_SOURCE */ @@ -282,12 +275,7 @@ void *__constant_c_and_count_memset(void *s, unsigned long pattern, { int d0, d1; -#if __GNUC__ == 4 && __GNUC_MINOR__ == 0 - /* Workaround for broken gcc 4.0 */ - register unsigned long eax asm("%eax") = pattern; -#else unsigned long eax = pattern; -#endif switch (count % 4) { case 0: @@ -321,15 +309,7 @@ void *__constant_c_and_count_memset(void *s, unsigned long pattern, #define __HAVE_ARCH_MEMSET extern void *memset(void *, int, size_t); #ifndef CONFIG_FORTIFY_SOURCE -#if (__GNUC__ >= 4) #define memset(s, c, count) __builtin_memset(s, c, count) -#else -#define memset(s, c, count) \ - (__builtin_constant_p(c) \ - ? 
__constant_c_x_memset((s), (0x01010101UL * (unsigned char)(c)), \ - (count)) \ - : __memset((s), (c), (count))) -#endif #endif /* !CONFIG_FORTIFY_SOURCE */ #define __HAVE_ARCH_MEMSET16 diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 4e4194e21a09..75314c3dbe47 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -14,21 +14,6 @@ extern void *memcpy(void *to, const void *from, size_t len); extern void *__memcpy(void *to, const void *from, size_t len); -#ifndef CONFIG_FORTIFY_SOURCE -#if (__GNUC__ == 4 && __GNUC_MINOR__ < 3) || __GNUC__ < 4 -#define memcpy(dst, src, len) \ -({ \ - size_t __len = (len); \ - void *__ret; \ - if (__builtin_constant_p(len) && __len >= 64) \ - __ret = __memcpy((dst), (src), __len); \ - else \ - __ret = __builtin_memcpy((dst), (src), __len); \ - __ret; \ -}) -#endif -#endif /* !CONFIG_FORTIFY_SOURCE */ - #define __HAVE_ARCH_MEMSET void *memset(void *s, int c, size_t n); void *__memset(void *s, int c, size_t n); From 2e905c7abdcd5ff09b9df33d25eb7148c85bed2a Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 11 Jan 2019 09:49:31 +0100 Subject: [PATCH 023/855] x86/asm: Remove unused __constant_c_x_memset() macro and inlines Nothing refers to the __constant_c_x_memset() macro anymore. Remove it and the two referenced static inline functions. Signed-off-by: Rasmus Villemoes Signed-off-by: Borislav Petkov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleixner Cc: x86-ml Link: https://lkml.kernel.org/r/20190111084931.24601-2-linux@rasmusvillemoes.dk --- arch/x86/include/asm/string_32.h | 84 -------------------------------- 1 file changed, 84 deletions(-) diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h index 2fd165f1cffa..f74362b05619 100644 --- a/arch/x86/include/asm/string_32.h +++ b/arch/x86/include/asm/string_32.h @@ -209,29 +209,6 @@ static inline void *__memset_generic(void *s, char c, size_t count) /* we might want to write optimized versions of these later */ #define __constant_count_memset(s, c, count) __memset_generic((s), (c), (count)) -/* - * memset(x, 0, y) is a reasonably common thing to do, so we want to fill - * things 32 bits at a time even when we don't know the size of the - * area at compile-time.. - */ -static __always_inline -void *__constant_c_memset(void *s, unsigned long c, size_t count) -{ - int d0, d1; - asm volatile("rep ; stosl\n\t" - "testb $2,%b3\n\t" - "je 1f\n\t" - "stosw\n" - "1:\ttestb $1,%b3\n\t" - "je 2f\n\t" - "stosb\n" - "2:" - : "=&c" (d0), "=&D" (d1) - : "a" (c), "q" (count), "0" (count/4), "1" ((long)s) - : "memory"); - return s; -} - /* Added by Gertjan van Wingerde to make minix and sysv module work */ #define __HAVE_ARCH_STRNLEN extern size_t strnlen(const char *s, size_t count); @@ -240,67 +217,6 @@ extern size_t strnlen(const char *s, size_t count); #define __HAVE_ARCH_STRSTR extern char *strstr(const char *cs, const char *ct); -/* - * This looks horribly ugly, but the compiler can optimize it totally, - * as we by now know that both pattern and count is constant.. 
- */ -static __always_inline -void *__constant_c_and_count_memset(void *s, unsigned long pattern, - size_t count) -{ - switch (count) { - case 0: - return s; - case 1: - *(unsigned char *)s = pattern & 0xff; - return s; - case 2: - *(unsigned short *)s = pattern & 0xffff; - return s; - case 3: - *(unsigned short *)s = pattern & 0xffff; - *((unsigned char *)s + 2) = pattern & 0xff; - return s; - case 4: - *(unsigned long *)s = pattern; - return s; - } - -#define COMMON(x) \ - asm volatile("rep ; stosl" \ - x \ - : "=&c" (d0), "=&D" (d1) \ - : "a" (eax), "0" (count/4), "1" ((long)s) \ - : "memory") - - { - int d0, d1; - unsigned long eax = pattern; - - switch (count % 4) { - case 0: - COMMON(""); - return s; - case 1: - COMMON("\n\tstosb"); - return s; - case 2: - COMMON("\n\tstosw"); - return s; - default: - COMMON("\n\tstosw\n\tstosb"); - return s; - } - } - -#undef COMMON -} - -#define __constant_c_x_memset(s, c, count) \ - (__builtin_constant_p(count) \ - ? __constant_c_and_count_memset((s), (c), (count)) \ - : __constant_c_memset((s), (c), (count))) - #define __memset(s, c, count) \ (__builtin_constant_p(count) \ ? __constant_count_memset((s), (c), (count)) \ From f8261c376e7f8cb9024af5a6c54be540c7f9108e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 14 Jan 2019 13:49:46 +0300 Subject: [PATCH 024/855] drm/etnaviv: NULL vs IS_ERR() buf in etnaviv_core_dump() The etnaviv_gem_get_pages() never returns NULL. It returns error pointers on error. Fixes: a8c21a5451d8 ("drm/etnaviv: add initial etnaviv DRM driver") Signed-off-by: Dan Carpenter Signed-off-by: Lucas Stach --- drivers/gpu/drm/etnaviv/etnaviv_dump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_dump.c b/drivers/gpu/drm/etnaviv/etnaviv_dump.c index 3fbb4855396c..33854c94cb85 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_dump.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_dump.c @@ -215,7 +215,7 @@ void etnaviv_core_dump(struct etnaviv_gpu *gpu) mutex_lock(&obj->lock); pages = etnaviv_gem_get_pages(obj); mutex_unlock(&obj->lock); - if (pages) { + if (!IS_ERR(pages)) { int j; iter.hdr->data[0] = bomap - bomap_start; From 5b8e432dbb0e20bd8e99033f4a0bfa0d38c0e08e Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Tue, 15 Jan 2019 00:41:43 +0800 Subject: [PATCH 025/855] xen/blkback: add stack variable 'blkif' in connect_ring() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As 'be->blkif' is used for many times in connect_ring(), the stack variable 'blkif' is added to substitute 'be-blkif'. 
Suggested-by: Paul Durrant Signed-off-by: Dongli Zhang Reviewed-by: Paul Durrant Reviewed-by: Roger Pau Monné Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/xenbus.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index a4bc74e72c39..a4aadac772e5 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -1023,6 +1023,7 @@ fail: static int connect_ring(struct backend_info *be) { struct xenbus_device *dev = be->dev; + struct xen_blkif *blkif = be->blkif; unsigned int pers_grants; char protocol[64] = ""; int err, i; @@ -1033,25 +1034,25 @@ static int connect_ring(struct backend_info *be) pr_debug("%s %s\n", __func__, dev->otherend); - be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT; + blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT; err = xenbus_scanf(XBT_NIL, dev->otherend, "protocol", "%63s", protocol); if (err <= 0) strcpy(protocol, "unspecified, assuming default"); else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE)) - be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; + blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32)) - be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32; + blkif->blk_protocol = BLKIF_PROTOCOL_X86_32; else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64)) - be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64; + blkif->blk_protocol = BLKIF_PROTOCOL_X86_64; else { xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol); return -ENOSYS; } pers_grants = xenbus_read_unsigned(dev->otherend, "feature-persistent", 0); - be->blkif->vbd.feature_gnt_persistent = pers_grants; - be->blkif->vbd.overflow_max_grants = 0; + blkif->vbd.feature_gnt_persistent = pers_grants; + blkif->vbd.overflow_max_grants = 0; /* * Read the number of hardware queues from frontend. @@ -1067,16 +1068,16 @@ static int connect_ring(struct backend_info *be) requested_num_queues, xenblk_max_queues); return -ENOSYS; } - be->blkif->nr_rings = requested_num_queues; - if (xen_blkif_alloc_rings(be->blkif)) + blkif->nr_rings = requested_num_queues; + if (xen_blkif_alloc_rings(blkif)) return -ENOMEM; pr_info("%s: using %d queues, protocol %d (%s) %s\n", dev->nodename, - be->blkif->nr_rings, be->blkif->blk_protocol, protocol, + blkif->nr_rings, blkif->blk_protocol, protocol, pers_grants ? "persistent grants" : ""); - if (be->blkif->nr_rings == 1) - return read_per_ring_refs(&be->blkif->rings[0], dev->otherend); + if (blkif->nr_rings == 1) + return read_per_ring_refs(&blkif->rings[0], dev->otherend); else { xspathsize = strlen(dev->otherend) + xenstore_path_ext_size; xspath = kmalloc(xspathsize, GFP_KERNEL); @@ -1085,10 +1086,10 @@ static int connect_ring(struct backend_info *be) return -ENOMEM; } - for (i = 0; i < be->blkif->nr_rings; i++) { + for (i = 0; i < blkif->nr_rings; i++) { memset(xspath, 0, xspathsize); snprintf(xspath, xspathsize, "%s/queue-%u", dev->otherend, i); - err = read_per_ring_refs(&be->blkif->rings[i], xspath); + err = read_per_ring_refs(&blkif->rings[i], xspath); if (err) { kfree(xspath); return err; From fd4b77e8deea9cced6c1eca8bd64d2f03fc7cc94 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Tue, 15 Jan 2019 09:53:22 +0100 Subject: [PATCH 026/855] drm/etnaviv: don't restrict to certain architectures The Vivante GPU cores are found in many different SoCs and the driver does not depend on anything architecture specific, so just drop the architecture restriction. 
Signed-off-by: Lucas Stach Acked-by: Sam Ravnborg --- drivers/gpu/drm/etnaviv/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/etnaviv/Kconfig b/drivers/gpu/drm/etnaviv/Kconfig index 342591a1084e..21df44b78df3 100644 --- a/drivers/gpu/drm/etnaviv/Kconfig +++ b/drivers/gpu/drm/etnaviv/Kconfig @@ -2,7 +2,6 @@ config DRM_ETNAVIV tristate "ETNAVIV (DRM support for Vivante GPU IP cores)" depends on DRM - depends on ARCH_MXC || ARCH_DOVE || ARM || COMPILE_TEST depends on MMU select SHMEM select SYNC_FILE From ddf06b753a8573eb0323d91efeaffe397f7b673d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 8 Jan 2019 10:21:24 +0800 Subject: [PATCH 027/855] f2fs: fix to trigger fsck if dirent.name_len is zero While traversing dirents in f2fs_fill_dentries(), if bitmap is valid, filename length should not be zero, otherwise, directory structure consistency could be corrupted, in this case, let's print related info and set SBI_NEED_FSCK to trigger fsck for repairing. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 50d0d36280fa..926166528cd4 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -800,6 +800,10 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, if (de->name_len == 0) { bit_pos++; ctx->pos = start_pos + bit_pos; + printk_ratelimited( + "%s, invalid namelen(0), ino:%u, run fsck to fix.", + KERN_WARNING, le32_to_cpu(de->ino)); + set_sbi_flag(sbi, SBI_NEED_FSCK); continue; } From 720db068634c91553a8e1d9a0fcd8c7050e06d2b Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Mon, 7 Jan 2019 15:02:34 +0800 Subject: [PATCH 028/855] f2fs: check if file namelen exceeds max value Dentry bitmap is not enough to detect incorrect dentries. So this patch also checks the namelen value of a dentry. Signed-off-by: Gong Chen Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 926166528cd4..ba7535399d95 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -814,7 +814,8 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, /* check memory boundary before moving forward */ bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); - if (unlikely(bit_pos > d->max)) { + if (unlikely(bit_pos > d->max || + le16_to_cpu(de->name_len) > F2FS_NAME_LEN)) { f2fs_msg(sbi->sb, KERN_WARNING, "%s: corrupted namelen=%d, run fsck to fix.", __func__, le16_to_cpu(de->name_len)); From 2f84babfe5eb270e13d54afebc26b6d19f49cba2 Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Mon, 14 Jan 2019 22:05:14 +0800 Subject: [PATCH 029/855] f2fs: add brackets for macros Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7df41cd1eb35..f71b44f21ffb 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -254,7 +254,7 @@ struct discard_entry { /* max discard pend list number */ #define MAX_PLIST_NUM 512 #define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? 
\ - (MAX_PLIST_NUM - 1) : (blk_num - 1)) + (MAX_PLIST_NUM - 1) : ((blk_num) - 1)) enum { D_PREP, /* initial */ @@ -2763,9 +2763,9 @@ static inline int get_inline_xattr_addrs(struct inode *inode) #define F2FS_OLD_ATTRIBUTE_SIZE (offsetof(struct f2fs_inode, i_addr)) #define F2FS_FITS_IN_INODE(f2fs_inode, extra_isize, field) \ - ((offsetof(typeof(*f2fs_inode), field) + \ + ((offsetof(typeof(*(f2fs_inode)), field) + \ sizeof((f2fs_inode)->field)) \ - <= (F2FS_OLD_ATTRIBUTE_SIZE + extra_isize)) \ + <= (F2FS_OLD_ATTRIBUTE_SIZE + (extra_isize))) \ static inline void f2fs_reset_iostat(struct f2fs_sb_info *sbi) { @@ -2794,8 +2794,8 @@ static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, #define __is_large_section(sbi) ((sbi)->segs_per_sec > 1) -#define __is_meta_io(fio) (PAGE_TYPE_OF_BIO(fio->type) == META && \ - (!is_read_io(fio->op) || fio->is_meta)) +#define __is_meta_io(fio) (PAGE_TYPE_OF_BIO((fio)->type) == META && \ + (!is_read_io((fio)->op) || (fio)->is_meta)) bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); From ac92985864e187a1735502f6a02f54eaa655b2aa Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Tue, 15 Jan 2019 20:02:15 +0000 Subject: [PATCH 030/855] f2fs: UBSAN: set boolean value iostat_enable correctly When setting /sys/fs/f2fs//iostat_enable with non-bool value, UBSAN reports the following warning. [ 7562.295484] ================================================================================ [ 7562.296531] UBSAN: Undefined behaviour in fs/f2fs/f2fs.h:2776:10 [ 7562.297651] load of value 64 is not a valid value for type '_Bool' [ 7562.298642] CPU: 1 PID: 7487 Comm: dd Not tainted 4.20.0-rc4+ #79 [ 7562.298653] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 7562.298662] Call Trace: [ 7562.298760] dump_stack+0x46/0x5b [ 7562.298811] ubsan_epilogue+0x9/0x40 [ 7562.298830] __ubsan_handle_load_invalid_value+0x72/0x90 [ 7562.298863] f2fs_file_write_iter+0x29f/0x3f0 [ 7562.298905] __vfs_write+0x115/0x160 [ 7562.298922] vfs_write+0xa7/0x190 [ 7562.298934] ksys_write+0x50/0xc0 [ 7562.298973] do_syscall_64+0x4a/0xe0 [ 7562.298992] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 7562.299001] RIP: 0033:0x7fa45ec19c00 [ 7562.299004] Code: 73 01 c3 48 8b 0d 88 92 2c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d dd eb 2c 00 00 75 10 b8 01 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 ce 8f 01 00 48 89 04 24 [ 7562.299044] RSP: 002b:00007ffca52b49e8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 7562.299052] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fa45ec19c00 [ 7562.299059] RDX: 0000000000000400 RSI: 000000000093f000 RDI: 0000000000000001 [ 7562.299065] RBP: 000000000093f000 R08: 0000000000000004 R09: 0000000000000000 [ 7562.299071] R10: 00007ffca52b47b0 R11: 0000000000000246 R12: 0000000000000400 [ 7562.299077] R13: 000000000093f000 R14: 000000000093f400 R15: 0000000000000000 [ 7562.299091] ================================================================================ So, if iostat_enable is enabled, set its value as true. 
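A self-contained sketch of the pattern being fixed (the struct and field names are simplified stand-ins for the f2fs code): the generic sysfs store path wrote the parsed value into the bool through an unsigned int pointer, so the field could end up holding a value such as 64, and the later load of it is the undefined behaviour UBSAN reports; normalising with '!!' before the store keeps the stored value at 0 or 1:

  #include <stdbool.h>
  #include <stdio.h>

  struct sb_info {
          bool iostat_enable;
  };

  int main(void)
  {
          struct sb_info sbi = { 0 };
          unsigned long t = 64;           /* value parsed from the sysfs write */

          /*
           * Buggy pattern (left as a comment to avoid committing the UB here):
           *         unsigned int *ui = (unsigned int *)&sbi.iostat_enable;
           *         *ui = t;
           * Fixed pattern: handle the bool attribute explicitly and normalise.
           */
          sbi.iostat_enable = !!t;
          printf("iostat_enable = %d\n", (int)sbi.iostat_enable);
          return 0;
  }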
Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 02d4012a9183..5b83e1d66cb3 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -280,10 +280,16 @@ out: return count; } - *ui = t; - if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0) - f2fs_reset_iostat(sbi); + if (!strcmp(a->attr.name, "iostat_enable")) { + sbi->iostat_enable = !!t; + if (!sbi->iostat_enable) + f2fs_reset_iostat(sbi); + return count; + } + + *ui = (unsigned int)t; + return count; } From f9aa52a8cbe09fe25244d59c29660bbe635df613 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 16 Jan 2019 09:51:28 +0800 Subject: [PATCH 031/855] f2fs: fix to initialize variable to avoid UBSAN/smatch warning As Dan Carpenter as below: The patch df634f444ee9: "f2fs: use rb_*_cached friends" from Oct 4, 2018, leads to the following static checker warning: fs/f2fs/extent_cache.c:606 f2fs_update_extent_tree_range() error: uninitialized symbol 'leftmost'. And also Eric Biggers, and Kyungtae Kim reported, there is an UBSAN warning described as below: We report a bug in linux-4.20.2: "UBSAN: Undefined behaviour in fs/f2fs/extent_cache.c" kernel config: https://kt0755.github.io/etc/config_v4.20_stable repro: https://kt0755.github.io/etc/repro.4a3e7.c (f2fs is mounted on /mnt/f2fs/) This arose in f2fs_update_extent_tree_range (fs/f2fs/extent_cache.c:605). It seems that, for some reason, its last argument became "24" although that was supposed to be bool type. ========================================= UBSAN: Undefined behaviour in fs/f2fs/extent_cache.c:605:4 load of value 24 is not a valid value for type '_Bool' CPU: 0 PID: 6774 Comm: syz-executor5 Not tainted 4.20.2 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0xb1/0x118 lib/dump_stack.c:113 ubsan_epilogue+0x12/0x94 lib/ubsan.c:159 __ubsan_handle_load_invalid_value+0x17a/0x1be lib/ubsan.c:457 f2fs_update_extent_tree_range+0x1d4a/0x1d50 fs/f2fs/extent_cache.c:605 f2fs_update_extent_cache+0x2b6/0x350 fs/f2fs/extent_cache.c:804 f2fs_update_data_blkaddr+0x61/0x70 fs/f2fs/data.c:656 f2fs_outplace_write_data+0x1d6/0x4b0 fs/f2fs/segment.c:3140 f2fs_convert_inline_page+0x86d/0x2060 fs/f2fs/inline.c:163 f2fs_convert_inline_inode+0x6b5/0xad0 fs/f2fs/inline.c:208 f2fs_preallocate_blocks+0x78b/0xb00 fs/f2fs/data.c:982 f2fs_file_write_iter+0x31b/0xf40 fs/f2fs/file.c:3062 call_write_iter include/linux/fs.h:1857 [inline] new_sync_write fs/read_write.c:474 [inline] __vfs_write+0x538/0x6e0 fs/read_write.c:487 vfs_write+0x1b3/0x520 fs/read_write.c:549 ksys_write+0xde/0x1c0 fs/read_write.c:598 __do_sys_write fs/read_write.c:610 [inline] __se_sys_write fs/read_write.c:607 [inline] __x64_sys_write+0x7e/0xc0 fs/read_write.c:607 do_syscall_64+0xbe/0x4f0 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x4497b9 Code: e8 8c 9f 02 00 48 83 c4 18 c3 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 9b 6b fc ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f1ea15edc68 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 00007f1ea15ee6cc RCX: 00000000004497b9 RDX: 0000000000001000 RSI: 0000000020000140 RDI: 0000000000000013 RBP: 000000000071bea0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 
00000000ffffffff R13: 000000000000bb50 R14: 00000000006f4bf0 R15: 00007f1ea15ee700 ========================================= As I checked, this uninitialized variable won't cause extent cache corruption, but in order to avoid such kind of warning of both UBSAN and smatch, fix to initialize related variable. Reported-by: Dan Carpenter Reported-by: Eric Biggers Reported-by: Kyungtae Kim Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 1cb0fcc67d2d..caf77fe8ac07 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -506,7 +506,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode, unsigned int end = fofs + len; unsigned int pos = (unsigned int)fofs; bool updated = false; - bool leftmost; + bool leftmost = false; if (!et) return; From 2010987365ab3b381df20bb6f175b59551cfe67d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 10 Jan 2019 16:40:12 +0800 Subject: [PATCH 032/855] f2fs: fix to set sbi dirty correctly In order to record direct IO count, we add two additional type in enum count_type: F2FS_DIO_{WRITE,READ}, but those IO won't dirty filesystem metadata, so we don't need to set filesystem dirty in inc_page_count(), fix it. Fixes: 02b16d0a34a1 ("f2fs: add to account direct IO") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f71b44f21ffb..0f564883e078 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1799,13 +1799,12 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) { atomic_inc(&sbi->nr_pages[count_type]); - if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES || - count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA || - count_type == F2FS_RD_DATA || count_type == F2FS_RD_NODE || - count_type == F2FS_RD_META) - return; - - set_sbi_flag(sbi, SBI_IS_DIRTY); + if (count_type == F2FS_DIRTY_DENTS || + count_type == F2FS_DIRTY_NODES || + count_type == F2FS_DIRTY_META || + count_type == F2FS_DIRTY_QDATA || + count_type == F2FS_DIRTY_IMETA) + set_sbi_flag(sbi, SBI_IS_DIRTY); } static inline void inode_inc_dirty_pages(struct inode *inode) From 3bbe8b1a4ae9585e9cf15e7036bf9e5374a482df Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 23 Jan 2019 01:16:32 -0600 Subject: [PATCH 033/855] 9p: mark expected switch fall-through In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. This patch fixes the following warning: net/9p/trans_xen.c:514:6: warning: this statement may fall through [-Wimplicit-fallthrough=] Warning level 3 was used: -Wimplicit-fallthrough=3 This patch is part of the ongoing efforts to enabling -Wimplicit-fallthrough Link: http://lkml.kernel.org/r/20190123071632.GA8039@embeddedor Signed-off-by: Gustavo A. R. 
Silva Signed-off-by: Dominique Martinet --- net/9p/trans_xen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index e2fbf3677b9b..29420ebb8f07 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -513,7 +513,7 @@ static void xen_9pfs_front_changed(struct xenbus_device *dev, case XenbusStateClosed: if (dev->state == XenbusStateClosed) break; - /* Missed the backend's CLOSING state -- fallthrough */ + /* fall through - Missed the backend's CLOSING state */ case XenbusStateClosing: xenbus_frontend_closed(dev); break; From 664525b2d84abca1074c9546654ae9689de8a818 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 24 Jan 2019 13:12:04 -0800 Subject: [PATCH 034/855] device-dax: Auto-bind device after successful new_id The typical 'new_id' attribute behavior is to immediately attach a device to its driver after a new device-id is added. Implement this behavior for the dax bus. Reported-by: Alexander Duyck Reported-by: Brice Goglin Cc: Dave Hansen Signed-off-by: Dan Williams --- drivers/dax/bus.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index c620ad52d7e5..a410154d75fb 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -57,8 +57,13 @@ static int dax_match_id(struct dax_device_driver *dax_drv, struct device *dev) return match; } +enum id_action { + ID_REMOVE, + ID_ADD, +}; + static ssize_t do_id_store(struct device_driver *drv, const char *buf, - size_t count, bool add) + size_t count, enum id_action action) { struct dax_device_driver *dax_drv = to_dax_drv(drv); unsigned int region_id, id; @@ -77,7 +82,7 @@ static ssize_t do_id_store(struct device_driver *drv, const char *buf, mutex_lock(&dax_bus_lock); dax_id = __dax_match_id(dax_drv, buf); if (!dax_id) { - if (add) { + if (action == ID_ADD) { dax_id = kzalloc(sizeof(*dax_id), GFP_KERNEL); if (dax_id) { strncpy(dax_id->dev_name, buf, DAX_NAME_LEN); @@ -86,26 +91,33 @@ static ssize_t do_id_store(struct device_driver *drv, const char *buf, rc = -ENOMEM; } else /* nothing to remove */; - } else if (!add) { + } else if (action == ID_REMOVE) { list_del(&dax_id->list); kfree(dax_id); } else /* dax_id already added */; mutex_unlock(&dax_bus_lock); - return rc; + + if (rc < 0) + return rc; + if (action == ID_ADD) + rc = driver_attach(drv); + if (rc) + return rc; + return count; } static ssize_t new_id_store(struct device_driver *drv, const char *buf, size_t count) { - return do_id_store(drv, buf, count, true); + return do_id_store(drv, buf, count, ID_ADD); } static DRIVER_ATTR_WO(new_id); static ssize_t remove_id_store(struct device_driver *drv, const char *buf, size_t count) { - return do_id_store(drv, buf, count, false); + return do_id_store(drv, buf, count, ID_REMOVE); } static DRIVER_ATTR_WO(remove_id); From 9ec5cd0a40f2e7834ddd55708e8def18f0940244 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 13 Dec 2018 14:04:56 +0100 Subject: [PATCH 035/855] ARM: 8817/1: mm: skip cleaning of idmap page tables on LPAE capable cores Currently, init_static_idmap() installs some page table entries to cover the identity mapped part of the kernel image (which is only about 160 bytes in size in a multi_v7_defconfig Thumb2 build), and calls flush_cache_louis() to ensure that the updates are visible to the page table walker on the same core. 
When running under virtualization, flush_cache_louis() may take more than 10 seconds to complete: [ 0.108192] Setting up static identity map for 0x40300000 - 0x403000a0 [ 13.078127] rcu: Hierarchical SRCU implementation. This is due to the fact that set/way ops are not virtualizable, and so KVM may trap each one, resulting in a substantial delay. Since only LPAE capable CPUs may execute under virtualization, and considering that LPAE capable CPUs are guaranteed to have cache coherent page table walkers (per the architecture), let's only perform this cache maintenance on non-LPAE cores. Cc: Will Deacon Acked-by: Marc Zyngier Signed-off-by: Ard Biesheuvel Signed-off-by: Russell King --- arch/arm/mm/idmap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm/mm/idmap.c b/arch/arm/mm/idmap.c index 1d1edd064199..a033f6134a64 100644 --- a/arch/arm/mm/idmap.c +++ b/arch/arm/mm/idmap.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -110,7 +111,8 @@ static int __init init_static_idmap(void) __idmap_text_end, 0); /* Flush L1 for the hardware to see this page table content */ - flush_cache_louis(); + if (!(elf_hwcap & HWCAP_LPAE)) + flush_cache_louis(); return 0; } From 4a4d68fc2657df9a2b4f114081141e2599804586 Mon Sep 17 00:00:00 2001 From: "Wolfram Sang (Renesas)" Date: Wed, 19 Dec 2018 00:12:07 +0100 Subject: [PATCH 036/855] ARM: 8818/1: dma-mapping: update comment about handling dma_ops when detaching from IOMMU Update the comment because we don't set the pointer to NULL anymore. Also use the correct pointer name 'dma_ops' instead of 'dma_map_ops'. Fixes: 1874619a7df4 ("ARM: dma-mapping: Set proper DMA ops in arm_iommu_detach_device()") Signed-off-by: Wolfram Sang Reviewed-by: Geert Uytterhoeven Signed-off-by: Russell King --- arch/arm/mm/dma-mapping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index f1e2922e447c..fae7465767b9 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -2277,7 +2277,7 @@ EXPORT_SYMBOL_GPL(arm_iommu_attach_device); * @dev: valid struct device pointer * * Detaches the provided device from a previously attached map. - * This voids the dma operations (dma_map_ops pointer) + * This overwrites the dma_ops pointer with appropriate non-IOMMU ops. */ void arm_iommu_detach_device(struct device *dev) { From 091bb549f7722723b284f63ac665e2aedcf9dec9 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Sat, 5 Jan 2019 19:35:25 +0100 Subject: [PATCH 037/855] ARM: 8819/1: Remove '-p' from LDFLAGS This option is not supported by lld: ld.lld: error: unknown argument: -p This has been a no-op in binutils since 2004 (see commit dea514f51da1 in that tree). Given that the lowest officially supported of binutils for the kernel is 2.20, which was released in 2009, nobody needs this flag around so just remove it. Commit 1a381d4a0a9a ("arm64: remove no-op -p linker flag") did the same for arm64. 
Signed-off-by: Nathan Chancellor Acked-by: Ard Biesheuvel Acked-by: Nicolas Pitre Reviewed-by: Nick Desaulniers Reviewed-by: Stefan Agner Signed-off-by: Russell King --- arch/arm/Makefile | 2 +- arch/arm/boot/bootp/Makefile | 2 +- arch/arm/boot/compressed/Makefile | 2 -- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/arm/Makefile b/arch/arm/Makefile index 9db3c584b2cb..ac0187c8d258 100644 --- a/arch/arm/Makefile +++ b/arch/arm/Makefile @@ -10,7 +10,7 @@ # # Copyright (C) 1995-2001 by Russell King -LDFLAGS_vmlinux :=-p --no-undefined -X --pic-veneer +LDFLAGS_vmlinux := --no-undefined -X --pic-veneer ifeq ($(CONFIG_CPU_ENDIAN_BE8),y) LDFLAGS_vmlinux += --be8 KBUILD_LDFLAGS_MODULE += --be8 diff --git a/arch/arm/boot/bootp/Makefile b/arch/arm/boot/bootp/Makefile index 83e1a076a5d6..981a8d03f064 100644 --- a/arch/arm/boot/bootp/Makefile +++ b/arch/arm/boot/bootp/Makefile @@ -8,7 +8,7 @@ GCOV_PROFILE := n -LDFLAGS_bootp :=-p --no-undefined -X \ +LDFLAGS_bootp := --no-undefined -X \ --defsym initrd_phys=$(INITRD_PHYS) \ --defsym params_phys=$(PARAMS_PHYS) -T AFLAGS_initrd.o :=-DINITRD=\"$(INITRD)\" diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile index 6114ae6ea466..9219389bbe61 100644 --- a/arch/arm/boot/compressed/Makefile +++ b/arch/arm/boot/compressed/Makefile @@ -132,8 +132,6 @@ endif ifeq ($(CONFIG_CPU_ENDIAN_BE8),y) LDFLAGS_vmlinux += --be8 endif -# ? -LDFLAGS_vmlinux += -p # Report unresolved symbol references LDFLAGS_vmlinux += --no-undefined # Delete all temporary local symbols From 1c31d4e96b8c205fe3aa8e73e930a0ccbf4b9a2b Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 8 Jan 2019 14:27:01 +0100 Subject: [PATCH 038/855] ARM: 8820/1: mm: Stop printing the virtual memory layout Since commit ad67b74d2469d9b8 ("printk: hash addresses printed with %p"), the virtual memory layout printed during boot up contains "ptrval" instead of actual addresses: Memory: 501296K/524288K available (6144K kernel code, 528K rwdata, 1944K rodata, 1024K init, 7584K bss, 22992K reserved, 0K cma-reserved) Virtual kernel memory layout: vector : 0xffff0000 - 0xffff1000 ( 4 kB) fixmap : 0xffc00000 - 0xfff00000 (3072 kB) vmalloc : 0xe0800000 - 0xff800000 ( 496 MB) lowmem : 0xc0000000 - 0xe0000000 ( 512 MB) modules : 0xbf000000 - 0xc0000000 ( 16 MB) .text : 0x(ptrval) - 0x(ptrval) (7136 kB) .init : 0x(ptrval) - 0x(ptrval) (1024 kB) .data : 0x(ptrval) - 0x(ptrval) ( 529 kB) .bss : 0x(ptrval) - 0x(ptrval) (7585 kB) Instead of changing the printing to "%px", and leaking virtual memory layout information again, just remove the printing completely, cfr. e.g. commits 071929dbdd865f77 ("arm64: Stop printing the virtual memory layout") and 31833332f7987636 ("m68k/mm: Stop printing the virtual memory layout"). All interesting information (actual section sizes) is already printed by mem_init_print_info() just above anyway. 
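For reference, a minimal illustrative module (built with a standard kbuild Makefile; the module name is made up and nothing here is part of the patch) showing why the removed dump only printed placeholder values: plain %p hashes kernel pointers before printing, while %px prints the raw address and would leak the layout again:

  #include <linux/init.h>
  #include <linux/kernel.h>
  #include <linux/module.h>

  static int __init ptrdemo_init(void)
  {
          pr_info("hashed:   %p\n",  ptrdemo_init);  /* hashed, or "(ptrval)" early in boot */
          pr_info("unhashed: %px\n", ptrdemo_init);  /* the real address */
          return 0;
  }

  static void __exit ptrdemo_exit(void)
  {
  }

  module_init(ptrdemo_init);
  module_exit(ptrdemo_exit);
  MODULE_LICENSE("GPL");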
Signed-off-by: Geert Uytterhoeven Reviewed-by: Kees Cook Signed-off-by: Russell King --- arch/arm/mm/init.c | 49 ---------------------------------------------- 1 file changed, 49 deletions(-) diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 478ea8b7db87..57962080e47a 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -494,55 +494,6 @@ void __init mem_init(void) mem_init_print_info(NULL); -#define MLK(b, t) b, t, ((t) - (b)) >> 10 -#define MLM(b, t) b, t, ((t) - (b)) >> 20 -#define MLK_ROUNDUP(b, t) b, t, DIV_ROUND_UP(((t) - (b)), SZ_1K) - - pr_notice("Virtual kernel memory layout:\n" - " vector : 0x%08lx - 0x%08lx (%4ld kB)\n" -#ifdef CONFIG_HAVE_TCM - " DTCM : 0x%08lx - 0x%08lx (%4ld kB)\n" - " ITCM : 0x%08lx - 0x%08lx (%4ld kB)\n" -#endif - " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" - " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n" - " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n" -#ifdef CONFIG_HIGHMEM - " pkmap : 0x%08lx - 0x%08lx (%4ld MB)\n" -#endif -#ifdef CONFIG_MODULES - " modules : 0x%08lx - 0x%08lx (%4ld MB)\n" -#endif - " .text : 0x%p" " - 0x%p" " (%4td kB)\n" - " .init : 0x%p" " - 0x%p" " (%4td kB)\n" - " .data : 0x%p" " - 0x%p" " (%4td kB)\n" - " .bss : 0x%p" " - 0x%p" " (%4td kB)\n", - - MLK(VECTORS_BASE, VECTORS_BASE + PAGE_SIZE), -#ifdef CONFIG_HAVE_TCM - MLK(DTCM_OFFSET, (unsigned long) dtcm_end), - MLK(ITCM_OFFSET, (unsigned long) itcm_end), -#endif - MLK(FIXADDR_START, FIXADDR_END), - MLM(VMALLOC_START, VMALLOC_END), - MLM(PAGE_OFFSET, (unsigned long)high_memory), -#ifdef CONFIG_HIGHMEM - MLM(PKMAP_BASE, (PKMAP_BASE) + (LAST_PKMAP) * - (PAGE_SIZE)), -#endif -#ifdef CONFIG_MODULES - MLM(MODULES_VADDR, MODULES_END), -#endif - - MLK_ROUNDUP(_text, _etext), - MLK_ROUNDUP(__init_begin, __init_end), - MLK_ROUNDUP(_sdata, _edata), - MLK_ROUNDUP(__bss_start, __bss_stop)); - -#undef MLK -#undef MLM -#undef MLK_ROUNDUP - /* * Check boundaries twice: Some fundamental inconsistencies can * be detected at build time already. From 8f433ec4d0c41d8d01113bc0bf846f2423df6c71 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 8 Jan 2019 14:28:05 +0100 Subject: [PATCH 039/855] ARM: 8821/1: Correct meaning of SCU in HAVE_ARM_SCU help txt According to the ARM Cortex-A5 and Cortex-A9 Technical Reference Manuals, SCU stands for "Snoop Control Unit". Signed-off-by: Geert Uytterhoeven Signed-off-by: Russell King --- arch/arm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 664e918e2624..8986cf07fe1f 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1304,7 +1304,7 @@ config SCHED_SMT config HAVE_ARM_SCU bool help - This option enables support for the ARM system coherency unit + This option enables support for the ARM snoop control unit config HAVE_ARM_ARCH_TIMER bool "Architected timer support" From fec9eac6594750ee74c99549f13de3aa9de91b18 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 8 Jan 2019 14:29:23 +0100 Subject: [PATCH 040/855] ARM: 8822/1: smp_twd: Remove legacy TWD registration As of commit 7484c727b636a838 ("ARM: realview: delete the RealView board files"), the ARM Timer and Watchdog Unit is instantiated from DT only. Moreover, the driver is selected from ARCH_MULTIPLATFORM platforms only, which implies OF, TIMER_OF, and COMMON_CLK. Hence remove all unused legacy infrastructure from the driver. 
Signed-off-by: Geert Uytterhoeven Reviewed-by: Linus Walleij Acked-by: Marc Zyngier Signed-off-by: Russell King --- arch/arm/Kconfig | 1 - arch/arm/include/asm/smp_twd.h | 16 --------- arch/arm/kernel/smp_twd.c | 66 ---------------------------------- 3 files changed, 83 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 8986cf07fe1f..93f380931d5b 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1316,7 +1316,6 @@ config HAVE_ARM_ARCH_TIMER config HAVE_ARM_TWD bool - select TIMER_OF if OF help This options enables support for the ARM timer and watchdog unit diff --git a/arch/arm/include/asm/smp_twd.h b/arch/arm/include/asm/smp_twd.h index 312784ee9936..c729d2113a24 100644 --- a/arch/arm/include/asm/smp_twd.h +++ b/arch/arm/include/asm/smp_twd.h @@ -19,20 +19,4 @@ #define TWD_TIMER_CONTROL_PERIODIC (1 << 1) #define TWD_TIMER_CONTROL_IT_ENABLE (1 << 2) -#include - -struct twd_local_timer { - struct resource res[2]; -}; - -#define DEFINE_TWD_LOCAL_TIMER(name,base,irq) \ -struct twd_local_timer name __initdata = { \ - .res = { \ - DEFINE_RES_MEM(base, 0x10), \ - DEFINE_RES_IRQ(irq), \ - }, \ -}; - -int twd_local_timer_register(struct twd_local_timer *); - #endif diff --git a/arch/arm/kernel/smp_twd.c b/arch/arm/kernel/smp_twd.c index b30eafeef096..3cdc399b9fc3 100644 --- a/arch/arm/kernel/smp_twd.c +++ b/arch/arm/kernel/smp_twd.c @@ -100,8 +100,6 @@ static void twd_timer_stop(void) disable_percpu_irq(clk->irq); } -#ifdef CONFIG_COMMON_CLK - /* * Updates clockevent frequency when the cpu frequency changes. * Called on the cpu that is changing frequency with interrupts disabled. @@ -143,54 +141,6 @@ static int twd_clk_init(void) } core_initcall(twd_clk_init); -#elif defined (CONFIG_CPU_FREQ) - -#include - -/* - * Updates clockevent frequency when the cpu frequency changes. - * Called on the cpu that is changing frequency with interrupts disabled. - */ -static void twd_update_frequency(void *data) -{ - twd_timer_rate = clk_get_rate(twd_clk); - - clockevents_update_freq(raw_cpu_ptr(twd_evt), twd_timer_rate); -} - -static int twd_cpufreq_transition(struct notifier_block *nb, - unsigned long state, void *data) -{ - struct cpufreq_freqs *freqs = data; - - /* - * The twd clock events must be reprogrammed to account for the new - * frequency. The timer is local to a cpu, so cross-call to the - * changing cpu. 
- */ - if (state == CPUFREQ_POSTCHANGE) - smp_call_function_single(freqs->cpu, twd_update_frequency, - NULL, 1); - - return NOTIFY_OK; -} - -static struct notifier_block twd_cpufreq_nb = { - .notifier_call = twd_cpufreq_transition, -}; - -static int twd_cpufreq_init(void) -{ - if (twd_evt && raw_cpu_ptr(twd_evt) && !IS_ERR(twd_clk)) - return cpufreq_register_notifier(&twd_cpufreq_nb, - CPUFREQ_TRANSITION_NOTIFIER); - - return 0; -} -core_initcall(twd_cpufreq_init); - -#endif - static void twd_calibrate_rate(void) { unsigned long count; @@ -366,21 +316,6 @@ out_free: return err; } -int __init twd_local_timer_register(struct twd_local_timer *tlt) -{ - if (twd_base || twd_evt) - return -EBUSY; - - twd_ppi = tlt->res[1].start; - - twd_base = ioremap(tlt->res[0].start, resource_size(&tlt->res[0])); - if (!twd_base) - return -ENOMEM; - - return twd_local_timer_common_register(NULL); -} - -#ifdef CONFIG_OF static int __init twd_local_timer_of_register(struct device_node *np) { int err; @@ -406,4 +341,3 @@ out: TIMER_OF_DECLARE(arm_twd_a9, "arm,cortex-a9-twd-timer", twd_local_timer_of_register); TIMER_OF_DECLARE(arm_twd_a5, "arm,cortex-a5-twd-timer", twd_local_timer_of_register); TIMER_OF_DECLARE(arm_twd_11mp, "arm,arm11mp-twd-timer", twd_local_timer_of_register); -#endif From 58ca33824ff850bee93c850830a7b180486f3371 Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Wed, 16 Jan 2019 13:31:41 +0100 Subject: [PATCH 041/855] ARM: 8823/1: Implement pgprot_device() This is used when mmapping the PCI resource* files in sys. Because ARM currently lacks an implementation of pgprot_device(), it falls back to pgprot_uncached() (Strongly Ordered), but we should be able to use Device memory instead. Doing this speeds up large writes to the resource files by about 40% on one of my systems. It also ensures that mmaps on these resources use the same memory type as ioremap(). Signed-off-by: Vincent Whitchurch Signed-off-by: Russell King --- arch/arm/include/asm/pgtable.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index a757401129f9..48ce1b19069b 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -125,6 +125,9 @@ extern pgprot_t pgprot_s2_device; #define pgprot_stronglyordered(prot) \ __pgprot_modify(prot, L_PTE_MT_MASK, L_PTE_MT_UNCACHED) +#define pgprot_device(prot) \ + __pgprot_modify(prot, L_PTE_MT_MASK, L_PTE_MT_DEV_SHARED | L_PTE_SHARED | L_PTE_DIRTY | L_PTE_XN) + #ifdef CONFIG_ARM_DMA_MEM_BUFFERABLE #define pgprot_dmacoherent(prot) \ __pgprot_modify(prot, L_PTE_MT_MASK, L_PTE_MT_BUFFERABLE | L_PTE_XN) From 071d184a19f673bbb69a593c2ff4bd7acc0d1fe6 Mon Sep 17 00:00:00 2001 From: Doug Berger Date: Tue, 22 Jan 2019 21:05:10 +0100 Subject: [PATCH 042/855] ARM: 8826/1: mm: initialize pfn limits with find_limits() The max_low_pfn value must be set before sparse_init() is called to keep the early memblock allocations and frees balanced for kmemleak initialization when sparsemem is enabled. This commit accomplishes that by replacing the local variables min, max_low, and max_high with the global limit variables min_low_pfn, max_low_pfn, and max_pfn respectively in bootmem_init(). The global variables are initialized directly by find_limits() and used in the remainder of the function. 
Fixes: 9099daed9c69 ("mm: kmemleak: avoid using __va() on addresses that don't have a lowmem mapping") Cc: Catalin Marinas Acked-by: Mike Rapoport Signed-off-by: Doug Berger Signed-off-by: Russell King --- arch/arm/mm/init.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 57962080e47a..a6b0805b7977 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -278,15 +278,12 @@ void __init arm_memblock_init(const struct machine_desc *mdesc) void __init bootmem_init(void) { - unsigned long min, max_low, max_high; - memblock_allow_resize(); - max_low = max_high = 0; - find_limits(&min, &max_low, &max_high); + find_limits(&min_low_pfn, &max_low_pfn, &max_pfn); - early_memtest((phys_addr_t)min << PAGE_SHIFT, - (phys_addr_t)max_low << PAGE_SHIFT); + early_memtest((phys_addr_t)min_low_pfn << PAGE_SHIFT, + (phys_addr_t)max_low_pfn << PAGE_SHIFT); /* * Sparsemem tries to allocate bootmem in memory_present(), @@ -304,16 +301,7 @@ void __init bootmem_init(void) * the sparse mem_map arrays initialized by sparse_init() * for memmap_init_zone(), otherwise all PFNs are invalid. */ - zone_sizes_init(min, max_low, max_high); - - /* - * This doesn't seem to be used by the Linux memory manager any - * more, but is used by ll_rw_block. If we can get rid of it, we - * also get rid of some of the stuff above as well. - */ - min_low_pfn = min; - max_low_pfn = max_low; - max_pfn = max_high; + zone_sizes_init(min_low_pfn, max_low_pfn, max_pfn); } /* From baf2df8e15be22b8bd24bdd6fd4575b6641bcfd1 Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Thu, 24 Jan 2019 21:41:59 +0100 Subject: [PATCH 043/855] ARM: 8827/1: fix argument count to match macro definition The macro str8w takes 10 arguments, abort being the 10th. In this particular instantiation the abort argument is passed as 11th argument leading to an error when using LLVM's integrated assembler: :46:47: error: too many positional arguments str8w r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f ^ arch/arm/lib/copy_template.S:277:5: note: while in macro instantiation 18: forward_copy_shift pull=24 push=8 ^ The argument is not used in the macro hence this does not change code generation. Signed-off-by: Stefan Agner Reviewed-by: Nicolas Pitre Signed-off-by: Russell King --- arch/arm/lib/copy_template.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S index 652e4d98cd47..2d54491b0e22 100644 --- a/arch/arm/lib/copy_template.S +++ b/arch/arm/lib/copy_template.S @@ -241,7 +241,7 @@ orr r9, r9, ip, lspush #\push mov ip, ip, lspull #\pull orr ip, ip, lr, lspush #\push - str8w r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f + str8w r0, r3, r4, r5, r6, r7, r8, r9, ip, abort=19f bge 12b PLD( cmn r2, #96 ) PLD( bge 13b ) From 32fdb046ac43aa884d960165072ca37b26d78543 Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Thu, 24 Jan 2019 21:42:54 +0100 Subject: [PATCH 044/855] ARM: 8828/1: uaccess: use unified assembler language syntax Convert the conditional infix to a postfix to make sure this inline assembly is unified syntax. Since gcc assumes non-unified syntax when emitting ARM instructions, make sure to define the syntax as unified. This allows to use LLVM's integrated assembler. 
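To show the two spellings side by side, the affected sequence is rewritten below as a standalone inline helper (illustrative only; it builds only for 32-bit ARM, and the helper name and the (void) cast are additions that do not appear in the kernel macro):

  /*
   * Divided (pre-UAL) syntax, condition code before the S flag:
   *         adds %1, %2, %3; sbcccs %1, %1, %0; movcc %0, #0
   * Unified (UAL) syntax, S flag before the condition code, which is the
   * form LLVM's integrated assembler accepts:
   */
  static inline unsigned long range_ok(unsigned long addr, unsigned long size,
                                       unsigned long limit)
  {
          unsigned long flag, roksum;

          asm(".syntax unified\n"
              "adds %1, %2, %3; sbcscc %1, %1, %0; movcc %0, #0"
              : "=&r" (flag), "=&r" (roksum)
              : "r" (addr), "Ir" (size), "0" (limit)
              : "cc");
          (void)roksum;           /* scratch output only */
          return flag;            /* zero when the range fits within the limit */
  }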
Signed-off-by: Stefan Agner Signed-off-by: Russell King --- arch/arm/include/asm/uaccess.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h index 42aa4a22803c..89a28680934b 100644 --- a/arch/arm/include/asm/uaccess.h +++ b/arch/arm/include/asm/uaccess.h @@ -86,7 +86,8 @@ static inline void set_fs(mm_segment_t fs) #define __range_ok(addr, size) ({ \ unsigned long flag, roksum; \ __chk_user_ptr(addr); \ - __asm__("adds %1, %2, %3; sbcccs %1, %1, %0; movcc %0, #0" \ + __asm__(".syntax unified\n" \ + "adds %1, %2, %3; sbcscc %1, %1, %0; movcc %0, #0" \ : "=&r" (flag), "=&r" (roksum) \ : "r" (addr), "Ir" (size), "0" (current_thread_info()->addr_limit) \ : "cc"); \ From eb7ff9023e4f2998d527b37bffe794759800332a Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Thu, 24 Jan 2019 21:43:40 +0100 Subject: [PATCH 045/855] ARM: 8829/1: spinlock: use unified assembler language syntax Convert the conditional infix to a postfix to make sure this inline assembly is unified syntax. Since gcc assumes non-unified syntax when emitting ARM instructions, make sure to define the syntax as unified. This allows to use LLVM's integrated assembler. Signed-off-by: Stefan Agner Signed-off-by: Russell King --- arch/arm/include/asm/spinlock.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/spinlock.h b/arch/arm/include/asm/spinlock.h index 099c78fcf62d..8f009e788ad4 100644 --- a/arch/arm/include/asm/spinlock.h +++ b/arch/arm/include/asm/spinlock.h @@ -210,11 +210,12 @@ static inline void arch_read_lock(arch_rwlock_t *rw) prefetchw(&rw->lock); __asm__ __volatile__( +" .syntax unified\n" "1: ldrex %0, [%2]\n" " adds %0, %0, #1\n" " strexpl %1, %0, [%2]\n" WFE("mi") -" rsbpls %0, %1, #0\n" +" rsbspl %0, %1, #0\n" " bmi 1b" : "=&r" (tmp), "=&r" (tmp2) : "r" (&rw->lock) From 72cd4064fccaae15ab84d40d4be23667402df4ed Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Fri, 25 Jan 2019 15:18:37 +0100 Subject: [PATCH 046/855] ARM: 8830/1: NOMMU: Toggle only bits in EXC_RETURN we are really care of ARMv8M introduces support for Security extension to M class, among other things it affects exception handling, especially, encoding of EXC_RETURN. The new bits have been added: Bit [6] Secure or Non-secure stack Bit [5] Default callee register stacking Bit [0] Exception Secure which conflicts with hard-coded value of EXC_RETURN: In fact, we only care of few bits: Bit [3] Mode (0 - Handler, 1 - Thread) Bit [2] Stack pointer selection (0 - Main, 1 - Process) We can toggle only those bits and left other bits as they were on exception entry. It is basically, what patch does - saves EXC_RETURN when we do transition form Thread to Handler mode (it is first svc), so later saved value is used instead of EXC_RET_THREADMODE_PROCESSSTACK. Signed-off-by: Vladimir Murzin Signed-off-by: Russell King --- arch/arm/include/asm/v7m.h | 2 +- arch/arm/kernel/entry-header.S | 3 ++- arch/arm/kernel/entry-v7m.S | 4 ++++ arch/arm/mm/proc-v7m.S | 3 +++ 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/arm/include/asm/v7m.h b/arch/arm/include/asm/v7m.h index 187ccf6496ad..2cb00d15831b 100644 --- a/arch/arm/include/asm/v7m.h +++ b/arch/arm/include/asm/v7m.h @@ -49,7 +49,7 @@ * (0 -> msp; 1 -> psp). Bits [1:0] are fixed to 0b01. 
*/ #define EXC_RET_STACK_MASK 0x00000004 -#define EXC_RET_THREADMODE_PROCESSSTACK 0xfffffffd +#define EXC_RET_THREADMODE_PROCESSSTACK (3 << 2) /* Cache related definitions */ diff --git a/arch/arm/kernel/entry-header.S b/arch/arm/kernel/entry-header.S index 773424843d6e..62db1c9746cb 100644 --- a/arch/arm/kernel/entry-header.S +++ b/arch/arm/kernel/entry-header.S @@ -127,7 +127,8 @@ */ .macro v7m_exception_slow_exit ret_r0 cpsid i - ldr lr, =EXC_RET_THREADMODE_PROCESSSTACK + ldr lr, =exc_ret + ldr lr, [lr] @ read original r12, sp, lr, pc and xPSR add r12, sp, #S_IP diff --git a/arch/arm/kernel/entry-v7m.S b/arch/arm/kernel/entry-v7m.S index abcf47848525..19d2dcd6530d 100644 --- a/arch/arm/kernel/entry-v7m.S +++ b/arch/arm/kernel/entry-v7m.S @@ -146,3 +146,7 @@ ENTRY(vector_table) .rept CONFIG_CPU_V7M_NUM_IRQ .long __irq_entry @ External Interrupts .endr + .align 2 + .globl exc_ret +exc_ret: + .space 4 diff --git a/arch/arm/mm/proc-v7m.S b/arch/arm/mm/proc-v7m.S index 47a5acc64433..92e84181933a 100644 --- a/arch/arm/mm/proc-v7m.S +++ b/arch/arm/mm/proc-v7m.S @@ -139,6 +139,9 @@ __v7m_setup_cont: cpsie i svc #0 1: cpsid i + ldr r0, =exc_ret + orr lr, lr, #EXC_RET_THREADMODE_PROCESSSTACK + str lr, [r0] ldmia sp, {r0-r3, r12} str r5, [r12, #11 * 4] @ restore the original SVC vector entry mov lr, r6 @ restore LR From 49f30235061bf5c5c201c5097061e34510bf110a Mon Sep 17 00:00:00 2001 From: Peng Hao Date: Fri, 25 Jan 2019 15:26:32 +0100 Subject: [PATCH 047/855] ARM: 8831/1: NOMMU: pmsa-v8: remove unneeded semicolon Remove unneeded semicolon. [vladimir] proper tags in subject line Signed-off-by: Peng Hao Acked-by: Vladimir Murzin Signed-off-by: Vladimir Murzin Signed-off-by: Russell King --- arch/arm/mm/pmsa-v8.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/mm/pmsa-v8.c b/arch/arm/mm/pmsa-v8.c index 617a83def88a..0d7d5fb59247 100644 --- a/arch/arm/mm/pmsa-v8.c +++ b/arch/arm/mm/pmsa-v8.c @@ -165,7 +165,7 @@ static int __init pmsav8_setup_ram(unsigned int number, phys_addr_t start,phys_a return -EINVAL; bar = start; - lar = (end - 1) & ~(PMSAv8_MINALIGN - 1);; + lar = (end - 1) & ~(PMSAv8_MINALIGN - 1); bar |= PMSAv8_AP_PL1RW_PL0RW | PMSAv8_RGN_SHARED; lar |= PMSAv8_LAR_IDX(PMSAv8_RGN_NORMAL) | PMSAv8_LAR_EN; @@ -181,7 +181,7 @@ static int __init pmsav8_setup_io(unsigned int number, phys_addr_t start,phys_ad return -EINVAL; bar = start; - lar = (end - 1) & ~(PMSAv8_MINALIGN - 1);; + lar = (end - 1) & ~(PMSAv8_MINALIGN - 1); bar |= PMSAv8_AP_PL1RW_PL0RW | PMSAv8_RGN_SHARED | PMSAv8_BAR_XN; lar |= PMSAv8_LAR_IDX(PMSAv8_RGN_DEVICE_nGnRnE) | PMSAv8_LAR_EN; From 49e30bd07df512cdc42e63c14235e6239e92d749 Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Thu, 31 Jan 2019 13:50:29 +0100 Subject: [PATCH 048/855] ARM: 8832/1: NOMMU: Limit visibility for CONFIG_FLASH_{MEM_BASE,SIZE} It looks like usage of CONFIG_FLASH_{MEM_BASE,SIZE} is limited with: arch/arm/mm/proc-arm740.S arch/arm/mm/proc-arm940.S arch/arm/mm/proc-arm946.S So it might look confusing to see the option for anything except these. 
Signed-off-by: Vladimir Murzin Acked-by: Arnd Bergmann Signed-off-by: Russell King --- arch/arm/Kconfig-nommu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/Kconfig-nommu b/arch/arm/Kconfig-nommu index 1168a03c8525..36c80d3dd93f 100644 --- a/arch/arm/Kconfig-nommu +++ b/arch/arm/Kconfig-nommu @@ -20,10 +20,12 @@ config DRAM_SIZE config FLASH_MEM_BASE hex 'FLASH Base Address' if SET_MEM_PARAM + depends on CPU_ARM740T || CPU_ARM946E || CPU_ARM940T default 0x00400000 config FLASH_SIZE hex 'FLASH Size' if SET_MEM_PARAM + depends on CPU_ARM740T || CPU_ARM946E || CPU_ARM940T default 0x00400000 config PROCESSOR_ID From d0e22329e1a51548bdeeae66bc35fe2dc1cabdf6 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 13 Dec 2018 12:54:26 +0000 Subject: [PATCH 049/855] ARM: qcom: remove unnecessary boot_lock The boot_lock is something that was required for ARM development platforms to ensure that the delay calibration worked properly. This is not necessary for modern platforms that have better bus bandwidth and do not need to calibrate the delay loop for secondary cores. Remove the boot_lock entirely. Signed-off-by: Russell King --- arch/arm/mach-qcom/platsmp.c | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/arch/arm/mach-qcom/platsmp.c b/arch/arm/mach-qcom/platsmp.c index 5494c9e0c909..99a6a5e809e0 100644 --- a/arch/arm/mach-qcom/platsmp.c +++ b/arch/arm/mach-qcom/platsmp.c @@ -46,8 +46,6 @@ extern void secondary_startup_arm(void); -static DEFINE_SPINLOCK(boot_lock); - #ifdef CONFIG_HOTPLUG_CPU static void qcom_cpu_die(unsigned int cpu) { @@ -55,15 +53,6 @@ static void qcom_cpu_die(unsigned int cpu) } #endif -static void qcom_secondary_init(unsigned int cpu) -{ - /* - * Synchronise with the boot thread. - */ - spin_lock(&boot_lock); - spin_unlock(&boot_lock); -} - static int scss_release_secondary(unsigned int cpu) { struct device_node *node; @@ -280,12 +269,6 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int)) per_cpu(cold_boot_done, cpu) = true; } - /* - * set synchronisation state between this boot processor - * and the secondary one - */ - spin_lock(&boot_lock); - /* * Send the secondary CPU a soft interrupt, thereby causing * the boot monitor to read the system wide flags register, @@ -293,12 +276,6 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int)) */ arch_send_wakeup_ipi_mask(cpumask_of(cpu)); - /* - * now the secondary core is starting up let it run its - * calibrations, then wait for it to finish - */ - spin_unlock(&boot_lock); - return ret; } @@ -334,7 +311,6 @@ static void __init qcom_smp_prepare_cpus(unsigned int max_cpus) static const struct smp_operations smp_msm8660_ops __initconst = { .smp_prepare_cpus = qcom_smp_prepare_cpus, - .smp_secondary_init = qcom_secondary_init, .smp_boot_secondary = msm8660_boot_secondary, #ifdef CONFIG_HOTPLUG_CPU .cpu_die = qcom_cpu_die, @@ -344,7 +320,6 @@ CPU_METHOD_OF_DECLARE(qcom_smp, "qcom,gcc-msm8660", &smp_msm8660_ops); static const struct smp_operations qcom_smp_kpssv1_ops __initconst = { .smp_prepare_cpus = qcom_smp_prepare_cpus, - .smp_secondary_init = qcom_secondary_init, .smp_boot_secondary = kpssv1_boot_secondary, #ifdef CONFIG_HOTPLUG_CPU .cpu_die = qcom_cpu_die, @@ -354,7 +329,6 @@ CPU_METHOD_OF_DECLARE(qcom_smp_kpssv1, "qcom,kpss-acc-v1", &qcom_smp_kpssv1_ops) static const struct smp_operations qcom_smp_kpssv2_ops __initconst = { .smp_prepare_cpus = qcom_smp_prepare_cpus, - .smp_secondary_init = qcom_secondary_init, .smp_boot_secondary = 
kpssv2_boot_secondary, #ifdef CONFIG_HOTPLUG_CPU .cpu_die = qcom_cpu_die, From 0eb037998afe5514c8534276c152da31d2fabf07 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 13 Dec 2018 12:54:26 +0000 Subject: [PATCH 050/855] ARM: oxnas: remove CPU hotplug implementation The CPU hotplug implementation on this platform is cargo-culted from the plat-versatile implementation, and is buggy. Once a CPU hits the "low power" loop, it will wait for pen_release to be set to the CPU number to wake up again - but nothing in this implementation does that. So, once a CPU has entered cpu_die() it will never, ever leave. Remove this useless cargo-culted implementation. Signed-off-by: Russell King --- arch/arm/mach-oxnas/Makefile | 1 - arch/arm/mach-oxnas/hotplug.c | 109 ---------------------------------- arch/arm/mach-oxnas/platsmp.c | 4 -- 3 files changed, 114 deletions(-) delete mode 100644 arch/arm/mach-oxnas/hotplug.c diff --git a/arch/arm/mach-oxnas/Makefile b/arch/arm/mach-oxnas/Makefile index b625906a9970..61a34e1c0f22 100644 --- a/arch/arm/mach-oxnas/Makefile +++ b/arch/arm/mach-oxnas/Makefile @@ -1,2 +1 @@ obj-$(CONFIG_SMP) += platsmp.o headsmp.o -obj-$(CONFIG_HOTPLUG_CPU) += hotplug.o diff --git a/arch/arm/mach-oxnas/hotplug.c b/arch/arm/mach-oxnas/hotplug.c deleted file mode 100644 index 854f29b8cba6..000000000000 --- a/arch/arm/mach-oxnas/hotplug.c +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright (C) 2002 ARM Ltd. - * All Rights Reserved - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#include -#include -#include - -#include -#include - -static inline void cpu_enter_lowpower(void) -{ - unsigned int v; - - asm volatile( - " mcr p15, 0, %1, c7, c5, 0\n" - " mcr p15, 0, %1, c7, c10, 4\n" - /* - * Turn off coherency - */ - " mrc p15, 0, %0, c1, c0, 1\n" - " bic %0, %0, #0x20\n" - " mcr p15, 0, %0, c1, c0, 1\n" - " mrc p15, 0, %0, c1, c0, 0\n" - " bic %0, %0, %2\n" - " mcr p15, 0, %0, c1, c0, 0\n" - : "=&r" (v) - : "r" (0), "Ir" (CR_C) - : "cc"); -} - -static inline void cpu_leave_lowpower(void) -{ - unsigned int v; - - asm volatile( "mrc p15, 0, %0, c1, c0, 0\n" - " orr %0, %0, %1\n" - " mcr p15, 0, %0, c1, c0, 0\n" - " mrc p15, 0, %0, c1, c0, 1\n" - " orr %0, %0, #0x20\n" - " mcr p15, 0, %0, c1, c0, 1\n" - : "=&r" (v) - : "Ir" (CR_C) - : "cc"); -} - -static inline void platform_do_lowpower(unsigned int cpu, int *spurious) -{ - /* - * there is no power-control hardware on this platform, so all - * we can do is put the core into WFI; this is safe as the calling - * code will have already disabled interrupts - */ - for (;;) { - /* - * here's the WFI - */ - asm(".word 0xe320f003\n" - : - : - : "memory", "cc"); - - if (pen_release == cpu_logical_map(cpu)) { - /* - * OK, proper wakeup, we're done - */ - break; - } - - /* - * Getting here, means that we have come out of WFI without - * having been woken up - this shouldn't happen - * - * Just note it happening - when we're woken, we can report - * its occurrence. 
- */ - (*spurious)++; - } -} - -/* - * platform-specific code to shutdown a CPU - * - * Called with IRQs disabled - */ -void ox820_cpu_die(unsigned int cpu) -{ - int spurious = 0; - - /* - * we're ready for shutdown now, so do it - */ - cpu_enter_lowpower(); - platform_do_lowpower(cpu, &spurious); - - /* - * bring this CPU back into the world of cache - * coherency, and then restore interrupts - */ - cpu_leave_lowpower(); - - if (spurious) - pr_warn("CPU%u: %u spurious wakeup calls\n", cpu, spurious); -} diff --git a/arch/arm/mach-oxnas/platsmp.c b/arch/arm/mach-oxnas/platsmp.c index 442cc8a2f7dc..735141c0e3a3 100644 --- a/arch/arm/mach-oxnas/platsmp.c +++ b/arch/arm/mach-oxnas/platsmp.c @@ -19,7 +19,6 @@ #include extern void ox820_secondary_startup(void); -extern void ox820_cpu_die(unsigned int cpu); static void __iomem *cpu_ctrl; static void __iomem *gic_cpu_ctrl; @@ -94,9 +93,6 @@ unmap_scu: static const struct smp_operations ox820_smp_ops __initconst = { .smp_prepare_cpus = ox820_smp_prepare_cpus, .smp_boot_secondary = ox820_boot_secondary, -#ifdef CONFIG_HOTPLUG_CPU - .cpu_die = ox820_cpu_die, -#endif }; CPU_METHOD_OF_DECLARE(ox820_smp, "oxsemi,ox820-smp", &ox820_smp_ops); From 70678554c4c40f2cf8de5e7cd53fdd2c73387e51 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 13 Dec 2018 12:54:26 +0000 Subject: [PATCH 051/855] ARM: actions: remove boot_lock and pen_release The actions SMP implementation has several issues: 1. pen_release is only ever read and compared to -1, and is defined in arch/arm/kernel/smp.c to be -1. This test will always succeed. 2. we are already guaranteed to be single threaded while bringing up a CPU, so the spinlock makes no sense, remove it. 3. owl_secondary_startup() is not referenced nor defined, the prototype is redundant, remove it. Signed-off-by: Russell King --- arch/arm/mach-actions/platsmp.c | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/arch/arm/mach-actions/platsmp.c b/arch/arm/mach-actions/platsmp.c index 3efaa10efc43..4fd479c948e6 100644 --- a/arch/arm/mach-actions/platsmp.c +++ b/arch/arm/mach-actions/platsmp.c @@ -39,10 +39,6 @@ static void __iomem *sps_base_addr; static void __iomem *timer_base_addr; static int ncores; -static DEFINE_SPINLOCK(boot_lock); - -void owl_secondary_startup(void); - static int s500_wakeup_secondary(unsigned int cpu) { int ret; @@ -84,7 +80,6 @@ static int s500_wakeup_secondary(unsigned int cpu) static int s500_smp_boot_secondary(unsigned int cpu, struct task_struct *idle) { - unsigned long timeout; int ret; ret = s500_wakeup_secondary(cpu); @@ -93,21 +88,11 @@ static int s500_smp_boot_secondary(unsigned int cpu, struct task_struct *idle) udelay(10); - spin_lock(&boot_lock); - smp_send_reschedule(cpu); - timeout = jiffies + (1 * HZ); - while (time_before(jiffies, timeout)) { - if (pen_release == -1) - break; - } - writel(0, timer_base_addr + OWL_CPU1_ADDR + (cpu - 1) * 4); writel(0, timer_base_addr + OWL_CPU1_FLAG + (cpu - 1) * 4); - spin_unlock(&boot_lock); - return 0; } From 6213f70e7c10fd4a01b65bad3826648fc78df8a8 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 13 Dec 2018 14:02:48 +0000 Subject: [PATCH 052/855] ARM: smp: remove arch-provided "pen_release" Consolidating the "pen_release" stuff amongst the various SoC implementations gives credence to having a CPU holding pen for secondary CPUs. However, this is far from the truth. 
Many SoC implementations cargo-cult copied various bits of the pen release implementation from the initial Realview/Versatile Express implementation without understanding what it was or why it existed. The reason it existed is because these are _development_ platforms, and some board firmware is unable to individually control the startup of secondary CPUs. Moreover, they do not have a way to power down or reset secondary CPUs for hot-unplug. Hence, the pen_release implementation was designed for ARM Ltd's development platforms to provide a working implementation, even though it is very far from what is required. It was decided a while back to reduce the duplication by consolidating the "pen_release" variable, but this only made the situation worse - we have ended up with several implementations that read this variable but do not write it - again, showing the cargo-cult mentality at work, lack of proper review of new code, and in some cases a lack of testing. While it would be preferable to remove pen_release entirely from the kernel, this is not possible without help from the SoC maintainers, which seems to be lacking. However, I want to remove pen_release from arch code to remove the credence that having it gives. This patch removes pen_release from the arch code entirely, adding private per-SoC definitions for it instead, and explicitly stating that write_pen_release() is cargo-cult copied and should not be copied any further. Rename write_pen_release() in a similar fashion as well. Signed-off-by: Russell King --- arch/arm/include/asm/smp.h | 1 - arch/arm/kernel/smp.c | 6 ------ arch/arm/mach-exynos/headsmp.S | 2 +- arch/arm/mach-exynos/platsmp.c | 31 ++++++++++++++++++------------- arch/arm/mach-prima2/common.h | 2 ++ arch/arm/mach-prima2/headsmp.S | 2 +- arch/arm/mach-prima2/hotplug.c | 3 ++- arch/arm/mach-prima2/platsmp.c | 17 ++++++++++------- arch/arm/mach-spear/generic.h | 2 ++ arch/arm/mach-spear/headsmp.S | 2 +- arch/arm/mach-spear/hotplug.c | 4 +++- arch/arm/mach-spear/platsmp.c | 27 ++++++++++++++++----------- 12 files changed, 56 insertions(+), 43 deletions(-) diff --git a/arch/arm/include/asm/smp.h b/arch/arm/include/asm/smp.h index 709a55989cb0..451ae684aaf4 100644 --- a/arch/arm/include/asm/smp.h +++ b/arch/arm/include/asm/smp.h @@ -67,7 +67,6 @@ struct secondary_data { void *stack; }; extern struct secondary_data secondary_data; -extern volatile int pen_release; extern void secondary_startup(void); extern void secondary_startup_arm(void); diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 3bf82232b1be..051b0e8e7d30 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -62,12 +62,6 @@ */ struct secondary_data secondary_data; -/* - * control for which core is the next to come out of the secondary - * boot "holding pen" - */ -volatile int pen_release = -1; - enum ipi_msg_type { IPI_WAKEUP, IPI_TIMER, diff --git a/arch/arm/mach-exynos/headsmp.S b/arch/arm/mach-exynos/headsmp.S index 005695c9bf40..0ac2cb9a7355 100644 --- a/arch/arm/mach-exynos/headsmp.S +++ b/arch/arm/mach-exynos/headsmp.S @@ -36,4 +36,4 @@ ENDPROC(exynos4_secondary_startup) .align 2 1: .long . 
- .long pen_release + .long exynos_pen_release diff --git a/arch/arm/mach-exynos/platsmp.c b/arch/arm/mach-exynos/platsmp.c index c39ffd2e2fe6..3795412c0189 100644 --- a/arch/arm/mach-exynos/platsmp.c +++ b/arch/arm/mach-exynos/platsmp.c @@ -28,6 +28,9 @@ extern void exynos4_secondary_startup(void); +/* XXX exynos_pen_release is cargo culted code - DO NOT COPY XXX */ +volatile int exynos_pen_release = -1; + #ifdef CONFIG_HOTPLUG_CPU static inline void cpu_leave_lowpower(u32 core_id) { @@ -57,7 +60,7 @@ static inline void platform_do_lowpower(unsigned int cpu, int *spurious) wfi(); - if (pen_release == core_id) { + if (exynos_pen_release == core_id) { /* * OK, proper wakeup, we're done */ @@ -228,15 +231,17 @@ void exynos_core_restart(u32 core_id) } /* - * Write pen_release in a way that is guaranteed to be visible to all - * observers, irrespective of whether they're taking part in coherency + * XXX CARGO CULTED CODE - DO NOT COPY XXX + * + * Write exynos_pen_release in a way that is guaranteed to be visible to + * all observers, irrespective of whether they're taking part in coherency * or not. This is necessary for the hotplug code to work reliably. */ -static void write_pen_release(int val) +static void exynos_write_pen_release(int val) { - pen_release = val; + exynos_pen_release = val; smp_wmb(); - sync_cache_w(&pen_release); + sync_cache_w(&exynos_pen_release); } static DEFINE_SPINLOCK(boot_lock); @@ -247,7 +252,7 @@ static void exynos_secondary_init(unsigned int cpu) * let the primary processor know we're out of the * pen, then head off into the C entry point */ - write_pen_release(-1); + exynos_write_pen_release(-1); /* * Synchronise with the boot thread. @@ -322,12 +327,12 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle) /* * The secondary processor is waiting to be released from * the holding pen - release it, then wait for it to flag - * that it has been released by resetting pen_release. + * that it has been released by resetting exynos_pen_release. * - * Note that "pen_release" is the hardware CPU core ID, whereas + * Note that "exynos_pen_release" is the hardware CPU core ID, whereas * "cpu" is Linux's internal ID. */ - write_pen_release(core_id); + exynos_write_pen_release(core_id); if (!exynos_cpu_power_state(core_id)) { exynos_cpu_power_up(core_id); @@ -376,13 +381,13 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle) else arch_send_wakeup_ipi_mask(cpumask_of(cpu)); - if (pen_release == -1) + if (exynos_pen_release == -1) break; udelay(10); } - if (pen_release != -1) + if (exynos_pen_release != -1) ret = -ETIMEDOUT; /* @@ -392,7 +397,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle) fail: spin_unlock(&boot_lock); - return pen_release != -1 ? ret : 0; + return exynos_pen_release != -1 ? 
ret : 0; } static void __init exynos_smp_prepare_cpus(unsigned int max_cpus) diff --git a/arch/arm/mach-prima2/common.h b/arch/arm/mach-prima2/common.h index 6d77b622d168..457eb7b18160 100644 --- a/arch/arm/mach-prima2/common.h +++ b/arch/arm/mach-prima2/common.h @@ -15,6 +15,8 @@ #include #include +extern volatile int prima2_pen_release; + extern const struct smp_operations sirfsoc_smp_ops; extern void sirfsoc_secondary_startup(void); extern void sirfsoc_cpu_die(unsigned int cpu); diff --git a/arch/arm/mach-prima2/headsmp.S b/arch/arm/mach-prima2/headsmp.S index 209d9fc5c16c..6cf4fc60347b 100644 --- a/arch/arm/mach-prima2/headsmp.S +++ b/arch/arm/mach-prima2/headsmp.S @@ -34,4 +34,4 @@ ENDPROC(sirfsoc_secondary_startup) .align 1: .long . - .long pen_release + .long prima2_pen_release diff --git a/arch/arm/mach-prima2/hotplug.c b/arch/arm/mach-prima2/hotplug.c index a728c78b996f..b6cf1527e330 100644 --- a/arch/arm/mach-prima2/hotplug.c +++ b/arch/arm/mach-prima2/hotplug.c @@ -11,6 +11,7 @@ #include #include +#include "common.h" static inline void platform_do_lowpower(unsigned int cpu) { @@ -18,7 +19,7 @@ static inline void platform_do_lowpower(unsigned int cpu) for (;;) { __asm__ __volatile__("dsb\n\t" "wfi\n\t" : : : "memory"); - if (pen_release == cpu_logical_map(cpu)) { + if (prima2_pen_release == cpu_logical_map(cpu)) { /* * OK, proper wakeup, we're done */ diff --git a/arch/arm/mach-prima2/platsmp.c b/arch/arm/mach-prima2/platsmp.c index 75ef5d4be554..d1f8b5168083 100644 --- a/arch/arm/mach-prima2/platsmp.c +++ b/arch/arm/mach-prima2/platsmp.c @@ -24,13 +24,16 @@ static void __iomem *clk_base; static DEFINE_SPINLOCK(boot_lock); +/* XXX prima2_pen_release is cargo culted code - DO NOT COPY XXX */ +volatile int prima2_pen_release = -1; + static void sirfsoc_secondary_init(unsigned int cpu) { /* * let the primary processor know we're out of the * pen, then head off into the C entry point */ - pen_release = -1; + prima2_pen_release = -1; smp_wmb(); /* @@ -80,13 +83,13 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle) /* * The secondary processor is waiting to be released from * the holding pen - release it, then wait for it to flag - * that it has been released by resetting pen_release. + * that it has been released by resetting prima2_pen_release. * - * Note that "pen_release" is the hardware CPU ID, whereas + * Note that "prima2_pen_release" is the hardware CPU ID, whereas * "cpu" is Linux's internal ID. */ - pen_release = cpu_logical_map(cpu); - sync_cache_w(&pen_release); + prima2_pen_release = cpu_logical_map(cpu); + sync_cache_w(&prima2_pen_release); /* * Send the secondary CPU SEV, thereby causing the boot monitor to read @@ -97,7 +100,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle) timeout = jiffies + (1 * HZ); while (time_before(jiffies, timeout)) { smp_rmb(); - if (pen_release == -1) + if (prima2_pen_release == -1) break; udelay(10); @@ -109,7 +112,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle) */ spin_unlock(&boot_lock); - return pen_release != -1 ? -ENOSYS : 0; + return prima2_pen_release != -1 ? 
-ENOSYS : 0; } const struct smp_operations sirfsoc_smp_ops __initconst = { diff --git a/arch/arm/mach-spear/generic.h b/arch/arm/mach-spear/generic.h index 909b97c0b237..25b4c5e66e39 100644 --- a/arch/arm/mach-spear/generic.h +++ b/arch/arm/mach-spear/generic.h @@ -20,6 +20,8 @@ #include +extern volatile int spear_pen_release; + extern void spear13xx_timer_init(void); extern void spear3xx_timer_init(void); extern struct pl022_ssp_controller pl022_plat_data; diff --git a/arch/arm/mach-spear/headsmp.S b/arch/arm/mach-spear/headsmp.S index c52192dc3d9f..6e250b6c0aa2 100644 --- a/arch/arm/mach-spear/headsmp.S +++ b/arch/arm/mach-spear/headsmp.S @@ -43,5 +43,5 @@ pen: ldr r7, [r6] .align 1: .long . - .long pen_release + .long spear_pen_release ENDPROC(spear13xx_secondary_startup) diff --git a/arch/arm/mach-spear/hotplug.c b/arch/arm/mach-spear/hotplug.c index 12edd1cf8a12..0dd84f609627 100644 --- a/arch/arm/mach-spear/hotplug.c +++ b/arch/arm/mach-spear/hotplug.c @@ -16,6 +16,8 @@ #include #include +#include "generic.h" + static inline void cpu_enter_lowpower(void) { unsigned int v; @@ -57,7 +59,7 @@ static inline void spear13xx_do_lowpower(unsigned int cpu, int *spurious) for (;;) { wfi(); - if (pen_release == cpu) { + if (spear_pen_release == cpu) { /* * OK, proper wakeup, we're done */ diff --git a/arch/arm/mach-spear/platsmp.c b/arch/arm/mach-spear/platsmp.c index 39038a03836a..b1ff4bb86f6d 100644 --- a/arch/arm/mach-spear/platsmp.c +++ b/arch/arm/mach-spear/platsmp.c @@ -20,16 +20,21 @@ #include #include "generic.h" +/* XXX spear_pen_release is cargo culted code - DO NOT COPY XXX */ +volatile int spear_pen_release = -1; + /* - * Write pen_release in a way that is guaranteed to be visible to all - * observers, irrespective of whether they're taking part in coherency + * XXX CARGO CULTED CODE - DO NOT COPY XXX + * + * Write spear_pen_release in a way that is guaranteed to be visible to + * all observers, irrespective of whether they're taking part in coherency * or not. This is necessary for the hotplug code to work reliably. */ -static void write_pen_release(int val) +static void spear_write_pen_release(int val) { - pen_release = val; + spear_pen_release = val; smp_wmb(); - sync_cache_w(&pen_release); + sync_cache_w(&spear_pen_release); } static DEFINE_SPINLOCK(boot_lock); @@ -42,7 +47,7 @@ static void spear13xx_secondary_init(unsigned int cpu) * let the primary processor know we're out of the * pen, then head off into the C entry point */ - write_pen_release(-1); + spear_write_pen_release(-1); /* * Synchronise with the boot thread. @@ -64,17 +69,17 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle) /* * The secondary processor is waiting to be released from * the holding pen - release it, then wait for it to flag - * that it has been released by resetting pen_release. + * that it has been released by resetting spear_pen_release. * - * Note that "pen_release" is the hardware CPU ID, whereas + * Note that "spear_pen_release" is the hardware CPU ID, whereas * "cpu" is Linux's internal ID. */ - write_pen_release(cpu); + spear_write_pen_release(cpu); timeout = jiffies + (1 * HZ); while (time_before(jiffies, timeout)) { smp_rmb(); - if (pen_release == -1) + if (spear_pen_release == -1) break; udelay(10); @@ -86,7 +91,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle) */ spin_unlock(&boot_lock); - return pen_release != -1 ? -ENOSYS : 0; + return spear_pen_release != -1 ? 
-ENOSYS : 0; } /* From 5388a5b82199facacd3d7ac0d05aca6e8f902fed Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 10 Apr 2018 11:35:36 +0100 Subject: [PATCH 053/855] ARM: avoid Cortex-A9 livelock on tight dmb loops machine_crash_nonpanic_core() does this: while (1) cpu_relax(); because the kernel has crashed, and we have no known safe way to deal with the CPU. So, we place the CPU into an infinite loop which we expect it to never exit - at least not until the system as a whole is reset by some method. In the absence of erratum 754327, this code assembles to: b . In other words, an infinite loop. When erratum 754327 is enabled, this becomes: 1: dmb b 1b It has been observed on some systems (e.g., OMAP4) that, if a crash is triggered, the system tries to kexec into the panic kernel but fails after taking the secondary CPU down - placing it into one of these loops. This causes the system to livelock, and the most noticeable effect is that the system stops after issuing: Loading crashdump kernel... to the system console. The tested-as-working solution I came up with was to add wfe() to these infinite loops thusly: while (1) { cpu_relax(); wfe(); } which, without 754327, builds to: 1: wfe b 1b or, with 754327 enabled: 1: dmb wfe b 1b Adding "wfe" does two things depending on the environment we're running under: - where we're running on bare metal, and the processor implements "wfe", it stops us spinning endlessly in a loop where we're never going to do any useful work. - if we're running in a VM, it allows the CPU to be given back to the hypervisor and rescheduled for other purposes (maybe a different VM) rather than wasting CPU cycles inside a crashed VM. However, in light of erratum 794072, Will Deacon wanted to see 10 nops as well - which is reasonable to cover the case where we have erratum 754327 enabled _and_ we have a processor that doesn't implement the wfe hint. So, we now end up with: 1: wfe b 1b when erratum 754327 is disabled, or: 1: dmb nop nop nop nop nop nop nop nop nop nop wfe b 1b when erratum 754327 is enabled. We also get the dmb + 10 nop sequence elsewhere in the kernel, in terminating loops. This is reasonable - it means we get the workaround for erratum 794072 when erratum 754327 is enabled, but still relinquish the dead processor - either by placing it in a lower power mode when wfe is implemented as such or by returning it to the hypervisor, or in the case where wfe is a no-op, we use the workaround specified in erratum 794072 to avoid the problem. These are two entirely orthogonal problems - the 10 nops address erratum 794072, and the wfe is an optimisation that makes the system more efficient when crashed, either in terms of power consumption or by allowing the host/other VMs to make use of the CPU. I don't see any reason not to use kexec() inside a VM - it has the potential to provide automated recovery from a failure of the VM's kernel with the opportunity for saving a crashdump of the failure. A panic() with a reboot timeout won't do that, and reading the libvirt documentation, setting on_reboot to "preserve" won't either (the documentation states "The preserve action for an on_reboot event is treated as a destroy".) Surely it has to be a good thing to avoid having CPUs spinning inside a VM that is doing no useful work. 
Acked-by: Will Deacon Signed-off-by: Russell King --- arch/arm/include/asm/barrier.h | 2 ++ arch/arm/include/asm/processor.h | 6 +++++- arch/arm/kernel/machine_kexec.c | 5 ++++- arch/arm/kernel/smp.c | 4 +++- arch/arm/mach-omap2/prm_common.c | 4 +++- 5 files changed, 17 insertions(+), 4 deletions(-) diff --git a/arch/arm/include/asm/barrier.h b/arch/arm/include/asm/barrier.h index 69772e742a0a..83ae97c049d9 100644 --- a/arch/arm/include/asm/barrier.h +++ b/arch/arm/include/asm/barrier.h @@ -11,6 +11,8 @@ #define sev() __asm__ __volatile__ ("sev" : : : "memory") #define wfe() __asm__ __volatile__ ("wfe" : : : "memory") #define wfi() __asm__ __volatile__ ("wfi" : : : "memory") +#else +#define wfe() do { } while (0) #endif #if __LINUX_ARM_ARCH__ >= 7 diff --git a/arch/arm/include/asm/processor.h b/arch/arm/include/asm/processor.h index 120f4c9bbfde..57fe73ea0f72 100644 --- a/arch/arm/include/asm/processor.h +++ b/arch/arm/include/asm/processor.h @@ -89,7 +89,11 @@ extern void release_thread(struct task_struct *); unsigned long get_wchan(struct task_struct *p); #if __LINUX_ARM_ARCH__ == 6 || defined(CONFIG_ARM_ERRATA_754327) -#define cpu_relax() smp_mb() +#define cpu_relax() \ + do { \ + smp_mb(); \ + __asm__ __volatile__("nop; nop; nop; nop; nop; nop; nop; nop; nop; nop;"); \ + } while (0) #else #define cpu_relax() barrier() #endif diff --git a/arch/arm/kernel/machine_kexec.c b/arch/arm/kernel/machine_kexec.c index dd2eb5f76b9f..76300f3813e8 100644 --- a/arch/arm/kernel/machine_kexec.c +++ b/arch/arm/kernel/machine_kexec.c @@ -91,8 +91,11 @@ void machine_crash_nonpanic_core(void *unused) set_cpu_online(smp_processor_id(), false); atomic_dec(&waiting_for_crash_ipi); - while (1) + + while (1) { cpu_relax(); + wfe(); + } } void crash_smp_send_stop(void) diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 3bf82232b1be..7f0b99e1fff3 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -604,8 +604,10 @@ static void ipi_cpu_stop(unsigned int cpu) local_fiq_disable(); local_irq_disable(); - while (1) + while (1) { cpu_relax(); + wfe(); + } } static DEFINE_PER_CPU(struct completion *, cpu_completion); diff --git a/arch/arm/mach-omap2/prm_common.c b/arch/arm/mach-omap2/prm_common.c index 058a37e6d11c..fd6e0671f957 100644 --- a/arch/arm/mach-omap2/prm_common.c +++ b/arch/arm/mach-omap2/prm_common.c @@ -523,8 +523,10 @@ void omap_prm_reset_system(void) prm_ll_data->reset_system(); - while (1) + while (1) { cpu_relax(); + wfe(); + } } /** From 03f2c02d8be76d2080c174cd65d2dfcbbf783708 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 14 Jan 2019 10:42:11 -0800 Subject: [PATCH 054/855] f2fs: run discard jobs when put_super When we umount f2fs, we need to avoid long delay due to discard commands, which is actually taking tens of seconds, if storage is very slow on UNMAP. So, this patch introduces timeout-based work on it. By default, let me give 5 seconds for discard. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 7 +++++++ fs/f2fs/f2fs.h | 5 ++++- fs/f2fs/segment.c | 11 ++++++++++- fs/f2fs/super.c | 4 +++- fs/f2fs/sysfs.c | 3 +++ 5 files changed, 27 insertions(+), 3 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index a7ce33199457..91822ce25831 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -86,6 +86,13 @@ Description: The unit size is one block, now only support configuring in range of [1, 512]. 
+What: /sys/fs/f2fs//umount_discard_timeout +Date: January 2019 +Contact: "Jaegeuk Kim" +Description: + Set timeout to issue discard commands during umount. + Default: 5 secs + What: /sys/fs/f2fs//max_victim_search Date: January 2014 Contact: "Jaegeuk Kim" diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0f564883e078..6b6ec5600089 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -191,6 +191,7 @@ enum { #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ #define DEF_DISABLE_INTERVAL 5 /* 5 secs */ +#define DEF_UMOUNT_DISCARD_TIMEOUT 5 /* 5 secs */ struct cp_control { int reason; @@ -310,6 +311,7 @@ struct discard_policy { bool sync; /* submit discard with REQ_SYNC flag */ bool ordered; /* issue discard by lba order */ unsigned int granularity; /* discard granularity */ + int timeout; /* discard timeout for put_super */ }; struct discard_cmd_control { @@ -1110,6 +1112,7 @@ enum { DISCARD_TIME, GC_TIME, DISABLE_TIME, + UMOUNT_DISCARD_TIMEOUT, MAX_TIME, }; @@ -3006,7 +3009,7 @@ void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi); void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi); -bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); +bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi); void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9b79056d705d..5b2b9be6f28d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1037,6 +1037,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; dpolicy->io_aware_gran = MAX_PLIST_NUM; + dpolicy->timeout = 0; if (discard_type == DPOLICY_BG) { dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; @@ -1424,7 +1425,14 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, int i, issued = 0; bool io_interrupted = false; + if (dpolicy->timeout != 0) + f2fs_update_time(sbi, dpolicy->timeout); + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + if (dpolicy->timeout != 0 && + f2fs_time_over(sbi, dpolicy->timeout)) + break; + if (i + 1 < dpolicy->granularity) break; @@ -1611,7 +1619,7 @@ void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi) } /* This comes from f2fs_put_super */ -bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) +bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_policy dpolicy; @@ -1619,6 +1627,7 @@ bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) __init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); + dpolicy.timeout = UMOUNT_DISCARD_TIMEOUT; __issue_discard_cmd(sbi, &dpolicy); dropped = __drop_discard_cmd(sbi); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ea514acede36..24efd76ca151 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1048,7 +1048,7 @@ static void f2fs_put_super(struct super_block *sb) } /* be sure to wait for any on-going discard commands */ - dropped = f2fs_wait_discard_bios(sbi); + dropped = f2fs_issue_discard_timeout(sbi); if ((f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) && !sbi->discard_blks && !dropped) { @@ -2706,6 +2706,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->interval_time[DISCARD_TIME] = DEF_IDLE_INTERVAL; sbi->interval_time[GC_TIME] = DEF_IDLE_INTERVAL; 
sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_INTERVAL; + sbi->interval_time[UMOUNT_DISCARD_TIMEOUT] = + DEF_UMOUNT_DISCARD_TIMEOUT; clear_sbi_flag(sbi, SBI_NEED_FSCK); for (i = 0; i < NR_COUNT_TYPE; i++) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 5b83e1d66cb3..18c627e8fc90 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -426,6 +426,8 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, discard_idle_interval, interval_time[DISCARD_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle_interval, interval_time[GC_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, + umount_discard_timeout, interval_time[UMOUNT_DISCARD_TIMEOUT]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold); @@ -483,6 +485,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(idle_interval), ATTR_LIST(discard_idle_interval), ATTR_LIST(gc_idle_interval), + ATTR_LIST(umount_discard_timeout), ATTR_LIST(iostat_enable), ATTR_LIST(readdir_ra), ATTR_LIST(gc_pin_file_thresh), From 8d43d57036679a8635952c9ef54989a7b48e8c00 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Mon, 10 Dec 2018 11:15:16 +0100 Subject: [PATCH 055/855] KVM: s390: clarify kvm related kernel message As suggested by our ID dept. here are some kernel message updates. Signed-off-by: Michael Mueller Reviewed-by: Christian Borntraeger Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 7f4bc58a53b9..11b4be3dad15 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -432,7 +432,7 @@ int kvm_arch_init(void *opaque) /* Register floating interrupt controller interface. */ rc = kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC); if (rc) { - pr_err("Failed to register FLIC rc=%d\n", rc); + pr_err("A FLIC registration call failed with rc=%d\n", rc); goto out_debug_unreg; } return 0; @@ -4293,12 +4293,12 @@ static int __init kvm_s390_init(void) int i; if (!sclp.has_sief2) { - pr_info("SIE not available\n"); + pr_info("SIE is not available\n"); return -ENODEV; } if (nested && hpage) { - pr_info("nested (vSIE) and hpage (huge page backing) can currently not be activated concurrently"); + pr_info("A KVM host that supports nesting cannot back its KVM guests with huge pages\n"); return -EINVAL; } From b7d455712927cc8042f97fffba5d51052856ddaf Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 31 Jan 2019 09:52:32 +0100 Subject: [PATCH 056/855] KVM: s390: drop obsolete else path The explicit else path specified in set_intercept_indicators_io is not required as the function returns in case the first branch is taken anyway. 
Signed-off-by: Michael Mueller Reviewed-by: Cornelia Huck Reviewed-by: Halil Pasic Message-Id: <20190131085247.13826-2-mimu@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index fcb55b02990e..19d556512452 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -345,7 +345,7 @@ static void set_intercept_indicators_io(struct kvm_vcpu *vcpu) { if (!(pending_irqs_no_gisa(vcpu) & IRQ_PEND_IO_MASK)) return; - else if (psw_ioint_disabled(vcpu)) + if (psw_ioint_disabled(vcpu)) kvm_s390_set_cpuflags(vcpu, CPUSTAT_IO_INT); else vcpu->arch.sie_block->lctl |= LCTL_CR6; From 689bdf9e9c337121d948d48605de0659c088e6bb Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 31 Jan 2019 09:52:33 +0100 Subject: [PATCH 057/855] KVM: s390: make bitmap declaration consistent Use a consistent bitmap declaration throughout the code. Signed-off-by: Michael Mueller Reviewed-by: Cornelia Huck Reviewed-by: Halil Pasic Message-Id: <20190131085247.13826-3-mimu@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 2 +- arch/s390/kvm/interrupt.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index d5d24889c3bc..3cba08f73dc6 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -591,7 +591,7 @@ struct kvm_s390_float_interrupt { struct kvm_s390_mchk_info mchk; struct kvm_s390_ext_info srv_signal; int next_rr_cpu; - unsigned long idle_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)]; + DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS); struct mutex ais_lock; u8 simm; u8 nimm; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 19d556512452..167a3068ef84 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2831,7 +2831,7 @@ static void store_local_irq(struct kvm_s390_local_interrupt *li, int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len) { int scn; - unsigned long sigp_emerg_pending[BITS_TO_LONGS(KVM_MAX_VCPUS)]; + DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS); struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; unsigned long pending_irqs; struct kvm_s390_irq irq; From 246b72183b3538907d5c1e6d383c6956385d068d Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 31 Jan 2019 09:52:34 +0100 Subject: [PATCH 058/855] KVM: s390: move bitmap idle_mask into arch struct top level The vcpu idle_mask state is used by but not specific to the emulated floating interruptions. The state is relevant to gisa related interruptions as well. 
Signed-off-by: Michael Mueller Reviewed-by: Pierre Morel Reviewed-by: Cornelia Huck Acked-by: Halil Pasic Message-Id: <20190131085247.13826-4-mimu@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 2 +- arch/s390/kvm/interrupt.c | 11 +++++------ arch/s390/kvm/kvm-s390.h | 2 +- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 3cba08f73dc6..c259a67c4298 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -591,7 +591,6 @@ struct kvm_s390_float_interrupt { struct kvm_s390_mchk_info mchk; struct kvm_s390_ext_info srv_signal; int next_rr_cpu; - DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS); struct mutex ais_lock; u8 simm; u8 nimm; @@ -838,6 +837,7 @@ struct kvm_arch{ /* subset of available cpu features enabled by user space */ DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); struct kvm_s390_gisa *gisa; + DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS); }; #define KVM_HVA_ERR_BAD (-1UL) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 167a3068ef84..2a3eb9f076c3 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -318,13 +318,13 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu) static void __set_cpu_idle(struct kvm_vcpu *vcpu) { kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT); - set_bit(vcpu->vcpu_id, vcpu->kvm->arch.float_int.idle_mask); + set_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); } static void __unset_cpu_idle(struct kvm_vcpu *vcpu) { kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT); - clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.float_int.idle_mask); + clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); } static void __reset_intercept_indicators(struct kvm_vcpu *vcpu) @@ -1726,7 +1726,6 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) */ static void __floating_irq_kick(struct kvm *kvm, u64 type) { - struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int; struct kvm_vcpu *dst_vcpu; int sigcpu, online_vcpus, nr_tries = 0; @@ -1735,11 +1734,11 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type) return; /* find idle VCPUs first, then round robin */ - sigcpu = find_first_bit(fi->idle_mask, online_vcpus); + sigcpu = find_first_bit(kvm->arch.idle_mask, online_vcpus); if (sigcpu == online_vcpus) { do { - sigcpu = fi->next_rr_cpu; - fi->next_rr_cpu = (fi->next_rr_cpu + 1) % online_vcpus; + sigcpu = kvm->arch.float_int.next_rr_cpu++; + kvm->arch.float_int.next_rr_cpu %= online_vcpus; /* avoid endless loops if all vcpus are stopped */ if (nr_tries++ >= online_vcpus) return; diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 1f6e36cdce0d..72ef87799593 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -67,7 +67,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu) static inline int is_vcpu_idle(struct kvm_vcpu *vcpu) { - return test_bit(vcpu->vcpu_id, vcpu->kvm->arch.float_int.idle_mask); + return test_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); } static inline int kvm_is_ucontrol(struct kvm *kvm) From 672128bfee082d37e7eda630f98c46aacda4963c Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 31 Jan 2019 09:52:35 +0100 Subject: [PATCH 059/855] KVM: s390: coding style kvm_s390_gisa_init/clear() The change helps to reduce line length and increases code readability. 
Signed-off-by: Michael Mueller Reviewed-by: Cornelia Huck Reviewed-by: Pierre Morel Reviewed-by: Halil Pasic Message-Id: <20190131085247.13826-5-mimu@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 2a3eb9f076c3..005dbe7252e7 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2885,20 +2885,20 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len) void kvm_s390_gisa_clear(struct kvm *kvm) { - if (kvm->arch.gisa) { - memset(kvm->arch.gisa, 0, sizeof(struct kvm_s390_gisa)); - kvm->arch.gisa->next_alert = (u32)(u64)kvm->arch.gisa; - VM_EVENT(kvm, 3, "gisa 0x%pK cleared", kvm->arch.gisa); - } + if (!kvm->arch.gisa) + return; + memset(kvm->arch.gisa, 0, sizeof(struct kvm_s390_gisa)); + kvm->arch.gisa->next_alert = (u32)(u64)kvm->arch.gisa; + VM_EVENT(kvm, 3, "gisa 0x%pK cleared", kvm->arch.gisa); } void kvm_s390_gisa_init(struct kvm *kvm) { - if (css_general_characteristics.aiv) { - kvm->arch.gisa = &kvm->arch.sie_page2->gisa; - VM_EVENT(kvm, 3, "gisa 0x%pK initialized", kvm->arch.gisa); - kvm_s390_gisa_clear(kvm); - } + if (!css_general_characteristics.aiv) + return; + kvm->arch.gisa = &kvm->arch.sie_page2->gisa; + kvm_s390_gisa_clear(kvm); + VM_EVENT(kvm, 3, "gisa 0x%pK initialized", kvm->arch.gisa); } void kvm_s390_gisa_destroy(struct kvm *kvm) From 96723d323a080d98a2b330ef2d7aa020bf589d9e Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 31 Jan 2019 09:52:36 +0100 Subject: [PATCH 060/855] KVM: s390: use pending_irqs_no_gisa() where appropriate Interruption types that are not represented in GISA shall use pending_irqs_no_gisa() to test pending interruptions. Signed-off-by: Michael Mueller Reviewed-by: Cornelia Huck Reviewed-by: Pierre Morel Reviewed-by: Halil Pasic Message-Id: <20190131085247.13826-6-mimu@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 005dbe7252e7..cb48736867ed 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -353,7 +353,7 @@ static void set_intercept_indicators_io(struct kvm_vcpu *vcpu) static void set_intercept_indicators_ext(struct kvm_vcpu *vcpu) { - if (!(pending_irqs(vcpu) & IRQ_PEND_EXT_MASK)) + if (!(pending_irqs_no_gisa(vcpu) & IRQ_PEND_EXT_MASK)) return; if (psw_extint_disabled(vcpu)) kvm_s390_set_cpuflags(vcpu, CPUSTAT_EXT_INT); @@ -363,7 +363,7 @@ static void set_intercept_indicators_ext(struct kvm_vcpu *vcpu) static void set_intercept_indicators_mchk(struct kvm_vcpu *vcpu) { - if (!(pending_irqs(vcpu) & IRQ_PEND_MCHK_MASK)) + if (!(pending_irqs_no_gisa(vcpu) & IRQ_PEND_MCHK_MASK)) return; if (psw_mchk_disabled(vcpu)) vcpu->arch.sie_block->ictl |= ICTL_LPSW; From bb2fb8cdcf2d9f1845ffaa077013b45e14642019 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 31 Jan 2019 09:52:37 +0100 Subject: [PATCH 061/855] KVM: s390: remove kvm_s390_ from gisa static inline functions This will shorten the length of code lines. All GISA related static inline functions are local to interrupt.c. 
Signed-off-by: Michael Mueller Reviewed-by: Cornelia Huck Reviewed-by: Pierre Morel Reviewed-by: Halil Pasic Message-Id: <20190131085247.13826-7-mimu@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index cb48736867ed..942cc7d33766 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -217,22 +217,22 @@ static inline u8 int_word_to_isc(u32 int_word) */ #define IPM_BIT_OFFSET (offsetof(struct kvm_s390_gisa, ipm) * BITS_PER_BYTE) -static inline void kvm_s390_gisa_set_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) +static inline void gisa_set_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) { set_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); } -static inline u8 kvm_s390_gisa_get_ipm(struct kvm_s390_gisa *gisa) +static inline u8 gisa_get_ipm(struct kvm_s390_gisa *gisa) { return READ_ONCE(gisa->ipm); } -static inline void kvm_s390_gisa_clear_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) +static inline void gisa_clear_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) { clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); } -static inline int kvm_s390_gisa_tac_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) +static inline int gisa_tac_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) { return test_and_clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); } @@ -246,7 +246,7 @@ static inline unsigned long pending_irqs_no_gisa(struct kvm_vcpu *vcpu) static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu) { return pending_irqs_no_gisa(vcpu) | - kvm_s390_gisa_get_ipm(vcpu->kvm->arch.gisa) << IRQ_PEND_IO_ISC_7; + gisa_get_ipm(vcpu->kvm->arch.gisa) << IRQ_PEND_IO_ISC_7; } static inline int isc_to_irq_type(unsigned long isc) @@ -999,7 +999,7 @@ static int __must_check __deliver_io(struct kvm_vcpu *vcpu, } if (vcpu->kvm->arch.gisa && - kvm_s390_gisa_tac_ipm_gisc(vcpu->kvm->arch.gisa, isc)) { + gisa_tac_ipm_gisc(vcpu->kvm->arch.gisa, isc)) { /* * in case an adapter interrupt was not delivered * in SIE context KVM will handle the delivery @@ -1541,10 +1541,10 @@ static int get_top_gisa_isc(struct kvm *kvm, u64 isc_mask, u32 schid) if (!kvm->arch.gisa) goto out; - active_mask = (isc_mask & kvm_s390_gisa_get_ipm(kvm->arch.gisa) << 24) << 32; + active_mask = (isc_mask & gisa_get_ipm(kvm->arch.gisa) << 24) << 32; while (active_mask) { isc = __fls(active_mask) ^ (BITS_PER_LONG - 1); - if (kvm_s390_gisa_tac_ipm_gisc(kvm->arch.gisa, isc)) + if (gisa_tac_ipm_gisc(kvm->arch.gisa, isc)) return isc; clear_bit_inv(isc, &active_mask); } @@ -1584,7 +1584,7 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, /* both types of interrupts present */ if (int_word_to_isc(inti->io.io_int_word) <= isc) { /* classical IO int with higher priority */ - kvm_s390_gisa_set_ipm_gisc(kvm->arch.gisa, isc); + gisa_set_ipm_gisc(kvm->arch.gisa, isc); goto out; } gisa_out: @@ -1596,7 +1596,7 @@ gisa_out: kvm_s390_reinject_io_int(kvm, inti); inti = tmp_inti; } else - kvm_s390_gisa_set_ipm_gisc(kvm->arch.gisa, isc); + gisa_set_ipm_gisc(kvm->arch.gisa, isc); out: return inti; } @@ -1694,7 +1694,7 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) if (kvm->arch.gisa && inti->type & KVM_S390_INT_IO_AI_MASK) { VM_EVENT(kvm, 4, "%s isc %1u", "inject: I/O (AI/gisa)", isc); - kvm_s390_gisa_set_ipm_gisc(kvm->arch.gisa, isc); + gisa_set_ipm_gisc(kvm->arch.gisa, isc); 
kfree(inti); return 0; } @@ -2025,15 +2025,14 @@ static int get_all_floating_irqs(struct kvm *kvm, u8 __user *usrbuf, u64 len) max_irqs = len / sizeof(struct kvm_s390_irq); - if (kvm->arch.gisa && - kvm_s390_gisa_get_ipm(kvm->arch.gisa)) { + if (kvm->arch.gisa && gisa_get_ipm(kvm->arch.gisa)) { for (i = 0; i <= MAX_ISC; i++) { if (n == max_irqs) { /* signal userspace to try again */ ret = -ENOMEM; goto out_nolock; } - if (kvm_s390_gisa_tac_ipm_gisc(kvm->arch.gisa, i)) { + if (gisa_tac_ipm_gisc(kvm->arch.gisa, i)) { irq = (struct kvm_s390_irq *) &buf[n]; irq->type = KVM_S390_INT_IO(1, 0, 0, 0); irq->u.io.io_int_word = isc_to_int_word(i); From 982cff425959901dfb4df7433622e5c0510f9d37 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 31 Jan 2019 09:52:38 +0100 Subject: [PATCH 062/855] KVM: s390: introduce struct kvm_s390_gisa_interrupt Use this struct analog to the kvm interruption structs for kvm emulated floating and local interruptions. GIB handling will add further fields to this structure as required. Signed-off-by: Michael Mueller Reviewed-by: Cornelia Huck Reviewed-by: Halil Pasic Message-Id: <20190131085247.13826-8-mimu@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 6 +++- arch/s390/kvm/interrupt.c | 52 ++++++++++++++++++-------------- arch/s390/kvm/kvm-s390.c | 2 +- 3 files changed, 36 insertions(+), 24 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index c259a67c4298..0941571d34eb 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -803,6 +803,10 @@ struct kvm_s390_vsie { struct page *pages[KVM_MAX_VCPUS]; }; +struct kvm_s390_gisa_interrupt { + struct kvm_s390_gisa *origin; +}; + struct kvm_arch{ void *sca; int use_esca; @@ -836,8 +840,8 @@ struct kvm_arch{ atomic64_t cmma_dirty_pages; /* subset of available cpu features enabled by user space */ DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); - struct kvm_s390_gisa *gisa; DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS); + struct kvm_s390_gisa_interrupt gisa_int; }; #define KVM_HVA_ERR_BAD (-1UL) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 942cc7d33766..ee91d1de0036 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -246,7 +246,8 @@ static inline unsigned long pending_irqs_no_gisa(struct kvm_vcpu *vcpu) static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu) { return pending_irqs_no_gisa(vcpu) | - gisa_get_ipm(vcpu->kvm->arch.gisa) << IRQ_PEND_IO_ISC_7; + gisa_get_ipm(vcpu->kvm->arch.gisa_int.origin) << + IRQ_PEND_IO_ISC_7; } static inline int isc_to_irq_type(unsigned long isc) @@ -956,6 +957,7 @@ static int __must_check __deliver_io(struct kvm_vcpu *vcpu, { struct list_head *isc_list; struct kvm_s390_float_interrupt *fi; + struct kvm_s390_gisa_interrupt *gi = &vcpu->kvm->arch.gisa_int; struct kvm_s390_interrupt_info *inti = NULL; struct kvm_s390_io_info io; u32 isc; @@ -998,8 +1000,7 @@ static int __must_check __deliver_io(struct kvm_vcpu *vcpu, goto out; } - if (vcpu->kvm->arch.gisa && - gisa_tac_ipm_gisc(vcpu->kvm->arch.gisa, isc)) { + if (gi->origin && gisa_tac_ipm_gisc(gi->origin, isc)) { /* * in case an adapter interrupt was not delivered * in SIE context KVM will handle the delivery @@ -1533,18 +1534,19 @@ static struct kvm_s390_interrupt_info *get_top_io_int(struct kvm *kvm, static int get_top_gisa_isc(struct kvm *kvm, u64 isc_mask, u32 schid) { + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; unsigned long active_mask; 
int isc; if (schid) goto out; - if (!kvm->arch.gisa) + if (!gi->origin) goto out; - active_mask = (isc_mask & gisa_get_ipm(kvm->arch.gisa) << 24) << 32; + active_mask = (isc_mask & gisa_get_ipm(gi->origin) << 24) << 32; while (active_mask) { isc = __fls(active_mask) ^ (BITS_PER_LONG - 1); - if (gisa_tac_ipm_gisc(kvm->arch.gisa, isc)) + if (gisa_tac_ipm_gisc(gi->origin, isc)) return isc; clear_bit_inv(isc, &active_mask); } @@ -1567,6 +1569,7 @@ out: struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, u64 isc_mask, u32 schid) { + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; struct kvm_s390_interrupt_info *inti, *tmp_inti; int isc; @@ -1584,7 +1587,7 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, /* both types of interrupts present */ if (int_word_to_isc(inti->io.io_int_word) <= isc) { /* classical IO int with higher priority */ - gisa_set_ipm_gisc(kvm->arch.gisa, isc); + gisa_set_ipm_gisc(gi->origin, isc); goto out; } gisa_out: @@ -1596,7 +1599,7 @@ gisa_out: kvm_s390_reinject_io_int(kvm, inti); inti = tmp_inti; } else - gisa_set_ipm_gisc(kvm->arch.gisa, isc); + gisa_set_ipm_gisc(gi->origin, isc); out: return inti; } @@ -1685,6 +1688,7 @@ static int __inject_float_mchk(struct kvm *kvm, static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) { + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; struct kvm_s390_float_interrupt *fi; struct list_head *list; int isc; @@ -1692,9 +1696,9 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) kvm->stat.inject_io++; isc = int_word_to_isc(inti->io.io_int_word); - if (kvm->arch.gisa && inti->type & KVM_S390_INT_IO_AI_MASK) { + if (gi->origin && inti->type & KVM_S390_INT_IO_AI_MASK) { VM_EVENT(kvm, 4, "%s isc %1u", "inject: I/O (AI/gisa)", isc); - gisa_set_ipm_gisc(kvm->arch.gisa, isc); + gisa_set_ipm_gisc(gi->origin, isc); kfree(inti); return 0; } @@ -1752,7 +1756,8 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type) kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_STOP_INT); break; case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: - if (!(type & KVM_S390_INT_IO_AI_MASK && kvm->arch.gisa)) + if (!(type & KVM_S390_INT_IO_AI_MASK && + kvm->arch.gisa_int.origin)) kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_IO_INT); break; default: @@ -2002,6 +2007,7 @@ void kvm_s390_clear_float_irqs(struct kvm *kvm) static int get_all_floating_irqs(struct kvm *kvm, u8 __user *usrbuf, u64 len) { + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; struct kvm_s390_interrupt_info *inti; struct kvm_s390_float_interrupt *fi; struct kvm_s390_irq *buf; @@ -2025,14 +2031,14 @@ static int get_all_floating_irqs(struct kvm *kvm, u8 __user *usrbuf, u64 len) max_irqs = len / sizeof(struct kvm_s390_irq); - if (kvm->arch.gisa && gisa_get_ipm(kvm->arch.gisa)) { + if (gi->origin && gisa_get_ipm(gi->origin)) { for (i = 0; i <= MAX_ISC; i++) { if (n == max_irqs) { /* signal userspace to try again */ ret = -ENOMEM; goto out_nolock; } - if (gisa_tac_ipm_gisc(kvm->arch.gisa, i)) { + if (gisa_tac_ipm_gisc(gi->origin, i)) { irq = (struct kvm_s390_irq *) &buf[n]; irq->type = KVM_S390_INT_IO(1, 0, 0, 0); irq->u.io.io_int_word = isc_to_int_word(i); @@ -2884,25 +2890,27 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len) void kvm_s390_gisa_clear(struct kvm *kvm) { - if (!kvm->arch.gisa) + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + + if (!gi->origin) return; - memset(kvm->arch.gisa, 0, sizeof(struct kvm_s390_gisa)); - kvm->arch.gisa->next_alert 
= (u32)(u64)kvm->arch.gisa; - VM_EVENT(kvm, 3, "gisa 0x%pK cleared", kvm->arch.gisa); + memset(gi->origin, 0, sizeof(struct kvm_s390_gisa)); + gi->origin->next_alert = (u32)(u64)gi->origin; + VM_EVENT(kvm, 3, "gisa 0x%pK cleared", gi->origin); } void kvm_s390_gisa_init(struct kvm *kvm) { + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + if (!css_general_characteristics.aiv) return; - kvm->arch.gisa = &kvm->arch.sie_page2->gisa; + gi->origin = &kvm->arch.sie_page2->gisa; kvm_s390_gisa_clear(kvm); - VM_EVENT(kvm, 3, "gisa 0x%pK initialized", kvm->arch.gisa); + VM_EVENT(kvm, 3, "gisa 0x%pK initialized", gi->origin); } void kvm_s390_gisa_destroy(struct kvm *kvm) { - if (!kvm->arch.gisa) - return; - kvm->arch.gisa = NULL; + kvm->arch.gisa_int.origin = NULL; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 11b4be3dad15..5eaffb3e1738 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2812,7 +2812,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, vcpu->arch.sie_block->icpua = id; spin_lock_init(&vcpu->arch.local_int.lock); - vcpu->arch.sie_block->gd = (u32)(u64)kvm->arch.gisa; + vcpu->arch.sie_block->gd = (u32)(u64)kvm->arch.gisa_int.origin; if (vcpu->arch.sie_block->gd && sclp.has_gisaf) vcpu->arch.sie_block->gd |= GISA_FORMAT1; seqcount_init(&vcpu->arch.cputm_seqcount); From 3dec19221788ec29fff91f3a6bebfc7b8132b0a1 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 31 Jan 2019 09:52:39 +0100 Subject: [PATCH 063/855] s390/cio: add function chsc_sgib() This patch implements the Set Guest Information Block operation to request association or disassociation of a Guest Information Block (GIB) with the Adapter Interruption Facility. The operation is required to receive GIB alert interrupts for guest adapters in conjunction with AIV and GISA. 
Signed-off-by: Michael Mueller Reviewed-by: Sebastian Ott Reviewed-by: Pierre Morel Reviewed-by: Christian Borntraeger Acked-by: Halil Pasic Acked-by: Janosch Frank Acked-by: Cornelia Huck Message-Id: <20190131085247.13826-9-mimu@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/cio.h | 1 + drivers/s390/cio/chsc.c | 37 +++++++++++++++++++++++++++++++++++++ drivers/s390/cio/chsc.h | 1 + 3 files changed, 39 insertions(+) diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h index 225667652069..1727180e8ca1 100644 --- a/arch/s390/include/asm/cio.h +++ b/arch/s390/include/asm/cio.h @@ -331,5 +331,6 @@ extern void css_schedule_reprobe(void); /* Function from drivers/s390/cio/chsc.c */ int chsc_sstpc(void *page, unsigned int op, u16 ctrl, u64 *clock_delta); int chsc_sstpi(void *page, void *result, size_t size); +int chsc_sgib(u32 origin); #endif diff --git a/drivers/s390/cio/chsc.c b/drivers/s390/cio/chsc.c index a0baee25134c..4159c63a5fd2 100644 --- a/drivers/s390/cio/chsc.c +++ b/drivers/s390/cio/chsc.c @@ -1382,3 +1382,40 @@ int chsc_pnso_brinfo(struct subchannel_id schid, return chsc_error_from_response(brinfo_area->response.code); } EXPORT_SYMBOL_GPL(chsc_pnso_brinfo); + +int chsc_sgib(u32 origin) +{ + struct { + struct chsc_header request; + u16 op; + u8 reserved01[2]; + u8 reserved02:4; + u8 fmt:4; + u8 reserved03[7]; + /* operation data area begin */ + u8 reserved04[4]; + u32 gib_origin; + u8 reserved05[10]; + u8 aix; + u8 reserved06[4029]; + struct chsc_header response; + u8 reserved07[4]; + } *sgib_area; + int ret; + + spin_lock_irq(&chsc_page_lock); + memset(chsc_page, 0, PAGE_SIZE); + sgib_area = chsc_page; + sgib_area->request.length = 0x0fe0; + sgib_area->request.code = 0x0021; + sgib_area->op = 0x1; + sgib_area->gib_origin = origin; + + ret = chsc(sgib_area); + if (ret == 0) + ret = chsc_error_from_response(sgib_area->response.code); + spin_unlock_irq(&chsc_page_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(chsc_sgib); diff --git a/drivers/s390/cio/chsc.h b/drivers/s390/cio/chsc.h index 78aba8d94eec..e57d68e325a3 100644 --- a/drivers/s390/cio/chsc.h +++ b/drivers/s390/cio/chsc.h @@ -164,6 +164,7 @@ int chsc_get_channel_measurement_chars(struct channel_path *chp); int chsc_ssqd(struct subchannel_id schid, struct chsc_ssqd_area *ssqd); int chsc_sadc(struct subchannel_id schid, struct chsc_scssc_area *scssc, u64 summary_indicator_addr, u64 subchannel_indicator_addr); +int chsc_sgib(u32 origin); int chsc_error_from_response(int response); int chsc_siosl(struct subchannel_id schid); From 1282c21eb3dac324b8531bf2e0e0fbd9d5b6516b Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 31 Jan 2019 09:52:40 +0100 Subject: [PATCH 064/855] KVM: s390: add the GIB and its related life-cycle functions The Guest Information Block (GIB) links the GISA of all guests that have adapter interrupts pending. These interrupts cannot be delivered because all vcpus of these guests are currently in WAIT state or have masked the respective Interruption Sub Class (ISC). If enabled, a GIB alert is issued on the host to schedule these guests to run suitable vcpus to consume the pending interruptions. This mechanism allows adapter interrupts to be processed for guests that are currently not running. The GIB is created during host initialization and associated with the Adapter Interruption Facility in case an Adapter Interruption Virtualization Facility is available. The GIB initialization and thus the activation of the related code will be done in an upcoming patch of this series. 
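As a rough sketch of how these life-cycle functions are meant to be wired up (the actual call sites are only added by later patches in this series, so the hook names below are illustrative, and the use of GAL_ISC - the GIB alert ISC defined later in the series - is an assumption): the host allocates the GIB with the non-specific ISC it wants GIB alerts delivered on, and tears it down again on exit.

	/* illustrative only; GAL_ISC and the hook names are assumptions */
	static int __init example_kvm_host_init(void)
	{
		return kvm_s390_gib_init(GAL_ISC);
	}

	static void __exit example_kvm_host_exit(void)
	{
		kvm_s390_gib_destroy();
	}
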
Signed-off-by: Michael Mueller Reviewed-by: Janosch Frank Reviewed-by: Christian Borntraeger Reviewed-by: Cornelia Huck Reviewed-by: Pierre Morel Acked-by: Halil Pasic Message-Id: <20190131085247.13826-10-mimu@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 9 +++++++ arch/s390/kvm/interrupt.c | 43 ++++++++++++++++++++++++++++++++ arch/s390/kvm/kvm-s390.c | 1 + arch/s390/kvm/kvm-s390.h | 2 ++ 4 files changed, 55 insertions(+) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 0941571d34eb..397af0d33539 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -784,6 +784,15 @@ struct kvm_s390_gisa { }; }; +struct kvm_s390_gib { + u32 alert_list_origin; + u32 reserved01; + u8:5; + u8 nisc:3; + u8 reserved03[3]; + u32 reserved04[5]; +}; + /* * sie_page2 has to be allocated as DMA because fac_list, crycb and * gisa need 31bit addresses in the sie control block. diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index ee91d1de0036..5efcd9e2cf8f 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -7,6 +7,9 @@ * Author(s): Carsten Otte */ +#define KMSG_COMPONENT "kvm-s390" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + #include #include #include @@ -31,6 +34,8 @@ #define PFAULT_DONE 0x0680 #define VIRTIO_PARAM 0x0d00 +static struct kvm_s390_gib *gib; + /* handle external calls via sigp interpretation facility */ static int sca_ext_call_pending(struct kvm_vcpu *vcpu, int *src_id) { @@ -2914,3 +2919,41 @@ void kvm_s390_gisa_destroy(struct kvm *kvm) { kvm->arch.gisa_int.origin = NULL; } + +void kvm_s390_gib_destroy(void) +{ + if (!gib) + return; + chsc_sgib(0); + free_page((unsigned long)gib); + gib = NULL; +} + +int kvm_s390_gib_init(u8 nisc) +{ + int rc = 0; + + if (!css_general_characteristics.aiv) { + KVM_EVENT(3, "%s", "gib not initialized, no AIV facility"); + goto out; + } + + gib = (struct kvm_s390_gib *)get_zeroed_page(GFP_KERNEL | GFP_DMA); + if (!gib) { + rc = -ENOMEM; + goto out; + } + + gib->nisc = nisc; + if (chsc_sgib((u32)(u64)gib)) { + pr_err("Associating the GIB with the AIV facility failed\n"); + free_page((unsigned long)gib); + gib = NULL; + rc = -EIO; + goto out; + } + + KVM_EVENT(3, "gib 0x%pK (nisc=%d) initialized", gib, gib->nisc); +out: + return rc; +} diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 5eaffb3e1738..ede89172b8f4 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -444,6 +444,7 @@ out_debug_unreg: void kvm_arch_exit(void) { + kvm_s390_gib_destroy(); debug_unregister(kvm_s390_dbf); } diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 72ef87799593..6d9448dbd052 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -381,6 +381,8 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, void kvm_s390_gisa_init(struct kvm *kvm); void kvm_s390_gisa_clear(struct kvm *kvm); void kvm_s390_gisa_destroy(struct kvm *kvm); +int kvm_s390_gib_init(u8 nisc); +void kvm_s390_gib_destroy(void); /* implemented in guestdbg.c */ void kvm_s390_backup_guest_per_regs(struct kvm_vcpu *vcpu); From 25c84dbaec6a5079ab64cf8b633ec811f8e43fdd Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 31 Jan 2019 09:52:41 +0100 Subject: [PATCH 065/855] KVM: s390: add kvm reference to struct sie_page2 Adding the kvm reference to struct sie_page2 will allow to determine the kvm a given gisa belongs to: container_of(gisa, struct sie_page2, gisa)->kvm This functionality will be 
required to process a gisa in gib alert interruption context. Signed-off-by: Michael Mueller Reviewed-by: Pierre Morel Reviewed-by: Cornelia Huck Reviewed-by: David Hildenbrand Reviewed-by: Halil Pasic Message-Id: <20190131085247.13826-11-mimu@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 3 ++- arch/s390/kvm/kvm-s390.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 397af0d33539..7077762ab460 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -801,7 +801,8 @@ struct sie_page2 { __u64 fac_list[S390_ARCH_FAC_LIST_SIZE_U64]; /* 0x0000 */ struct kvm_s390_crypto_cb crycb; /* 0x0800 */ struct kvm_s390_gisa gisa; /* 0x0900 */ - u8 reserved920[0x1000 - 0x920]; /* 0x0920 */ + struct kvm *kvm; /* 0x0920 */ + u8 reserved928[0x1000 - 0x928]; /* 0x0928 */ }; struct kvm_s390_vsie { diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ede89172b8f4..0de062e989e2 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2210,6 +2210,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (!kvm->arch.sie_page2) goto out_err; + kvm->arch.sie_page2->kvm = kvm; kvm->arch.model.fac_list = kvm->arch.sie_page2->fac_list; for (i = 0; i < kvm_s390_fac_size(); i++) { From 6cff2e1046015f2729f6cb68be1c999c4a9b259f Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 31 Jan 2019 09:52:42 +0100 Subject: [PATCH 066/855] KVM: s390: add functions to (un)register GISC with GISA Add the Interruption Alert Mask (IAM) to the architecture specific kvm struct. This mask in the GISA is used to define for which ISC a GIB alert will be issued. The functions kvm_s390_gisc_register() and kvm_s390_gisc_unregister() are used to (un)register a GISC (guest ISC) with a virtual machine and its GISA. Upon successful completion, kvm_s390_gisc_register() returns the ISC to be used for GIB alert interruptions. A negative return code indicates an error during registration. These functions will be used by other adapter types like AP and PCI to request pass-through interruption support. 
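As a usage sketch (the caller and the "kvm" and "gisc" variables are assumptions, not taken from this patch): an adapter backend that wants pass-through interrupts for a guest ISC registers it once, routes its interrupts via the returned non-specific ISC, and unregisters it again on teardown.

	/* sketch only: "kvm" is the guest, "gisc" the guest ISC the device uses */
	static int example_enable_passthrough_irqs(struct kvm *kvm, u32 gisc)
	{
		int nisc = kvm_s390_gisc_register(kvm, gisc);

		if (nisc < 0)
			return nisc;	/* -ENODEV or -ERANGE */
		/* arrange for the adapter's interrupts to arrive on "nisc" ... */
		return 0;
	}

	/* ... and on teardown: kvm_s390_gisc_unregister(kvm, gisc); */
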
Signed-off-by: Michael Mueller Acked-by: Pierre Morel Acked-by: Halil Pasic Reviewed-by: Cornelia Huck Message-Id: <20190131085247.13826-12-mimu@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 13 ++++ arch/s390/kvm/interrupt.c | 112 +++++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 7077762ab460..2cfff617cb21 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -781,6 +781,9 @@ struct kvm_s390_gisa { u8 reserved03[11]; u32 airq_count; } g1; + struct { + u64 word[4]; + } u64; }; }; @@ -813,8 +816,15 @@ struct kvm_s390_vsie { struct page *pages[KVM_MAX_VCPUS]; }; +struct kvm_s390_gisa_iam { + u8 mask; + spinlock_t ref_lock; + u32 ref_count[MAX_ISC + 1]; +}; + struct kvm_s390_gisa_interrupt { struct kvm_s390_gisa *origin; + struct kvm_s390_gisa_iam alert; }; struct kvm_arch{ @@ -885,6 +895,9 @@ void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm, extern int sie64a(struct kvm_s390_sie_block *, u64 *); extern char sie_exit; +extern int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc); +extern int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc); + static inline void kvm_arch_hardware_disable(void) {} static inline void kvm_arch_check_processor_compat(void *rtn) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 5efcd9e2cf8f..040745b23224 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -222,6 +222,33 @@ static inline u8 int_word_to_isc(u32 int_word) */ #define IPM_BIT_OFFSET (offsetof(struct kvm_s390_gisa, ipm) * BITS_PER_BYTE) +/** + * gisa_set_iam - change the GISA interruption alert mask + * + * @gisa: gisa to operate on + * @iam: new IAM value to use + * + * Change the IAM atomically with the next alert address and the IPM + * of the GISA if the GISA is not part of the GIB alert list. All three + * fields are located in the first long word of the GISA. + * + * Returns: 0 on success + * -EBUSY in case the gisa is part of the alert list + */ +static inline int gisa_set_iam(struct kvm_s390_gisa *gisa, u8 iam) +{ + u64 word, _word; + + do { + word = READ_ONCE(gisa->u64.word[0]); + if ((u64)gisa != word >> 32) + return -EBUSY; + _word = (word & ~0xffUL) | iam; + } while (cmpxchg(&gisa->u64.word[0], word, _word) != word); + + return 0; +} + static inline void gisa_set_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) { set_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); @@ -2911,6 +2938,8 @@ void kvm_s390_gisa_init(struct kvm *kvm) if (!css_general_characteristics.aiv) return; gi->origin = &kvm->arch.sie_page2->gisa; + gi->alert.mask = 0; + spin_lock_init(&gi->alert.ref_lock); kvm_s390_gisa_clear(kvm); VM_EVENT(kvm, 3, "gisa 0x%pK initialized", gi->origin); } @@ -2920,6 +2949,89 @@ void kvm_s390_gisa_destroy(struct kvm *kvm) kvm->arch.gisa_int.origin = NULL; } +/** + * kvm_s390_gisc_register - register a guest ISC + * + * @kvm: the kernel vm to work with + * @gisc: the guest interruption sub class to register + * + * The function extends the vm specific alert mask to use. + * The effective IAM mask in the GISA is updated as well + * in case the GISA is not part of the GIB alert list. + * It will be updated latest when the IAM gets restored + * by gisa_get_ipm_or_restore_iam(). 
+ * + * Returns: the nonspecific ISC (NISC) the gib alert mechanism + * has registered with the channel subsystem. + * -ENODEV in case the vm uses no GISA + * -ERANGE in case the guest ISC is invalid + */ +int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc) +{ + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + + if (!gi->origin) + return -ENODEV; + if (gisc > MAX_ISC) + return -ERANGE; + + spin_lock(&gi->alert.ref_lock); + gi->alert.ref_count[gisc]++; + if (gi->alert.ref_count[gisc] == 1) { + gi->alert.mask |= 0x80 >> gisc; + gisa_set_iam(gi->origin, gi->alert.mask); + } + spin_unlock(&gi->alert.ref_lock); + + return gib->nisc; +} +EXPORT_SYMBOL_GPL(kvm_s390_gisc_register); + +/** + * kvm_s390_gisc_unregister - unregister a guest ISC + * + * @kvm: the kernel vm to work with + * @gisc: the guest interruption sub class to register + * + * The function reduces the vm specific alert mask to use. + * The effective IAM mask in the GISA is updated as well + * in case the GISA is not part of the GIB alert list. + * It will be updated latest when the IAM gets restored + * by gisa_get_ipm_or_restore_iam(). + * + * Returns: the nonspecific ISC (NISC) the gib alert mechanism + * has registered with the channel subsystem. + * -ENODEV in case the vm uses no GISA + * -ERANGE in case the guest ISC is invalid + * -EINVAL in case the guest ISC is not registered + */ +int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc) +{ + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + int rc = 0; + + if (!gi->origin) + return -ENODEV; + if (gisc > MAX_ISC) + return -ERANGE; + + spin_lock(&gi->alert.ref_lock); + if (gi->alert.ref_count[gisc] == 0) { + rc = -EINVAL; + goto out; + } + gi->alert.ref_count[gisc]--; + if (gi->alert.ref_count[gisc] == 0) { + gi->alert.mask &= ~(0x80 >> gisc); + gisa_set_iam(gi->origin, gi->alert.mask); + } +out: + spin_unlock(&gi->alert.ref_lock); + + return rc; +} +EXPORT_SYMBOL_GPL(kvm_s390_gisc_unregister); + void kvm_s390_gib_destroy(void) { if (!gib) From 174dd4f88875f6ebad7ad1dff3c3e9530a1e3506 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 31 Jan 2019 09:52:43 +0100 Subject: [PATCH 067/855] KVM: s390: kvm_s390_gisa_clear() now clears the IPM only Function kvm_s390_gisa_clear() now clears the Interruption Pending Mask of the GISA asap. If the GISA is in the alert list at this time it stays in the list but is removed by process_gib_alert_list(). Signed-off-by: Michael Mueller Acked-by: Halil Pasic Reviewed-by: Pierre Morel Message-Id: <20190131085247.13826-13-mimu@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 040745b23224..9faaa8f96fc3 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -249,6 +249,25 @@ static inline int gisa_set_iam(struct kvm_s390_gisa *gisa, u8 iam) return 0; } +/** + * gisa_clear_ipm - clear the GISA interruption pending mask + * + * @gisa: gisa to operate on + * + * Clear the IPM atomically with the next alert address and the IAM + * of the GISA unconditionally. All three fields are located in the + * first long word of the GISA. 
+ */ +static inline void gisa_clear_ipm(struct kvm_s390_gisa *gisa) +{ + u64 word, _word; + + do { + word = READ_ONCE(gisa->u64.word[0]); + _word = word & ~(0xffUL << 24); + } while (cmpxchg(&gisa->u64.word[0], word, _word) != word); +} + static inline void gisa_set_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) { set_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); @@ -2926,8 +2945,7 @@ void kvm_s390_gisa_clear(struct kvm *kvm) if (!gi->origin) return; - memset(gi->origin, 0, sizeof(struct kvm_s390_gisa)); - gi->origin->next_alert = (u32)(u64)gi->origin; + gisa_clear_ipm(gi->origin); VM_EVENT(kvm, 3, "gisa 0x%pK cleared", gi->origin); } @@ -2940,7 +2958,8 @@ void kvm_s390_gisa_init(struct kvm *kvm) gi->origin = &kvm->arch.sie_page2->gisa; gi->alert.mask = 0; spin_lock_init(&gi->alert.ref_lock); - kvm_s390_gisa_clear(kvm); + memset(gi->origin, 0, sizeof(struct kvm_s390_gisa)); + gi->origin->next_alert = (u32)(u64)gi->origin; VM_EVENT(kvm, 3, "gisa 0x%pK initialized", gi->origin); } From 9f30f62163786a0b80e0886046b5c66e714e7e71 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 31 Jan 2019 09:52:44 +0100 Subject: [PATCH 068/855] KVM: s390: add gib_alert_irq_handler() The patch implements a handler for GIB alert interruptions on the host. Its task is to alert guests that interrupts are pending for them. A GIB alert interrupt statistic counter is added as well: $ cat /proc/interrupts CPU0 CPU1 ... GAL: 23 37 [I/O] GIB Alert ... Signed-off-by: Michael Mueller Acked-by: Halil Pasic Reviewed-by: Pierre Morel Message-Id: <20190131085247.13826-14-mimu@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/irq.h | 1 + arch/s390/include/asm/isc.h | 1 + arch/s390/include/asm/kvm_host.h | 3 + arch/s390/kernel/irq.c | 1 + arch/s390/kvm/interrupt.c | 169 ++++++++++++++++++++++++++++++- arch/s390/kvm/kvm-s390.c | 2 + 6 files changed, 175 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/irq.h b/arch/s390/include/asm/irq.h index 2f7f27e5493f..afaf5e3c57fd 100644 --- a/arch/s390/include/asm/irq.h +++ b/arch/s390/include/asm/irq.h @@ -62,6 +62,7 @@ enum interruption_class { IRQIO_MSI, IRQIO_VIR, IRQIO_VAI, + IRQIO_GAL, NMI_NMI, CPU_RST, NR_ARCH_IRQS diff --git a/arch/s390/include/asm/isc.h b/arch/s390/include/asm/isc.h index 6cb9e2ed05b6..b2cc1ec78d06 100644 --- a/arch/s390/include/asm/isc.h +++ b/arch/s390/include/asm/isc.h @@ -21,6 +21,7 @@ /* Adapter interrupts. 
*/ #define QDIO_AIRQ_ISC IO_SCH_ISC /* I/O subchannel in qdio mode */ #define PCI_ISC 2 /* PCI I/O subchannels */ +#define GAL_ISC 5 /* GIB alert */ #define AP_ISC 6 /* adjunct processor (crypto) devices */ /* Functions for registration of I/O interruption subclasses */ diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 2cfff617cb21..c5f51566ecd6 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -825,6 +825,9 @@ struct kvm_s390_gisa_iam { struct kvm_s390_gisa_interrupt { struct kvm_s390_gisa *origin; struct kvm_s390_gisa_iam alert; + struct hrtimer timer; + u64 expires; + DECLARE_BITMAP(kicked_mask, KVM_MAX_VCPUS); }; struct kvm_arch{ diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index 0e8d68bac82c..0cd5a5f96729 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -88,6 +88,7 @@ static const struct irq_class irqclass_sub_desc[] = { {.irq = IRQIO_MSI, .name = "MSI", .desc = "[I/O] MSI Interrupt" }, {.irq = IRQIO_VIR, .name = "VIR", .desc = "[I/O] Virtual I/O Devices"}, {.irq = IRQIO_VAI, .name = "VAI", .desc = "[I/O] Virtual I/O Devices AI"}, + {.irq = IRQIO_GAL, .name = "GAL", .desc = "[I/O] GIB Alert"}, {.irq = NMI_NMI, .name = "NMI", .desc = "[NMI] Machine Check"}, {.irq = CPU_RST, .name = "RST", .desc = "[CPU] CPU Restart"}, }; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 9faaa8f96fc3..dc446203e5a4 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "kvm-s390.h" #include "gaccess.h" #include "trace-s390.h" @@ -268,6 +269,38 @@ static inline void gisa_clear_ipm(struct kvm_s390_gisa *gisa) } while (cmpxchg(&gisa->u64.word[0], word, _word) != word); } +/** + * gisa_get_ipm_or_restore_iam - return IPM or restore GISA IAM + * + * @gi: gisa interrupt struct to work on + * + * Atomically restores the interruption alert mask if none of the + * relevant ISCs are pending and return the IPM. 
+ * + * Returns: the relevant pending ISCs + */ +static inline u8 gisa_get_ipm_or_restore_iam(struct kvm_s390_gisa_interrupt *gi) +{ + u8 pending_mask, alert_mask; + u64 word, _word; + + do { + word = READ_ONCE(gi->origin->u64.word[0]); + alert_mask = READ_ONCE(gi->alert.mask); + pending_mask = (u8)(word >> 24) & alert_mask; + if (pending_mask) + return pending_mask; + _word = (word & ~0xffUL) | alert_mask; + } while (cmpxchg(&gi->origin->u64.word[0], word, _word) != word); + + return 0; +} + +static inline int gisa_in_alert_list(struct kvm_s390_gisa *gisa) +{ + return READ_ONCE(gisa->next_alert) != (u32)(u64)gisa; +} + static inline void gisa_set_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) { set_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); @@ -1141,6 +1174,7 @@ static u64 __calculate_sltime(struct kvm_vcpu *vcpu) int kvm_s390_handle_wait(struct kvm_vcpu *vcpu) { + struct kvm_s390_gisa_interrupt *gi = &vcpu->kvm->arch.gisa_int; u64 sltime; vcpu->stat.exit_wait_state++; @@ -1154,6 +1188,11 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu) return -EOPNOTSUPP; /* disabled wait */ } + if (gi->origin && + (gisa_get_ipm_or_restore_iam(gi) & + vcpu->arch.sie_block->gcr[6] >> 24)) + return 0; + if (!ckc_interrupts_enabled(vcpu) && !cpu_timer_interrupts_enabled(vcpu)) { VCPU_EVENT(vcpu, 3, "%s", "enabled wait w/o timer"); @@ -2939,6 +2978,93 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len) return n; } +static void __airqs_kick_single_vcpu(struct kvm *kvm, u8 deliverable_mask) +{ + int vcpu_id, online_vcpus = atomic_read(&kvm->online_vcpus); + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + struct kvm_vcpu *vcpu; + + for_each_set_bit(vcpu_id, kvm->arch.idle_mask, online_vcpus) { + vcpu = kvm_get_vcpu(kvm, vcpu_id); + if (psw_ioint_disabled(vcpu)) + continue; + deliverable_mask &= (u8)(vcpu->arch.sie_block->gcr[6] >> 24); + if (deliverable_mask) { + /* lately kicked but not yet running */ + if (test_and_set_bit(vcpu_id, gi->kicked_mask)) + return; + kvm_s390_vcpu_wakeup(vcpu); + return; + } + } +} + +static enum hrtimer_restart gisa_vcpu_kicker(struct hrtimer *timer) +{ + struct kvm_s390_gisa_interrupt *gi = + container_of(timer, struct kvm_s390_gisa_interrupt, timer); + struct kvm *kvm = + container_of(gi->origin, struct sie_page2, gisa)->kvm; + u8 pending_mask; + + pending_mask = gisa_get_ipm_or_restore_iam(gi); + if (pending_mask) { + __airqs_kick_single_vcpu(kvm, pending_mask); + hrtimer_forward_now(timer, ns_to_ktime(gi->expires)); + return HRTIMER_RESTART; + }; + + return HRTIMER_NORESTART; +} + +#define NULL_GISA_ADDR 0x00000000UL +#define NONE_GISA_ADDR 0x00000001UL +#define GISA_ADDR_MASK 0xfffff000UL + +static void process_gib_alert_list(void) +{ + struct kvm_s390_gisa_interrupt *gi; + struct kvm_s390_gisa *gisa; + struct kvm *kvm; + u32 final, origin = 0UL; + + do { + /* + * If the NONE_GISA_ADDR is still stored in the alert list + * origin, we will leave the outer loop. No further GISA has + * been added to the alert list by millicode while processing + * the current alert list. + */ + final = (origin & NONE_GISA_ADDR); + /* + * Cut off the alert list and store the NONE_GISA_ADDR in the + * alert list origin to avoid further GAL interruptions. + * A new alert list can be build up by millicode in parallel + * for guests not in the yet cut-off alert list. When in the + * final loop, store the NULL_GISA_ADDR instead. This will re- + * enable GAL interruptions on the host again. 
+ */ + origin = xchg(&gib->alert_list_origin, + (!final) ? NONE_GISA_ADDR : NULL_GISA_ADDR); + /* + * Loop through the just cut-off alert list and start the + * gisa timers to kick idle vcpus to consume the pending + * interruptions asap. + */ + while (origin & GISA_ADDR_MASK) { + gisa = (struct kvm_s390_gisa *)(u64)origin; + origin = gisa->next_alert; + gisa->next_alert = (u32)(u64)gisa; + kvm = container_of(gisa, struct sie_page2, gisa)->kvm; + gi = &kvm->arch.gisa_int; + if (hrtimer_active(&gi->timer)) + hrtimer_cancel(&gi->timer); + hrtimer_start(&gi->timer, 0, HRTIMER_MODE_REL); + } + } while (!final); + +} + void kvm_s390_gisa_clear(struct kvm *kvm) { struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; @@ -2958,6 +3084,9 @@ void kvm_s390_gisa_init(struct kvm *kvm) gi->origin = &kvm->arch.sie_page2->gisa; gi->alert.mask = 0; spin_lock_init(&gi->alert.ref_lock); + gi->expires = 50 * 1000; /* 50 usec */ + hrtimer_init(&gi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + gi->timer.function = gisa_vcpu_kicker; memset(gi->origin, 0, sizeof(struct kvm_s390_gisa)); gi->origin->next_alert = (u32)(u64)gi->origin; VM_EVENT(kvm, 3, "gisa 0x%pK initialized", gi->origin); @@ -2965,7 +3094,17 @@ void kvm_s390_gisa_init(struct kvm *kvm) void kvm_s390_gisa_destroy(struct kvm *kvm) { - kvm->arch.gisa_int.origin = NULL; + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + + if (!gi->origin) + return; + if (gi->alert.mask) + KVM_EVENT(3, "vm 0x%pK has unexpected iam 0x%02x", + kvm, gi->alert.mask); + while (gisa_in_alert_list(gi->origin)) + cpu_relax(); + hrtimer_cancel(&gi->timer); + gi->origin = NULL; } /** @@ -3051,11 +3190,23 @@ out: } EXPORT_SYMBOL_GPL(kvm_s390_gisc_unregister); +static void gib_alert_irq_handler(struct airq_struct *airq) +{ + inc_irq_stat(IRQIO_GAL); + process_gib_alert_list(); +} + +static struct airq_struct gib_alert_irq = { + .handler = gib_alert_irq_handler, + .lsi_ptr = &gib_alert_irq.lsi_mask, +}; + void kvm_s390_gib_destroy(void) { if (!gib) return; chsc_sgib(0); + unregister_adapter_interrupt(&gib_alert_irq); free_page((unsigned long)gib); gib = NULL; } @@ -3075,16 +3226,30 @@ int kvm_s390_gib_init(u8 nisc) goto out; } + gib_alert_irq.isc = nisc; + if (register_adapter_interrupt(&gib_alert_irq)) { + pr_err("Registering the GIB alert interruption handler failed\n"); + rc = -EIO; + goto out_free_gib; + } + gib->nisc = nisc; if (chsc_sgib((u32)(u64)gib)) { pr_err("Associating the GIB with the AIV facility failed\n"); free_page((unsigned long)gib); gib = NULL; rc = -EIO; - goto out; + goto out_unreg_gal; } KVM_EVENT(3, "gib 0x%pK (nisc=%d) initialized", gib, gib->nisc); + goto out; + +out_unreg_gal: + unregister_adapter_interrupt(&gib_alert_irq); +out_free_gib: + free_page((unsigned long)gib); + gib = NULL; out: return rc; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 0de062e989e2..0099fbda2e98 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -3460,6 +3460,8 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu) kvm_s390_patch_guest_per_regs(vcpu); } + clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.gisa_int.kicked_mask); + vcpu->arch.sie_block->icptcode = 0; cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags); VCPU_EVENT(vcpu, 6, "entering sie flags %x", cpuflags); From b1d1e76ed9ee5a0f7671da257d4f0595d1f5162e Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 31 Jan 2019 09:52:45 +0100 Subject: [PATCH 069/855] KVM: s390: start using the GIB By initializing the GIB, it will be used by the kvm host. 
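For orientation, the two exported helpers introduced earlier in this series (kvm_s390_gisc_register()/kvm_s390_gisc_unregister(), PATCH 066/855) are the interface an interrupt backend is expected to call once the GIB is initialized here. The sketch below is illustrative only and not part of any patch in the series; the caller names and error handling are invented:

/* Illustrative consumer of the GISA alerting interface (not from the series). */
static int example_route_airq_via_gisa(struct kvm *kvm, u32 guest_isc)
{
        int nisc;

        nisc = kvm_s390_gisc_register(kvm, guest_isc);
        if (nisc < 0)
                return nisc;    /* -ENODEV: no GISA in use, -ERANGE: ISC out of range */

        /*
         * Program the adapter to raise its interruptions on @nisc; pending
         * interruptions for @guest_isc are then recorded in the GISA IPM and,
         * with the corresponding IAM bit set, produce a GIB alert on the host.
         */
        return 0;
}

static void example_unroute_airq_via_gisa(struct kvm *kvm, u32 guest_isc)
{
        WARN_ON(kvm_s390_gisc_unregister(kvm, guest_isc));
}
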
Signed-off-by: Michael Mueller Reviewed-by: Pierre Morel Reviewed-by: Halil Pasic Message-Id: <20190131085247.13826-15-mimu@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 0099fbda2e98..2e47c724679e 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -435,8 +435,15 @@ int kvm_arch_init(void *opaque) pr_err("A FLIC registration call failed with rc=%d\n", rc); goto out_debug_unreg; } + + rc = kvm_s390_gib_init(GAL_ISC); + if (rc) + goto out_gib_destroy; + return 0; +out_gib_destroy: + kvm_s390_gib_destroy(); out_debug_unreg: debug_unregister(kvm_s390_dbf); return rc; From b9fa6d6ee9b88eedf2d2261f931841664aa6980e Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 31 Jan 2019 09:52:46 +0100 Subject: [PATCH 070/855] KVM: s390: fix possible null pointer dereference in pending_irqs() Assure a GISA is in use before accessing the IPM to avoid a null pointer dereference issue. Signed-off-by: Michael Mueller Reported-by: Halil Pasic Reviewed-by: Pierre Morel Reviewed-by: Cornelia Huck Message-Id: <20190131085247.13826-16-mimu@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index dc446203e5a4..82162867f378 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -329,9 +329,13 @@ static inline unsigned long pending_irqs_no_gisa(struct kvm_vcpu *vcpu) static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu) { - return pending_irqs_no_gisa(vcpu) | - gisa_get_ipm(vcpu->kvm->arch.gisa_int.origin) << - IRQ_PEND_IO_ISC_7; + struct kvm_s390_gisa_interrupt *gi = &vcpu->kvm->arch.gisa_int; + unsigned long pending_mask; + + pending_mask = pending_irqs_no_gisa(vcpu); + if (gi->origin) + pending_mask |= gisa_get_ipm(gi->origin) << IRQ_PEND_IO_ISC_7; + return pending_mask; } static inline int isc_to_irq_type(unsigned long isc) From 5766ba31992d39738b5b0b32d38f94abbe862b3e Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 8 Feb 2019 19:24:44 +0100 Subject: [PATCH 071/855] fbdev/via: fix spelling mistake "Expandsion" -> "Expansion" Trivial fix to spelling mistake in MODULE_PARM_DESC text. 
Signed-off-by: Colin Ian King Cc: Florian Tobias Schandinat Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/video/fbdev/via/viafbdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/fbdev/via/viafbdev.c b/drivers/video/fbdev/via/viafbdev.c index 7bb7e90b8f00..bdf5a0ea876d 100644 --- a/drivers/video/fbdev/via/viafbdev.c +++ b/drivers/video/fbdev/via/viafbdev.c @@ -2110,7 +2110,7 @@ MODULE_PARM_DESC(viafb_lcd_panel_id, module_param(viafb_lcd_dsp_method, int, S_IRUSR); MODULE_PARM_DESC(viafb_lcd_dsp_method, - "Set Flat Panel display scaling method.(Default=Expandsion)"); + "Set Flat Panel display scaling method.(Default=Expansion)"); module_param(viafb_SAMM_ON, int, S_IRUSR); MODULE_PARM_DESC(viafb_SAMM_ON, From 8e71fa5e4d86bedfd26df85381d65d6b4c860020 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 8 Feb 2019 19:24:45 +0100 Subject: [PATCH 072/855] fbdev: chipsfb: remove set but not used variable 'size' Fixes gcc '-Wunused-but-set-variable' warning: drivers/video/fbdev/chipsfb.c: In function 'chipsfb_pci_init': drivers/video/fbdev/chipsfb.c:352:22: warning: variable 'size' set but not used [-Wunused-but-set-variable] Fixes: 8c8709334cec ("[PATCH] ppc32: Remove CONFIG_PMAC_PBOOK"). Signed-off-by: YueHaibing Acked-by: Michael Ellerman Cc: Daniel Vetter Cc: Christophe Leroy [b.zolnierkie: minor commit summary and description fixups] Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/video/fbdev/chipsfb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/video/fbdev/chipsfb.c b/drivers/video/fbdev/chipsfb.c index 40182ed85648..ca549e1532e6 100644 --- a/drivers/video/fbdev/chipsfb.c +++ b/drivers/video/fbdev/chipsfb.c @@ -349,7 +349,7 @@ static void init_chips(struct fb_info *p, unsigned long addr) static int chipsfb_pci_init(struct pci_dev *dp, const struct pci_device_id *ent) { struct fb_info *p; - unsigned long addr, size; + unsigned long addr; unsigned short cmd; int rc = -ENODEV; @@ -361,7 +361,6 @@ static int chipsfb_pci_init(struct pci_dev *dp, const struct pci_device_id *ent) if ((dp->resource[0].flags & IORESOURCE_MEM) == 0) goto err_disable; addr = pci_resource_start(dp, 0); - size = pci_resource_len(dp, 0); if (addr == 0) goto err_disable; From 23cd78e28a7b17528f44933edb93e93ba082138c Mon Sep 17 00:00:00 2001 From: Konstantin Khorenko Date: Fri, 8 Feb 2019 19:24:45 +0100 Subject: [PATCH 073/855] fbcon: use kvmalloc() for scrollback buffer Scrollback frame buffer is rather big - 32K, so it requires 3rd order page, so let's use kvmalloc() instead of ordinary kmalloc() for it. 
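For reference, kvmalloc() first attempts a physically contiguous kmalloc() and transparently falls back to vmalloc() when that fails, so the 32K scrollback buffer no longer depends on an order-3 page being available. A minimal usage sketch (illustrative only, not taken from the patch):

#include <linux/mm.h>

static void *scrollback_alloc(size_t size)
{
        /* contiguous pages if available, vmalloc()-backed memory otherwise */
        return kvmalloc(size, GFP_KERNEL);
}

static void scrollback_free(void *buf)
{
        kvfree(buf);    /* handles both kmalloc()- and vmalloc()-backed buffers */
}
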
Signed-off-by: Konstantin Khorenko Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/video/fbdev/core/fbcon.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index bfa1360ec750..fd430e6b4c63 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -999,7 +999,7 @@ static const char *fbcon_startup(void) if (!softback_buf) { softback_buf = (unsigned long) - kmalloc(fbcon_softback_size, + kvmalloc(fbcon_softback_size, GFP_KERNEL); if (!softback_buf) { fbcon_softback_size = 0; @@ -1008,7 +1008,7 @@ static const char *fbcon_startup(void) } } else { if (softback_buf) { - kfree((void *) softback_buf); + kvfree((void *) softback_buf); softback_buf = 0; softback_top = 0; } @@ -3672,7 +3672,7 @@ static void fbcon_exit(void) } #endif - kfree((void *)softback_buf); + kvfree((void *)softback_buf); softback_buf = 0UL; for_each_registered_fb(i) { From a6f13af4d115a1afb97a9215c491f0bbc5362af7 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Fri, 8 Feb 2019 19:24:45 +0100 Subject: [PATCH 074/855] fbdev: Use of_node_name_eq for node name comparisons Convert string compares of DT node names to use of_node_name_eq helper instead. This removes direct access to the node name pointer. For instances using of_node_cmp, this has the side effect of now using case sensitive comparisons. This should not matter for any FDT based system which omap is. Signed-off-by: Rob Herring Cc: Benjamin Herrenschmidt Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/video/fbdev/aty/radeon_pm.c | 6 +++--- drivers/video/fbdev/cg14.c | 4 ++-- drivers/video/fbdev/cg3.c | 2 +- drivers/video/fbdev/ffb.c | 2 +- drivers/video/fbdev/imsttfb.c | 4 ++-- drivers/video/fbdev/offb.c | 2 +- drivers/video/fbdev/omap2/omapfb/dss/dss-of.c | 4 ++-- 7 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/video/fbdev/aty/radeon_pm.c b/drivers/video/fbdev/aty/radeon_pm.c index e695adb0e573..2dc5703eac51 100644 --- a/drivers/video/fbdev/aty/radeon_pm.c +++ b/drivers/video/fbdev/aty/radeon_pm.c @@ -2844,8 +2844,8 @@ void radeonfb_pm_init(struct radeonfb_info *rinfo, int dynclk, int ignore_devlis * in some desktop G4s), Via (M9+ chip on iBook G4) and * Snowy (M11 chip on iBook G4 manufactured after July 2005) */ - if (!strcmp(rinfo->of_node->name, "ATY,JasperParent") || - !strcmp(rinfo->of_node->name, "ATY,SnowyParent")) { + if (of_node_name_eq(rinfo->of_node, "ATY,JasperParent") || + of_node_name_eq(rinfo->of_node, "ATY,SnowyParent")) { rinfo->reinit_func = radeon_reinitialize_M10; rinfo->pm_mode |= radeon_pm_off; } @@ -2855,7 +2855,7 @@ void radeonfb_pm_init(struct radeonfb_info *rinfo, int dynclk, int ignore_devlis rinfo->pm_mode |= radeon_pm_off; } #endif - if (!strcmp(rinfo->of_node->name, "ATY,ViaParent")) { + if (of_node_name_eq(rinfo->of_node, "ATY,ViaParent")) { rinfo->reinit_func = radeon_reinitialize_M9P; rinfo->pm_mode |= radeon_pm_off; } diff --git a/drivers/video/fbdev/cg14.c b/drivers/video/fbdev/cg14.c index 9af54c2368fd..a6dce1a78490 100644 --- a/drivers/video/fbdev/cg14.c +++ b/drivers/video/fbdev/cg14.c @@ -486,8 +486,8 @@ static int cg14_probe(struct platform_device *op) info->var.xres); info->fix.smem_len = PAGE_ALIGN(linebytes * info->var.yres); - if (!strcmp(dp->parent->name, "sbus") || - !strcmp(dp->parent->name, "sbi")) { + if (of_node_name_eq(dp->parent, "sbus") || + of_node_name_eq(dp->parent, "sbi")) { info->fix.smem_start = op->resource[0].start; par->iospace = op->resource[0].flags & 
IORESOURCE_BITS; } else { diff --git a/drivers/video/fbdev/cg3.c b/drivers/video/fbdev/cg3.c index 1bd95b02f3aa..6d42def8436b 100644 --- a/drivers/video/fbdev/cg3.c +++ b/drivers/video/fbdev/cg3.c @@ -369,7 +369,7 @@ static int cg3_probe(struct platform_device *op) info->var.red.length = 8; info->var.green.length = 8; info->var.blue.length = 8; - if (!strcmp(dp->name, "cgRDI")) + if (of_node_name_eq(dp, "cgRDI")) par->flags |= CG3_FLAG_RDI; if (par->flags & CG3_FLAG_RDI) cg3_rdi_maybe_fixup_var(&info->var, dp); diff --git a/drivers/video/fbdev/ffb.c b/drivers/video/fbdev/ffb.c index 6b1915872af1..b7aee0c427a8 100644 --- a/drivers/video/fbdev/ffb.c +++ b/drivers/video/fbdev/ffb.c @@ -944,7 +944,7 @@ static int ffb_probe(struct platform_device *op) info->var.accel_flags = FB_ACCELF_TEXT; - if (!strcmp(dp->name, "SUNW,afb")) + if (of_node_name_eq(dp, "SUNW,afb")) par->flags |= FFB_FLAG_AFB; par->board_type = of_getintprop_default(dp, "board_type", 0); diff --git a/drivers/video/fbdev/imsttfb.c b/drivers/video/fbdev/imsttfb.c index 901ca4ed10e9..b268570b019d 100644 --- a/drivers/video/fbdev/imsttfb.c +++ b/drivers/video/fbdev/imsttfb.c @@ -1498,8 +1498,8 @@ static int imsttfb_probe(struct pci_dev *pdev, const struct pci_device_id *ent) switch (pdev->device) { case PCI_DEVICE_ID_IMS_TT128: /* IMS,tt128mbA */ par->ramdac = IBM; - if (dp && ((strcmp(dp->name, "IMS,tt128mb8") == 0) || - (strcmp(dp->name, "IMS,tt128mb8A") == 0))) + if (of_node_name_eq(dp, "IMS,tt128mb8") || + of_node_name_eq(dp, "IMS,tt128mb8A")) par->ramdac = TVP; break; case PCI_DEVICE_ID_IMS_TT3D: /* IMS,tt3d */ diff --git a/drivers/video/fbdev/offb.c b/drivers/video/fbdev/offb.c index 057d3cdef92e..8f62e174990b 100644 --- a/drivers/video/fbdev/offb.c +++ b/drivers/video/fbdev/offb.c @@ -646,7 +646,7 @@ static void __init offb_init_nodriver(struct device_node *dp, int no_real_node) } #endif /* kludge for valkyrie */ - if (strcmp(dp->name, "valkyrie") == 0) + if (of_node_name_eq(dp, "valkyrie")) address += 0x1000; offb_init_fb(no_real_node ? 
"bootx" : NULL, width, height, depth, pitch, address, diff --git a/drivers/video/fbdev/omap2/omapfb/dss/dss-of.c b/drivers/video/fbdev/omap2/omapfb/dss/dss-of.c index f1eb8b0f8a2a..5ce893c1923d 100644 --- a/drivers/video/fbdev/omap2/omapfb/dss/dss-of.c +++ b/drivers/video/fbdev/omap2/omapfb/dss/dss-of.c @@ -60,7 +60,7 @@ omapdss_of_get_next_port(const struct device_node *parent, return NULL; } prev = port; - } while (of_node_cmp(port->name, "port") != 0); + } while (!of_node_name_eq(port, "port")); of_node_put(ports); } @@ -83,7 +83,7 @@ omapdss_of_get_next_endpoint(const struct device_node *parent, if (!ep) return NULL; prev = ep; - } while (of_node_cmp(ep->name, "endpoint") != 0); + } while (!of_node_name_eq(ep, "endpoint")); return ep; } From f1fbbf5cfaa8f5e651c85a9c504ed65e96b1b51b Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Fri, 8 Feb 2019 19:24:46 +0100 Subject: [PATCH 075/855] omapfb: fix typo Fix spelling mistake: "lenght" -> "length" Signed-off-by: Matteo Croce Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/video/fbdev/omap2/omapfb/dss/hdmi4_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/fbdev/omap2/omapfb/dss/hdmi4_core.c b/drivers/video/fbdev/omap2/omapfb/dss/hdmi4_core.c index fa72e735dad2..d146793dd044 100644 --- a/drivers/video/fbdev/omap2/omapfb/dss/hdmi4_core.c +++ b/drivers/video/fbdev/omap2/omapfb/dss/hdmi4_core.c @@ -712,7 +712,7 @@ int hdmi4_audio_config(struct hdmi_core_data *core, struct hdmi_wp_data *wp, else acore.i2s_cfg.justification = HDMI_AUDIO_JUSTIFY_RIGHT; /* - * The I2S input word length is twice the lenght given in the IEC-60958 + * The I2S input word length is twice the length given in the IEC-60958 * status word. If the word size is greater than * 20 bits, increment by one. */ From 89ef5c6a7584f9d7f8f11def67d2e8a2312ec9fe Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Fri, 8 Feb 2019 19:24:46 +0100 Subject: [PATCH 076/855] video: offb: annotate implicit fall throughs There is a plan to build the kernel with -Wimplicit-fallthrough and these places in the code produced warnings (W=1). Fix them up. This commit remove the following warnings: drivers/video/fbdev/offb.c:211:5: warning: this statement may fall through [-Wimplicit-fallthrough=] drivers/video/fbdev/offb.c:142:3: warning: this statement may fall through [-Wimplicit-fallthrough=] Signed-off-by: Mathieu Malaterre Acked-by: Gustavo A. R. 
Silva Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/video/fbdev/offb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/video/fbdev/offb.c b/drivers/video/fbdev/offb.c index 8f62e174990b..fbc6eafb63c7 100644 --- a/drivers/video/fbdev/offb.c +++ b/drivers/video/fbdev/offb.c @@ -141,6 +141,7 @@ static int offb_setcolreg(u_int regno, u_int red, u_int green, u_int blue, /* Clear PALETTE_ACCESS_CNTL in DAC_CNTL */ out_le32(par->cmap_adr + 0x58, in_le32(par->cmap_adr + 0x58) & ~0x20); + /* fall through */ case cmap_r128: /* Set palette index & data */ out_8(par->cmap_adr + 0xb0, regno); @@ -210,6 +211,7 @@ static int offb_blank(int blank, struct fb_info *info) /* Clear PALETTE_ACCESS_CNTL in DAC_CNTL */ out_le32(par->cmap_adr + 0x58, in_le32(par->cmap_adr + 0x58) & ~0x20); + /* fall through */ case cmap_r128: /* Set palette index & data */ out_8(par->cmap_adr + 0xb0, i); From 1ea673ade7448db0f6f2fe423e72b84e33f653fe Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Fri, 8 Feb 2019 19:24:46 +0100 Subject: [PATCH 077/855] video: fbdev: geode: remove ifdef OLPC noise provides olpc_has_dcon() stub for CONFIG_OLPC=n, compiler should just optimize the unneeded bits away. Signed-off-by: Lubomir Rintel Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/video/fbdev/geode/gxfb_core.c | 13 ++----------- drivers/video/fbdev/geode/lxfb_core.c | 13 ++----------- 2 files changed, 4 insertions(+), 22 deletions(-) diff --git a/drivers/video/fbdev/geode/gxfb_core.c b/drivers/video/fbdev/geode/gxfb_core.c index f4f76373b2a8..b1906cf5a8f0 100644 --- a/drivers/video/fbdev/geode/gxfb_core.c +++ b/drivers/video/fbdev/geode/gxfb_core.c @@ -33,6 +33,8 @@ #include #include +#include + #include "gxfb.h" static char *mode_option; @@ -107,9 +109,6 @@ static struct fb_videomode gx_modedb[] = { FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, }; -#ifdef CONFIG_OLPC -#include - static struct fb_videomode gx_dcon_modedb[] = { /* The only mode the DCON has is 1200x900 */ { NULL, 50, 1200, 900, 17460, 24, 8, 4, 5, 8, 3, @@ -128,14 +127,6 @@ static void get_modedb(struct fb_videomode **modedb, unsigned int *size) } } -#else -static void get_modedb(struct fb_videomode **modedb, unsigned int *size) -{ - *modedb = (struct fb_videomode *) gx_modedb; - *size = ARRAY_SIZE(gx_modedb); -} -#endif - static int gxfb_check_var(struct fb_var_screeninfo *var, struct fb_info *info) { if (var->xres > 1600 || var->yres > 1200) diff --git a/drivers/video/fbdev/geode/lxfb_core.c b/drivers/video/fbdev/geode/lxfb_core.c index 138da6cb6cbc..17ab905811b1 100644 --- a/drivers/video/fbdev/geode/lxfb_core.c +++ b/drivers/video/fbdev/geode/lxfb_core.c @@ -23,6 +23,8 @@ #include #include +#include + #include "lxfb.h" static char *mode_option; @@ -216,9 +218,6 @@ static struct fb_videomode geode_modedb[] = { 0, FB_VMODE_NONINTERLACED, 0 }, }; -#ifdef CONFIG_OLPC -#include - static struct fb_videomode olpc_dcon_modedb[] = { /* The only mode the DCON has is 1200x900 */ { NULL, 50, 1200, 900, 17460, 24, 8, 4, 5, 8, 3, @@ -237,14 +236,6 @@ static void get_modedb(struct fb_videomode **modedb, unsigned int *size) } } -#else -static void get_modedb(struct fb_videomode **modedb, unsigned int *size) -{ - *modedb = (struct fb_videomode *) geode_modedb; - *size = ARRAY_SIZE(geode_modedb); -} -#endif - static int lxfb_check_var(struct fb_var_screeninfo *var, struct fb_info *info) { if (var->xres > 1920 || var->yres > 1440) From 60d2fa0dad0687b4d227aacc042ccdbea1782d87 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 8 Feb 2019 19:24:47 +0100 
Subject: [PATCH 078/855] fbdev: omap2: no need to check return value of debugfs_create functions When calling debugfs functions, there is no need to ever check the return value. The function can work or not, but the code logic should never do something different based on this. Signed-off-by: Greg Kroah-Hartman Acked-by: Tony Lindgren Cc: Mauro Carvalho Chehab Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/video/fbdev/omap2/omapfb/dss/core.c | 31 ++++----------------- drivers/video/fbdev/omap2/omapfb/dss/dss.h | 2 +- 2 files changed, 7 insertions(+), 26 deletions(-) diff --git a/drivers/video/fbdev/omap2/omapfb/dss/core.c b/drivers/video/fbdev/omap2/omapfb/dss/core.c index b4bcf3a4a647..7e6a3eb266d0 100644 --- a/drivers/video/fbdev/omap2/omapfb/dss/core.c +++ b/drivers/video/fbdev/omap2/omapfb/dss/core.c @@ -110,19 +110,12 @@ DEFINE_SHOW_ATTRIBUTE(dss); static struct dentry *dss_debugfs_dir; -static int dss_initialize_debugfs(void) +static void dss_initialize_debugfs(void) { dss_debugfs_dir = debugfs_create_dir("omapdss", NULL); - if (IS_ERR(dss_debugfs_dir)) { - int err = PTR_ERR(dss_debugfs_dir); - dss_debugfs_dir = NULL; - return err; - } debugfs_create_file("clk", S_IRUGO, dss_debugfs_dir, &dss_debug_dump_clocks, &dss_fops); - - return 0; } static void dss_uninitialize_debugfs(void) @@ -130,24 +123,18 @@ static void dss_uninitialize_debugfs(void) debugfs_remove_recursive(dss_debugfs_dir); } -int dss_debugfs_create_file(const char *name, void (*write)(struct seq_file *)) +void dss_debugfs_create_file(const char *name, void (*write)(struct seq_file *)) { - struct dentry *d; - - d = debugfs_create_file(name, S_IRUGO, dss_debugfs_dir, - write, &dss_fops); - - return PTR_ERR_OR_ZERO(d); + debugfs_create_file(name, S_IRUGO, dss_debugfs_dir, write, &dss_fops); } #else /* CONFIG_FB_OMAP2_DSS_DEBUGFS */ -static inline int dss_initialize_debugfs(void) +static inline void dss_initialize_debugfs(void) { - return 0; } static inline void dss_uninitialize_debugfs(void) { } -int dss_debugfs_create_file(const char *name, void (*write)(struct seq_file *)) +void dss_debugfs_create_file(const char *name, void (*write)(struct seq_file *)) { return 0; } @@ -188,9 +175,7 @@ static int __init omap_dss_probe(struct platform_device *pdev) dss_features_init(omapdss_get_version()); - r = dss_initialize_debugfs(); - if (r) - goto err_debugfs; + dss_initialize_debugfs(); if (def_disp_name) core.default_display_name = def_disp_name; @@ -198,10 +183,6 @@ static int __init omap_dss_probe(struct platform_device *pdev) register_pm_notifier(&omap_dss_pm_notif_block); return 0; - -err_debugfs: - - return r; } static int omap_dss_remove(struct platform_device *pdev) diff --git a/drivers/video/fbdev/omap2/omapfb/dss/dss.h b/drivers/video/fbdev/omap2/omapfb/dss/dss.h index a3cc0ca8f9d2..b1a354494144 100644 --- a/drivers/video/fbdev/omap2/omapfb/dss/dss.h +++ b/drivers/video/fbdev/omap2/omapfb/dss/dss.h @@ -214,7 +214,7 @@ struct platform_device *dss_get_core_pdev(void); int dss_dsi_enable_pads(int dsi_id, unsigned lane_mask); void dss_dsi_disable_pads(int dsi_id, unsigned lane_mask); int dss_set_min_bus_tput(struct device *dev, unsigned long tput); -int dss_debugfs_create_file(const char *name, void (*write)(struct seq_file *)); +void dss_debugfs_create_file(const char *name, void (*write)(struct seq_file *)); /* display */ int dss_suspend_all_devices(void); From 72aed9e31344a62e6cab2c49a27fd0bc740add63 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 8 Feb 2019 19:24:47 +0100 Subject: [PATCH 079/855] fbdev: 
mbx: fix up debugfs file creation There is no need to keep the dentries around for the individual debugfs files, just delete the whole directory all at once at shutdown instead. This also fixes a tiny memory leak where the memory for the pointers to the file dentries was never freed when the device shut down, as well as making the logic of the code a lot simpler. Signed-off-by: Greg Kroah-Hartman Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/video/fbdev/mbx/mbxdebugfs.c | 34 ++++++++-------------------- drivers/video/fbdev/mbx/mbxfb.c | 2 +- 2 files changed, 11 insertions(+), 25 deletions(-) diff --git a/drivers/video/fbdev/mbx/mbxdebugfs.c b/drivers/video/fbdev/mbx/mbxdebugfs.c index 2bd328883178..52cfe0132b25 100644 --- a/drivers/video/fbdev/mbx/mbxdebugfs.c +++ b/drivers/video/fbdev/mbx/mbxdebugfs.c @@ -211,36 +211,22 @@ static const struct file_operations misc_fops = { static void mbxfb_debugfs_init(struct fb_info *fbi) { struct mbxfb_info *mfbi = fbi->par; - struct mbxfb_debugfs_data *dbg; + struct dentry *dir; - dbg = kzalloc(sizeof(struct mbxfb_debugfs_data), GFP_KERNEL); - mfbi->debugfs_data = dbg; + dir = debugfs_create_dir("mbxfb", NULL); + mbfi->debugfs_dir = dir; - dbg->dir = debugfs_create_dir("mbxfb", NULL); - dbg->sysconf = debugfs_create_file("sysconf", 0444, dbg->dir, - fbi, &sysconf_fops); - dbg->clock = debugfs_create_file("clock", 0444, dbg->dir, - fbi, &clock_fops); - dbg->display = debugfs_create_file("display", 0444, dbg->dir, - fbi, &display_fops); - dbg->gsctl = debugfs_create_file("gsctl", 0444, dbg->dir, - fbi, &gsctl_fops); - dbg->sdram = debugfs_create_file("sdram", 0444, dbg->dir, - fbi, &sdram_fops); - dbg->misc = debugfs_create_file("misc", 0444, dbg->dir, - fbi, &misc_fops); + debugfs_create_file("sysconf", 0444, dir, fbi, &sysconf_fops); + debugfs_create_file("clock", 0444, dir, fbi, &clock_fops); + debugfs_create_file("display", 0444, dir, fbi, &display_fops); + debugfs_create_file("gsctl", 0444, dir, fbi, &gsctl_fops); + debugfs_create_file("sdram", 0444, dir, fbi, &sdram_fops); + debugfs_create_file("misc", 0444, dir, fbi, &misc_fops); } static void mbxfb_debugfs_remove(struct fb_info *fbi) { struct mbxfb_info *mfbi = fbi->par; - struct mbxfb_debugfs_data *dbg = mfbi->debugfs_data; - debugfs_remove(dbg->misc); - debugfs_remove(dbg->sdram); - debugfs_remove(dbg->gsctl); - debugfs_remove(dbg->display); - debugfs_remove(dbg->clock); - debugfs_remove(dbg->sysconf); - debugfs_remove(dbg->dir); + debugfs_remove_recursive(mfbi->debugfs_dir); } diff --git a/drivers/video/fbdev/mbx/mbxfb.c b/drivers/video/fbdev/mbx/mbxfb.c index 539b85da0897..6ded480a69b4 100644 --- a/drivers/video/fbdev/mbx/mbxfb.c +++ b/drivers/video/fbdev/mbx/mbxfb.c @@ -74,7 +74,7 @@ struct mbxfb_info { u32 pseudo_palette[MAX_PALETTES]; #ifdef CONFIG_FB_MBX_DEBUG - void *debugfs_data; + struct dentry *debugfs_dir; #endif }; From a41458d74c241f2c39ee23fef88df166c3c26a85 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Fri, 8 Feb 2019 19:24:47 +0100 Subject: [PATCH 080/855] video/fbdev: refactor video= cmdline parsing Make the video_setup() function slightly easier to read by removing the repeated checks for !global. Remove the misleading return value comment while at it. I'm slightly hesitant to change any of this, but here goes anyway, with hopes that the next person to have to look at this has it a wee bit easier. 
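For context, the three branches of video_setup() correspond to command lines like the following (example values, not taken from the patch):

  video=ofonly                 use only the OF framebuffer driver
  video=1024x768-16@60         no ':' present, recorded as the global fb_mode_option
  video=vesafb:ywrap,mtrr      contains ':', stored in video_options[] and later
                               returned by fb_get_options("vesafb", ...)
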
Signed-off-by: Jani Nikula Reviewed-by: Daniel Vetter Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/video/fbdev/core/fb_cmdline.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/drivers/video/fbdev/core/fb_cmdline.c b/drivers/video/fbdev/core/fb_cmdline.c index 39509ccd92f1..3b5bd666b952 100644 --- a/drivers/video/fbdev/core/fb_cmdline.c +++ b/drivers/video/fbdev/core/fb_cmdline.c @@ -75,36 +75,33 @@ EXPORT_SYMBOL(fb_get_options); * NOTE: This function is a __setup and __init function. * It only stores the options. Drivers have to call * fb_get_options() as necessary. - * - * Returns zero. - * */ static int __init video_setup(char *options) { - int i, global = 0; - if (!options || !*options) - global = 1; + goto out; - if (!global && !strncmp(options, "ofonly", 6)) { + if (!strncmp(options, "ofonly", 6)) { ofonly = 1; - global = 1; + goto out; } - if (!global && !strchr(options, ':')) { - fb_mode_option = options; - global = 1; - } + if (strchr(options, ':')) { + /* named */ + int i; - if (!global) { for (i = 0; i < FB_MAX; i++) { if (video_options[i] == NULL) { video_options[i] = options; break; } } + } else { + /* global */ + fb_mode_option = options; } +out: return 1; } __setup("video=", video_setup); From a5399db139cb3ad9b8502d8b1bd02da9ce0b9df0 Mon Sep 17 00:00:00 2001 From: Manfred Schlaegl Date: Fri, 8 Feb 2019 19:24:47 +0100 Subject: [PATCH 081/855] fbdev: fbmem: fix memory access if logo is bigger than the screen There is no clipping on the x or y axis for logos larger that the framebuffer size. Therefore: a logo bigger than screen size leads to invalid memory access: [ 1.254664] Backtrace: [ 1.254728] [] (cfb_imageblit) from [] (fb_show_logo+0x620/0x684) [ 1.254763] r10:00000003 r9:00027fd8 r8:c6a40000 r7:c6a36e50 r6:00000000 r5:c06b81e4 [ 1.254774] r4:c6a3e800 [ 1.254810] [] (fb_show_logo) from [] (fbcon_switch+0x3fc/0x46c) [ 1.254842] r10:c6a3e824 r9:c6a3e800 r8:00000000 r7:c6a0c000 r6:c070b014 r5:c6a3e800 [ 1.254852] r4:c6808c00 [ 1.254889] [] (fbcon_switch) from [] (redraw_screen+0xf0/0x1e8) [ 1.254918] r10:00000000 r9:00000000 r8:00000000 r7:00000000 r6:c070d5a0 r5:00000080 [ 1.254928] r4:c6808c00 [ 1.254961] [] (redraw_screen) from [] (do_bind_con_driver+0x194/0x2e4) [ 1.254991] r9:00000000 r8:00000000 r7:00000014 r6:c070d5a0 r5:c070d5a0 r4:c070d5a0 So prevent displaying a logo bigger than screen size and avoid invalid memory access. Signed-off-by: Manfred Schlaegl Signed-off-by: Martin Kepplinger Cc: Daniel Vetter Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/video/fbdev/core/fbmem.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index cb43a2258c51..4721491e6c8c 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -431,6 +431,9 @@ static void fb_do_show_logo(struct fb_info *info, struct fb_image *image, { unsigned int x; + if (image->width > info->var.xres || image->height > info->var.yres) + return; + if (rotate == FB_ROTATE_UR) { for (x = 0; x < num && image->dx + image->width <= info->var.xres; From 7d7e58d30e046d3471295a256d15e0b6bdd502c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Vok=C3=A1=C4=8D?= Date: Fri, 8 Feb 2019 19:24:48 +0100 Subject: [PATCH 082/855] dt-bindings: display: ssd1307fb: Remove reset-active-low from examples MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reset-active-low property has been removed brom the binding a while ago. 
So remove it from the examples as well. Fixes: 519b4db ("fbdev: ssd1307fb: Remove reset-active-low from the DT binding document") Reviewed-by: Rob Herring Reviewed-by: Alexandre Belloni Cc: Shawn Guo Cc: Fabio Estevam Cc: Maxime Ripard , Signed-off-by: Michal Vokáč Signed-off-by: Bartlomiej Zolnierkiewicz --- Documentation/devicetree/bindings/display/ssd1307fb.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/Documentation/devicetree/bindings/display/ssd1307fb.txt b/Documentation/devicetree/bindings/display/ssd1307fb.txt index 209d931ef16c..b67f8caa212c 100644 --- a/Documentation/devicetree/bindings/display/ssd1307fb.txt +++ b/Documentation/devicetree/bindings/display/ssd1307fb.txt @@ -36,7 +36,6 @@ ssd1307: oled@3c { reg = <0x3c>; pwms = <&pwm 4 3000>; reset-gpios = <&gpio2 7>; - reset-active-low; }; ssd1306: oled@3c { @@ -44,7 +43,6 @@ ssd1306: oled@3c { reg = <0x3c>; pwms = <&pwm 4 3000>; reset-gpios = <&gpio2 7>; - reset-active-low; solomon,com-lrremap; solomon,com-invdir; solomon,com-offset = <32>; From af4b3a71a5c8204d00c1124c9297a0c48a53ec65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Vok=C3=A1=C4=8D?= Date: Fri, 8 Feb 2019 19:24:48 +0100 Subject: [PATCH 083/855] video: ssd1307fb: Do not hard code active-low reset sequence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SSD130x OLED display reset signal is active low. Now the reset sequence is implemented in such a way that users are forced to define reset-gpios as GPIO_ACTIVE_HIGH in DT to make the reset work. Do not hard code the active-low sequence into the driver but instead allow the user to specify the gpio as GPIO_ACTIVE_LOW to reflect the real world. Reviewed-by: Rob Herring Reviewed-by: Alexandre Belloni Cc: Shawn Guo Cc: Fabio Estevam Cc: Maxime Ripard , Signed-off-by: Michal Vokáč Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/video/fbdev/ssd1307fb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/video/fbdev/ssd1307fb.c b/drivers/video/fbdev/ssd1307fb.c index 4061a20cfe24..3b361bc9feb8 100644 --- a/drivers/video/fbdev/ssd1307fb.c +++ b/drivers/video/fbdev/ssd1307fb.c @@ -667,10 +667,10 @@ static int ssd1307fb_probe(struct i2c_client *client, if (par->reset) { /* Reset the screen */ - gpiod_set_value_cansleep(par->reset, 0); - udelay(4); gpiod_set_value_cansleep(par->reset, 1); udelay(4); + gpiod_set_value_cansleep(par->reset, 0); + udelay(4); } if (par->vbat_reg) { From 1ac1d4845c013af9ca003adda7dfedb25a8e42ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Vok=C3=A1=C4=8D?= Date: Fri, 8 Feb 2019 19:24:48 +0100 Subject: [PATCH 084/855] ARM: dts: imx28-cfa10036: Fix the reset gpio signal polarity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reset signal of the SSD1306 OLED display is actually active-low. Adapt the DT to reflect the real world. 
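A note on the convention both patches rely on: the descriptor-based GPIO API works with logical values, so once the device tree declares the line GPIO_ACTIVE_LOW, the driver asserts reset by writing 1 and the GPIO core drives the physical level low. The resulting reset pulse is the same code as the ssd1307fb hunk above, shown here with explanatory comments added:

        gpiod_set_value_cansleep(par->reset, 1);        /* assert reset: line driven low */
        udelay(4);
        gpiod_set_value_cansleep(par->reset, 0);        /* release reset: line back high */
        udelay(4);
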
Reviewed-by: Rob Herring Acked-by: Alexandre Belloni Cc: Shawn Guo Cc: Fabio Estevam Cc: Maxime Ripard , Signed-off-by: Michal Vokáč Signed-off-by: Bartlomiej Zolnierkiewicz --- arch/arm/boot/dts/imx28-cfa10036.dts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/imx28-cfa10036.dts b/arch/arm/boot/dts/imx28-cfa10036.dts index d3e3622979c5..de48b5808ef6 100644 --- a/arch/arm/boot/dts/imx28-cfa10036.dts +++ b/arch/arm/boot/dts/imx28-cfa10036.dts @@ -11,6 +11,7 @@ /dts-v1/; #include "imx28.dtsi" +#include / { model = "Crystalfontz CFA-10036 Board"; @@ -96,7 +97,7 @@ pinctrl-names = "default"; pinctrl-0 = <&ssd1306_cfa10036>; reg = <0x3c>; - reset-gpios = <&gpio2 7 0>; + reset-gpios = <&gpio2 7 GPIO_ACTIVE_LOW>; solomon,height = <32>; solomon,width = <128>; solomon,page-offset = <0>; From a1939185c7a907f91b40da259a610ce2e2da9e18 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Fri, 8 Feb 2019 19:24:49 +0100 Subject: [PATCH 085/855] printk: Export console_printk The fbcon can be built as a module and requires console_printk. Export console_printk. Signed-off-by: Prarit Bhargava Reviewed-by: Sergey Senozhatsky Acked-by: Petr Mladek Signed-off-by: Bartlomiej Zolnierkiewicz --- kernel/printk/printk.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index d3d170374ceb..8201019d1fff 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -65,6 +65,7 @@ int console_printk[4] = { CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ }; +EXPORT_SYMBOL_GPL(console_printk); atomic_t ignore_console_lock_warning __read_mostly = ATOMIC_INIT(0); EXPORT_SYMBOL(ignore_console_lock_warning); From 10993504d647356196a04b3022d645ec92e00159 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Fri, 8 Feb 2019 19:24:49 +0100 Subject: [PATCH 086/855] fbcon: Silence fbcon logo on 'quiet' boots On text-based systems the 'quiet' boot option will show printk levels higher than CONSOLE_LOGLEVEL_QUIET. The displaying of the Tux logo during boot can cause some consoles to lose display data and as a result confuse the end user. Do not display the Tux logo on systems that are in 'quiet' boot. v2: It helps to commit all my changes before sending them. Remove extra bracket. 
v3: buildbot error fix: fbcon can be built as part of a module so export console_printk v4: move console_printk change to separate patch, and drop logo cleanup v5: Only set FBCON_LOGO_DONTSHOW for console loglevel Signed-off-by: Prarit Bhargava Cc: Hans de Goede Cc: Marko Myllynen Cc: Steven Rostedt (VMware) Cc: Kees Cook Cc: Daniel Vetter Cc: Thierry Reding Cc: Yisheng Xie Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/video/fbdev/core/fbcon.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index fd430e6b4c63..cd059a801662 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -656,11 +656,14 @@ static void fbcon_prepare_logo(struct vc_data *vc, struct fb_info *info, kfree(save); } + if (logo_shown == FBCON_LOGO_DONTSHOW) + return; + if (logo_lines > vc->vc_bottom) { logo_shown = FBCON_LOGO_CANSHOW; printk(KERN_INFO "fbcon_init: disable boot-logo (boot-logo bigger than screen).\n"); - } else if (logo_shown != FBCON_LOGO_DONTSHOW) { + } else { logo_shown = FBCON_LOGO_DRAW; vc->vc_top = logo_lines; } @@ -1066,6 +1069,9 @@ static void fbcon_init(struct vc_data *vc, int init) cap = info->flags; + if (console_loglevel <= CONSOLE_LOGLEVEL_QUIET) + logo_shown = FBCON_LOGO_DONTSHOW; + if (vc != svc || logo_shown == FBCON_LOGO_DONTSHOW || (info->fix.type == FB_TYPE_TEXT)) logo = 0; From f40298444e8ca13789dc002ffb269c343aadfb87 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 8 Feb 2019 19:24:49 +0100 Subject: [PATCH 087/855] video: fbdev: Fix potential NULL pointer dereference There is a potential NULL pointer dereference in case fb_create_modedb() fails and returns NULL. Signed-off-by: YueHaibing Cc: Kees Cook Cc: Rob Herring Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/video/fbdev/core/fbmon.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/video/fbdev/core/fbmon.c b/drivers/video/fbdev/core/fbmon.c index dd3128990776..3558a70a6664 100644 --- a/drivers/video/fbdev/core/fbmon.c +++ b/drivers/video/fbdev/core/fbmon.c @@ -978,6 +978,8 @@ void fb_edid_to_monspecs(unsigned char *edid, struct fb_monspecs *specs) get_monspecs(edid, specs); specs->modedb = fb_create_modedb(edid, &specs->modedb_len, specs); + if (!specs->modedb) + return; /* * Workaround for buggy EDIDs that sets that the first From ef092dec043c0166b8aea0e56d1db8757877e40b Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Mon, 15 Oct 2018 14:02:40 +0200 Subject: [PATCH 088/855] drm/etnaviv: mmuv2: don't map zero page Keep the page at address 0 as faulting to catch any potential state setup issues early. 
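Put differently (an interpretation, not wording from the commit): if the page at IOVA 0 is never mapped, a GPU job that dereferences an unprogrammed (zero) address raises an MMU fault that is caught immediately, instead of silently reading or writing whatever buffer happened to be mapped at address 0. The corresponding address-space setup is simply:

        domain->base = SZ_4K;                   /* leave the zero page unmapped */
        domain->size = (u64)SZ_1G * 4 - SZ_4K;  /* 4 GiB aperture minus that one page */
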
Signed-off-by: Lucas Stach Reviewed-by: Christian Gmeiner --- drivers/gpu/drm/etnaviv/etnaviv_iommu_v2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_iommu_v2.c b/drivers/gpu/drm/etnaviv/etnaviv_iommu_v2.c index f1c88d8ad5ba..f794e04be9e6 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_iommu_v2.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_iommu_v2.c @@ -320,8 +320,8 @@ etnaviv_iommuv2_domain_alloc(struct etnaviv_gpu *gpu) domain = &etnaviv_domain->base; domain->dev = gpu->dev; - domain->base = 0; - domain->size = (u64)SZ_1G * 4; + domain->base = SZ_4K; + domain->size = (u64)SZ_1G * 4 - SZ_4K; domain->ops = &etnaviv_iommuv2_ops; ret = etnaviv_iommuv2_init(etnaviv_domain); From 9e05352340d3a3e68c144136db9810b26ebb88c3 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 5 Feb 2019 12:08:19 +0300 Subject: [PATCH 089/855] drm/etnaviv: potential NULL dereference The etnaviv_gem_prime_get_sg_table() is supposed to return error pointers. Otherwise it can lead to a NULL dereference when it's called from drm_gem_map_dma_buf(). Fixes: 5f4a4a73f437 ("drm/etnaviv: fix gem_prime_get_sg_table to return new SG table") Signed-off-by: Dan Carpenter Reviewed-by: Christian Gmeiner Signed-off-by: Lucas Stach --- drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c b/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c index 0566171f8df2..f21529e635e3 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c @@ -15,7 +15,7 @@ struct sg_table *etnaviv_gem_prime_get_sg_table(struct drm_gem_object *obj) int npages = obj->size >> PAGE_SHIFT; if (WARN_ON(!etnaviv_obj->pages)) /* should have already pinned! */ - return NULL; + return ERR_PTR(-EINVAL); return drm_prime_pages_to_sg(etnaviv_obj->pages, npages); } From 266d63a7d9d48c6d5dee486378ec0e8c86c4d74a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 11 Feb 2019 12:23:51 +0100 Subject: [PATCH 090/855] x86/cpufeature: Fix various quality problems in the header Thomas noticed that the new arch/x86/include/asm/cpu_device_id.h header is a train-wreck that didn't incorporate review feedback like not using __u8 in kernel-only headers. While at it also fix all the *other* problems this header has: - Use canonical names for the header guards. It's inexplicable why a non-standard guard was used. - Don't define the header guard to 1. Plus annotate the closing #endif as done absolutely every other header. Again, an inexplicable source of noise. - Move the kernel API calls provided by this header next to each other, there's absolutely no reason to have them spread apart in the header. - Align the INTEL_CPU_DESC() macro initializations vertically, this is easier to read and it's also the canonical style. - Actually name the macro arguments properly: instead of 'mod, step, rev', spell out 'model, stepping, revision' - it's not like we have a lack of characters in this header. - Actually make arguments macro-safe - again it's inexplicable why it wasn't done properly to begin with. Quite amazing how many problems a 41 lines header can contain. This kind of code quality is unacceptable, and it slipped through the review net of 2 developers and 2 maintainers, including myself, until Thomas noticed it. 
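To make the intended use of the cleaned-up header concrete, a hypothetical caller would look like the sketch below; the table contents (models, steppings, revisions) are invented for illustration and are not part of the patch:

#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>

/* Hypothetical quirk table: model, stepping, minimum microcode revision. */
static const struct x86_cpu_desc quirk_ucodes[] = {
        INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X,       4, 0x00000021),
        INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_MOBILE, 9, 0x0000004e),
        {}
};

static bool quirk_microcode_ok(void)
{
        /* true when the boot CPU's microcode is at least the listed revision */
        return x86_cpu_has_min_microcode_rev(quirk_ucodes);
}
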
:-/ Reported-by: Thomas Gleixner Cc: Andi Kleen Cc: Kan Liang Cc: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpu_device_id.h | 31 ++++++++++++++-------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h index 3417110574c1..31c379c1da41 100644 --- a/arch/x86/include/asm/cpu_device_id.h +++ b/arch/x86/include/asm/cpu_device_id.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _CPU_DEVICE_ID -#define _CPU_DEVICE_ID 1 +#ifndef _ASM_X86_CPU_DEVICE_ID +#define _ASM_X86_CPU_DEVICE_ID /* * Declare drivers belonging to specific x86 CPUs @@ -9,8 +9,6 @@ #include -extern const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match); - /* * Match specific microcode revisions. * @@ -22,21 +20,22 @@ extern const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match); */ struct x86_cpu_desc { - __u8 x86_family; - __u8 x86_vendor; - __u8 x86_model; - __u8 x86_stepping; - __u32 x86_microcode_rev; + u8 x86_family; + u8 x86_vendor; + u8 x86_model; + u8 x86_stepping; + u32 x86_microcode_rev; }; -#define INTEL_CPU_DESC(mod, step, rev) { \ - .x86_family = 6, \ - .x86_vendor = X86_VENDOR_INTEL, \ - .x86_model = mod, \ - .x86_stepping = step, \ - .x86_microcode_rev = rev, \ +#define INTEL_CPU_DESC(model, stepping, revision) { \ + .x86_family = 6, \ + .x86_vendor = X86_VENDOR_INTEL, \ + .x86_model = (model), \ + .x86_stepping = (stepping), \ + .x86_microcode_rev = (revision), \ } +extern const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match); extern bool x86_cpu_has_min_microcode_rev(const struct x86_cpu_desc *table); -#endif +#endif /* _ASM_X86_CPU_DEVICE_ID */ From cce8e04cf79e47809455215744685e8eb56f94bb Mon Sep 17 00:00:00 2001 From: Paul Selles Date: Thu, 6 Dec 2018 21:30:50 +0800 Subject: [PATCH 091/855] ntb_hw_switchtec: debug print 64bit aligned crosslink BAR Numbers Switchtec NTB crosslink BARs are 64bit addressed but they are printed as 32bit addressed BARs. Fix debug log to increment the BAR numbers by 2 to reflect the 64bit address alignment. Fixes: 017525018202 ("ntb_hw_switchtec: Add initialization code for crosslink") Signed-off-by: Paul Selles Signed-off-by: Wesley Sheng Reviewed-by: Logan Gunthorpe Signed-off-by: Jon Mason --- drivers/ntb/hw/mscc/ntb_hw_switchtec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c index 5ee5f40b4dfc..9916bc5b6759 100644 --- a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c +++ b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c @@ -1120,7 +1120,7 @@ static int crosslink_enum_partition(struct switchtec_ntb *sndev, dev_dbg(&sndev->stdev->dev, "Crosslink BAR%d addr: %llx\n", - i, bar_addr); + i*2, bar_addr); if (bar_addr != bar_space * i) continue; From d123fab71f63aae129aebe052664fda73131921a Mon Sep 17 00:00:00 2001 From: Wesley Sheng Date: Thu, 6 Dec 2018 21:30:51 +0800 Subject: [PATCH 092/855] ntb_hw_switchtec: NT req id mapping table register entry number should be 512 The number of available NT req id mapping table entries per NTB control register is 512. The driver mistakenly limits the number to 256. Fix the array size of NT req id mapping table. 
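The accompanying change to reserved3 is what keeps the rest of the register map in place. A quick size check (derived from the struct as shown, not stated in the commit message):

  old: req_id_table[256] + reserved3[512] = (256 + 512) * 4 = 3072 bytes
  new: req_id_table[512] + reserved3[256] = (512 + 256) * 4 = 3072 bytes

so lut_entry[] and every later field keep their offsets within struct ntb_ctrl_regs.
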
Fixes: c082b04c9d40 ("NTB: switchtec: Add NTB hardware register definitions") Signed-off-by: Wesley Sheng Reviewed-by: Logan Gunthorpe Signed-off-by: Jon Mason --- include/linux/switchtec.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/switchtec.h b/include/linux/switchtec.h index eee0412bdf4b..32b282cd0ead 100644 --- a/include/linux/switchtec.h +++ b/include/linux/switchtec.h @@ -249,8 +249,8 @@ struct ntb_ctrl_regs { u64 xlate_addr; } bar_entry[6]; u32 reserved2[216]; - u32 req_id_table[256]; - u32 reserved3[512]; + u32 req_id_table[512]; + u32 reserved3[256]; u64 lut_entry[512]; } __packed; From a2585cdc9e4cda6afaea5f5687eaabce3bebbb2c Mon Sep 17 00:00:00 2001 From: Paul Selles Date: Thu, 6 Dec 2018 21:30:52 +0800 Subject: [PATCH 093/855] ntb_hw_switchtec: Added support of >=4G memory windows Current Switchtec's BAR setup registers are limited to 32bits, corresponding to the maximum MW (memory window) size is <4G. Increase the MW sizes with the addition of the BAR Setup Extension Register for the upper 32bits of a 64bits MW size. This increases the MW range to between 4K and 2^63. Reported-by: Boris Glimcher Signed-off-by: Paul Selles Signed-off-by: Wesley Sheng Reviewed-by: Logan Gunthorpe Signed-off-by: Jon Mason --- drivers/ntb/hw/mscc/ntb_hw_switchtec.c | 9 +++++++-- include/linux/switchtec.h | 6 +++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c index 9916bc5b6759..f6f00354047b 100644 --- a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c +++ b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c @@ -264,6 +264,7 @@ static void switchtec_ntb_mw_clr_direct(struct switchtec_ntb *sndev, int idx) ctl_val &= ~NTB_CTRL_BAR_DIR_WIN_EN; iowrite32(ctl_val, &ctl->bar_entry[bar].ctl); iowrite32(0, &ctl->bar_entry[bar].win_size); + iowrite32(0, &ctl->bar_ext_entry[bar].win_size); iowrite64(sndev->self_partition, &ctl->bar_entry[bar].xlate_addr); } @@ -286,7 +287,9 @@ static void switchtec_ntb_mw_set_direct(struct switchtec_ntb *sndev, int idx, ctl_val |= NTB_CTRL_BAR_DIR_WIN_EN; iowrite32(ctl_val, &ctl->bar_entry[bar].ctl); - iowrite32(xlate_pos | size, &ctl->bar_entry[bar].win_size); + iowrite32(xlate_pos | (lower_32_bits(size) & 0xFFFFF000), + &ctl->bar_entry[bar].win_size); + iowrite32(upper_32_bits(size), &ctl->bar_ext_entry[bar].win_size); iowrite64(sndev->self_partition | addr, &ctl->bar_entry[bar].xlate_addr); } @@ -1053,7 +1056,9 @@ static int crosslink_setup_mws(struct switchtec_ntb *sndev, int ntb_lut_idx, ctl_val |= NTB_CTRL_BAR_DIR_WIN_EN; iowrite32(ctl_val, &ctl->bar_entry[bar].ctl); - iowrite32(xlate_pos | size, &ctl->bar_entry[bar].win_size); + iowrite32(xlate_pos | (lower_32_bits(size) & 0xFFFFF000), + &ctl->bar_entry[bar].win_size); + iowrite32(upper_32_bits(size), &ctl->bar_ext_entry[bar].win_size); iowrite64(sndev->peer_partition | addr, &ctl->bar_entry[bar].xlate_addr); } diff --git a/include/linux/switchtec.h b/include/linux/switchtec.h index 32b282cd0ead..52a079b3a9a6 100644 --- a/include/linux/switchtec.h +++ b/include/linux/switchtec.h @@ -248,7 +248,11 @@ struct ntb_ctrl_regs { u32 win_size; u64 xlate_addr; } bar_entry[6]; - u32 reserved2[216]; + struct { + u32 win_size; + u32 reserved[3]; + } bar_ext_entry[6]; + u32 reserved2[192]; u32 req_id_table[512]; u32 reserved3[256]; u64 lut_entry[512]; From 9143595a7e05a848384c240d34abcc4740a65897 Mon Sep 17 00:00:00 2001 From: Joey Zhang Date: Mon, 7 Jan 2019 11:12:56 +0800 Subject: [PATCH 094/855] NTB: 
ntb_transport: Free MWs in ntb_transport_link_cleanup() If NTB peer host crashes or reboots, the NTB transport link will be down and the MWs of NTB transport will be invalid. But the ntb_transport_link_cleanup() does not free these invalid MWs. When the NTB peer host is recovered later, NTB transport link will be up and the ntb_set_mw() will not reset up MWs. Because the MWs of NTB transport are invalid, the NTB transport will not work. We can fix it by freeing MWs when NTB transport link is down, then the ntb_set_mw() will reset up MWs when NTB transport link is up. Signed-off-by: Joey Zhang Reviewed-by: Logan Gunthorpe Signed-off-by: Jon Mason --- drivers/ntb/ntb_transport.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/ntb/ntb_transport.c b/drivers/ntb/ntb_transport.c index 3bfdb4562408..6e8902d03a69 100644 --- a/drivers/ntb/ntb_transport.c +++ b/drivers/ntb/ntb_transport.c @@ -862,6 +862,9 @@ static void ntb_transport_link_cleanup(struct ntb_transport_ctx *nt) if (!nt->link_is_up) cancel_delayed_work_sync(&nt->link_work); + for (i = 0; i < nt->mw_count; i++) + ntb_free_mw(nt, i); + /* The scratchpad registers keep the values if the remote side * goes down, blast them now to give them a sane value the next * time they are accessed From c59666bb32b91da84b1d3df0e88789de9e827f61 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 18 Jan 2019 17:10:01 -0700 Subject: [PATCH 095/855] NTB: ntb_transport: Ensure the destination buffer is mapped for TX DMA Presently, when ntb_transport is used with DMA and the IOMMU turned on, it fails with errors from the IOMMU such as: DMAR: DRHD: handling fault status reg 202 DMAR: [DMA Write] Request device [00:04.0] fault addr 381fc0340000 [fault reason 05] PTE Write access is not set This is because ntb_transport does not map the BAR space with the IOMMU. To fix this, we map the entire MW region for each QP after we assign the DMA channel. This prevents needing an extra DMA map in the fast path. 
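The central primitive here is dma_map_resource(), which creates an IOMMU mapping for MMIO (the peer's memory window) rather than for system RAM; doing it once when the DMA channel is assigned keeps the per-packet path free of additional mapping calls. A condensed sketch of what the patch sets up (simplified from the diff below, error unwinding abbreviated):

        /* Once, when the TX DMA channel is chosen: map the whole memory window. */
        qp->tx_mw_dma_addr = dma_map_resource(chan->device->dev,
                                              qp->tx_mw_phys, qp->tx_mw_size,
                                              DMA_FROM_DEVICE, 0);
        if (dma_mapping_error(chan->device->dev, qp->tx_mw_dma_addr))
                goto err;       /* see the diff below for the actual unwind path */

        /* Per transmit, only an offset into the pre-mapped window remains: */
        dest = qp->tx_mw_dma_addr + qp->tx_max_frame * entry->tx_index;
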
Link: https://lore.kernel.org/linux-pci/499934e7-3734-1aee-37dd-b42a5d2a2608@intel.com/ Signed-off-by: Logan Gunthorpe Reviewed-by: Dave Jiang Signed-off-by: Jon Mason --- drivers/ntb/ntb_transport.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/drivers/ntb/ntb_transport.c b/drivers/ntb/ntb_transport.c index 6e8902d03a69..d4f39ba1d976 100644 --- a/drivers/ntb/ntb_transport.c +++ b/drivers/ntb/ntb_transport.c @@ -144,7 +144,9 @@ struct ntb_transport_qp { struct list_head tx_free_q; spinlock_t ntb_tx_free_q_lock; void __iomem *tx_mw; - dma_addr_t tx_mw_phys; + phys_addr_t tx_mw_phys; + size_t tx_mw_size; + dma_addr_t tx_mw_dma_addr; unsigned int tx_index; unsigned int tx_max_entry; unsigned int tx_max_frame; @@ -1052,6 +1054,7 @@ static int ntb_transport_init_queue(struct ntb_transport_ctx *nt, tx_size = (unsigned int)mw_size / num_qps_mw; qp_offset = tx_size * (qp_num / mw_count); + qp->tx_mw_size = tx_size; qp->tx_mw = nt->mw_vec[mw_num].vbase + qp_offset; if (!qp->tx_mw) return -EINVAL; @@ -1647,7 +1650,7 @@ static int ntb_async_tx_submit(struct ntb_transport_qp *qp, dma_cookie_t cookie; device = chan->device; - dest = qp->tx_mw_phys + qp->tx_max_frame * entry->tx_index; + dest = qp->tx_mw_dma_addr + qp->tx_max_frame * entry->tx_index; buff_off = (size_t)buf & ~PAGE_MASK; dest_off = (size_t)dest & ~PAGE_MASK; @@ -1866,6 +1869,18 @@ ntb_transport_create_queue(void *data, struct device *client_dev, qp->rx_dma_chan = NULL; } + if (qp->tx_dma_chan) { + qp->tx_mw_dma_addr = + dma_map_resource(qp->tx_dma_chan->device->dev, + qp->tx_mw_phys, qp->tx_mw_size, + DMA_FROM_DEVICE, 0); + if (dma_mapping_error(qp->tx_dma_chan->device->dev, + qp->tx_mw_dma_addr)) { + qp->tx_mw_dma_addr = 0; + goto err1; + } + } + dev_dbg(&pdev->dev, "Using %s memcpy for TX\n", qp->tx_dma_chan ? "DMA" : "CPU"); @@ -1907,6 +1922,10 @@ err1: qp->rx_alloc_entry = 0; while ((entry = ntb_list_rm(&qp->ntb_rx_q_lock, &qp->rx_free_q))) kfree(entry); + if (qp->tx_mw_dma_addr) + dma_unmap_resource(qp->tx_dma_chan->device->dev, + qp->tx_mw_dma_addr, qp->tx_mw_size, + DMA_FROM_DEVICE, 0); if (qp->tx_dma_chan) dma_release_channel(qp->tx_dma_chan); if (qp->rx_dma_chan) @@ -1948,6 +1967,11 @@ void ntb_transport_free_queue(struct ntb_transport_qp *qp) */ dma_sync_wait(chan, qp->last_cookie); dmaengine_terminate_all(chan); + + dma_unmap_resource(chan->device->dev, + qp->tx_mw_dma_addr, qp->tx_mw_size, + DMA_FROM_DEVICE, 0); + dma_release_channel(chan); } From 65ab26e397555d340290a99e6c736e291526ecc0 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 31 Jan 2019 23:49:21 +0100 Subject: [PATCH 096/855] selftests: kvm: add selftest for releasing VM file descriptor while in L2 This adds a test for the previous bug. 
Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 1 + .../kvm/x86_64/vmx_close_while_nested_test.c | 95 +++++++++++++++++++ 3 files changed, 97 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 6210ba41c29e..2689d1ea6d7a 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -3,6 +3,7 @@ /x86_64/platform_info_test /x86_64/set_sregs_test /x86_64/sync_regs_test +/x86_64/vmx_close_while_nested_test /x86_64/vmx_tsc_adjust_test /x86_64/state_test /dirty_log_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index f9a0e9938480..3c1f4bdf9000 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -16,6 +16,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test TEST_GEN_PROGS_x86_64 += x86_64/state_test TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid +TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test TEST_GEN_PROGS_x86_64 += dirty_log_test TEST_GEN_PROGS_x86_64 += clear_dirty_log_test diff --git a/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c b/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c new file mode 100644 index 000000000000..6edec6fd790b --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c @@ -0,0 +1,95 @@ +/* + * vmx_close_while_nested + * + * Copyright (C) 2019, Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * Verify that nothing bad happens if a KVM user exits with open + * file descriptors while executing a nested guest. + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#include +#include + +#include "kselftest.h" + +#define VCPU_ID 5 + +enum { + PORT_L0_EXIT = 0x2000, +}; + +/* The virtual machine object. */ +static struct kvm_vm *vm; + +static void l2_guest_code(void) +{ + /* Exit to L0 */ + asm volatile("inb %%dx, %%al" + : : [port] "d" (PORT_L0_EXIT) : "rax"); +} + +static void l1_guest_code(struct vmx_pages *vmx_pages) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + uint32_t control; + uintptr_t save_cr3; + + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + + /* Prepare the VMCS for L2 execution. */ + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(0); +} + +int main(int argc, char *argv[]) +{ + struct vmx_pages *vmx_pages; + vm_vaddr_t vmx_pages_gva; + struct kvm_cpuid_entry2 *entry = kvm_get_supported_cpuid_entry(1); + + if (!(entry->ecx & CPUID_VMX)) { + fprintf(stderr, "nested VMX not enabled, skipping test\n"); + exit(KSFT_SKIP); + } + + vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + + /* Allocate VMX pages and shared descriptors (vmx_pages). 
*/ + vmx_pages = vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); + + for (;;) { + volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct ucall uc; + + vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + if (run->io.port == PORT_L0_EXIT) + break; + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_ASSERT(false, "%s", (const char *)uc.args[0]); + /* NOT REACHED */ + default: + TEST_ASSERT(false, "Unknown ucall 0x%x.", uc.cmd); + } + } +} From 61c08aa9606d4e48a8a50639c956448a720174c3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:48 -0800 Subject: [PATCH 097/855] KVM: VMX: Compare only a single byte for VMCS' "launched" in vCPU-run The vCPU-run asm blob does a manual comparison of a VMCS' launched status to execute the correct VM-Enter instruction, i.e. VMLAUNCH vs. VMRESUME. The launched flag is a bool, which is a typedef of _Bool. C99 does not define an exact size for _Bool, stating only that is must be large enough to hold '0' and '1'. Most, if not all, compilers use a single byte for _Bool, including gcc[1]. Originally, 'launched' was of type 'int' and so the asm blob used 'cmpl' to check the launch status. When 'launched' was moved to be stored on a per-VMCS basis, struct vcpu_vmx's "temporary" __launched flag was added in order to avoid having to pass the current VMCS into the asm blob. The new '__launched' was defined as a 'bool' and not an 'int', but the 'cmp' instruction was not updated. This has not caused any known problems, likely due to compilers aligning variables to 4-byte or 8-byte boundaries and KVM zeroing out struct vcpu_vmx during allocation. I.e. vCPU-run accesses "junk" data, it just happens to always be zero and so doesn't affect the result. [1] https://gcc.gnu.org/ml/gcc-patches/2000-10/msg01127.html Fixes: d462b8192368 ("KVM: VMX: Keep list of loaded VMCSs, instead of vcpus") Cc: Reviewed-by: Jim Mattson Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 95d618045001..fdb6305cc971 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6402,7 +6402,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "mov %%" _ASM_AX", %%cr2 \n\t" "3: \n\t" /* Check if vmlaunch or vmresume is needed */ - "cmpl $0, %c[launched](%%" _ASM_CX ") \n\t" + "cmpb $0, %c[launched](%%" _ASM_CX ") \n\t" /* Load guest registers. Don't clobber flags. */ "mov %c[rax](%%" _ASM_CX "), %%" _ASM_AX " \n\t" "mov %c[rbx](%%" _ASM_CX "), %%" _ASM_BX " \n\t" From 1ce072cbfd8dba46f117804850398e0b3040a541 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:49 -0800 Subject: [PATCH 098/855] KVM: nVMX: Check a single byte for VMCS "launched" in nested early checks Nested early checks does a manual comparison of a VMCS' launched status in its asm blob to execute the correct VM-Enter instruction, i.e. VMLAUNCH vs. VMRESUME. The launched flag is a bool, which is a typedef of _Bool. C99 does not define an exact size for _Bool, stating only that is must be large enough to hold '0' and '1'. Most, if not all, compilers use a single byte for _Bool, including gcc[1]. 
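As a purely illustrative aside (not part of this patch), the single-byte
assumption can be captured with a compile-time check; C99 itself only
guarantees that _Bool can represent 0 and 1, not its size:

	#include <stdbool.h>

	/* Holds for gcc and clang on the kernel's targets, though it is not
	 * mandated by the C standard. */
	_Static_assert(sizeof(bool) == 1, "bool expected to occupy one byte");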
The use of 'cmpl' instead of 'cmpb' was not deliberate, but rather the result of a copy-paste as the asm blob was directly derived from the asm blob for vCPU-run. This has not caused any known problems, likely due to compilers aligning variables to 4-byte or 8-byte boundaries and KVM zeroing out struct vcpu_vmx during allocation. I.e. vCPU-run accesses "junk" data, it just happens to always be zero and so doesn't affect the result. [1] https://gcc.gnu.org/ml/gcc-patches/2000-10/msg01127.html Fixes: 52017608da33 ("KVM: nVMX: add option to perform early consistency checks via H/W") Cc: Reviewed-by: Jim Mattson Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index d8ea4ebd79e7..50c9e04910ea 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2761,7 +2761,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ /* Check if vmlaunch or vmresume is needed */ - "cmpl $0, %c[launched](%% " _ASM_CX")\n\t" + "cmpb $0, %c[launched](%% " _ASM_CX")\n\t" "call vmx_vmenter\n\t" From 0e0ab73c9a0243736bcd779b30b717e23ba9a56d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:50 -0800 Subject: [PATCH 099/855] KVM: VMX: Zero out *all* general purpose registers after VM-Exit ...except RSP, which is restored by hardware as part of VM-Exit. Paolo theorized that restoring registers from the stack after a VM-Exit in lieu of zeroing them could lead to speculative execution with the guest's values, e.g. if the stack accesses miss the L1 cache[1]. Zeroing XORs are dirt cheap, so just be ultra-paranoid. Note that the scratch register (currently RCX) used to save/restore the guest state is also zeroed as its host-defined value is loaded via the stack, just with a MOV instead of a POP. [1] https://patchwork.kernel.org/patch/10771539/#22441255 Fixes: 0cb5b30698fd ("kvm: vmx: Scrub hardware GPRs at VM-exit") Cc: Cc: Jim Mattson Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index fdb6305cc971..10fee67a6dcd 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6452,10 +6452,15 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "mov %%r13, %c[r13](%%" _ASM_CX ") \n\t" "mov %%r14, %c[r14](%%" _ASM_CX ") \n\t" "mov %%r15, %c[r15](%%" _ASM_CX ") \n\t" + /* - * Clear host registers marked as clobbered to prevent - * speculative use. - */ + * Clear all general purpose registers (except RSP, which is loaded by + * the CPU during VM-Exit) to prevent speculative use of the guest's + * values, even those that are saved/loaded via the stack. In theory, + * an L1 cache miss when restoring registers could lead to speculative + * execution with the guest's values. Zeroing XORs are dirt cheap, + * i.e. the extra paranoia is essentially free. 
+ */ "xor %%r8d, %%r8d \n\t" "xor %%r9d, %%r9d \n\t" "xor %%r10d, %%r10d \n\t" @@ -6470,8 +6475,11 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "xor %%eax, %%eax \n\t" "xor %%ebx, %%ebx \n\t" + "xor %%ecx, %%ecx \n\t" + "xor %%edx, %%edx \n\t" "xor %%esi, %%esi \n\t" "xor %%edi, %%edi \n\t" + "xor %%ebp, %%ebp \n\t" "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" : ASM_CALL_CONSTRAINT : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp), From 831a3011294d9aa284afae9194619893d454d708 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:51 -0800 Subject: [PATCH 100/855] KVM: VMX: Modify only RSP when creating a placeholder for guest's RCX In the vCPU-run asm blob, the guest's RCX is temporarily saved onto the stack after VM-Exit as the exit flow must first load a register with a pointer to the vCPU's save area in order to save the guest's registers. RCX is arbitrarily designated as the scratch register. Since the stack usage is to (1)save host, (2)save guest, (3)load host and (4)load guest, the code can't conform to the stack's natural FIFO semantics, i.e. it can't simply do PUSH/POP. Regardless of whether it is done for the host's value or guest's value, at some point the code needs to access the stack using a non-traditional method, e.g. MOV instead of POP. vCPU-run opts to create a placeholder on the stack for guest's RCX (by adjusting RSP) and saves RCX to its place immediately after VM-Exit (via MOV). In other words, the purpose of the first 'PUSH RCX' at the start of the vCPU-run asm blob is to adjust RSP down, i.e. there's no need to actually access memory. Use 'SUB $wordsize, RSP' instead of 'PUSH RCX' to make it more obvious that the intent is simply to create a gap on the stack for the guest's RCX. Reviewed-by: Jim Mattson Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 10fee67a6dcd..cb0dd17c31c6 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6378,7 +6378,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) asm( /* Store host registers */ "push %%" _ASM_DX "; push %%" _ASM_BP ";" - "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */ + "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* placeholder for guest RCX */ "push %%" _ASM_CX " \n\t" "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ "cmp %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t" From f3689e3f17f064fd4cd5f0cb01ae2395c94f39d9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:52 -0800 Subject: [PATCH 101/855] KVM: VMX: Save RSI to an unused output in the vCPU-run asm blob RSI is clobbered by the vCPU-run asm blob, but it's not marked as such, probably because GCC doesn't let you mark inputs as clobbered. "Save" RSI to a dummy output so that GCC recognizes it as being clobbered. 
Fixes: 773e8a0425c9 ("x86/kvm: use Enlightened VMCS when running on Hyper-V") Reviewed-by: Jim Mattson Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index cb0dd17c31c6..3ae51a09f420 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6481,7 +6481,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "xor %%edi, %%edi \n\t" "xor %%ebp, %%ebp \n\t" "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" - : ASM_CALL_CONSTRAINT + : ASM_CALL_CONSTRAINT, "=S"((int){0}) : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp), [launched]"i"(offsetof(struct vcpu_vmx, __launched)), [fail]"i"(offsetof(struct vcpu_vmx, fail)), From ccf447434ee624b64a1c215d88453f1921aaa88a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:53 -0800 Subject: [PATCH 102/855] KVM: VMX: Manually load RDX in vCPU-run asm blob Load RDX with the VMCS.HOST_RSP field encoding on-demand instead of delegating to the compiler via an input constraint. In addition to saving one whole MOV instruction, this allows RDX to be properly clobbered (in a future patch) instead of being saved/loaded to/from the stack. Despite nested_vmx_check_vmentry_hw() having similar code, leave it alone, for now. In that case, RDX is unconditionally used and isn't clobbered, i.e. sending in HOST_RSP as an input is simpler. Note that because HOST_RSP is an enum and not a define, it must be redefined as an immediate instead of using __stringify(HOST_RSP). The naming "conflict" between host_rsp and HOST_RSP is slightly confusing, but the former will be removed in a future patch, at which point HOST_RSP is absolutely what is desired. Reviewed-by: Jim Mattson Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3ae51a09f420..b935ba3306aa 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6390,6 +6390,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t" "jmp 1f \n\t" "2: \n\t" + "mov $%c[HOST_RSP], %%" _ASM_DX " \n\t" __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" "1: \n\t" "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ @@ -6482,10 +6483,11 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "xor %%ebp, %%ebp \n\t" "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" : ASM_CALL_CONSTRAINT, "=S"((int){0}) - : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp), + : "c"(vmx), "S"(evmcs_rsp), [launched]"i"(offsetof(struct vcpu_vmx, __launched)), [fail]"i"(offsetof(struct vcpu_vmx, fail)), [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), + [HOST_RSP]"i"(HOST_RSP), [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), From 6f7c6d23b71af2e65056e777eeb2f1e6e0195f14 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:54 -0800 Subject: [PATCH 103/855] KVM: VMX: Let the compiler save/load RDX during vCPU-run Per commit c20363006af6 ("KVM: VMX: Let gcc to choose which registers to save (x86_64)"), the only reason RDX is saved/loaded to/from the stack is because it was specified as an input, i.e. 
couldn't be marked as clobbered (ignoring the fact that "saving" it to a dummy output would indirectly mark it as clobbered). Now that RDX is no longer an input, clobber it. Reviewed-by: Jim Mattson Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b935ba3306aa..e9d81cd8421f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6377,7 +6377,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) asm( /* Store host registers */ - "push %%" _ASM_DX "; push %%" _ASM_BP ";" + "push %%" _ASM_BP " \n\t" "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* placeholder for guest RCX */ "push %%" _ASM_CX " \n\t" "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ @@ -6481,7 +6481,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "xor %%esi, %%esi \n\t" "xor %%edi, %%edi \n\t" "xor %%ebp, %%ebp \n\t" - "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" + "pop %%" _ASM_BP " \n\t" : ASM_CALL_CONSTRAINT, "=S"((int){0}) : "c"(vmx), "S"(evmcs_rsp), [launched]"i"(offsetof(struct vcpu_vmx, __launched)), @@ -6509,10 +6509,10 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) [wordsize]"i"(sizeof(ulong)) : "cc", "memory" #ifdef CONFIG_X86_64 - , "rax", "rbx", "rdi" + , "rax", "rbx", "rdx", "rdi" , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" #else - , "eax", "ebx", "edi" + , "eax", "ebx", "edx", "edi" #endif ); } From 9ce0a07a6f49822238fd4357c02e0dba060a43cc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:55 -0800 Subject: [PATCH 104/855] KVM: nVMX: Remove a rogue "rax" clobber from nested_vmx_check_vmentry_hw() RAX is not touched by nested_vmx_check_vmentry_hw(), directly or indirectly (e.g. vmx_vmenter()). Remove it from the clobber list. Fixes: 52017608da33 ("KVM: nVMX: add option to perform early consistency checks via H/W") Reviewed-by: Jim Mattson Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 50c9e04910ea..bb85e3a05424 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2773,7 +2773,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) [fail]"i"(offsetof(struct vcpu_vmx, fail)), [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), [wordsize]"i"(sizeof(ulong)) - : "rax", "cc", "memory" + : "cc", "memory" ); preempt_enable(); From 98ff2acc91d836277d34558fbbba7678e04281ed Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:56 -0800 Subject: [PATCH 105/855] KVM: nVMX: Drop STACK_FRAME_NON_STANDARD from nested_vmx_check_vmentry_hw() ...as it doesn't technically actually do anything non-standard with the stack even though it modifies RSP in a weird way. E.g. RSP is loaded with VMCS.HOST_RSP if the VM-Enter gets far enough to trigger VM-Exit, but it's simply reloaded with the current value. 
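For context only (generic usage, not code from this patch; the function
name is hypothetical), the annotation being removed is normally placed
right after a function whose stack usage objtool cannot validate:

	#include <linux/frame.h>

	static int example_odd_stack_user(void)
	{
		/* ... open-coded stack manipulation objtool can't follow ... */
		return 0;
	}
	/* Skip objtool's stack-frame validation for this function. */
	STACK_FRAME_NON_STANDARD(example_odd_stack_user);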
Reviewed-by: Jim Mattson Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index bb85e3a05424..e40df725b952 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2809,8 +2809,6 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) return 0; } -STACK_FRAME_NON_STANDARD(nested_vmx_check_vmentry_hw); - static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12); From 6c1e7e5b40f23b9e754a47852924115febba35df Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:57 -0800 Subject: [PATCH 106/855] KVM: nVMX: Explicitly reference the scratch reg in nested early checks Using %1 to reference RCX, i.e. the 'vmx' pointer', is obtuse and fragile, e.g. it results in cryptic and infurating compile errors if the output constraints are touched by anything more than a gentle breeze. Reviewed-by: Jim Mattson Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e40df725b952..a562ecabc118 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2757,7 +2757,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) /* Set HOST_RSP */ "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" - "mov %%" _ASM_SP ", %c[host_rsp](%1)\n\t" + "mov %%" _ASM_SP ", %c[host_rsp](%% " _ASM_CX ")\n\t" "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ /* Check if vmlaunch or vmresume is needed */ From f1727b4954772a778df0b73a93c4b646fd3c21f6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:58 -0800 Subject: [PATCH 107/855] KVM: nVMX: Capture VM-Fail to a local var in nested_vmx_check_vmentry_hw() Unlike the primary vCPU-run flow, the nested early checks code doesn't actually want to propagate VM-Fail back to 'vmx'. Yay copy+paste. In additional to eliminating the need to clear vmx->fail before returning, using a local boolean also drops a reference to 'vmx' in the asm blob. Dropping the reference to 'vmx' will save a register in the long run as future patches will shift all pointer references from 'vmx' to 'vmx->loaded_vmcs'. Fixes: 52017608da33 ("KVM: nVMX: add option to perform early consistency checks via H/W") Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index a562ecabc118..bfacf9029466 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2718,6 +2718,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long cr3, cr4; + bool vm_fail; if (!nested_early_check) return 0; @@ -2763,14 +2764,18 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) /* Check if vmlaunch or vmresume is needed */ "cmpb $0, %c[launched](%% " _ASM_CX")\n\t" + /* + * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set + * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail + * Valid. vmx_vmenter() directly "returns" RFLAGS, and so the + * results of VM-Enter is captured via SETBE to vm_fail. 
+ */ "call vmx_vmenter\n\t" - /* Set vmx->fail accordingly */ - "setbe %c[fail](%% " _ASM_CX")\n\t" - : ASM_CALL_CONSTRAINT + "setbe %[fail]\n\t" + : ASM_CALL_CONSTRAINT, [fail]"=qm"(vm_fail) : "c"(vmx), "d"((unsigned long)HOST_RSP), [launched]"i"(offsetof(struct vcpu_vmx, __launched)), - [fail]"i"(offsetof(struct vcpu_vmx, fail)), [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), [wordsize]"i"(sizeof(ulong)) : "cc", "memory" @@ -2783,10 +2788,9 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) if (vmx->msr_autoload.guest.nr) vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); - if (vmx->fail) { + if (vm_fail) { WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != VMXERR_ENTRY_INVALID_CONTROL_FIELD); - vmx->fail = 0; return 1; } From bbc0b8239257d91cd43fe219de5552dee8bb9123 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:59 -0800 Subject: [PATCH 108/855] KVM: nVMX: Capture VM-Fail via CC_{SET,OUT} in nested early checks ...to take advantage of __GCC_ASM_FLAG_OUTPUTS__ when possible. Reviewed-by: Jim Mattson Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index bfacf9029466..67f8fdf568eb 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2768,12 +2768,12 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail * Valid. vmx_vmenter() directly "returns" RFLAGS, and so the - * results of VM-Enter is captured via SETBE to vm_fail. + * results of VM-Enter is captured via CC_{SET,OUT} to vm_fail. */ "call vmx_vmenter\n\t" - "setbe %[fail]\n\t" - : ASM_CALL_CONSTRAINT, [fail]"=qm"(vm_fail) + CC_SET(be) + : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail) : "c"(vmx), "d"((unsigned long)HOST_RSP), [launched]"i"(offsetof(struct vcpu_vmx, __launched)), [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), From 74dfa2784e961fae66143b811f45105b43c63046 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:00 -0800 Subject: [PATCH 109/855] KVM: nVMX: Reference vmx->loaded_vmcs->launched directly Temporarily propagating vmx->loaded_vmcs->launched to vmx->__launched is not functionally necessary, but rather was done historically to avoid passing both 'vmx' and 'loaded_vmcs' to the vCPU-run asm blob. Nested early checks inherited this behavior by virtue of copy+paste. A future patch will move HOST_RSP caching to be per-VMCS, i.e. store 'host_rsp' in loaded VMCS. Now that the reference to 'vmx->fail' is also gone from nested early checks, referencing 'loaded_vmcs' directly means we can drop the 'vmx' reference when introducing per-VMCS RSP caching. And it means __launched can be dropped from struct vcpu_vmx if/when vCPU-run receives similar treatment. Note the use of a named register constraint for 'loaded_vmcs'. Using RCX to hold 'vmx' was inherited from vCPU-run. In the vCPU-run case, the scratch register needs to be explicitly defined as it is crushed when loading guest state, i.e. deferring to the compiler would corrupt the pointer. Since nested early checks never loads guests state, it's a-ok to let the compiler pick any register. 
Naming the constraint avoids the fragility of referencing constraints via %1, %2, etc.., which breaks horribly when modifying constraints, and generally makes the asm blob more readable. Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 67f8fdf568eb..47299e10522f 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2752,8 +2752,6 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->host_state.cr4 = cr4; } - vmx->__launched = vmx->loaded_vmcs->launched; - asm( /* Set HOST_RSP */ "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ @@ -2762,7 +2760,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ /* Check if vmlaunch or vmresume is needed */ - "cmpb $0, %c[launched](%% " _ASM_CX")\n\t" + "cmpb $0, %c[launched](%[loaded_vmcs])\n\t" /* * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set @@ -2775,7 +2773,8 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) CC_SET(be) : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail) : "c"(vmx), "d"((unsigned long)HOST_RSP), - [launched]"i"(offsetof(struct vcpu_vmx, __launched)), + [loaded_vmcs]"r"(vmx->loaded_vmcs), + [launched]"i"(offsetof(struct loaded_vmcs, launched)), [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), [wordsize]"i"(sizeof(ulong)) : "cc", "memory" From fbda0fd31a6d683637f848ba17956048dd0c7e48 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:01 -0800 Subject: [PATCH 110/855] KVM: nVMX: Let the compiler select the reg for holding HOST_RSP ...and provide an explicit name for the constraint. Naming the input constraint makes the code self-documenting and also avoids the fragility of numerically referring to constraints, e.g. %4 breaks badly whenever the constraints are modified. Explicitly using RDX was inherited from vCPU-run, i.e. completely arbitrary. Even vCPU-run doesn't truly need to explicitly use RDX, but doing so is more robust as vCPU-run needs tight control over its register usage. Note that while the naming "conflict" between host_rsp and HOST_RSP is slightly confusing, the former will be renamed slightly in a future patch, at which point HOST_RSP is absolutely what is desired. 
Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 47299e10522f..23a2c1b91389 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2753,9 +2753,8 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) } asm( - /* Set HOST_RSP */ "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ - __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" + __ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t" "mov %%" _ASM_SP ", %c[host_rsp](%% " _ASM_CX ")\n\t" "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ @@ -2772,7 +2771,8 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) CC_SET(be) : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail) - : "c"(vmx), "d"((unsigned long)HOST_RSP), + : "c"(vmx), + [HOST_RSP]"r"((unsigned long)HOST_RSP), [loaded_vmcs]"r"(vmx->loaded_vmcs), [launched]"i"(offsetof(struct loaded_vmcs, launched)), [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), From 5a8781607e677eda60b20e0a4c91d2a5f12f9244 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:02 -0800 Subject: [PATCH 111/855] KVM: nVMX: Cache host_rsp on a per-VMCS basis Currently, host_rsp is cached on a per-vCPU basis, i.e. it's stored in struct vcpu_vmx. In non-nested usage the caching is for all intents and purposes 100% effective, e.g. only the first VMLAUNCH needs to synchronize VMCS.HOST_RSP since the call stack to vmx_vcpu_run() is identical each and every time. But when running a nested guest, KVM must invalidate the cache when switching the current VMCS as it can't guarantee the new VMCS has the same HOST_RSP as the previous VMCS. In other words, the cache loses almost all of its efficacy when running a nested VM. Move host_rsp to struct vmcs_host_state, which is per-VMCS, so that it is cached on a per-VMCS basis and restores its 100% hit rate when nested VMs are in play. Note that the host_rsp cache for vmcs02 essentially "breaks" when nested early checks are enabled as nested_vmx_check_vmentry_hw() will see a different RSP at the time of its VM-Enter. While it's possible to avoid even that VMCS.HOST_RSP synchronization, e.g. by employing a dedicated VM-Exit stack, there is little motivation for doing so as the overhead of two VMWRITEs (~55 cycles) is dwarfed by the overhead of the extra VMX transition (600+ cycles) and is a proverbial drop in the ocean relative to the total cost of a nested transtion (10s of thousands of cycles). Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 24 ++++++------------------ arch/x86/kvm/vmx/vmcs.h | 1 + arch/x86/kvm/vmx/vmx.c | 13 ++++++------- arch/x86/kvm/vmx/vmx.h | 1 - 4 files changed, 13 insertions(+), 26 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 23a2c1b91389..0e67649e39ce 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1979,17 +1979,6 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) prepare_vmcs02_early_full(vmx, vmcs12); - /* - * HOST_RSP is normally set correctly in vmx_vcpu_run() just before - * entry, but only if the current (host) sp changed from the value - * we wrote last (vmx->host_rsp). 
This cache is no longer relevant - * if we switch vmcs, and rather than hold a separate cache per vmcs, - * here we just force the write to happen on entry. host_rsp will - * also be written unconditionally by nested_vmx_check_vmentry_hw() - * if we are doing early consistency checks via hardware. - */ - vmx->host_rsp = 0; - /* * PIN CONTROLS */ @@ -2754,8 +2743,11 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) asm( "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ + "cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t" + "je 1f \n\t" __ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t" - "mov %%" _ASM_SP ", %c[host_rsp](%% " _ASM_CX ")\n\t" + "mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t" + "1: \n\t" "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ /* Check if vmlaunch or vmresume is needed */ @@ -2771,11 +2763,10 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) CC_SET(be) : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail) - : "c"(vmx), - [HOST_RSP]"r"((unsigned long)HOST_RSP), + : [HOST_RSP]"r"((unsigned long)HOST_RSP), [loaded_vmcs]"r"(vmx->loaded_vmcs), [launched]"i"(offsetof(struct loaded_vmcs, launched)), - [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), + [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)), [wordsize]"i"(sizeof(ulong)) : "cc", "memory" ); @@ -3912,9 +3903,6 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmx_flush_tlb(vcpu, true); } - /* This is needed for same reason as it was needed in prepare_vmcs02 */ - vmx->host_rsp = 0; - /* Unpin physical memory we referred to in vmcs02 */ if (vmx->nested.apic_access_page) { kvm_release_page_dirty(vmx->nested.apic_access_page); diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 6def3ba88e3b..cb6079f8a227 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -34,6 +34,7 @@ struct vmcs_host_state { unsigned long cr4; /* May not match real cr4 */ unsigned long gs_base; unsigned long fs_base; + unsigned long rsp; u16 fs_sel, gs_sel, ldt_sel; #ifdef CONFIG_X86_64 diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e9d81cd8421f..12bb61e7aca6 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6381,9 +6381,9 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* placeholder for guest RCX */ "push %%" _ASM_CX " \n\t" "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ - "cmp %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t" + "cmp %%" _ASM_SP ", (%%" _ASM_DI ") \n\t" "je 1f \n\t" - "mov %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t" + "mov %%" _ASM_SP ", (%%" _ASM_DI ") \n\t" /* Avoid VMWRITE when Enlightened VMCS is in use */ "test %%" _ASM_SI ", %%" _ASM_SI " \n\t" "jz 2f \n\t" @@ -6482,11 +6482,10 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "xor %%edi, %%edi \n\t" "xor %%ebp, %%ebp \n\t" "pop %%" _ASM_BP " \n\t" - : ASM_CALL_CONSTRAINT, "=S"((int){0}) - : "c"(vmx), "S"(evmcs_rsp), + : ASM_CALL_CONSTRAINT, "=D"((int){0}), "=S"((int){0}) + : "c"(vmx), "D"(&vmx->loaded_vmcs->host_state.rsp), "S"(evmcs_rsp), [launched]"i"(offsetof(struct vcpu_vmx, __launched)), [fail]"i"(offsetof(struct vcpu_vmx, fail)), - [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), [HOST_RSP]"i"(HOST_RSP), [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), @@ -6509,10 +6508,10 
@@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) [wordsize]"i"(sizeof(ulong)) : "cc", "memory" #ifdef CONFIG_X86_64 - , "rax", "rbx", "rdx", "rdi" + , "rax", "rbx", "rdx" , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" #else - , "eax", "ebx", "edx", "edi" + , "eax", "ebx", "edx" #endif ); } diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 99328954c2fc..8e203b725928 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -175,7 +175,6 @@ struct nested_vmx { struct vcpu_vmx { struct kvm_vcpu vcpu; - unsigned long host_rsp; u8 fail; u8 msr_bitmap_mode; u32 exit_intr_info; From 47e97c099bbcb3211b22456679991095c0578da2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:03 -0800 Subject: [PATCH 112/855] KVM: VMX: Load/save guest CR2 via C code in __vmx_vcpu_run() ...to eliminate its parameter and struct vcpu_vmx offset definition from the assembly blob. Accessing CR2 from C versus assembly doesn't change the likelihood of taking a page fault (and modifying CR2) while it's loaded with the guest's value, so long as we don't do anything silly between accessing CR2 and VM-Enter/VM-Exit. Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 12bb61e7aca6..5e43999ece1d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6375,6 +6375,9 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) if (static_branch_unlikely(&vmx_l1d_should_flush)) vmx_l1d_flush(vcpu); + if (vcpu->arch.cr2 != read_cr2()) + write_cr2(vcpu->arch.cr2); + asm( /* Store host registers */ "push %%" _ASM_BP " \n\t" @@ -6395,13 +6398,6 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "1: \n\t" "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ - /* Reload cr2 if changed */ - "mov %c[cr2](%%" _ASM_CX "), %%" _ASM_AX " \n\t" - "mov %%cr2, %%" _ASM_DX " \n\t" - "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t" - "je 3f \n\t" - "mov %%" _ASM_AX", %%cr2 \n\t" - "3: \n\t" /* Check if vmlaunch or vmresume is needed */ "cmpb $0, %c[launched](%%" _ASM_CX ") \n\t" /* Load guest registers. Don't clobber flags. 
*/ @@ -6471,9 +6467,6 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "xor %%r14d, %%r14d \n\t" "xor %%r15d, %%r15d \n\t" #endif - "mov %%cr2, %%" _ASM_AX " \n\t" - "mov %%" _ASM_AX ", %c[cr2](%%" _ASM_CX ") \n\t" - "xor %%eax, %%eax \n\t" "xor %%ebx, %%ebx \n\t" "xor %%ecx, %%ecx \n\t" @@ -6504,7 +6497,6 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), #endif - [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)), [wordsize]"i"(sizeof(ulong)) : "cc", "memory" #ifdef CONFIG_X86_64 @@ -6514,6 +6506,8 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) , "eax", "ebx", "edx" #endif ); + + vcpu->arch.cr2 = read_cr2(); } STACK_FRAME_NON_STANDARD(__vmx_vcpu_run); From c09b03eb7f964370149577fdba425623b7f5b0b5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:04 -0800 Subject: [PATCH 113/855] KVM: VMX: Update VMCS.HOST_RSP via helper C function Providing a helper function to update HOST_RSP is visibly easier to read, and more importantly (for the future) eliminates two arguments to the VM-Enter assembly blob. Reducing the number of arguments to the asm blob is for all intents and purposes a prerequisite to moving the code to a proper assembly routine. It's not truly mandatory, but it greatly simplifies the future code, and the cost of the extra CALL+RET is negligible in the grand scheme. Note that although _ASM_ARG[1-3] can be used in the inline asm itself, the intput/output constraints need to be manually defined. gcc will actually compile with _ASM_ARG[1-3] specified as constraints, but what it actually ends up doing with the bogus constraint is unknown. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 51 +++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5e43999ece1d..4b8a94fedb76 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6363,15 +6363,18 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->hv_timer_armed = false; } +void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) +{ + if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) { + vmx->loaded_vmcs->host_state.rsp = host_rsp; + vmcs_writel(HOST_RSP, host_rsp); + } +} + static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) { - unsigned long evmcs_rsp; - vmx->__launched = vmx->loaded_vmcs->launched; - evmcs_rsp = static_branch_unlikely(&enable_evmcs) ? 
- (unsigned long)¤t_evmcs->host_rsp : 0; - if (static_branch_unlikely(&vmx_l1d_should_flush)) vmx_l1d_flush(vcpu); @@ -6382,21 +6385,14 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) /* Store host registers */ "push %%" _ASM_BP " \n\t" "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* placeholder for guest RCX */ - "push %%" _ASM_CX " \n\t" - "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ - "cmp %%" _ASM_SP ", (%%" _ASM_DI ") \n\t" - "je 1f \n\t" - "mov %%" _ASM_SP ", (%%" _ASM_DI ") \n\t" - /* Avoid VMWRITE when Enlightened VMCS is in use */ - "test %%" _ASM_SI ", %%" _ASM_SI " \n\t" - "jz 2f \n\t" - "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t" - "jmp 1f \n\t" - "2: \n\t" - "mov $%c[HOST_RSP], %%" _ASM_DX " \n\t" - __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" - "1: \n\t" - "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ + "push %%" _ASM_ARG1 " \n\t" + + /* Adjust RSP to account for the CALL to vmx_vmenter(). */ + "lea -%c[wordsize](%%" _ASM_SP "), %%" _ASM_ARG2 " \n\t" + "call vmx_update_host_rsp \n\t" + + /* Load the vcpu_vmx pointer to RCX. */ + "mov (%%" _ASM_SP "), %%" _ASM_CX " \n\t" /* Check if vmlaunch or vmresume is needed */ "cmpb $0, %c[launched](%%" _ASM_CX ") \n\t" @@ -6475,11 +6471,16 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "xor %%edi, %%edi \n\t" "xor %%ebp, %%ebp \n\t" "pop %%" _ASM_BP " \n\t" - : ASM_CALL_CONSTRAINT, "=D"((int){0}), "=S"((int){0}) - : "c"(vmx), "D"(&vmx->loaded_vmcs->host_state.rsp), "S"(evmcs_rsp), + : ASM_CALL_CONSTRAINT, +#ifdef CONFIG_X86_64 + "=D"((int){0}) + : "D"(vmx), +#else + "=a"((int){0}) + : "a"(vmx), +#endif [launched]"i"(offsetof(struct vcpu_vmx, __launched)), [fail]"i"(offsetof(struct vcpu_vmx, fail)), - [HOST_RSP]"i"(HOST_RSP), [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), @@ -6500,10 +6501,10 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) [wordsize]"i"(sizeof(ulong)) : "cc", "memory" #ifdef CONFIG_X86_64 - , "rax", "rbx", "rdx" + , "rax", "rbx", "rcx", "rdx", "rsi" , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" #else - , "eax", "ebx", "edx" + , "ebx", "ecx", "edx", "edi", "esi" #endif ); From c9afc58cc368623aaa34a62e321eea2a59240f6f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:05 -0800 Subject: [PATCH 114/855] KVM: VMX: Pass "launched" directly to the vCPU-run asm blob ...and remove struct vcpu_vmx's temporary __launched variable. Eliminating __launched is a bonus, the real motivation is to get to the point where the only reference to struct vcpu_vmx in the asm code is to vcpu.arch.regs, which will simplify moving the blob to a proper asm file. Note that also means this approach is deliberately different than what is used in nested_vmx_check_vmentry_hw(). Use BL as it is a callee-save register in both 32-bit and 64-bit ABIs, i.e. it can't be modified by vmx_update_host_rsp(), to avoid having to temporarily save/restore the launched flag. 
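A rough sketch of why this works (illustrative only, not the actual blob;
the function name is invented): a value passed in via the "b" constraint
lands in RBX/EBX, which both the 32-bit and 64-bit calling conventions
require called functions to preserve, so it is still intact when the blob
tests it after the CALL:

	static inline int example_need_vmresume(int launched)
	{
		unsigned char need_resume;

		/* EBX is callee-saved, so a helper called earlier in the blob
		 * (e.g. vmx_update_host_rsp()) cannot have clobbered it. */
		asm("cmpb $0, %%bl\n\t"
		    "setne %0"
		    : "=q"(need_resume)
		    : "b"(launched)
		    : "cc");

		return need_resume;
	}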
Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 13 ++++++------- arch/x86/kvm/vmx/vmx.h | 2 +- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4b8a94fedb76..996a13ea86cc 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6373,8 +6373,6 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) { - vmx->__launched = vmx->loaded_vmcs->launched; - if (static_branch_unlikely(&vmx_l1d_should_flush)) vmx_l1d_flush(vcpu); @@ -6395,7 +6393,8 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "mov (%%" _ASM_SP "), %%" _ASM_CX " \n\t" /* Check if vmlaunch or vmresume is needed */ - "cmpb $0, %c[launched](%%" _ASM_CX ") \n\t" + "cmpb $0, %%bl \n\t" + /* Load guest registers. Don't clobber flags. */ "mov %c[rax](%%" _ASM_CX "), %%" _ASM_AX " \n\t" "mov %c[rbx](%%" _ASM_CX "), %%" _ASM_BX " \n\t" @@ -6471,7 +6470,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "xor %%edi, %%edi \n\t" "xor %%ebp, %%ebp \n\t" "pop %%" _ASM_BP " \n\t" - : ASM_CALL_CONSTRAINT, + : ASM_CALL_CONSTRAINT, "=b"((int){0}), #ifdef CONFIG_X86_64 "=D"((int){0}) : "D"(vmx), @@ -6479,7 +6478,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "=a"((int){0}) : "a"(vmx), #endif - [launched]"i"(offsetof(struct vcpu_vmx, __launched)), + "b"(vmx->loaded_vmcs->launched), [fail]"i"(offsetof(struct vcpu_vmx, fail)), [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), @@ -6501,10 +6500,10 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) [wordsize]"i"(sizeof(ulong)) : "cc", "memory" #ifdef CONFIG_X86_64 - , "rax", "rbx", "rcx", "rdx", "rsi" + , "rax", "rcx", "rdx", "rsi" , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" #else - , "ebx", "ecx", "edx", "edi", "esi" + , "ecx", "edx", "edi", "esi" #endif ); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 8e203b725928..6ee6a492efaf 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -208,7 +208,7 @@ struct vcpu_vmx { struct loaded_vmcs vmcs01; struct loaded_vmcs *loaded_vmcs; struct loaded_vmcs *loaded_cpu_state; - bool __launched; /* temporary, used in vmx_vcpu_run */ + struct msr_autoload { struct vmx_msrs guest; struct vmx_msrs host; From 217aaff53c25f03b1d2fc23eff9dc2bae34f690e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:06 -0800 Subject: [PATCH 115/855] KVM: VMX: Invert the ordering of saving guest/host scratch reg at VM-Enter Switching the ordering allows for an out-of-line path for VM-Fail that elides saving guest state but still shares the register clearing with the VM-Exit path. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 996a13ea86cc..79b42197ed7e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6382,7 +6382,6 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) asm( /* Store host registers */ "push %%" _ASM_BP " \n\t" - "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* placeholder for guest RCX */ "push %%" _ASM_ARG1 " \n\t" /* Adjust RSP to account for the CALL to vmx_vmenter(). 
*/ @@ -6418,11 +6417,11 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) /* Enter guest mode */ "call vmx_vmenter\n\t" - /* Save guest's RCX to the stack placeholder (see above) */ - "mov %%" _ASM_CX ", %c[wordsize](%%" _ASM_SP ") \n\t" + /* Temporarily save guest's RCX. */ + "push %%" _ASM_CX " \n\t" - /* Load host's RCX, i.e. the vmx_vcpu pointer */ - "pop %%" _ASM_CX " \n\t" + /* Reload the vcpu_vmx pointer to RCX. */ + "mov %c[wordsize](%%" _ASM_SP "), %%" _ASM_CX " \n\t" /* Set vmx->fail based on EFLAGS.{CF,ZF} */ "setbe %c[fail](%%" _ASM_CX ")\n\t" @@ -6469,6 +6468,9 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "xor %%esi, %%esi \n\t" "xor %%edi, %%edi \n\t" "xor %%ebp, %%ebp \n\t" + + /* "POP" the vcpu_vmx pointer. */ + "add $%c[wordsize], %%" _ASM_SP " \n\t" "pop %%" _ASM_BP " \n\t" : ASM_CALL_CONSTRAINT, "=b"((int){0}), #ifdef CONFIG_X86_64 From f78d0971b7bd5bf4373a1fac27f176af5d5594ed Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:07 -0800 Subject: [PATCH 116/855] KVM: VMX: Don't save guest registers after VM-Fail A failed VM-Enter (obviously) didn't succeed, meaning the CPU never executed an instrunction in guest mode and so can't have changed the general purpose registers. In addition to saving some instructions in the VM-Fail case, this also provides a separate path entirely and thus an opportunity to propagate the fail condition to vmx->fail via register without introducing undue pain. Using a register, as opposed to directly referencing vmx->fail, eliminates the need to pass the offset of 'fail', which will simplify moving the code to proper assembly in future patches. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 79b42197ed7e..1dcd3f157a70 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6416,6 +6416,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) /* Enter guest mode */ "call vmx_vmenter\n\t" + "jbe 2f \n\t" /* Temporarily save guest's RCX. */ "push %%" _ASM_CX " \n\t" @@ -6423,9 +6424,6 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) /* Reload the vcpu_vmx pointer to RCX. */ "mov %c[wordsize](%%" _ASM_SP "), %%" _ASM_CX " \n\t" - /* Set vmx->fail based on EFLAGS.{CF,ZF} */ - "setbe %c[fail](%%" _ASM_CX ")\n\t" - /* Save all guest registers, including RCX from the stack */ "mov %%" _ASM_AX ", %c[rax](%%" _ASM_CX ") \n\t" "mov %%" _ASM_BX ", %c[rbx](%%" _ASM_CX ") \n\t" @@ -6443,15 +6441,22 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "mov %%r13, %c[r13](%%" _ASM_CX ") \n\t" "mov %%r14, %c[r14](%%" _ASM_CX ") \n\t" "mov %%r15, %c[r15](%%" _ASM_CX ") \n\t" +#endif + + /* Clear EBX to indicate VM-Exit (as opposed to VM-Fail). */ + "xor %%ebx, %%ebx \n\t" /* - * Clear all general purpose registers (except RSP, which is loaded by - * the CPU during VM-Exit) to prevent speculative use of the guest's - * values, even those that are saved/loaded via the stack. In theory, - * an L1 cache miss when restoring registers could lead to speculative - * execution with the guest's values. Zeroing XORs are dirt cheap, - * i.e. the extra paranoia is essentially free. 
+ * Clear all general purpose registers except RSP and RBX to prevent + * speculative use of the guest's values, even those that are reloaded + * via the stack. In theory, an L1 cache miss when restoring registers + * could lead to speculative execution with the guest's values. + * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially + * free. RSP and RBX are exempt as RSP is restored by hardware during + * VM-Exit and RBX is explicitly loaded with 0 or 1 to "return" VM-Fail. */ + "1: \n\t" +#ifdef CONFIG_X86_64 "xor %%r8d, %%r8d \n\t" "xor %%r9d, %%r9d \n\t" "xor %%r10d, %%r10d \n\t" @@ -6462,7 +6467,6 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "xor %%r15d, %%r15d \n\t" #endif "xor %%eax, %%eax \n\t" - "xor %%ebx, %%ebx \n\t" "xor %%ecx, %%ecx \n\t" "xor %%edx, %%edx \n\t" "xor %%esi, %%esi \n\t" @@ -6472,7 +6476,15 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) /* "POP" the vcpu_vmx pointer. */ "add $%c[wordsize], %%" _ASM_SP " \n\t" "pop %%" _ASM_BP " \n\t" - : ASM_CALL_CONSTRAINT, "=b"((int){0}), + "jmp 3f \n\t" + + /* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. */ + "2: \n\t" + "mov $1, %%ebx \n\t" + "jmp 1b \n\t" + "3: \n\t" + + : ASM_CALL_CONSTRAINT, "=b"(vmx->fail), #ifdef CONFIG_X86_64 "=D"((int){0}) : "D"(vmx), @@ -6481,7 +6493,6 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) : "a"(vmx), #endif "b"(vmx->loaded_vmcs->launched), - [fail]"i"(offsetof(struct vcpu_vmx, fail)), [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), From d55892049171872a8f1aba542aa0691efe3da52d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:08 -0800 Subject: [PATCH 117/855] KVM: VMX: Use vcpu->arch.regs directly when saving/loading guest state ...now that all other references to struct vcpu_vmx have been removed. Note that 'vmx' still needs to be passed into the asm blob in _ASM_ARG1 as it is consumed by vmx_update_host_rsp(). And similar to that code, use _ASM_ARG2 in the assembly code to prepare for moving to proper asm, while explicitly referencing the exact registers in the clobber list for clarity in the short term and to avoid additional precompiler games. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 53 +++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1dcd3f157a70..cd92568bed26 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6382,13 +6382,18 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) asm( /* Store host registers */ "push %%" _ASM_BP " \n\t" - "push %%" _ASM_ARG1 " \n\t" + + /* + * Save @regs, _ASM_ARG2 may be modified by vmx_update_host_rsp() and + * @regs is needed after VM-Exit to save the guest's register values. + */ + "push %%" _ASM_ARG2 " \n\t" /* Adjust RSP to account for the CALL to vmx_vmenter(). */ "lea -%c[wordsize](%%" _ASM_SP "), %%" _ASM_ARG2 " \n\t" "call vmx_update_host_rsp \n\t" - /* Load the vcpu_vmx pointer to RCX. */ + /* Load RCX with @regs. */ "mov (%%" _ASM_SP "), %%" _ASM_CX " \n\t" /* Check if vmlaunch or vmresume is needed */ @@ -6421,7 +6426,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) /* Temporarily save guest's RCX. 
*/ "push %%" _ASM_CX " \n\t" - /* Reload the vcpu_vmx pointer to RCX. */ + /* Reload RCX with @regs. */ "mov %c[wordsize](%%" _ASM_SP "), %%" _ASM_CX " \n\t" /* Save all guest registers, including RCX from the stack */ @@ -6486,37 +6491,37 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) : ASM_CALL_CONSTRAINT, "=b"(vmx->fail), #ifdef CONFIG_X86_64 - "=D"((int){0}) - : "D"(vmx), + "=D"((int){0}), "=S"((int){0}) + : "D"(vmx), "S"(&vcpu->arch.regs), #else - "=a"((int){0}) - : "a"(vmx), + "=a"((int){0}), "=d"((int){0}) + : "a"(vmx), "d"(&vcpu->arch.regs), #endif "b"(vmx->loaded_vmcs->launched), - [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), - [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), - [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), - [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])), - [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])), - [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])), - [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])), + [rax]"i"(VCPU_REGS_RAX * sizeof(ulong)), + [rbx]"i"(VCPU_REGS_RBX * sizeof(ulong)), + [rcx]"i"(VCPU_REGS_RCX * sizeof(ulong)), + [rdx]"i"(VCPU_REGS_RDX * sizeof(ulong)), + [rsi]"i"(VCPU_REGS_RSI * sizeof(ulong)), + [rdi]"i"(VCPU_REGS_RDI * sizeof(ulong)), + [rbp]"i"(VCPU_REGS_RBP * sizeof(ulong)), #ifdef CONFIG_X86_64 - [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])), - [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])), - [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])), - [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])), - [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])), - [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])), - [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), - [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), + [r8]"i"(VCPU_REGS_R8 * sizeof(ulong)), + [r9]"i"(VCPU_REGS_R9 * sizeof(ulong)), + [r10]"i"(VCPU_REGS_R10 * sizeof(ulong)), + [r11]"i"(VCPU_REGS_R11 * sizeof(ulong)), + [r12]"i"(VCPU_REGS_R12 * sizeof(ulong)), + [r13]"i"(VCPU_REGS_R13 * sizeof(ulong)), + [r14]"i"(VCPU_REGS_R14 * sizeof(ulong)), + [r15]"i"(VCPU_REGS_R15 * sizeof(ulong)), #endif [wordsize]"i"(sizeof(ulong)) : "cc", "memory" #ifdef CONFIG_X86_64 - , "rax", "rcx", "rdx", "rsi" + , "rax", "rcx", "rdx" , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" #else - , "ecx", "edx", "edi", "esi" + , "ecx", "edi", "esi" #endif ); From de9c0d49d85dc563549972edc5589d195cd5e859 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Sat, 2 Feb 2019 03:34:36 +0100 Subject: [PATCH 118/855] ARM: 8833/1: Ensure that NEON code always compiles with Clang While building arm32 allyesconfig, I ran into the following errors: arch/arm/lib/xor-neon.c:17:2: error: You should compile this file with '-mfloat-abi=softfp -mfpu=neon' In file included from lib/raid6/neon1.c:27: /home/nathan/cbl/prebuilt/lib/clang/8.0.0/include/arm_neon.h:28:2: error: "NEON support not enabled" Building V=1 showed NEON_FLAGS getting passed along to Clang but __ARM_NEON__ was not getting defined. Ultimately, it boils down to Clang only defining __ARM_NEON__ when targeting armv7, rather than armv6k, which is the '-march' value for allyesconfig. 
>From lib/Basic/Targets/ARM.cpp in the Clang source: // This only gets set when Neon instructions are actually available, unlike // the VFP define, hence the soft float and arch check. This is subtly // different from gcc, we follow the intent which was that it should be set // when Neon instructions are actually available. if ((FPU & NeonFPU) && !SoftFloat && ArchVersion >= 7) { Builder.defineMacro("__ARM_NEON", "1"); Builder.defineMacro("__ARM_NEON__"); // current AArch32 NEON implementations do not support double-precision // floating-point even when it is present in VFP. Builder.defineMacro("__ARM_NEON_FP", "0x" + Twine::utohexstr(HW_FP & ~HW_FP_DP)); } Ard Biesheuvel recommended explicitly adding '-march=armv7-a' at the beginning of the NEON_FLAGS definitions so that __ARM_NEON__ always gets definined by Clang. This doesn't functionally change anything because that code will only run where NEON is supported, which is implicitly armv7. Link: https://github.com/ClangBuiltLinux/linux/issues/287 Suggested-by: Ard Biesheuvel Signed-off-by: Nathan Chancellor Acked-by: Nicolas Pitre Reviewed-by: Nick Desaulniers Reviewed-by: Stefan Agner Signed-off-by: Russell King --- Documentation/arm/kernel_mode_neon.txt | 4 ++-- arch/arm/lib/Makefile | 2 +- arch/arm/lib/xor-neon.c | 2 +- lib/raid6/Makefile | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Documentation/arm/kernel_mode_neon.txt b/Documentation/arm/kernel_mode_neon.txt index 525452726d31..b9e060c5b61e 100644 --- a/Documentation/arm/kernel_mode_neon.txt +++ b/Documentation/arm/kernel_mode_neon.txt @@ -6,7 +6,7 @@ TL;DR summary * Use only NEON instructions, or VFP instructions that don't rely on support code * Isolate your NEON code in a separate compilation unit, and compile it with - '-mfpu=neon -mfloat-abi=softfp' + '-march=armv7-a -mfpu=neon -mfloat-abi=softfp' * Put kernel_neon_begin() and kernel_neon_end() calls around the calls into your NEON code * Don't sleep in your NEON code, and be aware that it will be executed with @@ -87,7 +87,7 @@ instructions appearing in unexpected places if no special care is taken. Therefore, the recommended and only supported way of using NEON/VFP in the kernel is by adhering to the following rules: * isolate the NEON code in a separate compilation unit and compile it with - '-mfpu=neon -mfloat-abi=softfp'; + '-march=armv7-a -mfpu=neon -mfloat-abi=softfp'; * issue the calls to kernel_neon_begin(), kernel_neon_end() as well as the calls into the unit containing the NEON code from a compilation unit which is *not* built with the GCC flag '-mfpu=neon' set. 
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile index ad25fd1872c7..0bff0176db2c 100644 --- a/arch/arm/lib/Makefile +++ b/arch/arm/lib/Makefile @@ -39,7 +39,7 @@ $(obj)/csumpartialcopy.o: $(obj)/csumpartialcopygeneric.S $(obj)/csumpartialcopyuser.o: $(obj)/csumpartialcopygeneric.S ifeq ($(CONFIG_KERNEL_MODE_NEON),y) - NEON_FLAGS := -mfloat-abi=softfp -mfpu=neon + NEON_FLAGS := -march=armv7-a -mfloat-abi=softfp -mfpu=neon CFLAGS_xor-neon.o += $(NEON_FLAGS) obj-$(CONFIG_XOR_BLOCKS) += xor-neon.o endif diff --git a/arch/arm/lib/xor-neon.c b/arch/arm/lib/xor-neon.c index 2c40aeab3eaa..c691b901092f 100644 --- a/arch/arm/lib/xor-neon.c +++ b/arch/arm/lib/xor-neon.c @@ -14,7 +14,7 @@ MODULE_LICENSE("GPL"); #ifndef __ARM_NEON__ -#error You should compile this file with '-mfloat-abi=softfp -mfpu=neon' +#error You should compile this file with '-march=armv7-a -mfloat-abi=softfp -mfpu=neon' #endif /* diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index 4e90d443d1b0..e723eacf7868 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile @@ -39,7 +39,7 @@ endif ifeq ($(CONFIG_KERNEL_MODE_NEON),y) NEON_FLAGS := -ffreestanding ifeq ($(ARCH),arm) -NEON_FLAGS += -mfloat-abi=softfp -mfpu=neon +NEON_FLAGS += -march=armv7-a -mfloat-abi=softfp -mfpu=neon endif CFLAGS_recov_neon_inner.o += $(NEON_FLAGS) ifeq ($(ARCH),arm64) From 37406a60fac7de336dec331a8707a94462ac5a5e Mon Sep 17 00:00:00 2001 From: Sean Paul Date: Tue, 12 Feb 2019 12:32:41 -0500 Subject: [PATCH 119/855] drm: Merge __drm_atomic_helper_disable_all() into drm_atomic_helper_disable_all() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The only difference between the functions is the clean_old_fbs argument, whose use was removed in the patch referenced below. So remove the internal copy and drop the guts back into drm_atomic_helper_disable_all() Changes in v2: - Instead of just removing the unused arg, merge the functions Fixes: e00fb8564ee9 ("drm: Stop updating plane->crtc/fb/old_fb on atomic drivers") Cc: Ville Syrjälä Cc: Maarten Lankhorst Cc: Harry Wentland Cc: Sinclair Yeh # This email bounces :( Cc: Maxime Ripard Cc: Sean Paul Cc: David Airlie Cc: Daniel Vetter Cc: dri-devel@lists.freedesktop.org Reviewed-by: Ville Syrjälä Signed-off-by: Sean Paul Link: https://patchwork.freedesktop.org/patch/msgid/20190212173245.51980-1-sean@poorly.run --- drivers/gpu/drm/drm_atomic_helper.c | 59 +++++++++++++---------------- 1 file changed, 26 insertions(+), 33 deletions(-) diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c index c53ecbd9abdd..7a7231d00779 100644 --- a/drivers/gpu/drm/drm_atomic_helper.c +++ b/drivers/gpu/drm/drm_atomic_helper.c @@ -3030,9 +3030,31 @@ commit: return 0; } -static int __drm_atomic_helper_disable_all(struct drm_device *dev, - struct drm_modeset_acquire_ctx *ctx, - bool clean_old_fbs) +/** + * drm_atomic_helper_disable_all - disable all currently active outputs + * @dev: DRM device + * @ctx: lock acquisition context + * + * Loops through all connectors, finding those that aren't turned off and then + * turns them off by setting their DPMS mode to OFF and deactivating the CRTC + * that they are connected to. + * + * This is used for example in suspend/resume to disable all currently active + * functions when suspending. If you just want to shut down everything at e.g. + * driver unload, look at drm_atomic_helper_shutdown(). 
+ * + * Note that if callers haven't already acquired all modeset locks this might + * return -EDEADLK, which must be handled by calling drm_modeset_backoff(). + * + * Returns: + * 0 on success or a negative error code on failure. + * + * See also: + * drm_atomic_helper_suspend(), drm_atomic_helper_resume() and + * drm_atomic_helper_shutdown(). + */ +int drm_atomic_helper_disable_all(struct drm_device *dev, + struct drm_modeset_acquire_ctx *ctx) { struct drm_atomic_state *state; struct drm_connector_state *conn_state; @@ -3090,35 +3112,6 @@ free: drm_atomic_state_put(state); return ret; } - -/** - * drm_atomic_helper_disable_all - disable all currently active outputs - * @dev: DRM device - * @ctx: lock acquisition context - * - * Loops through all connectors, finding those that aren't turned off and then - * turns them off by setting their DPMS mode to OFF and deactivating the CRTC - * that they are connected to. - * - * This is used for example in suspend/resume to disable all currently active - * functions when suspending. If you just want to shut down everything at e.g. - * driver unload, look at drm_atomic_helper_shutdown(). - * - * Note that if callers haven't already acquired all modeset locks this might - * return -EDEADLK, which must be handled by calling drm_modeset_backoff(). - * - * Returns: - * 0 on success or a negative error code on failure. - * - * See also: - * drm_atomic_helper_suspend(), drm_atomic_helper_resume() and - * drm_atomic_helper_shutdown(). - */ -int drm_atomic_helper_disable_all(struct drm_device *dev, - struct drm_modeset_acquire_ctx *ctx) -{ - return __drm_atomic_helper_disable_all(dev, ctx, false); -} EXPORT_SYMBOL(drm_atomic_helper_disable_all); /** @@ -3139,7 +3132,7 @@ void drm_atomic_helper_shutdown(struct drm_device *dev) DRM_MODESET_LOCK_ALL_BEGIN(dev, ctx, 0, ret); - ret = __drm_atomic_helper_disable_all(dev, &ctx, true); + ret = drm_atomic_helper_disable_all(dev, &ctx); if (ret) DRM_ERROR("Disabling all crtc's during unload failed with %i\n", ret); From ebb09b33c60c46fd4f7ffa0af9e693eebe765d1b Mon Sep 17 00:00:00 2001 From: Leonid Ravich Date: Tue, 12 Feb 2019 22:09:28 +0200 Subject: [PATCH 120/855] NTB: add new parameter to peer_db_addr() db_bit and db_data NTB door bell usage depends on NTB hardware. ex: intel NTB gen1 has one peer door bell register which can be controlled by the bitmap writen to it, while Intel NTB gen3 has a registers per door bell and the data trigering the each door bell is always 1. 
therefore exposing only peer door bell address forcing the user to be aware of such low level details Signed-off-by: Leonid Ravich Acked-by: Logan Gunthorpe Acked-by: Dave Jiang Acked-by: Allen Hubbe Signed-off-by: Jon Mason --- drivers/ntb/hw/intel/ntb_hw_gen1.c | 25 ++++++++++++++----- drivers/ntb/hw/intel/ntb_hw_gen1.h | 5 ++-- drivers/ntb/hw/intel/ntb_hw_gen3.c | 33 +++++++++++++++++++++++++- drivers/ntb/hw/mscc/ntb_hw_switchtec.c | 9 ++++++- include/linux/ntb.h | 10 +++++--- 5 files changed, 69 insertions(+), 13 deletions(-) diff --git a/drivers/ntb/hw/intel/ntb_hw_gen1.c b/drivers/ntb/hw/intel/ntb_hw_gen1.c index 2ad263f708da..bb57ec239029 100644 --- a/drivers/ntb/hw/intel/ntb_hw_gen1.c +++ b/drivers/ntb/hw/intel/ntb_hw_gen1.c @@ -180,7 +180,7 @@ int ndev_mw_to_bar(struct intel_ntb_dev *ndev, int idx) return ndev->reg->mw_bar[idx]; } -static inline int ndev_db_addr(struct intel_ntb_dev *ndev, +void ndev_db_addr(struct intel_ntb_dev *ndev, phys_addr_t *db_addr, resource_size_t *db_size, phys_addr_t reg_addr, unsigned long reg) { @@ -196,8 +196,6 @@ static inline int ndev_db_addr(struct intel_ntb_dev *ndev, *db_size = ndev->reg->db_size; dev_dbg(&ndev->ntb.pdev->dev, "Peer db size %llx\n", *db_size); } - - return 0; } u64 ndev_db_read(struct intel_ntb_dev *ndev, @@ -1111,13 +1109,28 @@ int intel_ntb_db_clear_mask(struct ntb_dev *ntb, u64 db_bits) ndev->self_reg->db_mask); } -int intel_ntb_peer_db_addr(struct ntb_dev *ntb, phys_addr_t *db_addr, - resource_size_t *db_size) +static int intel_ntb_peer_db_addr(struct ntb_dev *ntb, phys_addr_t *db_addr, + resource_size_t *db_size, u64 *db_data, int db_bit) { + u64 db_bits; struct intel_ntb_dev *ndev = ntb_ndev(ntb); - return ndev_db_addr(ndev, db_addr, db_size, ndev->peer_addr, + if (unlikely(db_bit >= BITS_PER_LONG_LONG)) + return -EINVAL; + + db_bits = BIT_ULL(db_bit); + + if (unlikely(db_bits & ~ntb_ndev(ntb)->db_valid_mask)) + return -EINVAL; + + ndev_db_addr(ndev, db_addr, db_size, ndev->peer_addr, ndev->peer_reg->db_bell); + + if (db_data) + *db_data = db_bits; + + + return 0; } static int intel_ntb_peer_db_set(struct ntb_dev *ntb, u64 db_bits) diff --git a/drivers/ntb/hw/intel/ntb_hw_gen1.h b/drivers/ntb/hw/intel/ntb_hw_gen1.h index ad8ec1444436..544cf5c06f4d 100644 --- a/drivers/ntb/hw/intel/ntb_hw_gen1.h +++ b/drivers/ntb/hw/intel/ntb_hw_gen1.h @@ -147,6 +147,9 @@ extern struct intel_b2b_addr xeon_b2b_dsd_addr; int ndev_init_isr(struct intel_ntb_dev *ndev, int msix_min, int msix_max, int msix_shift, int total_shift); enum ntb_topo xeon_ppd_topo(struct intel_ntb_dev *ndev, u8 ppd); +void ndev_db_addr(struct intel_ntb_dev *ndev, + phys_addr_t *db_addr, resource_size_t *db_size, + phys_addr_t reg_addr, unsigned long reg); u64 ndev_db_read(struct intel_ntb_dev *ndev, void __iomem *mmio); int ndev_db_write(struct intel_ntb_dev *ndev, u64 db_bits, void __iomem *mmio); @@ -166,8 +169,6 @@ int intel_ntb_db_vector_count(struct ntb_dev *ntb); u64 intel_ntb_db_vector_mask(struct ntb_dev *ntb, int db_vector); int intel_ntb_db_set_mask(struct ntb_dev *ntb, u64 db_bits); int intel_ntb_db_clear_mask(struct ntb_dev *ntb, u64 db_bits); -int intel_ntb_peer_db_addr(struct ntb_dev *ntb, phys_addr_t *db_addr, - resource_size_t *db_size); int intel_ntb_spad_is_unsafe(struct ntb_dev *ntb); int intel_ntb_spad_count(struct ntb_dev *ntb); u32 intel_ntb_spad_read(struct ntb_dev *ntb, int idx); diff --git a/drivers/ntb/hw/intel/ntb_hw_gen3.c b/drivers/ntb/hw/intel/ntb_hw_gen3.c index b3fa24778f94..f475b56a3f49 100644 --- a/drivers/ntb/hw/intel/ntb_hw_gen3.c 
+++ b/drivers/ntb/hw/intel/ntb_hw_gen3.c @@ -532,6 +532,37 @@ static int intel_ntb3_mw_set_trans(struct ntb_dev *ntb, int pidx, int idx, return 0; } +int intel_ntb3_peer_db_addr(struct ntb_dev *ntb, phys_addr_t *db_addr, + resource_size_t *db_size, + u64 *db_data, int db_bit) +{ + phys_addr_t db_addr_base; + struct intel_ntb_dev *ndev = ntb_ndev(ntb); + + if (unlikely(db_bit >= BITS_PER_LONG_LONG)) + return -EINVAL; + + if (unlikely(BIT_ULL(db_bit) & ~ntb_ndev(ntb)->db_valid_mask)) + return -EINVAL; + + ndev_db_addr(ndev, &db_addr_base, db_size, ndev->peer_addr, + ndev->peer_reg->db_bell); + + if (db_addr) { + *db_addr = db_addr_base + (db_bit * 4); + dev_dbg(&ndev->ntb.pdev->dev, "Peer db addr %llx db bit %d\n", + *db_addr, db_bit); + } + + if (db_data) { + *db_data = 1; + dev_dbg(&ndev->ntb.pdev->dev, "Peer db data %llx db bit %d\n", + *db_data, db_bit); + } + + return 0; +} + static int intel_ntb3_peer_db_set(struct ntb_dev *ntb, u64 db_bits) { struct intel_ntb_dev *ndev = ntb_ndev(ntb); @@ -584,7 +615,7 @@ const struct ntb_dev_ops intel_ntb3_ops = { .db_clear = intel_ntb3_db_clear, .db_set_mask = intel_ntb_db_set_mask, .db_clear_mask = intel_ntb_db_clear_mask, - .peer_db_addr = intel_ntb_peer_db_addr, + .peer_db_addr = intel_ntb3_peer_db_addr, .peer_db_set = intel_ntb3_peer_db_set, .spad_is_unsafe = intel_ntb_spad_is_unsafe, .spad_count = intel_ntb_spad_count, diff --git a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c index f6f00354047b..9ae944597708 100644 --- a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c +++ b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c @@ -710,11 +710,16 @@ static u64 switchtec_ntb_db_read_mask(struct ntb_dev *ntb) static int switchtec_ntb_peer_db_addr(struct ntb_dev *ntb, phys_addr_t *db_addr, - resource_size_t *db_size) + resource_size_t *db_size, + u64 *db_data, + int db_bit) { struct switchtec_ntb *sndev = ntb_sndev(ntb); unsigned long offset; + if (unlikely(db_bit >= BITS_PER_LONG_LONG)) + return -EINVAL; + offset = (unsigned long)sndev->mmio_peer_dbmsg->odb - (unsigned long)sndev->stdev->mmio; @@ -724,6 +729,8 @@ static int switchtec_ntb_peer_db_addr(struct ntb_dev *ntb, *db_addr = pci_resource_start(ntb->pdev, 0) + offset; if (db_size) *db_size = sizeof(u32); + if (db_data) + *db_data = BIT_ULL(db_bit) << sndev->db_peer_shift; return 0; } diff --git a/include/linux/ntb.h b/include/linux/ntb.h index 181d16601dd9..56a92e3ae3ae 100644 --- a/include/linux/ntb.h +++ b/include/linux/ntb.h @@ -296,7 +296,8 @@ struct ntb_dev_ops { int (*db_clear_mask)(struct ntb_dev *ntb, u64 db_bits); int (*peer_db_addr)(struct ntb_dev *ntb, - phys_addr_t *db_addr, resource_size_t *db_size); + phys_addr_t *db_addr, resource_size_t *db_size, + u64 *db_data, int db_bit); u64 (*peer_db_read)(struct ntb_dev *ntb); int (*peer_db_set)(struct ntb_dev *ntb, u64 db_bits); int (*peer_db_clear)(struct ntb_dev *ntb, u64 db_bits); @@ -1078,6 +1079,8 @@ static inline int ntb_db_clear_mask(struct ntb_dev *ntb, u64 db_bits) * @ntb: NTB device context. * @db_addr: OUT - The address of the peer doorbell register. * @db_size: OUT - The number of bytes to write the peer doorbell register. + * @db_data: OUT - The data of peer doorbell register + * @db_bit: door bell bit number * * Return the address of the peer doorbell register. This may be used, for * example, by drivers that offload memory copy operations to a dma engine. 
@@ -1091,12 +1094,13 @@ static inline int ntb_db_clear_mask(struct ntb_dev *ntb, u64 db_bits) */ static inline int ntb_peer_db_addr(struct ntb_dev *ntb, phys_addr_t *db_addr, - resource_size_t *db_size) + resource_size_t *db_size, + u64 *db_data, int db_bit) { if (!ntb->ops->peer_db_addr) return -EINVAL; - return ntb->ops->peer_db_addr(ntb, db_addr, db_size); + return ntb->ops->peer_db_addr(ntb, db_addr, db_size, db_data, db_bit); } /** From db610a640eeeb268c36a4558414f28e1c269433e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 24 Jan 2019 17:48:38 -0800 Subject: [PATCH 121/855] f2fs: add quick mode of checkpoint=disable for QA This mode returns mount() quickly with EAGAIN. We can trigger this by shutdown(F2FS_GOING_DOWN_NEED_FSCK). Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 5 +++++ fs/f2fs/f2fs.h | 2 ++ fs/f2fs/file.c | 6 +++--- fs/f2fs/segment.c | 3 +++ fs/f2fs/super.c | 5 +++++ include/linux/f2fs_fs.h | 1 + 6 files changed, 19 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f955cd3e0677..622dca707752 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1259,6 +1259,11 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) else __clear_ckpt_flags(ckpt, CP_DISABLED_FLAG); + if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK)) + __set_ckpt_flags(ckpt, CP_DISABLED_QUICK_FLAG); + else + __clear_ckpt_flags(ckpt, CP_DISABLED_QUICK_FLAG); + if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) __set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); else diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6b6ec5600089..fe95abb05d40 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -191,6 +191,7 @@ enum { #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ #define DEF_DISABLE_INTERVAL 5 /* 5 secs */ +#define DEF_DISABLE_QUICK_INTERVAL 1 /* 1 secs */ #define DEF_UMOUNT_DISCARD_TIMEOUT 5 /* 5 secs */ struct cp_control { @@ -1101,6 +1102,7 @@ enum { SBI_IS_SHUTDOWN, /* shutdown by ioctl */ SBI_IS_RECOVERED, /* recovered orphan/data */ SBI_CP_DISABLED, /* CP was disabled last mount */ + SBI_CP_DISABLED_QUICK, /* CP was disabled quickly */ SBI_QUOTA_NEED_FLUSH, /* need to flush quota info in CP */ SBI_QUOTA_SKIP_FLUSH, /* skip flushing quota in current CP */ SBI_QUOTA_NEED_REPAIR, /* quota file may be corrupted */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 0d461321edfc..fe6f92fbba38 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1972,11 +1972,11 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) break; case F2FS_GOING_DOWN_NEED_FSCK: set_sbi_flag(sbi, SBI_NEED_FSCK); + set_sbi_flag(sbi, SBI_CP_DISABLED_QUICK); + set_sbi_flag(sbi, SBI_IS_DIRTY); /* do checkpoint only */ ret = f2fs_sync_fs(sb, 1); - if (ret) - goto out; - break; + goto out; default: ret = -EINVAL; goto out; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5b2b9be6f28d..342b720fb4db 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -868,6 +868,9 @@ int f2fs_disable_cp_again(struct f2fs_sb_info *sbi) if (holes[DATA] > ovp || holes[NODE] > ovp) return -EAGAIN; + if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK) && + dirty_segments(sbi) > overprovision_segments(sbi)) + return -EAGAIN; return 0; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 24efd76ca151..5e1f8573a17f 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3205,6 +3205,10 @@ try_onemore: if (__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_QUOTA_NEED_FSCK_FLAG)) set_sbi_flag(sbi, 
SBI_QUOTA_NEED_REPAIR); + if (__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_DISABLED_QUICK_FLAG)) { + set_sbi_flag(sbi, SBI_CP_DISABLED_QUICK); + sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_QUICK_INTERVAL; + } /* Initialize device list */ err = f2fs_scan_devices(sbi); @@ -3392,6 +3396,7 @@ skip_recovery: cur_cp_version(F2FS_CKPT(sbi))); f2fs_update_time(sbi, CP_TIME); f2fs_update_time(sbi, REQ_TIME); + clear_sbi_flag(sbi, SBI_CP_DISABLED_QUICK); return 0; free_meta: diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index d7711048ef93..d6befe1f9dc7 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -116,6 +116,7 @@ struct f2fs_super_block { /* * For checkpoint */ +#define CP_DISABLED_QUICK_FLAG 0x00002000 #define CP_DISABLED_FLAG 0x00001000 #define CP_QUOTA_NEED_FSCK_FLAG 0x00000800 #define CP_LARGE_NAT_BITMAP_FLAG 0x00000400 From b86232536c3ec068b0960ffc62c61a872412b2b7 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 25 Jan 2019 09:12:13 -0800 Subject: [PATCH 122/855] f2fs: try to keep CP_TRIMMED_FLAG after successful umount If every discard were issued successfully, we can avoid further discard. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 342b720fb4db..fdd8cd21522f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1063,6 +1063,8 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, } else if (discard_type == DPOLICY_UMOUNT) { dpolicy->max_requests = UINT_MAX; dpolicy->io_aware = false; + /* we need to issue all to keep CP_TRIMMED_FLAG */ + dpolicy->granularity = 1; } } From b460866d27089ffaf915a5a3340b4d8c9f079d33 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 25 Jan 2019 10:26:39 -0800 Subject: [PATCH 123/855] f2fs: don't wake up too frequently, if there is lots of IOs Otherwise, it wakes up discard thread which will sleep again by busy IOs in a loop. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index a77f76f528b6..5c7ed0442d6e 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -865,7 +865,7 @@ static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force) } } mutex_unlock(&dcc->cmd_lock); - if (!wakeup) + if (!wakeup || !is_idle(sbi, DISCARD_TIME)) return; wake_up: dcc->discard_wake = 1; From 11ac8ef8d8c575409dcce3bcf1dda3c9f8c7f5e9 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 25 Jan 2019 12:05:25 -0800 Subject: [PATCH 124/855] f2fs: avoid null pointer exception in dcc_info If dcc_info is not set yet, we can get null pointer panic. 
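The fix below splits the old combined condition so that each level of the optional pointer chain is tested before it is dereferenced. Reduced to its essence, with simplified, made-up structures purely for illustration:

#include <stdbool.h>

/* Hypothetical, simplified structures to illustrate the crash pattern. */
struct discard_cmd_control { int queued_discard; };
struct sm_info { struct discard_cmd_control *dcc_info; };

static bool discard_pending(struct sm_info *sm)
{
	/*
	 * Unsafe form: sm->dcc_info->queued_discard crashes while dcc_info
	 * has not been allocated yet (e.g. during early mount).
	 * Safe form: short-circuit on every link of the chain first.
	 */
	return sm && sm->dcc_info && sm->dcc_info->queued_discard > 0;
}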
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fe95abb05d40..8c928cd72b61 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2161,10 +2161,17 @@ static inline bool is_idle(struct f2fs_sb_info *sbi, int type) get_pages(sbi, F2FS_RD_META) || get_pages(sbi, F2FS_WB_DATA) || get_pages(sbi, F2FS_WB_CP_DATA) || get_pages(sbi, F2FS_DIO_READ) || - get_pages(sbi, F2FS_DIO_WRITE) || - atomic_read(&SM_I(sbi)->dcc_info->queued_discard) || - atomic_read(&SM_I(sbi)->fcc_info->queued_flush)) + get_pages(sbi, F2FS_DIO_WRITE)) return false; + + if (SM_I(sbi) && SM_I(sbi)->dcc_info && + atomic_read(&SM_I(sbi)->dcc_info->queued_discard)) + return false; + + if (SM_I(sbi) && SM_I(sbi)->fcc_info && + atomic_read(&SM_I(sbi)->fcc_info->queued_flush)) + return false; + return f2fs_time_over(sbi, type); } From 0e0667b625cf64243df83171bff61f9d350b9ca5 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sun, 27 Jan 2019 17:59:53 -0800 Subject: [PATCH 125/855] f2fs: flush quota blocks after turnning it off After quota_off, we'll get some dirty blocks. If put_super don't have a chance to flush them by checkpoint, it causes NULL pointer exception in end_io after iput(node_inode). (e.g., by checkpoint=disable) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 5e1f8573a17f..7694cb350734 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2026,6 +2026,12 @@ void f2fs_quota_off_umount(struct super_block *sb) set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); } } + /* + * In case of checkpoint=disable, we must flush quota blocks. + * This can cause NULL exception for node_inode in end_io, since + * put_super already dropped it. + */ + sync_filesystem(sb); } static void f2fs_truncate_quota_inode_pages(struct super_block *sb) From 812a95977fd2f0d1f220c716a98a7f22e22f488d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 22 Jan 2019 14:04:33 -0800 Subject: [PATCH 126/855] f2fs: sync filesystem after roll-forward recovery Some works after roll-forward recovery can get an error which will release all the data structures. Let's flush them in order to make it clean. One possible corruption came from: [ 90.400500] list_del corruption. 
prev->next should be ffffffed1f566208, but was (null) [ 90.675349] Call trace: [ 90.677869] __list_del_entry_valid+0x94/0xb4 [ 90.682351] remove_dirty_inode+0xac/0x114 [ 90.686563] __f2fs_write_data_pages+0x6a8/0x6c8 [ 90.691302] f2fs_write_data_pages+0x40/0x4c [ 90.695695] do_writepages+0x80/0xf0 [ 90.699372] __writeback_single_inode+0xdc/0x4ac [ 90.704113] writeback_sb_inodes+0x280/0x440 [ 90.708501] wb_writeback+0x1b8/0x3d0 [ 90.712267] wb_workfn+0x1a8/0x4d4 [ 90.715765] process_one_work+0x1c0/0x3d4 [ 90.719883] worker_thread+0x224/0x344 [ 90.723739] kthread+0x120/0x130 [ 90.727055] ret_from_fork+0x10/0x18 Reported-by: Sahitya Tummala Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 5 +++-- fs/f2fs/node.c | 4 +++- fs/f2fs/super.c | 44 +++++++++++++++++++++++++++++++++----------- 3 files changed, 39 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 622dca707752..03fea4efd64b 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -306,8 +306,9 @@ static int f2fs_write_meta_pages(struct address_space *mapping, goto skip_write; /* collect a number of dirty meta pages and write together */ - if (wbc->for_kupdate || - get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) + if (wbc->sync_mode != WB_SYNC_ALL && + get_pages(sbi, F2FS_DIRTY_META) < + nr_pages_to_skip(sbi, META)) goto skip_write; /* if locked failed, cp will flush dirty pages instead */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 4f450e573312..f6ff84e29749 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1920,7 +1920,9 @@ static int f2fs_write_node_pages(struct address_space *mapping, f2fs_balance_fs_bg(sbi); /* collect a number of dirty node pages and write together */ - if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE)) + if (wbc->sync_mode != WB_SYNC_ALL && + get_pages(sbi, F2FS_DIRTY_NODES) < + nr_pages_to_skip(sbi, NODE)) goto skip_write; if (wbc->sync_mode == WB_SYNC_ALL) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7694cb350734..384d1c248857 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1458,9 +1458,16 @@ static int f2fs_enable_quotas(struct super_block *sb); static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) { + unsigned int s_flags = sbi->sb->s_flags; struct cp_control cpc; - int err; + int err = 0; + int ret; + if (s_flags & SB_RDONLY) { + f2fs_msg(sbi->sb, KERN_ERR, + "checkpoint=disable on readonly fs"); + return -EINVAL; + } sbi->sb->s_flags |= SB_ACTIVE; f2fs_update_time(sbi, DISABLE_TIME); @@ -1468,18 +1475,24 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) while (!f2fs_time_over(sbi, DISABLE_TIME)) { mutex_lock(&sbi->gc_mutex); err = f2fs_gc(sbi, true, false, NULL_SEGNO); - if (err == -ENODATA) + if (err == -ENODATA) { + err = 0; break; + } if (err && err != -EAGAIN) - return err; + break; } - err = sync_filesystem(sbi->sb); - if (err) - return err; + ret = sync_filesystem(sbi->sb); + if (ret || err) { + err = ret ? 
ret: err; + goto restore_flag; + } - if (f2fs_disable_cp_again(sbi)) - return -EAGAIN; + if (f2fs_disable_cp_again(sbi)) { + err = -EAGAIN; + goto restore_flag; + } mutex_lock(&sbi->gc_mutex); cpc.reason = CP_PAUSE; @@ -1488,7 +1501,9 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) sbi->unusable_block_count = 0; mutex_unlock(&sbi->gc_mutex); - return 0; +restore_flag: + sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + return err; } static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) @@ -3369,7 +3384,7 @@ skip_recovery: if (test_opt(sbi, DISABLE_CHECKPOINT)) { err = f2fs_disable_checkpoint(sbi); if (err) - goto free_meta; + goto sync_free_meta; } else if (is_set_ckpt_flags(sbi, CP_DISABLED_FLAG)) { f2fs_enable_checkpoint(sbi); } @@ -3382,7 +3397,7 @@ skip_recovery: /* After POR, we can run background GC thread.*/ err = f2fs_start_gc_thread(sbi); if (err) - goto free_meta; + goto sync_free_meta; } kvfree(options); @@ -3405,6 +3420,11 @@ skip_recovery: clear_sbi_flag(sbi, SBI_CP_DISABLED_QUICK); return 0; +sync_free_meta: + /* safe to flush all the data */ + sync_filesystem(sbi->sb); + retry = false; + free_meta: #ifdef CONFIG_QUOTA f2fs_truncate_quota_inode_pages(sb); @@ -3418,6 +3438,8 @@ free_meta: * falls into an infinite loop in f2fs_sync_meta_pages(). */ truncate_inode_pages_final(META_MAPPING(sbi)); + /* evict some inodes being cached by GC */ + evict_inodes(sb); f2fs_unregister_sysfs(sbi); free_root_inode: dput(sb->s_root); From eecfa42716e63a99c4d1053b137aa10a3b94c0da Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 25 Jan 2019 20:11:39 +0800 Subject: [PATCH 127/855] f2fs: use xattr_prefix to wrap up Let's use xattr_prefix instead of open code. No logic changes. Signed-off-by: Gao Xiang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 18d5ffbc5e8c..fa620d31ea5f 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -538,7 +538,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) if (!handler || (handler->list && !handler->list(dentry))) continue; - prefix = handler->prefix ?: handler->name; + prefix = xattr_prefix(handler); prefix_len = strlen(prefix); size = prefix_len + entry->e_name_len + 1; if (buffer) { From 1ffdc3807589779626924eb600db784d3d943a08 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 25 Jan 2019 15:35:01 +0800 Subject: [PATCH 128/855] f2fs: fix typos in code comments lengh -> length Signed-off-by: Geliang Tang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/linux/f2fs_fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index d6befe1f9dc7..8d57aaee8166 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -187,7 +187,7 @@ struct f2fs_orphan_block { struct f2fs_extent { __le32 fofs; /* start file offset of the extent */ __le32 blk; /* start block address of the extent */ - __le32 len; /* lengh of the extent */ + __le32 len; /* length of the extent */ } __packed; #define F2FS_NAME_LEN 255 @@ -512,7 +512,7 @@ typedef __le32 f2fs_hash_t; struct f2fs_dir_entry { __le32 hash_code; /* hash code of file name */ __le32 ino; /* inode number */ - __le16 name_len; /* lengh of file name */ + __le16 name_len; /* length of file name */ __u8 file_type; /* file type */ } __packed; From 41a8645ab1c3c37f96955fec3360e123dc06abcd Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 4 
Feb 2019 19:06:18 +1100 Subject: [PATCH 129/855] KVM: PPC: Book3S PR: Add emulation for slbfee. instruction Recent kernels, since commit e15a4fea4dee ("powerpc/64s/hash: Add some SLB debugging tests", 2018-10-03) use the slbfee. instruction, which PR KVM currently does not have code to emulate. Consequently recent kernels fail to boot under PR KVM. This adds emulation of slbfee., enabling these kernels to boot successfully. Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kvm/book3s_32_mmu.c | 1 + arch/powerpc/kvm/book3s_64_mmu.c | 14 ++++++++++++++ arch/powerpc/kvm/book3s_emulate.c | 18 ++++++++++++++++++ 4 files changed, 34 insertions(+) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 0f98f00da2ea..091430339db1 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -377,6 +377,7 @@ struct kvmppc_mmu { void (*slbmte)(struct kvm_vcpu *vcpu, u64 rb, u64 rs); u64 (*slbmfee)(struct kvm_vcpu *vcpu, u64 slb_nr); u64 (*slbmfev)(struct kvm_vcpu *vcpu, u64 slb_nr); + int (*slbfee)(struct kvm_vcpu *vcpu, gva_t eaddr, ulong *ret_slb); void (*slbie)(struct kvm_vcpu *vcpu, u64 slb_nr); void (*slbia)(struct kvm_vcpu *vcpu); /* book3s */ diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c index 612169988a3d..6f789f674048 100644 --- a/arch/powerpc/kvm/book3s_32_mmu.c +++ b/arch/powerpc/kvm/book3s_32_mmu.c @@ -425,6 +425,7 @@ void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu) mmu->slbmte = NULL; mmu->slbmfee = NULL; mmu->slbmfev = NULL; + mmu->slbfee = NULL; mmu->slbie = NULL; mmu->slbia = NULL; } diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index c92dd25bed23..d4b967f0e8d4 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -435,6 +435,19 @@ static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb) kvmppc_mmu_map_segment(vcpu, esid << SID_SHIFT); } +static int kvmppc_mmu_book3s_64_slbfee(struct kvm_vcpu *vcpu, gva_t eaddr, + ulong *ret_slb) +{ + struct kvmppc_slb *slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu, eaddr); + + if (slbe) { + *ret_slb = slbe->origv; + return 0; + } + *ret_slb = 0; + return -ENOENT; +} + static u64 kvmppc_mmu_book3s_64_slbmfee(struct kvm_vcpu *vcpu, u64 slb_nr) { struct kvmppc_slb *slbe; @@ -670,6 +683,7 @@ void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu) mmu->slbmte = kvmppc_mmu_book3s_64_slbmte; mmu->slbmfee = kvmppc_mmu_book3s_64_slbmfee; mmu->slbmfev = kvmppc_mmu_book3s_64_slbmfev; + mmu->slbfee = kvmppc_mmu_book3s_64_slbfee; mmu->slbie = kvmppc_mmu_book3s_64_slbie; mmu->slbia = kvmppc_mmu_book3s_64_slbia; mmu->xlate = kvmppc_mmu_book3s_64_xlate; diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 8c7e933e942e..6ef7c5f00a49 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -47,6 +47,7 @@ #define OP_31_XOP_SLBMFEV 851 #define OP_31_XOP_EIOIO 854 #define OP_31_XOP_SLBMFEE 915 +#define OP_31_XOP_SLBFEE 979 #define OP_31_XOP_TBEGIN 654 #define OP_31_XOP_TABORT 910 @@ -416,6 +417,23 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, vcpu->arch.mmu.slbia(vcpu); break; + case OP_31_XOP_SLBFEE: + if (!(inst & 1) || !vcpu->arch.mmu.slbfee) { + return EMULATE_FAIL; + } else { + ulong b, t; + ulong cr = kvmppc_get_cr(vcpu) & ~CR0_MASK; + + b = kvmppc_get_gpr(vcpu, rb); + if (!vcpu->arch.mmu.slbfee(vcpu, b, &t)) + cr |= 2 << CR0_SHIFT; + 
kvmppc_set_gpr(vcpu, rt, t); + /* copy XER[SO] bit to CR0[SO] */ + cr |= (vcpu->arch.regs.xer & 0x80000000) >> + (31 - CR0_SHIFT); + kvmppc_set_cr(vcpu, cr); + } + break; case OP_31_XOP_SLBMFEE: if (!vcpu->arch.mmu.slbmfee) { emulated = EMULATE_FAIL; From 08434ab4694887effbdcae518f29062274871c8d Mon Sep 17 00:00:00 2001 From: wangbo Date: Mon, 7 Jan 2019 20:15:52 +0800 Subject: [PATCH 130/855] KVM: PPC: Book3S HV: Replace kmalloc_node+memset with kzalloc_node Replace kmalloc_node and memset with kzalloc_node Signed-off-by: wangbo Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 5a066fc299e1..27112b70aa7a 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -5342,13 +5342,11 @@ static int kvm_init_subcore_bitmap(void) continue; sibling_subcore_state = - kmalloc_node(sizeof(struct sibling_subcore_state), + kzalloc_node(sizeof(struct sibling_subcore_state), GFP_KERNEL, node); if (!sibling_subcore_state) return -ENOMEM; - memset(sibling_subcore_state, 0, - sizeof(struct sibling_subcore_state)); for (j = 0; j < threads_per_core; j++) { int cpu = first_cpu + j; From f1adb9c48a01779311aff57d96dc578f91f37eb7 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 11 Jan 2019 12:22:31 +0900 Subject: [PATCH 131/855] KVM: PPC: Remove -I. header search paths The header search path -I. in kernel Makefiles is very suspicious; it allows the compiler to search for headers in the top of $(srctree), where obviously no header file exists. Commit 46f43c6ee022 ("KVM: powerpc: convert marker probes to event trace") first added these options, but they are completely useless. Signed-off-by: Masahiro Yamada Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/Makefile | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 64f1135e7732..3223aec88b2c 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -10,11 +10,6 @@ common-objs-y = $(KVM)/kvm_main.o $(KVM)/eventfd.o common-objs-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o common-objs-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o -CFLAGS_e500_mmu.o := -I. -CFLAGS_e500_mmu_host.o := -I. -CFLAGS_emulate.o := -I. -CFLAGS_emulate_loadstore.o := -I. - common-objs-y += powerpc.o emulate_loadstore.o obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o obj-$(CONFIG_KVM_BOOK3S_HANDLER) += book3s_exports.o From 03f953329bd872b176e825584d8c0b50685f16ee Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 4 Feb 2019 22:07:20 +1100 Subject: [PATCH 132/855] KVM: PPC: Book3S: Allow XICS emulation to work in nested hosts using XIVE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the KVM code assumes that if the host kernel is using the XIVE interrupt controller (the new interrupt controller that first appeared in POWER9 systems), then the in-kernel XICS emulation will use the XIVE hardware to deliver interrupts to the guest. However, this only works when the host is running in hypervisor mode and has full access to all of the XIVE functionality. It doesn't work in any nested virtualization scenario, either with PR KVM or nested-HV KVM, because the XICS-on-XIVE code calls directly into the native-XIVE routines, which are not initialized and cannot function correctly because they use OPAL calls, and OPAL is not available in a guest. 
This means that using the in-kernel XICS emulation in a nested hypervisor that is using XIVE as its interrupt controller will cause a (nested) host kernel crash. To fix this, we change most of the places where the current code calls xive_enabled() to select between the XICS-on-XIVE emulation and the plain XICS emulation to call a new function, xics_on_xive(), which returns false in a guest. However, there is a further twist. The plain XICS emulation has some functions which are used in real mode and access the underlying XICS controller (the interrupt controller of the host) directly. In the case of a nested hypervisor, this means doing XICS hypercalls directly. When the nested host is using XIVE as its interrupt controller, these hypercalls will fail. Therefore this also adds checks in the places where the XICS emulation wants to access the underlying interrupt controller directly, and if that is XIVE, makes the code use the virtual mode fallback paths, which call generic kernel infrastructure rather than doing direct XICS access. Signed-off-by: Paul Mackerras Reviewed-by: Cédric Le Goater Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_ppc.h | 14 ++++++++++++++ arch/powerpc/kvm/book3s.c | 10 +++++----- arch/powerpc/kvm/book3s_hv.c | 16 ++++++++-------- arch/powerpc/kvm/book3s_hv_builtin.c | 14 +++++++------- arch/powerpc/kvm/book3s_hv_rm_xics.c | 7 +++++++ arch/powerpc/kvm/book3s_rtas.c | 8 ++++---- arch/powerpc/kvm/powerpc.c | 4 ++-- 7 files changed, 47 insertions(+), 26 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index eb0d79f0ca45..b3bf4f61b30c 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -36,6 +36,8 @@ #endif #ifdef CONFIG_KVM_BOOK3S_64_HANDLER #include +#include +#include #endif /* @@ -616,6 +618,18 @@ static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 ir static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { } #endif /* CONFIG_KVM_XIVE */ +#ifdef CONFIG_PPC_POWERNV +static inline bool xics_on_xive(void) +{ + return xive_enabled() && cpu_has_feature(CPU_FTR_HVMODE); +} +#else +static inline bool xics_on_xive(void) +{ + return false; +} +#endif + /* * Prototypes for functions called only from assembler code. * Having prototypes reduces sparse errors. 
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index bd1a677dd9e4..601c094f15ab 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -635,7 +635,7 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, r = -ENXIO; break; } - if (xive_enabled()) + if (xics_on_xive()) *val = get_reg_val(id, kvmppc_xive_get_icp(vcpu)); else *val = get_reg_val(id, kvmppc_xics_get_icp(vcpu)); @@ -708,7 +708,7 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, r = -ENXIO; break; } - if (xive_enabled()) + if (xics_on_xive()) r = kvmppc_xive_set_icp(vcpu, set_reg_val(id, *val)); else r = kvmppc_xics_set_icp(vcpu, set_reg_val(id, *val)); @@ -984,7 +984,7 @@ int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hcall) int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, bool line_status) { - if (xive_enabled()) + if (xics_on_xive()) return kvmppc_xive_set_irq(kvm, irq_source_id, irq, level, line_status); else @@ -1037,7 +1037,7 @@ static int kvmppc_book3s_init(void) #ifdef CONFIG_KVM_XICS #ifdef CONFIG_KVM_XIVE - if (xive_enabled()) { + if (xics_on_xive()) { kvmppc_xive_init_module(); kvm_register_device_ops(&kvm_xive_ops, KVM_DEV_TYPE_XICS); } else @@ -1050,7 +1050,7 @@ static int kvmppc_book3s_init(void) static void kvmppc_book3s_exit(void) { #ifdef CONFIG_KVM_XICS - if (xive_enabled()) + if (xics_on_xive()) kvmppc_xive_exit_module(); #endif #ifdef CONFIG_KVM_BOOK3S_32_HANDLER diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 27112b70aa7a..1860c0bc1395 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -922,7 +922,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) case H_IPOLL: case H_XIRR_X: if (kvmppc_xics_enabled(vcpu)) { - if (xive_enabled()) { + if (xics_on_xive()) { ret = H_NOT_AVAILABLE; return RESUME_GUEST; } @@ -1431,7 +1431,7 @@ static int kvmppc_handle_nested_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) case BOOK3S_INTERRUPT_HV_RM_HARD: vcpu->arch.trap = 0; r = RESUME_GUEST; - if (!xive_enabled()) + if (!xics_on_xive()) kvmppc_xics_rm_complete(vcpu, 0); break; default: @@ -3649,7 +3649,7 @@ static void shrink_halt_poll_ns(struct kvmppc_vcore *vc) #ifdef CONFIG_KVM_XICS static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu) { - if (!xive_enabled()) + if (!xics_on_xive()) return false; return vcpu->arch.irq_pending || vcpu->arch.xive_saved_state.pipr < vcpu->arch.xive_saved_state.cppr; @@ -4209,7 +4209,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); srcu_read_unlock(&kvm->srcu, srcu_idx); } else if (r == RESUME_PASSTHROUGH) { - if (WARN_ON(xive_enabled())) + if (WARN_ON(xics_on_xive())) r = H_SUCCESS; else r = kvmppc_xics_rm_complete(vcpu, 0); @@ -4733,7 +4733,7 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) * If xive is enabled, we route 0x500 interrupts directly * to the guest. 
*/ - if (xive_enabled()) + if (xics_on_xive()) lpcr |= LPCR_LPES; } @@ -4969,7 +4969,7 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) if (i == pimap->n_mapped) pimap->n_mapped++; - if (xive_enabled()) + if (xics_on_xive()) rc = kvmppc_xive_set_mapped(kvm, guest_gsi, desc); else kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq); @@ -5010,7 +5010,7 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) return -ENODEV; } - if (xive_enabled()) + if (xics_on_xive()) rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, pimap->mapped[i].desc); else kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq); @@ -5387,7 +5387,7 @@ static int kvmppc_book3s_init_hv(void) * indirectly, via OPAL. */ #ifdef CONFIG_SMP - if (!xive_enabled() && !kvmhv_on_pseries() && + if (!xics_on_xive() && !kvmhv_on_pseries() && !local_paca->kvm_hstate.xics_phys) { struct device_node *np; diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index a71e2fc00a4e..b0cf22477e87 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -257,7 +257,7 @@ void kvmhv_rm_send_ipi(int cpu) } /* We should never reach this */ - if (WARN_ON_ONCE(xive_enabled())) + if (WARN_ON_ONCE(xics_on_xive())) return; /* Else poke the target with an IPI */ @@ -577,7 +577,7 @@ unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu) { if (!kvmppc_xics_enabled(vcpu)) return H_TOO_HARD; - if (xive_enabled()) { + if (xics_on_xive()) { if (is_rm()) return xive_rm_h_xirr(vcpu); if (unlikely(!__xive_vm_h_xirr)) @@ -592,7 +592,7 @@ unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu) if (!kvmppc_xics_enabled(vcpu)) return H_TOO_HARD; vcpu->arch.regs.gpr[5] = get_tb(); - if (xive_enabled()) { + if (xics_on_xive()) { if (is_rm()) return xive_rm_h_xirr(vcpu); if (unlikely(!__xive_vm_h_xirr)) @@ -606,7 +606,7 @@ unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server) { if (!kvmppc_xics_enabled(vcpu)) return H_TOO_HARD; - if (xive_enabled()) { + if (xics_on_xive()) { if (is_rm()) return xive_rm_h_ipoll(vcpu, server); if (unlikely(!__xive_vm_h_ipoll)) @@ -621,7 +621,7 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, { if (!kvmppc_xics_enabled(vcpu)) return H_TOO_HARD; - if (xive_enabled()) { + if (xics_on_xive()) { if (is_rm()) return xive_rm_h_ipi(vcpu, server, mfrr); if (unlikely(!__xive_vm_h_ipi)) @@ -635,7 +635,7 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) { if (!kvmppc_xics_enabled(vcpu)) return H_TOO_HARD; - if (xive_enabled()) { + if (xics_on_xive()) { if (is_rm()) return xive_rm_h_cppr(vcpu, cppr); if (unlikely(!__xive_vm_h_cppr)) @@ -649,7 +649,7 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) { if (!kvmppc_xics_enabled(vcpu)) return H_TOO_HARD; - if (xive_enabled()) { + if (xics_on_xive()) { if (is_rm()) return xive_rm_h_eoi(vcpu, xirr); if (unlikely(!__xive_vm_h_eoi)) diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index b3f5786b20dc..3b9662a4207e 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -144,6 +144,13 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, return; } + if (xive_enabled() && kvmhv_on_pseries()) { + /* No XICS access or hypercalls available, too hard */ + this_icp->rm_action |= XICS_RM_KICK_VCPU; + this_icp->rm_kick_target = vcpu; + return; + } + /* * Check if the core is loaded, * if not, find an available host 
core to post to wake the VCPU, diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c index 2d3b2b1cc272..4e178c4c1ea5 100644 --- a/arch/powerpc/kvm/book3s_rtas.c +++ b/arch/powerpc/kvm/book3s_rtas.c @@ -33,7 +33,7 @@ static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, struct rtas_args *args) server = be32_to_cpu(args->args[1]); priority = be32_to_cpu(args->args[2]); - if (xive_enabled()) + if (xics_on_xive()) rc = kvmppc_xive_set_xive(vcpu->kvm, irq, server, priority); else rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority); @@ -56,7 +56,7 @@ static void kvm_rtas_get_xive(struct kvm_vcpu *vcpu, struct rtas_args *args) irq = be32_to_cpu(args->args[0]); server = priority = 0; - if (xive_enabled()) + if (xics_on_xive()) rc = kvmppc_xive_get_xive(vcpu->kvm, irq, &server, &priority); else rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority); @@ -83,7 +83,7 @@ static void kvm_rtas_int_off(struct kvm_vcpu *vcpu, struct rtas_args *args) irq = be32_to_cpu(args->args[0]); - if (xive_enabled()) + if (xics_on_xive()) rc = kvmppc_xive_int_off(vcpu->kvm, irq); else rc = kvmppc_xics_int_off(vcpu->kvm, irq); @@ -105,7 +105,7 @@ static void kvm_rtas_int_on(struct kvm_vcpu *vcpu, struct rtas_args *args) irq = be32_to_cpu(args->args[0]); - if (xive_enabled()) + if (xics_on_xive()) rc = kvmppc_xive_int_on(vcpu->kvm, irq); else rc = kvmppc_xics_int_on(vcpu->kvm, irq); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index b90a7d154180..8c69af10f91d 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -748,7 +748,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) kvmppc_mpic_disconnect_vcpu(vcpu->arch.mpic, vcpu); break; case KVMPPC_IRQ_XICS: - if (xive_enabled()) + if (xics_on_xive()) kvmppc_xive_cleanup_vcpu(vcpu); else kvmppc_xics_free_icp(vcpu); @@ -1931,7 +1931,7 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, r = -EPERM; dev = kvm_device_from_filp(f.file); if (dev) { - if (xive_enabled()) + if (xics_on_xive()) r = kvmppc_xive_connect_vcpu(dev, vcpu, cap->args[1]); else r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]); From 1b6422574e2de4f3164d73c38d40539e19c88849 Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Thu, 7 Feb 2019 15:56:50 +1100 Subject: [PATCH 133/855] KVM: PPC: Book3S HV: Optimise mmio emulation for devices on FAST_MMIO_BUS Devices on the KVM_FAST_MMIO_BUS by definition have length zero and are thus used for notification purposes rather than data transfer. For example eventfd for virtio devices. This means that when emulating mmio instructions which target devices on this bus we can immediately handle them and return without needing to load the instruction from guest memory. For now we restrict this to stores as this is the only use case at present. For a normal guest the effect is negligible, however for a nested guest we save on the order of 5us per access. 
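For context, a device typically ends up on the KVM_FAST_MMIO_BUS when userspace registers a zero-length ioeventfd for a guest physical address. The following is only an illustrative userspace sketch (the VM fd handling and the doorbell address are made up), not part of this series:

#include <linux/kvm.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>

/* Hypothetical doorbell address, for illustration only. */
#define DOORBELL_GPA	0x10000000ULL

static int register_fast_mmio_doorbell(int vm_fd)
{
	struct kvm_ioeventfd io = {
		.addr	= DOORBELL_GPA,
		.len	= 0,			/* len == 0 => KVM_FAST_MMIO_BUS */
		.fd	= eventfd(0, 0),	/* signalled on every guest store */
		.flags	= 0,
	};

	if (io.fd < 0)
		return -1;

	/* Stores to DOORBELL_GPA can now take the fast path added below. */
	return ioctl(vm_fd, KVM_IOEVENTFD, &io);
}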
Signed-off-by: Suraj Jitindar Singh Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index bd2dcfbf00cd..be7bc070eae5 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -441,6 +441,24 @@ int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu, { u32 last_inst; + /* + * Fast path - check if the guest physical address corresponds to a + * device on the FAST_MMIO_BUS, if so we can avoid loading the + * instruction all together, then we can just handle it and return. + */ + if (is_store) { + int idx, ret; + + idx = srcu_read_lock(&vcpu->kvm->srcu); + ret = kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, (gpa_t) gpa, 0, + NULL); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + if (!ret) { + kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4); + return RESUME_GUEST; + } + } + /* * If we fail, we just return to the guest and try executing it again. */ From a67614cc05a5052b265ea48196dab2fce11f5f2e Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Tue, 12 Feb 2019 15:37:45 +1100 Subject: [PATCH 134/855] KVM: PPC: Release all hardware TCE tables attached to a group The SPAPR TCE KVM device references all hardware IOMMU tables assigned to some IOMMU group to ensure that in-kernel KVM acceleration of H_PUT_TCE can work. The tables are references when an IOMMU group gets registered with the VFIO KVM device by the KVM_DEV_VFIO_GROUP_ADD ioctl; KVM_DEV_VFIO_GROUP_DEL calls into the dereferencing code in kvm_spapr_tce_release_iommu_group() which walks through the list of LIOBNs, finds a matching IOMMU table and calls kref_put() when found. However that code stops after the very first successful derefencing leaving other tables referenced till the SPAPR TCE KVM device is destroyed which normally happens on guest reboot or termination so if we do hotplug and unplug in a loop, we are leaking IOMMU tables here. This removes a premature return to let kvm_spapr_tce_release_iommu_group() find and dereference all attached tables. Fixes: 121f80ba68f ("KVM: PPC: VFIO: Add in-kernel acceleration for VFIO") Signed-off-by: Alexey Kardashevskiy Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_vio.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 532ab79734c7..6630dde56668 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -133,7 +133,6 @@ extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm, continue; kref_put(&stit->kref, kvm_spapr_tce_liobn_put); - return; } } } From 8f1f7b9bedbce8d84e0b6b8beac671a6bc8f02c9 Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Tue, 19 Feb 2019 14:53:45 +1100 Subject: [PATCH 135/855] KVM: PPC: Book3S HV: Add KVM stat largepages_[2M/1G] This adds an entry to the kvm_stats_debugfs directory which provides the number of large (2M or 1G) pages which have been used to setup the guest mappings, for radix guests. 
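The accounting itself reduces to bumping the matching counter whenever a mapping is inserted or removed at PMD or PUD level; a condensed sketch of the idea (the helper name is invented here, the real changes are open-coded in the hunks below):

/* Illustrative only: 2M pages map at PMD level, 1G pages at PUD level. */
static void kvmhv_account_large_page(struct kvm *kvm, unsigned int shift,
				     long delta)
{
	if (shift == PMD_SHIFT)			/* 2M mapping */
		kvm->stat.num_2M_pages += delta;
	else if (shift == PUD_SHIFT)		/* 1G mapping */
		kvm->stat.num_1G_pages += delta;
}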
Signed-off-by: Suraj Jitindar Singh Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_host.h | 2 ++ arch/powerpc/kvm/book3s.c | 3 +++ arch/powerpc/kvm/book3s_64_mmu_radix.c | 15 ++++++++++++++- 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 091430339db1..4af498a53905 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -99,6 +99,8 @@ struct kvm_nested_guest; struct kvm_vm_stat { ulong remote_tlb_flush; + ulong num_2M_pages; + ulong num_1G_pages; }; struct kvm_vcpu_stat { diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 601c094f15ab..22a46c64536b 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -39,6 +39,7 @@ #include "book3s.h" #include "trace.h" +#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU /* #define EXIT_DEBUG */ @@ -71,6 +72,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "pthru_all", VCPU_STAT(pthru_all) }, { "pthru_host", VCPU_STAT(pthru_host) }, { "pthru_bad_aff", VCPU_STAT(pthru_bad_aff) }, + { "largepages_2M", VM_STAT(num_2M_pages) }, + { "largepages_1G", VM_STAT(num_1G_pages) }, { NULL } }; diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 1b821c6efdef..f55ef071883f 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -403,8 +403,13 @@ void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa, if (!memslot) return; } - if (shift) + if (shift) { /* 1GB or 2MB page */ page_size = 1ul << shift; + if (shift == PMD_SHIFT) + kvm->stat.num_2M_pages--; + else if (shift == PUD_SHIFT) + kvm->stat.num_1G_pages--; + } gpa &= ~(page_size - 1); hpa = old & PTE_RPN_MASK; @@ -878,6 +883,14 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, put_page(page); } + /* Increment number of large pages if we (successfully) inserted one */ + if (!ret) { + if (level == 1) + kvm->stat.num_2M_pages++; + else if (level == 2) + kvm->stat.num_1G_pages++; + } + return ret; } From ee7930490a8f84570e96268f5736e99570b58307 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 6 Jul 2018 09:11:50 +0100 Subject: [PATCH 136/855] clocksource/arm_arch_timer: Store physical timer IRQ number for KVM on VHE A host running in VHE mode gets the EL2 physical timer as its time source (accessed using the EL1 sysreg accessors, which get re-directed to the EL2 sysregs by VHE). The EL1 physical timer remains unused by the host kernel, allowing us to pass that on directly to a KVM guest and saves us from emulating this timer for the guest on VHE systems. Store the EL1 Physical Timer's IRQ number in struct arch_timer_kvm_info on VHE systems to allow KVM to use it. 
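On the consumer side, KVM can then pick the IRQ up through the existing arch_timer_get_kvm_info() accessor. A rough sketch of such a caller (the function name is invented here purely for illustration):

#include <linux/errno.h>
#include <clocksource/arm_arch_timer.h>

/* Illustrative consumer only, not part of this patch. */
static int probe_host_ptimer_irq(void)
{
	struct arch_timer_kvm_info *info = arch_timer_get_kvm_info();

	/* physical_irq is only populated when the host runs in VHE mode. */
	if (info->physical_irq <= 0)
		return -ENODEV;

	return info->physical_irq;
}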
Acked-by: Daniel Lezcano Signed-off-by: Andre Przywara Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- drivers/clocksource/arm_arch_timer.c | 11 +++++++++-- include/clocksource/arm_arch_timer.h | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index 9a7d4dc00b6e..b9243e2328b4 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -1206,6 +1206,13 @@ static enum arch_timer_ppi_nr __init arch_timer_select_ppi(void) return ARCH_TIMER_PHYS_SECURE_PPI; } +static void __init arch_timer_populate_kvm_info(void) +{ + arch_timer_kvm_info.virtual_irq = arch_timer_ppi[ARCH_TIMER_VIRT_PPI]; + if (is_kernel_in_hyp_mode()) + arch_timer_kvm_info.physical_irq = arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI]; +} + static int __init arch_timer_of_init(struct device_node *np) { int i, ret; @@ -1220,7 +1227,7 @@ static int __init arch_timer_of_init(struct device_node *np) for (i = ARCH_TIMER_PHYS_SECURE_PPI; i < ARCH_TIMER_MAX_TIMER_PPI; i++) arch_timer_ppi[i] = irq_of_parse_and_map(np, i); - arch_timer_kvm_info.virtual_irq = arch_timer_ppi[ARCH_TIMER_VIRT_PPI]; + arch_timer_populate_kvm_info(); rate = arch_timer_get_cntfrq(); arch_timer_of_configure_rate(rate, np); @@ -1550,7 +1557,7 @@ static int __init arch_timer_acpi_init(struct acpi_table_header *table) arch_timer_ppi[ARCH_TIMER_HYP_PPI] = acpi_gtdt_map_ppi(ARCH_TIMER_HYP_PPI); - arch_timer_kvm_info.virtual_irq = arch_timer_ppi[ARCH_TIMER_VIRT_PPI]; + arch_timer_populate_kvm_info(); /* * When probing via ACPI, we have no mechanism to override the sysreg diff --git a/include/clocksource/arm_arch_timer.h b/include/clocksource/arm_arch_timer.h index 349e5957c949..702967d996bb 100644 --- a/include/clocksource/arm_arch_timer.h +++ b/include/clocksource/arm_arch_timer.h @@ -74,6 +74,7 @@ enum arch_timer_spi_nr { struct arch_timer_kvm_info { struct timecounter timecounter; int virtual_irq; + int physical_irq; }; struct arch_timer_mem_frame { From 7aa8d14641651a61a0b8892314a0bcfaceebe158 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 5 Jan 2019 15:49:50 +0000 Subject: [PATCH 137/855] arm/arm64: KVM: Introduce kvm_call_hyp_ret() Until now, we haven't differentiated between HYP calls that have a return value and those who don't. As we're about to change this, introduce kvm_call_hyp_ret(), and change all call sites that actually make use of a return value. Signed-off-by: Marc Zyngier Acked-by: Christoffer Dall Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_host.h | 3 +++ arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/debug.c | 2 +- virt/kvm/arm/arm.c | 2 +- virt/kvm/arm/vgic/vgic-v3.c | 4 ++-- 5 files changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index ca56537b61bc..023c9f2b1eea 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -214,7 +214,10 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu); int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices); int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); + unsigned long kvm_call_hyp(void *hypfn, ...); +#define kvm_call_hyp_ret(f, ...) 
kvm_call_hyp(f, ##__VA_ARGS__) + void force_vm_exit(const cpumask_t *mask); int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 7732d0ba4e60..e54cb7c88a4e 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -371,6 +371,7 @@ void kvm_arm_resume_guest(struct kvm *kvm); u64 __kvm_call_hyp(void *hypfn, ...); #define kvm_call_hyp(f, ...) __kvm_call_hyp(kvm_ksym_ref(f), ##__VA_ARGS__) +#define kvm_call_hyp_ret(f, ...) kvm_call_hyp(f, ##__VA_ARGS__) void force_vm_exit(const cpumask_t *mask); void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index f39801e4136c..fd917d6d12af 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -76,7 +76,7 @@ static void restore_guest_debug_regs(struct kvm_vcpu *vcpu) void kvm_arm_init_debug(void) { - __this_cpu_write(mdcr_el2, kvm_call_hyp(__kvm_get_mdcr_el2)); + __this_cpu_write(mdcr_el2, kvm_call_hyp_ret(__kvm_get_mdcr_el2)); } /** diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 9e350fd34504..4d55f98f97f7 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -765,7 +765,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) ret = kvm_vcpu_run_vhe(vcpu); kvm_arm_vhe_guest_exit(); } else { - ret = kvm_call_hyp(__kvm_vcpu_run_nvhe, vcpu); + ret = kvm_call_hyp_ret(__kvm_vcpu_run_nvhe, vcpu); } vcpu->mode = OUTSIDE_GUEST_MODE; diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index 9c0dd234ebe8..67f98151c88d 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -589,7 +589,7 @@ early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable); */ int vgic_v3_probe(const struct gic_kvm_info *info) { - u32 ich_vtr_el2 = kvm_call_hyp(__vgic_v3_get_ich_vtr_el2); + u32 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_ich_vtr_el2); int ret; /* @@ -679,7 +679,7 @@ void vgic_v3_put(struct kvm_vcpu *vcpu) struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; if (likely(cpu_if->vgic_sre)) - cpu_if->vgic_vmcr = kvm_call_hyp(__vgic_v3_read_vmcr); + cpu_if->vgic_vmcr = kvm_call_hyp_ret(__vgic_v3_read_vmcr); kvm_call_hyp(__vgic_v3_save_aprs, vcpu); From 18fc7bf8e041272f74a3237b99261d59c3ff0388 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 5 Jan 2019 15:57:56 +0000 Subject: [PATCH 138/855] arm64: KVM: Allow for direct call of HYP functions when using VHE When running VHE, there is no need to jump via some stub to perform a "HYP" function call, as there is a single address space. Let's thus change kvm_call_hyp() and co to perform a direct call in this case. Although this results in a bit of code expansion, it allows the compiler to check for type compatibility, something that we are missing so far. Signed-off-by: Marc Zyngier Acked-by: Christoffer Dall Signed-off-by: Christoffer Dall --- arch/arm64/include/asm/kvm_host.h | 32 +++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e54cb7c88a4e..8b7702bdb219 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -370,8 +370,36 @@ void kvm_arm_halt_guest(struct kvm *kvm); void kvm_arm_resume_guest(struct kvm *kvm); u64 __kvm_call_hyp(void *hypfn, ...); -#define kvm_call_hyp(f, ...) __kvm_call_hyp(kvm_ksym_ref(f), ##__VA_ARGS__) -#define kvm_call_hyp_ret(f, ...) 
kvm_call_hyp(f, ##__VA_ARGS__) + +/* + * The couple of isb() below are there to guarantee the same behaviour + * on VHE as on !VHE, where the eret to EL1 acts as a context + * synchronization event. + */ +#define kvm_call_hyp(f, ...) \ + do { \ + if (has_vhe()) { \ + f(__VA_ARGS__); \ + isb(); \ + } else { \ + __kvm_call_hyp(kvm_ksym_ref(f), ##__VA_ARGS__); \ + } \ + } while(0) + +#define kvm_call_hyp_ret(f, ...) \ + ({ \ + typeof(f(__VA_ARGS__)) ret; \ + \ + if (has_vhe()) { \ + ret = f(__VA_ARGS__); \ + isb(); \ + } else { \ + ret = __kvm_call_hyp(kvm_ksym_ref(f), \ + ##__VA_ARGS__); \ + } \ + \ + ret; \ + }) void force_vm_exit(const cpumask_t *mask); void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); From 7cba8a8d0d39c8846d0095fc2d2225c7983f4009 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 5 Jan 2019 16:06:53 +0000 Subject: [PATCH 139/855] arm64: KVM: Drop VHE-specific HYP call stub We now call VHE code directly, without going through any central dispatching function. Let's drop that code. Signed-off-by: Marc Zyngier Acked-by: Christoffer Dall Signed-off-by: Christoffer Dall --- arch/arm64/kvm/hyp.S | 3 --- arch/arm64/kvm/hyp/hyp-entry.S | 12 ------------ 2 files changed, 15 deletions(-) diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S index 952f6cb9cf72..2845aa680841 100644 --- a/arch/arm64/kvm/hyp.S +++ b/arch/arm64/kvm/hyp.S @@ -40,9 +40,6 @@ * arch/arm64/kernel/hyp_stub.S. */ ENTRY(__kvm_call_hyp) -alternative_if_not ARM64_HAS_VIRT_HOST_EXTN hvc #0 ret -alternative_else_nop_endif - b __vhe_hyp_call ENDPROC(__kvm_call_hyp) diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 73c1b483ec39..2b1e686772bf 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -43,18 +43,6 @@ ldr lr, [sp], #16 .endm -ENTRY(__vhe_hyp_call) - do_el2_call - /* - * We used to rely on having an exception return to get - * an implicit isb. In the E2H case, we don't have it anymore. - * rather than changing all the leaf functions, just do it here - * before returning to the rest of the kernel. - */ - isb - ret -ENDPROC(__vhe_hyp_call) - el1_sync: // Guest trapped into EL2 mrs x0, esr_el2 From d18232ea8a9480606c53e91d7e2d062c3c151815 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 11 Jan 2019 14:57:58 +0000 Subject: [PATCH 140/855] ARM: KVM: Teach some form of type-safety to kvm_call_hyp Just like on arm64, and for the same reasons, kvm_call_hyp removes any form of type safety when calling into HYP. But we can still try to tell the compiler what we're trying to achieve. Here, we can add code that would do the function call if it wasn't guarded by an always-false predicate. Hopefully, the compiler is dumb enough to do the type checking and clever enough to not emit the corresponding code... 
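The trick is easier to see stripped down to standalone C: route the real call through an untyped trampoline, but keep a properly typed call in view of the compiler behind a predicate that is always false there (has_vhe() in the diff below). Everything in this sketch is invented for illustration and only models the shape of the macro:

#include <stdio.h>

static unsigned long untyped_trampoline(void *fn, ...)
{
        (void)fn;               /* stand-in for the real HVC-based dispatch */
        return 0;
}

static int always_false(void)
{
        return 0;
}

#define checked_call(f, ...)                                            \
        do {                                                            \
                if (always_false())                                     \
                        f(__VA_ARGS__); /* type-checked, never taken */ \
                else                                                    \
                        untyped_trampoline((void *)(f), ##__VA_ARGS__); \
        } while (0)

static void takes_int(int x)
{
        printf("takes_int(%d)\n", x);
}

int main(void)
{
        checked_call(takes_int, 42);             /* compiles cleanly */
        /* checked_call(takes_int, "oops"); would now be diagnosed */
        return 0;
}

Passing a wrongly typed argument to checked_call() now draws a compile-time diagnostic even though the emitted code still goes through the untyped path.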
Signed-off-by: Marc Zyngier Acked-by: Christoffer Dall Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_host.h | 31 ++++++++++++++++++++++++++++--- arch/arm/kvm/hyp/hyp-entry.S | 2 +- arch/arm/kvm/interrupts.S | 4 ++-- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 023c9f2b1eea..4b6193f2f0f6 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -215,8 +215,33 @@ int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices); int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); -unsigned long kvm_call_hyp(void *hypfn, ...); -#define kvm_call_hyp_ret(f, ...) kvm_call_hyp(f, ##__VA_ARGS__) +unsigned long __kvm_call_hyp(void *hypfn, ...); + +/* + * The has_vhe() part doesn't get emitted, but is used for type-checking. + */ +#define kvm_call_hyp(f, ...) \ + do { \ + if (has_vhe()) { \ + f(__VA_ARGS__); \ + } else { \ + __kvm_call_hyp(kvm_ksym_ref(f), ##__VA_ARGS__); \ + } \ + } while(0) + +#define kvm_call_hyp_ret(f, ...) \ + ({ \ + typeof(f(__VA_ARGS__)) ret; \ + \ + if (has_vhe()) { \ + ret = f(__VA_ARGS__); \ + } else { \ + ret = __kvm_call_hyp(kvm_ksym_ref(f), \ + ##__VA_ARGS__); \ + } \ + \ + ret; \ + }) void force_vm_exit(const cpumask_t *mask); int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, @@ -268,7 +293,7 @@ static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr, * compliant with the PCS!). */ - kvm_call_hyp((void*)hyp_stack_ptr, vector_ptr, pgd_ptr); + __kvm_call_hyp((void*)hyp_stack_ptr, vector_ptr, pgd_ptr); } static inline void __cpu_init_stage2(void) diff --git a/arch/arm/kvm/hyp/hyp-entry.S b/arch/arm/kvm/hyp/hyp-entry.S index aa3f9a9837ac..6ed3cf23fe89 100644 --- a/arch/arm/kvm/hyp/hyp-entry.S +++ b/arch/arm/kvm/hyp/hyp-entry.S @@ -176,7 +176,7 @@ THUMB( orr lr, lr, #PSR_T_BIT ) msr spsr_cxsf, lr ldr lr, =panic msr ELR_hyp, lr - ldr lr, =kvm_call_hyp + ldr lr, =__kvm_call_hyp clrex eret ENDPROC(__hyp_do_panic) diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index 80a1d6cd261c..a08e6419ebe9 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -42,7 +42,7 @@ * r12: caller save * rest: callee save */ -ENTRY(kvm_call_hyp) +ENTRY(__kvm_call_hyp) hvc #0 bx lr -ENDPROC(kvm_call_hyp) +ENDPROC(__kvm_call_hyp) From 32f139551954512bfdf9d558341af453bb8b12b4 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 19 Jan 2019 15:29:54 +0000 Subject: [PATCH 141/855] arm/arm64: KVM: Statically configure the host's view of MPIDR We currently eagerly save/restore MPIDR. It turns out to be slightly pointless: - On the host, this value is known as soon as we're scheduled on a physical CPU - In the guest, this value cannot change, as it is set by KVM (and this is a read-only register) The result of the above is that we can perfectly avoid the eager saving of MPIDR_EL1, and only keep the restore. We just have to setup the host contexts appropriately at boot time. 
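The underlying pattern (initialise an immutable field once at boot, never save it again on context switch) can be sketched in a few lines of standalone C; the names and the fake hardware accessors below are invented for the illustration:

#include <stdio.h>

struct cpu_context {
        unsigned long mpidr;    /* immutable identity of the physical CPU */
        unsigned long tpidr;    /* example of state that really does change */
};

/* invented stand-ins for the hardware register accessors */
static unsigned long read_cpu_id(int cpu)      { return 0x80000000UL | (unsigned)cpu; }
static unsigned long read_thread_pointer(void) { return 0x1234; }

static void init_host_context(struct cpu_context *ctxt, int cpu)
{
        /* done once at boot; the value can never change afterwards */
        ctxt->mpidr = read_cpu_id(cpu);
}

static void save_host_context(struct cpu_context *ctxt)
{
        /* only mutable state is saved on every switch; mpidr is not touched */
        ctxt->tpidr = read_thread_pointer();
}

int main(void)
{
        struct cpu_context ctxt = { 0 };

        init_host_context(&ctxt, 3);
        save_host_context(&ctxt);
        printf("mpidr=%#lx tpidr=%#lx\n", ctxt.mpidr, ctxt.tpidr);
        return 0;
}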
Signed-off-by: Marc Zyngier Acked-by: Christoffer Dall Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_host.h | 8 ++++++++ arch/arm/kvm/hyp/cp15-sr.c | 1 - arch/arm64/include/asm/kvm_host.h | 8 ++++++++ arch/arm64/kvm/hyp/sysreg-sr.c | 1 - virt/kvm/arm/arm.c | 1 + 5 files changed, 17 insertions(+), 2 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 4b6193f2f0f6..43e343e00fb8 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -26,6 +26,7 @@ #include #include #include +#include #include #define __KVM_HAVE_ARCH_INTC_INITIALIZED @@ -147,6 +148,13 @@ struct kvm_cpu_context { typedef struct kvm_cpu_context kvm_cpu_context_t; +static inline void kvm_init_host_cpu_context(kvm_cpu_context_t *cpu_ctxt, + int cpu) +{ + /* The host's MPIDR is immutable, so let's set it up at boot time */ + cpu_ctxt->cp15[c0_MPIDR] = cpu_logical_map(cpu); +} + struct kvm_vcpu_arch { struct kvm_cpu_context ctxt; diff --git a/arch/arm/kvm/hyp/cp15-sr.c b/arch/arm/kvm/hyp/cp15-sr.c index c4782812714c..8bf895ec6e04 100644 --- a/arch/arm/kvm/hyp/cp15-sr.c +++ b/arch/arm/kvm/hyp/cp15-sr.c @@ -27,7 +27,6 @@ static u64 *cp15_64(struct kvm_cpu_context *ctxt, int idx) void __hyp_text __sysreg_save_state(struct kvm_cpu_context *ctxt) { - ctxt->cp15[c0_MPIDR] = read_sysreg(VMPIDR); ctxt->cp15[c0_CSSELR] = read_sysreg(CSSELR); ctxt->cp15[c1_SCTLR] = read_sysreg(SCTLR); ctxt->cp15[c1_CPACR] = read_sysreg(CPACR); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 8b7702bdb219..f497bb31031f 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #define __KVM_HAVE_ARCH_INTC_INITIALIZED @@ -418,6 +419,13 @@ struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr); DECLARE_PER_CPU(kvm_cpu_context_t, kvm_host_cpu_state); +static inline void kvm_init_host_cpu_context(kvm_cpu_context_t *cpu_ctxt, + int cpu) +{ + /* The host's MPIDR is immutable, so let's set it up at boot time */ + cpu_ctxt->sys_regs[MPIDR_EL1] = cpu_logical_map(cpu); +} + void __kvm_enable_ssbs(void); static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr, diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c index 68d6f7c3b237..2498f86defcb 100644 --- a/arch/arm64/kvm/hyp/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/sysreg-sr.c @@ -52,7 +52,6 @@ static void __hyp_text __sysreg_save_user_state(struct kvm_cpu_context *ctxt) static void __hyp_text __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) { - ctxt->sys_regs[MPIDR_EL1] = read_sysreg(vmpidr_el2); ctxt->sys_regs[CSSELR_EL1] = read_sysreg(csselr_el1); ctxt->sys_regs[SCTLR_EL1] = read_sysreg_el1(sctlr); ctxt->sys_regs[ACTLR_EL1] = read_sysreg(actlr_el1); diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 4d55f98f97f7..3dd240ea9e76 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -1561,6 +1561,7 @@ static int init_hyp_mode(void) kvm_cpu_context_t *cpu_ctxt; cpu_ctxt = per_cpu_ptr(&kvm_host_cpu_state, cpu); + kvm_init_host_cpu_context(cpu_ctxt, cpu); err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1, PAGE_HYP); if (err) { From e329fb75d519e3dc3eb11b22d5bb846516be3521 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 11 Dec 2018 15:26:31 +0100 Subject: [PATCH 142/855] KVM: arm/arm64: Factor out VMID into struct kvm_vmid In preparation for nested virtualization where we are going to have more than a single VMID per VM, let's factor 
out the VMID data into a separate VMID data structure and change the VMID allocator to operate on this new structure instead of using a struct kvm. This also means that udate_vttbr now becomes update_vmid, and that the vttbr itself is generated on the fly based on the stage 2 page table base address and the vmid. We cache the physical address of the pgd when allocating the pgd to avoid doing the calculation on every entry to the guest and to avoid calling into potentially non-hyp-mapped code from hyp/EL2. If we wanted to merge the VMID allocator with the arm64 ASID allocator at some point in the future, it should actually become easier to do that after this patch. Note that to avoid mapping the kvm_vmid_bits variable into hyp, we simply forego the masking of the vmid value in kvm_get_vttbr and rely on update_vmid to always assign a valid vmid value (within the supported range). Reviewed-by: Marc Zyngier [maz: minor cleanups] Reviewed-by: Julien Thierry Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm/include/asm/kvm_host.h | 13 ++++--- arch/arm/include/asm/kvm_mmu.h | 9 +++-- arch/arm/kvm/hyp/switch.c | 2 +- arch/arm/kvm/hyp/tlb.c | 4 +-- arch/arm64/include/asm/kvm_host.h | 9 +++-- arch/arm64/include/asm/kvm_hyp.h | 3 +- arch/arm64/include/asm/kvm_mmu.h | 10 ++++-- virt/kvm/arm/arm.c | 57 +++++++++++-------------------- virt/kvm/arm/mmu.c | 7 ++++ 9 files changed, 61 insertions(+), 53 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 43e343e00fb8..8073267dc4a0 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -57,10 +57,13 @@ int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); void kvm_reset_coprocs(struct kvm_vcpu *vcpu); -struct kvm_arch { - /* VTTBR value associated with below pgd and vmid */ - u64 vttbr; +struct kvm_vmid { + /* The VMID generation used for the virt. memory system */ + u64 vmid_gen; + u32 vmid; +}; +struct kvm_arch { /* The last vcpu id that ran on each physical CPU */ int __percpu *last_vcpu_ran; @@ -70,11 +73,11 @@ struct kvm_arch { */ /* The VMID generation used for the virt. 
memory system */ - u64 vmid_gen; - u32 vmid; + struct kvm_vmid vmid; /* Stage-2 page table */ pgd_t *pgd; + phys_addr_t pgd_phys; /* Interrupt controller */ struct vgic_dist vgic; diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 3a875fc1b63c..2de96a180166 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -421,9 +421,14 @@ static inline int hyp_map_aux_data(void) static inline void kvm_set_ipa_limit(void) {} -static inline bool kvm_cpu_has_cnp(void) +static __always_inline u64 kvm_get_vttbr(struct kvm *kvm) { - return false; + struct kvm_vmid *vmid = &kvm->arch.vmid; + u64 vmid_field, baddr; + + baddr = kvm->arch.pgd_phys; + vmid_field = (u64)vmid->vmid << VTTBR_VMID_SHIFT; + return kvm_phys_to_vttbr(baddr) | vmid_field; } #endif /* !__ASSEMBLY__ */ diff --git a/arch/arm/kvm/hyp/switch.c b/arch/arm/kvm/hyp/switch.c index acf1c37fa49c..3b058a5d7c5f 100644 --- a/arch/arm/kvm/hyp/switch.c +++ b/arch/arm/kvm/hyp/switch.c @@ -77,7 +77,7 @@ static void __hyp_text __deactivate_traps(struct kvm_vcpu *vcpu) static void __hyp_text __activate_vm(struct kvm_vcpu *vcpu) { struct kvm *kvm = kern_hyp_va(vcpu->kvm); - write_sysreg(kvm->arch.vttbr, VTTBR); + write_sysreg(kvm_get_vttbr(kvm), VTTBR); write_sysreg(vcpu->arch.midr, VPIDR); } diff --git a/arch/arm/kvm/hyp/tlb.c b/arch/arm/kvm/hyp/tlb.c index c0edd450e104..8e4afba73635 100644 --- a/arch/arm/kvm/hyp/tlb.c +++ b/arch/arm/kvm/hyp/tlb.c @@ -41,7 +41,7 @@ void __hyp_text __kvm_tlb_flush_vmid(struct kvm *kvm) /* Switch to requested VMID */ kvm = kern_hyp_va(kvm); - write_sysreg(kvm->arch.vttbr, VTTBR); + write_sysreg(kvm_get_vttbr(kvm), VTTBR); isb(); write_sysreg(0, TLBIALLIS); @@ -61,7 +61,7 @@ void __hyp_text __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu) struct kvm *kvm = kern_hyp_va(kern_hyp_va(vcpu)->kvm); /* Switch to requested VMID */ - write_sysreg(kvm->arch.vttbr, VTTBR); + write_sysreg(kvm_get_vttbr(kvm), VTTBR); isb(); write_sysreg(0, TLBIALL); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index f497bb31031f..444dd1cb1958 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -57,16 +57,19 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu); int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext); void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start); -struct kvm_arch { +struct kvm_vmid { /* The VMID generation used for the virt. 
memory system */ u64 vmid_gen; u32 vmid; +}; + +struct kvm_arch { + struct kvm_vmid vmid; /* stage2 entry level table */ pgd_t *pgd; + phys_addr_t pgd_phys; - /* VTTBR value associated with above pgd and vmid */ - u64 vttbr; /* VTCR_EL2 value for this VM */ u64 vtcr; diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index a80a7ef57325..4da765f2cca5 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #define __hyp_text __section(.hyp.text) notrace @@ -163,7 +164,7 @@ void __noreturn __hyp_do_panic(unsigned long, ...); static __always_inline void __hyp_text __load_guest_stage2(struct kvm *kvm) { write_sysreg(kvm->arch.vtcr, vtcr_el2); - write_sysreg(kvm->arch.vttbr, vttbr_el2); + write_sysreg(kvm_get_vttbr(kvm), vttbr_el2); /* * ARM erratum 1165522 requires the actual execution of the above diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 8af4b1befa42..c423c8c4fc39 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -591,9 +591,15 @@ static inline u64 kvm_vttbr_baddr_mask(struct kvm *kvm) return vttbr_baddr_mask(kvm_phys_shift(kvm), kvm_stage2_levels(kvm)); } -static inline bool kvm_cpu_has_cnp(void) +static __always_inline u64 kvm_get_vttbr(struct kvm *kvm) { - return system_supports_cnp(); + struct kvm_vmid *vmid = &kvm->arch.vmid; + u64 vmid_field, baddr; + u64 cnp = system_supports_cnp() ? VTTBR_CNP_BIT : 0; + + baddr = kvm->arch.pgd_phys; + vmid_field = (u64)vmid->vmid << VTTBR_VMID_SHIFT; + return kvm_phys_to_vttbr(baddr) | vmid_field | cnp; } #endif /* __ASSEMBLY__ */ diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 3dd240ea9e76..b77db673bb03 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -65,7 +65,6 @@ static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_arm_running_vcpu); /* The VMID used in the VTTBR */ static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1); static u32 kvm_next_vmid; -static unsigned int kvm_vmid_bits __read_mostly; static DEFINE_SPINLOCK(kvm_vmid_lock); static bool vgic_present; @@ -142,7 +141,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_vgic_early_init(kvm); /* Mark the initial VMID generation invalid */ - kvm->arch.vmid_gen = 0; + kvm->arch.vmid.vmid_gen = 0; /* The maximum number of VCPUs is limited by the host's GIC model */ kvm->arch.max_vcpus = vgic_present ? @@ -472,37 +471,31 @@ void force_vm_exit(const cpumask_t *mask) /** * need_new_vmid_gen - check that the VMID is still valid - * @kvm: The VM's VMID to check + * @vmid: The VMID to check * * return true if there is a new generation of VMIDs being used * - * The hardware supports only 256 values with the value zero reserved for the - * host, so we check if an assigned value belongs to a previous generation, - * which which requires us to assign a new value. If we're the first to use a - * VMID for the new generation, we must flush necessary caches and TLBs on all - * CPUs. + * The hardware supports a limited set of values with the value zero reserved + * for the host, so we check if an assigned value belongs to a previous + * generation, which which requires us to assign a new value. If we're the + * first to use a VMID for the new generation, we must flush necessary caches + * and TLBs on all CPUs. 
*/ -static bool need_new_vmid_gen(struct kvm *kvm) +static bool need_new_vmid_gen(struct kvm_vmid *vmid) { u64 current_vmid_gen = atomic64_read(&kvm_vmid_gen); smp_rmb(); /* Orders read of kvm_vmid_gen and kvm->arch.vmid */ - return unlikely(READ_ONCE(kvm->arch.vmid_gen) != current_vmid_gen); + return unlikely(READ_ONCE(vmid->vmid_gen) != current_vmid_gen); } /** - * update_vttbr - Update the VTTBR with a valid VMID before the guest runs - * @kvm The guest that we are about to run - * - * Called from kvm_arch_vcpu_ioctl_run before entering the guest to ensure the - * VM has a valid VMID, otherwise assigns a new one and flushes corresponding - * caches and TLBs. + * update_vmid - Update the vmid with a valid VMID for the current generation + * @kvm: The guest that struct vmid belongs to + * @vmid: The stage-2 VMID information struct */ -static void update_vttbr(struct kvm *kvm) +static void update_vmid(struct kvm_vmid *vmid) { - phys_addr_t pgd_phys; - u64 vmid, cnp = kvm_cpu_has_cnp() ? VTTBR_CNP_BIT : 0; - - if (!need_new_vmid_gen(kvm)) + if (!need_new_vmid_gen(vmid)) return; spin_lock(&kvm_vmid_lock); @@ -512,7 +505,7 @@ static void update_vttbr(struct kvm *kvm) * already allocated a valid vmid for this vm, then this vcpu should * use the same vmid. */ - if (!need_new_vmid_gen(kvm)) { + if (!need_new_vmid_gen(vmid)) { spin_unlock(&kvm_vmid_lock); return; } @@ -536,18 +529,12 @@ static void update_vttbr(struct kvm *kvm) kvm_call_hyp(__kvm_flush_vm_context); } - kvm->arch.vmid = kvm_next_vmid; + vmid->vmid = kvm_next_vmid; kvm_next_vmid++; - kvm_next_vmid &= (1 << kvm_vmid_bits) - 1; - - /* update vttbr to be used with the new vmid */ - pgd_phys = virt_to_phys(kvm->arch.pgd); - BUG_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm)); - vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK(kvm_vmid_bits); - kvm->arch.vttbr = kvm_phys_to_vttbr(pgd_phys) | vmid | cnp; + kvm_next_vmid &= (1 << kvm_get_vmid_bits()) - 1; smp_wmb(); - WRITE_ONCE(kvm->arch.vmid_gen, atomic64_read(&kvm_vmid_gen)); + WRITE_ONCE(vmid->vmid_gen, atomic64_read(&kvm_vmid_gen)); spin_unlock(&kvm_vmid_lock); } @@ -690,7 +677,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) */ cond_resched(); - update_vttbr(vcpu->kvm); + update_vmid(&vcpu->kvm->arch.vmid); check_vcpu_requests(vcpu); @@ -739,7 +726,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) */ smp_store_mb(vcpu->mode, IN_GUEST_MODE); - if (ret <= 0 || need_new_vmid_gen(vcpu->kvm) || + if (ret <= 0 || need_new_vmid_gen(&vcpu->kvm->arch.vmid) || kvm_request_pending(vcpu)) { vcpu->mode = OUTSIDE_GUEST_MODE; isb(); /* Ensure work in x_flush_hwstate is committed */ @@ -1417,10 +1404,6 @@ static inline void hyp_cpu_pm_exit(void) static int init_common_resources(void) { - /* set size of VMID supported by CPU */ - kvm_vmid_bits = kvm_get_vmid_bits(); - kvm_info("%d-bit VMID\n", kvm_vmid_bits); - kvm_set_ipa_limit(); return 0; diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index fbdf3ac2f001..f8dda452ea24 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -908,6 +908,7 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, */ int kvm_alloc_stage2_pgd(struct kvm *kvm) { + phys_addr_t pgd_phys; pgd_t *pgd; if (kvm->arch.pgd != NULL) { @@ -920,7 +921,12 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm) if (!pgd) return -ENOMEM; + pgd_phys = virt_to_phys(pgd); + if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm))) + return -EINVAL; + kvm->arch.pgd = pgd; + kvm->arch.pgd_phys = pgd_phys; return 0; } 
@@ -1008,6 +1014,7 @@ void kvm_free_stage2_pgd(struct kvm *kvm) unmap_stage2_range(kvm, 0, kvm_phys_size(kvm)); pgd = READ_ONCE(kvm->arch.pgd); kvm->arch.pgd = NULL; + kvm->arch.pgd_phys = 0; } spin_unlock(&kvm->mmu_lock); From accb99bcd0ca6d3ee412557b0c3f583a3abc0eb6 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 26 Nov 2018 18:21:22 +0100 Subject: [PATCH 143/855] KVM: arm/arm64: Simplify bg_timer programming Instead of calling into kvm_timer_[un]schedule from the main kvm blocking path, test if the VCPU is on the wait queue from the load/put path and perform the background timer setup/cancel in this path. This has the distinct advantage that we no longer race between load/put and schedule/unschedule and programming and canceling of the bg_timer always happens when the timer state is not loaded. Note that we must now remove the checks in kvm_timer_blocking that do not schedule a background timer if one of the timers can fire, because we no longer have a guarantee that kvm_vcpu_check_block() will be called before kvm_timer_blocking. Reported-by: Andre Przywara Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- include/kvm/arm_arch_timer.h | 3 --- virt/kvm/arm/arch_timer.c | 35 ++++++++++++++--------------------- virt/kvm/arm/arm.c | 2 -- 3 files changed, 14 insertions(+), 26 deletions(-) diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index 33771352dcd6..d6e6a45d1d24 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -76,9 +76,6 @@ int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); bool kvm_timer_is_pending(struct kvm_vcpu *vcpu); -void kvm_timer_schedule(struct kvm_vcpu *vcpu); -void kvm_timer_unschedule(struct kvm_vcpu *vcpu); - u64 kvm_phys_timer_read(void); void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index b07ac4614e1c..4986028d9829 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -349,22 +349,12 @@ out: * thread is removed from its waitqueue and made runnable when there's a timer * interrupt to handle. */ -void kvm_timer_schedule(struct kvm_vcpu *vcpu) +static void kvm_timer_blocking(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - vtimer_save_state(vcpu); - - /* - * No need to schedule a background timer if any guest timer has - * already expired, because kvm_vcpu_block will return before putting - * the thread to sleep. - */ - if (kvm_timer_should_fire(vtimer) || kvm_timer_should_fire(ptimer)) - return; - /* * If both timers are not capable of raising interrupts (disabled or * masked), then there's no more work for us to do. @@ -373,12 +363,19 @@ void kvm_timer_schedule(struct kvm_vcpu *vcpu) return; /* - * The guest timers have not yet expired, schedule a background timer. + * At least one guest time will expire. Schedule a background timer. * Set the earliest expiration time among the guest timers. 
*/ soft_timer_start(&timer->bg_timer, kvm_timer_earliest_exp(vcpu)); } +static void kvm_timer_unblocking(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + + soft_timer_cancel(&timer->bg_timer); +} + static void vtimer_restore_state(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; @@ -401,15 +398,6 @@ out: local_irq_restore(flags); } -void kvm_timer_unschedule(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - - vtimer_restore_state(vcpu); - - soft_timer_cancel(&timer->bg_timer); -} - static void set_cntvoff(u64 cntvoff) { u32 low = lower_32_bits(cntvoff); @@ -485,6 +473,8 @@ void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) /* Set the background timer for the physical timer emulation. */ phys_timer_emulate(vcpu); + kvm_timer_unblocking(vcpu); + /* If the timer fired while we weren't running, inject it now */ if (kvm_timer_should_fire(ptimer) != ptimer->irq.level) kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer); @@ -527,6 +517,9 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) */ soft_timer_cancel(&timer->phys_timer); + if (swait_active(kvm_arch_vcpu_wq(vcpu))) + kvm_timer_blocking(vcpu); + /* * The kernel may decide to run userspace after calling vcpu_put, so * we reset cntvoff to 0 to ensure a consistent read between user diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index b77db673bb03..9fbdb9e1c51f 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -335,13 +335,11 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) { - kvm_timer_schedule(vcpu); kvm_vgic_v4_enable_doorbell(vcpu); } void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) { - kvm_timer_unschedule(vcpu); kvm_vgic_v4_disable_doorbell(vcpu); } From b98c079ba480c606b13f6abf844187af09baeaab Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 4 Jan 2019 11:33:42 +0100 Subject: [PATCH 144/855] KVM: arm64: Fix ICH_ELRSR_EL2 sysreg naming We previously incorrectly named the define for this system register. 
Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/arch_gicv3.h | 4 ++-- arch/arm64/include/asm/sysreg.h | 2 +- virt/kvm/arm/hyp/vgic-v3-sr.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm/include/asm/arch_gicv3.h b/arch/arm/include/asm/arch_gicv3.h index 0bd530702118..bdc87700def2 100644 --- a/arch/arm/include/asm/arch_gicv3.h +++ b/arch/arm/include/asm/arch_gicv3.h @@ -54,7 +54,7 @@ #define ICH_VTR __ACCESS_CP15(c12, 4, c11, 1) #define ICH_MISR __ACCESS_CP15(c12, 4, c11, 2) #define ICH_EISR __ACCESS_CP15(c12, 4, c11, 3) -#define ICH_ELSR __ACCESS_CP15(c12, 4, c11, 5) +#define ICH_ELRSR __ACCESS_CP15(c12, 4, c11, 5) #define ICH_VMCR __ACCESS_CP15(c12, 4, c11, 7) #define __LR0(x) __ACCESS_CP15(c12, 4, c12, x) @@ -151,7 +151,7 @@ CPUIF_MAP(ICH_HCR, ICH_HCR_EL2) CPUIF_MAP(ICH_VTR, ICH_VTR_EL2) CPUIF_MAP(ICH_MISR, ICH_MISR_EL2) CPUIF_MAP(ICH_EISR, ICH_EISR_EL2) -CPUIF_MAP(ICH_ELSR, ICH_ELSR_EL2) +CPUIF_MAP(ICH_ELRSR, ICH_ELRSR_EL2) CPUIF_MAP(ICH_VMCR, ICH_VMCR_EL2) CPUIF_MAP(ICH_AP0R3, ICH_AP0R3_EL2) CPUIF_MAP(ICH_AP0R2, ICH_AP0R2_EL2) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 72dc4c011014..3e5650903d6d 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -426,7 +426,7 @@ #define SYS_ICH_VTR_EL2 sys_reg(3, 4, 12, 11, 1) #define SYS_ICH_MISR_EL2 sys_reg(3, 4, 12, 11, 2) #define SYS_ICH_EISR_EL2 sys_reg(3, 4, 12, 11, 3) -#define SYS_ICH_ELSR_EL2 sys_reg(3, 4, 12, 11, 5) +#define SYS_ICH_ELRSR_EL2 sys_reg(3, 4, 12, 11, 5) #define SYS_ICH_VMCR_EL2 sys_reg(3, 4, 12, 11, 7) #define __SYS__LR0_EL2(x) sys_reg(3, 4, 12, 12, x) diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c index 9652c453480f..264d92da3240 100644 --- a/virt/kvm/arm/hyp/vgic-v3-sr.c +++ b/virt/kvm/arm/hyp/vgic-v3-sr.c @@ -226,7 +226,7 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) int i; u32 elrsr; - elrsr = read_gicreg(ICH_ELSR_EL2); + elrsr = read_gicreg(ICH_ELRSR_EL2); write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EN, ICH_HCR_EL2); From 09838de943d4c0ee75a99cd7665940705ab8dcea Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 9 Jan 2019 19:18:40 +0000 Subject: [PATCH 145/855] KVM: arm64: Reuse sys_reg() macro when searching the trap table Instead of having an open-coded macro, reuse the sys_reg() macro that does the exact same thing (the encoding is slightly different, but the ordering property is the same). 
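The only requirement on the packed value is that it gives the same total order the table is sorted by. Below is a small standalone model of searching a register table by one packed encoding; the field widths and register values are illustrative rather than the architectural sys_reg() encoding:

#include <stdio.h>
#include <stdlib.h>

/* illustrative packing; any scheme works as long as it orders the table */
#define ENCODE(op0, op1, crn, crm, op2) \
        (((op0) << 14) | ((op1) << 11) | ((crn) << 7) | ((crm) << 3) | (op2))

struct reg_desc {
        unsigned long key;      /* packed encoding; the table is sorted by it */
        const char *name;
};

static int cmp_reg(const void *key, const void *elt)
{
        unsigned long k = *(const unsigned long *)key;
        const struct reg_desc *r = elt;

        return (k > r->key) - (k < r->key);
}

int main(void)
{
        /* must stay sorted by .key for bsearch() to work */
        const struct reg_desc table[] = {
                { ENCODE(3, 3, 14, 2, 0), "CNTP_TVAL_EL0" },
                { ENCODE(3, 3, 14, 2, 1), "CNTP_CTL_EL0"  },
                { ENCODE(3, 3, 14, 2, 2), "CNTP_CVAL_EL0" },
        };
        unsigned long key = ENCODE(3, 3, 14, 2, 1);
        const struct reg_desc *hit;

        hit = bsearch(&key, table, sizeof(table) / sizeof(table[0]),
                      sizeof(table[0]), cmp_reg);
        printf("%s\n", hit ? hit->name : "unknown");
        return 0;
}

Reusing one encoding helper for both the table keys and the lookup key removes the risk of the two drifting apart, which is the point of the change.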
Signed-off-by: Marc Zyngier Acked-by: Christoffer Dall Signed-off-by: Christoffer Dall --- arch/arm64/kvm/sys_regs.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index e3e37228ae4e..1a5bea4285e4 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -965,6 +965,10 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, return true; } +#define reg_to_encoding(x) \ + sys_reg((u32)(x)->Op0, (u32)(x)->Op1, \ + (u32)(x)->CRn, (u32)(x)->CRm, (u32)(x)->Op2); + /* Silly macro to expand the DBG{BCR,BVR,WVR,WCR}n_EL1 registers in one go */ #define DBG_BCR_BVR_WCR_WVR_EL1(n) \ { SYS_DESC(SYS_DBGBVRn_EL1(n)), \ @@ -1820,30 +1824,19 @@ static const struct sys_reg_desc *get_target_table(unsigned target, } } -#define reg_to_match_value(x) \ - ({ \ - unsigned long val; \ - val = (x)->Op0 << 14; \ - val |= (x)->Op1 << 11; \ - val |= (x)->CRn << 7; \ - val |= (x)->CRm << 3; \ - val |= (x)->Op2; \ - val; \ - }) - static int match_sys_reg(const void *key, const void *elt) { const unsigned long pval = (unsigned long)key; const struct sys_reg_desc *r = elt; - return pval - reg_to_match_value(r); + return pval - reg_to_encoding(r); } static const struct sys_reg_desc *find_reg(const struct sys_reg_params *params, const struct sys_reg_desc table[], unsigned int num) { - unsigned long pval = reg_to_match_value(params); + unsigned long pval = reg_to_encoding(params); return bsearch((void *)pval, table, num, sizeof(table[0]), match_sys_reg); } From 84135d3d18da2ff17d3ad1a609b2818cc3049552 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Thu, 5 Jul 2018 16:48:23 +0100 Subject: [PATCH 146/855] KVM: arm/arm64: consolidate arch timer trap handlers At the moment we have separate system register emulation handlers for each timer register. Actually they are quite similar, and we rely on kvm_arm_timer_[gs]et_reg() for the actual emulation anyways, so let's just merge all of those handlers into one function, which just marshalls the arguments and then hands off to a set of common accessors. This makes extending the emulation to include EL2 timers much easier. 
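The shape of the consolidated handler can be modelled in standalone C: one entry point maps the trapped encoding to a (timer, register) pair and then defers to shared read/write accessors. Everything below is an invented miniature for illustration, not the kernel's emulation:

#include <stdio.h>

enum timer_id  { TIMER_PTIMER, TIMER_VTIMER, NR_TIMERS };
enum timer_reg { REG_TVAL, REG_CTL, REG_CVAL, NR_REGS };

/* invented encodings standing in for the trapped system register numbers */
enum { ENC_CNTP_TVAL, ENC_CNTP_CTL, ENC_CNTP_CVAL };

static unsigned long regs[NR_TIMERS][NR_REGS];  /* toy backing store */

static unsigned long timer_read(enum timer_id t, enum timer_reg r)
{
        return regs[t][r];
}

static void timer_write(enum timer_id t, enum timer_reg r, unsigned long v)
{
        regs[t][r] = v;
}

/* one handler replaces the per-register ones: marshal, then share the code */
static unsigned long access_timer(int encoding, int is_write, unsigned long val)
{
        enum timer_id t;
        enum timer_reg r;

        switch (encoding) {
        case ENC_CNTP_TVAL: t = TIMER_PTIMER; r = REG_TVAL; break;
        case ENC_CNTP_CTL:  t = TIMER_PTIMER; r = REG_CTL;  break;
        case ENC_CNTP_CVAL: t = TIMER_PTIMER; r = REG_CVAL; break;
        default:            return 0;
        }

        if (is_write)
                timer_write(t, r, val);
        return timer_read(t, r);
}

int main(void)
{
        access_timer(ENC_CNTP_CTL, 1, 1);
        printf("CNTP_CTL = %lu\n", access_timer(ENC_CNTP_CTL, 0, 0));
        return 0;
}

Supporting more timers (such as the EL2 ones mentioned above) then means adding switch cases rather than new handlers.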
Signed-off-by: Andre Przywara [Fixed 32-bit VM breakage and reduced to reworking existing code] Signed-off-by: Christoffer Dall [Fixed 32bit host, general cleanup] Signed-off-by: Marc Zyngier --- arch/arm/kvm/coproc.c | 23 +++--- arch/arm64/include/asm/sysreg.h | 4 + arch/arm64/kvm/sys_regs.c | 73 ++++++++---------- include/kvm/arm_arch_timer.h | 23 ++++++ virt/kvm/arm/arch_timer.c | 130 +++++++++++++++++++++++++++----- 5 files changed, 187 insertions(+), 66 deletions(-) diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c index 222c1635bc7a..51863364f8d1 100644 --- a/arch/arm/kvm/coproc.c +++ b/arch/arm/kvm/coproc.c @@ -293,15 +293,16 @@ static bool access_cntp_tval(struct kvm_vcpu *vcpu, const struct coproc_params *p, const struct coproc_reg *r) { - u64 now = kvm_phys_timer_read(); - u64 val; + u32 val; if (p->is_write) { val = *vcpu_reg(vcpu, p->Rt1); - kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL, val + now); + kvm_arm_timer_write_sysreg(vcpu, + TIMER_PTIMER, TIMER_REG_TVAL, val); } else { - val = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL); - *vcpu_reg(vcpu, p->Rt1) = val - now; + val = kvm_arm_timer_read_sysreg(vcpu, + TIMER_PTIMER, TIMER_REG_TVAL); + *vcpu_reg(vcpu, p->Rt1) = val; } return true; @@ -315,9 +316,11 @@ static bool access_cntp_ctl(struct kvm_vcpu *vcpu, if (p->is_write) { val = *vcpu_reg(vcpu, p->Rt1); - kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CTL, val); + kvm_arm_timer_write_sysreg(vcpu, + TIMER_PTIMER, TIMER_REG_CTL, val); } else { - val = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CTL); + val = kvm_arm_timer_read_sysreg(vcpu, + TIMER_PTIMER, TIMER_REG_CTL); *vcpu_reg(vcpu, p->Rt1) = val; } @@ -333,9 +336,11 @@ static bool access_cntp_cval(struct kvm_vcpu *vcpu, if (p->is_write) { val = (u64)*vcpu_reg(vcpu, p->Rt2) << 32; val |= *vcpu_reg(vcpu, p->Rt1); - kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL, val); + kvm_arm_timer_write_sysreg(vcpu, + TIMER_PTIMER, TIMER_REG_CVAL, val); } else { - val = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL); + val = kvm_arm_timer_read_sysreg(vcpu, + TIMER_PTIMER, TIMER_REG_CVAL); *vcpu_reg(vcpu, p->Rt1) = val; *vcpu_reg(vcpu, p->Rt2) = val >> 32; } diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 3e5650903d6d..6482e8bcf1b8 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -392,6 +392,10 @@ #define SYS_CNTP_CTL_EL0 sys_reg(3, 3, 14, 2, 1) #define SYS_CNTP_CVAL_EL0 sys_reg(3, 3, 14, 2, 2) +#define SYS_AARCH32_CNTP_TVAL sys_reg(0, 0, 14, 2, 0) +#define SYS_AARCH32_CNTP_CTL sys_reg(0, 0, 14, 2, 1) +#define SYS_AARCH32_CNTP_CVAL sys_reg(0, 2, 0, 14, 0) + #define __PMEV_op2(n) ((n) & 0x7) #define __CNTR_CRm(n) (0x8 | (((n) >> 3) & 0x3)) #define SYS_PMEVCNTRn_EL0(n) sys_reg(3, 3, 14, __CNTR_CRm(n), __PMEV_op2(n)) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 1a5bea4285e4..0d348500b24b 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -990,44 +990,38 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, { SYS_DESC(SYS_PMEVTYPERn_EL0(n)), \ access_pmu_evtyper, reset_unknown, (PMEVTYPER0_EL0 + n), } -static bool access_cntp_tval(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *r) +static bool access_arch_timer(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) { - u64 now = kvm_phys_timer_read(); - u64 cval; + enum kvm_arch_timers tmr; + enum kvm_arch_timer_regs treg; + u64 reg = 
reg_to_encoding(r); - if (p->is_write) { - kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL, - p->regval + now); - } else { - cval = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL); - p->regval = cval - now; + switch (reg) { + case SYS_CNTP_TVAL_EL0: + case SYS_AARCH32_CNTP_TVAL: + tmr = TIMER_PTIMER; + treg = TIMER_REG_TVAL; + break; + case SYS_CNTP_CTL_EL0: + case SYS_AARCH32_CNTP_CTL: + tmr = TIMER_PTIMER; + treg = TIMER_REG_CTL; + break; + case SYS_CNTP_CVAL_EL0: + case SYS_AARCH32_CNTP_CVAL: + tmr = TIMER_PTIMER; + treg = TIMER_REG_CVAL; + break; + default: + BUG(); } - return true; -} - -static bool access_cntp_ctl(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *r) -{ if (p->is_write) - kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CTL, p->regval); + kvm_arm_timer_write_sysreg(vcpu, tmr, treg, p->regval); else - p->regval = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CTL); - - return true; -} - -static bool access_cntp_cval(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *r) -{ - if (p->is_write) - kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL, p->regval); - else - p->regval = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL); + p->regval = kvm_arm_timer_read_sysreg(vcpu, tmr, treg); return true; } @@ -1392,9 +1386,9 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_TPIDR_EL0), NULL, reset_unknown, TPIDR_EL0 }, { SYS_DESC(SYS_TPIDRRO_EL0), NULL, reset_unknown, TPIDRRO_EL0 }, - { SYS_DESC(SYS_CNTP_TVAL_EL0), access_cntp_tval }, - { SYS_DESC(SYS_CNTP_CTL_EL0), access_cntp_ctl }, - { SYS_DESC(SYS_CNTP_CVAL_EL0), access_cntp_cval }, + { SYS_DESC(SYS_CNTP_TVAL_EL0), access_arch_timer }, + { SYS_DESC(SYS_CNTP_CTL_EL0), access_arch_timer }, + { SYS_DESC(SYS_CNTP_CVAL_EL0), access_arch_timer }, /* PMEVCNTRn_EL0 */ PMU_PMEVCNTR_EL0(0), @@ -1715,10 +1709,9 @@ static const struct sys_reg_desc cp15_regs[] = { { Op1( 0), CRn(13), CRm( 0), Op2( 1), access_vm_reg, NULL, c13_CID }, - /* CNTP_TVAL */ - { Op1( 0), CRn(14), CRm( 2), Op2( 0), access_cntp_tval }, - /* CNTP_CTL */ - { Op1( 0), CRn(14), CRm( 2), Op2( 1), access_cntp_ctl }, + /* Arch Tmers */ + { SYS_DESC(SYS_AARCH32_CNTP_TVAL), access_arch_timer }, + { SYS_DESC(SYS_AARCH32_CNTP_CTL), access_arch_timer }, /* PMEVCNTRn */ PMU_PMEVCNTR(0), @@ -1795,7 +1788,7 @@ static const struct sys_reg_desc cp15_64_regs[] = { { Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, c2_TTBR1 }, { Op1( 1), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_ASGI1R */ { Op1( 2), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI0R */ - { Op1( 2), CRn( 0), CRm(14), Op2( 0), access_cntp_cval }, + { SYS_DESC(SYS_AARCH32_CNTP_CVAL), access_arch_timer }, }; /* Target specific emulation tables */ diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index d6e6a45d1d24..d26b7fde9935 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -22,6 +22,19 @@ #include #include +enum kvm_arch_timers { + TIMER_PTIMER, + TIMER_VTIMER, + NR_KVM_TIMERS +}; + +enum kvm_arch_timer_regs { + TIMER_REG_CNT, + TIMER_REG_CVAL, + TIMER_REG_TVAL, + TIMER_REG_CTL, +}; + struct arch_timer_context { /* Registers: control register, timer value */ u32 cnt_ctl; @@ -87,5 +100,15 @@ bool kvm_arch_timer_get_input_level(int vintid); #define vcpu_vtimer(v) (&(v)->arch.timer_cpu.vtimer) #define vcpu_ptimer(v) (&(v)->arch.timer_cpu.ptimer) +#define vcpu_get_timer(v,t) \ + (t == TIMER_VTIMER ? 
vcpu_vtimer(v) : vcpu_ptimer(v)) + +u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu, + enum kvm_arch_timers tmr, + enum kvm_arch_timer_regs treg); +void kvm_arm_timer_write_sysreg(struct kvm_vcpu *vcpu, + enum kvm_arch_timers tmr, + enum kvm_arch_timer_regs treg, + u64 val); #endif diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 4986028d9829..f7d377448438 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -25,6 +25,7 @@ #include #include +#include #include #include @@ -52,6 +53,13 @@ static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx); static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, struct arch_timer_context *timer_ctx); static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx); +static void kvm_arm_timer_write(struct kvm_vcpu *vcpu, + struct arch_timer_context *timer, + enum kvm_arch_timer_regs treg, + u64 val); +static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu, + struct arch_timer_context *timer, + enum kvm_arch_timer_regs treg); u64 kvm_phys_timer_read(void) { @@ -628,24 +636,25 @@ static void kvm_timer_init_interrupt(void *info) int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) { - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - switch (regid) { case KVM_REG_ARM_TIMER_CTL: - vtimer->cnt_ctl = value & ~ARCH_TIMER_CTRL_IT_STAT; + kvm_arm_timer_write(vcpu, + vcpu_vtimer(vcpu), TIMER_REG_CTL, value); break; case KVM_REG_ARM_TIMER_CNT: update_vtimer_cntvoff(vcpu, kvm_phys_timer_read() - value); break; case KVM_REG_ARM_TIMER_CVAL: - vtimer->cnt_cval = value; + kvm_arm_timer_write(vcpu, + vcpu_vtimer(vcpu), TIMER_REG_CVAL, value); break; case KVM_REG_ARM_PTIMER_CTL: - ptimer->cnt_ctl = value & ~ARCH_TIMER_CTRL_IT_STAT; + kvm_arm_timer_write(vcpu, + vcpu_ptimer(vcpu), TIMER_REG_CTL, value); break; case KVM_REG_ARM_PTIMER_CVAL: - ptimer->cnt_cval = value; + kvm_arm_timer_write(vcpu, + vcpu_ptimer(vcpu), TIMER_REG_CVAL, value); break; default: @@ -672,26 +681,113 @@ static u64 read_timer_ctl(struct arch_timer_context *timer) u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid) { - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - switch (regid) { case KVM_REG_ARM_TIMER_CTL: - return read_timer_ctl(vtimer); + return kvm_arm_timer_read(vcpu, + vcpu_vtimer(vcpu), TIMER_REG_CTL); case KVM_REG_ARM_TIMER_CNT: - return kvm_phys_timer_read() - vtimer->cntvoff; + return kvm_arm_timer_read(vcpu, + vcpu_vtimer(vcpu), TIMER_REG_CNT); case KVM_REG_ARM_TIMER_CVAL: - return vtimer->cnt_cval; + return kvm_arm_timer_read(vcpu, + vcpu_vtimer(vcpu), TIMER_REG_CVAL); case KVM_REG_ARM_PTIMER_CTL: - return read_timer_ctl(ptimer); - case KVM_REG_ARM_PTIMER_CVAL: - return ptimer->cnt_cval; + return kvm_arm_timer_read(vcpu, + vcpu_ptimer(vcpu), TIMER_REG_CTL); case KVM_REG_ARM_PTIMER_CNT: - return kvm_phys_timer_read(); + return kvm_arm_timer_read(vcpu, + vcpu_vtimer(vcpu), TIMER_REG_CNT); + case KVM_REG_ARM_PTIMER_CVAL: + return kvm_arm_timer_read(vcpu, + vcpu_ptimer(vcpu), TIMER_REG_CVAL); } return (u64)-1; } +static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu, + struct arch_timer_context *timer, + enum kvm_arch_timer_regs treg) +{ + u64 val; + + switch (treg) { + case TIMER_REG_TVAL: + val = kvm_phys_timer_read() - timer->cntvoff - timer->cnt_cval; + break; + + case TIMER_REG_CTL: + val = read_timer_ctl(timer); + break; + + case TIMER_REG_CVAL: + 
val = timer->cnt_cval; + break; + + case TIMER_REG_CNT: + val = kvm_phys_timer_read() - timer->cntvoff; + break; + + default: + BUG(); + } + + return val; +} + +u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu, + enum kvm_arch_timers tmr, + enum kvm_arch_timer_regs treg) +{ + u64 val; + + preempt_disable(); + kvm_timer_vcpu_put(vcpu); + + val = kvm_arm_timer_read(vcpu, vcpu_get_timer(vcpu, tmr), treg); + + kvm_timer_vcpu_load(vcpu); + preempt_enable(); + + return val; +} + +static void kvm_arm_timer_write(struct kvm_vcpu *vcpu, + struct arch_timer_context *timer, + enum kvm_arch_timer_regs treg, + u64 val) +{ + switch (treg) { + case TIMER_REG_TVAL: + timer->cnt_cval = val - kvm_phys_timer_read() - timer->cntvoff; + break; + + case TIMER_REG_CTL: + timer->cnt_ctl = val & ~ARCH_TIMER_CTRL_IT_STAT; + break; + + case TIMER_REG_CVAL: + timer->cnt_cval = val; + break; + + default: + BUG(); + } +} + +void kvm_arm_timer_write_sysreg(struct kvm_vcpu *vcpu, + enum kvm_arch_timers tmr, + enum kvm_arch_timer_regs treg, + u64 val) +{ + preempt_disable(); + kvm_timer_vcpu_put(vcpu); + + kvm_arm_timer_write(vcpu, vcpu_get_timer(vcpu, tmr), treg, val); + + kvm_timer_vcpu_load(vcpu); + preempt_enable(); +} + static int kvm_timer_starting_cpu(unsigned int cpu) { kvm_timer_init_interrupt(NULL); From e604dd5d45c75c2112424dec74853efb708f4fa6 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 18 Sep 2018 10:08:18 -0700 Subject: [PATCH 147/855] KVM: arm/arm64: timer: Rework data structures for multiple timers Prepare for having 4 timer data structures (2 for now). Move loaded to the cpu data structure and not the individual timer structure, in preparation for assigning the EL1 phys timer as well. Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- include/kvm/arm_arch_timer.h | 43 +++++++++++++------------- virt/kvm/arm/arch_timer.c | 58 +++++++++++++++++++----------------- 2 files changed, 52 insertions(+), 49 deletions(-) diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index d26b7fde9935..ab835112204d 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -36,6 +36,8 @@ enum kvm_arch_timer_regs { }; struct arch_timer_context { + struct kvm_vcpu *vcpu; + /* Registers: control register, timer value */ u32 cnt_ctl; u64 cnt_cval; @@ -43,6 +45,22 @@ struct arch_timer_context { /* Timer IRQ */ struct kvm_irq_level irq; + /* Virtual offset */ + u64 cntvoff; + + /* Emulated Timer (may be unused) */ + struct hrtimer hrtimer; +}; + +struct arch_timer_cpu { + struct arch_timer_context timers[NR_KVM_TIMERS]; + + /* Background timer used when the guest is not running */ + struct hrtimer bg_timer; + + /* Is the timer enabled */ + bool enabled; + /* * We have multiple paths which can save/restore the timer state * onto the hardware, so we need some way of keeping track of @@ -52,23 +70,6 @@ struct arch_timer_context { * loaded == false: State is stored in memory. 
*/ bool loaded; - - /* Virtual offset */ - u64 cntvoff; -}; - -struct arch_timer_cpu { - struct arch_timer_context vtimer; - struct arch_timer_context ptimer; - - /* Background timer used when the guest is not running */ - struct hrtimer bg_timer; - - /* Physical timer emulation */ - struct hrtimer phys_timer; - - /* Is the timer enabled */ - bool enabled; }; int kvm_timer_hyp_init(bool); @@ -98,10 +99,10 @@ void kvm_timer_init_vhe(void); bool kvm_arch_timer_get_input_level(int vintid); -#define vcpu_vtimer(v) (&(v)->arch.timer_cpu.vtimer) -#define vcpu_ptimer(v) (&(v)->arch.timer_cpu.ptimer) -#define vcpu_get_timer(v,t) \ - (t == TIMER_VTIMER ? vcpu_vtimer(v) : vcpu_ptimer(v)) +#define vcpu_timer(v) (&(v)->arch.timer_cpu) +#define vcpu_get_timer(v,t) (&vcpu_timer(v)->timers[(t)]) +#define vcpu_vtimer(v) (&(v)->arch.timer_cpu.timers[TIMER_VTIMER]) +#define vcpu_ptimer(v) (&(v)->arch.timer_cpu.timers[TIMER_PTIMER]) u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu, enum kvm_arch_timers tmr, diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index f7d377448438..471f9fd004c9 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -184,13 +184,11 @@ static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt) static enum hrtimer_restart kvm_phys_timer_expire(struct hrtimer *hrt) { struct arch_timer_context *ptimer; - struct arch_timer_cpu *timer; struct kvm_vcpu *vcpu; u64 ns; - timer = container_of(hrt, struct arch_timer_cpu, phys_timer); - vcpu = container_of(timer, struct kvm_vcpu, arch.timer_cpu); - ptimer = vcpu_ptimer(vcpu); + ptimer = container_of(hrt, struct arch_timer_context, hrtimer); + vcpu = ptimer->vcpu; /* * Check that the timer has really expired from the guest's @@ -209,9 +207,10 @@ static enum hrtimer_restart kvm_phys_timer_expire(struct hrtimer *hrt) static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) { + struct arch_timer_cpu *timer = vcpu_timer(timer_ctx->vcpu); u64 cval, now; - if (timer_ctx->loaded) { + if (timer->loaded) { u32 cnt_ctl; /* Only the virtual timer can be loaded so far */ @@ -280,7 +279,6 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, /* Schedule the background timer for the emulated timer. */ static void phys_timer_emulate(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); /* @@ -289,11 +287,11 @@ static void phys_timer_emulate(struct kvm_vcpu *vcpu) * then we also don't need a soft timer. 
*/ if (kvm_timer_should_fire(ptimer) || !kvm_timer_irq_can_fire(ptimer)) { - soft_timer_cancel(&timer->phys_timer); + soft_timer_cancel(&ptimer->hrtimer); return; } - soft_timer_start(&timer->phys_timer, kvm_timer_compute_delta(ptimer)); + soft_timer_start(&ptimer->hrtimer, kvm_timer_compute_delta(ptimer)); } /* @@ -303,7 +301,7 @@ static void phys_timer_emulate(struct kvm_vcpu *vcpu) */ static void kvm_timer_update_state(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_cpu *timer = vcpu_timer(vcpu); struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); bool level; @@ -329,13 +327,13 @@ static void kvm_timer_update_state(struct kvm_vcpu *vcpu) static void vtimer_save_state(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_cpu *timer = vcpu_timer(vcpu); struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); unsigned long flags; local_irq_save(flags); - if (!vtimer->loaded) + if (!timer->loaded) goto out; if (timer->enabled) { @@ -347,7 +345,7 @@ static void vtimer_save_state(struct kvm_vcpu *vcpu) write_sysreg_el0(0, cntv_ctl); isb(); - vtimer->loaded = false; + timer->loaded = false; out: local_irq_restore(flags); } @@ -359,7 +357,7 @@ out: */ static void kvm_timer_blocking(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_cpu *timer = vcpu_timer(vcpu); struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); @@ -379,20 +377,20 @@ static void kvm_timer_blocking(struct kvm_vcpu *vcpu) static void kvm_timer_unblocking(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_cpu *timer = vcpu_timer(vcpu); soft_timer_cancel(&timer->bg_timer); } static void vtimer_restore_state(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_cpu *timer = vcpu_timer(vcpu); struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); unsigned long flags; local_irq_save(flags); - if (vtimer->loaded) + if (timer->loaded) goto out; if (timer->enabled) { @@ -401,7 +399,7 @@ static void vtimer_restore_state(struct kvm_vcpu *vcpu) write_sysreg_el0(vtimer->cnt_ctl, cntv_ctl); } - vtimer->loaded = true; + timer->loaded = true; out: local_irq_restore(flags); } @@ -462,7 +460,7 @@ static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu) void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_cpu *timer = vcpu_timer(vcpu); struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); @@ -507,7 +505,8 @@ bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu) void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); if (unlikely(!timer->enabled)) return; @@ -523,7 +522,7 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) * In any case, we re-schedule the hrtimer for the physical timer when * coming back to the VCPU thread in kvm_timer_vcpu_load(). 
*/ - soft_timer_cancel(&timer->phys_timer); + soft_timer_cancel(&ptimer->hrtimer); if (swait_active(kvm_arch_vcpu_wq(vcpu))) kvm_timer_blocking(vcpu); @@ -559,7 +558,7 @@ static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu) void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_cpu *timer = vcpu_timer(vcpu); if (unlikely(!timer->enabled)) return; @@ -570,7 +569,7 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_cpu *timer = vcpu_timer(vcpu); struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); @@ -611,22 +610,25 @@ static void update_vtimer_cntvoff(struct kvm_vcpu *vcpu, u64 cntvoff) void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_cpu *timer = vcpu_timer(vcpu); struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); /* Synchronize cntvoff across all vtimers of a VM. */ update_vtimer_cntvoff(vcpu, kvm_phys_timer_read()); - vcpu_ptimer(vcpu)->cntvoff = 0; + ptimer->cntvoff = 0; hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); timer->bg_timer.function = kvm_bg_timer_expire; - hrtimer_init(&timer->phys_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - timer->phys_timer.function = kvm_phys_timer_expire; + hrtimer_init(&ptimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + ptimer->hrtimer.function = kvm_phys_timer_expire; vtimer->irq.irq = default_vtimer_irq.irq; ptimer->irq.irq = default_ptimer_irq.irq; + + vtimer->vcpu = vcpu; + ptimer->vcpu = vcpu; } static void kvm_timer_init_interrupt(void *info) @@ -860,7 +862,7 @@ out_free_irq: void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_cpu *timer = vcpu_timer(vcpu); soft_timer_cancel(&timer->bg_timer); } @@ -904,7 +906,7 @@ bool kvm_arch_timer_get_input_level(int vintid) int kvm_timer_enable(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_cpu *timer = vcpu_timer(vcpu); struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); int ret; From 9e01dc76be6a3b5768cb02130d2ff0055a68809a Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 19 Feb 2019 14:04:30 +0100 Subject: [PATCH 148/855] KVM: arm/arm64: arch_timer: Assign the phys timer on VHE systems VHE systems don't have to emulate the physical timer, we can simply assign the EL1 physical timer directly to the VM as the host always uses the EL2 timers. In order to minimize the amount of cruft, AArch32 gets definitions for the physical timer too, but is should be generally unused on this architecture. 
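In practice the "direct assignment" is nothing more than a change to the EL2 timer trap controls: the VHE host lives on the EL2 timers, so the EL1 physical timer and counter can be handed to the guest by setting the EL1PCEN/EL1PCTEN enable bits in CNTHCTL_EL2 (i.e. disabling the traps). As a rough sketch, mirroring the kvm_timer_init_vhe() hunk further down in this patch (so nothing here is new API):

	val = read_sysreg(cnthctl_el2);
	val |= (CNTHCTL_EL1PCEN << cnthctl_shift);	/* guest may program the EL1 phys timer */
	val |= (CNTHCTL_EL1PCTEN << cnthctl_shift);	/* guest may read the phys counter */
	write_sysreg(val, cnthctl_el2);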
Co-written with Marc Zyngier Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_hyp.h | 4 + include/kvm/arm_arch_timer.h | 6 + virt/kvm/arm/arch_timer.c | 219 +++++++++++++++++++++++++-------- 3 files changed, 180 insertions(+), 49 deletions(-) diff --git a/arch/arm/include/asm/kvm_hyp.h b/arch/arm/include/asm/kvm_hyp.h index e93a0cac9add..87bcd18df8d5 100644 --- a/arch/arm/include/asm/kvm_hyp.h +++ b/arch/arm/include/asm/kvm_hyp.h @@ -40,6 +40,7 @@ #define TTBR1 __ACCESS_CP15_64(1, c2) #define VTTBR __ACCESS_CP15_64(6, c2) #define PAR __ACCESS_CP15_64(0, c7) +#define CNTP_CVAL __ACCESS_CP15_64(2, c14) #define CNTV_CVAL __ACCESS_CP15_64(3, c14) #define CNTVOFF __ACCESS_CP15_64(4, c14) @@ -85,6 +86,7 @@ #define TID_PRIV __ACCESS_CP15(c13, 0, c0, 4) #define HTPIDR __ACCESS_CP15(c13, 4, c0, 2) #define CNTKCTL __ACCESS_CP15(c14, 0, c1, 0) +#define CNTP_CTL __ACCESS_CP15(c14, 0, c2, 1) #define CNTV_CTL __ACCESS_CP15(c14, 0, c3, 1) #define CNTHCTL __ACCESS_CP15(c14, 4, c1, 0) @@ -94,6 +96,8 @@ #define read_sysreg_el0(r) read_sysreg(r##_el0) #define write_sysreg_el0(v, r) write_sysreg(v, r##_el0) +#define cntp_ctl_el0 CNTP_CTL +#define cntp_cval_el0 CNTP_CVAL #define cntv_ctl_el0 CNTV_CTL #define cntv_cval_el0 CNTV_CVAL #define cntvoff_el2 CNTVOFF diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index ab835112204d..6d4a33a9c45a 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -50,6 +50,10 @@ struct arch_timer_context { /* Emulated Timer (may be unused) */ struct hrtimer hrtimer; + + /* Duplicated state from arch_timer.c for convenience */ + u32 host_timer_irq; + u32 host_timer_irq_flags; }; struct arch_timer_cpu { @@ -104,6 +108,8 @@ bool kvm_arch_timer_get_input_level(int vintid); #define vcpu_vtimer(v) (&(v)->arch.timer_cpu.timers[TIMER_VTIMER]) #define vcpu_ptimer(v) (&(v)->arch.timer_cpu.timers[TIMER_PTIMER]) +#define arch_timer_ctx_index(ctx) ((ctx) - vcpu_timer((ctx)->vcpu)->timers) + u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu, enum kvm_arch_timers tmr, enum kvm_arch_timer_regs treg); diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 471f9fd004c9..10c15151c87e 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -35,7 +35,9 @@ static struct timecounter *timecounter; static unsigned int host_vtimer_irq; +static unsigned int host_ptimer_irq; static u32 host_vtimer_irq_flags; +static u32 host_ptimer_irq_flags; static DEFINE_STATIC_KEY_FALSE(has_gic_active_state); @@ -86,20 +88,24 @@ static void soft_timer_cancel(struct hrtimer *hrt) static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) { struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id; - struct arch_timer_context *vtimer; + struct arch_timer_context *ctx; /* * We may see a timer interrupt after vcpu_put() has been called which * sets the CPU's vcpu pointer to NULL, because even though the timer - * has been disabled in vtimer_save_state(), the hardware interrupt + * has been disabled in timer_save_state(), the hardware interrupt * signal may not have been retired from the interrupt controller yet. 
*/ if (!vcpu) return IRQ_HANDLED; - vtimer = vcpu_vtimer(vcpu); - if (kvm_timer_should_fire(vtimer)) - kvm_timer_update_irq(vcpu, true, vtimer); + if (irq == host_vtimer_irq) + ctx = vcpu_vtimer(vcpu); + else + ctx = vcpu_ptimer(vcpu); + + if (kvm_timer_should_fire(ctx)) + kvm_timer_update_irq(vcpu, true, ctx); if (userspace_irqchip(vcpu->kvm) && !static_branch_unlikely(&has_gic_active_state)) @@ -208,13 +214,25 @@ static enum hrtimer_restart kvm_phys_timer_expire(struct hrtimer *hrt) static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) { struct arch_timer_cpu *timer = vcpu_timer(timer_ctx->vcpu); + enum kvm_arch_timers index = arch_timer_ctx_index(timer_ctx); u64 cval, now; if (timer->loaded) { - u32 cnt_ctl; + u32 cnt_ctl = 0; + + switch (index) { + case TIMER_VTIMER: + cnt_ctl = read_sysreg_el0(cntv_ctl); + break; + case TIMER_PTIMER: + cnt_ctl = read_sysreg_el0(cntp_ctl); + break; + case NR_KVM_TIMERS: + /* GCC is braindead */ + cnt_ctl = 0; + break; + } - /* Only the virtual timer can be loaded so far */ - cnt_ctl = read_sysreg_el0(cntv_ctl); return (cnt_ctl & ARCH_TIMER_CTRL_ENABLE) && (cnt_ctl & ARCH_TIMER_CTRL_IT_STAT) && !(cnt_ctl & ARCH_TIMER_CTRL_IT_MASK); @@ -310,7 +328,7 @@ static void kvm_timer_update_state(struct kvm_vcpu *vcpu) return; /* - * The vtimer virtual interrupt is a 'mapped' interrupt, meaning part + * If the timer virtual interrupt is a 'mapped' interrupt, part * of its lifecycle is offloaded to the hardware, and we therefore may * not have lowered the irq.level value before having to signal a new * interrupt, but have to signal an interrupt every time the level is @@ -319,31 +337,55 @@ static void kvm_timer_update_state(struct kvm_vcpu *vcpu) level = kvm_timer_should_fire(vtimer); kvm_timer_update_irq(vcpu, level, vtimer); + if (has_vhe()) { + level = kvm_timer_should_fire(ptimer); + kvm_timer_update_irq(vcpu, level, ptimer); + + return; + } + phys_timer_emulate(vcpu); if (kvm_timer_should_fire(ptimer) != ptimer->irq.level) kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer); } -static void vtimer_save_state(struct kvm_vcpu *vcpu) +static void timer_save_state(struct arch_timer_context *ctx) { - struct arch_timer_cpu *timer = vcpu_timer(vcpu); - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu); + enum kvm_arch_timers index = arch_timer_ctx_index(ctx); unsigned long flags; + if (!timer->enabled) + return; + local_irq_save(flags); if (!timer->loaded) goto out; - if (timer->enabled) { - vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl); - vtimer->cnt_cval = read_sysreg_el0(cntv_cval); - } + switch (index) { + case TIMER_VTIMER: + ctx->cnt_ctl = read_sysreg_el0(cntv_ctl); + ctx->cnt_cval = read_sysreg_el0(cntv_cval); - /* Disable the virtual timer */ - write_sysreg_el0(0, cntv_ctl); - isb(); + /* Disable the timer */ + write_sysreg_el0(0, cntv_ctl); + isb(); + + break; + case TIMER_PTIMER: + ctx->cnt_ctl = read_sysreg_el0(cntp_ctl); + ctx->cnt_cval = read_sysreg_el0(cntp_cval); + + /* Disable the timer */ + write_sysreg_el0(0, cntp_ctl); + isb(); + + break; + case NR_KVM_TIMERS: + break; /* GCC is braindead */ + } timer->loaded = false; out: @@ -382,21 +424,33 @@ static void kvm_timer_unblocking(struct kvm_vcpu *vcpu) soft_timer_cancel(&timer->bg_timer); } -static void vtimer_restore_state(struct kvm_vcpu *vcpu) +static void timer_restore_state(struct arch_timer_context *ctx) { - struct arch_timer_cpu *timer = vcpu_timer(vcpu); - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + 
struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu); + enum kvm_arch_timers index = arch_timer_ctx_index(ctx); unsigned long flags; + if (!timer->enabled) + return; + local_irq_save(flags); if (timer->loaded) goto out; - if (timer->enabled) { - write_sysreg_el0(vtimer->cnt_cval, cntv_cval); + switch (index) { + case TIMER_VTIMER: + write_sysreg_el0(ctx->cnt_cval, cntv_cval); isb(); - write_sysreg_el0(vtimer->cnt_ctl, cntv_ctl); + write_sysreg_el0(ctx->cnt_ctl, cntv_ctl); + break; + case TIMER_PTIMER: + write_sysreg_el0(ctx->cnt_cval, cntp_cval); + isb(); + write_sysreg_el0(ctx->cnt_ctl, cntp_ctl); + break; + case NR_KVM_TIMERS: + break; /* GCC is braindead */ } timer->loaded = true; @@ -419,23 +473,23 @@ static void set_cntvoff(u64 cntvoff) kvm_call_hyp(__kvm_timer_set_cntvoff, low, high); } -static inline void set_vtimer_irq_phys_active(struct kvm_vcpu *vcpu, bool active) +static inline void set_timer_irq_phys_active(struct arch_timer_context *ctx, bool active) { int r; - r = irq_set_irqchip_state(host_vtimer_irq, IRQCHIP_STATE_ACTIVE, active); + r = irq_set_irqchip_state(ctx->host_timer_irq, IRQCHIP_STATE_ACTIVE, active); WARN_ON(r); } -static void kvm_timer_vcpu_load_gic(struct kvm_vcpu *vcpu) +static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx) { - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + struct kvm_vcpu *vcpu = ctx->vcpu; bool phys_active; if (irqchip_in_kernel(vcpu->kvm)) - phys_active = kvm_vgic_map_is_active(vcpu, vtimer->irq.irq); + phys_active = kvm_vgic_map_is_active(vcpu, ctx->irq.irq); else - phys_active = vtimer->irq.level; - set_vtimer_irq_phys_active(vcpu, phys_active); + phys_active = ctx->irq.level; + set_timer_irq_phys_active(ctx, phys_active); } static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu) @@ -467,14 +521,22 @@ void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) if (unlikely(!timer->enabled)) return; - if (static_branch_likely(&has_gic_active_state)) - kvm_timer_vcpu_load_gic(vcpu); - else + if (static_branch_likely(&has_gic_active_state)) { + kvm_timer_vcpu_load_gic(vtimer); + if (has_vhe()) + kvm_timer_vcpu_load_gic(ptimer); + } else { kvm_timer_vcpu_load_nogic(vcpu); + } set_cntvoff(vtimer->cntvoff); - vtimer_restore_state(vcpu); + timer_restore_state(vtimer); + + if (has_vhe()) { + timer_restore_state(ptimer); + return; + } /* Set the background timer for the physical timer emulation. */ phys_timer_emulate(vcpu); @@ -506,12 +568,17 @@ bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu) void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); if (unlikely(!timer->enabled)) return; - vtimer_save_state(vcpu); + timer_save_state(vtimer); + if (has_vhe()) { + timer_save_state(ptimer); + return; + } /* * Cancel the physical timer emulation, because the only case where we @@ -534,8 +601,7 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) * counter of non-VHE case. For VHE, the virtual counter uses a fixed * virtual offset of zero, so no need to zero CNTVOFF_EL2 register. 
*/ - if (!has_vhe()) - set_cntvoff(0); + set_cntvoff(0); } /* @@ -550,7 +616,7 @@ static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu) if (!kvm_timer_should_fire(vtimer)) { kvm_timer_update_irq(vcpu, false, vtimer); if (static_branch_likely(&has_gic_active_state)) - set_vtimer_irq_phys_active(vcpu, false); + set_timer_irq_phys_active(vtimer, false); else enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); } @@ -625,7 +691,12 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) ptimer->hrtimer.function = kvm_phys_timer_expire; vtimer->irq.irq = default_vtimer_irq.irq; + vtimer->host_timer_irq = host_vtimer_irq; + vtimer->host_timer_irq_flags = host_vtimer_irq_flags; + ptimer->irq.irq = default_ptimer_irq.irq; + ptimer->host_timer_irq = host_ptimer_irq; + ptimer->host_timer_irq_flags = host_ptimer_irq_flags; vtimer->vcpu = vcpu; ptimer->vcpu = vcpu; @@ -634,6 +705,7 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) static void kvm_timer_init_interrupt(void *info) { enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); + enable_percpu_irq(host_ptimer_irq, host_ptimer_irq_flags); } int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) @@ -815,6 +887,8 @@ int kvm_timer_hyp_init(bool has_gic) return -ENODEV; } + /* First, do the virtual EL1 timer irq */ + if (info->virtual_irq <= 0) { kvm_err("kvm_arch_timer: invalid virtual timer IRQ: %d\n", info->virtual_irq); @@ -825,15 +899,15 @@ int kvm_timer_hyp_init(bool has_gic) host_vtimer_irq_flags = irq_get_trigger_type(host_vtimer_irq); if (host_vtimer_irq_flags != IRQF_TRIGGER_HIGH && host_vtimer_irq_flags != IRQF_TRIGGER_LOW) { - kvm_err("Invalid trigger for IRQ%d, assuming level low\n", + kvm_err("Invalid trigger for vtimer IRQ%d, assuming level low\n", host_vtimer_irq); host_vtimer_irq_flags = IRQF_TRIGGER_LOW; } err = request_percpu_irq(host_vtimer_irq, kvm_arch_timer_handler, - "kvm guest timer", kvm_get_running_vcpus()); + "kvm guest vtimer", kvm_get_running_vcpus()); if (err) { - kvm_err("kvm_arch_timer: can't request interrupt %d (%d)\n", + kvm_err("kvm_arch_timer: can't request vtimer interrupt %d (%d)\n", host_vtimer_irq, err); return err; } @@ -851,6 +925,43 @@ int kvm_timer_hyp_init(bool has_gic) kvm_debug("virtual timer IRQ%d\n", host_vtimer_irq); + /* Now let's do the physical EL1 timer irq */ + + if (info->physical_irq > 0) { + host_ptimer_irq = info->physical_irq; + host_ptimer_irq_flags = irq_get_trigger_type(host_ptimer_irq); + if (host_ptimer_irq_flags != IRQF_TRIGGER_HIGH && + host_ptimer_irq_flags != IRQF_TRIGGER_LOW) { + kvm_err("Invalid trigger for ptimer IRQ%d, assuming level low\n", + host_ptimer_irq); + host_ptimer_irq_flags = IRQF_TRIGGER_LOW; + } + + err = request_percpu_irq(host_ptimer_irq, kvm_arch_timer_handler, + "kvm guest ptimer", kvm_get_running_vcpus()); + if (err) { + kvm_err("kvm_arch_timer: can't request ptimer interrupt %d (%d)\n", + host_ptimer_irq, err); + return err; + } + + if (has_gic) { + err = irq_set_vcpu_affinity(host_ptimer_irq, + kvm_get_running_vcpus()); + if (err) { + kvm_err("kvm_arch_timer: error setting vcpu affinity\n"); + goto out_free_irq; + } + } + + kvm_debug("physical timer IRQ%d\n", host_ptimer_irq); + } else if (has_vhe()) { + kvm_err("kvm_arch_timer: invalid physical timer IRQ: %d\n", + info->physical_irq); + err = -ENODEV; + goto out_free_irq; + } + cpuhp_setup_state(CPUHP_AP_KVM_ARM_TIMER_STARTING, "kvm/arm/timer:starting", kvm_timer_starting_cpu, kvm_timer_dying_cpu); @@ -898,8 +1009,10 @@ bool kvm_arch_timer_get_input_level(int vintid) if (vintid == 
vcpu_vtimer(vcpu)->irq.irq) timer = vcpu_vtimer(vcpu); + else if (vintid == vcpu_ptimer(vcpu)->irq.irq) + timer = vcpu_ptimer(vcpu); else - BUG(); /* We only map the vtimer so far */ + BUG(); return kvm_timer_should_fire(timer); } @@ -908,6 +1021,7 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); int ret; if (timer->enabled) @@ -930,14 +1044,21 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu) if (ret) return ret; + if (has_vhe()) { + ret = kvm_vgic_map_phys_irq(vcpu, host_ptimer_irq, ptimer->irq.irq, + kvm_arch_timer_get_input_level); + if (ret) + return ret; + } + no_vgic: timer->enabled = 1; return 0; } /* - * On VHE system, we only need to configure trap on physical timer and counter - * accesses in EL0 and EL1 once, not for every world switch. + * On VHE system, we only need to configure the EL2 timer trap register once, + * not for every world switch. * The host kernel runs at EL2 with HCR_EL2.TGE == 1, * and this makes those bits have no effect for the host kernel execution. */ @@ -948,11 +1069,11 @@ void kvm_timer_init_vhe(void) u64 val; /* - * Disallow physical timer access for the guest. - * Physical counter access is allowed. + * VHE systems allow the guest direct access to the EL1 physical + * timer/counter. */ val = read_sysreg(cnthctl_el2); - val &= ~(CNTHCTL_EL1PCEN << cnthctl_shift); + val |= (CNTHCTL_EL1PCEN << cnthctl_shift); val |= (CNTHCTL_EL1PCTEN << cnthctl_shift); write_sysreg(val, cnthctl_el2); } From bee038a67487598ebbe995f85bf60c3a5b2e9099 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Fri, 4 Jan 2019 13:31:22 +0100 Subject: [PATCH 149/855] KVM: arm/arm64: Rework the timer code to use a timer_map We are currently emulating two timers in two different ways. When we add support for nested virtualization in the future, we are going to be emulating either two timers in two diffferent ways, or four timers in a single way. We need a unified data structure to keep track of how we map virtual state to physical state and we need to cleanup some of the timer code to operate more independently on a struct arch_timer_context instead of trying to consider the global state of the VCPU and recomputing all state. Co-written with Marc Zyngier Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- include/kvm/arm_arch_timer.h | 23 +-- virt/kvm/arm/arch_timer.c | 295 +++++++++++++++++++---------------- virt/kvm/arm/trace.h | 105 +++++++++++++ 3 files changed, 278 insertions(+), 145 deletions(-) diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index 6d4a33a9c45a..05a18dd265b5 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -51,11 +51,24 @@ struct arch_timer_context { /* Emulated Timer (may be unused) */ struct hrtimer hrtimer; + /* + * We have multiple paths which can save/restore the timer state onto + * the hardware, so we need some way of keeping track of where the + * latest state is. 
+ */ + bool loaded; + /* Duplicated state from arch_timer.c for convenience */ u32 host_timer_irq; u32 host_timer_irq_flags; }; +struct timer_map { + struct arch_timer_context *direct_vtimer; + struct arch_timer_context *direct_ptimer; + struct arch_timer_context *emul_ptimer; +}; + struct arch_timer_cpu { struct arch_timer_context timers[NR_KVM_TIMERS]; @@ -64,16 +77,6 @@ struct arch_timer_cpu { /* Is the timer enabled */ bool enabled; - - /* - * We have multiple paths which can save/restore the timer state - * onto the hardware, so we need some way of keeping track of - * where the latest state is. - * - * loaded == true: State is loaded on the hardware registers. - * loaded == false: State is stored in memory. - */ - bool loaded; }; int kvm_timer_hyp_init(bool); diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 10c15151c87e..17f9de73cc8a 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -68,6 +68,21 @@ u64 kvm_phys_timer_read(void) return timecounter->cc->read(timecounter->cc); } +static void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map) +{ + if (has_vhe()) { + map->direct_vtimer = vcpu_vtimer(vcpu); + map->direct_ptimer = vcpu_ptimer(vcpu); + map->emul_ptimer = NULL; + } else { + map->direct_vtimer = vcpu_vtimer(vcpu); + map->direct_ptimer = NULL; + map->emul_ptimer = vcpu_ptimer(vcpu); + } + + trace_kvm_get_timer_map(vcpu->vcpu_id, map); +} + static inline bool userspace_irqchip(struct kvm *kvm) { return static_branch_unlikely(&userspace_irqchip_in_use) && @@ -89,6 +104,7 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) { struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id; struct arch_timer_context *ctx; + struct timer_map map; /* * We may see a timer interrupt after vcpu_put() has been called which @@ -99,10 +115,12 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) if (!vcpu) return IRQ_HANDLED; + get_timer_map(vcpu, &map); + if (irq == host_vtimer_irq) - ctx = vcpu_vtimer(vcpu); + ctx = map.direct_vtimer; else - ctx = vcpu_ptimer(vcpu); + ctx = map.direct_ptimer; if (kvm_timer_should_fire(ctx)) kvm_timer_update_irq(vcpu, true, ctx); @@ -136,7 +154,9 @@ static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx) static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx) { - return !(timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_IT_MASK) && + WARN_ON(timer_ctx && timer_ctx->loaded); + return timer_ctx && + !(timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_IT_MASK) && (timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_ENABLE); } @@ -146,21 +166,22 @@ static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx) */ static u64 kvm_timer_earliest_exp(struct kvm_vcpu *vcpu) { - u64 min_virt = ULLONG_MAX, min_phys = ULLONG_MAX; - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + u64 min_delta = ULLONG_MAX; + int i; - if (kvm_timer_irq_can_fire(vtimer)) - min_virt = kvm_timer_compute_delta(vtimer); + for (i = 0; i < NR_KVM_TIMERS; i++) { + struct arch_timer_context *ctx = &vcpu->arch.timer_cpu.timers[i]; - if (kvm_timer_irq_can_fire(ptimer)) - min_phys = kvm_timer_compute_delta(ptimer); + WARN(ctx->loaded, "timer %d loaded\n", i); + if (kvm_timer_irq_can_fire(ctx)) + min_delta = min(min_delta, kvm_timer_compute_delta(ctx)); + } /* If none of timers can fire, then return 0 */ - if ((min_virt == ULLONG_MAX) && (min_phys == ULLONG_MAX)) + if (min_delta == ULLONG_MAX) return 0; - return min(min_virt, min_phys); + return min_delta; 
} static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt) @@ -187,37 +208,45 @@ static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt) return HRTIMER_NORESTART; } -static enum hrtimer_restart kvm_phys_timer_expire(struct hrtimer *hrt) +static enum hrtimer_restart kvm_hrtimer_expire(struct hrtimer *hrt) { - struct arch_timer_context *ptimer; + struct arch_timer_context *ctx; struct kvm_vcpu *vcpu; u64 ns; - ptimer = container_of(hrt, struct arch_timer_context, hrtimer); - vcpu = ptimer->vcpu; + ctx = container_of(hrt, struct arch_timer_context, hrtimer); + vcpu = ctx->vcpu; + + trace_kvm_timer_hrtimer_expire(ctx); /* * Check that the timer has really expired from the guest's * PoV (NTP on the host may have forced it to expire * early). If not ready, schedule for a later time. */ - ns = kvm_timer_compute_delta(ptimer); + ns = kvm_timer_compute_delta(ctx); if (unlikely(ns)) { hrtimer_forward_now(hrt, ns_to_ktime(ns)); return HRTIMER_RESTART; } - kvm_timer_update_irq(vcpu, true, ptimer); + kvm_timer_update_irq(vcpu, true, ctx); return HRTIMER_NORESTART; } static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) { - struct arch_timer_cpu *timer = vcpu_timer(timer_ctx->vcpu); - enum kvm_arch_timers index = arch_timer_ctx_index(timer_ctx); + struct arch_timer_cpu *timer; + enum kvm_arch_timers index; u64 cval, now; - if (timer->loaded) { + if (!timer_ctx) + return false; + + timer = vcpu_timer(timer_ctx->vcpu); + index = arch_timer_ctx_index(timer_ctx); + + if (timer_ctx->loaded) { u32 cnt_ctl = 0; switch (index) { @@ -249,13 +278,13 @@ static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) bool kvm_timer_is_pending(struct kvm_vcpu *vcpu) { - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + struct timer_map map; - if (kvm_timer_should_fire(vtimer)) - return true; + get_timer_map(vcpu, &map); - return kvm_timer_should_fire(ptimer); + return kvm_timer_should_fire(map.direct_vtimer) || + kvm_timer_should_fire(map.direct_ptimer) || + kvm_timer_should_fire(map.emul_ptimer); } /* @@ -294,60 +323,28 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, } } -/* Schedule the background timer for the emulated timer. */ -static void phys_timer_emulate(struct kvm_vcpu *vcpu) +static void timer_emulate(struct arch_timer_context *ctx) { - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + bool should_fire = kvm_timer_should_fire(ctx); + + trace_kvm_timer_emulate(ctx, should_fire); + + if (should_fire) { + kvm_timer_update_irq(ctx->vcpu, true, ctx); + return; + } /* * If the timer can fire now, we don't need to have a soft timer * scheduled for the future. If the timer cannot fire at all, * then we also don't need a soft timer. */ - if (kvm_timer_should_fire(ptimer) || !kvm_timer_irq_can_fire(ptimer)) { - soft_timer_cancel(&ptimer->hrtimer); + if (!kvm_timer_irq_can_fire(ctx)) { + soft_timer_cancel(&ctx->hrtimer); return; } - soft_timer_start(&ptimer->hrtimer, kvm_timer_compute_delta(ptimer)); -} - -/* - * Check if there was a change in the timer state, so that we should either - * raise or lower the line level to the GIC or schedule a background timer to - * emulate the physical timer. 
- */ -static void kvm_timer_update_state(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = vcpu_timer(vcpu); - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - bool level; - - if (unlikely(!timer->enabled)) - return; - - /* - * If the timer virtual interrupt is a 'mapped' interrupt, part - * of its lifecycle is offloaded to the hardware, and we therefore may - * not have lowered the irq.level value before having to signal a new - * interrupt, but have to signal an interrupt every time the level is - * asserted. - */ - level = kvm_timer_should_fire(vtimer); - kvm_timer_update_irq(vcpu, level, vtimer); - - if (has_vhe()) { - level = kvm_timer_should_fire(ptimer); - kvm_timer_update_irq(vcpu, level, ptimer); - - return; - } - - phys_timer_emulate(vcpu); - - if (kvm_timer_should_fire(ptimer) != ptimer->irq.level) - kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer); + soft_timer_start(&ctx->hrtimer, kvm_timer_compute_delta(ctx)); } static void timer_save_state(struct arch_timer_context *ctx) @@ -361,7 +358,7 @@ static void timer_save_state(struct arch_timer_context *ctx) local_irq_save(flags); - if (!timer->loaded) + if (!ctx->loaded) goto out; switch (index) { @@ -384,10 +381,12 @@ static void timer_save_state(struct arch_timer_context *ctx) break; case NR_KVM_TIMERS: - break; /* GCC is braindead */ + BUG(); } - timer->loaded = false; + trace_kvm_timer_save_state(ctx); + + ctx->loaded = false; out: local_irq_restore(flags); } @@ -400,14 +399,17 @@ out: static void kvm_timer_blocking(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + struct timer_map map; + + get_timer_map(vcpu, &map); /* - * If both timers are not capable of raising interrupts (disabled or + * If no timers are capable of raising interrupts (disabled or * masked), then there's no more work for us to do. 
*/ - if (!kvm_timer_irq_can_fire(vtimer) && !kvm_timer_irq_can_fire(ptimer)) + if (!kvm_timer_irq_can_fire(map.direct_vtimer) && + !kvm_timer_irq_can_fire(map.direct_ptimer) && + !kvm_timer_irq_can_fire(map.emul_ptimer)) return; /* @@ -435,7 +437,7 @@ static void timer_restore_state(struct arch_timer_context *ctx) local_irq_save(flags); - if (timer->loaded) + if (ctx->loaded) goto out; switch (index) { @@ -450,10 +452,12 @@ static void timer_restore_state(struct arch_timer_context *ctx) write_sysreg_el0(ctx->cnt_ctl, cntp_ctl); break; case NR_KVM_TIMERS: - break; /* GCC is braindead */ + BUG(); } - timer->loaded = true; + trace_kvm_timer_restore_state(ctx); + + ctx->loaded = true; out: local_irq_restore(flags); } @@ -515,37 +519,31 @@ static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu) void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + struct timer_map map; if (unlikely(!timer->enabled)) return; + get_timer_map(vcpu, &map); + if (static_branch_likely(&has_gic_active_state)) { - kvm_timer_vcpu_load_gic(vtimer); - if (has_vhe()) - kvm_timer_vcpu_load_gic(ptimer); + kvm_timer_vcpu_load_gic(map.direct_vtimer); + if (map.direct_ptimer) + kvm_timer_vcpu_load_gic(map.direct_ptimer); } else { kvm_timer_vcpu_load_nogic(vcpu); } - set_cntvoff(vtimer->cntvoff); - - timer_restore_state(vtimer); - - if (has_vhe()) { - timer_restore_state(ptimer); - return; - } - - /* Set the background timer for the physical timer emulation. */ - phys_timer_emulate(vcpu); + set_cntvoff(map.direct_vtimer->cntvoff); kvm_timer_unblocking(vcpu); - /* If the timer fired while we weren't running, inject it now */ - if (kvm_timer_should_fire(ptimer) != ptimer->irq.level) - kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer); + timer_restore_state(map.direct_vtimer); + if (map.direct_ptimer) + timer_restore_state(map.direct_ptimer); + + if (map.emul_ptimer) + timer_emulate(map.emul_ptimer); } bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu) @@ -568,20 +566,19 @@ bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu) void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + struct timer_map map; if (unlikely(!timer->enabled)) return; - timer_save_state(vtimer); - if (has_vhe()) { - timer_save_state(ptimer); - return; - } + get_timer_map(vcpu, &map); + + timer_save_state(map.direct_vtimer); + if (map.direct_ptimer) + timer_save_state(map.direct_ptimer); /* - * Cancel the physical timer emulation, because the only case where we + * Cancel soft timer emulation, because the only case where we * need it after a vcpu_put is in the context of a sleeping VCPU, and * in that case we already factor in the deadline for the physical * timer when scheduling the bg_timer. @@ -589,7 +586,8 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) * In any case, we re-schedule the hrtimer for the physical timer when * coming back to the VCPU thread in kvm_timer_vcpu_load(). 
*/ - soft_timer_cancel(&ptimer->hrtimer); + if (map.emul_ptimer) + soft_timer_cancel(&map.emul_ptimer->hrtimer); if (swait_active(kvm_arch_vcpu_wq(vcpu))) kvm_timer_blocking(vcpu); @@ -636,8 +634,9 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + struct timer_map map; + + get_timer_map(vcpu, &map); /* * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8 @@ -645,12 +644,22 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu) * resets the timer to be disabled and unmasked and is compliant with * the ARMv7 architecture. */ - vtimer->cnt_ctl = 0; - ptimer->cnt_ctl = 0; - kvm_timer_update_state(vcpu); + vcpu_vtimer(vcpu)->cnt_ctl = 0; + vcpu_ptimer(vcpu)->cnt_ctl = 0; - if (timer->enabled && irqchip_in_kernel(vcpu->kvm)) - kvm_vgic_reset_mapped_irq(vcpu, vtimer->irq.irq); + if (timer->enabled) { + kvm_timer_update_irq(vcpu, false, vcpu_vtimer(vcpu)); + kvm_timer_update_irq(vcpu, false, vcpu_ptimer(vcpu)); + + if (irqchip_in_kernel(vcpu->kvm)) { + kvm_vgic_reset_mapped_irq(vcpu, map.direct_vtimer->irq.irq); + if (map.direct_ptimer) + kvm_vgic_reset_mapped_irq(vcpu, map.direct_ptimer->irq.irq); + } + } + + if (map.emul_ptimer) + soft_timer_cancel(&map.emul_ptimer->hrtimer); return 0; } @@ -687,15 +696,18 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); timer->bg_timer.function = kvm_bg_timer_expire; + hrtimer_init(&vtimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); hrtimer_init(&ptimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - ptimer->hrtimer.function = kvm_phys_timer_expire; + vtimer->hrtimer.function = kvm_hrtimer_expire; + ptimer->hrtimer.function = kvm_hrtimer_expire; vtimer->irq.irq = default_vtimer_irq.irq; - vtimer->host_timer_irq = host_vtimer_irq; - vtimer->host_timer_irq_flags = host_vtimer_irq_flags; - ptimer->irq.irq = default_ptimer_irq.irq; + + vtimer->host_timer_irq = host_vtimer_irq; ptimer->host_timer_irq = host_ptimer_irq; + + vtimer->host_timer_irq_flags = host_vtimer_irq_flags; ptimer->host_timer_irq_flags = host_ptimer_irq_flags; vtimer->vcpu = vcpu; @@ -710,32 +722,39 @@ static void kvm_timer_init_interrupt(void *info) int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) { + struct arch_timer_context *timer; + bool level; + switch (regid) { case KVM_REG_ARM_TIMER_CTL: - kvm_arm_timer_write(vcpu, - vcpu_vtimer(vcpu), TIMER_REG_CTL, value); + timer = vcpu_vtimer(vcpu); + kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value); break; case KVM_REG_ARM_TIMER_CNT: + timer = vcpu_vtimer(vcpu); update_vtimer_cntvoff(vcpu, kvm_phys_timer_read() - value); break; case KVM_REG_ARM_TIMER_CVAL: - kvm_arm_timer_write(vcpu, - vcpu_vtimer(vcpu), TIMER_REG_CVAL, value); + timer = vcpu_vtimer(vcpu); + kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value); break; case KVM_REG_ARM_PTIMER_CTL: - kvm_arm_timer_write(vcpu, - vcpu_ptimer(vcpu), TIMER_REG_CTL, value); + timer = vcpu_ptimer(vcpu); + kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value); break; case KVM_REG_ARM_PTIMER_CVAL: - kvm_arm_timer_write(vcpu, - vcpu_ptimer(vcpu), TIMER_REG_CVAL, value); + timer = vcpu_ptimer(vcpu); + kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value); break; default: return -1; } - kvm_timer_update_state(vcpu); + level = kvm_timer_should_fire(timer); + kvm_timer_update_irq(vcpu, 
level, timer); + timer_emulate(timer); + return 0; } @@ -1020,8 +1039,7 @@ bool kvm_arch_timer_get_input_level(int vintid) int kvm_timer_enable(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + struct timer_map map; int ret; if (timer->enabled) @@ -1039,18 +1057,25 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu) return -EINVAL; } - ret = kvm_vgic_map_phys_irq(vcpu, host_vtimer_irq, vtimer->irq.irq, + get_timer_map(vcpu, &map); + + ret = kvm_vgic_map_phys_irq(vcpu, + map.direct_vtimer->host_timer_irq, + map.direct_vtimer->irq.irq, kvm_arch_timer_get_input_level); if (ret) return ret; - if (has_vhe()) { - ret = kvm_vgic_map_phys_irq(vcpu, host_ptimer_irq, ptimer->irq.irq, + if (map.direct_ptimer) { + ret = kvm_vgic_map_phys_irq(vcpu, + map.direct_ptimer->host_timer_irq, + map.direct_ptimer->irq.irq, kvm_arch_timer_get_input_level); - if (ret) - return ret; } + if (ret) + return ret; + no_vgic: timer->enabled = 1; return 0; diff --git a/virt/kvm/arm/trace.h b/virt/kvm/arm/trace.h index 3828beab93f2..54bb059243b9 100644 --- a/virt/kvm/arm/trace.h +++ b/virt/kvm/arm/trace.h @@ -2,6 +2,7 @@ #if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_KVM_H +#include #include #undef TRACE_SYSTEM @@ -262,6 +263,110 @@ TRACE_EVENT(kvm_timer_update_irq, __entry->vcpu_id, __entry->irq, __entry->level) ); +TRACE_EVENT(kvm_get_timer_map, + TP_PROTO(unsigned long vcpu_id, struct timer_map *map), + TP_ARGS(vcpu_id, map), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_id ) + __field( int, direct_vtimer ) + __field( int, direct_ptimer ) + __field( int, emul_ptimer ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->direct_vtimer = arch_timer_ctx_index(map->direct_vtimer); + __entry->direct_ptimer = + (map->direct_ptimer) ? arch_timer_ctx_index(map->direct_ptimer) : -1; + __entry->emul_ptimer = + (map->emul_ptimer) ? 
arch_timer_ctx_index(map->emul_ptimer) : -1; + ), + + TP_printk("VCPU: %ld, dv: %d, dp: %d, ep: %d", + __entry->vcpu_id, + __entry->direct_vtimer, + __entry->direct_ptimer, + __entry->emul_ptimer) +); + +TRACE_EVENT(kvm_timer_save_state, + TP_PROTO(struct arch_timer_context *ctx), + TP_ARGS(ctx), + + TP_STRUCT__entry( + __field( unsigned long, ctl ) + __field( unsigned long long, cval ) + __field( int, timer_idx ) + ), + + TP_fast_assign( + __entry->ctl = ctx->cnt_ctl; + __entry->cval = ctx->cnt_cval; + __entry->timer_idx = arch_timer_ctx_index(ctx); + ), + + TP_printk(" CTL: %#08lx CVAL: %#16llx arch_timer_ctx_index: %d", + __entry->ctl, + __entry->cval, + __entry->timer_idx) +); + +TRACE_EVENT(kvm_timer_restore_state, + TP_PROTO(struct arch_timer_context *ctx), + TP_ARGS(ctx), + + TP_STRUCT__entry( + __field( unsigned long, ctl ) + __field( unsigned long long, cval ) + __field( int, timer_idx ) + ), + + TP_fast_assign( + __entry->ctl = ctx->cnt_ctl; + __entry->cval = ctx->cnt_cval; + __entry->timer_idx = arch_timer_ctx_index(ctx); + ), + + TP_printk("CTL: %#08lx CVAL: %#16llx arch_timer_ctx_index: %d", + __entry->ctl, + __entry->cval, + __entry->timer_idx) +); + +TRACE_EVENT(kvm_timer_hrtimer_expire, + TP_PROTO(struct arch_timer_context *ctx), + TP_ARGS(ctx), + + TP_STRUCT__entry( + __field( int, timer_idx ) + ), + + TP_fast_assign( + __entry->timer_idx = arch_timer_ctx_index(ctx); + ), + + TP_printk("arch_timer_ctx_index: %d", __entry->timer_idx) +); + +TRACE_EVENT(kvm_timer_emulate, + TP_PROTO(struct arch_timer_context *ctx, bool should_fire), + TP_ARGS(ctx, should_fire), + + TP_STRUCT__entry( + __field( int, timer_idx ) + __field( bool, should_fire ) + ), + + TP_fast_assign( + __entry->timer_idx = arch_timer_ctx_index(ctx); + __entry->should_fire = should_fire; + ), + + TP_printk("arch_timer_ctx_index: %d (should_fire: %d)", + __entry->timer_idx, __entry->should_fire) +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH From 64cf98fa5544aee6c547786ee32f92b796b30635 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Sun, 1 May 2016 22:29:58 +0200 Subject: [PATCH 150/855] KVM: arm/arm64: Move kvm_is_write_fault to header file Move this little function to the header files for arm/arm64 so other code can make use of it directly. 
Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm/include/asm/kvm_emulate.h | 8 ++++++++ arch/arm64/include/asm/kvm_emulate.h | 8 ++++++++ virt/kvm/arm/mmu.c | 8 -------- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h index 77121b713bef..8927cae7c966 100644 --- a/arch/arm/include/asm/kvm_emulate.h +++ b/arch/arm/include/asm/kvm_emulate.h @@ -265,6 +265,14 @@ static inline bool kvm_vcpu_dabt_isextabt(struct kvm_vcpu *vcpu) } } +static inline bool kvm_is_write_fault(struct kvm_vcpu *vcpu) +{ + if (kvm_vcpu_trap_is_iabt(vcpu)) + return false; + + return kvm_vcpu_dabt_iswrite(vcpu); +} + static inline u32 kvm_vcpu_hvc_get_imm(struct kvm_vcpu *vcpu) { return kvm_vcpu_get_hsr(vcpu) & HSR_HVC_IMM_MASK; diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 506386a3edde..a0d1ce9ae12b 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -331,6 +331,14 @@ static inline int kvm_vcpu_sys_get_rt(struct kvm_vcpu *vcpu) return ESR_ELx_SYS64_ISS_RT(esr); } +static inline bool kvm_is_write_fault(struct kvm_vcpu *vcpu) +{ + if (kvm_vcpu_trap_is_iabt(vcpu)) + return false; + + return kvm_vcpu_dabt_iswrite(vcpu); +} + static inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu) { return vcpu_read_sys_reg(vcpu, MPIDR_EL1) & MPIDR_HWID_BITMASK; diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index f8dda452ea24..e3e5a26b4845 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1403,14 +1403,6 @@ static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap) return false; } -static bool kvm_is_write_fault(struct kvm_vcpu *vcpu) -{ - if (kvm_vcpu_trap_is_iabt(vcpu)) - return false; - - return kvm_vcpu_dabt_iswrite(vcpu); -} - /** * stage2_wp_ptes - write protect PMD range * @pmd: pointer to pmd entry From f7f2b15c3d42fa5754131b34a0f7ad5a5c3f777f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 31 Jan 2019 14:17:17 +0100 Subject: [PATCH 151/855] arm64: KVM: Expose sanitised cache type register to guest We currently permit CPUs in the same system to deviate in the exact topology of the caches, and we subsequently hide this fact from user space by exposing a sanitised value of the cache type register CTR_EL0. However, guests running under KVM see the bare value of CTR_EL0, which could potentially result in issues with, e.g., JITs or other pieces of code that are sensitive to misreported cache line sizes. So let's start trapping cache ID instructions if there is a mismatch, and expose the sanitised version of CTR_EL0 to guests. Note that CTR_EL0 is treated as an invariant to KVM user space, so update that part as well. 
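Concretely, the trapping is driven by HCR_EL2.TID2, which redirects guest accesses to the cache ID registers (CTR_EL0, CCSIDR_EL1, CLIDR_EL1 and CSSELR_EL1) to EL2. A minimal sketch of the two halves, simply restating the hunks below with the handler reduced to its essence:

	/* vcpu_reset_hcr(): only pay for the trap when CTR_EL0 differs across CPUs */
	if (cpus_have_const_cap(ARM64_MISMATCHED_CACHE_TYPE))
		vcpu->arch.hcr_el2 |= HCR_TID2;

	/* access_ctr(): guest reads get the system-wide sanitised value */
	p->regval = read_sanitised_ftr_reg(SYS_CTR_EL0);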
Acked-by: Christoffer Dall Signed-off-by: Ard Biesheuvel Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_emulate.h | 3 ++ arch/arm64/include/asm/sysreg.h | 1 + arch/arm64/kvm/sys_regs.c | 59 +++++++++++++++++++++++++++- 3 files changed, 61 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index a0d1ce9ae12b..9acccb1e3746 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -77,6 +77,9 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu) */ if (!vcpu_el1_is_32bit(vcpu)) vcpu->arch.hcr_el2 |= HCR_TID3; + + if (cpus_have_const_cap(ARM64_MISMATCHED_CACHE_TYPE)) + vcpu->arch.hcr_el2 |= HCR_TID2; } static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 6482e8bcf1b8..5b267dec6194 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -361,6 +361,7 @@ #define SYS_CNTKCTL_EL1 sys_reg(3, 0, 14, 1, 0) +#define SYS_CCSIDR_EL1 sys_reg(3, 1, 0, 0, 0) #define SYS_CLIDR_EL1 sys_reg(3, 1, 0, 0, 1) #define SYS_AIDR_EL1 sys_reg(3, 1, 0, 0, 7) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 0d348500b24b..45a07cfc57ed 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1146,6 +1146,49 @@ static int set_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, return __set_id_reg(rd, uaddr, true); } +static bool access_ctr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + p->regval = read_sanitised_ftr_reg(SYS_CTR_EL0); + return true; +} + +static bool access_clidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + p->regval = read_sysreg(clidr_el1); + return true; +} + +static bool access_csselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + vcpu_write_sys_reg(vcpu, p->regval, r->reg); + else + p->regval = vcpu_read_sys_reg(vcpu, r->reg); + return true; +} + +static bool access_ccsidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 csselr; + + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + csselr = vcpu_read_sys_reg(vcpu, CSSELR_EL1); + p->regval = get_ccsidr(csselr); + return true; +} + /* sys_reg_desc initialiser for known cpufeature ID registers */ #define ID_SANITISED(name) { \ SYS_DESC(SYS_##name), \ @@ -1363,7 +1406,10 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_CNTKCTL_EL1), NULL, reset_val, CNTKCTL_EL1, 0}, - { SYS_DESC(SYS_CSSELR_EL1), NULL, reset_unknown, CSSELR_EL1 }, + { SYS_DESC(SYS_CCSIDR_EL1), access_ccsidr }, + { SYS_DESC(SYS_CLIDR_EL1), access_clidr }, + { SYS_DESC(SYS_CSSELR_EL1), access_csselr, reset_unknown, CSSELR_EL1 }, + { SYS_DESC(SYS_CTR_EL0), access_ctr }, { SYS_DESC(SYS_PMCR_EL0), access_pmcr, reset_pmcr, }, { SYS_DESC(SYS_PMCNTENSET_EL0), access_pmcnten, reset_unknown, PMCNTENSET_EL0 }, @@ -1663,6 +1709,7 @@ static const struct sys_reg_desc cp14_64_regs[] = { * register). 
*/ static const struct sys_reg_desc cp15_regs[] = { + { Op1( 0), CRn( 0), CRm( 0), Op2( 1), access_ctr }, { Op1( 0), CRn( 1), CRm( 0), Op2( 0), access_vm_reg, NULL, c1_SCTLR }, { Op1( 0), CRn( 2), CRm( 0), Op2( 0), access_vm_reg, NULL, c2_TTBR0 }, { Op1( 0), CRn( 2), CRm( 0), Op2( 1), access_vm_reg, NULL, c2_TTBR1 }, @@ -1779,6 +1826,10 @@ static const struct sys_reg_desc cp15_regs[] = { PMU_PMEVTYPER(30), /* PMCCFILTR */ { Op1(0), CRn(14), CRm(15), Op2(7), access_pmu_evtyper }, + + { Op1(1), CRn( 0), CRm( 0), Op2(0), access_ccsidr }, + { Op1(1), CRn( 0), CRm( 0), Op2(1), access_clidr }, + { Op1(2), CRn( 0), CRm( 0), Op2(0), access_csselr, NULL, c0_CSSELR }, }; static const struct sys_reg_desc cp15_64_regs[] = { @@ -2192,11 +2243,15 @@ static const struct sys_reg_desc *index_to_sys_reg_desc(struct kvm_vcpu *vcpu, } FUNCTION_INVARIANT(midr_el1) -FUNCTION_INVARIANT(ctr_el0) FUNCTION_INVARIANT(revidr_el1) FUNCTION_INVARIANT(clidr_el1) FUNCTION_INVARIANT(aidr_el1) +static void get_ctr_el0(struct kvm_vcpu *v, const struct sys_reg_desc *r) +{ + ((struct sys_reg_desc *)r)->val = read_sanitised_ftr_reg(SYS_CTR_EL0); +} + /* ->val is filled in by kvm_sys_reg_table_init() */ static struct sys_reg_desc invariant_sys_regs[] = { { SYS_DESC(SYS_MIDR_EL1), NULL, get_midr_el1 }, From 793acf870ea3e492c6bb3edb5f8657d9c4f4903f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 31 Jan 2019 14:17:18 +0100 Subject: [PATCH 152/855] arm64: KVM: Describe data or unified caches as having 1 set and 1 way On SMP ARM systems, cache maintenance by set/way should only ever be done in the context of onlining or offlining CPUs, which is typically done by bare metal firmware and never in a virtual machine. For this reason, we trap set/way cache maintenance operations and replace them with conditional flushing of the entire guest address space. Due to this trapping, the set/way arguments passed into the set/way ops are completely ignored, and thus irrelevant. This also means that the set/way geometry is equally irrelevant, and we can simply report it as 1 set and 1 way, so that legacy 32-bit ARM system software (i.e., the kind that only receives odd fixes) doesn't take a performance hit due to the trapping when iterating over the cachelines. 
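For reference, the "1 set, 1 way" trick relies on the CCSIDR_EL1 layout: LineSize lives in bits [2:0], (Associativity - 1) in bits [12:3] and (NumSets - 1) in bits [27:13], so clearing bits [27:3] leaves the reported line size intact while advertising exactly one set and one way. That is all the masking in the hunk below does; as a sketch:

	csselr = vcpu_read_sys_reg(vcpu, CSSELR_EL1);
	p->regval = get_ccsidr(csselr);
	if (!(csselr & 1))			/* data or unified cache selected */
		p->regval &= ~GENMASK(27, 3);	/* report 1 set, 1 way */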
Acked-by: Christoffer Dall Signed-off-by: Ard Biesheuvel Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_emulate.h | 3 ++- arch/arm64/kvm/sys_regs.c | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 9acccb1e3746..d3842791e1c4 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -78,7 +78,8 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu) if (!vcpu_el1_is_32bit(vcpu)) vcpu->arch.hcr_el2 |= HCR_TID3; - if (cpus_have_const_cap(ARM64_MISMATCHED_CACHE_TYPE)) + if (cpus_have_const_cap(ARM64_MISMATCHED_CACHE_TYPE) || + vcpu_el1_is_32bit(vcpu)) vcpu->arch.hcr_el2 |= HCR_TID2; } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 45a07cfc57ed..81a342679e60 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1186,6 +1186,21 @@ static bool access_ccsidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, csselr = vcpu_read_sys_reg(vcpu, CSSELR_EL1); p->regval = get_ccsidr(csselr); + + /* + * Guests should not be doing cache operations by set/way at all, and + * for this reason, we trap them and attempt to infer the intent, so + * that we can flush the entire guest's address space at the appropriate + * time. + * To prevent this trapping from causing performance problems, let's + * expose the geometry of all data and unified caches (which are + * guaranteed to be PIPT and thus non-aliasing) as 1 set and 1 way. + * [If guests should attempt to infer aliasing properties from the + * geometry (which is not permitted by the architecture), they would + * only do so for virtually indexed caches.] + */ + if (!(csselr & 1)) // data or unified cache + p->regval &= ~GENMASK(27, 3); return true; } From bae561c0cff7e2cde99990ac3b5a58f815d6789d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 20 Jan 2019 20:32:31 +0000 Subject: [PATCH 153/855] KVM: arm/arm64: arch_timer: Mark physical interrupt active when a virtual interrupt is pending When a guest gets scheduled, KVM performs a "load" operation, which for the timer includes evaluating the virtual "active" state of the interrupt, and replicating it on the physical side. This ensures that the deactivation in the guest will also take place in the physical GIC distributor. If the interrupt is not yet active, we flag it as inactive on the physical side. This means that on restoring the timer registers, if the timer has expired, we'll immediately take an interrupt. That's absolutely fine, as the interrupt will then be flagged as active on the physical side. What this assumes though is that we'll enter the guest right after having taken the interrupt, and that the guest will quickly ACK the interrupt, making it active at on the virtual side. It turns out that quite often, this assumption doesn't really hold. The guest may be preempted on the back on this interrupt, either from kernel space or whilst running at EL1 when a host interrupt fires. When this happens, we repeat the whole sequence on the next load (interrupt marked as inactive, timer registers restored, interrupt fires). And if it takes a really long time for a guest to activate the interrupt (as it does with nested virt), we end-up with many such events in quick succession, leading to the guest only making very slow progress. This can also be seen with the number of virtual timer interrupt on the host being far greater than the same number in the guest. 
An easy way to fix this is to evaluate the timer state when performing the "load" operation, just like we do when the interrupt actually fires. If the timer has a pending virtual interrupt at this stage, then we can safely flag the physical interrupt as being active, which prevents spurious exits. Signed-off-by: Marc Zyngier --- virt/kvm/arm/arch_timer.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 17f9de73cc8a..af8f2f1d01cc 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -487,12 +487,21 @@ static inline void set_timer_irq_phys_active(struct arch_timer_context *ctx, boo static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx) { struct kvm_vcpu *vcpu = ctx->vcpu; - bool phys_active; + bool phys_active = false; + + /* + * Update the timer output so that it is likely to match the + * state we're about to restore. If the timer expires between + * this point and the register restoration, we'll take the + * interrupt anyway. + */ + kvm_timer_update_irq(ctx->vcpu, kvm_timer_should_fire(ctx), ctx); if (irqchip_in_kernel(vcpu->kvm)) phys_active = kvm_vgic_map_is_active(vcpu, ctx->irq.irq); - else - phys_active = ctx->irq.level; + + phys_active |= ctx->irq.level; + set_timer_irq_phys_active(ctx, phys_active); } From 49dfe94fe5ad97d380f81544b6d3626b076a1ef6 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 25 Jan 2019 16:57:28 +0900 Subject: [PATCH 154/855] KVM: arm/arm64: Fix TRACE_INCLUDE_PATH As the comment block in include/trace/define_trace.h says, TRACE_INCLUDE_PATH should be a relative path to the define_trace.h ../../virt/kvm/arm is the correct relative path. ../../../virt/kvm/arm is working by coincidence because the top Makefile adds -I$(srctree)/arch/$(SRCARCH)/include as a header search path, but we should not rely on it. Acked-by: Christoffer Dall Signed-off-by: Masahiro Yamada Signed-off-by: Marc Zyngier --- virt/kvm/arm/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/arm/trace.h b/virt/kvm/arm/trace.h index 54bb059243b9..204d210d01c2 100644 --- a/virt/kvm/arm/trace.h +++ b/virt/kvm/arm/trace.h @@ -370,7 +370,7 @@ TRACE_EVENT(kvm_timer_emulate, #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH ../../../virt/kvm/arm +#define TRACE_INCLUDE_PATH ../../virt/kvm/arm #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE trace From 3644a35b0244ddaeafa170a371144c4f12682c98 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 25 Jan 2019 16:57:29 +0900 Subject: [PATCH 155/855] KVM: arm/arm64: Remove -I. header search paths The header search path -I. in kernel Makefiles is very suspicious; it allows the compiler to search for headers in the top of $(srctree), where obviously no header file exists. I was able to build without these extra header search paths. Acked-by: Christoffer Dall Signed-off-by: Masahiro Yamada Signed-off-by: Marc Zyngier --- arch/arm/kvm/Makefile | 3 +-- arch/arm64/kvm/Makefile | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile index 48de846f2246..bca775e222c3 100644 --- a/arch/arm/kvm/Makefile +++ b/arch/arm/kvm/Makefile @@ -9,8 +9,7 @@ ifeq ($(plus_virt),+virt) endif ccflags-y += -Iarch/arm/kvm -Ivirt/kvm/arm/vgic -CFLAGS_arm.o := -I. $(plus_virt_def) -CFLAGS_mmu.o := -I. 
+CFLAGS_arm.o := $(plus_virt_def) AFLAGS_init.o := -Wa,-march=armv7-a$(plus_virt) AFLAGS_interrupts.o := -Wa,-march=armv7-a$(plus_virt) diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 0f2a135ba15b..3089b3159135 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -4,8 +4,6 @@ # ccflags-y += -Iarch/arm64/kvm -Ivirt/kvm/arm/vgic -CFLAGS_arm.o := -I. -CFLAGS_mmu.o := -I. KVM=../../../virt/kvm From 05277f368c33fdc12d91464f25bb5656b808ec8a Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 25 Jan 2019 16:57:30 +0900 Subject: [PATCH 156/855] KVM: arm/arm64: Prefix header search paths with $(srctree)/ Currently, the Kbuild core manipulates header search paths in a crazy way [1]. To fix this mess, I want all Makefiles to add explicit $(srctree)/ to the search paths in the srctree. Some Makefiles are already written in that way, but not all. The goal of this work is to make the notation consistent, and finally get rid of the gross hacks. Having whitespaces after -I does not matter since commit 48f6e3cf5bc6 ("kbuild: do not drop -I without parameter"). [1]: https://patchwork.kernel.org/patch/9632347/ Acked-by: Christoffer Dall Signed-off-by: Masahiro Yamada Signed-off-by: Marc Zyngier --- arch/arm/kvm/Makefile | 2 +- arch/arm64/kvm/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile index bca775e222c3..531e59f5be9c 100644 --- a/arch/arm/kvm/Makefile +++ b/arch/arm/kvm/Makefile @@ -8,7 +8,7 @@ ifeq ($(plus_virt),+virt) plus_virt_def := -DREQUIRES_VIRT=1 endif -ccflags-y += -Iarch/arm/kvm -Ivirt/kvm/arm/vgic +ccflags-y += -I $(srctree)/$(src) -I $(srctree)/virt/kvm/arm/vgic CFLAGS_arm.o := $(plus_virt_def) AFLAGS_init.o := -Wa,-march=armv7-a$(plus_virt) diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 3089b3159135..690e033a91c0 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -3,7 +3,7 @@ # Makefile for Kernel-based Virtual Machine module # -ccflags-y += -Iarch/arm64/kvm -Ivirt/kvm/arm/vgic +ccflags-y += -I $(srctree)/$(src) -I $(srctree)/virt/kvm/arm/vgic KVM=../../../virt/kvm From 2b59066902548c17ddf5443745dbf8ce50d0c037 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 12 Feb 2019 10:07:47 +0000 Subject: [PATCH 157/855] KVM: arm/arm64: Update MAINTAINERS entries For historical reasons, KVM/arm and KVM/arm64 have had different entries in the MAINTAINER file. This makes little sense, as they are maintained together. On top of that, we have a bunch of talented people helping with the reviewing, and they deserve to be mentioned in the consolidated entry. 
Acked-by: Christoffer Dall Acked-by: Suzuki K Poulose Acked-by: James Morse Signed-off-by: Marc Zyngier --- MAINTAINERS | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 51029a425dbe..5d68f050d155 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8297,29 +8297,25 @@ S: Maintained F: arch/x86/include/asm/svm.h F: arch/x86/kvm/svm.c -KERNEL VIRTUAL MACHINE FOR ARM (KVM/arm) +KERNEL VIRTUAL MACHINE FOR ARM/ARM64 (KVM/arm, KVM/arm64) M: Christoffer Dall M: Marc Zyngier +R: James Morse +R: Julien Thierry +R: Suzuki K Pouloze L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) L: kvmarm@lists.cs.columbia.edu W: http://systems.cs.columbia.edu/projects/kvm-arm T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git -S: Supported +S: Maintained F: arch/arm/include/uapi/asm/kvm* F: arch/arm/include/asm/kvm* F: arch/arm/kvm/ -F: virt/kvm/arm/ -F: include/kvm/arm_* - -KERNEL VIRTUAL MACHINE FOR ARM64 (KVM/arm64) -M: Christoffer Dall -M: Marc Zyngier -L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -L: kvmarm@lists.cs.columbia.edu -S: Maintained F: arch/arm64/include/uapi/asm/kvm* F: arch/arm64/include/asm/kvm* F: arch/arm64/kvm/ +F: virt/kvm/arm/ +F: include/kvm/arm_* KERNEL VIRTUAL MACHINE FOR MIPS (KVM/mips) M: James Hogan From a37f0c3c46d2d5f4e94bb1e80e3f6f1f8fd89b40 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sun, 17 Feb 2019 22:36:26 +0000 Subject: [PATCH 158/855] KVM: arm/arm64: fix spelling mistake: "auxilary" -> "auxiliary" There is a spelling mistake in a kvm_err error message. Fix it. Signed-off-by: Colin Ian King Signed-off-by: Marc Zyngier --- virt/kvm/arm/arm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 9fbdb9e1c51f..8de55041e7ba 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -1553,7 +1553,7 @@ static int init_hyp_mode(void) err = hyp_map_aux_data(); if (err) - kvm_err("Cannot map host auxilary data: %d\n", err); + kvm_err("Cannot map host auxiliary data: %d\n", err); return 0; From 1b44471b555930cd8d03ae9be86bd2c32fbf0a92 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Thu, 14 Feb 2019 01:45:46 +0000 Subject: [PATCH 159/855] KVM: arm64: Fix comment for KVM_PHYS_SHIFT Since Suzuki K Poulose's work on Dynamic IPA support, KVM_PHYS_SHIFT will be used only when machine_type's bits[7:0] equal to 0 (by default). Thus the outdated comment should be fixed. Reviewed-by: Suzuki K Poulose Signed-off-by: Zenghui Yu Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_mmu.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index c423c8c4fc39..b0742a16c6c9 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -138,7 +138,8 @@ static inline unsigned long __kern_hyp_va(unsigned long v) }) /* - * We currently only support a 40bit IPA. + * We currently support using a VM-specified IPA size. For backward + * compatibility, the default IPA size is fixed to 40bits. */ #define KVM_PHYS_SHIFT (40) From c2be79a0bcf34ed975787e03a129936df17ec0dc Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Tue, 19 Feb 2019 17:22:21 +0800 Subject: [PATCH 160/855] KVM: arm/arm64: Remove unused gpa_end variable The 'gpa_end' local variable is never used and let's remove it. 
Cc: Christoffer Dall Signed-off-by: Shaokun Zhang Signed-off-by: Marc Zyngier --- virt/kvm/arm/mmu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index e3e5a26b4845..aae68da2717f 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1597,14 +1597,13 @@ static void kvm_send_hwpoison_signal(unsigned long address, static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot, unsigned long hva) { - gpa_t gpa_start, gpa_end; + gpa_t gpa_start; hva_t uaddr_start, uaddr_end; size_t size; size = memslot->npages * PAGE_SIZE; gpa_start = memslot->base_gfn << PAGE_SHIFT; - gpa_end = gpa_start + size; uaddr_start = memslot->userspace_addr; uaddr_end = uaddr_start + size; From 21c75763a3ae18679e5c4e2260aa9379b073566b Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 20 Feb 2019 11:39:36 -0800 Subject: [PATCH 161/855] device-dax: Add a 'target_node' attribute The target-node attribute is the Linux numa-node that a device-dax instance may create when it is online. Prior to being online the device's 'numa_node' property reflects the closest online cpu node which is the typical expectation of a device 'numa_node'. Once it is online it becomes its own distinct numa node, i.e. 'target_node'. Export the 'target_node' property to give userspace tooling the ability to predict the effective numa-node from a device-dax instance configured to provide 'System RAM' capacity. Cc: Vishal Verma Reported-by: Dave Hansen Signed-off-by: Dan Williams --- drivers/dax/bus.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index a410154d75fb..28c3324271ac 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -279,13 +279,41 @@ static ssize_t size_show(struct device *dev, } static DEVICE_ATTR_RO(size); +static int dev_dax_target_node(struct dev_dax *dev_dax) +{ + struct dax_region *dax_region = dev_dax->region; + + return dax_region->target_node; +} + +static ssize_t target_node_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + + return sprintf(buf, "%d\n", dev_dax_target_node(dev_dax)); +} +static DEVICE_ATTR_RO(target_node); + +static umode_t dev_dax_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct dev_dax *dev_dax = to_dev_dax(dev); + + if (a == &dev_attr_target_node.attr && dev_dax_target_node(dev_dax) < 0) + return 0; + return a->mode; +} + static struct attribute *dev_dax_attributes[] = { &dev_attr_size.attr, + &dev_attr_target_node.attr, NULL, }; static const struct attribute_group dev_dax_attribute_group = { .attrs = dev_dax_attributes, + .is_visible = dev_dax_visible, }; static const struct attribute_group *dax_attribute_groups[] = { From 95c7b77d6e40a407ca5bd21d7ba2e1c28ad8e85a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:09 -0800 Subject: [PATCH 162/855] KVM: x86: Explicitly #define the VCPU_REGS_* indices Declaring the VCPU_REGS_* as enums allows for more robust C code, but it prevents using the values in assembly files. Expliciting #define the indices in an asm-friendly file to prepare for VMX moving its transition code to a proper assembly file, but keep the enums for general usage. 
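As a rough illustration of the pattern (all names below are invented for the
example, they are not the kernel's), a single assembler-safe header can carry
the raw numbers while C code keeps a type-checked enum layered on top:

  /* regs_index.h -- illustrative only; safe to include from .c and .S files */
  #define REG_FOO 0
  #define REG_BAR 1

  #ifndef __ASSEMBLY__
  /* C translation units still get the more robust enum. */
  enum reg {
          FOO = REG_FOO,
          BAR = REG_BAR,
  };
  #endif

Assembly files can then use REG_FOO directly in offset arithmetic, while C
code keeps using the enum exactly as before.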
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 33 ++++++++++++++-------------- arch/x86/include/asm/kvm_vcpu_regs.h | 25 +++++++++++++++++++++ 2 files changed, 42 insertions(+), 16 deletions(-) create mode 100644 arch/x86/include/asm/kvm_vcpu_regs.h diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4660ce90de7f..0e2ef41efb9d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -35,6 +35,7 @@ #include #include #include +#include #include #define KVM_MAX_VCPUS 288 @@ -137,23 +138,23 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) #define ASYNC_PF_PER_VCPU 64 enum kvm_reg { - VCPU_REGS_RAX = 0, - VCPU_REGS_RCX = 1, - VCPU_REGS_RDX = 2, - VCPU_REGS_RBX = 3, - VCPU_REGS_RSP = 4, - VCPU_REGS_RBP = 5, - VCPU_REGS_RSI = 6, - VCPU_REGS_RDI = 7, + VCPU_REGS_RAX = __VCPU_REGS_RAX, + VCPU_REGS_RCX = __VCPU_REGS_RCX, + VCPU_REGS_RDX = __VCPU_REGS_RDX, + VCPU_REGS_RBX = __VCPU_REGS_RBX, + VCPU_REGS_RSP = __VCPU_REGS_RSP, + VCPU_REGS_RBP = __VCPU_REGS_RBP, + VCPU_REGS_RSI = __VCPU_REGS_RSI, + VCPU_REGS_RDI = __VCPU_REGS_RDI, #ifdef CONFIG_X86_64 - VCPU_REGS_R8 = 8, - VCPU_REGS_R9 = 9, - VCPU_REGS_R10 = 10, - VCPU_REGS_R11 = 11, - VCPU_REGS_R12 = 12, - VCPU_REGS_R13 = 13, - VCPU_REGS_R14 = 14, - VCPU_REGS_R15 = 15, + VCPU_REGS_R8 = __VCPU_REGS_R8, + VCPU_REGS_R9 = __VCPU_REGS_R9, + VCPU_REGS_R10 = __VCPU_REGS_R10, + VCPU_REGS_R11 = __VCPU_REGS_R11, + VCPU_REGS_R12 = __VCPU_REGS_R12, + VCPU_REGS_R13 = __VCPU_REGS_R13, + VCPU_REGS_R14 = __VCPU_REGS_R14, + VCPU_REGS_R15 = __VCPU_REGS_R15, #endif VCPU_REGS_RIP, NR_VCPU_REGS diff --git a/arch/x86/include/asm/kvm_vcpu_regs.h b/arch/x86/include/asm/kvm_vcpu_regs.h new file mode 100644 index 000000000000..1af2cb59233b --- /dev/null +++ b/arch/x86/include/asm/kvm_vcpu_regs.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_KVM_VCPU_REGS_H +#define _ASM_X86_KVM_VCPU_REGS_H + +#define __VCPU_REGS_RAX 0 +#define __VCPU_REGS_RCX 1 +#define __VCPU_REGS_RDX 2 +#define __VCPU_REGS_RBX 3 +#define __VCPU_REGS_RSP 4 +#define __VCPU_REGS_RBP 5 +#define __VCPU_REGS_RSI 6 +#define __VCPU_REGS_RDI 7 + +#ifdef CONFIG_X86_64 +#define __VCPU_REGS_R8 8 +#define __VCPU_REGS_R9 9 +#define __VCPU_REGS_R10 10 +#define __VCPU_REGS_R11 11 +#define __VCPU_REGS_R12 12 +#define __VCPU_REGS_R13 13 +#define __VCPU_REGS_R14 14 +#define __VCPU_REGS_R15 15 +#endif + +#endif /* _ASM_X86_KVM_VCPU_REGS_H */ From c14f9dd50b01b55834a757dd50af35b8e168512d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:10 -0800 Subject: [PATCH 163/855] KVM: VMX: Use #defines in place of immediates in VM-Enter inline asm ...to prepare for moving the inline asm to a proper asm sub-routine. Eliminating the immediates allows a nearly verbatim move, e.g. quotes, newlines, tabs and __stringify need to be dropped, but other than those cosmetic changes the only function change will be to replace the final "jmp" with a "ret". 
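As a hedged, self-contained sketch of the mechanism (the macro and function
names here are invented, and the two stringify macros merely mirror what
<linux/stringify.h> provides): the argument is macro-expanded first and only
then turned into a string literal, so the offset arithmetic ends up inside
the asm template where the assembler folds it, instead of being passed as an
"i" constraint:

  #define __stringify_1(x...)   #x
  #define __stringify(x...)     __stringify_1(x)

  #define SLOT          3
  #define SLOT_WORD     8
  #define SLOT_OFF      __stringify(SLOT * SLOT_WORD)    /* the string "3 * 8" */

  /* x86 only; the assembler sees "mov 3 * 8(%reg), %reg". */
  static unsigned long load_slot(unsigned long *base)
  {
          unsigned long val;

          asm("mov " SLOT_OFF "(%[base]), %[val]"
              : [val] "=r" (val)
              : [base] "r" (base)
              : "memory");
          return val;
  }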
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 113 ++++++++++++++++++++++------------------- 1 file changed, 61 insertions(+), 52 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index cd92568bed26..12fb342218ad 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6371,6 +6371,33 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) } } +#ifdef CONFIG_X86_64 +#define WORD_SIZE 8 +#else +#define WORD_SIZE 4 +#endif + +#define _WORD_SIZE __stringify(WORD_SIZE) + +#define VCPU_RAX __stringify(__VCPU_REGS_RAX * WORD_SIZE) +#define VCPU_RCX __stringify(__VCPU_REGS_RCX * WORD_SIZE) +#define VCPU_RDX __stringify(__VCPU_REGS_RDX * WORD_SIZE) +#define VCPU_RBX __stringify(__VCPU_REGS_RBX * WORD_SIZE) +/* Intentionally omit %RSP as it's context switched by hardware */ +#define VCPU_RBP __stringify(__VCPU_REGS_RBP * WORD_SIZE) +#define VCPU_RSI __stringify(__VCPU_REGS_RSI * WORD_SIZE) +#define VCPU_RDI __stringify(__VCPU_REGS_RDI * WORD_SIZE) +#ifdef CONFIG_X86_64 +#define VCPU_R8 __stringify(__VCPU_REGS_R8 * WORD_SIZE) +#define VCPU_R9 __stringify(__VCPU_REGS_R9 * WORD_SIZE) +#define VCPU_R10 __stringify(__VCPU_REGS_R10 * WORD_SIZE) +#define VCPU_R11 __stringify(__VCPU_REGS_R11 * WORD_SIZE) +#define VCPU_R12 __stringify(__VCPU_REGS_R12 * WORD_SIZE) +#define VCPU_R13 __stringify(__VCPU_REGS_R13 * WORD_SIZE) +#define VCPU_R14 __stringify(__VCPU_REGS_R14 * WORD_SIZE) +#define VCPU_R15 __stringify(__VCPU_REGS_R15 * WORD_SIZE) +#endif + static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) { if (static_branch_unlikely(&vmx_l1d_should_flush)) @@ -6390,7 +6417,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "push %%" _ASM_ARG2 " \n\t" /* Adjust RSP to account for the CALL to vmx_vmenter(). */ - "lea -%c[wordsize](%%" _ASM_SP "), %%" _ASM_ARG2 " \n\t" + "lea -" _WORD_SIZE "(%%" _ASM_SP "), %%" _ASM_ARG2 " \n\t" "call vmx_update_host_rsp \n\t" /* Load RCX with @regs. */ @@ -6400,24 +6427,24 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "cmpb $0, %%bl \n\t" /* Load guest registers. Don't clobber flags. 
*/ - "mov %c[rax](%%" _ASM_CX "), %%" _ASM_AX " \n\t" - "mov %c[rbx](%%" _ASM_CX "), %%" _ASM_BX " \n\t" - "mov %c[rdx](%%" _ASM_CX "), %%" _ASM_DX " \n\t" - "mov %c[rsi](%%" _ASM_CX "), %%" _ASM_SI " \n\t" - "mov %c[rdi](%%" _ASM_CX "), %%" _ASM_DI " \n\t" - "mov %c[rbp](%%" _ASM_CX "), %%" _ASM_BP " \n\t" + "mov " VCPU_RAX "(%%" _ASM_CX "), %%" _ASM_AX " \n\t" + "mov " VCPU_RBX "(%%" _ASM_CX "), %%" _ASM_BX " \n\t" + "mov " VCPU_RDX "(%%" _ASM_CX "), %%" _ASM_DX " \n\t" + "mov " VCPU_RSI "(%%" _ASM_CX "), %%" _ASM_SI " \n\t" + "mov " VCPU_RDI "(%%" _ASM_CX "), %%" _ASM_DI " \n\t" + "mov " VCPU_RBP "(%%" _ASM_CX "), %%" _ASM_BP " \n\t" #ifdef CONFIG_X86_64 - "mov %c[r8](%%" _ASM_CX "), %%r8 \n\t" - "mov %c[r9](%%" _ASM_CX "), %%r9 \n\t" - "mov %c[r10](%%" _ASM_CX "), %%r10 \n\t" - "mov %c[r11](%%" _ASM_CX "), %%r11 \n\t" - "mov %c[r12](%%" _ASM_CX "), %%r12 \n\t" - "mov %c[r13](%%" _ASM_CX "), %%r13 \n\t" - "mov %c[r14](%%" _ASM_CX "), %%r14 \n\t" - "mov %c[r15](%%" _ASM_CX "), %%r15 \n\t" + "mov " VCPU_R8 "(%%" _ASM_CX "), %%r8 \n\t" + "mov " VCPU_R9 "(%%" _ASM_CX "), %%r9 \n\t" + "mov " VCPU_R10 "(%%" _ASM_CX "), %%r10 \n\t" + "mov " VCPU_R11 "(%%" _ASM_CX "), %%r11 \n\t" + "mov " VCPU_R12 "(%%" _ASM_CX "), %%r12 \n\t" + "mov " VCPU_R13 "(%%" _ASM_CX "), %%r13 \n\t" + "mov " VCPU_R14 "(%%" _ASM_CX "), %%r14 \n\t" + "mov " VCPU_R15 "(%%" _ASM_CX "), %%r15 \n\t" #endif /* Load guest RCX. This kills the vmx_vcpu pointer! */ - "mov %c[rcx](%%" _ASM_CX "), %%" _ASM_CX " \n\t" + "mov " VCPU_RCX"(%%" _ASM_CX "), %%" _ASM_CX " \n\t" /* Enter guest mode */ "call vmx_vmenter\n\t" @@ -6427,25 +6454,25 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "push %%" _ASM_CX " \n\t" /* Reload RCX with @regs. */ - "mov %c[wordsize](%%" _ASM_SP "), %%" _ASM_CX " \n\t" + "mov " _WORD_SIZE "(%%" _ASM_SP "), %%" _ASM_CX " \n\t" /* Save all guest registers, including RCX from the stack */ - "mov %%" _ASM_AX ", %c[rax](%%" _ASM_CX ") \n\t" - "mov %%" _ASM_BX ", %c[rbx](%%" _ASM_CX ") \n\t" - __ASM_SIZE(pop) " %c[rcx](%%" _ASM_CX ") \n\t" - "mov %%" _ASM_DX ", %c[rdx](%%" _ASM_CX ") \n\t" - "mov %%" _ASM_SI ", %c[rsi](%%" _ASM_CX ") \n\t" - "mov %%" _ASM_DI ", %c[rdi](%%" _ASM_CX ") \n\t" - "mov %%" _ASM_BP ", %c[rbp](%%" _ASM_CX ") \n\t" + "mov %%" _ASM_AX ", " VCPU_RAX "(%%" _ASM_CX ") \n\t" + "mov %%" _ASM_BX ", " VCPU_RBX "(%%" _ASM_CX ") \n\t" + __ASM_SIZE(pop) " " VCPU_RCX "(%%" _ASM_CX ") \n\t" + "mov %%" _ASM_DX ", " VCPU_RDX "(%%" _ASM_CX ") \n\t" + "mov %%" _ASM_SI ", " VCPU_RSI "(%%" _ASM_CX ") \n\t" + "mov %%" _ASM_DI ", " VCPU_RDI "(%%" _ASM_CX ") \n\t" + "mov %%" _ASM_BP ", " VCPU_RBP "(%%" _ASM_CX ") \n\t" #ifdef CONFIG_X86_64 - "mov %%r8, %c[r8](%%" _ASM_CX ") \n\t" - "mov %%r9, %c[r9](%%" _ASM_CX ") \n\t" - "mov %%r10, %c[r10](%%" _ASM_CX ") \n\t" - "mov %%r11, %c[r11](%%" _ASM_CX ") \n\t" - "mov %%r12, %c[r12](%%" _ASM_CX ") \n\t" - "mov %%r13, %c[r13](%%" _ASM_CX ") \n\t" - "mov %%r14, %c[r14](%%" _ASM_CX ") \n\t" - "mov %%r15, %c[r15](%%" _ASM_CX ") \n\t" + "mov %%r8, " VCPU_R8 "(%%" _ASM_CX ") \n\t" + "mov %%r9, " VCPU_R9 "(%%" _ASM_CX ") \n\t" + "mov %%r10, " VCPU_R10 "(%%" _ASM_CX ") \n\t" + "mov %%r11, " VCPU_R11 "(%%" _ASM_CX ") \n\t" + "mov %%r12, " VCPU_R12 "(%%" _ASM_CX ") \n\t" + "mov %%r13, " VCPU_R13 "(%%" _ASM_CX ") \n\t" + "mov %%r14, " VCPU_R14 "(%%" _ASM_CX ") \n\t" + "mov %%r15, " VCPU_R15 "(%%" _ASM_CX ") \n\t" #endif /* Clear EBX to indicate VM-Exit (as opposed to VM-Fail). 
*/ @@ -6479,7 +6506,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "xor %%ebp, %%ebp \n\t" /* "POP" the vcpu_vmx pointer. */ - "add $%c[wordsize], %%" _ASM_SP " \n\t" + "add $" _WORD_SIZE ", %%" _ASM_SP " \n\t" "pop %%" _ASM_BP " \n\t" "jmp 3f \n\t" @@ -6497,25 +6524,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "=a"((int){0}), "=d"((int){0}) : "a"(vmx), "d"(&vcpu->arch.regs), #endif - "b"(vmx->loaded_vmcs->launched), - [rax]"i"(VCPU_REGS_RAX * sizeof(ulong)), - [rbx]"i"(VCPU_REGS_RBX * sizeof(ulong)), - [rcx]"i"(VCPU_REGS_RCX * sizeof(ulong)), - [rdx]"i"(VCPU_REGS_RDX * sizeof(ulong)), - [rsi]"i"(VCPU_REGS_RSI * sizeof(ulong)), - [rdi]"i"(VCPU_REGS_RDI * sizeof(ulong)), - [rbp]"i"(VCPU_REGS_RBP * sizeof(ulong)), -#ifdef CONFIG_X86_64 - [r8]"i"(VCPU_REGS_R8 * sizeof(ulong)), - [r9]"i"(VCPU_REGS_R9 * sizeof(ulong)), - [r10]"i"(VCPU_REGS_R10 * sizeof(ulong)), - [r11]"i"(VCPU_REGS_R11 * sizeof(ulong)), - [r12]"i"(VCPU_REGS_R12 * sizeof(ulong)), - [r13]"i"(VCPU_REGS_R13 * sizeof(ulong)), - [r14]"i"(VCPU_REGS_R14 * sizeof(ulong)), - [r15]"i"(VCPU_REGS_R15 * sizeof(ulong)), -#endif - [wordsize]"i"(sizeof(ulong)) + "b"(vmx->loaded_vmcs->launched) : "cc", "memory" #ifdef CONFIG_X86_64 , "rax", "rcx", "rdx" From 63c73aa07fcabc090661a586f7ae5200a0fc5cb4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:11 -0800 Subject: [PATCH 164/855] KVM: VMX: Create a stack frame in vCPU-run ...in preparation for moving to a proper assembly sub-routnine. vCPU-run isn't a leaf function since it calls vmx_update_host_rsp() and vmx_vmenter(). And since we need to save/restore RBP anyways, unconditionally creating the frame costs a single MOV, i.e. don't bother keying off CONFIG_FRAME_POINTER or using FRAME_BEGIN, etc... Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 12fb342218ad..52fea89732db 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6407,8 +6407,8 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) write_cr2(vcpu->arch.cr2); asm( - /* Store host registers */ "push %%" _ASM_BP " \n\t" + "mov %%" _ASM_SP ", %%" _ASM_BP " \n\t" /* * Save @regs, _ASM_ARG2 may be modified by vmx_update_host_rsp() and From 5e0781df18990497f4cd96a959ca583d83b6d1e0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:12 -0800 Subject: [PATCH 165/855] KVM: VMX: Move vCPU-run code to a proper assembly routine As evidenced by the myriad patches leading up to this moment, using an inline asm blob for vCPU-run is nothing short of horrific. It's also been called "unholy", "an abomination" and likely a whole host of other names that would violate the Code of Conduct if recorded here and now. The code is relocated nearly verbatim, e.g. quotes, newlines, tabs and __stringify need to be dropped, but other than those cosmetic changes the only functional changees are to add the "call" and replace the final "jmp" with a "ret". Note that STACK_FRAME_NON_STANDARD is also dropped from __vmx_vcpu_run(). 
Suggested-by: Andi Kleen Suggested-by: Josh Poimboeuf Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmenter.S | 144 +++++++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/vmx.c | 138 +---------------------------------- 2 files changed, 145 insertions(+), 137 deletions(-) diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index bcef2c7e9bc4..e64617f3b196 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -1,6 +1,30 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include #include +#include +#include + +#define WORD_SIZE (BITS_PER_LONG / 8) + +#define VCPU_RAX __VCPU_REGS_RAX * WORD_SIZE +#define VCPU_RCX __VCPU_REGS_RCX * WORD_SIZE +#define VCPU_RDX __VCPU_REGS_RDX * WORD_SIZE +#define VCPU_RBX __VCPU_REGS_RBX * WORD_SIZE +/* Intentionally omit RSP as it's context switched by hardware */ +#define VCPU_RBP __VCPU_REGS_RBP * WORD_SIZE +#define VCPU_RSI __VCPU_REGS_RSI * WORD_SIZE +#define VCPU_RDI __VCPU_REGS_RDI * WORD_SIZE + +#ifdef CONFIG_X86_64 +#define VCPU_R8 __VCPU_REGS_R8 * WORD_SIZE +#define VCPU_R9 __VCPU_REGS_R9 * WORD_SIZE +#define VCPU_R10 __VCPU_REGS_R10 * WORD_SIZE +#define VCPU_R11 __VCPU_REGS_R11 * WORD_SIZE +#define VCPU_R12 __VCPU_REGS_R12 * WORD_SIZE +#define VCPU_R13 __VCPU_REGS_R13 * WORD_SIZE +#define VCPU_R14 __VCPU_REGS_R14 * WORD_SIZE +#define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE +#endif .text @@ -55,3 +79,123 @@ ENDPROC(vmx_vmenter) ENTRY(vmx_vmexit) ret ENDPROC(vmx_vmexit) + +/** + * ____vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode + * @vmx: struct vcpu_vmx * + * @regs: unsigned long * (to guest registers) + * %RBX: VMCS launched status (non-zero indicates already launched) + * + * Returns: + * %RBX is 0 on VM-Exit, 1 on VM-Fail + */ +ENTRY(____vmx_vcpu_run) + push %_ASM_BP + mov %_ASM_SP, %_ASM_BP + + /* + * Save @regs, _ASM_ARG2 may be modified by vmx_update_host_rsp() and + * @regs is needed after VM-Exit to save the guest's register values. + */ + push %_ASM_ARG2 + + /* Adjust RSP to account for the CALL to vmx_vmenter(). */ + lea -WORD_SIZE(%_ASM_SP), %_ASM_ARG2 + call vmx_update_host_rsp + + /* Load @regs to RCX. */ + mov (%_ASM_SP), %_ASM_CX + + /* Check if vmlaunch or vmresume is needed */ + cmpb $0, %bl + + /* Load guest registers. Don't clobber flags. */ + mov VCPU_RAX(%_ASM_CX), %_ASM_AX + mov VCPU_RBX(%_ASM_CX), %_ASM_BX + mov VCPU_RDX(%_ASM_CX), %_ASM_DX + mov VCPU_RSI(%_ASM_CX), %_ASM_SI + mov VCPU_RDI(%_ASM_CX), %_ASM_DI + mov VCPU_RBP(%_ASM_CX), %_ASM_BP +#ifdef CONFIG_X86_64 + mov VCPU_R8 (%_ASM_CX), %r8 + mov VCPU_R9 (%_ASM_CX), %r9 + mov VCPU_R10(%_ASM_CX), %r10 + mov VCPU_R11(%_ASM_CX), %r11 + mov VCPU_R12(%_ASM_CX), %r12 + mov VCPU_R13(%_ASM_CX), %r13 + mov VCPU_R14(%_ASM_CX), %r14 + mov VCPU_R15(%_ASM_CX), %r15 +#endif + /* Load guest RCX. This kills the vmx_vcpu pointer! */ + mov VCPU_RCX(%_ASM_CX), %_ASM_CX + + /* Enter guest mode */ + call vmx_vmenter + + /* Jump on VM-Fail. */ + jbe 2f + + /* Temporarily save guest's RCX. */ + push %_ASM_CX + + /* Reload @regs to RCX. 
*/ + mov WORD_SIZE(%_ASM_SP), %_ASM_CX + + /* Save all guest registers, including RCX from the stack */ + mov %_ASM_AX, VCPU_RAX(%_ASM_CX) + mov %_ASM_BX, VCPU_RBX(%_ASM_CX) + __ASM_SIZE(pop) VCPU_RCX(%_ASM_CX) + mov %_ASM_DX, VCPU_RDX(%_ASM_CX) + mov %_ASM_SI, VCPU_RSI(%_ASM_CX) + mov %_ASM_DI, VCPU_RDI(%_ASM_CX) + mov %_ASM_BP, VCPU_RBP(%_ASM_CX) +#ifdef CONFIG_X86_64 + mov %r8, VCPU_R8 (%_ASM_CX) + mov %r9, VCPU_R9 (%_ASM_CX) + mov %r10, VCPU_R10(%_ASM_CX) + mov %r11, VCPU_R11(%_ASM_CX) + mov %r12, VCPU_R12(%_ASM_CX) + mov %r13, VCPU_R13(%_ASM_CX) + mov %r14, VCPU_R14(%_ASM_CX) + mov %r15, VCPU_R15(%_ASM_CX) +#endif + + /* Clear EBX to indicate VM-Exit (as opposed to VM-Fail). */ + xor %ebx, %ebx + + /* + * Clear all general purpose registers except RSP and RBX to prevent + * speculative use of the guest's values, even those that are reloaded + * via the stack. In theory, an L1 cache miss when restoring registers + * could lead to speculative execution with the guest's values. + * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially + * free. RSP and RBX are exempt as RSP is restored by hardware during + * VM-Exit and RBX is explicitly loaded with 0 or 1 to "return" VM-Fail. + */ +1: +#ifdef CONFIG_X86_64 + xor %r8d, %r8d + xor %r9d, %r9d + xor %r10d, %r10d + xor %r11d, %r11d + xor %r12d, %r12d + xor %r13d, %r13d + xor %r14d, %r14d + xor %r15d, %r15d +#endif + xor %eax, %eax + xor %ecx, %ecx + xor %edx, %edx + xor %esi, %esi + xor %edi, %edi + xor %ebp, %ebp + + /* "POP" @regs. */ + add $WORD_SIZE, %_ASM_SP + pop %_ASM_BP + ret + + /* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. */ +2: mov $1, %ebx + jmp 1b +ENDPROC(____vmx_vcpu_run) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 52fea89732db..0e6e2dd6265e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6371,33 +6371,6 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) } } -#ifdef CONFIG_X86_64 -#define WORD_SIZE 8 -#else -#define WORD_SIZE 4 -#endif - -#define _WORD_SIZE __stringify(WORD_SIZE) - -#define VCPU_RAX __stringify(__VCPU_REGS_RAX * WORD_SIZE) -#define VCPU_RCX __stringify(__VCPU_REGS_RCX * WORD_SIZE) -#define VCPU_RDX __stringify(__VCPU_REGS_RDX * WORD_SIZE) -#define VCPU_RBX __stringify(__VCPU_REGS_RBX * WORD_SIZE) -/* Intentionally omit %RSP as it's context switched by hardware */ -#define VCPU_RBP __stringify(__VCPU_REGS_RBP * WORD_SIZE) -#define VCPU_RSI __stringify(__VCPU_REGS_RSI * WORD_SIZE) -#define VCPU_RDI __stringify(__VCPU_REGS_RDI * WORD_SIZE) -#ifdef CONFIG_X86_64 -#define VCPU_R8 __stringify(__VCPU_REGS_R8 * WORD_SIZE) -#define VCPU_R9 __stringify(__VCPU_REGS_R9 * WORD_SIZE) -#define VCPU_R10 __stringify(__VCPU_REGS_R10 * WORD_SIZE) -#define VCPU_R11 __stringify(__VCPU_REGS_R11 * WORD_SIZE) -#define VCPU_R12 __stringify(__VCPU_REGS_R12 * WORD_SIZE) -#define VCPU_R13 __stringify(__VCPU_REGS_R13 * WORD_SIZE) -#define VCPU_R14 __stringify(__VCPU_REGS_R14 * WORD_SIZE) -#define VCPU_R15 __stringify(__VCPU_REGS_R15 * WORD_SIZE) -#endif - static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) { if (static_branch_unlikely(&vmx_l1d_should_flush)) @@ -6407,115 +6380,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) write_cr2(vcpu->arch.cr2); asm( - "push %%" _ASM_BP " \n\t" - "mov %%" _ASM_SP ", %%" _ASM_BP " \n\t" - - /* - * Save @regs, _ASM_ARG2 may be modified by vmx_update_host_rsp() and - * @regs is needed after VM-Exit to save the guest's register values. 
- */ - "push %%" _ASM_ARG2 " \n\t" - - /* Adjust RSP to account for the CALL to vmx_vmenter(). */ - "lea -" _WORD_SIZE "(%%" _ASM_SP "), %%" _ASM_ARG2 " \n\t" - "call vmx_update_host_rsp \n\t" - - /* Load RCX with @regs. */ - "mov (%%" _ASM_SP "), %%" _ASM_CX " \n\t" - - /* Check if vmlaunch or vmresume is needed */ - "cmpb $0, %%bl \n\t" - - /* Load guest registers. Don't clobber flags. */ - "mov " VCPU_RAX "(%%" _ASM_CX "), %%" _ASM_AX " \n\t" - "mov " VCPU_RBX "(%%" _ASM_CX "), %%" _ASM_BX " \n\t" - "mov " VCPU_RDX "(%%" _ASM_CX "), %%" _ASM_DX " \n\t" - "mov " VCPU_RSI "(%%" _ASM_CX "), %%" _ASM_SI " \n\t" - "mov " VCPU_RDI "(%%" _ASM_CX "), %%" _ASM_DI " \n\t" - "mov " VCPU_RBP "(%%" _ASM_CX "), %%" _ASM_BP " \n\t" -#ifdef CONFIG_X86_64 - "mov " VCPU_R8 "(%%" _ASM_CX "), %%r8 \n\t" - "mov " VCPU_R9 "(%%" _ASM_CX "), %%r9 \n\t" - "mov " VCPU_R10 "(%%" _ASM_CX "), %%r10 \n\t" - "mov " VCPU_R11 "(%%" _ASM_CX "), %%r11 \n\t" - "mov " VCPU_R12 "(%%" _ASM_CX "), %%r12 \n\t" - "mov " VCPU_R13 "(%%" _ASM_CX "), %%r13 \n\t" - "mov " VCPU_R14 "(%%" _ASM_CX "), %%r14 \n\t" - "mov " VCPU_R15 "(%%" _ASM_CX "), %%r15 \n\t" -#endif - /* Load guest RCX. This kills the vmx_vcpu pointer! */ - "mov " VCPU_RCX"(%%" _ASM_CX "), %%" _ASM_CX " \n\t" - - /* Enter guest mode */ - "call vmx_vmenter\n\t" - "jbe 2f \n\t" - - /* Temporarily save guest's RCX. */ - "push %%" _ASM_CX " \n\t" - - /* Reload RCX with @regs. */ - "mov " _WORD_SIZE "(%%" _ASM_SP "), %%" _ASM_CX " \n\t" - - /* Save all guest registers, including RCX from the stack */ - "mov %%" _ASM_AX ", " VCPU_RAX "(%%" _ASM_CX ") \n\t" - "mov %%" _ASM_BX ", " VCPU_RBX "(%%" _ASM_CX ") \n\t" - __ASM_SIZE(pop) " " VCPU_RCX "(%%" _ASM_CX ") \n\t" - "mov %%" _ASM_DX ", " VCPU_RDX "(%%" _ASM_CX ") \n\t" - "mov %%" _ASM_SI ", " VCPU_RSI "(%%" _ASM_CX ") \n\t" - "mov %%" _ASM_DI ", " VCPU_RDI "(%%" _ASM_CX ") \n\t" - "mov %%" _ASM_BP ", " VCPU_RBP "(%%" _ASM_CX ") \n\t" -#ifdef CONFIG_X86_64 - "mov %%r8, " VCPU_R8 "(%%" _ASM_CX ") \n\t" - "mov %%r9, " VCPU_R9 "(%%" _ASM_CX ") \n\t" - "mov %%r10, " VCPU_R10 "(%%" _ASM_CX ") \n\t" - "mov %%r11, " VCPU_R11 "(%%" _ASM_CX ") \n\t" - "mov %%r12, " VCPU_R12 "(%%" _ASM_CX ") \n\t" - "mov %%r13, " VCPU_R13 "(%%" _ASM_CX ") \n\t" - "mov %%r14, " VCPU_R14 "(%%" _ASM_CX ") \n\t" - "mov %%r15, " VCPU_R15 "(%%" _ASM_CX ") \n\t" -#endif - - /* Clear EBX to indicate VM-Exit (as opposed to VM-Fail). */ - "xor %%ebx, %%ebx \n\t" - - /* - * Clear all general purpose registers except RSP and RBX to prevent - * speculative use of the guest's values, even those that are reloaded - * via the stack. In theory, an L1 cache miss when restoring registers - * could lead to speculative execution with the guest's values. - * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially - * free. RSP and RBX are exempt as RSP is restored by hardware during - * VM-Exit and RBX is explicitly loaded with 0 or 1 to "return" VM-Fail. - */ - "1: \n\t" -#ifdef CONFIG_X86_64 - "xor %%r8d, %%r8d \n\t" - "xor %%r9d, %%r9d \n\t" - "xor %%r10d, %%r10d \n\t" - "xor %%r11d, %%r11d \n\t" - "xor %%r12d, %%r12d \n\t" - "xor %%r13d, %%r13d \n\t" - "xor %%r14d, %%r14d \n\t" - "xor %%r15d, %%r15d \n\t" -#endif - "xor %%eax, %%eax \n\t" - "xor %%ecx, %%ecx \n\t" - "xor %%edx, %%edx \n\t" - "xor %%esi, %%esi \n\t" - "xor %%edi, %%edi \n\t" - "xor %%ebp, %%ebp \n\t" - - /* "POP" the vcpu_vmx pointer. */ - "add $" _WORD_SIZE ", %%" _ASM_SP " \n\t" - "pop %%" _ASM_BP " \n\t" - "jmp 3f \n\t" - - /* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. 
*/ - "2: \n\t" - "mov $1, %%ebx \n\t" - "jmp 1b \n\t" - "3: \n\t" - + "call ____vmx_vcpu_run \n\t" : ASM_CALL_CONSTRAINT, "=b"(vmx->fail), #ifdef CONFIG_X86_64 "=D"((int){0}), "=S"((int){0}) @@ -6536,7 +6401,6 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) vcpu->arch.cr2 = read_cr2(); } -STACK_FRAME_NON_STANDARD(__vmx_vcpu_run); static void vmx_vcpu_run(struct kvm_vcpu *vcpu) { From c823dd5c0f3fafa595ed51cc72c6e006efc20ad3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:13 -0800 Subject: [PATCH 166/855] KVM: VMX: Fold __vmx_vcpu_run() back into vmx_vcpu_run() ...now that the code is no longer tagged with STACK_FRAME_NON_STANDARD. Arguably, providing __vmx_vcpu_run() to break up vmx_vcpu_run() is valuable on its own, but the previous split was purposely made as small as possible to limit the effects STACK_FRAME_NON_STANDARD. In other words, the current split is now completely arbitrary and likely not the most logical. This also allows renaming ____vmx_vcpu_run() to __vmx_vcpu_run() in a future patch. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 59 +++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 32 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0e6e2dd6265e..e61bb8da9767 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6371,37 +6371,6 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) } } -static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) -{ - if (static_branch_unlikely(&vmx_l1d_should_flush)) - vmx_l1d_flush(vcpu); - - if (vcpu->arch.cr2 != read_cr2()) - write_cr2(vcpu->arch.cr2); - - asm( - "call ____vmx_vcpu_run \n\t" - : ASM_CALL_CONSTRAINT, "=b"(vmx->fail), -#ifdef CONFIG_X86_64 - "=D"((int){0}), "=S"((int){0}) - : "D"(vmx), "S"(&vcpu->arch.regs), -#else - "=a"((int){0}), "=d"((int){0}) - : "a"(vmx), "d"(&vcpu->arch.regs), -#endif - "b"(vmx->loaded_vmcs->launched) - : "cc", "memory" -#ifdef CONFIG_X86_64 - , "rax", "rcx", "rdx" - , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" -#else - , "ecx", "edi", "esi" -#endif - ); - - vcpu->arch.cr2 = read_cr2(); -} - static void vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6469,7 +6438,33 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) */ x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0); - __vmx_vcpu_run(vcpu, vmx); + if (static_branch_unlikely(&vmx_l1d_should_flush)) + vmx_l1d_flush(vcpu); + + if (vcpu->arch.cr2 != read_cr2()) + write_cr2(vcpu->arch.cr2); + + asm( + "call ____vmx_vcpu_run \n\t" + : ASM_CALL_CONSTRAINT, "=b"(vmx->fail), +#ifdef CONFIG_X86_64 + "=D"((int){0}), "=S"((int){0}) + : "D"(vmx), "S"(&vcpu->arch.regs), +#else + "=a"((int){0}), "=d"((int){0}) + : "a"(vmx), "d"(&vcpu->arch.regs), +#endif + "b"(vmx->loaded_vmcs->launched) + : "cc", "memory" +#ifdef CONFIG_X86_64 + , "rax", "rcx", "rdx" + , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" +#else + , "ecx", "edi", "esi" +#endif + ); + + vcpu->arch.cr2 = read_cr2(); /* * We do not use IBRS in the kernel. If this vCPU has used the From ee2fc635ef714b1d46da3616bbec972067f0d187 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:14 -0800 Subject: [PATCH 167/855] KVM: VMX: Rename ____vmx_vcpu_run() to __vmx_vcpu_run() ...now that the name is no longer usurped by a defunct helper function. 
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmenter.S | 6 +++--- arch/x86/kvm/vmx/vmx.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index e64617f3b196..fa6e03b36348 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -81,7 +81,7 @@ ENTRY(vmx_vmexit) ENDPROC(vmx_vmexit) /** - * ____vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode + * __vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode * @vmx: struct vcpu_vmx * * @regs: unsigned long * (to guest registers) * %RBX: VMCS launched status (non-zero indicates already launched) @@ -89,7 +89,7 @@ ENDPROC(vmx_vmexit) * Returns: * %RBX is 0 on VM-Exit, 1 on VM-Fail */ -ENTRY(____vmx_vcpu_run) +ENTRY(__vmx_vcpu_run) push %_ASM_BP mov %_ASM_SP, %_ASM_BP @@ -198,4 +198,4 @@ ENTRY(____vmx_vcpu_run) /* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. */ 2: mov $1, %ebx jmp 1b -ENDPROC(____vmx_vcpu_run) +ENDPROC(__vmx_vcpu_run) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e61bb8da9767..8bc8f09eaf0c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6445,7 +6445,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) write_cr2(vcpu->arch.cr2); asm( - "call ____vmx_vcpu_run \n\t" + "call __vmx_vcpu_run \n\t" : ASM_CALL_CONSTRAINT, "=b"(vmx->fail), #ifdef CONFIG_X86_64 "=D"((int){0}), "=S"((int){0}) From a62fd5a76c99dd96c74c6638408961b7ff3c71c4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:15 -0800 Subject: [PATCH 168/855] KVM: VMX: Use RAX as the scratch register during vCPU-run ...to prepare for making the sub-routine callable from C code. That means returning the result in RAX. Since RAX will be used to return the result, use it as the scratch register as well to make the code readable and to document that the scratch register is more or less arbitrary. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmenter.S | 76 +++++++++++++++++++------------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index fa6e03b36348..7d8b09abcdec 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -103,31 +103,31 @@ ENTRY(__vmx_vcpu_run) lea -WORD_SIZE(%_ASM_SP), %_ASM_ARG2 call vmx_update_host_rsp - /* Load @regs to RCX. */ - mov (%_ASM_SP), %_ASM_CX + /* Load @regs to RAX. */ + mov (%_ASM_SP), %_ASM_AX /* Check if vmlaunch or vmresume is needed */ cmpb $0, %bl /* Load guest registers. Don't clobber flags. 
*/ - mov VCPU_RAX(%_ASM_CX), %_ASM_AX - mov VCPU_RBX(%_ASM_CX), %_ASM_BX - mov VCPU_RDX(%_ASM_CX), %_ASM_DX - mov VCPU_RSI(%_ASM_CX), %_ASM_SI - mov VCPU_RDI(%_ASM_CX), %_ASM_DI - mov VCPU_RBP(%_ASM_CX), %_ASM_BP + mov VCPU_RBX(%_ASM_AX), %_ASM_BX + mov VCPU_RCX(%_ASM_AX), %_ASM_CX + mov VCPU_RDX(%_ASM_AX), %_ASM_DX + mov VCPU_RSI(%_ASM_AX), %_ASM_SI + mov VCPU_RDI(%_ASM_AX), %_ASM_DI + mov VCPU_RBP(%_ASM_AX), %_ASM_BP #ifdef CONFIG_X86_64 - mov VCPU_R8 (%_ASM_CX), %r8 - mov VCPU_R9 (%_ASM_CX), %r9 - mov VCPU_R10(%_ASM_CX), %r10 - mov VCPU_R11(%_ASM_CX), %r11 - mov VCPU_R12(%_ASM_CX), %r12 - mov VCPU_R13(%_ASM_CX), %r13 - mov VCPU_R14(%_ASM_CX), %r14 - mov VCPU_R15(%_ASM_CX), %r15 + mov VCPU_R8 (%_ASM_AX), %r8 + mov VCPU_R9 (%_ASM_AX), %r9 + mov VCPU_R10(%_ASM_AX), %r10 + mov VCPU_R11(%_ASM_AX), %r11 + mov VCPU_R12(%_ASM_AX), %r12 + mov VCPU_R13(%_ASM_AX), %r13 + mov VCPU_R14(%_ASM_AX), %r14 + mov VCPU_R15(%_ASM_AX), %r15 #endif - /* Load guest RCX. This kills the vmx_vcpu pointer! */ - mov VCPU_RCX(%_ASM_CX), %_ASM_CX + /* Load guest RAX. This kills the vmx_vcpu pointer! */ + mov VCPU_RAX(%_ASM_AX), %_ASM_AX /* Enter guest mode */ call vmx_vmenter @@ -135,29 +135,29 @@ ENTRY(__vmx_vcpu_run) /* Jump on VM-Fail. */ jbe 2f - /* Temporarily save guest's RCX. */ - push %_ASM_CX + /* Temporarily save guest's RAX. */ + push %_ASM_AX - /* Reload @regs to RCX. */ - mov WORD_SIZE(%_ASM_SP), %_ASM_CX + /* Reload @regs to RAX. */ + mov WORD_SIZE(%_ASM_SP), %_ASM_AX - /* Save all guest registers, including RCX from the stack */ - mov %_ASM_AX, VCPU_RAX(%_ASM_CX) - mov %_ASM_BX, VCPU_RBX(%_ASM_CX) - __ASM_SIZE(pop) VCPU_RCX(%_ASM_CX) - mov %_ASM_DX, VCPU_RDX(%_ASM_CX) - mov %_ASM_SI, VCPU_RSI(%_ASM_CX) - mov %_ASM_DI, VCPU_RDI(%_ASM_CX) - mov %_ASM_BP, VCPU_RBP(%_ASM_CX) + /* Save all guest registers, including RAX from the stack */ + __ASM_SIZE(pop) VCPU_RAX(%_ASM_AX) + mov %_ASM_BX, VCPU_RBX(%_ASM_AX) + mov %_ASM_CX, VCPU_RCX(%_ASM_AX) + mov %_ASM_DX, VCPU_RDX(%_ASM_AX) + mov %_ASM_SI, VCPU_RSI(%_ASM_AX) + mov %_ASM_DI, VCPU_RDI(%_ASM_AX) + mov %_ASM_BP, VCPU_RBP(%_ASM_AX) #ifdef CONFIG_X86_64 - mov %r8, VCPU_R8 (%_ASM_CX) - mov %r9, VCPU_R9 (%_ASM_CX) - mov %r10, VCPU_R10(%_ASM_CX) - mov %r11, VCPU_R11(%_ASM_CX) - mov %r12, VCPU_R12(%_ASM_CX) - mov %r13, VCPU_R13(%_ASM_CX) - mov %r14, VCPU_R14(%_ASM_CX) - mov %r15, VCPU_R15(%_ASM_CX) + mov %r8, VCPU_R8 (%_ASM_AX) + mov %r9, VCPU_R9 (%_ASM_AX) + mov %r10, VCPU_R10(%_ASM_AX) + mov %r11, VCPU_R11(%_ASM_AX) + mov %r12, VCPU_R12(%_ASM_AX) + mov %r13, VCPU_R13(%_ASM_AX) + mov %r14, VCPU_R14(%_ASM_AX) + mov %r15, VCPU_R15(%_ASM_AX) #endif /* Clear EBX to indicate VM-Exit (as opposed to VM-Fail). */ From 77df549559dbe7f265ab19bd444d6acb3a718b4d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:16 -0800 Subject: [PATCH 169/855] KVM: VMX: Pass @launched to the vCPU-run asm via standard ABI regs ...to prepare for making the sub-routine callable from C code. 
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmenter.S | 5 ++++- arch/x86/kvm/vmx/vmx.c | 13 ++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 7d8b09abcdec..a3d9a8e062f9 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -84,7 +84,7 @@ ENDPROC(vmx_vmexit) * __vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode * @vmx: struct vcpu_vmx * * @regs: unsigned long * (to guest registers) - * %RBX: VMCS launched status (non-zero indicates already launched) + * @launched: %true if the VMCS has been launched * * Returns: * %RBX is 0 on VM-Exit, 1 on VM-Fail @@ -99,6 +99,9 @@ ENTRY(__vmx_vcpu_run) */ push %_ASM_ARG2 + /* Copy @launched to BL, _ASM_ARG3 is volatile. */ + mov %_ASM_ARG3B, %bl + /* Adjust RSP to account for the CALL to vmx_vmenter(). */ lea -WORD_SIZE(%_ASM_SP), %_ASM_ARG2 call vmx_update_host_rsp diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 8bc8f09eaf0c..1b73a82444a2 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6448,19 +6448,18 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) "call __vmx_vcpu_run \n\t" : ASM_CALL_CONSTRAINT, "=b"(vmx->fail), #ifdef CONFIG_X86_64 - "=D"((int){0}), "=S"((int){0}) - : "D"(vmx), "S"(&vcpu->arch.regs), + "=D"((int){0}), "=S"((int){0}), "=d"((int){0}) + : "D"(vmx), "S"(&vcpu->arch.regs), "d"(vmx->loaded_vmcs->launched) #else - "=a"((int){0}), "=d"((int){0}) - : "a"(vmx), "d"(&vcpu->arch.regs), + "=a"((int){0}), "=d"((int){0}), "=c"((int){0}) + : "a"(vmx), "d"(&vcpu->arch.regs), "c"(vmx->loaded_vmcs->launched) #endif - "b"(vmx->loaded_vmcs->launched) : "cc", "memory" #ifdef CONFIG_X86_64 - , "rax", "rcx", "rdx" + , "rax", "rcx" , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" #else - , "ecx", "edi", "esi" + , "edi", "esi" #endif ); From e75c3c3a0487da878cbfa7f125dcd080a8606eaf Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:17 -0800 Subject: [PATCH 170/855] KVM: VMX: Return VM-Fail from vCPU-run assembly via standard ABI reg ...to prepare for making the assembly sub-routine callable from C code. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmenter.S | 16 ++++++++-------- arch/x86/kvm/vmx/vmx.c | 8 ++++---- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index a3d9a8e062f9..e06a3f33311e 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -87,7 +87,7 @@ ENDPROC(vmx_vmexit) * @launched: %true if the VMCS has been launched * * Returns: - * %RBX is 0 on VM-Exit, 1 on VM-Fail + * 0 on VM-Exit, 1 on VM-Fail */ ENTRY(__vmx_vcpu_run) push %_ASM_BP @@ -163,17 +163,17 @@ ENTRY(__vmx_vcpu_run) mov %r15, VCPU_R15(%_ASM_AX) #endif - /* Clear EBX to indicate VM-Exit (as opposed to VM-Fail). */ - xor %ebx, %ebx + /* Clear RAX to indicate VM-Exit (as opposed to VM-Fail). */ + xor %eax, %eax /* - * Clear all general purpose registers except RSP and RBX to prevent + * Clear all general purpose registers except RSP and RAX to prevent * speculative use of the guest's values, even those that are reloaded * via the stack. In theory, an L1 cache miss when restoring registers * could lead to speculative execution with the guest's values. * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially - * free. 
RSP and RBX are exempt as RSP is restored by hardware during - * VM-Exit and RBX is explicitly loaded with 0 or 1 to "return" VM-Fail. + * free. RSP and RAX are exempt as RSP is restored by hardware during + * VM-Exit and RAX is explicitly loaded with 0 or 1 to return VM-Fail. */ 1: #ifdef CONFIG_X86_64 @@ -186,7 +186,7 @@ ENTRY(__vmx_vcpu_run) xor %r14d, %r14d xor %r15d, %r15d #endif - xor %eax, %eax + xor %ebx, %ebx xor %ecx, %ecx xor %edx, %edx xor %esi, %esi @@ -199,6 +199,6 @@ ENTRY(__vmx_vcpu_run) ret /* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. */ -2: mov $1, %ebx +2: mov $1, %eax jmp 1b ENDPROC(__vmx_vcpu_run) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1b73a82444a2..9a1d27e77684 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6446,20 +6446,20 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) asm( "call __vmx_vcpu_run \n\t" - : ASM_CALL_CONSTRAINT, "=b"(vmx->fail), + : ASM_CALL_CONSTRAINT, "=a"(vmx->fail), #ifdef CONFIG_X86_64 "=D"((int){0}), "=S"((int){0}), "=d"((int){0}) : "D"(vmx), "S"(&vcpu->arch.regs), "d"(vmx->loaded_vmcs->launched) #else - "=a"((int){0}), "=d"((int){0}), "=c"((int){0}) + "=d"((int){0}), "=c"((int){0}) : "a"(vmx), "d"(&vcpu->arch.regs), "c"(vmx->loaded_vmcs->launched) #endif : "cc", "memory" #ifdef CONFIG_X86_64 - , "rax", "rcx" + , "rbx", "rcx" , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" #else - , "edi", "esi" + , "ebx", "edi", "esi" #endif ); From 3b895ef48615382db03adcf125e0db8437b9acbe Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:18 -0800 Subject: [PATCH 171/855] KVM: VMX: Preserve callee-save registers in vCPU-run asm sub-routine ...to make it callable from C code. Note that because KVM chooses to be ultra paranoid about guest register values, all callee-save registers are still cleared after VM-Exit even though the host's values are now reloaded from the stack. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmenter.S | 21 +++++++++++++++++++++ arch/x86/kvm/vmx/vmx.c | 5 +---- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index e06a3f33311e..d325f1d6110b 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -92,6 +92,16 @@ ENDPROC(vmx_vmexit) ENTRY(__vmx_vcpu_run) push %_ASM_BP mov %_ASM_SP, %_ASM_BP +#ifdef CONFIG_X86_64 + push %r15 + push %r14 + push %r13 + push %r12 +#else + push %edi + push %esi +#endif + push %_ASM_BX /* * Save @regs, _ASM_ARG2 may be modified by vmx_update_host_rsp() and @@ -195,6 +205,17 @@ ENTRY(__vmx_vcpu_run) /* "POP" @regs. */ add $WORD_SIZE, %_ASM_SP + pop %_ASM_BX + +#ifdef CONFIG_X86_64 + pop %r12 + pop %r13 + pop %r14 + pop %r15 +#else + pop %esi + pop %edi +#endif pop %_ASM_BP ret diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 9a1d27e77684..43723d0007be 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6456,10 +6456,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) #endif : "cc", "memory" #ifdef CONFIG_X86_64 - , "rbx", "rcx" - , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" -#else - , "ebx", "edi", "esi" + , "rcx", "r8", "r9", "r10", "r11" #endif ); From fc2ba5a27a1aaa16b664e64f85e0e1307d2bde3a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:19 -0800 Subject: [PATCH 172/855] KVM: VMX: Call vCPU-run asm sub-routine from C and remove clobbering ...now that the sub-routine follows standard calling conventions. 
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 43723d0007be..c39f1c38b878 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6371,6 +6371,8 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) } } +bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched); + static void vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6444,21 +6446,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->arch.cr2 != read_cr2()) write_cr2(vcpu->arch.cr2); - asm( - "call __vmx_vcpu_run \n\t" - : ASM_CALL_CONSTRAINT, "=a"(vmx->fail), -#ifdef CONFIG_X86_64 - "=D"((int){0}), "=S"((int){0}), "=d"((int){0}) - : "D"(vmx), "S"(&vcpu->arch.regs), "d"(vmx->loaded_vmcs->launched) -#else - "=d"((int){0}), "=c"((int){0}) - : "a"(vmx), "d"(&vcpu->arch.regs), "c"(vmx->loaded_vmcs->launched) -#endif - : "cc", "memory" -#ifdef CONFIG_X86_64 - , "rcx", "r8", "r9", "r10", "r11" -#endif - ); + vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, + vmx->loaded_vmcs->launched); vcpu->arch.cr2 = read_cr2(); From 4f44c4eec5b72020c6d99e9e4f3d0505874c98c9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:20 -0800 Subject: [PATCH 173/855] KVM: VMX: Reorder clearing of registers in the vCPU-run assembly flow Move the clearing of the common registers (not 64-bit-only) to the start of the flow that clears registers holding guest state. This is purely a cosmetic change so that the label doesn't point at a blank line and a #define. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmenter.S | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index d325f1d6110b..7b272738c576 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -185,7 +185,12 @@ ENTRY(__vmx_vcpu_run) * free. RSP and RAX are exempt as RSP is restored by hardware during * VM-Exit and RAX is explicitly loaded with 0 or 1 to return VM-Fail. */ -1: +1: xor %ebx, %ebx + xor %ecx, %ecx + xor %edx, %edx + xor %esi, %esi + xor %edi, %edi + xor %ebp, %ebp #ifdef CONFIG_X86_64 xor %r8d, %r8d xor %r9d, %r9d @@ -196,12 +201,6 @@ ENTRY(__vmx_vcpu_run) xor %r14d, %r14d xor %r15d, %r15d #endif - xor %ebx, %ebx - xor %ecx, %ecx - xor %edx, %edx - xor %esi, %esi - xor %edi, %edi - xor %ebp, %ebp /* "POP" @regs. */ add $WORD_SIZE, %_ASM_SP From b5179ec4187251a751832193693d6e474d3445ac Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Sat, 26 Jan 2019 12:49:56 -0500 Subject: [PATCH 174/855] x86/kvmclock: set offset for kvm unstable clock VMs may show incorrect uptime and dmesg printk offsets on hypervisors with unstable clock. The problem is produced when VM is rebooted without exiting from qemu. The fix is to calculate clock offset not only for stable clock but for unstable clock as well, and use kvm_sched_clock_read() which substracts the offset for both clocks. This is safe, because pvclock_clocksource_read() does the right thing and makes sure that clock always goes forward, so once offset is calculated with unstable clock, we won't get new reads that are smaller than offset, and thus won't get negative results. Thank you Jon DeVree for helping to reproduce this issue. 
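Reduced to a hedged, standalone sketch (the names below are invented and this
is not the kvmclock code itself), the idea is simply to sample the raw clock
once at boot and have every sched-clock read subtract that sample, so uptime
starts at zero even when the raw value carried over from a previous boot of
the same qemu instance:

  #include <linux/types.h>

  static u64 boot_offset;

  /* Stand-in for the raw, possibly non-zero-based clocksource read. */
  static u64 raw_clock_read(void)
  {
          return 0;       /* placeholder; the real source is the PV clock page */
  }

  static u64 offset_clock_read(void)
  {
          /* The raw clock only moves forward, so this never goes backwards. */
          return raw_clock_read() - boot_offset;
  }

  static void offset_clock_init(void)
  {
          boot_offset = raw_clock_read();
  }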
Fixes: 857baa87b642 ("sched/clock: Enable sched clock early") Cc: stable@vger.kernel.org Reported-by: Dominique Martinet Signed-off-by: Pavel Tatashin Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvmclock.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index e811d4d1c824..d908a37bf3f3 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -104,12 +104,8 @@ static u64 kvm_sched_clock_read(void) static inline void kvm_sched_clock_init(bool stable) { - if (!stable) { - pv_ops.time.sched_clock = kvm_clock_read; + if (!stable) clear_sched_clock_stable(); - return; - } - kvm_sched_clock_offset = kvm_clock_read(); pv_ops.time.sched_clock = kvm_sched_clock_read; From 90952cd388595317170ad22a6bcee3cb8cde4942 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 30 Jan 2019 17:07:47 +0100 Subject: [PATCH 175/855] kvm: Use struct_size() in kmalloc() One of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array. For example: struct foo { int stuff; void *entry[]; }; instance = kmalloc(sizeof(struct foo) + sizeof(void *) * count, GFP_KERNEL); Instead of leaving these open-coded and prone to type mistakes, we can now use the new struct_size() helper: instance = kmalloc(struct_size(instance, entry, count), GFP_KERNEL); This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 585845203db8..791ced69dd0b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3714,8 +3714,8 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1) return -ENOSPC; - new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count + 1) * - sizeof(struct kvm_io_range)), GFP_KERNEL); + new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1), + GFP_KERNEL); if (!new_bus) return -ENOMEM; @@ -3760,8 +3760,8 @@ void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, if (i == bus->dev_count) return; - new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) * - sizeof(struct kvm_io_range)), GFP_KERNEL); + new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1), + GFP_KERNEL); if (!new_bus) { pr_err("kvm: failed to shrink bus, removing it completely\n"); goto broken; From 98d90582be2e08246a70af31e09950ecb8876252 Mon Sep 17 00:00:00 2001 From: "Suthikulpanit, Suravee" Date: Tue, 29 Jan 2019 08:08:42 +0000 Subject: [PATCH 176/855] svm: Fix AVIC DFR and LDR handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Current SVM AVIC driver makes two incorrect assumptions: 1. APIC LDR register cannot be zero 2. APIC DFR for all vCPUs must be the same LDR=0 means the local APIC does not support logical destination mode. Therefore, the driver should mark any previously assigned logical APIC ID table entry as invalid, and return success. Also, DFR is specific to a particular local APIC, and can be different among all vCPUs (as observed on Windows 10). These incorrect assumptions cause Windows 10 and FreeBSD VMs to fail to boot with AVIC enabled. So, instead of flush the whole logical APIC ID table, handle DFR and LDR for each vCPU independently. 
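As a rough, self-contained sketch of the per-vCPU flow described above (all
names are invented; this is not the SVM implementation), an LDR update first
drops whatever entry was previously mapped and only installs a new one when
the guest actually programmed a non-zero LDR:

  struct fake_vcpu {
          unsigned int cached_ldr;
          bool entry_valid;       /* stands in for the logical ID table entry */
  };

  static int update_ldr(struct fake_vcpu *v, unsigned int ldr)
  {
          if (ldr == v->cached_ldr)
                  return 0;               /* nothing changed */

          v->entry_valid = false;         /* invalidate the old mapping first */

          if (ldr)                        /* LDR == 0: logical mode not in use */
                  v->entry_valid = true;  /* map the new logical ID */

          v->cached_ldr = ldr;
          return 0;
  }

DFR is handled the same way, except that only the per-vCPU cached value and
the old table entry need touching.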
Fixes: 18f40c53e10f ('svm: Add VMEXIT handlers for AVIC') Cc: Radim Krčmář Cc: Paolo Bonzini Reported-by: Julian Stecklina Signed-off-by: Suravee Suthikulpanit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 64 ++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f13a3a24d360..01b66bbcdd7a 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -145,7 +145,6 @@ struct kvm_svm { /* Struct members for AVIC */ u32 avic_vm_id; - u32 ldr_mode; struct page *avic_logical_id_table_page; struct page *avic_physical_id_table_page; struct hlist_node hnode; @@ -236,6 +235,7 @@ struct vcpu_svm { bool nrips_enabled : 1; u32 ldr_reg; + u32 dfr_reg; struct page *avic_backing_page; u64 *avic_physical_id_cache; bool avic_is_running; @@ -2106,6 +2106,7 @@ static int avic_init_vcpu(struct vcpu_svm *svm) INIT_LIST_HEAD(&svm->ir_list); spin_lock_init(&svm->ir_list_lock); + svm->dfr_reg = APIC_DFR_FLAT; return ret; } @@ -4565,8 +4566,7 @@ static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat) return &logical_apic_id_table[index]; } -static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr, - bool valid) +static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr) { bool flat; u32 *entry, new_entry; @@ -4579,31 +4579,39 @@ static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr, new_entry = READ_ONCE(*entry); new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK); - if (valid) - new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK; - else - new_entry &= ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK; + new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK; WRITE_ONCE(*entry, new_entry); return 0; } +static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + bool flat = svm->dfr_reg == APIC_DFR_FLAT; + u32 *entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat); + + if (entry) + WRITE_ONCE(*entry, (u32) ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK); +} + static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) { - int ret; + int ret = 0; struct vcpu_svm *svm = to_svm(vcpu); u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR); - if (!ldr) - return 1; + if (ldr == svm->ldr_reg) + return 0; - ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr, true); - if (ret && svm->ldr_reg) { - avic_ldr_write(vcpu, 0, svm->ldr_reg, false); - svm->ldr_reg = 0; - } else { + avic_invalidate_logical_id_entry(vcpu); + + if (ldr) + ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr); + + if (!ret) svm->ldr_reg = ldr; - } + return ret; } @@ -4637,27 +4645,16 @@ static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu) return 0; } -static int avic_handle_dfr_update(struct kvm_vcpu *vcpu) +static void avic_handle_dfr_update(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR); - u32 mod = (dfr >> 28) & 0xf; - /* - * We assume that all local APICs are using the same type. - * If this changes, we need to flush the AVIC logical - * APID id table. 
- */ - if (kvm_svm->ldr_mode == mod) - return 0; + if (svm->dfr_reg == dfr) + return; - clear_page(page_address(kvm_svm->avic_logical_id_table_page)); - kvm_svm->ldr_mode = mod; - - if (svm->ldr_reg) - avic_handle_ldr_update(vcpu); - return 0; + avic_invalidate_logical_id_entry(vcpu); + svm->dfr_reg = dfr; } static int avic_unaccel_trap_write(struct vcpu_svm *svm) @@ -6163,8 +6160,7 @@ static inline void avic_post_state_restore(struct kvm_vcpu *vcpu) { if (avic_handle_apic_id_update(vcpu) != 0) return; - if (avic_handle_dfr_update(vcpu) != 0) - return; + avic_handle_dfr_update(vcpu); avic_handle_ldr_update(vcpu); } From f7589cca50ef8970dda56cdced8d96d05aa0a777 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 30 Jan 2019 17:18:59 +0100 Subject: [PATCH 177/855] KVM: x86: cull apicv code when userspace irqchip is requested Currently apicv_active can be true even if in-kernel LAPIC emulation is disabled. Avoid this by properly initializing it in kvm_arch_vcpu_init, and then do not do anything to deactivate APICv when it is actually not used (Currently APICv is only deactivated by SynIC code that in turn is only reachable when in-kernel LAPIC is in use. However, it is cleaner if kvm_vcpu_deactivate_apicv avoids relying on this. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e67ecf25e690..a1fb99f21317 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7055,6 +7055,13 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu) { + if (!lapic_in_kernel(vcpu)) { + WARN_ON_ONCE(vcpu->arch.apicv_active); + return; + } + if (!vcpu->arch.apicv_active) + return; + vcpu->arch.apicv_active = false; kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu); } @@ -9005,7 +9012,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) struct page *page; int r; - vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu); vcpu->arch.emulate_ctxt.ops = &emulate_ops; if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -9026,6 +9032,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) goto fail_free_pio_data; if (irqchip_in_kernel(vcpu->kvm)) { + vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu); r = kvm_create_lapic(vcpu); if (r < 0) goto fail_mmu_destroy; From c57cd3c89ecf2812976f53e494580a396f93efd2 Mon Sep 17 00:00:00 2001 From: "Suthikulpanit, Suravee" Date: Tue, 29 Jan 2019 08:09:46 +0000 Subject: [PATCH 178/855] svm: Fix improper check when deactivate AVIC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function svm_refresh_apicv_exec_ctrl() always returning prematurely as kvm_vcpu_apicv_active() always return false when calling from the function arch/x86/kvm/x86.c:kvm_vcpu_deactivate_apicv(). This is because the apicv_active is set to false just before calling refresh_apicv_exec_ctrl(). Also, we need to mark VMCB_AVIC bit as dirty instead of VMCB_INTR. So, fix svm_refresh_apicv_exec_ctrl() to properly deactivate AVIC. 
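The ordering problem, reduced to a hedged standalone sketch (names invented
for illustration; this is not the SVM code):

  struct fake_vcpu {
          bool apicv_active;
  };

  static void refresh(struct fake_vcpu *v)
  {
          if (!v->apicv_active)
                  return;         /* on the deactivate path this is already false */
          /* ... the hardware enable bit would be updated here ... */
  }

  static void deactivate(struct fake_vcpu *v)
  {
          v->apicv_active = false;        /* flag cleared *before* refresh() runs */
          refresh(v);                     /* bails out, hardware bit left untouched */
  }

Hence the refresh path now acts on the current state unconditionally, setting
or clearing the enable bit instead of returning early.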
Fixes: 67034bb9dd5e ('KVM: SVM: Add irqchip_split() checks before enabling AVIC') Cc: Radim Krčmář Cc: Paolo Bonzini Signed-off-by: Suravee Suthikulpanit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 01b66bbcdd7a..74ceda470eae 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5122,11 +5122,11 @@ static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; - if (!kvm_vcpu_apicv_active(&svm->vcpu)) - return; - - vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; - mark_dirty(vmcb, VMCB_INTR); + if (kvm_vcpu_apicv_active(vcpu)) + vmcb->control.int_ctl |= AVIC_ENABLE_MASK; + else + vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; + mark_dirty(vmcb, VMCB_AVIC); } static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) From 946c522b603f281195af1df91837a1d4d1eb3bc9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Jan 2019 14:39:23 -0800 Subject: [PATCH 179/855] KVM: nVMX: Sign extend displacements of VMX instr's mem operands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The VMCS.EXIT_QUALIFCATION field reports the displacements of memory operands for various instructions, including VMX instructions, as a naturally sized unsigned value, but masks the value by the addr size, e.g. given a ModRM encoded as -0x28(%ebp), the -0x28 displacement is reported as 0xffffffd8 for a 32-bit address size. Despite some weird wording regarding sign extension, the SDM explicitly states that bits beyond the instructions address size are undefined: In all cases, bits of this field beyond the instruction’s address size are undefined. Failure to sign extend the displacement results in KVM incorrectly treating a negative displacement as a large positive displacement when the address size of the VMX instruction is smaller than KVM's native size, e.g. a 32-bit address size on a 64-bit KVM. The very original decoding, added by commit 064aea774768 ("KVM: nVMX: Decoding memory operands of VMX instructions"), sort of modeled sign extension by truncating the final virtual/linear address for a 32-bit address size. I.e. it messed up the effective address but made it work by adjusting the final address. When segmentation checks were added, the truncation logic was kept as-is and no sign extension logic was introduced. In other words, it kept calculating the wrong effective address while mostly generating the correct virtual/linear address. As the effective address is what's used in the segment limit checks, this results in KVM incorreclty injecting #GP/#SS faults due to non-existent segment violations when a nested VMM uses negative displacements with an address size smaller than KVM's native address size. Using the -0x28(%ebp) example, an EBP value of 0x1000 will result in KVM using 0x100000fd8 as the effective address when checking for a segment limit violation. This causes a 100% failure rate when running a 32-bit KVM build as L1 on top of a 64-bit KVM L0. 
Fixes: f9eb4af67c9d ("KVM: nVMX: VMX instructions: add checks for #GP/#SS exceptions") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 0e67649e39ce..d531f4c91a34 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4020,6 +4020,10 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, /* Addr = segment_base + offset */ /* offset = base + [index * scale] + displacement */ off = exit_qualification; /* holds the displacement */ + if (addr_size == 1) + off = (gva_t)sign_extend64(off, 31); + else if (addr_size == 0) + off = (gva_t)sign_extend64(off, 15); if (base_is_valid) off += kvm_register_read(vcpu, base_reg); if (index_is_valid) From 8570f9e881e3fde98801bb3a47eef84dd934d405 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Jan 2019 14:39:24 -0800 Subject: [PATCH 180/855] KVM: nVMX: Apply addr size mask to effective address for VMX instructions The address size of an instruction affects the effective address, not the virtual/linear address. The final address may still be truncated, e.g. to 32-bits outside of long mode, but that happens irrespective of the address size, e.g. a 32-bit address size can yield a 64-bit virtual address when using FS/GS with a non-zero base. Fixes: 064aea774768 ("KVM: nVMX: Decoding memory operands of VMX instructions") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index d531f4c91a34..34081cc8cdeb 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4029,20 +4029,41 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, if (index_is_valid) off += kvm_register_read(vcpu, index_reg)< Date: Wed, 23 Jan 2019 14:39:25 -0800 Subject: [PATCH 181/855] KVM: nVMX: Ignore limit checks on VMX instructions using flat segments Regarding segments with a limit==0xffffffff, the SDM officially states: When the effective limit is FFFFFFFFH (4 GBytes), these accesses may or may not cause the indicated exceptions. Behavior is implementation-specific and may vary from one execution to another. In practice, all CPUs that support VMX ignore limit checks for "flat segments", i.e. an expand-up data or code segment with base=0 and limit=0xffffffff. This is subtly different than wrapping the effective address calculation based on the address size, as the flat segment behavior also applies to accesses that would wrap the 4g boundary, e.g. a 4-byte access starting at 0xffffffff will access linear addresses 0xffffffff, 0x0, 0x1 and 0x2. Fixes: f9eb4af67c9d ("KVM: nVMX: VMX instructions: add checks for #GP/#SS exceptions") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 34081cc8cdeb..0050c179c5d3 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4087,10 +4087,16 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. 
*/ exn = (s.unusable != 0); - /* Protected mode: #GP(0)/#SS(0) if the memory - * operand is outside the segment limit. + + /* + * Protected mode: #GP(0)/#SS(0) if the memory operand is + * outside the segment limit. All CPUs that support VMX ignore + * limit checks for flat segments, i.e. segments with base==0, + * limit==0xffffffff and of type expand-up data or code. */ - exn = exn || (off + sizeof(u64) > s.limit); + if (!(s.base == 0 && s.limit == 0xffffffff && + ((s.type & 8) || !(s.type & 4)))) + exn = exn || (off + sizeof(u64) > s.limit); } if (exn) { kvm_queue_exception_e(vcpu, From e0dfacbfe91ad7947f0c44829c0358ac2e17d3c6 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 30 Jan 2019 17:25:38 +0100 Subject: [PATCH 182/855] KVM: nVMX: remove useless is_protmode check VMX is only accessible in protected mode, remove a confusing check that causes the conditional to lack a final "else" branch. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 0050c179c5d3..be9b2c8fd32e 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4056,7 +4056,7 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, * destination for long mode! */ exn = is_noncanonical_address(*ret, vcpu); - } else if (is_protmode(vcpu)) { + } else { /* * When not in long mode, the virtual/linear address is * unconditionally truncated to 32 bits regardless of the From 8acc0993e3f9a04a407ff1507dd329455f340121 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Tue, 15 Jan 2019 17:28:40 +1300 Subject: [PATCH 183/855] kvm, x86, mmu: Use kernel generic dynamic physical address mask AMD's SME/SEV is no longer the only case which reduces supported physical address bits, since Intel introduced Multi-key Total Memory Encryption (MKTME), which repurposes high bits of physical address as keyID, thus effectively shrinks supported physical address bits. To cover both cases (and potential similar future features), kernel MM introduced generic dynamaic physical address mask instead of hard-coded __PHYSICAL_MASK in 'commit 94d49eb30e854 ("x86/mm: Decouple dynamic __PHYSICAL_MASK from AMD SME")'. KVM should use that too. Change PT64_BASE_ADDR_MASK to use kernel dynamic physical address mask when it is enabled, instead of sme_clr. PT64_DIR_BASE_ADDR_MASK is also deleted since it is not used at all. 
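As a rough illustration (the 46-bit figure is an assumption for the example, not taken from this patch), suppose MKTME reserves the top bits of a 52-bit physical address for a keyID and leaves 46 usable address bits:

	/*
	 * physical_mask is shrunk at boot to cover only the usable bits:
	 *
	 *     physical_mask       == (1ULL << 46) - 1
	 *
	 * so the new definition
	 *
	 *     PT64_BASE_ADDR_MASK == physical_mask & ~(u64)(PAGE_SIZE - 1)
	 *
	 * keeps PTE address bits 12..45 and drops both the page offset and
	 * the keyID/C-bit region, without any SME-specific sme_clr() call.
	 */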
Signed-off-by: Kai Huang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index da9c42349b1f..45eb988aa411 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -109,9 +109,11 @@ module_param(dbg, bool, 0644); (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) -#define PT64_BASE_ADDR_MASK __sme_clr((((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))) -#define PT64_DIR_BASE_ADDR_MASK \ - (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) +#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK +#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) +#else +#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) +#endif #define PT64_LVL_ADDR_MASK(level) \ (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ * PT64_LEVEL_BITS))) - 1)) From 74f2370bb64f5c1c418f115e338f20f11b75f954 Mon Sep 17 00:00:00 2001 From: Liu Jingqi Date: Tue, 6 Nov 2018 13:55:27 +0800 Subject: [PATCH 184/855] KVM: x86: expose MOVDIRI CPU feature into VM. MOVDIRI moves doubleword or quadword from register to memory through direct store which is implemented by using write combining (WC) for writing data directly into memory without caching the data. Availability of the MOVDIRI instruction is indicated by the presence of the CPUID feature flag MOVDIRI(CPUID.0x07.0x0:ECX[bit 27]). This patch exposes the movdiri feature to the guest. The release document ref below link: https://software.intel.com/sites/default/files/managed/c5/15/\ architecture-instruction-set-extensions-programming-reference.pdf Signed-off-by: Liu Jingqi Cc: Xu Tao Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index bbffa6c54697..8045434093f8 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -404,7 +404,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | - F(CLDEMOTE); + F(CLDEMOTE) | F(MOVDIRI); /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = From c029b5deb0b5d7e5090317b835f21c5d93999db7 Mon Sep 17 00:00:00 2001 From: Liu Jingqi Date: Tue, 6 Nov 2018 13:55:28 +0800 Subject: [PATCH 185/855] KVM: x86: expose MOVDIR64B CPU feature into VM. MOVDIR64B moves 64-bytes as direct-store with 64-bytes write atomicity. Direct store is implemented by using write combining (WC) for writing data directly into memory without caching the data. Availability of the MOVDIR64B instruction is indicated by the presence of the CPUID feature flag MOVDIR64B (CPUID.0x07.0x0:ECX[bit 28]). This patch exposes the movdir64b feature to the guest. 
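A guest can confirm the newly exposed bits with a plain CPUID query; a minimal userspace sketch (illustration only, using the bit numbers quoted in these two patches):

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* CPUID.(EAX=07H, ECX=0): MOVDIRI is ECX[27], MOVDIR64B is ECX[28] */
	if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
		return 1;

	printf("MOVDIRI:   %s\n", (ecx & (1u << 27)) ? "yes" : "no");
	printf("MOVDIR64B: %s\n", (ecx & (1u << 28)) ? "yes" : "no");
	return 0;
}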
The release document ref below link: https://software.intel.com/sites/default/files/managed/c5/15/\ architecture-instruction-set-extensions-programming-reference.pdf Signed-off-by: Liu Jingqi Cc: Xu Tao Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 8045434093f8..e4644ba9c3da 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -404,7 +404,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | - F(CLDEMOTE) | F(MOVDIRI); + F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B); /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = From 81b016676e1c8f58027bd4d2b1d8a981776b36fe Mon Sep 17 00:00:00 2001 From: Luwei Kang Date: Thu, 31 Jan 2019 16:52:02 +0800 Subject: [PATCH 186/855] KVM: x86: Sync the pending Posted-Interrupts Some Posted-Interrupts from passthrough devices may be lost or overwritten when the vCPU is in runnable state. The SN (Suppress Notification) of PID (Posted Interrupt Descriptor) will be set when the vCPU is preempted (vCPU in KVM_MP_STATE_RUNNABLE state but not running on physical CPU). If a posted interrupt coming at this time, the irq remmaping facility will set the bit of PIR (Posted Interrupt Requests) without ON (Outstanding Notification). So this interrupt can't be sync to APIC virtualization register and will not be handled by Guest because ON is zero. Signed-off-by: Luwei Kang [Eliminate the pi_clear_sn fast path. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 26 +++++++++++--------------- arch/x86/kvm/vmx/vmx.h | 16 ++++++++-------- arch/x86/kvm/x86.c | 2 +- 3 files changed, 20 insertions(+), 24 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c39f1c38b878..c81dc632464a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1193,21 +1193,6 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) return; - /* - * First handle the simple case where no cmpxchg is necessary; just - * allow posting non-urgent interrupts. - * - * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change - * PI.NDST: pi_post_block will do it for us and the wakeup_handler - * expects the VCPU to be on the blocked_vcpu_list that matches - * PI.NDST. - */ - if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || - vcpu->cpu == cpu) { - pi_clear_sn(pi_desc); - return; - } - /* The full case. */ do { old.control = new.control = pi_desc->control; @@ -1222,6 +1207,17 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) new.sn = 0; } while (cmpxchg64(&pi_desc->control, old.control, new.control) != old.control); + + /* + * Clear SN before reading the bitmap; this ensures that any + * interrupt that comes after the bitmap is read sets ON. The + * VT-d firmware * writes the bitmap and reads SN atomically (5.2.3 + * in the spec), so it doesn't really have a memory barrier that + * pairs with this. However, we cannot do that and we need one. 
+ */ + smp_mb__after_atomic(); + if (!bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS)) + pi_set_on(pi_desc); } /* diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 6ee6a492efaf..ec23b4d65fb7 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -336,16 +336,16 @@ static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); } -static inline void pi_clear_sn(struct pi_desc *pi_desc) -{ - return clear_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - static inline void pi_set_sn(struct pi_desc *pi_desc) { - return set_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); + set_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline void pi_set_on(struct pi_desc *pi_desc) +{ + set_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); } static inline void pi_clear_on(struct pi_desc *pi_desc) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a1fb99f21317..96f87d356c79 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7808,7 +7808,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * 1) We should set ->mode before checking ->requests. Please see * the comment in kvm_vcpu_exiting_guest_mode(). * - * 2) For APICv, we should set ->mode before checking PIR.ON. This + * 2) For APICv, we should set ->mode before checking PID.ON. This * pairs with the memory barrier implicit in pi_test_and_set_on * (see vmx_deliver_posted_interrupt). * From b4b65b5642d6edd50e35a159f20e793dc478d686 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 29 Jan 2019 19:12:35 +0100 Subject: [PATCH 187/855] KVM: x86: cleanup freeing of nested state Ensure that the VCPU free path goes through vmx_leave_nested and thus nested_vmx_vmexit, so that the cancellation of the timer does not have to be in free_nested. In addition, because some paths through nested_vmx_vmexit do not go through sync_vmcs12, the cancellation of the timer is moved there. 
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 12 ++++++------ arch/x86/kvm/vmx/vmx.c | 1 - 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index be9b2c8fd32e..523d30e935e1 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -211,7 +211,6 @@ static void free_nested(struct kvm_vcpu *vcpu) if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) return; - hrtimer_cancel(&vmx->nested.preemption_timer); vmx->nested.vmxon = false; vmx->nested.smm.vmxon = false; free_vpid(vmx->nested.vpid02); @@ -274,6 +273,7 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) { vcpu_load(vcpu); + vmx_leave_nested(vcpu); vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01); free_nested(vcpu); vcpu_put(vcpu); @@ -3438,13 +3438,10 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) else vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; - if (nested_cpu_has_preemption_timer(vmcs12)) { - if (vmcs12->vm_exit_controls & - VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) + if (nested_cpu_has_preemption_timer(vmcs12) && + vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) vmcs12->vmx_preemption_timer_value = vmx_get_preemption_timer_value(vcpu); - hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); - } /* * In some cases (usually, nested EPT), L2 is allowed to change its @@ -3852,6 +3849,9 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, leave_guest_mode(vcpu); + if (nested_cpu_has_preemption_timer(vmcs12)) + hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) vcpu->arch.tsc_offset -= vmcs12->tsc_offset; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c81dc632464a..de6a9e7c6471 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6546,7 +6546,6 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) if (enable_pml) vmx_destroy_pml_buffer(vmx); free_vpid(vmx->vpid); - leave_guest_mode(vcpu); nested_vmx_free_vcpu(vcpu); free_loaded_vmcs(vmx->loaded_vmcs); kfree(vmx->guest_msrs); From d92935979adba274b1099e67b7f713f6d8413121 Mon Sep 17 00:00:00 2001 From: Yu Zhang Date: Thu, 31 Jan 2019 11:26:39 +0800 Subject: [PATCH 188/855] kvm: vmx: Fix typos in vmentry/vmexit control setting Previously, 'commit f99e3daf94ff ("KVM: x86: Add Intel PT virtualization work mode")' work mode' offered framework to support Intel PT virtualization. However, the patch has some typos in vmx_vmentry_ctrl() and vmx_vmexit_ctrl(), e.g. used wrong flags and wrong variable, which will cause the VM entry failure later. 
Fixes: 'commit f99e3daf94ff ("KVM: x86: Add Intel PT virtualization work mode")' Signed-off-by: Yu Zhang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index ec23b4d65fb7..d7d1048d221b 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -444,7 +444,8 @@ static inline u32 vmx_vmentry_ctrl(void) { u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; if (pt_mode == PT_MODE_SYSTEM) - vmentry_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | VM_EXIT_CLEAR_IA32_RTIT_CTL); + vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | + VM_ENTRY_LOAD_IA32_RTIT_CTL); /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ return vmentry_ctrl & ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER); @@ -454,9 +455,10 @@ static inline u32 vmx_vmexit_ctrl(void) { u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; if (pt_mode == PT_MODE_SYSTEM) - vmexit_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | VM_ENTRY_LOAD_IA32_RTIT_CTL); + vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | + VM_EXIT_CLEAR_IA32_RTIT_CTL); /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ - return vmcs_config.vmexit_ctrl & + return vmexit_ctrl & ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); } From 359a6c3ddc5184122989d5152a4b975c1c262d33 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 29 Jan 2019 19:14:46 +0100 Subject: [PATCH 189/855] KVM: nVMX: do not start the preemption timer hrtimer unnecessarily The preemption timer can be started even if there is a vmentry failure during or after loading guest state. That is pointless, move the call after all conditions have been checked. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 523d30e935e1..11431bb4c7ed 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2278,10 +2278,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } vmx_set_rflags(vcpu, vmcs12->guest_rflags); - vmx->nested.preemption_timer_expired = false; - if (nested_cpu_has_preemption_timer(vmcs12)) - vmx_start_preemption_timer(vcpu); - /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the * bitwise-or of what L1 wants to trap for L2, and what we want to * trap. Note that CR0.TS also needs updating - we do this later. @@ -3018,6 +3014,15 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) if (unlikely(evaluate_pending_interrupts)) kvm_make_request(KVM_REQ_EVENT, vcpu); + /* + * Do not start the preemption timer hrtimer until after we know + * we are successful, so that only nested_vmx_vmexit needs to cancel + * the timer. + */ + vmx->nested.preemption_timer_expired = false; + if (nested_cpu_has_preemption_timer(vmcs12)) + vmx_start_preemption_timer(vcpu); + /* * Note no nested_vmx_succeed or nested_vmx_fail here. At this point * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet From b12ce36a43f29dbff0bca14c5a51c276aea5662f Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 11 Feb 2019 11:02:49 -0800 Subject: [PATCH 190/855] kvm: Add memcg accounting to KVM allocations There are many KVM kernel memory allocations which are tied to the life of the VM process and should be charged to the VM process's cgroup. 
If the allocations aren't tied to the process, the OOM killer will not know that killing the process will free the associated kernel memory. Add __GFP_ACCOUNT flags to many of the allocations which are not yet being charged to the VM process's cgroup. Tested: Ran all kvm-unit-tests on a 64 bit Haswell machine, the patch introduced no new failures. Ran a kernel memory accounting test which creates a VM to touch memory and then checks that the kernel memory allocated for the process is within certain bounds. With this patch we account for much more of the vmalloc and slab memory allocated for the VM. There remain a few allocations which should be charged to the VM's cgroup but are not. In they include: vcpu->run kvm->coalesced_mmio_ring There allocations are unaccounted in this patch because they are mapped to userspace, and accounting them to a cgroup causes problems. This should be addressed in a future patch. Signed-off-by: Ben Gardon Reviewed-by: Shakeel Butt Signed-off-by: Paolo Bonzini --- virt/kvm/coalesced_mmio.c | 3 ++- virt/kvm/eventfd.c | 7 ++++--- virt/kvm/irqchip.c | 4 ++-- virt/kvm/kvm_main.c | 29 +++++++++++++++-------------- virt/kvm/vfio.c | 4 ++-- 5 files changed, 25 insertions(+), 22 deletions(-) diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c index 6855cce3e528..5294abb3f178 100644 --- a/virt/kvm/coalesced_mmio.c +++ b/virt/kvm/coalesced_mmio.c @@ -144,7 +144,8 @@ int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, if (zone->pio != 1 && zone->pio != 0) return -EINVAL; - dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL); + dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), + GFP_KERNEL_ACCOUNT); if (!dev) return -ENOMEM; diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index b20b751286fc..4325250afd72 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -297,7 +297,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) if (!kvm_arch_intc_initialized(kvm)) return -EAGAIN; - irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL); + irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL_ACCOUNT); if (!irqfd) return -ENOMEM; @@ -345,7 +345,8 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) } if (!irqfd->resampler) { - resampler = kzalloc(sizeof(*resampler), GFP_KERNEL); + resampler = kzalloc(sizeof(*resampler), + GFP_KERNEL_ACCOUNT); if (!resampler) { ret = -ENOMEM; mutex_unlock(&kvm->irqfds.resampler_lock); @@ -797,7 +798,7 @@ static int kvm_assign_ioeventfd_idx(struct kvm *kvm, if (IS_ERR(eventfd)) return PTR_ERR(eventfd); - p = kzalloc(sizeof(*p), GFP_KERNEL); + p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT); if (!p) { ret = -ENOMEM; goto fail; diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c index b1286c4e0712..3547b0d8c91e 100644 --- a/virt/kvm/irqchip.c +++ b/virt/kvm/irqchip.c @@ -196,7 +196,7 @@ int kvm_set_irq_routing(struct kvm *kvm, nr_rt_entries += 1; new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head)), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!new) return -ENOMEM; @@ -208,7 +208,7 @@ int kvm_set_irq_routing(struct kvm *kvm, for (i = 0; i < nr; ++i) { r = -ENOMEM; - e = kzalloc(sizeof(*e), GFP_KERNEL); + e = kzalloc(sizeof(*e), GFP_KERNEL_ACCOUNT); if (!e) goto out; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 791ced69dd0b..0a0ea8f4bb1b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -525,7 +525,7 @@ static struct kvm_memslots *kvm_alloc_memslots(void) int i; struct kvm_memslots *slots; - slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + slots = 
kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT); if (!slots) return NULL; @@ -601,12 +601,12 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries, sizeof(*kvm->debugfs_stat_data), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!kvm->debugfs_stat_data) return -ENOMEM; for (p = debugfs_entries; p->name; p++) { - stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL); + stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT); if (!stat_data) return -ENOMEM; @@ -671,7 +671,7 @@ static struct kvm *kvm_create_vm(unsigned long type) goto out_err_no_irq_srcu; for (i = 0; i < KVM_NR_BUSES; i++) { rcu_assign_pointer(kvm->buses[i], - kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL)); + kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT)); if (!kvm->buses[i]) goto out_err; } @@ -789,7 +789,7 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) { unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); - memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL); + memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL_ACCOUNT); if (!memslot->dirty_bitmap) return -ENOMEM; @@ -1018,7 +1018,7 @@ int __kvm_set_memory_region(struct kvm *kvm, goto out_free; } - slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT); if (!slots) goto out_free; memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots)); @@ -2683,7 +2683,7 @@ static long kvm_vcpu_ioctl(struct file *filp, struct kvm_regs *kvm_regs; r = -ENOMEM; - kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); + kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT); if (!kvm_regs) goto out; r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); @@ -2711,7 +2711,8 @@ out_free1: break; } case KVM_GET_SREGS: { - kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL); + kvm_sregs = kzalloc(sizeof(struct kvm_sregs), + GFP_KERNEL_ACCOUNT); r = -ENOMEM; if (!kvm_sregs) goto out; @@ -2803,7 +2804,7 @@ out_free1: break; } case KVM_GET_FPU: { - fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL); + fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT); r = -ENOMEM; if (!fpu) goto out; @@ -2980,7 +2981,7 @@ static int kvm_ioctl_create_device(struct kvm *kvm, if (test) return 0; - dev = kzalloc(sizeof(*dev), GFP_KERNEL); + dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT); if (!dev) return -ENOMEM; @@ -3715,7 +3716,7 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, return -ENOSPC; new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!new_bus) return -ENOMEM; @@ -3761,7 +3762,7 @@ void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, return; new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!new_bus) { pr_err("kvm: failed to shrink bus, removing it completely\n"); goto broken; @@ -4029,7 +4030,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) active = kvm_active_vms; spin_unlock(&kvm_lock); - env = kzalloc(sizeof(*env), GFP_KERNEL); + env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT); if (!env) return; @@ -4045,7 +4046,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) add_uevent_var(env, "PID=%d", kvm->userspace_pid); if (kvm->debugfs_dentry) { - char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL); + char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT); if (p) { tmp = 
dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX); diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index d99850c462a1..524cbd20379f 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -219,7 +219,7 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg) } } - kvg = kzalloc(sizeof(*kvg), GFP_KERNEL); + kvg = kzalloc(sizeof(*kvg), GFP_KERNEL_ACCOUNT); if (!kvg) { mutex_unlock(&kv->lock); kvm_vfio_group_put_external_user(vfio_group); @@ -405,7 +405,7 @@ static int kvm_vfio_create(struct kvm_device *dev, u32 type) if (tmp->ops == &kvm_vfio_ops) return -EBUSY; - kv = kzalloc(sizeof(*kv), GFP_KERNEL); + kv = kzalloc(sizeof(*kv), GFP_KERNEL_ACCOUNT); if (!kv) return -ENOMEM; From 254272ce6505948ecc0b4bf5cd0aa61cdd815994 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 11 Feb 2019 11:02:50 -0800 Subject: [PATCH 191/855] kvm: x86: Add memcg accounting to KVM allocations There are many KVM kernel memory allocations which are tied to the life of the VM process and should be charged to the VM process's cgroup. If the allocations aren't tied to the process, the OOM killer will not know that killing the process will free the associated kernel memory. Add __GFP_ACCOUNT flags to many of the allocations which are not yet being charged to the VM process's cgroup. Tested: Ran all kvm-unit-tests on a 64 bit Haswell machine, the patch introduced no new failures. Ran a kernel memory accounting test which creates a VM to touch memory and then checks that the kernel memory allocated for the process is within certain bounds. With this patch we account for much more of the vmalloc and slab memory allocated for the VM. There remain a few allocations which should be charged to the VM's cgroup but are not. In x86, they include: vcpu->arch.pio_data There allocations are unaccounted in this patch because they are mapped to userspace, and accounting them to a cgroup causes problems. This should be addressed in a future patch. 
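The conversion itself is mechanical; a minimal sketch of the pattern (struct foo is a hypothetical per-VM object, not something from this patch):

	struct foo *foo;

	/*
	 * GFP_KERNEL_ACCOUNT is GFP_KERNEL | __GFP_ACCOUNT: the allocation
	 * is charged to the memcg of the process that owns the VM, so the
	 * OOM killer knows that killing the process frees this memory too.
	 */
	foo = kzalloc(sizeof(*foo), GFP_KERNEL_ACCOUNT);
	if (!foo)
		return -ENOMEM;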
Signed-off-by: Ben Gardon Reviewed-by: Shakeel Butt Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 2 +- arch/x86/kvm/i8254.c | 2 +- arch/x86/kvm/i8259.c | 2 +- arch/x86/kvm/ioapic.c | 2 +- arch/x86/kvm/lapic.c | 7 ++++--- arch/x86/kvm/mmu.c | 6 +++--- arch/x86/kvm/page_track.c | 2 +- arch/x86/kvm/x86.c | 16 +++++++++------- 8 files changed, 21 insertions(+), 18 deletions(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 89d20ed1d2e8..27c43525a05f 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1729,7 +1729,7 @@ static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd) mutex_lock(&hv->hv_lock); ret = idr_alloc(&hv->conn_to_evt, eventfd, conn_id, conn_id + 1, - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); mutex_unlock(&hv->hv_lock); if (ret >= 0) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index af192895b1fc..4a6dc54cc12b 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -653,7 +653,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) pid_t pid_nr; int ret; - pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL); + pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL_ACCOUNT); if (!pit) return NULL; diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index bdcd4139eca9..8b38bb4868a6 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -583,7 +583,7 @@ int kvm_pic_init(struct kvm *kvm) struct kvm_pic *s; int ret; - s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); + s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL_ACCOUNT); if (!s) return -ENOMEM; spin_lock_init(&s->lock); diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 4e822ad363f3..1add1bc881e2 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -622,7 +622,7 @@ int kvm_ioapic_init(struct kvm *kvm) struct kvm_ioapic *ioapic; int ret; - ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL); + ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL_ACCOUNT); if (!ioapic) return -ENOMEM; spin_lock_init(&ioapic->lock); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4b6c2da7265c..991fdf7fc17f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -181,7 +181,8 @@ static void recalculate_apic_map(struct kvm *kvm) max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic)); new = kvzalloc(sizeof(struct kvm_apic_map) + - sizeof(struct kvm_lapic *) * ((u64)max_id + 1), GFP_KERNEL); + sizeof(struct kvm_lapic *) * ((u64)max_id + 1), + GFP_KERNEL_ACCOUNT); if (!new) goto out; @@ -2259,13 +2260,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) ASSERT(vcpu != NULL); apic_debug("apic_init %d\n", vcpu->vcpu_id); - apic = kzalloc(sizeof(*apic), GFP_KERNEL); + apic = kzalloc(sizeof(*apic), GFP_KERNEL_ACCOUNT); if (!apic) goto nomem; vcpu->arch.apic = apic; - apic->regs = (void *)get_zeroed_page(GFP_KERNEL); + apic->regs = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!apic->regs) { printk(KERN_ERR "malloc apic regs error for vcpu %x\n", vcpu->vcpu_id); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 45eb988aa411..415d0e62cb3e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -961,7 +961,7 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, if (cache->nobjs >= min) return 0; while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - obj = kmem_cache_zalloc(base_cache, GFP_KERNEL); + obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT); if (!obj) return cache->nobjs >= min ? 
0 : -ENOMEM; cache->objects[cache->nobjs++] = obj; @@ -3702,7 +3702,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) u64 *lm_root; - lm_root = (void*)get_zeroed_page(GFP_KERNEL); + lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (lm_root == NULL) return 1; @@ -5499,7 +5499,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) * Therefore we need to allocate shadow page tables in the first * 4GB of memory, which happens to fit the DMA32 zone. */ - page = alloc_page(GFP_KERNEL | __GFP_DMA32); + page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32); if (!page) return -ENOMEM; diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c index 3052a59a3065..fd04d462fdae 100644 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/page_track.c @@ -42,7 +42,7 @@ int kvm_page_track_create_memslot(struct kvm_memory_slot *slot, for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) { slot->arch.gfn_track[i] = kvcalloc(npages, sizeof(*slot->arch.gfn_track[i]), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!slot->arch.gfn_track[i]) goto track_free; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 96f87d356c79..3de586f89730 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3879,7 +3879,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; if (!lapic_in_kernel(vcpu)) goto out; - u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); + u.lapic = kzalloc(sizeof(struct kvm_lapic_state), + GFP_KERNEL_ACCOUNT); r = -ENOMEM; if (!u.lapic) @@ -4066,7 +4067,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_GET_XSAVE: { - u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); + u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT); r = -ENOMEM; if (!u.xsave) break; @@ -4090,7 +4091,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_GET_XCRS: { - u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); + u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT); r = -ENOMEM; if (!u.xcrs) break; @@ -9040,14 +9041,15 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) static_key_slow_inc(&kvm_no_apic_vcpu); vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!vcpu->arch.mce_banks) { r = -ENOMEM; goto fail_free_lapic; } vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; - if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) { + if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, + GFP_KERNEL_ACCOUNT)) { r = -ENOMEM; goto fail_free_mce_banks; } @@ -9306,13 +9308,13 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, slot->arch.rmap[i] = kvcalloc(lpages, sizeof(*slot->arch.rmap[i]), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!slot->arch.rmap[i]) goto out_free; if (i == 0) continue; - linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL); + linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT); if (!linfo) goto out_free; From 1ec696470c860e2b1b95e178e368e180ad2b040f Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 11 Feb 2019 11:02:51 -0800 Subject: [PATCH 192/855] kvm: svm: Add memcg accounting to KVM allocations There are many KVM kernel memory allocations which are tied to the life of the VM process and should be charged to the VM process's cgroup. If the allocations aren't tied to the process, the OOM killer will not know that killing the process will free the associated kernel memory. Add __GFP_ACCOUNT flags to many of the allocations which are not yet being charged to the VM process's cgroup. 
Tested: Ran all kvm-unit-tests on a 64 bit Haswell machine, the patch introduced no new failures. Ran a kernel memory accounting test which creates a VM to touch memory and then checks that the kernel memory allocated for the process is within certain bounds. With this patch we account for much more of the vmalloc and slab memory allocated for the VM. Signed-off-by: Ben Gardon Reviewed-by: Shakeel Butt Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 46 +++++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 74ceda470eae..b5b128a0a051 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1795,9 +1795,10 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, /* Avoid using vmalloc for smaller buffers. */ size = npages * sizeof(struct page *); if (size > PAGE_SIZE) - pages = vmalloc(size); + pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO, + PAGE_KERNEL); else - pages = kmalloc(size, GFP_KERNEL); + pages = kmalloc(size, GFP_KERNEL_ACCOUNT); if (!pages) return NULL; @@ -1865,7 +1866,9 @@ static void __unregister_enc_region_locked(struct kvm *kvm, static struct kvm *svm_vm_alloc(void) { - struct kvm_svm *kvm_svm = vzalloc(sizeof(struct kvm_svm)); + struct kvm_svm *kvm_svm = __vmalloc(sizeof(struct kvm_svm), + GFP_KERNEL_ACCOUNT | __GFP_ZERO, + PAGE_KERNEL); return &kvm_svm->kvm; } @@ -1940,7 +1943,7 @@ static int avic_vm_init(struct kvm *kvm) return 0; /* Allocating physical APIC ID table (4KB) */ - p_page = alloc_page(GFP_KERNEL); + p_page = alloc_page(GFP_KERNEL_ACCOUNT); if (!p_page) goto free_avic; @@ -1948,7 +1951,7 @@ static int avic_vm_init(struct kvm *kvm) clear_page(page_address(p_page)); /* Allocating logical APIC ID table (4KB) */ - l_page = alloc_page(GFP_KERNEL); + l_page = alloc_page(GFP_KERNEL_ACCOUNT); if (!l_page) goto free_avic; @@ -2120,13 +2123,14 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) struct page *nested_msrpm_pages; int err; - svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); if (!svm) { err = -ENOMEM; goto out; } - svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL); + svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, + GFP_KERNEL_ACCOUNT); if (!svm->vcpu.arch.guest_fpu) { printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); err = -ENOMEM; @@ -2138,19 +2142,19 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) goto free_svm; err = -ENOMEM; - page = alloc_page(GFP_KERNEL); + page = alloc_page(GFP_KERNEL_ACCOUNT); if (!page) goto uninit; - msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); + msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); if (!msrpm_pages) goto free_page1; - nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); + nested_msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); if (!nested_msrpm_pages) goto free_page2; - hsave_page = alloc_page(GFP_KERNEL); + hsave_page = alloc_page(GFP_KERNEL_ACCOUNT); if (!hsave_page) goto free_page3; @@ -5192,7 +5196,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) * Allocating new amd_iommu_pi_data, which will get * add to the per-vcpu ir_list. 
*/ - ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL); + ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT); if (!ir) { ret = -ENOMEM; goto out; @@ -6307,7 +6311,7 @@ static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error) if (ret) return ret; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); if (!data) return -ENOMEM; @@ -6357,7 +6361,7 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) return -EFAULT; - start = kzalloc(sizeof(*start), GFP_KERNEL); + start = kzalloc(sizeof(*start), GFP_KERNEL_ACCOUNT); if (!start) return -ENOMEM; @@ -6454,7 +6458,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) return -EFAULT; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); if (!data) return -ENOMEM; @@ -6531,7 +6535,7 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) if (copy_from_user(¶ms, measure, sizeof(params))) return -EFAULT; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); if (!data) return -ENOMEM; @@ -6593,7 +6597,7 @@ static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -ENOTTY; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); if (!data) return -ENOMEM; @@ -6614,7 +6618,7 @@ static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -ENOTTY; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); if (!data) return -ENOMEM; @@ -6642,7 +6646,7 @@ static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src, struct sev_data_dbg *data; int ret; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); if (!data) return -ENOMEM; @@ -6897,7 +6901,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) } ret = -ENOMEM; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); if (!data) goto e_unpin_memory; @@ -7003,7 +7007,7 @@ static int svm_register_enc_region(struct kvm *kvm, if (range->addr > ULONG_MAX || range->size > ULONG_MAX) return -EINVAL; - region = kzalloc(sizeof(*region), GFP_KERNEL); + region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT); if (!region) return -ENOMEM; From 4183683918efc3549b5ebddde4ed5edfdac45c17 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 11 Feb 2019 11:02:52 -0800 Subject: [PATCH 193/855] kvm: vmx: Add memcg accounting to KVM allocations There are many KVM kernel memory allocations which are tied to the life of the VM process and should be charged to the VM process's cgroup. If the allocations aren't tied to the process, the OOM killer will not know that killing the process will free the associated kernel memory. Add __GFP_ACCOUNT flags to many of the allocations which are not yet being charged to the VM process's cgroup. Tested: Ran all kvm-unit-tests on a 64 bit Haswell machine, the patch introduced no new failures. Ran a kernel memory accounting test which creates a VM to touch memory and then checks that the kernel memory allocated for the process is within certain bounds. 
With this patch we account for much more of the vmalloc and slab memory allocated for the VM. Signed-off-by: Ben Gardon Reviewed-by: Shakeel Butt Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 8 ++++++-- arch/x86/kvm/vmx/vmx.c | 27 ++++++++++++++++++--------- arch/x86/kvm/vmx/vmx.h | 5 +++-- 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 11431bb4c7ed..a4b1b5a50489 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4166,11 +4166,11 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (r < 0) goto out_vmcs02; - vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL); + vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); if (!vmx->nested.cached_vmcs12) goto out_cached_vmcs12; - vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL); + vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); if (!vmx->nested.cached_shadow_vmcs12) goto out_cached_shadow_vmcs12; @@ -5715,6 +5715,10 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) enable_shadow_vmcs = 0; if (enable_shadow_vmcs) { for (i = 0; i < VMX_BITMAP_NR; i++) { + /* + * The vmx_bitmap is not tied to a VM and so should + * not be charged to a memcg. + */ vmx_bitmap[i] = (unsigned long *) __get_free_page(GFP_KERNEL); if (!vmx_bitmap[i]) { diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index de6a9e7c6471..4950bb20e06a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -246,6 +246,10 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages && !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) { + /* + * This allocation for vmx_l1d_flush_pages is not tied to a VM + * lifetime and so should not be charged to a memcg. 
+ */ page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER); if (!page) return -ENOMEM; @@ -2386,13 +2390,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, return 0; } -struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu) +struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) { int node = cpu_to_node(cpu); struct page *pages; struct vmcs *vmcs; - pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); + pages = __alloc_pages_node(node, flags, vmcs_config.order); if (!pages) return NULL; vmcs = page_address(pages); @@ -2439,7 +2443,8 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) loaded_vmcs_init(loaded_vmcs); if (cpu_has_vmx_msr_bitmap()) { - loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); + loaded_vmcs->msr_bitmap = (unsigned long *) + __get_free_page(GFP_KERNEL_ACCOUNT); if (!loaded_vmcs->msr_bitmap) goto out_vmcs; memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); @@ -2480,7 +2485,7 @@ static __init int alloc_kvm_area(void) for_each_possible_cpu(cpu) { struct vmcs *vmcs; - vmcs = alloc_vmcs_cpu(false, cpu); + vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL); if (!vmcs) { free_kvm_area(); return -ENOMEM; @@ -6530,7 +6535,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) static struct kvm *vmx_vm_alloc(void) { - struct kvm_vmx *kvm_vmx = vzalloc(sizeof(struct kvm_vmx)); + struct kvm_vmx *kvm_vmx = __vmalloc(sizeof(struct kvm_vmx), + GFP_KERNEL_ACCOUNT | __GFP_ZERO, + PAGE_KERNEL); return &kvm_vmx->kvm; } @@ -6557,14 +6564,16 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) { int err; - struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + struct vcpu_vmx *vmx; unsigned long *msr_bitmap; int cpu; + vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); if (!vmx) return ERR_PTR(-ENOMEM); - vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL); + vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, + GFP_KERNEL_ACCOUNT); if (!vmx->vcpu.arch.guest_fpu) { printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); err = -ENOMEM; @@ -6586,12 +6595,12 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) * for the guest, etc. 
*/ if (enable_pml) { - vmx->pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); + vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!vmx->pml_pg) goto uninit_vcpu; } - vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); + vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT); BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0]) > PAGE_SIZE); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index d7d1048d221b..1554cb45b393 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -479,7 +479,7 @@ static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) return &(to_vmx(vcpu)->pi_desc); } -struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu); +struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags); void free_vmcs(struct vmcs *vmcs); int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs); void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs); @@ -488,7 +488,8 @@ void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs); static inline struct vmcs *alloc_vmcs(bool shadow) { - return alloc_vmcs_cpu(shadow, raw_smp_processor_id()); + return alloc_vmcs_cpu(shadow, raw_smp_processor_id(), + GFP_KERNEL_ACCOUNT); } u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); From 152482580a1b0accb60676063a1ac57b2d12daf6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 12:54:17 -0800 Subject: [PATCH 194/855] KVM: Call kvm_arch_memslots_updated() before updating memslots kvm_arch_memslots_updated() is at this point in time an x86-specific hook for handling MMIO generation wraparound. x86 stashes 19 bits of the memslots generation number in its MMIO sptes in order to avoid full page fault walks for repeat faults on emulated MMIO addresses. Because only 19 bits are used, wrapping the MMIO generation number is possible, if unlikely. kvm_arch_memslots_updated() alerts x86 that the generation has changed so that it can invalidate all MMIO sptes in case the effective MMIO generation has wrapped so as to avoid using a stale spte, e.g. a (very) old spte that was created with generation==0. Given that the purpose of kvm_arch_memslots_updated() is to prevent consuming stale entries, it needs to be called before the new generation is propagated to memslots. Invalidating the MMIO sptes after updating memslots means that there is a window where a vCPU could dereference the new memslots generation, e.g. 0, and incorrectly reuse an old MMIO spte that was created with (pre-wrap) generation==0. 
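Schematically, the window being closed looks like this (pseudocode, not the actual diff):

	/* old ordering in install_new_memslots() */
	slots->generation = new_gen;		/* vCPUs may already observe new_gen */
	kvm_arch_memslots_updated(kvm, ...);	/* stale MMIO sptes zapped only here */

A vCPU that faults between the two statements sees new_gen (e.g. 0 after a wrap) and can reuse an old MMIO spte created with the pre-wrap generation 0, because the zap has not happened yet. The fix below invokes the arch hook first and publishes the new generation afterwards.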
Fixes: e59dbe09f8e6 ("KVM: Introduce kvm_arch_memslots_updated()") Cc: Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/kvm_host.h | 2 +- arch/powerpc/include/asm/kvm_host.h | 2 +- arch/s390/include/asm/kvm_host.h | 2 +- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 4 ++-- arch/x86/kvm/x86.c | 4 ++-- include/linux/kvm_host.h | 2 +- virt/kvm/arm/mmu.c | 2 +- virt/kvm/kvm_main.c | 7 +++++-- 9 files changed, 15 insertions(+), 12 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index d2abd98471e8..41204a49cf95 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -1134,7 +1134,7 @@ static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {} -static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {} +static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 0f98f00da2ea..19693b8add93 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -837,7 +837,7 @@ struct kvm_vcpu_arch { static inline void kvm_arch_hardware_disable(void) {} static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} -static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {} +static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {} static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_exit(void) {} diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index d5d24889c3bc..c2b8c8c6c9be 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -878,7 +878,7 @@ static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {} -static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {} +static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {} static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) {} diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0e2ef41efb9d..c4758e1a8843 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1254,7 +1254,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask); void kvm_mmu_zap_all(struct kvm *kvm); -void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots); +void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int 
kvm_nr_mmu_pages); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 415d0e62cb3e..a53a0e7ad9e6 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5893,13 +5893,13 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); } -void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots) +void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) { /* * The very rare case: if the generation-number is round, * zap all shadow pages. */ - if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) { + if (unlikely((gen & MMIO_GEN_MASK) == 0)) { kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_invalidate_zap_all_pages(kvm); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3de586f89730..03d26ffb29cd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9357,13 +9357,13 @@ out_free: return -ENOMEM; } -void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) +void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) { /* * memslots->generation has been incremented. * mmio generation may have reached its maximum value. */ - kvm_mmu_invalidate_mmio_sptes(kvm, slots); + kvm_mmu_invalidate_mmio_sptes(kvm, gen); } int kvm_arch_prepare_memory_region(struct kvm *kvm, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index c38cc5eb7e73..cf761ff58224 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -634,7 +634,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont); int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, unsigned long npages); -void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots); +void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen); int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, const struct kvm_userspace_memory_region *mem, diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index fbdf3ac2f001..e0355e0f8712 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -2350,7 +2350,7 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, return 0; } -void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) +void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) { } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0a0ea8f4bb1b..d54f6578a849 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -874,6 +874,7 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm, int as_id, struct kvm_memslots *slots) { struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id); + u64 gen; /* * Set the low bit in the generation, which disables SPTE caching @@ -896,9 +897,11 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm, * space 0 will use generations 0, 4, 8, ... while * address space 1 will * use generations 2, 6, 10, 14, ... 
*/ - slots->generation += KVM_ADDRESS_SPACE_NUM * 2 - 1; + gen = slots->generation + KVM_ADDRESS_SPACE_NUM * 2 - 1; - kvm_arch_memslots_updated(kvm, slots); + kvm_arch_memslots_updated(kvm, gen); + + slots->generation = gen; return old_memslots; } From e1359e2beb8b0a1188abc997273acbaedc8ee791 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:12 -0800 Subject: [PATCH 195/855] KVM: x86/mmu: Detect MMIO generation wrap in any address space The check to detect a wrap of the MMIO generation explicitly looks for a generation number of zero. Now that unique memslots generation numbers are assigned to each address space, only address space 0 will get a generation number of exactly zero when wrapping. E.g. when address space 1 goes from 0x7fffe to 0x80002, the MMIO generation number will wrap to 0x2. Adjust the MMIO generation to strip the address space modifier prior to checking for a wrap. Fixes: 4bd518f1598d ("KVM: use separate generations for each address space") Cc: Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a53a0e7ad9e6..c2f2c9de63ed 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5895,11 +5895,28 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) { + gen &= MMIO_GEN_MASK; + /* - * The very rare case: if the generation-number is round, + * Shift to eliminate the "update in-progress" flag, which isn't + * included in the spte's generation number. + */ + gen >>= 1; + + /* + * Generation numbers are incremented in multiples of the number of + * address spaces in order to provide unique generations across all + * address spaces. Strip what is effectively the address space + * modifier prior to checking for a wrap of the MMIO generation so + * that a wrap in any address space is detected. + */ + gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1); + + /* + * The very rare case: if the MMIO generation number has wrapped, * zap all shadow pages. */ - if (unlikely((gen & MMIO_GEN_MASK) == 0)) { + if (unlikely(gen == 0)) { kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_invalidate_zap_all_pages(kvm); } From ddfd1730fd829743e41213e32ccc8b4aa6dc8325 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:13 -0800 Subject: [PATCH 196/855] KVM: x86/mmu: Do not cache MMIO accesses while memslots are in flux When installing new memslots, KVM sets bit 0 of the generation number to indicate that an update is in-progress. Until the update is complete, there are no guarantees as to whether a vCPU will see the old or the new memslots. Explicity prevent caching MMIO accesses so as to avoid using an access cached from the old memslots after the new memslots have been installed. Note that it is unclear whether or not disabling caching during the update window is strictly necessary as there is no definitive documentation as to what ordering guarantees KVM provides with respect to updating memslots. That being said, the MMIO spte code does not allow reusing sptes created while an update is in-progress, and the associated documentation explicitly states: We do not want to use an MMIO sptes created with an odd generation number, ... If KVM is unlucky and creates an MMIO spte while the low bit is 1, the next access to the spte will always be a cache miss. 
At the very least, disabling the per-vCPU MMIO cache during updates will make its behavior consistent with the MMIO spte behavior and documentation. Fixes: 56f17dd3fbc4 ("kvm: x86: fix stale mmio cache bug") Cc: Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 224cd0a47568..20ede17202bf 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -181,6 +181,11 @@ static inline bool emul_is_noncanonical_address(u64 la, static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, unsigned access) { + u64 gen = kvm_memslots(vcpu->kvm)->generation; + + if (unlikely(gen & 1)) + return; + /* * If this is a shadow nested page table, the "GVA" is * actually a nGPA. @@ -188,7 +193,7 @@ static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu, vcpu->arch.mmio_gva = mmu_is_nested(vcpu) ? 0 : gva & PAGE_MASK; vcpu->arch.access = access; vcpu->arch.mmio_gfn = gfn; - vcpu->arch.mmio_gen = kvm_memslots(vcpu->kvm)->generation; + vcpu->arch.mmio_gen = gen; } static inline bool vcpu_match_mmio_gen(struct kvm_vcpu *vcpu) From 361209e054a2c9f34da090ee1ee4c1e8bfe76a64 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:14 -0800 Subject: [PATCH 197/855] KVM: Explicitly define the "memslot update in-progress" bit KVM uses bit 0 of the memslots generation as an "update in-progress" flag, which is used by x86 to prevent caching MMIO access while the memslots are changing. Although the intended behavior is flag-like, e.g. MMIO sptes intentionally drop the in-progress bit so as to avoid caching data from in-flux memslots, the implementation oftentimes treats the bit as part of the generation number itself, e.g. incrementing the generation increments twice, once to set the flag and once to clear it. Prior to commit 4bd518f1598d ("KVM: use separate generations for each address space"), incorporating the "update in-progress" bit into the generation number largely made sense, e.g. "real" generations are even, "bogus" generations are odd, most code doesn't need to be aware of the bit, etc... Now that unique memslots generation numbers are assigned to each address space, stealthing the in-progress status into the generation number results in a wide variety of subtle code, e.g. kvm_create_vm() jumps over bit 0 when initializing the memslots generation without any hint as to why. Explicitly define the flag and convert as much code as possible (which isn't much) to actually treat it like a flag. This paves the way for eventually using a different bit for "update in-progress" so that it can be a flag in truth instead of a awkward extension to the generation number. 
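As a rough standalone illustration of the flag semantics described above (the names and helpers below are made up for this sketch and are not KVM's code), bit 0 is only ever a marker, while the real counter advances separately when the update completes:

    #include <stdbool.h>
    #include <stdint.h>

    #define GEN_UPDATE_IN_PROGRESS 1ULL          /* bit 0: update in progress */

    /* Readers must not cache anything derived from an in-flux generation. */
    static bool gen_is_stable(uint64_t gen)
    {
            return !(gen & GEN_UPDATE_IN_PROGRESS);
    }

    /* Start of an update: set the flag, leave the counter alone. */
    static uint64_t gen_begin_update(uint64_t gen)
    {
            return gen | GEN_UPDATE_IN_PROGRESS;
    }

    /*
     * End of an update: drop the flag and advance the counter by twice the
     * number of address spaces, since bit 0 is consumed by the flag and each
     * address space keeps its own interleaved sequence of generations.
     */
    static uint64_t gen_end_update(uint64_t gen, unsigned int nr_address_spaces)
    {
            return (gen & ~GEN_UPDATE_IN_PROGRESS) + nr_address_spaces * 2;
    }

The actual kernel change follows; the sketch only mirrors the bit-0 scheme used at this point in the series (later patches move the flag to bit 63).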
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.h | 2 +- include/linux/kvm_host.h | 21 +++++++++++++++++++++ virt/kvm/kvm_main.c | 26 +++++++++++++------------- 3 files changed, 35 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 20ede17202bf..28406aa1136d 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -183,7 +183,7 @@ static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu, { u64 gen = kvm_memslots(vcpu->kvm)->generation; - if (unlikely(gen & 1)) + if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS)) return; /* diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index cf761ff58224..5e1cb74922b3 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -48,6 +48,27 @@ */ #define KVM_MEMSLOT_INVALID (1UL << 16) +/* + * Bit 0 of the memslot generation number is an "update in-progress flag", + * e.g. is temporarily set for the duration of install_new_memslots(). + * This flag effectively creates a unique generation number that is used to + * mark cached memslot data, e.g. MMIO accesses, as potentially being stale, + * i.e. may (or may not) have come from the previous memslots generation. + * + * This is necessary because the actual memslots update is not atomic with + * respect to the generation number update. Updating the generation number + * first would allow a vCPU to cache a spte from the old memslots using the + * new generation number, and updating the generation number after switching + * to the new memslots would allow cache hits using the old generation number + * to reference the defunct memslots. + * + * This mechanism is used to prevent getting hits in KVM's caches while a + * memslot update is in-progress, and to prevent cache hits *after* updating + * the actual generation number against accesses that were inserted into the + * cache *before* the memslots were updated. + */ +#define KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS BIT_ULL(0) + /* Two fragments for cross MMIO pages. */ #define KVM_MAX_MMIO_FRAGMENTS 2 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d54f6578a849..0f1f1c7c7a36 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -874,30 +874,30 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm, int as_id, struct kvm_memslots *slots) { struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id); - u64 gen; + u64 gen = old_memslots->generation; - /* - * Set the low bit in the generation, which disables SPTE caching - * until the end of synchronize_srcu_expedited. - */ - WARN_ON(old_memslots->generation & 1); - slots->generation = old_memslots->generation + 1; + WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); + slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; rcu_assign_pointer(kvm->memslots[as_id], slots); synchronize_srcu_expedited(&kvm->srcu); /* - * Increment the new memslot generation a second time. This prevents - * vm exits that race with memslot updates from caching a memslot - * generation that will (potentially) be valid forever. - * + * Increment the new memslot generation a second time, dropping the + * update in-progress flag and incrementing then generation based on + * the number of address spaces. This provides a unique and easily + * identifiable generation number while the memslots are in flux. + */ + gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; + + /* * Generations must be unique even across address spaces. 
We do not need * a global counter for that, instead the generation space is evenly split * across address spaces. For example, with two address spaces, address - * space 0 will use generations 0, 4, 8, ... while * address space 1 will + * space 0 will use generations 0, 4, 8, ... while address space 1 will * use generations 2, 6, 10, 14, ... */ - gen = slots->generation + KVM_ADDRESS_SPACE_NUM * 2 - 1; + gen += KVM_ADDRESS_SPACE_NUM * 2; kvm_arch_memslots_updated(kvm, gen); From 5192f9b976f9687569a90602b8a6c053da4498f6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:15 -0800 Subject: [PATCH 198/855] KVM: x86: Use a u64 when passing the MMIO gen around KVM currently uses an 'unsigned int' for the MMIO generation number despite it being derived from the 64-bit memslots generation and being propagated to (potentially) 64-bit sptes. There is no hidden agenda behind using an 'unsigned int', it's done simply because the MMIO generation will never set bits above bit 19. Passing a u64 will allow the "update in-progress" flag to be relocated from bit 0 to bit 63 and removes the need to cast the generation back to a u64 when propagating it to a spte. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c2f2c9de63ed..54bb706e40e8 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -348,20 +348,20 @@ static inline bool is_access_track_spte(u64 spte) #define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 2) #define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1) -static u64 generation_mmio_spte_mask(unsigned int gen) +static u64 generation_mmio_spte_mask(u64 gen) { u64 mask; WARN_ON(gen & ~MMIO_GEN_MASK); mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT; - mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT; + mask |= (gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT; return mask; } -static unsigned int get_mmio_spte_generation(u64 spte) +static u64 get_mmio_spte_generation(u64 spte) { - unsigned int gen; + u64 gen; spte &= ~shadow_mmio_mask; @@ -370,7 +370,7 @@ static unsigned int get_mmio_spte_generation(u64 spte) return gen; } -static unsigned int kvm_current_mmio_generation(struct kvm_vcpu *vcpu) +static u64 kvm_current_mmio_generation(struct kvm_vcpu *vcpu) { return kvm_vcpu_memslots(vcpu)->generation & MMIO_GEN_MASK; } @@ -378,7 +378,7 @@ static unsigned int kvm_current_mmio_generation(struct kvm_vcpu *vcpu) static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, unsigned access) { - unsigned int gen = kvm_current_mmio_generation(vcpu); + u64 gen = kvm_current_mmio_generation(vcpu); u64 mask = generation_mmio_spte_mask(gen); u64 gpa = gfn << PAGE_SHIFT; @@ -426,7 +426,7 @@ static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) { - unsigned int kvm_gen, spte_gen; + u64 kvm_gen, spte_gen; kvm_gen = kvm_current_mmio_generation(vcpu); spte_gen = get_mmio_spte_generation(spte); From cae7ed3c2cb06680400adab632a243c5e5f42637 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:16 -0800 Subject: [PATCH 199/855] KVM: x86: Refactor the MMIO SPTE generation handling The code to propagate the memslots generation number into MMIO sptes is a bit convoluted. The "what" is relatively straightfoward, e.g. 
the comment explaining which bits go where is quite readable, but the "how" requires a lot of staring to understand what is happening. For example, 'MMIO_GEN_LOW_SHIFT' is actually used to calculate the high bits of the spte, while 'MMIO_SPTE_GEN_LOW_SHIFT' is used to calculate the low bits. Refactor the code to: - use #defines whose values align with the bits defined in the comment - use consistent code for both the high and low mask - explicitly highlight the handling of bit 0 (update in-progress flag) - explicitly call out that the defines are for MMIO sptes (to avoid confusion with the per-vCPU MMIO cache, which uses the full memslots generation) In addition to making the code a little less magical, this paves the way for moving the update in-progress flag to bit 63 without having to simultaneously rewrite all of the MMIO spte code. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 72 ++++++++++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 31 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 54bb706e40e8..364b2a737d94 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -332,30 +332,41 @@ static inline bool is_access_track_spte(u64 spte) } /* - * the low bit of the generation number is always presumed to be zero. - * This disables mmio caching during memslot updates. The concept is - * similar to a seqcount but instead of retrying the access we just punt - * and ignore the cache. + * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of + * the memslots generation and is derived as follows: * - * spte bits 3-11 are used as bits 1-9 of the generation number, - * the bits 52-61 are used as bits 10-19 of the generation number. + * Bits 1-9 of the memslot generation are propagated to spte bits 3-11 + * Bits 10-19 of the memslot generation are propagated to spte bits 52-61 + * + * The MMIO generation starts at bit 1 of the memslots generation in order to + * skip over bit 0, the KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag. Including + * the flag would require stealing a bit from the "real" generation number and + * thus effectively halve the maximum number of MMIO generations that can be + * handled before encountering a wrap (which requires a full MMU zap). The + * flag is instead explicitly queried when checking for MMIO spte cache hits. 
*/ -#define MMIO_SPTE_GEN_LOW_SHIFT 2 -#define MMIO_SPTE_GEN_HIGH_SHIFT 52 +#define MMIO_SPTE_GEN_MASK GENMASK_ULL(19, 1) +#define MMIO_SPTE_GEN_SHIFT 1 -#define MMIO_GEN_SHIFT 20 -#define MMIO_GEN_LOW_SHIFT 10 -#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 2) -#define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1) +#define MMIO_SPTE_GEN_LOW_START 3 +#define MMIO_SPTE_GEN_LOW_END 11 +#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \ + MMIO_SPTE_GEN_LOW_START) +#define MMIO_SPTE_GEN_HIGH_START 52 +#define MMIO_SPTE_GEN_HIGH_END 61 +#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \ + MMIO_SPTE_GEN_HIGH_START) static u64 generation_mmio_spte_mask(u64 gen) { u64 mask; - WARN_ON(gen & ~MMIO_GEN_MASK); + WARN_ON(gen & ~MMIO_SPTE_GEN_MASK); - mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT; - mask |= (gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT; + gen >>= MMIO_SPTE_GEN_SHIFT; + + mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK; + mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK; return mask; } @@ -365,20 +376,15 @@ static u64 get_mmio_spte_generation(u64 spte) spte &= ~shadow_mmio_mask; - gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK; - gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT; - return gen; -} - -static u64 kvm_current_mmio_generation(struct kvm_vcpu *vcpu) -{ - return kvm_vcpu_memslots(vcpu)->generation & MMIO_GEN_MASK; + gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START; + gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START; + return gen << MMIO_SPTE_GEN_SHIFT; } static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, unsigned access) { - u64 gen = kvm_current_mmio_generation(vcpu); + u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK; u64 mask = generation_mmio_spte_mask(gen); u64 gpa = gfn << PAGE_SHIFT; @@ -409,7 +415,7 @@ static gfn_t get_mmio_spte_gfn(u64 spte) static unsigned get_mmio_spte_access(u64 spte) { - u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask; + u64 mask = generation_mmio_spte_mask(MMIO_SPTE_GEN_MASK) | shadow_mmio_mask; return (spte & ~mask) & ~PAGE_MASK; } @@ -426,9 +432,13 @@ static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) { - u64 kvm_gen, spte_gen; + u64 kvm_gen, spte_gen, gen; - kvm_gen = kvm_current_mmio_generation(vcpu); + gen = kvm_vcpu_memslots(vcpu)->generation; + if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS)) + return false; + + kvm_gen = gen & MMIO_SPTE_GEN_MASK; spte_gen = get_mmio_spte_generation(spte); trace_check_mmio_spte(spte, kvm_gen, spte_gen); @@ -5895,13 +5905,13 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) { - gen &= MMIO_GEN_MASK; + gen &= MMIO_SPTE_GEN_MASK; /* - * Shift to eliminate the "update in-progress" flag, which isn't - * included in the spte's generation number. + * Shift to adjust for the "update in-progress" flag, which isn't + * included in the MMIO generation number. 
*/ - gen >>= 1; + gen >>= MMIO_SPTE_GEN_SHIFT; /* * Generation numbers are incremented in multiples of the number of From 0e32958ec449a9bb63c031ed04ac7a494ea1bc1c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:17 -0800 Subject: [PATCH 200/855] KVM: Remove the hack to trigger memslot generation wraparound x86 captures a subset of the memslot generation (19 bits) in its MMIO sptes so that it can expedite emulated MMIO handling by checking only the releveant spte, i.e. doesn't need to do a full page fault walk. Because the MMIO sptes capture only 19 bits (due to limited space in the sptes), there is a non-zero probability that the MMIO generation could wrap, e.g. after 500k memslot updates. Since normal usage is extremely unlikely to result in 500k memslot updates, a hack was added by commit 69c9ea93eaea ("KVM: MMU: init kvm generation close to mmio wrap-around value") to offset the MMIO generation in order to trigger a wraparound, e.g. after 150 memslot updates. When separate memslot generation sequences were assigned to each address space, commit 00f034a12fdd ("KVM: do not bias the generation number in kvm_current_mmio_generation") moved the offset logic into the initialization of the memslot generation itself so that the per-address space bit(s) were not dropped/corrupted by the MMIO shenanigans. Remove the offset hack for three reasons: - While it does exercise x86's kvm_mmu_invalidate_mmio_sptes(), simply wrapping the generation doesn't actually test the interesting case of having stale MMIO sptes with the new generation number, e.g. old sptes with a generation number of 0. - Triggering kvm_mmu_invalidate_mmio_sptes() prematurely makes its performance rather important since the probability of invalidating MMIO sptes jumps from "effectively never" to "fairly likely". This limits what can be done in future patches, e.g. to simplify the invalidation code, as doing so without proper caution could lead to a noticeable performance regression. - Forcing the memslots generation, which is a 64-bit number, to wrap prevents KVM from assuming the memslots generation will never wrap. This in turn prevents KVM from using an arbitrary bit for the "update in-progress" flag, e.g. using bit 63 would immediately collide with using a large value as the starting generation number. The "update in-progress" flag is effectively forced into bit 0 so that it's (subtly) taken into account when incrementing the generation. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0f1f1c7c7a36..5c2e7e173a46 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -656,12 +656,8 @@ static struct kvm *kvm_create_vm(unsigned long type) struct kvm_memslots *slots = kvm_alloc_memslots(); if (!slots) goto out_err_no_srcu; - /* - * Generations must be different for each address space. - * Init kvm generation close to the maximum to easily test the - * code of handling generation number wrap-around. - */ - slots->generation = i * 2 - 150; + /* Generations must be different for each address space. 
*/ + slots->generation = i * 2; rcu_assign_pointer(kvm->memslots[i], slots); } From 164bf7e56c5a73f2f819c39ba7e0f20e0f97dc7b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:18 -0800 Subject: [PATCH 201/855] KVM: Move the memslot update in-progress flag to bit 63 ...now that KVM won't explode by moving it out of bit 0. Using bit 63 eliminates the need to jump over bit 0, e.g. when calculating a new memslots generation or when propagating the memslots generation to an MMIO spte. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/mmu.txt | 13 ++++++++----- arch/x86/kvm/mmu.c | 31 ++++++++++++------------------- include/linux/kvm_host.h | 4 ++-- virt/kvm/kvm_main.c | 8 ++++---- 4 files changed, 26 insertions(+), 30 deletions(-) diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt index e507a9e0421e..367a952f50ab 100644 --- a/Documentation/virtual/kvm/mmu.txt +++ b/Documentation/virtual/kvm/mmu.txt @@ -452,13 +452,16 @@ stored into the MMIO spte. Thus, the MMIO spte might be created based on out-of-date information, but with an up-to-date generation number. To avoid this, the generation number is incremented again after synchronize_srcu -returns; thus, the low bit of kvm_memslots(kvm)->generation is only 1 during a +returns; thus, bit 63 of kvm_memslots(kvm)->generation set to 1 only during a memslot update, while some SRCU readers might be using the old copy. We do not want to use an MMIO sptes created with an odd generation number, and we can do -this without losing a bit in the MMIO spte. The low bit of the generation -is not stored in MMIO spte, and presumed zero when it is extracted out of the -spte. If KVM is unlucky and creates an MMIO spte while the low bit is 1, -the next access to the spte will always be a cache miss. +this without losing a bit in the MMIO spte. The "update in-progress" bit of the +generation is not stored in MMIO spte, and is so is implicitly zero when the +generation is extracted out of the spte. If KVM is unlucky and creates an MMIO +spte while an update is in-progress, the next access to the spte will always be +a cache miss. For example, a subsequent access during the update window will +miss due to the in-progress flag diverging, while an access after the update +window closes will have a higher generation number (as compared to the spte). Further reading diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 364b2a737d94..bcf62e1e1ff7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -335,18 +335,17 @@ static inline bool is_access_track_spte(u64 spte) * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of * the memslots generation and is derived as follows: * - * Bits 1-9 of the memslot generation are propagated to spte bits 3-11 - * Bits 10-19 of the memslot generation are propagated to spte bits 52-61 + * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11 + * Bits 9-18 of the MMIO generation are propagated to spte bits 52-61 * - * The MMIO generation starts at bit 1 of the memslots generation in order to - * skip over bit 0, the KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag. Including - * the flag would require stealing a bit from the "real" generation number and - * thus effectively halve the maximum number of MMIO generations that can be - * handled before encountering a wrap (which requires a full MMU zap). The - * flag is instead explicitly queried when checking for MMIO spte cache hits. 
+ * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in + * the MMIO generation number, as doing so would require stealing a bit from + * the "real" generation number and thus effectively halve the maximum number + * of MMIO generations that can be handled before encountering a wrap (which + * requires a full MMU zap). The flag is instead explicitly queried when + * checking for MMIO spte cache hits. */ -#define MMIO_SPTE_GEN_MASK GENMASK_ULL(19, 1) -#define MMIO_SPTE_GEN_SHIFT 1 +#define MMIO_SPTE_GEN_MASK GENMASK_ULL(18, 0) #define MMIO_SPTE_GEN_LOW_START 3 #define MMIO_SPTE_GEN_LOW_END 11 @@ -363,8 +362,6 @@ static u64 generation_mmio_spte_mask(u64 gen) WARN_ON(gen & ~MMIO_SPTE_GEN_MASK); - gen >>= MMIO_SPTE_GEN_SHIFT; - mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK; mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK; return mask; @@ -378,7 +375,7 @@ static u64 get_mmio_spte_generation(u64 spte) gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START; gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START; - return gen << MMIO_SPTE_GEN_SHIFT; + return gen; } static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, @@ -5905,13 +5902,9 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) { - gen &= MMIO_SPTE_GEN_MASK; + WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); - /* - * Shift to adjust for the "update in-progress" flag, which isn't - * included in the MMIO generation number. - */ - gen >>= MMIO_SPTE_GEN_SHIFT; + gen &= MMIO_SPTE_GEN_MASK; /* * Generation numbers are incremented in multiples of the number of diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5e1cb74922b3..85c0c00d5159 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -49,7 +49,7 @@ #define KVM_MEMSLOT_INVALID (1UL << 16) /* - * Bit 0 of the memslot generation number is an "update in-progress flag", + * Bit 63 of the memslot generation number is an "update in-progress flag", * e.g. is temporarily set for the duration of install_new_memslots(). * This flag effectively creates a unique generation number that is used to * mark cached memslot data, e.g. MMIO accesses, as potentially being stale, @@ -67,7 +67,7 @@ * the actual generation number against accesses that were inserted into the * cache *before* the memslots were updated. */ -#define KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS BIT_ULL(0) +#define KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS BIT_ULL(63) /* Two fragments for cross MMIO pages. */ #define KVM_MAX_MMIO_FRAGMENTS 2 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5c2e7e173a46..c9d0bc01f8cb 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -657,7 +657,7 @@ static struct kvm *kvm_create_vm(unsigned long type) if (!slots) goto out_err_no_srcu; /* Generations must be different for each address space. */ - slots->generation = i * 2; + slots->generation = i; rcu_assign_pointer(kvm->memslots[i], slots); } @@ -890,10 +890,10 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm, * Generations must be unique even across address spaces. We do not need * a global counter for that, instead the generation space is evenly split * across address spaces. For example, with two address spaces, address - * space 0 will use generations 0, 4, 8, ... while address space 1 will - * use generations 2, 6, 10, 14, ... + * space 0 will use generations 0, 2, 4, ... 
while address space 1 will + * use generations 1, 3, 5, ... */ - gen += KVM_ADDRESS_SPACE_NUM * 2; + gen += KVM_ADDRESS_SPACE_NUM; kvm_arch_memslots_updated(kvm, gen); From 85875a133ea3aca5e4ea423c7cb991ad53f4866e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:19 -0800 Subject: [PATCH 202/855] KVM: x86/mmu: Move slot_level_*() helper functions up a few lines ...so that kvm_mmu_invalidate_zap_pages_in_memslot() can utilize the helpers in future patches. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 139 +++++++++++++++++++++++---------------------- 1 file changed, 70 insertions(+), 69 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index bcf62e1e1ff7..5a8af2bcd891 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5487,6 +5487,76 @@ void kvm_disable_tdp(void) } EXPORT_SYMBOL_GPL(kvm_disable_tdp); + +/* The return value indicates if tlb flush on all vcpus is needed. */ +typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head); + +/* The caller should hold mmu-lock before calling this function. */ +static __always_inline bool +slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, int start_level, int end_level, + gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb) +{ + struct slot_rmap_walk_iterator iterator; + bool flush = false; + + for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn, + end_gfn, &iterator) { + if (iterator.rmap) + flush |= fn(kvm, iterator.rmap); + + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + if (flush && lock_flush_tlb) { + kvm_flush_remote_tlbs(kvm); + flush = false; + } + cond_resched_lock(&kvm->mmu_lock); + } + } + + if (flush && lock_flush_tlb) { + kvm_flush_remote_tlbs(kvm); + flush = false; + } + + return flush; +} + +static __always_inline bool +slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, int start_level, int end_level, + bool lock_flush_tlb) +{ + return slot_handle_level_range(kvm, memslot, fn, start_level, + end_level, memslot->base_gfn, + memslot->base_gfn + memslot->npages - 1, + lock_flush_tlb); +} + +static __always_inline bool +slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, bool lock_flush_tlb) +{ + return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, + PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); +} + +static __always_inline bool +slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, bool lock_flush_tlb) +{ + return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1, + PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); +} + +static __always_inline bool +slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, bool lock_flush_tlb) +{ + return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, + PT_PAGE_TABLE_LEVEL, lock_flush_tlb); +} + static void free_mmu_pages(struct kvm_vcpu *vcpu) { free_page((unsigned long)vcpu->arch.mmu->pae_root); @@ -5561,75 +5631,6 @@ void kvm_mmu_uninit_vm(struct kvm *kvm) kvm_page_track_unregister_notifier(kvm, node); } -/* The return value indicates if tlb flush on all vcpus is needed. */ -typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head); - -/* The caller should hold mmu-lock before calling this function. 
*/ -static __always_inline bool -slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, - slot_level_handler fn, int start_level, int end_level, - gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb) -{ - struct slot_rmap_walk_iterator iterator; - bool flush = false; - - for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn, - end_gfn, &iterator) { - if (iterator.rmap) - flush |= fn(kvm, iterator.rmap); - - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { - if (flush && lock_flush_tlb) { - kvm_flush_remote_tlbs(kvm); - flush = false; - } - cond_resched_lock(&kvm->mmu_lock); - } - } - - if (flush && lock_flush_tlb) { - kvm_flush_remote_tlbs(kvm); - flush = false; - } - - return flush; -} - -static __always_inline bool -slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, - slot_level_handler fn, int start_level, int end_level, - bool lock_flush_tlb) -{ - return slot_handle_level_range(kvm, memslot, fn, start_level, - end_level, memslot->base_gfn, - memslot->base_gfn + memslot->npages - 1, - lock_flush_tlb); -} - -static __always_inline bool -slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot, - slot_level_handler fn, bool lock_flush_tlb) -{ - return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, - PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); -} - -static __always_inline bool -slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot, - slot_level_handler fn, bool lock_flush_tlb) -{ - return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1, - PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); -} - -static __always_inline bool -slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, - slot_level_handler fn, bool lock_flush_tlb) -{ - return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, - PT_PAGE_TABLE_LEVEL, lock_flush_tlb); -} - void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { struct kvm_memslots *slots; From a21136345cb6f1a5b7f576701b6a454da5b6e606 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:20 -0800 Subject: [PATCH 203/855] KVM: x86/mmu: Split remote_flush+zap case out of kvm_mmu_flush_or_zap() ...and into a separate helper, kvm_mmu_remote_flush_or_zap(), that does not require a vcpu so that the code can be (re)used by kvm_mmu_invalidate_zap_pages_in_memslot(). 
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5a8af2bcd891..1cce120f06ae 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2240,18 +2240,28 @@ static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return true; } +static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm, + struct list_head *invalid_list, + bool remote_flush) +{ + if (!remote_flush && !list_empty(invalid_list)) + return false; + + if (!list_empty(invalid_list)) + kvm_mmu_commit_zap_page(kvm, invalid_list); + else + kvm_flush_remote_tlbs(kvm); + return true; +} + static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu, struct list_head *invalid_list, bool remote_flush, bool local_flush) { - if (!list_empty(invalid_list)) { - kvm_mmu_commit_zap_page(vcpu->kvm, invalid_list); + if (kvm_mmu_remote_flush_or_zap(vcpu->kvm, invalid_list, remote_flush)) return; - } - if (remote_flush) - kvm_flush_remote_tlbs(vcpu->kvm); - else if (local_flush) + if (local_flush) kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } From 4e103134b862314dc2f2f18f2fb0ab972adc3f5f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:21 -0800 Subject: [PATCH 204/855] KVM: x86/mmu: Zap only the relevant pages when removing a memslot Modify kvm_mmu_invalidate_zap_pages_in_memslot(), a.k.a. the x86 MMU's handler for kvm_arch_flush_shadow_memslot(), to zap only the pages/PTEs that actually belong to the memslot being removed. This improves performance, especially why the deleted memslot has only a few shadow entries, or even no entries. E.g. a microbenchmark to access regular memory while concurrently reading PCI ROM to trigger memslot deletion showed a 5% improvement in throughput. Cc: Xiao Guangrong Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1cce120f06ae..b81e2cad0237 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5622,7 +5622,38 @@ static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, struct kvm_page_track_notifier_node *node) { - kvm_mmu_invalidate_zap_all_pages(kvm); + struct kvm_mmu_page *sp; + LIST_HEAD(invalid_list); + unsigned long i; + bool flush; + gfn_t gfn; + + spin_lock(&kvm->mmu_lock); + + if (list_empty(&kvm->arch.active_mmu_pages)) + goto out_unlock; + + flush = slot_handle_all_level(kvm, slot, kvm_zap_rmapp, false); + + for (i = 0; i < slot->npages; i++) { + gfn = slot->base_gfn + i; + + for_each_valid_sp(kvm, sp, gfn) { + if (sp->gfn != gfn) + continue; + + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + } + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); + flush = false; + cond_resched_lock(&kvm->mmu_lock); + } + } + kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); + +out_unlock: + spin_unlock(&kvm->mmu_lock); } void kvm_mmu_init_vm(struct kvm *kvm) From a592a3b8fc62af25a6e76aebde97a5d5f6f13e0f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:22 -0800 Subject: [PATCH 205/855] Revert "KVM: MMU: document fast invalidate all pages" Remove x86 KVM's fast invalidate mechanism, i.e. revert all patches from the original series[1]. 
Though not explicitly stated, for all intents and purposes the fast invalidate mechanism was added to speed up the scenario where removing a memslot, e.g. as part of accessing reading PCI ROM, caused KVM to flush all shadow entries[1]. Now that the memslot case flushes only shadow entries belonging to the memslot, i.e. doesn't use the fast invalidate mechanism, the only remaining usage of the mechanism are when the VM is being destroyed and when the MMIO generation rolls over. When a VM is being destroyed, either there are no active vcpus, i.e. there's no lock contention, or the VM has ungracefully terminated, in which case we want to reclaim its pages as quickly as possible, i.e. not release the MMU lock if there are still CPUs executing in the VM. The MMIO generation scenario is almost literally a one-in-a-million occurrence, i.e. is not a performance sensitive scenario. Given that lock-breaking is not desirable (VM teardown) or irrelevant (MMIO generation overflow), remove the fast invalidate mechanism to simplify the code (a small amount) and to discourage future code from zapping all pages as using such a big hammer should be a last restort. This reverts commit f6f8adeef542a18b1cb26a0b772c9781a10bb477. [1] https://lkml.kernel.org/r/1369960590-14138-1-git-send-email-xiaoguangrong@linux.vnet.ibm.com Cc: Xiao Guangrong Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/mmu.txt | 28 +--------------------------- arch/x86/include/asm/kvm_host.h | 3 --- 2 files changed, 1 insertion(+), 30 deletions(-) diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt index 367a952f50ab..f365102c80f5 100644 --- a/Documentation/virtual/kvm/mmu.txt +++ b/Documentation/virtual/kvm/mmu.txt @@ -224,10 +224,6 @@ Shadow pages contain the following information: A bitmap indicating which sptes in spt point (directly or indirectly) at pages that may be unsynchronized. Used to quickly locate all unsychronized pages reachable from a given page. - mmu_valid_gen: - Generation number of the page. It is compared with kvm->arch.mmu_valid_gen - during hash table lookup, and used to skip invalidated shadow pages (see - "Zapping all pages" below.) clear_spte_count: Only present on 32-bit hosts, where a 64-bit spte cannot be written atomically. The reader uses this while running out of the MMU lock @@ -402,27 +398,6 @@ causes its disallow_lpage to be incremented, thus preventing instantiation of a large spte. The frames at the end of an unaligned memory slot have artificially inflated ->disallow_lpages so they can never be instantiated. -Zapping all pages (page generation count) -========================================= - -For the large memory guests, walking and zapping all pages is really slow -(because there are a lot of pages), and also blocks memory accesses of -all VCPUs because it needs to hold the MMU lock. - -To make it be more scalable, kvm maintains a global generation number -which is stored in kvm->arch.mmu_valid_gen. Every shadow page stores -the current global generation-number into sp->mmu_valid_gen when it -is created. Pages with a mismatching generation number are "obsolete". - -When KVM need zap all shadow pages sptes, it just simply increases the global -generation-number then reload root shadow pages on all vcpus. As the VCPUs -create new shadow page tables, the old pages are not used because of the -mismatching generation number. - -KVM then walks through all pages and zaps obsolete pages. 
While the zap -operation needs to take the MMU lock, the lock can be released periodically -so that the VCPUs can make progress. - Fast invalidation of MMIO sptes =============================== @@ -435,8 +410,7 @@ shadow pages, and is made more scalable with a similar technique. MMIO sptes have a few spare bits, which are used to store a generation number. The global generation number is stored in kvm_memslots(kvm)->generation, and increased whenever guest memory info -changes. This generation number is distinct from the one described in -the previous section. +changes. When KVM finds an MMIO spte, it checks the generation number of the spte. If the generation number of the spte does not equal the global generation diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c4758e1a8843..69daa57b08a7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -333,10 +333,7 @@ struct kvm_mmu_page { int root_count; /* Currently serving as active root */ unsigned int unsync_children; struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ - - /* The page is obsolete if mmu_valid_gen != kvm->arch.mmu_valid_gen. */ unsigned long mmu_valid_gen; - DECLARE_BITMAP(unsync_child_bitmap, 512); #ifdef CONFIG_X86_32 From 4771450c345dc5e3e3417d82aff62e0d88e7eee6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:23 -0800 Subject: [PATCH 206/855] Revert "KVM: MMU: drop kvm_mmu_zap_mmio_sptes" Revert back to a dedicated (and slower) mechanism for handling the scenario where all MMIO shadow PTEs need to be zapped due to overflowing the MMIO generation number. The MMIO generation scenario is almost literally a one-in-a-million occurrence, i.e. is not a performance sensitive scenario. Restoring kvm_mmu_zap_mmio_sptes() leaves VM teardown as the only user of kvm_mmu_invalidate_zap_all_pages() and paves the way for removing the fast invalidate mechanism altogether. This reverts commit a8eca9dcc656a405a28ffba43f3d86a1ff0eb331. 
Cc: Xiao Guangrong Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.c | 22 +++++++++++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 69daa57b08a7..aeca3fb1cf63 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -319,6 +319,7 @@ struct kvm_mmu_page { struct list_head link; struct hlist_node hash_link; bool unsync; + bool mmio_cached; /* * The following two entries are used to key the shadow page in the diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b81e2cad0237..d80c1558b23c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -391,6 +391,8 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, mask |= (gpa & shadow_nonpresent_or_rsvd_mask) << shadow_nonpresent_or_rsvd_mask_len; + page_header(__pa(sptep))->mmio_cached = true; + trace_mark_mmio_spte(sptep, gfn, access, gen); mmu_spte_set(sptep, mask); } @@ -5942,6 +5944,24 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); } +static void kvm_mmu_zap_mmio_sptes(struct kvm *kvm) +{ + struct kvm_mmu_page *sp, *node; + LIST_HEAD(invalid_list); + + spin_lock(&kvm->mmu_lock); +restart: + list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { + if (!sp->mmio_cached) + continue; + if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) + goto restart; + } + + kvm_mmu_commit_zap_page(kvm, &invalid_list); + spin_unlock(&kvm->mmu_lock); +} + void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) { WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); @@ -5963,7 +5983,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) */ if (unlikely(gen == 0)) { kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n"); - kvm_mmu_invalidate_zap_all_pages(kvm); + kvm_mmu_zap_mmio_sptes(kvm); } } From 571c5af06e303b4a69016193fd6b5afbc96eac40 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:24 -0800 Subject: [PATCH 207/855] KVM: x86/mmu: Voluntarily reschedule as needed when zapping MMIO sptes Call cond_resched_lock() when zapping MMIO to reschedule if needed or to release and reacquire mmu_lock in case of contention. There is no need to flush or zap when temporarily dropping mmu_lock as zapping MMIO sptes is done when holding the memslots lock and with the "update in-progress" bit set in the memslots generation, which disables MMIO spte caching. The walk does need to be restarted if mmu_lock is dropped as the active pages list may be modified. 
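In other words, the zap loop restarts whenever the active pages list may have changed under it, either because pages were zapped or because mmu_lock was dropped. A sketch of that pattern with explanatory comments added (this paraphrases the hunk below rather than introducing new behavior):

    restart:
            list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
                    if (!sp->mmio_cached)
                            continue;
                    /*
                     * kvm_mmu_prepare_zap_page() returns nonzero if pages were
                     * zapped (the list changed); cond_resched_lock() returns
                     * nonzero if it dropped and reacquired mmu_lock (the list
                     * may have changed).  Either way, restart the walk.
                     */
                    if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list) ||
                        cond_resched_lock(&kvm->mmu_lock))
                            goto restart;
            }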
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d80c1558b23c..2190679eda39 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5954,7 +5954,8 @@ restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { if (!sp->mmio_cached) continue; - if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) + if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list) || + cond_resched_lock(&kvm->mmu_lock)) goto restart; } From 5ff0568374ed2e585376a3832857ade5daccd381 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:25 -0800 Subject: [PATCH 208/855] KVM: x86/mmu: Remove is_obsolete() call Unwinding usage of is_obsolete() is a step towards removing x86's fast invalidate mechanism, i.e. this is one part of a revert all patches from the series that introduced the mechanism[1]. This is a partial revert of commit 05988d728dcd ("KVM: MMU: reduce KVM_REQ_MMU_RELOAD when root page is zapped"). [1] https://lkml.kernel.org/r/1369960590-14138-1-git-send-email-xiaoguangrong@linux.vnet.ibm.com Cc: Xiao Guangrong Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 2190679eda39..6cbffc775220 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2713,11 +2713,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, } else { list_move(&sp->link, &kvm->arch.active_mmu_pages); - /* - * The obsolete pages can not be used on any vcpus. - * See the comments in kvm_mmu_invalidate_zap_all_pages(). - */ - if (!sp->role.invalid && !is_obsolete_sp(kvm, sp)) + if (!sp->role.invalid) kvm_reload_remote_mmus(kvm); } From 52d5dedc79bdcbac2976159a172069618cf31be5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:26 -0800 Subject: [PATCH 209/855] Revert "KVM: MMU: reclaim the zapped-obsolete page first" Unwinding optimizations related to obsolete pages is a step towards removing x86 KVM's fast invalidate mechanism, i.e. this is one part of a revert all patches from the series that introduced the mechanism[1]. This reverts commit 365c886860c4ba670d245e762b23987c912c129a. [1] https://lkml.kernel.org/r/1369960590-14138-1-git-send-email-xiaoguangrong@linux.vnet.ibm.com Cc: Xiao Guangrong Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/mmu.c | 21 ++++----------------- arch/x86/kvm/x86.c | 1 - 3 files changed, 4 insertions(+), 19 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index aeca3fb1cf63..fbe16a908076 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -851,7 +851,6 @@ struct kvm_arch { * Hash table of struct kvm_mmu_page. 
*/ struct list_head active_mmu_pages; - struct list_head zapped_obsolete_pages; struct kvm_page_track_notifier_node mmu_sp_tracker; struct kvm_page_track_notifier_head track_notifier_head; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6cbffc775220..255b0212fc5b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5858,6 +5858,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); static void kvm_zap_obsolete_pages(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; + LIST_HEAD(invalid_list); int batch = 0; restart: @@ -5890,8 +5891,7 @@ restart: goto restart; } - ret = kvm_mmu_prepare_zap_page(kvm, sp, - &kvm->arch.zapped_obsolete_pages); + ret = kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); batch += ret; if (ret) @@ -5902,7 +5902,7 @@ restart: * Should flush tlb before free page tables since lockless-walking * may use the pages. */ - kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages); + kvm_mmu_commit_zap_page(kvm, &invalid_list); } /* @@ -5935,11 +5935,6 @@ void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm) spin_unlock(&kvm->mmu_lock); } -static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) -{ - return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); -} - static void kvm_mmu_zap_mmio_sptes(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; @@ -6011,24 +6006,16 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) * want to shrink a VM that only started to populate its MMU * anyway. */ - if (!kvm->arch.n_used_mmu_pages && - !kvm_has_zapped_obsolete_pages(kvm)) + if (!kvm->arch.n_used_mmu_pages) continue; idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); - if (kvm_has_zapped_obsolete_pages(kvm)) { - kvm_mmu_commit_zap_page(kvm, - &kvm->arch.zapped_obsolete_pages); - goto unlock; - } - if (prepare_zap_oldest_mmu_page(kvm, &invalid_list)) freed++; kvm_mmu_commit_zap_page(kvm, &invalid_list); -unlock: spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 03d26ffb29cd..78fb13f190a3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9113,7 +9113,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); - INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); atomic_set(&kvm->arch.noncoherent_dma_count, 0); From 210f494261e1e84ad1f15877baa1c615afe3b342 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:27 -0800 Subject: [PATCH 210/855] Revert "KVM: MMU: collapse TLB flushes when zap all pages" Unwinding optimizations related to obsolete pages is a step towards removing x86 KVM's fast invalidate mechanism, i.e. this is one part of a revert all patches from the series that introduced the mechanism[1]. This reverts commit f34d251d66ba263c077ed9d2bbd1874339a4c887. 
[1] https://lkml.kernel.org/r/1369960590-14138-1-git-send-email-xiaoguangrong@linux.vnet.ibm.com Cc: Xiao Guangrong Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 31 +++---------------------------- 1 file changed, 3 insertions(+), 28 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 255b0212fc5b..e733262027ed 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2211,14 +2211,6 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list); -/* - * NOTE: we should pay more attention on the zapped-obsolete page - * (is_obsolete_sp(sp) && sp->role.invalid) when you do hash list walk - * since it has been deleted from active_mmu_pages but still can be found - * at hast list. - * - * for_each_valid_sp() has skipped that kind of pages. - */ #define for_each_valid_sp(_kvm, _sp, _gfn) \ hlist_for_each_entry(_sp, \ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ @@ -5881,13 +5873,11 @@ restart: if (sp->role.invalid) continue; - /* - * Need not flush tlb since we only zap the sp with invalid - * generation number. - */ if (batch >= BATCH_ZAP_PAGES && - cond_resched_lock(&kvm->mmu_lock)) { + (need_resched() || spin_needbreak(&kvm->mmu_lock))) { batch = 0; + kvm_mmu_commit_zap_page(kvm, &invalid_list); + cond_resched_lock(&kvm->mmu_lock); goto restart; } @@ -5898,10 +5888,6 @@ restart: goto restart; } - /* - * Should flush tlb before free page tables since lockless-walking - * may use the pages. - */ kvm_mmu_commit_zap_page(kvm, &invalid_list); } @@ -5920,17 +5906,6 @@ void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm) trace_kvm_mmu_invalidate_zap_all_pages(kvm); kvm->arch.mmu_valid_gen++; - /* - * Notify all vcpus to reload its shadow page table - * and flush TLB. Then all vcpus will switch to new - * shadow page table with the new mmu_valid_gen. - * - * Note: we should do this under the protection of - * mmu-lock, otherwise, vcpu would purge shadow page - * but miss tlb flush. - */ - kvm_reload_remote_mmus(kvm); - kvm_zap_obsolete_pages(kvm); spin_unlock(&kvm->mmu_lock); } From 43d2b14b105fb00b8864c7b0ee7043cc1cc4a969 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:28 -0800 Subject: [PATCH 211/855] Revert "KVM: MMU: zap pages in batch" Unwinding optimizations related to obsolete pages is a step towards removing x86 KVM's fast invalidate mechanism, i.e. this is one part of a revert all patches from the series that introduced the mechanism[1]. This reverts commit e7d11c7a894986a13817c1c001e1e7668c5c4eb4. 
[1] https://lkml.kernel.org/r/1369960590-14138-1-git-send-email-xiaoguangrong@linux.vnet.ibm.com Cc: Xiao Guangrong Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e733262027ed..cb9fd69d2632 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5846,18 +5846,14 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm, } EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); -#define BATCH_ZAP_PAGES 10 static void kvm_zap_obsolete_pages(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; LIST_HEAD(invalid_list); - int batch = 0; restart: list_for_each_entry_safe_reverse(sp, node, &kvm->arch.active_mmu_pages, link) { - int ret; - /* * No obsolete page exists before new created page since * active_mmu_pages is the FIFO list. @@ -5866,6 +5862,28 @@ restart: break; /* + * Do not repeatedly zap a root page to avoid unnecessary + * KVM_REQ_MMU_RELOAD, otherwise we may not be able to + * progress: + * vcpu 0 vcpu 1 + * call vcpu_enter_guest(): + * 1): handle KVM_REQ_MMU_RELOAD + * and require mmu-lock to + * load mmu + * repeat: + * 1): zap root page and + * send KVM_REQ_MMU_RELOAD + * + * 2): if (cond_resched_lock(mmu-lock)) + * + * 2): hold mmu-lock and load mmu + * + * 3): see KVM_REQ_MMU_RELOAD bit + * on vcpu->requests is set + * then return 1 to call + * vcpu_enter_guest() again. + * goto repeat; + * * Since we are reversely walking the list and the invalid * list will be moved to the head, skip the invalid page * can help us to avoid the infinity list walking. @@ -5873,18 +5891,13 @@ restart: if (sp->role.invalid) continue; - if (batch >= BATCH_ZAP_PAGES && - (need_resched() || spin_needbreak(&kvm->mmu_lock))) { - batch = 0; + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { kvm_mmu_commit_zap_page(kvm, &invalid_list); cond_resched_lock(&kvm->mmu_lock); goto restart; } - ret = kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - batch += ret; - - if (ret) + if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) goto restart; } From 42560fb1f3c6c7f730897b7fa7a478bc37e0be50 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:29 -0800 Subject: [PATCH 212/855] Revert "KVM: MMU: add tracepoint for kvm_mmu_invalidate_all_pages" ...as part of removing x86 KVM's fast invalidate mechanism, i.e. this is one part of a revert all patches from the series that introduced the mechanism[1]. This reverts commit 35006126f024f68727c67001b9cb703c38f69268. 
[1] https://lkml.kernel.org/r/1369960590-14138-1-git-send-email-xiaoguangrong@linux.vnet.ibm.com Cc: Xiao Guangrong Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 1 - arch/x86/kvm/mmutrace.h | 21 --------------------- 2 files changed, 22 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index cb9fd69d2632..df4025e6f1e1 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5916,7 +5916,6 @@ restart: void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm) { spin_lock(&kvm->mmu_lock); - trace_kvm_mmu_invalidate_zap_all_pages(kvm); kvm->arch.mmu_valid_gen++; kvm_zap_obsolete_pages(kvm); diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index c73bf4e4988c..cac88910b310 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -282,27 +282,6 @@ TRACE_EVENT( ) ); -TRACE_EVENT( - kvm_mmu_invalidate_zap_all_pages, - TP_PROTO(struct kvm *kvm), - TP_ARGS(kvm), - - TP_STRUCT__entry( - __field(unsigned long, mmu_valid_gen) - __field(unsigned int, mmu_used_pages) - ), - - TP_fast_assign( - __entry->mmu_valid_gen = kvm->arch.mmu_valid_gen; - __entry->mmu_used_pages = kvm->arch.n_used_mmu_pages; - ), - - TP_printk("kvm-mmu-valid-gen %lx used_pages %x", - __entry->mmu_valid_gen, __entry->mmu_used_pages - ) -); - - TRACE_EVENT( check_mmio_spte, TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen), From b59c4830ca185ba0e9f9e046fb1cd10a4a92627a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:30 -0800 Subject: [PATCH 213/855] Revert "KVM: MMU: show mmu_valid_gen in shadow page related tracepoints" ...as part of removing x86 KVM's fast invalidate mechanism, i.e. this is one part of a revert all patches from the series that introduced the mechanism[1]. This reverts commit 2248b023219251908aedda0621251cffc548f258. [1] https://lkml.kernel.org/r/1369960590-14138-1-git-send-email-xiaoguangrong@linux.vnet.ibm.com Cc: Xiao Guangrong Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmutrace.h | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index cac88910b310..9f6c855a0043 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -8,18 +8,16 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM kvmmmu -#define KVM_MMU_PAGE_FIELDS \ - __field(unsigned long, mmu_valid_gen) \ - __field(__u64, gfn) \ - __field(__u32, role) \ - __field(__u32, root_count) \ +#define KVM_MMU_PAGE_FIELDS \ + __field(__u64, gfn) \ + __field(__u32, role) \ + __field(__u32, root_count) \ __field(bool, unsync) -#define KVM_MMU_PAGE_ASSIGN(sp) \ - __entry->mmu_valid_gen = sp->mmu_valid_gen; \ - __entry->gfn = sp->gfn; \ - __entry->role = sp->role.word; \ - __entry->root_count = sp->root_count; \ +#define KVM_MMU_PAGE_ASSIGN(sp) \ + __entry->gfn = sp->gfn; \ + __entry->role = sp->role.word; \ + __entry->root_count = sp->root_count; \ __entry->unsync = sp->unsync; #define KVM_MMU_PAGE_PRINTK() ({ \ @@ -31,9 +29,8 @@ \ role.word = __entry->role; \ \ - trace_seq_printf(p, "sp gen %lx gfn %llx l%u%s q%u%s %s%s" \ + trace_seq_printf(p, "sp gfn %llx l%u%s q%u%s %s%s" \ " %snxe %sad root %u %s%c", \ - __entry->mmu_valid_gen, \ __entry->gfn, role.level, \ role.cr4_pae ? 
" pae" : "", \ role.quadrant, \ From 7390de1e99a70895721165d0ccd4a6e16482960a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:31 -0800 Subject: [PATCH 214/855] Revert "KVM: x86: use the fast way to invalidate all pages" Revert to a slow kvm_mmu_zap_all() for kvm_arch_flush_shadow_all(). Flushing all shadow entries is only done during VM teardown, i.e. kvm_arch_flush_shadow_all() is only called when the associated MM struct is being released or when the VM instance is being freed. Although the performance of teardown itself isn't critical, KVM should still voluntarily schedule to play nice with the rest of the kernel; but that can be done without the fast invalidate mechanism in a future patch. This reverts commit 6ca18b6950f8dee29361722f28f69847724b276f. Cc: Xiao Guangrong Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 15 +++++++++++++++ arch/x86/kvm/x86.c | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index df4025e6f1e1..5cdeb88850f8 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5846,6 +5846,21 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm, } EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); +void kvm_mmu_zap_all(struct kvm *kvm) +{ + struct kvm_mmu_page *sp, *node; + LIST_HEAD(invalid_list); + + spin_lock(&kvm->mmu_lock); +restart: + list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) + if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) + goto restart; + + kvm_mmu_commit_zap_page(kvm, &invalid_list); + spin_unlock(&kvm->mmu_lock); +} + static void kvm_zap_obsolete_pages(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 78fb13f190a3..65e4559eef2f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9470,7 +9470,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, void kvm_arch_flush_shadow_all(struct kvm *kvm) { - kvm_mmu_invalidate_zap_all_pages(kvm); + kvm_mmu_zap_all(kvm); } void kvm_arch_flush_shadow_memslot(struct kvm *kvm, From 8a674adc11cd4cc59e51eaea6f0cc4b3d5710411 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:32 -0800 Subject: [PATCH 215/855] KVM: x86/mmu: skip over invalid root pages when zapping all sptes ...to guarantee forward progress. When zapped, root pages are marked invalid and moved to the head of the active pages list until they are explicitly freed. Theoretically, having unzappable root pages at the head of the list could prevent kvm_mmu_zap_all() from making forward progress were a future patch to add a loop restart after processing a page, e.g. to drop mmu_lock on contention. Although kvm_mmu_prepare_zap_page() can theoretically take action on invalid pages, e.g. to zap unsync children, functionally it's not necessary (root pages will be re-zapped when freed) and practically speaking the odds of e.g. @unsync or @unsync_children becoming %true while zapping all pages is basically nil. 
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5cdeb88850f8..c79ad7f31fdb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5853,9 +5853,12 @@ void kvm_mmu_zap_all(struct kvm *kvm) spin_lock(&kvm->mmu_lock); restart: - list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) + list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { + if (sp->role.invalid && sp->root_count) + continue; if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) goto restart; + } kvm_mmu_commit_zap_page(kvm, &invalid_list); spin_unlock(&kvm->mmu_lock); From 5d6317ca4e61a3fa7528f832cd945c42fde8e67f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:33 -0800 Subject: [PATCH 216/855] KVM: x86/mmu: Voluntarily reschedule as needed when zapping all sptes Call cond_resched_lock() when zapping all sptes to reschedule if needed or to release and reacquire mmu_lock in case of contention. There is no need to flush or zap when temporarily dropping mmu_lock as zapping all sptes is done only when the owning userspace VMM has exited or when the VM is being destroyed, i.e. there is no interplay with memslots or MMIO generations to worry about. Be paranoid and restart the walk if mmu_lock is dropped to avoid any potential issues with consuming a stale iterator. The overhead in doing so is negligible as at worst there will be a few root shadow pages at the head of the list, i.e. the iterator is essentially the head of the list already. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c79ad7f31fdb..fa153d771f47 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5856,7 +5856,8 @@ restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { if (sp->role.invalid && sp->root_count) continue; - if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) + if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list) || + cond_resched_lock(&kvm->mmu_lock)) goto restart; } From ea145aacf4ae8485cf179a4d0dc502e9f75044f4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:34 -0800 Subject: [PATCH 217/855] Revert "KVM: MMU: fast invalidate all pages" Remove x86 KVM's fast invalidate mechanism, i.e. revert all patches from the original series[1], now that all users of the fast invalidate mechanism are gone. This reverts commit 5304b8d37c2a5ebca48330f5e7868d240eafbed1. 
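The mechanism being deleted boils down to a per-VM generation number that is bumped once to make every existing shadow page "obsolete". A minimal standalone sketch of that idea, with invented toy_vm/toy_page types standing in for the real kvm and kvm_mmu_page structures:

#include <stdbool.h>
#include <stdio.h>

/* Only the generation bookkeeping is modelled here. */
struct toy_vm   { unsigned long mmu_valid_gen; };
struct toy_page { unsigned long mmu_valid_gen; };

static bool is_obsolete(const struct toy_vm *vm, const struct toy_page *p)
{
    return p->mmu_valid_gen != vm->mmu_valid_gen;
}

/* "Fast invalidate": O(1) at invalidation time; every existing page
 * becomes obsolete, gets skipped by later hash-list walks and is freed
 * lazily, with lock breaks, by the obsolete-page zapper. */
static void fast_invalidate_all(struct toy_vm *vm)
{
    vm->mmu_valid_gen++;
}

int main(void)
{
    struct toy_vm vm = { 0 };
    struct toy_page page = { vm.mmu_valid_gen };    /* snapshot at creation */

    printf("obsolete before invalidate: %d\n", is_obsolete(&vm, &page));
    fast_invalidate_all(&vm);
    printf("obsolete after invalidate:  %d\n", is_obsolete(&vm, &page));
    return 0;
}

Invalidation itself is O(1); the cost is deferred to the lazy zap of obsolete pages, which is the complexity the rest of this series removes.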
[1] https://lkml.kernel.org/r/1369960590-14138-1-git-send-email-xiaoguangrong@linux.vnet.ibm.com Cc: Xiao Guangrong Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 - arch/x86/kvm/mmu.c | 98 +-------------------------------- arch/x86/kvm/mmu.h | 1 - 3 files changed, 1 insertion(+), 100 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fbe16a908076..9417febf8490 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -334,7 +334,6 @@ struct kvm_mmu_page { int root_count; /* Currently serving as active root */ unsigned int unsync_children; struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ - unsigned long mmu_valid_gen; DECLARE_BITMAP(unsync_child_bitmap, 512); #ifdef CONFIG_X86_32 @@ -845,7 +844,6 @@ struct kvm_arch { unsigned int n_requested_mmu_pages; unsigned int n_max_mmu_pages; unsigned int indirect_shadow_pages; - unsigned long mmu_valid_gen; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; /* * Hash table of struct kvm_mmu_page. diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index fa153d771f47..6d602d4c3ca4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2060,12 +2060,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct if (!direct) sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); - - /* - * The active_mmu_pages list is the FIFO list, do not move the - * page until it is zapped. kvm_zap_obsolete_pages depends on - * this feature. See the comments in kvm_zap_obsolete_pages(). - */ list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); kvm_mod_used_mmu_pages(vcpu->kvm, +1); return sp; @@ -2214,7 +2208,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, #define for_each_valid_sp(_kvm, _sp, _gfn) \ hlist_for_each_entry(_sp, \ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ - if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) { \ + if ((_sp)->role.invalid) { \ } else #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \ @@ -2266,11 +2260,6 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { } static void mmu_audit_disable(void) { } #endif -static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); -} - static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { @@ -2495,7 +2484,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (level > PT_PAGE_TABLE_LEVEL && need_sync) flush |= kvm_sync_pages(vcpu, gfn, &invalid_list); } - sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; clear_page(sp->spt); trace_kvm_mmu_get_page(sp, true); @@ -4206,14 +4194,6 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, return false; if (cached_root_available(vcpu, new_cr3, new_role)) { - /* - * It is possible that the cached previous root page is - * obsolete because of a change in the MMU - * generation number. However, that is accompanied by - * KVM_REQ_MMU_RELOAD, which will free the root that we - * have set here and allocate a new one. 
- */ - kvm_make_request(KVM_REQ_LOAD_CR3, vcpu); if (!skip_tlb_flush) { kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); @@ -5865,82 +5845,6 @@ restart: spin_unlock(&kvm->mmu_lock); } -static void kvm_zap_obsolete_pages(struct kvm *kvm) -{ - struct kvm_mmu_page *sp, *node; - LIST_HEAD(invalid_list); - -restart: - list_for_each_entry_safe_reverse(sp, node, - &kvm->arch.active_mmu_pages, link) { - /* - * No obsolete page exists before new created page since - * active_mmu_pages is the FIFO list. - */ - if (!is_obsolete_sp(kvm, sp)) - break; - - /* - * Do not repeatedly zap a root page to avoid unnecessary - * KVM_REQ_MMU_RELOAD, otherwise we may not be able to - * progress: - * vcpu 0 vcpu 1 - * call vcpu_enter_guest(): - * 1): handle KVM_REQ_MMU_RELOAD - * and require mmu-lock to - * load mmu - * repeat: - * 1): zap root page and - * send KVM_REQ_MMU_RELOAD - * - * 2): if (cond_resched_lock(mmu-lock)) - * - * 2): hold mmu-lock and load mmu - * - * 3): see KVM_REQ_MMU_RELOAD bit - * on vcpu->requests is set - * then return 1 to call - * vcpu_enter_guest() again. - * goto repeat; - * - * Since we are reversely walking the list and the invalid - * list will be moved to the head, skip the invalid page - * can help us to avoid the infinity list walking. - */ - if (sp->role.invalid) - continue; - - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { - kvm_mmu_commit_zap_page(kvm, &invalid_list); - cond_resched_lock(&kvm->mmu_lock); - goto restart; - } - - if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) - goto restart; - } - - kvm_mmu_commit_zap_page(kvm, &invalid_list); -} - -/* - * Fast invalidate all shadow pages and use lock-break technique - * to zap obsolete pages. - * - * It's required when memslot is being deleted or VM is being - * destroyed, in these cases, we should ensure that KVM MMU does - * not use any resource of the being-deleted slot or all slots - * after calling the function. - */ -void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm) -{ - spin_lock(&kvm->mmu_lock); - kvm->arch.mmu_valid_gen++; - - kvm_zap_obsolete_pages(kvm); - spin_unlock(&kvm->mmu_lock); -} - static void kvm_mmu_zap_mmio_sptes(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index c7b333147c4a..bbdc60f2fae8 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -203,7 +203,6 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return -(u32)fault & errcode; } -void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm); void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); From 83cdb56864bcb1466b454f17fff47348ca7925a2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:35 -0800 Subject: [PATCH 218/855] KVM: x86/mmu: Differentiate between nr zapped and list unstable The return value of kvm_mmu_prepare_zap_page() has evolved to become overloaded to convey two separate pieces of information. 1) was at least one page zapped and 2) has the list of MMU pages become unstable. In it's original incarnation (as kvm_mmu_zap_page()), there was no return value at all. Commit 0738541396be ("KVM: MMU: awareness of new kvm_mmu_zap_page behaviour") added a return value in preparation for commit 4731d4c7a077 ("KVM: MMU: out of sync shadow core"). Although the return value was of type 'int', it was actually used as a boolean to indicate whether or not active_mmu_pages may have become unstable due to zapping children. 
Walking a list with list_for_each_entry_safe() only protects against deleting/moving the current entry, i.e. zapping a child page would break iteration due to modifying any number of entries. Later, commit 60c8aec6e2c9 ("KVM: MMU: use page array in unsync walk") modified mmu_zap_unsync_children() to return an approximation of the number of children zapped. This was not intentional, it was simply a side effect of how the code was written. The unintented side affect was then morphed into an actual feature by commit 77662e0028c7 ("KVM: MMU: fix kvm_mmu_zap_page() and its calling path"), which modified kvm_mmu_change_mmu_pages() to use the number of zapped pages when determining the number of MMU pages in use by the VM. Finally, commit 54a4f0239f2e ("KVM: MMU: make kvm_mmu_zap_page() return the number of pages it actually freed") added the initial page to the return value to make its behavior more consistent with what most users would expect. Incorporating the initial parent page in the return value of kvm_mmu_zap_page() breaks the original usage of restarting a list walk on a non-zero return value to handle a potentially unstable list, i.e. walks will unnecessarily restart when any page is zapped. Fix this by restoring the original behavior of kvm_mmu_zap_page(), i.e. return a boolean to indicate that the list may be unstable and move the number of zapped children to a dedicated parameter. Since the majority of callers to kvm_mmu_prepare_zap_page() don't care about either return value, preserve the current definition of kvm_mmu_prepare_zap_page() by making it a wrapper of a new helper, __kvm_mmu_prepare_zap_page(). This avoids having to update every call site and also provides cleaner code for functions that only care about the number of pages zapped. Fixes: 54a4f0239f2e ("KVM: MMU: make kvm_mmu_zap_page() return the number of pages it actually freed") Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6d602d4c3ca4..4b93fcdf0839 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2200,8 +2200,8 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) --kvm->stat.mmu_unsync; } -static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, - struct list_head *invalid_list); +static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, + struct list_head *invalid_list); static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list); @@ -2669,17 +2669,22 @@ static int mmu_zap_unsync_children(struct kvm *kvm, return zapped; } -static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, - struct list_head *invalid_list) +static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, + struct kvm_mmu_page *sp, + struct list_head *invalid_list, + int *nr_zapped) { - int ret; + bool list_unstable; trace_kvm_mmu_prepare_zap_page(sp); ++kvm->stat.mmu_shadow_zapped; - ret = mmu_zap_unsync_children(kvm, sp, invalid_list); + *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list); kvm_mmu_page_unlink_children(kvm, sp); kvm_mmu_unlink_parents(kvm, sp); + /* Zapping children means active_mmu_pages has become unstable. 
*/ + list_unstable = *nr_zapped; + if (!sp->role.invalid && !sp->role.direct) unaccount_shadowed(kvm, sp); @@ -2687,7 +2692,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, kvm_unlink_unsync_page(kvm, sp); if (!sp->root_count) { /* Count self */ - ret++; + (*nr_zapped)++; list_move(&sp->link, invalid_list); kvm_mod_used_mmu_pages(kvm, -1); } else { @@ -2698,7 +2703,16 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, } sp->role.invalid = 1; - return ret; + return list_unstable; +} + +static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, + struct list_head *invalid_list) +{ + int nr_zapped; + + __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped); + return nr_zapped; } static void kvm_mmu_commit_zap_page(struct kvm *kvm, @@ -5830,13 +5844,14 @@ void kvm_mmu_zap_all(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; LIST_HEAD(invalid_list); + int ign; spin_lock(&kvm->mmu_lock); restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { if (sp->role.invalid && sp->root_count) continue; - if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list) || + if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign) || cond_resched_lock(&kvm->mmu_lock)) goto restart; } @@ -5849,13 +5864,14 @@ static void kvm_mmu_zap_mmio_sptes(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; LIST_HEAD(invalid_list); + int ign; spin_lock(&kvm->mmu_lock); restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { if (!sp->mmio_cached) continue; - if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list) || + if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign) || cond_resched_lock(&kvm->mmu_lock)) goto restart; } From 24efe61f696c7b44a66c7d6a41c181b31aa338fc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:36 -0800 Subject: [PATCH 219/855] KVM: x86/mmu: WARN if zapping a MMIO spte results in zapping children Paolo expressed a concern that kvm_mmu_zap_mmio_sptes() could have a quadratic runtime[1], i.e. restarting the spte walk while zapping only MMIO sptes could result in re-walking large portions of the list over and over due to the non-MMIO sptes encountered before the restart not being removed. At the time, the concern was legitimate as the walk was restarted when any spte was zapped. But that is no longer the case as the walk is now restarted iff one or more children have been zapped, which is necessary because zapping children makes the active_mmu_pages list unstable. Furthermore, it should be impossible for an MMIO spte to have children, i.e. zapping an MMIO spte should never result in zapping children. In other words, kvm_mmu_zap_mmio_sptes() should never restart its walk, and so should always execute in linear time. WARN if this assertion fails. Although it should never be needed, leave the restart logic in place. In normal operation, the cost is at worst an extra CMP+Jcc, and if for some reason the list does become unstable, not restarting would likely crash KVM, or worse, the kernel. 
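The runtime concern can be quantified with a small standalone C model. The counts and the worst-case layout, with every non-MMIO spte sitting ahead of the MMIO ones on the list, are assumptions made purely for illustration:

#include <stdio.h>

/*
 * Toy cost model, not kernel code.  Worst case for the old behaviour:
 * all non-MMIO shadow pages precede the MMIO ones on active_mmu_pages,
 * and the walk restarts from the head after every zap.
 */
static long zap_mmio_with_restart(long n_other, long n_mmio)
{
    long visits = 0;

    for (long zapped = 0; zapped < n_mmio; zapped++)
        visits += n_other + 1;    /* re-walk the untouched prefix, zap one */
    return visits;
}

/* New behaviour: MMIO sptes have no children, so no zap forces a restart
 * and a single linear pass suffices. */
static long zap_mmio_linear(long n_other, long n_mmio)
{
    return n_other + n_mmio;
}

int main(void)
{
    long n_other = 100000, n_mmio = 1000;

    printf("restart after every zap: %ld visits\n",
           zap_mmio_with_restart(n_other, n_mmio));
    printf("single linear pass:      %ld visits\n",
           zap_mmio_linear(n_other, n_mmio));
    return 0;
}

Restarting after every zap revisits the untouched prefix each time, roughly n_mmio * n_other visits, whereas restarting only when children are zapped keeps the MMIO case to one linear pass.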
[1] https://patchwork.kernel.org/patch/10756589/#22452085 Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 4b93fcdf0839..81618070367b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5871,8 +5871,11 @@ restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { if (!sp->mmio_cached) continue; - if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign) || - cond_resched_lock(&kvm->mmu_lock)) + if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) { + WARN_ON_ONCE(1); + goto restart; + } + if (cond_resched_lock(&kvm->mmu_lock)) goto restart; } From 8ab3c471eef20925bf64c6d4fa46e88cdb4e86d5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:37 -0800 Subject: [PATCH 220/855] KVM: x86/mmu: Consolidate kvm_mmu_zap_all() and kvm_mmu_zap_mmio_sptes() ...via a new helper, __kvm_mmu_zap_all(). An alternative to passing a 'bool mmio_only' would be to pass a callback function to filter the shadow page, i.e. to make __kvm_mmu_zap_all() generic and reusable, but zapping all shadow pages is a last resort, i.e. making the helper less extensible is a feature of sorts. And the explicit MMIO parameter makes it easy to preserve the WARN_ON_ONCE() if a restart is triggered when zapping MMIO sptes. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 81618070367b..8d43b7c0f56f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5840,7 +5840,7 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm, } EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); -void kvm_mmu_zap_all(struct kvm *kvm) +static void __kvm_mmu_zap_all(struct kvm *kvm, bool mmio_only) { struct kvm_mmu_page *sp, *node; LIST_HEAD(invalid_list); @@ -5849,30 +5849,12 @@ void kvm_mmu_zap_all(struct kvm *kvm) spin_lock(&kvm->mmu_lock); restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { + if (mmio_only && !sp->mmio_cached) + continue; if (sp->role.invalid && sp->root_count) continue; - if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign) || - cond_resched_lock(&kvm->mmu_lock)) - goto restart; - } - - kvm_mmu_commit_zap_page(kvm, &invalid_list); - spin_unlock(&kvm->mmu_lock); -} - -static void kvm_mmu_zap_mmio_sptes(struct kvm *kvm) -{ - struct kvm_mmu_page *sp, *node; - LIST_HEAD(invalid_list); - int ign; - - spin_lock(&kvm->mmu_lock); -restart: - list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { - if (!sp->mmio_cached) - continue; if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) { - WARN_ON_ONCE(1); + WARN_ON_ONCE(mmio_only); goto restart; } if (cond_resched_lock(&kvm->mmu_lock)) @@ -5883,6 +5865,11 @@ restart: spin_unlock(&kvm->mmu_lock); } +void kvm_mmu_zap_all(struct kvm *kvm) +{ + return __kvm_mmu_zap_all(kvm, false); +} + void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) { WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); @@ -5904,7 +5891,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) */ if (unlikely(gen == 0)) { kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n"); - kvm_mmu_zap_mmio_sptes(kvm); + __kvm_mmu_zap_all(kvm, true); } } From 7fa08e71b4a0591a518814fa78b32e124f90d587 Mon Sep 17 00:00:00 2001 From: Nir Weiner 
Date: Sun, 27 Jan 2019 12:17:14 +0200 Subject: [PATCH 221/855] KVM: grow_halt_poll_ns() should never shrink vCPU halt_poll_ns grow_halt_poll_ns() have a strange behavior in case (halt_poll_ns_grow == 0) && (vcpu->halt_poll_ns != 0). In this case, vcpu->halt_pol_ns will be set to zero. That results in shrinking instead of growing. Fix issue by changing grow_halt_poll_ns() to not modify vcpu->halt_poll_ns in case halt_poll_ns_grow is zero Reviewed-by: Boris Ostrovsky Reviewed-by: Liran Alon Signed-off-by: Nir Weiner Suggested-by: Liran Alon Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/book3s_hv.c | 5 ++++- virt/kvm/kvm_main.c | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 5a066fc299e1..e316a2ddb70b 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3631,8 +3631,11 @@ static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc, static void grow_halt_poll_ns(struct kvmppc_vcore *vc) { + if (!halt_poll_ns_grow) + return; + /* 10us base */ - if (vc->halt_poll_ns == 0 && halt_poll_ns_grow) + if (vc->halt_poll_ns == 0) vc->halt_poll_ns = 10000; else vc->halt_poll_ns *= halt_poll_ns_grow; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index c9d0bc01f8cb..9c8a8bf6e686 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2188,8 +2188,11 @@ static void grow_halt_poll_ns(struct kvm_vcpu *vcpu) old = val = vcpu->halt_poll_ns; grow = READ_ONCE(halt_poll_ns_grow); + if (!grow) + goto out; + /* 10us base */ - if (val == 0 && grow) + if (val == 0) val = 10000; else val *= grow; @@ -2198,6 +2201,7 @@ static void grow_halt_poll_ns(struct kvm_vcpu *vcpu) val = halt_poll_ns; vcpu->halt_poll_ns = val; +out: trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old); } From 49113d360bdeb4dd916fb6bffbcc3e157422b6fd Mon Sep 17 00:00:00 2001 From: Nir Weiner Date: Sun, 27 Jan 2019 12:17:15 +0200 Subject: [PATCH 222/855] KVM: Expose the initial start value in grow_halt_poll_ns() as a module parameter The hard-coded value 10000 in grow_halt_poll_ns() stands for the initial start value when raising up vcpu->halt_poll_ns. It actually sets the first timeout to the first polling session. This value has significant effect on how tolerant we are to outliers. On the standard case, higher value is better - we will spend more time in the polling busyloop, handle events/interrupts faster and result in better performance. But on outliers it puts us in a busy loop that does nothing. Even if the shrink factor is zero, we will still waste time on the first iteration. The optimal value changes between different workloads. It depends on outliers rate and polling sessions length. As this value has significant effect on the dynamic halt-polling algorithm, it should be configurable and exposed. 
Reviewed-by: Boris Ostrovsky Reviewed-by: Liran Alon Signed-off-by: Nir Weiner Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/halt-polling.txt | 37 ++++++++++++++-------- arch/powerpc/kvm/book3s_hv.c | 3 +- include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 8 +++-- 4 files changed, 31 insertions(+), 18 deletions(-) diff --git a/Documentation/virtual/kvm/halt-polling.txt b/Documentation/virtual/kvm/halt-polling.txt index 4a8418318769..4f791b128dd2 100644 --- a/Documentation/virtual/kvm/halt-polling.txt +++ b/Documentation/virtual/kvm/halt-polling.txt @@ -53,7 +53,8 @@ the global max polling interval then the polling interval can be increased in the hope that next time during the longer polling interval the wake up source will be received while the host is polling and the latency benefits will be received. The polling interval is grown in the function grow_halt_poll_ns() and -is multiplied by the module parameter halt_poll_ns_grow. +is multiplied by the module parameters halt_poll_ns_grow and +halt_poll_ns_grow_start. In the event that the total block time was greater than the global max polling interval then the host will never poll for long enough (limited by the global @@ -80,22 +81,30 @@ shrunk. These variables are defined in include/linux/kvm_host.h and as module parameters in virt/kvm/kvm_main.c, or arch/powerpc/kvm/book3s_hv.c in the powerpc kvm-hv case. -Module Parameter | Description | Default Value +Module Parameter | Description | Default Value -------------------------------------------------------------------------------- -halt_poll_ns | The global max polling interval | KVM_HALT_POLL_NS_DEFAULT - | which defines the ceiling value | - | of the polling interval for | (per arch value) - | each vcpu. | +halt_poll_ns | The global max polling | KVM_HALT_POLL_NS_DEFAULT + | interval which defines | + | the ceiling value of the | + | polling interval for | (per arch value) + | each vcpu. | -------------------------------------------------------------------------------- -halt_poll_ns_grow | The value by which the halt | 2 - | polling interval is multiplied | - | in the grow_halt_poll_ns() | - | function. | +halt_poll_ns_grow | The value by which the | 2 + | halt polling interval is | + | multiplied in the | + | grow_halt_poll_ns() | + | function. | -------------------------------------------------------------------------------- -halt_poll_ns_shrink | The value by which the halt | 0 - | polling interval is divided in | - | the shrink_halt_poll_ns() | - | function. | +halt_poll_ns_grow_start | The initial value to grow | 10000 + | to from zero in the | + | grow_halt_poll_ns() | + | function. | +-------------------------------------------------------------------------------- +halt_poll_ns_shrink | The value by which the | 0 + | halt polling interval is | + | divided in the | + | shrink_halt_poll_ns() | + | function. 
| -------------------------------------------------------------------------------- These module parameters can be set from the debugfs files in: diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index e316a2ddb70b..29ffc99bd79b 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3634,9 +3634,8 @@ static void grow_halt_poll_ns(struct kvmppc_vcore *vc) if (!halt_poll_ns_grow) return; - /* 10us base */ if (vc->halt_poll_ns == 0) - vc->halt_poll_ns = 10000; + vc->halt_poll_ns = halt_poll_ns_grow_start; else vc->halt_poll_ns *= halt_poll_ns_grow; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 85c0c00d5159..9d55c63db09b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1203,6 +1203,7 @@ extern bool kvm_rebooting; extern unsigned int halt_poll_ns; extern unsigned int halt_poll_ns_grow; +extern unsigned int halt_poll_ns_grow_start; extern unsigned int halt_poll_ns_shrink; struct kvm_device { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 9c8a8bf6e686..ae818d27a1a4 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -81,6 +81,11 @@ unsigned int halt_poll_ns_grow = 2; module_param(halt_poll_ns_grow, uint, 0644); EXPORT_SYMBOL_GPL(halt_poll_ns_grow); +/* The start value to grow halt_poll_ns from */ +unsigned int halt_poll_ns_grow_start = 10000; /* 10us */ +module_param(halt_poll_ns_grow_start, uint, 0644); +EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start); + /* Default resets per-vcpu halt_poll_ns . */ unsigned int halt_poll_ns_shrink; module_param(halt_poll_ns_shrink, uint, 0644); @@ -2191,9 +2196,8 @@ static void grow_halt_poll_ns(struct kvm_vcpu *vcpu) if (!grow) goto out; - /* 10us base */ if (val == 0) - val = 10000; + val = halt_poll_ns_grow_start; else val *= grow; From dee339b5c1da3e6fa139b97f74f99dc9f0b03ff6 Mon Sep 17 00:00:00 2001 From: Nir Weiner Date: Sun, 27 Jan 2019 12:17:16 +0200 Subject: [PATCH 223/855] KVM: Never start grow vCPU halt_poll_ns from value below halt_poll_ns_grow_start grow_halt_poll_ns() have a strange behaviour in case (vcpu->halt_poll_ns != 0) && (vcpu->halt_poll_ns < halt_poll_ns_grow_start). In this case, vcpu->halt_poll_ns will be multiplied by grow factor (halt_poll_ns_grow) which will require several grow iteration in order to reach a value bigger than halt_poll_ns_grow_start. This means that growing vcpu->halt_poll_ns from value of 0 is slower than growing it from a positive value less than halt_poll_ns_grow_start. Which is misleading and inaccurate. Fix issue by changing grow_halt_poll_ns() to set vcpu->halt_poll_ns to halt_poll_ns_grow_start in any case that (vcpu->halt_poll_ns < halt_poll_ns_grow_start). Regardless if vcpu->halt_poll_ns is 0. use READ_ONCE to get a consistent number for all cases. 
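The before/after growth rules can be compared with a small standalone C model. This is not the kernel function itself: halt_poll_ns_grow and halt_poll_ns_grow_start mirror the upstream defaults, while the halt_poll_ns cap of 200000 and the 500ns starting point are chosen only for illustration.

#include <stdio.h>

static unsigned int halt_poll_ns = 200000;             /* illustrative cap */
static unsigned int halt_poll_ns_grow = 2;
static unsigned int halt_poll_ns_grow_start = 10000;   /* 10us */

/* Rule before this patch: only a literal zero snaps to grow_start. */
static unsigned int grow_old(unsigned int val)
{
    if (!halt_poll_ns_grow)
        return val;
    if (val == 0)
        val = halt_poll_ns_grow_start;
    else
        val *= halt_poll_ns_grow;
    return val > halt_poll_ns ? halt_poll_ns : val;
}

/* Rule after this patch: anything below grow_start snaps up to it. */
static unsigned int grow_new(unsigned int val)
{
    if (!halt_poll_ns_grow)
        return val;
    val *= halt_poll_ns_grow;
    if (val < halt_poll_ns_grow_start)
        val = halt_poll_ns_grow_start;
    return val > halt_poll_ns ? halt_poll_ns : val;
}

int main(void)
{
    unsigned int a = 500, b = 500;    /* 0.5us, below grow_start */

    for (int step = 0; step <= 5; step++) {
        printf("step %d: old rule %6u ns, new rule %6u ns\n", step, a, b);
        a = grow_old(a);
        b = grow_new(b);
    }
    return 0;
}

Starting from 500ns, the old rule needs five grow steps to pass halt_poll_ns_grow_start, while the new rule snaps to it on the first step, which is exactly the inconsistency described above.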
Reviewed-by: Boris Ostrovsky Reviewed-by: Liran Alon Signed-off-by: Nir Weiner Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/book3s_hv.c | 5 ++--- virt/kvm/kvm_main.c | 10 +++++----- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 29ffc99bd79b..062f3c92c871 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3634,10 +3634,9 @@ static void grow_halt_poll_ns(struct kvmppc_vcore *vc) if (!halt_poll_ns_grow) return; - if (vc->halt_poll_ns == 0) + vc->halt_poll_ns *= halt_poll_ns_grow; + if (vc->halt_poll_ns < halt_poll_ns_grow_start) vc->halt_poll_ns = halt_poll_ns_grow_start; - else - vc->halt_poll_ns *= halt_poll_ns_grow; } static void shrink_halt_poll_ns(struct kvmppc_vcore *vc) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ae818d27a1a4..5087cf703ed1 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2189,17 +2189,17 @@ void kvm_sigset_deactivate(struct kvm_vcpu *vcpu) static void grow_halt_poll_ns(struct kvm_vcpu *vcpu) { - unsigned int old, val, grow; + unsigned int old, val, grow, grow_start; old = val = vcpu->halt_poll_ns; + grow_start = READ_ONCE(halt_poll_ns_grow_start); grow = READ_ONCE(halt_poll_ns_grow); if (!grow) goto out; - if (val == 0) - val = halt_poll_ns_grow_start; - else - val *= grow; + val *= grow; + if (val < grow_start) + val = grow_start; if (val > halt_poll_ns) val = halt_poll_ns; From 7539b174aef405d9d57db48c58390ba360c91312 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 4 Jan 2019 15:54:14 -0200 Subject: [PATCH 224/855] x86: kvmguest: use TSC clocksource if invariant TSC is exposed The invariant TSC bit has the following meaning: "The time stamp counter in newer processors may support an enhancement, referred to as invariant TSC. Processor's support for invariant TSC is indicated by CPUID.80000007H:EDX[8]. The invariant TSC will run at a constant rate in all ACPI P-, C-. and T-states. This is the architectural behavior moving forward. On processors with invariant TSC support, the OS may use the TSC for wall clock timer services (instead of ACPI or HPET timers). TSC reads are much more efficient and do not incur the overhead associated with a ring transition or access to a platform resource." IOW, TSC does not change frequency. In such case, and with TSC scaling hardware available to handle migration, it is possible to use the TSC clocksource directly, whose system calls are faster. Reduce the rating of kvmclock clocksource to allow TSC clocksource to be the default if invariant TSC is exposed. Signed-off-by: Marcelo Tosatti v2: Use feature bits and tsc_unstable() check (Sean Christopherson) Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvmclock.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index d908a37bf3f3..904494b924c1 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -351,6 +351,20 @@ void __init kvmclock_init(void) machine_ops.crash_shutdown = kvm_crash_shutdown; #endif kvm_get_preset_lpj(); + + /* + * X86_FEATURE_NONSTOP_TSC is TSC runs at constant rate + * with P/T states and does not stop in deep C-states. + * + * Invariant TSC exposed by host means kvmclock is not necessary: + * can use TSC as clocksource. 
+ * + */ + if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && + boot_cpu_has(X86_FEATURE_NONSTOP_TSC) && + !check_tsc_unstable()) + kvm_clock.rating = 299; + clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); pv_info.name = "KVM"; } From a67794cafbc4594debf53dbe4e2a7708426f492e Mon Sep 17 00:00:00 2001 From: Lan Tianyu Date: Sat, 2 Feb 2019 17:20:27 +0800 Subject: [PATCH 225/855] Revert "KVM: Eliminate extra function calls in kvm_get_dirty_log_protect()" The value of "dirty_bitmap[i]" is already check before setting its value to mask. The following check of "mask" is redundant. The check of "mask" was introduced by commit 58d2930f4ee3 ("KVM: Eliminate extra function calls in kvm_get_dirty_log_protect()"), revert it. Signed-off-by: Lan Tianyu Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5087cf703ed1..276af92ace6c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1205,11 +1205,9 @@ int kvm_get_dirty_log_protect(struct kvm *kvm, mask = xchg(&dirty_bitmap[i], 0); dirty_bitmap_buffer[i] = mask; - if (mask) { - offset = i * BITS_PER_LONG; - kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, - offset, mask); - } + offset = i * BITS_PER_LONG; + kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, + offset, mask); } spin_unlock(&kvm->mmu_lock); } From e40542aff909ac34d2c24712c5c0769c8f77f895 Mon Sep 17 00:00:00 2001 From: Jordan Niethe Date: Thu, 21 Feb 2019 14:28:48 +1100 Subject: [PATCH 226/855] KVM: PPC: Book3S HV: Fix build failure without IOMMU support Currently trying to build without IOMMU support will fail: (.text+0x1380): undefined reference to `kvmppc_h_get_tce' (.text+0x1384): undefined reference to `kvmppc_rm_h_put_tce' (.text+0x149c): undefined reference to `kvmppc_rm_h_stuff_tce' (.text+0x14a0): undefined reference to `kvmppc_rm_h_put_tce_indirect' This happens because turning off IOMMU support will prevent book3s_64_vio_hv.c from being built because it is only built when SPAPR_TCE_IOMMU is set, which depends on IOMMU support. Fix it using ifdefs for the undefined references. 
Fixes: 76d837a4c0f9 ("KVM: PPC: Book3S PR: Don't include SPAPR TCE code on non-pseries platforms") Signed-off-by: Jordan Niethe Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 2 ++ arch/powerpc/kvm/book3s_hv_rmhandlers.S | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 1860c0bc1395..ba8db3881ef9 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -937,6 +937,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) ret = kvmppc_h_set_xdabr(vcpu, kvmppc_get_gpr(vcpu, 4), kvmppc_get_gpr(vcpu, 5)); break; +#ifdef CONFIG_SPAPR_TCE_IOMMU case H_GET_TCE: ret = kvmppc_h_get_tce(vcpu, kvmppc_get_gpr(vcpu, 4), kvmppc_get_gpr(vcpu, 5)); @@ -966,6 +967,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) if (ret == H_TOO_HARD) return RESUME_HOST; break; +#endif case H_RANDOM: if (!powernv_get_random_long(&vcpu->arch.regs.gpr[4])) ret = H_HARDWARE; diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 9b8d50a7cbaf..541b121477e4 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -2264,8 +2264,13 @@ hcall_real_table: .long DOTSYM(kvmppc_h_clear_mod) - hcall_real_table .long DOTSYM(kvmppc_h_clear_ref) - hcall_real_table .long DOTSYM(kvmppc_h_protect) - hcall_real_table +#ifdef CONFIG_SPAPR_TCE_IOMMU .long DOTSYM(kvmppc_h_get_tce) - hcall_real_table .long DOTSYM(kvmppc_rm_h_put_tce) - hcall_real_table +#else + .long 0 /* 0x1c */ + .long 0 /* 0x20 */ +#endif .long 0 /* 0x24 - H_SET_SPRG0 */ .long DOTSYM(kvmppc_h_set_dabr) - hcall_real_table .long 0 /* 0x2c */ @@ -2343,8 +2348,13 @@ hcall_real_table: .long 0 /* 0x12c */ .long 0 /* 0x130 */ .long DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table +#ifdef CONFIG_SPAPR_TCE_IOMMU .long DOTSYM(kvmppc_rm_h_stuff_tce) - hcall_real_table .long DOTSYM(kvmppc_rm_h_put_tce_indirect) - hcall_real_table +#else + .long 0 /* 0x138 */ + .long 0 /* 0x13c */ +#endif .long 0 /* 0x140 */ .long 0 /* 0x144 */ .long 0 /* 0x148 */ From 716cb1160819721c39f807e103d9c307dbca2cf4 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 21 Feb 2019 14:44:14 +1100 Subject: [PATCH 227/855] KVM: PPC: Book3S: Improve KVM reference counting The anon fd's ops releases the KVM reference in the release hook. However we reference the KVM object after we create the fd so there is small window when the release function can be called and dereferenced the KVM object which potentially may free it. It is not a problem at the moment as the file is created and KVM is referenced under the KVM lock and the release function obtains the same lock before dereferencing the KVM (although the lock is not held when calling kvm_put_kvm()) but it is potentially fragile against future changes. This references the KVM object before creating a file. 
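The ordering argument generalises beyond KVM; here is a minimal standalone sketch using a plain C11 atomic refcount. get_obj(), put_obj() and publish() are invented names, with publish() standing in for anon_inode_getfd(), i.e. the point after which another owner may drop its reference at any time.

#include <stdatomic.h>
#include <stdio.h>

struct obj { atomic_int refs; };

static void get_obj(struct obj *o)
{
    atomic_fetch_add(&o->refs, 1);
}

static void put_obj(struct obj *o)
{
    if (atomic_fetch_sub(&o->refs, 1) == 1)
        printf("last reference dropped, object freed\n");
}

static int publish(struct obj *o, int fail)
{
    (void)o;                /* the new owner would keep this pointer */
    return fail ? -1 : 3;   /* pretend fd 3 on success */
}

static int create(struct obj *o, int fail)
{
    int fd;

    /* Take the reference *before* publishing, so an immediate release by
     * the new owner cannot free the object while we still use it below. */
    get_obj(o);
    fd = publish(o, fail);
    if (fd < 0) {
        put_obj(o);         /* creation failed: return the reference */
        return fd;
    }
    /* safe to keep using 'o': even an immediate close by the new owner
     * only drops the reference that now belongs to the fd */
    return fd;
}

int main(void)
{
    struct obj o = { 1 };
    int fd;

    fd = create(&o, 0);
    printf("success path: fd %d, refs %d\n", fd, atomic_load(&o.refs));
    fd = create(&o, 1);
    printf("failure path: %d, refs %d\n", fd, atomic_load(&o.refs));
    return 0;
}

Taking the reference before publish() closes the window in which the new owner's release hook could drop a reference the creator still relies on; on failure the extra reference is simply given back.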
Signed-off-by: Alexey Kardashevskiy Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_vio.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 6630dde56668..f02b04973710 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -337,14 +337,15 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, } } + kvm_get_kvm(kvm); if (!ret) ret = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops, stt, O_RDWR | O_CLOEXEC); - if (ret >= 0) { + if (ret >= 0) list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables); - kvm_get_kvm(kvm); - } + else + kvm_put_kvm(kvm); mutex_unlock(&kvm->lock); From 7f5d9c1bc0e6c13ba01546e9c6fda083764a4e9c Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Fri, 22 Feb 2019 14:34:29 +0800 Subject: [PATCH 228/855] KVM: arm/arm64: Remove unused timer variable The 'timer' local variable became unused after commit bee038a67487 ("KVM: arm/arm64: Rework the timer code to use a timer_map"). Remove it to avoid [-Wunused-but-set-variable] warning. Cc: Christoffer Dall Cc: James Morse Cc: Suzuki K Pouloze Reviewed-by: Julien Thierry Signed-off-by: Shaokun Zhang Signed-off-by: Marc Zyngier --- virt/kvm/arm/arch_timer.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index af8f2f1d01cc..3417f2dbc366 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -236,14 +236,12 @@ static enum hrtimer_restart kvm_hrtimer_expire(struct hrtimer *hrt) static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) { - struct arch_timer_cpu *timer; enum kvm_arch_timers index; u64 cval, now; if (!timer_ctx) return false; - timer = vcpu_timer(timer_ctx->vcpu); index = arch_timer_ctx_index(timer_ctx); if (timer_ctx->loaded) { From c88b093693ccbe41991ef2e9b1d251945e6e54ed Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Thu, 21 Feb 2019 11:42:32 +0000 Subject: [PATCH 229/855] arm64: KVM: Fix architecturally invalid reset value for FPEXC32_EL2 Due to what looks like a typo dating back to the original addition of FPEXC32_EL2 handling, KVM currently initialises this register to an architecturally invalid value. As a result, the VECITR field (RES1) in bits [10:8] is initialised with 0, and the two reserved (RES0) bits [6:5] are initialised with 1. (In the Common VFP Subarchitecture as specified by ARMv7-A, these two bits were IMP DEF. ARMv8-A removes them.) This patch changes the reset value from 0x70 to 0x700, which reflects the architectural constraints and is presumably what was originally intended. 
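The arithmetic behind the fix can be checked with a tiny standalone C snippet; GENMASK is redefined locally so it compiles outside the kernel.

#include <stdio.h>

#define GENMASK(h, l)   (((1u << ((h) - (l) + 1)) - 1) << (l))

int main(void)
{
    unsigned int old_val = 0x70, new_val = 0x700;
    unsigned int vecitr = GENMASK(10, 8);   /* VECITR field, RES1 */
    unsigned int res0   = GENMASK(6, 5);    /* reserved bits, RES0 */

    printf("0x%03x: VECITR set: %s, RES0 bits set: %s\n", old_val,
           (old_val & vecitr) == vecitr ? "yes" : "no",
           (old_val & res0) ? "yes" : "no");
    printf("0x%03x: VECITR set: %s, RES0 bits set: %s\n", new_val,
           (new_val & vecitr) == vecitr ? "yes" : "no",
           (new_val & res0) ? "yes" : "no");
    return 0;
}

0x70 leaves the RES1 VECITR field [10:8] clear and sets the RES0 bits [6:5]; 0x700 does the opposite, matching the architectural constraints quoted above.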
Cc: # 4.12.x- Cc: Christoffer Dall Fixes: 62a89c44954f ("arm64: KVM: 32bit handling of coprocessor traps") Signed-off-by: Dave Martin Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 81a342679e60..a398d04274b7 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1523,7 +1523,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_DACR32_EL2), NULL, reset_unknown, DACR32_EL2 }, { SYS_DESC(SYS_IFSR32_EL2), NULL, reset_unknown, IFSR32_EL2 }, - { SYS_DESC(SYS_FPEXC32_EL2), NULL, reset_val, FPEXC32_EL2, 0x70 }, + { SYS_DESC(SYS_FPEXC32_EL2), NULL, reset_val, FPEXC32_EL2, 0x700 }, }; static bool trap_dbgidr(struct kvm_vcpu *vcpu, From 346fa2f891c71a9b98014f8f62c15f4c7dd95ec1 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Mon, 18 Feb 2019 07:48:25 -0500 Subject: [PATCH 230/855] KVM: s390: implement subfunction processor calls While we will not implement interception for query functions yet, we can and should disable functions that have a control bit based on the given CPU model. Let us start with enabling the subfunction interface. Signed-off-by: Christian Borntraeger Reviewed-by: David Hildenbrand Reviewed-by: Janosch Frank Reviewed-by: Cornelia Huck --- arch/s390/include/asm/kvm_host.h | 1 + arch/s390/kvm/kvm-s390.c | 35 ++++++++++++++++++++------------ 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index c5f51566ecd6..3369677a7df7 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -711,6 +711,7 @@ struct s390_io_adapter { struct kvm_s390_cpu_model { /* facility mask supported by kvm & hosting machine */ __u64 fac_mask[S390_ARCH_FAC_LIST_SIZE_U64]; + struct kvm_s390_vm_cpu_subfunc subfuncs; /* facility list requested by guest (in dma page) */ __u64 *fac_list; u64 cpuid; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 2e47c724679e..82a95afa6629 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1266,11 +1266,20 @@ static int kvm_s390_set_processor_feat(struct kvm *kvm, static int kvm_s390_set_processor_subfunc(struct kvm *kvm, struct kvm_device_attr *attr) { - /* - * Once supported by kernel + hw, we have to store the subfunctions - * in kvm->arch and remember that user space configured them. - */ - return -ENXIO; + mutex_lock(&kvm->lock); + if (kvm->created_vcpus) { + mutex_unlock(&kvm->lock); + return -EBUSY; + } + + if (copy_from_user(&kvm->arch.model.subfuncs, (void __user *)attr->addr, + sizeof(struct kvm_s390_vm_cpu_subfunc))) { + mutex_unlock(&kvm->lock); + return -EFAULT; + } + mutex_unlock(&kvm->lock); + + return 0; } static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) @@ -1389,12 +1398,11 @@ static int kvm_s390_get_machine_feat(struct kvm *kvm, static int kvm_s390_get_processor_subfunc(struct kvm *kvm, struct kvm_device_attr *attr) { - /* - * Once we can actually configure subfunctions (kernel + hw support), - * we have to check if they were already set by user space, if so copy - * them from kvm->arch. 
- */ - return -ENXIO; + if (copy_to_user((void __user *)attr->addr, &kvm->arch.model.subfuncs, + sizeof(struct kvm_s390_vm_cpu_subfunc))) + return -EFAULT; + + return 0; } static int kvm_s390_get_machine_subfunc(struct kvm *kvm, @@ -1405,6 +1413,7 @@ static int kvm_s390_get_machine_subfunc(struct kvm *kvm, return -EFAULT; return 0; } + static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) { int ret = -ENXIO; @@ -1522,10 +1531,9 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_CPU_PROCESSOR_FEAT: case KVM_S390_VM_CPU_MACHINE_FEAT: case KVM_S390_VM_CPU_MACHINE_SUBFUNC: + case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC: ret = 0; break; - /* configuring subfunctions is not supported yet */ - case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC: default: ret = -ENXIO; break; @@ -2227,6 +2235,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.model.fac_list[i] = S390_lowcore.stfle_fac_list[i] & kvm_s390_fac_base[i]; } + kvm->arch.model.subfuncs = kvm_s390_available_subfunc; /* we are always in czam mode - even on pre z14 machines */ set_kvm_facility(kvm->arch.model.fac_mask, 138); From 11ba5961a2156a4f210627ed8421387e2531b100 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 20 Feb 2019 11:38:42 -0500 Subject: [PATCH 231/855] KVM: s390: add debug logging for cpu model subfunctions As userspace can now get/set the subfunctions we want to trace those. This will allow to also check QEMUs cpu model vs. what the real hardware provides. Signed-off-by: Christian Borntraeger Reviewed-by: David Hildenbrand Reviewed-by: Cornelia Huck Reviewed-by: Janosch Frank --- arch/s390/kvm/kvm-s390.c | 136 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 82a95afa6629..4638303ba6a8 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1279,6 +1279,51 @@ static int kvm_s390_set_processor_subfunc(struct kvm *kvm, } mutex_unlock(&kvm->lock); + VM_EVENT(kvm, 3, "SET: guest PLO subfunc 0x%16.16lx.%16.16lx.%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.plo)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.plo)[1], + ((unsigned long *) &kvm->arch.model.subfuncs.plo)[2], + ((unsigned long *) &kvm->arch.model.subfuncs.plo)[3]); + VM_EVENT(kvm, 3, "SET: guest PTFF subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.ptff)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.ptff)[1]); + VM_EVENT(kvm, 3, "SET: guest KMAC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmac)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmac)[1]); + VM_EVENT(kvm, 3, "SET: guest KMC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmc)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmc)[1]); + VM_EVENT(kvm, 3, "SET: guest KM subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.km)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.km)[1]); + VM_EVENT(kvm, 3, "SET: guest KIMD subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kimd)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kimd)[1]); + VM_EVENT(kvm, 3, "SET: guest KLMD subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.klmd)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.klmd)[1]); + VM_EVENT(kvm, 3, "SET: guest PCKMO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) 
&kvm->arch.model.subfuncs.pckmo)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.pckmo)[1]); + VM_EVENT(kvm, 3, "SET: guest KMCTR subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmctr)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmctr)[1]); + VM_EVENT(kvm, 3, "SET: guest KMF subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmf)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmf)[1]); + VM_EVENT(kvm, 3, "SET: guest KMO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmo)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmo)[1]); + VM_EVENT(kvm, 3, "SET: guest PCC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.pcc)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.pcc)[1]); + VM_EVENT(kvm, 3, "SET: guest PPNO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.ppno)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.ppno)[1]); + VM_EVENT(kvm, 3, "SET: guest KMA subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kma)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kma)[1]); + return 0; } @@ -1402,6 +1447,51 @@ static int kvm_s390_get_processor_subfunc(struct kvm *kvm, sizeof(struct kvm_s390_vm_cpu_subfunc))) return -EFAULT; + VM_EVENT(kvm, 3, "GET: guest PLO subfunc 0x%16.16lx.%16.16lx.%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.plo)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.plo)[1], + ((unsigned long *) &kvm->arch.model.subfuncs.plo)[2], + ((unsigned long *) &kvm->arch.model.subfuncs.plo)[3]); + VM_EVENT(kvm, 3, "GET: guest PTFF subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.ptff)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.ptff)[1]); + VM_EVENT(kvm, 3, "GET: guest KMAC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmac)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmac)[1]); + VM_EVENT(kvm, 3, "GET: guest KMC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmc)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmc)[1]); + VM_EVENT(kvm, 3, "GET: guest KM subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.km)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.km)[1]); + VM_EVENT(kvm, 3, "GET: guest KIMD subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kimd)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kimd)[1]); + VM_EVENT(kvm, 3, "GET: guest KLMD subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.klmd)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.klmd)[1]); + VM_EVENT(kvm, 3, "GET: guest PCKMO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.pckmo)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.pckmo)[1]); + VM_EVENT(kvm, 3, "GET: guest KMCTR subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmctr)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmctr)[1]); + VM_EVENT(kvm, 3, "GET: guest KMF subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmf)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmf)[1]); + VM_EVENT(kvm, 3, "GET: guest KMO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmo)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmo)[1]); + VM_EVENT(kvm, 3, "GET: guest PCC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) 
&kvm->arch.model.subfuncs.pcc)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.pcc)[1]); + VM_EVENT(kvm, 3, "GET: guest PPNO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.ppno)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.ppno)[1]); + VM_EVENT(kvm, 3, "GET: guest KMA subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kma)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kma)[1]); + return 0; } @@ -1411,6 +1501,52 @@ static int kvm_s390_get_machine_subfunc(struct kvm *kvm, if (copy_to_user((void __user *)attr->addr, &kvm_s390_available_subfunc, sizeof(struct kvm_s390_vm_cpu_subfunc))) return -EFAULT; + + VM_EVENT(kvm, 3, "GET: host PLO subfunc 0x%16.16lx.%16.16lx.%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.plo)[0], + ((unsigned long *) &kvm_s390_available_subfunc.plo)[1], + ((unsigned long *) &kvm_s390_available_subfunc.plo)[2], + ((unsigned long *) &kvm_s390_available_subfunc.plo)[3]); + VM_EVENT(kvm, 3, "GET: host PTFF subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.ptff)[0], + ((unsigned long *) &kvm_s390_available_subfunc.ptff)[1]); + VM_EVENT(kvm, 3, "GET: host KMAC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.kmac)[0], + ((unsigned long *) &kvm_s390_available_subfunc.kmac)[1]); + VM_EVENT(kvm, 3, "GET: host KMC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.kmc)[0], + ((unsigned long *) &kvm_s390_available_subfunc.kmc)[1]); + VM_EVENT(kvm, 3, "GET: host KM subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.km)[0], + ((unsigned long *) &kvm_s390_available_subfunc.km)[1]); + VM_EVENT(kvm, 3, "GET: host KIMD subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.kimd)[0], + ((unsigned long *) &kvm_s390_available_subfunc.kimd)[1]); + VM_EVENT(kvm, 3, "GET: host KLMD subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.klmd)[0], + ((unsigned long *) &kvm_s390_available_subfunc.klmd)[1]); + VM_EVENT(kvm, 3, "GET: host PCKMO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.pckmo)[0], + ((unsigned long *) &kvm_s390_available_subfunc.pckmo)[1]); + VM_EVENT(kvm, 3, "GET: host KMCTR subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.kmctr)[0], + ((unsigned long *) &kvm_s390_available_subfunc.kmctr)[1]); + VM_EVENT(kvm, 3, "GET: host KMF subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.kmf)[0], + ((unsigned long *) &kvm_s390_available_subfunc.kmf)[1]); + VM_EVENT(kvm, 3, "GET: host KMO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.kmo)[0], + ((unsigned long *) &kvm_s390_available_subfunc.kmo)[1]); + VM_EVENT(kvm, 3, "GET: host PCC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.pcc)[0], + ((unsigned long *) &kvm_s390_available_subfunc.pcc)[1]); + VM_EVENT(kvm, 3, "GET: host PPNO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.ppno)[0], + ((unsigned long *) &kvm_s390_available_subfunc.ppno)[1]); + VM_EVENT(kvm, 3, "GET: host KMA subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.kma)[0], + ((unsigned long *) &kvm_s390_available_subfunc.kma)[1]); + return 0; } From a242010776f880ec0d8917aae0406e4a70cdc36c Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Fri, 22 Feb 2019 16:10:09 +0800 Subject: [PATCH 232/855] KVM: Minor cleanups for 
kvm_main.c This patch contains two minor cleanups: firstly it puts exported symbol for kvm_io_bus_write() by following the function definition; secondly it removes a redundant blank line. Signed-off-by: Leo Yan Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 276af92ace6c..0fb0e9aa0935 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3631,6 +3631,7 @@ int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, r = __kvm_io_bus_write(vcpu, bus, &range, val); return r < 0 ? r : 0; } +EXPORT_SYMBOL_GPL(kvm_io_bus_write); /* kvm_io_bus_write_cookie - called under kvm->slots_lock */ int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, @@ -3681,7 +3682,6 @@ static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, return -EOPNOTSUPP; } -EXPORT_SYMBOL_GPL(kvm_io_bus_write); /* kvm_io_bus_read - called under kvm->slots_lock */ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, @@ -3703,7 +3703,6 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, return r < 0 ? r : 0; } - /* Caller must hold slots_lock. */ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, struct kvm_io_device *dev) From 6a7a20ed29ca427c961020e87cb809a3d79f6fa6 Mon Sep 17 00:00:00 2001 From: shaoyunl Date: Tue, 19 Feb 2019 16:02:11 -0500 Subject: [PATCH 233/855] drm/powerplay: print current clock level when dpm is disabled on vg20 When DPM for the specific clock is disabled, driver should still print out current clock info for rocm-smi support on vega20 Signed-off-by: shaoyunl Reviewed-by: Eric Huang Signed-off-by: Alex Deucher --- .../drm/amd/powerplay/hwmgr/vega20_hwmgr.c | 56 +++++++++---------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c index aad79affb081..c95e0f35ed7f 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c @@ -2641,9 +2641,8 @@ static int vega20_get_sclks(struct pp_hwmgr *hwmgr, struct vega20_single_dpm_table *dpm_table = &(data->dpm_table.gfx_table); int i, count; - PP_ASSERT_WITH_CODE(data->smu_features[GNLD_DPM_GFXCLK].enabled, - "[GetSclks]: gfxclk dpm not enabled!\n", - return -EPERM); + if (!data->smu_features[GNLD_DPM_GFXCLK].enabled) + return -1; count = (dpm_table->count > MAX_NUM_CLOCKS) ? MAX_NUM_CLOCKS : dpm_table->count; clocks->num_levels = count; @@ -2670,9 +2669,8 @@ static int vega20_get_memclocks(struct pp_hwmgr *hwmgr, struct vega20_single_dpm_table *dpm_table = &(data->dpm_table.mem_table); int i, count; - PP_ASSERT_WITH_CODE(data->smu_features[GNLD_DPM_UCLK].enabled, - "[GetMclks]: uclk dpm not enabled!\n", - return -EPERM); + if (!data->smu_features[GNLD_DPM_UCLK].enabled) + return -1; count = (dpm_table->count > MAX_NUM_CLOCKS) ? 
MAX_NUM_CLOCKS : dpm_table->count; clocks->num_levels = data->mclk_latency_table.count = count; @@ -2696,9 +2694,8 @@ static int vega20_get_dcefclocks(struct pp_hwmgr *hwmgr, struct vega20_single_dpm_table *dpm_table = &(data->dpm_table.dcef_table); int i, count; - PP_ASSERT_WITH_CODE(data->smu_features[GNLD_DPM_DCEFCLK].enabled, - "[GetDcfclocks]: dcefclk dpm not enabled!\n", - return -EPERM); + if (!data->smu_features[GNLD_DPM_DCEFCLK].enabled) + return -1; count = (dpm_table->count > MAX_NUM_CLOCKS) ? MAX_NUM_CLOCKS : dpm_table->count; clocks->num_levels = count; @@ -2719,9 +2716,8 @@ static int vega20_get_socclocks(struct pp_hwmgr *hwmgr, struct vega20_single_dpm_table *dpm_table = &(data->dpm_table.soc_table); int i, count; - PP_ASSERT_WITH_CODE(data->smu_features[GNLD_DPM_SOCCLK].enabled, - "[GetSocclks]: socclk dpm not enabled!\n", - return -EPERM); + if (!data->smu_features[GNLD_DPM_SOCCLK].enabled) + return -1; count = (dpm_table->count > MAX_NUM_CLOCKS) ? MAX_NUM_CLOCKS : dpm_table->count; clocks->num_levels = count; @@ -3137,10 +3133,11 @@ static int vega20_print_clock_levels(struct pp_hwmgr *hwmgr, "Attempt to get current gfx clk Failed!", return ret); - ret = vega20_get_sclks(hwmgr, &clocks); - PP_ASSERT_WITH_CODE(!ret, - "Attempt to get gfx clk levels Failed!", - return ret); + if (vega20_get_sclks(hwmgr, &clocks)) { + size += sprintf(buf + size, "0: %uMhz * (DPM disabled)\n", + now / 100); + break; + } for (i = 0; i < clocks.num_levels; i++) size += sprintf(buf + size, "%d: %uMhz %s\n", @@ -3154,10 +3151,11 @@ static int vega20_print_clock_levels(struct pp_hwmgr *hwmgr, "Attempt to get current mclk freq Failed!", return ret); - ret = vega20_get_memclocks(hwmgr, &clocks); - PP_ASSERT_WITH_CODE(!ret, - "Attempt to get memory clk levels Failed!", - return ret); + if (vega20_get_memclocks(hwmgr, &clocks)) { + size += sprintf(buf + size, "0: %uMhz * (DPM disabled)\n", + now / 100); + break; + } for (i = 0; i < clocks.num_levels; i++) size += sprintf(buf + size, "%d: %uMhz %s\n", @@ -3171,10 +3169,11 @@ static int vega20_print_clock_levels(struct pp_hwmgr *hwmgr, "Attempt to get current socclk freq Failed!", return ret); - ret = vega20_get_socclocks(hwmgr, &clocks); - PP_ASSERT_WITH_CODE(!ret, - "Attempt to get soc clk levels Failed!", - return ret); + if (vega20_get_socclocks(hwmgr, &clocks)) { + size += sprintf(buf + size, "0: %uMhz * (DPM disabled)\n", + now / 100); + break; + } for (i = 0; i < clocks.num_levels; i++) size += sprintf(buf + size, "%d: %uMhz %s\n", @@ -3200,10 +3199,11 @@ static int vega20_print_clock_levels(struct pp_hwmgr *hwmgr, "Attempt to get current dcefclk freq Failed!", return ret); - ret = vega20_get_dcefclocks(hwmgr, &clocks); - PP_ASSERT_WITH_CODE(!ret, - "Attempt to get dcefclk levels Failed!", - return ret); + if (vega20_get_dcefclocks(hwmgr, &clocks)) { + size += sprintf(buf + size, "0: %uMhz * (DPM disabled)\n", + now / 100); + break; + } for (i = 0; i < clocks.num_levels; i++) size += sprintf(buf + size, "%d: %uMhz %s\n", From b7d485df6658942e30c0faccb7213b63c167ceeb Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Tue, 19 Feb 2019 12:20:54 +0800 Subject: [PATCH 234/855] drm/amd/powerplay: fix the confusing ppfeature mask calculations Simplify the ppfeature mask calculations. 
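[Editor's note, not part of the patch below] The hunks that follow replace the two-step XOR arithmetic with direct mask operations. The equivalence can be checked with a small standalone C program; the mask values here are made up purely for illustration and are not taken from the driver.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Editorial sanity check: verify that the simplified expressions compute
 * the same enable/disable sets as the old XOR-based ones for a sample
 * pair of 64-bit feature masks.
 */
int main(void)
{
	uint64_t features_enabled = 0x00ffULL;      /* example: currently enabled */
	uint64_t new_ppfeature_masks = 0x0f0fULL;   /* example: requested mask */

	/* old formulation */
	uint64_t old_disable = (features_enabled ^ new_ppfeature_masks) & features_enabled;
	uint64_t old_enable = (features_enabled ^ new_ppfeature_masks) ^ old_disable;

	/* new, simplified formulation */
	uint64_t new_disable = features_enabled & ~new_ppfeature_masks;
	uint64_t new_enable = ~features_enabled & new_ppfeature_masks;

	assert(old_disable == new_disable);
	assert(old_enable == new_enable);

	printf("features_to_disable 0x%llx\n", (unsigned long long)new_disable);
	printf("features_to_enable 0x%llx\n", (unsigned long long)new_enable);
	return 0;
}

The identities involved are (e ^ n) & e == e & ~n and ((e ^ n) ^ (e & ~n)) == ~e & n, so the simplified form reads directly as "enabled but no longer requested" and "requested but not yet enabled".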
Signed-off-by: Evan Quan Reviewed-by: Kenneth Feng Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c | 4 ++-- drivers/gpu/drm/amd/powerplay/hwmgr/vega12_hwmgr.c | 4 ++-- drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c index 5479125ff4f6..6d8e9609e900 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c @@ -4407,9 +4407,9 @@ static int vega10_set_ppfeature_status(struct pp_hwmgr *hwmgr, uint64_t new_ppfe return ret; features_to_disable = - (features_enabled ^ new_ppfeature_masks) & features_enabled; + features_enabled & ~new_ppfeature_masks; features_to_enable = - (features_enabled ^ new_ppfeature_masks) ^ features_to_disable; + ~features_enabled & new_ppfeature_masks; pr_debug("features_to_disable 0x%llx\n", features_to_disable); pr_debug("features_to_enable 0x%llx\n", features_to_enable); diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/vega12_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/vega12_hwmgr.c index 6c8e78611c03..bdb48e94eff6 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega12_hwmgr.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega12_hwmgr.c @@ -2009,9 +2009,9 @@ static int vega12_set_ppfeature_status(struct pp_hwmgr *hwmgr, uint64_t new_ppfe return ret; features_to_disable = - (features_enabled ^ new_ppfeature_masks) & features_enabled; + features_enabled & ~new_ppfeature_masks; features_to_enable = - (features_enabled ^ new_ppfeature_masks) ^ features_to_disable; + ~features_enabled & new_ppfeature_masks; pr_debug("features_to_disable 0x%llx\n", features_to_disable); pr_debug("features_to_enable 0x%llx\n", features_to_enable); diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c index c95e0f35ed7f..fae95d9ebd7a 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c @@ -3084,9 +3084,9 @@ static int vega20_set_ppfeature_status(struct pp_hwmgr *hwmgr, uint64_t new_ppfe return ret; features_to_disable = - (features_enabled ^ new_ppfeature_masks) & features_enabled; + features_enabled & ~new_ppfeature_masks; features_to_enable = - (features_enabled ^ new_ppfeature_masks) ^ features_to_disable; + ~features_enabled & new_ppfeature_masks; pr_debug("features_to_disable 0x%llx\n", features_to_disable); pr_debug("features_to_enable 0x%llx\n", features_to_enable); From 4a8c31a1c6f526ec96a35e613f2a71e26ffbd7dd Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Sun, 24 Feb 2019 10:17:27 -0500 Subject: [PATCH 235/855] xen/blkback: rework connect_ring() to avoid inconsistent xenstore 'ring-page-order' set by malicious blkfront The xenstore 'ring-page-order' is used globally for each blkback queue and therefore should be read from xenstore only once. However, it is obtained in read_per_ring_refs() which might be called multiple times during the initialization of each blkback queue. If the blkfront is malicious and the 'ring-page-order' is set in different value by blkfront every time before blkback reads it, this may end up at the "WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages));" in xen_blkif_disconnect() when frontend is destroyed. This patch reworks connect_ring() to read xenstore 'ring-page-order' only once. 
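[Editor's note, not part of the patch] The core of the rework is a read-once pattern: fetch 'ring-page-order' a single time in connect_ring(), validate it against xen_blkif_max_ring_order, and let every queue consume the cached blkif->nr_ring_pages. The minimal userspace sketch below illustrates that pattern only; read_ring_page_order() and the backend struct are hypothetical stand-ins, not the driver's API.

#include <stdio.h>

/*
 * Editorial sketch of the read-once pattern.  If the shared parameter were
 * re-read for every queue (the old flow), a frontend changing it between
 * reads could leave the queues sized inconsistently; reading it once and
 * caching the result closes that window.
 */

#define RING_MAX_ORDER 4

static unsigned int read_ring_page_order(void)
{
	return 2;	/* pretend the frontend advertised ring-page-order = 2 */
}

struct backend {
	unsigned int nr_ring_pages;
};

static int connect_ring(struct backend *be, unsigned int nr_queues)
{
	unsigned int order = read_ring_page_order();	/* read exactly once */
	unsigned int q;

	if (order > RING_MAX_ORDER)
		return -1;	/* reject before any queue is set up */

	be->nr_ring_pages = 1u << order;

	for (q = 0; q < nr_queues; q++)	/* every queue sees the same value */
		printf("queue %u: %u ring page(s)\n", q, be->nr_ring_pages);

	return 0;
}

int main(void)
{
	struct backend be;

	return connect_ring(&be, 3);
}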
Signed-off-by: Dongli Zhang Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/xenbus.c | 76 ++++++++++++++++++------------ 1 file changed, 45 insertions(+), 31 deletions(-) diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index a4aadac772e5..24896ffb04ed 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -926,7 +926,7 @@ static int read_per_ring_refs(struct xen_blkif_ring *ring, const char *dir) int err, i, j; struct xen_blkif *blkif = ring->blkif; struct xenbus_device *dev = blkif->be->dev; - unsigned int ring_page_order, nr_grefs, evtchn; + unsigned int nr_grefs, evtchn; err = xenbus_scanf(XBT_NIL, dir, "event-channel", "%u", &evtchn); @@ -936,43 +936,42 @@ static int read_per_ring_refs(struct xen_blkif_ring *ring, const char *dir) return err; } - err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u", - &ring_page_order); + nr_grefs = blkif->nr_ring_pages; + + if (unlikely(!nr_grefs)) { + WARN_ON(true); + return -EINVAL; + } + + for (i = 0; i < nr_grefs; i++) { + char ring_ref_name[RINGREF_NAME_LEN]; + + snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); + err = xenbus_scanf(XBT_NIL, dir, ring_ref_name, + "%u", &ring_ref[i]); + + if (err != 1) { + if (nr_grefs == 1) + break; + + err = -EINVAL; + xenbus_dev_fatal(dev, err, "reading %s/%s", + dir, ring_ref_name); + return err; + } + } + if (err != 1) { - err = xenbus_scanf(XBT_NIL, dir, "ring-ref", "%u", &ring_ref[0]); + WARN_ON(nr_grefs != 1); + + err = xenbus_scanf(XBT_NIL, dir, "ring-ref", "%u", + &ring_ref[0]); if (err != 1) { err = -EINVAL; xenbus_dev_fatal(dev, err, "reading %s/ring-ref", dir); return err; } - nr_grefs = 1; - } else { - unsigned int i; - - if (ring_page_order > xen_blkif_max_ring_order) { - err = -EINVAL; - xenbus_dev_fatal(dev, err, "%s/request %d ring page order exceed max:%d", - dir, ring_page_order, - xen_blkif_max_ring_order); - return err; - } - - nr_grefs = 1 << ring_page_order; - for (i = 0; i < nr_grefs; i++) { - char ring_ref_name[RINGREF_NAME_LEN]; - - snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); - err = xenbus_scanf(XBT_NIL, dir, ring_ref_name, - "%u", &ring_ref[i]); - if (err != 1) { - err = -EINVAL; - xenbus_dev_fatal(dev, err, "reading %s/%s", - dir, ring_ref_name); - return err; - } - } } - blkif->nr_ring_pages = nr_grefs; for (i = 0; i < nr_grefs * XEN_BLKIF_REQS_PER_PAGE; i++) { req = kzalloc(sizeof(*req), GFP_KERNEL); @@ -1031,6 +1030,7 @@ static int connect_ring(struct backend_info *be) size_t xspathsize; const size_t xenstore_path_ext_size = 11; /* sufficient for "/queue-NNN" */ unsigned int requested_num_queues = 0; + unsigned int ring_page_order; pr_debug("%s %s\n", __func__, dev->otherend); @@ -1076,6 +1076,20 @@ static int connect_ring(struct backend_info *be) blkif->nr_rings, blkif->blk_protocol, protocol, pers_grants ? 
"persistent grants" : ""); + ring_page_order = xenbus_read_unsigned(dev->otherend, + "ring-page-order", 0); + + if (ring_page_order > xen_blkif_max_ring_order) { + err = -EINVAL; + xenbus_dev_fatal(dev, err, + "requested ring page order %d exceed max:%d", + ring_page_order, + xen_blkif_max_ring_order); + return err; + } + + blkif->nr_ring_pages = 1 << ring_page_order; + if (blkif->nr_rings == 1) return read_per_ring_refs(&blkif->rings[0], dev->otherend); else { From ef4c54c340de47bf167b326f7560c1cc3751d154 Mon Sep 17 00:00:00 2001 From: Alexey Brodkin Date: Thu, 24 Jan 2019 15:17:03 +0300 Subject: [PATCH 236/855] ARC: DTB: [scripted] fix node name and address spelling 1. Remove "0x" prefix from unit-address of node names ----------------------->8------------------------ sed -i 's/@0x/@/g' arch/arc/boot/dts/*.dts* ----------------------->8------------------------ 2. Make all hex addresses lowercase: ----------------------->8------------------------ sed -i 's/@\([0-9A-Za-z]*\)/@\L\1/g' arch/arc/boot/dts/*.dts* sed -i 's/0x\([0-9A-Za-z]*\)/0x\L\1/g' arch/arc/boot/dts/*.dts* ----------------------->8------------------------ Inspired by [1] and the like. [1] http://kisskb.ellerman.id.au/kisskb/buildresult/13612017/ Reviewed-by: Rob Herring Signed-off-by: Alexey Brodkin Signed-off-by: Vineet Gupta --- arch/arc/boot/dts/abilis_tb100.dtsi | 58 ++++++++++++------------- arch/arc/boot/dts/abilis_tb100_dvk.dts | 14 +++--- arch/arc/boot/dts/abilis_tb101.dtsi | 58 ++++++++++++------------- arch/arc/boot/dts/abilis_tb101_dvk.dts | 14 +++--- arch/arc/boot/dts/abilis_tb10x.dtsi | 60 +++++++++++++------------- arch/arc/boot/dts/axc001.dtsi | 6 +-- arch/arc/boot/dts/axc003.dtsi | 16 +++---- arch/arc/boot/dts/axc003_idu.dtsi | 16 +++---- arch/arc/boot/dts/axs10x_mb.dtsi | 22 +++++----- arch/arc/boot/dts/hsdk.dts | 4 +- arch/arc/boot/dts/vdk_axc003.dtsi | 4 +- arch/arc/boot/dts/vdk_axc003_idu.dtsi | 4 +- arch/arc/boot/dts/vdk_axs10x_mb.dtsi | 18 ++++---- 13 files changed, 147 insertions(+), 147 deletions(-) diff --git a/arch/arc/boot/dts/abilis_tb100.dtsi b/arch/arc/boot/dts/abilis_tb100.dtsi index 02410b211433..c0bcd97522bb 100644 --- a/arch/arc/boot/dts/abilis_tb100.dtsi +++ b/arch/arc/boot/dts/abilis_tb100.dtsi @@ -38,7 +38,7 @@ clock-div = <6>; }; - iomux: iomux@FF10601c { + iomux: iomux@ff10601c { /* Port 1 */ pctl_tsin_s0: pctl-tsin-s0 { /* Serial TS-in 0 */ abilis,function = "mis0"; @@ -162,182 +162,182 @@ }; }; - gpioa: gpio@FF140000 { + gpioa: gpio@ff140000 { compatible = "abilis,tb10x-gpio"; interrupt-controller; #interrupt-cells = <1>; interrupt-parent = <&tb10x_ictl>; interrupts = <27 2>; - reg = <0xFF140000 0x1000>; + reg = <0xff140000 0x1000>; gpio-controller; #gpio-cells = <2>; abilis,ngpio = <3>; gpio-ranges = <&iomux 0 0 0>; gpio-ranges-group-names = "gpioa"; }; - gpiob: gpio@FF141000 { + gpiob: gpio@ff141000 { compatible = "abilis,tb10x-gpio"; interrupt-controller; #interrupt-cells = <1>; interrupt-parent = <&tb10x_ictl>; interrupts = <27 2>; - reg = <0xFF141000 0x1000>; + reg = <0xff141000 0x1000>; gpio-controller; #gpio-cells = <2>; abilis,ngpio = <2>; gpio-ranges = <&iomux 0 0 0>; gpio-ranges-group-names = "gpiob"; }; - gpioc: gpio@FF142000 { + gpioc: gpio@ff142000 { compatible = "abilis,tb10x-gpio"; interrupt-controller; #interrupt-cells = <1>; interrupt-parent = <&tb10x_ictl>; interrupts = <27 2>; - reg = <0xFF142000 0x1000>; + reg = <0xff142000 0x1000>; gpio-controller; #gpio-cells = <2>; abilis,ngpio = <3>; gpio-ranges = <&iomux 0 0 0>; gpio-ranges-group-names = "gpioc"; }; - gpiod: 
gpio@FF143000 { + gpiod: gpio@ff143000 { compatible = "abilis,tb10x-gpio"; interrupt-controller; #interrupt-cells = <1>; interrupt-parent = <&tb10x_ictl>; interrupts = <27 2>; - reg = <0xFF143000 0x1000>; + reg = <0xff143000 0x1000>; gpio-controller; #gpio-cells = <2>; abilis,ngpio = <2>; gpio-ranges = <&iomux 0 0 0>; gpio-ranges-group-names = "gpiod"; }; - gpioe: gpio@FF144000 { + gpioe: gpio@ff144000 { compatible = "abilis,tb10x-gpio"; interrupt-controller; #interrupt-cells = <1>; interrupt-parent = <&tb10x_ictl>; interrupts = <27 2>; - reg = <0xFF144000 0x1000>; + reg = <0xff144000 0x1000>; gpio-controller; #gpio-cells = <2>; abilis,ngpio = <3>; gpio-ranges = <&iomux 0 0 0>; gpio-ranges-group-names = "gpioe"; }; - gpiof: gpio@FF145000 { + gpiof: gpio@ff145000 { compatible = "abilis,tb10x-gpio"; interrupt-controller; #interrupt-cells = <1>; interrupt-parent = <&tb10x_ictl>; interrupts = <27 2>; - reg = <0xFF145000 0x1000>; + reg = <0xff145000 0x1000>; gpio-controller; #gpio-cells = <2>; abilis,ngpio = <2>; gpio-ranges = <&iomux 0 0 0>; gpio-ranges-group-names = "gpiof"; }; - gpiog: gpio@FF146000 { + gpiog: gpio@ff146000 { compatible = "abilis,tb10x-gpio"; interrupt-controller; #interrupt-cells = <1>; interrupt-parent = <&tb10x_ictl>; interrupts = <27 2>; - reg = <0xFF146000 0x1000>; + reg = <0xff146000 0x1000>; gpio-controller; #gpio-cells = <2>; abilis,ngpio = <3>; gpio-ranges = <&iomux 0 0 0>; gpio-ranges-group-names = "gpiog"; }; - gpioh: gpio@FF147000 { + gpioh: gpio@ff147000 { compatible = "abilis,tb10x-gpio"; interrupt-controller; #interrupt-cells = <1>; interrupt-parent = <&tb10x_ictl>; interrupts = <27 2>; - reg = <0xFF147000 0x1000>; + reg = <0xff147000 0x1000>; gpio-controller; #gpio-cells = <2>; abilis,ngpio = <2>; gpio-ranges = <&iomux 0 0 0>; gpio-ranges-group-names = "gpioh"; }; - gpioi: gpio@FF148000 { + gpioi: gpio@ff148000 { compatible = "abilis,tb10x-gpio"; interrupt-controller; #interrupt-cells = <1>; interrupt-parent = <&tb10x_ictl>; interrupts = <27 2>; - reg = <0xFF148000 0x1000>; + reg = <0xff148000 0x1000>; gpio-controller; #gpio-cells = <2>; abilis,ngpio = <12>; gpio-ranges = <&iomux 0 0 0>; gpio-ranges-group-names = "gpioi"; }; - gpioj: gpio@FF149000 { + gpioj: gpio@ff149000 { compatible = "abilis,tb10x-gpio"; interrupt-controller; #interrupt-cells = <1>; interrupt-parent = <&tb10x_ictl>; interrupts = <27 2>; - reg = <0xFF149000 0x1000>; + reg = <0xff149000 0x1000>; gpio-controller; #gpio-cells = <2>; abilis,ngpio = <32>; gpio-ranges = <&iomux 0 0 0>; gpio-ranges-group-names = "gpioj"; }; - gpiok: gpio@FF14a000 { + gpiok: gpio@ff14a000 { compatible = "abilis,tb10x-gpio"; interrupt-controller; #interrupt-cells = <1>; interrupt-parent = <&tb10x_ictl>; interrupts = <27 2>; - reg = <0xFF14A000 0x1000>; + reg = <0xff14a000 0x1000>; gpio-controller; #gpio-cells = <2>; abilis,ngpio = <22>; gpio-ranges = <&iomux 0 0 0>; gpio-ranges-group-names = "gpiok"; }; - gpiol: gpio@FF14b000 { + gpiol: gpio@ff14b000 { compatible = "abilis,tb10x-gpio"; interrupt-controller; #interrupt-cells = <1>; interrupt-parent = <&tb10x_ictl>; interrupts = <27 2>; - reg = <0xFF14B000 0x1000>; + reg = <0xff14b000 0x1000>; gpio-controller; #gpio-cells = <2>; abilis,ngpio = <4>; gpio-ranges = <&iomux 0 0 0>; gpio-ranges-group-names = "gpiol"; }; - gpiom: gpio@FF14c000 { + gpiom: gpio@ff14c000 { compatible = "abilis,tb10x-gpio"; interrupt-controller; #interrupt-cells = <1>; interrupt-parent = <&tb10x_ictl>; interrupts = <27 2>; - reg = <0xFF14C000 0x1000>; + reg = <0xff14c000 0x1000>; 
gpio-controller; #gpio-cells = <2>; abilis,ngpio = <4>; gpio-ranges = <&iomux 0 0 0>; gpio-ranges-group-names = "gpiom"; }; - gpion: gpio@FF14d000 { + gpion: gpio@ff14d000 { compatible = "abilis,tb10x-gpio"; interrupt-controller; #interrupt-cells = <1>; interrupt-parent = <&tb10x_ictl>; interrupts = <27 2>; - reg = <0xFF14D000 0x1000>; + reg = <0xff14d000 0x1000>; gpio-controller; #gpio-cells = <2>; abilis,ngpio = <5>; diff --git a/arch/arc/boot/dts/abilis_tb100_dvk.dts b/arch/arc/boot/dts/abilis_tb100_dvk.dts index 3acf04db8030..c968e677db46 100644 --- a/arch/arc/boot/dts/abilis_tb100_dvk.dts +++ b/arch/arc/boot/dts/abilis_tb100_dvk.dts @@ -37,27 +37,27 @@ }; soc100 { - uart@FF100000 { + uart@ff100000 { pinctrl-names = "default"; pinctrl-0 = <&pctl_uart0>; }; - ethernet@FE100000 { + ethernet@fe100000 { phy-mode = "rgmii"; }; - i2c0: i2c@FF120000 { + i2c0: i2c@ff120000 { i2c-sda-hold-time-ns = <432>; }; - i2c1: i2c@FF121000 { + i2c1: i2c@ff121000 { i2c-sda-hold-time-ns = <432>; }; - i2c2: i2c@FF122000 { + i2c2: i2c@ff122000 { i2c-sda-hold-time-ns = <432>; }; - i2c3: i2c@FF123000 { + i2c3: i2c@ff123000 { i2c-sda-hold-time-ns = <432>; }; - i2c4: i2c@FF124000 { + i2c4: i2c@ff124000 { i2c-sda-hold-time-ns = <432>; }; diff --git a/arch/arc/boot/dts/abilis_tb101.dtsi b/arch/arc/boot/dts/abilis_tb101.dtsi index f9e7686044eb..6a1615f58f05 100644 --- a/arch/arc/boot/dts/abilis_tb101.dtsi +++ b/arch/arc/boot/dts/abilis_tb101.dtsi @@ -38,7 +38,7 @@ clock-div = <6>; }; - iomux: iomux@FF10601c { + iomux: iomux@ff10601c { /* Port 1 */ pctl_tsin_s0: pctl-tsin-s0 { /* Serial TS-in 0 */ abilis,function = "mis0"; @@ -171,182 +171,182 @@ }; }; - gpioa: gpio@FF140000 { + gpioa: gpio@ff140000 { compatible = "abilis,tb10x-gpio"; interrupt-controller; #interrupt-cells = <1>; interrupt-parent = <&tb10x_ictl>; interrupts = <27 2>; - reg = <0xFF140000 0x1000>; + reg = <0xff140000 0x1000>; gpio-controller; #gpio-cells = <2>; abilis,ngpio = <3>; gpio-ranges = <&iomux 0 0 0>; gpio-ranges-group-names = "gpioa"; }; - gpiob: gpio@FF141000 { + gpiob: gpio@ff141000 { compatible = "abilis,tb10x-gpio"; interrupt-controller; #interrupt-cells = <1>; interrupt-parent = <&tb10x_ictl>; interrupts = <27 2>; - reg = <0xFF141000 0x1000>; + reg = <0xff141000 0x1000>; gpio-controller; #gpio-cells = <2>; abilis,ngpio = <2>; gpio-ranges = <&iomux 0 0 0>; gpio-ranges-group-names = "gpiob"; }; - gpioc: gpio@FF142000 { + gpioc: gpio@ff142000 { compatible = "abilis,tb10x-gpio"; interrupt-controller; #interrupt-cells = <1>; interrupt-parent = <&tb10x_ictl>; interrupts = <27 2>; - reg = <0xFF142000 0x1000>; + reg = <0xff142000 0x1000>; gpio-controller; #gpio-cells = <2>; abilis,ngpio = <3>; gpio-ranges = <&iomux 0 0 0>; gpio-ranges-group-names = "gpioc"; }; - gpiod: gpio@FF143000 { + gpiod: gpio@ff143000 { compatible = "abilis,tb10x-gpio"; interrupt-controller; #interrupt-cells = <1>; interrupt-parent = <&tb10x_ictl>; interrupts = <27 2>; - reg = <0xFF143000 0x1000>; + reg = <0xff143000 0x1000>; gpio-controller; #gpio-cells = <2>; abilis,ngpio = <2>; gpio-ranges = <&iomux 0 0 0>; gpio-ranges-group-names = "gpiod"; }; - gpioe: gpio@FF144000 { + gpioe: gpio@ff144000 { compatible = "abilis,tb10x-gpio"; interrupt-controller; #interrupt-cells = <1>; interrupt-parent = <&tb10x_ictl>; interrupts = <27 2>; - reg = <0xFF144000 0x1000>; + reg = <0xff144000 0x1000>; gpio-controller; #gpio-cells = <2>; abilis,ngpio = <3>; gpio-ranges = <&iomux 0 0 0>; gpio-ranges-group-names = "gpioe"; }; - gpiof: gpio@FF145000 { + gpiof: gpio@ff145000 { compatible = 
"abilis,tb10x-gpio"; interrupt-controller; #interrupt-cells = <1>; interrupt-parent = <&tb10x_ictl>; interrupts = <27 2>; - reg = <0xFF145000 0x1000>; + reg = <0xff145000 0x1000>; gpio-controller; #gpio-cells = <2>; abilis,ngpio = <2>; gpio-ranges = <&iomux 0 0 0>; gpio-ranges-group-names = "gpiof"; }; - gpiog: gpio@FF146000 { + gpiog: gpio@ff146000 { compatible = "abilis,tb10x-gpio"; interrupt-controller; #interrupt-cells = <1>; interrupt-parent = <&tb10x_ictl>; interrupts = <27 2>; - reg = <0xFF146000 0x1000>; + reg = <0xff146000 0x1000>; gpio-controller; #gpio-cells = <2>; abilis,ngpio = <3>; gpio-ranges = <&iomux 0 0 0>; gpio-ranges-group-names = "gpiog"; }; - gpioh: gpio@FF147000 { + gpioh: gpio@ff147000 { compatible = "abilis,tb10x-gpio"; interrupt-controller; #interrupt-cells = <1>; interrupt-parent = <&tb10x_ictl>; interrupts = <27 2>; - reg = <0xFF147000 0x1000>; + reg = <0xff147000 0x1000>; gpio-controller; #gpio-cells = <2>; abilis,ngpio = <2>; gpio-ranges = <&iomux 0 0 0>; gpio-ranges-group-names = "gpioh"; }; - gpioi: gpio@FF148000 { + gpioi: gpio@ff148000 { compatible = "abilis,tb10x-gpio"; interrupt-controller; #interrupt-cells = <1>; interrupt-parent = <&tb10x_ictl>; interrupts = <27 2>; - reg = <0xFF148000 0x1000>; + reg = <0xff148000 0x1000>; gpio-controller; #gpio-cells = <2>; abilis,ngpio = <12>; gpio-ranges = <&iomux 0 0 0>; gpio-ranges-group-names = "gpioi"; }; - gpioj: gpio@FF149000 { + gpioj: gpio@ff149000 { compatible = "abilis,tb10x-gpio"; interrupt-controller; #interrupt-cells = <1>; interrupt-parent = <&tb10x_ictl>; interrupts = <27 2>; - reg = <0xFF149000 0x1000>; + reg = <0xff149000 0x1000>; gpio-controller; #gpio-cells = <2>; abilis,ngpio = <32>; gpio-ranges = <&iomux 0 0 0>; gpio-ranges-group-names = "gpioj"; }; - gpiok: gpio@FF14a000 { + gpiok: gpio@ff14a000 { compatible = "abilis,tb10x-gpio"; interrupt-controller; #interrupt-cells = <1>; interrupt-parent = <&tb10x_ictl>; interrupts = <27 2>; - reg = <0xFF14A000 0x1000>; + reg = <0xff14a000 0x1000>; gpio-controller; #gpio-cells = <2>; abilis,ngpio = <22>; gpio-ranges = <&iomux 0 0 0>; gpio-ranges-group-names = "gpiok"; }; - gpiol: gpio@FF14b000 { + gpiol: gpio@ff14b000 { compatible = "abilis,tb10x-gpio"; interrupt-controller; #interrupt-cells = <1>; interrupt-parent = <&tb10x_ictl>; interrupts = <27 2>; - reg = <0xFF14B000 0x1000>; + reg = <0xff14b000 0x1000>; gpio-controller; #gpio-cells = <2>; abilis,ngpio = <4>; gpio-ranges = <&iomux 0 0 0>; gpio-ranges-group-names = "gpiol"; }; - gpiom: gpio@FF14c000 { + gpiom: gpio@ff14c000 { compatible = "abilis,tb10x-gpio"; interrupt-controller; #interrupt-cells = <1>; interrupt-parent = <&tb10x_ictl>; interrupts = <27 2>; - reg = <0xFF14C000 0x1000>; + reg = <0xff14c000 0x1000>; gpio-controller; #gpio-cells = <2>; abilis,ngpio = <4>; gpio-ranges = <&iomux 0 0 0>; gpio-ranges-group-names = "gpiom"; }; - gpion: gpio@FF14d000 { + gpion: gpio@ff14d000 { compatible = "abilis,tb10x-gpio"; interrupt-controller; #interrupt-cells = <1>; interrupt-parent = <&tb10x_ictl>; interrupts = <27 2>; - reg = <0xFF14D000 0x1000>; + reg = <0xff14d000 0x1000>; gpio-controller; #gpio-cells = <2>; abilis,ngpio = <5>; diff --git a/arch/arc/boot/dts/abilis_tb101_dvk.dts b/arch/arc/boot/dts/abilis_tb101_dvk.dts index 37d88c5dd181..05143ce9c120 100644 --- a/arch/arc/boot/dts/abilis_tb101_dvk.dts +++ b/arch/arc/boot/dts/abilis_tb101_dvk.dts @@ -37,27 +37,27 @@ }; soc100 { - uart@FF100000 { + uart@ff100000 { pinctrl-names = "default"; pinctrl-0 = <&pctl_uart0>; }; - ethernet@FE100000 { + 
ethernet@fe100000 { phy-mode = "rgmii"; }; - i2c0: i2c@FF120000 { + i2c0: i2c@ff120000 { i2c-sda-hold-time-ns = <432>; }; - i2c1: i2c@FF121000 { + i2c1: i2c@ff121000 { i2c-sda-hold-time-ns = <432>; }; - i2c2: i2c@FF122000 { + i2c2: i2c@ff122000 { i2c-sda-hold-time-ns = <432>; }; - i2c3: i2c@FF123000 { + i2c3: i2c@ff123000 { i2c-sda-hold-time-ns = <432>; }; - i2c4: i2c@FF124000 { + i2c4: i2c@ff124000 { i2c-sda-hold-time-ns = <432>; }; diff --git a/arch/arc/boot/dts/abilis_tb10x.dtsi b/arch/arc/boot/dts/abilis_tb10x.dtsi index 3121536b25a3..2fbf1bdfe6de 100644 --- a/arch/arc/boot/dts/abilis_tb10x.dtsi +++ b/arch/arc/boot/dts/abilis_tb10x.dtsi @@ -54,7 +54,7 @@ #size-cells = <1>; device_type = "soc"; ranges = <0xfe000000 0xfe000000 0x02000000 - 0x000F0000 0x000F0000 0x00010000>; + 0x000f0000 0x000f0000 0x00010000>; compatible = "abilis,tb10x", "simple-bus"; pll0: oscillator { @@ -75,10 +75,10 @@ clock-output-names = "ahb_clk"; }; - iomux: iomux@FF10601c { + iomux: iomux@ff10601c { compatible = "abilis,tb10x-iomux"; #gpio-range-cells = <3>; - reg = <0xFF10601c 0x4>; + reg = <0xff10601c 0x4>; }; intc: interrupt-controller { @@ -88,7 +88,7 @@ }; tb10x_ictl: pic@fe002000 { compatible = "abilis,tb10x-ictl"; - reg = <0xFE002000 0x20>; + reg = <0xfe002000 0x20>; interrupt-controller; #interrupt-cells = <2>; interrupt-parent = <&intc>; @@ -96,27 +96,27 @@ 20 21 22 23 24 25 26 27 28 29 30 31>; }; - uart@FF100000 { + uart@ff100000 { compatible = "snps,dw-apb-uart"; - reg = <0xFF100000 0x100>; + reg = <0xff100000 0x100>; clock-frequency = <166666666>; interrupts = <25 8>; reg-shift = <2>; reg-io-width = <4>; interrupt-parent = <&tb10x_ictl>; }; - ethernet@FE100000 { + ethernet@fe100000 { compatible = "snps,dwmac-3.70a","snps,dwmac"; - reg = <0xFE100000 0x1058>; + reg = <0xfe100000 0x1058>; interrupt-parent = <&tb10x_ictl>; interrupts = <6 8>; interrupt-names = "macirq"; clocks = <&ahb_clk>; clock-names = "stmmaceth"; }; - dma@FE000000 { + dma@fe000000 { compatible = "snps,dma-spear1340"; - reg = <0xFE000000 0x400>; + reg = <0xfe000000 0x400>; interrupt-parent = <&tb10x_ictl>; interrupts = <14 8>; dma-channels = <6>; @@ -132,70 +132,70 @@ multi-block = <1 1 1 1 1 1>; }; - i2c0: i2c@FF120000 { + i2c0: i2c@ff120000 { #address-cells = <1>; #size-cells = <0>; compatible = "snps,designware-i2c"; - reg = <0xFF120000 0x1000>; + reg = <0xff120000 0x1000>; interrupt-parent = <&tb10x_ictl>; interrupts = <12 8>; clocks = <&ahb_clk>; }; - i2c1: i2c@FF121000 { + i2c1: i2c@ff121000 { #address-cells = <1>; #size-cells = <0>; compatible = "snps,designware-i2c"; - reg = <0xFF121000 0x1000>; + reg = <0xff121000 0x1000>; interrupt-parent = <&tb10x_ictl>; interrupts = <12 8>; clocks = <&ahb_clk>; }; - i2c2: i2c@FF122000 { + i2c2: i2c@ff122000 { #address-cells = <1>; #size-cells = <0>; compatible = "snps,designware-i2c"; - reg = <0xFF122000 0x1000>; + reg = <0xff122000 0x1000>; interrupt-parent = <&tb10x_ictl>; interrupts = <12 8>; clocks = <&ahb_clk>; }; - i2c3: i2c@FF123000 { + i2c3: i2c@ff123000 { #address-cells = <1>; #size-cells = <0>; compatible = "snps,designware-i2c"; - reg = <0xFF123000 0x1000>; + reg = <0xff123000 0x1000>; interrupt-parent = <&tb10x_ictl>; interrupts = <12 8>; clocks = <&ahb_clk>; }; - i2c4: i2c@FF124000 { + i2c4: i2c@ff124000 { #address-cells = <1>; #size-cells = <0>; compatible = "snps,designware-i2c"; - reg = <0xFF124000 0x1000>; + reg = <0xff124000 0x1000>; interrupt-parent = <&tb10x_ictl>; interrupts = <12 8>; clocks = <&ahb_clk>; }; - spi0: spi@0xFE010000 { + spi0: spi@fe010000 { 
#address-cells = <1>; #size-cells = <0>; cell-index = <0>; compatible = "abilis,tb100-spi"; num-cs = <1>; - reg = <0xFE010000 0x20>; + reg = <0xfe010000 0x20>; interrupt-parent = <&tb10x_ictl>; interrupts = <26 8>; clocks = <&ahb_clk>; }; - spi1: spi@0xFE011000 { + spi1: spi@fe011000 { #address-cells = <1>; #size-cells = <0>; cell-index = <1>; compatible = "abilis,tb100-spi"; num-cs = <2>; - reg = <0xFE011000 0x20>; + reg = <0xfe011000 0x20>; interrupt-parent = <&tb10x_ictl>; interrupts = <10 8>; clocks = <&ahb_clk>; @@ -226,23 +226,23 @@ interrupts = <20 2>, <19 2>; interrupt-names = "cmd_irq", "event_irq"; }; - tb10x_mdsc0: tb10x-mdscr@FF300000 { + tb10x_mdsc0: tb10x-mdscr@ff300000 { compatible = "abilis,tb100-mdscr"; - reg = <0xFF300000 0x7000>; + reg = <0xff300000 0x7000>; tb100-mdscr-manage-tsin; }; - tb10x_mscr0: tb10x-mdscr@FF307000 { + tb10x_mscr0: tb10x-mdscr@ff307000 { compatible = "abilis,tb100-mdscr"; - reg = <0xFF307000 0x7000>; + reg = <0xff307000 0x7000>; }; tb10x_scr0: tb10x-mdscr@ff30e000 { compatible = "abilis,tb100-mdscr"; - reg = <0xFF30e000 0x4000>; + reg = <0xff30e000 0x4000>; tb100-mdscr-manage-tsin; }; tb10x_scr1: tb10x-mdscr@ff312000 { compatible = "abilis,tb100-mdscr"; - reg = <0xFF312000 0x4000>; + reg = <0xff312000 0x4000>; tb100-mdscr-manage-tsin; }; tb10x_wfb: tb10x-wfb@ff319000 { diff --git a/arch/arc/boot/dts/axc001.dtsi b/arch/arc/boot/dts/axc001.dtsi index fdc266504ada..37be3bf03ad6 100644 --- a/arch/arc/boot/dts/axc001.dtsi +++ b/arch/arc/boot/dts/axc001.dtsi @@ -41,7 +41,7 @@ * this GPIO block ORs all interrupts on CPU card (creg,..) * to uplink only 1 IRQ to ARC core intc */ - dw-apb-gpio@0x2000 { + dw-apb-gpio@2000 { compatible = "snps,dw-apb-gpio"; reg = < 0x2000 0x80 >; #address-cells = <1>; @@ -60,7 +60,7 @@ }; }; - debug_uart: dw-apb-uart@0x5000 { + debug_uart: dw-apb-uart@5000 { compatible = "snps,dw-apb-uart"; reg = <0x5000 0x100>; clock-frequency = <33333000>; @@ -88,7 +88,7 @@ * avoid duplicating the MB dtsi file given that IRQ from * this intc to cpu intc are different for axs101 and axs103 */ - mb_intc: dw-apb-ictl@0xe0012000 { + mb_intc: dw-apb-ictl@e0012000 { #interrupt-cells = <1>; compatible = "snps,dw-apb-ictl"; reg = < 0x0 0xe0012000 0x0 0x200 >; diff --git a/arch/arc/boot/dts/axc003.dtsi b/arch/arc/boot/dts/axc003.dtsi index d75d65ddf8e3..effa37536d7a 100644 --- a/arch/arc/boot/dts/axc003.dtsi +++ b/arch/arc/boot/dts/axc003.dtsi @@ -55,7 +55,7 @@ * this GPIO block ORs all interrupts on CPU card (creg,..) * to uplink only 1 IRQ to ARC core intc */ - dw-apb-gpio@0x2000 { + dw-apb-gpio@2000 { compatible = "snps,dw-apb-gpio"; reg = < 0x2000 0x80 >; #address-cells = <1>; @@ -74,7 +74,7 @@ }; }; - debug_uart: dw-apb-uart@0x5000 { + debug_uart: dw-apb-uart@5000 { compatible = "snps,dw-apb-uart"; reg = <0x5000 0x100>; clock-frequency = <33333000>; @@ -102,19 +102,19 @@ * external DMA buffer located outside of IOC aperture. */ axs10x_mb { - ethernet@0x18000 { + ethernet@18000 { dma-coherent; }; - ehci@0x40000 { + ehci@40000 { dma-coherent; }; - ohci@0x60000 { + ohci@60000 { dma-coherent; }; - mmc@0x15000 { + mmc@15000 { dma-coherent; }; }; @@ -132,7 +132,7 @@ * avoid duplicating the MB dtsi file given that IRQ from * this intc to cpu intc are different for axs101 and axs103 */ - mb_intc: dw-apb-ictl@0xe0012000 { + mb_intc: dw-apb-ictl@e0012000 { #interrupt-cells = <1>; compatible = "snps,dw-apb-ictl"; reg = < 0x0 0xe0012000 0x0 0x200 >; @@ -153,7 +153,7 @@ #size-cells = <2>; ranges; /* - * Move frame buffer out of IOC aperture (0x8z-0xAz). 
+ * Move frame buffer out of IOC aperture (0x8z-0xaz). */ frame_buffer: frame_buffer@be000000 { compatible = "shared-dma-pool"; diff --git a/arch/arc/boot/dts/axc003_idu.dtsi b/arch/arc/boot/dts/axc003_idu.dtsi index a05bb737ea63..e401e59f6180 100644 --- a/arch/arc/boot/dts/axc003_idu.dtsi +++ b/arch/arc/boot/dts/axc003_idu.dtsi @@ -62,7 +62,7 @@ * this GPIO block ORs all interrupts on CPU card (creg,..) * to uplink only 1 IRQ to ARC core intc */ - dw-apb-gpio@0x2000 { + dw-apb-gpio@2000 { compatible = "snps,dw-apb-gpio"; reg = < 0x2000 0x80 >; #address-cells = <1>; @@ -81,7 +81,7 @@ }; }; - debug_uart: dw-apb-uart@0x5000 { + debug_uart: dw-apb-uart@5000 { compatible = "snps,dw-apb-uart"; reg = <0x5000 0x100>; clock-frequency = <33333000>; @@ -109,19 +109,19 @@ * external DMA buffer located outside of IOC aperture. */ axs10x_mb { - ethernet@0x18000 { + ethernet@18000 { dma-coherent; }; - ehci@0x40000 { + ehci@40000 { dma-coherent; }; - ohci@0x60000 { + ohci@60000 { dma-coherent; }; - mmc@0x15000 { + mmc@15000 { dma-coherent; }; }; @@ -138,7 +138,7 @@ * avoid duplicating the MB dtsi file given that IRQ from * this intc to cpu intc are different for axs101 and axs103 */ - mb_intc: dw-apb-ictl@0xe0012000 { + mb_intc: dw-apb-ictl@e0012000 { #interrupt-cells = <1>; compatible = "snps,dw-apb-ictl"; reg = < 0x0 0xe0012000 0x0 0x200 >; @@ -159,7 +159,7 @@ #size-cells = <2>; ranges; /* - * Move frame buffer out of IOC aperture (0x8z-0xAz). + * Move frame buffer out of IOC aperture (0x8z-0xaz). */ frame_buffer: frame_buffer@be000000 { compatible = "shared-dma-pool"; diff --git a/arch/arc/boot/dts/axs10x_mb.dtsi b/arch/arc/boot/dts/axs10x_mb.dtsi index 37bafd44e36d..4ead6dc9af2f 100644 --- a/arch/arc/boot/dts/axs10x_mb.dtsi +++ b/arch/arc/boot/dts/axs10x_mb.dtsi @@ -72,7 +72,7 @@ }; }; - gmac: ethernet@0x18000 { + gmac: ethernet@18000 { #interrupt-cells = <1>; compatible = "snps,dwmac"; reg = < 0x18000 0x2000 >; @@ -88,13 +88,13 @@ mac-address = [00 00 00 00 00 00]; /* Filled in by U-Boot */ }; - ehci@0x40000 { + ehci@40000 { compatible = "generic-ehci"; reg = < 0x40000 0x100 >; interrupts = < 8 >; }; - ohci@0x60000 { + ohci@60000 { compatible = "generic-ohci"; reg = < 0x60000 0x100 >; interrupts = < 8 >; @@ -118,7 +118,7 @@ * dw_mci_pltfm_prepare_command() is used in generic platform * code. 
*/ - mmc@0x15000 { + mmc@15000 { compatible = "altr,socfpga-dw-mshc"; reg = < 0x15000 0x400 >; fifo-depth = < 16 >; @@ -129,7 +129,7 @@ bus-width = < 4 >; }; - uart@0x20000 { + uart@20000 { compatible = "snps,dw-apb-uart"; reg = <0x20000 0x100>; clock-frequency = <33333333>; @@ -139,7 +139,7 @@ reg-io-width = <4>; }; - uart@0x21000 { + uart@21000 { compatible = "snps,dw-apb-uart"; reg = <0x21000 0x100>; clock-frequency = <33333333>; @@ -150,7 +150,7 @@ }; /* UART muxed with USB data port (ttyS3) */ - uart@0x22000 { + uart@22000 { compatible = "snps,dw-apb-uart"; reg = <0x22000 0x100>; clock-frequency = <33333333>; @@ -160,7 +160,7 @@ reg-io-width = <4>; }; - i2c@0x1d000 { + i2c@1d000 { compatible = "snps,designware-i2c"; reg = <0x1d000 0x100>; clock-frequency = <400000>; @@ -177,7 +177,7 @@ #sound-dai-cells = <0>; }; - i2c@0x1f000 { + i2c@1f000 { compatible = "snps,designware-i2c"; #address-cells = <1>; #size-cells = <0>; @@ -218,13 +218,13 @@ }; }; - eeprom@0x54{ + eeprom@54{ compatible = "atmel,24c01"; reg = <0x54>; pagesize = <0x8>; }; - eeprom@0x57{ + eeprom@57{ compatible = "atmel,24c04"; reg = <0x57>; pagesize = <0x8>; diff --git a/arch/arc/boot/dts/hsdk.dts b/arch/arc/boot/dts/hsdk.dts index 43f17b51ee89..6c23fa063ced 100644 --- a/arch/arc/boot/dts/hsdk.dts +++ b/arch/arc/boot/dts/hsdk.dts @@ -110,12 +110,12 @@ cgu_rst: reset-controller@8a0 { compatible = "snps,hsdk-reset"; #reset-cells = <1>; - reg = <0x8A0 0x4>, <0xFF0 0x4>; + reg = <0x8a0 0x4>, <0xff0 0x4>; }; core_clk: core-clk@0 { compatible = "snps,hsdk-core-pll-clock"; - reg = <0x00 0x10>, <0x14B8 0x4>; + reg = <0x00 0x10>, <0x14b8 0x4>; #clock-cells = <0>; clocks = <&input_clk>; diff --git a/arch/arc/boot/dts/vdk_axc003.dtsi b/arch/arc/boot/dts/vdk_axc003.dtsi index 0fd6ba985b16..84e8766c8ca2 100644 --- a/arch/arc/boot/dts/vdk_axc003.dtsi +++ b/arch/arc/boot/dts/vdk_axc003.dtsi @@ -36,7 +36,7 @@ #interrupt-cells = <1>; }; - debug_uart: dw-apb-uart@0x5000 { + debug_uart: dw-apb-uart@5000 { compatible = "snps,dw-apb-uart"; reg = <0x5000 0x100>; clock-frequency = <2403200>; @@ -49,7 +49,7 @@ }; - mb_intc: dw-apb-ictl@0xe0012000 { + mb_intc: dw-apb-ictl@e0012000 { #interrupt-cells = <1>; compatible = "snps,dw-apb-ictl"; reg = < 0xe0012000 0x200 >; diff --git a/arch/arc/boot/dts/vdk_axc003_idu.dtsi b/arch/arc/boot/dts/vdk_axc003_idu.dtsi index 28956f9a9f3d..eb7e705e8a27 100644 --- a/arch/arc/boot/dts/vdk_axc003_idu.dtsi +++ b/arch/arc/boot/dts/vdk_axc003_idu.dtsi @@ -44,7 +44,7 @@ #interrupt-cells = <1>; }; - debug_uart: dw-apb-uart@0x5000 { + debug_uart: dw-apb-uart@5000 { compatible = "snps,dw-apb-uart"; reg = <0x5000 0x100>; clock-frequency = <2403200>; @@ -57,7 +57,7 @@ }; - mb_intc: dw-apb-ictl@0xe0012000 { + mb_intc: dw-apb-ictl@e0012000 { #interrupt-cells = <1>; compatible = "snps,dw-apb-ictl"; reg = < 0xe0012000 0x200 >; diff --git a/arch/arc/boot/dts/vdk_axs10x_mb.dtsi b/arch/arc/boot/dts/vdk_axs10x_mb.dtsi index 48bb4b4cd234..925d5cc95dbb 100644 --- a/arch/arc/boot/dts/vdk_axs10x_mb.dtsi +++ b/arch/arc/boot/dts/vdk_axs10x_mb.dtsi @@ -36,7 +36,7 @@ }; }; - ethernet@0x18000 { + ethernet@18000 { #interrupt-cells = <1>; compatible = "snps,dwmac"; reg = < 0x18000 0x2000 >; @@ -49,13 +49,13 @@ clock-names = "stmmaceth"; }; - ehci@0x40000 { + ehci@40000 { compatible = "generic-ehci"; reg = < 0x40000 0x100 >; interrupts = < 8 >; }; - uart@0x20000 { + uart@20000 { compatible = "snps,dw-apb-uart"; reg = <0x20000 0x100>; clock-frequency = <2403200>; @@ -65,7 +65,7 @@ reg-io-width = <4>; }; - uart@0x21000 { + uart@21000 { 
compatible = "snps,dw-apb-uart"; reg = <0x21000 0x100>; clock-frequency = <2403200>; @@ -75,7 +75,7 @@ reg-io-width = <4>; }; - uart@0x22000 { + uart@22000 { compatible = "snps,dw-apb-uart"; reg = <0x22000 0x100>; clock-frequency = <2403200>; @@ -101,7 +101,7 @@ interrupt-names = "arc_ps2_irq"; }; - mmc@0x15000 { + mmc@15000 { compatible = "snps,dw-mshc"; reg = <0x15000 0x400>; fifo-depth = <1024>; @@ -117,11 +117,11 @@ * Embedded Vision subsystem UIO mappings; only relevant for EV VDK * * This node is intentionally put outside of MB above becase - * it maps areas outside of MB's 0xEz-0xFz. + * it maps areas outside of MB's 0xez-0xfz. */ - uio_ev: uio@0xD0000000 { + uio_ev: uio@d0000000 { compatible = "generic-uio"; - reg = <0xD0000000 0x2000 0xD1000000 0x2000 0x90000000 0x10000000 0xC0000000 0x10000000>; + reg = <0xd0000000 0x2000 0xd1000000 0x2000 0x90000000 0x10000000 0xc0000000 0x10000000>; reg-names = "ev_gsa", "ev_ctrl", "ev_shared_mem", "ev_code_mem"; interrupt-parent = <&mb_intc>; interrupts = <23>; From 66f7d3709c431d37a0b7f2b59ed00fbf93de1f0d Mon Sep 17 00:00:00 2001 From: Eugeniy Paltsev Date: Fri, 15 Feb 2019 19:17:33 +0300 Subject: [PATCH 237/855] ARC: [plat-hsdk]: Add reset controller handle to manage USB reset DW USB controller on HSDK hangs sometimes after SW reset, so add reset handle to make possible to reset DW USB controller HW. Signed-off-by: Eugeniy Paltsev Signed-off-by: Vineet Gupta --- arch/arc/boot/dts/hsdk.dts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arc/boot/dts/hsdk.dts b/arch/arc/boot/dts/hsdk.dts index 6c23fa063ced..b62088bf03a8 100644 --- a/arch/arc/boot/dts/hsdk.dts +++ b/arch/arc/boot/dts/hsdk.dts @@ -200,6 +200,7 @@ compatible = "snps,hsdk-v1.0-ohci", "generic-ohci"; reg = <0x60000 0x100>; interrupts = <15>; + resets = <&cgu_rst HSDK_USB_RESET>; dma-coherent; }; @@ -207,6 +208,7 @@ compatible = "snps,hsdk-v1.0-ehci", "generic-ehci"; reg = <0x40000 0x100>; interrupts = <15>; + resets = <&cgu_rst HSDK_USB_RESET>; dma-coherent; }; From 5d4ab8d0960e399e85a55e659b11cfc86a03a776 Mon Sep 17 00:00:00 2001 From: Eugeniy Paltsev Date: Tue, 19 Feb 2019 13:52:26 +0300 Subject: [PATCH 238/855] ARC: [plat-hsdk]: Enable AXI DW DMAC support Signed-off-by: Eugeniy Paltsev Signed-off-by: Vineet Gupta --- arch/arc/boot/dts/hsdk.dts | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/arch/arc/boot/dts/hsdk.dts b/arch/arc/boot/dts/hsdk.dts index b62088bf03a8..69bc1c9e8e50 100644 --- a/arch/arc/boot/dts/hsdk.dts +++ b/arch/arc/boot/dts/hsdk.dts @@ -167,6 +167,18 @@ #clock-cells = <0>; }; + dmac_core_clk: dmac-core-clk { + compatible = "fixed-clock"; + clock-frequency = <400000000>; + #clock-cells = <0>; + }; + + dmac_cfg_clk: dmac-gpu-cfg-clk { + compatible = "fixed-clock"; + clock-frequency = <200000000>; + #clock-cells = <0>; + }; + gmac: ethernet@8000 { #interrupt-cells = <1>; compatible = "snps,dwmac"; @@ -239,6 +251,21 @@ reg = <0>; }; }; + + dmac: dmac@80000 { + compatible = "snps,axi-dma-1.01a"; + reg = <0x80000 0x400>; + interrupts = <27>; + clocks = <&dmac_core_clk>, <&dmac_cfg_clk>; + clock-names = "core-clk", "cfgr-clk"; + + dma-channels = <4>; + snps,dma-masters = <2>; + snps,data-width = <3>; + snps,block-size = <4096 4096 4096 4096>; + snps,priority = <0 1 2 3>; + snps,axi-max-burst-len = <16>; + }; }; memory@80000000 { From 4d1e7918aae59ef504f5170a4f0c7ae82339fcb2 Mon Sep 17 00:00:00 2001 From: Eugeniy Paltsev Date: Wed, 30 Jan 2019 19:32:43 +0300 Subject: [PATCH 239/855] ARCv2: lib: introduce memcpy optimized for 
unaligned access Optimise code to use efficient unaligned memory access which is available on ARCv2. This allows us to really simplify memcpy code and speed up the code one and a half times (in case of unaligned source or destination). Don't wire it up yet ! Signed-off-by: Eugeniy Paltsev Signed-off-by: Vineet Gupta --- arch/arc/lib/memcpy-archs-unaligned.S | 47 +++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 arch/arc/lib/memcpy-archs-unaligned.S diff --git a/arch/arc/lib/memcpy-archs-unaligned.S b/arch/arc/lib/memcpy-archs-unaligned.S new file mode 100644 index 000000000000..28993a73fdde --- /dev/null +++ b/arch/arc/lib/memcpy-archs-unaligned.S @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * ARCv2 memcpy implementation optimized for unaligned memory access using. + * + * Copyright (C) 2019 Synopsys + * Author: Eugeniy Paltsev + */ + +#include + +#ifdef CONFIG_ARC_HAS_LL64 +# define LOADX(DST,RX) ldd.ab DST, [RX, 8] +# define STOREX(SRC,RX) std.ab SRC, [RX, 8] +# define ZOLSHFT 5 +# define ZOLAND 0x1F +#else +# define LOADX(DST,RX) ld.ab DST, [RX, 4] +# define STOREX(SRC,RX) st.ab SRC, [RX, 4] +# define ZOLSHFT 4 +# define ZOLAND 0xF +#endif + +ENTRY_CFI(memcpy) + mov r3, r0 ; don;t clobber ret val + + lsr.f lp_count, r2, ZOLSHFT + lpnz @.Lcopy32_64bytes + ;; LOOP START + LOADX (r6, r1) + LOADX (r8, r1) + LOADX (r10, r1) + LOADX (r4, r1) + STOREX (r6, r3) + STOREX (r8, r3) + STOREX (r10, r3) + STOREX (r4, r3) +.Lcopy32_64bytes: + + and.f lp_count, r2, ZOLAND ;Last remaining 31 bytes + lpnz @.Lcopyremainingbytes + ;; LOOP START + ldb.ab r5, [r1, 1] + stb.ab r5, [r3, 1] +.Lcopyremainingbytes: + + j [blink] +END_CFI(memcpy) From 76551468833cd5c356b1d9ff4bc9393fcf768a59 Mon Sep 17 00:00:00 2001 From: Eugeniy Paltsev Date: Wed, 30 Jan 2019 19:32:41 +0300 Subject: [PATCH 240/855] ARCv2: Add explcit unaligned access support (and ability to disable too) As of today we enable unaligned access unconditionally on ARCv2. Do this under a Kconfig option to allow disable it for test, benchmarking etc. Also while at it - Select HAVE_EFFICIENT_UNALIGNED_ACCESS - Although gcc defaults to unaligned access (since GNU 2018.03), add the right toggles for enabling or disabling as appropriate - update bootlog to prints both HW feature status (exists, enabled/disabled) and SW status (used / not used). - wire up the relaxed memcpy for unaligned access Signed-off-by: Eugeniy Paltsev Signed-off-by: Vineet Gupta [vgupta: squashed patches, handle gcc -mno-unaligned-access quick] --- arch/arc/Kconfig | 9 +++++++++ arch/arc/Makefile | 6 ++++++ arch/arc/include/asm/arcregs.h | 1 + arch/arc/include/asm/irqflags-arcv2.h | 8 +++++++- arch/arc/kernel/head.S | 5 +++++ arch/arc/kernel/intc-arcv2.c | 2 +- arch/arc/kernel/setup.c | 27 +++++++++++++++++---------- arch/arc/kernel/troubleshoot.c | 5 ++++- arch/arc/lib/Makefile | 8 +++++++- 9 files changed, 57 insertions(+), 14 deletions(-) diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index d750b302d5ab..95fb11a85566 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -386,6 +386,15 @@ config ARC_HAS_SWAPE if ISA_ARCV2 +config ARC_USE_UNALIGNED_MEM_ACCESS + bool "Enable unaligned access in HW" + default y + select HAVE_EFFICIENT_UNALIGNED_ACCESS + help + The ARC HS architecture supports unaligned memory access + which is disabled by default. 
Enable unaligned access in + hardware and use software to use it + config ARC_HAS_LL64 bool "Insn: 64bit LDD/STD" help diff --git a/arch/arc/Makefile b/arch/arc/Makefile index df00578c279d..e2b991f75bc5 100644 --- a/arch/arc/Makefile +++ b/arch/arc/Makefile @@ -28,6 +28,12 @@ cflags-$(CONFIG_ARC_HAS_SWAPE) += -mswape ifdef CONFIG_ISA_ARCV2 +ifdef CONFIG_ARC_USE_UNALIGNED_MEM_ACCESS +cflags-y += -munaligned-access +else +cflags-y += -mno-unaligned-access +endif + ifndef CONFIG_ARC_HAS_LL64 cflags-y += -mno-ll64 endif diff --git a/arch/arc/include/asm/arcregs.h b/arch/arc/include/asm/arcregs.h index a27eafdc8260..0c13317af1d6 100644 --- a/arch/arc/include/asm/arcregs.h +++ b/arch/arc/include/asm/arcregs.h @@ -82,6 +82,7 @@ #define ECR_V_DTLB_MISS 0x05 #define ECR_V_PROTV 0x06 #define ECR_V_TRAP 0x09 +#define ECR_V_MISALIGN 0x0d #endif /* DTLB Miss and Protection Violation Cause Codes */ diff --git a/arch/arc/include/asm/irqflags-arcv2.h b/arch/arc/include/asm/irqflags-arcv2.h index 8a4f77ea3238..e66d0339e1d8 100644 --- a/arch/arc/include/asm/irqflags-arcv2.h +++ b/arch/arc/include/asm/irqflags-arcv2.h @@ -44,7 +44,13 @@ #define ARCV2_IRQ_DEF_PRIO 1 /* seed value for status register */ -#define ISA_INIT_STATUS_BITS (STATUS_IE_MASK | STATUS_AD_MASK | \ +#ifdef CONFIG_ARC_USE_UNALIGNED_MEM_ACCESS +#define __AD_ENB STATUS_AD_MASK +#else +#define __AD_ENB 0 +#endif + +#define ISA_INIT_STATUS_BITS (STATUS_IE_MASK | __AD_ENB | \ (ARCV2_IRQ_DEF_PRIO << 1)) #ifndef __ASSEMBLY__ diff --git a/arch/arc/kernel/head.S b/arch/arc/kernel/head.S index 30e090625916..36774348d23b 100644 --- a/arch/arc/kernel/head.S +++ b/arch/arc/kernel/head.S @@ -54,7 +54,12 @@ ; gcc 7.3.1 (ARC GNU 2018.03) onwards generates unaligned access ; by default lr r5, [status32] +#ifdef CONFIG_ARC_USE_UNALIGNED_MEM_ACCESS bset r5, r5, STATUS_AD_BIT +#else + ; Although disabled at reset, bootloader might have enabled it + bclr r5, r5, STATUS_AD_BIT +#endif kflag r5 #endif .endm diff --git a/arch/arc/kernel/intc-arcv2.c b/arch/arc/kernel/intc-arcv2.c index cf18b3e5a934..c0d0124de089 100644 --- a/arch/arc/kernel/intc-arcv2.c +++ b/arch/arc/kernel/intc-arcv2.c @@ -95,7 +95,7 @@ void arc_init_IRQ(void) /* setup status32, don't enable intr yet as kernel doesn't want */ tmp = read_aux_reg(ARC_REG_STATUS32); - tmp |= STATUS_AD_MASK | (ARCV2_IRQ_DEF_PRIO << 1); + tmp |= ARCV2_IRQ_DEF_PRIO << 1; tmp &= ~STATUS_IE_MASK; asm volatile("kflag %0 \n"::"r"(tmp)); } diff --git a/arch/arc/kernel/setup.c b/arch/arc/kernel/setup.c index 7b2340996cf8..2266e4ee4142 100644 --- a/arch/arc/kernel/setup.c +++ b/arch/arc/kernel/setup.c @@ -263,7 +263,7 @@ static char *arc_cpu_mumbojumbo(int cpu_id, char *buf, int len) { struct cpuinfo_arc *cpu = &cpuinfo_arc700[cpu_id]; struct bcr_identity *core = &cpu->core; - int i, n = 0, ua = 0; + int n = 0; FIX_PTR(cpu); @@ -283,16 +283,23 @@ static char *arc_cpu_mumbojumbo(int cpu_id, char *buf, int len) IS_AVAIL2(cpu->extn.rtc, "RTC [UP 64-bit] ", CONFIG_ARC_TIMERS_64BIT), IS_AVAIL2(cpu->extn.gfrc, "GFRC [SMP 64-bit] ", CONFIG_ARC_TIMERS_64BIT)); -#ifdef __ARC_UNALIGNED__ - ua = 1; -#endif - n += i = scnprintf(buf + n, len - n, "%s%s%s%s%s%s", - IS_AVAIL2(cpu->isa.atomic, "atomic ", CONFIG_ARC_HAS_LLSC), - IS_AVAIL2(cpu->isa.ldd, "ll64 ", CONFIG_ARC_HAS_LL64), - IS_AVAIL1(cpu->isa.unalign, "unalign "), IS_USED_RUN(ua)); + n += scnprintf(buf + n, len - n, "%s%s%s%s%s%s", + IS_AVAIL2(cpu->isa.atomic, "atomic ", CONFIG_ARC_HAS_LLSC), + IS_AVAIL2(cpu->isa.ldd, "ll64 ", CONFIG_ARC_HAS_LL64), + IS_AVAIL2(cpu->isa.unalign, 
"unalign ", CONFIG_ARC_USE_UNALIGNED_MEM_ACCESS)); - if (i) - n += scnprintf(buf + n, len - n, "\n\t\t: "); +#if defined(__ARC_UNALIGNED__) && !defined(CONFIG_ARC_USE_UNALIGNED_MEM_ACCESS) + /* + * gcc 7.3.1 (GNU 2018.03) onwards generate unaligned access by default + * but -mno-unaligned-access to disable that didn't work until gcc 8.2.1 + * (GNU 2019.03). So landing here implies the interim period, when + * despite Kconfig being off, gcc is generating unaligned accesses which + * could bomb later on. So better to disallow such broken builds + */ + BUILD_BUG_ON_MSG(1, "gcc doesn't support -mno-unaligned-access"); +#endif + + n += scnprintf(buf + n, len - n, "\n\t\t: "); if (cpu->extn_mpy.ver) { if (cpu->extn_mpy.ver <= 0x2) { /* ARCompact */ diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c index 215f515442e0..b0aa8c028331 100644 --- a/arch/arc/kernel/troubleshoot.c +++ b/arch/arc/kernel/troubleshoot.c @@ -145,7 +145,8 @@ static void show_ecr_verbose(struct pt_regs *regs) } else if (vec == ECR_V_PROTV) { if (cause_code == ECR_C_PROTV_INST_FETCH) pr_cont("Execute from Non-exec Page\n"); - else if (cause_code == ECR_C_PROTV_MISALIG_DATA) + else if (cause_code == ECR_C_PROTV_MISALIG_DATA && + IS_ENABLED(CONFIG_ISA_ARCOMPACT)) pr_cont("Misaligned r/w from 0x%08lx\n", address); else pr_cont("%s access not allowed on page\n", @@ -161,6 +162,8 @@ static void show_ecr_verbose(struct pt_regs *regs) pr_cont("Bus Error from Data Mem\n"); else pr_cont("Bus Error, check PRM\n"); + } else if (vec == ECR_V_MISALIGN) { + pr_cont("Misaligned r/w from 0x%08lx\n", address); #endif } else if (vec == ECR_V_TRAP) { if (regs->ecr_param == 5) diff --git a/arch/arc/lib/Makefile b/arch/arc/lib/Makefile index b1656d156097..f7537b466b23 100644 --- a/arch/arc/lib/Makefile +++ b/arch/arc/lib/Makefile @@ -8,4 +8,10 @@ lib-y := strchr-700.o strcpy-700.o strlen.o memcmp.o lib-$(CONFIG_ISA_ARCOMPACT) += memcpy-700.o memset.o strcmp.o -lib-$(CONFIG_ISA_ARCV2) += memcpy-archs.o memset-archs.o strcmp-archs.o +lib-$(CONFIG_ISA_ARCV2) += memset-archs.o strcmp-archs.o + +ifdef CONFIG_ARC_USE_UNALIGNED_MEM_ACCESS +lib-$(CONFIG_ISA_ARCV2) +=memcpy-archs-unaligned.o +else +lib-$(CONFIG_ISA_ARCV2) +=memcpy-archs.o +endif From fbe025c3eaf5f52060c2820eddb7357363db0d27 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Fri, 22 Feb 2019 10:42:44 -0800 Subject: [PATCH 241/855] ARC: perf: bpok condition only exists for ARCompact Signed-off-by: Vineet Gupta --- arch/arc/include/asm/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arc/include/asm/perf_event.h b/arch/arc/include/asm/perf_event.h index 6958545390f0..9cd7ee4fad39 100644 --- a/arch/arc/include/asm/perf_event.h +++ b/arch/arc/include/asm/perf_event.h @@ -105,10 +105,10 @@ static const char * const arc_pmu_ev_hw_map[] = { [PERF_COUNT_HW_INSTRUCTIONS] = "iall", /* All jump instructions that are taken */ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = "ijmptak", - [PERF_COUNT_ARC_BPOK] = "bpok", /* NP-NT, PT-T, PNT-NT */ #ifdef CONFIG_ISA_ARCV2 [PERF_COUNT_HW_BRANCH_MISSES] = "bpmp", #else + [PERF_COUNT_ARC_BPOK] = "bpok", /* NP-NT, PT-T, PNT-NT */ [PERF_COUNT_HW_BRANCH_MISSES] = "bpfail", /* NP-T, PT-NT, PNT-T */ #endif [PERF_COUNT_ARC_LDC] = "imemrdc", /* Instr: mem read cached */ From edb64bca50cd736c6894cc6081d5263c007ce005 Mon Sep 17 00:00:00 2001 From: Eugeniy Paltsev Date: Mon, 25 Feb 2019 20:16:01 +0300 Subject: [PATCH 242/855] ARC: u-boot args: check that magic number is correct In case of devboards we really often 
disable the bootloader and load the Linux image into memory via JTAG. Even if the kernel tries to verify uboot_tag and uboot_arg, there is still a chance that we treat some garbage in registers as valid U-boot arguments in the JTAG case. E.g. it is enough to have '1' in r0 to treat any value in r2 as a boot command line. So check that the magic number passed from U-boot is correct and drop the U-boot arguments otherwise. That helps to reduce the possibility of using garbage as U-boot arguments in the JTAG case. We can safely check the U-boot magic value (0x0) passed to Linux via the r1 register, as U-boot has passed it from the beginning, so there are no backward-compatibility issues. Signed-off-by: Eugeniy Paltsev Signed-off-by: Vineet Gupta --- arch/arc/kernel/head.S | 1 + arch/arc/kernel/setup.c | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/arch/arc/kernel/head.S b/arch/arc/kernel/head.S index 36774348d23b..8f6e0447dd17 100644 --- a/arch/arc/kernel/head.S +++ b/arch/arc/kernel/head.S @@ -111,6 +111,7 @@ ENTRY(stext) ; r2 = pointer to uboot provided cmdline or external DTB in mem ; These are handled later in handle_uboot_args() st r0, [@uboot_tag] + st r1, [@uboot_magic] st r2, [@uboot_arg] ; setup "current" tsk and optionally cache it in dedicated r25 diff --git a/arch/arc/kernel/setup.c b/arch/arc/kernel/setup.c index 2266e4ee4142..ec2fd231ecd9 100644 --- a/arch/arc/kernel/setup.c +++ b/arch/arc/kernel/setup.c @@ -36,6 +36,7 @@ unsigned int intr_to_DE_cnt; /* Part of U-boot ABI: see head.S */ int __initdata uboot_tag; +int __initdata uboot_magic; char __initdata *uboot_arg; const struct machine_desc *machine_desc; @@ -504,6 +505,8 @@ static inline bool uboot_arg_invalid(unsigned long addr) #define UBOOT_TAG_NONE 0 #define UBOOT_TAG_CMDLINE 1 #define UBOOT_TAG_DTB 2 +/* We always pass 0 as magic from U-boot */ +#define UBOOT_MAGIC_VALUE 0 void __init handle_uboot_args(void) { @@ -518,6 +521,11 @@ void __init handle_uboot_args(void) goto ignore_uboot_args; } + if (uboot_magic != UBOOT_MAGIC_VALUE) { + pr_warn(IGNORE_ARGS "non zero uboot magic\n"); + goto ignore_uboot_args; + } + if (uboot_tag != UBOOT_TAG_NONE && uboot_arg_invalid((unsigned long)uboot_arg)) { pr_warn(IGNORE_ARGS "invalid uboot arg: '%px'\n", uboot_arg); From 0728aeb7ead99a9b0dac2f3c92b3752b4e02ff97 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Mon, 25 Feb 2019 09:45:38 +0000 Subject: [PATCH 243/855] arc: hsdk_defconfig: Enable CONFIG_BLK_DEV_RAM We now have an HSDK device in our kernelci lab, but a kernel built via hsdk_defconfig lacks ramfs support, so it cannot boot kernelci jobs yet. So this patch enables CONFIG_BLK_DEV_RAM in hsdk_defconfig. Signed-off-by: Corentin Labbe Acked-by: Alexey Brodkin Signed-off-by: Vineet Gupta --- arch/arc/configs/hsdk_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arc/configs/hsdk_defconfig b/arch/arc/configs/hsdk_defconfig index 87b23b7fb781..aefcf7a4e17a 100644 --- a/arch/arc/configs/hsdk_defconfig +++ b/arch/arc/configs/hsdk_defconfig @@ -8,6 +8,7 @@ CONFIG_NAMESPACES=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y +CONFIG_BLK_DEV_RAM=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set From 00a4ae65cc600b008c80429a4fa37ccee21f139e Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Mon, 25 Feb 2019 11:24:05 -0800 Subject: [PATCH 244/855] ARCv2: boot log: refurbish HS core/release identification HS core names and releases have so far been identified based solely on the IDENTIFY.ARCVER field.
With the future HS releases this will not be sufficient as same ARCVER 0x54 could be an HS38 or HS48. So rewrite the code to use a new BCR to identify the cores properly. Signed-off-by: Vineet Gupta --- arch/arc/include/asm/arcregs.h | 2 +- arch/arc/kernel/setup.c | 130 +++++++++++++++++++-------------- 2 files changed, 75 insertions(+), 57 deletions(-) diff --git a/arch/arc/include/asm/arcregs.h b/arch/arc/include/asm/arcregs.h index 0c13317af1d6..36e5d124f5ae 100644 --- a/arch/arc/include/asm/arcregs.h +++ b/arch/arc/include/asm/arcregs.h @@ -313,7 +313,7 @@ struct cpuinfo_arc { struct cpuinfo_arc_bpu bpu; struct bcr_identity core; struct bcr_isa_arcv2 isa; - const char *details, *name; + const char *release, *name; unsigned int vec_base; struct cpuinfo_arc_ccm iccm, dccm; struct { diff --git a/arch/arc/kernel/setup.c b/arch/arc/kernel/setup.c index ec2fd231ecd9..6bd466892915 100644 --- a/arch/arc/kernel/setup.c +++ b/arch/arc/kernel/setup.c @@ -45,29 +45,24 @@ struct task_struct *_current_task[NR_CPUS]; /* For stack switching */ struct cpuinfo_arc cpuinfo_arc700[NR_CPUS]; -static const struct id_to_str arc_cpu_rel[] = { +static const struct id_to_str arc_legacy_rel[] = { + /* ID.ARCVER, Release */ #ifdef CONFIG_ISA_ARCOMPACT - { 0x34, "R4.10"}, - { 0x35, "R4.11"}, + { 0x34, "R4.10"}, + { 0x35, "R4.11"}, #else - { 0x51, "R2.0" }, - { 0x52, "R2.1" }, - { 0x53, "R3.0" }, - { 0x54, "R3.10a" }, + { 0x51, "R2.0" }, + { 0x52, "R2.1" }, + { 0x53, "R3.0" }, #endif - { 0x00, NULL } + { 0x00, NULL } }; -static const struct id_to_str arc_cpu_nm[] = { -#ifdef CONFIG_ISA_ARCOMPACT - { 0x20, "ARC 600" }, - { 0x30, "ARC 770" }, /* 750 identified seperately */ -#else - { 0x40, "ARC EM" }, - { 0x50, "ARC HS38" }, - { 0x54, "ARC HS48" }, -#endif - { 0x00, "Unknown" } +static const struct id_to_str arc_cpu_rel[] = { + /* UARCH.MAJOR, Release */ + { 0, "R3.10a"}, + { 1, "R3.50a"}, + { 0xFF, NULL } }; static void read_decode_ccm_bcr(struct cpuinfo_arc *cpu) @@ -117,31 +112,72 @@ static void read_decode_ccm_bcr(struct cpuinfo_arc *cpu) } } +static void decode_arc_core(struct cpuinfo_arc *cpu) +{ + struct bcr_uarch_build_arcv2 uarch; + const struct id_to_str *tbl; + + /* + * Up until (including) the first core4 release (0x54) things were + * simple: AUX IDENTITY.ARCVER was sufficient to identify arc family + * and release: 0x50 to 0x53 was HS38, 0x54 was HS48 (dual issue) + */ + + if (cpu->core.family < 0x54) { /* includes arc700 */ + + for (tbl = &arc_legacy_rel[0]; tbl->id != 0; tbl++) { + if (cpu->core.family == tbl->id) { + cpu->release = tbl->str; + break; + } + } + + if (is_isa_arcompact()) + cpu->name = "ARC700"; + else if (tbl->str) + cpu->name = "HS38"; + else + cpu->name = cpu->release = "Unknown"; + + return; + } + + /* + * However the subsequent HS release (same 0x54) allow HS38 or HS48 + * configurations and encode this info in a different BCR. + * The BCR was introduced in 0x54 so can't be read unconditionally. 
+ */ + + READ_BCR(ARC_REG_MICRO_ARCH_BCR, uarch); + + if (uarch.prod == 4) { + cpu->name = "HS48"; + cpu->extn.dual = 1; + + } else { + cpu->name = "HS38"; + } + + for (tbl = &arc_cpu_rel[0]; tbl->id != 0xFF; tbl++) { + if (uarch.maj == tbl->id) { + cpu->release = tbl->str; + break; + } + } +} + static void read_arc_build_cfg_regs(void) { struct bcr_timer timer; struct bcr_generic bcr; struct cpuinfo_arc *cpu = &cpuinfo_arc700[smp_processor_id()]; - const struct id_to_str *tbl; struct bcr_isa_arcv2 isa; struct bcr_actionpoint ap; FIX_PTR(cpu); READ_BCR(AUX_IDENTITY, cpu->core); - - for (tbl = &arc_cpu_rel[0]; tbl->id != 0; tbl++) { - if (cpu->core.family == tbl->id) { - cpu->details = tbl->str; - break; - } - } - - for (tbl = &arc_cpu_nm[0]; tbl->id != 0; tbl++) { - if ((cpu->core.family & 0xF4) == tbl->id) - break; - } - cpu->name = tbl->str; + decode_arc_core(cpu); READ_BCR(ARC_REG_TIMERS_BCR, timer); cpu->extn.timer0 = timer.t0; @@ -199,30 +235,12 @@ static void read_arc_build_cfg_regs(void) cpu->bpu.num_pred = 2048 << bpu.pte; cpu->bpu.ret_stk = 4 << bpu.rse; - if (cpu->core.family >= 0x54) { + /* if dual issue hardware, is it enabled ? */ + if (cpu->extn.dual) { + unsigned int exec_ctrl; - struct bcr_uarch_build_arcv2 uarch; - - /* - * The first 0x54 core (uarch maj:min 0:1 or 0:2) was - * dual issue only (HS4x). But next uarch rev (1:0) - * allows it be configured for single issue (HS3x) - * Ensure we fiddle with dual issue only on HS4x - */ - READ_BCR(ARC_REG_MICRO_ARCH_BCR, uarch); - - if (uarch.prod == 4) { - unsigned int exec_ctrl; - - /* dual issue hardware always present */ - cpu->extn.dual = 1; - - READ_BCR(AUX_EXEC_CTRL, exec_ctrl); - - /* dual issue hardware enabled ? */ - cpu->extn.dual_enb = !(exec_ctrl & 1); - - } + READ_BCR(AUX_EXEC_CTRL, exec_ctrl); + cpu->extn.dual_enb = !(exec_ctrl & 1); } } @@ -273,7 +291,7 @@ static char *arc_cpu_mumbojumbo(int cpu_id, char *buf, int len) core->family, core->cpu_id, core->chip_id); n += scnprintf(buf + n, len - n, "processor [%d]\t: %s %s (%s ISA) %s%s%s\n", - cpu_id, cpu->name, cpu->details, + cpu_id, cpu->name, cpu->release, is_isa_arcompact() ? "ARCompact" : "ARCv2", IS_AVAIL1(cpu->isa.be, "[Big-Endian]"), IS_AVAIL3(cpu->extn.dual, cpu->extn.dual_enb, " Dual-Issue ")); From 85d6adcbbe6dfc557755543c6c39b497d3032cdc Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Mon, 25 Feb 2019 11:56:28 -0800 Subject: [PATCH 245/855] ARC: boot log: cut down on verbosity The syscall ABI has long been fixed, so no need to call that out now. Also, there's no need to print really fine details such as norm, barrel-shifter etc. Those are given in a Linux enabled hardware config. So now we print just 1 line for all optional "instruction" related hardware features | | ISA Extn : atomic ll64 unalign mpy[opt 9] div_rem vs. 
2 before | |ISA Extn : atomic ll64 unalign | : mpy[opt 9] div_rem norm barrel-shift swap minmax swape Signed-off-by: Vineet Gupta --- arch/arc/include/asm/arcregs.h | 9 ---- arch/arc/kernel/setup.c | 84 +++++++++++++--------------------- 2 files changed, 32 insertions(+), 61 deletions(-) diff --git a/arch/arc/include/asm/arcregs.h b/arch/arc/include/asm/arcregs.h index 36e5d124f5ae..a7d4be87b2f0 100644 --- a/arch/arc/include/asm/arcregs.h +++ b/arch/arc/include/asm/arcregs.h @@ -168,14 +168,6 @@ struct bcr_mpy { #endif }; -struct bcr_extn_xymem { -#ifdef CONFIG_CPU_BIG_ENDIAN - unsigned int ram_org:2, num_banks:4, bank_sz:4, ver:8; -#else - unsigned int ver:8, bank_sz:4, num_banks:4, ram_org:2; -#endif -}; - struct bcr_iccm_arcompact { #ifdef CONFIG_CPU_BIG_ENDIAN unsigned int base:16, pad:5, sz:3, ver:8; @@ -323,7 +315,6 @@ struct cpuinfo_arc { timer0:1, timer1:1, rtc:1, gfrc:1, pad4:4; } extn; struct bcr_mpy extn_mpy; - struct bcr_extn_xymem extn_xymem; }; extern struct cpuinfo_arc cpuinfo_arc700[]; diff --git a/arch/arc/kernel/setup.c b/arch/arc/kernel/setup.c index 6bd466892915..15da43d2e416 100644 --- a/arch/arc/kernel/setup.c +++ b/arch/arc/kernel/setup.c @@ -188,16 +188,6 @@ static void read_arc_build_cfg_regs(void) READ_BCR(ARC_REG_MUL_BCR, cpu->extn_mpy); - cpu->extn.norm = read_aux_reg(ARC_REG_NORM_BCR) > 1 ? 1 : 0; /* 2,3 */ - cpu->extn.barrel = read_aux_reg(ARC_REG_BARREL_BCR) > 1 ? 1 : 0; /* 2,3 */ - cpu->extn.swap = read_aux_reg(ARC_REG_SWAP_BCR) ? 1 : 0; /* 1,3 */ - cpu->extn.crc = read_aux_reg(ARC_REG_CRC_BCR) ? 1 : 0; - cpu->extn.minmax = read_aux_reg(ARC_REG_MIXMAX_BCR) > 1 ? 1 : 0; /* 2 */ - cpu->extn.swape = (cpu->core.family >= 0x34) ? 1 : - IS_ENABLED(CONFIG_ARC_HAS_SWAPE); - - READ_BCR(ARC_REG_XY_MEM_BCR, cpu->extn_xymem); - /* Read CCM BCRs for boot reporting even if not enabled in Kconfig */ read_decode_ccm_bcr(cpu); @@ -282,6 +272,7 @@ static char *arc_cpu_mumbojumbo(int cpu_id, char *buf, int len) { struct cpuinfo_arc *cpu = &cpuinfo_arc700[cpu_id]; struct bcr_identity *core = &cpu->core; + char mpy_opt[16]; int n = 0; FIX_PTR(cpu); @@ -302,10 +293,27 @@ static char *arc_cpu_mumbojumbo(int cpu_id, char *buf, int len) IS_AVAIL2(cpu->extn.rtc, "RTC [UP 64-bit] ", CONFIG_ARC_TIMERS_64BIT), IS_AVAIL2(cpu->extn.gfrc, "GFRC [SMP 64-bit] ", CONFIG_ARC_TIMERS_64BIT)); - n += scnprintf(buf + n, len - n, "%s%s%s%s%s%s", + if (cpu->extn_mpy.ver) { + if (is_isa_arcompact()) { + scnprintf(mpy_opt, 16, "mpy"); + } else { + + int opt = 2; /* stock MPY/MPYH */ + + if (cpu->extn_mpy.dsp) /* OPT 7-9 */ + opt = cpu->extn_mpy.dsp + 6; + + scnprintf(mpy_opt, 16, "mpy[opt %d] ", opt); + } + } + + n += scnprintf(buf + n, len - n, "%s%s%s%s%s%s%s%s\n", IS_AVAIL2(cpu->isa.atomic, "atomic ", CONFIG_ARC_HAS_LLSC), IS_AVAIL2(cpu->isa.ldd, "ll64 ", CONFIG_ARC_HAS_LL64), - IS_AVAIL2(cpu->isa.unalign, "unalign ", CONFIG_ARC_USE_UNALIGNED_MEM_ACCESS)); + IS_AVAIL2(cpu->isa.unalign, "unalign ", CONFIG_ARC_USE_UNALIGNED_MEM_ACCESS), + IS_AVAIL1(cpu->extn_mpy.ver, mpy_opt), + IS_AVAIL1(cpu->isa.div_rem, "div_rem ")); + #if defined(__ARC_UNALIGNED__) && !defined(CONFIG_ARC_USE_UNALIGNED_MEM_ACCESS) /* @@ -318,52 +326,29 @@ static char *arc_cpu_mumbojumbo(int cpu_id, char *buf, int len) BUILD_BUG_ON_MSG(1, "gcc doesn't support -mno-unaligned-access"); #endif - n += scnprintf(buf + n, len - n, "\n\t\t: "); - - if (cpu->extn_mpy.ver) { - if (cpu->extn_mpy.ver <= 0x2) { /* ARCompact */ - n += scnprintf(buf + n, len - n, "mpy "); - } else { - int opt = 2; /* stock MPY/MPYH */ - - if (cpu->extn_mpy.dsp) 
/* OPT 7-9 */ - opt = cpu->extn_mpy.dsp + 6; - - n += scnprintf(buf + n, len - n, "mpy[opt %d] ", opt); - } - } - - n += scnprintf(buf + n, len - n, "%s%s%s%s%s%s%s%s\n", - IS_AVAIL1(cpu->isa.div_rem, "div_rem "), - IS_AVAIL1(cpu->extn.norm, "norm "), - IS_AVAIL1(cpu->extn.barrel, "barrel-shift "), - IS_AVAIL1(cpu->extn.swap, "swap "), - IS_AVAIL1(cpu->extn.minmax, "minmax "), - IS_AVAIL1(cpu->extn.crc, "crc "), - IS_AVAIL2(cpu->extn.swape, "swape", CONFIG_ARC_HAS_SWAPE)); - - if (cpu->bpu.ver) + if (cpu->bpu.ver) { n += scnprintf(buf + n, len - n, "BPU\t\t: %s%s match, cache:%d, Predict Table:%d Return stk: %d", IS_AVAIL1(cpu->bpu.full, "full"), IS_AVAIL1(!cpu->bpu.full, "partial"), cpu->bpu.num_cache, cpu->bpu.num_pred, cpu->bpu.ret_stk); - if (is_isa_arcv2()) { - struct bcr_lpb lpb; + if (is_isa_arcv2()) { + struct bcr_lpb lpb; - READ_BCR(ARC_REG_LPB_BUILD, lpb); - if (lpb.ver) { - unsigned int ctl; - ctl = read_aux_reg(ARC_REG_LPB_CTRL); + READ_BCR(ARC_REG_LPB_BUILD, lpb); + if (lpb.ver) { + unsigned int ctl; + ctl = read_aux_reg(ARC_REG_LPB_CTRL); - n += scnprintf(buf + n, len - n, " Loop Buffer:%d %s", - lpb.entries, - IS_DISABLED_RUN(!ctl)); + n += scnprintf(buf + n, len - n, " Loop Buffer:%d %s", + lpb.entries, + IS_DISABLED_RUN(!ctl)); + } } + n += scnprintf(buf + n, len - n, "\n"); } - n += scnprintf(buf + n, len - n, "\n"); return buf; } @@ -416,11 +401,6 @@ static char *arc_extn_mumbojumbo(int cpu_id, char *buf, int len) } } - n += scnprintf(buf + n, len - n, "OS ABI [v%d]\t: %s\n", - EF_ARC_OSABI_CURRENT >> 8, - EF_ARC_OSABI_CURRENT == EF_ARC_OSABI_V3 ? - "no-legacy-syscalls" : "64-bit data any register aligned"); - return buf; } From e85fa28ebcb598bbb439402609fdde5d0f80622d Mon Sep 17 00:00:00 2001 From: Mike Leach Date: Wed, 13 Feb 2019 14:41:49 +0100 Subject: [PATCH 246/855] ARM: 8838/1: drivers: amba: Updates to component identification for driver matching. The CoreSight specification (ARM IHI 0029E) updates the ID register requirements for components on an AMBA bus, to cover both traditional ARM Primecell type devices, and newer CoreSight and other components. The Peripheral ID (PID) / Component ID (CID) pair is extended in certain cases to uniquely identify components. CoreSight components related to a single function can share Peripheral ID values, and must be further identified using a Unique Component Identifier (UCI). e.g. the ETM, CTI, PMU and Debug hardware of the A35 all share the same PID. Bits 15:12 of the CID are defined to be the device class. Class 0xF remains for PrimeCell and legacy components. Class 0x9 defines the component as CoreSight (CORESIGHT_CID above) Class 0x0, 0x1, 0xB, 0xE define components that do not have driver support at present. Class 0x2-0x8,0xA and 0xD-0xD are presently reserved. The specification further defines which classes of device use the standard CID/PID pair, and when additional ID registers are required. This patch introduces the amba_cs_uci_id structure which will be used in all coresight drivers for identification via the private data pointer in the amba_id structure. Existing drivers that currently use the amba_id->data pointer for private data are updated to use the amba_cs_uci_id->data pointer. Macros and inline functions are added to simplify this code.
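As a rough usage sketch only (not taken from this patch): a driver ID table built with the helpers above, plus a probe routine reading the driver data back, could look like the following. The PID values, the "Example IP" string and the example_* names are invented; only CS_AMBA_ID(), CS_AMBA_ID_DATA() and coresight_get_uci_data() come from this patch.

#include <linux/amba/bus.h>
#include "coresight-priv.h"

static const struct amba_id example_ids[] = {
	/* fictitious PID carrying a descriptive string as driver data */
	CS_AMBA_ID_DATA(0x000bb900, "Example IP 1.0"),
	/* fictitious PID matched on PID/CID alone, no driver data */
	CS_AMBA_ID(0x000bb901),
	{ 0, 0 },
};

static int example_probe(struct amba_device *adev, const struct amba_id *id)
{
	/* NULL when the matching entry was declared with CS_AMBA_ID() */
	const char *desc = coresight_get_uci_data(id);

	if (desc)
		dev_info(&adev->dev, "%s initialized\n", desc);
	return 0;
}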
Signed-off-by: Mike Leach Reviewed-by: Mathieu Poirier Reviewed-by: Suzuki K Poulose Tested-by: Sai Prakash Ranjan Signed-off-by: Russell King --- drivers/hwtracing/coresight/coresight-etm3x.c | 44 ++++++------------- drivers/hwtracing/coresight/coresight-priv.h | 32 ++++++++++++++ drivers/hwtracing/coresight/coresight-stm.c | 14 ++---- drivers/hwtracing/coresight/coresight-tmc.c | 30 ++++--------- include/linux/amba/bus.h | 33 ++++++++++++++ 5 files changed, 90 insertions(+), 63 deletions(-) diff --git a/drivers/hwtracing/coresight/coresight-etm3x.c b/drivers/hwtracing/coresight/coresight-etm3x.c index 9a63e87ea5f3..be302ec5f66b 100644 --- a/drivers/hwtracing/coresight/coresight-etm3x.c +++ b/drivers/hwtracing/coresight/coresight-etm3x.c @@ -871,7 +871,7 @@ static int etm_probe(struct amba_device *adev, const struct amba_id *id) } pm_runtime_put(&adev->dev); - dev_info(dev, "%s initialized\n", (char *)id->data); + dev_info(dev, "%s initialized\n", (char *)coresight_get_uci_data(id)); if (boot_enable) { coresight_enable(drvdata->csdev); drvdata->boot_enable = true; @@ -915,36 +915,18 @@ static const struct dev_pm_ops etm_dev_pm_ops = { }; static const struct amba_id etm_ids[] = { - { /* ETM 3.3 */ - .id = 0x000bb921, - .mask = 0x000fffff, - .data = "ETM 3.3", - }, - { /* ETM 3.5 - Cortex-A5 */ - .id = 0x000bb955, - .mask = 0x000fffff, - .data = "ETM 3.5", - }, - { /* ETM 3.5 */ - .id = 0x000bb956, - .mask = 0x000fffff, - .data = "ETM 3.5", - }, - { /* PTM 1.0 */ - .id = 0x000bb950, - .mask = 0x000fffff, - .data = "PTM 1.0", - }, - { /* PTM 1.1 */ - .id = 0x000bb95f, - .mask = 0x000fffff, - .data = "PTM 1.1", - }, - { /* PTM 1.1 Qualcomm */ - .id = 0x000b006f, - .mask = 0x000fffff, - .data = "PTM 1.1", - }, + /* ETM 3.3 */ + CS_AMBA_ID_DATA(0x000bb921, "ETM 3.3"), + /* ETM 3.5 - Cortex-A5 */ + CS_AMBA_ID_DATA(0x000bb955, "ETM 3.5"), + /* ETM 3.5 */ + CS_AMBA_ID_DATA(0x000bb956, "ETM 3.5"), + /* PTM 1.0 */ + CS_AMBA_ID_DATA(0x000bb950, "PTM 1.0"), + /* PTM 1.1 */ + CS_AMBA_ID_DATA(0x000bb95f, "PTM 1.1"), + /* PTM 1.1 Qualcomm */ + CS_AMBA_ID_DATA(0x000b006f, "PTM 1.1"), { 0, 0}, }; diff --git a/drivers/hwtracing/coresight/coresight-priv.h b/drivers/hwtracing/coresight/coresight-priv.h index 579f34943bf1..02a1f5204f9d 100644 --- a/drivers/hwtracing/coresight/coresight-priv.h +++ b/drivers/hwtracing/coresight/coresight-priv.h @@ -6,6 +6,7 @@ #ifndef _CORESIGHT_PRIV_H #define _CORESIGHT_PRIV_H +#include #include #include #include @@ -159,4 +160,35 @@ static inline int etm_readl_cp14(u32 off, unsigned int *val) { return 0; } static inline int etm_writel_cp14(u32 off, u32 val) { return 0; } #endif +/* + * Macros and inline functions to handle CoreSight UCI data and driver + * private data in AMBA ID table entries, and extract data values. + */ + +/* coresight AMBA ID, no UCI, no driver data: id table entry */ +#define CS_AMBA_ID(pid) \ + { \ + .id = pid, \ + .mask = 0x000fffff, \ + } + +/* coresight AMBA ID, UCI with driver data only: id table entry. */ +#define CS_AMBA_ID_DATA(pid, dval) \ + { \ + .id = pid, \ + .mask = 0x000fffff, \ + .data = (void *)&(struct amba_cs_uci_id) \ + { \ + .data = (void *)dval, \ + } \ + } + +/* extract the data value from a UCI structure given amba_id pointer. 
*/ +static inline void *coresight_get_uci_data(const struct amba_id *id) +{ + if (id->data) + return ((struct amba_cs_uci_id *)(id->data))->data; + return 0; +} + #endif diff --git a/drivers/hwtracing/coresight/coresight-stm.c b/drivers/hwtracing/coresight/coresight-stm.c index ef339ff22090..2a70cdd68a7b 100644 --- a/drivers/hwtracing/coresight/coresight-stm.c +++ b/drivers/hwtracing/coresight/coresight-stm.c @@ -874,7 +874,7 @@ static int stm_probe(struct amba_device *adev, const struct amba_id *id) pm_runtime_put(&adev->dev); - dev_info(dev, "%s initialized\n", (char *)id->data); + dev_info(dev, "%s initialized\n", (char *)coresight_get_uci_data(id)); return 0; stm_unregister: @@ -909,16 +909,8 @@ static const struct dev_pm_ops stm_dev_pm_ops = { }; static const struct amba_id stm_ids[] = { - { - .id = 0x000bb962, - .mask = 0x000fffff, - .data = "STM32", - }, - { - .id = 0x000bb963, - .mask = 0x000fffff, - .data = "STM500", - }, + CS_AMBA_ID_DATA(0x000bb962, "STM32"), + CS_AMBA_ID_DATA(0x000bb963, "STM500"), { 0, 0}, }; diff --git a/drivers/hwtracing/coresight/coresight-tmc.c b/drivers/hwtracing/coresight/coresight-tmc.c index ea249f0bcd73..2a02da3d630f 100644 --- a/drivers/hwtracing/coresight/coresight-tmc.c +++ b/drivers/hwtracing/coresight/coresight-tmc.c @@ -443,7 +443,8 @@ static int tmc_probe(struct amba_device *adev, const struct amba_id *id) desc.type = CORESIGHT_DEV_TYPE_SINK; desc.subtype.sink_subtype = CORESIGHT_DEV_SUBTYPE_SINK_BUFFER; desc.ops = &tmc_etr_cs_ops; - ret = tmc_etr_setup_caps(drvdata, devid, id->data); + ret = tmc_etr_setup_caps(drvdata, devid, + coresight_get_uci_data(id)); if (ret) goto out; break; @@ -475,26 +476,13 @@ out: } static const struct amba_id tmc_ids[] = { - { - .id = 0x000bb961, - .mask = 0x000fffff, - }, - { - /* Coresight SoC 600 TMC-ETR/ETS */ - .id = 0x000bb9e8, - .mask = 0x000fffff, - .data = (void *)(unsigned long)CORESIGHT_SOC_600_ETR_CAPS, - }, - { - /* Coresight SoC 600 TMC-ETB */ - .id = 0x000bb9e9, - .mask = 0x000fffff, - }, - { - /* Coresight SoC 600 TMC-ETF */ - .id = 0x000bb9ea, - .mask = 0x000fffff, - }, + CS_AMBA_ID(0x000bb961), + /* Coresight SoC 600 TMC-ETR/ETS */ + CS_AMBA_ID_DATA(0x000bb9e8, (unsigned long)CORESIGHT_SOC_600_ETR_CAPS), + /* Coresight SoC 600 TMC-ETB */ + CS_AMBA_ID(0x000bb9e9), + /* Coresight SoC 600 TMC-ETF */ + CS_AMBA_ID(0x000bb9ea), { 0, 0}, }; diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h index d143c13bed26..e3c36223e40b 100644 --- a/include/linux/amba/bus.h +++ b/include/linux/amba/bus.h @@ -25,6 +25,39 @@ #define AMBA_CID 0xb105f00d #define CORESIGHT_CID 0xb105900d +/* + * CoreSight Architecture specification updates the ID specification + * for components on the AMBA bus. (ARM IHI 0029E) + * + * Bits 15:12 of the CID are the device class. + * + * Class 0xF remains for PrimeCell and legacy components. (AMBA_CID above) + * Class 0x9 defines the component as CoreSight (CORESIGHT_CID above) + * Class 0x0, 0x1, 0xB, 0xE define components that do not have driver support + * at present. + * Class 0x2-0x8,0xA and 0xD-0xD are presently reserved. + * + * Remaining CID bits stay as 0xb105-00d + */ + +/** + * Class 0x9 components use additional values to form a Unique Component + * Identifier (UCI), where peripheral ID values are identical for different + * components. Passed to the amba bus code from the component driver via + * the amba_id->data pointer. + * @devarch : coresight devarch register value + * @devarch_mask: mask bits used for matching. 0 indicates UCI not used. 
+ * @devtype : coresight device type value + * @data : additional driver data. As we have usurped the original + * pointer some devices may still need additional data + */ +struct amba_cs_uci_id { + unsigned int devarch; + unsigned int devarch_mask; + unsigned int devtype; + void *data; +}; + struct clk; struct amba_device { From 4a2910fa80d75dbe18d822482a48ae50a218029c Mon Sep 17 00:00:00 2001 From: Mike Leach Date: Wed, 13 Feb 2019 14:41:50 +0100 Subject: [PATCH 247/855] ARM: 8836/1: drivers: amba: Update component matching to use the CoreSight UCI values. The patches provide an update of amba_device and matching code to handle the additional registers required for the Class 0x9 (CoreSight) UCI. The *data pointer in the amba_id is used by the driver to provide extended ID register values for matching. CoreSight components where PID/CID pair is currently sufficient for unique identification need not provide this additional information. Signed-off-by: Mike Leach Reviewed-by: Mathieu Poirier Reviewed-by: Suzuki K Poulose Tested-by: Sai Prakash Ranjan Signed-off-by: Russell King --- drivers/amba/bus.c | 45 +++++++++++++++++++++++++++++++++------- include/linux/amba/bus.h | 6 ++++++ 2 files changed, 43 insertions(+), 8 deletions(-) diff --git a/drivers/amba/bus.c b/drivers/amba/bus.c index 41b706403ef7..b4dae624b9af 100644 --- a/drivers/amba/bus.c +++ b/drivers/amba/bus.c @@ -26,19 +26,36 @@ #define to_amba_driver(d) container_of(d, struct amba_driver, drv) +/* called on periphid match and class 0x9 coresight device. */ +static int +amba_cs_uci_id_match(const struct amba_id *table, struct amba_device *dev) +{ + int ret = 0; + struct amba_cs_uci_id *uci; + + uci = table->data; + + /* no table data or zero mask - return match on periphid */ + if (!uci || (uci->devarch_mask == 0)) + return 1; + + /* test against read devtype and masked devarch value */ + ret = (dev->uci.devtype == uci->devtype) && + ((dev->uci.devarch & uci->devarch_mask) == uci->devarch); + return ret; +} + static const struct amba_id * amba_lookup(const struct amba_id *table, struct amba_device *dev) { - int ret = 0; - while (table->mask) { - ret = (dev->periphid & table->mask) == table->id; - if (ret) - break; + if (((dev->periphid & table->mask) == table->id) && + ((dev->cid != CORESIGHT_CID) || + (amba_cs_uci_id_match(table, dev)))) + return table; table++; } - - return ret ? 
table : NULL; + return NULL; } static int amba_match(struct device *dev, struct device_driver *drv) @@ -399,10 +416,22 @@ static int amba_device_try_add(struct amba_device *dev, struct resource *parent) cid |= (readl(tmp + size - 0x10 + 4 * i) & 255) << (i * 8); + if (cid == CORESIGHT_CID) { + /* set the base to the start of the last 4k block */ + void __iomem *csbase = tmp + size - 4096; + + dev->uci.devarch = + readl(csbase + UCI_REG_DEVARCH_OFFSET); + dev->uci.devtype = + readl(csbase + UCI_REG_DEVTYPE_OFFSET) & 0xff; + } + amba_put_disable_pclk(dev); - if (cid == AMBA_CID || cid == CORESIGHT_CID) + if (cid == AMBA_CID || cid == CORESIGHT_CID) { dev->periphid = pid; + dev->cid = cid; + } if (!dev->periphid) ret = -ENODEV; diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h index e3c36223e40b..f99b74a6e4ca 100644 --- a/include/linux/amba/bus.h +++ b/include/linux/amba/bus.h @@ -58,6 +58,10 @@ struct amba_cs_uci_id { void *data; }; +/* define offsets for registers used by UCI */ +#define UCI_REG_DEVTYPE_OFFSET 0xFCC +#define UCI_REG_DEVARCH_OFFSET 0xFBC + struct clk; struct amba_device { @@ -65,6 +69,8 @@ struct amba_device { struct resource res; struct clk *pclk; unsigned int periphid; + unsigned int cid; + struct amba_cs_uci_id uci; unsigned int irq[AMBA_NR_IRQS]; char *driver_override; }; From 28941701a49a1d6c4fe0e9d294b0af673080baca Mon Sep 17 00:00:00 2001 From: Mike Leach Date: Wed, 13 Feb 2019 14:41:51 +0100 Subject: [PATCH 248/855] ARM: 8837/1: coresight: etmv4: Update ID register table to add UCI support Adds macro to enable UCI entries to be added to AMBA ID tables. Updates the ID register tables to contain a UCI entry for the A35 ETM device to allow correct matching of driver in the amba bus code. Signed-off-by: Mike Leach Reviewed-by: Mathieu Poirier Reviewed-by: Suzuki K Poulose Tested-by: Sai Prakash Ranjan Signed-off-by: Russell King --- drivers/hwtracing/coresight/coresight-etm4x.c | 21 +++++++++++-------- drivers/hwtracing/coresight/coresight-priv.h | 8 +++++++ 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/drivers/hwtracing/coresight/coresight-etm4x.c b/drivers/hwtracing/coresight/coresight-etm4x.c index 53e2fb6e86f6..dd9b9b5ebb84 100644 --- a/drivers/hwtracing/coresight/coresight-etm4x.c +++ b/drivers/hwtracing/coresight/coresight-etm4x.c @@ -1067,18 +1067,21 @@ err_arch_supported: return ret; } -#define ETM4x_AMBA_ID(pid) \ - { \ - .id = pid, \ - .mask = 0x000fffff, \ +static struct amba_cs_uci_id uci_id_etm4[] = { + { + /* ETMv4 UCI data */ + .devarch = 0x47704a13, + .devarch_mask = 0xfff0ffff, + .devtype = 0x00000013, } +}; static const struct amba_id etm4_ids[] = { - ETM4x_AMBA_ID(0x000bb95d), /* Cortex-A53 */ - ETM4x_AMBA_ID(0x000bb95e), /* Cortex-A57 */ - ETM4x_AMBA_ID(0x000bb95a), /* Cortex-A72 */ - ETM4x_AMBA_ID(0x000bb959), /* Cortex-A73 */ - ETM4x_AMBA_ID(0x000bb9da), /* Cortex-A35 */ + CS_AMBA_ID(0x000bb95d), /* Cortex-A53 */ + CS_AMBA_ID(0x000bb95e), /* Cortex-A57 */ + CS_AMBA_ID(0x000bb95a), /* Cortex-A72 */ + CS_AMBA_ID(0x000bb959), /* Cortex-A73 */ + CS_AMBA_UCI_ID(0x000bb9da, uci_id_etm4), /* Cortex-A35 */ {}, }; diff --git a/drivers/hwtracing/coresight/coresight-priv.h b/drivers/hwtracing/coresight/coresight-priv.h index 02a1f5204f9d..fd69ad24432a 100644 --- a/drivers/hwtracing/coresight/coresight-priv.h +++ b/drivers/hwtracing/coresight/coresight-priv.h @@ -183,6 +183,14 @@ static inline int etm_writel_cp14(u32 off, u32 val) { return 0; } } \ } +/* coresight AMBA ID, full UCI structure: id table entry. 
*/ +#define CS_AMBA_UCI_ID(pid, uci_ptr) \ + { \ + .id = pid, \ + .mask = 0x000fffff, \ + .data = uci_ptr \ + } + /* extract the data value from a UCI structure given amba_id pointer. */ static inline void *coresight_get_uci_data(const struct amba_id *id) { From 143c2a89e0e5fda6c6fd08d7bc1126438c19ae90 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Wed, 13 Feb 2019 17:14:23 +0100 Subject: [PATCH 249/855] ARM: 8839/1: kprobe: make patch_lock a raw_spinlock_t When running kprobe on -rt kernel, the below bug is caught: |BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:931 |in_atomic(): 1, irqs_disabled(): 128, pid: 14, name: migration/0 |Preemption disabled at:[<802f2b98>] cpu_stopper_thread+0xc0/0x140 |CPU: 0 PID: 14 Comm: migration/0 Tainted: G O 4.8.3-rt2 #1 |Hardware name: Freescale LS1021A |[<8025a43c>] (___might_sleep) |[<80b5b324>] (rt_spin_lock) |[<80b5c31c>] (__patch_text_real) |[<80b5c3ac>] (patch_text_stop_machine) |[<802f2920>] (multi_cpu_stop) Since patch_text_stop_machine() is called in stop_machine() which disables IRQ, a sleepable lock should not be used in this atomic context, so replace patch_lock with a raw lock. Signed-off-by: Yang Shi Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Arnd Bergmann Signed-off-by: Russell King --- arch/arm/kernel/patch.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm/kernel/patch.c b/arch/arm/kernel/patch.c index a50dc00d79a2..d0a05a3bdb96 100644 --- a/arch/arm/kernel/patch.c +++ b/arch/arm/kernel/patch.c @@ -16,7 +16,7 @@ struct patch { unsigned int insn; }; -static DEFINE_SPINLOCK(patch_lock); +static DEFINE_RAW_SPINLOCK(patch_lock); static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags) __acquires(&patch_lock) @@ -33,7 +33,7 @@ static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags) return addr; if (flags) - spin_lock_irqsave(&patch_lock, *flags); + raw_spin_lock_irqsave(&patch_lock, *flags); else __acquire(&patch_lock); @@ -48,7 +48,7 @@ static void __kprobes patch_unmap(int fixmap, unsigned long *flags) clear_fixmap(fixmap); if (flags) - spin_unlock_irqrestore(&patch_lock, *flags); + raw_spin_unlock_irqrestore(&patch_lock, *flags); else __release(&patch_lock); } From 74ffe79ae538283bbf7c155e62339f1e5c87b55a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 13 Feb 2019 17:14:42 +0100 Subject: [PATCH 250/855] ARM: 8840/1: use a raw_spinlock_t in unwind Mostly, unwind is done with irqs enabled; however, SLUB may call it with irqs disabled while creating a new SLUB cache. I had a system freeze while loading a module which called kmem_cache_create() on init.
That means SLUB's __slab_alloc() disabled interrupts and then ->new_slab_objects() ->new_slab() ->setup_object() ->setup_object_debug() ->init_tracking() ->set_track() ->save_stack_trace() ->save_stack_trace_tsk() ->walk_stackframe() ->unwind_frame() ->unwind_find_idx() =>spin_lock_irqsave(&unwind_lock); Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Russell King --- arch/arm/kernel/unwind.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c index 0bee233fef9a..314cfb232a63 100644 --- a/arch/arm/kernel/unwind.c +++ b/arch/arm/kernel/unwind.c @@ -93,7 +93,7 @@ extern const struct unwind_idx __start_unwind_idx[]; static const struct unwind_idx *__origin_unwind_idx; extern const struct unwind_idx __stop_unwind_idx[]; -static DEFINE_SPINLOCK(unwind_lock); +static DEFINE_RAW_SPINLOCK(unwind_lock); static LIST_HEAD(unwind_tables); /* Convert a prel31 symbol to an absolute address */ @@ -201,7 +201,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr) /* module unwind tables */ struct unwind_table *table; - spin_lock_irqsave(&unwind_lock, flags); + raw_spin_lock_irqsave(&unwind_lock, flags); list_for_each_entry(table, &unwind_tables, list) { if (addr >= table->begin_addr && addr < table->end_addr) { @@ -213,7 +213,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr) break; } } - spin_unlock_irqrestore(&unwind_lock, flags); + raw_spin_unlock_irqrestore(&unwind_lock, flags); } pr_debug("%s: idx = %p\n", __func__, idx); @@ -529,9 +529,9 @@ struct unwind_table *unwind_table_add(unsigned long start, unsigned long size, tab->begin_addr = text_addr; tab->end_addr = text_addr + text_size; - spin_lock_irqsave(&unwind_lock, flags); + raw_spin_lock_irqsave(&unwind_lock, flags); list_add_tail(&tab->list, &unwind_tables); - spin_unlock_irqrestore(&unwind_lock, flags); + raw_spin_unlock_irqrestore(&unwind_lock, flags); return tab; } @@ -543,9 +543,9 @@ void unwind_table_del(struct unwind_table *tab) if (!tab) return; - spin_lock_irqsave(&unwind_lock, flags); + raw_spin_lock_irqsave(&unwind_lock, flags); list_del(&tab->list); - spin_unlock_irqrestore(&unwind_lock, flags); + raw_spin_unlock_irqrestore(&unwind_lock, flags); kfree(tab); } From a216376add730ec86a8bcee5735f62fd890cb2d0 Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Mon, 18 Feb 2019 00:54:36 +0100 Subject: [PATCH 251/855] ARM: 8841/1: use unified assembler in macros Use unified assembler syntax (UAL) in macros. Divided syntax is considered deprecated. This will also allow to build the kernel using LLVM's integrated assembler. 
Signed-off-by: Stefan Agner Acked-by: Nicolas Pitre Signed-off-by: Russell King --- arch/arm/lib/copy_from_user.S | 2 +- arch/arm/lib/copy_to_user.S | 2 +- arch/arm/lib/memcpy.S | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm/lib/copy_from_user.S b/arch/arm/lib/copy_from_user.S index 0d4c189c7f4f..6a3419e2c6d8 100644 --- a/arch/arm/lib/copy_from_user.S +++ b/arch/arm/lib/copy_from_user.S @@ -91,7 +91,7 @@ .endm .macro str1b ptr reg cond=al abort - str\cond\()b \reg, [\ptr], #1 + strb\cond \reg, [\ptr], #1 .endm .macro enter reg1 reg2 diff --git a/arch/arm/lib/copy_to_user.S b/arch/arm/lib/copy_to_user.S index 97a6ff4b7e3c..c7d08096e354 100644 --- a/arch/arm/lib/copy_to_user.S +++ b/arch/arm/lib/copy_to_user.S @@ -49,7 +49,7 @@ .endm .macro ldr1b ptr reg cond=al abort - ldr\cond\()b \reg, [\ptr], #1 + ldrb\cond \reg, [\ptr], #1 .endm #ifdef CONFIG_CPU_USE_DOMAINS diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S index 64111bd4440b..4a6997bb4404 100644 --- a/arch/arm/lib/memcpy.S +++ b/arch/arm/lib/memcpy.S @@ -30,7 +30,7 @@ .endm .macro ldr1b ptr reg cond=al abort - ldr\cond\()b \reg, [\ptr], #1 + ldrb\cond \reg, [\ptr], #1 .endm .macro str1w ptr reg abort @@ -42,7 +42,7 @@ .endm .macro str1b ptr reg cond=al abort - str\cond\()b \reg, [\ptr], #1 + strb\cond \reg, [\ptr], #1 .endm .macro enter reg1 reg2 From c001899a5d6c2d7a0f3b75b2307ddef137fb46a6 Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Mon, 18 Feb 2019 00:56:58 +0100 Subject: [PATCH 252/855] ARM: 8843/1: use unified assembler in headers Use unified assembler syntax (UAL) in headers. Divided syntax is considered deprecated. This will also allow to build the kernel using LLVM's integrated assembler. Signed-off-by: Stefan Agner Acked-by: Nicolas Pitre Signed-off-by: Russell King --- arch/arm/include/asm/assembler.h | 12 ++++++------ arch/arm/include/asm/vfpmacros.h | 8 ++++---- arch/arm/lib/bitops.h | 8 ++++---- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h index 28a48e0d4cca..b59921a560da 100644 --- a/arch/arm/include/asm/assembler.h +++ b/arch/arm/include/asm/assembler.h @@ -376,9 +376,9 @@ THUMB( orr \reg , \reg , #PSR_T_BIT ) .macro usraccoff, instr, reg, ptr, inc, off, cond, abort, t=TUSER() 9999: .if \inc == 1 - \instr\cond\()b\()\t\().w \reg, [\ptr, #\off] + \instr\()b\t\cond\().w \reg, [\ptr, #\off] .elseif \inc == 4 - \instr\cond\()\t\().w \reg, [\ptr, #\off] + \instr\t\cond\().w \reg, [\ptr, #\off] .else .error "Unsupported inc macro argument" .endif @@ -417,9 +417,9 @@ THUMB( orr \reg , \reg , #PSR_T_BIT ) .rept \rept 9999: .if \inc == 1 - \instr\cond\()b\()\t \reg, [\ptr], #\inc + \instr\()b\t\cond \reg, [\ptr], #\inc .elseif \inc == 4 - \instr\cond\()\t \reg, [\ptr], #\inc + \instr\t\cond \reg, [\ptr], #\inc .else .error "Unsupported inc macro argument" .endif @@ -460,7 +460,7 @@ THUMB( orr \reg , \reg , #PSR_T_BIT ) .macro check_uaccess, addr:req, size:req, limit:req, tmp:req, bad:req #ifndef CONFIG_CPU_USE_DOMAINS adds \tmp, \addr, #\size - 1 - sbcccs \tmp, \tmp, \limit + sbcscc \tmp, \tmp, \limit bcs \bad #ifdef CONFIG_CPU_SPECTRE movcs \addr, #0 @@ -474,7 +474,7 @@ THUMB( orr \reg , \reg , #PSR_T_BIT ) sub \tmp, \limit, #1 subs \tmp, \tmp, \addr @ tmp = limit - 1 - addr addhs \tmp, \tmp, #1 @ if (tmp >= 0) { - subhss \tmp, \tmp, \size @ tmp = limit - (addr + size) } + subshs \tmp, \tmp, \size @ tmp = limit - (addr + size) } movlo \addr, #0 @ if (tmp < 0) addr = NULL csdb #endif diff --git 
a/arch/arm/include/asm/vfpmacros.h b/arch/arm/include/asm/vfpmacros.h index ef5dfedacd8d..628c336e8e3b 100644 --- a/arch/arm/include/asm/vfpmacros.h +++ b/arch/arm/include/asm/vfpmacros.h @@ -29,13 +29,13 @@ ldr \tmp, =elf_hwcap @ may not have MVFR regs ldr \tmp, [\tmp, #0] tst \tmp, #HWCAP_VFPD32 - ldcnel p11, cr0, [\base],#32*4 @ FLDMIAD \base!, {d16-d31} + ldclne p11, cr0, [\base],#32*4 @ FLDMIAD \base!, {d16-d31} addeq \base, \base, #32*4 @ step over unused register space #else VFPFMRX \tmp, MVFR0 @ Media and VFP Feature Register 0 and \tmp, \tmp, #MVFR0_A_SIMD_MASK @ A_SIMD field cmp \tmp, #2 @ 32 x 64bit registers? - ldceql p11, cr0, [\base],#32*4 @ FLDMIAD \base!, {d16-d31} + ldcleq p11, cr0, [\base],#32*4 @ FLDMIAD \base!, {d16-d31} addne \base, \base, #32*4 @ step over unused register space #endif #endif @@ -53,13 +53,13 @@ ldr \tmp, =elf_hwcap @ may not have MVFR regs ldr \tmp, [\tmp, #0] tst \tmp, #HWCAP_VFPD32 - stcnel p11, cr0, [\base],#32*4 @ FSTMIAD \base!, {d16-d31} + stclne p11, cr0, [\base],#32*4 @ FSTMIAD \base!, {d16-d31} addeq \base, \base, #32*4 @ step over unused register space #else VFPFMRX \tmp, MVFR0 @ Media and VFP Feature Register 0 and \tmp, \tmp, #MVFR0_A_SIMD_MASK @ A_SIMD field cmp \tmp, #2 @ 32 x 64bit registers? - stceql p11, cr0, [\base],#32*4 @ FSTMIAD \base!, {d16-d31} + stcleq p11, cr0, [\base],#32*4 @ FSTMIAD \base!, {d16-d31} addne \base, \base, #32*4 @ step over unused register space #endif #endif diff --git a/arch/arm/lib/bitops.h b/arch/arm/lib/bitops.h index 93cddab73072..95bd35991288 100644 --- a/arch/arm/lib/bitops.h +++ b/arch/arm/lib/bitops.h @@ -7,7 +7,7 @@ ENTRY( \name ) UNWIND( .fnstart ) ands ip, r1, #3 - strneb r1, [ip] @ assert word-aligned + strbne r1, [ip] @ assert word-aligned mov r2, #1 and r3, r0, #31 @ Get bit offset mov r0, r0, lsr #5 @@ -32,7 +32,7 @@ ENDPROC(\name ) ENTRY( \name ) UNWIND( .fnstart ) ands ip, r1, #3 - strneb r1, [ip] @ assert word-aligned + strbne r1, [ip] @ assert word-aligned mov r2, #1 and r3, r0, #31 @ Get bit offset mov r0, r0, lsr #5 @@ -62,7 +62,7 @@ ENDPROC(\name ) ENTRY( \name ) UNWIND( .fnstart ) ands ip, r1, #3 - strneb r1, [ip] @ assert word-aligned + strbne r1, [ip] @ assert word-aligned and r2, r0, #31 mov r0, r0, lsr #5 mov r3, #1 @@ -89,7 +89,7 @@ ENDPROC(\name ) ENTRY( \name ) UNWIND( .fnstart ) ands ip, r1, #3 - strneb r1, [ip] @ assert word-aligned + strbne r1, [ip] @ assert word-aligned and r3, r0, #31 mov r0, r0, lsr #5 save_and_disable_irqs ip From e44fc38818ed795f4c661d5414c6e0affae0fa63 Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Mon, 18 Feb 2019 00:57:38 +0100 Subject: [PATCH 253/855] ARM: 8844/1: use unified assembler in assembly files Use unified assembler syntax (UAL) in assembly files. Divided syntax is considered deprecated. This will also allow to build the kernel using LLVM's integrated assembler. 
Signed-off-by: Stefan Agner Acked-by: Nicolas Pitre Signed-off-by: Russell King --- arch/arm/boot/bootp/init.S | 2 +- arch/arm/boot/compressed/ll_char_wr.S | 4 +- .../include/asm/hardware/entry-macro-iomd.S | 10 ++--- arch/arm/include/debug/tegra.S | 2 +- arch/arm/kernel/debug.S | 2 +- arch/arm/kernel/entry-armv.S | 12 +++--- arch/arm/kernel/entry-common.S | 2 +- arch/arm/kernel/entry-header.S | 8 ++-- arch/arm/lib/clear_user.S | 2 +- arch/arm/lib/copy_page.S | 4 +- arch/arm/lib/copy_template.S | 4 +- arch/arm/lib/csumpartial.S | 20 ++++----- arch/arm/lib/csumpartialcopygeneric.S | 4 +- arch/arm/lib/csumpartialcopyuser.S | 2 +- arch/arm/lib/div64.S | 4 +- arch/arm/lib/floppydma.S | 10 ++--- arch/arm/lib/io-readsb.S | 20 ++++----- arch/arm/lib/io-readsl.S | 2 +- arch/arm/lib/io-readsw-armv3.S | 6 +-- arch/arm/lib/io-readsw-armv4.S | 12 +++--- arch/arm/lib/io-writesb.S | 20 ++++----- arch/arm/lib/io-writesl.S | 2 +- arch/arm/lib/io-writesw-armv3.S | 2 +- arch/arm/lib/io-writesw-armv4.S | 6 +-- arch/arm/lib/lib1funcs.S | 4 +- arch/arm/lib/memmove.S | 24 +++++------ arch/arm/lib/memset.S | 42 +++++++++---------- .../mach-ks8695/include/mach/entry-macro.S | 2 +- arch/arm/mach-tegra/reset-handler.S | 2 +- arch/arm/mm/cache-v6.S | 8 ++-- arch/arm/mm/proc-v7m.S | 4 +- 31 files changed, 124 insertions(+), 124 deletions(-) diff --git a/arch/arm/boot/bootp/init.S b/arch/arm/boot/bootp/init.S index 78b508075161..142927e5f485 100644 --- a/arch/arm/boot/bootp/init.S +++ b/arch/arm/boot/bootp/init.S @@ -44,7 +44,7 @@ _start: add lr, pc, #-0x8 @ lr = current load addr */ movne r10, #0 @ terminator movne r4, #2 @ Size of this entry (2 words) - stmneia r9, {r4, r5, r10} @ Size, ATAG_CORE, terminator + stmiane r9, {r4, r5, r10} @ Size, ATAG_CORE, terminator /* * find the end of the tag list, and then add an INITRD tag on the end. diff --git a/arch/arm/boot/compressed/ll_char_wr.S b/arch/arm/boot/compressed/ll_char_wr.S index 8517c8606b4a..b1dcdb9f4030 100644 --- a/arch/arm/boot/compressed/ll_char_wr.S +++ b/arch/arm/boot/compressed/ll_char_wr.S @@ -75,7 +75,7 @@ Lrow4bpplp: tst r1, #7 @ avoid using r7 directly after str r7, [r0, -r5]! 
subne r1, r1, #1 - ldrneb r7, [r6, r1] + ldrbne r7, [r6, r1] bne Lrow4bpplp ldmfd sp!, {r4 - r7, pc} @@ -103,7 +103,7 @@ Lrow8bpplp: sub r0, r0, r5 @ avoid ip stmia r0, {r4, ip} subne r1, r1, #1 - ldrneb r7, [r6, r1] + ldrbne r7, [r6, r1] bne Lrow8bpplp ldmfd sp!, {r4 - r7, pc} diff --git a/arch/arm/include/asm/hardware/entry-macro-iomd.S b/arch/arm/include/asm/hardware/entry-macro-iomd.S index 8c215acd9b57..f7692731e514 100644 --- a/arch/arm/include/asm/hardware/entry-macro-iomd.S +++ b/arch/arm/include/asm/hardware/entry-macro-iomd.S @@ -16,25 +16,25 @@ ldr \tmp, =irq_prio_h teq \irqstat, #0 #ifdef IOMD_BASE - ldreqb \irqstat, [\base, #IOMD_DMAREQ] @ get dma + ldrbeq \irqstat, [\base, #IOMD_DMAREQ] @ get dma addeq \tmp, \tmp, #256 @ irq_prio_h table size teqeq \irqstat, #0 bne 2406f #endif - ldreqb \irqstat, [\base, #IOMD_IRQREQA] @ get low priority + ldrbeq \irqstat, [\base, #IOMD_IRQREQA] @ get low priority addeq \tmp, \tmp, #256 @ irq_prio_d table size teqeq \irqstat, #0 #ifdef IOMD_IRQREQC - ldreqb \irqstat, [\base, #IOMD_IRQREQC] + ldrbeq \irqstat, [\base, #IOMD_IRQREQC] addeq \tmp, \tmp, #256 @ irq_prio_l table size teqeq \irqstat, #0 #endif #ifdef IOMD_IRQREQD - ldreqb \irqstat, [\base, #IOMD_IRQREQD] + ldrbeq \irqstat, [\base, #IOMD_IRQREQD] addeq \tmp, \tmp, #256 @ irq_prio_lc table size teqeq \irqstat, #0 #endif -2406: ldrneb \irqnr, [\tmp, \irqstat] @ get IRQ number +2406: ldrbne \irqnr, [\tmp, \irqstat] @ get IRQ number .endm /* diff --git a/arch/arm/include/debug/tegra.S b/arch/arm/include/debug/tegra.S index 3bc80599c022..4a5a645c76e2 100644 --- a/arch/arm/include/debug/tegra.S +++ b/arch/arm/include/debug/tegra.S @@ -173,7 +173,7 @@ .macro senduart, rd, rx cmp \rx, #0 - strneb \rd, [\rx, #UART_TX << UART_SHIFT] + strbne \rd, [\rx, #UART_TX << UART_SHIFT] 1001: .endm diff --git a/arch/arm/kernel/debug.S b/arch/arm/kernel/debug.S index b795dc2408c0..b9f94e03d916 100644 --- a/arch/arm/kernel/debug.S +++ b/arch/arm/kernel/debug.S @@ -86,7 +86,7 @@ hexbuf_rel: .long hexbuf_addr - . ENTRY(printascii) addruart_current r3, r1, r2 1: teq r0, #0 - ldrneb r1, [r0], #1 + ldrbne r1, [r0], #1 teqne r1, #0 reteq lr 2: teq r1, #'\n' diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index e85a3af9ddeb..ce4aea57130a 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -636,7 +636,7 @@ call_fpe: @ Test if we need to give access to iWMMXt coprocessors ldr r5, [r10, #TI_FLAGS] rsbs r7, r8, #(1 << 8) @ CP 0 or 1 only - movcss r7, r5, lsr #(TIF_USING_IWMMXT + 1) + movscs r7, r5, lsr #(TIF_USING_IWMMXT + 1) bcs iwmmxt_task_enable #endif ARM( add pc, pc, r8, lsr #6 ) @@ -872,7 +872,7 @@ __kuser_cmpxchg64: @ 0xffff0f60 smp_dmb arm 1: ldrexd r0, r1, [r2] @ load current val eors r3, r0, r4 @ compare with oldval (1) - eoreqs r3, r1, r5 @ compare with oldval (2) + eorseq r3, r1, r5 @ compare with oldval (2) strexdeq r3, r6, r7, [r2] @ store newval if eq teqeq r3, #1 @ success? 
beq 1b @ if no then retry @@ -896,8 +896,8 @@ __kuser_cmpxchg64: @ 0xffff0f60 ldmia r1, {r6, lr} @ load new val 1: ldmia r2, {r0, r1} @ load current val eors r3, r0, r4 @ compare with oldval (1) - eoreqs r3, r1, r5 @ compare with oldval (2) -2: stmeqia r2, {r6, lr} @ store newval if eq + eorseq r3, r1, r5 @ compare with oldval (2) +2: stmiaeq r2, {r6, lr} @ store newval if eq rsbs r0, r3, #0 @ set return val and C flag ldmfd sp!, {r4, r5, r6, pc} @@ -911,7 +911,7 @@ kuser_cmpxchg64_fixup: mov r7, #0xffff0fff sub r7, r7, #(0xffff0fff - (0xffff0f60 + (1b - __kuser_cmpxchg64))) subs r8, r4, r7 - rsbcss r8, r8, #(2b - 1b) + rsbscs r8, r8, #(2b - 1b) strcs r7, [sp, #S_PC] #if __LINUX_ARM_ARCH__ < 6 bcc kuser_cmpxchg32_fixup @@ -969,7 +969,7 @@ kuser_cmpxchg32_fixup: mov r7, #0xffff0fff sub r7, r7, #(0xffff0fff - (0xffff0fc0 + (1b - __kuser_cmpxchg))) subs r8, r4, r7 - rsbcss r8, r8, #(2b - 1b) + rsbscs r8, r8, #(2b - 1b) strcs r7, [sp, #S_PC] ret lr .previous diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 0465d65d23de..f7649adef505 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -373,7 +373,7 @@ sys_syscall: movhs scno, #0 csdb #endif - stmloia sp, {r5, r6} @ shuffle args + stmialo sp, {r5, r6} @ shuffle args movlo r0, r1 movlo r1, r2 movlo r2, r3 diff --git a/arch/arm/kernel/entry-header.S b/arch/arm/kernel/entry-header.S index 62db1c9746cb..32051ec5b33f 100644 --- a/arch/arm/kernel/entry-header.S +++ b/arch/arm/kernel/entry-header.S @@ -388,8 +388,8 @@ badr lr, \ret @ return address .if \reload add r1, sp, #S_R0 + S_OFF @ pointer to regs - ldmccia r1, {r0 - r6} @ reload r0-r6 - stmccia sp, {r4, r5} @ update stack arguments + ldmiacc r1, {r0 - r6} @ reload r0-r6 + stmiacc sp, {r4, r5} @ update stack arguments .endif ldrcc pc, [\table, \tmp, lsl #2] @ call sys_* routine #else @@ -397,8 +397,8 @@ badr lr, \ret @ return address .if \reload add r1, sp, #S_R0 + S_OFF @ pointer to regs - ldmccia r1, {r0 - r6} @ reload r0-r6 - stmccia sp, {r4, r5} @ update stack arguments + ldmiacc r1, {r0 - r6} @ reload r0-r6 + stmiacc sp, {r4, r5} @ update stack arguments .endif ldrcc pc, [\table, \nr, lsl #2] @ call sys_* routine #endif diff --git a/arch/arm/lib/clear_user.S b/arch/arm/lib/clear_user.S index e936352ccb00..55946e3fa2ba 100644 --- a/arch/arm/lib/clear_user.S +++ b/arch/arm/lib/clear_user.S @@ -44,7 +44,7 @@ UNWIND(.save {r1, lr}) strusr r2, r0, 1, ne, rept=2 tst r1, #1 @ x1 x0 x1 x0 x1 x0 x1 it ne @ explicit IT needed for the label -USER( strnebt r2, [r0]) +USER( strbtne r2, [r0]) mov r0, #0 ldmfd sp!, {r1, pc} UNWIND(.fnend) diff --git a/arch/arm/lib/copy_page.S b/arch/arm/lib/copy_page.S index 6ee2f6706f86..b84ce1792043 100644 --- a/arch/arm/lib/copy_page.S +++ b/arch/arm/lib/copy_page.S @@ -39,9 +39,9 @@ ENTRY(copy_page) .endr subs r2, r2, #1 @ 1 stmia r0!, {r3, r4, ip, lr} @ 4 - ldmgtia r1!, {r3, r4, ip, lr} @ 4 + ldmiagt r1!, {r3, r4, ip, lr} @ 4 bgt 1b @ 1 - PLD( ldmeqia r1!, {r3, r4, ip, lr} ) + PLD( ldmiaeq r1!, {r3, r4, ip, lr} ) PLD( beq 2b ) ldmfd sp!, {r4, pc} @ 3 ENDPROC(copy_page) diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S index 2d54491b0e22..a11f2c25e03a 100644 --- a/arch/arm/lib/copy_template.S +++ b/arch/arm/lib/copy_template.S @@ -99,7 +99,7 @@ CALGN( ands ip, r0, #31 ) CALGN( rsb r3, ip, #32 ) - CALGN( sbcnes r4, r3, r2 ) @ C is always set here + CALGN( sbcsne r4, r3, r2 ) @ C is always set here CALGN( bcs 2f ) CALGN( adr r4, 6f ) CALGN( subs r2, r2, r3 ) @ C gets set @@ -204,7 
+204,7 @@ CALGN( ands ip, r0, #31 ) CALGN( rsb ip, ip, #32 ) - CALGN( sbcnes r4, ip, r2 ) @ C is always set here + CALGN( sbcsne r4, ip, r2 ) @ C is always set here CALGN( subcc r2, r2, ip ) CALGN( bcc 15f ) diff --git a/arch/arm/lib/csumpartial.S b/arch/arm/lib/csumpartial.S index 984e0f29d548..bd84e2db353b 100644 --- a/arch/arm/lib/csumpartial.S +++ b/arch/arm/lib/csumpartial.S @@ -40,9 +40,9 @@ td3 .req lr /* we must have at least one byte. */ tst buf, #1 @ odd address? movne sum, sum, ror #8 - ldrneb td0, [buf], #1 + ldrbne td0, [buf], #1 subne len, len, #1 - adcnes sum, sum, td0, put_byte_1 + adcsne sum, sum, td0, put_byte_1 .Lless4: tst len, #6 beq .Lless8_byte @@ -68,8 +68,8 @@ td3 .req lr bne .Lless8_wordlp .Lless8_byte: tst len, #1 @ odd number of bytes - ldrneb td0, [buf], #1 @ include last byte - adcnes sum, sum, td0, put_byte_0 @ update checksum + ldrbne td0, [buf], #1 @ include last byte + adcsne sum, sum, td0, put_byte_0 @ update checksum .Ldone: adc r0, sum, #0 @ collect up the last carry ldr td0, [sp], #4 @@ -78,17 +78,17 @@ td3 .req lr ldr pc, [sp], #4 @ return .Lnot_aligned: tst buf, #1 @ odd address - ldrneb td0, [buf], #1 @ make even + ldrbne td0, [buf], #1 @ make even subne len, len, #1 - adcnes sum, sum, td0, put_byte_1 @ update checksum + adcsne sum, sum, td0, put_byte_1 @ update checksum tst buf, #2 @ 32-bit aligned? #if __LINUX_ARM_ARCH__ >= 4 - ldrneh td0, [buf], #2 @ make 32-bit aligned + ldrhne td0, [buf], #2 @ make 32-bit aligned subne len, len, #2 #else - ldrneb td0, [buf], #1 - ldrneb ip, [buf], #1 + ldrbne td0, [buf], #1 + ldrbne ip, [buf], #1 subne len, len, #2 #ifndef __ARMEB__ orrne td0, td0, ip, lsl #8 @@ -96,7 +96,7 @@ td3 .req lr orrne td0, ip, td0, lsl #8 #endif #endif - adcnes sum, sum, td0 @ update checksum + adcsne sum, sum, td0 @ update checksum ret lr ENTRY(csum_partial) diff --git a/arch/arm/lib/csumpartialcopygeneric.S b/arch/arm/lib/csumpartialcopygeneric.S index 10b45909610c..08e17758cbea 100644 --- a/arch/arm/lib/csumpartialcopygeneric.S +++ b/arch/arm/lib/csumpartialcopygeneric.S @@ -148,9 +148,9 @@ FN_ENTRY strb r5, [dst], #1 mov r5, r4, get_byte_2 .Lexit: tst len, #1 - strneb r5, [dst], #1 + strbne r5, [dst], #1 andne r5, r5, #255 - adcnes sum, sum, r5, put_byte_0 + adcsne sum, sum, r5, put_byte_0 /* * If the dst pointer was not 16-bit aligned, we diff --git a/arch/arm/lib/csumpartialcopyuser.S b/arch/arm/lib/csumpartialcopyuser.S index b83fdc06286a..f4716d98e0b4 100644 --- a/arch/arm/lib/csumpartialcopyuser.S +++ b/arch/arm/lib/csumpartialcopyuser.S @@ -95,7 +95,7 @@ add r2, r2, r1 mov r0, #0 @ zero the buffer 9002: teq r2, r1 - strneb r0, [r1], #1 + strbne r0, [r1], #1 bne 9002b load_regs .popsection diff --git a/arch/arm/lib/div64.S b/arch/arm/lib/div64.S index a9eafe4981eb..4d80f690c48b 100644 --- a/arch/arm/lib/div64.S +++ b/arch/arm/lib/div64.S @@ -88,8 +88,8 @@ UNWIND(.fnstart) @ Break out early if dividend reaches 0. 
2: cmp xh, yl orrcs yh, yh, ip - subcss xh, xh, yl - movnes ip, ip, lsr #1 + subscs xh, xh, yl + movsne ip, ip, lsr #1 mov yl, yl, lsr #1 bne 2b diff --git a/arch/arm/lib/floppydma.S b/arch/arm/lib/floppydma.S index 617150b1baef..de68d3b343e3 100644 --- a/arch/arm/lib/floppydma.S +++ b/arch/arm/lib/floppydma.S @@ -14,8 +14,8 @@ .global floppy_fiqin_end ENTRY(floppy_fiqin_start) subs r9, r9, #1 - ldrgtb r12, [r11, #-4] - ldrleb r12, [r11], #0 + ldrbgt r12, [r11, #-4] + ldrble r12, [r11], #0 strb r12, [r10], #1 subs pc, lr, #4 floppy_fiqin_end: @@ -23,10 +23,10 @@ floppy_fiqin_end: .global floppy_fiqout_end ENTRY(floppy_fiqout_start) subs r9, r9, #1 - ldrgeb r12, [r10], #1 + ldrbge r12, [r10], #1 movlt r12, #0 - strleb r12, [r11], #0 - subles pc, lr, #4 + strble r12, [r11], #0 + subsle pc, lr, #4 strb r12, [r11, #-4] subs pc, lr, #4 floppy_fiqout_end: diff --git a/arch/arm/lib/io-readsb.S b/arch/arm/lib/io-readsb.S index c31b2f3153f1..91038a0a77b5 100644 --- a/arch/arm/lib/io-readsb.S +++ b/arch/arm/lib/io-readsb.S @@ -16,10 +16,10 @@ cmp ip, #2 ldrb r3, [r0] strb r3, [r1], #1 - ldrgeb r3, [r0] - strgeb r3, [r1], #1 - ldrgtb r3, [r0] - strgtb r3, [r1], #1 + ldrbge r3, [r0] + strbge r3, [r1], #1 + ldrbgt r3, [r0] + strbgt r3, [r1], #1 subs r2, r2, ip bne .Linsb_aligned @@ -72,7 +72,7 @@ ENTRY(__raw_readsb) bpl .Linsb_16_lp tst r2, #15 - ldmeqfd sp!, {r4 - r6, pc} + ldmfdeq sp!, {r4 - r6, pc} .Linsb_no_16: tst r2, #8 beq .Linsb_no_8 @@ -109,15 +109,15 @@ ENTRY(__raw_readsb) str r3, [r1], #4 .Linsb_no_4: ands r2, r2, #3 - ldmeqfd sp!, {r4 - r6, pc} + ldmfdeq sp!, {r4 - r6, pc} cmp r2, #2 ldrb r3, [r0] strb r3, [r1], #1 - ldrgeb r3, [r0] - strgeb r3, [r1], #1 - ldrgtb r3, [r0] - strgtb r3, [r1] + ldrbge r3, [r0] + strbge r3, [r1], #1 + ldrbgt r3, [r0] + strbgt r3, [r1] ldmfd sp!, {r4 - r6, pc} ENDPROC(__raw_readsb) diff --git a/arch/arm/lib/io-readsl.S b/arch/arm/lib/io-readsl.S index 2ed86fa5465f..f2e2064318d2 100644 --- a/arch/arm/lib/io-readsl.S +++ b/arch/arm/lib/io-readsl.S @@ -30,7 +30,7 @@ ENTRY(__raw_readsl) 2: movs r2, r2, lsl #31 ldrcs r3, [r0, #0] ldrcs ip, [r0, #0] - stmcsia r1!, {r3, ip} + stmiacs r1!, {r3, ip} ldrne r3, [r0, #0] strne r3, [r1, #0] ret lr diff --git a/arch/arm/lib/io-readsw-armv3.S b/arch/arm/lib/io-readsw-armv3.S index 413da9914529..8b25b69c516e 100644 --- a/arch/arm/lib/io-readsw-armv3.S +++ b/arch/arm/lib/io-readsw-armv3.S @@ -68,7 +68,7 @@ ENTRY(__raw_readsw) bpl .Linsw_8_lp tst r2, #7 - ldmeqfd sp!, {r4, r5, r6, pc} + ldmfdeq sp!, {r4, r5, r6, pc} .Lno_insw_8: tst r2, #4 beq .Lno_insw_4 @@ -97,9 +97,9 @@ ENTRY(__raw_readsw) .Lno_insw_2: tst r2, #1 ldrne r3, [r0] - strneb r3, [r1], #1 + strbne r3, [r1], #1 movne r3, r3, lsr #8 - strneb r3, [r1] + strbne r3, [r1] ldmfd sp!, {r4, r5, r6, pc} diff --git a/arch/arm/lib/io-readsw-armv4.S b/arch/arm/lib/io-readsw-armv4.S index d9a45e9692ae..5efdd66f5dcd 100644 --- a/arch/arm/lib/io-readsw-armv4.S +++ b/arch/arm/lib/io-readsw-armv4.S @@ -76,8 +76,8 @@ ENTRY(__raw_readsw) pack r3, r3, ip str r3, [r1], #4 -.Lno_insw_2: ldrneh r3, [r0] - strneh r3, [r1] +.Lno_insw_2: ldrhne r3, [r0] + strhne r3, [r1] ldmfd sp!, {r4, r5, pc} @@ -94,7 +94,7 @@ ENTRY(__raw_readsw) #endif .Linsw_noalign: stmfd sp!, {r4, lr} - ldrccb ip, [r1, #-1]! + ldrbcc ip, [r1, #-1]! 
bcc 1f ldrh ip, [r0] @@ -121,11 +121,11 @@ ENTRY(__raw_readsw) 3: tst r2, #1 strb ip, [r1], #1 - ldrneh ip, [r0] + ldrhne ip, [r0] _BE_ONLY_( movne ip, ip, ror #8 ) - strneb ip, [r1], #1 + strbne ip, [r1], #1 _LE_ONLY_( movne ip, ip, lsr #8 ) _BE_ONLY_( movne ip, ip, lsr #24 ) - strneb ip, [r1] + strbne ip, [r1] ldmfd sp!, {r4, pc} ENDPROC(__raw_readsw) diff --git a/arch/arm/lib/io-writesb.S b/arch/arm/lib/io-writesb.S index a46bbc9b168b..7d2881a2381e 100644 --- a/arch/arm/lib/io-writesb.S +++ b/arch/arm/lib/io-writesb.S @@ -36,10 +36,10 @@ cmp ip, #2 ldrb r3, [r1], #1 strb r3, [r0] - ldrgeb r3, [r1], #1 - strgeb r3, [r0] - ldrgtb r3, [r1], #1 - strgtb r3, [r0] + ldrbge r3, [r1], #1 + strbge r3, [r0] + ldrbgt r3, [r1], #1 + strbgt r3, [r0] subs r2, r2, ip bne .Loutsb_aligned @@ -64,7 +64,7 @@ ENTRY(__raw_writesb) bpl .Loutsb_16_lp tst r2, #15 - ldmeqfd sp!, {r4, r5, pc} + ldmfdeq sp!, {r4, r5, pc} .Loutsb_no_16: tst r2, #8 beq .Loutsb_no_8 @@ -80,15 +80,15 @@ ENTRY(__raw_writesb) outword r3 .Loutsb_no_4: ands r2, r2, #3 - ldmeqfd sp!, {r4, r5, pc} + ldmfdeq sp!, {r4, r5, pc} cmp r2, #2 ldrb r3, [r1], #1 strb r3, [r0] - ldrgeb r3, [r1], #1 - strgeb r3, [r0] - ldrgtb r3, [r1] - strgtb r3, [r0] + ldrbge r3, [r1], #1 + strbge r3, [r0] + ldrbgt r3, [r1] + strbgt r3, [r0] ldmfd sp!, {r4, r5, pc} ENDPROC(__raw_writesb) diff --git a/arch/arm/lib/io-writesl.S b/arch/arm/lib/io-writesl.S index 4ea2435988c1..7596ac0c90b0 100644 --- a/arch/arm/lib/io-writesl.S +++ b/arch/arm/lib/io-writesl.S @@ -28,7 +28,7 @@ ENTRY(__raw_writesl) bpl 1b ldmfd sp!, {r4, lr} 2: movs r2, r2, lsl #31 - ldmcsia r1!, {r3, ip} + ldmiacs r1!, {r3, ip} strcs r3, [r0, #0] ldrne r3, [r1, #0] strcs ip, [r0, #0] diff --git a/arch/arm/lib/io-writesw-armv3.S b/arch/arm/lib/io-writesw-armv3.S index 121789eb6802..cb94b9b49405 100644 --- a/arch/arm/lib/io-writesw-armv3.S +++ b/arch/arm/lib/io-writesw-armv3.S @@ -79,7 +79,7 @@ ENTRY(__raw_writesw) bpl .Loutsw_8_lp tst r2, #7 - ldmeqfd sp!, {r4, r5, r6, pc} + ldmfdeq sp!, {r4, r5, r6, pc} .Lno_outsw_8: tst r2, #4 beq .Lno_outsw_4 diff --git a/arch/arm/lib/io-writesw-armv4.S b/arch/arm/lib/io-writesw-armv4.S index 269f90c51ad2..e6645b2f249e 100644 --- a/arch/arm/lib/io-writesw-armv4.S +++ b/arch/arm/lib/io-writesw-armv4.S @@ -61,8 +61,8 @@ ENTRY(__raw_writesw) ldr r3, [r1], #4 outword r3 -.Lno_outsw_2: ldrneh r3, [r1] - strneh r3, [r0] +.Lno_outsw_2: ldrhne r3, [r1] + strhne r3, [r0] ldmfd sp!, {r4, r5, pc} @@ -95,6 +95,6 @@ ENTRY(__raw_writesw) tst r2, #1 3: movne ip, r3, lsr #8 - strneh ip, [r0] + strhne ip, [r0] ret lr ENDPROC(__raw_writesw) diff --git a/arch/arm/lib/lib1funcs.S b/arch/arm/lib/lib1funcs.S index 9397b2e532af..c23f9d9e2970 100644 --- a/arch/arm/lib/lib1funcs.S +++ b/arch/arm/lib/lib1funcs.S @@ -96,7 +96,7 @@ Boston, MA 02111-1307, USA. */ subhs \dividend, \dividend, \divisor, lsr #3 orrhs \result, \result, \curbit, lsr #3 cmp \dividend, #0 @ Early termination? - movnes \curbit, \curbit, lsr #4 @ No, any more bits to do? + movsne \curbit, \curbit, lsr #4 @ No, any more bits to do? movne \divisor, \divisor, lsr #4 bne 1b @@ -182,7 +182,7 @@ Boston, MA 02111-1307, USA. 
*/ subhs \dividend, \dividend, \divisor, lsr #3 cmp \dividend, #1 mov \divisor, \divisor, lsr #4 - subges \order, \order, #4 + subsge \order, \order, #4 bge 1b tst \order, #3 diff --git a/arch/arm/lib/memmove.S b/arch/arm/lib/memmove.S index 69a9d47fc5ab..d70304cb2cd0 100644 --- a/arch/arm/lib/memmove.S +++ b/arch/arm/lib/memmove.S @@ -59,7 +59,7 @@ ENTRY(memmove) blt 5f CALGN( ands ip, r0, #31 ) - CALGN( sbcnes r4, ip, r2 ) @ C is always set here + CALGN( sbcsne r4, ip, r2 ) @ C is always set here CALGN( bcs 2f ) CALGN( adr r4, 6f ) CALGN( subs r2, r2, ip ) @ C is set here @@ -114,20 +114,20 @@ ENTRY(memmove) UNWIND( .save {r0, r4, lr} ) @ still in first stmfd block 8: movs r2, r2, lsl #31 - ldrneb r3, [r1, #-1]! - ldrcsb r4, [r1, #-1]! - ldrcsb ip, [r1, #-1] - strneb r3, [r0, #-1]! - strcsb r4, [r0, #-1]! - strcsb ip, [r0, #-1] + ldrbne r3, [r1, #-1]! + ldrbcs r4, [r1, #-1]! + ldrbcs ip, [r1, #-1] + strbne r3, [r0, #-1]! + strbcs r4, [r0, #-1]! + strbcs ip, [r0, #-1] ldmfd sp!, {r0, r4, pc} 9: cmp ip, #2 - ldrgtb r3, [r1, #-1]! - ldrgeb r4, [r1, #-1]! + ldrbgt r3, [r1, #-1]! + ldrbge r4, [r1, #-1]! ldrb lr, [r1, #-1]! - strgtb r3, [r0, #-1]! - strgeb r4, [r0, #-1]! + strbgt r3, [r0, #-1]! + strbge r4, [r0, #-1]! subs r2, r2, ip strb lr, [r0, #-1]! blt 8b @@ -150,7 +150,7 @@ ENTRY(memmove) blt 14f CALGN( ands ip, r0, #31 ) - CALGN( sbcnes r4, ip, r2 ) @ C is always set here + CALGN( sbcsne r4, ip, r2 ) @ C is always set here CALGN( subcc r2, r2, ip ) CALGN( bcc 15f ) diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S index ed6d35d9cdb5..5593a45e0a8c 100644 --- a/arch/arm/lib/memset.S +++ b/arch/arm/lib/memset.S @@ -44,20 +44,20 @@ UNWIND( .save {r8, lr} ) mov lr, r3 2: subs r2, r2, #64 - stmgeia ip!, {r1, r3, r8, lr} @ 64 bytes at a time. - stmgeia ip!, {r1, r3, r8, lr} - stmgeia ip!, {r1, r3, r8, lr} - stmgeia ip!, {r1, r3, r8, lr} + stmiage ip!, {r1, r3, r8, lr} @ 64 bytes at a time. + stmiage ip!, {r1, r3, r8, lr} + stmiage ip!, {r1, r3, r8, lr} + stmiage ip!, {r1, r3, r8, lr} bgt 2b - ldmeqfd sp!, {r8, pc} @ Now <64 bytes to go. + ldmfdeq sp!, {r8, pc} @ Now <64 bytes to go. /* * No need to correct the count; we're only testing bits from now on */ tst r2, #32 - stmneia ip!, {r1, r3, r8, lr} - stmneia ip!, {r1, r3, r8, lr} + stmiane ip!, {r1, r3, r8, lr} + stmiane ip!, {r1, r3, r8, lr} tst r2, #16 - stmneia ip!, {r1, r3, r8, lr} + stmiane ip!, {r1, r3, r8, lr} ldmfd sp!, {r8, lr} UNWIND( .fnend ) @@ -87,22 +87,22 @@ UNWIND( .save {r4-r8, lr} ) rsb r8, r8, #32 sub r2, r2, r8 movs r8, r8, lsl #(32 - 4) - stmcsia ip!, {r4, r5, r6, r7} - stmmiia ip!, {r4, r5} + stmiacs ip!, {r4, r5, r6, r7} + stmiami ip!, {r4, r5} tst r8, #(1 << 30) mov r8, r1 strne r1, [ip], #4 3: subs r2, r2, #64 - stmgeia ip!, {r1, r3-r8, lr} - stmgeia ip!, {r1, r3-r8, lr} + stmiage ip!, {r1, r3-r8, lr} + stmiage ip!, {r1, r3-r8, lr} bgt 3b - ldmeqfd sp!, {r4-r8, pc} + ldmfdeq sp!, {r4-r8, pc} tst r2, #32 - stmneia ip!, {r1, r3-r8, lr} + stmiane ip!, {r1, r3-r8, lr} tst r2, #16 - stmneia ip!, {r4-r7} + stmiane ip!, {r4-r7} ldmfd sp!, {r4-r8, lr} UNWIND( .fnend ) @@ -110,7 +110,7 @@ UNWIND( .fnend ) UNWIND( .fnstart ) 4: tst r2, #8 - stmneia ip!, {r1, r3} + stmiane ip!, {r1, r3} tst r2, #4 strne r1, [ip], #4 /* @@ -118,17 +118,17 @@ UNWIND( .fnstart ) * may have an unaligned pointer as well. 
*/ 5: tst r2, #2 - strneb r1, [ip], #1 - strneb r1, [ip], #1 + strbne r1, [ip], #1 + strbne r1, [ip], #1 tst r2, #1 - strneb r1, [ip], #1 + strbne r1, [ip], #1 ret lr 6: subs r2, r2, #4 @ 1 do we have enough blt 5b @ 1 bytes to align with? cmp r3, #2 @ 1 - strltb r1, [ip], #1 @ 1 - strleb r1, [ip], #1 @ 1 + strblt r1, [ip], #1 @ 1 + strble r1, [ip], #1 @ 1 strb r1, [ip], #1 @ 1 add r2, r2, r3 @ 1 (r2 = r2 - (4 - r3)) b 1b diff --git a/arch/arm/mach-ks8695/include/mach/entry-macro.S b/arch/arm/mach-ks8695/include/mach/entry-macro.S index 8315b34f32ff..7ff812cb010b 100644 --- a/arch/arm/mach-ks8695/include/mach/entry-macro.S +++ b/arch/arm/mach-ks8695/include/mach/entry-macro.S @@ -42,6 +42,6 @@ moveq \irqstat, \irqstat, lsr #2 addeq \irqnr, \irqnr, #2 tst \irqstat, #0x01 - addeqs \irqnr, \irqnr, #1 + addseq \irqnr, \irqnr, #1 1001: .endm diff --git a/arch/arm/mach-tegra/reset-handler.S b/arch/arm/mach-tegra/reset-handler.S index 805f306fa6f7..e22ccf87eded 100644 --- a/arch/arm/mach-tegra/reset-handler.S +++ b/arch/arm/mach-tegra/reset-handler.S @@ -172,7 +172,7 @@ after_errata: mov32 r5, TEGRA_IRAM_BASE + TEGRA_IRAM_RESET_HANDLER_OFFSET mov r0, #CPU_NOT_RESETTABLE cmp r10, #0 - strneb r0, [r5, #__tegra20_cpu1_resettable_status_offset] + strbne r0, [r5, #__tegra20_cpu1_resettable_status_offset] 1: #endif diff --git a/arch/arm/mm/cache-v6.S b/arch/arm/mm/cache-v6.S index 24659952c278..be68d62566c7 100644 --- a/arch/arm/mm/cache-v6.S +++ b/arch/arm/mm/cache-v6.S @@ -215,8 +215,8 @@ v6_dma_inv_range: #endif tst r1, #D_CACHE_LINE_SIZE - 1 #ifdef CONFIG_DMA_CACHE_RWFO - ldrneb r2, [r1, #-1] @ read for ownership - strneb r2, [r1, #-1] @ write for ownership + ldrbne r2, [r1, #-1] @ read for ownership + strbne r2, [r1, #-1] @ write for ownership #endif bic r1, r1, #D_CACHE_LINE_SIZE - 1 #ifdef HARVARD_CACHE @@ -284,8 +284,8 @@ ENTRY(v6_dma_flush_range) add r0, r0, #D_CACHE_LINE_SIZE cmp r0, r1 #ifdef CONFIG_DMA_CACHE_RWFO - ldrlob r2, [r0] @ read for ownership - strlob r2, [r0] @ write for ownership + ldrblo r2, [r0] @ read for ownership + strblo r2, [r0] @ write for ownership #endif blo 1b mov r0, #0 diff --git a/arch/arm/mm/proc-v7m.S b/arch/arm/mm/proc-v7m.S index 92e84181933a..acd5a66dfc23 100644 --- a/arch/arm/mm/proc-v7m.S +++ b/arch/arm/mm/proc-v7m.S @@ -152,10 +152,10 @@ __v7m_setup_cont: @ Configure caches (if implemented) teq r8, #0 - stmneia sp, {r0-r6, lr} @ v7m_invalidate_l1 touches r0-r6 + stmiane sp, {r0-r6, lr} @ v7m_invalidate_l1 touches r0-r6 blne v7m_invalidate_l1 teq r8, #0 @ re-evalutae condition - ldmneia sp, {r0-r6, lr} + ldmiane sp, {r0-r6, lr} @ Configure the System Control Register to ensure 8-byte stack alignment @ Note the STKALIGN bit is either RW or RAO. From b7e8c9397cd4efe6567d2728f091f1b728025533 Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Mon, 18 Feb 2019 00:58:29 +0100 Subject: [PATCH 254/855] ARM: 8845/1: use unified assembler in c files Use unified assembler syntax (UAL) in inline assembler. Divided syntax is considered deprecated. This will also allow to build the kernel using LLVM's integrated assembler. When compiling non-Thumb2 GCC always emits a ".syntax divided" at the beginning of the inline assembly which makes the assembler fail. Since GCC 5 there is the -masm-syntax-unified GCC option which make GCC assume unified syntax asm and hence emits ".syntax unified" even in ARM mode. However, the option is broken since GCC version 6 (see GCC PR88648 [1]). Work around by adding ".syntax unified" as part of the inline assembly. 
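For readers less used to the two dialects: in the deprecated divided syntax the condition code is written before the size/addressing-mode suffix, while unified syntax (UAL) puts it after. A minimal sketch, mirroring mnemonics converted in the hunks above, plus a purely illustrative C fragment of the ".syntax unified" workaround described here (count and src stand in for whatever the surrounding code provides):

    @ divided (deprecated)            @ unified (UAL)
    ldrneb  r3, [r1], #1              ldrbne  r3, [r1], #1
    strgeb  r3, [r0]                  strbge  r3, [r0]
    stmgeia ip!, {r1, r3}             stmiage ip!, {r1, r3}
    ldmeqfd sp!, {r4, pc}             ldmfdeq sp!, {r4, pc}

    /* force UAL in inline asm even where GCC would emit ".syntax divided" */
    asm volatile (".syntax unified\n\t"
                  "subs    %0, %0, #1\n\t"
                  "ldmiane %1!, {r2, r3}\n\t"
                  : "+r" (count), "+r" (src) : : "r2", "r3", "cc", "memory");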
[0] https://gcc.gnu.org/onlinedocs/gcc/ARM-Options.html#index-masm-syntax-unified [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88648 Signed-off-by: Stefan Agner Acked-by: Nicolas Pitre Signed-off-by: Russell King --- arch/arm/mm/copypage-v4mc.c | 3 ++- arch/arm/mm/copypage-v4wb.c | 3 ++- arch/arm/mm/copypage-v4wt.c | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/arm/mm/copypage-v4mc.c b/arch/arm/mm/copypage-v4mc.c index b03202cddddb..f74cdce6d4da 100644 --- a/arch/arm/mm/copypage-v4mc.c +++ b/arch/arm/mm/copypage-v4mc.c @@ -45,6 +45,7 @@ static void mc_copy_user_page(void *from, void *to) int tmp; asm volatile ("\ + .syntax unified\n\ ldmia %0!, {r2, r3, ip, lr} @ 4\n\ 1: mcr p15, 0, %1, c7, c6, 1 @ 1 invalidate D line\n\ stmia %1!, {r2, r3, ip, lr} @ 4\n\ @@ -56,7 +57,7 @@ static void mc_copy_user_page(void *from, void *to) ldmia %0!, {r2, r3, ip, lr} @ 4\n\ subs %2, %2, #1 @ 1\n\ stmia %1!, {r2, r3, ip, lr} @ 4\n\ - ldmneia %0!, {r2, r3, ip, lr} @ 4\n\ + ldmiane %0!, {r2, r3, ip, lr} @ 4\n\ bne 1b @ " : "+&r" (from), "+&r" (to), "=&r" (tmp) : "2" (PAGE_SIZE / 64) diff --git a/arch/arm/mm/copypage-v4wb.c b/arch/arm/mm/copypage-v4wb.c index cd3e165afeed..6d336740aae4 100644 --- a/arch/arm/mm/copypage-v4wb.c +++ b/arch/arm/mm/copypage-v4wb.c @@ -27,6 +27,7 @@ static void v4wb_copy_user_page(void *kto, const void *kfrom) int tmp; asm volatile ("\ + .syntax unified\n\ ldmia %1!, {r3, r4, ip, lr} @ 4\n\ 1: mcr p15, 0, %0, c7, c6, 1 @ 1 invalidate D line\n\ stmia %0!, {r3, r4, ip, lr} @ 4\n\ @@ -38,7 +39,7 @@ static void v4wb_copy_user_page(void *kto, const void *kfrom) ldmia %1!, {r3, r4, ip, lr} @ 4\n\ subs %2, %2, #1 @ 1\n\ stmia %0!, {r3, r4, ip, lr} @ 4\n\ - ldmneia %1!, {r3, r4, ip, lr} @ 4\n\ + ldmiane %1!, {r3, r4, ip, lr} @ 4\n\ bne 1b @ 1\n\ mcr p15, 0, %1, c7, c10, 4 @ 1 drain WB" : "+&r" (kto), "+&r" (kfrom), "=&r" (tmp) diff --git a/arch/arm/mm/copypage-v4wt.c b/arch/arm/mm/copypage-v4wt.c index 8614572e1296..3851bb396442 100644 --- a/arch/arm/mm/copypage-v4wt.c +++ b/arch/arm/mm/copypage-v4wt.c @@ -25,6 +25,7 @@ static void v4wt_copy_user_page(void *kto, const void *kfrom) int tmp; asm volatile ("\ + .syntax unified\n\ ldmia %1!, {r3, r4, ip, lr} @ 4\n\ 1: stmia %0!, {r3, r4, ip, lr} @ 4\n\ ldmia %1!, {r3, r4, ip, lr} @ 4+1\n\ @@ -34,7 +35,7 @@ static void v4wt_copy_user_page(void *kto, const void *kfrom) ldmia %1!, {r3, r4, ip, lr} @ 4\n\ subs %2, %2, #1 @ 1\n\ stmia %0!, {r3, r4, ip, lr} @ 4\n\ - ldmneia %1!, {r3, r4, ip, lr} @ 4\n\ + ldmiane %1!, {r3, r4, ip, lr} @ 4\n\ bne 1b @ 1\n\ mcr p15, 0, %2, c7, c7, 0 @ flush ID cache" : "+&r" (kto), "+&r" (kfrom), "=&r" (tmp) From ca70ea43f80c98582f5ffbbd1e6f4da2742da0c4 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Mon, 18 Feb 2019 09:31:41 +0100 Subject: [PATCH 255/855] ARM: 8847/1: pm: fix HYP/SVC mode mismatch when MCPM is used MCPM does a soft reset of the CPUs and uses common cpu_resume() routine to perform low-level platform initialization. This results in a try to install HYP stubs for the second time for each CPU and results in false HYP/SVC mode mismatch detection. The HYP stubs are already installed at the beginning of the kernel initialization on the boot CPU (head.S) or in the secondary_startup() for other CPUs. To fix this issue MCPM code should use a cpu_resume() routine without HYP stubs installation. This change fixes HYP/SVC mode mismatch on Samsung Exynos5422-based Odroid XU3/XU4/HC1 boards. 
Fixes: 3721924c8154 ("ARM: 8081/1: MCPM: provide infrastructure to allow for MCPM loopback") Signed-off-by: Marek Szyprowski Acked-by: Nicolas Pitre Tested-by: Anand Moon Signed-off-by: Russell King --- arch/arm/common/mcpm_entry.c | 2 +- arch/arm/include/asm/suspend.h | 1 + arch/arm/kernel/sleep.S | 12 ++++++++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/arm/common/mcpm_entry.c b/arch/arm/common/mcpm_entry.c index ad574d20415c..1b1b82b37ce0 100644 --- a/arch/arm/common/mcpm_entry.c +++ b/arch/arm/common/mcpm_entry.c @@ -381,7 +381,7 @@ static int __init nocache_trampoline(unsigned long _arg) unsigned int cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1); phys_reset_t phys_reset; - mcpm_set_entry_vector(cpu, cluster, cpu_resume); + mcpm_set_entry_vector(cpu, cluster, cpu_resume_no_hyp); setup_mm_for_reboot(); __mcpm_cpu_going_down(cpu, cluster); diff --git a/arch/arm/include/asm/suspend.h b/arch/arm/include/asm/suspend.h index 452bbdcbcc83..506314265c6f 100644 --- a/arch/arm/include/asm/suspend.h +++ b/arch/arm/include/asm/suspend.h @@ -10,6 +10,7 @@ struct sleep_save_sp { }; extern void cpu_resume(void); +extern void cpu_resume_no_hyp(void); extern void cpu_resume_arm(void); extern int cpu_suspend(unsigned long, int (*)(unsigned long)); diff --git a/arch/arm/kernel/sleep.S b/arch/arm/kernel/sleep.S index a8257fc9cf2a..5dc8b80bb693 100644 --- a/arch/arm/kernel/sleep.S +++ b/arch/arm/kernel/sleep.S @@ -120,6 +120,14 @@ ENDPROC(cpu_resume_after_mmu) .text .align +#ifdef CONFIG_MCPM + .arm +THUMB( .thumb ) +ENTRY(cpu_resume_no_hyp) +ARM_BE8(setend be) @ ensure we are in BE mode + b no_hyp +#endif + #ifdef CONFIG_MMU .arm ENTRY(cpu_resume_arm) @@ -135,6 +143,7 @@ ARM_BE8(setend be) @ ensure we are in BE mode bl __hyp_stub_install_secondary #endif safe_svcmode_maskall r1 +no_hyp: mov r1, #0 ALT_SMP(mrc p15, 0, r0, c0, c0, 5) ALT_UP_B(1f) @@ -163,6 +172,9 @@ ENDPROC(cpu_resume) #ifdef CONFIG_MMU ENDPROC(cpu_resume_arm) +#endif +#ifdef CONFIG_MCPM +ENDPROC(cpu_resume_no_hyp) #endif .align 2 From 9db043d36bd379f4cc99054c079de0dabfc38d03 Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Wed, 20 Feb 2019 15:00:13 +0100 Subject: [PATCH 256/855] ARM: 8848/1: virt: Align GIC version check with arm64 counterpart arm64 has got relaxation on GIC version check at early boot stage due to update of the GIC architecture let's align ARM with that. 
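In C terms the relaxation means treating any non-zero ID_PFR1.GIC field as "GICv3 system registers present" instead of insisting on the value 1. A rough rendering of the before/after logic (read_id_pfr1() and ID_PFR1_GIC() are hypothetical helpers used only for illustration):

    /* before: only a GIC field value of exactly 1 enabled ICC_* setup */
    if (ID_PFR1_GIC(read_id_pfr1()) != 1)
        goto skip_gic_sysreg_setup;

    /* after: any non-zero value advertises the system register interface */
    if (ID_PFR1_GIC(read_id_pfr1()) == 0)
        goto skip_gic_sysreg_setup;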
To help backports (even though the code was correct at the time of writing) Fixes: e59941b9b381 ("ARM: 8527/1: virt: enable GICv3 system registers") Signed-off-by: Vladimir Murzin Reviewed-by: Marc Zyngier Signed-off-by: Russell King --- arch/arm/kernel/hyp-stub.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/kernel/hyp-stub.S b/arch/arm/kernel/hyp-stub.S index 60146e32619a..82a942894fc0 100644 --- a/arch/arm/kernel/hyp-stub.S +++ b/arch/arm/kernel/hyp-stub.S @@ -180,8 +180,8 @@ ARM_BE8(orr r7, r7, #(1 << 25)) @ HSCTLR.EE @ Check whether GICv3 system registers are available mrc p15, 0, r7, c0, c1, 1 @ ID_PFR1 ubfx r7, r7, #28, #4 - cmp r7, #1 - bne 2f + teq r7, #0 + beq 2f @ Enable system register accesses mrc p15, 4, r7, c12, c9, 5 @ ICC_HSRE From d410a8a49e3e00e07d43037e90f776d522b25a6a Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Wed, 20 Feb 2019 15:00:53 +0100 Subject: [PATCH 257/855] ARM: 8849/1: NOMMU: Fix encodings for PMSAv8's PRBAR4/PRLAR4 To access PRBARn, where n is referenced as a binary number: MRC p15, 0, , c6, c8+n[3:1], 4*n[0] ; Read PRBARn into Rt MCR p15, 0, , c6, c8+n[3:1], 4*n[0] ; Write Rt into PRBARn To access PRLARn, where n is referenced as a binary number: MRC p15, 0, , c6, c8+n[3:1], 4*n[0]+1 ; Read PRLARn into Rt MCR p15, 0, , c6, c8+n[3:1], 4*n[0]+1 ; Write Rt into PRLARn For PR{B,L}AR4, n is 4, n[0] is 0, n[3:1] is 2, while current encoding done with n[0] set to 1 which is wrong. Use proper encoding instead. Fixes: 046835b4aa22b9ab6aa0bb274e3b71047c4b887d ("ARM: 8757/1: NOMMU: Support PMSAv8 MPU") Signed-off-by: Vladimir Murzin Signed-off-by: Russell King --- arch/arm/kernel/head-nommu.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/kernel/head-nommu.S b/arch/arm/kernel/head-nommu.S index ec29de250076..c08d2d890f7b 100644 --- a/arch/arm/kernel/head-nommu.S +++ b/arch/arm/kernel/head-nommu.S @@ -439,8 +439,8 @@ M_CLASS(str r6, [r12, #PMSAv8_RLAR_A(3)]) str r5, [r12, #PMSAv8_RBAR_A(0)] str r6, [r12, #PMSAv8_RLAR_A(0)] #else - mcr p15, 0, r5, c6, c10, 1 @ PRBAR4 - mcr p15, 0, r6, c6, c10, 2 @ PRLAR4 + mcr p15, 0, r5, c6, c10, 0 @ PRBAR4 + mcr p15, 0, r6, c6, c10, 1 @ PRLAR4 #endif #endif ret lr From e74d53e30e2927fa5b223296ac7922baf15ea89a Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 25 Feb 2019 14:35:06 +1100 Subject: [PATCH 258/855] KVM: PPC: Fix compilation when KVM is not enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compiling with CONFIG_PPC_POWERNV=y and KVM disabled currently gives an error like this: CC arch/powerpc/kernel/dbell.o In file included from arch/powerpc/kernel/dbell.c:20:0: arch/powerpc/include/asm/kvm_ppc.h: In function ‘xics_on_xive’: arch/powerpc/include/asm/kvm_ppc.h:625:9: error: implicit declaration of function ‘xive_enabled’ [-Werror=implicit-function-declaration] return xive_enabled() && cpu_has_feature(CPU_FTR_HVMODE); ^ cc1: all warnings being treated as errors scripts/Makefile.build:276: recipe for target 'arch/powerpc/kernel/dbell.o' failed make[3]: *** [arch/powerpc/kernel/dbell.o] Error 1 Fix this by making the xics_on_xive() definition conditional on the same symbol (CONFIG_KVM_BOOK3S_64_HANDLER) that determines whether we include or not, since that's the header that defines xive_enabled(). 
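For context, the resulting helper presumably keeps the usual guarded-inline shape; sketched below, with the #else stub assumed rather than visible in the hunk that follows:

    #if defined(CONFIG_PPC_POWERNV) && defined(CONFIG_KVM_BOOK3S_64_HANDLER)
    static inline bool xics_on_xive(void)
    {
        return xive_enabled() && cpu_has_feature(CPU_FTR_HVMODE);
    }
    #else
    static inline bool xics_on_xive(void)
    {
        return false;    /* assumed fallback when XIVE support is not built in */
    }
    #endif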
Fixes: 03f953329bd8 ("KVM: PPC: Book3S: Allow XICS emulation to work in nested hosts using XIVE") Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_ppc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index d283d3179fbc..ac22b28ae78d 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -619,7 +619,7 @@ static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 ir static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { } #endif /* CONFIG_KVM_XIVE */ -#ifdef CONFIG_PPC_POWERNV +#if defined(CONFIG_PPC_POWERNV) && defined(CONFIG_KVM_BOOK3S_64_HANDLER) static inline bool xics_on_xive(void) { return xive_enabled() && cpu_has_feature(CPU_FTR_HVMODE); From 6feaa4194c18578623565017f95d1b2f6b243567 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 18 Feb 2019 17:32:12 -0500 Subject: [PATCH 259/855] drm/amdgpu/powerplay: add missing breaks in polaris10_smumgr This was noticed by Gustavo and his -Wimplicit-fallthrough patches. However, in this case, I believe we should have breaks rather than falling though, that said, in practice we should never fall through in the first place so there should be no change in behavior. Reviewed-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/smumgr/polaris10_smumgr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/smumgr/polaris10_smumgr.c b/drivers/gpu/drm/amd/powerplay/smumgr/polaris10_smumgr.c index 52abca065764..2d4cfe14f72e 100644 --- a/drivers/gpu/drm/amd/powerplay/smumgr/polaris10_smumgr.c +++ b/drivers/gpu/drm/amd/powerplay/smumgr/polaris10_smumgr.c @@ -2330,6 +2330,7 @@ static uint32_t polaris10_get_offsetof(uint32_t type, uint32_t member) case DRAM_LOG_BUFF_SIZE: return offsetof(SMU74_SoftRegisters, DRAM_LOG_BUFF_SIZE); } + break; case SMU_Discrete_DpmTable: switch (member) { case UvdBootLevel: @@ -2339,6 +2340,7 @@ static uint32_t polaris10_get_offsetof(uint32_t type, uint32_t member) case LowSclkInterruptThreshold: return offsetof(SMU74_Discrete_DpmTable, LowSclkInterruptThreshold); } + break; } pr_warn("can't get the offset of type %x member %x\n", type, member); return 0; From dcd5fb82ffb484124203aa339733663ac0b059f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathias=20Fr=C3=B6hlich?= Date: Sun, 10 Feb 2019 11:13:01 +0100 Subject: [PATCH 260/855] drm/amd/display: Fix reference counting for struct dc_sink. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reference counting in amdgpu_dm_connector for amdgpu_dm_connector::dc_sink and amdgpu_dm_connector::dc_em_sink as well as in dc_link::local_sink seems to be out of shape. Thus make reference counting consistent for these members and just plain increment the reference count when the variable gets assigned and decrement when the pointer is set to zero or replaced. Also simplify reference counting in selected function sopes to be sure the reference is released in any case. In some cases add NULL pointer check before dereferencing. At a hand full of places a comment is placed to stat that the reference increment happened already somewhere else. This actually fixes the following kernel bug on my system when enabling display core in amdgpu. There are some more similar bug reports around, so it probably helps at more places. kernel BUG at mm/slub.c:294! 
invalid opcode: 0000 [#1] SMP PTI CPU: 9 PID: 1180 Comm: Xorg Not tainted 5.0.0-rc1+ #2 Hardware name: Supermicro X10DAi/X10DAI, BIOS 3.0a 02/05/2018 RIP: 0010:__slab_free+0x1e2/0x3d0 Code: 8b 54 24 30 48 89 4c 24 28 e8 da fb ff ff 4c 8b 54 24 28 85 c0 0f 85 67 fe ff ff 48 8d 65 d8 5b 41 5c 41 5d 41 5e 41 5f 5d c3 <0f> 0b 49 3b 5c 24 28 75 ab 48 8b 44 24 30 49 89 4c 24 28 49 89 44 RSP: 0018:ffffb0978589fa90 EFLAGS: 00010246 RAX: ffff92f12806c400 RBX: 0000000080200019 RCX: ffff92f12806c400 RDX: ffff92f12806c400 RSI: ffffdd6421a01a00 RDI: ffff92ed2f406e80 RBP: ffffb0978589fb40 R08: 0000000000000001 R09: ffffffffc0ee4748 R10: ffff92f12806c400 R11: 0000000000000001 R12: ffffdd6421a01a00 R13: ffff92f12806c400 R14: ffff92ed2f406e80 R15: ffffdd6421a01a20 FS: 00007f4170be0ac0(0000) GS:ffff92ed2fb40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000562818aaa000 CR3: 000000045745a002 CR4: 00000000003606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? drm_dbg+0x87/0x90 [drm] dc_stream_release+0x28/0x50 [amdgpu] amdgpu_dm_connector_mode_valid+0xb4/0x1f0 [amdgpu] drm_helper_probe_single_connector_modes+0x492/0x6b0 [drm_kms_helper] drm_mode_getconnector+0x457/0x490 [drm] ? drm_connector_property_set_ioctl+0x60/0x60 [drm] drm_ioctl_kernel+0xa9/0xf0 [drm] drm_ioctl+0x201/0x3a0 [drm] ? drm_connector_property_set_ioctl+0x60/0x60 [drm] amdgpu_drm_ioctl+0x49/0x80 [amdgpu] do_vfs_ioctl+0xa4/0x630 ? __sys_recvmsg+0x83/0xa0 ksys_ioctl+0x60/0x90 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x5b/0x160 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f417110809b Code: 0f 1e fa 48 8b 05 ed bd 0c 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d bd bd 0c 00 f7 d8 64 89 01 48 RSP: 002b:00007ffdd8d1c268 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 0000562818a8ebc0 RCX: 00007f417110809b RDX: 00007ffdd8d1c2a0 RSI: 00000000c05064a7 RDI: 0000000000000012 RBP: 00007ffdd8d1c2a0 R08: 0000562819012280 R09: 0000000000000007 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000c05064a7 R13: 0000000000000012 R14: 0000000000000012 R15: 00007ffdd8d1c2a0 Modules linked in: nfsv4 dns_resolver nfs lockd grace fscache fuse vfat fat amdgpu intel_rapl sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm irqbypass crct10dif_pclmul chash gpu_sched crc32_pclmul snd_hda_codec_realtek ghash_clmulni_intel amd_iommu_v2 iTCO_wdt iTCO_vendor_support ttm snd_hda_codec_generic snd_hda_codec_hdmi ledtrig_audio snd_hda_intel drm_kms_helper snd_hda_codec intel_cstate snd_hda_core drm snd_hwdep snd_seq snd_seq_device intel_uncore snd_pcm intel_rapl_perf snd_timer snd soundcore ioatdma pcspkr intel_wmi_thunderbolt mxm_wmi i2c_i801 lpc_ich pcc_cpufreq auth_rpcgss sunrpc igb crc32c_intel i2c_algo_bit dca wmi hid_cherry analog gameport joydev This patch is based on agd5f/drm-next-5.1-wip. This patch does not require all of that, but agd5f/drm-next-5.1-wip contains at least one more dc_sink counting fix that I could spot. 
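The rule the patch applies throughout can be condensed into a small sketch; dc_sink_retain()/dc_sink_release() are the existing helpers, and the connector code around them is shortened for illustration:

    /* take a reference whenever the pointer is copied ... */
    aconnector->dc_sink = sink;
    dc_sink_retain(aconnector->dc_sink);

    /* ... and drop it whenever the pointer is replaced or cleared */
    dc_sink_release(aconnector->dc_sink);
    aconnector->dc_sink = NULL;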
Signed-off-by: Mathias Fröhlich Reviewed-by: Leo Li Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 43 +++++++++++++++---- .../display/amdgpu_dm/amdgpu_dm_mst_types.c | 1 + drivers/gpu/drm/amd/display/dc/core/dc_link.c | 1 + 3 files changed, 37 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index c87fcda61b66..2e7f4d2ae73a 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -886,6 +886,7 @@ static void emulated_link_detect(struct dc_link *link) return; } + /* dc_sink_create returns a new reference */ link->local_sink = sink; edid_status = dm_helpers_read_local_edid( @@ -952,6 +953,8 @@ static int dm_resume(void *handle) if (aconnector->fake_enable && aconnector->dc_link->local_sink) aconnector->fake_enable = false; + if (aconnector->dc_sink) + dc_sink_release(aconnector->dc_sink); aconnector->dc_sink = NULL; amdgpu_dm_update_connector_after_detect(aconnector); mutex_unlock(&aconnector->hpd_lock); @@ -1061,6 +1064,8 @@ amdgpu_dm_update_connector_after_detect(struct amdgpu_dm_connector *aconnector) sink = aconnector->dc_link->local_sink; + if (sink) + dc_sink_retain(sink); /* * Edid mgmt connector gets first update only in mode_valid hook and then @@ -1085,21 +1090,24 @@ amdgpu_dm_update_connector_after_detect(struct amdgpu_dm_connector *aconnector) * to it anymore after disconnect, so on next crtc to connector * reshuffle by UMD we will get into unwanted dc_sink release */ - if (aconnector->dc_sink != aconnector->dc_em_sink) - dc_sink_release(aconnector->dc_sink); + dc_sink_release(aconnector->dc_sink); } aconnector->dc_sink = sink; + dc_sink_retain(aconnector->dc_sink); amdgpu_dm_update_freesync_caps(connector, aconnector->edid); } else { amdgpu_dm_update_freesync_caps(connector, NULL); - if (!aconnector->dc_sink) + if (!aconnector->dc_sink) { aconnector->dc_sink = aconnector->dc_em_sink; - else if (aconnector->dc_sink != aconnector->dc_em_sink) dc_sink_retain(aconnector->dc_sink); + } } mutex_unlock(&dev->mode_config.mutex); + + if (sink) + dc_sink_release(sink); return; } @@ -1107,8 +1115,10 @@ amdgpu_dm_update_connector_after_detect(struct amdgpu_dm_connector *aconnector) * TODO: temporary guard to look for proper fix * if this sink is MST sink, we should not do anything */ - if (sink && sink->sink_signal == SIGNAL_TYPE_DISPLAY_PORT_MST) + if (sink && sink->sink_signal == SIGNAL_TYPE_DISPLAY_PORT_MST) { + dc_sink_release(sink); return; + } if (aconnector->dc_sink == sink) { /* @@ -1117,6 +1127,8 @@ amdgpu_dm_update_connector_after_detect(struct amdgpu_dm_connector *aconnector) */ DRM_DEBUG_DRIVER("DCHPD: connector_id=%d: dc_sink didn't change.\n", aconnector->connector_id); + if (sink) + dc_sink_release(sink); return; } @@ -1138,6 +1150,7 @@ amdgpu_dm_update_connector_after_detect(struct amdgpu_dm_connector *aconnector) amdgpu_dm_update_freesync_caps(connector, NULL); aconnector->dc_sink = sink; + dc_sink_retain(aconnector->dc_sink); if (sink->dc_edid.length == 0) { aconnector->edid = NULL; drm_dp_cec_unset_edid(&aconnector->dm_dp_aux.aux); @@ -1158,11 +1171,15 @@ amdgpu_dm_update_connector_after_detect(struct amdgpu_dm_connector *aconnector) amdgpu_dm_update_freesync_caps(connector, NULL); drm_connector_update_edid_property(connector, NULL); aconnector->num_modes = 0; + dc_sink_release(aconnector->dc_sink); aconnector->dc_sink = NULL; aconnector->edid = NULL; } 
mutex_unlock(&dev->mode_config.mutex); + + if (sink) + dc_sink_release(sink); } static void handle_hpd_irq(void *param) @@ -2977,6 +2994,7 @@ create_stream_for_sink(struct amdgpu_dm_connector *aconnector, return stream; } else { sink = aconnector->dc_sink; + dc_sink_retain(sink); } stream = dc_create_stream_for_sink(sink); @@ -3042,8 +3060,7 @@ create_stream_for_sink(struct amdgpu_dm_connector *aconnector, update_stream_signal(stream, sink); finish: - if (sink && sink->sink_signal == SIGNAL_TYPE_VIRTUAL && aconnector->base.force != DRM_FORCE_ON) - dc_sink_release(sink); + dc_sink_release(sink); return stream; } @@ -3301,6 +3318,14 @@ static void amdgpu_dm_connector_destroy(struct drm_connector *connector) dm->backlight_dev = NULL; } #endif + + if (aconnector->dc_em_sink) + dc_sink_release(aconnector->dc_em_sink); + aconnector->dc_em_sink = NULL; + if (aconnector->dc_sink) + dc_sink_release(aconnector->dc_sink); + aconnector->dc_sink = NULL; + drm_dp_cec_unregister_connector(&aconnector->dm_dp_aux.aux); drm_connector_unregister(connector); drm_connector_cleanup(connector); @@ -3398,10 +3423,12 @@ static void create_eml_sink(struct amdgpu_dm_connector *aconnector) (edid->extensions + 1) * EDID_LENGTH, &init_params); - if (aconnector->base.force == DRM_FORCE_ON) + if (aconnector->base.force == DRM_FORCE_ON) { aconnector->dc_sink = aconnector->dc_link->local_sink ? aconnector->dc_link->local_sink : aconnector->dc_em_sink; + dc_sink_retain(aconnector->dc_sink); + } } static void handle_edid_mgmt(struct amdgpu_dm_connector *aconnector) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c index f51d52eb52e6..c4ea3a91f17a 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c @@ -191,6 +191,7 @@ static int dm_dp_mst_get_modes(struct drm_connector *connector) &init_params); dc_sink->priv = aconnector; + /* dc_link_add_remote_sink returns a new reference */ aconnector->dc_sink = dc_sink; if (aconnector->dc_sink) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link.c b/drivers/gpu/drm/amd/display/dc/core/dc_link.c index 7f5a947ad31d..792114e737ce 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_link.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_link.c @@ -794,6 +794,7 @@ bool dc_link_detect(struct dc_link *link, enum dc_detect_reason reason) sink->link->dongle_max_pix_clk = sink_caps.max_hdmi_pixel_clock; sink->converter_disable_audio = converter_disable_audio; + /* dc_sink_create returns a new reference */ link->local_sink = sink; edid_status = dm_helpers_read_local_edid( From 293b9160839fb6965fd0f58e6f66217193174538 Mon Sep 17 00:00:00 2001 From: Anthony Koo Date: Thu, 7 Feb 2019 20:38:34 -0500 Subject: [PATCH 261/855] drm/amd/display: Fix issue with link_active state not correct for MST [Why] For MST, link not disabled until all streams disabled [How] Add check for stream_count before setting link_active = false for MST Signed-off-by: Anthony Koo Reviewed-by: Wenjing Liu Acked-by: Leo Li Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc_link.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link.c b/drivers/gpu/drm/amd/display/dc/core/dc_link.c index 792114e737ce..4eba3c4800b6 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_link.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_link.c @@ -2038,6 +2038,9 @@ static enum dc_status 
enable_link( break; } + if (status == DC_OK) + pipe_ctx->stream->link->link_status.link_active = true; + return status; } @@ -2061,6 +2064,14 @@ static void disable_link(struct dc_link *link, enum signal_type signal) dp_disable_link_phy_mst(link, signal); } else link->link_enc->funcs->disable_output(link->link_enc, signal); + + if (signal == SIGNAL_TYPE_DISPLAY_PORT_MST) { + /* MST disable link only when no stream use the link */ + if (link->mst_stream_alloc_table.stream_count <= 0) + link->link_status.link_active = false; + } else { + link->link_status.link_active = false; + } } static bool dp_active_dongle_validate_timing( @@ -2624,8 +2635,6 @@ void core_link_enable_stream( } } - stream->link->link_status.link_active = true; - core_dc->hwss.enable_audio_stream(pipe_ctx); /* turn off otg test pattern if enable */ @@ -2660,8 +2669,6 @@ void core_link_disable_stream(struct pipe_ctx *pipe_ctx, int option) core_dc->hwss.disable_stream(pipe_ctx, option); disable_link(pipe_ctx->stream->link, pipe_ctx->stream->signal); - - pipe_ctx->stream->link->link_status.link_active = false; } void core_link_set_avmute(struct pipe_ctx *pipe_ctx, bool enable) From 5307db85c745c46295f3ee2159667e1606fba268 Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Mon, 25 Feb 2019 14:54:20 +0800 Subject: [PATCH 262/855] drm/amd/powerplay: use REG32_PCIE wrapper instead for powerplay This patch uses REG32_PCIE wrapper instead of writting pci_index2 and reading pci_data2 for powerplay. This sequence should be protected by pcie_idx_lock. Signed-off-by: Huang Rui Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/smumgr/smu9_smumgr.c | 6 ++---- drivers/gpu/drm/amd/powerplay/smumgr/vega20_smumgr.c | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/powerplay/smumgr/smu9_smumgr.c b/drivers/gpu/drm/amd/powerplay/smumgr/smu9_smumgr.c index 079fc8e8f709..742b3dc1f6cb 100644 --- a/drivers/gpu/drm/amd/powerplay/smumgr/smu9_smumgr.c +++ b/drivers/gpu/drm/amd/powerplay/smumgr/smu9_smumgr.c @@ -40,10 +40,8 @@ bool smu9_is_smc_ram_running(struct pp_hwmgr *hwmgr) struct amdgpu_device *adev = hwmgr->adev; uint32_t mp1_fw_flags; - WREG32_SOC15(NBIF, 0, mmPCIE_INDEX2, - (MP1_Public | (smnMP1_FIRMWARE_FLAGS & 0xffffffff))); - - mp1_fw_flags = RREG32_SOC15(NBIF, 0, mmPCIE_DATA2); + mp1_fw_flags = RREG32_PCIE(MP1_Public | + (smnMP1_FIRMWARE_FLAGS & 0xffffffff)); if (mp1_fw_flags & MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED_MASK) return true; diff --git a/drivers/gpu/drm/amd/powerplay/smumgr/vega20_smumgr.c b/drivers/gpu/drm/amd/powerplay/smumgr/vega20_smumgr.c index b7ff7d4d6f44..ba00744c3413 100644 --- a/drivers/gpu/drm/amd/powerplay/smumgr/vega20_smumgr.c +++ b/drivers/gpu/drm/amd/powerplay/smumgr/vega20_smumgr.c @@ -49,10 +49,8 @@ static bool vega20_is_smc_ram_running(struct pp_hwmgr *hwmgr) struct amdgpu_device *adev = hwmgr->adev; uint32_t mp1_fw_flags; - WREG32_SOC15(NBIF, 0, mmPCIE_INDEX2, - (MP1_Public | (smnMP1_FIRMWARE_FLAGS & 0xffffffff))); - - mp1_fw_flags = RREG32_SOC15(NBIF, 0, mmPCIE_DATA2); + mp1_fw_flags = RREG32_PCIE(MP1_Public | + (smnMP1_FIRMWARE_FLAGS & 0xffffffff)); if ((mp1_fw_flags & MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED_MASK) >> MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED__SHIFT) From 76f8f6992a0cf7e4a8401eadaa67ea4a06e54568 Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Mon, 25 Feb 2019 15:02:44 +0800 Subject: [PATCH 263/855] drm/amdgpu: use REG32_PCIE wrapper instead for psp This patch uses REG32_PCIE wrapper instead of writting pci_index2 and 
reading pci_data2 for psp. This sequence should be protected by pcie_idx_lock. Signed-off-by: Huang Rui Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/psp_v3_1.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v3_1.c b/drivers/gpu/drm/amd/amdgpu/psp_v3_1.c index c63de945c021..0487e3a4e9e7 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v3_1.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v3_1.c @@ -500,9 +500,7 @@ static bool psp_v3_1_smu_reload_quirk(struct psp_context *psp) struct amdgpu_device *adev = psp->adev; uint32_t reg; - reg = smnMP1_FIRMWARE_FLAGS | 0x03b00000; - WREG32_SOC15(NBIO, 0, mmPCIE_INDEX2, reg); - reg = RREG32_SOC15(NBIO, 0, mmPCIE_DATA2); + reg = RREG32_PCIE(smnMP1_FIRMWARE_FLAGS | 0x03b00000); return (reg & MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED_MASK) ? true : false; } From cac734c2dbd2514f14c8c6a17caba1990d83bf1d Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Fri, 22 Feb 2019 12:36:49 +0800 Subject: [PATCH 264/855] drm/amdkfd: use init_mqd function to allocate object for hid_mqd (CI) if use the legacy method to allocate object, when mqd_hiq need to run uninit code, it will be cause WARNING call trace. eg: (s3 suspend test) [ 34.918944] Call Trace: [ 34.918948] [] dump_stack+0x19/0x1b [ 34.918950] [] __warn+0xd8/0x100 [ 34.918951] [] warn_slowpath_null+0x1d/0x20 [ 34.918991] [] uninit_mqd_hiq_sdma+0x4e/0x50 [amdgpu] [ 34.919028] [] uninitialize+0x37/0xe0 [amdgpu] [ 34.919064] [] kernel_queue_uninit+0x16/0x30 [amdgpu] [ 34.919086] [] pm_uninit+0x12/0x20 [amdgpu] [ 34.919107] [] stop_nocpsch+0x15/0x20 [amdgpu] [ 34.919129] [] kgd2kfd_suspend.part.4+0x2e/0x50 [amdgpu] [ 34.919150] [] kgd2kfd_suspend+0x17/0x20 [amdgpu] [ 34.919171] [] amdgpu_amdkfd_suspend+0x1a/0x20 [amdgpu] [ 34.919187] [] amdgpu_device_suspend+0x88/0x3a0 [amdgpu] [ 34.919189] [] ? enqueue_entity+0x2ef/0xbe0 [ 34.919205] [] amdgpu_pmops_suspend+0x20/0x30 [amdgpu] [ 34.919207] [] pci_pm_suspend+0x6f/0x150 [ 34.919208] [] ? pci_pm_freeze+0xf0/0xf0 [ 34.919210] [] dpm_run_callback+0x46/0x90 [ 34.919212] [] __device_suspend+0xfb/0x2a0 [ 34.919213] [] async_suspend+0x1f/0xa0 [ 34.919214] [] async_run_entry_fn+0x3f/0x130 [ 34.919216] [] process_one_work+0x17f/0x440 [ 34.919217] [] worker_thread+0x126/0x3c0 [ 34.919218] [] ? manage_workers.isra.25+0x2a0/0x2a0 [ 34.919220] [] kthread+0xd1/0xe0 [ 34.919221] [] ? insert_kthread_work+0x40/0x40 [ 34.919222] [] ret_from_fork_nospec_begin+0x7/0x21 [ 34.919224] [] ? 
insert_kthread_work+0x40/0x40 [ 34.919224] ---[ end trace 38cd9f65c963adad ]--- Signed-off-by: Kevin Wang Reviewed-by: Oak Zeng Signed-off-by: Alex Deucher --- .../gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c | 52 +------------------ 1 file changed, 1 insertion(+), 51 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c index 47243165a082..ae90a99909ef 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c @@ -323,57 +323,7 @@ static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, struct queue_properties *q) { - uint64_t addr; - struct cik_mqd *m; - int retval; - - retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct cik_mqd), - mqd_mem_obj); - - if (retval != 0) - return -ENOMEM; - - m = (struct cik_mqd *) (*mqd_mem_obj)->cpu_ptr; - addr = (*mqd_mem_obj)->gpu_addr; - - memset(m, 0, ALIGN(sizeof(struct cik_mqd), 256)); - - m->header = 0xC0310800; - m->compute_pipelinestat_enable = 1; - m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF; - m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF; - m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF; - m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF; - - m->cp_hqd_persistent_state = DEFAULT_CP_HQD_PERSISTENT_STATE | - PRELOAD_REQ; - m->cp_hqd_quantum = QUANTUM_EN | QUANTUM_SCALE_1MS | - QUANTUM_DURATION(10); - - m->cp_mqd_control = MQD_CONTROL_PRIV_STATE_EN; - m->cp_mqd_base_addr_lo = lower_32_bits(addr); - m->cp_mqd_base_addr_hi = upper_32_bits(addr); - - m->cp_hqd_ib_control = DEFAULT_MIN_IB_AVAIL_SIZE; - - /* - * Pipe Priority - * Identifies the pipe relative priority when this queue is connected - * to the pipeline. The pipe priority is against the GFX pipe and HP3D. - * In KFD we are using a fixed pipe priority set to CS_MEDIUM. - * 0 = CS_LOW (typically below GFX) - * 1 = CS_MEDIUM (typically between HP3D and GFX - * 2 = CS_HIGH (typically above HP3D) - */ - m->cp_hqd_pipe_priority = 1; - m->cp_hqd_queue_priority = 15; - - *mqd = m; - if (gart_addr) - *gart_addr = addr; - retval = mm->update_mqd(mm, m, q); - - return retval; + return init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); } static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, From fe1331a2eca05239df3ebd510f3f21fdd08e1ffa Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Wed, 20 Feb 2019 16:12:03 +0800 Subject: [PATCH 265/855] drm/amd/powerplay: drop redundant soft min/max settings As these are already set during apply_clocks_adjust_rules. 
Signed-off-by: Evan Quan Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- .../drm/amd/powerplay/hwmgr/vega20_hwmgr.c | 24 ------------------- 1 file changed, 24 deletions(-) diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c index fae95d9ebd7a..8ca49c3cf6db 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c @@ -2314,32 +2314,8 @@ static int vega20_force_dpm_lowest(struct pp_hwmgr *hwmgr) static int vega20_unforce_dpm_levels(struct pp_hwmgr *hwmgr) { - struct vega20_hwmgr *data = - (struct vega20_hwmgr *)(hwmgr->backend); - uint32_t soft_min_level, soft_max_level; int ret = 0; - soft_min_level = vega20_find_lowest_dpm_level(&(data->dpm_table.gfx_table)); - soft_max_level = vega20_find_highest_dpm_level(&(data->dpm_table.gfx_table)); - data->dpm_table.gfx_table.dpm_state.soft_min_level = - data->dpm_table.gfx_table.dpm_levels[soft_min_level].value; - data->dpm_table.gfx_table.dpm_state.soft_max_level = - data->dpm_table.gfx_table.dpm_levels[soft_max_level].value; - - soft_min_level = vega20_find_lowest_dpm_level(&(data->dpm_table.mem_table)); - soft_max_level = vega20_find_highest_dpm_level(&(data->dpm_table.mem_table)); - data->dpm_table.mem_table.dpm_state.soft_min_level = - data->dpm_table.mem_table.dpm_levels[soft_min_level].value; - data->dpm_table.mem_table.dpm_state.soft_max_level = - data->dpm_table.mem_table.dpm_levels[soft_max_level].value; - - soft_min_level = vega20_find_lowest_dpm_level(&(data->dpm_table.soc_table)); - soft_max_level = vega20_find_highest_dpm_level(&(data->dpm_table.soc_table)); - data->dpm_table.soc_table.dpm_state.soft_min_level = - data->dpm_table.soc_table.dpm_levels[soft_min_level].value; - data->dpm_table.soc_table.dpm_state.soft_max_level = - data->dpm_table.soc_table.dpm_levels[soft_max_level].value; - ret = vega20_upload_dpm_min_level(hwmgr, 0xFFFFFFFF); PP_ASSERT_WITH_CODE(!ret, "Failed to upload DPM Bootup Levels!", From d19e923337d9a517c359bccd52d2e3af7e889064 Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Wed, 20 Feb 2019 16:21:10 +0800 Subject: [PATCH 266/855] drm/amd/powerplay: need to reapply the dpm level settings As these settings got reset during above phm_apply_clock_adjust_rules. Signed-off-by: Evan Quan Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/hwmgr/pp_psm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/pp_psm.c b/drivers/gpu/drm/amd/powerplay/hwmgr/pp_psm.c index ce177d7f04cb..6bf48934fdc4 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/pp_psm.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/pp_psm.c @@ -277,8 +277,7 @@ int psm_adjust_power_state_dynamic(struct pp_hwmgr *hwmgr, bool skip_display_set if (!skip_display_settings) phm_notify_smc_display_config_after_ps_adjustment(hwmgr); - if ((hwmgr->request_dpm_level != hwmgr->dpm_level) && - !phm_force_dpm_levels(hwmgr, hwmgr->request_dpm_level)) + if (!phm_force_dpm_levels(hwmgr, hwmgr->request_dpm_level)) hwmgr->dpm_level = hwmgr->request_dpm_level; if (hwmgr->dpm_level != AMD_DPM_FORCED_LEVEL_MANUAL) { From 971e7ac1ab6201484aef14cb1de62375fa317d41 Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Wed, 20 Feb 2019 16:08:41 +0800 Subject: [PATCH 267/855] drm/amd/powerplay: force FCLK to highest also for 5K or higher displays This can fix possible screen freeze on high resolution displays. 
Signed-off-by: Evan Quan Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- .../drm/amd/powerplay/hwmgr/vega20_hwmgr.c | 38 ++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c index 8ca49c3cf6db..17ea5c699240 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c @@ -3332,6 +3332,31 @@ static int vega20_set_uclk_to_highest_dpm_level(struct pp_hwmgr *hwmgr, return ret; } +static int vega20_set_fclk_to_highest_dpm_level(struct pp_hwmgr *hwmgr) +{ + struct vega20_hwmgr *data = (struct vega20_hwmgr *)(hwmgr->backend); + struct vega20_single_dpm_table *dpm_table = &(data->dpm_table.fclk_table); + int ret = 0; + + if (data->smu_features[GNLD_DPM_FCLK].enabled) { + PP_ASSERT_WITH_CODE(dpm_table->count > 0, + "[SetFclkToHightestDpmLevel] Dpm table has no entry!", + return -EINVAL); + PP_ASSERT_WITH_CODE(dpm_table->count <= NUM_FCLK_DPM_LEVELS, + "[SetFclkToHightestDpmLevel] Dpm table has too many entries!", + return -EINVAL); + + dpm_table->dpm_state.soft_min_level = dpm_table->dpm_levels[dpm_table->count - 1].value; + PP_ASSERT_WITH_CODE(!(ret = smum_send_msg_to_smc_with_parameter(hwmgr, + PPSMC_MSG_SetSoftMinByFreq, + (PPCLK_FCLK << 16 ) | dpm_table->dpm_state.soft_min_level)), + "[SetFclkToHightestDpmLevel] Set soft min fclk failed!", + return ret); + } + + return ret; +} + static int vega20_pre_display_configuration_changed_task(struct pp_hwmgr *hwmgr) { struct vega20_hwmgr *data = (struct vega20_hwmgr *)(hwmgr->backend); @@ -3342,8 +3367,10 @@ static int vega20_pre_display_configuration_changed_task(struct pp_hwmgr *hwmgr) ret = vega20_set_uclk_to_highest_dpm_level(hwmgr, &data->dpm_table.mem_table); + if (ret) + return ret; - return ret; + return vega20_set_fclk_to_highest_dpm_level(hwmgr); } static int vega20_display_configuration_changed_task(struct pp_hwmgr *hwmgr) @@ -3502,6 +3529,15 @@ static int vega20_apply_clocks_adjust_rules(struct pp_hwmgr *hwmgr) if (hwmgr->display_config->nb_pstate_switch_disable) dpm_table->dpm_state.hard_min_level = dpm_table->dpm_levels[dpm_table->count - 1].value; + /* fclk */ + dpm_table = &(data->dpm_table.fclk_table); + dpm_table->dpm_state.soft_min_level = dpm_table->dpm_levels[0].value; + dpm_table->dpm_state.soft_max_level = dpm_table->dpm_levels[dpm_table->count - 1].value; + dpm_table->dpm_state.hard_min_level = dpm_table->dpm_levels[0].value; + dpm_table->dpm_state.hard_max_level = dpm_table->dpm_levels[dpm_table->count - 1].value; + if (hwmgr->display_config->nb_pstate_switch_disable) + dpm_table->dpm_state.soft_min_level = dpm_table->dpm_levels[dpm_table->count - 1].value; + /* vclk */ dpm_table = &(data->dpm_table.vclk_table); dpm_table->dpm_state.soft_min_level = dpm_table->dpm_levels[0].value; From 3a301bc5d2941ef1e04acb390c0ef9e06b802477 Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Wed, 20 Feb 2019 16:40:47 +0800 Subject: [PATCH 268/855] drm/amd/powerplay: overwrite ODSettingsMin for UCLK_FMAX feature For UCLK_FMAX OD feature, SMU overwrites the highest UCLK DPM level freq. Therefore it can only take values that are greater than the second highest DPM level freq. 
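As a worked example of the new lower bound (frequencies invented for illustration): with a memory DPM table of 350/800/1000/1200 MHz, the user-visible UCLK_FMAX range now starts at 1000 MHz, i.e. the second highest level, since the SMU only ever overwrites the 1200 MHz top entry:

    /* illustrative values only */
    od_settings_min[OD8_SETTING_UCLK_FMAX] =
        mem_table.dpm_levels[mem_table.count - 2].value;    /* 1000 MHz */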
Signed-off-by: Evan Quan Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- .../drm/amd/powerplay/hwmgr/vega20_hwmgr.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c index 17ea5c699240..ea79417958ec 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c @@ -979,6 +979,8 @@ static int vega20_od8_set_feature_capabilities( } if (data->smu_features[GNLD_DPM_UCLK].enabled) { + pptable_information->od_settings_min[OD8_SETTING_UCLK_FMAX] = + data->dpm_table.mem_table.dpm_levels[data->dpm_table.mem_table.count - 2].value; if (pptable_information->od_feature_capabilities[ATOM_VEGA20_ODFEATURE_UCLK_MAX] && pptable_information->od_settings_min[OD8_SETTING_UCLK_FMAX] > 0 && pptable_information->od_settings_max[OD8_SETTING_UCLK_FMAX] > 0 && @@ -2771,7 +2773,6 @@ static int vega20_odn_edit_dpm_table(struct pp_hwmgr *hwmgr, data->od8_settings.od8_settings_array; OverDriveTable_t *od_table = &(data->smc_state_table.overdrive_table); - struct pp_clock_levels_with_latency clocks; int32_t input_index, input_clk, input_vol, i; int od8_id; int ret; @@ -2830,11 +2831,6 @@ static int vega20_odn_edit_dpm_table(struct pp_hwmgr *hwmgr, return -EOPNOTSUPP; } - ret = vega20_get_memclocks(hwmgr, &clocks); - PP_ASSERT_WITH_CODE(!ret, - "Attempt to get memory clk levels failed!", - return ret); - for (i = 0; i < size; i += 2) { if (i + 2 > size) { pr_info("invalid number of input parameters %d\n", @@ -2851,11 +2847,11 @@ static int vega20_odn_edit_dpm_table(struct pp_hwmgr *hwmgr, return -EINVAL; } - if (input_clk < clocks.data[0].clocks_in_khz / 1000 || + if (input_clk < od8_settings[OD8_SETTING_UCLK_FMAX].min_value || input_clk > od8_settings[OD8_SETTING_UCLK_FMAX].max_value) { pr_info("clock freq %d is not within allowed range [%d - %d]\n", input_clk, - clocks.data[0].clocks_in_khz / 1000, + od8_settings[OD8_SETTING_UCLK_FMAX].min_value, od8_settings[OD8_SETTING_UCLK_FMAX].max_value); return -EINVAL; } @@ -3264,13 +3260,8 @@ static int vega20_print_clock_levels(struct pp_hwmgr *hwmgr, } if (od8_settings[OD8_SETTING_UCLK_FMAX].feature_id) { - ret = vega20_get_memclocks(hwmgr, &clocks); - PP_ASSERT_WITH_CODE(!ret, - "Fail to get memory clk levels!", - return ret); - size += sprintf(buf + size, "MCLK: %7uMhz %10uMhz\n", - clocks.data[0].clocks_in_khz / 1000, + od8_settings[OD8_SETTING_UCLK_FMAX].min_value, od8_settings[OD8_SETTING_UCLK_FMAX].max_value); } From 2e41a8747acf0e85667b81038e7f28e6e41a41d7 Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Wed, 20 Feb 2019 17:13:16 +0800 Subject: [PATCH 269/855] drm/amd/powerplay: support retrieving clock information from other sysplls There will be some needs to retrieve clock information from other sysplls also except default 0. 
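A usage sketch of the widened interface, built from identifiers that appear in this and the following patch (error handling trimmed):

    uint32_t freq;

    /* previous behaviour: every query implicitly targeted SYSPLL0 */
    if (!pp_atomfwctrl_get_clk_information_by_clkid(hwmgr,
            SMU11_SYSPLL0_SOCCLK_ID, SMU11_SYSPLL0_ID, &freq))
        boot_values->ulSocClk = freq;

    /* now possible: FCLK, which hangs off SYSPLL1_2 */
    if (!pp_atomfwctrl_get_clk_information_by_clkid(hwmgr,
            SMU11_SYSPLL1_0_FCLK_ID, SMU11_SYSPLL1_2_ID, &freq))
        boot_values->ulFClk = freq;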
Signed-off-by: Evan Quan Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- .../drm/amd/powerplay/hwmgr/ppatomfwctrl.c | 27 ++++++++++--------- .../drm/amd/powerplay/hwmgr/ppatomfwctrl.h | 3 ++- .../drm/amd/powerplay/hwmgr/vega10_hwmgr.c | 4 +-- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/ppatomfwctrl.c b/drivers/gpu/drm/amd/powerplay/hwmgr/ppatomfwctrl.c index 4588bddf8b33..a28192bfb035 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/ppatomfwctrl.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/ppatomfwctrl.c @@ -489,15 +489,16 @@ int pp_atomfwctrl_get_gpio_information(struct pp_hwmgr *hwmgr, } int pp_atomfwctrl_get_clk_information_by_clkid(struct pp_hwmgr *hwmgr, - uint8_t id, uint32_t *frequency) + uint8_t clk_id, uint8_t syspll_id, + uint32_t *frequency) { struct amdgpu_device *adev = hwmgr->adev; struct atom_get_smu_clock_info_parameters_v3_1 parameters; struct atom_get_smu_clock_info_output_parameters_v3_1 *output; uint32_t ix; - parameters.clk_id = id; - parameters.syspll_id = 0; + parameters.clk_id = clk_id; + parameters.syspll_id = syspll_id; parameters.command = GET_SMU_CLOCK_INFO_V3_1_GET_CLOCK_FREQ; parameters.dfsdid = 0; @@ -530,19 +531,19 @@ static void pp_atomfwctrl_copy_vbios_bootup_values_3_2(struct pp_hwmgr *hwmgr, boot_values->ulSocClk = 0; boot_values->ulDCEFClk = 0; - if (!pp_atomfwctrl_get_clk_information_by_clkid(hwmgr, SMU11_SYSPLL0_SOCCLK_ID, &frequency)) + if (!pp_atomfwctrl_get_clk_information_by_clkid(hwmgr, SMU11_SYSPLL0_SOCCLK_ID, SMU11_SYSPLL0_ID, &frequency)) boot_values->ulSocClk = frequency; - if (!pp_atomfwctrl_get_clk_information_by_clkid(hwmgr, SMU11_SYSPLL0_DCEFCLK_ID, &frequency)) + if (!pp_atomfwctrl_get_clk_information_by_clkid(hwmgr, SMU11_SYSPLL0_DCEFCLK_ID, SMU11_SYSPLL0_ID, &frequency)) boot_values->ulDCEFClk = frequency; - if (!pp_atomfwctrl_get_clk_information_by_clkid(hwmgr, SMU11_SYSPLL0_ECLK_ID, &frequency)) + if (!pp_atomfwctrl_get_clk_information_by_clkid(hwmgr, SMU11_SYSPLL0_ECLK_ID, SMU11_SYSPLL0_ID, &frequency)) boot_values->ulEClk = frequency; - if (!pp_atomfwctrl_get_clk_information_by_clkid(hwmgr, SMU11_SYSPLL0_VCLK_ID, &frequency)) + if (!pp_atomfwctrl_get_clk_information_by_clkid(hwmgr, SMU11_SYSPLL0_VCLK_ID, SMU11_SYSPLL0_ID, &frequency)) boot_values->ulVClk = frequency; - if (!pp_atomfwctrl_get_clk_information_by_clkid(hwmgr, SMU11_SYSPLL0_DCLK_ID, &frequency)) + if (!pp_atomfwctrl_get_clk_information_by_clkid(hwmgr, SMU11_SYSPLL0_DCLK_ID, SMU11_SYSPLL0_ID, &frequency)) boot_values->ulDClk = frequency; } @@ -563,19 +564,19 @@ static void pp_atomfwctrl_copy_vbios_bootup_values_3_1(struct pp_hwmgr *hwmgr, boot_values->ulSocClk = 0; boot_values->ulDCEFClk = 0; - if (!pp_atomfwctrl_get_clk_information_by_clkid(hwmgr, SMU9_SYSPLL0_SOCCLK_ID, &frequency)) + if (!pp_atomfwctrl_get_clk_information_by_clkid(hwmgr, SMU9_SYSPLL0_SOCCLK_ID, 0, &frequency)) boot_values->ulSocClk = frequency; - if (!pp_atomfwctrl_get_clk_information_by_clkid(hwmgr, SMU9_SYSPLL0_DCEFCLK_ID, &frequency)) + if (!pp_atomfwctrl_get_clk_information_by_clkid(hwmgr, SMU9_SYSPLL0_DCEFCLK_ID, 0, &frequency)) boot_values->ulDCEFClk = frequency; - if (!pp_atomfwctrl_get_clk_information_by_clkid(hwmgr, SMU9_SYSPLL0_ECLK_ID, &frequency)) + if (!pp_atomfwctrl_get_clk_information_by_clkid(hwmgr, SMU9_SYSPLL0_ECLK_ID, 0, &frequency)) boot_values->ulEClk = frequency; - if (!pp_atomfwctrl_get_clk_information_by_clkid(hwmgr, SMU9_SYSPLL0_VCLK_ID, &frequency)) + if (!pp_atomfwctrl_get_clk_information_by_clkid(hwmgr, 
SMU9_SYSPLL0_VCLK_ID, 0, &frequency)) boot_values->ulVClk = frequency; - if (!pp_atomfwctrl_get_clk_information_by_clkid(hwmgr, SMU9_SYSPLL0_DCLK_ID, &frequency)) + if (!pp_atomfwctrl_get_clk_information_by_clkid(hwmgr, SMU9_SYSPLL0_DCLK_ID, 0, &frequency)) boot_values->ulDClk = frequency; } diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/ppatomfwctrl.h b/drivers/gpu/drm/amd/powerplay/hwmgr/ppatomfwctrl.h index fe9e8ceef50e..9bafd00324a9 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/ppatomfwctrl.h +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/ppatomfwctrl.h @@ -236,7 +236,8 @@ int pp_atomfwctrl_get_vbios_bootup_values(struct pp_hwmgr *hwmgr, int pp_atomfwctrl_get_smc_dpm_information(struct pp_hwmgr *hwmgr, struct pp_atomfwctrl_smc_dpm_parameters *param); int pp_atomfwctrl_get_clk_information_by_clkid(struct pp_hwmgr *hwmgr, - uint8_t id, uint32_t *frequency); + uint8_t clk_id, uint8_t syspll_id, + uint32_t *frequency); #endif diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c index 6d8e9609e900..5c4f701939ea 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c @@ -2575,10 +2575,10 @@ static int vega10_init_smc_table(struct pp_hwmgr *hwmgr) data->vbios_boot_state.gfx_clock = boot_up_values.ulGfxClk; data->vbios_boot_state.mem_clock = boot_up_values.ulUClk; pp_atomfwctrl_get_clk_information_by_clkid(hwmgr, - SMU9_SYSPLL0_SOCCLK_ID, &boot_up_values.ulSocClk); + SMU9_SYSPLL0_SOCCLK_ID, 0, &boot_up_values.ulSocClk); pp_atomfwctrl_get_clk_information_by_clkid(hwmgr, - SMU9_SYSPLL0_DCEFCLK_ID, &boot_up_values.ulDCEFClk); + SMU9_SYSPLL0_DCEFCLK_ID, 0, &boot_up_values.ulDCEFClk); data->vbios_boot_state.soc_clock = boot_up_values.ulSocClk; data->vbios_boot_state.dcef_clock = boot_up_values.ulDCEFClk; From f5e79735cab448981e245a41ee6cbebf0e334f61 Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Wed, 20 Feb 2019 17:20:40 +0800 Subject: [PATCH 270/855] drm/amd/powerplay: set default fclk for no fclk dpm support case Set the default fclk as what we got from VBIOS. 
Signed-off-by: Evan Quan Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/hwmgr/ppatomfwctrl.c | 3 +++ drivers/gpu/drm/amd/powerplay/hwmgr/ppatomfwctrl.h | 1 + drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c | 7 +++++-- drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.h | 1 + 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/ppatomfwctrl.c b/drivers/gpu/drm/amd/powerplay/hwmgr/ppatomfwctrl.c index a28192bfb035..615cf2c09e54 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/ppatomfwctrl.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/ppatomfwctrl.c @@ -545,6 +545,9 @@ static void pp_atomfwctrl_copy_vbios_bootup_values_3_2(struct pp_hwmgr *hwmgr, if (!pp_atomfwctrl_get_clk_information_by_clkid(hwmgr, SMU11_SYSPLL0_DCLK_ID, SMU11_SYSPLL0_ID, &frequency)) boot_values->ulDClk = frequency; + + if (!pp_atomfwctrl_get_clk_information_by_clkid(hwmgr, SMU11_SYSPLL1_0_FCLK_ID, SMU11_SYSPLL1_2_ID, &frequency)) + boot_values->ulFClk = frequency; } static void pp_atomfwctrl_copy_vbios_bootup_values_3_1(struct pp_hwmgr *hwmgr, diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/ppatomfwctrl.h b/drivers/gpu/drm/amd/powerplay/hwmgr/ppatomfwctrl.h index 9bafd00324a9..b7e2651b570b 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/ppatomfwctrl.h +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/ppatomfwctrl.h @@ -139,6 +139,7 @@ struct pp_atomfwctrl_bios_boot_up_values { uint32_t ulEClk; uint32_t ulVClk; uint32_t ulDClk; + uint32_t ulFClk; uint16_t usVddc; uint16_t usVddci; uint16_t usMvddc; diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c index ea79417958ec..d7350bbadafa 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c @@ -711,8 +711,10 @@ static int vega20_setup_default_dpm_tables(struct pp_hwmgr *hwmgr) PP_ASSERT_WITH_CODE(!ret, "[SetupDefaultDpmTable] failed to get fclk dpm levels!", return ret); - } else - dpm_table->count = 0; + } else { + dpm_table->count = 1; + dpm_table->dpm_levels[0].value = data->vbios_boot_state.fclock / 100; + } vega20_init_dpm_state(&(dpm_table->dpm_state)); /* save a copy of the default DPM table */ @@ -754,6 +756,7 @@ static int vega20_init_smc_table(struct pp_hwmgr *hwmgr) data->vbios_boot_state.eclock = boot_up_values.ulEClk; data->vbios_boot_state.vclock = boot_up_values.ulVClk; data->vbios_boot_state.dclock = boot_up_values.ulDClk; + data->vbios_boot_state.fclock = boot_up_values.ulFClk; data->vbios_boot_state.uc_cooling_id = boot_up_values.ucCoolingID; smum_send_msg_to_smc_with_parameter(hwmgr, diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.h b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.h index 37f5f5e657da..4a4cad35dc8f 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.h +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.h @@ -219,6 +219,7 @@ struct vega20_vbios_boot_state { uint32_t eclock; uint32_t dclock; uint32_t vclock; + uint32_t fclock; }; #define DPMTABLE_OD_UPDATE_SCLK 0x00000001 From 65543b28893a81c7a7987d5b25387ff03243dd81 Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Wed, 20 Feb 2019 17:47:35 +0800 Subject: [PATCH 271/855] drm/amd/powerplay: honor the OD settings Set the soft/hard max settings as max possible to not violate the OD settings. 
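In other words, the defaults move from "cap at the table's top stock level" to an effectively unbounded sentinel, so an overdrive maximum raised above the stock top frequency is no longer clamped when the adjust rules run again. A sketch of the new defaults, mirroring the hunks below:

    #define VG20_CLOCK_MAX_DEFAULT    0xFFFF    /* treated as "no explicit cap" */

    dpm_state->soft_max_level = VG20_CLOCK_MAX_DEFAULT;
    dpm_state->hard_max_level = VG20_CLOCK_MAX_DEFAULT;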
Signed-off-by: Evan Quan Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- .../drm/amd/powerplay/hwmgr/vega20_hwmgr.c | 32 +++++++++---------- .../drm/amd/powerplay/hwmgr/vega20_hwmgr.h | 2 ++ 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c index d7350bbadafa..90ce402a1b2c 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c @@ -463,9 +463,9 @@ static int vega20_setup_asic_task(struct pp_hwmgr *hwmgr) static void vega20_init_dpm_state(struct vega20_dpm_state *dpm_state) { dpm_state->soft_min_level = 0x0; - dpm_state->soft_max_level = 0xffff; + dpm_state->soft_max_level = VG20_CLOCK_MAX_DEFAULT; dpm_state->hard_min_level = 0x0; - dpm_state->hard_max_level = 0xffff; + dpm_state->hard_max_level = VG20_CLOCK_MAX_DEFAULT; } static int vega20_get_number_of_dpm_level(struct pp_hwmgr *hwmgr, @@ -3458,9 +3458,9 @@ static int vega20_apply_clocks_adjust_rules(struct pp_hwmgr *hwmgr) /* gfxclk */ dpm_table = &(data->dpm_table.gfx_table); dpm_table->dpm_state.soft_min_level = dpm_table->dpm_levels[0].value; - dpm_table->dpm_state.soft_max_level = dpm_table->dpm_levels[dpm_table->count - 1].value; + dpm_table->dpm_state.soft_max_level = VG20_CLOCK_MAX_DEFAULT; dpm_table->dpm_state.hard_min_level = dpm_table->dpm_levels[0].value; - dpm_table->dpm_state.hard_max_level = dpm_table->dpm_levels[dpm_table->count - 1].value; + dpm_table->dpm_state.hard_max_level = VG20_CLOCK_MAX_DEFAULT; if (PP_CAP(PHM_PlatformCaps_UMDPState)) { if (VEGA20_UMD_PSTATE_GFXCLK_LEVEL < dpm_table->count) { @@ -3482,9 +3482,9 @@ static int vega20_apply_clocks_adjust_rules(struct pp_hwmgr *hwmgr) /* memclk */ dpm_table = &(data->dpm_table.mem_table); dpm_table->dpm_state.soft_min_level = dpm_table->dpm_levels[0].value; - dpm_table->dpm_state.soft_max_level = dpm_table->dpm_levels[dpm_table->count - 1].value; + dpm_table->dpm_state.soft_max_level = VG20_CLOCK_MAX_DEFAULT; dpm_table->dpm_state.hard_min_level = dpm_table->dpm_levels[0].value; - dpm_table->dpm_state.hard_max_level = dpm_table->dpm_levels[dpm_table->count - 1].value; + dpm_table->dpm_state.hard_max_level = VG20_CLOCK_MAX_DEFAULT; if (PP_CAP(PHM_PlatformCaps_UMDPState)) { if (VEGA20_UMD_PSTATE_MCLK_LEVEL < dpm_table->count) { @@ -3526,18 +3526,18 @@ static int vega20_apply_clocks_adjust_rules(struct pp_hwmgr *hwmgr) /* fclk */ dpm_table = &(data->dpm_table.fclk_table); dpm_table->dpm_state.soft_min_level = dpm_table->dpm_levels[0].value; - dpm_table->dpm_state.soft_max_level = dpm_table->dpm_levels[dpm_table->count - 1].value; + dpm_table->dpm_state.soft_max_level = VG20_CLOCK_MAX_DEFAULT; dpm_table->dpm_state.hard_min_level = dpm_table->dpm_levels[0].value; - dpm_table->dpm_state.hard_max_level = dpm_table->dpm_levels[dpm_table->count - 1].value; + dpm_table->dpm_state.hard_max_level = VG20_CLOCK_MAX_DEFAULT; if (hwmgr->display_config->nb_pstate_switch_disable) dpm_table->dpm_state.soft_min_level = dpm_table->dpm_levels[dpm_table->count - 1].value; /* vclk */ dpm_table = &(data->dpm_table.vclk_table); dpm_table->dpm_state.soft_min_level = dpm_table->dpm_levels[0].value; - dpm_table->dpm_state.soft_max_level = dpm_table->dpm_levels[dpm_table->count - 1].value; + dpm_table->dpm_state.soft_max_level = VG20_CLOCK_MAX_DEFAULT; dpm_table->dpm_state.hard_min_level = dpm_table->dpm_levels[0].value; - dpm_table->dpm_state.hard_max_level = dpm_table->dpm_levels[dpm_table->count - 1].value; + 
dpm_table->dpm_state.hard_max_level = VG20_CLOCK_MAX_DEFAULT; if (PP_CAP(PHM_PlatformCaps_UMDPState)) { if (VEGA20_UMD_PSTATE_UVDCLK_LEVEL < dpm_table->count) { @@ -3554,9 +3554,9 @@ static int vega20_apply_clocks_adjust_rules(struct pp_hwmgr *hwmgr) /* dclk */ dpm_table = &(data->dpm_table.dclk_table); dpm_table->dpm_state.soft_min_level = dpm_table->dpm_levels[0].value; - dpm_table->dpm_state.soft_max_level = dpm_table->dpm_levels[dpm_table->count - 1].value; + dpm_table->dpm_state.soft_max_level = VG20_CLOCK_MAX_DEFAULT; dpm_table->dpm_state.hard_min_level = dpm_table->dpm_levels[0].value; - dpm_table->dpm_state.hard_max_level = dpm_table->dpm_levels[dpm_table->count - 1].value; + dpm_table->dpm_state.hard_max_level = VG20_CLOCK_MAX_DEFAULT; if (PP_CAP(PHM_PlatformCaps_UMDPState)) { if (VEGA20_UMD_PSTATE_UVDCLK_LEVEL < dpm_table->count) { @@ -3573,9 +3573,9 @@ static int vega20_apply_clocks_adjust_rules(struct pp_hwmgr *hwmgr) /* socclk */ dpm_table = &(data->dpm_table.soc_table); dpm_table->dpm_state.soft_min_level = dpm_table->dpm_levels[0].value; - dpm_table->dpm_state.soft_max_level = dpm_table->dpm_levels[dpm_table->count - 1].value; + dpm_table->dpm_state.soft_max_level = VG20_CLOCK_MAX_DEFAULT; dpm_table->dpm_state.hard_min_level = dpm_table->dpm_levels[0].value; - dpm_table->dpm_state.hard_max_level = dpm_table->dpm_levels[dpm_table->count - 1].value; + dpm_table->dpm_state.hard_max_level = VG20_CLOCK_MAX_DEFAULT; if (PP_CAP(PHM_PlatformCaps_UMDPState)) { if (VEGA20_UMD_PSTATE_SOCCLK_LEVEL < dpm_table->count) { @@ -3592,9 +3592,9 @@ static int vega20_apply_clocks_adjust_rules(struct pp_hwmgr *hwmgr) /* eclk */ dpm_table = &(data->dpm_table.eclk_table); dpm_table->dpm_state.soft_min_level = dpm_table->dpm_levels[0].value; - dpm_table->dpm_state.soft_max_level = dpm_table->dpm_levels[dpm_table->count - 1].value; + dpm_table->dpm_state.soft_max_level = VG20_CLOCK_MAX_DEFAULT; dpm_table->dpm_state.hard_min_level = dpm_table->dpm_levels[0].value; - dpm_table->dpm_state.hard_max_level = dpm_table->dpm_levels[dpm_table->count - 1].value; + dpm_table->dpm_state.hard_max_level = VG20_CLOCK_MAX_DEFAULT; if (PP_CAP(PHM_PlatformCaps_UMDPState)) { if (VEGA20_UMD_PSTATE_VCEMCLK_LEVEL < dpm_table->count) { diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.h b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.h index 4a4cad35dc8f..ccaf415400a7 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.h +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.h @@ -42,6 +42,8 @@ #define AVFS_CURVE 0 #define OD8_HOTCURVE_TEMPERATURE 85 +#define VG20_CLOCK_MAX_DEFAULT 0xFFFF + typedef uint32_t PP_Clock; enum { From 084a56c723b34e493684e5f2ac9692c452564dce Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Thu, 21 Feb 2019 15:41:21 +0800 Subject: [PATCH 272/855] drm/amd/powerplay: show the right override pcie parameters Instead of the hard-coded ones from VBIOS. 
Signed-off-by: Evan Quan Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- .../drm/amd/powerplay/hwmgr/vega20_hwmgr.c | 46 ++++++++++++------- .../drm/amd/powerplay/hwmgr/vega20_hwmgr.h | 4 ++ 2 files changed, 34 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c index 90ce402a1b2c..9aa7bec1b5fe 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c @@ -783,6 +783,8 @@ static int vega20_init_smc_table(struct pp_hwmgr *hwmgr) static int vega20_override_pcie_parameters(struct pp_hwmgr *hwmgr) { struct amdgpu_device *adev = (struct amdgpu_device *)(hwmgr->adev); + struct vega20_hwmgr *data = + (struct vega20_hwmgr *)(hwmgr->backend); uint32_t pcie_gen = 0, pcie_width = 0, smu_pcie_arg; int ret; @@ -819,6 +821,10 @@ static int vega20_override_pcie_parameters(struct pp_hwmgr *hwmgr) "[OverridePcieParameters] Attempt to override pcie params failed!", return ret); + data->pcie_parameters_override = 1; + data->pcie_gen_level1 = pcie_gen; + data->pcie_width_level1 = pcie_width; + return 0; } @@ -3099,7 +3105,7 @@ static int vega20_print_clock_levels(struct pp_hwmgr *hwmgr, &(data->dpm_table.fclk_table); int i, now, size = 0; int ret = 0; - uint32_t gen_speed, lane_width; + uint32_t gen_speed, lane_width, current_gen_speed, current_lane_width; switch (type) { case PP_SCLK: @@ -3187,28 +3193,36 @@ static int vega20_print_clock_levels(struct pp_hwmgr *hwmgr, break; case PP_PCIE: - gen_speed = (RREG32_PCIE(smnPCIE_LC_SPEED_CNTL) & + current_gen_speed = (RREG32_PCIE(smnPCIE_LC_SPEED_CNTL) & PSWUSP0_PCIE_LC_SPEED_CNTL__LC_CURRENT_DATA_RATE_MASK) >> PSWUSP0_PCIE_LC_SPEED_CNTL__LC_CURRENT_DATA_RATE__SHIFT; - lane_width = (RREG32_PCIE(smnPCIE_LC_LINK_WIDTH_CNTL) & + current_lane_width = (RREG32_PCIE(smnPCIE_LC_LINK_WIDTH_CNTL) & PCIE_LC_LINK_WIDTH_CNTL__LC_LINK_WIDTH_RD_MASK) >> PCIE_LC_LINK_WIDTH_CNTL__LC_LINK_WIDTH_RD__SHIFT; - for (i = 0; i < NUM_LINK_LEVELS; i++) + for (i = 0; i < NUM_LINK_LEVELS; i++) { + if (i == 1 && data->pcie_parameters_override) { + gen_speed = data->pcie_gen_level1; + lane_width = data->pcie_width_level1; + } else { + gen_speed = pptable->PcieGenSpeed[i]; + lane_width = pptable->PcieLaneCount[i]; + } size += sprintf(buf + size, "%d: %s %s %dMhz %s\n", i, - (pptable->PcieGenSpeed[i] == 0) ? "2.5GT/s," : - (pptable->PcieGenSpeed[i] == 1) ? "5.0GT/s," : - (pptable->PcieGenSpeed[i] == 2) ? "8.0GT/s," : - (pptable->PcieGenSpeed[i] == 3) ? "16.0GT/s," : "", - (pptable->PcieLaneCount[i] == 1) ? "x1" : - (pptable->PcieLaneCount[i] == 2) ? "x2" : - (pptable->PcieLaneCount[i] == 3) ? "x4" : - (pptable->PcieLaneCount[i] == 4) ? "x8" : - (pptable->PcieLaneCount[i] == 5) ? "x12" : - (pptable->PcieLaneCount[i] == 6) ? "x16" : "", + (gen_speed == 0) ? "2.5GT/s," : + (gen_speed == 1) ? "5.0GT/s," : + (gen_speed == 2) ? "8.0GT/s," : + (gen_speed == 3) ? "16.0GT/s," : "", + (lane_width == 1) ? "x1" : + (lane_width == 2) ? "x2" : + (lane_width == 3) ? "x4" : + (lane_width == 4) ? "x8" : + (lane_width == 5) ? "x12" : + (lane_width == 6) ? "x16" : "", pptable->LclkFreq[i], - (gen_speed == pptable->PcieGenSpeed[i]) && - (lane_width == pptable->PcieLaneCount[i]) ? + (current_gen_speed == gen_speed) && + (current_lane_width == lane_width) ? 
"*" : ""); + } break; case OD_SCLK: diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.h b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.h index ccaf415400a7..a5bc758ae097 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.h +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.h @@ -526,6 +526,10 @@ struct vega20_hwmgr { unsigned long metrics_time; SmuMetrics_t metrics_table; + + bool pcie_parameters_override; + uint32_t pcie_gen_level1; + uint32_t pcie_width_level1; }; #define VEGA20_DPM2_NEAR_TDP_DEC 10 From 7db329e57b90ddebcb58fc88eedbb3082d22a957 Mon Sep 17 00:00:00 2001 From: Candice Li Date: Mon, 25 Feb 2019 10:59:08 +0800 Subject: [PATCH 273/855] Revert "drm/amdgpu: use BACO reset on vega20 if platform support" This reverts commit 2172b89e7c94605380d8c0dedf543c93f0a0b27c. Signed-off-by: Candice Li Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/soc15.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index 99ebcf29dcb0..ed89a101f73f 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -461,7 +461,6 @@ static int soc15_asic_reset(struct amdgpu_device *adev) switch (adev->asic_type) { case CHIP_VEGA10: - case CHIP_VEGA20: soc15_asic_get_baco_capability(adev, &baco_reset); break; default: From 672e78cab819ebe31e3b9b8abac367be8a110472 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 10 Dec 2018 16:42:01 -0700 Subject: [PATCH 274/855] drm/amd/display: Pass app_tf by value rather than by reference Clang warns when an expression that equals zero is used as a null pointer constant (in lieu of NULL): drivers/gpu/drm/amd/amdgpu/../display/amdgpu_dm/amdgpu_dm.c:4435:3: warning: expression which evaluates to zero treated as a null pointer constant of type 'const enum color_transfer_func *' [-Wnon-literal-null-conversion] TRANSFER_FUNC_UNKNOWN, ^~~~~~~~~~~~~~~~~~~~~ 1 warning generated. This warning is caused by commit bb47de736661 ("drm/amdgpu: Set FreeSync state using drm VRR properties") and it could be solved by using NULL instead of TRANSFER_FUNC_UNKNOWN or casting TRANSFER_FUNC_UNKNOWN as a pointer. However, after looking into it, there doesn't appear to be a good reason to pass app_tf by reference as it is never mutated along the way. This is the only code path in which app_tf is used: mod_freesync_build_vrr_infopacket -> build_vrr_infopacket_v2 -> build_vrr_infopacket_fs2_data Neither mod_freesync_build_vrr_infopacket or build_vrr_infopacket_v2 modify app_tf's value and build_vrr_infopacket_fs2_data expects just the value so we can avoid dereferencing anything by just passing in app_tf's value to mod_freesync_build_vrr_infopacket and build_vrr_infopacket_v2. There is no functional change because build_vrr_infopacket_fs2_data doesn't do anything if TRANSFER_FUNC_UNKNOWN is passed to it, the same as not calling build_vrr_infopacket_fs2_data at all like before this change when NULL was used for app_tf. 
Reviewed-by: Harry Wentland Signed-off-by: Nathan Chancellor Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/modules/freesync/freesync.c | 7 +++---- drivers/gpu/drm/amd/display/modules/inc/mod_freesync.h | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/display/modules/freesync/freesync.c b/drivers/gpu/drm/amd/display/modules/freesync/freesync.c index 94a84bc57c7a..bfd27f10879e 100644 --- a/drivers/gpu/drm/amd/display/modules/freesync/freesync.c +++ b/drivers/gpu/drm/amd/display/modules/freesync/freesync.c @@ -724,7 +724,7 @@ static void build_vrr_infopacket_v1(enum signal_type signal, static void build_vrr_infopacket_v2(enum signal_type signal, const struct mod_vrr_params *vrr, - const enum color_transfer_func *app_tf, + enum color_transfer_func app_tf, struct dc_info_packet *infopacket) { unsigned int payload_size = 0; @@ -732,8 +732,7 @@ static void build_vrr_infopacket_v2(enum signal_type signal, build_vrr_infopacket_header_v2(signal, infopacket, &payload_size); build_vrr_infopacket_data(vrr, infopacket); - if (app_tf != NULL) - build_vrr_infopacket_fs2_data(*app_tf, infopacket); + build_vrr_infopacket_fs2_data(app_tf, infopacket); build_vrr_infopacket_checksum(&payload_size, infopacket); @@ -757,7 +756,7 @@ void mod_freesync_build_vrr_infopacket(struct mod_freesync *mod_freesync, const struct dc_stream_state *stream, const struct mod_vrr_params *vrr, enum vrr_packet_type packet_type, - const enum color_transfer_func *app_tf, + enum color_transfer_func app_tf, struct dc_info_packet *infopacket) { /* SPD info packet for FreeSync diff --git a/drivers/gpu/drm/amd/display/modules/inc/mod_freesync.h b/drivers/gpu/drm/amd/display/modules/inc/mod_freesync.h index 4222e403b151..dcef85994c45 100644 --- a/drivers/gpu/drm/amd/display/modules/inc/mod_freesync.h +++ b/drivers/gpu/drm/amd/display/modules/inc/mod_freesync.h @@ -145,7 +145,7 @@ void mod_freesync_build_vrr_infopacket(struct mod_freesync *mod_freesync, const struct dc_stream_state *stream, const struct mod_vrr_params *vrr, enum vrr_packet_type packet_type, - const enum color_transfer_func *app_tf, + enum color_transfer_func app_tf, struct dc_info_packet *infopacket); void mod_freesync_build_vrr_params(struct mod_freesync *mod_freesync, From 1e2930374f565312e726f86150d9d1484f81c4d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Wed, 30 Jan 2019 14:44:36 +0100 Subject: [PATCH 275/855] drm/amdgpu: clear PDs/PTs only after initializing them MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clear the VM PDs/PTs only after initializing all the structures. 
Signed-off-by: Christian König Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 942b5ebc6dc2..12d51d96491e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -945,10 +945,6 @@ int amdgpu_vm_alloc_pts(struct amdgpu_device *adev, if (r) return r; - r = amdgpu_vm_clear_bo(adev, vm, pt, cursor.level, ats); - if (r) - goto error_free_pt; - if (vm->use_cpu_for_update) { r = amdgpu_bo_kmap(pt, NULL); if (r) @@ -961,6 +957,10 @@ int amdgpu_vm_alloc_pts(struct amdgpu_device *adev, pt->parent = amdgpu_bo_ref(cursor.parent->base.bo); amdgpu_vm_bo_base_init(&entry->base, vm, pt); + + r = amdgpu_vm_clear_bo(adev, vm, pt, cursor.level, ats); + if (r) + goto error_free_pt; } return 0; @@ -3031,13 +3031,14 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, if (r) goto error_unreserve; + amdgpu_vm_bo_base_init(&vm->root.base, vm, root); + r = amdgpu_vm_clear_bo(adev, vm, root, adev->vm_manager.root_level, vm->pte_support_ats); if (r) goto error_unreserve; - amdgpu_vm_bo_base_init(&vm->root.base, vm, root); amdgpu_bo_unreserve(vm->root.base.bo); if (pasid) { From c347bd71dcdb2d0ac8b3a771486584dca8c8dd80 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Fri, 22 Feb 2019 16:58:54 -0700 Subject: [PATCH 276/855] device-dax: Add a 'modalias' attribute to DAX 'bus' devices Add a 'modalias' attribute to devices under the DAX bus so that userspace is able to dynamically load modules as needed. Normally, udev can get the modalias from 'uevent', and that is correctly set up by the DAX bus. However other tooling such as 'libndctl' for interacting with drivers/nvdimm/, and 'libdaxctl' for drivers/dax/ can also use the modalias to dynamically load modules via libkmod lookups. The 'nd' bus set up by the libnvdimm subsystem exports a modalias attribute. Imitate this to export the same for the 'dax' bus. Cc: Dave Hansen Signed-off-by: Vishal Verma Signed-off-by: Dan Williams --- drivers/dax/bus.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index 28c3324271ac..2109cfe80219 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -295,6 +295,17 @@ static ssize_t target_node_show(struct device *dev, } static DEVICE_ATTR_RO(target_node); +static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + /* + * We only ever expect to handle device-dax instances, i.e. the + * @type argument to MODULE_ALIAS_DAX_DEVICE() is always zero + */ + return sprintf(buf, DAX_DEVICE_MODALIAS_FMT "\n", 0); +} +static DEVICE_ATTR_RO(modalias); + static umode_t dev_dax_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = container_of(kobj, struct device, kobj); @@ -306,6 +317,7 @@ static umode_t dev_dax_visible(struct kobject *kobj, struct attribute *a, int n) } static struct attribute *dev_dax_attributes[] = { + &dev_attr_modalias.attr, &dev_attr_size.attr, &dev_attr_target_node.attr, NULL, From 5cd401ace914dc68556c6d2fcae0c349444d5f86 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 25 Feb 2019 10:57:30 -0800 Subject: [PATCH 277/855] mm/resource: Return real error codes from walk failures walk_system_ram_range() can return an error code either becuase *it* failed, or because the 'func' that it calls returned an error. 
The memory hotplug does the following: ret = walk_system_ram_range(..., func); if (ret) return ret; and 'ret' makes it out to userspace, eventually. The problem is that walk_system_ram_range() failures that result from *it* failing (as opposed to 'func') return -1. That leads to a very odd -EPERM (-1) return code out to userspace. Make walk_system_ram_range() return -EINVAL for internal failures to keep userspace less confused. This return code is compatible with all the callers that I audited. Signed-off-by: Dave Hansen Reviewed-by: Bjorn Helgaas Acked-by: Michael Ellerman (powerpc) Cc: Dan Williams Cc: Dave Jiang Cc: Ross Zwisler Cc: Vishal Verma Cc: Tom Lendacky Cc: Andrew Morton Cc: Michal Hocko Cc: linux-nvdimm@lists.01.org Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Cc: Huang Ying Cc: Fengguang Wu Cc: Borislav Petkov Cc: Yaowei Bai Cc: Takashi Iwai Cc: Jerome Glisse Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: linuxppc-dev@lists.ozlabs.org Cc: Keith Busch Signed-off-by: Dan Williams --- kernel/resource.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/resource.c b/kernel/resource.c index 915c02e8e5dd..ca7ed5158cff 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -382,7 +382,7 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, int (*func)(struct resource *, void *)) { struct resource res; - int ret = -1; + int ret = -EINVAL; while (start < end && !find_next_iomem_res(start, end, flags, desc, first_lvl, &res)) { @@ -462,7 +462,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, unsigned long flags; struct resource res; unsigned long pfn, end_pfn; - int ret = -1; + int ret = -EINVAL; start = (u64) start_pfn << PAGE_SHIFT; end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; From b926b7f3baecb2a855db629e6822e1a85212e91c Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 25 Feb 2019 10:57:33 -0800 Subject: [PATCH 278/855] mm/resource: Move HMM pr_debug() deeper into resource code HMM consumes physical address space for its own use, even though nothing is mapped or accessible there. It uses a special resource description (IORES_DESC_DEVICE_PRIVATE_MEMORY) to uniquely identify these areas. When HMM consumes address space, it makes a best guess about what to consume. However, it is possible that a future memory or device hotplug can collide with the reserved area. In the case of these conflicts, there is an error message in register_memory_resource(). Later patches in this series move register_memory_resource() from using request_resource_conflict() to __request_region(). Unfortunately, __request_region() does not return the conflict like the previous function did, which makes it impossible to check for IORES_DESC_DEVICE_PRIVATE_MEMORY in a conflicting resource. Instead of warning in register_memory_resource(), move the check into the core resource code itself (__request_region()) where the conflicting resource _is_ available. This has the added bonus of producing a warning in case of HMM conflicts with devices *or* RAM address space, as opposed to the RAM-only warnings that were there previously. 
Signed-off-by: Dave Hansen Reviewed-by: Jerome Glisse Cc: Dan Williams Cc: Dave Jiang Cc: Ross Zwisler Cc: Vishal Verma Cc: Tom Lendacky Cc: Andrew Morton Cc: Michal Hocko Cc: linux-nvdimm@lists.01.org Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Cc: Huang Ying Cc: Fengguang Wu Cc: Keith Busch Signed-off-by: Dan Williams --- kernel/resource.c | 9 +++++++++ mm/memory_hotplug.c | 5 ----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/kernel/resource.c b/kernel/resource.c index ca7ed5158cff..35fe105d581e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1132,6 +1132,15 @@ struct resource * __request_region(struct resource *parent, conflict = __request_resource(parent, res); if (!conflict) break; + /* + * mm/hmm.c reserves physical addresses which then + * become unavailable to other users. Conflicts are + * not expected. Warn to aid debugging if encountered. + */ + if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) { + pr_warn("Unaddressable device %s %pR conflicts with %pR", + conflict->name, conflict, res); + } if (conflict != parent) { if (!(conflict->flags & IORESOURCE_BUSY)) { parent = conflict; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b9a667d36c55..e198974c968d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -110,11 +110,6 @@ static struct resource *register_memory_resource(u64 start, u64 size) res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; conflict = request_resource_conflict(&iomem_resource, res); if (conflict) { - if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) { - pr_debug("Device unaddressable memory block " - "memory hotplug at %#010llx !\n", - (unsigned long long)start); - } pr_debug("System RAM resource %pR cannot be added\n", res); kfree(res); return ERR_PTR(-EEXIST); From 2794129e902d8eb69413d884dc6404b8716ed9ed Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 25 Feb 2019 10:57:36 -0800 Subject: [PATCH 279/855] mm/memory-hotplug: Allow memory resources to be children The mm/resource.c code is used to manage the physical address space. The current resource configuration can be viewed in /proc/iomem. An example of this is at the bottom of this description. The nvdimm subsystem "owns" the physical address resources which map to persistent memory and has resources inserted for them as "Persistent Memory". The best way to repurpose this for volatile use is to leave the existing resource in place, but add a "System RAM" resource underneath it. This clearly communicates the ownership relationship of this memory. The request_resource_conflict() API only deals with the top-level resources. Replace it with __request_region() which will search for !IORESOURCE_BUSY areas lower in the resource tree than the top level. We *could* also simply truncate the existing top-level "Persistent Memory" resource and take over the released address space. But, this means that if we ever decide to hot-unplug the "RAM" and give it back, we need to recreate the original setup, which may mean going back to the BIOS tables. This should have no real effect on the existing collision detection because the areas that truly conflict should be marked IORESOURCE_BUSY. 
00000000-00000fff : Reserved 00001000-0009fbff : System RAM 0009fc00-0009ffff : Reserved 000a0000-000bffff : PCI Bus 0000:00 000c0000-000c97ff : Video ROM 000c9800-000ca5ff : Adapter ROM 000f0000-000fffff : Reserved 000f0000-000fffff : System ROM 00100000-9fffffff : System RAM 01000000-01e071d0 : Kernel code 01e071d1-027dfdff : Kernel data 02dc6000-0305dfff : Kernel bss a0000000-afffffff : Persistent Memory (legacy) a0000000-a7ffffff : System RAM b0000000-bffdffff : System RAM bffe0000-bfffffff : Reserved c0000000-febfffff : PCI Bus 0000:00 Signed-off-by: Dave Hansen Reviewed-by: Dan Williams Reviewed-by: Vishal Verma Cc: Dave Jiang Cc: Ross Zwisler Cc: Vishal Verma Cc: Tom Lendacky Cc: Andrew Morton Cc: Michal Hocko Cc: linux-nvdimm@lists.01.org Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Cc: Huang Ying Cc: Fengguang Wu Cc: Borislav Petkov Cc: Bjorn Helgaas Cc: Yaowei Bai Cc: Takashi Iwai Cc: Jerome Glisse Cc: Keith Busch Signed-off-by: Dan Williams --- mm/memory_hotplug.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e198974c968d..b37f3a5c4833 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -99,19 +99,21 @@ void mem_hotplug_done(void) /* add this memory to iomem resource */ static struct resource *register_memory_resource(u64 start, u64 size) { - struct resource *res, *conflict; - res = kzalloc(sizeof(struct resource), GFP_KERNEL); - if (!res) - return ERR_PTR(-ENOMEM); + struct resource *res; + unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + char *resource_name = "System RAM"; - res->name = "System RAM"; - res->start = start; - res->end = start + size - 1; - res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - conflict = request_resource_conflict(&iomem_resource, res); - if (conflict) { - pr_debug("System RAM resource %pR cannot be added\n", res); - kfree(res); + /* + * Request ownership of the new memory range. This might be + * a child of an existing resource that was present but + * not marked as busy. + */ + res = __request_region(&iomem_resource, start, size, + resource_name, flags); + + if (!res) { + pr_debug("Unable to reserve System RAM region: %016llx->%016llx\n", + start, start + size); return ERR_PTR(-EEXIST); } return res; From 2b539aefe9e48e3908cff02699aa63a8b9bd268e Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 25 Feb 2019 10:57:38 -0800 Subject: [PATCH 280/855] mm/resource: Let walk_system_ram_range() search child resources In the process of onlining memory, we use walk_system_ram_range() to find the actual RAM areas inside of the area being onlined. However, it currently only finds memory resources which are "top-level" iomem_resources. Children are not currently searched which causes it to skip System RAM in areas like this (in the format of /proc/iomem): a0000000-bfffffff : Persistent Memory (legacy) a0000000-afffffff : System RAM Changing the true->false here allows children to be searched as well. We need this because we add a new "System RAM" resource underneath the "persistent memory" resource when we use persistent memory in a volatile mode. 
Signed-off-by: Dave Hansen Cc: Keith Busch Cc: Dan Williams Cc: Dave Jiang Cc: Ross Zwisler Cc: Vishal Verma Cc: Tom Lendacky Cc: Andrew Morton Cc: Michal Hocko Cc: linux-nvdimm@lists.01.org Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Cc: Huang Ying Cc: Fengguang Wu Cc: Borislav Petkov Cc: Bjorn Helgaas Cc: Yaowei Bai Cc: Takashi Iwai Cc: Jerome Glisse Signed-off-by: Dan Williams --- kernel/resource.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/resource.c b/kernel/resource.c index 35fe105d581e..e7f9d2a5db25 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -454,6 +454,9 @@ int walk_mem_res(u64 start, u64 end, void *arg, * This function calls the @func callback against all memory ranges of type * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY. * It is to be used only for System RAM. + * + * This will find System RAM ranges that are children of top-level resources + * in addition to top-level System RAM resources. */ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)) @@ -469,7 +472,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; while (start < end && !find_next_iomem_res(start, end, flags, IORES_DESC_NONE, - true, &res)) { + false, &res)) { pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; end_pfn = (res.end + 1) >> PAGE_SHIFT; if (end_pfn > pfn) From c221c0b0308fd01d9fb33a16f64d2fd95f8830a4 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 25 Feb 2019 10:57:40 -0800 Subject: [PATCH 281/855] device-dax: "Hotplug" persistent memory for use like normal RAM This is intended for use with NVDIMMs that are physically persistent (physically like flash) so that they can be used as a cost-effective RAM replacement. Intel Optane DC persistent memory is one implementation of this kind of NVDIMM. Currently, a persistent memory region is "owned" by a device driver, either the "Direct DAX" or "Filesystem DAX" drivers. These drivers allow applications to explicitly use persistent memory, generally by being modified to use special, new libraries. (DIMM-based persistent memory hardware/software is described in great detail here: Documentation/nvdimm/nvdimm.txt). However, this limits persistent memory use to applications which *have* been modified. To make it more broadly usable, this driver "hotplugs" memory into the kernel, to be managed and used just like normal RAM would be. To make this work, management software must remove the device from being controlled by the "Device DAX" infrastructure: echo dax0.0 > /sys/bus/dax/drivers/device_dax/unbind and then tell the new driver that it can bind to the device: echo dax0.0 > /sys/bus/dax/drivers/kmem/new_id After this, there will be a number of new memory sections visible in sysfs that can be onlined, or that may get onlined by existing udev-initiated memory hotplug rules. This rebinding procedure is currently a one-way trip. Once memory is bound to "kmem", it's there permanently and can not be unbound and assigned back to device_dax. The kmem driver will never bind to a dax device unless the device is *explicitly* bound to the driver. There are two reasons for this: One, since it is a one-way trip, it can not be undone if bound incorrectly. Two, the kmem driver destroys data on the device. Think of if you had good data on a pmem device. 
It would be catastrophic if you compile-in "kmem", but leave out the "device_dax" driver. kmem would take over the device and write volatile data all over your good data. This inherits any existing NUMA information for the newly-added memory from the persistent memory device that came from the firmware. On Intel platforms, the firmware has guarantees that require each socket's persistent memory to be in a separate memory-only NUMA node. That means that this patch is not expected to create NUMA nodes, but will simply hotplug memory into existing nodes. Because NUMA nodes are created, the existing NUMA APIs and tools are sufficient to create policies for applications or memory areas to have affinity for or an aversion to using this memory. There is currently some metadata at the beginning of pmem regions. The section-size memory hotplug restrictions, plus this small reserved area can cause the "loss" of a section or two of capacity. This should be fixable in follow-on patches. But, as a first step, losing 256MB of memory (worst case) out of hundreds of gigabytes is a good tradeoff vs. the required code to fix this up precisely. This calculation is also the reason we export memory_block_size_bytes(). Signed-off-by: Dave Hansen Reviewed-by: Dan Williams Reviewed-by: Keith Busch Cc: Dave Jiang Cc: Ross Zwisler Cc: Vishal Verma Cc: Tom Lendacky Cc: Andrew Morton Cc: Michal Hocko Cc: linux-nvdimm@lists.01.org Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Cc: Huang Ying Cc: Fengguang Wu Cc: Borislav Petkov Cc: Bjorn Helgaas Cc: Yaowei Bai Cc: Takashi Iwai Cc: Jerome Glisse Reviewed-by: Vishal Verma Signed-off-by: Dan Williams --- drivers/base/memory.c | 1 + drivers/dax/Kconfig | 16 +++++++ drivers/dax/Makefile | 1 + drivers/dax/kmem.c | 108 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 126 insertions(+) create mode 100644 drivers/dax/kmem.c diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 048cbf7d5233..cb8347500ce2 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -88,6 +88,7 @@ unsigned long __weak memory_block_size_bytes(void) { return MIN_MEMORY_BLOCK_SIZE; } +EXPORT_SYMBOL_GPL(memory_block_size_bytes); static unsigned long get_memory_block_size(void) { diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig index 6fc96f03920e..5ef624fe3934 100644 --- a/drivers/dax/Kconfig +++ b/drivers/dax/Kconfig @@ -32,6 +32,22 @@ config DEV_DAX_PMEM Say M if unsure +config DEV_DAX_KMEM + tristate "KMEM DAX: volatile-use of persistent memory" + default DEV_DAX + depends on DEV_DAX + depends on MEMORY_HOTPLUG # for add_memory() and friends + help + Support access to persistent memory as if it were RAM. This + allows easier use of persistent memory by unmodified + applications. + + To use this feature, a DAX device must be unbound from the + device_dax driver (PMEM DAX) and bound to this kmem driver + on each boot. + + Say N if unsure. 
+ config DEV_DAX_PMEM_COMPAT tristate "PMEM DAX: support the deprecated /sys/class/dax interface" depends on DEV_DAX_PMEM diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile index 233bbffccbe6..81f7d54dadfb 100644 --- a/drivers/dax/Makefile +++ b/drivers/dax/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_DAX) += dax.o obj-$(CONFIG_DEV_DAX) += device_dax.o +obj-$(CONFIG_DEV_DAX_KMEM) += kmem.o dax-y := super.o dax-y += bus.o diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c new file mode 100644 index 000000000000..a02318c6d28a --- /dev/null +++ b/drivers/dax/kmem.c @@ -0,0 +1,108 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016-2019 Intel Corporation. All rights reserved. */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "dax-private.h" +#include "bus.h" + +int dev_dax_kmem_probe(struct device *dev) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + struct resource *res = &dev_dax->region->res; + resource_size_t kmem_start; + resource_size_t kmem_size; + resource_size_t kmem_end; + struct resource *new_res; + int numa_node; + int rc; + + /* + * Ensure good NUMA information for the persistent memory. + * Without this check, there is a risk that slow memory + * could be mixed in a node with faster memory, causing + * unavoidable performance issues. + */ + numa_node = dev_dax->target_node; + if (numa_node < 0) { + dev_warn(dev, "rejecting DAX region %pR with invalid node: %d\n", + res, numa_node); + return -EINVAL; + } + + /* Hotplug starting at the beginning of the next block: */ + kmem_start = ALIGN(res->start, memory_block_size_bytes()); + + kmem_size = resource_size(res); + /* Adjust the size down to compensate for moving up kmem_start: */ + kmem_size -= kmem_start - res->start; + /* Align the size down to cover only complete blocks: */ + kmem_size &= ~(memory_block_size_bytes() - 1); + kmem_end = kmem_start + kmem_size; + + /* Region is permanently reserved. Hot-remove not yet implemented. */ + new_res = request_mem_region(kmem_start, kmem_size, dev_name(dev)); + if (!new_res) { + dev_warn(dev, "could not reserve region [%pa-%pa]\n", + &kmem_start, &kmem_end); + return -EBUSY; + } + + /* + * Set flags appropriate for System RAM. Leave ..._BUSY clear + * so that add_memory() can add a child resource. Do not + * inherit flags from the parent since it may set new flags + * unknown to us that will break add_memory() below. + */ + new_res->flags = IORESOURCE_SYSTEM_RAM; + new_res->name = dev_name(dev); + + rc = add_memory(numa_node, new_res->start, resource_size(new_res)); + if (rc) + return rc; + + return 0; +} + +static int dev_dax_kmem_remove(struct device *dev) +{ + /* + * Purposely leak the request_mem_region() for the device-dax + * range and return '0' to ->remove() attempts. The removal of + * the device from the driver always succeeds, but the region + * is permanently pinned as reserved by the unreleased + * request_mem_region(). 
+ */ + return 0; +} + +static struct dax_device_driver device_dax_kmem_driver = { + .drv = { + .probe = dev_dax_kmem_probe, + .remove = dev_dax_kmem_remove, + }, +}; + +static int __init dax_kmem_init(void) +{ + return dax_driver_register(&device_dax_kmem_driver); +} + +static void __exit dax_kmem_exit(void) +{ + dax_driver_unregister(&device_dax_kmem_driver); +} + +MODULE_AUTHOR("Intel Corporation"); +MODULE_LICENSE("GPL v2"); +module_init(dax_kmem_init); +module_exit(dax_kmem_exit); +MODULE_ALIAS_DAX_DEVICE(0); From 2b57ecd0208f7ac0b20b1b171698f027481a39f6 Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Fri, 1 Mar 2019 14:25:16 +1100 Subject: [PATCH 282/855] KVM: PPC: Book3S: Add count cache flush parameters to kvmppc_get_cpu_char() Add KVM_PPC_CPU_CHAR_BCCTR_FLUSH_ASSIST & KVM_PPC_CPU_BEHAV_FLUSH_COUNT_CACHE to the characteristics returned from the H_GET_CPU_CHARACTERISTICS H-CALL, as queried from either the hypervisor or the device tree. Signed-off-by: Suraj Jitindar Singh Signed-off-by: Paul Mackerras --- arch/powerpc/include/uapi/asm/kvm.h | 2 ++ arch/powerpc/kvm/powerpc.c | 18 ++++++++++++++---- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 8c876c166ef2..26ca425f4c2c 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -463,10 +463,12 @@ struct kvm_ppc_cpu_char { #define KVM_PPC_CPU_CHAR_BR_HINT_HONOURED (1ULL << 58) #define KVM_PPC_CPU_CHAR_MTTRIG_THR_RECONF (1ULL << 57) #define KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS (1ULL << 56) +#define KVM_PPC_CPU_CHAR_BCCTR_FLUSH_ASSIST (1ull << 54) #define KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY (1ULL << 63) #define KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR (1ULL << 62) #define KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR (1ULL << 61) +#define KVM_PPC_CPU_BEHAV_FLUSH_COUNT_CACHE (1ull << 58) /* Per-vcpu XICS interrupt controller state */ #define KVM_REG_PPC_ICP_STATE (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8c) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 8c69af10f91d..8885377ec3e0 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -2189,10 +2189,12 @@ static int pseries_get_cpu_char(struct kvm_ppc_cpu_char *cp) KVM_PPC_CPU_CHAR_L1D_THREAD_PRIV | KVM_PPC_CPU_CHAR_BR_HINT_HONOURED | KVM_PPC_CPU_CHAR_MTTRIG_THR_RECONF | - KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS; + KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS | + KVM_PPC_CPU_CHAR_BCCTR_FLUSH_ASSIST; cp->behaviour_mask = KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY | KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR | - KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR; + KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR | + KVM_PPC_CPU_BEHAV_FLUSH_COUNT_CACHE; } return 0; } @@ -2251,12 +2253,16 @@ static int kvmppc_get_cpu_char(struct kvm_ppc_cpu_char *cp) if (have_fw_feat(fw_features, "enabled", "fw-count-cache-disabled")) cp->character |= KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS; + if (have_fw_feat(fw_features, "enabled", + "fw-count-cache-flush-bcctr2,0,0")) + cp->character |= KVM_PPC_CPU_CHAR_BCCTR_FLUSH_ASSIST; cp->character_mask = KVM_PPC_CPU_CHAR_SPEC_BAR_ORI31 | KVM_PPC_CPU_CHAR_BCCTRL_SERIALISED | KVM_PPC_CPU_CHAR_L1D_FLUSH_ORI30 | KVM_PPC_CPU_CHAR_L1D_FLUSH_TRIG2 | KVM_PPC_CPU_CHAR_L1D_THREAD_PRIV | - KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS; + KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS | + KVM_PPC_CPU_CHAR_BCCTR_FLUSH_ASSIST; if (have_fw_feat(fw_features, "enabled", "speculation-policy-favor-security")) @@ -2267,9 +2273,13 @@ static int kvmppc_get_cpu_char(struct kvm_ppc_cpu_char *cp) if (!have_fw_feat(fw_features, 
"disabled", "needs-spec-barrier-for-bound-checks")) cp->behaviour |= KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR; + if (have_fw_feat(fw_features, "enabled", + "needs-count-cache-flush-on-context-switch")) + cp->behaviour |= KVM_PPC_CPU_BEHAV_FLUSH_COUNT_CACHE; cp->behaviour_mask = KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY | KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR | - KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR; + KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR | + KVM_PPC_CPU_BEHAV_FLUSH_COUNT_CACHE; of_node_put(fw_features); } From 4c83c2f75a8f4737a8fb177bb7f2ffe464f567ca Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Fri, 1 Mar 2019 15:45:09 +0100 Subject: [PATCH 283/855] fbdev: omap2: fix warnings in dss core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 60d2fa0dad06 ("fbdev: omap2: no need to check return value of debugfs_create functions") changed the declaration of the return value of function dss_debugfs_create_file() and the following two warnings appeared: drivers/video/fbdev/omap2/omapfb/dss/core.c: In function ‘dss_debugfs_create_file’: drivers/video/fbdev/omap2/omapfb/dss/core.c:139:9: warning: ‘return’ with a value, in function returning void return 0; ^ drivers/video/fbdev/omap2/omapfb/dss/core.c: In function ‘omap_dss_probe’: drivers/video/fbdev/omap2/omapfb/dss/core.c:172:6: warning: unused variable ‘r’ [-Wunused-variable] int r; ^ Rework so function dss_debugfs_create_file() that is declared to return void don't 'return 0' and remove the declaration of the unused variable 'r'. Signed-off-by: Anders Roxell Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/video/fbdev/omap2/omapfb/dss/core.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/video/fbdev/omap2/omapfb/dss/core.c b/drivers/video/fbdev/omap2/omapfb/dss/core.c index 7e6a3eb266d0..b5956a1a30d4 100644 --- a/drivers/video/fbdev/omap2/omapfb/dss/core.c +++ b/drivers/video/fbdev/omap2/omapfb/dss/core.c @@ -136,7 +136,6 @@ static inline void dss_uninitialize_debugfs(void) } void dss_debugfs_create_file(const char *name, void (*write)(struct seq_file *)) { - return 0; } #endif /* CONFIG_FB_OMAP2_DSS_DEBUGFS */ @@ -169,8 +168,6 @@ static struct notifier_block omap_dss_pm_notif_block = { static int __init omap_dss_probe(struct platform_device *pdev) { - int r; - core.pdev = pdev; dss_features_init(omapdss_get_version()); From 5e3cc1ee1405a7eb3487ed24f786dec01b4cbe1f Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Thu, 24 Jan 2019 14:35:13 +0800 Subject: [PATCH 284/855] 9p: use inode->i_lock to protect i_size_write() under 32-bit Use inode->i_lock to protect i_size_write(), else i_size_read() in generic_fillattr() may loop infinitely in read_seqcount_begin() when multiple processes invoke v9fs_vfs_getattr() or v9fs_vfs_getattr_dotl() simultaneously under 32-bit SMP environment, and a soft lockup will be triggered as show below: watchdog: BUG: soft lockup - CPU#5 stuck for 22s! 
[stat:2217] Modules linked in: CPU: 5 PID: 2217 Comm: stat Not tainted 5.0.0-rc1-00005-g7f702faf5a9e #4 Hardware name: Generic DT based system PC is at generic_fillattr+0x104/0x108 LR is at 0xec497f00 pc : [<802b8898>] lr : [] psr: 200c0013 sp : ec497e20 ip : ed608030 fp : ec497e3c r10: 00000000 r9 : ec497f00 r8 : ed608030 r7 : ec497ebc r6 : ec497f00 r5 : ee5c1550 r4 : ee005780 r3 : 0000052d r2 : 00000000 r1 : ec497f00 r0 : ed608030 Flags: nzCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment none Control: 10c5387d Table: ac48006a DAC: 00000051 CPU: 5 PID: 2217 Comm: stat Not tainted 5.0.0-rc1-00005-g7f702faf5a9e #4 Hardware name: Generic DT based system Backtrace: [<8010d974>] (dump_backtrace) from [<8010dc88>] (show_stack+0x20/0x24) [<8010dc68>] (show_stack) from [<80a1d194>] (dump_stack+0xb0/0xdc) [<80a1d0e4>] (dump_stack) from [<80109f34>] (show_regs+0x1c/0x20) [<80109f18>] (show_regs) from [<801d0a80>] (watchdog_timer_fn+0x280/0x2f8) [<801d0800>] (watchdog_timer_fn) from [<80198658>] (__hrtimer_run_queues+0x18c/0x380) [<801984cc>] (__hrtimer_run_queues) from [<80198e60>] (hrtimer_run_queues+0xb8/0xf0) [<80198da8>] (hrtimer_run_queues) from [<801973e8>] (run_local_timers+0x28/0x64) [<801973c0>] (run_local_timers) from [<80197460>] (update_process_times+0x3c/0x6c) [<80197424>] (update_process_times) from [<801ab2b8>] (tick_nohz_handler+0xe0/0x1bc) [<801ab1d8>] (tick_nohz_handler) from [<80843050>] (arch_timer_handler_virt+0x38/0x48) [<80843018>] (arch_timer_handler_virt) from [<80180a64>] (handle_percpu_devid_irq+0x8c/0x240) [<801809d8>] (handle_percpu_devid_irq) from [<8017ac20>] (generic_handle_irq+0x34/0x44) [<8017abec>] (generic_handle_irq) from [<8017b344>] (__handle_domain_irq+0x6c/0xc4) [<8017b2d8>] (__handle_domain_irq) from [<801022e0>] (gic_handle_irq+0x4c/0x88) [<80102294>] (gic_handle_irq) from [<80101a30>] (__irq_svc+0x70/0x98) [<802b8794>] (generic_fillattr) from [<8056b284>] (v9fs_vfs_getattr_dotl+0x74/0xa4) [<8056b210>] (v9fs_vfs_getattr_dotl) from [<802b8904>] (vfs_getattr_nosec+0x68/0x7c) [<802b889c>] (vfs_getattr_nosec) from [<802b895c>] (vfs_getattr+0x44/0x48) [<802b8918>] (vfs_getattr) from [<802b8a74>] (vfs_statx+0x9c/0xec) [<802b89d8>] (vfs_statx) from [<802b9428>] (sys_lstat64+0x48/0x78) [<802b93e0>] (sys_lstat64) from [<80101000>] (ret_fast_syscall+0x0/0x28) [dominique.martinet@cea.fr: updated comment to not refer to a function in another subsystem] Link: http://lkml.kernel.org/r/20190124063514.8571-2-houtao1@huawei.com Cc: stable@vger.kernel.org Fixes: 7549ae3e81cc ("9p: Use the i_size_[read, write]() macros instead of using inode->i_size directly.") Reported-by: Xing Gaopeng Signed-off-by: Hou Tao Signed-off-by: Dominique Martinet --- fs/9p/v9fs_vfs.h | 23 +++++++++++++++++++++-- fs/9p/vfs_file.c | 6 +++++- fs/9p/vfs_inode.c | 23 +++++++++++------------ fs/9p/vfs_inode_dotl.c | 27 ++++++++++++++------------- fs/9p/vfs_super.c | 4 ++-- 5 files changed, 53 insertions(+), 30 deletions(-) diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 5a0db6dec8d1..aaee1e6584e6 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -40,6 +40,9 @@ */ #define P9_LOCK_TIMEOUT (30*HZ) +/* flags for v9fs_stat2inode() & v9fs_stat2inode_dotl() */ +#define V9FS_STAT2INODE_KEEP_ISIZE 1 + extern struct file_system_type v9fs_fs_type; extern const struct address_space_operations v9fs_addr_operations; extern const struct file_operations v9fs_file_operations; @@ -61,8 +64,10 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, struct inode *inode, umode_t mode, dev_t); void 
v9fs_evict_inode(struct inode *inode); ino_t v9fs_qid2ino(struct p9_qid *qid); -void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *); -void v9fs_stat2inode_dotl(struct p9_stat_dotl *, struct inode *); +void v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, + struct super_block *sb, unsigned int flags); +void v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, + unsigned int flags); int v9fs_dir_release(struct inode *inode, struct file *filp); int v9fs_file_open(struct inode *inode, struct file *file); void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat); @@ -83,4 +88,18 @@ static inline void v9fs_invalidate_inode_attr(struct inode *inode) } int v9fs_open_to_dotl_flags(int flags); + +static inline void v9fs_i_size_write(struct inode *inode, loff_t i_size) +{ + /* + * 32-bit need the lock, concurrent updates could break the + * sequences and make i_size_read() loop forever. + * 64-bit updates are atomic and can skip the locking. + */ + if (sizeof(i_size) > sizeof(long)) + spin_lock(&inode->i_lock); + i_size_write(inode, i_size); + if (sizeof(i_size) > sizeof(long)) + spin_unlock(&inode->i_lock); +} #endif diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index a25efa782fcc..9a1125305d84 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -446,7 +446,11 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) i_size = i_size_read(inode); if (iocb->ki_pos > i_size) { inode_add_bytes(inode, iocb->ki_pos - i_size); - i_size_write(inode, iocb->ki_pos); + /* + * Need to serialize against i_size_write() in + * v9fs_stat2inode() + */ + v9fs_i_size_write(inode, iocb->ki_pos); } return retval; } diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 85ff859d3af5..72b779bc0942 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -538,7 +538,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb, if (retval) goto error; - v9fs_stat2inode(st, inode, sb); + v9fs_stat2inode(st, inode, sb, 0); v9fs_cache_inode_get_cookie(inode); unlock_new_inode(inode); return inode; @@ -1092,7 +1092,7 @@ v9fs_vfs_getattr(const struct path *path, struct kstat *stat, if (IS_ERR(st)) return PTR_ERR(st); - v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb); + v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb, 0); generic_fillattr(d_inode(dentry), stat); p9stat_free(st); @@ -1170,12 +1170,13 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) * @stat: Plan 9 metadata (mistat) structure * @inode: inode to populate * @sb: superblock of filesystem + * @flags: control flags (e.g. V9FS_STAT2INODE_KEEP_ISIZE) * */ void v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, - struct super_block *sb) + struct super_block *sb, unsigned int flags) { umode_t mode; char ext[32]; @@ -1216,10 +1217,11 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, mode = p9mode2perm(v9ses, stat); mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; - i_size_write(inode, stat->length); + if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE)) + v9fs_i_size_write(inode, stat->length); /* not real number of blocks, but 512 byte ones ... 
*/ - inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9; + inode->i_blocks = (stat->length + 512 - 1) >> 9; v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR; } @@ -1416,9 +1418,9 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) { int umode; dev_t rdev; - loff_t i_size; struct p9_wstat *st; struct v9fs_session_info *v9ses; + unsigned int flags; v9ses = v9fs_inode2v9ses(inode); st = p9_client_stat(fid); @@ -1431,16 +1433,13 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) if ((inode->i_mode & S_IFMT) != (umode & S_IFMT)) goto out; - spin_lock(&inode->i_lock); /* * We don't want to refresh inode->i_size, * because we may have cached data */ - i_size = inode->i_size; - v9fs_stat2inode(st, inode, inode->i_sb); - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) - inode->i_size = i_size; - spin_unlock(&inode->i_lock); + flags = (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) ? + V9FS_STAT2INODE_KEEP_ISIZE : 0; + v9fs_stat2inode(st, inode, inode->i_sb, flags); out: p9stat_free(st); kfree(st); diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 4823e1c46999..a950a927a626 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -143,7 +143,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, if (retval) goto error; - v9fs_stat2inode_dotl(st, inode); + v9fs_stat2inode_dotl(st, inode, 0); v9fs_cache_inode_get_cookie(inode); retval = v9fs_get_acl(inode, fid); if (retval) @@ -496,7 +496,7 @@ v9fs_vfs_getattr_dotl(const struct path *path, struct kstat *stat, if (IS_ERR(st)) return PTR_ERR(st); - v9fs_stat2inode_dotl(st, d_inode(dentry)); + v9fs_stat2inode_dotl(st, d_inode(dentry), 0); generic_fillattr(d_inode(dentry), stat); /* Change block size to what the server returned */ stat->blksize = st->st_blksize; @@ -607,11 +607,13 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) * v9fs_stat2inode_dotl - populate an inode structure with stat info * @stat: stat structure * @inode: inode to populate + * @flags: ctrl flags (e.g. 
V9FS_STAT2INODE_KEEP_ISIZE) * */ void -v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) +v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, + unsigned int flags) { umode_t mode; struct v9fs_inode *v9inode = V9FS_I(inode); @@ -631,7 +633,8 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; - i_size_write(inode, stat->st_size); + if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE)) + v9fs_i_size_write(inode, stat->st_size); inode->i_blocks = stat->st_blocks; } else { if (stat->st_result_mask & P9_STATS_ATIME) { @@ -661,8 +664,9 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) } if (stat->st_result_mask & P9_STATS_RDEV) inode->i_rdev = new_decode_dev(stat->st_rdev); - if (stat->st_result_mask & P9_STATS_SIZE) - i_size_write(inode, stat->st_size); + if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) && + stat->st_result_mask & P9_STATS_SIZE) + v9fs_i_size_write(inode, stat->st_size); if (stat->st_result_mask & P9_STATS_BLOCKS) inode->i_blocks = stat->st_blocks; } @@ -928,9 +932,9 @@ v9fs_vfs_get_link_dotl(struct dentry *dentry, int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) { - loff_t i_size; struct p9_stat_dotl *st; struct v9fs_session_info *v9ses; + unsigned int flags; v9ses = v9fs_inode2v9ses(inode); st = p9_client_getattr_dotl(fid, P9_STATS_ALL); @@ -942,16 +946,13 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT)) goto out; - spin_lock(&inode->i_lock); /* * We don't want to refresh inode->i_size, * because we may have cached data */ - i_size = inode->i_size; - v9fs_stat2inode_dotl(st, inode); - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) - inode->i_size = i_size; - spin_unlock(&inode->i_lock); + flags = (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) ? 
+ V9FS_STAT2INODE_KEEP_ISIZE : 0; + v9fs_stat2inode_dotl(st, inode, flags); out: kfree(st); return 0; diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 48ce50484e80..eeab9953af89 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -172,7 +172,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, goto release_sb; } d_inode(root)->i_ino = v9fs_qid2ino(&st->qid); - v9fs_stat2inode_dotl(st, d_inode(root)); + v9fs_stat2inode_dotl(st, d_inode(root), 0); kfree(st); } else { struct p9_wstat *st = NULL; @@ -183,7 +183,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, } d_inode(root)->i_ino = v9fs_qid2ino(&st->qid); - v9fs_stat2inode(st, d_inode(root), sb); + v9fs_stat2inode(st, d_inode(root), sb, 0); p9stat_free(st); kfree(st); From e552f0851070fe4975d610a99910be4e9bf5d7bd Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Fri, 1 Mar 2019 12:00:46 +0000 Subject: [PATCH 285/855] drm: add __user attribute to ptr_to_compat() The ptr_to_compat() call takes a "void __user *", so cast the compat drm calls that use it to avoid the following warnings from sparse: drivers/gpu/drm/drm_ioc32.c:188:39: warning: incorrect type in argument 1 (different address spaces) drivers/gpu/drm/drm_ioc32.c:188:39: expected void [noderef] *uptr drivers/gpu/drm/drm_ioc32.c:188:39: got void *[addressable] [assigned] handle drivers/gpu/drm/drm_ioc32.c:529:41: warning: incorrect type in argument 1 (different address spaces) drivers/gpu/drm/drm_ioc32.c:529:41: expected void [noderef] *uptr drivers/gpu/drm/drm_ioc32.c:529:41: got void *[addressable] [assigned] handle Cc: stable@vger.kernel.org Signed-off-by: Ben Dooks Signed-off-by: Sean Paul Link: https://patchwork.freedesktop.org/patch/msgid/20190301120046.26961-1-ben.dooks@codethink.co.uk --- drivers/gpu/drm/drm_ioc32.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/drm_ioc32.c b/drivers/gpu/drm/drm_ioc32.c index 67b1fca39aa6..0e3043e08c69 100644 --- a/drivers/gpu/drm/drm_ioc32.c +++ b/drivers/gpu/drm/drm_ioc32.c @@ -185,7 +185,7 @@ static int compat_drm_getmap(struct file *file, unsigned int cmd, m32.size = map.size; m32.type = map.type; m32.flags = map.flags; - m32.handle = ptr_to_compat(map.handle); + m32.handle = ptr_to_compat((void __user *)map.handle); m32.mtrr = map.mtrr; if (copy_to_user(argp, &m32, sizeof(m32))) return -EFAULT; @@ -216,7 +216,7 @@ static int compat_drm_addmap(struct file *file, unsigned int cmd, m32.offset = map.offset; m32.mtrr = map.mtrr; - m32.handle = ptr_to_compat(map.handle); + m32.handle = ptr_to_compat((void __user *)map.handle); if (map.handle != compat_ptr(m32.handle)) pr_err_ratelimited("compat_drm_addmap truncated handle %p for type %d offset %x\n", map.handle, m32.type, m32.offset); @@ -526,7 +526,7 @@ static int compat_drm_getsareactx(struct file *file, unsigned int cmd, if (err) return err; - req32.handle = ptr_to_compat(req.handle); + req32.handle = ptr_to_compat((void __user *)req.handle); if (copy_to_user(argp, &req32, sizeof(req32))) return -EFAULT; From 7d3a5eb78e3614b025389d2cd89d6d85e91f5549 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 4 Mar 2019 21:02:18 +0100 Subject: [PATCH 286/855] irqchip/imx-irqsteer: Fix of_property_read_u32() error handling gcc points out that irqs_num is not initialized when of_property_read_u32() is an empty stub function: Included from drivers/irqchip/irq-imx-irqsteer.c:7: drivers/irqchip/irq-imx-irqsteer.c: In function 'imx_irqsteer_probe': include/uapi/linux/kernel.h:13:49: 
error: 'irqs_num' may be used uninitialized in this function [-Werror=maybe-uninitialized] The same can actually happen with CONFIG_OF=y as well, though we don't get a warning then. Add error checking here that lets the code deal with missing or invalid properties as well as avoid the warning. Fixes: 28528fca4908 ("irqchip/imx-irqsteer: Add multi output interrupts support") Signed-off-by: Arnd Bergmann Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-imx-irqsteer.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-imx-irqsteer.c b/drivers/irqchip/irq-imx-irqsteer.c index d1098f4da6a4..88df3d00052c 100644 --- a/drivers/irqchip/irq-imx-irqsteer.c +++ b/drivers/irqchip/irq-imx-irqsteer.c @@ -169,8 +169,12 @@ static int imx_irqsteer_probe(struct platform_device *pdev) raw_spin_lock_init(&data->lock); - of_property_read_u32(np, "fsl,num-irqs", &irqs_num); - of_property_read_u32(np, "fsl,channel", &data->channel); + ret = of_property_read_u32(np, "fsl,num-irqs", &irqs_num); + if (ret) + return ret; + ret = of_property_read_u32(np, "fsl,channel", &data->channel); + if (ret) + return ret; /* * There is one output irq for each group of 64 inputs. From 35f2806b481f5b9207f25e1886cba5d1c4d12cc7 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 22 Feb 2019 22:55:31 +0530 Subject: [PATCH 287/855] powerpc/hugetlb: Don't do runtime allocation of 16G pages in LPAR configuration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We added runtime allocation of 16G pages in commit 4ae279c2c96a ("powerpc/mm/hugetlb: Allow runtime allocation of 16G.") That was done to enable 16G allocation on PowerNV and KVM config. In the case of a KVM config, we would mostly have the entire guest RAM backed by 16G hugetlb pages for this to work. PAPR does support partial backing of guest RAM with hugepages via the ibm,expected#pages node of the memory node in the device tree. This means the rest of the guest RAM won't be backed by 16G contiguous pages in the host and hence a hash page table insertion can fail in such a case. An example error message will look like: hash-mmu: mm: Hashing failure ! EA=0x7efc00000000 access=0x8000000000000006 current=readback hash-mmu: trap=0x300 vsid=0x67af789 ssize=1 base psize=14 psize 14 pte=0xc000000400000386 readback[12260]: unhandled signal 7 at 00007efc00000000 nip 00000000100012d0 lr 000000001000127c code 2 This patch addresses that by preventing runtime allocation of 16G hugepages in LPAR config. To allocate 16G hugetlb pages one needs to use the kernel command line hugepagesz=16G hugepages= With radix translation mode we don't run into this issue. This change will prevent runtime allocation of 16G hugetlb pages on KVM with hash translation mode. However, with the current upstream it was observed that a 16G hugetlbfs backed guest doesn't boot at all. We observe boot failure with the below message: [131354.647546] KVM: map_vrma at 0 failed, ret=-4 That means this patch is not resulting in an observable regression. Once we fix the boot issue with 16G hugetlb backed memory, we need to use the ibm,expected#pages memory node attribute to indicate 16G page reservation to the guest. This will also enable partial backing of guest RAM with 16G pages. 
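As a hedged illustration (not part of the patch itself), a minimal userspace sketch of consuming one such boot-time reserved 16G page is shown below. It assumes 16G pages were reserved via hugepagesz=16G hugepages= as described above, and encodes the 16G page size for MAP_HUGETLB as log2(16G) = 34 shifted by MAP_HUGE_SHIFT; the local fallback defines are only there in case the libc headers do not provide them.

#include <stdio.h>
#include <sys/mman.h>

#ifndef MAP_HUGE_SHIFT
#define MAP_HUGE_SHIFT 26
#endif
#ifndef MAP_HUGE_16GB
#define MAP_HUGE_16GB (34U << MAP_HUGE_SHIFT)
#endif

#define SZ_16G (16UL << 30)

int main(void)
{
	/* Map one 16G huge page from the boot-time reserved pool. */
	void *p = mmap(NULL, SZ_16G, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_16GB,
		       -1, 0);

	if (p == MAP_FAILED) {
		/* Fails when no 16G pages were reserved at boot. */
		perror("mmap(MAP_HUGETLB | MAP_HUGE_16GB)");
		return 1;
	}

	((volatile char *)p)[0] = 1;	/* touch the mapping */
	munmap(p, SZ_16G);
	return 0;
}

With the gigantic_page_supported() check added by this patch, the 16G pool on a hash-translation LPAR can only be populated at boot; attempting to grow it at runtime via nr_hugepages no longer succeeds, so an empty pool simply makes the mmap() above fail.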
Fixes: 4ae279c2c96a ("powerpc/mm/hugetlb: Allow runtime allocation of 16G.") Cc: stable@vger.kernel.org # v4.14+ Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/hugetlb.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h index 5b0177733994..46130ef4941c 100644 --- a/arch/powerpc/include/asm/book3s/64/hugetlb.h +++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h @@ -35,6 +35,14 @@ static inline int hstate_get_psize(struct hstate *hstate) #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE static inline bool gigantic_page_supported(void) { + /* + * We used gigantic page reservation with hypervisor assist in some case. + * We cannot use runtime allocation of gigantic pages in those platforms + * This is hash translation mode LPARs. + */ + if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled()) + return false; + return true; } #endif From 805bf3b75529dd0bb8c3c9d8f6b89c2c781bd603 Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Tue, 5 Mar 2019 16:35:08 +1100 Subject: [PATCH 288/855] powerpc/configs: Sync skiroot defconfig This updates the skiroot defconfig with the version from the OpenPower firmwre build tree. Important changes are the addition of QED and E1000E ethernet drivers. Signed-off-by: Joel Stanley Signed-off-by: Michael Ellerman --- arch/powerpc/configs/skiroot_defconfig | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig index cfdd08897a06..5ba131c30f6b 100644 --- a/arch/powerpc/configs/skiroot_defconfig +++ b/arch/powerpc/configs/skiroot_defconfig @@ -37,7 +37,8 @@ CONFIG_MODULE_SIG=y CONFIG_MODULE_SIG_FORCE=y CONFIG_MODULE_SIG_SHA512=y CONFIG_PARTITION_ADVANCED=y -# CONFIG_IOSCHED_DEADLINE is not set +# CONFIG_MQ_IOSCHED_DEADLINE is not set +# CONFIG_MQ_IOSCHED_KYBER is not set # CONFIG_PPC_VAS is not set # CONFIG_PPC_PSERIES is not set # CONFIG_PPC_OF_BOOT_TRAMPOLINE is not set @@ -49,7 +50,6 @@ CONFIG_IRQ_ALL_CPUS=y CONFIG_NUMA=y # CONFIG_COMPACTION is not set # CONFIG_MIGRATION is not set -# CONFIG_BOUNCE is not set CONFIG_PPC_64K_PAGES=y CONFIG_SCHED_SMT=y CONFIG_CMDLINE_BOOL=y @@ -136,9 +136,11 @@ CONFIG_ACENIC_OMIT_TIGON_I=y # CONFIG_NET_VENDOR_AQUANTIA is not set # CONFIG_NET_VENDOR_ARC is not set # CONFIG_NET_VENDOR_ATHEROS is not set +# CONFIG_NET_VENDOR_AURORA is not set CONFIG_TIGON3=m CONFIG_BNX2X=m # CONFIG_NET_VENDOR_BROCADE is not set +# CONFIG_NET_VENDOR_CADENCE is not set # CONFIG_NET_CADENCE is not set # CONFIG_NET_VENDOR_CAVIUM is not set CONFIG_CHELSIO_T1=m @@ -151,6 +153,7 @@ CONFIG_BE2NET=m # CONFIG_NET_VENDOR_HP is not set # CONFIG_NET_VENDOR_HUAWEI is not set CONFIG_E1000=m +CONFIG_E1000E=m CONFIG_IGB=m CONFIG_IXGB=m CONFIG_IXGBE=m @@ -161,15 +164,18 @@ CONFIG_MLX4_EN=m # CONFIG_MLX4_CORE_GEN2 is not set CONFIG_MLX5_CORE=m # CONFIG_NET_VENDOR_MICREL is not set +# CONFIG_NET_VENDOR_MICROSEMI is not set CONFIG_MYRI10GE=m # CONFIG_NET_VENDOR_NATSEMI is not set # CONFIG_NET_VENDOR_NETRONOME is not set # CONFIG_NET_VENDOR_NI is not set # CONFIG_NET_VENDOR_NVIDIA is not set # CONFIG_NET_VENDOR_OKI is not set -# CONFIG_NET_PACKET_ENGINE is not set +# CONFIG_NET_VENDOR_PACKET_ENGINES is not set CONFIG_QLGE=m CONFIG_NETXEN_NIC=m +CONFIG_QED=m +CONFIG_QEDE=m # CONFIG_NET_VENDOR_QUALCOMM is not set # CONFIG_NET_VENDOR_RDC is not set # CONFIG_NET_VENDOR_REALTEK is not set From e585f51c4ee00046175ace52ca87ea4726302688 
Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Thu, 28 Feb 2019 16:31:21 +0800 Subject: [PATCH 289/855] powerpc: remove dead code in head_fsl_booke.S This code is dead. Just remove it. Signed-off-by: Jason Yan Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_fsl_booke.S | 7 ------- 1 file changed, 7 deletions(-) diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index 1881127682e9..32332e24e421 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -194,13 +194,6 @@ set_ivor: #endif mtspr SPRN_MAS4, r2 -#if 0 - /* Enable DOZE */ - mfspr r2,SPRN_HID0 - oris r2,r2,HID0_DOZE@h - mtspr SPRN_HID0, r2 -#endif - #if !defined(CONFIG_BDI_SWITCH) /* * The Abatron BDI JTAG debugger does not tolerate others From 9a9f1d1a81a972513636c333e26c542f8aebae55 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 5 Mar 2019 12:36:33 +0100 Subject: [PATCH 290/855] fbdev: mbx: fix a misspelled variable name A recent cleanup introduced a build failure when a variable was spelled incorrectly: In file included from drivers/video/fbdev/mbx/mbxfb.c:881: drivers/video/fbdev/mbx/mbxdebugfs.c: In function 'mbxfb_debugfs_init': drivers/video/fbdev/mbx/mbxdebugfs.c:217:2: error: 'mbfi' undeclared (first use in this function); did you mean 'mfbi'? mbfi->debugfs_dir = dir; ^~~~ mfbi drivers/video/fbdev/mbx/mbxdebugfs.c:217:2: note: each undeclared identifier is reported only once for each function it appears in drivers/video/fbdev/mbx/mbxdebugfs.c:213:21: error: unused variable 'mfbi' [-Werror=unused-variable] struct mbxfb_info *mfbi = fbi->par; ^~~~ Fixes: 72aed9e31344 ("fbdev: mbx: fix up debugfs file creation") Signed-off-by: Arnd Bergmann Cc: Greg Kroah-Hartman Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/video/fbdev/mbx/mbxdebugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/fbdev/mbx/mbxdebugfs.c b/drivers/video/fbdev/mbx/mbxdebugfs.c index 52cfe0132b25..09af721638fb 100644 --- a/drivers/video/fbdev/mbx/mbxdebugfs.c +++ b/drivers/video/fbdev/mbx/mbxdebugfs.c @@ -214,7 +214,7 @@ static void mbxfb_debugfs_init(struct fb_info *fbi) struct dentry *dir; dir = debugfs_create_dir("mbxfb", NULL); - mbfi->debugfs_dir = dir; + mfbi->debugfs_dir = dir; debugfs_create_file("sysconf", 0444, dir, fbi, &sysconf_fops); debugfs_create_file("clock", 0444, dir, fbi, &clock_fops); From 3eb39f47934f9d5a3027fe00d906a45fe3a15fad Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 19 Nov 2018 00:51:56 +0100 Subject: [PATCH 291/855] signal: add pidfd_send_signal() syscall The kill() syscall operates on process identifiers (pid). After a process has exited its pid can be reused by another process. If a caller sends a signal to a reused pid it will end up signaling the wrong process. This issue has often surfaced and there has been a push to address this problem [1]. This patch uses file descriptors (fd) from proc/ as stable handles on struct pid. Even if a pid is recycled the handle will not change. The fd can be used to send signals to the process it refers to. Thus, the new syscall pidfd_send_signal() is introduced to solve this problem. Instead of pids it operates on process fds (pidfd). /* prototype and argument /* long pidfd_send_signal(int pidfd, int sig, siginfo_t *info, unsigned int flags); /* syscall number 424 */ The syscall number was chosen to be 424 to align with Arnd's rework in his y2038 to minimize merge conflicts (cf. [25]). 
In addition to the pidfd and signal argument it takes an additional siginfo_t and flags argument. If the siginfo_t argument is NULL then pidfd_send_signal() is equivalent to kill(, ). If it is not NULL pidfd_send_signal() is equivalent to rt_sigqueueinfo(). The flags argument is added to allow for future extensions of this syscall. It currently needs to be passed as 0. Failing to do so will cause EINVAL. /* pidfd_send_signal() replaces multiple pid-based syscalls */ The pidfd_send_signal() syscall currently takes on the job of rt_sigqueueinfo(2) and parts of the functionality of kill(2), Namely, when a positive pid is passed to kill(2). It will however be possible to also replace tgkill(2) and rt_tgsigqueueinfo(2) if this syscall is extended. /* sending signals to threads (tid) and process groups (pgid) */ Specifically, the pidfd_send_signal() syscall does currently not operate on process groups or threads. This is left for future extensions. In order to extend the syscall to allow sending signal to threads and process groups appropriately named flags (e.g. PIDFD_TYPE_PGID, and PIDFD_TYPE_TID) should be added. This implies that the flags argument will determine what is signaled and not the file descriptor itself. Put in other words, grouping in this api is a property of the flags argument not a property of the file descriptor (cf. [13]). Clarification for this has been requested by Eric (cf. [19]). When appropriate extensions through the flags argument are added then pidfd_send_signal() can additionally replace the part of kill(2) which operates on process groups as well as the tgkill(2) and rt_tgsigqueueinfo(2) syscalls. How such an extension could be implemented has been very roughly sketched in [14], [15], and [16]. However, this should not be taken as a commitment to a particular implementation. There might be better ways to do it. Right now this is intentionally left out to keep this patchset as simple as possible (cf. [4]). /* naming */ The syscall had various names throughout iterations of this patchset: - procfd_signal() - procfd_send_signal() - taskfd_send_signal() In the last round of reviews it was pointed out that given that if the flags argument decides the scope of the signal instead of different types of fds it might make sense to either settle for "procfd_" or "pidfd_" as prefix. The community was willing to accept either (cf. [17] and [18]). Given that one developer expressed strong preference for the "pidfd_" prefix (cf. [13]) and with other developers less opinionated about the name we should settle for "pidfd_" to avoid further bikeshedding. The "_send_signal" suffix was chosen to reflect the fact that the syscall takes on the job of multiple syscalls. It is therefore intentional that the name is not reminiscent of neither kill(2) nor rt_sigqueueinfo(2). Not the fomer because it might imply that pidfd_send_signal() is a replacement for kill(2), and not the latter because it is a hassle to remember the correct spelling - especially for non-native speakers - and because it is not descriptive enough of what the syscall actually does. The name "pidfd_send_signal" makes it very clear that its job is to send signals. /* zombies */ Zombies can be signaled just as any other process. No special error will be reported since a zombie state is an unreliable state (cf. [3]). However, this can be added as an extension through the @flags argument if the need ever arises. 
/* cross-namespace signals */ The patch currently enforces that the signaler and signalee either are in the same pid namespace or that the signaler's pid namespace is an ancestor of the signalee's pid namespace. This is done for the sake of simplicity and because it is unclear to what values certain members of struct siginfo_t would need to be set to (cf. [5], [6]). /* compat syscalls */ It became clear that we would like to avoid adding compat syscalls (cf. [7]). The compat syscall handling is now done in kernel/signal.c itself by adding __copy_siginfo_from_user_generic() which lets us avoid compat syscalls (cf. [8]). It should be noted that the addition of __copy_siginfo_from_user_any() is caused by a bug in the original implementation of rt_sigqueueinfo(2) (cf. 12). With upcoming rework for syscall handling things might improve significantly (cf. [11]) and __copy_siginfo_from_user_any() will not gain any additional callers. /* testing */ This patch was tested on x64 and x86. /* userspace usage */ An asciinema recording for the basic functionality can be found under [9]. With this patch a process can be killed via: #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include static inline int do_pidfd_send_signal(int pidfd, int sig, siginfo_t *info, unsigned int flags) { #ifdef __NR_pidfd_send_signal return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); #else return -ENOSYS; #endif } int main(int argc, char *argv[]) { int fd, ret, saved_errno, sig; if (argc < 3) exit(EXIT_FAILURE); fd = open(argv[1], O_DIRECTORY | O_CLOEXEC); if (fd < 0) { printf("%s - Failed to open \"%s\"\n", strerror(errno), argv[1]); exit(EXIT_FAILURE); } sig = atoi(argv[2]); printf("Sending signal %d to process %s\n", sig, argv[1]); ret = do_pidfd_send_signal(fd, sig, NULL, 0); saved_errno = errno; close(fd); errno = saved_errno; if (ret < 0) { printf("%s - Failed to send signal %d to process %s\n", strerror(errno), sig, argv[1]); exit(EXIT_FAILURE); } exit(EXIT_SUCCESS); } /* Q&A * Given that it seems the same questions get asked again by people who are * late to the party it makes sense to add a Q&A section to the commit * message so it's hopefully easier to avoid duplicate threads. * * For the sake of progress please consider these arguments settled unless * there is a new point that desperately needs to be addressed. Please make * sure to check the links to the threads in this commit message whether * this has not already been covered. */ Q-01: (Florian Weimer [20], Andrew Morton [21]) What happens when the target process has exited? A-01: Sending the signal will fail with ESRCH (cf. [22]). Q-02: (Andrew Morton [21]) Is the task_struct pinned by the fd? A-02: No. A reference to struct pid is kept. struct pid - as far as I understand - was created exactly for the reason to not require to pin struct task_struct (cf. [22]). Q-03: (Andrew Morton [21]) Does the entire procfs directory remain visible? Just one entry within it? A-03: The same thing that happens right now when you hold a file descriptor to /proc/ open (cf. [22]). Q-04: (Andrew Morton [21]) Does the pid remain reserved? A-04: No. This patchset guarantees a stable handle not that pids are not recycled (cf. [22]). Q-05: (Andrew Morton [21]) Do attempts to signal that fd return errors? A-05: See {Q,A}-01. Q-06: (Andrew Morton [22]) Is there a cleaner way of obtaining the fd? Another syscall perhaps. 
A-06: Userspace can already trivially retrieve file descriptors from procfs so this is something that we will need to support anyway. Hence, there's no immediate need to add another syscalls just to make pidfd_send_signal() not dependent on the presence of procfs. However, adding a syscalls to get such file descriptors is planned for a future patchset (cf. [22]). Q-07: (Andrew Morton [21] and others) This fd-for-a-process sounds like a handy thing and people may well think up other uses for it in the future, probably unrelated to signals. Are the code and the interface designed to permit such future applications? A-07: Yes (cf. [22]). Q-08: (Andrew Morton [21] and others) Now I think about it, why a new syscall? This thing is looking rather like an ioctl? A-08: This has been extensively discussed. It was agreed that a syscall is preferred for a variety or reasons. Here are just a few taken from prior threads. Syscalls are safer than ioctl()s especially when signaling to fds. Processes are a core kernel concept so a syscall seems more appropriate. The layout of the syscall with its four arguments would require the addition of a custom struct for the ioctl() thereby causing at least the same amount or even more complexity for userspace than a simple syscall. The new syscall will replace multiple other pid-based syscalls (see description above). The file-descriptors-for-processes concept introduced with this syscall will be extended with other syscalls in the future. See also [22], [23] and various other threads already linked in here. Q-09: (Florian Weimer [24]) What happens if you use the new interface with an O_PATH descriptor? A-09: pidfds opened as O_PATH fds cannot be used to send signals to a process (cf. [2]). Signaling processes through pidfds is the equivalent of writing to a file. Thus, this is not an operation that operates "purely at the file descriptor level" as required by the open(2) manpage. See also [4]. 
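For convenience, a minimal self-contained probe of the syscall is sketched below. The header list and the fallback definition of the syscall number are assumptions made for this example (424 matches the x86 syscall tables wired up in this patch); signal 0 is used purely to check that the target can be signaled, mirroring kill(2) semantics.

    #define _GNU_SOURCE
    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    #ifndef __NR_pidfd_send_signal
    #define __NR_pidfd_send_signal 424
    #endif

    int main(void)
    {
            /* A pidfd is obtained by opening a /proc/<pid> directory. */
            int pidfd = open("/proc/self", O_DIRECTORY | O_CLOEXEC);
            int ret;

            if (pidfd < 0) {
                    perror("open");
                    return 1;
            }

            /* Signal 0 only probes the target; flags must currently be 0. */
            ret = syscall(__NR_pidfd_send_signal, pidfd, 0, NULL, 0);
            if (ret < 0)
                    perror("pidfd_send_signal");

            close(pidfd);
            return ret < 0 ? 1 : 0;
    }
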
/* References */ [1]: https://lore.kernel.org/lkml/20181029221037.87724-1-dancol@google.com/ [2]: https://lore.kernel.org/lkml/874lbtjvtd.fsf@oldenburg2.str.redhat.com/ [3]: https://lore.kernel.org/lkml/20181204132604.aspfupwjgjx6fhva@brauner.io/ [4]: https://lore.kernel.org/lkml/20181203180224.fkvw4kajtbvru2ku@brauner.io/ [5]: https://lore.kernel.org/lkml/20181121213946.GA10795@mail.hallyn.com/ [6]: https://lore.kernel.org/lkml/20181120103111.etlqp7zop34v6nv4@brauner.io/ [7]: https://lore.kernel.org/lkml/36323361-90BD-41AF-AB5B-EE0D7BA02C21@amacapital.net/ [8]: https://lore.kernel.org/lkml/87tvjxp8pc.fsf@xmission.com/ [9]: https://asciinema.org/a/IQjuCHew6bnq1cr78yuMv16cy [11]: https://lore.kernel.org/lkml/F53D6D38-3521-4C20-9034-5AF447DF62FF@amacapital.net/ [12]: https://lore.kernel.org/lkml/87zhtjn8ck.fsf@xmission.com/ [13]: https://lore.kernel.org/lkml/871s6u9z6u.fsf@xmission.com/ [14]: https://lore.kernel.org/lkml/20181206231742.xxi4ghn24z4h2qki@brauner.io/ [15]: https://lore.kernel.org/lkml/20181207003124.GA11160@mail.hallyn.com/ [16]: https://lore.kernel.org/lkml/20181207015423.4miorx43l3qhppfz@brauner.io/ [17]: https://lore.kernel.org/lkml/CAGXu5jL8PciZAXvOvCeCU3wKUEB_dU-O3q0tDw4uB_ojMvDEew@mail.gmail.com/ [18]: https://lore.kernel.org/lkml/20181206222746.GB9224@mail.hallyn.com/ [19]: https://lore.kernel.org/lkml/20181208054059.19813-1-christian@brauner.io/ [20]: https://lore.kernel.org/lkml/8736rebl9s.fsf@oldenburg.str.redhat.com/ [21]: https://lore.kernel.org/lkml/20181228152012.dbf0508c2508138efc5f2bbe@linux-foundation.org/ [22]: https://lore.kernel.org/lkml/20181228233725.722tdfgijxcssg76@brauner.io/ [23]: https://lwn.net/Articles/773459/ [24]: https://lore.kernel.org/lkml/8736rebl9s.fsf@oldenburg.str.redhat.com/ [25]: https://lore.kernel.org/lkml/CAK8P3a0ej9NcJM8wXNPbcGUyOUZYX+VLoDFdbenW3s3114oQZw@mail.gmail.com/ Cc: "Eric W. 
Biederman" Cc: Jann Horn Cc: Andy Lutomirsky Cc: Andrew Morton Cc: Oleg Nesterov Cc: Al Viro Cc: Florian Weimer Signed-off-by: Christian Brauner Reviewed-by: Tycho Andersen Reviewed-by: Kees Cook Reviewed-by: David Howells Acked-by: Arnd Bergmann Acked-by: Thomas Gleixner Acked-by: Serge Hallyn Acked-by: Aleksa Sarai --- arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + fs/proc/base.c | 9 ++ include/linux/proc_fs.h | 6 ++ include/linux/syscalls.h | 3 + include/uapi/asm-generic/unistd.h | 4 +- kernel/signal.c | 133 +++++++++++++++++++++++-- kernel/sys_ni.c | 1 + 8 files changed, 151 insertions(+), 7 deletions(-) diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 3cf7b533b3d1..234d91df8ca6 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -398,3 +398,4 @@ 384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl 385 i386 io_pgetevents sys_io_pgetevents __ia32_compat_sys_io_pgetevents 386 i386 rseq sys_rseq __ia32_sys_rseq +424 i386 pidfd_send_signal sys_pidfd_send_signal __ia32_sys_pidfd_send_signal diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index f0b1709a5ffb..58f4b3ad4fe0 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -343,6 +343,7 @@ 332 common statx __x64_sys_statx 333 common io_pgetevents __x64_sys_io_pgetevents 334 common rseq __x64_sys_rseq +424 common pidfd_send_signal __x64_sys_pidfd_send_signal # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/fs/proc/base.c b/fs/proc/base.c index 633a63462573..b6627c471078 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3046,6 +3046,15 @@ static const struct file_operations proc_tgid_base_operations = { .llseek = generic_file_llseek, }; +struct pid *tgid_pidfd_to_pid(const struct file *file) +{ + if (!d_is_dir(file->f_path.dentry) || + (file->f_op != &proc_tgid_base_operations)) + return ERR_PTR(-EBADF); + + return proc_pid(file_inode(file)); +} + static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return proc_pident_lookup(dir, dentry, diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index d0e1f1522a78..52a283ba0465 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -73,6 +73,7 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo int (*show)(struct seq_file *, void *), proc_write_t write, void *data); +extern struct pid *tgid_pidfd_to_pid(const struct file *file); #else /* CONFIG_PROC_FS */ @@ -114,6 +115,11 @@ static inline int remove_proc_subtree(const char *name, struct proc_dir_entry *p #define proc_create_net(name, mode, parent, state_size, ops) ({NULL;}) #define proc_create_net_single(name, mode, parent, show, data) ({NULL;}) +static inline struct pid *tgid_pidfd_to_pid(const struct file *file) +{ + return ERR_PTR(-EBADF); +} + #endif /* CONFIG_PROC_FS */ struct net; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 257cccba3062..5eb2e351675e 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -926,6 +926,9 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, unsigned mask, struct statx __user *buffer); asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len, int flags, uint32_t sig); +asmlinkage long sys_pidfd_send_signal(int pidfd, int sig, + siginfo_t __user 
*info, + unsigned int flags); /* * Architecture-specific system calls diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index d90127298f12..c861e7d1053b 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -740,9 +740,11 @@ __SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents) __SYSCALL(__NR_rseq, sys_rseq) #define __NR_kexec_file_load 294 __SYSCALL(__NR_kexec_file_load, sys_kexec_file_load) +#define __NR_pidfd_send_signal 424 +__SYSCALL(__NR_pidfd_send_signal, sys_pidfd_send_signal) #undef __NR_syscalls -#define __NR_syscalls 295 +#define __NR_syscalls 425 /* * 32 bit systems traditionally used different diff --git a/kernel/signal.c b/kernel/signal.c index e1d7ad8e6ab1..268bed80244f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -19,7 +19,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -3429,6 +3431,16 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, #endif #endif +static inline void prepare_kill_siginfo(int sig, struct kernel_siginfo *info) +{ + clear_siginfo(info); + info->si_signo = sig; + info->si_errno = 0; + info->si_code = SI_USER; + info->si_pid = task_tgid_vnr(current); + info->si_uid = from_kuid_munged(current_user_ns(), current_uid()); +} + /** * sys_kill - send a signal to a process * @pid: the PID of the process @@ -3438,16 +3450,125 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) { struct kernel_siginfo info; - clear_siginfo(&info); - info.si_signo = sig; - info.si_errno = 0; - info.si_code = SI_USER; - info.si_pid = task_tgid_vnr(current); - info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); + prepare_kill_siginfo(sig, &info); return kill_something_info(sig, &info, pid); } +#ifdef CONFIG_PROC_FS +/* + * Verify that the signaler and signalee either are in the same pid namespace + * or that the signaler's pid namespace is an ancestor of the signalee's pid + * namespace. + */ +static bool access_pidfd_pidns(struct pid *pid) +{ + struct pid_namespace *active = task_active_pid_ns(current); + struct pid_namespace *p = ns_of_pid(pid); + + for (;;) { + if (!p) + return false; + if (p == active) + break; + p = p->parent; + } + + return true; +} + +static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info) +{ +#ifdef CONFIG_COMPAT + /* + * Avoid hooking up compat syscalls and instead handle necessary + * conversions here. Note, this is a stop-gap measure and should not be + * considered a generic solution. + */ + if (in_compat_syscall()) + return copy_siginfo_from_user32( + kinfo, (struct compat_siginfo __user *)info); +#endif + return copy_siginfo_from_user(kinfo, info); +} + +/** + * sys_pidfd_send_signal - send a signal to a process through a task file + * descriptor + * @pidfd: the file descriptor of the process + * @sig: signal to be sent + * @info: the signal info + * @flags: future flags to be passed + * + * The syscall currently only signals via PIDTYPE_PID which covers + * kill(, . It does not signal threads or process + * groups. + * In order to extend the syscall to threads and process groups the @flags + * argument should be used. In essence, the @flags argument will determine + * what is signaled and not the file descriptor itself. Put in other words, + * grouping is a property of the flags argument not a property of the file + * descriptor. 
+ * + * Return: 0 on success, negative errno on failure + */ +SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, + siginfo_t __user *, info, unsigned int, flags) +{ + int ret; + struct fd f; + struct pid *pid; + kernel_siginfo_t kinfo; + + /* Enforce flags be set to 0 until we add an extension. */ + if (flags) + return -EINVAL; + + f = fdget_raw(pidfd); + if (!f.file) + return -EBADF; + + /* Is this a pidfd? */ + pid = tgid_pidfd_to_pid(f.file); + if (IS_ERR(pid)) { + ret = PTR_ERR(pid); + goto err; + } + + ret = -EINVAL; + if (!access_pidfd_pidns(pid)) + goto err; + + if (info) { + ret = copy_siginfo_from_user_any(&kinfo, info); + if (unlikely(ret)) + goto err; + + ret = -EINVAL; + if (unlikely(sig != kinfo.si_signo)) + goto err; + + if ((task_pid(current) != pid) && + (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) { + /* Only allow sending arbitrary signals to yourself. */ + ret = -EPERM; + if (kinfo.si_code != SI_USER) + goto err; + + /* Turn this into a regular kill signal. */ + prepare_kill_siginfo(sig, &kinfo); + } + } else { + prepare_kill_siginfo(sig, &kinfo); + } + + ret = kill_pid_info(sig, &kinfo, pid); + +err: + fdput(f); + return ret; +} +#endif /* CONFIG_PROC_FS */ + static int do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info) { diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index ab9d0e3c6d50..f905f4f9f677 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -163,6 +163,7 @@ COND_SYSCALL(syslog); /* kernel/sched/core.c */ /* kernel/signal.c */ +COND_SYSCALL(pidfd_send_signal); /* kernel/sys.c */ COND_SYSCALL(setregid); From 575a0ae9744d571f7c6aae4487a05850baae9e1c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sat, 29 Dec 2018 22:27:33 +0100 Subject: [PATCH 292/855] selftests: add tests for pidfd_send_signal() As suggested by Andrew Morton in [1] add selftests for the new sys_pidfd_send_signal() syscall: /* test_pidfd_send_signal_syscall_support */ Test whether the pidfd_send_signal() syscall is supported and the tests can be run or need to be skipped. /* test_pidfd_send_signal_simple_success */ Test whether sending a signal via a pidfd works. /* test_pidfd_send_signal_exited_fail */ Verify that sending a signal to an already exited process fails with ESRCH. /* test_pidfd_send_signal_recycled_pid_fail */ Verify that a recycled pid cannot be signaled via a pidfd referring to an already exited process that had the same pid (cf. [2], [3]). [1]: https://lore.kernel.org/lkml/20181228152012.dbf0508c2508138efc5f2bbe@linux-foundation.org/ [2]: https://lore.kernel.org/lkml/20181230210245.GA30252@mail.hallyn.com/ [3]: https://lore.kernel.org/lkml/20181230232711.7aayb7vnhogbv4co@brauner.io/ Cc: Arnd Bergmann Cc: "Eric W. 
Biederman" Cc: Kees Cook Cc: Jann Horn Cc: Andy Lutomirsky Cc: Andrew Morton Cc: Oleg Nesterov Cc: Aleksa Sarai Cc: Al Viro Cc: Florian Weimer Signed-off-by: Christian Brauner Reviewed-by: Tycho Andersen Acked-by: Serge Hallyn --- tools/testing/selftests/Makefile | 1 + tools/testing/selftests/pidfd/Makefile | 6 + tools/testing/selftests/pidfd/pidfd_test.c | 381 +++++++++++++++++++++ 3 files changed, 388 insertions(+) create mode 100644 tools/testing/selftests/pidfd/Makefile create mode 100644 tools/testing/selftests/pidfd/pidfd_test.c diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 1a2bd15c5b6e..cfbc3528e9ef 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -30,6 +30,7 @@ TARGETS += net TARGETS += netfilter TARGETS += networking/timestamping TARGETS += nsfs +TARGETS += pidfd TARGETS += powerpc TARGETS += proc TARGETS += pstore diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile new file mode 100644 index 000000000000..deaf8073bc06 --- /dev/null +++ b/tools/testing/selftests/pidfd/Makefile @@ -0,0 +1,6 @@ +CFLAGS += -g -I../../../../usr/include/ + +TEST_GEN_PROGS := pidfd_test + +include ../lib.mk + diff --git a/tools/testing/selftests/pidfd/pidfd_test.c b/tools/testing/selftests/pidfd/pidfd_test.c new file mode 100644 index 000000000000..d59378a93782 --- /dev/null +++ b/tools/testing/selftests/pidfd/pidfd_test.c @@ -0,0 +1,381 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest.h" + +static inline int sys_pidfd_send_signal(int pidfd, int sig, siginfo_t *info, + unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +static int signal_received; + +static void set_signal_received_on_sigusr1(int sig) +{ + if (sig == SIGUSR1) + signal_received = 1; +} + +/* + * Straightforward test to see whether pidfd_send_signal() works is to send + * a signal to ourself. 
+ */ +static int test_pidfd_send_signal_simple_success(void) +{ + int pidfd, ret; + const char *test_name = "pidfd_send_signal send SIGUSR1"; + + pidfd = open("/proc/self", O_DIRECTORY | O_CLOEXEC); + if (pidfd < 0) + ksft_exit_fail_msg( + "%s test: Failed to open process file descriptor\n", + test_name); + + signal(SIGUSR1, set_signal_received_on_sigusr1); + + ret = sys_pidfd_send_signal(pidfd, SIGUSR1, NULL, 0); + close(pidfd); + if (ret < 0) + ksft_exit_fail_msg("%s test: Failed to send signal\n", + test_name); + + if (signal_received != 1) + ksft_exit_fail_msg("%s test: Failed to receive signal\n", + test_name); + + signal_received = 0; + ksft_test_result_pass("%s test: Sent signal\n", test_name); + return 0; +} + +static int wait_for_pid(pid_t pid) +{ + int status, ret; + +again: + ret = waitpid(pid, &status, 0); + if (ret == -1) { + if (errno == EINTR) + goto again; + + return -1; + } + + if (ret != pid) + goto again; + + if (!WIFEXITED(status)) + return -1; + + return WEXITSTATUS(status); +} + +static int test_pidfd_send_signal_exited_fail(void) +{ + int pidfd, ret, saved_errno; + char buf[256]; + pid_t pid; + const char *test_name = "pidfd_send_signal signal exited process"; + + pid = fork(); + if (pid < 0) + ksft_exit_fail_msg("%s test: Failed to create new process\n", + test_name); + + if (pid == 0) + _exit(EXIT_SUCCESS); + + snprintf(buf, sizeof(buf), "/proc/%d", pid); + + pidfd = open(buf, O_DIRECTORY | O_CLOEXEC); + + (void)wait_for_pid(pid); + + if (pidfd < 0) + ksft_exit_fail_msg( + "%s test: Failed to open process file descriptor\n", + test_name); + + ret = sys_pidfd_send_signal(pidfd, 0, NULL, 0); + saved_errno = errno; + close(pidfd); + if (ret == 0) + ksft_exit_fail_msg( + "%s test: Managed to send signal to process even though it should have failed\n", + test_name); + + if (saved_errno != ESRCH) + ksft_exit_fail_msg( + "%s test: Expected to receive ESRCH as errno value but received %d instead\n", + test_name, saved_errno); + + ksft_test_result_pass("%s test: Failed to send signal as expected\n", + test_name); + return 0; +} + +/* + * The kernel reserves 300 pids via RESERVED_PIDS in kernel/pid.c + * That means, when it wraps around any pid < 300 will be skipped. + * So we need to use a pid > 300 in order to test recycling. + */ +#define PID_RECYCLE 1000 + +/* + * Maximum number of cycles we allow. This is equivalent to PID_MAX_DEFAULT. + * If users set a higher limit or we have cycled PIDFD_MAX_DEFAULT number of + * times then we skip the test to not go into an infinite loop or block for a + * long time. + */ +#define PIDFD_MAX_DEFAULT 0x8000 + +/* + * Define a few custom error codes for the child process to clearly indicate + * what is happening. This way we can tell the difference between a system + * error, a test error, etc. 
+ */ +#define PIDFD_PASS 0 +#define PIDFD_FAIL 1 +#define PIDFD_ERROR 2 +#define PIDFD_SKIP 3 +#define PIDFD_XFAIL 4 + +static int test_pidfd_send_signal_recycled_pid_fail(void) +{ + int i, ret; + pid_t pid1; + const char *test_name = "pidfd_send_signal signal recycled pid"; + + ret = unshare(CLONE_NEWPID); + if (ret < 0) + ksft_exit_fail_msg("%s test: Failed to unshare pid namespace\n", + test_name); + + ret = unshare(CLONE_NEWNS); + if (ret < 0) + ksft_exit_fail_msg( + "%s test: Failed to unshare mount namespace\n", + test_name); + + ret = mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0); + if (ret < 0) + ksft_exit_fail_msg("%s test: Failed to remount / private\n", + test_name); + + /* pid 1 in new pid namespace */ + pid1 = fork(); + if (pid1 < 0) + ksft_exit_fail_msg("%s test: Failed to create new process\n", + test_name); + + if (pid1 == 0) { + char buf[256]; + pid_t pid2; + int pidfd = -1; + + (void)umount2("/proc", MNT_DETACH); + ret = mount("proc", "/proc", "proc", 0, NULL); + if (ret < 0) + _exit(PIDFD_ERROR); + + /* grab pid PID_RECYCLE */ + for (i = 0; i <= PIDFD_MAX_DEFAULT; i++) { + pid2 = fork(); + if (pid2 < 0) + _exit(PIDFD_ERROR); + + if (pid2 == 0) + _exit(PIDFD_PASS); + + if (pid2 == PID_RECYCLE) { + snprintf(buf, sizeof(buf), "/proc/%d", pid2); + ksft_print_msg("pid to recycle is %d\n", pid2); + pidfd = open(buf, O_DIRECTORY | O_CLOEXEC); + } + + if (wait_for_pid(pid2)) + _exit(PIDFD_ERROR); + + if (pid2 >= PID_RECYCLE) + break; + } + + /* + * We want to be as predictable as we can so if we haven't been + * able to grab pid PID_RECYCLE skip the test. + */ + if (pid2 != PID_RECYCLE) { + /* skip test */ + close(pidfd); + _exit(PIDFD_SKIP); + } + + if (pidfd < 0) + _exit(PIDFD_ERROR); + + for (i = 0; i <= PIDFD_MAX_DEFAULT; i++) { + char c; + int pipe_fds[2]; + pid_t recycled_pid; + int child_ret = PIDFD_PASS; + + ret = pipe2(pipe_fds, O_CLOEXEC); + if (ret < 0) + _exit(PIDFD_ERROR); + + recycled_pid = fork(); + if (recycled_pid < 0) + _exit(PIDFD_ERROR); + + if (recycled_pid == 0) { + close(pipe_fds[1]); + (void)read(pipe_fds[0], &c, 1); + close(pipe_fds[0]); + + _exit(PIDFD_PASS); + } + + /* + * Stop the child so we can inspect whether we have + * recycled pid PID_RECYCLE. + */ + close(pipe_fds[0]); + ret = kill(recycled_pid, SIGSTOP); + close(pipe_fds[1]); + if (ret) { + (void)wait_for_pid(recycled_pid); + _exit(PIDFD_ERROR); + } + + /* + * We have recycled the pid. Try to signal it. This + * needs to fail since this is a different process than + * the one the pidfd refers to. + */ + if (recycled_pid == PID_RECYCLE) { + ret = sys_pidfd_send_signal(pidfd, SIGCONT, + NULL, 0); + if (ret && errno == ESRCH) + child_ret = PIDFD_XFAIL; + else + child_ret = PIDFD_FAIL; + } + + /* let the process move on */ + ret = kill(recycled_pid, SIGCONT); + if (ret) + (void)kill(recycled_pid, SIGKILL); + + if (wait_for_pid(recycled_pid)) + _exit(PIDFD_ERROR); + + switch (child_ret) { + case PIDFD_FAIL: + /* fallthrough */ + case PIDFD_XFAIL: + _exit(child_ret); + case PIDFD_PASS: + break; + default: + /* not reached */ + _exit(PIDFD_ERROR); + } + + /* + * If the user set a custom pid_max limit we could be + * in the millions. + * Skip the test in this case. 
+ */ + if (recycled_pid > PIDFD_MAX_DEFAULT) + _exit(PIDFD_SKIP); + } + + /* failed to recycle pid */ + _exit(PIDFD_SKIP); + } + + ret = wait_for_pid(pid1); + switch (ret) { + case PIDFD_FAIL: + ksft_exit_fail_msg( + "%s test: Managed to signal recycled pid %d\n", + test_name, PID_RECYCLE); + case PIDFD_PASS: + ksft_exit_fail_msg("%s test: Failed to recycle pid %d\n", + test_name, PID_RECYCLE); + case PIDFD_SKIP: + ksft_print_msg("%s test: Skipping test\n", test_name); + ret = 0; + break; + case PIDFD_XFAIL: + ksft_test_result_pass( + "%s test: Failed to signal recycled pid as expected\n", + test_name); + ret = 0; + break; + default /* PIDFD_ERROR */: + ksft_exit_fail_msg("%s test: Error while running tests\n", + test_name); + } + + return ret; +} + +static int test_pidfd_send_signal_syscall_support(void) +{ + int pidfd, ret; + const char *test_name = "pidfd_send_signal check for support"; + + pidfd = open("/proc/self", O_DIRECTORY | O_CLOEXEC); + if (pidfd < 0) + ksft_exit_fail_msg( + "%s test: Failed to open process file descriptor\n", + test_name); + + ret = sys_pidfd_send_signal(pidfd, 0, NULL, 0); + if (ret < 0) { + /* + * pidfd_send_signal() will currently return ENOSYS when + * CONFIG_PROC_FS is not set. + */ + if (errno == ENOSYS) + ksft_exit_skip( + "%s test: pidfd_send_signal() syscall not supported (Ensure that CONFIG_PROC_FS=y is set)\n", + test_name); + + ksft_exit_fail_msg("%s test: Failed to send signal\n", + test_name); + } + + close(pidfd); + ksft_test_result_pass( + "%s test: pidfd_send_signal() syscall is supported. Tests can be executed\n", + test_name); + return 0; +} + +int main(int argc, char **argv) +{ + ksft_print_header(); + + test_pidfd_send_signal_syscall_support(); + test_pidfd_send_signal_simple_success(); + test_pidfd_send_signal_exited_fail(); + test_pidfd_send_signal_recycled_pid_fail(); + + return ksft_exit_pass(); +} From 6dd356d8fc0641a8f0afb6ba457fb9351eea5076 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Tue, 5 Mar 2019 09:16:29 -0800 Subject: [PATCH 293/855] ARC: unaligned: relax the check for gcc supporting -mno-unaligned-access Without bleeding edge gcc, kernel builds were tripping everywhere. So current gcc will generate unaligned code despite !CONFIG_ARC_USE_UNALIGNED_MEM_ACCESS but that is something we have to live with. Signed-off-by: Vineet Gupta --- arch/arc/kernel/setup.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/arch/arc/kernel/setup.c b/arch/arc/kernel/setup.c index 15da43d2e416..a9c88b7e9182 100644 --- a/arch/arc/kernel/setup.c +++ b/arch/arc/kernel/setup.c @@ -314,18 +314,6 @@ static char *arc_cpu_mumbojumbo(int cpu_id, char *buf, int len) IS_AVAIL1(cpu->extn_mpy.ver, mpy_opt), IS_AVAIL1(cpu->isa.div_rem, "div_rem ")); - -#if defined(__ARC_UNALIGNED__) && !defined(CONFIG_ARC_USE_UNALIGNED_MEM_ACCESS) - /* - * gcc 7.3.1 (GNU 2018.03) onwards generate unaligned access by default - * but -mno-unaligned-access to disable that didn't work until gcc 8.2.1 - * (GNU 2019.03). So landing here implies the interim period, when - * despite Kconfig being off, gcc is generating unaligned accesses which - * could bomb later on. 
So better to disallow such broken builds - */ - BUILD_BUG_ON_MSG(1, "gcc doesn't support -mno-unaligned-access"); -#endif - if (cpu->bpu.ver) { n += scnprintf(buf + n, len - n, "BPU\t\t: %s%s match, cache:%d, Predict Table:%d Return stk: %d", From a0770e13c8da83bdb64738c0209ab02dd3cfff8b Mon Sep 17 00:00:00 2001 From: zhengliang Date: Mon, 4 Mar 2019 09:32:25 +0800 Subject: [PATCH 294/855] f2fs: fix to data block override node segment by mistake v4: Rearrange the previous three versions. The following scenario could lead to data block override by mistake. TASK A | TASK kworker | TASK B | TASK C | | | open | | | write | | | close | | | | f2fs_write_data_pages | | | f2fs_write_cache_pages | | | f2fs_outplace_write_data | | | f2fs_allocate_data_block (get block in seg S, | | | S is full, and only | | | have this valid data | | | block) | | | allocate_segment | | | locate_dirty_segment (mark S as PRE) | | | f2fs_submit_page_write (submit but is not | | | written on dev) | | unlink | | | iput_final | | | f2fs_drop_inode | | | f2fs_truncate | | | (not evict) | | | | | write_checkpoint | | | flush merged bio but not wait file data writeback | | | set_prefree_as_free (mark S as FREE) | | | | update NODE/DATA | | | allocate_segment (select S) | writeback done | | So we need to guarantee io complete before truncate inode in f2fs_drop_inode. Reviewed-by: Chao Yu Signed-off-by: Zheng Liang Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 384d1c248857..e382be2f10f9 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -915,6 +915,10 @@ static int f2fs_drop_inode(struct inode *inode) sb_start_intwrite(inode->i_sb); f2fs_i_size_write(inode, 0); + f2fs_submit_merged_write_cond(F2FS_I_SB(inode), + inode, NULL, 0, DATA); + truncate_inode_pages_final(inode->i_mapping); + if (F2FS_HAS_BLOCKS(inode)) f2fs_truncate(inode); From 7321dd97b5bb1801da6881141e56308e593a69f5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 24 Jan 2019 17:18:07 +0800 Subject: [PATCH 295/855] f2fs: fix to document inline_xattr_size option We missed to add document for inline_xattr_size mount option in f2fs.txt, add it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index e46c2147ddf8..f7b5e4ff0de3 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -126,6 +126,8 @@ disable_ext_identify Disable the extension list configured by mkfs, so f2fs does not aware of cold files such as media files. inline_xattr Enable the inline xattrs feature. noinline_xattr Disable the inline xattrs feature. +inline_xattr_size=%u Support configuring inline xattr size, it depends on + flexible inline xattr feature. inline_data Enable the inline data feature: New created small(<~3.4k) files can be written into inode block. inline_dentry Enable the inline dir feature: data in new created From 025cdb166c1e6a142fdec46bcd0873e00f0be0fd Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Wed, 23 Jan 2019 15:49:44 +0800 Subject: [PATCH 296/855] f2fs: jump to label 'free_node_inode' when failing from d_make_root() When sb->s_root is NULL dput() will do nothing, so jump to label 'free_node_inode' instead of lable 'free_root_inode' when failing from d_make_root(). 
Signed-off-by: Chengguang Xu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e382be2f10f9..83bbe7424fc1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3322,7 +3322,7 @@ try_onemore: sb->s_root = d_make_root(root); /* allocate root dentry */ if (!sb->s_root) { err = -ENOMEM; - goto free_root_inode; + goto free_node_inode; } err = f2fs_register_sysfs(sbi); From c42d28ce3e16dbd88e575c0acfda96d221dae2c9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 2 Feb 2019 17:33:01 +0800 Subject: [PATCH 297/855] f2fs: fix potential data inconsistence of checkpoint Previously, we changed lock from cp_rwsem to node_change, it solved the deadlock issue which was caused by below race condition: Thread A Thread B - f2fs_setattr - f2fs_lock_op -- read_lock - dquot_transfer - __dquot_transfer - dquot_acquire - commit_dqblk - f2fs_quota_write - f2fs_write_begin - f2fs_write_failed - write_checkpoint - block_operations - f2fs_lock_all -- write_lock - f2fs_truncate_blocks - f2fs_lock_op -- read_lock But it breaks the sematics of cp_rwsem, in other callers like: - f2fs_file_write_iter -> f2fs_write_begin -> f2fs_write_failed - f2fs_direct_IO -> f2fs_write_failed We allow to truncate dnode w/o cp_rwsem held, result in incorrect sit bitmap update, which can cause further data corruption. So this patch reverts previous fix implementation, and try to fix deadlock by skipping calling f2fs_truncate_blocks() in f2fs_write_failed() only for quota file, and keep the preallocated data/node in the tail of quota file, we can expecte that the preallocated space can be used to store quota info latter soon. Fixes: af033b2aa8a8 ("f2fs: guarantee journalled quota data by checkpoint") Signed-off-by: Gao Xiang Signed-off-by: Sheng Yong Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 ++- fs/f2fs/f2fs.h | 3 +-- fs/f2fs/file.c | 14 ++++++-------- fs/f2fs/inline.c | 4 ++-- 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 4e402add2e60..e099babf85bd 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2312,7 +2312,8 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to) down_write(&F2FS_I(inode)->i_mmap_sem); truncate_pagecache(inode, i_size); - f2fs_truncate_blocks(inode, i_size, true, true); + if (!IS_NOQUOTA(inode)) + f2fs_truncate_blocks(inode, i_size, true); up_write(&F2FS_I(inode)->i_mmap_sem); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8c928cd72b61..4665bff1bf55 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2843,8 +2843,7 @@ static inline bool is_valid_data_blkaddr(struct f2fs_sb_info *sbi, */ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); void f2fs_truncate_data_blocks(struct dnode_of_data *dn); -int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock, - bool buf_write); +int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock); int f2fs_truncate(struct inode *inode); int f2fs_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index fe6f92fbba38..b8f5d1208619 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -589,8 +589,7 @@ truncate_out: return 0; } -int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock, - bool buf_write) +int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock) { 
struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; @@ -598,7 +597,6 @@ int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock, int count = 0, err = 0; struct page *ipage; bool truncate_page = false; - int flag = buf_write ? F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO; trace_f2fs_truncate_blocks_enter(inode, from); @@ -608,7 +606,7 @@ int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock, goto free_partial; if (lock) - __do_map_lock(sbi, flag, true); + f2fs_lock_op(sbi); ipage = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { @@ -646,7 +644,7 @@ free_next: err = f2fs_truncate_inode_blocks(inode, free_from); out: if (lock) - __do_map_lock(sbi, flag, false); + f2fs_unlock_op(sbi); free_partial: /* lastly zero out the first data page */ if (!err) @@ -681,7 +679,7 @@ int f2fs_truncate(struct inode *inode) return err; } - err = f2fs_truncate_blocks(inode, i_size_read(inode), true, false); + err = f2fs_truncate_blocks(inode, i_size_read(inode), true); if (err) return err; @@ -1262,7 +1260,7 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) new_size = i_size_read(inode) - len; truncate_pagecache(inode, new_size); - ret = f2fs_truncate_blocks(inode, new_size, true, false); + ret = f2fs_truncate_blocks(inode, new_size, true); up_write(&F2FS_I(inode)->i_mmap_sem); if (!ret) f2fs_i_size_write(inode, new_size); @@ -1447,7 +1445,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); down_write(&F2FS_I(inode)->i_mmap_sem); - ret = f2fs_truncate_blocks(inode, i_size_read(inode), true, false); + ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); up_write(&F2FS_I(inode)->i_mmap_sem); if (ret) return ret; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index d636cbcf68f2..a1381d05956b 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -298,7 +298,7 @@ process_inline: clear_inode_flag(inode, FI_INLINE_DATA); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { - if (f2fs_truncate_blocks(inode, 0, false, false)) + if (f2fs_truncate_blocks(inode, 0, false)) return false; goto process_inline; } @@ -470,7 +470,7 @@ static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry) return 0; punch_dentry_pages: truncate_inode_pages(&dir->i_data, 0); - f2fs_truncate_blocks(dir, 0, false, false); + f2fs_truncate_blocks(dir, 0, false); f2fs_remove_dirty_inode(dir); return err; } From 9083977dabf3833298ddcd40dee28687f1e6b483 Mon Sep 17 00:00:00 2001 From: Sahitya Tummala Date: Mon, 4 Feb 2019 13:36:53 +0530 Subject: [PATCH 298/855] f2fs: do not use mutex lock in atomic context Fix below warning coming because of using mutex lock in atomic context. BUG: sleeping function called from invalid context at kernel/locking/mutex.c:98 in_atomic(): 1, irqs_disabled(): 0, pid: 585, name: sh Preemption disabled at: __radix_tree_preload+0x28/0x130 Call trace: dump_backtrace+0x0/0x2b4 show_stack+0x20/0x28 dump_stack+0xa8/0xe0 ___might_sleep+0x144/0x194 __might_sleep+0x58/0x8c mutex_lock+0x2c/0x48 f2fs_trace_pid+0x88/0x14c f2fs_set_node_page_dirty+0xd0/0x184 Do not use f2fs_radix_tree_insert() to avoid doing cond_resched() with spin_lock() acquired. 
Signed-off-by: Sahitya Tummala Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/trace.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index ce2a5eb210b6..d0ab533a9ce8 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -14,7 +14,7 @@ #include "trace.h" static RADIX_TREE(pids, GFP_ATOMIC); -static struct mutex pids_lock; +static spinlock_t pids_lock; static struct last_io_info last_io; static inline void __print_last_io(void) @@ -58,23 +58,29 @@ void f2fs_trace_pid(struct page *page) set_page_private(page, (unsigned long)pid); +retry: if (radix_tree_preload(GFP_NOFS)) return; - mutex_lock(&pids_lock); + spin_lock(&pids_lock); p = radix_tree_lookup(&pids, pid); if (p == current) goto out; if (p) radix_tree_delete(&pids, pid); - f2fs_radix_tree_insert(&pids, pid, current); + if (radix_tree_insert(&pids, pid, current)) { + spin_unlock(&pids_lock); + radix_tree_preload_end(); + cond_resched(); + goto retry; + } trace_printk("%3x:%3x %4x %-16s\n", MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), pid, current->comm); out: - mutex_unlock(&pids_lock); + spin_unlock(&pids_lock); radix_tree_preload_end(); } @@ -119,7 +125,7 @@ void f2fs_trace_ios(struct f2fs_io_info *fio, int flush) void f2fs_build_trace_ios(void) { - mutex_init(&pids_lock); + spin_lock_init(&pids_lock); } #define PIDVEC_SIZE 128 @@ -147,7 +153,7 @@ void f2fs_destroy_trace_ios(void) pid_t next_pid = 0; unsigned int found; - mutex_lock(&pids_lock); + spin_lock(&pids_lock); while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) { unsigned idx; @@ -155,5 +161,5 @@ void f2fs_destroy_trace_ios(void) for (idx = 0; idx < found; idx++) radix_tree_delete(&pids, pid[idx]); } - mutex_unlock(&pids_lock); + spin_unlock(&pids_lock); } From 500e0b28ecd3c5aade98f3c3a339d18dcb166bb6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 15 Feb 2019 00:08:25 +0800 Subject: [PATCH 299/855] f2fs: fix to check inline_xattr_size boundary correctly We use below condition to check inline_xattr_size boundary: if (!F2FS_OPTION(sbi).inline_xattr_size || F2FS_OPTION(sbi).inline_xattr_size >= DEF_ADDRS_PER_INODE - F2FS_TOTAL_EXTRA_ATTR_SIZE - DEF_INLINE_RESERVED_SIZE - DEF_MIN_INLINE_SIZE) There is there problems in that check: - we should allow inline_xattr_size equaling to min size of inline {data,dentry} area. - F2FS_TOTAL_EXTRA_ATTR_SIZE and inline_xattr_size are based on different size unit, previous one is 4 bytes, latter one is 1 bytes. - DEF_MIN_INLINE_SIZE only indicate min size of inline data area, however, we need to consider min size of inline dentry area as well, minimal inline dentry should at least contain two entries: '.' and '..', so that min inline_dentry size is 40 bytes. 
.bitmap 1 * 1 = 1 .reserved 1 * 1 = 1 .dentry 11 * 2 = 22 .filename 8 * 2 = 16 total 40 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 - fs/f2fs/super.c | 13 +++++++------ include/linux/f2fs_fs.h | 13 +++++++------ 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4665bff1bf55..f1f0d2810852 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -459,7 +459,6 @@ struct f2fs_flush_device { /* for inline stuff */ #define DEF_INLINE_RESERVED_SIZE 1 -#define DEF_MIN_INLINE_SIZE 1 static inline int get_extra_isize(struct inode *inode); static inline int get_inline_xattr_addrs(struct inode *inode); #define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 83bbe7424fc1..be8be445c6ed 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -834,12 +834,13 @@ static int parse_options(struct super_block *sb, char *options) "set with inline_xattr option"); return -EINVAL; } - if (!F2FS_OPTION(sbi).inline_xattr_size || - F2FS_OPTION(sbi).inline_xattr_size >= - DEF_ADDRS_PER_INODE - - F2FS_TOTAL_EXTRA_ATTR_SIZE - - DEF_INLINE_RESERVED_SIZE - - DEF_MIN_INLINE_SIZE) { + if (F2FS_OPTION(sbi).inline_xattr_size < + sizeof(struct f2fs_xattr_header) / sizeof(__le32) || + F2FS_OPTION(sbi).inline_xattr_size > + DEF_ADDRS_PER_INODE - + F2FS_TOTAL_EXTRA_ATTR_SIZE / sizeof(__le32) - + DEF_INLINE_RESERVED_SIZE - + MIN_INLINE_DENTRY_SIZE / sizeof(__le32)) { f2fs_msg(sb, KERN_ERR, "inline xattr size is out of range"); return -EINVAL; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 8d57aaee8166..666db8eb71e0 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -490,12 +490,12 @@ typedef __le32 f2fs_hash_t; /* * space utilization of regular dentry and inline dentry (w/o extra reservation) - * regular dentry inline dentry - * bitmap 1 * 27 = 27 1 * 23 = 23 - * reserved 1 * 3 = 3 1 * 7 = 7 - * dentry 11 * 214 = 2354 11 * 182 = 2002 - * filename 8 * 214 = 1712 8 * 182 = 1456 - * total 4096 3488 + * regular dentry inline dentry (def) inline dentry (min) + * bitmap 1 * 27 = 27 1 * 23 = 23 1 * 1 = 1 + * reserved 1 * 3 = 3 1 * 7 = 7 1 * 1 = 1 + * dentry 11 * 214 = 2354 11 * 182 = 2002 11 * 2 = 22 + * filename 8 * 214 = 1712 8 * 182 = 1456 8 * 2 = 16 + * total 4096 3488 40 * * Note: there are more reserved space in inline dentry than in regular * dentry, when converting inline dentry we should handle this carefully. @@ -507,6 +507,7 @@ typedef __le32 f2fs_hash_t; #define SIZE_OF_RESERVED (PAGE_SIZE - ((SIZE_OF_DIR_ENTRY + \ F2FS_SLOT_LEN) * \ NR_DENTRY_IN_BLOCK + SIZE_OF_DENTRY_BITMAP)) +#define MIN_INLINE_DENTRY_SIZE 40 /* just include '.' and '..' entries */ /* One directory entry slot representing F2FS_SLOT_LEN-sized file name */ struct f2fs_dir_entry { From 6d52e135c8a8b2063e0a9fe1f12c06e9208a941c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 15 Feb 2019 00:16:15 +0800 Subject: [PATCH 300/855] f2fs: don't allow negative ->write_io_size_bits As Dan reported: "We put an upper bound on ->write_io_size_bits but we don't have a lower bound." So let's add lower bound check for ->write_io_size_bits in parse_options(). [We don't allow configuring ->write_io_size_bits to zero, since at least we need to fill one dummy page for aligned IO.] 
Reported-by: Dan Carpenter Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index be8be445c6ed..230845221c74 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -586,7 +586,7 @@ static int parse_options(struct super_block *sb, char *options) case Opt_io_size_bits: if (args->from && match_int(args, &arg)) return -EINVAL; - if (arg > __ilog2_u32(BIO_MAX_PAGES)) { + if (arg <= 0 || arg > __ilog2_u32(BIO_MAX_PAGES)) { f2fs_msg(sb, KERN_WARNING, "Not support %d, larger than %d", 1 << arg, BIO_MAX_PAGES); From fb40d618b03978b7cc5820697894461f4a2af98b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 5 Feb 2019 07:59:57 -0800 Subject: [PATCH 301/855] f2fs: don't clear CP_QUOTA_NEED_FSCK_FLAG If we met this once, let fsck.f2fs clear this only. Note that, this addresses all the subtle fault injection test. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 03fea4efd64b..c65a1e8e1e95 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1267,8 +1267,10 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) __set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); - else - __clear_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); + /* + * TODO: we count on fsck.f2fs to clear this flag until we figure out + * missing cases which clear it incorrectly. + */ if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR)) __set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); From 05b700ba6003fd98c41314f390df36e2b893e167 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 3 Mar 2019 21:17:48 +0800 Subject: [PATCH 302/855] block: fix segment calculation for passthrough IO blk_recount_segments() can be called in bio_add_pc_page() for calculating how many segments this bio will has after one page is added to this bio. If the resulted segment number is beyond the queue limit, the added page will be removed. The try-and-fix policy requires blk_recount_segments(__blk_recalc_rq_segments) to not consider the segment number limit. Unfortunately bvec_split_segs() does check this limit, and causes small segment number returned to bio_add_pc_page(), then page still may be added to the bio even though segment number limit becomes broken. Fixes this issue by not considering segment number limit when calcualting bio's segment number. Fixes: dcebd755926b ("block: use bio_for_each_bvec() to compute multi-page bvec count") Cc: Christoph Hellwig Cc: Omar Sandoval Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-merge.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 22467f475ab4..1c9d4f0f96ea 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -180,7 +180,7 @@ static unsigned get_max_segment_size(struct request_queue *q, */ static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv, unsigned *nsegs, unsigned *last_seg_size, - unsigned *front_seg_size, unsigned *sectors) + unsigned *front_seg_size, unsigned *sectors, unsigned max_segs) { unsigned len = bv->bv_len; unsigned total_len = 0; @@ -190,7 +190,7 @@ static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv, * Multi-page bvec may be too big to hold in one segment, so the * current bvec has to be splitted as multiple segments. 
*/ - while (len && new_nsegs + *nsegs < queue_max_segments(q)) { + while (len && new_nsegs + *nsegs < max_segs) { seg_size = get_max_segment_size(q, bv->bv_offset + total_len); seg_size = min(seg_size, len); @@ -240,6 +240,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, bool do_split = true; struct bio *new = NULL; const unsigned max_sectors = get_max_io_size(q, bio); + const unsigned max_segs = queue_max_segments(q); bio_for_each_bvec(bv, bio, iter) { /* @@ -254,14 +255,14 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, * Consider this a new segment if we're splitting in * the middle of this vector. */ - if (nsegs < queue_max_segments(q) && + if (nsegs < max_segs && sectors < max_sectors) { /* split in the middle of bvec */ bv.bv_len = (max_sectors - sectors) << 9; bvec_split_segs(q, &bv, &nsegs, &seg_size, &front_seg_size, - §ors); + §ors, max_segs); } goto split; } @@ -283,7 +284,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, continue; } new_segment: - if (nsegs == queue_max_segments(q)) + if (nsegs == max_segs) goto split; bvprv = bv; @@ -296,7 +297,7 @@ new_segment: if (nsegs == 1 && seg_size > front_seg_size) front_seg_size = seg_size; } else if (bvec_split_segs(q, &bv, &nsegs, &seg_size, - &front_seg_size, §ors)) { + &front_seg_size, §ors, max_segs)) { goto split; } } @@ -415,7 +416,7 @@ new_segment: bvprv = bv; prev = 1; bvec_split_segs(q, &bv, &nr_phys_segs, &seg_size, - &front_seg_size, NULL); + &front_seg_size, NULL, UINT_MAX); } bbio = bio; } From 441b7195e2812f2d7a9dafe02f052f642957bd8f Mon Sep 17 00:00:00 2001 From: Erwan Velu Date: Fri, 1 Mar 2019 17:08:06 +0100 Subject: [PATCH 303/855] scsi: smartpqi: Reporting 'logical unit failure' When the HARDWARE_ERROR/0x3e/0x1 case is triggered, the logical volume is offlined. When reading the kernel log, the reason why the device got offlined isn't reported to the user. This situation makes it difficult for admins to root cause. Log a message when this condition occurs. [mkp: tweaked commit message] Signed-off-by: Erwan Velu Acked-by: Don Brace Signed-off-by: Martin K. 
Petersen --- drivers/scsi/smartpqi/smartpqi_init.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/scsi/smartpqi/smartpqi_init.c b/drivers/scsi/smartpqi/smartpqi_init.c index f6eaea2eadd1..c7b023683575 100644 --- a/drivers/scsi/smartpqi/smartpqi_init.c +++ b/drivers/scsi/smartpqi/smartpqi_init.c @@ -2764,6 +2764,12 @@ static void pqi_process_raid_io_error(struct pqi_io_request *io_request) sshdr.sense_key == HARDWARE_ERROR && sshdr.asc == 0x3e && sshdr.ascq == 0x1) { + struct pqi_ctrl_info *ctrl_info = shost_to_hba(scmd->device->host); + struct pqi_scsi_dev *device = scmd->device->hostdata; + + if (printk_ratelimit()) + scmd_printk(KERN_ERR, scmd, "received 'logical unit failure' from controller for scsi %d:%d:%d:%d\n", + ctrl_info->scsi_host->host_no, device->bus, device->target, device->lun); pqi_take_device_offline(scmd->device, "RAID"); host_byte = DID_NO_CONNECT; } From 3722e6a52174d7c3a00e6f5efd006ca093f346c1 Mon Sep 17 00:00:00 2001 From: Felipe Franciosi Date: Wed, 27 Feb 2019 16:10:34 +0000 Subject: [PATCH 304/855] scsi: virtio_scsi: don't send sc payload with tmfs The virtio scsi spec defines struct virtio_scsi_ctrl_tmf as a set of device-readable records and a single device-writable response entry: struct virtio_scsi_ctrl_tmf { // Device-readable part le32 type; le32 subtype; u8 lun[8]; le64 id; // Device-writable part u8 response; } The above should be organised as two descriptor entries (or potentially more if using VIRTIO_F_ANY_LAYOUT), but without any extra data after "le64 id" or after "u8 response". The Linux driver doesn't respect that, with virtscsi_abort() and virtscsi_device_reset() setting cmd->sc before calling virtscsi_tmf(). It results in the original scsi command payload (or writable buffers) added to the tmf. This fixes the problem by leaving cmd->sc zeroed out, which makes virtscsi_kick_cmd() add the tmf to the control vq without any payload. Cc: stable@vger.kernel.org Signed-off-by: Felipe Franciosi Reviewed-by: Paolo Bonzini Signed-off-by: Martin K. Petersen --- drivers/scsi/virtio_scsi.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 1a6f150cd2d8..8af01777d09c 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -586,7 +586,6 @@ static int virtscsi_device_reset(struct scsi_cmnd *sc) return FAILED; memset(cmd, 0, sizeof(*cmd)); - cmd->sc = sc; cmd->req.tmf = (struct virtio_scsi_ctrl_tmf_req){ .type = VIRTIO_SCSI_T_TMF, .subtype = cpu_to_virtio32(vscsi->vdev, @@ -645,7 +644,6 @@ static int virtscsi_abort(struct scsi_cmnd *sc) return FAILED; memset(cmd, 0, sizeof(*cmd)); - cmd->sc = sc; cmd->req.tmf = (struct virtio_scsi_ctrl_tmf_req){ .type = VIRTIO_SCSI_T_TMF, .subtype = VIRTIO_SCSI_T_TMF_ABORT_TASK, From 59d3191f14dc18881fec1172c7096b7863622803 Mon Sep 17 00:00:00 2001 From: Harry Wentland Date: Wed, 30 Jan 2019 15:45:18 -0500 Subject: [PATCH 305/855] drm/amd/display: don't call dm_pp_ function from an fpu block Powerplay functions called from dm_pp_* functions tend to do a mutex_lock which isn't safe to do inside a kernel_fpu_begin/end block as those will disable/enable preemption. Rearrange the dm_pp_get_clock_levels_by_type_with_voltage calls to make sure they happen outside of kernel_fpu_begin/end. 
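The reordered flow, condensed from the hunk below (sketch only, error handling trimmed):

	/* the powerplay query may sleep / take a mutex: do it before the FPU section */
	res = dm_pp_get_clock_levels_by_type_with_voltage(
			ctx, DM_PP_CLOCK_TYPE_FCLK, &fclks);

	kernel_fpu_begin();		/* preemption disabled from here on */
	if (res)
		res = verify_clock_values(&fclks);
	/* ... FPU-only bandwidth math ... */
	kernel_fpu_end();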
Cc: stable@vger.kernel.org Acked-by: Alex Deucher Signed-off-by: Harry Wentland Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c b/drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c index 12d1842079ae..eb62d10bb65c 100644 --- a/drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c +++ b/drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c @@ -1348,12 +1348,12 @@ void dcn_bw_update_from_pplib(struct dc *dc) struct dm_pp_clock_levels_with_voltage fclks = {0}, dcfclks = {0}; bool res; - kernel_fpu_begin(); - /* TODO: This is not the proper way to obtain fabric_and_dram_bandwidth, should be min(fclk, memclk) */ res = dm_pp_get_clock_levels_by_type_with_voltage( ctx, DM_PP_CLOCK_TYPE_FCLK, &fclks); + kernel_fpu_begin(); + if (res) res = verify_clock_values(&fclks); @@ -1372,9 +1372,13 @@ void dcn_bw_update_from_pplib(struct dc *dc) } else BREAK_TO_DEBUGGER(); + kernel_fpu_end(); + res = dm_pp_get_clock_levels_by_type_with_voltage( ctx, DM_PP_CLOCK_TYPE_DCFCLK, &dcfclks); + kernel_fpu_begin(); + if (res) res = verify_clock_values(&dcfclks); From df20af1e270bb6e024d9d009516b1da864d58632 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 3 Jan 2019 03:12:41 +0000 Subject: [PATCH 306/855] um: Remove duplicated include from vector_user.c Remove duplicated include. Signed-off-by: YueHaibing Reviewed-by: Anton Ivanov Acked-by: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/drivers/vector_user.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/um/drivers/vector_user.c b/arch/um/drivers/vector_user.c index d2c17dd74620..b3f7b3ca896d 100644 --- a/arch/um/drivers/vector_user.c +++ b/arch/um/drivers/vector_user.c @@ -16,14 +16,12 @@ #include #include #include -#include #include #include #include #include #include #include -#include #include #include #include @@ -31,7 +29,6 @@ #include #include #include -#include #include "vector_user.h" #define ID_GRE 0 From aea05eb56e47046de1e5b508d40931dca969f1e5 Mon Sep 17 00:00:00 2001 From: Anton Ivanov Date: Tue, 26 Feb 2019 15:55:25 +0000 Subject: [PATCH 307/855] um: Fix for a possible OOPS in ubd initialization If the ubd device failed to allocate a queue during initialization it tried call blk_cleanup_queue resulting in an oops. This patch simplifies the cleanup logic and ensures that blk_queue_cleanup is called only if there is a valid queue. 
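Condensed, the corrected error path never hands an ERR_PTR() to blk_cleanup_queue() (sketch of the unwind order in the hunk below):

	out_cleanup_tags:
		blk_mq_free_tag_set(&ubd_dev->tag_set);
		if (!IS_ERR(ubd_dev->queue))	/* queue may still be an ERR_PTR here */
			blk_cleanup_queue(ubd_dev->queue);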
Reported-by: Dan Carpenter Signed-off-by: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/drivers/ubd_kern.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index a4a41421c5e2..aca09be2373e 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -938,7 +938,7 @@ static int ubd_add(int n, char **error_out) ubd_dev->queue = blk_mq_init_queue(&ubd_dev->tag_set); if (IS_ERR(ubd_dev->queue)) { err = PTR_ERR(ubd_dev->queue); - goto out_cleanup; + goto out_cleanup_tags; } ubd_dev->queue->queuedata = ubd_dev; @@ -968,8 +968,8 @@ out: out_cleanup_tags: blk_mq_free_tag_set(&ubd_dev->tag_set); -out_cleanup: - blk_cleanup_queue(ubd_dev->queue); + if (!(IS_ERR(ubd_dev->queue))) + blk_cleanup_queue(ubd_dev->queue); goto out; } From 2ff717cdc693d532e4560f43858443aea7d88197 Mon Sep 17 00:00:00 2001 From: Cathy Avery Date: Thu, 28 Feb 2019 14:28:24 -0500 Subject: [PATCH 308/855] scsi: target: tcmu: wait for nl reply only if there are listeners or during an add genlmsg_multicast_allns now returns the correct statuses when a message is sent to a listener. However in the case of adding a device we want to wait for the listener otherwise we may miss the the device during startup. Signed-off-by: Cathy Avery Acked-by: Mike Christie Signed-off-by: Martin K. Petersen --- drivers/target/target_core_user.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 1e6d24943565..396c2fa1ec22 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -1764,11 +1764,12 @@ static int tcmu_netlink_event_send(struct tcmu_dev *udev, ret = genlmsg_multicast_allns(&tcmu_genl_family, skb, 0, TCMU_MCGRP_CONFIG, GFP_KERNEL); - /* We don't care if no one is listening */ - if (ret == -ESRCH) - ret = 0; - if (!ret) - ret = tcmu_wait_genl_cmd_reply(udev); + + /* Wait during an add as the listener may not be up yet */ + if (ret == 0 || + (ret == -ESRCH && cmd == TCMU_CMD_ADDED_DEVICE)) + return tcmu_wait_genl_cmd_reply(udev); + return ret; } From 31b6a05f86e690e1818116fd23c3be915cc9d9ed Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Fri, 1 Mar 2019 06:46:28 -0800 Subject: [PATCH 309/855] scsi: megaraid_sas: reduce module load time megaraid_sas takes 1+ seconds to load while waiting for firmware: [2.822603] megaraid_sas 0000:03:00.0: Waiting for FW to come to ready state [3.871003] megaraid_sas 0000:03:00.0: FW now in Ready state This is due to the following loop in megasas_transition_to_ready(), which waits a minimum of 1 second, even though the FW becomes ready in tens of millisecs: /* * The cur_state should not last for more than max_wait secs */ for (i = 0; i < max_wait; i++) { ... msleep(1000); ... dev_info(&instance->pdev->dev, "FW now in Ready state\n"); This is a regression, caused by a change of the msleep granularity from 1 to 1000 due to concern about waiting too long on systems with coarse jiffies. To fix, increase iterations and use msleep(20), which results in: [2.670627] megaraid_sas 0000:03:00.0: Waiting for FW to come to ready state [2.739386] megaraid_sas 0000:03:00.0: FW now in Ready state Fixes: fb2f3e96d80f ("scsi: megaraid_sas: Fix msleep granularity") Signed-off-by: Steve Sistare Acked-by: Sumit Saxena Signed-off-by: Martin K. 
Petersen --- drivers/scsi/megaraid/megaraid_sas_base.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 8a2ee4cf26b0..387fb930f799 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -3924,12 +3924,12 @@ megasas_transition_to_ready(struct megasas_instance *instance, int ocr) /* * The cur_state should not last for more than max_wait secs */ - for (i = 0; i < max_wait; i++) { + for (i = 0; i < max_wait * 50; i++) { curr_abs_state = instance->instancet-> read_fw_status_reg(instance); if (abs_state == curr_abs_state) { - msleep(1000); + msleep(20); } else break; } From db0f166e9a37215b15d5d732c98fa15219adccf0 Mon Sep 17 00:00:00 2001 From: Bill Kuzeja Date: Mon, 4 Mar 2019 08:25:46 -0500 Subject: [PATCH 310/855] scsi: qla2xxx: Fix panic in qla_dfs_tgt_counters_show When trying to display tgt_counters in the debugfs, a panic can result. There is no null check for qpair after it is assigned in the for-loop. Unless vha->hw->queue_pair_map array is completely filled with entries, the system will panic dereferencing a null pointer. Signed-off-by: Bill Kuzeja Acked-by: Himanshu Madhani Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_dfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/qla2xxx/qla_dfs.c b/drivers/scsi/qla2xxx/qla_dfs.c index ead17288e2a7..5819a45ac5ef 100644 --- a/drivers/scsi/qla2xxx/qla_dfs.c +++ b/drivers/scsi/qla2xxx/qla_dfs.c @@ -193,6 +193,8 @@ qla_dfs_tgt_counters_show(struct seq_file *s, void *unused) for (i = 0; i < vha->hw->max_qpairs; i++) { qpair = vha->hw->queue_pair_map[i]; + if (!qpair) + continue; qla_core_sbt_cmd += qpair->tgt_counters.qla_core_sbt_cmd; core_qla_que_buf += qpair->tgt_counters.core_qla_que_buf; qla_core_ret_ctio += qpair->tgt_counters.qla_core_ret_ctio; From 176eb927744201ba95ca9a85c2f9af457559374f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 4 Mar 2019 20:39:11 +0100 Subject: [PATCH 311/855] scsi: ufs: hisi: fix ufs_hba_variant_ops passing Without CONFIG_OF, the of_match_node() helper does not evaluate its argument, and the compiler warns about the unused variable: drivers/scsi/ufs/ufs-hisi.c: In function 'ufs_hisi_probe': drivers/scsi/ufs/ufs-hisi.c:673:17: error: unused variable 'dev' [-Werror=unused-variable] Rework this code to pass the data directly, and while we're at it, correctly handle the const pointers. Fixes: 653fcb07d95e ("scsi: ufs: Add HI3670 SoC UFS driver support") Signed-off-by: Arnd Bergmann Reviewed-by: Avri Altman Signed-off-by: Martin K. 
Petersen --- drivers/scsi/ufs/ufs-hisi.c | 11 ++++------- drivers/scsi/ufs/ufshcd-pltfrm.c | 2 +- drivers/scsi/ufs/ufshcd-pltfrm.h | 2 +- drivers/scsi/ufs/ufshcd.h | 2 +- 4 files changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/scsi/ufs/ufs-hisi.c b/drivers/scsi/ufs/ufs-hisi.c index f2d3df357a97..0e855b5afe82 100644 --- a/drivers/scsi/ufs/ufs-hisi.c +++ b/drivers/scsi/ufs/ufs-hisi.c @@ -640,7 +640,7 @@ static int ufs_hi3670_init(struct ufs_hba *hba) return 0; } -static struct ufs_hba_variant_ops ufs_hba_hi3660_vops = { +static const struct ufs_hba_variant_ops ufs_hba_hi3660_vops = { .name = "hi3660", .init = ufs_hi3660_init, .link_startup_notify = ufs_hisi_link_startup_notify, @@ -649,7 +649,7 @@ static struct ufs_hba_variant_ops ufs_hba_hi3660_vops = { .resume = ufs_hisi_resume, }; -static struct ufs_hba_variant_ops ufs_hba_hi3670_vops = { +static const struct ufs_hba_variant_ops ufs_hba_hi3670_vops = { .name = "hi3670", .init = ufs_hi3670_init, .link_startup_notify = ufs_hisi_link_startup_notify, @@ -669,13 +669,10 @@ MODULE_DEVICE_TABLE(of, ufs_hisi_of_match); static int ufs_hisi_probe(struct platform_device *pdev) { const struct of_device_id *of_id; - struct ufs_hba_variant_ops *vops; - struct device *dev = &pdev->dev; - of_id = of_match_node(ufs_hisi_of_match, dev->of_node); - vops = (struct ufs_hba_variant_ops *)of_id->data; + of_id = of_match_node(ufs_hisi_of_match, pdev->dev.of_node); - return ufshcd_pltfrm_init(pdev, vops); + return ufshcd_pltfrm_init(pdev, of_id->data); } static int ufs_hisi_remove(struct platform_device *pdev) diff --git a/drivers/scsi/ufs/ufshcd-pltfrm.c b/drivers/scsi/ufs/ufshcd-pltfrm.c index 895a9b5ac989..27213676329c 100644 --- a/drivers/scsi/ufs/ufshcd-pltfrm.c +++ b/drivers/scsi/ufs/ufshcd-pltfrm.c @@ -297,7 +297,7 @@ static void ufshcd_init_lanes_per_dir(struct ufs_hba *hba) * Returns 0 on success, non-zero value on failure */ int ufshcd_pltfrm_init(struct platform_device *pdev, - struct ufs_hba_variant_ops *vops) + const struct ufs_hba_variant_ops *vops) { struct ufs_hba *hba; void __iomem *mmio_base; diff --git a/drivers/scsi/ufs/ufshcd-pltfrm.h b/drivers/scsi/ufs/ufshcd-pltfrm.h index df64c4180340..1f29e1fd6d52 100644 --- a/drivers/scsi/ufs/ufshcd-pltfrm.h +++ b/drivers/scsi/ufs/ufshcd-pltfrm.h @@ -17,7 +17,7 @@ #include "ufshcd.h" int ufshcd_pltfrm_init(struct platform_device *pdev, - struct ufs_hba_variant_ops *vops); + const struct ufs_hba_variant_ops *vops); void ufshcd_pltfrm_shutdown(struct platform_device *pdev); #ifdef CONFIG_PM diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h index 69ba7445d2b3..ecfa898b9ccc 100644 --- a/drivers/scsi/ufs/ufshcd.h +++ b/drivers/scsi/ufs/ufshcd.h @@ -546,7 +546,7 @@ struct ufs_hba { int nutrs; int nutmrs; u32 ufs_version; - struct ufs_hba_variant_ops *vops; + const struct ufs_hba_variant_ops *vops; void *priv; unsigned int irq; bool is_irq_enabled; From cda7fa18653c47865d2a048d5bfa211d464b23e1 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 4 Mar 2019 15:15:43 -0800 Subject: [PATCH 312/855] scsi: lpfc: Correct __lpfc_sli_issue_iocb_s4 lockdep check The outer routine lpfc_sli_issue_iocb(), which decomposes into the SLI3 (s3) or SLI4 (s4) subroutines takes out the locks. For s3, it takes out the hbalock. For s4, it takes out the ring_lock. The lockdep check in the s3 and s4 subroutines both check hbalock, which is incorrect for s4. Revise the s4 subroutine to lockdep check the ring_lock. 
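Schematically (the caller is shown in simplified, assumed form, not the exact code):

	/* SLI4 issue path: the caller serializes on the per-ring lock ... */
	spin_lock_irqsave(&pring->ring_lock, iflags);
	rc = __lpfc_sli_issue_iocb_s4(phba, ring_number, piocb, flag);
	spin_unlock_irqrestore(&pring->ring_lock, iflags);

	/* ... so the subroutine must assert that same lock, not the hbalock: */
	lockdep_assert_held(&pring->ring_lock);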
Reported-by: Bart Van Assche Signed-off-by: Dick Kennedy Signed-off-by: James Smart Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_sli.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index 62957d682d3f..f29f7cbb959d 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -9881,7 +9881,7 @@ __lpfc_sli_issue_iocb_s4(struct lpfc_hba *phba, uint32_t ring_number, * The WQE can be either 64 or 128 bytes, */ - lockdep_assert_held(&phba->hbalock); + lockdep_assert_held(&pring->ring_lock); if (piocb->sli4_xritag == NO_XRI) { if (piocb->iocb.ulpCommand == CMD_ABORT_XRI_CN || From 1ffdd2c0440dfda533ca9477a89550c9f48f4b35 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 4 Mar 2019 15:27:51 -0800 Subject: [PATCH 313/855] scsi: lpfc: resolve static checker warning in lpfc_sli4_hba_unset The patch that replaced io channels for hdw_queues now reports the following static checker warning: drivers/scsi/lpfc/lpfc_init.c:11136 lpfc_sli4_hba_unset() error: we previously assumed 'phba->pport' could be null (see line 11074) Resolve by adding a pport NULL check. [mkp: tag tweak] Fixes: cdb42becdd40 ("scsi: lpfc: Replace io_channels for nvme and fcp with general hdw_queues per cpu"_ Reported-by: Dan Carpenter Signed-off-by: Dick Kennedy Signed-off-by: James Smart Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_init.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index 3eb04c3be300..cd09b2120abd 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -11132,7 +11132,8 @@ lpfc_sli4_hba_unset(struct lpfc_hba *phba) lpfc_sli4_ras_dma_free(phba); /* Stop the SLI4 device port */ - phba->pport->work_port_events = 0; + if (phba->pport) + phba->pport->work_port_events = 0; } /** From 038d710fca5bb149d3af2e0b71f1284f8430a979 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 4 Mar 2019 20:39:10 +0100 Subject: [PATCH 314/855] scsi: qla2xxx: avoid printf format warning Depending on the target architecture and configuration, both phys_addr_t and dma_addr_t may be smaller than 'long long', so we get a warning when printing either of them using the %llx format string: drivers/scsi/qla2xxx/qla_iocb.c: In function 'qla24xx_walk_and_build_prot_sglist': drivers/scsi/qla2xxx/qla_iocb.c:1140:46: error: format '%llx' expects argument of type 'long long unsigned int', but argument 6 has type 'dma_addr_t' {aka 'unsigned int'} [-Werror=format=] "%s: page boundary crossing (phys=%llx len=%x)\n", ~~~^ %x __func__, sle_phys, sg->length); ~~~~~~~~ drivers/scsi/qla2xxx/qla_iocb.c:1180:29: error: format '%llx' expects argument of type 'long long unsigned int', but argument 7 has type 'dma_addr_t' {aka 'unsigned int'} [-Werror=format=] "%s: sg[%x] (phys=%llx sglen=%x) ldma_sg_len: %x dif_bundl_len: %x ldma_needed: %x\n", ~~~^ There are special %pad and %pap format strings in Linux that we could use here, but since the driver already does 64-bit arithmetic on the values, using a plain 'u64' seems more consistent here. Note: A possible related issue may be that the driver possibly checks the wrong kind of overflow: when an IOMMU is in use, buffers that cross a 32-bit boundary in physical addresses would still be mapped into dma addresses within the low 4GB space, so I suspect that we actually want to check sg_dma_address() instead of sg_phys() here. 
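For reference, both portable options look like this (illustrative snippet; the surrounding variables are assumed):

	phys_addr_t phys = sg_phys(sg);
	dma_addr_t dma = sg_dma_address(sg);

	pr_info("phys=%pap dma=%pad\n", &phys, &dma);		/* dedicated specifiers take a pointer */
	pr_info("phys=%llx\n", (unsigned long long)phys);	/* or cast explicitly, as this patch does via u64 */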
Fixes: 50b812755e97 ("scsi: qla2xxx: Fix DMA error when the DIF sg buffer crosses 4GB boundary") Signed-off-by: Arnd Bergmann Acked-by: Himanshu Madhani Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_iocb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_iocb.c b/drivers/scsi/qla2xxx/qla_iocb.c index 63f8e3c19841..456a41d2e2c6 100644 --- a/drivers/scsi/qla2xxx/qla_iocb.c +++ b/drivers/scsi/qla2xxx/qla_iocb.c @@ -1132,7 +1132,7 @@ qla24xx_walk_and_build_prot_sglist(struct qla_hw_data *ha, srb_t *sp, /* if initiator doing write or target doing read */ if (direction_to_device) { for_each_sg(sgl, sg, tot_dsds, i) { - dma_addr_t sle_phys = sg_phys(sg); + u64 sle_phys = sg_phys(sg); /* If SGE addr + len flips bits in upper 32-bits */ if (MSD(sle_phys + sg->length) ^ MSD(sle_phys)) { @@ -1178,7 +1178,7 @@ qla24xx_walk_and_build_prot_sglist(struct qla_hw_data *ha, srb_t *sp, ql_dbg(ql_dbg_tgt + ql_dbg_verbose, vha, 0xe023, "%s: sg[%x] (phys=%llx sglen=%x) ldma_sg_len: %x dif_bundl_len: %x ldma_needed: %x\n", - __func__, i, sg_phys(sg), sglen, ldma_sg_len, + __func__, i, (u64)sg_phys(sg), sglen, ldma_sg_len, difctx->dif_bundl_len, ldma_needed); while (sglen) { From 08b11eaccfcf86a3bac6755625d933ac15ccc27a Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 21 Feb 2019 18:23:17 +0300 Subject: [PATCH 315/855] scsi: libiscsi: fall back to sendmsg for slab pages In "XFS over network block device" scenario XFS can create IO requests with slab-based XFS metadata. During processing such requests tcp_sendpage() can merge skb fragments with neighbour slab objects. If receiving side is located on the same host tcp_recvmsg() can trigger BUG_ON in hardening check and crash the host with following message: usercopy: kernel memory exposure attempt detected from XXXXXXXX (kmalloc-512) (1024 bytes) This patch redirect such requests from sednpage to sendmsg path. The problem is similar to one described in recent commit 7e241f647dc7 ("libceph: fall back to sendmsg for slab pages") Signed-off-by: Vasily Averin Acked-by: Chris Leech Signed-off-by: Martin K. Petersen --- drivers/scsi/libiscsi_tcp.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/libiscsi_tcp.c b/drivers/scsi/libiscsi_tcp.c index 9923e9e3b884..c3fe3f3a78f5 100644 --- a/drivers/scsi/libiscsi_tcp.c +++ b/drivers/scsi/libiscsi_tcp.c @@ -129,12 +129,17 @@ static void iscsi_tcp_segment_map(struct iscsi_segment *segment, int recv) BUG_ON(sg->length == 0); /* + * We always map for the recv path. + * * If the page count is greater than one it is ok to send * to the network layer's zero copy send path. If not we - * have to go the slow sendmsg path. We always map for the - * recv path. + * have to go the slow sendmsg path. + * + * Same goes for slab pages: skb_can_coalesce() allows + * coalescing neighboring slab objects into a single frag which + * triggers one of hardened usercopy checks. */ - if (page_count(sg_page(sg)) >= 1 && !recv) + if (!recv && page_count(sg_page(sg)) >= 1 && !PageSlab(sg_page(sg))) return; if (recv) { From 98effe4746c6e64b76703bbf5fd64e304e3e5396 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 4 Mar 2019 11:44:34 +0200 Subject: [PATCH 316/855] scsi: target: tcmu: Switch to bitmap_zalloc() Switch to bitmap_zalloc() to show clearly what we are allocating. Besides that it returns pointer of bitmap type instead of opaque void *. 
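Usage in a nutshell, mirroring the change below (sketch only):

	unsigned long *bits = bitmap_zalloc(udev->max_blocks, GFP_KERNEL);
	if (!bits)
		return -ENOMEM;
	/* ... set_bit()/test_bit()/find_first_zero_bit() operate on it as before ... */
	bitmap_free(bits);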
Signed-off-by: Andy Shevchenko Reviewed-by: Bart Van Assche Acked-by: Mike Christie Signed-off-by: Martin K. Petersen --- drivers/target/target_core_user.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 396c2fa1ec22..4eeeaaf3796b 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -1633,7 +1633,7 @@ static void tcmu_dev_kref_release(struct kref *kref) WARN_ON(!all_expired); tcmu_blocks_release(&udev->data_blocks, 0, udev->dbi_max + 1); - kfree(udev->data_bitmap); + bitmap_free(udev->data_bitmap); mutex_unlock(&udev->cmdr_lock); call_rcu(&dev->rcu_head, tcmu_dev_call_rcu); @@ -1841,9 +1841,7 @@ static int tcmu_configure_device(struct se_device *dev) info = &udev->uio_info; mutex_lock(&udev->cmdr_lock); - udev->data_bitmap = kcalloc(BITS_TO_LONGS(udev->max_blocks), - sizeof(unsigned long), - GFP_KERNEL); + udev->data_bitmap = bitmap_zalloc(udev->max_blocks, GFP_KERNEL); mutex_unlock(&udev->cmdr_lock); if (!udev->data_bitmap) { ret = -ENOMEM; @@ -1930,7 +1928,7 @@ err_register: vfree(udev->mb_addr); udev->mb_addr = NULL; err_vzalloc: - kfree(udev->data_bitmap); + bitmap_free(udev->data_bitmap); udev->data_bitmap = NULL; err_bitmap_alloc: kfree(info->name); From 352b205a3bac6bb68f27228768a5c12294e650bf Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 4 Mar 2019 20:39:08 +0100 Subject: [PATCH 317/855] scsi: lpfc: fix unused variable warning The newly introduced 'cpu' variable is only used inside of an optional block, so we get a warning without CONFIG_SCSI_LPFC_DEBUG_FS: drivers/scsi/lpfc/lpfc_nvme.c: In function 'lpfc_nvme_io_cmd_wqe_cmpl': drivers/scsi/lpfc/lpfc_nvme.c:968:30: error: unused variable 'cpu' [-Werror=unused-variable] uint32_t code, status, idx, cpu; Move the declaration into the same block to avoid the warning. Fixes: 63df6d637e33 ("scsi: lpfc: Adapt cpucheck debugfs logic to Hardware Queues") Signed-off-by: Arnd Bergmann Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_nvme.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/lpfc/lpfc_nvme.c b/drivers/scsi/lpfc/lpfc_nvme.c index e73895674f39..4aa5c004785e 100644 --- a/drivers/scsi/lpfc/lpfc_nvme.c +++ b/drivers/scsi/lpfc/lpfc_nvme.c @@ -964,7 +964,7 @@ lpfc_nvme_io_cmd_wqe_cmpl(struct lpfc_hba *phba, struct lpfc_iocbq *pwqeIn, struct lpfc_nodelist *ndlp; struct lpfc_nvme_fcpreq_priv *freqpriv; struct lpfc_nvme_lport *lport; - uint32_t code, status, idx, cpu; + uint32_t code, status, idx; uint16_t cid, sqhd, data; uint32_t *ptr; @@ -1137,6 +1137,7 @@ out_err: lpfc_nvme_ktime(phba, lpfc_ncmd); } if (phba->cpucheck_on & LPFC_CHECK_NVME_IO) { + uint32_t cpu; idx = lpfc_ncmd->cur_iocbq.hba_wqidx; cpu = smp_processor_id(); if (cpu < LPFC_CHECK_CPU_CNT) { From f996861be108f6152d60d7357301697c3309c77f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 4 Mar 2019 20:39:09 +0100 Subject: [PATCH 318/855] scsi: lpfc: fix 32-bit format string warning On 32-bit architectures, we see a warning when %ld is used to print a size_t: In file included from drivers/scsi/lpfc/lpfc_init.c:62: drivers/scsi/lpfc/lpfc_init.c: In function 'lpfc_new_io_buf': drivers/scsi/lpfc/lpfc_logmsg.h:62:45: error: format '%ld' expects argument of type 'long int', but argument 5 has type 'unsigned int' [-Werror=format=] This is harmless, but portable code should just use %zd to avoid the warning. 
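For example (illustrative line only):

	pr_info("lpfc_io_buf is %zu bytes\n", sizeof(struct lpfc_io_buf));	/* %zu/%zd match size_t/ssize_t on every architecture */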
Fixes: 0794d601d174 ("scsi: lpfc: Implement common IO buffers between NVME and SCSI") Signed-off-by: Arnd Bergmann Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index cd09b2120abd..707dbe7a8d15 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -4090,7 +4090,7 @@ lpfc_new_io_buf(struct lpfc_hba *phba, int num_to_alloc) /* Sanity check to ensure our sizing is right for both SCSI and NVME */ if (sizeof(struct lpfc_io_buf) > LPFC_COMMON_IO_BUF_SZ) { lpfc_printf_log(phba, KERN_ERR, LOG_FCP, - "6426 Common buffer size %ld exceeds %d\n", + "6426 Common buffer size %zd exceeds %d\n", sizeof(struct lpfc_io_buf), LPFC_COMMON_IO_BUF_SZ); return 0; From b8870ec63676aba1d823f0b36c5f7e9929e57d23 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 20 Feb 2019 08:39:13 +0300 Subject: [PATCH 319/855] scsi: qla2xxx: check for kstrtol() failure The error handling was unintentionally left out so it introduces a Smatch static checker warning: drivers/scsi/qla2xxx/qla_attr.c:1655 qla2x00_port_speed_store() error: uninitialized symbol 'type'. Fixes: a7b9ca7fc87a ("scsi: qla2xxx: Add support for setting port speed") Signed-off-by: Dan Carpenter Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_attr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/qla2xxx/qla_attr.c b/drivers/scsi/qla2xxx/qla_attr.c index f8fd482a06e0..41734efe4bf9 100644 --- a/drivers/scsi/qla2xxx/qla_attr.c +++ b/drivers/scsi/qla2xxx/qla_attr.c @@ -1652,6 +1652,8 @@ qla2x00_port_speed_store(struct device *dev, struct device_attribute *attr, } rval = kstrtol(buf, 10, &type); + if (rval) + return rval; speed = type; if (type == 40 || type == 80 || type == 160 || type == 320) { From fba770c6682447f1998267f4143595e77be4f112 Mon Sep 17 00:00:00 2001 From: Xiang Chen Date: Thu, 28 Feb 2019 22:50:57 +0800 Subject: [PATCH 320/855] scsi: hisi_sas: Change return variable type in phy_up_v3_hw() According to the tool fortify, phy_up_v3_hw() returns signed value, while it should return an unsigned value. So change variable "res" from int to irq_return_t. Signed-off-by: Xiang Chen Signed-off-by: John Garry Signed-off-by: Martin K. Petersen --- drivers/scsi/hisi_sas/hisi_sas_v3_hw.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c index 00738d0673fe..151a102c4cc7 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c @@ -1344,7 +1344,8 @@ static void prep_abort_v3_hw(struct hisi_hba *hisi_hba, static irqreturn_t phy_up_v3_hw(int phy_no, struct hisi_hba *hisi_hba) { - int i, res; + int i; + irqreturn_t res; u32 context, port_id, link_rate; struct hisi_sas_phy *phy = &hisi_hba->phy[phy_no]; struct asd_sas_phy *sas_phy = &phy->sas_phy; From 4790595723d4b833b18c994973d39f9efb842887 Mon Sep 17 00:00:00 2001 From: Xiang Chen Date: Thu, 28 Feb 2019 22:50:58 +0800 Subject: [PATCH 321/855] scsi: hisi_sas: Fix a timeout race of driver internal and SMP IO For internal IO and SMP IO, there is a time-out timer for them. In the timer handler, it checks whether IO is done according to the flag task->task_state_lock. There is an issue which may cause system suspended: internal IO or SMP IO is sent, but at that time because of hardware exception (such as inject 2Bit ECC error), so IO is not completed and also not timeout. 
But, at that time, the SAS controller reset occurs to recover system. It will release the resource and set the status of IO to be SAS_TASK_STATE_DONE, so when IO timeout, it will never complete the completion of IO and wait for ever. [ 729.123632] Call trace: [ 729.126791] [] __switch_to+0x94/0xa8 [ 729.133106] [] __schedule+0x1e8/0x7fc [ 729.138975] [] schedule+0x34/0x8c [ 729.144401] [] schedule_timeout+0x1d8/0x3cc [ 729.150690] [] wait_for_common+0xdc/0x1a0 [ 729.157101] [] wait_for_completion+0x28/0x34 [ 729.165973] [] hisi_sas_internal_task_abort+0x2a0/0x424 [hisi_sas_test_main] [ 729.176447] [] hisi_sas_abort_task+0x244/0x2d8 [hisi_sas_test_main] [ 729.185258] [] sas_eh_handle_sas_errors+0x1c8/0x7b8 [ 729.192391] [] sas_scsi_recover_host+0x130/0x398 [ 729.199237] [] scsi_error_handler+0x148/0x5c0 [ 729.206009] [] kthread+0x10c/0x138 [ 729.211563] [] ret_from_fork+0x10/0x18 To solve the issue, callback function task_done of those IOs need to be called when on SAS controller reset. Signed-off-by: Xiang Chen Signed-off-by: John Garry Signed-off-by: Martin K. Petersen --- drivers/scsi/hisi_sas/hisi_sas_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c index 923296653ed7..dd03dcbd3786 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_main.c +++ b/drivers/scsi/hisi_sas/hisi_sas_main.c @@ -980,7 +980,8 @@ static void hisi_sas_do_release_task(struct hisi_hba *hisi_hba, struct sas_task spin_lock_irqsave(&task->task_state_lock, flags); task->task_state_flags &= ~(SAS_TASK_STATE_PENDING | SAS_TASK_AT_INITIATOR); - task->task_state_flags |= SAS_TASK_STATE_DONE; + if (!slot->is_internal && task->task_proto != SAS_PROTOCOL_SMP) + task->task_state_flags |= SAS_TASK_STATE_DONE; spin_unlock_irqrestore(&task->task_state_lock, flags); } From aaeb82323d12f3b61015878e035d74600e1f2c85 Mon Sep 17 00:00:00 2001 From: Xiaofei Tan Date: Thu, 28 Feb 2019 22:50:59 +0800 Subject: [PATCH 322/855] scsi: hisi_sas: print PHY RX errors count for later revision of v3 hw The later revision of v3 hw has added an function of interrupt coalesce according to time for PHY RX errors. We set the coalesce time to 1s. Then we print PHY RX errors count when PHY RX errors happen, and don't need to worry that there may be too much log prints. Besides, we use hisi_sas_phy.lock to protect error count value. Because we update them by calling phy_get_events_v3_hw(), which is also used by core driver (for get PHY events function). We relocate phy_get_events_v3_hw() to avoid a further declaration. Signed-off-by: Xiaofei Tan Signed-off-by: John Garry Signed-off-by: Martin K. 
Petersen --- drivers/scsi/hisi_sas/hisi_sas.h | 1 + drivers/scsi/hisi_sas/hisi_sas_v3_hw.c | 84 ++++++++++++++++++-------- 2 files changed, 60 insertions(+), 25 deletions(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas.h b/drivers/scsi/hisi_sas/hisi_sas.h index 6c87bd34509a..515aee9318a4 100644 --- a/drivers/scsi/hisi_sas/hisi_sas.h +++ b/drivers/scsi/hisi_sas/hisi_sas.h @@ -161,6 +161,7 @@ struct hisi_sas_phy { u8 in_reset; u8 reserved[2]; u32 phy_type; + u32 code_violation_err_count; enum sas_linkrate minimum_linkrate; enum sas_linkrate maximum_linkrate; }; diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c index 151a102c4cc7..720721196b12 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c @@ -181,6 +181,8 @@ #define CHL_INT1_DMAC_RX_AXI_RD_ERR_OFF 22 #define CHL_INT2 (PORT_BASE + 0x1bc) #define CHL_INT2_SL_IDAF_TOUT_CONF_OFF 0 +#define CHL_INT2_RX_DISP_ERR_OFF 28 +#define CHL_INT2_RX_CODE_ERR_OFF 29 #define CHL_INT2_RX_INVLD_DW_OFF 30 #define CHL_INT2_STP_LINK_TIMEOUT_OFF 31 #define CHL_INT0_MSK (PORT_BASE + 0x1c0) @@ -544,6 +546,8 @@ static void init_reg_v3_hw(struct hisi_hba *hisi_hba) hisi_sas_phy_write32(hisi_hba, i, STP_LINK_TIMER, 0x7f7a120); hisi_sas_phy_write32(hisi_hba, i, CON_CFG_DRIVER, 0x2a0a01); hisi_sas_phy_write32(hisi_hba, i, SAS_SSP_CON_TIMER_CFG, 0x32); + hisi_sas_phy_write32(hisi_hba, i, SAS_EC_INT_COAL_TIME, + 0x30f4240); /* used for 12G negotiate */ hisi_sas_phy_write32(hisi_hba, i, COARSETUNE_TIME, 0x1e); hisi_sas_phy_write32(hisi_hba, i, AIP_LIMIT, 0x2ffff); @@ -1576,6 +1580,39 @@ static void handle_chl_int1_v3_hw(struct hisi_hba *hisi_hba, int phy_no) hisi_sas_phy_write32(hisi_hba, phy_no, CHL_INT1, irq_value); } +static void phy_get_events_v3_hw(struct hisi_hba *hisi_hba, int phy_no) +{ + struct hisi_sas_phy *phy = &hisi_hba->phy[phy_no]; + struct asd_sas_phy *sas_phy = &phy->sas_phy; + struct sas_phy *sphy = sas_phy->phy; + unsigned long flags; + u32 reg_value; + + spin_lock_irqsave(&phy->lock, flags); + + /* loss dword sync */ + reg_value = hisi_sas_phy_read32(hisi_hba, phy_no, ERR_CNT_DWS_LOST); + sphy->loss_of_dword_sync_count += reg_value; + + /* phy reset problem */ + reg_value = hisi_sas_phy_read32(hisi_hba, phy_no, ERR_CNT_RESET_PROB); + sphy->phy_reset_problem_count += reg_value; + + /* invalid dword */ + reg_value = hisi_sas_phy_read32(hisi_hba, phy_no, ERR_CNT_INVLD_DW); + sphy->invalid_dword_count += reg_value; + + /* disparity err */ + reg_value = hisi_sas_phy_read32(hisi_hba, phy_no, ERR_CNT_DISP_ERR); + sphy->running_disparity_error_count += reg_value; + + /* code violation error */ + reg_value = hisi_sas_phy_read32(hisi_hba, phy_no, ERR_CNT_CODE_ERR); + phy->code_violation_err_count += reg_value; + + spin_unlock_irqrestore(&phy->lock, flags); +} + static void handle_chl_int2_v3_hw(struct hisi_hba *hisi_hba, int phy_no) { u32 irq_msk = hisi_sas_phy_read32(hisi_hba, phy_no, CHL_INT2_MSK); @@ -1583,6 +1620,9 @@ static void handle_chl_int2_v3_hw(struct hisi_hba *hisi_hba, int phy_no) struct hisi_sas_phy *phy = &hisi_hba->phy[phy_no]; struct pci_dev *pci_dev = hisi_hba->pci_dev; struct device *dev = hisi_hba->dev; + static const u32 msk = BIT(CHL_INT2_RX_DISP_ERR_OFF) | + BIT(CHL_INT2_RX_CODE_ERR_OFF) | + BIT(CHL_INT2_RX_INVLD_DW_OFF); irq_value &= ~irq_msk; if (!irq_value) @@ -1603,6 +1643,25 @@ static void handle_chl_int2_v3_hw(struct hisi_hba *hisi_hba, int phy_no) hisi_sas_notify_phy_event(phy, HISI_PHYE_LINK_RESET); } + if (pci_dev->revision > 0x20 && 
(irq_value & msk)) { + struct asd_sas_phy *sas_phy = &phy->sas_phy; + struct sas_phy *sphy = sas_phy->phy; + + phy_get_events_v3_hw(hisi_hba, phy_no); + + if (irq_value & BIT(CHL_INT2_RX_INVLD_DW_OFF)) + dev_info(dev, "phy%d invalid dword cnt: %u\n", phy_no, + sphy->invalid_dword_count); + + if (irq_value & BIT(CHL_INT2_RX_CODE_ERR_OFF)) + dev_info(dev, "phy%d code violation cnt: %u\n", phy_no, + phy->code_violation_err_count); + + if (irq_value & BIT(CHL_INT2_RX_DISP_ERR_OFF)) + dev_info(dev, "phy%d disparity error cnt: %u\n", phy_no, + sphy->running_disparity_error_count); + } + if ((irq_value & BIT(CHL_INT2_RX_INVLD_DW_OFF)) && (pci_dev->revision == 0x20)) { u32 reg_value; @@ -2231,31 +2290,6 @@ static u32 get_phys_state_v3_hw(struct hisi_hba *hisi_hba) return hisi_sas_read32(hisi_hba, PHY_STATE); } -static void phy_get_events_v3_hw(struct hisi_hba *hisi_hba, int phy_no) -{ - struct hisi_sas_phy *phy = &hisi_hba->phy[phy_no]; - struct asd_sas_phy *sas_phy = &phy->sas_phy; - struct sas_phy *sphy = sas_phy->phy; - u32 reg_value; - - /* loss dword sync */ - reg_value = hisi_sas_phy_read32(hisi_hba, phy_no, ERR_CNT_DWS_LOST); - sphy->loss_of_dword_sync_count += reg_value; - - /* phy reset problem */ - reg_value = hisi_sas_phy_read32(hisi_hba, phy_no, ERR_CNT_RESET_PROB); - sphy->phy_reset_problem_count += reg_value; - - /* invalid dword */ - reg_value = hisi_sas_phy_read32(hisi_hba, phy_no, ERR_CNT_INVLD_DW); - sphy->invalid_dword_count += reg_value; - - /* disparity err */ - reg_value = hisi_sas_phy_read32(hisi_hba, phy_no, ERR_CNT_DISP_ERR); - sphy->running_disparity_error_count += reg_value; - -} - static int disable_host_v3_hw(struct hisi_hba *hisi_hba) { struct device *dev = hisi_hba->dev; From efdcad62e7b8a02fcccc5ccca57806dce1482ac8 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 28 Feb 2019 22:51:00 +0800 Subject: [PATCH 323/855] scsi: hisi_sas: Set PHY linkrate when disconnected When the PHY comes down, we currently do not set the negotiated linkrate: root@(none)$ pwd /sys/class/sas_phy/phy-0:0 root@(none)$ more enable 1 root@(none)$ more negotiated_linkrate 12.0 Gbit root@(none)$ echo 0 > enable root@(none)$ more negotiated_linkrate 12.0 Gbit root@(none)$ This patch fixes the driver code to set it properly when the PHY comes down. If the PHY had been enabled, then set unknown; otherwise, flag as disabled. The logical place to set the negotiated linkrate for this scenario is PHY down routine, which is called from the PHY down ISR. However, it is not possible to know if the PHY comes down due to PHY disable or loss of link, as sas_phy.enabled member is not set until after the transport disable routine is complete, which races with the PHY down ISR. As an imperfect solution, use sas_phy_data.enable as the flag to know if the PHY is down due to disable. It's imperfect, as sas_phy_data is internal to libsas. I can't see another way without adding a new field to hisi_sas_phy and managing it, or changing SCSI SAS transport. Signed-off-by: John Garry Signed-off-by: Martin K. 
Petersen --- drivers/scsi/hisi_sas/hisi_sas_main.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c index dd03dcbd3786..d8204bc3931b 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_main.c +++ b/drivers/scsi/hisi_sas/hisi_sas_main.c @@ -10,6 +10,7 @@ */ #include "hisi_sas.h" +#include "../libsas/sas_internal.h" #define DRV_NAME "hisi_sas" #define DEV_IS_GONE(dev) \ @@ -2126,9 +2127,18 @@ static int hisi_sas_write_gpio(struct sas_ha_struct *sha, u8 reg_type, static void hisi_sas_phy_disconnected(struct hisi_sas_phy *phy) { + struct asd_sas_phy *sas_phy = &phy->sas_phy; + struct sas_phy *sphy = sas_phy->phy; + struct sas_phy_data *d = sphy->hostdata; + phy->phy_attached = 0; phy->phy_type = 0; phy->port = NULL; + + if (d->enable) + sphy->negotiated_linkrate = SAS_LINK_RATE_UNKNOWN; + else + sphy->negotiated_linkrate = SAS_PHY_DISABLED; } void hisi_sas_phy_down(struct hisi_hba *hisi_hba, int phy_no, int rdy) From 57dbb2b218eb6b4faa39025e3e5974742a4e8986 Mon Sep 17 00:00:00 2001 From: Xiang Chen Date: Thu, 28 Feb 2019 22:51:01 +0800 Subject: [PATCH 324/855] scsi: hisi_sas: Send HARD RESET to clear the previous affiliation of STP target port If we exchange SAS expander from one SAS controller to other SAS controller without powering it down, the STP target port will maintain previous affiliation and reject all subsequent connection requests from other STP initiator ports with OPEN_REJECT (STP RESOURCES BUSY). To solve this issue, send HARD RESET to clear the previous affiliation of STP target port according to SPL (chapter 6.19.4). We (re-)introduce dev status flag to know if to sleep in NEXUS reset code or not for remote PHYs. The idea is that if the device is being initialised, we don't require the delay, and caller would wait for link to be established, cf. sas_ata_hard_reset(). Co-developed-by: Luo Jiaxing Signed-off-by: Xiang Chen Signed-off-by: John Garry Signed-off-by: Martin K. 
Petersen --- drivers/scsi/hisi_sas/hisi_sas.h | 7 ++++ drivers/scsi/hisi_sas/hisi_sas_main.c | 44 ++++++++++++++++++++++++-- drivers/scsi/hisi_sas/hisi_sas_v2_hw.c | 1 + 3 files changed, 49 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas.h b/drivers/scsi/hisi_sas/hisi_sas.h index 515aee9318a4..9bfa9f12d81e 100644 --- a/drivers/scsi/hisi_sas/hisi_sas.h +++ b/drivers/scsi/hisi_sas/hisi_sas.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -94,6 +95,11 @@ enum { PORT_TYPE_SATA = (1U << 0), }; +enum dev_status { + HISI_SAS_DEV_INIT, + HISI_SAS_DEV_NORMAL, +}; + enum { HISI_SAS_INT_ABT_CMD = 0, HISI_SAS_INT_ABT_DEV = 1, @@ -195,6 +201,7 @@ struct hisi_sas_device { struct hisi_sas_dq *dq; struct list_head list; enum sas_device_type dev_type; + enum dev_status dev_status; int device_id; int sata_idx; spinlock_t lock; /* For protecting slots */ diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c index d8204bc3931b..d12924256964 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_main.c +++ b/drivers/scsi/hisi_sas/hisi_sas_main.c @@ -708,6 +708,7 @@ static struct hisi_sas_device *hisi_sas_alloc_dev(struct domain_device *device) hisi_hba->devices[i].device_id = i; sas_dev = &hisi_hba->devices[i]; + sas_dev->dev_status = HISI_SAS_DEV_INIT; sas_dev->dev_type = device->dev_type; sas_dev->hisi_hba = hisi_hba; sas_dev->sas_device = device; @@ -732,6 +733,8 @@ static int hisi_sas_init_device(struct domain_device *device) struct hisi_sas_tmf_task tmf_task; int retry = HISI_SAS_SRST_ATA_DISK_CNT; struct hisi_hba *hisi_hba = dev_to_hisi_hba(device); + struct device *dev = hisi_hba->dev; + struct sas_phy *local_phy; switch (device->dev_type) { case SAS_END_DEVICE: @@ -747,6 +750,31 @@ static int hisi_sas_init_device(struct domain_device *device) case SAS_SATA_PM: case SAS_SATA_PM_PORT: case SAS_SATA_PENDING: + /* + * send HARD RESET to clear previous affiliation of + * STP target port + */ + local_phy = sas_get_local_phy(device); + if (!scsi_is_sas_phy_local(local_phy)) { + unsigned long deadline = ata_deadline(jiffies, 20000); + struct sata_device *sata_dev = &device->sata_dev; + struct ata_host *ata_host = sata_dev->ata_host; + struct ata_port_operations *ops = ata_host->ops; + struct ata_port *ap = sata_dev->ap; + struct ata_link *link; + unsigned int classes; + + ata_for_each_link(link, ap, EDGE) + rc = ops->hardreset(link, &classes, + deadline); + } + sas_put_local_phy(local_phy); + if (rc) { + dev_warn(dev, "SATA disk hardreset fail: 0x%x\n", + rc); + return rc; + } + while (retry-- > 0) { rc = hisi_sas_softreset_ata_disk(device); if (!rc) @@ -809,6 +837,7 @@ static int hisi_sas_dev_found(struct domain_device *device) rc = hisi_sas_init_device(device); if (rc) goto err_out; + sas_dev->dev_status = HISI_SAS_DEV_NORMAL; return 0; err_out: @@ -1715,20 +1744,23 @@ static int hisi_sas_clear_aca(struct domain_device *device, u8 *lun) static int hisi_sas_debug_I_T_nexus_reset(struct domain_device *device) { struct sas_phy *local_phy = sas_get_local_phy(device); - int rc, reset_type = (device->dev_type == SAS_SATA_DEV || - (device->tproto & SAS_PROTOCOL_STP)) ? 
0 : 1; + struct hisi_sas_device *sas_dev = device->lldd_dev; struct hisi_hba *hisi_hba = dev_to_hisi_hba(device); struct sas_ha_struct *sas_ha = &hisi_hba->sha; struct asd_sas_phy *sas_phy = sas_ha->sas_phy[local_phy->number]; struct hisi_sas_phy *phy = container_of(sas_phy, struct hisi_sas_phy, sas_phy); DECLARE_COMPLETION_ONSTACK(phyreset); + int rc, reset_type; if (scsi_is_sas_phy_local(local_phy)) { phy->in_reset = 1; phy->reset_completion = &phyreset; } + reset_type = (sas_dev->dev_status == HISI_SAS_DEV_INIT || + !dev_is_sata(device)) ? 1 : 0; + rc = sas_phy_reset(local_phy, reset_type); sas_put_local_phy(local_phy); @@ -1744,8 +1776,13 @@ static int hisi_sas_debug_I_T_nexus_reset(struct domain_device *device) /* report PHY down if timed out */ if (!ret) hisi_sas_phy_down(hisi_hba, sas_phy->id, 0); - } else + } else if (sas_dev->dev_status != HISI_SAS_DEV_INIT) { + /* + * If in init state, we rely on caller to wait for link to be + * ready; otherwise, delay. + */ msleep(2000); + } return rc; } @@ -2264,6 +2301,7 @@ int hisi_sas_alloc(struct hisi_hba *hisi_hba) for (i = 0; i < HISI_SAS_MAX_DEVICES; i++) { hisi_hba->devices[i].dev_type = SAS_PHY_UNUSED; hisi_hba->devices[i].device_id = i; + hisi_hba->devices[i].dev_status = HISI_SAS_DEV_INIT; } for (i = 0; i < hisi_hba->queue_count; i++) { diff --git a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c index e40cc6b3b67b..89160ab3efb0 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c @@ -868,6 +868,7 @@ hisi_sas_device *alloc_dev_quirk_v2_hw(struct domain_device *device) hisi_hba->devices[i].device_id = i; sas_dev = &hisi_hba->devices[i]; + sas_dev->dev_status = HISI_SAS_DEV_INIT; sas_dev->dev_type = device->dev_type; sas_dev->hisi_hba = hisi_hba; sas_dev->sas_device = device; From cf9efd5d92365696580a45e0351208eef0ea1562 Mon Sep 17 00:00:00 2001 From: Xiang Chen Date: Thu, 28 Feb 2019 22:51:02 +0800 Subject: [PATCH 325/855] scsi: hisi_sas: Change SERDES_CFG init value to increase reliability of HiLink With default value of register SERDES_CFG, the link is not stable for some special disks when running IO. According to HW guys' suggestion, need to make the bit10~19 value of register SERDES_CFG the max value to increase the reliability of the HiLink. Signed-off-by: Xiang Chen Reviewed-by: Yupeng Zhou Signed-off-by: John Garry Signed-off-by: Martin K. 
Petersen --- drivers/scsi/hisi_sas/hisi_sas_v3_hw.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c index 720721196b12..e2f2c04355b9 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c @@ -129,6 +129,7 @@ #define PHY_CTRL_RESET_MSK (0x1 << PHY_CTRL_RESET_OFF) #define CMD_HDR_PIR_OFF 8 #define CMD_HDR_PIR_MSK (0x1 << CMD_HDR_PIR_OFF) +#define SERDES_CFG (PORT_BASE + 0x1c) #define SL_CFG (PORT_BASE + 0x84) #define AIP_LIMIT (PORT_BASE + 0x90) #define SL_CONTROL (PORT_BASE + 0x94) @@ -525,6 +526,7 @@ static void init_reg_v3_hw(struct hisi_hba *hisi_hba) } hisi_sas_phy_write32(hisi_hba, i, PROG_PHY_LINK_RATE, prog_phy_link_rate); + hisi_sas_phy_write32(hisi_hba, i, SERDES_CFG, 0xffc00); hisi_sas_phy_write32(hisi_hba, i, SAS_RX_TRAIN_TIMER, 0x13e80); hisi_sas_phy_write32(hisi_hba, i, CHL_INT0, 0xffffffff); hisi_sas_phy_write32(hisi_hba, i, CHL_INT1, 0xffffffff); From c6b38fbbde91ee7b072febe4b83022e4850f934f Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Fri, 1 Mar 2019 10:24:59 +0100 Subject: [PATCH 326/855] drm: move i915_kick_out_vgacon to vgaarb Also rename it to vga_remove_vgacon and add kerneldoc text. Signed-off-by: Gerd Hoffmann Reviewed-by: Daniel Vetter Link: http://patchwork.freedesktop.org/patch/msgid/20190301092502.30948-2-kraxel@redhat.com --- drivers/gpu/drm/i915/i915_drv.c | 35 +---------------------- drivers/gpu/vga/vgaarb.c | 49 +++++++++++++++++++++++++++++++++ include/linux/vgaarb.h | 2 ++ 3 files changed, 52 insertions(+), 34 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index 6630212f2faf..9df65d386d11 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -757,39 +757,6 @@ static int i915_kick_out_firmware_fb(struct drm_i915_private *dev_priv) return ret; } -#if !defined(CONFIG_VGA_CONSOLE) -static int i915_kick_out_vgacon(struct drm_i915_private *dev_priv) -{ - return 0; -} -#elif !defined(CONFIG_DUMMY_CONSOLE) -static int i915_kick_out_vgacon(struct drm_i915_private *dev_priv) -{ - return -ENODEV; -} -#else -static int i915_kick_out_vgacon(struct drm_i915_private *dev_priv) -{ - int ret = 0; - - DRM_INFO("Replacing VGA console driver\n"); - - console_lock(); - if (con_is_bound(&vga_con)) - ret = do_take_over_console(&dummy_con, 0, MAX_NR_CONSOLES - 1, 1); - if (ret == 0) { - ret = do_unregister_con_driver(&vga_con); - - /* Ignore "already unregistered". */ - if (ret == -ENODEV) - ret = 0; - } - console_unlock(); - - return ret; -} -#endif - static void intel_init_dpio(struct drm_i915_private *dev_priv) { /* @@ -1420,7 +1387,7 @@ static int i915_driver_init_hw(struct drm_i915_private *dev_priv) goto err_ggtt; } - ret = i915_kick_out_vgacon(dev_priv); + ret = vga_remove_vgacon(pdev); if (ret) { DRM_ERROR("failed to remove conflicting VGA console\n"); goto err_ggtt; diff --git a/drivers/gpu/vga/vgaarb.c b/drivers/gpu/vga/vgaarb.c index dc8e039bfab5..f2f3ef8af271 100644 --- a/drivers/gpu/vga/vgaarb.c +++ b/drivers/gpu/vga/vgaarb.c @@ -48,6 +48,8 @@ #include #include #include +#include +#include #include @@ -168,6 +170,53 @@ void vga_set_default_device(struct pci_dev *pdev) vga_default = pci_dev_get(pdev); } +/** + * vga_remove_vgacon - deactivete vga console + * + * Unbind and unregister vgacon in case pdev is the default vga + * device. Can be called by gpu drivers on initialization to make + * sure vga register access done by vgacon will not disturb the + * device. 
+ * + * @pdev: pci device. + */ +#if !defined(CONFIG_VGA_CONSOLE) +int vga_remove_vgacon(struct pci_dev *pdev) +{ + return 0; +} +#elif !defined(CONFIG_DUMMY_CONSOLE) +int vga_remove_vgacon(struct pci_dev *pdev) +{ + return -ENODEV; +} +#else +int vga_remove_vgacon(struct pci_dev *pdev) +{ + int ret = 0; + + if (pdev != vga_default) + return 0; + vgaarb_info(&pdev->dev, "deactivate vga console\n"); + + console_lock(); + if (con_is_bound(&vga_con)) + ret = do_take_over_console(&dummy_con, 0, + MAX_NR_CONSOLES - 1, 1); + if (ret == 0) { + ret = do_unregister_con_driver(&vga_con); + + /* Ignore "already unregistered". */ + if (ret == -ENODEV) + ret = 0; + } + console_unlock(); + + return ret; +} +#endif +EXPORT_SYMBOL(vga_remove_vgacon); + static inline void vga_irq_set_state(struct vga_device *vgadev, bool state) { if (vgadev->irq_set_state) diff --git a/include/linux/vgaarb.h b/include/linux/vgaarb.h index ee162e3e879b..553b34c8b5f7 100644 --- a/include/linux/vgaarb.h +++ b/include/linux/vgaarb.h @@ -125,9 +125,11 @@ extern void vga_put(struct pci_dev *pdev, unsigned int rsrc); #ifdef CONFIG_VGA_ARB extern struct pci_dev *vga_default_device(void); extern void vga_set_default_device(struct pci_dev *pdev); +extern int vga_remove_vgacon(struct pci_dev *pdev); #else static inline struct pci_dev *vga_default_device(void) { return NULL; }; static inline void vga_set_default_device(struct pci_dev *pdev) { }; +static inline int vga_remove_vgacon(struct pci_dev *pdev) { return 0; }; #endif /* From 1c74ca7a1a9ae6ce71032c997a586f061a0029f8 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Fri, 1 Mar 2019 10:25:00 +0100 Subject: [PATCH 327/855] drm/fb-helper: call vga_remove_vgacon automatically. Add vga_remove_vgacon() call to drm_fb_helper_remove_conflicting_pci_framebuffers(). Signed-off-by: Gerd Hoffmann Reviewed-by: Daniel Vetter Link: http://patchwork.freedesktop.org/patch/msgid/20190301092502.30948-3-kraxel@redhat.com --- include/drm/drm_fb_helper.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/include/drm/drm_fb_helper.h b/include/drm/drm_fb_helper.h index bb9acea61369..286d58efed5d 100644 --- a/include/drm/drm_fb_helper.h +++ b/include/drm/drm_fb_helper.h @@ -36,6 +36,7 @@ struct drm_fb_helper; #include #include #include +#include enum mode_set_atomic { LEAVE_ATOMIC_MODE_SET, @@ -642,11 +643,18 @@ drm_fb_helper_remove_conflicting_pci_framebuffers(struct pci_dev *pdev, int resource_id, const char *name) { + int ret = 0; + + /* + * WARNING: Apparently we must kick fbdev drivers before vgacon, + * otherwise the vga fbdev driver falls over. + */ #if IS_REACHABLE(CONFIG_FB) - return remove_conflicting_pci_framebuffers(pdev, resource_id, name); -#else - return 0; + ret = remove_conflicting_pci_framebuffers(pdev, resource_id, name); #endif + if (ret == 0) + ret = vga_remove_vgacon(pdev); + return ret; } #endif From c34674a23d1e8674ac532bf3397333f24a41e7b2 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Fri, 1 Mar 2019 10:25:01 +0100 Subject: [PATCH 328/855] drm/qxl: remove conflicting framebuffers earlier Add error checking while being at it. 
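Condensed, the probe now gives up before touching the device if the takeover fails (sketch of the new ordering in the hunk below):

	ret = drm_fb_helper_remove_conflicting_pci_framebuffers(pdev, 0, "qxl");
	if (ret)
		goto disable_pci;	/* bail out before qxl_device_init(), not after modeset setup */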
Signed-off-by: Gerd Hoffmann Reviewed-by: Daniel Vetter Link: http://patchwork.freedesktop.org/patch/msgid/20190301092502.30948-4-kraxel@redhat.com --- drivers/gpu/drm/qxl/qxl_drv.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/qxl/qxl_drv.c b/drivers/gpu/drm/qxl/qxl_drv.c index 11a76b6c9165..f33e349c4ec5 100644 --- a/drivers/gpu/drm/qxl/qxl_drv.c +++ b/drivers/gpu/drm/qxl/qxl_drv.c @@ -79,6 +79,10 @@ qxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (ret) goto free_dev; + ret = drm_fb_helper_remove_conflicting_pci_framebuffers(pdev, 0, "qxl"); + if (ret) + goto disable_pci; + ret = qxl_device_init(qdev, &qxl_driver, pdev); if (ret) goto disable_pci; @@ -94,7 +98,6 @@ qxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (ret) goto modeset_cleanup; - drm_fb_helper_remove_conflicting_pci_framebuffers(pdev, 0, "qxl"); drm_fbdev_generic_setup(&qdev->ddev, 32); return 0; From cd44bc40a1f1eb4e259889579d599f30b1287828 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 28 Feb 2019 14:31:31 +0100 Subject: [PATCH 329/855] mt76: introduce q->stopped parameter Introduce mt76_queue stopped parameter in order to run ieee80211_wake_queue only when mac80211 queues have been previously stopped and avoid to disable interrupts when it is not necessary Signed-off-by: Lorenzo Bianconi Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/dma.c | 5 ++++- drivers/net/wireless/mediatek/mt76/mt76.h | 1 + drivers/net/wireless/mediatek/mt76/tx.c | 5 ++++- drivers/net/wireless/mediatek/mt76/usb.c | 6 +++++- 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/dma.c b/drivers/net/wireless/mediatek/mt76/dma.c index 6eedc0ec7661..99e341cb1f92 100644 --- a/drivers/net/wireless/mediatek/mt76/dma.c +++ b/drivers/net/wireless/mediatek/mt76/dma.c @@ -180,7 +180,10 @@ mt76_dma_tx_cleanup(struct mt76_dev *dev, enum mt76_txq_id qid, bool flush) else mt76_dma_sync_idx(dev, q); - wake = wake && qid < IEEE80211_NUM_ACS && q->queued < q->ndesc - 8; + wake = wake && q->stopped && + qid < IEEE80211_NUM_ACS && q->queued < q->ndesc - 8; + if (wake) + q->stopped = false; if (!q->queued) wake_up(&dev->tx_wait); diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h index 5dfb0601f101..3152b8c73c4a 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76.h +++ b/drivers/net/wireless/mediatek/mt76/mt76.h @@ -126,6 +126,7 @@ struct mt76_queue { int ndesc; int queued; int buf_size; + bool stopped; u8 buf_offset; u8 hw_idx; diff --git a/drivers/net/wireless/mediatek/mt76/tx.c b/drivers/net/wireless/mediatek/mt76/tx.c index 5a349fe3e576..4d38a54014e8 100644 --- a/drivers/net/wireless/mediatek/mt76/tx.c +++ b/drivers/net/wireless/mediatek/mt76/tx.c @@ -289,8 +289,11 @@ mt76_tx(struct mt76_dev *dev, struct ieee80211_sta *sta, dev->queue_ops->tx_queue_skb(dev, q, skb, wcid, sta); dev->queue_ops->kick(dev, q); - if (q->queued > q->ndesc - 8) + if (q->queued > q->ndesc - 8 && !q->stopped) { ieee80211_stop_queue(dev->hw, skb_get_queue_mapping(skb)); + q->stopped = true; + } + spin_unlock_bh(&q->lock); } EXPORT_SYMBOL_GPL(mt76_tx); diff --git a/drivers/net/wireless/mediatek/mt76/usb.c b/drivers/net/wireless/mediatek/mt76/usb.c index ae6ada370597..4c1abd492405 100644 --- a/drivers/net/wireless/mediatek/mt76/usb.c +++ b/drivers/net/wireless/mediatek/mt76/usb.c @@ -655,7 +655,11 @@ static void mt76u_tx_tasklet(unsigned long data) spin_lock_bh(&q->lock); } 
mt76_txq_schedule(dev, q); - wake = i < IEEE80211_NUM_ACS && q->queued < q->ndesc - 8; + + wake = q->stopped && q->queued < q->ndesc - 8; + if (wake) + q->stopped = false; + if (!q->queued) wake_up(&dev->tx_wait); From fc7801021733b9fbf213ae2bde5dc5e73896a9c7 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 27 Feb 2019 19:38:29 +0100 Subject: [PATCH 330/855] mt76: rewrite dma descriptor base and ring size on queue reset Useful in case the hardware reset clobbers these values Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/dma.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wireless/mediatek/mt76/dma.c b/drivers/net/wireless/mediatek/mt76/dma.c index 99e341cb1f92..76629b98c78d 100644 --- a/drivers/net/wireless/mediatek/mt76/dma.c +++ b/drivers/net/wireless/mediatek/mt76/dma.c @@ -130,6 +130,8 @@ mt76_dma_tx_cleanup_idx(struct mt76_dev *dev, struct mt76_queue *q, int idx, static void mt76_dma_sync_idx(struct mt76_dev *dev, struct mt76_queue *q) { + iowrite32(q->desc_dma, &q->regs->desc_base); + iowrite32(q->ndesc, &q->regs->ring_size); q->head = ioread32(&q->regs->dma_idx); q->tail = q->head; iowrite32(q->head, &q->regs->cpu_idx); From de3c2af15fce23c42407ad0a868ac47df2e7279a Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 27 Feb 2019 19:40:27 +0100 Subject: [PATCH 331/855] mt76: mt76x02: when setting a key, use PN from mac80211 Preparation for full device restart support Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/mt76x02_mac.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_mac.c b/drivers/net/wireless/mediatek/mt76/mt76x02_mac.c index 91ff6598eccf..8109bac5aee6 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02_mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x02_mac.c @@ -73,6 +73,7 @@ int mt76x02_mac_wcid_set_key(struct mt76x02_dev *dev, u8 idx, enum mt76x02_cipher_type cipher; u8 key_data[32]; u8 iv_data[8]; + u64 pn; cipher = mt76x02_mac_get_key_info(key, key_data); if (cipher == MT_CIPHER_NONE && key) @@ -85,9 +86,22 @@ int mt76x02_mac_wcid_set_key(struct mt76x02_dev *dev, u8 idx, if (key) { mt76_rmw_field(dev, MT_WCID_ATTR(idx), MT_WCID_ATTR_PAIRWISE, !!(key->flags & IEEE80211_KEY_FLAG_PAIRWISE)); + + pn = atomic64_read(&key->tx_pn); + iv_data[3] = key->keyidx << 6; - if (cipher >= MT_CIPHER_TKIP) + if (cipher >= MT_CIPHER_TKIP) { iv_data[3] |= 0x20; + put_unaligned_le32(pn >> 16, &iv_data[4]); + } + + if (cipher == MT_CIPHER_TKIP) { + iv_data[0] = (pn >> 8) & 0xff; + iv_data[1] = (iv_data[0] | 0x20) & 0x7f; + iv_data[2] = pn & 0xff; + } else if (cipher >= MT_CIPHER_AES_CCMP) { + put_unaligned_le16((pn & 0xffff), &iv_data[0]); + } } mt76_wr_copy(dev, MT_WCID_IV(idx), iv_data, sizeof(iv_data)); From 004960423fe17dfff93753017b7081dab36c7180 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 27 Feb 2019 19:42:39 +0100 Subject: [PATCH 332/855] mt76: mt76x2: implement full device restart on watchdog reset Restart the firmware and re-initialize the MAC to be able to recover from more kinds of hang states Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/mt76.h | 1 + .../net/wireless/mediatek/mt76/mt76x02_mac.c | 26 ++++++ .../net/wireless/mediatek/mt76/mt76x02_mac.h | 2 + .../net/wireless/mediatek/mt76/mt76x02_mmio.c | 81 +++++++++++++++++-- .../net/wireless/mediatek/mt76/mt76x02_util.c | 4 + .../wireless/mediatek/mt76/mt76x2/mt76x2.h | 1 + .../wireless/mediatek/mt76/mt76x2/pci_init.c | 2 +- 
.../wireless/mediatek/mt76/mt76x2/pci_mcu.c | 21 +++++ drivers/net/wireless/mediatek/mt76/tx.c | 3 + 9 files changed, 132 insertions(+), 9 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h index 3152b8c73c4a..cb50e96e736b 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76.h +++ b/drivers/net/wireless/mediatek/mt76/mt76.h @@ -144,6 +144,7 @@ struct mt76_mcu_ops { const struct mt76_reg_pair *rp, int len); int (*mcu_rd_rp)(struct mt76_dev *dev, u32 base, struct mt76_reg_pair *rp, int len); + int (*mcu_restart)(struct mt76_dev *dev); }; struct mt76_queue_ops { diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_mac.c b/drivers/net/wireless/mediatek/mt76/mt76x02_mac.c index 8109bac5aee6..e1e0c8da5a8c 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02_mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x02_mac.c @@ -67,6 +67,32 @@ int mt76x02_mac_shared_key_setup(struct mt76x02_dev *dev, u8 vif_idx, } EXPORT_SYMBOL_GPL(mt76x02_mac_shared_key_setup); +void mt76x02_mac_wcid_sync_pn(struct mt76x02_dev *dev, u8 idx, + struct ieee80211_key_conf *key) +{ + enum mt76x02_cipher_type cipher; + u8 key_data[32]; + u32 iv, eiv; + u64 pn; + + cipher = mt76x02_mac_get_key_info(key, key_data); + iv = mt76_rr(dev, MT_WCID_IV(idx)); + eiv = mt76_rr(dev, MT_WCID_IV(idx) + 4); + + pn = (u64)eiv << 16; + if (cipher == MT_CIPHER_TKIP) { + pn |= (iv >> 16) & 0xff; + pn |= (iv & 0xff) << 8; + } else if (cipher >= MT_CIPHER_AES_CCMP) { + pn |= iv & 0xffff; + } else { + return; + } + + atomic64_set(&key->tx_pn, pn); +} + + int mt76x02_mac_wcid_set_key(struct mt76x02_dev *dev, u8 idx, struct ieee80211_key_conf *key) { diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_mac.h b/drivers/net/wireless/mediatek/mt76/mt76x02_mac.h index 6b1f25d2f64c..caeeef96c42f 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02_mac.h +++ b/drivers/net/wireless/mediatek/mt76/mt76x02_mac.h @@ -177,6 +177,8 @@ int mt76x02_mac_shared_key_setup(struct mt76x02_dev *dev, u8 vif_idx, u8 key_idx, struct ieee80211_key_conf *key); int mt76x02_mac_wcid_set_key(struct mt76x02_dev *dev, u8 idx, struct ieee80211_key_conf *key); +void mt76x02_mac_wcid_sync_pn(struct mt76x02_dev *dev, u8 idx, + struct ieee80211_key_conf *key); void mt76x02_mac_wcid_setup(struct mt76x02_dev *dev, u8 idx, u8 vif_idx, u8 *mac); void mt76x02_mac_wcid_set_drop(struct mt76x02_dev *dev, u8 idx, bool drop); diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_mmio.c b/drivers/net/wireless/mediatek/mt76/mt76x02_mmio.c index 1229f19f2b02..9de015bcd4f9 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02_mmio.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x02_mmio.c @@ -19,6 +19,7 @@ #include #include "mt76x02.h" +#include "mt76x02_mcu.h" #include "mt76x02_trace.h" struct beacon_bc_data { @@ -418,9 +419,65 @@ static bool mt76x02_tx_hang(struct mt76x02_dev *dev) return i < 4; } +static void mt76x02_key_sync(struct ieee80211_hw *hw, struct ieee80211_vif *vif, + struct ieee80211_sta *sta, + struct ieee80211_key_conf *key, void *data) +{ + struct mt76x02_dev *dev = hw->priv; + struct mt76_wcid *wcid; + + if (!sta) + return; + + wcid = (struct mt76_wcid *) sta->drv_priv; + + if (wcid->hw_key_idx != key->keyidx || wcid->sw_iv) + return; + + mt76x02_mac_wcid_sync_pn(dev, wcid->idx, key); +} + +static void mt76x02_reset_state(struct mt76x02_dev *dev) +{ + int i; + + clear_bit(MT76_STATE_RUNNING, &dev->mt76.state); + + rcu_read_lock(); + + ieee80211_iter_keys_rcu(dev->mt76.hw, NULL, mt76x02_key_sync, NULL); + 
+ for (i = 0; i < ARRAY_SIZE(dev->mt76.wcid); i++) { + struct mt76_wcid *wcid = rcu_dereference(dev->mt76.wcid[i]); + struct mt76x02_sta *msta; + struct ieee80211_sta *sta; + struct ieee80211_vif *vif; + void *priv; + + if (!wcid) + continue; + + priv = msta = container_of(wcid, struct mt76x02_sta, wcid); + sta = container_of(priv, struct ieee80211_sta, drv_priv); + + priv = msta->vif; + vif = container_of(priv, struct ieee80211_vif, drv_priv); + + mt76_sta_state(dev->mt76.hw, vif, sta, + IEEE80211_STA_NONE, IEEE80211_STA_NOTEXIST); + memset(msta, 0, sizeof(*msta)); + } + + rcu_read_unlock(); + + dev->vif_mask = 0; + dev->beacon_mask = 0; +} + static void mt76x02_watchdog_reset(struct mt76x02_dev *dev) { u32 mask = dev->mt76.mmio.irqmask; + bool restart = dev->mt76.mcu_ops->mcu_restart; int i; ieee80211_stop_queues(dev->mt76.hw); @@ -432,6 +489,9 @@ static void mt76x02_watchdog_reset(struct mt76x02_dev *dev) for (i = 0; i < ARRAY_SIZE(dev->mt76.napi); i++) napi_disable(&dev->mt76.napi[i]); + if (restart) + mt76x02_reset_state(dev); + mutex_lock(&dev->mt76.mutex); if (dev->beacon_mask) @@ -452,20 +512,21 @@ static void mt76x02_watchdog_reset(struct mt76x02_dev *dev) /* let fw reset DMA */ mt76_set(dev, 0x734, 0x3); + if (restart) + dev->mt76.mcu_ops->mcu_restart(&dev->mt76); + for (i = 0; i < ARRAY_SIZE(dev->mt76.q_tx); i++) mt76_queue_tx_cleanup(dev, i, true); for (i = 0; i < ARRAY_SIZE(dev->mt76.q_rx); i++) mt76_queue_rx_reset(dev, i); - mt76_wr(dev, MT_MAC_SYS_CTRL, - MT_MAC_SYS_CTRL_ENABLE_TX | MT_MAC_SYS_CTRL_ENABLE_RX); - mt76_set(dev, MT_WPDMA_GLO_CFG, - MT_WPDMA_GLO_CFG_TX_DMA_EN | MT_WPDMA_GLO_CFG_RX_DMA_EN); + mt76x02_mac_start(dev); + if (dev->ed_monitor) mt76_set(dev, MT_TXOP_CTRL_CFG, MT_TXOP_ED_CCA_EN); - if (dev->beacon_mask) + if (dev->beacon_mask && !restart) mt76_set(dev, MT_BEACON_TIME_CFG, MT_BEACON_TIME_CFG_BEACON_TX | MT_BEACON_TIME_CFG_TBTT_EN); @@ -486,9 +547,13 @@ static void mt76x02_watchdog_reset(struct mt76x02_dev *dev) napi_schedule(&dev->mt76.napi[i]); } - ieee80211_wake_queues(dev->mt76.hw); - - mt76_txq_schedule_all(&dev->mt76); + if (restart) { + mt76x02_mcu_function_select(dev, Q_SELECT, 1); + ieee80211_restart_hw(dev->mt76.hw); + } else { + ieee80211_wake_queues(dev->mt76.hw); + mt76_txq_schedule_all(&dev->mt76); + } } static void mt76x02_check_tx_hang(struct mt76x02_dev *dev) diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_util.c b/drivers/net/wireless/mediatek/mt76/mt76x02_util.c index a48c261b0c63..28eccb3119d1 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02_util.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x02_util.c @@ -237,6 +237,8 @@ int mt76x02_sta_add(struct mt76_dev *mdev, struct ieee80211_vif *vif, struct mt76x02_vif *mvif = (struct mt76x02_vif *)vif->drv_priv; int idx = 0; + memset(msta, 0, sizeof(*msta)); + idx = mt76_wcid_alloc(dev->mt76.wcid_mask, ARRAY_SIZE(dev->mt76.wcid)); if (idx < 0) return -ENOSPC; @@ -274,6 +276,8 @@ mt76x02_vif_init(struct mt76x02_dev *dev, struct ieee80211_vif *vif, struct mt76x02_vif *mvif = (struct mt76x02_vif *)vif->drv_priv; struct mt76_txq *mtxq; + memset(mvif, 0, sizeof(*mvif)); + mvif->idx = idx; mvif->group_wcid.idx = MT_VIF_WCID(idx); mvif->group_wcid.hw_key_idx = -1; diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2/mt76x2.h b/drivers/net/wireless/mediatek/mt76/mt76x2/mt76x2.h index 6c619f1c65c9..d7abe3d73bad 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x2/mt76x2.h +++ b/drivers/net/wireless/mediatek/mt76/mt76x2/mt76x2.h @@ -71,6 +71,7 @@ int mt76x2_mcu_load_cr(struct 
mt76x02_dev *dev, u8 type, u8 temp_level, void mt76x2_cleanup(struct mt76x02_dev *dev); +int mt76x2_mac_reset(struct mt76x02_dev *dev, bool hard); void mt76x2_reset_wlan(struct mt76x02_dev *dev, bool enable); void mt76x2_init_txpower(struct mt76x02_dev *dev, struct ieee80211_supported_band *sband); diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2/pci_init.c b/drivers/net/wireless/mediatek/mt76/mt76x2/pci_init.c index 984d9c4c2e1a..d3927a13e92e 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x2/pci_init.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x2/pci_init.c @@ -77,7 +77,7 @@ mt76x2_fixup_xtal(struct mt76x02_dev *dev) } } -static int mt76x2_mac_reset(struct mt76x02_dev *dev, bool hard) +int mt76x2_mac_reset(struct mt76x02_dev *dev, bool hard) { const u8 *macaddr = dev->mt76.macaddr; u32 val; diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2/pci_mcu.c b/drivers/net/wireless/mediatek/mt76/mt76x2/pci_mcu.c index 03e24ae7f66c..605dc66ae83b 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x2/pci_mcu.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x2/pci_mcu.c @@ -165,9 +165,30 @@ error: return -ENOENT; } +static int +mt76pci_mcu_restart(struct mt76_dev *mdev) +{ + struct mt76x02_dev *dev; + int ret; + + dev = container_of(mdev, struct mt76x02_dev, mt76); + + mt76x02_mcu_cleanup(dev); + mt76x2_mac_reset(dev, true); + + ret = mt76pci_load_firmware(dev); + if (ret) + return ret; + + mt76_wr(dev, MT_WPDMA_RST_IDX, ~0); + + return 0; +} + int mt76x2_mcu_init(struct mt76x02_dev *dev) { static const struct mt76_mcu_ops mt76x2_mcu_ops = { + .mcu_restart = mt76pci_mcu_restart, .mcu_send_msg = mt76x02_mcu_msg_send, }; int ret; diff --git a/drivers/net/wireless/mediatek/mt76/tx.c b/drivers/net/wireless/mediatek/mt76/tx.c index 4d38a54014e8..fc7dffe066be 100644 --- a/drivers/net/wireless/mediatek/mt76/tx.c +++ b/drivers/net/wireless/mediatek/mt76/tx.c @@ -580,6 +580,9 @@ void mt76_wake_tx_queue(struct ieee80211_hw *hw, struct ieee80211_txq *txq) struct mt76_txq *mtxq = (struct mt76_txq *) txq->drv_priv; struct mt76_queue *hwq = mtxq->hwq; + if (!test_bit(MT76_STATE_RUNNING, &dev->state)) + return; + spin_lock_bh(&hwq->lock); if (list_empty(&mtxq->list)) list_add_tail(&mtxq->list, &hwq->swq); From 7b25d3b8e485c7721cba9c71b44d1c286e61c8e7 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Thu, 28 Feb 2019 16:11:06 +0100 Subject: [PATCH 333/855] mt76x02: fix hdr pointer in write txwi for USB Since we add txwi at the begining of skb->data, it no longer point to ieee80211_hdr. This breaks settings TS bit for probe response and beacons. 
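Roughly, the ordering issue looks like this (illustrative sketch only; the actual change is the diff below):

    /* old order: skb_push() first, so while writing the txwi skb->data
     * points at the txwi instead of the ieee80211_hdr */
    txwi = skb_push(skb, sizeof(struct mt76x02_txwi));
    mt76x02_mac_write_txwi(dev, txwi, skb, wcid, sta, len);

    /* new order: fill the txwi in the headroom, push it afterwards */
    txwi = (struct mt76x02_txwi *)(skb->data - sizeof(struct mt76x02_txwi));
    mt76x02_mac_write_txwi(dev, txwi, skb, wcid, sta, len);
    skb_push(skb, sizeof(struct mt76x02_txwi));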
Acked-by: Lorenzo Bianconi Signed-off-by: Stanislaw Gruszka Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/mt76x02_usb_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_usb_core.c b/drivers/net/wireless/mediatek/mt76/mt76x02_usb_core.c index 43f07461c8d3..6fb52b596d42 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02_usb_core.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x02_usb_core.c @@ -85,8 +85,9 @@ int mt76x02u_tx_prepare_skb(struct mt76_dev *mdev, void *data, mt76x02_insert_hdr_pad(skb); - txwi = skb_push(skb, sizeof(struct mt76x02_txwi)); + txwi = (struct mt76x02_txwi *)(skb->data - sizeof(struct mt76x02_txwi)); mt76x02_mac_write_txwi(dev, txwi, skb, wcid, sta, len); + skb_push(skb, sizeof(struct mt76x02_txwi)); pid = mt76_tx_status_skb_add(mdev, wcid, skb); txwi->pktid = pid; From 3fd0824a2f800a2870569d385917ae1102647055 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Fri, 1 Mar 2019 16:51:03 +0100 Subject: [PATCH 334/855] mt76: mt76x02: only update the base mac address if necessary Also update the mask first before calculating the vif index. Fixes an issue where adding back the same interfaces in a different order fails because of duplicate vif index use Fixes: 06662264ce2ad ("mt76x02: use mask for vifs") Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/mt76x02_util.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_util.c b/drivers/net/wireless/mediatek/mt76/mt76x02_util.c index 28eccb3119d1..cd072ac614f7 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02_util.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x02_util.c @@ -293,6 +293,12 @@ mt76x02_add_interface(struct ieee80211_hw *hw, struct ieee80211_vif *vif) struct mt76x02_dev *dev = hw->priv; unsigned int idx = 0; + /* Allow to change address in HW if we create first interface. */ + if (!dev->vif_mask && + (((vif->addr[0] ^ dev->mt76.macaddr[0]) & ~GENMASK(4, 1)) || + memcmp(vif->addr + 1, dev->mt76.macaddr + 1, ETH_ALEN - 1))) + mt76x02_mac_setaddr(dev, vif->addr); + if (vif->addr[0] & BIT(1)) idx = 1 + (((dev->mt76.macaddr[0] ^ vif->addr[0]) >> 2) & 7); @@ -315,10 +321,6 @@ mt76x02_add_interface(struct ieee80211_hw *hw, struct ieee80211_vif *vif) if (dev->vif_mask & BIT(idx)) return -EBUSY; - /* Allow to change address in HW if we create first interface. */ - if (!dev->vif_mask && !ether_addr_equal(dev->mt76.macaddr, vif->addr)) - mt76x02_mac_setaddr(dev, vif->addr); - dev->vif_mask |= BIT(idx); mt76x02_vif_init(dev, vif, idx); From a0ac806109277bd865b1048ec521f708b195670b Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sat, 2 Mar 2019 18:19:20 +0100 Subject: [PATCH 335/855] mt76: mt76x02: reduce false positives in ED/CCA tx blocking Full tx blocking (as opposed to CCA blocking) should only happen if there is a continuous non-802.11 signal above the energy detect threshold. Unfortunately the ED/CCA counter can't detect that, as it also counts 802.11 signals as busy. Similar to the vendor code, implement a learning mode that waits until the AGC gain has already been adjusted to the lowest value (due to false CCA events), and the number of false CCA events still remains high, and the blocking threshold is exceeded for more than 5 seconds. 
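In rough outline, the gating added to mt76x02_edcca_check() works like the following simplified sketch (the constants are the MT_EDCCA_LEARN_* values introduced in the patch):

    if (dev->cal.agc_lowest_gain &&                /* gain already at minimum */
        dev->cal.false_cca > MT_EDCCA_LEARN_CCA && /* still lots of false CCA */
        dev->ed_trigger > MT_EDCCA_LEARN_TH) {     /* busy threshold exceeded */
            /* looks like a real blocker: leave learning mode for a while */
            dev->ed_monitor_learning = false;
            dev->ed_trigger_timeout = jiffies + MT_EDCCA_LEARN_TIMEOUT;
    }

    if (dev->ed_monitor_learning)
            return;        /* while learning, never block tx on ED/CCA */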
Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/mt76x02.h | 3 +++ .../net/wireless/mediatek/mt76/mt76x02_mac.c | 25 ++++++++++++++++--- .../net/wireless/mediatek/mt76/mt76x02_phy.c | 2 ++ 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02.h b/drivers/net/wireless/mediatek/mt76/mt76x02.h index 6915cce5def9..42cc9da68f7b 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02.h +++ b/drivers/net/wireless/mediatek/mt76/mt76x02.h @@ -51,6 +51,7 @@ struct mt76x02_calibration { u16 false_cca; s8 avg_rssi_all; s8 agc_gain_adjust; + s8 agc_lowest_gain; s8 low_gain; s8 temp_vco; @@ -114,8 +115,10 @@ struct mt76x02_dev { struct mt76x02_dfs_pattern_detector dfs_pd; /* edcca monitor */ + unsigned long ed_trigger_timeout; bool ed_tx_blocked; bool ed_monitor; + u8 ed_monitor_learning; u8 ed_trigger; u8 ed_silent; ktime_t ed_time; diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_mac.c b/drivers/net/wireless/mediatek/mt76/mt76x02_mac.c index e1e0c8da5a8c..9ed231abe916 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02_mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x02_mac.c @@ -960,6 +960,7 @@ void mt76x02_edcca_init(struct mt76x02_dev *dev, bool enable) } } mt76x02_edcca_tx_enable(dev, true); + dev->ed_monitor_learning = true; /* clear previous CCA timer value */ mt76_rr(dev, MT_ED_CCA_TIMER); @@ -969,6 +970,10 @@ EXPORT_SYMBOL_GPL(mt76x02_edcca_init); #define MT_EDCCA_TH 92 #define MT_EDCCA_BLOCK_TH 2 +#define MT_EDCCA_LEARN_TH 50 +#define MT_EDCCA_LEARN_CCA 180 +#define MT_EDCCA_LEARN_TIMEOUT (20 * HZ) + static void mt76x02_edcca_check(struct mt76x02_dev *dev) { ktime_t cur_time; @@ -991,11 +996,23 @@ static void mt76x02_edcca_check(struct mt76x02_dev *dev) dev->ed_trigger = 0; } - if (dev->ed_trigger > MT_EDCCA_BLOCK_TH && - !dev->ed_tx_blocked) + if (dev->cal.agc_lowest_gain && + dev->cal.false_cca > MT_EDCCA_LEARN_CCA && + dev->ed_trigger > MT_EDCCA_LEARN_TH) { + dev->ed_monitor_learning = false; + dev->ed_trigger_timeout = jiffies + 20 * HZ; + } else if (!dev->ed_monitor_learning && + time_is_after_jiffies(dev->ed_trigger_timeout)) { + dev->ed_monitor_learning = true; + mt76x02_edcca_tx_enable(dev, true); + } + + if (dev->ed_monitor_learning) + return; + + if (dev->ed_trigger > MT_EDCCA_BLOCK_TH && !dev->ed_tx_blocked) mt76x02_edcca_tx_enable(dev, false); - else if (dev->ed_silent > MT_EDCCA_BLOCK_TH && - dev->ed_tx_blocked) + else if (dev->ed_silent > MT_EDCCA_BLOCK_TH && dev->ed_tx_blocked) mt76x02_edcca_tx_enable(dev, true); } diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_phy.c b/drivers/net/wireless/mediatek/mt76/mt76x02_phy.c index a020c757ba5c..a54b63a96eae 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02_phy.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x02_phy.c @@ -194,6 +194,8 @@ bool mt76x02_phy_adjust_vga_gain(struct mt76x02_dev *dev) ret = true; } + dev->cal.agc_lowest_gain = dev->cal.agc_gain_adjust >= limit; + return ret; } EXPORT_SYMBOL_GPL(mt76x02_phy_adjust_vga_gain); From 7635276989a183bdb424f9e930f836b6264d54dc Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sun, 3 Mar 2019 11:06:19 +0100 Subject: [PATCH 336/855] mt76: mt7603: fix tx status HT rate validation Use the correct variable in the check. Fixes an uninitialized variable warning Reported-by: Gustavo A. R. 
Silva Fixes: c8846e1015022 ("mt76: add driver for MT7603E and MT7628/7688") Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/mt7603/mac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7603/mac.c b/drivers/net/wireless/mediatek/mt76/mt7603/mac.c index 0a0115861b51..5e31d7da96fc 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7603/mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7603/mac.c @@ -1072,7 +1072,7 @@ out: case MT_PHY_TYPE_HT: final_rate_flags |= IEEE80211_TX_RC_MCS; final_rate &= GENMASK(5, 0); - if (i > 15) + if (final_rate > 15) return false; break; default: From 45a042e3026824a7e910db7a4dd38fef0540b902 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sun, 3 Mar 2019 15:10:00 +0100 Subject: [PATCH 337/855] mt76: mt76x2: fix external LNA gain settings Devices with external LNA need different values for AGC registers 8 and 9 Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/mt76x2/phy.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2/phy.c b/drivers/net/wireless/mediatek/mt76/mt76x2/phy.c index 1848e8ab2e21..c7e71f2ba2a7 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x2/phy.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x2/phy.c @@ -260,10 +260,15 @@ mt76x2_phy_set_gain_val(struct mt76x02_dev *dev) gain_val[0] = dev->cal.agc_gain_cur[0] - dev->cal.agc_gain_adjust; gain_val[1] = dev->cal.agc_gain_cur[1] - dev->cal.agc_gain_adjust; - if (dev->mt76.chandef.width >= NL80211_CHAN_WIDTH_40) + val = 0x1836 << 16; + if (!mt76x2_has_ext_lna(dev) && + dev->mt76.chandef.width >= NL80211_CHAN_WIDTH_40) val = 0x1e42 << 16; - else - val = 0x1836 << 16; + + if (mt76x2_has_ext_lna(dev) && + dev->mt76.chandef.chan->band == NL80211_BAND_2GHZ && + dev->mt76.chandef.width < NL80211_CHAN_WIDTH_40) + val = 0x0f36 << 16; val |= 0xf8; From b8cfd87ac24273e36fbd3ecda631f3ba6566d493 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sun, 3 Mar 2019 15:12:14 +0100 Subject: [PATCH 338/855] mt76: mt76x2: fix 2.4 GHz channel gain settings AGC register 35, 37 override for the low gain setting should only be done on 5 GHz. 
Also, 2.4 GHz needs a different value for register 35 Signed-off-by: Felix Fietkau --- .../net/wireless/mediatek/mt76/mt76x2/phy.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2/phy.c b/drivers/net/wireless/mediatek/mt76/mt76x2/phy.c index c7e71f2ba2a7..769a9b972044 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x2/phy.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x2/phy.c @@ -285,6 +285,7 @@ void mt76x2_phy_update_channel_gain(struct mt76x02_dev *dev) { u8 *gain = dev->cal.agc_gain_init; u8 low_gain_delta, gain_delta; + u32 agc_35, agc_37; bool gain_change; int low_gain; u32 val; @@ -323,6 +324,16 @@ void mt76x2_phy_update_channel_gain(struct mt76x02_dev *dev) else low_gain_delta = 14; + agc_37 = 0x2121262c; + if (dev->mt76.chandef.chan->band == NL80211_BAND_2GHZ) + agc_35 = 0x11111516; + else if (low_gain == 2) + agc_35 = agc_37 = 0x08080808; + else if (dev->mt76.chandef.width == NL80211_CHAN_WIDTH_80) + agc_35 = 0x10101014; + else + agc_35 = 0x11111116; + if (low_gain == 2) { mt76_wr(dev, MT_BBP(RXO, 18), 0xf000a990); mt76_wr(dev, MT_BBP(AGC, 35), 0x08080808); @@ -331,15 +342,13 @@ void mt76x2_phy_update_channel_gain(struct mt76x02_dev *dev) dev->cal.agc_gain_adjust = 0; } else { mt76_wr(dev, MT_BBP(RXO, 18), 0xf000a991); - if (dev->mt76.chandef.width == NL80211_CHAN_WIDTH_80) - mt76_wr(dev, MT_BBP(AGC, 35), 0x10101014); - else - mt76_wr(dev, MT_BBP(AGC, 35), 0x11111116); - mt76_wr(dev, MT_BBP(AGC, 37), 0x2121262C); gain_delta = 0; dev->cal.agc_gain_adjust = low_gain_delta; } + mt76_wr(dev, MT_BBP(AGC, 35), agc_35); + mt76_wr(dev, MT_BBP(AGC, 37), agc_37); + dev->cal.agc_gain_cur[0] = gain[0] - gain_delta; dev->cal.agc_gain_cur[1] = gain[1] - gain_delta; mt76x2_phy_set_gain_val(dev); From f25e813bf48dce541c29b12cfc19a2c25b6db915 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sun, 3 Mar 2019 18:39:08 +0100 Subject: [PATCH 339/855] mt76: mt7603: clear ps filtering mode before releasing buffered frames Fixes sending them, otherwise they loop back right into the buffer Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/mt7603/main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wireless/mediatek/mt76/mt7603/main.c b/drivers/net/wireless/mediatek/mt76/mt7603/main.c index b10775ed92e6..8da0b8707d24 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7603/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7603/main.c @@ -399,6 +399,8 @@ mt7603_release_buffered_frames(struct ieee80211_hw *hw, __skb_queue_head_init(&list); + mt7603_wtbl_set_ps(dev, msta, false); + spin_lock_bh(&dev->ps_lock); skb_queue_walk_safe(&msta->psq, skb, tmp) { if (!nframes) From fca9615f1a436d1e9f64042cda08a34fe32ce668 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sun, 3 Mar 2019 18:40:18 +0100 Subject: [PATCH 340/855] mt76: mt7603: fix up hardware queue index for PS filtered packets Make the queue index match the hardware queue on which they get sent out Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/mt7603/dma.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/wireless/mediatek/mt76/mt7603/dma.c b/drivers/net/wireless/mediatek/mt76/mt7603/dma.c index d69e82c66ab2..494a48e023ef 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7603/dma.c +++ b/drivers/net/wireless/mediatek/mt76/mt7603/dma.c @@ -50,6 +50,10 @@ mt7603_rx_loopback_skb(struct mt7603_dev *dev, struct sk_buff *skb) val = le32_to_cpu(txd[0]); skb_set_queue_mapping(skb, FIELD_GET(MT_TXD0_Q_IDX, 
val)); + val &= ~(MT_TXD0_P_IDX | MT_TXD0_Q_IDX); + val |= FIELD_PREP(MT_TXD0_Q_IDX, MT_TX_HW_QUEUE_MGMT); + txd[0] = cpu_to_le32(val); + spin_lock_bh(&dev->ps_lock); __skb_queue_tail(&msta->psq, skb); if (skb_queue_len(&msta->psq) >= 64) { From e004b7006600258615b969f05c4d008c1d68973b Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sun, 3 Mar 2019 18:51:35 +0100 Subject: [PATCH 341/855] mt76: mt7603: notify mac80211 about buffered frames in ps queue Also fix the size check for filtered powersave frames Fixes a corner case with waking up clients Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/mt7603/dma.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7603/dma.c b/drivers/net/wireless/mediatek/mt76/mt7603/dma.c index 494a48e023ef..b3ae0aaea62a 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7603/dma.c +++ b/drivers/net/wireless/mediatek/mt76/mt7603/dma.c @@ -27,12 +27,16 @@ static void mt7603_rx_loopback_skb(struct mt7603_dev *dev, struct sk_buff *skb) { __le32 *txd = (__le32 *)skb->data; + struct ieee80211_hdr *hdr; + struct ieee80211_sta *sta; struct mt7603_sta *msta; struct mt76_wcid *wcid; + void *priv; int idx; u32 val; + u8 tid; - if (skb->len < sizeof(MT_TXD_SIZE) + sizeof(struct ieee80211_hdr)) + if (skb->len < MT_TXD_SIZE + sizeof(struct ieee80211_hdr)) goto free; val = le32_to_cpu(txd[1]); @@ -46,7 +50,7 @@ mt7603_rx_loopback_skb(struct mt7603_dev *dev, struct sk_buff *skb) if (!wcid) goto free; - msta = container_of(wcid, struct mt7603_sta, wcid); + priv = msta = container_of(wcid, struct mt7603_sta, wcid); val = le32_to_cpu(txd[0]); skb_set_queue_mapping(skb, FIELD_GET(MT_TXD0_Q_IDX, val)); @@ -54,6 +58,11 @@ mt7603_rx_loopback_skb(struct mt7603_dev *dev, struct sk_buff *skb) val |= FIELD_PREP(MT_TXD0_Q_IDX, MT_TX_HW_QUEUE_MGMT); txd[0] = cpu_to_le32(val); + sta = container_of(priv, struct ieee80211_sta, drv_priv); + hdr = (struct ieee80211_hdr *) &skb->data[MT_TXD_SIZE]; + tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK; + ieee80211_sta_set_buffered(sta, tid, true); + spin_lock_bh(&dev->ps_lock); __skb_queue_tail(&msta->psq, skb); if (skb_queue_len(&msta->psq) >= 64) { From b7001f46085e06a74e4677b44ac55566f66e55aa Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sun, 3 Mar 2019 19:16:03 +0100 Subject: [PATCH 342/855] mt76: mt7603: clear the service period on releasing PS filtered packets These packets have no txwi entry in the ring, so tracking via tx status does not work. 
To prevent PS poll requests from being unanswered, end the service period right away Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/mt7603/main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/wireless/mediatek/mt76/mt7603/main.c b/drivers/net/wireless/mediatek/mt76/mt7603/main.c index 8da0b8707d24..ea25eff5e81c 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7603/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7603/main.c @@ -416,6 +416,9 @@ mt7603_release_buffered_frames(struct ieee80211_hw *hw, } spin_unlock_bh(&dev->ps_lock); + if (!skb_queue_empty(&list)) + ieee80211_sta_eosp(sta); + mt7603_ps_tx_list(dev, &list); if (nframes) From ffc9a7ff59a41df9a0a265b2dd04d97f2b35893a Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sun, 3 Mar 2019 19:19:21 +0100 Subject: [PATCH 343/855] mt76: when releasing PS frames, end the service period if no frame was found Fixes a rare corner case if the txq dequeue attempt fails, but mac80211 still has PS buffered packets Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/tx.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/wireless/mediatek/mt76/tx.c b/drivers/net/wireless/mediatek/mt76/tx.c index fc7dffe066be..2585df512335 100644 --- a/drivers/net/wireless/mediatek/mt76/tx.c +++ b/drivers/net/wireless/mediatek/mt76/tx.c @@ -377,7 +377,10 @@ mt76_release_buffered_frames(struct ieee80211_hw *hw, struct ieee80211_sta *sta, if (last_skb) { mt76_queue_ps_skb(dev, sta, last_skb, true); dev->queue_ops->kick(dev, hwq); + } else { + ieee80211_sta_eosp(sta); } + spin_unlock_bh(&hwq->lock); } EXPORT_SYMBOL_GPL(mt76_release_buffered_frames); From 643749d4a82b150afd2f87b29ca3dda885f8e384 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sun, 3 Mar 2019 19:40:36 +0100 Subject: [PATCH 344/855] mt76: mt76x02: disable ED/CCA by default This feature has been reported to cause stability issues on several systems. Disable it until it has been fixed and verified. 
It can still be enabled through debugfs Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/mt76x02.h | 1 + .../wireless/mediatek/mt76/mt76x02_debugfs.c | 27 +++++++++++++++++++ .../net/wireless/mediatek/mt76/mt76x02_dfs.c | 3 ++- 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02.h b/drivers/net/wireless/mediatek/mt76/mt76x02.h index 42cc9da68f7b..b5a36d9fb2bd 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02.h +++ b/drivers/net/wireless/mediatek/mt76/mt76x02.h @@ -118,6 +118,7 @@ struct mt76x02_dev { unsigned long ed_trigger_timeout; bool ed_tx_blocked; bool ed_monitor; + u8 ed_monitor_enabled; u8 ed_monitor_learning; u8 ed_trigger; u8 ed_silent; diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_debugfs.c b/drivers/net/wireless/mediatek/mt76/mt76x02_debugfs.c index 7580c5c986ff..b1d6fd4861e3 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02_debugfs.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x02_debugfs.c @@ -116,6 +116,32 @@ static int read_agc(struct seq_file *file, void *data) return 0; } +static int +mt76_edcca_set(void *data, u64 val) +{ + struct mt76x02_dev *dev = data; + enum nl80211_dfs_regions region = dev->dfs_pd.region; + + dev->ed_monitor_enabled = !!val; + dev->ed_monitor = dev->ed_monitor_enabled && + region == NL80211_DFS_ETSI; + mt76x02_edcca_init(dev, true); + + return 0; +} + +static int +mt76_edcca_get(void *data, u64 *val) +{ + struct mt76x02_dev *dev = data; + + *val = dev->ed_monitor_enabled; + return 0; +} + +DEFINE_DEBUGFS_ATTRIBUTE(fops_edcca, mt76_edcca_get, mt76_edcca_set, + "%lld\n"); + void mt76x02_init_debugfs(struct mt76x02_dev *dev) { struct dentry *dir; @@ -127,6 +153,7 @@ void mt76x02_init_debugfs(struct mt76x02_dev *dev) debugfs_create_u8("temperature", 0400, dir, &dev->cal.temp); debugfs_create_bool("tpc", 0600, dir, &dev->enable_tpc); + debugfs_create_file("edcca", 0400, dir, dev, &fops_edcca); debugfs_create_file("ampdu_stat", 0400, dir, dev, &fops_ampdu_stat); debugfs_create_file("dfs_stats", 0400, dir, dev, &fops_dfs_stat); debugfs_create_devm_seqfile(dev->mt76.dev, "txpower", dir, diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_dfs.c b/drivers/net/wireless/mediatek/mt76/mt76x02_dfs.c index e4649103efd4..17d12d212d1b 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02_dfs.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x02_dfs.c @@ -885,7 +885,8 @@ mt76x02_dfs_set_domain(struct mt76x02_dev *dev, if (dfs_pd->region != region) { tasklet_disable(&dfs_pd->dfs_tasklet); - dev->ed_monitor = region == NL80211_DFS_ETSI; + dev->ed_monitor = dev->ed_monitor_enabled && + region == NL80211_DFS_ETSI; mt76x02_edcca_init(dev, true); dfs_pd->region = region; From b126c889743513543d007ac1c5c82b61ef003133 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 4 Mar 2019 08:36:11 +0100 Subject: [PATCH 345/855] mt76: mt7603: set moredata flag when queueing ps-filtered packets Clients should poll for more packets afterwards Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/mt7603/main.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/net/wireless/mediatek/mt76/mt7603/main.c b/drivers/net/wireless/mediatek/mt76/mt7603/main.c index ea25eff5e81c..cc0fe0933b2d 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7603/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7603/main.c @@ -5,6 +5,7 @@ #include #include #include "mt7603.h" +#include "mac.h" #include "eeprom.h" static int @@ -385,6 +386,15 @@ mt7603_sta_ps(struct mt76_dev 
*mdev, struct ieee80211_sta *sta, bool ps) mt7603_ps_tx_list(dev, &list); } +static void +mt7603_ps_set_more_data(struct sk_buff *skb) +{ + struct ieee80211_hdr *hdr; + + hdr = (struct ieee80211_hdr *) &skb->data[MT_TXD_SIZE]; + hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_MOREDATA); +} + static void mt7603_release_buffered_frames(struct ieee80211_hw *hw, struct ieee80211_sta *sta, @@ -411,6 +421,7 @@ mt7603_release_buffered_frames(struct ieee80211_hw *hw, skb_set_queue_mapping(skb, MT_TXQ_PSD); __skb_unlink(skb, &msta->psq); + mt7603_ps_set_more_data(skb); __skb_queue_tail(&list, skb); nframes--; } From 7c1b998d3483acf785c45035c0e14a62023f1edb Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 4 Mar 2019 01:10:00 +0000 Subject: [PATCH 346/855] mt76: fix return value check in mt76_wmac_probe() In case of error, the function devm_ioremap_resource() returns ERR_PTR() and never returns NULL. The NULL test in the return value check should be replaced with IS_ERR(). Fixes: c8846e101502 ("mt76: add driver for MT7603E and MT7628/7688") Signed-off-by: Wei Yongjun Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/mt7603/soc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7603/soc.c b/drivers/net/wireless/mediatek/mt76/mt7603/soc.c index e13fea80d970..b920be1f5718 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7603/soc.c +++ b/drivers/net/wireless/mediatek/mt76/mt7603/soc.c @@ -23,9 +23,9 @@ mt76_wmac_probe(struct platform_device *pdev) } mem_base = devm_ioremap_resource(&pdev->dev, res); - if (!mem_base) { + if (IS_ERR(mem_base)) { dev_err(&pdev->dev, "Failed to get memory resource\n"); - return -EINVAL; + return PTR_ERR(mem_base); } mdev = mt76_alloc_device(&pdev->dev, sizeof(*dev), &mt7603_ops, From 411e05f4e87794332f328ca3fa201b731f023db5 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Wed, 6 Mar 2019 10:51:12 +0100 Subject: [PATCH 347/855] mt76x2u: remove duplicated entry in mt76x2u_device_table Remove duplicated entry in mt76x2u_device_table since Alfa AWUS036ACM and Aukey USB-AC1200 have the same ids Fixes: 62a25dc56990a ("mt76x2u: Add support for Alfa AWUS036ACM") Signed-off-by: Lorenzo Bianconi Signed-off-by: Felix Fietkau --- drivers/net/wireless/mediatek/mt76/mt76x2/usb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2/usb.c b/drivers/net/wireless/mediatek/mt76/mt76x2/usb.c index ddb6b2c48e01..7a5d539873ca 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x2/usb.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x2/usb.c @@ -21,11 +21,10 @@ #include "mt76x2u.h" static const struct usb_device_id mt76x2u_device_table[] = { - { USB_DEVICE(0x0e8d, 0x7612) }, /* Alfa AWUS036ACM */ { USB_DEVICE(0x0b05, 0x1833) }, /* Asus USB-AC54 */ { USB_DEVICE(0x0b05, 0x17eb) }, /* Asus USB-AC55 */ { USB_DEVICE(0x0b05, 0x180b) }, /* Asus USB-N53 B1 */ - { USB_DEVICE(0x0e8d, 0x7612) }, /* Aukey USB-AC1200 */ + { USB_DEVICE(0x0e8d, 0x7612) }, /* Aukey USBAC1200 - Alfa AWUS036ACM */ { USB_DEVICE(0x057c, 0x8503) }, /* Avm FRITZ!WLAN AC860 */ { USB_DEVICE(0x7392, 0xb711) }, /* Edimax EW 7722 UAC */ { USB_DEVICE(0x0846, 0x9053) }, /* Netgear A6210 */ From 9205e44916b2ca2e959be4210133292a19e79b0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Thu, 7 Mar 2019 13:18:53 +0100 Subject: [PATCH 348/855] pblk: fix max_io calculation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When calculating the 
maximun I/O size allowed into the buffer, consider the write size (ws_opt) used by the write thread in order to cover the case in which, due to flushes, the mem and subm pointers are disaligned by (ws_opt - 1). This case currently translates into a stall when an I/O of the largest possible size is submitted. Fixes: f9f9d1ae2c66 ("lightnvm: pblk: prevent stall due to wb threshold") Signed-off-by: Javier González Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-rl.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c index b014957dde0b..a5f8bc2defbc 100644 --- a/drivers/lightnvm/pblk-rl.c +++ b/drivers/lightnvm/pblk-rl.c @@ -233,10 +233,15 @@ void pblk_rl_init(struct pblk_rl *rl, int budget, int threshold) /* To start with, all buffer is available to user I/O writers */ rl->rb_budget = budget; rl->rb_user_max = budget; - rl->rb_max_io = threshold ? (budget - threshold) : (budget - 1); rl->rb_gc_max = 0; rl->rb_state = PBLK_RL_HIGH; + /* Maximize I/O size and ansure that back threshold is respected */ + if (threshold) + rl->rb_max_io = budget - pblk->min_write_pgs_data - threshold; + else + rl->rb_max_io = budget - pblk->min_write_pgs_data - 1; + atomic_set(&rl->rb_user_cnt, 0); atomic_set(&rl->rb_gc_cnt, 0); atomic_set(&rl->rb_space, -1); From 688cd8bd2c0fa9dc88e5ced55a73ddc79edf875d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 4 Mar 2019 21:38:42 +0100 Subject: [PATCH 349/855] iwlwifi: fix 64-bit division do_div() expects unsigned operands and otherwise triggers a warning like: drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c:465:2: error: comparison of distinct pointer types ('typeof ((rtt_avg)) *' (aka 'long long *') and 'uint64_t *' (aka 'unsigned long long *')) [-Werror,-Wcompare-distinct-pointer-types] do_div(rtt_avg, 6666); ^~~~~~~~~~~~~~~~~~~~~ include/asm-generic/div64.h:222:28: note: expanded from macro 'do_div' (void)(((typeof((n)) *)0) == ((uint64_t *)0)); \ ~~~~~~~~~~~~~~~~~~ ^ ~~~~~~~~~~~~~~~ 1 error generated. Change the do_div() to the simpler div_s64() that can handle negative inputs correctly. Fixes: 937b10c0de68 ("iwlwifi: mvm: add debug prints for FTM") Signed-off-by: Arnd Bergmann Signed-off-by: Kalle Valo --- drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c b/drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c index e9822a3ec373..94132cfd1f56 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c @@ -460,9 +460,7 @@ static int iwl_mvm_ftm_range_resp_valid(struct iwl_mvm *mvm, u8 request_id, static void iwl_mvm_debug_range_resp(struct iwl_mvm *mvm, u8 index, struct cfg80211_pmsr_result *res) { - s64 rtt_avg = res->ftm.rtt_avg * 100; - - do_div(rtt_avg, 6666); + s64 rtt_avg = div_s64(res->ftm.rtt_avg * 100, 6666); IWL_DEBUG_INFO(mvm, "entry %d\n", index); IWL_DEBUG_INFO(mvm, "\tstatus: %d\n", res->status); From a656183e6c58a3a4d818278afab522b30a17014c Mon Sep 17 00:00:00 2001 From: Lee Duncan Date: Mon, 25 Feb 2019 09:41:30 -0800 Subject: [PATCH 350/855] scsi: libiscsi: Hold back_lock when calling iscsi_complete_task If there is an error queueing an iscsi command in iscsi_queuecommand(), for example if the transport fails to take the command in sessuin->tt->xmit_task(), then the error path can call iscsi_complete_task() without first aquiring the back_lock as required. 
This can lead to things like ITT pool can get corrupt, resulting in duplicate ITTs being sent out. The solution is to hold the back_lock around iscsi_complete_task() calls, and to add a little commenting to help others understand when back_lock must be held. Signed-off-by: Lee Duncan Acked-by: Chris Leech Signed-off-by: Martin K. Petersen --- drivers/scsi/libiscsi.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index dd314d2b1111..418dbed7618f 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -798,7 +798,7 @@ EXPORT_SYMBOL_GPL(iscsi_conn_send_pdu); * @datalen: len of buffer * * iscsi_cmd_rsp sets up the scsi_cmnd fields based on the PDU and - * then completes the command and task. + * then completes the command and task. called under back_lock **/ static void iscsi_scsi_cmd_rsp(struct iscsi_conn *conn, struct iscsi_hdr *hdr, struct iscsi_task *task, char *data, @@ -894,6 +894,9 @@ out: * @conn: iscsi connection * @hdr: iscsi pdu * @task: scsi command task + * + * iscsi_data_in_rsp sets up the scsi_cmnd fields based on the data received + * then completes the command and task. called under back_lock **/ static void iscsi_data_in_rsp(struct iscsi_conn *conn, struct iscsi_hdr *hdr, @@ -978,6 +981,16 @@ static int iscsi_send_nopout(struct iscsi_conn *conn, struct iscsi_nopin *rhdr) return 0; } +/** + * iscsi_nop_out_rsp - SCSI NOP Response processing + * @task: scsi command task + * @nop: the nop structure + * @data: where to put the data + * @datalen: length of data + * + * iscsi_nop_out_rsp handles nop response from use or + * from user space. called under back_lock + **/ static int iscsi_nop_out_rsp(struct iscsi_task *task, struct iscsi_nopin *nop, char *data, int datalen) { @@ -1744,7 +1757,9 @@ int iscsi_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *sc) return 0; prepd_reject: + spin_lock_bh(&session->back_lock); iscsi_complete_task(task, ISCSI_TASK_REQUEUE_SCSIQ); + spin_unlock_bh(&session->back_lock); reject: spin_unlock_bh(&session->frwd_lock); ISCSI_DBG_SESSION(session, "cmd 0x%x rejected (%d)\n", @@ -1752,7 +1767,9 @@ reject: return SCSI_MLQUEUE_TARGET_BUSY; prepd_fault: + spin_lock_bh(&session->back_lock); iscsi_complete_task(task, ISCSI_TASK_REQUEUE_SCSIQ); + spin_unlock_bh(&session->back_lock); fault: spin_unlock_bh(&session->frwd_lock); ISCSI_DBG_SESSION(session, "iscsi: cmd 0x%x is not queued (%d)\n", @@ -3069,8 +3086,9 @@ fail_mgmt_tasks(struct iscsi_session *session, struct iscsi_conn *conn) state = ISCSI_TASK_ABRT_SESS_RECOV; if (task->state == ISCSI_TASK_PENDING) state = ISCSI_TASK_COMPLETED; + spin_lock_bh(&session->back_lock); iscsi_complete_task(task, state); - + spin_unlock_bh(&session->back_lock); } } From 08f68752cefcda4df036879e41313f374474b0fa Mon Sep 17 00:00:00 2001 From: Ramalingam C Date: Mon, 4 Feb 2019 21:14:40 +0530 Subject: [PATCH 351/855] drm/i915: HDCP state handling in ddi_update_pipe The downgrade of the fullmodeset into fastset intel_encoder->update_pipe, in possible scenario, skips the En/Dis-able DDI. Hence breaks the HDCP state change handling. We also don't have any hdcp tests in CI, because the shard runs don't have hdcp capable outputs :-/ So this change fixs it by handling the HDCP state change request at intel_encoder->update_pipe too along with enable and disable of the DDI. 
Fixes: d19f958db23c ("drm/i915: Enable fastset for non-boot modesets.") v2: Added commit id that broke the HDCP [Daniel] Signed-off-by: Ramalingam C cc: Maarten Lankhorst cc: Hans de Goede cc: Daniel Vetter Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/1549295080-18353-1-git-send-email-ramalingam.c@intel.com (cherry picked from commit 634852d1f468ccc8cc2e790757c6c1c0f95eb955) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/intel_ddi.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/drm/i915/intel_ddi.c b/drivers/gpu/drm/i915/intel_ddi.c index ca705546a0ab..2323b7cb1d38 100644 --- a/drivers/gpu/drm/i915/intel_ddi.c +++ b/drivers/gpu/drm/i915/intel_ddi.c @@ -3568,6 +3568,13 @@ static void intel_ddi_update_pipe(struct intel_encoder *encoder, { if (!intel_crtc_has_type(crtc_state, INTEL_OUTPUT_HDMI)) intel_ddi_update_pipe_dp(encoder, crtc_state, conn_state); + + if (conn_state->content_protection == + DRM_MODE_CONTENT_PROTECTION_DESIRED) + intel_hdcp_enable(to_intel_connector(conn_state->connector)); + else if (conn_state->content_protection == + DRM_MODE_CONTENT_PROTECTION_UNDESIRED) + intel_hdcp_disable(to_intel_connector(conn_state->connector)); } static void intel_ddi_set_fia_lane_count(struct intel_encoder *encoder, From df069367f3b10606195e81053febc8aa4796bbc6 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 8 Feb 2019 13:47:04 +0000 Subject: [PATCH 352/855] drm/i915: Protect i915_active iterators from the shrinker If we allocate while iterating the rbtree of active nodes, we may hit the shrinker and so retire the i915_active, reaping the rbtree. Modifying the rbtree as we iterate is not good behaviour, so acquire the i915_active first to keep the tree intact whenever we allocate. 
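The pattern is simply to pin the whole tree across any section that may allocate or wait (sketch only; see i915_active_ref() and i915_request_await_active() in the diff below):

    /* Prevent reaping in case we malloc/wait while building the tree */
    i915_active_acquire(ref);

    /* ... allocate nodes and/or walk ref->tree ... */

    i915_active_release(ref);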
Fixes: a42375af0a30 ("drm/i915: Release the active tracker tree upon idling") Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Cc: Joonas Lahtinen Link: https://patchwork.freedesktop.org/patch/msgid/20190208134704.23039-1-chris@chris-wilson.co.uk Reviewed-by: Tvrtko Ursulin (cherry picked from commit 312c4ba1bb71d666f924f84afd5bdc775b71278f) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/i915_active.c | 36 +++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_active.c b/drivers/gpu/drm/i915/i915_active.c index 215b6ff8aa73..db7bb5bd5add 100644 --- a/drivers/gpu/drm/i915/i915_active.c +++ b/drivers/gpu/drm/i915/i915_active.c @@ -163,17 +163,25 @@ int i915_active_ref(struct i915_active *ref, struct i915_request *rq) { struct i915_active_request *active; + int err = 0; + + /* Prevent reaping in case we malloc/wait while building the tree */ + i915_active_acquire(ref); active = active_instance(ref, timeline); - if (IS_ERR(active)) - return PTR_ERR(active); + if (IS_ERR(active)) { + err = PTR_ERR(active); + goto out; + } if (!i915_active_request_isset(active)) ref->count++; __i915_active_request_set(active, rq); GEM_BUG_ON(!ref->count); - return 0; +out: + i915_active_release(ref); + return err; } bool i915_active_acquire(struct i915_active *ref) @@ -223,19 +231,25 @@ int i915_request_await_active_request(struct i915_request *rq, int i915_request_await_active(struct i915_request *rq, struct i915_active *ref) { struct active_node *it, *n; - int ret; + int err = 0; - ret = i915_request_await_active_request(rq, &ref->last); - if (ret) - return ret; + /* await allocates and so we need to avoid hitting the shrinker */ + if (i915_active_acquire(ref)) + goto out; /* was idle */ + + err = i915_request_await_active_request(rq, &ref->last); + if (err) + goto out; rbtree_postorder_for_each_entry_safe(it, n, &ref->tree, node) { - ret = i915_request_await_active_request(rq, &it->base); - if (ret) - return ret; + err = i915_request_await_active_request(rq, &it->base); + if (err) + goto out; } - return 0; +out: + i915_active_release(ref); + return err; } #if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM) From 7b1366b48c1f063c902f87a4bd6cdd0cbb86664e Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 11 Feb 2019 20:46:47 +0000 Subject: [PATCH 353/855] drm/i915: Reacquire priolist cache after dropping the engine lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If we drop the engine lock, we may run execlists_dequeue which may free the priolist. Therefore if we ever drop the execution lock on the engine, we have to discard our cache and refetch the priolist to ensure we do not use a stale pointer. 
[ 506.418935] [IGT] gem_exec_whisper: starting subtest contexts-priority [ 593.240825] general protection fault: 0000 [#1] SMP [ 593.240863] CPU: 1 PID: 494 Comm: gem_exec_whispe Tainted: G U 5.0.0-rc6+ #100 [ 593.240879] Hardware name: /NUC6CAYB, BIOS AYAPLCEL.86A.0029.2016.1124.1625 11/24/2016 [ 593.240965] RIP: 0010:__i915_schedule+0x1fe/0x320 [i915] [ 593.240981] Code: 48 8b 0c 24 48 89 c3 49 8b 45 28 49 8b 75 20 4c 89 3c 24 48 89 46 08 48 89 30 48 8b 43 08 48 89 4b 08 49 89 5d 20 49 89 45 28 <48> 89 08 45 39 a7 b8 03 00 00 7d 44 45 89 a7 b8 03 00 00 49 8b 85 [ 593.240999] RSP: 0018:ffffc90000057a60 EFLAGS: 00010046 [ 593.241013] RAX: 6b6b6b6b6b6b6b6b RBX: ffff8882582d7870 RCX: ffff88826baba6f0 [ 593.241026] RDX: 0000000000000000 RSI: ffff8882582d6e70 RDI: ffff888273482194 [ 593.241049] RBP: ffffc90000057a68 R08: ffff8882582d7680 R09: ffff8882582d7840 [ 593.241068] R10: 0000000000000000 R11: ffffea00095ebe08 R12: 0000000000000728 [ 593.241105] R13: ffff88826baba6d0 R14: ffffc90000057a40 R15: ffff888273482158 [ 593.241120] FS: 00007f4613fb3900(0000) GS:ffff888277a80000(0000) knlGS:0000000000000000 [ 593.241133] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 593.241146] CR2: 00007f57d3c66a84 CR3: 000000026e2b6000 CR4: 00000000001406e0 [ 593.241158] Call Trace: [ 593.241233] i915_schedule+0x1f/0x30 [i915] [ 593.241326] i915_request_add+0x1a9/0x290 [i915] [ 593.241393] i915_gem_do_execbuffer+0x45f/0x1150 [i915] [ 593.241411] ? init_object+0x49/0x80 [ 593.241425] ? ___slab_alloc.constprop.91+0x4b8/0x4e0 [ 593.241491] ? i915_gem_execbuffer2_ioctl+0x99/0x380 [i915] [ 593.241563] ? i915_gem_execbuffer_ioctl+0x270/0x270 [i915] [ 593.241629] i915_gem_execbuffer2_ioctl+0x1bb/0x380 [i915] [ 593.241705] ? i915_gem_execbuffer_ioctl+0x270/0x270 [i915] [ 593.241724] drm_ioctl_kernel+0x81/0xd0 [ 593.241738] drm_ioctl+0x1a7/0x310 [ 593.241803] ? i915_gem_execbuffer_ioctl+0x270/0x270 [i915] [ 593.241819] ? __update_load_avg_se+0x1c9/0x240 [ 593.241834] ? pick_next_entity+0x7e/0x120 [ 593.241851] do_vfs_ioctl+0x88/0x5d0 [ 593.241880] ksys_ioctl+0x35/0x70 [ 593.241894] __x64_sys_ioctl+0x11/0x20 [ 593.241907] do_syscall_64+0x44/0xf0 [ 593.241924] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 593.241940] RIP: 0033:0x7f4615ffe757 [ 593.241952] Code: 00 00 90 48 8b 05 39 a7 0c 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 09 a7 0c 00 f7 d8 64 89 01 48 [ 593.241970] RSP: 002b:00007ffc1030ddf8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [ 593.241984] RAX: ffffffffffffffda RBX: 00007ffc10324420 RCX: 00007f4615ffe757 [ 593.241997] RDX: 00007ffc1030e220 RSI: 0000000040406469 RDI: 0000000000000003 [ 593.242010] RBP: 00007ffc1030e220 R08: 00007f46160c9208 R09: 00007f46160c9240 [ 593.242022] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000040406469 [ 593.242038] R13: 0000000000000003 R14: 0000000000000000 R15: 0000000000000000 [ 593.242058] Modules linked in: i915 intel_gtt drm_kms_helper prime_numbers v2: Track the local engine cache and explicitly clear it when switching engine locks. Fixes: a02eb975be78 ("drm/i915/execlists: Cache the priolist when rescheduling") Testcase: igt/gem_exec_whisper/contexts-priority # rare! 
Signed-off-by: Chris Wilson Cc: Joonas Lahtinen Cc: Tvrtko Ursulin Cc: Michał Winiarski Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20190211204647.26723-1-chris@chris-wilson.co.uk (cherry picked from commit ed7dc6777400937b4686e9ec1db1533ea4546864) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/i915_scheduler.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c index d01683167c77..8bc042551692 100644 --- a/drivers/gpu/drm/i915/i915_scheduler.c +++ b/drivers/gpu/drm/i915/i915_scheduler.c @@ -223,8 +223,14 @@ out: return &p->requests[idx]; } +struct sched_cache { + struct list_head *priolist; +}; + static struct intel_engine_cs * -sched_lock_engine(struct i915_sched_node *node, struct intel_engine_cs *locked) +sched_lock_engine(const struct i915_sched_node *node, + struct intel_engine_cs *locked, + struct sched_cache *cache) { struct intel_engine_cs *engine = node_to_request(node)->engine; @@ -232,6 +238,7 @@ sched_lock_engine(struct i915_sched_node *node, struct intel_engine_cs *locked) if (engine != locked) { spin_unlock(&locked->timeline.lock); + memset(cache, 0, sizeof(*cache)); spin_lock(&engine->timeline.lock); } @@ -253,11 +260,11 @@ static bool inflight(const struct i915_request *rq, static void __i915_schedule(struct i915_request *rq, const struct i915_sched_attr *attr) { - struct list_head *uninitialized_var(pl); - struct intel_engine_cs *engine, *last; + struct intel_engine_cs *engine; struct i915_dependency *dep, *p; struct i915_dependency stack; const int prio = attr->priority; + struct sched_cache cache; LIST_HEAD(dfs); /* Needed in order to use the temporary link inside i915_dependency */ @@ -328,7 +335,7 @@ static void __i915_schedule(struct i915_request *rq, __list_del_entry(&stack.dfs_link); } - last = NULL; + memset(&cache, 0, sizeof(cache)); engine = rq->engine; spin_lock_irq(&engine->timeline.lock); @@ -338,7 +345,7 @@ static void __i915_schedule(struct i915_request *rq, INIT_LIST_HEAD(&dep->dfs_link); - engine = sched_lock_engine(node, engine); + engine = sched_lock_engine(node, engine, &cache); lockdep_assert_held(&engine->timeline.lock); /* Recheck after acquiring the engine->timeline.lock */ @@ -347,11 +354,11 @@ static void __i915_schedule(struct i915_request *rq, node->attr.priority = prio; if (!list_empty(&node->link)) { - if (last != engine) { - pl = i915_sched_lookup_priolist(engine, prio); - last = engine; - } - list_move_tail(&node->link, pl); + if (!cache.priolist) + cache.priolist = + i915_sched_lookup_priolist(engine, + prio); + list_move_tail(&node->link, cache.priolist); } else { /* * If the request is not in the priolist queue because From 339cc6ae0f9dced4b5f8fbc1ed81b6195da93a8a Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 15 Feb 2019 19:50:10 +0000 Subject: [PATCH 354/855] drm/i915/selftests: Always free spinner on __sseu_prepare error Prepare a nice little onion unwind to ensure that we always free the spinner if we __sseu_prepare fails. 
Fixes: c06ee6ff2cbc ("drm/i915/selftests: Context SSEU reconfiguration tests") Reported-by: Radhakrishna Sripada Signed-off-by: Chris Wilson Cc: Radhakrishna Sripada Cc: Tvrtko Ursulin Cc: Joonas Lahtinen Link: https://patchwork.freedesktop.org/patch/msgid/20190215195010.16637-1-chris@chris-wilson.co.uk Reviewed-by: Radhakrishna Sripada (cherry picked from commit 2a4a2754039594c60b58b02b6781428a85f6d745) Signed-off-by: Rodrigo Vivi --- .../gpu/drm/i915/selftests/i915_gem_context.c | 73 +++++++++---------- 1 file changed, 36 insertions(+), 37 deletions(-) diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/selftests/i915_gem_context.c index d00d0bb07784..7eb58a9d1319 100644 --- a/drivers/gpu/drm/i915/selftests/i915_gem_context.c +++ b/drivers/gpu/drm/i915/selftests/i915_gem_context.c @@ -710,47 +710,45 @@ __sseu_prepare(struct drm_i915_private *i915, unsigned int flags, struct i915_gem_context *ctx, struct intel_engine_cs *engine, - struct igt_spinner **spin_out) + struct igt_spinner **spin) { - int ret = 0; + struct i915_request *rq; + int ret; - if (flags & (TEST_BUSY | TEST_RESET)) { - struct igt_spinner *spin; - struct i915_request *rq; + *spin = NULL; + if (!(flags & (TEST_BUSY | TEST_RESET))) + return 0; - spin = kzalloc(sizeof(*spin), GFP_KERNEL); - if (!spin) { - ret = -ENOMEM; - goto out; - } + *spin = kzalloc(sizeof(**spin), GFP_KERNEL); + if (!*spin) + return -ENOMEM; - ret = igt_spinner_init(spin, i915); - if (ret) - return ret; + ret = igt_spinner_init(*spin, i915); + if (ret) + goto err_free; - rq = igt_spinner_create_request(spin, ctx, engine, MI_NOOP); - if (IS_ERR(rq)) { - ret = PTR_ERR(rq); - igt_spinner_fini(spin); - kfree(spin); - goto out; - } - - i915_request_add(rq); - - if (!igt_wait_for_spinner(spin, rq)) { - pr_err("%s: Spinner failed to start!\n", name); - igt_spinner_end(spin); - igt_spinner_fini(spin); - kfree(spin); - ret = -ETIMEDOUT; - goto out; - } - - *spin_out = spin; + rq = igt_spinner_create_request(*spin, ctx, engine, MI_NOOP); + if (IS_ERR(rq)) { + ret = PTR_ERR(rq); + goto err_fini; } -out: + i915_request_add(rq); + + if (!igt_wait_for_spinner(*spin, rq)) { + pr_err("%s: Spinner failed to start!\n", name); + ret = -ETIMEDOUT; + goto err_end; + } + + return 0; + +err_end: + igt_spinner_end(*spin); +err_fini: + igt_spinner_fini(*spin); +err_free: + kfree(fetch_and_zero(spin)); return ret; } @@ -897,22 +895,23 @@ __sseu_test(struct drm_i915_private *i915, ret = __sseu_prepare(i915, name, flags, ctx, engine, &spin); if (ret) - goto out; + goto out_context; ret = __i915_gem_context_reconfigure_sseu(ctx, engine, sseu); if (ret) - goto out; + goto out_spin; ret = __sseu_finish(i915, name, flags, ctx, kctx, engine, obj, hweight32(sseu.slice_mask), spin); -out: +out_spin: if (spin) { igt_spinner_end(spin); igt_spinner_fini(spin); kfree(spin); } +out_context: kernel_context_close(kctx); return ret; From a89c09624f6135b12b446388c734e833ea918d9c Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 4 Mar 2019 11:41:13 +0000 Subject: [PATCH 355/855] drm/i915: Acquire breadcrumb ref before cancelling We may race the interrupt signaling with retirement, in which case the order in which we acquire the reference inside the interrupt is vital to provide the correct barrier against the request being freed in retirement, i.e. 
we need to acquire our reference before marking the breadcrumb as cancelled (as soon as the breadcrumb is cancelled retirement may drop its reference to the request without serialisation with the interrupt handler). <3>[ 683.372226] BUG i915_request (Tainted: G U ): Object already free <3>[ 683.372269] ----------------------------------------------------------------------------- <4>[ 683.372323] Disabling lock debugging due to kernel taint <3>[ 683.372393] INFO: Allocated in i915_request_alloc+0x169/0x810 [i915] age=0 cpu=2 pid=1420 <3>[ 683.372412] kmem_cache_alloc+0x21c/0x280 <3>[ 683.372478] i915_request_alloc+0x169/0x810 [i915] <3>[ 683.372540] i915_gem_do_execbuffer+0x84e/0x1ae0 [i915] <3>[ 683.372603] i915_gem_execbuffer2_ioctl+0x11b/0x420 [i915] <3>[ 683.372617] drm_ioctl_kernel+0x83/0xf0 <3>[ 683.372626] drm_ioctl+0x2f3/0x3b0 <3>[ 683.372636] do_vfs_ioctl+0xa0/0x6e0 <3>[ 683.372645] ksys_ioctl+0x35/0x60 <3>[ 683.372654] __x64_sys_ioctl+0x11/0x20 <3>[ 683.372664] do_syscall_64+0x55/0x190 <3>[ 683.372675] entry_SYSCALL_64_after_hwframe+0x49/0xbe <3>[ 683.372740] INFO: Freed in i915_request_retire_upto+0xfb/0x2e0 [i915] age=0 cpu=0 pid=1419 <3>[ 683.372807] i915_request_retire_upto+0xfb/0x2e0 [i915] <3>[ 683.372870] i915_request_add+0x3bd/0x9d0 [i915] <3>[ 683.372931] i915_gem_do_execbuffer+0x141c/0x1ae0 [i915] <3>[ 683.372991] i915_gem_execbuffer2_ioctl+0x11b/0x420 [i915] <3>[ 683.373001] drm_ioctl_kernel+0x83/0xf0 <3>[ 683.373008] drm_ioctl+0x2f3/0x3b0 <3>[ 683.373015] do_vfs_ioctl+0xa0/0x6e0 <3>[ 683.373023] ksys_ioctl+0x35/0x60 <3>[ 683.373030] __x64_sys_ioctl+0x11/0x20 <3>[ 683.373037] do_syscall_64+0x55/0x190 <3>[ 683.373045] entry_SYSCALL_64_after_hwframe+0x49/0xbe <3>[ 683.373054] INFO: Slab 0x0000000079bcdd71 objects=30 used=2 fp=0x000000006d77b8af flags=0x8000000000010201 <3>[ 683.373069] INFO: Object 0x000000006d77b8af @offset=24000 fp=0x000000007b061eab <3>[ 683.373083] Redzone 00000000ee47ef28: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb ................ <3>[ 683.373097] Redzone 000000000cb91471: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb ................ <3>[ 683.373111] Redzone 00000000cf2b86ee: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb ................ <3>[ 683.373125] Redzone 00000000f1f5a2cd: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb ................ <3>[ 683.373139] Object 000000006d77b8af: 00 00 00 00 5a 5a 5a 5a 00 3c 49 c0 ff ff ff ff ....ZZZZ.[ 683.373153] Object 000000006f9b6204: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZZZZZZZZZ <3>[ 683.373167] Object 0000000091410ffb: e0 dd 6b fa 87 9f ff ff e0 dd 6b fa 87 9f ff ff ..k.......k..... <3>[ 683.373181] Object 000000004cdf799d: 20 de 6b fa 87 9f ff ff 3d 00 00 00 00 00 00 00 .k.....=....... <3>[ 683.373195] Object 00000000545afebc: aa b3 00 00 00 00 00 00 0f 00 00 00 00 00 00 00 ................ <3>[ 683.373209] Object 00000000e4a394a8: 25 bd bd 1b 9f 00 00 00 00 00 00 00 5a 5a 5a 5a %...........ZZZZ <3>[ 683.373223] Object 0000000029a7878a: 00 00 00 00 ad 4e ad de ff ff ff ff 5a 5a 5a 5a .....N......ZZZZ <3>[ 683.373237] Object 00000000d37797b3: ff ff ff ff ff ff ff ff e8 6e 57 c0 ff ff ff ff .........nW..... <3>[ 683.373251] Object 00000000d50414f6: 00 b3 c8 8e ff ff ff ff 80 b0 c8 8e ff ff ff ff ................ <3>[ 683.373265] Object 00000000c28e8847: 41 01 4b c0 ff ff ff ff 00 00 88 8e 88 9f ff ff A.K............. <3>[ 683.373279] Object 00000000c74212ab: 38 c1 6d 8a 88 9f ff ff 58 21 74 8a 88 9f ff ff 8.m.....X!t..... 
<3>[ 683.373293] Object 000000000d8012cf: c0 c1 6d 8a 88 9f ff ff 58 79 dd d9 87 9f ff ff ..m.....Xy...... <3>[ 683.373306] Object 00000000c9900b91: 98 d0 4e 8a 88 9f ff ff 58 3c e8 9b 88 9f ff ff ..N.....X<...... <3>[ 683.373320] Object 0000000044bb8c3d: 58 3c e8 9b 88 9f ff ff 64 f5 04 00 00 00 00 00 X<......d....... <3>[ 683.373334] Object 00000000180c4cca: 00 00 00 00 ad 4e ad de ff ff ff ff 5a 5a 5a 5a .....N......ZZZZ <3>[ 683.373348] Object 00000000c9044498: ff ff ff ff ff ff ff ff e0 6e 57 c0 ff ff ff ff .........nW..... <3>[ 683.373362] Object 0000000072d0dfb3: 00 00 00 00 00 00 00 00 c0 b1 c8 8e ff ff ff ff ................ <3>[ 683.373376] Object 0000000081f198b9: 55 01 4b c0 ff ff ff ff d8 de 6b fa 87 9f ff ff U.K.......k..... <3>[ 683.373390] Object 000000006a375a13: d8 de 6b fa 87 9f ff ff cc 05 39 c0 ff ff ff ff ..k.......9..... <3>[ 683.373404] Object 00000000b8392dd1: ff ff ff ff 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a ....ZZZZZZZZZZZZ <3>[ 683.373418] Object 00000000e5c1bbcb: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZZZZZZZZZ <3>[ 683.373432] Object 00000000199feccd: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZZZZZZZZZ <3>[ 683.373446] Object 0000000020f5e08b: 20 df 6b fa 87 9f ff ff 20 df 6b fa 87 9f ff ff .k..... .k..... <3>[ 683.373460] Object 0000000090591b0f: 30 df 6b fa 87 9f ff ff 30 df 6b fa 87 9f ff ff 0.k.....0.k..... <3>[ 683.373473] Object 00000000232f7cd0: 40 df 6b fa 87 9f ff ff 40 df 6b fa 87 9f ff ff @.k.....@.k..... <3>[ 683.373487] Object 0000000060458027: 50 df 6b fa 87 9f ff ff 50 df 6b fa 87 9f ff ff P.k.....P.k..... <3>[ 683.373501] Object 00000000e3c82ce2: 06 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ <3>[ 683.373515] Object 00000000ec804eb8: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZZZZZZZZZ <3>[ 683.373529] Object 00000000ce7ccc08: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZZZZZZZZZ <3>[ 683.373543] Object 000000002dbc575c: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZZZZZZZZZ <3>[ 683.373557] Object 00000000b86d3417: 5a 5a 5a 5a 5a 5a 5a 5a 00 de 6b fa 87 9f ff ff ZZZZZZZZ..k..... <3>[ 683.373571] Object 00000000d1e82276: b8 61 dd d9 87 9f ff ff a0 06 00 00 d0 06 00 00 .a.............. <3>[ 683.373585] Object 00000000cc53f969: e8 06 00 00 20 07 00 00 28 07 00 00 00 00 00 00 .... ...(....... <3>[ 683.373599] Object 00000000ea2426d2: 40 0c 8c 7b 88 9f ff ff 00 00 00 00 00 00 00 00 @..{............ <3>[ 683.373613] Object 00000000b860c1c3: 68 0d 8c 7b 88 9f ff ff 68 25 8c 7b 88 9f ff ff h..{....h%.{.... <3>[ 683.373627] Object 0000000016455ea0: 96 d5 05 00 01 00 00 00 00 5a 5a 5a 5a 5a 5a 5a .........ZZZZZZZ <3>[ 683.373640] Object 00000000e66ede82: 00 e0 6b fa 87 9f ff ff 00 e0 6b fa 87 9f ff ff ..k.......k..... <3>[ 683.373654] Object 0000000080964939: 10 e0 6b fa 87 9f ff ff 10 e0 6b fa 87 9f ff ff ..k.......k..... <3>[ 683.373668] Object 00000000e7ffc5dd: 00 00 00 00 00 00 00 00 00 01 00 00 00 00 ad de ................ <3>[ 683.373682] Object 000000000ce9d6ca: 00 02 00 00 00 00 ad de 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ <3>[ 683.373696] Object 00000000386659d0: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZZZZZZZZZ <3>[ 683.373710] Redzone 0000000075d2069d: bb bb bb bb bb bb bb bb ........ 
<3>[ 683.373723] Padding 0000000054e14c6b: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZZZZZZZZZ <3>[ 683.373737] Padding 00000000425e5b34: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZZZZZZZZZ <3>[ 683.373751] Padding 00000000ad3d4db9: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZZZZZZZZZ <4>[ 683.373767] CPU: 1 PID: 151 Comm: kworker/1:2 Tainted: G BU 5.0.0-rc8-g39139489403b-drmtip_236+ #1 <4>[ 683.373769] Hardware name: Intel Corporation Ice Lake Client Platform/IceLake Y LPDDR4x T4 RVP TLC, BIOS ICLSFWR1.R00.3087.A00.1902250334 02/25/2019 <4>[ 683.373773] Workqueue: events delayed_fput <4>[ 683.373775] Call Trace: <4>[ 683.373777] <4>[ 683.373781] dump_stack+0x67/0x9b <4>[ 683.373783] free_debug_processing+0x344/0x370 <4>[ 683.373832] ? intel_engine_breadcrumbs_irq+0x2e4/0x380 [i915] <4>[ 683.373836] __slab_free+0x337/0x4f0 <4>[ 683.373840] ? _raw_spin_unlock_irqrestore+0x39/0x60 <4>[ 683.373844] ? debug_check_no_obj_freed+0x132/0x210 <4>[ 683.373889] ? intel_engine_breadcrumbs_irq+0x2e4/0x380 [i915] <4>[ 683.373892] ? kmem_cache_free+0x275/0x2e0 <4>[ 683.373894] kmem_cache_free+0x275/0x2e0 <4>[ 683.373939] intel_engine_breadcrumbs_irq+0x2e4/0x380 [i915] <4>[ 683.373984] gen8_cs_irq_handler+0x4e/0xa0 [i915] <4>[ 683.374026] gen11_irq_handler+0x24b/0x330 [i915] <4>[ 683.374032] __handle_irq_event_percpu+0x41/0x2d0 <4>[ 683.374034] ? handle_irq_event+0x27/0x50 <4>[ 683.374038] handle_irq_event_percpu+0x2b/0x70 <4>[ 683.374040] handle_irq_event+0x2f/0x50 <4>[ 683.374044] handle_edge_irq+0xe7/0x190 <4>[ 683.374048] handle_irq+0x67/0x160 <4>[ 683.374051] do_IRQ+0x5e/0x130 <4>[ 683.374054] common_interrupt+0xf/0xf Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109827 Fixes: 52c0fdb25c7c ("drm/i915: Replace global breadcrumbs with per-context interrupt tracking") Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20190304114113.371-1-chris@chris-wilson.co.uk (cherry picked from commit e781a7a3235e9ff68095d2cd4d9c1e039a0516d7) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/intel_breadcrumbs.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c index cacaa1d04d17..09ed90c0ba00 100644 --- a/drivers/gpu/drm/i915/intel_breadcrumbs.c +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c @@ -106,16 +106,6 @@ bool intel_engine_breadcrumbs_irq(struct intel_engine_cs *engine) GEM_BUG_ON(!test_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags)); - clear_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags); - - /* - * We may race with direct invocation of - * dma_fence_signal(), e.g. i915_request_retire(), - * in which case we can skip processing it ourselves. - */ - if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, - &rq->fence.flags)) - continue; /* * Queue for execution after dropping the signaling @@ -123,6 +113,14 @@ bool intel_engine_breadcrumbs_irq(struct intel_engine_cs *engine) * more signalers to the same context or engine. */ i915_request_get(rq); + + /* + * We may race with direct invocation of + * dma_fence_signal(), e.g. i915_request_retire(), + * so we need to acquire our reference to the request + * before we cancel the breadcrumb. 
+ */ + clear_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags); list_add_tail(&rq->signal_link, &signal); } From c8c16f59b3d814a72d81e291d101d093b1383056 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Roberto=20de=20Souza?= Date: Fri, 1 Mar 2019 16:33:47 -0800 Subject: [PATCH 356/855] drm/i915: Fix atomic state leak when resetting HDMI link MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Atomic state needs to be put even if the commit was successful. Fixes: dba14b27dd3c ("drm/i915: Reinitialize sink scrambling/TMDS clock ratio on HPD") Reviewed-by: Ville Syrjälä Cc: Ville Syrjälä Cc: Lyude Paul Signed-off-by: José Roberto de Souza Link: https://patchwork.freedesktop.org/patch/msgid/20190302003349.19189-1-jose.souza@intel.com (cherry picked from commit a551cd66bc0a15ba00433743094c2453e1ee7aa9) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/intel_ddi.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_ddi.c b/drivers/gpu/drm/i915/intel_ddi.c index 2323b7cb1d38..14d580cdefd3 100644 --- a/drivers/gpu/drm/i915/intel_ddi.c +++ b/drivers/gpu/drm/i915/intel_ddi.c @@ -3969,12 +3969,7 @@ static int modeset_pipe(struct drm_crtc *crtc, goto out; ret = drm_atomic_commit(state); - if (ret) - goto out; - - return 0; - - out: +out: drm_atomic_state_put(state); return ret; From ca22f32a6296cbfa29de56328c8505560a18cfa8 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Tue, 5 Mar 2019 11:04:08 +0000 Subject: [PATCH 357/855] drm/i915: Relax mmap VMA check Legacy behaviour was to allow non-page-aligned mmap requests, as does the linux mmap(2) implementation by virtue of automatically rounding up for the caller. To avoid breaking legacy userspace relax the newly introduced fix. Signed-off-by: Tvrtko Ursulin Fixes: 5c4604e757ba ("drm/i915: Prevent a race during I915_GEM_MMAP ioctl with WC set") Reported-by: Guenter Roeck Cc: Adam Zabrocki Cc: Joonas Lahtinen Cc: # v4.0+ Cc: Akash Goel Cc: Chris Wilson Cc: Jani Nikula Cc: Rodrigo Vivi Cc: intel-gfx@lists.freedesktop.org Reviewed-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20190305110409.28633-1-tvrtko.ursulin@linux.intel.com (cherry picked from commit a90e1948efb648f567444f87f3c19b2a0787affd) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/i915_gem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 6728ea5c71d4..30d516e975c6 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -1688,7 +1688,8 @@ __vma_matches(struct vm_area_struct *vma, struct file *filp, if (vma->vm_file != filp) return false; - return vma->vm_start == addr && (vma->vm_end - vma->vm_start) == size; + return vma->vm_start == addr && + (vma->vm_end - vma->vm_start) == PAGE_ALIGN(size); } /** From 3032f0c9008088a3effdc2622ce16c3e1bcb13a2 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 7 Mar 2019 13:29:59 -0800 Subject: [PATCH 358/855] ARCv2: spinlock: remove the extra smp_mb before lock, after unlock - ARCv2 LLSC spinlocks have smp_mb() both before and after the LLSC instructions, which is not required per lkmm ACQ/REL semantics. smp_mb() is only needed _after_ lock and _before_ unlock. So remove the extra barriers. The reason they were there was mainly historical. At the time of initial SMP Linux bringup on HS38 cores, I was too conservative, given the fluidity of both hw and sw. 
The last attempt to ditch the extra barrier showed some hackbench regression which is apparently not the case now (at least for the LLSC case, read on...) - EX based spinlocks (!CONFIG_ARC_HAS_LLSC) still need the extra smp_mb(), not due to lkmm, but due to some hardware shenanigans. W/o that, hackbench triggers an RCU stall splat, so the extra DMB is retained. !LLSC based systems are not realistic Linux systems anyway, so they can afford to be a bit suboptimal ;-) | [ARCLinux]# for i in (seq 1 1 5) ; do hackbench; done | Running with 10 groups 400 process | INFO: task hackbench:158 blocked for more than 10 seconds. | Not tainted 4.20.0-00005-g96b18288a88e-dirty #117 | "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. | hackbench D 0 158 135 0x00000000 | | Stack Trace: | watchdog: BUG: soft lockup - CPU#3 stuck for 59s! [hackbench:469] | Modules linked in: | Path: (null) | CPU: 3 PID: 469 Comm: hackbench Not tainted 4.20.0-00005-g96b18288a88e-dirty | | [ECR ]: 0x00000000 => Check Programmer's Manual | [EFA ]: 0x00000000 | [BLINK ]: do_exit+0x4a6/0x7d0 | [ERET ]: _raw_write_unlock_irq+0x44/0x5c - And while at it, remove the extra smp_mb() from the EX based arch_read_trylock() since the spin lock there guarantees a full barrier anyway - For the LLSC case, hackbench threads results improve with this patch (HAPS @ 50MHz) ---- before ---- | | [ARCLinux]# for i in 1 2 3 4 5; do hackbench 10 thread; done | Running with 10 groups 400 threads | Time: 16.253 | Time: 16.445 | Time: 16.590 | Time: 16.721 | Time: 16.544 ---- after ---- | | [ARCLinux]# for i in 1 2 3 4 5; do hackbench 10 thread; done | Running with 10 groups 400 threads | Time: 15.638 | Time: 15.730 | Time: 15.870 | Time: 15.842 | Time: 15.729 Acked-by: Peter Zijlstra (Intel) Signed-off-by: Vineet Gupta --- arch/arc/include/asm/spinlock.h | 49 ++++++++++----------------------- 1 file changed, 14 insertions(+), 35 deletions(-) diff --git a/arch/arc/include/asm/spinlock.h b/arch/arc/include/asm/spinlock.h index 2ba04a7db621..daa914da7968 100644 --- a/arch/arc/include/asm/spinlock.h +++ b/arch/arc/include/asm/spinlock.h @@ -21,8 +21,6 @@ static inline void arch_spin_lock(arch_spinlock_t *lock) { unsigned int val; - smp_mb(); - __asm__ __volatile__( "1: llock %[val], [%[slock]] \n" " breq %[val], %[LOCKED], 1b \n" /* spin while LOCKED */ @@ -34,6 +32,14 @@ static inline void arch_spin_lock(arch_spinlock_t *lock) [LOCKED] "r" (__ARCH_SPIN_LOCK_LOCKED__) : "memory", "cc"); + /* + * ACQUIRE barrier to ensure load/store after taking the lock + * don't "bleed-up" out of the critical section (leak-in is allowed) + * http://www.spinics.net/lists/kernel/msg2010409.html + * + * ARCv2 only has load-load, store-store and all-all barrier + * thus need the full all-all barrier + */ smp_mb(); } @@ -42,8 +48,6 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock) { unsigned int val, got_it = 0; - smp_mb(); - __asm__ __volatile__( "1: llock %[val], [%[slock]] \n" " breq %[val], %[LOCKED], 4f \n" /* already LOCKED, just bail */ @@ -67,9 +71,7 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) { smp_mb(); - lock->slock = __ARCH_SPIN_LOCK_UNLOCKED__; - - smp_mb(); + WRITE_ONCE(lock->slock, __ARCH_SPIN_LOCK_UNLOCKED__); } /* @@ -81,8 +83,6 @@ static inline void arch_read_lock(arch_rwlock_t *rw) { unsigned int val; - smp_mb(); - /* * zero means writer holds the lock exclusively, deny Reader.
* Otherwise grant lock to first/subseq reader @@ -113,8 +113,6 @@ static inline int arch_read_trylock(arch_rwlock_t *rw) { unsigned int val, got_it = 0; - smp_mb(); - __asm__ __volatile__( "1: llock %[val], [%[rwlock]] \n" " brls %[val], %[WR_LOCKED], 4f\n" /* <= 0: already write locked, bail */ @@ -140,8 +138,6 @@ static inline void arch_write_lock(arch_rwlock_t *rw) { unsigned int val; - smp_mb(); - /* * If reader(s) hold lock (lock < __ARCH_RW_LOCK_UNLOCKED__), * deny writer. Otherwise if unlocked grant to writer @@ -175,8 +171,6 @@ static inline int arch_write_trylock(arch_rwlock_t *rw) { unsigned int val, got_it = 0; - smp_mb(); - __asm__ __volatile__( "1: llock %[val], [%[rwlock]] \n" " brne %[val], %[UNLOCKED], 4f \n" /* !UNLOCKED, bail */ @@ -217,17 +211,13 @@ static inline void arch_read_unlock(arch_rwlock_t *rw) : [val] "=&r" (val) : [rwlock] "r" (&(rw->counter)) : "memory", "cc"); - - smp_mb(); } static inline void arch_write_unlock(arch_rwlock_t *rw) { smp_mb(); - rw->counter = __ARCH_RW_LOCK_UNLOCKED__; - - smp_mb(); + WRITE_ONCE(rw->counter, __ARCH_RW_LOCK_UNLOCKED__); } #else /* !CONFIG_ARC_HAS_LLSC */ @@ -237,10 +227,9 @@ static inline void arch_spin_lock(arch_spinlock_t *lock) unsigned int val = __ARCH_SPIN_LOCK_LOCKED__; /* - * This smp_mb() is technically superfluous, we only need the one - * after the lock for providing the ACQUIRE semantics. - * However doing the "right" thing was regressing hackbench - * so keeping this, pending further investigation + * Per lkmm, smp_mb() is only required after _lock (and before_unlock) + * for ACQ and REL semantics respectively. However EX based spinlocks + * need the extra smp_mb to workaround a hardware quirk. */ smp_mb(); @@ -257,14 +246,6 @@ static inline void arch_spin_lock(arch_spinlock_t *lock) #endif : "memory"); - /* - * ACQUIRE barrier to ensure load/store after taking the lock - * don't "bleed-up" out of the critical section (leak-in is allowed) - * http://www.spinics.net/lists/kernel/msg2010409.html - * - * ARCv2 only has load-load, store-store and all-all barrier - * thus need the full all-all barrier - */ smp_mb(); } @@ -309,8 +290,7 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) : "memory"); /* - * superfluous, but keeping for now - see pairing version in - * arch_spin_lock above + * see pairing version/comment in arch_spin_lock above */ smp_mb(); } @@ -344,7 +324,6 @@ static inline int arch_read_trylock(arch_rwlock_t *rw) arch_spin_unlock(&(rw->lock_mutex)); local_irq_restore(flags); - smp_mb(); return ret; } From 7be73fa1c1b0b6aaf15d590320f2c5c1108cb87a Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 7 Mar 2019 16:48:11 -0800 Subject: [PATCH 359/855] xfs: Zero initialize highstale and lowstale in xfs_dir2_leafn_add When building with -Wsometimes-uninitialized, Clang warns: fs/xfs/libxfs/xfs_dir2_node.c:481:6: warning: variable 'lowstale' is used uninitialized whenever 'if' condition is false [-Wsometimes-uninitialized] fs/xfs/libxfs/xfs_dir2_node.c:481:6: warning: variable 'highstale' is used uninitialized whenever 'if' condition is false [-Wsometimes-uninitialized] While it isn't technically wrong, it isn't a problem in practice because highstale and lowstale are only initialized in xfs_dir2_leafn_add when compact is not zero then they are passed to xfs_dir3_leaf_find_entry, where they are initialized before use when compact is zero. Regardless, it's better not to be passing around uninitialized stack memory so zero initialize these variables, which silences this warning. 
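As a rough illustration of what the warning means (a standalone sketch, not the xfs code): a local is assigned only on one branch but read unconditionally afterwards, here by being passed by value to a callee, so Clang cannot prove that every path initializes it.

  /* Build with: clang -c -Wsometimes-uninitialized sketch.c */
  static int find_entry(int stale)
  {
          return stale + 1;
  }

  int leafn_add(int compact)
  {
          int stale;              /* warning: used uninitialized whenever 'if' condition is false */

          if (compact)
                  stale = 3;      /* only this branch writes it */

          /* zero-initializing 'stale' at its declaration silences the warning */
          return find_entry(stale);
  }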
Link: https://github.com/ClangBuiltLinux/linux/issues/393 Signed-off-by: Nathan Chancellor Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_dir2_node.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 3b03703c5c3d..de46f26c5292 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -444,6 +444,8 @@ xfs_dir2_leafn_add( dp = args->dp; leaf = bp->b_addr; + highstale = 0; + lowstale = 0; dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); ents = dp->d_ops->leaf_ents_p(leaf); From 79622c7ce6879c25ce121ee0db91c0ac4c7b137c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 7 Mar 2019 16:50:11 -0800 Subject: [PATCH 360/855] xfs: clean up xfs_dir2_leafn_add Remove typedefs and consolidate local variable initialization. Signed-off-by: Darrick J. Wong Reviewed-by: Nick Desaulniers --- fs/xfs/libxfs/xfs_dir2_node.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index de46f26c5292..16731d2d684b 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -426,26 +426,22 @@ xfs_dir2_leaf_to_node( static int /* error */ xfs_dir2_leafn_add( struct xfs_buf *bp, /* leaf buffer */ - xfs_da_args_t *args, /* operation arguments */ + struct xfs_da_args *args, /* operation arguments */ int index) /* insertion pt for new entry */ { + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_inode *dp = args->dp; + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir2_leaf_entry *lep; + struct xfs_dir2_leaf_entry *ents; int compact; /* compacting stale leaves */ - xfs_inode_t *dp; /* incore directory inode */ - int highstale; /* next stale entry */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ - xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + int highstale = 0; /* next stale entry */ int lfloghigh; /* high leaf entry logging */ int lfloglow; /* low leaf entry logging */ - int lowstale; /* previous stale entry */ - struct xfs_dir3_icleaf_hdr leafhdr; - struct xfs_dir2_leaf_entry *ents; + int lowstale = 0; /* previous stale entry */ trace_xfs_dir2_leafn_add(args, index); - dp = args->dp; - leaf = bp->b_addr; - highstale = 0; - lowstale = 0; dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); ents = dp->d_ops->leaf_ents_p(leaf); From f51fac68926235ef5bc482eb759d2c60b86fa358 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 10 Mar 2019 11:41:31 -0700 Subject: [PATCH 361/855] xfs: zero initialize highstale and lowstale in xfs_dir2_leaf_addname Smatch complains about the following: fs/xfs/libxfs/xfs_dir2_leaf.c:848 xfs_dir2_leaf_addname() error: uninitialized symbol 'lowstale'. fs/xfs/libxfs/xfs_dir2_leaf.c:849 xfs_dir2_leaf_addname() error: uninitialized symbol 'highstale'. I don't think there's any incorrect behavior associated with the uninitialized variable, but as the author of the previous zero-init patch points out, it's best not to be passing around pointers to uninitialized stack areas. Signed-off-by: Darrick J. 
Wong Reviewed-by: Nathan Chancellor Reviewed-by: Allison Henderson Reviewed-by: Bill O'Donnell --- fs/xfs/libxfs/xfs_dir2_leaf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index 9a3767818c50..2abf945e5844 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -574,7 +574,7 @@ xfs_dir2_leaf_addname( xfs_dir2_data_unused_t *dup; /* data unused entry */ int error; /* error return value */ int grown; /* allocated new data block */ - int highstale; /* index of next stale leaf */ + int highstale = 0; /* index of next stale leaf */ int i; /* temporary, index */ int index; /* leaf table position */ struct xfs_buf *lbp; /* leaf's buffer */ @@ -583,7 +583,7 @@ xfs_dir2_leaf_addname( xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */ int lfloglow; /* low leaf logging index */ int lfloghigh; /* high leaf logging index */ - int lowstale; /* index of prev stale leaf */ + int lowstale = 0; /* index of prev stale leaf */ xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */ int needbytes; /* leaf block bytes needed */ int needlog; /* need to log data header */ From 62c9d2674b31d4c8a674bee86b7edc6da2803aea Mon Sep 17 00:00:00 2001 From: ZhangXiaoxu Date: Sat, 2 Mar 2019 09:17:32 +0800 Subject: [PATCH 362/855] inotify: Fix fsnotify_mark refcount leak in inotify_update_existing_watch() Commit 4d97f7d53da7dc83 ("inotify: Add flag IN_MASK_CREATE for inotify_add_watch()") forgot to call fsnotify_put_mark() with IN_MASK_CREATE after fsnotify_find_mark() Fixes: 4d97f7d53da7dc83 ("inotify: Add flag IN_MASK_CREATE for inotify_add_watch()") Signed-off-by: ZhangXiaoxu Signed-off-by: Jan Kara --- fs/notify/inotify/inotify_user.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index e2901fbb9f76..7b53598c8804 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -519,8 +519,10 @@ static int inotify_update_existing_watch(struct fsnotify_group *group, fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); if (!fsn_mark) return -ENOENT; - else if (create) - return -EEXIST; + else if (create) { + ret = -EEXIST; + goto out; + } i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); @@ -548,6 +550,7 @@ static int inotify_update_existing_watch(struct fsnotify_group *group, /* return the wd */ ret = i_mark->wd; +out: /* match the get from fsnotify_find_mark() */ fsnotify_put_mark(fsn_mark); From d9c1bb2f6a2157b38e8eb63af437cb22701d31ee Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 7 Mar 2019 10:52:33 -0800 Subject: [PATCH 363/855] perf/core: Restore mmap record type correctly On mmap(), perf_events generates a RECORD_MMAP record and then checks which events are interested in this record. There are currently 2 versions of mmap records: RECORD_MMAP and RECORD_MMAP2. MMAP2 is larger. The event configuration controls which version the user level tool accepts. If the event->attr.mmap2=1 field then MMAP2 record is returned. The perf_event_mmap_output() takes care of this. It checks attr->mmap2 and corrects the record fields before putting it in the sampling buffer of the event. At the end the function restores the modified MMAP record fields. The problem is that the function restores the size but not the type. Thus, if a subsequent event only accepts MMAP type, then it would instead receive an MMAP2 record with a size of MMAP record. 
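The shape of the bug, as a hedged sketch rather than the actual perf_event_mmap_output() code (field and constant names here follow the description above, not necessarily the exact kernel definitions): the header is temporarily rewritten for MMAP2-capable consumers, so both modified fields must be put back before the next event is handled.

  struct header { unsigned int type; unsigned short size; };

  enum { RECORD_MMAP = 1, RECORD_MMAP2 = 2 };

  static void output_one(struct header *hdr, int wants_mmap2)
  {
          unsigned int   saved_type = hdr->type;
          unsigned short saved_size = hdr->size;

          if (wants_mmap2) {
                  hdr->type = RECORD_MMAP2;
                  hdr->size += 8;          /* pretend MMAP2 carries extra fields */
          }

          /* ... emit the record for this event ... */

          hdr->size = saved_size;
          hdr->type = saved_type;          /* restoring only the size leaves the next
                                              consumer seeing an MMAP2 type with an
                                              MMAP-sized record */
  }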
This patch fixes the problem by restoring the record type on exit. Signed-off-by: Stephane Eranian Acked-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Jiri Olsa Cc: Kan Liang Fixes: 13d7a2410fa6 ("perf: Add attr->mmap2 attribute to an event") Link: http://lkml.kernel.org/r/20190307185233.225521-1-eranian@google.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/events/core.c b/kernel/events/core.c index 6fb27b564730..514b8e014a2d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7189,6 +7189,7 @@ static void perf_event_mmap_output(struct perf_event *event, struct perf_output_handle handle; struct perf_sample_data sample; int size = mmap_event->event_id.header.size; + u32 type = mmap_event->event_id.header.type; int ret; if (!perf_event_mmap_match(event, data)) @@ -7232,6 +7233,7 @@ static void perf_event_mmap_output(struct perf_event *event, perf_output_end(&handle); out: mmap_event->event_id.header.size = size; + mmap_event->event_id.header.type = type; } static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) From 3ab481a1cfe1511b94e142b648e2c5ade9175ed3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 5 Mar 2019 06:47:45 -0800 Subject: [PATCH 364/855] perf script: Support insn output for normal samples MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit perf script -F +insn was only working for PT traces because the PT instruction decoder was filling in the insn/insn_len sample attributes. Support it for non PT samples too on x86 using the existing x86 instruction decoder. This adds some extra checking to ensure that we don't try to decode instructions when using perf.data from a different architecture. % perf record -a sleep 1 % perf script -F ip,sym,insn --xed ffffffff811704c9 remote_function movl %eax, 0x18(%rbx) ffffffff8100bb50 intel_bts_enable_local retq ffffffff81048612 native_apic_mem_write movl %esi, -0xa04000(%rdi) ffffffff81048612 native_apic_mem_write movl %esi, -0xa04000(%rdi) ffffffff81048612 native_apic_mem_write movl %esi, -0xa04000(%rdi) ffffffff810f1f79 generic_exec_single xor %eax, %eax ffffffff811704c9 remote_function movl %eax, 0x18(%rbx) ffffffff8100bb34 intel_bts_enable_local movl 0x2000(%rax), %edx ffffffff81048610 native_apic_mem_write mov %edi, %edi ... 
Committer testing: Before: # perf script -F ip,sym,insn --xed | head -5 ffffffffa4068804 native_write_msr addb %al, (%rax) ffffffffa4068804 native_write_msr addb %al, (%rax) ffffffffa4068804 native_write_msr addb %al, (%rax) ffffffffa4068806 native_write_msr addb %al, (%rax) ffffffffa4068806 native_write_msr addb %al, (%rax) # perf script -F ip,sym,insn --xed | grep -v "addb %al, (%rax)" # After: # perf script -F ip,sym,insn --xed | head -5 ffffffffa4068804 native_write_msr wrmsr ffffffffa4068804 native_write_msr wrmsr ffffffffa4068804 native_write_msr wrmsr ffffffffa4068806 native_write_msr nopl %eax, (%rax,%rax,1) ffffffffa4068806 native_write_msr nopl %eax, (%rax,%rax,1) # perf script -F ip,sym,insn --xed | grep -v "addb %al, (%rax)" | head -5 ffffffffa4068804 native_write_msr wrmsr ffffffffa4068804 native_write_msr wrmsr ffffffffa4068804 native_write_msr wrmsr ffffffffa4068806 native_write_msr nopl %eax, (%rax,%rax,1) ffffffffa4068806 native_write_msr nopl %eax, (%rax,%rax,1) # More examples: # perf script -F ip,sym,insn --xed | grep -v native_write_msr | head ffffffffa416b90e tick_check_broadcast_expired btq %rax, 0x1a5f42a(%rip) ffffffffa4956bd0 nmi_cpu_backtrace pushq %r13 ffffffffa415b95e __hrtimer_next_event_base movq 0x18(%rax), %rdx ffffffffa4956bf3 nmi_cpu_backtrace popq %r12 ffffffffa4171d5c smp_call_function_single pause ffffffffa4956bdd nmi_cpu_backtrace mov %ebp, %r12d ffffffffa4797e4d menu_select cmp $0x190, %rax ffffffffa4171d5c smp_call_function_single pause ffffffffa405a7d8 nmi_cpu_backtrace_handler callq 0xffffffffa4956bd0 ffffffffa4797f7a menu_select shr $0x3, %rax # Which matches the annotate output modulo resolving callqs: # perf annotate --stdio2 nmi_cpu_backtrace_handler Samples: 4 of event 'cycles:ppp', 4000 Hz, Event count (approx.): 35908, [percent: local period] nmi_cpu_backtrace_handler() /lib/modules/5.0.0+/build/vmlinux Percent Disassembly of section .text: ffffffff8105a7d0 : nmi_cpu_backtrace_handler(): nmi_trigger_cpumask_backtrace(mask, exclude_self, nmi_raise_cpu_backtrace); } static int nmi_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs) { 24.45 → callq __fentry__ if (nmi_cpu_backtrace(regs)) mov %rsi,%rdi 75.55 → callq nmi_cpu_backtrace return NMI_HANDLED; movzbl %al,%eax return NMI_DONE; } ← retq # # perf annotate --stdio2 __hrtimer_next_event_base Samples: 4 of event 'cycles:ppp', 4000 Hz, Event count (approx.): 767977, [percent: local period] __hrtimer_next_event_base() /lib/modules/5.0.0+/build/vmlinux Percent Disassembly of section .text: ffffffff8115b910 <__hrtimer_next_event_base>: __hrtimer_next_event_base(): static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, const struct hrtimer *exclude, unsigned int active, ktime_t expires_next) { → callq __fentry__ 4a: add $0x1,%r14 77.31 mov 0x18(%rax),%rdx shl $0x6,%r14 sub 0x38(%rbx,%r14,1),%rdx if (expires < expires_next) { cmp %r12,%rdx ↓ jge 68 Signed-off-by: Andi Kleen Tested-by: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/20190305144758.12397-3-andi@firstfloor.org [ Converted fetch_exe() to use the name it ended up having when merged: thread__memcpy() ] [ archinsn.c needs the instruction decoder that is only build when CONFIG_AUXTRACE=y, fix that ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/util/Build | 1 + tools/perf/arch/x86/util/archinsn.c | 26 ++++++++++++++++++++++++++ tools/perf/builtin-script.c | 21 ++++++++++++++++++++- tools/perf/util/archinsn.h | 12 ++++++++++++ 4 files changed, 59 
insertions(+), 1 deletion(-) create mode 100644 tools/perf/arch/x86/util/archinsn.c create mode 100644 tools/perf/util/archinsn.h diff --git a/tools/perf/arch/x86/util/Build b/tools/perf/arch/x86/util/Build index 7aab0be5fc5f..47f9c56e744f 100644 --- a/tools/perf/arch/x86/util/Build +++ b/tools/perf/arch/x86/util/Build @@ -14,5 +14,6 @@ perf-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o perf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o perf-$(CONFIG_AUXTRACE) += auxtrace.o +perf-$(CONFIG_AUXTRACE) += archinsn.o perf-$(CONFIG_AUXTRACE) += intel-pt.o perf-$(CONFIG_AUXTRACE) += intel-bts.o diff --git a/tools/perf/arch/x86/util/archinsn.c b/tools/perf/arch/x86/util/archinsn.c new file mode 100644 index 000000000000..4237bb2e7fa2 --- /dev/null +++ b/tools/perf/arch/x86/util/archinsn.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "perf.h" +#include "archinsn.h" +#include "util/intel-pt-decoder/insn.h" +#include "machine.h" +#include "thread.h" +#include "symbol.h" + +void arch_fetch_insn(struct perf_sample *sample, + struct thread *thread, + struct machine *machine) +{ + struct insn insn; + int len; + bool is64bit = false; + + if (!sample->ip) + return; + len = thread__memcpy(thread, machine, sample->insn, sample->ip, sizeof(sample->insn), &is64bit); + if (len <= 0) + return; + insn_init(&insn, sample->insn, len, is64bit); + insn_get_length(&insn); + if (insn_complete(&insn) && insn.length <= len) + sample->insn_len = insn.length; +} diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 53f78cf3113f..a5080afd361d 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -29,10 +29,12 @@ #include "util/time-utils.h" #include "util/path.h" #include "print_binary.h" +#include "archinsn.h" #include #include #include #include +#include #include "asm/bug.h" #include "util/mem-events.h" #include "util/dump-insn.h" @@ -63,6 +65,7 @@ static const char *cpu_list; static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); static struct perf_stat_config stat_config; static int max_blocks; +static bool native_arch; unsigned int scripting_max_stack = PERF_MAX_STACK_DEPTH; @@ -1227,6 +1230,12 @@ static int perf_sample__fprintf_callindent(struct perf_sample *sample, return len + dlen; } +__weak void arch_fetch_insn(struct perf_sample *sample __maybe_unused, + struct thread *thread __maybe_unused, + struct machine *machine __maybe_unused) +{ +} + static int perf_sample__fprintf_insn(struct perf_sample *sample, struct perf_event_attr *attr, struct thread *thread, @@ -1234,9 +1243,12 @@ static int perf_sample__fprintf_insn(struct perf_sample *sample, { int printed = 0; + if (sample->insn_len == 0 && native_arch) + arch_fetch_insn(sample, thread, machine); + if (PRINT_FIELD(INSNLEN)) printed += fprintf(fp, " ilen: %d", sample->insn_len); - if (PRINT_FIELD(INSN)) { + if (PRINT_FIELD(INSN) && sample->insn_len) { int i; printed += fprintf(fp, " insn:"); @@ -3277,6 +3289,7 @@ int cmd_script(int argc, const char **argv) .set = false, .default_no_sample = true, }; + struct utsname uts; char *script_path = NULL; const char **__argv; int i, j, err = 0; @@ -3615,6 +3628,12 @@ int cmd_script(int argc, const char **argv) if (symbol__init(&session->header.env) < 0) goto out_delete; + uname(&uts); + if (!strcmp(uts.machine, session->header.env.arch) || + (!strcmp(uts.machine, "x86_64") && + !strcmp(session->header.env.arch, "i386"))) + native_arch = true; + script.session = session; script__setup_sample_type(&script); diff --git a/tools/perf/util/archinsn.h 
b/tools/perf/util/archinsn.h new file mode 100644 index 000000000000..448cbb6b8d7e --- /dev/null +++ b/tools/perf/util/archinsn.h @@ -0,0 +1,12 @@ +#ifndef INSN_H +#define INSN_H 1 + +struct perf_sample; +struct machine; +struct thread; + +void arch_fetch_insn(struct perf_sample *sample, + struct thread *thread, + struct machine *machine); + +#endif From 52bab8868211b7c504146f6239e101421d4d125b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 5 Mar 2019 06:47:47 -0800 Subject: [PATCH 365/855] perf report: Support output in nanoseconds Upcoming changes add timestamp output in perf report. Add a --ns argument similar to perf script to support nanoseconds resolution when needed. Signed-off-by: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/20190305144758.12397-5-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-report.txt | 3 +++ tools/perf/builtin-report.c | 1 + tools/perf/builtin-script.c | 11 +++++------ tools/perf/util/symbol.c | 1 + tools/perf/util/symbol_conf.h | 1 + 5 files changed, 11 insertions(+), 6 deletions(-) diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index 1a27bfe05039..51dbc519dbce 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -477,6 +477,9 @@ include::itrace.txt[] Please note that not all mmaps are stored, options affecting which ones are include 'perf record --data', for instance. +--ns:: + Show time stamps in nanoseconds. + --stats:: Display overall events statistics without any further processing. (like the one at the end of the perf report -D command) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index ee93c18a6685..515864ba504a 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -1147,6 +1147,7 @@ int cmd_report(int argc, const char **argv) OPT_CALLBACK(0, "percent-type", &report.annotation_opts, "local-period", "Set percent type local/global-period/hits", annotate_parse_percent_type), + OPT_BOOLEAN(0, "ns", &symbol_conf.nanosecs, "Show times in nanosecs"), OPT_END() }; struct perf_data data = { diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index a5080afd361d..111787e83784 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -60,7 +60,6 @@ static bool no_callchain; static bool latency_format; static bool system_wide; static bool print_flags; -static bool nanosecs; static const char *cpu_list; static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); static struct perf_stat_config stat_config; @@ -691,7 +690,7 @@ static int perf_sample__fprintf_start(struct perf_sample *sample, secs = nsecs / NSEC_PER_SEC; nsecs -= secs * NSEC_PER_SEC; - if (nanosecs) + if (symbol_conf.nanosecs) printed += fprintf(fp, "%5lu.%09llu: ", secs, nsecs); else { char sample_time[32]; @@ -3244,7 +3243,7 @@ static int parse_insn_trace(const struct option *opt __maybe_unused, { parse_output_fields(NULL, "+insn,-event,-period", 0); itrace_parse_synth_opts(opt, "i0ns", 0); - nanosecs = true; + symbol_conf.nanosecs = true; return 0; } @@ -3262,7 +3261,7 @@ static int parse_call_trace(const struct option *opt __maybe_unused, { parse_output_fields(NULL, "-ip,-addr,-event,-period,+callindent", 0); itrace_parse_synth_opts(opt, "cewp", 0); - nanosecs = true; + symbol_conf.nanosecs = true; return 0; } @@ -3272,7 +3271,7 @@ static int parse_callret_trace(const struct option *opt __maybe_unused, { parse_output_fields(NULL, 
"-ip,-addr,-event,-period,+callindent,+flags", 0); itrace_parse_synth_opts(opt, "crewp", 0); - nanosecs = true; + symbol_conf.nanosecs = true; return 0; } @@ -3408,7 +3407,7 @@ int cmd_script(int argc, const char **argv) OPT_BOOLEAN('f', "force", &symbol_conf.force, "don't complain, do it"), OPT_INTEGER(0, "max-blocks", &max_blocks, "Maximum number of code blocks to dump with brstackinsn"), - OPT_BOOLEAN(0, "ns", &nanosecs, + OPT_BOOLEAN(0, "ns", &symbol_conf.nanosecs, "Use 9 decimal places when displaying time"), OPT_CALLBACK_OPTARG(0, "itrace", &itrace_synth_opts, NULL, "opts", "Instruction Tracing options\n" ITRACE_HELP, diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 758bf5f74e6e..eb873ea1c405 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -39,6 +39,7 @@ int vmlinux_path__nr_entries; char **vmlinux_path; struct symbol_conf symbol_conf = { + .nanosecs = false, .use_modules = true, .try_vmlinux_path = true, .demangle = true, diff --git a/tools/perf/util/symbol_conf.h b/tools/perf/util/symbol_conf.h index fffea68c1203..095a297c8b47 100644 --- a/tools/perf/util/symbol_conf.h +++ b/tools/perf/util/symbol_conf.h @@ -8,6 +8,7 @@ struct strlist; struct intlist; struct symbol_conf { + bool nanosecs; unsigned short priv_size; bool try_vmlinux_path, init_annotation, From f8c856cb2c947f4fad0a2dff5e95cdcddb801303 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 5 Mar 2019 06:47:53 -0800 Subject: [PATCH 366/855] perf time-utils: Add utility function to print time stamps in nanoseconds Add a utility function to print nanosecond timestamps. Signed-off-by: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/20190305144758.12397-11-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/time-utils.c | 8 ++++++++ tools/perf/util/time-utils.h | 1 + 2 files changed, 9 insertions(+) diff --git a/tools/perf/util/time-utils.c b/tools/perf/util/time-utils.c index 0f53baec660e..20663a460df3 100644 --- a/tools/perf/util/time-utils.c +++ b/tools/perf/util/time-utils.c @@ -453,6 +453,14 @@ int timestamp__scnprintf_usec(u64 timestamp, char *buf, size_t sz) return scnprintf(buf, sz, "%"PRIu64".%06"PRIu64, sec, usec); } +int timestamp__scnprintf_nsec(u64 timestamp, char *buf, size_t sz) +{ + u64 sec = timestamp / NSEC_PER_SEC, + nsec = timestamp % NSEC_PER_SEC; + + return scnprintf(buf, sz, "%" PRIu64 ".%09" PRIu64, sec, nsec); +} + int fetch_current_timestamp(char *buf, size_t sz) { struct timeval tv; diff --git a/tools/perf/util/time-utils.h b/tools/perf/util/time-utils.h index b923de44e36f..72a42ea1d513 100644 --- a/tools/perf/util/time-utils.h +++ b/tools/perf/util/time-utils.h @@ -30,6 +30,7 @@ int perf_time__parse_for_ranges(const char *str, struct perf_session *session, int *range_size, int *range_num); int timestamp__scnprintf_usec(u64 timestamp, char *buf, size_t sz); +int timestamp__scnprintf_nsec(u64 timestamp, char *buf, size_t sz); int fetch_current_timestamp(char *buf, size_t sz); From 2a1292cbd4e5c81edbf815a410fa2072c341db1e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 5 Mar 2019 06:47:48 -0800 Subject: [PATCH 367/855] perf report: Parse time quantum Many workloads change over time. 'perf report' currently aggregates the whole time range reported in perf.data. This patch adds an option for a time quantum to quantisize the perf.data over time. This just adds the option, will be used in follow on patches for a time sort key. 
Signed-off-by: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/20190305144758.12397-6-andi@firstfloor.org [ Use NSEC_PER_[MU]SEC ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-report.txt | 4 +++ tools/perf/builtin-report.c | 42 ++++++++++++++++++++++++ tools/perf/util/symbol.c | 2 ++ tools/perf/util/symbol_conf.h | 1 + 4 files changed, 49 insertions(+) diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index 51dbc519dbce..9ec1702bccdd 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -497,6 +497,10 @@ include::itrace.txt[] The period/hits keywords set the base the percentage is computed on - the samples period or the number of samples (hits). +--time-quantum:: + Configure time quantum for time sort key. Default 100ms. + Accepts s, us, ms, ns units. + include::callchain-overhead-calculation.txt[] SEE ALSO diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 515864ba504a..05c8dd41106c 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -47,9 +47,11 @@ #include #include #include +#include "sane_ctype.h" #include #include #include +#include #include #include #include @@ -926,6 +928,43 @@ report_parse_callchain_opt(const struct option *opt, const char *arg, int unset) return parse_callchain_report_opt(arg); } +static int +parse_time_quantum(const struct option *opt, const char *arg, + int unset __maybe_unused) +{ + unsigned long *time_q = opt->value; + char *end; + + *time_q = strtoul(arg, &end, 0); + if (end == arg) + goto parse_err; + if (*time_q == 0) { + pr_err("time quantum cannot be 0"); + return -1; + } + while (isspace(*end)) + end++; + if (*end == 0) + return 0; + if (!strcmp(end, "s")) { + *time_q *= NSEC_PER_SEC; + return 0; + } + if (!strcmp(end, "ms")) { + *time_q *= NSEC_PER_MSEC; + return 0; + } + if (!strcmp(end, "us")) { + *time_q *= NSEC_PER_USEC; + return 0; + } + if (!strcmp(end, "ns")) + return 0; +parse_err: + pr_err("Cannot parse time quantum `%s'\n", arg); + return -1; +} + int report_parse_ignore_callees_opt(const struct option *opt __maybe_unused, const char *arg, int unset __maybe_unused) @@ -1148,6 +1187,9 @@ int cmd_report(int argc, const char **argv) "Set percent type local/global-period/hits", annotate_parse_percent_type), OPT_BOOLEAN(0, "ns", &symbol_conf.nanosecs, "Show times in nanosecs"), + OPT_CALLBACK(0, "time-quantum", &symbol_conf.time_quantum, "time (ms|us|ns|s)", + "Set time quantum for time sort key (default 100ms)", + parse_time_quantum), OPT_END() }; struct perf_data data = { diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index eb873ea1c405..6b73a0eeb6a1 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -45,6 +46,7 @@ struct symbol_conf symbol_conf = { .demangle = true, .demangle_kernel = false, .cumulate_callchain = true, + .time_quantum = 100 * NSEC_PER_MSEC, /* 100ms */ .show_hist_headers = true, .symfs = "", .event_group = true, diff --git a/tools/perf/util/symbol_conf.h b/tools/perf/util/symbol_conf.h index 095a297c8b47..a5684a71b78e 100644 --- a/tools/perf/util/symbol_conf.h +++ b/tools/perf/util/symbol_conf.h @@ -56,6 +56,7 @@ struct symbol_conf { *sym_list_str, *col_width_list_str, *bt_stop_list_str; + unsigned long time_quantum; struct strlist *dso_list, *comm_list, *sym_list, From eaeffeb9838a7c0dec981d258666bfcc0fa6a947 
Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 4 Mar 2019 15:13:21 +0200 Subject: [PATCH 368/855] perf probe: Fix getting the kernel map Since commit 4d99e4136580 ("perf machine: Workaround missing maps for x86 PTI entry trampolines"), perf tools has been creating more than one kernel map; however, 'perf probe' assumed there could be only one. Fix by using machine__kernel_map() to get the main kernel map. Signed-off-by: Adrian Hunter Tested-by: Joseph Qi Acked-by: Masami Hiramatsu Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Greg Kroah-Hartman Cc: Jiufei Xue Cc: Peter Zijlstra Cc: stable@vger.kernel.org Cc: Xu Yu Fixes: 4d99e4136580 ("perf machine: Workaround missing maps for x86 PTI entry trampolines") Fixes: d83212d5dd67 ("kallsyms, x86: Export addresses of PTI entry trampolines") Link: http://lkml.kernel.org/r/2ed432de-e904-85d2-5c36-5897ddc5b23b@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-event.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index a1b8d9649ca7..198e09ff611e 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -160,8 +160,10 @@ static struct map *kernel_get_module_map(const char *module) if (module && strchr(module, '/')) return dso__new_map(module); - if (!module) - module = "kernel"; + if (!module) { + pos = machine__kernel_map(host_machine); + return map__get(pos); + } for (pos = maps__first(maps); pos; pos = map__next(pos)) { /* short_name is "[module]" */ From 98c07a8f74f85a19aeee2016f5afa0c667fa694d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Li=C5=A1ka?= Date: Wed, 13 Feb 2019 12:19:16 +0100 Subject: [PATCH 369/855] perf vendor events amd: perf PMU events for AMD Family 17h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds PMC events for AMD Family 17h CPUs as defined in [1]. It covers the events described in section 2.1.13. The regex pattern in mapfile.csv covers all CPUs of the family.
[1] https://support.amd.com/TechDocs/54945_PPR_Family_17h_Models_00h-0Fh.pdf Signed-off-by: Martin Liška Acked-by: Borislav Petkov Cc: Jiri Olsa Cc: Jon Grimm Cc: Martin Jambor Cc: William Cohen Link: https://lkml.kernel.org/r/d65873ca-e402-b198-4fe9-8c4af81258c8@suse.cz Signed-off-by: Arnaldo Carvalho de Melo --- .../pmu-events/arch/x86/amdfam17h/branch.json | 12 + .../pmu-events/arch/x86/amdfam17h/cache.json | 287 ++++++++++++++++++ .../pmu-events/arch/x86/amdfam17h/core.json | 134 ++++++++ .../arch/x86/amdfam17h/floating-point.json | 168 ++++++++++ .../pmu-events/arch/x86/amdfam17h/memory.json | 162 ++++++++++ .../pmu-events/arch/x86/amdfam17h/other.json | 65 ++++ tools/perf/pmu-events/arch/x86/mapfile.csv | 1 + 7 files changed, 829 insertions(+) create mode 100644 tools/perf/pmu-events/arch/x86/amdfam17h/branch.json create mode 100644 tools/perf/pmu-events/arch/x86/amdfam17h/cache.json create mode 100644 tools/perf/pmu-events/arch/x86/amdfam17h/core.json create mode 100644 tools/perf/pmu-events/arch/x86/amdfam17h/floating-point.json create mode 100644 tools/perf/pmu-events/arch/x86/amdfam17h/memory.json create mode 100644 tools/perf/pmu-events/arch/x86/amdfam17h/other.json diff --git a/tools/perf/pmu-events/arch/x86/amdfam17h/branch.json b/tools/perf/pmu-events/arch/x86/amdfam17h/branch.json new file mode 100644 index 000000000000..93ddfd8053ca --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdfam17h/branch.json @@ -0,0 +1,12 @@ +[ + { + "EventName": "bp_l1_btb_correct", + "EventCode": "0x8a", + "BriefDescription": "L1 BTB Correction." + }, + { + "EventName": "bp_l2_btb_correct", + "EventCode": "0x8b", + "BriefDescription": "L2 BTB Correction." + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdfam17h/cache.json b/tools/perf/pmu-events/arch/x86/amdfam17h/cache.json new file mode 100644 index 000000000000..fad4af9142cb --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdfam17h/cache.json @@ -0,0 +1,287 @@ +[ + { + "EventName": "ic_fw32", + "EventCode": "0x80", + "BriefDescription": "The number of 32B fetch windows transferred from IC pipe to DE instruction decoder (includes non-cacheable and cacheable fill responses)." + }, + { + "EventName": "ic_fw32_miss", + "EventCode": "0x81", + "BriefDescription": "The number of 32B fetch windows tried to read the L1 IC and missed in the full tag." + }, + { + "EventName": "ic_cache_fill_l2", + "EventCode": "0x82", + "BriefDescription": "The number of 64 byte instruction cache line was fulfilled from the L2 cache." + }, + { + "EventName": "ic_cache_fill_sys", + "EventCode": "0x83", + "BriefDescription": "The number of 64 byte instruction cache line fulfilled from system memory or another cache." + }, + { + "EventName": "bp_l1_tlb_miss_l2_hit", + "EventCode": "0x84", + "BriefDescription": "The number of instruction fetches that miss in the L1 ITLB but hit in the L2 ITLB." + }, + { + "EventName": "bp_l1_tlb_miss_l2_miss", + "EventCode": "0x85", + "BriefDescription": "The number of instruction fetches that miss in both the L1 and L2 TLBs." + }, + { + "EventName": "bp_snp_re_sync", + "EventCode": "0x86", + "BriefDescription": "The number of pipeline restarts caused by invalidating probes that hit on the instruction stream currently being executed. This would happen if the active instruction stream was being modified by another processor in an MP system - typically a highly unlikely event." 
+ }, + { + "EventName": "ic_fetch_stall.ic_stall_any", + "EventCode": "0x87", + "BriefDescription": "IC pipe was stalled during this clock cycle for any reason (nothing valid in pipe ICM1).", + "PublicDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle for any reason (nothing valid in pipe ICM1).", + "UMask": "0x4" + }, + { + "EventName": "ic_fetch_stall.ic_stall_dq_empty", + "EventCode": "0x87", + "BriefDescription": "IC pipe was stalled during this clock cycle (including IC to OC fetches) due to DQ empty.", + "PublicDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle (including IC to OC fetches) due to DQ empty.", + "UMask": "0x2" + }, + { + "EventName": "ic_fetch_stall.ic_stall_back_pressure", + "EventCode": "0x87", + "BriefDescription": "IC pipe was stalled during this clock cycle (including IC to OC fetches) due to back-pressure.", + "PublicDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle (including IC to OC fetches) due to back-pressure.", + "UMask": "0x1" + }, + { + "EventName": "ic_cache_inval.l2_invalidating_probe", + "EventCode": "0x8c", + "BriefDescription": "IC line invalidated due to L2 invalidating probe (external or LS).", + "PublicDescription": "The number of instruction cache lines invalidated. A non-SMC event is CMC (cross modifying code), either from the other thread of the core or another core. IC line invalidated due to L2 invalidating probe (external or LS).", + "UMask": "0x2" + }, + { + "EventName": "ic_cache_inval.fill_invalidated", + "EventCode": "0x8c", + "BriefDescription": "IC line invalidated due to overwriting fill response.", + "PublicDescription": "The number of instruction cache lines invalidated. A non-SMC event is CMC (cross modifying code), either from the other thread of the core or another core. IC line invalidated due to overwriting fill response.", + "UMask": "0x1" + }, + { + "EventName": "bp_tlb_rel", + "EventCode": "0x99", + "BriefDescription": "The number of ITLB reload requests." 
+ }, + { + "EventName": "l2_request_g1.rd_blk_l", + "EventCode": "0x60", + "BriefDescription": "Requests to L2 Group1.", + "PublicDescription": "Requests to L2 Group1.", + "UMask": "0x80" + }, + { + "EventName": "l2_request_g1.rd_blk_x", + "EventCode": "0x60", + "BriefDescription": "Requests to L2 Group1.", + "PublicDescription": "Requests to L2 Group1.", + "UMask": "0x40" + }, + { + "EventName": "l2_request_g1.ls_rd_blk_c_s", + "EventCode": "0x60", + "BriefDescription": "Requests to L2 Group1.", + "PublicDescription": "Requests to L2 Group1.", + "UMask": "0x20" + }, + { + "EventName": "l2_request_g1.cacheable_ic_read", + "EventCode": "0x60", + "BriefDescription": "Requests to L2 Group1.", + "PublicDescription": "Requests to L2 Group1.", + "UMask": "0x10" + }, + { + "EventName": "l2_request_g1.change_to_x", + "EventCode": "0x60", + "BriefDescription": "Requests to L2 Group1.", + "PublicDescription": "Requests to L2 Group1.", + "UMask": "0x8" + }, + { + "EventName": "l2_request_g1.prefetch_l2", + "EventCode": "0x60", + "BriefDescription": "Requests to L2 Group1.", + "PublicDescription": "Requests to L2 Group1.", + "UMask": "0x4" + }, + { + "EventName": "l2_request_g1.l2_hw_pf", + "EventCode": "0x60", + "BriefDescription": "Requests to L2 Group1.", + "PublicDescription": "Requests to L2 Group1.", + "UMask": "0x2" + }, + { + "EventName": "l2_request_g1.other_requests", + "EventCode": "0x60", + "BriefDescription": "Events covered by l2_request_g2.", + "PublicDescription": "Requests to L2 Group1. Events covered by l2_request_g2.", + "UMask": "0x1" + }, + { + "EventName": "l2_request_g2.group1", + "EventCode": "0x61", + "BriefDescription": "All Group 1 commands not in unit0.", + "PublicDescription": "Multi-events in that LS and IF requests can be received simultaneous. All Group 1 commands not in unit0.", + "UMask": "0x80" + }, + { + "EventName": "l2_request_g2.ls_rd_sized", + "EventCode": "0x61", + "BriefDescription": "RdSized, RdSized32, RdSized64.", + "PublicDescription": "Multi-events in that LS and IF requests can be received simultaneous. RdSized, RdSized32, RdSized64.", + "UMask": "0x40" + }, + { + "EventName": "l2_request_g2.ls_rd_sized_nc", + "EventCode": "0x61", + "BriefDescription": "RdSizedNC, RdSized32NC, RdSized64NC.", + "PublicDescription": "Multi-events in that LS and IF requests can be received simultaneous. 
RdSizedNC, RdSized32NC, RdSized64NC.", + "UMask": "0x20" + }, + { + "EventName": "l2_request_g2.ic_rd_sized", + "EventCode": "0x61", + "BriefDescription": "Multi-events in that LS and IF requests can be received simultaneous.", + "PublicDescription": "Multi-events in that LS and IF requests can be received simultaneous.", + "UMask": "0x10" + }, + { + "EventName": "l2_request_g2.ic_rd_sized_nc", + "EventCode": "0x61", + "BriefDescription": "Multi-events in that LS and IF requests can be received simultaneous.", + "PublicDescription": "Multi-events in that LS and IF requests can be received simultaneous.", + "UMask": "0x8" + }, + { + "EventName": "l2_request_g2.smc_inval", + "EventCode": "0x61", + "BriefDescription": "Multi-events in that LS and IF requests can be received simultaneous.", + "PublicDescription": "Multi-events in that LS and IF requests can be received simultaneous.", + "UMask": "0x4" + }, + { + "EventName": "l2_request_g2.bus_locks_originator", + "EventCode": "0x61", + "BriefDescription": "Multi-events in that LS and IF requests can be received simultaneous.", + "PublicDescription": "Multi-events in that LS and IF requests can be received simultaneous.", + "UMask": "0x2" + }, + { + "EventName": "l2_request_g2.bus_locks_responses", + "EventCode": "0x61", + "BriefDescription": "Multi-events in that LS and IF requests can be received simultaneous.", + "PublicDescription": "Multi-events in that LS and IF requests can be received simultaneous.", + "UMask": "0x1" + }, + { + "EventName": "l2_latency.l2_cycles_waiting_on_fills", + "EventCode": "0x62", + "BriefDescription": "Total cycles spent waiting for L2 fills to complete from L3 or memory, divided by four. Event counts are for both threads. To calculate average latency, the number of fills from both threads must be used.", + "PublicDescription": "Total cycles spent waiting for L2 fills to complete from L3 or memory, divided by four. Event counts are for both threads. To calculate average latency, the number of fills from both threads must be used.", + "UMask": "0x1" + }, + { + "EventName": "l2_wcb_req.wcb_write", + "EventCode": "0x63", + "PublicDescription": "LS (Load/Store unit) to L2 WCB (Write Combining Buffer) write requests.", + "BriefDescription": "LS to L2 WCB write requests.", + "UMask": "0x40" + }, + { + "EventName": "l2_wcb_req.wcb_close", + "EventCode": "0x63", + "BriefDescription": "LS to L2 WCB close requests.", + "PublicDescription": "LS (Load/Store unit) to L2 WCB (Write Combining Buffer) close requests.", + "UMask": "0x20" + }, + { + "EventName": "l2_wcb_req.zero_byte_store", + "EventCode": "0x63", + "BriefDescription": "LS to L2 WCB zero byte store requests.", + "PublicDescription": "LS (Load/Store unit) to L2 WCB (Write Combining Buffer) zero byte store requests.", + "UMask": "0x4" + }, + { + "EventName": "l2_wcb_req.cl_zero", + "EventCode": "0x63", + "PublicDescription": "LS to L2 WCB cache line zeroing requests.", + "BriefDescription": "LS (Load/Store unit) to L2 WCB (Write Combining Buffer) cache line zeroing requests.", + "UMask": "0x1" + }, + { + "EventName": "l2_cache_req_stat.ls_rd_blk_cs", + "EventCode": "0x64", + "BriefDescription": "LS ReadBlock C/S Hit.", + "PublicDescription": "This event does not count accesses to the L2 cache by the L2 prefetcher, but it does count accesses by the L1 prefetcher. 
LS ReadBlock C/S Hit.", + "UMask": "0x80" + }, + { + "EventName": "l2_cache_req_stat.ls_rd_blk_l_hit_x", + "EventCode": "0x64", + "BriefDescription": "LS Read Block L Hit X.", + "PublicDescription": "This event does not count accesses to the L2 cache by the L2 prefetcher, but it does count accesses by the L1 prefetcher. LS Read Block L Hit X.", + "UMask": "0x40" + }, + { + "EventName": "l2_cache_req_stat.ls_rd_blk_l_hit_s", + "EventCode": "0x64", + "BriefDescription": "LsRdBlkL Hit Shared.", + "PublicDescription": "This event does not count accesses to the L2 cache by the L2 prefetcher, but it does count accesses by the L1 prefetcher. LsRdBlkL Hit Shared.", + "UMask": "0x20" + }, + { + "EventName": "l2_cache_req_stat.ls_rd_blk_x", + "EventCode": "0x64", + "BriefDescription": "LsRdBlkX/ChgToX Hit X. Count RdBlkX finding Shared as a Miss.", + "PublicDescription": "This event does not count accesses to the L2 cache by the L2 prefetcher, but it does count accesses by the L1 prefetcher. LsRdBlkX/ChgToX Hit X. Count RdBlkX finding Shared as a Miss.", + "UMask": "0x10" + }, + { + "EventName": "l2_cache_req_stat.ls_rd_blk_c", + "EventCode": "0x64", + "BriefDescription": "LS Read Block C S L X Change to X Miss.", + "PublicDescription": "This event does not count accesses to the L2 cache by the L2 prefetcher, but it does count accesses by the L1 prefetcher. LS Read Block C S L X Change to X Miss.", + "UMask": "0x8" + }, + { + "EventName": "l2_cache_req_stat.ic_fill_hit_x", + "EventCode": "0x64", + "BriefDescription": "IC Fill Hit Exclusive Stale.", + "PublicDescription": "This event does not count accesses to the L2 cache by the L2 prefetcher, but it does count accesses by the L1 prefetcher. IC Fill Hit Exclusive Stale.", + "UMask": "0x4" + }, + { + "EventName": "l2_cache_req_stat.ic_fill_hit_s", + "EventCode": "0x64", + "BriefDescription": "IC Fill Hit Shared.", + "PublicDescription": "This event does not count accesses to the L2 cache by the L2 prefetcher, but it does count accesses by the L1 prefetcher. IC Fill Hit Shared.", + "UMask": "0x2" + }, + { + "EventName": "l2_cache_req_stat.ic_fill_miss", + "EventCode": "0x64", + "BriefDescription": "IC Fill Miss.", + "PublicDescription": "This event does not count accesses to the L2 cache by the L2 prefetcher, but it does count accesses by the L1 prefetcher. IC Fill Miss.", + "UMask": "0x1" + }, + { + "EventName": "l2_fill_pending.l2_fill_busy", + "EventCode": "0x6d", + "BriefDescription": "Total cycles spent with one or more fill requests in flight from L2.", + "PublicDescription": "Total cycles spent with one or more fill requests in flight from L2.", + "UMask": "0x1" + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdfam17h/core.json b/tools/perf/pmu-events/arch/x86/amdfam17h/core.json new file mode 100644 index 000000000000..7b285b0a7f35 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdfam17h/core.json @@ -0,0 +1,134 @@ +[ + { + "EventName": "ex_ret_instr", + "EventCode": "0xc0", + "BriefDescription": "Retired Instructions." + }, + { + "EventName": "ex_ret_cops", + "EventCode": "0xc1", + "BriefDescription": "Retired Uops.", + "PublicDescription": "The number of uOps retired. This includes all processor activity (instructions, exceptions, interrupts, microcode assists, etc.). The number of events logged per cycle can vary from 0 to 4." + }, + { + "EventName": "ex_ret_brn", + "EventCode": "0xc2", + "BriefDescription": "[Retired Branch Instructions.", + "PublicDescription": "The number of branch instructions retired. 
This includes all types of architectural control flow changes, including exceptions and interrupts." + }, + { + "EventName": "ex_ret_brn_misp", + "EventCode": "0xc3", + "BriefDescription": "Retired Branch Instructions Mispredicted.", + "PublicDescription": "The number of branch instructions retired, of any type, that were not correctly predicted. This includes those for which prediction is not attempted (far control transfers, exceptions and interrupts)." + }, + { + "EventName": "ex_ret_brn_tkn", + "EventCode": "0xc4", + "BriefDescription": "Retired Taken Branch Instructions.", + "PublicDescription": "The number of taken branches that were retired. This includes all types of architectural control flow changes, including exceptions and interrupts." + }, + { + "EventName": "ex_ret_brn_tkn_misp", + "EventCode": "0xc5", + "BriefDescription": "Retired Taken Branch Instructions Mispredicted.", + "PublicDescription": "The number of retired taken branch instructions that were mispredicted." + }, + { + "EventName": "ex_ret_brn_far", + "EventCode": "0xc6", + "BriefDescription": "Retired Far Control Transfers.", + "PublicDescription": "The number of far control transfers retired including far call/jump/return, IRET, SYSCALL and SYSRET, plus exceptions and interrupts. Far control transfers are not subject to branch prediction." + }, + { + "EventName": "ex_ret_brn_resync", + "EventCode": "0xc7", + "BriefDescription": "Retired Branch Resyncs.", + "PublicDescription": "The number of resync branches. These reflect pipeline restarts due to certain microcode assists and events such as writes to the active instruction stream, among other things. Each occurrence reflects a restart penalty similar to a branch mispredict. This is relatively rare." + }, + { + "EventName": "ex_ret_near_ret", + "EventCode": "0xc8", + "BriefDescription": "Retired Near Returns.", + "PublicDescription": "The number of near return instructions (RET or RET Iw) retired." + }, + { + "EventName": "ex_ret_near_ret_mispred", + "EventCode": "0xc9", + "BriefDescription": "Retired Near Returns Mispredicted.", + "PublicDescription": "The number of near returns retired that were not correctly predicted by the return address predictor. Each such mispredict incurs the same penalty as a mispredicted conditional branch instruction." + }, + { + "EventName": "ex_ret_brn_ind_misp", + "EventCode": "0xca", + "BriefDescription": "Retired Indirect Branch Instructions Mispredicted.", + "PublicDescription": "Retired Indirect Branch Instructions Mispredicted." + }, + { + "EventName": "ex_ret_mmx_fp_instr.sse_instr", + "EventCode": "0xcb", + "BriefDescription": "SSE instructions (SSE, SSE2, SSE3, SSSE3, SSE4A, SSE41, SSE42, AVX).", + "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. SSE instructions (SSE, SSE2, SSE3, SSSE3, SSE4A, SSE41, SSE42, AVX).", + "UMask": "0x4" + }, + { + "EventName": "ex_ret_mmx_fp_instr.mmx_instr", + "EventCode": "0xcb", + "BriefDescription": "MMX instructions.", + "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. 
Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. MMX instructions.", + "UMask": "0x2" + }, + { + "EventName": "ex_ret_mmx_fp_instr.x87_instr", + "EventCode": "0xcb", + "BriefDescription": "x87 instructions.", + "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. x87 instructions.", + "UMask": "0x1" + }, + { + "EventName": "ex_ret_cond", + "EventCode": "0xd1", + "BriefDescription": "Retired Conditional Branch Instructions." + }, + { + "EventName": "ex_ret_cond_misp", + "EventCode": "0xd2", + "BriefDescription": "Retired Conditional Branch Instructions Mispredicted." + }, + { + "EventName": "ex_div_busy", + "EventCode": "0xd3", + "BriefDescription": "Div Cycles Busy count." + }, + { + "EventName": "ex_div_count", + "EventCode": "0xd4", + "BriefDescription": "Div Op Count." + }, + { + "EventName": "ex_tagged_ibs_ops.ibs_count_rollover", + "EventCode": "0x1cf", + "BriefDescription": "Number of times an op could not be tagged by IBS because of a previous tagged op that has not retired.", + "PublicDescription": "Tagged IBS Ops. Number of times an op could not be tagged by IBS because of a previous tagged op that has not retired.", + "UMask": "0x4" + }, + { + "EventName": "ex_tagged_ibs_ops.ibs_tagged_ops_ret", + "EventCode": "0x1cf", + "BriefDescription": "Number of Ops tagged by IBS that retired.", + "PublicDescription": "Tagged IBS Ops. Number of Ops tagged by IBS that retired.", + "UMask": "0x2" + }, + { + "EventName": "ex_tagged_ibs_ops.ibs_tagged_ops", + "EventCode": "0x1cf", + "BriefDescription": "Number of Ops tagged by IBS.", + "PublicDescription": "Tagged IBS Ops. Number of Ops tagged by IBS.", + "UMask": "0x1" + }, + { + "EventName": "ex_ret_fus_brnch_inst", + "EventCode": "0x1d0", + "BriefDescription": "The number of fused retired branch instructions retired per cycle. The number of events logged per cycle can vary from 0 to 3." + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdfam17h/floating-point.json b/tools/perf/pmu-events/arch/x86/amdfam17h/floating-point.json new file mode 100644 index 000000000000..ea4711983d1d --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdfam17h/floating-point.json @@ -0,0 +1,168 @@ +[ + { + "EventName": "fpu_pipe_assignment.dual", + "EventCode": "0x00", + "BriefDescription": "Total number multi-pipe uOps.", + "PublicDescription": "The number of operations (uOps) and dual-pipe uOps dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number multi-pipe uOps assigned to Pipe 3.", + "UMask": "0xf0" + }, + { + "EventName": "fpu_pipe_assignment.total", + "EventCode": "0x00", + "BriefDescription": "Total number uOps.", + "PublicDescription": "The number of operations (uOps) and dual-pipe uOps dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. 
This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to Pipe 3.", + "UMask": "0xf" + }, + { + "EventName": "fp_sched_empty", + "EventCode": "0x01", + "BriefDescription": "This is a speculative event. The number of cycles in which the FPU scheduler is empty. Note that some Ops like FP loads bypass the scheduler." + }, + { + "EventName": "fp_retx87_fp_ops.all", + "EventCode": "0x02", + "BriefDescription": "All Ops.", + "PublicDescription": "The number of x87 floating-point Ops that have retired. The number of events logged per cycle can vary from 0 to 8.", + "UMask": "0x7" + }, + { + "EventName": "fp_retx87_fp_ops.div_sqr_r_ops", + "EventCode": "0x02", + "BriefDescription": "Divide and square root Ops.", + "PublicDescription": "The number of x87 floating-point Ops that have retired. The number of events logged per cycle can vary from 0 to 8. Divide and square root Ops.", + "UMask": "0x4" + }, + { + "EventName": "fp_retx87_fp_ops.mul_ops", + "EventCode": "0x02", + "BriefDescription": "Multiply Ops.", + "PublicDescription": "The number of x87 floating-point Ops that have retired. The number of events logged per cycle can vary from 0 to 8. Multiply Ops.", + "UMask": "0x2" + }, + { + "EventName": "fp_retx87_fp_ops.add_sub_ops", + "EventCode": "0x02", + "BriefDescription": "Add/subtract Ops.", + "PublicDescription": "The number of x87 floating-point Ops that have retired. The number of events logged per cycle can vary from 0 to 8. Add/subtract Ops.", + "UMask": "0x1" + }, + { + "EventName": "fp_ret_sse_avx_ops.all", + "EventCode": "0x03", + "BriefDescription": "All FLOPS.", + "PublicDescription": "This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15.", + "UMask": "0xff" + }, + { + "EventName": "fp_ret_sse_avx_ops.dp_mult_add_flops", + "EventCode": "0x03", + "BriefDescription": "Double precision multiply-add FLOPS. Multiply-add counts as 2 FLOPS.", + "PublicDescription": "This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15. Double precision multiply-add FLOPS. Multiply-add counts as 2 FLOPS.", + "UMask": "0x80" + }, + { + "EventName": "fp_ret_sse_avx_ops.dp_div_flops", + "EventCode": "0x03", + "BriefDescription": "Double precision divide/square root FLOPS.", + "PublicDescription": "This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15. Double precision divide/square root FLOPS.", + "UMask": "0x40" + }, + { + "EventName": "fp_ret_sse_avx_ops.dp_mult_flops", + "EventCode": "0x03", + "BriefDescription": "Double precision multiply FLOPS.", + "PublicDescription": "This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15. Double precision multiply FLOPS.", + "UMask": "0x20" + }, + { + "EventName": "fp_ret_sse_avx_ops.dp_add_sub_flops", + "EventCode": "0x03", + "BriefDescription": "Double precision add/subtract FLOPS.", + "PublicDescription": "This is a retire-based event. The number of retired SSE/AVX FLOPS. 
The number of events logged per cycle can vary from 0 to 64. This event can count above 15. Double precision add/subtract FLOPS.", + "UMask": "0x10" + }, + { + "EventName": "fp_ret_sse_avx_ops.sp_mult_add_flops", + "EventCode": "0x03", + "BriefDescription": "Single precision multiply-add FLOPS. Multiply-add counts as 2 FLOPS.", + "PublicDescription": "This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15. Single precision multiply-add FLOPS. Multiply-add counts as 2 FLOPS.", + "UMask": "0x8" + }, + { + "EventName": "fp_ret_sse_avx_ops.sp_div_flops", + "EventCode": "0x03", + "BriefDescription": "Single-precision divide/square root FLOPS.", + "PublicDescription": "This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15. Single-precision divide/square root FLOPS.", + "UMask": "0x4" + }, + { + "EventName": "fp_ret_sse_avx_ops.sp_mult_flops", + "EventCode": "0x03", + "BriefDescription": "Single-precision multiply FLOPS.", + "PublicDescription": "This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15. Single-precision multiply FLOPS.", + "UMask": "0x2" + }, + { + "EventName": "fp_ret_sse_avx_ops.sp_add_sub_flops", + "EventCode": "0x03", + "BriefDescription": "Single-precision add/subtract FLOPS.", + "PublicDescription": "This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15. Single-precision add/subtract FLOPS.", + "UMask": "0x1" + }, + { + "EventName": "fp_num_mov_elim_scal_op.optimized", + "EventCode": "0x04", + "BriefDescription": "Number of Scalar Ops optimized.", + "PublicDescription": "This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes. Number of Scalar Ops optimized.", + "UMask": "0x8" + }, + { + "EventName": "fp_num_mov_elim_scal_op.opt_potential", + "EventCode": "0x04", + "BriefDescription": "Number of Ops that are candidates for optimization (have Z-bit either set or pass).", + "PublicDescription": "This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes. Number of Ops that are candidates for optimization (have Z-bit either set or pass).", + "UMask": "0x4" + }, + { + "EventName": "fp_num_mov_elim_scal_op.sse_mov_ops_elim", + "EventCode": "0x04", + "BriefDescription": "Number of SSE Move Ops eliminated.", + "PublicDescription": "This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes. Number of SSE Move Ops eliminated.", + "UMask": "0x2" + }, + { + "EventName": "fp_num_mov_elim_scal_op.sse_mov_ops", + "EventCode": "0x04", + "BriefDescription": "Number of SSE Move Ops.", + "PublicDescription": "This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes. 
Number of SSE Move Ops.", + "UMask": "0x1" + }, + { + "EventName": "fp_retired_ser_ops.x87_ctrl_ret", + "EventCode": "0x05", + "BriefDescription": "x87 control word mispredict traps due to mispredictions in RC or PC, or changes in mask bits.", + "PublicDescription": "The number of serializing Ops retired. x87 control word mispredict traps due to mispredictions in RC or PC, or changes in mask bits.", + "UMask": "0x8" + }, + { + "EventName": "fp_retired_ser_ops.x87_bot_ret", + "EventCode": "0x05", + "BriefDescription": "x87 bottom-executing uOps retired.", + "PublicDescription": "The number of serializing Ops retired. x87 bottom-executing uOps retired.", + "UMask": "0x4" + }, + { + "EventName": "fp_retired_ser_ops.sse_ctrl_ret", + "EventCode": "0x05", + "BriefDescription": "SSE control word mispredict traps due to mispredictions in RC, FTZ or DAZ, or changes in mask bits.", + "PublicDescription": "The number of serializing Ops retired. SSE control word mispredict traps due to mispredictions in RC, FTZ or DAZ, or changes in mask bits.", + "UMask": "0x2" + }, + { + "EventName": "fp_retired_ser_ops.sse_bot_ret", + "EventCode": "0x05", + "BriefDescription": "SSE bottom-executing uOps retired.", + "PublicDescription": "The number of serializing Ops retired. SSE bottom-executing uOps retired.", + "UMask": "0x1" + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdfam17h/memory.json b/tools/perf/pmu-events/arch/x86/amdfam17h/memory.json new file mode 100644 index 000000000000..fa2d60d4def0 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdfam17h/memory.json @@ -0,0 +1,162 @@ +[ + { + "EventName": "ls_locks.bus_lock", + "EventCode": "0x25", + "BriefDescription": "Bus lock when a locked operations crosses a cache boundary or is done on an uncacheable memory type.", + "PublicDescription": "Bus lock when a locked operations crosses a cache boundary or is done on an uncacheable memory type.", + "UMask": "0x1" + }, + { + "EventName": "ls_dispatch.ld_st_dispatch", + "EventCode": "0x29", + "BriefDescription": "Load-op-Stores.", + "PublicDescription": "Counts the number of operations dispatched to the LS unit. Unit Masks ADDed. Load-op-Stores.", + "UMask": "0x4" + }, + { + "EventName": "ls_dispatch.store_dispatch", + "EventCode": "0x29", + "BriefDescription": "Counts the number of operations dispatched to the LS unit. Unit Masks ADDed.", + "PublicDescription": "Counts the number of operations dispatched to the LS unit. Unit Masks ADDed.", + "UMask": "0x2" + }, + { + "EventName": "ls_dispatch.ld_dispatch", + "EventCode": "0x29", + "BriefDescription": "Counts the number of operations dispatched to the LS unit. Unit Masks ADDed.", + "PublicDescription": "Counts the number of operations dispatched to the LS unit. Unit Masks ADDed.", + "UMask": "0x1" + }, + { + "EventName": "ls_stlf", + "EventCode": "0x35", + "BriefDescription": "Number of STLF hits." + }, + { + "EventName": "ls_dc_accesses", + "EventCode": "0x40", + "BriefDescription": "The number of accesses to the data cache for load and store references. This may include certain microcode scratchpad accesses, although these are generally rare. Each increment represents an eight-byte access, although the instruction may only be accessing a portion of that. This event is a speculative event." 
+ }, + { + "EventName": "ls_l1_d_tlb_miss.all", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB Miss or Reload off all sizes.", + "PublicDescription": "L1 DTLB Miss or Reload off all sizes.", + "UMask": "0xff" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_1g_l2_miss", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB Miss of a page of 1G size.", + "PublicDescription": "L1 DTLB Miss of a page of 1G size.", + "UMask": "0x80" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_2m_l2_miss", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB Miss of a page of 2M size.", + "PublicDescription": "L1 DTLB Miss of a page of 2M size.", + "UMask": "0x40" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_32k_l2_miss", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB Miss of a page of 32K size.", + "PublicDescription": "L1 DTLB Miss of a page of 32K size.", + "UMask": "0x20" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_4k_l2_miss", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB Miss of a page of 4K size.", + "PublicDescription": "L1 DTLB Miss of a page of 4K size.", + "UMask": "0x10" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_1g_l2_hit", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB Reload of a page of 1G size.", + "PublicDescription": "L1 DTLB Reload of a page of 1G size.", + "UMask": "0x8" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_2m_l2_hit", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB Reload of a page of 2M size.", + "PublicDescription": "L1 DTLB Reload of a page of 2M size.", + "UMask": "0x4" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_32k_l2_hit", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB Reload of a page of 32K size.", + "PublicDescription": "L1 DTLB Reload of a page of 32K size.", + "UMask": "0x2" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_4k_l2_hit", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB Reload of a page of 4K size.", + "PublicDescription": "L1 DTLB Reload of a page of 4K size.", + "UMask": "0x1" + }, + { + "EventName": "ls_tablewalker.perf_mon_tablewalk_alloc_iside", + "EventCode": "0x46", + "BriefDescription": "Tablewalker allocation.", + "PublicDescription": "Tablewalker allocation.", + "UMask": "0xc" + }, + { + "EventName": "ls_tablewalker.perf_mon_tablewalk_alloc_dside", + "EventCode": "0x46", + "BriefDescription": "Tablewalker allocation.", + "PublicDescription": "Tablewalker allocation.", + "UMask": "0x3" + }, + { + "EventName": "ls_misal_accesses", + "EventCode": "0x47", + "BriefDescription": "Misaligned loads." + }, + { + "EventName": "ls_pref_instr_disp.prefetch_nta", + "EventCode": "0x4b", + "BriefDescription": "Software Prefetch Instructions (PREFETCHNTA instruction) Dispatched.", + "PublicDescription": "Software Prefetch Instructions (PREFETCHNTA instruction) Dispatched.", + "UMask": "0x4" + }, + { + "EventName": "ls_pref_instr_disp.store_prefetch_w", + "EventCode": "0x4b", + "BriefDescription": "Software Prefetch Instructions (3DNow PREFETCHW instruction) Dispatched.", + "PublicDescription": "Software Prefetch Instructions (3DNow PREFETCHW instruction) Dispatched.", + "UMask": "0x2" + }, + { + "EventName": "ls_pref_instr_disp.load_prefetch_w", + "EventCode": "0x4b", + "BriefDescription": "Prefetch, Prefetch_T0_T1_T2.", + "PublicDescription": "Software Prefetch Instructions Dispatched. 
Prefetch, Prefetch_T0_T1_T2.", + "UMask": "0x1" + }, + { + "EventName": "ls_inef_sw_pref.mab_mch_cnt", + "EventCode": "0x52", + "BriefDescription": "The number of software prefetches that did not fetch data outside of the processor core.", + "PublicDescription": "The number of software prefetches that did not fetch data outside of the processor core.", + "UMask": "0x2" + }, + { + "EventName": "ls_inef_sw_pref.data_pipe_sw_pf_dc_hit", + "EventCode": "0x52", + "BriefDescription": "The number of software prefetches that did not fetch data outside of the processor core.", + "PublicDescription": "The number of software prefetches that did not fetch data outside of the processor core.", + "UMask": "0x1" + }, + { + "EventName": "ls_not_halted_cyc", + "EventCode": "0x76", + "BriefDescription": "Cycles not in Halt." + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdfam17h/other.json b/tools/perf/pmu-events/arch/x86/amdfam17h/other.json new file mode 100644 index 000000000000..b26a00d05a2e --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdfam17h/other.json @@ -0,0 +1,65 @@ +[ + { + "EventName": "ic_oc_mode_switch.oc_ic_mode_switch", + "EventCode": "0x28a", + "BriefDescription": "OC to IC mode switch.", + "PublicDescription": "OC Mode Switch. OC to IC mode switch.", + "UMask": "0x2" + }, + { + "EventName": "ic_oc_mode_switch.ic_oc_mode_switch", + "EventCode": "0x28a", + "BriefDescription": "IC to OC mode switch.", + "PublicDescription": "OC Mode Switch. IC to OC mode switch.", + "UMask": "0x1" + }, + { + "EventName": "de_dis_dispatch_token_stalls0.retire_token_stall", + "EventCode": "0xaf", + "BriefDescription": "RETIRE Tokens unavailable.", + "PublicDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. RETIRE Tokens unavailable.", + "UMask": "0x40" + }, + { + "EventName": "de_dis_dispatch_token_stalls0.agsq_token_stall", + "EventCode": "0xaf", + "BriefDescription": "AGSQ Tokens unavailable.", + "PublicDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. AGSQ Tokens unavailable.", + "UMask": "0x20" + }, + { + "EventName": "de_dis_dispatch_token_stalls0.alu_token_stall", + "EventCode": "0xaf", + "BriefDescription": "ALU tokens total unavailable.", + "PublicDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALU tokens total unavailable.", + "UMask": "0x10" + }, + { + "EventName": "de_dis_dispatch_token_stalls0.alsq3_0_token_stall", + "EventCode": "0xaf", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall.", + "PublicDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall.", + "UMask": "0x8" + }, + { + "EventName": "de_dis_dispatch_token_stalls0.alsq3_token_stall", + "EventCode": "0xaf", + "BriefDescription": "ALSQ 3 Tokens unavailable.", + "PublicDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALSQ 3 Tokens unavailable.", + "UMask": "0x4" + }, + { + "EventName": "de_dis_dispatch_token_stalls0.alsq2_token_stall", + "EventCode": "0xaf", + "BriefDescription": "ALSQ 2 Tokens unavailable.", + "PublicDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. 
ALSQ 2 Tokens unavailable.", + "UMask": "0x2" + }, + { + "EventName": "de_dis_dispatch_token_stalls0.alsq1_token_stall", + "EventCode": "0xaf", + "BriefDescription": "ALSQ 1 Tokens unavailable.", + "PublicDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALSQ 1 Tokens unavailable.", + "UMask": "0x1" + } +] diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index e05c2c8458fc..d6984a3017e0 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -33,3 +33,4 @@ GenuineIntel-6-25,v2,westmereep-sp,core GenuineIntel-6-2F,v2,westmereex,core GenuineIntel-6-55-[01234],v1,skylakex,core GenuineIntel-6-55-[56789ABCDEF],v1,cascadelakex,core +AuthenticAMD-23-[[:xdigit:]]+,v1,amdfam17h,core From ec65def1045e4c7817b7f741a86dadae82877a93 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 Mar 2019 14:47:35 +0100 Subject: [PATCH 370/855] perf data: Support having perf.data stored as a directory The caller needs to set 'struct perf_data::is_dir flag and the path will be treated as a directory. The 'struct perf_data::file' is initialized and open as 'path/header' file. Add a check to the direcory interface functions to check the is_dir flag. Signed-off-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexey Budankov Cc: Andi Kleen Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/20190308134745.5057-2-jolsa@kernel.org [ Be consistent on how to signal failure, i.e. use -1 and let users check errno ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/data.c | 49 ++++++++++++++++++++++++++++++++++++++- tools/perf/util/data.h | 6 +++++ tools/perf/util/session.c | 4 ++++ 3 files changed, 58 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c index e098e189f93e..18fa8d4614eb 100644 --- a/tools/perf/util/data.c +++ b/tools/perf/util/data.c @@ -34,6 +34,9 @@ int perf_data__create_dir(struct perf_data *data, int nr) struct perf_data_file *files = NULL; int i, ret = -1; + if (WARN_ON(!data->is_dir)) + return -EINVAL; + files = zalloc(nr * sizeof(*files)); if (!files) return -ENOMEM; @@ -69,6 +72,9 @@ int perf_data__open_dir(struct perf_data *data) DIR *dir; int nr = 0; + if (WARN_ON(!data->is_dir)) + return -EINVAL; + dir = opendir(data->path); if (!dir) return -EINVAL; @@ -173,6 +179,16 @@ static int check_backup(struct perf_data *data) return 0; } +static bool is_dir(struct perf_data *data) +{ + struct stat st; + + if (stat(data->path, &st)) + return false; + + return (st.st_mode & S_IFMT) == S_IFDIR; +} + static int open_file_read(struct perf_data *data) { struct stat st; @@ -254,6 +270,30 @@ static int open_file_dup(struct perf_data *data) return open_file(data); } +static int open_dir(struct perf_data *data) +{ + int ret; + + /* + * So far we open only the header, so we can read the data version and + * layout. + */ + if (asprintf(&data->file.path, "%s/header", data->path) < 0) + return -1; + + if (perf_data__is_write(data) && + mkdir(data->path, S_IRWXU) < 0) + return -1; + + ret = open_file(data); + + /* Cleanup whatever we managed to create so far. 
*/ + if (ret && perf_data__is_write(data)) + rm_rf_perf_data(data->path); + + return ret; +} + int perf_data__open(struct perf_data *data) { if (check_pipe(data)) @@ -265,11 +305,18 @@ int perf_data__open(struct perf_data *data) if (check_backup(data)) return -1; - return open_file_dup(data); + if (perf_data__is_read(data)) + data->is_dir = is_dir(data); + + return perf_data__is_dir(data) ? + open_dir(data) : open_file_dup(data); } void perf_data__close(struct perf_data *data) { + if (perf_data__is_dir(data)) + perf_data__close_dir(data); + zfree(&data->file.path); close(data->file.fd); } diff --git a/tools/perf/util/data.h b/tools/perf/util/data.h index 14b47be2bd69..06aefeda311f 100644 --- a/tools/perf/util/data.h +++ b/tools/perf/util/data.h @@ -19,6 +19,7 @@ struct perf_data { const char *path; struct perf_data_file file; bool is_pipe; + bool is_dir; bool force; enum perf_data_mode mode; @@ -43,6 +44,11 @@ static inline int perf_data__is_pipe(struct perf_data *data) return data->is_pipe; } +static inline bool perf_data__is_dir(struct perf_data *data) +{ + return data->is_dir; +} + static inline int perf_data__fd(struct perf_data *data) { return data->file.fd; diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index db643f3c2b95..de777bdc0ed3 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -152,6 +152,10 @@ struct perf_session *perf_session__new(struct perf_data *data, } perf_evlist__init_trace_event_sample_raw(session->evlist); + + /* Open the directory data. */ + if (data->is_dir && perf_data__open_dir(data)) + goto out_delete; } } else { session->machines.host.env = &perf_env; From cd3dd8dd8ff62374d90cb3f2e54b8c94106c7810 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 Mar 2019 14:47:36 +0100 Subject: [PATCH 371/855] perf data: Don't store auxtrace index for directory data file We can't store the auxtrace index when we store into multiple files, because we keep only offset for it, not the file. The auxtrace data will be processed correctly in the 'pipe' mode. Signed-off-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexey Budankov Cc: Andi Kleen Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/20190308134745.5057-3-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index f3f7f3100336..e983c8d71a79 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -392,7 +392,7 @@ static int record__process_auxtrace(struct perf_tool *tool, size_t padding; u8 pad[8] = {0}; - if (!perf_data__is_pipe(data)) { + if (!perf_data__is_pipe(data) && !perf_data__is_dir(data)) { off_t file_offset; int fd = perf_data__fd(data); int err; From e8be135751f26aa5de63e517d375ecf69e9b20c3 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 Mar 2019 14:47:37 +0100 Subject: [PATCH 372/855] perf data: Add perf_data__update_dir() function Add perf_data__update_dir() to update the size for every file within the perf.data directory. 
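As a rough sketch of the intended use (the helper below is hypothetical and not part of this patch; only perf_data__is_dir() and perf_data__update_dir() come from this series), a writer would refresh the per-file sizes once it has stopped appending to the directory:

  /* hypothetical caller, for illustration only */
  static int refresh_dir_sizes(struct perf_data *data)
  {
          /* only meaningful for directory data */
          if (!perf_data__is_dir(data))
                  return 0;

          /* fstat()s every data file and stores st_size in file->size */
          return perf_data__update_dir(data);
  }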
Signed-off-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexey Budankov Cc: Andi Kleen Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/20190308134745.5057-4-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/data.c | 20 ++++++++++++++++++++ tools/perf/util/data.h | 1 + 2 files changed, 21 insertions(+) diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c index 18fa8d4614eb..5d0f26aef77f 100644 --- a/tools/perf/util/data.c +++ b/tools/perf/util/data.c @@ -124,6 +124,26 @@ out_err: return ret; } +int perf_data__update_dir(struct perf_data *data) +{ + int i; + + if (WARN_ON(!data->is_dir)) + return -EINVAL; + + for (i = 0; i < data->dir.nr; i++) { + struct perf_data_file *file = &data->dir.files[i]; + struct stat st; + + if (fstat(file->fd, &st)) + return -1; + + file->size = st.st_size; + } + + return 0; +} + static bool check_pipe(struct perf_data *data) { struct stat st; diff --git a/tools/perf/util/data.h b/tools/perf/util/data.h index 06aefeda311f..0deeb1af9f54 100644 --- a/tools/perf/util/data.h +++ b/tools/perf/util/data.h @@ -79,4 +79,5 @@ int perf_data__switch(struct perf_data *data, int perf_data__create_dir(struct perf_data *data, int nr); int perf_data__open_dir(struct perf_data *data); void perf_data__close_dir(struct perf_data *data); +int perf_data__update_dir(struct perf_data *data); #endif /* __PERF_DATA_H */ From 29583c17b5ce0bc17fdd80da939f8199a03d9668 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 Mar 2019 14:47:38 +0100 Subject: [PATCH 373/855] perf data: Make perf_data__size() work over directory Make perf_data__size() return proper size for directory data, summing up all the individual file sizes. Signed-off-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexey Budankov Cc: Andi Kleen Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/20190308134745.5057-5-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/data.c | 17 +++++++++++++++++ tools/perf/util/data.h | 6 +----- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c index 5d0f26aef77f..b71c441cafbb 100644 --- a/tools/perf/util/data.c +++ b/tools/perf/util/data.c @@ -393,3 +393,20 @@ out: free(new_filepath); return ret; } + +unsigned long perf_data__size(struct perf_data *data) +{ + u64 size = data->file.size; + int i; + + if (!data->is_dir) + return size; + + for (i = 0; i < data->dir.nr; i++) { + struct perf_data_file *file = &data->dir.files[i]; + + size += file->size; + } + + return size; +} diff --git a/tools/perf/util/data.h b/tools/perf/util/data.h index 0deeb1af9f54..d342469bdfda 100644 --- a/tools/perf/util/data.h +++ b/tools/perf/util/data.h @@ -54,11 +54,6 @@ static inline int perf_data__fd(struct perf_data *data) return data->file.fd; } -static inline unsigned long perf_data__size(struct perf_data *data) -{ - return data->file.size; -} - int perf_data__open(struct perf_data *data); void perf_data__close(struct perf_data *data); ssize_t perf_data__write(struct perf_data *data, @@ -80,4 +75,5 @@ int perf_data__create_dir(struct perf_data *data, int nr); int perf_data__open_dir(struct perf_data *data); void perf_data__close_dir(struct perf_data *data); int perf_data__update_dir(struct perf_data *data); +unsigned long perf_data__size(struct perf_data *data); #endif /* __PERF_DATA_H */ From 258031c017c353e899902342a25579fc81a34cc1 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 
Mar 2019 14:47:39 +0100 Subject: [PATCH 374/855] perf header: Add DIR_FORMAT feature to describe directory data The data files layout is described by HEADER_DIR_FORMAT feature. Currently it holds only version number (1): uint64_t version; The current version holds only version value (1) means that data files: - Follow the 'data.*' name format. - Contain raw events data in standard perf format as read from kernel (and need to be sorted) Future versions are expected to describe different data files layout according to special needs. Signed-off-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexey Budankov Cc: Andi Kleen Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/20190308134745.5057-6-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 2 ++ tools/perf/util/data.c | 10 +++++++-- tools/perf/util/data.h | 1 + tools/perf/util/header.c | 44 ++++++++++++++++++++++++++++++++++++- tools/perf/util/header.h | 5 +++++ 5 files changed, 59 insertions(+), 3 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index e983c8d71a79..a468d882e74f 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -837,6 +837,8 @@ static void record__init_features(struct record *rec) if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns)) perf_header__clear_feat(&session->header, HEADER_CLOCKID); + perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT); + perf_header__clear_feat(&session->header, HEADER_STAT); } diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c index b71c441cafbb..c6b67efea11a 100644 --- a/tools/perf/util/data.c +++ b/tools/perf/util/data.c @@ -14,6 +14,7 @@ #include "data.h" #include "util.h" #include "debug.h" +#include "header.h" static void close_dir(struct perf_data_file *files, int nr) { @@ -41,8 +42,9 @@ int perf_data__create_dir(struct perf_data *data, int nr) if (!files) return -ENOMEM; - data->dir.files = files; - data->dir.nr = nr; + data->dir.version = PERF_DIR_VERSION; + data->dir.files = files; + data->dir.nr = nr; for (i = 0; i < nr; i++) { struct perf_data_file *file = &files[i]; @@ -75,6 +77,10 @@ int perf_data__open_dir(struct perf_data *data) if (WARN_ON(!data->is_dir)) return -EINVAL; + /* The version is provided by DIR_FORMAT feature. 
*/ + if (WARN_ON(data->dir.version != PERF_DIR_VERSION)) + return -1; + dir = opendir(data->path); if (!dir) return -EINVAL; diff --git a/tools/perf/util/data.h b/tools/perf/util/data.h index d342469bdfda..6aef8746469f 100644 --- a/tools/perf/util/data.h +++ b/tools/perf/util/data.h @@ -24,6 +24,7 @@ struct perf_data { enum perf_data_mode mode; struct { + u64 version; struct perf_data_file *files; int nr; } dir; diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 01b324c275b9..b0683bf4d9f3 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -861,6 +861,21 @@ static int write_clockid(struct feat_fd *ff, sizeof(ff->ph->env.clockid_res_ns)); } +static int write_dir_format(struct feat_fd *ff, + struct perf_evlist *evlist __maybe_unused) +{ + struct perf_session *session; + struct perf_data *data; + + session = container_of(ff->ph, struct perf_session, header); + data = session->data; + + if (WARN_ON(!perf_data__is_dir(data))) + return -1; + + return do_write(ff, &data->dir.version, sizeof(data->dir.version)); +} + static int cpu_cache_level__sort(const void *a, const void *b) { struct cpu_cache_level *cache_a = (struct cpu_cache_level *)a; @@ -1341,6 +1356,17 @@ static void print_clockid(struct feat_fd *ff, FILE *fp) ff->ph->env.clockid_res_ns * 1000); } +static void print_dir_format(struct feat_fd *ff, FILE *fp) +{ + struct perf_session *session; + struct perf_data *data; + + session = container_of(ff->ph, struct perf_session, header); + data = session->data; + + fprintf(fp, "# directory data version : %"PRIu64"\n", data->dir.version); +} + static void free_event_desc(struct perf_evsel *events) { struct perf_evsel *evsel; @@ -2373,6 +2399,21 @@ static int process_clockid(struct feat_fd *ff, return 0; } +static int process_dir_format(struct feat_fd *ff, + void *_data __maybe_unused) +{ + struct perf_session *session; + struct perf_data *data; + + session = container_of(ff->ph, struct perf_session, header); + data = session->data; + + if (WARN_ON(!perf_data__is_dir(data))) + return -1; + + return do_read_u64(ff, &data->dir.version); +} + struct feature_ops { int (*write)(struct feat_fd *ff, struct perf_evlist *evlist); void (*print)(struct feat_fd *ff, FILE *fp); @@ -2432,7 +2473,8 @@ static const struct feature_ops feat_ops[HEADER_LAST_FEATURE] = { FEAT_OPN(CACHE, cache, true), FEAT_OPR(SAMPLE_TIME, sample_time, false), FEAT_OPR(MEM_TOPOLOGY, mem_topology, true), - FEAT_OPR(CLOCKID, clockid, false) + FEAT_OPR(CLOCKID, clockid, false), + FEAT_OPN(DIR_FORMAT, dir_format, false) }; struct header_print_data { diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 0d553ddca0a3..6a231340238d 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -39,6 +39,7 @@ enum { HEADER_SAMPLE_TIME, HEADER_MEM_TOPOLOGY, HEADER_CLOCKID, + HEADER_DIR_FORMAT, HEADER_LAST_FEATURE, HEADER_FEAT_BITS = 256, }; @@ -48,6 +49,10 @@ enum perf_header_version { PERF_HEADER_VERSION_2, }; +enum perf_dir_version { + PERF_DIR_VERSION = 1, +}; + struct perf_file_section { u64 offset; u64 size; From e51f806198306a8ad7ae6e34d1af0716ef73da80 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 Mar 2019 14:47:40 +0100 Subject: [PATCH 375/855] perf session: Add process callback to reader object Adding callback function to reader object so callers can process data in different ways. 
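For illustration, a caller can now plug its own routine into the reader object; process_custom below is a made-up name, while 'struct reader', the callback signature and the forwarding to perf_session__process_event() are taken from this patch:

  /* hypothetical callback, e.g. for a future directory-data reader */
  static s64 process_custom(struct perf_session *session,
                            union perf_event *event,
                            u64 file_offset)
  {
          /* default behaviour: hand the event to the normal pipeline */
          return perf_session__process_event(session, event, file_offset);
  }

  struct reader rd = {
          .fd          = perf_data__fd(session->data),
          .data_size   = session->header.data_size,
          .data_offset = session->header.data_offset,
          .process     = process_custom,
  };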
Signed-off-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexey Budankov Cc: Andi Kleen Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/20190308134745.5057-7-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/session.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index de777bdc0ed3..0ec34227bd60 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1847,10 +1847,17 @@ fetch_mmaped_event(struct perf_session *session, #define NUM_MMAPS 128 #endif +struct reader; + +typedef s64 (*reader_cb_t)(struct perf_session *session, + union perf_event *event, + u64 file_offset); + struct reader { - int fd; - u64 data_size; - u64 data_offset; + int fd; + u64 data_size; + u64 data_offset; + reader_cb_t process; }; static int @@ -1921,7 +1928,7 @@ more: size = event->header.size; if (size < sizeof(struct perf_event_header) || - (skip = perf_session__process_event(session, event, file_pos)) < 0) { + (skip = rd->process(session, event, file_pos)) < 0) { pr_err("%#" PRIx64 " [%#x]: failed to process type: %d\n", file_offset + head, event->header.size, event->header.type); @@ -1947,12 +1954,20 @@ out: return err; } +static s64 process_simple(struct perf_session *session, + union perf_event *event, + u64 file_offset) +{ + return perf_session__process_event(session, event, file_offset); +} + static int __perf_session__process_events(struct perf_session *session) { struct reader rd = { .fd = perf_data__fd(session->data), .data_size = session->header.data_size, .data_offset = session->header.data_offset, + .process = process_simple, }; struct ordered_events *oe = &session->ordered_events; struct perf_tool *tool = session->tool; From b41fdc4a7bf9045e4871c5b15905ea732ffd044f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 11 Mar 2019 15:38:10 +0000 Subject: [PATCH 376/855] irqchip/gic: Drop support for secondary GIC in non-DT systems We do not have any in-tree platform with this pathological setup, and only a single system (Cavium's cns3xxx) isn't DT aware. Let's drop the secondary GIC support for now, until we remove the above horror altogether. 
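With the new prototype a non-DT board registers only the primary GIC and passes just the two register bases; a minimal sketch (the MYBOARD_* virtual base addresses are placeholders, the cns3xxx conversion below is the only real in-tree user):

  /* board code, non-DT: secondary GICs are no longer supported */
  void __init myboard_init_irq(void)
  {
          gic_init(IOMEM(MYBOARD_GIC_DIST_BASE_VIRT),
                   IOMEM(MYBOARD_GIC_CPU_BASE_VIRT));
  }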
Signed-off-by: Marc Zyngier --- arch/arm/mach-cns3xxx/core.c | 2 +- drivers/irqchip/irq-gic.c | 45 ++++++++++++--------------------- include/linux/irqchip/arm-gic.h | 3 +-- 3 files changed, 18 insertions(+), 32 deletions(-) diff --git a/arch/arm/mach-cns3xxx/core.c b/arch/arm/mach-cns3xxx/core.c index 7d5a44a06648..f676592d8402 100644 --- a/arch/arm/mach-cns3xxx/core.c +++ b/arch/arm/mach-cns3xxx/core.c @@ -90,7 +90,7 @@ void __init cns3xxx_map_io(void) /* used by entry-macro.S */ void __init cns3xxx_init_irq(void) { - gic_init(0, 29, IOMEM(CNS3XXX_TC11MP_GIC_DIST_BASE_VIRT), + gic_init(IOMEM(CNS3XXX_TC11MP_GIC_DIST_BASE_VIRT), IOMEM(CNS3XXX_TC11MP_GIC_CPU_BASE_VIRT)); } diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index ba2a37a27a54..fd3110c171ba 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -1089,11 +1089,10 @@ static void gic_init_chip(struct gic_chip_data *gic, struct device *dev, #endif } -static int gic_init_bases(struct gic_chip_data *gic, int irq_start, +static int gic_init_bases(struct gic_chip_data *gic, struct fwnode_handle *handle) { - irq_hw_number_t hwirq_base; - int gic_irqs, irq_base, ret; + int gic_irqs, ret; if (IS_ENABLED(CONFIG_GIC_NON_BANKED) && gic->percpu_offset) { /* Frankein-GIC without banked registers... */ @@ -1145,28 +1144,21 @@ static int gic_init_bases(struct gic_chip_data *gic, int irq_start, } else { /* Legacy support */ /* * For primary GICs, skip over SGIs. - * For secondary GICs, skip over PPIs, too. + * No secondary GIC support whatsoever. */ - if (gic == &gic_data[0] && (irq_start & 31) > 0) { - hwirq_base = 16; - if (irq_start != -1) - irq_start = (irq_start & ~31) + 16; - } else { - hwirq_base = 32; - } + int irq_base; - gic_irqs -= hwirq_base; /* calculate # of irqs to allocate */ + gic_irqs -= 16; /* calculate # of irqs to allocate */ - irq_base = irq_alloc_descs(irq_start, 16, gic_irqs, + irq_base = irq_alloc_descs(16, 16, gic_irqs, numa_node_id()); if (irq_base < 0) { - WARN(1, "Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", - irq_start); - irq_base = irq_start; + WARN(1, "Cannot allocate irq_descs @ IRQ16, assuming pre-allocated\n"); + irq_base = 16; } gic->domain = irq_domain_add_legacy(NULL, gic_irqs, irq_base, - hwirq_base, &gic_irq_domain_ops, gic); + 16, &gic_irq_domain_ops, gic); } if (WARN_ON(!gic->domain)) { @@ -1195,7 +1187,6 @@ error: } static int __init __gic_init_bases(struct gic_chip_data *gic, - int irq_start, struct fwnode_handle *handle) { char *name; @@ -1231,32 +1222,28 @@ static int __init __gic_init_bases(struct gic_chip_data *gic, gic_init_chip(gic, NULL, name, false); } - ret = gic_init_bases(gic, irq_start, handle); + ret = gic_init_bases(gic, handle); if (ret) kfree(name); return ret; } -void __init gic_init(unsigned int gic_nr, int irq_start, - void __iomem *dist_base, void __iomem *cpu_base) +void __init gic_init(void __iomem *dist_base, void __iomem *cpu_base) { struct gic_chip_data *gic; - if (WARN_ON(gic_nr >= CONFIG_ARM_GIC_MAX_NR)) - return; - /* * Non-DT/ACPI systems won't run a hypervisor, so let's not * bother with these... 
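Roughly, the browser now just assembles a 'perf script' command line with a pager appended and hands it to the shell via run_script(); the sketch below is illustrative only (the script and input names are examples, and the real code builds cmd with sprintf/strcat):

  /* e.g.: perf script -s export-to-postgresql.py -i perf.data 2>&1 | less */
  char cmd[SCRIPT_FULLPATH_LEN * 2];

  snprintf(cmd, sizeof(cmd), "perf script -s %s -i %s 2>&1 | less",
           script_name, input_name);
  run_script(cmd); /* resets the tty, runs system(cmd), restores the slang screen */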
*/ static_branch_disable(&supports_deactivate_key); - gic = &gic_data[gic_nr]; + gic = &gic_data[0]; gic->raw_dist_base = dist_base; gic->raw_cpu_base = cpu_base; - __gic_init_bases(gic, irq_start, NULL); + __gic_init_bases(gic, NULL); } static void gic_teardown(struct gic_chip_data *gic) @@ -1399,7 +1386,7 @@ int gic_of_init_child(struct device *dev, struct gic_chip_data **gic, int irq) if (ret) return ret; - ret = gic_init_bases(*gic, -1, &dev->of_node->fwnode); + ret = gic_init_bases(*gic, &dev->of_node->fwnode); if (ret) { gic_teardown(*gic); return ret; @@ -1459,7 +1446,7 @@ gic_of_init(struct device_node *node, struct device_node *parent) if (gic_cnt == 0 && !gic_check_eoimode(node, &gic->raw_cpu_base)) static_branch_disable(&supports_deactivate_key); - ret = __gic_init_bases(gic, -1, &node->fwnode); + ret = __gic_init_bases(gic, &node->fwnode); if (ret) { gic_teardown(gic); return ret; @@ -1650,7 +1637,7 @@ static int __init gic_v2_acpi_init(struct acpi_subtable_header *header, return -ENOMEM; } - ret = __gic_init_bases(gic, -1, domain_handle); + ret = __gic_init_bases(gic, domain_handle); if (ret) { pr_err("Failed to initialise GIC\n"); irq_domain_free_fwnode(domain_handle); diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h index 626179077bb0..0f049b384ccd 100644 --- a/include/linux/irqchip/arm-gic.h +++ b/include/linux/irqchip/arm-gic.h @@ -158,8 +158,7 @@ int gic_of_init_child(struct device *dev, struct gic_chip_data **gic, int irq); * Legacy platforms not converted to DT yet must use this to init * their GIC */ -void gic_init(unsigned int nr, int start, - void __iomem *dist , void __iomem *cpu); +void gic_init(void __iomem *dist , void __iomem *cpu); int gicv2m_init(struct fwnode_handle *parent_handle, struct irq_domain *parent); From 75065a85a9705ad4c0135f07fd4467d46ff342a3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 8 Mar 2019 21:56:20 -0800 Subject: [PATCH 377/855] perf report: Use less for scripts output The UI viewer for scripts output has a lot of limitations: limited size, no search or save function, slow, and various other issues. Just use 'less' to display directly on the terminal instead. This won't work in GTK mode, but GTK doesn't support these context menus anyways. If that is ever done could use an terminal for the output. 
Signed-off-by: Andi Kleen Acked-by: Feng Tang Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20190309055628.21617-8-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/scripts.c | 130 ++++--------------------------- 1 file changed, 17 insertions(+), 113 deletions(-) diff --git a/tools/perf/ui/browsers/scripts.c b/tools/perf/ui/browsers/scripts.c index 90a32ac69e76..7f36630694bf 100644 --- a/tools/perf/ui/browsers/scripts.c +++ b/tools/perf/ui/browsers/scripts.c @@ -1,35 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include #include "../../util/sort.h" #include "../../util/util.h" #include "../../util/hist.h" #include "../../util/debug.h" #include "../../util/symbol.h" #include "../browser.h" -#include "../helpline.h" #include "../libslang.h" -/* 2048 lines should be enough for a script output */ -#define MAX_LINES 2048 - -/* 160 bytes for one output line */ -#define AVERAGE_LINE_LEN 160 - -struct script_line { - struct list_head node; - char line[AVERAGE_LINE_LEN]; -}; - -struct perf_script_browser { - struct ui_browser b; - struct list_head entries; - const char *script_name; - int nr_lines; -}; - #define SCRIPT_NAMELEN 128 #define SCRIPT_MAX_NO 64 /* @@ -73,69 +50,29 @@ static int list_scripts(char *script_name) return ret; } -static void script_browser__write(struct ui_browser *browser, - void *entry, int row) +static void run_script(char *cmd) { - struct script_line *sline = list_entry(entry, struct script_line, node); - bool current_entry = ui_browser__is_current_entry(browser, row); - - ui_browser__set_color(browser, current_entry ? HE_COLORSET_SELECTED : - HE_COLORSET_NORMAL); - - ui_browser__write_nstring(browser, sline->line, browser->width); + pr_debug("Running %s\n", cmd); + SLang_reset_tty(); + if (system(cmd) < 0) + pr_warning("Cannot run %s\n", cmd); + /* + * SLang doesn't seem to reset the whole terminal, so be more + * forceful to get back to the original state. + */ + printf("\033[c\033[H\033[J"); + fflush(stdout); + SLang_init_tty(0, 0, 0); + SLsmg_refresh(); } -static int script_browser__run(struct perf_script_browser *browser) -{ - int key; - - if (ui_browser__show(&browser->b, browser->script_name, - "Press ESC to exit") < 0) - return -1; - - while (1) { - key = ui_browser__run(&browser->b, 0); - - /* We can add some special key handling here if needed */ - break; - } - - ui_browser__hide(&browser->b); - return key; -} - - int script_browse(const char *script_opt) { char cmd[SCRIPT_FULLPATH_LEN*2], script_name[SCRIPT_FULLPATH_LEN]; - char *line = NULL; - size_t len = 0; - ssize_t retlen; - int ret = -1, nr_entries = 0; - FILE *fp; - void *buf; - struct script_line *sline; - - struct perf_script_browser script = { - .b = { - .refresh = ui_browser__list_head_refresh, - .seek = ui_browser__list_head_seek, - .write = script_browser__write, - }, - .script_name = script_name, - }; - - INIT_LIST_HEAD(&script.entries); - - /* Save each line of the output in one struct script_line object. 
*/ - buf = zalloc((sizeof(*sline)) * MAX_LINES); - if (!buf) - return -1; - sline = buf; memset(script_name, 0, SCRIPT_FULLPATH_LEN); if (list_scripts(script_name)) - goto exit; + return -1; sprintf(cmd, "perf script -s %s ", script_name); @@ -147,42 +84,9 @@ int script_browse(const char *script_opt) strcat(cmd, input_name); } - strcat(cmd, " 2>&1"); + strcat(cmd, " 2>&1 | less"); - fp = popen(cmd, "r"); - if (!fp) - goto exit; + run_script(cmd); - while ((retlen = getline(&line, &len, fp)) != -1) { - strncpy(sline->line, line, AVERAGE_LINE_LEN); - - /* If one output line is very large, just cut it short */ - if (retlen >= AVERAGE_LINE_LEN) { - sline->line[AVERAGE_LINE_LEN - 1] = '\0'; - sline->line[AVERAGE_LINE_LEN - 2] = '\n'; - } - list_add_tail(&sline->node, &script.entries); - - if (script.b.width < retlen) - script.b.width = retlen; - - if (nr_entries++ >= MAX_LINES - 1) - break; - sline++; - } - - if (script.b.width > AVERAGE_LINE_LEN) - script.b.width = AVERAGE_LINE_LEN; - - free(line); - pclose(fp); - - script.nr_lines = nr_entries; - script.b.nr_entries = nr_entries; - script.b.entries = &script.entries; - - ret = script_browser__run(&script); -exit: - free(buf); - return ret; + return 0; } From 5f5f67da9781770df0403269bc57d7aae608fecd Mon Sep 17 00:00:00 2001 From: Yifeng Li Date: Tue, 5 Mar 2019 06:00:22 +0800 Subject: [PATCH 378/855] mips: loongson64: lemote-2f: Add IRQF_NO_SUSPEND to "cascade" irqaction. Timekeeping IRQs from CS5536 MFGPT are routed to i8259, which then triggers the "cascade" IRQ on MIPS CPU. Without IRQF_NO_SUSPEND in cascade_irqaction, MFGPT interrupts will be masked in suspend mode, and the machine would be unable to resume once suspended. Previously, MIPS IRQs were not disabled properly, so the original code appeared to work. Commit a3e6c1eff5 ("MIPS: IRQ: Fix disable_irq on CPU IRQs") uncovers the bug. To fix it, add IRQF_NO_SUSPEND to cascade_irqaction. This commit is functionally identical to 0add9c2f1cff ("MIPS: Loongson-3: Add IRQF_NO_SUSPEND to Cascade irqaction"), but it forgot to apply the same fix to Loongson2. Signed-off-by: Yifeng Li Signed-off-by: Paul Burton Cc: linux-mips@vger.kernel.org Cc: Jiaxun Yang Cc: Huacai Chen Cc: Ralf Baechle Cc: James Hogan Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org # v3.19+ --- arch/mips/loongson64/lemote-2f/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/loongson64/lemote-2f/irq.c b/arch/mips/loongson64/lemote-2f/irq.c index 9e33e45aa17c..b213cecb8e3a 100644 --- a/arch/mips/loongson64/lemote-2f/irq.c +++ b/arch/mips/loongson64/lemote-2f/irq.c @@ -103,7 +103,7 @@ static struct irqaction ip6_irqaction = { static struct irqaction cascade_irqaction = { .handler = no_action, .name = "cascade", - .flags = IRQF_NO_THREAD, + .flags = IRQF_NO_THREAD | IRQF_NO_SUSPEND, }; void __init mach_init_irq(void) From 3f0a53bc6482fb09770982a8447981260ea258dc Mon Sep 17 00:00:00 2001 From: Yasha Cherikovsky Date: Fri, 8 Mar 2019 14:58:51 +0200 Subject: [PATCH 379/855] MIPS: Ensure ELF appended dtb is relocated This fixes booting with the combination of CONFIG_RELOCATABLE=y and CONFIG_MIPS_ELF_APPENDED_DTB=y. Sections that appear after the relocation table are not relocated on system boot (except .bss, which has special handling). With CONFIG_MIPS_ELF_APPENDED_DTB, the dtb is part of the vmlinux ELF, so it must be relocated together with everything else. 
Fixes: 069fd766271d ("MIPS: Reserve space for relocation table") Signed-off-by: Yasha Cherikovsky Signed-off-by: Paul Burton Cc: Ralf Baechle Cc: Paul Burton Cc: James Hogan Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org # v4.7+ --- arch/mips/kernel/vmlinux.lds.S | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S index cb7e9ed7a453..33ee0d18fb0a 100644 --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -140,6 +140,13 @@ SECTIONS PERCPU_SECTION(1 << CONFIG_MIPS_L1_CACHE_SHIFT) #endif +#ifdef CONFIG_MIPS_ELF_APPENDED_DTB + .appended_dtb : AT(ADDR(.appended_dtb) - LOAD_OFFSET) { + *(.appended_dtb) + KEEP(*(.appended_dtb)) + } +#endif + #ifdef CONFIG_RELOCATABLE . = ALIGN(4); @@ -164,11 +171,6 @@ SECTIONS __appended_dtb = .; /* leave space for appended DTB */ . += 0x100000; -#elif defined(CONFIG_MIPS_ELF_APPENDED_DTB) - .appended_dtb : AT(ADDR(.appended_dtb) - LOAD_OFFSET) { - *(.appended_dtb) - KEEP(*(.appended_dtb)) - } #endif /* * Align to 64K in attempt to eliminate holes before the From 47c25036b60f27b86ab44b66a8861bcf81cde39b Mon Sep 17 00:00:00 2001 From: Archer Yan Date: Fri, 8 Mar 2019 03:29:19 +0000 Subject: [PATCH 380/855] MIPS: Fix kernel crash for R6 in jump label branch function Insert a branch instruction instead of a NOP to make sure the assembler doesn't patch code into the forbidden slot. In the jump label function, it might be possible to patch Control Transfer Instructions (CTIs) into the forbidden slot, which will generate a Reserved Instruction exception on MIPS release 6. Signed-off-by: Archer Yan Reviewed-by: Paul Burton [paul.burton@mips.com: - Add MIPS prefix to subject. - Mark for stable from v4.0, which introduced r6 support, onwards.] Signed-off-by: Paul Burton Cc: linux-mips@vger.kernel.org Cc: stable@vger.kernel.org # v4.0+ --- arch/mips/include/asm/jump_label.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/mips/include/asm/jump_label.h b/arch/mips/include/asm/jump_label.h index e77672539e8e..e4456e450f94 100644 --- a/arch/mips/include/asm/jump_label.h +++ b/arch/mips/include/asm/jump_label.h @@ -21,15 +21,15 @@ #endif #ifdef CONFIG_CPU_MICROMIPS -#define NOP_INSN "nop32" +#define B_INSN "b32" #else -#define NOP_INSN "nop" +#define B_INSN "b" #endif static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto("1:\t" NOP_INSN "\n\t" - "nop\n\t" + asm_volatile_goto("1:\t" B_INSN " 2f\n\t" + "2:\tnop\n\t" ".pushsection __jump_table, \"aw\"\n\t" WORD_INSN " 1b, %l[l_yes], %0\n\t" ".popsection\n\t" From beda0e725e5f06aca27eda2434ea9447dad88e36 Mon Sep 17 00:00:00 2001 From: Tony Jones Date: Fri, 8 Mar 2019 16:05:15 -0800 Subject: [PATCH 381/855] perf script python: Add Python3 support to exported-sql-viewer.py Support both Python2 and Python3 in the exported-sql-viewer.py script.
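As an illustrative sketch only (condensed from the diff below, not an additional change), the compatibility approach is to alias the renamed or removed builtins once at import time so the rest of the script runs unchanged on either interpreter:

    from __future__ import print_function
    import sys

    try:
        # Python2: prefer the faster C pickle; a pickled size integer fits in 8 bytes
        import cPickle as pickle
        glb_nsz = 8
    except ImportError:
        # Python3: cPickle is gone and pickled size integers need a larger slot
        import pickle
        glb_nsz = 16

    try:
        xrange
    except NameError:
        xrange = range   # Python3 dropped xrange; range behaves the same there

    def printerr(*args, **keyword_args):
        # portable replacement for the Python2-only "print >> sys.stderr" statement
        print(*args, file=sys.stderr, **keyword_args)

The same printerr() helper recurs in the follow-on patches that convert the other exporter scripts.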
The use of 'from __future__' implies the minimum supported Python2 version is now v2.6 Signed-off-by: Tony Jones Acked-by: Adrian Hunter Link: http://lkml.kernel.org/r/20190309000518.2438-2-tonyj@suse.de Signed-off-by: Seeteena Thoufeek Signed-off-by: Arnaldo Carvalho de Melo --- .../scripts/python/exported-sql-viewer.py | 42 ++++++++++++------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/tools/perf/scripts/python/exported-sql-viewer.py b/tools/perf/scripts/python/exported-sql-viewer.py index afec9479ca7f..e38518cdcbc3 100755 --- a/tools/perf/scripts/python/exported-sql-viewer.py +++ b/tools/perf/scripts/python/exported-sql-viewer.py @@ -88,11 +88,20 @@ # 7fab593ea956 48 89 15 3b 13 22 00 movq %rdx, 0x22133b(%rip) # 8107675243232 2 ls 22011 22011 hardware interrupt No 7fab593ea956 _dl_start+0x26 (ld-2.19.so) -> ffffffff86a012e0 page_fault ([kernel]) +from __future__ import print_function + import sys import weakref import threading import string -import cPickle +try: + # Python2 + import cPickle as pickle + # size of pickled integer big enough for record size + glb_nsz = 8 +except ImportError: + import pickle + glb_nsz = 16 import re import os from PySide.QtCore import * @@ -102,6 +111,15 @@ from decimal import * from ctypes import * from multiprocessing import Process, Array, Value, Event +# xrange is range in Python3 +try: + xrange +except NameError: + xrange = range + +def printerr(*args, **keyword_args): + print(*args, file=sys.stderr, **keyword_args) + # Data formatting helpers def tohex(ip): @@ -1004,10 +1022,6 @@ class ChildDataItemFinder(): glb_chunk_sz = 10000 -# size of pickled integer big enough for record size - -glb_nsz = 8 - # Background process for SQL data fetcher class SQLFetcherProcess(): @@ -1066,7 +1080,7 @@ class SQLFetcherProcess(): return True if space >= glb_nsz: # Use 0 (or space < glb_nsz) to mean there is no more at the top of the buffer - nd = cPickle.dumps(0, cPickle.HIGHEST_PROTOCOL) + nd = pickle.dumps(0, pickle.HIGHEST_PROTOCOL) self.buffer[self.local_head : self.local_head + len(nd)] = nd self.local_head = 0 if self.local_tail - self.local_head > sz: @@ -1084,9 +1098,9 @@ class SQLFetcherProcess(): self.wait_event.wait() def AddToBuffer(self, obj): - d = cPickle.dumps(obj, cPickle.HIGHEST_PROTOCOL) + d = pickle.dumps(obj, pickle.HIGHEST_PROTOCOL) n = len(d) - nd = cPickle.dumps(n, cPickle.HIGHEST_PROTOCOL) + nd = pickle.dumps(n, pickle.HIGHEST_PROTOCOL) sz = n + glb_nsz self.WaitForSpace(sz) pos = self.local_head @@ -1198,12 +1212,12 @@ class SQLFetcher(QObject): pos = self.local_tail if len(self.buffer) - pos < glb_nsz: pos = 0 - n = cPickle.loads(self.buffer[pos : pos + glb_nsz]) + n = pickle.loads(self.buffer[pos : pos + glb_nsz]) if n == 0: pos = 0 - n = cPickle.loads(self.buffer[0 : glb_nsz]) + n = pickle.loads(self.buffer[0 : glb_nsz]) pos += glb_nsz - obj = cPickle.loads(self.buffer[pos : pos + n]) + obj = pickle.loads(self.buffer[pos : pos + n]) self.local_tail = pos + n return obj @@ -2973,7 +2987,7 @@ class DBRef(): def Main(): if (len(sys.argv) < 2): - print >> sys.stderr, "Usage is: exported-sql-viewer.py { | --help-only}" + printerr("Usage is: exported-sql-viewer.py { | --help-only}"); raise Exception("Too few arguments") dbname = sys.argv[1] @@ -2986,8 +3000,8 @@ def Main(): is_sqlite3 = False try: - f = open(dbname) - if f.read(15) == "SQLite format 3": + f = open(dbname, "rb") + if f.read(15) == b'SQLite format 3': is_sqlite3 = True f.close() except: From 1937b0560c3ea43b1b0f7d3617949ca50de8f8c0 Mon Sep 17 00:00:00 2001 
From: Tony Jones Date: Fri, 8 Mar 2019 16:05:16 -0800 Subject: [PATCH 382/855] perf script python: Add Python3 support to export-to-postgresql.py Support both Python2 and Python3 in the export-to-postgresql.py script. The use of 'from __future__' implies the minimum supported Python2 version is now v2.6 Signed-off-by: Tony Jones Link: http://lkml.kernel.org/r/20190309000518.2438-3-tonyj@suse.de Signed-off-by: Adrian Hunter Signed-off-by: Seeteena Thoufeek Signed-off-by: Arnaldo Carvalho de Melo --- .../scripts/python/export-to-postgresql.py | 58 +++++++++++++------ 1 file changed, 41 insertions(+), 17 deletions(-) diff --git a/tools/perf/scripts/python/export-to-postgresql.py b/tools/perf/scripts/python/export-to-postgresql.py index 390a351d15ea..00ab972a2eba 100644 --- a/tools/perf/scripts/python/export-to-postgresql.py +++ b/tools/perf/scripts/python/export-to-postgresql.py @@ -10,6 +10,8 @@ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for # more details. +from __future__ import print_function + import os import sys import struct @@ -199,6 +201,18 @@ import datetime from PySide.QtSql import * +if sys.version_info < (3, 0): + def toserverstr(str): + return str + def toclientstr(str): + return str +else: + # Assume UTF-8 server_encoding and client_encoding + def toserverstr(str): + return bytes(str, "UTF_8") + def toclientstr(str): + return bytes(str, "UTF_8") + # Need to access PostgreSQL C library directly to use COPY FROM STDIN from ctypes import * libpq = CDLL("libpq.so.5") @@ -234,12 +248,14 @@ perf_db_export_mode = True perf_db_export_calls = False perf_db_export_callchains = False +def printerr(*args, **kw_args): + print(*args, file=sys.stderr, **kw_args) def usage(): - print >> sys.stderr, "Usage is: export-to-postgresql.py [] [] []" - print >> sys.stderr, "where: columns 'all' or 'branches'" - print >> sys.stderr, " calls 'calls' => create calls and call_paths table" - print >> sys.stderr, " callchains 'callchains' => create call_paths table" + printerr("Usage is: export-to-postgresql.py [] [] []") + printerr("where: columns 'all' or 'branches'") + printerr(" calls 'calls' => create calls and call_paths table") + printerr(" callchains 'callchains' => create call_paths table") raise Exception("Too few arguments") if (len(sys.argv) < 2): @@ -273,7 +289,7 @@ def do_query(q, s): return raise Exception("Query failed: " + q.lastError().text()) -print datetime.datetime.today(), "Creating database..." 
+print(datetime.datetime.today(), "Creating database...") db = QSqlDatabase.addDatabase('QPSQL') query = QSqlQuery(db) @@ -506,12 +522,12 @@ do_query(query, 'CREATE VIEW samples_view AS ' ' FROM samples') -file_header = struct.pack("!11sii", "PGCOPY\n\377\r\n\0", 0, 0) -file_trailer = "\377\377" +file_header = struct.pack("!11sii", b"PGCOPY\n\377\r\n\0", 0, 0) +file_trailer = b"\377\377" def open_output_file(file_name): path_name = output_dir_name + "/" + file_name - file = open(path_name, "w+") + file = open(path_name, "wb+") file.write(file_header) return file @@ -526,13 +542,13 @@ def copy_output_file_direct(file, table_name): # Use COPY FROM STDIN because security may prevent postgres from accessing the files directly def copy_output_file(file, table_name): - conn = PQconnectdb("dbname = " + dbname) + conn = PQconnectdb(toclientstr("dbname = " + dbname)) if (PQstatus(conn)): raise Exception("COPY FROM STDIN PQconnectdb failed") file.write(file_trailer) file.seek(0) sql = "COPY " + table_name + " FROM STDIN (FORMAT 'binary')" - res = PQexec(conn, sql) + res = PQexec(conn, toclientstr(sql)) if (PQresultStatus(res) != 4): raise Exception("COPY FROM STDIN PQexec failed") data = file.read(65536) @@ -566,7 +582,7 @@ if perf_db_export_calls: call_file = open_output_file("call_table.bin") def trace_begin(): - print datetime.datetime.today(), "Writing to intermediate files..." + print(datetime.datetime.today(), "Writing to intermediate files...") # id == 0 means unknown. It is easier to create records for them than replace the zeroes with NULLs evsel_table(0, "unknown") machine_table(0, 0, "unknown") @@ -582,7 +598,7 @@ def trace_begin(): unhandled_count = 0 def trace_end(): - print datetime.datetime.today(), "Copying to database..." + print(datetime.datetime.today(), "Copying to database...") copy_output_file(evsel_file, "selected_events") copy_output_file(machine_file, "machines") copy_output_file(thread_file, "threads") @@ -597,7 +613,7 @@ def trace_end(): if perf_db_export_calls: copy_output_file(call_file, "calls") - print datetime.datetime.today(), "Removing intermediate files..." 
+ print(datetime.datetime.today(), "Removing intermediate files...") remove_output_file(evsel_file) remove_output_file(machine_file) remove_output_file(thread_file) @@ -612,7 +628,7 @@ def trace_end(): if perf_db_export_calls: remove_output_file(call_file) os.rmdir(output_dir_name) - print datetime.datetime.today(), "Adding primary keys" + print(datetime.datetime.today(), "Adding primary keys") do_query(query, 'ALTER TABLE selected_events ADD PRIMARY KEY (id)') do_query(query, 'ALTER TABLE machines ADD PRIMARY KEY (id)') do_query(query, 'ALTER TABLE threads ADD PRIMARY KEY (id)') @@ -627,7 +643,7 @@ def trace_end(): if perf_db_export_calls: do_query(query, 'ALTER TABLE calls ADD PRIMARY KEY (id)') - print datetime.datetime.today(), "Adding foreign keys" + print(datetime.datetime.today(), "Adding foreign keys") do_query(query, 'ALTER TABLE threads ' 'ADD CONSTRAINT machinefk FOREIGN KEY (machine_id) REFERENCES machines (id),' 'ADD CONSTRAINT processfk FOREIGN KEY (process_id) REFERENCES threads (id)') @@ -663,8 +679,8 @@ def trace_end(): do_query(query, 'CREATE INDEX pid_idx ON calls (parent_id)') if (unhandled_count): - print datetime.datetime.today(), "Warning: ", unhandled_count, " unhandled events" - print datetime.datetime.today(), "Done" + print(datetime.datetime.today(), "Warning: ", unhandled_count, " unhandled events") + print(datetime.datetime.today(), "Done") def trace_unhandled(event_name, context, event_fields_dict): global unhandled_count @@ -674,12 +690,14 @@ def sched__sched_switch(*x): pass def evsel_table(evsel_id, evsel_name, *x): + evsel_name = toserverstr(evsel_name) n = len(evsel_name) fmt = "!hiqi" + str(n) + "s" value = struct.pack(fmt, 2, 8, evsel_id, n, evsel_name) evsel_file.write(value) def machine_table(machine_id, pid, root_dir, *x): + root_dir = toserverstr(root_dir) n = len(root_dir) fmt = "!hiqiii" + str(n) + "s" value = struct.pack(fmt, 3, 8, machine_id, 4, pid, n, root_dir) @@ -690,6 +708,7 @@ def thread_table(thread_id, machine_id, process_id, pid, tid, *x): thread_file.write(value) def comm_table(comm_id, comm_str, *x): + comm_str = toserverstr(comm_str) n = len(comm_str) fmt = "!hiqi" + str(n) + "s" value = struct.pack(fmt, 2, 8, comm_id, n, comm_str) @@ -701,6 +720,9 @@ def comm_thread_table(comm_thread_id, comm_id, thread_id, *x): comm_thread_file.write(value) def dso_table(dso_id, machine_id, short_name, long_name, build_id, *x): + short_name = toserverstr(short_name) + long_name = toserverstr(long_name) + build_id = toserverstr(build_id) n1 = len(short_name) n2 = len(long_name) n3 = len(build_id) @@ -709,12 +731,14 @@ def dso_table(dso_id, machine_id, short_name, long_name, build_id, *x): dso_file.write(value) def symbol_table(symbol_id, dso_id, sym_start, sym_end, binding, symbol_name, *x): + symbol_name = toserverstr(symbol_name) n = len(symbol_name) fmt = "!hiqiqiqiqiii" + str(n) + "s" value = struct.pack(fmt, 6, 8, symbol_id, 8, dso_id, 8, sym_start, 8, sym_end, 4, binding, n, symbol_name) symbol_file.write(value) def branch_type_table(branch_type, name, *x): + name = toserverstr(name) n = len(name) fmt = "!hiii" + str(n) + "s" value = struct.pack(fmt, 2, 4, branch_type, n, name) From ebf6c5c181abe9309788c6241d39602a1ce18723 Mon Sep 17 00:00:00 2001 From: Tony Jones Date: Fri, 8 Mar 2019 16:05:17 -0800 Subject: [PATCH 383/855] perf script python: Add Python3 support to export-to-sqlite.py Support both Python2 and Python3 in the export-to-sqlite.py script The use of 'from __future__' implies the minimum supported Python2 version is now v2.6 
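For illustration, a minimal sketch of what the conversion amounts to (the exact hunks are in the diff below): the Python2-only print statements become portable function calls.

    from __future__ import print_function

    import datetime
    import sys

    # Python2-only statement forms that are syntax errors under Python3:
    #     print datetime.datetime.today(), "Creating database..."
    #     print >> sys.stderr, "Too few arguments"

    # Portable function-call form accepted by both interpreters:
    print(datetime.datetime.today(), "Creating database ...")
    print("Too few arguments", file=sys.stderr)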
Signed-off-by: Tony Jones Acked-by: Adrian Hunter Link: http://lkml.kernel.org/r/20190309000518.2438-4-tonyj@suse.de Signed-off-by: Seeteena Thoufeek Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/scripts/python/export-to-sqlite.py | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/tools/perf/scripts/python/export-to-sqlite.py b/tools/perf/scripts/python/export-to-sqlite.py index eb63e6c7107f..3da338243aed 100644 --- a/tools/perf/scripts/python/export-to-sqlite.py +++ b/tools/perf/scripts/python/export-to-sqlite.py @@ -10,6 +10,8 @@ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for # more details. +from __future__ import print_function + import os import sys import struct @@ -60,11 +62,14 @@ perf_db_export_mode = True perf_db_export_calls = False perf_db_export_callchains = False +def printerr(*args, **keyword_args): + print(*args, file=sys.stderr, **keyword_args) + def usage(): - print >> sys.stderr, "Usage is: export-to-sqlite.py [] [] []" - print >> sys.stderr, "where: columns 'all' or 'branches'" - print >> sys.stderr, " calls 'calls' => create calls and call_paths table" - print >> sys.stderr, " callchains 'callchains' => create call_paths table" + printerr("Usage is: export-to-sqlite.py [] [] []"); + printerr("where: columns 'all' or 'branches'"); + printerr(" calls 'calls' => create calls and call_paths table"); + printerr(" callchains 'callchains' => create call_paths table"); raise Exception("Too few arguments") if (len(sys.argv) < 2): @@ -100,7 +105,7 @@ def do_query_(q): return raise Exception("Query failed: " + q.lastError().text()) -print datetime.datetime.today(), "Creating database..." +print(datetime.datetime.today(), "Creating database ...") db_exists = False try: @@ -378,7 +383,7 @@ if perf_db_export_calls: call_query.prepare("INSERT INTO calls VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)") def trace_begin(): - print datetime.datetime.today(), "Writing records..." + print(datetime.datetime.today(), "Writing records...") do_query(query, 'BEGIN TRANSACTION') # id == 0 means unknown. It is easier to create records for them than replace the zeroes with NULLs evsel_table(0, "unknown") @@ -397,14 +402,14 @@ unhandled_count = 0 def trace_end(): do_query(query, 'END TRANSACTION') - print datetime.datetime.today(), "Adding indexes" + print(datetime.datetime.today(), "Adding indexes") if perf_db_export_calls: do_query(query, 'CREATE INDEX pcpid_idx ON calls (parent_call_path_id)') do_query(query, 'CREATE INDEX pid_idx ON calls (parent_id)') if (unhandled_count): - print datetime.datetime.today(), "Warning: ", unhandled_count, " unhandled events" - print datetime.datetime.today(), "Done" + print(datetime.datetime.today(), "Warning: ", unhandled_count, " unhandled events") + print(datetime.datetime.today(), "Done") def trace_unhandled(event_name, context, event_fields_dict): global unhandled_count From 49f93bbf17e6267eb34e0c12a9813f3a8723749e Mon Sep 17 00:00:00 2001 From: Tony Jones Date: Fri, 8 Mar 2019 16:05:18 -0800 Subject: [PATCH 384/855] perf script python: Add printdate function to SQL exporters Introduce a printdate function to eliminate the repetitive use of datetime.datetime.today() in the SQL exporting scripts. 
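Roughly, the helper (shown in full in the diff below) folds the timestamp prefix into one place so call sites shrink to a single call:

    from __future__ import print_function

    import datetime

    def printdate(*args, **kw_args):
        # prefix each status line with the current timestamp
        print(datetime.datetime.today(), *args, sep=' ', **kw_args)

    # call sites shrink from:
    #     print(datetime.datetime.today(), "Adding indexes")
    # to:
    printdate("Adding indexes")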
Signed-off-by: Tony Jones Acked-by: Adrian Hunter Link: http://lkml.kernel.org/r/20190309000518.2438-5-tonyj@suse.de Signed-off-by: Arnaldo Carvalho de Melo --- .../scripts/python/export-to-postgresql.py | 19 +++++++++++-------- tools/perf/scripts/python/export-to-sqlite.py | 13 ++++++++----- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/tools/perf/scripts/python/export-to-postgresql.py b/tools/perf/scripts/python/export-to-postgresql.py index 00ab972a2eba..c3eae1d77d36 100644 --- a/tools/perf/scripts/python/export-to-postgresql.py +++ b/tools/perf/scripts/python/export-to-postgresql.py @@ -251,6 +251,9 @@ perf_db_export_callchains = False def printerr(*args, **kw_args): print(*args, file=sys.stderr, **kw_args) +def printdate(*args, **kw_args): + print(datetime.datetime.today(), *args, sep=' ', **kw_args) + def usage(): printerr("Usage is: export-to-postgresql.py [] [] []") printerr("where: columns 'all' or 'branches'") @@ -289,7 +292,7 @@ def do_query(q, s): return raise Exception("Query failed: " + q.lastError().text()) -print(datetime.datetime.today(), "Creating database...") +printdate("Creating database...") db = QSqlDatabase.addDatabase('QPSQL') query = QSqlQuery(db) @@ -582,7 +585,7 @@ if perf_db_export_calls: call_file = open_output_file("call_table.bin") def trace_begin(): - print(datetime.datetime.today(), "Writing to intermediate files...") + printdate("Writing to intermediate files...") # id == 0 means unknown. It is easier to create records for them than replace the zeroes with NULLs evsel_table(0, "unknown") machine_table(0, 0, "unknown") @@ -598,7 +601,7 @@ def trace_begin(): unhandled_count = 0 def trace_end(): - print(datetime.datetime.today(), "Copying to database...") + printdate("Copying to database...") copy_output_file(evsel_file, "selected_events") copy_output_file(machine_file, "machines") copy_output_file(thread_file, "threads") @@ -613,7 +616,7 @@ def trace_end(): if perf_db_export_calls: copy_output_file(call_file, "calls") - print(datetime.datetime.today(), "Removing intermediate files...") + printdate("Removing intermediate files...") remove_output_file(evsel_file) remove_output_file(machine_file) remove_output_file(thread_file) @@ -628,7 +631,7 @@ def trace_end(): if perf_db_export_calls: remove_output_file(call_file) os.rmdir(output_dir_name) - print(datetime.datetime.today(), "Adding primary keys") + printdate("Adding primary keys") do_query(query, 'ALTER TABLE selected_events ADD PRIMARY KEY (id)') do_query(query, 'ALTER TABLE machines ADD PRIMARY KEY (id)') do_query(query, 'ALTER TABLE threads ADD PRIMARY KEY (id)') @@ -643,7 +646,7 @@ def trace_end(): if perf_db_export_calls: do_query(query, 'ALTER TABLE calls ADD PRIMARY KEY (id)') - print(datetime.datetime.today(), "Adding foreign keys") + printdate("Adding foreign keys") do_query(query, 'ALTER TABLE threads ' 'ADD CONSTRAINT machinefk FOREIGN KEY (machine_id) REFERENCES machines (id),' 'ADD CONSTRAINT processfk FOREIGN KEY (process_id) REFERENCES threads (id)') @@ -679,8 +682,8 @@ def trace_end(): do_query(query, 'CREATE INDEX pid_idx ON calls (parent_id)') if (unhandled_count): - print(datetime.datetime.today(), "Warning: ", unhandled_count, " unhandled events") - print(datetime.datetime.today(), "Done") + printdate("Warning: ", unhandled_count, " unhandled events") + printdate("Done") def trace_unhandled(event_name, context, event_fields_dict): global unhandled_count diff --git a/tools/perf/scripts/python/export-to-sqlite.py b/tools/perf/scripts/python/export-to-sqlite.py index 
3da338243aed..3b71902a5a21 100644 --- a/tools/perf/scripts/python/export-to-sqlite.py +++ b/tools/perf/scripts/python/export-to-sqlite.py @@ -65,6 +65,9 @@ perf_db_export_callchains = False def printerr(*args, **keyword_args): print(*args, file=sys.stderr, **keyword_args) +def printdate(*args, **kw_args): + print(datetime.datetime.today(), *args, sep=' ', **kw_args) + def usage(): printerr("Usage is: export-to-sqlite.py [] [] []"); printerr("where: columns 'all' or 'branches'"); @@ -105,7 +108,7 @@ def do_query_(q): return raise Exception("Query failed: " + q.lastError().text()) -print(datetime.datetime.today(), "Creating database ...") +printdate("Creating database ...") db_exists = False try: @@ -383,7 +386,7 @@ if perf_db_export_calls: call_query.prepare("INSERT INTO calls VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)") def trace_begin(): - print(datetime.datetime.today(), "Writing records...") + printdate("Writing records...") do_query(query, 'BEGIN TRANSACTION') # id == 0 means unknown. It is easier to create records for them than replace the zeroes with NULLs evsel_table(0, "unknown") @@ -402,14 +405,14 @@ unhandled_count = 0 def trace_end(): do_query(query, 'END TRANSACTION') - print(datetime.datetime.today(), "Adding indexes") + printdate("Adding indexes") if perf_db_export_calls: do_query(query, 'CREATE INDEX pcpid_idx ON calls (parent_call_path_id)') do_query(query, 'CREATE INDEX pid_idx ON calls (parent_id)') if (unhandled_count): - print(datetime.datetime.today(), "Warning: ", unhandled_count, " unhandled events") - print(datetime.datetime.today(), "Done") + printdate("Warning: ", unhandled_count, " unhandled events") + printdate("Done") def trace_unhandled(event_name, context, event_fields_dict): global unhandled_count From df94bb44b518b1d0c9f4b2e5127441cec13ab75c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 11 Mar 2019 13:20:25 -0300 Subject: [PATCH 385/855] perf tools: Update x86's syscall_64.tbl, no change in tools/perf behaviour To pick the changes in 7948450d4556 ("x86/x32: use time64 versions of sigtimedwait and recvmmsg"), that doesn't cause any change in behaviour in tools/perf/ as it deals just with the x32 entries. 
This silences this tools/perf build warning: Warning: Kernel ABI header at 'tools/perf/arch/x86/entry/syscalls/syscall_64.tbl' differs from latest version at 'arch/x86/entry/syscalls/syscall_64.tbl' diff -u tools/perf/arch/x86/entry/syscalls/syscall_64.tbl arch/x86/entry/syscalls/syscall_64.tbl Cc: Adrian Hunter Cc: Arnd Bergmann Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-mqpvshayeqidlulx5qpioa59@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/entry/syscalls/syscall_64.tbl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl index f0b1709a5ffb..2ae92fddb6d5 100644 --- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl @@ -343,6 +343,8 @@ 332 common statx __x64_sys_statx 333 common io_pgetevents __x64_sys_io_pgetevents 334 common rseq __x64_sys_rseq +# don't use numbers 387 through 423, add new calls after the last +# 'common' entry # # x32-specific system call numbers start at 512 to avoid cache impact @@ -361,7 +363,7 @@ 520 x32 execve __x32_compat_sys_execve/ptregs 521 x32 ptrace __x32_compat_sys_ptrace 522 x32 rt_sigpending __x32_compat_sys_rt_sigpending -523 x32 rt_sigtimedwait __x32_compat_sys_rt_sigtimedwait +523 x32 rt_sigtimedwait __x32_compat_sys_rt_sigtimedwait_time64 524 x32 rt_sigqueueinfo __x32_compat_sys_rt_sigqueueinfo 525 x32 sigaltstack __x32_compat_sys_sigaltstack 526 x32 timer_create __x32_compat_sys_timer_create @@ -375,7 +377,7 @@ 534 x32 preadv __x32_compat_sys_preadv64 535 x32 pwritev __x32_compat_sys_pwritev64 536 x32 rt_tgsigqueueinfo __x32_compat_sys_rt_tgsigqueueinfo -537 x32 recvmmsg __x32_compat_sys_recvmmsg +537 x32 recvmmsg __x32_compat_sys_recvmmsg_time64 538 x32 sendmmsg __x32_compat_sys_sendmmsg 539 x32 process_vm_readv __x32_compat_sys_process_vm_readv 540 x32 process_vm_writev __x32_compat_sys_process_vm_writev From 1a787fc5ba18ac767e635c58d06a0b46876184e3 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 11 Mar 2019 13:30:08 -0300 Subject: [PATCH 386/855] tools headers uapi: Sync copy of asm-generic/unistd.h with the kernel sources To get the changes in: c8ce48f06503 ("asm-generic: Make time32 syscall numbers optional") Silencing these tools/perf build warnings: Warning: Kernel ABI header at 'tools/arch/arm64/include/uapi/asm/unistd.h' differs from latest version at 'arch/arm64/include/uapi/asm/unistd.h' diff -u tools/arch/arm64/include/uapi/asm/unistd.h arch/arm64/include/uapi/asm/unistd.h Warning: Kernel ABI header at 'tools/include/uapi/asm-generic/unistd.h' differs from latest version at 'include/uapi/asm-generic/unistd.h' diff -u tools/include/uapi/asm-generic/unistd.h include/uapi/asm-generic/unistd.h Test built it under the ubuntu:14.04.4-x-linaro-arm64 cross build environment and looked at the syscall table at /tmp/build/perf/arch/arm64/include/generated/asm/syscalls.c, looks ok. 
Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnd Bergmann Cc: Hendrik Brueckner Cc: Jiri Olsa Cc: Kim Phillips Cc: Michael Ellerman Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Thomas Richter Link: https://lkml.kernel.org/n/tip-e4w7ngsmkq48bd6st52ty2kb@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/arm64/include/uapi/asm/unistd.h | 2 + tools/include/uapi/asm-generic/unistd.h | 149 ++++++++++++++++----- 2 files changed, 118 insertions(+), 33 deletions(-) diff --git a/tools/arch/arm64/include/uapi/asm/unistd.h b/tools/arch/arm64/include/uapi/asm/unistd.h index dae1584cf017..4703d218663a 100644 --- a/tools/arch/arm64/include/uapi/asm/unistd.h +++ b/tools/arch/arm64/include/uapi/asm/unistd.h @@ -17,5 +17,7 @@ #define __ARCH_WANT_RENAMEAT #define __ARCH_WANT_NEW_STAT +#define __ARCH_WANT_SET_GET_RLIMIT +#define __ARCH_WANT_TIME32_SYSCALLS #include diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index d90127298f12..12cdf611d217 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -38,8 +38,10 @@ __SYSCALL(__NR_io_destroy, sys_io_destroy) __SC_COMP(__NR_io_submit, sys_io_submit, compat_sys_io_submit) #define __NR_io_cancel 3 __SYSCALL(__NR_io_cancel, sys_io_cancel) +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_io_getevents 4 -__SC_COMP(__NR_io_getevents, sys_io_getevents, compat_sys_io_getevents) +__SC_3264(__NR_io_getevents, sys_io_getevents_time32, sys_io_getevents) +#endif /* fs/xattr.c */ #define __NR_setxattr 5 @@ -179,7 +181,7 @@ __SYSCALL(__NR_fchownat, sys_fchownat) #define __NR_fchown 55 __SYSCALL(__NR_fchown, sys_fchown) #define __NR_openat 56 -__SC_COMP(__NR_openat, sys_openat, compat_sys_openat) +__SYSCALL(__NR_openat, sys_openat) #define __NR_close 57 __SYSCALL(__NR_close, sys_close) #define __NR_vhangup 58 @@ -222,10 +224,12 @@ __SC_COMP(__NR_pwritev, sys_pwritev, compat_sys_pwritev) __SYSCALL(__NR3264_sendfile, sys_sendfile64) /* fs/select.c */ +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_pselect6 72 -__SC_COMP(__NR_pselect6, sys_pselect6, compat_sys_pselect6) +__SC_COMP_3264(__NR_pselect6, sys_pselect6_time32, sys_pselect6, compat_sys_pselect6_time32) #define __NR_ppoll 73 -__SC_COMP(__NR_ppoll, sys_ppoll, compat_sys_ppoll) +__SC_COMP_3264(__NR_ppoll, sys_ppoll_time32, sys_ppoll, compat_sys_ppoll_time32) +#endif /* fs/signalfd.c */ #define __NR_signalfd4 74 @@ -269,16 +273,20 @@ __SC_COMP(__NR_sync_file_range, sys_sync_file_range, \ /* fs/timerfd.c */ #define __NR_timerfd_create 85 __SYSCALL(__NR_timerfd_create, sys_timerfd_create) +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_timerfd_settime 86 -__SC_COMP(__NR_timerfd_settime, sys_timerfd_settime, \ - compat_sys_timerfd_settime) +__SC_3264(__NR_timerfd_settime, sys_timerfd_settime32, \ + sys_timerfd_settime) #define __NR_timerfd_gettime 87 -__SC_COMP(__NR_timerfd_gettime, sys_timerfd_gettime, \ - compat_sys_timerfd_gettime) +__SC_3264(__NR_timerfd_gettime, sys_timerfd_gettime32, \ + sys_timerfd_gettime) +#endif /* fs/utimes.c */ +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_utimensat 88 -__SC_COMP(__NR_utimensat, sys_utimensat, compat_sys_utimensat) +__SC_3264(__NR_utimensat, sys_utimensat_time32, sys_utimensat) +#endif /* kernel/acct.c */ #define __NR_acct 89 @@ -309,8 +317,10 @@ __SYSCALL(__NR_set_tid_address, sys_set_tid_address) __SYSCALL(__NR_unshare, 
sys_unshare) /* kernel/futex.c */ +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_futex 98 -__SC_COMP(__NR_futex, sys_futex, compat_sys_futex) +__SC_3264(__NR_futex, sys_futex_time32, sys_futex) +#endif #define __NR_set_robust_list 99 __SC_COMP(__NR_set_robust_list, sys_set_robust_list, \ compat_sys_set_robust_list) @@ -319,8 +329,10 @@ __SC_COMP(__NR_get_robust_list, sys_get_robust_list, \ compat_sys_get_robust_list) /* kernel/hrtimer.c */ +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_nanosleep 101 -__SC_COMP(__NR_nanosleep, sys_nanosleep, compat_sys_nanosleep) +__SC_3264(__NR_nanosleep, sys_nanosleep_time32, sys_nanosleep) +#endif /* kernel/itimer.c */ #define __NR_getitimer 102 @@ -341,23 +353,29 @@ __SYSCALL(__NR_delete_module, sys_delete_module) /* kernel/posix-timers.c */ #define __NR_timer_create 107 __SC_COMP(__NR_timer_create, sys_timer_create, compat_sys_timer_create) +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_timer_gettime 108 -__SC_COMP(__NR_timer_gettime, sys_timer_gettime, compat_sys_timer_gettime) +__SC_3264(__NR_timer_gettime, sys_timer_gettime32, sys_timer_gettime) +#endif #define __NR_timer_getoverrun 109 __SYSCALL(__NR_timer_getoverrun, sys_timer_getoverrun) +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_timer_settime 110 -__SC_COMP(__NR_timer_settime, sys_timer_settime, compat_sys_timer_settime) +__SC_3264(__NR_timer_settime, sys_timer_settime32, sys_timer_settime) +#endif #define __NR_timer_delete 111 __SYSCALL(__NR_timer_delete, sys_timer_delete) +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_clock_settime 112 -__SC_COMP(__NR_clock_settime, sys_clock_settime, compat_sys_clock_settime) +__SC_3264(__NR_clock_settime, sys_clock_settime32, sys_clock_settime) #define __NR_clock_gettime 113 -__SC_COMP(__NR_clock_gettime, sys_clock_gettime, compat_sys_clock_gettime) +__SC_3264(__NR_clock_gettime, sys_clock_gettime32, sys_clock_gettime) #define __NR_clock_getres 114 -__SC_COMP(__NR_clock_getres, sys_clock_getres, compat_sys_clock_getres) +__SC_3264(__NR_clock_getres, sys_clock_getres_time32, sys_clock_getres) #define __NR_clock_nanosleep 115 -__SC_COMP(__NR_clock_nanosleep, sys_clock_nanosleep, \ - compat_sys_clock_nanosleep) +__SC_3264(__NR_clock_nanosleep, sys_clock_nanosleep_time32, \ + sys_clock_nanosleep) +#endif /* kernel/printk.c */ #define __NR_syslog 116 @@ -388,9 +406,11 @@ __SYSCALL(__NR_sched_yield, sys_sched_yield) __SYSCALL(__NR_sched_get_priority_max, sys_sched_get_priority_max) #define __NR_sched_get_priority_min 126 __SYSCALL(__NR_sched_get_priority_min, sys_sched_get_priority_min) +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_sched_rr_get_interval 127 -__SC_COMP(__NR_sched_rr_get_interval, sys_sched_rr_get_interval, \ - compat_sys_sched_rr_get_interval) +__SC_3264(__NR_sched_rr_get_interval, sys_sched_rr_get_interval_time32, \ + sys_sched_rr_get_interval) +#endif /* kernel/signal.c */ #define __NR_restart_syscall 128 @@ -411,9 +431,11 @@ __SC_COMP(__NR_rt_sigaction, sys_rt_sigaction, compat_sys_rt_sigaction) __SC_COMP(__NR_rt_sigprocmask, sys_rt_sigprocmask, compat_sys_rt_sigprocmask) #define __NR_rt_sigpending 136 __SC_COMP(__NR_rt_sigpending, sys_rt_sigpending, compat_sys_rt_sigpending) +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_rt_sigtimedwait 137 -__SC_COMP(__NR_rt_sigtimedwait, sys_rt_sigtimedwait, \ - 
compat_sys_rt_sigtimedwait) +__SC_COMP_3264(__NR_rt_sigtimedwait, sys_rt_sigtimedwait_time32, \ + sys_rt_sigtimedwait, compat_sys_rt_sigtimedwait_time32) +#endif #define __NR_rt_sigqueueinfo 138 __SC_COMP(__NR_rt_sigqueueinfo, sys_rt_sigqueueinfo, \ compat_sys_rt_sigqueueinfo) @@ -467,10 +489,15 @@ __SYSCALL(__NR_uname, sys_newuname) __SYSCALL(__NR_sethostname, sys_sethostname) #define __NR_setdomainname 162 __SYSCALL(__NR_setdomainname, sys_setdomainname) + +#ifdef __ARCH_WANT_SET_GET_RLIMIT +/* getrlimit and setrlimit are superseded with prlimit64 */ #define __NR_getrlimit 163 __SC_COMP(__NR_getrlimit, sys_getrlimit, compat_sys_getrlimit) #define __NR_setrlimit 164 __SC_COMP(__NR_setrlimit, sys_setrlimit, compat_sys_setrlimit) +#endif + #define __NR_getrusage 165 __SC_COMP(__NR_getrusage, sys_getrusage, compat_sys_getrusage) #define __NR_umask 166 @@ -481,12 +508,14 @@ __SYSCALL(__NR_prctl, sys_prctl) __SYSCALL(__NR_getcpu, sys_getcpu) /* kernel/time.c */ +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_gettimeofday 169 __SC_COMP(__NR_gettimeofday, sys_gettimeofday, compat_sys_gettimeofday) #define __NR_settimeofday 170 __SC_COMP(__NR_settimeofday, sys_settimeofday, compat_sys_settimeofday) #define __NR_adjtimex 171 -__SC_COMP(__NR_adjtimex, sys_adjtimex, compat_sys_adjtimex) +__SC_3264(__NR_adjtimex, sys_adjtimex_time32, sys_adjtimex) +#endif /* kernel/timer.c */ #define __NR_getpid 172 @@ -511,11 +540,13 @@ __SC_COMP(__NR_sysinfo, sys_sysinfo, compat_sys_sysinfo) __SC_COMP(__NR_mq_open, sys_mq_open, compat_sys_mq_open) #define __NR_mq_unlink 181 __SYSCALL(__NR_mq_unlink, sys_mq_unlink) +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_mq_timedsend 182 -__SC_COMP(__NR_mq_timedsend, sys_mq_timedsend, compat_sys_mq_timedsend) +__SC_3264(__NR_mq_timedsend, sys_mq_timedsend_time32, sys_mq_timedsend) #define __NR_mq_timedreceive 183 -__SC_COMP(__NR_mq_timedreceive, sys_mq_timedreceive, \ - compat_sys_mq_timedreceive) +__SC_3264(__NR_mq_timedreceive, sys_mq_timedreceive_time32, \ + sys_mq_timedreceive) +#endif #define __NR_mq_notify 184 __SC_COMP(__NR_mq_notify, sys_mq_notify, compat_sys_mq_notify) #define __NR_mq_getsetattr 185 @@ -536,8 +567,10 @@ __SC_COMP(__NR_msgsnd, sys_msgsnd, compat_sys_msgsnd) __SYSCALL(__NR_semget, sys_semget) #define __NR_semctl 191 __SC_COMP(__NR_semctl, sys_semctl, compat_sys_semctl) +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_semtimedop 192 -__SC_COMP(__NR_semtimedop, sys_semtimedop, compat_sys_semtimedop) +__SC_COMP(__NR_semtimedop, sys_semtimedop, sys_semtimedop_time32) +#endif #define __NR_semop 193 __SYSCALL(__NR_semop, sys_semop) @@ -658,8 +691,10 @@ __SC_COMP(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo, \ __SYSCALL(__NR_perf_event_open, sys_perf_event_open) #define __NR_accept4 242 __SYSCALL(__NR_accept4, sys_accept4) +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_recvmmsg 243 -__SC_COMP(__NR_recvmmsg, sys_recvmmsg, compat_sys_recvmmsg) +__SC_COMP_3264(__NR_recvmmsg, sys_recvmmsg_time32, sys_recvmmsg, compat_sys_recvmmsg_time32) +#endif /* * Architectures may provide up to 16 syscalls of their own @@ -667,8 +702,10 @@ __SC_COMP(__NR_recvmmsg, sys_recvmmsg, compat_sys_recvmmsg) */ #define __NR_arch_specific_syscall 244 +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_wait4 260 __SC_COMP(__NR_wait4, sys_wait4, compat_sys_wait4) +#endif #define __NR_prlimit64 261 __SYSCALL(__NR_prlimit64, 
sys_prlimit64) #define __NR_fanotify_init 262 @@ -678,10 +715,11 @@ __SYSCALL(__NR_fanotify_mark, sys_fanotify_mark) #define __NR_name_to_handle_at 264 __SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at) #define __NR_open_by_handle_at 265 -__SC_COMP(__NR_open_by_handle_at, sys_open_by_handle_at, \ - compat_sys_open_by_handle_at) +__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at) +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_clock_adjtime 266 -__SC_COMP(__NR_clock_adjtime, sys_clock_adjtime, compat_sys_clock_adjtime) +__SC_3264(__NR_clock_adjtime, sys_clock_adjtime32, sys_clock_adjtime) +#endif #define __NR_syncfs 267 __SYSCALL(__NR_syncfs, sys_syncfs) #define __NR_setns 268 @@ -734,15 +772,60 @@ __SYSCALL(__NR_pkey_alloc, sys_pkey_alloc) __SYSCALL(__NR_pkey_free, sys_pkey_free) #define __NR_statx 291 __SYSCALL(__NR_statx, sys_statx) +#if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32 #define __NR_io_pgetevents 292 -__SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents) +__SC_COMP_3264(__NR_io_pgetevents, sys_io_pgetevents_time32, sys_io_pgetevents, compat_sys_io_pgetevents) +#endif #define __NR_rseq 293 __SYSCALL(__NR_rseq, sys_rseq) #define __NR_kexec_file_load 294 __SYSCALL(__NR_kexec_file_load, sys_kexec_file_load) +/* 295 through 402 are unassigned to sync up with generic numbers, don't use */ +#if __BITS_PER_LONG == 32 +#define __NR_clock_gettime64 403 +__SYSCALL(__NR_clock_gettime64, sys_clock_gettime) +#define __NR_clock_settime64 404 +__SYSCALL(__NR_clock_settime64, sys_clock_settime) +#define __NR_clock_adjtime64 405 +__SYSCALL(__NR_clock_adjtime64, sys_clock_adjtime) +#define __NR_clock_getres_time64 406 +__SYSCALL(__NR_clock_getres_time64, sys_clock_getres) +#define __NR_clock_nanosleep_time64 407 +__SYSCALL(__NR_clock_nanosleep_time64, sys_clock_nanosleep) +#define __NR_timer_gettime64 408 +__SYSCALL(__NR_timer_gettime64, sys_timer_gettime) +#define __NR_timer_settime64 409 +__SYSCALL(__NR_timer_settime64, sys_timer_settime) +#define __NR_timerfd_gettime64 410 +__SYSCALL(__NR_timerfd_gettime64, sys_timerfd_gettime) +#define __NR_timerfd_settime64 411 +__SYSCALL(__NR_timerfd_settime64, sys_timerfd_settime) +#define __NR_utimensat_time64 412 +__SYSCALL(__NR_utimensat_time64, sys_utimensat) +#define __NR_pselect6_time64 413 +__SC_COMP(__NR_pselect6_time64, sys_pselect6, compat_sys_pselect6_time64) +#define __NR_ppoll_time64 414 +__SC_COMP(__NR_ppoll_time64, sys_ppoll, compat_sys_ppoll_time64) +#define __NR_io_pgetevents_time64 416 +__SYSCALL(__NR_io_pgetevents_time64, sys_io_pgetevents) +#define __NR_recvmmsg_time64 417 +__SC_COMP(__NR_recvmmsg_time64, sys_recvmmsg, compat_sys_recvmmsg_time64) +#define __NR_mq_timedsend_time64 418 +__SYSCALL(__NR_mq_timedsend_time64, sys_mq_timedsend) +#define __NR_mq_timedreceive_time64 419 +__SYSCALL(__NR_mq_timedreceive_time64, sys_mq_timedreceive) +#define __NR_semtimedop_time64 420 +__SYSCALL(__NR_semtimedop_time64, sys_semtimedop) +#define __NR_rt_sigtimedwait_time64 421 +__SC_COMP(__NR_rt_sigtimedwait_time64, sys_rt_sigtimedwait, compat_sys_rt_sigtimedwait_time64) +#define __NR_futex_time64 422 +__SYSCALL(__NR_futex_time64, sys_futex) +#define __NR_sched_rr_get_interval_time64 423 +__SYSCALL(__NR_sched_rr_get_interval_time64, sys_sched_rr_get_interval) +#endif #undef __NR_syscalls -#define __NR_syscalls 295 +#define __NR_syscalls 424 /* * 32 bit systems traditionally used different From 2fb71043e8894ca78258f7458a2db2eb3a142a22 Mon Sep 17 00:00:00 2001 From: 
Arnaldo Carvalho de Melo Date: Mon, 11 Mar 2019 13:39:48 -0300 Subject: [PATCH 387/855] tools headers uapi: Update linux/in.h copy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To get the changes in: 4effd28c1245 ("bridge: join all-snoopers multicast address") That do not generate any changes in tools/ use of this file. Silences this tools/perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/in.h' differs from latest version at 'include/uapi/linux/in.h' diff -u tools/include/uapi/linux/in.h include/uapi/linux/in.h Cc: Adrian Hunter Cc: David S. Miller Cc: Jiri Olsa Cc: Linus Lüssing Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-ifpl634035266ho6wxuqgo81@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/in.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/include/uapi/linux/in.h b/tools/include/uapi/linux/in.h index a55cb8b10165..e7ad9d350a28 100644 --- a/tools/include/uapi/linux/in.h +++ b/tools/include/uapi/linux/in.h @@ -292,10 +292,11 @@ struct sockaddr_in { #define IN_LOOPBACK(a) ((((long int) (a)) & 0xff000000) == 0x7f000000) /* Defines for Multicast INADDR */ -#define INADDR_UNSPEC_GROUP 0xe0000000U /* 224.0.0.0 */ -#define INADDR_ALLHOSTS_GROUP 0xe0000001U /* 224.0.0.1 */ -#define INADDR_ALLRTRS_GROUP 0xe0000002U /* 224.0.0.2 */ -#define INADDR_MAX_LOCAL_GROUP 0xe00000ffU /* 224.0.0.255 */ +#define INADDR_UNSPEC_GROUP 0xe0000000U /* 224.0.0.0 */ +#define INADDR_ALLHOSTS_GROUP 0xe0000001U /* 224.0.0.1 */ +#define INADDR_ALLRTRS_GROUP 0xe0000002U /* 224.0.0.2 */ +#define INADDR_ALLSNOOPERS_GROUP 0xe000006aU /* 224.0.0.106 */ +#define INADDR_MAX_LOCAL_GROUP 0xe00000ffU /* 224.0.0.255 */ #endif /* contains the htonl type stuff.. */ From e87e548126cdc66fd4f194b38b59f351b6e5d3e8 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 11 Mar 2019 07:44:52 -0700 Subject: [PATCH 388/855] perf script: Filter COMM/FORK/.. events by CPU The --cpu option only filtered samples. Filter other perf events, such as COMM, FORK, SWITCH by the CPU too. Reported-by: Jiri Olsa Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Link: http://lkml.kernel.org/r/20190311144502.15423-2-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 71 ++++++++++++++++++++++++------------- 1 file changed, 47 insertions(+), 24 deletions(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 111787e83784..b695b20ffc8a 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -1933,6 +1933,13 @@ static int cleanup_scripting(void) return scripting_ops ? 
scripting_ops->stop_script() : 0; } +static bool filter_cpu(struct perf_sample *sample) +{ + if (cpu_list) + return !test_bit(sample->cpu, cpu_bitmap); + return false; +} + static int process_sample_event(struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, @@ -1967,7 +1974,7 @@ static int process_sample_event(struct perf_tool *tool, if (al.filtered) goto out_put; - if (cpu_list && !test_bit(sample->cpu, cpu_bitmap)) + if (filter_cpu(sample)) goto out_put; if (scripting_ops) @@ -2052,9 +2059,11 @@ static int process_comm_event(struct perf_tool *tool, sample->tid = event->comm.tid; sample->pid = event->comm.pid; } - perf_sample__fprintf_start(sample, thread, evsel, + if (!filter_cpu(sample)) { + perf_sample__fprintf_start(sample, thread, evsel, PERF_RECORD_COMM, stdout); - perf_event__fprintf(event, stdout); + perf_event__fprintf(event, stdout); + } ret = 0; out: thread__put(thread); @@ -2088,9 +2097,11 @@ static int process_namespaces_event(struct perf_tool *tool, sample->tid = event->namespaces.tid; sample->pid = event->namespaces.pid; } - perf_sample__fprintf_start(sample, thread, evsel, - PERF_RECORD_NAMESPACES, stdout); - perf_event__fprintf(event, stdout); + if (!filter_cpu(sample)) { + perf_sample__fprintf_start(sample, thread, evsel, + PERF_RECORD_NAMESPACES, stdout); + perf_event__fprintf(event, stdout); + } ret = 0; out: thread__put(thread); @@ -2122,9 +2133,11 @@ static int process_fork_event(struct perf_tool *tool, sample->tid = event->fork.tid; sample->pid = event->fork.pid; } - perf_sample__fprintf_start(sample, thread, evsel, - PERF_RECORD_FORK, stdout); - perf_event__fprintf(event, stdout); + if (!filter_cpu(sample)) { + perf_sample__fprintf_start(sample, thread, evsel, + PERF_RECORD_FORK, stdout); + perf_event__fprintf(event, stdout); + } thread__put(thread); return 0; @@ -2152,9 +2165,11 @@ static int process_exit_event(struct perf_tool *tool, sample->tid = event->fork.tid; sample->pid = event->fork.pid; } - perf_sample__fprintf_start(sample, thread, evsel, - PERF_RECORD_EXIT, stdout); - perf_event__fprintf(event, stdout); + if (!filter_cpu(sample)) { + perf_sample__fprintf_start(sample, thread, evsel, + PERF_RECORD_EXIT, stdout); + perf_event__fprintf(event, stdout); + } if (perf_event__process_exit(tool, event, sample, machine) < 0) err = -1; @@ -2188,9 +2203,11 @@ static int process_mmap_event(struct perf_tool *tool, sample->tid = event->mmap.tid; sample->pid = event->mmap.pid; } - perf_sample__fprintf_start(sample, thread, evsel, - PERF_RECORD_MMAP, stdout); - perf_event__fprintf(event, stdout); + if (!filter_cpu(sample)) { + perf_sample__fprintf_start(sample, thread, evsel, + PERF_RECORD_MMAP, stdout); + perf_event__fprintf(event, stdout); + } thread__put(thread); return 0; } @@ -2220,9 +2237,11 @@ static int process_mmap2_event(struct perf_tool *tool, sample->tid = event->mmap2.tid; sample->pid = event->mmap2.pid; } - perf_sample__fprintf_start(sample, thread, evsel, - PERF_RECORD_MMAP2, stdout); - perf_event__fprintf(event, stdout); + if (!filter_cpu(sample)) { + perf_sample__fprintf_start(sample, thread, evsel, + PERF_RECORD_MMAP2, stdout); + perf_event__fprintf(event, stdout); + } thread__put(thread); return 0; } @@ -2247,9 +2266,11 @@ static int process_switch_event(struct perf_tool *tool, return -1; } - perf_sample__fprintf_start(sample, thread, evsel, - PERF_RECORD_SWITCH, stdout); - perf_event__fprintf(event, stdout); + if (!filter_cpu(sample)) { + perf_sample__fprintf_start(sample, thread, evsel, + PERF_RECORD_SWITCH, stdout); + 
perf_event__fprintf(event, stdout); + } thread__put(thread); return 0; } @@ -2270,9 +2291,11 @@ process_lost_event(struct perf_tool *tool, if (thread == NULL) return -1; - perf_sample__fprintf_start(sample, thread, evsel, - PERF_RECORD_LOST, stdout); - perf_event__fprintf(event, stdout); + if (!filter_cpu(sample)) { + perf_sample__fprintf_start(sample, thread, evsel, + PERF_RECORD_LOST, stdout); + perf_event__fprintf(event, stdout); + } thread__put(thread); return 0; } From 8f2bf884114c118f6aa11dd380c5dd7ce60380dc Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Mon, 25 Feb 2019 14:59:37 +0800 Subject: [PATCH 389/855] drm/amd/powerplay: set max fan target temperature as 105C A workaround to override the fan target temperature in SMC table. Signed-off-by: Evan Quan Reviewed-by: Kenneth Feng Signed-off-by: Alex Deucher --- .../powerplay/hwmgr/vega20_processpptables.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_processpptables.c b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_processpptables.c index 97f8a1a970c3..7a7f15d0c53a 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_processpptables.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_processpptables.c @@ -32,6 +32,8 @@ #include "cgs_common.h" #include "vega20_pptable.h" +#define VEGA20_FAN_TARGET_TEMPERATURE_OVERRIDE 105 + static void set_hw_cap(struct pp_hwmgr *hwmgr, bool enable, enum phm_platform_caps cap) { @@ -798,6 +800,17 @@ static int append_vbios_pptable(struct pp_hwmgr *hwmgr, PPTable_t *ppsmc_pptable return 0; } +static int override_powerplay_table_fantargettemperature(struct pp_hwmgr *hwmgr) +{ + struct phm_ppt_v3_information *pptable_information = + (struct phm_ppt_v3_information *)hwmgr->pptable; + PPTable_t *ppsmc_pptable = (PPTable_t *)(pptable_information->smc_pptable); + + ppsmc_pptable->FanTargetTemperature = VEGA20_FAN_TARGET_TEMPERATURE_OVERRIDE; + + return 0; +} + #define VEGA20_ENGINECLOCK_HARDMAX 198000 static int init_powerplay_table_information( struct pp_hwmgr *hwmgr, @@ -887,6 +900,10 @@ static int init_powerplay_table_information( result = append_vbios_pptable(hwmgr, (pptable_information->smc_pptable)); + if (result) + return result; + + result = override_powerplay_table_fantargettemperature(hwmgr); return result; } From f5742ec36422a39b57f0256e4847f61b3c432f8c Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Mon, 25 Feb 2019 16:44:36 +0800 Subject: [PATCH 390/855] drm/amd/powerplay: correct power reading on fiji Set sampling period as 500ms to provide a smooth power reading output. Also, correct the register for power reading. 
Signed-off-by: Evan Quan Reviewed-by: Feifei Xu Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c index 48187acac59e..83d3d935f3ac 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c @@ -3491,14 +3491,14 @@ static int smu7_get_gpu_power(struct pp_hwmgr *hwmgr, u32 *query) smum_send_msg_to_smc(hwmgr, PPSMC_MSG_PmStatusLogStart); cgs_write_ind_register(hwmgr->device, CGS_IND_REG__SMC, - ixSMU_PM_STATUS_94, 0); + ixSMU_PM_STATUS_95, 0); for (i = 0; i < 10; i++) { - mdelay(1); + mdelay(500); smum_send_msg_to_smc(hwmgr, PPSMC_MSG_PmStatusLogSample); tmp = cgs_read_ind_register(hwmgr->device, CGS_IND_REG__SMC, - ixSMU_PM_STATUS_94); + ixSMU_PM_STATUS_95); if (tmp != 0) break; } From 3723908d05834c76fd5cc9ecd17b0851342e1df4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 11 Mar 2019 07:44:54 -0700 Subject: [PATCH 391/855] perf report: Support time sort key Add a time sort key to perf report to display samples for different time quantums separately. This allows easier analysis of workloads that change over time, and also will allow looking at the context of samples. % perf record ... % perf report --sort time,overhead,symbol --time-quantum 1ms --stdio ... 0.67% 277061.87300 [.] _dl_start 0.50% 277061.87300 [.] f1 0.50% 277061.87300 [.] f2 0.33% 277061.87300 [.] main 0.29% 277061.87300 [.] _dl_lookup_symbol_x 0.29% 277061.87300 [.] dl_main 0.29% 277061.87300 [.] do_lookup_x 0.17% 277061.87300 [.] _dl_debug_initialize 0.17% 277061.87300 [.] _dl_init_paths 0.08% 277061.87300 [.] check_match 0.04% 277061.87300 [.] _dl_count_modids 1.33% 277061.87400 [.] f1 1.33% 277061.87400 [.] f2 1.33% 277061.87400 [.] main 1.17% 277061.87500 [.] main 1.08% 277061.87500 [.] f1 1.08% 277061.87500 [.] f2 1.00% 277061.87600 [.] main 0.83% 277061.87600 [.] f1 0.83% 277061.87600 [.] f2 1.00% 277061.87700 [.] main Committer notes: Rename 'time' argument to hist_time() to htime to overcome this in older distros: cc1: warnings being treated as errors util/hist.c: In function 'hist_time': util/hist.c:251: error: declaration of 'time' shadows a global declaration /usr/include/time.h:186: error: shadowed declaration is here Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Link: http://lkml.kernel.org/r/20190311144502.15423-4-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-report.txt | 2 ++ tools/perf/util/hist.c | 11 +++++++ tools/perf/util/hist.h | 1 + tools/perf/util/sort.c | 39 ++++++++++++++++++++++++ tools/perf/util/sort.h | 2 ++ 5 files changed, 55 insertions(+) diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index 9ec1702bccdd..546d87221ad8 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -105,6 +105,8 @@ OPTIONS guest machine - sample: Number of sample - period: Raw number of event count of sample + - time: Separate the samples by time stamp with the resolution specified by + --time-quantum (default 100ms). Specify with overhead and before it. By default, comm, dso and symbol keys are used. (i.e. 
--sort comm,dso,symbol) diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index f9eb95bf3938..34c0f00c68d1 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -19,6 +19,7 @@ #include #include #include +#include static bool hists__filter_entry_by_dso(struct hists *hists, struct hist_entry *he); @@ -192,6 +193,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h) hists__new_col_len(hists, HISTC_MEM_LVL, 21 + 3); hists__new_col_len(hists, HISTC_LOCAL_WEIGHT, 12); hists__new_col_len(hists, HISTC_GLOBAL_WEIGHT, 12); + hists__new_col_len(hists, HISTC_TIME, 12); if (h->srcline) { len = MAX(strlen(h->srcline), strlen(sort_srcline.se_header)); @@ -246,6 +248,14 @@ static void he_stat__add_cpumode_period(struct he_stat *he_stat, } } +static long hist_time(unsigned long htime) +{ + unsigned long time_quantum = symbol_conf.time_quantum; + if (time_quantum) + return (htime / time_quantum) * time_quantum; + return htime; +} + static void he_stat__add_period(struct he_stat *he_stat, u64 period, u64 weight) { @@ -635,6 +645,7 @@ __hists__add_entry(struct hists *hists, .raw_data = sample->raw_data, .raw_size = sample->raw_size, .ops = ops, + .time = hist_time(sample->time), }, *he = hists__findnew_entry(hists, &entry, al, sample_self); if (!hists->has_callchains && he && he->callchain_size != 0) diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 4af27fbab24f..6279eca56409 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -31,6 +31,7 @@ enum hist_filter { enum hist_column { HISTC_SYMBOL, + HISTC_TIME, HISTC_DSO, HISTC_THREAD, HISTC_COMM, diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index d2299e912e59..bdd30cab51cb 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -3,6 +3,7 @@ #include #include #include +#include #include "sort.h" #include "hist.h" #include "comm.h" @@ -15,6 +16,7 @@ #include #include "mem-events.h" #include "annotate.h" +#include "time-utils.h" #include regex_t parent_regex; @@ -654,6 +656,42 @@ struct sort_entry sort_socket = { .se_width_idx = HISTC_SOCKET, }; +/* --sort time */ + +static int64_t +sort__time_cmp(struct hist_entry *left, struct hist_entry *right) +{ + return right->time - left->time; +} + +static int hist_entry__time_snprintf(struct hist_entry *he, char *bf, + size_t size, unsigned int width) +{ + unsigned long secs; + unsigned long long nsecs; + char he_time[32]; + + nsecs = he->time; + secs = nsecs / NSEC_PER_SEC; + nsecs -= secs * NSEC_PER_SEC; + + if (symbol_conf.nanosecs) + snprintf(he_time, sizeof he_time, "%5lu.%09llu: ", + secs, nsecs); + else + timestamp__scnprintf_usec(he->time, he_time, + sizeof(he_time)); + + return repsep_snprintf(bf, size, "%-.*s", width, he_time); +} + +struct sort_entry sort_time = { + .se_header = "Time", + .se_cmp = sort__time_cmp, + .se_snprintf = hist_entry__time_snprintf, + .se_width_idx = HISTC_TIME, +}; + /* --sort trace */ static char *get_trace_output(struct hist_entry *he) @@ -1634,6 +1672,7 @@ static struct sort_dimension common_sort_dimensions[] = { DIM(SORT_DSO_SIZE, "dso_size", sort_dso_size), DIM(SORT_CGROUP_ID, "cgroup_id", sort_cgroup_id), DIM(SORT_SYM_IPC_NULL, "ipc_null", sort_sym_ipc_null), + DIM(SORT_TIME, "time", sort_time), }; #undef DIM diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 2fbee0b1011c..19dceb7f6145 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -135,6 +135,7 @@ struct hist_entry { char *srcfile; struct symbol *parent; struct branch_info *branch_info; + 
long time; struct hists *hists; struct mem_info *mem_info; void *raw_data; @@ -231,6 +232,7 @@ enum sort_type { SORT_DSO_SIZE, SORT_CGROUP_ID, SORT_SYM_IPC_NULL, + SORT_TIME, /* branch stack specific sort keys */ __SORT_BRANCH_STACK, From 1d6c49df74b0706af13a7d707638f0db374eaf88 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 11 Mar 2019 07:44:56 -0700 Subject: [PATCH 392/855] perf report: Support running scripts for current time range When using the time sort key, add new context menus to run scripts for only the currently selected time range. Compute the correct range for the selection add pass it as the --time option to perf script. Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Link: http://lkml.kernel.org/r/20190311144502.15423-6-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/hists.c | 83 +++++++++++++++++++++++++++++----- 1 file changed, 72 insertions(+), 11 deletions(-) diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index aef800d97ea1..f98aeac607dd 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "../../util/callchain.h" #include "../../util/evsel.h" @@ -30,6 +31,7 @@ #include "srcline.h" #include "string2.h" #include "units.h" +#include "time-utils.h" #include "sane_ctype.h" @@ -2338,6 +2340,7 @@ close_file_and_continue: } struct popup_action { + unsigned long time; struct thread *thread; struct map_symbol ms; int socket; @@ -2527,36 +2530,64 @@ static int do_run_script(struct hist_browser *browser __maybe_unused, struct popup_action *act) { - char script_opt[64]; - memset(script_opt, 0, sizeof(script_opt)); + char *script_opt; + int len; + int n = 0; + len = 100; + if (act->thread) + len += strlen(thread__comm_str(act->thread)); + else if (act->ms.sym) + len += strlen(act->ms.sym->name); + script_opt = malloc(len); + if (!script_opt) + return -1; + + script_opt[0] = 0; if (act->thread) { - scnprintf(script_opt, sizeof(script_opt), " -c %s ", + n = scnprintf(script_opt, len, " -c %s ", thread__comm_str(act->thread)); } else if (act->ms.sym) { - scnprintf(script_opt, sizeof(script_opt), " -S %s ", + n = scnprintf(script_opt, len, " -S %s ", act->ms.sym->name); } + if (act->time) { + char start[32], end[32]; + unsigned long starttime = act->time; + unsigned long endtime = act->time + symbol_conf.time_quantum; + + if (starttime == endtime) { /* Display 1ms as fallback */ + starttime -= 1*NSEC_PER_MSEC; + endtime += 1*NSEC_PER_MSEC; + } + timestamp__scnprintf_usec(starttime, start, sizeof start); + timestamp__scnprintf_usec(endtime, end, sizeof end); + n += snprintf(script_opt + n, len - n, " --time %s,%s", start, end); + } + script_browse(script_opt); + free(script_opt); return 0; } static int -add_script_opt(struct hist_browser *browser __maybe_unused, +add_script_opt_2(struct hist_browser *browser __maybe_unused, struct popup_action *act, char **optstr, - struct thread *thread, struct symbol *sym) + struct thread *thread, struct symbol *sym, + const char *tstr) { + if (thread) { - if (asprintf(optstr, "Run scripts for samples of thread [%s]", - thread__comm_str(thread)) < 0) + if (asprintf(optstr, "Run scripts for samples of thread [%s]%s", + thread__comm_str(thread), tstr) < 0) return 0; } else if (sym) { - if (asprintf(optstr, "Run scripts for samples of symbol [%s]", - sym->name) < 0) + if (asprintf(optstr, "Run scripts for samples of symbol [%s]%s", + sym->name, tstr) < 0) return 0; } else { - if (asprintf(optstr, 
"Run scripts for all samples") < 0) + if (asprintf(optstr, "Run scripts for all samples%s", tstr) < 0) return 0; } @@ -2566,6 +2597,36 @@ add_script_opt(struct hist_browser *browser __maybe_unused, return 1; } +static int +add_script_opt(struct hist_browser *browser, + struct popup_action *act, char **optstr, + struct thread *thread, struct symbol *sym) +{ + int n, j; + struct hist_entry *he; + + n = add_script_opt_2(browser, act, optstr, thread, sym, ""); + + he = hist_browser__selected_entry(browser); + if (sort_order && strstr(sort_order, "time")) { + char tstr[128]; + + optstr++; + act++; + j = sprintf(tstr, " in "); + j += timestamp__scnprintf_usec(he->time, tstr + j, + sizeof tstr - j); + j += sprintf(tstr + j, "-"); + timestamp__scnprintf_usec(he->time + symbol_conf.time_quantum, + tstr + j, + sizeof tstr - j); + n += add_script_opt_2(browser, act, optstr, thread, sym, + tstr); + act->time = he->time; + } + return n; +} + static int do_switch_data(struct hist_browser *browser __maybe_unused, struct popup_action *act __maybe_unused) From 6f3da20e151f4121548cf598730ae0f9559ae45d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 11 Mar 2019 07:44:57 -0700 Subject: [PATCH 393/855] perf report: Support builtin perf script in scripts menu The scripts menu traditionally only showed custom perf scripts. Allow to run standard perf script with useful default options too. - Normal perf script - perf script with assembler (needs xed installed) - perf script with source code output (needs debuginfo) - perf script with custom arguments Then we automatically select the right options to display the information in the perf.data file. For example with -b display branch contexts. It's not easily possible to check for xed's existence in advance. perf script usually gives sensible error messages when it's not available. 
Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Link: http://lkml.kernel.org/r/20190311144502.15423-7-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/annotate.c | 2 +- tools/perf/ui/browsers/hists.c | 23 +++--- tools/perf/ui/browsers/scripts.c | 127 +++++++++++++++++++++++------- tools/perf/util/hist.h | 8 +- 4 files changed, 120 insertions(+), 40 deletions(-) diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index 35bdfd8b1e71..98d934a36d86 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -750,7 +750,7 @@ static int annotate_browser__run(struct annotate_browser *browser, continue; case 'r': { - script_browse(NULL); + script_browse(NULL, NULL); continue; } case 'k': diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index f98aeac607dd..fb4430f8982c 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -2344,6 +2344,7 @@ struct popup_action { struct thread *thread; struct map_symbol ms; int socket; + struct perf_evsel *evsel; int (*fn)(struct hist_browser *browser, struct popup_action *act); }; @@ -2566,7 +2567,7 @@ do_run_script(struct hist_browser *browser __maybe_unused, n += snprintf(script_opt + n, len - n, " --time %s,%s", start, end); } - script_browse(script_opt); + script_browse(script_opt, act->evsel); free(script_opt); return 0; } @@ -2575,7 +2576,7 @@ static int add_script_opt_2(struct hist_browser *browser __maybe_unused, struct popup_action *act, char **optstr, struct thread *thread, struct symbol *sym, - const char *tstr) + struct perf_evsel *evsel, const char *tstr) { if (thread) { @@ -2593,6 +2594,7 @@ add_script_opt_2(struct hist_browser *browser __maybe_unused, act->thread = thread; act->ms.sym = sym; + act->evsel = evsel; act->fn = do_run_script; return 1; } @@ -2600,12 +2602,13 @@ add_script_opt_2(struct hist_browser *browser __maybe_unused, static int add_script_opt(struct hist_browser *browser, struct popup_action *act, char **optstr, - struct thread *thread, struct symbol *sym) + struct thread *thread, struct symbol *sym, + struct perf_evsel *evsel) { int n, j; struct hist_entry *he; - n = add_script_opt_2(browser, act, optstr, thread, sym, ""); + n = add_script_opt_2(browser, act, optstr, thread, sym, evsel, ""); he = hist_browser__selected_entry(browser); if (sort_order && strstr(sort_order, "time")) { @@ -2618,10 +2621,9 @@ add_script_opt(struct hist_browser *browser, sizeof tstr - j); j += sprintf(tstr + j, "-"); timestamp__scnprintf_usec(he->time + symbol_conf.time_quantum, - tstr + j, - sizeof tstr - j); + tstr + j, sizeof tstr - j); n += add_script_opt_2(browser, act, optstr, thread, sym, - tstr); + evsel, tstr); act->time = he->time; } return n; @@ -3092,7 +3094,7 @@ skip_annotation: nr_options += add_script_opt(browser, &actions[nr_options], &options[nr_options], - thread, NULL); + thread, NULL, evsel); } /* * Note that browser->selection != NULL @@ -3107,11 +3109,12 @@ skip_annotation: nr_options += add_script_opt(browser, &actions[nr_options], &options[nr_options], - NULL, browser->selection->sym); + NULL, browser->selection->sym, + evsel); } } nr_options += add_script_opt(browser, &actions[nr_options], - &options[nr_options], NULL, NULL); + &options[nr_options], NULL, NULL, evsel); nr_options += add_switch_opt(browser, &actions[nr_options], &options[nr_options]); skip_scripting: diff --git a/tools/perf/ui/browsers/scripts.c b/tools/perf/ui/browsers/scripts.c index 7f36630694bf..9e5f87558af6 
100644 --- a/tools/perf/ui/browsers/scripts.c +++ b/tools/perf/ui/browsers/scripts.c @@ -17,36 +17,111 @@ */ #define SCRIPT_FULLPATH_LEN 256 +struct script_config { + const char **names; + char **paths; + int index; + const char *perf; + char extra_format[256]; +}; + +void attr_to_script(char *extra_format, struct perf_event_attr *attr) +{ + extra_format[0] = 0; + if (attr->read_format & PERF_FORMAT_GROUP) + strcat(extra_format, " -F +metric"); + if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) + strcat(extra_format, " -F +brstackinsn --xed"); + if (attr->sample_type & PERF_SAMPLE_REGS_INTR) + strcat(extra_format, " -F +iregs"); + if (attr->sample_type & PERF_SAMPLE_REGS_USER) + strcat(extra_format, " -F +uregs"); + if (attr->sample_type & PERF_SAMPLE_PHYS_ADDR) + strcat(extra_format, " -F +phys_addr"); +} + +static int add_script_option(const char *name, const char *opt, + struct script_config *c) +{ + c->names[c->index] = name; + if (asprintf(&c->paths[c->index], + "%s script %s -F +metric %s %s", + c->perf, opt, symbol_conf.inline_name ? " --inline" : "", + c->extra_format) < 0) + return -1; + c->index++; + return 0; +} + /* * When success, will copy the full path of the selected script * into the buffer pointed by script_name, and return 0. * Return -1 on failure. */ -static int list_scripts(char *script_name) +static int list_scripts(char *script_name, bool *custom, + struct perf_evsel *evsel) { - char *buf, *names[SCRIPT_MAX_NO], *paths[SCRIPT_MAX_NO]; - int i, num, choice, ret = -1; + char *buf, *paths[SCRIPT_MAX_NO], *names[SCRIPT_MAX_NO]; + int i, num, choice; + int ret = 0; + int max_std, custom_perf; + char pbuf[256]; + const char *perf = perf_exe(pbuf, sizeof pbuf); + struct script_config scriptc = { + .names = (const char **)names, + .paths = paths, + .perf = perf + }; + + script_name[0] = 0; /* Preset the script name to SCRIPT_NAMELEN */ buf = malloc(SCRIPT_MAX_NO * (SCRIPT_NAMELEN + SCRIPT_FULLPATH_LEN)); if (!buf) - return ret; + return -1; - for (i = 0; i < SCRIPT_MAX_NO; i++) { - names[i] = buf + i * (SCRIPT_NAMELEN + SCRIPT_FULLPATH_LEN); + if (evsel) + attr_to_script(scriptc.extra_format, &evsel->attr); + add_script_option("Show individual samples", "", &scriptc); + add_script_option("Show individual samples with assembler", "-F +insn --xed", + &scriptc); + add_script_option("Show individual samples with source", "-F +srcline,+srccode", + &scriptc); + custom_perf = scriptc.index; + add_script_option("Show samples with custom perf script arguments", "", &scriptc); + i = scriptc.index; + max_std = i; + + for (; i < SCRIPT_MAX_NO; i++) { + names[i] = buf + (i - max_std) * (SCRIPT_NAMELEN + SCRIPT_FULLPATH_LEN); paths[i] = names[i] + SCRIPT_NAMELEN; } - num = find_scripts(names, paths); - if (num > 0) { - choice = ui__popup_menu(num, names); - if (choice < num && choice >= 0) { - strcpy(script_name, paths[choice]); - ret = 0; - } + num = find_scripts(names + max_std, paths + max_std); + if (num < 0) + num = 0; + choice = ui__popup_menu(num + max_std, (char * const *)names); + if (choice < 0) { + ret = -1; + goto out; } + if (choice == custom_perf) { + char script_args[50]; + int key = ui_browser__input_window("perf script command", + "Enter perf script command line (without perf script prefix)", + script_args, "", 0); + if (key != K_ENTER) + return -1; + sprintf(script_name, "%s script %s", perf, script_args); + } else if (choice < num + max_std) { + strcpy(script_name, paths[choice]); + } + *custom = choice >= max_std; +out: free(buf); + for (i = 0; i < max_std; i++) + 
free(paths[i]); return ret; } @@ -66,27 +141,25 @@ static void run_script(char *cmd) SLsmg_refresh(); } -int script_browse(const char *script_opt) +int script_browse(const char *script_opt, struct perf_evsel *evsel) { - char cmd[SCRIPT_FULLPATH_LEN*2], script_name[SCRIPT_FULLPATH_LEN]; + char *cmd, script_name[SCRIPT_FULLPATH_LEN]; + bool custom = false; memset(script_name, 0, SCRIPT_FULLPATH_LEN); - if (list_scripts(script_name)) + if (list_scripts(script_name, &custom, evsel)) return -1; - sprintf(cmd, "perf script -s %s ", script_name); - - if (script_opt) - strcat(cmd, script_opt); - - if (input_name) { - strcat(cmd, " -i "); - strcat(cmd, input_name); - } - - strcat(cmd, " 2>&1 | less"); + if (asprintf(&cmd, "%s%s %s %s%s 2>&1 | less", + custom ? "perf script -s " : "", + script_name, + script_opt ? script_opt : "", + input_name ? "-i " : "", + input_name ? input_name : "") < 0) + return -1; run_script(cmd); + free(cmd); return 0; } diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 6279eca56409..2113a6639cea 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -436,6 +436,8 @@ struct annotation_options; #ifdef HAVE_SLANG_SUPPORT #include "../ui/keysyms.h" +void attr_to_script(char *buf, struct perf_event_attr *attr); + int map_symbol__tui_annotate(struct map_symbol *ms, struct perf_evsel *evsel, struct hist_browser_timer *hbt, struct annotation_options *annotation_opts); @@ -450,7 +452,8 @@ int perf_evlist__tui_browse_hists(struct perf_evlist *evlist, const char *help, struct perf_env *env, bool warn_lost_event, struct annotation_options *annotation_options); -int script_browse(const char *script_opt); + +int script_browse(const char *script_opt, struct perf_evsel *evsel); #else static inline int perf_evlist__tui_browse_hists(struct perf_evlist *evlist __maybe_unused, @@ -479,7 +482,8 @@ static inline int hist_entry__tui_annotate(struct hist_entry *he __maybe_unused, return 0; } -static inline int script_browse(const char *script_opt __maybe_unused) +static inline int script_browse(const char *script_opt __maybe_unused, + struct perf_evsel *evsel __maybe_unused) { return 0; } From 4968ac8fb7c378e2bc40b7e9bd97768fa8c7aa32 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 11 Mar 2019 07:44:58 -0700 Subject: [PATCH 394/855] perf report: Implement browsing of individual samples Now 'perf report' can show whole time periods with 'perf script', but the user still has to find individual samples of interest manually. It would be expensive and complicated to search for the right samples in the whole perf file. Typically users only need to look at a small number of samples for useful analysis. Also the full scripts tend to show samples of all CPUs and all threads mixed up, which can be very confusing on larger systems. Add a new --samples option to save a small random number of samples per hist entry. Use a reservoir sample technique to select a representatve number of samples. Then allow browsing the samples using 'perf script' as part of the hist entry context menu. This automatically adds the right filters, so only the thread or cpu of the sample is displayed. Then we use less' search functionality to directly jump the to the time stamp of the selected sample. It uses different menus for assembler and source display. Assembler needs xed installed and source needs debuginfo. Currently it only supports as many samples as fit on the screen due to some limitations in the slang ui code. 
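For readers unfamiliar with the technique, the textbook reservoir update looks roughly like the sketch below. Names are made up for illustration; the helper this patch actually adds is hists__res_sample() further down, which uses a simplified variant of the same idea.

  #include <stdlib.h>	/* rand(), good enough for a sketch */

  struct slot { unsigned long long time; int cpu, tid; };

  /*
   * Keep at most 'max' samples.  'nr_seen' counts how many samples have
   * been offered so far, including the current one, so every sample ends
   * up in the reservoir with equal probability.
   */
  static void reservoir_update(struct slot *res, int *nr, int max,
			       unsigned long long nr_seen, struct slot s)
  {
	unsigned long long j;

	if (*nr < max) {
		res[(*nr)++] = s;	/* reservoir not full yet, just append */
		return;
	}
	j = (unsigned long long)rand() % nr_seen;	/* uniform in [0, nr_seen) */
	if (j < (unsigned long long)max)
		res[j] = s;		/* replace with probability max / nr_seen */
  }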
Signed-off-by: Andi Kleen Tested-by: Arnaldo Carvalho de Melo Acked-by: Jiri Olsa Link: http://lkml.kernel.org/r/20190311174605.GA29294@tassilo.jf.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-config.txt | 6 ++ tools/perf/Documentation/perf-report.txt | 4 + tools/perf/builtin-report.c | 2 + tools/perf/ui/browsers/Build | 1 + tools/perf/ui/browsers/hists.c | 47 ++++++++++++ tools/perf/ui/browsers/res_sample.c | 94 ++++++++++++++++++++++++ tools/perf/ui/browsers/scripts.c | 2 +- tools/perf/util/hist.c | 39 ++++++++++ tools/perf/util/hist.h | 22 ++++++ tools/perf/util/sort.h | 8 ++ tools/perf/util/symbol.c | 1 + tools/perf/util/symbol_conf.h | 1 + 12 files changed, 226 insertions(+), 1 deletion(-) create mode 100644 tools/perf/ui/browsers/res_sample.c diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt index 86f3dcc15f83..2d0fb7613134 100644 --- a/tools/perf/Documentation/perf-config.txt +++ b/tools/perf/Documentation/perf-config.txt @@ -584,6 +584,12 @@ llvm.*:: llvm.opts:: Options passed to llc. +samples.*:: + + samples.context:: + Define how many ns worth of time to show + around samples in perf report sample context browser. + SEE ALSO -------- linkperf:perf[1] diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index 546d87221ad8..f441baa794ce 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -461,6 +461,10 @@ include::itrace.txt[] --socket-filter:: Only report the samples on the processor socket that match with this filter +--samples=N:: + Save N individual samples for each histogram entry to show context in perf + report tui browser. + --raw-trace:: When displaying traceevent output, do not use print fmt or plugins. 
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 05c8dd41106c..1921aaa9cece 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -1159,6 +1159,8 @@ int cmd_report(int argc, const char **argv) OPT_BOOLEAN(0, "demangle-kernel", &symbol_conf.demangle_kernel, "Enable kernel symbol demangling"), OPT_BOOLEAN(0, "mem-mode", &report.mem_mode, "mem access profile"), + OPT_INTEGER(0, "samples", &symbol_conf.res_sample, + "Number of samples to save per histogram entry for individual browsing"), OPT_CALLBACK(0, "percent-limit", &report, "percent", "Don't show entries under that percent", parse_percent_limit), OPT_CALLBACK(0, "percentage", NULL, "relative|absolute", diff --git a/tools/perf/ui/browsers/Build b/tools/perf/ui/browsers/Build index 8fee56b46502..fdf86f7981ca 100644 --- a/tools/perf/ui/browsers/Build +++ b/tools/perf/ui/browsers/Build @@ -3,6 +3,7 @@ perf-y += hists.o perf-y += map.o perf-y += scripts.o perf-y += header.o +perf-y += res_sample.o CFLAGS_annotate.o += -DENABLE_SLFUTURE_CONST CFLAGS_hists.o += -DENABLE_SLFUTURE_CONST diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index fb4430f8982c..3421ecbdd3f0 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -1226,6 +1226,8 @@ void hist_browser__init_hpp(void) hist_browser__hpp_color_overhead_guest_us; perf_hpp__format[PERF_HPP__OVERHEAD_ACC].color = hist_browser__hpp_color_overhead_acc; + + res_sample_init(); } static int hist_browser__show_entry(struct hist_browser *browser, @@ -2345,6 +2347,7 @@ struct popup_action { struct map_symbol ms; int socket; struct perf_evsel *evsel; + enum rstype rstype; int (*fn)(struct hist_browser *browser, struct popup_action *act); }; @@ -2572,6 +2575,17 @@ do_run_script(struct hist_browser *browser __maybe_unused, return 0; } +static int +do_res_sample_script(struct hist_browser *browser __maybe_unused, + struct popup_action *act) +{ + struct hist_entry *he; + + he = hist_browser__selected_entry(browser); + res_sample_browse(he->res_samples, he->num_res, act->evsel, act->rstype); + return 0; +} + static int add_script_opt_2(struct hist_browser *browser __maybe_unused, struct popup_action *act, char **optstr, @@ -2629,6 +2643,27 @@ add_script_opt(struct hist_browser *browser, return n; } +static int +add_res_sample_opt(struct hist_browser *browser __maybe_unused, + struct popup_action *act, char **optstr, + struct res_sample *res_sample, + struct perf_evsel *evsel, + enum rstype type) +{ + if (!res_sample) + return 0; + + if (asprintf(optstr, "Show context for individual samples %s", + type == A_ASM ? "with assembler" : + type == A_SOURCE ? 
"with source" : "") < 0) + return 0; + + act->fn = do_res_sample_script; + act->evsel = evsel; + act->rstype = type; + return 1; +} + static int do_switch_data(struct hist_browser *browser __maybe_unused, struct popup_action *act __maybe_unused) @@ -3115,6 +3150,18 @@ skip_annotation: } nr_options += add_script_opt(browser, &actions[nr_options], &options[nr_options], NULL, NULL, evsel); + nr_options += add_res_sample_opt(browser, &actions[nr_options], + &options[nr_options], + hist_browser__selected_entry(browser)->res_samples, + evsel, A_NORMAL); + nr_options += add_res_sample_opt(browser, &actions[nr_options], + &options[nr_options], + hist_browser__selected_entry(browser)->res_samples, + evsel, A_ASM); + nr_options += add_res_sample_opt(browser, &actions[nr_options], + &options[nr_options], + hist_browser__selected_entry(browser)->res_samples, + evsel, A_SOURCE); nr_options += add_switch_opt(browser, &actions[nr_options], &options[nr_options]); skip_scripting: diff --git a/tools/perf/ui/browsers/res_sample.c b/tools/perf/ui/browsers/res_sample.c new file mode 100644 index 000000000000..884ef2a92c15 --- /dev/null +++ b/tools/perf/ui/browsers/res_sample.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Display a menu with individual samples to browse with perf script */ +#include "util.h" +#include "hist.h" +#include "evsel.h" +#include "hists.h" +#include "sort.h" +#include "config.h" +#include "time-utils.h" +#include + +static u64 context_len = 10 * NSEC_PER_MSEC; + +static int res_sample_config(const char *var, const char *value, void *data __maybe_unused) +{ + if (!strcmp(var, "samples.context")) + return perf_config_u64(&context_len, var, value); + return 0; +} + +void res_sample_init(void) +{ + perf_config(res_sample_config, NULL); +} + +int res_sample_browse(struct res_sample *res_samples, int num_res, + struct perf_evsel *evsel, enum rstype rstype) +{ + char **names; + int i, n; + int choice; + char *cmd; + char pbuf[256], tidbuf[32], cpubuf[32]; + const char *perf = perf_exe(pbuf, sizeof pbuf); + char trange[128], tsample[64]; + struct res_sample *r; + char extra_format[256]; + + /* For now since ui__popup_menu doesn't like lists that don't fit */ + num_res = max(min(SLtt_Screen_Rows - 4, num_res), 0); + + names = calloc(num_res, sizeof(char *)); + if (!names) + return -1; + for (i = 0; i < num_res; i++) { + char tbuf[64]; + + timestamp__scnprintf_nsec(res_samples[i].time, tbuf, sizeof tbuf); + if (asprintf(&names[i], "%s: CPU %d tid %d", tbuf, + res_samples[i].cpu, res_samples[i].tid) < 0) { + while (--i >= 0) + free(names[i]); + free(names); + return -1; + } + } + choice = ui__popup_menu(num_res, names); + for (i = 0; i < num_res; i++) + free(names[i]); + free(names); + + if (choice < 0 || choice >= num_res) + return -1; + r = &res_samples[choice]; + + n = timestamp__scnprintf_nsec(r->time - context_len, trange, sizeof trange); + trange[n++] = ','; + timestamp__scnprintf_nsec(r->time + context_len, trange + n, sizeof trange - n); + + timestamp__scnprintf_nsec(r->time, tsample, sizeof tsample); + + attr_to_script(extra_format, &evsel->attr); + + if (asprintf(&cmd, "%s script %s%s --time %s %s%s %s%s --ns %s %s %s %s %s | less +/%s", + perf, + input_name ? "-i " : "", + input_name ? input_name : "", + trange, + r->cpu >= 0 ? "--cpu " : "", + r->cpu >= 0 ? (sprintf(cpubuf, "%d", r->cpu), cpubuf) : "", + r->tid ? "--tid " : "", + r->tid ? (sprintf(tidbuf, "%d", r->tid), tidbuf) : "", + extra_format, + rstype == A_ASM ? "-F +insn --xed" : + rstype == A_SOURCE ? 
"-F +srcline,+srccode" : "", + symbol_conf.inline_name ? "--inline" : "", + "--show-lost-events ", + r->tid ? "--show-switch-events --show-task-events " : "", + tsample) < 0) + return -1; + run_script(cmd); + free(cmd); + return 0; +} diff --git a/tools/perf/ui/browsers/scripts.c b/tools/perf/ui/browsers/scripts.c index 9e5f87558af6..cdba58447b85 100644 --- a/tools/perf/ui/browsers/scripts.c +++ b/tools/perf/ui/browsers/scripts.c @@ -125,7 +125,7 @@ out: return ret; } -static void run_script(char *cmd) +void run_script(char *cmd) { pr_debug("Running %s\n", cmd); SLang_reset_tty(); diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 34c0f00c68d1..1f230285d78a 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -436,6 +436,13 @@ static int hist_entry__init(struct hist_entry *he, goto err_rawdata; } + if (symbol_conf.res_sample) { + he->res_samples = calloc(sizeof(struct res_sample), + symbol_conf.res_sample); + if (!he->res_samples) + goto err_srcline; + } + INIT_LIST_HEAD(&he->pairs.node); thread__get(he->thread); he->hroot_in = RB_ROOT_CACHED; @@ -446,6 +453,9 @@ static int hist_entry__init(struct hist_entry *he, return 0; +err_srcline: + free(he->srcline); + err_rawdata: free(he->raw_data); @@ -603,6 +613,32 @@ out: return he; } +static unsigned random_max(unsigned high) +{ + unsigned thresh = -high % high; + for (;;) { + unsigned r = random(); + if (r >= thresh) + return r % high; + } +} + +static void hists__res_sample(struct hist_entry *he, struct perf_sample *sample) +{ + struct res_sample *r; + int j; + + if (he->num_res < symbol_conf.res_sample) { + j = he->num_res++; + } else { + j = random_max(symbol_conf.res_sample); + } + r = &he->res_samples[j]; + r->time = sample->time; + r->cpu = sample->cpu; + r->tid = sample->tid; +} + static struct hist_entry* __hists__add_entry(struct hists *hists, struct addr_location *al, @@ -650,6 +686,8 @@ __hists__add_entry(struct hists *hists, if (!hists->has_callchains && he && he->callchain_size != 0) hists->has_callchains = true; + if (he && symbol_conf.res_sample) + hists__res_sample(he, sample); return he; } @@ -1173,6 +1211,7 @@ void hist_entry__delete(struct hist_entry *he) mem_info__zput(he->mem_info); } + zfree(&he->res_samples); zfree(&he->stat_acc); free_srcline(he->srcline); if (he->srcfile && he->srcfile[0]) diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 2113a6639cea..76ff6c6d03b8 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -433,6 +433,13 @@ struct hist_browser_timer { }; struct annotation_options; +struct res_sample; + +enum rstype { + A_NORMAL, + A_ASM, + A_SOURCE +}; #ifdef HAVE_SLANG_SUPPORT #include "../ui/keysyms.h" @@ -454,6 +461,11 @@ int perf_evlist__tui_browse_hists(struct perf_evlist *evlist, const char *help, struct annotation_options *annotation_options); int script_browse(const char *script_opt, struct perf_evsel *evsel); + +void run_script(char *cmd); +int res_sample_browse(struct res_sample *res_samples, int num_res, + struct perf_evsel *evsel, enum rstype rstype); +void res_sample_init(void); #else static inline int perf_evlist__tui_browse_hists(struct perf_evlist *evlist __maybe_unused, @@ -488,6 +500,16 @@ static inline int script_browse(const char *script_opt __maybe_unused, return 0; } +static inline int res_sample_browse(struct res_sample *res_samples __maybe_unused, + int num_res __maybe_unused, + struct perf_evsel *evsel __maybe_unused, + enum rstype rstype __maybe_unused) +{ + return 0; +} + +static inline void res_sample_init(void) {} + 
#define K_LEFT -1000 #define K_RIGHT -2000 #define K_SWITCH_INPUT_DATA -3000 diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 19dceb7f6145..bb9442ab7a0c 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -47,6 +47,12 @@ extern struct sort_entry sort_srcline; extern enum sort_type sort__first_dimension; extern const char default_mem_sort_order[]; +struct res_sample { + u64 time; + int cpu; + int tid; +}; + struct he_stat { u64 period; u64 period_sys; @@ -140,6 +146,8 @@ struct hist_entry { struct mem_info *mem_info; void *raw_data; u32 raw_size; + int num_res; + struct res_sample *res_samples; void *trace_output; struct perf_hpp_list *hpp_list; struct hist_entry *parent_he; diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 6b73a0eeb6a1..58442ca5e3c4 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -51,6 +51,7 @@ struct symbol_conf symbol_conf = { .symfs = "", .event_group = true, .inline_name = true, + .res_sample = 0, }; static enum dso_binary_type binary_type_symtab[] = { diff --git a/tools/perf/util/symbol_conf.h b/tools/perf/util/symbol_conf.h index a5684a71b78e..6c55fa6fccec 100644 --- a/tools/perf/util/symbol_conf.h +++ b/tools/perf/util/symbol_conf.h @@ -68,6 +68,7 @@ struct symbol_conf { struct intlist *pid_list, *tid_list; const char *symfs; + int res_sample; }; extern struct symbol_conf symbol_conf; From ca52babe033f2a0a535ce7c814e54a44cead1f15 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 11 Mar 2019 07:44:59 -0700 Subject: [PATCH 395/855] perf tools: Add some new tips describing the new options Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Link: http://lkml.kernel.org/r/20190311144502.15423-9-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/tips.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/perf/Documentation/tips.txt b/tools/perf/Documentation/tips.txt index 849599f39c5e..869965d629ce 100644 --- a/tools/perf/Documentation/tips.txt +++ b/tools/perf/Documentation/tips.txt @@ -15,6 +15,7 @@ To see callchains in a more compact form: perf report -g folded Show individual samples with: perf script Limit to show entries above 5% only: perf report --percent-limit 5 Profiling branch (mis)predictions with: perf record -b / perf report +To show assembler sample contexts use perf record -b / perf script -F +brstackinsn --xed Treat branches as callchains: perf report --branch-history To count events in every 1000 msec: perf stat -I 1000 Print event counts in CSV format with: perf stat -x, @@ -34,3 +35,9 @@ Show current config key-value pairs: perf config --list Show user configuration overrides: perf config --user --list To add Node.js USDT(User-Level Statically Defined Tracing): perf buildid-cache --add `which node` To report cacheline events from previous recording: perf c2c report +To browse sample contexts use perf report --sample 10 and select in context menu +To separate samples by time use perf report --sort time,overhead,sym +To set sample time separation other than 100ms with --sort time use --time-quantum +Add -I to perf report to sample register values visible in perf report context. +To show IPC for sampling periods use perf record -e '{cycles,instructions}:S' and then browse context +To show context switches in perf report sample context add --switch-events to perf record. 
From 905e4aff31382c3f9b2014d1361f4a1be4479ba2 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 11 Mar 2019 07:45:01 -0700 Subject: [PATCH 396/855] perf script: Add array bound checking to list_scripts Don't overflow array when the scripts directory is too large, or the script file name is too long. Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Link: http://lkml.kernel.org/r/20190311144502.15423-11-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 8 ++++++-- tools/perf/builtin.h | 3 ++- tools/perf/ui/browsers/scripts.c | 3 ++- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index b695b20ffc8a..2f93d60c5a17 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -2982,7 +2982,8 @@ static int check_ev_match(char *dir_name, char *scriptname, * will list all statically runnable scripts, select one, execute it and * show the output in a perf browser. */ -int find_scripts(char **scripts_array, char **scripts_path_array) +int find_scripts(char **scripts_array, char **scripts_path_array, int num, + int pathlen) { struct dirent *script_dirent, *lang_dirent; char scripts_path[MAXPATHLEN], lang_path[MAXPATHLEN]; @@ -3027,7 +3028,10 @@ int find_scripts(char **scripts_array, char **scripts_path_array) /* Skip those real time scripts: xxxtop.p[yl] */ if (strstr(script_dirent->d_name, "top.")) continue; - sprintf(scripts_path_array[i], "%s/%s", lang_path, + if (i >= num) + break; + snprintf(scripts_path_array[i], pathlen, "%s/%s", + lang_path, script_dirent->d_name); temp = strchr(script_dirent->d_name, '.'); snprintf(scripts_array[i], diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h index 05745f3ce912..999fe9170122 100644 --- a/tools/perf/builtin.h +++ b/tools/perf/builtin.h @@ -40,5 +40,6 @@ int cmd_mem(int argc, const char **argv); int cmd_data(int argc, const char **argv); int cmd_ftrace(int argc, const char **argv); -int find_scripts(char **scripts_array, char **scripts_path_array); +int find_scripts(char **scripts_array, char **scripts_path_array, int num, + int pathlen); #endif diff --git a/tools/perf/ui/browsers/scripts.c b/tools/perf/ui/browsers/scripts.c index cdba58447b85..96e5cd3b0eee 100644 --- a/tools/perf/ui/browsers/scripts.c +++ b/tools/perf/ui/browsers/scripts.c @@ -97,7 +97,8 @@ static int list_scripts(char *script_name, bool *custom, paths[i] = names[i] + SCRIPT_NAMELEN; } - num = find_scripts(names + max_std, paths + max_std); + num = find_scripts(names + max_std, paths + max_std, SCRIPT_MAX_NO - max_std, + SCRIPT_FULLPATH_LEN); if (num < 0) num = 0; choice = ui__popup_menu(num + max_std, (char * const *)names); From 59c24980dffbea2106fe65e64ea77834d657ee9c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 11 Mar 2019 07:45:02 -0700 Subject: [PATCH 397/855] perf ui browser: Fix ui popup argv browser for many entries Fix the argv ui browser code to correctly display more entries than fit on the screen without crashing. The problem was some type confusion with pointer types in the ->seek function. Do the argv arithmetic correctly with char ** pointers. Also add some asserts to find overruns and limit the display function correctly. Then finally remove a workaround for this in the res sample browser. 
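A stand-alone illustration of the type confusion (not code from the patch): browser->top is declared void *, and with GCC's extension arithmetic on a void * advances one byte at a time, whereas arithmetic on a char ** advances one entry at a time, so seeking on the untyped pointer walked the argv array with the wrong stride:

  #include <stdio.h>

  int main(void)
  {
	char *entries[4] = { "a", "b", "c", "d" };
	void *top = entries;

	/* one whole entry forward, i.e. sizeof(char *) bytes */
	printf("%p\n", (void *)((char **)top + 1));
	/* only one byte forward - the stride the untyped arithmetic used */
	printf("%p\n", (void *)((char *)top + 1));
	return 0;
  }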
Committer testing: 1) Resize the x terminal to have just some 5 lines 2) Use 'perf report --samples 1' to activate the sample browser options in the menu 3) Press ENTER, this will cause the crash: # perf report --samples 1 perf: Segmentation fault -------- backtrace -------- perf[0x5a514a] /lib64/libc.so.6(+0x385bf)[0x7f27281b55bf] /lib64/libc.so.6(+0x161a67)[0x7f27282dea67] /lib64/libslang.so.2(SLsmg_write_wrapped_string+0x82)[0x7f272874a0b2] perf(ui_browser__argv_refresh+0x77)[0x5939a7] perf[0x5924cc] perf(ui_browser__run+0x39)[0x593449] perf(ui__popup_menu+0x83)[0x5a5263] perf[0x59f421] perf(perf_evlist__tui_browse_hists+0x3a0)[0x5a3780] perf(cmd_report+0x2746)[0x447136] perf[0x4a95fe] perf(main+0x61c)[0x42dc6c] /lib64/libc.so.6(__libc_start_main+0xf2)[0x7f27281a1412] perf(_start+0x2d)[0x42de9d] # After applying this patch no crash takes place in such situation. Signed-off-by: Andi Kleen Tested-by: Arnaldo Carvalho de Melo Acked-by: Jiri Olsa Link: http://lkml.kernel.org/r/20190311144502.15423-12-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browser.c | 10 +++++++--- tools/perf/ui/browsers/res_sample.c | 3 --- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/tools/perf/ui/browser.c b/tools/perf/ui/browser.c index 4f75561424ed..4ad37d8c7d6a 100644 --- a/tools/perf/ui/browser.c +++ b/tools/perf/ui/browser.c @@ -611,14 +611,16 @@ void ui_browser__argv_seek(struct ui_browser *browser, off_t offset, int whence) browser->top = browser->entries; break; case SEEK_CUR: - browser->top = browser->top + browser->top_idx + offset; + browser->top = (char **)browser->top + offset; break; case SEEK_END: - browser->top = browser->top + browser->nr_entries - 1 + offset; + browser->top = (char **)browser->entries + browser->nr_entries - 1 + offset; break; default: return; } + assert((char **)browser->top < (char **)browser->entries + browser->nr_entries); + assert((char **)browser->top >= (char **)browser->entries); } unsigned int ui_browser__argv_refresh(struct ui_browser *browser) @@ -630,7 +632,9 @@ unsigned int ui_browser__argv_refresh(struct ui_browser *browser) browser->top = browser->entries; pos = (char **)browser->top; - while (idx < browser->nr_entries) { + while (idx < browser->nr_entries && + row < (unsigned)SLtt_Screen_Rows - 1) { + assert(pos < (char **)browser->entries + browser->nr_entries); if (!browser->filter || !browser->filter(browser, *pos)) { ui_browser__gotorc(browser, row, 0); browser->write(browser, pos, row); diff --git a/tools/perf/ui/browsers/res_sample.c b/tools/perf/ui/browsers/res_sample.c index 884ef2a92c15..c0dd73176d42 100644 --- a/tools/perf/ui/browsers/res_sample.c +++ b/tools/perf/ui/browsers/res_sample.c @@ -36,9 +36,6 @@ int res_sample_browse(struct res_sample *res_samples, int num_res, struct res_sample *r; char extra_format[256]; - /* For now since ui__popup_menu doesn't like lists that don't fit */ - num_res = max(min(SLtt_Screen_Rows - 4, num_res), 0); - names = calloc(num_res, sizeof(char *)); if (!names) return -1; From e3b74de50a5f8bbfacbd772874c8b5d9220ebcdb Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 11 Mar 2019 07:45:00 -0700 Subject: [PATCH 398/855] perf tools report: Add custom scripts to script menu Add a way to define custom scripts through ~/.perfconfig, which are then added to the scripts menu. The scripts get the same arguments as 'perf script', in particular -i, --cpu, --tid. 
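For example (the script name and command line below are made up; the format follows the perf-config.txt hunk in this patch), a user could add to ~/.perfconfig:

  [scripts]
	mycustomview = perf script -F comm,pid,time,event,ip,sym

and the new entry then shows up in the interactive scripts menu, invoked with the same -i/--cpu/--tid style options a full perf script run would receive.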
Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Link: http://lkml.kernel.org/r/20190311144502.15423-10-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-config.txt | 8 ++++++++ tools/perf/ui/browsers/scripts.c | 20 ++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt index 2d0fb7613134..95054a8176a2 100644 --- a/tools/perf/Documentation/perf-config.txt +++ b/tools/perf/Documentation/perf-config.txt @@ -590,6 +590,14 @@ samples.*:: Define how many ns worth of time to show around samples in perf report sample context browser. +scripts.*:: + + Any option defines a script that is added to the scripts menu + in the interactive perf browser and whose output is displayed. + The name of the option is the name, the value is a script command line. + The script gets the same options passed as a full perf script, + in particular -i perfdata file, --cpu, --tid + SEE ALSO -------- linkperf:perf[1] diff --git a/tools/perf/ui/browsers/scripts.c b/tools/perf/ui/browsers/scripts.c index 96e5cd3b0eee..27cf3ab88d13 100644 --- a/tools/perf/ui/browsers/scripts.c +++ b/tools/perf/ui/browsers/scripts.c @@ -6,6 +6,7 @@ #include "../../util/symbol.h" #include "../browser.h" #include "../libslang.h" +#include "config.h" #define SCRIPT_NAMELEN 128 #define SCRIPT_MAX_NO 64 @@ -53,6 +54,24 @@ static int add_script_option(const char *name, const char *opt, return 0; } +static int scripts_config(const char *var, const char *value, void *data) +{ + struct script_config *c = data; + + if (!strstarts(var, "scripts.")) + return -1; + if (c->index >= SCRIPT_MAX_NO) + return -1; + c->names[c->index] = strdup(var + 7); + if (!c->names[c->index]) + return -1; + if (asprintf(&c->paths[c->index], "%s %s", value, + c->extra_format) < 0) + return -1; + c->index++; + return 0; +} + /* * When success, will copy the full path of the selected script * into the buffer pointed by script_name, and return 0. @@ -87,6 +106,7 @@ static int list_scripts(char *script_name, bool *custom, &scriptc); add_script_option("Show individual samples with source", "-F +srcline,+srccode", &scriptc); + perf_config(scripts_config, &scriptc); custom_perf = scriptc.index; add_script_option("Show samples with custom perf script arguments", "", &scriptc); i = scriptc.index; From dfcbc2f2994b8a3af3605a26dc29c07ad7378bf4 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 11 Mar 2019 17:07:52 -0300 Subject: [PATCH 399/855] tools lib bpf: Fix the build by adding a missing stdarg.h include The libbpf_print_fn_t typedef uses va_list without including the header where that type is defined, stdarg.h, breaking in places where we're unlucky for that type not to be already defined by some previously included header. 
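A minimal reproduction of the failure mode, for illustration only (file names are made up):

  /* printer.h - uses va_list but includes nothing itself */
  typedef void (*print_fn_t)(const char *fmt, va_list ap);

  /* user.c - "unlucky": nothing defined va_list before this include */
  #include "printer.h"	/* error: unknown type name 'va_list' */

Having the header include <stdarg.h> itself, as done below, keeps it self-contained regardless of what the including file pulled in first.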
Noticed while building on fedora 24 cross building tools/perf to the ARC architecture using the uClibc C library: 28 fedora:24-x-ARC-uClibc : FAIL arc-linux-gcc (ARCompact ISA Linux uClibc toolchain 2017.09-rc2) 7.1.1 20170710 CC /tmp/build/perf/tests/llvm.o In file included from tests/llvm.c:3:0: /git/linux/tools/lib/bpf/libbpf.h:57:20: error: unknown type name 'va_list' const char *, va_list ap); ^~~~~~~ /git/linux/tools/lib/bpf/libbpf.h:59:34: error: unknown type name 'libbpf_print_fn_t' LIBBPF_API void libbpf_set_print(libbpf_print_fn_t fn); ^~~~~~~~~~~~~~~~~ mv: cannot stat '/tmp/build/perf/tests/.llvm.o.tmp': No such file or directory Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Jakub Kicinski Cc: Jiri Olsa Cc: Namhyung Kim Cc: Quentin Monnet Cc: Stanislav Fomichev Cc: Yonghong Song Fixes: a8a1f7d09cfc ("libbpf: fix libbpf_print") Link: https://lkml.kernel.org/n/tip-5270n2quu2gqz22o7itfdx00@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/bpf/libbpf.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index b4652aa1a58a..aa1521a51687 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -10,6 +10,7 @@ #ifndef __LIBBPF_LIBBPF_H #define __LIBBPF_LIBBPF_H +#include #include #include #include From f7b1844bacecca96dd8d813675e4d8adec02cd66 Mon Sep 17 00:00:00 2001 From: shaoyunl Date: Mon, 11 Mar 2019 11:47:52 -0400 Subject: [PATCH 400/855] drm/amdgpu: Update gc golden setting for vega family GC owner suggested the setting should be applied which is missed by HW default Signed-off-by: shaoyunl Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 5533f6e4f4a4..d0309e8c9d12 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -220,6 +220,7 @@ static const struct soc15_reg_golden golden_settings_gc_9_1_rv2[] = static const struct soc15_reg_golden golden_settings_gc_9_x_common[] = { + SOC15_REG_GOLDEN_VALUE(GC, 0, mmCP_SD_CNTL, 0xffffffff, 0x000001ff), SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_CAM_INDEX, 0xffffffff, 0x00000000), SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_CAM_DATA, 0xffffffff, 0x2544c382) }; From cdb8faa00e3fcdd0ad10add743516d616dc7d38e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20=C5=A0tetiar?= Date: Mon, 11 Mar 2019 22:08:22 +0100 Subject: [PATCH 401/855] mips: bcm47xx: Enable USB power on Netgear WNDR3400v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Eric has reported on OpenWrt's bug tracking system[1], that he's not able to use USB devices on his WNDR3400v2 device after the boot, until he turns on GPIO #21 manually through sysfs. 1. 
https://bugs.openwrt.org/index.php?do=details&task_id=2170 Cc: Rafał Miłecki Cc: Hauke Mehrtens Reported-by: Eric Bohlman Tested-by: Eric Bohlman Signed-off-by: Petr Štetiar Signed-off-by: Paul Burton --- arch/mips/bcm47xx/workarounds.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/mips/bcm47xx/workarounds.c b/arch/mips/bcm47xx/workarounds.c index 46eddbec8d9f..0ab95dd431b3 100644 --- a/arch/mips/bcm47xx/workarounds.c +++ b/arch/mips/bcm47xx/workarounds.c @@ -24,6 +24,7 @@ void __init bcm47xx_workarounds(void) case BCM47XX_BOARD_NETGEAR_WNR3500L: bcm47xx_workarounds_enable_usb_power(12); break; + case BCM47XX_BOARD_NETGEAR_WNDR3400V2: case BCM47XX_BOARD_NETGEAR_WNDR3400_V3: bcm47xx_workarounds_enable_usb_power(21); break; From 19d6907521b04206676741b26e05a1524662f9d2 Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Mon, 4 Mar 2019 13:55:51 +0530 Subject: [PATCH 402/855] powerpc/mm: Disable kcov for SLB routines The kcov instrumentation inside SLB routines causes duplicate SLB entries to be added resulting into SLB multihit machine checks. Disable kcov instrumentation on slb.o Signed-off-by: Mahesh Salgaonkar Acked-by: Andrew Donnellan Signed-off-by: Michael Ellerman --- arch/powerpc/mm/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index d52ec118e09d..3c1bd9fa23cd 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -52,3 +52,6 @@ obj-$(CONFIG_PPC_MEM_KEYS) += pkeys.o # This is necessary for booting with kcov enabled on book3e machines KCOV_INSTRUMENT_tlb_nohash.o := n KCOV_INSTRUMENT_fsl_booke_mmu.o := n + +# Instrumenting the SLB fault path can lead to duplicate SLB entries +KCOV_INSTRUMENT_slb.o := n From 9a18b5a412baf23137c8fddb4ea7f0c14087f31c Mon Sep 17 00:00:00 2001 From: "Enrico Weigelt, metux IT consult" Date: Mon, 11 Mar 2019 14:57:59 +0100 Subject: [PATCH 403/855] arch: arc: Kconfig: pedantic formatting Formatting of Kconfig files doesn't look so pretty, so let the Great White Handkerchief come around and clean it up. Signed-off-by: Enrico Weigelt, metux IT consult Signed-off-by: Vineet Gupta --- arch/arc/Kconfig | 12 ++++++------ arch/arc/plat-eznps/Kconfig | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 95fb11a85566..68401536e718 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -144,11 +144,11 @@ config ARC_CPU_770 Support for ARC770 core introduced with Rel 4.10 (Summer 2011) This core has a bunch of cool new features: -MMU-v3: Variable Page Sz (4k, 8k, 16k), bigger J-TLB (128x4) - Shared Address Spaces (for sharing TLB entries in MMU) + Shared Address Spaces (for sharing TLB entries in MMU) -Caches: New Prog Model, Region Flush -Insns: endian swap, load-locked/store-conditional, time-stamp-ctr -endif #ISA_ARCOMPACT +endif #ISA_ARCOMPACT config ARC_CPU_HS bool "ARC-HS" @@ -198,7 +198,7 @@ config ARC_SMP_HALT_ON_RESET at designated entry point. For other case, all jump to common entry point and spin wait for Master's signal. -endif #SMP +endif #SMP config ARC_MCIP bool "ARConnect Multicore IP (MCIP) Support " @@ -249,7 +249,7 @@ config ARC_CACHE_VIPT_ALIASING bool "Support VIPT Aliasing D$" depends on ARC_HAS_DCACHE && ISA_ARCOMPACT -endif #ARC_CACHE +endif #ARC_CACHE config ARC_HAS_ICCM bool "Use ICCM" @@ -370,7 +370,7 @@ config ARC_FPU_SAVE_RESTORE based on actual usage of FPU by a task. Thus our implemn does this for all tasks in system. 
-endif #ISA_ARCOMPACT +endif #ISA_ARCOMPACT config ARC_CANT_LLSC def_bool n @@ -423,7 +423,7 @@ config ARC_IRQ_NO_AUTOSAVE This is programmable and can be optionally disabled in which case software INTERRUPT_PROLOGUE/EPILGUE do the needed work -endif # ISA_ARCV2 +endif # ISA_ARCV2 endmenu # "ARC CPU Configuration" diff --git a/arch/arc/plat-eznps/Kconfig b/arch/arc/plat-eznps/Kconfig index 8eff057efcae..2eaecfb063a7 100644 --- a/arch/arc/plat-eznps/Kconfig +++ b/arch/arc/plat-eznps/Kconfig @@ -26,8 +26,8 @@ config EZNPS_MTM_EXT help Here we add new hierarchy for CPUs topology. We got: - Core - Thread + Core + Thread At the new thread level each CPU represent one HW thread. At highest hierarchy each core contain 16 threads, any of them seem like CPU from Linux point of view. @@ -35,10 +35,10 @@ config EZNPS_MTM_EXT core and HW scheduler round robin between them. config EZNPS_MEM_ERROR_ALIGN - bool "ARC-EZchip Memory error as an exception" - depends on EZNPS_MTM_EXT - default n - help + bool "ARC-EZchip Memory error as an exception" + depends on EZNPS_MTM_EXT + default n + help On the real chip of the NPS, user memory errors are handled as a machine check exception, which is fatal, whereas on simulator platform for NPS, is handled as a Level 2 interrupt From bc1a7f75c85e226e82f183d30d75c357f92b6029 Mon Sep 17 00:00:00 2001 From: Hsin-Yi Wang Date: Fri, 15 Feb 2019 17:02:02 +0800 Subject: [PATCH 404/855] i2c: mediatek: modify threshold passed to i2c_get_dma_safe_msg_buf() DMA with zero-length transfers doesn't make sense and this HW doesn't support them at all, so increase the threshold. Fixes: fc66b39fe36a ("i2c: mediatek: Use DMA safe buffers for i2c transactions") Signed-off-by: Hsin-Yi Wang [wsa: reworded commit message] Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-mt65xx.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/i2c/busses/i2c-mt65xx.c b/drivers/i2c/busses/i2c-mt65xx.c index 660de1ee68ed..684d651612b3 100644 --- a/drivers/i2c/busses/i2c-mt65xx.c +++ b/drivers/i2c/busses/i2c-mt65xx.c @@ -503,7 +503,7 @@ static int mtk_i2c_do_transfer(struct mtk_i2c *i2c, struct i2c_msg *msgs, writel(I2C_DMA_INT_FLAG_NONE, i2c->pdmabase + OFFSET_INT_FLAG); writel(I2C_DMA_CON_RX, i2c->pdmabase + OFFSET_CON); - dma_rd_buf = i2c_get_dma_safe_msg_buf(msgs, 0); + dma_rd_buf = i2c_get_dma_safe_msg_buf(msgs, 1); if (!dma_rd_buf) return -ENOMEM; @@ -526,7 +526,7 @@ static int mtk_i2c_do_transfer(struct mtk_i2c *i2c, struct i2c_msg *msgs, writel(I2C_DMA_INT_FLAG_NONE, i2c->pdmabase + OFFSET_INT_FLAG); writel(I2C_DMA_CON_TX, i2c->pdmabase + OFFSET_CON); - dma_wr_buf = i2c_get_dma_safe_msg_buf(msgs, 0); + dma_wr_buf = i2c_get_dma_safe_msg_buf(msgs, 1); if (!dma_wr_buf) return -ENOMEM; @@ -549,7 +549,7 @@ static int mtk_i2c_do_transfer(struct mtk_i2c *i2c, struct i2c_msg *msgs, writel(I2C_DMA_CLR_FLAG, i2c->pdmabase + OFFSET_INT_FLAG); writel(I2C_DMA_CLR_FLAG, i2c->pdmabase + OFFSET_CON); - dma_wr_buf = i2c_get_dma_safe_msg_buf(msgs, 0); + dma_wr_buf = i2c_get_dma_safe_msg_buf(msgs, 1); if (!dma_wr_buf) return -ENOMEM; @@ -561,7 +561,7 @@ static int mtk_i2c_do_transfer(struct mtk_i2c *i2c, struct i2c_msg *msgs, return -ENOMEM; } - dma_rd_buf = i2c_get_dma_safe_msg_buf((msgs + 1), 0); + dma_rd_buf = i2c_get_dma_safe_msg_buf((msgs + 1), 1); if (!dma_rd_buf) { dma_unmap_single(i2c->dev, wpaddr, msgs->len, DMA_TO_DEVICE); From 60f7691c624b41a05bfc3493d9b0519e7951b7ef Mon Sep 17 00:00:00 2001 From: Louis Taylor Date: Sat, 2 Mar 2019 14:18:36 +0000 Subject: [PATCH 405/855] 
i2c: sis630: correct format strings When compiling with -Wformat, clang warns: drivers/i2c/busses/i2c-sis630.c:482:4: warning: format specifies type 'unsigned short' but the argument has type 'int' [-Wformat] smbus_base + SMB_STS, ^~~~~~~~~~~~~~~~~~~~ drivers/i2c/busses/i2c-sis630.c:483:4: warning: format specifies type 'unsigned short' but the argument has type 'int' [-Wformat] smbus_base + SMB_STS + SIS630_SMB_IOREGION - 1); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/i2c/busses/i2c-sis630.c:531:37: warning: format specifies type 'unsigned short' but the argument has type 'int' [-Wformat] "SMBus SIS630 adapter at %04hx", smbus_base + SMB_STS); ~~~~~ ^~~~~~~~~~~~~~~~~~~~ This patch fixes the format strings to use the format type for int. Link: https://github.com/ClangBuiltLinux/linux/issues/378 Signed-off-by: Louis Taylor Reviewed-by: Nick Desaulniers Reviewed-by: Jean Delvare Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-sis630.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-sis630.c b/drivers/i2c/busses/i2c-sis630.c index 1e6805b5cef2..a57aa4fe51a4 100644 --- a/drivers/i2c/busses/i2c-sis630.c +++ b/drivers/i2c/busses/i2c-sis630.c @@ -478,7 +478,7 @@ static int sis630_setup(struct pci_dev *sis630_dev) if (!request_region(smbus_base + SMB_STS, SIS630_SMB_IOREGION, sis630_driver.name)) { dev_err(&sis630_dev->dev, - "I/O Region 0x%04hx-0x%04hx for SMBus already in use.\n", + "I/O Region 0x%04x-0x%04x for SMBus already in use.\n", smbus_base + SMB_STS, smbus_base + SMB_STS + SIS630_SMB_IOREGION - 1); retval = -EBUSY; @@ -528,7 +528,7 @@ static int sis630_probe(struct pci_dev *dev, const struct pci_device_id *id) sis630_adapter.dev.parent = &dev->dev; snprintf(sis630_adapter.name, sizeof(sis630_adapter.name), - "SMBus SIS630 adapter at %04hx", smbus_base + SMB_STS); + "SMBus SIS630 adapter at %04x", smbus_base + SMB_STS); return i2c_add_adapter(&sis630_adapter); } From a35ba2f74df5481cb4db1e9d582c708efeb9880d Mon Sep 17 00:00:00 2001 From: Hiromitsu Yamasaki Date: Sun, 3 Mar 2019 16:03:13 +0100 Subject: [PATCH 406/855] i2c: rcar: fix concurrency issue related to ICDMAER This patch fixes the problem that an interrupt may set up a new I2C message and the DMA callback overwrites this setup. By disabling the DMA Enable Register(ICDMAER), rcar_i2c_dma_unmap() enables interrupts for register settings (such as Master Control Register(ICMCR)) and advances the I2C transfer sequence. If an interrupt occurs immediately after ICDMAER is disabled, the callback handler later continues and overwrites the previous settings from the interrupt. So, disable ICDMAER at the end of the callback to ensure other interrupts are masked until then. Note that this driver needs to work lock-free because there are IP cores with a HW race condition which prevent us from using a spinlock in the interrupt handler. Reproduction test: 1. Add a delay after disabling ICDMAER. (It is expected to generate an interrupt of rcar_i2c_irq()) void rcar_i2c_dma_unmap(struct rcar_i2c_priv *priv) { ... rcar_i2c_write(priv, ICDMAER, 0); usleep_range(500, 800) ... priv->dma_direction = DMA_NONE; } 2. Execute DMA transfers $ i2ctransfer -y 4 w9@0x6a 1 1+ r16 3. A log message of BUG_ON() will be displayed. 
Fixes: 73e8b0528346 ("i2c: rcar: add DMA support") Signed-off-by: Hiromitsu Yamasaki Signed-off-by: Wolfram Sang [wsa: updated test case to be more reliable, added note to comment] Reviewed-by: Simon Horman Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-rcar.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/i2c/busses/i2c-rcar.c b/drivers/i2c/busses/i2c-rcar.c index dd52a068b140..ce59e14e34b0 100644 --- a/drivers/i2c/busses/i2c-rcar.c +++ b/drivers/i2c/busses/i2c-rcar.c @@ -363,9 +363,6 @@ static void rcar_i2c_dma_unmap(struct rcar_i2c_priv *priv) struct dma_chan *chan = priv->dma_direction == DMA_FROM_DEVICE ? priv->dma_rx : priv->dma_tx; - /* Disable DMA Master Received/Transmitted */ - rcar_i2c_write(priv, ICDMAER, 0); - dma_unmap_single(chan->device->dev, sg_dma_address(&priv->sg), sg_dma_len(&priv->sg), priv->dma_direction); @@ -375,6 +372,9 @@ static void rcar_i2c_dma_unmap(struct rcar_i2c_priv *priv) priv->flags |= ID_P_NO_RXDMA; priv->dma_direction = DMA_NONE; + + /* Disable DMA Master Received/Transmitted, must be last! */ + rcar_i2c_write(priv, ICDMAER, 0); } static void rcar_i2c_cleanup_dma(struct rcar_i2c_priv *priv) From 7ce98a5591d292c7630e2c6804ba0818a75a0f86 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sun, 3 Mar 2019 16:03:14 +0100 Subject: [PATCH 407/855] i2c: rcar: explain the lockless design To make sure people can understand the lockless design of this driver without the need to dive into git history, add a comment giving an overview of the situation. Signed-off-by: Wolfram Sang Reviewed-by: Simon Horman Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-rcar.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/i2c/busses/i2c-rcar.c b/drivers/i2c/busses/i2c-rcar.c index ce59e14e34b0..a7578f6da979 100644 --- a/drivers/i2c/busses/i2c-rcar.c +++ b/drivers/i2c/busses/i2c-rcar.c @@ -611,6 +611,15 @@ static bool rcar_i2c_slave_irq(struct rcar_i2c_priv *priv) return true; } +/* + * This driver has a lock-free design because there are IP cores (at least + * R-Car Gen2) which have an inherent race condition in their hardware design. + * There, we need to clear RCAR_BUS_MASK_DATA bits as soon as possible after + * the interrupt was generated, otherwise an unwanted repeated message gets + * generated. It turned out that taking a spinlock at the beginning of the ISR + * was already causing repeated messages. Thus, this driver was converted to + * the now lockless behaviour. Please keep this in mind when hacking the driver. + */ static irqreturn_t rcar_i2c_irq(int irq, void *ptr) { struct rcar_i2c_priv *priv = ptr; From c86da50cfd840edf223a242580913692acddbcf6 Mon Sep 17 00:00:00 2001 From: Nicolas Le Bayon Date: Wed, 6 Mar 2019 15:12:16 +0000 Subject: [PATCH 408/855] i2c: i2c-stm32f7: Fix SDADEL minimum formula It conforms with Reference Manual I2C timing section. 
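Concretely, as read from the hunk below, the corrected lower bound becomes

  SDADEL(min) = tHD;DAT(min) + tfall - tAF(min) - (DNF + 3) x tI2CCLK

i.e. the minimum data hold time is now added to the fall time instead of being subtracted from it.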
Fixes: aeb068c57214 ("i2c: i2c-stm32f7: add driver") Signed-off-by: Nicolas Le Bayon Signed-off-by: Bich Hemon Reviewed-by: Pierre-Yves MORDRET Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-stm32f7.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-stm32f7.c b/drivers/i2c/busses/i2c-stm32f7.c index 13e1213561d4..4284fc991cfd 100644 --- a/drivers/i2c/busses/i2c-stm32f7.c +++ b/drivers/i2c/busses/i2c-stm32f7.c @@ -432,7 +432,7 @@ static int stm32f7_i2c_compute_timing(struct stm32f7_i2c_dev *i2c_dev, STM32F7_I2C_ANALOG_FILTER_DELAY_MAX : 0); dnf_delay = setup->dnf * i2cclk; - sdadel_min = setup->fall_time - i2c_specs[setup->speed].hddat_min - + sdadel_min = i2c_specs[setup->speed].hddat_min + setup->fall_time - af_delay_min - (setup->dnf + 3) * i2cclk; sdadel_max = i2c_specs[setup->speed].vddat_max - setup->rise_time - From 0841625201b649c0b1bf0f0e25cf4401e68fb8fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valdis=20Kl=C4=93tnieks?= Date: Tue, 12 Mar 2019 04:52:58 -0400 Subject: [PATCH 409/855] tracing/probes: Make reserved_field_names static sparse complains: CHECK kernel/trace/trace_probe.c kernel/trace/trace_probe.c:16:12: warning: symbol 'reserved_field_names' was not declared. Should it be static? Yes, it should be static. Link: http://lkml.kernel.org/r/2478.1552380778@turing-police Signed-off-by: Valdis Kletnieks Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 89da34b326e3..cfcf77e6fb19 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -13,7 +13,7 @@ #include "trace_probe.h" -const char *reserved_field_names[] = { +static const char *reserved_field_names[] = { "common_type", "common_flags", "common_preempt_count", From cede666e2eb28dc8a680d1622fded533769f07a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valdis=20Kl=C4=93tnieks?= Date: Tue, 12 Mar 2019 04:58:32 -0400 Subject: [PATCH 410/855] trace/probes: Remove kernel doc style from non kernel doc comment CC kernel/trace/trace_kprobe.o kernel/trace/trace_kprobe.c:41: warning: cannot understand function prototype: 'struct trace_kprobe ' The real problem is that a comment looked like kerneldoc when it shouldn't be... Link: http://lkml.kernel.org/r/2812.1552381112@turing-police Signed-off-by: Valdis Kletnieks Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d5fb09ebba8b..ceafa0a2b1d1 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -35,7 +35,7 @@ static struct dyn_event_operations trace_kprobe_ops = { .match = trace_kprobe_match, }; -/** +/* * Kprobe event core functions */ struct trace_kprobe { From 6ef50fe9afae63d11220f3f66b5f4c75d09c8bf0 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 10 Mar 2019 11:46:28 -0700 Subject: [PATCH 411/855] xfs: clean up xfs_dir2_leaf_addname Remove typedefs and consolidate local variable initialization. Signed-off-by: Darrick J. 
Wong Reviewed-by: Allison Henderson Reviewed-by: Bill O'Donnell --- fs/xfs/libxfs/xfs_dir2_leaf.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index 2abf945e5844..9c2a0a13ed61 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -563,43 +563,40 @@ xfs_dir3_leaf_find_entry( */ int /* error */ xfs_dir2_leaf_addname( - xfs_da_args_t *args) /* operation arguments */ + struct xfs_da_args *args) /* operation arguments */ { + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_trans *tp = args->trans; __be16 *bestsp; /* freespace table in leaf */ - int compact; /* need to compact leaves */ - xfs_dir2_data_hdr_t *hdr; /* data block header */ + __be16 *tagp; /* end of data entry */ struct xfs_buf *dbp; /* data block buffer */ - xfs_dir2_data_entry_t *dep; /* data block entry */ - xfs_inode_t *dp; /* incore directory inode */ - xfs_dir2_data_unused_t *dup; /* data unused entry */ + struct xfs_buf *lbp; /* leaf's buffer */ + struct xfs_dir2_leaf *leaf; /* leaf structure */ + struct xfs_inode *dp = args->dp; /* incore directory inode */ + struct xfs_dir2_data_hdr *hdr; /* data block header */ + struct xfs_dir2_data_entry *dep; /* data block entry */ + struct xfs_dir2_leaf_entry *lep; /* leaf entry table pointer */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir2_data_unused *dup; /* data unused entry */ + struct xfs_dir2_leaf_tail *ltp; /* leaf tail pointer */ + struct xfs_dir2_data_free *bf; /* bestfree table */ + int compact; /* need to compact leaves */ int error; /* error return value */ int grown; /* allocated new data block */ int highstale = 0; /* index of next stale leaf */ int i; /* temporary, index */ int index; /* leaf table position */ - struct xfs_buf *lbp; /* leaf's buffer */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ int length; /* length of new entry */ - xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */ int lfloglow; /* low leaf logging index */ int lfloghigh; /* high leaf logging index */ int lowstale = 0; /* index of prev stale leaf */ - xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */ int needbytes; /* leaf block bytes needed */ int needlog; /* need to log data header */ int needscan; /* need to rescan data free */ - __be16 *tagp; /* end of data entry */ - xfs_trans_t *tp; /* transaction pointer */ xfs_dir2_db_t use_block; /* data block number */ - struct xfs_dir2_data_free *bf; /* bestfree table */ - struct xfs_dir2_leaf_entry *ents; - struct xfs_dir3_icleaf_hdr leafhdr; trace_xfs_dir2_leaf_addname(args); - dp = args->dp; - tp = args->trans; - error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp); if (error) return error; From a596d08677320925b69e70c0fdc4c0f59384a65e Mon Sep 17 00:00:00 2001 From: Mariusz Dabrowski Date: Mon, 18 Feb 2019 15:04:09 +0100 Subject: [PATCH 412/855] raid5: set write hint for PPL When the Partial Parity Log is enabled, circular buffer is used to store PPL data. Each write to RAID device causes overwrite of data in this buffer so some write_hint can be set to those request to help drives handle garbage collection. This patch adds new sysfs attribute which can be used to specify which write_hint should be assigned to PPL. 
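As a usage sketch (the array name and stream ID below are only examples and are not part of this patch), the attribute can be written like any other md sysfs file; a minimal user-space snippet:

        #include <fcntl.h>
        #include <unistd.h>

        int main(void)
        {
                /* assumed array name; the file sits in the array's md directory */
                int fd = open("/sys/block/md0/md/ppl_write_hint", O_WRONLY);

                if (fd < 0)
                        return 1;
                /* "3" is an arbitrary example stream ID */
                if (write(fd, "3", 1) != 1) {
                        close(fd);
                        return 1;
                }
                close(fd);
                return 0;
        }

The stored value is then copied into bio->bi_write_hint for every PPL bio, as the raid5-ppl.c hunks below show.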
Acked-by: Guoqing Jiang Signed-off-by: Mariusz Dabrowski Signed-off-by: Song Liu --- Documentation/admin-guide/md.rst | 3 ++ drivers/md/raid5-log.h | 1 + drivers/md/raid5-ppl.c | 63 ++++++++++++++++++++++++++++++++ drivers/md/raid5.c | 1 + 4 files changed, 68 insertions(+) diff --git a/Documentation/admin-guide/md.rst b/Documentation/admin-guide/md.rst index 84de718f24a4..3c51084ffd37 100644 --- a/Documentation/admin-guide/md.rst +++ b/Documentation/admin-guide/md.rst @@ -756,3 +756,6 @@ These currently include: The cache mode for raid5. raid5 could include an extra disk for caching. The mode can be "write-throuth" and "write-back". The default is "write-through". + + ppl_write_hint + NVMe stream ID to be set for each PPL write request. diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h index bfb811407061..43c714a8798c 100644 --- a/drivers/md/raid5-log.h +++ b/drivers/md/raid5-log.h @@ -45,6 +45,7 @@ extern void ppl_stripe_write_finished(struct stripe_head *sh); extern int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add); extern void ppl_quiesce(struct r5conf *conf, int quiesce); extern int ppl_handle_flush_request(struct r5l_log *log, struct bio *bio); +extern struct md_sysfs_entry ppl_write_hint; static inline bool raid5_has_log(struct r5conf *conf) { diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 3a7c36326589..f2b3020c2ac8 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -21,6 +21,7 @@ #include #include "md.h" #include "raid5.h" +#include "raid5-log.h" /* * PPL consists of a 4KB header (struct ppl_header) and at least 128KB for @@ -116,6 +117,8 @@ struct ppl_conf { /* stripes to retry if failed to allocate io_unit */ struct list_head no_mem_stripes; spinlock_t no_mem_stripes_lock; + + unsigned short write_hint; }; struct ppl_log { @@ -476,6 +479,7 @@ static void ppl_submit_iounit(struct ppl_io_unit *io) bio_set_dev(bio, log->rdev->bdev); bio->bi_iter.bi_sector = log->next_io_sector; bio_add_page(bio, io->header_page, PAGE_SIZE, 0); + bio->bi_write_hint = ppl_conf->write_hint; pr_debug("%s: log->current_io_sector: %llu\n", __func__, (unsigned long long)log->next_io_sector); @@ -505,6 +509,7 @@ static void ppl_submit_iounit(struct ppl_io_unit *io) bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, &ppl_conf->bs); bio->bi_opf = prev->bi_opf; + bio->bi_write_hint = prev->bi_write_hint; bio_copy_dev(bio, prev); bio->bi_iter.bi_sector = bio_end_sector(prev); bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0); @@ -1409,6 +1414,7 @@ int ppl_init_log(struct r5conf *conf) atomic64_set(&ppl_conf->seq, 0); INIT_LIST_HEAD(&ppl_conf->no_mem_stripes); spin_lock_init(&ppl_conf->no_mem_stripes_lock); + ppl_conf->write_hint = RWF_WRITE_LIFE_NOT_SET; if (!mddev->external) { ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid)); @@ -1503,3 +1509,60 @@ int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add) return ret; } + +static ssize_t +ppl_write_hint_show(struct mddev *mddev, char *buf) +{ + size_t ret = 0; + struct r5conf *conf; + struct ppl_conf *ppl_conf = NULL; + + spin_lock(&mddev->lock); + conf = mddev->private; + if (conf && raid5_has_ppl(conf)) + ppl_conf = conf->log_private; + ret = sprintf(buf, "%d\n", ppl_conf ? 
ppl_conf->write_hint : 0); + spin_unlock(&mddev->lock); + + return ret; +} + +static ssize_t +ppl_write_hint_store(struct mddev *mddev, const char *page, size_t len) +{ + struct r5conf *conf; + struct ppl_conf *ppl_conf; + int err = 0; + unsigned short new; + + if (len >= PAGE_SIZE) + return -EINVAL; + if (kstrtou16(page, 10, &new)) + return -EINVAL; + + err = mddev_lock(mddev); + if (err) + return err; + + conf = mddev->private; + if (!conf) { + err = -ENODEV; + } else if (raid5_has_ppl(conf)) { + ppl_conf = conf->log_private; + if (!ppl_conf) + err = -EINVAL; + else + ppl_conf->write_hint = new; + } else { + err = -EINVAL; + } + + mddev_unlock(mddev); + + return err ?: len; +} + +struct md_sysfs_entry +ppl_write_hint = __ATTR(ppl_write_hint, S_IRUGO | S_IWUSR, + ppl_write_hint_show, + ppl_write_hint_store); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index cecea901ab8c..09562d7cc080 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6660,6 +6660,7 @@ static struct attribute *raid5_attrs[] = { &raid5_skip_copy.attr, &raid5_rmw_level.attr, &r5c_journal_mode.attr, + &ppl_write_hint.attr, NULL, }; static struct attribute_group raid5_attrs_group = { From b761dcf1217760a42f7897c31dcb649f59b2333e Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Fri, 8 Mar 2019 23:52:05 +0800 Subject: [PATCH 413/855] It's wrong to add len to sector_nr in raid10 reshape twice In reshape_request it already adds len to sector_nr already. It's wrong to add len to sector_nr again after adding pages to bio. If there is bad block it can't copy one chunk at a time, it needs to goto read_more. Now the sector_nr is wrong. It can cause data corruption. Cc: stable@vger.kernel.org # v3.16+ Signed-off-by: Xiao Ni Signed-off-by: Song Liu --- drivers/md/raid10.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index abb5d382f64d..ecef42bfe19d 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4670,7 +4670,6 @@ read_more: atomic_inc(&r10_bio->remaining); read_bio->bi_next = NULL; generic_make_request(read_bio); - sector_nr += nr_sectors; sectors_done += nr_sectors; if (sector_nr <= last) goto read_more; From e406f12dde1a8375d77ea02d91f313fb1a9c6aec Mon Sep 17 00:00:00 2001 From: Aditya Pakki Date: Mon, 4 Mar 2019 16:48:54 -0600 Subject: [PATCH 414/855] md: Fix failed allocation of md_register_thread mddev->sync_thread can be set to NULL on kzalloc failure downstream. The patch checks for such a scenario and frees allocated resources. Committer node: Added similar fix to raid5.c, as suggested by Guoqing. 
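The pattern, condensed (raid10 side shown; the raid5 hunk is analogous): md_register_thread() returns NULL when it cannot allocate or start the helper thread, so the caller must unwind instead of continuing with a NULL mddev->sync_thread:

        mddev->sync_thread = md_register_thread(md_do_sync, mddev, "reshape");
        if (!mddev->sync_thread)
                goto out_free_conf;     /* release what raid10_run() set up earlier */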
Cc: stable@vger.kernel.org # v3.16+ Acked-by: Guoqing Jiang Signed-off-by: Aditya Pakki Signed-off-by: Song Liu --- drivers/md/raid10.c | 2 ++ drivers/md/raid5.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index ecef42bfe19d..3b6880dd648d 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3939,6 +3939,8 @@ static int raid10_run(struct mddev *mddev) set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); mddev->sync_thread = md_register_thread(md_do_sync, mddev, "reshape"); + if (!mddev->sync_thread) + goto out_free_conf; } return 0; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 09562d7cc080..992fd08437d8 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7403,6 +7403,8 @@ static int raid5_run(struct mddev *mddev) set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); mddev->sync_thread = md_register_thread(md_do_sync, mddev, "reshape"); + if (!mddev->sync_thread) + goto abort; } /* Ok, everything is just fine now */ From f87b543af45ea217fa1d000483f9a40944c1ff73 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 12 Mar 2019 12:06:35 -0400 Subject: [PATCH 415/855] fix null pointer deref in tracepoints in back channel Backchannel doesn't have the rq_task->tk_clientid pointer set. Otherwise can lead to the following oops: ocalhost login: [ 111.385319] BUG: unable to handle kernel NULL pointer dereference at 0000000000000004 [ 111.388073] #PF error: [normal kernel read fault] [ 111.389452] PGD 80000000290d8067 P4D 80000000290d8067 PUD 75f25067 PMD 0 [ 111.391224] Oops: 0000 [#1] SMP PTI [ 111.392151] CPU: 0 PID: 3533 Comm: NFSv4 callback Not tainted 5.0.0-rc7+ #1 [ 111.393787] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015 [ 111.396340] RIP: 0010:trace_event_raw_event_xprt_enq_xmit+0x6f/0xf0 [sunrpc] [ 111.397974] Code: 00 00 00 48 89 ee 48 89 e7 e8 bd 0a 85 d7 48 85 c0 74 4a 41 0f b7 94 24 e0 00 00 00 48 89 e7 89 50 08 49 8b 94 24 a8 00 00 00 <8b> 52 04 89 50 0c 49 8b 94 24 c0 00 00 00 8b 92 a8 00 00 00 0f ca [ 111.402215] RSP: 0018:ffffb98743263cf8 EFLAGS: 00010286 [ 111.403406] RAX: ffffa0890fc3bc88 RBX: 0000000000000003 RCX: 0000000000000000 [ 111.405057] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffb98743263cf8 [ 111.406656] RBP: ffffa0896f5368f0 R08: 0000000000000246 R09: 0000000000000000 [ 111.408437] R10: ffffe19b01c01500 R11: 0000000000000000 R12: ffffa08977d28a00 [ 111.410210] R13: 0000000000000004 R14: ffffa089315303f0 R15: ffffa08931530000 [ 111.411856] FS: 0000000000000000(0000) GS:ffffa0897bc00000(0000) knlGS:0000000000000000 [ 111.413699] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 111.415068] CR2: 0000000000000004 CR3: 000000002ac90004 CR4: 00000000001606f0 [ 111.416745] Call Trace: [ 111.417339] xprt_request_enqueue_transmit+0x2b6/0x4a0 [sunrpc] [ 111.418709] ? rpc_task_need_encode+0x40/0x40 [sunrpc] [ 111.419957] call_bc_transmit+0xd5/0x170 [sunrpc] [ 111.421067] __rpc_execute+0x7e/0x3f0 [sunrpc] [ 111.422177] rpc_run_bc_task+0x78/0xd0 [sunrpc] [ 111.423212] bc_svc_process+0x281/0x340 [sunrpc] [ 111.424325] nfs41_callback_svc+0x130/0x1c0 [nfsv4] [ 111.425430] ? remove_wait_queue+0x60/0x60 [ 111.426398] kthread+0xf5/0x130 [ 111.427155] ? nfs_callback_authenticate+0x50/0x50 [nfsv4] [ 111.428388] ? 
kthread_bind+0x10/0x10 [ 111.429270] ret_from_fork+0x1f/0x30 localhost login: [ 467.462259] BUG: unable to handle kernel NULL pointer dereference at 0000000000000004 [ 467.464411] #PF error: [normal kernel read fault] [ 467.465445] PGD 80000000728c1067 P4D 80000000728c1067 PUD 728c0067 PMD 0 [ 467.466980] Oops: 0000 [#1] SMP PTI [ 467.467759] CPU: 0 PID: 3517 Comm: NFSv4 callback Not tainted 5.0.0-rc7+ #1 [ 467.469393] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015 [ 467.471840] RIP: 0010:trace_event_raw_event_xprt_transmit+0x7c/0xf0 [sunrpc] [ 467.473392] Code: f6 48 85 c0 74 4b 49 8b 94 24 98 00 00 00 48 89 e7 0f b7 92 e0 00 00 00 89 50 08 49 8b 94 24 98 00 00 00 48 8b 92 a8 00 00 00 <8b> 52 04 89 50 0c 41 8b 94 24 a8 00 00 00 0f ca 89 50 10 41 8b 94 [ 467.477605] RSP: 0018:ffffabe7434fbcd0 EFLAGS: 00010282 [ 467.478793] RAX: ffff99720fc3bce0 RBX: 0000000000000003 RCX: 0000000000000000 [ 467.480409] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffabe7434fbcd0 [ 467.482011] RBP: ffff99726f631948 R08: 0000000000000246 R09: 0000000000000000 [ 467.483591] R10: 0000000070000000 R11: 0000000000000000 R12: ffff997277dfcc00 [ 467.485226] R13: 0000000000000000 R14: 0000000000000000 R15: ffff99722fecdca8 [ 467.486830] FS: 0000000000000000(0000) GS:ffff99727bc00000(0000) knlGS:0000000000000000 [ 467.488596] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 467.489931] CR2: 0000000000000004 CR3: 00000000270e6006 CR4: 00000000001606f0 [ 467.491559] Call Trace: [ 467.492128] xprt_transmit+0x303/0x3f0 [sunrpc] [ 467.493143] ? rpc_task_need_encode+0x40/0x40 [sunrpc] [ 467.494328] call_bc_transmit+0x49/0x170 [sunrpc] [ 467.495379] __rpc_execute+0x7e/0x3f0 [sunrpc] [ 467.496451] rpc_run_bc_task+0x78/0xd0 [sunrpc] [ 467.497467] bc_svc_process+0x281/0x340 [sunrpc] [ 467.498507] nfs41_callback_svc+0x130/0x1c0 [nfsv4] [ 467.499751] ? remove_wait_queue+0x60/0x60 [ 467.500686] kthread+0xf5/0x130 [ 467.501438] ? nfs_callback_authenticate+0x50/0x50 [nfsv4] [ 467.502640] ? kthread_bind+0x10/0x10 [ 467.503454] ret_from_fork+0x1f/0x30 Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- include/trace/events/sunrpc.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 8451f30c6a0f..7e899e635d33 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -712,7 +712,8 @@ TRACE_EVENT(xprt_transmit, TP_fast_assign( __entry->task_id = rqst->rq_task->tk_pid; - __entry->client_id = rqst->rq_task->tk_client->cl_clid; + __entry->client_id = rqst->rq_task->tk_client ? + rqst->rq_task->tk_client->cl_clid : -1; __entry->xid = be32_to_cpu(rqst->rq_xid); __entry->seqno = rqst->rq_seqno; __entry->status = status; @@ -742,7 +743,8 @@ TRACE_EVENT(xprt_enq_xmit, TP_fast_assign( __entry->task_id = task->tk_pid; - __entry->client_id = task->tk_client->cl_clid; + __entry->client_id = task->tk_client ? + task->tk_client->cl_clid : -1; __entry->xid = be32_to_cpu(task->tk_rqstp->rq_xid); __entry->seqno = task->tk_rqstp->rq_seqno; __entry->stage = stage; From 400417b05f3ec0531544ca5f94e64d838d8b8849 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 12 Mar 2019 16:04:51 -0400 Subject: [PATCH 416/855] pNFS: Fix a typo in pnfs_update_layout We're supposed to wait for the outstanding layout count to go to zero, but that got lost somehow. 
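A note on the API this one-character fix relies on: the second argument of wait_var_event_killable() is the wake-up condition, i.e. it must evaluate to true when the wait should end. A minimal sketch of the intended use, with the same variables as in the hunk below:

        /* sleep until no layoutgets are outstanding; the predicate is the
         * condition for stopping the wait, hence the negation */
        lseg = ERR_PTR(wait_var_event_killable(&lo->plh_outstanding,
                                !atomic_read(&lo->plh_outstanding)));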
Fixes: d03360aaf5cca ("pNFS: Ensure we return the error if someone...") Reported-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 8247bd1634cb..7066cd7c7aff 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1889,7 +1889,7 @@ lookup_again: atomic_read(&lo->plh_outstanding) != 0) { spin_unlock(&ino->i_lock); lseg = ERR_PTR(wait_var_event_killable(&lo->plh_outstanding, - atomic_read(&lo->plh_outstanding))); + !atomic_read(&lo->plh_outstanding))); if (IS_ERR(lseg) || !list_empty(&lo->plh_segs)) goto out_put_layout_hdr; pnfs_put_layout_hdr(lo); From 0af725fcb77a148b7da249fa01f98ceeaa149eec Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 15 Feb 2019 19:04:38 -0800 Subject: [PATCH 417/855] f2fs: fix wrong #endif We have to cover whole headerfile with last #endif. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f1f0d2810852..19cbf17550c7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3626,8 +3626,6 @@ extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, #define f2fs_build_fault_attr(sbi, rate, type) do { } while (0) #endif -#endif - static inline bool is_journalled_quota(struct f2fs_sb_info *sbi) { #ifdef CONFIG_QUOTA @@ -3640,3 +3638,5 @@ static inline bool is_journalled_quota(struct f2fs_sb_info *sbi) #endif return false; } + +#endif From 68b79cdc6de97fe270ceb40082a4aa6ad3e41ea7 Mon Sep 17 00:00:00 2001 From: Zeng Guangyue Date: Mon, 18 Feb 2019 14:26:41 +0800 Subject: [PATCH 418/855] f2fs: correct spelling mistake correct spelling mistake for "nunmber" Signed-off-by: Zeng Guangyue Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/linux/f2fs_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 666db8eb71e0..f5740423b002 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -285,7 +285,7 @@ enum { struct node_footer { __le32 nid; /* node id */ - __le32 ino; /* inode nunmber */ + __le32 ino; /* inode number */ __le32 flag; /* include cold/fsync/dentry marks and offset */ __le64 cp_ver; /* checkpoint version */ __le32 next_blkaddr; /* next node page block address */ From bc73a4b2414f2914fa895747166312b1527a97bb Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 19 Feb 2019 10:31:52 +0800 Subject: [PATCH 419/855] f2fs: silence VM_WARN_ON_ONCE in mempool_alloc Note that __GFP_ZERO is not supported for mempool_alloc, which also documented in the mempool_alloc comments. 
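The warning exists because elements handed back from the pool's preallocated reserve are returned as-is, so a __GFP_ZERO request cannot be honoured there. The resulting pattern (mirroring the hunk below) is to allocate without __GFP_ZERO and clear the page explicitly:

        struct page *page = mempool_alloc(sbi->write_io_dummy,
                                GFP_NOIO | __GFP_NOFAIL);

        f2fs_bug_on(sbi, !page);
        /* mempool_alloc() must not be asked to zero; do it by hand */
        zero_user_segment(page, 0, PAGE_SIZE);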
Signed-off-by: Gao Xiang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e099babf85bd..65c9586b2952 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -299,9 +299,10 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, for (; start < F2FS_IO_SIZE(sbi); start++) { struct page *page = mempool_alloc(sbi->write_io_dummy, - GFP_NOIO | __GFP_ZERO | __GFP_NOFAIL); + GFP_NOIO | __GFP_NOFAIL); f2fs_bug_on(sbi, !page); + zero_user_segment(page, 0, PAGE_SIZE); SetPagePrivate(page); set_page_private(page, (unsigned long)DUMMY_WRITTEN_PAGE); lock_page(page); From aa2c8c43e4a5c242f5b0331c8b7a941b85f9435a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 19 Feb 2019 16:23:53 +0800 Subject: [PATCH 420/855] f2fs: fix to retry fill_super only if recovery failed With current retry mechanism in f2fs_fill_super, first fill_super fails due to no memory, then second fill_super runs w/o recovery, if we succeed, we may lose fsynced data, it doesn't make sense. Let's retry fill_super only if it occurs non-ENOMEM error during recovery. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 230845221c74..45121190a2ba 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3053,10 +3053,11 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) struct f2fs_super_block *raw_super; struct inode *root; int err; - bool retry = true, need_fsck = false; + bool skip_recovery = false, need_fsck = false; char *options = NULL; int recovery, i, valid_super_block; struct curseg_info *seg_i; + int retry_cnt = 1; try_onemore: err = -EINVAL; @@ -3345,7 +3346,7 @@ try_onemore: goto free_meta; if (unlikely(is_set_ckpt_flags(sbi, CP_DISABLED_FLAG))) - goto skip_recovery; + goto reset_checkpoint; /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { @@ -3362,11 +3363,13 @@ try_onemore: if (need_fsck) set_sbi_flag(sbi, SBI_NEED_FSCK); - if (!retry) - goto skip_recovery; + if (skip_recovery) + goto reset_checkpoint; err = f2fs_recover_fsync_data(sbi, false); if (err < 0) { + if (err != -ENOMEM) + skip_recovery = true; need_fsck = true; f2fs_msg(sb, KERN_ERR, "Cannot recover all fsync data errno=%d", err); @@ -3382,7 +3385,7 @@ try_onemore: goto free_meta; } } -skip_recovery: +reset_checkpoint: /* f2fs_recover_fsync_data() cleared this already */ clear_sbi_flag(sbi, SBI_POR_DOING); @@ -3428,7 +3431,7 @@ skip_recovery: sync_free_meta: /* safe to flush all the data */ sync_filesystem(sbi->sb); - retry = false; + retry_cnt = 0; free_meta: #ifdef CONFIG_QUOTA @@ -3488,8 +3491,8 @@ free_sbi: kvfree(sbi); /* give only one another chance */ - if (retry) { - retry = false; + if (retry_cnt > 0 && skip_recovery) { + retry_cnt--; shrink_dcache_sb(sb); goto try_onemore; } From dc37910d4c63296513c5795187d020ad9793822b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 19 Feb 2019 17:08:18 +0800 Subject: [PATCH 421/855] f2fs: make fault injection covering __submit_flush_wait() This patch changes to allow failure of f2fs_bio_alloc() in __submit_flush_wait(), which can simulate flush error in checkpoint() for covering more error paths. 
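With no_fail passed as false, f2fs_bio_alloc() goes through the bio allocation fault-injection hook (FAULT_ALLOC_BIO, assuming the fault type name used elsewhere in f2fs) and may therefore return NULL, which this path now turns into -ENOMEM:

        bio = f2fs_bio_alloc(sbi, 0, false);    /* may fail under fault injection */
        if (!bio)
                return -ENOMEM;                 /* surfaces as a flush error to checkpoint */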
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fdd8cd21522f..2537893e9466 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -542,9 +542,13 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) static int __submit_flush_wait(struct f2fs_sb_info *sbi, struct block_device *bdev) { - struct bio *bio = f2fs_bio_alloc(sbi, 0, true); + struct bio *bio; int ret; + bio = f2fs_bio_alloc(sbi, 0, false); + if (!bio) + return -ENOMEM; + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; bio_set_dev(bio, bdev); ret = submit_bio_wait(bio); From 6492a335fd8084fa894beb0c4182de439a12e8d5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 21 Feb 2019 20:37:14 +0800 Subject: [PATCH 422/855] f2fs: fix encrypted page memory leak For IPU path of f2fs_do_write_data_page(), in its error path, we need to release encrypted page and fscrypt context, otherwise it will cause memory leak. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 65c9586b2952..3f3becd46362 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1861,8 +1861,13 @@ got_it: if (fio->need_lock == LOCK_REQ) f2fs_unlock_op(fio->sbi); err = f2fs_inplace_write_data(fio); - if (err && PageWriteback(page)) - end_page_writeback(page); + if (err) { + if (f2fs_encrypted_file(inode)) + fscrypt_pullback_bio_page(&fio->encrypted_page, + true); + if (PageWriteback(page)) + end_page_writeback(page); + } trace_f2fs_do_write_data_page(fio->page, IPU); set_inode_flag(inode, FI_UPDATE_WRITE); return err; From e46f6bd82c831d20f9b6c149076cf5d2c443d638 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 21 Feb 2019 20:40:13 +0800 Subject: [PATCH 423/855] f2fs: fix to update iostat correctly in IPU path In error path of IPU, we didn't account iostat correctly, fix it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 2537893e9466..d74359325da6 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3182,10 +3182,10 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) stat_inc_inplace_blocks(fio->sbi); err = f2fs_submit_page_bio(fio); - if (!err) + if (!err) { update_device_state(fio); - - f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + } return err; } From 613f3dcdf0d8ffdb3e5d820f6d69076905efddc3 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 21 Feb 2019 12:57:35 +0800 Subject: [PATCH 424/855] f2fs: no need to take page lock in readdir VFS will take inode_lock for readdir, therefore no need to take page lock in readdir at all just as the majority of other generic filesystems. This patch improves concurrency since .iterate_shared was introduced to VFS years ago. 
Signed-off-by: Gao Xiang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index ba7535399d95..103f3686a045 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -896,7 +896,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) page_cache_sync_readahead(inode->i_mapping, ra, file, n, min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); - dentry_page = f2fs_get_lock_data_page(inode, n, false); + dentry_page = f2fs_find_data_page(inode, n); if (IS_ERR(dentry_page)) { err = PTR_ERR(dentry_page); if (err == -ENOENT) { @@ -914,11 +914,11 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) err = f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr); if (err) { - f2fs_put_page(dentry_page, 1); + f2fs_put_page(dentry_page, 0); break; } - f2fs_put_page(dentry_page, 1); + f2fs_put_page(dentry_page, 0); } out_free: fscrypt_fname_free_buffer(&fstr); From 428e3bcf07696ff252b7ff3ff7711c2b9bbdb908 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 25 Feb 2019 09:46:45 -0800 Subject: [PATCH 425/855] f2fs: give random value to i_generation This follows to give random number to i_generation along with commit 232530680290b ("ext4: improve smp scalability for inode generation") This can be used for DUN for UFS HW encryption. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 -- fs/f2fs/namei.c | 3 ++- fs/f2fs/super.c | 1 - 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 19cbf17550c7..3007759dd2dd 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1242,8 +1242,6 @@ struct f2fs_sb_info { unsigned int nquota_files; /* # of quota sysfile */ - u32 s_next_generation; /* for NFS support */ - /* # of pages, see count_type */ atomic_t nr_pages[NR_COUNT_TYPE]; /* # of allocated blocks */ diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 62d9829f3a6a..f218d9424d8b 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -50,7 +51,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); F2FS_I(inode)->i_crtime = inode->i_mtime; - inode->i_generation = sbi->s_next_generation++; + inode->i_generation = prandom_u32(); if (S_ISDIR(inode->i_mode)) F2FS_I(inode)->i_current_depth = 1; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 45121190a2ba..42eb5c86330a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3129,7 +3129,6 @@ try_onemore: sb->s_maxbytes = sbi->max_file_blocks << le32_to_cpu(raw_super->log_blocksize); sb->s_max_links = F2FS_LINK_MAX; - get_random_bytes(&sbi->s_next_generation, sizeof(u32)); #ifdef CONFIG_QUOTA sb->dq_op = &f2fs_quota_operations; From ca597bddedd94906cd761d8be6a3ad21292725de Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 23 Feb 2019 09:48:27 +0800 Subject: [PATCH 426/855] f2fs: fix to dirty inode for i_mode recovery As Seulbae Kim reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202637 We didn't recover permission field correctly after sudden power-cut, the reason is in setattr we didn't add inode into global dirty list once i_mode is changed, so latter checkpoint triggered by fsync will not flush last i_mode into disk, result in this problem, fix it. 
Reported-by: Seulbae Kim Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index b8f5d1208619..3a8c8eb0f549 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -766,7 +766,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int err; - bool size_changed = false; if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; @@ -841,8 +840,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) down_write(&F2FS_I(inode)->i_sem); F2FS_I(inode)->last_disk_size = i_size_read(inode); up_write(&F2FS_I(inode)->i_sem); - - size_changed = true; } __setattr_copy(inode, attr); @@ -856,7 +853,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } /* file size may changed here */ - f2fs_mark_inode_dirty_sync(inode, size_changed); + f2fs_mark_inode_dirty_sync(inode, true); /* inode change will produce dirty node pages flushed by checkpoint */ f2fs_balance_fs(F2FS_I_SB(inode), true); From 48432984d718c95cf13e26d487c2d1b697c3c01f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 25 Feb 2019 17:11:03 +0800 Subject: [PATCH 427/855] f2fs: fix to avoid deadlock of atomic file operations Thread A Thread B - __fput - f2fs_release_file - drop_inmem_pages - mutex_lock(&fi->inmem_lock) - __revoke_inmem_pages - lock_page(page) - open - f2fs_setattr - truncate_setsize - truncate_inode_pages_range - lock_page(page) - truncate_cleanup_page - f2fs_invalidate_page - drop_inmem_page - mutex_lock(&fi->inmem_lock); We may encounter above ABBA deadlock as reported by Kyungtae Kim: I'm reporting a bug in linux-4.17.19: "INFO: task hung in drop_inmem_page" (no reproducer) I think this might be somehow related to the following: https://groups.google.com/forum/#!searchin/syzkaller-bugs/INFO$3A$20task$20hung$20in$20%7Csort:date/syzkaller-bugs/c6soBTrdaIo/AjAzPeIzCgAJ ========================================= INFO: task syz-executor7:10822 blocked for more than 120 seconds. Not tainted 4.17.19 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. 
syz-executor7 D27024 10822 6346 0x00000004 Call Trace: context_switch kernel/sched/core.c:2867 [inline] __schedule+0x721/0x1e60 kernel/sched/core.c:3515 schedule+0x88/0x1c0 kernel/sched/core.c:3559 schedule_preempt_disabled+0x18/0x30 kernel/sched/core.c:3617 __mutex_lock_common kernel/locking/mutex.c:833 [inline] __mutex_lock+0x5bd/0x1410 kernel/locking/mutex.c:893 mutex_lock_nested+0x1b/0x20 kernel/locking/mutex.c:908 drop_inmem_page+0xcb/0x810 fs/f2fs/segment.c:327 f2fs_invalidate_page+0x337/0x5e0 fs/f2fs/data.c:2401 do_invalidatepage mm/truncate.c:165 [inline] truncate_cleanup_page+0x261/0x330 mm/truncate.c:187 truncate_inode_pages_range+0x552/0x1610 mm/truncate.c:367 truncate_inode_pages mm/truncate.c:478 [inline] truncate_pagecache+0x6d/0x90 mm/truncate.c:801 truncate_setsize+0x81/0xa0 mm/truncate.c:826 f2fs_setattr+0x44f/0x1270 fs/f2fs/file.c:781 notify_change+0xa62/0xe80 fs/attr.c:313 do_truncate+0x12e/0x1e0 fs/open.c:63 do_last fs/namei.c:2955 [inline] path_openat+0x2042/0x29f0 fs/namei.c:3505 do_filp_open+0x1bd/0x2c0 fs/namei.c:3540 do_sys_open+0x35e/0x4e0 fs/open.c:1101 __do_sys_open fs/open.c:1119 [inline] __se_sys_open fs/open.c:1114 [inline] __x64_sys_open+0x89/0xc0 fs/open.c:1114 do_syscall_64+0xc4/0x4e0 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x4497b9 RSP: 002b:00007f734e459c68 EFLAGS: 00000246 ORIG_RAX: 0000000000000002 RAX: ffffffffffffffda RBX: 00007f734e45a6cc RCX: 00000000004497b9 RDX: 0000000000000104 RSI: 00000000000a8280 RDI: 0000000020000080 RBP: 000000000071bea0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff R13: 0000000000007230 R14: 00000000006f02d0 R15: 00007f734e45a700 INFO: task syz-executor7:10858 blocked for more than 120 seconds. Not tainted 4.17.19 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. syz-executor7 D28880 10858 6346 0x00000004 Call Trace: context_switch kernel/sched/core.c:2867 [inline] __schedule+0x721/0x1e60 kernel/sched/core.c:3515 schedule+0x88/0x1c0 kernel/sched/core.c:3559 __rwsem_down_write_failed_common kernel/locking/rwsem-xadd.c:565 [inline] rwsem_down_write_failed+0x5e6/0xc90 kernel/locking/rwsem-xadd.c:594 call_rwsem_down_write_failed+0x17/0x30 arch/x86/lib/rwsem.S:117 __down_write arch/x86/include/asm/rwsem.h:142 [inline] down_write+0x58/0xa0 kernel/locking/rwsem.c:72 inode_lock include/linux/fs.h:713 [inline] do_truncate+0x120/0x1e0 fs/open.c:61 do_last fs/namei.c:2955 [inline] path_openat+0x2042/0x29f0 fs/namei.c:3505 do_filp_open+0x1bd/0x2c0 fs/namei.c:3540 do_sys_open+0x35e/0x4e0 fs/open.c:1101 __do_sys_open fs/open.c:1119 [inline] __se_sys_open fs/open.c:1114 [inline] __x64_sys_open+0x89/0xc0 fs/open.c:1114 do_syscall_64+0xc4/0x4e0 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x4497b9 RSP: 002b:00007f734e3b4c68 EFLAGS: 00000246 ORIG_RAX: 0000000000000002 RAX: ffffffffffffffda RBX: 00007f734e3b56cc RCX: 00000000004497b9 RDX: 0000000000000104 RSI: 00000000000a8280 RDI: 0000000020000080 RBP: 000000000071c238 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff R13: 0000000000007230 R14: 00000000006f02d0 R15: 00007f734e3b5700 INFO: task syz-executor5:10829 blocked for more than 120 seconds. Not tainted 4.17.19 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. 
syz-executor5 D28760 10829 6308 0x80000002 Call Trace: context_switch kernel/sched/core.c:2867 [inline] __schedule+0x721/0x1e60 kernel/sched/core.c:3515 schedule+0x88/0x1c0 kernel/sched/core.c:3559 io_schedule+0x21/0x80 kernel/sched/core.c:5179 wait_on_page_bit_common mm/filemap.c:1100 [inline] __lock_page+0x2b5/0x390 mm/filemap.c:1273 lock_page include/linux/pagemap.h:483 [inline] __revoke_inmem_pages+0xb35/0x11c0 fs/f2fs/segment.c:231 drop_inmem_pages+0xa3/0x3e0 fs/f2fs/segment.c:306 f2fs_release_file+0x2c7/0x330 fs/f2fs/file.c:1556 __fput+0x2c7/0x780 fs/file_table.c:209 ____fput+0x1a/0x20 fs/file_table.c:243 task_work_run+0x151/0x1d0 kernel/task_work.c:113 exit_task_work include/linux/task_work.h:22 [inline] do_exit+0x8ba/0x30a0 kernel/exit.c:865 do_group_exit+0x13b/0x3a0 kernel/exit.c:968 get_signal+0x6bb/0x1650 kernel/signal.c:2482 do_signal+0x84/0x1b70 arch/x86/kernel/signal.c:810 exit_to_usermode_loop+0x155/0x190 arch/x86/entry/common.c:162 prepare_exit_to_usermode arch/x86/entry/common.c:196 [inline] syscall_return_slowpath arch/x86/entry/common.c:265 [inline] do_syscall_64+0x445/0x4e0 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x4497b9 RSP: 002b:00007f1c68e74ce8 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca RAX: fffffffffffffe00 RBX: 000000000071bf80 RCX: 00000000004497b9 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 000000000071bf80 RBP: 000000000071bf80 R08: 0000000000000000 R09: 000000000071bf58 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 00007f1c68e759c0 R15: 00007f1c68e75700 This patch tries to use trylock_page to mitigate such deadlock condition for fix. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d74359325da6..e730c334abba 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -215,7 +215,8 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) } static int __revoke_inmem_pages(struct inode *inode, - struct list_head *head, bool drop, bool recover) + struct list_head *head, bool drop, bool recover, + bool trylock) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct inmem_pages *cur, *tmp; @@ -227,7 +228,16 @@ static int __revoke_inmem_pages(struct inode *inode, if (drop) trace_f2fs_commit_inmem_page(page, INMEM_DROP); - lock_page(page); + if (trylock) { + /* + * to avoid deadlock in between page lock and + * inmem_lock. 
+ */ + if (!trylock_page(page)) + continue; + } else { + lock_page(page); + } f2fs_wait_on_page_writeback(page, DATA, true, true); @@ -318,13 +328,19 @@ void f2fs_drop_inmem_pages(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); - mutex_lock(&fi->inmem_lock); - __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (!list_empty(&fi->inmem_ilist)) - list_del_init(&fi->inmem_ilist); - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - mutex_unlock(&fi->inmem_lock); + while (!list_empty(&fi->inmem_pages)) { + mutex_lock(&fi->inmem_lock); + __revoke_inmem_pages(inode, &fi->inmem_pages, + true, false, true); + + if (list_empty(&fi->inmem_pages)) { + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (!list_empty(&fi->inmem_ilist)) + list_del_init(&fi->inmem_ilist); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + } + mutex_unlock(&fi->inmem_lock); + } clear_inode_flag(inode, FI_ATOMIC_FILE); fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; @@ -429,12 +445,15 @@ retry: * recovery or rewrite & commit last transaction. For other * error number, revoking was done by filesystem itself. */ - err = __revoke_inmem_pages(inode, &revoke_list, false, true); + err = __revoke_inmem_pages(inode, &revoke_list, + false, true, false); /* drop all uncommitted pages */ - __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); + __revoke_inmem_pages(inode, &fi->inmem_pages, + true, false, false); } else { - __revoke_inmem_pages(inode, &revoke_list, false, false); + __revoke_inmem_pages(inode, &revoke_list, + false, false, false); } return err; From 559e87c497a820e132995ad56c8361509e5410da Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 26 Feb 2019 19:01:15 +0800 Subject: [PATCH 428/855] f2fs: trace f2fs_ioc_shutdown This patch supports to trace f2fs_ioc_shutdown. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 +++ include/trace/events/f2fs.h | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 3a8c8eb0f549..807a97ad2430 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1987,6 +1987,9 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) out: if (in != F2FS_GOING_DOWN_FULLSYNC) mnt_drop_write_file(filp); + + trace_f2fs_shutdown(sbi, in, ret); + return ret; } diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 8a28a2b1be74..0cb746fe6ae1 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -149,6 +149,14 @@ TRACE_DEFINE_ENUM(CP_TRIMMED); { CP_SPEC_LOG_NUM, "log type is 2" }, \ { CP_RECOVER_DIR, "dir needs recovery" }) +#define show_shutdown_mode(type) \ + __print_symbolic(type, \ + { F2FS_GOING_DOWN_FULLSYNC, "full sync" }, \ + { F2FS_GOING_DOWN_METASYNC, "meta sync" }, \ + { F2FS_GOING_DOWN_NOSYNC, "no sync" }, \ + { F2FS_GOING_DOWN_METAFLUSH, "meta flush" }, \ + { F2FS_GOING_DOWN_NEED_FSCK, "need fsck" }) + struct f2fs_sb_info; struct f2fs_io_info; struct extent_info; @@ -1619,6 +1627,30 @@ DEFINE_EVENT(f2fs_sync_dirty_inodes, f2fs_sync_dirty_inodes_exit, TP_ARGS(sb, type, count) ); +TRACE_EVENT(f2fs_shutdown, + + TP_PROTO(struct f2fs_sb_info *sbi, unsigned int mode, int ret), + + TP_ARGS(sbi, mode, ret), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, mode) + __field(int, ret) + ), + + TP_fast_assign( + __entry->dev = sbi->sb->s_dev; + __entry->mode = mode; + __entry->ret = ret; + ), + + TP_printk("dev = (%d,%d), mode: %s, ret:%d", + show_dev(__entry->dev), + show_shutdown_mode(__entry->mode), + __entry->ret) +); + #endif /* _TRACE_F2FS_H */ /* This part must be outside protection */ From 76630f20059245372a78ccc0845db9d698098c34 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 26 Feb 2019 19:01:16 +0800 Subject: [PATCH 429/855] f2fs: print more parameters in trace_f2fs_map_blocks for better map_blocks trace. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/trace/events/f2fs.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 0cb746fe6ae1..a3916b4dd57e 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -544,6 +544,9 @@ TRACE_EVENT(f2fs_map_blocks, __field(block_t, m_lblk) __field(block_t, m_pblk) __field(unsigned int, m_len) + __field(unsigned int, m_flags) + __field(int, m_seg_type) + __field(bool, m_may_create) __field(int, ret) ), @@ -553,15 +556,22 @@ TRACE_EVENT(f2fs_map_blocks, __entry->m_lblk = map->m_lblk; __entry->m_pblk = map->m_pblk; __entry->m_len = map->m_len; + __entry->m_flags = map->m_flags; + __entry->m_seg_type = map->m_seg_type; + __entry->m_may_create = map->m_may_create; __entry->ret = ret; ), TP_printk("dev = (%d,%d), ino = %lu, file offset = %llu, " - "start blkaddr = 0x%llx, len = 0x%llx, err = %d", + "start blkaddr = 0x%llx, len = 0x%llx, flags = %u," + "seg_type = %d, may_create = %d, err = %d", show_dev_ino(__entry), (unsigned long long)__entry->m_lblk, (unsigned long long)__entry->m_pblk, (unsigned long long)__entry->m_len, + __entry->m_flags, + __entry->m_seg_type, + __entry->m_may_create, __entry->ret) ); From 2a6a7e722e7a78d774ce02b847c5b183a3ff2672 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 5 Mar 2019 17:52:33 +0800 Subject: [PATCH 430/855] f2fs: fix to use kvfree instead of kzfree As Jiqun Li reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202747 System can panic due to using wrong allocate/free function pair in xattr interface: - use kvmalloc to allocate memory - use kzfree to free memory Let's fix to use kvfree instead of kzfree, BTW, we are safe to get rid of kzfree, since there is no such confidential data stored as xattr, we don't need to zero it before free memory. 
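The underlying rule, as a sketch (the function name below is made up for illustration): kvmalloc() may fall back to vmalloc(), so the buffer must be released with kvfree(); kzfree() ends in kfree(), which only handles slab memory and is not valid for a vmalloc'ed address.

        static int example_read_xattrs(size_t size)
        {
                void *base_addr = kvmalloc(size, GFP_KERNEL);

                if (!base_addr)
                        return -ENOMEM;

                /* ... read xattrs into base_addr and use them ... */

                kvfree(base_addr);      /* pairs with kvmalloc(); kzfree() does not */
                return 0;
        }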
Fixes: 5222595d093e ("f2fs: use kvmalloc, if kmalloc is failed") Reported-by: Jiqun Li Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index fa620d31ea5f..ca47224d78ab 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -340,7 +340,7 @@ check: *base_addr = txattr_addr; return 0; out: - kzfree(txattr_addr); + kvfree(txattr_addr); return err; } @@ -383,7 +383,7 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, *base_addr = txattr_addr; return 0; fail: - kzfree(txattr_addr); + kvfree(txattr_addr); return err; } @@ -510,7 +510,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, } error = size; out: - kzfree(base_addr); + kvfree(base_addr); return error; } @@ -556,7 +556,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) } error = buffer_size - rest; cleanup: - kzfree(base_addr); + kvfree(base_addr); return error; } @@ -687,7 +687,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (!error && S_ISDIR(inode->i_mode)) set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); exit: - kzfree(base_addr); + kvfree(base_addr); return error; } From 25720cc05e492467099a2d4d21a50f6ee8555cfd Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 6 Mar 2019 16:18:33 +0800 Subject: [PATCH 431/855] f2fs: remove wrong comment in f2fs_invalidate_page() Since 8c242db9b8c0 ("f2fs: fix stale ATOMIC_WRITTEN_PAGE private pointer"), we've started to not skip clear private flag for atomic_write page truncation, so removing old wrong comment in f2fs_invalidate_page(). Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 3f3becd46362..2510e935301e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2711,7 +2711,6 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, clear_cold_data(page); - /* This is atomic written page, keep Private */ if (IS_ATOMIC_WRITTEN_PAGE(page)) return f2fs_drop_inmem_page(inode, page); From 240a59156d9bcfabceddb66be449e7b32fb5dc4a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 6 Mar 2019 17:30:59 +0800 Subject: [PATCH 432/855] f2fs: fix to add refcount once page is tagged PG_private As Gao Xiang reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202749 f2fs may skip pageout() due to incorrect page reference count. The problem here is that MM defined the rule [1] very clearly that once page was set with PG_private flag, we should increment the refcount in that page, also main flows like pageout(), migrate_page() will assume there is one additional page reference count if page_has_private() returns true. But currently, f2fs won't add/del refcount when changing PG_private flag. Anyway, f2fs should follow MM's rule to make MM's related flows running as expected. 
[1] https://lore.kernel.org/lkml/2b19b3c4-2bc4-15fa-15cc-27a13e5c7af1@aol.com/ Reported-by: Gao Xiang Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 ++-- fs/f2fs/data.c | 21 ++++++++------------- fs/f2fs/dir.c | 2 +- fs/f2fs/f2fs.h | 21 +++++++++++++++++++++ fs/f2fs/node.c | 2 +- fs/f2fs/segment.c | 9 +++------ 6 files changed, 36 insertions(+), 23 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index c65a1e8e1e95..a98e1b02279e 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -406,7 +406,7 @@ static int f2fs_set_meta_page_dirty(struct page *page) if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); - SetPagePrivate(page); + f2fs_set_page_private(page, 0); f2fs_trace_pid(page); return 1; } @@ -957,7 +957,7 @@ void f2fs_update_dirty_page(struct inode *inode, struct page *page) inode_inc_dirty_pages(inode); spin_unlock(&sbi->inode_lock[type]); - SetPagePrivate(page); + f2fs_set_page_private(page, 0); f2fs_trace_pid(page); } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2510e935301e..fa6318549af5 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2714,8 +2714,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, if (IS_ATOMIC_WRITTEN_PAGE(page)) return f2fs_drop_inmem_page(inode, page); - set_page_private(page, 0); - ClearPagePrivate(page); + f2fs_clear_page_private(page); } int f2fs_release_page(struct page *page, gfp_t wait) @@ -2729,8 +2728,7 @@ int f2fs_release_page(struct page *page, gfp_t wait) return 0; clear_cold_data(page); - set_page_private(page, 0); - ClearPagePrivate(page); + f2fs_clear_page_private(page); return 1; } @@ -2798,12 +2796,8 @@ int f2fs_migrate_page(struct address_space *mapping, return -EAGAIN; } - /* - * A reference is expected if PagePrivate set when move mapping, - * however F2FS breaks this for maintaining dirty page counts when - * truncating pages. So here adjusting the 'extra_count' make it work. - */ - extra_count = (atomic_written ? 1 : 0) - page_has_private(page); + /* one extra reference was held for atomic_write page */ + extra_count = atomic_written ? 
1 : 0; rc = migrate_page_move_mapping(mapping, newpage, page, mode, extra_count); if (rc != MIGRATEPAGE_SUCCESS) { @@ -2824,9 +2818,10 @@ int f2fs_migrate_page(struct address_space *mapping, get_page(newpage); } - if (PagePrivate(page)) - SetPagePrivate(newpage); - set_page_private(newpage, page_private(page)); + if (PagePrivate(page)) { + f2fs_set_page_private(newpage, page_private(page)); + f2fs_clear_page_private(page); + } if (mode != MIGRATE_SYNC_NO_COPY) migrate_page_copy(newpage, page); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 103f3686a045..fb647e58edb5 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -728,7 +728,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, !f2fs_truncate_hole(dir, page->index, page->index + 1)) { f2fs_clear_page_cache_dirty_tag(page); clear_page_dirty_for_io(page); - ClearPagePrivate(page); + f2fs_clear_page_private(page); ClearPageUptodate(page); clear_cold_data(page); inode_dec_dirty_pages(dir); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3007759dd2dd..a165f7870004 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2835,6 +2835,27 @@ static inline bool is_valid_data_blkaddr(struct f2fs_sb_info *sbi, return true; } +static inline void f2fs_set_page_private(struct page *page, + unsigned long data) +{ + if (PagePrivate(page)) + return; + + get_page(page); + SetPagePrivate(page); + set_page_private(page, data); +} + +static inline void f2fs_clear_page_private(struct page *page) +{ + if (!PagePrivate(page)) + return; + + set_page_private(page, 0); + ClearPagePrivate(page); + f2fs_put_page(page, 0); +} + /* * file.c */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index f6ff84e29749..3f99ab288695 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1961,7 +1961,7 @@ static int f2fs_set_node_page_dirty(struct page *page) if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); - SetPagePrivate(page); + f2fs_set_page_private(page, 0); f2fs_trace_pid(page); return 1; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e730c334abba..aa7fe79b62b2 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -191,8 +191,7 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) f2fs_trace_pid(page); - set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE); - SetPagePrivate(page); + f2fs_set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE); new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); @@ -280,8 +279,7 @@ next: ClearPageUptodate(page); clear_cold_data(page); } - set_page_private(page, 0); - ClearPagePrivate(page); + f2fs_clear_page_private(page); f2fs_put_page(page, 1); list_del(&cur->list); @@ -370,8 +368,7 @@ void f2fs_drop_inmem_page(struct inode *inode, struct page *page) kmem_cache_free(inmem_entry_slab, cur); ClearPageUptodate(page); - set_page_private(page, 0); - ClearPagePrivate(page); + f2fs_clear_page_private(page); f2fs_put_page(page, 0); trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE); From 86109c9064daf2ac44ef7b4f1eeb039260351e9c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 7 Mar 2019 17:31:30 +0800 Subject: [PATCH 433/855] f2fs: don't trigger read IO for beyond EOF page In f2fs_mpage_readpages(), if page is beyond EOF, we should just zero out it, but previously, before checking previous mapping info, we missed to check filesize boundary, fix it. 
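In outline (names as in f2fs_mpage_readpages(); the i_size-based clamping of last_block is unchanged by this patch), the boundary test now runs before the previous mapping result can be reused:

        /* last_block has already been clamped to the last block covered by i_size */
        if (block_in_file >= last_block)
                goto zero_out;          /* page lies entirely beyond EOF */

        /* only now is it valid to reuse or refresh the cached map */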
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index fa6318549af5..912fdcc2dbb8 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1551,6 +1551,9 @@ static int f2fs_mpage_readpages(struct address_space *mapping, if (last_block > last_block_in_file) last_block = last_block_in_file; + /* just zeroing out page which is beyond EOF */ + if (block_in_file >= last_block) + goto zero_out; /* * Map blocks using the previous result first. */ @@ -1563,16 +1566,11 @@ static int f2fs_mpage_readpages(struct address_space *mapping, * Then do more f2fs_map_blocks() calls until we are * done with this page. */ - map.m_flags = 0; + map.m_lblk = block_in_file; + map.m_len = last_block - block_in_file; - if (block_in_file < last_block) { - map.m_lblk = block_in_file; - map.m_len = last_block - block_in_file; - - if (f2fs_map_blocks(inode, &map, 0, - F2FS_GET_BLOCK_DEFAULT)) - goto set_error_page; - } + if (f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT)) + goto set_error_page; got_it: if ((map.m_flags & F2FS_MAP_MAPPED)) { block_nr = map.m_pblk + block_in_file - map.m_lblk; @@ -1587,6 +1585,7 @@ got_it: DATA_GENERIC)) goto set_error_page; } else { +zero_out: zero_user_segment(page, 0, PAGE_SIZE); if (!PageUptodate(page)) SetPageUptodate(page); From 70db5b04cbe19e5ae7e85ada2d3e82bcfdf90352 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 12 Mar 2019 11:49:53 -0700 Subject: [PATCH 434/855] f2fs: give some messages for inline_xattr_size This patch adds some kernel messages when user sets wrong inline_xattr_size. Fixes: 500e0b28ecd3 ("f2fs: fix to check inline_xattr_size boundary correctly") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 42eb5c86330a..324938ec95f3 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -821,6 +821,8 @@ static int parse_options(struct super_block *sb, char *options) } if (test_opt(sbi, INLINE_XATTR_SIZE)) { + int min_size, max_size; + if (!f2fs_sb_has_extra_attr(sbi) || !f2fs_sb_has_flexible_inline_xattr(sbi)) { f2fs_msg(sb, KERN_ERR, @@ -834,15 +836,18 @@ static int parse_options(struct super_block *sb, char *options) "set with inline_xattr option"); return -EINVAL; } - if (F2FS_OPTION(sbi).inline_xattr_size < - sizeof(struct f2fs_xattr_header) / sizeof(__le32) || - F2FS_OPTION(sbi).inline_xattr_size > - DEF_ADDRS_PER_INODE - + + min_size = sizeof(struct f2fs_xattr_header) / sizeof(__le32); + max_size = DEF_ADDRS_PER_INODE - F2FS_TOTAL_EXTRA_ATTR_SIZE / sizeof(__le32) - DEF_INLINE_RESERVED_SIZE - - MIN_INLINE_DENTRY_SIZE / sizeof(__le32)) { + MIN_INLINE_DENTRY_SIZE / sizeof(__le32); + + if (F2FS_OPTION(sbi).inline_xattr_size < min_size || + F2FS_OPTION(sbi).inline_xattr_size > max_size) { f2fs_msg(sb, KERN_ERR, - "inline xattr size is out of range"); + "inline xattr size is out of range: %d ~ %d", + min_size, max_size); return -EINVAL; } } From dd6c89b5f2b93ceced4111e7b69d4efd8c312713 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 4 Mar 2019 17:19:04 +0800 Subject: [PATCH 435/855] f2fs: fix to do sanity check with inode.i_inline_xattr_size As Paul Bandha reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202709 When I run the poc on the mounted f2fs img I get a buffer overflow in read_inline_xattr due to there being no sanity check on the value of i_inline_xattr_size. 
I created the img by just modifying the value of i_inline_xattr_size in the inode: i_name [test1.txt] i_ext: fofs:0 blkaddr:0 len:0 i_extra_isize [0x 18 : 24] i_inline_xattr_size [0x ffff : 65535] i_addr[ofs] [0x 0 : 0] mkdir /mnt/f2fs mount ./f2fs1.img /mnt/f2fs gcc poc.c -o poc ./poc int main() { int y = syscall(SYS_listxattr, "/mnt/f2fs/test1.txt", NULL, 0); printf("ret %d", y); printf("errno: %d\n", errno); } BUG: KASAN: slab-out-of-bounds in read_inline_xattr+0x18f/0x260 Read of size 262140 at addr ffff88011035efd8 by task f2fs1poc/3263 CPU: 0 PID: 3263 Comm: f2fs1poc Not tainted 4.18.0-custom #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.1-0-g0551a4be2c-prebuilt.qemu-project.org 04/01/2014 Call Trace: dump_stack+0x71/0xab print_address_description+0x83/0x250 kasan_report+0x213/0x350 memcpy+0x1f/0x50 read_inline_xattr+0x18f/0x260 read_all_xattrs+0xba/0x190 f2fs_listxattr+0x9d/0x3f0 listxattr+0xb2/0xd0 path_listxattr+0x93/0xe0 do_syscall_64+0x9d/0x220 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Let's add sanity check for inode.i_inline_xattr_size during f2fs_iget() to avoid this issue. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 15 +++++++++++++++ fs/f2fs/super.c | 5 +---- fs/f2fs/xattr.h | 6 ++++++ 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index bec52961630b..d44e6de70cb3 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -14,6 +14,7 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include "xattr.h" #include @@ -248,6 +249,20 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } + if (f2fs_has_extra_attr(inode) && + f2fs_sb_has_flexible_inline_xattr(sbi) && + f2fs_has_inline_xattr(inode) && + (!fi->i_inline_xattr_size || + fi->i_inline_xattr_size > MAX_INLINE_XATTR_SIZE)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: inode (ino=%lx) has corrupted " + "i_inline_xattr_size: %d, max: %zu", + __func__, inode->i_ino, fi->i_inline_xattr_size, + MAX_INLINE_XATTR_SIZE); + return false; + } + if (F2FS_I(inode)->extent_tree) { struct extent_info *ei = &F2FS_I(inode)->extent_tree->largest; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 324938ec95f3..21750df930b3 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -838,10 +838,7 @@ static int parse_options(struct super_block *sb, char *options) } min_size = sizeof(struct f2fs_xattr_header) / sizeof(__le32); - max_size = DEF_ADDRS_PER_INODE - - F2FS_TOTAL_EXTRA_ATTR_SIZE / sizeof(__le32) - - DEF_INLINE_RESERVED_SIZE - - MIN_INLINE_DENTRY_SIZE / sizeof(__le32); + max_size = MAX_INLINE_XATTR_SIZE; if (F2FS_OPTION(sbi).inline_xattr_size < min_size || F2FS_OPTION(sbi).inline_xattr_size > max_size) { diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 67db134da0f5..9172ee082ca8 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -78,6 +78,12 @@ struct f2fs_xattr_entry { sizeof(struct f2fs_xattr_header) - \ sizeof(struct f2fs_xattr_entry)) +#define MAX_INLINE_XATTR_SIZE \ + (DEF_ADDRS_PER_INODE - \ + F2FS_TOTAL_EXTRA_ATTR_SIZE / sizeof(__le32) - \ + DEF_INLINE_RESERVED_SIZE - \ + MIN_INLINE_DENTRY_SIZE / sizeof(__le32)) + /* * On-disk structure of f2fs_xattr * We use inline xattrs space + 1 block for xattr. 
From 2c28aba8b2e2a51749fa66e01b68e1cd5b53e022 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 5 Mar 2019 19:32:26 +0800 Subject: [PATCH 436/855] f2fs: fix to adapt small inline xattr space in __find_inline_xattr() With the testcase below, we fail to find an existing xattr entry: 1. mkfs.f2fs -O extra_attr -O flexible_inline_xattr /dev/zram0 2. mount -t f2fs -o inline_xattr_size=1 /dev/zram0 /mnt/f2fs/ 3. touch /mnt/f2fs/file 4. setfattr -n "user.name" -v 0 /mnt/f2fs/file 5. getfattr -n "user.name" /mnt/f2fs/file /mnt/f2fs/file: user.name: No such attribute The reason is that for an inode with a very small inline xattr size, __find_inline_xattr() fails to traverse any entry, because the first entry may not have been loaded from the xattr node yet; later, we may skip checking the remaining xattr data in __find_xattr(), which results in this wrong lookup. This patch adds a check for that case to avoid the issue. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index ca47224d78ab..848a785abe25 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -224,11 +224,11 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode, { struct f2fs_xattr_entry *entry; unsigned int inline_size = inline_xattr_size(inode); + void *max_addr = base_addr + inline_size; list_for_each_xattr(entry, base_addr) { - if ((void *)entry + sizeof(__u32) > base_addr + inline_size || - (void *)XATTR_NEXT_ENTRY(entry) + sizeof(__u32) > - base_addr + inline_size) { + if ((void *)entry + sizeof(__u32) > max_addr || + (void *)XATTR_NEXT_ENTRY(entry) > max_addr) { *last_addr = entry; return NULL; } @@ -239,6 +239,13 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode, if (!memcmp(entry->e_name, name, len)) break; } + + /* inline xattr header or entry across max inline xattr size */ + if (IS_XATTR_LAST_ENTRY(entry) && + (void *)entry + sizeof(__u32) > max_addr) { + *last_addr = entry; + return NULL; + } return entry; } From aadcef64b22f668c1a107b86d3521d9cac915c24 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 12 Mar 2019 15:44:27 +0800 Subject: [PATCH 437/855] f2fs: fix to avoid deadlock in f2fs_read_inline_dir() As Jiqun Li reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202883 sometimes a deadlock occurs when the SYS_getdents64 system call races with fsync() called by another process. monkey running on android9.0 1. task 9785 held sbi->cp_rwsem and is waiting on lock_page() 2. task 10349 held mm_sem and is waiting on sbi->cp_rwsem 3. task 9709 held lock_page() and is waiting on mm_sem so this is a deadlock scenario.
task stack is show by crash tools as following crash_arm64> bt ffffffc03c354080 PID: 9785 TASK: ffffffc03c354080 CPU: 1 COMMAND: "RxIoScheduler-3" >> #7 [ffffffc01b50fac0] __lock_page at ffffff80081b11e8 crash-arm64> bt 10349 PID: 10349 TASK: ffffffc018b83080 CPU: 1 COMMAND: "BUGLY_ASYNC_UPL" >> #3 [ffffffc01f8cfa40] rwsem_down_read_failed at ffffff8008a93afc PC: 00000033 LR: 00000000 SP: 00000000 PSTATE: ffffffffffffffff crash-arm64> bt 9709 PID: 9709 TASK: ffffffc03e7f3080 CPU: 1 COMMAND: "IntentService[A" >> #3 [ffffffc001e67850] rwsem_down_read_failed at ffffff8008a93afc >> #8 [ffffffc001e67b80] el1_ia at ffffff8008084fc4 PC: ffffff8008274114 [compat_filldir64+120] LR: ffffff80083584d4 [f2fs_fill_dentries+448] SP: ffffffc001e67b80 PSTATE: 80400145 X29: ffffffc001e67b80 X28: 0000000000000000 X27: 000000000000001a X26: 00000000000093d7 X25: ffffffc070d52480 X24: 0000000000000008 X23: 0000000000000028 X22: 00000000d43dfd60 X21: ffffffc001e67e90 X20: 0000000000000011 X19: ffffff80093a4000 X18: 0000000000000000 X17: 0000000000000000 X16: 0000000000000000 X15: 0000000000000000 X14: ffffffffffffffff X13: 0000000000000008 X12: 0101010101010101 X11: 7f7f7f7f7f7f7f7f X10: 6a6a6a6a6a6a6a6a X9: 7f7f7f7f7f7f7f7f X8: 0000000080808000 X7: ffffff800827409c X6: 0000000080808000 X5: 0000000000000008 X4: 00000000000093d7 X3: 000000000000001a X2: 0000000000000011 X1: ffffffc070d52480 X0: 0000000000800238 >> #9 [ffffffc001e67be0] f2fs_fill_dentries at ffffff80083584d0 PC: 0000003c LR: 00000000 SP: 00000000 PSTATE: 000000d9 X12: f48a02ff X11: d4678960 X10: d43dfc00 X9: d4678ae4 X8: 00000058 X7: d4678994 X6: d43de800 X5: 000000d9 X4: d43dfc0c X3: d43dfc10 X2: d46799c8 X1: 00000000 X0: 00001068 Below potential deadlock will happen between three threads: Thread A Thread B Thread C - f2fs_do_sync_file - f2fs_write_checkpoint - down_write(&sbi->node_change) -- 1) - do_page_fault - down_write(&mm->mmap_sem) -- 2) - do_wp_page - f2fs_vm_page_mkwrite - getdents64 - f2fs_read_inline_dir - lock_page -- 3) - f2fs_sync_node_pages - lock_page -- 3) - __do_map_lock - down_read(&sbi->node_change) -- 1) - f2fs_fill_dentries - dir_emit - compat_filldir64 - do_page_fault - down_read(&mm->mmap_sem) -- 2) Since f2fs_readdir is protected by inode.i_rwsem, there should not be any updates in inode page, we're safe to lookup dents in inode page without its lock held, so taking off the lock to improve concurrency of readdir and avoid potential deadlock. Reported-by: Jiqun Li Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inline.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index a1381d05956b..bb6a152310ef 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -659,6 +659,12 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, if (IS_ERR(ipage)) return PTR_ERR(ipage); + /* + * f2fs_readdir was protected by inode.i_rwsem, it is safe to access + * ipage without page's lock held. + */ + unlock_page(ipage); + inline_dentry = inline_data_addr(inode, ipage); make_dentry_ptr_inline(inode, &d, inline_dentry); @@ -667,7 +673,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, if (!err) ctx->pos = d.max; - f2fs_put_page(ipage, 1); + f2fs_put_page(ipage, 0); return err < 0 ? 
err : 0; } From 1702877621ff1c8a737857b71d379510267a17db Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Tue, 12 Mar 2019 16:07:12 +1100 Subject: [PATCH 438/855] powerpc/powernv: Fix compile without CONFIG_TRACEPOINTS The function returns s64, but the return statement is missing. This adds the missing return statement. Fixes: 75d9fc7fd94e ("powerpc/powernv: move OPAL call wrapper tracing and interrupt handling to C") Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-call.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c index 578757d403ab..daad8c45c8e7 100644 --- a/arch/powerpc/platforms/powernv/opal-call.c +++ b/arch/powerpc/platforms/powernv/opal-call.c @@ -86,6 +86,7 @@ static s64 __opal_call_trace(s64 a0, s64 a1, s64 a2, s64 a3, s64 a4, s64 a5, s64 a6, s64 a7, unsigned long opcode, unsigned long msr) { + return 0; } #define DO_TRACE false From de3c83c2fd2b87cf68214eda76dfa66989d78cb6 Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Tue, 12 Mar 2019 21:18:23 +0100 Subject: [PATCH 439/855] powerpc/64s: Include header file to fix a warning Make sure to include the header that provides the prototype for hv_nmi_check_nonrecoverable(). This removes the following warning, treated as an error with W=1: arch/powerpc/kernel/traps.c:393:6: error: no previous prototype for 'hv_nmi_check_nonrecoverable' Fixes: ccd477028a20 ("powerpc/64s: Fix HV NMI vs HV interrupt recoverability test") Signed-off-by: Mathieu Malaterre Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/traps.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index a21200c6aaea..1fd45a8650e1 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -71,6 +71,7 @@ #include #include #include +#include #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC_CORE) int (*__debugger)(struct pt_regs *regs) __read_mostly; From 2795e8c251614ac0784c9d41008551109f665716 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Mon, 11 Mar 2019 02:25:17 -0500 Subject: [PATCH 440/855] net: ieee802154: fix a potential NULL pointer dereference In case alloc_ordered_workqueue() fails, the fix releases resources and returns -ENOMEM to avoid a NULL pointer dereference.
Signed-off-by: Kangjie Lu Acked-by: Michael Hennerich Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/adf7242.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ieee802154/adf7242.c b/drivers/net/ieee802154/adf7242.c index cd1d8faccca5..cd6b95e673a5 100644 --- a/drivers/net/ieee802154/adf7242.c +++ b/drivers/net/ieee802154/adf7242.c @@ -1268,6 +1268,10 @@ static int adf7242_probe(struct spi_device *spi) INIT_DELAYED_WORK(&lp->work, adf7242_rx_cal_work); lp->wqueue = alloc_ordered_workqueue(dev_name(&spi->dev), WQ_MEM_RECLAIM); + if (unlikely(!lp->wqueue)) { + ret = -ENOMEM; + goto err_hw_init; + } ret = adf7242_hw_init(lp); if (ret) From 19b39a25388e71390e059906c979f87be4ef0c71 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Tue, 19 Feb 2019 13:10:29 +0800 Subject: [PATCH 441/855] ieee802154: hwsim: propagate genlmsg_reply return code genlmsg_reply can fail, so propagate its return code Signed-off-by: Li RongQing Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/mac802154_hwsim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ieee802154/mac802154_hwsim.c b/drivers/net/ieee802154/mac802154_hwsim.c index b6743f03dce0..3b88846de31b 100644 --- a/drivers/net/ieee802154/mac802154_hwsim.c +++ b/drivers/net/ieee802154/mac802154_hwsim.c @@ -324,7 +324,7 @@ static int hwsim_get_radio_nl(struct sk_buff *msg, struct genl_info *info) goto out_err; } - genlmsg_reply(skb, info); + res = genlmsg_reply(skb, info); break; } From d344e07940f3a3a93dec38f36593cca1591a7a5e Mon Sep 17 00:00:00 2001 From: Mariusz Ceier Date: Mon, 11 Mar 2019 21:53:57 +0100 Subject: [PATCH 442/855] ALSA: hda: Avoid NULL pointer dereference at snd_hdac_stream_start() For ca0132 codec, azx_dev->stream is NULL during firmware loading. Calling snd_hdac_get_stream_stripe_ctl unconditionally causes NULL pointer dereference in that function. Fixes: 9b6f7e7a296e ("ALSA: hda: program stripe bits for controller") Signed-off-by: Mariusz Ceier Signed-off-by: Takashi Iwai --- sound/hda/hdac_stream.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sound/hda/hdac_stream.c b/sound/hda/hdac_stream.c index f5dd288d1a7a..76e9b41fcea2 100644 --- a/sound/hda/hdac_stream.c +++ b/sound/hda/hdac_stream.c @@ -95,7 +95,10 @@ void snd_hdac_stream_start(struct hdac_stream *azx_dev, bool fresh_start) 1 << azx_dev->index, 1 << azx_dev->index); /* set stripe control */ - stripe_ctl = snd_hdac_get_stream_stripe_ctl(bus, azx_dev->substream); + if (azx_dev->substream) + stripe_ctl = snd_hdac_get_stream_stripe_ctl(bus, azx_dev->substream); + else + stripe_ctl = 0; snd_hdac_stream_updateb(azx_dev, SD_CTL_3B, SD_CTL_STRIPE_MASK, stripe_ctl); /* set DMA start and interrupt mask */ From a2c6433ee5a35a8de6d563f6512a26f87835ea0f Mon Sep 17 00:00:00 2001 From: Aditya Pakki Date: Mon, 4 Mar 2019 17:00:02 -0600 Subject: [PATCH 443/855] ALSA: usx2y: Fix potential NULL pointer dereference usb_alloc_urb() can fail due to kmalloc failure and push the error upstream. Further this can cause a NULL pointer dereference in init_pipe_urbs(). This patch avoids such a scenario. 
Signed-off-by: Aditya Pakki Signed-off-by: Takashi Iwai --- sound/usb/usx2y/usb_stream.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sound/usb/usx2y/usb_stream.c b/sound/usb/usx2y/usb_stream.c index b0f8979ff2d2..221adf68bd0c 100644 --- a/sound/usb/usx2y/usb_stream.c +++ b/sound/usb/usx2y/usb_stream.c @@ -104,7 +104,12 @@ static int init_urbs(struct usb_stream_kernel *sk, unsigned use_packsize, for (u = 0; u < USB_STREAM_NURBS; ++u) { sk->inurb[u] = usb_alloc_urb(sk->n_o_ps, GFP_KERNEL); + if (!sk->inurb[u]) + return -ENOMEM; + sk->outurb[u] = usb_alloc_urb(sk->n_o_ps, GFP_KERNEL); + if (!sk->outurb[u]) + return -ENOMEM; } if (init_pipe_urbs(sk, use_packsize, sk->inurb, indata, dev, in_pipe) || From 7472946915aad1cc751cce3edfd8c1fd5c845834 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 4 Mar 2019 21:33:25 +0100 Subject: [PATCH 444/855] ALSA: hda/tegra: avoid build error without CONFIG_PM The #ifdef protection around the PM functions is wrong, leading to a failed reference in some configurations: sound/pci/hda/hda_tegra.c: In function 'hda_tegra_runtime_suspend': sound/pci/hda/hda_tegra.c:273:2: error: implicit declaration of function 'hda_tegra_disable_clocks'; did you mean 'hda_tegra_enable_clocks'? [-Werror=implicit-function-declaration] Better remove the #ifdefs entirely and rely on the compiler silently dropping unused functions marked __maybe_unused. Fixes: 707e0759f2f4 ("ALSA: hda/tegra: implement runtime suspend/resume") Acked-by: Thierry Reding Signed-off-by: Arnd Bergmann Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_tegra.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/sound/pci/hda/hda_tegra.c b/sound/pci/hda/hda_tegra.c index dbd8da5685cb..3d68f9ef7694 100644 --- a/sound/pci/hda/hda_tegra.c +++ b/sound/pci/hda/hda_tegra.c @@ -219,7 +219,6 @@ disable_hda: return rc; } -#ifdef CONFIG_PM_SLEEP static void hda_tegra_disable_clocks(struct hda_tegra *data) { clk_disable_unprepare(data->hda2hdmi_clk); @@ -230,7 +229,7 @@ static void hda_tegra_disable_clocks(struct hda_tegra *data) /* * power management */ -static int hda_tegra_suspend(struct device *dev) +static int __maybe_unused hda_tegra_suspend(struct device *dev) { struct snd_card *card = dev_get_drvdata(dev); int rc; @@ -243,7 +242,7 @@ static int hda_tegra_suspend(struct device *dev) return 0; } -static int hda_tegra_resume(struct device *dev) +static int __maybe_unused hda_tegra_resume(struct device *dev) { struct snd_card *card = dev_get_drvdata(dev); int rc; @@ -255,10 +254,8 @@ static int hda_tegra_resume(struct device *dev) return 0; } -#endif /* CONFIG_PM_SLEEP */ -#ifdef CONFIG_PM -static int hda_tegra_runtime_suspend(struct device *dev) +static int __maybe_unused hda_tegra_runtime_suspend(struct device *dev) { struct snd_card *card = dev_get_drvdata(dev); struct azx *chip = card->private_data; @@ -275,7 +272,7 @@ static int hda_tegra_runtime_suspend(struct device *dev) return 0; } -static int hda_tegra_runtime_resume(struct device *dev) +static int __maybe_unused hda_tegra_runtime_resume(struct device *dev) { struct snd_card *card = dev_get_drvdata(dev); struct azx *chip = card->private_data; @@ -292,7 +289,6 @@ static int hda_tegra_runtime_resume(struct device *dev) return 0; } -#endif /* CONFIG_PM */ static const struct dev_pm_ops hda_tegra_pm = { SET_SYSTEM_SLEEP_PM_OPS(hda_tegra_suspend, hda_tegra_resume) From bb06c388fa20ae24cfe80c52488de718a7e3a53f Mon Sep 17 00:00:00 2001 From: zhengbin Date: Wed, 13 Mar 2019 16:01:37 +0800 Subject: [PATCH 445/855] 9p/net: 
fix memory leak in p9_client_create If msize is less than 4096, we should close and put the transport and destroy the tag pool, not just free the client. This patch fixes that. Link: http://lkml.kernel.org/m/1552464097-142659-1-git-send-email-zhengbin13@huawei.com Cc: stable@vger.kernel.org Fixes: 574d356b7a02 ("9p/net: put a lower bound on msize") Reported-by: Hulk Robot Signed-off-by: zhengbin Signed-off-by: Dominique Martinet --- net/9p/client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/9p/client.c b/net/9p/client.c index 357214a51f13..b85d51f4b8eb 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -1061,7 +1061,7 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) p9_debug(P9_DEBUG_ERROR, "Please specify a msize of at least 4k\n"); err = -EINVAL; - goto free_client; + goto close_trans; } err = p9_client_version(clnt); From cbc05fd6708c1744ee6a61cb4c461ff956d30524 Mon Sep 17 00:00:00 2001 From: Jian-Hong Pan Date: Wed, 13 Mar 2019 17:33:24 +0800 Subject: [PATCH 446/855] ALSA: hda/realtek: Enable headset MIC of Acer TravelMate X514-51T with ALC255 The Acer TravelMate X514-51T with ALC255 cannot detect the headset MIC until the ALC255_FIXUP_ACER_HEADSET_MIC quirk is applied. Although the internal DMIC uses another module, snd_soc_skl, as its driver, we still need the NID 0x1a entry in the quirk to enable the headset MIC. Signed-off-by: Jian-Hong Pan Signed-off-by: Kailang Yang Cc: Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index c8413d44973c..8debd661e6f2 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -5673,6 +5673,7 @@ enum { ALC225_FIXUP_HEADSET_JACK, ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE, ALC285_FIXUP_LENOVO_PC_BEEP_IN_NOISE, + ALC255_FIXUP_ACER_HEADSET_MIC, }; static const struct hda_fixup alc269_fixups[] = { @@ -6639,6 +6640,16 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC285_FIXUP_LENOVO_HEADPHONE_NOISE }, + [ALC255_FIXUP_ACER_HEADSET_MIC] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + { 0x19, 0x03a11130 }, + { 0x1a, 0x90a60140 }, /* use as internal mic */ + { } + }, + .chained = true, + .chain_id = ALC255_FIXUP_HEADSET_MODE_NO_HP_MIC + }, }; static const struct snd_pci_quirk alc269_fixup_tbl[] = { @@ -6658,6 +6669,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1025, 0x128f, "Acer Veriton Z6860G", ALC286_FIXUP_ACER_AIO_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1025, 0x1290, "Acer Veriton Z4860G", ALC286_FIXUP_ACER_AIO_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1025, 0x1291, "Acer Veriton Z4660G", ALC286_FIXUP_ACER_AIO_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1025, 0x1330, "Acer TravelMate X514-51T", ALC255_FIXUP_ACER_HEADSET_MIC), SND_PCI_QUIRK(0x1028, 0x0470, "Dell M101z", ALC269_FIXUP_DELL_M101Z), SND_PCI_QUIRK(0x1028, 0x054b, "Dell XPS one 2710", ALC275_FIXUP_DELL_XPS), SND_PCI_QUIRK(0x1028, 0x05bd, "Dell Latitude E6440", ALC292_FIXUP_DELL_E7X), From 10f5b1b85ed10a80d45bc2db450e65bd792efaad Mon Sep 17 00:00:00 2001 From: Kailang Yang Date: Thu, 21 Feb 2019 16:10:22 +0800 Subject: [PATCH 447/855] ALSA: hda/realtek - Fixed Headset Mic JD not stable The headset Mic jack-detection state is lost when Chrome OS boots with a headset already plugged in. Implement a reset of the combo jack JD so that the state is reported correctly.
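Concretely, the jack detector is reset by pulsing bit 15 of the relevant codec COEF index (0x4a on ALC295, 0x1b on ALC236), as in the hunk below:

	alc_update_coef_idx(codec, 0x4a, 0x8000, 1 << 15); /* Reset HP JD */
	alc_update_coef_idx(codec, 0x4a, 0x8000, 0 << 15);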
Fixes: e854747d7593 ("ALSA: hda/realtek - Enable headset button support for new codec") Signed-off-by: Kailang Yang Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 8debd661e6f2..156f4c4305a5 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -5521,6 +5521,26 @@ static void alc_fixup_headset_jack(struct hda_codec *codec, } } +static void alc295_fixup_chromebook(struct hda_codec *codec, + const struct hda_fixup *fix, int action) +{ + + switch (action) { + case HDA_FIXUP_ACT_PRE_PROBE: + switch (codec->core.vendor_id) { + case 0x10ec0295: + alc_update_coef_idx(codec, 0x4a, 0x8000, 1 << 15); /* Reset HP JD */ + alc_update_coef_idx(codec, 0x4a, 0x8000, 0 << 15); + break; + case 0x10ec0236: + alc_update_coef_idx(codec, 0x1b, 0x8000, 1 << 15); /* Reset HP JD */ + alc_update_coef_idx(codec, 0x1b, 0x8000, 0 << 15); + break; + } + break; + } +} + static void alc_fixup_disable_mic_vref(struct hda_codec *codec, const struct hda_fixup *fix, int action) { @@ -5674,6 +5694,7 @@ enum { ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE, ALC285_FIXUP_LENOVO_PC_BEEP_IN_NOISE, ALC255_FIXUP_ACER_HEADSET_MIC, + ALC295_FIXUP_CHROME_BOOK, }; static const struct hda_fixup alc269_fixups[] = { @@ -6650,6 +6671,12 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC255_FIXUP_HEADSET_MODE_NO_HP_MIC }, + [ALC295_FIXUP_CHROME_BOOK] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc295_fixup_chromebook, + .chained = true, + .chain_id = ALC225_FIXUP_HEADSET_JACK + }, }; static const struct snd_pci_quirk alc269_fixup_tbl[] = { @@ -7060,7 +7087,7 @@ static const struct hda_model_fixup alc269_fixup_models[] = { {.id = ALC255_FIXUP_DUMMY_LINEOUT_VERB, .name = "alc255-dummy-lineout"}, {.id = ALC255_FIXUP_DELL_HEADSET_MIC, .name = "alc255-dell-headset"}, {.id = ALC295_FIXUP_HP_X360, .name = "alc295-hp-x360"}, - {.id = ALC225_FIXUP_HEADSET_JACK, .name = "alc-sense-combo"}, + {.id = ALC295_FIXUP_CHROME_BOOK, .name = "alc-sense-combo"}, {} }; #define ALC225_STANDARD_PINS \ From 167897f4b32c2bc18b3b6183029a33fb420a114e Mon Sep 17 00:00:00 2001 From: Jaroslav Kysela Date: Wed, 13 Mar 2019 13:40:15 +0100 Subject: [PATCH 448/855] ALSA: hda - add more quirks for HP Z2 G4 and HP Z240 Apply the HP_MIC_NO_PRESENCE fixups for the more HP Z2 G4 and HP Z240 models. 
Reported-by: Jeff Burrell Signed-off-by: Jaroslav Kysela Cc: Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_conexant.c | 3 +++ sound/pci/hda/patch_realtek.c | 6 ++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c index a4ee7656d9ee..fb65ad31e86c 100644 --- a/sound/pci/hda/patch_conexant.c +++ b/sound/pci/hda/patch_conexant.c @@ -936,6 +936,9 @@ static const struct snd_pci_quirk cxt5066_fixups[] = { SND_PCI_QUIRK(0x103c, 0x8299, "HP 800 G3 SFF", CXT_FIXUP_HP_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x103c, 0x829a, "HP 800 G3 DM", CXT_FIXUP_HP_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x103c, 0x8455, "HP Z2 G4", CXT_FIXUP_HP_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x103c, 0x8456, "HP Z2 G4 SFF", CXT_FIXUP_HP_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x103c, 0x8457, "HP Z2 G4 mini", CXT_FIXUP_HP_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x103c, 0x8458, "HP Z2 G4 mini premium", CXT_FIXUP_HP_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1043, 0x138d, "Asus", CXT_FIXUP_HEADPHONE_MIC_PIN), SND_PCI_QUIRK(0x152d, 0x0833, "OLPC XO-1.5", CXT_FIXUP_OLPC_XO), SND_PCI_QUIRK(0x17aa, 0x20f2, "Lenovo T400", CXT_PINCFG_LENOVO_TP410), diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 156f4c4305a5..a63670013943 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -6803,11 +6803,13 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x2336, "HP", ALC269_FIXUP_HP_MUTE_LED_MIC1), SND_PCI_QUIRK(0x103c, 0x2337, "HP", ALC269_FIXUP_HP_MUTE_LED_MIC1), SND_PCI_QUIRK(0x103c, 0x221c, "HP EliteBook 755 G2", ALC280_FIXUP_HP_HEADSET_MIC), + SND_PCI_QUIRK(0x103c, 0x802e, "HP Z240 SFF", ALC221_FIXUP_HP_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x103c, 0x802f, "HP Z240", ALC221_FIXUP_HP_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x103c, 0x820d, "HP Pavilion 15", ALC269_FIXUP_HP_MUTE_LED_MIC3), SND_PCI_QUIRK(0x103c, 0x8256, "HP", ALC221_FIXUP_HP_FRONT_MIC), SND_PCI_QUIRK(0x103c, 0x827e, "HP x360", ALC295_FIXUP_HP_X360), - SND_PCI_QUIRK(0x103c, 0x82bf, "HP", ALC221_FIXUP_HP_MIC_NO_PRESENCE), - SND_PCI_QUIRK(0x103c, 0x82c0, "HP", ALC221_FIXUP_HP_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x103c, 0x82bf, "HP G3 mini", ALC221_FIXUP_HP_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x103c, 0x82c0, "HP G3 mini premium", ALC221_FIXUP_HP_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x103c, 0x83b9, "HP Spectre x360", ALC269_FIXUP_HP_MUTE_LED_MIC3), SND_PCI_QUIRK(0x1043, 0x103e, "ASUS X540SA", ALC256_FIXUP_ASUS_MIC), SND_PCI_QUIRK(0x1043, 0x103f, "ASUS TX300", ALC282_FIXUP_ASUS_TX300), From 31b265b3baaf55f209229888b7ffea523ddab366 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 8 Mar 2019 11:32:04 -0800 Subject: [PATCH 449/855] tracing: kdb: Fix ftdump to not sleep As reported back in 2016-11 [1], the "ftdump" kdb command triggers a BUG for "sleeping function called from invalid context". kdb's "ftdump" command wants to call ring_buffer_read_prepare() in atomic context. A very simple solution for this is to add allocation flags to ring_buffer_read_prepare() so kdb can call it without triggering the allocation error. This patch does that. Note that in the original email thread about this, it was suggested that perhaps the solution for kdb was to either preallocate the buffer ahead of time or create our own iterator. 
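With the extra gfp argument in place, callers that may sleep keep requesting GFP_KERNEL while kdb asks for an atomic allocation. Condensed from the hunks below (local names shortened for illustration), the two kinds of call sites look like:

	/* ordinary trace file readers, may sleep */
	iter->buffer_iter[cpu] = ring_buffer_read_prepare(buffer, cpu, GFP_KERNEL);

	/* kdb "ftdump", runs in atomic context */
	iter.buffer_iter[cpu] = ring_buffer_read_prepare(buffer, cpu, GFP_ATOMIC);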
I'm hoping that this alternative of adding allocation flags to ring_buffer_read_prepare() can be considered since it means I don't need to duplicate more of the core trace code into "trace_kdb.c" (for either creating my own iterator or re-preparing a ring allocator whose memory was already allocated). NOTE: another option for kdb is to actually figure out how to make it reuse the existing ftrace_dump() function and totally eliminate the duplication. This sounds very appealing and actually works (the "sr z" command can be seen to properly dump the ftrace buffer). The downside here is that ftrace_dump() fully consumes the trace buffer. Unless that is changed I'd rather not use it because it means "ftdump | grep xyz" won't be very useful to search the ftrace buffer since it will throw away the whole trace on the first grep. A future patch to dump only the last few lines of the buffer will also be hard to implement. [1] https://lkml.kernel.org/r/20161117191605.GA21459@google.com Link: http://lkml.kernel.org/r/20190308193205.213659-1-dianders@chromium.org Reported-by: Brian Norris Signed-off-by: Douglas Anderson Signed-off-by: Steven Rostedt (VMware) --- include/linux/ring_buffer.h | 2 +- kernel/trace/ring_buffer.c | 5 +++-- kernel/trace/trace.c | 6 ++++-- kernel/trace/trace_kdb.c | 6 ++++-- 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index f1429675f252..1a40277b512c 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -128,7 +128,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts, unsigned long *lost_events); struct ring_buffer_iter * -ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu); +ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu, gfp_t flags); void ring_buffer_read_prepare_sync(void); void ring_buffer_read_start(struct ring_buffer_iter *iter); void ring_buffer_read_finish(struct ring_buffer_iter *iter); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9a91479bbbfe..41b6f96e5366 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4191,6 +4191,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume); * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer * @buffer: The ring buffer to read from * @cpu: The cpu buffer to iterate over + * @flags: gfp flags to use for memory allocation * * This performs the initial preparations necessary to iterate * through the buffer. Memory is allocated, buffer recording @@ -4208,7 +4209,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume); * This overall must be paired with ring_buffer_read_finish. 
*/ struct ring_buffer_iter * -ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) +ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu, gfp_t flags) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_iter *iter; @@ -4216,7 +4217,7 @@ ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; - iter = kmalloc(sizeof(*iter), GFP_KERNEL); + iter = kmalloc(sizeof(*iter), flags); if (!iter) return NULL; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e9cc47e59d25..ccd759eaad79 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4077,7 +4077,8 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu); + ring_buffer_read_prepare(iter->trace_buffer->buffer, + cpu, GFP_KERNEL); } ring_buffer_read_prepare_sync(); for_each_tracing_cpu(cpu) { @@ -4087,7 +4088,8 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) } else { cpu = iter->cpu_file; iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu); + ring_buffer_read_prepare(iter->trace_buffer->buffer, + cpu, GFP_KERNEL); ring_buffer_read_prepare_sync(); ring_buffer_read_start(iter->buffer_iter[cpu]); tracing_iter_reset(iter, cpu); diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index d953c163a079..810d78a8d14c 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -51,14 +51,16 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) if (cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter.buffer_iter[cpu] = - ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu); + ring_buffer_read_prepare(iter.trace_buffer->buffer, + cpu, GFP_ATOMIC); ring_buffer_read_start(iter.buffer_iter[cpu]); tracing_iter_reset(&iter, cpu); } } else { iter.cpu_file = cpu_file; iter.buffer_iter[cpu_file] = - ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu_file); + ring_buffer_read_prepare(iter.trace_buffer->buffer, + cpu_file, GFP_ATOMIC); ring_buffer_read_start(iter.buffer_iter[cpu_file]); tracing_iter_reset(&iter, cpu_file); } From bf504110bc8aa05df48b0e5f0aa84bfb81e0574b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 4 Mar 2019 14:06:12 +0000 Subject: [PATCH 450/855] Btrfs: fix incorrect file size after shrinking truncate and fsync If we do a shrinking truncate against an inode which is already present in the respective log tree and then rename it, as part of logging the new name we end up logging an inode item that reflects the old size of the file (the one which we previously logged) and not the new smaller size. The decision to preserve the size previously logged was added by commit 1a4bcf470c886b ("Btrfs: fix fsync data loss after adding hard link to inode") in order to avoid data loss after replaying the log. However that decision is only needed for the case the logged inode size is smaller then the current size of the inode, as explained in that commit's change log. If the current size of the inode is smaller then the previously logged size, we know a shrinking truncate happened and therefore need to use that smaller size. 
Example to trigger the problem: $ mkfs.btrfs -f /dev/sdb $ mount /dev/sdb /mnt $ xfs_io -f -c "pwrite -S 0xab 0 8000" /mnt/foo $ xfs_io -c "fsync" /mnt/foo $ xfs_io -c "truncate 3000" /mnt/foo $ mv /mnt/foo /mnt/bar $ xfs_io -c "fsync" /mnt/bar $ mount /dev/sdb /mnt $ od -t x1 -A d /mnt/bar 0000000 ab ab ab ab ab ab ab ab ab ab ab ab ab ab ab ab * 0008000 Once we rename the file, we log its name (and inode item), and because the inode was already logged before in the current transaction, we log it with a size of 8000 bytes because that is the size we previously logged (with the first fsync). As part of the rename, besides logging the inode, we do also sync the log, which is done since commit d4682ba03ef618 ("Btrfs: sync log after logging new name"), so the next fsync against our inode is effectively a no-op, since no new changes happened since the rename operation. Even if did not sync the log during the rename operation, the same problem (fize size of 8000 bytes instead of 3000 bytes) would be visible after replaying the log if the log ended up getting synced to disk through some other means, such as for example by fsyncing some other modified file. In the example above the fsync after the rename operation is there just because not every filesystem may guarantee logging/journalling the inode (and syncing the log/journal) during the rename operation, for example it is needed for f2fs, but not for ext4 and xfs. Fix this scenario by, when logging a new name (which is triggered by rename and link operations), using the current size of the inode instead of the previously logged inode size. A test case for fstests follows soon. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202695 CC: stable@vger.kernel.org # 4.4+ Reported-by: Seulbae Kim Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f06454a55e00..5256cddf3a43 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4544,6 +4544,19 @@ static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); *size_ret = btrfs_inode_size(path->nodes[0], item); + /* + * If the in-memory inode's i_size is smaller then the inode + * size stored in the btree, return the inode's i_size, so + * that we get a correct inode size after replaying the log + * when before a power failure we had a shrinking truncate + * followed by addition of a new name (rename / new hard link). + * Otherwise return the inode size from the btree, to avoid + * data loss when replaying a log due to previously doing a + * write that expands the inode's size and logging a new name + * immediately after. + */ + if (*size_ret > inode->vfs_inode.i_size) + *size_ret = inode->vfs_inode.i_size; } btrfs_release_path(path); From 2cc8334270e281815c3850c3adea363c51f21e0d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 6 Mar 2019 17:13:04 -0500 Subject: [PATCH 451/855] btrfs: remove WARN_ON in log_dir_items When Filipe added the recursive directory logging stuff in 2f2ff0ee5e430 ("Btrfs: fix metadata inconsistencies after directory fsync") he specifically didn't take the directory i_mutex for the children directories that we need to log because of lockdep. 
This is generally fine, but can lead to this WARN_ON() tripping if we happen to run delayed deletion's in between our first search and our second search of dir_item/dir_indexes for this directory. We expect this to happen, so the WARN_ON() isn't necessary. Drop the WARN_ON() and add a comment so we know why this case can happen. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 5256cddf3a43..960ea49a7a0f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3578,9 +3578,16 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - /* find the first key from this transaction again */ + /* + * Find the first key from this transaction again. See the note for + * log_new_dir_dentries, if we're logging a directory recursively we + * won't be holding its i_mutex, which means we can modify the directory + * while we're logging it. If we remove an entry between our first + * search and this search we'll not find the key again and can just + * bail. + */ ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); - if (WARN_ON(ret != 0)) + if (ret != 0) goto done; /* From 609e804d771f59dc5d45a93e5ee0053c74bbe2bf Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 27 Feb 2019 13:42:30 +0000 Subject: [PATCH 452/855] Btrfs: fix file corruption after snapshotting due to mix of buffered/DIO writes When we are mixing buffered writes with direct IO writes against the same file and snapshotting is happening concurrently, we can end up with a corrupt file content in the snapshot. Example: 1) Inode/file is empty. 2) Snapshotting starts. 2) Buffered write at offset 0 length 256Kb. This updates the i_size of the inode to 256Kb, disk_i_size remains zero. This happens after the task doing the snapshot flushes all existing delalloc. 3) DIO write at offset 256Kb length 768Kb. Once the ordered extent completes it sets the inode's disk_i_size to 1Mb (256Kb + 768Kb) and updates the inode item in the fs tree with a size of 1Mb (which is the value of disk_i_size). 4) The dealloc for the range [0, 256Kb[ did not start yet. 5) The transaction used in the DIO ordered extent completion, which updated the inode item, is committed by the snapshotting task. 6) Snapshot creation completes. 7) Dealloc for the range [0, 256Kb[ is flushed. After that when reading the file from the snapshot we always get zeroes for the range [0, 256Kb[, the file has a size of 1Mb and the data written by the direct IO write is found. From an application's point of view this is a corruption, since in the source subvolume it could never read a version of the file that included the data from the direct IO write without the data from the buffered write included as well. In the snapshot's tree, file extent items are missing for the range [0, 256Kb[. The issue, obviously, does not happen when using the -o flushoncommit mount option. Fix this by flushing delalloc for all the roots that are about to be snapshotted when committing a transaction. This guarantees total ordering when updating the disk_i_size of an inode since the flush for dealloc is done when a transaction is in the TRANS_STATE_COMMIT_START state and wait is done once no more external writers exist. 
This is similar to what we do when using the flushoncommit mount option, but we do it only if the transaction has snapshots to create and only for the roots of the subvolumes to be snapshotted. The bulk of the dealloc is flushed in the snapshot creation ioctl, so the flush work we do inside the transaction is minimized. This issue, involving buffered and direct IO writes with snapshotting, is often triggered by fstest btrfs/078, and got reported by fsck when not using the NO_HOLES features, for example: $ cat results/btrfs/078.full (...) _check_btrfs_filesystem: filesystem on /dev/sdc is inconsistent *** fsck.btrfs output *** [1/7] checking root items [2/7] checking extents [3/7] checking free space cache [4/7] checking fs roots root 258 inode 264 errors 100, file extent discount Found file extent holes: start: 524288, len: 65536 ERROR: errors found in fs roots Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 49 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index acdad6d658f5..e4e665f422fc 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1886,8 +1886,10 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans) } } -static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) +static inline int btrfs_start_delalloc_flush(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; + /* * We use writeback_inodes_sb here because if we used * btrfs_start_delalloc_roots we would deadlock with fs freeze. @@ -1897,15 +1899,50 @@ static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) * from already being in a transaction and our join_transaction doesn't * have to re-take the fs freeze lock. */ - if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) + if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) { writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC); + } else { + struct btrfs_pending_snapshot *pending; + struct list_head *head = &trans->transaction->pending_snapshots; + + /* + * Flush dellaloc for any root that is going to be snapshotted. + * This is done to avoid a corrupted version of files, in the + * snapshots, that had both buffered and direct IO writes (even + * if they were done sequentially) due to an unordered update of + * the inode's size on disk. + */ + list_for_each_entry(pending, head, list) { + int ret; + + ret = btrfs_start_delalloc_snapshot(pending->root); + if (ret) + return ret; + } + } return 0; } -static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) +static inline void btrfs_wait_delalloc_flush(struct btrfs_trans_handle *trans) { - if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) + struct btrfs_fs_info *fs_info = trans->fs_info; + + if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) { btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); + } else { + struct btrfs_pending_snapshot *pending; + struct list_head *head = &trans->transaction->pending_snapshots; + + /* + * Wait for any dellaloc that we started previously for the roots + * that are going to be snapshotted. This is to avoid a corrupted + * version of files in the snapshots that had both buffered and + * direct IO writes (even if they were done sequentially). 
+ */ + list_for_each_entry(pending, head, list) + btrfs_wait_ordered_extents(pending->root, + U64_MAX, 0, U64_MAX); + } } int btrfs_commit_transaction(struct btrfs_trans_handle *trans) @@ -2023,7 +2060,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) extwriter_counter_dec(cur_trans, trans->type); - ret = btrfs_start_delalloc_flush(fs_info); + ret = btrfs_start_delalloc_flush(trans); if (ret) goto cleanup_transaction; @@ -2039,7 +2076,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (ret) goto cleanup_transaction; - btrfs_wait_delalloc_flush(fs_info); + btrfs_wait_delalloc_flush(trans); btrfs_scrub_pause(fs_info); /* From 0cc068e6ee59c1fffbfa977d8bf868b7551d80ac Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 7 Mar 2019 15:40:50 +0100 Subject: [PATCH 453/855] btrfs: don't report readahead errors and don't update statistics As readahead is an optimization, all errors are usually filtered out, but still properly handled when the real read call is done. The commit 5e9d398240b2 ("btrfs: readpages() should submit IO as read-ahead") added REQ_RAHEAD to readpages() because that's only used for readahead (despite what one would expect from the callback name). This causes a flood of messages and inflated read error stats, so skip reporting in case it's readahead. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202403 Reported-by: LimeTech Fixes: 5e9d398240b2 ("btrfs: readpages() should submit IO as read-ahead") CC: stable@vger.kernel.org # 4.19+ Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9024eee889b9..db934ceae9c1 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6407,7 +6407,7 @@ static void btrfs_end_bio(struct bio *bio) if (bio_op(bio) == REQ_OP_WRITE) btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); - else + else if (!(bio->bi_opf & REQ_RAHEAD)) btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); if (bio->bi_opf & REQ_PREFLUSH) From bf263c35b2ebe7f1674205f6b36487250299b5a7 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 12 Mar 2019 13:44:42 +0100 Subject: [PATCH 454/855] i2c: add extra check to safe DMA buffer helper Make sure we report 'no buffer' for 0-length messages. This can only happen if threshold is set to 0 which is kind of bogus but we should still handle this situation. Update the docs and add a debug message to educate callers of this function. Reported-by: Hsin-Yi Wang Fixes: e94bc5d18be0 ("i2c: add helpers to ease DMA handling") Signed-off-by: Wolfram Sang Reviewed-by: Hsin-Yi Wang Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-core-base.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c index cb6c5cb0df0b..38af18645133 100644 --- a/drivers/i2c/i2c-core-base.c +++ b/drivers/i2c/i2c-core-base.c @@ -2258,7 +2258,8 @@ EXPORT_SYMBOL(i2c_put_adapter); /** * i2c_get_dma_safe_msg_buf() - get a DMA safe buffer for the given i2c_msg * @msg: the message to be checked - * @threshold: the minimum number of bytes for which using DMA makes sense + * @threshold: the minimum number of bytes for which using DMA makes sense. + * Should at least be 1. * * Return: NULL if a DMA safe buffer was not obtained. Use msg->buf with PIO. * Or a valid pointer to be used with DMA. 
After use, release it by @@ -2268,7 +2269,11 @@ EXPORT_SYMBOL(i2c_put_adapter); */ u8 *i2c_get_dma_safe_msg_buf(struct i2c_msg *msg, unsigned int threshold) { - if (msg->len < threshold) + /* also skip 0-length msgs for bogus thresholds of 0 */ + if (!threshold) + pr_debug("DMA buffer for addr=0x%02x with length 0 is bogus\n", + msg->addr); + if (msg->len < threshold || msg->len == 0) return NULL; if (msg->flags & I2C_M_DMA_SAFE) From 77f3381a83c2f66daeb6719a1191a87280d57f62 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 12 Mar 2019 15:55:53 +0100 Subject: [PATCH 455/855] i2c: i2c-designware-platdrv: Cleanup setting of the adapter number i2c-designware-platdrv assumes that if the pdev has an apci-companion it should use a dynamic adapter-nr and otherwise it will use pdev->id as adapter-nr. Before this commit the setting of the adapter.nr was somewhat convoluted, in the acpi_companion case it was set from dw_i2c_acpi_configure, in the non acpi_companion case it was set from dw_i2c_set_fifo_size based on tx_fifo_depth not being set yet indicating that dw_i2c_acpi_configure was not executed. This cleans this up, directly setting the adapter-nr from dw_i2c_plat_probe for both cases. Signed-off-by: Hans de Goede Reviewed-by: Andy Shevchenko Acked-by: Jarkko Nikula Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-platdrv.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-platdrv.c b/drivers/i2c/busses/i2c-designware-platdrv.c index ead5e7de3e4d..30529839cbd2 100644 --- a/drivers/i2c/busses/i2c-designware-platdrv.c +++ b/drivers/i2c/busses/i2c-designware-platdrv.c @@ -86,7 +86,6 @@ static int dw_i2c_acpi_configure(struct platform_device *pdev) struct i2c_timings *t = &dev->timings; u32 ss_ht = 0, fp_ht = 0, hs_ht = 0, fs_ht = 0; - dev->adapter.nr = -1; dev->tx_fifo_depth = 32; dev->rx_fifo_depth = 32; @@ -219,7 +218,7 @@ static void i2c_dw_configure_slave(struct dw_i2c_dev *dev) dev->mode = DW_IC_SLAVE; } -static void dw_i2c_set_fifo_size(struct dw_i2c_dev *dev, int id) +static void dw_i2c_set_fifo_size(struct dw_i2c_dev *dev) { u32 param, tx_fifo_depth, rx_fifo_depth; @@ -233,7 +232,6 @@ static void dw_i2c_set_fifo_size(struct dw_i2c_dev *dev, int id) if (!dev->tx_fifo_depth) { dev->tx_fifo_depth = tx_fifo_depth; dev->rx_fifo_depth = rx_fifo_depth; - dev->adapter.nr = id; } else if (tx_fifo_depth >= 2) { dev->tx_fifo_depth = min_t(u32, dev->tx_fifo_depth, tx_fifo_depth); @@ -358,13 +356,17 @@ static int dw_i2c_plat_probe(struct platform_device *pdev) div_u64(clk_khz * t->sda_hold_ns + 500000, 1000000); } - dw_i2c_set_fifo_size(dev, pdev->id); + dw_i2c_set_fifo_size(dev); adap = &dev->adapter; adap->owner = THIS_MODULE; adap->class = I2C_CLASS_DEPRECATED; ACPI_COMPANION_SET(&adap->dev, ACPI_COMPANION(&pdev->dev)); adap->dev.of_node = pdev->dev.of_node; + if (has_acpi_companion(&pdev->dev)) + adap->nr = -1; + else + adap->nr = pdev->id; dev_pm_set_driver_flags(&pdev->dev, DPM_FLAG_SMART_PREPARE | From cd86d1403bb4c80e443d736b2a692cbf68a9f471 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 12 Mar 2019 15:55:54 +0100 Subject: [PATCH 456/855] i2c: i2c-designware-platdrv: Always use a dynamic adapter number Before this commit the i2c-designware-platdrv assumes that if the pdev has an apci-companion it should use a dynamic adapter-nr and it sets adapter->nr to -1, otherwise it will use pdev->id as adapter->nr. 
There are 3 ways how platform_device-s to which i2c-designware-platdrv will bind can be instantiated: 1) Through of / devicetree 2) Through ACPI enumeration 3) Explicitly instantiated through platform_device_create + add 1) In case of devicetree-instantiation the drivers/of code always sets pdev->id to PLATFORM_DEVID_NONE, which is -1 so in this case both paths to set adapter->nr end up doing the same thing. 2) In case of ACPI instantiation the device will always have an ACPI-companion, so we are already using dynamic adapter-nrs. 3) There are 2 places manually instantiating a designware_i2c platform_dev: drivers/mfd/intel_quark_i2c_gpio.c drivers/mfd/intel-lpss.c In the intel_quark_i2c_gpio.c case pdev->id is always 0, so switching to dynamic adapter-nrs here could lead to the bus-number no longer being stable, but the quark X1000 only has 1 i2c-controller, which will also be assigned bus-number 0 when using dynamic adapter-nrs. In the intel-lpss.c case intel_lpss_probe() is called from either intel-lpss-acpi.c in which case there always is an ACPI-companion, or from intel-lpss-pci.c. In most cases devices handled by intel-lpss-pci.c also have an ACPI-companion, so we use a dynamic adapter-nr. But in some cases the ACPI-companion is missing and we would use pdev->id (allocated from intel_lpss_devid_ida). Devices which use the intel-lpss-pci.c code typically have many i2c busses, so using pdev->id in this case may lead to a bus-number conflict, triggering a WARN(id < 0, "couldn't get idr") in i2c-core-base.c causing an oops an the adapter registration to fail. So in this case using non dynamic adapter-nrs is actually undesirable. One machine on which this oops was triggering is the Apollo Lake based Acer TravelMate Spin B118. TL;DR: Switching to always using dynamic adapter-numbers does not make any difference in most cases and in the one case where it does make a difference the behavior change is desirable because the old behavior caused an oops. BugLink: https://bugzilla.redhat.com/show_bug.cgi?id=1687065 Signed-off-by: Hans de Goede Acked-by: Andy Shevchenko Acked-by: Jarkko Nikula Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-platdrv.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-platdrv.c b/drivers/i2c/busses/i2c-designware-platdrv.c index 30529839cbd2..416f89b8f881 100644 --- a/drivers/i2c/busses/i2c-designware-platdrv.c +++ b/drivers/i2c/busses/i2c-designware-platdrv.c @@ -363,10 +363,7 @@ static int dw_i2c_plat_probe(struct platform_device *pdev) adap->class = I2C_CLASS_DEPRECATED; ACPI_COMPANION_SET(&adap->dev, ACPI_COMPANION(&pdev->dev)); adap->dev.of_node = pdev->dev.of_node; - if (has_acpi_companion(&pdev->dev)) - adap->nr = -1; - else - adap->nr = pdev->id; + adap->nr = -1; dev_pm_set_driver_flags(&pdev->dev, DPM_FLAG_SMART_PREPARE | From e3a22850664ffbe5b786c80249b3cfea61b659c6 Mon Sep 17 00:00:00 2001 From: Riku Voipio Date: Wed, 2 Jan 2019 11:23:04 +0200 Subject: [PATCH 457/855] deb-pkg: generate correct build dependencies bison/flex is now needed always for building for kconfig. Some build dependencies depend on kernel configuration, enable them as needed: - libelf-dev when UNWINDER_ORC is set - libssl-dev for SYSTEM_TRUSTED_KEYRING Since the libssl-dev is needed for extract_cert binary, denote with :native to install the libssl-dev for the build machines architecture, rather than for the architecture of the kernel being built. 
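As an illustration of the result (derived from the control template below, exact spacing aside), a kernel configured with UNWINDER_ORC and SYSTEM_TRUSTED_KEYRING ends up with a dependency line roughly like:

	Build-Depends: bc, kmod, cpio, bison, flex | flex:native, libelf-dev, libssl-dev:native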
Tested-by: Manivannan Sadhasivam Signed-off-by: Riku Voipio Reviewed-by: Ben Hutchings Acked-by: maximilian attems [masahiro.yamada: change 'flex' to 'flex | flex:native' ] Signed-off-by: Masahiro Yamada --- scripts/package/mkdebian | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/package/mkdebian b/scripts/package/mkdebian index f030961c5165..110cd8d2a226 100755 --- a/scripts/package/mkdebian +++ b/scripts/package/mkdebian @@ -134,6 +134,8 @@ fi mkdir -p debian/ echo $debarch > debian/arch +extra_build_depends=", $(if_enabled_echo UNWINDER_ORC libelf-dev)" +extra_build_depends="$extra_build_depends, $(if_enabled_echo SYSTEM_TRUSTED_KEYRING libssl-dev:native)" # Generate a simple changelog template cat < debian/changelog @@ -170,7 +172,7 @@ Source: $sourcename Section: kernel Priority: optional Maintainer: $maintainer -Build-Depends: bc, kmod, cpio +Build-Depends: bc, kmod, cpio, bison, flex | flex:native $extra_build_depends Homepage: http://www.kernel.org/ Package: $packagename From 13d3bc7152d0f00b243dd4863f8feac81f7f84c7 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 25 Jan 2019 12:41:38 +0900 Subject: [PATCH 458/855] libfdt: prefix header search paths with $(srctree)/ Currently, the Kbuild core manipulates header search paths in a crazy way [1]. To fix this mess, I want all Makefiles to add explicit $(srctree)/ to the search paths in the srctree. Some Makefiles are already written in that way, but not all. The goal of this work is to make the notation consistent, and finally get rid of the gross hacks. Having whitespaces after -I does not matter since commit 48f6e3cf5bc6 ("kbuild: do not drop -I without parameter"). [1]: https://patchwork.kernel.org/patch/9632347/ Signed-off-by: Masahiro Yamada --- lib/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Makefile b/lib/Makefile index 4e066120a0d6..3b08673e8881 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -213,7 +213,7 @@ KCOV_INSTRUMENT_stackdepot.o := n libfdt_files = fdt.o fdt_ro.o fdt_wip.o fdt_rw.o fdt_sw.o fdt_strerror.o \ fdt_empty_tree.o $(foreach file, $(libfdt_files), \ - $(eval CFLAGS_$(file) = -I$(src)/../scripts/dtc/libfdt)) + $(eval CFLAGS_$(file) = -I $(srctree)/scripts/dtc/libfdt)) lib-$(CONFIG_LIBFDT) += $(libfdt_files) obj-$(CONFIG_RBTREE_TEST) += rbtree_test.o From 393492b567fd849a92795d28316292fbcaf48af1 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 25 Jan 2019 16:18:23 +0900 Subject: [PATCH 459/855] ia64: prefix header search path with $(srctree)/ Currently, the Kbuild core manipulates header search paths in a crazy way [1]. To fix this mess, I want all Makefiles to add explicit $(srctree)/ to the search paths in the srctree. Some Makefiles are already written in that way, but not all. The goal of this work is to make the notation consistent, and finally get rid of the gross hacks. Having whitespaces after -I does not matter since commit 48f6e3cf5bc6 ("kbuild: do not drop -I without parameter"). I removed some header search paths because I was able to build ia64 without them. 
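The conversion is mechanical; for example, a line such as

	ccflags-y := -Iarch/ia64/sn/include

becomes

	ccflags-y := -I $(srctree)/arch/ia64/sn/include

so the path explicitly refers to the source tree instead of relying on the implicit search-path handling described in [1].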
[1]: https://patchwork.kernel.org/patch/9632347/ Signed-off-by: Masahiro Yamada --- arch/ia64/sn/kernel/Makefile | 2 +- arch/ia64/sn/kernel/sn2/Makefile | 2 -- arch/ia64/sn/pci/Makefile | 2 -- arch/ia64/sn/pci/pcibr/Makefile | 2 +- 4 files changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/ia64/sn/kernel/Makefile b/arch/ia64/sn/kernel/Makefile index d27df1d45da7..9c349dd23265 100644 --- a/arch/ia64/sn/kernel/Makefile +++ b/arch/ia64/sn/kernel/Makefile @@ -7,7 +7,7 @@ # Copyright (C) 1999,2001-2006,2008 Silicon Graphics, Inc. All Rights Reserved. # -ccflags-y := -Iarch/ia64/sn/include +ccflags-y := -I $(srctree)/arch/ia64/sn/include obj-y += setup.o bte.o bte_error.o irq.o mca.o idle.o \ huberror.o io_acpi_init.o io_common.o \ diff --git a/arch/ia64/sn/kernel/sn2/Makefile b/arch/ia64/sn/kernel/sn2/Makefile index 3d09108d4277..170bde4549da 100644 --- a/arch/ia64/sn/kernel/sn2/Makefile +++ b/arch/ia64/sn/kernel/sn2/Makefile @@ -9,7 +9,5 @@ # sn2 specific kernel files # -ccflags-y := -Iarch/ia64/sn/include - obj-y += cache.o io.o ptc_deadlock.o sn2_smp.o sn_proc_fs.o \ prominfo_proc.o timer.o timer_interrupt.o sn_hwperf.o diff --git a/arch/ia64/sn/pci/Makefile b/arch/ia64/sn/pci/Makefile index df2a90145426..321576b1b425 100644 --- a/arch/ia64/sn/pci/Makefile +++ b/arch/ia64/sn/pci/Makefile @@ -7,6 +7,4 @@ # # Makefile for the sn pci general routines. -ccflags-y := -Iarch/ia64/sn/include - obj-y := pci_dma.o tioca_provider.o tioce_provider.o pcibr/ diff --git a/arch/ia64/sn/pci/pcibr/Makefile b/arch/ia64/sn/pci/pcibr/Makefile index 396bcae36309..712f6af7c6e0 100644 --- a/arch/ia64/sn/pci/pcibr/Makefile +++ b/arch/ia64/sn/pci/pcibr/Makefile @@ -7,7 +7,7 @@ # # Makefile for the sn2 io routines. -ccflags-y := -Iarch/ia64/sn/include +ccflags-y := -I $(srctree)/arch/ia64/sn/include obj-y += pcibr_dma.o pcibr_reg.o \ pcibr_ate.o pcibr_provider.o From 46c7dd56d54133e3fb9414844d90e563627f3feb Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 1 Feb 2019 13:50:45 +0900 Subject: [PATCH 460/855] modpost: always show verbose warning for section mismatch Unless CONFIG_DEBUG_SECTION_MISMATCH is enabled, modpost only shows the number of section mismatches. If you want to know the symbols causing the issue, you need to rebuild with CONFIG_DEBUG_SECTION_MISMATCH. It is tedious. I think it is fine to show annoying warning when a new section mismatch comes in. 
Signed-off-by: Masahiro Yamada --- scripts/Makefile.modpost | 1 - scripts/mod/modpost.c | 27 +++++---------------------- 2 files changed, 5 insertions(+), 23 deletions(-) diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost index c0b7f526f95e..6b7f354f189a 100644 --- a/scripts/Makefile.modpost +++ b/scripts/Makefile.modpost @@ -77,7 +77,6 @@ modpost = scripts/mod/modpost \ $(if $(KBUILD_EXTMOD),-I $(modulesymfile)) \ $(if $(KBUILD_EXTRA_SYMBOLS), $(patsubst %, -e %,$(KBUILD_EXTRA_SYMBOLS))) \ $(if $(KBUILD_EXTMOD),-o $(modulesymfile)) \ - $(if $(CONFIG_DEBUG_SECTION_MISMATCH),,-S) \ $(if $(CONFIG_SECTION_MISMATCH_WARN_ONLY),,-E) \ $(if $(KBUILD_EXTMOD)$(KBUILD_MODPOST_WARN),-w) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 26bf886bd168..0b0d1080b1c5 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -35,7 +35,6 @@ static int vmlinux_section_warnings = 1; static int warn_unresolved = 0; /* How a symbol is exported */ static int sec_mismatch_count = 0; -static int sec_mismatch_verbose = 1; static int sec_mismatch_fatal = 0; /* ignore missing files */ static int ignore_missing_files; @@ -1406,8 +1405,6 @@ static void report_sec_mismatch(const char *modname, char *prl_to; sec_mismatch_count++; - if (!sec_mismatch_verbose) - return; get_pretty_name(from_is_func, &from, &from_p); get_pretty_name(to_is_func, &to, &to_p); @@ -1655,9 +1652,7 @@ static void extable_mismatch_handler(const char* modname, struct elf_info *elf, sec_mismatch_count++; - if (sec_mismatch_verbose) - report_extable_warnings(modname, elf, mismatch, r, sym, - fromsec, tosec); + report_extable_warnings(modname, elf, mismatch, r, sym, fromsec, tosec); if (match(tosec, mismatch->bad_tosec)) fatal("The relocation at %s+0x%lx references\n" @@ -2433,7 +2428,7 @@ int main(int argc, char **argv) struct ext_sym_list *extsym_iter; struct ext_sym_list *extsym_start = NULL; - while ((opt = getopt(argc, argv, "i:I:e:mnsST:o:awE")) != -1) { + while ((opt = getopt(argc, argv, "i:I:e:mnsT:o:awE")) != -1) { switch (opt) { case 'i': kernel_read = optarg; @@ -2465,9 +2460,6 @@ int main(int argc, char **argv) case 's': vmlinux_section_warnings = 0; break; - case 'S': - sec_mismatch_verbose = 0; - break; case 'T': files_source = optarg; break; @@ -2525,18 +2517,9 @@ int main(int argc, char **argv) } if (dump_write) write_dump(dump_write); - if (sec_mismatch_count) { - if (!sec_mismatch_verbose) { - warn("modpost: Found %d section mismatch(es).\n" - "To see full details build your kernel with:\n" - "'make CONFIG_DEBUG_SECTION_MISMATCH=y'\n", - sec_mismatch_count); - } - if (sec_mismatch_fatal) { - fatal("modpost: Section mismatches detected.\n" - "Set CONFIG_SECTION_MISMATCH_WARN_ONLY=y to allow them.\n"); - } - } + if (sec_mismatch_count && sec_mismatch_fatal) + fatal("modpost: Section mismatches detected.\n" + "Set CONFIG_SECTION_MISMATCH_WARN_ONLY=y to allow them.\n"); free(buf.p); return err; From 898f5a009f226fbaee0ff9ea58b919a31f627d1e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 5 Feb 2019 16:33:37 +0900 Subject: [PATCH 461/855] kbuild: move archive command to scripts/Makefile.lib scripts/Makefile.build and arch/s390/boot/Makefile use the same command (thin archiving with symbol table creation). Avoid the code duplication, and move it to scripts/Makefile.lib. 
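With the shared rule in scripts/Makefile.lib, any Makefile that wants a thin archive with a symbol table only declares the target and its objects and calls the common command, as the s390 boot Makefile keeps doing below:

	$(obj)/startup.a: $(OBJECTS) FORCE
		$(call if_changed,ar)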
Signed-off-by: Masahiro Yamada --- arch/s390/boot/Makefile | 3 --- scripts/Makefile.build | 6 +----- scripts/Makefile.lib | 6 ++++++ 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile index 11ca8795b74a..c844eaf24ed7 100644 --- a/arch/s390/boot/Makefile +++ b/arch/s390/boot/Makefile @@ -57,9 +57,6 @@ $(obj)/section_cmp%: vmlinux $(obj)/compressed/vmlinux FORCE $(obj)/compressed/vmlinux: $(obj)/startup.a FORCE $(Q)$(MAKE) $(build)=$(obj)/compressed $@ -quiet_cmd_ar = AR $@ - cmd_ar = rm -f $@; $(AR) rcsTP$(KBUILD_ARFLAGS) $@ $(real-prereqs) - $(obj)/startup.a: $(OBJECTS) FORCE $(call if_changed,ar) diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 850b611f4aba..2554a15ecf2b 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -426,13 +426,9 @@ $(modorder-target): $(subdir-ym) FORCE # Rule to compile a set of .o files into one .a file (with symbol table) # ifdef lib-target -quiet_cmd_link_l_target = AR $@ - -# lib target archives do get a symbol table and index -cmd_link_l_target = rm -f $@; $(AR) rcsTP$(KBUILD_ARFLAGS) $@ $(real-prereqs) $(lib-target): $(lib-y) FORCE - $(call if_changed,link_l_target) + $(call if_changed,ar) targets += $(lib-target) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index c0abd9a779c3..8a1f64f17740 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -233,6 +233,12 @@ $(obj)/%: $(src)/%_shipped quiet_cmd_ld = LD $@ cmd_ld = $(LD) $(ld_flags) $(real-prereqs) -o $@ +# Archive +# --------------------------------------------------------------------------- + +quiet_cmd_ar = AR $@ + cmd_ar = rm -f $@; $(AR) rcsTP$(KBUILD_ARFLAGS) $@ $(real-prereqs) + # Objcopy # --------------------------------------------------------------------------- From fc2b47b55f17fd996f7a01975ce1c33c2f2513f6 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 15 Feb 2019 13:04:26 +0900 Subject: [PATCH 462/855] h8300: use cc-cross-prefix instead of hardcoding h8300-unknown-linux- It believe it is a bad idea to hardcode a specific compiler prefix that may or may not be installed on a user's system. It is annoying when testing features that should not require compilers at all. For example, mrproper, headers_install, etc. should work without any compiler. They look like follows on my machine. $ make ARCH=h8300 mrproper ./scripts/gcc-version.sh: line 26: h8300-unknown-linux-gcc: command not found ./scripts/gcc-version.sh: line 27: h8300-unknown-linux-gcc: command not found make: h8300-unknown-linux-gcc: Command not found make: h8300-unknown-linux-gcc: Command not found [ a bunch of the same error messages continue ] $ make ARCH=h8300 headers_install ./scripts/gcc-version.sh: line 26: h8300-unknown-linux-gcc: command not found ./scripts/gcc-version.sh: line 27: h8300-unknown-linux-gcc: command not found make: h8300-unknown-linux-gcc: Command not found HOSTCC scripts/basic/fixdep make: h8300-unknown-linux-gcc: Command not found WRAP arch/h8300/include/generated/uapi/asm/kvm_para.h [ snip ] The solution is to delete this line, or to use cc-cross-prefix like some architectures do. I chose the latter as a moderate fixup. 
I added an alternative 'h8300-linux-' because it is available at: https://mirrors.edge.kernel.org/pub/tools/crosstool/files/bin/x86_64/8.1.0/ Signed-off-by: Masahiro Yamada --- arch/h8300/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/h8300/Makefile b/arch/h8300/Makefile index f801f3708a89..ba0f26cfad61 100644 --- a/arch/h8300/Makefile +++ b/arch/h8300/Makefile @@ -27,7 +27,7 @@ KBUILD_LDFLAGS += $(ldflags-y) CHECKFLAGS += -msize-long ifeq ($(CROSS_COMPILE),) -CROSS_COMPILE := h8300-unknown-linux- +CROSS_COMPILE := $(call cc-cross-prefix, h8300-unknown-linux- h8300-linux-) endif core-y += arch/$(ARCH)/kernel/ arch/$(ARCH)/mm/ From c649bd59b6d43998d8c408e2a2fa1922ce5d8363 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 8 Mar 2019 14:30:35 +0900 Subject: [PATCH 463/855] unicore32: simplify linker script generation for decompressor When I was searching for unneeded $(KCONFIG_CONFIG) usages, I noticed this strange build dependency. It can use $(call if_changed,...) in case ZTEXTADDR and ZBSSADDR are changed, but even a simpler way is to use the pattern rule in scripts/Makefile.build. This is what arch/arm/boot/compressed/Makefile does. I did only build test. I confirmed equivalent vmlinux.lds was generated. Signed-off-by: Masahiro Yamada --- arch/unicore32/boot/compressed/Makefile | 5 +---- .../boot/compressed/{vmlinux.lds.in => vmlinux.lds.S} | 0 2 files changed, 1 insertion(+), 4 deletions(-) rename arch/unicore32/boot/compressed/{vmlinux.lds.in => vmlinux.lds.S} (100%) diff --git a/arch/unicore32/boot/compressed/Makefile b/arch/unicore32/boot/compressed/Makefile index 9aecdd3ddc48..150fafc32fb0 100644 --- a/arch/unicore32/boot/compressed/Makefile +++ b/arch/unicore32/boot/compressed/Makefile @@ -61,7 +61,4 @@ $(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head.o $(obj)/piggy.o \ ZTEXTADDR := 0x03000000 ZBSSADDR := ALIGN(4) -SEDFLAGS_lds = s/TEXT_START/$(ZTEXTADDR)/;s/BSS_START/$(ZBSSADDR)/ -$(obj)/vmlinux.lds: $(obj)/vmlinux.lds.in arch/unicore32/boot/Makefile $(KCONFIG_CONFIG) - @sed "$(SEDFLAGS_lds)" < $< > $@ - +CPPFLAGS_vmlinux.lds = -DTEXT_START="$(ZTEXTADDR)" -DBSS_START="$(ZBSSADDR)" diff --git a/arch/unicore32/boot/compressed/vmlinux.lds.in b/arch/unicore32/boot/compressed/vmlinux.lds.S similarity index 100% rename from arch/unicore32/boot/compressed/vmlinux.lds.in rename to arch/unicore32/boot/compressed/vmlinux.lds.S From 94cf8acc38e57caaba1329a255409b6d93936ba7 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 8 Mar 2019 14:49:10 +0900 Subject: [PATCH 464/855] kbuild: source include/config/auto.conf instead of ${KCONFIG_CONFIG} As commit 423a8155facf ("kbuild: Fix reading of .config in link-vmlinux.sh") addressed, some shells fail to perform '.' if ${KCONFIG_CONFIG} does not contain a slash at all. Instead, we can source include/config/auto.conf, which obviously contain slashes, and we do not expect its file path overridden by a user. Perhaps, the performance might be slightly better since unset CONFIG options are stripped from include/config/auto.conf. scripts/setlocalversion already works this way. 
Signed-off-by: Masahiro Yamada --- scripts/adjust_autoksyms.sh | 9 +-------- scripts/link-vmlinux.sh | 9 +-------- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/scripts/adjust_autoksyms.sh b/scripts/adjust_autoksyms.sh index 6e6d63957da3..84bf6b500815 100755 --- a/scripts/adjust_autoksyms.sh +++ b/scripts/adjust_autoksyms.sh @@ -39,14 +39,7 @@ case "$KBUILD_VERBOSE" in esac # We need access to CONFIG_ symbols -case "${KCONFIG_CONFIG}" in -*/*) - . "${KCONFIG_CONFIG}" - ;; -*) - # Force using a file from the current directory - . "./${KCONFIG_CONFIG}" -esac +. include/config/auto.conf # Generate a new ksym list file with symbols needed by the current # set of modules. diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index bc7f1fc1f55b..dc0e8c5a1402 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -171,14 +171,7 @@ if [ "$1" = "clean" ]; then fi # We need access to CONFIG_ symbols -case "${KCONFIG_CONFIG}" in -*/*) - . "${KCONFIG_CONFIG}" - ;; -*) - # Force using a file from the current directory - . "./${KCONFIG_CONFIG}" -esac +. include/config/auto.conf # Update version info GEN .version From 2b50f7ab63685cd247e32ad321f7338ed130d3d5 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 8 Mar 2019 18:13:39 +0900 Subject: [PATCH 465/855] kbuild: add workaround for Debian make-kpkg Since commit 3812b8c5c5d5 ("kbuild: make -r/-R effective in top Makefile for old Make versions"), make-kpkg is not working. make-kpkg directly includes the top Makefile of Linux kernel, and appends some debian_* targets. /usr/share/kernel-package/ruleset/kernel_version.mk: # Include the kernel makefile override dot-config := 1 include Makefile dot-config := 1 I did not know the kernel Makefile was used in that way, and it is hard to guarantee the behavior when the kernel Makefile is included by another Makefile from a different project. It looks like Debian Stretch stopped providing make-kpkg. Maybe it is obsolete and being replaced with 'make deb-pkg' etc. but still widely used. This commit adds a workaround; if the top Makefile is included by another Makefile, skip sub-make in order to make the main part visible. 'MAKEFLAGS += -rR' does not become effective for GNU Make < 4.0, but Debian/Ubuntu is already using newer versions. The effect of this commit: Debian 8 (Jessie) : Fixed Debian 9 (Stretch) : make-kpkg (kernel-package) is not provided Ubuntu 14.04 LTS : NOT Fixed Ubuntu 16.04 LTS : Fixed Ubuntu 18.04 LTS : Fixed This commit cannot fix Ubuntu 14.04 because it installs GNU Make 3.81, but its support will end in Apr 2019, which is before the Linux v5.1 release. I added warning so that nobody would try to include the top Makefile. Fixes: 3812b8c5c5d5 ("kbuild: make -r/-R effective in top Makefile for old Make versions") Reported-by: Liz Zhang Signed-off-by: Masahiro Yamada Tested-by: Lili Deng Cc: Manoj Srivastava --- Makefile | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Makefile b/Makefile index 9ef547fc7ffe..1ef92d705984 100644 --- a/Makefile +++ b/Makefile @@ -31,6 +31,16 @@ _all: # descending is started. They are now explicitly listed as the # prepare rule. +# Ugly workaround for Debian make-kpkg: +# make-kpkg directly includes the top Makefile of Linux kernel. In such a case, +# skip sub-make to support debian_* targets in ruleset/kernel_version.mk, but +# displays warning to discourage such abusage. 
+ifneq ($(word 2, $(MAKEFILE_LIST)),) +$(warning Do not include top Makefile of Linux Kernel) +sub-make-done := 1 +MAKEFLAGS += -rR +endif + ifneq ($(sub-make-done),1) # Do not use make's built-in rules and variables From 7e548e9a54bf4ca420f1874e4a602cafe0ed7671 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 8 Mar 2019 18:56:23 +0900 Subject: [PATCH 466/855] kbuild: deb-pkg: add CONFIG_ prefix to kernel config options This might be a kind of bike-shed, but I personally prefer grep'able code. I often do 'git grep CONFIG_FOO' instead of 'git grep FOO' when I want to know where that CONFIG option is used. This makes code longer, but I hope this is acceptable level. Signed-off-by: Masahiro Yamada --- scripts/package/mkdebian | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/scripts/package/mkdebian b/scripts/package/mkdebian index 110cd8d2a226..fb76dd1b72e8 100755 --- a/scripts/package/mkdebian +++ b/scripts/package/mkdebian @@ -7,7 +7,7 @@ set -e is_enabled() { - grep -q "^CONFIG_$1=y" $KCONFIG_CONFIG + grep -q "^$1=y" $KCONFIG_CONFIG } if_enabled_echo() { @@ -31,23 +31,23 @@ set_debarch() { x86_64) debarch=amd64 ;; sparc*) - debarch=sparc$(if_enabled_echo 64BIT 64) ;; + debarch=sparc$(if_enabled_echo CONFIG_64BIT 64) ;; s390*) debarch=s390x ;; ppc*) - if is_enabled 64BIT; then - debarch=ppc64$(if_enabled_echo CPU_LITTLE_ENDIAN el) + if is_enabled CONFIG_64BIT; then + debarch=ppc64$(if_enabled_echo CONFIG_CPU_LITTLE_ENDIAN el) else - debarch=powerpc$(if_enabled_echo SPE spe) + debarch=powerpc$(if_enabled_echo CONFIG_SPE spe) fi ;; parisc*) debarch=hppa ;; mips*) - if is_enabled CPU_LITTLE_ENDIAN; then - debarch=mips$(if_enabled_echo 64BIT 64)$(if_enabled_echo CPU_MIPSR6 r6)el - elif is_enabled CPU_MIPSR6; then - debarch=mips$(if_enabled_echo 64BIT 64)r6 + if is_enabled CONFIG_CPU_LITTLE_ENDIAN; then + debarch=mips$(if_enabled_echo CONFIG_64BIT 64)$(if_enabled_echo CONFIG_CPU_MIPSR6 r6)el + elif is_enabled CONFIG_CPU_MIPSR6; then + debarch=mips$(if_enabled_echo CONFIG_64BIT 64)r6 else debarch=mips fi @@ -55,8 +55,8 @@ set_debarch() { aarch64|arm64) debarch=arm64 ;; arm*) - if is_enabled AEABI; then - debarch=arm$(if_enabled_echo VFP hf el) + if is_enabled CONFIG_AEABI; then + debarch=arm$(if_enabled_echo CONFIG_VFP hf el) else debarch=arm fi @@ -64,10 +64,10 @@ set_debarch() { openrisc) debarch=or1k ;; sh) - if is_enabled CPU_SH3; then - debarch=sh3$(if_enabled_echo CPU_BIG_ENDIAN eb) - elif is_enabled CPU_SH4; then - debarch=sh4$(if_enabled_echo CPU_BIG_ENDIAN eb) + if is_enabled CONFIG_CPU_SH3; then + debarch=sh3$(if_enabled_echo CONFIG_CPU_BIG_ENDIAN eb) + elif is_enabled CONFIG_CPU_SH4; then + debarch=sh4$(if_enabled_echo CONFIG_CPU_BIG_ENDIAN eb) fi ;; esac @@ -134,8 +134,8 @@ fi mkdir -p debian/ echo $debarch > debian/arch -extra_build_depends=", $(if_enabled_echo UNWINDER_ORC libelf-dev)" -extra_build_depends="$extra_build_depends, $(if_enabled_echo SYSTEM_TRUSTED_KEYRING libssl-dev:native)" +extra_build_depends=", $(if_enabled_echo CONFIG_UNWINDER_ORC libelf-dev)" +extra_build_depends="$extra_build_depends, $(if_enabled_echo CONFIG_SYSTEM_TRUSTED_KEYRING libssl-dev:native)" # Generate a simple changelog template cat < debian/changelog From 515f4c633daee7654a199deed3e1939c7933ae2c Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 8 Mar 2019 18:56:24 +0900 Subject: [PATCH 467/855] kbuild: deb-pkg: introduce is_enabled and if_enabled_echo to builddeb I think is_enabled() and if_enable_echo() in 
scripts/package/mkdebian are useful. builddeb also has many repetitive greps over the kernel config, so I borrowed the idea to clean it up. Signed-off-by: Masahiro Yamada --- scripts/package/builddeb | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/scripts/package/builddeb b/scripts/package/builddeb index 8ac25d10a6ad..e2cb43895e14 100755 --- a/scripts/package/builddeb +++ b/scripts/package/builddeb @@ -12,6 +12,18 @@ set -e +is_enabled() { + grep -q "^$1=y" $KCONFIG_CONFIG +} + +if_enabled_echo() { + if is_enabled "$1"; then + echo -n "$2" + elif [ $# -ge 3 ]; then + echo -n "$3" + fi +} + create_package() { local pname="$1" pdir="$2" @@ -62,7 +74,7 @@ parisc|mips|powerpc) installed_image_path="boot/vmlinuz-$version" esac -BUILD_DEBUG="$(grep -s '^CONFIG_DEBUG_INFO=y' $KCONFIG_CONFIG || true)" +BUILD_DEBUG=$(if_enabled_echo CONFIG_DEBUG_INFO Yes) # Setup the directory structure rm -rf "$tmpdir" "$kernel_headers_dir" "$libc_headers_dir" "$dbg_dir" $objtree/debian/files @@ -83,14 +95,14 @@ else fi cp "$($MAKE -s -f $srctree/Makefile image_name)" "$tmpdir/$installed_image_path" -if grep -q "^CONFIG_OF_EARLY_FLATTREE=y" $KCONFIG_CONFIG ; then +if is_enabled CONFIG_OF_EARLY_FLATTREE; then # Only some architectures with OF support have this target if [ -d "${srctree}/arch/$SRCARCH/boot/dts" ]; then $MAKE -f $srctree/Makefile INSTALL_DTBS_PATH="$tmpdir/usr/lib/$packagename" dtbs_install fi fi -if grep -q '^CONFIG_MODULES=y' $KCONFIG_CONFIG ; then +if is_enabled CONFIG_MODULES; then INSTALL_MOD_PATH="$tmpdir" $MAKE -f $srctree/Makefile modules_install rm -f "$tmpdir/lib/modules/$version/build" rm -f "$tmpdir/lib/modules/$version/source" @@ -111,8 +123,7 @@ if grep -q '^CONFIG_MODULES=y' $KCONFIG_CONFIG ; then done # resign stripped modules - MODULE_SIG_ALL="$(grep -s '^CONFIG_MODULE_SIG_ALL=y' $KCONFIG_CONFIG || true)" - if [ -n "$MODULE_SIG_ALL" ]; then + if is_enabled CONFIG_MODULE_SIG_ALL; then INSTALL_MOD_PATH="$tmpdir" $MAKE -f $srctree/Makefile modules_sign fi fi @@ -129,11 +140,6 @@ fi # make-kpkg sets $INITRD to indicate whether an initramfs is wanted, and # so do we; recent versions of dracut and initramfs-tools will obey this. 
debhookdir=${KDEB_HOOKDIR:-/etc/kernel} -if grep -q '^CONFIG_BLK_DEV_INITRD=y' $KCONFIG_CONFIG; then - want_initrd=Yes -else - want_initrd=No -fi for script in postinst postrm preinst prerm ; do mkdir -p "$tmpdir$debhookdir/$script.d" cat < "$tmpdir/DEBIAN/$script" @@ -145,7 +151,7 @@ set -e export DEB_MAINT_PARAMS="\$*" # Tell initramfs builder whether it's wanted -export INITRD=$want_initrd +export INITRD=$(if_enabled_echo CONFIG_BLK_DEV_INITRD Yes No) test -d $debhookdir/$script.d && run-parts --arg="$version" --arg="/$installed_image_path" $debhookdir/$script.d exit 0 @@ -158,11 +164,11 @@ done (cd $srctree; find arch/*/include include scripts -type f -o -type l) >> "$objtree/debian/hdrsrcfiles" (cd $srctree; find arch/$SRCARCH -name module.lds -o -name Kbuild.platforms -o -name Platform) >> "$objtree/debian/hdrsrcfiles" (cd $srctree; find $(find arch/$SRCARCH -name include -o -name scripts -type d) -type f) >> "$objtree/debian/hdrsrcfiles" -if grep -q '^CONFIG_STACK_VALIDATION=y' $KCONFIG_CONFIG ; then +if is_enabled CONFIG_STACK_VALIDATION; then (cd $objtree; find tools/objtool -type f -executable) >> "$objtree/debian/hdrobjfiles" fi (cd $objtree; find arch/$SRCARCH/include Module.symvers include scripts -type f) >> "$objtree/debian/hdrobjfiles" -if grep -q '^CONFIG_GCC_PLUGINS=y' $KCONFIG_CONFIG ; then +if is_enabled CONFIG_GCC_PLUGINS; then (cd $objtree; find scripts/gcc-plugins -name \*.so -o -name gcc-common.h) >> "$objtree/debian/hdrobjfiles" fi destdir=$kernel_headers_dir/usr/src/linux-headers-$version From 6fb7ef5a343dea78e71600314cbb5e5b7466243b Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 8 Mar 2019 18:56:25 +0900 Subject: [PATCH 468/855] kbuild: pkg: grep include/config/auto.conf instead of $KCONFIG_CONFIG This will be a little more efficient since unset CONFIG options are stripped away from auto.conf, and we can hard-code the path to auto.conf since it is never overridden. include/config/kernel.release is generated before %pkg is run. So, it is guaranteed auto.conf is up-to-date. Signed-off-by: Masahiro Yamada --- scripts/package/builddeb | 2 +- scripts/package/buildtar | 2 +- scripts/package/mkdebian | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/package/builddeb b/scripts/package/builddeb index e2cb43895e14..b03dd56a4782 100755 --- a/scripts/package/builddeb +++ b/scripts/package/builddeb @@ -13,7 +13,7 @@ set -e is_enabled() { - grep -q "^$1=y" $KCONFIG_CONFIG + grep -q "^$1=y" include/config/auto.conf } if_enabled_echo() { diff --git a/scripts/package/buildtar b/scripts/package/buildtar index cfd2a4a3fe42..2f66c81e4021 100755 --- a/scripts/package/buildtar +++ b/scripts/package/buildtar @@ -56,7 +56,7 @@ dirs=boot # # Try to install modules # -if grep -q '^CONFIG_MODULES=y' "${KCONFIG_CONFIG}"; then +if grep -q '^CONFIG_MODULES=y' include/config/auto.conf; then make ARCH="${ARCH}" -f ${srctree}/Makefile INSTALL_MOD_PATH="${tmpdir}" modules_install dirs="$dirs lib" fi diff --git a/scripts/package/mkdebian b/scripts/package/mkdebian index fb76dd1b72e8..8068328a079c 100755 --- a/scripts/package/mkdebian +++ b/scripts/package/mkdebian @@ -7,7 +7,7 @@ set -e is_enabled() { - grep -q "^$1=y" $KCONFIG_CONFIG + grep -q "^$1=y" include/config/auto.conf } if_enabled_echo() { From d9d53ed3f77ff4057ce714c0d169c28a653504e7 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Wed, 13 Mar 2019 18:54:54 +0100 Subject: [PATCH 469/855] nvme: add get-feature to admin cmds tracer This will print get-feature cmd in more informative way. 
For example, run "nvme get-feature /dev/nvme0 -n 1 -f 0x9 -c 10" will trace: nvme-3907 [008] .... 1763.635054: nvme_setup_cmd: nvme0: qid=0, cmdid=6, nsid=1, flags=0x0, meta=0x0, cmd=(nvme_admin_get_features fid=0x9 sel=0x0 cdw11=0xa) -0 [001] d.h. 1763.635112: nvme_sq: nvme0: qid=0, head=27, tail=27 -0 [008] ..s. 1763.635121: nvme_complete_rq: nvme0: qid=0, cmdid=6, res=10, retries=0, flags=0x2, status=0 Signed-off-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/trace.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c index 58456de78bb2..5f24ea7a28eb 100644 --- a/drivers/nvme/host/trace.c +++ b/drivers/nvme/host/trace.c @@ -50,7 +50,19 @@ static const char *nvme_trace_admin_identify(struct trace_seq *p, u8 *cdw10) return ret; } +static const char *nvme_trace_admin_get_features(struct trace_seq *p, + u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 fid = cdw10[0]; + u8 sel = cdw10[1] & 0x7; + u32 cdw11 = get_unaligned_le32(cdw10 + 4); + trace_seq_printf(p, "fid=0x%x sel=0x%x cdw11=0x%x", fid, sel, cdw11); + trace_seq_putc(p, 0); + + return ret; +} static const char *nvme_trace_read_write(struct trace_seq *p, u8 *cdw10) { @@ -101,6 +113,8 @@ const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, return nvme_trace_create_cq(p, cdw10); case nvme_admin_identify: return nvme_trace_admin_identify(p, cdw10); + case nvme_admin_get_features: + return nvme_trace_admin_get_features(p, cdw10); default: return nvme_trace_common(p, cdw10); } From 415df90b437f2b026ed37af2f812e41fc06c7f90 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 13 Mar 2019 18:54:55 +0100 Subject: [PATCH 470/855] nvme: don't warn on block content change effects A write or flush IO passthrough command is expected to change the logical block content, so don't warn on these as no additional handling is necessary. Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 07bf2bff3a76..dc1641247b17 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1250,7 +1250,7 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, if (ns) { if (ctrl->effects) effects = le32_to_cpu(ctrl->effects->iocs[opcode]); - if (effects & ~NVME_CMD_EFFECTS_CSUPP) + if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC)) dev_warn(ctrl->device, "IO command:%02x has unhandled effects:%08x\n", opcode, effects); From 81fe92849928d65159d707b7b28febffbef94559 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 13 Mar 2019 18:54:56 +0100 Subject: [PATCH 471/855] nvme-trace: fix cdw10 buffer overrun The field is defined to be a 24 byte array, we don't need to multiply the sizeof() that field by the number of dwords it covers. 
Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h index 244d7c177e5a..97d3c77365b8 100644 --- a/drivers/nvme/host/trace.h +++ b/drivers/nvme/host/trace.h @@ -108,7 +108,7 @@ TRACE_EVENT(nvme_setup_cmd, __entry->metadata = le64_to_cpu(cmd->common.metadata); __assign_disk_name(__entry->disk, req->rq_disk); memcpy(__entry->cdw10, &cmd->common.cdw10, - 6 * sizeof(__entry->cdw10)); + sizeof(__entry->cdw10)); ), TP_printk("nvme%d: %sqid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", __entry->ctrl_id, __print_disk_name(__entry->disk), From a63b83700ba89c300f705167d06bf122f3666287 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 13 Mar 2019 18:54:57 +0100 Subject: [PATCH 472/855] nvme: put ns_head ref if namespace fails allocation In case nvme_alloc_ns fails after we initialize ns_head but before we add the ns to the controller namespaces list we need to explicitly put the ns_head reference because when we teardown the controller we won't find it, causing us to leak a dangling subsystem eventually. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index dc1641247b17..d57a84f45ed0 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3304,6 +3304,7 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) mutex_lock(&ctrl->subsys->lock); list_del_rcu(&ns->siblings); mutex_unlock(&ctrl->subsys->lock); + nvme_put_ns_head(ns->head); out_free_id: kfree(id); out_free_queue: From 01fc08ff1f2f3f17d5947f18e62ed93c391aa3ce Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Wed, 13 Mar 2019 18:54:58 +0100 Subject: [PATCH 473/855] nvme: update comment to make the code easier to read After commit a686ed75c0fb ("nvme: introduce a helper function for controller deletion), nvme_delete_ctrl_sync no longer use flush_work. Update comment, accordingly. Signed-off-by: Yufen Yu Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d57a84f45ed0..b92fab434066 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -179,8 +179,8 @@ static int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl) int ret = 0; /* - * Keep a reference until the work is flushed since ->delete_ctrl - * can free the controller. + * Keep a reference until nvme_do_delete_ctrl() complete, + * since ->delete_ctrl can free the controller. */ nvme_get_ctrl(ctrl); if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) From d11de63f2b519f0a162b834013b6d3a46dbf3886 Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Wed, 13 Mar 2019 18:54:59 +0100 Subject: [PATCH 474/855] nvme-loop: init nvmet_ctrl fatal_err_work when allocate After commit 4d43d395fe (workqueue: Try to catch flush_work() without INIT_WORK()), it can cause warning when delete nvme-loop device, trace like: [ 76.601272] Call Trace: [ 76.601646] ? del_timer+0x72/0xa0 [ 76.602156] __cancel_work_timer+0x1ae/0x270 [ 76.602791] cancel_work_sync+0x14/0x20 [ 76.603407] nvmet_ctrl_free+0x1b7/0x2f0 [nvmet] [ 76.604091] ? 
free_percpu+0x168/0x300 [ 76.604652] nvmet_sq_destroy+0x106/0x240 [nvmet] [ 76.605346] nvme_loop_destroy_admin_queue+0x30/0x60 [nvme_loop] [ 76.606220] nvme_loop_shutdown_ctrl+0xc3/0xf0 [nvme_loop] [ 76.607026] nvme_loop_delete_ctrl_host+0x19/0x30 [nvme_loop] [ 76.607871] nvme_do_delete_ctrl+0x75/0xb0 [ 76.608477] nvme_sysfs_delete+0x7d/0xc0 [ 76.609057] dev_attr_store+0x24/0x40 [ 76.609603] sysfs_kf_write+0x4c/0x60 [ 76.610144] kernfs_fop_write+0x19a/0x260 [ 76.610742] __vfs_write+0x1c/0x60 [ 76.611246] vfs_write+0xfa/0x280 [ 76.611739] ksys_write+0x6e/0x120 [ 76.612238] __x64_sys_write+0x1e/0x30 [ 76.612787] do_syscall_64+0xbf/0x3a0 [ 76.613329] entry_SYSCALL_64_after_hwframe+0x44/0xa9 We fix it by moving fatal_err_work init to nvmet_alloc_ctrl(), which may more reasonable. Signed-off-by: Yufen Yu Reviewed-by: Sagi Grimberg Reviewed-by: Bart Van Assche Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/core.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index d44ede147263..2d73b66e3686 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1163,6 +1163,15 @@ static void nvmet_release_p2p_ns_map(struct nvmet_ctrl *ctrl) put_device(ctrl->p2p_client); } +static void nvmet_fatal_error_handler(struct work_struct *work) +{ + struct nvmet_ctrl *ctrl = + container_of(work, struct nvmet_ctrl, fatal_err_work); + + pr_err("ctrl %d fatal error occurred!\n", ctrl->cntlid); + ctrl->ops->delete_ctrl(ctrl); +} + u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp) { @@ -1205,6 +1214,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work); INIT_LIST_HEAD(&ctrl->async_events); INIT_RADIX_TREE(&ctrl->p2p_ns_map, GFP_KERNEL); + INIT_WORK(&ctrl->fatal_err_work, nvmet_fatal_error_handler); memcpy(ctrl->subsysnqn, subsysnqn, NVMF_NQN_SIZE); memcpy(ctrl->hostnqn, hostnqn, NVMF_NQN_SIZE); @@ -1308,21 +1318,11 @@ void nvmet_ctrl_put(struct nvmet_ctrl *ctrl) kref_put(&ctrl->ref, nvmet_ctrl_free); } -static void nvmet_fatal_error_handler(struct work_struct *work) -{ - struct nvmet_ctrl *ctrl = - container_of(work, struct nvmet_ctrl, fatal_err_work); - - pr_err("ctrl %d fatal error occurred!\n", ctrl->cntlid); - ctrl->ops->delete_ctrl(ctrl); -} - void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl) { mutex_lock(&ctrl->lock); if (!(ctrl->csts & NVME_CSTS_CFS)) { ctrl->csts |= NVME_CSTS_CFS; - INIT_WORK(&ctrl->fatal_err_work, nvmet_fatal_error_handler); schedule_work(&ctrl->fatal_err_work); } mutex_unlock(&ctrl->lock); From 9f7d8ae2f79479ce13d987c8f3b1500b8937fc5d Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 13 Mar 2019 18:55:00 +0100 Subject: [PATCH 475/855] nvme-fc: use nr_phys_segments to determine existence of sgl For some nvme command, when issued by the nvme core layer, there is an internal buffer which can cause blk_rq_payload_bytes() to return a non-zero value yet there is no actual/real command payload and sg list. An example is the WRITE ZEROES command. To address this, when making choices on whether to dma map an sgl, use blk_rq_nr_phys_segments() instead of blk_rq_payload_bytes(). When there is a sgl, blk_rq_payload_bytes() will return the amount of data to be transferred by the sgl. 
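Put differently, the rule being applied is roughly the following (a sketch only; map_payload() and send_nodata() are hypothetical placeholders, not nvme-fc functions):

  if (blk_rq_nr_phys_segments(rq)) {
          /* a real sg list exists; its length is what payload_bytes reports */
          map_payload(rq, blk_rq_payload_bytes(rq));
  } else {
          /* e.g. WRITE ZEROES: payload_bytes can be non-zero even though
           * there is nothing to map, so send the command without data */
          send_nodata(rq);
  }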
Signed-off-by: Chaitanya Kulkarni Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/fc.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index b29b12498a1a..ba8f2a9cbdaf 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2107,7 +2107,7 @@ nvme_fc_map_data(struct nvme_fc_ctrl *ctrl, struct request *rq, freq->sg_cnt = 0; - if (!blk_rq_payload_bytes(rq)) + if (!blk_rq_nr_phys_segments(rq)) return 0; freq->sg_table.sgl = freq->first_sgl; @@ -2304,12 +2304,23 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx, if (ret) return ret; - data_len = blk_rq_payload_bytes(rq); - if (data_len) + /* + * nvme core doesn't quite treat the rq opaquely. Commands such + * as WRITE ZEROES will return a non-zero rq payload_bytes yet + * there is no actual payload to be transferred. + * To get it right, key data transmission on there being 1 or + * more physical segments in the sg list. If there is no + * physical segments, there is no payload. + */ + if (blk_rq_nr_phys_segments(rq)) { + data_len = blk_rq_payload_bytes(rq); io_dir = ((rq_data_dir(rq) == WRITE) ? NVMEFC_FCP_WRITE : NVMEFC_FCP_READ); - else + } else { + data_len = 0; io_dir = NVMEFC_FCP_NODATA; + } + return nvme_fc_start_fcp_op(ctrl, queue, op, data_len, io_dir); } From 06f3d71ea071b70e62bcc146cd9ff7ed1f9d4e43 Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 13 Mar 2019 18:55:01 +0100 Subject: [PATCH 476/855] nvme-fc: fix numa_node when dev is null A recent change added a numa_node field to the nvme controller and has the transport assign the node using dev_to_node(). However, fcloop registers with a NULL device struct, so the dev_to_node() call oops. Revise the assignment to assign no node when device struct is null. Fixes: 103e515efa89b ("nvme: add a numa_node field to struct nvme_ctrl") Reported-by: Mike Snitzer Signed-off-by: James Smart Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Mike Snitzer [hch: small coding style fixup] Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/fc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index ba8f2a9cbdaf..23f6bad19274 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -3017,7 +3017,10 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, ctrl->ctrl.opts = opts; ctrl->ctrl.nr_reconnects = 0; - ctrl->ctrl.numa_node = dev_to_node(lport->dev); + if (lport->dev) + ctrl->ctrl.numa_node = dev_to_node(lport->dev); + else + ctrl->ctrl.numa_node = NUMA_NO_NODE; INIT_LIST_HEAD(&ctrl->ctrl_list); ctrl->lport = lport; ctrl->rport = rport; From 834d3710a093aa18c8aa88e6e1892180abadebaf Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 13 Mar 2019 18:55:02 +0100 Subject: [PATCH 477/855] nvme-fc: reject reconnect if io queue count is reduced to zero If: - A successful connect has occurred with an io queue count greater than zero and namespaces detected and running. - An error or something occurs which causes a termination of the prior association and then starts a reconnect, - The reconnect then creates a new controller, but for whatever reason, nvme_set_queue_count() results in io queue count set to zero. This will skip io queue and tag set changes. - But... 
the controller will transition to live, calling nvme_start_ctrl, which calls nvme_start_queues(), which then releases I/Os into the transport which then sends them to the driver. As there are no queues, things eventually hit the driver looking for a handle, which was cleared when the original controller was reset, and it can't proceed. Worst case, things progress, but everything fails. In the failing scenario, the nvme_set_features(NVME_FEAT_NUM_QUEUES) command actually failed with a NVME_SC_INTERNAL error. For some reason, although nvme_set_queue_count() saw the error and set io queue count to zero, it doesn't return a failure status to the transport, which allows the transport to continue using the controller. Fix the problem by simply rejecting the new association if at least 1 I/O queue can't be created. The association reject will fail the reconnect attempt and fall into the reconnect retry policy. Signed-off-by: James Smart Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/fc.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 23f6bad19274..f3b9d91ba0df 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2475,6 +2475,7 @@ static int nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl) { struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; + u32 prior_ioq_cnt = ctrl->ctrl.queue_count - 1; unsigned int nr_io_queues; int ret; @@ -2487,6 +2488,13 @@ nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl) return ret; } + if (!nr_io_queues && prior_ioq_cnt) { + dev_info(ctrl->ctrl.device, + "Fail Reconnect: At least 1 io queue " + "required (was %d)\n", prior_ioq_cnt); + return -ENOSPC; + } + ctrl->ctrl.queue_count = nr_io_queues + 1; /* check for io queues existing */ if (ctrl->ctrl.queue_count == 1) @@ -2500,6 +2508,10 @@ nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl) if (ret) goto out_delete_hw_queues; + if (prior_ioq_cnt != nr_io_queues) + dev_info(ctrl->ctrl.device, + "reconnect: revising io queue count from %d to %d\n", + prior_ioq_cnt, nr_io_queues); blk_mq_update_nr_hw_queues(&ctrl->tag_set, nr_io_queues); return 0; From 0191e7405b687339a5540c1562acdecefd70eb3f Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 13 Mar 2019 18:55:03 +0100 Subject: [PATCH 478/855] nvmet-fc: fix issues with targetport assoc_list list walking There are two changes: 1) The logic in the __nvmet_fc_free_assoc() routine is bad. It uses "safe" routines assuming pointers will come back valid. However, the intervening next structure being linked can be removed from the list and the resulting safe pointers are bad, resulting in NULL ptrs being hit. Correct by scheduling a work element to perform the association delete, which can be done while under lock. 2) Prior patch that added the work element scheduling left a possible reference on the object if the work element couldn't be scheduled. Correct by doing the put on a failing schedule_work() call. Signed-off-by: Nigel Kirkland Signed-off-by: James Smart Reviewed-by: Ewan D. 
Milne Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/fc.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 1e9654f04c60..6b7bbf39fa06 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -1143,10 +1143,8 @@ __nvmet_fc_free_assocs(struct nvmet_fc_tgtport *tgtport) &tgtport->assoc_list, a_list) { if (!nvmet_fc_tgt_a_get(assoc)) continue; - spin_unlock_irqrestore(&tgtport->lock, flags); - nvmet_fc_delete_target_assoc(assoc); - nvmet_fc_tgt_a_put(assoc); - spin_lock_irqsave(&tgtport->lock, flags); + if (!schedule_work(&assoc->del_work)) + nvmet_fc_tgt_a_put(assoc); } spin_unlock_irqrestore(&tgtport->lock, flags); } @@ -1185,7 +1183,8 @@ nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl) nvmet_fc_tgtport_put(tgtport); if (found_ctrl) { - schedule_work(&assoc->del_work); + if (!schedule_work(&assoc->del_work)) + nvmet_fc_tgt_a_put(assoc); return; } From 404ec31df434fdae515202952b5e230c1b983ee1 Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 13 Mar 2019 18:55:04 +0100 Subject: [PATCH 479/855] nvmet-fc: bring Disconnect into compliance with FC-NVME spec The FC-NVME spec, when finally approved, modified the disconnect LS such that the only scope available is the association. Rework the Disconnect LS processing to be in accordance with the change. Signed-off-by: Nigel Kirkland Signed-off-by: James Smart Reviewed-by: Ewan D. Milne Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/fc.c | 33 ++------------------------------- 1 file changed, 2 insertions(+), 31 deletions(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 6b7bbf39fa06..98b7b1f4ee96 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -1502,10 +1502,8 @@ nvmet_fc_ls_disconnect(struct nvmet_fc_tgtport *tgtport, (struct fcnvme_ls_disconnect_rqst *)iod->rqstbuf; struct fcnvme_ls_disconnect_acc *acc = (struct fcnvme_ls_disconnect_acc *)iod->rspbuf; - struct nvmet_fc_tgt_queue *queue = NULL; struct nvmet_fc_tgt_assoc *assoc; int ret = 0; - bool del_assoc = false; memset(acc, 0, sizeof(*acc)); @@ -1536,18 +1534,7 @@ nvmet_fc_ls_disconnect(struct nvmet_fc_tgtport *tgtport, assoc = nvmet_fc_find_target_assoc(tgtport, be64_to_cpu(rqst->associd.association_id)); iod->assoc = assoc; - if (assoc) { - if (rqst->discon_cmd.scope == - FCNVME_DISCONN_CONNECTION) { - queue = nvmet_fc_find_target_queue(tgtport, - be64_to_cpu( - rqst->discon_cmd.id)); - if (!queue) { - nvmet_fc_tgt_a_put(assoc); - ret = VERR_NO_CONN; - } - } - } else + if (!assoc) ret = VERR_NO_ASSOC; } @@ -1575,26 +1562,10 @@ nvmet_fc_ls_disconnect(struct nvmet_fc_tgtport *tgtport, sizeof(struct fcnvme_ls_disconnect_acc)), FCNVME_LS_DISCONNECT); - - /* are we to delete a Connection ID (queue) */ - if (queue) { - int qid = queue->qid; - - nvmet_fc_delete_target_queue(queue); - - /* release the get taken by find_target_queue */ - nvmet_fc_tgt_q_put(queue); - - /* tear association down if io queue terminated */ - if (!qid) - del_assoc = true; - } - /* release get taken in nvmet_fc_find_target_assoc */ nvmet_fc_tgt_a_put(iod->assoc); - if (del_assoc) - nvmet_fc_delete_target_assoc(iod->assoc); + nvmet_fc_delete_target_assoc(iod->assoc); } From 7b210e4ed5e281728243799c5e2b84d3f70d4dd1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Mar 2019 18:55:05 +0100 Subject: [PATCH 480/855] nvme: disable Write Zeroes for qemu controllers Qemu started out with a broken 
implementation of Write Zeroes written by yours truly. Disable Write Zeroes on qemu for now, eventually we need to go back and make all the qemu quirks version specific, but that is left for another time. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Tested-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 3 ++- drivers/nvme/host/nvme.h | 5 +++++ drivers/nvme/host/pci.c | 3 ++- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index b92fab434066..951e9f31b57c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1531,7 +1531,8 @@ static inline void nvme_config_write_zeroes(struct nvme_ns *ns) u32 max_sectors; unsigned short bs = 1 << ns->lba_shift; - if (!(ns->ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES)) + if (!(ns->ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) || + (ns->ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES)) return; /* * Even though NVMe spec explicitly states that MDTS is not diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index b91f1838bbd5..527d64545023 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -87,6 +87,11 @@ enum nvme_quirks { * Ignore device provided subnqn. */ NVME_QUIRK_IGNORE_DEV_SUBNQN = (1 << 8), + + /* + * Broken Write Zeroes. + */ + NVME_QUIRK_DISABLE_WRITE_ZEROES = (1 << 9), }; /* diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f54718b63637..3a2377888a46 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2975,7 +2975,8 @@ static const struct pci_device_id nvme_id_table[] = { { PCI_VDEVICE(INTEL, 0xf1a6), /* Intel 760p/Pro 7600p */ .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, }, { PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */ - .driver_data = NVME_QUIRK_IDENTIFY_CNS, }, + .driver_data = NVME_QUIRK_IDENTIFY_CNS | + NVME_QUIRK_DISABLE_WRITE_ZEROES, }, { PCI_DEVICE(0x1bb1, 0x0100), /* Seagate Nytro Flash Storage */ .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, { PCI_DEVICE(0x1c58, 0x0003), /* HGST adapter */ From b1aafb35b45b1d734c670059c125a4ff111a47bd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Mar 2019 18:55:06 +0100 Subject: [PATCH 481/855] nvme: remove nvme_ns_config_oncs Just opencode the two function calls in the caller. 
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Max Gurtovoy Reviewed-by: Chaitanya Kulkarni Tested-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 951e9f31b57c..26ae805fc958 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1552,12 +1552,6 @@ static inline void nvme_config_write_zeroes(struct nvme_ns *ns) blk_queue_max_write_zeroes_sectors(ns->queue, max_sectors); } -static inline void nvme_ns_config_oncs(struct nvme_ns *ns) -{ - nvme_config_discard(ns); - nvme_config_write_zeroes(ns); -} - static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, struct nvme_id_ns *id, struct nvme_ns_ids *ids) { @@ -1611,7 +1605,9 @@ static void nvme_update_disk_info(struct gendisk *disk, capacity = 0; set_capacity(disk, capacity); - nvme_ns_config_oncs(ns); + + nvme_config_discard(ns); + nvme_config_write_zeroes(ns); if (id->nsattr & (1 << 0)) set_disk_ro(disk, true); From 2631857160ecbea04e54423f5053133fe2b6ea45 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Mar 2019 18:55:07 +0100 Subject: [PATCH 482/855] nvme: add proper discard setup for the multipath device Add a gendisk argument to nvme_config_discard so that the call to nvme_update_disk_info for the multipath device node updates the proper request_queue. Signed-off-by: Christoph Hellwig Reported-by: Sagi Grimberg Reviewed-by: Keith Busch Reviewed-by: Max Gurtovoy Tested-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 26ae805fc958..6a57ece7d76b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1495,10 +1495,10 @@ static void nvme_set_chunk_size(struct nvme_ns *ns) blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size)); } -static void nvme_config_discard(struct nvme_ns *ns) +static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns) { struct nvme_ctrl *ctrl = ns->ctrl; - struct request_queue *queue = ns->queue; + struct request_queue *queue = disk->queue; u32 size = queue_logical_block_size(queue); if (!(ctrl->oncs & NVME_CTRL_ONCS_DSM)) { @@ -1606,7 +1606,7 @@ static void nvme_update_disk_info(struct gendisk *disk, set_capacity(disk, capacity); - nvme_config_discard(ns); + nvme_config_discard(disk, ns); nvme_config_write_zeroes(ns); if (id->nsattr & (1 << 0)) From 9f0916ab932f676c042d4592a235a895847484f2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Mar 2019 18:55:08 +0100 Subject: [PATCH 483/855] nvme: add proper write zeroes setup for the multipath device Add a gendisk argument to nvme_config_write_zeroes so that the call to nvme_update_disk_info for the multipath device node updates the proper request_queue. 
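Illustrative call sites (my reading of the series, not a quote from the driver): with the gendisk threaded through, the helper can set the limit on whichever queue is being refreshed, rather than always on ns->queue.

  /* per-path namespace device */
  nvme_config_write_zeroes(ns->disk, ns);

  /* multipath head device: disk->queue here is not ns->queue, which is
   * why the helper must take the disk instead of assuming ns->queue */
  nvme_config_write_zeroes(ns->head->disk, ns);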
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Max Gurtovoy Tested-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6a57ece7d76b..470601980794 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1526,7 +1526,7 @@ static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns) blk_queue_max_write_zeroes_sectors(queue, UINT_MAX); } -static inline void nvme_config_write_zeroes(struct nvme_ns *ns) +static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns) { u32 max_sectors; unsigned short bs = 1 << ns->lba_shift; @@ -1549,7 +1549,7 @@ static inline void nvme_config_write_zeroes(struct nvme_ns *ns) else max_sectors = ((u32)(ns->ctrl->max_hw_sectors + 1) * bs) >> 9; - blk_queue_max_write_zeroes_sectors(ns->queue, max_sectors); + blk_queue_max_write_zeroes_sectors(disk->queue, max_sectors); } static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, @@ -1607,7 +1607,7 @@ static void nvme_update_disk_info(struct gendisk *disk, set_capacity(disk, capacity); nvme_config_discard(disk, ns); - nvme_config_write_zeroes(ns); + nvme_config_write_zeroes(disk, ns); if (id->nsattr & (1 << 0)) set_disk_ro(disk, true); From 005c674f705ee308e23b8e4e7047419d12122fde Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Mar 2019 18:55:09 +0100 Subject: [PATCH 484/855] nvmet: ignore EOPNOTSUPP for discard NVMe DSM is a pure hint, so if the underlying device / file system does not support discard-like operations we should not fail the operation but rather return success. Fixes: 3b031d15995f ("nvmet: add error log support for bdev backend") Signed-off-by: Christoph Hellwig Reviewed by: Chaitanya Kulkarni Tested-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/target/io-cmd-bdev.c | 8 ++++---- drivers/nvme/target/io-cmd-file.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 71dfedbadc26..a065dbfc43b1 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -194,11 +194,11 @@ static u16 nvmet_bdev_discard_range(struct nvmet_req *req, le64_to_cpu(range->slba) << (ns->blksize_shift - 9), le32_to_cpu(range->nlb) << (ns->blksize_shift - 9), GFP_KERNEL, 0, bio); - - if (ret) + if (ret && ret != -EOPNOTSUPP) { req->error_slba = le64_to_cpu(range->slba); - - return blk_to_nvme_status(req, errno_to_blk_status(ret)); + return blk_to_nvme_status(req, errno_to_blk_status(ret)); + } + return NVME_SC_SUCCESS; } static void nvmet_bdev_execute_discard(struct nvmet_req *req) diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c index 517522305e5c..3e43212d3c1c 100644 --- a/drivers/nvme/target/io-cmd-file.c +++ b/drivers/nvme/target/io-cmd-file.c @@ -297,7 +297,7 @@ static void nvmet_file_execute_discard(struct nvmet_req *req) } ret = vfs_fallocate(req->ns->file, mode, offset, len); - if (ret) { + if (ret && ret != -EOPNOTSUPP) { req->error_slba = le64_to_cpu(range.slba); status = errno_to_nvme_status(req, ret); break; From 602d674ce90f64ac135452fb9b2b058acb53b226 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 13 Mar 2019 18:55:10 +0100 Subject: [PATCH 485/855] nvme-tcp: support C2HData with SUCCESS flag A C2HData PDU with the SUCCESS flag set indicates that the I/O was completed by the controller 
successfully and means that a subsequent completion response capsule PDU will be ommitted. If we see this flag, fisrt we check that LAST_PDU flag is set as well, and then we complete the request when the data transfer (and data digest verification if its on) is done. While we're at it, reuse a bit of code with nvme_fail_request. Reported-by: Steve Blightman Suggested-by: Oliver Smith-Denny Signed-off-by: Sagi Grimberg Reviewed-by: Oliver Smith-Denny Tested-by: Oliver Smith-Denny Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/tcp.c | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 208ee518af65..e7e08889865e 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -463,6 +463,15 @@ static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue, queue->data_remaining = le32_to_cpu(pdu->data_length); + if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS && + unlikely(!(pdu->hdr.flags & NVME_TCP_F_DATA_LAST))) { + dev_err(queue->ctrl->ctrl.device, + "queue %d tag %#x SUCCESS set but not last PDU\n", + nvme_tcp_queue_id(queue), rq->tag); + nvme_tcp_error_recovery(&queue->ctrl->ctrl); + return -EPROTO; + } + return 0; } @@ -618,6 +627,14 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb, return ret; } +static inline void nvme_tcp_end_request(struct request *rq, __le16 status) +{ + union nvme_result res = {}; + + nvme_end_request(rq, cpu_to_le16(status << 1), res); +} + + static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, unsigned int *offset, size_t *len) { @@ -685,6 +702,8 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst); queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH; } else { + if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) + nvme_tcp_end_request(rq, NVME_SC_SUCCESS); nvme_tcp_init_recv_ctx(queue); } } @@ -695,6 +714,7 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue, struct sk_buff *skb, unsigned int *offset, size_t *len) { + struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu; char *ddgst = (char *)&queue->recv_ddgst; size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining); off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining; @@ -718,6 +738,13 @@ static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue, return -EIO; } + if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) { + struct request *rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), + pdu->command_id); + + nvme_tcp_end_request(rq, NVME_SC_SUCCESS); + } + nvme_tcp_init_recv_ctx(queue); return 0; } @@ -815,10 +842,7 @@ static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue) static void nvme_tcp_fail_request(struct nvme_tcp_request *req) { - union nvme_result res = {}; - - nvme_end_request(blk_mq_rq_from_pdu(req), - cpu_to_le16(NVME_SC_DATA_XFER_ERROR), res); + nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_DATA_XFER_ERROR); } static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) From b0d8bc50b9f221e3af76afe0473f7c171cebbb40 Mon Sep 17 00:00:00 2001 From: Jaroslav Kysela Date: Wed, 13 Mar 2019 17:09:23 +0100 Subject: [PATCH 486/855] ALSA: hda: hdmi - add Icelake support This is just a port of the ASoC Icelake HDMI codec code to the legacy HDA driver with some cleanups. 
ASoC commit 019033c854a20e10f691f6cc0e897df8817d9521: "ASoC: Intel: hdac_hdmi: add Icelake support" Signed-off-by: Jaroslav Kysela Cc: Bard liao Cc: Pierre-Louis Bossart Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_hdmi.c | 67 +++++++++++++++++++++++++++++--------- 1 file changed, 51 insertions(+), 16 deletions(-) diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c index 73d7042ff884..8b3ac690efa3 100644 --- a/sound/pci/hda/patch_hdmi.c +++ b/sound/pci/hda/patch_hdmi.c @@ -57,10 +57,11 @@ MODULE_PARM_DESC(static_hdmi_pcm, "Don't restrict PCM parameters per ELD info"); #define is_geminilake(codec) (((codec)->core.vendor_id == 0x8086280d) || \ ((codec)->core.vendor_id == 0x80862800)) #define is_cannonlake(codec) ((codec)->core.vendor_id == 0x8086280c) +#define is_icelake(codec) ((codec)->core.vendor_id == 0x8086280f) #define is_haswell_plus(codec) (is_haswell(codec) || is_broadwell(codec) \ || is_skylake(codec) || is_broxton(codec) \ - || is_kabylake(codec)) || is_geminilake(codec) \ - || is_cannonlake(codec) + || is_kabylake(codec) || is_geminilake(codec) \ + || is_cannonlake(codec) || is_icelake(codec)) #define is_valleyview(codec) ((codec)->core.vendor_id == 0x80862882) #define is_cherryview(codec) ((codec)->core.vendor_id == 0x80862883) #define is_valleyview_plus(codec) (is_valleyview(codec) || is_cherryview(codec)) @@ -181,6 +182,8 @@ struct hdmi_spec { struct hdac_chmap chmap; hda_nid_t vendor_nid; + const int *port_map; + int port_num; }; #ifdef CONFIG_SND_HDA_COMPONENT @@ -2418,12 +2421,11 @@ static void intel_haswell_fixup_connect_list(struct hda_codec *codec, snd_hda_override_conn_list(codec, nid, spec->num_cvts, spec->cvt_nids); } -#define INTEL_VENDOR_NID 0x08 -#define INTEL_GLK_VENDOR_NID 0x0B -#define INTEL_GET_VENDOR_VERB 0xf81 -#define INTEL_SET_VENDOR_VERB 0x781 -#define INTEL_EN_DP12 0x02 /* enable DP 1.2 features */ -#define INTEL_EN_ALL_PIN_CVTS 0x01 /* enable 2nd & 3rd pins and convertors */ +#define INTEL_GET_VENDOR_VERB 0xf81 +#define INTEL_GET_VENDOR_VERB 0xf81 +#define INTEL_SET_VENDOR_VERB 0x781 +#define INTEL_EN_DP12 0x02 /* enable DP 1.2 features */ +#define INTEL_EN_ALL_PIN_CVTS 0x01 /* enable 2nd & 3rd pins and convertors */ static void intel_haswell_enable_all_pins(struct hda_codec *codec, bool update_tree) @@ -2503,11 +2505,29 @@ static int intel_base_nid(struct hda_codec *codec) static int intel_pin2port(void *audio_ptr, int pin_nid) { - int base_nid = intel_base_nid(audio_ptr); + struct hda_codec *codec = audio_ptr; + struct hdmi_spec *spec = codec->spec; + int base_nid, i; - if (WARN_ON(pin_nid < base_nid || pin_nid >= base_nid + 3)) - return -1; - return pin_nid - base_nid + 1; /* intel port is 1-based */ + if (!spec->port_num) { + base_nid = intel_base_nid(codec); + if (WARN_ON(pin_nid < base_nid || pin_nid >= base_nid + 3)) + return -1; + return pin_nid - base_nid + 1; /* intel port is 1-based */ + } + + /* + * looking for the pin number in the mapping table and return + * the index which indicate the port number + */ + for (i = 0; i < spec->port_num; i++) { + if (pin_nid == spec->port_map[i]) + return i + 1; + } + + /* return -1 if pin number exceeds our expectation */ + codec_info(codec, "Can't find the HDMI/DP port for pin %d\n", pin_nid); + return -1; } static void intel_pin_eld_notify(void *audio_ptr, int port, int pipe) @@ -2608,7 +2628,8 @@ static int parse_intel_hdmi(struct hda_codec *codec) } /* Intel Haswell and onwards; audio component with eld notifier */ -static int intel_hsw_common_init(struct hda_codec *codec, 
hda_nid_t vendor_nid) +static int intel_hsw_common_init(struct hda_codec *codec, hda_nid_t vendor_nid, + const int *port_map, int port_num) { struct hdmi_spec *spec; int err; @@ -2620,6 +2641,8 @@ static int intel_hsw_common_init(struct hda_codec *codec, hda_nid_t vendor_nid) codec->dp_mst = true; spec->dyn_pcm_assign = true; spec->vendor_nid = vendor_nid; + spec->port_map = port_map; + spec->port_num = port_num; intel_haswell_enable_all_pins(codec, true); intel_haswell_fixup_enable_dp12(codec); @@ -2638,12 +2661,23 @@ static int intel_hsw_common_init(struct hda_codec *codec, hda_nid_t vendor_nid) static int patch_i915_hsw_hdmi(struct hda_codec *codec) { - return intel_hsw_common_init(codec, INTEL_VENDOR_NID); + return intel_hsw_common_init(codec, 0x08, NULL, 0); } static int patch_i915_glk_hdmi(struct hda_codec *codec) { - return intel_hsw_common_init(codec, INTEL_GLK_VENDOR_NID); + return intel_hsw_common_init(codec, 0x0b, NULL, 0); +} + +static int patch_i915_icl_hdmi(struct hda_codec *codec) +{ + /* + * pin to port mapping table where the value indicate the pin number and + * the index indicate the port number with 1 base. + */ + static const int map[] = {0x4, 0x6, 0x8, 0xa, 0xb}; + + return intel_hsw_common_init(codec, 0x02, map, ARRAY_SIZE(map)); } /* Intel Baytrail and Braswell; with eld notifier */ @@ -3886,6 +3920,7 @@ HDA_CODEC_ENTRY(0x11069f81, "VX900 HDMI/DP", patch_via_hdmi), HDA_CODEC_ENTRY(0x11069f84, "VX11 HDMI/DP", patch_generic_hdmi), HDA_CODEC_ENTRY(0x11069f85, "VX11 HDMI/DP", patch_generic_hdmi), HDA_CODEC_ENTRY(0x80860054, "IbexPeak HDMI", patch_i915_cpt_hdmi), +HDA_CODEC_ENTRY(0x80862800, "Geminilake HDMI", patch_i915_glk_hdmi), HDA_CODEC_ENTRY(0x80862801, "Bearlake HDMI", patch_generic_hdmi), HDA_CODEC_ENTRY(0x80862802, "Cantiga HDMI", patch_generic_hdmi), HDA_CODEC_ENTRY(0x80862803, "Eaglelake HDMI", patch_generic_hdmi), @@ -3899,7 +3934,7 @@ HDA_CODEC_ENTRY(0x8086280a, "Broxton HDMI", patch_i915_hsw_hdmi), HDA_CODEC_ENTRY(0x8086280b, "Kabylake HDMI", patch_i915_hsw_hdmi), HDA_CODEC_ENTRY(0x8086280c, "Cannonlake HDMI", patch_i915_glk_hdmi), HDA_CODEC_ENTRY(0x8086280d, "Geminilake HDMI", patch_i915_glk_hdmi), -HDA_CODEC_ENTRY(0x80862800, "Geminilake HDMI", patch_i915_glk_hdmi), +HDA_CODEC_ENTRY(0x8086280f, "Icelake HDMI", patch_i915_icl_hdmi), HDA_CODEC_ENTRY(0x80862880, "CedarTrail HDMI", patch_generic_hdmi), HDA_CODEC_ENTRY(0x80862882, "Valleyview2 HDMI", patch_i915_byt_hdmi), HDA_CODEC_ENTRY(0x80862883, "Braswell HDMI", patch_i915_byt_hdmi), From c24a1269652006b401bda29fea15a0e42b7870d1 Mon Sep 17 00:00:00 2001 From: Ricardo Biehl Pasquali Date: Wed, 13 Mar 2019 16:06:48 -0300 Subject: [PATCH 487/855] ALSA: pcm: Fix function name in kernel-doc comment Signed-off-by: Ricardo Biehl Pasquali Signed-off-by: Takashi Iwai --- include/sound/pcm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/sound/pcm.h b/include/sound/pcm.h index 465d7d033c4c..18bd8c3ea605 100644 --- a/include/sound/pcm.h +++ b/include/sound/pcm.h @@ -750,7 +750,7 @@ static inline snd_pcm_uframes_t snd_pcm_playback_avail(struct snd_pcm_runtime *r } /** - * snd_pcm_playback_avail - Get the available (readable) space for capture + * snd_pcm_capture_avail - Get the available (readable) space for capture * @runtime: PCM runtime instance * * Result is between 0 ... 
(boundary - 1) From f6d85f04e29859dd3ea65395c05925da352dae89 Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Mon, 14 Jan 2019 21:31:13 +0100 Subject: [PATCH 488/855] blkcg: annotate implicit fall through There is a plan to build the kernel with -Wimplicit-fallthrough and this place in the code produced a warning (W=1). This commit remove the following warning: kernel/trace/blktrace.c:725:9: warning: this statement may fall through [-Wimplicit-fallthrough=] Signed-off-by: Mathieu Malaterre Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index fac0ddf8a8e2..e1c6d79fb4cc 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -723,6 +723,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) #endif case BLKTRACESTART: start = 1; + /* fall through */ case BLKTRACESTOP: ret = __blk_trace_startstop(q, start); break; From c8a9afa632f0fd45731d3353525faf1fdb362c89 Mon Sep 17 00:00:00 2001 From: Jaroslav Kysela Date: Thu, 14 Mar 2019 09:21:08 +0100 Subject: [PATCH 489/855] ALSA: hda/realtek: merge alc_fixup_headset_jack to alc295_fixup_chromebook The ALC225_FIXUP_HEADSET_JACK fixup can be merged to alc295_fixup_chromebook. There are no other users for ALC225_FIXUP_HEADSET_JACK other than the chromebook hardware. Fixes: 10f5b1b85ed1 ("ALSA: hda/realtek - Fixed Headset Mic JD not stable") Cc: Kailang Yang Signed-off-by: Jaroslav Kysela Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 43 +++++++++++------------------------ 1 file changed, 13 insertions(+), 30 deletions(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index a63670013943..96203d9d17c1 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -5491,7 +5491,7 @@ static void alc_headset_btn_callback(struct hda_codec *codec, jack->jack->button_state = report; } -static void alc_fixup_headset_jack(struct hda_codec *codec, +static void alc295_fixup_chromebook(struct hda_codec *codec, const struct hda_fixup *fix, int action) { @@ -5501,6 +5501,16 @@ static void alc_fixup_headset_jack(struct hda_codec *codec, alc_headset_btn_callback); snd_hda_jack_add_kctl(codec, 0x55, "Headset Jack", false, SND_JACK_HEADSET, alc_headset_btn_keymap); + switch (codec->core.vendor_id) { + case 0x10ec0295: + alc_update_coef_idx(codec, 0x4a, 0x8000, 1 << 15); /* Reset HP JD */ + alc_update_coef_idx(codec, 0x4a, 0x8000, 0 << 15); + break; + case 0x10ec0236: + alc_update_coef_idx(codec, 0x1b, 0x8000, 1 << 15); /* Reset HP JD */ + alc_update_coef_idx(codec, 0x1b, 0x8000, 0 << 15); + break; + } break; case HDA_FIXUP_ACT_INIT: switch (codec->core.vendor_id) { @@ -5521,26 +5531,6 @@ static void alc_fixup_headset_jack(struct hda_codec *codec, } } -static void alc295_fixup_chromebook(struct hda_codec *codec, - const struct hda_fixup *fix, int action) -{ - - switch (action) { - case HDA_FIXUP_ACT_PRE_PROBE: - switch (codec->core.vendor_id) { - case 0x10ec0295: - alc_update_coef_idx(codec, 0x4a, 0x8000, 1 << 15); /* Reset HP JD */ - alc_update_coef_idx(codec, 0x4a, 0x8000, 0 << 15); - break; - case 0x10ec0236: - alc_update_coef_idx(codec, 0x1b, 0x8000, 1 << 15); /* Reset HP JD */ - alc_update_coef_idx(codec, 0x1b, 0x8000, 0 << 15); - break; - } - break; - } -} - static void alc_fixup_disable_mic_vref(struct hda_codec *codec, const struct hda_fixup *fix, int action) { @@ -5690,7 +5680,6 @@ enum { ALC294_FIXUP_ASUS_MIC, ALC294_FIXUP_ASUS_HEADSET_MIC, 
ALC294_FIXUP_ASUS_SPK, - ALC225_FIXUP_HEADSET_JACK, ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE, ALC285_FIXUP_LENOVO_PC_BEEP_IN_NOISE, ALC255_FIXUP_ACER_HEADSET_MIC, @@ -6637,9 +6626,9 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC294_FIXUP_ASUS_HEADSET_MIC }, - [ALC225_FIXUP_HEADSET_JACK] = { + [ALC295_FIXUP_CHROME_BOOK] = { .type = HDA_FIXUP_FUNC, - .v.func = alc_fixup_headset_jack, + .v.func = alc295_fixup_chromebook, }, [ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE] = { .type = HDA_FIXUP_PINS, @@ -6671,12 +6660,6 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC255_FIXUP_HEADSET_MODE_NO_HP_MIC }, - [ALC295_FIXUP_CHROME_BOOK] = { - .type = HDA_FIXUP_FUNC, - .v.func = alc295_fixup_chromebook, - .chained = true, - .chain_id = ALC225_FIXUP_HEADSET_JACK - }, }; static const struct snd_pci_quirk alc269_fixup_tbl[] = { From 136824efaab2c095fc911048f7c7ddeda258c965 Mon Sep 17 00:00:00 2001 From: Kailang Yang Date: Thu, 14 Mar 2019 16:22:45 +0800 Subject: [PATCH 490/855] ALSA: hda/realtek - Add support headset mode for DELL WYSE AIO This patch will enable WYSE AIO for Headset mode. Signed-off-by: Kailang Yang Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 96203d9d17c1..dddfd0f03bca 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -5684,6 +5684,9 @@ enum { ALC285_FIXUP_LENOVO_PC_BEEP_IN_NOISE, ALC255_FIXUP_ACER_HEADSET_MIC, ALC295_FIXUP_CHROME_BOOK, + ALC225_FIXUP_DELL_WYSE_AIO_MIC_NO_PRESENCE, + ALC225_FIXUP_WYSE_AUTO_MUTE, + ALC225_FIXUP_WYSE_DISABLE_MIC_VREF, }; static const struct hda_fixup alc269_fixups[] = { @@ -6660,6 +6663,28 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC255_FIXUP_HEADSET_MODE_NO_HP_MIC }, + [ALC225_FIXUP_DELL_WYSE_AIO_MIC_NO_PRESENCE] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + { 0x16, 0x01011020 }, /* Rear Line out */ + { 0x19, 0x01a1913c }, /* use as Front headset mic, without its own jack detect */ + { } + }, + .chained = true, + .chain_id = ALC225_FIXUP_WYSE_AUTO_MUTE + }, + [ALC225_FIXUP_WYSE_AUTO_MUTE] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc_fixup_auto_mute_via_amp, + .chained = true, + .chain_id = ALC225_FIXUP_WYSE_DISABLE_MIC_VREF + }, + [ALC225_FIXUP_WYSE_DISABLE_MIC_VREF] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc_fixup_disable_mic_vref, + .chained = true, + .chain_id = ALC269_FIXUP_HEADSET_MODE_NO_HP_MIC + }, }; static const struct snd_pci_quirk alc269_fixup_tbl[] = { @@ -6724,6 +6749,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1028, 0x0871, "Dell Precision 3630", ALC255_FIXUP_DELL_HEADSET_MIC), SND_PCI_QUIRK(0x1028, 0x0872, "Dell Precision 3630", ALC255_FIXUP_DELL_HEADSET_MIC), SND_PCI_QUIRK(0x1028, 0x0873, "Dell Precision 3930", ALC255_FIXUP_DUMMY_LINEOUT_VERB), + SND_PCI_QUIRK(0x1028, 0x08ad, "Dell WYSE AIO", ALC225_FIXUP_DELL_WYSE_AIO_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x0935, "Dell", ALC274_FIXUP_DELL_AIO_LINEOUT_VERB), SND_PCI_QUIRK(0x1028, 0x164a, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x164b, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE), From da484d00f020af3dd7cfcc6c4b69a7f856832883 Mon Sep 17 00:00:00 2001 From: Kailang Yang Date: Thu, 14 Mar 2019 15:50:59 +0800 Subject: [PATCH 491/855] ALSA: hda/realtek - Add support headset mode for New DELL WYSE NB Enable 
headset mode support for new WYSE NB platform. Signed-off-by: Kailang Yang Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index dddfd0f03bca..384719d5c44e 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -6750,6 +6750,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1028, 0x0872, "Dell Precision 3630", ALC255_FIXUP_DELL_HEADSET_MIC), SND_PCI_QUIRK(0x1028, 0x0873, "Dell Precision 3930", ALC255_FIXUP_DUMMY_LINEOUT_VERB), SND_PCI_QUIRK(0x1028, 0x08ad, "Dell WYSE AIO", ALC225_FIXUP_DELL_WYSE_AIO_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1028, 0x08ae, "Dell WYSE NB", ALC225_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x0935, "Dell", ALC274_FIXUP_DELL_AIO_LINEOUT_VERB), SND_PCI_QUIRK(0x1028, 0x164a, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x164b, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE), From 3a487ff78c4b2bf5686dbb29178cfd78b3eed053 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 7 Mar 2019 08:33:44 +0300 Subject: [PATCH 492/855] scsi: lpfc: Fix error codes in lpfc_sli4_pci_mem_setup() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It used to be that "error" was set to -ENODEV at the start of the function but we shifted some code around an now "error" is set to zero for most error paths. There is a mix of direct returns and "goto out" but I changed everything to direct returns for consistency. Fixes: 56de8357049c ("scsi: lpfc: fix calls to dma_set_mask_and_coherent()") Signed-off-by: Dan Carpenter Acked-by: James Smart  Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_init.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index 707dbe7a8d15..bbc2815fc012 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -10049,7 +10049,7 @@ lpfc_sli4_pci_mem_setup(struct lpfc_hba *phba) { struct pci_dev *pdev = phba->pcidev; unsigned long bar0map_len, bar1map_len, bar2map_len; - int error = -ENODEV; + int error; uint32_t if_type; if (!pdev) @@ -10066,7 +10066,7 @@ lpfc_sli4_pci_mem_setup(struct lpfc_hba *phba) */ if (pci_read_config_dword(pdev, LPFC_SLI_INTF, &phba->sli4_hba.sli_intf.word0)) { - return error; + return -ENODEV; } /* There is no SLI3 failback for SLI4 devices. 
*/ @@ -10076,7 +10076,7 @@ lpfc_sli4_pci_mem_setup(struct lpfc_hba *phba) "2894 SLI_INTF reg contents invalid " "sli_intf reg 0x%x\n", phba->sli4_hba.sli_intf.word0); - return error; + return -ENODEV; } if_type = bf_get(lpfc_sli_intf_if_type, &phba->sli4_hba.sli_intf); @@ -10100,7 +10100,7 @@ lpfc_sli4_pci_mem_setup(struct lpfc_hba *phba) dev_printk(KERN_ERR, &pdev->dev, "ioremap failed for SLI4 PCI config " "registers.\n"); - goto out; + return -ENODEV; } phba->pci_bar0_memmap_p = phba->sli4_hba.conf_regs_memmap_p; /* Set up BAR0 PCI config space register memory map */ @@ -10111,7 +10111,7 @@ lpfc_sli4_pci_mem_setup(struct lpfc_hba *phba) if (if_type >= LPFC_SLI_INTF_IF_TYPE_2) { dev_printk(KERN_ERR, &pdev->dev, "FATAL - No BAR0 mapping for SLI4, if_type 2\n"); - goto out; + return -ENODEV; } phba->sli4_hba.conf_regs_memmap_p = ioremap(phba->pci_bar0_map, bar0map_len); @@ -10119,7 +10119,7 @@ lpfc_sli4_pci_mem_setup(struct lpfc_hba *phba) dev_printk(KERN_ERR, &pdev->dev, "ioremap failed for SLI4 PCI config " "registers.\n"); - goto out; + return -ENODEV; } lpfc_sli4_bar0_register_memmap(phba, if_type); } @@ -10165,6 +10165,7 @@ lpfc_sli4_pci_mem_setup(struct lpfc_hba *phba) if (!phba->sli4_hba.drbl_regs_memmap_p) { dev_err(&pdev->dev, "ioremap failed for SLI4 HBA doorbell registers.\n"); + error = -ENOMEM; goto out_iounmap_conf; } phba->pci_bar2_memmap_p = phba->sli4_hba.drbl_regs_memmap_p; @@ -10214,6 +10215,7 @@ lpfc_sli4_pci_mem_setup(struct lpfc_hba *phba) if (!phba->sli4_hba.dpp_regs_memmap_p) { dev_err(&pdev->dev, "ioremap failed for SLI4 HBA dpp registers.\n"); + error = -ENOMEM; goto out_iounmap_ctrl; } phba->pci_bar4_memmap_p = phba->sli4_hba.dpp_regs_memmap_p; @@ -10244,7 +10246,7 @@ out_iounmap_ctrl: iounmap(phba->sli4_hba.ctrl_regs_memmap_p); out_iounmap_conf: iounmap(phba->sli4_hba.conf_regs_memmap_p); -out: + return error; } From 0015437cc046e5ec2b57b00ff8312b8d432eac7c Mon Sep 17 00:00:00 2001 From: Sagar Biradar Date: Thu, 7 Mar 2019 23:26:41 -0800 Subject: [PATCH 493/855] scsi: aacraid: Fix performance issue on logical drives Fix performance issue where the queue depth for SmartIOC logical volumes is set to 1, and allow the usual logical volume code to be executed Fixes: a052865fe287 (aacraid: Set correct Queue Depth for HBA1000 RAW disks) Cc: stable@vger.kernel.org Signed-off-by: Sagar Biradar Reviewed-by: Dave Carroll Signed-off-by: Martin K. 
Petersen --- drivers/scsi/aacraid/linit.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c index 22ecacffeca6..598d6efae53e 100644 --- a/drivers/scsi/aacraid/linit.c +++ b/drivers/scsi/aacraid/linit.c @@ -413,13 +413,16 @@ static int aac_slave_configure(struct scsi_device *sdev) if (chn < AAC_MAX_BUSES && tid < AAC_MAX_TARGETS && aac->sa_firmware) { devtype = aac->hba_map[chn][tid].devtype; - if (devtype == AAC_DEVTYPE_NATIVE_RAW) + if (devtype == AAC_DEVTYPE_NATIVE_RAW) { depth = aac->hba_map[chn][tid].qd_limit; - else if (devtype == AAC_DEVTYPE_ARC_RAW) + set_timeout = 1; + goto common_config; + } + if (devtype == AAC_DEVTYPE_ARC_RAW) { set_qd_dev_type = true; - - set_timeout = 1; - goto common_config; + set_timeout = 1; + goto common_config; + } } if (aac->jbod && (sdev->type == TYPE_DISK)) From aff7b628ac2d58616b74789389ebb1e987081f49 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 13 Mar 2019 16:15:08 -0700 Subject: [PATCH 494/855] f2fs: set pin_file under CAP_SYS_ADMIN Android uses pin_file for uncrypt during OTA, and that should be managed by CAP_SYS_ADMIN only. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 807a97ad2430..012815d816e6 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2873,8 +2873,8 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) __u32 pin; int ret = 0; - if (!inode_owner_or_capable(inode)) - return -EACCES; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; if (get_user(pin, (__u32 __user *)arg)) return -EFAULT; From f6cab793d4a70808e4946baa8f5df4ea9adacc82 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Thu, 14 Mar 2019 17:40:16 +0000 Subject: [PATCH 495/855] MIPS: Remove custom MIPS32 __kernel_fsid_t type For MIPS32 kernels we have a custom definition of __kernel_fsid_t. This differs from the asm-generic version used by all other architectures & MIPS64 in one way - it declares the val field as an array of long, rather than an array of int. Since int & long have identical size & alignment when targeting MIPS32 anyway, this makes little sense. Beyond the pointlessness this causes problems for code which prints entries from the val array, for example the fanotify_encode_fid() function [1]. If such code uses a format specified suited to an int then it encounters compiler warnings when building for MIPS32, such as: In file included from include/linux/kernel.h:14:0, from include/linux/list.h:9, from include/linux/preempt.h:11, from include/linux/spinlock.h:51, from include/linux/fdtable.h:11, from fs/notify/fanotify/fanotify.c:3: fs/notify/fanotify/fanotify.c: In function 'fanotify_encode_fid': include/linux/kern_levels.h:5:18: warning: format '%x' expects argument of type 'unsigned int', but argument 2 has type 'long int' [-Wformat=] Remove the custom __kernel_fsid_t definition & make use of the asm-generic version which will have an identical layout in memory anyway, in order to remove the inconsistency with other architectures. One possible regression this could cause if is any code is attempting to print entries from the val array with a long-sized format specifier, in which case it would begin seeing compiler warnings when built against kernel headers including this change. 
Since such code is exceedingly rare, and would have to be MIPS32-specific to expect a long, this seems to be a problem that it's extremely unlikely anyone will encounter. [1] https://lore.kernel.org/linux-mips/CAOQ4uxiEkczB7PNCXegFC-eYb9zAGaio_o=OgHAJHFd7eavBxA@mail.gmail.com/T/#mb43103277c79ef06b884359209e817db1c136140 Signed-off-by: Paul Burton Cc: Amir Goldstein Cc: Arnd Bergmann Cc: Jan Kara Cc: linux-arch@vger.kernel.org Cc: linux-mips@vger.kernel.org Cc: linux-kernel@vger.kernel.org --- arch/mips/include/uapi/asm/posix_types.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/arch/mips/include/uapi/asm/posix_types.h b/arch/mips/include/uapi/asm/posix_types.h index 6aa49c10f88f..f0ccb5b90ce9 100644 --- a/arch/mips/include/uapi/asm/posix_types.h +++ b/arch/mips/include/uapi/asm/posix_types.h @@ -21,13 +21,6 @@ typedef long __kernel_daddr_t; #define __kernel_daddr_t __kernel_daddr_t -#if (_MIPS_SZLONG == 32) -typedef struct { - long val[2]; -} __kernel_fsid_t; -#define __kernel_fsid_t __kernel_fsid_t -#endif - #include #endif /* _ASM_POSIX_TYPES_H */ From 287c038c0b994dae7569d96eca154f6a7ff6b4a9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 14 Mar 2019 13:30:09 +0900 Subject: [PATCH 496/855] tracing/probe: Check maxactive error cases Check maxactive on kprobe error case, because maxactive is only for kretprobe, not for kprobe. Also, maxactive should not be 0, it should be at least 1. Link: http://lkml.kernel.org/r/155253780952.14922.15784129810238750331.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index ceafa0a2b1d1..a14837545295 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -624,7 +624,11 @@ static int trace_kprobe_create(int argc, const char *argv[]) if (event) event++; - if (is_return && isdigit(argv[0][1])) { + if (isdigit(argv[0][1])) { + if (!is_return) { + pr_info("Maxactive is not for kprobe"); + return -EINVAL; + } if (event) len = event - &argv[0][1] - 1; else @@ -634,8 +638,8 @@ static int trace_kprobe_create(int argc, const char *argv[]) memcpy(buf, &argv[0][1], len); buf[len] = '\0'; ret = kstrtouint(buf, 0, &maxactive); - if (ret) { - pr_info("Failed to parse maxactive.\n"); + if (ret || !maxactive) { + pr_info("Invalid maxactive number\n"); return ret; } /* kretprobes instances are iterated over via a list. The From dec65d79fd269d05427c8167090bfc9c3d0b56c4 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 14 Mar 2019 13:30:20 +0900 Subject: [PATCH 497/855] tracing/probe: Check event name length correctly Ensure given name of event is not too long when parsing it, and fix to update event name offset correctly when the group name is given. For example, this makes probe event to check the "p:foo/" error case correctly. 
Link: http://lkml.kernel.org/r/155253782046.14922.14724124823730168629.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index cfcf77e6fb19..0dc13bff2847 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -159,6 +159,7 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, char *buf) { const char *slash, *event = *pevent; + int len; slash = strchr(event, '/'); if (slash) { @@ -173,10 +174,15 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, strlcpy(buf, event, slash - event + 1); *pgroup = buf; *pevent = slash + 1; + event = *pevent; } - if (strlen(event) == 0) { + len = strlen(event); + if (len == 0) { pr_info("Event name is not specified\n"); return -EINVAL; + } else if (len > MAX_EVENT_NAME_LEN) { + pr_info("Event name is too long\n"); + return -E2BIG; } return 0; } From b4443c17a3c9d652dc5d7679ddca867ee3cdaa9c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 14 Mar 2019 13:30:30 +0900 Subject: [PATCH 498/855] tracing/probe: Check the size of argument name and body Check the size of argument name and expression is not 0 and smaller than maximum length. Link: http://lkml.kernel.org/r/155253783029.14922.12650939303827581096.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 2 ++ kernel/trace/trace_probe.h | 1 + 2 files changed, 3 insertions(+) diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 0dc13bff2847..bf9d3d7e0c9d 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -554,6 +554,8 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg, body = strchr(arg, '='); if (body) { + if (body - arg > MAX_ARG_NAME_LEN || body == arg) + return -EINVAL; parg->name = kmemdup_nul(arg, body - arg, GFP_KERNEL); body++; } else { diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 8a63f8bc01bc..2177c206de15 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -32,6 +32,7 @@ #define MAX_TRACE_ARGS 128 #define MAX_ARGSTR_LEN 63 #define MAX_ARRAY_LEN 64 +#define MAX_ARG_NAME_LEN 32 #define MAX_STRING_SIZE PATH_MAX /* Reserved field names */ From 5b7a96220900e3c3f6fb53908eb4602cda959376 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 14 Mar 2019 13:30:40 +0900 Subject: [PATCH 499/855] tracing/probe: Check event/group naming rule at parsing Check event and group naming rule at parsing it instead of allocating probes. 
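For reference, the rule being enforced is the C identifier rule, as the new error messages below spell out. A minimal standalone sketch of such a check (an illustration only, not the kernel's actual is_good_name() helper) could look like this, using isalpha()/isalnum() from <linux/ctype.h>:

	static bool name_is_c_identifier(const char *name)
	{
		/* first character: letter or underscore */
		if (!isalpha(*name) && *name != '_')
			return false;
		/* remaining characters: letters, digits or underscore */
		while (*++name != '\0') {
			if (!isalnum(*name) && *name != '_')
				return false;
		}
		return true;
	}

With the check done while parsing, a group or event name that breaks the rule (for example one starting with a digit) is rejected with -EINVAL before any probe is allocated.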
Link: http://lkml.kernel.org/r/155253784064.14922.2336893061156236237.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 7 +------ kernel/trace/trace_probe.c | 8 ++++++++ kernel/trace/trace_uprobe.c | 5 +---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index a14837545295..5265117ad7e8 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -221,7 +221,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, tk->rp.maxactive = maxactive; - if (!event || !is_good_name(event)) { + if (!event || !group) { ret = -EINVAL; goto error; } @@ -231,11 +231,6 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, if (!tk->tp.call.name) goto error; - if (!group || !is_good_name(group)) { - ret = -EINVAL; - goto error; - } - tk->tp.class.system = kstrdup(group, GFP_KERNEL); if (!tk->tp.class.system) goto error; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index bf9d3d7e0c9d..8f8411e7835f 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -172,6 +172,10 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, return -E2BIG; } strlcpy(buf, event, slash - event + 1); + if (!is_good_name(buf)) { + pr_info("Group name must follow the same rules as C identifiers\n"); + return -EINVAL; + } *pgroup = buf; *pevent = slash + 1; event = *pevent; @@ -184,6 +188,10 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, pr_info("Event name is too long\n"); return -E2BIG; } + if (!is_good_name(event)) { + pr_info("Event name must follow the same rules as C identifiers\n"); + return -EINVAL; + } return 0; } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index e335576b9411..52f033489377 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -266,10 +266,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) { struct trace_uprobe *tu; - if (!event || !is_good_name(event)) - return ERR_PTR(-EINVAL); - - if (!group || !is_good_name(group)) + if (!event || !group) return ERR_PTR(-EINVAL); tu = kzalloc(SIZEOF_TRACE_UPROBE(nargs), GFP_KERNEL); From a039480e9e93896cadc5a91468964febb3c5d488 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 14 Mar 2019 13:30:50 +0900 Subject: [PATCH 500/855] tracing/probe: Verify alloc_trace_*probe() result Since alloc_trace_*probe() returns -EINVAL only if !event && !group, it should not happen in trace_*probe_create(). If we catch that case there is a bug. So use WARN_ON_ONCE() instead of pr_info(). 
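This follows the usual kernel convention: a state that can only be reached through a programming error is flagged loudly, and only once, rather than logged as a routine message. A generic sketch of the pattern assumed here (illustrative, not tied to a particular file):

	ptr = some_allocator(args);	/* hypothetical allocator */
	if (IS_ERR(ptr)) {
		ret = PTR_ERR(ptr);
		/* only -ENOMEM is a legitimate failure at this point;
		 * anything else means the caller passed bad arguments
		 * that parsing should already have rejected */
		WARN_ON_ONCE(ret != -ENOMEM);
		goto out;
	}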
Link: http://lkml.kernel.org/r/155253785078.14922.16902223633734601469.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 4 ++-- kernel/trace/trace_uprobe.c | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5265117ad7e8..63aa0ab56051 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -693,9 +693,9 @@ static int trace_kprobe_create(int argc, const char *argv[]) tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, argc, is_return); if (IS_ERR(tk)) { - pr_info("Failed to allocate trace_probe.(%d)\n", - (int)PTR_ERR(tk)); ret = PTR_ERR(tk); + /* This must return -ENOMEM otherwise there is a bug */ + WARN_ON_ONCE(ret != -ENOMEM); goto out; } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 52f033489377..b54137ec7810 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -514,8 +514,9 @@ static int trace_uprobe_create(int argc, const char **argv) tu = alloc_trace_uprobe(group, event, argc, is_return); if (IS_ERR(tu)) { - pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu)); ret = PTR_ERR(tu); + /* This must return -ENOMEM otherwise there is a bug */ + WARN_ON_ONCE(ret != -ENOMEM); goto fail_address_parse; } tu->offset = offset; From 480b1cb9dad894f40523a29964746cfc6ebd714f Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Fri, 8 Mar 2019 12:58:18 +1000 Subject: [PATCH 501/855] cifs: change wait_for_free_request() to take flags as argument and compute timeout and optyp from it. Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky --- fs/cifs/transport.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 7ce8a585abd6..ed104c0550e8 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -538,15 +538,20 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int timeout, } static int -wait_for_free_request(struct TCP_Server_Info *server, const int timeout, - const int optype, unsigned int *instance) +wait_for_free_request(struct TCP_Server_Info *server, const int flags, + unsigned int *instance) { int *val; + int timeout, optype; + + timeout = flags & CIFS_TIMEOUT_MASK; + optype = flags & CIFS_OP_MASK; val = server->ops->get_credits_field(server, optype); /* Since an echo is already inflight, no need to wait to send another */ if (*val <= 0 && optype == CIFS_ECHO_OP) return -EAGAIN; + return wait_for_free_credits(server, timeout, val, instance); } @@ -646,16 +651,16 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, mid_handle_t *handle, void *cbdata, const int flags, const struct cifs_credits *exist_credits) { - int rc, timeout, optype; + int rc; struct mid_q_entry *mid; struct cifs_credits credits = { .value = 0, .instance = 0 }; unsigned int instance; + int optype; - timeout = flags & CIFS_TIMEOUT_MASK; optype = flags & CIFS_OP_MASK; if ((flags & CIFS_HAS_CREDITS) == 0) { - rc = wait_for_free_request(server, timeout, optype, &instance); + rc = wait_for_free_request(server, flags, &instance); if (rc) return rc; credits.value = 1; @@ -871,8 +876,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, const int flags, const int num_rqst, struct smb_rqst *rqst, int *resp_buf_type, struct kvec *resp_iov) { - int i, j, rc = 0; - int timeout, optype; + int i, 
j, optype, rc = 0; struct mid_q_entry *midQ[MAX_COMPOUND]; bool cancelled_mid[MAX_COMPOUND] = {false}; struct cifs_credits credits[MAX_COMPOUND] = { @@ -882,7 +886,6 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, unsigned int first_instance = 0; char *buf; - timeout = flags & CIFS_TIMEOUT_MASK; optype = flags & CIFS_OP_MASK; for (i = 0; i < num_rqst; i++) @@ -933,8 +936,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, * Ensure we obtain 1 credit per request in the compound chain. */ for (i = 0; i < num_rqst; i++) { - rc = wait_for_free_request(ses->server, timeout, optype, - &instance); + rc = wait_for_free_request(ses->server, flags, &instance); if (rc == 0) { credits[i].value = 1; @@ -1057,7 +1059,7 @@ setup_rqsts: smb311_update_preauth_hash(ses, rqst[0].rq_iov, rqst[0].rq_nvec); - if (timeout == CIFS_ASYNC_OP) + if ((flags & CIFS_TIMEOUT_MASK) == CIFS_ASYNC_OP) goto out; for (i = 0; i < num_rqst; i++) { @@ -1194,7 +1196,7 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, int SendReceive(const unsigned int xid, struct cifs_ses *ses, struct smb_hdr *in_buf, struct smb_hdr *out_buf, - int *pbytes_returned, const int timeout) + int *pbytes_returned, const int flags) { int rc = 0; struct mid_q_entry *midQ; @@ -1225,7 +1227,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, return -EIO; } - rc = wait_for_free_request(ses->server, timeout, 0, &credits.instance); + rc = wait_for_free_request(ses->server, flags, &credits.instance); if (rc) return rc; @@ -1264,7 +1266,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, if (rc < 0) goto out; - if (timeout == CIFS_ASYNC_OP) + if ((flags & CIFS_TIMEOUT_MASK) == CIFS_ASYNC_OP) goto out; rc = wait_for_response(ses->server, midQ); @@ -1367,8 +1369,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, return -EIO; } - rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP, 0, - &instance); + rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP, &instance); if (rc) return rc; From 4230cff8c0b7aaee7203e434b05a9acc1635fc04 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Fri, 8 Mar 2019 12:58:19 +1000 Subject: [PATCH 502/855] cifs: pass flags down into wait_for_free_credits() Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky --- fs/cifs/transport.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index ed104c0550e8..9fcc4a82943d 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -486,15 +486,24 @@ smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer, } static int -wait_for_free_credits(struct TCP_Server_Info *server, const int timeout, - int *credits, unsigned int *instance) +wait_for_free_credits(struct TCP_Server_Info *server, const int flags, + unsigned int *instance) { int rc; + int *credits; + int optype; + + optype = flags & CIFS_OP_MASK; *instance = 0; + credits = server->ops->get_credits_field(server, optype); + /* Since an echo is already inflight, no need to wait to send another */ + if (*credits <= 0 && optype == CIFS_ECHO_OP) + return -EAGAIN; + spin_lock(&server->req_lock); - if (timeout == CIFS_ASYNC_OP) { + if ((flags & CIFS_TIMEOUT_MASK) == CIFS_ASYNC_OP) { /* oplock breaks must not be held up */ server->in_flight++; *credits -= 1; @@ -525,7 +534,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int timeout, */ /* update # of requests on the wire to server */ - if 
(timeout != CIFS_BLOCKING_OP) { + if ((flags & CIFS_TIMEOUT_MASK) != CIFS_BLOCKING_OP) { *credits -= 1; server->in_flight++; *instance = server->reconnect_instance; @@ -541,18 +550,7 @@ static int wait_for_free_request(struct TCP_Server_Info *server, const int flags, unsigned int *instance) { - int *val; - int timeout, optype; - - timeout = flags & CIFS_TIMEOUT_MASK; - optype = flags & CIFS_OP_MASK; - - val = server->ops->get_credits_field(server, optype); - /* Since an echo is already inflight, no need to wait to send another */ - if (*val <= 0 && optype == CIFS_ECHO_OP) - return -EAGAIN; - - return wait_for_free_credits(server, timeout, val, instance); + return wait_for_free_credits(server, flags, instance); } int From b227d215deef4f3528b8f754accef4db03539a59 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Fri, 8 Mar 2019 12:58:20 +1000 Subject: [PATCH 503/855] cifs: wait_for_free_credits() make it possible to wait for >=1 credits Change wait_for_free_credits() to allow waiting for >=1 credits instead of just a single credit. Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky --- fs/cifs/cifsglob.h | 4 ++-- fs/cifs/smb2ops.c | 2 +- fs/cifs/transport.c | 14 +++++++------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index f293e052e351..ddb299494cd6 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -735,13 +735,13 @@ in_flight(struct TCP_Server_Info *server) } static inline bool -has_credits(struct TCP_Server_Info *server, int *credits) +has_credits(struct TCP_Server_Info *server, int *credits, int num_credits) { int num; spin_lock(&server->req_lock); num = *credits; spin_unlock(&server->req_lock); - return num > 0; + return num >= num_credits; } static inline void diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 085e91436da7..1654d50b8aef 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -185,7 +185,7 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, spin_unlock(&server->req_lock); cifs_num_waiters_inc(server); rc = wait_event_killable(server->request_q, - has_credits(server, &server->credits)); + has_credits(server, &server->credits, 1)); cifs_num_waiters_dec(server); if (rc) return rc; diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 9fcc4a82943d..1951f9f74bb2 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -486,8 +486,8 @@ smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer, } static int -wait_for_free_credits(struct TCP_Server_Info *server, const int flags, - unsigned int *instance) +wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, + const int flags, unsigned int *instance) { int rc; int *credits; @@ -513,11 +513,11 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int flags, } while (1) { - if (*credits <= 0) { + if (*credits < num_credits) { spin_unlock(&server->req_lock); cifs_num_waiters_inc(server); rc = wait_event_killable(server->request_q, - has_credits(server, credits)); + has_credits(server, credits, num_credits)); cifs_num_waiters_dec(server); if (rc) return rc; @@ -535,8 +535,8 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int flags, /* update # of requests on the wire to server */ if ((flags & CIFS_TIMEOUT_MASK) != CIFS_BLOCKING_OP) { - *credits -= 1; - server->in_flight++; + *credits -= num_credits; + server->in_flight += num_credits; *instance = server->reconnect_instance; } spin_unlock(&server->req_lock); @@ -550,7 
+550,7 @@ static int wait_for_free_request(struct TCP_Server_Info *server, const int flags, unsigned int *instance) { - return wait_for_free_credits(server, flags, instance); + return wait_for_free_credits(server, 1, flags, instance); } int From 16b34aa44b257155d9392a19e08e4ce139bc2789 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Fri, 8 Mar 2019 12:58:21 +1000 Subject: [PATCH 504/855] cifs: prevent starvation in wait_for_free_credits for multi-credit requests Reserve the last MAX_COMPOUND credits for any request asking for >1 credit. This is to prevent future compound requests from becoming starved while waiting for potentially many requests is there is a large number of concurrent singe-credit requests. However, we need to protect from servers that are very slow to hand out new credits on new sessions so we only do this IFF there are 2*MAX_COMPOUND (arbitrary) credits already in flight. Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky --- fs/cifs/transport.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 1951f9f74bb2..9e08ce722dbb 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -528,6 +528,34 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, return -ENOENT; } + /* + * For normal commands, reserve the last MAX_COMPOUND + * credits to compound requests. + * Otherwise these compounds could be permanently + * starved for credits by single-credit requests. + * + * To prevent spinning CPU, block this thread until + * there are >MAX_COMPOUND credits available. + * But only do this is we already have a lot of + * credits in flight to avoid triggering this check + * for servers that are slow to hand out credits on + * new sessions. + */ + if (!optype && num_credits == 1 && + server->in_flight > 2 * MAX_COMPOUND && + *credits <= MAX_COMPOUND) { + spin_unlock(&server->req_lock); + cifs_num_waiters_inc(server); + rc = wait_event_killable(server->request_q, + has_credits(server, credits, + MAX_COMPOUND + 1)); + cifs_num_waiters_dec(server); + if (rc) + return rc; + spin_lock(&server->req_lock); + continue; + } + /* * Can not count locking commands against total * as they are allowed to block on server. From 2b53b929faedacc6531bbb4315585cb7c14a252d Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Fri, 8 Mar 2019 12:58:22 +1000 Subject: [PATCH 505/855] cifs: add a timeout argument to wait_for_free_credits A negative timeout is the same as the current behaviour, i.e. no timeout. 
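A minimal sketch of that convention, simplified from the wait_for_free_credits() change below: the millisecond argument is converted into a jiffies bound only when it is non-negative.

	long t;

	if (timeout < 0)
		t = MAX_JIFFY_OFFSET;		/* no timeout: wait indefinitely */
	else
		t = msecs_to_jiffies(timeout);	/* bounded wait, in jiffies */

	/* t is then passed to wait_event_killable_timeout(), which returns
	 * 0 when the wait times out and -ERESTARTSYS when it is interrupted
	 * by a fatal signal. */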
Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky --- fs/cifs/transport.c | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 9e08ce722dbb..b3d04018195c 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -487,11 +487,18 @@ smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer, static int wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, - const int flags, unsigned int *instance) + const int timeout, const int flags, + unsigned int *instance) { int rc; int *credits; int optype; + long int t; + + if (timeout < 0) + t = MAX_JIFFY_OFFSET; + else + t = msecs_to_jiffies(timeout); optype = flags & CIFS_OP_MASK; @@ -516,11 +523,16 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, if (*credits < num_credits) { spin_unlock(&server->req_lock); cifs_num_waiters_inc(server); - rc = wait_event_killable(server->request_q, - has_credits(server, credits, num_credits)); + rc = wait_event_killable_timeout(server->request_q, + has_credits(server, credits, num_credits), t); cifs_num_waiters_dec(server); - if (rc) - return rc; + if (!rc) { + cifs_dbg(VFS, "wait timed out after %d ms\n", + timeout); + return -ENOTSUPP; + } + if (rc == -ERESTARTSYS) + return -ERESTARTSYS; spin_lock(&server->req_lock); } else { if (server->tcpStatus == CifsExiting) { @@ -546,12 +558,19 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, *credits <= MAX_COMPOUND) { spin_unlock(&server->req_lock); cifs_num_waiters_inc(server); - rc = wait_event_killable(server->request_q, + rc = wait_event_killable_timeout( + server->request_q, has_credits(server, credits, - MAX_COMPOUND + 1)); + MAX_COMPOUND + 1), + t); cifs_num_waiters_dec(server); - if (rc) - return rc; + if (!rc) { + cifs_dbg(VFS, "wait timed out after %d ms\n", + timeout); + return -ENOTSUPP; + } + if (rc == -ERESTARTSYS) + return -ERESTARTSYS; spin_lock(&server->req_lock); continue; } @@ -578,7 +597,8 @@ static int wait_for_free_request(struct TCP_Server_Info *server, const int flags, unsigned int *instance) { - return wait_for_free_credits(server, 1, flags, instance); + return wait_for_free_credits(server, 1, -1, flags, + instance); } int From ffd1ef1e50727e96e360e871b89aa8cf329935f7 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 9 Mar 2019 18:12:18 -0600 Subject: [PATCH 506/855] smb3: display security information in /proc/fs/cifs/DebugData more accurately When the server required encryption (but we didn't connect to it with the "seal" mount option) we weren't displaying in /proc/fs/cifs/DebugData that the tcon for that share was encrypted. Similarly we were not displaying that signing was required when ses->sign was enabled (we only checked ses->server->sign). This makes it easier to debug when in fact the connection is signed (or sealed), whether for performance or security questions. 
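In sketch form, the per-tcon report now treats a share as encrypted when any of three conditions holds (this simply summarises the cifs_debug_tcon() hunk below):

	if (tcon->seal ||
	    (tcon->ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) ||
	    (tcon->share_flags & SHI1005_FLAGS_ENCRYPT_DATA))
		seq_printf(m, " Encrypted");

and the per-session output gains "encrypted" and "signed" markers driven by the session flags and ses->sign, instead of relying on ses->server->sign alone.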
Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg --- fs/cifs/cifs_debug.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index e92a2fee3c57..faeb1452cc2d 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -115,7 +115,9 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon) seq_puts(m, " type: CDROM "); else seq_printf(m, " type: %d ", dev_type); - if (tcon->seal) + if ((tcon->seal) || + (tcon->ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) || + (tcon->share_flags & SHI1005_FLAGS_ENCRYPT_DATA)) seq_printf(m, " Encrypted"); if (tcon->nocase) seq_printf(m, " nocase"); @@ -371,6 +373,10 @@ skip_rdma: atomic_read(&server->in_send), atomic_read(&server->num_waiters)); #endif + if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) + seq_puts(m, " encrypted"); + if (ses->sign) + seq_puts(m, " signed"); seq_puts(m, "\n\tShares:"); j = 0; From 7937ca961c847bda8a75da5d9c34efee9602f6b5 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 9 Mar 2019 20:29:55 -0600 Subject: [PATCH 507/855] smb3: add dynamic tracepoint for timeout waiting for credits To help debug credit starvation problems where we timeout waiting for server to grant the client credits. Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg Reviewed-by: Pavel Shilovsky --- fs/cifs/trace.h | 1 + fs/cifs/transport.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index d8b049afa606..c78e96c4cc53 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -713,6 +713,7 @@ DEFINE_EVENT(smb3_credit_class, smb3_##name, \ TP_ARGS(currmid, hostname, credits)) DEFINE_SMB3_CREDIT_EVENT(reconnect_with_invalid_credits); +DEFINE_SMB3_CREDIT_EVENT(credit_timeout); #endif /* _CIFS_TRACE_H */ diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index b3d04018195c..8731cfa66026 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -527,6 +527,8 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, has_credits(server, credits, num_credits), t); cifs_num_waiters_dec(server); if (!rc) { + trace_smb3_credit_timeout(server->CurrentMid, + server->hostname, num_credits); cifs_dbg(VFS, "wait timed out after %d ms\n", timeout); return -ENOTSUPP; @@ -565,6 +567,9 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, t); cifs_num_waiters_dec(server); if (!rc) { + trace_smb3_credit_timeout( + server->CurrentMid, + server->hostname, num_credits); cifs_dbg(VFS, "wait timed out after %d ms\n", timeout); return -ENOTSUPP; From 257b78099be015762b1b19b3e4c58f90a9913fb7 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Mon, 11 Mar 2019 12:18:58 +1000 Subject: [PATCH 508/855] cifs: simplify how we handle credits in compound_send_recv() Since we can now wait for multiple requests atomically in wait_for_free_request() we can now greatly simplify the handling of the credits in this function. This fixes a potential deadlock where many concurrent compound requests could each have reserved 1 or 2 credits each but are all blocked waiting for the final credits they need to be able to issue the requests to the server. Set a default timeout of 60 seconds for compounded requests. 
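To make the deadlock concrete with made-up numbers: suppose the server has granted 8 credits and 8 threads each start a 3-request compound. Under the old per-request loop every thread could reserve one credit and then block waiting for two more; all 8 credits are now held, yet no request has actually been sent, so no responses arrive to replenish the pool and nobody makes progress. The new approach waits for all the needed credits atomically and bails out early when they can never arrive; in simplified sketch form (the real helper is wait_for_compound_request() in the diff below):

	spin_lock(&server->req_lock);
	if (*credits < num_rqst &&
	    server->in_flight < num_rqst - *credits) {
		/* not enough credits, and too few requests in flight to
		 * ever replenish them: fail instead of waiting forever */
		spin_unlock(&server->req_lock);
		return -ENOTSUPP;
	}
	spin_unlock(&server->req_lock);
	/* otherwise block, bounded by the 60 second timeout, until
	 * num_rqst credits can be taken in one go */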
Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky --- fs/cifs/transport.c | 110 +++++++++++++++----------------------------- 1 file changed, 38 insertions(+), 72 deletions(-) diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 8731cfa66026..1de8e996e566 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -606,6 +606,31 @@ wait_for_free_request(struct TCP_Server_Info *server, const int flags, instance); } +static int +wait_for_compound_request(struct TCP_Server_Info *server, int num, + const int flags, unsigned int *instance) +{ + int *credits; + + credits = server->ops->get_credits_field(server, flags & CIFS_OP_MASK); + + spin_lock(&server->req_lock); + if (*credits < num) { + /* + * Return immediately if not too many requests in flight since + * we will likely be stuck on waiting for credits. + */ + if (server->in_flight < num - *credits) { + spin_unlock(&server->req_lock); + return -ENOTSUPP; + } + } + spin_unlock(&server->req_lock); + + return wait_for_free_credits(server, num, 60000, flags, + instance); +} + int cifs_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, unsigned int *num, struct cifs_credits *credits) @@ -934,7 +959,6 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, { .value = 0, .instance = 0 } }; unsigned int instance; - unsigned int first_instance = 0; char *buf; optype = flags & CIFS_OP_MASK; @@ -950,80 +974,24 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, if (ses->server->tcpStatus == CifsExiting) return -ENOENT; - spin_lock(&ses->server->req_lock); - if (ses->server->credits < num_rqst) { - /* - * Return immediately if not too many requests in flight since - * we will likely be stuck on waiting for credits. - */ - if (ses->server->in_flight < num_rqst - ses->server->credits) { - spin_unlock(&ses->server->req_lock); - return -ENOTSUPP; - } - } else { - /* enough credits to send the whole compounded request */ - ses->server->credits -= num_rqst; - ses->server->in_flight += num_rqst; - first_instance = ses->server->reconnect_instance; - } - spin_unlock(&ses->server->req_lock); - - if (first_instance) { - cifs_dbg(FYI, "Acquired %d credits at once\n", num_rqst); - for (i = 0; i < num_rqst; i++) { - credits[i].value = 1; - credits[i].instance = first_instance; - } - goto setup_rqsts; - } - /* - * There are not enough credits to send the whole compound request but - * there are requests in flight that may bring credits from the server. + * Wait for all the requests to become available. * This approach still leaves the possibility to be stuck waiting for * credits if the server doesn't grant credits to the outstanding - * requests. This should be fixed by returning immediately and letting - * a caller fallback to sequential commands instead of compounding. - * Ensure we obtain 1 credit per request in the compound chain. + * requests and if the client is completely idle, not generating any + * other requests. + * This can be handled by the eventual session reconnect. */ + rc = wait_for_compound_request(ses->server, num_rqst, flags, + &instance); + if (rc) + return rc; + for (i = 0; i < num_rqst; i++) { - rc = wait_for_free_request(ses->server, flags, &instance); - - if (rc == 0) { - credits[i].value = 1; - credits[i].instance = instance; - /* - * All parts of the compound chain must get credits from - * the same session, otherwise we may end up using more - * credits than the server granted. 
If there were - * reconnects in between, return -EAGAIN and let callers - * handle it. - */ - if (i == 0) - first_instance = instance; - else if (first_instance != instance) { - i++; - rc = -EAGAIN; - } - } - - if (rc) { - /* - * We haven't sent an SMB packet to the server yet but - * we already obtained credits for i requests in the - * compound chain - need to return those credits back - * for future use. Note that we need to call add_credits - * multiple times to match the way we obtained credits - * in the first place and to account for in flight - * requests correctly. - */ - for (j = 0; j < i; j++) - add_credits(ses->server, &credits[j], optype); - return rc; - } + credits[i].value = 1; + credits[i].instance = instance; } -setup_rqsts: /* * Make sure that we sign in the same order that we send on this socket * and avoid races inside tcp sendmsg code that could cause corruption @@ -1034,14 +1002,12 @@ setup_rqsts: /* * All the parts of the compound chain belong obtained credits from the - * same session (see the appropriate checks above). In the same time - * there might be reconnects after those checks but before we acquired - * the srv_mutex. We can not use credits obtained from the previous + * same session. We can not use credits obtained from the previous * session to send this request. Check if there were reconnects after * we obtained credits and return -EAGAIN in such cases to let callers * handle it. */ - if (first_instance != ses->server->reconnect_instance) { + if (instance != ses->server->reconnect_instance) { mutex_unlock(&ses->server->srv_mutex); for (j = 0; j < num_rqst; j++) add_credits(ses->server, &credits[j], optype); From ab7b10cf4fd14496292c62099f3a0a1c1ef90c0e Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 12 Mar 2019 01:29:03 -0500 Subject: [PATCH 509/855] smb3: display volume serial number for shares in /proc/fs/cifs/DebugData It can be helpful for debugging. According to MS-FSCC: "A 32-bit unsigned integer that contains the serial number of the volume. The serial number is an opaque value generated by the file system at format time" Signed-off-by: Steve French Acked-by: Pavel Shilovsky --- fs/cifs/cifs_debug.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index faeb1452cc2d..13c1288b04a7 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -115,6 +115,9 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon) seq_puts(m, " type: CDROM "); else seq_printf(m, " type: %d ", dev_type); + + seq_printf(m, "Serial Number: 0x%x", tcon->vol_serial_number); + if ((tcon->seal) || (tcon->ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) || (tcon->share_flags & SHI1005_FLAGS_ENCRYPT_DATA)) From b0f6df737a1c5d9d5ff90cc946205d8af46cc067 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Tue, 12 Mar 2019 13:58:31 +1000 Subject: [PATCH 510/855] cifs: cache FILE_ALL_INFO for the shared root handle When we open the shared root handle also ask for FILE_ALL_INFORMATION since we can do this at zero cost as part of a compound. Cache this information as long as the lease is held and return and serve any future requests from cache. This allows us to serve "stat /" directly from cache and avoid a network roundtrip. Since clients often want to do this quite a lot this improve performance slightly. As an example: xfstest generic/533 performs 43 stat operations on the root of the share while it is run. Which are eliminated with this patch. 
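A rough sketch of the resulting query path, simplified from the smb2_query_path_info() hunk below (field names as introduced by this patch):

	rc = open_shroot(xid, tcon, &fid);
	if (rc)
		goto out;

	if (tcon->crfid.file_all_info_is_valid) {
		/* served from the cached root handle, no network round trip */
		move_smb2_info_to_cifs(data, &tcon->crfid.file_all_info);
	} else {
		rc = SMB2_query_info(xid, tcon, fid.persistent_fid,
				     fid.volatile_fid, smb2_data);
		if (!rc)
			move_smb2_info_to_cifs(data, smb2_data);
	}
	close_shroot(&tcon->crfid);

The cached copy is only trusted while the lease on the root handle is held; a lease break or close of the cached fid clears file_all_info_is_valid and the next query goes back to the server.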
Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky --- fs/cifs/cifsglob.h | 3 ++ fs/cifs/smb2inode.c | 15 ++++-- fs/cifs/smb2ops.c | 111 +++++++++++++++++++++++++++++++++++++------- fs/cifs/smb2pdu.c | 12 +++-- fs/cifs/smb2proto.h | 3 ++ 5 files changed, 117 insertions(+), 27 deletions(-) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index ddb299494cd6..8b9ecf8c2986 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -962,11 +962,14 @@ cap_unix(struct cifs_ses *ses) struct cached_fid { bool is_valid:1; /* Do we have a useable root fid */ + bool file_all_info_is_valid:1; + struct kref refcount; struct cifs_fid *fid; struct mutex fid_mutex; struct cifs_tcon *tcon; struct work_struct lease_break; + struct smb2_file_all_info file_all_info; }; /* diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 01a76bccdb8d..b6e07e2eed10 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -309,12 +309,17 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, rc = open_shroot(xid, tcon, &fid); if (rc) goto out; - rc = SMB2_query_info(xid, tcon, fid.persistent_fid, - fid.volatile_fid, smb2_data); + + if (tcon->crfid.file_all_info_is_valid) { + move_smb2_info_to_cifs(data, + &tcon->crfid.file_all_info); + } else { + rc = SMB2_query_info(xid, tcon, fid.persistent_fid, + fid.volatile_fid, smb2_data); + if (!rc) + move_smb2_info_to_cifs(data, smb2_data); + } close_shroot(&tcon->crfid); - if (rc) - goto out; - move_smb2_info_to_cifs(data, smb2_data); goto out; } diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 1654d50b8aef..a338190b97fa 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -619,6 +619,7 @@ smb2_close_cached_fid(struct kref *ref) SMB2_close(0, cfid->tcon, cfid->fid->persistent_fid, cfid->fid->volatile_fid); cfid->is_valid = false; + cfid->file_all_info_is_valid = false; } } @@ -643,9 +644,18 @@ smb2_cached_lease_break(struct work_struct *work) */ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) { - struct cifs_open_parms oparams; - int rc; - __le16 srch_path = 0; /* Null - since an open of top of share */ + struct cifs_ses *ses = tcon->ses; + struct TCP_Server_Info *server = ses->server; + struct cifs_open_parms oparms; + struct smb2_create_rsp *o_rsp = NULL; + struct smb2_query_info_rsp *qi_rsp = NULL; + int resp_buftype[2]; + struct smb_rqst rqst[2]; + struct kvec rsp_iov[2]; + struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; + struct kvec qi_iov[1]; + int rc, flags = 0; + __le16 utf16_path = 0; /* Null - since an open of top of share */ u8 oplock = SMB2_OPLOCK_LEVEL_II; mutex_lock(&tcon->crfid.fid_mutex); @@ -657,22 +667,89 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) return 0; } - oparams.tcon = tcon; - oparams.create_options = 0; - oparams.desired_access = FILE_READ_ATTRIBUTES; - oparams.disposition = FILE_OPEN; - oparams.fid = pfid; - oparams.reconnect = false; + if (smb3_encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; - rc = SMB2_open(xid, &oparams, &srch_path, &oplock, NULL, NULL, NULL); - if (rc == 0) { - memcpy(tcon->crfid.fid, pfid, sizeof(struct cifs_fid)); - tcon->crfid.tcon = tcon; - tcon->crfid.is_valid = true; - kref_init(&tcon->crfid.refcount); - kref_get(&tcon->crfid.refcount); - } + memset(rqst, 0, sizeof(rqst)); + resp_buftype[0] = resp_buftype[1] = CIFS_NO_BUFFER; + memset(rsp_iov, 0, sizeof(rsp_iov)); + + /* Open */ + memset(&open_iov, 0, sizeof(open_iov)); + rqst[0].rq_iov = open_iov; + rqst[0].rq_nvec = 
SMB2_CREATE_IOV_SIZE; + + oparms.tcon = tcon; + oparms.create_options = 0; + oparms.desired_access = FILE_READ_ATTRIBUTES; + oparms.disposition = FILE_OPEN; + oparms.fid = pfid; + oparms.reconnect = false; + + rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, &utf16_path); + if (rc) + goto oshr_exit; + smb2_set_next_command(tcon, &rqst[0]); + + memset(&qi_iov, 0, sizeof(qi_iov)); + rqst[1].rq_iov = qi_iov; + rqst[1].rq_nvec = 1; + + rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID, + COMPOUND_FID, FILE_ALL_INFORMATION, + SMB2_O_INFO_FILE, 0, + sizeof(struct smb2_file_all_info) + + PATH_MAX * 2, 0, NULL); + if (rc) + goto oshr_exit; + + smb2_set_related(&rqst[1]); + + rc = compound_send_recv(xid, ses, flags, 2, rqst, + resp_buftype, rsp_iov); + if (rc) + goto oshr_exit; + + o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base; + oparms.fid->persistent_fid = o_rsp->PersistentFileId; + oparms.fid->volatile_fid = o_rsp->VolatileFileId; +#ifdef CONFIG_CIFS_DEBUG2 + oparms.fid->mid = le64_to_cpu(o_rsp->sync_hdr.MessageId); +#endif /* CIFS_DEBUG2 */ + + if (o_rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) + oplock = smb2_parse_lease_state(server, o_rsp, + &oparms.fid->epoch, + oparms.fid->lease_key); + else + goto oshr_exit; + + + memcpy(tcon->crfid.fid, pfid, sizeof(struct cifs_fid)); + tcon->crfid.tcon = tcon; + tcon->crfid.is_valid = true; + kref_init(&tcon->crfid.refcount); + kref_get(&tcon->crfid.refcount); + + + qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base; + if (le32_to_cpu(qi_rsp->OutputBufferLength) < sizeof(struct smb2_file_all_info)) + goto oshr_exit; + rc = smb2_validate_and_copy_iov( + le16_to_cpu(qi_rsp->OutputBufferOffset), + sizeof(struct smb2_file_all_info), + &rsp_iov[1], sizeof(struct smb2_file_all_info), + (char *)&tcon->crfid.file_all_info); + if (rc) + goto oshr_exit; + tcon->crfid.file_all_info_is_valid = 1; + + oshr_exit: mutex_unlock(&tcon->crfid.fid_mutex); + SMB2_open_free(&rqst[0]); + SMB2_query_info_free(&rqst[1]); + free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); + free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); return rc; } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 60fbe306f604..cfe9fe41ccf5 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1797,9 +1797,10 @@ create_reconnect_durable_buf(struct cifs_fid *fid) return buf; } -static __u8 -parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp, - unsigned int *epoch, char *lease_key) +__u8 +smb2_parse_lease_state(struct TCP_Server_Info *server, + struct smb2_create_rsp *rsp, + unsigned int *epoch, char *lease_key) { char *data_offset; struct create_context *cc; @@ -2456,8 +2457,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, } if (rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) - *oplock = parse_lease_state(server, rsp, &oparms->fid->epoch, - oparms->fid->lease_key); + *oplock = smb2_parse_lease_state(server, rsp, + &oparms->fid->epoch, + oparms->fid->lease_key); else *oplock = rsp->OplockLevel; creat_exit: diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 87733b27a65f..72cc563c32fe 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -223,6 +223,9 @@ extern int smb3_validate_negotiate(const unsigned int, struct cifs_tcon *); extern enum securityEnum smb2_select_sectype(struct TCP_Server_Info *, enum securityEnum); +extern __u8 smb2_parse_lease_state(struct TCP_Server_Info *server, + struct smb2_create_rsp *rsp, + unsigned int *epoch, char *lease_key); extern int 
smb3_encryption_required(const struct cifs_tcon *tcon); extern int smb2_validate_iov(unsigned int offset, unsigned int buffer_length, struct kvec *iov, unsigned int min_buf_size); From 8191576a1249763b246164b323003bec084cc8a2 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 13 Mar 2019 00:02:47 -0500 Subject: [PATCH 511/855] smb3: Add dynamic trace points for various compounded smb3 ops Adds trace points for enter and exit (done vs. error) for: compounded query and setinfo, hardlink, rename, mkdir, rmdir, set_eof, delete (unlink) Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg --- fs/cifs/smb2inode.c | 72 +++++++++++++++++++++++++-- fs/cifs/trace.h | 117 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 185 insertions(+), 4 deletions(-) diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index b6e07e2eed10..278405d26c47 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -37,6 +37,16 @@ #include "smb2pdu.h" #include "smb2proto.h" +static void +free_set_inf_compound(struct smb_rqst *rqst) +{ + if (rqst[1].rq_iov) + SMB2_set_info_free(&rqst[1]); + if (rqst[2].rq_iov) + SMB2_close_free(&rqst[2]); +} + + static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, @@ -112,14 +122,18 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, PATH_MAX * 2, 0, NULL); smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); + trace_smb3_query_info_compound_enter(xid, ses->Suid, tcon->tid, + full_path); break; case SMB2_OP_DELETE: + trace_smb3_delete_enter(xid, ses->Suid, tcon->tid, full_path); break; case SMB2_OP_MKDIR: /* * Directories are created through parameters in the * SMB2_open() call. */ + trace_smb3_mkdir_enter(xid, ses->Suid, tcon->tid, full_path); break; case SMB2_OP_RMDIR: memset(&si_iov, 0, sizeof(si_iov)); @@ -135,6 +149,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, SMB2_O_INFO_FILE, 0, data, size); smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); + trace_smb3_rmdir_enter(xid, ses->Suid, tcon->tid, full_path); break; case SMB2_OP_SET_EOF: memset(&si_iov, 0, sizeof(si_iov)); @@ -150,6 +165,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, SMB2_O_INFO_FILE, 0, data, size); smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); + trace_smb3_set_eof_enter(xid, ses->Suid, tcon->tid, full_path); break; case SMB2_OP_SET_INFO: memset(&si_iov, 0, sizeof(si_iov)); @@ -166,6 +182,8 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, SMB2_O_INFO_FILE, 0, data, size); smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); + trace_smb3_set_info_compound_enter(xid, ses->Suid, tcon->tid, + full_path); break; case SMB2_OP_RENAME: memset(&si_iov, 0, sizeof(si_iov)); @@ -190,6 +208,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, SMB2_O_INFO_FILE, 0, data, size); smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); + trace_smb3_rename_enter(xid, ses->Suid, tcon->tid, full_path); break; case SMB2_OP_HARDLINK: memset(&si_iov, 0, sizeof(si_iov)); @@ -214,6 +233,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, SMB2_O_INFO_FILE, 0, data, size); smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); + trace_smb3_hardlink_enter(xid, ses->Suid, tcon->tid, full_path); break; default: cifs_dbg(VFS, "Invalid command\n"); @@ 
-252,21 +272,65 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, SMB2_query_info_free(&rqst[1]); if (rqst[2].rq_iov) SMB2_close_free(&rqst[2]); + if (rc) + trace_smb3_query_info_compound_err(xid, ses->Suid, + tcon->tid, rc); + else + trace_smb3_query_info_compound_done(xid, ses->Suid, + tcon->tid); break; case SMB2_OP_DELETE: + if (rc) + trace_smb3_delete_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_delete_done(xid, ses->Suid, tcon->tid); + if (rqst[1].rq_iov) + SMB2_close_free(&rqst[1]); + break; case SMB2_OP_MKDIR: + if (rc) + trace_smb3_mkdir_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_mkdir_done(xid, ses->Suid, tcon->tid); if (rqst[1].rq_iov) SMB2_close_free(&rqst[1]); break; case SMB2_OP_HARDLINK: + if (rc) + trace_smb3_hardlink_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_hardlink_done(xid, ses->Suid, tcon->tid); + free_set_inf_compound(rqst); + break; case SMB2_OP_RENAME: + if (rc) + trace_smb3_rename_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_rename_done(xid, ses->Suid, tcon->tid); + free_set_inf_compound(rqst); + break; case SMB2_OP_RMDIR: + if (rc) + trace_smb3_rmdir_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_rmdir_done(xid, ses->Suid, tcon->tid); + free_set_inf_compound(rqst); + break; case SMB2_OP_SET_EOF: + if (rc) + trace_smb3_set_eof_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_set_eof_done(xid, ses->Suid, tcon->tid); + free_set_inf_compound(rqst); + break; case SMB2_OP_SET_INFO: - if (rqst[1].rq_iov) - SMB2_set_info_free(&rqst[1]); - if (rqst[2].rq_iov) - SMB2_close_free(&rqst[2]); + if (rc) + trace_smb3_set_info_compound_err(xid, ses->Suid, + tcon->tid, rc); + else + trace_smb3_set_info_compound_done(xid, ses->Suid, + tcon->tid); + free_set_inf_compound(rqst); break; } free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index c78e96c4cc53..30bf51c7e8fe 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -242,6 +242,123 @@ DEFINE_SMB3_INF_ERR_EVENT(query_info_err); DEFINE_SMB3_INF_ERR_EVENT(set_info_err); DEFINE_SMB3_INF_ERR_EVENT(fsctl_err); +DECLARE_EVENT_CLASS(smb3_inf_compound_enter_class, + TP_PROTO(unsigned int xid, + __u32 tid, + __u64 sesid, + const char *full_path), + TP_ARGS(xid, tid, sesid, full_path), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u32, tid) + __field(__u64, sesid) + __string(path, full_path) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->tid = tid; + __entry->sesid = sesid; + __assign_str(path, full_path); + ), + TP_printk("xid=%u sid=0x%llx tid=0x%x path=%s", + __entry->xid, __entry->sesid, __entry->tid, + __get_str(path)) +) + +#define DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(name) \ +DEFINE_EVENT(smb3_inf_compound_enter_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u32 tid, \ + __u64 sesid, \ + const char *full_path), \ + TP_ARGS(xid, tid, sesid, full_path)) + +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(query_info_compound_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(hardlink_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rename_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rmdir_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_eof_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_info_compound_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(delete_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mkdir_enter); + + +DECLARE_EVENT_CLASS(smb3_inf_compound_done_class, + TP_PROTO(unsigned int xid, + __u32 tid, + __u64 sesid), + TP_ARGS(xid, tid, sesid), + TP_STRUCT__entry( + __field(unsigned 
int, xid) + __field(__u32, tid) + __field(__u64, sesid) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->tid = tid; + __entry->sesid = sesid; + ), + TP_printk("xid=%u sid=0x%llx tid=0x%x", + __entry->xid, __entry->sesid, __entry->tid) +) + +#define DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(name) \ +DEFINE_EVENT(smb3_inf_compound_done_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u32 tid, \ + __u64 sesid), \ + TP_ARGS(xid, tid, sesid)) + +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_info_compound_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(hardlink_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rename_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rmdir_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_eof_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_info_compound_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done); + + +DECLARE_EVENT_CLASS(smb3_inf_compound_err_class, + TP_PROTO(unsigned int xid, + __u32 tid, + __u64 sesid, + int rc), + TP_ARGS(xid, tid, sesid, rc), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u32, tid) + __field(__u64, sesid) + __field(int, rc) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->tid = tid; + __entry->sesid = sesid; + __entry->rc = rc; + ), + TP_printk("xid=%u sid=0x%llx tid=0x%x rc=%d", + __entry->xid, __entry->sesid, __entry->tid, + __entry->rc) +) + +#define DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(name) \ +DEFINE_EVENT(smb3_inf_compound_err_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u32 tid, \ + __u64 sesid, \ + int rc), \ + TP_ARGS(xid, tid, sesid, rc)) + +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_info_compound_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(hardlink_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rename_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rmdir_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_eof_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_info_compound_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err); + /* * For logging SMB3 Status code and Command for responses which return errors */ From ccdc77a3054afb851473e43f4d60890263cd22e9 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Wed, 13 Mar 2019 14:37:48 +1000 Subject: [PATCH 512/855] cifs: add SMB2_ioctl_init/free helpers to be used with compounding Define an _init() and a _free() function for SMB2_init so that we will be able to use it with compounds. 
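The new helpers follow the same _init()/_free() split already used for open, query_info, set_info and close, so an ioctl can become one link in a compound chain. A rough sketch of the intended calling sequence (not part of this patch; error handling and response-buffer cleanup are trimmed, and xid, flags, tcon, the file ids and fsctl_buf are assumed to come from the caller's context):

    struct smb_rqst rqst[2];
    struct kvec io_iov[2];              /* ioctl request + optional in_data */
    struct kvec rsp_iov[2];
    int resp_buftype[2] = { CIFS_NO_BUFFER, CIFS_NO_BUFFER };
    int rc;

    memset(rqst, 0, sizeof(rqst));
    memset(io_iov, 0, sizeof(io_iov));
    memset(rsp_iov, 0, sizeof(rsp_iov));
    rqst[0].rq_iov = io_iov;
    rqst[0].rq_nvec = 2;

    rc = SMB2_ioctl_init(tcon, &rqst[0], persistent_fid, volatile_fid,
                         FSCTL_SET_ZERO_DATA, true /* is_fsctl */,
                         (char *)&fsctl_buf, sizeof(fsctl_buf));
    if (rc)
        goto out;

    /* more requests may be chained here with smb2_set_next_command()
     * and smb2_set_related() before the compound is sent */
    rc = compound_send_recv(xid, tcon->ses, flags, 1, rqst,
                            resp_buftype, rsp_iov);
 out:
    SMB2_ioctl_free(&rqst[0]);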
Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 133 +++++++++++++++++++++++++------------------- fs/cifs/smb2proto.h | 4 ++ 2 files changed, 80 insertions(+), 57 deletions(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index cfe9fe41ccf5..481d64d2cc12 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2468,65 +2468,46 @@ creat_exit: return rc; } -/* - * SMB2 IOCTL is used for both IOCTLs and FSCTLs - */ int -SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, - u64 volatile_fid, u32 opcode, bool is_fsctl, - char *in_data, u32 indatalen, - char **out_data, u32 *plen /* returned data len */) +SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, + u64 persistent_fid, u64 volatile_fid, u32 opcode, + bool is_fsctl, char *in_data, u32 indatalen) { - struct smb_rqst rqst; struct smb2_ioctl_req *req; - struct smb2_ioctl_rsp *rsp; - struct cifs_ses *ses; - struct kvec iov[2]; - struct kvec rsp_iov; - int resp_buftype; - int n_iov; - int rc = 0; - int flags = 0; + struct kvec *iov = rqst->rq_iov; unsigned int total_len; - - cifs_dbg(FYI, "SMB2 IOCTL\n"); - - if (out_data != NULL) - *out_data = NULL; - - /* zero out returned data len, in case of error */ - if (plen) - *plen = 0; - - if (tcon) - ses = tcon->ses; - else - return -EIO; - - if (!ses || !(ses->server)) - return -EIO; + int rc; rc = smb2_plain_req_init(SMB2_IOCTL, tcon, (void **) &req, &total_len); if (rc) return rc; - if (smb3_encryption_required(tcon)) - flags |= CIFS_TRANSFORM_REQ; - req->CtlCode = cpu_to_le32(opcode); req->PersistentFileId = persistent_fid; req->VolatileFileId = volatile_fid; + iov[0].iov_base = (char *)req; + /* + * If no input data, the size of ioctl struct in + * protocol spec still includes a 1 byte data buffer, + * but if input data passed to ioctl, we do not + * want to double count this, so we do not send + * the dummy one byte of data in iovec[0] if sending + * input data (in iovec[1]). + */ if (indatalen) { req->InputCount = cpu_to_le32(indatalen); /* do not set InputOffset if no input data */ req->InputOffset = cpu_to_le32(offsetof(struct smb2_ioctl_req, Buffer)); + rqst->rq_nvec = 2; + iov[0].iov_len = total_len - 1; iov[1].iov_base = in_data; iov[1].iov_len = indatalen; - n_iov = 2; - } else - n_iov = 1; + } else { + rqst->rq_nvec = 1; + iov[0].iov_len = total_len; + } req->OutputOffset = 0; req->OutputCount = 0; /* MBZ */ @@ -2548,33 +2529,70 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, else req->Flags = 0; - iov[0].iov_base = (char *)req; - - /* - * If no input data, the size of ioctl struct in - * protocol spec still includes a 1 byte data buffer, - * but if input data passed to ioctl, we do not - * want to double count this, so we do not send - * the dummy one byte of data in iovec[0] if sending - * input data (in iovec[1]). 
- */ - - if (indatalen) { - iov[0].iov_len = total_len - 1; - } else - iov[0].iov_len = total_len; - /* validate negotiate request must be signed - see MS-SMB2 3.2.5.5 */ if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO) req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; + return 0; +} + +void +SMB2_ioctl_free(struct smb_rqst *rqst) +{ + if (rqst && rqst->rq_iov) + cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */ +} + +/* + * SMB2 IOCTL is used for both IOCTLs and FSCTLs + */ +int +SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, + u64 volatile_fid, u32 opcode, bool is_fsctl, + char *in_data, u32 indatalen, + char **out_data, u32 *plen /* returned data len */) +{ + struct smb_rqst rqst; + struct smb2_ioctl_rsp *rsp = NULL; + struct cifs_ses *ses; + struct kvec iov[2]; + struct kvec rsp_iov = {NULL, 0}; + int resp_buftype = CIFS_NO_BUFFER; + int rc = 0; + int flags = 0; + + cifs_dbg(FYI, "SMB2 IOCTL\n"); + + if (out_data != NULL) + *out_data = NULL; + + /* zero out returned data len, in case of error */ + if (plen) + *plen = 0; + + if (tcon) + ses = tcon->ses; + else + return -EIO; + + if (!ses || !(ses->server)) + return -EIO; + + if (smb3_encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + memset(&rqst, 0, sizeof(struct smb_rqst)); + memset(&iov, 0, sizeof(iov)); rqst.rq_iov = iov; - rqst.rq_nvec = n_iov; + rqst.rq_nvec = 2; + + rc = SMB2_ioctl_init(tcon, &rqst, persistent_fid, volatile_fid, + opcode, is_fsctl, in_data, indatalen); + if (rc) + goto ioctl_exit; rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); - cifs_small_buf_release(req); rsp = (struct smb2_ioctl_rsp *)rsp_iov.iov_base; if (rc != 0) @@ -2624,6 +2642,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, } ioctl_exit: + SMB2_ioctl_free(&rqst); free_rsp_buf(resp_buftype, rsp); return rc; } diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 72cc563c32fe..3c32d0cfea69 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -144,6 +144,10 @@ extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, u32 opcode, bool is_fsctl, char *in_data, u32 indatalen, char **out_data, u32 *plen /* returned data len */); +extern int SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, + u64 persistent_fid, u64 volatile_fid, u32 opcode, + bool is_fsctl, char *in_data, u32 indatalen); +extern void SMB2_ioctl_free(struct smb_rqst *rqst); extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id); extern int SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon, From 72c419d9b073628d3b5b0b2fc787b724f1a8c726 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Wed, 13 Mar 2019 14:37:49 +1000 Subject: [PATCH 513/855] cifs: fix smb3_zero_range so it can expand the file-size when required This allows fallocate -z to work against a Windows2016 share. This is due to the SMB3 ZERO_RANGE command does not modify the filesize. To address this we will now append a compounded SET-INFO to update the end-of-file information. This brings xfstests generic/469 closer to working against a windows share. 
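A minimal userspace sketch of the case this addresses (the mount point and file name are illustrative; fallocate -z from util-linux issues the same FALLOC_FL_ZERO_RANGE request, and header details may vary slightly by libc):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <unistd.h>
    #include <linux/falloc.h>

    int main(void)
    {
            /* file on an SMB3 mount; path is illustrative */
            int fd = open("/mnt/smb3/testfile", O_RDWR | O_CREAT, 0644);

            if (fd < 0)
                    return 1;
            /* zeroing a range on an empty file without FALLOC_FL_KEEP_SIZE
             * must also grow i_size, which the SMB3 ZERO_RANGE fsctl alone
             * does not do; hence the compounded SET_INFO of end-of-file */
            if (fallocate(fd, FALLOC_FL_ZERO_RANGE, 0, 65536))
                    return 1;
            close(fd);
            return 0;
    }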
Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/smb2ops.c | 76 ++++++++++++++++++++++++++++++++++++----------- fs/cifs/smb2pdu.c | 4 +-- fs/cifs/smb2pdu.h | 7 +++++ 3 files changed, 68 insertions(+), 19 deletions(-) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index a338190b97fa..bd3d2288161c 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -2549,12 +2549,22 @@ get_smb2_acl(struct cifs_sb_info *cifs_sb, static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, loff_t offset, loff_t len, bool keep_size) { + struct cifs_ses *ses = tcon->ses; struct inode *inode; struct cifsInodeInfo *cifsi; struct cifsFileInfo *cfile = file->private_data; struct file_zero_data_information fsctl_buf; + struct smb_rqst rqst[2]; + int resp_buftype[2]; + struct kvec rsp_iov[2]; + struct kvec io_iov[SMB2_IOCTL_IOV_SIZE]; + struct kvec si_iov[1]; + unsigned int size[1]; + void *data[1]; long rc; unsigned int xid; + int num = 0, flags = 0; + __le64 eof; xid = get_xid(); @@ -2579,28 +2589,60 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, return rc; } - /* - * need to make sure we are not asked to extend the file since the SMB3 - * fsctl does not change the file size. In the future we could change - * this to zero the first part of the range then set the file size - * which for a non sparse file would zero the newly extended range - */ - if (keep_size == false) - if (i_size_read(inode) < offset + len) { - rc = -EOPNOTSUPP; - free_xid(xid); - return rc; - } - cifs_dbg(FYI, "offset %lld len %lld", offset, len); fsctl_buf.FileOffset = cpu_to_le64(offset); fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len); - rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, - cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, - true /* is_fctl */, (char *)&fsctl_buf, - sizeof(struct file_zero_data_information), NULL, NULL); + if (smb3_encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + + memset(rqst, 0, sizeof(rqst)); + resp_buftype[0] = resp_buftype[1] = CIFS_NO_BUFFER; + memset(rsp_iov, 0, sizeof(rsp_iov)); + + + memset(&io_iov, 0, sizeof(io_iov)); + rqst[num].rq_iov = io_iov; + rqst[num].rq_nvec = SMB2_IOCTL_IOV_SIZE; + rc = SMB2_ioctl_init(tcon, &rqst[num++], cfile->fid.persistent_fid, + cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, + true /* is_fctl */, (char *)&fsctl_buf, + sizeof(struct file_zero_data_information)); + if (rc) + goto zero_range_exit; + + /* + * do we also need to change the size of the file? 
+ */ + if (keep_size == false && i_size_read(inode) < offset + len) { + smb2_set_next_command(tcon, &rqst[0]); + + memset(&si_iov, 0, sizeof(si_iov)); + rqst[num].rq_iov = si_iov; + rqst[num].rq_nvec = 1; + + eof = cpu_to_le64(offset + len); + size[0] = 8; /* sizeof __le64 */ + data[0] = &eof; + + rc = SMB2_set_info_init(tcon, &rqst[num++], + cfile->fid.persistent_fid, + cfile->fid.persistent_fid, + current->tgid, + FILE_END_OF_FILE_INFORMATION, + SMB2_O_INFO_FILE, 0, data, size); + smb2_set_related(&rqst[1]); + } + + rc = compound_send_recv(xid, ses, flags, num, rqst, + resp_buftype, rsp_iov); + + zero_range_exit: + SMB2_ioctl_free(&rqst[0]); + SMB2_set_info_free(&rqst[1]); + free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); + free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); free_xid(xid); return rc; } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 481d64d2cc12..c399e09b76e6 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2555,7 +2555,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, struct smb_rqst rqst; struct smb2_ioctl_rsp *rsp = NULL; struct cifs_ses *ses; - struct kvec iov[2]; + struct kvec iov[SMB2_IOCTL_IOV_SIZE]; struct kvec rsp_iov = {NULL, 0}; int resp_buftype = CIFS_NO_BUFFER; int rc = 0; @@ -2584,7 +2584,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, memset(&rqst, 0, sizeof(struct smb_rqst)); memset(&iov, 0, sizeof(iov)); rqst.rq_iov = iov; - rqst.rq_nvec = 2; + rqst.rq_nvec = SMB2_IOCTL_IOV_SIZE; rc = SMB2_ioctl_init(tcon, &rqst, persistent_fid, volatile_fid, opcode, is_fsctl, in_data, indatalen); diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 0bd4d4802701..ee8977688e21 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -959,6 +959,13 @@ struct duplicate_extents_to_file { __le64 ByteCount; /* Bytes to be copied */ } __packed; +/* + * Maximum number of iovs we need for an ioctl request. + * [0] : struct smb2_ioctl_req + * [1] : in_data + */ +#define SMB2_IOCTL_IOV_SIZE 2 + struct smb2_ioctl_req { struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 57 */ From 779ede040dd491acdb076ed9660d7160228949fd Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 13 Mar 2019 01:41:49 -0500 Subject: [PATCH 514/855] smb3: add dynamic tracepoints for simple fallocate and zero range Can be helpful in debugging various xfstests that are currently skipped or failing due to missing features in our current implementation of fallocate. 
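The pattern is the same for every operation covered: an _enter event on the way in and a matching _done or _err event on the way out. A condensed sketch of the zero-range case follows (the locals are assumed from the surrounding function and the middle call is only a placeholder for the real compounded request; the actual hunks are in the diff below):

    trace_smb3_zero_enter(xid, cfile->fid.persistent_fid, tcon->tid,
                          ses->Suid, offset, len);

    rc = do_the_compounded_zero_range();   /* placeholder, not a real function */

    if (rc)
            trace_smb3_zero_err(xid, cfile->fid.persistent_fid, tcon->tid,
                                ses->Suid, offset, len, rc);
    else
            trace_smb3_zero_done(xid, cfile->fid.persistent_fid, tcon->tid,
                                 ses->Suid, offset, len);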
Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg --- fs/cifs/smb2ops.c | 32 ++++++++++++++++++++++++++++++++ fs/cifs/trace.h | 6 ++++++ 2 files changed, 38 insertions(+) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index bd3d2288161c..823a58550dfd 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -2571,10 +2571,16 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, inode = d_inode(cfile->dentry); cifsi = CIFS_I(inode); + trace_smb3_zero_enter(xid, cfile->fid.persistent_fid, tcon->tid, + ses->Suid, offset, len); + + /* if file not oplocked can't be sure whether asking to extend size */ if (!CIFS_CACHE_READ(cifsi)) if (keep_size == false) { rc = -EOPNOTSUPP; + trace_smb3_zero_err(xid, cfile->fid.persistent_fid, + tcon->tid, ses->Suid, offset, len, rc); free_xid(xid); return rc; } @@ -2585,6 +2591,8 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, */ if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE)) { rc = -EOPNOTSUPP; + trace_smb3_zero_err(xid, cfile->fid.persistent_fid, tcon->tid, + ses->Suid, offset, len, rc); free_xid(xid); return rc; } @@ -2644,6 +2652,12 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); free_xid(xid); + if (rc) + trace_smb3_zero_err(xid, cfile->fid.persistent_fid, tcon->tid, + ses->Suid, offset, len, rc); + else + trace_smb3_zero_done(xid, cfile->fid.persistent_fid, tcon->tid, + ses->Suid, offset, len); return rc; } @@ -2698,9 +2712,13 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, inode = d_inode(cfile->dentry); cifsi = CIFS_I(inode); + trace_smb3_falloc_enter(xid, cfile->fid.persistent_fid, tcon->tid, + tcon->ses->Suid, off, len); /* if file not oplocked can't be sure whether asking to extend size */ if (!CIFS_CACHE_READ(cifsi)) if (keep_size == false) { + trace_smb3_falloc_err(xid, cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, off, len, rc); free_xid(xid); return rc; } @@ -2720,6 +2738,12 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, /* BB: in future add else clause to extend file */ else rc = -EOPNOTSUPP; + if (rc) + trace_smb3_falloc_err(xid, cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, off, len, rc); + else + trace_smb3_falloc_done(xid, cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, off, len); free_xid(xid); return rc; } @@ -2735,6 +2759,8 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, */ if ((off > 8192) || (off + len + 8192 < i_size_read(inode))) { rc = -EOPNOTSUPP; + trace_smb3_falloc_err(xid, cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, off, len, rc); free_xid(xid); return rc; } @@ -2743,6 +2769,12 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, } /* BB: else ... 
in future add code to extend file and set sparse */ + if (rc) + trace_smb3_falloc_err(xid, cfile->fid.persistent_fid, tcon->tid, + tcon->ses->Suid, off, len, rc); + else + trace_smb3_falloc_done(xid, cfile->fid.persistent_fid, tcon->tid, + tcon->ses->Suid, off, len); free_xid(xid); return rc; diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index 30bf51c7e8fe..fa226de48ef3 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -59,6 +59,8 @@ DEFINE_EVENT(smb3_rw_err_class, smb3_##name, \ DEFINE_SMB3_RW_ERR_EVENT(write_err); DEFINE_SMB3_RW_ERR_EVENT(read_err); DEFINE_SMB3_RW_ERR_EVENT(query_dir_err); +DEFINE_SMB3_RW_ERR_EVENT(zero_err); +DEFINE_SMB3_RW_ERR_EVENT(falloc_err); /* For logging successful read or write */ @@ -104,9 +106,13 @@ DEFINE_EVENT(smb3_rw_done_class, smb3_##name, \ DEFINE_SMB3_RW_DONE_EVENT(write_enter); DEFINE_SMB3_RW_DONE_EVENT(read_enter); DEFINE_SMB3_RW_DONE_EVENT(query_dir_enter); +DEFINE_SMB3_RW_DONE_EVENT(zero_enter); +DEFINE_SMB3_RW_DONE_EVENT(falloc_enter); DEFINE_SMB3_RW_DONE_EVENT(write_done); DEFINE_SMB3_RW_DONE_EVENT(read_done); DEFINE_SMB3_RW_DONE_EVENT(query_dir_done); +DEFINE_SMB3_RW_DONE_EVENT(zero_done); +DEFINE_SMB3_RW_DONE_EVENT(falloc_done); /* * For handle based calls other than read and write, and get/set info From 31ba4331d571f501fb32ae072478787e77baf52a Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 13 Mar 2019 02:40:07 -0500 Subject: [PATCH 515/855] SMB3: passthru query info doesn't check for SMB3 FSCTL passthru The passthrough queries from user space tools like smbinfo can be either SMB3 QUERY_INFO or SMB3 FSCTL, but we are not checking for the latter. Temporarily we return EOPNOTSUPP for SMB3 FSCTL passthrough requests but once compounding fsctls is fixed can enable. Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg --- fs/cifs/cifs_ioctl.h | 3 +++ fs/cifs/smb2ops.c | 24 ++++++++++++++++++------ 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h index d8bce2f862de..086ddc5108af 100644 --- a/fs/cifs/cifs_ioctl.h +++ b/fs/cifs/cifs_ioctl.h @@ -43,6 +43,9 @@ struct smb_snapshot_array { /* snapshots[]; */ } __packed; +/* query_info flags */ +#define PASSTHRU_QUERY_INFO 0x00000000 +#define PASSTHRU_FSCTL 0x00000001 struct smb_query_info { __u32 info_type; __u32 file_info_class; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 823a58550dfd..32dde87feaa9 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1390,15 +1390,27 @@ smb2_ioctl_query_info(const unsigned int xid, smb2_set_next_command(tcon, &rqst[0]); /* Query */ - memset(&qi_iov, 0, sizeof(qi_iov)); - rqst[1].rq_iov = qi_iov; - rqst[1].rq_nvec = 1; + if (qi.flags & PASSTHRU_FSCTL) { + /* Can eventually relax perm check since server enforces too */ + if (!capable(CAP_SYS_ADMIN)) + rc = -EPERM; + else /* TBD: Add code to compound FSCTL */ + rc = -EOPNOTSUPP; + } else if (qi.flags == PASSTHRU_QUERY_INFO) { + memset(&qi_iov, 0, sizeof(qi_iov)); + rqst[1].rq_iov = qi_iov; + rqst[1].rq_nvec = 1; - rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID, COMPOUND_FID, - qi.file_info_class, qi.info_type, - qi.additional_information, + rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID, + COMPOUND_FID, qi.file_info_class, + qi.info_type, qi.additional_information, qi.input_buffer_length, qi.output_buffer_length, buffer); + } else { /* unknown flags */ + cifs_dbg(VFS, "invalid passthru query flags: 0x%x\n", qi.flags); + rc = -EINVAL; + } + if (rc) goto iqinf_exit; smb2_set_next_command(tcon, &rqst[1]); From 
d44d13723b0994bc61e0f2d3efeb17856128673b Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 13 Mar 2019 16:48:33 -0500 Subject: [PATCH 516/855] cifs: remove unused value pointed out by Coverity Detected by CoverityScan CID#1438719 ("Unused Value") buf is reset again before being used so these two lines of code are useless. Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg --- fs/cifs/connect.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index b95db2b593cb..a8e9738db691 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1191,10 +1191,6 @@ next_pdu: continue; } - if (server->large_buf) - buf = server->bigbuf; - - server->lstrp = jiffies; for (i = 0; i < num_mids; i++) { From 6552580286e5fdcde50206dd0263b63ab87b64fe Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 13 Mar 2019 18:21:38 -0500 Subject: [PATCH 517/855] cifs: minor documentation updates Also updated a comment describing use of the GlobalMid_Lock Signed-off-by: Steve French --- Documentation/filesystems/cifs/TODO | 3 ++- Documentation/filesystems/cifs/cifs.txt | 34 ++++++++++++++++--------- fs/cifs/cifsglob.h | 1 + 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/Documentation/filesystems/cifs/TODO b/Documentation/filesystems/cifs/TODO index 66b3f54aa6dc..9267f3fb131f 100644 --- a/Documentation/filesystems/cifs/TODO +++ b/Documentation/filesystems/cifs/TODO @@ -111,7 +111,8 @@ negotiated size) and send larger write sizes to modern servers. 5) Continue to extend the smb3 "buildbot" which does automated xfstesting against Windows, Samba and Azure currently - to add additional tests and -to allow the buildbot to execute the tests faster. +to allow the buildbot to execute the tests faster. The URL for the +buildbot is: http://smb3-test-rhel-75.southcentralus.cloudapp.azure.com 6) Address various coverity warnings (most are not bugs per-se, but the more warnings are addressed, the easier it is to spot real diff --git a/Documentation/filesystems/cifs/cifs.txt b/Documentation/filesystems/cifs/cifs.txt index 67756607246e..1be3d21c286e 100644 --- a/Documentation/filesystems/cifs/cifs.txt +++ b/Documentation/filesystems/cifs/cifs.txt @@ -1,16 +1,21 @@ This is the client VFS module for the SMB3 NAS protocol as well - older dialects such as the Common Internet File System (CIFS) + as for older dialects such as the Common Internet File System (CIFS) protocol which was the successor to the Server Message Block (SMB) protocol, the native file sharing mechanism for most early PC operating systems. New and improved versions of CIFS are now - called SMB2 and SMB3. These dialects are also supported by the - CIFS VFS module. CIFS is fully supported by network - file servers such as Windows 2000, 2003, 2008, 2012 and 2016 - as well by Samba (which provides excellent CIFS - server support for Linux and many other operating systems), Apple - systems, as well as most Network Attached Storage vendors, so - this network filesystem client can mount to a wide variety of - servers. + called SMB2 and SMB3. Use of SMB3 (and later, including SMB3.1.1) + is strongly preferred over using older dialects like CIFS due to + security reaasons. All modern dialects, including the most recent, + SMB3.1.1 are supported by the CIFS VFS module. 
The SMB3 protocol + is implemented and supported by all major file servers + such as all modern versions of Windows (including Windows 2016 + Server), as well as by Samba (which provides excellent + CIFS/SMB2/SMB3 server support and tools for Linux and many other + operating systems). Apple systems also support SMB3 well, as + do most Network Attached Storage vendors, so this network + filesystem client can mount to a wide variety of systems. + It also supports mounting to the cloud (for example + Microsoft Azure), including the necessary security features. The intent of this module is to provide the most advanced network file system function for SMB3 compliant servers, including advanced @@ -24,12 +29,17 @@ cluster file systems for fileserving in some Linux to Linux environments, not just in Linux to Windows (or Linux to Mac) environments. - This filesystem has an mount utility (mount.cifs) that can be obtained from + This filesystem has a mount utility (mount.cifs) and various user space + tools (including smbinfo and setcifsacl) that can be obtained from - https://ftp.samba.org/pub/linux-cifs/cifs-utils/ + https://git.samba.org/?p=cifs-utils.git + or + git://git.samba.org/cifs-utils.git - It must be installed in the directory with the other mount helpers. + mount.cifs should be installed in the directory with the other mount helpers. For more information on the module see the project wiki page at + https://wiki.samba.org/index.php/LinuxCIFS + and https://wiki.samba.org/index.php/LinuxCIFS_utils diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 8b9ecf8c2986..c4f0f4e4bc6d 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1738,6 +1738,7 @@ require use of the stronger protocol */ * GlobalMid_Lock protects: * list operations on pending_mid_q and oplockQ * updates to XID counters, multiplex id and SMB sequence numbers + * list operations on global DnotifyReqList * tcp_ses_lock protects: * list operations on tcp and SMB session lists * tcon->open_file_lock protects the list of open files hanging off the tcon From c847dccfbdc198671e80cd81891ff7a255606aea Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Thu, 14 Mar 2019 00:29:17 -0500 Subject: [PATCH 518/855] CIFS: make mknod() an smb_version_op This cleanup removes cifs specific code from SMB2/SMB3 code paths which is cleaner and easier to maintain as the code to handle special files is improved. Below is an example creating special files using 'sfu' mount option over SMB3 to Windows (with this patch) (Note that to Samba server, support for saving dos attributes has to be enabled for the SFU mount option to work). In the future this will also make implementation of creating special files as reparse points easier (as Windows NFS server does for example). 
root@smf-Thinkpad-P51:~# stat -c "%F" /mnt2/char character special file root@smf-Thinkpad-P51:~# stat -c "%F" /mnt2/block block special file Signed-off-by: Aurelien Aptel Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg --- fs/cifs/cifsglob.h | 8 +++ fs/cifs/dir.c | 107 ++------------------------------------ fs/cifs/smb1ops.c | 126 +++++++++++++++++++++++++++++++++++++++++++++ fs/cifs/smb2ops.c | 102 ++++++++++++++++++++++++++++++++++++ 4 files changed, 239 insertions(+), 104 deletions(-) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index c4f0f4e4bc6d..38feae812b47 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -479,6 +479,14 @@ struct smb_version_operations { struct cifs_tcon *tcon, __le16 *path, int is_dir, unsigned long p); + /* make unix special files (block, char, fifo, socket) */ + int (*make_node)(unsigned int xid, + struct inode *inode, + struct dentry *dentry, + struct cifs_tcon *tcon, + char *full_path, + umode_t mode, + dev_t device_number); }; struct smb_version_values { diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 907e85d65bb4..f26a48dd2e39 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -621,20 +621,10 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, { int rc = -EPERM; unsigned int xid; - int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL; struct cifs_sb_info *cifs_sb; struct tcon_link *tlink; struct cifs_tcon *tcon; - struct cifs_io_parms io_parms; char *full_path = NULL; - struct inode *newinode = NULL; - __u32 oplock = 0; - struct cifs_fid fid; - struct cifs_open_parms oparms; - FILE_ALL_INFO *buf = NULL; - unsigned int bytes_written; - struct win_dev *pdev; - struct kvec iov[2]; if (!old_valid_dev(device_number)) return -EINVAL; @@ -654,103 +644,12 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, goto mknod_out; } - if (tcon->unix_ext) { - struct cifs_unix_set_info_args args = { - .mode = mode & ~current_umask(), - .ctime = NO_CHANGE_64, - .atime = NO_CHANGE_64, - .mtime = NO_CHANGE_64, - .device = device_number, - }; - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { - args.uid = current_fsuid(); - args.gid = current_fsgid(); - } else { - args.uid = INVALID_UID; /* no change */ - args.gid = INVALID_GID; /* no change */ - } - rc = CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, - cifs_sb->local_nls, - cifs_remap(cifs_sb)); - if (rc) - goto mknod_out; - - rc = cifs_get_inode_info_unix(&newinode, full_path, - inode->i_sb, xid); - - if (rc == 0) - d_instantiate(direntry, newinode); - goto mknod_out; - } - - if (!S_ISCHR(mode) && !S_ISBLK(mode)) - goto mknod_out; - - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) - goto mknod_out; - - - cifs_dbg(FYI, "sfu compat create special file\n"); - - buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); - if (buf == NULL) { - rc = -ENOMEM; - goto mknod_out; - } - - if (backup_cred(cifs_sb)) - create_options |= CREATE_OPEN_BACKUP_INTENT; - - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = GENERIC_WRITE; - oparms.create_options = create_options; - oparms.disposition = FILE_CREATE; - oparms.path = full_path; - oparms.fid = &fid; - oparms.reconnect = false; - - if (tcon->ses->server->oplocks) - oplock = REQ_OPLOCK; - else - oplock = 0; - rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf); - if (rc) - goto mknod_out; - - /* - * BB Do not bother to decode buf since no local inode yet to put - * timestamps in, but we can reuse it safely. 
- */ - - pdev = (struct win_dev *)buf; - io_parms.pid = current->tgid; - io_parms.tcon = tcon; - io_parms.offset = 0; - io_parms.length = sizeof(struct win_dev); - iov[1].iov_base = buf; - iov[1].iov_len = sizeof(struct win_dev); - if (S_ISCHR(mode)) { - memcpy(pdev->type, "IntxCHR", 8); - pdev->major = cpu_to_le64(MAJOR(device_number)); - pdev->minor = cpu_to_le64(MINOR(device_number)); - rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, - &bytes_written, iov, 1); - } else if (S_ISBLK(mode)) { - memcpy(pdev->type, "IntxBLK", 8); - pdev->major = cpu_to_le64(MAJOR(device_number)); - pdev->minor = cpu_to_le64(MINOR(device_number)); - rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, - &bytes_written, iov, 1); - } - tcon->ses->server->ops->close(xid, tcon, &fid); - d_drop(direntry); - - /* FIXME: add code here to set EAs */ + rc = tcon->ses->server->ops->make_node(xid, inode, direntry, tcon, + full_path, mode, + device_number); mknod_out: kfree(full_path); - kfree(buf); free_xid(xid); cifs_put_tlink(tlink); return rc; diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index f0ce27c3c6e4..c711f1f39bf2 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -1027,6 +1027,131 @@ cifs_can_echo(struct TCP_Server_Info *server) return false; } +static int +cifs_make_node(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + char *full_path, umode_t mode, dev_t dev) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct inode *newinode = NULL; + int rc = -EPERM; + int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL; + FILE_ALL_INFO *buf = NULL; + struct cifs_io_parms io_parms; + __u32 oplock = 0; + struct cifs_fid fid; + struct cifs_open_parms oparms; + unsigned int bytes_written; + struct win_dev *pdev; + struct kvec iov[2]; + + if (tcon->unix_ext) { + /* + * SMB1 Unix Extensions: requires server support but + * works with all special files + */ + struct cifs_unix_set_info_args args = { + .mode = mode & ~current_umask(), + .ctime = NO_CHANGE_64, + .atime = NO_CHANGE_64, + .mtime = NO_CHANGE_64, + .device = dev, + }; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { + args.uid = current_fsuid(); + args.gid = current_fsgid(); + } else { + args.uid = INVALID_UID; /* no change */ + args.gid = INVALID_GID; /* no change */ + } + rc = CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); + if (rc) + goto out; + + rc = cifs_get_inode_info_unix(&newinode, full_path, + inode->i_sb, xid); + + if (rc == 0) + d_instantiate(dentry, newinode); + goto out; + } + + /* + * SMB1 SFU emulation: should work with all servers, but only + * support block and char device (no socket & fifo) + */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) + goto out; + + if (!S_ISCHR(mode) && !S_ISBLK(mode)) + goto out; + + cifs_dbg(FYI, "sfu compat create special file\n"); + + buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); + if (buf == NULL) { + rc = -ENOMEM; + goto out; + } + + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = GENERIC_WRITE; + oparms.create_options = create_options; + oparms.disposition = FILE_CREATE; + oparms.path = full_path; + oparms.fid = &fid; + oparms.reconnect = false; + + if (tcon->ses->server->oplocks) + oplock = REQ_OPLOCK; + else + oplock = 0; + rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf); + if (rc) + goto out; + + /* + * BB Do not bother to decode buf 
since no local inode yet to put + * timestamps in, but we can reuse it safely. + */ + + pdev = (struct win_dev *)buf; + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.offset = 0; + io_parms.length = sizeof(struct win_dev); + iov[1].iov_base = buf; + iov[1].iov_len = sizeof(struct win_dev); + if (S_ISCHR(mode)) { + memcpy(pdev->type, "IntxCHR", 8); + pdev->major = cpu_to_le64(MAJOR(dev)); + pdev->minor = cpu_to_le64(MINOR(dev)); + rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, + &bytes_written, iov, 1); + } else if (S_ISBLK(mode)) { + memcpy(pdev->type, "IntxBLK", 8); + pdev->major = cpu_to_le64(MAJOR(dev)); + pdev->minor = cpu_to_le64(MINOR(dev)); + rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, + &bytes_written, iov, 1); + } + tcon->ses->server->ops->close(xid, tcon, &fid); + d_drop(dentry); + + /* FIXME: add code here to set EAs */ +out: + kfree(buf); + return rc; +} + + + struct smb_version_operations smb1_operations = { .send_cancel = send_nt_cancel, .compare_fids = cifs_compare_fids, @@ -1110,6 +1235,7 @@ struct smb_version_operations smb1_operations = { .get_acl_by_fid = get_cifs_acl_by_fid, .set_acl = set_cifs_acl, #endif /* CIFS_ACL */ + .make_node = cifs_make_node, }; struct smb_version_values smb1_values = { diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 32dde87feaa9..9a7164c2ea63 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -3767,6 +3767,104 @@ smb2_next_header(char *buf) return le32_to_cpu(hdr->NextCommand); } +static int +smb2_make_node(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + char *full_path, umode_t mode, dev_t dev) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + int rc = -EPERM; + int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL; + FILE_ALL_INFO *buf = NULL; + struct cifs_io_parms io_parms; + __u32 oplock = 0; + struct cifs_fid fid; + struct cifs_open_parms oparms; + unsigned int bytes_written; + struct win_dev *pdev; + struct kvec iov[2]; + + /* + * Check if mounted with mount parm 'sfu' mount parm. + * SFU emulation should work with all servers, but only + * supports block and char device (no socket & fifo), + * and was used by default in earlier versions of Windows + */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) + goto out; + + /* + * TODO: Add ability to create instead via reparse point. Windows (e.g. + * their current NFS server) uses this approach to expose special files + * over SMB2/SMB3 and Samba will do this with SMB3.1.1 POSIX Extensions + */ + + if (!S_ISCHR(mode) && !S_ISBLK(mode)) + goto out; + + cifs_dbg(FYI, "sfu compat create special file\n"); + + buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); + if (buf == NULL) { + rc = -ENOMEM; + goto out; + } + + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = GENERIC_WRITE; + oparms.create_options = create_options; + oparms.disposition = FILE_CREATE; + oparms.path = full_path; + oparms.fid = &fid; + oparms.reconnect = false; + + if (tcon->ses->server->oplocks) + oplock = REQ_OPLOCK; + else + oplock = 0; + rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf); + if (rc) + goto out; + + /* + * BB Do not bother to decode buf since no local inode yet to put + * timestamps in, but we can reuse it safely. 
+ */ + + pdev = (struct win_dev *)buf; + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.offset = 0; + io_parms.length = sizeof(struct win_dev); + iov[1].iov_base = buf; + iov[1].iov_len = sizeof(struct win_dev); + if (S_ISCHR(mode)) { + memcpy(pdev->type, "IntxCHR", 8); + pdev->major = cpu_to_le64(MAJOR(dev)); + pdev->minor = cpu_to_le64(MINOR(dev)); + rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, + &bytes_written, iov, 1); + } else if (S_ISBLK(mode)) { + memcpy(pdev->type, "IntxBLK", 8); + pdev->major = cpu_to_le64(MAJOR(dev)); + pdev->minor = cpu_to_le64(MINOR(dev)); + rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, + &bytes_written, iov, 1); + } + tcon->ses->server->ops->close(xid, tcon, &fid); + d_drop(dentry); + + /* FIXME: add code here to set EAs */ +out: + kfree(buf); + return rc; +} + + struct smb_version_operations smb20_operations = { .compare_fids = smb2_compare_fids, .setup_request = smb2_setup_request, @@ -3861,6 +3959,7 @@ struct smb_version_operations smb20_operations = { #endif /* CIFS_ACL */ .next_header = smb2_next_header, .ioctl_query_info = smb2_ioctl_query_info, + .make_node = smb2_make_node, }; struct smb_version_operations smb21_operations = { @@ -3959,6 +4058,7 @@ struct smb_version_operations smb21_operations = { #endif /* CIFS_ACL */ .next_header = smb2_next_header, .ioctl_query_info = smb2_ioctl_query_info, + .make_node = smb2_make_node, }; struct smb_version_operations smb30_operations = { @@ -4066,6 +4166,7 @@ struct smb_version_operations smb30_operations = { #endif /* CIFS_ACL */ .next_header = smb2_next_header, .ioctl_query_info = smb2_ioctl_query_info, + .make_node = smb2_make_node, }; struct smb_version_operations smb311_operations = { @@ -4174,6 +4275,7 @@ struct smb_version_operations smb311_operations = { #endif /* CIFS_ACL */ .next_header = smb2_next_header, .ioctl_query_info = smb2_ioctl_query_info, + .make_node = smb2_make_node, }; struct smb_version_values smb20_values = { From dd0ac2d24bf0b39c0f4f17934a0e1dde2b3a2840 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 14 Mar 2019 01:56:34 -0500 Subject: [PATCH 519/855] smb2: fix typo in definition of a few error flags As Sergey Senozhatsky pointed out __constant_cpu_to_le32() is misspelled in a few definitions in the list of status codes smb2status.h as __constanst_cpu_to_le32() Signed-off-by: Steve French CC: Sergey Senozhatsky --- fs/cifs/smb2status.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/cifs/smb2status.h b/fs/cifs/smb2status.h index 3d5f62150de4..447c0c6e4c64 100644 --- a/fs/cifs/smb2status.h +++ b/fs/cifs/smb2status.h @@ -30,9 +30,9 @@ */ #define STATUS_SEVERITY_SUCCESS __constant_cpu_to_le32(0x0000) -#define STATUS_SEVERITY_INFORMATIONAL __constanst_cpu_to_le32(0x0001) -#define STATUS_SEVERITY_WARNING __constanst_cpu_to_le32(0x0002) -#define STATUS_SEVERITY_ERROR __constanst_cpu_to_le32(0x0003) +#define STATUS_SEVERITY_INFORMATIONAL cpu_to_le32(0x0001) +#define STATUS_SEVERITY_WARNING cpu_to_le32(0x0002) +#define STATUS_SEVERITY_ERROR cpu_to_le32(0x0003) struct ntstatus { /* Facility is the high 12 bits of the following field */ From f16994797ea89e572b27f41c554aeac6b1c16048 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Fri, 15 Mar 2019 00:08:48 +1000 Subject: [PATCH 520/855] cifs: fix incorrect handling of smb2_set_sparse() return in smb3_simple_falloc smb2_set_sparse does not return -errno, it returns a boolean where true means success. 
Change this to just ignore the return value just like the other callsites. Additionally add code to handle the case where we must set the file sparse and possibly also extending it. Fixes xfstests: generic/236 generic/350 generic/420 Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/smb2ops.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 9a7164c2ea63..1c8d3684bb8b 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -2718,6 +2718,7 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, struct cifsFileInfo *cfile = file->private_data; long rc = -EOPNOTSUPP; unsigned int xid; + __le64 eof; xid = get_xid(); @@ -2777,9 +2778,18 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, return rc; } - rc = smb2_set_sparse(xid, tcon, cfile, inode, false); + smb2_set_sparse(xid, tcon, cfile, inode, false); + rc = 0; + } else { + smb2_set_sparse(xid, tcon, cfile, inode, false); + rc = 0; + if (i_size_read(inode) < off + len) { + eof = cpu_to_le64(off + len); + rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, cfile->pid, + &eof); + } } - /* BB: else ... in future add code to extend file and set sparse */ if (rc) trace_smb3_falloc_err(xid, cfile->fid.persistent_fid, tcon->tid, From f5778c398713692a16150ae96e5c8270bab8399f Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Fri, 15 Mar 2019 09:07:22 +1000 Subject: [PATCH 521/855] SMB3: Allow SMB3 FSCTL queries to be sent to server from tools For debugging purposes we often have to be able to query additional information only available via SMB3 FSCTL from the server from user space tools (e.g. like cifs-utils's smbinfo). See MS-FSCC and MS-SMB2 protocol specifications for more details. 
Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/smb2ops.c | 62 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 16 deletions(-) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 1c8d3684bb8b..1022a3771e14 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1330,7 +1330,8 @@ smb2_ioctl_query_info(const unsigned int xid, struct smb_query_info __user *pqi; int rc = 0; int flags = 0; - struct smb2_query_info_rsp *rsp = NULL; + struct smb2_query_info_rsp *qi_rsp = NULL; + struct smb2_ioctl_rsp *io_rsp = NULL; void *buffer = NULL; struct smb_rqst rqst[3]; int resp_buftype[3]; @@ -1340,6 +1341,7 @@ smb2_ioctl_query_info(const unsigned int xid, u8 oplock = SMB2_OPLOCK_LEVEL_NONE; struct cifs_fid fid; struct kvec qi_iov[1]; + struct kvec io_iov[SMB2_IOCTL_IOV_SIZE]; struct kvec close_iov[1]; memset(rqst, 0, sizeof(rqst)); @@ -1394,8 +1396,16 @@ smb2_ioctl_query_info(const unsigned int xid, /* Can eventually relax perm check since server enforces too */ if (!capable(CAP_SYS_ADMIN)) rc = -EPERM; - else /* TBD: Add code to compound FSCTL */ - rc = -EOPNOTSUPP; + else { + memset(&io_iov, 0, sizeof(io_iov)); + rqst[1].rq_iov = io_iov; + rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE; + + rc = SMB2_ioctl_init(tcon, &rqst[1], + COMPOUND_FID, COMPOUND_FID, + qi.info_type, true, NULL, + 0); + } } else if (qi.flags == PASSTHRU_QUERY_INFO) { memset(&qi_iov, 0, sizeof(qi_iov)); rqst[1].rq_iov = qi_iov; @@ -1430,24 +1440,44 @@ smb2_ioctl_query_info(const unsigned int xid, resp_buftype, rsp_iov); if (rc) goto iqinf_exit; - pqi = (struct smb_query_info __user *)arg; - rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base; - if (le32_to_cpu(rsp->OutputBufferLength) < qi.input_buffer_length) - qi.input_buffer_length = le32_to_cpu(rsp->OutputBufferLength); - if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length, - sizeof(qi.input_buffer_length))) { - rc = -EFAULT; - goto iqinf_exit; - } - if (copy_to_user(pqi + 1, rsp->Buffer, qi.input_buffer_length)) { - rc = -EFAULT; - goto iqinf_exit; + if (qi.flags & PASSTHRU_FSCTL) { + pqi = (struct smb_query_info __user *)arg; + io_rsp = (struct smb2_ioctl_rsp *)rsp_iov[1].iov_base; + if (le32_to_cpu(io_rsp->OutputCount) < qi.input_buffer_length) + qi.input_buffer_length = le32_to_cpu(io_rsp->OutputCount); + if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length, + sizeof(qi.input_buffer_length))) { + rc = -EFAULT; + goto iqinf_exit; + } + if (copy_to_user(pqi + 1, &io_rsp[1], qi.input_buffer_length)) { + rc = -EFAULT; + goto iqinf_exit; + } + } else { + pqi = (struct smb_query_info __user *)arg; + qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base; + if (le32_to_cpu(qi_rsp->OutputBufferLength) < qi.input_buffer_length) + qi.input_buffer_length = le32_to_cpu(qi_rsp->OutputBufferLength); + if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length, + sizeof(qi.input_buffer_length))) { + rc = -EFAULT; + goto iqinf_exit; + } + if (copy_to_user(pqi + 1, qi_rsp->Buffer, qi.input_buffer_length)) { + rc = -EFAULT; + goto iqinf_exit; + } } iqinf_exit: kfree(buffer); SMB2_open_free(&rqst[0]); - SMB2_query_info_free(&rqst[1]); + if (qi.flags & PASSTHRU_FSCTL) + SMB2_ioctl_free(&rqst[1]); + else + SMB2_query_info_free(&rqst[1]); + SMB2_close_free(&rqst[2]); free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); From bc31d0cdcfbadb6258b45db97e93b1c83822ba33 Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Thu, 14 Mar 2019 18:44:16 
+0100 Subject: [PATCH 522/855] CIFS: fix POSIX lock leak and invalid ptr deref We have a customer reporting crashes in lock_get_status() with many "Leaked POSIX lock" messages preceeding the crash. Leaked POSIX lock on dev=0x0:0x56 ... Leaked POSIX lock on dev=0x0:0x56 ... Leaked POSIX lock on dev=0x0:0x56 ... Leaked POSIX lock on dev=0x0:0x53 ... Leaked POSIX lock on dev=0x0:0x53 ... Leaked POSIX lock on dev=0x0:0x53 ... Leaked POSIX lock on dev=0x0:0x53 ... POSIX: fl_owner=ffff8900e7b79380 fl_flags=0x1 fl_type=0x1 fl_pid=20709 Leaked POSIX lock on dev=0x0:0x4b ino... Leaked locks on dev=0x0:0x4b ino=0xf911400000029: POSIX: fl_owner=ffff89f41c870e00 fl_flags=0x1 fl_type=0x1 fl_pid=19592 stack segment: 0000 [#1] SMP Modules linked in: binfmt_misc msr tcp_diag udp_diag inet_diag unix_diag af_packet_diag netlink_diag rpcsec_gss_krb5 arc4 ecb auth_rpcgss nfsv4 md4 nfs nls_utf8 lockd grace cifs sunrpc ccm dns_resolver fscache af_packet iscsi_ibft iscsi_boot_sysfs vmw_vsock_vmci_transport vsock xfs libcrc32c sb_edac edac_core crct10dif_pclmul crc32_pclmul ghash_clmulni_intel drbg ansi_cprng vmw_balloon aesni_intel aes_x86_64 lrw gf128mul glue_helper ablk_helper cryptd joydev pcspkr vmxnet3 i2c_piix4 vmw_vmci shpchp fjes processor button ac btrfs xor raid6_pq sr_mod cdrom ata_generic sd_mod ata_piix vmwgfx crc32c_intel drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm serio_raw ahci libahci drm libata vmw_pvscsi sg dm_multipath dm_mod scsi_dh_rdac scsi_dh_emc scsi_dh_alua scsi_mod autofs4 Supported: Yes CPU: 6 PID: 28250 Comm: lsof Not tainted 4.4.156-94.64-default #1 Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 04/05/2016 task: ffff88a345f28740 ti: ffff88c74005c000 task.ti: ffff88c74005c000 RIP: 0010:[] [] lock_get_status+0x9b/0x3b0 RSP: 0018:ffff88c74005fd90 EFLAGS: 00010202 RAX: ffff89bde83e20ae RBX: ffff89e870003d18 RCX: 0000000049534f50 RDX: ffffffff81a3541f RSI: ffffffff81a3544e RDI: ffff89bde83e20ae RBP: 0026252423222120 R08: 0000000020584953 R09: 000000000000ffff R10: 0000000000000000 R11: ffff88c74005fc70 R12: ffff89e5ca7b1340 R13: 00000000000050e5 R14: ffff89e870003d30 R15: ffff89e5ca7b1340 FS: 00007fafd64be800(0000) GS:ffff89f41fd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000001c80018 CR3: 000000a522048000 CR4: 0000000000360670 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Stack: 0000000000000208 ffffffff81a3d6b6 ffff89e870003d30 ffff89e870003d18 ffff89e5ca7b1340 ffff89f41738d7c0 ffff89e870003d30 ffff89e5ca7b1340 ffffffff8125e08f 0000000000000000 ffff89bc22b67d00 ffff88c74005ff28 Call Trace: [] locks_show+0x2f/0x70 [] seq_read+0x251/0x3a0 [] proc_reg_read+0x3c/0x70 [] __vfs_read+0x26/0x140 [] vfs_read+0x7a/0x120 [] SyS_read+0x42/0xa0 [] entry_SYSCALL_64_fastpath+0x1e/0xb7 When Linux closes a FD (close(), close-on-exec, dup2(), ...) it calls filp_close() which also removes all posix locks. The lock struct is initialized like so in filp_close() and passed down to cifs ... lock.fl_type = F_UNLCK; lock.fl_flags = FL_POSIX | FL_CLOSE; lock.fl_start = 0; lock.fl_end = OFFSET_MAX; ... Note the FL_CLOSE flag, which hints the VFS code that this unlocking is done for closing the fd. 
filp_close() locks_remove_posix(filp, id); vfs_lock_file(filp, F_SETLK, &lock, NULL); return filp->f_op->lock(filp, cmd, fl) => cifs_lock() rc = cifs_setlk(file, flock, type, wait_flag, posix_lck, lock, unlock, xid); rc = server->ops->mand_unlock_range(cfile, flock, xid); if (flock->fl_flags & FL_POSIX && !rc) rc = locks_lock_file_wait(file, flock) Notice how we don't call locks_lock_file_wait() which does the generic VFS lock/unlock/wait work on the inode if rc != 0. If we are closing the handle, the SMB server is supposed to remove any locks associated with it. Similarly, cifs.ko frees and wakes up any lock and lock waiter when closing the file: cifs_close() cifsFileInfo_put(file->private_data) /* * Delete any outstanding lock records. We'll lose them when the file * is closed anyway. */ down_write(&cifsi->lock_sem); list_for_each_entry_safe(li, tmp, &cifs_file->llist->locks, llist) { list_del(&li->llist); cifs_del_lock_waiters(li); kfree(li); } list_del(&cifs_file->llist->llist); kfree(cifs_file->llist); up_write(&cifsi->lock_sem); So we can safely ignore unlocking failures in cifs_lock() if they happen with the FL_CLOSE flag hint set as both the server and the client take care of it during the actual closing. This is not a proper fix for the unlocking failure but it's safe and it seems to prevent the lock leakages and crashes the customer experiences. Signed-off-by: Aurelien Aptel Signed-off-by: NeilBrown Signed-off-by: Steve French Acked-by: Pavel Shilovsky --- fs/cifs/file.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 4c144c1f50eb..2a6d20c0ce02 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1645,8 +1645,20 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, rc = server->ops->mand_unlock_range(cfile, flock, xid); out: - if (flock->fl_flags & FL_POSIX && !rc) + if (flock->fl_flags & FL_POSIX) { + /* + * If this is a request to remove all locks because we + * are closing the file, it doesn't matter if the + * unlocking failed as both cifs.ko and the SMB server + * remove the lock on file close + */ + if (rc) { + cifs_dbg(VFS, "%s failed rc=%d\n", __func__, rc); + if (!(flock->fl_flags & FL_CLOSE)) + return rc; + } rc = locks_lock_file_wait(file, flock); + } return rc; } From fa30dde38aa8628c73a6dded7cb0bba38c27b576 Mon Sep 17 00:00:00 2001 From: Jiufei Xue Date: Thu, 14 Mar 2019 23:19:22 -0400 Subject: [PATCH 523/855] ext4: fix NULL pointer dereference while journal is aborted We see the following NULL pointer dereference while running xfstests generic/475: BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 PGD 8000000c84bad067 P4D 8000000c84bad067 PUD c84e62067 PMD 0 Oops: 0000 [#1] SMP PTI CPU: 7 PID: 9886 Comm: fsstress Kdump: loaded Not tainted 5.0.0-rc8 #10 RIP: 0010:ext4_do_update_inode+0x4ec/0x760 ... Call Trace: ? jbd2_journal_get_write_access+0x42/0x50 ? __ext4_journal_get_write_access+0x2c/0x70 ? ext4_truncate+0x186/0x3f0 ext4_mark_iloc_dirty+0x61/0x80 ext4_mark_inode_dirty+0x62/0x1b0 ext4_truncate+0x186/0x3f0 ? unmap_mapping_pages+0x56/0x100 ext4_setattr+0x817/0x8b0 notify_change+0x1df/0x430 do_truncate+0x5e/0x90 ? generic_permission+0x12b/0x1a0 This is triggered because the NULL pointer handle->h_transaction was dereferenced in function ext4_update_inode_fsync_trans(). I found that the h_transaction was set to NULL in jbd2__journal_restart but failed to attached to a new transaction while the journal is aborted. 
Fix this by checking the handle before updating the inode. Fixes: b436b9bef84d ("ext4: Wait for proper transaction commit on fsync") Signed-off-by: Jiufei Xue Signed-off-by: Theodore Ts'o Reviewed-by: Joseph Qi Cc: stable@kernel.org --- fs/ext4/ext4_jbd2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 15b6dd733780..df908ef79cce 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -384,7 +384,7 @@ static inline void ext4_update_inode_fsync_trans(handle_t *handle, { struct ext4_inode_info *ei = EXT4_I(inode); - if (ext4_handle_valid(handle)) { + if (ext4_handle_valid(handle) && !is_handle_aborted(handle)) { ei->i_sync_tid = handle->h_transaction->t_tid; if (datasync) ei->i_datasync_tid = handle->h_transaction->t_tid; From 372a03e01853f860560eade508794dd274e9b390 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Thu, 14 Mar 2019 23:20:25 -0400 Subject: [PATCH 524/855] ext4: fix data corruption caused by unaligned direct AIO Ext4 needs to serialize unaligned direct AIO because the zeroing of partial blocks of two competing unaligned AIOs can result in data corruption. However it decides not to serialize if the potentially unaligned aio is past i_size with the rationale that no pending writes are possible past i_size. Unfortunately if the i_size is not block aligned and the second unaligned write lands past i_size, but still into the same block, it has the potential of corrupting the previous unaligned write to the same block. This is (very simplified) reproducer from Frank // 41472 = (10 * 4096) + 512 // 37376 = 41472 - 4096 ftruncate(fd, 41472); io_prep_pwrite(iocbs[0], fd, buf[0], 4096, 37376); io_prep_pwrite(iocbs[1], fd, buf[1], 4096, 41472); io_submit(io_ctx, 1, &iocbs[1]); io_submit(io_ctx, 1, &iocbs[2]); io_getevents(io_ctx, 2, 2, events, NULL); Without this patch the 512B range from 40960 up to the start of the second unaligned write (41472) is going to be zeroed overwriting the data written by the first write. This is a data corruption. 00000000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 * 00009200 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 * 0000a000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 * 0000a200 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 With this patch the data corruption is avoided because we will recognize the unaligned_aio and wait for the unwritten extent conversion. 
00000000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 * 00009200 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 * 0000a200 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 * 0000b200 Reported-by: Frank Sorenson Signed-off-by: Lukas Czerner Signed-off-by: Theodore Ts'o Fixes: e9e3bcecf44c ("ext4: serialize unaligned asynchronous DIO") Cc: stable@vger.kernel.org --- fs/ext4/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 69d65d49837b..98ec11f69cd4 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -125,7 +125,7 @@ ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos) struct super_block *sb = inode->i_sb; int blockmask = sb->s_blocksize - 1; - if (pos >= i_size_read(inode)) + if (pos >= ALIGN(i_size_read(inode), sb->s_blocksize)) return 0; if ((pos | iov_iter_alignment(from)) & blockmask) From 1dc1097ff60e4105216da7cd0aa99032b039a994 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 14 Mar 2019 23:46:05 -0400 Subject: [PATCH 525/855] ext4: avoid panic during forced reboot When admin calls "reboot -f" - i.e., does a hard system reboot by directly calling reboot(2) - ext4 filesystem mounted with errors=panic can panic the system. This happens because the underlying device gets disabled without unmounting the filesystem and thus some syscall running in parallel to reboot(2) can result in the filesystem getting IO errors. This is somewhat surprising to the users so try improve the behavior by switching to errors=remount-ro behavior when the system is running reboot(2). Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6e4cac646345..1a5729d8f9ec 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -430,6 +430,12 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) spin_unlock(&sbi->s_md_lock); } +static bool system_going_down(void) +{ + return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF + || system_state == SYSTEM_RESTART; +} + /* Deal with the reporting of failure conditions on a filesystem such as * inconsistencies detected or read IO failures. * @@ -460,7 +466,12 @@ static void ext4_handle_error(struct super_block *sb) if (journal) jbd2_journal_abort(journal, -EIO); } - if (test_opt(sb, ERRORS_RO)) { + /* + * We force ERRORS_RO behavior when system is rebooting. Otherwise we + * could panic during 'reboot -f' as the underlying device got already + * disabled. + */ + if (test_opt(sb, ERRORS_RO) || system_going_down()) { ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); /* * Make sure updated value of ->s_mount_flags will be visible @@ -468,8 +479,7 @@ static void ext4_handle_error(struct super_block *sb) */ smp_wmb(); sb->s_flags |= SB_RDONLY; - } - if (test_opt(sb, ERRORS_PANIC)) { + } else if (test_opt(sb, ERRORS_PANIC)) { if (EXT4_SB(sb)->s_journal && !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR)) return; From 7cf77140777364d77b2b6e392e7e081a205a08c5 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Thu, 14 Mar 2019 23:51:13 -0400 Subject: [PATCH 526/855] ext4: remove useless ext4_pin_inode() This function is never used from the beginning (and is commented out); let's remove it. 
Signed-off-by: Jason Yan Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f84cf62fd290..e5014a6f76e2 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -6082,36 +6082,6 @@ out: return; } -#if 0 -/* - * Bind an inode's backing buffer_head into this transaction, to prevent - * it from being flushed to disk early. Unlike - * ext4_reserve_inode_write, this leaves behind no bh reference and - * returns no iloc structure, so the caller needs to repeat the iloc - * lookup to mark the inode dirty later. - */ -static int ext4_pin_inode(handle_t *handle, struct inode *inode) -{ - struct ext4_iloc iloc; - - int err = 0; - if (handle) { - err = ext4_get_inode_loc(inode, &iloc); - if (!err) { - BUFFER_TRACE(iloc.bh, "get_write_access"); - err = jbd2_journal_get_write_access(handle, iloc.bh); - if (!err) - err = ext4_handle_dirty_metadata(handle, - NULL, - iloc.bh); - brelse(iloc.bh); - } - } - ext4_std_error(inode->i_sb, err); - return err; -} -#endif - int ext4_change_inode_journal_flag(struct inode *inode, int val) { journal_t *journal; From e65ef56db4945fb18a0d522e056c02ddf939e644 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Mar 2019 10:16:44 -0600 Subject: [PATCH 527/855] io_uring: use regular request ref counts Get rid of the special casing of "normal" requests not having any references to the io_kiocb. We initialize the ref count to 2, one for the submission side, and one or the completion side. Signed-off-by: Jens Axboe --- fs/io_uring.c | 54 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5d99376d2369..9071fca118a4 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -411,7 +411,8 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, req->ctx = ctx; req->flags = 0; - refcount_set(&req->refs, 0); + /* one is dropped after submission, the other at completion */ + refcount_set(&req->refs, 2); return req; out: io_ring_drop_ctx_refs(ctx, 1); @@ -429,10 +430,14 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr) static void io_free_req(struct io_kiocb *req) { - if (!refcount_read(&req->refs) || refcount_dec_and_test(&req->refs)) { - io_ring_drop_ctx_refs(req->ctx, 1); - kmem_cache_free(req_cachep, req); - } + io_ring_drop_ctx_refs(req->ctx, 1); + kmem_cache_free(req_cachep, req); +} + +static void io_put_req(struct io_kiocb *req) +{ + if (refcount_dec_and_test(&req->refs)) + io_free_req(req); } /* @@ -453,7 +458,8 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, io_cqring_fill_event(ctx, req->user_data, req->error, 0); - reqs[to_free++] = req; + if (refcount_dec_and_test(&req->refs)) + reqs[to_free++] = req; (*nr_events)++; /* @@ -616,7 +622,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2) io_fput(req); io_cqring_add_event(req->ctx, req->user_data, res, 0); - io_free_req(req); + io_put_req(req); } static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) @@ -1083,7 +1089,7 @@ static int io_nop(struct io_kiocb *req, u64 user_data) io_fput(req); } io_cqring_add_event(ctx, user_data, err, 0); - io_free_req(req); + io_put_req(req); return 0; } @@ -1146,7 +1152,7 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe, io_fput(req); io_cqring_add_event(req->ctx, sqe->user_data, ret, 0); - io_free_req(req); + io_put_req(req); return 0; } @@ 
-1204,7 +1210,7 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe) spin_unlock_irq(&ctx->completion_lock); io_cqring_add_event(req->ctx, sqe->user_data, ret, 0); - io_free_req(req); + io_put_req(req); return 0; } @@ -1212,7 +1218,7 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask) { io_cqring_add_event(req->ctx, req->user_data, mangle_poll(mask), 0); io_fput(req); - io_free_req(req); + io_put_req(req); } static void io_poll_complete_work(struct work_struct *work) @@ -1346,9 +1352,6 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) INIT_LIST_HEAD(&poll->wait.entry); init_waitqueue_func_entry(&poll->wait, io_poll_wake); - /* one for removal from waitqueue, one for this function */ - refcount_set(&req->refs, 2); - mask = vfs_poll(poll->file, &ipt.pt) & poll->events; if (unlikely(!poll->head)) { /* we did not manage to set up a waitqueue, done */ @@ -1380,13 +1383,12 @@ out: * Drop one of our refs to this req, __io_submit_sqe() will * drop the other one since we're returning an error. */ - io_free_req(req); + io_put_req(req); return ipt.error; } if (mask) io_poll_complete(req, mask); - io_free_req(req); return 0; } @@ -1524,10 +1526,13 @@ restart: break; cond_resched(); } while (1); + + /* drop submission reference */ + io_put_req(req); } if (ret) { io_cqring_add_event(ctx, sqe->user_data, ret, 0); - io_free_req(req); + io_put_req(req); } /* async context always use a copy of the sqe */ @@ -1649,11 +1654,22 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, INIT_WORK(&req->work, io_sq_wq_submit_work); queue_work(ctx->sqo_wq, &req->work); } - ret = 0; + + /* + * Queued up for async execution, worker will release + * submit reference when the iocb is actually + * submitted. + */ + return 0; } } + + /* drop submission reference */ + io_put_req(req); + + /* and drop final reference, if we failed */ if (ret) - io_free_req(req); + io_put_req(req); return ret; } From d64264d6218e6892edd832dc3a5a5857c2856c53 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 15 Mar 2019 00:15:32 -0400 Subject: [PATCH 528/855] ext4: add missing brelse() in add_new_gdb_meta_bg() Currently in add_new_gdb_meta_bg() there is a missing brelse of gdb_bh in case ext4_journal_get_write_access() fails. Additionally kvfree() is missing in the same error path. Fix it by moving the ext4_journal_get_write_access() before the ext4 sb update as Ted suggested and release n_group_desc and gdb_bh in case it fails. 
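In sketch form, the reordered error path looks like this (taken from the hunk below); since EXT4_SB(sb)->s_group_desc still points at the old array when the journal call fails, the new array and the extra buffer_head reference can be released without any further cleanup:

	BUFFER_TRACE(gdb_bh, "get_write_access");
	err = ext4_journal_get_write_access(handle, gdb_bh);
	if (err) {
		kvfree(n_group_desc);	/* not yet visible via s_group_desc */
		brelse(gdb_bh);
		return err;
	}
	EXT4_SB(sb)->s_group_desc = n_group_desc;
	EXT4_SB(sb)->s_gdb_count++;
	kvfree(o_group_desc);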
Fixes: 61a9c11e5e7a ("ext4: add missing brelse() add_new_gdb_meta_bg()'s error path") Signed-off-by: Lukas Czerner Signed-off-by: Theodore Ts'o --- fs/ext4/resize.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 3d9b18505c0c..90061c3d048b 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -932,11 +932,18 @@ static int add_new_gdb_meta_bg(struct super_block *sb, memcpy(n_group_desc, o_group_desc, EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); n_group_desc[gdb_num] = gdb_bh; + + BUFFER_TRACE(gdb_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, gdb_bh); + if (err) { + kvfree(n_group_desc); + brelse(gdb_bh); + return err; + } + EXT4_SB(sb)->s_group_desc = n_group_desc; EXT4_SB(sb)->s_gdb_count++; kvfree(o_group_desc); - BUFFER_TRACE(gdb_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, gdb_bh); return err; } From 6c7328400e0488f7d49e19e02290ba343b6811b2 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 15 Mar 2019 00:22:28 -0400 Subject: [PATCH 529/855] ext4: report real fs size after failed resize Currently when the file system resize using ext4_resize_fs() fails it will report into log that "resized filesystem to ". However this may not be true in the case of failure. Use the current block count as returned by ext4_blocks_count() to report the block count. Additionally, report a warning that "error occurred during file system resize" Signed-off-by: Lukas Czerner Signed-off-by: Theodore Ts'o --- fs/ext4/resize.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 90061c3d048b..e7ae26e36c9c 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -2080,6 +2080,10 @@ out: free_flex_gd(flex_gd); if (resize_inode != NULL) iput(resize_inode); - ext4_msg(sb, KERN_INFO, "resized filesystem to %llu", n_blocks_count); + if (err) + ext4_warning(sb, "error (%d) occurred during " + "file system resize", err); + ext4_msg(sb, KERN_INFO, "resized filesystem to %llu", + ext4_blocks_count(es)); return err; } From e0c5c576d5074b5bb7b1b4b59848c25ceb521331 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Mar 2019 10:18:47 -0600 Subject: [PATCH 530/855] io_uring: make io_read/write return an integer The callers all convert to an integer, and we only return 0/-ERROR anyway. 
Signed-off-by: Jens Axboe --- fs/io_uring.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 9071fca118a4..caf39663466f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -893,7 +893,7 @@ static int io_import_iovec(struct io_ring_ctx *ctx, int rw, opcode = READ_ONCE(sqe->opcode); if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { - ssize_t ret = io_import_fixed(ctx, rw, sqe, iter); + int ret = io_import_fixed(ctx, rw, sqe, iter); *iovec = NULL; return ret; } @@ -951,15 +951,15 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len) async_list->io_end = io_end; } -static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s, - bool force_nonblock, struct io_submit_state *state) +static int io_read(struct io_kiocb *req, const struct sqe_submit *s, + bool force_nonblock, struct io_submit_state *state) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw; struct iov_iter iter; struct file *file; size_t iov_count; - ssize_t ret; + int ret; ret = io_prep_rw(req, s, force_nonblock, state); if (ret) @@ -1004,15 +1004,15 @@ out_fput: return ret; } -static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s, - bool force_nonblock, struct io_submit_state *state) +static int io_write(struct io_kiocb *req, const struct sqe_submit *s, + bool force_nonblock, struct io_submit_state *state) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw; struct iov_iter iter; struct file *file; size_t iov_count; - ssize_t ret; + int ret; ret = io_prep_rw(req, s, force_nonblock, state); if (ret) @@ -1396,8 +1396,7 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct sqe_submit *s, bool force_nonblock, struct io_submit_state *state) { - ssize_t ret; - int opcode; + int ret, opcode; if (unlikely(s->index >= ctx->sq_entries)) return -EINVAL; @@ -1623,7 +1622,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, struct io_submit_state *state) { struct io_kiocb *req; - ssize_t ret; + int ret; /* enforce forwards compatibility on users */ if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE)) From d530a402a114efcf6d2b88d7f628856dade5b90b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 13 Mar 2019 12:15:01 -0600 Subject: [PATCH 531/855] io_uring: add prepped flag We currently use the fact that if ->ki_filp is already set, then we've done the prep. In preparation for moving the file assignment earlier, use a separate flag to tell whether the request has been prepped for IO or not. 
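The resulting pattern, sketched below, makes each prep helper idempotent for the -EAGAIN retry case; io_prep_foo() is only a stand-in for the real helpers (io_prep_rw(), io_prep_fsync()) touched in the hunks that follow:

#define REQ_F_PREPPED	16	/* prep already done */

static int io_prep_foo(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	if (req->flags & REQ_F_PREPPED)	/* -EAGAIN retry, already set up */
		return 0;

	/* ... one-time setup of the request ... */

	req->flags |= REQ_F_PREPPED;
	return 0;
}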
Signed-off-by: Jens Axboe --- fs/io_uring.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index caf39663466f..d259e8a6cb2e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -214,6 +214,7 @@ struct io_kiocb { #define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */ #define REQ_F_FIXED_FILE 4 /* ctx owns file */ #define REQ_F_SEQ_PREV 8 /* sequential with previous */ +#define REQ_F_PREPPED 16 /* prep already done */ u64 user_data; u64 error; @@ -741,7 +742,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, int fd, ret; /* For -EAGAIN retry, everything is already prepped */ - if (kiocb->ki_filp) + if (req->flags & REQ_F_PREPPED) return 0; flags = READ_ONCE(sqe->flags); @@ -799,6 +800,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, } kiocb->ki_complete = io_complete_rw; } + req->flags |= REQ_F_PREPPED; return 0; out_fput: if (!(flags & IOSQE_FIXED_FILE)) { @@ -1099,8 +1101,8 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) unsigned flags; int fd; - /* Prep already done */ - if (req->rw.ki_filp) + /* Prep already done (EAGAIN retry) */ + if (req->flags & REQ_F_PREPPED) return 0; if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) @@ -1122,6 +1124,7 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EBADF; } + req->flags |= REQ_F_PREPPED; return 0; } @@ -1632,8 +1635,6 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, if (unlikely(!req)) return -EAGAIN; - req->rw.ki_filp = NULL; - ret = __io_submit_sqe(ctx, req, s, true, state); if (ret == -EAGAIN) { struct io_uring_sqe *sqe_copy; From ede271b059463731cbd6dffe55ffd70d7dbe8392 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Mar 2019 14:01:14 +0100 Subject: [PATCH 532/855] perf/x86/intel: Fix memory corruption Through: validate_event() x86_pmu.get_event_constraints(.idx=-1) tfa_get_event_constraints() dyn_constraint() cpuc->constraint_list[-1] is used, which is an obvious out-of-bound access. In this case, simply skip the TFA constraint code, there is no event constraint with just PMC3, therefore the code will never result in the empty set. Fixes: 400816f60c54 ("perf/x86/intel: Implement support for TSX Force Abort") Reported-by: Tony Jones Reported-by: "DSouza, Nelson" Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Tested-by: Tony Jones Tested-by: "DSouza, Nelson" Cc: eranian@google.com Cc: jolsa@redhat.com Cc: stable@kernel.org Link: https://lkml.kernel.org/r/20190314130705.441549378@infradead.org --- arch/x86/events/intel/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 35102ecdfc8d..92dfeb343a6a 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3410,7 +3410,7 @@ tfa_get_event_constraints(struct cpu_hw_events *cpuc, int idx, /* * Without TFA we must not use PMC3. 
*/ - if (!allow_tsx_force_abort && test_bit(3, c->idxmsk)) { + if (!allow_tsx_force_abort && test_bit(3, c->idxmsk) && idx >= 0) { c = dyn_constraint(cpuc, c, idx); c->idxmsk64 &= ~(1ULL << 3); c->weight--; From f764c58b7faa26f5714e6907f892abc2bc0de4f8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Mar 2019 09:14:10 +0100 Subject: [PATCH 533/855] perf/x86: Fixup typo in stub functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Guenter reported a build warning for CONFIG_CPU_SUP_INTEL=n: > With allmodconfig-CONFIG_CPU_SUP_INTEL, this patch results in: > > In file included from arch/x86/events/amd/core.c:8:0: > arch/x86/events/amd/../perf_event.h:1036:45: warning: ‘struct cpu_hw_event’ declared inside parameter list will not be visible outside of this definition or declaration > static inline int intel_cpuc_prepare(struct cpu_hw_event *cpuc, int cpu) While harmless (an unsed pointer is an unused pointer, no matter the type) it needs fixing. Reported-by: Guenter Roeck Signed-off-by: Peter Zijlstra (Intel) Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Fixes: d01b1f96a82e ("perf/x86/intel: Make cpuc allocations consistent") Link: http://lkml.kernel.org/r/20190315081410.GR5996@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/x86/events/perf_event.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index b04ae6c8775e..a75955741c50 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1033,12 +1033,12 @@ static inline int intel_pmu_init(void) return 0; } -static inline int intel_cpuc_prepare(struct cpu_hw_event *cpuc, int cpu) +static inline int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu) { return 0; } -static inline void intel_cpuc_finish(struct cpu_hw_event *cpuc) +static inline void intel_cpuc_finish(struct cpu_hw_events *cpuc) { } From 0266def913771e718fd0c998eecb072e0685e2c9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 14 Mar 2019 17:02:56 +0100 Subject: [PATCH 534/855] xen/balloon: Fix mapping PG_offline pages to user space The XEN balloon driver - in contrast to other balloon drivers - allows to map some inflated pages to user space. Such pages are allocated via alloc_xenballooned_pages() and freed via free_xenballooned_pages(). The pfn space of these allocated pages is used to map other things by the hypervisor using hypercalls. Pages marked with PG_offline must never be mapped to user space (as this page type uses the mapcount field of struct pages). So what we can do is, clear/set PG_offline when allocating/freeing an inflated pages. This way, most inflated pages can be excluded by dumping tools and the "reused for other purpose" balloon pages are correctly not marked as PG_offline. 
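In sketch form, the pairing added below keeps the flag consistent with how the page is actually being used (PG_offline is encoded in the mapcount/page_type field, so it must be gone before anything may map the page):

	/* alloc_xenballooned_pages(): page leaves the balloon */
	page = balloon_retrieve(true);
	if (page) {
		__ClearPageOffline(page);
		pages[pgno++] = page;
	}

	/* free_xenballooned_pages(): page rejoins the balloon */
	if (pages[i]) {
		__SetPageOffline(pages[i]);
		balloon_append(pages[i]);
	}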
Fixes: 77c4adf6a6df (xen/balloon: mark inflated pages PG_offline) Reported-by: Julien Grall Tested-by: Julien Grall Signed-off-by: David Hildenbrand Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross --- drivers/xen/balloon.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 39b229f9e256..d37dd5bb7a8f 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -604,6 +604,7 @@ int alloc_xenballooned_pages(int nr_pages, struct page **pages) while (pgno < nr_pages) { page = balloon_retrieve(true); if (page) { + __ClearPageOffline(page); pages[pgno++] = page; #ifdef CONFIG_XEN_HAVE_PVMMU /* @@ -645,8 +646,10 @@ void free_xenballooned_pages(int nr_pages, struct page **pages) mutex_lock(&balloon_mutex); for (i = 0; i < nr_pages; i++) { - if (pages[i]) + if (pages[i]) { + __SetPageOffline(pages[i]); balloon_append(pages[i]); + } } balloon_stats.target_unpopulated -= nr_pages; From bb6bccba390c7d743c1e4427de4ef284c8cc6869 Mon Sep 17 00:00:00 2001 From: Aaron Ma Date: Wed, 13 Mar 2019 21:53:24 +0800 Subject: [PATCH 535/855] iommu/amd: Fix NULL dereference bug in match_hid_uid Add a non-NULL check to fix potential NULL pointer dereference Cleanup code to call function once. Signed-off-by: Aaron Ma Fixes: 2bf9a0a12749b ('iommu/amd: Add iommu support for ACPI HID devices') Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 6b0760dafb3e..b319e51c379b 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -140,10 +140,14 @@ static struct lock_class_key reserved_rbtree_key; static inline int match_hid_uid(struct device *dev, struct acpihid_map_entry *entry) { + struct acpi_device *adev = ACPI_COMPANION(dev); const char *hid, *uid; - hid = acpi_device_hid(ACPI_COMPANION(dev)); - uid = acpi_device_uid(ACPI_COMPANION(dev)); + if (!adev) + return -ENODEV; + + hid = acpi_device_hid(adev); + uid = acpi_device_uid(adev); if (!hid || !(*hid)) return -ENODEV; From 9734ad57b0f1a367fd3a00d717f97f8c00d9edb7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 15 Mar 2019 12:47:34 -0400 Subject: [PATCH 536/855] SUNRPC: Fix a client regression when handling oversized replies If the server sends a reply that is larger than the pre-allocated buffer, then the current code may fail to register how much of the stream that it has finished reading. This again can lead to hangs. Fixes: e92053a52e68 ("SUNRPC: Handle zero length fragments correctly") Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 42f45d33dc56..9359539907ba 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -453,7 +453,7 @@ xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags, goto out; if (ret != want) goto out; - } else + } else if (offset < seek_init) offset = seek_init; ret = -EMSGSIZE; out: From 513149607d19bc3821386fb5ac75f8b99fd4b115 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 15 Mar 2019 12:55:59 -0400 Subject: [PATCH 537/855] SUNRPC: Fix the minimal size for reply buffer allocation We must at minimum allocate enough memory to be able to see any auth errors in the reply from the server. 
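As a worked illustration (the au_rslack value is made up; all of these quantities are 32-bit XDR words and the final shift converts them to bytes), even a procedure with p_replen == 0 now gets a non-empty payload area after the verifier, which is what allows the error status in the reply to be decoded at all:

	/* e.g. au_rslack = 2, p_replen = 0 */
	req->rq_rcvsize = RPC_REPHDRSIZE + auth->au_rslack +
			  max_t(size_t, proc->p_replen, 2);
	req->rq_rcvsize <<= 2;		/* XDR words -> bytes */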
Fixes: 2c94b8eca1a26 ("SUNRPC: Use au_rslack when computing reply...") Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 4216fe33204a..310873895578 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1730,7 +1730,12 @@ call_allocate(struct rpc_task *task) req->rq_callsize = RPC_CALLHDRSIZE + (auth->au_cslack << 1) + proc->p_arglen; req->rq_callsize <<= 2; - req->rq_rcvsize = RPC_REPHDRSIZE + auth->au_rslack + proc->p_replen; + /* + * Note: the reply buffer must at minimum allocate enough space + * for the 'struct accepted_reply' from RFC5531. + */ + req->rq_rcvsize = RPC_REPHDRSIZE + auth->au_rslack + \ + max_t(size_t, proc->p_replen, 2); req->rq_rcvsize <<= 2; status = xprt->ops->buf_alloc(task); From 27adc785928ae6b34cdda96f472735b77c91e247 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 15 Mar 2019 08:01:16 -0400 Subject: [PATCH 538/855] SUNRPC: Use the ENOTCONN error on socket disconnect When the socket is closed, we currently send an EAGAIN error to all pending requests in order to ask them to retransmit. Use ENOTCONN instead, to ensure that they try to reconnect before attempting to transmit. This also helps SOFTCONN tasks to behave correctly in this situation. Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index e096c5a725df..d7117d241460 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -664,7 +664,7 @@ void xprt_disconnect_done(struct rpc_xprt *xprt) spin_lock_bh(&xprt->transport_lock); xprt_clear_connected(xprt); xprt_clear_write_space_locked(xprt); - xprt_wake_pending_tasks(xprt, -EAGAIN); + xprt_wake_pending_tasks(xprt, -ENOTCONN); spin_unlock_bh(&xprt->transport_lock); } EXPORT_SYMBOL_GPL(xprt_disconnect_done); From eb90a16e9087063943859ae99bbdddd1fbfcf477 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 15 Mar 2019 09:29:00 -0400 Subject: [PATCH 539/855] SUNRPC: rpc_decode_header() must always return a non-zero value on error Ensure that when the "garbage args" case falls through, we do set an error of EIO. 
Fixes: a0584ee9aed8 ("SUNRPC: Use struct xdr_stream when decoding...") Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 310873895578..1f47801e0002 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2454,7 +2454,7 @@ static noinline int rpc_decode_header(struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_clnt *clnt = task->tk_client; - int error = -EACCES; + int error; __be32 *p; /* RFC-1014 says that the representation of XDR data must be a @@ -2463,7 +2463,7 @@ rpc_decode_header(struct rpc_task *task, struct xdr_stream *xdr) * undefined results */ if (task->tk_rqstp->rq_rcv_buf.len & 3) - goto out_badlen; + goto out_unparsable; p = xdr_inline_decode(xdr, 3 * sizeof(*p)); if (!p) @@ -2498,9 +2498,10 @@ rpc_decode_header(struct rpc_task *task, struct xdr_stream *xdr) goto out_err; case rpc_garbage_args: trace_rpc__garbage_args(task); + error = -EIO; break; default: - trace_rpc__unparsable(task); + goto out_unparsable; } out_garbage: @@ -2514,11 +2515,6 @@ out_err: rpc_exit(task, error); return error; -out_badlen: - trace_rpc__unparsable(task); - error = -EIO; - goto out_err; - out_unparsable: trace_rpc__unparsable(task); error = -EIO; @@ -2529,6 +2525,7 @@ out_verifier: goto out_garbage; out_msg_denied: + error = -EACCES; p = xdr_inline_decode(xdr, sizeof(*p)); if (!p) goto out_unparsable; @@ -2540,9 +2537,7 @@ out_msg_denied: error = -EPROTONOSUPPORT; goto out_err; default: - trace_rpc__unparsable(task); - error = -EIO; - goto out_err; + goto out_unparsable; } p = xdr_inline_decode(xdr, sizeof(*p)); @@ -2577,8 +2572,7 @@ out_msg_denied: task->tk_xprt->servername); break; default: - trace_rpc__unparsable(task); - error = -EIO; + goto out_unparsable; } goto out_err; } From 928d42f7d8737e1d6327e09668525f59725dabf9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 15 Mar 2019 10:12:30 -0400 Subject: [PATCH 540/855] SUNRPC: Handle the SYSTEM_ERR rpc error Handle the SYSTEM_ERR rpc error by retrying the RPC call as if it were a garbage argument. Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 1f47801e0002..cb73d6c25857 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2497,6 +2497,7 @@ rpc_decode_header(struct rpc_task *task, struct xdr_stream *xdr) error = -EOPNOTSUPP; goto out_err; case rpc_garbage_args: + case rpc_system_err: trace_rpc__garbage_args(task); error = -EIO; break; From 5e3863fd597eba8c6679de805681631b1aad9bdb Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 15 Mar 2019 13:11:36 -0400 Subject: [PATCH 541/855] SUNRPC: Remove redundant check for the reply length in call_decode() Now that we're using the xdr_stream functions to decode the header, the test for the minimum reply length is redundant. 
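Every header field is now pulled through xdr_inline_decode(), which already fails cleanly on a truncated buffer, e.g. (simplified from rpc_decode_header() above):

	p = xdr_inline_decode(xdr, 3 * sizeof(*p));
	if (!p)
		goto out_unparsable;	/* short reply rejected here */

so the open-coded rq_rcv_buf.len < 12 check adds nothing.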
Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index cb73d6c25857..228970e6e52b 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2392,9 +2392,6 @@ call_decode(struct rpc_task *task) WARN_ON(memcmp(&req->rq_rcv_buf, &req->rq_private_buf, sizeof(req->rq_rcv_buf)) != 0); - if (req->rq_rcv_buf.len < 12) - goto out_retry; - xdr_init_decode(&xdr, &req->rq_rcv_buf, req->rq_rcv_buf.head[0].iov_base, req); switch (rpc_decode_header(task, &xdr)) { @@ -2405,7 +2402,6 @@ call_decode(struct rpc_task *task) task->tk_pid, __func__, task->tk_status); return; case -EAGAIN: -out_retry: task->tk_status = 0; /* Note: rpc_decode_header() may have freed the RPC slot */ if (task->tk_rqstp == req) { From 09bb839434bd845c01da3d159b0c126fe7fa90da Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 13 Mar 2019 12:39:28 -0600 Subject: [PATCH 542/855] io_uring: fix fget/fput handling This isn't a straight port of commit 84c4e1f89fef for aio.c, since io_uring doesn't use files in exactly the same way. But it's pretty close. See the commit message for that commit. This essentially fixes a use-after-free with the poll command handling, but it takes cue from Linus's approach to just simplifying the file handling. We move the setup of the file into a higher level location, so the individual commands don't have to deal with it. And then we release the reference when we free the associated io_kiocb. Fixes: 221c5eb23382 ("io_uring: add support for IORING_OP_POLL") Signed-off-by: Jens Axboe --- fs/io_uring.c | 230 +++++++++++++++++++++----------------------------- 1 file changed, 97 insertions(+), 133 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index d259e8a6cb2e..c08fa62e1978 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -189,6 +189,10 @@ struct sqe_submit { bool needs_fixed_file; }; +/* + * First field must be the file pointer in all the + * iocb unions! See also 'struct kiocb' in + */ struct io_poll_iocb { struct file *file; struct wait_queue_head *head; @@ -198,8 +202,15 @@ struct io_poll_iocb { struct wait_queue_entry wait; }; +/* + * NOTE! Each of the iocb union members has the file pointer + * as the first entry in their struct definition. So you can + * access the file pointer through any of the sub-structs, + * or directly as just 'ki_filp' in this struct. + */ struct io_kiocb { union { + struct file *file; struct kiocb rw; struct io_poll_iocb poll; }; @@ -431,6 +442,8 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr) static void io_free_req(struct io_kiocb *req) { + if (req->file && !(req->flags & REQ_F_FIXED_FILE)) + fput(req->file); io_ring_drop_ctx_refs(req->ctx, 1); kmem_cache_free(req_cachep, req); } @@ -448,45 +461,34 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, struct list_head *done) { void *reqs[IO_IOPOLL_BATCH]; - int file_count, to_free; - struct file *file = NULL; struct io_kiocb *req; + int to_free; - file_count = to_free = 0; + to_free = 0; while (!list_empty(done)) { req = list_first_entry(done, struct io_kiocb, list); list_del(&req->list); io_cqring_fill_event(ctx, req->user_data, req->error, 0); - - if (refcount_dec_and_test(&req->refs)) - reqs[to_free++] = req; (*nr_events)++; - /* - * Batched puts of the same file, to avoid dirtying the - * file usage count multiple times, if avoidable. 
- */ - if (!(req->flags & REQ_F_FIXED_FILE)) { - if (!file) { - file = req->rw.ki_filp; - file_count = 1; - } else if (file == req->rw.ki_filp) { - file_count++; + if (refcount_dec_and_test(&req->refs)) { + /* If we're not using fixed files, we have to pair the + * completion part with the file put. Use regular + * completions for those, only batch free for fixed + * file. + */ + if (req->flags & REQ_F_FIXED_FILE) { + reqs[to_free++] = req; + if (to_free == ARRAY_SIZE(reqs)) + io_free_req_many(ctx, reqs, &to_free); } else { - fput_many(file, file_count); - file = req->rw.ki_filp; - file_count = 1; + io_free_req(req); } } - - if (to_free == ARRAY_SIZE(reqs)) - io_free_req_many(ctx, reqs, &to_free); } - io_commit_cqring(ctx); - if (file) - fput_many(file, file_count); + io_commit_cqring(ctx); io_free_req_many(ctx, reqs, &to_free); } @@ -609,19 +611,12 @@ static void kiocb_end_write(struct kiocb *kiocb) } } -static void io_fput(struct io_kiocb *req) -{ - if (!(req->flags & REQ_F_FIXED_FILE)) - fput(req->rw.ki_filp); -} - static void io_complete_rw(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); kiocb_end_write(kiocb); - io_fput(req); io_cqring_add_event(req->ctx, req->user_data, res, 0); io_put_req(req); } @@ -738,31 +733,18 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, const struct io_uring_sqe *sqe = s->sqe; struct io_ring_ctx *ctx = req->ctx; struct kiocb *kiocb = &req->rw; - unsigned ioprio, flags; - int fd, ret; + unsigned ioprio; + int ret; + if (!req->file) + return -EBADF; /* For -EAGAIN retry, everything is already prepped */ if (req->flags & REQ_F_PREPPED) return 0; - flags = READ_ONCE(sqe->flags); - fd = READ_ONCE(sqe->fd); + if (force_nonblock && !io_file_supports_async(req->file)) + force_nonblock = false; - if (flags & IOSQE_FIXED_FILE) { - if (unlikely(!ctx->user_files || - (unsigned) fd >= ctx->nr_user_files)) - return -EBADF; - kiocb->ki_filp = ctx->user_files[fd]; - req->flags |= REQ_F_FIXED_FILE; - } else { - if (s->needs_fixed_file) - return -EBADF; - kiocb->ki_filp = io_file_get(state, fd); - if (unlikely(!kiocb->ki_filp)) - return -EBADF; - if (force_nonblock && !io_file_supports_async(kiocb->ki_filp)) - force_nonblock = false; - } kiocb->ki_pos = READ_ONCE(sqe->off); kiocb->ki_flags = iocb_flags(kiocb->ki_filp); kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); @@ -771,7 +753,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, if (ioprio) { ret = ioprio_check_cap(ioprio); if (ret) - goto out_fput; + return ret; kiocb->ki_ioprio = ioprio; } else @@ -779,39 +761,26 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); if (unlikely(ret)) - goto out_fput; + return ret; if (force_nonblock) { kiocb->ki_flags |= IOCB_NOWAIT; req->flags |= REQ_F_FORCE_NONBLOCK; } if (ctx->flags & IORING_SETUP_IOPOLL) { - ret = -EOPNOTSUPP; if (!(kiocb->ki_flags & IOCB_DIRECT) || !kiocb->ki_filp->f_op->iopoll) - goto out_fput; + return -EOPNOTSUPP; req->error = 0; kiocb->ki_flags |= IOCB_HIPRI; kiocb->ki_complete = io_complete_rw_iopoll; } else { - if (kiocb->ki_flags & IOCB_HIPRI) { - ret = -EINVAL; - goto out_fput; - } + if (kiocb->ki_flags & IOCB_HIPRI) + return -EINVAL; kiocb->ki_complete = io_complete_rw; } req->flags |= REQ_F_PREPPED; return 0; -out_fput: - if (!(flags & IOSQE_FIXED_FILE)) { - /* - * in case of error, we didn't use this file reference. drop it. 
- */ - if (state) - state->used_refs--; - io_file_put(state, kiocb->ki_filp); - } - return ret; } static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) @@ -968,16 +937,14 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, return ret; file = kiocb->ki_filp; - ret = -EBADF; if (unlikely(!(file->f_mode & FMODE_READ))) - goto out_fput; - ret = -EINVAL; + return -EBADF; if (unlikely(!file->f_op->read_iter)) - goto out_fput; + return -EINVAL; ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter); if (ret) - goto out_fput; + return ret; iov_count = iov_iter_count(&iter); ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count); @@ -999,10 +966,6 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, } } kfree(iovec); -out_fput: - /* Hold on to the file for -EAGAIN */ - if (unlikely(ret && ret != -EAGAIN)) - io_fput(req); return ret; } @@ -1020,17 +983,15 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, if (ret) return ret; - ret = -EBADF; file = kiocb->ki_filp; if (unlikely(!(file->f_mode & FMODE_WRITE))) - goto out_fput; - ret = -EINVAL; + return -EBADF; if (unlikely(!file->f_op->write_iter)) - goto out_fput; + return -EINVAL; ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter); if (ret) - goto out_fput; + return ret; iov_count = iov_iter_count(&iter); @@ -1062,10 +1023,6 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, } out_free: kfree(iovec); -out_fput: - /* Hold on to the file for -EAGAIN */ - if (unlikely(ret && ret != -EAGAIN)) - io_fput(req); return ret; } @@ -1080,16 +1037,6 @@ static int io_nop(struct io_kiocb *req, u64 user_data) if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - /* - * Twilight zone - it's possible that someone issued an opcode that - * has a file attached, then got -EAGAIN on submission, and changed - * the sqe before we retried it from async context. Avoid dropping - * a file reference for this malicious case, and flag the error. - */ - if (req->rw.ki_filp) { - err = -EBADF; - io_fput(req); - } io_cqring_add_event(ctx, user_data, err, 0); io_put_req(req); return 0; @@ -1098,9 +1045,9 @@ static int io_nop(struct io_kiocb *req, u64 user_data) static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; - unsigned flags; - int fd; + if (!req->file) + return -EBADF; /* Prep already done (EAGAIN retry) */ if (req->flags & REQ_F_PREPPED) return 0; @@ -1110,20 +1057,6 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) return -EINVAL; - fd = READ_ONCE(sqe->fd); - flags = READ_ONCE(sqe->flags); - - if (flags & IOSQE_FIXED_FILE) { - if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files)) - return -EBADF; - req->rw.ki_filp = ctx->user_files[fd]; - req->flags |= REQ_F_FIXED_FILE; - } else { - req->rw.ki_filp = fget(fd); - if (unlikely(!req->rw.ki_filp)) - return -EBADF; - } - req->flags |= REQ_F_PREPPED; return 0; } @@ -1153,7 +1086,6 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe, end > 0 ? 
end : LLONG_MAX, fsync_flags & IORING_FSYNC_DATASYNC); - io_fput(req); io_cqring_add_event(req->ctx, sqe->user_data, ret, 0); io_put_req(req); return 0; @@ -1220,7 +1152,6 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe) static void io_poll_complete(struct io_kiocb *req, __poll_t mask) { io_cqring_add_event(req->ctx, req->user_data, mangle_poll(mask), 0); - io_fput(req); io_put_req(req); } @@ -1314,34 +1245,20 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_poll_iocb *poll = &req->poll; struct io_ring_ctx *ctx = req->ctx; struct io_poll_table ipt; - unsigned flags; __poll_t mask; u16 events; - int fd; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index) return -EINVAL; + if (!poll->file) + return -EBADF; INIT_WORK(&req->work, io_poll_complete_work); events = READ_ONCE(sqe->poll_events); poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; - flags = READ_ONCE(sqe->flags); - fd = READ_ONCE(sqe->fd); - - if (flags & IOSQE_FIXED_FILE) { - if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files)) - return -EBADF; - poll->file = ctx->user_files[fd]; - req->flags |= REQ_F_FIXED_FILE; - } else { - poll->file = fget(fd); - } - if (unlikely(!poll->file)) - return -EBADF; - poll->head = NULL; poll->woken = false; poll->canceled = false; @@ -1380,8 +1297,6 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) out: if (unlikely(ipt.error)) { - if (!(flags & IOSQE_FIXED_FILE)) - fput(poll->file); /* * Drop one of our refs to this req, __io_submit_sqe() will * drop the other one since we're returning an error. @@ -1621,6 +1536,50 @@ static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req) return ret; } +static bool io_op_needs_file(const struct io_uring_sqe *sqe) +{ + int op = READ_ONCE(sqe->opcode); + + switch (op) { + case IORING_OP_NOP: + case IORING_OP_POLL_REMOVE: + return false; + default: + return true; + } +} + +static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s, + struct io_submit_state *state, struct io_kiocb *req) +{ + unsigned flags; + int fd; + + flags = READ_ONCE(s->sqe->flags); + fd = READ_ONCE(s->sqe->fd); + + if (!io_op_needs_file(s->sqe)) { + req->file = NULL; + return 0; + } + + if (flags & IOSQE_FIXED_FILE) { + if (unlikely(!ctx->user_files || + (unsigned) fd >= ctx->nr_user_files)) + return -EBADF; + req->file = ctx->user_files[fd]; + req->flags |= REQ_F_FIXED_FILE; + } else { + if (s->needs_fixed_file) + return -EBADF; + req->file = io_file_get(state, fd); + if (unlikely(!req->file)) + return -EBADF; + } + + return 0; +} + static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, struct io_submit_state *state) { @@ -1635,6 +1594,10 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, if (unlikely(!req)) return -EAGAIN; + ret = io_req_set_file(ctx, s, state, req); + if (unlikely(ret)) + goto out; + ret = __io_submit_sqe(ctx, req, s, true, state); if (ret == -EAGAIN) { struct io_uring_sqe *sqe_copy; @@ -1664,6 +1627,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, } } +out: /* drop submission reference */ io_put_req(req); From 52eaa798f4f4e983c711eaa1c13d8859a52946e8 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 12 Mar 2019 09:08:12 +0100 Subject: [PATCH 543/855] scsi: ia64: simscsi: use request tag instead of serial_number Use the request tag for logging instead of 
the scsi command serial number. Signed-off-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Martin K. Petersen [jejb: fix commit oneliner] Signed-off-by: James Bottomley --- arch/ia64/hp/sim/simscsi.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/ia64/hp/sim/simscsi.c b/arch/ia64/hp/sim/simscsi.c index f86844fc0725..0a8a74271173 100644 --- a/arch/ia64/hp/sim/simscsi.c +++ b/arch/ia64/hp/sim/simscsi.c @@ -105,7 +105,8 @@ simscsi_interrupt (unsigned long val) atomic_dec(&num_reqs); queue[rd].sc = NULL; if (DBG) - printk("simscsi_interrupt: done with %ld\n", sc->serial_number); + printk("simscsi_interrupt: done with %u\n", + sc->request->tag); (*sc->scsi_done)(sc); rd = (rd + 1) % SIMSCSI_REQ_QUEUE_LEN; } @@ -214,8 +215,8 @@ simscsi_queuecommand_lck (struct scsi_cmnd *sc, void (*done)(struct scsi_cmnd *) register long sp asm ("sp"); if (DBG) - printk("simscsi_queuecommand: target=%d,cmnd=%u,sc=%lu,sp=%lx,done=%p\n", - target_id, sc->cmnd[0], sc->serial_number, sp, done); + printk("simscsi_queuecommand: target=%d,cmnd=%u,sc=%u,sp=%lx,done=%p\n", + target_id, sc->cmnd[0], sc->request->tag, sp, done); #endif sc->result = DID_BAD_TARGET << 16; From 92da008fa21034c369cdb8ca2b629fe5c196826b Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Tue, 12 Mar 2019 11:45:58 -0700 Subject: [PATCH 544/855] Revert "KVM/MMU: Flush tlb directly in the kvm_zap_gfn_range()" This reverts commit 71883a62fcd6c70639fa12cda733378b4d997409. The above commit contains an optimization to kvm_zap_gfn_range which uses gfn-limited TLB flushes, if enabled. If using these limited flushes, kvm_zap_gfn_range passes lock_flush_tlb=false to slot_handle_level_range which creates a race when the function unlocks to call cond_resched. See an example of this race below: CPU 0 CPU 1 CPU 3 // zap_direct_gfn_range mmu_lock() // *ptep == pte_1 *ptep = 0 if (lock_flush_tlb) flush_tlbs() mmu_unlock() // In invalidate range // MMU notifier mmu_lock() if (pte != 0) *ptep = 0 flush = true if (flush) flush_remote_tlbs() mmu_unlock() return // Host MM reallocates // page previously // backing guest memory. // Guest accesses // invalid page // through pte_1 // in its TLB!! Tested: Ran all kvm-unit-tests on a Intel Haswell machine with and without this patch. The patch introduced no new failures. 
Signed-off-by: Ben Gardon Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8d43b7c0f56f..4cda5ee48845 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5660,13 +5660,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; - bool flush_tlb = true; - bool flush = false; int i; - if (kvm_available_flush_tlb_with_range()) - flush_tlb = false; - spin_lock(&kvm->mmu_lock); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { slots = __kvm_memslots(kvm, i); @@ -5678,17 +5673,12 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) if (start >= end) continue; - flush |= slot_handle_level_range(kvm, memslot, - kvm_zap_rmapp, PT_PAGE_TABLE_LEVEL, - PT_MAX_HUGEPAGE_LEVEL, start, - end - 1, flush_tlb); + slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, + PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL, + start, end - 1, true); } } - if (flush) - kvm_flush_remote_tlbs_with_address(kvm, gfn_start, - gfn_end - gfn_start + 1); - spin_unlock(&kvm->mmu_lock); } From 46333236485c8647e40ac2922579e29c5e49ed16 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 13 Mar 2019 12:55:55 -0700 Subject: [PATCH 545/855] MAINTAINERS: Add KVM selftests to existing KVM entry It's safe to assume Paolo and Radim are maintaining the KVM selftests given that the vast majority of commits have their SOBs. Play nice with get_maintainers and make it official. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 8347269aa386..b95f5b69e5be 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8295,6 +8295,7 @@ F: include/linux/kvm* F: include/kvm/iodev.h F: virt/kvm/* F: tools/kvm/ +F: tools/testing/selftests/kvm/ KERNEL VIRTUAL MACHINE FOR AMD-V (KVM/amd) M: Joerg Roedel From a75d4c33377277b6034dd1e2663bce444f952c14 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 13 Mar 2019 11:44:14 -0700 Subject: [PATCH 546/855] filemap: kill page_cache_read usage in filemap_fault Patch series "drop the mmap_sem when doing IO in the fault path", v6. Now that we have proper isolation in place with cgroups2 we have started going through and fixing the various priority inversions. Most are all gone now, but this one is sort of weird since it's not necessarily a priority inversion that happens within the kernel, but rather because of something userspace does. We have giant applications that we want to protect, and parts of these giant applications do things like watch the system state to determine how healthy the box is for load balancing and such. This involves running 'ps' or other such utilities. These utilities will often walk /proc//whatever, and these files can sometimes need to down_read(&task->mmap_sem). Not usually a big deal, but we noticed when we are stress testing that sometimes our protected application has latency spikes trying to get the mmap_sem for tasks that are in lower priority cgroups. This is because any down_write() on a semaphore essentially turns it into a mutex, so even if we currently have it held for reading, any new readers will not be allowed on to keep from starving the writer. This is fine, except a lower priority task could be stuck doing IO because it has been throttled to the point that its IO is taking much longer than normal. 
But because a higher priority group depends on this completing it is now stuck behind lower priority work. In order to avoid this particular priority inversion we want to use the existing retry mechanism to stop from holding the mmap_sem at all if we are going to do IO. This already exists in the read case sort of, but needed to be extended for more than just grabbing the page lock. With io.latency we throttle at submit_bio() time, so the readahead stuff can block and even page_cache_read can block, so all these paths need to have the mmap_sem dropped. The other big thing is ->page_mkwrite. btrfs is particularly shitty here because we have to reserve space for the dirty page, which can be a very expensive operation. We use the same retry method as the read path, and simply cache the page and verify the page is still setup properly the next pass through ->page_mkwrite(). I've tested these patches with xfstests and there are no regressions. This patch (of 3): If we do not have a page at filemap_fault time we'll do this weird forced page_cache_read thing to populate the page, and then drop it again and loop around and find it. This makes for 2 ways we can read a page in filemap_fault, and it's not really needed. Instead add a FGP_FOR_MMAP flag so that pagecache_get_page() will return a unlocked page that's in pagecache. Then use the normal page locking and readpage logic already in filemap_fault. This simplifies the no page in page cache case significantly. [akpm@linux-foundation.org: fix comment text] [josef@toxicpanda.com: don't unlock null page in FGP_FOR_MMAP case] Link: http://lkml.kernel.org/r/20190312201742.22935-1-josef@toxicpanda.com Link: http://lkml.kernel.org/r/20181211173801.29535-2-josef@toxicpanda.com Signed-off-by: Josef Bacik Acked-by: Johannes Weiner Reviewed-by: Jan Kara Reviewed-by: Andrew Morton Cc: Tejun Heo Cc: Dave Chinner Cc: Rik van Riel Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 1 + mm/filemap.c | 75 +++++++++-------------------------------- 2 files changed, 16 insertions(+), 60 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index b477a70cc2e4..bcf909d0de5f 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -239,6 +239,7 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, #define FGP_WRITE 0x00000008 #define FGP_NOFS 0x00000010 #define FGP_NOWAIT 0x00000020 +#define FGP_FOR_MMAP 0x00000040 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, int fgp_flags, gfp_t cache_gfp_mask); diff --git a/mm/filemap.c b/mm/filemap.c index ec6566ffbd90..64d014f940e9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1587,6 +1587,9 @@ EXPORT_SYMBOL(find_lock_entry); * @gfp_mask and added to the page cache and the VM's LRU * list. The page is returned locked and with an increased * refcount. + * - FGP_FOR_MMAP: Similar to FGP_CREAT, only we want to allow the caller to do + * its own locking dance if the page is already in cache, or unlock the page + * before returning if we had to add the page to pagecache. * * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even * if the GFP flags specified for FGP_CREAT are atomic. 
@@ -1641,7 +1644,7 @@ no_page: if (!page) return NULL; - if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) + if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP)))) fgp_flags |= FGP_LOCK; /* Init accessed so avoid atomic mark_page_accessed later */ @@ -1655,6 +1658,13 @@ no_page: if (err == -EEXIST) goto repeat; } + + /* + * add_to_page_cache_lru locks the page, and for mmap we expect + * an unlocked page. + */ + if (page && (fgp_flags & FGP_FOR_MMAP)) + unlock_page(page); } return page; @@ -2379,41 +2389,6 @@ out: EXPORT_SYMBOL(generic_file_read_iter); #ifdef CONFIG_MMU -/** - * page_cache_read - adds requested page to the page cache if not already there - * @file: file to read - * @offset: page index - * @gfp_mask: memory allocation flags - * - * This adds the requested page to the page cache if it isn't already there, - * and schedules an I/O to read in its contents from disk. - * - * Return: %0 on success, negative error code otherwise. - */ -static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask) -{ - struct address_space *mapping = file->f_mapping; - struct page *page; - int ret; - - do { - page = __page_cache_alloc(gfp_mask); - if (!page) - return -ENOMEM; - - ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask); - if (ret == 0) - ret = mapping->a_ops->readpage(file, page); - else if (ret == -EEXIST) - ret = 0; /* losing race to add is OK */ - - put_page(page); - - } while (ret == AOP_TRUNCATED_PAGE); - - return ret; -} - #define MMAP_LOTSAMISS (100) /* @@ -2539,9 +2514,11 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; retry_find: - page = find_get_page(mapping, offset); + page = pagecache_get_page(mapping, offset, + FGP_CREAT|FGP_FOR_MMAP, + vmf->gfp_mask); if (!page) - goto no_cached_page; + return vmf_error(-ENOMEM); } if (!lock_page_or_retry(page, vmf->vma->vm_mm, vmf->flags)) { @@ -2578,28 +2555,6 @@ retry_find: vmf->page = page; return ret | VM_FAULT_LOCKED; -no_cached_page: - /* - * We're only likely to ever get here if MADV_RANDOM is in - * effect. - */ - error = page_cache_read(file, offset, vmf->gfp_mask); - - /* - * The page we want has now been added to the page cache. - * In the unlikely event that someone removed it in the - * meantime, we'll just come back here and read it again. - */ - if (error >= 0) - goto retry_find; - - /* - * An error return from page_cache_read can result if the - * system is low on memory, or a problem occurs while trying - * to schedule I/O. - */ - return vmf_error(error); - page_not_uptodate: /* * Umm, take care of errors if the page isn't up-to-date. From 6b4c9f4469819a0c1a38a0a4541337e0f9bf6c11 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 13 Mar 2019 11:44:22 -0700 Subject: [PATCH 547/855] filemap: drop the mmap_sem for all blocking operations Currently we only drop the mmap_sem if there is contention on the page lock. The idea is that we issue readahead and then go to lock the page while it is under IO and we want to not hold the mmap_sem during the IO. The problem with this is the assumption that the readahead does anything. In the case that the box is under extreme memory or IO pressure we may end up not reading anything at all for readahead, which means we will end up reading in the page under the mmap_sem. Even if the readahead does something, it could get throttled because of io pressure on the system and the process is in a lower priority cgroup. 
Holding the mmap_sem while doing IO is problematic because it can cause system-wide priority inversions. Consider some large company that does a lot of web traffic. This large company has load balancing logic in its core web server, because some engineer thought this was a brilliant plan. This load balancing logic gets statistics from /proc about the system, which trip over processes' mmap_sem for various reasons. Now the web server application is in a protected cgroup, but these other processes may not be, and if they are being throttled while their mmap_sem is held we'll stall, and cause this nice death spiral. Instead, rework the filemap fault path to drop the mmap_sem at any point that we may do IO or block for an extended period of time. This includes while issuing readahead, locking the page, or needing to call ->readpage because readahead did not occur. Then once we have a fully uptodate page we can return with VM_FAULT_RETRY and come back again to find our nicely in-cache page that was gotten outside of the mmap_sem. This patch also adds a new helper for locking the page with the mmap_sem dropped. This doesn't make sense currently as, generally speaking, if the page is already locked it'll have been read in (unless there was an error) before it was unlocked. However, a forthcoming patchset will change this with the ability to abort read-ahead bios if necessary, making it more likely that we could contend for a page lock and still have a page that is not uptodate. This allows us to deal with this case by grabbing the lock and issuing the IO without the mmap_sem held, and then returning VM_FAULT_RETRY to come back around. [josef@toxicpanda.com: v6] Link: http://lkml.kernel.org/r/20181212152757.10017-1-josef@toxicpanda.com [kirill@shutemov.name: fix race in filemap_fault()] Link: http://lkml.kernel.org/r/20181228235106.okk3oastsnpxusxs@kshutemo-mobl1 [akpm@linux-foundation.org: coding style fixes] Link: http://lkml.kernel.org/r/20181211173801.29535-4-josef@toxicpanda.com Signed-off-by: Josef Bacik Acked-by: Johannes Weiner Reviewed-by: Andrew Morton Reviewed-by: Jan Kara Tested-by: syzbot+b437b5a429d680cf2217@syzkaller.appspotmail.com Cc: Dave Chinner Cc: Rik van Riel Cc: Tejun Heo Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 136 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 117 insertions(+), 19 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 64d014f940e9..2815cb79a246 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2390,28 +2390,92 @@ EXPORT_SYMBOL(generic_file_read_iter); #ifdef CONFIG_MMU #define MMAP_LOTSAMISS (100) +static struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, + struct file *fpin) +{ + int flags = vmf->flags; + + if (fpin) + return fpin; + + /* + * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or + * anything, so we only pin the file and drop the mmap_sem if only + * FAULT_FLAG_ALLOW_RETRY is set. + */ + if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) == + FAULT_FLAG_ALLOW_RETRY) { + fpin = get_file(vmf->vma->vm_file); + up_read(&vmf->vma->vm_mm->mmap_sem); + } + return fpin; +} /* - * Synchronous readahead happens when we don't even find - * a page in the page cache at all. + * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_sem + * @vmf - the vm_fault for this fault. + * @page - the page to lock. + * @fpin - the pointer to the file we may pin (or is already pinned).
+ * + * This works similar to lock_page_or_retry in that it can drop the mmap_sem. + * It differs in that it actually returns the page locked if it returns 1 and 0 + * if it couldn't lock the page. If we did have to drop the mmap_sem then fpin + * will point to the pinned file and needs to be fput()'ed at a later point. */ -static void do_sync_mmap_readahead(struct vm_fault *vmf) +static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page, + struct file **fpin) +{ + if (trylock_page(page)) + return 1; + + if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) + return 0; + + *fpin = maybe_unlock_mmap_for_io(vmf, *fpin); + if (vmf->flags & FAULT_FLAG_KILLABLE) { + if (__lock_page_killable(page)) { + /* + * We didn't have the right flags to drop the mmap_sem, + * but all fault_handlers only check for fatal signals + * if we return VM_FAULT_RETRY, so we need to drop the + * mmap_sem here and return 0 if we don't have a fpin. + */ + if (*fpin == NULL) + up_read(&vmf->vma->vm_mm->mmap_sem); + return 0; + } + } else + __lock_page(page); + return 1; +} + + +/* + * Synchronous readahead happens when we don't even find a page in the page + * cache at all. We don't want to perform IO under the mmap sem, so if we have + * to drop the mmap sem we return the file that was pinned in order for us to do + * that. If we didn't pin a file then we return NULL. The file that is + * returned needs to be fput()'ed when we're done with it. + */ +static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct file_ra_state *ra = &file->f_ra; struct address_space *mapping = file->f_mapping; + struct file *fpin = NULL; pgoff_t offset = vmf->pgoff; /* If we don't want any read-ahead, don't bother */ if (vmf->vma->vm_flags & VM_RAND_READ) - return; + return fpin; if (!ra->ra_pages) - return; + return fpin; if (vmf->vma->vm_flags & VM_SEQ_READ) { + fpin = maybe_unlock_mmap_for_io(vmf, fpin); page_cache_sync_readahead(mapping, ra, file, offset, ra->ra_pages); - return; + return fpin; } /* Avoid banging the cache line if not needed */ @@ -2423,37 +2487,44 @@ static void do_sync_mmap_readahead(struct vm_fault *vmf) * stop bothering with read-ahead. It will only hurt. */ if (ra->mmap_miss > MMAP_LOTSAMISS) - return; + return fpin; /* * mmap read-around */ + fpin = maybe_unlock_mmap_for_io(vmf, fpin); ra->start = max_t(long, 0, offset - ra->ra_pages / 2); ra->size = ra->ra_pages; ra->async_size = ra->ra_pages / 4; ra_submit(ra, mapping, file); + return fpin; } /* * Asynchronous readahead happens when we find the page and PG_readahead, - * so we want to possibly extend the readahead further.. + * so we want to possibly extend the readahead further. We return the file that + * was pinned if we have to drop the mmap_sem in order to do IO. 
*/ -static void do_async_mmap_readahead(struct vm_fault *vmf, - struct page *page) +static struct file *do_async_mmap_readahead(struct vm_fault *vmf, + struct page *page) { struct file *file = vmf->vma->vm_file; struct file_ra_state *ra = &file->f_ra; struct address_space *mapping = file->f_mapping; + struct file *fpin = NULL; pgoff_t offset = vmf->pgoff; /* If we don't want any read-ahead, don't bother */ if (vmf->vma->vm_flags & VM_RAND_READ) - return; + return fpin; if (ra->mmap_miss > 0) ra->mmap_miss--; - if (PageReadahead(page)) + if (PageReadahead(page)) { + fpin = maybe_unlock_mmap_for_io(vmf, fpin); page_cache_async_readahead(mapping, ra, file, page, offset, ra->ra_pages); + } + return fpin; } /** @@ -2485,6 +2556,7 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) { int error; struct file *file = vmf->vma->vm_file; + struct file *fpin = NULL; struct address_space *mapping = file->f_mapping; struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; @@ -2506,25 +2578,26 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) * We found the page, so try async readahead before * waiting for the lock. */ - do_async_mmap_readahead(vmf, page); + fpin = do_async_mmap_readahead(vmf, page); } else if (!page) { /* No page in the page cache at all */ - do_sync_mmap_readahead(vmf); count_vm_event(PGMAJFAULT); count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; + fpin = do_sync_mmap_readahead(vmf); retry_find: page = pagecache_get_page(mapping, offset, FGP_CREAT|FGP_FOR_MMAP, vmf->gfp_mask); - if (!page) + if (!page) { + if (fpin) + goto out_retry; return vmf_error(-ENOMEM); + } } - if (!lock_page_or_retry(page, vmf->vma->vm_mm, vmf->flags)) { - put_page(page); - return ret | VM_FAULT_RETRY; - } + if (!lock_page_maybe_drop_mmap(vmf, page, &fpin)) + goto out_retry; /* Did it get truncated? */ if (unlikely(page->mapping != mapping)) { @@ -2541,6 +2614,16 @@ retry_find: if (unlikely(!PageUptodate(page))) goto page_not_uptodate; + /* + * We've made it this far and we had to drop our mmap_sem, now is the + * time to return to the upper layer and have it re-find the vma and + * redo the fault. + */ + if (fpin) { + unlock_page(page); + goto out_retry; + } + /* * Found the page and have a reference on it. * We must recheck i_size under page lock. @@ -2563,12 +2646,15 @@ page_not_uptodate: * and we need to check for errors. */ ClearPageError(page); + fpin = maybe_unlock_mmap_for_io(vmf, fpin); error = mapping->a_ops->readpage(file, page); if (!error) { wait_on_page_locked(page); if (!PageUptodate(page)) error = -EIO; } + if (fpin) + goto out_retry; put_page(page); if (!error || error == AOP_TRUNCATED_PAGE) @@ -2577,6 +2663,18 @@ page_not_uptodate: /* Things didn't work out. Return zero to tell the mm layer so. */ shrink_readahead_size_eio(file, ra); return VM_FAULT_SIGBUS; + +out_retry: + /* + * We dropped the mmap_sem, we need to return to the fault handler to + * re-find the vma and come back and find our hopefully still populated + * page. 
+ */ + if (page) + put_page(page); + if (fpin) + fput(fpin); + return ret | VM_FAULT_RETRY; } EXPORT_SYMBOL(filemap_fault); From eca6be566d47029f945a5f8e1c94d374e31df2ca Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 15 Feb 2019 12:48:40 -0800 Subject: [PATCH 548/855] KVM: doc: Document the life cycle of a VM and its resources The series to add memcg accounting to KVM allocations[1] states: There are many KVM kernel memory allocations which are tied to the life of the VM process and should be charged to the VM process's cgroup. While it is correct to account KVM kernel allocations to the cgroup of the process that created the VM, it's technically incorrect to state that the KVM kernel memory allocations are tied to the life of the VM process. This is because the VM itself, i.e. struct kvm, is not tied to the life of the process which created it, rather it is tied to the life of its associated file descriptor. In other words, kvm_destroy_vm() is not invoked until fput() decrements its associated file's refcount to zero. A simple example is to fork() in Qemu and have the child sleep indefinitely; kvm_destroy_vm() isn't called until Qemu closes its file descriptor *and* the rogue child is killed. The allocations are guaranteed to be *accounted* to the process which created the VM, but only because KVM's per-{VM,vCPU} ioctls reject the ioctl() with -EIO if kvm->mm != current->mm. I.e. the child can keep the VM "alive" but can't do anything useful with its reference. Note that because 'struct kvm' also holds a reference to the mm_struct of its owner, the above behavior also applies to userspace allocations. Given that mucking with a VM's file descriptor can lead to subtle and undesirable behavior, e.g. memcg charges persisting after a VM is shut down, explicitly document a VM's lifecycle and its impact on the VM's resources. Alternatively, KVM could aggressively free resources when the creating process exits, e.g. via mmu_notifier->release(). However, mmu_notifier isn't guaranteed to be available, and freeing resources when the creator exits is likely to be error prone and fragile as KVM would need to ensure that it only freed resources that are truly out of reach. In practice, the existing behavior shouldn't be problematic as a properly configured system will prevent a child process from being moved out of the appropriate cgroup hierarchy, i.e. prevent hiding the process from the OOM killer, and will prevent an unprivileged user from being able to to hold a reference to struct kvm via another method, e.g. debugfs. [1]https://patchwork.kernel.org/patch/10806707/ Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 356156f5c52d..7de9eee73fcd 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -45,6 +45,23 @@ the API. The only supported use is one virtual machine per process, and one vcpu per thread. +It is important to note that althought VM ioctls may only be issued from +the process that created the VM, a VM's lifecycle is associated with its +file descriptor, not its creator (process). In other words, the VM and +its resources, *including the associated address space*, are not freed +until the last reference to the VM's file descriptor has been released. 
+For example, if fork() is issued after ioctl(KVM_CREATE_VM), the VM will +not be freed until both the parent (original) process and its child have +put their references to the VM's file descriptor. + +Because a VM's resources are not freed until the last reference to its +file descriptor is released, creating additional references to a VM via +via fork(), dup(), etc... without careful consideration is strongly +discouraged and may have unwanted side effects, e.g. memory allocated +by and on behalf of the VM's process may not be freed/unaccounted when +the VM is shut down. + + 3. Extensions ------------- From 4a605bc08e98381d8df61c30a4acb2eac15eb7da Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 15 Mar 2019 19:23:45 +0100 Subject: [PATCH 549/855] kvm: vmx: fix formatting of a comment Eliminate a gratuitous conflict with 5.0. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4950bb20e06a..886930db77c5 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1213,13 +1213,13 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) new.control) != old.control); /* - * Clear SN before reading the bitmap; this ensures that any - * interrupt that comes after the bitmap is read sets ON. The - * VT-d firmware * writes the bitmap and reads SN atomically (5.2.3 - * in the spec), so it doesn't really have a memory barrier that - * pairs with this. However, we cannot do that and we need one. + * Clear SN before reading the bitmap. The VT-d firmware + * writes the bitmap and reads SN atomically (5.2.3 in the + * spec), so it doesn't really have a memory barrier that + * pairs with this, but we cannot do that and we need one. */ smp_mb__after_atomic(); + if (!bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS)) pi_set_on(pi_desc); } From 8b0f9fa2e02dc95216577c3387b0707c5f60fbaf Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 15 Mar 2019 11:26:07 -0700 Subject: [PATCH 550/855] filemap: add a comment about FAULT_FLAG_RETRY_NOWAIT behavior I thought Josef Bacik's patch to drop the mmap_sem was buggy, because when looking at the error cases, there was one case where we returned VM_FAULT_RETRY without actually dropping the mmap_sem. Josef had to explain to me (using small words) that yes, that's actually what we're supposed to do, and his patch was correct. Which not only convinced me he knew what he was doing and I should stop arguing with him, but also that I should add a comment to the case I was confused about. Patiently-pointed-out-by: Josef Bacik Signed-off-by: Linus Torvalds --- mm/filemap.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/filemap.c b/mm/filemap.c index 2815cb79a246..d78f577baef2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2428,6 +2428,11 @@ static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page, if (trylock_page(page)) return 1; + /* + * NOTE! This will make us return with VM_FAULT_RETRY, but with + * the mmap_sem still held. That's how FAULT_FLAG_RETRY_NOWAIT + * is supposed to work. We have way too many special cases.. + */ if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) return 0; From 8c838788775a593527803786d376393b7c28f589 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Mar 2019 15:48:16 -0600 Subject: [PATCH 551/855] io_uring: fix poll races This is a straight port of Al's fix for the aio poll implementation, since the io_uring version is heavily based on that. 
The below description is almost straight from that patch, just modified to fit the io_uring situation. io_poll() has to cope with several unpleasant problems: * requests that might stay around indefinitely need to be made visible for io_cancel(2); that must not be done to a request already completed, though. * in cases when ->poll() has placed us on a waitqueue, wakeup might have happened (and request completed) before ->poll() returns. * worse, in some early wakeup cases request might end up re-added into the queue later - we can't treat "woken up and currently not in the queue" as "it's not going to stick around indefinitely" * ... moreover, ->poll() might have decided not to put it on any queues to start with, and that needs to be distinguished from the previous case * ->poll() might have tried to put us on more than one queue. Only the first will succeed for io poll, so we might end up missing wakeups. OTOH, we might very well notice that only after the wakeup hits and request gets completed (all before ->poll() gets around to the second poll_wait()). In that case it's too late to decide that we have an error. req->woken was an attempt to deal with that. Unfortunately, it was broken. What we need to keep track of is not that wakeup has happened - the thing might come back after that. It's that async reference is already gone and won't come back, so we can't (and needn't) put the request on the list of cancellables. The easiest case is "request hadn't been put on any waitqueues"; we can tell by seeing NULL apt.head, and in that case there won't be anything async. We should either complete the request ourselves (if vfs_poll() reports anything of interest) or return an error. In all other cases we get exclusion with wakeups by grabbing the queue lock. If request is currently on queue and we have something interesting from vfs_poll(), we can steal it and complete the request ourselves. If it's on queue and vfs_poll() has not reported anything interesting, we either put it on the cancellable list, or, if we know that it hadn't been put on all queues ->poll() wanted it on, we steal it and return an error. If it's _not_ on queue, it's either been already dealt with (in which case we do nothing), or there's io_poll_complete_work() about to be executed. In that case we either put it on the cancellable list, or, if we know it hadn't been put on all queues ->poll() wanted it on, simulate what cancel would've done. 
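For orientation, the operation being fixed is driven from userspace roughly as in the following minimal example; it assumes liburing is installed (liburing and io_uring_prep_poll_add() are outside this patch) and simply polls a pipe for readability through IORING_OP_POLL_ADD:

#include <liburing.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int fds[2];

	if (pipe(fds) || io_uring_queue_init(8, &ring, 0))
		return 1;

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_poll_add(sqe, fds[0], POLLIN);	/* IORING_OP_POLL_ADD */
	io_uring_submit(&ring);

	write(fds[1], "x", 1);		/* trigger the wakeup path */

	io_uring_wait_cqe(&ring, &cqe);
	printf("poll completion mask: 0x%x\n", cqe->res);
	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return 0;
}

A wakeup racing with the submission, as in this example, is exactly the window the rework below closes.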
Fixes: 221c5eb23382 ("io_uring: add support for IORING_OP_POLL") Signed-off-by: Jens Axboe --- fs/io_uring.c | 119 +++++++++++++++++++++++++------------------------- 1 file changed, 60 insertions(+), 59 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c08fa62e1978..12bb238aed6b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -197,7 +197,7 @@ struct io_poll_iocb { struct file *file; struct wait_queue_head *head; __poll_t events; - bool woken; + bool done; bool canceled; struct wait_queue_entry wait; }; @@ -367,20 +367,25 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data, } } -static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 ki_user_data, +static void io_cqring_ev_posted(struct io_ring_ctx *ctx) +{ + if (waitqueue_active(&ctx->wait)) + wake_up(&ctx->wait); + if (waitqueue_active(&ctx->sqo_wait)) + wake_up(&ctx->sqo_wait); +} + +static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data, long res, unsigned ev_flags) { unsigned long flags; spin_lock_irqsave(&ctx->completion_lock, flags); - io_cqring_fill_event(ctx, ki_user_data, res, ev_flags); + io_cqring_fill_event(ctx, user_data, res, ev_flags); io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); - if (waitqueue_active(&ctx->wait)) - wake_up(&ctx->wait); - if (waitqueue_active(&ctx->sqo_wait)) - wake_up(&ctx->sqo_wait); + io_cqring_ev_posted(ctx); } static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs) @@ -1149,10 +1154,12 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static void io_poll_complete(struct io_kiocb *req, __poll_t mask) +static void io_poll_complete(struct io_ring_ctx *ctx, struct io_kiocb *req, + __poll_t mask) { - io_cqring_add_event(req->ctx, req->user_data, mangle_poll(mask), 0); - io_put_req(req); + req->poll.done = true; + io_cqring_fill_event(ctx, req->user_data, mangle_poll(mask), 0); + io_commit_cqring(ctx); } static void io_poll_complete_work(struct work_struct *work) @@ -1180,9 +1187,11 @@ static void io_poll_complete_work(struct work_struct *work) return; } list_del_init(&req->list); + io_poll_complete(ctx, req, mask); spin_unlock_irq(&ctx->completion_lock); - io_poll_complete(req, mask); + io_cqring_ev_posted(ctx); + io_put_req(req); } static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, @@ -1193,29 +1202,25 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, struct io_kiocb *req = container_of(poll, struct io_kiocb, poll); struct io_ring_ctx *ctx = req->ctx; __poll_t mask = key_to_poll(key); - - poll->woken = true; + unsigned long flags; /* for instances that support it check for an event match first: */ - if (mask) { - unsigned long flags; - - if (!(mask & poll->events)) - return 0; - - /* try to complete the iocb inline if we can: */ - if (spin_trylock_irqsave(&ctx->completion_lock, flags)) { - list_del(&req->list); - spin_unlock_irqrestore(&ctx->completion_lock, flags); - - list_del_init(&poll->wait.entry); - io_poll_complete(req, mask); - return 1; - } - } + if (mask && !(mask & poll->events)) + return 0; list_del_init(&poll->wait.entry); - queue_work(ctx->sqo_wq, &req->work); + + if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) { + list_del(&req->list); + io_poll_complete(ctx, req, mask); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + + io_cqring_ev_posted(ctx); + io_put_req(req); + } else { + queue_work(ctx->sqo_wq, &req->work); + } + return 1; } @@ -1245,6 +1250,7 
@@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_poll_iocb *poll = &req->poll; struct io_ring_ctx *ctx = req->ctx; struct io_poll_table ipt; + bool cancel = false; __poll_t mask; u16 events; @@ -1260,7 +1266,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; poll->head = NULL; - poll->woken = false; + poll->done = false; poll->canceled = false; ipt.pt._qproc = io_poll_queue_proc; @@ -1273,41 +1279,36 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) init_waitqueue_func_entry(&poll->wait, io_poll_wake); mask = vfs_poll(poll->file, &ipt.pt) & poll->events; - if (unlikely(!poll->head)) { - /* we did not manage to set up a waitqueue, done */ - goto out; - } spin_lock_irq(&ctx->completion_lock); - spin_lock(&poll->head->lock); - if (poll->woken) { - /* wake_up context handles the rest */ - mask = 0; - ipt.error = 0; - } else if (mask || ipt.error) { - /* if we get an error or a mask we are done */ - WARN_ON_ONCE(list_empty(&poll->wait.entry)); - list_del_init(&poll->wait.entry); - } else { - /* actually waiting for an event */ - list_add_tail(&req->list, &ctx->cancel_list); + if (likely(poll->head)) { + spin_lock(&poll->head->lock); + if (unlikely(list_empty(&poll->wait.entry))) { + if (ipt.error) + cancel = true; + ipt.error = 0; + mask = 0; + } + if (mask || ipt.error) + list_del_init(&poll->wait.entry); + else if (cancel) + WRITE_ONCE(poll->canceled, true); + else if (!poll->done) /* actually waiting for an event */ + list_add_tail(&req->list, &ctx->cancel_list); + spin_unlock(&poll->head->lock); + } + if (mask) { /* no async, we'd stolen it */ + req->error = mangle_poll(mask); + ipt.error = 0; + io_poll_complete(ctx, req, mask); } - spin_unlock(&poll->head->lock); spin_unlock_irq(&ctx->completion_lock); -out: - if (unlikely(ipt.error)) { - /* - * Drop one of our refs to this req, __io_submit_sqe() will - * drop the other one since we're returning an error. - */ + if (mask) { + io_cqring_ev_posted(ctx); io_put_req(req); - return ipt.error; } - - if (mask) - io_poll_complete(req, mask); - return 0; + return ipt.error; } static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, From 6ade657d6125ec3ec07f95fa51e28138aef6208f Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Thu, 14 Mar 2019 22:58:29 -0500 Subject: [PATCH 552/855] ALSA: echoaudio: add a check for ioremap_nocache In case ioremap_nocache fails, the fix releases chip and returns an error code upstream to avoid NULL pointer dereference. 
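The general shape of the fix, sketched here with made-up names rather than the echoaudio code itself, is to check each resource-acquiring call and unwind the steps already taken before returning an error:

#include <linux/io.h>
#include <linux/ioport.h>

/* Illustrative only: check each step and release earlier ones on failure. */
static void __iomem *example_map_registers(unsigned long phys, size_t len)
{
	struct resource *res;
	void __iomem *regs;

	res = request_mem_region(phys, len, "example");
	if (!res)
		return NULL;

	regs = ioremap_nocache(phys, len);
	if (!regs) {
		release_mem_region(phys, len);	/* undo the earlier step */
		return NULL;
	}
	return regs;
}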
Signed-off-by: Kangjie Lu Signed-off-by: Takashi Iwai --- sound/pci/echoaudio/echoaudio.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sound/pci/echoaudio/echoaudio.c b/sound/pci/echoaudio/echoaudio.c index ea876b0b02b9..dc0084dc8550 100644 --- a/sound/pci/echoaudio/echoaudio.c +++ b/sound/pci/echoaudio/echoaudio.c @@ -1952,6 +1952,11 @@ static int snd_echo_create(struct snd_card *card, } chip->dsp_registers = (volatile u32 __iomem *) ioremap_nocache(chip->dsp_registers_phys, sz); + if (!chip->dsp_registers) { + dev_err(chip->card->dev, "ioremap failed\n"); + snd_echo_free(chip); + return -ENOMEM; + } if (request_irq(pci->irq, snd_echo_interrupt, IRQF_SHARED, KBUILD_MODNAME, chip)) { From dcd0feac9bab901d5739de51b3f69840851f8919 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Thu, 14 Mar 2019 23:04:14 -0500 Subject: [PATCH 553/855] ALSA: sb8: add a check for request_region In case request_region fails, the fix returns an error code to avoid NULL pointer dereference. Signed-off-by: Kangjie Lu Signed-off-by: Takashi Iwai --- sound/isa/sb/sb8.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/isa/sb/sb8.c b/sound/isa/sb/sb8.c index aa2a83eb81a9..dc27a480c2d9 100644 --- a/sound/isa/sb/sb8.c +++ b/sound/isa/sb/sb8.c @@ -111,6 +111,10 @@ static int snd_sb8_probe(struct device *pdev, unsigned int dev) /* block the 0x388 port to avoid PnP conflicts */ acard->fm_res = request_region(0x388, 4, "SoundBlaster FM"); + if (!acard->fm_res) { + err = -EBUSY; + goto _err; + } if (port[dev] != SNDRV_AUTO_PORT) { if ((err = snd_sbdsp_create(card, port[dev], irq[dev], From ab81dabda1d4edc1728173be6c6a279455f220e3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 16 Mar 2019 09:45:42 -0400 Subject: [PATCH 554/855] fix sysfs_init_fs_context() in !CONFIG_NET_NS case Permission checks on current's netns should be done only when netns are enabled. Reported-by: Dominik Brodowski Fixes: 23bf1b6be9c2 Signed-off-by: Al Viro --- fs/sysfs/mount.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 4cb21b558a85..1b56686ab178 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -71,9 +71,11 @@ static int sysfs_init_fs_context(struct fs_context *fc) kfc->magic = SYSFS_MAGIC; fc->fs_private = kfc; fc->ops = &sysfs_fs_context_ops; - if (fc->user_ns) - put_user_ns(fc->user_ns); - fc->user_ns = get_user_ns(netns->user_ns); + if (netns) { + if (fc->user_ns) + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(netns->user_ns); + } fc->global = true; return 0; } From da9cfb87a44da61f2403c4312916befcb6b6d7e8 Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Fri, 15 Feb 2019 15:55:19 +0800 Subject: [PATCH 555/855] coccinelle: semantic code search for missing put_device() The of_find_device_by_node() takes a reference to the underlying device structure, we should release that reference. 
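The rule the script encodes looks like the following hedged sketch (the function and its device-tree node argument are illustrative, not taken from any particular driver):

#include <linux/device.h>
#include <linux/of.h>
#include <linux/of_platform.h>

static int example_use_companion(struct device_node *np)
{
	struct platform_device *pdev;

	pdev = of_find_device_by_node(np);	/* takes a device reference */
	if (!pdev)
		return -ENODEV;

	/* ... use pdev ... */

	put_device(&pdev->dev);			/* release it before returning */
	return 0;
}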
The implementation of this semantic code search is: In a function, for a local variable returned by calling of_find_device_by_node(), a, if it is released by a function such as put_device()/of_dev_put()/platform_device_put() after the last use, it is considered that there is no reference leak; b, if it is passed back to the caller via dev_get_drvdata()/platform_get_drvdata()/get_device(), etc., the reference will be released in other functions, and the current function also considers that there is no reference leak; c, for the rest of the situation, the current function should release the reference by calling put_device, this code search will report the corresponding error message. By using this semantic code search, we have found some object reference leaks, such as: commit 11907e9d3533 ("ASoC: fsl-asoc-card: fix object reference leaks in fsl_asoc_card_probe") commit a12085d13997 ("mtd: rawnand: atmel: fix possible object reference leak") commit 11493f26856a ("mtd: rawnand: jz4780: fix possible object reference leak") There are still dozens of reference leaks in the current kernel code. Further, for the case of b, the object returned to other functions may also have a reference leak, we will continue to develop other cocci scripts to further check the reference leak. Signed-off-by: Wen Yang Reviewed-by: Julia Lawall Reviewed-by: Markus Elfring Signed-off-by: Masahiro Yamada --- scripts/coccinelle/free/put_device.cocci | 56 ++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 scripts/coccinelle/free/put_device.cocci diff --git a/scripts/coccinelle/free/put_device.cocci b/scripts/coccinelle/free/put_device.cocci new file mode 100644 index 000000000000..7395697e7f19 --- /dev/null +++ b/scripts/coccinelle/free/put_device.cocci @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 +/// Find missing put_device for every of_find_device_by_node. +/// +// Confidence: Moderate +// Copyright: (C) 2018-2019 Wen Yang, ZTE. +// Comments: +// Options: --no-includes --include-headers + +virtual report +virtual org + +@search exists@ +local idexpression id; +expression x,e,e1; +position p1,p2; +type T,T1,T2,T3; +@@ + +id = of_find_device_by_node@p1(x) +... when != e = id +if (id == NULL || ...) { ... return ...; } +... when != put_device(&id->dev) + when != platform_device_put(id) + when != of_dev_put(id) + when != if (id) { ... put_device(&id->dev) ... } + when != e1 = (T)id + when != e1 = &id->dev + when != e1 = get_device(&id->dev) + when != e1 = (T1)platform_get_drvdata(id) +( + return +( id +| (T2)dev_get_drvdata(&id->dev) +| (T3)platform_get_drvdata(id) +); +| return@p2 ...; +) + +@script:python depends on report@ +p1 << search.p1; +p2 << search.p2; +@@ + +coccilib.report.print_report(p2[0], "ERROR: missing put_device; " + + "call of_find_device_by_node on line " + + p1[0].line + + ", but without a corresponding object release " + + "within this function.") + +@script:python depends on org@ +p1 << search.p1; +p2 << search.p2; +@@ + +cocci.print_main("of_find_device_by_node", p1) +cocci.print_secs("needed put_device", p2) From f6d9db6355227656108cb93dd8c74d9a9904c5fb Mon Sep 17 00:00:00 2001 From: Arseny Maslennikov Date: Sat, 9 Mar 2019 18:43:06 +0300 Subject: [PATCH 556/855] kbuild: deb-pkg: avoid implicit effects MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * The man page for dpkg-source(1) notes: > -b, --build directory [format-specific-parameters] > Build a source package (--build since dpkg 1.17.14). 
> <...> > > dpkg-source will build the source package with the first > format found in this ordered list: the format indicated > with the --format command line option, the format > indicated in debian/source/format, “1.0”. The fallback > to “1.0” is deprecated and will be removed at some point > in the future, you should always document the desired > source format in debian/source/format. See section > SOURCE PACKAGE FORMATS for an extensive description of > the various source package formats. Thus it would be more foolproof to explicitly use 1.0 (as we always did) than to rely on dpkg-source's defaults. * In a similar vein, debian/rules is not made executable by mkdebian, and dpkg-source warns about that but still silently fixes the file. Let's be explicit once again. Signed-off-by: Arseny Maslennikov Signed-off-by: Masahiro Yamada --- scripts/package/mkdebian | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/package/mkdebian b/scripts/package/mkdebian index 8068328a079c..8351584cb24e 100755 --- a/scripts/package/mkdebian +++ b/scripts/package/mkdebian @@ -132,7 +132,9 @@ else echo >&2 "Install lsb-release or set \$KDEB_CHANGELOG_DIST explicitly" fi -mkdir -p debian/ +mkdir -p debian/source/ +echo "1.0" > debian/source/format + echo $debarch > debian/arch extra_build_depends=", $(if_enabled_echo CONFIG_UNWINDER_ORC libelf-dev)" extra_build_depends="$extra_build_depends, $(if_enabled_echo CONFIG_SYSTEM_TRUSTED_KEYRING libssl-dev:native)" @@ -223,5 +225,6 @@ clean: binary: binary-arch EOF +chmod +x debian/rules exit 0 From 0c22be0712b8771011880972aa30d3412aa3334e Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 14 Mar 2019 16:41:59 -0700 Subject: [PATCH 557/855] kbuild: Make NOSTDINC_FLAGS a simply expanded variable During a simple no-op (nothing changed) build I saw 39 invocations of the C compiler with the argument "-print-file-name=include". We don't need to call the C compiler 39 times for this--one time will suffice. Let's change NOSTDINC_FLAGS to a simply expanded variable to avoid this since there doesn't appear to be any reason it should be recursively expanded. On my build this shaved ~400 ms off my "no-op" build. Note that the recursive expansion seems to date back to the (really old) commit e8f5bdb02ce0 ("[PATCH] Makefile include path ordering"). It's a little unclear to me if the point of that patch was to switch the variable to be recursively expanded (which it did) or to avoid directly assigning to NOSTDINC_FLAGS (AKA to switch to +=) because someone else (out of tree?) was setting it. I presume later since if the only goal was to switch to recursive expansion the patch would have just removed the ":". Signed-off-by: Douglas Anderson Signed-off-by: Masahiro Yamada --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 1ef92d705984..08a01add09a6 100644 --- a/Makefile +++ b/Makefile @@ -412,7 +412,7 @@ CHECK = sparse CHECKFLAGS := -D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ \ -Wbitwise -Wno-return-void -Wno-unknown-attribute $(CF) -NOSTDINC_FLAGS = +NOSTDINC_FLAGS := CFLAGS_MODULE = AFLAGS_MODULE = LDFLAGS_MODULE = From f84dde10d893cd368e73dda04b694169542ed792 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 15 Mar 2019 09:25:03 -0700 Subject: [PATCH 558/855] Revert "modsign: Abort modules_install when signing fails" This reverts commit caf6fe91ddf62a96401e21e9b7a07227440f4185. 
The commit was fine but is no longer needed as of commit 3a2429e1faf4 ("kbuild: change if_changed_rule for multi-line recipe"). Let's go back to using ";" to be consistent. For some discussion, see: https://lkml.kernel.org/r/CAK7LNASde0Q9S5GKeQiWhArfER4S4wL1=R_FW8q0++_X3T5=hQ@mail.gmail.com Signed-off-by: Douglas Anderson Signed-off-by: Masahiro Yamada --- scripts/Makefile.modinst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/Makefile.modinst b/scripts/Makefile.modinst index ff5ca9817a85..0dae402661f3 100644 --- a/scripts/Makefile.modinst +++ b/scripts/Makefile.modinst @@ -23,7 +23,7 @@ quiet_cmd_modules_install = INSTALL $@ mkdir -p $(2) ; \ cp $@ $(2) ; \ $(mod_strip_cmd) $(2)/$(notdir $@) ; \ - $(mod_sign_cmd) $(2)/$(notdir $@) $(patsubst %,|| true,$(KBUILD_EXTMOD)) && \ + $(mod_sign_cmd) $(2)/$(notdir $@) $(patsubst %,|| true,$(KBUILD_EXTMOD)) ; \ $(mod_compress_cmd) $(2)/$(notdir $@) # Modules built outside the kernel source tree go into extra by default From 7cbbbb8bc2974264bbbf326d9a4552fc8878d375 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 17 Mar 2019 11:01:08 +0900 Subject: [PATCH 559/855] kbuild: warn redundant generic-y The generic-y is redundant under the following condition: - arch has its own implementation - the same header is added to generated-y - the same header is added to mandatory-y If a redundant generic-y is found, the warning like follows is displayed: scripts/Makefile.asm-generic:20: redundant generic-y found in arch/arm/include/asm/Kbuild: timex.h I fixed up arch Kbuild files found by this. Suggested-by: Sam Ravnborg Signed-off-by: Masahiro Yamada --- arch/arm/include/asm/Kbuild | 1 - arch/h8300/include/asm/Kbuild | 1 - arch/ia64/include/uapi/asm/Kbuild | 1 - arch/openrisc/include/asm/Kbuild | 2 -- arch/parisc/include/asm/Kbuild | 2 -- arch/powerpc/include/asm/Kbuild | 1 - arch/s390/include/asm/Kbuild | 1 - arch/s390/include/uapi/asm/Kbuild | 1 - arch/x86/include/uapi/asm/Kbuild | 1 - arch/xtensa/include/asm/Kbuild | 1 - arch/xtensa/include/uapi/asm/Kbuild | 1 - scripts/Makefile.asm-generic | 6 ++++++ 12 files changed, 6 insertions(+), 13 deletions(-) diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild index 1d66db9c9db5..a8a4eb7f6dae 100644 --- a/arch/arm/include/asm/Kbuild +++ b/arch/arm/include/asm/Kbuild @@ -18,7 +18,6 @@ generic-y += segment.h generic-y += serial.h generic-y += simd.h generic-y += sizes.h -generic-y += timex.h generic-y += trace_clock.h generated-y += mach-types.h diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild index 961c1dc064e1..3e7c8ecf151e 100644 --- a/arch/h8300/include/asm/Kbuild +++ b/arch/h8300/include/asm/Kbuild @@ -17,7 +17,6 @@ generic-y += fb.h generic-y += ftrace.h generic-y += futex.h generic-y += hardirq.h -generic-y += hash.h generic-y += hw_irq.h generic-y += irq_regs.h generic-y += irq_work.h diff --git a/arch/ia64/include/uapi/asm/Kbuild b/arch/ia64/include/uapi/asm/Kbuild index b71c5f787783..5b819e53c397 100644 --- a/arch/ia64/include/uapi/asm/Kbuild +++ b/arch/ia64/include/uapi/asm/Kbuild @@ -2,4 +2,3 @@ include include/uapi/asm-generic/Kbuild.asm generated-y += unistd_64.h generic-y += kvm_para.h -generic-y += socket.h diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild index 1f04844b6b82..5a73e2956ac4 100644 --- a/arch/openrisc/include/asm/Kbuild +++ b/arch/openrisc/include/asm/Kbuild @@ -15,7 +15,6 @@ generic-y += fb.h generic-y += ftrace.h generic-y += hardirq.h generic-y += hw_irq.h -generic-y += irq.h 
generic-y += irq_regs.h generic-y += irq_work.h generic-y += kdebug.h @@ -35,7 +34,6 @@ generic-y += qrwlock.h generic-y += sections.h generic-y += segment.h generic-y += shmparam.h -generic-y += string.h generic-y += switch_to.h generic-y += topology.h generic-y += trace_clock.h diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild index 0b1e354c8c24..6f49e77d82a2 100644 --- a/arch/parisc/include/asm/Kbuild +++ b/arch/parisc/include/asm/Kbuild @@ -1,7 +1,6 @@ generated-y += syscall_table_32.h generated-y += syscall_table_64.h generated-y += syscall_table_c32.h -generic-y += barrier.h generic-y += current.h generic-y += device.h generic-y += div64.h @@ -20,7 +19,6 @@ generic-y += percpu.h generic-y += preempt.h generic-y += seccomp.h generic-y += segment.h -generic-y += topology.h generic-y += trace_clock.h generic-y += user.h generic-y += vga.h diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild index 77ff7fb24823..a0c132bedfae 100644 --- a/arch/powerpc/include/asm/Kbuild +++ b/arch/powerpc/include/asm/Kbuild @@ -5,7 +5,6 @@ generated-y += syscall_table_spu.h generic-y += div64.h generic-y += export.h generic-y += irq_regs.h -generic-y += irq_work.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += preempt.h diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index e3239772887a..12d77cb11fe5 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -20,7 +20,6 @@ generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h -generic-y += preempt.h generic-y += rwsem.h generic-y += trace_clock.h generic-y += unaligned.h diff --git a/arch/s390/include/uapi/asm/Kbuild b/arch/s390/include/uapi/asm/Kbuild index 6b0f30b14642..da3e0d48abbc 100644 --- a/arch/s390/include/uapi/asm/Kbuild +++ b/arch/s390/include/uapi/asm/Kbuild @@ -3,4 +3,3 @@ include include/uapi/asm-generic/Kbuild.asm generated-y += unistd_32.h generated-y += unistd_64.h -generic-y += socket.h diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild index efe701b7c6ce..f6648e9928b3 100644 --- a/arch/x86/include/uapi/asm/Kbuild +++ b/arch/x86/include/uapi/asm/Kbuild @@ -3,4 +3,3 @@ include include/uapi/asm-generic/Kbuild.asm generated-y += unistd_32.h generated-y += unistd_64.h generated-y += unistd_x32.h -generic-y += socket.h diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index d939e13e8d84..42b6cb3d16f7 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild @@ -15,7 +15,6 @@ generic-y += irq_work.h generic-y += kdebug.h generic-y += kmap_types.h generic-y += kprobes.h -generic-y += linkage.h generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h diff --git a/arch/xtensa/include/uapi/asm/Kbuild b/arch/xtensa/include/uapi/asm/Kbuild index 6b43e5049ff7..960bf1e4be53 100644 --- a/arch/xtensa/include/uapi/asm/Kbuild +++ b/arch/xtensa/include/uapi/asm/Kbuild @@ -2,4 +2,3 @@ include include/uapi/asm-generic/Kbuild.asm generated-y += unistd_32.h generic-y += kvm_para.h -generic-y += socket.h diff --git a/scripts/Makefile.asm-generic b/scripts/Makefile.asm-generic index a62d2823f6cf..c7d2b7acad26 100644 --- a/scripts/Makefile.asm-generic +++ b/scripts/Makefile.asm-generic @@ -14,6 +14,12 @@ src := $(subst /generated,,$(obj)) include scripts/Kbuild.include +redundant := $(filter $(mandatory-y) $(generated-y), $(generic-y)) +redundant += $(foreach f, $(generic-y), $(if $(wildcard 
$(srctree)/$(src)/$(f)),$(f))) +redundant := $(sort $(redundant)) +$(if $(redundant),\ + $(warning redundant generic-y found in $(src)/Kbuild: $(redundant))) + # If arch does not implement mandatory headers, fallback to asm-generic ones. mandatory-y := $(filter-out $(generated-y), $(mandatory-y)) generic-y += $(foreach f, $(mandatory-y), $(if $(wildcard $(srctree)/$(src)/$(f)),,$(f))) From 037fc3368be46dc1a2a90f6e50c8cbce49d75fd6 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 17 Mar 2019 11:01:09 +0900 Subject: [PATCH 560/855] kbuild: force all architectures except um to include mandatory-y Currently, every arch/*/include/uapi/asm/Kbuild explicitly includes the common Kbuild.asm file. Factor out the duplicated include directives to scripts/Makefile.asm-generic so that no architecture would opt out of the mandatory-y mechanism. um is not forced to include mandatory-y since it is a very exceptional case which does not support UAPI. Signed-off-by: Masahiro Yamada --- Documentation/kbuild/makefiles.txt | 2 +- Makefile | 6 ++++-- arch/alpha/include/uapi/asm/Kbuild | 2 -- arch/arc/include/uapi/asm/Kbuild | 2 -- arch/arm/include/uapi/asm/Kbuild | 1 - arch/arm64/include/uapi/asm/Kbuild | 1 - arch/c6x/include/uapi/asm/Kbuild | 2 -- arch/csky/include/uapi/asm/Kbuild | 2 -- arch/h8300/include/uapi/asm/Kbuild | 2 -- arch/hexagon/include/uapi/asm/Kbuild | 2 -- arch/ia64/include/uapi/asm/Kbuild | 2 -- arch/m68k/include/uapi/asm/Kbuild | 2 -- arch/microblaze/include/uapi/asm/Kbuild | 2 -- arch/mips/include/uapi/asm/Kbuild | 2 -- arch/nds32/include/uapi/asm/Kbuild | 2 -- arch/nios2/include/uapi/asm/Kbuild | 2 -- arch/openrisc/include/uapi/asm/Kbuild | 2 -- arch/parisc/include/uapi/asm/Kbuild | 2 -- arch/powerpc/include/uapi/asm/Kbuild | 2 -- arch/riscv/include/uapi/asm/Kbuild | 1 - arch/s390/include/uapi/asm/Kbuild | 1 - arch/sh/include/uapi/asm/Kbuild | 1 - arch/sparc/include/uapi/asm/Kbuild | 2 -- arch/unicore32/include/uapi/asm/Kbuild | 2 -- arch/x86/include/uapi/asm/Kbuild | 2 -- arch/xtensa/include/uapi/asm/Kbuild | 2 -- include/asm-generic/Kbuild | 5 +++++ include/uapi/asm-generic/{Kbuild.asm => Kbuild} | 4 +++- scripts/Makefile.asm-generic | 5 +++++ 29 files changed, 18 insertions(+), 47 deletions(-) create mode 100644 include/asm-generic/Kbuild rename include/uapi/asm-generic/{Kbuild.asm => Kbuild} (87%) diff --git a/Documentation/kbuild/makefiles.txt b/Documentation/kbuild/makefiles.txt index f124be6e4c3a..03c065855eaf 100644 --- a/Documentation/kbuild/makefiles.txt +++ b/Documentation/kbuild/makefiles.txt @@ -1274,7 +1274,7 @@ See subsequent chapter for the syntax of the Kbuild file. --- 7.4 mandatory-y - mandatory-y is essentially used by include/(uapi/)asm-generic/Kbuild.asm + mandatory-y is essentially used by include/(uapi/)asm-generic/Kbuild to define the minimum set of ASM headers that all architectures must have. This works like optional generic-y. 
If a mandatory header is missing diff --git a/Makefile b/Makefile index 08a01add09a6..4f45ae628f54 100644 --- a/Makefile +++ b/Makefile @@ -1098,9 +1098,11 @@ asm-generic := -f $(srctree)/scripts/Makefile.asm-generic obj PHONY += asm-generic uapi-asm-generic asm-generic: uapi-asm-generic - $(Q)$(MAKE) $(asm-generic)=arch/$(SRCARCH)/include/generated/asm + $(Q)$(MAKE) $(asm-generic)=arch/$(SRCARCH)/include/generated/asm \ + generic=include/asm-generic uapi-asm-generic: - $(Q)$(MAKE) $(asm-generic)=arch/$(SRCARCH)/include/generated/uapi/asm + $(Q)$(MAKE) $(asm-generic)=arch/$(SRCARCH)/include/generated/uapi/asm \ + generic=include/uapi/asm-generic PHONY += prepare-objtool prepare-objtool: $(objtool_target) diff --git a/arch/alpha/include/uapi/asm/Kbuild b/arch/alpha/include/uapi/asm/Kbuild index 439f5157aa35..7417847dc438 100644 --- a/arch/alpha/include/uapi/asm/Kbuild +++ b/arch/alpha/include/uapi/asm/Kbuild @@ -1,3 +1 @@ -include include/uapi/asm-generic/Kbuild.asm - generated-y += unistd_32.h diff --git a/arch/arc/include/uapi/asm/Kbuild b/arch/arc/include/uapi/asm/Kbuild index 0febf1a07c30..755bb11323d8 100644 --- a/arch/arc/include/uapi/asm/Kbuild +++ b/arch/arc/include/uapi/asm/Kbuild @@ -1,4 +1,2 @@ -include include/uapi/asm-generic/Kbuild.asm - generic-y += kvm_para.h generic-y += ucontext.h diff --git a/arch/arm/include/uapi/asm/Kbuild b/arch/arm/include/uapi/asm/Kbuild index eee8f7d23899..23b4464c0995 100644 --- a/arch/arm/include/uapi/asm/Kbuild +++ b/arch/arm/include/uapi/asm/Kbuild @@ -1,5 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -include include/uapi/asm-generic/Kbuild.asm generated-y += unistd-common.h generated-y += unistd-oabi.h diff --git a/arch/arm64/include/uapi/asm/Kbuild b/arch/arm64/include/uapi/asm/Kbuild index 87eea29b24ab..602d137932dc 100644 --- a/arch/arm64/include/uapi/asm/Kbuild +++ b/arch/arm64/include/uapi/asm/Kbuild @@ -1,4 +1,3 @@ # SPDX-License-Identifier: GPL-2.0 -include include/uapi/asm-generic/Kbuild.asm generic-y += kvm_para.h diff --git a/arch/c6x/include/uapi/asm/Kbuild b/arch/c6x/include/uapi/asm/Kbuild index 0febf1a07c30..755bb11323d8 100644 --- a/arch/c6x/include/uapi/asm/Kbuild +++ b/arch/c6x/include/uapi/asm/Kbuild @@ -1,4 +1,2 @@ -include include/uapi/asm-generic/Kbuild.asm - generic-y += kvm_para.h generic-y += ucontext.h diff --git a/arch/csky/include/uapi/asm/Kbuild b/arch/csky/include/uapi/asm/Kbuild index c1b06dcf6cf8..1c72f04ff75d 100644 --- a/arch/csky/include/uapi/asm/Kbuild +++ b/arch/csky/include/uapi/asm/Kbuild @@ -1,3 +1 @@ -include include/uapi/asm-generic/Kbuild.asm - generic-y += ucontext.h diff --git a/arch/h8300/include/uapi/asm/Kbuild b/arch/h8300/include/uapi/asm/Kbuild index 0febf1a07c30..755bb11323d8 100644 --- a/arch/h8300/include/uapi/asm/Kbuild +++ b/arch/h8300/include/uapi/asm/Kbuild @@ -1,4 +1,2 @@ -include include/uapi/asm-generic/Kbuild.asm - generic-y += kvm_para.h generic-y += ucontext.h diff --git a/arch/hexagon/include/uapi/asm/Kbuild b/arch/hexagon/include/uapi/asm/Kbuild index c1b06dcf6cf8..1c72f04ff75d 100644 --- a/arch/hexagon/include/uapi/asm/Kbuild +++ b/arch/hexagon/include/uapi/asm/Kbuild @@ -1,3 +1 @@ -include include/uapi/asm-generic/Kbuild.asm - generic-y += ucontext.h diff --git a/arch/ia64/include/uapi/asm/Kbuild b/arch/ia64/include/uapi/asm/Kbuild index 5b819e53c397..20018cb883a9 100644 --- a/arch/ia64/include/uapi/asm/Kbuild +++ b/arch/ia64/include/uapi/asm/Kbuild @@ -1,4 +1,2 @@ -include include/uapi/asm-generic/Kbuild.asm - generated-y += unistd_64.h generic-y += kvm_para.h diff --git 
a/arch/m68k/include/uapi/asm/Kbuild b/arch/m68k/include/uapi/asm/Kbuild index 960bf1e4be53..8a7ad40be463 100644 --- a/arch/m68k/include/uapi/asm/Kbuild +++ b/arch/m68k/include/uapi/asm/Kbuild @@ -1,4 +1,2 @@ -include include/uapi/asm-generic/Kbuild.asm - generated-y += unistd_32.h generic-y += kvm_para.h diff --git a/arch/microblaze/include/uapi/asm/Kbuild b/arch/microblaze/include/uapi/asm/Kbuild index 97823ec46e97..3ce84fbb2678 100644 --- a/arch/microblaze/include/uapi/asm/Kbuild +++ b/arch/microblaze/include/uapi/asm/Kbuild @@ -1,5 +1,3 @@ -include include/uapi/asm-generic/Kbuild.asm - generated-y += unistd_32.h generic-y += kvm_para.h generic-y += ucontext.h diff --git a/arch/mips/include/uapi/asm/Kbuild b/arch/mips/include/uapi/asm/Kbuild index 0851c103a8ce..c3798bfe0486 100644 --- a/arch/mips/include/uapi/asm/Kbuild +++ b/arch/mips/include/uapi/asm/Kbuild @@ -1,5 +1,3 @@ -include include/uapi/asm-generic/Kbuild.asm - generated-y += unistd_n32.h generated-y += unistd_n64.h generated-y += unistd_o32.h diff --git a/arch/nds32/include/uapi/asm/Kbuild b/arch/nds32/include/uapi/asm/Kbuild index c1b06dcf6cf8..1c72f04ff75d 100644 --- a/arch/nds32/include/uapi/asm/Kbuild +++ b/arch/nds32/include/uapi/asm/Kbuild @@ -1,3 +1 @@ -include include/uapi/asm-generic/Kbuild.asm - generic-y += ucontext.h diff --git a/arch/nios2/include/uapi/asm/Kbuild b/arch/nios2/include/uapi/asm/Kbuild index 0febf1a07c30..755bb11323d8 100644 --- a/arch/nios2/include/uapi/asm/Kbuild +++ b/arch/nios2/include/uapi/asm/Kbuild @@ -1,4 +1,2 @@ -include include/uapi/asm-generic/Kbuild.asm - generic-y += kvm_para.h generic-y += ucontext.h diff --git a/arch/openrisc/include/uapi/asm/Kbuild b/arch/openrisc/include/uapi/asm/Kbuild index 0febf1a07c30..755bb11323d8 100644 --- a/arch/openrisc/include/uapi/asm/Kbuild +++ b/arch/openrisc/include/uapi/asm/Kbuild @@ -1,4 +1,2 @@ -include include/uapi/asm-generic/Kbuild.asm - generic-y += kvm_para.h generic-y += ucontext.h diff --git a/arch/parisc/include/uapi/asm/Kbuild b/arch/parisc/include/uapi/asm/Kbuild index c54353d390ff..22fdbd08cdc8 100644 --- a/arch/parisc/include/uapi/asm/Kbuild +++ b/arch/parisc/include/uapi/asm/Kbuild @@ -1,5 +1,3 @@ -include include/uapi/asm-generic/Kbuild.asm - generated-y += unistd_32.h generated-y += unistd_64.h generic-y += kvm_para.h diff --git a/arch/powerpc/include/uapi/asm/Kbuild b/arch/powerpc/include/uapi/asm/Kbuild index 214a39acdf25..2bd5b392277c 100644 --- a/arch/powerpc/include/uapi/asm/Kbuild +++ b/arch/powerpc/include/uapi/asm/Kbuild @@ -1,4 +1,2 @@ -include include/uapi/asm-generic/Kbuild.asm - generated-y += unistd_32.h generated-y += unistd_64.h diff --git a/arch/riscv/include/uapi/asm/Kbuild b/arch/riscv/include/uapi/asm/Kbuild index d2ee86b4c091..e69de29bb2d1 100644 --- a/arch/riscv/include/uapi/asm/Kbuild +++ b/arch/riscv/include/uapi/asm/Kbuild @@ -1 +0,0 @@ -include include/uapi/asm-generic/Kbuild.asm diff --git a/arch/s390/include/uapi/asm/Kbuild b/arch/s390/include/uapi/asm/Kbuild index da3e0d48abbc..46c1ff0b842a 100644 --- a/arch/s390/include/uapi/asm/Kbuild +++ b/arch/s390/include/uapi/asm/Kbuild @@ -1,5 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -include include/uapi/asm-generic/Kbuild.asm generated-y += unistd_32.h generated-y += unistd_64.h diff --git a/arch/sh/include/uapi/asm/Kbuild b/arch/sh/include/uapi/asm/Kbuild index eaa30bcd93bf..ecfbd40924dd 100644 --- a/arch/sh/include/uapi/asm/Kbuild +++ b/arch/sh/include/uapi/asm/Kbuild @@ -1,5 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -include include/uapi/asm-generic/Kbuild.asm 
generated-y += unistd_32.h generic-y += kvm_para.h diff --git a/arch/sparc/include/uapi/asm/Kbuild b/arch/sparc/include/uapi/asm/Kbuild index 214a39acdf25..2bd5b392277c 100644 --- a/arch/sparc/include/uapi/asm/Kbuild +++ b/arch/sparc/include/uapi/asm/Kbuild @@ -1,4 +1,2 @@ -include include/uapi/asm-generic/Kbuild.asm - generated-y += unistd_32.h generated-y += unistd_64.h diff --git a/arch/unicore32/include/uapi/asm/Kbuild b/arch/unicore32/include/uapi/asm/Kbuild index 0febf1a07c30..755bb11323d8 100644 --- a/arch/unicore32/include/uapi/asm/Kbuild +++ b/arch/unicore32/include/uapi/asm/Kbuild @@ -1,4 +1,2 @@ -include include/uapi/asm-generic/Kbuild.asm - generic-y += kvm_para.h generic-y += ucontext.h diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild index f6648e9928b3..59b5ad310f78 100644 --- a/arch/x86/include/uapi/asm/Kbuild +++ b/arch/x86/include/uapi/asm/Kbuild @@ -1,5 +1,3 @@ -include include/uapi/asm-generic/Kbuild.asm - generated-y += unistd_32.h generated-y += unistd_64.h generated-y += unistd_x32.h diff --git a/arch/xtensa/include/uapi/asm/Kbuild b/arch/xtensa/include/uapi/asm/Kbuild index 960bf1e4be53..8a7ad40be463 100644 --- a/arch/xtensa/include/uapi/asm/Kbuild +++ b/arch/xtensa/include/uapi/asm/Kbuild @@ -1,4 +1,2 @@ -include include/uapi/asm-generic/Kbuild.asm - generated-y += unistd_32.h generic-y += kvm_para.h diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild new file mode 100644 index 000000000000..6f4536d70b8e --- /dev/null +++ b/include/asm-generic/Kbuild @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# asm headers that all architectures except um should have +# (This file is not included when SRCARCH=um since UML borrows several +# asm headers from the host architecutre.) diff --git a/include/uapi/asm-generic/Kbuild.asm b/include/uapi/asm-generic/Kbuild similarity index 87% rename from include/uapi/asm-generic/Kbuild.asm rename to include/uapi/asm-generic/Kbuild index 355c4ac2c0b0..ebb180aac74e 100644 --- a/include/uapi/asm-generic/Kbuild.asm +++ b/include/uapi/asm-generic/Kbuild @@ -1,6 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 # # Headers that are mandatory in usr/include/asm/ -# +# (This file is not included when SRCARCH=um since UML does not support UAPI.) + mandatory-y += auxvec.h mandatory-y += bitsperlong.h mandatory-y += bpf_perf_event.h diff --git a/scripts/Makefile.asm-generic b/scripts/Makefile.asm-generic index c7d2b7acad26..82ad63dcd62b 100644 --- a/scripts/Makefile.asm-generic +++ b/scripts/Makefile.asm-generic @@ -12,6 +12,11 @@ all: src := $(subst /generated,,$(obj)) -include $(src)/Kbuild +# $(generic)/Kbuild lists mandatory-y. Exclude um since it is a special case. +ifneq ($(SRCARCH),um) +include $(generic)/Kbuild +endif + include scripts/Kbuild.include redundant := $(filter $(mandatory-y) $(generated-y), $(generic-y)) From c71bb9f8666602a22aee9df36a2df35e47edd8cc Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 17 Mar 2019 15:05:03 +0900 Subject: [PATCH 561/855] kconfig: remove stale lxdialog/.gitignore When this .gitignore was added, lxdialog was an independent hostprogs-y. Now that all objects in lxdialog/ are directly linked to mconf, the lxdialog is no longer generated. 
Signed-off-by: Masahiro Yamada --- scripts/kconfig/lxdialog/.gitignore | 4 ---- 1 file changed, 4 deletions(-) delete mode 100644 scripts/kconfig/lxdialog/.gitignore diff --git a/scripts/kconfig/lxdialog/.gitignore b/scripts/kconfig/lxdialog/.gitignore deleted file mode 100644 index 90b08ff025a6..000000000000 --- a/scripts/kconfig/lxdialog/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -# -# Generated files -# -lxdialog From 2d012c65a9ca26a0ef87ea0a42f1653dd37155f5 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Sun, 17 Mar 2019 15:49:29 +0900 Subject: [PATCH 562/855] ALSA: firewire-motu: use 'version' field of unit directory to identify model Current ALSA firewire-motu driver uses the value of 'model' field of unit directory in configuration ROM for modalias for MOTU FireWire models. However, as long as I checked, Pre8 and 828mk3(Hybrid) have the same value for the field (=0x100800). unit | version | model --------------- | --------- | ---------- 828mkII | 0x000003 | 0x101800 Traveler | 0x000009 | 0x107800 Pre8 | 0x00000f | 0x100800 <- 828mk3(FW) | 0x000015 | 0x106800 AudioExpress | 0x000033 | 0x104800 828mk3(Hybrid) | 0x000035 | 0x100800 <- When updating firmware for MOTU 8pre FireWire from v1.0.0 to v1.0.3, I got change of the value from 0x100800 to 0x103800. On the other hand, the value of 'version' field is fixed to 0x00000f. As a quick glance, the higher 12 bits of the value of 'version' field represent firmware version, while the lower 12 bits is unknown. By induction, the value of 'version' field represents actual model. This commit changes modalias to match the value of 'version' field, instead of 'model' field. For degug, long name of added sound card includes hexadecimal value of 'model' field. Fixes: 6c5e1ac0e144 ("ALSA: firewire-motu: add support for Motu Traveler") Signed-off-by: Takashi Sakamoto Cc: # v4.19+ Signed-off-by: Takashi Iwai --- sound/firewire/motu/motu.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/sound/firewire/motu/motu.c b/sound/firewire/motu/motu.c index 220e61926ea4..513291ba0ab0 100644 --- a/sound/firewire/motu/motu.c +++ b/sound/firewire/motu/motu.c @@ -36,7 +36,7 @@ static void name_card(struct snd_motu *motu) fw_csr_iterator_init(&it, motu->unit->directory); while (fw_csr_iterator_next(&it, &key, &val)) { switch (key) { - case CSR_VERSION: + case CSR_MODEL: version = val; break; } @@ -46,7 +46,7 @@ static void name_card(struct snd_motu *motu) strcpy(motu->card->shortname, motu->spec->name); strcpy(motu->card->mixername, motu->spec->name); snprintf(motu->card->longname, sizeof(motu->card->longname), - "MOTU %s (version:%d), GUID %08x%08x at %s, S%d", + "MOTU %s (version:%06x), GUID %08x%08x at %s, S%d", motu->spec->name, version, fw_dev->config_rom[3], fw_dev->config_rom[4], dev_name(&motu->unit->device), 100 << fw_dev->max_speed); @@ -237,20 +237,20 @@ static const struct snd_motu_spec motu_audio_express = { #define SND_MOTU_DEV_ENTRY(model, data) \ { \ .match_flags = IEEE1394_MATCH_VENDOR_ID | \ - IEEE1394_MATCH_MODEL_ID | \ - IEEE1394_MATCH_SPECIFIER_ID, \ + IEEE1394_MATCH_SPECIFIER_ID | \ + IEEE1394_MATCH_VERSION, \ .vendor_id = OUI_MOTU, \ - .model_id = model, \ .specifier_id = OUI_MOTU, \ + .version = model, \ .driver_data = (kernel_ulong_t)data, \ } static const struct ieee1394_device_id motu_id_table[] = { - SND_MOTU_DEV_ENTRY(0x101800, &motu_828mk2), - SND_MOTU_DEV_ENTRY(0x107800, &snd_motu_spec_traveler), - SND_MOTU_DEV_ENTRY(0x106800, &motu_828mk3), /* FireWire only. 
*/ - SND_MOTU_DEV_ENTRY(0x100800, &motu_828mk3), /* Hybrid. */ - SND_MOTU_DEV_ENTRY(0x104800, &motu_audio_express), + SND_MOTU_DEV_ENTRY(0x000003, &motu_828mk2), + SND_MOTU_DEV_ENTRY(0x000009, &snd_motu_spec_traveler), + SND_MOTU_DEV_ENTRY(0x000015, &motu_828mk3), /* FireWire only. */ + SND_MOTU_DEV_ENTRY(0x000035, &motu_828mk3), /* Hybrid. */ + SND_MOTU_DEV_ENTRY(0x000033, &motu_audio_express), { } }; MODULE_DEVICE_TABLE(ieee1394, motu_id_table); From c634dc6bdedeb0b2c750fc611612618a85639ab2 Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Thu, 14 Mar 2019 02:42:43 +0800 Subject: [PATCH 563/855] perf/x86/intel: Make dev_attr_allow_tsx_force_abort static Fixes: 400816f60c54 ("perf/x86/intel: Implement support for TSX Force Abort") Signed-off-by: kbuild test robot Signed-off-by: Thomas Gleixner Cc: "Peter Zijlstra (Intel)" Cc: kbuild-all@01.org Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Kan Liang Cc: Jiri Olsa Cc: Andi Kleen Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20190313184243.GA10820@lkp-sb-ep06 --- arch/x86/events/intel/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 92dfeb343a6a..8baa441d8000 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4179,7 +4179,7 @@ static struct attribute *intel_pmu_caps_attrs[] = { NULL }; -DEVICE_BOOL_ATTR(allow_tsx_force_abort, 0644, allow_tsx_force_abort); +static DEVICE_BOOL_ATTR(allow_tsx_force_abort, 0644, allow_tsx_force_abort); static struct attribute *intel_pmu_attrs[] = { &dev_attr_freeze_on_smi.attr, From 41c8d0adf3c4df1867d98cee4a2c4531352a33ad Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 12 Mar 2019 16:44:28 +0200 Subject: [PATCH 564/855] auxdisplay: hd44780: Fix memory leak on ->remove() We have to free on ->remove() the allocated resources on ->probe(). Fixes: d47d88361fee ("auxdisplay: Add HD44780 Character LCD support") Reviewed-by: Geert Uytterhoeven Signed-off-by: Andy Shevchenko Signed-off-by: Miguel Ojeda --- drivers/auxdisplay/hd44780.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/auxdisplay/hd44780.c b/drivers/auxdisplay/hd44780.c index 9ad93ea42fdc..3cde351fb5c9 100644 --- a/drivers/auxdisplay/hd44780.c +++ b/drivers/auxdisplay/hd44780.c @@ -280,6 +280,8 @@ static int hd44780_remove(struct platform_device *pdev) struct charlcd *lcd = platform_get_drvdata(pdev); charlcd_unregister(lcd); + + kfree(lcd); return 0; } From b658a2113ba4d4b99e2a57926379b0c0b0c648ab Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 12 Mar 2019 16:44:29 +0200 Subject: [PATCH 565/855] auxdisplay: charlcd: Move to_priv() to charlcd namespace In order to be more particular in names, rename to_priv() macro to charlcd_to_priv(). No functional change intended. 
Reviewed-by: Geert Uytterhoeven Signed-off-by: Andy Shevchenko Signed-off-by: Miguel Ojeda --- drivers/auxdisplay/charlcd.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/auxdisplay/charlcd.c b/drivers/auxdisplay/charlcd.c index 60e0b772673f..407acd22efa8 100644 --- a/drivers/auxdisplay/charlcd.c +++ b/drivers/auxdisplay/charlcd.c @@ -91,7 +91,7 @@ struct charlcd_priv { unsigned long long drvdata[0]; }; -#define to_priv(p) container_of(p, struct charlcd_priv, lcd) +#define charlcd_to_priv(p) container_of(p, struct charlcd_priv, lcd) /* Device single-open policy control */ static atomic_t charlcd_available = ATOMIC_INIT(1); @@ -105,7 +105,7 @@ static void long_sleep(int ms) /* turn the backlight on or off */ static void charlcd_backlight(struct charlcd *lcd, int on) { - struct charlcd_priv *priv = to_priv(lcd); + struct charlcd_priv *priv = charlcd_to_priv(lcd); if (!lcd->ops->backlight) return; @@ -134,7 +134,7 @@ static void charlcd_bl_off(struct work_struct *work) /* turn the backlight on for a little while */ void charlcd_poke(struct charlcd *lcd) { - struct charlcd_priv *priv = to_priv(lcd); + struct charlcd_priv *priv = charlcd_to_priv(lcd); if (!lcd->ops->backlight) return; @@ -152,7 +152,7 @@ EXPORT_SYMBOL_GPL(charlcd_poke); static void charlcd_gotoxy(struct charlcd *lcd) { - struct charlcd_priv *priv = to_priv(lcd); + struct charlcd_priv *priv = charlcd_to_priv(lcd); unsigned int addr; /* @@ -170,7 +170,7 @@ static void charlcd_gotoxy(struct charlcd *lcd) static void charlcd_home(struct charlcd *lcd) { - struct charlcd_priv *priv = to_priv(lcd); + struct charlcd_priv *priv = charlcd_to_priv(lcd); priv->addr.x = 0; priv->addr.y = 0; @@ -179,7 +179,7 @@ static void charlcd_home(struct charlcd *lcd) static void charlcd_print(struct charlcd *lcd, char c) { - struct charlcd_priv *priv = to_priv(lcd); + struct charlcd_priv *priv = charlcd_to_priv(lcd); if (priv->addr.x < lcd->bwidth) { if (lcd->char_conv) @@ -211,7 +211,7 @@ static void charlcd_clear_fast(struct charlcd *lcd) /* clears the display and resets X/Y */ static void charlcd_clear_display(struct charlcd *lcd) { - struct charlcd_priv *priv = to_priv(lcd); + struct charlcd_priv *priv = charlcd_to_priv(lcd); lcd->ops->write_cmd(lcd, LCD_CMD_DISPLAY_CLEAR); priv->addr.x = 0; @@ -223,7 +223,7 @@ static void charlcd_clear_display(struct charlcd *lcd) static int charlcd_init_display(struct charlcd *lcd) { void (*write_cmd_raw)(struct charlcd *lcd, int cmd); - struct charlcd_priv *priv = to_priv(lcd); + struct charlcd_priv *priv = charlcd_to_priv(lcd); u8 init; if (lcd->ifwidth != 4 && lcd->ifwidth != 8) @@ -369,7 +369,7 @@ static bool parse_xy(const char *s, unsigned long *x, unsigned long *y) static inline int handle_lcd_special_code(struct charlcd *lcd) { - struct charlcd_priv *priv = to_priv(lcd); + struct charlcd_priv *priv = charlcd_to_priv(lcd); /* LCD special codes */ @@ -580,7 +580,7 @@ static inline int handle_lcd_special_code(struct charlcd *lcd) static void charlcd_write_char(struct charlcd *lcd, char c) { - struct charlcd_priv *priv = to_priv(lcd); + struct charlcd_priv *priv = charlcd_to_priv(lcd); /* first, we'll test if we're in escape mode */ if ((c != '\n') && priv->esc_seq.len >= 0) { @@ -705,7 +705,7 @@ static ssize_t charlcd_write(struct file *file, const char __user *buf, static int charlcd_open(struct inode *inode, struct file *file) { - struct charlcd_priv *priv = to_priv(the_charlcd); + struct charlcd_priv *priv = charlcd_to_priv(the_charlcd); int ret; ret 
= -EBUSY; @@ -766,7 +766,7 @@ static void charlcd_puts(struct charlcd *lcd, const char *s) /* initialize the LCD driver */ static int charlcd_init(struct charlcd *lcd) { - struct charlcd_priv *priv = to_priv(lcd); + struct charlcd_priv *priv = charlcd_to_priv(lcd); int ret; if (lcd->ops->backlight) { @@ -866,7 +866,7 @@ EXPORT_SYMBOL_GPL(charlcd_register); int charlcd_unregister(struct charlcd *lcd) { - struct charlcd_priv *priv = to_priv(lcd); + struct charlcd_priv *priv = charlcd_to_priv(lcd); unregister_reboot_notifier(&panel_notifier); charlcd_puts(lcd, "\x0cLCD driver unloaded.\x1b[Lc\x1b[Lb\x1b[L-"); From 8e44fc85060ec997e9c6f3c49a04274db6621d26 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 12 Mar 2019 16:44:30 +0200 Subject: [PATCH 566/855] auxdisplay: charlcd: Introduce charlcd_free() helper The charlcd_free() is a counterpart to charlcd_alloc() and should be called symmetrically on tear down. Reviewed-by: Geert Uytterhoeven Signed-off-by: Andy Shevchenko Signed-off-by: Miguel Ojeda --- drivers/auxdisplay/charlcd.c | 6 ++++++ include/misc/charlcd.h | 1 + 2 files changed, 7 insertions(+) diff --git a/drivers/auxdisplay/charlcd.c b/drivers/auxdisplay/charlcd.c index 407acd22efa8..a351a9400054 100644 --- a/drivers/auxdisplay/charlcd.c +++ b/drivers/auxdisplay/charlcd.c @@ -818,6 +818,12 @@ struct charlcd *charlcd_alloc(unsigned int drvdata_size) } EXPORT_SYMBOL_GPL(charlcd_alloc); +void charlcd_free(struct charlcd *lcd) +{ + kfree(charlcd_to_priv(lcd)); +} +EXPORT_SYMBOL_GPL(charlcd_free); + static int panel_notify_sys(struct notifier_block *this, unsigned long code, void *unused) { diff --git a/include/misc/charlcd.h b/include/misc/charlcd.h index 23f61850f363..1832402324ce 100644 --- a/include/misc/charlcd.h +++ b/include/misc/charlcd.h @@ -35,6 +35,7 @@ struct charlcd_ops { }; struct charlcd *charlcd_alloc(unsigned int drvdata_size); +void charlcd_free(struct charlcd *lcd); int charlcd_register(struct charlcd *lcd); int charlcd_unregister(struct charlcd *lcd); From 9b11d63966fc70b5496013bb211fa9025cd1ad61 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 12 Mar 2019 16:44:31 +0200 Subject: [PATCH 567/855] auxdisplay: panel: Convert to use charlcd_free() Convert to use charlcd_free() instead of kfree() for sake of type check. Reviewed-by: Geert Uytterhoeven Signed-off-by: Andy Shevchenko Signed-off-by: Miguel Ojeda --- drivers/auxdisplay/panel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/auxdisplay/panel.c b/drivers/auxdisplay/panel.c index 21b9b2f2470a..e06de63497cf 100644 --- a/drivers/auxdisplay/panel.c +++ b/drivers/auxdisplay/panel.c @@ -1620,7 +1620,7 @@ err_lcd_unreg: if (lcd.enabled) charlcd_unregister(lcd.charlcd); err_unreg_device: - kfree(lcd.charlcd); + charlcd_free(lcd.charlcd); lcd.charlcd = NULL; parport_unregister_device(pprt); pprt = NULL; @@ -1647,7 +1647,7 @@ static void panel_detach(struct parport *port) if (lcd.enabled) { charlcd_unregister(lcd.charlcd); lcd.initialized = false; - kfree(lcd.charlcd); + charlcd_free(lcd.charlcd); lcd.charlcd = NULL; } From cb79eb95c56fe6afe2baf1df01f22b1bed3f6060 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 12 Mar 2019 16:44:32 +0200 Subject: [PATCH 568/855] auxdisplay: hd44780: Convert to use charlcd_free() Convert to use charlcd_free() instead of kfree() for sake of type check. 
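The reason a dedicated charlcd_free() matters, rather than a bare kfree(), is visible in charlcd_to_priv() above: the struct charlcd handed out by charlcd_alloc() is embedded inside a larger private object, so tear-down has to step back to the enclosing allocation with container_of(), and a typed helper also lets the compiler check the argument. A minimal user-space sketch of that pattern (simplified structures and a simplified container_of(), not the actual charlcd layout):

    #include <stdlib.h>
    #include <stddef.h>

    /* simplified stand-in for the kernel's container_of() */
    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct charlcd { int width, height; };          /* public part */

    struct charlcd_priv {                           /* private wrapper */
            unsigned long flags;                    /* in this sketch the public
                                                     * struct is not at offset 0 */
            struct charlcd lcd;
    };

    static struct charlcd *charlcd_alloc_sketch(void)
    {
            struct charlcd_priv *priv = calloc(1, sizeof(*priv));

            return priv ? &priv->lcd : NULL;
    }

    static void charlcd_free_sketch(struct charlcd *lcd)
    {
            /* free(lcd) would pass a pointer into the middle of the
             * allocation; stepping back to the wrapper is always right */
            free(container_of(lcd, struct charlcd_priv, lcd));
    }

    int main(void)
    {
            struct charlcd *lcd = charlcd_alloc_sketch();

            if (lcd)
                    charlcd_free_sketch(lcd);
            return 0;
    }

In the driver the same pairing applies with kzalloc()/kfree(), which is why hd44780 and panel now release through charlcd_free() instead of calling kfree() on the embedded pointer.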
Reviewed-by: Geert Uytterhoeven Signed-off-by: Andy Shevchenko Signed-off-by: Miguel Ojeda --- drivers/auxdisplay/hd44780.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/auxdisplay/hd44780.c b/drivers/auxdisplay/hd44780.c index 3cde351fb5c9..ab15b64707ad 100644 --- a/drivers/auxdisplay/hd44780.c +++ b/drivers/auxdisplay/hd44780.c @@ -271,7 +271,7 @@ static int hd44780_probe(struct platform_device *pdev) return 0; fail: - kfree(lcd); + charlcd_free(lcd); return ret; } @@ -281,7 +281,7 @@ static int hd44780_remove(struct platform_device *pdev) charlcd_unregister(lcd); - kfree(lcd); + charlcd_free(lcd); return 0; } From 24c764abfd0d4b6e8e33c3818b668edbb4936d6f Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Fri, 1 Mar 2019 18:48:14 +0000 Subject: [PATCH 569/855] auxdisplay: deconfuse configuration The auxdisplay Kconfig is confusing. It creates two separate menus even though the settings are closely related. Moreover, the options for setting the boot message depend on CONFIG_PARPORT even though they are used by drivers that do not. Clear up the confusion by moving the "Parallel port LCD/Keypad" menu under auxdisplay where it logically belongs. Change the boot message options to depend only on CONFIG_CHARLCD, making them accessible also when only the HD44780 is selected. Since the "Parallel port LCD/Keypad" driver now has a new dependency on CONFIG_AUXDISPLAY, rename its Kconfig symbol and keep the old one such that make oldconfig will not disable the driver. Signed-off-by: Mans Rullgard Signed-off-by: Miguel Ojeda --- drivers/auxdisplay/Kconfig | 17 ++++++++++++----- drivers/auxdisplay/Makefile | 2 +- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/auxdisplay/Kconfig b/drivers/auxdisplay/Kconfig index 57410f9c5d44..7d3fe27d6868 100644 --- a/drivers/auxdisplay/Kconfig +++ b/drivers/auxdisplay/Kconfig @@ -164,9 +164,7 @@ config ARM_CHARLCD line and the Linux version on the second line, but that's still useful. -endif # AUXDISPLAY - -menuconfig PANEL +menuconfig PARPORT_PANEL tristate "Parallel port LCD/Keypad Panel support" depends on PARPORT select CHARLCD @@ -178,7 +176,7 @@ menuconfig PANEL compiled as a module, or linked into the kernel and started at boot. If you don't understand what all this is about, say N. -if PANEL +if PARPORT_PANEL config PANEL_PARPORT int "Default parallel port number (0=LPT1)" @@ -419,8 +417,11 @@ config PANEL_LCD_PIN_BL Default for the 'BL' pin in custom profile is '0' (uncontrolled). +endif # PARPORT_PANEL + config PANEL_CHANGE_MESSAGE bool "Change LCD initialization message ?" + depends on CHARLCD default "n" ---help--- This allows you to replace the boot message indicating the kernel version @@ -444,7 +445,13 @@ config PANEL_BOOT_MESSAGE An empty message will only clear the display at driver init time. Any other printf()-formatted message is valid with newline and escape codes. 
-endif # PANEL +endif # AUXDISPLAY + +config PANEL + tristate "Parallel port LCD/Keypad Panel support (OLD OPTION)" + depends on PARPORT + select AUXDISPLAY + select PARPORT_PANEL config CHARLCD tristate "Character LCD core support" if COMPILE_TEST diff --git a/drivers/auxdisplay/Makefile b/drivers/auxdisplay/Makefile index 7ac6776ca3f6..cf54b5efb07e 100644 --- a/drivers/auxdisplay/Makefile +++ b/drivers/auxdisplay/Makefile @@ -10,4 +10,4 @@ obj-$(CONFIG_CFAG12864B) += cfag12864b.o cfag12864bfb.o obj-$(CONFIG_IMG_ASCII_LCD) += img-ascii-lcd.o obj-$(CONFIG_HD44780) += hd44780.o obj-$(CONFIG_HT16K33) += ht16k33.o -obj-$(CONFIG_PANEL) += panel.o +obj-$(CONFIG_PARPORT_PANEL) += panel.o From c9171722459fdeab0f27790ae04c0c5a4ae5a9b2 Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Fri, 1 Mar 2019 18:48:15 +0000 Subject: [PATCH 570/855] auxdisplay: charlcd: simplify init message display If CONFIG_PANEL_CHANGE_MESSAGE is set, CONFIG_PANEL_BOOT_MESSAGE will also be defined, so the double ifdef is pointless. Simplify the code further by using an intermediate macro rather duplicating most of the line. Signed-off-by: Mans Rullgard Signed-off-by: Miguel Ojeda --- drivers/auxdisplay/charlcd.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/auxdisplay/charlcd.c b/drivers/auxdisplay/charlcd.c index a351a9400054..5212675564d7 100644 --- a/drivers/auxdisplay/charlcd.c +++ b/drivers/auxdisplay/charlcd.c @@ -763,6 +763,12 @@ static void charlcd_puts(struct charlcd *lcd, const char *s) } } +#ifdef CONFIG_PANEL_BOOT_MESSAGE +#define LCD_INIT_TEXT CONFIG_PANEL_BOOT_MESSAGE +#else +#define LCD_INIT_TEXT "Linux-" UTS_RELEASE "\n" +#endif + /* initialize the LCD driver */ static int charlcd_init(struct charlcd *lcd) { @@ -784,13 +790,8 @@ static int charlcd_init(struct charlcd *lcd) return ret; /* display a short message */ -#ifdef CONFIG_PANEL_CHANGE_MESSAGE -#ifdef CONFIG_PANEL_BOOT_MESSAGE - charlcd_puts(lcd, "\x1b[Lc\x1b[Lb\x1b[L*" CONFIG_PANEL_BOOT_MESSAGE); -#endif -#else - charlcd_puts(lcd, "\x1b[Lc\x1b[Lb\x1b[L*Linux-" UTS_RELEASE "\n"); -#endif + charlcd_puts(lcd, "\x1b[Lc\x1b[Lb\x1b[L*" LCD_INIT_TEXT); + /* clear the display on the next device opening */ priv->must_clear = true; charlcd_home(lcd); From cc5d04d840d62d7c75e268c51da7cd0be2ee03c0 Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Fri, 1 Mar 2019 18:48:16 +0000 Subject: [PATCH 571/855] auxdisplay: charlcd: make backlight initial state configurable The charlcd driver currently flashes the backlight once on init. This may not be desirable. Thus, add options for turning the backlight off or on as well. Signed-off-by: Mans Rullgard Signed-off-by: Miguel Ojeda --- drivers/auxdisplay/Kconfig | 21 +++++++++++++++++++++ drivers/auxdisplay/charlcd.c | 10 +++++++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/drivers/auxdisplay/Kconfig b/drivers/auxdisplay/Kconfig index 7d3fe27d6868..c52c738e554a 100644 --- a/drivers/auxdisplay/Kconfig +++ b/drivers/auxdisplay/Kconfig @@ -445,6 +445,27 @@ config PANEL_BOOT_MESSAGE An empty message will only clear the display at driver init time. Any other printf()-formatted message is valid with newline and escape codes. 
+choice + prompt "Backlight initial state" + default CHARLCD_BL_FLASH + + config CHARLCD_BL_OFF + bool "Off" + help + Backlight is initially turned off + + config CHARLCD_BL_ON + bool "On" + help + Backlight is initially turned on + + config CHARLCD_BL_FLASH + bool "Flash" + help + Backlight is flashed briefly on init + +endchoice + endif # AUXDISPLAY config PANEL diff --git a/drivers/auxdisplay/charlcd.c b/drivers/auxdisplay/charlcd.c index 5212675564d7..92745efefb54 100644 --- a/drivers/auxdisplay/charlcd.c +++ b/drivers/auxdisplay/charlcd.c @@ -769,6 +769,14 @@ static void charlcd_puts(struct charlcd *lcd, const char *s) #define LCD_INIT_TEXT "Linux-" UTS_RELEASE "\n" #endif +#ifdef CONFIG_CHARLCD_BL_ON +#define LCD_INIT_BL "\x1b[L+" +#elif defined(CONFIG_CHARLCD_BL_FLASH) +#define LCD_INIT_BL "\x1b[L*" +#else +#define LCD_INIT_BL "\x1b[L-" +#endif + /* initialize the LCD driver */ static int charlcd_init(struct charlcd *lcd) { @@ -790,7 +798,7 @@ static int charlcd_init(struct charlcd *lcd) return ret; /* display a short message */ - charlcd_puts(lcd, "\x1b[Lc\x1b[Lb\x1b[L*" LCD_INIT_TEXT); + charlcd_puts(lcd, "\x1b[Lc\x1b[Lb" LCD_INIT_BL LCD_INIT_TEXT); /* clear the display on the next device opening */ priv->must_clear = true; From 9e98c678c2d6ae3a17cb2de55d17f69dddaa231b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 17 Mar 2019 14:22:26 -0700 Subject: [PATCH 572/855] Linux 5.1-rc1 --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 4f45ae628f54..99c0530489ef 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 -PATCHLEVEL = 0 +PATCHLEVEL = 1 SUBLEVEL = 0 -EXTRAVERSION = +EXTRAVERSION = -rc1 NAME = Shy Crocodile # *DOCUMENTATION* From 8f3b487685b2acf71b42bb30d68fd9271bec8695 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Fri, 15 Mar 2019 11:37:20 +1000 Subject: [PATCH 573/855] drm/udl: use drm_gem_object_put_unlocked. When Daniel removed struct_mutex he didn't fix this call to the unlocked variant which is required since we no longer use struct mutex. This fixes a bunch of: WARNING: CPU: 4 PID: 1370 at drivers/gpu/drm/drm_gem.c:931 drm_gem_object_put+0x2b/0x30 [drm] Modules linked in: udl xt_CHECKSUM ipt_MASQUERADE tun bridge stp llc nf_conntrack_netbios_ns nf_conntrack_broadcast xt_CT ip6t> CPU: 4 PID: 1370 Comm: Xorg Not tainted 5.0.0+ #2 backtraces when you plug in a udl device. Fixes: ae358dacd217 (drm/udl: Get rid of dev->struct_mutex usage) Reviewed-by: Daniel Vetter Cc: Sean Paul Signed-off-by: Dave Airlie --- drivers/gpu/drm/udl/udl_gem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/udl/udl_gem.c b/drivers/gpu/drm/udl/udl_gem.c index d5a23295dd80..bb7b58407039 100644 --- a/drivers/gpu/drm/udl/udl_gem.c +++ b/drivers/gpu/drm/udl/udl_gem.c @@ -224,7 +224,7 @@ int udl_gem_mmap(struct drm_file *file, struct drm_device *dev, *offset = drm_vma_node_offset_addr(&gobj->base.vma_node); out: - drm_gem_object_put(&gobj->base); + drm_gem_object_put_unlocked(&gobj->base); unlock: mutex_unlock(&udl->gem_lock); return ret; From 587443e7773e150ae29e643ee8f41a1eed226565 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Wed, 6 Mar 2019 19:17:56 +0200 Subject: [PATCH 574/855] IB/mlx4: Fix race condition between catas error reset and aliasguid flows Code review revealed a race condition which could allow the catas error flow to interrupt the alias guid query post mechanism at random points. 
This is fixed by doing cancel_delayed_work_sync() instead of cancel_delayed_work() during the alias guid mechanism destroy flow. Fixes: a0c64a17aba8 ("mlx4: Add alias_guid mechanism") Signed-off-by: Jack Morgenstein Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx4/alias_GUID.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c index 782499abcd98..2a0b59a4b6eb 100644 --- a/drivers/infiniband/hw/mlx4/alias_GUID.c +++ b/drivers/infiniband/hw/mlx4/alias_GUID.c @@ -804,8 +804,8 @@ void mlx4_ib_destroy_alias_guid_service(struct mlx4_ib_dev *dev) unsigned long flags; for (i = 0 ; i < dev->num_ports; i++) { - cancel_delayed_work(&dev->sriov.alias_guid.ports_guid[i].alias_guid_work); det = &sriov->alias_guid.ports_guid[i]; + cancel_delayed_work_sync(&det->alias_guid_work); spin_lock_irqsave(&sriov->alias_guid.ag_work_lock, flags); while (!list_empty(&det->cb_list)) { cb_ctx = list_entry(det->cb_list.next, From f84b66b9cce78e8f9d38204fdaa75f07c75f4911 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Wed, 6 Mar 2019 19:20:50 +0200 Subject: [PATCH 575/855] net/mlx5: Fix DCT creation bad flow In case the DCT creation command has succeeded, a DRAIN must be issued before calling DESTROY. In addition, the original code used the wrong parameter for the DESTROY command, 'in' instead of 'din', which caused another creation attempt instead of a destroy. Cc: # 4.15 Fixes: 57cda166bbe0 ("net/mlx5: Add DCT command interface") Signed-off-by: Yishai Hadas Reviewed-by: Artemy Kovalyov Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/net/ethernet/mellanox/mlx5/core/qp.c | 66 +++++++++++--------- 1 file changed, 36 insertions(+), 30 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c index 370ca94b6775..c7c2920c05c4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c @@ -40,6 +40,9 @@ #include "mlx5_core.h" #include "lib/eq.h" +static int mlx5_core_drain_dct(struct mlx5_core_dev *dev, + struct mlx5_core_dct *dct); + static struct mlx5_core_rsc_common * mlx5_get_rsc(struct mlx5_qp_table *table, u32 rsn) { @@ -227,13 +230,42 @@ static void destroy_resource_common(struct mlx5_core_dev *dev, wait_for_completion(&qp->common.free); } +static int _mlx5_core_destroy_dct(struct mlx5_core_dev *dev, + struct mlx5_core_dct *dct, bool need_cleanup) +{ + u32 out[MLX5_ST_SZ_DW(destroy_dct_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_dct_in)] = {0}; + struct mlx5_core_qp *qp = &dct->mqp; + int err; + + err = mlx5_core_drain_dct(dev, dct); + if (err) { + if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + goto destroy; + } else { + mlx5_core_warn( + dev, "failed drain DCT 0x%x with error 0x%x\n", + qp->qpn, err); + return err; + } + } + wait_for_completion(&dct->drained); +destroy: + if (need_cleanup) + destroy_resource_common(dev, &dct->mqp); + MLX5_SET(destroy_dct_in, in, opcode, MLX5_CMD_OP_DESTROY_DCT); + MLX5_SET(destroy_dct_in, in, dctn, qp->qpn); + MLX5_SET(destroy_dct_in, in, uid, qp->uid); + err = mlx5_cmd_exec(dev, (void *)&in, sizeof(in), + (void *)&out, sizeof(out)); + return err; +} + int mlx5_core_create_dct(struct mlx5_core_dev *dev, struct mlx5_core_dct *dct, u32 *in, int inlen) { u32 out[MLX5_ST_SZ_DW(create_dct_out)] = {0}; - u32 din[MLX5_ST_SZ_DW(destroy_dct_in)] = {0}; - u32 dout[MLX5_ST_SZ_DW(destroy_dct_out)] = {0}; struct
mlx5_core_qp *qp = &dct->mqp; int err; @@ -254,11 +286,7 @@ int mlx5_core_create_dct(struct mlx5_core_dev *dev, return 0; err_cmd: - MLX5_SET(destroy_dct_in, din, opcode, MLX5_CMD_OP_DESTROY_DCT); - MLX5_SET(destroy_dct_in, din, dctn, qp->qpn); - MLX5_SET(destroy_dct_in, din, uid, qp->uid); - mlx5_cmd_exec(dev, (void *)&in, sizeof(din), - (void *)&out, sizeof(dout)); + _mlx5_core_destroy_dct(dev, dct, false); return err; } EXPORT_SYMBOL_GPL(mlx5_core_create_dct); @@ -323,29 +351,7 @@ static int mlx5_core_drain_dct(struct mlx5_core_dev *dev, int mlx5_core_destroy_dct(struct mlx5_core_dev *dev, struct mlx5_core_dct *dct) { - u32 out[MLX5_ST_SZ_DW(destroy_dct_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(destroy_dct_in)] = {0}; - struct mlx5_core_qp *qp = &dct->mqp; - int err; - - err = mlx5_core_drain_dct(dev, dct); - if (err) { - if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { - goto destroy; - } else { - mlx5_core_warn(dev, "failed drain DCT 0x%x with error 0x%x\n", qp->qpn, err); - return err; - } - } - wait_for_completion(&dct->drained); -destroy: - destroy_resource_common(dev, &dct->mqp); - MLX5_SET(destroy_dct_in, in, opcode, MLX5_CMD_OP_DESTROY_DCT); - MLX5_SET(destroy_dct_in, in, dctn, qp->qpn); - MLX5_SET(destroy_dct_in, in, uid, qp->uid); - err = mlx5_cmd_exec(dev, (void *)&in, sizeof(in), - (void *)&out, sizeof(out)); - return err; + return _mlx5_core_destroy_dct(dev, dct, true); } EXPORT_SYMBOL_GPL(mlx5_core_destroy_dct); From c5ae1954c47d3fd8815bd5a592aba18702c93f33 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Wed, 6 Mar 2019 19:21:42 +0200 Subject: [PATCH 576/855] IB/mlx5: Use mlx5 core to create/destroy a DEVX DCT To prevent a hardware memory leak when a DEVX DCT object is destroyed without calling DRAIN DCT before, (e.g. under cleanup flow), need to manage its creation and destruction via mlx5 core. In that case the DRAIN DCT command will be called and only once that it will be completed the DESTROY DCT command will be called. Otherwise, the DESTROY DCT may fail and a hardware leak may occur. As of that change the DRAIN DCT command should not be exposed any more from DEVX, it's managed internally by the driver to work as expected by the device specification. 
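Taken together, the two DCT fixes above enforce a fixed command ordering toward the device; roughly (a summary of the descriptions above, not a verbatim firmware trace):

    CREATE_DCT
        ... object in use ...
    DRAIN_DCT                            (a drain failure is tolerated only when the
                                          device is already in internal-error state)
    wait_for_completion(&dct->drained)   (signalled once the drain has finished)
    DESTROY_DCT                          (built from its own destroy input buffer,
                                          not the leftover create 'in' buffer)

Destroying without the drain step can make DESTROY_DCT fail and leak the hardware object, and reusing the create buffer for the destroy command issued another create instead of a destroy, which is exactly what the two patches correct.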
Fixes: 7efce3691d33 ("IB/mlx5: Add obj create and destroy functionality") Signed-off-by: Yishai Hadas Reviewed-by: Artemy Kovalyov Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/devx.c | 34 +++++++++++++++----- drivers/infiniband/hw/mlx5/qp.c | 4 ++- drivers/net/ethernet/mellanox/mlx5/core/qp.c | 6 ++-- include/linux/mlx5/qp.h | 3 +- 4 files changed, 34 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index eaa055007f28..9e08df7914aa 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -20,6 +20,7 @@ enum devx_obj_flags { DEVX_OBJ_FLAGS_INDIRECT_MKEY = 1 << 0, + DEVX_OBJ_FLAGS_DCT = 1 << 1, }; struct devx_async_data { @@ -39,7 +40,10 @@ struct devx_obj { u32 dinlen; /* destroy inbox length */ u32 dinbox[MLX5_MAX_DESTROY_INBOX_SIZE_DW]; u32 flags; - struct mlx5_ib_devx_mr devx_mr; + union { + struct mlx5_ib_devx_mr devx_mr; + struct mlx5_core_dct core_dct; + }; }; struct devx_umem { @@ -347,7 +351,6 @@ static u64 devx_get_obj_id(const void *in) obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_RQ, MLX5_GET(arm_rq_in, in, srq_number)); break; - case MLX5_CMD_OP_DRAIN_DCT: case MLX5_CMD_OP_ARM_DCT_FOR_KEY_VIOLATION: obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_DCT, MLX5_GET(drain_dct_in, in, dctn)); @@ -618,7 +621,6 @@ static bool devx_is_obj_modify_cmd(const void *in) case MLX5_CMD_OP_2RST_QP: case MLX5_CMD_OP_ARM_XRC_SRQ: case MLX5_CMD_OP_ARM_RQ: - case MLX5_CMD_OP_DRAIN_DCT: case MLX5_CMD_OP_ARM_DCT_FOR_KEY_VIOLATION: case MLX5_CMD_OP_ARM_XRQ: case MLX5_CMD_OP_SET_XRQ_DC_PARAMS_ENTRY: @@ -1124,7 +1126,11 @@ static int devx_obj_cleanup(struct ib_uobject *uobject, if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) devx_cleanup_mkey(obj); - ret = mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, sizeof(out)); + if (obj->flags & DEVX_OBJ_FLAGS_DCT) + ret = mlx5_core_destroy_dct(obj->mdev, &obj->core_dct); + else + ret = mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, + sizeof(out)); if (ib_is_destroy_retryable(ret, why, uobject)) return ret; @@ -1185,9 +1191,17 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( devx_set_umem_valid(cmd_in); } - err = mlx5_cmd_exec(dev->mdev, cmd_in, - cmd_in_len, - cmd_out, cmd_out_len); + if (opcode == MLX5_CMD_OP_CREATE_DCT) { + obj->flags |= DEVX_OBJ_FLAGS_DCT; + err = mlx5_core_create_dct(dev->mdev, &obj->core_dct, + cmd_in, cmd_in_len, + cmd_out, cmd_out_len); + } else { + err = mlx5_cmd_exec(dev->mdev, cmd_in, + cmd_in_len, + cmd_out, cmd_out_len); + } + if (err) goto obj_free; @@ -1214,7 +1228,11 @@ err_copy: if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) devx_cleanup_mkey(obj); obj_destroy: - mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, sizeof(out)); + if (obj->flags & DEVX_OBJ_FLAGS_DCT) + mlx5_core_destroy_dct(obj->mdev, &obj->core_dct); + else + mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, + sizeof(out)); obj_free: kfree(obj); return err; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 6b1f0e76900b..7cd006da1dae 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -3729,6 +3729,7 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { struct mlx5_ib_modify_qp_resp resp = {}; + u32 out[MLX5_ST_SZ_DW(create_dct_out)] = {0}; u32 min_resp_len = offsetof(typeof(resp), dctn) + sizeof(resp.dctn); @@ -3747,7 +3748,8 @@ 
static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, MLX5_SET(dctc, dctc, hop_limit, attr->ah_attr.grh.hop_limit); err = mlx5_core_create_dct(dev->mdev, &qp->dct.mdct, qp->dct.in, - MLX5_ST_SZ_BYTES(create_dct_in)); + MLX5_ST_SZ_BYTES(create_dct_in), out, + sizeof(out)); if (err) return err; resp.dctn = qp->dct.mdct.mqp.qpn; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c index c7c2920c05c4..b8ba74de9555 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c @@ -263,16 +263,16 @@ destroy: int mlx5_core_create_dct(struct mlx5_core_dev *dev, struct mlx5_core_dct *dct, - u32 *in, int inlen) + u32 *in, int inlen, + u32 *out, int outlen) { - u32 out[MLX5_ST_SZ_DW(create_dct_out)] = {0}; struct mlx5_core_qp *qp = &dct->mqp; int err; init_completion(&dct->drained); MLX5_SET(create_dct_in, in, opcode, MLX5_CMD_OP_CREATE_DCT); - err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out)); + err = mlx5_cmd_exec(dev, in, inlen, out, outlen); if (err) { mlx5_core_warn(dev, "create DCT failed, ret %d\n", err); return err; diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index b26ea9077384..0343c81d4c5f 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -557,7 +557,8 @@ static inline struct mlx5_core_mkey *__mlx5_mr_lookup(struct mlx5_core_dev *dev, int mlx5_core_create_dct(struct mlx5_core_dev *dev, struct mlx5_core_dct *qp, - u32 *in, int inlen); + u32 *in, int inlen, + u32 *out, int outlen); int mlx5_core_create_qp(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, u32 *in, From cd27287562d69629c5f007c6f64c27b6ff15f3e5 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Mon, 11 Mar 2019 14:35:58 +0200 Subject: [PATCH 577/855] IB/mlx5: Fix mapping of link-mode to IB width and speed Add mapping of link mode: CAUI4 100Gbps CR4/KR4 with 4 lines and 25Gbps. Fix mapping of link mode: GAUI2 50Gbps CR2/KR2 to be 2 lines with 25Gbps. 
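The width/speed pairs follow from the lane count and the per-lane signalling rate: in InfiniBand terms EDR is roughly 25 Gb/s per lane and HDR roughly 50 Gb/s per lane, so each Ethernet link mode decomposes as lanes times per-lane rate. Summarising the mappings touched by the change below (per-lane figures are the nominal IB rates, for orientation only):

    link mode                      lanes   per-lane    reported as
    50GAUI-2 / 50GBASE-CR2/KR2       2     ~25 Gb/s    IB_WIDTH_2X + IB_SPEED_EDR
    50GAUI-1 / 50GBASE-CR/KR         1     ~50 Gb/s    IB_WIDTH_1X + IB_SPEED_HDR
    CAUI-4 / 100GBASE-CR4/KR4        4     ~25 Gb/s    IB_WIDTH_4X + IB_SPEED_EDR
    100GAUI-2 / 100GBASE-CR2/KR2     2     ~50 Gb/s    IB_WIDTH_2X + IB_SPEED_HDR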
Fixes: 08e8676f1607 ("IB/mlx5: Add support for 50Gbps per lane link modes") Signed-off-by: Aya Levin Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 994c19d01211..531ff20b32ad 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -415,10 +415,17 @@ static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u8 *active_speed, *active_speed = IB_SPEED_EDR; break; case MLX5E_PROT_MASK(MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2): + *active_width = IB_WIDTH_2X; + *active_speed = IB_SPEED_EDR; + break; case MLX5E_PROT_MASK(MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR): *active_width = IB_WIDTH_1X; *active_speed = IB_SPEED_HDR; break; + case MLX5E_PROT_MASK(MLX5E_CAUI_4_100GBASE_CR4_KR4): + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_EDR; + break; case MLX5E_PROT_MASK(MLX5E_100GAUI_2_100GBASE_CR2_KR2): *active_width = IB_WIDTH_2X; *active_speed = IB_SPEED_HDR; From ec4fe4bcc584b55e24e8d1768f5510a62c0fd619 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 14 Mar 2019 18:37:29 +0800 Subject: [PATCH 578/855] i40iw: Avoid panic when handling the inetdev event There is a panic reported that on a system with x722 ethernet, when doing the operations like: # ip link add br0 type bridge # ip link set eno1 master br0 # systemctl restart systemd-networkd The system will panic "BUG: unable to handle kernel null pointer dereference at 0000000000000034", with call chain: i40iw_inetaddr_event notifier_call_chain blocking_notifier_call_chain notifier_call_chain __inet_del_ifa inet_rtm_deladdr rtnetlink_rcv_msg netlink_rcv_skb rtnetlink_rcv netlink_unicast netlink_sendmsg sock_sendmsg __sys_sendto It is caused by "local_ipaddr = ntohl(in->ifa_list->ifa_address)", while the in->ifa_list is NULL. So add a check for the "in->ifa_list == NULL" case, and skip the ARP operation accordingly. Signed-off-by: Feng Tang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw_utils.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw_utils.c b/drivers/infiniband/hw/i40iw/i40iw_utils.c index c5a881172524..337410f40860 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_utils.c +++ b/drivers/infiniband/hw/i40iw/i40iw_utils.c @@ -173,7 +173,12 @@ int i40iw_inetaddr_event(struct notifier_block *notifier, rcu_read_lock(); in = __in_dev_get_rcu(upper_dev); - local_ipaddr = ntohl(in->ifa_list->ifa_address); + + if (!in->ifa_list) + local_ipaddr = 0; + else + local_ipaddr = ntohl(in->ifa_list->ifa_address); + rcu_read_unlock(); } else { local_ipaddr = ntohl(ifa->ifa_address); @@ -185,6 +190,11 @@ int i40iw_inetaddr_event(struct notifier_block *notifier, case NETDEV_UP: /* Fall through */ case NETDEV_CHANGEADDR: + + /* Just skip if no need to handle ARP cache */ + if (!local_ipaddr) + break; + i40iw_manage_arp_cache(iwdev, netdev->dev_addr, &local_ipaddr, From b5b4453e7912f056da1ca7572574cada32ecb60c Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 14 Mar 2019 00:14:38 +1100 Subject: [PATCH 579/855] powerpc/vdso64: Fix CLOCK_MONOTONIC inconsistencies across Y2038 Jakub Drnec reported: Setting the realtime clock can sometimes make the monotonic clock go back by over a hundred years. Decreasing the realtime clock across the y2k38 threshold is one reliable way to reproduce. 
Allegedly this can also happen just by running ntpd, I have not managed to reproduce that other than booting with rtc at >2038 and then running ntp. When this happens, anything with timers (e.g. openjdk) breaks rather badly. And included a test case (slightly edited for brevity):

    #define _POSIX_C_SOURCE 199309L
    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>

    long get_time(void)
    {
        struct timespec tp;
        clock_gettime(CLOCK_MONOTONIC, &tp);
        return tp.tv_sec + tp.tv_nsec / 1000000000;
    }

    int main(void)
    {
        long last = get_time();
        while(1) {
            long now = get_time();
            if (now < last) {
                printf("clock went backwards by %ld seconds!\n", last - now);
            }
            last = now;
            sleep(1);
        }
        return 0;
    }

Which when run concurrently with:
    # date -s 2040-1-1
    # date -s 2037-1-1
Will detect the clock going backward. The root cause is that wtom_clock_sec in struct vdso_data is only a 32-bit signed value, even though we set its value to be equal to tk->wall_to_monotonic.tv_sec which is 64 bits. Because the monotonic clock starts at zero when the system boots, the wall_to_monotonic.tv_sec offset is negative for current and future dates. Currently on a freshly booted system the offset will be in the vicinity of negative 1.5 billion seconds. However if the wall clock is set past the Y2038 boundary, the offset from wall to monotonic becomes less than negative 2^31, and no longer fits in 32 bits. When that value is assigned to wtom_clock_sec it is truncated and becomes positive, causing the VDSO assembly code to calculate CLOCK_MONOTONIC incorrectly. That causes CLOCK_MONOTONIC to jump ahead by ~4 billion seconds which it is not meant to do. Worse, if the time is then set back before the Y2038 boundary CLOCK_MONOTONIC will jump backward. We can fix it simply by storing the full 64-bit offset in the vdso_data, and using that in the VDSO assembly code. We also shuffle some of the fields in vdso_data to avoid creating a hole. The original commit that added the CLOCK_MONOTONIC support to the VDSO did actually use a 64-bit value for wtom_clock_sec, see commit a7f290dad32e ("[PATCH] powerpc: Merge vdso's and add vdso support to 32 bits kernel") (Nov 2005). However just 3 days later it was converted to 32-bits in commit 0c37ec2aa88b ("[PATCH] powerpc: vdso fixes (take #2)"), and the bug has existed since then AFAICS.
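To see the truncation concretely, here is a minimal user-space sketch of the arithmetic described above; the constant is a hypothetical wall-to-monotonic offset for a wall clock set to the year 2040 (about -2.2e9, i.e. below -2^31), not a value read from a real vdso_data, and the wrap-around assumes the usual two's complement behaviour:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            int64_t wtom_sec  = -2208988800LL;       /* 1970..2040 in seconds, negated */
            int32_t truncated = (int32_t)wtom_sec;   /* the old __s32 wtom_clock_sec */

            printf("64-bit offset: %lld\n", (long long)wtom_sec);
            printf("32-bit field:  %d\n", truncated);
            /* prints 2085978496: the offset flips sign, so monotonic =
             * realtime + offset jumps forward by roughly 2^32 seconds */
            return 0;
    }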
Fixes: 0c37ec2aa88b ("[PATCH] powerpc: vdso fixes (take #2)") Cc: stable@vger.kernel.org # v2.6.15+ Link: http://lkml.kernel.org/r/HaC.ZfES.62bwlnvAvMP.1STMMj@seznam.cz Reported-by: Jakub Drnec Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/vdso_datapage.h | 8 ++++---- arch/powerpc/kernel/vdso64/gettimeofday.S | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/vdso_datapage.h b/arch/powerpc/include/asm/vdso_datapage.h index 1afe90ade595..bbc06bd72b1f 100644 --- a/arch/powerpc/include/asm/vdso_datapage.h +++ b/arch/powerpc/include/asm/vdso_datapage.h @@ -82,10 +82,10 @@ struct vdso_data { __u32 icache_block_size; /* L1 i-cache block size */ __u32 dcache_log_block_size; /* L1 d-cache log block size */ __u32 icache_log_block_size; /* L1 i-cache log block size */ - __s32 wtom_clock_sec; /* Wall to monotonic clock */ - __s32 wtom_clock_nsec; - struct timespec stamp_xtime; /* xtime as at tb_orig_stamp */ - __u32 stamp_sec_fraction; /* fractional seconds of stamp_xtime */ + __u32 stamp_sec_fraction; /* fractional seconds of stamp_xtime */ + __s32 wtom_clock_nsec; /* Wall to monotonic clock nsec */ + __s64 wtom_clock_sec; /* Wall to monotonic clock sec */ + struct timespec stamp_xtime; /* xtime as at tb_orig_stamp */ __u32 syscall_map_64[SYSCALL_MAP_SIZE]; /* map of syscalls */ __u32 syscall_map_32[SYSCALL_MAP_SIZE]; /* map of syscalls */ }; diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S b/arch/powerpc/kernel/vdso64/gettimeofday.S index a4ed9edfd5f0..1f324c28705b 100644 --- a/arch/powerpc/kernel/vdso64/gettimeofday.S +++ b/arch/powerpc/kernel/vdso64/gettimeofday.S @@ -92,7 +92,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) * At this point, r4,r5 contain our sec/nsec values. */ - lwa r6,WTOM_CLOCK_SEC(r3) + ld r6,WTOM_CLOCK_SEC(r3) lwa r9,WTOM_CLOCK_NSEC(r3) /* We now have our result in r6,r9. We create a fake dependency @@ -125,7 +125,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) bne cr6,75f /* CLOCK_MONOTONIC_COARSE */ - lwa r6,WTOM_CLOCK_SEC(r3) + ld r6,WTOM_CLOCK_SEC(r3) lwa r9,WTOM_CLOCK_NSEC(r3) /* check if counter has updated */ From e60a582bcde01158a64ff948fb799f21f5d31a11 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 7 Mar 2019 11:09:19 +0100 Subject: [PATCH 580/855] mmc: pxamci: fix enum type confusion clang points out several instances of mismatched types in this driver, all coming from a single declaration:

drivers/mmc/host/pxamci.c:193:15: error: implicit conversion from enumeration type 'enum dma_transfer_direction' to different enumeration type 'enum dma_data_direction' [-Werror,-Wenum-conversion]
        direction = DMA_DEV_TO_MEM;
                  ~ ^~~~~~~~~~~~~~
drivers/mmc/host/pxamci.c:212:62: error: implicit conversion from enumeration type 'enum dma_data_direction' to different enumeration type 'enum dma_transfer_direction' [-Werror,-Wenum-conversion]
        tx = dmaengine_prep_slave_sg(chan, data->sg, host->dma_len, direction,

The behavior is correct, so this must be a simple typo from dma_data_direction and dma_transfer_direction being similarly named types with a similar purpose.
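The two enums involved are distinct C types whose enumerators happen to overlap numerically, which is why the old code behaved correctly while clang still warns. A stand-alone sketch (enumerators abbreviated from include/linux/dma-direction.h and include/linux/dmaengine.h; treat it as illustrative rather than authoritative):

    enum dma_data_direction     { DMA_BIDIRECTIONAL, DMA_TO_DEVICE, DMA_FROM_DEVICE, DMA_NONE };
    enum dma_transfer_direction { DMA_MEM_TO_MEM, DMA_MEM_TO_DEV, DMA_DEV_TO_MEM, DMA_DEV_TO_DEV };

    static void prep_slave(enum dma_transfer_direction dir) { (void)dir; }

    int main(void)
    {
            enum dma_data_direction direction;      /* wrong type, as in the old driver */

            direction = DMA_DEV_TO_MEM;     /* clang: -Wenum-conversion */
            prep_slave(direction);          /* clang: -Wenum-conversion */
            /* DMA_DEV_TO_MEM and DMA_FROM_DEVICE are both 2 here, so the
             * value reaching the callee is still the intended one */
            return 0;
    }

Declaring the variable as enum dma_transfer_direction, as the fix below does, keeps the same values and removes the warnings.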
Fixes: 6464b7140951 ("mmc: pxamci: switch over to dmaengine use") Signed-off-by: Arnd Bergmann Reviewed-by: Nathan Chancellor Acked-by: Robert Jarzmik Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/pxamci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/pxamci.c b/drivers/mmc/host/pxamci.c index c907bf502a12..c1d3f0e38921 100644 --- a/drivers/mmc/host/pxamci.c +++ b/drivers/mmc/host/pxamci.c @@ -162,7 +162,7 @@ static void pxamci_dma_irq(void *param); static void pxamci_setup_data(struct pxamci_host *host, struct mmc_data *data) { struct dma_async_tx_descriptor *tx; - enum dma_data_direction direction; + enum dma_transfer_direction direction; struct dma_slave_config config; struct dma_chan *chan; unsigned int nob = data->blocks; From 9ce58dd7d9da3ca0d7cb8c9568f1c6f4746da65a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 7 Mar 2019 11:10:11 +0100 Subject: [PATCH 581/855] mmc: davinci: remove extraneous __init annotation Building with clang finds a mistaken __init tag: WARNING: vmlinux.o(.text+0x5e4250): Section mismatch in reference from the function davinci_mmcsd_probe() to the function .init.text:init_mmcsd_host() The function davinci_mmcsd_probe() references the function __init init_mmcsd_host(). This is often because davinci_mmcsd_probe lacks a __init annotation or the annotation of init_mmcsd_host is wrong. Signed-off-by: Arnd Bergmann Acked-by: Wolfram Sang Reviewed-by: Nathan Chancellor Signed-off-by: Ulf Hansson --- drivers/mmc/host/davinci_mmc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/davinci_mmc.c b/drivers/mmc/host/davinci_mmc.c index 49e0daf2ef5e..f37003df1e01 100644 --- a/drivers/mmc/host/davinci_mmc.c +++ b/drivers/mmc/host/davinci_mmc.c @@ -1117,7 +1117,7 @@ static inline void mmc_davinci_cpufreq_deregister(struct mmc_davinci_host *host) { } #endif -static void __init init_mmcsd_host(struct mmc_davinci_host *host) +static void init_mmcsd_host(struct mmc_davinci_host *host) { mmc_davinci_reset_ctrl(host, 1); From 4e50ce03976fbc8ae995a000c4b10c737467beaa Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 13 Mar 2019 10:03:17 +0100 Subject: [PATCH 582/855] iommu/amd: fix sg->dma_address for sg->offset bigger than PAGE_SIZE Take into account that sg->offset can be bigger than PAGE_SIZE when setting segment sg->dma_address. 
Otherwise sg->dma_address will point at diffrent page, what makes DMA not possible with erros like this: xhci_hcd 0000:38:00.3: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000 address=0x00000000fdaa70c0 flags=0x0020] xhci_hcd 0000:38:00.3: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000 address=0x00000000fdaa7040 flags=0x0020] xhci_hcd 0000:38:00.3: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000 address=0x00000000fdaa7080 flags=0x0020] xhci_hcd 0000:38:00.3: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000 address=0x00000000fdaa7100 flags=0x0020] xhci_hcd 0000:38:00.3: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000 address=0x00000000fdaa7000 flags=0x0020] Additinally with wrong sg->dma_address unmap_sg will free wrong pages, what what can cause crashes like this: Feb 28 19:27:45 kernel: BUG: Bad page state in process cinnamon pfn:39e8b1 Feb 28 19:27:45 kernel: Disabling lock debugging due to kernel taint Feb 28 19:27:45 kernel: flags: 0x2ffff0000000000() Feb 28 19:27:45 kernel: raw: 02ffff0000000000 0000000000000000 ffffffff00000301 0000000000000000 Feb 28 19:27:45 kernel: raw: 0000000000000000 0000000000000000 00000001ffffffff 0000000000000000 Feb 28 19:27:45 kernel: page dumped because: nonzero _refcount Feb 28 19:27:45 kernel: Modules linked in: ccm fuse arc4 nct6775 hwmon_vid amdgpu nls_iso8859_1 nls_cp437 edac_mce_amd vfat fat kvm_amd ccp rng_core kvm mt76x0u mt76x0_common mt76x02_usb irqbypass mt76_usb mt76x02_lib mt76 crct10dif_pclmul crc32_pclmul chash mac80211 amd_iommu_v2 ghash_clmulni_intel gpu_sched i2c_algo_bit ttm wmi_bmof snd_hda_codec_realtek snd_hda_codec_generic drm_kms_helper snd_hda_codec_hdmi snd_hda_intel drm snd_hda_codec aesni_intel snd_hda_core snd_hwdep aes_x86_64 crypto_simd snd_pcm cfg80211 cryptd mousedev snd_timer glue_helper pcspkr r8169 input_leds realtek agpgart libphy rfkill snd syscopyarea sysfillrect sysimgblt fb_sys_fops soundcore sp5100_tco k10temp i2c_piix4 wmi evdev gpio_amdpt pinctrl_amd mac_hid pcc_cpufreq acpi_cpufreq sg ip_tables x_tables ext4(E) crc32c_generic(E) crc16(E) mbcache(E) jbd2(E) fscrypto(E) sd_mod(E) hid_generic(E) usbhid(E) hid(E) dm_mod(E) serio_raw(E) atkbd(E) libps2(E) crc32c_intel(E) ahci(E) libahci(E) libata(E) xhci_pci(E) xhci_hcd(E) Feb 28 19:27:45 kernel: scsi_mod(E) i8042(E) serio(E) bcache(E) crc64(E) Feb 28 19:27:45 kernel: CPU: 2 PID: 896 Comm: cinnamon Tainted: G B W E 4.20.12-arch1-1-custom #1 Feb 28 19:27:45 kernel: Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./B450M Pro4, BIOS P1.20 06/26/2018 Feb 28 19:27:45 kernel: Call Trace: Feb 28 19:27:45 kernel: dump_stack+0x5c/0x80 Feb 28 19:27:45 kernel: bad_page.cold.29+0x7f/0xb2 Feb 28 19:27:45 kernel: __free_pages_ok+0x2c0/0x2d0 Feb 28 19:27:45 kernel: skb_release_data+0x96/0x180 Feb 28 19:27:45 kernel: __kfree_skb+0xe/0x20 Feb 28 19:27:45 kernel: tcp_recvmsg+0x894/0xc60 Feb 28 19:27:45 kernel: ? reuse_swap_page+0x120/0x340 Feb 28 19:27:45 kernel: ? ptep_set_access_flags+0x23/0x30 Feb 28 19:27:45 kernel: inet_recvmsg+0x5b/0x100 Feb 28 19:27:45 kernel: __sys_recvfrom+0xc3/0x180 Feb 28 19:27:45 kernel: ? handle_mm_fault+0x10a/0x250 Feb 28 19:27:45 kernel: ? syscall_trace_enter+0x1d3/0x2d0 Feb 28 19:27:45 kernel: ? 
__audit_syscall_exit+0x22a/0x290 Feb 28 19:27:45 kernel: __x64_sys_recvfrom+0x24/0x30 Feb 28 19:27:45 kernel: do_syscall_64+0x5b/0x170 Feb 28 19:27:45 kernel: entry_SYSCALL_64_after_hwframe+0x44/0xa9 Cc: stable@vger.kernel.org Reported-and-tested-by: Jan Viktorin Reviewed-by: Alexander Duyck Signed-off-by: Stanislaw Gruszka Fixes: 80187fd39dcb ('iommu/amd: Optimize map_sg and unmap_sg') Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index b319e51c379b..21cb088d6687 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2608,7 +2608,12 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, /* Everything is mapped - write the right values into s->dma_address */ for_each_sg(sglist, s, nelems, i) { - s->dma_address += address + s->offset; + /* + * Add in the remaining piece of the scatter-gather offset that + * was masked out when we were determining the physical address + * via (sg_phys(s) & PAGE_MASK) earlier. + */ + s->dma_address += address + (s->offset & ~PAGE_MASK); s->dma_length = s->length; } From 721f1e6c1fd137e7e2053d8e103b666faaa2d50c Mon Sep 17 00:00:00 2001 From: Jaroslav Kysela Date: Mon, 18 Mar 2019 13:45:43 +0100 Subject: [PATCH 583/855] ALSA: hda - add Lenovo IdeaCentre B550 to the power_save_blacklist Another machine which does not like the power saving (noise): https://bugzilla.redhat.com/show_bug.cgi?id=1689623 Also, reorder the Lenovo C50 entry to keep the table sorted. Reported-by: hs.guimaraes@outlook.com Signed-off-by: Jaroslav Kysela Cc: Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_intel.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index e5c49003e75f..4f502c92061f 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -2144,10 +2144,12 @@ static struct snd_pci_quirk power_save_blacklist[] = { SND_PCI_QUIRK(0x8086, 0x2057, "Intel NUC5i7RYB", 0), /* https://bugzilla.redhat.com/show_bug.cgi?id=1520902 */ SND_PCI_QUIRK(0x8086, 0x2068, "Intel NUC7i3BNB", 0), - /* https://bugzilla.redhat.com/show_bug.cgi?id=1572975 */ - SND_PCI_QUIRK(0x17aa, 0x36a7, "Lenovo C50 All in one", 0), /* https://bugzilla.kernel.org/show_bug.cgi?id=198611 */ SND_PCI_QUIRK(0x17aa, 0x2227, "Lenovo X1 Carbon 3rd Gen", 0), + /* https://bugzilla.redhat.com/show_bug.cgi?id=1689623 */ + SND_PCI_QUIRK(0x17aa, 0x367b, "Lenovo IdeaCentre B550", 0), + /* https://bugzilla.redhat.com/show_bug.cgi?id=1572975 */ + SND_PCI_QUIRK(0x17aa, 0x36a7, "Lenovo C50 All in one", 0), {} }; #endif /* CONFIG_PM */ From b4748e7ab731e436cf5db4786358ada5dd2db6dd Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sun, 17 Mar 2019 23:21:24 +0000 Subject: [PATCH 584/855] ALSA: opl3: fix mismatch between snd_opl3_drum_switch definition and declaration The function snd_opl3_drum_switch declaration in the header file has the order of the two arguments on_off and vel swapped when compared to the definition arguments of vel and on_off. Fix this by swapping them around to match the definition. This error predates the git history, so no idea when this error was introduced. 
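Nothing in the build catches this kind of mismatch because parameter names are not part of a C function's type: as long as the parameter types agree, a declaration may list same-typed parameters in a different order than the definition and both still compile silently, with call-site arguments simply landing on whichever parameter occupies that position. A small illustrative program (hypothetical names, not the opl3 functions):

    #include <stdio.h>

    /* header-style declaration: (..., on_off, vel) */
    void drum_switch(int note, int on_off, int vel);

    int main(void)
    {
            drum_switch(60, 1, 127);        /* caller follows the declaration */
            return 0;
    }

    /* definition: (..., vel, on_off); same types, so no diagnostic */
    void drum_switch(int note, int vel, int on_off)
    {
            /* prints "note=60 vel=1 on_off=127" */
            printf("note=%d vel=%d on_off=%d\n", note, vel, on_off);
    }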
Signed-off-by: Colin Ian King Signed-off-by: Takashi Iwai --- sound/drivers/opl3/opl3_voice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/drivers/opl3/opl3_voice.h b/sound/drivers/opl3/opl3_voice.h index 5b02bd49fde4..4e4ecc21760b 100644 --- a/sound/drivers/opl3/opl3_voice.h +++ b/sound/drivers/opl3/opl3_voice.h @@ -41,7 +41,7 @@ void snd_opl3_timer_func(struct timer_list *t); /* Prototypes for opl3_drums.c */ void snd_opl3_load_drums(struct snd_opl3 *opl3); -void snd_opl3_drum_switch(struct snd_opl3 *opl3, int note, int on_off, int vel, struct snd_midi_channel *chan); +void snd_opl3_drum_switch(struct snd_opl3 *opl3, int note, int vel, int on_off, struct snd_midi_channel *chan); /* Prototypes for opl3_oss.c */ #if IS_ENABLED(CONFIG_SND_SEQUENCER_OSS) From 4622a2d43101ea2e3d54a2af090f25a5886c648b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 11 Mar 2019 08:30:27 +0000 Subject: [PATCH 585/855] powerpc/6xx: fix setup and use of SPRN_SPRG_PGDIR for hash32 Not only the 603 but all 6xx need SPRN_SPRG_PGDIR to be initialised at startup. This patch move it from __setup_cpu_603() to start_here() and __secondary_start(), close to the initialisation of SPRN_THREAD. Previously, virt addr of PGDIR was retrieved from thread struct. Now that it is the phys addr which is stored in SPRN_SPRG_PGDIR, hash_page() shall not convert it to phys anymore. This patch removes the conversion. Fixes: 93c4a162b014 ("powerpc/6xx: Store PGDIR physical address in a SPRG") Reported-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/cpu_setup_6xx.S | 3 --- arch/powerpc/kernel/head_32.S | 6 ++++++ arch/powerpc/mm/hash_low_32.S | 8 ++++---- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/cpu_setup_6xx.S b/arch/powerpc/kernel/cpu_setup_6xx.S index 6f1c11e0691f..7534ecff5e92 100644 --- a/arch/powerpc/kernel/cpu_setup_6xx.S +++ b/arch/powerpc/kernel/cpu_setup_6xx.S @@ -24,9 +24,6 @@ BEGIN_MMU_FTR_SECTION li r10,0 mtspr SPRN_SPRG_603_LRU,r10 /* init SW LRU tracking */ END_MMU_FTR_SECTION_IFSET(MMU_FTR_NEED_DTLB_SW_LRU) - lis r10, (swapper_pg_dir - PAGE_OFFSET)@h - ori r10, r10, (swapper_pg_dir - PAGE_OFFSET)@l - mtspr SPRN_SPRG_PGDIR, r10 BEGIN_FTR_SECTION bl __init_fpu_registers diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index ce6a972f2584..48051c8977c5 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -855,6 +855,9 @@ __secondary_start: li r3,0 stw r3, RTAS_SP(r4) /* 0 => not in RTAS */ #endif + lis r4, (swapper_pg_dir - PAGE_OFFSET)@h + ori r4, r4, (swapper_pg_dir - PAGE_OFFSET)@l + mtspr SPRN_SPRG_PGDIR, r4 /* enable MMU and jump to start_secondary */ li r4,MSR_KERNEL @@ -942,6 +945,9 @@ start_here: li r3,0 stw r3, RTAS_SP(r4) /* 0 => not in RTAS */ #endif + lis r4, (swapper_pg_dir - PAGE_OFFSET)@h + ori r4, r4, (swapper_pg_dir - PAGE_OFFSET)@l + mtspr SPRN_SPRG_PGDIR, r4 /* stack */ lis r1,init_thread_union@ha diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S index 1f13494efb2b..a6c491f18a04 100644 --- a/arch/powerpc/mm/hash_low_32.S +++ b/arch/powerpc/mm/hash_low_32.S @@ -70,12 +70,12 @@ _GLOBAL(hash_page) lis r0,KERNELBASE@h /* check if kernel address */ cmplw 0,r4,r0 ori r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */ - mfspr r5, SPRN_SPRG_PGDIR /* virt page-table root */ + mfspr r5, SPRN_SPRG_PGDIR /* phys page-table root */ blt+ 112f /* assume user more likely */ 
- lis r5,swapper_pg_dir@ha /* if kernel address, use */ - addi r5,r5,swapper_pg_dir@l /* kernel page table */ + lis r5, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ + addi r5 ,r5 ,(swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ rlwimi r3,r9,32-12,29,29 /* MSR_PR -> _PAGE_USER */ -112: tophys(r5, r5) +112: #ifndef CONFIG_PTE_64BIT rlwimi r5,r4,12,20,29 /* insert top 10 bits of address */ lwz r8,0(r5) /* get pmd entry */ From 6ce59025f1182125e75c8d121daf44056b65dd1f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Mar 2019 08:08:43 -0600 Subject: [PATCH 586/855] paride/pf: cleanup queues when detection fails The driver allocates queues for all the units it potentially supports. But if we fail to detect any drives, then we fail loading the module without cleaning up those queues. This is now evident with the switch to blk-mq, though the bug has been there forever as far as I can tell. Also fix cleanup through regular module exit. Reported-by: Randy Dunlap Tested-by: Randy Dunlap Signed-off-by: Jens Axboe --- drivers/block/paride/pf.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index e92e7a8eeeb2..103b617cdc31 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -761,8 +761,12 @@ static int pf_detect(void) return 0; printk("%s: No ATAPI disk detected\n", name); - for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) + for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) { + blk_cleanup_queue(pf->disk->queue); + pf->disk->queue = NULL; + blk_mq_free_tag_set(&pf->tag_set); put_disk(pf->disk); + } pi_unregister_driver(par_drv); return -1; } @@ -1047,13 +1051,15 @@ static void __exit pf_exit(void) int unit; unregister_blkdev(major, name); for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) { - if (!pf->present) - continue; - del_gendisk(pf->disk); + if (pf->present) + del_gendisk(pf->disk); + blk_cleanup_queue(pf->disk->queue); blk_mq_free_tag_set(&pf->tag_set); put_disk(pf->disk); - pi_release(pf->pi); + + if (pf->present) + pi_release(pf->pi); } } From 81b74ac68c28fddb3589ad5d4d5e587baf4bb781 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Mar 2019 08:10:32 -0600 Subject: [PATCH 587/855] paride/pcd: cleanup queues when detection fails The driver allocates queues for all the units it potentially supports. But if we fail to detect any drives, then we fail loading the module without cleaning up those queues. This is now evident with the switch to blk-mq, though the bug has been there forever as far as I can tell. Also fix cleanup through regular module exit. 
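Both paride fixes follow the same shape: each unit's disk, queue and tag set are allocated up front, so a failed detection pass (and the module exit path) has to walk every unit again and release them in reverse order of setup. A rough sketch of that unwind pattern, where struct unit, units[], NR_UNITS and probe_unit() are stand-ins for the pcd/pf specifics rather than real kernel symbols:

    static int example_detect(void)
    {
            struct unit *u;
            int i, found = 0;

            for (i = 0, u = units; i < NR_UNITS; i++, u++)
                    found += probe_unit(u);         /* sets u->present on success */

            if (found)
                    return 0;

            /* nothing detected: undo the up-front allocations */
            for (i = 0, u = units; i < NR_UNITS; i++, u++) {
                    blk_cleanup_queue(u->disk->queue);
                    u->disk->queue = NULL;
                    blk_mq_free_tag_set(&u->tag_set);
                    put_disk(u->disk);
            }
            return -ENODEV;
    }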
Reported-by: Randy Dunlap Tested-by: Randy Dunlap Signed-off-by: Jens Axboe --- drivers/block/paride/pcd.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index 96670eefaeb2..377a694dc228 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -749,8 +749,12 @@ static int pcd_detect(void) return 0; printk("%s: No CD-ROM drive found\n", name); - for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) + for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) { + blk_cleanup_queue(cd->disk->queue); + cd->disk->queue = NULL; + blk_mq_free_tag_set(&cd->tag_set); put_disk(cd->disk); + } pi_unregister_driver(par_drv); return -1; } From aa36e3616532f82a920b5ebf4e059fbafae63d88 Mon Sep 17 00:00:00 2001 From: Luc Van Oostenryck Date: Sat, 19 Jan 2019 17:15:23 +0100 Subject: [PATCH 588/855] thermal/intel_powerclamp: fix __percpu declaration of worker_data This variable is declared as: static struct powerclamp_worker_data * __percpu worker_data; In other words, a percpu pointer to struct ... But this variable not used like so but as a pointer to a percpu struct powerclamp_worker_data. So fix the declaration as: static struct powerclamp_worker_data __percpu *worker_data; This also quiets Sparse's warnings from __verify_pcpu_ptr(), like: 494:49: warning: incorrect type in initializer (different address spaces) 494:49: expected void const [noderef] *__vpp_verify 494:49: got struct powerclamp_worker_data * Signed-off-by: Luc Van Oostenryck Reviewed-by: Petr Mladek Signed-off-by: Zhang Rui --- drivers/thermal/intel/intel_powerclamp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thermal/intel/intel_powerclamp.c b/drivers/thermal/intel/intel_powerclamp.c index 7571f7c2e7c9..b12ecd436e23 100644 --- a/drivers/thermal/intel/intel_powerclamp.c +++ b/drivers/thermal/intel/intel_powerclamp.c @@ -101,7 +101,7 @@ struct powerclamp_worker_data { bool clamping; }; -static struct powerclamp_worker_data * __percpu worker_data; +static struct powerclamp_worker_data __percpu *worker_data; static struct thermal_cooling_device *cooling_dev; static unsigned long *cpu_clamping_mask; /* bit map for tracking per cpu * clamping kthread worker From 3b5236cc5d086dd3ddd01113ee9255421aab9fab Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Tue, 22 Jan 2019 16:47:41 +0100 Subject: [PATCH 589/855] thermal: samsung: Fix incorrect check after code merge Merge commit 19785cf93b6c ("Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/evalenti/linux-soc-thermal") broke the code introduced by commit ffe6e16f14fa ("thermal: exynos: Reduce severity of too early temperature read"). 
Restore the original code from the mentioned commit to finally fix the warning message during boot: thermal thermal_zone0: failed to read out thermal zone (-22) Reported-by: Marian Mihailescu Signed-off-by: Marek Szyprowski Fixes: 19785cf93b6c ("Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/evalenti/linux-soc-thermal") Reviewed-by: Krzysztof Kozlowski Signed-off-by: Zhang Rui --- drivers/thermal/samsung/exynos_tmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thermal/samsung/exynos_tmu.c b/drivers/thermal/samsung/exynos_tmu.c index 48eef552cba4..fc9399d9c082 100644 --- a/drivers/thermal/samsung/exynos_tmu.c +++ b/drivers/thermal/samsung/exynos_tmu.c @@ -666,7 +666,7 @@ static int exynos_get_temp(void *p, int *temp) struct exynos_tmu_data *data = p; int value, ret = 0; - if (!data || !data->tmu_read || !data->enabled) + if (!data || !data->tmu_read) return -EINVAL; else if (!data->enabled) /* From 35122495a8c6683e863acf7b05a7036b2be64c7a Mon Sep 17 00:00:00 2001 From: Phil Elwell Date: Tue, 29 Jan 2019 09:55:57 +0000 Subject: [PATCH 590/855] thermal: bcm2835: Fix crash in bcm2835_thermal_debugfs "cat /sys/kernel/debug/bcm2835_thermal/regset" causes a NULL pointer dereference in bcm2835_thermal_debugfs. The driver makes use of the implementation details of the thermal framework to retrieve a pointer to its private data from a struct thermal_zone_device, and gets it wrong - leading to the crash. Instead, store its private data as the drvdata and retrieve the thermal_zone_device pointer from it. Fixes: bcb7dd9ef206 ("thermal: bcm2835: add thermal driver for bcm2835 SoC") Signed-off-by: Phil Elwell Signed-off-by: Zhang Rui --- drivers/thermal/broadcom/bcm2835_thermal.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/thermal/broadcom/bcm2835_thermal.c b/drivers/thermal/broadcom/bcm2835_thermal.c index 720760cd493f..ba39647a690c 100644 --- a/drivers/thermal/broadcom/bcm2835_thermal.c +++ b/drivers/thermal/broadcom/bcm2835_thermal.c @@ -119,8 +119,7 @@ static const struct debugfs_reg32 bcm2835_thermal_regs[] = { static void bcm2835_thermal_debugfs(struct platform_device *pdev) { - struct thermal_zone_device *tz = platform_get_drvdata(pdev); - struct bcm2835_thermal_data *data = tz->devdata; + struct bcm2835_thermal_data *data = platform_get_drvdata(pdev); struct debugfs_regset32 *regset; data->debugfsdir = debugfs_create_dir("bcm2835_thermal", NULL); @@ -266,7 +265,7 @@ static int bcm2835_thermal_probe(struct platform_device *pdev) data->tz = tz; - platform_set_drvdata(pdev, tz); + platform_set_drvdata(pdev, data); /* * Thermal_zone doesn't enable hwmon as default, @@ -290,8 +289,8 @@ err_clk: static int bcm2835_thermal_remove(struct platform_device *pdev) { - struct thermal_zone_device *tz = platform_get_drvdata(pdev); - struct bcm2835_thermal_data *data = tz->devdata; + struct bcm2835_thermal_data *data = platform_get_drvdata(pdev); + struct thermal_zone_device *tz = data->tz; debugfs_remove_recursive(data->debugfsdir); thermal_zone_of_sensor_unregister(&pdev->dev, tz); From e0fda7377d30685feaef4d93d9fdfde91c5d7d9a Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Mon, 18 Feb 2019 14:22:30 +0800 Subject: [PATCH 591/855] thermal: cpu_cooling: Remove unused cur_freq variable The 'cur_freq' local variable became unused after commit 84fe2cab4859 ("cpu_cooling: Drop static-power related stuff"), let's remove it. 
Cc: Amit Daniel Kachhap Cc: Viresh Kumar Cc: Javi Merino Cc: Zhang Rui Cc: Eduardo Valentin Cc: Daniel Lezcano Signed-off-by: Shaokun Zhang Acked-by: Viresh Kumar Signed-off-by: Zhang Rui --- drivers/thermal/cpu_cooling.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c index 6fff16113628..f7c1f49ec87f 100644 --- a/drivers/thermal/cpu_cooling.c +++ b/drivers/thermal/cpu_cooling.c @@ -536,12 +536,11 @@ static int cpufreq_power2state(struct thermal_cooling_device *cdev, struct thermal_zone_device *tz, u32 power, unsigned long *state) { - unsigned int cur_freq, target_freq; + unsigned int target_freq; u32 last_load, normalised_power; struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata; struct cpufreq_policy *policy = cpufreq_cdev->policy; - cur_freq = cpufreq_quick_get(policy->cpu); power = power > 0 ? power : 0; last_load = cpufreq_cdev->last_load ?: 1; normalised_power = (power * 100) / last_load; From 16fc8eca1975358111dbd7ce65e4ce42d1a848fb Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Wed, 10 Oct 2018 01:30:06 -0700 Subject: [PATCH 592/855] thermal/int340x_thermal: Add additional UUIDs Add more supported DPTF policies than the driver currently exposes. Signed-off-by: Matthew Garrett Cc: Nisha Aram Signed-off-by: Zhang Rui --- .../intel/int340x_thermal/int3400_thermal.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/thermal/intel/int340x_thermal/int3400_thermal.c b/drivers/thermal/intel/int340x_thermal/int3400_thermal.c index 61ca7ce3624e..e0f39cacbc18 100644 --- a/drivers/thermal/intel/int340x_thermal/int3400_thermal.c +++ b/drivers/thermal/intel/int340x_thermal/int3400_thermal.c @@ -22,6 +22,13 @@ enum int3400_thermal_uuid { INT3400_THERMAL_PASSIVE_1, INT3400_THERMAL_ACTIVE, INT3400_THERMAL_CRITICAL, + INT3400_THERMAL_ADAPTIVE_PERFORMANCE, + INT3400_THERMAL_EMERGENCY_CALL_MODE, + INT3400_THERMAL_PASSIVE_2, + INT3400_THERMAL_POWER_BOSS, + INT3400_THERMAL_VIRTUAL_SENSOR, + INT3400_THERMAL_COOLING_MODE, + INT3400_THERMAL_HARDWARE_DUTY_CYCLING, INT3400_THERMAL_MAXIMUM_UUID, }; @@ -29,6 +36,13 @@ static char *int3400_thermal_uuids[INT3400_THERMAL_MAXIMUM_UUID] = { "42A441D6-AE6A-462b-A84B-4A8CE79027D3", "3A95C389-E4B8-4629-A526-C52C88626BAE", "97C68AE7-15FA-499c-B8C9-5DA81D606E0A", + "63BE270F-1C11-48FD-A6F7-3AF253FF3E2D", + "5349962F-71E6-431D-9AE8-0A635B710AEE", + "9E04115A-AE87-4D1C-9500-0F3E340BFE75", + "F5A35014-C209-46A4-993A-EB56DE7530A1", + "6ED722A7-9240-48A5-B479-31EEF723D7CF", + "16CAF1B7-DD38-40ED-B1C1-1B8A1913D531", + "BE84BABF-C4D4-403D-B495-3128FD44dAC1", }; struct int3400_thermal_priv { From 396ee4d0cd52c13b3f6421b8d324d65da5e7e409 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Wed, 10 Oct 2018 01:30:07 -0700 Subject: [PATCH 593/855] thermal/int340x_thermal: fix mode setting int3400 only pushes the UUID into the firmware when the mode is flipped to "enable". The current code only exposes the mode flag if the firmware supports the PASSIVE_1 UUID, which not all machines do. Remove the restriction. 
Signed-off-by: Matthew Garrett Signed-off-by: Zhang Rui --- drivers/thermal/intel/int340x_thermal/int3400_thermal.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/thermal/intel/int340x_thermal/int3400_thermal.c b/drivers/thermal/intel/int340x_thermal/int3400_thermal.c index e0f39cacbc18..5f3ed24e26ec 100644 --- a/drivers/thermal/intel/int340x_thermal/int3400_thermal.c +++ b/drivers/thermal/intel/int340x_thermal/int3400_thermal.c @@ -313,10 +313,9 @@ static int int3400_thermal_probe(struct platform_device *pdev) platform_set_drvdata(pdev, priv); - if (priv->uuid_bitmap & 1 << INT3400_THERMAL_PASSIVE_1) { - int3400_thermal_ops.get_mode = int3400_thermal_get_mode; - int3400_thermal_ops.set_mode = int3400_thermal_set_mode; - } + int3400_thermal_ops.get_mode = int3400_thermal_get_mode; + int3400_thermal_ops.set_mode = int3400_thermal_set_mode; + priv->thermal = thermal_zone_device_register("INT3400 Thermal", 0, 0, priv, &int3400_thermal_ops, &int3400_thermal_params, 0, 0); From 684b73245cd4d2608f4f2214f6bff02ba6ceca5f Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Fri, 15 Mar 2019 11:05:10 +0800 Subject: [PATCH 594/855] blk-mq: use blk_mq_sched_mark_restart_hctx to set RESTART Let blk_mq_mark_tag_wait() use the blk_mq_sched_mark_restart_hctx() to set BLK_MQ_S_SCHED_RESTART. Signed-off-by: Yufen Yu Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index a9c181603cbd..ea01c23b58a3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1093,8 +1093,7 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, bool ret; if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) { - if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); + blk_mq_sched_mark_restart_hctx(hctx); /* * It's possible that a tag was freed in the window between the From f7c8a4120eedf24c36090b7542b179ff7a649219 Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Mon, 18 Mar 2019 20:23:17 +0800 Subject: [PATCH 595/855] loop: access lo_backing_file only when the loop device is Lo_bound Commit 758a58d0bc67 ("loop: set GENHD_FL_NO_PART_SCAN after blkdev_reread_part()") separates "lo->lo_backing_file = NULL" and "lo->lo_state = Lo_unbound" into different critical regions protected by loop_ctl_mutex. However, there is below race that the NULL lo->lo_backing_file would be accessed when the backend of a loop is another loop device, e.g., loop0's backend is a file, while loop1's backend is loop0. loop0's backend is file loop1's backend is loop0 __loop_clr_fd() mutex_lock(&loop_ctl_mutex); lo->lo_backing_file = NULL; --> set to NULL mutex_unlock(&loop_ctl_mutex); loop_set_fd() mutex_lock_killable(&loop_ctl_mutex); loop_validate_file() f = l->lo_backing_file; --> NULL access if loop0 is not Lo_unbound mutex_lock(&loop_ctl_mutex); lo->lo_state = Lo_unbound; mutex_unlock(&loop_ctl_mutex); lo->lo_backing_file should be accessed only when the loop device is Lo_bound. In fact, the problem has been introduced already in commit 7ccd0791d985 ("loop: Push loop_ctl_mutex down into loop_clr_fd()") after which loop_validate_file() could see devices in Lo_rundown state with which it did not count. It was harmless at that point but still. 
Fixes: 7ccd0791d985 ("loop: Push loop_ctl_mutex down into loop_clr_fd()") Reported-by: syzbot+9bdc1adc1c55e7fe765b@syzkaller.appspotmail.com Signed-off-by: Dongli Zhang Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/block/loop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 1e6edd568214..bf1c61cab8eb 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -656,7 +656,7 @@ static int loop_validate_file(struct file *file, struct block_device *bdev) return -EBADF; l = f->f_mapping->host->i_bdev->bd_disk->private_data; - if (l->lo_state == Lo_unbound) { + if (l->lo_state != Lo_bound) { return -EINVAL; } f = l->lo_backing_file;
From fb4d83f293e072ad96848959d20664e2a9f2235b Mon Sep 17 00:00:00 2001 From: Pi-Hsun Shih Date: Wed, 9 Jan 2019 13:57:24 +0800 Subject: [PATCH 596/855] thermal: mtk: Allocate enough space for mtk_thermal. The mtk_thermal struct contains a 'struct mtk_thermal_bank banks[];', but the allocation only allocates sizeof(struct mtk_thermal) bytes, which causes out-of-bounds access through the ->banks[] member. Change it to a fixed-size array instead. Signed-off-by: Pi-Hsun Shih Reviewed-by: Daniel Lezcano Signed-off-by: Zhang Rui --- drivers/thermal/mtk_thermal.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/mtk_thermal.c b/drivers/thermal/mtk_thermal.c index 5c07a61447d3..e4ea7f6aef20 100644 --- a/drivers/thermal/mtk_thermal.c +++ b/drivers/thermal/mtk_thermal.c @@ -199,6 +199,9 @@ enum { #define MT7622_TS1 0 #define MT7622_NUM_CONTROLLER 1 +/* The maximum number of banks */ +#define MAX_NUM_ZONES 8 + /* The calibration coefficient of sensor */ #define MT7622_CALIBRATION 165 @@ -249,7 +252,7 @@ struct mtk_thermal_data { const int num_controller; const int *controller_offset; bool need_switch_bank; - struct thermal_bank_cfg bank_data[]; + struct thermal_bank_cfg bank_data[MAX_NUM_ZONES]; }; struct mtk_thermal { @@ -268,7 +271,7 @@ struct mtk_thermal { s32 vts[MAX_NUM_VTS]; const struct mtk_thermal_data *conf; - struct mtk_thermal_bank banks[]; + struct mtk_thermal_bank banks[MAX_NUM_ZONES]; }; /* MT8183 thermal sensor data */
From e925b5be5751f6a7286bbd9a4cbbc4ac90cc5fa6 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 18 Mar 2019 22:26:33 +0800 Subject: [PATCH 597/855] thermal/intel_powerclamp: fix truncated kthread name A kthread name only allows 15 characters (TASK_COMM_LEN is 16). Thus rename the kthreads created by the intel_powerclamp driver from "kidle_inject/ + decimal cpuid" to "kidle_inj/ + decimal cpuid" to avoid truncated kthread names for cpu 100 and later.
Signed-off-by: Zhang Rui --- drivers/thermal/intel/intel_powerclamp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thermal/intel/intel_powerclamp.c b/drivers/thermal/intel/intel_powerclamp.c index 7571f7c2e7c9..9434f4eb421a 100644 --- a/drivers/thermal/intel/intel_powerclamp.c +++ b/drivers/thermal/intel/intel_powerclamp.c @@ -494,7 +494,7 @@ static void start_power_clamp_worker(unsigned long cpu) struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu); struct kthread_worker *worker; - worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inject/%ld", cpu); + worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inj/%ld", cpu); if (IS_ERR(worker)) return; From 29b0b5d56589d66bd5793f1e09211ce7d7d3cd36 Mon Sep 17 00:00:00 2001 From: Alin Nastac Date: Mon, 11 Mar 2019 17:18:42 +0100 Subject: [PATCH 598/855] netfilter: nf_conntrack_sip: remove direct dependency on IPv6 Previous implementation was not usable with CONFIG_IPV6=m. Fixes: a3419ce3356c ("netfilter: nf_conntrack_sip: add sip_external_media logic") Signed-off-by: Alin Nastac Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_sip.c | 37 ++++++++++++++------------------ 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index f067c6b50857..39fcc1ed18f3 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -20,9 +20,9 @@ #include #include #include +#include +#include -#include -#include #include #include #include @@ -871,38 +871,33 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff, } else if (sip_external_media) { struct net_device *dev = skb_dst(skb)->dev; struct net *net = dev_net(dev); - struct rtable *rt; - struct flowi4 fl4 = {}; -#if IS_ENABLED(CONFIG_IPV6) - struct flowi6 fl6 = {}; -#endif + struct flowi fl; struct dst_entry *dst = NULL; + memset(&fl, 0, sizeof(fl)); + switch (nf_ct_l3num(ct)) { case NFPROTO_IPV4: - fl4.daddr = daddr->ip; - rt = ip_route_output_key(net, &fl4); - if (!IS_ERR(rt)) - dst = &rt->dst; + fl.u.ip4.daddr = daddr->ip; + nf_ip_route(net, &dst, &fl, false); break; -#if IS_ENABLED(CONFIG_IPV6) case NFPROTO_IPV6: - fl6.daddr = daddr->in6; - dst = ip6_route_output(net, NULL, &fl6); - if (dst->error) { - dst_release(dst); - dst = NULL; - } + fl.u.ip6.daddr = daddr->in6; + nf_ip6_route(net, &dst, &fl, false); break; -#endif } /* Don't predict any conntracks when media endpoint is reachable * through the same interface as the signalling peer. */ - if (dst && dst->dev == dev) - return NF_ACCEPT; + if (dst) { + bool external_media = (dst->dev == dev); + + dst_release(dst); + if (external_media) + return NF_ACCEPT; + } } /* We need to check whether the registration exists before attempting From 05b7639da55f5555b9866a1f4b7e8995232a6323 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 12 Mar 2019 12:10:59 +0100 Subject: [PATCH 599/855] netfilter: nft_set_rbtree: check for inactive element after flag mismatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Otherwise, we hit bogus ENOENT when removing elements. 
Fixes: e701001e7cbe ("netfilter: nft_rbtree: allow adjacent intervals with dynamic updates") Reported-by: Václav Zindulka Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_rbtree.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index fa61208371f8..321a0036fdf5 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -308,10 +308,6 @@ static void *nft_rbtree_deactivate(const struct net *net, else if (d > 0) parent = parent->rb_right; else { - if (!nft_set_elem_active(&rbe->ext, genmask)) { - parent = parent->rb_left; - continue; - } if (nft_rbtree_interval_end(rbe) && !nft_rbtree_interval_end(this)) { parent = parent->rb_left; @@ -320,6 +316,9 @@ static void *nft_rbtree_deactivate(const struct net *net, nft_rbtree_interval_end(this)) { parent = parent->rb_right; continue; + } else if (!nft_set_elem_active(&rbe->ext, genmask)) { + parent = parent->rb_left; + continue; } nft_rbtree_flush(net, set, rbe); return rbe; From e166e4fdaced850bee3d5ee12a5740258fb30587 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 13 Mar 2019 16:33:29 +0800 Subject: [PATCH 600/855] netfilter: bridge: set skb transport_header before entering NF_INET_PRE_ROUTING Since Commit 21d1196a35f5 ("ipv4: set transport header earlier"), skb->transport_header has been always set before entering INET netfilter. This patch is to set skb->transport_header for bridge before entering INET netfilter by bridge-nf-call-iptables. It also fixes an issue that sctp_error() couldn't compute a right csum due to unset skb->transport_header. Fixes: e6d8b64b34aa ("net: sctp: fix and consolidate SCTP checksumming code") Reported-by: Li Shuang Suggested-by: Pablo Neira Ayuso Signed-off-by: Xin Long Acked-by: Neil Horman Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_netfilter_hooks.c | 1 + net/bridge/br_netfilter_ipv6.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 9d34de68571b..22afa566cbce 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -502,6 +502,7 @@ static unsigned int br_nf_pre_routing(void *priv, nf_bridge->ipv4_daddr = ip_hdr(skb)->daddr; skb->protocol = htons(ETH_P_IP); + skb->transport_header = skb->network_header + ip_hdr(skb)->ihl * 4; NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, state->net, state->sk, skb, skb->dev, NULL, diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c index 564710f88f93..e88d6641647b 100644 --- a/net/bridge/br_netfilter_ipv6.c +++ b/net/bridge/br_netfilter_ipv6.c @@ -235,6 +235,8 @@ unsigned int br_nf_pre_routing_ipv6(void *priv, nf_bridge->ipv6_daddr = ipv6_hdr(skb)->daddr; skb->protocol = htons(ETH_P_IPV6); + skb->transport_header = skb->network_header + sizeof(struct ipv6hdr); + NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, state->net, state->sk, skb, skb->dev, NULL, br_nf_pre_routing_finish_ipv6); From d1fa381033eb718df5c602f64b6e88676138dfc6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 13 Mar 2019 22:15:59 +0100 Subject: [PATCH 601/855] netfilter: fix NETFILTER_XT_TARGET_TEE dependencies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With NETFILTER_XT_TARGET_TEE=y and IP6_NF_IPTABLES=m, we get a link error when referencing the NF_DUP_IPV6 module: net/netfilter/xt_TEE.o: In function `tee_tg6': xt_TEE.c:(.text+0x14): undefined reference to `nf_dup_ipv6' The problem 
here is the 'select NF_DUP_IPV6 if IP6_NF_IPTABLES' that forces NF_DUP_IPV6 to be =m as well rather than setting it to =y as was intended here. Adding a soft dependency on IP6_NF_IPTABLES avoids that broken configuration. Fixes: 5d400a4933e8 ("netfilter: Kconfig: Change select IPv6 dependencies") Cc: Máté Eckl Cc: Taehee Yoo Link: https://patchwork.ozlabs.org/patch/999498/ Link: https://lore.kernel.org/patchwork/patch/960062/ Reported-by: Randy Dunlap Reported-by: Stephen Rothwell Signed-off-by: Arnd Bergmann Signed-off-by: Pablo Neira Ayuso --- net/netfilter/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index d43ffb09939b..6548271209a0 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -1007,6 +1007,7 @@ config NETFILTER_XT_TARGET_TEE depends on NETFILTER_ADVANCED depends on IPV6 || IPV6=n depends on !NF_CONNTRACK || NF_CONNTRACK + depends on IP6_NF_IPTABLES || !IP6_NF_IPTABLES select NF_DUP_IPV4 select NF_DUP_IPV6 if IP6_NF_IPTABLES ---help--- From 6d65561f3d5ec933151939c543d006b79044e7a6 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Thu, 14 Mar 2019 02:58:18 -0500 Subject: [PATCH 602/855] netfilter: ip6t_srh: fix NULL pointer dereferences skb_header_pointer may return NULL. The current code dereference its return values without a NULL check. The fix inserts the checks to avoid NULL pointer dereferences. Fixes: 202a8ff545cc ("netfilter: add IPv6 segment routing header 'srh' match") Signed-off-by: Kangjie Lu Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/ip6t_srh.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/ipv6/netfilter/ip6t_srh.c b/net/ipv6/netfilter/ip6t_srh.c index 1059894a6f4c..4cb83fb69844 100644 --- a/net/ipv6/netfilter/ip6t_srh.c +++ b/net/ipv6/netfilter/ip6t_srh.c @@ -210,6 +210,8 @@ static bool srh1_mt6(const struct sk_buff *skb, struct xt_action_param *par) psidoff = srhoff + sizeof(struct ipv6_sr_hdr) + ((srh->segments_left + 1) * sizeof(struct in6_addr)); psid = skb_header_pointer(skb, psidoff, sizeof(_psid), &_psid); + if (!psid) + return false; if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_PSID, ipv6_masked_addr_cmp(psid, &srhinfo->psid_msk, &srhinfo->psid_addr))) @@ -223,6 +225,8 @@ static bool srh1_mt6(const struct sk_buff *skb, struct xt_action_param *par) nsidoff = srhoff + sizeof(struct ipv6_sr_hdr) + ((srh->segments_left - 1) * sizeof(struct in6_addr)); nsid = skb_header_pointer(skb, nsidoff, sizeof(_nsid), &_nsid); + if (!nsid) + return false; if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_NSID, ipv6_masked_addr_cmp(nsid, &srhinfo->nsid_msk, &srhinfo->nsid_addr))) @@ -233,6 +237,8 @@ static bool srh1_mt6(const struct sk_buff *skb, struct xt_action_param *par) if (srhinfo->mt_flags & IP6T_SRH_LSID) { lsidoff = srhoff + sizeof(struct ipv6_sr_hdr); lsid = skb_header_pointer(skb, lsidoff, sizeof(_lsid), &_lsid); + if (!lsid) + return false; if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LSID, ipv6_masked_addr_cmp(lsid, &srhinfo->lsid_msk, &srhinfo->lsid_addr))) From 8ffcd32f64633926163cdd07a7d295c500a947d1 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 14 Mar 2019 10:50:20 +0100 Subject: [PATCH 603/855] netfilter: nf_tables: bogus EBUSY in helper removal from transaction Proper use counter updates when activating and deactivating the object, otherwise, this hits bogus EBUSY error. 
Fixes: cd5125d8f518 ("netfilter: nf_tables: split set destruction in deactivate and destroy phase") Reported-by: Laura Garcia Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_objref.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index 457a9ceb46af..8dfa798ea683 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -65,21 +65,34 @@ nla_put_failure: return -1; } -static void nft_objref_destroy(const struct nft_ctx *ctx, - const struct nft_expr *expr) +static void nft_objref_deactivate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + enum nft_trans_phase phase) { struct nft_object *obj = nft_objref_priv(expr); + if (phase == NFT_TRANS_COMMIT) + return; + obj->use--; } +static void nft_objref_activate(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_object *obj = nft_objref_priv(expr); + + obj->use++; +} + static struct nft_expr_type nft_objref_type; static const struct nft_expr_ops nft_objref_ops = { .type = &nft_objref_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_object *)), .eval = nft_objref_eval, .init = nft_objref_init, - .destroy = nft_objref_destroy, + .activate = nft_objref_activate, + .deactivate = nft_objref_deactivate, .dump = nft_objref_dump, }; From 74710e05906c37da6b436386dc13c44dbe5d8308 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 15 Mar 2019 17:21:01 +0100 Subject: [PATCH 604/855] netfilter: nft_redir: fix module autoload with ip4 AF_INET4 does not exist. Fixes: c78efc99c750 ("netfilter: nf_tables: nat: merge nft_redir protocol specific modules)" Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_redir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c index f8092926f704..a340cd8a751b 100644 --- a/net/netfilter/nft_redir.c +++ b/net/netfilter/nft_redir.c @@ -233,5 +233,5 @@ module_exit(nft_redir_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Arturo Borrero Gonzalez "); -MODULE_ALIAS_NFT_AF_EXPR(AF_INET4, "redir"); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "redir"); MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "redir"); From d3ca4651d05c0ff7259d087d8c949bcf3e14fb46 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 11 Mar 2019 15:04:18 +0100 Subject: [PATCH 605/855] udf: Fix crash on IO error during truncate When truncate(2) hits IO error when reading indirect extent block the code just bugs with: kernel BUG at linux-4.15.0/fs/udf/truncate.c:249! ... Fix the problem by bailing out cleanly in case of IO error. CC: stable@vger.kernel.org Reported-by: jean-luc malet Signed-off-by: Jan Kara --- fs/udf/truncate.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index b647f0bd150c..94220ba85628 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -260,6 +260,9 @@ void udf_truncate_extents(struct inode *inode) epos.block = eloc; epos.bh = udf_tread(sb, udf_get_lb_pblock(sb, &eloc, 0)); + /* Error reading indirect block? */ + if (!epos.bh) + return; if (elen) indirect_ext_len = (elen + sb->s_blocksize - 1) >> From 2b42be5eb24564227b15e66f54f088e5a26549c7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 11 Mar 2019 15:27:02 +0100 Subject: [PATCH 606/855] udf: Propagate errors from udf_truncate_extents() Make udf_truncate_extents() properly propagate errors to its callers and let udf_setsize() handle the error properly as well. This lets userspace know in case there's some error when truncating blocks. 
Signed-off-by: Jan Kara --- fs/udf/inode.c | 4 +++- fs/udf/truncate.c | 7 ++++--- fs/udf/udfdecl.h | 2 +- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/udf/inode.c b/fs/udf/inode.c index ae796e10f68b..e7276932e433 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1242,8 +1242,10 @@ set_size: truncate_setsize(inode, newsize); down_write(&iinfo->i_data_sem); udf_clear_extent_cache(inode); - udf_truncate_extents(inode); + err = udf_truncate_extents(inode); up_write(&iinfo->i_data_sem); + if (err) + return err; } update_time: inode->i_mtime = inode->i_ctime = current_time(inode); diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index 94220ba85628..63a47f1e1d52 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -199,7 +199,7 @@ static void udf_update_alloc_ext_desc(struct inode *inode, * for making file shorter. For making file longer, udf_extend_file() has to * be used. */ -void udf_truncate_extents(struct inode *inode) +int udf_truncate_extents(struct inode *inode) { struct extent_position epos; struct kernel_lb_addr eloc, neloc = {}; @@ -224,7 +224,7 @@ void udf_truncate_extents(struct inode *inode) if (etype == -1) { /* We should extend the file? */ WARN_ON(byte_offset); - return; + return 0; } epos.offset -= adsize; extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset); @@ -262,7 +262,7 @@ void udf_truncate_extents(struct inode *inode) udf_get_lb_pblock(sb, &eloc, 0)); /* Error reading indirect block? */ if (!epos.bh) - return; + return -EIO; if (elen) indirect_ext_len = (elen + sb->s_blocksize - 1) >> @@ -286,4 +286,5 @@ void udf_truncate_extents(struct inode *inode) iinfo->i_lenExtents = inode->i_size; brelse(epos.bh); + return 0; } diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index ee246769dee4..d89ef71887fc 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -235,7 +235,7 @@ extern struct inode *udf_new_inode(struct inode *, umode_t); /* truncate.c */ extern void udf_truncate_tail_extent(struct inode *); extern void udf_discard_prealloc(struct inode *); -extern void udf_truncate_extents(struct inode *); +extern int udf_truncate_extents(struct inode *); /* balloc.c */ extern void udf_free_blocks(struct super_block *, struct inode *, From f01a7dbe98ae4265023fa5d3af0f076f0b18a647 Mon Sep 17 00:00:00 2001 From: Martynas Pumputis Date: Mon, 18 Mar 2019 16:10:26 +0100 Subject: [PATCH 607/855] bpf: Try harder when allocating memory for large maps It has been observed that sometimes a higher order memory allocation for BPF maps fails when there is no obvious memory pressure in a system. E.g. the map (BPF_MAP_TYPE_LRU_HASH, key=38, value=56, max_elems=524288) could not be created due to vmalloc unable to allocate 75497472B, when the system's memory consumption (in MB) was the following: Total: 3942 Used: 837 (21.24%) Free: 138 Buffers: 239 Cached: 2727 Later analysis [1] by Michal Hocko showed that the vmalloc was not trying to reclaim memory from the page cache and was failing prematurely due to __GFP_NORETRY. Considering dcda9b0471 ("mm, tree wide: replace __GFP_REPEAT by __GFP_RETRY_MAYFAIL with more useful semantic") and [1], we can replace __GFP_NORETRY with __GFP_RETRY_MAYFAIL, as it won't invoke OOM killer and will try harder to fulfil allocation requests. 
Unfortunately, replacing the body of the BPF map memory allocation function with the kvmalloc_node helper function is not an option at this point in time, given 1) kmalloc is non-optional for higher order allocations, and 2) passing __GFP_RETRY_MAYFAIL to the kmalloc would stress the slab allocator too much for large requests. The change has been tested with the workloads mentioned above and by observing oom_kill value from /proc/vmstat. [1]: https://lore.kernel.org/bpf/20190310071318.GW5232@dhcp22.suse.cz/ Signed-off-by: Martynas Pumputis Acked-by: Yonghong Song Cc: Michal Hocko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20190318153940.GL8924@dhcp22.suse.cz/ --- kernel/bpf/syscall.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 62f6bced3a3c..afca36f53c49 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -136,21 +136,29 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) void *bpf_map_area_alloc(size_t size, int numa_node) { - /* We definitely need __GFP_NORETRY, so OOM killer doesn't - * trigger under memory pressure as we really just want to - * fail instead. + /* We really just want to fail instead of triggering OOM killer + * under memory pressure, therefore we set __GFP_NORETRY to kmalloc, + * which is used for lower order allocation requests. + * + * It has been observed that higher order allocation requests done by + * vmalloc with __GFP_NORETRY being set might fail due to not trying + * to reclaim memory from the page cache, thus we set + * __GFP_RETRY_MAYFAIL to avoid such situations. */ - const gfp_t flags = __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO; + + const gfp_t flags = __GFP_NOWARN | __GFP_ZERO; void *area; if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - area = kmalloc_node(size, GFP_USER | flags, numa_node); + area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags, + numa_node); if (area != NULL) return area; } - return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | flags, - __builtin_return_address(0)); + return __vmalloc_node_flags_caller(size, numa_node, + GFP_KERNEL | __GFP_RETRY_MAYFAIL | + flags, __builtin_return_address(0)); } void bpf_map_area_free(void *area) From 6a1afffb08ce5f9fb9ccc20f7ab24846c0142984 Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Sun, 17 Mar 2019 14:46:53 +0100 Subject: [PATCH 608/855] selinux: fix NULL dereference in policydb_destroy() The conversion to kvmalloc() forgot to account for the possibility that p->type_attr_map_array might be null in policydb_destroy(). Fix this by destroying its contents only if it is not NULL. Also make sure ebitmap_init() is called on all entries before policydb_destroy() can be called. Right now this is a no-op, because both kvcalloc() and ebitmap_init() just zero out the whole struct, but let's rather not rely on a specific implementation. 
Reported-by: syzbot+a57b2aff60832666fc28@syzkaller.appspotmail.com Fixes: acdf52d97f82 ("selinux: convert to kvmalloc") Signed-off-by: Ondrej Mosnacek Acked-by: Stephen Smalley Signed-off-by: Paul Moore --- security/selinux/ss/policydb.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c index 6b576e588725..daecdfb15a9c 100644 --- a/security/selinux/ss/policydb.c +++ b/security/selinux/ss/policydb.c @@ -828,9 +828,11 @@ void policydb_destroy(struct policydb *p) hashtab_map(p->range_tr, range_tr_destroy, NULL); hashtab_destroy(p->range_tr); - for (i = 0; i < p->p_types.nprim; i++) - ebitmap_destroy(&p->type_attr_map_array[i]); - kvfree(p->type_attr_map_array); + if (p->type_attr_map_array) { + for (i = 0; i < p->p_types.nprim; i++) + ebitmap_destroy(&p->type_attr_map_array[i]); + kvfree(p->type_attr_map_array); + } ebitmap_destroy(&p->filename_trans_ttypes); ebitmap_destroy(&p->policycaps); @@ -2496,10 +2498,13 @@ int policydb_read(struct policydb *p, void *fp) if (!p->type_attr_map_array) goto bad; + /* just in case ebitmap_init() becomes more than just a memset(0): */ + for (i = 0; i < p->p_types.nprim; i++) + ebitmap_init(&p->type_attr_map_array[i]); + for (i = 0; i < p->p_types.nprim; i++) { struct ebitmap *e = &p->type_attr_map_array[i]; - ebitmap_init(e); if (p->policyvers >= POLICYDB_VERSION_AVTAB) { rc = ebitmap_read(e, fp); if (rc) From fd6fab2cb78d3b6023c26ec53e0aa6f0b477d2f7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 14 Mar 2019 16:30:06 -0600 Subject: [PATCH 609/855] io_uring: retry bulk slab allocs as single allocs I've seen cases where bulk alloc fails, since the bulk alloc API is all-or-nothing - either we get the number we ask for, or it returns 0 as number of entries. If we fail a batch bulk alloc, retry a "normal" kmem_cache_alloc() and just use that instead of failing with -EAGAIN. While in there, ensure we use GFP_KERNEL. That was an oversight in the original code, when we switched away from GFP_ATOMIC. Signed-off-by: Jens Axboe --- fs/io_uring.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 12bb238aed6b..4c6a5e60ddbe 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -399,13 +399,14 @@ static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs) static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, struct io_submit_state *state) { + gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct io_kiocb *req; if (!percpu_ref_tryget(&ctx->refs)) return NULL; if (!state) { - req = kmem_cache_alloc(req_cachep, __GFP_NOWARN); + req = kmem_cache_alloc(req_cachep, gfp); if (unlikely(!req)) goto out; } else if (!state->free_reqs) { @@ -413,10 +414,18 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, int ret; sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs)); - ret = kmem_cache_alloc_bulk(req_cachep, __GFP_NOWARN, sz, - state->reqs); - if (unlikely(ret <= 0)) - goto out; + ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs); + + /* + * Bulk alloc is all-or-nothing. If we fail to get a batch, + * retry single alloc to be on the safe side. 
+ */ + if (unlikely(ret <= 0)) { + state->reqs[0] = kmem_cache_alloc(req_cachep, gfp); + if (!state->reqs[0]) + goto out; + ret = 1; + } state->free_reqs = ret - 1; state->cur_req = 1; req = state->reqs[0];
From bf33a7699e992b12d4c7d39dc3f0b61f6b26c5c2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 14 Mar 2019 15:22:18 -0600 Subject: [PATCH 610/855] io_uring: mark me as the maintainer And io_uring as maintained in general. Signed-off-by: Jens Axboe --- MAINTAINERS | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 17b59b66474b..a90137af48c2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8091,6 +8091,16 @@ F: include/linux/iommu.h F: include/linux/of_iommu.h F: include/linux/iova.h +IO_URING +M: Jens Axboe +L: linux-block@vger.kernel.org +L: linux-fsdevel@vger.kernel.org +T: git git://git.kernel.dk/linux-block +T: git git://git.kernel.dk/liburing +S: Maintained +F: fs/io_uring.c +F: include/uapi/linux/io_uring.h + IP MASQUERADING M: Juanjo Ciarlante S: Maintained
From 875f1d0769cdcfe1596ff0ca609b453359e42ec9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 27 Feb 2019 13:05:25 -0700 Subject: [PATCH 611/855] iov_iter: add ITER_BVEC_FLAG_NO_REF flag For ITER_BVEC, if we're holding on to kernel pages, the caller doesn't need to grab a reference to the bvec pages, and drop that same reference on IO completion. This is essentially safe for any ITER_BVEC, but some use cases end up reusing pages and unconditionally dropping a page reference on completion. An example of that is sendfile(2), which ends up being a splice_in + splice_out on the pipe pages. Add a flag that tells us it's fine to not grab a page reference to the bvec pages, since that caller knows not to drop a reference when it's done with the pages. Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +++ include/linux/uio.h | 24 +++++++++++++++++++----- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 4c6a5e60ddbe..c592a0933b0d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -855,6 +855,9 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw, iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len); if (offset) iov_iter_advance(iter, offset); + + /* don't drop a reference to these pages */ + iter->type |= ITER_BVEC_FLAG_NO_REF; return 0; } diff --git a/include/linux/uio.h b/include/linux/uio.h index ecf584f6b82d..4e926641fa80 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -23,14 +23,23 @@ struct kvec { }; enum iter_type { - ITER_IOVEC = 0, - ITER_KVEC = 2, - ITER_BVEC = 4, - ITER_PIPE = 8, - ITER_DISCARD = 16, + /* set if ITER_BVEC doesn't hold a bv_page ref */ + ITER_BVEC_FLAG_NO_REF = 2, + + /* iter types */ + ITER_IOVEC = 4, + ITER_KVEC = 8, + ITER_BVEC = 16, + ITER_PIPE = 32, + ITER_DISCARD = 64, }; struct iov_iter { + /* + * Bit 0 is the read/write bit, set if we're writing. + * Bit 1 is the BVEC_FLAG_NO_REF bit, set if type is a bvec and + * the caller isn't expecting to drop a page reference when done. + */ unsigned int type; size_t iov_offset; size_t count; @@ -84,6 +93,11 @@ static inline unsigned char iov_iter_rw(const struct iov_iter *i) return i->type & (READ | WRITE); } +static inline bool iov_iter_bvec_no_ref(const struct iov_iter *i) +{ + return (i->type & ITER_BVEC_FLAG_NO_REF) != 0; +} + /* * Total number of bytes covered by an iovec.
* From 399254aaf4892113c806816f7e64cf40c804d46d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 27 Feb 2019 13:13:23 -0700 Subject: [PATCH 612/855] block: add BIO_NO_PAGE_REF flag If bio_iov_iter_get_pages() is called on an iov_iter that is flagged with NO_REF, then we don't need to add a page reference for the pages that we add. Add BIO_NO_PAGE_REF to track this in the bio, so IO completion knows not to drop a reference to these pages. Signed-off-by: Jens Axboe --- block/bio.c | 43 ++++++++++++++++++++++----------------- fs/block_dev.c | 12 ++++++----- fs/iomap.c | 12 ++++++----- include/linux/blk_types.h | 1 + 4 files changed, 39 insertions(+), 29 deletions(-) diff --git a/block/bio.c b/block/bio.c index 71a78d9fb8b7..b64cedc7f87c 100644 --- a/block/bio.c +++ b/block/bio.c @@ -849,20 +849,14 @@ static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter) size = bio_add_page(bio, bv->bv_page, len, bv->bv_offset + iter->iov_offset); if (size == len) { - struct page *page; - int i; + if (!bio_flagged(bio, BIO_NO_PAGE_REF)) { + struct page *page; + int i; + + mp_bvec_for_each_page(page, bv, i) + get_page(page); + } - /* - * For the normal O_DIRECT case, we could skip grabbing this - * reference and then not have to put them again when IO - * completes. But this breaks some in-kernel users, like - * splicing to/from a loop device, where we release the pipe - * pages unconditionally. If we can fix that case, we can - * get rid of the get here and the need to call - * bio_release_pages() at IO completion time. - */ - mp_bvec_for_each_page(page, bv, i) - get_page(page); iov_iter_advance(iter, size); return 0; } @@ -925,10 +919,12 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) * This takes either an iterator pointing to user memory, or one pointing to * kernel pages (BVEC iterator). If we're adding user pages, we pin them and * map them into the kernel. On IO completion, the caller should put those - * pages. For now, when adding kernel pages, we still grab a reference to the - * page. This isn't strictly needed for the common case, but some call paths - * end up releasing pages from eg a pipe and we can't easily control these. - * See comment in __bio_iov_bvec_add_pages(). + * pages. If we're adding kernel pages, and the caller told us it's safe to + * do so, we just have to add the pages to the bio directly. We don't grab an + * extra reference to those pages (the user should already have that), and we + * don't put the page on IO completion. The caller needs to check if the bio is + * flagged BIO_NO_PAGE_REF on IO completion. If it isn't, then pages should be + * released. * * The function tries, but does not guarantee, to pin as many pages as * fit into the bio, or are requested in *iter, whatever is smaller. If @@ -940,6 +936,13 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) const bool is_bvec = iov_iter_is_bvec(iter); unsigned short orig_vcnt = bio->bi_vcnt; + /* + * If this is a BVEC iter, then the pages are kernel pages. Don't + * release them on IO completion, if the caller asked us to. 
+ */ + if (is_bvec && iov_iter_bvec_no_ref(iter)) + bio_set_flag(bio, BIO_NO_PAGE_REF); + do { int ret; @@ -1696,7 +1699,8 @@ static void bio_dirty_fn(struct work_struct *work) next = bio->bi_private; bio_set_pages_dirty(bio); - bio_release_pages(bio); + if (!bio_flagged(bio, BIO_NO_PAGE_REF)) + bio_release_pages(bio); bio_put(bio); } } @@ -1713,7 +1717,8 @@ void bio_check_pages_dirty(struct bio *bio) goto defer; } - bio_release_pages(bio); + if (!bio_flagged(bio, BIO_NO_PAGE_REF)) + bio_release_pages(bio); bio_put(bio); return; defer: diff --git a/fs/block_dev.c b/fs/block_dev.c index e9faa52bb489..78d3257435c0 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -336,12 +336,14 @@ static void blkdev_bio_end_io(struct bio *bio) if (should_dirty) { bio_check_pages_dirty(bio); } else { - struct bio_vec *bvec; - int i; - struct bvec_iter_all iter_all; + if (!bio_flagged(bio, BIO_NO_PAGE_REF)) { + struct bvec_iter_all iter_all; + struct bio_vec *bvec; + int i; - bio_for_each_segment_all(bvec, bio, i, iter_all) - put_page(bvec->bv_page); + bio_for_each_segment_all(bvec, bio, i, iter_all) + put_page(bvec->bv_page); + } bio_put(bio); } } diff --git a/fs/iomap.c b/fs/iomap.c index 97cb9d486a7d..abdd18e404f8 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1589,12 +1589,14 @@ static void iomap_dio_bio_end_io(struct bio *bio) if (should_dirty) { bio_check_pages_dirty(bio); } else { - struct bio_vec *bvec; - int i; - struct bvec_iter_all iter_all; + if (!bio_flagged(bio, BIO_NO_PAGE_REF)) { + struct bvec_iter_all iter_all; + struct bio_vec *bvec; + int i; - bio_for_each_segment_all(bvec, bio, i, iter_all) - put_page(bvec->bv_page); + bio_for_each_segment_all(bvec, bio, i, iter_all) + put_page(bvec->bv_page); + } bio_put(bio); } } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d66bf5f32610..791fee35df88 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -215,6 +215,7 @@ struct bio { /* * bio flags */ +#define BIO_NO_PAGE_REF 0 /* don't put release vec pages */ #define BIO_SEG_VALID 1 /* bi_phys_segments valid */ #define BIO_CLONED 2 /* doesn't own data */ #define BIO_BOUNCED 3 /* bio is a bounce bio */ From 3028efe03be9c8c4cd7923f0f3c39b2871cc8a8f Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 18 Mar 2019 17:00:28 +0000 Subject: [PATCH 613/855] NFS: Fix nfs4_lock_state refcounting in nfs4_alloc_{lock,unlock}data() Commit 7b587e1a5a6c ("NFS: use locks_copy_lock() to copy locks.") changed the lock copying from memcpy() to the dedicated locks_copy_lock() function. The latter correctly increments the nfs4_lock_state.ls_count via nfs4_fl_copy_lock(), however, this refcount has already been incremented in the nfs4_alloc_{lock,unlock}data(). Kmemleak subsequently reports an unreferenced nfs4_lock_state object as below (arm64 platform): unreferenced object 0xffff8000fce0b000 (size 256): comm "systemd-sysuser", pid 1608, jiffies 4294892825 (age 32.348s) hex dump (first 32 bytes): 20 57 4c fb 00 80 ff ff 20 57 4c fb 00 80 ff ff WL..... WL..... 00 57 4c fb 00 80 ff ff 01 00 00 00 00 00 00 00 .WL............. 
backtrace: [<000000000d15010d>] kmem_cache_alloc+0x178/0x208 [<00000000d7c1d264>] nfs4_set_lock_state+0x124/0x1f0 [<000000009c867628>] nfs4_proc_lock+0x90/0x478 [<000000001686bd74>] do_setlk+0x64/0xe8 [<00000000e01500d4>] nfs_lock+0xe8/0x1f0 [<000000004f387d8d>] vfs_lock_file+0x18/0x40 [<00000000656ab79b>] do_lock_file_wait+0x68/0xf8 [<00000000f17c4a4b>] fcntl_setlk+0x224/0x280 [<0000000052a242c6>] do_fcntl+0x418/0x730 [<000000004f47291a>] __arm64_sys_fcntl+0x84/0xd0 [<00000000d6856e01>] el0_svc_common+0x80/0xf0 [<000000009c4bd1df>] el0_svc_handler+0x2c/0x80 [<00000000b1a0d479>] el0_svc+0x8/0xc [<0000000056c62a0f>] 0xffffffffffffffff This patch removes the original refcount_inc(&lsp->ls_count) that was paired with the memcpy() lock copying. Fixes: 7b587e1a5a6c ("NFS: use locks_copy_lock() to copy locks.") Cc: # 5.0.x- Cc: NeilBrown Signed-off-by: Catalin Marinas Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4dbb0ee23432..6d2812a39287 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6301,7 +6301,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, p->arg.seqid = seqid; p->res.seqid = seqid; p->lsp = lsp; - refcount_inc(&lsp->ls_count); /* Ensure we don't close file until we're done freeing locks! */ p->ctx = get_nfs_open_context(ctx); p->l_ctx = nfs_get_lock_context(ctx); @@ -6526,7 +6525,6 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, p->res.lock_seqid = p->arg.lock_seqid; p->lsp = lsp; p->server = server; - refcount_inc(&lsp->ls_count); p->ctx = get_nfs_open_context(ctx); locks_init_lock(&p->fl); locks_copy_lock(&p->fl, fl); From 6f8f89ce1e18de1e391c9c1c14e7738881d1c00c Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Mon, 11 Mar 2019 09:58:38 -0600 Subject: [PATCH 614/855] platform/chrome: Fix locking pattern in wilco_ec_mailbox() Before, ec->data_buffer could be written to from multiple contexts at the same time. Since the ec is shared data, it needs to be inside the mutex as well. Fixes: 7b3d4f44abf0 ("platform/chrome: Add new driver for Wilco EC") Signed-off-by: Nick Crews Signed-off-by: Enric Balletbo i Serra Signed-off-by: Benson Leung --- drivers/platform/chrome/wilco_ec/mailbox.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/chrome/wilco_ec/mailbox.c b/drivers/platform/chrome/wilco_ec/mailbox.c index f6ff29a11f1a..14355668ddfa 100644 --- a/drivers/platform/chrome/wilco_ec/mailbox.c +++ b/drivers/platform/chrome/wilco_ec/mailbox.c @@ -223,11 +223,11 @@ int wilco_ec_mailbox(struct wilco_ec_device *ec, struct wilco_ec_message *msg) msg->command, msg->type, msg->flags, msg->response_size, msg->request_size); + mutex_lock(&ec->mailbox_lock); /* Prepare request packet */ rq = ec->data_buffer; wilco_ec_prepare(msg, rq); - mutex_lock(&ec->mailbox_lock); ret = wilco_ec_transfer(ec, msg, rq); mutex_unlock(&ec->mailbox_lock); From 3897b6f0a859288c22fb793fad11ec2327e60fcd Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Thu, 14 Mar 2019 08:56:28 +0100 Subject: [PATCH 615/855] btrfs: raid56: properly unmap parity page in finish_parity_scrub() Parity page is incorrectly unmapped in finish_parity_scrub(), triggering a reference counter bug on i386, i.e.: [ 157.662401] kernel BUG at mm/highmem.c:349! 
[ 157.666725] invalid opcode: 0000 [#1] SMP PTI The reason is that kunmap(p_page) was completely left out, so we never did an unmap for the p_page and the loop unmapping the rbio page was iterating over the wrong number of stripes: unmapping should be done with nr_data instead of rbio->real_stripes. Test case to reproduce the bug: - create a raid5 btrfs filesystem: # mkfs.btrfs -m raid5 -d raid5 /dev/sdb /dev/sdc /dev/sdd /dev/sde - mount it: # mount /dev/sdb /mnt - run btrfs scrub in a loop: # while :; do btrfs scrub start -BR /mnt; done BugLink: https://bugs.launchpad.net/bugs/1812845 Fixes: 5a6ac9eacb49 ("Btrfs, raid56: support parity scrub on raid56") CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Johannes Thumshirn Signed-off-by: Andrea Righi Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index e74455eb42f9..6976e2280771 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -2429,8 +2429,9 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, bitmap_clear(rbio->dbitmap, pagenr, 1); kunmap(p); - for (stripe = 0; stripe < rbio->real_stripes; stripe++) + for (stripe = 0; stripe < nr_data; stripe++) kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); + kunmap(p_page); } __free_page(p_page); From 16d80c54ad42c573a897ae7bcf5a9816be54e6fe Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 15 Mar 2019 14:50:04 +0100 Subject: [PATCH 616/855] rbd: set io_min, io_opt and discard_granularity to alloc_size Now that we have alloc_size that controls our discard behavior, it doesn't make sense to have these set to object (set) size. alloc_size defaults to 64k, but because discard_granularity is likely 4M, only ranges that are equal to or bigger than 4M can be considered during fstrim. A smaller io_min is also more likely to be met, resulting in fewer deferred writes on bluestore OSDs. Signed-off-by: Ilya Dryomov Reviewed-by: Jason Dillaman --- drivers/block/rbd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4ba967d65cf9..3b2c9289dccb 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -833,7 +833,7 @@ static int parse_rbd_opts_token(char *c, void *private) pctx->opts->queue_depth = intval; break; case Opt_alloc_size: - if (intval < 1) { + if (intval < SECTOR_SIZE) { pr_err("alloc_size out of range\n"); return -EINVAL; } @@ -4203,12 +4203,12 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) q->limits.max_sectors = queue_max_hw_sectors(q); blk_queue_max_segments(q, USHRT_MAX); blk_queue_max_segment_size(q, UINT_MAX); - blk_queue_io_min(q, objset_bytes); - blk_queue_io_opt(q, objset_bytes); + blk_queue_io_min(q, rbd_dev->opts->alloc_size); + blk_queue_io_opt(q, rbd_dev->opts->alloc_size); if (rbd_dev->opts->trim) { blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); - q->limits.discard_granularity = objset_bytes; + q->limits.discard_granularity = rbd_dev->opts->alloc_size; blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT); blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT); } From 165aa2bfb42904b1bec4bf2fa257c8c603c14a06 Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Mon, 28 Jan 2019 15:24:42 +0100 Subject: [PATCH 617/855] scsi: iscsi: flush running unbind operations when removing a session In some cases, the iscsi_remove_session() function is called while an unbind_work operation is still running. 
This may cause a situation where sysfs objects are removed in an incorrect order, triggering a kernel warning. [ 605.249442] ------------[ cut here ]------------ [ 605.259180] sysfs group 'power' not found for kobject 'target2:0:0' [ 605.321371] WARNING: CPU: 1 PID: 26794 at fs/sysfs/group.c:235 sysfs_remove_group+0x76/0x80 [ 605.341266] Modules linked in: dm_service_time target_core_user target_core_pscsi target_core_file target_core_iblock iscsi_target_mod target_core_mod nls_utf8 isofs ppdev bochs_drm nfit ttm libnvdimm drm_kms_helper syscopyarea sysfillrect sysimgblt joydev pcspkr fb_sys_fops drm i2c_piix4 sg parport_pc parport xfs libcrc32c dm_multipath sr_mod sd_mod cdrom ata_generic 8021q garp mrp ata_piix stp crct10dif_pclmul crc32_pclmul llc libata crc32c_intel virtio_net net_failover ghash_clmulni_intel serio_raw failover sunrpc dm_mirror dm_region_hash dm_log dm_mod be2iscsi bnx2i cnic uio cxgb4i cxgb4 libcxgbi libcxgb qla4xxx iscsi_boot_sysfs iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi [ 605.627479] CPU: 1 PID: 26794 Comm: kworker/u32:2 Not tainted 4.18.0-60.el8.x86_64 #1 [ 605.721401] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20180724_192412-buildhw-07.phx2.fedoraproject.org-1.fc29 04/01/2014 [ 605.823651] Workqueue: scsi_wq_2 __iscsi_unbind_session [scsi_transport_iscsi] [ 605.830940] RIP: 0010:sysfs_remove_group+0x76/0x80 [ 605.922907] Code: 48 89 df 5b 5d 41 5c e9 38 c4 ff ff 48 89 df e8 e0 bf ff ff eb cb 49 8b 14 24 48 8b 75 00 48 c7 c7 38 73 cb a7 e8 24 77 d7 ff <0f> 0b 5b 5d 41 5c c3 0f 1f 00 0f 1f 44 00 00 41 56 41 55 41 54 55 [ 606.122304] RSP: 0018:ffffbadcc8d1bda8 EFLAGS: 00010286 [ 606.218492] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 [ 606.326381] RDX: ffff98bdfe85eb40 RSI: ffff98bdfe856818 RDI: ffff98bdfe856818 [ 606.514498] RBP: ffffffffa7ab73e0 R08: 0000000000000268 R09: 0000000000000007 [ 606.529469] R10: 0000000000000000 R11: ffffffffa860d9ad R12: ffff98bdf978e838 [ 606.630535] R13: ffff98bdc2cd4010 R14: ffff98bdc2cd3ff0 R15: ffff98bdc2cd4000 [ 606.824707] FS: 0000000000000000(0000) GS:ffff98bdfe840000(0000) knlGS:0000000000000000 [ 607.018333] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 607.117844] CR2: 00007f84b78ac024 CR3: 000000002c00a003 CR4: 00000000003606e0 [ 607.117844] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 607.420926] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 607.524236] Call Trace: [ 607.530591] device_del+0x56/0x350 [ 607.624393] ? ata_tlink_match+0x30/0x30 [libata] [ 607.727805] ? attribute_container_device_trigger+0xb4/0xf0 [ 607.829911] scsi_target_reap_ref_release+0x39/0x50 [ 607.928572] scsi_remove_target+0x1a2/0x1d0 [ 608.017350] __iscsi_unbind_session+0xb3/0x160 [scsi_transport_iscsi] [ 608.117435] process_one_work+0x1a7/0x360 [ 608.132917] worker_thread+0x30/0x390 [ 608.222900] ? pwq_unbound_release_workfn+0xd0/0xd0 [ 608.323989] kthread+0x112/0x130 [ 608.418318] ? kthread_bind+0x30/0x30 [ 608.513821] ret_from_fork+0x35/0x40 [ 608.613909] ---[ end trace 0b98c310c8a6138c ]--- Signed-off-by: Maurizio Lombardi Acked-by: Chris Leech Signed-off-by: Martin K. 
Petersen --- drivers/scsi/scsi_transport_iscsi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 0508831d6fb9..0a82e93566dc 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -2200,6 +2200,8 @@ void iscsi_remove_session(struct iscsi_cls_session *session) scsi_target_unblock(&session->dev, SDEV_TRANSPORT_OFFLINE); /* flush running scans then delete devices */ flush_work(&session->scan_work); + /* flush running unbind operations */ + flush_work(&session->unbind_work); __iscsi_unbind_session(&session->unbind_work); /* hw iscsi may not have removed all connections from session */ From 6e0473633af059a559ce7b4cbaa51e389c94085e Mon Sep 17 00:00:00 2001 From: Thomas Preston Date: Wed, 6 Mar 2019 20:06:18 +0000 Subject: [PATCH 618/855] drm/i915/bios: assume eDP is present on port A when there is no VBT We rely on VBT DDI port info for eDP detection on GEN9 platforms and above. This breaks GEN9 platforms which don't have VBT because port A eDP now defaults to false. Fix this by defaulting to true when VBT is missing. Fixes: a98d9c1d7e9b ("drm/i915/ddi: Rely on VBT DDI port info for eDP detection") Signed-off-by: Thomas Preston Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20190306200618.17405-1-thomas.preston@codethink.co.uk (cherry picked from commit 2131bc0ced6088648e47f126566c3da58b07e4ef) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/intel_bios.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/i915/intel_bios.c b/drivers/gpu/drm/i915/intel_bios.c index b508d8a735e0..4364f42cac6b 100644 --- a/drivers/gpu/drm/i915/intel_bios.c +++ b/drivers/gpu/drm/i915/intel_bios.c @@ -1673,6 +1673,7 @@ init_vbt_missing_defaults(struct drm_i915_private *dev_priv) info->supports_dvi = (port != PORT_A && port != PORT_E); info->supports_hdmi = info->supports_dvi; info->supports_dp = (port != PORT_E); + info->supports_edp = (port == PORT_A); } } From 65f26e978d7c55c3c3d04296058d95cf7b6e3f14 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 15 Mar 2019 16:39:33 +0000 Subject: [PATCH 619/855] drm/i915: Fix off-by-one in reporting hanging process ffs() is 1-indexed, but we want to use it as an index into an array, so use __ffs() instead. 
Fixes: eb8d0f5af4ec ("drm/i915: Remove GPU reset dependence on struct_mutex") Signed-off-by: Chris Wilson Cc: Mika Kuoppala Reviewed-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20190315163933.19352-1-chris@chris-wilson.co.uk (cherry picked from commit 9073e5b26743b8b675cc44a9c0c8f8c3d584e1c0) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/i915_gpu_error.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 9a65341fec09..aa6791255252 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -1721,7 +1721,7 @@ error_msg(struct i915_gpu_state *error, unsigned long engines, const char *msg) i915_error_generate_code(error, engines)); if (engines) { /* Just show the first executing process, more is confusing */ - i = ffs(engines); + i = __ffs(engines); len += scnprintf(error->error_msg + len, sizeof(error->error_msg) - len, ", in %s [%d]", From 000c4f90e3f0194eef218ff2c6a8fd8ca1de4313 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 14 Mar 2019 07:58:29 +0000 Subject: [PATCH 620/855] drm/i915: Sanity check mmap length against object size We assumed that vm_mmap() would reject an attempt to mmap past the end of the filp (our object), but we were wrong. Applications that tried to use the mmap beyond the end of the object would be greeted by a SIGBUS. After this patch, those applications will be told about the error on creating the mmap, rather than at a random moment on later access. Reported-by: Antonio Argenziano Testcase: igt/gem_mmap/bad-size Signed-off-by: Chris Wilson Cc: Antonio Argenziano Cc: Joonas Lahtinen Cc: Tvrtko Ursulin Cc: stable@vger.kernel.org Reviewed-by: Tvrtko Ursulin Reviewed-by: Joonas Lahtinen Link: https://patchwork.freedesktop.org/patch/msgid/20190314075829.16838-1-chris@chris-wilson.co.uk (cherry picked from commit 794a11cb67201ad1bb61af510bb8460280feb3f3) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/i915_gem.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 30d516e975c6..8558e81fdc2a 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -1734,8 +1734,13 @@ i915_gem_mmap_ioctl(struct drm_device *dev, void *data, * pages from. */ if (!obj->base.filp) { - i915_gem_object_put(obj); - return -ENXIO; + addr = -ENXIO; + goto err; + } + + if (range_overflows(args->offset, args->size, (u64)obj->base.size)) { + addr = -EINVAL; + goto err; } addr = vm_mmap(obj->base.filp, 0, args->size, @@ -1749,8 +1754,8 @@ i915_gem_mmap_ioctl(struct drm_device *dev, void *data, struct vm_area_struct *vma; if (down_write_killable(&mm->mmap_sem)) { - i915_gem_object_put(obj); - return -EINTR; + addr = -EINTR; + goto err; } vma = find_vma(mm, addr); if (vma && __vma_matches(vma, obj->base.filp, addr, args->size)) @@ -1768,12 +1773,10 @@ i915_gem_mmap_ioctl(struct drm_device *dev, void *data, i915_gem_object_put(obj); args->addr_ptr = (u64)addr; - return 0; err: i915_gem_object_put(obj); - return addr; } From 4a9be28c45bf02fa0436808bb6c0baeba30e120e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 19 Mar 2019 11:33:24 +1100 Subject: [PATCH 621/855] NFS: fix mount/umount race in nlmclnt. If the last NFSv3 unmount from a given host races with a mount from the same host, we can destroy an nlm_host that is still in use. 
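In outline, the race and the fix look like this (a simplified sketch using the lockd identifiers, not the actual code paths):

    /*
     * Buggy shape: the final decrement happens outside nlm_host_mutex, so a
     * concurrent nlmclnt_lookup_host() - which searches the cache under that
     * mutex - can still find the host and take a new reference after
     * refcount_dec_and_test() has already returned true:
     *
     *     if (refcount_dec_and_test(&host->h_count)) {
     *             mutex_lock(&nlm_host_mutex);
     *             nlm_destroy_host_locked(host);  // frees a host now in use
     *             mutex_unlock(&nlm_host_mutex);
     *     }
     *
     * Fixed shape: refcount_dec_and_mutex_lock() only returns true once the
     * count has reached zero with the mutex already held, so a lookup can no
     * longer slip in between the decrement and the destroy:
     */
    if (refcount_dec_and_mutex_lock(&host->h_count, &nlm_host_mutex)) {
        nlm_destroy_host_locked(host);
        mutex_unlock(&nlm_host_mutex);
    }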
Specifically nlmclnt_lookup_host() can increment h_count on an nlm_host that nlmclnt_release_host() has just successfully called refcount_dec_and_test() on. Once nlmclnt_lookup_host() drops the mutex, nlm_destroy_host_lock() will be called to destroy the nlmclnt which is now in use again. The cause of the problem is that the dec_and_test happens outside the locked region. This is easily fixed by using refcount_dec_and_mutex_lock(). Fixes: 8ea6ecc8b075 ("lockd: Create client-side nlm_host cache") Cc: stable@vger.kernel.org (v2.6.38+) Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust --- fs/lockd/host.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 93fb7cf0b92b..f0b5c987d6ae 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -290,12 +290,11 @@ void nlmclnt_release_host(struct nlm_host *host) WARN_ON_ONCE(host->h_server); - if (refcount_dec_and_test(&host->h_count)) { + if (refcount_dec_and_mutex_lock(&host->h_count, &nlm_host_mutex)) { WARN_ON_ONCE(!list_empty(&host->h_lockowners)); WARN_ON_ONCE(!list_empty(&host->h_granted)); WARN_ON_ONCE(!list_empty(&host->h_reclaim)); - mutex_lock(&nlm_host_mutex); nlm_destroy_host_locked(host); mutex_unlock(&nlm_host_mutex); } From 744c67ffeb06f2d2493f4049ba0bd19698ce0adf Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Tue, 19 Mar 2019 09:28:43 +0800 Subject: [PATCH 622/855] ALSA: hda - Don't trigger jackpoll_work in azx_resume The commit 3baffc4a84d7 (ALSA: hda/intel: Refactoring PM code) changed the behaviour of azx_resume(), it triggers the jackpoll_work after applying this commit. This change introduced a new issue, all codecs are runtime active after S3, and will not call runtime_suspend() automatically. The root cause is the jackpoll_work calls snd_hda_power_up/down_pm, and it calls up_pm before snd_hdac_enter_pm is called, while calls the down_pm in the middle of enter_pm and leave_pm is called. This makes the dev->power.usage_count unbalanced after S3. To fix it, let azx_resume() don't trigger jackpoll_work as before it did. 
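For reference, the balanced runtime-PM idiom that such a poll function normally relies on looks roughly like this (hypothetical helper using the standard pm_runtime API; the point is only that every get must see a matching, properly accounted put):

    #include <linux/pm_runtime.h>

    static void poll_jack_state(struct device *dev)
    {
        /* usage_count++ and resume the device if needed */
        if (pm_runtime_get_sync(dev) < 0) {
            pm_runtime_put_noidle(dev);     /* undo the count on error */
            return;
        }

        /* ... read jack/pin state while the device is powered ... */

        pm_runtime_mark_last_busy(dev);
        pm_runtime_put_autosuspend(dev);    /* usage_count-- */
    }

The bug described above is essentially this pairing going wrong: the get is accounted normally, but the matching put lands inside the enter_pm/leave_pm window where it is not, so usage_count stays elevated after S3.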
Fixes: 3baffc4a84d7 ("ALSA: hda/intel: Refactoring PM code") Signed-off-by: Hui Wang Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_intel.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index 4f502c92061f..ece256a3b48f 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -947,7 +947,7 @@ static void __azx_runtime_suspend(struct azx *chip) display_power(chip, false); } -static void __azx_runtime_resume(struct azx *chip) +static void __azx_runtime_resume(struct azx *chip, bool from_rt) { struct hda_intel *hda = container_of(chip, struct hda_intel, chip); struct hdac_bus *bus = azx_bus(chip); @@ -964,7 +964,7 @@ static void __azx_runtime_resume(struct azx *chip) azx_init_pci(chip); hda_intel_init_chip(chip, true); - if (status) { + if (status && from_rt) { list_for_each_codec(codec, &chip->bus) if (status & (1 << codec->addr)) schedule_delayed_work(&codec->jackpoll_work, @@ -1016,7 +1016,7 @@ static int azx_resume(struct device *dev) chip->msi = 0; if (azx_acquire_irq(chip, 1) < 0) return -EIO; - __azx_runtime_resume(chip); + __azx_runtime_resume(chip, false); snd_power_change_state(card, SNDRV_CTL_POWER_D0); trace_azx_resume(chip); @@ -1081,7 +1081,7 @@ static int azx_runtime_resume(struct device *dev) chip = card->private_data; if (!azx_has_pm_runtime(chip)) return 0; - __azx_runtime_resume(chip); + __azx_runtime_resume(chip, true); /* disable controller Wake Up event*/ azx_writew(chip, WAKEEN, azx_readw(chip, WAKEEN) & From b5a236c175b0d984552a5f7c9d35141024c2b261 Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Tue, 19 Mar 2019 09:28:44 +0800 Subject: [PATCH 623/855] ALSA: hda - Enforces runtime_resume after S3 and S4 for each codec Recently we found the audio jack detection stop working after suspend on many machines with Realtek codec. Sometimes the audio selection dialogue didn't show up after users plugged headhphone/headset into the headset jack, sometimes after uses plugged headphone/headset, then click the sound icon on the upper-right corner of gnome-desktop, it also showed the speaker rather than the headphone. The root cause is that before suspend, the codec already call the runtime_suspend since this codec is not used by any apps, then in resume, it will not call runtime_resume for this codec. But for some realtek codec (so far, alc236, alc255 and alc891) with the specific BIOS, if it doesn't run runtime_resume after suspend, all codec functions including jack detection stop working anymore. This problem existed for a long time, but it was not exposed, that is because when problem happens, if users play sound or open sound-setting to check audio device, this will trigger calling to runtime_resume (via snd_hda_power_up), then the codec starts working again before users notice this problem. Since we don't know how many codec and BIOS combinations have this problem, to fix it, let the driver call runtime_resume for all codecs in pm_resume, maybe for some codecs, this is not needed, but it is harmless. After a codec is runtime resumed, if it is not used by any apps, it will be runtime suspended soon and furthermore we don't run suspend frequently, this change will not add much power consumption. 
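The mechanism the fix relies on (visible in the diff below) can be summarized as a small wrapper around the standard runtime-PM helpers; this sketch restates the idiom with a generic name:

    #include <linux/pm_runtime.h>

    static int force_runtime_resume(struct device *dev)
    {
        int ret;

        /*
         * Bump the usage count without resuming, so that
         * pm_runtime_force_resume() considers the device "in use" and
         * really powers it up even if it was runtime-suspended before
         * the system went to sleep.
         */
        pm_runtime_get_noresume(dev);
        ret = pm_runtime_force_resume(dev);
        pm_runtime_put(dev);    /* let it runtime-suspend again later */
        return ret;
    }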
Fixes: cc72da7d4d06 ("ALSA: hda - Use standard runtime PM for codec power-save control") Signed-off-by: Hui Wang Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_codec.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/sound/pci/hda/hda_codec.c b/sound/pci/hda/hda_codec.c index 5f2005098a60..ec0b8595eb4d 100644 --- a/sound/pci/hda/hda_codec.c +++ b/sound/pci/hda/hda_codec.c @@ -2939,6 +2939,20 @@ static int hda_codec_runtime_resume(struct device *dev) #endif /* CONFIG_PM */ #ifdef CONFIG_PM_SLEEP +static int hda_codec_force_resume(struct device *dev) +{ + int ret; + + /* The get/put pair below enforces the runtime resume even if the + * device hasn't been used at suspend time. This trick is needed to + * update the jack state change during the sleep. + */ + pm_runtime_get_noresume(dev); + ret = pm_runtime_force_resume(dev); + pm_runtime_put(dev); + return ret; +} + static int hda_codec_pm_suspend(struct device *dev) { dev->power.power_state = PMSG_SUSPEND; @@ -2948,7 +2962,7 @@ static int hda_codec_pm_suspend(struct device *dev) static int hda_codec_pm_resume(struct device *dev) { dev->power.power_state = PMSG_RESUME; - return pm_runtime_force_resume(dev); + return hda_codec_force_resume(dev); } static int hda_codec_pm_freeze(struct device *dev) @@ -2960,13 +2974,13 @@ static int hda_codec_pm_freeze(struct device *dev) static int hda_codec_pm_thaw(struct device *dev) { dev->power.power_state = PMSG_THAW; - return pm_runtime_force_resume(dev); + return hda_codec_force_resume(dev); } static int hda_codec_pm_restore(struct device *dev) { dev->power.power_state = PMSG_RESTORE; - return pm_runtime_force_resume(dev); + return hda_codec_force_resume(dev); } #endif /* CONFIG_PM_SLEEP */ From b2d22b6bb33aac10c415e4ba13c8eade201c6f09 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 12 Mar 2019 12:42:37 +0100 Subject: [PATCH 624/855] fanotify: Allow copying of file handle to userspace When file handle is embedded inside fanotify_event and usercopy checks are enabled, we get a warning like: Bad or missing usercopy whitelist? Kernel memory exposure attempt detected from SLAB object 'fanotify_event' (offset 40, size 8)! WARNING: CPU: 1 PID: 7649 at mm/usercopy.c:78 usercopy_warn+0xeb/0x110 mm/usercopy.c:78 Annotate handling in fanotify_event properly to mark copying it to userspace is fine. Reported-by: syzbot+2c49971e251e36216d1f@syzkaller.appspotmail.com Fixes: a8b13aa20afb ("fanotify: enable FAN_REPORT_FID init flag") Signed-off-by: Kees Cook Reviewed-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify_user.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 56992b32c6bb..a90bb19dcfa2 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -208,6 +208,7 @@ static int copy_fid_to_user(struct fanotify_event *event, char __user *buf) { struct fanotify_event_info_fid info = { }; struct file_handle handle = { }; + unsigned char bounce[FANOTIFY_INLINE_FH_LEN], *fh; size_t fh_len = event->fh_len; size_t len = fanotify_event_info_len(event); @@ -233,7 +234,16 @@ static int copy_fid_to_user(struct fanotify_event *event, char __user *buf) buf += sizeof(handle); len -= sizeof(handle); - if (copy_to_user(buf, fanotify_event_fh(event), fh_len)) + /* + * For an inline fh, copy through stack to exclude the copy from + * usercopy hardening protections. 
+ */ + fh = fanotify_event_fh(event); + if (fh_len <= FANOTIFY_INLINE_FH_LEN) { + memcpy(bounce, fh, fh_len); + fh = bounce; + } + if (copy_to_user(buf, fh, fh_len)) return -EFAULT; /* Pad with 0's */ From a23314e9d88d89d49e69db08f60b7caa470f04e1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Mar 2019 09:32:02 +0100 Subject: [PATCH 625/855] sched/cpufreq: Fix 32-bit math overflow Vincent Wang reported that get_next_freq() has a mult overflow bug on 32-bit platforms in the IOWAIT boost case, since in that case {util,max} are in freq units instead of capacity units. Solve this by moving the IOWAIT boost to capacity units. And since this means @max is constant; simplify the code. Reported-by: Vincent Wang Tested-by: Vincent Wang Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rafael J. Wysocki Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Chunyan Zhang Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J. Wysocki Cc: Rik van Riel Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190305083202.GU32494@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq_schedutil.c | 59 ++++++++++++++------------------ 1 file changed, 25 insertions(+), 34 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 033ec7c45f13..1ccf77f6d346 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -48,10 +48,10 @@ struct sugov_cpu { bool iowait_boost_pending; unsigned int iowait_boost; - unsigned int iowait_boost_max; u64 last_update; unsigned long bw_dl; + unsigned long min; unsigned long max; /* The field below is for single-CPU policies only: */ @@ -303,8 +303,7 @@ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, if (delta_ns <= TICK_NSEC) return false; - sg_cpu->iowait_boost = set_iowait_boost - ? sg_cpu->sg_policy->policy->min : 0; + sg_cpu->iowait_boost = set_iowait_boost ? sg_cpu->min : 0; sg_cpu->iowait_boost_pending = set_iowait_boost; return true; @@ -344,14 +343,13 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, /* Double the boost at each request */ if (sg_cpu->iowait_boost) { - sg_cpu->iowait_boost <<= 1; - if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max) - sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; + sg_cpu->iowait_boost = + min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE); return; } /* First wakeup after IO: start with minimum boost */ - sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min; + sg_cpu->iowait_boost = sg_cpu->min; } /** @@ -373,47 +371,38 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, * This mechanism is designed to boost high frequently IO waiting tasks, while * being more conservative on tasks which does sporadic IO operations. 
*/ -static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, - unsigned long *util, unsigned long *max) +static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, + unsigned long util, unsigned long max) { - unsigned int boost_util, boost_max; + unsigned long boost; /* No boost currently required */ if (!sg_cpu->iowait_boost) - return; + return util; /* Reset boost if the CPU appears to have been idle enough */ if (sugov_iowait_reset(sg_cpu, time, false)) - return; + return util; - /* - * An IO waiting task has just woken up: - * allow to further double the boost value - */ - if (sg_cpu->iowait_boost_pending) { - sg_cpu->iowait_boost_pending = false; - } else { + if (!sg_cpu->iowait_boost_pending) { /* - * Otherwise: reduce the boost value and disable it when we - * reach the minimum. + * No boost pending; reduce the boost value. */ sg_cpu->iowait_boost >>= 1; - if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) { + if (sg_cpu->iowait_boost < sg_cpu->min) { sg_cpu->iowait_boost = 0; - return; + return util; } } + sg_cpu->iowait_boost_pending = false; + /* - * Apply the current boost value: a CPU is boosted only if its current - * utilization is smaller then the current IO boost level. + * @util is already in capacity scale; convert iowait_boost + * into the same scale so we can compare. */ - boost_util = sg_cpu->iowait_boost; - boost_max = sg_cpu->iowait_boost_max; - if (*util * boost_max < *max * boost_util) { - *util = boost_util; - *max = boost_max; - } + boost = (sg_cpu->iowait_boost * max) >> SCHED_CAPACITY_SHIFT; + return max(boost, util); } #ifdef CONFIG_NO_HZ_COMMON @@ -460,7 +449,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, util = sugov_get_util(sg_cpu); max = sg_cpu->max; - sugov_iowait_apply(sg_cpu, time, &util, &max); + util = sugov_iowait_apply(sg_cpu, time, util, max); next_f = get_next_freq(sg_policy, util, max); /* * Do not reduce the frequency if the CPU has not been idle @@ -500,7 +489,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) j_util = sugov_get_util(j_sg_cpu); j_max = j_sg_cpu->max; - sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max); + j_util = sugov_iowait_apply(j_sg_cpu, time, j_util, j_max); if (j_util * max > j_max * util) { util = j_util; @@ -837,7 +826,9 @@ static int sugov_start(struct cpufreq_policy *policy) memset(sg_cpu, 0, sizeof(*sg_cpu)); sg_cpu->cpu = cpu; sg_cpu->sg_policy = sg_policy; - sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; + sg_cpu->min = + (SCHED_CAPACITY_SCALE * policy->cpuinfo.min_freq) / + policy->cpuinfo.max_freq; } for_each_cpu(cpu, policy->cpus) { From 4c47acd824aaaa8fc6dc519fb4e08d1522105b7a Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 6 Mar 2019 20:11:42 +0300 Subject: [PATCH 626/855] sched/core: Fix buffer overflow in cgroup2 property cpu.max Add limit into sscanf format string for on-stack buffer. Signed-off-by: Konstantin Khlebnikov Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. 
Peter Anvin Cc: Johannes Weiner Cc: Li Zefan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Fixes: 0d5936344f30 ("sched: Implement interface for cgroup unified hierarchy") Link: https://lkml.kernel.org/r/155189230232.2620.13120481613524200065.stgit@buzz Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6b2c055564b5..b7a4afdc33cb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6943,7 +6943,7 @@ static int __maybe_unused cpu_period_quota_parse(char *buf, { char tok[21]; /* U64_MAX */ - if (!sscanf(buf, "%s %llu", tok, periodp)) + if (sscanf(buf, "%20s %llu", tok, periodp) < 1) return -EINVAL; *periodp *= NSEC_PER_USEC; From e25a7a944f1936b5134b7ee06bc432fc701e4aa3 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 11 Feb 2019 17:59:44 +0000 Subject: [PATCH 627/855] sched/fair: Comment some nohz_balancer_kick() kick conditions We now have a comment explaining the first sched_domain based NOHZ kick, so might as well comment them all. While at it, unwrap a line that fits under 80 characters. Co-authored-by: Peter Zijlstra Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dietmar.Eggemann@arm.com Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: morten.rasmussen@arm.com Cc: vincent.guittot@linaro.org Link: https://lkml.kernel.org/r/20190211175946.4961-2-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8213ff6e365d..e6f7d39d4d45 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9612,8 +9612,12 @@ static void nohz_balancer_kick(struct rq *rq) sd = rcu_dereference(rq->sd); if (sd) { - if ((rq->cfs.h_nr_running >= 1) && - check_cpu_capacity(rq, sd)) { + /* + * If there's a CFS task and the current CPU has reduced + * capacity; kick the ILB to see if there's a better CPU to run + * on. + */ + if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) { flags = NOHZ_KICK_MASK; goto unlock; } @@ -9621,6 +9625,11 @@ static void nohz_balancer_kick(struct rq *rq) sd = rcu_dereference(per_cpu(sd_asym_packing, cpu)); if (sd) { + /* + * When ASYM_PACKING; see if there's a more preferred CPU + * currently idle; in which case, kick the ILB to move tasks + * around. + */ for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { if (sched_asym_prefer(i, cpu)) { flags = NOHZ_KICK_MASK; From a0fe2cf086aef213d1b4bca1b1291a3dee8357c9 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 11 Feb 2019 17:59:45 +0000 Subject: [PATCH 628/855] sched/fair: Tune down misfit NOHZ kicks In this commit: 3b1baa6496e6 ("sched/fair: Add 'group_misfit_task' load-balance type") we set rq->misfit_task_load whenever the current running task has a utilization greater than 80% of rq->cpu_capacity. A non-zero value in this field enables misfit load balancing. However, if the task being looked at is already running on a CPU of highest capacity, there's nothing more we can do for it. We can currently spot this in update_sd_pick_busiest(), which prevents us from selecting a sched_group of group_type == group_misfit_task as the busiest group, but we don't do any of that in nohz_balancer_kick(). 
This means that we could repeatedly kick NOHZ CPUs when there's no improvements in terms of load balance to be done. Introduce a check_misfit_status() helper that returns true iff there is a CPU in the system that could give more CPU capacity to a rq's misfit task - IOW, there exists a CPU of higher capacity_orig or the rq's CPU is severely pressured by rt/IRQ. Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dietmar.Eggemann@arm.com Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: morten.rasmussen@arm.com Cc: vincent.guittot@linaro.org Link: https://lkml.kernel.org/r/20190211175946.4961-3-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e6f7d39d4d45..f0d2f8a352bf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8058,6 +8058,18 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) (rq->cpu_capacity_orig * 100)); } +/* + * Check whether a rq has a misfit task and if it looks like we can actually + * help that task: we can migrate the task to a CPU of higher capacity, or + * the task's current CPU is heavily pressured. + */ +static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) +{ + return rq->misfit_task_load && + (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity || + check_cpu_capacity(rq, sd)); +} + /* * Group imbalance indicates (and tries to solve) the problem where balancing * groups is inadequate due to ->cpus_allowed constraints. @@ -9585,7 +9597,7 @@ static void nohz_balancer_kick(struct rq *rq) if (time_before(now, nohz.next_balance)) goto out; - if (rq->nr_running >= 2 || rq->misfit_task_load) { + if (rq->nr_running >= 2) { flags = NOHZ_KICK_MASK; goto out; } @@ -9623,6 +9635,18 @@ static void nohz_balancer_kick(struct rq *rq) } } + sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu)); + if (sd) { + /* + * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU + * to run the misfit task on. + */ + if (check_misfit_status(rq, sd)) { + flags = NOHZ_KICK_MASK; + goto unlock; + } + } + sd = rcu_dereference(per_cpu(sd_asym_packing, cpu)); if (sd) { /* From b9a7b8831600afc51c9ba52c05f12db2266f01c7 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 11 Feb 2019 17:59:46 +0000 Subject: [PATCH 629/855] sched/fair: Skip LLC NOHZ logic for asymmetric systems The LLC NOHZ condition will become true as soon as >=2 CPUs in a single LLC domain are busy. On big.LITTLE systems, this translates to two or more CPUs of a "cluster" (big or LITTLE) being busy. Issuing a NOHZ kick in these conditions isn't desired for asymmetric systems, as if the busy CPUs can provide enough compute capacity to the running tasks, then we can leave the NOHZ CPUs in peace. Skip the LLC NOHZ condition for asymmetric systems, and rely on nr_running & capacity checks to trigger NOHZ kicks when the system actually needs them. Suggested-by: Morten Rasmussen Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dietmar.Eggemann@arm.com Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: vincent.guittot@linaro.org Link: https://lkml.kernel.org/r/20190211175946.4961-4-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 69 +++++++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 30 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f0d2f8a352bf..51003e1c794d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9603,24 +9603,6 @@ static void nohz_balancer_kick(struct rq *rq) } rcu_read_lock(); - sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); - if (sds) { - /* - * If there is an imbalance between LLC domains (IOW we could - * increase the overall cache use), we need some less-loaded LLC - * domain to pull some load. Likewise, we may need to spread - * load within the current LLC domain (e.g. packed SMT cores but - * other CPUs are idle). We can't really know from here how busy - * the others are - so just get a nohz balance going if it looks - * like this LLC domain has tasks we could move. - */ - nr_busy = atomic_read(&sds->nr_busy_cpus); - if (nr_busy > 1) { - flags = NOHZ_KICK_MASK; - goto unlock; - } - - } sd = rcu_dereference(rq->sd); if (sd) { @@ -9635,18 +9617,6 @@ static void nohz_balancer_kick(struct rq *rq) } } - sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu)); - if (sd) { - /* - * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU - * to run the misfit task on. - */ - if (check_misfit_status(rq, sd)) { - flags = NOHZ_KICK_MASK; - goto unlock; - } - } - sd = rcu_dereference(per_cpu(sd_asym_packing, cpu)); if (sd) { /* @@ -9661,6 +9631,45 @@ static void nohz_balancer_kick(struct rq *rq) } } } + + sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu)); + if (sd) { + /* + * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU + * to run the misfit task on. + */ + if (check_misfit_status(rq, sd)) { + flags = NOHZ_KICK_MASK; + goto unlock; + } + + /* + * For asymmetric systems, we do not want to nicely balance + * cache use, instead we want to embrace asymmetry and only + * ensure tasks have enough CPU capacity. + * + * Skip the LLC logic because it's not relevant in that case. + */ + goto unlock; + } + + sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + if (sds) { + /* + * If there is an imbalance between LLC domains (IOW we could + * increase the overall cache use), we need some less-loaded LLC + * domain to pull some load. Likewise, we may need to spread + * load within the current LLC domain (e.g. packed SMT cores but + * other CPUs are idle). We can't really know from here how busy + * the others are - so just get a nohz balance going if it looks + * like this LLC domain has tasks we could move. + */ + nr_busy = atomic_read(&sds->nr_busy_cpus); + if (nr_busy > 1) { + flags = NOHZ_KICK_MASK; + goto unlock; + } + } unlock: rcu_read_unlock(); out: From a3151724437f54076cc10bc02b1c4f0003ae36cd Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Mon, 18 Mar 2019 22:24:03 +0100 Subject: [PATCH 630/855] x86/mm: Don't leak kernel addresses Since commit: ad67b74d2469d9b8 ("printk: hash addresses printed with %p") at boot "____ptrval____" is printed instead of actual addresses: found SMP MP-table at [mem 0x000f5cc0-0x000f5ccf] mapped at [(____ptrval____)] Instead of changing the print to "%px", and leaking a kernel addresses, just remove the print completely, like in: 071929dbdd865f77 ("arm64: Stop printing the virtual memory layout"). 
Signed-off-by: Matteo Croce Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/mpparse.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 3482460d984d..1bfe5c6e6cfe 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -598,8 +598,8 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) mpf_base = base; mpf_found = true; - pr_info("found SMP MP-table at [mem %#010lx-%#010lx] mapped at [%p]\n", - base, base + sizeof(*mpf) - 1, mpf); + pr_info("found SMP MP-table at [mem %#010lx-%#010lx]\n", + base, base + sizeof(*mpf) - 1); memblock_reserve(base, sizeof(*mpf)); if (mpf->physptr) From a872fc8bf0304fd56347e94468f07d7e82c679ea Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 13 Feb 2019 00:43:22 +0900 Subject: [PATCH 631/855] arm64: kprobes: Move extable address check into arch_prepare_kprobe() Move extable address check into arch_prepare_kprobe() from arch_within_kprobe_blacklist(). The blacklist is exposed via debugfs as a list of symbols. The extable entries are smaller, so must be filtered out by arch_prepare_kprobe(). Acked-by: Will Deacon Reviewed-by: James Morse Signed-off-by: Masami Hiramatsu Signed-off-by: Catalin Marinas --- arch/arm64/kernel/probes/kprobes.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 7fb6f3aa5ceb..16bc3b2e9351 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -102,6 +102,10 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) if (in_exception_text(probe_addr)) return -EINVAL; + + if (search_exception_tables(probe_addr)) + return -EINVAL; + if (probe_addr >= (unsigned long) __start_rodata && probe_addr <= (unsigned long) __end_rodata) return -EINVAL; @@ -485,8 +489,7 @@ bool arch_within_kprobe_blacklist(unsigned long addr) (addr >= (unsigned long)__idmap_text_start && addr < (unsigned long)__idmap_text_end) || (addr >= (unsigned long)__hyp_text_start && - addr < (unsigned long)__hyp_text_end) || - !!search_exception_tables(addr)) + addr < (unsigned long)__hyp_text_end)) return true; if (!is_kernel_in_hyp_mode()) { From b5586163de1ce90317cd4037f69b14105be9f656 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 13 Feb 2019 00:43:51 +0900 Subject: [PATCH 632/855] arm64: kprobes: Remove unneeded RODATA check Remove unneeded RODATA check from arch_prepare_kprobe(). Since check_kprobe_address_safe() already ensured that the probe address is in kernel text, we don't need to check whether the address in RODATA or not. That must be always false. 
Acked-by: Will Deacon Signed-off-by: Masami Hiramatsu Signed-off-by: Catalin Marinas --- arch/arm64/kernel/probes/kprobes.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 16bc3b2e9351..b168873147f5 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -91,8 +91,6 @@ static void __kprobes arch_simulate_insn(struct kprobe *p, struct pt_regs *regs) int __kprobes arch_prepare_kprobe(struct kprobe *p) { unsigned long probe_addr = (unsigned long)p->addr; - extern char __start_rodata[]; - extern char __end_rodata[]; if (probe_addr & 0x3) return -EINVAL; @@ -106,10 +104,6 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) if (search_exception_tables(probe_addr)) return -EINVAL; - if (probe_addr >= (unsigned long) __start_rodata && - probe_addr <= (unsigned long) __end_rodata) - return -EINVAL; - /* decode instruction */ switch (arm_kprobe_decode_insn(p->addr, &p->ainsn)) { case INSN_REJECTED: /* insn not supported */ From 6e08af0f10dcde01f0bdcc64cf91fea9d25e77cc Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 13 Feb 2019 00:44:19 +0900 Subject: [PATCH 633/855] arm64: kprobes: Move exception_text check in blacklist Move exception/irqentry text address check in blacklist, since those are symbol based rejection. If we prohibit probing on the symbols in exception_text, those should be blacklisted. Acked-by: Will Deacon Signed-off-by: Masami Hiramatsu Signed-off-by: Catalin Marinas --- arch/arm64/kernel/probes/kprobes.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index b168873147f5..d6697930d075 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -98,9 +98,6 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) /* copy instruction */ p->opcode = le32_to_cpu(*p->addr); - if (in_exception_text(probe_addr)) - return -EINVAL; - if (search_exception_tables(probe_addr)) return -EINVAL; @@ -483,7 +480,8 @@ bool arch_within_kprobe_blacklist(unsigned long addr) (addr >= (unsigned long)__idmap_text_start && addr < (unsigned long)__idmap_text_end) || (addr >= (unsigned long)__hyp_text_start && - addr < (unsigned long)__hyp_text_end)) + addr < (unsigned long)__hyp_text_end) || + in_exception_text(addr)) return true; if (!is_kernel_in_hyp_mode()) { From 6a019a92aa580cd5abdaae578a2a297c9af80174 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 13 Feb 2019 00:44:48 +0900 Subject: [PATCH 634/855] arm64: kprobes: Use arch_populate_kprobe_blacklist() Use arch_populate_kprobe_blacklist() instead of arch_within_kprobe_blacklist() so that we can see the full blacklisted symbols under the debugfs. Acked-by: Will Deacon Signed-off-by: Masami Hiramatsu [catalin.marinas@arm.com: Add arch_populate_kprobe_blacklist() comment] Signed-off-by: Catalin Marinas --- arch/arm64/kernel/probes/kprobes.c | 47 ++++++++++++++++++------------ 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index d6697930d075..7a679caf4585 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -471,26 +471,37 @@ kprobe_breakpoint_handler(struct pt_regs *regs, unsigned int esr) return DBG_HOOK_HANDLED; } -bool arch_within_kprobe_blacklist(unsigned long addr) +/* + * Provide a blacklist of symbols identifying ranges which cannot be kprobed. 
+ * This blacklist is exposed to userspace via debugfs (kprobes/blacklist). + */ +int __init arch_populate_kprobe_blacklist(void) { - if ((addr >= (unsigned long)__kprobes_text_start && - addr < (unsigned long)__kprobes_text_end) || - (addr >= (unsigned long)__entry_text_start && - addr < (unsigned long)__entry_text_end) || - (addr >= (unsigned long)__idmap_text_start && - addr < (unsigned long)__idmap_text_end) || - (addr >= (unsigned long)__hyp_text_start && - addr < (unsigned long)__hyp_text_end) || - in_exception_text(addr)) - return true; + int ret; - if (!is_kernel_in_hyp_mode()) { - if ((addr >= (unsigned long)__hyp_idmap_text_start && - addr < (unsigned long)__hyp_idmap_text_end)) - return true; - } - - return false; + ret = kprobe_add_area_blacklist((unsigned long)__entry_text_start, + (unsigned long)__entry_text_end); + if (ret) + return ret; + ret = kprobe_add_area_blacklist((unsigned long)__irqentry_text_start, + (unsigned long)__irqentry_text_end); + if (ret) + return ret; + ret = kprobe_add_area_blacklist((unsigned long)__exception_text_start, + (unsigned long)__exception_text_end); + if (ret) + return ret; + ret = kprobe_add_area_blacklist((unsigned long)__idmap_text_start, + (unsigned long)__idmap_text_end); + if (ret) + return ret; + ret = kprobe_add_area_blacklist((unsigned long)__hyp_text_start, + (unsigned long)__hyp_text_end); + if (ret || is_kernel_in_hyp_mode()) + return ret; + ret = kprobe_add_area_blacklist((unsigned long)__hyp_idmap_text_start, + (unsigned long)__hyp_idmap_text_end); + return ret; } void __kprobes __used *trampoline_probe_handler(struct pt_regs *regs) From 7ff2c2a1a71e83f74574b8001ea88deb3c166ad7 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 18 Mar 2019 17:45:19 +0200 Subject: [PATCH 635/855] btrfs: Fix bound checking in qgroup_trace_new_subtree_blocks If 'cur_level' is 7 then the bound checking at the top of the function will actually pass. Later on, it's possible to dereference ds_path->nodes[cur_level+1] which will be an out of bounds. The correct check will be cur_level >= BTRFS_MAX_LEVEL - 1 . Fixes-coverty-id: 1440918 Fixes-coverty-id: 1440911 Fixes: ea49f3e73c4b ("btrfs: qgroup: Introduce function to find all new tree blocks of reloc tree") CC: stable@vger.kernel.org # 4.20+ Reviewed-by: Qu Wenruo Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index eb680b715dd6..e659d9d61107 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1922,8 +1922,8 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, int i; /* Level sanity check */ - if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL || - root_level < 0 || root_level >= BTRFS_MAX_LEVEL || + if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 || + root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 || root_level < cur_level) { btrfs_err_rl(fs_info, "%s: bad levels, cur_level=%d root_level=%d", From 139a56170de67101791d6e6c8e940c6328393fe9 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 18 Mar 2019 17:45:20 +0200 Subject: [PATCH 636/855] btrfs: Avoid possible qgroup_rsv_size overflow in btrfs_calculate_inode_block_rsv_size qgroup_rsv_size is calculated as the product of outstanding_extent * fs_info->nodesize. The product is calculated with 32 bit precision since both variables are defined as u32. Yet qgroup_rsv_size expects a 64 bit result. 
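The truncation is easy to see with concrete numbers; the sketch below uses a hypothetical 64K nodesize and an extent count just past the point where the 32-bit product wraps:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t outstanding_extents = 70000;   /* hypothetical, > 65536 */
        uint32_t nodesize = 65536;              /* 64K nodes */

        /* u32 * u32 is evaluated in 32 bits and wraps before the widening */
        uint64_t truncated = outstanding_extents * nodesize;
        /* casting one operand first makes the whole multiply 64-bit */
        uint64_t correct = (uint64_t)outstanding_extents * nodesize;

        printf("truncated: %llu\n", (unsigned long long)truncated);
        printf("correct:   %llu\n", (unsigned long long)correct);
        return 0;
    }

On a typical LP64 target this prints 292552704 for the truncated product versus the intended 4587520000.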
Avoid possible multiplication overflow by casting outstanding_extent to u64. Such overflow would in the worst case (64K nodesize) require more than 65536 extents, which is quite large and i'ts not likely that it would happen in practice. Fixes-coverity-id: 1435101 Fixes: ff6bc37eb7f6 ("btrfs: qgroup: Use independent and accurate per inode qgroup rsv") CC: stable@vger.kernel.org # 4.19+ Reviewed-by: Qu Wenruo Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1d49694e6ae3..c5880329ae37 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6174,7 +6174,7 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, * * This is overestimating in most cases. */ - qgroup_rsv_size = outstanding_extents * fs_info->nodesize; + qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize; spin_lock(&block_rsv->lock); block_rsv->size = reserve_size; From 7c9abe12b359d988970c5e38f0e190249e5ae00f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 19 Mar 2019 13:51:22 +0100 Subject: [PATCH 637/855] netfilter: nf_flowtable: remove duplicated transition in diagram No direct transition from prerouting to forward hook, routing lookup needs to happen first. Fixes: 19b351f16fd9 ("netfilter: add flowtable documentation") Signed-off-by: Pablo Neira Ayuso --- Documentation/networking/nf_flowtable.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/networking/nf_flowtable.txt b/Documentation/networking/nf_flowtable.txt index 54128c50d508..ca2136c76042 100644 --- a/Documentation/networking/nf_flowtable.txt +++ b/Documentation/networking/nf_flowtable.txt @@ -44,10 +44,10 @@ including the Netfilter hooks and the flowtable fastpath bypass. / \ / \ |Routing | / \ --> ingress ---> prerouting ---> |decision| | postrouting |--> neigh_xmit \_________/ \__________/ ---------- \____________/ ^ - | ^ | | ^ | - flowtable | | ____\/___ | | - | | | / \ | | - __\/___ | --------->| forward |------------ | + | ^ | ^ | + flowtable | ____\/___ | | + | | / \ | | + __\/___ | | forward |------------ | |-----| | \_________/ | |-----| | 'flow offload' rule | |-----| | adds entry to | From 3dbcea54b3ff706c58f8e8d4470f5e5d3d24a6a0 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 15 Mar 2019 12:22:36 +0000 Subject: [PATCH 638/855] arm64: apply workaround on A64FX v1r0 Fujitsu erratum 010001 applies to A64FX v0r0 and v1r0, and we try to handle either by masking MIDR with MIDR_FUJITSU_ERRATUM_010001_MASK before comparing it to MIDR_FUJITSU_ERRATUM_010001. Unfortunately, MIDR_FUJITSU_ERRATUM_010001 is constructed incorrectly using MIDR_VARIANT(), which is intended to extract the variant field from MIDR_EL1, rather than generate the field in-place. This results in MIDR_FUJITSU_ERRATUM_010001 being all-ones, and we only match A64FX v0r0. This patch uses MIDR_CPU_VAR_REV() to generate an in-place mask for the variant field, ensuring the we match both v0r0 and v1r0. 
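The arithmetic behind the broken mask is worth spelling out. The standalone sketch below re-derives the two macros from the MIDR field layout (variant in bits [23:20], revision in [3:0]) rather than copying the kernel header, and composes the A64FX model value the same way, so treat the constants as illustrative:

    #include <stdio.h>
    #include <stdint.h>

    #define MIDR_VARIANT_SHIFT     20
    #define MIDR_VARIANT_MASK      (0xfu << MIDR_VARIANT_SHIFT)
    /* extracts the variant field from an existing MIDR value */
    #define MIDR_VARIANT(midr)     (((midr) & MIDR_VARIANT_MASK) >> MIDR_VARIANT_SHIFT)
    /* generates a variant/revision value in place */
    #define MIDR_CPU_VAR_REV(v, r) ((((v) << MIDR_VARIANT_SHIFT) & MIDR_VARIANT_MASK) | ((r) & 0xfu))

    int main(void)
    {
        uint32_t a64fx_v0r0 = 0x460f0010u;  /* implementer 0x46, arch 0xf, part 0x001 */
        uint32_t a64fx_v1r0 = a64fx_v0r0 | MIDR_CPU_VAR_REV(1, 0);

        uint32_t old_mask = ~MIDR_VARIANT(1);        /* extract(1) == 0, so ~0: all ones */
        uint32_t new_mask = ~MIDR_CPU_VAR_REV(1, 0); /* clears only variant bit 20 */

        printf("old mask 0x%08x: v1r0 matches v0r0? %d\n", old_mask,
               (a64fx_v1r0 & old_mask) == a64fx_v0r0);
        printf("new mask 0x%08x: v1r0 matches v0r0? %d\n", new_mask,
               (a64fx_v1r0 & new_mask) == a64fx_v0r0);
        return 0;
    }

With the old all-ones mask only the exact v0r0 value compares equal; with the in-place variant mask both v0r0 and v1r0 reduce to the same match value.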
Fixes: 3e32131abc311a5c ("arm64: Add workaround for Fujitsu A64FX erratum 010001") Reported-by: "Okamoto, Takayuki" Signed-off-by: Mark Rutland [catalin.marinas@arm.com: fixed the patch author] Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cputype.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 2afb1338b48a..f3b659587a36 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -129,7 +129,7 @@ /* Fujitsu Erratum 010001 affects A64FX 1.0 and 1.1, (v0r0 and v1r0) */ #define MIDR_FUJITSU_ERRATUM_010001 MIDR_FUJITSU_A64FX -#define MIDR_FUJITSU_ERRATUM_010001_MASK (~MIDR_VARIANT(1)) +#define MIDR_FUJITSU_ERRATUM_010001_MASK (~MIDR_CPU_VAR_REV(1, 0)) #define TCR_CLEAR_FUJITSU_ERRATUM_010001 (TCR_NFD1 | TCR_NFD0) #ifndef __ASSEMBLY__ From c82fd1e6bd55ecc001e610e5484e292a7d8a39fc Mon Sep 17 00:00:00 2001 From: William Cohen Date: Fri, 1 Mar 2019 15:00:41 -0500 Subject: [PATCH 639/855] arm64/stacktrace: Export save_stack_trace_regs() The ARM64 implements the save_stack_trace_regs function, but it is unusable for any diagnostic tooling compiled as a kernel module due the missing EXPORT_SYMBOL_GPL for the function. Export save_stack_trace_regs() to align with other architectures such as s390, openrisc, and powerpc. This is similar to the ARM64 export of save_stack_trace_tsk() added in git commit e27c7fa015d6. Signed-off-by: William Cohen Signed-off-by: Catalin Marinas --- arch/arm64/kernel/stacktrace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 1a29f2695ff2..d908b5e9e949 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -143,6 +143,7 @@ void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } +EXPORT_SYMBOL_GPL(save_stack_trace_regs); static noinline void __save_stack_trace(struct task_struct *tsk, struct stack_trace *trace, unsigned int nosched) From efd00c722ca855745fcc35a7e6675b5a782a3fc8 Mon Sep 17 00:00:00 2001 From: Hanjun Guo Date: Tue, 5 Mar 2019 21:40:57 +0800 Subject: [PATCH 640/855] arm64: Add MIDR encoding for HiSilicon Taishan CPUs Adding the MIDR encodings for HiSilicon Taishan v110 CPUs, which is used in Kunpeng ARM64 server SoCs. TSV110 is the abbreviation of Taishan v110. 
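For context, the match value that MIDR_CPU_MODEL() produces for this part can be worked out from the usual MIDR layout (implementer in bits [31:24], architecture fixed at 0xf in [19:16], part number in [15:4]); the snippet below is just that calculation, not the kernel macro itself:

    #include <stdio.h>

    int main(void)
    {
        unsigned int imp  = 0x48;    /* HiSilicon implementer code */
        unsigned int part = 0xd01;   /* TSV110 part number */
        unsigned int midr = (imp << 24) | (0xfu << 16) | (part << 4);

        printf("MIDR_HISI_TSV110 ~= 0x%08x\n", midr);   /* 0x480fd010 */
        return 0;
    }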
Signed-off-by: Hanjun Guo Reviewed-by: John Garry Reviewed-by: Zhangshaokun Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cputype.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index f3b659587a36..5f1437099b99 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -77,6 +77,7 @@ #define ARM_CPU_IMP_QCOM 0x51 #define ARM_CPU_IMP_NVIDIA 0x4E #define ARM_CPU_IMP_FUJITSU 0x46 +#define ARM_CPU_IMP_HISI 0x48 #define ARM_CPU_PART_AEM_V8 0xD0F #define ARM_CPU_PART_FOUNDATION 0xD00 @@ -107,6 +108,8 @@ #define FUJITSU_CPU_PART_A64FX 0x001 +#define HISI_CPU_PART_TSV110 0xD01 + #define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53) #define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57) #define MIDR_CORTEX_A72 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A72) @@ -126,6 +129,7 @@ #define MIDR_NVIDIA_DENVER MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_DENVER) #define MIDR_NVIDIA_CARMEL MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_CARMEL) #define MIDR_FUJITSU_A64FX MIDR_CPU_MODEL(ARM_CPU_IMP_FUJITSU, FUJITSU_CPU_PART_A64FX) +#define MIDR_HISI_TSV110 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_TSV110) /* Fujitsu Erratum 010001 affects A64FX 1.0 and 1.1, (v0r0 and v1r0) */ #define MIDR_FUJITSU_ERRATUM_010001 MIDR_FUJITSU_A64FX From 0ecc471a2cb7d4d386089445a727f47b59dc9b6e Mon Sep 17 00:00:00 2001 From: Hanjun Guo Date: Tue, 5 Mar 2019 21:40:58 +0800 Subject: [PATCH 641/855] arm64: kpti: Whitelist HiSilicon Taishan v110 CPUs HiSilicon Taishan v110 CPUs didn't implement CSV3 field of the ID_AA64PFR0_EL1 and are not susceptible to Meltdown, so whitelist the MIDR in kpti_safe_list[] table. Signed-off-by: Hanjun Guo Reviewed-by: John Garry Reviewed-by: Zhangshaokun Signed-off-by: Catalin Marinas --- arch/arm64/kernel/cpufeature.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index e24e94d28767..4061de10cea6 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -963,6 +963,7 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), + MIDR_ALL_VERSIONS(MIDR_HISI_TSV110), { /* sentinel */ } }; char const *str = "command line option"; From 6f40b2a5dac4b448fe1e8d94dc4238cd95cd5e34 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 14 Mar 2019 15:49:53 -0700 Subject: [PATCH 642/855] perf list: Filter metrics too When a filter is specified on the command line, filter the metrics too. Before: % perf list foo List of pre-defined events (to be used in -e): Metric Groups: DSB: DSB_Coverage [Fraction of Uops delivered by the DSB (aka Decoded Icache; or Uop Cache)] ... more metrics ... 
After: % perf list foo List of pre-defined events (to be used in -e): Metric Groups: Signed-off-by: Andi Kleen Acked-by: Jiri Olsa LPU-Reference: 20190314225002.30108-1-andi@firstfloor.org Link: https://lkml.kernel.org/n/tip-1y8oi2s8c4jhjtykgs5zvda1@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-list.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c index c9f98d00c0e9..a8394b4f1167 100644 --- a/tools/perf/builtin-list.c +++ b/tools/perf/builtin-list.c @@ -119,7 +119,7 @@ int cmd_list(int argc, const char **argv) details_flag); print_tracepoint_events(NULL, s, raw_dump); print_sdt_events(NULL, s, raw_dump); - metricgroup__print(true, true, NULL, raw_dump, details_flag); + metricgroup__print(true, true, s, raw_dump, details_flag); free(s); } } From 03724b2e9c45d931eff0f304f2d3363ade65ca89 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 14 Mar 2019 15:49:55 -0700 Subject: [PATCH 643/855] perf record: Allow to limit number of reported perf.data files When doing long term recording and waiting for some event to snapshot on, we often only care about the last minute or so. The --switch-output command line option supports rotating the perf.data file when the size exceeds a threshold. But the disk would still be filled with unnecessary old files. Add a new option to only keep a number of rotated files, so that the disk space usage can be limited. Signed-off-by: Andi Kleen Acked-by: Jiri Olsa LPU-Reference: 20190314225002.30108-3-andi@firstfloor.org Link: https://lkml.kernel.org/n/tip-y5u2lik0ragt4vlktz6qc9ks@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-record.txt | 4 ++++ tools/perf/builtin-record.c | 30 +++++++++++++++++++++++- tools/perf/util/data.c | 11 ++++----- tools/perf/util/data.h | 2 +- 4 files changed, 39 insertions(+), 8 deletions(-) diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 8f0c2be34848..8fe4dffcadd0 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -495,6 +495,10 @@ overhead. You can still switch them on with: --switch-output --no-no-buildid --no-no-buildid-cache +--switch-max-files=N:: + +When rotating perf.data with --switch-output, only keep N files. + --dry-run:: Parse options then exit. --dry-run can be used to detect errors in cmdline options. 
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index a468d882e74f..02d7c40b2d10 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -62,6 +62,9 @@ struct switch_output { unsigned long time; const char *str; bool set; + char **filenames; + int num_files; + int cur_file; }; struct record { @@ -892,6 +895,7 @@ record__switch_output(struct record *rec, bool at_exit) { struct perf_data *data = &rec->data; int fd, err; + char *new_filename; /* Same Size: "2015122520103046"*/ char timestamp[] = "InvalidTimestamp"; @@ -912,7 +916,7 @@ record__switch_output(struct record *rec, bool at_exit) fd = perf_data__switch(data, timestamp, rec->session->header.data_offset, - at_exit); + at_exit, &new_filename); if (fd >= 0 && !at_exit) { rec->bytes_written = 0; rec->session->header.data_size = 0; @@ -922,6 +926,21 @@ record__switch_output(struct record *rec, bool at_exit) fprintf(stderr, "[ perf record: Dump %s.%s ]\n", data->path, timestamp); + if (rec->switch_output.num_files) { + int n = rec->switch_output.cur_file + 1; + + if (n >= rec->switch_output.num_files) + n = 0; + rec->switch_output.cur_file = n; + if (rec->switch_output.filenames[n]) { + remove(rec->switch_output.filenames[n]); + free(rec->switch_output.filenames[n]); + } + rec->switch_output.filenames[n] = new_filename; + } else { + free(new_filename); + } + /* Output tracking events */ if (!at_exit) { record__synthesize(rec, false); @@ -1973,6 +1992,8 @@ static struct option __record_options[] = { &record.switch_output.set, "signal,size,time", "Switch output when receive SIGUSR2 or cross size,time threshold", "signal"), + OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files, + "Limit number of switch output generated files"), OPT_BOOLEAN(0, "dry-run", &dry_run, "Parse options then exit"), #ifdef HAVE_AIO_SUPPORT @@ -2059,6 +2080,13 @@ int cmd_record(int argc, const char **argv) alarm(rec->switch_output.time); } + if (rec->switch_output.num_files) { + rec->switch_output.filenames = calloc(sizeof(char *), + rec->switch_output.num_files); + if (!rec->switch_output.filenames) + return -EINVAL; + } + /* * Allow aliases to facilitate the lookup of symbols for address * filters. Refer to auxtrace_parse_filters(). diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c index c6b67efea11a..6a64f713710d 100644 --- a/tools/perf/util/data.c +++ b/tools/perf/util/data.c @@ -361,9 +361,9 @@ ssize_t perf_data__write(struct perf_data *data, int perf_data__switch(struct perf_data *data, const char *postfix, - size_t pos, bool at_exit) + size_t pos, bool at_exit, + char **new_filepath) { - char *new_filepath; int ret; if (check_pipe(data)) @@ -371,15 +371,15 @@ int perf_data__switch(struct perf_data *data, if (perf_data__is_read(data)) return -EINVAL; - if (asprintf(&new_filepath, "%s.%s", data->path, postfix) < 0) + if (asprintf(new_filepath, "%s.%s", data->path, postfix) < 0) return -ENOMEM; /* * Only fire a warning, don't return error, continue fill * original file. 
*/ - if (rename(data->path, new_filepath)) - pr_warning("Failed to rename %s to %s\n", data->path, new_filepath); + if (rename(data->path, *new_filepath)) + pr_warning("Failed to rename %s to %s\n", data->path, *new_filepath); if (!at_exit) { close(data->file.fd); @@ -396,7 +396,6 @@ int perf_data__switch(struct perf_data *data, } ret = data->file.fd; out: - free(new_filepath); return ret; } diff --git a/tools/perf/util/data.h b/tools/perf/util/data.h index 6aef8746469f..259868a39019 100644 --- a/tools/perf/util/data.h +++ b/tools/perf/util/data.h @@ -70,7 +70,7 @@ ssize_t perf_data_file__write(struct perf_data_file *file, */ int perf_data__switch(struct perf_data *data, const char *postfix, - size_t pos, bool at_exit); + size_t pos, bool at_exit, char **new_filepath); int perf_data__create_dir(struct perf_data *data, int nr); int perf_data__open_dir(struct perf_data *data); From 9496c015ed39ddfce971d63a1442e6d258504a7d Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Tue, 19 Mar 2019 23:05:18 +0800 Subject: [PATCH 644/855] blk-mq: remove unused 'nr_expired' from blk_mq_hw_ctx There is no usage of 'nr_expired'. The 'nr_expired' was introduced by commit 1d9bd5161ba3 ("blk-mq: replace timeout synchronization with a RCU and generation based scheme"). Its usage was removed since commit 12f5b9314545 ("blk-mq: Remove generation seqeunce"). Signed-off-by: Dongli Zhang Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b0c814bcc7e3..35359697318b 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -57,7 +57,6 @@ struct blk_mq_hw_ctx { unsigned int queue_num; atomic_t nr_active; - unsigned int nr_expired; struct hlist_node cpuhp_dead; struct kobject kobj; From c0316470683af0507427c3e0660246feacdf8363 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Thu, 7 Mar 2019 13:22:07 +0100 Subject: [PATCH 645/855] mt7601u: check chip version on probe Since some USB device IDs are duplicated between mt7601u and mt76x0u devices, check chip version on probe and return error if not match 0x7601. Don't think this is serious issue, probe most likely will fail at some other point for wrong device, but we do not have to configure it if we know is not mt7601u device. Reported-by: Xose Vazquez Perez Signed-off-by: Stanislaw Gruszka Acked-by: Jakub Kicinski Signed-off-by: Kalle Valo --- drivers/net/wireless/mediatek/mt7601u/usb.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/wireless/mediatek/mt7601u/usb.c b/drivers/net/wireless/mediatek/mt7601u/usb.c index d8b7863f7926..6ae7f14dc9bf 100644 --- a/drivers/net/wireless/mediatek/mt7601u/usb.c +++ b/drivers/net/wireless/mediatek/mt7601u/usb.c @@ -303,6 +303,10 @@ static int mt7601u_probe(struct usb_interface *usb_intf, mac_rev = mt7601u_rr(dev, MT_MAC_CSR0); dev_info(dev->dev, "ASIC revision: %08x MAC revision: %08x\n", asic_rev, mac_rev); + if ((asic_rev >> 16) != 0x7601) { + ret = -ENODEV; + goto err; + } /* Note: vendor driver skips this check for MT7601U */ if (!(mt7601u_rr(dev, MT_EFUSE_CTRL) & MT_EFUSE_CTRL_SEL)) From 40b941611bd4826b7e3e449f738af98ad22e1ce6 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Thu, 7 Mar 2019 13:22:21 +0100 Subject: [PATCH 646/855] mt76x02u: check chip version on probe Since some USB device IDs are duplicated between mt76x0u, mt7601u and mt76x2u device, check chip version on probe and return error if not match the driver. 
Don't think this is serious issue, probe most likely will fail at some other point for wrong device, but we do not have to configure it if we know is not our device. Reported-by: Xose Vazquez Perez Signed-off-by: Stanislaw Gruszka Signed-off-by: Kalle Valo --- drivers/net/wireless/mediatek/mt76/mt76x0/usb.c | 10 +++++++--- drivers/net/wireless/mediatek/mt76/mt76x02.h | 7 +++++++ drivers/net/wireless/mediatek/mt76/mt76x2/usb.c | 4 ++++ 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt76x0/usb.c b/drivers/net/wireless/mediatek/mt76/mt76x0/usb.c index 91718647da02..e5a06f74a6f7 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x0/usb.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x0/usb.c @@ -229,7 +229,7 @@ static int mt76x0u_probe(struct usb_interface *usb_intf, struct usb_device *usb_dev = interface_to_usbdev(usb_intf); struct mt76x02_dev *dev; struct mt76_dev *mdev; - u32 asic_rev, mac_rev; + u32 mac_rev; int ret; mdev = mt76_alloc_device(&usb_intf->dev, sizeof(*dev), &mt76x0u_ops, @@ -262,10 +262,14 @@ static int mt76x0u_probe(struct usb_interface *usb_intf, goto err; } - asic_rev = mt76_rr(dev, MT_ASIC_VERSION); + mdev->rev = mt76_rr(dev, MT_ASIC_VERSION); mac_rev = mt76_rr(dev, MT_MAC_CSR0); dev_info(mdev->dev, "ASIC revision: %08x MAC revision: %08x\n", - asic_rev, mac_rev); + mdev->rev, mac_rev); + if (!is_mt76x0(dev)) { + ret = -ENODEV; + goto err; + } /* Note: vendor driver skips this check for MT76X0U */ if (!(mt76_rr(dev, MT_EFUSE_CTRL) & MT_EFUSE_CTRL_SEL)) diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02.h b/drivers/net/wireless/mediatek/mt76/mt76x02.h index b5a36d9fb2bd..07061eb4d1e1 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02.h +++ b/drivers/net/wireless/mediatek/mt76/mt76x02.h @@ -192,6 +192,13 @@ void mt76x02_mac_start(struct mt76x02_dev *dev); void mt76x02_init_debugfs(struct mt76x02_dev *dev); +static inline bool is_mt76x0(struct mt76x02_dev *dev) +{ + return mt76_chip(&dev->mt76) == 0x7610 || + mt76_chip(&dev->mt76) == 0x7630 || + mt76_chip(&dev->mt76) == 0x7650; +} + static inline bool is_mt76x2(struct mt76x02_dev *dev) { return mt76_chip(&dev->mt76) == 0x7612 || diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2/usb.c b/drivers/net/wireless/mediatek/mt76/mt76x2/usb.c index 7a5d539873ca..ac0f13d46299 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x2/usb.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x2/usb.c @@ -65,6 +65,10 @@ static int mt76x2u_probe(struct usb_interface *intf, mdev->rev = mt76_rr(dev, MT_ASIC_VERSION); dev_info(mdev->dev, "ASIC revision: %08x\n", mdev->rev); + if (!is_mt76x2(dev)) { + err = -ENODEV; + goto err; + } err = mt76x2u_register_device(dev); if (err < 0) From f2a00a821aacfa77985e4dbe83ed064c48a21bd5 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 11 Mar 2019 14:09:53 +0100 Subject: [PATCH 647/855] mt76: mt7603: use the correct hweight8() function __sw_hweight8() is only defined if CONFIG_GENERIC_HWEIGHT is enabled. The function that works on all architectures is hweight8(). 
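Both compute the same population count; a short sketch of how the driver-style code uses it (antenna mask to chain count, illustrative values):

    #include <stdio.h>

    /* Portable stand-in for the kernel's hweight8(): number of set bits. */
    static unsigned int my_hweight8(unsigned char w)
    {
        return (unsigned int)__builtin_popcount(w);
    }

    int main(void)
    {
        unsigned char antenna_mask = 0x07;              /* three RX chains */
        unsigned int rx_chains = my_hweight8(antenna_mask);
        unsigned int tx_chains = rx_chains - 1;         /* as in mt7603_phy_init() */

        printf("rx chains = %u, tx chains value = %u\n", rx_chains, tx_chains);
        return 0;
    }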
Signed-off-by: Felix Fietkau Signed-off-by: Kalle Valo --- drivers/net/wireless/mediatek/mt76/mt7603/beacon.c | 3 +-- drivers/net/wireless/mediatek/mt76/mt7603/init.c | 2 +- drivers/net/wireless/mediatek/mt76/mt7603/mcu.c | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7603/beacon.c b/drivers/net/wireless/mediatek/mt76/mt7603/beacon.c index afcd86f735b4..4dcb465095d1 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7603/beacon.c +++ b/drivers/net/wireless/mediatek/mt76/mt7603/beacon.c @@ -135,8 +135,7 @@ void mt7603_pre_tbtt_tasklet(unsigned long arg) out: mt76_queue_tx_cleanup(dev, MT_TXQ_BEACON, false); - if (dev->mt76.q_tx[MT_TXQ_BEACON].queued > - __sw_hweight8(dev->beacon_mask)) + if (dev->mt76.q_tx[MT_TXQ_BEACON].queued > hweight8(dev->beacon_mask)) dev->beacon_check++; } diff --git a/drivers/net/wireless/mediatek/mt76/mt7603/init.c b/drivers/net/wireless/mediatek/mt76/mt7603/init.c index 15cc8f33b34d..d54dda67d036 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7603/init.c +++ b/drivers/net/wireless/mediatek/mt76/mt7603/init.c @@ -112,7 +112,7 @@ static void mt7603_phy_init(struct mt7603_dev *dev) { int rx_chains = dev->mt76.antenna_mask; - int tx_chains = __sw_hweight8(rx_chains) - 1; + int tx_chains = hweight8(rx_chains) - 1; mt76_rmw(dev, MT_WF_RMAC_RMCR, (MT_WF_RMAC_RMCR_SMPS_MODE | diff --git a/drivers/net/wireless/mediatek/mt76/mt7603/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7603/mcu.c index 4b0713f1fd5e..d06905ea8cc6 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7603/mcu.c +++ b/drivers/net/wireless/mediatek/mt76/mt7603/mcu.c @@ -433,7 +433,7 @@ int mt7603_mcu_set_channel(struct mt7603_dev *dev) { struct cfg80211_chan_def *chandef = &dev->mt76.chandef; struct ieee80211_hw *hw = mt76_hw(dev); - int n_chains = __sw_hweight8(dev->mt76.antenna_mask); + int n_chains = hweight8(dev->mt76.antenna_mask); struct { u8 control_chan; u8 center_chan; From 13f61dfc5235cfa82b1efbcdf4b4f14d9be233da Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 11 Mar 2019 14:24:35 +0100 Subject: [PATCH 648/855] mt76: fix schedule while atomic in mt76x02_reset_state Fix following schedule while atomic in mt76x02_reset_state since synchronize_rcu is run inside a RCU section [44036.944222] mt76x2e 0000:06:00.0: MCU message 31 (seq 3) timed out [44036.944281] BUG: sleeping function called from invalid context at kernel/rcu/tree_exp.h:818 [44036.944284] in_atomic(): 1, irqs_disabled(): 0, pid: 28066, name: kworker/u4:1 [44036.944287] INFO: lockdep is turned off. [44036.944292] CPU: 1 PID: 28066 Comm: kworker/u4:1 Tainted: G W 5.0.0-rc7-wdn-t1+ #7 [44036.944294] Hardware name: Dell Inc. Studio XPS 1340/0K183D, BIOS A11 09/08/2009 [44036.944305] Workqueue: phy1 mt76x02_wdt_work [mt76x02_lib] [44036.944308] Call Trace: [44036.944317] dump_stack+0x67/0x90 [44036.944322] ___might_sleep.cold.88+0x9f/0xaf [44036.944327] rcu_blocking_is_gp+0x13/0x50 [44036.944330] synchronize_rcu+0x17/0x80 [44036.944337] mt76_sta_state+0x138/0x1d0 [mt76] [44036.944349] mt76x02_wdt_work+0x1c9/0x610 [mt76x02_lib] [44036.944355] process_one_work+0x2a5/0x620 [44036.944361] worker_thread+0x35/0x3e0 [44036.944368] kthread+0x11c/0x140 [44036.944376] ret_from_fork+0x3a/0x50 [44036.944384] BUG: scheduling while atomic: kworker/u4:1/28066/0x00000002 [44036.944387] INFO: lockdep is turned off. [44036.944389] Modules linked in: cmac ctr ccm af_packet snd_hda_codec_hdmi Introduce __mt76_sta_remove in order to run sta_remove without holding dev->mutex. 
Move __mt76_sta_remove outside of RCU section in mt76x02_reset_state Fixes: e4ebb8b403d1 ("mt76: mt76x2: implement full device restart on watchdog reset") Signed-off-by: Lorenzo Bianconi Signed-off-by: Kalle Valo --- drivers/net/wireless/mediatek/mt76/mac80211.c | 18 +++++++++++------- drivers/net/wireless/mediatek/mt76/mt76.h | 2 ++ .../net/wireless/mediatek/mt76/mt76x02_mmio.c | 19 ++++++++++--------- 3 files changed, 23 insertions(+), 16 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mac80211.c b/drivers/net/wireless/mediatek/mt76/mac80211.c index a033745adb2f..316167404729 100644 --- a/drivers/net/wireless/mediatek/mt76/mac80211.c +++ b/drivers/net/wireless/mediatek/mt76/mac80211.c @@ -679,19 +679,15 @@ out: return ret; } -static void -mt76_sta_remove(struct mt76_dev *dev, struct ieee80211_vif *vif, - struct ieee80211_sta *sta) +void __mt76_sta_remove(struct mt76_dev *dev, struct ieee80211_vif *vif, + struct ieee80211_sta *sta) { struct mt76_wcid *wcid = (struct mt76_wcid *)sta->drv_priv; - int idx = wcid->idx; - int i; + int i, idx = wcid->idx; rcu_assign_pointer(dev->wcid[idx], NULL); synchronize_rcu(); - mutex_lock(&dev->mutex); - if (dev->drv->sta_remove) dev->drv->sta_remove(dev, vif, sta); @@ -699,7 +695,15 @@ mt76_sta_remove(struct mt76_dev *dev, struct ieee80211_vif *vif, for (i = 0; i < ARRAY_SIZE(sta->txq); i++) mt76_txq_remove(dev, sta->txq[i]); mt76_wcid_free(dev->wcid_mask, idx); +} +EXPORT_SYMBOL_GPL(__mt76_sta_remove); +static void +mt76_sta_remove(struct mt76_dev *dev, struct ieee80211_vif *vif, + struct ieee80211_sta *sta) +{ + mutex_lock(&dev->mutex); + __mt76_sta_remove(dev, vif, sta); mutex_unlock(&dev->mutex); } diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h index cb50e96e736b..bcbfd3c4a44b 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76.h +++ b/drivers/net/wireless/mediatek/mt76/mt76.h @@ -695,6 +695,8 @@ int mt76_sta_state(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, enum ieee80211_sta_state old_state, enum ieee80211_sta_state new_state); +void __mt76_sta_remove(struct mt76_dev *dev, struct ieee80211_vif *vif, + struct ieee80211_sta *sta); struct ieee80211_sta *mt76_rx_convert(struct sk_buff *skb); diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_mmio.c b/drivers/net/wireless/mediatek/mt76/mt76x02_mmio.c index 9de015bcd4f9..daaed1220147 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02_mmio.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x02_mmio.c @@ -441,19 +441,23 @@ static void mt76x02_reset_state(struct mt76x02_dev *dev) { int i; + lockdep_assert_held(&dev->mt76.mutex); + clear_bit(MT76_STATE_RUNNING, &dev->mt76.state); rcu_read_lock(); - ieee80211_iter_keys_rcu(dev->mt76.hw, NULL, mt76x02_key_sync, NULL); + rcu_read_unlock(); for (i = 0; i < ARRAY_SIZE(dev->mt76.wcid); i++) { - struct mt76_wcid *wcid = rcu_dereference(dev->mt76.wcid[i]); - struct mt76x02_sta *msta; struct ieee80211_sta *sta; struct ieee80211_vif *vif; + struct mt76x02_sta *msta; + struct mt76_wcid *wcid; void *priv; + wcid = rcu_dereference_protected(dev->mt76.wcid[i], + lockdep_is_held(&dev->mt76.mutex)); if (!wcid) continue; @@ -463,13 +467,10 @@ static void mt76x02_reset_state(struct mt76x02_dev *dev) priv = msta->vif; vif = container_of(priv, struct ieee80211_vif, drv_priv); - mt76_sta_state(dev->mt76.hw, vif, sta, - IEEE80211_STA_NONE, IEEE80211_STA_NOTEXIST); + __mt76_sta_remove(&dev->mt76, vif, sta); memset(msta, 0, sizeof(*msta)); } - rcu_read_unlock(); - 
dev->vif_mask = 0; dev->beacon_mask = 0; } @@ -489,11 +490,11 @@ static void mt76x02_watchdog_reset(struct mt76x02_dev *dev) for (i = 0; i < ARRAY_SIZE(dev->mt76.napi); i++) napi_disable(&dev->mt76.napi[i]); + mutex_lock(&dev->mt76.mutex); + if (restart) mt76x02_reset_state(dev); - mutex_lock(&dev->mt76.mutex); - if (dev->beacon_mask) mt76_clear(dev, MT_BEACON_TIME_CFG, MT_BEACON_TIME_CFG_BEACON_TX | From 7dfc45e6282a7662279d168cc1219929456f8750 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Tue, 12 Mar 2019 13:32:07 +0100 Subject: [PATCH 649/855] mt76x02: do not enable RTS/CTS by default My commit 26a7b5473191 ("mt76x02: set protection according to ht operation element") enabled by default RTS/CTS protection for OFDM and CCK traffic, because MT_TX_RTS_CFG_THRESH is configured to non 0xffff by initvals and .set_rts_threshold callback is not called by mac80211 on initialization, only on user request or during ieee80211_reconfig() (suspend/resuem or restart_hw). Enabling RTS/CTS cause some problems when sending probe request frames by hcxdumptool penetration tool, but I expect it can cause other issues on different scenarios. Restore previous setting of RTS/CTS being disabled by default for OFDM/CCK by changing MT_TX_RTS_CFG_THRESH initvals to 0xffff. Fixes: 26a7b5473191 ("mt76x02: set protection according to ht operation element") Signed-off-by: Stanislaw Gruszka Signed-off-by: Kalle Valo --- drivers/net/wireless/mediatek/mt76/mt76x0/initvals.h | 2 +- drivers/net/wireless/mediatek/mt76/mt76x2/init.c | 2 +- drivers/net/wireless/mediatek/mt76/mt76x2/usb_mac.c | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt76x0/initvals.h b/drivers/net/wireless/mediatek/mt76/mt76x0/initvals.h index 0290ba5869a5..736f81752b5b 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x0/initvals.h +++ b/drivers/net/wireless/mediatek/mt76/mt76x0/initvals.h @@ -46,7 +46,7 @@ static const struct mt76_reg_pair common_mac_reg_table[] = { { MT_MM20_PROT_CFG, 0x01742004 }, { MT_MM40_PROT_CFG, 0x03f42084 }, { MT_TXOP_CTRL_CFG, 0x0000583f }, - { MT_TX_RTS_CFG, 0x00092b20 }, + { MT_TX_RTS_CFG, 0x00ffff20 }, { MT_EXP_ACK_TIME, 0x002400ca }, { MT_TXOP_HLDR_ET, 0x00000002 }, { MT_XIFS_TIME_CFG, 0x33a41010 }, diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2/init.c b/drivers/net/wireless/mediatek/mt76/mt76x2/init.c index f8534362e2c8..a30ef2c5a9db 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x2/init.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x2/init.c @@ -106,7 +106,7 @@ void mt76_write_mac_initvals(struct mt76x02_dev *dev) { MT_TX_SW_CFG1, 0x00010000 }, { MT_TX_SW_CFG2, 0x00000000 }, { MT_TXOP_CTRL_CFG, 0x0400583f }, - { MT_TX_RTS_CFG, 0x00100020 }, + { MT_TX_RTS_CFG, 0x00ffff20 }, { MT_TX_TIMEOUT_CFG, 0x000a2290 }, { MT_TX_RETRY_CFG, 0x47f01f0f }, { MT_EXP_ACK_TIME, 0x002c00dc }, diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2/usb_mac.c b/drivers/net/wireless/mediatek/mt76/mt76x2/usb_mac.c index 5e84b4535cb1..3b82345756ea 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x2/usb_mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x2/usb_mac.c @@ -93,7 +93,6 @@ int mt76x2u_mac_reset(struct mt76x02_dev *dev) mt76_wr(dev, MT_TX_LINK_CFG, 0x1020); mt76_wr(dev, MT_AUTO_RSP_CFG, 0x13); mt76_wr(dev, MT_MAX_LEN_CFG, 0x2f00); - mt76_wr(dev, MT_TX_RTS_CFG, 0x92b20); mt76_wr(dev, MT_WMM_AIFSN, 0x2273); mt76_wr(dev, MT_WMM_CWMIN, 0x2344); From 0e3edd94448001a33ed7f8af2179cd4a280348a2 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 19 Mar 2019 
23:20:42 +0800 Subject: [PATCH 650/855] drivers: base: swnode: Make two functions static Fix sparse warning: drivers/base/swnode.c:475:22: warning: symbol 'software_node_get_parent' was not declared. Should it be static? drivers/base/swnode.c:484:22: warning: symbol 'software_node_get_next_child' was not declared. Should it be static? Signed-off-by: YueHaibing Signed-off-by: Rafael J. Wysocki --- drivers/base/swnode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/base/swnode.c b/drivers/base/swnode.c index 1fad9291f6aa..7fc5a18e02ad 100644 --- a/drivers/base/swnode.c +++ b/drivers/base/swnode.c @@ -472,7 +472,7 @@ static int software_node_read_string_array(const struct fwnode_handle *fwnode, val, nval); } -struct fwnode_handle * +static struct fwnode_handle * software_node_get_parent(const struct fwnode_handle *fwnode) { struct software_node *swnode = to_software_node(fwnode); @@ -481,7 +481,7 @@ software_node_get_parent(const struct fwnode_handle *fwnode) NULL; } -struct fwnode_handle * +static struct fwnode_handle * software_node_get_next_child(const struct fwnode_handle *fwnode, struct fwnode_handle *child) { From 2a95496634a017c19641f26f00907af75b962f01 Mon Sep 17 00:00:00 2001 From: David Arcari Date: Tue, 12 Feb 2019 09:34:39 -0500 Subject: [PATCH 651/855] tools/power turbostat: return the exit status of a command turbostat failed to return a non-zero exit status even though the supplied command (turbostat ) failed. Currently when turbostat forks a command it returns zero instead of the actual exit status of the command. Modify the code to return the exit status. Signed-off-by: David Arcari Acked-by: Len Brown Signed-off-by: Rafael J. Wysocki --- tools/power/x86/turbostat/turbostat.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 9327c0ddc3a5..c3fad065c89c 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5077,6 +5077,9 @@ int fork_it(char **argv) signal(SIGQUIT, SIG_IGN); if (waitpid(child_pid, &status, 0) == -1) err(status, "waitpid"); + + if (WIFEXITED(status)) + status = WEXITSTATUS(status); } /* * n.b. fork_it() does not check for errors from for_all_cpus() From 0cb98abb5bd13b9a636bde603d952d722688b428 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 19 Mar 2019 12:12:13 -0400 Subject: [PATCH 652/855] NFSv4.1 don't free interrupted slot on open Allow the async rpc task for finish and update the open state if needed, then free the slot. Otherwise, the async rpc unable to decode the reply. 
Signed-off-by: Olga Kornievskaia Fixes: ae55e59da0e4 ("pnfs: Don't release the sequence slot...") Cc: stable@vger.kernel.org # v4.18+ Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6d2812a39287..741ff8c9c6ed 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2933,7 +2933,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, } out: - nfs4_sequence_free_slot(&opendata->o_res.seq_res); + if (!opendata->cancelled) + nfs4_sequence_free_slot(&opendata->o_res.seq_res); return ret; } From 57aeef7f7a6848024b92c32bf23e8c85a8ac896d Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 19 Mar 2019 09:32:36 -0700 Subject: [PATCH 653/855] platform/chrome: cros_ec_debugfs: cancel/schedule logging work only if supported The following traceback was reported on ASUS C201, which does not support console logging. ------------[ cut here ]------------ WARNING: CPU: 2 PID: 361 at kernel/workqueue.c:3030 __flush_work+0x38/0x154 Modules linked in: snd_soc_hdmi_codec cros_ec_debugfs cros_ec_sysfs uvcvideo dw_hdmi_cec dw_hdmi_i2s_audio videobuf2_vmalloc cfg80211 gpio_charger rk_crypto rfkill videobuf2_memops videobuf2_v4l2 des_generic videobuf2_common ofpart m25p80 spi_nor tpm_i2c_infineon sbs_battery mtd tpm joydev cros_ec_dev coreboot_table evdev mousedev ip_tables x_tables [last unloaded: brcmutil] CPU: 2 PID: 361 Comm: systemd-sleep Not tainted 5.1.0-rc1-1-ARCH+ #1 Hardware name: Rockchip (Device Tree) [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [] (show_stack) from [] (dump_stack+0x7c/0x9c) [] (dump_stack) from [] (__warn+0xd0/0xec) [] (__warn) from [] (warn_slowpath_null+0x38/0x44) [] (warn_slowpath_null) from [] (__flush_work+0x38/0x154) [] (__flush_work) from [] (__cancel_work_timer+0x114/0x1a4) [] (__cancel_work_timer) from [] (cros_ec_debugfs_suspend+0x14/0x1c [cros_ec_debugfs]) [] (cros_ec_debugfs_suspend [cros_ec_debugfs]) from [] (dpm_run_callback+0x64/0xcc) [] (dpm_run_callback) from [] (__device_suspend+0x174/0x3a8) [] (__device_suspend) from [] (dpm_suspend+0x174/0x1e0) [] (dpm_suspend) from [] (suspend_devices_and_enter+0x6c/0x50c) [] (suspend_devices_and_enter) from [] (pm_suspend+0x20c/0x274) [] (pm_suspend) from [] (state_store+0x54/0x88) [] (state_store) from [] (kernfs_fop_write+0x114/0x180) [] (kernfs_fop_write) from [] (__vfs_write+0x1c/0x154) [] (__vfs_write) from [] (vfs_write+0xb8/0x198) [] (vfs_write) from [] (ksys_write+0x3c/0x74) [] (ksys_write) from [] (ret_fast_syscall+0x0/0x4c) Exception stack(0xe9365fa8 to 0xe9365ff0) 5fa0: 00000004 beef8b28 00000004 beef8b28 00000004 00000000 5fc0: 00000004 beef8b28 02319170 00000004 beef8b28 00000004 b6f3d900 beef8b74 5fe0: 0000006c beef8a98 b6c0adac b6c66534 ---[ end trace f4ee5df14e8ea0ec ]--- If console logging is not supported, the work structure is never initialized, resulting in the traceback. Calling cancel/schedule functions conditionally fixes the problem. While at it, also fix error handling in the probe function. 
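Reduced to a sketch with hypothetical field names (the real driver keys off debug_info->log_buffer.buf), the shape of the fix is that delayed work which was never set up must not be cancelled or rescheduled:

#include <linux/workqueue.h>

struct debug_info {
	char *log_buf;				/* NULL when console logging is unsupported */
	struct delayed_work log_poll_work;	/* initialized only when log_buf != NULL */
};

static int debug_suspend(struct debug_info *di)
{
	if (di->log_buf)			/* skip work that was never initialized */
		cancel_delayed_work_sync(&di->log_poll_work);
	return 0;
}

static int debug_resume(struct debug_info *di)
{
	if (di->log_buf)
		schedule_delayed_work(&di->log_poll_work, 0);
	return 0;
}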
Reported-by: Urja Rannikko Cc: Urja Rannikko Fixes: 6fce0a2cf5a05 ("mfd / platform: cros_ec: Move debugfs attributes to its own driver") Signed-off-by: Guenter Roeck Signed-off-by: Benson Leung --- drivers/platform/chrome/cros_ec_debugfs.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/platform/chrome/cros_ec_debugfs.c b/drivers/platform/chrome/cros_ec_debugfs.c index 900c7073c46f..71308766e891 100644 --- a/drivers/platform/chrome/cros_ec_debugfs.c +++ b/drivers/platform/chrome/cros_ec_debugfs.c @@ -440,7 +440,7 @@ static int cros_ec_debugfs_probe(struct platform_device *pd) ret = cros_ec_create_pdinfo(debug_info); if (ret) - goto remove_debugfs; + goto remove_log; ec->debug_info = debug_info; @@ -448,6 +448,8 @@ static int cros_ec_debugfs_probe(struct platform_device *pd) return 0; +remove_log: + cros_ec_cleanup_console_log(debug_info); remove_debugfs: debugfs_remove_recursive(debug_info->dir); return ret; @@ -467,7 +469,8 @@ static int __maybe_unused cros_ec_debugfs_suspend(struct device *dev) { struct cros_ec_dev *ec = dev_get_drvdata(dev); - cancel_delayed_work_sync(&ec->debug_info->log_poll_work); + if (ec->debug_info->log_buffer.buf) + cancel_delayed_work_sync(&ec->debug_info->log_poll_work); return 0; } @@ -476,7 +479,8 @@ static int __maybe_unused cros_ec_debugfs_resume(struct device *dev) { struct cros_ec_dev *ec = dev_get_drvdata(dev); - schedule_delayed_work(&ec->debug_info->log_poll_work, 0); + if (ec->debug_info->log_buffer.buf) + schedule_delayed_work(&ec->debug_info->log_poll_work, 0); return 0; } From db983f6eef57a9d78af79bc32389b7e60eb3c47d Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 18 Mar 2019 09:29:26 -0700 Subject: [PATCH 654/855] scsi: core: Also call destroy_rcu_head() for passthrough requests cmd->rcu is initialized by scsi_initialize_rq(). For passthrough requests, blk_get_request() calls scsi_initialize_rq(). For filesystem requests, scsi_init_command() calls scsi_initialize_rq(). Make sure that destroy_rcu_head() is called for passthrough requests. Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Ewan D. Milne Cc: Johannes Thumshirn Reported-by: Ewan D. Milne Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_lib.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 20189675677a..f0db2dd0be75 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -585,9 +585,16 @@ static bool scsi_end_request(struct request *req, blk_status_t error, if (!blk_rq_is_scsi(req)) { WARN_ON_ONCE(!(cmd->flags & SCMD_INITIALIZED)); cmd->flags &= ~SCMD_INITIALIZED; - destroy_rcu_head(&cmd->rcu); } + /* + * Calling rcu_barrier() is not necessary here because the + * SCSI error handler guarantees that the function called by + * call_rcu() has been called before scsi_end_request() is + * called. + */ + destroy_rcu_head(&cmd->rcu); + /* * In the MQ case the command gets freed by __blk_mq_end_request, * so we have to do all cleanup that depends on it earlier. From 17605afaae825b0291f80c62a7f6565879edaa8a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 15 Mar 2019 16:27:58 -0700 Subject: [PATCH 655/855] scsi: core: Avoid that a kernel warning appears during system resume Since scsi_device_quiesce() skips SCSI devices that have another state than RUNNING, OFFLINE or TRANSPORT_OFFLINE, scsi_device_resume() should not complain about SCSI devices that have been skipped. 
Hence this patch. This patch avoids that the following warning appears during resume: WARNING: CPU: 3 PID: 1039 at blk_clear_pm_only+0x2a/0x30 CPU: 3 PID: 1039 Comm: kworker/u8:49 Not tainted 5.0.0+ #1 Hardware name: LENOVO 4180F42/4180F42, BIOS 83ET75WW (1.45 ) 05/10/2013 Workqueue: events_unbound async_run_entry_fn RIP: 0010:blk_clear_pm_only+0x2a/0x30 Call Trace: ? scsi_device_resume+0x28/0x50 ? scsi_dev_type_resume+0x2b/0x80 ? async_run_entry_fn+0x2c/0xd0 ? process_one_work+0x1f0/0x3f0 ? worker_thread+0x28/0x3c0 ? process_one_work+0x3f0/0x3f0 ? kthread+0x10c/0x130 ? __kthread_create_on_node+0x150/0x150 ? ret_from_fork+0x1f/0x30 Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Ming Lei Cc: Johannes Thumshirn Cc: Oleksandr Natalenko Cc: Martin Steigerwald Cc: Reported-by: Jisheng Zhang Tested-by: Jisheng Zhang Fixes: 3a0a529971ec ("block, scsi: Make SCSI quiesce and resume work reliably") # v4.15 Signed-off-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_lib.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index f0db2dd0be75..601b9f1de267 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -2548,8 +2548,10 @@ void scsi_device_resume(struct scsi_device *sdev) * device deleted during suspend) */ mutex_lock(&sdev->state_mutex); - sdev->quiesced_by = NULL; - blk_clear_pm_only(sdev->request_queue); + if (sdev->quiesced_by) { + sdev->quiesced_by = NULL; + blk_clear_pm_only(sdev->request_queue); + } if (sdev->sdev_state == SDEV_QUIESCE) scsi_device_set_state(sdev, SDEV_RUNNING); mutex_unlock(&sdev->state_mutex); From 4705f10e82c63924bd84a9b31d15839ec9ba3d06 Mon Sep 17 00:00:00 2001 From: Quinn Tran Date: Fri, 15 Mar 2019 15:04:18 -0700 Subject: [PATCH 656/855] scsi: qla2xxx: Fix FC-AL connection target discovery Commit 7f147f9bfd44 ("scsi: qla2xxx: Fix N2N target discovery with Local loop") fixed N2N target discovery for local loop. However, same code is used for FC-AL discovery as well. Added check to make sure we are bypassing area and domain check only in N2N topology for target discovery. Fixes: 7f147f9bfd44 ("scsi: qla2xxx: Fix N2N target discovery with Local loop") Cc: stable@vger.kernel.org # 5.0+ Signed-off-by: Quinn Tran Signed-off-by: Himanshu Madhani Reviewed-by: Ewan D. Milne Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_init.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c index 420045155ba0..0c700b140ce7 100644 --- a/drivers/scsi/qla2xxx/qla_init.c +++ b/drivers/scsi/qla2xxx/qla_init.c @@ -4991,6 +4991,13 @@ qla2x00_configure_local_loop(scsi_qla_host_t *vha) if ((domain & 0xf0) == 0xf0) continue; + /* Bypass if not same domain and area of adapter. */ + if (area && domain && ((area != vha->d_id.b.area) || + (domain != vha->d_id.b.domain)) && + (ha->current_topology == ISP_CFG_NL)) + continue; + + /* Bypass invalid local loop ID. */ if (loop_id > LAST_LOCAL_LOOP_ID) continue; From ac444b4f0ace05d7c4c99f6b1e5b0cae0852f025 Mon Sep 17 00:00:00 2001 From: Himanshu Madhani Date: Fri, 15 Mar 2019 15:04:19 -0700 Subject: [PATCH 657/855] scsi: qla2xxx: Fix NULL pointer crash due to stale CPUID This patch fixes crash due to NULL pointer derefrence because CPU pointer is not set and used by driver. Instead, driver is passes CPU as tag via ha->isp_ops->{lun_reset|target_reset} [ 30.160780] qla2xxx [0000:a0:00.1]-8038:9: Cable is unplugged... 
[ 69.984045] qla2xxx [0000:a0:00.0]-8009:8: DEVICE RESET ISSUED nexus=8:0:0 cmd=00000000b0d62f46. [ 69.992849] BUG: unable to handle kernel NULL pointer dereference at 0000000000000040 [ 70.000680] PGD 0 P4D 0 [ 70.003232] Oops: 0000 [#1] SMP PTI [ 70.006727] CPU: 2 PID: 6714 Comm: sg_reset Kdump: loaded Not tainted 4.18.0-67.el8.x86_64 #1 [ 70.015258] Hardware name: NEC Express5800/T110j [N8100-2758Y]/MX32-PH0-NJ, BIOS F11 02/13/2019 [ 70.024016] RIP: 0010:blk_mq_rq_cpu+0x9/0x10 [ 70.028315] Code: 01 58 01 00 00 48 83 c0 28 48 3d 80 02 00 00 75 ab c3 0f 1f 44 00 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 8b 47 08 <8b> 40 40 c3 0f 1f 00 0f 1f 44 00 00 48 83 ec 10 48 c7 c6 20 6e 7c [ 70.047087] RSP: 0018:ffff99a481487d58 EFLAGS: 00010246 [ 70.052322] RAX: 0000000000000000 RBX: ffffffffc041b08b RCX: 0000000000000000 [ 70.059466] RDX: 0000000000000000 RSI: ffff8d10b6b16898 RDI: ffff8d10b341e400 [ 70.066615] RBP: ffffffffc03a6bd0 R08: 0000000000000415 R09: 0000000000aaaaaa [ 70.073765] R10: 0000000000000001 R11: 0000000000000001 R12: ffff8d10b341e528 [ 70.080914] R13: ffff8d10aadefc00 R14: ffff8d0f64efa998 R15: ffff8d0f64efa000 [ 70.088083] FS: 00007f90a201e540(0000) GS:ffff8d10b6b00000(0000) knlGS:0000000000000000 [ 70.096188] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 70.101959] CR2: 0000000000000040 CR3: 0000000268886005 CR4: 00000000003606e0 [ 70.109127] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 70.116277] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 70.123425] Call Trace: [ 70.125896] __qla2xxx_eh_generic_reset+0xb1/0x220 [qla2xxx] [ 70.131572] scsi_ioctl_reset+0x1f5/0x2a0 [ 70.135600] scsi_ioctl+0x18e/0x397 [ 70.139099] ? sd_ioctl+0x7c/0x100 [sd_mod] [ 70.143287] blkdev_ioctl+0x32b/0x9f0 [ 70.146954] ? __check_object_size+0xa3/0x181 [ 70.151323] block_ioctl+0x39/0x40 [ 70.154735] do_vfs_ioctl+0xa4/0x630 [ 70.158322] ? 
syscall_trace_enter+0x1d3/0x2c0 [ 70.162769] ksys_ioctl+0x60/0x90 [ 70.166104] __x64_sys_ioctl+0x16/0x20 [ 70.169859] do_syscall_64+0x5b/0x1b0 [ 70.173532] entry_SYSCALL_64_after_hwframe+0x65/0xca [ 70.178587] RIP: 0033:0x7f90a1b3445b [ 70.182183] Code: 0f 1e fa 48 8b 05 2d aa 2c 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d fd a9 2c 00 f7 d8 64 89 01 48 [ 70.200956] RSP: 002b:00007fffdca88b68 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [ 70.208535] RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f90a1b3445b [ 70.215684] RDX: 00007fffdca88b84 RSI: 0000000000002284 RDI: 0000000000000003 [ 70.222833] RBP: 00007fffdca88ca8 R08: 00007fffdca88b84 R09: 0000000000000000 [ 70.229981] R10: 0000000000000000 R11: 0000000000000246 R12: 00007fffdca88b84 [ 70.237131] R13: 0000000000000000 R14: 000055ab09b0bd28 R15: 0000000000000000 [ 70.244284] Modules linked in: nft_chain_route_ipv4 xt_CHECKSUM nft_chain_nat_ipv4 ipt_MASQUERADE nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack libcrc32c ipt_REJECT nf_reject_ipv4 nft_counter nft_compat tun bridge stp llc nf_tables nfnetli nk devlink sunrpc vfat fat intel_rapl intel_pmc_core x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm wmi_bmof iTCO_wdt iTCO_ vendor_support irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel ipmi_ssif intel_cstate intel_uncore intel_rapl_perf ipmi_si jo ydev pcspkr ipmi_devintf sg wmi ipmi_msghandler video acpi_power_meter acpi_pad mei_me i2c_i801 mei ip_tables ext4 mbcache jbd2 sr_mod cd rom sd_mod qla2xxx ast i2c_algo_bit drm_kms_helper nvme_fc syscopyarea sysfillrect uas sysimgblt fb_sys_fops nvme_fabrics ttm [ 70.314805] usb_storage nvme_core crc32c_intel scsi_transport_fc ahci drm libahci tg3 libata megaraid_sas pinctrl_cannonlake pinctrl_ intel [ 70.327335] CR2: 0000000000000040 Fixes: 9cf2bab630765 ("block: kill request ->cpu member") Signed-off-by: Himanshu Madhani Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_os.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 677f82fdf56f..91f576d743fe 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -1517,7 +1517,7 @@ __qla2xxx_eh_generic_reset(char *name, enum nexus_wait_type type, goto eh_reset_failed; } err = 2; - if (do_reset(fcport, cmd->device->lun, blk_mq_rq_cpu(cmd->request) + 1) + if (do_reset(fcport, cmd->device->lun, 1) != QLA_SUCCESS) { ql_log(ql_log_warn, vha, 0x800c, "do_reset failed for cmd=%p.\n", cmd); From 39bbd3310ec304a7b695240c6791893a88ec9729 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Wed, 13 Mar 2019 09:36:52 +0100 Subject: [PATCH 658/855] drm/amdgpu: revert "cleanup setting bulk_movable" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 8466cc61da89d33441e0d7a98de1ba98697cd465. It can trigger a reference counter bug in TTM. Need to investigate further, but for now revert the offending change. 
Signed-off-by: Christian König Reviewed-by: Michel Dänzer Tested-by: Michel Dänzer Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index bfa9062ce6b9..16fcb56c232b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -700,6 +700,8 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm, struct amdgpu_vm_bo_base *bo_base, *tmp; int r = 0; + vm->bulk_moveable &= list_empty(&vm->evicted); + list_for_each_entry_safe(bo_base, tmp, &vm->evicted, vm_status) { struct amdgpu_bo *bo = bo_base->bo; From 72464382fc2d3673eb51f21a57f2c0a320c1552f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Mon, 18 Mar 2019 11:09:54 +0100 Subject: [PATCH 659/855] drm/amdgpu: fix invalid use of change_bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We only need to clear the bit in a 32bit integer. This fixes a crah on ARM64 and PPC64LE caused by "drm/amdgpu: update the vm invalidation engine layout V2" Signed-off-by: Christian König Acked-by: Alex Deucher Cc: stable@vger.kernel.org Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 600259b4e291..2fe8397241ea 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -742,7 +742,7 @@ static int gmc_v9_0_allocate_vm_inv_eng(struct amdgpu_device *adev) } ring->vm_inv_eng = inv_eng - 1; - change_bit(inv_eng - 1, (unsigned long *)(&vm_inv_engs[vmhub])); + vm_inv_engs[vmhub] &= ~(1 << ring->vm_inv_eng); dev_info(adev->dev, "ring %s uses VM inv eng %u on hub %u\n", ring->name, ring->vm_inv_eng, ring->funcs->vmhub); From c38dab7df7ee4fdecd51e390d4d33d5ef5cff49d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 14 Mar 2019 15:49:56 -0700 Subject: [PATCH 660/855] perf record: Clarify help for --switch-output The help description for --switch-output looks like there are multiple comma separated fields. But it's actually a choice of different options. Make it clear and less confusing. Before: % perf record -h ... --switch-output[=] Switch output when receive SIGUSR2 or cross size,time threshold After: % perf record -h ... 
--switch-output[=] Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold Signed-off-by: Andi Kleen Acked-by: Jiri Olsa LPU-Reference: 20190314225002.30108-4-andi@firstfloor.org Link: https://lkml.kernel.org/n/tip-9yecyuha04nyg8toyd1b2pgi@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 02d7c40b2d10..e7144a1c1c82 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1989,8 +1989,8 @@ static struct option __record_options[] = { OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary, "Record timestamp boundary (time of first/last samples)"), OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str, - &record.switch_output.set, "signal,size,time", - "Switch output when receive SIGUSR2 or cross size,time threshold", + &record.switch_output.set, "signal or size[BKMG] or time[smhd]", + "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold", "signal"), OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files, "Limit number of switch output generated files"), From 702fb9b415e7c99bd671fc0d1da26574c125471a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 14 Mar 2019 15:49:57 -0700 Subject: [PATCH 661/855] perf report: Show all sort keys in help output Show all the supported sort keys in the command line help output, so that it's not needed to refer to the manpage. Before: % perf report -h ... -s, --sort sort by key(s): pid, comm, dso, symbol, parent, cpu, srcline, ... Please refer the man page for the complete list. After: % perf report -h ... -s, --sort sort by key(s): overhead overhead_sys overhead_us overhead_guest_sys overhead_guest_us overhead_children sample period pid comm dso symbol parent cpu ... Signed-off-by: Andi Kleen Acked-by: Jiri Olsa LPU-Reference: 20190314225002.30108-5-andi@firstfloor.org Link: https://lkml.kernel.org/n/tip-9r3uz2ch4izoi1uln3f889co@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 5 ++-- tools/perf/util/sort.c | 52 +++++++++++++++++++++++++++++++++++++ tools/perf/util/sort.h | 2 ++ 3 files changed, 56 insertions(+), 3 deletions(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 1921aaa9cece..4054eb1f98ac 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -1083,10 +1083,9 @@ int cmd_report(int argc, const char **argv) OPT_BOOLEAN(0, "header-only", &report.header_only, "Show only data header."), OPT_STRING('s', "sort", &sort_order, "key[,key2...]", - "sort by key(s): pid, comm, dso, symbol, parent, cpu, srcline, ..." 
- " Please refer the man page for the complete list."), + sort_help("sort by key(s):")), OPT_STRING('F', "fields", &field_order, "key[,keys...]", - "output field(s): overhead, period, sample plus all of sort keys"), + sort_help("output field(s): overhead period sample ")), OPT_BOOLEAN(0, "show-cpu-utilization", &symbol_conf.show_cpu_utilization, "Show sample percentage for different cpu modes"), OPT_BOOLEAN_FLAG(0, "showcpuutilization", &symbol_conf.show_cpu_utilization, diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index bdd30cab51cb..5d2518e89fc4 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -13,6 +13,7 @@ #include "evsel.h" #include "evlist.h" #include "strlist.h" +#include "strbuf.h" #include #include "mem-events.h" #include "annotate.h" @@ -3107,3 +3108,54 @@ void reset_output_field(void) reset_dimensions(); perf_hpp__reset_output_field(&perf_hpp_list); } + +#define INDENT (3*8 + 1) + +static void add_key(struct strbuf *sb, const char *str, int *llen) +{ + if (*llen >= 75) { + strbuf_addstr(sb, "\n\t\t\t "); + *llen = INDENT; + } + strbuf_addf(sb, " %s", str); + *llen += strlen(str) + 1; +} + +static void add_sort_string(struct strbuf *sb, struct sort_dimension *s, int n, + int *llen) +{ + int i; + + for (i = 0; i < n; i++) + add_key(sb, s[i].name, llen); +} + +static void add_hpp_sort_string(struct strbuf *sb, struct hpp_dimension *s, int n, + int *llen) +{ + int i; + + for (i = 0; i < n; i++) + add_key(sb, s[i].name, llen); +} + +const char *sort_help(const char *prefix) +{ + struct strbuf sb; + char *s; + int len = strlen(prefix) + INDENT; + + strbuf_init(&sb, 300); + strbuf_addstr(&sb, prefix); + add_hpp_sort_string(&sb, hpp_sort_dimensions, + ARRAY_SIZE(hpp_sort_dimensions), &len); + add_sort_string(&sb, common_sort_dimensions, + ARRAY_SIZE(common_sort_dimensions), &len); + add_sort_string(&sb, bstack_sort_dimensions, + ARRAY_SIZE(bstack_sort_dimensions), &len); + add_sort_string(&sb, memory_sort_dimensions, + ARRAY_SIZE(memory_sort_dimensions), &len); + s = strbuf_detach(&sb, NULL); + strbuf_release(&sb); + return s; +} diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index bb9442ab7a0c..ce376a73f964 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -296,6 +296,8 @@ void reset_output_field(void); void sort__setup_elide(FILE *fp); void perf_hpp__set_elide(int idx, bool elide); +const char *sort_help(const char *prefix); + int report_parse_ignore_callees_opt(const struct option *opt, const char *arg, int unset); bool is_strict_order(const char *order); From a4e7e6efabc57e85e0737c2eaa391525c0ae36f3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 14 Mar 2019 15:49:59 -0700 Subject: [PATCH 662/855] perf report: Indicate JITed code better in report Print [TID] tid %d instead of the crypted /tmp/perf-%d.map default. % cat >loop.java public class loop { public static void main(String[] args) { for (;;); } } ^D % javac loop.java % perf record java loop ^C Before: % perf report --stdio ... 56.09% java perf-34724.map [.] 0x00007fd5bd021896 19.12% java perf-34724.map [.] 0x00007fd5bd021887 9.79% java perf-34724.map [.] 0x00007fd5bd021783 8.97% java perf-34724.map [.] 0x00007fd5bd02175b After: % perf report --stdio ... 56.09% java [JIT] tid 34724 [.] 0x00007fd5bd021896 19.12% java [JIT] tid 34724 [.] 0x00007fd5bd021887 9.79% java [JIT] tid 34724 [.] 0x00007fd5bd021783 8.97% java [JIT] tid 34724 [.] 
0x00007fd5bd02175b Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo LPU-Reference: 20190314225002.30108-7-andi@firstfloor.org Link: https://lkml.kernel.org/n/tip-r17l6py9g0sezb7mi1f286gt@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dso.c | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index ba58ba603b69..ab8a455d2283 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -1141,28 +1141,34 @@ void dso__set_short_name(struct dso *dso, const char *name, bool name_allocated) static void dso__set_basename(struct dso *dso) { - /* - * basename() may modify path buffer, so we must pass - * a copy. - */ - char *base, *lname = strdup(dso->long_name); + char *base, *lname; + int tid; - if (!lname) - return; + if (sscanf(dso->long_name, "/tmp/perf-%d.map", &tid) == 1) { + if (asprintf(&base, "[JIT] tid %d", tid) < 0) + return; + } else { + /* + * basename() may modify path buffer, so we must pass + * a copy. + */ + lname = strdup(dso->long_name); + if (!lname) + return; - /* - * basename() may return a pointer to internal - * storage which is reused in subsequent calls - * so copy the result. - */ - base = strdup(basename(lname)); + /* + * basename() may return a pointer to internal + * storage which is reused in subsequent calls + * so copy the result. + */ + base = strdup(basename(lname)); - free(lname); + free(lname); - if (!base) - return; - - dso__set_short_name(dso, base, true); + if (!base) + return; + } + dso__set_short_name(dso, base, true); } int dso__name_len(const struct dso *dso) From 90b10f47c0ee2a70bd036d9da5e810f522b54a8f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 14 Mar 2019 15:50:00 -0700 Subject: [PATCH 663/855] perf script: Support relative time When comparing time stamps in 'perf script' traces it can be annoying to work with the full perf time stamps. Add a --reltime option that displays time stamps relative to the trace start to make it easier to read the traces. Note: not currently supported for --time. Report an error in this case. 
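The arithmetic behind --reltime is simply to remember the first sample's timestamp and subtract it before the seconds/nanoseconds split. A standalone sketch with made-up sample times (the before/after traces follow below):

#include <stdio.h>
#include <inttypes.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
	uint64_t samples[] = { 245402891216000ULL, 245402891223000ULL,
			       245402891239000ULL };	/* made-up sample times in ns */
	uint64_t initial_time = 0;

	for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		if (!initial_time)
			initial_time = samples[i];	/* trace start */
		uint64_t t = samples[i] - initial_time;

		printf("%5" PRIu64 ".%09" PRIu64 "\n",
		       t / NSEC_PER_SEC, t % NSEC_PER_SEC);
	}
	return 0;
}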
Before: % perf script swapper 0 [000] 245402.891216: 1 cycles:ppp: ffffffffa0068814 native_write_msr+0x4 ([kernel.kallsyms]) swapper 0 [000] 245402.891223: 1 cycles:ppp: ffffffffa0068814 native_write_msr+0x4 ([kernel.kallsyms]) swapper 0 [000] 245402.891227: 5 cycles:ppp: ffffffffa0068814 native_write_msr+0x4 ([kernel.kallsyms]) swapper 0 [000] 245402.891231: 41 cycles:ppp: ffffffffa0068816 native_write_msr+0x6 ([kernel.kallsyms]) swapper 0 [000] 245402.891235: 355 cycles:ppp: ffffffffa000dd51 intel_bts_enable_local+0x21 ([kernel.kallsyms]) swapper 0 [000] 245402.891239: 3084 cycles:ppp: ffffffffa0a0150a end_repeat_nmi+0x48 ([kernel.kallsyms]) After: % perf script --reltime swapper 0 [000] 0.000000: 1 cycles:ppp: ffffffffa0068814 native_write_msr+0x4 ([kernel.kallsyms]) swapper 0 [000] 0.000006: 1 cycles:ppp: ffffffffa0068814 native_write_msr+0x4 ([kernel.kallsyms]) swapper 0 [000] 0.000010: 5 cycles:ppp: ffffffffa0068814 native_write_msr+0x4 ([kernel.kallsyms]) swapper 0 [000] 0.000014: 41 cycles:ppp: ffffffffa0068816 native_write_msr+0x6 ([kernel.kallsyms]) swapper 0 [000] 0.000018: 355 cycles:ppp: ffffffffa000dd51 intel_bts_enable_local+0x21 ([kernel.kallsyms]) swapper 0 [000] 0.000022: 3084 cycles:ppp: ffffffffa0a0150a end_repeat_nmi+0x48 ([kernel.kallsyms]) Committer notes: Do not use 'time' as the name of a variable, as this breaks the build on older glibcs: cc1: warnings being treated as errors builtin-script.c: In function 'perf_sample__fprintf_start': builtin-script.c:691: warning: declaration of 'time' shadows a global declaration /usr/include/time.h:187: warning: shadowed declaration is here Signed-off-by: Andi Kleen Tested-by: Arnaldo Carvalho de Melo Acked-by: Jiri Olsa LPU-Reference: 20190314225002.30108-8-andi@firstfloor.org Link: https://lkml.kernel.org/n/tip-bpahyi6pr9r399mvihu65fvc@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-script.txt | 3 +++ tools/perf/builtin-script.c | 18 ++++++++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index 2e19fd7ffe35..9b0d04dd2a61 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -380,6 +380,9 @@ include::itrace.txt[] Set the maximum number of program blocks to print with brstackasm for each sample. +--reltime:: + Print time stamps relative to trace start. + --per-event-dump:: Create per event files with a "perf.data.EVENT.dump" name instead of printing to stdout, useful, for instance, for generating flamegraphs. 
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 2f93d60c5a17..61cfd8f70989 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -53,6 +53,8 @@ static char const *script_name; static char const *generate_script_lang; +static bool reltime; +static u64 initial_time; static bool debug_mode; static u64 last_timestamp; static u64 nr_unordered; @@ -686,7 +688,13 @@ static int perf_sample__fprintf_start(struct perf_sample *sample, } if (PRINT_FIELD(TIME)) { - nsecs = sample->time; + u64 t = sample->time; + if (reltime) { + if (!initial_time) + initial_time = sample->time; + t = sample->time - initial_time; + } + nsecs = t; secs = nsecs / NSEC_PER_SEC; nsecs -= secs * NSEC_PER_SEC; @@ -694,7 +702,7 @@ static int perf_sample__fprintf_start(struct perf_sample *sample, printed += fprintf(fp, "%5lu.%09llu: ", secs, nsecs); else { char sample_time[32]; - timestamp__scnprintf_usec(sample->time, sample_time, sizeof(sample_time)); + timestamp__scnprintf_usec(t, sample_time, sizeof(sample_time)); printed += fprintf(fp, "%12s: ", sample_time); } } @@ -3413,6 +3421,7 @@ int cmd_script(int argc, const char **argv) "Set the maximum stack depth when parsing the callchain, " "anything beyond the specified depth will be ignored. " "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)), + OPT_BOOLEAN(0, "reltime", &reltime, "Show time stamps relative to start"), OPT_BOOLEAN('I', "show-info", &show_full_info, "display extended information from perf.data file"), OPT_BOOLEAN('\0', "show-kernel-path", &symbol_conf.show_kernel_path, @@ -3487,6 +3496,11 @@ int cmd_script(int argc, const char **argv) } } + if (script.time_str && reltime) { + fprintf(stderr, "Don't combine --reltime with --time\n"); + return -1; + } + if (itrace_synth_opts.callchain && itrace_synth_opts.callchain_sz > scripting_max_stack) scripting_max_stack = itrace_synth_opts.callchain_sz; From 75998bb263bf48c1c85d78cd2d2f3a97d3747cab Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 14 Mar 2019 15:50:01 -0700 Subject: [PATCH 664/855] perf stat: Fix --no-scale The -c option to enable multiplex scaling has been useless for quite some time because scaling is default. It's only useful as --no-scale to disable scaling. But the non scaling code path has bitrotted and doesn't print anything because perf output code relies on value run/ena information. Also even when we don't want to scale a value it's still useful to show its multiplex percentage. This patch: - Fixes help and documentation to show --no-scale instead of -c - Removes -c, only keeps the long option because -c doesn't support negatives. - Enables running/enabled even with --no-scale - And fixes some other problems in the no-scale output. 
Before: $ perf stat --no-scale -e cycles true Performance counter stats for 'true': cycles 0.000984154 seconds time elapsed After: $ ./perf stat --no-scale -e cycles true Performance counter stats for 'true': 706,070 cycles 0.001219821 seconds time elapsed Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo LPU-Reference: 20190314225002.30108-9-andi@firstfloor.org Link: https://lkml.kernel.org/n/tip-xggjvwcdaj2aqy8ib3i4b1g6@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-stat.txt | 5 ++--- tools/perf/builtin-stat.c | 3 ++- tools/perf/util/evsel.c | 3 +-- tools/perf/util/stat.c | 12 ++++-------- 4 files changed, 9 insertions(+), 14 deletions(-) diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index 4bc2085e5197..39c05f89104e 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -72,9 +72,8 @@ report:: --all-cpus:: system-wide collection from all CPUs (default if no target is specified) --c:: ---scale:: - scale/normalize counter values +--no-scale:: + Don't scale/normalize counter values -d:: --detailed:: diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 7b8f09b0b8bf..49ee3c2033ec 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -718,7 +718,8 @@ static struct option stat_options[] = { "system-wide collection from all CPUs"), OPT_BOOLEAN('g', "group", &group, "put the counters into a counter group"), - OPT_BOOLEAN('c', "scale", &stat_config.scale, "scale/normalize counters"), + OPT_BOOLEAN(0, "scale", &stat_config.scale, + "Use --no-scale to disable counter scaling for multiplexing"), OPT_INCR('v', "verbose", &verbose, "be more verbose (show counter open errors, etc)"), OPT_INTEGER('r', "repeat", &stat_config.run_count, diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 3bbf73e979c0..53ec40cacd4b 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1344,8 +1344,7 @@ void perf_counts_values__scale(struct perf_counts_values *count, scaled = 1; count->val = (u64)((double) count->val * count->ena / count->run + 0.5); } - } else - count->ena = count->run = 0; + } if (pscaled) *pscaled = scaled; diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index 4d40515307b8..2856cc9d5a31 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -291,10 +291,8 @@ process_counter_values(struct perf_stat_config *config, struct perf_evsel *evsel break; case AGGR_GLOBAL: aggr->val += count->val; - if (config->scale) { - aggr->ena += count->ena; - aggr->run += count->run; - } + aggr->ena += count->ena; + aggr->run += count->run; case AGGR_UNSET: default: break; @@ -442,10 +440,8 @@ int create_perf_stat_counter(struct perf_evsel *evsel, struct perf_event_attr *attr = &evsel->attr; struct perf_evsel *leader = evsel->leader; - if (config->scale) { - attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | - PERF_FORMAT_TOTAL_TIME_RUNNING; - } + attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | + PERF_FORMAT_TOTAL_TIME_RUNNING; /* * The event is part of non trivial group, let's enable From 42a5864cf0a9a7e4eb541b5f390749a69c288c80 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 14 Mar 2019 15:50:02 -0700 Subject: [PATCH 665/855] perf stat: Improve scaling The multiplexing scaling in perf stat mysteriously adds 0.5 to the value. This dates back to the original perf tool. Other scaling code doesn't use that strange convention. Remove the extra 0.5. 
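For context, the scaling both of these patches touch extrapolates a multiplexed counter by enabled/running time. A small standalone sketch with made-up counts shows the truncating form that now matches the rest of the code, next to the old +0.5 rounding (the before/after runs follow below):

#include <stdio.h>
#include <inttypes.h>

int main(void)
{
	uint64_t val = 5230000;		/* raw count while scheduled in (made up) */
	uint64_t ena = 6000000;		/* ns the event was enabled */
	uint64_t run = 4900000;		/* ns it was actually counting */

	uint64_t truncated = (uint64_t)((double)val * ena / run);
	uint64_t rounded   = (uint64_t)((double)val * ena / run + 0.5);

	printf("scaled (truncated): %" PRIu64 "\n", truncated);
	printf("scaled (+0.5):      %" PRIu64 "\n", rounded);
	printf("multiplex ratio:    %.2f%%\n", 100.0 * run / ena);
	return 0;
}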
Before: $ perf stat -e 'cycles,cycles,cycles,cycles,cycles,cycles' grep -rq foo Performance counter stats for 'grep -rq foo': 6,403,580 cycles (81.62%) 6,404,341 cycles (81.64%) 6,402,983 cycles (81.62%) 6,399,941 cycles (81.63%) 6,399,451 cycles (81.62%) 6,436,105 cycles (91.87%) 0.005843799 seconds time elapsed 0.002905000 seconds user 0.002902000 seconds sys After: $ perf stat -e 'cycles,cycles,cycles,cycles,cycles,cycles' grep -rq foo Performance counter stats for 'grep -rq foo': 6,422,704 cycles (81.68%) 6,401,842 cycles (81.68%) 6,398,432 cycles (81.68%) 6,397,098 cycles (81.68%) 6,396,074 cycles (81.67%) 6,434,980 cycles (91.62%) 0.005884437 seconds time elapsed 0.003580000 seconds user 0.002356000 seconds sys Signed-off-by: Andi Kleen Acked-by: Jiri Olsa LPU-Reference: 20190314225002.30108-10-andi@firstfloor.org Tested-by: Arnaldo Carvalho de Melo Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 53ec40cacd4b..16f02b83b23f 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1342,7 +1342,7 @@ void perf_counts_values__scale(struct perf_counts_values *count, count->val = 0; } else if (count->run < count->ena) { scaled = 1; - count->val = (u64)((double) count->val * count->ena / count->run + 0.5); + count->val = (u64)((double) count->val * count->ena / count->run); } } From c3b4d5c4afb0856e62a60db04c71205712b0d24f Mon Sep 17 00:00:00 2001 From: Mamatha Inamdar Date: Thu, 7 Feb 2019 15:09:28 +0530 Subject: [PATCH 666/855] perf vendor events: Remove P8 HW events which are not supported This patch is to remove following hardware events from JSON file which are not supported on POWER8. pm_co_disp_fail pm_co_tm_sc_footprint pm_iside_disp pm_iside_disp_fail pm_iside_disp_fail_other pm_iside_mru_touch pm_l2_castout_mod pm_l2_castout_shr pm_l2_dc_inv pm_l2_disp_all_l2miss pm_l2_grp_guess_correct pm_l2_grp_guess_wrong pm_l2_ic_inv pm_l2_inst pm_l2_inst_miss pm_l2_ld pm_l2_ld_disp pm_l2_ld_hit pm_l2_ld_miss pm_l2_loc_guess_correct pm_l2_loc_guess_wrong pm_l2_rcld_disp pm_l2_rcld_disp_fail_addr pm_l2_rcld_disp_fail_other pm_l2_rcst_disp pm_l2_rcst_disp_fail_addr pm_l2_rcst_disp_fail_other pm_l2_rc_st_done pm_l2_rty_ld pm_l2_sn_m_rd_done pm_l2_sn_m_wr_done pm_l2_sn_sx_i_done pm_l2_st_disp pm_l2_st_hit pm_l2_sys_guess_correct pm_l2_sys_guess_wrong pm_l2_sys_pump pm_l3_ci_hit pm_l3_ci_miss pm_l3_cinj pm_l3_co pm_l3_co_lco pm_l3_grp_guess_correct pm_l3_grp_guess_wrong_high pm_l3_grp_guess_wrong_low pm_l3_hit pm_l3_l2_co_hit pm_l3_l2_co_miss pm_l3_lat_ci_hit pm_l3_lat_ci_miss pm_l3_ld_hit pm_l3_ld_miss pm_l3_loc_guess_correct pm_l3_loc_guess_wrong pm_l3_miss pm_l3_p0_co_l31 pm_l3_p0_co_mem pm_l3_p0_co_rty pm_l3_p0_grp_pump pm_l3_p0_lco_data pm_l3_p0_lco_no_data pm_l3_p0_lco_rty pm_l3_p0_node_pump pm_l3_p0_pf_rty pm_l3_p0_sn_hit pm_l3_p0_sn_inv pm_l3_p0_sn_miss pm_l3_p0_sys_pump pm_l3_p1_co_l31 pm_l3_p1_co_mem pm_l3_p1_co_rty pm_l3_p1_grp_pump pm_l3_p1_lco_data pm_l3_p1_lco_no_data pm_l3_p1_lco_rty pm_l3_p1_node_pump pm_l3_p1_pf_rty pm_l3_p1_sn_hit pm_l3_p1_sn_inv pm_l3_p1_sn_miss pm_l3_p1_sys_pump pm_l3_pf_hit_l3 pm_l3_sys_guess_correct pm_l3_sys_guess_wrong pm_l3_trans_pf pm_l3_wi0_busy pm_l3_wi_usage pm_non_tm_rst_sc pm_rd_clearing_sc pm_rd_forming_sc pm_rd_hit_pf pm_snp_tm_hit_m pm_snp_tm_hit_t pm_st_caused_fail pm_tm_cam_overflow pm_tm_cap_overflow pm_tm_fav_caused_fail pm_tm_ld_caused_fail pm_tm_ld_conf pm_tm_rst_sc pm_tm_sc_co 
pm_tm_st_caused_fail pm_tm_st_conf Signed-off-by: Mamatha Inamdar Acked-by: Ravi Bangoria Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sukadev Bhattiprolu Fixes: 2a81fa3bb5ed ("perf vendor events: Add power8 PMU events") Link: http://lkml.kernel.org/r/154953186583.11022.14819560028300370163.stgit@localhost.localdomain Signed-off-by: Arnaldo Carvalho de Melo --- .../pmu-events/arch/powerpc/power8/other.json | 594 ------------------ 1 file changed, 594 deletions(-) diff --git a/tools/perf/pmu-events/arch/powerpc/power8/other.json b/tools/perf/pmu-events/arch/powerpc/power8/other.json index 704302c3e67d..9dc2f6b70354 100644 --- a/tools/perf/pmu-events/arch/powerpc/power8/other.json +++ b/tools/perf/pmu-events/arch/powerpc/power8/other.json @@ -347,18 +347,6 @@ "BriefDescription": "CO mach 0 Busy. Used by PMU to sample ave RC livetime(mach0 used as sample point)", "PublicDescription": "" }, - {, - "EventCode": "0x517082", - "EventName": "PM_CO_DISP_FAIL", - "BriefDescription": "CO dispatch failed due to all CO machines being busy", - "PublicDescription": "" - }, - {, - "EventCode": "0x527084", - "EventName": "PM_CO_TM_SC_FOOTPRINT", - "BriefDescription": "L2 did a cleanifdirty CO to the L3 (ie created an SC line in the L3)", - "PublicDescription": "" - }, {, "EventCode": "0x3608a", "EventName": "PM_CO_USAGE", @@ -1577,36 +1565,12 @@ "BriefDescription": "A Page Table Entry was loaded into the TLB with Shared (S) data from another core's L3 on the same chip due to a instruction side request", "PublicDescription": "" }, - {, - "EventCode": "0x617082", - "EventName": "PM_ISIDE_DISP", - "BriefDescription": "All i-side dispatch attempts", - "PublicDescription": "" - }, - {, - "EventCode": "0x627084", - "EventName": "PM_ISIDE_DISP_FAIL", - "BriefDescription": "All i-side dispatch attempts that failed due to a addr collision with another machine", - "PublicDescription": "" - }, - {, - "EventCode": "0x627086", - "EventName": "PM_ISIDE_DISP_FAIL_OTHER", - "BriefDescription": "All i-side dispatch attempts that failed due to a reason other than addrs collision", - "PublicDescription": "" - }, {, "EventCode": "0x4608e", "EventName": "PM_ISIDE_L2MEMACC", "BriefDescription": "valid when first beat of data comes in for an i-side fetch where data came from mem(or L4)", "PublicDescription": "" }, - {, - "EventCode": "0x44608e", - "EventName": "PM_ISIDE_MRU_TOUCH", - "BriefDescription": "Iside L2 MRU touch", - "PublicDescription": "" - }, {, "EventCode": "0x30ac", "EventName": "PM_ISU_REF_FX0", @@ -1733,222 +1697,36 @@ "BriefDescription": "Instruction Demand sectors wriittent into IL1", "PublicDescription": "" }, - {, - "EventCode": "0x417080", - "EventName": "PM_L2_CASTOUT_MOD", - "BriefDescription": "L2 Castouts - Modified (M, Mu, Me)", - "PublicDescription": "" - }, - {, - "EventCode": "0x417082", - "EventName": "PM_L2_CASTOUT_SHR", - "BriefDescription": "L2 Castouts - Shared (T, Te, Si, S)", - "PublicDescription": "" - }, {, "EventCode": "0x27084", "EventName": "PM_L2_CHIP_PUMP", "BriefDescription": "RC requests that were local on chip pump attempts", "PublicDescription": "" }, - {, - "EventCode": "0x427086", - "EventName": "PM_L2_DC_INV", - "BriefDescription": "Dcache invalidates from L2", - "PublicDescription": "" - }, - {, - "EventCode": "0x44608c", - "EventName": "PM_L2_DISP_ALL_L2MISS", - "BriefDescription": "All successful Ld/St dispatches for this thread that were an L2miss", - "PublicDescription": "" - }, {, "EventCode": 
"0x27086", "EventName": "PM_L2_GROUP_PUMP", "BriefDescription": "RC requests that were on Node Pump attempts", "PublicDescription": "" }, - {, - "EventCode": "0x626084", - "EventName": "PM_L2_GRP_GUESS_CORRECT", - "BriefDescription": "L2 guess grp and guess was correct (data intra-6chip AND ^on-chip)", - "PublicDescription": "" - }, - {, - "EventCode": "0x626086", - "EventName": "PM_L2_GRP_GUESS_WRONG", - "BriefDescription": "L2 guess grp and guess was not correct (ie data on-chip OR beyond-6chip)", - "PublicDescription": "" - }, - {, - "EventCode": "0x427084", - "EventName": "PM_L2_IC_INV", - "BriefDescription": "Icache Invalidates from L2", - "PublicDescription": "" - }, - {, - "EventCode": "0x436088", - "EventName": "PM_L2_INST", - "BriefDescription": "All successful I-side dispatches for this thread (excludes i_l2mru_tch reqs)", - "PublicDescription": "" - }, - {, - "EventCode": "0x43608a", - "EventName": "PM_L2_INST_MISS", - "BriefDescription": "All successful i-side dispatches that were an L2miss for this thread (excludes i_l2mru_tch reqs)", - "PublicDescription": "" - }, - {, - "EventCode": "0x416080", - "EventName": "PM_L2_LD", - "BriefDescription": "All successful D-side Load dispatches for this thread", - "PublicDescription": "" - }, - {, - "EventCode": "0x437088", - "EventName": "PM_L2_LD_DISP", - "BriefDescription": "All successful load dispatches", - "PublicDescription": "" - }, - {, - "EventCode": "0x43708a", - "EventName": "PM_L2_LD_HIT", - "BriefDescription": "All successful load dispatches that were L2 hits", - "PublicDescription": "" - }, - {, - "EventCode": "0x426084", - "EventName": "PM_L2_LD_MISS", - "BriefDescription": "All successful D-Side Load dispatches that were an L2miss for this thread", - "PublicDescription": "" - }, - {, - "EventCode": "0x616080", - "EventName": "PM_L2_LOC_GUESS_CORRECT", - "BriefDescription": "L2 guess loc and guess was correct (ie data local)", - "PublicDescription": "" - }, - {, - "EventCode": "0x616082", - "EventName": "PM_L2_LOC_GUESS_WRONG", - "BriefDescription": "L2 guess loc and guess was not correct (ie data not on chip)", - "PublicDescription": "" - }, - {, - "EventCode": "0x516080", - "EventName": "PM_L2_RCLD_DISP", - "BriefDescription": "L2 RC load dispatch attempt", - "PublicDescription": "" - }, - {, - "EventCode": "0x516082", - "EventName": "PM_L2_RCLD_DISP_FAIL_ADDR", - "BriefDescription": "L2 RC load dispatch attempt failed due to address collision with RC/CO/SN/SQ", - "PublicDescription": "" - }, - {, - "EventCode": "0x526084", - "EventName": "PM_L2_RCLD_DISP_FAIL_OTHER", - "BriefDescription": "L2 RC load dispatch attempt failed due to other reasons", - "PublicDescription": "" - }, - {, - "EventCode": "0x536088", - "EventName": "PM_L2_RCST_DISP", - "BriefDescription": "L2 RC store dispatch attempt", - "PublicDescription": "" - }, - {, - "EventCode": "0x53608a", - "EventName": "PM_L2_RCST_DISP_FAIL_ADDR", - "BriefDescription": "L2 RC store dispatch attempt failed due to address collision with RC/CO/SN/SQ", - "PublicDescription": "" - }, - {, - "EventCode": "0x54608c", - "EventName": "PM_L2_RCST_DISP_FAIL_OTHER", - "BriefDescription": "L2 RC store dispatch attempt failed due to other reasons", - "PublicDescription": "" - }, - {, - "EventCode": "0x537088", - "EventName": "PM_L2_RC_ST_DONE", - "BriefDescription": "RC did st to line that was Tx or Sx", - "PublicDescription": "" - }, - {, - "EventCode": "0x63708a", - "EventName": "PM_L2_RTY_LD", - "BriefDescription": "RC retries on PB for any load from core", - 
"PublicDescription": "" - }, {, "EventCode": "0x3708a", "EventName": "PM_L2_RTY_ST", "BriefDescription": "RC retries on PB for any store from core", "PublicDescription": "" }, - {, - "EventCode": "0x54708c", - "EventName": "PM_L2_SN_M_RD_DONE", - "BriefDescription": "SNP dispatched for a read and was M", - "PublicDescription": "" - }, - {, - "EventCode": "0x54708e", - "EventName": "PM_L2_SN_M_WR_DONE", - "BriefDescription": "SNP dispatched for a write and was M", - "PublicDescription": "" - }, - {, - "EventCode": "0x53708a", - "EventName": "PM_L2_SN_SX_I_DONE", - "BriefDescription": "SNP dispatched and went from Sx or Tx to Ix", - "PublicDescription": "" - }, {, "EventCode": "0x17080", "EventName": "PM_L2_ST", "BriefDescription": "All successful D-side store dispatches for this thread", "PublicDescription": "" }, - {, - "EventCode": "0x44708c", - "EventName": "PM_L2_ST_DISP", - "BriefDescription": "All successful store dispatches", - "PublicDescription": "" - }, - {, - "EventCode": "0x44708e", - "EventName": "PM_L2_ST_HIT", - "BriefDescription": "All successful store dispatches that were L2Hits", - "PublicDescription": "" - }, {, "EventCode": "0x17082", "EventName": "PM_L2_ST_MISS", "BriefDescription": "All successful D-side store dispatches for this thread that were L2 Miss", "PublicDescription": "" }, - {, - "EventCode": "0x636088", - "EventName": "PM_L2_SYS_GUESS_CORRECT", - "BriefDescription": "L2 guess sys and guess was correct (ie data beyond-6chip)", - "PublicDescription": "" - }, - {, - "EventCode": "0x63608a", - "EventName": "PM_L2_SYS_GUESS_WRONG", - "BriefDescription": "L2 guess sys and guess was not correct (ie data ^beyond-6chip)", - "PublicDescription": "" - }, - {, - "EventCode": "0x617080", - "EventName": "PM_L2_SYS_PUMP", - "BriefDescription": "RC requests that were system pump attempts", - "PublicDescription": "" - }, {, "EventCode": "0x1e05e", "EventName": "PM_L2_TM_REQ_ABORT", @@ -1961,36 +1739,12 @@ "BriefDescription": "TM marked store abort", "PublicDescription": "" }, - {, - "EventCode": "0x23808a", - "EventName": "PM_L3_CINJ", - "BriefDescription": "l3 ci of cache inject", - "PublicDescription": "" - }, - {, - "EventCode": "0x128084", - "EventName": "PM_L3_CI_HIT", - "BriefDescription": "L3 Castins Hit (total count", - "PublicDescription": "" - }, - {, - "EventCode": "0x128086", - "EventName": "PM_L3_CI_MISS", - "BriefDescription": "L3 castins miss (total count", - "PublicDescription": "" - }, {, "EventCode": "0x819082", "EventName": "PM_L3_CI_USAGE", "BriefDescription": "rotating sample of 16 CI or CO actives", "PublicDescription": "" }, - {, - "EventCode": "0x438088", - "EventName": "PM_L3_CO", - "BriefDescription": "l3 castout occurring ( does not include casthrough or log writes (cinj/dmaw)", - "PublicDescription": "" - }, {, "EventCode": "0x83908b", "EventName": "PM_L3_CO0_ALLOC", @@ -2009,120 +1763,18 @@ "BriefDescription": "L3 CO to L3.1 OR of port 0 and 1 ( lossy)", "PublicDescription": "" }, - {, - "EventCode": "0x238088", - "EventName": "PM_L3_CO_LCO", - "BriefDescription": "Total L3 castouts occurred on LCO", - "PublicDescription": "" - }, {, "EventCode": "0x28084", "EventName": "PM_L3_CO_MEM", "BriefDescription": "L3 CO to memory OR of port 0 and 1 ( lossy)", "PublicDescription": "" }, - {, - "EventCode": "0xb19082", - "EventName": "PM_L3_GRP_GUESS_CORRECT", - "BriefDescription": "Initial scope=group and data from same group (near) (pred successful)", - "PublicDescription": "" - }, - {, - "EventCode": "0xb3908a", - "EventName": 
"PM_L3_GRP_GUESS_WRONG_HIGH", - "BriefDescription": "Initial scope=group but data from local node. Predition too high", - "PublicDescription": "" - }, - {, - "EventCode": "0xb39088", - "EventName": "PM_L3_GRP_GUESS_WRONG_LOW", - "BriefDescription": "Initial scope=group but data from outside group (far or rem). Prediction too Low", - "PublicDescription": "" - }, - {, - "EventCode": "0x218080", - "EventName": "PM_L3_HIT", - "BriefDescription": "L3 Hits", - "PublicDescription": "" - }, - {, - "EventCode": "0x138088", - "EventName": "PM_L3_L2_CO_HIT", - "BriefDescription": "L2 castout hits", - "PublicDescription": "" - }, - {, - "EventCode": "0x13808a", - "EventName": "PM_L3_L2_CO_MISS", - "BriefDescription": "L2 castout miss", - "PublicDescription": "" - }, - {, - "EventCode": "0x14808c", - "EventName": "PM_L3_LAT_CI_HIT", - "BriefDescription": "L3 Lateral Castins Hit", - "PublicDescription": "" - }, - {, - "EventCode": "0x14808e", - "EventName": "PM_L3_LAT_CI_MISS", - "BriefDescription": "L3 Lateral Castins Miss", - "PublicDescription": "" - }, - {, - "EventCode": "0x228084", - "EventName": "PM_L3_LD_HIT", - "BriefDescription": "L3 demand LD Hits", - "PublicDescription": "" - }, - {, - "EventCode": "0x228086", - "EventName": "PM_L3_LD_MISS", - "BriefDescription": "L3 demand LD Miss", - "PublicDescription": "" - }, {, "EventCode": "0x1e052", "EventName": "PM_L3_LD_PREF", "BriefDescription": "L3 Load Prefetches", "PublicDescription": "" }, - {, - "EventCode": "0xb19080", - "EventName": "PM_L3_LOC_GUESS_CORRECT", - "BriefDescription": "initial scope=node/chip and data from local node (local) (pred successful)", - "PublicDescription": "" - }, - {, - "EventCode": "0xb29086", - "EventName": "PM_L3_LOC_GUESS_WRONG", - "BriefDescription": "Initial scope=node but data from out side local node (near or far or rem). 
Prediction too Low", - "PublicDescription": "" - }, - {, - "EventCode": "0x218082", - "EventName": "PM_L3_MISS", - "BriefDescription": "L3 Misses", - "PublicDescription": "" - }, - {, - "EventCode": "0x54808c", - "EventName": "PM_L3_P0_CO_L31", - "BriefDescription": "l3 CO to L3.1 (lco) port 0", - "PublicDescription": "" - }, - {, - "EventCode": "0x538088", - "EventName": "PM_L3_P0_CO_MEM", - "BriefDescription": "l3 CO to memory port 0", - "PublicDescription": "" - }, - {, - "EventCode": "0x929084", - "EventName": "PM_L3_P0_CO_RTY", - "BriefDescription": "L3 CO received retry port 0", - "PublicDescription": "" - }, {, "EventCode": "0xa29084", "EventName": "PM_L3_P0_GRP_PUMP", @@ -2147,120 +1799,6 @@ "BriefDescription": "L3 LCO received retry port 0", "PublicDescription": "" }, - {, - "EventCode": "0xa19080", - "EventName": "PM_L3_P0_NODE_PUMP", - "BriefDescription": "L3 pf sent with nodal scope port 0", - "PublicDescription": "" - }, - {, - "EventCode": "0x919080", - "EventName": "PM_L3_P0_PF_RTY", - "BriefDescription": "L3 PF received retry port 0", - "PublicDescription": "" - }, - {, - "EventCode": "0x939088", - "EventName": "PM_L3_P0_SN_HIT", - "BriefDescription": "L3 snoop hit port 0", - "PublicDescription": "" - }, - {, - "EventCode": "0x118080", - "EventName": "PM_L3_P0_SN_INV", - "BriefDescription": "Port0 snooper detects someone doing a store to a line thats Sx", - "PublicDescription": "" - }, - {, - "EventCode": "0x94908c", - "EventName": "PM_L3_P0_SN_MISS", - "BriefDescription": "L3 snoop miss port 0", - "PublicDescription": "" - }, - {, - "EventCode": "0xa39088", - "EventName": "PM_L3_P0_SYS_PUMP", - "BriefDescription": "L3 pf sent with sys scope port 0", - "PublicDescription": "" - }, - {, - "EventCode": "0x54808e", - "EventName": "PM_L3_P1_CO_L31", - "BriefDescription": "l3 CO to L3.1 (lco) port 1", - "PublicDescription": "" - }, - {, - "EventCode": "0x53808a", - "EventName": "PM_L3_P1_CO_MEM", - "BriefDescription": "l3 CO to memory port 1", - "PublicDescription": "" - }, - {, - "EventCode": "0x929086", - "EventName": "PM_L3_P1_CO_RTY", - "BriefDescription": "L3 CO received retry port 1", - "PublicDescription": "" - }, - {, - "EventCode": "0xa29086", - "EventName": "PM_L3_P1_GRP_PUMP", - "BriefDescription": "L3 pf sent with grp scope port 1", - "PublicDescription": "" - }, - {, - "EventCode": "0x528086", - "EventName": "PM_L3_P1_LCO_DATA", - "BriefDescription": "lco sent with data port 1", - "PublicDescription": "" - }, - {, - "EventCode": "0x518082", - "EventName": "PM_L3_P1_LCO_NO_DATA", - "BriefDescription": "dataless l3 lco sent port 1", - "PublicDescription": "" - }, - {, - "EventCode": "0xa4908e", - "EventName": "PM_L3_P1_LCO_RTY", - "BriefDescription": "L3 LCO received retry port 1", - "PublicDescription": "" - }, - {, - "EventCode": "0xa19082", - "EventName": "PM_L3_P1_NODE_PUMP", - "BriefDescription": "L3 pf sent with nodal scope port 1", - "PublicDescription": "" - }, - {, - "EventCode": "0x919082", - "EventName": "PM_L3_P1_PF_RTY", - "BriefDescription": "L3 PF received retry port 1", - "PublicDescription": "" - }, - {, - "EventCode": "0x93908a", - "EventName": "PM_L3_P1_SN_HIT", - "BriefDescription": "L3 snoop hit port 1", - "PublicDescription": "" - }, - {, - "EventCode": "0x118082", - "EventName": "PM_L3_P1_SN_INV", - "BriefDescription": "Port1 snooper detects someone doing a store to a line thats Sx", - "PublicDescription": "" - }, - {, - "EventCode": "0x94908e", - "EventName": "PM_L3_P1_SN_MISS", - "BriefDescription": "L3 snoop miss port 1", - 
"PublicDescription": "" - }, - {, - "EventCode": "0xa3908a", - "EventName": "PM_L3_P1_SYS_PUMP", - "BriefDescription": "L3 pf sent with sys scope port 1", - "PublicDescription": "" - }, {, "EventCode": "0x84908d", "EventName": "PM_L3_PF0_ALLOC", @@ -2273,12 +1811,6 @@ "BriefDescription": "lifetime, sample of PF machine 0 valid", "PublicDescription": "" }, - {, - "EventCode": "0x428084", - "EventName": "PM_L3_PF_HIT_L3", - "BriefDescription": "l3 pf hit in l3", - "PublicDescription": "" - }, {, "EventCode": "0x18080", "EventName": "PM_L3_PF_MISS_L3", @@ -2369,42 +1901,12 @@ "BriefDescription": "Data stream touchto L3", "PublicDescription": "" }, - {, - "EventCode": "0xb29084", - "EventName": "PM_L3_SYS_GUESS_CORRECT", - "BriefDescription": "Initial scope=system and data from outside group (far or rem)(pred successful)", - "PublicDescription": "" - }, - {, - "EventCode": "0xb4908c", - "EventName": "PM_L3_SYS_GUESS_WRONG", - "BriefDescription": "Initial scope=system but data from local or near. Predction too high", - "PublicDescription": "" - }, - {, - "EventCode": "0x24808e", - "EventName": "PM_L3_TRANS_PF", - "BriefDescription": "L3 Transient prefetch", - "PublicDescription": "" - }, {, "EventCode": "0x18081", "EventName": "PM_L3_WI0_ALLOC", "BriefDescription": "lifetime, sample of Write Inject machine 0 valid", "PublicDescription": "0.0" }, - {, - "EventCode": "0x418080", - "EventName": "PM_L3_WI0_BUSY", - "BriefDescription": "lifetime, sample of Write Inject machine 0 valid", - "PublicDescription": "" - }, - {, - "EventCode": "0x418082", - "EventName": "PM_L3_WI_USAGE", - "BriefDescription": "rotating sample of 8 WI actives", - "PublicDescription": "" - }, {, "EventCode": "0xc080", "EventName": "PM_LD_REF_L1_LSU0", @@ -3311,12 +2813,6 @@ "BriefDescription": "Dispatch time non favored tbegin", "PublicDescription": "" }, - {, - "EventCode": "0x328084", - "EventName": "PM_NON_TM_RST_SC", - "BriefDescription": "non tm snp rst tm sc", - "PublicDescription": "" - }, {, "EventCode": "0x2001a", "EventName": "PM_NTCG_ALL_FIN", @@ -3419,24 +2915,6 @@ "BriefDescription": "Continuous 16 cycle(2to1) window where this signals rotates thru sampling each L2 RC machine busy. 
PMU uses this wave to then do 16 cyc count to sample total number of machs running", "PublicDescription": "" }, - {, - "EventCode": "0x34808e", - "EventName": "PM_RD_CLEARING_SC", - "BriefDescription": "rd clearing sc", - "PublicDescription": "" - }, - {, - "EventCode": "0x34808c", - "EventName": "PM_RD_FORMING_SC", - "BriefDescription": "rd forming sc", - "PublicDescription": "" - }, - {, - "EventCode": "0x428086", - "EventName": "PM_RD_HIT_PF", - "BriefDescription": "rd machine hit l3 pf machine", - "PublicDescription": "" - }, {, "EventCode": "0x20004", "EventName": "PM_REAL_SRQ_FULL", @@ -3503,18 +2981,6 @@ "BriefDescription": "TLBIE snoop", "PublicDescription": "TLBIE snoopSnoop TLBIE" }, - {, - "EventCode": "0x338088", - "EventName": "PM_SNP_TM_HIT_M", - "BriefDescription": "snp tm st hit m mu", - "PublicDescription": "" - }, - {, - "EventCode": "0x33808a", - "EventName": "PM_SNP_TM_HIT_T", - "BriefDescription": "snp tm_st_hit t tn te", - "PublicDescription": "" - }, {, "EventCode": "0x4608c", "EventName": "PM_SN_USAGE", @@ -3533,12 +2999,6 @@ "BriefDescription": "STCX executed reported at sent to nest", "PublicDescription": "STCX executed reported at sent to nest42" }, - {, - "EventCode": "0x717080", - "EventName": "PM_ST_CAUSED_FAIL", - "BriefDescription": "Non TM St caused any thread to fail", - "PublicDescription": "" - }, {, "EventCode": "0x3090", "EventName": "PM_SWAP_CANCEL", @@ -3623,18 +3083,6 @@ "BriefDescription": "Tm any tbegin", "PublicDescription": "" }, - {, - "EventCode": "0x318082", - "EventName": "PM_TM_CAM_OVERFLOW", - "BriefDescription": "l3 tm cam overflow during L2 co of SC", - "PublicDescription": "" - }, - {, - "EventCode": "0x74708c", - "EventName": "PM_TM_CAP_OVERFLOW", - "BriefDescription": "TM Footprint Capactiy Overflow", - "PublicDescription": "" - }, {, "EventCode": "0x20ba", "EventName": "PM_TM_END_ALL", @@ -3689,48 +3137,6 @@ "BriefDescription": "Transactional conflict from LSU, whatever gets reported to texas", "PublicDescription": "Transactional conflict from LSU, whatever gets reported to texas 42" }, - {, - "EventCode": "0x727086", - "EventName": "PM_TM_FAV_CAUSED_FAIL", - "BriefDescription": "TM Load (fav) caused another thread to fail", - "PublicDescription": "" - }, - {, - "EventCode": "0x717082", - "EventName": "PM_TM_LD_CAUSED_FAIL", - "BriefDescription": "Non TM Ld caused any thread to fail", - "PublicDescription": "" - }, - {, - "EventCode": "0x727084", - "EventName": "PM_TM_LD_CONF", - "BriefDescription": "TM Load (fav or non-fav) ran into conflict (failed)", - "PublicDescription": "" - }, - {, - "EventCode": "0x328086", - "EventName": "PM_TM_RST_SC", - "BriefDescription": "tm snp rst tm sc", - "PublicDescription": "" - }, - {, - "EventCode": "0x318080", - "EventName": "PM_TM_SC_CO", - "BriefDescription": "l3 castout tm Sc line", - "PublicDescription": "" - }, - {, - "EventCode": "0x73708a", - "EventName": "PM_TM_ST_CAUSED_FAIL", - "BriefDescription": "TM Store (fav or non-fav) caused another thread to fail", - "PublicDescription": "" - }, - {, - "EventCode": "0x737088", - "EventName": "PM_TM_ST_CONF", - "BriefDescription": "TM Store (fav or non-fav) ran into conflict (failed)", - "PublicDescription": "" - }, {, "EventCode": "0x20bc", "EventName": "PM_TM_TBEGIN", From af7a14a750b8eb3cdb26ec15344616ca170b06f2 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Sat, 16 Mar 2019 16:05:41 +0800 Subject: [PATCH 667/855] perf tools: Add doc about how to build perf with Asan and UBSan AddressSanitizer (or ASan) and UndefinedBehaviorSanitizer (or 
UBSan) are very useful tools to detect program bugs: - AddressSanitizer (or ASan) is a GCC feature that detects memory corruption bugs such as buffer overflows and memory leaks. - UndefinedBehaviorSanitizer (or UBSan) is a fast undefined behavior detector supported by GCC. UBSan detects undefined behaviors of programs at runtime. This patch adds a document about how to use them on perf. Later patches will fix some of the issues disclosed by them. Signed-off-by: Changbin Du Reviewed-by: Jiri Olsa Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Steven Rostedt (VMware) Link: http://lkml.kernel.org/r/20190316080556.3075-2-changbin.du@gmail.com [ Make some changes based on comments made by Jiri Olsa ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/Build.txt | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tools/perf/Documentation/Build.txt b/tools/perf/Documentation/Build.txt index f6fc6507ba55..3766886c4bca 100644 --- a/tools/perf/Documentation/Build.txt +++ b/tools/perf/Documentation/Build.txt @@ -47,3 +47,27 @@ Those objects are then used in final linking: NOTE this description is omitting other libraries involved, only focusing on build framework outcomes + +3) Build with ASan or UBSan +========================== + $ cd tools/perf + $ make DESTDIR=/usr + $ make DESTDIR=/usr install + +AddressSanitizer (or ASan) is a GCC feature that detects memory corruption bugs +such as buffer overflows and memory leaks. + + $ cd tools/perf + $ make DEBUG=1 EXTRA_CFLAGS='-fno-omit-frame-pointer -fsanitize=address' + $ ASAN_OPTIONS=log_path=asan.log ./perf record -a + +ASan outputs all detected issues into a log file named 'asan.log.'. + +UndefinedBehaviorSanitizer (or UBSan) is a fast undefined behavior detector +supported by GCC. UBSan detects undefined behaviors of programs at runtime. + + $ cd tools/perf + $ make DEBUG=1 EXTRA_CFLAGS='-fno-omit-frame-pointer -fsanitize=undefined' + $ UBSAN_OPTIONS=print_stacktrace=1 ./perf record -a + +If UBSan detects any problem at runtime, it outputs a “runtime error:” message. 
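As a quick sanity check that a sanitizer-enabled toolchain behaves as expected before rebuilding all of perf, one can compile a deliberately leaky program with the same flag; this snippet is not part of the patch and assumes leak detection is enabled by default in the ASan runtime:

    /* leak.c, not perf code; build with: gcc -g -fsanitize=address leak.c */
    #include <stdlib.h>

    int main(void)
    {
            void *p = malloc(64);   /* never freed: expect a "direct leak of 64 byte(s)" report */
            (void)p;
            return 0;
    }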
From 39df730b09774bd860e39ea208a48d15078236cb Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Sat, 16 Mar 2019 16:05:42 +0800 Subject: [PATCH 668/855] perf list: Don't forget to drop the reference to the allocated thread_map Detected via gcc's ASan: Direct leak of 2048 byte(s) in 64 object(s) allocated from: 6 #0 0x7f606512e370 in __interceptor_realloc (/usr/lib/x86_64-linux-gnu/libasan.so.5+0xee370) 7 #1 0x556b0f1d7ddd in thread_map__realloc util/thread_map.c:43 8 #2 0x556b0f1d84c7 in thread_map__new_by_tid util/thread_map.c:85 9 #3 0x556b0f0e045e in is_event_supported util/parse-events.c:2250 10 #4 0x556b0f0e1aa1 in print_hwcache_events util/parse-events.c:2382 11 #5 0x556b0f0e3231 in print_events util/parse-events.c:2514 12 #6 0x556b0ee0a66e in cmd_list /home/changbin/work/linux/tools/perf/builtin-list.c:58 13 #7 0x556b0f01e0ae in run_builtin /home/changbin/work/linux/tools/perf/perf.c:302 14 #8 0x556b0f01e859 in handle_internal_command /home/changbin/work/linux/tools/perf/perf.c:354 15 #9 0x556b0f01edc8 in run_argv /home/changbin/work/linux/tools/perf/perf.c:398 16 #10 0x556b0f01f71f in main /home/changbin/work/linux/tools/perf/perf.c:520 17 #11 0x7f6062ccf09a in __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x2409a) Signed-off-by: Changbin Du Reviewed-by: Jiri Olsa Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Steven Rostedt (VMware) Fixes: 89896051f8da ("perf tools: Do not put a variable sized type not at the end of a struct") Link: http://lkml.kernel.org/r/20190316080556.3075-3-changbin.du@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 4dcc01b2532c..2e9035c4c252 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -2271,6 +2271,7 @@ static bool is_event_supported(u8 type, unsigned config) perf_evsel__delete(evsel); } + thread_map__put(tmap); return ret; } From 11c1ea6f1a9bc97bf857fd12f72eacb6c69794e2 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Sat, 16 Mar 2019 16:05:43 +0800 Subject: [PATCH 669/855] perf tools: Fix errors under optimization level '-Og' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Optimization level '-Og' offers a reasonable level of optimization while maintaining fast compilation and a good debugging experience. This patch tries to make it work. $ make DEBUG=1 EXTRA_CFLAGS='-Og' bench/epoll-ctl.c: In function ‘do_threads’: bench/epoll-ctl.c:274:9: error: ‘ret’ may be used uninitialized in this function [-Werror=maybe-uninitialized] return ret; ^~~ ... 
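To make the warning class concrete: at -Og enough data-flow analysis runs for -Wmaybe-uninitialized to flag locals that are written only on some paths, which is the shape fixed in the hunks below by giving the local a safe initial value. A minimal sketch with made-up names, not taken from the perf sources (gcc -Og -Wall -c reproduces the warning):

    /* hypothetical reduction of the warning, not perf code */
    static int step(int i)
    {
            return i > 3 ? -1 : 0;
    }

    static int do_work(int n)
    {
            int ret;                        /* only written inside the loop */
            int i;

            for (i = 0; i < n; i++) {
                    ret = step(i);
                    if (ret)
                            break;
            }
            return ret;                     /* n == 0 path: 'ret' may be used uninitialized */
    }

The fix applied to the epoll benchmarks below is simply 'int ret = 0;'.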
Signed-off-by: Changbin Du Reviewed-by: Jiri Olsa Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Steven Rostedt (VMware) Link: http://lkml.kernel.org/r/20190316080556.3075-4-changbin.du@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/bpf/libbpf.c | 2 +- tools/perf/bench/epoll-ctl.c | 2 +- tools/perf/bench/epoll-wait.c | 2 +- tools/perf/tests/backward-ring-buffer.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index f5eb60379c8d..4884557aa17f 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -622,7 +622,7 @@ bpf_object__init_maps(struct bpf_object *obj, int flags) bool strict = !(flags & MAPS_RELAX_COMPAT); int i, map_idx, map_def_sz, nr_maps = 0; Elf_Scn *scn; - Elf_Data *data; + Elf_Data *data = NULL; Elf_Data *symbols = obj->efile.symbols; if (obj->efile.maps_shndx < 0) diff --git a/tools/perf/bench/epoll-ctl.c b/tools/perf/bench/epoll-ctl.c index 0c0a6e824934..2af067859966 100644 --- a/tools/perf/bench/epoll-ctl.c +++ b/tools/perf/bench/epoll-ctl.c @@ -224,7 +224,7 @@ static int do_threads(struct worker *worker, struct cpu_map *cpu) pthread_attr_t thread_attr, *attrp = NULL; cpu_set_t cpuset; unsigned int i, j; - int ret; + int ret = 0; if (!noaffinity) pthread_attr_init(&thread_attr); diff --git a/tools/perf/bench/epoll-wait.c b/tools/perf/bench/epoll-wait.c index 5a11534e96a0..fe85448abd45 100644 --- a/tools/perf/bench/epoll-wait.c +++ b/tools/perf/bench/epoll-wait.c @@ -293,7 +293,7 @@ static int do_threads(struct worker *worker, struct cpu_map *cpu) pthread_attr_t thread_attr, *attrp = NULL; cpu_set_t cpuset; unsigned int i, j; - int ret, events = EPOLLIN; + int ret = 0, events = EPOLLIN; if (oneshot) events |= EPOLLONESHOT; diff --git a/tools/perf/tests/backward-ring-buffer.c b/tools/perf/tests/backward-ring-buffer.c index 6d598cc071ae..1a9c3becf5ff 100644 --- a/tools/perf/tests/backward-ring-buffer.c +++ b/tools/perf/tests/backward-ring-buffer.c @@ -18,7 +18,7 @@ static void testcase(void) int i; for (i = 0; i < NR_ITERS; i++) { - char proc_name[10]; + char proc_name[15]; snprintf(proc_name, sizeof(proc_name), "p:%d\n", i); prctl(PR_SET_NAME, proc_name); From 9b40dff7ba3caaf0d1919f98e136fa3400bd34aa Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Sat, 16 Mar 2019 16:05:44 +0800 Subject: [PATCH 670/855] perf config: Fix an error in the config template documentation The option 'sort-order' should be 'sort_order'. 
Signed-off-by: Changbin Du Reviewed-by: Jiri Olsa Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Milian Wolff Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Steven Rostedt (VMware) Fixes: 893c5c798be9 ("perf config: Show default report configuration in example and docs") Link: http://lkml.kernel.org/r/20190316080556.3075-5-changbin.du@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-config.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt index 95054a8176a2..462b3cde0675 100644 --- a/tools/perf/Documentation/perf-config.txt +++ b/tools/perf/Documentation/perf-config.txt @@ -114,7 +114,7 @@ Given a $HOME/.perfconfig like this: [report] # Defaults - sort-order = comm,dso,symbol + sort_order = comm,dso,symbol percent-limit = 0 queue-size = 0 children = true From 54569ba4b06d5baedae4614bde33a25a191473ba Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Sat, 16 Mar 2019 16:05:45 +0800 Subject: [PATCH 671/855] perf config: Fix a memory leak in collect_config() Detected with gcc's ASan: Direct leak of 66 byte(s) in 5 object(s) allocated from: #0 0x7ff3b1f32070 in __interceptor_strdup (/usr/lib/x86_64-linux-gnu/libasan.so.5+0x3b070) #1 0x560c8761034d in collect_config util/config.c:597 #2 0x560c8760d9cb in get_value util/config.c:169 #3 0x560c8760dfd7 in perf_parse_file util/config.c:285 #4 0x560c8760e0d2 in perf_config_from_file util/config.c:476 #5 0x560c876108fd in perf_config_set__init util/config.c:661 #6 0x560c87610c72 in perf_config_set__new util/config.c:709 #7 0x560c87610d2f in perf_config__init util/config.c:718 #8 0x560c87610e5d in perf_config util/config.c:730 #9 0x560c875ddea0 in main /home/changbin/work/linux/tools/perf/perf.c:442 #10 0x7ff3afb8609a in __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x2409a) Signed-off-by: Changbin Du Reviewed-by: Jiri Olsa Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Steven Rostedt (VMware) Cc: Taeung Song Fixes: 20105ca1240c ("perf config: Introduce perf_config_set class") Link: http://lkml.kernel.org/r/20190316080556.3075-6-changbin.du@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/config.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c index fa092511c52b..7e3c1b60120c 100644 --- a/tools/perf/util/config.c +++ b/tools/perf/util/config.c @@ -633,11 +633,10 @@ static int collect_config(const char *var, const char *value, } ret = set_value(item, value); - return ret; out_free: free(key); - return -1; + return ret; } int perf_config_set__collect(struct perf_config_set *set, const char *file_name, From 8bde8516893da5a5fdf06121f74d11b52ab92df5 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Sat, 16 Mar 2019 16:05:46 +0800 Subject: [PATCH 672/855] perf build-id: Fix memory leak in print_sdt_events() Detected with gcc's ASan: Direct leak of 4356 byte(s) in 120 object(s) allocated from: #0 0x7ff1a2b5a070 in __interceptor_strdup (/usr/lib/x86_64-linux-gnu/libasan.so.5+0x3b070) #1 0x55719aef4814 in build_id_cache__origname util/build-id.c:215 #2 0x55719af649b6 in print_sdt_events util/parse-events.c:2339 #3 0x55719af66272 in print_events util/parse-events.c:2542 #4 0x55719ad1ecaa in cmd_list /home/changbin/work/linux/tools/perf/builtin-list.c:58 #5 0x55719aec745d in run_builtin /home/changbin/work/linux/tools/perf/perf.c:302 #6 0x55719aec7d1a in handle_internal_command 
/home/changbin/work/linux/tools/perf/perf.c:354 #7 0x55719aec8184 in run_argv /home/changbin/work/linux/tools/perf/perf.c:398 #8 0x55719aeca41a in main /home/changbin/work/linux/tools/perf/perf.c:520 #9 0x7ff1a07ae09a in __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x2409a) Signed-off-by: Changbin Du Reviewed-by: Jiri Olsa Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Steven Rostedt (VMware) Fixes: 40218daea1db ("perf list: Show SDT and pre-cached events") Link: http://lkml.kernel.org/r/20190316080556.3075-7-changbin.du@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/build-id.c | 1 + tools/perf/util/parse-events.c | 1 + 2 files changed, 2 insertions(+) diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c index bff0d17920ed..0c5517a8d0b7 100644 --- a/tools/perf/util/build-id.c +++ b/tools/perf/util/build-id.c @@ -185,6 +185,7 @@ char *build_id_cache__linkname(const char *sbuild_id, char *bf, size_t size) return bf; } +/* The caller is responsible to free the returned buffer. */ char *build_id_cache__origname(const char *sbuild_id) { char *linkname; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 2e9035c4c252..5ef4939408f2 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -2342,6 +2342,7 @@ void print_sdt_events(const char *subsys_glob, const char *event_glob, printf(" %-50s [%s]\n", buf, "SDT event"); free(buf); } + free(path); } else printf(" %-50s [%s]\n", nd->s, "SDT event"); if (nd2) { From 0dba9e4be95b59e77060645ca8e37ca3231061f5 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Sat, 16 Mar 2019 16:05:47 +0800 Subject: [PATCH 673/855] perf top: Delete the evlist before perf_session, fixing heap-use-after-free issue The evlist should be destroyed before the perf session. 
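Put differently, the hist entries hanging off the evsels still hold references to threads whose storage is owned by the session, so the session has to outlive the evlist during teardown. Condensed from the diff below, a sketch of the resulting cmd_top() flow (not a standalone program):

    top.session = perf_session__new(NULL, false, NULL);
    if (top.session == NULL) {
            status = -1;
            goto out_delete_evlist;
    }
    status = __cmd_top(&top);
    out_delete_evlist:
    perf_evlist__delete(top.evlist);        /* borrower first: drops hist entries */
    perf_session__delete(top.session);      /* owner last: frees the thread storage */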
Detected with gcc's ASan: ================================================================= ==27350==ERROR: AddressSanitizer: heap-use-after-free on address 0x62b000002e38 at pc 0x5611da276999 bp 0x7ffce8f1d1a0 sp 0x7ffce8f1d190 WRITE of size 8 at 0x62b000002e38 thread T0 #0 0x5611da276998 in __list_del /home/work/linux/tools/include/linux/list.h:89 #1 0x5611da276d4a in __list_del_entry /home/work/linux/tools/include/linux/list.h:102 #2 0x5611da276e77 in list_del_init /home/work/linux/tools/include/linux/list.h:145 #3 0x5611da2781cd in thread__put util/thread.c:130 #4 0x5611da2cc0a8 in __thread__zput util/thread.h:68 #5 0x5611da2d2dcb in hist_entry__delete util/hist.c:1148 #6 0x5611da2cdf91 in hists__delete_entry util/hist.c:337 #7 0x5611da2ce19e in hists__delete_entries util/hist.c:365 #8 0x5611da2db2ab in hists__delete_all_entries util/hist.c:2639 #9 0x5611da2db325 in hists_evsel__exit util/hist.c:2651 #10 0x5611da1c5352 in perf_evsel__exit util/evsel.c:1304 #11 0x5611da1c5390 in perf_evsel__delete util/evsel.c:1309 #12 0x5611da1b35f0 in perf_evlist__purge util/evlist.c:124 #13 0x5611da1b38e2 in perf_evlist__delete util/evlist.c:148 #14 0x5611da069781 in cmd_top /home/changbin/work/linux/tools/perf/builtin-top.c:1645 #15 0x5611da17d038 in run_builtin /home/changbin/work/linux/tools/perf/perf.c:302 #16 0x5611da17d577 in handle_internal_command /home/changbin/work/linux/tools/perf/perf.c:354 #17 0x5611da17d97b in run_argv /home/changbin/work/linux/tools/perf/perf.c:398 #18 0x5611da17e0e9 in main /home/changbin/work/linux/tools/perf/perf.c:520 #19 0x7fdcc970f09a in __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x2409a) #20 0x5611d9ff35c9 in _start (/home/work/linux/tools/perf/perf+0x3e95c9) 0x62b000002e38 is located 11320 bytes inside of 27448-byte region [0x62b000000200,0x62b000006d38) freed by thread T0 here: #0 0x7fdccb04ab70 in free (/usr/lib/x86_64-linux-gnu/libasan.so.5+0xedb70) #1 0x5611da260df4 in perf_session__delete util/session.c:201 #2 0x5611da063de5 in __cmd_top /home/changbin/work/linux/tools/perf/builtin-top.c:1300 #3 0x5611da06973c in cmd_top /home/changbin/work/linux/tools/perf/builtin-top.c:1642 #4 0x5611da17d038 in run_builtin /home/changbin/work/linux/tools/perf/perf.c:302 #5 0x5611da17d577 in handle_internal_command /home/changbin/work/linux/tools/perf/perf.c:354 #6 0x5611da17d97b in run_argv /home/changbin/work/linux/tools/perf/perf.c:398 #7 0x5611da17e0e9 in main /home/changbin/work/linux/tools/perf/perf.c:520 #8 0x7fdcc970f09a in __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x2409a) previously allocated by thread T0 here: #0 0x7fdccb04b138 in calloc (/usr/lib/x86_64-linux-gnu/libasan.so.5+0xee138) #1 0x5611da26010c in zalloc util/util.h:23 #2 0x5611da260824 in perf_session__new util/session.c:118 #3 0x5611da0633a6 in __cmd_top /home/changbin/work/linux/tools/perf/builtin-top.c:1192 #4 0x5611da06973c in cmd_top /home/changbin/work/linux/tools/perf/builtin-top.c:1642 #5 0x5611da17d038 in run_builtin /home/changbin/work/linux/tools/perf/perf.c:302 #6 0x5611da17d577 in handle_internal_command /home/changbin/work/linux/tools/perf/perf.c:354 #7 0x5611da17d97b in run_argv /home/changbin/work/linux/tools/perf/perf.c:398 #8 0x5611da17e0e9 in main /home/changbin/work/linux/tools/perf/perf.c:520 #9 0x7fdcc970f09a in __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x2409a) SUMMARY: AddressSanitizer: heap-use-after-free /home/work/linux/tools/include/linux/list.h:89 in __list_del Shadow bytes around the buggy address: 0x0c567fff8570: fd fd fd fd fd fd fd fd fd fd fd 
fd fd fd fd fd 0x0c567fff8580: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd 0x0c567fff8590: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd 0x0c567fff85a0: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd 0x0c567fff85b0: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd =>0x0c567fff85c0: fd fd fd fd fd fd fd[fd]fd fd fd fd fd fd fd fd 0x0c567fff85d0: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd 0x0c567fff85e0: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd 0x0c567fff85f0: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd 0x0c567fff8600: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd 0x0c567fff8610: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd Shadow byte legend (one shadow byte represents 8 application bytes): Addressable: 00 Partially addressable: 01 02 03 04 05 06 07 Heap left redzone: fa Freed heap region: fd Stack left redzone: f1 Stack mid redzone: f2 Stack right redzone: f3 Stack after return: f5 Stack use after scope: f8 Global redzone: f9 Global init order: f6 Poisoned by user: f7 Container overflow: fc Array cookie: ac Intra object redzone: bb ASan internal: fe Left alloca redzone: ca Right alloca redzone: cb ==27350==ABORTING Signed-off-by: Changbin Du Reviewed-by: Jiri Olsa Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Steven Rostedt (VMware) Link: http://lkml.kernel.org/r/20190316080556.3075-8-changbin.du@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 42 ++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 231a90daa958..614f278235fa 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1189,23 +1189,19 @@ static int __cmd_top(struct perf_top *top) pthread_t thread, thread_process; int ret; - top->session = perf_session__new(NULL, false, NULL); - if (top->session == NULL) - return -1; - if (!top->annotation_opts.objdump_path) { ret = perf_env__lookup_objdump(&top->session->header.env, &top->annotation_opts.objdump_path); if (ret) - goto out_delete; + return ret; } ret = callchain_param__setup_sample_type(&callchain_param); if (ret) - goto out_delete; + return ret; if (perf_session__register_idle_thread(top->session) < 0) - goto out_delete; + return ret; if (top->nr_threads_synthesize > 1) perf_set_multithreaded(); @@ -1227,13 +1223,18 @@ static int __cmd_top(struct perf_top *top) if (perf_hpp_list.socket) { ret = perf_env__read_cpu_topology_map(&perf_env); - if (ret < 0) - goto out_err_cpu_topo; + if (ret < 0) { + char errbuf[BUFSIZ]; + const char *err = str_error_r(-ret, errbuf, sizeof(errbuf)); + + ui__error("Could not read the CPU topology map: %s\n", err); + return ret; + } } ret = perf_top__start_counters(top); if (ret) - goto out_delete; + return ret; top->session->evlist = top->evlist; perf_session__set_id_hdr_size(top->session); @@ -1252,7 +1253,7 @@ static int __cmd_top(struct perf_top *top) ret = -1; if (pthread_create(&thread_process, NULL, process_thread, top)) { ui__error("Could not create process thread.\n"); - goto out_delete; + return ret; } if (pthread_create(&thread, NULL, (use_browser > 0 ? 
display_thread_tui : @@ -1296,19 +1297,7 @@ out_join: out_join_thread: pthread_cond_signal(&top->qe.cond); pthread_join(thread_process, NULL); -out_delete: - perf_session__delete(top->session); - top->session = NULL; - return ret; - -out_err_cpu_topo: { - char errbuf[BUFSIZ]; - const char *err = str_error_r(-ret, errbuf, sizeof(errbuf)); - - ui__error("Could not read the CPU topology map: %s\n", err); - goto out_delete; -} } static int @@ -1639,10 +1628,17 @@ int cmd_top(int argc, const char **argv) signal(SIGWINCH, winch_sig); } + top.session = perf_session__new(NULL, false, NULL); + if (top.session == NULL) { + status = -1; + goto out_delete_evlist; + } + status = __cmd_top(&top); out_delete_evlist: perf_evlist__delete(top.evlist); + perf_session__delete(top.session); return status; } From 70c819e4bf1c5f492768b399d898d458ccdad2b6 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Sat, 16 Mar 2019 16:05:48 +0800 Subject: [PATCH 674/855] perf top: Fix error handling in cmd_top() We should go to the cleanup path, to avoid leaks, detected using gcc's ASan. Signed-off-by: Changbin Du Reviewed-by: Jiri Olsa Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Steven Rostedt (VMware) Link: http://lkml.kernel.org/r/20190316080556.3075-9-changbin.du@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 614f278235fa..2508a7a552fa 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1617,8 +1617,9 @@ int cmd_top(int argc, const char **argv) annotation_config__init(); symbol_conf.try_vmlinux_path = (symbol_conf.vmlinux_name == NULL); - if (symbol__init(NULL) < 0) - return -1; + status = symbol__init(NULL); + if (status < 0) + goto out_delete_evlist; sort__setup_elide(stdout); From cb6186aeffda4d27e56066c79e9579e7831541d3 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Sat, 16 Mar 2019 16:05:49 +0800 Subject: [PATCH 675/855] perf hist: Add missing map__put() in error case We need to map__put() before returning from failure of sample__resolve_callchain(). Detected with gcc's ASan. Signed-off-by: Changbin Du Reviewed-by: Jiri Olsa Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Krister Johansen Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Steven Rostedt (VMware) Fixes: 9c68ae98c6f7 ("perf callchain: Reference count maps") Link: http://lkml.kernel.org/r/20190316080556.3075-10-changbin.du@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hist.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 1f230285d78a..7ace7a10054d 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1111,8 +1111,10 @@ int hist_entry_iter__add(struct hist_entry_iter *iter, struct addr_location *al, err = sample__resolve_callchain(iter->sample, &callchain_cursor, &iter->parent, iter->evsel, al, max_stack_depth); - if (err) + if (err) { + map__put(alm); return err; + } err = iter->ops->prepare_entry(iter, al); if (err) From b49265e04410b97b31a5ee66ef6782c1b2d6cd2c Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Sat, 16 Mar 2019 16:05:50 +0800 Subject: [PATCH 676/855] perf map: Remove map from 'names' tree in __maps__remove() There are two trees for each map inserted by maps__insert(), so remove it from the 'names' tree in __maps__remove(). Detected with gcc's ASan. 
Signed-off-by: Changbin Du Reviewed-by: Jiri Olsa Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Eric Saint-Etienne Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Steven Rostedt (VMware) Fixes: 1e6285699b30 ("perf symbols: Fix slowness due to -ffunction-section") Link: http://lkml.kernel.org/r/20190316080556.3075-11-changbin.du@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/map.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index fbeb0c6efaa6..64bea5eb8bf6 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -917,6 +917,9 @@ static void __maps__remove(struct maps *maps, struct map *map) { rb_erase_init(&map->rb_node, &maps->entries); map__put(map); + + rb_erase_init(&map->rb_node_name, &maps->names); + map__put(map); } void maps__remove(struct maps *maps, struct map *map) From da3a53a7390a89391bd63bead0c2e9af4c5ef3d6 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Sat, 16 Mar 2019 16:05:51 +0800 Subject: [PATCH 677/855] perf maps: Purge all maps from the 'names' tree Add function __maps__purge_names() to purge all maps from the names tree. We need to cleanup the names tree in maps__exit(). Detected with gcc's ASan. Signed-off-by: Changbin Du Reviewed-by: Jiri Olsa Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Eric Saint-Etienne Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Steven Rostedt (VMware) Fixes: 1e6285699b30 ("perf symbols: Fix slowness due to -ffunction-section") Link: http://lkml.kernel.org/r/20190316080556.3075-12-changbin.du@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/map.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index 64bea5eb8bf6..e32628cd20a7 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -577,10 +577,25 @@ static void __maps__purge(struct maps *maps) } } +static void __maps__purge_names(struct maps *maps) +{ + struct rb_root *root = &maps->names; + struct rb_node *next = rb_first(root); + + while (next) { + struct map *pos = rb_entry(next, struct map, rb_node_name); + + next = rb_next(&pos->rb_node_name); + rb_erase_init(&pos->rb_node_name, root); + map__put(pos); + } +} + static void maps__exit(struct maps *maps) { down_write(&maps->lock); __maps__purge(maps); + __maps__purge_names(maps); up_write(&maps->lock); } From 1e5b0cf8672e622257df024074e6e09bfbcb7750 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Sat, 16 Mar 2019 16:05:52 +0800 Subject: [PATCH 678/855] perf top: Fix global-buffer-overflow issue The array str[] should have six elements. 
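The overflow is the classic enum/name-table drift: the flush-reason enum gained entries while the local str[] used for debug printing kept its old length, so indexing with the new values reads past the array (the diff below adds the missing "TOP " and "TIME " strings). A generic way to keep such tables honest, sketched with hypothetical names rather than the real ordered-events enum:

    /* hypothetical names, not the perf enum; illustrates the pattern only */
    enum flush_reason { FL_FINAL, FL_ROUND, FL_HALF, FL_TOP, FL_TIME, FL_MAX };

    static const char * const flush_name[] = {
            [FL_FINAL] = "FINAL",
            [FL_ROUND] = "ROUND",
            [FL_HALF]  = "HALF ",
            [FL_TOP]   = "TOP  ",
            [FL_TIME]  = "TIME ",
    };

    /* fails to compile if the enum grows again without a matching name */
    _Static_assert(sizeof(flush_name) / sizeof(flush_name[0]) == FL_MAX,
                   "flush_name out of sync with enum flush_reason");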
================================================================= ==4322==ERROR: AddressSanitizer: global-buffer-overflow on address 0x56463844e300 at pc 0x564637e7ad0d bp 0x7f30c8c89d10 sp 0x7f30c8c89d00 READ of size 8 at 0x56463844e300 thread T9 #0 0x564637e7ad0c in __ordered_events__flush util/ordered-events.c:316 #1 0x564637e7b0e4 in ordered_events__flush util/ordered-events.c:338 #2 0x564637c6a57d in process_thread /home/changbin/work/linux/tools/perf/builtin-top.c:1073 #3 0x7f30d173a163 in start_thread (/lib/x86_64-linux-gnu/libpthread.so.0+0x8163) #4 0x7f30cfffbdee in __clone (/lib/x86_64-linux-gnu/libc.so.6+0x11adee) 0x56463844e300 is located 32 bytes to the left of global variable 'flags' defined in 'util/trace-event-parse.c:229:26' (0x56463844e320) of size 192 0x56463844e300 is located 0 bytes to the right of global variable 'str' defined in 'util/ordered-events.c:268:28' (0x56463844e2e0) of size 32 SUMMARY: AddressSanitizer: global-buffer-overflow util/ordered-events.c:316 in __ordered_events__flush Shadow bytes around the buggy address: 0x0ac947081c10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 0x0ac947081c20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 0x0ac947081c30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 0x0ac947081c40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 0x0ac947081c50: 00 00 00 00 00 00 00 00 f9 f9 f9 f9 00 00 00 00 =>0x0ac947081c60:[f9]f9 f9 f9 00 00 00 00 00 00 00 00 00 00 00 00 0x0ac947081c70: 00 00 00 00 00 00 00 00 00 00 00 00 f9 f9 f9 f9 0x0ac947081c80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 0x0ac947081c90: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 0x0ac947081ca0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 0x0ac947081cb0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 Shadow byte legend (one shadow byte represents 8 application bytes): Addressable: 00 Partially addressable: 01 02 03 04 05 06 07 Heap left redzone: fa Freed heap region: fd Stack left redzone: f1 Stack mid redzone: f2 Stack right redzone: f3 Stack after return: f5 Stack use after scope: f8 Global redzone: f9 Global init order: f6 Poisoned by user: f7 Container overflow: fc Array cookie: ac Intra object redzone: bb ASan internal: fe Left alloca redzone: ca Right alloca redzone: cb Thread T9 created by T0 here: #0 0x7f30d179de5f in __interceptor_pthread_create (/usr/lib/x86_64-linux-gnu/libasan.so.5+0x4ae5f) #1 0x564637c6b954 in __cmd_top /home/changbin/work/linux/tools/perf/builtin-top.c:1253 #2 0x564637c7173c in cmd_top /home/changbin/work/linux/tools/perf/builtin-top.c:1642 #3 0x564637d85038 in run_builtin /home/changbin/work/linux/tools/perf/perf.c:302 #4 0x564637d85577 in handle_internal_command /home/changbin/work/linux/tools/perf/perf.c:354 #5 0x564637d8597b in run_argv /home/changbin/work/linux/tools/perf/perf.c:398 #6 0x564637d860e9 in main /home/changbin/work/linux/tools/perf/perf.c:520 #7 0x7f30cff0509a in __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x2409a) Signed-off-by: Changbin Du Reviewed-by: Jiri Olsa Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Steven Rostedt (VMware) Cc: Jiri Olsa Fixes: 16c66bc167cc ("perf top: Add processing thread") Fixes: 68ca5d07de20 ("perf ordered_events: Add ordered_events__flush_time interface") Link: http://lkml.kernel.org/r/20190316080556.3075-13-changbin.du@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/ordered-events.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/util/ordered-events.c b/tools/perf/util/ordered-events.c 
index ea523d3b248f..989fed6f43b5 100644 --- a/tools/perf/util/ordered-events.c +++ b/tools/perf/util/ordered-events.c @@ -270,6 +270,8 @@ static int __ordered_events__flush(struct ordered_events *oe, enum oe_flush how, "FINAL", "ROUND", "HALF ", + "TOP ", + "TIME ", }; int err; bool show_progress = false; From 42dfa451d825a2ad15793c476f73e7bbc0f9d312 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 18 Mar 2019 16:41:28 -0300 Subject: [PATCH 679/855] perf evsel: Free evsel->counts in perf_evsel__exit() Using gcc's ASan, Changbin reports: ================================================================= ==7494==ERROR: LeakSanitizer: detected memory leaks Direct leak of 48 byte(s) in 1 object(s) allocated from: #0 0x7f0333a89138 in calloc (/usr/lib/x86_64-linux-gnu/libasan.so.5+0xee138) #1 0x5625e5330a5e in zalloc util/util.h:23 #2 0x5625e5330a9b in perf_counts__new util/counts.c:10 #3 0x5625e5330ca0 in perf_evsel__alloc_counts util/counts.c:47 #4 0x5625e520d8e5 in __perf_evsel__read_on_cpu util/evsel.c:1505 #5 0x5625e517a985 in perf_evsel__read_on_cpu /home/work/linux/tools/perf/util/evsel.h:347 #6 0x5625e517ad1a in test__openat_syscall_event tests/openat-syscall.c:47 #7 0x5625e51528e6 in run_test tests/builtin-test.c:358 #8 0x5625e5152baf in test_and_print tests/builtin-test.c:388 #9 0x5625e51543fe in __cmd_test tests/builtin-test.c:583 #10 0x5625e515572f in cmd_test tests/builtin-test.c:722 #11 0x5625e51c3fb8 in run_builtin /home/changbin/work/linux/tools/perf/perf.c:302 #12 0x5625e51c44f7 in handle_internal_command /home/changbin/work/linux/tools/perf/perf.c:354 #13 0x5625e51c48fb in run_argv /home/changbin/work/linux/tools/perf/perf.c:398 #14 0x5625e51c5069 in main /home/changbin/work/linux/tools/perf/perf.c:520 #15 0x7f033214d09a in __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x2409a) Indirect leak of 72 byte(s) in 1 object(s) allocated from: #0 0x7f0333a89138 in calloc (/usr/lib/x86_64-linux-gnu/libasan.so.5+0xee138) #1 0x5625e532560d in zalloc util/util.h:23 #2 0x5625e532566b in xyarray__new util/xyarray.c:10 #3 0x5625e5330aba in perf_counts__new util/counts.c:15 #4 0x5625e5330ca0 in perf_evsel__alloc_counts util/counts.c:47 #5 0x5625e520d8e5 in __perf_evsel__read_on_cpu util/evsel.c:1505 #6 0x5625e517a985 in perf_evsel__read_on_cpu /home/work/linux/tools/perf/util/evsel.h:347 #7 0x5625e517ad1a in test__openat_syscall_event tests/openat-syscall.c:47 #8 0x5625e51528e6 in run_test tests/builtin-test.c:358 #9 0x5625e5152baf in test_and_print tests/builtin-test.c:388 #10 0x5625e51543fe in __cmd_test tests/builtin-test.c:583 #11 0x5625e515572f in cmd_test tests/builtin-test.c:722 #12 0x5625e51c3fb8 in run_builtin /home/changbin/work/linux/tools/perf/perf.c:302 #13 0x5625e51c44f7 in handle_internal_command /home/changbin/work/linux/tools/perf/perf.c:354 #14 0x5625e51c48fb in run_argv /home/changbin/work/linux/tools/perf/perf.c:398 #15 0x5625e51c5069 in main /home/changbin/work/linux/tools/perf/perf.c:520 #16 0x7f033214d09a in __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x2409a) His patch took care of evsel->prev_raw_counts, but the above backtraces are about evsel->counts, so fix that instead. 
Reported-by: Changbin Du Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Steven Rostedt (VMware) Link: https://lkml.kernel.org/n/tip-hd1x13g59f0nuhe4anxhsmfp@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 16f02b83b23f..1a2023da5d9c 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1292,6 +1292,7 @@ void perf_evsel__exit(struct perf_evsel *evsel) { assert(list_empty(&evsel->node)); assert(evsel->evlist == NULL); + perf_evsel__free_counts(evsel); perf_evsel__free_fd(evsel); perf_evsel__free_id(evsel); perf_evsel__free_config_terms(evsel); From 93faa52e8371f0291ee1ff4994edae2b336b6233 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Sat, 16 Mar 2019 16:05:54 +0800 Subject: [PATCH 680/855] perf tests: Fix a memory leak of cpu_map object in the openat_syscall_event_on_all_cpus test ================================================================= ==7497==ERROR: LeakSanitizer: detected memory leaks Direct leak of 40 byte(s) in 1 object(s) allocated from: #0 0x7f0333a88f30 in __interceptor_malloc (/usr/lib/x86_64-linux-gnu/libasan.so.5+0xedf30) #1 0x5625e5326213 in cpu_map__trim_new util/cpumap.c:45 #2 0x5625e5326703 in cpu_map__read util/cpumap.c:103 #3 0x5625e53267ef in cpu_map__read_all_cpu_map util/cpumap.c:120 #4 0x5625e5326915 in cpu_map__new util/cpumap.c:135 #5 0x5625e517b355 in test__openat_syscall_event_on_all_cpus tests/openat-syscall-all-cpus.c:36 #6 0x5625e51528e6 in run_test tests/builtin-test.c:358 #7 0x5625e5152baf in test_and_print tests/builtin-test.c:388 #8 0x5625e51543fe in __cmd_test tests/builtin-test.c:583 #9 0x5625e515572f in cmd_test tests/builtin-test.c:722 #10 0x5625e51c3fb8 in run_builtin /home/changbin/work/linux/tools/perf/perf.c:302 #11 0x5625e51c44f7 in handle_internal_command /home/changbin/work/linux/tools/perf/perf.c:354 #12 0x5625e51c48fb in run_argv /home/changbin/work/linux/tools/perf/perf.c:398 #13 0x5625e51c5069 in main /home/changbin/work/linux/tools/perf/perf.c:520 #14 0x7f033214d09a in __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x2409a) Signed-off-by: Changbin Du Reviewed-by: Jiri Olsa Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Steven Rostedt (VMware) Fixes: f30a79b012e5 ("perf tools: Add reference counting for cpu_map object") Link: http://lkml.kernel.org/r/20190316080556.3075-15-changbin.du@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/openat-syscall-all-cpus.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/perf/tests/openat-syscall-all-cpus.c b/tools/perf/tests/openat-syscall-all-cpus.c index c531e6deb104..493ecb611540 100644 --- a/tools/perf/tests/openat-syscall-all-cpus.c +++ b/tools/perf/tests/openat-syscall-all-cpus.c @@ -45,7 +45,7 @@ int test__openat_syscall_event_on_all_cpus(struct test *test __maybe_unused, int if (IS_ERR(evsel)) { tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "syscalls", "sys_enter_openat"); pr_debug("%s\n", errbuf); - goto out_thread_map_delete; + goto out_cpu_map_delete; } if (perf_evsel__open(evsel, cpus, threads) < 0) { @@ -119,6 +119,8 @@ out_close_fd: perf_evsel__close_fd(evsel); out_evsel_delete: perf_evsel__delete(evsel); +out_cpu_map_delete: + cpu_map__put(cpus); out_thread_map_delete: thread_map__put(threads); return err; From f97a8991d3b998e518f56794d879f645964de649 Mon Sep 17 00:00:00 2001 
From: Changbin Du Date: Sat, 16 Mar 2019 16:05:55 +0800 Subject: [PATCH 681/855] perf tests: Fix memory leak by expr__find_other() in test__expr() ================================================================= ==7506==ERROR: LeakSanitizer: detected memory leaks Direct leak of 13 byte(s) in 3 object(s) allocated from: #0 0x7f03339d6070 in __interceptor_strdup (/usr/lib/x86_64-linux-gnu/libasan.so.5+0x3b070) #1 0x5625e53aaef0 in expr__find_other util/expr.y:221 #2 0x5625e51bcd3f in test__expr tests/expr.c:52 #3 0x5625e51528e6 in run_test tests/builtin-test.c:358 #4 0x5625e5152baf in test_and_print tests/builtin-test.c:388 #5 0x5625e51543fe in __cmd_test tests/builtin-test.c:583 #6 0x5625e515572f in cmd_test tests/builtin-test.c:722 #7 0x5625e51c3fb8 in run_builtin /home/changbin/work/linux/tools/perf/perf.c:302 #8 0x5625e51c44f7 in handle_internal_command /home/changbin/work/linux/tools/perf/perf.c:354 #9 0x5625e51c48fb in run_argv /home/changbin/work/linux/tools/perf/perf.c:398 #10 0x5625e51c5069 in main /home/changbin/work/linux/tools/perf/perf.c:520 #11 0x7f033214d09a in __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x2409a) Signed-off-by: Changbin Du Cc: Alexei Starovoitov Cc: Andi Kleen Cc: Daniel Borkmann Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Steven Rostedt (VMware) Fixes: 075167363f8b ("perf tools: Add a simple expression parser for JSON") Link: http://lkml.kernel.org/r/20190316080556.3075-16-changbin.du@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/expr.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/perf/tests/expr.c b/tools/perf/tests/expr.c index 01f0706995a9..9acc1e80b936 100644 --- a/tools/perf/tests/expr.c +++ b/tools/perf/tests/expr.c @@ -19,7 +19,7 @@ int test__expr(struct test *t __maybe_unused, int subtest __maybe_unused) const char *p; const char **other; double val; - int ret; + int i, ret; struct parse_ctx ctx; int num_other; @@ -56,6 +56,9 @@ int test__expr(struct test *t __maybe_unused, int subtest __maybe_unused) TEST_ASSERT_VAL("find other", !strcmp(other[1], "BAZ")); TEST_ASSERT_VAL("find other", !strcmp(other[2], "BOZO")); TEST_ASSERT_VAL("find other", other[3] == NULL); + + for (i = 0; i < num_other; i++) + free((void *)other[i]); free((void *)other); return 0; From d982b33133284fa7efa0e52ae06b88f9be3ea764 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Sat, 16 Mar 2019 16:05:56 +0800 Subject: [PATCH 682/855] perf tests: Fix a memory leak in test__perf_evsel__tp_sched_test() ================================================================= ==20875==ERROR: LeakSanitizer: detected memory leaks Direct leak of 1160 byte(s) in 1 object(s) allocated from: #0 0x7f1b6fc84138 in calloc (/usr/lib/x86_64-linux-gnu/libasan.so.5+0xee138) #1 0x55bd50005599 in zalloc util/util.h:23 #2 0x55bd500068f5 in perf_evsel__newtp_idx util/evsel.c:327 #3 0x55bd4ff810fc in perf_evsel__newtp /home/work/linux/tools/perf/util/evsel.h:216 #4 0x55bd4ff81608 in test__perf_evsel__tp_sched_test tests/evsel-tp-sched.c:69 #5 0x55bd4ff528e6 in run_test tests/builtin-test.c:358 #6 0x55bd4ff52baf in test_and_print tests/builtin-test.c:388 #7 0x55bd4ff543fe in __cmd_test tests/builtin-test.c:583 #8 0x55bd4ff5572f in cmd_test tests/builtin-test.c:722 #9 0x55bd4ffc4087 in run_builtin /home/changbin/work/linux/tools/perf/perf.c:302 #10 0x55bd4ffc45c6 in handle_internal_command /home/changbin/work/linux/tools/perf/perf.c:354 #11 0x55bd4ffc49ca in run_argv /home/changbin/work/linux/tools/perf/perf.c:398 #12 0x55bd4ffc5138 in main 
/home/changbin/work/linux/tools/perf/perf.c:520 #13 0x7f1b6e34809a in __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x2409a) Indirect leak of 19 byte(s) in 1 object(s) allocated from: #0 0x7f1b6fc83f30 in __interceptor_malloc (/usr/lib/x86_64-linux-gnu/libasan.so.5+0xedf30) #1 0x7f1b6e3ac30f in vasprintf (/lib/x86_64-linux-gnu/libc.so.6+0x8830f) Signed-off-by: Changbin Du Reviewed-by: Jiri Olsa Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Steven Rostedt (VMware) Fixes: 6a6cd11d4e57 ("perf test: Add test for the sched tracepoint format fields") Link: http://lkml.kernel.org/r/20190316080556.3075-17-changbin.du@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/evsel-tp-sched.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/tests/evsel-tp-sched.c b/tools/perf/tests/evsel-tp-sched.c index ea7acf403727..71f60c0f9faa 100644 --- a/tools/perf/tests/evsel-tp-sched.c +++ b/tools/perf/tests/evsel-tp-sched.c @@ -85,5 +85,6 @@ int test__perf_evsel__tp_sched_test(struct test *test __maybe_unused, int subtes if (perf_evsel__test_field(evsel, "target_cpu", 4, true)) ret = -1; + perf_evsel__delete(evsel); return ret; } From 71184c6ab7e60fd59d8dbc8fed62a1c753dc4934 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 11 Mar 2019 22:30:37 -0700 Subject: [PATCH 683/855] perf record: Replace option --bpf-event with --no-bpf-event Currently, monitoring of BPF programs through bpf_event is off by default for 'perf record'. To turn it on, the user need to use option "--bpf-event". As BPF gets wider adoption in different subsystems, this option becomes inconvenient. This patch makes bpf_event on by default, and adds option "--no-bpf-event" to turn it off. Since option --bpf-event is not released yet, it is safe to remove it. 
Signed-off-by: Song Liu Reviewed-by: Jiri Olsa Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: kernel-team@fb.com Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stanislav Fomichev Link: http://lkml.kernel.org/r/20190312053051.2690567-2-songliubraving@fb.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 2 +- tools/perf/perf.h | 2 +- tools/perf/util/bpf-event.c | 2 +- tools/perf/util/evsel.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index e7144a1c1c82..f29874192d3e 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1891,7 +1891,7 @@ static struct option __record_options[] = { OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize, "synthesize non-sample events at the end of output"), OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"), - OPT_BOOLEAN(0, "bpf-event", &record.opts.bpf_event, "record bpf events"), + OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "record bpf events"), OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq, "Fail if the specified frequency can't be used"), OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'", diff --git a/tools/perf/perf.h b/tools/perf/perf.h index b120e547ddc7..c59743def8d3 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -66,7 +66,7 @@ struct record_opts { bool ignore_missing_thread; bool strict_freq; bool sample_id; - bool bpf_event; + bool no_bpf_event; unsigned int freq; unsigned int mmap_pages; unsigned int auxtrace_mmap_pages; diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c index 028c8ec1f62a..ea012b735a37 100644 --- a/tools/perf/util/bpf-event.c +++ b/tools/perf/util/bpf-event.c @@ -187,7 +187,7 @@ static int perf_event__synthesize_one_bpf_prog(struct perf_tool *tool, } /* Synthesize PERF_RECORD_BPF_EVENT */ - if (opts->bpf_event) { + if (!opts->no_bpf_event) { *bpf_event = (struct bpf_event){ .header = { .type = PERF_RECORD_BPF_EVENT, diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 1a2023da5d9c..7835e05f0c0a 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1036,7 +1036,7 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts, attr->mmap2 = track && !perf_missing_features.mmap2; attr->comm = track; attr->ksymbol = track && !perf_missing_features.ksymbol; - attr->bpf_event = track && opts->bpf_event && + attr->bpf_event = track && !opts->no_bpf_event && !perf_missing_features.bpf_event; if (opts->record_namespaces) From 34be16466d4dc06f3d604dafbcdb3327b72e78da Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 11 Mar 2019 22:30:38 -0700 Subject: [PATCH 684/855] tools lib bpf: Introduce bpf_program__get_prog_info_linear() Currently, bpf_prog_info includes 9 arrays. The user has the option to fetch any combination of these arrays. However, this requires a lot of handling. This work becomes more tricky when we need to store bpf_prog_info to a file, because these arrays are allocated independently. This patch introduces 'struct bpf_prog_info_linear', which stores arrays of bpf_prog_info in continuous memory. Helper functions are introduced to unify the work to get different sets of bpf_prog_info. Specifically, bpf_program__get_prog_info_linear() allows the user to select which arrays to fetch, and handles details for the user. Please see the comments right before 'enum bpf_prog_info_array' for more details and examples. 
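For readers wiring this up, a hedged usage sketch (the dump_one_prog() wrapper and prog_fd are hypothetical; the libbpf names come from this patch): the caller ORs 1UL-shifted BPF_PROG_INFO_* indices into the arrays mask, and every selected array is materialized inside the single info_linear allocation, so one free() releases it all:

    /* hedged sketch, not from the patch; assumes the tools/lib err.h and libbpf headers */
    #include <stdio.h>
    #include <stdlib.h>
    #include <linux/err.h>
    #include <bpf/libbpf.h>

    static int dump_one_prog(int prog_fd)           /* hypothetical caller */
    {
            __u64 arrays = (1UL << BPF_PROG_INFO_JITED_KSYMS) |
                           (1UL << BPF_PROG_INFO_JITED_FUNC_LENS);
            struct bpf_prog_info_linear *info_linear;

            info_linear = bpf_program__get_prog_info_linear(prog_fd, arrays);
            if (IS_ERR_OR_NULL(info_linear))
                    return -1;

            /* the selected arrays live inside info_linear->data, so the whole
             * record can be written out or freed as one block */
            printf("%u jited ksyms\n", info_linear->info.nr_jited_ksyms);
            free(info_linear);
            return 0;
    }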
Signed-off-by: Song Liu Reviewed-by: Jiri Olsa Acked-by: Daniel Borkmann Link: https://lkml.kernel.org/r/ce92c091-e80d-a0c1-4aa0-987706c42b20@iogearbox.net Tested-by: Arnaldo Carvalho de Melo Cc: Alexei Starovoitov Cc: kernel-team@fb.com Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stanislav Fomichev Link: http://lkml.kernel.org/r/20190312053051.2690567-3-songliubraving@fb.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/bpf/libbpf.c | 251 +++++++++++++++++++++++++++++++++++++++ tools/lib/bpf/libbpf.h | 63 ++++++++++ tools/lib/bpf/libbpf.map | 3 + 3 files changed, 317 insertions(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 4884557aa17f..8fb6e89b4b2c 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -112,6 +112,11 @@ void libbpf_print(enum libbpf_print_level level, const char *format, ...) # define LIBBPF_ELF_C_READ_MMAP ELF_C_READ #endif +static inline __u64 ptr_to_u64(const void *ptr) +{ + return (__u64) (unsigned long) ptr; +} + struct bpf_capabilities { /* v4.14: kernel support for program & map names. */ __u32 name:1; @@ -2997,3 +3002,249 @@ bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, ring_buffer_write_tail(header, data_tail); return ret; } + +struct bpf_prog_info_array_desc { + int array_offset; /* e.g. offset of jited_prog_insns */ + int count_offset; /* e.g. offset of jited_prog_len */ + int size_offset; /* > 0: offset of rec size, + * < 0: fix size of -size_offset + */ +}; + +static struct bpf_prog_info_array_desc bpf_prog_info_array_desc[] = { + [BPF_PROG_INFO_JITED_INSNS] = { + offsetof(struct bpf_prog_info, jited_prog_insns), + offsetof(struct bpf_prog_info, jited_prog_len), + -1, + }, + [BPF_PROG_INFO_XLATED_INSNS] = { + offsetof(struct bpf_prog_info, xlated_prog_insns), + offsetof(struct bpf_prog_info, xlated_prog_len), + -1, + }, + [BPF_PROG_INFO_MAP_IDS] = { + offsetof(struct bpf_prog_info, map_ids), + offsetof(struct bpf_prog_info, nr_map_ids), + -(int)sizeof(__u32), + }, + [BPF_PROG_INFO_JITED_KSYMS] = { + offsetof(struct bpf_prog_info, jited_ksyms), + offsetof(struct bpf_prog_info, nr_jited_ksyms), + -(int)sizeof(__u64), + }, + [BPF_PROG_INFO_JITED_FUNC_LENS] = { + offsetof(struct bpf_prog_info, jited_func_lens), + offsetof(struct bpf_prog_info, nr_jited_func_lens), + -(int)sizeof(__u32), + }, + [BPF_PROG_INFO_FUNC_INFO] = { + offsetof(struct bpf_prog_info, func_info), + offsetof(struct bpf_prog_info, nr_func_info), + offsetof(struct bpf_prog_info, func_info_rec_size), + }, + [BPF_PROG_INFO_LINE_INFO] = { + offsetof(struct bpf_prog_info, line_info), + offsetof(struct bpf_prog_info, nr_line_info), + offsetof(struct bpf_prog_info, line_info_rec_size), + }, + [BPF_PROG_INFO_JITED_LINE_INFO] = { + offsetof(struct bpf_prog_info, jited_line_info), + offsetof(struct bpf_prog_info, nr_jited_line_info), + offsetof(struct bpf_prog_info, jited_line_info_rec_size), + }, + [BPF_PROG_INFO_PROG_TAGS] = { + offsetof(struct bpf_prog_info, prog_tags), + offsetof(struct bpf_prog_info, nr_prog_tags), + -(int)sizeof(__u8) * BPF_TAG_SIZE, + }, + +}; + +static __u32 bpf_prog_info_read_offset_u32(struct bpf_prog_info *info, int offset) +{ + __u32 *array = (__u32 *)info; + + if (offset >= 0) + return array[offset / sizeof(__u32)]; + return -(int)offset; +} + +static __u64 bpf_prog_info_read_offset_u64(struct bpf_prog_info *info, int offset) +{ + __u64 *array = (__u64 *)info; + + if (offset >= 0) + return array[offset / sizeof(__u64)]; + return -(int)offset; +} + +static void bpf_prog_info_set_offset_u32(struct 
bpf_prog_info *info, int offset, + __u32 val) +{ + __u32 *array = (__u32 *)info; + + if (offset >= 0) + array[offset / sizeof(__u32)] = val; +} + +static void bpf_prog_info_set_offset_u64(struct bpf_prog_info *info, int offset, + __u64 val) +{ + __u64 *array = (__u64 *)info; + + if (offset >= 0) + array[offset / sizeof(__u64)] = val; +} + +struct bpf_prog_info_linear * +bpf_program__get_prog_info_linear(int fd, __u64 arrays) +{ + struct bpf_prog_info_linear *info_linear; + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); + __u32 data_len = 0; + int i, err; + void *ptr; + + if (arrays >> BPF_PROG_INFO_LAST_ARRAY) + return ERR_PTR(-EINVAL); + + /* step 1: get array dimensions */ + err = bpf_obj_get_info_by_fd(fd, &info, &info_len); + if (err) { + pr_debug("can't get prog info: %s", strerror(errno)); + return ERR_PTR(-EFAULT); + } + + /* step 2: calculate total size of all arrays */ + for (i = BPF_PROG_INFO_FIRST_ARRAY; i < BPF_PROG_INFO_LAST_ARRAY; ++i) { + bool include_array = (arrays & (1UL << i)) > 0; + struct bpf_prog_info_array_desc *desc; + __u32 count, size; + + desc = bpf_prog_info_array_desc + i; + + /* kernel is too old to support this field */ + if (info_len < desc->array_offset + sizeof(__u32) || + info_len < desc->count_offset + sizeof(__u32) || + (desc->size_offset > 0 && info_len < desc->size_offset)) + include_array = false; + + if (!include_array) { + arrays &= ~(1UL << i); /* clear the bit */ + continue; + } + + count = bpf_prog_info_read_offset_u32(&info, desc->count_offset); + size = bpf_prog_info_read_offset_u32(&info, desc->size_offset); + + data_len += count * size; + } + + /* step 3: allocate continuous memory */ + data_len = roundup(data_len, sizeof(__u64)); + info_linear = malloc(sizeof(struct bpf_prog_info_linear) + data_len); + if (!info_linear) + return ERR_PTR(-ENOMEM); + + /* step 4: fill data to info_linear->info */ + info_linear->arrays = arrays; + memset(&info_linear->info, 0, sizeof(info)); + ptr = info_linear->data; + + for (i = BPF_PROG_INFO_FIRST_ARRAY; i < BPF_PROG_INFO_LAST_ARRAY; ++i) { + struct bpf_prog_info_array_desc *desc; + __u32 count, size; + + if ((arrays & (1UL << i)) == 0) + continue; + + desc = bpf_prog_info_array_desc + i; + count = bpf_prog_info_read_offset_u32(&info, desc->count_offset); + size = bpf_prog_info_read_offset_u32(&info, desc->size_offset); + bpf_prog_info_set_offset_u32(&info_linear->info, + desc->count_offset, count); + bpf_prog_info_set_offset_u32(&info_linear->info, + desc->size_offset, size); + bpf_prog_info_set_offset_u64(&info_linear->info, + desc->array_offset, + ptr_to_u64(ptr)); + ptr += count * size; + } + + /* step 5: call syscall again to get required arrays */ + err = bpf_obj_get_info_by_fd(fd, &info_linear->info, &info_len); + if (err) { + pr_debug("can't get prog info: %s", strerror(errno)); + free(info_linear); + return ERR_PTR(-EFAULT); + } + + /* step 6: verify the data */ + for (i = BPF_PROG_INFO_FIRST_ARRAY; i < BPF_PROG_INFO_LAST_ARRAY; ++i) { + struct bpf_prog_info_array_desc *desc; + __u32 v1, v2; + + if ((arrays & (1UL << i)) == 0) + continue; + + desc = bpf_prog_info_array_desc + i; + v1 = bpf_prog_info_read_offset_u32(&info, desc->count_offset); + v2 = bpf_prog_info_read_offset_u32(&info_linear->info, + desc->count_offset); + if (v1 != v2) + pr_warning("%s: mismatch in element count\n", __func__); + + v1 = bpf_prog_info_read_offset_u32(&info, desc->size_offset); + v2 = bpf_prog_info_read_offset_u32(&info_linear->info, + desc->size_offset); + if (v1 != v2) + pr_warning("%s: mismatch 
in rec size\n", __func__); + } + + /* step 7: update info_len and data_len */ + info_linear->info_len = sizeof(struct bpf_prog_info); + info_linear->data_len = data_len; + + return info_linear; +} + +void bpf_program__bpil_addr_to_offs(struct bpf_prog_info_linear *info_linear) +{ + int i; + + for (i = BPF_PROG_INFO_FIRST_ARRAY; i < BPF_PROG_INFO_LAST_ARRAY; ++i) { + struct bpf_prog_info_array_desc *desc; + __u64 addr, offs; + + if ((info_linear->arrays & (1UL << i)) == 0) + continue; + + desc = bpf_prog_info_array_desc + i; + addr = bpf_prog_info_read_offset_u64(&info_linear->info, + desc->array_offset); + offs = addr - ptr_to_u64(info_linear->data); + bpf_prog_info_set_offset_u64(&info_linear->info, + desc->array_offset, offs); + } +} + +void bpf_program__bpil_offs_to_addr(struct bpf_prog_info_linear *info_linear) +{ + int i; + + for (i = BPF_PROG_INFO_FIRST_ARRAY; i < BPF_PROG_INFO_LAST_ARRAY; ++i) { + struct bpf_prog_info_array_desc *desc; + __u64 addr, offs; + + if ((info_linear->arrays & (1UL << i)) == 0) + continue; + + desc = bpf_prog_info_array_desc + i; + offs = bpf_prog_info_read_offset_u64(&info_linear->info, + desc->array_offset); + addr = offs + ptr_to_u64(info_linear->data); + bpf_prog_info_set_offset_u64(&info_linear->info, + desc->array_offset, addr); + } +} diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index aa1521a51687..c70785cc8ef5 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -378,6 +378,69 @@ LIBBPF_API bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex); LIBBPF_API bool bpf_probe_helper(enum bpf_func_id id, enum bpf_prog_type prog_type, __u32 ifindex); +/* + * Get bpf_prog_info in continuous memory + * + * struct bpf_prog_info has multiple arrays. The user has option to choose + * arrays to fetch from kernel. The following APIs provide an uniform way to + * fetch these data. All arrays in bpf_prog_info are stored in a single + * continuous memory region. This makes it easy to store the info in a + * file. + * + * Before writing bpf_prog_info_linear to files, it is necessary to + * translate pointers in bpf_prog_info to offsets. Helper functions + * bpf_program__bpil_addr_to_offs() and bpf_program__bpil_offs_to_addr() + * are introduced to switch between pointers and offsets. 
+ * + * Examples: + * # To fetch map_ids and prog_tags: + * __u64 arrays = (1UL << BPF_PROG_INFO_MAP_IDS) | + * (1UL << BPF_PROG_INFO_PROG_TAGS); + * struct bpf_prog_info_linear *info_linear = + * bpf_program__get_prog_info_linear(fd, arrays); + * + * # To save data in file + * bpf_program__bpil_addr_to_offs(info_linear); + * write(f, info_linear, sizeof(*info_linear) + info_linear->data_len); + * + * # To read data from file + * read(f, info_linear, ); + * bpf_program__bpil_offs_to_addr(info_linear); + */ +enum bpf_prog_info_array { + BPF_PROG_INFO_FIRST_ARRAY = 0, + BPF_PROG_INFO_JITED_INSNS = 0, + BPF_PROG_INFO_XLATED_INSNS, + BPF_PROG_INFO_MAP_IDS, + BPF_PROG_INFO_JITED_KSYMS, + BPF_PROG_INFO_JITED_FUNC_LENS, + BPF_PROG_INFO_FUNC_INFO, + BPF_PROG_INFO_LINE_INFO, + BPF_PROG_INFO_JITED_LINE_INFO, + BPF_PROG_INFO_PROG_TAGS, + BPF_PROG_INFO_LAST_ARRAY, +}; + +struct bpf_prog_info_linear { + /* size of struct bpf_prog_info, when the tool is compiled */ + __u32 info_len; + /* total bytes allocated for data, round up to 8 bytes */ + __u32 data_len; + /* which arrays are included in data */ + __u64 arrays; + struct bpf_prog_info info; + __u8 data[]; +}; + +LIBBPF_API struct bpf_prog_info_linear * +bpf_program__get_prog_info_linear(int fd, __u64 arrays); + +LIBBPF_API void +bpf_program__bpil_addr_to_offs(struct bpf_prog_info_linear *info_linear); + +LIBBPF_API void +bpf_program__bpil_offs_to_addr(struct bpf_prog_info_linear *info_linear); + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 778a26702a70..f3ce50500cf2 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -153,4 +153,7 @@ LIBBPF_0.0.2 { xsk_socket__delete; xsk_umem__fd; xsk_socket__fd; + bpf_program__get_prog_info_linear; + bpf_program__bpil_addr_to_offs; + bpf_program__bpil_offs_to_addr; } LIBBPF_0.0.1; From cae73f2339231d61022769f09c94e4500e8ad47a Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 11 Mar 2019 22:30:39 -0700 Subject: [PATCH 685/855] bpftool: use bpf_program__get_prog_info_linear() in prog.c:do_dump() This patches uses bpf_program__get_prog_info_linear() to simplify the logic in prog.c do_dump(). 
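The net effect is that one call plus one free() replaces the per-array malloc/validate/free dance. A sketch of the resulting shape (not the literal bpftool code; fd, mode, arrays, info_linear and info are assumed to be declared as in do_dump()):

  arrays = 1UL << (mode == DUMP_JITED ? BPF_PROG_INFO_JITED_INSNS
                                      : BPF_PROG_INFO_XLATED_INSNS);
  arrays |= 1UL << BPF_PROG_INFO_JITED_KSYMS;
  /* ... OR in the remaining arrays the dumper needs ... */

  info_linear = bpf_program__get_prog_info_linear(fd, arrays);
  if (IS_ERR_OR_NULL(info_linear)) {
          p_err("can't get prog info: %s", strerror(errno));
          return -1;
  }
  info = &info_linear->info;

  /* read info->jited_prog_insns / info->xlated_prog_insns, ksyms, etc. */

  free(info_linear);      /* single allocation, single free */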
Committer testing: Before: # bpftool prog dump xlated id 208 > /tmp/dump.xlated.before # bpftool prog dump jited id 208 > /tmp/dump.jited.before # bpftool map dump id 107 > /tmp/map.dump.before After: # ~acme/git/perf/tools/bpf/bpftool/bpftool map dump id 107 > /tmp/map.dump.after # ~acme/git/perf/tools/bpf/bpftool/bpftool prog dump xlated id 208 > /tmp/dump.xlated.after # ~acme/git/perf/tools/bpf/bpftool/bpftool prog dump jited id 208 > /tmp/dump.jited.after # diff -u /tmp/dump.xlated.before /tmp/dump.xlated.after # diff -u /tmp/dump.jited.before /tmp/dump.jited.after # diff -u /tmp/map.dump.before /tmp/map.dump.after # ~acme/git/perf/tools/bpf/bpftool/bpftool prog dump xlated id 208 0: (bf) r6 = r1 1: (85) call bpf_get_current_pid_tgid#80800 2: (63) *(u32 *)(r10 -328) = r0 3: (bf) r2 = r10 4: (07) r2 += -328 5: (18) r1 = map[id:107] 7: (85) call __htab_map_lookup_elem#85680 8: (15) if r0 == 0x0 goto pc+1 9: (07) r0 += 56 10: (b7) r7 = 0 11: (55) if r0 != 0x0 goto pc+52 12: (bf) r1 = r10 13: (07) r1 += -328 14: (b7) r2 = 64 15: (bf) r3 = r6 16: (85) call bpf_probe_read#-46848 17: (bf) r2 = r10 18: (07) r2 += -320 19: (18) r1 = map[id:106] 21: (07) r1 += 208 22: (61) r0 = *(u32 *)(r2 +0) 23: (35) if r0 >= 0x200 goto pc+3 24: (67) r0 <<= 3 25: (0f) r0 += r1 26: (05) goto pc+1 27: (b7) r0 = 0 28: (15) if r0 == 0x0 goto pc+35 29: (71) r1 = *(u8 *)(r0 +0) 30: (15) if r1 == 0x0 goto pc+33 31: (b7) r5 = 64 32: (79) r1 = *(u64 *)(r10 -320) 33: (15) if r1 == 0x2 goto pc+2 34: (15) if r1 == 0x101 goto pc+3 35: (55) if r1 != 0x15 goto pc+19 36: (79) r3 = *(u64 *)(r6 +16) 37: (05) goto pc+1 38: (79) r3 = *(u64 *)(r6 +24) 39: (15) if r3 == 0x0 goto pc+15 40: (b7) r1 = 0 41: (63) *(u32 *)(r10 -260) = r1 42: (bf) r1 = r10 43: (07) r1 += -256 44: (b7) r2 = 256 45: (85) call bpf_probe_read_str#-46704 46: (b7) r5 = 328 47: (63) *(u32 *)(r10 -264) = r0 48: (bf) r1 = r0 49: (67) r1 <<= 32 50: (77) r1 >>= 32 51: (25) if r1 > 0xff goto pc+3 52: (07) r0 += 72 53: (57) r0 &= 255 54: (bf) r5 = r0 55: (bf) r4 = r10 56: (07) r4 += -328 57: (bf) r1 = r6 58: (18) r2 = map[id:105] 60: (18) r3 = 0xffffffff 62: (85) call bpf_perf_event_output_tp#-45104 63: (bf) r7 = r0 64: (bf) r0 = r7 65: (95) exit # Signed-off-by: Song Liu Reviewed-by: Jiri Olsa Acked-by: Daniel Borkmann Tested-by: Arnaldo Carvalho de Melo Cc: Alexei Starovoitov Cc: kernel-team@fb.com Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stanislav Fomichev Link: http://lkml.kernel.org/r/20190312053051.2690567-4-songliubraving@fb.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/bpf/bpftool/prog.c | 272 +++++++++------------------------------ 1 file changed, 62 insertions(+), 210 deletions(-) diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 8ef80d65a474..d2be5a06c339 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -401,41 +401,31 @@ static int do_show(int argc, char **argv) static int do_dump(int argc, char **argv) { - unsigned int finfo_rec_size, linfo_rec_size, jited_linfo_rec_size; - void *func_info = NULL, *linfo = NULL, *jited_linfo = NULL; - unsigned int nr_finfo, nr_linfo = 0, nr_jited_linfo = 0; + struct bpf_prog_info_linear *info_linear; struct bpf_prog_linfo *prog_linfo = NULL; - unsigned long *func_ksyms = NULL; - struct bpf_prog_info info = {}; - unsigned int *func_lens = NULL; + enum {DUMP_JITED, DUMP_XLATED} mode; const char *disasm_opt = NULL; - unsigned int nr_func_ksyms; - unsigned int nr_func_lens; + struct bpf_prog_info *info; struct dump_data dd = {}; - __u32 len = sizeof(info); + void *func_info 
= NULL; struct btf *btf = NULL; - unsigned int buf_size; char *filepath = NULL; bool opcodes = false; bool visual = false; char func_sig[1024]; unsigned char *buf; bool linum = false; - __u32 *member_len; - __u64 *member_ptr; + __u32 member_len; + __u64 arrays; ssize_t n; - int err; int fd; if (is_prefix(*argv, "jited")) { if (disasm_init()) return -1; - - member_len = &info.jited_prog_len; - member_ptr = &info.jited_prog_insns; + mode = DUMP_JITED; } else if (is_prefix(*argv, "xlated")) { - member_len = &info.xlated_prog_len; - member_ptr = &info.xlated_prog_insns; + mode = DUMP_XLATED; } else { p_err("expected 'xlated' or 'jited', got: %s", *argv); return -1; @@ -474,175 +464,50 @@ static int do_dump(int argc, char **argv) return -1; } - err = bpf_obj_get_info_by_fd(fd, &info, &len); - if (err) { - p_err("can't get prog info: %s", strerror(errno)); - return -1; - } + if (mode == DUMP_JITED) + arrays = 1UL << BPF_PROG_INFO_JITED_INSNS; + else + arrays = 1UL << BPF_PROG_INFO_XLATED_INSNS; - if (!*member_len) { - p_info("no instructions returned"); - close(fd); - return 0; - } + arrays |= 1UL << BPF_PROG_INFO_JITED_KSYMS; + arrays |= 1UL << BPF_PROG_INFO_JITED_FUNC_LENS; + arrays |= 1UL << BPF_PROG_INFO_FUNC_INFO; + arrays |= 1UL << BPF_PROG_INFO_LINE_INFO; + arrays |= 1UL << BPF_PROG_INFO_JITED_LINE_INFO; - buf_size = *member_len; - - buf = malloc(buf_size); - if (!buf) { - p_err("mem alloc failed"); - close(fd); - return -1; - } - - nr_func_ksyms = info.nr_jited_ksyms; - if (nr_func_ksyms) { - func_ksyms = malloc(nr_func_ksyms * sizeof(__u64)); - if (!func_ksyms) { - p_err("mem alloc failed"); - close(fd); - goto err_free; - } - } - - nr_func_lens = info.nr_jited_func_lens; - if (nr_func_lens) { - func_lens = malloc(nr_func_lens * sizeof(__u32)); - if (!func_lens) { - p_err("mem alloc failed"); - close(fd); - goto err_free; - } - } - - nr_finfo = info.nr_func_info; - finfo_rec_size = info.func_info_rec_size; - if (nr_finfo && finfo_rec_size) { - func_info = malloc(nr_finfo * finfo_rec_size); - if (!func_info) { - p_err("mem alloc failed"); - close(fd); - goto err_free; - } - } - - linfo_rec_size = info.line_info_rec_size; - if (info.nr_line_info && linfo_rec_size && info.btf_id) { - nr_linfo = info.nr_line_info; - linfo = malloc(nr_linfo * linfo_rec_size); - if (!linfo) { - p_err("mem alloc failed"); - close(fd); - goto err_free; - } - } - - jited_linfo_rec_size = info.jited_line_info_rec_size; - if (info.nr_jited_line_info && - jited_linfo_rec_size && - info.nr_jited_ksyms && - info.nr_jited_func_lens && - info.btf_id) { - nr_jited_linfo = info.nr_jited_line_info; - jited_linfo = malloc(nr_jited_linfo * jited_linfo_rec_size); - if (!jited_linfo) { - p_err("mem alloc failed"); - close(fd); - goto err_free; - } - } - - memset(&info, 0, sizeof(info)); - - *member_ptr = ptr_to_u64(buf); - *member_len = buf_size; - info.jited_ksyms = ptr_to_u64(func_ksyms); - info.nr_jited_ksyms = nr_func_ksyms; - info.jited_func_lens = ptr_to_u64(func_lens); - info.nr_jited_func_lens = nr_func_lens; - info.nr_func_info = nr_finfo; - info.func_info_rec_size = finfo_rec_size; - info.func_info = ptr_to_u64(func_info); - info.nr_line_info = nr_linfo; - info.line_info_rec_size = linfo_rec_size; - info.line_info = ptr_to_u64(linfo); - info.nr_jited_line_info = nr_jited_linfo; - info.jited_line_info_rec_size = jited_linfo_rec_size; - info.jited_line_info = ptr_to_u64(jited_linfo); - - err = bpf_obj_get_info_by_fd(fd, &info, &len); + info_linear = bpf_program__get_prog_info_linear(fd, arrays); close(fd); - if (err) 
{ + if (IS_ERR_OR_NULL(info_linear)) { p_err("can't get prog info: %s", strerror(errno)); - goto err_free; + return -1; } - if (*member_len > buf_size) { - p_err("too many instructions returned"); - goto err_free; + info = &info_linear->info; + if (mode == DUMP_JITED) { + if (info->jited_prog_len == 0) { + p_info("no instructions returned"); + goto err_free; + } + buf = (unsigned char *)(info->jited_prog_insns); + member_len = info->jited_prog_len; + } else { /* DUMP_XLATED */ + if (info->xlated_prog_len == 0) { + p_err("error retrieving insn dump: kernel.kptr_restrict set?"); + goto err_free; + } + buf = (unsigned char *)info->xlated_prog_insns; + member_len = info->xlated_prog_len; } - if (info.nr_jited_ksyms > nr_func_ksyms) { - p_err("too many addresses returned"); - goto err_free; - } - - if (info.nr_jited_func_lens > nr_func_lens) { - p_err("too many values returned"); - goto err_free; - } - - if (info.nr_func_info != nr_finfo) { - p_err("incorrect nr_func_info %d vs. expected %d", - info.nr_func_info, nr_finfo); - goto err_free; - } - - if (info.func_info_rec_size != finfo_rec_size) { - p_err("incorrect func_info_rec_size %d vs. expected %d", - info.func_info_rec_size, finfo_rec_size); - goto err_free; - } - - if (linfo && info.nr_line_info != nr_linfo) { - p_err("incorrect nr_line_info %u vs. expected %u", - info.nr_line_info, nr_linfo); - goto err_free; - } - - if (info.line_info_rec_size != linfo_rec_size) { - p_err("incorrect line_info_rec_size %u vs. expected %u", - info.line_info_rec_size, linfo_rec_size); - goto err_free; - } - - if (jited_linfo && info.nr_jited_line_info != nr_jited_linfo) { - p_err("incorrect nr_jited_line_info %u vs. expected %u", - info.nr_jited_line_info, nr_jited_linfo); - goto err_free; - } - - if (info.jited_line_info_rec_size != jited_linfo_rec_size) { - p_err("incorrect jited_line_info_rec_size %u vs. expected %u", - info.jited_line_info_rec_size, jited_linfo_rec_size); - goto err_free; - } - - if ((member_len == &info.jited_prog_len && - info.jited_prog_insns == 0) || - (member_len == &info.xlated_prog_len && - info.xlated_prog_insns == 0)) { - p_err("error retrieving insn dump: kernel.kptr_restrict set?"); - goto err_free; - } - - if (info.btf_id && btf__get_from_id(info.btf_id, &btf)) { + if (info->btf_id && btf__get_from_id(info->btf_id, &btf)) { p_err("failed to get btf"); goto err_free; } - if (nr_linfo) { - prog_linfo = bpf_prog_linfo__new(&info); + func_info = (void *)info->func_info; + + if (info->nr_line_info) { + prog_linfo = bpf_prog_linfo__new(info); if (!prog_linfo) p_info("error in processing bpf_line_info. continue without it."); } @@ -655,9 +520,9 @@ static int do_dump(int argc, char **argv) goto err_free; } - n = write(fd, buf, *member_len); + n = write(fd, buf, member_len); close(fd); - if (n != *member_len) { + if (n != member_len) { p_err("error writing output file: %s", n < 0 ? 
strerror(errno) : "short write"); goto err_free; @@ -665,19 +530,19 @@ static int do_dump(int argc, char **argv) if (json_output) jsonw_null(json_wtr); - } else if (member_len == &info.jited_prog_len) { + } else if (mode == DUMP_JITED) { const char *name = NULL; - if (info.ifindex) { - name = ifindex_to_bfd_params(info.ifindex, - info.netns_dev, - info.netns_ino, + if (info->ifindex) { + name = ifindex_to_bfd_params(info->ifindex, + info->netns_dev, + info->netns_ino, &disasm_opt); if (!name) goto err_free; } - if (info.nr_jited_func_lens && info.jited_func_lens) { + if (info->nr_jited_func_lens && info->jited_func_lens) { struct kernel_sym *sym = NULL; struct bpf_func_info *record; char sym_name[SYM_MAX_NAME]; @@ -685,17 +550,16 @@ static int do_dump(int argc, char **argv) __u64 *ksyms = NULL; __u32 *lens; __u32 i; - - if (info.nr_jited_ksyms) { + if (info->nr_jited_ksyms) { kernel_syms_load(&dd); - ksyms = (__u64 *) info.jited_ksyms; + ksyms = (__u64 *) info->jited_ksyms; } if (json_output) jsonw_start_array(json_wtr); - lens = (__u32 *) info.jited_func_lens; - for (i = 0; i < info.nr_jited_func_lens; i++) { + lens = (__u32 *) info->jited_func_lens; + for (i = 0; i < info->nr_jited_func_lens; i++) { if (ksyms) { sym = kernel_syms_search(&dd, ksyms[i]); if (sym) @@ -707,7 +571,7 @@ static int do_dump(int argc, char **argv) } if (func_info) { - record = func_info + i * finfo_rec_size; + record = func_info + i * info->func_info_rec_size; btf_dumper_type_only(btf, record->type_id, func_sig, sizeof(func_sig)); @@ -744,49 +608,37 @@ static int do_dump(int argc, char **argv) if (json_output) jsonw_end_array(json_wtr); } else { - disasm_print_insn(buf, *member_len, opcodes, name, + disasm_print_insn(buf, member_len, opcodes, name, disasm_opt, btf, NULL, 0, 0, false); } } else if (visual) { if (json_output) jsonw_null(json_wtr); else - dump_xlated_cfg(buf, *member_len); + dump_xlated_cfg(buf, member_len); } else { kernel_syms_load(&dd); - dd.nr_jited_ksyms = info.nr_jited_ksyms; - dd.jited_ksyms = (__u64 *) info.jited_ksyms; + dd.nr_jited_ksyms = info->nr_jited_ksyms; + dd.jited_ksyms = (__u64 *) info->jited_ksyms; dd.btf = btf; dd.func_info = func_info; - dd.finfo_rec_size = finfo_rec_size; + dd.finfo_rec_size = info->func_info_rec_size; dd.prog_linfo = prog_linfo; if (json_output) - dump_xlated_json(&dd, buf, *member_len, opcodes, + dump_xlated_json(&dd, buf, member_len, opcodes, linum); else - dump_xlated_plain(&dd, buf, *member_len, opcodes, + dump_xlated_plain(&dd, buf, member_len, opcodes, linum); kernel_syms_destroy(&dd); } - free(buf); - free(func_ksyms); - free(func_lens); - free(func_info); - free(linfo); - free(jited_linfo); - bpf_prog_linfo__free(prog_linfo); + free(info_linear); return 0; err_free: - free(buf); - free(func_ksyms); - free(func_lens); - free(func_info); - free(linfo); - free(jited_linfo); - bpf_prog_linfo__free(prog_linfo); + free(info_linear); return -1; } From a742258af131e570a68ad8cf16cd2cc4692675a0 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 11 Mar 2019 22:30:40 -0700 Subject: [PATCH 686/855] perf bpf: Synthesize bpf events with bpf_program__get_prog_info_linear() With bpf_program__get_prog_info_linear, we can simplify the logic that synthesizes bpf events. This patch doesn't change the behavior of the code. 
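In other words, the per-subprogram arrays are now read straight out of the linear blob instead of being allocated and fetched one by one. A sketch of the access pattern (illustrative; the cast through uintptr_t matters on 32-bit targets, as the notes below show):

  struct bpf_prog_info *info = &info_linear->info;
  __u64 *prog_addrs = (__u64 *)(uintptr_t)info->jited_ksyms;
  __u32 *prog_lens  = (__u32 *)(uintptr_t)info->jited_func_lens;
  __u32 i;

  for (i = 0; i < info->nr_jited_ksyms; i++)
          pr_debug("subprog %u: addr %#llx, len %u\n",
                   i, (unsigned long long)prog_addrs[i], prog_lens[i]);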
Commiter notes: Needed this (for all four variables), suggested by Song, to overcome build failure on debian experimental cross building to MIPS 32-bit: - u8 (*prog_tags)[BPF_TAG_SIZE] = (void *)(info->prog_tags); + u8 (*prog_tags)[BPF_TAG_SIZE] = (void *)(uintptr_t)(info->prog_tags); util/bpf-event.c: In function 'perf_event__synthesize_one_bpf_prog': util/bpf-event.c:143:35: error: cast to pointer from integer of different size [-Werror=int-to-pointer-cast] u8 (*prog_tags)[BPF_TAG_SIZE] = (void *)(info->prog_tags); ^ util/bpf-event.c:144:22: error: cast to pointer from integer of different size [-Werror=int-to-pointer-cast] __u32 *prog_lens = (__u32 *)(info->jited_func_lens); ^ util/bpf-event.c:145:23: error: cast to pointer from integer of different size [-Werror=int-to-pointer-cast] __u64 *prog_addrs = (__u64 *)(info->jited_ksyms); ^ util/bpf-event.c:146:22: error: cast to pointer from integer of different size [-Werror=int-to-pointer-cast] void *func_infos = (void *)(info->func_info); ^ cc1: all warnings being treated as errors Signed-off-by: Song Liu Reviewed-by: Jiri Olsa Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: kernel-team@fb.com Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stanislav Fomichev Link: http://lkml.kernel.org/r/20190312053051.2690567-5-songliubraving@fb.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf-event.c | 118 ++++++++++++------------------------ 1 file changed, 40 insertions(+), 78 deletions(-) diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c index ea012b735a37..e0cbe7f87170 100644 --- a/tools/perf/util/bpf-event.c +++ b/tools/perf/util/bpf-event.c @@ -3,7 +3,9 @@ #include #include #include +#include #include +#include #include "bpf-event.h" #include "debug.h" #include "symbol.h" @@ -49,99 +51,62 @@ static int perf_event__synthesize_one_bpf_prog(struct perf_tool *tool, { struct ksymbol_event *ksymbol_event = &event->ksymbol_event; struct bpf_event *bpf_event = &event->bpf_event; - u32 sub_prog_cnt, i, func_info_rec_size = 0; - u8 (*prog_tags)[BPF_TAG_SIZE] = NULL; - struct bpf_prog_info info = { .type = 0, }; - u32 info_len = sizeof(info); - void *func_infos = NULL; - u64 *prog_addrs = NULL; + struct bpf_prog_info_linear *info_linear; + struct bpf_prog_info *info; struct btf *btf = NULL; - u32 *prog_lens = NULL; bool has_btf = false; - char errbuf[512]; + u32 sub_prog_cnt, i; int err = 0; + u64 arrays; - /* Call bpf_obj_get_info_by_fd() to get sizes of arrays */ - err = bpf_obj_get_info_by_fd(fd, &info, &info_len); + arrays = 1UL << BPF_PROG_INFO_JITED_KSYMS; + arrays |= 1UL << BPF_PROG_INFO_JITED_FUNC_LENS; + arrays |= 1UL << BPF_PROG_INFO_FUNC_INFO; + arrays |= 1UL << BPF_PROG_INFO_PROG_TAGS; - if (err) { - pr_debug("%s: failed to get BPF program info: %s, aborting\n", - __func__, str_error_r(errno, errbuf, sizeof(errbuf))); + info_linear = bpf_program__get_prog_info_linear(fd, arrays); + if (IS_ERR_OR_NULL(info_linear)) { + info_linear = NULL; + pr_debug("%s: failed to get BPF program info. 
aborting\n", __func__); return -1; } - if (info_len < offsetof(struct bpf_prog_info, prog_tags)) { + + if (info_linear->info_len < offsetof(struct bpf_prog_info, prog_tags)) { pr_debug("%s: the kernel is too old, aborting\n", __func__); return -2; } + info = &info_linear->info; + /* number of ksyms, func_lengths, and tags should match */ - sub_prog_cnt = info.nr_jited_ksyms; - if (sub_prog_cnt != info.nr_prog_tags || - sub_prog_cnt != info.nr_jited_func_lens) + sub_prog_cnt = info->nr_jited_ksyms; + if (sub_prog_cnt != info->nr_prog_tags || + sub_prog_cnt != info->nr_jited_func_lens) return -1; /* check BTF func info support */ - if (info.btf_id && info.nr_func_info && info.func_info_rec_size) { + if (info->btf_id && info->nr_func_info && info->func_info_rec_size) { /* btf func info number should be same as sub_prog_cnt */ - if (sub_prog_cnt != info.nr_func_info) { + if (sub_prog_cnt != info->nr_func_info) { pr_debug("%s: mismatch in BPF sub program count and BTF function info count, aborting\n", __func__); - return -1; + err = -1; + goto out; } - if (btf__get_from_id(info.btf_id, &btf)) { - pr_debug("%s: failed to get BTF of id %u, aborting\n", __func__, info.btf_id); - return -1; - } - func_info_rec_size = info.func_info_rec_size; - func_infos = calloc(sub_prog_cnt, func_info_rec_size); - if (!func_infos) { - pr_debug("%s: failed to allocate memory for func_infos, aborting\n", __func__); - return -1; + if (btf__get_from_id(info->btf_id, &btf)) { + pr_debug("%s: failed to get BTF of id %u, aborting\n", __func__, info->btf_id); + err = -1; + btf = NULL; + goto out; } has_btf = true; } - /* - * We need address, length, and tag for each sub program. - * Allocate memory and call bpf_obj_get_info_by_fd() again - */ - prog_addrs = calloc(sub_prog_cnt, sizeof(u64)); - if (!prog_addrs) { - pr_debug("%s: failed to allocate memory for prog_addrs, aborting\n", __func__); - goto out; - } - prog_lens = calloc(sub_prog_cnt, sizeof(u32)); - if (!prog_lens) { - pr_debug("%s: failed to allocate memory for prog_lens, aborting\n", __func__); - goto out; - } - prog_tags = calloc(sub_prog_cnt, BPF_TAG_SIZE); - if (!prog_tags) { - pr_debug("%s: failed to allocate memory for prog_tags, aborting\n", __func__); - goto out; - } - - memset(&info, 0, sizeof(info)); - info.nr_jited_ksyms = sub_prog_cnt; - info.nr_jited_func_lens = sub_prog_cnt; - info.nr_prog_tags = sub_prog_cnt; - info.jited_ksyms = ptr_to_u64(prog_addrs); - info.jited_func_lens = ptr_to_u64(prog_lens); - info.prog_tags = ptr_to_u64(prog_tags); - info_len = sizeof(info); - if (has_btf) { - info.nr_func_info = sub_prog_cnt; - info.func_info_rec_size = func_info_rec_size; - info.func_info = ptr_to_u64(func_infos); - } - - err = bpf_obj_get_info_by_fd(fd, &info, &info_len); - if (err) { - pr_debug("%s: failed to get BPF program info, aborting\n", __func__); - goto out; - } - /* Synthesize PERF_RECORD_KSYMBOL */ for (i = 0; i < sub_prog_cnt; i++) { + u8 (*prog_tags)[BPF_TAG_SIZE] = (void *)(uintptr_t)(info->prog_tags); + __u32 *prog_lens = (__u32 *)(uintptr_t)(info->jited_func_lens); + __u64 *prog_addrs = (__u64 *)(uintptr_t)(info->jited_ksyms); + void *func_infos = (void *)(uintptr_t)(info->func_info); const struct bpf_func_info *finfo; const char *short_name = NULL; const struct btf_type *t; @@ -163,13 +128,13 @@ static int perf_event__synthesize_one_bpf_prog(struct perf_tool *tool, KSYM_NAME_LEN - name_len, prog_tags[i], BPF_TAG_SIZE); if (has_btf) { - finfo = func_infos + i * info.func_info_rec_size; + finfo = func_infos + i * 
info->func_info_rec_size; t = btf__type_by_id(btf, finfo->type_id); short_name = btf__name_by_offset(btf, t->name_off); } else if (i == 0 && sub_prog_cnt == 1) { /* no subprog */ - if (info.name[0]) - short_name = info.name; + if (info->name[0]) + short_name = info->name; } else short_name = "F"; if (short_name) @@ -195,9 +160,9 @@ static int perf_event__synthesize_one_bpf_prog(struct perf_tool *tool, }, .type = PERF_BPF_EVENT_PROG_LOAD, .flags = 0, - .id = info.id, + .id = info->id, }; - memcpy(bpf_event->tag, prog_tags[i], BPF_TAG_SIZE); + memcpy(bpf_event->tag, info->tag, BPF_TAG_SIZE); memset((void *)event + event->header.size, 0, machine->id_hdr_size); event->header.size += machine->id_hdr_size; err = perf_tool__process_synth_event(tool, event, @@ -205,10 +170,7 @@ static int perf_event__synthesize_one_bpf_prog(struct perf_tool *tool, } out: - free(prog_tags); - free(prog_lens); - free(prog_addrs); - free(func_infos); + free(info_linear); free(btf); return err ? -1 : 0; } From e5416950454fa79b7bdc86dac45661b97d887c97 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 11 Mar 2019 22:30:41 -0700 Subject: [PATCH 687/855] perf bpf: Make synthesize_bpf_events() receive perf_session pointer instead of perf_tool This patch changes the arguments of perf_event__synthesize_bpf_events() to include perf_session* instead of perf_tool*. perf_session will be used in the next patch. Signed-off-by: Song Liu Reviewed-by: Jiri Olsa Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stanislav Fomichev Cc: kernel-team@fb.com Link: http://lkml.kernel.org/r/20190312053051.2690567-6-songliubraving@fb.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 2 +- tools/perf/builtin-top.c | 2 +- tools/perf/util/bpf-event.c | 8 +++++--- tools/perf/util/bpf-event.h | 4 ++-- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index f29874192d3e..e79faccd7842 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1114,7 +1114,7 @@ static int record__synthesize(struct record *rec, bool tail) return err; } - err = perf_event__synthesize_bpf_events(tool, process_synthesized_event, + err = perf_event__synthesize_bpf_events(session, process_synthesized_event, machine, opts); if (err < 0) pr_warning("Couldn't synthesize bpf events.\n"); diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 2508a7a552fa..77e6190211d2 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1208,7 +1208,7 @@ static int __cmd_top(struct perf_top *top) init_process_thread(top); - ret = perf_event__synthesize_bpf_events(&top->tool, perf_event__process, + ret = perf_event__synthesize_bpf_events(top->session, perf_event__process, &top->session->machines.host, &top->record_opts); if (ret < 0) diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c index e0cbe7f87170..5237e8f11997 100644 --- a/tools/perf/util/bpf-event.c +++ b/tools/perf/util/bpf-event.c @@ -10,6 +10,7 @@ #include "debug.h" #include "symbol.h" #include "machine.h" +#include "session.h" #define ptr_to_u64(ptr) ((__u64)(unsigned long)(ptr)) @@ -42,7 +43,7 @@ int machine__process_bpf_event(struct machine *machine __maybe_unused, * -1 for failures; * -2 for lack of kernel support. 
*/ -static int perf_event__synthesize_one_bpf_prog(struct perf_tool *tool, +static int perf_event__synthesize_one_bpf_prog(struct perf_session *session, perf_event__handler_t process, struct machine *machine, int fd, @@ -52,6 +53,7 @@ static int perf_event__synthesize_one_bpf_prog(struct perf_tool *tool, struct ksymbol_event *ksymbol_event = &event->ksymbol_event; struct bpf_event *bpf_event = &event->bpf_event; struct bpf_prog_info_linear *info_linear; + struct perf_tool *tool = session->tool; struct bpf_prog_info *info; struct btf *btf = NULL; bool has_btf = false; @@ -175,7 +177,7 @@ out: return err ? -1 : 0; } -int perf_event__synthesize_bpf_events(struct perf_tool *tool, +int perf_event__synthesize_bpf_events(struct perf_session *session, perf_event__handler_t process, struct machine *machine, struct record_opts *opts) @@ -209,7 +211,7 @@ int perf_event__synthesize_bpf_events(struct perf_tool *tool, continue; } - err = perf_event__synthesize_one_bpf_prog(tool, process, + err = perf_event__synthesize_one_bpf_prog(session, process, machine, fd, event, opts); close(fd); diff --git a/tools/perf/util/bpf-event.h b/tools/perf/util/bpf-event.h index 7890067e1a37..6698683612a7 100644 --- a/tools/perf/util/bpf-event.h +++ b/tools/perf/util/bpf-event.h @@ -15,7 +15,7 @@ struct record_opts; int machine__process_bpf_event(struct machine *machine, union perf_event *event, struct perf_sample *sample); -int perf_event__synthesize_bpf_events(struct perf_tool *tool, +int perf_event__synthesize_bpf_events(struct perf_session *session, perf_event__handler_t process, struct machine *machine, struct record_opts *opts); @@ -27,7 +27,7 @@ static inline int machine__process_bpf_event(struct machine *machine __maybe_unu return 0; } -static inline int perf_event__synthesize_bpf_events(struct perf_tool *tool __maybe_unused, +static inline int perf_event__synthesize_bpf_events(struct perf_session *session __maybe_unused, perf_event__handler_t process __maybe_unused, struct machine *machine __maybe_unused, struct record_opts *opts __maybe_unused) From e4378f0cb90be0368c48baad69a99203c58e3196 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 11 Mar 2019 22:30:42 -0700 Subject: [PATCH 688/855] perf bpf: Save bpf_prog_info in a rbtree in perf_env bpf_prog_info contains information necessary to annotate bpf programs. This patch saves bpf_prog_info for bpf programs loaded in the system. Some big picture of the next few patches: To fully annotate BPF programs with source code mapping, 4 different informations are needed: 1) PERF_RECORD_KSYMBOL 2) PERF_RECORD_BPF_EVENT 3) bpf_prog_info 4) btf Before this set, 1) and 2) in the list are already saved to perf.data file. For BPF programs that are already loaded before perf run, 1) and 2) are synthesized by perf_event__synthesize_bpf_events(). For short living BPF programs, 1) and 2) are generated by kernel. This set handles 3) and 4) from the list. Again, it is necessary to handle existing BPF program and short living program separately. This patch handles 3) for exising BPF programs while synthesizing 1) and 2) in perf_event__synthesize_bpf_events(). These data are stored in perf_env. The next patch saves these data from perf_env to perf.data as headers. Similarly, the two patches after the next saves 4) of existing BPF programs to perf_env and perf.data. Another patch later will handle 3) and 4) for short living BPF programs by monitoring 1) and 2) in a dedicate thread. 
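A sketch of how the new perf_env storage is meant to be used (env, info_linear and prog_id are assumed to be in scope; the insert/find helpers are the ones added below, the lookup call site is hypothetical):

  struct bpf_prog_info_node *info_node;

  info_node = malloc(sizeof(*info_node));
  if (!info_node)
          return -1;
  info_node->info_linear = info_linear;   /* perf_env takes ownership */
  perf_env__insert_bpf_prog_info(env, info_node);

  /* later, e.g. when annotating a sample that hit a known prog id */
  info_node = perf_env__find_bpf_prog_info(env, prog_id);
  if (info_node)
          pr_debug("found bpf_prog_info for prog %u\n",
                   info_node->info_linear->info.id);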
Signed-off-by: Song Liu Reviewed-by: Jiri Olsa Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stanislav Fomichev Cc: kernel-team@fb.com Link: http://lkml.kernel.org/r/20190312053051.2690567-7-songliubraving@fb.com [ set env->bpf_progs.infos_cnt to zero in perf_env__purge_bpf() as noted by jolsa ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/perf.c | 1 + tools/perf/util/bpf-event.c | 30 ++++++++++++- tools/perf/util/bpf-event.h | 7 ++- tools/perf/util/env.c | 88 +++++++++++++++++++++++++++++++++++++ tools/perf/util/env.h | 19 ++++++++ tools/perf/util/session.c | 1 + 6 files changed, 144 insertions(+), 2 deletions(-) diff --git a/tools/perf/perf.c b/tools/perf/perf.c index a11cb006f968..72df4b6fa36f 100644 --- a/tools/perf/perf.c +++ b/tools/perf/perf.c @@ -298,6 +298,7 @@ static int run_builtin(struct cmd_struct *p, int argc, const char **argv) use_pager = 1; commit_pager_choice(); + perf_env__init(&perf_env); perf_env__set_cmdline(&perf_env, argc, argv); status = p->fn(argc, argv); perf_config__exit(); diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c index 5237e8f11997..37ee4e2a728a 100644 --- a/tools/perf/util/bpf-event.c +++ b/tools/perf/util/bpf-event.c @@ -10,6 +10,7 @@ #include "debug.h" #include "symbol.h" #include "machine.h" +#include "env.h" #include "session.h" #define ptr_to_u64(ptr) ((__u64)(unsigned long)(ptr)) @@ -54,17 +55,28 @@ static int perf_event__synthesize_one_bpf_prog(struct perf_session *session, struct bpf_event *bpf_event = &event->bpf_event; struct bpf_prog_info_linear *info_linear; struct perf_tool *tool = session->tool; + struct bpf_prog_info_node *info_node; struct bpf_prog_info *info; struct btf *btf = NULL; bool has_btf = false; + struct perf_env *env; u32 sub_prog_cnt, i; int err = 0; u64 arrays; + /* + * for perf-record and perf-report use header.env; + * otherwise, use global perf_env. + */ + env = session->data ? 
&session->header.env : &perf_env; + arrays = 1UL << BPF_PROG_INFO_JITED_KSYMS; arrays |= 1UL << BPF_PROG_INFO_JITED_FUNC_LENS; arrays |= 1UL << BPF_PROG_INFO_FUNC_INFO; arrays |= 1UL << BPF_PROG_INFO_PROG_TAGS; + arrays |= 1UL << BPF_PROG_INFO_JITED_INSNS; + arrays |= 1UL << BPF_PROG_INFO_LINE_INFO; + arrays |= 1UL << BPF_PROG_INFO_JITED_LINE_INFO; info_linear = bpf_program__get_prog_info_linear(fd, arrays); if (IS_ERR_OR_NULL(info_linear)) { @@ -153,8 +165,8 @@ static int perf_event__synthesize_one_bpf_prog(struct perf_session *session, machine, process); } - /* Synthesize PERF_RECORD_BPF_EVENT */ if (!opts->no_bpf_event) { + /* Synthesize PERF_RECORD_BPF_EVENT */ *bpf_event = (struct bpf_event){ .header = { .type = PERF_RECORD_BPF_EVENT, @@ -167,6 +179,22 @@ static int perf_event__synthesize_one_bpf_prog(struct perf_session *session, memcpy(bpf_event->tag, info->tag, BPF_TAG_SIZE); memset((void *)event + event->header.size, 0, machine->id_hdr_size); event->header.size += machine->id_hdr_size; + + /* save bpf_prog_info to env */ + info_node = malloc(sizeof(struct bpf_prog_info_node)); + if (!info_node) { + err = -1; + goto out; + } + + info_node->info_linear = info_linear; + perf_env__insert_bpf_prog_info(env, info_node); + info_linear = NULL; + + /* + * process after saving bpf_prog_info to env, so that + * required information is ready for look up + */ err = perf_tool__process_synth_event(tool, event, machine, process); } diff --git a/tools/perf/util/bpf-event.h b/tools/perf/util/bpf-event.h index 6698683612a7..fad932f7404f 100644 --- a/tools/perf/util/bpf-event.h +++ b/tools/perf/util/bpf-event.h @@ -3,14 +3,19 @@ #define __PERF_BPF_EVENT_H #include +#include #include "event.h" struct machine; union perf_event; struct perf_sample; -struct perf_tool; struct record_opts; +struct bpf_prog_info_node { + struct bpf_prog_info_linear *info_linear; + struct rb_node rb_node; +}; + #ifdef HAVE_LIBBPF_SUPPORT int machine__process_bpf_event(struct machine *machine, union perf_event *event, struct perf_sample *sample); diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index 4c23779e271a..98cd36f0e317 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -3,15 +3,97 @@ #include "env.h" #include "sane_ctype.h" #include "util.h" +#include "bpf-event.h" #include #include +#include struct perf_env perf_env; +void perf_env__insert_bpf_prog_info(struct perf_env *env, + struct bpf_prog_info_node *info_node) +{ + __u32 prog_id = info_node->info_linear->info.id; + struct bpf_prog_info_node *node; + struct rb_node *parent = NULL; + struct rb_node **p; + + down_write(&env->bpf_progs.lock); + p = &env->bpf_progs.infos.rb_node; + + while (*p != NULL) { + parent = *p; + node = rb_entry(parent, struct bpf_prog_info_node, rb_node); + if (prog_id < node->info_linear->info.id) { + p = &(*p)->rb_left; + } else if (prog_id > node->info_linear->info.id) { + p = &(*p)->rb_right; + } else { + pr_debug("duplicated bpf prog info %u\n", prog_id); + goto out; + } + } + + rb_link_node(&info_node->rb_node, parent, p); + rb_insert_color(&info_node->rb_node, &env->bpf_progs.infos); + env->bpf_progs.infos_cnt++; +out: + up_write(&env->bpf_progs.lock); +} + +struct bpf_prog_info_node *perf_env__find_bpf_prog_info(struct perf_env *env, + __u32 prog_id) +{ + struct bpf_prog_info_node *node = NULL; + struct rb_node *n; + + down_read(&env->bpf_progs.lock); + n = env->bpf_progs.infos.rb_node; + + while (n) { + node = rb_entry(n, struct bpf_prog_info_node, rb_node); + if (prog_id < node->info_linear->info.id) + n = 
n->rb_left; + else if (prog_id > node->info_linear->info.id) + n = n->rb_right; + else + break; + } + + up_read(&env->bpf_progs.lock); + return node; +} + +/* purge data in bpf_progs.infos tree */ +static void perf_env__purge_bpf(struct perf_env *env) +{ + struct rb_root *root; + struct rb_node *next; + + down_write(&env->bpf_progs.lock); + + root = &env->bpf_progs.infos; + next = rb_first(root); + + while (next) { + struct bpf_prog_info_node *node; + + node = rb_entry(next, struct bpf_prog_info_node, rb_node); + next = rb_next(&node->rb_node); + rb_erase(&node->rb_node, root); + free(node); + } + + env->bpf_progs.infos_cnt = 0; + + up_write(&env->bpf_progs.lock); +} + void perf_env__exit(struct perf_env *env) { int i; + perf_env__purge_bpf(env); zfree(&env->hostname); zfree(&env->os_release); zfree(&env->version); @@ -38,6 +120,12 @@ void perf_env__exit(struct perf_env *env) zfree(&env->memory_nodes); } +void perf_env__init(struct perf_env *env) +{ + env->bpf_progs.infos = RB_ROOT; + init_rwsem(&env->bpf_progs.lock); +} + int perf_env__set_cmdline(struct perf_env *env, int argc, const char *argv[]) { int i; diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h index d01b8355f4ca..24b11c564ba4 100644 --- a/tools/perf/util/env.h +++ b/tools/perf/util/env.h @@ -3,7 +3,9 @@ #define __PERF_ENV_H #include +#include #include "cpumap.h" +#include "rwsem.h" struct cpu_topology_map { int socket_id; @@ -64,8 +66,20 @@ struct perf_env { struct memory_node *memory_nodes; unsigned long long memory_bsize; u64 clockid_res_ns; + + /* + * bpf_info_lock protects bpf rbtrees. This is needed because the + * trees are accessed by different threads in perf-top + */ + struct { + struct rw_semaphore lock; + struct rb_root infos; + u32 infos_cnt; + } bpf_progs; }; +struct bpf_prog_info_node; + extern struct perf_env perf_env; void perf_env__exit(struct perf_env *env); @@ -80,4 +94,9 @@ const char *perf_env__arch(struct perf_env *env); const char *perf_env__raw_arch(struct perf_env *env); int perf_env__nr_cpus_avail(struct perf_env *env); +void perf_env__init(struct perf_env *env); +void perf_env__insert_bpf_prog_info(struct perf_env *env, + struct bpf_prog_info_node *info_node); +struct bpf_prog_info_node *perf_env__find_bpf_prog_info(struct perf_env *env, + __u32 prog_id); #endif /* __PERF_ENV_H */ diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 0ec34227bd60..b17f1c9bc965 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -132,6 +132,7 @@ struct perf_session *perf_session__new(struct perf_data *data, ordered_events__init(&session->ordered_events, ordered_events__deliver_event, NULL); + perf_env__init(&session->header.env); if (data) { if (perf_data__open(data)) goto out_delete; From 606f972b1361f477cbd4e6e8ac00742fde4b39db Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 11 Mar 2019 22:30:43 -0700 Subject: [PATCH 689/855] perf bpf: Save bpf_prog_info information as headers to perf.data This patch enables perf-record to save bpf_prog_info information as headers to perf.data. A new header type HEADER_BPF_PROG_INFO is introduced for this data. Committer testing: As root, being on the kernel sources top level directory, run: # perf trace -e tools/perf/examples/bpf/augmented_raw_syscalls.c -e *msg Just to compile and load a BPF program that attaches to the raw_syscalls:sys_{enter,exit} tracepoints to trace the syscalls ending in "msg" (recvmsg, sendmsg, recvmmsg, sendmmsg, etc). 
Then do a systemwide perf record session for a few seconds: # perf record -a sleep 2s Then look at: # perf report --header-only | grep -i bpf # bpf_prog_info of id 13 # bpf_prog_info of id 14 # bpf_prog_info of id 15 # bpf_prog_info of id 16 # bpf_prog_info of id 17 # bpf_prog_info of id 18 # bpf_prog_info of id 21 # bpf_prog_info of id 22 # bpf_prog_info of id 208 # bpf_prog_info of id 209 # We need to show more info about these programs, like bpftool does for the ones running on the system, i.e. 'perf record/perf report' become a way of saving the BPF state in a machine to then analyse on another, together with all the other information that is already saved in the perf.data header: # perf report --header-only # ======== # captured on : Tue Mar 12 11:42:13 2019 # header version : 1 # data offset : 296 # data size : 16294184 # feat offset : 16294480 # hostname : quaco # os release : 5.0.0+ # perf version : 5.0.gd783c8 # arch : x86_64 # nrcpus online : 8 # nrcpus avail : 8 # cpudesc : Intel(R) Core(TM) i7-8650U CPU @ 1.90GHz # cpuid : GenuineIntel,6,142,10 # total memory : 24555720 kB # cmdline : /home/acme/bin/perf (deleted) record -a # event : name = cycles:ppp, , id = { 3190123, 3190124, 3190125, 3190126, 3190127, 3190128, 3190129, 3190130 }, size = 112, { sample_period, sample_freq } = 4000, sample_type = IP|TID|TIME|CPU|PERIOD, read_format = ID, disabled = 1, inherit = 1, mmap = 1, comm = 1, freq = 1, task = 1, precise_ip = 3, sample_id_all = 1, exclude_guest = 1, mmap2 = 1, comm_exec = 1 # CPU_TOPOLOGY info available, use -I to display # NUMA_TOPOLOGY info available, use -I to display # pmu mappings: intel_pt = 8, software = 1, power = 11, uprobe = 7, uncore_imc = 12, cpu = 4, cstate_core = 18, uncore_cbox_2 = 15, breakpoint = 5, uncore_cbox_0 = 13, tracepoint = 2, cstate_pkg = 19, uncore_arb = 17, kprobe = 6, i915 = 10, msr = 9, uncore_cbox_3 = 16, uncore_cbox_1 = 14 # CACHE info available, use -I to display # time of first sample : 116392.441701 # time of last sample : 116400.932584 # sample duration : 8490.883 ms # MEM_TOPOLOGY info available, use -I to display # bpf_prog_info of id 13 # bpf_prog_info of id 14 # bpf_prog_info of id 15 # bpf_prog_info of id 16 # bpf_prog_info of id 17 # bpf_prog_info of id 18 # bpf_prog_info of id 21 # bpf_prog_info of id 22 # bpf_prog_info of id 208 # bpf_prog_info of id 209 # missing features: TRACING_DATA BRANCH_STACK GROUP_DESC AUXTRACE STAT CLOCKID DIR_FORMAT # ======== # Committer notes: We can't use the libbpf unconditionally, as the build may have been with NO_LIBBPF, when we end up with linking errors, so provide dummy {process,write}_bpf_prog_info() wrapped by HAVE_LIBBPF_SUPPORT for that case. Printing are not affected by this, so can continue as is. 
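For reference, the byte layout that write_bpf_prog_info() emits and process_bpf_prog_info() expects for the new feature section is, in sketch form:

  /*
   * HEADER_BPF_PROG_INFO feature section:
   *
   *   u32 count;                      number of saved programs (infos_cnt)
   *   then, repeated count times, one bpf_prog_info_linear record:
   *     u32 info_len;                 sizeof(struct bpf_prog_info) when written
   *     u32 data_len;                 bytes of array data, rounded up to 8
   *     u64 arrays;                   bitmask of BPF_PROG_INFO_* arrays present
   *     struct bpf_prog_info info;    pointer fields translated to data offsets
   *     u8  data[data_len];           the arrays themselves
   */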
Signed-off-by: Song Liu Reviewed-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stanislav Fomichev Cc: kernel-team@fb.com Link: http://lkml.kernel.org/r/20190312053051.2690567-8-songliubraving@fb.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/header.c | 153 ++++++++++++++++++++++++++++++++++++++- tools/perf/util/header.h | 1 + 2 files changed, 153 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index b0683bf4d9f3..e6a81af516f6 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "evlist.h" #include "evsel.h" @@ -40,6 +41,7 @@ #include "time-utils.h" #include "units.h" #include "cputopo.h" +#include "bpf-event.h" #include "sane_ctype.h" @@ -876,6 +878,56 @@ static int write_dir_format(struct feat_fd *ff, return do_write(ff, &data->dir.version, sizeof(data->dir.version)); } +#ifdef HAVE_LIBBPF_SUPPORT +static int write_bpf_prog_info(struct feat_fd *ff, + struct perf_evlist *evlist __maybe_unused) +{ + struct perf_env *env = &ff->ph->env; + struct rb_root *root; + struct rb_node *next; + int ret; + + down_read(&env->bpf_progs.lock); + + ret = do_write(ff, &env->bpf_progs.infos_cnt, + sizeof(env->bpf_progs.infos_cnt)); + if (ret < 0) + goto out; + + root = &env->bpf_progs.infos; + next = rb_first(root); + while (next) { + struct bpf_prog_info_node *node; + size_t len; + + node = rb_entry(next, struct bpf_prog_info_node, rb_node); + next = rb_next(&node->rb_node); + len = sizeof(struct bpf_prog_info_linear) + + node->info_linear->data_len; + + /* before writing to file, translate address to offset */ + bpf_program__bpil_addr_to_offs(node->info_linear); + ret = do_write(ff, node->info_linear, len); + /* + * translate back to address even when do_write() fails, + * so that this function never changes the data. 
+ */ + bpf_program__bpil_offs_to_addr(node->info_linear); + if (ret < 0) + goto out; + } +out: + up_read(&env->bpf_progs.lock); + return ret; +} +#else // HAVE_LIBBPF_SUPPORT +static int write_bpf_prog_info(struct feat_fd *ff __maybe_unused, + struct perf_evlist *evlist __maybe_unused) +{ + return 0; +} +#endif // HAVE_LIBBPF_SUPPORT + static int cpu_cache_level__sort(const void *a, const void *b) { struct cpu_cache_level *cache_a = (struct cpu_cache_level *)a; @@ -1367,6 +1419,29 @@ static void print_dir_format(struct feat_fd *ff, FILE *fp) fprintf(fp, "# directory data version : %"PRIu64"\n", data->dir.version); } +static void print_bpf_prog_info(struct feat_fd *ff, FILE *fp) +{ + struct perf_env *env = &ff->ph->env; + struct rb_root *root; + struct rb_node *next; + + down_read(&env->bpf_progs.lock); + + root = &env->bpf_progs.infos; + next = rb_first(root); + + while (next) { + struct bpf_prog_info_node *node; + + node = rb_entry(next, struct bpf_prog_info_node, rb_node); + next = rb_next(&node->rb_node); + fprintf(fp, "# bpf_prog_info of id %u\n", + node->info_linear->info.id); + } + + up_read(&env->bpf_progs.lock); +} + static void free_event_desc(struct perf_evsel *events) { struct perf_evsel *evsel; @@ -2414,6 +2489,81 @@ static int process_dir_format(struct feat_fd *ff, return do_read_u64(ff, &data->dir.version); } +#ifdef HAVE_LIBBPF_SUPPORT +static int process_bpf_prog_info(struct feat_fd *ff, void *data __maybe_unused) +{ + struct bpf_prog_info_linear *info_linear; + struct bpf_prog_info_node *info_node; + struct perf_env *env = &ff->ph->env; + u32 count, i; + int err = -1; + + if (ff->ph->needs_swap) { + pr_warning("interpreting bpf_prog_info from systems with endianity is not yet supported\n"); + return 0; + } + + if (do_read_u32(ff, &count)) + return -1; + + down_write(&env->bpf_progs.lock); + + for (i = 0; i < count; ++i) { + u32 info_len, data_len; + + info_linear = NULL; + info_node = NULL; + if (do_read_u32(ff, &info_len)) + goto out; + if (do_read_u32(ff, &data_len)) + goto out; + + if (info_len > sizeof(struct bpf_prog_info)) { + pr_warning("detected invalid bpf_prog_info\n"); + goto out; + } + + info_linear = malloc(sizeof(struct bpf_prog_info_linear) + + data_len); + if (!info_linear) + goto out; + info_linear->info_len = sizeof(struct bpf_prog_info); + info_linear->data_len = data_len; + if (do_read_u64(ff, (u64 *)(&info_linear->arrays))) + goto out; + if (__do_read(ff, &info_linear->info, info_len)) + goto out; + if (info_len < sizeof(struct bpf_prog_info)) + memset(((void *)(&info_linear->info)) + info_len, 0, + sizeof(struct bpf_prog_info) - info_len); + + if (__do_read(ff, info_linear->data, data_len)) + goto out; + + info_node = malloc(sizeof(struct bpf_prog_info_node)); + if (!info_node) + goto out; + + /* after reading from file, translate offset to address */ + bpf_program__bpil_offs_to_addr(info_linear); + info_node->info_linear = info_linear; + perf_env__insert_bpf_prog_info(env, info_node); + } + + return 0; +out: + free(info_linear); + free(info_node); + up_write(&env->bpf_progs.lock); + return err; +} +#else // HAVE_LIBBPF_SUPPORT +static int process_bpf_prog_info(struct feat_fd *ff __maybe_unused, void *data __maybe_unused) +{ + return 0; +} +#endif // HAVE_LIBBPF_SUPPORT + struct feature_ops { int (*write)(struct feat_fd *ff, struct perf_evlist *evlist); void (*print)(struct feat_fd *ff, FILE *fp); @@ -2474,7 +2624,8 @@ static const struct feature_ops feat_ops[HEADER_LAST_FEATURE] = { FEAT_OPR(SAMPLE_TIME, sample_time, false), FEAT_OPR(MEM_TOPOLOGY, 
mem_topology, true), FEAT_OPR(CLOCKID, clockid, false), - FEAT_OPN(DIR_FORMAT, dir_format, false) + FEAT_OPN(DIR_FORMAT, dir_format, false), + FEAT_OPR(BPF_PROG_INFO, bpf_prog_info, false) }; struct header_print_data { diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 6a231340238d..1dc85f0f895d 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -40,6 +40,7 @@ enum { HEADER_MEM_TOPOLOGY, HEADER_CLOCKID, HEADER_DIR_FORMAT, + HEADER_BPF_PROG_INFO, HEADER_LAST_FEATURE, HEADER_FEAT_BITS = 256, }; From 3792cb2ff43b1b193136a03ce1336462a827d792 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 11 Mar 2019 22:30:44 -0700 Subject: [PATCH 690/855] perf bpf: Save BTF in a rbtree in perf_env BTF contains information necessary to annotate BPF programs. This patch saves BTF for BPF programs loaded in the system. Signed-off-by: Song Liu Reviewed-by: Jiri Olsa Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stanislav Fomichev Cc: kernel-team@fb.com Link: http://lkml.kernel.org/r/20190312053051.2690567-9-songliubraving@fb.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf-event.c | 23 +++++++++++++ tools/perf/util/bpf-event.h | 7 ++++ tools/perf/util/env.c | 67 +++++++++++++++++++++++++++++++++++++ tools/perf/util/env.h | 5 +++ 4 files changed, 102 insertions(+) diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c index 37ee4e2a728a..a4fc52b4ffae 100644 --- a/tools/perf/util/bpf-event.c +++ b/tools/perf/util/bpf-event.c @@ -34,6 +34,28 @@ int machine__process_bpf_event(struct machine *machine __maybe_unused, return 0; } +static int perf_env__fetch_btf(struct perf_env *env, + u32 btf_id, + struct btf *btf) +{ + struct btf_node *node; + u32 data_size; + const void *data; + + data = btf__get_raw_data(btf, &data_size); + + node = malloc(data_size + sizeof(struct btf_node)); + if (!node) + return -1; + + node->id = btf_id; + node->data_size = data_size; + memcpy(node->data, data, data_size); + + perf_env__insert_btf(env, node); + return 0; +} + /* * Synthesize PERF_RECORD_KSYMBOL and PERF_RECORD_BPF_EVENT for one bpf * program. One PERF_RECORD_BPF_EVENT is generated for the program. 
And @@ -113,6 +135,7 @@ static int perf_event__synthesize_one_bpf_prog(struct perf_session *session, goto out; } has_btf = true; + perf_env__fetch_btf(env, info->btf_id, btf); } /* Synthesize PERF_RECORD_KSYMBOL */ diff --git a/tools/perf/util/bpf-event.h b/tools/perf/util/bpf-event.h index fad932f7404f..b9ec394dc7c7 100644 --- a/tools/perf/util/bpf-event.h +++ b/tools/perf/util/bpf-event.h @@ -16,6 +16,13 @@ struct bpf_prog_info_node { struct rb_node rb_node; }; +struct btf_node { + struct rb_node rb_node; + u32 id; + u32 data_size; + char data[]; +}; + #ifdef HAVE_LIBBPF_SUPPORT int machine__process_bpf_event(struct machine *machine, union perf_event *event, struct perf_sample *sample); diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index 98cd36f0e317..c6351b557bb0 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -64,6 +64,58 @@ struct bpf_prog_info_node *perf_env__find_bpf_prog_info(struct perf_env *env, return node; } +void perf_env__insert_btf(struct perf_env *env, struct btf_node *btf_node) +{ + struct rb_node *parent = NULL; + __u32 btf_id = btf_node->id; + struct btf_node *node; + struct rb_node **p; + + down_write(&env->bpf_progs.lock); + p = &env->bpf_progs.btfs.rb_node; + + while (*p != NULL) { + parent = *p; + node = rb_entry(parent, struct btf_node, rb_node); + if (btf_id < node->id) { + p = &(*p)->rb_left; + } else if (btf_id > node->id) { + p = &(*p)->rb_right; + } else { + pr_debug("duplicated btf %u\n", btf_id); + goto out; + } + } + + rb_link_node(&btf_node->rb_node, parent, p); + rb_insert_color(&btf_node->rb_node, &env->bpf_progs.btfs); + env->bpf_progs.btfs_cnt++; +out: + up_write(&env->bpf_progs.lock); +} + +struct btf_node *perf_env__find_btf(struct perf_env *env, __u32 btf_id) +{ + struct btf_node *node = NULL; + struct rb_node *n; + + down_read(&env->bpf_progs.lock); + n = env->bpf_progs.btfs.rb_node; + + while (n) { + node = rb_entry(n, struct btf_node, rb_node); + if (btf_id < node->id) + n = n->rb_left; + else if (btf_id > node->id) + n = n->rb_right; + else + break; + } + + up_read(&env->bpf_progs.lock); + return node; +} + /* purge data in bpf_progs.infos tree */ static void perf_env__purge_bpf(struct perf_env *env) { @@ -86,6 +138,20 @@ static void perf_env__purge_bpf(struct perf_env *env) env->bpf_progs.infos_cnt = 0; + root = &env->bpf_progs.btfs; + next = rb_first(root); + + while (next) { + struct btf_node *node; + + node = rb_entry(next, struct btf_node, rb_node); + next = rb_next(&node->rb_node); + rb_erase(&node->rb_node, root); + free(node); + } + + env->bpf_progs.btfs_cnt = 0; + up_write(&env->bpf_progs.lock); } @@ -123,6 +189,7 @@ void perf_env__exit(struct perf_env *env) void perf_env__init(struct perf_env *env) { env->bpf_progs.infos = RB_ROOT; + env->bpf_progs.btfs = RB_ROOT; init_rwsem(&env->bpf_progs.lock); } diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h index 24b11c564ba4..4f8e2b485c01 100644 --- a/tools/perf/util/env.h +++ b/tools/perf/util/env.h @@ -75,10 +75,13 @@ struct perf_env { struct rw_semaphore lock; struct rb_root infos; u32 infos_cnt; + struct rb_root btfs; + u32 btfs_cnt; } bpf_progs; }; struct bpf_prog_info_node; +struct btf_node; extern struct perf_env perf_env; @@ -99,4 +102,6 @@ void perf_env__insert_bpf_prog_info(struct perf_env *env, struct bpf_prog_info_node *info_node); struct bpf_prog_info_node *perf_env__find_bpf_prog_info(struct perf_env *env, __u32 prog_id); +void perf_env__insert_btf(struct perf_env *env, struct btf_node *btf_node); +struct btf_node *perf_env__find_btf(struct 
perf_env *env, __u32 btf_id); #endif /* __PERF_ENV_H */ From a70a1123174ab592c5fa8ecf09f9fad9b335b872 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 11 Mar 2019 22:30:45 -0700 Subject: [PATCH 691/855] perf bpf: Save BTF information as headers to perf.data This patch enables 'perf record' to save BTF information as headers to perf.data. A new header type HEADER_BPF_BTF is introduced for this data. Committer testing: As root, being on the kernel sources top level directory, run: # perf trace -e tools/perf/examples/bpf/augmented_raw_syscalls.c -e *msg Just to compile and load a BPF program that attaches to the raw_syscalls:sys_{enter,exit} tracepoints to trace the syscalls ending in "msg" (recvmsg, sendmsg, recvmmsg, sendmmsg, etc). Make sure you have a recent enough clang, say version 9, to get the BTF ELF sections needed for this testing: # clang --version | head -1 clang version 9.0.0 (https://git.llvm.org/git/clang.git/ 7906282d3afec5dfdc2b27943fd6c0309086c507) (https://git.llvm.org/git/llvm.git/ a1b5de1ff8ae8bc79dc8e86e1f82565229bd0500) # readelf -SW tools/perf/examples/bpf/augmented_raw_syscalls.o | grep BTF [22] .BTF PROGBITS 0000000000000000 000ede 000b0e 00 0 0 1 [23] .BTF.ext PROGBITS 0000000000000000 0019ec 0002a0 00 0 0 1 [24] .rel.BTF.ext REL 0000000000000000 002fa8 000270 10 30 23 8 Then do a systemwide perf record session for a few seconds: # perf record -a sleep 2s Then look at: # perf report --header-only | grep b[pt]f # event : name = cycles:ppp, , id = { 1116204, 1116205, 1116206, 1116207, 1116208, 1116209, 1116210, 1116211 }, size = 112, { sample_period, sample_freq } = 4000, sample_type = IP|TID|TIME|PERIOD, read_format = ID, disabled = 1, inherit = 1, mmap = 1, comm = 1, freq = 1, enable_on_exec = 1, task = 1, precise_ip = 3, sample_id_all = 1, exclude_guest = 1, mmap2 = 1, comm_exec = 1, ksymbol = 1, bpf_event = 1 # bpf_prog_info of id 13 # bpf_prog_info of id 14 # bpf_prog_info of id 15 # bpf_prog_info of id 16 # bpf_prog_info of id 17 # bpf_prog_info of id 18 # bpf_prog_info of id 21 # bpf_prog_info of id 22 # bpf_prog_info of id 51 # bpf_prog_info of id 52 # btf info of id 8 # We need to show more info about these BPF and BTF entries , but that can be done later. 
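For reference, the payload this feature writes is a flat byte stream: a u32 count, then for every BTF object a u32 id, a u32 data_size and data_size bytes of raw BTF, stored in the writer's native byte order (process_bpf_btf() below simply skips the feature when byte swapping would be needed). A minimal standalone sketch of a parser for that layout; the helper names here are illustrative, not perf APIs:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int parse_btf_feature(const uint8_t *p, size_t len)
{
	const uint8_t *end = p + len;
	uint32_t count, i;

	if ((size_t)(end - p) < sizeof(count))
		return -1;
	memcpy(&count, p, sizeof(count));	/* number of BTF records */
	p += sizeof(count);

	for (i = 0; i < count; i++) {
		uint32_t id, data_size;

		if ((size_t)(end - p) < 2 * sizeof(uint32_t))
			return -1;
		memcpy(&id, p, sizeof(id));	/* BTF object id */
		memcpy(&data_size, p + sizeof(id), sizeof(data_size));
		p += 2 * sizeof(uint32_t);

		if ((size_t)(end - p) < data_size)
			return -1;
		printf("btf id %u: %u bytes of raw BTF\n", id, data_size);
		p += data_size;			/* raw BTF blob stored inline */
	}
	return 0;
}

int main(void)
{
	/* one synthetic record: count = 1, id = 8, data_size = 4, 4 payload bytes */
	uint32_t buf[] = { 1, 8, 4, 0xdeadbeef };

	return parse_btf_feature((const uint8_t *)buf, sizeof(buf));
}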
Signed-off-by: Song Liu Reviewed-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stanislav Fomichev Cc: kernel-team@fb.com Link: http://lkml.kernel.org/r/20190312053051.2690567-10-songliubraving@fb.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/header.c | 101 ++++++++++++++++++++++++++++++++++++++- tools/perf/util/header.h | 1 + 2 files changed, 101 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index e6a81af516f6..01dda2f65d36 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -928,6 +928,39 @@ static int write_bpf_prog_info(struct feat_fd *ff __maybe_unused, } #endif // HAVE_LIBBPF_SUPPORT +static int write_bpf_btf(struct feat_fd *ff, + struct perf_evlist *evlist __maybe_unused) +{ + struct perf_env *env = &ff->ph->env; + struct rb_root *root; + struct rb_node *next; + int ret; + + down_read(&env->bpf_progs.lock); + + ret = do_write(ff, &env->bpf_progs.btfs_cnt, + sizeof(env->bpf_progs.btfs_cnt)); + + if (ret < 0) + goto out; + + root = &env->bpf_progs.btfs; + next = rb_first(root); + while (next) { + struct btf_node *node; + + node = rb_entry(next, struct btf_node, rb_node); + next = rb_next(&node->rb_node); + ret = do_write(ff, &node->id, + sizeof(u32) * 2 + node->data_size); + if (ret < 0) + goto out; + } +out: + up_read(&env->bpf_progs.lock); + return ret; +} + static int cpu_cache_level__sort(const void *a, const void *b) { struct cpu_cache_level *cache_a = (struct cpu_cache_level *)a; @@ -1442,6 +1475,28 @@ static void print_bpf_prog_info(struct feat_fd *ff, FILE *fp) up_read(&env->bpf_progs.lock); } +static void print_bpf_btf(struct feat_fd *ff, FILE *fp) +{ + struct perf_env *env = &ff->ph->env; + struct rb_root *root; + struct rb_node *next; + + down_read(&env->bpf_progs.lock); + + root = &env->bpf_progs.btfs; + next = rb_first(root); + + while (next) { + struct btf_node *node; + + node = rb_entry(next, struct btf_node, rb_node); + next = rb_next(&node->rb_node); + fprintf(fp, "# btf info of id %u\n", node->id); + } + + up_read(&env->bpf_progs.lock); +} + static void free_event_desc(struct perf_evsel *events) { struct perf_evsel *evsel; @@ -2564,6 +2619,49 @@ static int process_bpf_prog_info(struct feat_fd *ff __maybe_unused, void *data _ } #endif // HAVE_LIBBPF_SUPPORT +static int process_bpf_btf(struct feat_fd *ff, void *data __maybe_unused) +{ + struct perf_env *env = &ff->ph->env; + u32 count, i; + + if (ff->ph->needs_swap) { + pr_warning("interpreting btf from systems with endianity is not yet supported\n"); + return 0; + } + + if (do_read_u32(ff, &count)) + return -1; + + down_write(&env->bpf_progs.lock); + + for (i = 0; i < count; ++i) { + struct btf_node *node; + u32 id, data_size; + + if (do_read_u32(ff, &id)) + return -1; + if (do_read_u32(ff, &data_size)) + return -1; + + node = malloc(sizeof(struct btf_node) + data_size); + if (!node) + return -1; + + node->id = id; + node->data_size = data_size; + + if (__do_read(ff, node->data, data_size)) { + free(node); + return -1; + } + + perf_env__insert_btf(env, node); + } + + up_write(&env->bpf_progs.lock); + return 0; +} + struct feature_ops { int (*write)(struct feat_fd *ff, struct perf_evlist *evlist); void (*print)(struct feat_fd *ff, FILE *fp); @@ -2625,7 +2723,8 @@ static const struct feature_ops feat_ops[HEADER_LAST_FEATURE] = { FEAT_OPR(MEM_TOPOLOGY, mem_topology, true), FEAT_OPR(CLOCKID, clockid, false), FEAT_OPN(DIR_FORMAT, dir_format, false), - 
FEAT_OPR(BPF_PROG_INFO, bpf_prog_info, false) + FEAT_OPR(BPF_PROG_INFO, bpf_prog_info, false), + FEAT_OPR(BPF_BTF, bpf_btf, false), }; struct header_print_data { diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 1dc85f0f895d..386da49e1bfa 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -41,6 +41,7 @@ enum { HEADER_CLOCKID, HEADER_DIR_FORMAT, HEADER_BPF_PROG_INFO, + HEADER_BPF_BTF, HEADER_LAST_FEATURE, HEADER_FEAT_BITS = 256, }; From ee7a112fbcc8edb4cf2f84ce5fcc2da7818fd4b8 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 11 Mar 2019 22:30:46 -0700 Subject: [PATCH 692/855] perf top: Add option --no-bpf-event This patch adds option --no-bpf-event to 'perf top', which is the same as the option of 'perf record'. The following patches will use this option. Committer testing: # perf top -vv 2> /tmp/perf_event_attr.out # cat /tmp/perf_event_attr.out ------------------------------------------------------------ perf_event_attr: size 112 { sample_period, sample_freq } 4000 sample_type IP|TID|TIME|CPU|PERIOD read_format ID disabled 1 inherit 1 mmap 1 comm 1 freq 1 task 1 precise_ip 3 sample_id_all 1 exclude_guest 1 mmap2 1 comm_exec 1 ksymbol 1 bpf_event 1 ------------------------------------------------------------ # After this patch: # perf top --no-bpf-event -vv 2> /tmp/perf_event_attr.out # cat /tmp/perf_event_attr.out ------------------------------------------------------------ perf_event_attr: size 112 { sample_period, sample_freq } 4000 sample_type IP|TID|TIME|CPU|PERIOD read_format ID disabled 1 inherit 1 mmap 1 comm 1 freq 1 task 1 precise_ip 3 sample_id_all 1 exclude_guest 1 mmap2 1 comm_exec 1 ksymbol 1 ------------------------------------------------------------ # Signed-off-by: Song Liu Tested-by: Arnaldo Carvalho de Melo Reviewed-by: Jiri Olsa Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stanislav Fomichev Cc: kernel-team@fb.com Link: http://lkml.kernel.org/r/20190312053051.2690567-11-songliubraving@fb.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 77e6190211d2..c2ea22c4ea67 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1469,6 +1469,7 @@ int cmd_top(int argc, const char **argv) "Display raw encoding of assembly instructions (default)"), OPT_BOOLEAN(0, "demangle-kernel", &symbol_conf.demangle_kernel, "Enable kernel symbol demangling"), + OPT_BOOLEAN(0, "no-bpf-event", &top.record_opts.no_bpf_event, "do not record bpf events"), OPT_STRING(0, "objdump", &top.annotation_opts.objdump_path, "path", "objdump binary to use for disassembly and annotations"), OPT_STRING('M', "disassembler-style", &top.annotation_opts.disassembler_style, "disassembler style", From 31be9478ed7f43d6351e0d5a2257ca76609c83d3 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 11 Mar 2019 22:30:47 -0700 Subject: [PATCH 693/855] perf feature detection: Add -lopcodes to feature-libbfd Both libbfd and libopcodes are distributed with binutil-dev/devel. When libbfd is present, it is OK to assume that libopcodes also present. This has been a safe assumption for bpftool. This patch adds -lopcodes to perf/Makefile.config. libopcodes will be used in the next commit for BPF annotation. 
Signed-off-by: Song Liu Reviewed-by: Jiri Olsa Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stanislav Fomichev Cc: kernel-team@fb.com Link: http://lkml.kernel.org/r/20190312053051.2690567-12-songliubraving@fb.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.config | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 0f11d5891301..df4ad45599ca 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -713,7 +713,7 @@ else endif ifeq ($(feature-libbfd), 1) - EXTLIBS += -lbfd + EXTLIBS += -lbfd -lopcodes else # we are on a system that requires -liberty and (maybe) -lz # to link against -lbfd; test each case individually here @@ -724,10 +724,10 @@ else $(call feature_check,libbfd-liberty-z) ifeq ($(feature-libbfd-liberty), 1) - EXTLIBS += -lbfd -liberty + EXTLIBS += -lbfd -lopcodes -liberty else ifeq ($(feature-libbfd-liberty-z), 1) - EXTLIBS += -lbfd -liberty -lz + EXTLIBS += -lbfd -lopcodes -liberty -lz endif endif endif From 9b86d04d53b98399017fea44e9047165ffe12d42 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 11 Mar 2019 22:30:48 -0700 Subject: [PATCH 694/855] perf symbols: Introduce DSO_BINARY_TYPE__BPF_PROG_INFO Introduce a new dso type DSO_BINARY_TYPE__BPF_PROG_INFO for BPF programs. In symbol__disassemble(), DSO_BINARY_TYPE__BPF_PROG_INFO dso will call into a new function symbol__disassemble_bpf() in an upcoming patch, where annotation line information is filled based bpf_prog_info and btf saved in given perf_env. Committer notes: Removed the unnamed union with 'bpf_prog' and 'cache' in 'struct dso', to fix this bug when exiting 'perf top': # perf top perf: Segmentation fault -------- backtrace -------- perf[0x5a785a] /lib64/libc.so.6(+0x385bf)[0x7fd68443c5bf] perf(rb_first+0x2b)[0x4d6eeb] perf(dso__delete+0xb7)[0x4dffb7] perf[0x4f9e37] perf(perf_session__delete+0x64)[0x504df4] perf(cmd_top+0x1957)[0x454467] perf[0x4aad18] perf(main+0x61c)[0x42ec7c] /lib64/libc.so.6(__libc_start_main+0xf2)[0x7fd684428412] perf(_start+0x2d)[0x42eead] # # addr2line -fe ~/bin/perf 0x4dffb7 dso_cache__free /home/acme/git/perf/tools/perf/util/dso.c:713 That is trying to access the dso->data.cache, and that is not used with BPF programs, so we end up accessing what is in bpf_prog.first_member, b00m. 
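A self-contained sketch of that failure mode, using stand-in types rather than the real 'struct dso': when two unrelated views share one union and a teardown path always walks the rb_root view, a BPF-tagged object hands that path a bogus tree pointer.

#include <stdio.h>

struct rb_root_stub { void *rb_node; };

struct dso_stub {
	union {
		struct { struct rb_root_stub cache; } data;			/* regular DSOs */
		struct { unsigned int id, sub_id; void *env; } bpf_prog;	/* BPF DSOs */
	};
};

int main(void)
{
	struct dso_stub d = { .bpf_prog = { .id = 13, .sub_id = 0, .env = (void *)&d } };

	/*
	 * A delete path that unconditionally uses the 'data' view reads back
	 * whatever bpf_prog stored in the same bytes and then tries to walk
	 * it as a tree, which is the crash shown in the backtrace above.
	 */
	printf("bogus rb_node: %p\n", d.data.cache.rb_node);
	return 0;
}

Keeping 'bpf_prog' outside the union, as the dso.h hunk below does, removes the overlap entirely.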
Signed-off-by: Song Liu Reviewed-by: Jiri Olsa Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stanislav Fomichev Cc: kernel-team@fb.com Link: http://lkml.kernel.org/r/20190312053051.2690567-13-songliubraving@fb.com [ split from a larger patch ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dso.c | 1 + tools/perf/util/dso.h | 8 ++++++++ tools/perf/util/symbol.c | 1 + 3 files changed, 10 insertions(+) diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index ab8a455d2283..e059976d9d93 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -184,6 +184,7 @@ int dso__read_binary_type_filename(const struct dso *dso, case DSO_BINARY_TYPE__KALLSYMS: case DSO_BINARY_TYPE__GUEST_KALLSYMS: case DSO_BINARY_TYPE__JAVA_JIT: + case DSO_BINARY_TYPE__BPF_PROG_INFO: case DSO_BINARY_TYPE__NOT_FOUND: ret = -1; break; diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h index bb417c54c25a..6e3f63781e51 100644 --- a/tools/perf/util/dso.h +++ b/tools/perf/util/dso.h @@ -14,6 +14,7 @@ struct machine; struct map; +struct perf_env; enum dso_binary_type { DSO_BINARY_TYPE__KALLSYMS = 0, @@ -35,6 +36,7 @@ enum dso_binary_type { DSO_BINARY_TYPE__KCORE, DSO_BINARY_TYPE__GUEST_KCORE, DSO_BINARY_TYPE__OPENEMBEDDED_DEBUGINFO, + DSO_BINARY_TYPE__BPF_PROG_INFO, DSO_BINARY_TYPE__NOT_FOUND, }; @@ -189,6 +191,12 @@ struct dso { u64 debug_frame_offset; u64 eh_frame_hdr_offset; } data; + /* bpf prog information */ + struct { + u32 id; + u32 sub_id; + struct perf_env *env; + } bpf_prog; union { /* Tool specific area */ void *priv; diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 58442ca5e3c4..5cbad55cd99d 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -1455,6 +1455,7 @@ static bool dso__is_compatible_symtab_type(struct dso *dso, bool kmod, case DSO_BINARY_TYPE__BUILD_ID_CACHE_DEBUGINFO: return true; + case DSO_BINARY_TYPE__BPF_PROG_INFO: case DSO_BINARY_TYPE__NOT_FOUND: default: return false; From 3ca3877a9732b68cf0289367a859f6c163a79bfa Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 11 Mar 2019 22:30:49 -0700 Subject: [PATCH 695/855] perf bpf: Process PERF_BPF_EVENT_PROG_LOAD for annotation This patch adds processing of PERF_BPF_EVENT_PROG_LOAD, which sets proper DSO type/id/etc of memory regions mapped to BPF programs to DSO_BINARY_TYPE__BPF_PROG_INFO. 
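This tagging is what a later annotation path can key off: branch on the dso binary type and pull the saved program info back out of perf_env. A rough fragment of that consumer side (it assumes perf's internal headers; only the dso fields and perf_env__find_bpf_prog_info() come from this series, and annotate_from_elf() is a made-up placeholder for the existing non-BPF path):

static int annotate_prog(struct dso *dso)
{
	if (dso->binary_type == DSO_BINARY_TYPE__BPF_PROG_INFO) {
		struct bpf_prog_info_node *node;

		node = perf_env__find_bpf_prog_info(dso->bpf_prog.env,
						    dso->bpf_prog.id);
		if (!node)
			return -1;
		/* disassemble node->info_linear->info.jited_prog_insns here */
		return 0;
	}

	return annotate_from_elf(dso);	/* placeholder for the ELF path */
}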
Signed-off-by: Song Liu Reviewed-by: Jiri Olsa Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stanislav Fomichev Cc: kernel-team@fb.com Link: http://lkml.kernel.org/r/20190312053051.2690567-14-songliubraving@fb.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf-event.c | 54 +++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c index a4fc52b4ffae..852e960692cb 100644 --- a/tools/perf/util/bpf-event.c +++ b/tools/perf/util/bpf-event.c @@ -12,6 +12,7 @@ #include "machine.h" #include "env.h" #include "session.h" +#include "map.h" #define ptr_to_u64(ptr) ((__u64)(unsigned long)(ptr)) @@ -25,12 +26,65 @@ static int snprintf_hex(char *buf, size_t size, unsigned char *data, size_t len) return ret; } +static int machine__process_bpf_event_load(struct machine *machine, + union perf_event *event, + struct perf_sample *sample __maybe_unused) +{ + struct bpf_prog_info_linear *info_linear; + struct bpf_prog_info_node *info_node; + struct perf_env *env = machine->env; + int id = event->bpf_event.id; + unsigned int i; + + /* perf-record, no need to handle bpf-event */ + if (env == NULL) + return 0; + + info_node = perf_env__find_bpf_prog_info(env, id); + if (!info_node) + return 0; + info_linear = info_node->info_linear; + + for (i = 0; i < info_linear->info.nr_jited_ksyms; i++) { + u64 *addrs = (u64 *)(info_linear->info.jited_ksyms); + u64 addr = addrs[i]; + struct map *map; + + map = map_groups__find(&machine->kmaps, addr); + + if (map) { + map->dso->binary_type = DSO_BINARY_TYPE__BPF_PROG_INFO; + map->dso->bpf_prog.id = id; + map->dso->bpf_prog.sub_id = i; + map->dso->bpf_prog.env = env; + } + } + return 0; +} + int machine__process_bpf_event(struct machine *machine __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused) { if (dump_trace) perf_event__fprintf_bpf_event(event, stdout); + + switch (event->bpf_event.type) { + case PERF_BPF_EVENT_PROG_LOAD: + return machine__process_bpf_event_load(machine, event, sample); + + case PERF_BPF_EVENT_PROG_UNLOAD: + /* + * Do not free bpf_prog_info and btf of the program here, + * as annotation still need them. They will be freed at + * the end of the session. + */ + break; + default: + pr_debug("unexpected bpf_event type of %d\n", + event->bpf_event.type); + break; + } return 0; } From 7442c483b963dbee7d1b655cbad99c727c047828 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 18 Mar 2019 17:35:11 +0100 Subject: [PATCH 696/855] mlxsw: core: mlxsw: core: avoid -Wint-in-bool-context warning A recently added function in mlxsw triggers a harmless compiler warning: In file included from drivers/net/ethernet/mellanox/mlxsw/core.h:17, from drivers/net/ethernet/mellanox/mlxsw/core_env.c:7: drivers/net/ethernet/mellanox/mlxsw/core_env.c: In function 'mlxsw_env_module_temp_thresholds_get': drivers/net/ethernet/mellanox/mlxsw/reg.h:8015:45: error: '*' in boolean context, suggest '&&' instead [-Werror=int-in-bool-context] #define MLXSW_REG_MTMP_TEMP_TO_MC(val) (val * 125) ~~~~~^~~~~~ drivers/net/ethernet/mellanox/mlxsw/core_env.c:116:8: note: in expansion of macro 'MLXSW_REG_MTMP_TEMP_TO_MC' if (!MLXSW_REG_MTMP_TEMP_TO_MC(module_temp)) { ^~~~~~~~~~~~~~~~~~~~~~~~~ The warning is normally disabled, but it would be nice to enable it to find real bugs, and there are no other known instances at the moment. Replace the negation with a zero-comparison, which also matches the comment above it. 
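For context, the warning class can be reproduced in isolation with a macro of the same shape as MLXSW_REG_MTMP_TEMP_TO_MC; nothing below is mlxsw code, it only mirrors the construct from the build log above:

#include <stdio.h>

#define TEMP_TO_MC(val) (val * 125)	/* stand-in for the reg.h macro */

static int is_zero_warns(int t)
{
	return !TEMP_TO_MC(t);		/* gcc -Wint-in-bool-context fires here */
}

static int is_zero_clean(int t)
{
	return TEMP_TO_MC(t) == 0;	/* explicit comparison, no warning */
}

int main(void)
{
	printf("%d %d\n", is_zero_warns(0), is_zero_clean(0));
	return 0;
}

The compiler cannot tell whether a multiplication fed straight into a truth test is intentional or a mistyped '&&', so spelling out the comparison both silences the warning and documents the intent.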
Fixes: d93c19a1d95c ("mlxsw: core: Add API for QSFP module temperature thresholds reading") Signed-off-by: Arnd Bergmann Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/core_env.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_env.c b/drivers/net/ethernet/mellanox/mlxsw/core_env.c index 7a15e932ed2f..c1c1965d7acc 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core_env.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core_env.c @@ -113,7 +113,7 @@ int mlxsw_env_module_temp_thresholds_get(struct mlxsw_core *core, int module, return 0; default: /* Do not consider thresholds for zero temperature. */ - if (!MLXSW_REG_MTMP_TEMP_TO_MC(module_temp)) { + if (MLXSW_REG_MTMP_TEMP_TO_MC(module_temp) == 0) { *temp = 0; return 0; } From 223a960c01227e4dbcb6f9fa06b47d73bda21274 Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Mon, 18 Mar 2019 23:36:08 +0200 Subject: [PATCH 697/855] net: stmmac: fix memory corruption with large MTUs When using 16K DMA buffers and ring mode, the DES3 refill is not working correctly as the function is using a bogus pointer for checking the private data. As a result stale pointers will remain in the RX descriptor ring, so DMA will now likely overwrite/corrupt some already freed memory. As simple reproducer, just receive some UDP traffic: # ifconfig eth0 down; ifconfig eth0 mtu 9000; ifconfig eth0 up # iperf3 -c 192.168.253.40 -u -b 0 -R If you didn't crash by now check the RX descriptors to find non-contiguous RX buffers: cat /sys/kernel/debug/stmmaceth/eth0/descriptors_status [...] 1 [0x2be5020]: 0xa3220321 0x9ffc1ffc 0x72d70082 0x130e207e ^^^^^^^^^^^^^^^^^^^^^ 2 [0x2be5040]: 0xa3220321 0x9ffc1ffc 0x72998082 0x1311a07e ^^^^^^^^^^^^^^^^^^^^^ A simple ping test will now report bad data: # ping -s 8200 192.168.253.40 PING 192.168.253.40 (192.168.253.40) 8200(8228) bytes of data. 8208 bytes from 192.168.253.40: icmp_seq=1 ttl=64 time=1.00 ms wrong data byte #8144 should be 0xd0 but was 0x88 Fix the wrong pointer. Also we must refill DES3 only if the DMA buffer size is 16K. Fixes: 54139cf3bb33 ("net: stmmac: adding multiple buffers for rx") Signed-off-by: Aaro Koskinen Acked-by: Jose Abreu Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/ring_mode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/ring_mode.c b/drivers/net/ethernet/stmicro/stmmac/ring_mode.c index f936166d8910..4d9bcb4d0378 100644 --- a/drivers/net/ethernet/stmicro/stmmac/ring_mode.c +++ b/drivers/net/ethernet/stmicro/stmmac/ring_mode.c @@ -113,10 +113,11 @@ static unsigned int is_jumbo_frm(int len, int enh_desc) static void refill_desc3(void *priv_ptr, struct dma_desc *p) { - struct stmmac_priv *priv = (struct stmmac_priv *)priv_ptr; + struct stmmac_rx_queue *rx_q = priv_ptr; + struct stmmac_priv *priv = rx_q->priv_data; /* Fill DES3 in case of RING mode */ - if (priv->dma_buf_sz >= BUF_SIZE_8KiB) + if (priv->dma_buf_sz == BUF_SIZE_16KiB) p->des3 = cpu_to_le32(le32_to_cpu(p->des2) + BUF_SIZE_8KiB); } From a3e23f719f5c4a38ffb3d30c8d7632a4ed8ccd9e Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 19 Mar 2019 10:16:53 +0800 Subject: [PATCH 698/855] net-sysfs: call dev_hold if kobject_init_and_add success In netdev_queue_add_kobject and rx_queue_add_kobject, if sysfs_create_group failed, kobject_put will call netdev_queue_release to decrease dev refcont, however dev_hold has not be called. 
So we will see this while unregistering dev: unregister_netdevice: waiting for bcsh0 to become free. Usage count = -1 Reported-by: Hulk Robot Fixes: d0d668371679 ("net: don't decrement kobj reference count on init failure") Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 4ff661f6f989..8f8b7b6c2945 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -928,6 +928,8 @@ static int rx_queue_add_kobject(struct net_device *dev, int index) if (error) return error; + dev_hold(queue->dev); + if (dev->sysfs_rx_queue_group) { error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group); if (error) { @@ -937,7 +939,6 @@ static int rx_queue_add_kobject(struct net_device *dev, int index) } kobject_uevent(kobj, KOBJ_ADD); - dev_hold(queue->dev); return error; } @@ -1464,6 +1465,8 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index) if (error) return error; + dev_hold(queue->dev); + #ifdef CONFIG_BQL error = sysfs_create_group(kobj, &dql_group); if (error) { @@ -1473,7 +1476,6 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index) #endif kobject_uevent(kobj, KOBJ_ADD); - dev_hold(queue->dev); return 0; } From d7737d4257459ca8921ff911c88937be1a11ea9d Mon Sep 17 00:00:00 2001 From: Aditya Pakki Date: Mon, 18 Mar 2019 22:19:44 -0500 Subject: [PATCH 699/855] nfc: Fix to check for kmemdup failure In case of kmemdup failure while setting the service name the patch returns -ENOMEM upstream for processing. Signed-off-by: Aditya Pakki Signed-off-by: David S. Miller --- net/nfc/llcp_sock.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index ae296273ce3d..17dcd0b5eb32 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -726,6 +726,10 @@ static int llcp_sock_connect(struct socket *sock, struct sockaddr *_addr, llcp_sock->service_name = kmemdup(addr->service_name, llcp_sock->service_name_len, GFP_KERNEL); + if (!llcp_sock->service_name) { + ret = -ENOMEM; + goto sock_llcp_release; + } nfc_llcp_sock_link(&local->connecting_sockets, sk); @@ -745,10 +749,11 @@ static int llcp_sock_connect(struct socket *sock, struct sockaddr *_addr, return ret; sock_unlink: - nfc_llcp_put_ssap(local, llcp_sock->ssap); - nfc_llcp_sock_unlink(&local->connecting_sockets, sk); +sock_llcp_release: + nfc_llcp_put_ssap(local, llcp_sock->ssap); + put_dev: nfc_put_device(dev); From 89e4130939a20304f4059ab72179da81f5347528 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Mar 2019 05:45:35 -0700 Subject: [PATCH 700/855] tcp: do not use ipv6 header for ipv4 flow When a dual stack tcp listener accepts an ipv4 flow, it should not attempt to use an ipv6 header or tcp_v6_iif() helper. Fixes: 1397ed35f22d ("ipv6: add flowinfo for tcp6 pkt_options for all cases") Fixes: df3687ffc665 ("ipv6: add the IPV6_FL_F_REFLECT flag to IPV6_FL_A_GET") Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv6/tcp_ipv6.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 57ef69a10889..44d431849d39 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1110,11 +1110,11 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * newnp->ipv6_fl_list = NULL; newnp->pktoptions = NULL; newnp->opt = NULL; - newnp->mcast_oif = tcp_v6_iif(skb); - newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; - newnp->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(skb)); + newnp->mcast_oif = inet_iif(skb); + newnp->mcast_hops = ip_hdr(skb)->ttl; + newnp->rcv_flowinfo = 0; if (np->repflow) - newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb)); + newnp->flow_label = 0; /* * No need to charge this sock to the relevant IPv6 refcnt debug socks count From e0aa67709f89d08c8d8e5bdd9e0b649df61d0090 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Mar 2019 05:46:18 -0700 Subject: [PATCH 701/855] dccp: do not use ipv6 header for ipv4 flow When a dual stack dccp listener accepts an ipv4 flow, it should not attempt to use an ipv6 header or inet6_iif() helper. Fixes: 3df80d9320bc ("[DCCP]: Introduce DCCPv6") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/dccp/ipv6.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index d5740bad5b18..57d84e9b7b6f 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -436,8 +436,8 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, newnp->ipv6_mc_list = NULL; newnp->ipv6_ac_list = NULL; newnp->ipv6_fl_list = NULL; - newnp->mcast_oif = inet6_iif(skb); - newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; + newnp->mcast_oif = inet_iif(skb); + newnp->mcast_hops = ip_hdr(skb)->ttl; /* * No need to charge this sock to the relevant IPv6 refcnt debug socks count From 54e3aca84e571559915998aa6cc05e5ac37c043b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 18 Mar 2019 21:47:09 +0300 Subject: [PATCH 702/855] ACPI / utils: Drop reference in test for device presence When commit 8661423eea1a ("ACPI / utils: Add new acpi_dev_present helper") introduced acpi_dev_present(), it missed the fact that bus_find_device() took a reference on the device found by it and the callers of acpi_dev_present() don't drop that reference. Drop the reference on the device in acpi_dev_present(). Fixes: 8661423eea1a ("ACPI / utils: Add new acpi_dev_present helper") Signed-off-by: Andy Shevchenko Reviewed-by: Hans de Goede Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/utils.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c index 78db97687f26..c4b06cc075f9 100644 --- a/drivers/acpi/utils.c +++ b/drivers/acpi/utils.c @@ -800,6 +800,7 @@ bool acpi_dev_present(const char *hid, const char *uid, s64 hrv) match.hrv = hrv; dev = bus_find_device(&acpi_bus_type, NULL, &match, acpi_dev_match_cb); + put_device(dev); return !!dev; } EXPORT_SYMBOL(acpi_dev_present); From 2071ac985d37efe496782c34318dbead93beb02f Mon Sep 17 00:00:00 2001 From: Jiada Wang Date: Tue, 12 Mar 2019 15:51:28 +0900 Subject: [PATCH 703/855] PM / Domains: Avoid a potential deadlock Lockdep warns that prepare_lock and genpd->mlock can cause a deadlock the deadlock scenario is like following: First thread is probing cs2000 cs2000_probe() clk_register() __clk_core_init() clk_prepare_lock() ----> acquires prepare_lock cs2000_recalc_rate() i2c_smbus_read_byte_data() rcar_i2c_master_xfer() dma_request_chan() rcar_dmac_of_xlate() rcar_dmac_alloc_chan_resources() pm_runtime_get_sync() __pm_runtime_resume() rpm_resume() rpm_callback() genpd_runtime_resume() ----> acquires genpd->mlock Second thread is attaching any device to the same PM domain genpd_add_device() genpd_lock() ----> acquires genpd->mlock cpg_mssr_attach_dev() of_clk_get_from_provider() __of_clk_get_from_provider() __clk_create_clk() clk_prepare_lock() ----> acquires prepare_lock Since currently no PM provider access genpd's critical section in .attach_dev, and .detach_dev callbacks, so there is no need to protect these two callbacks with genpd->mlock. This patch avoids a potential deadlock by moving out .attach_dev and .detach_dev from genpd->mlock, so that genpd->mlock won't be held when prepare_lock is acquired in .attach_dev and .detach_dev Signed-off-by: Jiada Wang Reviewed-by: Ulf Hansson Tested-by: Geert Uytterhoeven Reviewed-by: Geert Uytterhoeven Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 76c9969b7124..96a6dc9d305c 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -1469,12 +1469,12 @@ static int genpd_add_device(struct generic_pm_domain *genpd, struct device *dev, if (IS_ERR(gpd_data)) return PTR_ERR(gpd_data); - genpd_lock(genpd); - ret = genpd->attach_dev ? 
genpd->attach_dev(genpd, dev) : 0; if (ret) goto out; + genpd_lock(genpd); + dev_pm_domain_set(dev, &genpd->domain); genpd->device_count++; @@ -1482,9 +1482,8 @@ static int genpd_add_device(struct generic_pm_domain *genpd, struct device *dev, list_add_tail(&gpd_data->base.list_node, &genpd->dev_list); - out: genpd_unlock(genpd); - + out: if (ret) genpd_free_dev_data(dev, gpd_data); else @@ -1533,15 +1532,15 @@ static int genpd_remove_device(struct generic_pm_domain *genpd, genpd->device_count--; genpd->max_off_time_changed = true; - if (genpd->detach_dev) - genpd->detach_dev(genpd, dev); - dev_pm_domain_set(dev, NULL); list_del_init(&pdd->list_node); genpd_unlock(genpd); + if (genpd->detach_dev) + genpd->detach_dev(genpd, dev); + genpd_free_dev_data(dev, gpd_data); return 0; From fb6fafbc7de4a813bb5364358bbe27f71e62b24a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 19 Mar 2019 22:15:58 +0100 Subject: [PATCH 704/855] 3c515: fix integer overflow warning clang points out a harmless signed integer overflow: drivers/net/ethernet/3com/3c515.c:1530:66: error: implicit conversion from 'int' to 'short' changes value from 32783 to -32753 [-Werror,-Wconstant-conversion] new_mode = SetRxFilter | RxStation | RxMulticast | RxBroadcast | RxProm; ~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~ drivers/net/ethernet/3com/3c515.c:1532:52: error: implicit conversion from 'int' to 'short' changes value from 32775 to -32761 [-Werror,-Wconstant-conversion] new_mode = SetRxFilter | RxStation | RxMulticast | RxBroadcast; ~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~ drivers/net/ethernet/3com/3c515.c:1534:38: error: implicit conversion from 'int' to 'short' changes value from 32773 to -32763 [-Werror,-Wconstant-conversion] new_mode = SetRxFilter | RxStation | RxBroadcast; ~ ~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~ Make the variable unsigned to avoid the overflow. Fixes: Linux-2.1.128pre1 Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- drivers/net/ethernet/3com/3c515.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/3com/3c515.c b/drivers/net/ethernet/3com/3c515.c index 808abb6b3671..b15752267c8d 100644 --- a/drivers/net/ethernet/3com/3c515.c +++ b/drivers/net/ethernet/3com/3c515.c @@ -1521,7 +1521,7 @@ static void update_stats(int ioaddr, struct net_device *dev) static void set_rx_mode(struct net_device *dev) { int ioaddr = dev->base_addr; - short new_mode; + unsigned short new_mode; if (dev->flags & IFF_PROMISC) { if (corkscrew_debug > 3) From f84532ce5887dac2ef67498b897a8713793eebde Mon Sep 17 00:00:00 2001 From: Vinay K Nallamothu Date: Tue, 19 Mar 2019 22:41:18 +0000 Subject: [PATCH 705/855] mpls: Fix 6PE forwarding This patch adds support for 6PE (RFC 4798) which uses IPv4-mapped IPv6 nexthop to connect IPv6 islands over IPv4 only MPLS network core. Prior to this fix, to find the link-layer destination mac address, 6PE enabled host/router was sending IPv6 ND requests for IPv4-mapped IPv6 nexthop address over the interface facing the IPv4 only core which wouldn't success as the core is IPv6 free. This fix changes that behavior on 6PE host to treat the nexthop as IPv4 address and send ARP requests whenever the next-hop address is an IPv4-mapped IPv6 address. Below topology illustrates the issue and how the patch addresses it. 
abcd::1.1.1.1 (lo) abcd::2.2.2.2 (lo) R0 (PE/host)------------------------R1--------------------------------R2 (PE/host) <--- IPv4 MPLS core ---> <------ IPv4 MPLS core --------> eth1 eth2 eth3 eth4 172.18.0.10 172.18.0.11 172.19.0.11 172.19.0.12 ffff::172.18.0.10 ffff::172.19.0.12 <------------------IPv6 MPLS tunnel ----------------------> R0 and R2 act as 6PE routers of IPv6 islands. R1 is IPv4 only with MPLS tunnels between R0,R1 and R1,R2. docker exec r0 ip -f inet6 route add abcd::2.2.2.2/128 nexthop encap mpls 100 via ::ffff:172.18.0.11 dev eth1 docker exec r2 ip -f inet6 route add abcd::1.1.1.1/128 nexthop encap mpls 200 via ::ffff:172.19.0.11 dev eth4 docker exec r1 ip -f mpls route add 100 via inet 172.19.0.12 dev eth3 docker exec r1 ip -f mpls route add 200 via inet 172.18.0.10 dev eth2 With the change, when R0 sends an IPv6 packet over MPLS tunnel to abcd::2.2.2.2, using ::ffff:172.18.0.11 as the nexthop, it does neighbor discovery for 172.18.18.0.11. Signed-off-by: Vinay K Nallamothu Tested-by: Avinash Lingala Tested-by: Aravind Srinivas Srinivasa Prabhakar Signed-off-by: David S. Miller --- net/mpls/mpls_iptunnel.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index dda8930f20e7..f3a8557494d6 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -140,9 +140,15 @@ static int mpls_xmit(struct sk_buff *skb) if (rt) err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gateway, skb); - else if (rt6) - err = neigh_xmit(NEIGH_ND_TABLE, out_dev, &rt6->rt6i_gateway, - skb); + else if (rt6) { + if (ipv6_addr_v4mapped(&rt6->rt6i_gateway)) { + /* 6PE (RFC 4798) */ + err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt6->rt6i_gateway.s6_addr32[3], + skb); + } else + err = neigh_xmit(NEIGH_ND_TABLE, out_dev, &rt6->rt6i_gateway, + skb); + } if (err) net_dbg_ratelimited("%s: packet transmission failed: %d\n", __func__, err); From 4b9ce3a651a37c60527101db4451a315a8b9588f Mon Sep 17 00:00:00 2001 From: Deepak Rawat Date: Thu, 28 Feb 2019 10:29:54 -0800 Subject: [PATCH 706/855] drm/vmwgfx: Return 0 when gmrid::get_node runs out of ID's If it's not a system error and get_node implementation accommodate the buffer object then it should return 0 with memm::mm_node set to NULL. v2: Test for id != -ENOMEM instead of id == -ENOSPC. Cc: Fixes: 4eb085e42fde ("drm/vmwgfx: Convert to new IDA API") Signed-off-by: Deepak Rawat Reviewed-by: Thomas Hellstrom Signed-off-by: Thomas Hellstrom --- drivers/gpu/drm/vmwgfx/vmwgfx_gmrid_manager.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_gmrid_manager.c b/drivers/gpu/drm/vmwgfx/vmwgfx_gmrid_manager.c index b93c558dd86e..7da752ca1c34 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_gmrid_manager.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_gmrid_manager.c @@ -57,7 +57,7 @@ static int vmw_gmrid_man_get_node(struct ttm_mem_type_manager *man, id = ida_alloc_max(&gman->gmr_ida, gman->max_gmr_ids - 1, GFP_KERNEL); if (id < 0) - return id; + return (id != -ENOMEM ? 0 : id); spin_lock(&gman->lock); From c2d311553855395764e2e5bf401d987ba65c2056 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 18 Mar 2019 15:47:58 +0100 Subject: [PATCH 707/855] drm/vmwgfx: Don't double-free the mode stored in par->set_mode When calling vmw_fb_set_par(), the mode stored in par->set_mode gets free'd twice. The first free is in vmw_fb_kms_detach(), the second is near the end of vmw_fb_set_par() under the name of 'old_mode'. 
The mode-setting code only works correctly if the mode doesn't actually change. Removing 'old_mode' in favor of using par->set_mode directly fixes the problem. Cc: Fixes: a278724aa23c ("drm/vmwgfx: Implement fbdev on kms v2") Signed-off-by: Thomas Zimmermann Reviewed-by: Deepak Rawat Signed-off-by: Thomas Hellstrom --- drivers/gpu/drm/vmwgfx/vmwgfx_fb.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fb.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fb.c index b913a56f3426..2a9112515f46 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_fb.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fb.c @@ -564,11 +564,9 @@ static int vmw_fb_set_par(struct fb_info *info) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_PVSYNC) }; - struct drm_display_mode *old_mode; struct drm_display_mode *mode; int ret; - old_mode = par->set_mode; mode = drm_mode_duplicate(vmw_priv->dev, &new_mode); if (!mode) { DRM_ERROR("Could not create new fb mode.\n"); @@ -579,11 +577,7 @@ static int vmw_fb_set_par(struct fb_info *info) mode->vdisplay = var->yres; vmw_guess_mode_timing(mode); - if (old_mode && drm_mode_equal(old_mode, mode)) { - drm_mode_destroy(vmw_priv->dev, mode); - mode = old_mode; - old_mode = NULL; - } else if (!vmw_kms_validate_mode_vram(vmw_priv, + if (!vmw_kms_validate_mode_vram(vmw_priv, mode->hdisplay * DIV_ROUND_UP(var->bits_per_pixel, 8), mode->vdisplay)) { @@ -620,8 +614,8 @@ static int vmw_fb_set_par(struct fb_info *info) schedule_delayed_work(&par->local_work, 0); out_unlock: - if (old_mode) - drm_mode_destroy(vmw_priv->dev, old_mode); + if (par->set_mode) + drm_mode_destroy(vmw_priv->dev, par->set_mode); par->set_mode = mode; mutex_unlock(&par->bo_mutex); From b25a31bf0ca091aa8bdb9ab329b0226257568bbe Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Tue, 19 Mar 2019 13:22:41 +0900 Subject: [PATCH 708/855] netfilter: nf_tables: add missing ->release_ops() in error path of newrule() ->release_ops() callback releases resources and this is used in error path. If nf_tables_newrule() fails after ->select_ops(), it should release resources. but it can not call ->destroy() because that should be called after ->init(). At this point, ->release_ops() should be used for releasing resources. Test commands: modprobe -rv xt_tcpudp iptables-nft -I INPUT -m tcp <-- error command lsmod Result: Module Size Used by xt_tcpudp 20480 2 <-- it should be 0 Fixes: b8e204006340 ("netfilter: nft_compat: use .release_ops and remove list of extension") Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 513f93118604..ef7772e976cc 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2806,8 +2806,11 @@ err2: nf_tables_rule_release(&ctx, rule); err1: for (i = 0; i < n; i++) { - if (info[i].ops != NULL) + if (info[i].ops) { module_put(info[i].ops->type->owner); + if (info[i].ops->type->release_ops) + info[i].ops->type->release_ops(info[i].ops); + } } kvfree(info); return err; From 8bc086899816214fbc6047c9c7e15fcab49552bf Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sun, 17 Mar 2019 01:17:56 +0000 Subject: [PATCH 709/855] powerpc/mm: Only define MAX_PHYSMEM_BITS in SPARSEMEM configurations MAX_PHYSMEM_BITS only needs to be defined if CONFIG_SPARSEMEM is enabled, and that was the case before commit 4ffe713b7587 ("powerpc/mm: Increase the max addressable memory to 2PB"). 
On 32-bit systems, where CONFIG_SPARSEMEM is not enabled, we now define it as 46. That is larger than the real number of physical address bits, and breaks calculations in zsmalloc: mm/zsmalloc.c:130:49: warning: right shift count is negative MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) ^~ ... mm/zsmalloc.c:253:21: error: variably modified 'size_class' at file scope struct size_class *size_class[ZS_SIZE_CLASSES]; ^~~~~~~~~~ Fixes: 4ffe713b7587 ("powerpc/mm: Increase the max addressable memory to 2PB") Cc: stable@vger.kernel.org # v4.20+ Signed-off-by: Ben Hutchings Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/mmu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index d34ad1657d7b..598cdcdd1355 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -352,7 +352,7 @@ static inline bool strict_kernel_rwx_enabled(void) #if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_SPARSEMEM_EXTREME) && \ defined (CONFIG_PPC_64K_PAGES) #define MAX_PHYSMEM_BITS 51 -#else +#elif defined(CONFIG_SPARSEMEM) #define MAX_PHYSMEM_BITS 46 #endif From bb229bbb3bf63d23128e851a1f3b85c083178fa1 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 20 Mar 2019 09:46:58 +0100 Subject: [PATCH 710/855] libceph: wait for latest osdmap in ceph_monc_blacklist_add() Because map updates are distributed lazily, an OSD may not know about the new blacklist for quite some time after "osd blacklist add" command is completed. This makes it possible for a blacklisted but still alive client to overwrite a post-blacklist update, resulting in data corruption. Waiting for latest osdmap in ceph_monc_blacklist_add() and thus using the post-blacklist epoch for all post-blacklist requests ensures that all such requests "wait" for the blacklist to come into force on their respective OSDs. 
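The exported helper is meant to be called right after any operation whose effect is published through the osdmap; a condensed usage sketch (it mirrors the rbd conversion in the following patch, so only calls present in libceph after this series are used):

	ret = ceph_wait_for_latest_osdmap(client, client->options->mount_timeout);
	if (ret)
		pr_warn("failed to get latest osdmap: %d\n", ret);

Internally the helper asks the monitors for the newest "osdmap" version, returns immediately if the local epoch is already at least that new, and otherwise requests the map and waits for it, as the ceph_common.c hunk below shows.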
Cc: stable@vger.kernel.org Fixes: 6305a3b41515 ("libceph: support for blacklisting clients") Signed-off-by: Ilya Dryomov Reviewed-by: Jason Dillaman --- include/linux/ceph/libceph.h | 2 ++ net/ceph/ceph_common.c | 18 +++++++++++++++++- net/ceph/mon_client.c | 9 +++++++++ 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index a420c07904bc..337d5049ff93 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -294,6 +294,8 @@ extern void ceph_destroy_client(struct ceph_client *client); extern int __ceph_open_session(struct ceph_client *client, unsigned long started); extern int ceph_open_session(struct ceph_client *client); +int ceph_wait_for_latest_osdmap(struct ceph_client *client, + unsigned long timeout); /* pagevec.c */ extern void ceph_release_page_vector(struct page **pages, int num_pages); diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 9cab80207ced..79eac465ec65 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -738,7 +738,6 @@ int __ceph_open_session(struct ceph_client *client, unsigned long started) } EXPORT_SYMBOL(__ceph_open_session); - int ceph_open_session(struct ceph_client *client) { int ret; @@ -754,6 +753,23 @@ int ceph_open_session(struct ceph_client *client) } EXPORT_SYMBOL(ceph_open_session); +int ceph_wait_for_latest_osdmap(struct ceph_client *client, + unsigned long timeout) +{ + u64 newest_epoch; + int ret; + + ret = ceph_monc_get_version(&client->monc, "osdmap", &newest_epoch); + if (ret) + return ret; + + if (client->osdc.osdmap->epoch >= newest_epoch) + return 0; + + ceph_osdc_maybe_request_map(&client->osdc); + return ceph_monc_wait_osdmap(&client->monc, newest_epoch, timeout); +} +EXPORT_SYMBOL(ceph_wait_for_latest_osdmap); static int __init init_ceph_lib(void) { diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 18deb3d889c4..a53e4fbb6319 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -922,6 +922,15 @@ int ceph_monc_blacklist_add(struct ceph_mon_client *monc, mutex_unlock(&monc->mutex); ret = wait_generic_request(req); + if (!ret) + /* + * Make sure we have the osdmap that includes the blacklist + * entry. This is needed to ensure that the OSDs pick up the + * new blacklist before processing any future requests from + * this client. 
+ */ + ret = ceph_wait_for_latest_osdmap(monc->client, 0); + out: put_generic_request(req); return ret; From 9d4a227f6ef189cf37eb22641f6ee788b7dc41bb Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 20 Mar 2019 10:58:05 +0100 Subject: [PATCH 711/855] rbd: drop wait_for_latest_osdmap() Signed-off-by: Ilya Dryomov Reviewed-by: Jason Dillaman --- drivers/block/rbd.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 3b2c9289dccb..2210c1b9491b 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -924,23 +924,6 @@ static void rbd_put_client(struct rbd_client *rbdc) kref_put(&rbdc->kref, rbd_client_release); } -static int wait_for_latest_osdmap(struct ceph_client *client) -{ - u64 newest_epoch; - int ret; - - ret = ceph_monc_get_version(&client->monc, "osdmap", &newest_epoch); - if (ret) - return ret; - - if (client->osdc.osdmap->epoch >= newest_epoch) - return 0; - - ceph_osdc_maybe_request_map(&client->osdc); - return ceph_monc_wait_osdmap(&client->monc, newest_epoch, - client->options->mount_timeout); -} - /* * Get a ceph client with specific addr and configuration, if one does * not exist create it. Either way, ceph_opts is consumed by this @@ -960,7 +943,8 @@ static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) * Using an existing client. Make sure ->pg_pools is up to * date before we look up the pool id in do_rbd_add(). */ - ret = wait_for_latest_osdmap(rbdc->client); + ret = ceph_wait_for_latest_osdmap(rbdc->client, + rbdc->client->options->mount_timeout); if (ret) { rbd_warn(NULL, "failed to get latest osdmap: %d", ret); rbd_put_client(rbdc); From e5a5af7718610c819c4d368bb62655ee43a38011 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Wed, 20 Mar 2019 10:20:56 -0700 Subject: [PATCH 712/855] arm64: remove obsolete selection of MULTI_IRQ_HANDLER The arm64 config selects MULTI_IRQ_HANDLER, which was renamed to GENERIC_IRQ_MULTI_HANDLER by commit 4c301f9b6a94 ("ARM: Convert to GENERIC_IRQ_MULTI_HANDLER"). The 'new' option is already selected, so just remove the obsolete entry. Signed-off-by: Matthias Kaehlcke Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 117b2541ef3d..7e34b9eba5de 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -159,7 +159,6 @@ config ARM64 select IRQ_DOMAIN select IRQ_FORCED_THREADING select MODULES_USE_ELF_RELA - select MULTI_IRQ_HANDLER select NEED_DMA_MAP_STATE select NEED_SG_DMA_LENGTH select OF From 398f0132c14754fcd03c1c4f8e7176d001ce8ea1 Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Mon, 18 Mar 2019 23:14:52 -0700 Subject: [PATCH 713/855] net/packet: Set __GFP_NOWARN upon allocation in alloc_pg_vec Since commit fc62814d690c ("net/packet: fix 4gb buffer limit due to overflow check") one can now allocate packet ring buffers >= UINT_MAX. 
However, syzkaller found that that triggers a warning: [ 21.100000] WARNING: CPU: 2 PID: 2075 at mm/page_alloc.c:4584 __alloc_pages_nod0 [ 21.101490] Modules linked in: [ 21.101921] CPU: 2 PID: 2075 Comm: syz-executor.0 Not tainted 5.0.0 #146 [ 21.102784] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.5.1 01/01/2011 [ 21.103887] RIP: 0010:__alloc_pages_nodemask+0x2a0/0x630 [ 21.104640] Code: fe ff ff 65 48 8b 04 25 c0 de 01 00 48 05 90 0f 00 00 41 bd 01 00 00 00 48 89 44 24 48 e9 9c fe 3 [ 21.107121] RSP: 0018:ffff88805e1cf920 EFLAGS: 00010246 [ 21.107819] RAX: 0000000000000000 RBX: ffffffff85a488a0 RCX: 0000000000000000 [ 21.108753] RDX: 0000000000000000 RSI: dffffc0000000000 RDI: 0000000000000000 [ 21.109699] RBP: 1ffff1100bc39f28 R08: ffffed100bcefb67 R09: ffffed100bcefb67 [ 21.110646] R10: 0000000000000001 R11: ffffed100bcefb66 R12: 000000000000000d [ 21.111623] R13: 0000000000000000 R14: ffff88805e77d888 R15: 000000000000000d [ 21.112552] FS: 00007f7c7de05700(0000) GS:ffff88806d100000(0000) knlGS:0000000000000000 [ 21.113612] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 21.114405] CR2: 000000000065c000 CR3: 000000005e58e006 CR4: 00000000001606e0 [ 21.115367] Call Trace: [ 21.115705] ? __alloc_pages_slowpath+0x21c0/0x21c0 [ 21.116362] alloc_pages_current+0xac/0x1e0 [ 21.116923] kmalloc_order+0x18/0x70 [ 21.117393] kmalloc_order_trace+0x18/0x110 [ 21.117949] packet_set_ring+0x9d5/0x1770 [ 21.118524] ? packet_rcv_spkt+0x440/0x440 [ 21.119094] ? lock_downgrade+0x620/0x620 [ 21.119646] ? __might_fault+0x177/0x1b0 [ 21.120177] packet_setsockopt+0x981/0x2940 [ 21.120753] ? __fget+0x2fb/0x4b0 [ 21.121209] ? packet_release+0xab0/0xab0 [ 21.121740] ? sock_has_perm+0x1cd/0x260 [ 21.122297] ? selinux_secmark_relabel_packet+0xd0/0xd0 [ 21.123013] ? __fget+0x324/0x4b0 [ 21.123451] ? selinux_netlbl_socket_setsockopt+0x101/0x320 [ 21.124186] ? selinux_netlbl_sock_rcv_skb+0x3a0/0x3a0 [ 21.124908] ? __lock_acquire+0x529/0x3200 [ 21.125453] ? selinux_socket_setsockopt+0x5d/0x70 [ 21.126075] ? __sys_setsockopt+0x131/0x210 [ 21.126533] ? packet_release+0xab0/0xab0 [ 21.127004] __sys_setsockopt+0x131/0x210 [ 21.127449] ? kernel_accept+0x2f0/0x2f0 [ 21.127911] ? ret_from_fork+0x8/0x50 [ 21.128313] ? do_raw_spin_lock+0x11b/0x280 [ 21.128800] __x64_sys_setsockopt+0xba/0x150 [ 21.129271] ? lockdep_hardirqs_on+0x37f/0x560 [ 21.129769] do_syscall_64+0x9f/0x450 [ 21.130182] entry_SYSCALL_64_after_hwframe+0x49/0xbe We should allocate with __GFP_NOWARN to handle this. Cc: Kal Conley Cc: Andrey Konovalov Fixes: fc62814d690c ("net/packet: fix 4gb buffer limit due to overflow check") Signed-off-by: Christoph Paasch Signed-off-by: David S. 
Miller --- net/packet/af_packet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 323655a25674..9419c5cf4de5 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4210,7 +4210,7 @@ static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order) struct pgv *pg_vec; int i; - pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL); + pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN); if (unlikely(!pg_vec)) goto out; From 1c87e79a002f6a159396138cd3f3ab554a2a8887 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 20 Mar 2019 14:45:48 +0800 Subject: [PATCH 714/855] ipv6: make ip6_create_rt_rcu return ip6_null_entry instead of NULL Jianlin reported a crash: [ 381.484332] BUG: unable to handle kernel NULL pointer dereference at 0000000000000068 [ 381.619802] RIP: 0010:fib6_rule_lookup+0xa3/0x160 [ 382.009615] Call Trace: [ 382.020762] [ 382.030174] ip6_route_redirect.isra.52+0xc9/0xf0 [ 382.050984] ip6_redirect+0xb6/0xf0 [ 382.066731] icmpv6_notify+0xca/0x190 [ 382.083185] ndisc_redirect_rcv+0x10f/0x160 [ 382.102569] ndisc_rcv+0xfb/0x100 [ 382.117725] icmpv6_rcv+0x3f2/0x520 [ 382.133637] ip6_input_finish+0xbf/0x460 [ 382.151634] ip6_input+0x3b/0xb0 [ 382.166097] ipv6_rcv+0x378/0x4e0 It was caused by the lookup function __ip6_route_redirect() returns NULL in fib6_rule_lookup() when ip6_create_rt_rcu() returns NULL. So we fix it by simply making ip6_create_rt_rcu() return ip6_null_entry instead of NULL. v1->v2: - move down 'fallback:' to make it more readable. Fixes: e873e4b9cc7e ("ipv6: use fib6_info_hold_safe() when necessary") Reported-by: Jianlin Shi Suggested-by: Paolo Abeni Signed-off-by: Xin Long Reviewed-by: David Ahern Acked-by: Wei Wang Signed-off-by: David S. Miller --- net/ipv6/route.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 4ef4bbdb49d4..0302e0eb07af 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1040,14 +1040,20 @@ static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt) struct rt6_info *nrt; if (!fib6_info_hold_safe(rt)) - return NULL; + goto fallback; nrt = ip6_dst_alloc(dev_net(dev), dev, flags); - if (nrt) - ip6_rt_copy_init(nrt, rt); - else + if (!nrt) { fib6_info_release(rt); + goto fallback; + } + ip6_rt_copy_init(nrt, rt); + return nrt; + +fallback: + nrt = dev_net(dev)->ipv6.ip6_null_entry; + dst_hold(&nrt->dst); return nrt; } @@ -1096,10 +1102,6 @@ restart: dst_hold(&rt->dst); } else { rt = ip6_create_rt_rcu(f6i); - if (!rt) { - rt = net->ipv6.ip6_null_entry; - dst_hold(&rt->dst); - } } rcu_read_unlock(); From ef82bcfa671b9a635bab5fa669005663d8b177c5 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 20 Mar 2019 14:49:38 +0800 Subject: [PATCH 715/855] sctp: use memdup_user instead of vmemdup_user In sctp_setsockopt_bindx()/__sctp_setsockopt_connectx(), it allocates memory with addrs_size which is passed from userspace. We used flag GFP_USER to put some more restrictions on it in Commit cacc06215271 ("sctp: use GFP_USER for user-controlled kmalloc"). However, since Commit c981f254cc82 ("sctp: use vmemdup_user() rather than badly open-coding memdup_user()"), vmemdup_user() has been used, which doesn't check GFP_USER flag when goes to vmalloc_*(). So when addrs_size is a huge value, it could exhaust memory and even trigger oom killer. This patch is to use memdup_user() instead, in which GFP_USER would work to limit the memory allocation with a huge addrs_size. 
Note we can't fix it by limiting 'addrs_size', as there's no demand for it from RFC. Reported-by: syzbot+ec1b7575afef85a0e5ca@syzkaller.appspotmail.com Fixes: c981f254cc82 ("sctp: use vmemdup_user() rather than badly open-coding memdup_user()") Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/socket.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 011c349d877a..9874e60c9b0d 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -999,7 +999,7 @@ static int sctp_setsockopt_bindx(struct sock *sk, if (unlikely(addrs_size <= 0)) return -EINVAL; - kaddrs = vmemdup_user(addrs, addrs_size); + kaddrs = memdup_user(addrs, addrs_size); if (unlikely(IS_ERR(kaddrs))) return PTR_ERR(kaddrs); @@ -1007,7 +1007,7 @@ static int sctp_setsockopt_bindx(struct sock *sk, addr_buf = kaddrs; while (walk_size < addrs_size) { if (walk_size + sizeof(sa_family_t) > addrs_size) { - kvfree(kaddrs); + kfree(kaddrs); return -EINVAL; } @@ -1018,7 +1018,7 @@ static int sctp_setsockopt_bindx(struct sock *sk, * causes the address buffer to overflow return EINVAL. */ if (!af || (walk_size + af->sockaddr_len) > addrs_size) { - kvfree(kaddrs); + kfree(kaddrs); return -EINVAL; } addrcnt++; @@ -1054,7 +1054,7 @@ static int sctp_setsockopt_bindx(struct sock *sk, } out: - kvfree(kaddrs); + kfree(kaddrs); return err; } @@ -1329,7 +1329,7 @@ static int __sctp_setsockopt_connectx(struct sock *sk, if (unlikely(addrs_size <= 0)) return -EINVAL; - kaddrs = vmemdup_user(addrs, addrs_size); + kaddrs = memdup_user(addrs, addrs_size); if (unlikely(IS_ERR(kaddrs))) return PTR_ERR(kaddrs); @@ -1349,7 +1349,7 @@ static int __sctp_setsockopt_connectx(struct sock *sk, err = __sctp_connect(sk, kaddrs, addrs_size, flags, assoc_id); out_free: - kvfree(kaddrs); + kfree(kaddrs); return err; } From 89dc891792c2e046b030f87600109c22209da32e Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Tue, 12 Mar 2019 18:33:46 +0100 Subject: [PATCH 716/855] irqchip/gic-v3-its: Fix comparison logic in lpi_range_cmp The lpi_range_list is supposed to be sorted in ascending order of ->base_id (at least if the range merging is to work), but the current comparison function returns a positive value if rb->base_id > ra->base_id, which means that list_sort() will put A after B in that case - and vice versa, of course. Fixes: 880cb3cddd16 (irqchip/gic-v3-its: Refactor LPI allocator) Cc: stable@vger.kernel.org (v4.19+) Signed-off-by: Rasmus Villemoes Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3-its.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index fb7157188294..783810716015 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -1476,7 +1476,7 @@ static int lpi_range_cmp(void *priv, struct list_head *a, struct list_head *b) ra = container_of(a, struct lpi_range, entry); rb = container_of(b, struct lpi_range, entry); - return rb->base_id - ra->base_id; + return ra->base_id - rb->base_id; } static void merge_lpi_ranges(void) From 0e83fc61eee62979260f6aeadd23ee8b615ee1a2 Mon Sep 17 00:00:00 2001 From: Luo Jiaxing Date: Wed, 20 Mar 2019 18:21:34 +0800 Subject: [PATCH 717/855] scsi: hisi_sas: Add softreset in hisi_sas_I_T_nexus_reset() We found out that for v2 hw, a SATA disk can not be written to after the system comes up. 
In commit ffb1c820b8b6 ("scsi: hisi_sas: remove the check of sas_dev status in hisi_sas_I_T_nexus_reset()"), we introduced a path where we may issue an internal abort for a SATA device, but without following it with a softreset. We need to always follow an internal abort with a software reset, as per HW programming flow, so add this. Fixes: ffb1c820b8b6 ("scsi: hisi_sas: remove the check of sas_dev status in hisi_sas_I_T_nexus_reset()") Signed-off-by: Luo Jiaxing Signed-off-by: John Garry Signed-off-by: Martin K. Petersen --- drivers/scsi/hisi_sas/hisi_sas_main.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c index 3c3cf89f713f..14bac4966c87 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_main.c +++ b/drivers/scsi/hisi_sas/hisi_sas_main.c @@ -1801,6 +1801,12 @@ static int hisi_sas_I_T_nexus_reset(struct domain_device *device) } hisi_sas_dereg_device(hisi_hba, device); + if (dev_is_sata(device)) { + rc = hisi_sas_softreset_ata_disk(device); + if (rc) + return TMF_RESP_FUNC_FAILED; + } + rc = hisi_sas_debug_I_T_nexus_reset(device); if ((rc == TMF_RESP_FUNC_COMPLETE) || (rc == -ENODEV)) From 0ccc3876e4b2a1559a4dbe3126dda4459d38a83b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 19 Mar 2019 17:18:13 +0000 Subject: [PATCH 718/855] Btrfs: fix assertion failure on fsync with NO_HOLES enabled Back in commit a89ca6f24ffe4 ("Btrfs: fix fsync after truncate when no_holes feature is enabled") I added an assertion that is triggered when an inline extent is found to assert that the length of the (uncompressed) data the extent represents is the same as the i_size of the inode, since that is true most of the time I couldn't find or didn't remembered about any exception at that time. Later on the assertion was expanded twice to deal with a case of a compressed inline extent representing a range that matches the sector size followed by an expanding truncate, and another case where fallocate can update the i_size of the inode without adding or updating existing extents (if the fallocate range falls entirely within the first block of the file). These two expansion/fixes of the assertion were done by commit 7ed586d0a8241 ("Btrfs: fix assertion on fsync of regular file when using no-holes feature") and commit 6399fb5a0b69a ("Btrfs: fix assertion failure during fsync in no-holes mode"). These however missed the case where an falloc expands the i_size of an inode to exactly the sector size and inline extent exists, for example: $ mkfs.btrfs -f -O no-holes /dev/sdc $ mount /dev/sdc /mnt $ xfs_io -f -c "pwrite -S 0xab 0 1096" /mnt/foobar wrote 1096/1096 bytes at offset 0 1 KiB, 1 ops; 0.0002 sec (4.448 MiB/sec and 4255.3191 ops/sec) $ xfs_io -c "falloc 1096 3000" /mnt/foobar $ xfs_io -c "fsync" /mnt/foobar Segmentation fault $ dmesg [701253.602385] assertion failed: len == i_size || (len == fs_info->sectorsize && btrfs_file_extent_compression(leaf, extent) != BTRFS_COMPRESS_NONE) || (len < i_size && i_size < fs_info->sectorsize), file: fs/btrfs/tree-log.c, line: 4727 [701253.602962] ------------[ cut here ]------------ [701253.603224] kernel BUG at fs/btrfs/ctree.h:3533! [701253.603503] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC PTI [701253.603774] CPU: 2 PID: 7192 Comm: xfs_io Tainted: G W 5.0.0-rc8-btrfs-next-45 #1 [701253.604054] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014 [701253.604650] RIP: 0010:assfail.constprop.23+0x18/0x1a [btrfs] (...) 
[701253.605591] RSP: 0018:ffffbb48c186bc48 EFLAGS: 00010286 [701253.605914] RAX: 00000000000000de RBX: ffff921d0a7afc08 RCX: 0000000000000000 [701253.606244] RDX: 0000000000000000 RSI: ffff921d36b16868 RDI: ffff921d36b16868 [701253.606580] RBP: ffffbb48c186bcf0 R08: 0000000000000000 R09: 0000000000000000 [701253.606913] R10: 0000000000000003 R11: 0000000000000000 R12: ffff921d05d2de18 [701253.607247] R13: ffff921d03b54000 R14: 0000000000000448 R15: ffff921d059ecf80 [701253.607769] FS: 00007f14da906700(0000) GS:ffff921d36b00000(0000) knlGS:0000000000000000 [701253.608163] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [701253.608516] CR2: 000056087ea9f278 CR3: 00000002268e8001 CR4: 00000000003606e0 [701253.608880] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [701253.609250] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [701253.609608] Call Trace: [701253.609994] btrfs_log_inode+0xdfb/0xe40 [btrfs] [701253.610383] btrfs_log_inode_parent+0x2be/0xa60 [btrfs] [701253.610770] ? do_raw_spin_unlock+0x49/0xc0 [701253.611150] btrfs_log_dentry_safe+0x4a/0x70 [btrfs] [701253.611537] btrfs_sync_file+0x3b2/0x440 [btrfs] [701253.612010] ? do_sysinfo+0xb0/0xf0 [701253.612552] do_fsync+0x38/0x60 [701253.612988] __x64_sys_fsync+0x10/0x20 [701253.613360] do_syscall_64+0x60/0x1b0 [701253.613733] entry_SYSCALL_64_after_hwframe+0x49/0xbe [701253.614103] RIP: 0033:0x7f14da4e66d0 (...) [701253.615250] RSP: 002b:00007fffa670fdb8 EFLAGS: 00000246 ORIG_RAX: 000000000000004a [701253.615647] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f14da4e66d0 [701253.616047] RDX: 000056087ea9c260 RSI: 000056087ea9c260 RDI: 0000000000000003 [701253.616450] RBP: 0000000000000001 R08: 0000000000000020 R09: 0000000000000010 [701253.616854] R10: 000000000000009b R11: 0000000000000246 R12: 000056087ea9c260 [701253.617257] R13: 000056087ea9c240 R14: 0000000000000000 R15: 000056087ea9dd10 (...) [701253.619941] ---[ end trace e088d74f132b6da5 ]--- Updating the assertion again to allow for this particular case would result in a meaningless assertion, plus there is currently no risk of logging content that would result in any corruption after a log replay if the size of the data encoded in an inline extent is greater than the inode's i_size (which is not currently possibe either with or without compression), therefore just remove the assertion. CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 960ea49a7a0f..561884f60d35 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4725,15 +4725,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item); if (btrfs_file_extent_type(leaf, extent) == - BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_ram_bytes(leaf, extent); - ASSERT(len == i_size || - (len == fs_info->sectorsize && - btrfs_file_extent_compression(leaf, extent) != - BTRFS_COMPRESS_NONE) || - (len < i_size && i_size < fs_info->sectorsize)); + BTRFS_FILE_EXTENT_INLINE) return 0; - } len = btrfs_file_extent_num_bytes(leaf, extent); /* Last extent goes beyond i_size, no need to log a hole. 
*/ From 8a1b1718214cfd945fef14b3031e4e7262882a86 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 11 Mar 2019 22:30:48 -0700 Subject: [PATCH 719/855] perf build: Check what binutils's 'disassembler()' signature to use Commit 003ca0fd2286 ("Refactor disassembler selection") in the binutils repo, which changed the disassembler() function signature, so we must use the feature test introduced in fb982666e380 ("tools/bpftool: fix bpftool build with bintutils >= 2.9") to deal with that. Committer testing: After adding the missing function call to test-all.c, and: FEATURE_CHECK_LDFLAGS-disassembler-four-args = -bfd -lopcodes And the fallbacks for cases where we need -liberty and sometimes -lz to tools/perf/Makefile.config, we get: $ make -C tools/perf O=/tmp/build/perf install-bin make: Entering directory '/home/acme/git/perf/tools/perf' BUILD: Doing 'make -j8' parallel build Auto-detecting system features: ... dwarf: [ on ] ... dwarf_getlocations: [ on ] ... glibc: [ on ] ... gtk2: [ on ] ... libaudit: [ on ] ... libbfd: [ on ] ... libelf: [ on ] ... libnuma: [ on ] ... numa_num_possible_cpus: [ on ] ... libperl: [ on ] ... libpython: [ on ] ... libslang: [ on ] ... libcrypto: [ on ] ... libunwind: [ on ] ... libdw-dwarf-unwind: [ on ] ... zlib: [ on ] ... lzma: [ on ] ... get_cpuid: [ on ] ... bpf: [ on ] ... libaio: [ on ] ... disassembler-four-args: [ on ] CC /tmp/build/perf/jvmti/libjvmti.o CC /tmp/build/perf/builtin-bench.o $ $ The feature detection test-all.bin gets successfully built and linked: $ ls -la /tmp/build/perf/feature/test-all.bin -rwxrwxr-x. 1 acme acme 2680352 Mar 19 11:07 /tmp/build/perf/feature/test-all.bin $ nm /tmp/build/perf/feature/test-all.bin | grep -w disassembler 0000000000061f90 T disassembler $ Time to move on to the patches that make use of this disassembler() routine in binutils's libopcodes. Signed-off-by: Song Liu Tested-by: Arnaldo Carvalho de Melo Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Jakub Kicinski Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Stanislav Fomichev Link: http://lkml.kernel.org/r/20190312053051.2690567-13-songliubraving@fb.com [ split from a larger patch, added missing FEATURE_CHECK_LDFLAGS-disassembler-four-args ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/Makefile.feature | 6 ++++-- tools/build/feature/test-all.c | 5 +++++ tools/perf/Makefile.config | 9 +++++++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature index 61e46d54a67c..8d3864b061f3 100644 --- a/tools/build/Makefile.feature +++ b/tools/build/Makefile.feature @@ -66,7 +66,8 @@ FEATURE_TESTS_BASIC := \ sched_getcpu \ sdt \ setns \ - libaio + libaio \ + disassembler-four-args # FEATURE_TESTS_BASIC + FEATURE_TESTS_EXTRA is the complete list # of all feature tests @@ -118,7 +119,8 @@ FEATURE_DISPLAY ?= \ lzma \ get_cpuid \ bpf \ - libaio + libaio \ + disassembler-four-args # Set FEATURE_CHECK_(C|LD)FLAGS-all for all FEATURE_TESTS features. 
# If in the future we need per-feature checks/flags for features not diff --git a/tools/build/feature/test-all.c b/tools/build/feature/test-all.c index e903b86b742f..7853e6d91090 100644 --- a/tools/build/feature/test-all.c +++ b/tools/build/feature/test-all.c @@ -178,6 +178,10 @@ # include "test-reallocarray.c" #undef main +#define main main_test_disassembler_four_args +# include "test-disassembler-four-args.c" +#undef main + int main(int argc, char *argv[]) { main_test_libpython(); @@ -219,6 +223,7 @@ int main(int argc, char *argv[]) main_test_setns(); main_test_libaio(); main_test_reallocarray(); + main_test_disassembler_four_args(); return 0; } diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index df4ad45599ca..fe3f97e342fa 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -227,6 +227,8 @@ FEATURE_CHECK_LDFLAGS-libpython-version := $(PYTHON_EMBED_LDOPTS) FEATURE_CHECK_LDFLAGS-libaio = -lrt +FEATURE_CHECK_LDFLAGS-disassembler-four-args = -lbfd -lopcodes + CFLAGS += -fno-omit-frame-pointer CFLAGS += -ggdb3 CFLAGS += -funwind-tables @@ -725,11 +727,14 @@ else ifeq ($(feature-libbfd-liberty), 1) EXTLIBS += -lbfd -lopcodes -liberty + FEATURE_CHECK_LDFLAGS-disassembler-four-args += -liberty -ldl else ifeq ($(feature-libbfd-liberty-z), 1) EXTLIBS += -lbfd -lopcodes -liberty -lz + FEATURE_CHECK_LDFLAGS-disassembler-four-args += -liberty -lz -ldl endif endif + $(call feature_check,disassembler-four-args) endif ifdef NO_DEMANGLE @@ -808,6 +813,10 @@ ifdef HAVE_KVM_STAT_SUPPORT CFLAGS += -DHAVE_KVM_STAT_SUPPORT endif +ifeq ($(feature-disassembler-four-args), 1) + CFLAGS += -DDISASM_FOUR_ARGS_SIGNATURE +endif + ifeq (${IS_64_BIT}, 1) ifndef NO_PERF_READ_VDSO32 $(call feature_check,compile-32) From 6987561c9e86eace45f2dbb0c564964a63f4150a Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 11 Mar 2019 22:30:48 -0700 Subject: [PATCH 720/855] perf annotate: Enable annotation of BPF programs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In symbol__disassemble(), DSO_BINARY_TYPE__BPF_PROG_INFO dso calls into a new function symbol__disassemble_bpf(), where annotation line information is filled based on the bpf_prog_info and btf data saved in given perf_env. symbol__disassemble_bpf() uses binutils's libopcodes to disassemble bpf programs. Committer testing: After fixing this: - u64 *addrs = (u64 *)(info_linear->info.jited_ksyms); + u64 *addrs = (u64 *)(uintptr_t)(info_linear->info.jited_ksyms); Detected when crossbuilding to a 32-bit arch. And making all this dependent on HAVE_LIBBFD_SUPPORT and HAVE_LIBBPF_SUPPORT: 1) Have a BPF program running, one that has BTF info, etc, I used the tools/perf/examples/bpf/augmented_raw_syscalls.c put in place by 'perf trace'. 
# grep -B1 augmented_raw ~/.perfconfig [trace] add_events = /home/acme/git/perf/tools/perf/examples/bpf/augmented_raw_syscalls.c # # perf trace -e *mmsg dnf/6245 sendmmsg(20, 0x7f5485a88030, 2, MSG_NOSIGNAL) = 2 NetworkManager/10055 sendmmsg(22, 0x7f8126ad1bb0, 2, MSG_NOSIGNAL) = 2 2) Then do a 'perf record' system wide for a while: # perf record -a ^C[ perf record: Woken up 68 times to write data ] [ perf record: Captured and wrote 19.427 MB perf.data (366891 samples) ] # 3) Check that we captured BPF and BTF info in the perf.data file: # perf report --header-only | grep 'b[pt]f' # event : name = cycles:ppp, , id = { 294789, 294790, 294791, 294792, 294793, 294794, 294795, 294796 }, size = 112, { sample_period, sample_freq } = 4000, sample_type = IP|TID|TIME|CPU|PERIOD, read_format = ID, disabled = 1, inherit = 1, mmap = 1, comm = 1, freq = 1, task = 1, precise_ip = 3, sample_id_all = 1, exclude_guest = 1, mmap2 = 1, comm_exec = 1, ksymbol = 1, bpf_event = 1 # bpf_prog_info of id 13 # bpf_prog_info of id 14 # bpf_prog_info of id 15 # bpf_prog_info of id 16 # bpf_prog_info of id 17 # bpf_prog_info of id 18 # bpf_prog_info of id 21 # bpf_prog_info of id 22 # bpf_prog_info of id 41 # bpf_prog_info of id 42 # btf info of id 2 # 4) Check which programs got recorded: # perf report | grep bpf_prog | head 0.16% exe bpf_prog_819967866022f1e1_sys_enter [k] bpf_prog_819967866022f1e1_sys_enter 0.14% exe bpf_prog_c1bd85c092d6e4aa_sys_exit [k] bpf_prog_c1bd85c092d6e4aa_sys_exit 0.08% fuse-overlayfs bpf_prog_819967866022f1e1_sys_enter [k] bpf_prog_819967866022f1e1_sys_enter 0.07% fuse-overlayfs bpf_prog_c1bd85c092d6e4aa_sys_exit [k] bpf_prog_c1bd85c092d6e4aa_sys_exit 0.01% clang-4.0 bpf_prog_c1bd85c092d6e4aa_sys_exit [k] bpf_prog_c1bd85c092d6e4aa_sys_exit 0.01% clang-4.0 bpf_prog_819967866022f1e1_sys_enter [k] bpf_prog_819967866022f1e1_sys_enter 0.00% clang bpf_prog_c1bd85c092d6e4aa_sys_exit [k] bpf_prog_c1bd85c092d6e4aa_sys_exit 0.00% runc bpf_prog_819967866022f1e1_sys_enter [k] bpf_prog_819967866022f1e1_sys_enter 0.00% clang bpf_prog_819967866022f1e1_sys_enter [k] bpf_prog_819967866022f1e1_sys_enter 0.00% sh bpf_prog_c1bd85c092d6e4aa_sys_exit [k] bpf_prog_c1bd85c092d6e4aa_sys_exit # This was with the default --sort order for 'perf report', which is: --sort comm,dso,symbol If we just look for the symbol, for instance: # perf report --sort symbol | grep bpf_prog | head 0.26% [k] bpf_prog_819967866022f1e1_sys_enter - - 0.24% [k] bpf_prog_c1bd85c092d6e4aa_sys_exit - - # or the DSO: # perf report --sort dso | grep bpf_prog | head 0.26% bpf_prog_819967866022f1e1_sys_enter 0.24% bpf_prog_c1bd85c092d6e4aa_sys_exit # We'll see the two BPF programs that augmented_raw_syscalls.o puts in place, one attached to the raw_syscalls:sys_enter and another to the raw_syscalls:sys_exit tracepoints, as expected. 
Now we can finally do, from the command line, annotation for one of those two symbols, with the original BPF program source coude intermixed with the disassembled JITed code: # perf annotate --stdio2 bpf_prog_819967866022f1e1_sys_enter Samples: 950 of event 'cycles:ppp', 4000 Hz, Event count (approx.): 553756947, [percent: local period] bpf_prog_819967866022f1e1_sys_enter() bpf_prog_819967866022f1e1_sys_enter Percent int sys_enter(struct syscall_enter_args *args) 53.41 push %rbp 0.63 mov %rsp,%rbp 0.31 sub $0x170,%rsp 1.93 sub $0x28,%rbp 7.02 mov %rbx,0x0(%rbp) 3.20 mov %r13,0x8(%rbp) 1.07 mov %r14,0x10(%rbp) 0.61 mov %r15,0x18(%rbp) 0.11 xor %eax,%eax 1.29 mov %rax,0x20(%rbp) 0.11 mov %rdi,%rbx return bpf_get_current_pid_tgid(); 2.02 → callq *ffffffffda6776d9 2.76 mov %eax,-0x148(%rbp) mov %rbp,%rsi int sys_enter(struct syscall_enter_args *args) add $0xfffffffffffffeb8,%rsi return bpf_map_lookup_elem(pids, &pid) != NULL; movabs $0xffff975ac2607800,%rdi 1.26 → callq *ffffffffda6789e9 cmp $0x0,%rax 2.43 → je 0 add $0x38,%rax 0.21 xor %r13d,%r13d if (pid_filter__has(&pids_filtered, getpid())) 0.81 cmp $0x0,%rax → jne 0 mov %rbp,%rdi probe_read(&augmented_args.args, sizeof(augmented_args.args), args); 2.22 add $0xfffffffffffffeb8,%rdi 0.11 mov $0x40,%esi 0.32 mov %rbx,%rdx 2.74 → callq *ffffffffda658409 syscall = bpf_map_lookup_elem(&syscalls, &augmented_args.args.syscall_nr); 0.22 mov %rbp,%rsi 1.69 add $0xfffffffffffffec0,%rsi syscall = bpf_map_lookup_elem(&syscalls, &augmented_args.args.syscall_nr); movabs $0xffff975bfcd36000,%rdi add $0xd0,%rdi 0.21 mov 0x0(%rsi),%eax 0.93 cmp $0x200,%rax → jae 0 0.10 shl $0x3,%rax 0.11 add %rdi,%rax 0.11 → jmp 0 xor %eax,%eax if (syscall == NULL || !syscall->enabled) 1.07 cmp $0x0,%rax → je 0 if (syscall == NULL || !syscall->enabled) 6.57 movzbq 0x0(%rax),%rdi if (syscall == NULL || !syscall->enabled) cmp $0x0,%rdi 0.95 → je 0 mov $0x40,%r8d switch (augmented_args.args.syscall_nr) { mov -0x140(%rbp),%rdi switch (augmented_args.args.syscall_nr) { cmp $0x2,%rdi → je 0 cmp $0x101,%rdi → je 0 cmp $0x15,%rdi → jne 0 case SYS_OPEN: filename_arg = (const void *)args->args[0]; mov 0x10(%rbx),%rdx → jmp 0 case SYS_OPENAT: filename_arg = (const void *)args->args[1]; mov 0x18(%rbx),%rdx if (filename_arg != NULL) { cmp $0x0,%rdx → je 0 xor %edi,%edi augmented_args.filename.reserved = 0; mov %edi,-0x104(%rbp) augmented_args.filename.size = probe_read_str(&augmented_args.filename.value, mov %rbp,%rdi add $0xffffffffffffff00,%rdi augmented_args.filename.size = probe_read_str(&augmented_args.filename.value, mov $0x100,%esi → callq *ffffffffda658499 mov $0x148,%r8d augmented_args.filename.size = probe_read_str(&augmented_args.filename.value, mov %eax,-0x108(%rbp) augmented_args.filename.size = probe_read_str(&augmented_args.filename.value, mov %rax,%rdi shl $0x20,%rdi shr $0x20,%rdi if (augmented_args.filename.size < sizeof(augmented_args.filename.value)) { cmp $0xff,%rdi → ja 0 len -= sizeof(augmented_args.filename.value) - augmented_args.filename.size; add $0x48,%rax len &= sizeof(augmented_args.filename.value) - 1; and $0xff,%rax mov %rax,%r8 mov %rbp,%rcx return perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, &augmented_args, len); add $0xfffffffffffffeb8,%rcx mov %rbx,%rdi movabs $0xffff975fbd72d800,%rsi mov $0xffffffff,%edx → callq *ffffffffda658ad9 mov %rax,%r13 } mov %r13,%rax 0.72 mov 0x0(%rbp),%rbx mov 0x8(%rbp),%r13 1.16 mov 0x10(%rbp),%r14 0.10 mov 0x18(%rbp),%r15 0.42 add $0x28,%rbp 0.54 leaveq 0.54 ← retq # Please see 'man perf-config' 
to see how to control what should be seen, via ~/.perfconfig [annotate] section, for instance, one can suppress the source code and see just the disassembly, etc. Alternatively, use the TUI bu just using 'perf annotate', press '/bpf_prog' to see the bpf symbols, press enter and do the interactive annotation, which allows for dumping to a file after selecting the the various output tunables, for instance, the above without source code intermixed, plus showing all the instruction offsets: # perf annotate bpf_prog_819967866022f1e1_sys_enter Then press: 's' to hide the source code + 'O' twice to show all instruction offsets, then 'P' to print to the bpf_prog_819967866022f1e1_sys_enter.annotation file, which will have: # cat bpf_prog_819967866022f1e1_sys_enter.annotation bpf_prog_819967866022f1e1_sys_enter() bpf_prog_819967866022f1e1_sys_enter Event: cycles:ppp 53.41 0: push %rbp 0.63 1: mov %rsp,%rbp 0.31 4: sub $0x170,%rsp 1.93 b: sub $0x28,%rbp 7.02 f: mov %rbx,0x0(%rbp) 3.20 13: mov %r13,0x8(%rbp) 1.07 17: mov %r14,0x10(%rbp) 0.61 1b: mov %r15,0x18(%rbp) 0.11 1f: xor %eax,%eax 1.29 21: mov %rax,0x20(%rbp) 0.11 25: mov %rdi,%rbx 2.02 28: → callq *ffffffffda6776d9 2.76 2d: mov %eax,-0x148(%rbp) 33: mov %rbp,%rsi 36: add $0xfffffffffffffeb8,%rsi 3d: movabs $0xffff975ac2607800,%rdi 1.26 47: → callq *ffffffffda6789e9 4c: cmp $0x0,%rax 2.43 50: → je 0 52: add $0x38,%rax 0.21 56: xor %r13d,%r13d 0.81 59: cmp $0x0,%rax 5d: → jne 0 63: mov %rbp,%rdi 2.22 66: add $0xfffffffffffffeb8,%rdi 0.11 6d: mov $0x40,%esi 0.32 72: mov %rbx,%rdx 2.74 75: → callq *ffffffffda658409 0.22 7a: mov %rbp,%rsi 1.69 7d: add $0xfffffffffffffec0,%rsi 84: movabs $0xffff975bfcd36000,%rdi 8e: add $0xd0,%rdi 0.21 95: mov 0x0(%rsi),%eax 0.93 98: cmp $0x200,%rax 9f: → jae 0 0.10 a1: shl $0x3,%rax 0.11 a5: add %rdi,%rax 0.11 a8: → jmp 0 aa: xor %eax,%eax 1.07 ac: cmp $0x0,%rax b0: → je 0 6.57 b6: movzbq 0x0(%rax),%rdi bb: cmp $0x0,%rdi 0.95 bf: → je 0 c5: mov $0x40,%r8d cb: mov -0x140(%rbp),%rdi d2: cmp $0x2,%rdi d6: → je 0 d8: cmp $0x101,%rdi df: → je 0 e1: cmp $0x15,%rdi e5: → jne 0 e7: mov 0x10(%rbx),%rdx eb: → jmp 0 ed: mov 0x18(%rbx),%rdx f1: cmp $0x0,%rdx f5: → je 0 f7: xor %edi,%edi f9: mov %edi,-0x104(%rbp) ff: mov %rbp,%rdi 102: add $0xffffffffffffff00,%rdi 109: mov $0x100,%esi 10e: → callq *ffffffffda658499 113: mov $0x148,%r8d 119: mov %eax,-0x108(%rbp) 11f: mov %rax,%rdi 122: shl $0x20,%rdi 126: shr $0x20,%rdi 12a: cmp $0xff,%rdi 131: → ja 0 133: add $0x48,%rax 137: and $0xff,%rax 13d: mov %rax,%r8 140: mov %rbp,%rcx 143: add $0xfffffffffffffeb8,%rcx 14a: mov %rbx,%rdi 14d: movabs $0xffff975fbd72d800,%rsi 157: mov $0xffffffff,%edx 15c: → callq *ffffffffda658ad9 161: mov %rax,%r13 164: mov %r13,%rax 0.72 167: mov 0x0(%rbp),%rbx 16b: mov 0x8(%rbp),%r13 1.16 16f: mov 0x10(%rbp),%r14 0.10 173: mov 0x18(%rbp),%r15 0.42 177: add $0x28,%rbp 0.54 17b: leaveq 0.54 17c: ← retq Another cool way to test all this is to symple use 'perf top' look for those symbols, go there and press enter, annotate it live :-) Signed-off-by: Song Liu Reviewed-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stanislav Fomichev Link: http://lkml.kernel.org/r/20190312053051.2690567-13-songliubraving@fb.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 163 +++++++++++++++++++++++++++++++++++- tools/perf/util/annotate.h | 1 + tools/perf/util/bpf-event.c | 2 +- 3 files changed, 164 insertions(+), 2 deletions(-) diff --git 
a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 5f6dbbf5d749..c8b01176c9e1 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -10,6 +10,10 @@ #include #include #include +#include +#include +#include +#include #include "util.h" #include "ui/ui.h" #include "sort.h" @@ -24,6 +28,7 @@ #include "annotate.h" #include "evsel.h" #include "evlist.h" +#include "bpf-event.h" #include "block-range.h" #include "string2.h" #include "arch/common.h" @@ -31,6 +36,7 @@ #include #include #include +#include /* FIXME: For the HE_COLORSET */ #include "ui/browser.h" @@ -1615,6 +1621,9 @@ int symbol__strerror_disassemble(struct symbol *sym __maybe_unused, struct map * " --vmlinux vmlinux\n", build_id_msg ?: ""); } break; + case SYMBOL_ANNOTATE_ERRNO__NO_LIBOPCODES_FOR_BPF: + scnprintf(buf, buflen, "Please link with binutils's libopcode to enable BPF annotation"); + break; default: scnprintf(buf, buflen, "Internal error: Invalid %d error code\n", errnum); break; @@ -1674,6 +1683,156 @@ fallback: return 0; } +#if defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT) +#define PACKAGE "perf" +#include +#include + +static int symbol__disassemble_bpf(struct symbol *sym, + struct annotate_args *args) +{ + struct annotation *notes = symbol__annotation(sym); + struct annotation_options *opts = args->options; + struct bpf_prog_info_linear *info_linear; + struct bpf_prog_linfo *prog_linfo = NULL; + struct bpf_prog_info_node *info_node; + int len = sym->end - sym->start; + disassembler_ftype disassemble; + struct map *map = args->ms.map; + struct disassemble_info info; + struct dso *dso = map->dso; + int pc = 0, count, sub_id; + struct btf *btf = NULL; + char tpath[PATH_MAX]; + size_t buf_size; + int nr_skip = 0; + int ret = -1; + char *buf; + bfd *bfdf; + FILE *s; + + if (dso->binary_type != DSO_BINARY_TYPE__BPF_PROG_INFO) + return -1; + + pr_debug("%s: handling sym %s addr %lx len %lx\n", __func__, + sym->name, sym->start, sym->end - sym->start); + + memset(tpath, 0, sizeof(tpath)); + perf_exe(tpath, sizeof(tpath)); + + bfdf = bfd_openr(tpath, NULL); + assert(bfdf); + assert(bfd_check_format(bfdf, bfd_object)); + + s = open_memstream(&buf, &buf_size); + if (!s) + goto out; + init_disassemble_info(&info, s, + (fprintf_ftype) fprintf); + + info.arch = bfd_get_arch(bfdf); + info.mach = bfd_get_mach(bfdf); + + info_node = perf_env__find_bpf_prog_info(dso->bpf_prog.env, + dso->bpf_prog.id); + if (!info_node) + goto out; + info_linear = info_node->info_linear; + sub_id = dso->bpf_prog.sub_id; + + info.buffer = (void *)(info_linear->info.jited_prog_insns); + info.buffer_length = info_linear->info.jited_prog_len; + + if (info_linear->info.nr_line_info) + prog_linfo = bpf_prog_linfo__new(&info_linear->info); + + if (info_linear->info.btf_id) { + struct btf_node *node; + + node = perf_env__find_btf(dso->bpf_prog.env, + info_linear->info.btf_id); + if (node) + btf = btf__new((__u8 *)(node->data), + node->data_size); + } + + disassemble_init_for_target(&info); + +#ifdef DISASM_FOUR_ARGS_SIGNATURE + disassemble = disassembler(info.arch, + bfd_big_endian(bfdf), + info.mach, + bfdf); +#else + disassemble = disassembler(bfdf); +#endif + assert(disassemble); + + fflush(s); + do { + const struct bpf_line_info *linfo = NULL; + struct disasm_line *dl; + size_t prev_buf_size; + const char *srcline; + u64 addr; + + addr = pc + ((u64 *)(info_linear->info.jited_ksyms))[sub_id]; + count = disassemble(pc, &info); + + if (prog_linfo) + linfo = bpf_prog_linfo__lfind_addr_func(prog_linfo, + addr, sub_id, 
+ nr_skip); + + if (linfo && btf) { + srcline = btf__name_by_offset(btf, linfo->line_off); + nr_skip++; + } else + srcline = NULL; + + fprintf(s, "\n"); + prev_buf_size = buf_size; + fflush(s); + + if (!opts->hide_src_code && srcline) { + args->offset = -1; + args->line = strdup(srcline); + args->line_nr = 0; + args->ms.sym = sym; + dl = disasm_line__new(args); + if (dl) { + annotation_line__add(&dl->al, + ¬es->src->source); + } + } + + args->offset = pc; + args->line = buf + prev_buf_size; + args->line_nr = 0; + args->ms.sym = sym; + dl = disasm_line__new(args); + if (dl) + annotation_line__add(&dl->al, ¬es->src->source); + + pc += count; + } while (count > 0 && pc < len); + + ret = 0; +out: + free(prog_linfo); + free(btf); + fclose(s); + bfd_close(bfdf); + return ret; +} +#else // defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT) +static int symbol__disassemble_bpf(struct symbol *sym __maybe_unused, + struct annotate_args *args __maybe_unused) +{ + return SYMBOL_ANNOTATE_ERRNO__NO_LIBOPCODES_FOR_BPF; +} +#endif // defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT) + static int symbol__disassemble(struct symbol *sym, struct annotate_args *args) { struct annotation_options *opts = args->options; @@ -1701,7 +1860,9 @@ static int symbol__disassemble(struct symbol *sym, struct annotate_args *args) pr_debug("annotating [%p] %30s : [%p] %30s\n", dso, dso->long_name, sym, sym->name); - if (dso__is_kcore(dso)) { + if (dso->binary_type == DSO_BINARY_TYPE__BPF_PROG_INFO) { + return symbol__disassemble_bpf(sym, args); + } else if (dso__is_kcore(dso)) { kce.kcore_filename = symfs_filename; kce.addr = map__rip_2objdump(map, sym->start); kce.offs = sym->start; diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index df34fe483164..5bc0cf655d37 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -369,6 +369,7 @@ enum symbol_disassemble_errno { __SYMBOL_ANNOTATE_ERRNO__START = -10000, SYMBOL_ANNOTATE_ERRNO__NO_VMLINUX = __SYMBOL_ANNOTATE_ERRNO__START, + SYMBOL_ANNOTATE_ERRNO__NO_LIBOPCODES_FOR_BPF, __SYMBOL_ANNOTATE_ERRNO__END, }; diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c index 852e960692cb..7ffe7db59828 100644 --- a/tools/perf/util/bpf-event.c +++ b/tools/perf/util/bpf-event.c @@ -46,7 +46,7 @@ static int machine__process_bpf_event_load(struct machine *machine, info_linear = info_node->info_linear; for (i = 0; i < info_linear->info.nr_jited_ksyms; i++) { - u64 *addrs = (u64 *)(info_linear->info.jited_ksyms); + u64 *addrs = (u64 *)(uintptr_t)(info_linear->info.jited_ksyms); u64 addr = addrs[i]; struct map *map; From 536d3680fd2dab5c39857d62a3e084198fc74ff9 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Wed, 20 Mar 2019 15:02:00 +0100 Subject: [PATCH 721/855] net: ks8851: Dequeue RX packets explicitly The ks8851 driver lets the chip auto-dequeue received packets once they have been read in full. It achieves that by setting the ADRFE flag in the RXQCR register ("Auto-Dequeue RXQ Frame Enable"). However if allocation of a packet's socket buffer or retrieval of the packet over the SPI bus fails, the packet will not have been read in full and is not auto-dequeued. Such partial retrieval of a packet confuses the chip's RX queue management: On the next RX interrupt, the first packet read from the queue will be the one left there previously and this one can be retrieved without issues. 
But for any newly received packets, the frame header status and byte count registers (RXFHSR and RXFHBCR) contain bogus values, preventing their retrieval. The chip allows explicitly dequeueing a packet from the RX queue by setting the RRXEF flag in the RXQCR register ("Release RX Error Frame"). This could be used to dequeue the packet in case of an error, but if that error is a failed SPI transfer, it is unknown if the packet was transferred in full and was auto-dequeued or if it was only transferred in part and requires an explicit dequeue. The safest approach is thus to always dequeue packets explicitly and forgo auto-dequeueing. Without this change, I've witnessed packet retrieval break completely when an SPI DMA transfer fails, requiring a chip reset. Explicit dequeueing magically fixes this and makes packet retrieval absolutely robust for me. The chip's documentation suggests auto-dequeuing and uses the RRXEF flag only to dequeue error frames which the driver doesn't want to retrieve. But that seems to be a fair-weather approach. Signed-off-by: Lukas Wunner Cc: Frank Pavlic Cc: Ben Dooks Cc: Tristram Ha Signed-off-by: David S. Miller --- drivers/net/ethernet/micrel/ks8851.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/micrel/ks8851.c b/drivers/net/ethernet/micrel/ks8851.c index bd6e9014bc74..a93f8e842c07 100644 --- a/drivers/net/ethernet/micrel/ks8851.c +++ b/drivers/net/ethernet/micrel/ks8851.c @@ -535,9 +535,8 @@ static void ks8851_rx_pkts(struct ks8851_net *ks) /* set dma read address */ ks8851_wrreg16(ks, KS_RXFDPR, RXFDPR_RXFPAI | 0x00); - /* start the packet dma process, and set auto-dequeue rx */ - ks8851_wrreg16(ks, KS_RXQCR, - ks->rc_rxqcr | RXQCR_SDA | RXQCR_ADRFE); + /* start DMA access */ + ks8851_wrreg16(ks, KS_RXQCR, ks->rc_rxqcr | RXQCR_SDA); if (rxlen > 4) { unsigned int rxalign; @@ -568,7 +567,8 @@ static void ks8851_rx_pkts(struct ks8851_net *ks) } } - ks8851_wrreg16(ks, KS_RXQCR, ks->rc_rxqcr); + /* end DMA access and dequeue packet */ + ks8851_wrreg16(ks, KS_RXQCR, ks->rc_rxqcr | RXQCR_RRXEF); } } From 761cfa979a0c177d6c2d93ef5585cd79ae49a7d5 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Wed, 20 Mar 2019 15:02:00 +0100 Subject: [PATCH 722/855] net: ks8851: Reassert reset pin if chip ID check fails Commit 73fdeb82e963 ("net: ks8851: Add optional vdd_io regulator and reset gpio") amended the ks8851 driver to briefly assert the chip's reset pin on probe. It also amended the probe routine's error path to reassert the reset pin if a subsequent initialization step fails. However the commit misplaced reassertion of the reset pin in the error path such that it is not performed if the check of the Chip ID and Enable Register (CIDER) fails. The error path is therefore slightly asymmetrical to the probe routine's body. Fix it. Signed-off-by: Lukas Wunner Cc: Frank Pavlic Cc: Stephen Boyd Cc: Nishanth Menon Signed-off-by: David S. 
Miller --- drivers/net/ethernet/micrel/ks8851.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/micrel/ks8851.c b/drivers/net/ethernet/micrel/ks8851.c index a93f8e842c07..1633fa5c709c 100644 --- a/drivers/net/ethernet/micrel/ks8851.c +++ b/drivers/net/ethernet/micrel/ks8851.c @@ -1554,9 +1554,9 @@ err_netdev: free_irq(ndev->irq, ks); err_irq: +err_id: if (gpio_is_valid(gpio)) gpio_set_value(gpio, 0); -err_id: regulator_disable(ks->vdd_reg); err_reg: regulator_disable(ks->vdd_io); From d268f31552794abf5b6aa5af31021643411f25f5 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Wed, 20 Mar 2019 15:02:00 +0100 Subject: [PATCH 723/855] net: ks8851: Delay requesting IRQ until opened The ks8851 driver currently requests the IRQ before registering the net_device. Because the net_device name is used as IRQ name and is still "eth%d" when the IRQ is requested, it's impossibe to tell IRQs apart if multiple ks8851 chips are present. Most other drivers delay requesting the IRQ until the net_device is opened. Do the same. The driver doesn't enable interrupts on the chip before opening the net_device and disables them when closing it, so there doesn't seem to be a need to request the IRQ already on probe. Signed-off-by: Lukas Wunner Cc: Frank Pavlic Cc: Ben Dooks Cc: Tristram Ha Signed-off-by: David S. Miller --- drivers/net/ethernet/micrel/ks8851.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/micrel/ks8851.c b/drivers/net/ethernet/micrel/ks8851.c index 1633fa5c709c..c9faec4c5b25 100644 --- a/drivers/net/ethernet/micrel/ks8851.c +++ b/drivers/net/ethernet/micrel/ks8851.c @@ -785,6 +785,15 @@ static void ks8851_tx_work(struct work_struct *work) static int ks8851_net_open(struct net_device *dev) { struct ks8851_net *ks = netdev_priv(dev); + int ret; + + ret = request_threaded_irq(dev->irq, NULL, ks8851_irq, + IRQF_TRIGGER_LOW | IRQF_ONESHOT, + dev->name, ks); + if (ret < 0) { + netdev_err(dev, "failed to get irq\n"); + return ret; + } /* lock the card, even if we may not actually be doing anything * else at the moment */ @@ -899,6 +908,8 @@ static int ks8851_net_stop(struct net_device *dev) dev_kfree_skb(txb); } + free_irq(dev->irq, ks); + return 0; } @@ -1529,14 +1540,6 @@ static int ks8851_probe(struct spi_device *spi) ks8851_read_selftest(ks); ks8851_init_mac(ks); - ret = request_threaded_irq(spi->irq, NULL, ks8851_irq, - IRQF_TRIGGER_LOW | IRQF_ONESHOT, - ndev->name, ks); - if (ret < 0) { - dev_err(&spi->dev, "failed to get irq\n"); - goto err_irq; - } - ret = register_netdev(ndev); if (ret) { dev_err(&spi->dev, "failed to register network device\n"); @@ -1549,11 +1552,7 @@ static int ks8851_probe(struct spi_device *spi) return 0; - err_netdev: - free_irq(ndev->irq, ks); - -err_irq: err_id: if (gpio_is_valid(gpio)) gpio_set_value(gpio, 0); @@ -1574,7 +1573,6 @@ static int ks8851_remove(struct spi_device *spi) dev_info(&spi->dev, "remove\n"); unregister_netdev(priv->netdev); - free_irq(spi->irq, priv); if (gpio_is_valid(priv->gpio)) gpio_set_value(priv->gpio, 0); regulator_disable(priv->vdd_reg); From 9624bafa5f6418b9ca5b3f66d1f6a6a2e8bf6d4c Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Wed, 20 Mar 2019 15:02:00 +0100 Subject: [PATCH 724/855] net: ks8851: Set initial carrier state to down The ks8851 chip's initial carrier state is down. A Link Change Interrupt is signaled once interrupts are enabled if the carrier is up. 
The ks8851 driver has it backwards by assuming that the initial carrier state is up. The state is therefore misrepresented if the interface is opened with no cable attached. Fix it. The Link Change interrupt is sometimes not signaled unless the P1MBSR register (which contains the Link Status bit) is read on ->ndo_open(). This might be a hardware erratum. Read the register by calling mii_check_link(), which has the desirable side effect of setting the carrier state to down if the cable was detached while the interface was closed. Signed-off-by: Lukas Wunner Cc: Frank Pavlic Cc: Ben Dooks Cc: Tristram Ha Signed-off-by: David S. Miller --- drivers/net/ethernet/micrel/ks8851.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/micrel/ks8851.c b/drivers/net/ethernet/micrel/ks8851.c index c9faec4c5b25..b83b070a9eec 100644 --- a/drivers/net/ethernet/micrel/ks8851.c +++ b/drivers/net/ethernet/micrel/ks8851.c @@ -858,6 +858,7 @@ static int ks8851_net_open(struct net_device *dev) netif_dbg(ks, ifup, ks->netdev, "network device up\n"); mutex_unlock(&ks->lock); + mii_check_link(&ks->mii); return 0; } @@ -1519,6 +1520,7 @@ static int ks8851_probe(struct spi_device *spi) spi_set_drvdata(spi, ks); + netif_carrier_off(ks->netdev); ndev->if_port = IF_PORT_100BASET; ndev->netdev_ops = &ks8851_netdev_ops; ndev->irq = spi->irq; From cbda74a12c4b738feb90752fbca3648d24646079 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Wed, 20 Mar 2019 15:02:00 +0100 Subject: [PATCH 725/855] net: ks8851: Fix register macro misnomers In the header file accompanying the ks8851 driver, the P1SCLMD register macros are misnamed, they actually pertain to the P1CR register. The P1CR macros in turn pertain to the P1SR register, see pages 65 to 68 of the spec: http://www.hqchip.com/uploads/pdf/201703/47c98946d6c97a4766e14db3f24955f2.pdf The misnomers have no negative consequences so far because the macros aren't used by ks8851.c, but that's about to change. Signed-off-by: Lukas Wunner Cc: Frank Pavlic Cc: Ben Dooks Cc: Tristram Ha Signed-off-by: David S. 
Miller --- drivers/net/ethernet/micrel/ks8851.h | 52 +++++++++++++++------------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/drivers/net/ethernet/micrel/ks8851.h b/drivers/net/ethernet/micrel/ks8851.h index 852256ef1f22..1050fa48b73c 100644 --- a/drivers/net/ethernet/micrel/ks8851.h +++ b/drivers/net/ethernet/micrel/ks8851.h @@ -257,33 +257,35 @@ #define KS_P1ANLPR 0xEE #define KS_P1SCLMD 0xF4 -#define P1SCLMD_LEDOFF (1 << 15) -#define P1SCLMD_TXIDS (1 << 14) -#define P1SCLMD_RESTARTAN (1 << 13) -#define P1SCLMD_DISAUTOMDIX (1 << 10) -#define P1SCLMD_FORCEMDIX (1 << 9) -#define P1SCLMD_AUTONEGEN (1 << 7) -#define P1SCLMD_FORCE100 (1 << 6) -#define P1SCLMD_FORCEFDX (1 << 5) -#define P1SCLMD_ADV_FLOW (1 << 4) -#define P1SCLMD_ADV_100BT_FDX (1 << 3) -#define P1SCLMD_ADV_100BT_HDX (1 << 2) -#define P1SCLMD_ADV_10BT_FDX (1 << 1) -#define P1SCLMD_ADV_10BT_HDX (1 << 0) #define KS_P1CR 0xF6 -#define P1CR_HP_MDIX (1 << 15) -#define P1CR_REV_POL (1 << 13) -#define P1CR_OP_100M (1 << 10) -#define P1CR_OP_FDX (1 << 9) -#define P1CR_OP_MDI (1 << 7) -#define P1CR_AN_DONE (1 << 6) -#define P1CR_LINK_GOOD (1 << 5) -#define P1CR_PNTR_FLOW (1 << 4) -#define P1CR_PNTR_100BT_FDX (1 << 3) -#define P1CR_PNTR_100BT_HDX (1 << 2) -#define P1CR_PNTR_10BT_FDX (1 << 1) -#define P1CR_PNTR_10BT_HDX (1 << 0) +#define P1CR_LEDOFF (1 << 15) +#define P1CR_TXIDS (1 << 14) +#define P1CR_RESTARTAN (1 << 13) +#define P1CR_DISAUTOMDIX (1 << 10) +#define P1CR_FORCEMDIX (1 << 9) +#define P1CR_AUTONEGEN (1 << 7) +#define P1CR_FORCE100 (1 << 6) +#define P1CR_FORCEFDX (1 << 5) +#define P1CR_ADV_FLOW (1 << 4) +#define P1CR_ADV_100BT_FDX (1 << 3) +#define P1CR_ADV_100BT_HDX (1 << 2) +#define P1CR_ADV_10BT_FDX (1 << 1) +#define P1CR_ADV_10BT_HDX (1 << 0) + +#define KS_P1SR 0xF8 +#define P1SR_HP_MDIX (1 << 15) +#define P1SR_REV_POL (1 << 13) +#define P1SR_OP_100M (1 << 10) +#define P1SR_OP_FDX (1 << 9) +#define P1SR_OP_MDI (1 << 7) +#define P1SR_AN_DONE (1 << 6) +#define P1SR_LINK_GOOD (1 << 5) +#define P1SR_PNTR_FLOW (1 << 4) +#define P1SR_PNTR_100BT_FDX (1 << 3) +#define P1SR_PNTR_100BT_HDX (1 << 2) +#define P1SR_PNTR_10BT_FDX (1 << 1) +#define P1SR_PNTR_10BT_HDX (1 << 0) /* TX Frame control */ From aae079aa76d0cc3e679db31370364cb87a405651 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Wed, 20 Mar 2019 15:02:00 +0100 Subject: [PATCH 726/855] net: ks8851: Deduplicate register macros The ks8851 chip is sold either with an SPI interface (KSZ8851SNL) or with a so-called non-PCI interface (KSZ8851-16MLL). When the driver for the latter was introduced with commit a55c0a0ed415 ("drivers/net: ks8851_mll ethernet network driver"), it duplicated the register macros introduced by the driver for the former with commit 3ba81f3ece3c ("net: Micrel KS8851 SPI network driver"). The chips are almost identical, so the duplication seems unwarranted. There are a handful of bits which are in use on the KSZ8851-16MLL but reserved on the KSZ8851SNL, and vice-versa, but there are no actual collisions. Thus, remove the duplicate definitions from the KSZ8851-16MLL driver. Mark all bits which differ between the two chips. Move the SPI frame opcodes, which are specific to KSZ8851SNL, to its driver. The KSZ8851-16MLL driver added a RXFCTR_THRESHOLD_MASK macro which is a duplication of the RXFCTR_RXFCT_MASK macro, rename it where it's used. Same for P1MBCR_FORCE_FDX, which duplicates the BMCR_FULLDPLX macro and OBCR_ODS_16MA, which duplicates OBCR_ODS_16mA. 
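For illustration, with the register definitions shared in ks8851.h, link state can now be tested through the corrected names; a minimal sketch (assuming the SPI driver's ks8851_rdreg16() accessor, not code taken from either driver):

#include "ks8851.h"

/* Sketch: after the rename, the link status bits live in P1SR, while
 * P1CR holds the port control bits.
 */
static bool example_link_is_up(struct ks8851_net *ks)
{
	return ks8851_rdreg16(ks, KS_P1SR) & P1SR_LINK_GOOD;
}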
Signed-off-by: Lukas Wunner Cc: Frank Pavlic Cc: Ben Dooks Cc: Tristram Ha Signed-off-by: David S. Miller --- drivers/net/ethernet/micrel/ks8851.c | 6 + drivers/net/ethernet/micrel/ks8851.h | 41 +-- drivers/net/ethernet/micrel/ks8851_mll.c | 317 +---------------------- 3 files changed, 34 insertions(+), 330 deletions(-) diff --git a/drivers/net/ethernet/micrel/ks8851.c b/drivers/net/ethernet/micrel/ks8851.c index b83b070a9eec..7849119d407a 100644 --- a/drivers/net/ethernet/micrel/ks8851.c +++ b/drivers/net/ethernet/micrel/ks8851.c @@ -142,6 +142,12 @@ struct ks8851_net { static int msg_enable; +/* SPI frame opcodes */ +#define KS_SPIOP_RD (0x00) +#define KS_SPIOP_WR (0x40) +#define KS_SPIOP_RXFIFO (0x80) +#define KS_SPIOP_TXFIFO (0xC0) + /* shift for byte-enable data */ #define BYTE_EN(_x) ((_x) << 2) diff --git a/drivers/net/ethernet/micrel/ks8851.h b/drivers/net/ethernet/micrel/ks8851.h index 1050fa48b73c..23da1e3ee429 100644 --- a/drivers/net/ethernet/micrel/ks8851.h +++ b/drivers/net/ethernet/micrel/ks8851.h @@ -11,9 +11,15 @@ */ #define KS_CCR 0x08 +#define CCR_LE (1 << 10) /* KSZ8851-16MLL */ #define CCR_EEPROM (1 << 9) -#define CCR_SPI (1 << 8) -#define CCR_32PIN (1 << 0) +#define CCR_SPI (1 << 8) /* KSZ8851SNL */ +#define CCR_8BIT (1 << 7) /* KSZ8851-16MLL */ +#define CCR_16BIT (1 << 6) /* KSZ8851-16MLL */ +#define CCR_32BIT (1 << 5) /* KSZ8851-16MLL */ +#define CCR_SHARED (1 << 4) /* KSZ8851-16MLL */ +#define CCR_48PIN (1 << 1) /* KSZ8851-16MLL */ +#define CCR_32PIN (1 << 0) /* KSZ8851SNL */ /* MAC address registers */ #define KS_MAR(_m) (0x15 - (_m)) @@ -112,13 +118,13 @@ #define RXCR1_RXE (1 << 0) #define KS_RXCR2 0x76 -#define RXCR2_SRDBL_MASK (0x7 << 5) -#define RXCR2_SRDBL_SHIFT (5) -#define RXCR2_SRDBL_4B (0x0 << 5) -#define RXCR2_SRDBL_8B (0x1 << 5) -#define RXCR2_SRDBL_16B (0x2 << 5) -#define RXCR2_SRDBL_32B (0x3 << 5) -#define RXCR2_SRDBL_FRAME (0x4 << 5) +#define RXCR2_SRDBL_MASK (0x7 << 5) /* KSZ8851SNL */ +#define RXCR2_SRDBL_SHIFT (5) /* KSZ8851SNL */ +#define RXCR2_SRDBL_4B (0x0 << 5) /* KSZ8851SNL */ +#define RXCR2_SRDBL_8B (0x1 << 5) /* KSZ8851SNL */ +#define RXCR2_SRDBL_16B (0x2 << 5) /* KSZ8851SNL */ +#define RXCR2_SRDBL_32B (0x3 << 5) /* KSZ8851SNL */ +#define RXCR2_SRDBL_FRAME (0x4 << 5) /* KSZ8851SNL */ #define RXCR2_IUFFP (1 << 4) #define RXCR2_RXIUFCEZ (1 << 3) #define RXCR2_UDPLFE (1 << 2) @@ -143,8 +149,10 @@ #define RXFSHR_RXCE (1 << 0) #define KS_RXFHBCR 0x7E +#define RXFHBCR_CNT_MASK (0xfff << 0) + #define KS_TXQCR 0x80 -#define TXQCR_AETFE (1 << 2) +#define TXQCR_AETFE (1 << 2) /* KSZ8851SNL */ #define TXQCR_TXQMAM (1 << 1) #define TXQCR_METFE (1 << 0) @@ -167,6 +175,10 @@ #define KS_RXFDPR 0x86 #define RXFDPR_RXFPAI (1 << 14) +#define RXFDPR_WST (1 << 12) /* KSZ8851-16MLL */ +#define RXFDPR_EMS (1 << 11) /* KSZ8851-16MLL */ +#define RXFDPR_RXFP_MASK (0x7ff << 0) +#define RXFDPR_RXFP_SHIFT (0) #define KS_RXDTTR 0x8C #define KS_RXDBCTR 0x8E @@ -184,7 +196,7 @@ #define IRQ_RXMPDI (1 << 4) #define IRQ_LDI (1 << 3) #define IRQ_EDI (1 << 2) -#define IRQ_SPIBEI (1 << 1) +#define IRQ_SPIBEI (1 << 1) /* KSZ8851SNL */ #define IRQ_DEDI (1 << 0) #define KS_RXFCTR 0x9C @@ -288,13 +300,6 @@ #define P1SR_PNTR_10BT_HDX (1 << 0) /* TX Frame control */ - #define TXFR_TXIC (1 << 15) #define TXFR_TXFID_MASK (0x3f << 0) #define TXFR_TXFID_SHIFT (0) - -/* SPI frame opcodes */ -#define KS_SPIOP_RD (0x00) -#define KS_SPIOP_WR (0x40) -#define KS_SPIOP_RXFIFO (0x80) -#define KS_SPIOP_TXFIFO (0xC0) diff --git a/drivers/net/ethernet/micrel/ks8851_mll.c 
b/drivers/net/ethernet/micrel/ks8851_mll.c index 35f8c9ef204d..c946841c0a06 100644 --- a/drivers/net/ethernet/micrel/ks8851_mll.c +++ b/drivers/net/ethernet/micrel/ks8851_mll.c @@ -40,6 +40,8 @@ #include #include +#include "ks8851.h" + #define DRV_NAME "ks8851_mll" static u8 KS_DEFAULT_MAC_ADDRESS[] = { 0x00, 0x10, 0xA1, 0x86, 0x95, 0x11 }; @@ -48,319 +50,10 @@ static u8 KS_DEFAULT_MAC_ADDRESS[] = { 0x00, 0x10, 0xA1, 0x86, 0x95, 0x11 }; #define TX_BUF_SIZE 2000 #define RX_BUF_SIZE 2000 -#define KS_CCR 0x08 -#define CCR_EEPROM (1 << 9) -#define CCR_SPI (1 << 8) -#define CCR_8BIT (1 << 7) -#define CCR_16BIT (1 << 6) -#define CCR_32BIT (1 << 5) -#define CCR_SHARED (1 << 4) -#define CCR_32PIN (1 << 0) - -/* MAC address registers */ -#define KS_MARL 0x10 -#define KS_MARM 0x12 -#define KS_MARH 0x14 - -#define KS_OBCR 0x20 -#define OBCR_ODS_16MA (1 << 6) - -#define KS_EEPCR 0x22 -#define EEPCR_EESA (1 << 4) -#define EEPCR_EESB (1 << 3) -#define EEPCR_EEDO (1 << 2) -#define EEPCR_EESCK (1 << 1) -#define EEPCR_EECS (1 << 0) - -#define KS_MBIR 0x24 -#define MBIR_TXMBF (1 << 12) -#define MBIR_TXMBFA (1 << 11) -#define MBIR_RXMBF (1 << 4) -#define MBIR_RXMBFA (1 << 3) - -#define KS_GRR 0x26 -#define GRR_QMU (1 << 1) -#define GRR_GSR (1 << 0) - -#define KS_WFCR 0x2A -#define WFCR_MPRXE (1 << 7) -#define WFCR_WF3E (1 << 3) -#define WFCR_WF2E (1 << 2) -#define WFCR_WF1E (1 << 1) -#define WFCR_WF0E (1 << 0) - -#define KS_WF0CRC0 0x30 -#define KS_WF0CRC1 0x32 -#define KS_WF0BM0 0x34 -#define KS_WF0BM1 0x36 -#define KS_WF0BM2 0x38 -#define KS_WF0BM3 0x3A - -#define KS_WF1CRC0 0x40 -#define KS_WF1CRC1 0x42 -#define KS_WF1BM0 0x44 -#define KS_WF1BM1 0x46 -#define KS_WF1BM2 0x48 -#define KS_WF1BM3 0x4A - -#define KS_WF2CRC0 0x50 -#define KS_WF2CRC1 0x52 -#define KS_WF2BM0 0x54 -#define KS_WF2BM1 0x56 -#define KS_WF2BM2 0x58 -#define KS_WF2BM3 0x5A - -#define KS_WF3CRC0 0x60 -#define KS_WF3CRC1 0x62 -#define KS_WF3BM0 0x64 -#define KS_WF3BM1 0x66 -#define KS_WF3BM2 0x68 -#define KS_WF3BM3 0x6A - -#define KS_TXCR 0x70 -#define TXCR_TCGICMP (1 << 8) -#define TXCR_TCGUDP (1 << 7) -#define TXCR_TCGTCP (1 << 6) -#define TXCR_TCGIP (1 << 5) -#define TXCR_FTXQ (1 << 4) -#define TXCR_TXFCE (1 << 3) -#define TXCR_TXPE (1 << 2) -#define TXCR_TXCRC (1 << 1) -#define TXCR_TXE (1 << 0) - -#define KS_TXSR 0x72 -#define TXSR_TXLC (1 << 13) -#define TXSR_TXMC (1 << 12) -#define TXSR_TXFID_MASK (0x3f << 0) -#define TXSR_TXFID_SHIFT (0) -#define TXSR_TXFID_GET(_v) (((_v) >> 0) & 0x3f) - - -#define KS_RXCR1 0x74 -#define RXCR1_FRXQ (1 << 15) -#define RXCR1_RXUDPFCC (1 << 14) -#define RXCR1_RXTCPFCC (1 << 13) -#define RXCR1_RXIPFCC (1 << 12) -#define RXCR1_RXPAFMA (1 << 11) -#define RXCR1_RXFCE (1 << 10) -#define RXCR1_RXEFE (1 << 9) -#define RXCR1_RXMAFMA (1 << 8) -#define RXCR1_RXBE (1 << 7) -#define RXCR1_RXME (1 << 6) -#define RXCR1_RXUE (1 << 5) -#define RXCR1_RXAE (1 << 4) -#define RXCR1_RXINVF (1 << 1) -#define RXCR1_RXE (1 << 0) #define RXCR1_FILTER_MASK (RXCR1_RXINVF | RXCR1_RXAE | \ RXCR1_RXMAFMA | RXCR1_RXPAFMA) - -#define KS_RXCR2 0x76 -#define RXCR2_SRDBL_MASK (0x7 << 5) -#define RXCR2_SRDBL_SHIFT (5) -#define RXCR2_SRDBL_4B (0x0 << 5) -#define RXCR2_SRDBL_8B (0x1 << 5) -#define RXCR2_SRDBL_16B (0x2 << 5) -#define RXCR2_SRDBL_32B (0x3 << 5) -/* #define RXCR2_SRDBL_FRAME (0x4 << 5) */ -#define RXCR2_IUFFP (1 << 4) -#define RXCR2_RXIUFCEZ (1 << 3) -#define RXCR2_UDPLFE (1 << 2) -#define RXCR2_RXICMPFCC (1 << 1) -#define RXCR2_RXSAF (1 << 0) - -#define KS_TXMIR 0x78 - -#define KS_RXFHSR 0x7C -#define RXFSHR_RXFV (1 << 
15) -#define RXFSHR_RXICMPFCS (1 << 13) -#define RXFSHR_RXIPFCS (1 << 12) -#define RXFSHR_RXTCPFCS (1 << 11) -#define RXFSHR_RXUDPFCS (1 << 10) -#define RXFSHR_RXBF (1 << 7) -#define RXFSHR_RXMF (1 << 6) -#define RXFSHR_RXUF (1 << 5) -#define RXFSHR_RXMR (1 << 4) -#define RXFSHR_RXFT (1 << 3) -#define RXFSHR_RXFTL (1 << 2) -#define RXFSHR_RXRF (1 << 1) -#define RXFSHR_RXCE (1 << 0) -#define RXFSHR_ERR (RXFSHR_RXCE | RXFSHR_RXRF |\ - RXFSHR_RXFTL | RXFSHR_RXMR |\ - RXFSHR_RXICMPFCS | RXFSHR_RXIPFCS |\ - RXFSHR_RXTCPFCS) -#define KS_RXFHBCR 0x7E -#define RXFHBCR_CNT_MASK 0x0FFF - -#define KS_TXQCR 0x80 -#define TXQCR_AETFE (1 << 2) -#define TXQCR_TXQMAM (1 << 1) -#define TXQCR_METFE (1 << 0) - -#define KS_RXQCR 0x82 -#define RXQCR_RXDTTS (1 << 12) -#define RXQCR_RXDBCTS (1 << 11) -#define RXQCR_RXFCTS (1 << 10) -#define RXQCR_RXIPHTOE (1 << 9) -#define RXQCR_RXDTTE (1 << 7) -#define RXQCR_RXDBCTE (1 << 6) -#define RXQCR_RXFCTE (1 << 5) -#define RXQCR_ADRFE (1 << 4) -#define RXQCR_SDA (1 << 3) -#define RXQCR_RRXEF (1 << 0) #define RXQCR_CMD_CNTL (RXQCR_RXFCTE|RXQCR_ADRFE) -#define KS_TXFDPR 0x84 -#define TXFDPR_TXFPAI (1 << 14) -#define TXFDPR_TXFP_MASK (0x7ff << 0) -#define TXFDPR_TXFP_SHIFT (0) - -#define KS_RXFDPR 0x86 -#define RXFDPR_RXFPAI (1 << 14) - -#define KS_RXDTTR 0x8C -#define KS_RXDBCTR 0x8E - -#define KS_IER 0x90 -#define KS_ISR 0x92 -#define IRQ_LCI (1 << 15) -#define IRQ_TXI (1 << 14) -#define IRQ_RXI (1 << 13) -#define IRQ_RXOI (1 << 11) -#define IRQ_TXPSI (1 << 9) -#define IRQ_RXPSI (1 << 8) -#define IRQ_TXSAI (1 << 6) -#define IRQ_RXWFDI (1 << 5) -#define IRQ_RXMPDI (1 << 4) -#define IRQ_LDI (1 << 3) -#define IRQ_EDI (1 << 2) -#define IRQ_SPIBEI (1 << 1) -#define IRQ_DEDI (1 << 0) - -#define KS_RXFCTR 0x9C -#define RXFCTR_THRESHOLD_MASK 0x00FF - -#define KS_RXFC 0x9D -#define RXFCTR_RXFC_MASK (0xff << 8) -#define RXFCTR_RXFC_SHIFT (8) -#define RXFCTR_RXFC_GET(_v) (((_v) >> 8) & 0xff) -#define RXFCTR_RXFCT_MASK (0xff << 0) -#define RXFCTR_RXFCT_SHIFT (0) - -#define KS_TXNTFSR 0x9E - -#define KS_MAHTR0 0xA0 -#define KS_MAHTR1 0xA2 -#define KS_MAHTR2 0xA4 -#define KS_MAHTR3 0xA6 - -#define KS_FCLWR 0xB0 -#define KS_FCHWR 0xB2 -#define KS_FCOWR 0xB4 - -#define KS_CIDER 0xC0 -#define CIDER_ID 0x8870 -#define CIDER_REV_MASK (0x7 << 1) -#define CIDER_REV_SHIFT (1) -#define CIDER_REV_GET(_v) (((_v) >> 1) & 0x7) - -#define KS_CGCR 0xC6 -#define KS_IACR 0xC8 -#define IACR_RDEN (1 << 12) -#define IACR_TSEL_MASK (0x3 << 10) -#define IACR_TSEL_SHIFT (10) -#define IACR_TSEL_MIB (0x3 << 10) -#define IACR_ADDR_MASK (0x1f << 0) -#define IACR_ADDR_SHIFT (0) - -#define KS_IADLR 0xD0 -#define KS_IAHDR 0xD2 - -#define KS_PMECR 0xD4 -#define PMECR_PME_DELAY (1 << 14) -#define PMECR_PME_POL (1 << 12) -#define PMECR_WOL_WAKEUP (1 << 11) -#define PMECR_WOL_MAGICPKT (1 << 10) -#define PMECR_WOL_LINKUP (1 << 9) -#define PMECR_WOL_ENERGY (1 << 8) -#define PMECR_AUTO_WAKE_EN (1 << 7) -#define PMECR_WAKEUP_NORMAL (1 << 6) -#define PMECR_WKEVT_MASK (0xf << 2) -#define PMECR_WKEVT_SHIFT (2) -#define PMECR_WKEVT_GET(_v) (((_v) >> 2) & 0xf) -#define PMECR_WKEVT_ENERGY (0x1 << 2) -#define PMECR_WKEVT_LINK (0x2 << 2) -#define PMECR_WKEVT_MAGICPKT (0x4 << 2) -#define PMECR_WKEVT_FRAME (0x8 << 2) -#define PMECR_PM_MASK (0x3 << 0) -#define PMECR_PM_SHIFT (0) -#define PMECR_PM_NORMAL (0x0 << 0) -#define PMECR_PM_ENERGY (0x1 << 0) -#define PMECR_PM_SOFTDOWN (0x2 << 0) -#define PMECR_PM_POWERSAVE (0x3 << 0) - -/* Standard MII PHY data */ -#define KS_P1MBCR 0xE4 -#define P1MBCR_FORCE_FDX (1 << 8) - -#define 
KS_P1MBSR 0xE6 -#define P1MBSR_AN_COMPLETE (1 << 5) -#define P1MBSR_AN_CAPABLE (1 << 3) -#define P1MBSR_LINK_UP (1 << 2) - -#define KS_PHY1ILR 0xE8 -#define KS_PHY1IHR 0xEA -#define KS_P1ANAR 0xEC -#define KS_P1ANLPR 0xEE - -#define KS_P1SCLMD 0xF4 -#define P1SCLMD_LEDOFF (1 << 15) -#define P1SCLMD_TXIDS (1 << 14) -#define P1SCLMD_RESTARTAN (1 << 13) -#define P1SCLMD_DISAUTOMDIX (1 << 10) -#define P1SCLMD_FORCEMDIX (1 << 9) -#define P1SCLMD_AUTONEGEN (1 << 7) -#define P1SCLMD_FORCE100 (1 << 6) -#define P1SCLMD_FORCEFDX (1 << 5) -#define P1SCLMD_ADV_FLOW (1 << 4) -#define P1SCLMD_ADV_100BT_FDX (1 << 3) -#define P1SCLMD_ADV_100BT_HDX (1 << 2) -#define P1SCLMD_ADV_10BT_FDX (1 << 1) -#define P1SCLMD_ADV_10BT_HDX (1 << 0) - -#define KS_P1CR 0xF6 -#define P1CR_HP_MDIX (1 << 15) -#define P1CR_REV_POL (1 << 13) -#define P1CR_OP_100M (1 << 10) -#define P1CR_OP_FDX (1 << 9) -#define P1CR_OP_MDI (1 << 7) -#define P1CR_AN_DONE (1 << 6) -#define P1CR_LINK_GOOD (1 << 5) -#define P1CR_PNTR_FLOW (1 << 4) -#define P1CR_PNTR_100BT_FDX (1 << 3) -#define P1CR_PNTR_100BT_HDX (1 << 2) -#define P1CR_PNTR_10BT_FDX (1 << 1) -#define P1CR_PNTR_10BT_HDX (1 << 0) - -/* TX Frame control */ - -#define TXFR_TXIC (1 << 15) -#define TXFR_TXFID_MASK (0x3f << 0) -#define TXFR_TXFID_SHIFT (0) - -#define KS_P1SR 0xF8 -#define P1SR_HP_MDIX (1 << 15) -#define P1SR_REV_POL (1 << 13) -#define P1SR_OP_100M (1 << 10) -#define P1SR_OP_FDX (1 << 9) -#define P1SR_OP_MDI (1 << 7) -#define P1SR_AN_DONE (1 << 6) -#define P1SR_LINK_GOOD (1 << 5) -#define P1SR_PNTR_FLOW (1 << 4) -#define P1SR_PNTR_100BT_FDX (1 << 3) -#define P1SR_PNTR_100BT_HDX (1 << 2) -#define P1SR_PNTR_10BT_FDX (1 << 1) -#define P1SR_PNTR_10BT_HDX (1 << 0) - #define ENUM_BUS_NONE 0 #define ENUM_BUS_8BIT 1 #define ENUM_BUS_16BIT 2 @@ -1475,7 +1168,7 @@ static void ks_setup(struct ks_net *ks) ks_wrreg16(ks, KS_RXFDPR, RXFDPR_RXFPAI); /* Setup Receive Frame Threshold - 1 frame (RXFCTFC) */ - ks_wrreg16(ks, KS_RXFCTR, 1 & RXFCTR_THRESHOLD_MASK); + ks_wrreg16(ks, KS_RXFCTR, 1 & RXFCTR_RXFCT_MASK); /* Setup RxQ Command Control (RXQCR) */ ks->rc_rxqcr = RXQCR_CMD_CNTL; @@ -1488,7 +1181,7 @@ static void ks_setup(struct ks_net *ks) */ w = ks_rdreg16(ks, KS_P1MBCR); - w &= ~P1MBCR_FORCE_FDX; + w &= ~BMCR_FULLDPLX; ks_wrreg16(ks, KS_P1MBCR, w); w = TXCR_TXFCE | TXCR_TXPE | TXCR_TXCRC | TXCR_TCGIP; @@ -1629,7 +1322,7 @@ static int ks8851_probe(struct platform_device *pdev) ks_setup_int(ks); data = ks_rdreg16(ks, KS_OBCR); - ks_wrreg16(ks, KS_OBCR, data | OBCR_ODS_16MA); + ks_wrreg16(ks, KS_OBCR, data | OBCR_ODS_16mA); /* overwriting the default MAC address */ if (pdev->dev.of_node) { From 29ece8b4354f8c5eaee798a3d8a1b356efee426f Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Mon, 18 Mar 2019 22:44:41 +0800 Subject: [PATCH 727/855] block: add BLK_MQ_POLL_CLASSIC for hybrid poll and return EINVAL for unexpected value For q->poll_nsec == -1, means doing classic poll, not hybrid poll. We introduce a new flag BLK_MQ_POLL_CLASSIC to replace -1, which may make code much easier to read. Additionally, since val is an int obtained with kstrtoint(), val can be a negative value other than -1, so return -EINVAL for that case. Thanks to Damien Le Moal for some good suggestion. 
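The resulting semantics of the io_poll_delay sysfs attribute can be summarized with a small sketch (simplified from the store path in the diff below; example_store_poll_delay is not the real function name):

/* val == BLK_MQ_POLL_CLASSIC (-1): classic (busy-wait) polling
 * val >= 0: hybrid polling, val given in microseconds
 * any other negative val: rejected
 */
static int example_store_poll_delay(struct request_queue *q, int val)
{
	if (val == BLK_MQ_POLL_CLASSIC)
		q->poll_nsec = BLK_MQ_POLL_CLASSIC;
	else if (val >= 0)
		q->poll_nsec = val * 1000;
	else
		return -EINVAL;
	return 0;
}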
Reviewed-by: Damien Le Moal Signed-off-by: Yufen Yu Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 ++-- block/blk-sysfs.c | 12 +++++++----- include/linux/blkdev.h | 3 +++ 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index ea01c23b58a3..76a3f78c566a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2856,7 +2856,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, /* * Default to classic polling */ - q->poll_nsec = -1; + q->poll_nsec = BLK_MQ_POLL_CLASSIC; blk_mq_init_cpu_queues(q, set->nr_hw_queues); blk_mq_add_queue_tag_set(set, q); @@ -3391,7 +3391,7 @@ static bool blk_mq_poll_hybrid(struct request_queue *q, { struct request *rq; - if (q->poll_nsec == -1) + if (q->poll_nsec == BLK_MQ_POLL_CLASSIC) return false; if (!blk_qc_t_is_internal(cookie)) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 59685918167e..422327089e0f 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -360,8 +360,8 @@ static ssize_t queue_poll_delay_show(struct request_queue *q, char *page) { int val; - if (q->poll_nsec == -1) - val = -1; + if (q->poll_nsec == BLK_MQ_POLL_CLASSIC) + val = BLK_MQ_POLL_CLASSIC; else val = q->poll_nsec / 1000; @@ -380,10 +380,12 @@ static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page, if (err < 0) return err; - if (val == -1) - q->poll_nsec = -1; - else + if (val == BLK_MQ_POLL_CLASSIC) + q->poll_nsec = BLK_MQ_POLL_CLASSIC; + else if (val >= 0) q->poll_nsec = val * 1000; + else + return -EINVAL; return count; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0de92b29f589..5c58a3b2bf00 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -50,6 +50,9 @@ struct blk_stat_callback; /* Must be consistent with blk_mq_poll_stats_bkt() */ #define BLK_MQ_POLL_STATS_BKTS 16 +/* Doing classic polling */ +#define BLK_MQ_POLL_CLASSIC -1 + /* * Maximum number of blkcg policies allowed to be registered concurrently. * Defined here to simplify include dependency. From e6c987120e24cb913cb7bd4e675129a30fa49e0d Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 20 Mar 2019 13:14:37 -0700 Subject: [PATCH 728/855] block: Unexport blk_mq_add_to_requeue_list() This function is not used outside the block layer core. Hence unexport it. 
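After this change only code inside block/ that includes the core-private blk-mq.h can still reach the function; a sketch of such an in-core caller (hypothetical helper, prototype as in the patch below):

#include "blk-mq.h"	/* block-core private header, not <linux/blk-mq.h> */

/* Queue the request at the head of the requeue list and kick the requeue
 * work; out-of-tree modules can no longer link against this symbol once
 * the EXPORT_SYMBOL() is gone.
 */
static void example_requeue_at_head(struct request *rq)
{
	blk_mq_add_to_requeue_list(rq, true, true);
}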
Cc: Christoph Hellwig Cc: Ming Lei Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-mq.c | 1 - block/blk-mq.h | 2 ++ include/linux/blk-mq.h | 2 -- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 76a3f78c566a..70b210a308c4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -782,7 +782,6 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, if (kick_requeue_list) blk_mq_kick_requeue_list(q); } -EXPORT_SYMBOL(blk_mq_add_to_requeue_list); void blk_mq_kick_requeue_list(struct request_queue *q) { diff --git a/block/blk-mq.h b/block/blk-mq.h index c11353a3749d..0ed8e5a8729f 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -41,6 +41,8 @@ void blk_mq_free_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool); +void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, + bool kick_requeue_list); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); bool blk_mq_get_driver_tag(struct request *rq); struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 35359697318b..cb2aa7ecafff 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -299,8 +299,6 @@ void blk_mq_end_request(struct request *rq, blk_status_t error); void __blk_mq_end_request(struct request *rq, blk_status_t error); void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list); -void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, - bool kick_requeue_list); void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); bool blk_mq_complete_request(struct request *rq); From 373e915cd8e84544609eced57a44fbc084f8d60f Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 20 Mar 2019 13:15:01 -0700 Subject: [PATCH 729/855] blk-iolatency: #include "blk.h" This patch avoids that the following warning is reported when building with W=1: block/blk-iolatency.c:734:5: warning: no previous prototype for 'blk_iolatency_init' [-Wmissing-prototypes] Cc: Josef Bacik Fixes: d70675121546 ("block: introduce blk-iolatency io controller") # v4.19 Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-iolatency.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 2620baa1f699..507212d75ee2 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -75,6 +75,7 @@ #include #include "blk-rq-qos.h" #include "blk-stat.h" +#include "blk.h" #define DEFAULT_SCALE_COOKIE 1000000U From 537d71b3f774c3e825540ab626bad6c0ed2b5ff7 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 20 Mar 2019 13:18:45 -0700 Subject: [PATCH 730/855] blkcg: Fix kernel-doc warnings Avoid that the following warnings are reported when building with W=1: block/blk-cgroup.c:1755: warning: Function parameter or member 'q' not described in 'blkcg_schedule_throttle' block/blk-cgroup.c:1755: warning: Function parameter or member 'use_memdelay' not described in 'blkcg_schedule_throttle' block/blk-cgroup.c:1779: warning: Function parameter or member 'blkg' not described in 'blkcg_add_delay' block/blk-cgroup.c:1779: warning: Function parameter or member 'now' not described in 'blkcg_add_delay' block/blk-cgroup.c:1779: warning: 
Function parameter or member 'delta' not described in 'blkcg_add_delay' Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 77f37ef8ef06..617a2b3f7582 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1736,8 +1736,8 @@ out: /** * blkcg_schedule_throttle - this task needs to check for throttling - * @q - the request queue IO was submitted on - * @use_memdelay - do we charge this to memory delay for PSI + * @q: the request queue IO was submitted on + * @use_memdelay: do we charge this to memory delay for PSI * * This is called by the IO controller when we know there's delay accumulated * for the blkg for this task. We do not pass the blkg because there are places @@ -1769,8 +1769,9 @@ void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) /** * blkcg_add_delay - add delay to this blkg - * @now - the current time in nanoseconds - * @delta - how many nanoseconds of delay to add + * @blkg: blkg of interest + * @now: the current time in nanoseconds + * @delta: how many nanoseconds of delay to add * * Charge @delta to the blkg's current delay accumulation. This is used to * throttle tasks if an IO controller thinks we need more throttling. From 64447506f152cf0f88a0fc23140ca1c5f7ff34a8 Mon Sep 17 00:00:00 2001 From: Ioana Ciocoi Radulescu Date: Wed, 20 Mar 2019 14:11:04 +0000 Subject: [PATCH 731/855] dpaa2-eth: Fix possible access beyond end of array Make sure we don't try to enqueue XDP_REDIRECT frames to an inexistent FQ. While it is guaranteed not to have more than one queue per core, having fewer queues than CPUs on an interface is a valid configuration. Fixes: d678be1dc1ec ("dpaa2-eth: add XDP_REDIRECT support") Reported-by: Jesper Dangaard Brouer Signed-off-by: Ioana Radulescu Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 2ba49e959c3f..1a68052abb94 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -1817,7 +1817,7 @@ static int dpaa2_eth_xdp_xmit_frame(struct net_device *net_dev, dpaa2_fd_set_format(&fd, dpaa2_fd_single); dpaa2_fd_set_ctrl(&fd, FD_CTRL_PTA); - fq = &priv->fq[smp_processor_id()]; + fq = &priv->fq[smp_processor_id() % dpaa2_eth_queue_count(priv)]; for (i = 0; i < DPAA2_ETH_ENQUEUE_RETRIES; i++) { err = priv->enqueue(priv, fq, &fd, 0); if (err != -EBUSY) From 7205981e045e752ccf96cf6ddd703a98c59d4339 Mon Sep 17 00:00:00 2001 From: Tyrel Datwyler Date: Wed, 20 Mar 2019 13:41:50 -0500 Subject: [PATCH 732/855] scsi: ibmvscsi: Protect ibmvscsi_head from concurrent modificaiton For each ibmvscsi host created during a probe or destroyed during a remove we either add or remove that host to/from the global ibmvscsi_head list. This runs the risk of concurrent modification. This patch adds a simple spinlock around the list modification calls to prevent concurrent updates as is done similarly in the ibmvfc driver and ipr driver. Fixes: 32d6e4b6e4ea ("scsi: ibmvscsi: add vscsi hosts to global list_head") Cc: # v4.10+ Signed-off-by: Tyrel Datwyler Signed-off-by: Martin K. 
Petersen --- drivers/scsi/ibmvscsi/ibmvscsi.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c index 1135e74646e2..2b22969f3f63 100644 --- a/drivers/scsi/ibmvscsi/ibmvscsi.c +++ b/drivers/scsi/ibmvscsi/ibmvscsi.c @@ -96,6 +96,7 @@ static int client_reserve = 1; static char partition_name[96] = "UNKNOWN"; static unsigned int partition_number = -1; static LIST_HEAD(ibmvscsi_head); +static DEFINE_SPINLOCK(ibmvscsi_driver_lock); static struct scsi_transport_template *ibmvscsi_transport_template; @@ -2270,7 +2271,9 @@ static int ibmvscsi_probe(struct vio_dev *vdev, const struct vio_device_id *id) } dev_set_drvdata(&vdev->dev, hostdata); + spin_lock(&ibmvscsi_driver_lock); list_add_tail(&hostdata->host_list, &ibmvscsi_head); + spin_unlock(&ibmvscsi_driver_lock); return 0; add_srp_port_failed: @@ -2292,7 +2295,9 @@ static int ibmvscsi_probe(struct vio_dev *vdev, const struct vio_device_id *id) static int ibmvscsi_remove(struct vio_dev *vdev) { struct ibmvscsi_host_data *hostdata = dev_get_drvdata(&vdev->dev); + spin_lock(&ibmvscsi_driver_lock); list_del(&hostdata->host_list); + spin_unlock(&ibmvscsi_driver_lock); unmap_persist_bufs(hostdata); release_event_pool(&hostdata->pool, hostdata); ibmvscsi_release_crq_queue(&hostdata->queue, hostdata, From 7f5203c13ba8a7b7f9f6ecfe5a4d5567188d7835 Mon Sep 17 00:00:00 2001 From: Tyrel Datwyler Date: Wed, 20 Mar 2019 13:41:51 -0500 Subject: [PATCH 733/855] scsi: ibmvscsi: Fix empty event pool access during host removal The event pool used for queueing commands is destroyed fairly early in the ibmvscsi_remove() code path. Since, this happens prior to the call so scsi_remove_host() it is possible for further calls to queuecommand to be processed which manifest as a panic due to a NULL pointer dereference as seen here: PANIC: "Unable to handle kernel paging request for data at address 0x00000000" Context process backtrace: DSISR: 0000000042000000 ????Syscall Result: 0000000000000000 4 [c000000002cb3820] memcpy_power7 at c000000000064204 [Link Register] [c000000002cb3820] ibmvscsi_send_srp_event at d000000003ed14a4 5 [c000000002cb3920] ibmvscsi_send_srp_event at d000000003ed14a4 [ibmvscsi] ?(unreliable) 6 [c000000002cb39c0] ibmvscsi_queuecommand at d000000003ed2388 [ibmvscsi] 7 [c000000002cb3a70] scsi_dispatch_cmd at d00000000395c2d8 [scsi_mod] 8 [c000000002cb3af0] scsi_request_fn at d00000000395ef88 [scsi_mod] 9 [c000000002cb3be0] __blk_run_queue at c000000000429860 10 [c000000002cb3c10] blk_delay_work at c00000000042a0ec 11 [c000000002cb3c40] process_one_work at c0000000000dac30 12 [c000000002cb3cd0] worker_thread at c0000000000db110 13 [c000000002cb3d80] kthread at c0000000000e3378 14 [c000000002cb3e30] ret_from_kernel_thread at c00000000000982c The kernel buffer log is overfilled with this log: [11261.952732] ibmvscsi: found no event struct in pool! This patch reorders the operations during host teardown. Start by calling the SRP transport and Scsi_Host remove functions to flush any outstanding work and set the host offline. LLDD teardown follows including destruction of the event pool, freeing the Command Response Queue (CRQ), and unmapping any persistent buffers. The event pool destruction is protected by the scsi_host lock, and the pool is purged prior of any requests for which we never received a response. Finally, move the removal of the scsi host from our global list to the end so that the host is easily locatable for debugging purposes during teardown. 
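Condensed into code, the remove path described above ends up ordered as follows (a sketch assembled from the hunk below, shown here only to make the ordering easy to follow):

	static int ibmvscsi_remove(struct vio_dev *vdev)
	{
		struct ibmvscsi_host_data *hostdata = dev_get_drvdata(&vdev->dev);
		unsigned long flags;

		srp_remove_host(hostdata->host);	/* 1. offline the host first */
		scsi_remove_host(hostdata->host);

		purge_requests(hostdata, DID_ERROR);	/* 2. fail unanswered requests */

		spin_lock_irqsave(hostdata->host->host_lock, flags);
		release_event_pool(&hostdata->pool, hostdata);	/* 3. pool freed under host lock */
		spin_unlock_irqrestore(hostdata->host->host_lock, flags);

		ibmvscsi_release_crq_queue(&hostdata->queue, hostdata, max_events);
		kthread_stop(hostdata->work_thread);	/* 4. LLDD teardown */
		unmap_persist_bufs(hostdata);

		spin_lock(&ibmvscsi_driver_lock);	/* 5. leave the global list last */
		list_del(&hostdata->host_list);
		spin_unlock(&ibmvscsi_driver_lock);

		scsi_host_put(hostdata->host);
		return 0;
	}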
Cc: # v2.6.12+ Signed-off-by: Tyrel Datwyler Signed-off-by: Martin K. Petersen --- drivers/scsi/ibmvscsi/ibmvscsi.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c index 2b22969f3f63..8cec5230fe31 100644 --- a/drivers/scsi/ibmvscsi/ibmvscsi.c +++ b/drivers/scsi/ibmvscsi/ibmvscsi.c @@ -2295,17 +2295,27 @@ static int ibmvscsi_probe(struct vio_dev *vdev, const struct vio_device_id *id) static int ibmvscsi_remove(struct vio_dev *vdev) { struct ibmvscsi_host_data *hostdata = dev_get_drvdata(&vdev->dev); - spin_lock(&ibmvscsi_driver_lock); - list_del(&hostdata->host_list); - spin_unlock(&ibmvscsi_driver_lock); - unmap_persist_bufs(hostdata); + unsigned long flags; + + srp_remove_host(hostdata->host); + scsi_remove_host(hostdata->host); + + purge_requests(hostdata, DID_ERROR); + + spin_lock_irqsave(hostdata->host->host_lock, flags); release_event_pool(&hostdata->pool, hostdata); + spin_unlock_irqrestore(hostdata->host->host_lock, flags); + ibmvscsi_release_crq_queue(&hostdata->queue, hostdata, max_events); kthread_stop(hostdata->work_thread); - srp_remove_host(hostdata->host); - scsi_remove_host(hostdata->host); + unmap_persist_bufs(hostdata); + + spin_lock(&ibmvscsi_driver_lock); + list_del(&hostdata->host_list); + spin_unlock(&ibmvscsi_driver_lock); + scsi_host_put(hostdata->host); return 0; From 6a3b45ada960ac475ec2b4103d43e57943b2b8d3 Mon Sep 17 00:00:00 2001 From: Andrzej Hajda Date: Tue, 19 Mar 2019 14:05:11 +0100 Subject: [PATCH 734/855] drm/exynos/mixer: fix MIXER shadow registry synchronisation code MIXER on Exynos5 SoCs uses different synchronisation method than Exynos4 to update internal state (shadow registers). Apparently the driver implements it incorrectly. The rule should be as follows: - do not request updating registers until previous request was finished, ie. MXR_CFG_LAYER_UPDATE_COUNT must be 0. - before setting registers synchronisation on VSYNC should be turned off, ie. MXR_STATUS_SYNC_ENABLE should be reset, - after finishing MXR_STATUS_SYNC_ENABLE should be set again. The patch hopefully implements it correctly. Below sample kernel log from page fault caused by the bug: [ 25.670038] exynos-sysmmu 14650000.sysmmu: 14450000.mixer: PAGE FAULT occurred at 0x2247b800 [ 25.677888] ------------[ cut here ]------------ [ 25.682164] kernel BUG at ../drivers/iommu/exynos-iommu.c:450! 
[ 25.687971] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP ARM [ 25.693778] Modules linked in: [ 25.696816] CPU: 5 PID: 1553 Comm: fb-release_test Not tainted 5.0.0-rc7-01157-g5f86b1566bdd #136 [ 25.705646] Hardware name: SAMSUNG EXYNOS (Flattened Device Tree) [ 25.711710] PC is at exynos_sysmmu_irq+0x1c0/0x264 [ 25.716470] LR is at lock_is_held_type+0x44/0x64 v2: added missing MXR_CFG_LAYER_UPDATE bit setting in mixer_enable_sync Reported-by: Marian Mihailescu Signed-off-by: Andrzej Hajda Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_mixer.c | 110 +++++++++++++++----------- 1 file changed, 66 insertions(+), 44 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_mixer.c b/drivers/gpu/drm/exynos/exynos_mixer.c index 0573eab0e190..f35e4ab55b27 100644 --- a/drivers/gpu/drm/exynos/exynos_mixer.c +++ b/drivers/gpu/drm/exynos/exynos_mixer.c @@ -20,6 +20,7 @@ #include "regs-vp.h" #include +#include #include #include #include @@ -352,15 +353,62 @@ static void mixer_cfg_vp_blend(struct mixer_context *ctx, unsigned int alpha) mixer_reg_write(ctx, MXR_VIDEO_CFG, val); } -static void mixer_vsync_set_update(struct mixer_context *ctx, bool enable) +static bool mixer_is_synced(struct mixer_context *ctx) { - /* block update on vsync */ - mixer_reg_writemask(ctx, MXR_STATUS, enable ? - MXR_STATUS_SYNC_ENABLE : 0, MXR_STATUS_SYNC_ENABLE); + u32 base, shadow; + if (ctx->mxr_ver == MXR_VER_16_0_33_0 || + ctx->mxr_ver == MXR_VER_128_0_0_184) + return !(mixer_reg_read(ctx, MXR_CFG) & + MXR_CFG_LAYER_UPDATE_COUNT_MASK); + + if (test_bit(MXR_BIT_VP_ENABLED, &ctx->flags) && + vp_reg_read(ctx, VP_SHADOW_UPDATE)) + return false; + + base = mixer_reg_read(ctx, MXR_CFG); + shadow = mixer_reg_read(ctx, MXR_CFG_S); + if (base != shadow) + return false; + + base = mixer_reg_read(ctx, MXR_GRAPHIC_BASE(0)); + shadow = mixer_reg_read(ctx, MXR_GRAPHIC_BASE_S(0)); + if (base != shadow) + return false; + + base = mixer_reg_read(ctx, MXR_GRAPHIC_BASE(1)); + shadow = mixer_reg_read(ctx, MXR_GRAPHIC_BASE_S(1)); + if (base != shadow) + return false; + + return true; +} + +static int mixer_wait_for_sync(struct mixer_context *ctx) +{ + ktime_t timeout = ktime_add_us(ktime_get(), 100000); + + while (!mixer_is_synced(ctx)) { + usleep_range(1000, 2000); + if (ktime_compare(ktime_get(), timeout) > 0) + return -ETIMEDOUT; + } + return 0; +} + +static void mixer_disable_sync(struct mixer_context *ctx) +{ + mixer_reg_writemask(ctx, MXR_STATUS, 0, MXR_STATUS_SYNC_ENABLE); +} + +static void mixer_enable_sync(struct mixer_context *ctx) +{ + if (ctx->mxr_ver == MXR_VER_16_0_33_0 || + ctx->mxr_ver == MXR_VER_128_0_0_184) + mixer_reg_writemask(ctx, MXR_CFG, ~0, MXR_CFG_LAYER_UPDATE); + mixer_reg_writemask(ctx, MXR_STATUS, ~0, MXR_STATUS_SYNC_ENABLE); if (test_bit(MXR_BIT_VP_ENABLED, &ctx->flags)) - vp_reg_write(ctx, VP_SHADOW_UPDATE, enable ? - VP_SHADOW_UPDATE_ENABLE : 0); + vp_reg_write(ctx, VP_SHADOW_UPDATE, VP_SHADOW_UPDATE_ENABLE); } static void mixer_cfg_scan(struct mixer_context *ctx, int width, int height) @@ -498,7 +546,6 @@ static void vp_video_buffer(struct mixer_context *ctx, spin_lock_irqsave(&ctx->reg_slock, flags); - vp_reg_write(ctx, VP_SHADOW_UPDATE, 1); /* interlace or progressive scan mode */ val = (test_bit(MXR_BIT_INTERLACE, &ctx->flags) ? 
~0 : 0); vp_reg_writemask(ctx, VP_MODE, val, VP_MODE_LINE_SKIP); @@ -553,11 +600,6 @@ static void vp_video_buffer(struct mixer_context *ctx, vp_regs_dump(ctx); } -static void mixer_layer_update(struct mixer_context *ctx) -{ - mixer_reg_writemask(ctx, MXR_CFG, ~0, MXR_CFG_LAYER_UPDATE); -} - static void mixer_graph_buffer(struct mixer_context *ctx, struct exynos_drm_plane *plane) { @@ -640,11 +682,6 @@ static void mixer_graph_buffer(struct mixer_context *ctx, mixer_cfg_layer(ctx, win, priority, true); mixer_cfg_gfx_blend(ctx, win, pixel_alpha, state->base.alpha); - /* layer update mandatory for mixer 16.0.33.0 */ - if (ctx->mxr_ver == MXR_VER_16_0_33_0 || - ctx->mxr_ver == MXR_VER_128_0_0_184) - mixer_layer_update(ctx); - spin_unlock_irqrestore(&ctx->reg_slock, flags); mixer_regs_dump(ctx); @@ -709,7 +746,7 @@ static void mixer_win_reset(struct mixer_context *ctx) static irqreturn_t mixer_irq_handler(int irq, void *arg) { struct mixer_context *ctx = arg; - u32 val, base, shadow; + u32 val; spin_lock(&ctx->reg_slock); @@ -723,26 +760,9 @@ static irqreturn_t mixer_irq_handler(int irq, void *arg) val &= ~MXR_INT_STATUS_VSYNC; /* interlace scan need to check shadow register */ - if (test_bit(MXR_BIT_INTERLACE, &ctx->flags)) { - if (test_bit(MXR_BIT_VP_ENABLED, &ctx->flags) && - vp_reg_read(ctx, VP_SHADOW_UPDATE)) - goto out; - - base = mixer_reg_read(ctx, MXR_CFG); - shadow = mixer_reg_read(ctx, MXR_CFG_S); - if (base != shadow) - goto out; - - base = mixer_reg_read(ctx, MXR_GRAPHIC_BASE(0)); - shadow = mixer_reg_read(ctx, MXR_GRAPHIC_BASE_S(0)); - if (base != shadow) - goto out; - - base = mixer_reg_read(ctx, MXR_GRAPHIC_BASE(1)); - shadow = mixer_reg_read(ctx, MXR_GRAPHIC_BASE_S(1)); - if (base != shadow) - goto out; - } + if (test_bit(MXR_BIT_INTERLACE, &ctx->flags) + && !mixer_is_synced(ctx)) + goto out; drm_crtc_handle_vblank(&ctx->crtc->base); } @@ -917,12 +937,14 @@ static void mixer_disable_vblank(struct exynos_drm_crtc *crtc) static void mixer_atomic_begin(struct exynos_drm_crtc *crtc) { - struct mixer_context *mixer_ctx = crtc->ctx; + struct mixer_context *ctx = crtc->ctx; - if (!test_bit(MXR_BIT_POWERED, &mixer_ctx->flags)) + if (!test_bit(MXR_BIT_POWERED, &ctx->flags)) return; - mixer_vsync_set_update(mixer_ctx, false); + if (mixer_wait_for_sync(ctx)) + dev_err(ctx->dev, "timeout waiting for VSYNC\n"); + mixer_disable_sync(ctx); } static void mixer_update_plane(struct exynos_drm_crtc *crtc, @@ -964,7 +986,7 @@ static void mixer_atomic_flush(struct exynos_drm_crtc *crtc) if (!test_bit(MXR_BIT_POWERED, &mixer_ctx->flags)) return; - mixer_vsync_set_update(mixer_ctx, true); + mixer_enable_sync(mixer_ctx); exynos_crtc_handle_event(crtc); } @@ -979,7 +1001,7 @@ static void mixer_enable(struct exynos_drm_crtc *crtc) exynos_drm_pipe_clk_enable(crtc, true); - mixer_vsync_set_update(ctx, false); + mixer_disable_sync(ctx); mixer_reg_writemask(ctx, MXR_STATUS, ~0, MXR_STATUS_SOFT_RESET); @@ -992,7 +1014,7 @@ static void mixer_enable(struct exynos_drm_crtc *crtc) mixer_commit(ctx); - mixer_vsync_set_update(ctx, true); + mixer_enable_sync(ctx); set_bit(MXR_BIT_POWERED, &ctx->flags); } From cba368c1f01c27ed62fca7a853531845d263bb01 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 18 Mar 2019 10:37:13 -0700 Subject: [PATCH 735/855] bpf: Only print ref_obj_id for refcounted reg Naresh reported that test_align fails because of the mismatch at the verbose printout of the register states. The reason is due to the newly added ref_obj_id. ref_obj_id is only useful for refcounted reg. 
Thus, this patch fixes it by only printing ref_obj_id for refcounted reg. While at it, it also uses comma instead of space to separate between "id" and "ref_obj_id". Fixes: 1b986589680a ("bpf: Fix bpf_tcp_sock and bpf_sk_fullsock issue related to bpf_sk_release") Reported-by: Naresh Kamboju Signed-off-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 86f9cd5d1c4e..5aa810882583 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -352,6 +352,14 @@ static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) map_value_has_spin_lock(reg->map_ptr); } +static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) +{ + return type == PTR_TO_SOCKET || + type == PTR_TO_SOCKET_OR_NULL || + type == PTR_TO_TCP_SOCK || + type == PTR_TO_TCP_SOCK_OR_NULL; +} + static bool arg_type_may_be_refcounted(enum bpf_arg_type type) { return type == ARG_PTR_TO_SOCK_COMMON; @@ -451,8 +459,9 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (t == PTR_TO_STACK) verbose(env, ",call_%d", func(env, reg)->callsite); } else { - verbose(env, "(id=%d ref_obj_id=%d", reg->id, - reg->ref_obj_id); + verbose(env, "(id=%d", reg->id); + if (reg_type_may_be_refcounted_or_null(t)) + verbose(env, ",ref_obj_id=%d", reg->ref_obj_id); if (t != SCALAR_VALUE) verbose(env, ",off=%d", reg->off); if (type_is_pkt_pointer(t)) From 667a8f73753908c4d0171e52b71774f9be5d6713 Mon Sep 17 00:00:00 2001 From: Jian-Hong Pan Date: Fri, 15 Mar 2019 17:51:09 +0800 Subject: [PATCH 736/855] ALSA: hda/realtek: Enable headset MIC of Acer AIO with ALC286 Some Acer AIO desktops like Veriton Z6860G, Z4860G and Z4660G cannot record sound from headset MIC. This patch adds the ALC286_FIXUP_ACER_AIO_HEADSET_MIC quirk to fix this issue. 
Fixes: 9f8aefed9623 ("ALSA: hda/realtek: Fix mic issue on Acer AIO Veriton Z4660G") Fixes: b72f936f6b32 ("ALSA: hda/realtek: Fix mic issue on Acer AIO Veriton Z4860G/Z6860G") Signed-off-by: Jian-Hong Pan Reviewed-by: Kailang Yang Cc: Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 384719d5c44e..191830d4fa40 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -5687,6 +5687,7 @@ enum { ALC225_FIXUP_DELL_WYSE_AIO_MIC_NO_PRESENCE, ALC225_FIXUP_WYSE_AUTO_MUTE, ALC225_FIXUP_WYSE_DISABLE_MIC_VREF, + ALC286_FIXUP_ACER_AIO_HEADSET_MIC, }; static const struct hda_fixup alc269_fixups[] = { @@ -6685,6 +6686,16 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC269_FIXUP_HEADSET_MODE_NO_HP_MIC }, + [ALC286_FIXUP_ACER_AIO_HEADSET_MIC] = { + .type = HDA_FIXUP_VERBS, + .v.verbs = (const struct hda_verb[]) { + { 0x20, AC_VERB_SET_COEF_INDEX, 0x4f }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x5029 }, + { } + }, + .chained = true, + .chain_id = ALC286_FIXUP_ACER_AIO_MIC_NO_PRESENCE + }, }; static const struct snd_pci_quirk alc269_fixup_tbl[] = { @@ -6701,9 +6712,9 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1025, 0x079b, "Acer Aspire V5-573G", ALC282_FIXUP_ASPIRE_V5_PINS), SND_PCI_QUIRK(0x1025, 0x102b, "Acer Aspire C24-860", ALC286_FIXUP_ACER_AIO_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1025, 0x106d, "Acer Cloudbook 14", ALC283_FIXUP_CHROME_BOOK), - SND_PCI_QUIRK(0x1025, 0x128f, "Acer Veriton Z6860G", ALC286_FIXUP_ACER_AIO_MIC_NO_PRESENCE), - SND_PCI_QUIRK(0x1025, 0x1290, "Acer Veriton Z4860G", ALC286_FIXUP_ACER_AIO_MIC_NO_PRESENCE), - SND_PCI_QUIRK(0x1025, 0x1291, "Acer Veriton Z4660G", ALC286_FIXUP_ACER_AIO_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1025, 0x128f, "Acer Veriton Z6860G", ALC286_FIXUP_ACER_AIO_HEADSET_MIC), + SND_PCI_QUIRK(0x1025, 0x1290, "Acer Veriton Z4860G", ALC286_FIXUP_ACER_AIO_HEADSET_MIC), + SND_PCI_QUIRK(0x1025, 0x1291, "Acer Veriton Z4660G", ALC286_FIXUP_ACER_AIO_HEADSET_MIC), SND_PCI_QUIRK(0x1025, 0x1330, "Acer TravelMate X514-51T", ALC255_FIXUP_ACER_HEADSET_MIC), SND_PCI_QUIRK(0x1028, 0x0470, "Dell M101z", ALC269_FIXUP_DELL_M101Z), SND_PCI_QUIRK(0x1028, 0x054b, "Dell XPS one 2710", ALC275_FIXUP_DELL_XPS), From 2b77158ffa92b820a0c5da9a3c6ead7aa069c71c Mon Sep 17 00:00:00 2001 From: Alexander Shiyan Date: Sun, 17 Mar 2019 12:58:25 +0300 Subject: [PATCH 737/855] mmc: mxcmmc: "Revert mmc: mxcmmc: handle highmem pages" This reverts commit b189e7589f6d3411e85c6b7ae6eef158f08f388f. 
Unable to handle kernel paging request at virtual address c8358000 pgd = efa405c3 [c8358000] *pgd=00000000 Internal error: Oops: 805 [#1] PREEMPT ARM CPU: 0 PID: 711 Comm: kworker/0:2 Not tainted 4.20.0+ #30 Hardware name: Freescale i.MX27 (Device Tree Support) Workqueue: events mxcmci_datawork PC is at mxcmci_datawork+0xbc/0x2ac LR is at mxcmci_datawork+0xac/0x2ac pc : [] lr : [] psr: 60000013 sp : c6c93f08 ip : 24004180 fp : 00000008 r10: c8358000 r9 : c78b3e24 r8 : c6c92000 r7 : 00000000 r6 : c7bb8680 r5 : c7bb86d4 r4 : c78b3de0 r3 : 00002502 r2 : c090b2e0 r1 : 00000880 r0 : 00000000 Flags: nZCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment user Control: 0005317f Table: a68a8000 DAC: 00000055 Process kworker/0:2 (pid: 711, stack limit = 0x389543bc) Stack: (0xc6c93f08 to 0xc6c94000) 3f00: c7bb86d4 00000000 00000000 c6cbfde0 c7bb86d4 c7ee4200 3f20: 00000000 c0907ea8 00000000 c7bb86d8 c0907ea8 c012077c c6cbfde0 c7bb86d4 3f40: c6cbfde0 c6c92000 c6cbfdf4 c09280ba c0907ea8 c090b2e0 c0907ebc c0120c18 3f60: c6cbfde0 00000000 00000000 c6cbb580 c7ba7c40 c7837edc c6cbb598 00000000 3f80: c6cbfde0 c01208f8 00000000 c01254fc c7ba7c40 c0125400 00000000 00000000 3fa0: 00000000 00000000 00000000 c01010d0 00000000 00000000 00000000 00000000 3fc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 3fe0: 00000000 00000000 00000000 00000000 00000013 00000000 00000000 00000000 [] (mxcmci_datawork) from [] (process_one_work+0x1f0/0x338) [] (process_one_work) from [] (worker_thread+0x320/0x474) [] (worker_thread) from [] (kthread+0xfc/0x118) [] (kthread) from [] (ret_from_fork+0x14/0x24) Exception stack(0xc6c93fb0 to 0xc6c93ff8) 3fa0: 00000000 00000000 00000000 00000000 3fc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 3fe0: 00000000 00000000 00000000 00000000 00000013 00000000 Code: e3500000 1a000059 e5153050 e5933038 (e48a3004) ---[ end trace 54ca629b75f0e737 ]--- note: kworker/0:2[711] exited with preempt_count 1 Signed-off-by: Alexander Shiyan Fixes: b189e7589f6d ("mmc: mxcmmc: handle highmem pages") Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/mxcmmc.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/drivers/mmc/host/mxcmmc.c b/drivers/mmc/host/mxcmmc.c index d54612257b06..45f7b9b53d48 100644 --- a/drivers/mmc/host/mxcmmc.c +++ b/drivers/mmc/host/mxcmmc.c @@ -290,11 +290,8 @@ static void mxcmci_swap_buffers(struct mmc_data *data) struct scatterlist *sg; int i; - for_each_sg(data->sg, sg, data->sg_len, i) { - void *buf = kmap_atomic(sg_page(sg) + sg->offset); - buffer_swap32(buf, sg->length); - kunmap_atomic(buf); - } + for_each_sg(data->sg, sg, data->sg_len, i) + buffer_swap32(sg_virt(sg), sg->length); } #else static inline void mxcmci_swap_buffers(struct mmc_data *data) {} @@ -611,7 +608,6 @@ static int mxcmci_transfer_data(struct mxcmci_host *host) { struct mmc_data *data = host->req->data; struct scatterlist *sg; - void *buf; int stat, i; host->data = data; @@ -619,18 +615,14 @@ static int mxcmci_transfer_data(struct mxcmci_host *host) if (data->flags & MMC_DATA_READ) { for_each_sg(data->sg, sg, data->sg_len, i) { - buf = kmap_atomic(sg_page(sg) + sg->offset); - stat = mxcmci_pull(host, buf, sg->length); - kunmap(buf); + stat = mxcmci_pull(host, sg_virt(sg), sg->length); if (stat) return stat; host->datasize += sg->length; } } else { for_each_sg(data->sg, sg, data->sg_len, i) { - buf = kmap_atomic(sg_page(sg) + sg->offset); - stat = mxcmci_push(host, buf, sg->length); - kunmap(buf); + stat = 
mxcmci_push(host, sg_virt(sg), sg->length); if (stat) return stat; host->datasize += sg->length; From 92edf8df0ff2ae86cc632eeca0e651fd8431d40d Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 21 Mar 2019 15:24:33 +1100 Subject: [PATCH 738/855] powerpc/security: Fix spectre_v2 reporting When I updated the spectre_v2 reporting to handle software count cache flush I got the logic wrong when there's no software count cache enabled at all. The result is that on systems with the software count cache flush disabled we print: Mitigation: Indirect branch cache disabled, Software count cache flush Which correctly indicates that the count cache is disabled, but incorrectly says the software count cache flush is enabled. The root of the problem is that we are trying to handle all combinations of options. But we know now that we only expect to see the software count cache flush enabled if the other options are false. So split the two cases, which simplifies the logic and fixes the bug. We were also missing a space before "(hardware accelerated)". The result is we see one of: Mitigation: Indirect branch serialisation (kernel only) Mitigation: Indirect branch cache disabled Mitigation: Software count cache flush Mitigation: Software count cache flush (hardware accelerated) Fixes: ee13cb249fab ("powerpc/64s: Add support for software count cache flush") Cc: stable@vger.kernel.org # v4.19+ Signed-off-by: Michael Ellerman Reviewed-by: Michael Neuling Reviewed-by: Diana Craciun Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/security.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c index 9b8631533e02..b33bafb8fcea 100644 --- a/arch/powerpc/kernel/security.c +++ b/arch/powerpc/kernel/security.c @@ -190,29 +190,22 @@ ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, c bcs = security_ftr_enabled(SEC_FTR_BCCTRL_SERIALISED); ccd = security_ftr_enabled(SEC_FTR_COUNT_CACHE_DISABLED); - if (bcs || ccd || count_cache_flush_type != COUNT_CACHE_FLUSH_NONE) { - bool comma = false; + if (bcs || ccd) { seq_buf_printf(&s, "Mitigation: "); - if (bcs) { + if (bcs) seq_buf_printf(&s, "Indirect branch serialisation (kernel only)"); - comma = true; - } - if (ccd) { - if (comma) - seq_buf_printf(&s, ", "); - seq_buf_printf(&s, "Indirect branch cache disabled"); - comma = true; - } - - if (comma) + if (bcs && ccd) seq_buf_printf(&s, ", "); - seq_buf_printf(&s, "Software count cache flush"); + if (ccd) + seq_buf_printf(&s, "Indirect branch cache disabled"); + } else if (count_cache_flush_type != COUNT_CACHE_FLUSH_NONE) { + seq_buf_printf(&s, "Mitigation: Software count cache flush"); if (count_cache_flush_type == COUNT_CACHE_FLUSH_HW) - seq_buf_printf(&s, "(hardware accelerated)"); + seq_buf_printf(&s, " (hardware accelerated)"); } else if (btb_flush_enabled) { seq_buf_printf(&s, "Mitigation: Branch predictor state flush"); } else { From 031d2ccc16775c9531800979069e141fbedeb2f7 Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Thu, 21 Mar 2019 11:45:44 +0530 Subject: [PATCH 739/855] mmc: sdhci-omap: Set caps2 to indicate no physical write protect pin After commit 6d5cd068ee59fba ("mmc: sdhci: use WP GPIO in sdhci_check_ro()") and commit 39ee32ce486756f ("mmc: sdhci-omap: drop ->get_ro() implementation"), sdhci-omap relied on SDHCI_PRESENT_STATE to check if the card is read-only, if wp-gpios is not populated in device tree. 
However SDHCI_PRESENT_STATE in sdhci-omap does not have correct read-only state. sdhci-omap can be used by platforms with both micro SD slot and standard SD slot with physical write protect pin (using GPIO). Set caps2 to MMC_CAP2_NO_WRITE_PROTECT based on if wp-gpios property is populated or not. This fix is required since existing device-tree node doesn't have "disable-wp" property and to preserve old-dt compatibility. Fixes: 6d5cd068ee59fba ("mmc: sdhci: use WP GPIO in sdhci_check_ro()") Fixes: 39ee32ce486756f ("mmc: sdhci-omap: drop ->get_ro() implementation") Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-omap.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/mmc/host/sdhci-omap.c b/drivers/mmc/host/sdhci-omap.c index b1a66ca3821a..5bbed477c9b1 100644 --- a/drivers/mmc/host/sdhci-omap.c +++ b/drivers/mmc/host/sdhci-omap.c @@ -1056,6 +1056,9 @@ static int sdhci_omap_probe(struct platform_device *pdev) mmc->f_max = 48000000; } + if (!mmc_can_gpio_ro(mmc)) + mmc->caps2 |= MMC_CAP2_NO_WRITE_PROTECT; + pltfm_host->clk = devm_clk_get(dev, "fck"); if (IS_ERR(pltfm_host->clk)) { ret = PTR_ERR(pltfm_host->clk); From 5ea47691bd99e1100707ec63364aff72324e2af4 Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Wed, 20 Mar 2019 14:36:53 +0800 Subject: [PATCH 740/855] mmc: alcor: fix DMA reads Setting max_blk_count to 1 here was causing the mmc block layer to always use the MMC_READ_SINGLE_BLOCK command here, which the driver does not DMA-accelerate. Drop the max_blk_ settings here. The mmc host defaults suffice, along with the max_segs and max_seg_size settings, which I have now documented in more detail. Now each MMC command reads 4 512-byte blocks, using DMA instead of PIO. On my SD card, this increases read performance (measured with dd) from 167kb/sec to 4.6mb/sec. Link: http://lkml.kernel.org/r/CAD8Lp47L5T3jnAjBiPs1cQ+yFA3L6LJtgFvMETnBrY63-Zdi2g@mail.gmail.com Signed-off-by: Daniel Drake Reviewed-by: Oleksij Rempel Fixes: c5413ad815a6 ("mmc: add new Alcor Micro Cardreader SD/MMC driver") Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/alcor.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/drivers/mmc/host/alcor.c b/drivers/mmc/host/alcor.c index c712b7deb3a9..82a97866e0cf 100644 --- a/drivers/mmc/host/alcor.c +++ b/drivers/mmc/host/alcor.c @@ -1044,14 +1044,27 @@ static void alcor_init_mmc(struct alcor_sdmmc_host *host) mmc->caps2 = MMC_CAP2_NO_SDIO; mmc->ops = &alcor_sdc_ops; - /* Hardware cannot do scatter lists */ + /* The hardware does DMA data transfer of 4096 bytes to/from a single + * buffer address. Scatterlists are not supported, but upon DMA + * completion (signalled via IRQ), the original vendor driver does + * then immediately set up another DMA transfer of the next 4096 + * bytes. + * + * This means that we need to handle the I/O in 4096 byte chunks. + * Lacking a way to limit the sglist entries to 4096 bytes, we instead + * impose that only one segment is provided, with maximum size 4096, + * which also happens to be the minimum size. This means that the + * single-entry sglist handled by this driver can be handed directly + * to the hardware, nice and simple. + * + * Unfortunately though, that means we only do 4096 bytes I/O per + * MMC command. A future improvement would be to make the driver + * accept sg lists and entries of any size, and simply iterate + * through them 4096 bytes at a time. 
+ */ mmc->max_segs = AU6601_MAX_DMA_SEGMENTS; mmc->max_seg_size = AU6601_MAX_DMA_BLOCK_SIZE; - - mmc->max_blk_size = mmc->max_seg_size; - mmc->max_blk_count = mmc->max_segs; - - mmc->max_req_size = mmc->max_seg_size * mmc->max_segs; + mmc->max_req_size = mmc->max_seg_size; } static int alcor_pci_sdmmc_drv_probe(struct platform_device *pdev) From c9a9497ccef205ed4ed2e247011382627876d831 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 19 Mar 2019 11:12:59 +0100 Subject: [PATCH 741/855] mmc: renesas_sdhi: limit block count to 16 bit for old revisions R-Car Gen2 has two different SDHI incarnations in the same chip. The older one does not support the recently introduced 32 bit register access to the block count register. Make sure we use this feature only after the first known version. Thanks to the Renesas Testing team for this bug report! Fixes: 5603731a15ef ("mmc: tmio: fix access width of Block Count Register") Reported-by: Yoshihiro Shimoda Signed-off-by: Wolfram Sang Reviewed-by: Simon Horman Tested-by: Phong Hoang Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/renesas_sdhi_core.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c index 71e13844df6c..8742e27e4e8b 100644 --- a/drivers/mmc/host/renesas_sdhi_core.c +++ b/drivers/mmc/host/renesas_sdhi_core.c @@ -641,6 +641,7 @@ int renesas_sdhi_probe(struct platform_device *pdev, struct renesas_sdhi *priv; struct resource *res; int irq, ret, i; + u16 ver; of_data = of_device_get_match_data(&pdev->dev); @@ -773,12 +774,17 @@ int renesas_sdhi_probe(struct platform_device *pdev, if (ret) goto efree; + ver = sd_ctrl_read16(host, CTL_VERSION); + /* GEN2_SDR104 is first known SDHI to use 32bit block count */ + if (ver < SDHI_VER_GEN2_SDR104 && mmc_data->max_blk_count > U16_MAX) + mmc_data->max_blk_count = U16_MAX; + ret = tmio_mmc_host_probe(host); if (ret < 0) goto edisclk; /* One Gen2 SDHI incarnation does NOT have a CBSY bit */ - if (sd_ctrl_read16(host, CTL_VERSION) == SDHI_VER_GEN2_SDR50) + if (ver == SDHI_VER_GEN2_SDR50) mmc_data->flags &= ~TMIO_MMC_HAVE_CBSY; /* Enable tuning iff we have an SCC and a supported mode */ From 551417af91b163bd697eb50b3601adae2177c28a Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 18 Mar 2019 14:51:23 +0800 Subject: [PATCH 742/855] genirq: Fix typo in comment of IRQD_MOVE_PCNTXT Signed-off-by: Peter Xu Signed-off-by: Thomas Gleixner Cc: Marc Zyngier Cc: Dou Liyang Cc: Julien Thierry Link: https://lkml.kernel.org/r/20190318065123.11862-1-peterx@redhat.com --- include/linux/irq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index d6160d479b14..7ae8de5ad0f2 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -195,7 +195,7 @@ struct irq_data { * IRQD_LEVEL - Interrupt is level triggered * IRQD_WAKEUP_STATE - Interrupt is configured for wakeup * from suspend - * IRDQ_MOVE_PCNTXT - Interrupt can be moved in process + * IRQD_MOVE_PCNTXT - Interrupt can be moved in process * context * IRQD_IRQ_DISABLED - Disabled state of the interrupt * IRQD_IRQ_MASKED - Masked state of the interrupt From 82efcab3b9f3ef59e9713237c6e3c05c3a95c1ae Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 11 Mar 2019 16:02:55 -0700 Subject: [PATCH 743/855] workqueue: Only unregister a registered lockdep key The recent change to prevent use after free and a memory leak introduced an unconditional call to wq_unregister_lockdep() in the 
error handling path. If the lockdep key had not been registered yet, then the lockdep core emits a warning. Only call wq_unregister_lockdep() if wq_register_lockdep() has been called first. Fixes: 009bb421b6ce ("workqueue, lockdep: Fix an alloc_workqueue() error path") Reported-by: syzbot+be0c198232f86389c3dd@syzkaller.appspotmail.com Signed-off-by: Bart Van Assche Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Tejun Heo Cc: Qian Cai Link: https://lkml.kernel.org/r/20190311230255.176081-1-bvanassche@acm.org --- kernel/workqueue.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4026d1871407..ddee541ea97a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4266,7 +4266,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, INIT_LIST_HEAD(&wq->list); if (alloc_and_link_pwqs(wq) < 0) - goto err_free_wq; + goto err_unreg_lockdep; if (wq_online && init_rescuer(wq) < 0) goto err_destroy; @@ -4292,9 +4292,10 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, return wq; -err_free_wq: +err_unreg_lockdep: wq_unregister_lockdep(wq); wq_free_lockdep(wq); +err_free_wq: free_workqueue_attrs(wq->unbound_attrs); kfree(wq); return NULL; From 0c671812f152b628bd87c0af49da032cc2a2c319 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Mar 2019 19:09:38 -0500 Subject: [PATCH 744/855] objtool: Move objtool_file struct off the stack Objtool uses over 512k of stack, thanks to the hash table embedded in the objtool_file struct. This causes an unnecessarily large stack allocation and breaks users with low stack limits. Move the struct off the stack. Fixes: 042ba73fe7eb ("objtool: Add several performance improvements") Reported-by: Vassili Karpov Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/df92dcbc4b84b02ffa252f46876df125fb56e2d7.1552954176.git.jpoimboe@redhat.com --- tools/objtool/check.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 0414a0d52262..5dde107083c6 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2184,9 +2184,10 @@ static void cleanup(struct objtool_file *file) elf_close(file->elf); } +static struct objtool_file file; + int check(const char *_objname, bool orc) { - struct objtool_file file; int ret, warnings = 0; objname = _objname; From dc3173c7067ebbac44ed4a5c3636dc0ff27f981b Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 20 Mar 2019 22:22:20 +0800 Subject: [PATCH 745/855] irqchip/brcmstb-l2: Make two init functions static Fix sparse warnings: drivers/irqchip/irq-brcmstb-l2.c:278:12: warning: symbol 'brcmstb_l2_edge_intc_of_init' was not declared. Should it be static? drivers/irqchip/irq-brcmstb-l2.c:285:12: warning: symbol 'brcmstb_l2_lvl_intc_of_init' was not declared. Should it be static? 
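Both init functions are referenced only through IRQCHIP_DECLARE() in the same file, so internal linkage is the right fix. As a generic illustration of the pattern (hypothetical names, not the driver code):

	/* The OF init hook is only referenced by the IRQCHIP_DECLARE() entry
	 * in this translation unit, so it can be static; that is exactly the
	 * change made below.
	 */
	static int __init demo_intc_of_init(struct device_node *np,
					    struct device_node *parent)
	{
		return 0;
	}
	IRQCHIP_DECLARE(demo_intc, "vendor,demo-intc", demo_intc_of_init);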
Signed-off-by: YueHaibing Reviewed-by: Florian Fainelli Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-brcmstb-l2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-brcmstb-l2.c b/drivers/irqchip/irq-brcmstb-l2.c index 83364fedbf0a..5e4ca139e4ea 100644 --- a/drivers/irqchip/irq-brcmstb-l2.c +++ b/drivers/irqchip/irq-brcmstb-l2.c @@ -275,14 +275,14 @@ out_free: return ret; } -int __init brcmstb_l2_edge_intc_of_init(struct device_node *np, +static int __init brcmstb_l2_edge_intc_of_init(struct device_node *np, struct device_node *parent) { return brcmstb_l2_intc_of_init(np, parent, &l2_edge_intc_init); } IRQCHIP_DECLARE(brcmstb_l2_intc, "brcm,l2-intc", brcmstb_l2_edge_intc_of_init); -int __init brcmstb_l2_lvl_intc_of_init(struct device_node *np, +static int __init brcmstb_l2_lvl_intc_of_init(struct device_node *np, struct device_node *parent) { return brcmstb_l2_intc_of_init(np, parent, &l2_lvl_intc_init); From 096048cb120d5318b3a9a7c1e062a4b11c0e80ab Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 20 Mar 2019 21:40:27 +0800 Subject: [PATCH 746/855] irqchip/mmp: Make mmp_irq_domain_ops static Fix sparse warning: drivers/irqchip/irq-mmp.c:182:29: warning: symbol 'mmp_irq_domain_ops' was not declared. Should it be static? Signed-off-by: YueHaibing Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-mmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-mmp.c b/drivers/irqchip/irq-mmp.c index 25f32e1d7764..85d1c4d49572 100644 --- a/drivers/irqchip/irq-mmp.c +++ b/drivers/irqchip/irq-mmp.c @@ -176,7 +176,7 @@ static int mmp_irq_domain_xlate(struct irq_domain *d, struct device_node *node, return 0; } -const struct irq_domain_ops mmp_irq_domain_ops = { +static const struct irq_domain_ops mmp_irq_domain_ops = { .map = mmp_irq_domain_map, .xlate = mmp_irq_domain_xlate, }; From 24105bf4d10485143f8e26337cda8bcb7f6e6da5 Mon Sep 17 00:00:00 2001 From: Fabrizio Castro Date: Tue, 19 Mar 2019 11:02:01 +0000 Subject: [PATCH 747/855] dt-bindings: irqchip: renesas-irqc: Document r8a774c0 support Document RZ/G2E (R8A774C0) SoC bindings. Signed-off-by: Fabrizio Castro Reviewed-by: Geert Uytterhoeven Reviewed-by: Simon Horman Reviewed-by: Rob Herring Signed-off-by: Marc Zyngier --- .../devicetree/bindings/interrupt-controller/renesas,irqc.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/interrupt-controller/renesas,irqc.txt b/Documentation/devicetree/bindings/interrupt-controller/renesas,irqc.txt index 8de96a4fb2d5..f977ea7617f6 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/renesas,irqc.txt +++ b/Documentation/devicetree/bindings/interrupt-controller/renesas,irqc.txt @@ -16,6 +16,7 @@ Required properties: - "renesas,irqc-r8a7793" (R-Car M2-N) - "renesas,irqc-r8a7794" (R-Car E2) - "renesas,intc-ex-r8a774a1" (RZ/G2M) + - "renesas,intc-ex-r8a774c0" (RZ/G2E) - "renesas,intc-ex-r8a7795" (R-Car H3) - "renesas,intc-ex-r8a7796" (R-Car M3-W) - "renesas,intc-ex-r8a77965" (R-Car M3-N) From 0dda09666f50eae9c5b794dd89b1fd8a8d89d714 Mon Sep 17 00:00:00 2001 From: Fabien Dessenne Date: Thu, 7 Mar 2019 19:40:35 +0100 Subject: [PATCH 748/855] irqchip/stm32: Don't clear rising/falling config registers at init Falling and rising configuration and status registers are not banked. As they are shared with M4 co-processor, they should not be cleared at probe time, else M4 co-processor configuration will be lost. 
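In other words, probe-time initialisation should only reset state this driver owns; anything shared with the Cortex-M4 has to be left untouched. A sketch of that rule (hypothetical helper and bank structure, not the driver code):

	/* Hypothetical illustration: the driver keeps clearing its own
	 * interrupt/event mask registers at init, but the rising/falling
	 * trigger configuration and pending registers are shared with the
	 * co-processor and are no longer written here.
	 */
	static void demo_exti_bank_init(void __iomem *base,
					const struct demo_exti_bank *bank)
	{
		writel_relaxed(0, base + bank->imr_ofst);	/* owned mask */
		writel_relaxed(0, base + bank->emr_ofst);	/* owned event mask */
		/* rtsr/ftsr/rpr/fpr: shared with the M4 -- do not touch */
	}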
Fixes: f9fc1745501e ("irqchip/stm32: Add host and driver data structures") Signed-off-by: Loic Pallardy Signed-off-by: Fabien Dessenne Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-stm32-exti.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/irqchip/irq-stm32-exti.c b/drivers/irqchip/irq-stm32-exti.c index 6edfd4bfa169..dab37fbfab82 100644 --- a/drivers/irqchip/irq-stm32-exti.c +++ b/drivers/irqchip/irq-stm32-exti.c @@ -735,11 +735,6 @@ stm32_exti_chip_data *stm32_exti_chip_init(struct stm32_exti_host_data *h_data, */ writel_relaxed(0, base + stm32_bank->imr_ofst); writel_relaxed(0, base + stm32_bank->emr_ofst); - writel_relaxed(0, base + stm32_bank->rtsr_ofst); - writel_relaxed(0, base + stm32_bank->ftsr_ofst); - writel_relaxed(~0UL, base + stm32_bank->rpr_ofst); - if (stm32_bank->fpr_ofst != UNDEF_REG) - writel_relaxed(~0UL, base + stm32_bank->fpr_ofst); pr_info("%pOF: bank%d\n", h_data->node, bank_idx); From 6a77623d78b307b34d4cf7886da6a907689bf388 Mon Sep 17 00:00:00 2001 From: Fabien Dessenne Date: Thu, 7 Mar 2019 19:40:36 +0100 Subject: [PATCH 749/855] irqchip/stm32: Don't set rising configuration registers at init The rising configuration status register (rtsr) is not banked. As it is shared with the co-processor, it should not be written at probe time, else the co-processor configuration will be lost. Fixes: f9fc1745501e ("irqchip/stm32: Add host and driver data structures") Signed-off-by: Fabien Dessenne Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-stm32-exti.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/irqchip/irq-stm32-exti.c b/drivers/irqchip/irq-stm32-exti.c index dab37fbfab82..6b19bffbad78 100644 --- a/drivers/irqchip/irq-stm32-exti.c +++ b/drivers/irqchip/irq-stm32-exti.c @@ -716,7 +716,6 @@ stm32_exti_chip_data *stm32_exti_chip_init(struct stm32_exti_host_data *h_data, const struct stm32_exti_bank *stm32_bank; struct stm32_exti_chip_data *chip_data; void __iomem *base = h_data->base; - u32 irqs_mask; stm32_bank = h_data->drv_data->exti_banks[bank_idx]; chip_data = &h_data->chips_data[bank_idx]; @@ -725,10 +724,6 @@ stm32_exti_chip_data *stm32_exti_chip_init(struct stm32_exti_host_data *h_data, raw_spin_lock_init(&chip_data->rlock); - /* Determine number of irqs supported */ - writel_relaxed(~0UL, base + stm32_bank->rtsr_ofst); - irqs_mask = readl_relaxed(base + stm32_bank->rtsr_ofst); - /* * This IP has no reset, so after hot reboot we should * clear registers to avoid residue From fca269f201a8d9985c0a31fb60b15d4eb57cef80 Mon Sep 17 00:00:00 2001 From: Jianguo Chen Date: Wed, 20 Mar 2019 18:54:21 +0000 Subject: [PATCH 750/855] irqchip/mbigen: Don't clear eventid when freeing an MSI mbigen_write_msg clears eventid bits of a mbigen register when free a interrupt, because msi_domain_deactivate memset struct msg to zero. Then multiple mbigen pins with zero eventid will report the same interrupt number. 
The eventid clear call trace: free_irq __free_irq irq_shutdown irq_domain_deactivate_irq __irq_domain_deactivate_irq __irq_domain_deactivate_irq msi_domain_deactivate platform_msi_write_msg mbigen_write_msg Signed-off-by: Jianguo Chen [maz: massaged subject] Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-mbigen.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/irqchip/irq-mbigen.c b/drivers/irqchip/irq-mbigen.c index 567b29c47608..98b6e1d4b1a6 100644 --- a/drivers/irqchip/irq-mbigen.c +++ b/drivers/irqchip/irq-mbigen.c @@ -161,6 +161,9 @@ static void mbigen_write_msg(struct msi_desc *desc, struct msi_msg *msg) void __iomem *base = d->chip_data; u32 val; + if (!msg->address_lo && !msg->address_hi) + return; + base += get_mbigen_vec_reg(d->hwirq); val = readl_relaxed(base); From 0803278b0b4d8eeb2b461fb698785df65a725d9e Mon Sep 17 00:00:00 2001 From: Xu Yu Date: Thu, 21 Mar 2019 18:00:35 +0800 Subject: [PATCH 751/855] bpf: do not restore dst_reg when cur_state is freed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Syzkaller hit 'KASAN: use-after-free Write in sanitize_ptr_alu' bug. Call trace: dump_stack+0xbf/0x12e print_address_description+0x6a/0x280 kasan_report+0x237/0x360 sanitize_ptr_alu+0x85a/0x8d0 adjust_ptr_min_max_vals+0x8f2/0x1ca0 adjust_reg_min_max_vals+0x8ed/0x22e0 do_check+0x1ca6/0x5d00 bpf_check+0x9ca/0x2570 bpf_prog_load+0xc91/0x1030 __se_sys_bpf+0x61e/0x1f00 do_syscall_64+0xc8/0x550 entry_SYSCALL_64_after_hwframe+0x49/0xbe Fault injection trace:  kfree+0xea/0x290  free_func_state+0x4a/0x60  free_verifier_state+0x61/0xe0  push_stack+0x216/0x2f0 <- inject failslab  sanitize_ptr_alu+0x2b1/0x8d0  adjust_ptr_min_max_vals+0x8f2/0x1ca0  adjust_reg_min_max_vals+0x8ed/0x22e0  do_check+0x1ca6/0x5d00  bpf_check+0x9ca/0x2570  bpf_prog_load+0xc91/0x1030  __se_sys_bpf+0x61e/0x1f00  do_syscall_64+0xc8/0x550  entry_SYSCALL_64_after_hwframe+0x49/0xbe When kzalloc() fails in push_stack(), free_verifier_state() will free current verifier state. As push_stack() returns, dst_reg was restored if ptr_is_dst_reg is false. However, as member of the cur_state, dst_reg is also freed, and error occurs when dereferencing dst_reg. Simply fix it by testing ret of push_stack() before restoring dst_reg. Fixes: 979d63d50c0c ("bpf: prevent out of bounds speculation on pointer arithmetic") Signed-off-by: Xu Yu Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5aa810882583..f19d5e04c69d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3381,7 +3381,7 @@ do_sim: *dst_reg = *ptr_reg; } ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true); - if (!ptr_is_dst_reg) + if (!ptr_is_dst_reg && ret) *dst_reg = tmp; return !ret ? -EFAULT : 0; } From a9c640ac96e19b3966357ec9bb586edd2e1e74de Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Thu, 14 Mar 2019 15:14:57 -0700 Subject: [PATCH 752/855] x86/boot: Restrict header scope to make Clang happy The inclusion of was causing issue as the definition of __arch_hweight64 from arch/x86/include/asm/arch_hweight.h eventually gets included. The definition is problematic when compiled with -m16 (all code in arch/x86/boot/ is) as the "D" inline assembly constraint is rejected by both compilers when passed an argument of type long long (regardless of signedness, anything smaller is fine). 
Because GCC performs inlining before semantic analysis, and __arch_hweight64 is dead in this translation unit, GCC does not report any issues at compile time. Clang does the semantic analysis in the front end, before inlining (run in the middle) can determine the code is dead. I consider this another case of PR33587, which I think we can do more work to solve. It turns out that arch/x86/boot/string.c doesn't actually need linux/kernel.h, simply linux/limits.h and linux/compiler.h. Suggested-by: Stephen Rothwell Signed-off-by: Nick Desaulniers Signed-off-by: Thomas Gleixner Tested-by: Nathan Chancellor Reviewed-by: Nathan Chancellor Cc: bp@alien8.de Cc: niravd@google.com Cc: "H. Peter Anvin" Cc: Chao Fan Cc: Uros Bizjak Link: https://bugs.llvm.org/show_bug.cgi?id=33587 Link: https://github.com/ClangBuiltLinux/linux/issues/347 Link: https://lkml.kernel.org/r/20190314221458.83047-1-ndesaulniers@google.com --- arch/x86/boot/string.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index 315a67b8896b..90154df8f125 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -13,8 +13,9 @@ */ #include -#include +#include #include +#include #include #include "ctype.h" #include "string.h" From 725e29db8cb9058976559bc3239c97ef7db40eea Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 14 Mar 2019 23:08:38 +0000 Subject: [PATCH 753/855] x86/lib: Fix indentation issue, remove extra tab The increment of buff is indented one level too deeply, clean this up by removing a tab. Signed-off-by: Colin Ian King Signed-off-by: Thomas Gleixner Cc: Borislav Petkov Cc: "H . Peter Anvin" Cc: kernel-janitors@vger.kernel.org Link: https://lkml.kernel.org/r/20190314230838.18256-1-colin.king@canonical.com --- arch/x86/lib/csum-partial_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c index 9baca3e054be..e7925d668b68 100644 --- a/arch/x86/lib/csum-partial_64.c +++ b/arch/x86/lib/csum-partial_64.c @@ -94,7 +94,7 @@ static unsigned do_csum(const unsigned char *buff, unsigned len) : "m" (*(unsigned long *)buff), "r" (zero), "0" (result)); --count; - buff += 8; + buff += 8; } result = add32_with_carry(result>>32, result&0xffffffff); From 2e84f116afca3719c9d0a1a78b47b48f75fd5724 Mon Sep 17 00:00:00 2001 From: Aditya Pakki Date: Mon, 18 Mar 2019 21:19:56 -0500 Subject: [PATCH 754/855] x86/hpet: Prevent potential NULL pointer dereference hpet_virt_address may be NULL when ioremap_nocache fail, but the code lacks a check. Add a check to prevent NULL pointer dereference. Signed-off-by: Aditya Pakki Signed-off-by: Thomas Gleixner Cc: kjlu@umn.edu Cc: Borislav Petkov Cc: "H. 
Peter Anvin" Cc: Kees Cook Cc: Joe Perches Cc: Nicolai Stange Cc: Roland Dreier Link: https://lkml.kernel.org/r/20190319021958.17275-1-pakki001@umn.edu --- arch/x86/kernel/hpet.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index dfd3aca82c61..fb32925a2e62 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -905,6 +905,8 @@ int __init hpet_enable(void) return 0; hpet_set_mapping(); + if (!hpet_virt_address) + return 0; /* * Read the period and check for a sane value: From 534c89c22e26b183d838294f0937ee092c82ad3a Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Thu, 14 Mar 2019 00:46:51 -0500 Subject: [PATCH 755/855] x86/hyperv: Prevent potential NULL pointer dereference The page allocation in hv_cpu_init() can fail, but the code does not have a check for that. Add a check and return -ENOMEM when the allocation fails. [ tglx: Massaged changelog ] Signed-off-by: Kangjie Lu Signed-off-by: Thomas Gleixner Reviewed-by: Mukesh Ojha Acked-by: "K. Y. Srinivasan" Cc: pakki001@umn.edu Cc: Haiyang Zhang Cc: Stephen Hemminger Cc: Sasha Levin Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: linux-hyperv@vger.kernel.org Link: https://lkml.kernel.org/r/20190314054651.1315-1-kjlu@umn.edu --- arch/x86/hyperv/hv_init.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 6461a16b4559..e4ba467a9fc6 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -103,9 +103,13 @@ static int hv_cpu_init(unsigned int cpu) u64 msr_vp_index; struct hv_vp_assist_page **hvp = &hv_vp_assist_page[smp_processor_id()]; void **input_arg; + struct page *pg; input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg); - *input_arg = page_address(alloc_page(GFP_KERNEL)); + pg = alloc_page(GFP_KERNEL); + if (unlikely(!pg)) + return -ENOMEM; + *input_arg = page_address(pg); hv_get_vp_index(msr_vp_index); From 9bd681251b7c1db1c6cfe29a72c5ea1b1c0ba022 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 13 Mar 2019 12:00:22 +0100 Subject: [PATCH 756/855] x86/microcode: Announce reload operation's completion By popular demand, issue a single line to dmesg after the reload operation completes to let the user know that a reload has at least been attempted. Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190313110022.8229-1-bp@alien8.de --- arch/x86/kernel/cpu/microcode/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 97f9ada9ceda..5260185cbf7b 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -608,6 +608,8 @@ static int microcode_reload_late(void) if (ret > 0) microcode_check(); + pr_info("Reload completed, microcode revision: 0x%x\n", boot_cpu_data.microcode); + return ret; } From 18fb053f9b827bd98cfc64f2a35df8ab19745a1d Mon Sep 17 00:00:00 2001 From: Matthew Whitehead Date: Thu, 14 Mar 2019 16:46:00 -0400 Subject: [PATCH 757/855] x86/cpu/cyrix: Use correct macros for Cyrix calls on Geode processors There are comments in processor-cyrix.h advising you to _not_ make calls using the deprecated macros in this style: setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80); This is because it expands the macro into a non-functioning calling sequence. 
The calling order must be: outb(CX86_CCR2, 0x22); inb(0x23); From the comments: * When using the old macros a line like * setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88); * gets expanded to: * do { * outb((CX86_CCR2), 0x22); * outb((({ * outb((CX86_CCR2), 0x22); * inb(0x23); * }) | 0x88), 0x23); * } while (0); The new macros fix this problem, so use them instead. Tested on an actual Geode processor. Signed-off-by: Matthew Whitehead Signed-off-by: Thomas Gleixner Cc: luto@kernel.org Link: https://lkml.kernel.org/r/1552596361-8967-2-git-send-email-tedheadster@gmail.com --- arch/x86/kernel/cpu/cyrix.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index d12226f60168..1d9b8aaea06c 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -124,7 +124,7 @@ static void set_cx86_reorder(void) setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ /* Load/Store Serialize to mem access disable (=reorder it) */ - setCx86_old(CX86_PCR0, getCx86_old(CX86_PCR0) & ~0x80); + setCx86(CX86_PCR0, getCx86(CX86_PCR0) & ~0x80); /* set load/store serialize from 1GB to 4GB */ ccr3 |= 0xe0; setCx86(CX86_CCR3, ccr3); @@ -135,11 +135,11 @@ static void set_cx86_memwb(void) pr_info("Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); /* CCR2 bit 2: unlock NW bit */ - setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) & ~0x04); + setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04); /* set 'Not Write-through' */ write_cr0(read_cr0() | X86_CR0_NW); /* CCR2 bit 2: lock NW bit and set WT1 */ - setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x14); + setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14); } /* @@ -153,14 +153,14 @@ static void geode_configure(void) local_irq_save(flags); /* Suspend on halt power saving and enable #SUSP pin */ - setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x88); + setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ /* FPU fast, DTE cache, Mem bypass */ - setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x38); + setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38); setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ set_cx86_memwb(); @@ -296,7 +296,7 @@ static void init_cyrix(struct cpuinfo_x86 *c) /* GXm supports extended cpuid levels 'ala' AMD */ if (c->cpuid_level == 2) { /* Enable cxMMX extensions (GX1 Datasheet 54) */ - setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7) | 1); + setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1); /* * GXm : 0x30 ... 0x5f GXm datasheet 51 @@ -319,7 +319,7 @@ static void init_cyrix(struct cpuinfo_x86 *c) if (dir1 > 7) { dir0_msn++; /* M II */ /* Enable MMX extensions (App note 108) */ - setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7)|1); + setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1); } else { /* A 6x86MX - it has the bug. */ set_cpu_bug(c, X86_BUG_COMA); From 0f4d3aa761b71cd6984330baca1e18bf0590e441 Mon Sep 17 00:00:00 2001 From: Matthew Whitehead Date: Thu, 14 Mar 2019 16:46:01 -0400 Subject: [PATCH 758/855] x86/cpu/cyrix: Remove {get,set}Cx86_old macros used for Cyrix processors The getCx86_old() and setCx86_old() macros have been replaced with correctly working getCx86() and setCx86(), so remove these unused macros. 
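For clarity, here is the access sequence the inline getCx86()/setCx86() helpers produce for the example above, written out by hand (a sketch; CX86_CCR2 and the 0x88 mask are just the values from that example):

	u8 ccr2;

	/* read: select the indexed register at port 0x22, then read its value from 0x23 */
	outb(CX86_CCR2, 0x22);
	ccr2 = inb(0x23);

	/* write back: select again, then write the modified value, i.e. 0x22, 0x23, 0x22, 0x23 */
	outb(CX86_CCR2, 0x22);
	outb(ccr2 | 0x88, 0x23);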
Signed-off-by: Matthew Whitehead Signed-off-by: Thomas Gleixner Cc: luto@kernel.org Link: https://lkml.kernel.org/r/1552596361-8967-3-git-send-email-tedheadster@gmail.com --- arch/x86/include/asm/processor-cyrix.h | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/arch/x86/include/asm/processor-cyrix.h b/arch/x86/include/asm/processor-cyrix.h index aaedd73ea2c6..df700a6cc869 100644 --- a/arch/x86/include/asm/processor-cyrix.h +++ b/arch/x86/include/asm/processor-cyrix.h @@ -3,19 +3,6 @@ * NSC/Cyrix CPU indexed register access. Must be inlined instead of * macros to ensure correct access ordering * Access order is always 0x22 (=offset), 0x23 (=value) - * - * When using the old macros a line like - * setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88); - * gets expanded to: - * do { - * outb((CX86_CCR2), 0x22); - * outb((({ - * outb((CX86_CCR2), 0x22); - * inb(0x23); - * }) | 0x88), 0x23); - * } while (0); - * - * which in fact violates the access order (= 0x22, 0x22, 0x23, 0x23). */ static inline u8 getCx86(u8 reg) @@ -29,11 +16,3 @@ static inline void setCx86(u8 reg, u8 data) outb(reg, 0x22); outb(data, 0x23); } - -#define getCx86_old(reg) ({ outb((reg), 0x22); inb(0x23); }) - -#define setCx86_old(reg, data) do { \ - outb((reg), 0x22); \ - outb((data), 0x23); \ -} while (0) - From 657ee5531903339b06697581532ed32d4762526e Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 11 Mar 2019 22:30:50 -0700 Subject: [PATCH 759/855] perf evlist: Introduce side band thread This patch introduces a side band thread that captures extended information for events like PERF_RECORD_BPF_EVENT. This new thread uses its own evlist, whose ring buffer uses a very low watermark for lower latency. To use the side band thread, we need to: 1. add side band event(s) by calling perf_evlist__add_sb_event(); 2. call perf_evlist__start_sb_thread(); 3. at the end of the perf run, call perf_evlist__stop_sb_thread(). In the next patch, we use this thread to handle PERF_RECORD_BPF_EVENT. Committer notes: Add a fix by Jiri Olsa for when the sb_thread can't get started, in which case stop_sb_thread() would segfault at the end when joining the (non-existent) thread. That can happen when running 'perf top' or 'perf record' as a normal user, for instance. Further checks need to be done on top of this to handle these possible failure scenarios more gracefully. 
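A minimal usage sketch of the new API, following the three steps above (the dummy_sb_cb and run_with_sb_thread names are illustrative only, and the perf-internal headers util/evlist.h and util/target.h are assumed):

static int dummy_sb_cb(union perf_event *event __maybe_unused, void *data __maybe_unused)
{
	/* inspect side band events here, e.g. PERF_RECORD_BPF_EVENT */
	return 0;
}

static int run_with_sb_thread(struct target *target)
{
	struct perf_evlist *sb_evlist = NULL;
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_SOFTWARE,
		.config		= PERF_COUNT_SW_DUMMY,
		.sample_id_all	= 1,
	};

	/* 1. add the side band event and the callback that handles it */
	if (perf_evlist__add_sb_event(&sb_evlist, &attr, dummy_sb_cb, NULL))
		return -1;

	/* 2. start the polling thread before the main perf run */
	if (perf_evlist__start_sb_thread(sb_evlist, target))
		return -1;

	/* ... normal record/top processing runs here ... */

	/* 3. at the end of the run, stop the thread (this also deletes sb_evlist) */
	perf_evlist__stop_sb_thread(sb_evlist);
	return 0;
}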
Signed-off-by: Song Liu Reviewed-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stanislav Fomichev Link: http://lkml.kernel.org/r/20190312053051.2690567-15-songliubraving@fb.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 9 +++ tools/perf/builtin-top.c | 9 +++ tools/perf/util/evlist.c | 119 ++++++++++++++++++++++++++++++++++++ tools/perf/util/evlist.h | 12 ++++ tools/perf/util/evsel.h | 6 ++ 5 files changed, 155 insertions(+) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index e79faccd7842..6f645fd72fed 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1137,6 +1137,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) struct perf_data *data = &rec->data; struct perf_session *session; bool disabled = false, draining = false; + struct perf_evlist *sb_evlist = NULL; int fd; atexit(record__sig_exit); @@ -1237,6 +1238,11 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) goto out_child; } + if (perf_evlist__start_sb_thread(sb_evlist, &rec->opts.target)) { + pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n"); + opts->no_bpf_event = true; + } + err = record__synthesize(rec, false); if (err < 0) goto out_child; @@ -1487,6 +1493,9 @@ out_child: out_delete_session: perf_session__delete(session); + + if (!opts->no_bpf_event) + perf_evlist__stop_sb_thread(sb_evlist); return status; } diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index c2ea22c4ea67..3ce8a8db6c1d 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1501,6 +1501,7 @@ int cmd_top(int argc, const char **argv) "number of thread to run event synthesize"), OPT_END() }; + struct perf_evlist *sb_evlist = NULL; const char * const top_usage[] = { "perf top []", NULL @@ -1636,8 +1637,16 @@ int cmd_top(int argc, const char **argv) goto out_delete_evlist; } + if (perf_evlist__start_sb_thread(sb_evlist, target)) { + pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n"); + opts->no_bpf_event = true; + } + status = __cmd_top(&top); + if (!opts->no_bpf_event) + perf_evlist__stop_sb_thread(sb_evlist); + out_delete_evlist: perf_evlist__delete(top.evlist); perf_session__delete(top.session); diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index ed20f4379956..ec78e93085de 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -19,6 +19,7 @@ #include "debug.h" #include "units.h" #include "asm/bug.h" +#include "bpf-event.h" #include #include @@ -1856,3 +1857,121 @@ struct perf_evsel *perf_evlist__reset_weak_group(struct perf_evlist *evsel_list, } return leader; } + +int perf_evlist__add_sb_event(struct perf_evlist **evlist, + struct perf_event_attr *attr, + perf_evsel__sb_cb_t cb, + void *data) +{ + struct perf_evsel *evsel; + bool new_evlist = (*evlist) == NULL; + + if (*evlist == NULL) + *evlist = perf_evlist__new(); + if (*evlist == NULL) + return -1; + + if (!attr->sample_id_all) { + pr_warning("enabling sample_id_all for all side band events\n"); + attr->sample_id_all = 1; + } + + evsel = perf_evsel__new_idx(attr, (*evlist)->nr_entries); + if (!evsel) + goto out_err; + + evsel->side_band.cb = cb; + evsel->side_band.data = data; + perf_evlist__add(*evlist, evsel); + return 0; + +out_err: + if (new_evlist) { + perf_evlist__delete(*evlist); + *evlist = 
NULL; + } + return -1; +} + +static void *perf_evlist__poll_thread(void *arg) +{ + struct perf_evlist *evlist = arg; + bool draining = false; + int i; + + while (draining || !(evlist->thread.done)) { + if (draining) + draining = false; + else if (evlist->thread.done) + draining = true; + + if (!draining) + perf_evlist__poll(evlist, 1000); + + for (i = 0; i < evlist->nr_mmaps; i++) { + struct perf_mmap *map = &evlist->mmap[i]; + union perf_event *event; + + if (perf_mmap__read_init(map)) + continue; + while ((event = perf_mmap__read_event(map)) != NULL) { + struct perf_evsel *evsel = perf_evlist__event2evsel(evlist, event); + + if (evsel && evsel->side_band.cb) + evsel->side_band.cb(event, evsel->side_band.data); + else + pr_warning("cannot locate proper evsel for the side band event\n"); + + perf_mmap__consume(map); + } + perf_mmap__read_done(map); + } + } + return NULL; +} + +int perf_evlist__start_sb_thread(struct perf_evlist *evlist, + struct target *target) +{ + struct perf_evsel *counter; + + if (!evlist) + return 0; + + if (perf_evlist__create_maps(evlist, target)) + goto out_delete_evlist; + + evlist__for_each_entry(evlist, counter) { + if (perf_evsel__open(counter, evlist->cpus, + evlist->threads) < 0) + goto out_delete_evlist; + } + + if (perf_evlist__mmap(evlist, UINT_MAX)) + goto out_delete_evlist; + + evlist__for_each_entry(evlist, counter) { + if (perf_evsel__enable(counter)) + goto out_delete_evlist; + } + + evlist->thread.done = 0; + if (pthread_create(&evlist->thread.th, NULL, perf_evlist__poll_thread, evlist)) + goto out_delete_evlist; + + return 0; + +out_delete_evlist: + perf_evlist__delete(evlist); + evlist = NULL; + return -1; +} + +void perf_evlist__stop_sb_thread(struct perf_evlist *evlist) +{ + if (!evlist) + return; + evlist->thread.done = 1; + pthread_join(evlist->thread.th, NULL); + perf_evlist__delete(evlist); +} diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 744906dd4887..dcb68f34d2cd 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -54,6 +54,10 @@ struct perf_evlist { struct perf_sample *sample); u64 first_sample_time; u64 last_sample_time; + struct { + pthread_t th; + volatile int done; + } thread; }; struct perf_evsel_str_handler { @@ -87,6 +91,14 @@ int __perf_evlist__add_default_attrs(struct perf_evlist *evlist, int perf_evlist__add_dummy(struct perf_evlist *evlist); +int perf_evlist__add_sb_event(struct perf_evlist **evlist, + struct perf_event_attr *attr, + perf_evsel__sb_cb_t cb, + void *data); +int perf_evlist__start_sb_thread(struct perf_evlist *evlist, + struct target *target); +void perf_evlist__stop_sb_thread(struct perf_evlist *evlist); + int perf_evlist__add_newtp(struct perf_evlist *evlist, const char *sys, const char *name, void *handler); diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index cc578e02e08f..0f2c6c93d721 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -73,6 +73,8 @@ struct perf_evsel_config_term { struct perf_stat_evsel; +typedef int (perf_evsel__sb_cb_t)(union perf_event *event, void *data); + /** struct perf_evsel - event selector * * @evlist - evlist this evsel is in, if it is in one. 
@@ -151,6 +153,10 @@ struct perf_evsel { bool collect_stat; bool weak_group; const char *pmu_name; + struct { + perf_evsel__sb_cb_t *cb; + void *data; + } side_band; }; union u64_swap { From d56354dc49091e33d9ffca732ac913ed2df70537 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 11 Mar 2019 22:30:51 -0700 Subject: [PATCH 760/855] perf tools: Save bpf_prog_info and BTF of new BPF programs To fully annotate BPF programs with source code mapping, four different pieces of information are needed: 1) PERF_RECORD_KSYMBOL 2) PERF_RECORD_BPF_EVENT 3) bpf_prog_info 4) btf This patch handles 3) and 4) for BPF programs loaded after 'perf record|top'. For timely processing of this information, a dedicated event is added to the side band evlist. When a PERF_RECORD_BPF_EVENT is received via the side band event, the polling thread gathers 3) and 4) via sys_bpf and stores them in perf_env. This information is saved to perf.data at the end of 'perf record'. Committer testing: The 'wakeup_watermark' member in 'struct perf_event_attr' is inside an unnamed union, so it can't be used in a struct designated initializer with older gccs; get it out of that, isolating it as 'attr.wakeup_watermark = 1;' to work with all gcc versions. We also need to add '--no-bpf-event' to the 'perf record' perf_event_attr tests in 'perf test', as that test works by intercepting the events being set up and checking whether they match the fields described in the control files; since it now finds the side band event used to catch the PERF_RECORD_BPF_EVENT first, they all fail. With these issues fixed: Same scenario as for testing BPF programs loaded before 'perf record' or 'perf top' starts, only start the BPF programs after 'perf record|top', so that their information gets collected by the side band thread; the rest works as for programs loaded before monitoring starts. Add the missing 'inline' to bpf_event__add_sb_event() when HAVE_LIBBPF_SUPPORT is not defined, fixing the build on systems without the binutils devel files installed. 
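The unnamed union workaround boils down to this pattern (a sketch mirroring the bpf_event__add_sb_event() hunk further down):

	struct perf_event_attr attr = {
		.type		= PERF_TYPE_SOFTWARE,
		.config		= PERF_COUNT_SW_DUMMY,
		.sample_id_all	= 1,
		.watermark	= 1,
		.bpf_event	= 1,
		.size		= sizeof(attr), /* to capture ABI version */
	};

	/*
	 * wakeup_watermark sits in an unnamed union inside struct
	 * perf_event_attr, so older gccs reject it in the designated
	 * initializer above; assign it separately instead.
	 */
	attr.wakeup_watermark = 1;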
Signed-off-by: Song Liu Reviewed-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stanislav Fomichev Link: http://lkml.kernel.org/r/20190312053051.2690567-16-songliubraving@fb.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 3 + tools/perf/builtin-top.c | 3 + tools/perf/tests/attr/test-record-C0 | 2 +- tools/perf/tests/attr/test-record-basic | 2 +- tools/perf/tests/attr/test-record-branch-any | 2 +- .../tests/attr/test-record-branch-filter-any | 2 +- .../attr/test-record-branch-filter-any_call | 2 +- .../attr/test-record-branch-filter-any_ret | 2 +- .../tests/attr/test-record-branch-filter-hv | 2 +- .../attr/test-record-branch-filter-ind_call | 2 +- .../tests/attr/test-record-branch-filter-k | 2 +- .../tests/attr/test-record-branch-filter-u | 2 +- tools/perf/tests/attr/test-record-count | 2 +- tools/perf/tests/attr/test-record-data | 2 +- tools/perf/tests/attr/test-record-freq | 2 +- .../perf/tests/attr/test-record-graph-default | 2 +- tools/perf/tests/attr/test-record-graph-dwarf | 2 +- tools/perf/tests/attr/test-record-graph-fp | 2 +- tools/perf/tests/attr/test-record-group | 2 +- .../tests/attr/test-record-group-sampling | 2 +- tools/perf/tests/attr/test-record-group1 | 2 +- .../perf/tests/attr/test-record-no-buffering | 2 +- tools/perf/tests/attr/test-record-no-inherit | 2 +- tools/perf/tests/attr/test-record-no-samples | 2 +- tools/perf/tests/attr/test-record-period | 2 +- tools/perf/tests/attr/test-record-raw | 2 +- tools/perf/util/bpf-event.c | 100 ++++++++++++++++++ tools/perf/util/bpf-event.h | 15 +++ 28 files changed, 145 insertions(+), 24 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 6f645fd72fed..4e2d953d4bc5 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1238,6 +1238,9 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) goto out_child; } + if (!opts->no_bpf_event) + bpf_event__add_sb_event(&sb_evlist, &session->header.env); + if (perf_evlist__start_sb_thread(sb_evlist, &rec->opts.target)) { pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n"); opts->no_bpf_event = true; diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 3ce8a8db6c1d..1999d6533d12 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1637,6 +1637,9 @@ int cmd_top(int argc, const char **argv) goto out_delete_evlist; } + if (!top.record_opts.no_bpf_event) + bpf_event__add_sb_event(&sb_evlist, &perf_env); + if (perf_evlist__start_sb_thread(sb_evlist, target)) { pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n"); opts->no_bpf_event = true; diff --git a/tools/perf/tests/attr/test-record-C0 b/tools/perf/tests/attr/test-record-C0 index cb0a3138fa54..93818054ae20 100644 --- a/tools/perf/tests/attr/test-record-C0 +++ b/tools/perf/tests/attr/test-record-C0 @@ -1,6 +1,6 @@ [config] command = record -args = -C 0 kill >/dev/null 2>&1 +args = --no-bpf-event -C 0 kill >/dev/null 2>&1 ret = 1 [event:base-record] diff --git a/tools/perf/tests/attr/test-record-basic b/tools/perf/tests/attr/test-record-basic index 85a23cf35ba1..b0ca42a5ecc9 100644 --- a/tools/perf/tests/attr/test-record-basic +++ b/tools/perf/tests/attr/test-record-basic @@ -1,6 +1,6 @@ [config] command = record -args = kill >/dev/null 2>&1 +args = --no-bpf-event kill >/dev/null 2>&1 
ret = 1 [event:base-record] diff --git a/tools/perf/tests/attr/test-record-branch-any b/tools/perf/tests/attr/test-record-branch-any index 81f839e2fad0..1a99b3ce6b89 100644 --- a/tools/perf/tests/attr/test-record-branch-any +++ b/tools/perf/tests/attr/test-record-branch-any @@ -1,6 +1,6 @@ [config] command = record -args = -b kill >/dev/null 2>&1 +args = --no-bpf-event -b kill >/dev/null 2>&1 ret = 1 [event:base-record] diff --git a/tools/perf/tests/attr/test-record-branch-filter-any b/tools/perf/tests/attr/test-record-branch-filter-any index 357421f4dfce..709768b508c6 100644 --- a/tools/perf/tests/attr/test-record-branch-filter-any +++ b/tools/perf/tests/attr/test-record-branch-filter-any @@ -1,6 +1,6 @@ [config] command = record -args = -j any kill >/dev/null 2>&1 +args = --no-bpf-event -j any kill >/dev/null 2>&1 ret = 1 [event:base-record] diff --git a/tools/perf/tests/attr/test-record-branch-filter-any_call b/tools/perf/tests/attr/test-record-branch-filter-any_call index dbc55f2ab845..f943221f7825 100644 --- a/tools/perf/tests/attr/test-record-branch-filter-any_call +++ b/tools/perf/tests/attr/test-record-branch-filter-any_call @@ -1,6 +1,6 @@ [config] command = record -args = -j any_call kill >/dev/null 2>&1 +args = --no-bpf-event -j any_call kill >/dev/null 2>&1 ret = 1 [event:base-record] diff --git a/tools/perf/tests/attr/test-record-branch-filter-any_ret b/tools/perf/tests/attr/test-record-branch-filter-any_ret index a0824ff8e131..fd4f5b4154a9 100644 --- a/tools/perf/tests/attr/test-record-branch-filter-any_ret +++ b/tools/perf/tests/attr/test-record-branch-filter-any_ret @@ -1,6 +1,6 @@ [config] command = record -args = -j any_ret kill >/dev/null 2>&1 +args = --no-bpf-event -j any_ret kill >/dev/null 2>&1 ret = 1 [event:base-record] diff --git a/tools/perf/tests/attr/test-record-branch-filter-hv b/tools/perf/tests/attr/test-record-branch-filter-hv index f34d6f120181..4e52d685ebe1 100644 --- a/tools/perf/tests/attr/test-record-branch-filter-hv +++ b/tools/perf/tests/attr/test-record-branch-filter-hv @@ -1,6 +1,6 @@ [config] command = record -args = -j hv kill >/dev/null 2>&1 +args = --no-bpf-event -j hv kill >/dev/null 2>&1 ret = 1 [event:base-record] diff --git a/tools/perf/tests/attr/test-record-branch-filter-ind_call b/tools/perf/tests/attr/test-record-branch-filter-ind_call index b86a35232248..e08c6ab3796e 100644 --- a/tools/perf/tests/attr/test-record-branch-filter-ind_call +++ b/tools/perf/tests/attr/test-record-branch-filter-ind_call @@ -1,6 +1,6 @@ [config] command = record -args = -j ind_call kill >/dev/null 2>&1 +args = --no-bpf-event -j ind_call kill >/dev/null 2>&1 ret = 1 [event:base-record] diff --git a/tools/perf/tests/attr/test-record-branch-filter-k b/tools/perf/tests/attr/test-record-branch-filter-k index d3fbc5e1858a..b4b98f84fc2f 100644 --- a/tools/perf/tests/attr/test-record-branch-filter-k +++ b/tools/perf/tests/attr/test-record-branch-filter-k @@ -1,6 +1,6 @@ [config] command = record -args = -j k kill >/dev/null 2>&1 +args = --no-bpf-event -j k kill >/dev/null 2>&1 ret = 1 [event:base-record] diff --git a/tools/perf/tests/attr/test-record-branch-filter-u b/tools/perf/tests/attr/test-record-branch-filter-u index a318f0dda173..fb9610edbb0d 100644 --- a/tools/perf/tests/attr/test-record-branch-filter-u +++ b/tools/perf/tests/attr/test-record-branch-filter-u @@ -1,6 +1,6 @@ [config] command = record -args = -j u kill >/dev/null 2>&1 +args = --no-bpf-event -j u kill >/dev/null 2>&1 ret = 1 [event:base-record] diff --git a/tools/perf/tests/attr/test-record-count 
b/tools/perf/tests/attr/test-record-count index 34f6cc577263..5e9b9019d786 100644 --- a/tools/perf/tests/attr/test-record-count +++ b/tools/perf/tests/attr/test-record-count @@ -1,6 +1,6 @@ [config] command = record -args = -c 123 kill >/dev/null 2>&1 +args = --no-bpf-event -c 123 kill >/dev/null 2>&1 ret = 1 [event:base-record] diff --git a/tools/perf/tests/attr/test-record-data b/tools/perf/tests/attr/test-record-data index a9cf2233b0ce..a99bb13149c2 100644 --- a/tools/perf/tests/attr/test-record-data +++ b/tools/perf/tests/attr/test-record-data @@ -1,6 +1,6 @@ [config] command = record -args = -d kill >/dev/null 2>&1 +args = --no-bpf-event -d kill >/dev/null 2>&1 ret = 1 [event:base-record] diff --git a/tools/perf/tests/attr/test-record-freq b/tools/perf/tests/attr/test-record-freq index bf4cb459f0d5..89e29f6b2ae0 100644 --- a/tools/perf/tests/attr/test-record-freq +++ b/tools/perf/tests/attr/test-record-freq @@ -1,6 +1,6 @@ [config] command = record -args = -F 100 kill >/dev/null 2>&1 +args = --no-bpf-event -F 100 kill >/dev/null 2>&1 ret = 1 [event:base-record] diff --git a/tools/perf/tests/attr/test-record-graph-default b/tools/perf/tests/attr/test-record-graph-default index 0b216e69760c..5d8234d50845 100644 --- a/tools/perf/tests/attr/test-record-graph-default +++ b/tools/perf/tests/attr/test-record-graph-default @@ -1,6 +1,6 @@ [config] command = record -args = -g kill >/dev/null 2>&1 +args = --no-bpf-event -g kill >/dev/null 2>&1 ret = 1 [event:base-record] diff --git a/tools/perf/tests/attr/test-record-graph-dwarf b/tools/perf/tests/attr/test-record-graph-dwarf index da2fa73bd0a2..ae92061d611d 100644 --- a/tools/perf/tests/attr/test-record-graph-dwarf +++ b/tools/perf/tests/attr/test-record-graph-dwarf @@ -1,6 +1,6 @@ [config] command = record -args = --call-graph dwarf -- kill >/dev/null 2>&1 +args = --no-bpf-event --call-graph dwarf -- kill >/dev/null 2>&1 ret = 1 [event:base-record] diff --git a/tools/perf/tests/attr/test-record-graph-fp b/tools/perf/tests/attr/test-record-graph-fp index 625d190bb798..5630521c0b0f 100644 --- a/tools/perf/tests/attr/test-record-graph-fp +++ b/tools/perf/tests/attr/test-record-graph-fp @@ -1,6 +1,6 @@ [config] command = record -args = --call-graph fp kill >/dev/null 2>&1 +args = --no-bpf-event --call-graph fp kill >/dev/null 2>&1 ret = 1 [event:base-record] diff --git a/tools/perf/tests/attr/test-record-group b/tools/perf/tests/attr/test-record-group index 618ba1c17474..14ee60fd3f41 100644 --- a/tools/perf/tests/attr/test-record-group +++ b/tools/perf/tests/attr/test-record-group @@ -1,6 +1,6 @@ [config] command = record -args = --group -e cycles,instructions kill >/dev/null 2>&1 +args = --no-bpf-event --group -e cycles,instructions kill >/dev/null 2>&1 ret = 1 [event-1:base-record] diff --git a/tools/perf/tests/attr/test-record-group-sampling b/tools/perf/tests/attr/test-record-group-sampling index f0729c454f16..300b9f7e6d69 100644 --- a/tools/perf/tests/attr/test-record-group-sampling +++ b/tools/perf/tests/attr/test-record-group-sampling @@ -1,6 +1,6 @@ [config] command = record -args = -e '{cycles,cache-misses}:S' kill >/dev/null 2>&1 +args = --no-bpf-event -e '{cycles,cache-misses}:S' kill >/dev/null 2>&1 ret = 1 [event-1:base-record] diff --git a/tools/perf/tests/attr/test-record-group1 b/tools/perf/tests/attr/test-record-group1 index 48e8bd12fe46..3ffe246e0228 100644 --- a/tools/perf/tests/attr/test-record-group1 +++ b/tools/perf/tests/attr/test-record-group1 @@ -1,6 +1,6 @@ [config] command = record -args = -e '{cycles,instructions}' 
kill >/dev/null 2>&1 +args = --no-bpf-event -e '{cycles,instructions}' kill >/dev/null 2>&1 ret = 1 [event-1:base-record] diff --git a/tools/perf/tests/attr/test-record-no-buffering b/tools/perf/tests/attr/test-record-no-buffering index aa3956d8fe20..583dcbb078ba 100644 --- a/tools/perf/tests/attr/test-record-no-buffering +++ b/tools/perf/tests/attr/test-record-no-buffering @@ -1,6 +1,6 @@ [config] command = record -args = --no-buffering kill >/dev/null 2>&1 +args = --no-bpf-event --no-buffering kill >/dev/null 2>&1 ret = 1 [event:base-record] diff --git a/tools/perf/tests/attr/test-record-no-inherit b/tools/perf/tests/attr/test-record-no-inherit index 560943decb87..15d1dc162e1c 100644 --- a/tools/perf/tests/attr/test-record-no-inherit +++ b/tools/perf/tests/attr/test-record-no-inherit @@ -1,6 +1,6 @@ [config] command = record -args = -i kill >/dev/null 2>&1 +args = --no-bpf-event -i kill >/dev/null 2>&1 ret = 1 [event:base-record] diff --git a/tools/perf/tests/attr/test-record-no-samples b/tools/perf/tests/attr/test-record-no-samples index 8eb73ab639e0..596fbd6d5a2c 100644 --- a/tools/perf/tests/attr/test-record-no-samples +++ b/tools/perf/tests/attr/test-record-no-samples @@ -1,6 +1,6 @@ [config] command = record -args = -n kill >/dev/null 2>&1 +args = --no-bpf-event -n kill >/dev/null 2>&1 ret = 1 [event:base-record] diff --git a/tools/perf/tests/attr/test-record-period b/tools/perf/tests/attr/test-record-period index 69bc748f0f27..119101154c5e 100644 --- a/tools/perf/tests/attr/test-record-period +++ b/tools/perf/tests/attr/test-record-period @@ -1,6 +1,6 @@ [config] command = record -args = -c 100 -P kill >/dev/null 2>&1 +args = --no-bpf-event -c 100 -P kill >/dev/null 2>&1 ret = 1 [event:base-record] diff --git a/tools/perf/tests/attr/test-record-raw b/tools/perf/tests/attr/test-record-raw index a188a614a44c..13a5f7860c78 100644 --- a/tools/perf/tests/attr/test-record-raw +++ b/tools/perf/tests/attr/test-record-raw @@ -1,6 +1,6 @@ [config] command = record -args = -R kill >/dev/null 2>&1 +args = --no-bpf-event -R kill >/dev/null 2>&1 ret = 1 [event:base-record] diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c index 7ffe7db59828..2a8c245ca942 100644 --- a/tools/perf/util/bpf-event.c +++ b/tools/perf/util/bpf-event.c @@ -13,6 +13,7 @@ #include "env.h" #include "session.h" #include "map.h" +#include "evlist.h" #define ptr_to_u64(ptr) ((__u64)(unsigned long)(ptr)) @@ -330,3 +331,102 @@ int perf_event__synthesize_bpf_events(struct perf_session *session, free(event); return err; } + +static void perf_env__add_bpf_info(struct perf_env *env, u32 id) +{ + struct bpf_prog_info_linear *info_linear; + struct bpf_prog_info_node *info_node; + struct btf *btf = NULL; + u64 arrays; + u32 btf_id; + int fd; + + fd = bpf_prog_get_fd_by_id(id); + if (fd < 0) + return; + + arrays = 1UL << BPF_PROG_INFO_JITED_KSYMS; + arrays |= 1UL << BPF_PROG_INFO_JITED_FUNC_LENS; + arrays |= 1UL << BPF_PROG_INFO_FUNC_INFO; + arrays |= 1UL << BPF_PROG_INFO_PROG_TAGS; + arrays |= 1UL << BPF_PROG_INFO_JITED_INSNS; + arrays |= 1UL << BPF_PROG_INFO_LINE_INFO; + arrays |= 1UL << BPF_PROG_INFO_JITED_LINE_INFO; + + info_linear = bpf_program__get_prog_info_linear(fd, arrays); + if (IS_ERR_OR_NULL(info_linear)) { + pr_debug("%s: failed to get BPF program info. 
aborting\n", __func__); + goto out; + } + + btf_id = info_linear->info.btf_id; + + info_node = malloc(sizeof(struct bpf_prog_info_node)); + if (info_node) { + info_node->info_linear = info_linear; + perf_env__insert_bpf_prog_info(env, info_node); + } else + free(info_linear); + + if (btf_id == 0) + goto out; + + if (btf__get_from_id(btf_id, &btf)) { + pr_debug("%s: failed to get BTF of id %u, aborting\n", + __func__, btf_id); + goto out; + } + perf_env__fetch_btf(env, btf_id, btf); + +out: + free(btf); + close(fd); +} + +static int bpf_event__sb_cb(union perf_event *event, void *data) +{ + struct perf_env *env = data; + + if (event->header.type != PERF_RECORD_BPF_EVENT) + return -1; + + switch (event->bpf_event.type) { + case PERF_BPF_EVENT_PROG_LOAD: + perf_env__add_bpf_info(env, event->bpf_event.id); + + case PERF_BPF_EVENT_PROG_UNLOAD: + /* + * Do not free bpf_prog_info and btf of the program here, + * as annotation still need them. They will be freed at + * the end of the session. + */ + break; + default: + pr_debug("unexpected bpf_event type of %d\n", + event->bpf_event.type); + break; + } + + return 0; +} + +int bpf_event__add_sb_event(struct perf_evlist **evlist, + struct perf_env *env) +{ + struct perf_event_attr attr = { + .type = PERF_TYPE_SOFTWARE, + .config = PERF_COUNT_SW_DUMMY, + .sample_id_all = 1, + .watermark = 1, + .bpf_event = 1, + .size = sizeof(attr), /* to capture ABI version */ + }; + + /* + * Older gcc versions don't support designated initializers, like above, + * for unnamed union members, such as the following: + */ + attr.wakeup_watermark = 1; + + return perf_evlist__add_sb_event(evlist, &attr, bpf_event__sb_cb, env); +} diff --git a/tools/perf/util/bpf-event.h b/tools/perf/util/bpf-event.h index b9ec394dc7c7..8cb1189149ec 100644 --- a/tools/perf/util/bpf-event.h +++ b/tools/perf/util/bpf-event.h @@ -4,12 +4,17 @@ #include #include +#include +#include #include "event.h" struct machine; union perf_event; +struct perf_env; struct perf_sample; struct record_opts; +struct evlist; +struct target; struct bpf_prog_info_node { struct bpf_prog_info_linear *info_linear; @@ -31,6 +36,9 @@ int perf_event__synthesize_bpf_events(struct perf_session *session, perf_event__handler_t process, struct machine *machine, struct record_opts *opts); +int bpf_event__add_sb_event(struct perf_evlist **evlist, + struct perf_env *env); + #else static inline int machine__process_bpf_event(struct machine *machine __maybe_unused, union perf_event *event __maybe_unused, @@ -46,5 +54,12 @@ static inline int perf_event__synthesize_bpf_events(struct perf_session *session { return 0; } + +static inline int bpf_event__add_sb_event(struct perf_evlist **evlist __maybe_unused, + struct perf_env *env __maybe_unused) +{ + return 0; +} + #endif // HAVE_LIBBPF_SUPPORT #endif From fc462ac75b36daaa61e9bda7fba66ed1b3a500b4 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 19 Mar 2019 09:54:53 -0700 Subject: [PATCH 761/855] perf bpf: Extract logic to create program names from perf_event__synthesize_one_bpf_prog() Extract logic to create program names to synthesize_bpf_prog_name(), so that it can be reused in header.c:print_bpf_prog_info(). This commit doesn't change the behavior. 
Signed-off-by: Song Liu Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stanislav Fomichev Link: http://lkml.kernel.org/r/20190319165454.1298742-2-songliubraving@fb.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf-event.c | 62 +++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 27 deletions(-) diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c index 2a8c245ca942..d5b041649f26 100644 --- a/tools/perf/util/bpf-event.c +++ b/tools/perf/util/bpf-event.c @@ -111,6 +111,38 @@ static int perf_env__fetch_btf(struct perf_env *env, return 0; } +static int synthesize_bpf_prog_name(char *buf, int size, + struct bpf_prog_info *info, + struct btf *btf, + u32 sub_id) +{ + u8 (*prog_tags)[BPF_TAG_SIZE] = (void *)(uintptr_t)(info->prog_tags); + void *func_infos = (void *)(uintptr_t)(info->func_info); + u32 sub_prog_cnt = info->nr_jited_ksyms; + const struct bpf_func_info *finfo; + const char *short_name = NULL; + const struct btf_type *t; + int name_len; + + name_len = snprintf(buf, size, "bpf_prog_"); + name_len += snprintf_hex(buf + name_len, size - name_len, + prog_tags[sub_id], BPF_TAG_SIZE); + if (btf) { + finfo = func_infos + sub_id * info->func_info_rec_size; + t = btf__type_by_id(btf, finfo->type_id); + short_name = btf__name_by_offset(btf, t->name_off); + } else if (sub_id == 0 && sub_prog_cnt == 1) { + /* no subprog */ + if (info->name[0]) + short_name = info->name; + } else + short_name = "F"; + if (short_name) + name_len += snprintf(buf + name_len, size - name_len, + "_%s", short_name); + return name_len; +} + /* * Synthesize PERF_RECORD_KSYMBOL and PERF_RECORD_BPF_EVENT for one bpf * program. One PERF_RECORD_BPF_EVENT is generated for the program. 
And @@ -135,7 +167,6 @@ static int perf_event__synthesize_one_bpf_prog(struct perf_session *session, struct bpf_prog_info_node *info_node; struct bpf_prog_info *info; struct btf *btf = NULL; - bool has_btf = false; struct perf_env *env; u32 sub_prog_cnt, i; int err = 0; @@ -189,19 +220,13 @@ static int perf_event__synthesize_one_bpf_prog(struct perf_session *session, btf = NULL; goto out; } - has_btf = true; perf_env__fetch_btf(env, info->btf_id, btf); } /* Synthesize PERF_RECORD_KSYMBOL */ for (i = 0; i < sub_prog_cnt; i++) { - u8 (*prog_tags)[BPF_TAG_SIZE] = (void *)(uintptr_t)(info->prog_tags); - __u32 *prog_lens = (__u32 *)(uintptr_t)(info->jited_func_lens); + __u32 *prog_lens = (__u32 *)(uintptr_t)(info->jited_func_lens); __u64 *prog_addrs = (__u64 *)(uintptr_t)(info->jited_ksyms); - void *func_infos = (void *)(uintptr_t)(info->func_info); - const struct bpf_func_info *finfo; - const char *short_name = NULL; - const struct btf_type *t; int name_len; *ksymbol_event = (struct ksymbol_event){ @@ -214,26 +239,9 @@ static int perf_event__synthesize_one_bpf_prog(struct perf_session *session, .ksym_type = PERF_RECORD_KSYMBOL_TYPE_BPF, .flags = 0, }; - name_len = snprintf(ksymbol_event->name, KSYM_NAME_LEN, - "bpf_prog_"); - name_len += snprintf_hex(ksymbol_event->name + name_len, - KSYM_NAME_LEN - name_len, - prog_tags[i], BPF_TAG_SIZE); - if (has_btf) { - finfo = func_infos + i * info->func_info_rec_size; - t = btf__type_by_id(btf, finfo->type_id); - short_name = btf__name_by_offset(btf, t->name_off); - } else if (i == 0 && sub_prog_cnt == 1) { - /* no subprog */ - if (info->name[0]) - short_name = info->name; - } else - short_name = "F"; - if (short_name) - name_len += snprintf(ksymbol_event->name + name_len, - KSYM_NAME_LEN - name_len, - "_%s", short_name); + name_len = synthesize_bpf_prog_name(ksymbol_event->name, + KSYM_NAME_LEN, info, btf, i); ksymbol_event->header.size += PERF_ALIGN(name_len + 1, sizeof(u64)); From f8dfeae009effc0b6dac2741cf8d7cbb91edb982 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 19 Mar 2019 09:54:54 -0700 Subject: [PATCH 762/855] perf bpf: Show more BPF program info in print_bpf_prog_info() This patch enables showing bpf program name, address, and size in the header. Before the patch: perf report --header-only ... # bpf_prog_info of id 9 # bpf_prog_info of id 10 # bpf_prog_info of id 13 After the patch: # bpf_prog_info 9: bpf_prog_7be49e3934a125ba addr 0xffffffffa0024947 size 229 # bpf_prog_info 10: bpf_prog_2a142ef67aaad174 addr 0xffffffffa007c94d size 229 # bpf_prog_info 13: bpf_prog_47368425825d7384_task__task_newt addr 0xffffffffa0251137 size 369 Committer notes: Fix the fallback definition when HAVE_LIBBPF_SUPPORT is not defined, i.e. add the missing 'static inline' and add the __maybe_unused to the args. Also add stdio.h since we now use FILE * in bpf-event.h. 
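The fixed fallback follows the usual perf stub pattern for builds without libbpf, roughly:

#ifndef HAVE_LIBBPF_SUPPORT
static inline void bpf_event__print_bpf_prog_info(struct bpf_prog_info *info __maybe_unused,
						  struct perf_env *env __maybe_unused,
						  FILE *fp __maybe_unused)
{
}
#endif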
Signed-off-by: Song Liu Tested-by: Arnaldo Carvalho de Melo Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stanislav Fomichev Link: http://lkml.kernel.org/r/20190319165454.1298742-3-songliubraving@fb.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf-event.c | 40 +++++++++++++++++++++++++++++++++++++ tools/perf/util/bpf-event.h | 11 +++++++++- tools/perf/util/header.c | 5 +++-- 3 files changed, 53 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c index d5b041649f26..2a4a0da35632 100644 --- a/tools/perf/util/bpf-event.c +++ b/tools/perf/util/bpf-event.c @@ -438,3 +438,43 @@ int bpf_event__add_sb_event(struct perf_evlist **evlist, return perf_evlist__add_sb_event(evlist, &attr, bpf_event__sb_cb, env); } + +void bpf_event__print_bpf_prog_info(struct bpf_prog_info *info, + struct perf_env *env, + FILE *fp) +{ + __u32 *prog_lens = (__u32 *)(uintptr_t)(info->jited_func_lens); + __u64 *prog_addrs = (__u64 *)(uintptr_t)(info->jited_ksyms); + char name[KSYM_NAME_LEN]; + struct btf *btf = NULL; + u32 sub_prog_cnt, i; + + sub_prog_cnt = info->nr_jited_ksyms; + if (sub_prog_cnt != info->nr_prog_tags || + sub_prog_cnt != info->nr_jited_func_lens) + return; + + if (info->btf_id) { + struct btf_node *node; + + node = perf_env__find_btf(env, info->btf_id); + if (node) + btf = btf__new((__u8 *)(node->data), + node->data_size); + } + + if (sub_prog_cnt == 1) { + synthesize_bpf_prog_name(name, KSYM_NAME_LEN, info, btf, 0); + fprintf(fp, "# bpf_prog_info %u: %s addr 0x%llx size %u\n", + info->id, name, prog_addrs[0], prog_lens[0]); + return; + } + + fprintf(fp, "# bpf_prog_info %u:\n", info->id); + for (i = 0; i < sub_prog_cnt; i++) { + synthesize_bpf_prog_name(name, KSYM_NAME_LEN, info, btf, i); + + fprintf(fp, "# \tsub_prog %u: %s addr 0x%llx size %u\n", + i, name, prog_addrs[i], prog_lens[i]); + } +} diff --git a/tools/perf/util/bpf-event.h b/tools/perf/util/bpf-event.h index 8cb1189149ec..04c33b3bfe28 100644 --- a/tools/perf/util/bpf-event.h +++ b/tools/perf/util/bpf-event.h @@ -7,6 +7,7 @@ #include #include #include "event.h" +#include struct machine; union perf_event; @@ -38,7 +39,9 @@ int perf_event__synthesize_bpf_events(struct perf_session *session, struct record_opts *opts); int bpf_event__add_sb_event(struct perf_evlist **evlist, struct perf_env *env); - +void bpf_event__print_bpf_prog_info(struct bpf_prog_info *info, + struct perf_env *env, + FILE *fp); #else static inline int machine__process_bpf_event(struct machine *machine __maybe_unused, union perf_event *event __maybe_unused, @@ -61,5 +64,11 @@ static inline int bpf_event__add_sb_event(struct perf_evlist **evlist __maybe_un return 0; } +static inline void bpf_event__print_bpf_prog_info(struct bpf_prog_info *info __maybe_unused, + struct perf_env *env __maybe_unused, + FILE *fp __maybe_unused) +{ + +} #endif // HAVE_LIBBPF_SUPPORT #endif diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 01dda2f65d36..b9e693825873 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1468,8 +1468,9 @@ static void print_bpf_prog_info(struct feat_fd *ff, FILE *fp) node = rb_entry(next, struct bpf_prog_info_node, rb_node); next = rb_next(&node->rb_node); - fprintf(fp, "# bpf_prog_info of id %u\n", - node->info_linear->info.id); + + bpf_event__print_bpf_prog_info(&node->info_linear->info, + env, fp); } up_read(&env->bpf_progs.lock); From f27b744baaa646a7c0a01443cc0d8b4787cac2f7 Mon Sep 17 00:00:00 2001 From: 
YueHaibing Date: Thu, 21 Mar 2019 23:14:48 +0800 Subject: [PATCH 763/855] irqchip/irq-mvebu-sei: Make mvebu_sei_ap806_caps static Fix sparse warning: drivers/irqchip/irq-mvebu-sei.c:481:23: warning: symbol 'mvebu_sei_ap806_caps' was not declared. Should it be static? Signed-off-by: YueHaibing Signed-off-by: Thomas Gleixner Cc: Cc: Cc: Cc: Cc: Cc: Link: https://lkml.kernel.org/r/20190321151448.15600-1-yuehaibing@huawei.com --- drivers/irqchip/irq-mvebu-sei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-mvebu-sei.c b/drivers/irqchip/irq-mvebu-sei.c index add4c9c934c8..18832ccc8ff8 100644 --- a/drivers/irqchip/irq-mvebu-sei.c +++ b/drivers/irqchip/irq-mvebu-sei.c @@ -478,7 +478,7 @@ dispose_irq: return ret; } -struct mvebu_sei_caps mvebu_sei_ap806_caps = { +static struct mvebu_sei_caps mvebu_sei_ap806_caps = { .ap_range = { .first = 0, .size = 21, From 33872d79f5d1cbedaaab79669cc38f16097a9450 Mon Sep 17 00:00:00 2001 From: Erik Hugne Date: Thu, 21 Mar 2019 09:11:59 +0100 Subject: [PATCH 764/855] tipc: fix cancellation of topology subscriptions When cancelling a subscription, we have to clear the cancel bit in the request before iterating over any established subscriptions with memcmp. Otherwise no subscription will ever be found, and it will not be possible to explicitly unsubscribe individual subscriptions. Fixes: 8985ecc7c1e0 ("tipc: simplify endianness handling in topology subscriber") Signed-off-by: Erik Hugne Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/topsrv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index 4a708a4e8583..b45932d78004 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -363,6 +363,7 @@ static int tipc_conn_rcv_sub(struct tipc_topsrv *srv, struct tipc_subscription *sub; if (tipc_sub_read(s, filter) & TIPC_SUB_CANCEL) { + s->filter &= __constant_ntohl(~TIPC_SUB_CANCEL); tipc_conn_delete_sub(con, s); return 0; } From ceabee6c59943bdd5e1da1a6a20dc7ee5f8113a2 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 21 Mar 2019 15:02:50 +0800 Subject: [PATCH 765/855] genetlink: Fix a memory leak on error path In genl_register_family(), when idr_alloc() fails, we forget to free the memory we possibly allocate for family->attrbuf. Reported-by: Hulk Robot Fixes: 2ae0f17df1cd ("genetlink: use idr to track families") Signed-off-by: YueHaibing Reviewed-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/netlink/genetlink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 25eeb6d2a75a..f0ec068e1d02 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -366,7 +366,7 @@ int genl_register_family(struct genl_family *family) start, end + 1, GFP_KERNEL); if (family->id < 0) { err = family->id; - goto errout_locked; + goto errout_free; } err = genl_validate_assign_mc_groups(family); @@ -385,6 +385,7 @@ int genl_register_family(struct genl_family *family) errout_remove: idr_remove(&genl_fam_idr, family->id); +errout_free: kfree(family->attrbuf); errout_locked: genl_unlock_all(); From 06acc17a96215a11134114aee26532b12dc8fde1 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Wed, 20 Mar 2019 07:36:55 -0500 Subject: [PATCH 766/855] net: phy: Add DP83825I to the DP83822 driver Add the DP83825I ethernet PHY to the DP83822 driver. These devices share the same WoL register bits and addresses. 
The phy_driver init was made into a macro as there may be future devices appended to this driver that will share the register space. http://www.ti.com/lit/gpn/dp83825i Reviewed-by: Florian Fainelli Signed-off-by: Dan Murphy Signed-off-by: David S. Miller --- drivers/net/phy/dp83822.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/drivers/net/phy/dp83822.c b/drivers/net/phy/dp83822.c index bbd8c22067f3..97d45bd5b38e 100644 --- a/drivers/net/phy/dp83822.c +++ b/drivers/net/phy/dp83822.c @@ -15,6 +15,8 @@ #include #define DP83822_PHY_ID 0x2000a240 +#define DP83825I_PHY_ID 0x2000a150 + #define DP83822_DEVADDR 0x1f #define MII_DP83822_PHYSCR 0x11 @@ -304,26 +306,30 @@ static int dp83822_resume(struct phy_device *phydev) return 0; } +#define DP83822_PHY_DRIVER(_id, _name) \ + { \ + PHY_ID_MATCH_MODEL(_id), \ + .name = (_name), \ + .features = PHY_BASIC_FEATURES, \ + .soft_reset = dp83822_phy_reset, \ + .config_init = dp83822_config_init, \ + .get_wol = dp83822_get_wol, \ + .set_wol = dp83822_set_wol, \ + .ack_interrupt = dp83822_ack_interrupt, \ + .config_intr = dp83822_config_intr, \ + .suspend = dp83822_suspend, \ + .resume = dp83822_resume, \ + } + static struct phy_driver dp83822_driver[] = { - { - .phy_id = DP83822_PHY_ID, - .phy_id_mask = 0xfffffff0, - .name = "TI DP83822", - .features = PHY_BASIC_FEATURES, - .config_init = dp83822_config_init, - .soft_reset = dp83822_phy_reset, - .get_wol = dp83822_get_wol, - .set_wol = dp83822_set_wol, - .ack_interrupt = dp83822_ack_interrupt, - .config_intr = dp83822_config_intr, - .suspend = dp83822_suspend, - .resume = dp83822_resume, - }, + DP83822_PHY_DRIVER(DP83822_PHY_ID, "TI DP83822"), + DP83822_PHY_DRIVER(DP83825I_PHY_ID, "TI DP83825I"), }; module_phy_driver(dp83822_driver); static struct mdio_device_id __maybe_unused dp83822_tbl[] = { { DP83822_PHY_ID, 0xfffffff0 }, + { DP83825I_PHY_ID, 0xfffffff0 }, { }, }; MODULE_DEVICE_TABLE(mdio, dp83822_tbl); From cd5afa91f078c0787be0a62b5ef90301c00b0271 Mon Sep 17 00:00:00 2001 From: Harini Katakam Date: Wed, 20 Mar 2019 19:12:22 +0530 Subject: [PATCH 767/855] net: macb: Add null check for PCLK and HCLK Both PCLK and HCLK are "required" clocks according to macb devicetree documentation. There is a chance that devm_clk_get doesn't return a negative error but just a NULL clock structure instead. In such a case the driver proceeds as usual and uses pclk value 0 to calculate MDC divisor which is incorrect. Hence fix the same in clock initialization. Signed-off-by: Harini Katakam Signed-off-by: David S. 
Miller --- drivers/net/ethernet/cadence/macb_main.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index ad099fd01b45..1522aee81884 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -3370,14 +3370,20 @@ static int macb_clk_init(struct platform_device *pdev, struct clk **pclk, *hclk = devm_clk_get(&pdev->dev, "hclk"); } - if (IS_ERR(*pclk)) { + if (IS_ERR_OR_NULL(*pclk)) { err = PTR_ERR(*pclk); + if (!err) + err = -ENODEV; + dev_err(&pdev->dev, "failed to get macb_clk (%u)\n", err); return err; } - if (IS_ERR(*hclk)) { + if (IS_ERR_OR_NULL(*hclk)) { err = PTR_ERR(*hclk); + if (!err) + err = -ENODEV; + dev_err(&pdev->dev, "failed to get hclk (%u)\n", err); return err; } From 85d0966fa57e0ef2d30d913c98ca93674f7a03c9 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 20 Mar 2019 14:59:59 +0100 Subject: [PATCH 768/855] net/sched: prepare TC actions to properly validate the control action - pass a pointer to struct tcf_proto in each actions's init() handler, to allow validating the control action, checking whether the chain exists and (eventually) refcounting it. - remove code that validates the control action after a successful call to the action's init() handler, and replace it with a test that forbids addition of actions having 'goto_chain' and NULL goto_chain pointer at the same time. - add tcf_action_check_ctrlact(), that will validate the control action and eventually allocate the action 'goto_chain' within the init() handler. - add tcf_action_set_ctrlact(), that will assign the control action and swap the current 'goto_chain' pointer with the new given one. This disallows 'goto_chain' on actions that don't initialize it properly in their init() handler, i.e. calling tcf_action_check_ctrlact() after successful IDR reservation and then calling tcf_action_set_ctrlact() to assign 'goto_chain' and 'tcf_action' consistently. By doing this, the kernel does not leak anymore refcounts when a valid 'goto chain' handle is replaced in TC actions, causing kmemleak splats like the following one: # tc chain add dev dd0 chain 42 ingress protocol ip flower \ > ip_proto tcp action drop # tc chain add dev dd0 chain 43 ingress protocol ip flower \ > ip_proto udp action drop # tc filter add dev dd0 ingress matchall \ > action gact goto chain 42 index 66 # tc filter replace dev dd0 ingress matchall \ > action gact goto chain 43 index 66 # echo scan >/sys/kernel/debug/kmemleak <...> unreferenced object 0xffff93c0ee09f000 (size 1024): comm "tc", pid 2565, jiffies 4295339808 (age 65.426s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 08 00 06 00 00 00 00 00 00 00 00 00 ................ 
backtrace: [<000000009b63f92d>] tc_ctl_chain+0x3d2/0x4c0 [<00000000683a8d72>] rtnetlink_rcv_msg+0x263/0x2d0 [<00000000ddd88f8e>] netlink_rcv_skb+0x4a/0x110 [<000000006126a348>] netlink_unicast+0x1a0/0x250 [<00000000b3340877>] netlink_sendmsg+0x2c1/0x3c0 [<00000000a25a2171>] sock_sendmsg+0x36/0x40 [<00000000f19ee1ec>] ___sys_sendmsg+0x280/0x2f0 [<00000000d0422042>] __sys_sendmsg+0x5e/0xa0 [<000000007a6c61f9>] do_syscall_64+0x5b/0x180 [<00000000ccd07542>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [<0000000013eaa334>] 0xffffffffffffffff Fixes: db50514f9a9c ("net: sched: add termination action to allow goto chain") Fixes: 97763dc0f401 ("net_sched: reject unknown tcfa_action values") Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- include/net/act_api.h | 7 ++- net/sched/act_api.c | 97 ++++++++++++++++++++++---------------- net/sched/act_bpf.c | 2 +- net/sched/act_connmark.c | 1 + net/sched/act_csum.c | 2 +- net/sched/act_gact.c | 2 +- net/sched/act_ife.c | 2 +- net/sched/act_ipt.c | 11 +++-- net/sched/act_mirred.c | 1 + net/sched/act_nat.c | 3 +- net/sched/act_pedit.c | 2 +- net/sched/act_police.c | 1 + net/sched/act_sample.c | 2 +- net/sched/act_simple.c | 2 +- net/sched/act_skbedit.c | 1 + net/sched/act_skbmod.c | 1 + net/sched/act_tunnel_key.c | 1 + net/sched/act_vlan.c | 2 +- 18 files changed, 84 insertions(+), 56 deletions(-) diff --git a/include/net/act_api.h b/include/net/act_api.h index c745e9ccfab2..54fbb49bd08a 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -90,7 +90,7 @@ struct tc_action_ops { int (*lookup)(struct net *net, struct tc_action **a, u32 index); int (*init)(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **act, int ovr, - int bind, bool rtnl_held, + int bind, bool rtnl_held, struct tcf_proto *tp, struct netlink_ext_ack *extack); int (*walk)(struct net *, struct sk_buff *, struct netlink_callback *, int, @@ -181,6 +181,11 @@ int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int); int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int); int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int); +int tcf_action_check_ctrlact(int action, struct tcf_proto *tp, + struct tcf_chain **handle, + struct netlink_ext_ack *newchain); +struct tcf_chain *tcf_action_set_ctrlact(struct tc_action *a, int action, + struct tcf_chain *newchain); #endif /* CONFIG_NET_CLS_ACT */ static inline void tcf_action_stats_update(struct tc_action *a, u64 bytes, diff --git a/net/sched/act_api.c b/net/sched/act_api.c index aecf1bf233c8..fe67b98ac641 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -28,23 +28,6 @@ #include #include -static int tcf_action_goto_chain_init(struct tc_action *a, struct tcf_proto *tp) -{ - u32 chain_index = a->tcfa_action & TC_ACT_EXT_VAL_MASK; - - if (!tp) - return -EINVAL; - a->goto_chain = tcf_chain_get_by_act(tp->chain->block, chain_index); - if (!a->goto_chain) - return -ENOMEM; - return 0; -} - -static void tcf_action_goto_chain_fini(struct tc_action *a) -{ - tcf_chain_put_by_act(a->goto_chain); -} - static void tcf_action_goto_chain_exec(const struct tc_action *a, struct tcf_result *res) { @@ -71,6 +54,53 @@ static void tcf_set_action_cookie(struct tc_cookie __rcu **old_cookie, call_rcu(&old->rcu, tcf_free_cookie_rcu); } +int tcf_action_check_ctrlact(int action, struct tcf_proto *tp, + struct tcf_chain **newchain, + struct netlink_ext_ack *extack) +{ + int opcode = TC_ACT_EXT_OPCODE(action), ret = -EINVAL; + u32 chain_index; + + if (!opcode) + ret = 
action > TC_ACT_VALUE_MAX ? -EINVAL : 0; + else if (opcode <= TC_ACT_EXT_OPCODE_MAX || action == TC_ACT_UNSPEC) + ret = 0; + if (ret) { + NL_SET_ERR_MSG(extack, "invalid control action"); + goto end; + } + + if (TC_ACT_EXT_CMP(action, TC_ACT_GOTO_CHAIN)) { + chain_index = action & TC_ACT_EXT_VAL_MASK; + if (!tp || !newchain) { + ret = -EINVAL; + NL_SET_ERR_MSG(extack, + "can't goto NULL proto/chain"); + goto end; + } + *newchain = tcf_chain_get_by_act(tp->chain->block, chain_index); + if (!*newchain) { + ret = -ENOMEM; + NL_SET_ERR_MSG(extack, + "can't allocate goto_chain"); + } + } +end: + return ret; +} +EXPORT_SYMBOL(tcf_action_check_ctrlact); + +struct tcf_chain *tcf_action_set_ctrlact(struct tc_action *a, int action, + struct tcf_chain *newchain) +{ + struct tcf_chain *oldchain = a->goto_chain; + + a->tcfa_action = action; + a->goto_chain = newchain; + return oldchain; +} +EXPORT_SYMBOL(tcf_action_set_ctrlact); + /* XXX: For standalone actions, we don't need a RCU grace period either, because * actions are always connected to filters and filters are already destroyed in * RCU callbacks, so after a RCU grace period actions are already disconnected @@ -78,13 +108,15 @@ static void tcf_set_action_cookie(struct tc_cookie __rcu **old_cookie, */ static void free_tcf(struct tc_action *p) { + struct tcf_chain *chain = p->goto_chain; + free_percpu(p->cpu_bstats); free_percpu(p->cpu_bstats_hw); free_percpu(p->cpu_qstats); tcf_set_action_cookie(&p->act_cookie, NULL); - if (p->goto_chain) - tcf_action_goto_chain_fini(p); + if (chain) + tcf_chain_put_by_act(chain); kfree(p); } @@ -800,15 +832,6 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb) return c; } -static bool tcf_action_valid(int action) -{ - int opcode = TC_ACT_EXT_OPCODE(action); - - if (!opcode) - return action <= TC_ACT_VALUE_MAX; - return opcode <= TC_ACT_EXT_OPCODE_MAX || action == TC_ACT_UNSPEC; -} - struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, @@ -890,10 +913,10 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, /* backward compatibility for policer */ if (name == NULL) err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind, - rtnl_held, extack); + rtnl_held, tp, extack); else err = a_o->init(net, nla, est, &a, ovr, bind, rtnl_held, - extack); + tp, extack); if (err < 0) goto err_mod; @@ -907,18 +930,10 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, if (err != ACT_P_CREATED) module_put(a_o->owner); - if (TC_ACT_EXT_CMP(a->tcfa_action, TC_ACT_GOTO_CHAIN)) { - err = tcf_action_goto_chain_init(a, tp); - if (err) { - tcf_action_destroy_1(a, bind); - NL_SET_ERR_MSG(extack, "Failed to init TC action chain"); - return ERR_PTR(err); - } - } - - if (!tcf_action_valid(a->tcfa_action)) { + if (TC_ACT_EXT_CMP(a->tcfa_action, TC_ACT_GOTO_CHAIN) && + !a->goto_chain) { tcf_action_destroy_1(a, bind); - NL_SET_ERR_MSG(extack, "Invalid control action value"); + NL_SET_ERR_MSG(extack, "can't use goto chain with NULL chain"); return ERR_PTR(-EINVAL); } diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index aa5c38d11a30..3c0468f2aae6 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -278,7 +278,7 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog, static int tcf_bpf_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **act, int replace, int bind, bool rtnl_held, - struct netlink_ext_ack *extack) + struct tcf_proto 
*tp, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, bpf_net_id); struct nlattr *tb[TCA_ACT_BPF_MAX + 1]; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 5d24993cccfe..44aa046a92ea 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -97,6 +97,7 @@ static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = { static int tcf_connmark_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, + struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, connmark_net_id); diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index c79aca29505e..9ba0f61a1e82 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -46,7 +46,7 @@ static struct tc_action_ops act_csum_ops; static int tcf_csum_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, - int bind, bool rtnl_held, + int bind, bool rtnl_held, struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, csum_net_id); diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 93da0004e9f4..b8ad311bd8cc 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -57,7 +57,7 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = { static int tcf_gact_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct netlink_ext_ack *extack) + struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, gact_net_id); struct nlattr *tb[TCA_GACT_MAX + 1]; diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 9b1f2b3990ee..c1ba74d5c1e3 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -469,7 +469,7 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, static int tcf_ife_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct netlink_ext_ack *extack) + struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ife_net_id); struct nlattr *tb[TCA_IFE_MAX + 1]; diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 98f5b6ea77b4..04a0b5c61194 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -97,7 +97,8 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = { static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - const struct tc_action_ops *ops, int ovr, int bind) + const struct tc_action_ops *ops, int ovr, int bind, + struct tcf_proto *tp) { struct tc_action_net *tn = net_generic(net, id); struct nlattr *tb[TCA_IPT_MAX + 1]; @@ -205,20 +206,20 @@ err1: static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, - int bind, bool rtnl_held, + int bind, bool rtnl_held, struct tcf_proto *tp, struct netlink_ext_ack *extack) { return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, ovr, - bind); + bind, tp); } static int tcf_xt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, - int bind, bool unlocked, + int bind, bool unlocked, struct tcf_proto *tp, struct netlink_ext_ack *extack) { return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, ovr, - bind); + bind, tp); } static int tcf_ipt_act(struct sk_buff *skb, 
const struct tc_action *a, diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 6692fd054617..383f4024452c 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -94,6 +94,7 @@ static struct tc_action_ops act_mirred_ops; static int tcf_mirred_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, + struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, mirred_net_id); diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 543eab9193f1..de4b493e26d2 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -38,7 +38,8 @@ static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, - bool rtnl_held, struct netlink_ext_ack *extack) + bool rtnl_held, struct tcf_proto *tp, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, nat_net_id); struct nlattr *tb[TCA_NAT_MAX + 1]; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index a80373878df7..8ca82aefa11a 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -138,7 +138,7 @@ nla_failure: static int tcf_pedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct netlink_ext_ack *extack) + struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, pedit_net_id); struct nlattr *tb[TCA_PEDIT_MAX + 1]; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 8271a6263824..229eba7925e5 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -83,6 +83,7 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { static int tcf_police_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, + struct tcf_proto *tp, struct netlink_ext_ack *extack) { int ret = 0, tcfp_result = TC_ACT_OK, err, size; diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 203e399e5c85..36b8adbe935d 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -37,7 +37,7 @@ static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = { static int tcf_sample_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, - int bind, bool rtnl_held, + int bind, bool rtnl_held, struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, sample_net_id); diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index d54cb608dbaf..4916dc3e3668 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -78,7 +78,7 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = { static int tcf_simp_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct netlink_ext_ack *extack) + struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, simp_net_id); struct nlattr *tb[TCA_DEF_MAX + 1]; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 65879500b688..4566eff3d027 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -96,6 +96,7 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int 
ovr, int bind, bool rtnl_held, + struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index 7bac1d78e7a3..b9ab2c8f07f1 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -82,6 +82,7 @@ static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = { static int tcf_skbmod_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, + struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbmod_net_id); diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 7c6591b991d5..fc295d91559a 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -210,6 +210,7 @@ static void tunnel_key_release_params(struct tcf_tunnel_key_params *p) static int tunnel_key_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, + struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index ac0061599225..4651ee15e35d 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -105,7 +105,7 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = { static int tcf_vlan_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, - struct netlink_ext_ack *extack) + struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, vlan_net_id); struct nlattr *tb[TCA_VLAN_MAX + 1]; From 4e1810049c267c203c19130301203d0591174535 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 20 Mar 2019 15:00:00 +0100 Subject: [PATCH 769/855] net/sched: act_bpf: validate the control action inside init() the following script: # tc filter add dev crash0 egress matchall \ > action bpf bytecode '1,6 0 0 4294967295' pass index 90 # tc actions replace action bpf \ > bytecode '1,6 0 0 4294967295' goto chain 42 index 90 cookie c1a0c1a0 # tc action show action bpf had the following output: Error: Failed to init TC action chain. We have an error talking to the kernel total acts 1 action order 0: bpf bytecode '1,6 0 0 4294967295' default-action goto chain 42 index 90 ref 2 bind 1 cookie c1a0c1a0 Then, the first packet transmitted by crash0 made the kernel crash: RIP: 0010:tcf_action_exec+0xb8/0x100 Code: 00 00 00 20 74 1d 83 f8 03 75 09 49 83 c4 08 4d 39 ec 75 bc 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 49 8b 97 a8 00 00 00 <48> 8b 12 48 89 55 00 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 RSP: 0018:ffffb3a0803dfa90 EFLAGS: 00010246 RAX: 000000002000002a RBX: ffff942b347ada00 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffb3a08034d038 RDI: ffff942b347ada00 RBP: ffffb3a0803dfb30 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: ffffb3a0803dfb0c R12: ffff942b3b682b00 R13: ffff942b3b682b08 R14: 0000000000000001 R15: ffff942b3b682f00 FS: 00007f6160a72740(0000) GS:ffff942b3da80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000000795a4002 CR4: 00000000001606e0 Call Trace: tcf_classify+0x58/0x120 __dev_queue_xmit+0x40a/0x890 ? ip_finish_output2+0x16f/0x430 ip_finish_output2+0x16f/0x430 ? ip_output+0x69/0xe0 ip_output+0x69/0xe0 ? 
ip_forward_options+0x1a0/0x1a0 ip_send_skb+0x15/0x40 raw_sendmsg+0x8e1/0xbd0 ? sched_clock+0x5/0x10 ? sched_clock_cpu+0xc/0xa0 ? try_to_wake_up+0x54/0x480 ? ldsem_down_read+0x3f/0x280 ? _cond_resched+0x15/0x40 ? down_read+0xe/0x30 ? copy_termios+0x1e/0x70 ? tty_mode_ioctl+0x1b6/0x4c0 ? sock_sendmsg+0x36/0x40 sock_sendmsg+0x36/0x40 __sys_sendto+0x10e/0x140 ? do_vfs_ioctl+0xa4/0x640 ? handle_mm_fault+0xdc/0x210 ? syscall_trace_enter+0x1df/0x2e0 ? __audit_syscall_exit+0x216/0x260 __x64_sys_sendto+0x24/0x30 do_syscall_64+0x5b/0x180 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f615f7e3c03 Code: 48 8b 0d 90 62 2c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d 9d c3 2c 00 00 75 13 49 89 ca b8 2c 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 34 c3 48 83 ec 08 e8 4b cc 00 00 48 89 04 24 RSP: 002b:00007ffee5d8cc28 EFLAGS: 00000246 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 000055a4f28f1700 RCX: 00007f615f7e3c03 RDX: 0000000000000040 RSI: 000055a4f28f1700 RDI: 0000000000000003 RBP: 00007ffee5d8e340 R08: 000055a4f28ee510 R09: 0000000000000010 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000040 R13: 000055a4f28f16c0 R14: 000055a4f28ef69c R15: 0000000000000080 Modules linked in: act_bpf veth ip6table_filter ip6_tables iptable_filter binfmt_misc ext4 mbcache crct10dif_pclmul jbd2 crc32_pclmul snd_hda_codec_generic ghash_clmulni_intel snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_seq snd_seq_device snd_pcm aesni_intel crypto_simd cryptd glue_helper pcspkr joydev virtio_balloon snd_timer snd i2c_piix4 soundcore nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs ata_generic pata_acpi qxl drm_kms_helper virtio_blk virtio_net virtio_console net_failover failover syscopyarea sysfillrect sysimgblt fb_sys_fops ttm drm crc32c_intel ata_piix serio_raw libata virtio_pci virtio_ring virtio floppy dm_mirror dm_region_hash dm_log dm_mod CR2: 0000000000000000 Validating the control action within tcf_bpf_init() proved to fix the above issue. A TDC selftest is added to verify the correct behavior. Fixes: db50514f9a9c ("net: sched: add termination action to allow goto chain") Fixes: 97763dc0f401 ("net_sched: reject unknown tcfa_action values") Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- net/sched/act_bpf.c | 23 +++++++++++++---- .../tc-testing/tc-tests/actions/bpf.json | 25 +++++++++++++++++++ 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 3c0468f2aae6..3841156aa09f 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -282,6 +283,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, { struct tc_action_net *tn = net_generic(net, bpf_net_id); struct nlattr *tb[TCA_ACT_BPF_MAX + 1]; + struct tcf_chain *goto_ch = NULL; struct tcf_bpf_cfg cfg, old; struct tc_act_bpf *parm; struct tcf_bpf *prog; @@ -323,12 +325,16 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, return ret; } + ret = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); + if (ret < 0) + goto release_idr; + is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS]; is_ebpf = tb[TCA_ACT_BPF_FD]; if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf)) { ret = -EINVAL; - goto out; + goto put_chain; } memset(&cfg, 0, sizeof(cfg)); @@ -336,7 +342,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, ret = is_bpf ? 
tcf_bpf_init_from_ops(tb, &cfg) : tcf_bpf_init_from_efd(tb, &cfg); if (ret < 0) - goto out; + goto put_chain; prog = to_bpf(*act); @@ -350,10 +356,13 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, if (cfg.bpf_num_ops) prog->bpf_num_ops = cfg.bpf_num_ops; - prog->tcf_action = parm->action; + goto_ch = tcf_action_set_ctrlact(*act, parm->action, goto_ch); rcu_assign_pointer(prog->filter, cfg.filter); spin_unlock_bh(&prog->tcf_lock); + if (goto_ch) + tcf_chain_put_by_act(goto_ch); + if (res == ACT_P_CREATED) { tcf_idr_insert(tn, *act); } else { @@ -363,9 +372,13 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, } return res; -out: - tcf_idr_release(*act, bind); +put_chain: + if (goto_ch) + tcf_chain_put_by_act(goto_ch); + +release_idr: + tcf_idr_release(*act, bind); return ret; } diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/bpf.json b/tools/testing/selftests/tc-testing/tc-tests/actions/bpf.json index 5970cee6d05f..b074ea9b6fe8 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/bpf.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/bpf.json @@ -286,5 +286,30 @@ "teardown": [ "$TC action flush action bpf" ] + }, + { + "id": "b8a1", + "name": "Replace bpf action with invalid goto_chain control", + "category": [ + "actions", + "bpf" + ], + "setup": [ + [ + "$TC actions flush action bpf", + 0, + 1, + 255 + ], + "$TC action add action bpf bytecode '1,6 0 0 4294967295' pass index 90" + ], + "cmdUnderTest": "$TC action replace action bpf bytecode '1,6 0 0 4294967295' goto chain 42 index 90 cookie c1a0c1a0", + "expExitCode": "255", + "verifyCmd": "$TC action list action bpf", + "matchPattern": "action order [0-9]*: bpf.* default-action pass.*index 90", + "matchCount": "1", + "teardown": [ + "$TC action flush action bpf" + ] } ] From f5c29d83866d75585f9c89754de0b86b45ceab89 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 20 Mar 2019 15:00:01 +0100 Subject: [PATCH 770/855] net/sched: act_csum: validate the control action inside init() the following script: # tc qdisc add dev crash0 clsact # tc filter add dev crash0 egress matchall action csum icmp pass index 90 # tc actions replace action csum icmp goto chain 42 index 90 \ > cookie c1a0c1a0 # tc actions show action csum had the following output: Error: Failed to init TC action chain. 
We have an error talking to the kernel total acts 1 action order 0: csum (icmp) action goto chain 42 index 90 ref 2 bind 1 cookie c1a0c1a0 Then, the first packet transmitted by crash0 made the kernel crash: BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 #PF error: [normal kernel read fault] PGD 8000000074692067 P4D 8000000074692067 PUD 2e210067 PMD 0 Oops: 0000 [#1] SMP PTI CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.0.0-rc4.gotochain_crash+ #533 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 RIP: 0010:tcf_action_exec+0xb8/0x100 Code: 00 00 00 20 74 1d 83 f8 03 75 09 49 83 c4 08 4d 39 ec 75 bc 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 49 8b 97 a8 00 00 00 <48> 8b 12 48 89 55 00 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 RSP: 0018:ffff93153da03be0 EFLAGS: 00010246 RAX: 000000002000002a RBX: ffff9314ee40f700 RCX: 0000000000003a00 RDX: 0000000000000000 RSI: ffff931537c87828 RDI: ffff931537c87818 RBP: ffff93153da03c80 R08: 00000000527cffff R09: 0000000000000003 R10: 000000000000003f R11: 0000000000000028 R12: ffff9314edf68400 R13: ffff9314edf68408 R14: 0000000000000001 R15: ffff9314ed67b600 FS: 0000000000000000(0000) GS:ffff93153da00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000000073e32003 CR4: 00000000001606f0 Call Trace: tcf_classify+0x58/0x120 __dev_queue_xmit+0x40a/0x890 ? ip6_finish_output2+0x369/0x590 ip6_finish_output2+0x369/0x590 ? ip6_output+0x68/0x110 ip6_output+0x68/0x110 ? nf_hook.constprop.35+0x79/0xc0 mld_sendpack+0x16f/0x220 mld_ifc_timer_expire+0x195/0x2c0 ? igmp6_timer_handler+0x70/0x70 call_timer_fn+0x2b/0x130 run_timer_softirq+0x3e8/0x440 ? tick_sched_timer+0x37/0x70 __do_softirq+0xe3/0x2f5 irq_exit+0xf0/0x100 smp_apic_timer_interrupt+0x6c/0x130 apic_timer_interrupt+0xf/0x20 RIP: 0010:native_safe_halt+0x2/0x10 Code: 66 ff ff ff 7f f3 c3 65 48 8b 04 25 00 5c 01 00 f0 80 48 02 20 48 8b 00 a8 08 74 8b eb c1 90 90 90 90 90 90 90 90 90 90 fb f4 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 f4 c3 90 90 90 90 90 90 RSP: 0018:ffffffff9a803e98 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff13 RAX: ffffffff99e184f0 RBX: 0000000000000000 RCX: 0000000000000001 RDX: 0000000000000001 RSI: 0000000000000087 RDI: 0000000000000000 RBP: 0000000000000000 R08: 000eb5c4572376b3 R09: 0000000000000000 R10: ffffa53e806a3ca0 R11: 00000000000f4240 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 ? __sched_text_end+0x1/0x1 default_idle+0x1c/0x140 do_idle+0x1c4/0x280 cpu_startup_entry+0x19/0x20 start_kernel+0x49e/0x4be secondary_startup_64+0xa4/0xb0 Modules linked in: act_csum veth ip6table_filter ip6_tables iptable_filter binfmt_misc ext4 crct10dif_pclmul crc32_pclmul snd_hda_codec_generic ghash_clmulni_intel snd_hda_intel mbcache snd_hda_codec jbd2 snd_hwdep snd_hda_core snd_seq snd_seq_device snd_pcm aesni_intel crypto_simd cryptd snd_timer glue_helper snd joydev virtio_balloon pcspkr soundcore i2c_piix4 nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs ata_generic pata_acpi qxl drm_kms_helper syscopyarea sysfillrect virtio_net sysimgblt net_failover fb_sys_fops virtio_console virtio_blk ttm failover drm ata_piix crc32c_intel floppy virtio_pci serio_raw libata virtio_ring virtio dm_mirror dm_region_hash dm_log dm_mod CR2: 0000000000000000 Validating the control action within tcf_csum_init() proved to fix the above issue. A TDC selftest is added to verify the correct behavior. 
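The per-action patches in this series (csum here, and the ones that follow) all converge on the same shape of fix. As a hedged illustration only -- the helper name set_ctrlact_checked() is hypothetical and does not exist in the series, while tcf_action_check_ctrlact(), tcf_action_set_ctrlact() and tcf_chain_put_by_act() are the helpers introduced earlier in the series -- the ordering they apply is roughly:

static int set_ctrlact_checked(struct tc_action *a, int action,
			       struct tcf_proto *tp,
			       struct netlink_ext_ack *extack)
{
	struct tcf_chain *goto_ch = NULL;
	int err;

	/* resolve "goto chain X" (if any) before the action is touched */
	err = tcf_action_check_ctrlact(action, tp, &goto_ch, extack);
	if (err < 0)
		return err;	/* callers then unwind via tcf_idr_release() */

	spin_lock_bh(&a->tcfa_lock);
	/* install the new control action; the previously attached chain,
	 * if any, is handed back so it can be released outside the lock
	 */
	goto_ch = tcf_action_set_ctrlact(a, action, goto_ch);
	spin_unlock_bh(&a->tcfa_lock);

	if (goto_ch)
		tcf_chain_put_by_act(goto_ch);
	return 0;
}

Doing the check before any state is modified is what keeps a failed replace from leaving tcfa_action pointing at a chain that was never resolved.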
Fixes: db50514f9a9c ("net: sched: add termination action to allow goto chain") Fixes: 97763dc0f401 ("net_sched: reject unknown tcfa_action values") Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- net/sched/act_csum.c | 20 ++++++++++++--- .../tc-testing/tc-tests/actions/csum.json | 25 +++++++++++++++++++ 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 9ba0f61a1e82..0c77e7bdf6d5 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -33,6 +33,7 @@ #include #include +#include #include #include @@ -52,6 +53,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, struct tc_action_net *tn = net_generic(net, csum_net_id); struct tcf_csum_params *params_new; struct nlattr *tb[TCA_CSUM_MAX + 1]; + struct tcf_chain *goto_ch = NULL; struct tc_csum *parm; struct tcf_csum *p; int ret = 0, err; @@ -87,21 +89,27 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, return err; } + err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); + if (err < 0) + goto release_idr; + p = to_tcf_csum(*a); params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); if (unlikely(!params_new)) { - tcf_idr_release(*a, bind); - return -ENOMEM; + err = -ENOMEM; + goto put_chain; } params_new->update_flags = parm->update_flags; spin_lock_bh(&p->tcf_lock); - p->tcf_action = parm->action; + goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); rcu_swap_protected(p->params, params_new, lockdep_is_held(&p->tcf_lock)); spin_unlock_bh(&p->tcf_lock); + if (goto_ch) + tcf_chain_put_by_act(goto_ch); if (params_new) kfree_rcu(params_new, rcu); @@ -109,6 +117,12 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, tcf_idr_insert(tn, *a); return ret; +put_chain: + if (goto_ch) + tcf_chain_put_by_act(goto_ch); +release_idr: + tcf_idr_release(*a, bind); + return err; } /** diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/csum.json b/tools/testing/selftests/tc-testing/tc-tests/actions/csum.json index a022792d392a..ddabb2fbb7c7 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/csum.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/csum.json @@ -500,5 +500,30 @@ "matchPattern": "^[ \t]+index [0-9]+ ref", "matchCount": "0", "teardown": [] + }, + { + "id": "d128", + "name": "Replace csum action with invalid goto chain control", + "category": [ + "actions", + "csum" + ], + "setup": [ + [ + "$TC actions flush action csum", + 0, + 1, + 255 + ], + "$TC actions add action csum iph index 90" + ], + "cmdUnderTest": "$TC actions replace action csum iph goto chain 42 index 90 cookie c1a0c1a0", + "expExitCode": "255", + "verifyCmd": "$TC actions get action csum index 90", + "matchPattern": "action order [0-9]*: csum \\(iph\\) action pass.*index 90 ref", + "matchCount": "1", + "teardown": [ + "$TC actions flush action csum" + ] } ] From 0da2dbd6029c2be4191651bafa57c3c006eff63c Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 20 Mar 2019 15:00:02 +0100 Subject: [PATCH 771/855] net/sched: act_gact: validate the control action inside init() the following script: # tc qdisc add dev crash0 clsact # tc filter add dev crash0 egress matchall \ > action gact pass index 90 # tc actions replace action gact \ > goto chain 42 index 90 cookie c1a0c1a0 # tc actions show action gact had the following output: Error: Failed to init TC action chain. 
We have an error talking to the kernel total acts 1 action order 0: gact action goto chain 42 random type none pass val 0 index 90 ref 2 bind 1 cookie c1a0c1a0 Then, the first packet transmitted by crash0 made the kernel crash: BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 #PF error: [normal kernel read fault] PGD 0 P4D 0 Oops: 0000 [#1] SMP PTI CPU: 2 PID: 0 Comm: swapper/2 Not tainted 5.0.0-rc4.gotochain_crash+ #533 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 RIP: 0010:tcf_action_exec+0xb8/0x100 Code: 00 00 00 20 74 1d 83 f8 03 75 09 49 83 c4 08 4d 39 ec 75 bc 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 49 8b 97 a8 00 00 00 <48> 8b 12 48 89 55 00 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 RSP: 0018:ffff8c2434703be0 EFLAGS: 00010246 RAX: 000000002000002a RBX: ffff8c23ed6d7e00 RCX: 000000000000005a RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff8c23ed6d7e00 RBP: ffff8c2434703c80 R08: ffff8c243b639ac8 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffff8c2429e68b00 R13: ffff8c2429e68b08 R14: 0000000000000001 R15: ffff8c2429c5a480 FS: 0000000000000000(0000) GS:ffff8c2434700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000002dc0e005 CR4: 00000000001606e0 Call Trace: tcf_classify+0x58/0x120 __dev_queue_xmit+0x40a/0x890 ? ip6_finish_output2+0x369/0x590 ip6_finish_output2+0x369/0x590 ? ip6_output+0x68/0x110 ip6_output+0x68/0x110 ? nf_hook.constprop.35+0x79/0xc0 mld_sendpack+0x16f/0x220 mld_ifc_timer_expire+0x195/0x2c0 ? igmp6_timer_handler+0x70/0x70 call_timer_fn+0x2b/0x130 run_timer_softirq+0x3e8/0x440 ? tick_sched_timer+0x37/0x70 __do_softirq+0xe3/0x2f5 irq_exit+0xf0/0x100 smp_apic_timer_interrupt+0x6c/0x130 apic_timer_interrupt+0xf/0x20 RIP: 0010:native_safe_halt+0x2/0x10 Code: 74 ff ff ff 7f f3 c3 65 48 8b 04 25 00 5c 01 00 f0 80 48 02 20 48 8b 00 a8 08 74 8b eb c1 90 90 90 90 90 90 90 90 90 90 fb f4 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 f4 c3 90 90 90 90 90 90 RSP: 0018:ffff9c8640387eb8 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff13 RAX: ffffffff8b2184f0 RBX: 0000000000000002 RCX: 0000000000000001 RDX: 0000000000000001 RSI: 0000000000000087 RDI: 0000000000000002 RBP: 0000000000000002 R08: 000eb57882b36cc3 R09: 0000000000000020 R10: 0000000000000004 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 ? __sched_text_end+0x1/0x1 default_idle+0x1c/0x140 do_idle+0x1c4/0x280 cpu_startup_entry+0x19/0x20 start_secondary+0x1a7/0x200 secondary_startup_64+0xa4/0xb0 Modules linked in: act_gact act_bpf veth ip6table_filter ip6_tables iptable_filter binfmt_misc crct10dif_pclmul crc32_pclmul ghash_clmulni_intel snd_hda_codec_generic ext4 snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core mbcache jbd2 snd_seq snd_seq_device snd_pcm aesni_intel crypto_simd cryptd glue_helper virtio_balloon joydev pcspkr snd_timer snd i2c_piix4 soundcore nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs ata_generic pata_acpi qxl drm_kms_helper syscopyarea virtio_net sysfillrect net_failover virtio_blk sysimgblt fb_sys_fops virtio_console ttm failover drm crc32c_intel serio_raw ata_piix libata floppy virtio_pci virtio_ring virtio dm_mirror dm_region_hash dm_log dm_mod [last unloaded: act_bpf] CR2: 0000000000000000 Validating the control action within tcf_gact_init() proved to fix the above issue. A TDC selftest is added to verify the correct behavior. 
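Every oops in this series is the same NULL dereference: the replace path accepted a goto-chain opcode into tcfa_action but left a->goto_chain unresolved, so the first matching packet follows a NULL pointer. In simplified form (a sketch of the datapath, not a verbatim excerpt; field names as in struct tc_action and struct tcf_result), what tcf_action_exec() ends up doing is roughly:

	if (TC_ACT_EXT_CMP(ret, TC_ACT_GOTO_CHAIN)) {
		/* NULL after the broken replace */
		const struct tcf_chain *chain = a->goto_chain;

		/* reading chain->filter_chain is the reported oops at
		 * offset 0 in tcf_action_exec()
		 */
		res->goto_tp = rcu_dereference_bh(chain->filter_chain);
	}

Validating inside each ->init() closes the window: an action with a goto-chain opcode but no attached chain is never made visible to the datapath.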
Fixes: db50514f9a9c ("net: sched: add termination action to allow goto chain") Fixes: 97763dc0f401 ("net_sched: reject unknown tcfa_action values") Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- net/sched/act_gact.c | 13 +++++++++- .../tc-testing/tc-tests/actions/gact.json | 25 +++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index b8ad311bd8cc..e540e31069d7 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -61,6 +62,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, { struct tc_action_net *tn = net_generic(net, gact_net_id); struct nlattr *tb[TCA_GACT_MAX + 1]; + struct tcf_chain *goto_ch = NULL; struct tc_gact *parm; struct tcf_gact *gact; int ret = 0; @@ -116,10 +118,13 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, return err; } + err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); + if (err < 0) + goto release_idr; gact = to_gact(*a); spin_lock_bh(&gact->tcf_lock); - gact->tcf_action = parm->action; + goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); #ifdef CONFIG_GACT_PROB if (p_parm) { gact->tcfg_paction = p_parm->paction; @@ -133,9 +138,15 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, #endif spin_unlock_bh(&gact->tcf_lock); + if (goto_ch) + tcf_chain_put_by_act(goto_ch); + if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); return ret; +release_idr: + tcf_idr_release(*a, bind); + return err; } static int tcf_gact_act(struct sk_buff *skb, const struct tc_action *a, diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/gact.json b/tools/testing/selftests/tc-testing/tc-tests/actions/gact.json index 89189a03ce3d..814b7a8a478b 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/gact.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/gact.json @@ -560,5 +560,30 @@ "teardown": [ "$TC actions flush action gact" ] + }, + { + "id": "ca89", + "name": "Replace gact action with invalid goto chain control", + "category": [ + "actions", + "gact" + ], + "setup": [ + [ + "$TC actions flush action gact", + 0, + 1, + 255 + ], + "$TC actions add action pass random determ drop 2 index 90" + ], + "cmdUnderTest": "$TC actions replace action goto chain 42 random determ drop 5 index 90 cookie c1a0c1a0", + "expExitCode": "255", + "verifyCmd": "$TC actions list action gact", + "matchPattern": "action order [0-9]*: gact action pass.*random type determ drop val 2.*index 90 ref", + "matchCount": "1", + "teardown": [ + "$TC actions flush action gact" + ] } ] From 11a94d7fd80f92325e7b8653290ad3d2cd67f119 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 20 Mar 2019 15:00:03 +0100 Subject: [PATCH 772/855] net/sched: act_ife: validate the control action inside init() the following script: # tc qdisc add dev crash0 clsact # tc filter add dev crash0 egress matchall \ > action ife encode allow mark pass index 90 # tc actions replace action ife \ > encode allow mark goto chain 42 index 90 cookie c1a0c1a0 # tc action show action ife had the following output: IFE type 0xED3E IFE type 0xED3E Error: Failed to init TC action chain. 
We have an error talking to the kernel total acts 1 action order 0: ife encode action goto chain 42 type 0XED3E allow mark index 90 ref 2 bind 1 cookie c1a0c1a0 Then, the first packet transmitted by crash0 made the kernel crash: BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 #PF error: [normal kernel read fault] PGD 800000007b4e7067 P4D 800000007b4e7067 PUD 7b4e6067 PMD 0 Oops: 0000 [#1] SMP PTI CPU: 2 PID: 164 Comm: kworker/2:1 Not tainted 5.0.0-rc4.gotochain_crash+ #533 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 Workqueue: ipv6_addrconf addrconf_dad_work RIP: 0010:tcf_action_exec+0xb8/0x100 Code: 00 00 00 20 74 1d 83 f8 03 75 09 49 83 c4 08 4d 39 ec 75 bc 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 49 8b 97 a8 00 00 00 <48> 8b 12 48 89 55 00 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 RSP: 0018:ffffa6a7c0553ad0 EFLAGS: 00010246 RAX: 000000002000002a RBX: ffff9796ee1bbd00 RCX: 0000000000000001 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffffa6a7c0553b70 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: ffff9797385bb038 R12: ffff9796ead9d700 R13: ffff9796ead9d708 R14: 0000000000000001 R15: ffff9796ead9d800 FS: 0000000000000000(0000) GS:ffff97973db00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000007c41e006 CR4: 00000000001606e0 Call Trace: tcf_classify+0x58/0x120 __dev_queue_xmit+0x40a/0x890 ? ndisc_next_option+0x50/0x50 ? ___neigh_create+0x4d5/0x680 ? ip6_finish_output2+0x1b5/0x590 ip6_finish_output2+0x1b5/0x590 ? ip6_output+0x68/0x110 ip6_output+0x68/0x110 ? nf_hook.constprop.28+0x79/0xc0 ndisc_send_skb+0x248/0x2e0 ndisc_send_ns+0xf8/0x200 ? addrconf_dad_work+0x389/0x4b0 addrconf_dad_work+0x389/0x4b0 ? __switch_to_asm+0x34/0x70 ? process_one_work+0x195/0x380 ? addrconf_dad_completed+0x370/0x370 process_one_work+0x195/0x380 worker_thread+0x30/0x390 ? process_one_work+0x380/0x380 kthread+0x113/0x130 ? kthread_park+0x90/0x90 ret_from_fork+0x35/0x40 Modules linked in: act_gact act_meta_mark act_ife dummy veth ip6table_filter ip6_tables iptable_filter binfmt_misc snd_hda_codec_generic ext4 snd_hda_intel snd_hda_codec crct10dif_pclmul mbcache crc32_pclmul jbd2 snd_hwdep snd_hda_core ghash_clmulni_intel snd_seq snd_seq_device snd_pcm snd_timer aesni_intel crypto_simd snd cryptd glue_helper virtio_balloon joydev pcspkr soundcore i2c_piix4 nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs ata_generic pata_acpi qxl virtio_net drm_kms_helper virtio_blk net_failover syscopyarea failover sysfillrect virtio_console sysimgblt fb_sys_fops ttm drm crc32c_intel serio_raw ata_piix virtio_pci virtio_ring libata virtio floppy dm_mirror dm_region_hash dm_log dm_mod [last unloaded: act_ife] CR2: 0000000000000000 Validating the control action within tcf_ife_init() proved to fix the above issue. A TDC selftest is added to verify the correct behavior. Fixes: db50514f9a9c ("net: sched: add termination action to allow goto chain") Fixes: 97763dc0f401 ("net_sched: reject unknown tcfa_action values") Signed-off-by: Davide Caratti Signed-off-by: David S. 
Miller --- net/sched/act_ife.c | 33 +++++++++++-------- .../tc-testing/tc-tests/actions/ife.json | 25 ++++++++++++++ 2 files changed, 45 insertions(+), 13 deletions(-) diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index c1ba74d5c1e3..31c6ffb6abe7 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -474,6 +475,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, struct tc_action_net *tn = net_generic(net, ife_net_id); struct nlattr *tb[TCA_IFE_MAX + 1]; struct nlattr *tb2[IFE_META_MAX + 1]; + struct tcf_chain *goto_ch = NULL; struct tcf_ife_params *p; struct tcf_ife_info *ife; u16 ife_type = ETH_P_IFE; @@ -531,6 +533,10 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, } ife = to_ife(*a); + err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); + if (err < 0) + goto release_idr; + p->flags = parm->flags; if (parm->flags & IFE_ENCODE) { @@ -563,13 +569,8 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, if (tb[TCA_IFE_METALST]) { err = nla_parse_nested(tb2, IFE_META_MAX, tb[TCA_IFE_METALST], NULL, NULL); - if (err) { -metadata_parse_err: - tcf_idr_release(*a, bind); - kfree(p); - return err; - } - + if (err) + goto metadata_parse_err; err = populate_metalist(ife, tb2, exists, rtnl_held); if (err) goto metadata_parse_err; @@ -581,21 +582,20 @@ metadata_parse_err: * going to bail out */ err = use_all_metadata(ife, exists); - if (err) { - tcf_idr_release(*a, bind); - kfree(p); - return err; - } + if (err) + goto metadata_parse_err; } if (exists) spin_lock_bh(&ife->tcf_lock); - ife->tcf_action = parm->action; /* protected by tcf_lock when modifying existing action */ + goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); rcu_swap_protected(ife->params, p, 1); if (exists) spin_unlock_bh(&ife->tcf_lock); + if (goto_ch) + tcf_chain_put_by_act(goto_ch); if (p) kfree_rcu(p, rcu); @@ -603,6 +603,13 @@ metadata_parse_err: tcf_idr_insert(tn, *a); return ret; +metadata_parse_err: + if (goto_ch) + tcf_chain_put_by_act(goto_ch); +release_idr: + kfree(p); + tcf_idr_release(*a, bind); + return err; } static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/ife.json b/tools/testing/selftests/tc-testing/tc-tests/actions/ife.json index 0da3545cabdb..c13a68b98fc7 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/ife.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/ife.json @@ -1060,5 +1060,30 @@ "matchPattern": "action order [0-9]*: ife encode action pipe.*allow prio.*index 4", "matchCount": "0", "teardown": [] + }, + { + "id": "a0e2", + "name": "Replace ife encode action with invalid goto chain control", + "category": [ + "actions", + "ife" + ], + "setup": [ + [ + "$TC actions flush action ife", + 0, + 1, + 255 + ], + "$TC actions add action ife encode allow mark pass index 90" + ], + "cmdUnderTest": "$TC actions replace action ife encode allow mark goto chain 42 index 90 cookie c1a0c1a0", + "expExitCode": "255", + "verifyCmd": "$TC actions get action ife index 90", + "matchPattern": "action order [0-9]*: ife encode action pass.*type 0[xX]ED3E .*allow mark.*index 90 ref", + "matchCount": "1", + "teardown": [ + "$TC actions flush action ife" + ] } ] From ff9721d32b1aba8bf46a06df20827d0a5d52ec48 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 20 Mar 2019 15:00:04 +0100 Subject: [PATCH 773/855] net/sched: act_mirred: validate the 
control action inside init() the following script: # tc qdisc add dev crash0 clsact # tc filter add dev crash0 egress matchall \ > action mirred ingress mirror dev lo pass # tc actions replace action mirred \ > ingress mirror dev lo goto chain 42 index 90 cookie c1a0c1a0 # tc actions show action mirred had the following output: Error: Failed to init TC action chain. We have an error talking to the kernel total acts 1 action order 0: mirred (Ingress Mirror to device lo) goto chain 42 index 90 ref 2 bind 1 cookie c1a0c1a0 Then, the first packet transmitted by crash0 made the kernel crash: Mirror/redirect action on BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 #PF error: [normal kernel read fault] PGD 0 P4D 0 Oops: 0000 [#1] SMP PTI CPU: 3 PID: 47 Comm: kworker/3:1 Not tainted 5.0.0-rc4.gotochain_crash+ #533 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 Workqueue: ipv6_addrconf addrconf_dad_work RIP: 0010:tcf_action_exec+0xb8/0x100 Code: 00 00 00 20 74 1d 83 f8 03 75 09 49 83 c4 08 4d 39 ec 75 bc 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 49 8b 97 a8 00 00 00 <48> 8b 12 48 89 55 00 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 RSP: 0018:ffffa772404b7ad0 EFLAGS: 00010246 RAX: 000000002000002a RBX: ffff9c5afc3f4300 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffff9c5afdba9380 RDI: 0000000000029380 RBP: ffffa772404b7b70 R08: ffff9c5af7010028 R09: ffff9c5af7010029 R10: 0000000000000000 R11: ffff9c5af94c6a38 R12: ffff9c5af7953000 R13: ffff9c5af7953008 R14: 0000000000000001 R15: ffff9c5af7953d00 FS: 0000000000000000(0000) GS:ffff9c5afdb80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000007c514004 CR4: 00000000001606e0 Call Trace: tcf_classify+0x58/0x120 __dev_queue_xmit+0x40a/0x890 ? ndisc_next_option+0x50/0x50 ? ___neigh_create+0x4d5/0x680 ? ip6_finish_output2+0x1b5/0x590 ip6_finish_output2+0x1b5/0x590 ? ip6_output+0x68/0x110 ip6_output+0x68/0x110 ? nf_hook.constprop.28+0x79/0xc0 ndisc_send_skb+0x248/0x2e0 ndisc_send_ns+0xf8/0x200 ? addrconf_dad_work+0x389/0x4b0 addrconf_dad_work+0x389/0x4b0 ? __switch_to_asm+0x34/0x70 ? process_one_work+0x195/0x380 ? addrconf_dad_completed+0x370/0x370 process_one_work+0x195/0x380 worker_thread+0x30/0x390 ? process_one_work+0x380/0x380 kthread+0x113/0x130 ? kthread_park+0x90/0x90 ret_from_fork+0x35/0x40 Modules linked in: act_mirred veth ip6table_filter ip6_tables iptable_filter binfmt_misc ext4 crct10dif_pclmul snd_hda_codec_generic crc32_pclmul snd_hda_intel snd_hda_codec mbcache ghash_clmulni_intel jbd2 snd_hwdep snd_hda_core snd_seq snd_seq_device snd_pcm aesni_intel snd_timer snd crypto_simd cryptd glue_helper soundcore virtio_balloon joydev pcspkr i2c_piix4 nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs ata_generic pata_acpi qxl drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops virtio_net ttm virtio_blk net_failover virtio_console failover drm ata_piix crc32c_intel virtio_pci serio_raw libata virtio_ring virtio floppy dm_mirror dm_region_hash dm_log dm_mod CR2: 0000000000000000 Validating the control action within tcf_mirred_init() proved to fix the above issue. For the same reason, postpone the assignment of tcfa_action and tcfm_eaction to avoid partial reconfiguration of a mirred rule when it's replaced by another one that mirrors to a device that does not exist. A TDC selftest is added to verify the correct behavior. 
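Beyond the common goto-chain check, mirred also delays every user-visible write until the fallible steps have succeeded. A hedged outline of the ordering tcf_mirred_init() moves to (condensed from the hunk below, not a verbatim copy):

	err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
	if (err < 0)
		goto release_idr;

	spin_lock_bh(&m->tcf_lock);
	if (parm->ifindex) {
		dev = dev_get_by_index(net, parm->ifindex);
		if (!dev) {
			spin_unlock_bh(&m->tcf_lock);
			err = -ENODEV;
			goto put_chain;	/* old action/eaction stay intact */
		}
		/* ... swap in the new target device ... */
		dev_put(dev);
	}
	/* only now are the control action and eaction committed */
	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
	m->tcfm_eaction = parm->eaction;
	spin_unlock_bh(&m->tcf_lock);

so a replace that names a nonexistent device no longer leaves a half-reconfigured rule behind.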
Fixes: db50514f9a9c ("net: sched: add termination action to allow goto chain") Fixes: 97763dc0f401 ("net_sched: reject unknown tcfa_action values") Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- net/sched/act_mirred.c | 21 +++++++++++++--- .../tc-testing/tc-tests/actions/mirred.json | 25 +++++++++++++++++++ 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 383f4024452c..cd712e4e8998 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -99,6 +99,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, { struct tc_action_net *tn = net_generic(net, mirred_net_id); struct nlattr *tb[TCA_MIRRED_MAX + 1]; + struct tcf_chain *goto_ch = NULL; bool mac_header_xmit = false; struct tc_mirred *parm; struct tcf_mirred *m; @@ -158,18 +159,20 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, tcf_idr_release(*a, bind); return -EEXIST; } + err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); + if (err < 0) + goto release_idr; + m = to_mirred(*a); spin_lock_bh(&m->tcf_lock); - m->tcf_action = parm->action; - m->tcfm_eaction = parm->eaction; if (parm->ifindex) { dev = dev_get_by_index(net, parm->ifindex); if (!dev) { spin_unlock_bh(&m->tcf_lock); - tcf_idr_release(*a, bind); - return -ENODEV; + err = -ENODEV; + goto put_chain; } mac_header_xmit = dev_is_mac_header_xmit(dev); rcu_swap_protected(m->tcfm_dev, dev, @@ -178,7 +181,11 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, dev_put(dev); m->tcfm_mac_header_xmit = mac_header_xmit; } + goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); + m->tcfm_eaction = parm->eaction; spin_unlock_bh(&m->tcf_lock); + if (goto_ch) + tcf_chain_put_by_act(goto_ch); if (ret == ACT_P_CREATED) { spin_lock(&mirred_list_lock); @@ -189,6 +196,12 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, } return ret; +put_chain: + if (goto_ch) + tcf_chain_put_by_act(goto_ch); +release_idr: + tcf_idr_release(*a, bind); + return err; } static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/mirred.json b/tools/testing/selftests/tc-testing/tc-tests/actions/mirred.json index db49fd0f8445..6e5fb3d25681 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/mirred.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/mirred.json @@ -434,5 +434,30 @@ "teardown": [ "$TC actions flush action mirred" ] + }, + { + "id": "2a9a", + "name": "Replace mirred action with invalid goto chain control", + "category": [ + "actions", + "mirred" + ], + "setup": [ + [ + "$TC actions flush action mirred", + 0, + 1, + 255 + ], + "$TC actions add action mirred ingress mirror dev lo drop index 90" + ], + "cmdUnderTest": "$TC actions replace action mirred ingress mirror dev lo goto chain 42 index 90 cookie c1a0c1a0", + "expExitCode": "255", + "verifyCmd": "$TC actions get action mirred index 90", + "matchPattern": "action order [0-9]*: mirred \\(Ingress Mirror to device lo\\) drop.*index 90 ref", + "matchCount": "1", + "teardown": [ + "$TC actions flush action mirred" + ] } ] From c53075ea5d3c44849992523d5d83e2810d05a00e Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 20 Mar 2019 15:00:05 +0100 Subject: [PATCH 774/855] net/sched: act_connmark: validate the control action inside init() the following script: # tc qdisc add dev crash0 clsact # tc filter add dev crash0 egress matchall \ > action connmark pass index 90 # tc 
actions replace action connmark \ > goto chain 42 index 90 cookie c1a0c1a0 # tc actions show action connmark had the following output: Error: Failed to init TC action chain. We have an error talking to the kernel total acts 1 action order 0: connmark zone 0 goto chain 42 index 90 ref 2 bind 1 cookie c1a0c1a0 Then, the first packet transmitted by crash0 made the kernel crash: BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 #PF error: [normal kernel read fault] PGD 0 P4D 0 Oops: 0000 [#1] SMP PTI CPU: 0 PID: 302 Comm: kworker/0:2 Not tainted 5.0.0-rc4.gotochain_crash+ #533 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 Workqueue: ipv6_addrconf addrconf_dad_work RIP: 0010:tcf_action_exec+0xb8/0x100 Code: 00 00 00 20 74 1d 83 f8 03 75 09 49 83 c4 08 4d 39 ec 75 bc 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 49 8b 97 a8 00 00 00 <48> 8b 12 48 89 55 00 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 RSP: 0018:ffff9bea406c3ad0 EFLAGS: 00010246 RAX: 000000002000002a RBX: ffff8c5dfc009f00 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffff9bea406c3a80 RDI: ffff8c5dfb9d6ec0 RBP: ffff9bea406c3b70 R08: ffff8c5dfda222a0 R09: ffffffff90933c3c R10: 0000000000000000 R11: 0000000092793f7d R12: ffff8c5df48b3c00 R13: ffff8c5df48b3c08 R14: 0000000000000001 R15: ffff8c5dfb9d6e40 FS: 0000000000000000(0000) GS:ffff8c5dfda00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000000062e0e006 CR4: 00000000001606f0 Call Trace: tcf_classify+0x58/0x120 __dev_queue_xmit+0x40a/0x890 ? ndisc_next_option+0x50/0x50 ? ___neigh_create+0x4d5/0x680 ? ip6_finish_output2+0x1b5/0x590 ip6_finish_output2+0x1b5/0x590 ? ip6_output+0x68/0x110 ip6_output+0x68/0x110 ? nf_hook.constprop.28+0x79/0xc0 ndisc_send_skb+0x248/0x2e0 ndisc_send_ns+0xf8/0x200 ? addrconf_dad_work+0x389/0x4b0 addrconf_dad_work+0x389/0x4b0 ? __switch_to_asm+0x34/0x70 ? process_one_work+0x195/0x380 ? addrconf_dad_completed+0x370/0x370 process_one_work+0x195/0x380 worker_thread+0x30/0x390 ? process_one_work+0x380/0x380 kthread+0x113/0x130 ? kthread_park+0x90/0x90 ret_from_fork+0x35/0x40 Modules linked in: act_connmark nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 veth ip6table_filter ip6_tables iptable_filter binfmt_misc ext4 crct10dif_pclmul mbcache crc32_pclmul jbd2 snd_hda_codec_generic ghash_clmulni_intel snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_seq snd_seq_device snd_pcm aesni_intel snd_timer crypto_simd cryptd snd glue_helper joydev virtio_balloon pcspkr soundcore i2c_piix4 nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs ata_generic pata_acpi qxl drm_kms_helper virtio_net net_failover syscopyarea virtio_blk failover virtio_console sysfillrect sysimgblt fb_sys_fops ttm drm ata_piix crc32c_intel serio_raw libata virtio_pci virtio_ring virtio floppy dm_mirror dm_region_hash dm_log dm_mod CR2: 0000000000000000 Validating the control action within tcf_connmark_init() proved to fix the above issue. A TDC selftest is added to verify the correct behavior. Fixes: db50514f9a9c ("net: sched: add termination action to allow goto chain") Fixes: 97763dc0f401 ("net_sched: reject unknown tcfa_action values") Signed-off-by: Davide Caratti Signed-off-by: David S. 
Miller --- net/sched/act_connmark.c | 21 +++++++++++++--- .../tc-testing/tc-tests/actions/connmark.json | 25 +++++++++++++++++++ 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 44aa046a92ea..32ae0cd6e31c 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -102,9 +103,10 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, { struct tc_action_net *tn = net_generic(net, connmark_net_id); struct nlattr *tb[TCA_CONNMARK_MAX + 1]; + struct tcf_chain *goto_ch = NULL; struct tcf_connmark_info *ci; struct tc_connmark *parm; - int ret = 0; + int ret = 0, err; if (!nla) return -EINVAL; @@ -129,7 +131,11 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, } ci = to_connmark(*a); - ci->tcf_action = parm->action; + err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, + extack); + if (err < 0) + goto release_idr; + tcf_action_set_ctrlact(*a, parm->action, goto_ch); ci->net = net; ci->zone = parm->zone; @@ -143,15 +149,24 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, tcf_idr_release(*a, bind); return -EEXIST; } + err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, + extack); + if (err < 0) + goto release_idr; /* replacing action and zone */ spin_lock_bh(&ci->tcf_lock); - ci->tcf_action = parm->action; + goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); ci->zone = parm->zone; spin_unlock_bh(&ci->tcf_lock); + if (goto_ch) + tcf_chain_put_by_act(goto_ch); ret = 0; } return ret; +release_idr: + tcf_idr_release(*a, bind); + return err; } static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a, diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/connmark.json b/tools/testing/selftests/tc-testing/tc-tests/actions/connmark.json index 13147a1f5731..cadde8f41fcd 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/connmark.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/connmark.json @@ -287,5 +287,30 @@ "teardown": [ "$TC actions flush action connmark" ] + }, + { + "id": "c506", + "name": "Replace connmark with invalid goto chain control", + "category": [ + "actions", + "connmark" + ], + "setup": [ + [ + "$TC actions flush action connmark", + 0, + 1, + 255 + ], + "$TC actions add action connmark pass index 90" + ], + "cmdUnderTest": "$TC actions replace action connmark goto chain 42 index 90 cookie c1a0c1a0", + "expExitCode": "255", + "verifyCmd": "$TC actions get action connmark index 90", + "matchPattern": "action order [0-9]+: connmark zone 0 pass.*index 90 ref", + "matchCount": "1", + "teardown": [ + "$TC actions flush action connmark" + ] } ] From 1e45d043a8bb2ed8a541384db3cc133e92001f0c Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 20 Mar 2019 15:00:06 +0100 Subject: [PATCH 775/855] net/sched: act_nat: validate the control action inside init() the following script: # tc qdisc add dev crash0 clsact # tc filter add dev crash0 egress matchall \ > action nat ingress 1.18.1.1 1.18.2.2 pass index 90 # tc actions replace action nat \ > ingress 1.18.1.1 1.18.2.2 goto chain 42 index 90 cookie c1a0c1a0 # tc actions show action nat had the following output: Error: Failed to init TC action chain. 
We have an error talking to the kernel total acts 1 action order 0: nat ingress 1.18.1.1/32 1.18.2.2 goto chain 42 index 90 ref 2 bind 1 cookie c1a0c1a0 Then, the first packet transmitted by crash0 made the kernel crash: BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 #PF error: [normal kernel read fault] PGD 800000002d180067 P4D 800000002d180067 PUD 7cb8b067 PMD 0 Oops: 0000 [#1] SMP PTI CPU: 3 PID: 164 Comm: kworker/3:1 Not tainted 5.0.0-rc4.gotochain_crash+ #533 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 Workqueue: ipv6_addrconf addrconf_dad_work RIP: 0010:tcf_action_exec+0xb8/0x100 Code: 00 00 00 20 74 1d 83 f8 03 75 09 49 83 c4 08 4d 39 ec 75 bc 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 49 8b 97 a8 00 00 00 <48> 8b 12 48 89 55 00 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 RSP: 0018:ffffae4500e2fad0 EFLAGS: 00010246 RAX: 000000002000002a RBX: ffff9fa52e28c800 RCX: 0000000001011201 RDX: 0000000000000000 RSI: 0000000000000056 RDI: ffff9fa52ca12800 RBP: ffffae4500e2fb70 R08: 0000000000000022 R09: 000000000000000e R10: 00000000ffffffff R11: 0000000001011201 R12: ffff9fa52cbc9c00 R13: ffff9fa52cbc9c08 R14: 0000000000000001 R15: ffff9fa52ca12780 FS: 0000000000000000(0000) GS:ffff9fa57db80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000000073f8c004 CR4: 00000000001606e0 Call Trace: tcf_classify+0x58/0x120 __dev_queue_xmit+0x40a/0x890 ? ndisc_next_option+0x50/0x50 ? ___neigh_create+0x4d5/0x680 ? ip6_finish_output2+0x1b5/0x590 ip6_finish_output2+0x1b5/0x590 ? ip6_output+0x68/0x110 ip6_output+0x68/0x110 ? nf_hook.constprop.28+0x79/0xc0 ndisc_send_skb+0x248/0x2e0 ndisc_send_ns+0xf8/0x200 ? addrconf_dad_work+0x389/0x4b0 addrconf_dad_work+0x389/0x4b0 ? __switch_to_asm+0x34/0x70 ? process_one_work+0x195/0x380 ? addrconf_dad_completed+0x370/0x370 process_one_work+0x195/0x380 worker_thread+0x30/0x390 ? process_one_work+0x380/0x380 kthread+0x113/0x130 ? kthread_park+0x90/0x90 ret_from_fork+0x35/0x40 Modules linked in: act_nat veth ip6table_filter ip6_tables iptable_filter binfmt_misc ext4 crct10dif_pclmul crc32_pclmul ghash_clmulni_intel mbcache jbd2 snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_seq snd_seq_device snd_pcm aesni_intel crypto_simd cryptd glue_helper snd_timer snd joydev virtio_balloon pcspkr soundcore i2c_piix4 nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs qxl ata_generic pata_acpi drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm virtio_net virtio_blk net_failover failover virtio_console drm crc32c_intel floppy ata_piix libata virtio_pci virtio_ring virtio serio_raw dm_mirror dm_region_hash dm_log dm_mod CR2: 0000000000000000 Validating the control action within tcf_nat_init() proved to fix the above issue. A TDC selftest is added to verify the correct behavior. Fixes: db50514f9a9c ("net: sched: add termination action to allow goto chain") Fixes: 97763dc0f401 ("net_sched: reject unknown tcfa_action values") Signed-off-by: Davide Caratti Signed-off-by: David S. 
Miller --- net/sched/act_nat.c | 12 ++++++++- .../tc-testing/tc-tests/actions/nat.json | 25 +++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index de4b493e26d2..e91bb8eb81ec 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -43,6 +44,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, { struct tc_action_net *tn = net_generic(net, nat_net_id); struct nlattr *tb[TCA_NAT_MAX + 1]; + struct tcf_chain *goto_ch = NULL; struct tc_nat *parm; int ret = 0, err; struct tcf_nat *p; @@ -77,6 +79,9 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, } else { return err; } + err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); + if (err < 0) + goto release_idr; p = to_tcf_nat(*a); spin_lock_bh(&p->tcf_lock); @@ -85,13 +90,18 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, p->mask = parm->mask; p->flags = parm->flags; - p->tcf_action = parm->action; + goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); spin_unlock_bh(&p->tcf_lock); + if (goto_ch) + tcf_chain_put_by_act(goto_ch); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); return ret; +release_idr: + tcf_idr_release(*a, bind); + return err; } static int tcf_nat_act(struct sk_buff *skb, const struct tc_action *a, diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/nat.json b/tools/testing/selftests/tc-testing/tc-tests/actions/nat.json index 0080dc2fd41c..bc12c1ccad30 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/nat.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/nat.json @@ -589,5 +589,30 @@ "teardown": [ "$TC actions flush action nat" ] + }, + { + "id": "4b12", + "name": "Replace nat action with invalid goto chain control", + "category": [ + "actions", + "nat" + ], + "setup": [ + [ + "$TC actions flush action nat", + 0, + 1, + 255 + ], + "$TC actions add action nat ingress 1.18.1.1 1.18.2.2 drop index 90" + ], + "cmdUnderTest": "$TC actions replace action nat ingress 1.18.1.1 1.18.2.2 goto chain 42 index 90 cookie c1a0c1a0", + "expExitCode": "255", + "verifyCmd": "$TC actions get action nat index 90", + "matchPattern": "action order [0-9]+: nat ingress 1.18.1.1/32 1.18.2.2 drop.*index 90 ref", + "matchCount": "1", + "teardown": [ + "$TC actions flush action nat" + ] } ] From 6ac86ca3524b4549d31c45d11487b0626c334f10 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 20 Mar 2019 15:00:07 +0100 Subject: [PATCH 776/855] net/sched: act_pedit: validate the control action inside init() the following script: # tc filter add dev crash0 egress matchall \ > action pedit ex munge ip ttl set 10 pass index 90 # tc actions replace action pedit \ > ex munge ip ttl set 10 goto chain 42 index 90 cookie c1a0c1a0 # tc actions show action pedit had the following output: Error: Failed to init TC action chain. 
We have an error talking to the kernel total acts 1 action order 0: pedit action goto chain 42 keys 1 index 90 ref 2 bind 1 key #0 at ipv4+8: val 0a000000 mask 00ffffff cookie c1a0c1a0 Then, the first packet transmitted by crash0 made the kernel crash: BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 #PF error: [normal kernel read fault] PGD 0 P4D 0 Oops: 0000 [#1] SMP PTI CPU: 2 PID: 0 Comm: swapper/2 Not tainted 5.0.0-rc4.gotochain_crash+ #533 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 RIP: 0010:tcf_action_exec+0xb8/0x100 Code: 00 00 00 20 74 1d 83 f8 03 75 09 49 83 c4 08 4d 39 ec 75 bc 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 49 8b 97 a8 00 00 00 <48> 8b 12 48 89 55 00 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 RSP: 0018:ffff94a73db03be0 EFLAGS: 00010246 RAX: 000000002000002a RBX: ffff94a6ee4c0700 RCX: 000000000000000a RDX: 0000000000000000 RSI: ffff94a6ed22c800 RDI: 0000000000000000 RBP: ffff94a73db03c80 R08: ffff94a7386fa4c8 R09: ffff94a73229ea20 R10: 0000000000000000 R11: 0000000000000000 R12: ffff94a6ed22cb00 R13: ffff94a6ed22cb08 R14: 0000000000000001 R15: ffff94a6ed22c800 FS: 0000000000000000(0000) GS:ffff94a73db00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000007120e002 CR4: 00000000001606e0 Call Trace: tcf_classify+0x58/0x120 __dev_queue_xmit+0x40a/0x890 ? ip6_finish_output2+0x369/0x590 ip6_finish_output2+0x369/0x590 ? ip6_output+0x68/0x110 ip6_output+0x68/0x110 ? nf_hook.constprop.35+0x79/0xc0 mld_sendpack+0x16f/0x220 mld_ifc_timer_expire+0x195/0x2c0 ? igmp6_timer_handler+0x70/0x70 call_timer_fn+0x2b/0x130 run_timer_softirq+0x3e8/0x440 ? tick_sched_timer+0x37/0x70 __do_softirq+0xe3/0x2f5 irq_exit+0xf0/0x100 smp_apic_timer_interrupt+0x6c/0x130 apic_timer_interrupt+0xf/0x20 RIP: 0010:native_safe_halt+0x2/0x10 Code: 4e ff ff ff 7f f3 c3 65 48 8b 04 25 00 5c 01 00 f0 80 48 02 20 48 8b 00 a8 08 74 8b eb c1 90 90 90 90 90 90 90 90 90 90 fb f4 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 f4 c3 90 90 90 90 90 90 RSP: 0018:ffffab1740387eb8 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff13 RAX: ffffffffb18184f0 RBX: 0000000000000002 RCX: 0000000000000001 RDX: 0000000000000001 RSI: 0000000000000087 RDI: 0000000000000002 RBP: 0000000000000002 R08: 000f168fa695f9a9 R09: 0000000000000020 R10: 0000000000000004 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 ? __sched_text_end+0x1/0x1 default_idle+0x1c/0x140 do_idle+0x1c4/0x280 cpu_startup_entry+0x19/0x20 start_secondary+0x1a7/0x200 secondary_startup_64+0xa4/0xb0 Modules linked in: act_pedit veth ip6table_filter ip6_tables iptable_filter binfmt_misc ext4 mbcache jbd2 crct10dif_pclmul crc32_pclmul ghash_clmulni_intel snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep aesni_intel snd_hda_core crypto_simd snd_seq cryptd glue_helper snd_seq_device snd_pcm joydev snd_timer pcspkr virtio_balloon snd soundcore i2c_piix4 nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs qxl ata_generic pata_acpi drm_kms_helper virtio_net net_failover syscopyarea sysfillrect sysimgblt failover virtio_blk fb_sys_fops virtio_console ttm drm crc32c_intel serio_raw ata_piix virtio_pci libata virtio_ring virtio floppy dm_mirror dm_region_hash dm_log dm_mod CR2: 0000000000000000 Validating the control action within tcf_pedit_init() proved to fix the above issue. A TDC selftest is added to verify the correct behavior. 
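The unwind added here mirrors the earlier conversions: on failure, any chain reference taken by tcf_action_check_ctrlact() is dropped, then the idr entry is released. A minimal skeleton of that error tail (label names vary slightly between actions; pedit itself falls through put_chain into its existing out_release/out_free labels):

	...
	return ret;

put_chain:
	if (goto_ch)
		tcf_chain_put_by_act(goto_ch);
release_idr:
	tcf_idr_release(*a, bind);
	return err;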
Fixes: db50514f9a9c ("net: sched: add termination action to allow goto chain") Fixes: 97763dc0f401 ("net_sched: reject unknown tcfa_action values") Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- net/sched/act_pedit.c | 16 +++++- .../tc-testing/tc-tests/actions/pedit.json | 51 +++++++++++++++++++ 2 files changed, 65 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/tc-testing/tc-tests/actions/pedit.json diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 8ca82aefa11a..287793abfaf9 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -23,6 +23,7 @@ #include #include #include +#include static unsigned int pedit_net_id; static struct tc_action_ops act_pedit_ops; @@ -142,6 +143,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, { struct tc_action_net *tn = net_generic(net, pedit_net_id); struct nlattr *tb[TCA_PEDIT_MAX + 1]; + struct tcf_chain *goto_ch = NULL; struct tc_pedit_key *keys = NULL; struct tcf_pedit_key_ex *keys_ex; struct tc_pedit *parm; @@ -205,6 +207,11 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, goto out_free; } + err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); + if (err < 0) { + ret = err; + goto out_release; + } p = to_pedit(*a); spin_lock_bh(&p->tcf_lock); @@ -214,7 +221,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, if (!keys) { spin_unlock_bh(&p->tcf_lock); ret = -ENOMEM; - goto out_release; + goto put_chain; } kfree(p->tcfp_keys); p->tcfp_keys = keys; @@ -223,16 +230,21 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, memcpy(p->tcfp_keys, parm->keys, ksize); p->tcfp_flags = parm->flags; - p->tcf_action = parm->action; + goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); kfree(p->tcfp_keys_ex); p->tcfp_keys_ex = keys_ex; spin_unlock_bh(&p->tcf_lock); + if (goto_ch) + tcf_chain_put_by_act(goto_ch); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); return ret; +put_chain: + if (goto_ch) + tcf_chain_put_by_act(goto_ch); out_release: tcf_idr_release(*a, bind); out_free: diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/pedit.json b/tools/testing/selftests/tc-testing/tc-tests/actions/pedit.json new file mode 100644 index 000000000000..b73ceb9e28b1 --- /dev/null +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/pedit.json @@ -0,0 +1,51 @@ +[ + { + "id": "319a", + "name": "Add pedit action that mangles IP TTL", + "category": [ + "actions", + "pedit" + ], + "setup": [ + [ + "$TC actions flush action pedit", + 0, + 1, + 255 + ] + ], + "cmdUnderTest": "$TC actions add action pedit ex munge ip ttl set 10", + "expExitCode": "0", + "verifyCmd": "$TC actions ls action pedit", + "matchPattern": "action order [0-9]+: pedit action pass keys 1.*index 1 ref.*key #0 at ipv4\\+8: val 0a000000 mask 00ffffff", + "matchCount": "1", + "teardown": [ + "$TC actions flush action pedit" + ] + }, + { + "id": "7e67", + "name": "Replace pedit action with invalid goto chain", + "category": [ + "actions", + "pedit" + ], + "setup": [ + [ + "$TC actions flush action pedit", + 0, + 1, + 255 + ], + "$TC actions add action pedit ex munge ip ttl set 10 pass index 90" + ], + "cmdUnderTest": "$TC actions replace action pedit ex munge ip ttl set 10 goto chain 42 index 90 cookie c1a0c1a0", + "expExitCode": "255", + "verifyCmd": "$TC actions ls action pedit", + "matchPattern": "action order [0-9]+: pedit action pass keys 1.*index 90 ref.*key #0 at ipv4\\+8: val 0a000000 mask 00ffffff", + "matchCount": "1", + "teardown": [ + 
"$TC actions flush action pedit" + ] + } +] From d6124d6ba697413efc53ff6919b1e0c250f1902a Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 20 Mar 2019 15:00:08 +0100 Subject: [PATCH 777/855] net/sched: act_police: validate the control action inside init() the following script: # tc qdisc add dev crash0 clsact # tc filter add dev crash0 egress matchall \ > action police rate 3mbit burst 250k pass index 90 # tc actions replace action police \ > rate 3mbit burst 250k goto chain 42 index 90 cookie c1a0c1a0 # tc actions show action police rate 3mbit burst had the following output: Error: Failed to init TC action chain. We have an error talking to the kernel total acts 1 action order 0: police 0x5a rate 3Mbit burst 250Kb mtu 2Kb action goto chain 42 overhead 0b ref 2 bind 1 cookie c1a0c1a0 Then, when crash0 starts transmitting more than 3Mbit/s, the following kernel crash is observed: BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 #PF error: [normal kernel read fault] PGD 800000007a779067 P4D 800000007a779067 PUD 2ad96067 PMD 0 Oops: 0000 [#1] SMP PTI CPU: 3 PID: 5032 Comm: netperf Not tainted 5.0.0-rc4.gotochain_crash+ #533 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 RIP: 0010:tcf_action_exec+0xb8/0x100 Code: 00 00 00 20 74 1d 83 f8 03 75 09 49 83 c4 08 4d 39 ec 75 bc 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 49 8b 97 a8 00 00 00 <48> 8b 12 48 89 55 00 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 RSP: 0018:ffffb0e04064fa60 EFLAGS: 00010246 RAX: 000000002000002a RBX: ffff93bb3322cce0 RCX: 0000000000000005 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff93bb3322cce0 RBP: ffffb0e04064fb00 R08: 0000000000000022 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000001 R12: ffff93bb3beed300 R13: ffff93bb3beed308 R14: 0000000000000001 R15: ffff93bb3b64d000 FS: 00007f0bc6be5740(0000) GS:ffff93bb3db80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000000746a8001 CR4: 00000000001606e0 Call Trace: tcf_classify+0x58/0x120 __dev_queue_xmit+0x40a/0x890 ? ipt_do_table+0x31c/0x420 [ip_tables] ? ip_finish_output2+0x16f/0x430 ip_finish_output2+0x16f/0x430 ? ip_output+0x69/0xe0 ip_output+0x69/0xe0 ? ip_forward_options+0x1a0/0x1a0 __tcp_transmit_skb+0x563/0xa40 tcp_write_xmit+0x243/0xfa0 __tcp_push_pending_frames+0x32/0xf0 tcp_sendmsg_locked+0x404/0xd30 tcp_sendmsg+0x27/0x40 sock_sendmsg+0x36/0x40 __sys_sendto+0x10e/0x140 ? __sys_connect+0x87/0xf0 ? syscall_trace_enter+0x1df/0x2e0 ? 
__audit_syscall_exit+0x216/0x260 __x64_sys_sendto+0x24/0x30 do_syscall_64+0x5b/0x180 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f0bc5ffbafd Code: 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 8b 05 ae c4 2c 00 85 c0 75 2d 45 31 c9 45 31 c0 4c 63 d1 48 63 ff b8 2c 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 01 c3 48 8b 15 63 63 2c 00 f7 d8 64 89 02 48 RSP: 002b:00007fffef94b7f8 EFLAGS: 00000246 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 0000000000004000 RCX: 00007f0bc5ffbafd RDX: 0000000000004000 RSI: 00000000017e5420 RDI: 0000000000000004 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000004 R13: 00000000017e51d0 R14: 0000000000000010 R15: 0000000000000006 Modules linked in: act_police veth ip6table_filter ip6_tables iptable_filter binfmt_misc ext4 snd_hda_codec_generic mbcache crct10dif_pclmul jbd2 crc32_pclmul ghash_clmulni_intel snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_seq snd_seq_device snd_pcm aesni_intel crypto_simd cryptd glue_helper snd_timer snd joydev pcspkr virtio_balloon soundcore i2c_piix4 nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs ata_generic pata_acpi qxl drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm drm virtio_blk virtio_net virtio_console net_failover failover crc32c_intel ata_piix libata serio_raw virtio_pci virtio_ring virtio floppy dm_mirror dm_region_hash dm_log dm_mod CR2: 0000000000000000 Validating the control action within tcf_police_init() proved to fix the above issue. A TDC selftest is added to verify the correct behavior. Fixes: db50514f9a9c ("net: sched: add termination action to allow goto chain") Fixes: 97763dc0f401 ("net_sched: reject unknown tcfa_action values") Signed-off-by: Davide Caratti Signed-off-by: David S. 
Miller --- net/sched/act_police.c | 12 ++++++++- .../tc-testing/tc-tests/actions/police.json | 25 +++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 229eba7925e5..2b8581f6ab51 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -21,6 +21,7 @@ #include #include #include +#include struct tcf_police_params { int tcfp_result; @@ -88,6 +89,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, { int ret = 0, tcfp_result = TC_ACT_OK, err, size; struct nlattr *tb[TCA_POLICE_MAX + 1]; + struct tcf_chain *goto_ch = NULL; struct tc_police *parm; struct tcf_police *police; struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL; @@ -129,6 +131,9 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, tcf_idr_release(*a, bind); return -EEXIST; } + err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); + if (err < 0) + goto release_idr; police = to_police(*a); if (parm->rate.rate) { @@ -214,12 +219,14 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, if (new->peak_present) police->tcfp_ptoks = new->tcfp_mtu_ptoks; spin_unlock_bh(&police->tcfp_lock); - police->tcf_action = parm->action; + goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); rcu_swap_protected(police->params, new, lockdep_is_held(&police->tcf_lock)); spin_unlock_bh(&police->tcf_lock); + if (goto_ch) + tcf_chain_put_by_act(goto_ch); if (new) kfree_rcu(new, rcu); @@ -230,6 +237,9 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, failure: qdisc_put_rtab(P_tab); qdisc_put_rtab(R_tab); + if (goto_ch) + tcf_chain_put_by_act(goto_ch); +release_idr: tcf_idr_release(*a, bind); return err; } diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/police.json b/tools/testing/selftests/tc-testing/tc-tests/actions/police.json index 4086a50a670e..b8268da5adaa 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/police.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/police.json @@ -739,5 +739,30 @@ "teardown": [ "$TC actions flush action police" ] + }, + { + "id": "689e", + "name": "Replace police action with invalid goto chain control", + "category": [ + "actions", + "police" + ], + "setup": [ + [ + "$TC actions flush action police", + 0, + 1, + 255 + ], + "$TC actions add action police rate 3mbit burst 250k drop index 90" + ], + "cmdUnderTest": "$TC actions replace action police rate 3mbit burst 250k goto chain 42 index 90 cookie c1a0c1a0", + "expExitCode": "255", + "verifyCmd": "$TC actions get action police index 90", + "matchPattern": "action order [0-9]*: police 0x5a rate 3Mbit burst 250Kb mtu 2Kb action drop", + "matchCount": "1", + "teardown": [ + "$TC actions flush action police" + ] } ] From e8c87c643ef3603c6e63bdecb67b03d794648493 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 20 Mar 2019 15:00:09 +0100 Subject: [PATCH 778/855] net/sched: act_sample: validate the control action inside init() the following script: # tc qdisc add dev crash0 clsact # tc filter add dev crash0 egress matchall \ > action sample rate 1024 group 4 pass index 90 # tc actions replace action sample \ > rate 1024 group 4 goto chain 42 index 90 cookie c1a0c1a0 # tc actions show action sample had the following output: Error: Failed to init TC action chain. 
We have an error talking to the kernel total acts 1 action order 0: sample rate 1/1024 group 4 goto chain 42 index 90 ref 2 bind 1 cookie c1a0c1a0 Then, the first packet transmitted by crash0 made the kernel crash: BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 #PF error: [normal kernel read fault] PGD 8000000079966067 P4D 8000000079966067 PUD 7987b067 PMD 0 Oops: 0000 [#1] SMP PTI CPU: 0 PID: 5 Comm: kworker/0:0 Not tainted 5.0.0-rc4.gotochain_crash+ #536 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 Workqueue: ipv6_addrconf addrconf_dad_work RIP: 0010:tcf_action_exec+0xb8/0x100 Code: 00 00 00 20 74 1d 83 f8 03 75 09 49 83 c4 08 4d 39 ec 75 bc 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 49 8b 97 a8 00 00 00 <48> 8b 12 48 89 55 00 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 RSP: 0018:ffffbee60033fad0 EFLAGS: 00010246 RAX: 000000002000002a RBX: ffff99d7ae6e3b00 RCX: 00000000e555df9b RDX: 0000000000000000 RSI: 00000000b0352718 RDI: ffff99d7fda1fcf0 RBP: ffffbee60033fb70 R08: 0000000070731ab1 R09: 0000000000000400 R10: 0000000000000000 R11: ffff99d7ac733838 R12: ffff99d7f3c2be00 R13: ffff99d7f3c2be08 R14: 0000000000000001 R15: ffff99d7f3c2b600 FS: 0000000000000000(0000) GS:ffff99d7fda00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000000797de006 CR4: 00000000001606f0 Call Trace: tcf_classify+0x58/0x120 __dev_queue_xmit+0x40a/0x890 ? ndisc_next_option+0x50/0x50 ? ___neigh_create+0x4d5/0x680 ? ip6_finish_output2+0x1b5/0x590 ip6_finish_output2+0x1b5/0x590 ? ip6_output+0x68/0x110 ip6_output+0x68/0x110 ? nf_hook.constprop.28+0x79/0xc0 ndisc_send_skb+0x248/0x2e0 ndisc_send_ns+0xf8/0x200 ? addrconf_dad_work+0x389/0x4b0 addrconf_dad_work+0x389/0x4b0 ? __switch_to_asm+0x34/0x70 ? process_one_work+0x195/0x380 ? addrconf_dad_completed+0x370/0x370 process_one_work+0x195/0x380 worker_thread+0x30/0x390 ? process_one_work+0x380/0x380 kthread+0x113/0x130 ? kthread_park+0x90/0x90 ret_from_fork+0x35/0x40 Modules linked in: act_sample psample veth ip6table_filter ip6_tables iptable_filter binfmt_misc ext4 crct10dif_pclmul crc32_pclmul ghash_clmulni_intel mbcache jbd2 snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_seq snd_seq_device aesni_intel crypto_simd snd_pcm cryptd glue_helper snd_timer joydev snd pcspkr virtio_balloon i2c_piix4 soundcore nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs ata_generic pata_acpi qxl drm_kms_helper syscopyarea sysfillrect virtio_net sysimgblt fb_sys_fops net_failover ttm failover virtio_blk virtio_console drm ata_piix serio_raw crc32c_intel libata virtio_pci virtio_ring virtio floppy dm_mirror dm_region_hash dm_log dm_mod CR2: 0000000000000000 Validating the control action within tcf_sample_init() proved to fix the above issue. A TDC selftest is added to verify the correct behavior. Fixes: db50514f9a9c ("net: sched: add termination action to allow goto chain") Fixes: 97763dc0f401 ("net_sched: reject unknown tcfa_action values") Signed-off-by: Davide Caratti Signed-off-by: David S. 
Miller --- net/sched/act_sample.c | 19 +++++++++++--- .../tc-testing/tc-tests/actions/sample.json | 25 +++++++++++++++++++ 2 files changed, 41 insertions(+), 3 deletions(-) diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 36b8adbe935d..4060b0955c97 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -43,6 +44,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, struct tc_action_net *tn = net_generic(net, sample_net_id); struct nlattr *tb[TCA_SAMPLE_MAX + 1]; struct psample_group *psample_group; + struct tcf_chain *goto_ch = NULL; struct tc_sample *parm; u32 psample_group_num; struct tcf_sample *s; @@ -79,18 +81,21 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, tcf_idr_release(*a, bind); return -EEXIST; } + err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); + if (err < 0) + goto release_idr; psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]); psample_group = psample_group_get(net, psample_group_num); if (!psample_group) { - tcf_idr_release(*a, bind); - return -ENOMEM; + err = -ENOMEM; + goto put_chain; } s = to_sample(*a); spin_lock_bh(&s->tcf_lock); - s->tcf_action = parm->action; + goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); s->rate = nla_get_u32(tb[TCA_SAMPLE_RATE]); s->psample_group_num = psample_group_num; RCU_INIT_POINTER(s->psample_group, psample_group); @@ -100,10 +105,18 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, s->trunc_size = nla_get_u32(tb[TCA_SAMPLE_TRUNC_SIZE]); } spin_unlock_bh(&s->tcf_lock); + if (goto_ch) + tcf_chain_put_by_act(goto_ch); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); return ret; +put_chain: + if (goto_ch) + tcf_chain_put_by_act(goto_ch); +release_idr: + tcf_idr_release(*a, bind); + return err; } static void tcf_sample_cleanup(struct tc_action *a) diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/sample.json b/tools/testing/selftests/tc-testing/tc-tests/actions/sample.json index 3aca33c00039..27f0acaed880 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/sample.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/sample.json @@ -584,5 +584,30 @@ "teardown": [ "$TC actions flush action sample" ] + }, + { + "id": "0a6e", + "name": "Replace sample action with invalid goto chain control", + "category": [ + "actions", + "sample" + ], + "setup": [ + [ + "$TC actions flush action sample", + 0, + 1, + 255 + ], + "$TC actions add action sample rate 1024 group 4 pass index 90" + ], + "cmdUnderTest": "$TC actions replace action sample rate 1024 group 7 goto chain 42 index 90 cookie c1a0c1a0", + "expExitCode": "255", + "verifyCmd": "$TC actions list action sample", + "matchPattern": "action order [0-9]+: sample rate 1/1024 group 4 pass.*index 90", + "matchCount": "1", + "teardown": [ + "$TC actions flush action sample" + ] } ] From 4b006b0c139e486773335f4c23b4d82348cfeb04 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 20 Mar 2019 15:00:10 +0100 Subject: [PATCH 779/855] net/sched: act_simple: validate the control action inside init() the following script: # tc qdisc add dev crash0 clsact # tc filter add dev crash0 egress matchall \ > action simple sdata hello pass index 90 # tc actions replace action simple \ > sdata world goto chain 42 index 90 cookie c1a0c1a0 # tc action show action simple had the following output: Error: Failed to init TC action chain. 
We have an error talking to the kernel total acts 1 action order 0: Simple index 90 ref 2 bind 1 cookie c1a0c1a0 Then, the first packet transmitted by crash0 made the kernel crash: BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 #PF error: [normal kernel read fault] PGD 800000006a6fb067 P4D 800000006a6fb067 PUD 6aed6067 PMD 0 Oops: 0000 [#1] SMP PTI CPU: 2 PID: 3241 Comm: kworker/2:0 Not tainted 5.0.0-rc4.gotochain_crash+ #536 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 Workqueue: ipv6_addrconf addrconf_dad_work RIP: 0010:tcf_action_exec+0xb8/0x100 Code: 00 00 00 20 74 1d 83 f8 03 75 09 49 83 c4 08 4d 39 ec 75 bc 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 49 8b 97 a8 00 00 00 <48> 8b 12 48 89 55 00 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 RSP: 0018:ffffbe6781763ad0 EFLAGS: 00010246 RAX: 000000002000002a RBX: ffff9e59bdb80e00 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffff9e59b4716738 RDI: ffff9e59ab12d140 RBP: ffffbe6781763b70 R08: 0000000000000234 R09: 0000000000aaaaaa R10: 0000000000000000 R11: ffff9e59b247cd50 R12: ffff9e59b112f100 R13: ffff9e59b112f108 R14: 0000000000000001 R15: ffff9e59ab12d0c0 FS: 0000000000000000(0000) GS:ffff9e59b4700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000006af92004 CR4: 00000000001606e0 Call Trace: tcf_classify+0x58/0x120 __dev_queue_xmit+0x40a/0x890 ? ndisc_next_option+0x50/0x50 ? ___neigh_create+0x4d5/0x680 ? ip6_finish_output2+0x1b5/0x590 ip6_finish_output2+0x1b5/0x590 ? ip6_output+0x68/0x110 ip6_output+0x68/0x110 ? nf_hook.constprop.28+0x79/0xc0 ndisc_send_skb+0x248/0x2e0 ndisc_send_ns+0xf8/0x200 ? addrconf_dad_work+0x389/0x4b0 addrconf_dad_work+0x389/0x4b0 ? __switch_to_asm+0x34/0x70 ? process_one_work+0x195/0x380 ? addrconf_dad_completed+0x370/0x370 process_one_work+0x195/0x380 worker_thread+0x30/0x390 ? process_one_work+0x380/0x380 kthread+0x113/0x130 ? kthread_park+0x90/0x90 ret_from_fork+0x35/0x40 Modules linked in: act_simple veth ip6table_filter ip6_tables iptable_filter binfmt_misc crct10dif_pclmul crc32_pclmul ghash_clmulni_intel ext4 snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep mbcache snd_hda_core jbd2 snd_seq snd_seq_device snd_pcm aesni_intel crypto_simd cryptd snd_timer glue_helper snd joydev virtio_balloon pcspkr soundcore i2c_piix4 nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs ata_generic pata_acpi qxl drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops virtio_net ttm net_failover virtio_console virtio_blk failover drm crc32c_intel serio_raw floppy ata_piix libata virtio_pci virtio_ring virtio dm_mirror dm_region_hash dm_log dm_mod CR2: 0000000000000000 Validating the control action within tcf_simple_init() proved to fix the above issue. A TDC selftest is added to verify the correct behavior. Fixes: db50514f9a9c ("net: sched: add termination action to allow goto chain") Fixes: 97763dc0f401 ("net_sched: reject unknown tcfa_action values") Signed-off-by: Davide Caratti Signed-off-by: David S. 
Miller --- net/sched/act_simple.c | 52 ++++++++++++++----- .../tc-testing/tc-tests/actions/simple.json | 25 +++++++++ 2 files changed, 63 insertions(+), 14 deletions(-) diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 4916dc3e3668..23c8ca5615e5 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -60,14 +61,26 @@ static int alloc_defdata(struct tcf_defact *d, const struct nlattr *defdata) return 0; } -static void reset_policy(struct tcf_defact *d, const struct nlattr *defdata, - struct tc_defact *p) +static int reset_policy(struct tc_action *a, const struct nlattr *defdata, + struct tc_defact *p, struct tcf_proto *tp, + struct netlink_ext_ack *extack) { + struct tcf_chain *goto_ch = NULL; + struct tcf_defact *d; + int err; + + err = tcf_action_check_ctrlact(p->action, tp, &goto_ch, extack); + if (err < 0) + return err; + d = to_defact(a); spin_lock_bh(&d->tcf_lock); - d->tcf_action = p->action; + goto_ch = tcf_action_set_ctrlact(a, p->action, goto_ch); memset(d->tcfd_defdata, 0, SIMP_MAX_DATA); nla_strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); spin_unlock_bh(&d->tcf_lock); + if (goto_ch) + tcf_chain_put_by_act(goto_ch); + return 0; } static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = { @@ -82,6 +95,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, { struct tc_action_net *tn = net_generic(net, simp_net_id); struct nlattr *tb[TCA_DEF_MAX + 1]; + struct tcf_chain *goto_ch = NULL; struct tc_defact *parm; struct tcf_defact *d; bool exists = false; @@ -122,27 +136,37 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, } d = to_defact(*a); - ret = alloc_defdata(d, tb[TCA_DEF_DATA]); - if (ret < 0) { - tcf_idr_release(*a, bind); - return ret; - } - d->tcf_action = parm->action; + err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, + extack); + if (err < 0) + goto release_idr; + + err = alloc_defdata(d, tb[TCA_DEF_DATA]); + if (err < 0) + goto put_chain; + + tcf_action_set_ctrlact(*a, parm->action, goto_ch); ret = ACT_P_CREATED; } else { - d = to_defact(*a); - if (!ovr) { - tcf_idr_release(*a, bind); - return -EEXIST; + err = -EEXIST; + goto release_idr; } - reset_policy(d, tb[TCA_DEF_DATA], parm); + err = reset_policy(*a, tb[TCA_DEF_DATA], parm, tp, extack); + if (err) + goto release_idr; } if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); return ret; +put_chain: + if (goto_ch) + tcf_chain_put_by_act(goto_ch); +release_idr: + tcf_idr_release(*a, bind); + return err; } static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/simple.json b/tools/testing/selftests/tc-testing/tc-tests/actions/simple.json index e89a7aa4012d..8e8c1ae12260 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/simple.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/simple.json @@ -126,5 +126,30 @@ "teardown": [ "" ] + }, + { + "id": "b776", + "name": "Replace simple action with invalid goto chain control", + "category": [ + "actions", + "simple" + ], + "setup": [ + [ + "$TC actions flush action simple", + 0, + 1, + 255 + ], + "$TC actions add action simple sdata \"hello\" pass index 90" + ], + "cmdUnderTest": "$TC actions replace action simple sdata \"world\" goto chain 42 index 90 cookie c1a0c1a0", + "expExitCode": "255", + "verifyCmd": "$TC actions list action simple", + "matchPattern": "action order [0-9]*: Simple .*index 90 ref", + "matchCount": "1", + 
"teardown": [ + "$TC actions flush action simple" + ] } ] From ec7727bb24b01e96b0c46068addf355ee4f794d8 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 20 Mar 2019 15:00:11 +0100 Subject: [PATCH 780/855] net/sched: act_skbedit: validate the control action inside init() the following script: # tc qdisc add dev crash0 clsact # tc filter add dev crash0 egress matchall \ > action skbedit ptype host pass index 90 # tc actions replace action skbedit \ > ptype host goto chain 42 index 90 cookie c1a0c1a0 # tc actions show action skbedit had the following output: Error: Failed to init TC action chain. We have an error talking to the kernel total acts 1 action order 0: skbedit ptype host goto chain 42 index 90 ref 2 bind 1 cookie c1a0c1a0 Then, the first packet transmitted by crash0 made the kernel crash: BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 #PF error: [normal kernel read fault] PGD 0 P4D 0 Oops: 0000 [#1] SMP PTI CPU: 3 PID: 3467 Comm: kworker/3:3 Not tainted 5.0.0-rc4.gotochain_crash+ #536 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 Workqueue: ipv6_addrconf addrconf_dad_work RIP: 0010:tcf_action_exec+0xb8/0x100 Code: 00 00 00 20 74 1d 83 f8 03 75 09 49 83 c4 08 4d 39 ec 75 bc 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 49 8b 97 a8 00 00 00 <48> 8b 12 48 89 55 00 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 RSP: 0018:ffffb50a81e1fad0 EFLAGS: 00010246 RAX: 000000002000002a RBX: ffff9aa47ba4ea00 RCX: 0000000000000001 RDX: 0000000000000000 RSI: ffff9aa469eeb3c0 RDI: ffff9aa47ba4ea00 RBP: ffffb50a81e1fb70 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: ffff9aa47bce0638 R12: ffff9aa4793b0c00 R13: ffff9aa4793b0c08 R14: 0000000000000001 R15: ffff9aa469eeb3c0 FS: 0000000000000000(0000) GS:ffff9aa474780000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000007360e005 CR4: 00000000001606e0 Call Trace: tcf_classify+0x58/0x120 __dev_queue_xmit+0x40a/0x890 ? ndisc_next_option+0x50/0x50 ? ___neigh_create+0x4d5/0x680 ? ip6_finish_output2+0x1b5/0x590 ip6_finish_output2+0x1b5/0x590 ? ip6_output+0x68/0x110 ip6_output+0x68/0x110 ? nf_hook.constprop.28+0x79/0xc0 ndisc_send_skb+0x248/0x2e0 ndisc_send_ns+0xf8/0x200 ? addrconf_dad_work+0x389/0x4b0 addrconf_dad_work+0x389/0x4b0 ? __switch_to_asm+0x34/0x70 ? process_one_work+0x195/0x380 ? addrconf_dad_completed+0x370/0x370 process_one_work+0x195/0x380 worker_thread+0x30/0x390 ? process_one_work+0x380/0x380 kthread+0x113/0x130 ? kthread_park+0x90/0x90 ret_from_fork+0x35/0x40 Modules linked in: act_skbedit veth ip6table_filter ip6_tables iptable_filter binfmt_misc crct10dif_pclmul crc32_pclmul ghash_clmulni_intel ext4 snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep mbcache snd_hda_core jbd2 snd_seq snd_seq_device snd_pcm aesni_intel crypto_simd cryptd snd_timer glue_helper snd joydev soundcore pcspkr virtio_balloon i2c_piix4 nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs ata_generic pata_acpi qxl drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm virtio_net net_failover drm failover virtio_blk virtio_console ata_piix virtio_pci crc32c_intel serio_raw libata virtio_ring virtio floppy dm_mirror dm_region_hash dm_log dm_mod CR2: 0000000000000000 Validating the control action within tcf_skbedit_init() proved to fix the above issue. A TDC selftest is added to verify the correct behavior. 
Fixes: db50514f9a9c ("net: sched: add termination action to allow goto chain") Fixes: 97763dc0f401 ("net_sched: reject unknown tcfa_action values") Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- net/sched/act_skbedit.c | 19 +++++++++++--- .../tc-testing/tc-tests/actions/skbedit.json | 25 +++++++++++++++++++ 2 files changed, 41 insertions(+), 3 deletions(-) diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 4566eff3d027..7e1d261a31d2 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -102,6 +103,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct tc_action_net *tn = net_generic(net, skbedit_net_id); struct tcf_skbedit_params *params_new; struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; + struct tcf_chain *goto_ch = NULL; struct tc_skbedit *parm; struct tcf_skbedit *d; u32 flags = 0, *priority = NULL, *mark = NULL, *mask = NULL; @@ -187,11 +189,14 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, return -EEXIST; } } + err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); + if (err < 0) + goto release_idr; params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); if (unlikely(!params_new)) { - tcf_idr_release(*a, bind); - return -ENOMEM; + err = -ENOMEM; + goto put_chain; } params_new->flags = flags; @@ -209,16 +214,24 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, params_new->mask = *mask; spin_lock_bh(&d->tcf_lock); - d->tcf_action = parm->action; + goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); rcu_swap_protected(d->params, params_new, lockdep_is_held(&d->tcf_lock)); spin_unlock_bh(&d->tcf_lock); if (params_new) kfree_rcu(params_new, rcu); + if (goto_ch) + tcf_chain_put_by_act(goto_ch); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); return ret; +put_chain: + if (goto_ch) + tcf_chain_put_by_act(goto_ch); +release_idr: + tcf_idr_release(*a, bind); + return err; } static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/skbedit.json b/tools/testing/selftests/tc-testing/tc-tests/actions/skbedit.json index 5aaf593b914a..ecd96eda7f6a 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/skbedit.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/skbedit.json @@ -484,5 +484,30 @@ "teardown": [ "$TC actions flush action skbedit" ] + }, + { + "id": "1b2b", + "name": "Replace skbedit action with invalid goto_chain control", + "category": [ + "actions", + "skbedit" + ], + "setup": [ + [ + "$TC actions flush action skbedit", + 0, + 1, + 255 + ], + "$TC actions add action skbedit ptype host pass index 90" + ], + "cmdUnderTest": "$TC actions replace action skbedit ptype host goto chain 42 index 90 cookie c1a0c1a0", + "expExitCode": "255", + "verifyCmd": "$TC actions list action skbedit", + "matchPattern": "action order [0-9]*: skbedit ptype host pass.*index 90 ref", + "matchCount": "1", + "teardown": [ + "$TC actions flush action skbedit" + ] } ] From 7c3d825d12c5e6056ea73c0a202cbdef9d9ab9e6 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 20 Mar 2019 15:00:12 +0100 Subject: [PATCH 781/855] net/sched: act_skbmod: validate the control action inside init() the following script: # tc qdisc add dev crash0 clsact # tc filter add dev crash0 egress matchall \ > action skbmod set smac 00:c1:a0:c1:a0:00 pass index 90 # tc actions replace action skbmod \ > set smac 00:c1:a0:c1:a0:00 goto chain 42 
index 90 cookie c1a0c1a0 # tc actions show action skbmod had the following output: src MAC address <00:c1:a0:c1:a0:00> src MAC address <00:c1:a0:c1:a0:00> Error: Failed to init TC action chain. We have an error talking to the kernel total acts 1 action order 0: skbmod goto chain 42 set smac 00:c1:a0:c1:a0:00 index 90 ref 2 bind 1 cookie c1a0c1a0 Then, the first packet transmitted by crash0 made the kernel crash: BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 #PF error: [normal kernel read fault] PGD 800000002d5c7067 P4D 800000002d5c7067 PUD 77e16067 PMD 0 Oops: 0000 [#1] SMP PTI CPU: 3 PID: 0 Comm: swapper/3 Not tainted 5.0.0-rc4.gotochain_crash+ #536 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 RIP: 0010:tcf_action_exec+0xb8/0x100 Code: 00 00 00 20 74 1d 83 f8 03 75 09 49 83 c4 08 4d 39 ec 75 bc 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 49 8b 97 a8 00 00 00 <48> 8b 12 48 89 55 00 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 RSP: 0018:ffff8987ffd83be0 EFLAGS: 00010246 RAX: 000000002000002a RBX: ffff8987aeb68800 RCX: ffff8987fa263640 RDX: 0000000000000000 RSI: ffff8987f51c8802 RDI: 00000000000000a0 RBP: ffff8987ffd83c80 R08: ffff8987f939bac8 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffff8987f5c77d00 R13: ffff8987f5c77d08 R14: 0000000000000001 R15: ffff8987f0c29f00 FS: 0000000000000000(0000) GS:ffff8987ffd80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000007832c004 CR4: 00000000001606e0 Call Trace: tcf_classify+0x58/0x120 __dev_queue_xmit+0x40a/0x890 ? ip6_finish_output2+0x369/0x590 ip6_finish_output2+0x369/0x590 ? ip6_output+0x68/0x110 ip6_output+0x68/0x110 ? nf_hook.constprop.35+0x79/0xc0 mld_sendpack+0x16f/0x220 mld_ifc_timer_expire+0x195/0x2c0 ? igmp6_timer_handler+0x70/0x70 call_timer_fn+0x2b/0x130 run_timer_softirq+0x3e8/0x440 ? tick_sched_timer+0x37/0x70 __do_softirq+0xe3/0x2f5 irq_exit+0xf0/0x100 smp_apic_timer_interrupt+0x6c/0x130 apic_timer_interrupt+0xf/0x20 RIP: 0010:native_safe_halt+0x2/0x10 Code: 56 ff ff ff 7f f3 c3 65 48 8b 04 25 00 5c 01 00 f0 80 48 02 20 48 8b 00 a8 08 74 8b eb c1 90 90 90 90 90 90 90 90 90 90 fb f4 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 f4 c3 90 90 90 90 90 90 RSP: 0018:ffffa2a1c038feb8 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff13 RAX: ffffffffa94184f0 RBX: 0000000000000003 RCX: 0000000000000001 RDX: 0000000000000001 RSI: 0000000000000087 RDI: 0000000000000003 RBP: 0000000000000003 R08: 001123cfc2ba71ac R09: 0000000000000000 R10: 0000000000000000 R11: 00000000000f4240 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 ? 
__sched_text_end+0x1/0x1 default_idle+0x1c/0x140 do_idle+0x1c4/0x280 cpu_startup_entry+0x19/0x20 start_secondary+0x1a7/0x200 secondary_startup_64+0xa4/0xb0 Modules linked in: act_skbmod veth ip6table_filter ip6_tables iptable_filter binfmt_misc ext4 crct10dif_pclmul crc32_pclmul ghash_clmulni_intel mbcache jbd2 snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_seq snd_seq_device aesni_intel crypto_simd cryptd glue_helper snd_pcm joydev pcspkr virtio_balloon snd_timer snd i2c_piix4 soundcore nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs ata_generic pata_acpi qxl drm_kms_helper syscopyarea sysfillrect virtio_net sysimgblt fb_sys_fops net_failover virtio_console ttm virtio_blk failover drm crc32c_intel serio_raw ata_piix virtio_pci libata virtio_ring virtio floppy dm_mirror dm_region_hash dm_log dm_mod CR2: 0000000000000000 Validating the control action within tcf_skbmod_init() proved to fix the above issue. A TDC selftest is added to verify the correct behavior. Fixes: db50514f9a9c ("net: sched: add termination action to allow goto chain") Fixes: 97763dc0f401 ("net_sched: reject unknown tcfa_action values") Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- net/sched/act_skbmod.c | 19 +++++++++++--- .../tc-testing/tc-tests/actions/skbmod.json | 25 +++++++++++++++++++ 2 files changed, 41 insertions(+), 3 deletions(-) diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index b9ab2c8f07f1..1d4c324d0a42 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -88,6 +89,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, struct tc_action_net *tn = net_generic(net, skbmod_net_id); struct nlattr *tb[TCA_SKBMOD_MAX + 1]; struct tcf_skbmod_params *p, *p_old; + struct tcf_chain *goto_ch = NULL; struct tc_skbmod *parm; struct tcf_skbmod *d; bool exists = false; @@ -154,21 +156,24 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, tcf_idr_release(*a, bind); return -EEXIST; } + err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); + if (err < 0) + goto release_idr; d = to_skbmod(*a); p = kzalloc(sizeof(struct tcf_skbmod_params), GFP_KERNEL); if (unlikely(!p)) { - tcf_idr_release(*a, bind); - return -ENOMEM; + err = -ENOMEM; + goto put_chain; } p->flags = lflags; - d->tcf_action = parm->action; if (ovr) spin_lock_bh(&d->tcf_lock); /* Protected by tcf_lock if overwriting existing action. 
*/ + goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); p_old = rcu_dereference_protected(d->skbmod_p, 1); if (lflags & SKBMOD_F_DMAC) @@ -184,10 +189,18 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, if (p_old) kfree_rcu(p_old, rcu); + if (goto_ch) + tcf_chain_put_by_act(goto_ch); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); return ret; +put_chain: + if (goto_ch) + tcf_chain_put_by_act(goto_ch); +release_idr: + tcf_idr_release(*a, bind); + return err; } static void tcf_skbmod_cleanup(struct tc_action *a) diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/skbmod.json b/tools/testing/selftests/tc-testing/tc-tests/actions/skbmod.json index fe3326e939c1..6eb4c4f97060 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/skbmod.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/skbmod.json @@ -392,5 +392,30 @@ "teardown": [ "$TC actions flush action skbmod" ] + }, + { + "id": "b651", + "name": "Replace skbmod action with invalid goto_chain control", + "category": [ + "actions", + "skbmod" + ], + "setup": [ + [ + "$TC actions flush action skbmod", + 0, + 1, + 255 + ], + "$TC actions add action skbmod set etype 0x1111 pass index 90" + ], + "cmdUnderTest": "$TC actions replace action skbmod set etype 0x1111 goto chain 42 index 90 cookie c1a0c1a0", + "expExitCode": "255", + "verifyCmd": "$TC actions ls action skbmod", + "matchPattern": "action order [0-9]*: skbmod pass set etype 0x1111\\s+index 90 ref", + "matchCount": "1", + "teardown": [ + "$TC actions flush action skbmod" + ] } ] From e5fdabacbffc5d321bf9f51410fe0db0834606eb Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 20 Mar 2019 15:00:13 +0100 Subject: [PATCH 782/855] net/sched: act_tunnel_key: validate the control action inside init() the following script: # tc qdisc add dev crash0 clsact # tc filter add dev crash0 egress matchall \ > action tunnel_key set src_ip 10.10.10.1 dst_ip 20.20.2 dst_port 3128 \ > nocsum id 1 pass index 90 # tc actions replace action tunnel_key \ > set src_ip 10.10.10.1 dst_ip 20.20.2 dst_port 3128 nocsum id 1 \ > goto chain 42 index 90 cookie c1a0c1a0 # tc actions show action tunnel_key had the following output: Error: Failed to init TC action chain. 
We have an error talking to the kernel total acts 1 action order 0: tunnel_key set src_ip 10.10.10.1 dst_ip 20.20.2.0 key_id 1 dst_port 3128 nocsum goto chain 42 index 90 ref 2 bind 1 cookie c1a0c1a0 then, the first packet transmitted by crash0 made the kernel crash: BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 #PF error: [normal kernel read fault] PGD 800000002aba4067 P4D 800000002aba4067 PUD 795f9067 PMD 0 Oops: 0000 [#1] SMP PTI CPU: 3 PID: 0 Comm: swapper/3 Not tainted 5.0.0-rc4.gotochain_crash+ #536 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 RIP: 0010:tcf_action_exec+0xb8/0x100 Code: 00 00 00 20 74 1d 83 f8 03 75 09 49 83 c4 08 4d 39 ec 75 bc 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 49 8b 97 a8 00 00 00 <48> 8b 12 48 89 55 00 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 RSP: 0018:ffff9346bdb83be0 EFLAGS: 00010246 RAX: 000000002000002a RBX: ffff9346bb795c00 RCX: 0000000000000002 RDX: 0000000000000000 RSI: ffff93466c881700 RDI: 0000000000000246 RBP: ffff9346bdb83c80 R08: ffff9346b3e1e0c8 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffff9346b978f000 R13: ffff9346b978f008 R14: 0000000000000001 R15: ffff93466dceeb40 FS: 0000000000000000(0000) GS:ffff9346bdb80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000007a6c2002 CR4: 00000000001606e0 Call Trace: tcf_classify+0x58/0x120 __dev_queue_xmit+0x40a/0x890 ? ip6_finish_output2+0x369/0x590 ip6_finish_output2+0x369/0x590 ? ip6_output+0x68/0x110 ip6_output+0x68/0x110 ? nf_hook.constprop.35+0x79/0xc0 mld_sendpack+0x16f/0x220 mld_ifc_timer_expire+0x195/0x2c0 ? igmp6_timer_handler+0x70/0x70 call_timer_fn+0x2b/0x130 run_timer_softirq+0x3e8/0x440 ? tick_sched_timer+0x37/0x70 __do_softirq+0xe3/0x2f5 irq_exit+0xf0/0x100 smp_apic_timer_interrupt+0x6c/0x130 apic_timer_interrupt+0xf/0x20 RIP: 0010:native_safe_halt+0x2/0x10 Code: 55 ff ff ff 7f f3 c3 65 48 8b 04 25 00 5c 01 00 f0 80 48 02 20 48 8b 00 a8 08 74 8b eb c1 90 90 90 90 90 90 90 90 90 90 fb f4 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 f4 c3 90 90 90 90 90 90 RSP: 0018:ffffa48a8038feb8 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff13 RAX: ffffffffaa8184f0 RBX: 0000000000000003 RCX: 0000000000000000 RDX: 0000000000000001 RSI: 0000000000000087 RDI: 0000000000000003 RBP: 0000000000000003 R08: 0011251c6fcfac49 R09: ffff9346b995be00 R10: ffffa48a805e7ce8 R11: 00000000024c38dd R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 ? __sched_text_end+0x1/0x1 default_idle+0x1c/0x140 do_idle+0x1c4/0x280 cpu_startup_entry+0x19/0x20 start_secondary+0x1a7/0x200 secondary_startup_64+0xa4/0xb0 Modules linked in: act_tunnel_key veth ip6table_filter ip6_tables iptable_filter binfmt_misc ext4 crct10dif_pclmul crc32_pclmul snd_hda_codec_generic ghash_clmulni_intel mbcache snd_hda_intel jbd2 snd_hda_codec snd_hwdep snd_hda_core snd_seq snd_seq_device snd_pcm aesni_intel crypto_simd cryptd glue_helper joydev snd_timer snd pcspkr virtio_balloon soundcore i2c_piix4 nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs ata_generic pata_acpi qxl drm_kms_helper syscopyarea sysfillrect virtio_net sysimgblt fb_sys_fops ttm net_failover virtio_console virtio_blk failover drm serio_raw crc32c_intel ata_piix virtio_pci floppy virtio_ring libata virtio dm_mirror dm_region_hash dm_log dm_mod CR2: 0000000000000000 Validating the control action within tcf_tunnel_key_init() proved to fix the above issue. A TDC selftest is added to verify the correct behavior. 
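Note: unlike the other modules in this series, tunnel_key performs the
check after the tunnel metadata has already been set up, so its failure
path needs one extra step. Sketched from the hunks below (the final
release of the action itself happens in code not quoted here, so treat
that closing comment as an assumption):

	put_chain:
		if (goto_ch)
			tcf_chain_put_by_act(goto_ch);	/* drop the chain taken by the check */
	release_tun_meta:
		if (metadata)
			dst_release(&metadata->dst);	/* drop the tunnel metadata */
	/* ... the pre-existing error path then releases the action; setting
	 * exists = true before the goto makes it take the release branch. */
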
Fixes: db50514f9a9c ("net: sched: add termination action to allow goto chain") Fixes: 97763dc0f401 ("net_sched: reject unknown tcfa_action values") Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- net/sched/act_tunnel_key.c | 18 +++++++++++-- .../tc-tests/actions/tunnel_key.json | 25 +++++++++++++++++++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index fc295d91559a..d5aaf90a3971 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -217,6 +218,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1]; struct tcf_tunnel_key_params *params_new; struct metadata_dst *metadata = NULL; + struct tcf_chain *goto_ch = NULL; struct tc_tunnel_key *parm; struct tcf_tunnel_key *t; bool exists = false; @@ -360,6 +362,12 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, goto release_tun_meta; } + err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); + if (err < 0) { + ret = err; + exists = true; + goto release_tun_meta; + } t = to_tunnel_key(*a); params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); @@ -367,23 +375,29 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, NL_SET_ERR_MSG(extack, "Cannot allocate tunnel key parameters"); ret = -ENOMEM; exists = true; - goto release_tun_meta; + goto put_chain; } params_new->tcft_action = parm->t_action; params_new->tcft_enc_metadata = metadata; spin_lock_bh(&t->tcf_lock); - t->tcf_action = parm->action; + goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); rcu_swap_protected(t->params, params_new, lockdep_is_held(&t->tcf_lock)); spin_unlock_bh(&t->tcf_lock); tunnel_key_release_params(params_new); + if (goto_ch) + tcf_chain_put_by_act(goto_ch); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); return ret; +put_chain: + if (goto_ch) + tcf_chain_put_by_act(goto_ch); + release_tun_meta: if (metadata) dst_release(&metadata->dst); diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/tunnel_key.json b/tools/testing/selftests/tc-testing/tc-tests/actions/tunnel_key.json index e7e15a7336b6..28453a445fdb 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/tunnel_key.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/tunnel_key.json @@ -884,5 +884,30 @@ "teardown": [ "$TC actions flush action tunnel_key" ] + }, + { + "id": "8242", + "name": "Replace tunnel_key set action with invalid goto chain", + "category": [ + "actions", + "tunnel_key" + ], + "setup": [ + [ + "$TC actions flush action tunnel_key", + 0, + 1, + 255 + ], + "$TC actions add action tunnel_key set src_ip 10.10.10.1 dst_ip 20.20.20.2 dst_port 3128 nocsum id 1 pass index 90" + ], + "cmdUnderTest": "$TC actions replace action tunnel_key set src_ip 10.10.10.2 dst_ip 20.20.20.1 dst_port 3129 id 2 csum goto chain 42 index 90 cookie c1a0c1a0", + "expExitCode": "255", + "verifyCmd": "$TC actions get action tunnel_key index 90", + "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 10.10.10.1.*dst_ip 20.20.20.2.*key_id 1.*dst_port 3128.*csum pass.*index 90 ref", + "matchCount": "1", + "teardown": [ + "$TC actions flush action tunnel_key" + ] } ] From 7e0c8892df7d0316ec853adbf84db536cd53258c Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 20 Mar 2019 15:00:14 +0100 Subject: [PATCH 783/855] net/sched: act_vlan: validate the control action inside init() the following script: 
# tc qdisc add dev crash0 clsact # tc filter add dev crash0 egress matchall \ > action vlan pop pass index 90 # tc actions replace action vlan \ > pop goto chain 42 index 90 cookie c1a0c1a0 # tc actions show action vlan had the following output: Error: Failed to init TC action chain. We have an error talking to the kernel total acts 1 action order 0: vlan pop goto chain 42 index 90 ref 2 bind 1 cookie c1a0c1a0 Then, the first packet transmitted by crash0 made the kernel crash: BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 #PF error: [normal kernel read fault] PGD 800000007974f067 P4D 800000007974f067 PUD 79638067 PMD 0 Oops: 0000 [#1] SMP PTI CPU: 3 PID: 0 Comm: swapper/3 Not tainted 5.0.0-rc4.gotochain_crash+ #536 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 RIP: 0010:tcf_action_exec+0xb8/0x100 Code: 00 00 00 20 74 1d 83 f8 03 75 09 49 83 c4 08 4d 39 ec 75 bc 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 49 8b 97 a8 00 00 00 <48> 8b 12 48 89 55 00 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 RSP: 0018:ffff982dfdb83be0 EFLAGS: 00010246 RAX: 000000002000002a RBX: ffff982dfc55db00 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffff982df97099c0 RDI: ffff982dfc55db00 RBP: ffff982dfdb83c80 R08: ffff982df983fec8 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffff982df5aacd00 R13: ffff982df5aacd08 R14: 0000000000000001 R15: ffff982df97099c0 FS: 0000000000000000(0000) GS:ffff982dfdb80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000000796d0005 CR4: 00000000001606e0 Call Trace: tcf_classify+0x58/0x120 __dev_queue_xmit+0x40a/0x890 ? ip6_finish_output2+0x369/0x590 ip6_finish_output2+0x369/0x590 ? ip6_output+0x68/0x110 ip6_output+0x68/0x110 ? nf_hook.constprop.35+0x79/0xc0 mld_sendpack+0x16f/0x220 mld_ifc_timer_expire+0x195/0x2c0 ? igmp6_timer_handler+0x70/0x70 call_timer_fn+0x2b/0x130 run_timer_softirq+0x3e8/0x440 ? enqueue_hrtimer+0x39/0x90 __do_softirq+0xe3/0x2f5 irq_exit+0xf0/0x100 smp_apic_timer_interrupt+0x6c/0x130 apic_timer_interrupt+0xf/0x20 RIP: 0010:native_safe_halt+0x2/0x10 Code: 7b ff ff ff 7f f3 c3 65 48 8b 04 25 00 5c 01 00 f0 80 48 02 20 48 8b 00 a8 08 74 8b eb c1 90 90 90 90 90 90 90 90 90 90 fb f4 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 f4 c3 90 90 90 90 90 90 RSP: 0018:ffffa4714038feb8 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff13 RAX: ffffffff840184f0 RBX: 0000000000000003 RCX: 0000000000000000 RDX: 0000000000000001 RSI: 0000000000000000 RDI: 0000001e57d3f387 RBP: 0000000000000003 R08: 001125d9ca39e1eb R09: 0000000000000000 R10: 000000000000027d R11: 000000000009f400 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 ? 
__sched_text_end+0x1/0x1 default_idle+0x1c/0x140 do_idle+0x1c4/0x280 cpu_startup_entry+0x19/0x20 start_secondary+0x1a7/0x200 secondary_startup_64+0xa4/0xb0 Modules linked in: act_vlan veth ip6table_filter ip6_tables iptable_filter binfmt_misc ext4 snd_hda_codec_generic mbcache crct10dif_pclmul jbd2 snd_hda_intel crc32_pclmul snd_hda_codec ghash_clmulni_intel snd_hwdep snd_hda_core snd_seq snd_seq_device snd_pcm aesni_intel crypto_simd cryptd glue_helper joydev snd_timer virtio_balloon snd pcspkr soundcore i2c_piix4 nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs ata_generic pata_acpi qxl drm_kms_helper syscopyarea sysfillrect sysimgblt virtio_net fb_sys_fops virtio_blk ttm net_failover virtio_console failover ata_piix drm libata crc32c_intel virtio_pci serio_raw virtio_ring virtio floppy dm_mirror dm_region_hash dm_log dm_mod CR2: 0000000000000000 Validating the control action within tcf_vlan_init() proved to fix the above issue. A TDC selftest is added to verify the correct behavior. Fixes: db50514f9a9c ("net: sched: add termination action to allow goto chain") Fixes: 97763dc0f401 ("net_sched: reject unknown tcfa_action values") Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- net/sched/act_vlan.c | 20 ++++++++++++--- .../tc-testing/tc-tests/actions/vlan.json | 25 +++++++++++++++++++ 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 4651ee15e35d..0f40d0a74423 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -109,6 +110,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, { struct tc_action_net *tn = net_generic(net, vlan_net_id); struct nlattr *tb[TCA_VLAN_MAX + 1]; + struct tcf_chain *goto_ch = NULL; struct tcf_vlan_params *p; struct tc_vlan *parm; struct tcf_vlan *v; @@ -200,12 +202,16 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, return -EEXIST; } + err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); + if (err < 0) + goto release_idr; + v = to_vlan(*a); p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) { - tcf_idr_release(*a, bind); - return -ENOMEM; + err = -ENOMEM; + goto put_chain; } p->tcfv_action = action; @@ -214,16 +220,24 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, p->tcfv_push_proto = push_proto; spin_lock_bh(&v->tcf_lock); - v->tcf_action = parm->action; + goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); rcu_swap_protected(v->vlan_p, p, lockdep_is_held(&v->tcf_lock)); spin_unlock_bh(&v->tcf_lock); + if (goto_ch) + tcf_chain_put_by_act(goto_ch); if (p) kfree_rcu(p, rcu); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); return ret; +put_chain: + if (goto_ch) + tcf_chain_put_by_act(goto_ch); +release_idr: + tcf_idr_release(*a, bind); + return err; } static void tcf_vlan_cleanup(struct tc_action *a) diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/vlan.json b/tools/testing/selftests/tc-testing/tc-tests/actions/vlan.json index 69ea09eefffc..cc7c7d758008 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/vlan.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/vlan.json @@ -688,5 +688,30 @@ "teardown": [ "$TC actions flush action vlan" ] + }, + { + "id": "e394", + "name": "Replace vlan push action with invalid goto chain control", + "category": [ + "actions", + "vlan" + ], + "setup": [ + [ + "$TC actions flush action vlan", + 0, + 1, + 255 + ], + "$TC actions add action vlan push id 500 pass 
index 90" + ], + "cmdUnderTest": "$TC actions replace action vlan push id 500 goto chain 42 index 90 cookie c1a0c1a0", + "expExitCode": "255", + "verifyCmd": "$TC actions get action vlan index 90", + "matchPattern": "action order [0-9]+: vlan.*push id 500 protocol 802.1Q priority 0 pass.*index 90 ref", + "matchCount": "1", + "teardown": [ + "$TC actions flush action vlan" + ] } ] From fe384e2fa36ca084a456fd30558cccc75b4b3fbd Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 20 Mar 2019 15:00:15 +0100 Subject: [PATCH 784/855] net/sched: don't dereference a->goto_chain to read the chain index callers of tcf_gact_goto_chain_index() can potentially read an old value of the chain index, or even dereference a NULL 'goto_chain' pointer, because 'goto_chain' and 'tcfa_action' are read in the traffic path without caring of concurrent write in the control path. The most recent value of chain index can be read also from a->tcfa_action (it's encoded there together with TC_ACT_GOTO_CHAIN bits), so we don't really need to dereference 'goto_chain': just read the chain id from the control action. Fixes: e457d86ada27 ("net: sched: add couple of goto_chain helpers") Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- include/net/tc_act/tc_gact.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/tc_act/tc_gact.h b/include/net/tc_act/tc_gact.h index ee8d005f56fc..eb8f01c819e6 100644 --- a/include/net/tc_act/tc_gact.h +++ b/include/net/tc_act/tc_gact.h @@ -56,7 +56,7 @@ static inline bool is_tcf_gact_goto_chain(const struct tc_action *a) static inline u32 tcf_gact_goto_chain_index(const struct tc_action *a) { - return a->goto_chain->index; + return READ_ONCE(a->tcfa_action) & TC_ACT_EXT_VAL_MASK; } #endif /* __NET_TC_GACT_H */ From ee3bbfe806cdb46b02cda63626cb50a7a7b19fc5 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 20 Mar 2019 15:00:16 +0100 Subject: [PATCH 785/855] net/sched: let actions use RCU to access 'goto_chain' use RCU when accessing the action chain, to avoid use after free in the traffic path when 'goto chain' is replaced on existing TC actions (see script below). Since the control action is read in the traffic path without holding the action spinlock, we need to explicitly ensure that a->goto_chain is not NULL before dereferencing (i.e it's not sufficient to rely on the value of TC_ACT_GOTO_CHAIN bits). Not doing so caused NULL dereferences in tcf_action_goto_chain_exec() when the following script: # tc chain add dev dd0 chain 42 ingress protocol ip flower \ > ip_proto udp action pass index 4 # tc filter add dev dd0 ingress protocol ip flower \ > ip_proto udp action csum udp goto chain 42 index 66 # tc chain del dev dd0 chain 42 ingress (start UDP traffic towards dd0) # tc action replace action csum udp pass index 66 was run repeatedly for several hours. Suggested-by: Cong Wang Suggested-by: Vlad Buslov Signed-off-by: Davide Caratti Signed-off-by: David S. 
Miller --- include/net/act_api.h | 2 +- include/net/sch_generic.h | 1 + net/sched/act_api.c | 18 ++++++++++-------- net/sched/cls_api.c | 2 +- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/include/net/act_api.h b/include/net/act_api.h index 54fbb49bd08a..c61a1bf4e3de 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -39,7 +39,7 @@ struct tc_action { struct gnet_stats_basic_cpu __percpu *cpu_bstats_hw; struct gnet_stats_queue __percpu *cpu_qstats; struct tc_cookie __rcu *act_cookie; - struct tcf_chain *goto_chain; + struct tcf_chain __rcu *goto_chain; }; #define tcf_index common.tcfa_index #define tcf_refcnt common.tcfa_refcnt diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 31284c078d06..7d1a0483a17b 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -378,6 +378,7 @@ struct tcf_chain { bool flushing; const struct tcf_proto_ops *tmplt_ops; void *tmplt_priv; + struct rcu_head rcu; }; struct tcf_block { diff --git a/net/sched/act_api.c b/net/sched/act_api.c index fe67b98ac641..5a87e271d35a 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -31,7 +31,7 @@ static void tcf_action_goto_chain_exec(const struct tc_action *a, struct tcf_result *res) { - const struct tcf_chain *chain = a->goto_chain; + const struct tcf_chain *chain = rcu_dereference_bh(a->goto_chain); res->goto_tp = rcu_dereference_bh(chain->filter_chain); } @@ -91,13 +91,11 @@ end: EXPORT_SYMBOL(tcf_action_check_ctrlact); struct tcf_chain *tcf_action_set_ctrlact(struct tc_action *a, int action, - struct tcf_chain *newchain) + struct tcf_chain *goto_chain) { - struct tcf_chain *oldchain = a->goto_chain; - a->tcfa_action = action; - a->goto_chain = newchain; - return oldchain; + rcu_swap_protected(a->goto_chain, goto_chain, 1); + return goto_chain; } EXPORT_SYMBOL(tcf_action_set_ctrlact); @@ -108,7 +106,7 @@ EXPORT_SYMBOL(tcf_action_set_ctrlact); */ static void free_tcf(struct tc_action *p) { - struct tcf_chain *chain = p->goto_chain; + struct tcf_chain *chain = rcu_dereference_protected(p->goto_chain, 1); free_percpu(p->cpu_bstats); free_percpu(p->cpu_bstats_hw); @@ -686,6 +684,10 @@ repeat: return TC_ACT_OK; } } else if (TC_ACT_EXT_CMP(ret, TC_ACT_GOTO_CHAIN)) { + if (unlikely(!rcu_access_pointer(a->goto_chain))) { + net_warn_ratelimited("can't go to NULL chain!\n"); + return TC_ACT_SHOT; + } tcf_action_goto_chain_exec(a, res); } @@ -931,7 +933,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, module_put(a_o->owner); if (TC_ACT_EXT_CMP(a->tcfa_action, TC_ACT_GOTO_CHAIN) && - !a->goto_chain) { + !rcu_access_pointer(a->goto_chain)) { tcf_action_destroy_1(a, bind); NL_SET_ERR_MSG(extack, "can't use goto chain with NULL chain"); return ERR_PTR(-EINVAL); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index dc10525e90e7..99ae30c177c7 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -367,7 +367,7 @@ static void tcf_chain_destroy(struct tcf_chain *chain, bool free_block) struct tcf_block *block = chain->block; mutex_destroy(&chain->filter_chain_lock); - kfree(chain); + kfree_rcu(chain, rcu); if (free_block) tcf_block_destroy(block); } From 6b70fc94afd165342876e53fc4b2f7d085009945 Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Wed, 20 Mar 2019 14:25:05 -0400 Subject: [PATCH 786/855] net-sysfs: Fix memory leak in netdev_register_kobject When registering struct net_device, it will call register_netdevice -> netdev_register_kobject -> device_initialize(dev); dev_set_name(dev, "%s", ndev->name) 
device_add(dev) register_queue_kobjects(ndev) In netdev_register_kobject(), if device_add(dev) or register_queue_kobjects(ndev) failed. Register_netdevice() will return error, causing netdev_freemem(ndev) to be called to free net_device, however put_device(&dev->dev)->..-> kobject_cleanup() won't be called, resulting in a memory leak. syzkaller report this: BUG: memory leak unreferenced object 0xffff8881f4fad168 (size 8): comm "syz-executor.0", pid 3575, jiffies 4294778002 (age 20.134s) hex dump (first 8 bytes): 77 70 61 6e 30 00 ff ff wpan0... backtrace: [<000000006d2d91d7>] kstrdup_const+0x3d/0x50 mm/util.c:73 [<00000000ba9ff953>] kvasprintf_const+0x112/0x170 lib/kasprintf.c:48 [<000000005555ec09>] kobject_set_name_vargs+0x55/0x130 lib/kobject.c:281 [<0000000098d28ec3>] dev_set_name+0xbb/0xf0 drivers/base/core.c:1915 [<00000000b7553017>] netdev_register_kobject+0xc0/0x410 net/core/net-sysfs.c:1727 [<00000000c826a797>] register_netdevice+0xa51/0xeb0 net/core/dev.c:8711 [<00000000857bfcfd>] cfg802154_update_iface_num.isra.2+0x13/0x90 [ieee802154] [<000000003126e453>] ieee802154_llsec_fill_key_id+0x1d5/0x570 [ieee802154] [<00000000e4b3df51>] 0xffffffffc1500e0e [<00000000b4319776>] platform_drv_probe+0xc6/0x180 drivers/base/platform.c:614 [<0000000037669347>] really_probe+0x491/0x7c0 drivers/base/dd.c:509 [<000000008fed8862>] driver_probe_device+0xdc/0x240 drivers/base/dd.c:671 [<00000000baf52041>] device_driver_attach+0xf2/0x130 drivers/base/dd.c:945 [<00000000c7cc8dec>] __driver_attach+0x10e/0x210 drivers/base/dd.c:1022 [<0000000057a757c2>] bus_for_each_dev+0x154/0x1e0 drivers/base/bus.c:304 [<000000005f5ae04b>] bus_add_driver+0x427/0x5e0 drivers/base/bus.c:645 Reported-by: Hulk Robot Fixes: 1fa5ae857bb1 ("driver core: get rid of struct device's bus_id string array") Signed-off-by: Wang Hai Reviewed-by: Andy Shevchenko Reviewed-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 8f8b7b6c2945..f8f94303a1f5 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1747,16 +1747,20 @@ int netdev_register_kobject(struct net_device *ndev) error = device_add(dev); if (error) - return error; + goto error_put_device; error = register_queue_kobjects(ndev); - if (error) { - device_del(dev); - return error; - } + if (error) + goto error_device_del; pm_runtime_set_memalloc_noio(dev, true); + return 0; + +error_device_del: + device_del(dev); +error_put_device: + put_device(dev); return error; } From 408f13ef358aa5ad56dc6230c2c7deb92cf462b1 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 21 Mar 2019 09:39:52 +0800 Subject: [PATCH 787/855] rhashtable: Still do rehash when we get EEXIST As it stands if a shrink is delayed because of an outstanding rehash, we will go into a rescheduling loop without ever doing the rehash. This patch fixes this by still carrying out the rehash and then rescheduling so that we can shrink after the completion of the rehash should it still be necessary. The return value of EEXIST captures this case and other cases (e.g., another thread expanded/rehashed the table at the same time) where we should still proceed with the rehash. Fixes: da20420f83ea ("rhashtable: Add nested tables") Reported-by: Josh Elsasser Signed-off-by: Herbert Xu Tested-by: Josh Elsasser Signed-off-by: David S. 
Miller --- lib/rhashtable.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 0a105d4af166..97f59abc3e92 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -416,8 +416,12 @@ static void rht_deferred_worker(struct work_struct *work) else if (tbl->nest) err = rhashtable_rehash_alloc(ht, tbl, tbl->size); - if (!err) - err = rhashtable_rehash_table(ht); + if (!err || err == -EEXIST) { + int nerr; + + nerr = rhashtable_rehash_table(ht); + err = err ?: nerr; + } mutex_unlock(&ht->mutex); From 5f543a54eec08228ab0cc0a49cf5d79dd32c9e5e Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Thu, 21 Mar 2019 11:28:43 +0800 Subject: [PATCH 788/855] net: hns3: fix for not calculating tx bd num correctly When there is only one byte in a frag, the current calculation using "(size + HNS3_MAX_BD_SIZE - 1) >> HNS3_MAX_BD_SIZE_OFFSET" will return zero, because HNS3_MAX_BD_SIZE is 65535 and HNS3_MAX_BD_SIZE_OFFSET is 16. So it will cause tx error when a frag's size is one byte. This patch fixes it by using DIV_ROUND_UP. Fixes: 3fe13ed95dd3 ("net: hns3: avoid mult + div op in critical data path") Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 13 ++++++------- drivers/net/ethernet/hisilicon/hns3/hns3_enet.h | 1 - 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 1c1f17ec6be2..162cb9afa0e7 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -22,6 +22,7 @@ #include "hns3_enet.h" #define hns3_set_field(origin, shift, val) ((origin) |= ((val) << (shift))) +#define hns3_tx_bd_count(S) DIV_ROUND_UP(S, HNS3_MAX_BD_SIZE) static void hns3_clear_all_ring(struct hnae3_handle *h); static void hns3_force_clear_all_rx_ring(struct hnae3_handle *h); @@ -1079,7 +1080,7 @@ static int hns3_fill_desc(struct hns3_enet_ring *ring, void *priv, desc_cb->length = size; - frag_buf_num = (size + HNS3_MAX_BD_SIZE - 1) >> HNS3_MAX_BD_SIZE_OFFSET; + frag_buf_num = hns3_tx_bd_count(size); sizeoflast = size & HNS3_TX_LAST_SIZE_M; sizeoflast = sizeoflast ? 
sizeoflast : HNS3_MAX_BD_SIZE; @@ -1124,14 +1125,13 @@ static int hns3_nic_maybe_stop_tso(struct sk_buff **out_skb, int *bnum, int i; size = skb_headlen(skb); - buf_num = (size + HNS3_MAX_BD_SIZE - 1) >> HNS3_MAX_BD_SIZE_OFFSET; + buf_num = hns3_tx_bd_count(size); frag_num = skb_shinfo(skb)->nr_frags; for (i = 0; i < frag_num; i++) { frag = &skb_shinfo(skb)->frags[i]; size = skb_frag_size(frag); - bdnum_for_frag = (size + HNS3_MAX_BD_SIZE - 1) >> - HNS3_MAX_BD_SIZE_OFFSET; + bdnum_for_frag = hns3_tx_bd_count(size); if (unlikely(bdnum_for_frag > HNS3_MAX_BD_PER_FRAG)) return -ENOMEM; @@ -1139,8 +1139,7 @@ static int hns3_nic_maybe_stop_tso(struct sk_buff **out_skb, int *bnum, } if (unlikely(buf_num > HNS3_MAX_BD_PER_FRAG)) { - buf_num = (skb->len + HNS3_MAX_BD_SIZE - 1) >> - HNS3_MAX_BD_SIZE_OFFSET; + buf_num = hns3_tx_bd_count(skb->len); if (ring_space(ring) < buf_num) return -EBUSY; /* manual split the send packet */ @@ -1169,7 +1168,7 @@ static int hns3_nic_maybe_stop_tx(struct sk_buff **out_skb, int *bnum, buf_num = skb_shinfo(skb)->nr_frags + 1; if (unlikely(buf_num > HNS3_MAX_BD_PER_FRAG)) { - buf_num = (skb->len + HNS3_MAX_BD_SIZE - 1) / HNS3_MAX_BD_SIZE; + buf_num = hns3_tx_bd_count(skb->len); if (ring_space(ring) < buf_num) return -EBUSY; /* manual split the send packet */ diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h index 1db0bd41d209..75669cd0c311 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h @@ -193,7 +193,6 @@ enum hns3_nic_state { #define HNS3_VECTOR_INITED 1 #define HNS3_MAX_BD_SIZE 65535 -#define HNS3_MAX_BD_SIZE_OFFSET 16 #define HNS3_MAX_BD_PER_FRAG 8 #define HNS3_MAX_BD_PER_PKT MAX_SKB_FRAGS From 2219c9ee922d9a0548b44fb82342166d9d9fdbc9 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 21 Feb 2019 03:38:51 +0000 Subject: [PATCH 789/855] drm/nouveau/dmem: remove set but not used variable 'drm' Fixes gcc '-Wunused-but-set-variable' warning: drivers/gpu/drm/nouveau/nouveau_dmem.c: In function 'nouveau_dmem_free': drivers/gpu/drm/nouveau/nouveau_dmem.c:103:22: warning: variable 'drm' set but not used [-Wunused-but-set-variable] struct nouveau_drm *drm; ^ Signed-off-by: YueHaibing Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nouveau_dmem.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 8be7a83ced9b..58b4cb389d87 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -100,12 +100,10 @@ static void nouveau_dmem_free(struct hmm_devmem *devmem, struct page *page) { struct nouveau_dmem_chunk *chunk; - struct nouveau_drm *drm; unsigned long idx; chunk = (void *)hmm_devmem_page_get_drvdata(page); idx = page_to_pfn(page) - chunk->pfn_first; - drm = chunk->drm; /* * FIXME: From 18ec3c129bcad0a481ee7faf8ce5ad92d6537722 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 22 Feb 2019 09:34:04 +0300 Subject: [PATCH 790/855] drm/nouveau/dmem: Fix a NULL vs IS_ERR() check The hmm_devmem_add() function doesn't return NULL, it returns error pointers. 
Fixes: 5be73b690875 ("drm/nouveau/dmem: device memory helpers for SVM") Signed-off-by: Dan Carpenter Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nouveau_dmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 58b4cb389d87..b1df42f73ec2 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -621,7 +621,7 @@ nouveau_dmem_init(struct nouveau_drm *drm) */ drm->dmem->devmem = hmm_devmem_add(&nouveau_dmem_devmem_ops, device, size); - if (drm->dmem->devmem == NULL) { + if (IS_ERR(drm->dmem->devmem)) { kfree(drm->dmem); drm->dmem = NULL; return; From 909e9c9c428376e2a43d178ed4b0a2d5ba9cb7d3 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 28 Feb 2019 20:24:59 +0800 Subject: [PATCH 791/855] drm/nouveau/debugfs: Fix check of pm_runtime_get_sync failure pm_runtime_get_sync returns negative on failure. Fixes: eaeb9010bb4b ("drm/nouveau/debugfs: Wake up GPU before doing any reclocking") Signed-off-by: YueHaibing Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nouveau_debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_debugfs.c b/drivers/gpu/drm/nouveau/nouveau_debugfs.c index 88a52f6b39fe..7dfbbbc1beea 100644 --- a/drivers/gpu/drm/nouveau/nouveau_debugfs.c +++ b/drivers/gpu/drm/nouveau/nouveau_debugfs.c @@ -181,7 +181,7 @@ nouveau_debugfs_pstate_set(struct file *file, const char __user *ubuf, } ret = pm_runtime_get_sync(drm->dev); - if (IS_ERR_VALUE(ret) && ret != -EACCES) + if (ret < 0 && ret != -EACCES) return ret; ret = nvif_mthd(ctrl, NVIF_CONTROL_PSTATE_USER, &args, sizeof(args)); pm_runtime_put_autosuspend(drm->dev); From 83857418073fda5e0d562e6639b45ae06c957131 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Thu, 21 Mar 2019 15:42:18 -0400 Subject: [PATCH 792/855] drm/nouveau/dmem: empty chunk do not have a buffer object associated with them. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Empty chunks do not have a bo associated with them, so there is no need to pin/unpin them on suspend/resume. This fixes suspend/resume on 5.1-rc1 when NOUVEAU_SVM is enabled.
Signed-off-by: Jérôme Glisse Reviewed-by: Tobias Klausmann Tested-by: Tobias Klausmann Cc: Ben Skeggs Cc: dri-devel@lists.freedesktop.org Cc: nouveau@lists.freedesktop.org Cc: David Airlie Cc: Daniel Vetter Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nouveau_dmem.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index b1df42f73ec2..61fc27d30ff2 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -454,11 +454,6 @@ nouveau_dmem_resume(struct nouveau_drm *drm) /* FIXME handle pin failure */ WARN_ON(ret); } - list_for_each_entry (chunk, &drm->dmem->chunk_empty, list) { - ret = nouveau_bo_pin(chunk->bo, TTM_PL_FLAG_VRAM, false); - /* FIXME handle pin failure */ - WARN_ON(ret); - } mutex_unlock(&drm->dmem->mutex); } @@ -477,9 +472,6 @@ nouveau_dmem_suspend(struct nouveau_drm *drm) list_for_each_entry (chunk, &drm->dmem->chunk_full, list) { nouveau_bo_unpin(chunk->bo); } - list_for_each_entry (chunk, &drm->dmem->chunk_empty, list) { - nouveau_bo_unpin(chunk->bo); - } mutex_unlock(&drm->dmem->mutex); } From 83d163124cf1104cca5b668d5fe6325715a60855 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 21 Mar 2019 14:34:36 -0700 Subject: [PATCH 793/855] bpf: verifier: propagate liveness on all frames Commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences") connected up parentage chains of all frames of the stack. It didn't, however, ensure propagate_liveness() propagates all liveness information along those chains. This means pruning happening in the callee may generate explored states with incomplete liveness for the chains in lower frames of the stack. The included selftest is similar to the prior one from commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences"), where callee would prune regardless of the difference in r8 state. Now we also initialize r9 to 0 or 1 based on a result from get_random(). r9 is never read so the walk with r9 = 0 gets pruned (correctly) after the walk with r9 = 1 completes. The selftest is so arranged that the pruning will happen in the callee. Since callee does not propagate read marks of r8, the explored state at the pruning point prior to the callee will now ignore r8. Propagate liveness on all frames of the stack when pruning. Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)") Signed-off-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 20 +++++++++------- tools/testing/selftests/bpf/verifier/calls.c | 25 ++++++++++++++++++++ 2 files changed, 36 insertions(+), 9 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f19d5e04c69d..fd502c1f71eb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6078,15 +6078,17 @@ static int propagate_liveness(struct bpf_verifier_env *env, } /* Propagate read liveness of registers... 
*/ BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); - /* We don't need to worry about FP liveness because it's read-only */ - for (i = 0; i < BPF_REG_FP; i++) { - if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ) - continue; - if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) { - err = mark_reg_read(env, &vstate->frame[vstate->curframe]->regs[i], - &vparent->frame[vstate->curframe]->regs[i]); - if (err) - return err; + for (frame = 0; frame <= vstate->curframe; frame++) { + /* We don't need to worry about FP liveness, it's read-only */ + for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) { + if (vparent->frame[frame]->regs[i].live & REG_LIVE_READ) + continue; + if (vstate->frame[frame]->regs[i].live & REG_LIVE_READ) { + err = mark_reg_read(env, &vstate->frame[frame]->regs[i], + &vparent->frame[frame]->regs[i]); + if (err) + return err; + } } } diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index 4004891afa9c..f2ccae39ee66 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -1940,3 +1940,28 @@ .errstr = "!read_ok", .result = REJECT, }, +{ + "calls: cross frame pruning - liveness propagation", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32), + BPF_MOV64_IMM(BPF_REG_8, 0), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_MOV64_IMM(BPF_REG_8, 1), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32), + BPF_MOV64_IMM(BPF_REG_9, 0), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_MOV64_IMM(BPF_REG_9, 1), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_8, 1, 1), + BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, + .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr = "!read_ok", + .result = REJECT, +}, From 80ef4464d5e27408685e609d389663aad46644b9 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 20 Mar 2019 18:57:23 +0000 Subject: [PATCH 794/855] iommu/iova: Fix tracking of recently failed iova address If a 32 bit allocation request is too big to possibly succeed, it early exits with a failure and then should never update max32_alloc_ size. This patch fixes current code, now the size is only updated if the slow path failed while walking the tree. Without the fix the allocation may enter the slow path again even if there was a failure before of a request with the same or a smaller size. 
Cc: # 4.20+ Fixes: bee60e94a1e2 ("iommu/iova: Optimise attempts to allocate iova from 32bit address range") Reviewed-by: Robin Murphy Signed-off-by: Robert Richter Signed-off-by: Joerg Roedel --- drivers/iommu/iova.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index f8d3ba247523..2de8122e218f 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -207,8 +207,10 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad, curr_iova = rb_entry(curr, struct iova, node); } while (curr && new_pfn <= curr_iova->pfn_hi); - if (limit_pfn < size || new_pfn < iovad->start_pfn) + if (limit_pfn < size || new_pfn < iovad->start_pfn) { + iovad->max32_alloc_size = size; goto iova32_full; + } /* pfn_lo will point to size aligned address if size_aligned is set */ new->pfn_lo = new_pfn; @@ -222,7 +224,6 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad, return 0; iova32_full: - iovad->max32_alloc_size = size; spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags); return -ENOMEM; } From 5bb71fc790a88d063507dc5d445ab8b14e845591 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 20 Mar 2019 09:58:33 +0800 Subject: [PATCH 795/855] iommu/vt-d: Check capability before disabling protected memory The spec states in 10.4.16 that the Protected Memory Enable Register should be treated as read-only for implementations not supporting protected memory regions (PLMR and PHMR fields reported as Clear in the Capability register). Cc: Jacob Pan Cc: mark gross Suggested-by: Ashok Raj Fixes: f8bab73515ca5 ("intel-iommu: PMEN support") Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 87274b54febd..f002d47d2f27 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -1538,6 +1538,9 @@ static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) u32 pmen; unsigned long flags; + if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap)) + return; + raw_spin_lock_irqsave(&iommu->register_lock, flags); pmen = readl(iommu->reg + DMAR_PMEN_REG); pmen &= ~DMA_PMEN_EPM; From 84c11e4df5aa4955acaa441f0cf1cb2e50daf64b Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 20 Mar 2019 09:58:34 +0800 Subject: [PATCH 796/855] iommu/vt-d: Save the right domain ID used by hardware The driver sets a default domain id (FLPT_DEFAULT_DID) in the first level only pasid entry, but saves a different domain id in @sdev->did. The value saved in @sdev->did will be used to invalidate the translation caches. Hence, the driver might result in invalidating the caches with a wrong domain id. 
Cc: Ashok Raj Cc: Jacob Pan Fixes: 1c4f88b7f1f92 ("iommu/vt-d: Shared virtual address in scalable mode") Signed-off-by: Liu Yi L Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index f002d47d2f27..28cb713d728c 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -5335,7 +5335,7 @@ int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sd ctx_lo = context[0].lo; - sdev->did = domain->iommu_did[iommu->seq_id]; + sdev->did = FLPT_DEFAULT_DID; sdev->sid = PCI_DEVID(info->bus, info->devfn); if (!(ctx_lo & CONTEXT_PASIDE)) { From 5a07168d8d89b00fe1760120714378175b3ef992 Mon Sep 17 00:00:00 2001 From: Chen Jie Date: Fri, 15 Mar 2019 03:44:38 +0000 Subject: [PATCH 797/855] futex: Ensure that futex address is aligned in handle_futex_death() The futex code requires that the user space addresses of futexes are 32bit aligned. sys_futex() checks this in futex_get_keys() but the robust list code has no alignment check in place. As a consequence the kernel crashes on architectures with strict alignment requirements in handle_futex_death() when trying to cmpxchg() on an unaligned futex address which was retrieved from the robust list. [ tglx: Rewrote changelog, proper sizeof() based alignement check and add comment ] Fixes: 0771dfefc9e5 ("[PATCH] lightweight robust futexes: core") Signed-off-by: Chen Jie Signed-off-by: Thomas Gleixner Cc: Cc: Cc: Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1552621478-119787-1-git-send-email-chenjie6@huawei.com --- kernel/futex.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/futex.c b/kernel/futex.c index c3b73b0311bc..9e40cf7be606 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -3436,6 +3436,10 @@ static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int p { u32 uval, uninitialized_var(nval), mval; + /* Futex address must be 32bit aligned */ + if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0) + return -1; + retry: if (get_user(uval, uaddr)) return -1; From 4fe64a62e04cfb2dc1daab0d8f05d212aa014161 Mon Sep 17 00:00:00 2001 From: Valdis Kletnieks Date: Tue, 12 Mar 2019 03:47:53 -0400 Subject: [PATCH 798/855] x86/mm/pti: Make local symbols static With 'make C=2 W=1', sparse and gcc both complain: CHECK arch/x86/mm/pti.c arch/x86/mm/pti.c:84:3: warning: symbol 'pti_mode' was not declared. Should it be static? arch/x86/mm/pti.c:605:6: warning: symbol 'pti_set_kernel_image_nonglobal' was not declared. Should it be static? CC arch/x86/mm/pti.o arch/x86/mm/pti.c:605:6: warning: no previous prototype for 'pti_set_kernel_image_nonglobal' [-Wmissing-prototypes] 605 | void pti_set_kernel_image_nonglobal(void) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ pti_set_kernel_image_nonglobal() is only used locally. 'pti_mode' exists in drivers/hwtracing/intel_th/pti.c as well, but it's a completely unrelated local (static) symbol. Make both static. 
Signed-off-by: Valdis Kletnieks Signed-off-by: Thomas Gleixner Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Link: https://lkml.kernel.org/r/27680.1552376873@turing-police --- arch/x86/mm/pti.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 4fee5c3003ed..139b28a01ce4 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -77,7 +77,7 @@ static void __init pti_print_if_secure(const char *reason) pr_info("%s\n", reason); } -enum pti_mode { +static enum pti_mode { PTI_AUTO = 0, PTI_FORCE_OFF, PTI_FORCE_ON @@ -602,7 +602,7 @@ static void pti_clone_kernel_text(void) set_memory_global(start, (end_global - start) >> PAGE_SHIFT); } -void pti_set_kernel_image_nonglobal(void) +static void pti_set_kernel_image_nonglobal(void) { /* * The identity map is created with PMDs, regardless of the From bb2e320565f997273fe04035bb6c17f643da6f8a Mon Sep 17 00:00:00 2001 From: Valdis Kletnieks Date: Tue, 12 Mar 2019 04:17:56 -0400 Subject: [PATCH 799/855] genirq/devres: Remove excess parameter from kernel doc Building with 'make W=1' complains: CC kernel/irq/devres.o kernel/irq/devres.c:104: warning: Excess function parameter 'thread_fn' description in 'devm_request_any_context_irq' Remove it. Signed-off-by: Valdis Kletnieks Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/31207.1552378676@turing-police --- kernel/irq/devres.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 5d5378ea0afe..f808c6a97dcc 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -84,8 +84,6 @@ EXPORT_SYMBOL(devm_request_threaded_irq); * @dev: device to request interrupt for * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs - * @thread_fn: function to be called in a threaded interrupt context. NULL - * for devices which handle everything in @handler * @irqflags: Interrupt type flags * @devname: An ascii name for the claiming device, dev_name(dev) if NULL * @dev_id: A cookie passed back to the handler function From e8750053d64a3317cbc15f8341f0f11ca751bfeb Mon Sep 17 00:00:00 2001 From: Valdis Kletnieks Date: Tue, 12 Mar 2019 04:38:35 -0400 Subject: [PATCH 800/855] time/jiffies: Make refined_jiffies static sparse complains: CHECK kernel/time/jiffies.c kernel/time/jiffies.c:92:20: warning: symbol 'refined_jiffies' was not declared. Should it be static? Its only used in file scope. Make it static. Signed-off-by: Valdis Kletnieks Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/32342.1552379915@turing-police --- kernel/time/jiffies.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index dc1b6f1929f9..ac9c03dd6c7d 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -89,7 +89,7 @@ struct clocksource * __init __weak clocksource_default_clock(void) return &clocksource_jiffies; } -struct clocksource refined_jiffies; +static struct clocksource refined_jiffies; int register_refined_jiffies(long cycles_per_second) { From 48084abf212052ca1d39fae064c581b1ce5b1fdf Mon Sep 17 00:00:00 2001 From: Valdis Kletnieks Date: Tue, 12 Mar 2019 05:33:48 -0400 Subject: [PATCH 801/855] watchdog/core: Make variables static sparse complains: CHECK kernel/watchdog.c kernel/watchdog.c:45:19: warning: symbol 'nmi_watchdog_available' was not declared. Should it be static? kernel/watchdog.c:47:16: warning: symbol 'watchdog_allowed_mask' was not declared. Should it be static? 
They're not referenced by name from anyplace else, make them static. Signed-off-by: Valdis Kletnieks Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/7855.1552383228@turing-police --- kernel/watchdog.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 8fbfda94a67b..403c9bd90413 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -42,9 +42,9 @@ int __read_mostly watchdog_user_enabled = 1; int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT; int __read_mostly soft_watchdog_user_enabled = 1; int __read_mostly watchdog_thresh = 10; -int __read_mostly nmi_watchdog_available; +static int __read_mostly nmi_watchdog_available; -struct cpumask watchdog_allowed_mask __read_mostly; +static struct cpumask watchdog_allowed_mask __read_mostly; struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); From c8248c6c1a3d5db944753dd8e1c143d92c2c74fc Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 21 Mar 2019 21:23:14 +0100 Subject: [PATCH 802/855] r8169: don't read interrupt mask register in interrupt handler After the original patch network starts to crash on heavy load. It's not fully clear why this additional register read has such side effects, but removing it fixes the issue. Thanks also to Alex for his contribution and hints. [0] https://marc.info/?t=155268170400002&r=1&w=2 Fixes: e782410ed237 ("r8169: improve spurious interrupt detection") Reported-by: VDR User Tested-by: VDR User Signed-off-by: Heiner Kallweit Reviewed-by: Alexander Duyck Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index c29dde064078..9dd1cd2c0c68 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -678,6 +678,7 @@ struct rtl8169_private { struct work_struct work; } wk; + unsigned irq_enabled:1; unsigned supports_gmii:1; dma_addr_t counters_phys_addr; struct rtl8169_counters *counters; @@ -1293,6 +1294,7 @@ static void rtl_ack_events(struct rtl8169_private *tp, u16 bits) static void rtl_irq_disable(struct rtl8169_private *tp) { RTL_W16(tp, IntrMask, 0); + tp->irq_enabled = 0; } #define RTL_EVENT_NAPI_RX (RxOK | RxErr) @@ -1301,6 +1303,7 @@ static void rtl_irq_disable(struct rtl8169_private *tp) static void rtl_irq_enable(struct rtl8169_private *tp) { + tp->irq_enabled = 1; RTL_W16(tp, IntrMask, tp->irq_mask); } @@ -6520,9 +6523,8 @@ static irqreturn_t rtl8169_interrupt(int irq, void *dev_instance) { struct rtl8169_private *tp = dev_instance; u16 status = RTL_R16(tp, IntrStatus); - u16 irq_mask = RTL_R16(tp, IntrMask); - if (status == 0xffff || !(status & irq_mask)) + if (!tp->irq_enabled || status == 0xffff || !(status & tp->irq_mask)) return IRQ_NONE; if (unlikely(status & SYSErr)) { From e898e69d6b9475bf123f99b3c5d1a67bb7cb2361 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 7 Mar 2019 14:27:56 -0700 Subject: [PATCH 803/855] x86/hw_breakpoints: Make default case in hw_breakpoint_arch_parse() return an error When building with -Wsometimes-uninitialized, Clang warns: arch/x86/kernel/hw_breakpoint.c:355:2: warning: variable 'align' is used uninitialized whenever switch default is taken [-Wsometimes-uninitialized] The default cannot be reached because arch_build_bp_info() initializes hw->len to one of the specified cases. 
Nevertheless the warning is valid and returning -EINVAL makes sure that this cannot be broken by future modifications. Suggested-by: Nick Desaulniers Signed-off-by: Nathan Chancellor Signed-off-by: Thomas Gleixner Reviewed-by: Nick Desaulniers Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: clang-built-linux@googlegroups.com Link: https://github.com/ClangBuiltLinux/linux/issues/392 Link: https://lkml.kernel.org/r/20190307212756.4648-1-natechancellor@gmail.com --- arch/x86/kernel/hw_breakpoint.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index ff9bfd40429e..d73083021002 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -354,6 +354,7 @@ int hw_breakpoint_arch_parse(struct perf_event *bp, #endif default: WARN_ON_ONCE(1); + return -EINVAL; } /* From 1e4471e74c75acb3f89959ffa02a241227937ae2 Mon Sep 17 00:00:00 2001 From: Shenghui Wang Date: Sat, 16 Mar 2019 16:24:37 +0800 Subject: [PATCH 804/855] sbitmap: trivial - update comment for sbitmap_deferred_clear_bit "sbitmap_batch_clear" should be "sbitmap_deferred_clear" Acked-by: Omar Sandoval Signed-off-by: Shenghui Wang Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index 14d558146aea..20f3e3f029b9 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -330,7 +330,7 @@ static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr) /* * This one is special, since it doesn't actually clear the bit, rather it * sets the corresponding bit in the ->cleared mask instead. Paired with - * the caller doing sbitmap_batch_clear() if a given index is full, which + * the caller doing sbitmap_deferred_clear() if a given index is full, which * will clear the previously freed entries in the corresponding ->word. */ static inline void sbitmap_deferred_clear_bit(struct sbitmap *sb, unsigned int bitnr) From d18a7408d7be0f34a120d99051ed5187d9727728 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 22 Mar 2019 22:37:08 +0800 Subject: [PATCH 805/855] clocksource/drivers/clps711x: Make clps711x_clksrc_init() static Fix sparse warning: drivers/clocksource/clps711x-timer.c:96:13: warning: symbol 'clps711x_clksrc_init' was not declared. Should it be static? 
Signed-off-by: YueHaibing Signed-off-by: Thomas Gleixner Cc: Cc: Cc: Link: https://lkml.kernel.org/r/20190322143708.12716-1-yuehaibing@huawei.com --- drivers/clocksource/clps711x-timer.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/clocksource/clps711x-timer.c b/drivers/clocksource/clps711x-timer.c index a8dd80576c95..cdc251524f5e 100644 --- a/drivers/clocksource/clps711x-timer.c +++ b/drivers/clocksource/clps711x-timer.c @@ -93,8 +93,9 @@ static int __init _clps711x_clkevt_init(struct clk *clock, void __iomem *base, "clps711x-timer", clkevt); } -void __init clps711x_clksrc_init(void __iomem *tc1_base, void __iomem *tc2_base, - unsigned int irq) +static void __init clps711x_clksrc_init(void __iomem *tc1_base, + void __iomem *tc2_base, + unsigned int irq) { struct clk *tc1 = clk_get_sys("clps711x-timer.0", NULL); struct clk *tc2 = clk_get_sys("clps711x-timer.1", NULL); From bddee90af621914f08a03d546419fc293e9140d8 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 22 Mar 2019 22:39:40 +0800 Subject: [PATCH 806/855] clocksource/drivers/tcb_clksrc: Make tc_clksrc_suspend/resume() static Fix sparse warnings: drivers/clocksource/tcb_clksrc.c:74:6: warning: symbol 'tc_clksrc_suspend' was not declared. Should it be static? drivers/clocksource/tcb_clksrc.c:89:6: warning: symbol 'tc_clksrc_resume' was not declared. Should it be static? Signed-off-by: YueHaibing Signed-off-by: Thomas Gleixner Cc: Cc: Cc: Link: https://lkml.kernel.org/r/20190322143940.12396-1-yuehaibing@huawei.com --- drivers/clocksource/tcb_clksrc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/clocksource/tcb_clksrc.c b/drivers/clocksource/tcb_clksrc.c index 43f4d5c4d6fa..f987027ca566 100644 --- a/drivers/clocksource/tcb_clksrc.c +++ b/drivers/clocksource/tcb_clksrc.c @@ -71,7 +71,7 @@ static u64 tc_get_cycles32(struct clocksource *cs) return readl_relaxed(tcaddr + ATMEL_TC_REG(0, CV)); } -void tc_clksrc_suspend(struct clocksource *cs) +static void tc_clksrc_suspend(struct clocksource *cs) { int i; @@ -86,7 +86,7 @@ void tc_clksrc_suspend(struct clocksource *cs) bmr_cache = readl(tcaddr + ATMEL_TC_BMR); } -void tc_clksrc_resume(struct clocksource *cs) +static void tc_clksrc_resume(struct clocksource *cs) { int i; From 008258d995a637c77c10a5d087d134eed49a6572 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 22 Mar 2019 22:43:02 +0800 Subject: [PATCH 807/855] clocksource/drivers/timer-ti-dm: Make omap_dm_timer_set_load_start() static Fix sparse warning: drivers/clocksource/timer-ti-dm.c:589:5: warning: symbol 'omap_dm_timer_set_load_start' was not declared. Should it be static? 
Signed-off-by: YueHaibing Signed-off-by: Thomas Gleixner Cc: Link: https://lkml.kernel.org/r/20190322144302.6704-1-yuehaibing@huawei.com --- drivers/clocksource/timer-ti-dm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/clocksource/timer-ti-dm.c b/drivers/clocksource/timer-ti-dm.c index c364027638e1..3352da6ed61f 100644 --- a/drivers/clocksource/timer-ti-dm.c +++ b/drivers/clocksource/timer-ti-dm.c @@ -586,8 +586,8 @@ static int omap_dm_timer_set_load(struct omap_dm_timer *timer, int autoreload, } /* Optimized set_load which removes costly spin wait in timer_start */ -int omap_dm_timer_set_load_start(struct omap_dm_timer *timer, int autoreload, - unsigned int load) +static int omap_dm_timer_set_load_start(struct omap_dm_timer *timer, + int autoreload, unsigned int load) { u32 l; From 9039de4034775f4420bf01fa879f8c04b3cd6bba Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 22 Mar 2019 22:43:59 +0800 Subject: [PATCH 808/855] clocksource/drivers/mips-gic-timer: Make gic_compare_irqaction static Fix sparse warning: drivers/clocksource/mips-gic-timer.c:70:18: warning: symbol 'gic_compare_irqaction' was not declared. Should it be static? Signed-off-by: YueHaibing Signed-off-by: Thomas Gleixner Cc: Link: https://lkml.kernel.org/r/20190322144359.19516-1-yuehaibing@huawei.com --- drivers/clocksource/mips-gic-timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clocksource/mips-gic-timer.c b/drivers/clocksource/mips-gic-timer.c index 54f8a331b53a..37671a5d4ed9 100644 --- a/drivers/clocksource/mips-gic-timer.c +++ b/drivers/clocksource/mips-gic-timer.c @@ -67,7 +67,7 @@ static irqreturn_t gic_compare_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -struct irqaction gic_compare_irqaction = { +static struct irqaction gic_compare_irqaction = { .handler = gic_compare_interrupt, .percpu_dev_id = &gic_clockevent_device, .flags = IRQF_PERCPU | IRQF_TIMER, From d53e292f0f505783d0219f58f8f8f294f45f4ee6 Mon Sep 17 00:00:00 2001 From: Long Li Date: Fri, 15 Mar 2019 07:54:59 +0000 Subject: [PATCH 809/855] CIFS: Fix an issue with re-sending wdata when transport returning -EAGAIN When sending a wdata, transport may return -EAGAIN. In this case we should re-obtain credits because the session may have been reconnected. Change in v2: adjust_credits before re-sending Signed-off-by: Long Li Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky --- fs/cifs/file.c | 81 +++++++++++++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 34 deletions(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 2a6d20c0ce02..2aa3e62af764 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2632,43 +2632,56 @@ cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list, struct TCP_Server_Info *server = tlink_tcon(wdata->cfile->tlink)->ses->server; - /* - * Wait for credits to resend this wdata. 
- * Note: we are attempting to resend the whole wdata not in segments - */ do { - rc = server->ops->wait_mtu_credits(server, wdata->bytes, &wsize, - &credits); - - if (rc) - goto out; - - if (wsize < wdata->bytes) { - add_credits_and_wake_if(server, &credits, 0); - msleep(1000); - } - } while (wsize < wdata->bytes); - - wdata->credits = credits; - rc = -EAGAIN; - while (rc == -EAGAIN) { - rc = 0; - if (wdata->cfile->invalidHandle) + if (wdata->cfile->invalidHandle) { rc = cifs_reopen_file(wdata->cfile, false); - if (!rc) - rc = server->ops->async_writev(wdata, + if (rc == -EAGAIN) + continue; + else if (rc) + break; + } + + + /* + * Wait for credits to resend this wdata. + * Note: we are attempting to resend the whole wdata not in + * segments + */ + do { + rc = server->ops->wait_mtu_credits(server, wdata->bytes, + &wsize, &credits); + if (rc) + goto fail; + + if (wsize < wdata->bytes) { + add_credits_and_wake_if(server, &credits, 0); + msleep(1000); + } + } while (wsize < wdata->bytes); + wdata->credits = credits; + + rc = adjust_credits(server, &wdata->credits, wdata->bytes); + + if (!rc) { + if (wdata->cfile->invalidHandle) + rc = -EAGAIN; + else + rc = server->ops->async_writev(wdata, cifs_uncached_writedata_release); - } + } - if (!rc) { - list_add_tail(&wdata->list, wdata_list); - return 0; - } + /* If the write was successfully sent, we are done */ + if (!rc) { + list_add_tail(&wdata->list, wdata_list); + return 0; + } - add_credits_and_wake_if(server, &wdata->credits, 0); -out: + /* Roll back credits and retry if needed */ + add_credits_and_wake_if(server, &wdata->credits, 0); + } while (rc == -EAGAIN); + +fail: kref_put(&wdata->refcount, cifs_uncached_writedata_release); - return rc; } @@ -2896,12 +2909,12 @@ restart_loop: wdata->bytes, &tmp_from, ctx->cfile, cifs_sb, &tmp_list, ctx); + + kref_put(&wdata->refcount, + cifs_uncached_writedata_release); } list_splice(&tmp_list, &ctx->list); - - kref_put(&wdata->refcount, - cifs_uncached_writedata_release); goto restart_loop; } } From 0b0dfd59216755cfa5a47eab2811efaa4589db68 Mon Sep 17 00:00:00 2001 From: Long Li Date: Fri, 15 Mar 2019 07:55:00 +0000 Subject: [PATCH 810/855] CIFS: Fix an issue with re-sending rdata when transport returning -EAGAIN When sending a rdata, transport may return -EAGAIN. In this case we should re-obtain credits because the session may have been reconnected. Change in v2: adjust_credits before re-sending Signed-off-by: Long Li Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky --- fs/cifs/file.c | 71 +++++++++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 30 deletions(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 2aa3e62af764..89006e044973 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3361,44 +3361,55 @@ static int cifs_resend_rdata(struct cifs_readdata *rdata, struct TCP_Server_Info *server = tlink_tcon(rdata->cfile->tlink)->ses->server; - /* - * Wait for credits to resend this rdata. - * Note: we are attempting to resend the whole rdata not in segments - */ do { - rc = server->ops->wait_mtu_credits(server, rdata->bytes, + if (rdata->cfile->invalidHandle) { + rc = cifs_reopen_file(rdata->cfile, true); + if (rc == -EAGAIN) + continue; + else if (rc) + break; + } + + /* + * Wait for credits to resend this rdata. 
+ * Note: we are attempting to resend the whole rdata not in + * segments + */ + do { + rc = server->ops->wait_mtu_credits(server, rdata->bytes, &rsize, &credits); - if (rc) - goto out; + if (rc) + goto fail; - if (rsize < rdata->bytes) { - add_credits_and_wake_if(server, &credits, 0); - msleep(1000); + if (rsize < rdata->bytes) { + add_credits_and_wake_if(server, &credits, 0); + msleep(1000); + } + } while (rsize < rdata->bytes); + rdata->credits = credits; + + rc = adjust_credits(server, &rdata->credits, rdata->bytes); + if (!rc) { + if (rdata->cfile->invalidHandle) + rc = -EAGAIN; + else + rc = server->ops->async_readv(rdata); } - } while (rsize < rdata->bytes); - rdata->credits = credits; - rc = -EAGAIN; - while (rc == -EAGAIN) { - rc = 0; - if (rdata->cfile->invalidHandle) - rc = cifs_reopen_file(rdata->cfile, true); - if (!rc) - rc = server->ops->async_readv(rdata); - } + /* If the read was successfully sent, we are done */ + if (!rc) { + /* Add to aio pending list */ + list_add_tail(&rdata->list, rdata_list); + return 0; + } - if (!rc) { - /* Add to aio pending list */ - list_add_tail(&rdata->list, rdata_list); - return 0; - } - - add_credits_and_wake_if(server, &rdata->credits, 0); -out: - kref_put(&rdata->refcount, - cifs_uncached_readdata_release); + /* Roll back credits and retry if needed */ + add_credits_and_wake_if(server, &rdata->credits, 0); + } while (rc == -EAGAIN); +fail: + kref_put(&rdata->refcount, cifs_uncached_readdata_release); return rc; } From b073a08016a10f01dfb0d0b6c7fa89da0d544963 Mon Sep 17 00:00:00 2001 From: Xiaoli Feng Date: Sat, 16 Mar 2019 12:11:54 +0800 Subject: [PATCH 811/855] cifs: fix that return -EINVAL when do dedupe operation dedupe_file_range operations is combiled into remap_file_range. But it's always skipped for dedupe operations in function cifs_remap_file_range. Example to test: Before this patch: # dd if=/dev/zero of=cifs/file bs=1M count=1 # xfs_io -c "dedupe cifs/file 4k 64k 4k" cifs/file XFS_IOC_FILE_EXTENT_SAME: Invalid argument After this patch: # dd if=/dev/zero of=cifs/file bs=1M count=1 # xfs_io -c "dedupe cifs/file 4k 64k 4k" cifs/file XFS_IOC_FILE_EXTENT_SAME: Operation not supported Influence for xfstests: generic/091 generic/112 generic/127 generic/263 These tests report this error "do_copy_range:: Invalid argument" instead of "FIDEDUPERANGE: Invalid argument". Because there are still two bugs cause these test failed. https://bugzilla.kernel.org/show_bug.cgi?id=202935 https://bugzilla.kernel.org/show_bug.cgi?id=202785 Signed-off-by: Xiaoli Feng Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 217276b8b942..f9b71c12cc9f 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1008,7 +1008,7 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off, unsigned int xid; int rc; - if (remap_flags & ~REMAP_FILE_ADVISORY) + if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) return -EINVAL; cifs_dbg(FYI, "clone range\n"); From 85f9987b236cf46e06ffdb5c225cf1f3c0acb789 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 17 Mar 2019 15:58:38 -0500 Subject: [PATCH 812/855] fix incorrect error code mapping for OBJECTID_NOT_FOUND It was mapped to EIO which can be confusing when user space queries for an object GUID for an object for which the server file system doesn't support (or hasn't saved one). 
As Amir Goldstein suggested this is similar to ENOATTR (equivalently ENODATA in Linux errno definitions) so changing NT STATUS code mapping for OBJECTID_NOT_FOUND to ENODATA. Signed-off-by: Steve French CC: Amir Goldstein --- fs/cifs/smb2maperror.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index 924269cec135..e32c264e3adb 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c @@ -1036,7 +1036,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_UNFINISHED_CONTEXT_DELETED, -EIO, "STATUS_UNFINISHED_CONTEXT_DELETED"}, {STATUS_NO_TGT_REPLY, -EIO, "STATUS_NO_TGT_REPLY"}, - {STATUS_OBJECTID_NOT_FOUND, -EIO, "STATUS_OBJECTID_NOT_FOUND"}, + /* Note that ENOATTTR and ENODATA are the same errno */ + {STATUS_OBJECTID_NOT_FOUND, -ENODATA, "STATUS_OBJECTID_NOT_FOUND"}, {STATUS_NO_IP_ADDRESSES, -EIO, "STATUS_NO_IP_ADDRESSES"}, {STATUS_WRONG_CREDENTIAL_HANDLE, -EIO, "STATUS_WRONG_CREDENTIAL_HANDLE"}, From e71ab2aa06f731a944993120b0eef1556c63b81c Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Thu, 21 Mar 2019 14:59:02 +1000 Subject: [PATCH 813/855] cifs: allow guest mounts to work for smb3.11 Fix Guest/Anonymous sessions so that they work with SMB 3.11. The commit noted below tightened the conditions and forced signing for the SMB2-TreeConnect commands as per MS-SMB2. However, this should only apply to normal user sessions and not for Guest/Anonumous sessions. Fixes: 6188f28bf608 ("Tree connect for SMB3.1.1 must be signed for non-encrypted shares") Signed-off-by: Ronnie Sahlberg CC: Stable Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index c399e09b76e6..8e4a1da95418 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1628,9 +1628,13 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, iov[1].iov_base = unc_path; iov[1].iov_len = unc_path_len; - /* 3.11 tcon req must be signed if not encrypted. See MS-SMB2 3.2.4.1.1 */ + /* + * 3.11 tcon req must be signed if not encrypted. See MS-SMB2 3.2.4.1.1 + * unless it is guest or anonymous user. See MS-SMB2 3.2.5.3.1 + */ if ((ses->server->dialect == SMB311_PROT_ID) && - !smb3_encryption_required(tcon)) + !smb3_encryption_required(tcon) && + !(ses->session_flags & (SMB2_SESSION_FLAG_IS_GUEST|SMB2_SESSION_FLAG_IS_NULL))) req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; memset(&rqst, 0, sizeof(struct smb_rqst)); From 68ddb496800acdb46172b4981dc3753ea9b39c25 Mon Sep 17 00:00:00 2001 From: "Paulo Alcantara (SUSE)" Date: Thu, 21 Mar 2019 19:31:22 -0300 Subject: [PATCH 814/855] cifs: Fix slab-out-of-bounds when tracing SMB tcon This patch fixes the following KASAN report: [ 779.044746] BUG: KASAN: slab-out-of-bounds in string+0xab/0x180 [ 779.044750] Read of size 1 at addr ffff88814f327968 by task trace-cmd/2812 [ 779.044756] CPU: 1 PID: 2812 Comm: trace-cmd Not tainted 5.1.0-rc1+ #62 [ 779.044760] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.0-0-ga698c89-prebuilt.qemu.org 04/01/2014 [ 779.044761] Call Trace: [ 779.044769] dump_stack+0x5b/0x90 [ 779.044775] ? string+0xab/0x180 [ 779.044781] print_address_description+0x6c/0x23c [ 779.044787] ? string+0xab/0x180 [ 779.044792] ? string+0xab/0x180 [ 779.044797] kasan_report.cold.3+0x1a/0x32 [ 779.044803] ? string+0xab/0x180 [ 779.044809] string+0xab/0x180 [ 779.044816] ? widen_string+0x160/0x160 [ 779.044822] ? 
vsnprintf+0x5bf/0x7f0 [ 779.044829] vsnprintf+0x4e7/0x7f0 [ 779.044836] ? pointer+0x4a0/0x4a0 [ 779.044841] ? seq_buf_vprintf+0x79/0xc0 [ 779.044848] seq_buf_vprintf+0x62/0xc0 [ 779.044855] trace_seq_printf+0x113/0x210 [ 779.044861] ? trace_seq_puts+0x110/0x110 [ 779.044867] ? trace_raw_output_prep+0xd8/0x110 [ 779.044876] trace_raw_output_smb3_tcon_class+0x9f/0xc0 [ 779.044882] print_trace_line+0x377/0x890 [ 779.044888] ? tracing_buffers_read+0x300/0x300 [ 779.044893] ? ring_buffer_read+0x58/0x70 [ 779.044899] s_show+0x6e/0x140 [ 779.044906] seq_read+0x505/0x6a0 [ 779.044913] vfs_read+0xaf/0x1b0 [ 779.044919] ksys_read+0xa1/0x130 [ 779.044925] ? kernel_write+0xa0/0xa0 [ 779.044931] ? __do_page_fault+0x3d5/0x620 [ 779.044938] do_syscall_64+0x63/0x150 [ 779.044944] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 779.044949] RIP: 0033:0x7f62c2c2db31 [ 779.044955] Code: fe ff ff 48 8d 3d 17 9e 09 00 48 83 ec 08 e8 96 02 02 00 66 0f 1f 44 00 00 8b 05 fa fc 2c 00 48 63 ff 85 c0 75 13 31 c0 0f 05 <48> 3d 00 f0 ff ff 77 57 f3 c3 0f 1f 44 00 00 55 53 48 89 d5 48 89 [ 779.044958] RSP: 002b:00007ffd6e116678 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [ 779.044964] RAX: ffffffffffffffda RBX: 0000560a38be9260 RCX: 00007f62c2c2db31 [ 779.044966] RDX: 0000000000002000 RSI: 00007ffd6e116710 RDI: 0000000000000003 [ 779.044966] RDX: 0000000000002000 RSI: 00007ffd6e116710 RDI: 0000000000000003 [ 779.044969] RBP: 00007f62c2ef5420 R08: 0000000000000000 R09: 0000000000000003 [ 779.044972] R10: ffffffffffffffa8 R11: 0000000000000246 R12: 00007ffd6e116710 [ 779.044975] R13: 0000000000002000 R14: 0000000000000d68 R15: 0000000000002000 [ 779.044981] Allocated by task 1257: [ 779.044987] __kasan_kmalloc.constprop.5+0xc1/0xd0 [ 779.044992] kmem_cache_alloc+0xad/0x1a0 [ 779.044997] getname_flags+0x6c/0x2a0 [ 779.045003] user_path_at_empty+0x1d/0x40 [ 779.045008] do_faccessat+0x12a/0x330 [ 779.045012] do_syscall_64+0x63/0x150 [ 779.045017] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 779.045019] Freed by task 1257: [ 779.045023] __kasan_slab_free+0x12e/0x180 [ 779.045029] kmem_cache_free+0x85/0x1b0 [ 779.045034] filename_lookup.part.70+0x176/0x250 [ 779.045039] do_faccessat+0x12a/0x330 [ 779.045043] do_syscall_64+0x63/0x150 [ 779.045048] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 779.045052] The buggy address belongs to the object at ffff88814f326600 which belongs to the cache names_cache of size 4096 [ 779.045057] The buggy address is located 872 bytes to the right of 4096-byte region [ffff88814f326600, ffff88814f327600) [ 779.045058] The buggy address belongs to the page: [ 779.045062] page:ffffea00053cc800 count:1 mapcount:0 mapping:ffff88815b191b40 index:0x0 compound_mapcount: 0 [ 779.045067] flags: 0x200000000010200(slab|head) [ 779.045075] raw: 0200000000010200 dead000000000100 dead000000000200 ffff88815b191b40 [ 779.045081] raw: 0000000000000000 0000000000070007 00000001ffffffff 0000000000000000 [ 779.045083] page dumped because: kasan: bad access detected [ 779.045085] Memory state around the buggy address: [ 779.045089] ffff88814f327800: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 779.045093] ffff88814f327880: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 779.045097] >ffff88814f327900: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 779.045099] ^ [ 779.045103] ffff88814f327980: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 779.045107] ffff88814f327a00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 779.045109] ================================================================== [ 779.045110] 
Disabling lock debugging due to kernel taint Correctly assign tree name str for smb3_tcon event. Signed-off-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/trace.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index fa226de48ef3..99c4d799c24b 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -549,19 +549,19 @@ DECLARE_EVENT_CLASS(smb3_tcon_class, __field(unsigned int, xid) __field(__u32, tid) __field(__u64, sesid) - __field(const char *, unc_name) + __string(name, unc_name) __field(int, rc) ), TP_fast_assign( __entry->xid = xid; __entry->tid = tid; __entry->sesid = sesid; - __entry->unc_name = unc_name; + __assign_str(name, unc_name); __entry->rc = rc; ), TP_printk("xid=%u sid=0x%llx tid=0x%x unc_name=%s rc=%d", __entry->xid, __entry->sesid, __entry->tid, - __entry->unc_name, __entry->rc) + __get_str(name), __entry->rc) ) #define DEFINE_SMB3_TCON_EVENT(name) \ From 8c11a607d1d9cd6e7f01fd6b03923597fb0ef95a Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 22 Mar 2019 22:31:17 -0500 Subject: [PATCH 815/855] SMB3: Fix SMB3.1.1 guest mounts to Samba Workaround problem with Samba responses to SMB3.1.1 null user (guest) mounts. The server doesn't set the expected flag in the session setup response so we have to do a similar check to what is done in smb3_validate_negotiate where we also check if the user is a null user (but not sec=krb5 since username might not be passed in on mount for Kerberos case). Note that the commit below tightened the conditions and forced signing for the SMB2-TreeConnect commands as per MS-SMB2. However, this should only apply to normal user sessions and not for cases where there is no user (even if server forgets to set the flag in the response) since we don't have anything useful to sign with. This is especially important now that the more secure SMB3.1.1 protocol is in the default dialect list. An earlier patch ("cifs: allow guest mounts to work for smb3.11") fixed the guest mounts to Windows. Fixes: 6188f28bf608 ("Tree connect for SMB3.1.1 must be signed for non-encrypted shares") Reviewed-by: Ronnie Sahlberg Reviewed-by: Paulo Alcantara CC: Stable Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 8e4a1da95418..21ac19ff19cb 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1631,10 +1631,13 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, /* * 3.11 tcon req must be signed if not encrypted. See MS-SMB2 3.2.4.1.1 * unless it is guest or anonymous user. 
See MS-SMB2 3.2.5.3.1 + * (Samba servers don't always set the flag so also check if null user) */ if ((ses->server->dialect == SMB311_PROT_ID) && !smb3_encryption_required(tcon) && - !(ses->session_flags & (SMB2_SESSION_FLAG_IS_GUEST|SMB2_SESSION_FLAG_IS_NULL))) + !(ses->session_flags & + (SMB2_SESSION_FLAG_IS_GUEST|SMB2_SESSION_FLAG_IS_NULL)) && + ((ses->user_name != NULL) || (ses->sectype == Kerberos))) req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; memset(&rqst, 0, sizeof(struct smb_rqst)); From cf7d624f8dcc9b833a8489208b6ef6dcc5dd308b Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 20 Mar 2019 16:42:50 -0500 Subject: [PATCH 816/855] cifs: update internal module version number To 2.19 Signed-off-by: Steve French --- fs/cifs/cifsfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 142164ef1f05..5c0298b9998f 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -150,5 +150,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.18" +#define CIFS_VERSION "2.19" #endif /* _CIFSFS_H */ From ffc8599aa9763f39f6736a79da4d1575e7006f9a Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Fri, 8 Mar 2019 11:05:08 +0800 Subject: [PATCH 817/855] x86/gart: Exclude GART aperture from kcore On machines where the GART aperture is mapped over physical RAM, /proc/kcore contains the GART aperture range. Accessing the GART range via /proc/kcore results in a kernel crash. vmcore used to have the same issue, until it was fixed with commit 2a3e83c6f96c ("x86/gart: Exclude GART aperture from vmcore")', leveraging existing hook infrastructure in vmcore to let /proc/vmcore return zeroes when attempting to read the aperture region, and so it won't read from the actual memory. Apply the same workaround for kcore. First implement the same hook infrastructure for kcore, then reuse the hook functions introduced in the previous vmcore fix. Just with some minor adjustment, rename some functions for more general usage, and simplify the hook infrastructure a bit as there is no module usage yet. Suggested-by: Baoquan He Signed-off-by: Kairui Song Signed-off-by: Thomas Gleixner Reviewed-by: Jiri Bohac Acked-by: Baoquan He Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Omar Sandoval Cc: Dave Young Link: https://lkml.kernel.org/r/20190308030508.13548-1-kasong@redhat.com --- arch/x86/kernel/aperture_64.c | 20 +++++++++++++------- fs/proc/kcore.c | 27 +++++++++++++++++++++++++++ include/linux/kcore.h | 2 ++ 3 files changed, 42 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 58176b56354e..294ed4392a0e 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -14,6 +14,7 @@ #define pr_fmt(fmt) "AGP: " fmt #include +#include #include #include #include @@ -57,7 +58,7 @@ int fallback_aper_force __initdata; int fix_aperture __initdata = 1; -#ifdef CONFIG_PROC_VMCORE +#if defined(CONFIG_PROC_VMCORE) || defined(CONFIG_PROC_KCORE) /* * If the first kernel maps the aperture over e820 RAM, the kdump kernel will * use the same range because it will remain configured in the northbridge. 
@@ -66,20 +67,25 @@ int fix_aperture __initdata = 1; */ static unsigned long aperture_pfn_start, aperture_page_count; -static int gart_oldmem_pfn_is_ram(unsigned long pfn) +static int gart_mem_pfn_is_ram(unsigned long pfn) { return likely((pfn < aperture_pfn_start) || (pfn >= aperture_pfn_start + aperture_page_count)); } -static void exclude_from_vmcore(u64 aper_base, u32 aper_order) +static void __init exclude_from_core(u64 aper_base, u32 aper_order) { aperture_pfn_start = aper_base >> PAGE_SHIFT; aperture_page_count = (32 * 1024 * 1024) << aper_order >> PAGE_SHIFT; - WARN_ON(register_oldmem_pfn_is_ram(&gart_oldmem_pfn_is_ram)); +#ifdef CONFIG_PROC_VMCORE + WARN_ON(register_oldmem_pfn_is_ram(&gart_mem_pfn_is_ram)); +#endif +#ifdef CONFIG_PROC_KCORE + WARN_ON(register_mem_pfn_is_ram(&gart_mem_pfn_is_ram)); +#endif } #else -static void exclude_from_vmcore(u64 aper_base, u32 aper_order) +static void exclude_from_core(u64 aper_base, u32 aper_order) { } #endif @@ -474,7 +480,7 @@ out: * may have allocated the range over its e820 RAM * and fixed up the northbridge */ - exclude_from_vmcore(last_aper_base, last_aper_order); + exclude_from_core(last_aper_base, last_aper_order); return 1; } @@ -520,7 +526,7 @@ out: * overlap with the first kernel's memory. We can't access the * range through vmcore even though it should be part of the dump. */ - exclude_from_vmcore(aper_alloc, aper_order); + exclude_from_core(aper_alloc, aper_order); /* Fix up the north bridges */ for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) { diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index bbcc185062bb..d29d869abec1 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -54,6 +54,28 @@ static LIST_HEAD(kclist_head); static DECLARE_RWSEM(kclist_lock); static int kcore_need_update = 1; +/* + * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error + * Same as oldmem_pfn_is_ram in vmcore + */ +static int (*mem_pfn_is_ram)(unsigned long pfn); + +int __init register_mem_pfn_is_ram(int (*fn)(unsigned long pfn)) +{ + if (mem_pfn_is_ram) + return -EBUSY; + mem_pfn_is_ram = fn; + return 0; +} + +static int pfn_is_ram(unsigned long pfn) +{ + if (mem_pfn_is_ram) + return mem_pfn_is_ram(pfn); + else + return 1; +} + /* This doesn't grab kclist_lock, so it should only be used at init time. 
*/ void __init kclist_add(struct kcore_list *new, void *addr, size_t size, int type) @@ -465,6 +487,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) goto out; } m = NULL; /* skip the list anchor */ + } else if (!pfn_is_ram(__pa(start) >> PAGE_SHIFT)) { + if (clear_user(buffer, tsz)) { + ret = -EFAULT; + goto out; + } } else if (m->type == KCORE_VMALLOC) { vread(buf, (char *)start, tsz); /* we have to zero-fill user buffer even if no read */ diff --git a/include/linux/kcore.h b/include/linux/kcore.h index 8c3f8c14eeaa..c843f4a9c512 100644 --- a/include/linux/kcore.h +++ b/include/linux/kcore.h @@ -44,6 +44,8 @@ void kclist_add_remap(struct kcore_list *m, void *addr, void *vaddr, size_t sz) m->vaddr = (unsigned long)vaddr; kclist_add(m, addr, sz, KCORE_REMAP); } + +extern int __init register_mem_pfn_is_ram(int (*fn)(unsigned long pfn)); #else static inline void kclist_add(struct kcore_list *new, void *addr, size_t size, int type) From 32d0be018f6f5ee2d5d19c4795304613560814cf Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Fri, 22 Mar 2019 14:54:11 -0700 Subject: [PATCH 818/855] clocksource/drivers/riscv: Fix clocksource mask For all riscv architectures (RV32, RV64 and RV128), the clocksource is a 64 bit incrementing counter. Fix the clock source mask accordingly. Tested on both 64bit and 32 bit virt machine in QEMU. Fixes: 62b019436814 ("clocksource: new RISC-V SBI timer driver") Signed-off-by: Atish Patra Signed-off-by: Thomas Gleixner Reviewed-by: Anup Patel Cc: Albert Ou Cc: Daniel Lezcano Cc: linux-riscv@lists.infradead.org Cc: Palmer Dabbelt Cc: Anup Patel Cc: Damien Le Moal Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20190322215411.19362-1-atish.patra@wdc.com --- drivers/clocksource/timer-riscv.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/clocksource/timer-riscv.c b/drivers/clocksource/timer-riscv.c index e8163693e936..5e6038fbf115 100644 --- a/drivers/clocksource/timer-riscv.c +++ b/drivers/clocksource/timer-riscv.c @@ -58,7 +58,7 @@ static u64 riscv_sched_clock(void) static DEFINE_PER_CPU(struct clocksource, riscv_clocksource) = { .name = "riscv_clocksource", .rating = 300, - .mask = CLOCKSOURCE_MASK(BITS_PER_LONG), + .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS, .read = riscv_clocksource_rdtime, }; @@ -120,8 +120,7 @@ static int __init riscv_timer_init_dt(struct device_node *n) return error; } - sched_clock_register(riscv_sched_clock, - BITS_PER_LONG, riscv_timebase); + sched_clock_register(riscv_sched_clock, 64, riscv_timebase); error = cpuhp_setup_state(CPUHP_AP_RISCV_TIMER_STARTING, "clockevents/riscv/timer:starting", From 93417a3fda2060f2a34e3341904024c5b6980d1f Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 28 Feb 2019 15:37:14 -0600 Subject: [PATCH 819/855] genirq: Mark expected switch case fall-through MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. With -Wimplicit-fallthrough added to CFLAGS: kernel/irq/manage.c: In function ‘irq_do_set_affinity’: kernel/irq/manage.c:198:3: warning: this statement may fall through [-Wimplicit-fallthrough=] cpumask_copy(desc->irq_common_data.affinity, mask); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ kernel/irq/manage.c:199:2: note: here case IRQ_SET_MASK_OK_NOCOPY: ^~~~ Annotate it. Signed-off-by: Gustavo A. R. 
Silva Signed-off-by: Thomas Gleixner Cc: Kees Cook Link: https://lkml.kernel.org/r/20190228213714.GA9246@embeddedor --- kernel/irq/manage.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9ec34a2a6638..1401afa0d58a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -196,6 +196,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, case IRQ_SET_MASK_OK: case IRQ_SET_MASK_OK_DONE: cpumask_copy(desc->irq_common_data.affinity, mask); + /* fall through */ case IRQ_SET_MASK_OK_NOCOPY: irq_validate_effective_affinity(data); irq_set_thread_affinity(desc); From 674a2b27234d1b7afcb0a9162e81b2e53aeef217 Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Sat, 23 Mar 2019 11:43:05 -0400 Subject: [PATCH 820/855] ext4: brelse all indirect buffer in ext4_ind_remove_space() All indirect buffers obtained by ext4_find_shared() should be released, no matter whether the branch should be freed or not. But currently we forget to release the lower-depth indirect buffers when removing space from the same higher-depth indirect block. This leads to a buffer leak and, furthermore, may lead to quota information corruption when using the old quota format; consider the following case. - Create and mount an empty ext4 filesystem without extent and quota features, - quotacheck and enable the user & group quota, - Create some files and write some data to them, and then punch holes in some of those files; this may trigger the buffer leak problem mentioned above. - Disable quota and run quotacheck again, it will create two new aquota files and write the checked quota information to them, which may reuse the freed indirect block (whose buffer and page cache were not freed) as a data block. - Enable quota again, it will invoke vfs_load_quota_inode()->invalidate_bdev() to try to clean unused buffers and pagecache. Unfortunately, because the buffer of the quota data block is still referenced, the quota code cannot read up-to-date quota info from the device, which leads to quota information corruption. This problem can be reproduced by xfstests generic/231 on an ext3 file system or an ext4 file system without extent and quota features. This patch fixes the problem by releasing the missing indirect buffers in ext4_ind_remove_space(). Reported-by: Hulk Robot Signed-off-by: zhangyi (F) Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara Cc: stable@kernel.org --- fs/ext4/indirect.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index c2225f0d31b5..02c39524bda2 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -1390,10 +1390,14 @@ end_range: partial->p + 1, partial2->p, (chain+n-1) - partial); - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); - BUFFER_TRACE(partial2->bh, "call brelse"); - brelse(partial2->bh); + while (partial > chain) { + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + } + while (partial2 > chain2) { + BUFFER_TRACE(partial2->bh, "call brelse"); + brelse(partial2->bh); + } return 0; } From d84dd3fb82fa7a094de7f08f10610d55a70cf0ca Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 19 Mar 2019 11:24:54 -0400 Subject: [PATCH 821/855] SUNRPC: Don't let RPC_SOFTCONN tasks time out if the transport is connected If the transport is still connected, then we do want to allow RPC_SOFTCONN tasks to retry. They should time out if and only if the connection is broken.
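In short, the new behaviour on a major timeout is (simplified sketch of the change in the diff below, not additional code):

	if (RPC_IS_SOFTCONN(task) && !rpc_check_connected(task->tk_rqstp)) {
		/* connection is broken: let the soft-connect task fail */
		rpc_exit(task, -ETIMEDOUT);
		return;
	}
	/* otherwise the task is left to retry; plain RPC_IS_SOFT() tasks
	 * are still handled by the existing "server not responding" logic */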
Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 228970e6e52b..187d10443a15 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2311,6 +2311,15 @@ out_exit: rpc_exit(task, status); } +static bool +rpc_check_connected(const struct rpc_rqst *req) +{ + /* No allocated request or transport? return true */ + if (!req || !req->rq_xprt) + return true; + return xprt_connected(req->rq_xprt); +} + static void rpc_check_timeout(struct rpc_task *task) { @@ -2322,10 +2331,11 @@ rpc_check_timeout(struct rpc_task *task) dprintk("RPC: %5u call_timeout (major)\n", task->tk_pid); task->tk_timeouts++; - if (RPC_IS_SOFTCONN(task)) { + if (RPC_IS_SOFTCONN(task) && !rpc_check_connected(task->tk_rqstp)) { rpc_exit(task, -ETIMEDOUT); return; } + if (RPC_IS_SOFT(task)) { if (clnt->cl_chatty) { printk(KERN_NOTICE "%s: server %s not responding, timed out\n", From 5e86bdda41534e17621d5a071b294943cae4376e Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Sat, 23 Mar 2019 11:56:01 -0400 Subject: [PATCH 822/855] ext4: cleanup bh release code in ext4_ind_remove_space() Currently, we are releasing the indirect buffer where we are done with it in ext4_ind_remove_space(), so we can see the brelse() and BUFFER_TRACE() everywhere. It seems fragile and hard to read, and we may probably forget to release the buffer some day. This patch cleans up the code by putting of the code which releases the buffers to the end of the function. Signed-off-by: zhangyi (F) Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/indirect.c | 47 ++++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 02c39524bda2..2024d3fa5504 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -1222,6 +1222,7 @@ int ext4_ind_remove_space(handle_t *handle, struct inode *inode, ext4_lblk_t offsets[4], offsets2[4]; Indirect chain[4], chain2[4]; Indirect *partial, *partial2; + Indirect *p = NULL, *p2 = NULL; ext4_lblk_t max_block; __le32 nr = 0, nr2 = 0; int n = 0, n2 = 0; @@ -1263,7 +1264,7 @@ int ext4_ind_remove_space(handle_t *handle, struct inode *inode, } - partial = ext4_find_shared(inode, n, offsets, chain, &nr); + partial = p = ext4_find_shared(inode, n, offsets, chain, &nr); if (nr) { if (partial == chain) { /* Shared branch grows from the inode */ @@ -1288,13 +1289,11 @@ int ext4_ind_remove_space(handle_t *handle, struct inode *inode, partial->p + 1, (__le32 *)partial->bh->b_data+addr_per_block, (chain+n-1) - partial); - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); partial--; } end_range: - partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); + partial2 = p2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); if (nr2) { if (partial2 == chain2) { /* @@ -1324,16 +1323,14 @@ end_range: (__le32 *)partial2->bh->b_data, partial2->p, (chain2+n2-1) - partial2); - BUFFER_TRACE(partial2->bh, "call brelse"); - brelse(partial2->bh); partial2--; } goto do_indirects; } /* Punch happened within the same level (n == n2) */ - partial = ext4_find_shared(inode, n, offsets, chain, &nr); - partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); + partial = p = ext4_find_shared(inode, n, offsets, chain, &nr); + partial2 = p2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); /* Free top, but only if partial2 isn't its subtree. 
*/ if (nr) { @@ -1390,15 +1387,7 @@ end_range: partial->p + 1, partial2->p, (chain+n-1) - partial); - while (partial > chain) { - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); - } - while (partial2 > chain2) { - BUFFER_TRACE(partial2->bh, "call brelse"); - brelse(partial2->bh); - } - return 0; + goto cleanup; } /* @@ -1413,8 +1402,6 @@ end_range: partial->p + 1, (__le32 *)partial->bh->b_data+addr_per_block, (chain+n-1) - partial); - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); partial--; } if (partial2 > chain2 && depth2 <= depth) { @@ -1422,11 +1409,21 @@ end_range: (__le32 *)partial2->bh->b_data, partial2->p, (chain2+n2-1) - partial2); - BUFFER_TRACE(partial2->bh, "call brelse"); - brelse(partial2->bh); partial2--; } } + +cleanup: + while (p && p > chain) { + BUFFER_TRACE(p->bh, "call brelse"); + brelse(p->bh); + p--; + } + while (p2 && p2 > chain2) { + BUFFER_TRACE(p2->bh, "call brelse"); + brelse(p2->bh); + p2--; + } return 0; do_indirects: @@ -1434,7 +1431,7 @@ do_indirects: switch (offsets[0]) { default: if (++n >= n2) - return 0; + break; nr = i_data[EXT4_IND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); @@ -1443,7 +1440,7 @@ do_indirects: /* fall through */ case EXT4_IND_BLOCK: if (++n >= n2) - return 0; + break; nr = i_data[EXT4_DIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); @@ -1452,7 +1449,7 @@ do_indirects: /* fall through */ case EXT4_DIND_BLOCK: if (++n >= n2) - return 0; + break; nr = i_data[EXT4_TIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); @@ -1462,5 +1459,5 @@ do_indirects: case EXT4_TIND_BLOCK: ; } - return 0; + goto cleanup; } From 5a698243930c441afccec04e4d5dc8febfd2b775 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 21 Mar 2019 17:57:56 -0400 Subject: [PATCH 823/855] NFS: Fix a typo in nfs_init_timeout_values() Specifying a retrans=0 mount parameter to a NFS/TCP mount, is inadvertently causing the NFS client to rewrite any specified timeout parameter to the default of 60 seconds. Fixes: a956beda19a6 ("NFS: Allow the mount option retrans=0") Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index fb1cf1a4bda2..90d71fda65ce 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -453,7 +453,7 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto, case XPRT_TRANSPORT_RDMA: if (retrans == NFS_UNSPEC_RETRANS) to->to_retries = NFS_DEF_TCP_RETRANS; - if (timeo == NFS_UNSPEC_TIMEO || to->to_retries == 0) + if (timeo == NFS_UNSPEC_TIMEO || to->to_initval == 0) to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10; if (to->to_initval > NFS_MAX_TCP_TIMEOUT) to->to_initval = NFS_MAX_TCP_TIMEOUT; From 166bd5b889ac61369c34650887a5c6b899f5e976 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 22 Mar 2019 23:03:56 -0400 Subject: [PATCH 824/855] pNFS/flexfiles: Fix layoutstats handling during read failovers During a read failover, we may end up changing the value of the pgio_mirror_idx, so make sure that we record the layout stats before that update. 
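The required ordering is, roughly (a simplified restatement of the hunk in the diff below):

	/* record layoutstats against the mirror the RPC actually used ... */
	ff_layout_read_record_layoutstats_done(task, hdr);
	ff_layout_send_layouterror(hdr->lseg);
	/* ... and only then switch the header to the newly chosen mirror */
	hdr->pgio_mirror_idx = new_idx;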
Signed-off-by: Trond Myklebust --- fs/nfs/flexfilelayout/flexfilelayout.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index f9264e1922a2..6673d4ff5a2a 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1289,6 +1289,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, static int ff_layout_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { + int new_idx = hdr->pgio_mirror_idx; int err; trace_nfs4_pnfs_read(hdr, task->tk_status); @@ -1307,7 +1308,7 @@ static int ff_layout_read_done_cb(struct rpc_task *task, case -NFS4ERR_RESET_TO_PNFS: if (ff_layout_choose_best_ds_for_read(hdr->lseg, hdr->pgio_mirror_idx + 1, - &hdr->pgio_mirror_idx)) + &new_idx)) goto out_layouterror; set_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags); return task->tk_status; @@ -1320,7 +1321,9 @@ static int ff_layout_read_done_cb(struct rpc_task *task, return 0; out_layouterror: + ff_layout_read_record_layoutstats_done(task, hdr); ff_layout_send_layouterror(hdr->lseg); + hdr->pgio_mirror_idx = new_idx; out_eagain: rpc_restart_call_prepare(task); return -EAGAIN; From 18915b5873f07e5030e6fb108a050fa7c71c59fb Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sat, 23 Mar 2019 12:10:29 -0400 Subject: [PATCH 825/855] ext4: prohibit fstrim in norecovery mode The ext4 fstrim implementation uses the block bitmaps to find free space that can be discarded. If we haven't replayed the journal, the bitmaps will be stale and we absolutely *cannot* use stale metadata to zap the underlying storage. Signed-off-by: Darrick J. Wong Signed-off-by: Theodore Ts'o --- fs/ext4/ioctl.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index eb8ca8d80885..73435444b159 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1000,6 +1000,13 @@ resizefs_out: if (!blk_queue_discard(q)) return -EOPNOTSUPP; + /* + * We haven't replayed the journal, so we cannot use our + * block-bitmap-guided storage zapping commands. + */ + if (test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) + return -EROFS; + if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range))) return -EFAULT; From a7fb107b7d8982ac76c958a0d2838a151b03e97e Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 21 Mar 2019 16:34:44 -0700 Subject: [PATCH 826/855] net: phy: Re-parent menus for MDIO bus drivers correctly After 90eff9096c01 ("net: phy: Allow splitting MDIO bus/device support from PHYs") the various MDIO bus drivers were no longer parented with config PHYLIB but with config MDIO_BUS which is not a menuconfig, fix this by depending on MDIO_DEVICE which is a menuconfig. This is visually nicer and less confusing for users. Fixes: 90eff9096c01 ("net: phy: Allow splitting MDIO bus/device support from PHYs") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/Kconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index 071869db44cf..520657945b82 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -7,6 +7,8 @@ menuconfig MDIO_DEVICE help MDIO devices and driver infrastructure code. +if MDIO_DEVICE + config MDIO_BUS tristate default m if PHYLIB=m @@ -179,6 +181,7 @@ config MDIO_XGENE APM X-Gene SoC's. 
endif +endif config PHYLINK tristate From fa3a419d2f674b431d38748cb58fb7da17ee8949 Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Fri, 22 Mar 2019 11:04:07 +0800 Subject: [PATCH 827/855] net: xilinx: fix possible object reference leak The call to of_parse_phandle returns a node pointer with refcount incremented thus it must be explicitly decremented after the last usage. Detected by coccinelle with the following warnings: ./drivers/net/ethernet/xilinx/xilinx_axienet_main.c:1624:1-7: ERROR: missing of_node_put; acquired a node pointer with refcount incremented on line 1569, but without a corresponding object release within this function. Signed-off-by: Wen Yang Cc: Anirudha Sarangi Cc: John Linn Cc: "David S. Miller" Cc: Michal Simek Cc: netdev@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: David S. Miller --- drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index ec7e7ec24ff9..4041c75997ba 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -1575,12 +1575,14 @@ static int axienet_probe(struct platform_device *pdev) ret = of_address_to_resource(np, 0, &dmares); if (ret) { dev_err(&pdev->dev, "unable to get DMA resource\n"); + of_node_put(np); goto free_netdev; } lp->dma_regs = devm_ioremap_resource(&pdev->dev, &dmares); if (IS_ERR(lp->dma_regs)) { dev_err(&pdev->dev, "could not map DMA regs\n"); ret = PTR_ERR(lp->dma_regs); + of_node_put(np); goto free_netdev; } lp->rx_irq = irq_of_parse_and_map(np, 1); From be693df3cf9dd113ff1d2c0d8150199efdba37f6 Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Fri, 22 Mar 2019 11:04:08 +0800 Subject: [PATCH 828/855] net: ibm: fix possible object reference leak The call to ehea_get_eth_dn returns a node pointer with refcount incremented thus it must be explicitly decremented after the last usage. Detected by coccinelle with the following warnings: ./drivers/net/ethernet/ibm/ehea/ehea_main.c:3163:2-8: ERROR: missing of_node_put; acquired a node pointer with refcount incremented on line 3154, but without a corresponding object release within this function. Signed-off-by: Wen Yang Cc: Douglas Miller Cc: "David S. Miller" Cc: netdev@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ehea/ehea_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/ibm/ehea/ehea_main.c b/drivers/net/ethernet/ibm/ehea/ehea_main.c index 3baabdc89726..90b62c1412c8 100644 --- a/drivers/net/ethernet/ibm/ehea/ehea_main.c +++ b/drivers/net/ethernet/ibm/ehea/ehea_main.c @@ -3160,6 +3160,7 @@ static ssize_t ehea_probe_port(struct device *dev, if (ehea_add_adapter_mr(adapter)) { pr_err("creating MR failed\n"); + of_node_put(eth_dn); return -EIO; } From 75eac7b5f68b0a0671e795ac636457ee27cc11d8 Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Fri, 22 Mar 2019 11:04:09 +0800 Subject: [PATCH 829/855] net: ethernet: ti: fix possible object reference leak The call to of_get_child_by_name returns a node pointer with refcount incremented thus it must be explicitly decremented after the last usage. 
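The pattern callers are expected to follow is, in general terms (illustrative sketch only, using a hypothetical use_port() helper; this is not code from the driver):

	struct device_node *port;
	int err;

	port = of_get_child_by_name(dev->of_node, "port");	/* reference is taken here */
	if (!port)
		return -ENODEV;
	err = use_port(port);
	of_node_put(port);	/* must be dropped on every exit path after the last use */
	return err;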
Detected by coccinelle with the following warnings: ./drivers/net/ethernet/ti/netcp_ethss.c:3661:2-8: ERROR: missing of_node_put; acquired a node pointer with refcount incremented on line 3654, but without a corresponding object release within this function. ./drivers/net/ethernet/ti/netcp_ethss.c:3665:2-8: ERROR: missing of_node_put; acquired a node pointer with refcount incremented on line 3654, but without a corresponding object release within this function. Signed-off-by: Wen Yang Cc: Wingman Kwok Cc: Murali Karicheri Cc: "David S. Miller" Cc: netdev@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/netcp_ethss.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ti/netcp_ethss.c b/drivers/net/ethernet/ti/netcp_ethss.c index 5174d318901e..0a920c5936b2 100644 --- a/drivers/net/ethernet/ti/netcp_ethss.c +++ b/drivers/net/ethernet/ti/netcp_ethss.c @@ -3657,12 +3657,16 @@ static int gbe_probe(struct netcp_device *netcp_device, struct device *dev, ret = netcp_txpipe_init(&gbe_dev->tx_pipe, netcp_device, gbe_dev->dma_chan_name, gbe_dev->tx_queue_id); - if (ret) + if (ret) { + of_node_put(interfaces); return ret; + } ret = netcp_txpipe_open(&gbe_dev->tx_pipe); - if (ret) + if (ret) { + of_node_put(interfaces); return ret; + } /* Create network interfaces */ INIT_LIST_HEAD(&gbe_dev->gbe_intf_head); From 23c78343ec36990709b636a9e02bad814f4384ad Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 22 Mar 2019 07:39:35 +0100 Subject: [PATCH 830/855] r8169: fix cable re-plugging issue Bartek reported that after few cable unplug/replug cycles suddenly replug isn't detected any longer. His system uses a RTL8106, I wasn't able to reproduce the issue with RTL8168g. According to his bisect the referenced commit caused the regression. As Realtek doesn't release datasheets or errata it's hard to say what's the actual root cause, but this change was reported to fix the issue. Fixes: 38caff5a445b ("r8169: handle all interrupt events in the hard irq handler") Reported-by: Bartosz Skrzypczak Suggested-by: Bartosz Skrzypczak Tested-by: Bartosz Skrzypczak Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index 9dd1cd2c0c68..7562ccbbb39a 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -6542,7 +6542,7 @@ static irqreturn_t rtl8169_interrupt(int irq, void *dev_instance) set_bit(RTL_FLAG_TASK_RESET_PENDING, tp->wk.flags); } - if (status & RTL_EVENT_NAPI) { + if (status & (RTL_EVENT_NAPI | LinkChg)) { rtl_irq_disable(tp); napi_schedule_irqoff(&tp->napi); } From 064c5d6881e897077639e04973de26440ee205e6 Mon Sep 17 00:00:00 2001 From: John Hurley Date: Fri, 22 Mar 2019 12:37:35 +0000 Subject: [PATCH 831/855] net: sched: fix cleanup NULL pointer exception in act_mirr A new mirred action is created by the tcf_mirred_init function. This contains a list head struct which is inserted into a global list on successful creation of a new action. However, after a creation, it is still possible to error out and call the tcf_idr_release function. This, in turn, calls the act_mirr cleanup function via __tcf_idr_release and __tcf_action_put. This cleanup function tries to delete the list entry which is as yet uninitialised, leading to a NULL pointer exception. 
Fix this by initialising the list entry on creation of a new action. Bug report: BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 PGD 8000000840c73067 P4D 8000000840c73067 PUD 858dcc067 PMD 0 Oops: 0002 [#1] SMP PTI CPU: 32 PID: 5636 Comm: handler194 Tainted: G OE 5.0.0+ #186 Hardware name: Dell Inc. PowerEdge R730/0599V5, BIOS 1.3.6 06/03/2015 RIP: 0010:tcf_mirred_release+0x42/0xa7 [act_mirred] Code: f0 90 39 c0 e8 52 04 57 c8 48 c7 c7 b8 80 39 c0 e8 94 fa d4 c7 48 8b 93 d0 00 00 00 48 8b 83 d8 00 00 00 48 c7 c7 f0 90 39 c0 <48> 89 42 08 48 89 10 48 b8 00 01 00 00 00 00 ad de 48 89 83 d0 00 RSP: 0018:ffffac4aa059f688 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffff9dcd1b214d00 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffff9dcd1fa165f8 RDI: ffffffffc03990f0 RBP: ffff9dccf9c7af80 R08: 0000000000000a3b R09: 0000000000000000 R10: ffff9dccfa11f420 R11: 0000000000000000 R12: 0000000000000001 R13: ffff9dcd16b433c0 R14: ffff9dcd1b214d80 R15: 0000000000000000 FS: 00007f441bfff700(0000) GS:ffff9dcd1fa00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000008 CR3: 0000000839e64004 CR4: 00000000001606e0 Call Trace: tcf_action_cleanup+0x59/0xca __tcf_action_put+0x54/0x6b __tcf_idr_release.cold.33+0x9/0x12 tcf_mirred_init.cold.20+0x22e/0x3b0 [act_mirred] tcf_action_init_1+0x3d0/0x4c0 tcf_action_init+0x9c/0x130 tcf_exts_validate+0xab/0xc0 fl_change+0x1ca/0x982 [cls_flower] tc_new_tfilter+0x647/0x8d0 ? load_balance+0x14b/0x9e0 rtnetlink_rcv_msg+0xe3/0x370 ? __switch_to_asm+0x40/0x70 ? __switch_to_asm+0x34/0x70 ? _cond_resched+0x15/0x30 ? __kmalloc_node_track_caller+0x1d4/0x2b0 ? rtnl_calcit.isra.31+0xf0/0xf0 netlink_rcv_skb+0x49/0x110 netlink_unicast+0x16f/0x210 netlink_sendmsg+0x1df/0x390 sock_sendmsg+0x36/0x40 ___sys_sendmsg+0x27b/0x2c0 ? futex_wake+0x80/0x140 ? do_futex+0x2b9/0xac0 ? ep_scan_ready_list.constprop.22+0x1f2/0x210 ? ep_poll+0x7a/0x430 __sys_sendmsg+0x47/0x80 do_syscall_64+0x55/0x100 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 4e232818bd32 ("net: sched: act_mirred: remove dependency on rtnl lock") Signed-off-by: John Hurley Reviewed-by: Jakub Kicinski Acked-by: Cong Wang Signed-off-by: David S. 
Miller --- net/sched/act_mirred.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index cd712e4e8998..17cc6bd4c57c 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -159,12 +159,15 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, tcf_idr_release(*a, bind); return -EEXIST; } + + m = to_mirred(*a); + if (ret == ACT_P_CREATED) + INIT_LIST_HEAD(&m->tcfm_list); + err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; - m = to_mirred(*a); - spin_lock_bh(&m->tcf_lock); if (parm->ifindex) { From 737889efe9713a0f20a75fd0de952841d9275e6b Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 22 Mar 2019 15:03:51 +0100 Subject: [PATCH 832/855] tipc: tipc clang warning When checking the code with clang -Wsometimes-uninitialized we get the following warning: if (!tipc_link_is_establishing(l)) { ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ net/tipc/node.c:847:46: note: uninitialized use occurs here tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr); net/tipc/node.c:831:2: note: remove the 'if' if its condition is always true if (!tipc_link_is_establishing(l)) { ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ net/tipc/node.c:821:31: note: initialize the variable 'maddr' to silence this warning struct tipc_media_addr *maddr; We fix this by initializing 'maddr' to NULL. For the matter of clarity, we also test if 'xmitq' is non-empty before we use it and 'maddr' further down in the function. It will never happen that 'xmitq' is non- empty at the same time as 'maddr' is NULL, so this is a sufficient test. Fixes: 598411d70f85 ("tipc: make resetting of links non-atomic") Reported-by: Nathan Chancellor Signed-off-by: Jon Maloy Signed-off-by: David S. 
Miller --- net/tipc/node.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/tipc/node.c b/net/tipc/node.c index 2dc4919ab23c..dd3b6dc17662 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -817,10 +817,10 @@ static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete) { struct tipc_link_entry *le = &n->links[bearer_id]; + struct tipc_media_addr *maddr = NULL; struct tipc_link *l = le->link; - struct tipc_media_addr *maddr; - struct sk_buff_head xmitq; int old_bearer_id = bearer_id; + struct sk_buff_head xmitq; if (!l) return; @@ -844,7 +844,8 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete) tipc_node_write_unlock(n); if (delete) tipc_mon_remove_peer(n->net, n->addr, old_bearer_id); - tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr); + if (!skb_queue_empty(&xmitq)) + tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr); tipc_sk_rcv(n->net, &le->inputq); } From 526949e877f44672d408bfe291e39860c13f2e24 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 22 Mar 2019 15:18:43 +0100 Subject: [PATCH 833/855] rxrpc: avoid clang -Wuninitialized warning clang produces a false-positive warning as it fails to notice that "lost = true" implies that "ret" is initialized: net/rxrpc/output.c:402:6: error: variable 'ret' is used uninitialized whenever 'if' condition is true [-Werror,-Wsometimes-uninitialized] if (lost) ^~~~ net/rxrpc/output.c:437:6: note: uninitialized use occurs here if (ret >= 0) { ^~~ net/rxrpc/output.c:402:2: note: remove the 'if' if its condition is always false if (lost) ^~~~~~~~~ net/rxrpc/output.c:339:9: note: initialize the variable 'ret' to silence this warning int ret, opt; ^ = 0 Rearrange the code to make that more obvious and avoid the warning. Signed-off-by: Arnd Bergmann Reviewed-by: Nathan Chancellor Signed-off-by: David S. Miller --- net/rxrpc/output.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 736aa9281100..004c762c2e8d 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -335,7 +335,6 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, struct kvec iov[2]; rxrpc_serial_t serial; size_t len; - bool lost = false; int ret, opt; _enter(",{%d}", skb->len); @@ -393,14 +392,14 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, static int lose; if ((lose++ & 7) == 7) { ret = 0; - lost = true; + trace_rxrpc_tx_data(call, sp->hdr.seq, serial, + whdr.flags, retrans, true); + goto done; } } - trace_rxrpc_tx_data(call, sp->hdr.seq, serial, whdr.flags, - retrans, lost); - if (lost) - goto done; + trace_rxrpc_tx_data(call, sp->hdr.seq, serial, whdr.flags, retrans, + false); /* send the packet with the don't fragment bit set if we currently * think it's small enough */ From 2a6a8e2d9004b5303fcb494588ba3a3b87a256c3 Mon Sep 17 00:00:00 2001 From: Alexander Shiyan Date: Thu, 20 Dec 2018 14:16:26 +0300 Subject: [PATCH 834/855] clocksource/drivers/clps711x: Remove board support Since board support for the CLPS711X platform was removed, remove the board support from the clps711x-timer driver. 
Signed-off-by: Alexander Shiyan Signed-off-by: Thomas Gleixner Acked-by: Arnd Bergmann Cc: Daniel Lezcano Link: https://lkml.kernel.org/r/20181220111626.17140-1-shc_work@mail.ru --- drivers/clocksource/clps711x-timer.c | 45 ++++++++-------------------- 1 file changed, 13 insertions(+), 32 deletions(-) diff --git a/drivers/clocksource/clps711x-timer.c b/drivers/clocksource/clps711x-timer.c index cdc251524f5e..857f8c086274 100644 --- a/drivers/clocksource/clps711x-timer.c +++ b/drivers/clocksource/clps711x-timer.c @@ -31,16 +31,9 @@ static u64 notrace clps711x_sched_clock_read(void) return ~readw(tcd); } -static int __init _clps711x_clksrc_init(struct clk *clock, void __iomem *base) +static void __init clps711x_clksrc_init(struct clk *clock, void __iomem *base) { - unsigned long rate; - - if (!base) - return -ENOMEM; - if (IS_ERR(clock)) - return PTR_ERR(clock); - - rate = clk_get_rate(clock); + unsigned long rate = clk_get_rate(clock); tcd = base; @@ -48,8 +41,6 @@ static int __init _clps711x_clksrc_init(struct clk *clock, void __iomem *base) clocksource_mmio_readw_down); sched_clock_register(clps711x_sched_clock_read, 16, rate); - - return 0; } static irqreturn_t clps711x_timer_interrupt(int irq, void *dev_id) @@ -67,13 +58,6 @@ static int __init _clps711x_clkevt_init(struct clk *clock, void __iomem *base, struct clock_event_device *clkevt; unsigned long rate; - if (!irq) - return -EINVAL; - if (!base) - return -ENOMEM; - if (IS_ERR(clock)) - return PTR_ERR(clock); - clkevt = kzalloc(sizeof(*clkevt), GFP_KERNEL); if (!clkevt) return -ENOMEM; @@ -93,32 +77,29 @@ static int __init _clps711x_clkevt_init(struct clk *clock, void __iomem *base, "clps711x-timer", clkevt); } -static void __init clps711x_clksrc_init(void __iomem *tc1_base, - void __iomem *tc2_base, - unsigned int irq) -{ - struct clk *tc1 = clk_get_sys("clps711x-timer.0", NULL); - struct clk *tc2 = clk_get_sys("clps711x-timer.1", NULL); - - BUG_ON(_clps711x_clksrc_init(tc1, tc1_base)); - BUG_ON(_clps711x_clkevt_init(tc2, tc2_base, irq)); -} - -#ifdef CONFIG_TIMER_OF static int __init clps711x_timer_init(struct device_node *np) { unsigned int irq = irq_of_parse_and_map(np, 0); struct clk *clock = of_clk_get(np, 0); void __iomem *base = of_iomap(np, 0); + if (!base) + return -ENOMEM; + if (!irq) + return -EINVAL; + if (IS_ERR(clock)) + return PTR_ERR(clock); + switch (of_alias_get_id(np, "timer")) { case CLPS711X_CLKSRC_CLOCKSOURCE: - return _clps711x_clksrc_init(clock, base); + clps711x_clksrc_init(clock, base); + break; case CLPS711X_CLKSRC_CLOCKEVENT: return _clps711x_clkevt_init(clock, base, irq); default: return -EINVAL; } + + return 0; } TIMER_OF_DECLARE(clps711x, "cirrus,ep7209-timer", clps711x_timer_init); -#endif From 8c2ffd9174779014c3fe1f96d9dc3641d9175f00 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 24 Mar 2019 14:02:26 -0700 Subject: [PATCH 835/855] Linux 5.1-rc2 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 99c0530489ef..c0a34064c574 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 1 SUBLEVEL = 0 -EXTRAVERSION = -rc1 +EXTRAVERSION = -rc2 NAME = Shy Crocodile # *DOCUMENTATION* From 1d382264d911d91a8be5dbed1f0e053eb3245d81 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 23 Mar 2019 01:49:10 +0100 Subject: [PATCH 836/855] bpf, libbpf: fix version info and add it to shared object Even though libbpf's versioning script for the linker (libbpf.map) is pointing to 0.0.2, the BPF_EXTRAVERSION in the Makefile has not been 
updated along with it and is therefore still on 0.0.1. While fixing up, I also noticed that the generated shared object versioning information is missing, typical convention is to have a linker name (libbpf.so), soname (libbpf.so.0) and real name (libbpf.so.0.0.2) for library management. This is based upon the LIBBPF_VERSION as well. The build will then produce the following bpf libraries: # ll libbpf* libbpf.a libbpf.so -> libbpf.so.0.0.2 libbpf.so.0 -> libbpf.so.0.0.2 libbpf.so.0.0.2 # readelf -d libbpf.so.0.0.2 | grep SONAME 0x000000000000000e (SONAME) Library soname: [libbpf.so.0] And install them accordingly: # rm -rf /tmp/bld; mkdir /tmp/bld; make -j$(nproc) O=/tmp/bld install Auto-detecting system features: ... libelf: [ on ] ... bpf: [ on ] CC /tmp/bld/libbpf.o CC /tmp/bld/bpf.o CC /tmp/bld/nlattr.o CC /tmp/bld/btf.o CC /tmp/bld/libbpf_errno.o CC /tmp/bld/str_error.o CC /tmp/bld/netlink.o CC /tmp/bld/bpf_prog_linfo.o CC /tmp/bld/libbpf_probes.o CC /tmp/bld/xsk.o LD /tmp/bld/libbpf-in.o LINK /tmp/bld/libbpf.a LINK /tmp/bld/libbpf.so.0.0.2 LINK /tmp/bld/test_libbpf INSTALL /tmp/bld/libbpf.a INSTALL /tmp/bld/libbpf.so.0.0.2 # ll /usr/local/lib64/libbpf.* /usr/local/lib64/libbpf.a /usr/local/lib64/libbpf.so -> libbpf.so.0.0.2 /usr/local/lib64/libbpf.so.0 -> libbpf.so.0.0.2 /usr/local/lib64/libbpf.so.0.0.2 Fixes: 1bf4b05810fe ("tools: bpftool: add probes for eBPF program types") Fixes: 1b76c13e4b36 ("bpf tools: Introduce 'bpf' library and add bpf feature check") Reported-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/Makefile | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 61aaacf0cfa1..5bf8e52c41fc 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -3,7 +3,7 @@ BPF_VERSION = 0 BPF_PATCHLEVEL = 0 -BPF_EXTRAVERSION = 1 +BPF_EXTRAVERSION = 2 MAKEFLAGS += --no-print-directory @@ -79,8 +79,6 @@ export prefix libdir src obj libdir_SQ = $(subst ','\'',$(libdir)) libdir_relative_SQ = $(subst ','\'',$(libdir_relative)) -LIB_FILE = libbpf.a libbpf.so - VERSION = $(BPF_VERSION) PATCHLEVEL = $(BPF_PATCHLEVEL) EXTRAVERSION = $(BPF_EXTRAVERSION) @@ -88,7 +86,10 @@ EXTRAVERSION = $(BPF_EXTRAVERSION) OBJ = $@ N = -LIBBPF_VERSION = $(BPF_VERSION).$(BPF_PATCHLEVEL).$(BPF_EXTRAVERSION) +LIBBPF_VERSION = $(BPF_VERSION).$(BPF_PATCHLEVEL).$(BPF_EXTRAVERSION) + +LIB_TARGET = libbpf.a libbpf.so.$(LIBBPF_VERSION) +LIB_FILE = libbpf.a libbpf.so* # Set compile option CFLAGS ifdef EXTRA_CFLAGS @@ -128,16 +129,18 @@ all: export srctree OUTPUT CC LD CFLAGS V include $(srctree)/tools/build/Makefile.include -BPF_IN := $(OUTPUT)libbpf-in.o -LIB_FILE := $(addprefix $(OUTPUT),$(LIB_FILE)) -VERSION_SCRIPT := libbpf.map +BPF_IN := $(OUTPUT)libbpf-in.o +VERSION_SCRIPT := libbpf.map + +LIB_TARGET := $(addprefix $(OUTPUT),$(LIB_TARGET)) +LIB_FILE := $(addprefix $(OUTPUT),$(LIB_FILE)) GLOBAL_SYM_COUNT = $(shell readelf -s --wide $(BPF_IN) | \ awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {s++} END{print s}') VERSIONED_SYM_COUNT = $(shell readelf -s --wide $(OUTPUT)libbpf.so | \ grep -Eo '[^ ]+@LIBBPF_' | cut -d@ -f1 | sort -u | wc -l) -CMD_TARGETS = $(LIB_FILE) +CMD_TARGETS = $(LIB_TARGET) CXX_TEST_TARGET = $(OUTPUT)test_libbpf @@ -170,9 +173,13 @@ $(BPF_IN): force elfdep bpfdep echo "Warning: Kernel ABI header at 'tools/include/uapi/linux/if_xdp.h' differs from latest version at 'include/uapi/linux/if_xdp.h'" >&2 )) || true $(Q)$(MAKE) 
$(build)=libbpf -$(OUTPUT)libbpf.so: $(BPF_IN) - $(QUIET_LINK)$(CC) --shared -Wl,--version-script=$(VERSION_SCRIPT) \ - $^ -o $@ +$(OUTPUT)libbpf.so: $(OUTPUT)libbpf.so.$(LIBBPF_VERSION) + +$(OUTPUT)libbpf.so.$(LIBBPF_VERSION): $(BPF_IN) + $(QUIET_LINK)$(CC) --shared -Wl,-soname,libbpf.so.$(VERSION) \ + -Wl,--version-script=$(VERSION_SCRIPT) $^ -o $@ + @ln -sf $(@F) $(OUTPUT)libbpf.so + @ln -sf $(@F) $(OUTPUT)libbpf.so.$(VERSION) $(OUTPUT)libbpf.a: $(BPF_IN) $(QUIET_LINK)$(RM) $@; $(AR) rcs $@ $^ @@ -192,6 +199,12 @@ check_abi: $(OUTPUT)libbpf.so exit 1; \ fi +define do_install_mkdir + if [ ! -d '$(DESTDIR_SQ)$1' ]; then \ + $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$1'; \ + fi +endef + define do_install if [ ! -d '$(DESTDIR_SQ)$2' ]; then \ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$2'; \ @@ -200,8 +213,9 @@ define do_install endef install_lib: all_cmd - $(call QUIET_INSTALL, $(LIB_FILE)) \ - $(call do_install,$(LIB_FILE),$(libdir_SQ)) + $(call QUIET_INSTALL, $(LIB_TARGET)) \ + $(call do_install_mkdir,$(libdir_SQ)); \ + cp -fpR $(LIB_FILE) $(DESTDIR)$(libdir_SQ) install_headers: $(call QUIET_INSTALL, headers) \ @@ -219,7 +233,7 @@ config-clean: clean: $(call QUIET_CLEAN, libbpf) $(RM) $(TARGETS) $(CXX_TEST_TARGET) \ - *.o *~ *.a *.so .*.d .*.cmd LIBBPF-CFLAGS + *.o *~ *.a *.so *.so.$(VERSION) .*.d .*.cmd LIBBPF-CFLAGS $(call QUIET_CLEAN, core-gen) $(RM) $(OUTPUT)FEATURE-DUMP.libbpf From 63197f78bca2d86093126783b0ee6519bd652435 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 23 Mar 2019 01:49:11 +0100 Subject: [PATCH 837/855] bpf, libbpf: clarify bump in libbpf version info The current documentation suggests that we would need to bump the libbpf version on every change. Lets clarify this a bit more and reflect what we do today in practice, that is, bumping it once per development cycle. Fixes: 76d1b894c515 ("libbpf: Document API and ABI conventions") Reported-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/README.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/lib/bpf/README.rst b/tools/lib/bpf/README.rst index 5788479384ca..cef7b77eab69 100644 --- a/tools/lib/bpf/README.rst +++ b/tools/lib/bpf/README.rst @@ -111,6 +111,7 @@ starting from ``0.0.1``. Every time ABI is being changed, e.g. because a new symbol is added or semantic of existing symbol is changed, ABI version should be bumped. +This bump in ABI version is at most once per kernel development cycle. For example, if current state of ``libbpf.map`` is: From 945ab8f6de94430c23a82f3cf2e3f6d6f2945ff7 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 25 Mar 2019 08:15:14 -0400 Subject: [PATCH 838/855] locks: wake any locks blocked on request before deadlock check Andreas reported that he was seeing the tdbtorture test fail in some cases with -EDEADLCK when it wasn't before. Some debugging showed that deadlock detection was sometimes discovering the caller's lock request itself in a dependency chain. While we remove the request from the blocked_lock_hash prior to reattempting to acquire it, any locks that are blocked on that request will still be present in the hash and will still have their fl_blocker pointer set to the current request. This causes posix_locks_deadlock to find a deadlock dependency chain when it shouldn't, as a lock request cannot block itself. We are going to end up waking all of those blocked locks anyway when we go to reinsert the request back into the blocked_lock_hash, so just do it prior to checking for deadlocks. 
This ensures that any lock blocked on the current request will no longer be part of any blocked request chain. URL: https://bugzilla.kernel.org/show_bug.cgi?id=202975 Fixes: 5946c4319ebb ("fs/locks: allow a lock request to block other requests.") Cc: stable@vger.kernel.org Reported-by: Andreas Schneider Signed-off-by: Neil Brown Signed-off-by: Jeff Layton --- fs/locks.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/locks.c b/fs/locks.c index eaa1cfaf73b0..71d0c6c2aac5 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1160,6 +1160,11 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, */ error = -EDEADLK; spin_lock(&blocked_lock_lock); + /* + * Ensure that we don't find any locks blocked on this + * request during deadlock detection. + */ + __locks_wake_up_blocks(request); if (likely(!posix_locks_deadlock(request, fl))) { error = FILE_LOCK_DEFERRED; __locks_insert_block(fl, request, From a3ac7917b73070010c05b4485b8582a6c9cd69b6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 25 Mar 2019 14:49:00 -0700 Subject: [PATCH 839/855] Revert "parport: daisy: use new parport device model" This reverts commit 1aec4211204d9463d1fd209eb50453de16254599. Steven Rostedt reports that it causes a hang at bootup and bisected it to this commit. The troigger is apparently a module alias for "parport_lowlevel" that points to "parport_pc", which causes a hang with modprobe -q -- parport_lowlevel blocking forever with a backtrace like this: wait_for_completion_killable+0x1c/0x28 call_usermodehelper_exec+0xa7/0x108 __request_module+0x351/0x3d8 get_lowlevel_driver+0x28/0x41 [parport] __parport_register_driver+0x39/0x1f4 [parport] daisy_drv_init+0x31/0x4f [parport] parport_bus_init+0x5d/0x7b [parport] parport_default_proc_register+0x26/0x1000 [parport] do_one_initcall+0xc2/0x1e0 do_init_module+0x50/0x1d4 load_module+0x1c2e/0x21b3 sys_init_module+0xef/0x117 Supid says: "Due to the new device model daisy driver will now try to find the parallel ports while trying to register its driver so that it can bind with them. Now, since daisy driver is loaded while parport bus is initialising the list of parport is still empty and it tries to load the lowlevel driver, which has an alias set to parport_pc, now causes a deadlock" But I don't think the daisy driver should be loaded by the parport initialization in the first place, so let's revert the whole change. If the daisy driver can just initialize separately on its own (like a driver should), instead of hooking into the parport init sequence directly, this issue probably would go away. 
Reported-and-bisected-by: Steven Rostedt (VMware) Reported-by: Michal Kubecek Acked-by: Greg Kroah-Hartman Cc: Sudip Mukherjee Signed-off-by: Linus Torvalds --- drivers/parport/daisy.c | 32 +------------------------------- drivers/parport/probe.c | 2 +- drivers/parport/share.c | 10 +--------- include/linux/parport.h | 13 ------------- 4 files changed, 3 insertions(+), 54 deletions(-) diff --git a/drivers/parport/daisy.c b/drivers/parport/daisy.c index 56dd83a45e55..5484a46dafda 100644 --- a/drivers/parport/daisy.c +++ b/drivers/parport/daisy.c @@ -213,12 +213,10 @@ void parport_daisy_fini(struct parport *port) struct pardevice *parport_open(int devnum, const char *name) { struct daisydev *p = topology; - struct pardev_cb par_cb; struct parport *port; struct pardevice *dev; int daisy; - memset(&par_cb, 0, sizeof(par_cb)); spin_lock(&topology_lock); while (p && p->devnum != devnum) p = p->next; @@ -232,7 +230,7 @@ struct pardevice *parport_open(int devnum, const char *name) port = parport_get_port(p->port); spin_unlock(&topology_lock); - dev = parport_register_dev_model(port, name, &par_cb, devnum); + dev = parport_register_device(port, name, NULL, NULL, NULL, 0, NULL); parport_put_port(port); if (!dev) return NULL; @@ -482,31 +480,3 @@ static int assign_addrs(struct parport *port) kfree(deviceid); return detected; } - -static int daisy_drv_probe(struct pardevice *par_dev) -{ - struct device_driver *drv = par_dev->dev.driver; - - if (strcmp(drv->name, "daisy_drv")) - return -ENODEV; - if (strcmp(par_dev->name, daisy_dev_name)) - return -ENODEV; - - return 0; -} - -static struct parport_driver daisy_driver = { - .name = "daisy_drv", - .probe = daisy_drv_probe, - .devmodel = true, -}; - -int daisy_drv_init(void) -{ - return parport_register_driver(&daisy_driver); -} - -void daisy_drv_exit(void) -{ - parport_unregister_driver(&daisy_driver); -} diff --git a/drivers/parport/probe.c b/drivers/parport/probe.c index e5e6a463a941..e035174ba205 100644 --- a/drivers/parport/probe.c +++ b/drivers/parport/probe.c @@ -257,7 +257,7 @@ static ssize_t parport_read_device_id (struct parport *port, char *buffer, ssize_t parport_device_id (int devnum, char *buffer, size_t count) { ssize_t retval = -ENXIO; - struct pardevice *dev = parport_open(devnum, daisy_dev_name); + struct pardevice *dev = parport_open (devnum, "Device ID probe"); if (!dev) return -ENXIO; diff --git a/drivers/parport/share.c b/drivers/parport/share.c index 0171b8dbcdcd..5dc53d420ca8 100644 --- a/drivers/parport/share.c +++ b/drivers/parport/share.c @@ -137,19 +137,11 @@ static struct bus_type parport_bus_type = { int parport_bus_init(void) { - int retval; - - retval = bus_register(&parport_bus_type); - if (retval) - return retval; - daisy_drv_init(); - - return 0; + return bus_register(&parport_bus_type); } void parport_bus_exit(void) { - daisy_drv_exit(); bus_unregister(&parport_bus_type); } diff --git a/include/linux/parport.h b/include/linux/parport.h index f41f1d041e2c..397607a0c0eb 100644 --- a/include/linux/parport.h +++ b/include/linux/parport.h @@ -460,7 +460,6 @@ extern size_t parport_ieee1284_epp_read_addr (struct parport *, void *, size_t, int); /* IEEE1284.3 functions */ -#define daisy_dev_name "Device ID probe" extern int parport_daisy_init (struct parport *port); extern void parport_daisy_fini (struct parport *port); extern struct pardevice *parport_open (int devnum, const char *name); @@ -469,18 +468,6 @@ extern ssize_t parport_device_id (int devnum, char *buffer, size_t len); extern void parport_daisy_deselect_all (struct 
parport *port); extern int parport_daisy_select (struct parport *port, int daisy, int mode); -#ifdef CONFIG_PARPORT_1284 -extern int daisy_drv_init(void); -extern void daisy_drv_exit(void); -#else -static inline int daisy_drv_init(void) -{ - return 0; -} - -static inline void daisy_drv_exit(void) {} -#endif - /* Lowlevel drivers _can_ call this support function to handle irqs. */ static inline void parport_generic_irq(struct parport *port) { From d29f5aa0bc0c321e1b9e4658a2a7e08e885da52a Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 22 Mar 2019 20:00:20 +0100 Subject: [PATCH 840/855] net: phy: don't clear BMCR in genphy_soft_reset So far we effectively clear the BMCR register. Some PHY's can deal with this (e.g. because they reset BMCR to a default as part of a soft-reset) whilst on others this causes issues because e.g. the autoneg bit is cleared. Marvell is an example, see also thread [0]. So let's be a little bit more gentle and leave all bits we're not interested in as-is. This change is needed for PHY drivers to properly deal with the original patch. [0] https://marc.info/?t=155264050700001&r=1&w=2 Fixes: 6e2d85ec0559 ("net: phy: Stop with excessive soft reset") Tested-by: Phil Reid Tested-by: liweihang Signed-off-by: Heiner Kallweit Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/phy_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 49fdd1ee798e..77068c545de0 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1831,7 +1831,7 @@ int genphy_soft_reset(struct phy_device *phydev) { int ret; - ret = phy_write(phydev, MII_BMCR, BMCR_RESET); + ret = phy_set_bits(phydev, MII_BMCR, BMCR_RESET); if (ret < 0) return ret; From ff9d31d0d46672e201fc9ff59c42f1eef5f00c77 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 20 Mar 2019 12:53:33 -0500 Subject: [PATCH 841/855] tracing: Remove unnecessary var_ref destroy in track_data_destroy() Commit 656fe2ba85e8 (tracing: Use hist trigger's var_ref array to destroy var_refs) centralized the destruction of all the var_refs in one place so that other code didn't have to do it. The track_data_destroy() added later ignored that and also destroyed the track_data var_ref, causing a double-free error flagged by KASAN. ================================================================== BUG: KASAN: use-after-free in destroy_hist_field+0x30/0x70 Read of size 8 at addr ffff888086df2210 by task bash/1694 CPU: 6 PID: 1694 Comm: bash Not tainted 5.1.0-rc1-test+ #15 Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v03.03 07/14/2016 Call Trace: dump_stack+0x71/0xa0 ? destroy_hist_field+0x30/0x70 print_address_description.cold.3+0x9/0x1fb ? destroy_hist_field+0x30/0x70 ? destroy_hist_field+0x30/0x70 kasan_report.cold.4+0x1a/0x33 ? __kasan_slab_free+0x100/0x150 ? destroy_hist_field+0x30/0x70 destroy_hist_field+0x30/0x70 track_data_destroy+0x55/0xe0 destroy_hist_data+0x1f0/0x350 hist_unreg_all+0x203/0x220 event_trigger_open+0xbb/0x130 do_dentry_open+0x296/0x700 ? stacktrace_count_trigger+0x30/0x30 ? generic_permission+0x56/0x200 ? __x64_sys_fchdir+0xd0/0xd0 ? inode_permission+0x55/0x200 ? security_inode_permission+0x18/0x60 path_openat+0x633/0x22b0 ? path_lookupat.isra.50+0x420/0x420 ? __kasan_kmalloc.constprop.12+0xc1/0xd0 ? kmem_cache_alloc+0xe5/0x260 ? getname_flags+0x6c/0x2a0 ? do_sys_open+0x149/0x2b0 ? do_syscall_64+0x73/0x1b0 ? entry_SYSCALL_64_after_hwframe+0x44/0xa9 ? 
_raw_write_lock_bh+0xe0/0xe0 ? __kernel_text_address+0xe/0x30 ? unwind_get_return_address+0x2f/0x50 ? __list_add_valid+0x2d/0x70 ? deactivate_slab.isra.62+0x1f4/0x5a0 ? getname_flags+0x6c/0x2a0 ? set_track+0x76/0x120 do_filp_open+0x11a/0x1a0 ? may_open_dev+0x50/0x50 ? _raw_spin_lock+0x7a/0xd0 ? _raw_write_lock_bh+0xe0/0xe0 ? __alloc_fd+0x10f/0x200 do_sys_open+0x1db/0x2b0 ? filp_open+0x50/0x50 do_syscall_64+0x73/0x1b0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7fa7b24a4ca2 Code: 25 00 00 41 00 3d 00 00 41 00 74 4c 48 8d 05 85 7a 0d 00 8b 00 85 c0 75 6d 89 f2 b8 01 01 00 00 48 89 fe bf 9c ff ff ff 0f 05 <48> 3d 00 f0 ff ff 0f 87 a2 00 00 00 48 8b 4c 24 28 64 48 33 0c 25 RSP: 002b:00007fffbafb3af0 EFLAGS: 00000246 ORIG_RAX: 0000000000000101 RAX: ffffffffffffffda RBX: 000055d3648ade30 RCX: 00007fa7b24a4ca2 RDX: 0000000000000241 RSI: 000055d364a55240 RDI: 00000000ffffff9c RBP: 00007fffbafb3bf0 R08: 0000000000000020 R09: 0000000000000002 R10: 00000000000001b6 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000003 R14: 0000000000000001 R15: 000055d364a55240 ================================================================== So remove the track_data_destroy() destroy_hist_field() call for that var_ref. Link: http://lkml.kernel.org/r/1deffec420f6a16d11dd8647318d34a66d1989a9.camel@linux.intel.com Fixes: 466f4528fbc69 ("tracing: Generalize hist trigger onmax and save action") Reported-by: Steven Rostedt (VMware) Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index ca46339f3009..795aa2038377 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -3713,7 +3713,6 @@ static void track_data_destroy(struct hist_trigger_data *hist_data, struct trace_event_file *file = hist_data->event_file; destroy_hist_field(data->track_data.track_var, 0); - destroy_hist_field(data->track_data.var_ref, 0); if (data->action == ACTION_SNAPSHOT) { struct track_data *track_data; From 3dee10da2e9ff220e054a8f158cc296c797fbe81 Mon Sep 17 00:00:00 2001 From: Frank Rowand Date: Thu, 21 Mar 2019 23:58:20 -0700 Subject: [PATCH 842/855] tracing: initialize variable in create_dyn_event() Fix compile warning in create_dyn_event(): 'ret' may be used uninitialized in this function [-Wuninitialized]. 
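The warning is justified because 'ret' is only assigned inside the loop that walks the registered dyn_event_operations; if the list is empty or the loop body never runs, the final return would read an uninitialized value. Roughly (a simplified sketch of the surrounding logic; the actual change is in the diff below):

	int ret = -ENODEV;	/* the fix: a sane default when no ops handles argv */

	list_for_each_entry(ops, &dyn_event_ops_list, list) {
		ret = ops->create(argc, (const char **)argv);
		if (!ret || ret != -ECANCELED)
			break;
	}
	return ret;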
Link: http://lkml.kernel.org/r/1553237900-8555-1-git-send-email-frowand.list@gmail.com Cc: Masami Hiramatsu Cc: Ingo Molnar Cc: Tom Zanussi Cc: Ravi Bangoria Cc: stable@vger.kernel.org Fixes: 5448d44c3855 ("tracing: Add unified dynamic event framework") Signed-off-by: Frank Rowand Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_dynevent.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index dd1f43588d70..fa100ed3b4de 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -74,7 +74,7 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type) static int create_dyn_event(int argc, char **argv) { struct dyn_event_operations *ops; - int ret; + int ret = -ENODEV; if (argv[0][0] == '-' || argv[0][0] == '!') return dyn_event_release(argc, argv, NULL); From 9efb85c5cfac7e1f0caae4471446d936ff2163fe Mon Sep 17 00:00:00 2001 From: Hariprasad Kelam Date: Sun, 24 Mar 2019 00:05:23 +0530 Subject: [PATCH 843/855] ftrace: Fix warning using plain integer as NULL & spelling corrections Changed 0 --> NULL to avoid sparse warning Corrected spelling mistakes reported by checkpatch.pl Sparse warning below: sudo make C=2 CF=-D__CHECK_ENDIAN__ M=kernel/trace CHECK kernel/trace/ftrace.c kernel/trace/ftrace.c:3007:24: warning: Using plain integer as NULL pointer kernel/trace/ftrace.c:4758:37: warning: Using plain integer as NULL pointer Link: http://lkml.kernel.org/r/20190323183523.GA2244@hari-Inspiron-1545 Signed-off-by: Hariprasad Kelam Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fa79323331b2..26c8ca9bd06b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1992,7 +1992,7 @@ static void print_bug_type(void) * modifying the code. @failed should be one of either: * EFAULT - if the problem happens on reading the @ip address * EINVAL - if what is read at @ip is not what was expected - * EPERM - if the problem happens on writting to the @ip address + * EPERM - if the problem happens on writing to the @ip address */ void ftrace_bug(int failed, struct dyn_ftrace *rec) { @@ -2391,7 +2391,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); } - return -1; /* unknow ftrace bug */ + return -1; /* unknown ftrace bug */ } void __weak ftrace_replace_code(int mod_flags) @@ -3004,7 +3004,7 @@ ftrace_allocate_pages(unsigned long num_to_init) int cnt; if (!num_to_init) - return 0; + return NULL; start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL); if (!pg) @@ -4755,7 +4755,7 @@ static int ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, int reset, int enable) { - return ftrace_set_hash(ops, 0, 0, ip, remove, reset, enable); + return ftrace_set_hash(ops, NULL, 0, ip, remove, reset, enable); } /** @@ -5463,7 +5463,7 @@ void ftrace_create_filter_files(struct ftrace_ops *ops, /* * The name "destroy_filter_files" is really a misnomer. Although - * in the future, it may actualy delete the files, but this is + * in the future, it may actually delete the files, but this is * really intended to make sure the ops passed in are disabled * and that when this function returns, the caller is free to * free the ops. @@ -5786,7 +5786,7 @@ void ftrace_module_enable(struct module *mod) /* * If the tracing is enabled, go ahead and enable the record. 
* - * The reason not to enable the record immediatelly is the + * The reason not to enable the record immediately is the * inherent check of ftrace_make_nop/ftrace_make_call for * correct previous instructions. Making first the NOP * conversion puts the module to the correct state, thus From fb1eb41a3dd4cfff274c98f3c3324ab329641298 Mon Sep 17 00:00:00 2001 From: Christian Lamparter Date: Fri, 22 Mar 2019 01:05:00 +0100 Subject: [PATCH 844/855] dt-bindings: net: dsa: qca8k: fix example In the example, the phy at phy@0 is clashing with the switch0@0 at the same address. Usually, the switches are accessible through pseudo PHYs which in case of the qca8k are located at 0x10 - 0x18. Reviewed-by: Florian Fainelli Signed-off-by: Christian Lamparter Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/dsa/qca8k.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/net/dsa/qca8k.txt b/Documentation/devicetree/bindings/net/dsa/qca8k.txt index bbcb255c3150..5eda99e6c86e 100644 --- a/Documentation/devicetree/bindings/net/dsa/qca8k.txt +++ b/Documentation/devicetree/bindings/net/dsa/qca8k.txt @@ -55,12 +55,12 @@ Example: reg = <4>; }; - switch0@0 { + switch@10 { compatible = "qca,qca8337"; #address-cells = <1>; #size-cells = <0>; - reg = <0>; + reg = <0x10>; ports { #address-cells = <1>; From 5e07321f3388e6f2b13c43ae9df3e09efa8418e0 Mon Sep 17 00:00:00 2001 From: Christian Lamparter Date: Fri, 22 Mar 2019 01:05:01 +0100 Subject: [PATCH 845/855] dt-bindings: net: dsa: qca8k: support internal mdio-bus This patch updates the qca8k's binding to document to the approach for using the internal mdio-bus of the supported qca8k switches. Reviewed-by: Florian Fainelli Signed-off-by: Christian Lamparter Signed-off-by: David S. Miller --- .../devicetree/bindings/net/dsa/qca8k.txt | 69 +++++++++++++++++-- 1 file changed, 64 insertions(+), 5 deletions(-) diff --git a/Documentation/devicetree/bindings/net/dsa/qca8k.txt b/Documentation/devicetree/bindings/net/dsa/qca8k.txt index 5eda99e6c86e..93a7469e70d4 100644 --- a/Documentation/devicetree/bindings/net/dsa/qca8k.txt +++ b/Documentation/devicetree/bindings/net/dsa/qca8k.txt @@ -12,10 +12,15 @@ Required properties: Subnodes: The integrated switch subnode should be specified according to the binding -described in dsa/dsa.txt. As the QCA8K switches do not have a N:N mapping of -port and PHY id, each subnode describing a port needs to have a valid phandle -referencing the internal PHY connected to it. The CPU port of this switch is -always port 0. +described in dsa/dsa.txt. If the QCA8K switch is connect to a SoC's external +mdio-bus each subnode describing a port needs to have a valid phandle +referencing the internal PHY it is connected to. This is because there's no +N:N mapping of port and PHY id. + +Don't use mixed external and internal mdio-bus configurations, as this is +not supported by the hardware. + +The CPU port of this switch is always port 0. A CPU port node has the following optional node: @@ -31,8 +36,9 @@ For QCA8K the 'fixed-link' sub-node supports only the following properties: - 'full-duplex' (boolean, optional), to indicate that full duplex is used. When absent, half duplex is assumed. 
-Example: +Examples: +for the external mdio-bus configuration: &mdio0 { phy_port1: phy@0 { @@ -108,3 +114,56 @@ Example: }; }; }; + +for the internal master mdio-bus configuration: + + &mdio0 { + switch@10 { + compatible = "qca,qca8337"; + #address-cells = <1>; + #size-cells = <0>; + + reg = <0x10>; + + ports { + #address-cells = <1>; + #size-cells = <0>; + + port@0 { + reg = <0>; + label = "cpu"; + ethernet = <&gmac1>; + phy-mode = "rgmii"; + fixed-link { + speed = 1000; + full-duplex; + }; + }; + + port@1 { + reg = <1>; + label = "lan1"; + }; + + port@2 { + reg = <2>; + label = "lan2"; + }; + + port@3 { + reg = <3>; + label = "lan3"; + }; + + port@4 { + reg = <4>; + label = "lan4"; + }; + + port@5 { + reg = <5>; + label = "wan"; + }; + }; + }; + }; From 1eec7151ae0e134bd42e3f128066b2ff8da21393 Mon Sep 17 00:00:00 2001 From: Christian Lamparter Date: Fri, 22 Mar 2019 01:05:02 +0100 Subject: [PATCH 846/855] net: dsa: qca8k: remove leftover phy accessors This belated patch implements Andrew Lunn's request of "remove the phy_read() and phy_write() functions." While seemingly harmless, this causes the switch's user port PHYs to get registered twice. This is because the DSA subsystem will create a slave mdio-bus not knowing that the qca8k_phy_(read|write) accessors operate on the external mdio-bus. So the same "bus" gets effectively duplicated. Cc: stable@vger.kernel.org Fixes: 6b93fb46480a ("net-next: dsa: add new driver for qca8xxx family") Signed-off-by: Christian Lamparter Signed-off-by: David S. Miller --- drivers/net/dsa/qca8k.c | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c index 576b37d12a63..14ad78225f07 100644 --- a/drivers/net/dsa/qca8k.c +++ b/drivers/net/dsa/qca8k.c @@ -624,22 +624,6 @@ qca8k_adjust_link(struct dsa_switch *ds, int port, struct phy_device *phy) qca8k_port_set_status(priv, port, 1); } -static int -qca8k_phy_read(struct dsa_switch *ds, int phy, int regnum) -{ - struct qca8k_priv *priv = (struct qca8k_priv *)ds->priv; - - return mdiobus_read(priv->bus, phy, regnum); -} - -static int -qca8k_phy_write(struct dsa_switch *ds, int phy, int regnum, u16 val) -{ - struct qca8k_priv *priv = (struct qca8k_priv *)ds->priv; - - return mdiobus_write(priv->bus, phy, regnum, val); -} - static void qca8k_get_strings(struct dsa_switch *ds, int port, u32 stringset, uint8_t *data) { @@ -879,8 +863,6 @@ static const struct dsa_switch_ops qca8k_switch_ops = { .setup = qca8k_setup, .adjust_link = qca8k_adjust_link, .get_strings = qca8k_get_strings, - .phy_read = qca8k_phy_read, - .phy_write = qca8k_phy_write, .get_ethtool_stats = qca8k_get_ethtool_stats, .get_sset_count = qca8k_get_sset_count, .get_mac_eee = qca8k_get_mac_eee, From db460c54b67fc2cbe6dcef88b7bf3cba8e07f80e Mon Sep 17 00:00:00 2001 From: Christian Lamparter Date: Fri, 22 Mar 2019 01:05:03 +0100 Subject: [PATCH 847/855] net: dsa: qca8k: extend slave-bus implementations This patch implements accessors for the QCA8337 MDIO access through the MDIO_MASTER register, which makes it possible to access the PHYs on slave-bus through the switch. In cases where the switch ports are already mapped via external "phy-phandles", the internal mdio-bus is disabled in order to prevent a duplicated discovery and enumeration of the same PHYs. Don't use mixed external and internal mdio-bus configurations, as this is not supported by the hardware. Signed-off-by: Christian Lamparter Signed-off-by: David S. 
Miller --- drivers/net/dsa/qca8k.c | 156 +++++++++++++++++++++++++++++++++++++++- drivers/net/dsa/qca8k.h | 13 ++++ 2 files changed, 168 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c index 14ad78225f07..c4fa400efdcc 100644 --- a/drivers/net/dsa/qca8k.c +++ b/drivers/net/dsa/qca8k.c @@ -481,6 +481,155 @@ qca8k_port_set_status(struct qca8k_priv *priv, int port, int enable) qca8k_reg_clear(priv, QCA8K_REG_PORT_STATUS(port), mask); } +static u32 +qca8k_port_to_phy(int port) +{ + /* From Andrew Lunn: + * Port 0 has no internal phy. + * Port 1 has an internal PHY at MDIO address 0. + * Port 2 has an internal PHY at MDIO address 1. + * ... + * Port 5 has an internal PHY at MDIO address 4. + * Port 6 has no internal PHY. + */ + + return port - 1; +} + +static int +qca8k_mdio_write(struct qca8k_priv *priv, int port, u32 regnum, u16 data) +{ + u32 phy, val; + + if (regnum >= QCA8K_MDIO_MASTER_MAX_REG) + return -EINVAL; + + /* callee is responsible for not passing bad ports, + * but we still would like to make spills impossible. + */ + phy = qca8k_port_to_phy(port) % PHY_MAX_ADDR; + val = QCA8K_MDIO_MASTER_BUSY | QCA8K_MDIO_MASTER_EN | + QCA8K_MDIO_MASTER_WRITE | QCA8K_MDIO_MASTER_PHY_ADDR(phy) | + QCA8K_MDIO_MASTER_REG_ADDR(regnum) | + QCA8K_MDIO_MASTER_DATA(data); + + qca8k_write(priv, QCA8K_MDIO_MASTER_CTRL, val); + + return qca8k_busy_wait(priv, QCA8K_MDIO_MASTER_CTRL, + QCA8K_MDIO_MASTER_BUSY); +} + +static int +qca8k_mdio_read(struct qca8k_priv *priv, int port, u32 regnum) +{ + u32 phy, val; + + if (regnum >= QCA8K_MDIO_MASTER_MAX_REG) + return -EINVAL; + + /* callee is responsible for not passing bad ports, + * but we still would like to make spills impossible. + */ + phy = qca8k_port_to_phy(port) % PHY_MAX_ADDR; + val = QCA8K_MDIO_MASTER_BUSY | QCA8K_MDIO_MASTER_EN | + QCA8K_MDIO_MASTER_READ | QCA8K_MDIO_MASTER_PHY_ADDR(phy) | + QCA8K_MDIO_MASTER_REG_ADDR(regnum); + + qca8k_write(priv, QCA8K_MDIO_MASTER_CTRL, val); + + if (qca8k_busy_wait(priv, QCA8K_MDIO_MASTER_CTRL, + QCA8K_MDIO_MASTER_BUSY)) + return -ETIMEDOUT; + + val = (qca8k_read(priv, QCA8K_MDIO_MASTER_CTRL) & + QCA8K_MDIO_MASTER_DATA_MASK); + + return val; +} + +static int +qca8k_phy_write(struct dsa_switch *ds, int port, int regnum, u16 data) +{ + struct qca8k_priv *priv = ds->priv; + + return qca8k_mdio_write(priv, port, regnum, data); +} + +static int +qca8k_phy_read(struct dsa_switch *ds, int port, int regnum) +{ + struct qca8k_priv *priv = ds->priv; + int ret; + + ret = qca8k_mdio_read(priv, port, regnum); + + if (ret < 0) + return 0xffff; + + return ret; +} + +static int +qca8k_setup_mdio_bus(struct qca8k_priv *priv) +{ + u32 internal_mdio_mask = 0, external_mdio_mask = 0, reg; + struct device_node *ports, *port; + int err; + + ports = of_get_child_by_name(priv->dev->of_node, "ports"); + if (!ports) + return -EINVAL; + + for_each_available_child_of_node(ports, port) { + err = of_property_read_u32(port, "reg", ®); + if (err) + return err; + + if (!dsa_is_user_port(priv->ds, reg)) + continue; + + if (of_property_read_bool(port, "phy-handle")) + external_mdio_mask |= BIT(reg); + else + internal_mdio_mask |= BIT(reg); + } + + if (!external_mdio_mask && !internal_mdio_mask) { + dev_err(priv->dev, "no PHYs are defined.\n"); + return -EINVAL; + } + + /* The QCA8K_MDIO_MASTER_EN Bit, which grants access to PHYs through + * the MDIO_MASTER register also _disconnects_ the external MDC + * passthrough to the internal PHYs. It's not possible to use both + * configurations at the same time! 
+ * + * Because this came up during the review process: + * If the external mdio-bus driver is capable magically disabling + * the QCA8K_MDIO_MASTER_EN and mutex/spin-locking out the qca8k's + * accessors for the time being, it would be possible to pull this + * off. + */ + if (!!external_mdio_mask && !!internal_mdio_mask) { + dev_err(priv->dev, "either internal or external mdio bus configuration is supported.\n"); + return -EINVAL; + } + + if (external_mdio_mask) { + /* Make sure to disable the internal mdio bus in cases + * a dt-overlay and driver reload changed the configuration + */ + + qca8k_reg_clear(priv, QCA8K_MDIO_MASTER_CTRL, + QCA8K_MDIO_MASTER_EN); + return 0; + } + + priv->ops.phy_read = qca8k_phy_read; + priv->ops.phy_write = qca8k_phy_write; + return 0; +} + static int qca8k_setup(struct dsa_switch *ds) { @@ -502,6 +651,10 @@ qca8k_setup(struct dsa_switch *ds) if (IS_ERR(priv->regmap)) pr_warn("regmap initialization failed"); + ret = qca8k_setup_mdio_bus(priv); + if (ret) + return ret; + /* Initialize CPU port pad mode (xMII type, delays...) */ phy_mode = of_get_phy_mode(ds->ports[QCA8K_CPU_PORT].dn); if (phy_mode < 0) { @@ -905,7 +1058,8 @@ qca8k_sw_probe(struct mdio_device *mdiodev) return -ENOMEM; priv->ds->priv = priv; - priv->ds->ops = &qca8k_switch_ops; + priv->ops = qca8k_switch_ops; + priv->ds->ops = &priv->ops; mutex_init(&priv->reg_mutex); dev_set_drvdata(&mdiodev->dev, priv); diff --git a/drivers/net/dsa/qca8k.h b/drivers/net/dsa/qca8k.h index d146e54c8a6c..249fd62268e5 100644 --- a/drivers/net/dsa/qca8k.h +++ b/drivers/net/dsa/qca8k.h @@ -49,6 +49,18 @@ #define QCA8K_MIB_FLUSH BIT(24) #define QCA8K_MIB_CPU_KEEP BIT(20) #define QCA8K_MIB_BUSY BIT(17) +#define QCA8K_MDIO_MASTER_CTRL 0x3c +#define QCA8K_MDIO_MASTER_BUSY BIT(31) +#define QCA8K_MDIO_MASTER_EN BIT(30) +#define QCA8K_MDIO_MASTER_READ BIT(27) +#define QCA8K_MDIO_MASTER_WRITE 0 +#define QCA8K_MDIO_MASTER_SUP_PRE BIT(26) +#define QCA8K_MDIO_MASTER_PHY_ADDR(x) ((x) << 21) +#define QCA8K_MDIO_MASTER_REG_ADDR(x) ((x) << 16) +#define QCA8K_MDIO_MASTER_DATA(x) (x) +#define QCA8K_MDIO_MASTER_DATA_MASK GENMASK(15, 0) +#define QCA8K_MDIO_MASTER_MAX_PORTS 5 +#define QCA8K_MDIO_MASTER_MAX_REG 32 #define QCA8K_GOL_MAC_ADDR0 0x60 #define QCA8K_GOL_MAC_ADDR1 0x64 #define QCA8K_REG_PORT_STATUS(_i) (0x07c + (_i) * 4) @@ -169,6 +181,7 @@ struct qca8k_priv { struct dsa_switch *ds; struct mutex reg_mutex; struct device *dev; + struct dsa_switch_ops ops; }; struct qca8k_mib_desc { From 1f8389bf63aec99218c62490869ca38d1a38ce46 Mon Sep 17 00:00:00 2001 From: Leslie Monis Date: Sat, 23 Mar 2019 19:11:33 +0530 Subject: [PATCH 848/855] net: sched: Kconfig: update reference link for PIE RFC 8033 replaces the IETF draft for PIE Signed-off-by: Leslie Monis Signed-off-by: David S. Miller --- net/sched/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 1b9afdee5ba9..5c02ad97ef23 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -358,8 +358,7 @@ config NET_SCH_PIE help Say Y here if you want to use the Proportional Integral controller Enhanced scheduler packet scheduling algorithm. - For more information, please see - http://tools.ietf.org/html/draft-pan-tsvwg-pie-00 + For more information, please see https://tools.ietf.org/html/rfc8033 To compile this driver as a module, choose M here: the module will be called sch_pie. 
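For reference on the PIE scheduler enabled by CONFIG_NET_SCH_PIE above: RFC 8033 drives its drop decisions from a drop probability that is recomputed periodically from a proportional and an integral term on the measured queueing delay. The sketch below is a simplified, floating-point illustration of that update step only; the variable names, the single auto-tuning threshold and the clamping are assumptions for readability, and the in-kernel sch_pie implementation works in fixed point with additional scaling and safeguards.

    /*
     * Illustrative only: the periodic drop-probability update at the
     * heart of PIE, per RFC 8033. Floating point for clarity; the real
     * sch_pie code uses fixed-point arithmetic and more safeguards.
     */
    static double pie_prob;      /* current drop probability */
    static double qdelay_old;    /* queue delay seen at the previous update */

    static void pie_update_probability(double qdelay, double target,
                                       double alpha, double beta)
    {
            /* Proportional term: distance from the target delay.
             * Integral term: trend since the previous update interval.
             */
            double delta = alpha * (qdelay - target) +
                           beta * (qdelay - qdelay_old);

            /* RFC 8033 scales the adjustment down while the probability
             * is still small so the controller stays stable; reduced to
             * a single threshold in this sketch.
             */
            if (pie_prob < 0.01)
                    delta /= 8;

            pie_prob += delta;
            if (pie_prob < 0.0)
                    pie_prob = 0.0;
            if (pie_prob > 1.0)
                    pie_prob = 1.0;

            qdelay_old = qdelay;
    }

Once the option is enabled, the qdisc is typically attached from userspace with tc, for example "tc qdisc add dev eth0 root pie".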
From b7ebee2f95fb0fa2862d5ed2de707f872c311393 Mon Sep 17 00:00:00 2001 From: Dmitry Bezrukov Date: Sat, 23 Mar 2019 13:59:53 +0000 Subject: [PATCH 849/855] net: usb: aqc111: Extend HWID table by QNAP device New device of QNAP based on aqc111u Add this ID to blacklist of cdc_ether driver as well Signed-off-by: Dmitry Bezrukov Signed-off-by: David S. Miller --- drivers/net/usb/aqc111.c | 15 +++++++++++++++ drivers/net/usb/cdc_ether.c | 8 ++++++++ 2 files changed, 23 insertions(+) diff --git a/drivers/net/usb/aqc111.c b/drivers/net/usb/aqc111.c index 820a2fe7d027..aff995be2a31 100644 --- a/drivers/net/usb/aqc111.c +++ b/drivers/net/usb/aqc111.c @@ -1301,6 +1301,20 @@ static const struct driver_info trendnet_info = { .tx_fixup = aqc111_tx_fixup, }; +static const struct driver_info qnap_info = { + .description = "QNAP QNA-UC5G1T USB to 5GbE Adapter", + .bind = aqc111_bind, + .unbind = aqc111_unbind, + .status = aqc111_status, + .link_reset = aqc111_link_reset, + .reset = aqc111_reset, + .stop = aqc111_stop, + .flags = FLAG_ETHER | FLAG_FRAMING_AX | + FLAG_AVOID_UNLINK_URBS | FLAG_MULTI_PACKET, + .rx_fixup = aqc111_rx_fixup, + .tx_fixup = aqc111_tx_fixup, +}; + static int aqc111_suspend(struct usb_interface *intf, pm_message_t message) { struct usbnet *dev = usb_get_intfdata(intf); @@ -1455,6 +1469,7 @@ static const struct usb_device_id products[] = { {AQC111_USB_ETH_DEV(0x0b95, 0x2790, asix111_info)}, {AQC111_USB_ETH_DEV(0x0b95, 0x2791, asix112_info)}, {AQC111_USB_ETH_DEV(0x20f4, 0xe05a, trendnet_info)}, + {AQC111_USB_ETH_DEV(0x1c04, 0x0015, qnap_info)}, { },/* END */ }; MODULE_DEVICE_TABLE(usb, products); diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c index 5512a1038721..3e9b2c319e45 100644 --- a/drivers/net/usb/cdc_ether.c +++ b/drivers/net/usb/cdc_ether.c @@ -851,6 +851,14 @@ static const struct usb_device_id products[] = { .driver_info = 0, }, +/* QNAP QNA-UC5G1T USB to 5GbE Adapter (based on AQC111U) */ +{ + USB_DEVICE_AND_INTERFACE_INFO(0x1c04, 0x0015, USB_CLASS_COMM, + USB_CDC_SUBCLASS_ETHERNET, + USB_CDC_PROTO_NONE), + .driver_info = 0, +}, + /* WHITELIST!!! * * CDC Ether uses two interfaces, not necessarily consecutive. From 9926cb5f8b0f0aea535735185600d74db7608550 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 24 Mar 2019 00:48:22 +0800 Subject: [PATCH 850/855] tipc: change to check tipc_own_id to return in tipc_net_stop When running a syz script, a panic occurred: [ 156.088228] BUG: KASAN: use-after-free in tipc_disc_timeout+0x9c9/0xb20 [tipc] [ 156.094315] Call Trace: [ 156.094844] [ 156.095306] dump_stack+0x7c/0xc0 [ 156.097346] print_address_description+0x65/0x22e [ 156.100445] kasan_report.cold.3+0x37/0x7a [ 156.102402] tipc_disc_timeout+0x9c9/0xb20 [tipc] [ 156.106517] call_timer_fn+0x19a/0x610 [ 156.112749] run_timer_softirq+0xb51/0x1090 It was caused by the netns freed without deleting the discoverer timer, while later on the netns would be accessed in the timer handler. The timer should have been deleted by tipc_net_stop() when cleaning up a netns. However, tipc has been able to enable a bearer and start d->timer without the local node_addr set since Commit 52dfae5c85a4 ("tipc: obtain node identity from interface by default"), which caused the timer not to be deleted in tipc_net_stop() then. So fix it in tipc_net_stop() by changing to check local node_id instead of local node_addr, as Jon suggested. While at it, remove the calling of tipc_nametbl_withdraw() there, since tipc_nametbl_stop() will take of the nametbl's freeing after. 
Fixes: 52dfae5c85a4 ("tipc: obtain node identity from interface by default") Reported-by: syzbot+a25307ad099309f1c2b9@syzkaller.appspotmail.com Signed-off-by: Xin Long Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/net.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/net/tipc/net.c b/net/tipc/net.c index f076edb74338..7ce1e86b024f 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -163,12 +163,9 @@ void tipc_sched_net_finalize(struct net *net, u32 addr) void tipc_net_stop(struct net *net) { - u32 self = tipc_own_addr(net); - - if (!self) + if (!tipc_own_id(net)) return; - tipc_nametbl_withdraw(net, TIPC_CFG_SRV, self, self, self); rtnl_lock(); tipc_bearer_stop(net); tipc_node_stop(net); From 450895d04ba13a96886eddfeddb11556ae8624f1 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 24 Mar 2019 00:18:46 +0200 Subject: [PATCH 851/855] net: phy: bcm54xx: Encode link speed and activity into LEDs Previously the green and amber LEDs on this quad PHY were solid, to indicate an encoding of the link speed (10/100/1000). This keeps the LEDs always on just as before, but now they flash on Rx/Tx activity. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/broadcom.c | 13 +++++++++++++ include/linux/brcmphy.h | 16 ++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c index 9605d4fe540b..cb86a3e90c7d 100644 --- a/drivers/net/phy/broadcom.c +++ b/drivers/net/phy/broadcom.c @@ -323,6 +323,19 @@ static int bcm54xx_config_init(struct phy_device *phydev) bcm54xx_phydsp_config(phydev); + /* Encode link speed into LED1 and LED3 pair (green/amber). + * Also flash these two LEDs on activity. This means configuring + * them for MULTICOLOR and encoding link/activity into them. + */ + val = BCM5482_SHD_LEDS1_LED1(BCM_LED_SRC_MULTICOLOR1) | + BCM5482_SHD_LEDS1_LED3(BCM_LED_SRC_MULTICOLOR1); + bcm_phy_write_shadow(phydev, BCM5482_SHD_LEDS1, val); + + val = BCM_LED_MULTICOLOR_IN_PHASE | + BCM5482_SHD_LEDS1_LED1(BCM_LED_MULTICOLOR_LINK_ACT) | + BCM5482_SHD_LEDS1_LED3(BCM_LED_MULTICOLOR_LINK_ACT); + bcm_phy_write_exp(phydev, BCM_EXP_MULTICOLOR, val); + return 0; } diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 9cd00a37b8d3..6db2d9a6e503 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -148,6 +148,22 @@ #define BCM_LED_SRC_OFF 0xe /* Tied high */ #define BCM_LED_SRC_ON 0xf /* Tied low */ +/* + * Broadcom Multicolor LED configurations (expansion register 4) + */ +#define BCM_EXP_MULTICOLOR (MII_BCM54XX_EXP_SEL_ER + 0x04) +#define BCM_LED_MULTICOLOR_IN_PHASE BIT(8) +#define BCM_LED_MULTICOLOR_LINK_ACT 0x0 +#define BCM_LED_MULTICOLOR_SPEED 0x1 +#define BCM_LED_MULTICOLOR_ACT_FLASH 0x2 +#define BCM_LED_MULTICOLOR_FDX 0x3 +#define BCM_LED_MULTICOLOR_OFF 0x4 +#define BCM_LED_MULTICOLOR_ON 0x5 +#define BCM_LED_MULTICOLOR_ALT 0x6 +#define BCM_LED_MULTICOLOR_FLASH 0x7 +#define BCM_LED_MULTICOLOR_LINK 0x8 +#define BCM_LED_MULTICOLOR_ACT 0x9 +#define BCM_LED_MULTICOLOR_PROGRAM 0xa /* * BCM5482: Shadow registers From c493b09b2792336f471d2206be180a4b4c1fc9ba Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sun, 24 Mar 2019 00:21:03 +0100 Subject: [PATCH 852/855] net: devlink: skip info_get op call if it is not defined in dumpit In dumpit, unlike doit, the check for info_get op being defined is missing. Add it and avoid null pointer dereference in case driver does not define this op. 
Fixes: f9cf22882c60 ("devlink: add device information API") Reported-by: Ido Schimmel Signed-off-by: Jiri Pirko Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/core/devlink.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/core/devlink.c b/net/core/devlink.c index 78e22cea4cc7..da0a29f30885 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -3897,6 +3897,11 @@ static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg, continue; } + if (!devlink->ops->info_get) { + idx++; + continue; + } + mutex_lock(&devlink->lock); err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET, NETLINK_CB(cb->skb).portid, From 047a013f8d0af8299ce2d02af152de6a30165ccc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 25 Mar 2019 13:49:16 +0100 Subject: [PATCH 853/855] chelsio: use BUG() instead of BUG_ON(1) clang warns about possible bugs in a dead code branch after BUG_ON(1) when CONFIG_PROFILE_ALL_BRANCHES is enabled: drivers/net/ethernet/chelsio/cxgb4/sge.c:479:3: error: variable 'buf_size' is used uninitialized whenever 'if' condition is false [-Werror,-Wsometimes-uninitialized] BUG_ON(1); ^~~~~~~~~ include/asm-generic/bug.h:61:36: note: expanded from macro 'BUG_ON' #define BUG_ON(condition) do { if (unlikely(condition)) BUG(); } while (0) ^~~~~~~~~~~~~~~~~~~ include/linux/compiler.h:48:23: note: expanded from macro 'unlikely' # define unlikely(x) (__branch_check__(x, 0, __builtin_constant_p(x))) ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/net/ethernet/chelsio/cxgb4/sge.c:482:9: note: uninitialized use occurs here return buf_size; ^~~~~~~~ drivers/net/ethernet/chelsio/cxgb4/sge.c:479:3: note: remove the 'if' if its condition is always true BUG_ON(1); ^ include/asm-generic/bug.h:61:32: note: expanded from macro 'BUG_ON' #define BUG_ON(condition) do { if (unlikely(condition)) BUG(); } while (0) ^ drivers/net/ethernet/chelsio/cxgb4/sge.c:459:14: note: initialize the variable 'buf_size' to silence this warning int buf_size; ^ = 0 Use BUG() here to create simpler code that clang understands correctly. Signed-off-by: Arnd Bergmann Reviewed-by: Nick Desaulniers Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c | 2 +- drivers/net/ethernet/chelsio/cxgb4/sge.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c index 3130b43bba52..02959035ed3f 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c @@ -2620,7 +2620,7 @@ static inline struct port_info *ethqset2pinfo(struct adapter *adap, int qset) } /* should never happen! */ - BUG_ON(1); + BUG(); return NULL; } diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c index 88773ca58e6b..b3da81e90132 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c @@ -476,7 +476,7 @@ static inline int get_buf_size(struct adapter *adapter, break; default: - BUG_ON(1); + BUG(); } return buf_size; From 8c838f53e149871561a9261ac768a9c7071b43d0 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Mon, 25 Mar 2019 13:06:22 +0000 Subject: [PATCH 854/855] dpaa2-eth: fix race condition with bql frame accounting It might happen that Tx conf acknowledges a frame before it was subscribed in bql, as subscribing was previously done after the enqueue operation. 
This patch moves the netdev_tx_sent_queue call before the actual frame enqueue, so that this can never happen. Fixes: 569dac6a5a0d ("dpaa2-eth: bql support") Signed-off-by: Ioana Ciornei Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 1a68052abb94..dc339dc1adb2 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -815,6 +815,14 @@ static netdev_tx_t dpaa2_eth_tx(struct sk_buff *skb, struct net_device *net_dev) */ queue_mapping = skb_get_queue_mapping(skb); fq = &priv->fq[queue_mapping]; + + fd_len = dpaa2_fd_get_len(&fd); + nq = netdev_get_tx_queue(net_dev, queue_mapping); + netdev_tx_sent_queue(nq, fd_len); + + /* Everything that happens after this enqueues might race with + * the Tx confirmation callback for this frame + */ for (i = 0; i < DPAA2_ETH_ENQUEUE_RETRIES; i++) { err = priv->enqueue(priv, fq, &fd, 0); if (err != -EBUSY) @@ -825,13 +833,10 @@ static netdev_tx_t dpaa2_eth_tx(struct sk_buff *skb, struct net_device *net_dev) percpu_stats->tx_errors++; /* Clean up everything, including freeing the skb */ free_tx_fd(priv, fq, &fd, false); + netdev_tx_completed_queue(nq, 1, fd_len); } else { - fd_len = dpaa2_fd_get_len(&fd); percpu_stats->tx_packets++; percpu_stats->tx_bytes += fd_len; - - nq = netdev_get_tx_queue(net_dev, queue_mapping); - netdev_tx_sent_queue(nq, fd_len); } return NETDEV_TX_OK; From 01f2f5b82a2b523ae76af53f2ff43c48dde10a00 Mon Sep 17 00:00:00 2001 From: Alakesh Haloi Date: Tue, 26 Mar 2019 02:00:01 +0000 Subject: [PATCH 855/855] SUNRPC: fix uninitialized variable warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoid following compiler warning on uninitialized variable net/sunrpc/xprtsock.c: In function ‘xs_read_stream_request.constprop’: net/sunrpc/xprtsock.c:525:10: warning: ‘read’ may be used uninitialized in this function [-Wmaybe-uninitialized] return read; ^~~~ net/sunrpc/xprtsock.c:529:23: warning: ‘ret’ may be used uninitialized in this function [-Wmaybe-uninitialized] return ret < 0 ? ret : read; ~~~~~~~~~~~~~~^~~~~~ Signed-off-by: Alakesh Haloi Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 9359539907ba..732d4b57411a 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -495,8 +495,8 @@ xs_read_stream_request(struct sock_xprt *transport, struct msghdr *msg, int flags, struct rpc_rqst *req) { struct xdr_buf *buf = &req->rq_private_buf; - size_t want, read; - ssize_t ret; + size_t want, uninitialized_var(read); + ssize_t uninitialized_var(ret); xs_read_header(transport, buf);
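For context on why the uninitialized_var() annotation in the hunk above silences gcc's -Wmaybe-uninitialized: the macro is a thin self-assignment wrapper. The definitions sketched below are approximations of the compiler-gcc.h / compiler-clang.h forms from roughly this point in the kernel's history and should be checked against the tree rather than taken verbatim.

    /* Approximate definitions, for illustration only */
    #define uninitialized_var(x) x = x            /* gcc variant   */
    /* #define uninitialized_var(x) x = *(&(x))      clang variant */

    /* Mirroring the change above, the declaration
     *
     *     size_t want, uninitialized_var(read);
     *
     * expands to
     *
     *     size_t want, read = read;
     *
     * The self-assignment makes the compiler treat the variable as
     * initialized (normally without generating extra code), so the
     * false-positive warning in xs_read_stream_request() goes away.
     * The trade-off is that a genuine uninitialized use is hidden as
     * well, which is the usual argument against the macro.
     */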