From 2f884e6e688a0deb69e6c9552e51aef8b7e3f5f1 Mon Sep 17 00:00:00 2001 From: "Strashko, Grygorii" Date: Thu, 8 Dec 2016 17:33:10 -0600 Subject: [PATCH 01/56] irqchip/keystone: Fix "scheduling while atomic" on rt The below call chain generates "scheduling while atomic" backtrace and causes system crash when Keystone 2 IRQ chip driver is used with RT-kernel: gic_handle_irq() |-__handle_domain_irq() |-generic_handle_irq() |-keystone_irq_handler() |-regmap_read() |-regmap_lock_spinlock() |-rt_spin_lock() The reason is that Keystone driver dispatches IRQ using chained IRQ handler and accesses I/O memory through syscon->regmap(mmio) which is implemented as fast_io regmap and uses regular spinlocks for synchronization, but spinlocks transformed to rt_mutexes on RT. Hence, convert Keystone 2 IRQ driver to use generic irq handler instead of chained IRQ handler. This way it will be compatible with RT kernel where it will be forced thread IRQ handler while in non-RT kernel it still will be executed in HW IRQ context. Cc: Suman Anna Signed-off-by: Grygorii Strashko Tested-by: Suman Anna Link: https://lkml.kernel.org/r/20161208233310.10329-1-grygorii.strashko@ti.com Signed-off-by: Jason Cooper --- drivers/irqchip/irq-keystone.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/drivers/irqchip/irq-keystone.c b/drivers/irqchip/irq-keystone.c index 54a5e870a8f5..efbcf8435185 100644 --- a/drivers/irqchip/irq-keystone.c +++ b/drivers/irqchip/irq-keystone.c @@ -19,9 +19,9 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -39,6 +39,7 @@ struct keystone_irq_device { struct irq_domain *irqd; struct regmap *devctrl_regs; u32 devctrl_offset; + raw_spinlock_t wa_lock; }; static inline u32 keystone_irq_readl(struct keystone_irq_device *kirq) @@ -83,17 +84,15 @@ static void keystone_irq_ack(struct irq_data *d) /* nothing to do here */ } -static void keystone_irq_handler(struct irq_desc *desc) +static irqreturn_t keystone_irq_handler(int irq, void *keystone_irq) { - unsigned int irq = irq_desc_get_irq(desc); - struct keystone_irq_device *kirq = irq_desc_get_handler_data(desc); + struct keystone_irq_device *kirq = keystone_irq; + unsigned long wa_lock_flags; unsigned long pending; int src, virq; dev_dbg(kirq->dev, "start irq %d\n", irq); - chained_irq_enter(irq_desc_get_chip(desc), desc); - pending = keystone_irq_readl(kirq); keystone_irq_writel(kirq, pending); @@ -111,13 +110,15 @@ static void keystone_irq_handler(struct irq_desc *desc) if (!virq) dev_warn(kirq->dev, "spurious irq detected hwirq %d, virq %d\n", src, virq); + raw_spin_lock_irqsave(&kirq->wa_lock, wa_lock_flags); generic_handle_irq(virq); + raw_spin_unlock_irqrestore(&kirq->wa_lock, + wa_lock_flags); } } - chained_irq_exit(irq_desc_get_chip(desc), desc); - dev_dbg(kirq->dev, "end irq %d\n", irq); + return IRQ_HANDLED; } static int keystone_irq_map(struct irq_domain *h, unsigned int virq, @@ -182,9 +183,16 @@ static int keystone_irq_probe(struct platform_device *pdev) return -ENODEV; } + raw_spin_lock_init(&kirq->wa_lock); + platform_set_drvdata(pdev, kirq); - irq_set_chained_handler_and_data(kirq->irq, keystone_irq_handler, kirq); + ret = request_irq(kirq->irq, keystone_irq_handler, + 0, dev_name(dev), kirq); + if (ret) { + irq_domain_remove(kirq->irqd); + return ret; + } /* clear all source bits */ keystone_irq_writel(kirq, ~0x0); @@ -199,6 +207,8 @@ static int keystone_irq_remove(struct platform_device *pdev) struct keystone_irq_device *kirq = platform_get_drvdata(pdev); int hwirq; + free_irq(kirq->irq, kirq); + for (hwirq = 0; hwirq < KEYSTONE_N_IRQ; hwirq++) irq_dispose_mapping(irq_find_mapping(kirq->irqd, hwirq)); From 88e20c74ee020f9e0c99dfce0dd9aa61c3f0cca0 Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Tue, 27 Dec 2016 18:29:57 +0000 Subject: [PATCH 02/56] irqchip/mxs: Enable SKIP_SET_WAKE and MASK_ON_SUSPEND The ICOLL controller doesn't provide any facility to configure the wakeup sources. That's the reason why this implementation lacks the irq_set_wake implementation. But this prevent us from properly entering power management states like "suspend to idle". So enable the flags IRQCHIP_SKIP_SET_WAKE and IRQCHIP_MASK_ON_SUSPEND to let the irqchip core allows and handles the power management. Signed-off-by: Stefan Wahren Reviewed-by: Fabio Estevam Link: https://lkml.kernel.org/r/1482863397-11400-1-git-send-email-stefan.wahren@i2se.com Signed-off-by: Jason Cooper --- drivers/irqchip/irq-mxs.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/irqchip/irq-mxs.c b/drivers/irqchip/irq-mxs.c index 17304705f2cf..05fa9f7af53c 100644 --- a/drivers/irqchip/irq-mxs.c +++ b/drivers/irqchip/irq-mxs.c @@ -131,12 +131,16 @@ static struct irq_chip mxs_icoll_chip = { .irq_ack = icoll_ack_irq, .irq_mask = icoll_mask_irq, .irq_unmask = icoll_unmask_irq, + .flags = IRQCHIP_MASK_ON_SUSPEND | + IRQCHIP_SKIP_SET_WAKE, }; static struct irq_chip asm9260_icoll_chip = { .irq_ack = icoll_ack_irq, .irq_mask = asm9260_mask_irq, .irq_unmask = asm9260_unmask_irq, + .flags = IRQCHIP_MASK_ON_SUSPEND | + IRQCHIP_SKIP_SET_WAKE, }; asmlinkage void __exception_irq_entry icoll_handle_irq(struct pt_regs *regs) From 27873de99f2fecca0f6b257316489ef2a1d86ffd Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 23 Jan 2017 08:34:45 -0800 Subject: [PATCH 03/56] scsi: qla2xxx: Fix a recently introduced memory leak qla2x00_probe_one() allocates IRQs before it initializes rsp_q_map so IRQs must be freed even if rsp_q_map allocation did not occur. This was detected by kmemleak. Fixes: 4fa183455988 ("scsi: qla2xxx: Utilize pci_alloc_irq_vectors/pci_free_irq_vectors calls") Signed-off-by: Bart Van Assche Cc: Michael Hernandez Cc: Himanshu Madhani Cc: Christoph Hellwig Cc: Reviewed-by: Christoph Hellwig Acked-By: Himanshu Madhani Reviewed-by: Johannes Thumshirn Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_isr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c index 5815403d1d65..3116bf390b06 100644 --- a/drivers/scsi/qla2xxx/qla_isr.c +++ b/drivers/scsi/qla2xxx/qla_isr.c @@ -3237,7 +3237,7 @@ qla2x00_free_irqs(scsi_qla_host_t *vha) * from a probe failure context. */ if (!ha->rsp_q_map || !ha->rsp_q_map[0]) - return; + goto free_irqs; rsp = ha->rsp_q_map[0]; if (ha->flags.msix_enabled) { @@ -3257,6 +3257,7 @@ qla2x00_free_irqs(scsi_qla_host_t *vha) free_irq(pci_irq_vector(ha->pdev, 0), rsp); } +free_irqs: pci_free_irq_vectors(ha->pdev); } From 2780f3c8f0233de90b6b47a23fc422b7780c5436 Mon Sep 17 00:00:00 2001 From: Mauricio Faria de Oliveira Date: Wed, 25 Jan 2017 22:07:06 -0200 Subject: [PATCH 04/56] scsi: qla2xxx: Avoid that issuing a LIP triggers a kernel crash Avoid that issuing a LIP as follows: find /sys -name 'issue_lip'|while read f; do echo 1 > $f; done triggers the following: BUG: unable to handle kernel NULL pointer dereference at (null) Call Trace: qla2x00_abort_all_cmds+0xed/0x140 [qla2xxx] qla2x00_abort_isp_cleanup+0x1e3/0x280 [qla2xxx] qla2x00_abort_isp+0xef/0x690 [qla2xxx] qla2x00_do_dpc+0x36c/0x880 [qla2xxx] kthread+0x10c/0x140 [mkp: consolidated Mauricio's and Bart's fixes] Signed-off-by: Mauricio Faria de Oliveira Reported-by: Bart Van Assche Fixes: 1535aa75a3d8 ("qla2xxx: fix invalid DMA access after command aborts in PCI device remove") Cc: Himanshu Madhani Cc: Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_os.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index ad4edc13ebcf..1cd3c48a6350 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -1616,7 +1616,7 @@ qla2x00_abort_all_cmds(scsi_qla_host_t *vha, int res) /* Don't abort commands in adapter during EEH * recovery as it's not accessible/responding. */ - if (!ha->flags.eeh_busy) { + if (GET_CMD_SP(sp) && !ha->flags.eeh_busy) { /* Get a reference to the sp and drop the lock. * The reference ensures this sp->done() call * - and not the call in qla2xxx_eh_abort() - From f2e767bb5d6ee0d988cb7d4e54b0b21175802b6b Mon Sep 17 00:00:00 2001 From: Ram Pai Date: Thu, 26 Jan 2017 16:37:01 -0200 Subject: [PATCH 05/56] scsi: mpt3sas: Force request partial completion alignment The firmware or device, possibly under a heavy I/O load, can return on a partial unaligned boundary. Scsi-ml expects these requests to be completed on an alignment boundary. Scsi-ml blindly requeues the I/O without checking the alignment boundary of the I/O request for the remaining bytes. This leads to errors, since devices cannot perform non-aligned read/write operations. This patch fixes the issue in the driver. It aligns unaligned completions of FS requests, by truncating them to the nearest alignment boundary. [mkp: simplified if statement] Reported-by: Mauricio Faria De Oliveira Signed-off-by: Guilherme G. Piccoli Signed-off-by: Ram Pai Acked-by: Sreekanth Reddy Signed-off-by: Martin K. Petersen --- drivers/scsi/mpt3sas/mpt3sas_scsih.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c index 75f3fce1c867..268103182d34 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c +++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c @@ -4657,6 +4657,7 @@ _scsih_io_done(struct MPT3SAS_ADAPTER *ioc, u16 smid, u8 msix_index, u32 reply) struct MPT3SAS_DEVICE *sas_device_priv_data; u32 response_code = 0; unsigned long flags; + unsigned int sector_sz; mpi_reply = mpt3sas_base_get_reply_virt_addr(ioc, reply); scmd = _scsih_scsi_lookup_get_clear(ioc, smid); @@ -4715,6 +4716,20 @@ _scsih_io_done(struct MPT3SAS_ADAPTER *ioc, u16 smid, u8 msix_index, u32 reply) } xfer_cnt = le32_to_cpu(mpi_reply->TransferCount); + + /* In case of bogus fw or device, we could end up having + * unaligned partial completion. We can force alignment here, + * then scsi-ml does not need to handle this misbehavior. + */ + sector_sz = scmd->device->sector_size; + if (unlikely(scmd->request->cmd_type == REQ_TYPE_FS && sector_sz && + xfer_cnt % sector_sz)) { + sdev_printk(KERN_INFO, scmd->device, + "unaligned partial completion avoided (xfer_cnt=%u, sector_sz=%u)\n", + xfer_cnt, sector_sz); + xfer_cnt = round_down(xfer_cnt, sector_sz); + } + scsi_set_resid(scmd, scsi_bufflen(scmd) - xfer_cnt); if (ioc_status & MPI2_IOCSTATUS_FLAG_LOG_INFO_AVAILABLE) log_info = le32_to_cpu(mpi_reply->IOCLogInfo); From 8381cdd0e32dd748bd34ca3ace476949948bd793 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 18 Jan 2017 14:14:56 +0900 Subject: [PATCH 06/56] perf diff: Fix segfault on 'perf diff -o N' option The -o/--order option is to select column number to sort a diff result. It does the job by adding a hpp field at the beginning of the sort list. But it should not be added to the output field list as it has no callbacks required by a output field. During the setup_sorting(), the perf_hpp__setup_output_field() appends the given sort keys to the output field if it's not there already. Originally it was checked by fmt->list being non-empty. But commit 3f931f2c4274 ("perf hists: Make hpp setup function generic") changed it to check the ->equal callback. Anyways, we don't need to add the pseudo hpp field to the output field list since it won't be used for output. So just skip fields if they have no ->color or ->entry callbacks. Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Peter Zijlstra Fixes: 3f931f2c4274 ("perf hists: Make hpp setup function generic") Link: http://lkml.kernel.org/r/20170118051457.30946-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/hist.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index 37388397b5bc..4ec79b2f9416 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -560,6 +560,10 @@ void perf_hpp__setup_output_field(struct perf_hpp_list *list) perf_hpp_list__for_each_sort_list(list, fmt) { struct perf_hpp_fmt *pos; + /* skip sort-only fields ("sort_compute" in perf diff) */ + if (!fmt->entry && !fmt->color) + continue; + perf_hpp_list__for_each_format(list, pos) { if (fmt_equal(fmt, pos)) goto next; From a1c9f97f0b64e6337d9cfcc08c134450934fdd90 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 18 Jan 2017 14:14:57 +0900 Subject: [PATCH 07/56] perf diff: Fix -o/--order option behavior (again) Commit 21e6d8428664 ("perf diff: Use perf_hpp__register_sort_field interface") changed list_add() to perf_hpp__register_sort_field(). This resulted in a behavior change since the field was added to the tail instead of the head. So the -o option is mostly ignored due to its order in the list. This patch fixes it by adding perf_hpp__prepend_sort_field(). Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Peter Zijlstra Fixes: 21e6d8428664 ("perf diff: Use perf_hpp__register_sort_field interface") Link: http://lkml.kernel.org/r/20170118051457.30946-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-diff.c | 2 +- tools/perf/ui/hist.c | 6 ++++++ tools/perf/util/hist.h | 7 +++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index 9ff0db4e2d0c..933aeec46f4a 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -1199,7 +1199,7 @@ static int ui_init(void) BUG_ON(1); } - perf_hpp__register_sort_field(fmt); + perf_hpp__prepend_sort_field(fmt); return 0; } diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index 4ec79b2f9416..18cfcdc90356 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -521,6 +521,12 @@ void perf_hpp_list__register_sort_field(struct perf_hpp_list *list, list_add_tail(&format->sort_list, &list->sorts); } +void perf_hpp_list__prepend_sort_field(struct perf_hpp_list *list, + struct perf_hpp_fmt *format) +{ + list_add(&format->sort_list, &list->sorts); +} + void perf_hpp__column_unregister(struct perf_hpp_fmt *format) { list_del(&format->list); diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index d4b6514eeef5..28c216e3d5b7 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -283,6 +283,8 @@ void perf_hpp_list__column_register(struct perf_hpp_list *list, struct perf_hpp_fmt *format); void perf_hpp_list__register_sort_field(struct perf_hpp_list *list, struct perf_hpp_fmt *format); +void perf_hpp_list__prepend_sort_field(struct perf_hpp_list *list, + struct perf_hpp_fmt *format); static inline void perf_hpp__column_register(struct perf_hpp_fmt *format) { @@ -294,6 +296,11 @@ static inline void perf_hpp__register_sort_field(struct perf_hpp_fmt *format) perf_hpp_list__register_sort_field(&perf_hpp_list, format); } +static inline void perf_hpp__prepend_sort_field(struct perf_hpp_fmt *format) +{ + perf_hpp_list__prepend_sort_field(&perf_hpp_list, format); +} + #define perf_hpp_list__for_each_format(_list, format) \ list_for_each_entry(format, &(_list)->fields, list) From aa33b9b9a2ebb00d33c83a5312d4fbf2d5aeba36 Mon Sep 17 00:00:00 2001 From: Krister Johansen Date: Thu, 5 Jan 2017 22:23:31 -0800 Subject: [PATCH 08/56] perf callchain: Reference count maps If dso__load_kcore frees all of the existing maps, but one has already been attached to a callchain cursor node, then we can get a SIGSEGV in any function that happens to try to use this invalid cursor. Use the existing map refcount mechanism to forestall cleanup of a map until the cursor iterates past the node. Signed-off-by: Krister Johansen Tested-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: stable@kernel.org Fixes: 84c2cafa2889 ("perf tools: Reference count struct map") Link: http://lkml.kernel.org/r/20170106062331.GB2707@templeofstupid.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/callchain.c | 11 +++++++++-- tools/perf/util/callchain.h | 6 ++++++ tools/perf/util/hist.c | 7 +++++++ 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 42922512c1c6..8b610dd9e2f6 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -437,7 +437,7 @@ fill_node(struct callchain_node *node, struct callchain_cursor *cursor) } call->ip = cursor_node->ip; call->ms.sym = cursor_node->sym; - call->ms.map = cursor_node->map; + call->ms.map = map__get(cursor_node->map); if (cursor_node->branch) { call->branch_count = 1; @@ -477,6 +477,7 @@ add_child(struct callchain_node *parent, list_for_each_entry_safe(call, tmp, &new->val, list) { list_del(&call->list); + map__zput(call->ms.map); free(call); } free(new); @@ -761,6 +762,7 @@ merge_chain_branch(struct callchain_cursor *cursor, list->ms.map, list->ms.sym, false, NULL, 0, 0); list_del(&list->list); + map__zput(list->ms.map); free(list); } @@ -811,7 +813,8 @@ int callchain_cursor_append(struct callchain_cursor *cursor, } node->ip = ip; - node->map = map; + map__zput(node->map); + node->map = map__get(map); node->sym = sym; node->branch = branch; node->nr_loop_iter = nr_loop_iter; @@ -1142,11 +1145,13 @@ static void free_callchain_node(struct callchain_node *node) list_for_each_entry_safe(list, tmp, &node->parent_val, list) { list_del(&list->list); + map__zput(list->ms.map); free(list); } list_for_each_entry_safe(list, tmp, &node->val, list) { list_del(&list->list); + map__zput(list->ms.map); free(list); } @@ -1210,6 +1215,7 @@ int callchain_node__make_parent_list(struct callchain_node *node) goto out; *new = *chain; new->has_children = false; + map__get(new->ms.map); list_add_tail(&new->list, &head); } parent = parent->parent; @@ -1230,6 +1236,7 @@ int callchain_node__make_parent_list(struct callchain_node *node) out: list_for_each_entry_safe(chain, new, &head, list) { list_del(&chain->list); + map__zput(chain->ms.map); free(chain); } return -ENOMEM; diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index 35c8e379530f..4f4b60f1558a 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -5,6 +5,7 @@ #include #include #include "event.h" +#include "map.h" #include "symbol.h" #define HELP_PAD "\t\t\t\t" @@ -184,8 +185,13 @@ int callchain_merge(struct callchain_cursor *cursor, */ static inline void callchain_cursor_reset(struct callchain_cursor *cursor) { + struct callchain_cursor_node *node; + cursor->nr = 0; cursor->last = &cursor->first; + + for (node = cursor->first; node != NULL; node = node->next) + map__zput(node->map); } int callchain_cursor_append(struct callchain_cursor *cursor, u64 ip, diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 6770a9645609..7d1b7d33e644 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1,6 +1,7 @@ #include "util.h" #include "build-id.h" #include "hist.h" +#include "map.h" #include "session.h" #include "sort.h" #include "evlist.h" @@ -1019,6 +1020,10 @@ int hist_entry_iter__add(struct hist_entry_iter *iter, struct addr_location *al, int max_stack_depth, void *arg) { int err, err2; + struct map *alm = NULL; + + if (al && al->map) + alm = map__get(al->map); err = sample__resolve_callchain(iter->sample, &callchain_cursor, &iter->parent, iter->evsel, al, max_stack_depth); @@ -1058,6 +1063,8 @@ out: if (!err) err = err2; + map__put(alm); + return err; } From 79a8b9aa388b0620cc1d525d7c0f0d9a8a85e08e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 5 Feb 2017 11:50:21 +0100 Subject: [PATCH 09/56] x86/CPU/AMD: Bring back Compute Unit ID Commit: a33d331761bc ("x86/CPU/AMD: Fix Bulldozer topology") restored the initial approach we had with the Fam15h topology of enumerating CU (Compute Unit) threads as cores. And this is still correct - they're beefier than HT threads but still have some shared functionality. Our current approach has a problem with the Mad Max Steam game, for example. Yves Dionne reported a certain "choppiness" while playing on v4.9.5. That problem stems most likely from the fact that the CU threads share resources within one CU and when we schedule to a thread of a different compute unit, this incurs latency due to migrating the working set to a different CU through the caches. When the thread siblings mask mirrors that aspect of the CUs and threads, the scheduler pays attention to it and tries to schedule within one CU first. Which takes care of the latency, of course. Reported-by: Yves Dionne Signed-off-by: Borislav Petkov Cc: # 4.9 Cc: Brice Goglin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yazen Ghannam Link: http://lkml.kernel.org/r/20170205105022.8705-1-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 1 + arch/x86/kernel/cpu/amd.c | 9 ++++++++- arch/x86/kernel/cpu/common.c | 1 + arch/x86/kernel/smpboot.c | 12 +++++++++--- 4 files changed, 19 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 1be64da0384e..e6cfe7ba2d65 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -104,6 +104,7 @@ struct cpuinfo_x86 { __u8 x86_phys_bits; /* CPUID returned core id bits: */ __u8 x86_coreid_bits; + __u8 cu_id; /* Max extended CPUID function supported: */ __u32 extended_cpuid_level; /* Maximum supported CPUID level, -1=no CPUID: */ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 1d3167269a67..20dc44d1e6be 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -309,8 +309,15 @@ static void amd_get_topology(struct cpuinfo_x86 *c) /* get information required for multi-node processors */ if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + u32 eax, ebx, ecx, edx; - node_id = cpuid_ecx(0x8000001e) & 7; + cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); + + node_id = ecx & 0xff; + smp_num_siblings = ((ebx >> 8) & 0xff) + 1; + + if (c->x86 == 0x15) + c->cu_id = ebx & 0xff; /* * We may have multiple LLCs if L3 caches exist, so check if we diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 9bab7a8a4293..ede03e849a8b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1015,6 +1015,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) c->x86_model_id[0] = '\0'; /* Unset */ c->x86_max_cores = 1; c->x86_coreid_bits = 0; + c->cu_id = 0xff; #ifdef CONFIG_X86_64 c->x86_clflush_size = 64; c->x86_phys_bits = 36; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 46732dc3b73c..99b920d0e516 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -433,9 +433,15 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) int cpu1 = c->cpu_index, cpu2 = o->cpu_index; if (c->phys_proc_id == o->phys_proc_id && - per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2) && - c->cpu_core_id == o->cpu_core_id) - return topology_sane(c, o, "smt"); + per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) { + if (c->cpu_core_id == o->cpu_core_id) + return topology_sane(c, o, "smt"); + + if ((c->cu_id != 0xff) && + (o->cu_id != 0xff) && + (c->cu_id == o->cu_id)) + return topology_sane(c, o, "smt"); + } } else if (c->phys_proc_id == o->phys_proc_id && c->cpu_core_id == o->cpu_core_id) { From 08b259631b5a1d912af4832847b5642f377d9101 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Sun, 5 Feb 2017 11:50:22 +0100 Subject: [PATCH 10/56] x86/CPU/AMD: Fix Zen SMT topology After: a33d331761bc ("x86/CPU/AMD: Fix Bulldozer topology") our SMT scheduling topology for Fam17h systems is broken, because the ThreadId is included in the ApicId when SMT is enabled. So, without further decoding cpu_core_id is unique for each thread rather than the same for threads on the same core. This didn't affect systems with SMT disabled. Make cpu_core_id be what it is defined to be. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: # 4.9 Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170205105022.8705-2-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 20dc44d1e6be..2b4cf04239b6 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -319,6 +319,13 @@ static void amd_get_topology(struct cpuinfo_x86 *c) if (c->x86 == 0x15) c->cu_id = ebx & 0xff; + if (c->x86 >= 0x17) { + c->cpu_core_id = ebx & 0xff; + + if (smp_num_siblings > 1) + c->x86_max_cores /= smp_num_siblings; + } + /* * We may have multiple LLCs if L3 caches exist, so check if we * have an L3 cache by looking at the L3 cache CPUID leaf. From bfeda41d06d85ad9d52f2413cfc2b77be5022f75 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 7 Feb 2017 15:33:20 -0800 Subject: [PATCH 11/56] stacktrace, lockdep: Fix address, newline ugliness Since KERN_CONT became meaningful again, lockdep stack traces have had annoying extra newlines, like this: [ 5.561122] -> #1 (B){+.+...}: [ 5.561528] [ 5.561532] [] lock_acquire+0xc3/0x210 [ 5.562178] [ 5.562181] [] mutex_lock_nested+0x74/0x6d0 [ 5.562861] [ 5.562880] [] init_btrfs_fs+0x21/0x196 [btrfs] [ 5.563717] [ 5.563721] [] do_one_initcall+0x52/0x1b0 [ 5.564554] [ 5.564559] [] do_init_module+0x5f/0x209 [ 5.565357] [ 5.565361] [] load_module+0x218d/0x2b80 [ 5.566020] [ 5.566021] [] SyS_finit_module+0xeb/0x120 [ 5.566694] [ 5.566696] [] entry_SYSCALL_64_fastpath+0x1f/0xc2 That's happening because each printk() call now gets printed on its own line, and we do a separate call to print the spaces before the symbol. Fix it by doing the printk() directly instead of using the print_ip_sym() helper. Additionally, the symbol address isn't very helpful, so let's get rid of that, too. The final result looks like this: [ 5.194518] -> #1 (B){+.+...}: [ 5.195002] lock_acquire+0xc3/0x210 [ 5.195439] mutex_lock_nested+0x74/0x6d0 [ 5.196491] do_one_initcall+0x52/0x1b0 [ 5.196939] do_init_module+0x5f/0x209 [ 5.197355] load_module+0x218d/0x2b80 [ 5.197792] SyS_finit_module+0xeb/0x120 [ 5.198251] entry_SYSCALL_64_fastpath+0x1f/0xc2 Suggested-by: Linus Torvalds Signed-off-by: Omar Sandoval Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-team@fb.com Fixes: 4bcc595ccd80 ("printk: reinstate KERN_CONT for printing continuation lines") Link: http://lkml.kernel.org/r/43b4e114724b2bdb0308fa86cb33aa07d3d67fad.1486510315.git.osandov@fb.com Signed-off-by: Ingo Molnar --- kernel/stacktrace.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index b6e4c16377c7..9c15a9124e83 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -18,10 +18,8 @@ void print_stack_trace(struct stack_trace *trace, int spaces) if (WARN_ON(!trace->entries)) return; - for (i = 0; i < trace->nr_entries; i++) { - printk("%*c", 1 + spaces, ' '); - print_ip_sym(trace->entries[i]); - } + for (i = 0; i < trace->nr_entries; i++) + printk("%*c%pS\n", 1 + spaces, ' ', (void *)trace->entries[i]); } EXPORT_SYMBOL_GPL(print_stack_trace); @@ -29,7 +27,6 @@ int snprint_stack_trace(char *buf, size_t size, struct stack_trace *trace, int spaces) { int i; - unsigned long ip; int generated; int total = 0; @@ -37,9 +34,8 @@ int snprint_stack_trace(char *buf, size_t size, return 0; for (i = 0; i < trace->nr_entries; i++) { - ip = trace->entries[i]; - generated = snprintf(buf, size, "%*c[<%p>] %pS\n", - 1 + spaces, ' ', (void *) ip, (void *) ip); + generated = snprintf(buf, size, "%*c%pS\n", 1 + spaces, ' ', + (void *)trace->entries[i]); total += generated; From 2a362249187a8d0f6d942d6e1d763d150a296f47 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 6 Feb 2017 19:39:09 -0500 Subject: [PATCH 12/56] btrfs: fix btrfs_compat_ioctl failures on non-compat ioctls Commit 4c63c2454ef incorrectly assumed that returning -ENOIOCTLCMD would cause the native ioctl to be called. The ->compat_ioctl callback is expected to handle all ioctls, not just compat variants. As a result, when using 32-bit userspace on 64-bit kernels, everything except those three ioctls would return -ENOTTY. Fixes: 4c63c2454ef ("btrfs: bugfix: handle FS_IOC32_{GETFLAGS,SETFLAGS,GETVERSION} in btrfs_ioctl") Cc: stable@vger.kernel.org Signed-off-by: Jeff Mahoney Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0a6902555e65..0c77cad89e87 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -5665,6 +5665,10 @@ long btrfs_ioctl(struct file *file, unsigned int #ifdef CONFIG_COMPAT long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { + /* + * These all access 32-bit values anyway so no further + * handling is necessary. + */ switch (cmd) { case FS_IOC32_GETFLAGS: cmd = FS_IOC_GETFLAGS; @@ -5675,8 +5679,6 @@ long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case FS_IOC32_GETVERSION: cmd = FS_IOC_GETVERSION; break; - default: - return -ENOIOCTLCMD; } return btrfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); From ffdadd68af5a397b8a52289ab39d62e1acb39e63 Mon Sep 17 00:00:00 2001 From: ojab Date: Wed, 28 Dec 2016 11:05:24 +0000 Subject: [PATCH 13/56] scsi: mpt3sas: disable ASPM for MPI2 controllers MPI2 controllers sometimes got lost (i.e. disappear from /sys/bus/pci/devices) if ASMP is enabled. Signed-off-by: Slava Kardakov Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=60644 Cc: Acked-by: Sreekanth Reddy Signed-off-by: Martin K. Petersen --- drivers/scsi/mpt3sas/mpt3sas_scsih.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c index 268103182d34..0b5b423b1db0 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c +++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include @@ -8761,6 +8762,8 @@ _scsih_probe(struct pci_dev *pdev, const struct pci_device_id *id) switch (hba_mpi_version) { case MPI2_VERSION: + pci_disable_link_state(pdev, PCIE_LINK_STATE_L0S | + PCIE_LINK_STATE_L1 | PCIE_LINK_STATE_CLKPM); /* Use mpt2sas driver host template for SAS 2.0 HBA's */ shost = scsi_host_alloc(&mpt2sas_driver_template, sizeof(struct MPT3SAS_ADAPTER)); From 8af8e1c22f9994bb1849c01d66c24fe23f9bc9a0 Mon Sep 17 00:00:00 2001 From: Dave Carroll Date: Thu, 9 Feb 2017 11:04:47 -0700 Subject: [PATCH 14/56] scsi: aacraid: Fix INTx/MSI-x issue with older controllers commit 78cbccd3bd68 ("aacraid: Fix for KDUMP driver hang") caused a problem on older controllers which do not support MSI-x (namely ASR3405,ASR3805). This patch conditionalizes the previous patch to controllers which support MSI-x Cc: # v4.7+ Fixes: 78cbccd3bd68 ("aacraid: Fix for KDUMP driver hang") Reported-by: Arkadiusz Miskiewicz Signed-off-by: Dave Carroll Reviewed-by: Raghava Aditya Renukunta Signed-off-by: Martin K. Petersen --- drivers/scsi/aacraid/comminit.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/aacraid/comminit.c b/drivers/scsi/aacraid/comminit.c index 4f56b1003cc7..5b48bedd7c38 100644 --- a/drivers/scsi/aacraid/comminit.c +++ b/drivers/scsi/aacraid/comminit.c @@ -50,9 +50,13 @@ struct aac_common aac_config = { static inline int aac_is_msix_mode(struct aac_dev *dev) { - u32 status; + u32 status = 0; - status = src_readl(dev, MUnit.OMR); + if (dev->pdev->device == PMC_DEVICE_S6 || + dev->pdev->device == PMC_DEVICE_S7 || + dev->pdev->device == PMC_DEVICE_S8) { + status = src_readl(dev, MUnit.OMR); + } return (status & AAC_INT_MODE_MSIX); } From 2dfa6688aafdc3f74efeb1cf05fb871465d67f79 Mon Sep 17 00:00:00 2001 From: Steffen Maier Date: Wed, 8 Feb 2017 15:34:22 +0100 Subject: [PATCH 15/56] scsi: zfcp: fix use-after-free by not tracing WKA port open/close on failed send Dan Carpenter kindly reported: The patch d27a7cb91960: "zfcp: trace on request for open and close of WKA port" from Aug 10, 2016, leads to the following static checker warning: drivers/s390/scsi/zfcp_fsf.c:1615 zfcp_fsf_open_wka_port() warn: 'req' was already freed. drivers/s390/scsi/zfcp_fsf.c 1609 zfcp_fsf_start_timer(req, ZFCP_FSF_REQUEST_TIMEOUT); 1610 retval = zfcp_fsf_req_send(req); 1611 if (retval) 1612 zfcp_fsf_req_free(req); ^^^ Freed. 1613 out: 1614 spin_unlock_irq(&qdio->req_q_lock); 1615 if (req && !IS_ERR(req)) 1616 zfcp_dbf_rec_run_wka("fsowp_1", wka_port, req->req_id); ^^^^^^^^^^^ Use after free. 1617 return retval; 1618 } Same thing for zfcp_fsf_close_wka_port() as well. Rather than relying on req being NULL (or ERR_PTR) for all cases where we don't want to trace or should not trace, simply check retval which is unconditionally initialized with -EIO != 0 and it can only become 0 on successful retval = zfcp_fsf_req_send(req). With that we can also remove the then again unnecessary unconditional initialization of req which was introduced with that earlier commit. Reported-by: Dan Carpenter Suggested-by: Benjamin Block Signed-off-by: Steffen Maier Fixes: d27a7cb91960 ("zfcp: trace on request for open and close of WKA port") Cc: #2.6.38+ Reviewed-by: Benjamin Block Reviewed-by: Jens Remus Signed-off-by: Martin K. Petersen --- drivers/s390/scsi/zfcp_fsf.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c index 75f820ca17b7..27ff38f839fc 100644 --- a/drivers/s390/scsi/zfcp_fsf.c +++ b/drivers/s390/scsi/zfcp_fsf.c @@ -1583,7 +1583,7 @@ out: int zfcp_fsf_open_wka_port(struct zfcp_fc_wka_port *wka_port) { struct zfcp_qdio *qdio = wka_port->adapter->qdio; - struct zfcp_fsf_req *req = NULL; + struct zfcp_fsf_req *req; int retval = -EIO; spin_lock_irq(&qdio->req_q_lock); @@ -1612,7 +1612,7 @@ int zfcp_fsf_open_wka_port(struct zfcp_fc_wka_port *wka_port) zfcp_fsf_req_free(req); out: spin_unlock_irq(&qdio->req_q_lock); - if (req && !IS_ERR(req)) + if (!retval) zfcp_dbf_rec_run_wka("fsowp_1", wka_port, req->req_id); return retval; } @@ -1638,7 +1638,7 @@ static void zfcp_fsf_close_wka_port_handler(struct zfcp_fsf_req *req) int zfcp_fsf_close_wka_port(struct zfcp_fc_wka_port *wka_port) { struct zfcp_qdio *qdio = wka_port->adapter->qdio; - struct zfcp_fsf_req *req = NULL; + struct zfcp_fsf_req *req; int retval = -EIO; spin_lock_irq(&qdio->req_q_lock); @@ -1667,7 +1667,7 @@ int zfcp_fsf_close_wka_port(struct zfcp_fc_wka_port *wka_port) zfcp_fsf_req_free(req); out: spin_unlock_irq(&qdio->req_q_lock); - if (req && !IS_ERR(req)) + if (!retval) zfcp_dbf_rec_run_wka("fscwp_1", wka_port, req->req_id); return retval; } From 451d24d1e5f40bad000fa9abe36ddb16fc9928cb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 31 Jan 2017 11:27:10 +0100 Subject: [PATCH 16/56] perf/core: Fix crash in perf_event_read() Alexei had his box explode because doing read() on a package (rapl/uncore) event that isn't currently scheduled in ends up doing an out-of-bounds load. Rework the code to more explicitly deal with event->oncpu being -1. Reported-by: Alexei Starovoitov Tested-by: Alexei Starovoitov Tested-by: David Carrillo-Cisneros Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: eranian@google.com Fixes: d6a2f9035bfc ("perf/core: Introduce PMU_EV_CAP_READ_ACTIVE_PKG") Link: http://lkml.kernel.org/r/20170131102710.GL6515@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index e5aaa806702d..e235bb991bdd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3487,14 +3487,15 @@ struct perf_read_data { int ret; }; -static int find_cpu_to_read(struct perf_event *event, int local_cpu) +static int __perf_event_read_cpu(struct perf_event *event, int event_cpu) { - int event_cpu = event->oncpu; u16 local_pkg, event_pkg; if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { - event_pkg = topology_physical_package_id(event_cpu); - local_pkg = topology_physical_package_id(local_cpu); + int local_cpu = smp_processor_id(); + + event_pkg = topology_physical_package_id(event_cpu); + local_pkg = topology_physical_package_id(local_cpu); if (event_pkg == local_pkg) return local_cpu; @@ -3624,7 +3625,7 @@ u64 perf_event_read_local(struct perf_event *event) static int perf_event_read(struct perf_event *event, bool group) { - int ret = 0, cpu_to_read, local_cpu; + int event_cpu, ret = 0; /* * If event is enabled and currently active on a CPU, update the @@ -3637,21 +3638,25 @@ static int perf_event_read(struct perf_event *event, bool group) .ret = 0, }; - local_cpu = get_cpu(); - cpu_to_read = find_cpu_to_read(event, local_cpu); - put_cpu(); + event_cpu = READ_ONCE(event->oncpu); + if ((unsigned)event_cpu >= nr_cpu_ids) + return 0; + + preempt_disable(); + event_cpu = __perf_event_read_cpu(event, event_cpu); /* * Purposely ignore the smp_call_function_single() return * value. * - * If event->oncpu isn't a valid CPU it means the event got + * If event_cpu isn't a valid CPU it means the event got * scheduled out and that will have updated the event count. * * Therefore, either way, we'll have an up-to-date event count * after this. */ - (void)smp_call_function_single(cpu_to_read, __perf_event_read, &data, 1); + (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1); + preempt_enable(); ret = data.ret; } else if (event->state == PERF_EVENT_STATE_INACTIVE) { struct perf_event_context *ctx = event->ctx; From 7bdb59f1ad474bd7161adc8f923cdef10f2638d1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 7 Feb 2017 17:44:54 +0100 Subject: [PATCH 17/56] tick/nohz: Fix possible missing clock reprog after tick soft restart ts->next_tick keeps track of the next tick deadline in order to optimize clock programmation on irq exit and avoid redundant clock device writes. Now if ts->next_tick missed an update, we may spuriously miss a clock reprog later as the nohz code is fooled by an obsolete next_tick value. This is what happens here on a specific path: when we observe an expired timer from the nohz update code on irq exit, we perform a soft tick restart which simply fires the closest possible tick without actually exiting the nohz mode and restoring a periodic state. But we forget to update ts->next_tick accordingly. As a result, after the next tick resulting from such soft tick restart, the nohz code sees a stale value on ts->next_tick which doesn't match the clock deadline that just expired. If that obsolete ts->next_tick value happens to collide with the actual next tick deadline to be scheduled, we may spuriously bypass the clock reprogramming. In the worst case, the tick may never fire again. Fix this with a ts->next_tick reset on soft tick restart. Signed-off-by: Frederic Weisbecker Reviewed: Wanpeng Li Acked-by: Rik van Riel Cc: Peter Zijlstra Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1486485894-29173-1-git-send-email-fweisbec@gmail.com Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 74e0388cc88d..fc6f740d0277 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -725,6 +725,11 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, */ if (delta == 0) { tick_nohz_restart(ts, now); + /* + * Make sure next tick stop doesn't get fooled by past + * clock deadline + */ + ts->next_tick = 0; goto out; } } From f2e04214ef7f7e49d1e06109ad1b2718155dab25 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 9 Feb 2017 16:08:41 +0100 Subject: [PATCH 18/56] x86/tsc: Avoid the large time jump when sanitizing TSC ADJUST Olof reported that on a machine which has a BIOS wreckaged TSC the timestamps in dmesg are making a large jump because the TSC value is jumping forward after resetting the TSC ADJUST register to a sane value. This can be avoided by calling the TSC ADJUST saniziting function before initializing the per cpu sched clock machinery. That takes the offset into account and avoid the time jump. What cannot be avoided is that the 'Firmware Bug' warnings on the secondary CPUs are printed with the large time offsets because it would be too much effort and ugly hackery to print those warnings into a buffer and emit them after the adjustemt on the starting CPUs. It's a firmware bug and should be fixed in firmware. The weird timestamps are collateral damage and just illustrate the sillyness of the BIOS folks: [ 0.397445] smp: Bringing up secondary CPUs ... [ 0.402100] x86: Booting SMP configuration: [ 0.406343] .... node #0, CPUs: #1 [1265776479.930667] [Firmware Bug]: TSC ADJUST differs: Reference CPU0: -2978888639075328 CPU1: -2978888639183101 [1265776479.944664] TSC ADJUST synchronize: Reference CPU0: 0 CPU1: -2978888639183101 [ 0.508119] #2 [1265776480.032346] [Firmware Bug]: TSC ADJUST differs: Reference CPU0: -2978888639075328 CPU2: -2978888639183677 [1265776480.044192] TSC ADJUST synchronize: Reference CPU0: 0 CPU2: -2978888639183677 [ 0.607643] #3 [1265776480.131874] [Firmware Bug]: TSC ADJUST differs: Reference CPU0: -2978888639075328 CPU3: -2978888639184530 [1265776480.143720] TSC ADJUST synchronize: Reference CPU0: 0 CPU3: -2978888639184530 [ 0.707108] smp: Brought up 1 node, 4 CPUs [ 0.711271] smpboot: Total of 4 processors activated (21698.88 BogoMIPS) Reported-by: Olof Johansson Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170209151231.411460506@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index e41af597aed8..37e7cf544e51 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1356,6 +1356,9 @@ void __init tsc_init(void) (unsigned long)cpu_khz / 1000, (unsigned long)cpu_khz % 1000); + /* Sanitize TSC ADJUST before cyc2ns gets initialized */ + tsc_store_and_check_tsc_adjust(true); + /* * Secondary CPUs do not run through tsc_init(), so set up * all the scale factors for all CPUs, assuming the same @@ -1386,8 +1389,6 @@ void __init tsc_init(void) if (unsynchronized_tsc()) mark_tsc_unstable("TSCs unsynchronized"); - else - tsc_store_and_check_tsc_adjust(true); check_system_tsc_reliable(); From 5f2e71e71410ecb858cfec184ba092adaca61626 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 9 Feb 2017 16:08:42 +0100 Subject: [PATCH 19/56] x86/tsc: Make the TSC ADJUST sanitizing work for tsc_reliable When the TSC is marked reliable then the synchronization check is skipped, but that also skips the TSC ADJUST sanitizing code. So on a machine with a wreckaged BIOS the TSC deviation between CPUs might go unnoticed. Let the TSC adjust sanitizing code run unconditionally and just skip the expensive synchronization checks when TSC is marked reliable. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Olof Johansson Link: http://lkml.kernel.org/r/20170209151231.491189912@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc_sync.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index d0db011051a5..728f75378475 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -286,13 +286,6 @@ void check_tsc_sync_source(int cpu) if (unsynchronized_tsc()) return; - if (tsc_clocksource_reliable) { - if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING) - pr_info( - "Skipped synchronization checks as TSC is reliable.\n"); - return; - } - /* * Set the maximum number of test runs to * 1 if the CPU does not provide the TSC_ADJUST MSR @@ -380,14 +373,19 @@ void check_tsc_sync_target(void) int cpus = 2; /* Also aborts if there is no TSC. */ - if (unsynchronized_tsc() || tsc_clocksource_reliable) + if (unsynchronized_tsc()) return; /* * Store, verify and sanitize the TSC adjust register. If * successful skip the test. + * + * The test is also skipped when the TSC is marked reliable. This + * is true for SoCs which have no fallback clocksource. On these + * SoCs the TSC is frequency synchronized, but still the TSC ADJUST + * register might have been wreckaged by the BIOS.. */ - if (tsc_store_and_check_tsc_adjust(false)) { + if (tsc_store_and_check_tsc_adjust(false) || tsc_clocksource_reliable) { atomic_inc(&skip_test); return; } From 146fbb766934dc003fcbf755b519acef683576bf Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 10 Feb 2017 12:54:05 +0300 Subject: [PATCH 20/56] x86/mm/ptdump: Fix soft lockup in page table walker CONFIG_KASAN=y needs a lot of virtual memory mapped for its shadow. In that case ptdump_walk_pgd_level_core() takes a lot of time to walk across all page tables and doing this without a rescheduling causes soft lockups: NMI watchdog: BUG: soft lockup - CPU#3 stuck for 23s! [swapper/0:1] ... Call Trace: ptdump_walk_pgd_level_core+0x40c/0x550 ptdump_walk_pgd_level_checkwx+0x17/0x20 mark_rodata_ro+0x13b/0x150 kernel_init+0x2f/0x120 ret_from_fork+0x2c/0x40 I guess that this issue might arise even without KASAN on huge machines with several terabytes of RAM. Stick cond_resched() in pgd loop to fix this. Reported-by: Tobias Regnery Signed-off-by: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Alexander Potapenko Cc: "Paul E . McKenney" Cc: Dmitry Vyukov Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20170210095405.31802-1-aryabinin@virtuozzo.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/dump_pagetables.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index ea9c49adaa1f..8aa6bea1cd6c 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -406,6 +407,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, } else note_page(m, &st, __pgprot(0), 1); + cond_resched(); start++; } From 6e78b3f7a193546b1c00a6d084596e774f147169 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 10 Feb 2017 15:03:35 -0800 Subject: [PATCH 21/56] Btrfs: fix btrfs_decompress_buf2page() If btrfs_decompress_buf2page() is handed a bio with its page in the middle of the working buffer, then we adjust the offset into the working buffer. After we copy into the bio, we advance the iterator by the number of bytes we copied. Then, we have some logic to handle the case of discontiguous pages and adjust the offset into the working buffer again. However, if we didn't advance the bio to a new page, we may enter this case in error, essentially repeating the adjustment that we already made when we entered the function. The end result is bogus data in the bio. Previously, we only checked for this case when we advanced to a new page, but the conversion to bio iterators changed that. This restores the old, correct behavior. A case I saw when testing with zlib was: buf_start = 42769 total_out = 46865 working_bytes = total_out - buf_start = 4096 start_byte = 45056 The condition (total_out > start_byte && buf_start < start_byte) is true, so we adjust the offset: buf_offset = start_byte - buf_start = 2287 working_bytes -= buf_offset = 1809 current_buf_start = buf_start = 42769 Then, we copy bytes = min(bvec.bv_len, PAGE_SIZE - buf_offset, working_bytes) = 1809 buf_offset += bytes = 4096 working_bytes -= bytes = 0 current_buf_start += bytes = 44578 After bio_advance(), we are still in the same page, so start_byte is the same. Then, we check (total_out > start_byte && current_buf_start < start_byte), which is true! So, we adjust the values again: buf_offset = start_byte - buf_start = 2287 working_bytes = total_out - start_byte = 1809 current_buf_start = buf_start + buf_offset = 45056 But note that working_bytes was already zero before this, so we should have stopped copying. Fixes: 974b1adc3b10 ("btrfs: use bio iterators for the decompression handlers") Reported-by: Pat Erley Reviewed-by: Chris Mason Signed-off-by: Omar Sandoval Signed-off-by: Chris Mason Reviewed-by: Liu Bo Tested-by: Liu Bo --- fs/btrfs/compression.c | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 7f390849343b..c4444d6f439f 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -1024,6 +1024,7 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, unsigned long buf_offset; unsigned long current_buf_start; unsigned long start_byte; + unsigned long prev_start_byte; unsigned long working_bytes = total_out - buf_start; unsigned long bytes; char *kaddr; @@ -1071,26 +1072,34 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, if (!bio->bi_iter.bi_size) return 0; bvec = bio_iter_iovec(bio, bio->bi_iter); - + prev_start_byte = start_byte; start_byte = page_offset(bvec.bv_page) - disk_start; /* - * make sure our new page is covered by this - * working buffer + * We need to make sure we're only adjusting + * our offset into compression working buffer when + * we're switching pages. Otherwise we can incorrectly + * keep copying when we were actually done. */ - if (total_out <= start_byte) - return 1; + if (start_byte != prev_start_byte) { + /* + * make sure our new page is covered by this + * working buffer + */ + if (total_out <= start_byte) + return 1; - /* - * the next page in the biovec might not be adjacent - * to the last page, but it might still be found - * inside this working buffer. bump our offset pointer - */ - if (total_out > start_byte && - current_buf_start < start_byte) { - buf_offset = start_byte - buf_start; - working_bytes = total_out - start_byte; - current_buf_start = buf_start + buf_offset; + /* + * the next page in the biovec might not be adjacent + * to the last page, but it might still be found + * inside this working buffer. bump our offset pointer + */ + if (total_out > start_byte && + current_buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes = total_out - start_byte; + current_buf_start = buf_start + buf_offset; + } } } From db5d0b597bc27bbddf40f2f8359a73be4eb77104 Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Fri, 10 Feb 2017 13:45:05 -0500 Subject: [PATCH 22/56] ibmvnic: Initialize completion variables before starting work Initialize condition variables prior to invoking any work that can mark them complete. This resolves a race in the ibmvnic driver where the driver faults trying to complete an uninitialized condition variable. Signed-off-by: Nathan Fontenot Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index c12596676bbb..c7150343342d 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -189,9 +189,10 @@ static int alloc_long_term_buff(struct ibmvnic_adapter *adapter, } ltb->map_id = adapter->map_id; adapter->map_id++; + + init_completion(&adapter->fw_done); send_request_map(adapter, ltb->addr, ltb->size, ltb->map_id); - init_completion(&adapter->fw_done); wait_for_completion(&adapter->fw_done); return 0; } @@ -1121,10 +1122,10 @@ static void ibmvnic_get_ethtool_stats(struct net_device *dev, crq.request_statistics.ioba = cpu_to_be32(adapter->stats_token); crq.request_statistics.len = cpu_to_be32(sizeof(struct ibmvnic_statistics)); - ibmvnic_send_crq(adapter, &crq); /* Wait for data to be written */ init_completion(&adapter->stats_done); + ibmvnic_send_crq(adapter, &crq); wait_for_completion(&adapter->stats_done); for (i = 0; i < ARRAY_SIZE(ibmvnic_stats); i++) @@ -2799,9 +2800,9 @@ static ssize_t trace_read(struct file *file, char __user *user_buf, size_t len, crq.collect_fw_trace.correlator = adapter->ras_comps[num].correlator; crq.collect_fw_trace.ioba = cpu_to_be32(trace_tok); crq.collect_fw_trace.len = adapter->ras_comps[num].trace_buff_size; - ibmvnic_send_crq(adapter, &crq); init_completion(&adapter->fw_done); + ibmvnic_send_crq(adapter, &crq); wait_for_completion(&adapter->fw_done); if (*ppos + len > be32_to_cpu(adapter->ras_comps[num].trace_buff_size)) @@ -3581,9 +3582,9 @@ static int ibmvnic_dump_show(struct seq_file *seq, void *v) memset(&crq, 0, sizeof(crq)); crq.request_dump_size.first = IBMVNIC_CRQ_CMD; crq.request_dump_size.cmd = REQUEST_DUMP_SIZE; - ibmvnic_send_crq(adapter, &crq); init_completion(&adapter->fw_done); + ibmvnic_send_crq(adapter, &crq); wait_for_completion(&adapter->fw_done); seq_write(seq, adapter->dump_data, adapter->dump_data_size); @@ -3629,8 +3630,8 @@ static void handle_crq_init_rsp(struct work_struct *work) } } - send_version_xchg(adapter); reinit_completion(&adapter->init_done); + send_version_xchg(adapter); if (!wait_for_completion_timeout(&adapter->init_done, timeout)) { dev_err(dev, "Passive init timeout\n"); goto task_failed; @@ -3640,9 +3641,9 @@ static void handle_crq_init_rsp(struct work_struct *work) if (adapter->renegotiate) { adapter->renegotiate = false; release_sub_crqs_no_irqs(adapter); - send_cap_queries(adapter); reinit_completion(&adapter->init_done); + send_cap_queries(adapter); if (!wait_for_completion_timeout(&adapter->init_done, timeout)) { dev_err(dev, "Passive init timeout\n"); @@ -3772,9 +3773,9 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) adapter->debugfs_dump = ent; } } - ibmvnic_send_crq_init(adapter); init_completion(&adapter->init_done); + ibmvnic_send_crq_init(adapter); if (!wait_for_completion_timeout(&adapter->init_done, timeout)) return 0; @@ -3782,9 +3783,9 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) if (adapter->renegotiate) { adapter->renegotiate = false; release_sub_crqs_no_irqs(adapter); - send_cap_queries(adapter); reinit_completion(&adapter->init_done); + send_cap_queries(adapter); if (!wait_for_completion_timeout(&adapter->init_done, timeout)) return 0; From e722af6391949e8851310441bb0cec157d25611d Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Fri, 10 Feb 2017 13:29:06 -0500 Subject: [PATCH 23/56] ibmvnic: Call napi_disable instead of napi_enable in failure path The failure path in ibmvnic_open() mistakenly makes a second call to napi_enable instead of calling napi_disable. This can result in a BUG_ON for any queues that were enabled in the previous call to napi_enable. Signed-off-by: Nathan Fontenot Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index c7150343342d..752b0822b020 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -506,7 +506,7 @@ rx_pool_alloc_failed: adapter->rx_pool = NULL; rx_pool_arr_alloc_failed: for (i = 0; i < adapter->req_rx_queues; i++) - napi_enable(&adapter->napi[i]); + napi_disable(&adapter->napi[i]); alloc_napi_failed: return -ENOMEM; } From 7089db84e356562f8ba737c29e472cc42d530dbc Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 12 Feb 2017 13:03:20 -0800 Subject: [PATCH 24/56] Linux 4.10-rc8 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 8e223e081c9d..503dae1de8ef 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 4 PATCHLEVEL = 10 SUBLEVEL = 0 -EXTRAVERSION = -rc7 +EXTRAVERSION = -rc8 NAME = Fearless Coyote # *DOCUMENTATION* From 7f677633379b4abb3281cdbe7e7006f049305c03 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 10 Feb 2017 20:28:24 -0800 Subject: [PATCH 25/56] bpf: introduce BPF_F_ALLOW_OVERRIDE flag If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command to the given cgroup the descendent cgroup will be able to override effective bpf program that was inherited from this cgroup. By default it's not passed, therefore override is disallowed. Examples: 1. prog X attached to /A with default prog Y fails to attach to /A/B and /A/B/C Everything under /A runs prog X 2. prog X attached to /A with allow_override. prog Y fails to attach to /A/B with default (non-override) prog M attached to /A/B with allow_override. Everything under /A/B runs prog M only. 3. prog X attached to /A with allow_override. prog Y fails to attach to /A with default. The user has to detach first to switch the mode. In the future this behavior may be extended with a chain of non-overridable programs. Also fix the bug where detach from cgroup where nothing is attached was not throwing error. Return ENOENT in such case. Add several testcases and adjust libbpf. Fixes: 3007098494be ("cgroup: add support for eBPF programs") Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Tejun Heo Acked-by: Daniel Mack Signed-off-by: David S. Miller --- include/linux/bpf-cgroup.h | 13 +++--- include/uapi/linux/bpf.h | 7 ++++ kernel/bpf/cgroup.c | 59 +++++++++++++++++++++------ kernel/bpf/syscall.c | 20 +++++++--- kernel/cgroup.c | 9 +++-- samples/bpf/test_cgrp2_attach.c | 2 +- samples/bpf/test_cgrp2_attach2.c | 68 ++++++++++++++++++++++++++++++-- samples/bpf/test_cgrp2_sock.c | 2 +- samples/bpf/test_cgrp2_sock2.c | 2 +- tools/lib/bpf/bpf.c | 4 +- tools/lib/bpf/bpf.h | 3 +- 11 files changed, 151 insertions(+), 38 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 92bc89ae7e20..c970a25d2a49 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -21,20 +21,19 @@ struct cgroup_bpf { */ struct bpf_prog *prog[MAX_BPF_ATTACH_TYPE]; struct bpf_prog __rcu *effective[MAX_BPF_ATTACH_TYPE]; + bool disallow_override[MAX_BPF_ATTACH_TYPE]; }; void cgroup_bpf_put(struct cgroup *cgrp); void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent); -void __cgroup_bpf_update(struct cgroup *cgrp, - struct cgroup *parent, - struct bpf_prog *prog, - enum bpf_attach_type type); +int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent, + struct bpf_prog *prog, enum bpf_attach_type type, + bool overridable); /* Wrapper for __cgroup_bpf_update() protected by cgroup_mutex */ -void cgroup_bpf_update(struct cgroup *cgrp, - struct bpf_prog *prog, - enum bpf_attach_type type); +int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, bool overridable); int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0eb0e87dbe9f..d2b0ac799d03 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -116,6 +116,12 @@ enum bpf_attach_type { #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE +/* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command + * to the given target_fd cgroup the descendent cgroup will be able to + * override effective bpf program that was inherited from this cgroup + */ +#define BPF_F_ALLOW_OVERRIDE (1U << 0) + #define BPF_PSEUDO_MAP_FD 1 /* flags for BPF_MAP_UPDATE_ELEM command */ @@ -171,6 +177,7 @@ union bpf_attr { __u32 target_fd; /* container object to attach to */ __u32 attach_bpf_fd; /* eBPF program to attach */ __u32 attach_type; + __u32 attach_flags; }; } __attribute__((aligned(8))); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index a515f7b007c6..da0f53690295 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -52,6 +52,7 @@ void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent) e = rcu_dereference_protected(parent->bpf.effective[type], lockdep_is_held(&cgroup_mutex)); rcu_assign_pointer(cgrp->bpf.effective[type], e); + cgrp->bpf.disallow_override[type] = parent->bpf.disallow_override[type]; } } @@ -82,30 +83,63 @@ void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent) * * Must be called with cgroup_mutex held. */ -void __cgroup_bpf_update(struct cgroup *cgrp, - struct cgroup *parent, - struct bpf_prog *prog, - enum bpf_attach_type type) +int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent, + struct bpf_prog *prog, enum bpf_attach_type type, + bool new_overridable) { - struct bpf_prog *old_prog, *effective; + struct bpf_prog *old_prog, *effective = NULL; struct cgroup_subsys_state *pos; + bool overridable = true; - old_prog = xchg(cgrp->bpf.prog + type, prog); + if (parent) { + overridable = !parent->bpf.disallow_override[type]; + effective = rcu_dereference_protected(parent->bpf.effective[type], + lockdep_is_held(&cgroup_mutex)); + } - effective = (!prog && parent) ? - rcu_dereference_protected(parent->bpf.effective[type], - lockdep_is_held(&cgroup_mutex)) : - prog; + if (prog && effective && !overridable) + /* if parent has non-overridable prog attached, disallow + * attaching new programs to descendent cgroup + */ + return -EPERM; + + if (prog && effective && overridable != new_overridable) + /* if parent has overridable prog attached, only + * allow overridable programs in descendent cgroup + */ + return -EPERM; + + old_prog = cgrp->bpf.prog[type]; + + if (prog) { + overridable = new_overridable; + effective = prog; + if (old_prog && + cgrp->bpf.disallow_override[type] == new_overridable) + /* disallow attaching non-overridable on top + * of existing overridable in this cgroup + * and vice versa + */ + return -EPERM; + } + + if (!prog && !old_prog) + /* report error when trying to detach and nothing is attached */ + return -ENOENT; + + cgrp->bpf.prog[type] = prog; css_for_each_descendant_pre(pos, &cgrp->self) { struct cgroup *desc = container_of(pos, struct cgroup, self); /* skip the subtree if the descendant has its own program */ - if (desc->bpf.prog[type] && desc != cgrp) + if (desc->bpf.prog[type] && desc != cgrp) { pos = css_rightmost_descendant(pos); - else + } else { rcu_assign_pointer(desc->bpf.effective[type], effective); + desc->bpf.disallow_override[type] = !overridable; + } } if (prog) @@ -115,6 +149,7 @@ void __cgroup_bpf_update(struct cgroup *cgrp, bpf_prog_put(old_prog); static_branch_dec(&cgroup_bpf_enabled_key); } + return 0; } /** diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 19b6129eab23..bbb016adbaeb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -920,13 +920,14 @@ static int bpf_obj_get(const union bpf_attr *attr) #ifdef CONFIG_CGROUP_BPF -#define BPF_PROG_ATTACH_LAST_FIELD attach_type +#define BPF_PROG_ATTACH_LAST_FIELD attach_flags static int bpf_prog_attach(const union bpf_attr *attr) { + enum bpf_prog_type ptype; struct bpf_prog *prog; struct cgroup *cgrp; - enum bpf_prog_type ptype; + int ret; if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -934,6 +935,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) if (CHECK_ATTR(BPF_PROG_ATTACH)) return -EINVAL; + if (attr->attach_flags & ~BPF_F_ALLOW_OVERRIDE) + return -EINVAL; + switch (attr->attach_type) { case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: @@ -956,10 +960,13 @@ static int bpf_prog_attach(const union bpf_attr *attr) return PTR_ERR(cgrp); } - cgroup_bpf_update(cgrp, prog, attr->attach_type); + ret = cgroup_bpf_update(cgrp, prog, attr->attach_type, + attr->attach_flags & BPF_F_ALLOW_OVERRIDE); + if (ret) + bpf_prog_put(prog); cgroup_put(cgrp); - return 0; + return ret; } #define BPF_PROG_DETACH_LAST_FIELD attach_type @@ -967,6 +974,7 @@ static int bpf_prog_attach(const union bpf_attr *attr) static int bpf_prog_detach(const union bpf_attr *attr) { struct cgroup *cgrp; + int ret; if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -982,7 +990,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) if (IS_ERR(cgrp)) return PTR_ERR(cgrp); - cgroup_bpf_update(cgrp, NULL, attr->attach_type); + ret = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false); cgroup_put(cgrp); break; @@ -990,7 +998,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) return -EINVAL; } - return 0; + return ret; } #endif /* CONFIG_CGROUP_BPF */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 688dd02af985..53bbca7c4859 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -6498,15 +6498,16 @@ static __init int cgroup_namespaces_init(void) subsys_initcall(cgroup_namespaces_init); #ifdef CONFIG_CGROUP_BPF -void cgroup_bpf_update(struct cgroup *cgrp, - struct bpf_prog *prog, - enum bpf_attach_type type) +int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, bool overridable) { struct cgroup *parent = cgroup_parent(cgrp); + int ret; mutex_lock(&cgroup_mutex); - __cgroup_bpf_update(cgrp, parent, prog, type); + ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable); mutex_unlock(&cgroup_mutex); + return ret; } #endif /* CONFIG_CGROUP_BPF */ diff --git a/samples/bpf/test_cgrp2_attach.c b/samples/bpf/test_cgrp2_attach.c index 504058631ffc..4bfcaf93fcf3 100644 --- a/samples/bpf/test_cgrp2_attach.c +++ b/samples/bpf/test_cgrp2_attach.c @@ -104,7 +104,7 @@ static int attach_filter(int cg_fd, int type, int verdict) return EXIT_FAILURE; } - ret = bpf_prog_attach(prog_fd, cg_fd, type); + ret = bpf_prog_attach(prog_fd, cg_fd, type, 0); if (ret < 0) { printf("Failed to attach prog to cgroup: '%s'\n", strerror(errno)); diff --git a/samples/bpf/test_cgrp2_attach2.c b/samples/bpf/test_cgrp2_attach2.c index 6e69be37f87f..3049b1f26267 100644 --- a/samples/bpf/test_cgrp2_attach2.c +++ b/samples/bpf/test_cgrp2_attach2.c @@ -79,11 +79,12 @@ int main(int argc, char **argv) if (join_cgroup(FOO)) goto err; - if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS)) { + if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS, 1)) { log_err("Attaching prog to /foo"); goto err; } + printf("Attached DROP prog. This ping in cgroup /foo should fail...\n"); assert(system(PING_CMD) != 0); /* Create cgroup /foo/bar, get fd, and join it */ @@ -94,24 +95,27 @@ int main(int argc, char **argv) if (join_cgroup(BAR)) goto err; + printf("Attached DROP prog. This ping in cgroup /foo/bar should fail...\n"); assert(system(PING_CMD) != 0); - if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS)) { + if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) { log_err("Attaching prog to /foo/bar"); goto err; } + printf("Attached PASS prog. This ping in cgroup /foo/bar should pass...\n"); assert(system(PING_CMD) == 0); - if (bpf_prog_detach(bar, BPF_CGROUP_INET_EGRESS)) { log_err("Detaching program from /foo/bar"); goto err; } + printf("Detached PASS from /foo/bar while DROP is attached to /foo.\n" + "This ping in cgroup /foo/bar should fail...\n"); assert(system(PING_CMD) != 0); - if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS)) { + if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) { log_err("Attaching prog to /foo/bar"); goto err; } @@ -121,8 +125,60 @@ int main(int argc, char **argv) goto err; } + printf("Attached PASS from /foo/bar and detached DROP from /foo.\n" + "This ping in cgroup /foo/bar should pass...\n"); assert(system(PING_CMD) == 0); + if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) { + log_err("Attaching prog to /foo/bar"); + goto err; + } + + if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 0)) { + errno = 0; + log_err("Unexpected success attaching prog to /foo/bar"); + goto err; + } + + if (bpf_prog_detach(bar, BPF_CGROUP_INET_EGRESS)) { + log_err("Detaching program from /foo/bar"); + goto err; + } + + if (!bpf_prog_detach(foo, BPF_CGROUP_INET_EGRESS)) { + errno = 0; + log_err("Unexpected success in double detach from /foo"); + goto err; + } + + if (bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS, 0)) { + log_err("Attaching non-overridable prog to /foo"); + goto err; + } + + if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 0)) { + errno = 0; + log_err("Unexpected success attaching non-overridable prog to /foo/bar"); + goto err; + } + + if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) { + errno = 0; + log_err("Unexpected success attaching overridable prog to /foo/bar"); + goto err; + } + + if (!bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS, 1)) { + errno = 0; + log_err("Unexpected success attaching overridable prog to /foo"); + goto err; + } + + if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS, 0)) { + log_err("Attaching different non-overridable prog to /foo"); + goto err; + } + goto out; err: @@ -132,5 +188,9 @@ out: close(foo); close(bar); cleanup_cgroup_environment(); + if (!rc) + printf("PASS\n"); + else + printf("FAIL\n"); return rc; } diff --git a/samples/bpf/test_cgrp2_sock.c b/samples/bpf/test_cgrp2_sock.c index 0791b949cbe4..c3cfb23e23b5 100644 --- a/samples/bpf/test_cgrp2_sock.c +++ b/samples/bpf/test_cgrp2_sock.c @@ -75,7 +75,7 @@ int main(int argc, char **argv) return EXIT_FAILURE; } - ret = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_INET_SOCK_CREATE); + ret = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_INET_SOCK_CREATE, 0); if (ret < 0) { printf("Failed to attach prog to cgroup: '%s'\n", strerror(errno)); diff --git a/samples/bpf/test_cgrp2_sock2.c b/samples/bpf/test_cgrp2_sock2.c index 455ef0d06e93..db036077b644 100644 --- a/samples/bpf/test_cgrp2_sock2.c +++ b/samples/bpf/test_cgrp2_sock2.c @@ -55,7 +55,7 @@ int main(int argc, char **argv) } ret = bpf_prog_attach(prog_fd[filter_id], cg_fd, - BPF_CGROUP_INET_SOCK_CREATE); + BPF_CGROUP_INET_SOCK_CREATE, 0); if (ret < 0) { printf("Failed to attach prog to cgroup: '%s'\n", strerror(errno)); diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 3ddb58a36d3c..ae752fa4eaa7 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -168,7 +168,8 @@ int bpf_obj_get(const char *pathname) return sys_bpf(BPF_OBJ_GET, &attr, sizeof(attr)); } -int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type) +int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type, + unsigned int flags) { union bpf_attr attr; @@ -176,6 +177,7 @@ int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type) attr.target_fd = target_fd; attr.attach_bpf_fd = prog_fd; attr.attach_type = type; + attr.attach_flags = flags; return sys_bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)); } diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index a2f9853dd882..4ac6c4b84100 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -41,7 +41,8 @@ int bpf_map_delete_elem(int fd, void *key); int bpf_map_get_next_key(int fd, void *key, void *next_key); int bpf_obj_pin(int fd, const char *pathname); int bpf_obj_get(const char *pathname); -int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type); +int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type, + unsigned int flags); int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type); From 8b74d439e1697110c5e5c600643e823eb1dd0762 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 12 Feb 2017 14:03:52 -0800 Subject: [PATCH 26/56] net/llc: avoid BUG_ON() in skb_orphan() It seems nobody used LLC since linux-3.12. Fortunately fuzzers like syzkaller still know how to run this code, otherwise it would be no fun. Setting skb->sk without skb->destructor leads to all kinds of bugs, we now prefer to be very strict about it. Ideally here we would use skb_set_owner() but this helper does not exist yet, only CAN seems to have a private helper for that. Fixes: 376c7311bdb6 ("net: add a temporary sanity check in skb_orphan()") Signed-off-by: Eric Dumazet Reported-by: Andrey Konovalov Signed-off-by: David S. Miller --- net/llc/llc_conn.c | 3 +++ net/llc/llc_sap.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 3e821daf9dd4..8bc5a1bd2d45 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -821,7 +821,10 @@ void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb) * another trick required to cope with how the PROCOM state * machine works. -acme */ + skb_orphan(skb); + sock_hold(sk); skb->sk = sk; + skb->destructor = sock_efree; } if (!sock_owned_by_user(sk)) llc_conn_rcv(sk, skb); diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c index d0e1e804ebd7..5404d0d195cc 100644 --- a/net/llc/llc_sap.c +++ b/net/llc/llc_sap.c @@ -290,7 +290,10 @@ static void llc_sap_rcv(struct llc_sap *sap, struct sk_buff *skb, ev->type = LLC_SAP_EV_TYPE_PDU; ev->reason = 0; + skb_orphan(skb); + sock_hold(sk); skb->sk = sk; + skb->destructor = sock_efree; llc_sap_state_process(sap, skb); } From 35879ee4769099905fa3bda0b21e73d434e2df6a Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Fri, 10 Feb 2017 07:18:36 -0200 Subject: [PATCH 27/56] [media] videodev2.h: go back to limited range Y'CbCr for SRGB and, ADOBERGB This reverts 'commit 7e0739cd9c40 ("[media] videodev2.h: fix sYCC/AdobeYCC default quantization range"). The problem is that many drivers can convert R'G'B' content (often from sensors) to Y'CbCr, but they all produce limited range Y'CbCr. To stay backwards compatible the default quantization range for sRGB and AdobeRGB Y'CbCr encoding should be limited range, not full range, even though the corresponding standards specify full range. Update the V4L2_MAP_QUANTIZATION_DEFAULT define accordingly and also update the documentation. Fixes: 7e0739cd9c40 ("[media] videodev2.h: fix sYCC/AdobeYCC default quantization range") Signed-off-by: Hans Verkuil Cc: # for v4.9 and up Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/uapi/v4l/pixfmt-007.rst | 23 +++++++++++++++------ include/uapi/linux/videodev2.h | 7 +++---- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/Documentation/media/uapi/v4l/pixfmt-007.rst b/Documentation/media/uapi/v4l/pixfmt-007.rst index 44bb5a7059b3..95a23a28c595 100644 --- a/Documentation/media/uapi/v4l/pixfmt-007.rst +++ b/Documentation/media/uapi/v4l/pixfmt-007.rst @@ -211,7 +211,13 @@ Colorspace sRGB (V4L2_COLORSPACE_SRGB) The :ref:`srgb` standard defines the colorspace used by most webcams and computer graphics. The default transfer function is ``V4L2_XFER_FUNC_SRGB``. The default Y'CbCr encoding is -``V4L2_YCBCR_ENC_601``. The default Y'CbCr quantization is full range. +``V4L2_YCBCR_ENC_601``. The default Y'CbCr quantization is limited range. + +Note that the :ref:`sycc` standard specifies full range quantization, +however all current capture hardware supported by the kernel convert +R'G'B' to limited range Y'CbCr. So choosing full range as the default +would break how applications interpret the quantization range. + The chromaticities of the primary colors and the white reference are: @@ -276,7 +282,7 @@ the following ``V4L2_YCBCR_ENC_601`` encoding as defined by :ref:`sycc`: Y' is clamped to the range [0…1] and Cb and Cr are clamped to the range [-0.5…0.5]. This transform is identical to one defined in SMPTE -170M/BT.601. The Y'CbCr quantization is full range. +170M/BT.601. The Y'CbCr quantization is limited range. .. _col-adobergb: @@ -288,10 +294,15 @@ The :ref:`adobergb` standard defines the colorspace used by computer graphics that use the AdobeRGB colorspace. This is also known as the :ref:`oprgb` standard. The default transfer function is ``V4L2_XFER_FUNC_ADOBERGB``. The default Y'CbCr encoding is -``V4L2_YCBCR_ENC_601``. The default Y'CbCr quantization is full -range. The chromaticities of the primary colors and the white reference -are: +``V4L2_YCBCR_ENC_601``. The default Y'CbCr quantization is limited +range. +Note that the :ref:`oprgb` standard specifies full range quantization, +however all current capture hardware supported by the kernel convert +R'G'B' to limited range Y'CbCr. So choosing full range as the default +would break how applications interpret the quantization range. + +The chromaticities of the primary colors and the white reference are: .. tabularcolumns:: |p{4.4cm}|p{4.4cm}|p{8.7cm}| @@ -344,7 +355,7 @@ the following ``V4L2_YCBCR_ENC_601`` encoding: Y' is clamped to the range [0…1] and Cb and Cr are clamped to the range [-0.5…0.5]. This transform is identical to one defined in SMPTE -170M/BT.601. The Y'CbCr quantization is full range. +170M/BT.601. The Y'CbCr quantization is limited range. .. _col-bt2020: diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 46e8a2e369f9..45184a2ef66c 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -362,8 +362,8 @@ enum v4l2_quantization { /* * The default for R'G'B' quantization is always full range, except * for the BT2020 colorspace. For Y'CbCr the quantization is always - * limited range, except for COLORSPACE_JPEG, SRGB, ADOBERGB, - * XV601 or XV709: those are full range. + * limited range, except for COLORSPACE_JPEG, XV601 or XV709: those + * are full range. */ V4L2_QUANTIZATION_DEFAULT = 0, V4L2_QUANTIZATION_FULL_RANGE = 1, @@ -379,8 +379,7 @@ enum v4l2_quantization { (((is_rgb_or_hsv) && (colsp) == V4L2_COLORSPACE_BT2020) ? \ V4L2_QUANTIZATION_LIM_RANGE : \ (((is_rgb_or_hsv) || (ycbcr_enc) == V4L2_YCBCR_ENC_XV601 || \ - (ycbcr_enc) == V4L2_YCBCR_ENC_XV709 || (colsp) == V4L2_COLORSPACE_JPEG) || \ - (colsp) == V4L2_COLORSPACE_ADOBERGB || (colsp) == V4L2_COLORSPACE_SRGB ? \ + (ycbcr_enc) == V4L2_YCBCR_ENC_XV709 || (colsp) == V4L2_COLORSPACE_JPEG) ? \ V4L2_QUANTIZATION_FULL_RANGE : V4L2_QUANTIZATION_LIM_RANGE)) enum v4l2_priority { From 42980da2eb7eb9695d8efc0c0ef145cbbb993b2c Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Sat, 11 Feb 2017 09:24:46 -0200 Subject: [PATCH 28/56] [media] cec: initiator should be the same as the destination for, poll Poll messages that are used to allocate a logical address should use the same initiator as the destination. Instead, it expected that the initiator was 0xf which is not according to the standard. This also had consequences for the message checks in cec_transmit_msg_fh that incorrectly rejected poll messages with the same initiator and destination. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/cec/cec-adap.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/media/cec/cec-adap.c b/drivers/media/cec/cec-adap.c index 87a6b65ed3af..ccda41c2c9e4 100644 --- a/drivers/media/cec/cec-adap.c +++ b/drivers/media/cec/cec-adap.c @@ -612,8 +612,7 @@ int cec_transmit_msg_fh(struct cec_adapter *adap, struct cec_msg *msg, } memset(msg->msg + msg->len, 0, sizeof(msg->msg) - msg->len); if (msg->len == 1) { - if (cec_msg_initiator(msg) != 0xf || - cec_msg_destination(msg) == 0xf) { + if (cec_msg_destination(msg) == 0xf) { dprintk(1, "cec_transmit_msg: invalid poll message\n"); return -EINVAL; } @@ -638,7 +637,7 @@ int cec_transmit_msg_fh(struct cec_adapter *adap, struct cec_msg *msg, dprintk(1, "cec_transmit_msg: destination is the adapter itself\n"); return -EINVAL; } - if (cec_msg_initiator(msg) != 0xf && + if (msg->len > 1 && adap->is_configured && !cec_has_log_addr(adap, cec_msg_initiator(msg))) { dprintk(1, "cec_transmit_msg: initiator has unknown logical address %d\n", cec_msg_initiator(msg)); @@ -1072,7 +1071,7 @@ static int cec_config_log_addr(struct cec_adapter *adap, /* Send poll message */ msg.len = 1; - msg.msg[0] = 0xf0 | log_addr; + msg.msg[0] = (log_addr << 4) | log_addr; err = cec_transmit_msg_fh(adap, &msg, NULL, true); /* From 0c59d28121b96d826c188280f367e754b5d83350 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 13 Feb 2017 14:15:44 -0300 Subject: [PATCH 29/56] MAINTAINERS: Remove old e-mail address The ghostprotocols.net domain is not working, remove it from CREDITS and MAINTAINERS, and change the status to "Odd fixes", and since I haven't been maintaining those, remove my address from there. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- CREDITS | 5 ++--- MAINTAINERS | 15 ++++++--------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/CREDITS b/CREDITS index c58560701d13..c5626bf06264 100644 --- a/CREDITS +++ b/CREDITS @@ -2478,12 +2478,11 @@ S: D-90453 Nuernberg S: Germany N: Arnaldo Carvalho de Melo -E: acme@ghostprotocols.net +E: acme@kernel.org E: arnaldo.melo@gmail.com E: acme@redhat.com -W: http://oops.ghostprotocols.net:81/blog/ P: 1024D/9224DF01 D5DF E3BB E3C8 BCBB F8AD 841A B6AB 4681 9224 DF01 -D: IPX, LLC, DCCP, cyc2x, wl3501_cs, net/ hacks +D: tools/, IPX, LLC, DCCP, cyc2x, wl3501_cs, net/ hacks S: Brazil N: Karsten Merker diff --git a/MAINTAINERS b/MAINTAINERS index 107c10e8f2d2..527d13759ecc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -877,8 +877,8 @@ S: Odd fixes F: drivers/hwmon/applesmc.c APPLETALK NETWORK LAYER -M: Arnaldo Carvalho de Melo -S: Maintained +L: netdev@vger.kernel.org +S: Odd fixes F: drivers/net/appletalk/ F: net/appletalk/ @@ -6727,9 +6727,8 @@ S: Odd Fixes F: drivers/tty/ipwireless/ IPX NETWORK LAYER -M: Arnaldo Carvalho de Melo L: netdev@vger.kernel.org -S: Maintained +S: Odd fixes F: include/net/ipx.h F: include/uapi/linux/ipx.h F: net/ipx/ @@ -7501,8 +7500,8 @@ S: Maintained F: drivers/misc/lkdtm* LLC (802.2) -M: Arnaldo Carvalho de Melo -S: Maintained +L: netdev@vger.kernel.org +S: Odd fixes F: include/linux/llc.h F: include/uapi/linux/llc.h F: include/net/llc* @@ -13373,10 +13372,8 @@ S: Maintained F: drivers/input/misc/wistron_btns.c WL3501 WIRELESS PCMCIA CARD DRIVER -M: Arnaldo Carvalho de Melo L: linux-wireless@vger.kernel.org -W: http://oops.ghostprotocols.net:81/blog -S: Maintained +S: Odd fixes F: drivers/net/wireless/wl3501* WOLFSON MICROELECTRONICS DRIVERS From ebf692f85ff78092cd238166d8d7ec51419f9c02 Mon Sep 17 00:00:00 2001 From: Mart van Santen Date: Fri, 10 Feb 2017 12:02:18 +0000 Subject: [PATCH 30/56] xen-netback: vif counters from int/long to u64 This patch fixes an issue where the type of counters in the queue(s) and interface are not in sync (queue counters are int, interface counters are long), causing incorrect reporting of tx/rx values of the vif interface and unclear counter overflows. This patch sets both counters to the u64 type. Signed-off-by: Mart van Santen Reviewed-by: Paul Durrant Signed-off-by: David S. Miller --- drivers/net/xen-netback/common.h | 8 ++++---- drivers/net/xen-netback/interface.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h index 3ce1f7da8647..530586be05b4 100644 --- a/drivers/net/xen-netback/common.h +++ b/drivers/net/xen-netback/common.h @@ -113,10 +113,10 @@ struct xenvif_stats { * A subset of struct net_device_stats that contains only the * fields that are updated in netback.c for each queue. */ - unsigned int rx_bytes; - unsigned int rx_packets; - unsigned int tx_bytes; - unsigned int tx_packets; + u64 rx_bytes; + u64 rx_packets; + u64 tx_bytes; + u64 tx_packets; /* Additional stats used by xenvif */ unsigned long rx_gso_checksum_fixup; diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index 579521327b03..50fa1692d985 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -221,10 +221,10 @@ static struct net_device_stats *xenvif_get_stats(struct net_device *dev) { struct xenvif *vif = netdev_priv(dev); struct xenvif_queue *queue = NULL; - unsigned long rx_bytes = 0; - unsigned long rx_packets = 0; - unsigned long tx_bytes = 0; - unsigned long tx_packets = 0; + u64 rx_bytes = 0; + u64 rx_packets = 0; + u64 tx_bytes = 0; + u64 tx_packets = 0; unsigned int index; spin_lock(&vif->lock); From 4872e57c812dd312bf8193b5933fa60585cda42f Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Sat, 11 Feb 2017 00:38:57 +0100 Subject: [PATCH 31/56] NET: Fix /proc/net/arp for AX.25 When sending ARP requests over AX.25 links the hwaddress in the neighbour cache are not getting initialized. For such an incomplete arp entry ax2asc2 will generate an empty string resulting in /proc/net/arp output like the following: $ cat /proc/net/arp IP address HW type Flags HW address Mask Device 192.168.122.1 0x1 0x2 52:54:00:00:5d:5f * ens3 172.20.1.99 0x3 0x0 * bpq0 The missing field will confuse the procfs parsing of arp(8) resulting in incorrect output for the device such as the following: $ arp Address HWtype HWaddress Flags Mask Iface gateway ether 52:54:00:00:5d:5f C ens3 172.20.1.99 (incomplete) ens3 This changes the content of /proc/net/arp to: $ cat /proc/net/arp IP address HW type Flags HW address Mask Device 172.20.1.99 0x3 0x0 * * bpq0 192.168.122.1 0x1 0x2 52:54:00:00:5d:5f * ens3 To do so it change ax2asc to put the string "*" in buf for a NULL address argument. Finally the HW address field is left aligned in a 17 character field (the length of an ethernet HW address in the usual hex notation) for readability. Signed-off-by: Ralf Baechle Signed-off-by: David S. Miller --- net/ipv4/arp.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 89a8cac4726a..51b27ae09fbd 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1263,7 +1263,7 @@ void __init arp_init(void) /* * ax25 -> ASCII conversion */ -static char *ax2asc2(ax25_address *a, char *buf) +static void ax2asc2(ax25_address *a, char *buf) { char c, *s; int n; @@ -1285,10 +1285,10 @@ static char *ax2asc2(ax25_address *a, char *buf) *s++ = n + '0'; *s++ = '\0'; - if (*buf == '\0' || *buf == '-') - return "*"; - - return buf; + if (*buf == '\0' || *buf == '-') { + buf[0] = '*'; + buf[1] = '\0'; + } } #endif /* CONFIG_AX25 */ @@ -1322,7 +1322,7 @@ static void arp_format_neigh_entry(struct seq_file *seq, } #endif sprintf(tbuf, "%pI4", n->primary_key); - seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n", + seq_printf(seq, "%-16s 0x%-10x0x%-10x%-17s * %s\n", tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name); read_unlock(&n->lock); } From 6a25478077d987edc5e2f880590a2bc5fcab4441 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 11 Feb 2017 19:26:45 +0800 Subject: [PATCH 32/56] gfs2: Use rhashtable walk interface in glock_hash_walk The function glock_hash_walk walks the rhashtable by hand. This is broken because if it catches the hash table in the middle of a rehash, then it will miss entries. This patch replaces the manual walk by using the rhashtable walk interface. Fixes: 88ffbf3e037e ("GFS2: Use resizable hash table for glocks") Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- fs/gfs2/glock.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 94f50cac91c6..70e94170af85 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1420,26 +1420,32 @@ static struct shrinker glock_shrinker = { * @sdp: the filesystem * @bucket: the bucket * + * Note that the function can be called multiple times on the same + * object. So the user must ensure that the function can cope with + * that. */ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) { struct gfs2_glock *gl; - struct rhash_head *pos; - const struct bucket_table *tbl; - int i; + struct rhashtable_iter iter; - rcu_read_lock(); - tbl = rht_dereference_rcu(gl_hash_table.tbl, &gl_hash_table); - for (i = 0; i < tbl->size; i++) { - rht_for_each_entry_rcu(gl, pos, tbl, i, gl_node) { + rhashtable_walk_enter(&gl_hash_table, &iter); + + do { + gl = ERR_PTR(rhashtable_walk_start(&iter)); + if (gl) + continue; + + while ((gl = rhashtable_walk_next(&iter)) && !IS_ERR(gl)) if ((gl->gl_name.ln_sbd == sdp) && lockref_get_not_dead(&gl->gl_lockref)) examiner(gl); - } - } - rcu_read_unlock(); - cond_resched(); + + rhashtable_walk_stop(&iter); + } while (cond_resched(), gl == ERR_PTR(-EAGAIN)); + + rhashtable_walk_exit(&iter); } /** From 9dbbfb0ab6680c6a85609041011484e6658e7d3c Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 11 Feb 2017 19:26:46 +0800 Subject: [PATCH 33/56] tipc: Fix tipc_sk_reinit race conditions There are two problems with the function tipc_sk_reinit. Firstly it's doing a manual walk over an rhashtable. This is broken as an rhashtable can be resized and if you manually walk over it during a resize then you may miss entries. Secondly it's missing memory barriers as previously the code used spinlocks which provide the barriers implicitly. This patch fixes both problems. Fixes: 07f6c4bc048a ("tipc: convert tipc reference table to...") Signed-off-by: Herbert Xu Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/net.c | 4 ++++ net/tipc/socket.c | 30 +++++++++++++++++++----------- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/net/tipc/net.c b/net/tipc/net.c index 28bf4feeb81c..ab8a2d5d1e32 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -110,6 +110,10 @@ int tipc_net_start(struct net *net, u32 addr) char addr_string[16]; tn->own_addr = addr; + + /* Ensure that the new address is visible before we reinit. */ + smp_mb(); + tipc_named_reinit(net); tipc_sk_reinit(net); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 800caaa699a1..370a5912bcb5 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -384,8 +384,6 @@ static int tipc_sk_create(struct net *net, struct socket *sock, INIT_LIST_HEAD(&tsk->publications); msg = &tsk->phdr; tn = net_generic(sock_net(sk), tipc_net_id); - tipc_msg_init(tn->own_addr, msg, TIPC_LOW_IMPORTANCE, TIPC_NAMED_MSG, - NAMED_H_SIZE, 0); /* Finish initializing socket data structures */ sock->ops = ops; @@ -395,6 +393,13 @@ static int tipc_sk_create(struct net *net, struct socket *sock, pr_warn("Socket create failed; port number exhausted\n"); return -EINVAL; } + + /* Ensure tsk is visible before we read own_addr. */ + smp_mb(); + + tipc_msg_init(tn->own_addr, msg, TIPC_LOW_IMPORTANCE, TIPC_NAMED_MSG, + NAMED_H_SIZE, 0); + msg_set_origport(msg, tsk->portid); setup_timer(&sk->sk_timer, tipc_sk_timeout, (unsigned long)tsk); sk->sk_shutdown = 0; @@ -2269,24 +2274,27 @@ static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope, void tipc_sk_reinit(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); - const struct bucket_table *tbl; - struct rhash_head *pos; + struct rhashtable_iter iter; struct tipc_sock *tsk; struct tipc_msg *msg; - int i; - rcu_read_lock(); - tbl = rht_dereference_rcu((&tn->sk_rht)->tbl, &tn->sk_rht); - for (i = 0; i < tbl->size; i++) { - rht_for_each_entry_rcu(tsk, pos, tbl, i, node) { + rhashtable_walk_enter(&tn->sk_rht, &iter); + + do { + tsk = ERR_PTR(rhashtable_walk_start(&iter)); + if (tsk) + continue; + + while ((tsk = rhashtable_walk_next(&iter)) && !IS_ERR(tsk)) { spin_lock_bh(&tsk->sk.sk_lock.slock); msg = &tsk->phdr; msg_set_prevnode(msg, tn->own_addr); msg_set_orignode(msg, tn->own_addr); spin_unlock_bh(&tsk->sk.sk_lock.slock); } - } - rcu_read_unlock(); + + rhashtable_walk_stop(&iter); + } while (tsk == ERR_PTR(-EAGAIN)); } static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid) From 40137906c5f55c252194ef5834130383e639536f Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 11 Feb 2017 19:26:47 +0800 Subject: [PATCH 34/56] rhashtable: Add nested tables This patch adds code that handles GFP_ATOMIC kmalloc failure on insertion. As we cannot use vmalloc, we solve it by making our hash table nested. That is, we allocate single pages at each level and reach our desired table size by nesting them. When a nested table is created, only a single page is allocated at the top-level. Lower levels are allocated on demand during insertion. Therefore for each insertion to succeed, only two (non-consecutive) pages are needed. After a nested table is created, a rehash will be scheduled in order to switch to a vmalloced table as soon as possible. Also, the rehash code will never rehash into a nested table. If we detect a nested table during a rehash, the rehash will be aborted and a new rehash will be scheduled. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 78 ++++++++--- lib/rhashtable.c | 270 ++++++++++++++++++++++++++++++------- 2 files changed, 276 insertions(+), 72 deletions(-) diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 5c132d3188be..f2e12a845910 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -61,6 +61,7 @@ struct rhlist_head { /** * struct bucket_table - Table of hash buckets * @size: Number of hash buckets + * @nest: Number of bits of first-level nested table. * @rehash: Current bucket being rehashed * @hash_rnd: Random seed to fold into hash * @locks_mask: Mask to apply before accessing locks[] @@ -68,10 +69,12 @@ struct rhlist_head { * @walkers: List of active walkers * @rcu: RCU structure for freeing the table * @future_tbl: Table under construction during rehashing + * @ntbl: Nested table used when out of memory. * @buckets: size * hash buckets */ struct bucket_table { unsigned int size; + unsigned int nest; unsigned int rehash; u32 hash_rnd; unsigned int locks_mask; @@ -81,7 +84,7 @@ struct bucket_table { struct bucket_table __rcu *future_tbl; - struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp; + struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp; }; /** @@ -374,6 +377,12 @@ void rhashtable_free_and_destroy(struct rhashtable *ht, void *arg); void rhashtable_destroy(struct rhashtable *ht); +struct rhash_head __rcu **rht_bucket_nested(const struct bucket_table *tbl, + unsigned int hash); +struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht, + struct bucket_table *tbl, + unsigned int hash); + #define rht_dereference(p, ht) \ rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht)) @@ -389,6 +398,27 @@ void rhashtable_destroy(struct rhashtable *ht); #define rht_entry(tpos, pos, member) \ ({ tpos = container_of(pos, typeof(*tpos), member); 1; }) +static inline struct rhash_head __rcu *const *rht_bucket( + const struct bucket_table *tbl, unsigned int hash) +{ + return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) : + &tbl->buckets[hash]; +} + +static inline struct rhash_head __rcu **rht_bucket_var( + struct bucket_table *tbl, unsigned int hash) +{ + return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) : + &tbl->buckets[hash]; +} + +static inline struct rhash_head __rcu **rht_bucket_insert( + struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash) +{ + return unlikely(tbl->nest) ? rht_bucket_nested_insert(ht, tbl, hash) : + &tbl->buckets[hash]; +} + /** * rht_for_each_continue - continue iterating over hash chain * @pos: the &struct rhash_head to use as a loop cursor. @@ -408,7 +438,7 @@ void rhashtable_destroy(struct rhashtable *ht); * @hash: the hash value / bucket index */ #define rht_for_each(pos, tbl, hash) \ - rht_for_each_continue(pos, (tbl)->buckets[hash], tbl, hash) + rht_for_each_continue(pos, *rht_bucket(tbl, hash), tbl, hash) /** * rht_for_each_entry_continue - continue iterating over hash chain @@ -433,7 +463,7 @@ void rhashtable_destroy(struct rhashtable *ht); * @member: name of the &struct rhash_head within the hashable struct. */ #define rht_for_each_entry(tpos, pos, tbl, hash, member) \ - rht_for_each_entry_continue(tpos, pos, (tbl)->buckets[hash], \ + rht_for_each_entry_continue(tpos, pos, *rht_bucket(tbl, hash), \ tbl, hash, member) /** @@ -448,13 +478,13 @@ void rhashtable_destroy(struct rhashtable *ht); * This hash chain list-traversal primitive allows for the looped code to * remove the loop cursor from the list. */ -#define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member) \ - for (pos = rht_dereference_bucket((tbl)->buckets[hash], tbl, hash), \ - next = !rht_is_a_nulls(pos) ? \ - rht_dereference_bucket(pos->next, tbl, hash) : NULL; \ - (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ - pos = next, \ - next = !rht_is_a_nulls(pos) ? \ +#define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member) \ + for (pos = rht_dereference_bucket(*rht_bucket(tbl, hash), tbl, hash), \ + next = !rht_is_a_nulls(pos) ? \ + rht_dereference_bucket(pos->next, tbl, hash) : NULL; \ + (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ + pos = next, \ + next = !rht_is_a_nulls(pos) ? \ rht_dereference_bucket(pos->next, tbl, hash) : NULL) /** @@ -485,7 +515,7 @@ void rhashtable_destroy(struct rhashtable *ht); * traversal is guarded by rcu_read_lock(). */ #define rht_for_each_rcu(pos, tbl, hash) \ - rht_for_each_rcu_continue(pos, (tbl)->buckets[hash], tbl, hash) + rht_for_each_rcu_continue(pos, *rht_bucket(tbl, hash), tbl, hash) /** * rht_for_each_entry_rcu_continue - continue iterating over rcu hash chain @@ -518,8 +548,8 @@ void rhashtable_destroy(struct rhashtable *ht); * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ -#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \ - rht_for_each_entry_rcu_continue(tpos, pos, (tbl)->buckets[hash],\ +#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \ + rht_for_each_entry_rcu_continue(tpos, pos, *rht_bucket(tbl, hash), \ tbl, hash, member) /** @@ -565,7 +595,7 @@ static inline struct rhash_head *__rhashtable_lookup( .ht = ht, .key = key, }; - const struct bucket_table *tbl; + struct bucket_table *tbl; struct rhash_head *he; unsigned int hash; @@ -697,8 +727,12 @@ slow_path: } elasticity = ht->elasticity; - pprev = &tbl->buckets[hash]; - rht_for_each(head, tbl, hash) { + pprev = rht_bucket_insert(ht, tbl, hash); + data = ERR_PTR(-ENOMEM); + if (!pprev) + goto out; + + rht_for_each_continue(head, *pprev, tbl, hash) { struct rhlist_head *plist; struct rhlist_head *list; @@ -736,7 +770,7 @@ slow_path: if (unlikely(rht_grow_above_100(ht, tbl))) goto slow_path; - head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); + head = rht_dereference_bucket(*pprev, tbl, hash); RCU_INIT_POINTER(obj->next, head); if (rhlist) { @@ -746,7 +780,7 @@ slow_path: RCU_INIT_POINTER(list->next, NULL); } - rcu_assign_pointer(tbl->buckets[hash], obj); + rcu_assign_pointer(*pprev, obj); atomic_inc(&ht->nelems); if (rht_grow_above_75(ht, tbl)) @@ -955,8 +989,8 @@ static inline int __rhashtable_remove_fast_one( spin_lock_bh(lock); - pprev = &tbl->buckets[hash]; - rht_for_each(he, tbl, hash) { + pprev = rht_bucket_var(tbl, hash); + rht_for_each_continue(he, *pprev, tbl, hash) { struct rhlist_head *list; list = container_of(he, struct rhlist_head, rhead); @@ -1107,8 +1141,8 @@ static inline int __rhashtable_replace_fast( spin_lock_bh(lock); - pprev = &tbl->buckets[hash]; - rht_for_each(he, tbl, hash) { + pprev = rht_bucket_var(tbl, hash); + rht_for_each_continue(he, *pprev, tbl, hash) { if (he != obj_old) { pprev = &he->next; continue; diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 32d0ad058380..172454e6b979 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -32,6 +32,11 @@ #define HASH_MIN_SIZE 4U #define BUCKET_LOCKS_PER_CPU 32UL +union nested_table { + union nested_table __rcu *table; + struct rhash_head __rcu *bucket; +}; + static u32 head_hashfn(struct rhashtable *ht, const struct bucket_table *tbl, const struct rhash_head *he) @@ -76,6 +81,9 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl, /* Never allocate more than 0.5 locks per bucket */ size = min_t(unsigned int, size, tbl->size >> 1); + if (tbl->nest) + size = min(size, 1U << tbl->nest); + if (sizeof(spinlock_t) != 0) { tbl->locks = NULL; #ifdef CONFIG_NUMA @@ -99,8 +107,45 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl, return 0; } +static void nested_table_free(union nested_table *ntbl, unsigned int size) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + const unsigned int len = 1 << shift; + unsigned int i; + + ntbl = rcu_dereference_raw(ntbl->table); + if (!ntbl) + return; + + if (size > len) { + size >>= shift; + for (i = 0; i < len; i++) + nested_table_free(ntbl + i, size); + } + + kfree(ntbl); +} + +static void nested_bucket_table_free(const struct bucket_table *tbl) +{ + unsigned int size = tbl->size >> tbl->nest; + unsigned int len = 1 << tbl->nest; + union nested_table *ntbl; + unsigned int i; + + ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]); + + for (i = 0; i < len; i++) + nested_table_free(ntbl + i, size); + + kfree(ntbl); +} + static void bucket_table_free(const struct bucket_table *tbl) { + if (tbl->nest) + nested_bucket_table_free(tbl); + if (tbl) kvfree(tbl->locks); @@ -112,6 +157,59 @@ static void bucket_table_free_rcu(struct rcu_head *head) bucket_table_free(container_of(head, struct bucket_table, rcu)); } +static union nested_table *nested_table_alloc(struct rhashtable *ht, + union nested_table __rcu **prev, + unsigned int shifted, + unsigned int nhash) +{ + union nested_table *ntbl; + int i; + + ntbl = rcu_dereference(*prev); + if (ntbl) + return ntbl; + + ntbl = kzalloc(PAGE_SIZE, GFP_ATOMIC); + + if (ntbl && shifted) { + for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0].bucket); i++) + INIT_RHT_NULLS_HEAD(ntbl[i].bucket, ht, + (i << shifted) | nhash); + } + + rcu_assign_pointer(*prev, ntbl); + + return ntbl; +} + +static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht, + size_t nbuckets, + gfp_t gfp) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + struct bucket_table *tbl; + size_t size; + + if (nbuckets < (1 << (shift + 1))) + return NULL; + + size = sizeof(*tbl) + sizeof(tbl->buckets[0]); + + tbl = kzalloc(size, gfp); + if (!tbl) + return NULL; + + if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets, + 0, 0)) { + kfree(tbl); + return NULL; + } + + tbl->nest = (ilog2(nbuckets) - 1) % shift + 1; + + return tbl; +} + static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, size_t nbuckets, gfp_t gfp) @@ -126,10 +224,17 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY); if (tbl == NULL && gfp == GFP_KERNEL) tbl = vzalloc(size); + + size = nbuckets; + + if (tbl == NULL && gfp != GFP_KERNEL) { + tbl = nested_bucket_table_alloc(ht, nbuckets, gfp); + nbuckets = 0; + } if (tbl == NULL) return NULL; - tbl->size = nbuckets; + tbl->size = size; if (alloc_bucket_locks(ht, tbl, gfp) < 0) { bucket_table_free(tbl); @@ -164,12 +269,17 @@ static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash) struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); struct bucket_table *new_tbl = rhashtable_last_table(ht, rht_dereference_rcu(old_tbl->future_tbl, ht)); - struct rhash_head __rcu **pprev = &old_tbl->buckets[old_hash]; - int err = -ENOENT; + struct rhash_head __rcu **pprev = rht_bucket_var(old_tbl, old_hash); + int err = -EAGAIN; struct rhash_head *head, *next, *entry; spinlock_t *new_bucket_lock; unsigned int new_hash; + if (new_tbl->nest) + goto out; + + err = -ENOENT; + rht_for_each(entry, old_tbl, old_hash) { err = 0; next = rht_dereference_bucket(entry->next, old_tbl, old_hash); @@ -202,19 +312,26 @@ out: return err; } -static void rhashtable_rehash_chain(struct rhashtable *ht, +static int rhashtable_rehash_chain(struct rhashtable *ht, unsigned int old_hash) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); spinlock_t *old_bucket_lock; + int err; old_bucket_lock = rht_bucket_lock(old_tbl, old_hash); spin_lock_bh(old_bucket_lock); - while (!rhashtable_rehash_one(ht, old_hash)) + while (!(err = rhashtable_rehash_one(ht, old_hash))) ; - old_tbl->rehash++; + + if (err == -ENOENT) { + old_tbl->rehash++; + err = 0; + } spin_unlock_bh(old_bucket_lock); + + return err; } static int rhashtable_rehash_attach(struct rhashtable *ht, @@ -246,13 +363,17 @@ static int rhashtable_rehash_table(struct rhashtable *ht) struct bucket_table *new_tbl; struct rhashtable_walker *walker; unsigned int old_hash; + int err; new_tbl = rht_dereference(old_tbl->future_tbl, ht); if (!new_tbl) return 0; - for (old_hash = 0; old_hash < old_tbl->size; old_hash++) - rhashtable_rehash_chain(ht, old_hash); + for (old_hash = 0; old_hash < old_tbl->size; old_hash++) { + err = rhashtable_rehash_chain(ht, old_hash); + if (err) + return err; + } /* Publish the new table pointer. */ rcu_assign_pointer(ht->tbl, new_tbl); @@ -271,31 +392,16 @@ static int rhashtable_rehash_table(struct rhashtable *ht) return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0; } -/** - * rhashtable_expand - Expand hash table while allowing concurrent lookups - * @ht: the hash table to expand - * - * A secondary bucket array is allocated and the hash entries are migrated. - * - * This function may only be called in a context where it is safe to call - * synchronize_rcu(), e.g. not within a rcu_read_lock() section. - * - * The caller must ensure that no concurrent resizing occurs by holding - * ht->mutex. - * - * It is valid to have concurrent insertions and deletions protected by per - * bucket locks or concurrent RCU protected lookups and traversals. - */ -static int rhashtable_expand(struct rhashtable *ht) +static int rhashtable_rehash_alloc(struct rhashtable *ht, + struct bucket_table *old_tbl, + unsigned int size) { - struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); + struct bucket_table *new_tbl; int err; ASSERT_RHT_MUTEX(ht); - old_tbl = rhashtable_last_table(ht, old_tbl); - - new_tbl = bucket_table_alloc(ht, old_tbl->size * 2, GFP_KERNEL); + new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL); if (new_tbl == NULL) return -ENOMEM; @@ -324,12 +430,9 @@ static int rhashtable_expand(struct rhashtable *ht) */ static int rhashtable_shrink(struct rhashtable *ht) { - struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); + struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); unsigned int nelems = atomic_read(&ht->nelems); unsigned int size = 0; - int err; - - ASSERT_RHT_MUTEX(ht); if (nelems) size = roundup_pow_of_two(nelems * 3 / 2); @@ -342,15 +445,7 @@ static int rhashtable_shrink(struct rhashtable *ht) if (rht_dereference(old_tbl->future_tbl, ht)) return -EEXIST; - new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL); - if (new_tbl == NULL) - return -ENOMEM; - - err = rhashtable_rehash_attach(ht, old_tbl, new_tbl); - if (err) - bucket_table_free(new_tbl); - - return err; + return rhashtable_rehash_alloc(ht, old_tbl, size); } static void rht_deferred_worker(struct work_struct *work) @@ -366,11 +461,14 @@ static void rht_deferred_worker(struct work_struct *work) tbl = rhashtable_last_table(ht, tbl); if (rht_grow_above_75(ht, tbl)) - rhashtable_expand(ht); + err = rhashtable_rehash_alloc(ht, tbl, tbl->size * 2); else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl)) - rhashtable_shrink(ht); + err = rhashtable_shrink(ht); + else if (tbl->nest) + err = rhashtable_rehash_alloc(ht, tbl, tbl->size); - err = rhashtable_rehash_table(ht); + if (!err) + err = rhashtable_rehash_table(ht); mutex_unlock(&ht->mutex); @@ -439,8 +537,8 @@ static void *rhashtable_lookup_one(struct rhashtable *ht, int elasticity; elasticity = ht->elasticity; - pprev = &tbl->buckets[hash]; - rht_for_each(head, tbl, hash) { + pprev = rht_bucket_var(tbl, hash); + rht_for_each_continue(head, *pprev, tbl, hash) { struct rhlist_head *list; struct rhlist_head *plist; @@ -477,6 +575,7 @@ static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht, struct rhash_head *obj, void *data) { + struct rhash_head __rcu **pprev; struct bucket_table *new_tbl; struct rhash_head *head; @@ -499,7 +598,11 @@ static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht, if (unlikely(rht_grow_above_100(ht, tbl))) return ERR_PTR(-EAGAIN); - head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); + pprev = rht_bucket_insert(ht, tbl, hash); + if (!pprev) + return ERR_PTR(-ENOMEM); + + head = rht_dereference_bucket(*pprev, tbl, hash); RCU_INIT_POINTER(obj->next, head); if (ht->rhlist) { @@ -509,7 +612,7 @@ static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht, RCU_INIT_POINTER(list->next, NULL); } - rcu_assign_pointer(tbl->buckets[hash], obj); + rcu_assign_pointer(*pprev, obj); atomic_inc(&ht->nelems); if (rht_grow_above_75(ht, tbl)) @@ -975,7 +1078,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht, void (*free_fn)(void *ptr, void *arg), void *arg) { - const struct bucket_table *tbl; + struct bucket_table *tbl; unsigned int i; cancel_work_sync(&ht->run_work); @@ -986,7 +1089,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht, for (i = 0; i < tbl->size; i++) { struct rhash_head *pos, *next; - for (pos = rht_dereference(tbl->buckets[i], ht), + for (pos = rht_dereference(*rht_bucket(tbl, i), ht), next = !rht_is_a_nulls(pos) ? rht_dereference(pos->next, ht) : NULL; !rht_is_a_nulls(pos); @@ -1007,3 +1110,70 @@ void rhashtable_destroy(struct rhashtable *ht) return rhashtable_free_and_destroy(ht, NULL, NULL); } EXPORT_SYMBOL_GPL(rhashtable_destroy); + +struct rhash_head __rcu **rht_bucket_nested(const struct bucket_table *tbl, + unsigned int hash) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + static struct rhash_head __rcu *rhnull = + (struct rhash_head __rcu *)NULLS_MARKER(0); + unsigned int index = hash & ((1 << tbl->nest) - 1); + unsigned int size = tbl->size >> tbl->nest; + unsigned int subhash = hash; + union nested_table *ntbl; + + ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]); + ntbl = rht_dereference_bucket(ntbl[index].table, tbl, hash); + subhash >>= tbl->nest; + + while (ntbl && size > (1 << shift)) { + index = subhash & ((1 << shift) - 1); + ntbl = rht_dereference_bucket(ntbl[index].table, tbl, hash); + size >>= shift; + subhash >>= shift; + } + + if (!ntbl) + return &rhnull; + + return &ntbl[subhash].bucket; + +} +EXPORT_SYMBOL_GPL(rht_bucket_nested); + +struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht, + struct bucket_table *tbl, + unsigned int hash) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + unsigned int index = hash & ((1 << tbl->nest) - 1); + unsigned int size = tbl->size >> tbl->nest; + union nested_table *ntbl; + unsigned int shifted; + unsigned int nhash; + + ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]); + hash >>= tbl->nest; + nhash = index; + shifted = tbl->nest; + ntbl = nested_table_alloc(ht, &ntbl[index].table, + size <= (1 << shift) ? shifted : 0, nhash); + + while (ntbl && size > (1 << shift)) { + index = hash & ((1 << shift) - 1); + size >>= shift; + hash >>= shift; + nhash |= index << shifted; + shifted += shift; + ntbl = nested_table_alloc(ht, &ntbl[index].table, + size <= (1 << shift) ? shifted : 0, + nhash); + } + + if (!ntbl) + return NULL; + + return &ntbl[hash].bucket; + +} +EXPORT_SYMBOL_GPL(rht_bucket_nested_insert); From fed06ee89b78d3af32e235e0e89ad0d946fcb95d Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Sun, 12 Feb 2017 11:21:31 +0200 Subject: [PATCH 35/56] net/mlx5e: Disable preemption when doing TC statistics upcall When called by HW offloading drivers, the TC action (e.g net/sched/act_mirred.c) code uses this_cpu logic, e.g _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets) per the kernel documention, preemption should be disabled, add that. Before the fix, when running with CONFIG_PREEMPT set, we get a BUG: using smp_processor_id() in preemptible [00000000] code: tc/3793 asserion from the TC action (mirred) stats_update callback. Fixes: aad7e08d39bd ('net/mlx5e: Hardware offloaded flower filter statistics support') Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index c5282b6aba8b..2ebbe80d8126 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1087,10 +1087,14 @@ int mlx5e_stats_flower(struct mlx5e_priv *priv, mlx5_fc_query_cached(counter, &bytes, &packets, &lastuse); + preempt_disable(); + tcf_exts_to_list(f->exts, &actions); list_for_each_entry(a, &actions, list) tcf_action_stats_update(a, bytes, packets, lastuse); + preempt_enable(); + return 0; } From ec5e3b0a1d41fbda0cc33a45bc9e54e91d9d12c7 Mon Sep 17 00:00:00 2001 From: "Jonathan T. Leighton" Date: Sun, 12 Feb 2017 17:26:06 -0500 Subject: [PATCH 36/56] ipv6: Inhibit IPv4-mapped src address on the wire. This patch adds a check for the problematic case of an IPv4-mapped IPv6 source address and a destination address that is neither an IPv4-mapped IPv6 address nor in6addr_any, and returns an appropriate error. The check in done before returning from looking up the route. Signed-off-by: Jonathan T. Leighton Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index b6a94ff0bbd0..e164684456df 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1021,6 +1021,9 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, } } #endif + if (ipv6_addr_v4mapped(&fl6->saddr) && + !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) + return -EAFNOSUPPORT; return 0; From 052d2369d1b479cdbbe020fdd6d057d3c342db74 Mon Sep 17 00:00:00 2001 From: "Jonathan T. Leighton" Date: Sun, 12 Feb 2017 17:26:07 -0500 Subject: [PATCH 37/56] ipv6: Handle IPv4-mapped src to in6addr_any dst. This patch adds a check on the type of the source address for the case where the destination address is in6addr_any. If the source is an IPv4-mapped IPv6 source address, the destination is changed to ::ffff:127.0.0.1, and otherwise the destination is changed to ::1. This is done in three locations to handle UDP calls to either connect() or sendmsg() and TCP calls to connect(). Note that udpv6_sendmsg() delays handling an in6addr_any destination until very late, so the patch only needs to handle the case where the source is an IPv4-mapped IPv6 address. Signed-off-by: Jonathan T. Leighton Signed-off-by: David S. Miller --- net/ipv6/datagram.c | 14 +++++++++----- net/ipv6/tcp_ipv6.c | 11 ++++++++--- net/ipv6/udp.c | 4 ++++ 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index a3eaafd87100..eec27f87efac 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -167,18 +167,22 @@ int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, if (np->sndflow) fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK; - addr_type = ipv6_addr_type(&usin->sin6_addr); - - if (addr_type == IPV6_ADDR_ANY) { + if (ipv6_addr_any(&usin->sin6_addr)) { /* * connect to self */ - usin->sin6_addr.s6_addr[15] = 0x01; + if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) + ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK), + &usin->sin6_addr); + else + usin->sin6_addr = in6addr_loopback; } + addr_type = ipv6_addr_type(&usin->sin6_addr); + daddr = &usin->sin6_addr; - if (addr_type == IPV6_ADDR_MAPPED) { + if (addr_type & IPV6_ADDR_MAPPED) { struct sockaddr_in sin; if (__ipv6_only_sock(sk)) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index eaad72c3d746..4c60c6f71cd3 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -148,8 +148,13 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, * connect() to INADDR_ANY means loopback (BSD'ism). */ - if (ipv6_addr_any(&usin->sin6_addr)) - usin->sin6_addr.s6_addr[15] = 0x1; + if (ipv6_addr_any(&usin->sin6_addr)) { + if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) + ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK), + &usin->sin6_addr); + else + usin->sin6_addr = in6addr_loopback; + } addr_type = ipv6_addr_type(&usin->sin6_addr); @@ -188,7 +193,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, * TCP over IPv4 */ - if (addr_type == IPV6_ADDR_MAPPED) { + if (addr_type & IPV6_ADDR_MAPPED) { u32 exthdrlen = icsk->icsk_ext_hdr_len; struct sockaddr_in sin; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 8990856f5101..221825a9407a 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1033,6 +1033,10 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; daddr = &sin6->sin6_addr; + if (ipv6_addr_any(daddr) && + ipv6_addr_v4mapped(&np->saddr)) + ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK), + daddr); break; case AF_INET: goto do_udp_sendmsg; From 01f8902bcf3ff124d0aeb88a774180ebcec20ace Mon Sep 17 00:00:00 2001 From: Rui Sousa Date: Mon, 13 Feb 2017 10:01:25 +0800 Subject: [PATCH 38/56] net: fec: fix multicast filtering hardware setup Fix hardware setup of multicast address hash: - Never clear the hardware hash (to avoid packet loss) - Construct the hash register values in software and then write once to hardware Signed-off-by: Rui Sousa Signed-off-by: Fugang Duan Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fec_main.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 38160c2bebcb..8be7034b2e7b 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -2910,6 +2910,7 @@ static void set_multicast_list(struct net_device *ndev) struct netdev_hw_addr *ha; unsigned int i, bit, data, crc, tmp; unsigned char hash; + unsigned int hash_high = 0, hash_low = 0; if (ndev->flags & IFF_PROMISC) { tmp = readl(fep->hwp + FEC_R_CNTRL); @@ -2932,11 +2933,7 @@ static void set_multicast_list(struct net_device *ndev) return; } - /* Clear filter and add the addresses in hash register - */ - writel(0, fep->hwp + FEC_GRP_HASH_TABLE_HIGH); - writel(0, fep->hwp + FEC_GRP_HASH_TABLE_LOW); - + /* Add the addresses in hash register */ netdev_for_each_mc_addr(ha, ndev) { /* calculate crc32 value of mac address */ crc = 0xffffffff; @@ -2954,16 +2951,14 @@ static void set_multicast_list(struct net_device *ndev) */ hash = (crc >> (32 - FEC_HASH_BITS)) & 0x3f; - if (hash > 31) { - tmp = readl(fep->hwp + FEC_GRP_HASH_TABLE_HIGH); - tmp |= 1 << (hash - 32); - writel(tmp, fep->hwp + FEC_GRP_HASH_TABLE_HIGH); - } else { - tmp = readl(fep->hwp + FEC_GRP_HASH_TABLE_LOW); - tmp |= 1 << hash; - writel(tmp, fep->hwp + FEC_GRP_HASH_TABLE_LOW); - } + if (hash > 31) + hash_high |= 1 << (hash - 32); + else + hash_low |= 1 << hash; } + + writel(hash_high, fep->hwp + FEC_GRP_HASH_TABLE_HIGH); + writel(hash_low, fep->hwp + FEC_GRP_HASH_TABLE_LOW); } /* Set a MAC change in hardware. */ From cd27b96bc13841ee7af25837a6ae86fee87273d6 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 13 Feb 2017 11:13:16 -0800 Subject: [PATCH 39/56] kcm: fix a null pointer dereference in kcm_sendmsg() In commit 98e3862ca2b1 ("kcm: fix 0-length case for kcm_sendmsg()") I tried to avoid skb allocation for 0-length case, but missed a check for NULL pointer in the non EOR case. Fixes: 98e3862ca2b1 ("kcm: fix 0-length case for kcm_sendmsg()") Reported-by: Dmitry Vyukov Cc: Tom Herbert Signed-off-by: Cong Wang Acked-by: Tom Herbert Signed-off-by: David S. Miller --- net/kcm/kcmsock.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 64f0e8531af0..a646f3481240 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1044,8 +1044,10 @@ wait_for_memory: } else { /* Message not complete, save state */ partial_message: - kcm->seq_skb = head; - kcm_tx_msg(head)->last_skb = skb; + if (head) { + kcm->seq_skb = head; + kcm_tx_msg(head)->last_skb = skb; + } } KCM_STATS_ADD(kcm->stats.tx_bytes, copied); From a60ced990e309666915d21445e95347d12406694 Mon Sep 17 00:00:00 2001 From: Ivan Khoronzhuk Date: Tue, 14 Feb 2017 14:42:15 +0200 Subject: [PATCH 40/56] net: ethernet: ti: cpsw: fix cpsw assignment in resume There is a copy-paste error, which hides breaking of resume for CPSW driver: there was replaced netdev_priv() to ndev_to_cpsw(ndev) in suspend, but left it unchanged in resume. Fixes: 606f39939595a4d4540406bfc11f265b2036af6d (ti: cpsw: move platform data and slaves info to cpsw_common) Reported-by: Alexey Starikovskiy Signed-off-by: Ivan Khoronzhuk Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpsw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index b203143647e6..65088224c207 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -3160,7 +3160,7 @@ static int cpsw_resume(struct device *dev) { struct platform_device *pdev = to_platform_device(dev); struct net_device *ndev = platform_get_drvdata(pdev); - struct cpsw_common *cpsw = netdev_priv(ndev); + struct cpsw_common *cpsw = ndev_to_cpsw(ndev); /* Select default pin state */ pinctrl_pm_select_default_state(dev); From f39f0d1e1e93145a0e91d9a7a639c42fd037ecc3 Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Tue, 14 Feb 2017 10:22:59 -0600 Subject: [PATCH 41/56] ibmvnic: Fix initial MTU settings In the current driver, the MTU is set to the maximum value capable for the backing device. This decision turned out to be a mistake as it led to confusion among users. The expected initial MTU value used for other IBM vNIC capable operating systems is 1500, with the maximum value (9000) reserved for when Jumbo frames are enabled. This patch sets the MTU to the default value for a net device. It also corrects a discrepancy between MTU values received from firmware, which includes the ethernet header length, and net device MTU values. Finally, it removes redundant min/max MTU assignments after device initialization. Signed-off-by: Thomas Falcon Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 752b0822b020..5b66b4fd1767 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1497,7 +1497,7 @@ static void init_sub_crqs(struct ibmvnic_adapter *adapter, int retry) adapter->req_rx_queues = adapter->opt_rx_comp_queues; adapter->req_rx_add_queues = adapter->max_rx_add_queues; - adapter->req_mtu = adapter->max_mtu; + adapter->req_mtu = adapter->netdev->mtu + ETH_HLEN; } total_queues = adapter->req_tx_queues + adapter->req_rx_queues; @@ -2627,12 +2627,12 @@ static void handle_query_cap_rsp(union ibmvnic_crq *crq, break; case MIN_MTU: adapter->min_mtu = be64_to_cpu(crq->query_capability.number); - netdev->min_mtu = adapter->min_mtu; + netdev->min_mtu = adapter->min_mtu - ETH_HLEN; netdev_dbg(netdev, "min_mtu = %lld\n", adapter->min_mtu); break; case MAX_MTU: adapter->max_mtu = be64_to_cpu(crq->query_capability.number); - netdev->max_mtu = adapter->max_mtu; + netdev->max_mtu = adapter->max_mtu - ETH_HLEN; netdev_dbg(netdev, "max_mtu = %lld\n", adapter->max_mtu); break; case MAX_MULTICAST_FILTERS: @@ -3657,9 +3657,7 @@ static void handle_crq_init_rsp(struct work_struct *work) goto task_failed; netdev->real_num_tx_queues = adapter->req_tx_queues; - netdev->mtu = adapter->req_mtu; - netdev->min_mtu = adapter->min_mtu; - netdev->max_mtu = adapter->max_mtu; + netdev->mtu = adapter->req_mtu - ETH_HLEN; if (adapter->failover) { adapter->failover = false; @@ -3799,7 +3797,7 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) } netdev->real_num_tx_queues = adapter->req_tx_queues; - netdev->mtu = adapter->req_mtu; + netdev->mtu = adapter->req_mtu - ETH_HLEN; rc = register_netdev(netdev); if (rc) { From d199fab63c11998a602205f7ee7ff7c05c97164b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Feb 2017 09:03:51 -0800 Subject: [PATCH 42/56] packet: fix races in fanout_add() Multiple threads can call fanout_add() at the same time. We need to grab fanout_mutex earlier to avoid races that could lead to one thread freeing po->rollover that was set by another thread. Do the same in fanout_release(), for peace of mind, and to help us finding lockdep issues earlier. Fixes: dc99f600698d ("packet: Add fanout support.") Fixes: 0648ab70afe6 ("packet: rollover prepare: per-socket state") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Signed-off-by: David S. Miller --- net/packet/af_packet.c | 57 +++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index d56ee46b11fc..0f03f6a53b4d 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1619,6 +1619,7 @@ static void fanout_release_data(struct packet_fanout *f) static int fanout_add(struct sock *sk, u16 id, u16 type_flags) { + struct packet_rollover *rollover = NULL; struct packet_sock *po = pkt_sk(sk); struct packet_fanout *f, *match; u8 type = type_flags & 0xff; @@ -1641,23 +1642,28 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) return -EINVAL; } - if (!po->running) - return -EINVAL; + mutex_lock(&fanout_mutex); + err = -EINVAL; + if (!po->running) + goto out; + + err = -EALREADY; if (po->fanout) - return -EALREADY; + goto out; if (type == PACKET_FANOUT_ROLLOVER || (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) { - po->rollover = kzalloc(sizeof(*po->rollover), GFP_KERNEL); - if (!po->rollover) - return -ENOMEM; - atomic_long_set(&po->rollover->num, 0); - atomic_long_set(&po->rollover->num_huge, 0); - atomic_long_set(&po->rollover->num_failed, 0); + err = -ENOMEM; + rollover = kzalloc(sizeof(*rollover), GFP_KERNEL); + if (!rollover) + goto out; + atomic_long_set(&rollover->num, 0); + atomic_long_set(&rollover->num_huge, 0); + atomic_long_set(&rollover->num_failed, 0); + po->rollover = rollover; } - mutex_lock(&fanout_mutex); match = NULL; list_for_each_entry(f, &fanout_list, list) { if (f->id == id && @@ -1704,11 +1710,11 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) } } out: - mutex_unlock(&fanout_mutex); - if (err) { - kfree(po->rollover); + if (err && rollover) { + kfree(rollover); po->rollover = NULL; } + mutex_unlock(&fanout_mutex); return err; } @@ -1717,23 +1723,22 @@ static void fanout_release(struct sock *sk) struct packet_sock *po = pkt_sk(sk); struct packet_fanout *f; - f = po->fanout; - if (!f) - return; - mutex_lock(&fanout_mutex); - po->fanout = NULL; + f = po->fanout; + if (f) { + po->fanout = NULL; - if (atomic_dec_and_test(&f->sk_ref)) { - list_del(&f->list); - dev_remove_pack(&f->prot_hook); - fanout_release_data(f); - kfree(f); + if (atomic_dec_and_test(&f->sk_ref)) { + list_del(&f->list); + dev_remove_pack(&f->prot_hook); + fanout_release_data(f); + kfree(f); + } + + if (po->rollover) + kfree_rcu(po->rollover, rcu); } mutex_unlock(&fanout_mutex); - - if (po->rollover) - kfree_rcu(po->rollover, rcu); } static bool packet_extra_vlan_len_allowed(const struct net_device *dev, From f9c85ee67164b37f9296eab3b754e543e4e96a1c Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Feb 2017 17:47:57 -0200 Subject: [PATCH 43/56] [media] siano: make it work again with CONFIG_VMAP_STACK Reported as a Kaffeine bug: https://bugs.kde.org/show_bug.cgi?id=375811 The USB control messages require DMA to work. We cannot pass a stack-allocated buffer, as it is not warranted that the stack would be into a DMA enabled area. On Kernel 4.9, the default is to not accept DMA on stack anymore on x86 architecture. On other architectures, this has been a requirement since Kernel 2.2. So, after this patch, this driver should likely work fine on all archs. Tested with USB ID 2040:5510: Hauppauge Windham Cc: stable@vger.kernel.org Signed-off-by: Mauro Carvalho Chehab --- drivers/media/usb/siano/smsusb.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/media/usb/siano/smsusb.c b/drivers/media/usb/siano/smsusb.c index a4dcaec31d02..8c1f926567ec 100644 --- a/drivers/media/usb/siano/smsusb.c +++ b/drivers/media/usb/siano/smsusb.c @@ -218,22 +218,30 @@ static int smsusb_start_streaming(struct smsusb_device_t *dev) static int smsusb_sendrequest(void *context, void *buffer, size_t size) { struct smsusb_device_t *dev = (struct smsusb_device_t *) context; - struct sms_msg_hdr *phdr = (struct sms_msg_hdr *) buffer; - int dummy; + struct sms_msg_hdr *phdr; + int dummy, ret; if (dev->state != SMSUSB_ACTIVE) { pr_debug("Device not active yet\n"); return -ENOENT; } + phdr = kmalloc(size, GFP_KERNEL); + if (!phdr) + return -ENOMEM; + memcpy(phdr, buffer, size); + pr_debug("sending %s(%d) size: %d\n", smscore_translate_msg(phdr->msg_type), phdr->msg_type, phdr->msg_length); smsendian_handle_tx_message((struct sms_msg_data *) phdr); - smsendian_handle_message_header((struct sms_msg_hdr *)buffer); - return usb_bulk_msg(dev->udev, usb_sndbulkpipe(dev->udev, 2), - buffer, size, &dummy, 1000); + smsendian_handle_message_header((struct sms_msg_hdr *)phdr); + ret = usb_bulk_msg(dev->udev, usb_sndbulkpipe(dev->udev, 2), + phdr, size, &dummy, 1000); + + kfree(phdr); + return ret; } static char *smsusb1_fw_lkup[] = { From a725eb15db80643a160310ed6bcfd6c5a6c907f2 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Wed, 15 Feb 2017 05:23:26 +0300 Subject: [PATCH 44/56] uapi: fix linux/if_pppol2tp.h userspace compilation errors Because of interface limitations, provided by libc cannot be included after , therefore any header that includes cannot be included after . Change uapi/linux/l2tp.h, the last uapi header that includes , to include and instead of and use __SOCK_SIZE__ instead of sizeof(struct sockaddr) the same way as uapi/linux/in.h does, to fix linux/if_pppol2tp.h userspace compilation errors like this: In file included from /usr/include/linux/l2tp.h:12:0, from /usr/include/linux/if_pppol2tp.h:21, /usr/include/netinet/in.h:31:8: error: redefinition of 'struct in_addr' Fixes: 47c3e7783be4 ("net: l2tp: deprecate PPPOL2TP_MSG_* in favour of L2TP_MSG_*") Signed-off-by: Dmitry V. Levin Signed-off-by: David S. Miller --- include/uapi/linux/l2tp.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h index 85ddb74fcd1c..b23c1914a182 100644 --- a/include/uapi/linux/l2tp.h +++ b/include/uapi/linux/l2tp.h @@ -9,9 +9,8 @@ #include #include -#ifndef __KERNEL__ -#include -#endif +#include +#include #define IPPROTO_L2TP 115 @@ -31,7 +30,7 @@ struct sockaddr_l2tpip { __u32 l2tp_conn_id; /* Connection ID of tunnel */ /* Pad to size of `struct sockaddr'. */ - unsigned char __pad[sizeof(struct sockaddr) - + unsigned char __pad[__SOCK_SIZE__ - sizeof(__kernel_sa_family_t) - sizeof(__be16) - sizeof(struct in_addr) - sizeof(__u32)]; From e70ac171658679ecf6bea4bbd9e9325cd6079d2b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Feb 2017 17:11:14 -0800 Subject: [PATCH 45/56] tcp: tcp_probe: use spin_lock_bh() tcp_rcv_established() can now run in process context. We need to disable BH while acquiring tcp probe spinlock, or risk a deadlock. Fixes: 5413d1babe8f ("net: do not block BH while processing socket backlog") Signed-off-by: Eric Dumazet Reported-by: Ricardo Nabinger Sanchez Signed-off-by: David S. Miller --- net/ipv4/tcp_probe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index f6c50af24a64..3d063eb37848 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -117,7 +117,7 @@ static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, (fwmark > 0 && skb->mark == fwmark)) && (full || tp->snd_cwnd != tcp_probe.lastcwnd)) { - spin_lock(&tcp_probe.lock); + spin_lock_bh(&tcp_probe.lock); /* If log fills, just silently drop */ if (tcp_probe_avail() > 1) { struct tcp_log *p = tcp_probe.log + tcp_probe.head; @@ -157,7 +157,7 @@ static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1); } tcp_probe.lastcwnd = tp->snd_cwnd; - spin_unlock(&tcp_probe.lock); + spin_unlock_bh(&tcp_probe.lock); wake_up(&tcp_probe.wait); } From 5463b3d043826ff8ef487edbd1ef1bfffb677437 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Tue, 14 Feb 2017 08:22:20 +1100 Subject: [PATCH 46/56] bpf: kernel header files need to be copied into the tools directory Signed-off-by: Stephen Rothwell Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- tools/include/uapi/linux/bpf.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0eb0e87dbe9f..d2b0ac799d03 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -116,6 +116,12 @@ enum bpf_attach_type { #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE +/* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command + * to the given target_fd cgroup the descendent cgroup will be able to + * override effective bpf program that was inherited from this cgroup + */ +#define BPF_F_ALLOW_OVERRIDE (1U << 0) + #define BPF_PSEUDO_MAP_FD 1 /* flags for BPF_MAP_UPDATE_ELEM command */ @@ -171,6 +177,7 @@ union bpf_attr { __u32 target_fd; /* container object to attach to */ __u32 attach_bpf_fd; /* eBPF program to attach */ __u32 attach_type; + __u32 attach_flags; }; } __attribute__((aligned(8))); From 6ba4d2722d06960102c981322035239cd66f7316 Mon Sep 17 00:00:00 2001 From: Sahitya Tummala Date: Wed, 8 Feb 2017 20:30:56 +0530 Subject: [PATCH 47/56] fuse: fix use after free issue in fuse_dev_do_read() There is a potential race between fuse_dev_do_write() and request_wait_answer() contexts as shown below: TASK 1: __fuse_request_send(): |--spin_lock(&fiq->waitq.lock); |--queue_request(); |--spin_unlock(&fiq->waitq.lock); |--request_wait_answer(): |--if (test_bit(FR_SENT, &req->flags)) TASK 2: fuse_dev_do_write(): |--clears bit FR_SENT, |--request_end(): |--sets bit FR_FINISHED |--spin_lock(&fiq->waitq.lock); |--list_del_init(&req->intr_entry); |--spin_unlock(&fiq->waitq.lock); |--fuse_put_request(); |--queue_interrupt(); |--wake_up_locked(&fiq->waitq); |--wait_event_freezable(); Now, the next fuse_dev_do_read(), see interrupts list is not empty and then calls fuse_read_interrupt() which tries to access the request which is already free'd and gets the below crash: [11432.401266] Unable to handle kernel paging request at virtual address 6b6b6b6b6b6b6b6b ... [11432.418518] Kernel BUG at ffffff80083720e0 [11432.456168] PC is at __list_del_entry+0x6c/0xc4 [11432.463573] LR is at fuse_dev_do_read+0x1ac/0x474 ... [11432.679999] [] __list_del_entry+0x6c/0xc4 [11432.687794] [] fuse_dev_do_read+0x1ac/0x474 [11432.693180] [] fuse_dev_read+0x6c/0x78 [11432.699082] [] __vfs_read+0xc0/0xe8 [11432.704459] [] vfs_read+0x90/0x108 [11432.709406] [] SyS_read+0x58/0x94 As FR_FINISHED bit is set before deleting the intr_entry with input queue lock in request completion path, do the testing of this flag and queueing atomically with the same lock in queue_interrupt(). Signed-off-by: Sahitya Tummala Signed-off-by: Miklos Szeredi Fixes: fd22d62ed0c3 ("fuse: no fc->lock for iqueue parts") Cc: # 4.2+ --- fs/fuse/dev.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 4e06a27ed7f8..b656e1805f04 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -399,6 +399,10 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) { spin_lock(&fiq->waitq.lock); + if (test_bit(FR_FINISHED, &req->flags)) { + spin_unlock(&fiq->waitq.lock); + return; + } if (list_empty(&req->intr_entry)) { list_add_tail(&req->intr_entry, &fiq->interrupts); wake_up_locked(&fiq->waitq); From afe3e4d11bdf50a4c3965eb6465ba6bebbcf5dcf Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 14 Feb 2017 21:17:48 -0800 Subject: [PATCH 48/56] PCI/PME: Restore pcie_pme_driver.remove In addition to making PME non-modular, d7def2040077 ("PCI/PME: Make explicitly non-modular") removed the pcie_pme_driver .remove() method, pcie_pme_remove(). pcie_pme_remove() freed the PME IRQ that was requested in pci_pme_probe(). The fact that we don't free the IRQ after d7def2040077 causes the following crash when removing a PCIe port device via /sys: ------------[ cut here ]------------ kernel BUG at drivers/pci/msi.c:370! invalid opcode: 0000 [#1] SMP Modules linked in: CPU: 1 PID: 14509 Comm: sh Tainted: G W 4.8.0-rc1-yh-00012-gd29438d RIP: 0010:[] free_msi_irqs+0x65/0x190 ... Call Trace: [] pci_disable_msi+0x34/0x40 [] cleanup_service_irqs+0x27/0x30 [] pcie_port_device_remove+0x2a/0x40 [] pcie_portdrv_remove+0x40/0x50 [] pci_device_remove+0x4b/0xc0 [] __device_release_driver+0xb6/0x150 [] device_release_driver+0x25/0x40 [] pci_stop_bus_device+0x74/0xa0 [] pci_stop_and_remove_bus_device_locked+0x1a/0x30 [] remove_store+0x50/0x70 [] dev_attr_store+0x18/0x30 [] sysfs_kf_write+0x44/0x60 [] kernfs_fop_write+0x10e/0x190 [] __vfs_write+0x28/0x110 [] ? percpu_down_read+0x44/0x80 [] ? __sb_start_write+0xa7/0xe0 [] ? __sb_start_write+0xa7/0xe0 [] vfs_write+0xc4/0x180 [] SyS_write+0x49/0xa0 [] do_syscall_64+0xa6/0x1b0 [] entry_SYSCALL64_slow_path+0x25/0x25 ... RIP [] free_msi_irqs+0x65/0x190 RSP ---[ end trace f4505e1dac5b95d3 ]--- Segmentation fault Restore pcie_pme_remove(). [bhelgaas: changelog] Fixes: d7def2040077 ("PCI/PME: Make explicitly non-modular") Signed-off-by: Yinghai Lu Signed-off-by: Bjorn Helgaas Acked-by: Rafael J. Wysocki CC: stable@vger.kernel.org # v4.9+ --- drivers/pci/pcie/pme.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/pci/pcie/pme.c b/drivers/pci/pcie/pme.c index 717529331dac..2dd1c68e6de8 100644 --- a/drivers/pci/pcie/pme.c +++ b/drivers/pci/pcie/pme.c @@ -433,6 +433,17 @@ static int pcie_pme_resume(struct pcie_device *srv) return 0; } +/** + * pcie_pme_remove - Prepare PCIe PME service device for removal. + * @srv - PCIe service device to remove. + */ +static void pcie_pme_remove(struct pcie_device *srv) +{ + pcie_pme_suspend(srv); + free_irq(srv->irq, srv); + kfree(get_service_data(srv)); +} + static struct pcie_port_service_driver pcie_pme_driver = { .name = "pcie_pme", .port_type = PCI_EXP_TYPE_ROOT_PORT, @@ -441,6 +452,7 @@ static struct pcie_port_service_driver pcie_pme_driver = { .probe = pcie_pme_probe, .suspend = pcie_pme_suspend, .resume = pcie_pme_resume, + .remove = pcie_pme_remove, }; /** From cd224553641848dd17800fe559e4ff5d208553e8 Mon Sep 17 00:00:00 2001 From: Anssi Hannula Date: Tue, 14 Feb 2017 19:11:44 +0200 Subject: [PATCH 49/56] net: xilinx_emaclite: fix receive buffer overflow xilinx_emaclite looks at the received data to try to determine the Ethernet packet length but does not properly clamp it if proto_type == ETH_P_IP or 1500 < proto_type <= 1518, causing a buffer overflow and a panic via skb_panic() as the length exceeds the allocated skb size. Fix those cases. Also add an additional unconditional check with WARN_ON() at the end. Signed-off-by: Anssi Hannula Fixes: bb81b2ddfa19 ("net: add Xilinx emac lite device driver") Signed-off-by: David S. Miller --- drivers/net/ethernet/xilinx/xilinx_emaclite.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_emaclite.c b/drivers/net/ethernet/xilinx/xilinx_emaclite.c index 93dc10b10c09..455774e4741a 100644 --- a/drivers/net/ethernet/xilinx/xilinx_emaclite.c +++ b/drivers/net/ethernet/xilinx/xilinx_emaclite.c @@ -369,7 +369,7 @@ static int xemaclite_send_data(struct net_local *drvdata, u8 *data, * * Return: Total number of bytes received */ -static u16 xemaclite_recv_data(struct net_local *drvdata, u8 *data) +static u16 xemaclite_recv_data(struct net_local *drvdata, u8 *data, int maxlen) { void __iomem *addr; u16 length, proto_type; @@ -409,7 +409,7 @@ static u16 xemaclite_recv_data(struct net_local *drvdata, u8 *data) /* Check if received ethernet frame is a raw ethernet frame * or an IP packet or an ARP packet */ - if (proto_type > (ETH_FRAME_LEN + ETH_FCS_LEN)) { + if (proto_type > ETH_DATA_LEN) { if (proto_type == ETH_P_IP) { length = ((ntohl(__raw_readl(addr + @@ -417,6 +417,7 @@ static u16 xemaclite_recv_data(struct net_local *drvdata, u8 *data) XEL_RXBUFF_OFFSET)) >> XEL_HEADER_SHIFT) & XEL_RPLR_LENGTH_MASK); + length = min_t(u16, length, ETH_DATA_LEN); length += ETH_HLEN + ETH_FCS_LEN; } else if (proto_type == ETH_P_ARP) @@ -429,6 +430,9 @@ static u16 xemaclite_recv_data(struct net_local *drvdata, u8 *data) /* Use the length in the frame, plus the header and trailer */ length = proto_type + ETH_HLEN + ETH_FCS_LEN; + if (WARN_ON(length > maxlen)) + length = maxlen; + /* Read from the EmacLite device */ xemaclite_aligned_read((u32 __force *) (addr + XEL_RXBUFF_OFFSET), data, length); @@ -603,7 +607,7 @@ static void xemaclite_rx_handler(struct net_device *dev) skb_reserve(skb, 2); - len = xemaclite_recv_data(lp, (u8 *) skb->data); + len = xemaclite_recv_data(lp, (u8 *) skb->data, len); if (!len) { dev->stats.rx_errors++; From acf138f1b00bdd1b7cd9894562ed0c2a1670888e Mon Sep 17 00:00:00 2001 From: Anssi Hannula Date: Tue, 14 Feb 2017 19:11:45 +0200 Subject: [PATCH 50/56] net: xilinx_emaclite: fix freezes due to unordered I/O The xilinx_emaclite uses __raw_writel and __raw_readl for register accesses. Those functions do not imply any kind of memory barriers and they may be reordered. The driver does not seem to take that into account, though, and the driver does not satisfy the ordering requirements of the hardware. For clear examples, see xemaclite_mdio_write() and xemaclite_mdio_read() which try to set MDIO address before initiating the transaction. I'm seeing system freezes with the driver with GCC 5.4 and current Linux kernels on Zynq-7000 SoC immediately when trying to use the interface. In commit 123c1407af87 ("net: emaclite: Do not use microblaze and ppc IO functions") the driver was switched from non-generic in_be32/out_be32 (memory barriers, big endian) to __raw_readl/__raw_writel (no memory barriers, native endian), so apparently the device follows system endianness and the driver was originally written with the assumption of memory barriers. Rather than try to hunt for each case of missing barrier, just switch the driver to use iowrite32/ioread32/iowrite32be/ioread32be depending on endianness instead. Tested on little-endian Zynq-7000 ARM SoC FPGA. Signed-off-by: Anssi Hannula Fixes: 123c1407af87 ("net: emaclite: Do not use microblaze and ppc IO functions") Signed-off-by: David S. Miller --- drivers/net/ethernet/xilinx/xilinx_emaclite.c | 116 ++++++++++-------- 1 file changed, 62 insertions(+), 54 deletions(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_emaclite.c b/drivers/net/ethernet/xilinx/xilinx_emaclite.c index 455774e4741a..aa02a03a6d8d 100644 --- a/drivers/net/ethernet/xilinx/xilinx_emaclite.c +++ b/drivers/net/ethernet/xilinx/xilinx_emaclite.c @@ -100,6 +100,14 @@ /* BUFFER_ALIGN(adr) calculates the number of bytes to the next alignment. */ #define BUFFER_ALIGN(adr) ((ALIGNMENT - ((u32) adr)) % ALIGNMENT) +#ifdef __BIG_ENDIAN +#define xemaclite_readl ioread32be +#define xemaclite_writel iowrite32be +#else +#define xemaclite_readl ioread32 +#define xemaclite_writel iowrite32 +#endif + /** * struct net_local - Our private per device data * @ndev: instance of the network device @@ -156,15 +164,15 @@ static void xemaclite_enable_interrupts(struct net_local *drvdata) u32 reg_data; /* Enable the Tx interrupts for the first Buffer */ - reg_data = __raw_readl(drvdata->base_addr + XEL_TSR_OFFSET); - __raw_writel(reg_data | XEL_TSR_XMIT_IE_MASK, - drvdata->base_addr + XEL_TSR_OFFSET); + reg_data = xemaclite_readl(drvdata->base_addr + XEL_TSR_OFFSET); + xemaclite_writel(reg_data | XEL_TSR_XMIT_IE_MASK, + drvdata->base_addr + XEL_TSR_OFFSET); /* Enable the Rx interrupts for the first buffer */ - __raw_writel(XEL_RSR_RECV_IE_MASK, drvdata->base_addr + XEL_RSR_OFFSET); + xemaclite_writel(XEL_RSR_RECV_IE_MASK, drvdata->base_addr + XEL_RSR_OFFSET); /* Enable the Global Interrupt Enable */ - __raw_writel(XEL_GIER_GIE_MASK, drvdata->base_addr + XEL_GIER_OFFSET); + xemaclite_writel(XEL_GIER_GIE_MASK, drvdata->base_addr + XEL_GIER_OFFSET); } /** @@ -179,17 +187,17 @@ static void xemaclite_disable_interrupts(struct net_local *drvdata) u32 reg_data; /* Disable the Global Interrupt Enable */ - __raw_writel(XEL_GIER_GIE_MASK, drvdata->base_addr + XEL_GIER_OFFSET); + xemaclite_writel(XEL_GIER_GIE_MASK, drvdata->base_addr + XEL_GIER_OFFSET); /* Disable the Tx interrupts for the first buffer */ - reg_data = __raw_readl(drvdata->base_addr + XEL_TSR_OFFSET); - __raw_writel(reg_data & (~XEL_TSR_XMIT_IE_MASK), - drvdata->base_addr + XEL_TSR_OFFSET); + reg_data = xemaclite_readl(drvdata->base_addr + XEL_TSR_OFFSET); + xemaclite_writel(reg_data & (~XEL_TSR_XMIT_IE_MASK), + drvdata->base_addr + XEL_TSR_OFFSET); /* Disable the Rx interrupts for the first buffer */ - reg_data = __raw_readl(drvdata->base_addr + XEL_RSR_OFFSET); - __raw_writel(reg_data & (~XEL_RSR_RECV_IE_MASK), - drvdata->base_addr + XEL_RSR_OFFSET); + reg_data = xemaclite_readl(drvdata->base_addr + XEL_RSR_OFFSET); + xemaclite_writel(reg_data & (~XEL_RSR_RECV_IE_MASK), + drvdata->base_addr + XEL_RSR_OFFSET); } /** @@ -321,7 +329,7 @@ static int xemaclite_send_data(struct net_local *drvdata, u8 *data, byte_count = ETH_FRAME_LEN; /* Check if the expected buffer is available */ - reg_data = __raw_readl(addr + XEL_TSR_OFFSET); + reg_data = xemaclite_readl(addr + XEL_TSR_OFFSET); if ((reg_data & (XEL_TSR_XMIT_BUSY_MASK | XEL_TSR_XMIT_ACTIVE_MASK)) == 0) { @@ -334,7 +342,7 @@ static int xemaclite_send_data(struct net_local *drvdata, u8 *data, addr = (void __iomem __force *)((u32 __force)addr ^ XEL_BUFFER_OFFSET); - reg_data = __raw_readl(addr + XEL_TSR_OFFSET); + reg_data = xemaclite_readl(addr + XEL_TSR_OFFSET); if ((reg_data & (XEL_TSR_XMIT_BUSY_MASK | XEL_TSR_XMIT_ACTIVE_MASK)) != 0) @@ -345,16 +353,16 @@ static int xemaclite_send_data(struct net_local *drvdata, u8 *data, /* Write the frame to the buffer */ xemaclite_aligned_write(data, (u32 __force *) addr, byte_count); - __raw_writel((byte_count & XEL_TPLR_LENGTH_MASK), - addr + XEL_TPLR_OFFSET); + xemaclite_writel((byte_count & XEL_TPLR_LENGTH_MASK), + addr + XEL_TPLR_OFFSET); /* Update the Tx Status Register to indicate that there is a * frame to send. Set the XEL_TSR_XMIT_ACTIVE_MASK flag which * is used by the interrupt handler to check whether a frame * has been transmitted */ - reg_data = __raw_readl(addr + XEL_TSR_OFFSET); + reg_data = xemaclite_readl(addr + XEL_TSR_OFFSET); reg_data |= (XEL_TSR_XMIT_BUSY_MASK | XEL_TSR_XMIT_ACTIVE_MASK); - __raw_writel(reg_data, addr + XEL_TSR_OFFSET); + xemaclite_writel(reg_data, addr + XEL_TSR_OFFSET); return 0; } @@ -379,7 +387,7 @@ static u16 xemaclite_recv_data(struct net_local *drvdata, u8 *data, int maxlen) addr = (drvdata->base_addr + drvdata->next_rx_buf_to_use); /* Verify which buffer has valid data */ - reg_data = __raw_readl(addr + XEL_RSR_OFFSET); + reg_data = xemaclite_readl(addr + XEL_RSR_OFFSET); if ((reg_data & XEL_RSR_RECV_DONE_MASK) == XEL_RSR_RECV_DONE_MASK) { if (drvdata->rx_ping_pong != 0) @@ -396,14 +404,14 @@ static u16 xemaclite_recv_data(struct net_local *drvdata, u8 *data, int maxlen) return 0; /* No data was available */ /* Verify that buffer has valid data */ - reg_data = __raw_readl(addr + XEL_RSR_OFFSET); + reg_data = xemaclite_readl(addr + XEL_RSR_OFFSET); if ((reg_data & XEL_RSR_RECV_DONE_MASK) != XEL_RSR_RECV_DONE_MASK) return 0; /* No data was available */ } /* Get the protocol type of the ethernet frame that arrived */ - proto_type = ((ntohl(__raw_readl(addr + XEL_HEADER_OFFSET + + proto_type = ((ntohl(xemaclite_readl(addr + XEL_HEADER_OFFSET + XEL_RXBUFF_OFFSET)) >> XEL_HEADER_SHIFT) & XEL_RPLR_LENGTH_MASK); @@ -412,7 +420,7 @@ static u16 xemaclite_recv_data(struct net_local *drvdata, u8 *data, int maxlen) if (proto_type > ETH_DATA_LEN) { if (proto_type == ETH_P_IP) { - length = ((ntohl(__raw_readl(addr + + length = ((ntohl(xemaclite_readl(addr + XEL_HEADER_IP_LENGTH_OFFSET + XEL_RXBUFF_OFFSET)) >> XEL_HEADER_SHIFT) & @@ -438,9 +446,9 @@ static u16 xemaclite_recv_data(struct net_local *drvdata, u8 *data, int maxlen) data, length); /* Acknowledge the frame */ - reg_data = __raw_readl(addr + XEL_RSR_OFFSET); + reg_data = xemaclite_readl(addr + XEL_RSR_OFFSET); reg_data &= ~XEL_RSR_RECV_DONE_MASK; - __raw_writel(reg_data, addr + XEL_RSR_OFFSET); + xemaclite_writel(reg_data, addr + XEL_RSR_OFFSET); return length; } @@ -467,14 +475,14 @@ static void xemaclite_update_address(struct net_local *drvdata, xemaclite_aligned_write(address_ptr, (u32 __force *) addr, ETH_ALEN); - __raw_writel(ETH_ALEN, addr + XEL_TPLR_OFFSET); + xemaclite_writel(ETH_ALEN, addr + XEL_TPLR_OFFSET); /* Update the MAC address in the EmacLite */ - reg_data = __raw_readl(addr + XEL_TSR_OFFSET); - __raw_writel(reg_data | XEL_TSR_PROG_MAC_ADDR, addr + XEL_TSR_OFFSET); + reg_data = xemaclite_readl(addr + XEL_TSR_OFFSET); + xemaclite_writel(reg_data | XEL_TSR_PROG_MAC_ADDR, addr + XEL_TSR_OFFSET); /* Wait for EmacLite to finish with the MAC address update */ - while ((__raw_readl(addr + XEL_TSR_OFFSET) & + while ((xemaclite_readl(addr + XEL_TSR_OFFSET) & XEL_TSR_PROG_MAC_ADDR) != 0) ; } @@ -644,32 +652,32 @@ static irqreturn_t xemaclite_interrupt(int irq, void *dev_id) u32 tx_status; /* Check if there is Rx Data available */ - if ((__raw_readl(base_addr + XEL_RSR_OFFSET) & + if ((xemaclite_readl(base_addr + XEL_RSR_OFFSET) & XEL_RSR_RECV_DONE_MASK) || - (__raw_readl(base_addr + XEL_BUFFER_OFFSET + XEL_RSR_OFFSET) + (xemaclite_readl(base_addr + XEL_BUFFER_OFFSET + XEL_RSR_OFFSET) & XEL_RSR_RECV_DONE_MASK)) xemaclite_rx_handler(dev); /* Check if the Transmission for the first buffer is completed */ - tx_status = __raw_readl(base_addr + XEL_TSR_OFFSET); + tx_status = xemaclite_readl(base_addr + XEL_TSR_OFFSET); if (((tx_status & XEL_TSR_XMIT_BUSY_MASK) == 0) && (tx_status & XEL_TSR_XMIT_ACTIVE_MASK) != 0) { tx_status &= ~XEL_TSR_XMIT_ACTIVE_MASK; - __raw_writel(tx_status, base_addr + XEL_TSR_OFFSET); + xemaclite_writel(tx_status, base_addr + XEL_TSR_OFFSET); tx_complete = true; } /* Check if the Transmission for the second buffer is completed */ - tx_status = __raw_readl(base_addr + XEL_BUFFER_OFFSET + XEL_TSR_OFFSET); + tx_status = xemaclite_readl(base_addr + XEL_BUFFER_OFFSET + XEL_TSR_OFFSET); if (((tx_status & XEL_TSR_XMIT_BUSY_MASK) == 0) && (tx_status & XEL_TSR_XMIT_ACTIVE_MASK) != 0) { tx_status &= ~XEL_TSR_XMIT_ACTIVE_MASK; - __raw_writel(tx_status, base_addr + XEL_BUFFER_OFFSET + - XEL_TSR_OFFSET); + xemaclite_writel(tx_status, base_addr + XEL_BUFFER_OFFSET + + XEL_TSR_OFFSET); tx_complete = true; } @@ -702,7 +710,7 @@ static int xemaclite_mdio_wait(struct net_local *lp) /* wait for the MDIO interface to not be busy or timeout after some time. */ - while (__raw_readl(lp->base_addr + XEL_MDIOCTRL_OFFSET) & + while (xemaclite_readl(lp->base_addr + XEL_MDIOCTRL_OFFSET) & XEL_MDIOCTRL_MDIOSTS_MASK) { if (time_before_eq(end, jiffies)) { WARN_ON(1); @@ -738,17 +746,17 @@ static int xemaclite_mdio_read(struct mii_bus *bus, int phy_id, int reg) * MDIO Address register. Set the Status bit in the MDIO Control * register to start a MDIO read transaction. */ - ctrl_reg = __raw_readl(lp->base_addr + XEL_MDIOCTRL_OFFSET); - __raw_writel(XEL_MDIOADDR_OP_MASK | - ((phy_id << XEL_MDIOADDR_PHYADR_SHIFT) | reg), - lp->base_addr + XEL_MDIOADDR_OFFSET); - __raw_writel(ctrl_reg | XEL_MDIOCTRL_MDIOSTS_MASK, - lp->base_addr + XEL_MDIOCTRL_OFFSET); + ctrl_reg = xemaclite_readl(lp->base_addr + XEL_MDIOCTRL_OFFSET); + xemaclite_writel(XEL_MDIOADDR_OP_MASK | + ((phy_id << XEL_MDIOADDR_PHYADR_SHIFT) | reg), + lp->base_addr + XEL_MDIOADDR_OFFSET); + xemaclite_writel(ctrl_reg | XEL_MDIOCTRL_MDIOSTS_MASK, + lp->base_addr + XEL_MDIOCTRL_OFFSET); if (xemaclite_mdio_wait(lp)) return -ETIMEDOUT; - rc = __raw_readl(lp->base_addr + XEL_MDIORD_OFFSET); + rc = xemaclite_readl(lp->base_addr + XEL_MDIORD_OFFSET); dev_dbg(&lp->ndev->dev, "xemaclite_mdio_read(phy_id=%i, reg=%x) == %x\n", @@ -785,13 +793,13 @@ static int xemaclite_mdio_write(struct mii_bus *bus, int phy_id, int reg, * Data register. Finally, set the Status bit in the MDIO Control * register to start a MDIO write transaction. */ - ctrl_reg = __raw_readl(lp->base_addr + XEL_MDIOCTRL_OFFSET); - __raw_writel(~XEL_MDIOADDR_OP_MASK & - ((phy_id << XEL_MDIOADDR_PHYADR_SHIFT) | reg), - lp->base_addr + XEL_MDIOADDR_OFFSET); - __raw_writel(val, lp->base_addr + XEL_MDIOWR_OFFSET); - __raw_writel(ctrl_reg | XEL_MDIOCTRL_MDIOSTS_MASK, - lp->base_addr + XEL_MDIOCTRL_OFFSET); + ctrl_reg = xemaclite_readl(lp->base_addr + XEL_MDIOCTRL_OFFSET); + xemaclite_writel(~XEL_MDIOADDR_OP_MASK & + ((phy_id << XEL_MDIOADDR_PHYADR_SHIFT) | reg), + lp->base_addr + XEL_MDIOADDR_OFFSET); + xemaclite_writel(val, lp->base_addr + XEL_MDIOWR_OFFSET); + xemaclite_writel(ctrl_reg | XEL_MDIOCTRL_MDIOSTS_MASK, + lp->base_addr + XEL_MDIOCTRL_OFFSET); return 0; } @@ -838,8 +846,8 @@ static int xemaclite_mdio_setup(struct net_local *lp, struct device *dev) /* Enable the MDIO bus by asserting the enable bit in MDIO Control * register. */ - __raw_writel(XEL_MDIOCTRL_MDIOEN_MASK, - lp->base_addr + XEL_MDIOCTRL_OFFSET); + xemaclite_writel(XEL_MDIOCTRL_MDIOEN_MASK, + lp->base_addr + XEL_MDIOCTRL_OFFSET); bus = mdiobus_alloc(); if (!bus) { @@ -1144,8 +1152,8 @@ static int xemaclite_of_probe(struct platform_device *ofdev) } /* Clear the Tx CSR's in case this is a restart */ - __raw_writel(0, lp->base_addr + XEL_TSR_OFFSET); - __raw_writel(0, lp->base_addr + XEL_BUFFER_OFFSET + XEL_TSR_OFFSET); + xemaclite_writel(0, lp->base_addr + XEL_TSR_OFFSET); + xemaclite_writel(0, lp->base_addr + XEL_BUFFER_OFFSET + XEL_TSR_OFFSET); /* Set the MAC address in the EmacLite device */ xemaclite_update_address(lp, ndev->dev_addr); From 7627ae6030f56a9a91a5b3867b21f35d79c16e64 Mon Sep 17 00:00:00 2001 From: Marcus Huewe Date: Wed, 15 Feb 2017 01:00:36 +0100 Subject: [PATCH 51/56] net: neigh: Fix netevent NETEVENT_DELAY_PROBE_TIME_UPDATE notification When setting a neigh related sysctl parameter, we always send a NETEVENT_DELAY_PROBE_TIME_UPDATE netevent. For instance, when executing sysctl net.ipv6.neigh.wlp3s0.retrans_time_ms=2000 a NETEVENT_DELAY_PROBE_TIME_UPDATE netevent is generated. This is caused by commit 2a4501ae18b5 ("neigh: Send a notification when DELAY_PROBE_TIME changes"). According to the commit's description, it was intended to generate such an event when setting the "delay_first_probe_time" sysctl parameter. In order to fix this, only generate this event when actually setting the "delay_first_probe_time" sysctl parameter. This fix should not have any unintended side-effects, because all but one registered netevent callbacks check for other netevent event types (the registered callbacks were obtained by grepping for "register_netevent_notifier"). The only callback that uses the NETEVENT_DELAY_PROBE_TIME_UPDATE event is mlxsw_sp_router_netevent_event() (in drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c): in case of this event, it only accesses the DELAY_PROBE_TIME of the passed neigh_parms. Fixes: 2a4501ae18b5 ("neigh: Send a notification when DELAY_PROBE_TIME changes") Signed-off-by: Marcus Huewe Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/core/neighbour.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 7bb12e07ffef..e7c12caa20c8 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2923,7 +2923,8 @@ static void neigh_proc_update(struct ctl_table *ctl, int write) return; set_bit(index, p->data_state); - call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); + if (index == NEIGH_VAR_DELAY_PROBE_TIME) + call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); if (!dev) /* NULL dev means this is default value */ neigh_copy_dflt_parms(net, p, index); } From 28f4d16570dcf440e54a4d72666d5be452f27d0e Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Wed, 15 Feb 2017 10:32:11 -0600 Subject: [PATCH 52/56] ibmvnic: Fix endian error when requesting device capabilities When a vNIC client driver requests a faulty device setting, the server returns an acceptable value for the client to request. This 64 bit value was incorrectly being swapped as a 32 bit value, resulting in loss of data. This patch corrects that by using the 64 bit swap function. Signed-off-by: Thomas Falcon Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 5b66b4fd1767..158b49a0a1d6 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -2389,10 +2389,10 @@ static void handle_request_cap_rsp(union ibmvnic_crq *crq, case PARTIALSUCCESS: dev_info(dev, "req=%lld, rsp=%ld in %s queue, retrying.\n", *req_value, - (long int)be32_to_cpu(crq->request_capability_rsp. + (long int)be64_to_cpu(crq->request_capability_rsp. number), name); release_sub_crqs_no_irqs(adapter); - *req_value = be32_to_cpu(crq->request_capability_rsp.number); + *req_value = be64_to_cpu(crq->request_capability_rsp.number); init_sub_crqs(adapter, 1); return; default: From 75224c93fa985f4a6fb983f53208f5c5aa555fbf Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Wed, 15 Feb 2017 10:33:33 -0600 Subject: [PATCH 53/56] ibmvnic: Fix endian errors in error reporting output Error reports received from firmware were not being converted from big endian values, leading to bogus error codes reported on little endian systems. Signed-off-by: Thomas Falcon Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 158b49a0a1d6..a07b8d79174c 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -2186,12 +2186,12 @@ static void handle_error_info_rsp(union ibmvnic_crq *crq, if (!found) { dev_err(dev, "Couldn't find error id %x\n", - crq->request_error_rsp.error_id); + be32_to_cpu(crq->request_error_rsp.error_id)); return; } dev_err(dev, "Detailed info for error id %x:", - crq->request_error_rsp.error_id); + be32_to_cpu(crq->request_error_rsp.error_id)); for (i = 0; i < error_buff->len; i++) { pr_cont("%02x", (int)error_buff->buff[i]); @@ -2270,8 +2270,8 @@ static void handle_error_indication(union ibmvnic_crq *crq, dev_err(dev, "Firmware reports %serror id %x, cause %d\n", crq->error_indication. flags & IBMVNIC_FATAL_ERROR ? "FATAL " : "", - crq->error_indication.error_id, - crq->error_indication.error_cause); + be32_to_cpu(crq->error_indication.error_id), + be16_to_cpu(crq->error_indication.error_cause)); error_buff = kmalloc(sizeof(*error_buff), GFP_ATOMIC); if (!error_buff) From bf3f14d6342cfb37eab8f0cddd0e4d4063fd9fc9 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 15 Feb 2017 22:29:51 -0500 Subject: [PATCH 54/56] rhashtable: Revert nested table changes. This reverts commits: 6a25478077d987edc5e2f880590a2bc5fcab4441 9dbbfb0ab6680c6a85609041011484e6658e7d3c 40137906c5f55c252194ef5834130383e639536f It's too risky to put in this late in the release cycle. We'll put these changes into the next merge window instead. Signed-off-by: David S. Miller --- fs/gfs2/glock.c | 28 ++-- include/linux/rhashtable.h | 78 +++-------- lib/rhashtable.c | 270 +++++++------------------------------ net/tipc/net.c | 4 - net/tipc/socket.c | 30 ++--- 5 files changed, 94 insertions(+), 316 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 70e94170af85..94f50cac91c6 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1420,32 +1420,26 @@ static struct shrinker glock_shrinker = { * @sdp: the filesystem * @bucket: the bucket * - * Note that the function can be called multiple times on the same - * object. So the user must ensure that the function can cope with - * that. */ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) { struct gfs2_glock *gl; - struct rhashtable_iter iter; + struct rhash_head *pos; + const struct bucket_table *tbl; + int i; - rhashtable_walk_enter(&gl_hash_table, &iter); - - do { - gl = ERR_PTR(rhashtable_walk_start(&iter)); - if (gl) - continue; - - while ((gl = rhashtable_walk_next(&iter)) && !IS_ERR(gl)) + rcu_read_lock(); + tbl = rht_dereference_rcu(gl_hash_table.tbl, &gl_hash_table); + for (i = 0; i < tbl->size; i++) { + rht_for_each_entry_rcu(gl, pos, tbl, i, gl_node) { if ((gl->gl_name.ln_sbd == sdp) && lockref_get_not_dead(&gl->gl_lockref)) examiner(gl); - - rhashtable_walk_stop(&iter); - } while (cond_resched(), gl == ERR_PTR(-EAGAIN)); - - rhashtable_walk_exit(&iter); + } + } + rcu_read_unlock(); + cond_resched(); } /** diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index f2e12a845910..5c132d3188be 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -61,7 +61,6 @@ struct rhlist_head { /** * struct bucket_table - Table of hash buckets * @size: Number of hash buckets - * @nest: Number of bits of first-level nested table. * @rehash: Current bucket being rehashed * @hash_rnd: Random seed to fold into hash * @locks_mask: Mask to apply before accessing locks[] @@ -69,12 +68,10 @@ struct rhlist_head { * @walkers: List of active walkers * @rcu: RCU structure for freeing the table * @future_tbl: Table under construction during rehashing - * @ntbl: Nested table used when out of memory. * @buckets: size * hash buckets */ struct bucket_table { unsigned int size; - unsigned int nest; unsigned int rehash; u32 hash_rnd; unsigned int locks_mask; @@ -84,7 +81,7 @@ struct bucket_table { struct bucket_table __rcu *future_tbl; - struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp; + struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp; }; /** @@ -377,12 +374,6 @@ void rhashtable_free_and_destroy(struct rhashtable *ht, void *arg); void rhashtable_destroy(struct rhashtable *ht); -struct rhash_head __rcu **rht_bucket_nested(const struct bucket_table *tbl, - unsigned int hash); -struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht, - struct bucket_table *tbl, - unsigned int hash); - #define rht_dereference(p, ht) \ rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht)) @@ -398,27 +389,6 @@ struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht, #define rht_entry(tpos, pos, member) \ ({ tpos = container_of(pos, typeof(*tpos), member); 1; }) -static inline struct rhash_head __rcu *const *rht_bucket( - const struct bucket_table *tbl, unsigned int hash) -{ - return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) : - &tbl->buckets[hash]; -} - -static inline struct rhash_head __rcu **rht_bucket_var( - struct bucket_table *tbl, unsigned int hash) -{ - return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) : - &tbl->buckets[hash]; -} - -static inline struct rhash_head __rcu **rht_bucket_insert( - struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash) -{ - return unlikely(tbl->nest) ? rht_bucket_nested_insert(ht, tbl, hash) : - &tbl->buckets[hash]; -} - /** * rht_for_each_continue - continue iterating over hash chain * @pos: the &struct rhash_head to use as a loop cursor. @@ -438,7 +408,7 @@ static inline struct rhash_head __rcu **rht_bucket_insert( * @hash: the hash value / bucket index */ #define rht_for_each(pos, tbl, hash) \ - rht_for_each_continue(pos, *rht_bucket(tbl, hash), tbl, hash) + rht_for_each_continue(pos, (tbl)->buckets[hash], tbl, hash) /** * rht_for_each_entry_continue - continue iterating over hash chain @@ -463,7 +433,7 @@ static inline struct rhash_head __rcu **rht_bucket_insert( * @member: name of the &struct rhash_head within the hashable struct. */ #define rht_for_each_entry(tpos, pos, tbl, hash, member) \ - rht_for_each_entry_continue(tpos, pos, *rht_bucket(tbl, hash), \ + rht_for_each_entry_continue(tpos, pos, (tbl)->buckets[hash], \ tbl, hash, member) /** @@ -478,13 +448,13 @@ static inline struct rhash_head __rcu **rht_bucket_insert( * This hash chain list-traversal primitive allows for the looped code to * remove the loop cursor from the list. */ -#define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member) \ - for (pos = rht_dereference_bucket(*rht_bucket(tbl, hash), tbl, hash), \ - next = !rht_is_a_nulls(pos) ? \ - rht_dereference_bucket(pos->next, tbl, hash) : NULL; \ - (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ - pos = next, \ - next = !rht_is_a_nulls(pos) ? \ +#define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member) \ + for (pos = rht_dereference_bucket((tbl)->buckets[hash], tbl, hash), \ + next = !rht_is_a_nulls(pos) ? \ + rht_dereference_bucket(pos->next, tbl, hash) : NULL; \ + (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ + pos = next, \ + next = !rht_is_a_nulls(pos) ? \ rht_dereference_bucket(pos->next, tbl, hash) : NULL) /** @@ -515,7 +485,7 @@ static inline struct rhash_head __rcu **rht_bucket_insert( * traversal is guarded by rcu_read_lock(). */ #define rht_for_each_rcu(pos, tbl, hash) \ - rht_for_each_rcu_continue(pos, *rht_bucket(tbl, hash), tbl, hash) + rht_for_each_rcu_continue(pos, (tbl)->buckets[hash], tbl, hash) /** * rht_for_each_entry_rcu_continue - continue iterating over rcu hash chain @@ -548,8 +518,8 @@ static inline struct rhash_head __rcu **rht_bucket_insert( * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ -#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \ - rht_for_each_entry_rcu_continue(tpos, pos, *rht_bucket(tbl, hash), \ +#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \ + rht_for_each_entry_rcu_continue(tpos, pos, (tbl)->buckets[hash],\ tbl, hash, member) /** @@ -595,7 +565,7 @@ static inline struct rhash_head *__rhashtable_lookup( .ht = ht, .key = key, }; - struct bucket_table *tbl; + const struct bucket_table *tbl; struct rhash_head *he; unsigned int hash; @@ -727,12 +697,8 @@ slow_path: } elasticity = ht->elasticity; - pprev = rht_bucket_insert(ht, tbl, hash); - data = ERR_PTR(-ENOMEM); - if (!pprev) - goto out; - - rht_for_each_continue(head, *pprev, tbl, hash) { + pprev = &tbl->buckets[hash]; + rht_for_each(head, tbl, hash) { struct rhlist_head *plist; struct rhlist_head *list; @@ -770,7 +736,7 @@ slow_path: if (unlikely(rht_grow_above_100(ht, tbl))) goto slow_path; - head = rht_dereference_bucket(*pprev, tbl, hash); + head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); RCU_INIT_POINTER(obj->next, head); if (rhlist) { @@ -780,7 +746,7 @@ slow_path: RCU_INIT_POINTER(list->next, NULL); } - rcu_assign_pointer(*pprev, obj); + rcu_assign_pointer(tbl->buckets[hash], obj); atomic_inc(&ht->nelems); if (rht_grow_above_75(ht, tbl)) @@ -989,8 +955,8 @@ static inline int __rhashtable_remove_fast_one( spin_lock_bh(lock); - pprev = rht_bucket_var(tbl, hash); - rht_for_each_continue(he, *pprev, tbl, hash) { + pprev = &tbl->buckets[hash]; + rht_for_each(he, tbl, hash) { struct rhlist_head *list; list = container_of(he, struct rhlist_head, rhead); @@ -1141,8 +1107,8 @@ static inline int __rhashtable_replace_fast( spin_lock_bh(lock); - pprev = rht_bucket_var(tbl, hash); - rht_for_each_continue(he, *pprev, tbl, hash) { + pprev = &tbl->buckets[hash]; + rht_for_each(he, tbl, hash) { if (he != obj_old) { pprev = &he->next; continue; diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 172454e6b979..32d0ad058380 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -32,11 +32,6 @@ #define HASH_MIN_SIZE 4U #define BUCKET_LOCKS_PER_CPU 32UL -union nested_table { - union nested_table __rcu *table; - struct rhash_head __rcu *bucket; -}; - static u32 head_hashfn(struct rhashtable *ht, const struct bucket_table *tbl, const struct rhash_head *he) @@ -81,9 +76,6 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl, /* Never allocate more than 0.5 locks per bucket */ size = min_t(unsigned int, size, tbl->size >> 1); - if (tbl->nest) - size = min(size, 1U << tbl->nest); - if (sizeof(spinlock_t) != 0) { tbl->locks = NULL; #ifdef CONFIG_NUMA @@ -107,45 +99,8 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl, return 0; } -static void nested_table_free(union nested_table *ntbl, unsigned int size) -{ - const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); - const unsigned int len = 1 << shift; - unsigned int i; - - ntbl = rcu_dereference_raw(ntbl->table); - if (!ntbl) - return; - - if (size > len) { - size >>= shift; - for (i = 0; i < len; i++) - nested_table_free(ntbl + i, size); - } - - kfree(ntbl); -} - -static void nested_bucket_table_free(const struct bucket_table *tbl) -{ - unsigned int size = tbl->size >> tbl->nest; - unsigned int len = 1 << tbl->nest; - union nested_table *ntbl; - unsigned int i; - - ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]); - - for (i = 0; i < len; i++) - nested_table_free(ntbl + i, size); - - kfree(ntbl); -} - static void bucket_table_free(const struct bucket_table *tbl) { - if (tbl->nest) - nested_bucket_table_free(tbl); - if (tbl) kvfree(tbl->locks); @@ -157,59 +112,6 @@ static void bucket_table_free_rcu(struct rcu_head *head) bucket_table_free(container_of(head, struct bucket_table, rcu)); } -static union nested_table *nested_table_alloc(struct rhashtable *ht, - union nested_table __rcu **prev, - unsigned int shifted, - unsigned int nhash) -{ - union nested_table *ntbl; - int i; - - ntbl = rcu_dereference(*prev); - if (ntbl) - return ntbl; - - ntbl = kzalloc(PAGE_SIZE, GFP_ATOMIC); - - if (ntbl && shifted) { - for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0].bucket); i++) - INIT_RHT_NULLS_HEAD(ntbl[i].bucket, ht, - (i << shifted) | nhash); - } - - rcu_assign_pointer(*prev, ntbl); - - return ntbl; -} - -static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht, - size_t nbuckets, - gfp_t gfp) -{ - const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); - struct bucket_table *tbl; - size_t size; - - if (nbuckets < (1 << (shift + 1))) - return NULL; - - size = sizeof(*tbl) + sizeof(tbl->buckets[0]); - - tbl = kzalloc(size, gfp); - if (!tbl) - return NULL; - - if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets, - 0, 0)) { - kfree(tbl); - return NULL; - } - - tbl->nest = (ilog2(nbuckets) - 1) % shift + 1; - - return tbl; -} - static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, size_t nbuckets, gfp_t gfp) @@ -224,17 +126,10 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY); if (tbl == NULL && gfp == GFP_KERNEL) tbl = vzalloc(size); - - size = nbuckets; - - if (tbl == NULL && gfp != GFP_KERNEL) { - tbl = nested_bucket_table_alloc(ht, nbuckets, gfp); - nbuckets = 0; - } if (tbl == NULL) return NULL; - tbl->size = size; + tbl->size = nbuckets; if (alloc_bucket_locks(ht, tbl, gfp) < 0) { bucket_table_free(tbl); @@ -269,17 +164,12 @@ static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash) struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); struct bucket_table *new_tbl = rhashtable_last_table(ht, rht_dereference_rcu(old_tbl->future_tbl, ht)); - struct rhash_head __rcu **pprev = rht_bucket_var(old_tbl, old_hash); - int err = -EAGAIN; + struct rhash_head __rcu **pprev = &old_tbl->buckets[old_hash]; + int err = -ENOENT; struct rhash_head *head, *next, *entry; spinlock_t *new_bucket_lock; unsigned int new_hash; - if (new_tbl->nest) - goto out; - - err = -ENOENT; - rht_for_each(entry, old_tbl, old_hash) { err = 0; next = rht_dereference_bucket(entry->next, old_tbl, old_hash); @@ -312,26 +202,19 @@ out: return err; } -static int rhashtable_rehash_chain(struct rhashtable *ht, +static void rhashtable_rehash_chain(struct rhashtable *ht, unsigned int old_hash) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); spinlock_t *old_bucket_lock; - int err; old_bucket_lock = rht_bucket_lock(old_tbl, old_hash); spin_lock_bh(old_bucket_lock); - while (!(err = rhashtable_rehash_one(ht, old_hash))) + while (!rhashtable_rehash_one(ht, old_hash)) ; - - if (err == -ENOENT) { - old_tbl->rehash++; - err = 0; - } + old_tbl->rehash++; spin_unlock_bh(old_bucket_lock); - - return err; } static int rhashtable_rehash_attach(struct rhashtable *ht, @@ -363,17 +246,13 @@ static int rhashtable_rehash_table(struct rhashtable *ht) struct bucket_table *new_tbl; struct rhashtable_walker *walker; unsigned int old_hash; - int err; new_tbl = rht_dereference(old_tbl->future_tbl, ht); if (!new_tbl) return 0; - for (old_hash = 0; old_hash < old_tbl->size; old_hash++) { - err = rhashtable_rehash_chain(ht, old_hash); - if (err) - return err; - } + for (old_hash = 0; old_hash < old_tbl->size; old_hash++) + rhashtable_rehash_chain(ht, old_hash); /* Publish the new table pointer. */ rcu_assign_pointer(ht->tbl, new_tbl); @@ -392,16 +271,31 @@ static int rhashtable_rehash_table(struct rhashtable *ht) return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0; } -static int rhashtable_rehash_alloc(struct rhashtable *ht, - struct bucket_table *old_tbl, - unsigned int size) +/** + * rhashtable_expand - Expand hash table while allowing concurrent lookups + * @ht: the hash table to expand + * + * A secondary bucket array is allocated and the hash entries are migrated. + * + * This function may only be called in a context where it is safe to call + * synchronize_rcu(), e.g. not within a rcu_read_lock() section. + * + * The caller must ensure that no concurrent resizing occurs by holding + * ht->mutex. + * + * It is valid to have concurrent insertions and deletions protected by per + * bucket locks or concurrent RCU protected lookups and traversals. + */ +static int rhashtable_expand(struct rhashtable *ht) { - struct bucket_table *new_tbl; + struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); int err; ASSERT_RHT_MUTEX(ht); - new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL); + old_tbl = rhashtable_last_table(ht, old_tbl); + + new_tbl = bucket_table_alloc(ht, old_tbl->size * 2, GFP_KERNEL); if (new_tbl == NULL) return -ENOMEM; @@ -430,9 +324,12 @@ static int rhashtable_rehash_alloc(struct rhashtable *ht, */ static int rhashtable_shrink(struct rhashtable *ht) { - struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); + struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); unsigned int nelems = atomic_read(&ht->nelems); unsigned int size = 0; + int err; + + ASSERT_RHT_MUTEX(ht); if (nelems) size = roundup_pow_of_two(nelems * 3 / 2); @@ -445,7 +342,15 @@ static int rhashtable_shrink(struct rhashtable *ht) if (rht_dereference(old_tbl->future_tbl, ht)) return -EEXIST; - return rhashtable_rehash_alloc(ht, old_tbl, size); + new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL); + if (new_tbl == NULL) + return -ENOMEM; + + err = rhashtable_rehash_attach(ht, old_tbl, new_tbl); + if (err) + bucket_table_free(new_tbl); + + return err; } static void rht_deferred_worker(struct work_struct *work) @@ -461,14 +366,11 @@ static void rht_deferred_worker(struct work_struct *work) tbl = rhashtable_last_table(ht, tbl); if (rht_grow_above_75(ht, tbl)) - err = rhashtable_rehash_alloc(ht, tbl, tbl->size * 2); + rhashtable_expand(ht); else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl)) - err = rhashtable_shrink(ht); - else if (tbl->nest) - err = rhashtable_rehash_alloc(ht, tbl, tbl->size); + rhashtable_shrink(ht); - if (!err) - err = rhashtable_rehash_table(ht); + err = rhashtable_rehash_table(ht); mutex_unlock(&ht->mutex); @@ -537,8 +439,8 @@ static void *rhashtable_lookup_one(struct rhashtable *ht, int elasticity; elasticity = ht->elasticity; - pprev = rht_bucket_var(tbl, hash); - rht_for_each_continue(head, *pprev, tbl, hash) { + pprev = &tbl->buckets[hash]; + rht_for_each(head, tbl, hash) { struct rhlist_head *list; struct rhlist_head *plist; @@ -575,7 +477,6 @@ static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht, struct rhash_head *obj, void *data) { - struct rhash_head __rcu **pprev; struct bucket_table *new_tbl; struct rhash_head *head; @@ -598,11 +499,7 @@ static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht, if (unlikely(rht_grow_above_100(ht, tbl))) return ERR_PTR(-EAGAIN); - pprev = rht_bucket_insert(ht, tbl, hash); - if (!pprev) - return ERR_PTR(-ENOMEM); - - head = rht_dereference_bucket(*pprev, tbl, hash); + head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); RCU_INIT_POINTER(obj->next, head); if (ht->rhlist) { @@ -612,7 +509,7 @@ static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht, RCU_INIT_POINTER(list->next, NULL); } - rcu_assign_pointer(*pprev, obj); + rcu_assign_pointer(tbl->buckets[hash], obj); atomic_inc(&ht->nelems); if (rht_grow_above_75(ht, tbl)) @@ -1078,7 +975,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht, void (*free_fn)(void *ptr, void *arg), void *arg) { - struct bucket_table *tbl; + const struct bucket_table *tbl; unsigned int i; cancel_work_sync(&ht->run_work); @@ -1089,7 +986,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht, for (i = 0; i < tbl->size; i++) { struct rhash_head *pos, *next; - for (pos = rht_dereference(*rht_bucket(tbl, i), ht), + for (pos = rht_dereference(tbl->buckets[i], ht), next = !rht_is_a_nulls(pos) ? rht_dereference(pos->next, ht) : NULL; !rht_is_a_nulls(pos); @@ -1110,70 +1007,3 @@ void rhashtable_destroy(struct rhashtable *ht) return rhashtable_free_and_destroy(ht, NULL, NULL); } EXPORT_SYMBOL_GPL(rhashtable_destroy); - -struct rhash_head __rcu **rht_bucket_nested(const struct bucket_table *tbl, - unsigned int hash) -{ - const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); - static struct rhash_head __rcu *rhnull = - (struct rhash_head __rcu *)NULLS_MARKER(0); - unsigned int index = hash & ((1 << tbl->nest) - 1); - unsigned int size = tbl->size >> tbl->nest; - unsigned int subhash = hash; - union nested_table *ntbl; - - ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]); - ntbl = rht_dereference_bucket(ntbl[index].table, tbl, hash); - subhash >>= tbl->nest; - - while (ntbl && size > (1 << shift)) { - index = subhash & ((1 << shift) - 1); - ntbl = rht_dereference_bucket(ntbl[index].table, tbl, hash); - size >>= shift; - subhash >>= shift; - } - - if (!ntbl) - return &rhnull; - - return &ntbl[subhash].bucket; - -} -EXPORT_SYMBOL_GPL(rht_bucket_nested); - -struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht, - struct bucket_table *tbl, - unsigned int hash) -{ - const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); - unsigned int index = hash & ((1 << tbl->nest) - 1); - unsigned int size = tbl->size >> tbl->nest; - union nested_table *ntbl; - unsigned int shifted; - unsigned int nhash; - - ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]); - hash >>= tbl->nest; - nhash = index; - shifted = tbl->nest; - ntbl = nested_table_alloc(ht, &ntbl[index].table, - size <= (1 << shift) ? shifted : 0, nhash); - - while (ntbl && size > (1 << shift)) { - index = hash & ((1 << shift) - 1); - size >>= shift; - hash >>= shift; - nhash |= index << shifted; - shifted += shift; - ntbl = nested_table_alloc(ht, &ntbl[index].table, - size <= (1 << shift) ? shifted : 0, - nhash); - } - - if (!ntbl) - return NULL; - - return &ntbl[hash].bucket; - -} -EXPORT_SYMBOL_GPL(rht_bucket_nested_insert); diff --git a/net/tipc/net.c b/net/tipc/net.c index ab8a2d5d1e32..28bf4feeb81c 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -110,10 +110,6 @@ int tipc_net_start(struct net *net, u32 addr) char addr_string[16]; tn->own_addr = addr; - - /* Ensure that the new address is visible before we reinit. */ - smp_mb(); - tipc_named_reinit(net); tipc_sk_reinit(net); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 370a5912bcb5..800caaa699a1 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -384,6 +384,8 @@ static int tipc_sk_create(struct net *net, struct socket *sock, INIT_LIST_HEAD(&tsk->publications); msg = &tsk->phdr; tn = net_generic(sock_net(sk), tipc_net_id); + tipc_msg_init(tn->own_addr, msg, TIPC_LOW_IMPORTANCE, TIPC_NAMED_MSG, + NAMED_H_SIZE, 0); /* Finish initializing socket data structures */ sock->ops = ops; @@ -393,13 +395,6 @@ static int tipc_sk_create(struct net *net, struct socket *sock, pr_warn("Socket create failed; port number exhausted\n"); return -EINVAL; } - - /* Ensure tsk is visible before we read own_addr. */ - smp_mb(); - - tipc_msg_init(tn->own_addr, msg, TIPC_LOW_IMPORTANCE, TIPC_NAMED_MSG, - NAMED_H_SIZE, 0); - msg_set_origport(msg, tsk->portid); setup_timer(&sk->sk_timer, tipc_sk_timeout, (unsigned long)tsk); sk->sk_shutdown = 0; @@ -2274,27 +2269,24 @@ static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope, void tipc_sk_reinit(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); - struct rhashtable_iter iter; + const struct bucket_table *tbl; + struct rhash_head *pos; struct tipc_sock *tsk; struct tipc_msg *msg; + int i; - rhashtable_walk_enter(&tn->sk_rht, &iter); - - do { - tsk = ERR_PTR(rhashtable_walk_start(&iter)); - if (tsk) - continue; - - while ((tsk = rhashtable_walk_next(&iter)) && !IS_ERR(tsk)) { + rcu_read_lock(); + tbl = rht_dereference_rcu((&tn->sk_rht)->tbl, &tn->sk_rht); + for (i = 0; i < tbl->size; i++) { + rht_for_each_entry_rcu(tsk, pos, tbl, i, node) { spin_lock_bh(&tsk->sk.sk_lock.slock); msg = &tsk->phdr; msg_set_prevnode(msg, tn->own_addr); msg_set_orignode(msg, tn->own_addr); spin_unlock_bh(&tsk->sk.sk_lock.slock); } - - rhashtable_walk_stop(&iter); - } while (tsk == ERR_PTR(-EAGAIN)); + } + rcu_read_unlock(); } static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid) From 84588a93d097bace24b9233930f82511d4f34210 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 16 Feb 2017 15:08:20 +0100 Subject: [PATCH 55/56] fuse: fix uninitialized flags in pipe_buffer Signed-off-by: Miklos Szeredi Fixes: d82718e348fe ("fuse_dev_splice_read(): switch to add_to_pipe()") Cc: # 4.9+ --- fs/fuse/dev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index b656e1805f04..f11792672977 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1376,6 +1376,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, * code can Oops if the buffer persists after module unload. */ bufs[page_nr].ops = &nosteal_pipe_buf_ops; + bufs[page_nr].flags = 0; ret = add_to_pipe(pipe, &bufs[page_nr++]); if (unlikely(ret < 0)) break; From 5a81e6a171cdbd1fa8bc1fdd80c23d3d71816fac Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 16 Feb 2017 17:49:02 +0100 Subject: [PATCH 56/56] vfs: fix uninitialized flags in splice_to_pipe() Flags (PIPE_BUF_FLAG_PACKET, PIPE_BUF_FLAG_GIFT) could remain on the unused part of the pipe ring buffer. Previously splice_to_pipe() left the flags value alone, which could result in incorrect behavior. Uninitialized flags appears to have been there from the introduction of the splice syscall. Signed-off-by: Miklos Szeredi Cc: # 2.6.17+ Signed-off-by: Linus Torvalds --- fs/splice.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/splice.c b/fs/splice.c index 873d83104e79..4ef78aa8ef61 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -204,6 +204,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, buf->len = spd->partial[page_nr].len; buf->private = spd->partial[page_nr].private; buf->ops = spd->ops; + buf->flags = 0; pipe->nrbufs++; page_nr++;