From eac9eacee1602710dda47c517ad0b61ac6f429bf Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 21 May 2011 17:07:24 +0200 Subject: [PATCH 01/12] perf tools: Check we are able to read the event size on mmap Check we have enough mmaped space to read the current event size from its headers, otherwise we may dereference some hell there. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian --- tools/perf/util/session.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index fff66741f18d..61746b5866d8 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1007,6 +1007,13 @@ remap: file_pos = file_offset + head; more: + /* + * Ensure we have enough space remaining to read + * the size of the event in the headers. + */ + if (head + sizeof(event->header) > mmap_size) + goto remap; + event = (union perf_event *)(buf + head); if (session->header.needs_swap) From dd5f5fd1083601d9145168ce43a268a068add81a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 21 May 2011 17:07:24 +0200 Subject: [PATCH 02/12] perf tools: Remove junk code in mmap size handling size is overriden later and used only then. Those lines are only junk, probably a leftover. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian --- tools/perf/util/session.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 61746b5866d8..db652068c396 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1018,9 +1018,6 @@ more: if (session->header.needs_swap) perf_event_header__bswap(&event->header); - size = event->header.size; - if (size == 0) - size = 8; if (head + event->header.size > mmap_size) { if (mmaps[map_idx]) { From 74429964d8e29c0107fa6e9cdf35b8f33f57405d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 21 May 2011 17:49:00 +0200 Subject: [PATCH 03/12] perf tools: Move evlist sample helpers to evlist area These APIs should belong to evlist.c as they may not be exclusively tied to the headers. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian entries, node) { + if (!type) + type = pos->attr.sample_type; + else if (type != pos->attr.sample_type) + die("non matching sample_type"); + } + + return type; +} + +bool perf_evlist__sample_id_all(const struct perf_evlist *evlist) +{ + bool value = false, first = true; + struct perf_evsel *pos; + + list_for_each_entry(pos, &evlist->entries, node) { + if (first) { + value = pos->attr.sample_id_all; + first = false; + } else if (value != pos->attr.sample_id_all) + die("non matching sample_id_all"); + } + + return value; +} diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 8b1cb7a4c5f1..a8556b68c392 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -65,4 +65,7 @@ int perf_evlist__create_maps(struct perf_evlist *evlist, pid_t target_pid, void perf_evlist__delete_maps(struct perf_evlist *evlist); int perf_evlist__set_filters(struct perf_evlist *evlist); +u64 perf_evlist__sample_type(struct perf_evlist *evlist); +bool perf_evlist__sample_id_all(const struct perf_evlist *evlist); + #endif /* __PERF_EVLIST_H */ diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 93862a8027ea..0717bebc7649 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -934,37 +934,6 @@ out_delete_evlist: return -ENOMEM; } -u64 perf_evlist__sample_type(struct perf_evlist *evlist) -{ - struct perf_evsel *pos; - u64 type = 0; - - list_for_each_entry(pos, &evlist->entries, node) { - if (!type) - type = pos->attr.sample_type; - else if (type != pos->attr.sample_type) - die("non matching sample_type"); - } - - return type; -} - -bool perf_evlist__sample_id_all(const struct perf_evlist *evlist) -{ - bool value = false, first = true; - struct perf_evsel *pos; - - list_for_each_entry(pos, &evlist->entries, node) { - if (first) { - value = pos->attr.sample_id_all; - first = false; - } else if (value != pos->attr.sample_id_all) - die("non matching sample_id_all"); - } - - return value; -} - int perf_event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id, perf_event__handler_t process, struct perf_session *session) diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 456661d7f10e..1886256768a1 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -64,8 +64,6 @@ int perf_header__write_pipe(int fd); int perf_header__push_event(u64 id, const char *name); char *perf_header__find_event(u64 id); -u64 perf_evlist__sample_type(struct perf_evlist *evlist); -bool perf_evlist__sample_id_all(const struct perf_evlist *evlist); void perf_header__set_feat(struct perf_header *header, int feat); void perf_header__clear_feat(struct perf_header *header, int feat); bool perf_header__has_feat(const struct perf_header *header, int feat); From a285412479b6d5af3e48273a92ec2f1987df8cd1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 21 May 2011 19:33:04 +0200 Subject: [PATCH 04/12] perf tools: Pre-check sample size before parsing Check that the total size of the sample fields having a fixed size do not exceed the one of the whole event. This robustifies the sample parsing. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian --- tools/perf/builtin-test.c | 4 +++- tools/perf/util/event.c | 16 ++++++++++++++++ tools/perf/util/event.h | 12 +++++++++++- tools/perf/util/evsel.c | 6 +++++- tools/perf/util/python.c | 5 +++-- tools/perf/util/session.c | 1 + tools/perf/util/session.h | 2 ++ 7 files changed, 41 insertions(+), 5 deletions(-) diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c index 11e3c8458362..44d7df280430 100644 --- a/tools/perf/builtin-test.c +++ b/tools/perf/builtin-test.c @@ -474,6 +474,7 @@ static int test__basic_mmap(void) unsigned int nr_events[nsyscalls], expected_nr_events[nsyscalls], i, j; struct perf_evsel *evsels[nsyscalls], *evsel; + int sample_size = perf_sample_size(attr.sample_type); for (i = 0; i < nsyscalls; ++i) { char name[64]; @@ -558,7 +559,8 @@ static int test__basic_mmap(void) goto out_munmap; } - perf_event__parse_sample(event, attr.sample_type, false, &sample); + perf_event__parse_sample(event, attr.sample_type, sample_size, + false, &sample); evsel = perf_evlist__id2evsel(evlist, sample.id); if (evsel == NULL) { pr_debug("event with id %" PRIu64 diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 1023f67633a4..17c1c3c875c3 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -35,6 +35,22 @@ const char *perf_event__name(unsigned int id) return perf_event__names[id]; } +int perf_sample_size(u64 sample_type) +{ + u64 mask = sample_type & PERF_SAMPLE_MASK; + int size = 0; + int i; + + for (i = 0; i < 64; i++) { + if ((mask << i) & 1) + size++; + } + + size *= sizeof(u64); + + return size; +} + static struct perf_sample synth_sample = { .pid = -1, .tid = -1, diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 9c35170fb379..c08332871408 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -56,6 +56,13 @@ struct read_event { u64 id; }; + +#define PERF_SAMPLE_MASK \ + (PERF_SAMPLE_IP | PERF_SAMPLE_TID | \ + PERF_SAMPLE_TIME | PERF_SAMPLE_ADDR | \ + PERF_SAMPLE_ID | PERF_SAMPLE_STREAM_ID | \ + PERF_SAMPLE_CPU | PERF_SAMPLE_PERIOD) + struct sample_event { struct perf_event_header header; u64 array[]; @@ -75,6 +82,8 @@ struct perf_sample { struct ip_callchain *callchain; }; +int perf_sample_size(u64 sample_type); + #define BUILD_ID_SIZE 20 struct build_id_event { @@ -178,6 +187,7 @@ int perf_event__preprocess_sample(const union perf_event *self, const char *perf_event__name(unsigned int id); int perf_event__parse_sample(const union perf_event *event, u64 type, - bool sample_id_all, struct perf_sample *sample); + int sample_size, bool sample_id_all, + struct perf_sample *sample); #endif /* __PERF_RECORD_H */ diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index d6fd59beb860..bfce8bf642fa 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -304,7 +304,8 @@ static int perf_event__parse_id_sample(const union perf_event *event, u64 type, } int perf_event__parse_sample(const union perf_event *event, u64 type, - bool sample_id_all, struct perf_sample *data) + int sample_size, bool sample_id_all, + struct perf_sample *data) { const u64 *array; @@ -319,6 +320,9 @@ int perf_event__parse_sample(const union perf_event *event, u64 type, array = event->sample.array; + if (sample_size + sizeof(event->header) > event->header.size) + return -EFAULT; + if (type & PERF_SAMPLE_IP) { data->ip = event->ip.ip; array++; diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 8b0eff8b8283..4174c0990320 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -690,8 +690,9 @@ static PyObject *pyrf_evlist__read_on_cpu(struct pyrf_evlist *pevlist, return PyErr_NoMemory(); first = list_entry(evlist->entries.next, struct perf_evsel, node); - perf_event__parse_sample(event, first->attr.sample_type, sample_id_all, - &pevent->sample); + perf_event__parse_sample(event, first->attr.sample_type, + perf_sample_size(first->attr.sample_type), + sample_id_all, &pevent->sample); return pyevent; } diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index db652068c396..8940fd871eae 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -97,6 +97,7 @@ out: void perf_session__update_sample_type(struct perf_session *self) { self->sample_type = perf_evlist__sample_type(self->evlist); + self->sample_size = perf_sample_size(self->sample_type); self->sample_id_all = perf_evlist__sample_id_all(self->evlist); perf_session__id_header_size(self); } diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index 8daaa2d15396..66d4e1490879 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -43,6 +43,7 @@ struct perf_session { */ struct hists hists; u64 sample_type; + int sample_size; int fd; bool fd_pipe; bool repipe; @@ -159,6 +160,7 @@ static inline int perf_session__parse_sample(struct perf_session *session, struct perf_sample *sample) { return perf_event__parse_sample(event, session->sample_type, + session->sample_size, session->sample_id_all, sample); } From 98e1da905cbe64bb023a165c7c01eef5e800609e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 21 May 2011 20:08:15 +0200 Subject: [PATCH 05/12] perf tools: Robustify dynamic sample content fetch Ensure the size of the dynamic fields such as callchains or raw events don't overlap the whole event boundaries. This prevents from dereferencing junk if the given size of the callchain goes too eager. Reported-by: Linus Torvalds Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian --- tools/perf/util/evsel.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index bfce8bf642fa..ee0fe0dffa71 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -303,6 +303,17 @@ static int perf_event__parse_id_sample(const union perf_event *event, u64 type, return 0; } +static bool sample_overlap(const union perf_event *event, + const void *offset, u64 size) +{ + const void *base = event; + + if (offset + size > base + event->header.size) + return true; + + return false; +} + int perf_event__parse_sample(const union perf_event *event, u64 type, int sample_size, bool sample_id_all, struct perf_sample *data) @@ -373,14 +384,29 @@ int perf_event__parse_sample(const union perf_event *event, u64 type, } if (type & PERF_SAMPLE_CALLCHAIN) { + if (sample_overlap(event, array, sizeof(data->callchain->nr))) + return -EFAULT; + data->callchain = (struct ip_callchain *)array; + + if (sample_overlap(event, array, data->callchain->nr)) + return -EFAULT; + array += 1 + data->callchain->nr; } if (type & PERF_SAMPLE_RAW) { u32 *p = (u32 *)array; + + if (sample_overlap(event, array, sizeof(u32))) + return -EFAULT; + data->raw_size = *p; p++; + + if (sample_overlap(event, p, data->raw_size)) + return -EFAULT; + data->raw_data = p; } From 5538becaec9ca2ff21e7826372941dc46f498487 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 22 May 2011 02:17:22 +0200 Subject: [PATCH 06/12] perf tools: Propagate event parse error handling Better handle event parsing error by propagating the details in upper layers or by dumping some failure message. So that the user knows he has some crazy events in the batch. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian --- tools/perf/builtin-test.c | 9 +++++++-- tools/perf/builtin-top.c | 7 ++++++- tools/perf/util/python.c | 14 ++++++++++---- tools/perf/util/session.c | 14 ++++++++++---- 4 files changed, 33 insertions(+), 11 deletions(-) diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c index 44d7df280430..1fa9f58c2af2 100644 --- a/tools/perf/builtin-test.c +++ b/tools/perf/builtin-test.c @@ -559,8 +559,13 @@ static int test__basic_mmap(void) goto out_munmap; } - perf_event__parse_sample(event, attr.sample_type, sample_size, - false, &sample); + err = perf_event__parse_sample(event, attr.sample_type, sample_size, + false, &sample); + if (err) { + pr_err("Can't parse sample, err = %d\n", err); + goto out_munmap; + } + evsel = perf_evlist__id2evsel(evlist, sample.id); if (evsel == NULL) { pr_debug("event with id %" PRIu64 diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 7e3d6e310bf8..74f533cbf6ca 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -805,9 +805,14 @@ static void perf_session__mmap_read_cpu(struct perf_session *self, int cpu) { struct perf_sample sample; union perf_event *event; + int ret; while ((event = perf_evlist__read_on_cpu(top.evlist, cpu)) != NULL) { - perf_session__parse_sample(self, event, &sample); + ret = perf_session__parse_sample(self, event, &sample); + if (ret) { + pr_err("Can't parse sample, err = %d\n", ret); + continue; + } if (event->header.type == PERF_RECORD_SAMPLE) perf_event__process_sample(event, &sample, self); diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 4174c0990320..3344d6e6f048 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -675,6 +675,7 @@ static PyObject *pyrf_evlist__read_on_cpu(struct pyrf_evlist *pevlist, union perf_event *event; int sample_id_all = 1, cpu; static char *kwlist[] = {"sample_id_all", NULL, NULL}; + int err; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "i|i", kwlist, &cpu, &sample_id_all)) @@ -690,12 +691,17 @@ static PyObject *pyrf_evlist__read_on_cpu(struct pyrf_evlist *pevlist, return PyErr_NoMemory(); first = list_entry(evlist->entries.next, struct perf_evsel, node); - perf_event__parse_sample(event, first->attr.sample_type, - perf_sample_size(first->attr.sample_type), - sample_id_all, &pevent->sample); + err = perf_event__parse_sample(event, first->attr.sample_type, + perf_sample_size(first->attr.sample_type), + sample_id_all, &pevent->sample); + if (err) { + pr_err("Can't parse sample, err = %d\n", err); + goto end; + } + return pyevent; } - +end: Py_INCREF(Py_None); return Py_None; } diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 8940fd871eae..948327d9e92b 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -480,6 +480,7 @@ static void flush_sample_queue(struct perf_session *s, struct perf_sample sample; u64 limit = os->next_flush; u64 last_ts = os->last_sample ? os->last_sample->timestamp : 0ULL; + int ret; if (!ops->ordered_samples || !limit) return; @@ -488,9 +489,12 @@ static void flush_sample_queue(struct perf_session *s, if (iter->timestamp > limit) break; - perf_session__parse_sample(s, iter->event, &sample); - perf_session_deliver_event(s, iter->event, &sample, ops, - iter->file_offset); + ret = perf_session__parse_sample(s, iter->event, &sample); + if (ret) + pr_err("Can't parse sample, err = %d\n", ret); + else + perf_session_deliver_event(s, iter->event, &sample, ops, + iter->file_offset); os->last_flush = iter->timestamp; list_del(&iter->list); @@ -806,7 +810,9 @@ static int perf_session__process_event(struct perf_session *session, /* * For all kernel events we get the sample data */ - perf_session__parse_sample(session, event, &sample); + ret = perf_session__parse_sample(session, event, &sample); + if (ret) + return ret; /* Preprocess sample records - precheck callchains */ if (perf_session__preprocess_sample(session, event, &sample)) From 824c6b7f6294101f30e141117def224a56c203e6 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Sun, 22 May 2011 22:10:20 -0700 Subject: [PATCH 07/12] watchdog: Fix rounding bug in get_sample_period() In get_sample_period(), softlockup_thresh is integer divided by 5 before the multiplication by NSEC_PER_SEC. This results in softlockup_thresh being rounded down to the nearest integer multiple of 5. For example, a softlockup_thresh of 4 rounds down to 0. Signed-off-by: Mandeep Singh Baines Cc: Marcin Slusarz Cc: Don Zickus Cc: Peter Zijlstra Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/1306127423-3347-1-git-send-email-msb@chromium.org Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 14733d4d156b..a06972d71060 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -110,7 +110,7 @@ static unsigned long get_sample_period(void) * increment before the hardlockup detector generates * a warning */ - return softlockup_thresh / 5 * NSEC_PER_SEC; + return softlockup_thresh * (NSEC_PER_SEC / 5); } /* Commands for resetting the watchdog */ From e04ab2bc41b35c0cb6cdb07c8443f91aa738cf78 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Sun, 22 May 2011 22:10:21 -0700 Subject: [PATCH 08/12] watchdog: Only disable/enable watchdog if neccessary Don't take any action on an unsuccessful write to /proc. Signed-off-by: Mandeep Singh Baines Cc: Marcin Slusarz Cc: Don Zickus Cc: Peter Zijlstra Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/1306127423-3347-2-git-send-email-msb@chromium.org Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index a06972d71060..cf0e09f452e7 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -507,15 +507,19 @@ static void watchdog_disable_all_cpus(void) int proc_dowatchdog_enabled(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, buffer, length, ppos); + int ret; - if (write) { - if (watchdog_enabled) - watchdog_enable_all_cpus(); - else - watchdog_disable_all_cpus(); - } - return 0; + ret = proc_dointvec(table, write, buffer, length, ppos); + if (ret || !write) + goto out; + + if (watchdog_enabled) + watchdog_enable_all_cpus(); + else + watchdog_disable_all_cpus(); + +out: + return ret; } int proc_dowatchdog_thresh(struct ctl_table *table, int write, From 586692a5a5fc5740c8a46abc0f2365495c2d7c5f Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Sun, 22 May 2011 22:10:22 -0700 Subject: [PATCH 09/12] watchdog: Disable watchdog when thresh is zero This restores the previous behavior of softlock_thresh. Currently, setting watchdog_thresh to zero causes the watchdog kthreads to consume a lot of CPU. In addition, the logic of proc_dowatchdog_thresh and proc_dowatchdog_enabled has been factored into proc_dowatchdog. Signed-off-by: Mandeep Singh Baines Cc: Marcin Slusarz Cc: Don Zickus Cc: Peter Zijlstra Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/1306127423-3347-3-git-send-email-msb@chromium.org Signed-off-by: Ingo Molnar LKML-Reference: <20110517071018.GE22305@elte.hu> --- include/linux/nmi.h | 5 +++-- include/linux/sched.h | 1 - kernel/sysctl.c | 12 ++++++++---- kernel/watchdog.c | 25 +++++++++---------------- 4 files changed, 20 insertions(+), 23 deletions(-) diff --git a/include/linux/nmi.h b/include/linux/nmi.h index c536f8545f74..5317b8b2198f 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -47,9 +47,10 @@ static inline bool trigger_all_cpu_backtrace(void) int hw_nmi_is_cpu_stuck(struct pt_regs *); u64 hw_nmi_get_sample_period(void); extern int watchdog_enabled; +extern int watchdog_thresh; struct ctl_table; -extern int proc_dowatchdog_enabled(struct ctl_table *, int , - void __user *, size_t *, loff_t *); +extern int proc_dowatchdog(struct ctl_table *, int , + void __user *, size_t *, loff_t *); #endif #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 12211e1666e2..d8b2d0bec0d8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -315,7 +315,6 @@ extern int proc_dowatchdog_thresh(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); extern unsigned int softlockup_panic; -extern int softlockup_thresh; void lockup_detector_init(void); #else static inline void touch_softlockup_watchdog(void) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c0bb32414b17..3dd0c46fa3bb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -730,14 +730,16 @@ static struct ctl_table kern_table[] = { .data = &watchdog_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = proc_dowatchdog_enabled, + .proc_handler = proc_dowatchdog, + .extra1 = &zero, + .extra2 = &one, }, { .procname = "watchdog_thresh", - .data = &softlockup_thresh, + .data = &watchdog_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dowatchdog_thresh, + .proc_handler = proc_dowatchdog, .extra1 = &neg_one, .extra2 = &sixty, }, @@ -755,7 +757,9 @@ static struct ctl_table kern_table[] = { .data = &watchdog_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = proc_dowatchdog_enabled, + .proc_handler = proc_dowatchdog, + .extra1 = &zero, + .extra2 = &one, }, #endif #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index cf0e09f452e7..60301916f62e 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -28,7 +28,7 @@ #include int watchdog_enabled = 1; -int __read_mostly softlockup_thresh = 60; +int __read_mostly watchdog_thresh = 60; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); @@ -105,12 +105,12 @@ static unsigned long get_timestamp(int this_cpu) static unsigned long get_sample_period(void) { /* - * convert softlockup_thresh from seconds to ns + * convert watchdog_thresh from seconds to ns * the divide by 5 is to give hrtimer 5 chances to * increment before the hardlockup detector generates * a warning */ - return softlockup_thresh * (NSEC_PER_SEC / 5); + return watchdog_thresh * (NSEC_PER_SEC / 5); } /* Commands for resetting the watchdog */ @@ -182,7 +182,7 @@ static int is_softlockup(unsigned long touch_ts) unsigned long now = get_timestamp(smp_processor_id()); /* Warn about unreasonable delays: */ - if (time_after(now, touch_ts + softlockup_thresh)) + if (time_after(now, touch_ts + watchdog_thresh)) return now - touch_ts; return 0; @@ -501,19 +501,19 @@ static void watchdog_disable_all_cpus(void) /* sysctl functions */ #ifdef CONFIG_SYSCTL /* - * proc handler for /proc/sys/kernel/nmi_watchdog + * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh */ -int proc_dowatchdog_enabled(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) +int proc_dowatchdog(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; - ret = proc_dointvec(table, write, buffer, length, ppos); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) goto out; - if (watchdog_enabled) + if (watchdog_enabled && watchdog_thresh) watchdog_enable_all_cpus(); else watchdog_disable_all_cpus(); @@ -521,13 +521,6 @@ int proc_dowatchdog_enabled(struct ctl_table *table, int write, out: return ret; } - -int proc_dowatchdog_thresh(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - return proc_dointvec_minmax(table, write, buffer, lenp, ppos); -} #endif /* CONFIG_SYSCTL */ From 4eec42f392043063d0f019640b4ccc2a45570002 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Sun, 22 May 2011 22:10:23 -0700 Subject: [PATCH 10/12] watchdog: Change the default timeout and configure nmi watchdog period based on watchdog_thresh Before the conversion of the NMI watchdog to perf event, the watchdog timeout was 5 seconds. Now it is 60 seconds. For my particular application, netbooks, 5 seconds was a better timeout. With a short timeout, we catch faults earlier and are able to send back a panic. With a 60 second timeout, the user is unlikely to wait and will instead hit the power button, causing us to lose the panic info. This change configures the NMI period to watchdog_thresh and sets the softlockup_thresh to watchdog_thresh * 2. In addition, watchdog_thresh was reduced to 10 seconds as suggested by Ingo Molnar. Signed-off-by: Mandeep Singh Baines Cc: Marcin Slusarz Cc: Don Zickus Cc: Peter Zijlstra Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/1306127423-3347-4-git-send-email-msb@chromium.org Signed-off-by: Ingo Molnar LKML-Reference: <20110517071642.GF22305@elte.hu> --- arch/x86/kernel/apic/hw_nmi.c | 4 ++-- include/linux/nmi.h | 2 +- kernel/watchdog.c | 19 +++++++++++++++---- 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 5260fe91bcb6..d5e57db0f7be 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -19,9 +19,9 @@ #include #ifdef CONFIG_HARDLOCKUP_DETECTOR -u64 hw_nmi_get_sample_period(void) +u64 hw_nmi_get_sample_period(int watchdog_thresh) { - return (u64)(cpu_khz) * 1000 * 60; + return (u64)(cpu_khz) * 1000 * watchdog_thresh; } #endif diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 5317b8b2198f..2d304efc89df 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -45,7 +45,7 @@ static inline bool trigger_all_cpu_backtrace(void) #ifdef CONFIG_LOCKUP_DETECTOR int hw_nmi_is_cpu_stuck(struct pt_regs *); -u64 hw_nmi_get_sample_period(void); +u64 hw_nmi_get_sample_period(int watchdog_thresh); extern int watchdog_enabled; extern int watchdog_thresh; struct ctl_table; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 60301916f62e..6e63097fa73a 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -28,7 +28,7 @@ #include int watchdog_enabled = 1; -int __read_mostly watchdog_thresh = 60; +int __read_mostly watchdog_thresh = 10; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); @@ -91,6 +91,17 @@ static int __init nosoftlockup_setup(char *str) __setup("nosoftlockup", nosoftlockup_setup); /* */ +/* + * Hard-lockup warnings should be triggered after just a few seconds. Soft- + * lockups can have false positives under extreme conditions. So we generally + * want a higher threshold for soft lockups than for hard lockups. So we couple + * the thresholds with a factor: we make the soft threshold twice the amount of + * time the hard threshold is. + */ +static int get_softlockup_thresh() +{ + return watchdog_thresh * 2; +} /* * Returns seconds, approximately. We don't need nanosecond @@ -110,7 +121,7 @@ static unsigned long get_sample_period(void) * increment before the hardlockup detector generates * a warning */ - return watchdog_thresh * (NSEC_PER_SEC / 5); + return get_softlockup_thresh() * (NSEC_PER_SEC / 5); } /* Commands for resetting the watchdog */ @@ -182,7 +193,7 @@ static int is_softlockup(unsigned long touch_ts) unsigned long now = get_timestamp(smp_processor_id()); /* Warn about unreasonable delays: */ - if (time_after(now, touch_ts + watchdog_thresh)) + if (time_after(now, touch_ts + get_softlockup_thresh())) return now - touch_ts; return 0; @@ -359,7 +370,7 @@ static int watchdog_nmi_enable(int cpu) /* Try to register using hardware perf events */ wd_attr = &wd_hw_attr; - wd_attr->sample_period = hw_nmi_get_sample_period(); + wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback); if (!IS_ERR(event)) { printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); From 998bedc8c56c6869de457c845cbd328592e5e82e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 23 May 2011 13:06:28 +0200 Subject: [PATCH 11/12] perf tools: Fix ommitted mmap data update on remap Commit eac9eacee16 "perf tools: Check we are able to read the event size on mmap" brought a check to ensure we can read the size of the event before dereferencing it, and do a remap otherwise to move the buffer forward. However that remap was ommitting all the necessary work to update the new page offset, head, and to unmap previous pages, etc... To fix this, gather all the code that fetches the event in a seperate helper which does all the necessary checks about the header/event size and tells us anytime a remap is needed. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1306148788-6179-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- tools/perf/util/session.c | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 948327d9e92b..64500fc78799 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -960,6 +960,30 @@ out_err: return err; } +static union perf_event * +fetch_mmaped_event(struct perf_session *session, + u64 head, size_t mmap_size, char *buf) +{ + union perf_event *event; + + /* + * Ensure we have enough space remaining to read + * the size of the event in the headers. + */ + if (head + sizeof(event->header) > mmap_size) + return NULL; + + event = (union perf_event *)(buf + head); + + if (session->header.needs_swap) + perf_event_header__bswap(&event->header); + + if (head + event->header.size > mmap_size) + return NULL; + + return event; +} + int __perf_session__process_events(struct perf_session *session, u64 data_offset, u64 data_size, u64 file_size, struct perf_event_ops *ops) @@ -1014,19 +1038,8 @@ remap: file_pos = file_offset + head; more: - /* - * Ensure we have enough space remaining to read - * the size of the event in the headers. - */ - if (head + sizeof(event->header) > mmap_size) - goto remap; - - event = (union perf_event *)(buf + head); - - if (session->header.needs_swap) - perf_event_header__bswap(&event->header); - - if (head + event->header.size > mmap_size) { + event = fetch_mmaped_event(session, head, mmap_size, buf); + if (!event) { if (mmaps[map_idx]) { munmap(mmaps[map_idx], mmap_size); mmaps[map_idx] = NULL; From 3cb6d1540880e767d911b79eb49578de2190f428 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 23 May 2011 13:06:27 +0200 Subject: [PATCH 12/12] perf tools: Fix sample size bit operations What we want is to count the number of bits in the mask, not some other random operation written in the middle of the night. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1306148788-6179-2-git-send-email-fweisbec@gmail.com [ Fixed perf_event__names[] alignment which was nearby and hurting my eyes ... ] Signed-off-by: Ingo Molnar --- tools/perf/util/event.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 17c1c3c875c3..252b72a5e59e 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -9,21 +9,21 @@ #include "thread_map.h" static const char *perf_event__names[] = { - [0] = "TOTAL", - [PERF_RECORD_MMAP] = "MMAP", - [PERF_RECORD_LOST] = "LOST", - [PERF_RECORD_COMM] = "COMM", - [PERF_RECORD_EXIT] = "EXIT", - [PERF_RECORD_THROTTLE] = "THROTTLE", - [PERF_RECORD_UNTHROTTLE] = "UNTHROTTLE", - [PERF_RECORD_FORK] = "FORK", - [PERF_RECORD_READ] = "READ", - [PERF_RECORD_SAMPLE] = "SAMPLE", - [PERF_RECORD_HEADER_ATTR] = "ATTR", - [PERF_RECORD_HEADER_EVENT_TYPE] = "EVENT_TYPE", - [PERF_RECORD_HEADER_TRACING_DATA] = "TRACING_DATA", - [PERF_RECORD_HEADER_BUILD_ID] = "BUILD_ID", - [PERF_RECORD_FINISHED_ROUND] = "FINISHED_ROUND", + [0] = "TOTAL", + [PERF_RECORD_MMAP] = "MMAP", + [PERF_RECORD_LOST] = "LOST", + [PERF_RECORD_COMM] = "COMM", + [PERF_RECORD_EXIT] = "EXIT", + [PERF_RECORD_THROTTLE] = "THROTTLE", + [PERF_RECORD_UNTHROTTLE] = "UNTHROTTLE", + [PERF_RECORD_FORK] = "FORK", + [PERF_RECORD_READ] = "READ", + [PERF_RECORD_SAMPLE] = "SAMPLE", + [PERF_RECORD_HEADER_ATTR] = "ATTR", + [PERF_RECORD_HEADER_EVENT_TYPE] = "EVENT_TYPE", + [PERF_RECORD_HEADER_TRACING_DATA] = "TRACING_DATA", + [PERF_RECORD_HEADER_BUILD_ID] = "BUILD_ID", + [PERF_RECORD_FINISHED_ROUND] = "FINISHED_ROUND", }; const char *perf_event__name(unsigned int id) @@ -42,7 +42,7 @@ int perf_sample_size(u64 sample_type) int i; for (i = 0; i < 64; i++) { - if ((mask << i) & 1) + if (mask & (1UL << i)) size++; }