linux/include/trace/events/rcu.h
Paul E. McKenney f535a607c1 rcu: Eliminate RCU_FAST_NO_HZ grace-period hang
With the new implementation of RCU_FAST_NO_HZ, it was possible to hang
RCU grace periods as follows:

o	CPU 0 attempts to go idle, cycles several times through the
	rcu_prepare_for_idle() loop, then goes dyntick-idle when
	RCU needs nothing more from it, while still having at least
	one RCU callback pending.

o	CPU 1 goes idle with no callbacks.

Both CPUs can then stay in dyntick-idle mode indefinitely, preventing
the RCU grace period from ever completing, possibly hanging the system.

This commit therefore prevents CPUs that have RCU callbacks from entering
dyntick-idle mode.  This approach also eliminates the need for the
end-of-grace-period IPIs used previously.

Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
2011-12-11 10:32:02 -08:00

535 lines
14 KiB
C

/* Set TRACE_SYSTEM to "rcu" for the event definitions that follow. */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM rcu
/*
 * The definitions below are processed on the first inclusion of this
 * header, and again on any inclusion where TRACE_HEADER_MULTI_READ is
 * defined.
 */
#if !defined(_TRACE_RCU_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_RCU_H
#include <linux/tracepoint.h>
/*
* Tracepoint for start/end markers used for utilization calculations.
* By convention, the string is of the following forms:
*
* "Start <activity>" -- Mark the start of the specified activity,
* such as "context switch". Nesting is permitted.
* "End <activity>" -- Mark the end of the specified activity.
*
* An "@" character within "<activity>" is a comment character: Data
* reduction scripts will ignore the "@" and the remainder of the line.
*/
TRACE_EVENT(rcu_utilization,

	/* s: the marker string, e.g. "Start <activity>" or "End <activity>". */
	TP_PROTO(char *s),

	TP_ARGS(s),

	/* Only the pointer is recorded; the string contents are not copied. */
	TP_STRUCT__entry(
		__field(char *, s)
	),

	TP_fast_assign(
		__entry->s = s;
	),

	/* The recorded pointer is formatted with %s. */
	TP_printk("%s", __entry->s)
);
#ifdef CONFIG_RCU_TRACE
#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU)
/*
* Tracepoint for grace-period events: starting and ending a grace
* period ("start" and "end", respectively), a CPU noting the start
* of a new grace period or the end of an old grace period ("cpustart"
* and "cpuend", respectively), a CPU passing through a quiescent
* state ("cpuqs"), a CPU coming online or going offline ("cpuonl"
* and "cpuofl", respectively), and a CPU being kicked for being too
* long in dyntick-idle mode ("kick").
*/
TRACE_EVENT(rcu_grace_period,

	/*
	 * rcuname: type of RCU, gpnum: grace-period number,
	 * gpevent: event string such as "start", "end" or "cpuqs".
	 */
	TP_PROTO(char *rcuname, unsigned long gpnum, char *gpevent),

	TP_ARGS(rcuname, gpnum, gpevent),

	/* Both strings are recorded as pointers, not copied. */
	TP_STRUCT__entry(
		__field(char *, rcuname)
		__field(unsigned long, gpnum)
		__field(char *, gpevent)
	),

	TP_fast_assign(
		__entry->rcuname = rcuname;
		__entry->gpnum = gpnum;
		__entry->gpevent = gpevent;
	),

	/* Output: "<rcuname> <gpnum> <gpevent>". */
	TP_printk("%s %lu %s",
		  __entry->rcuname, __entry->gpnum, __entry->gpevent)
);
/*
* Tracepoint for grace-period-initialization events. These are
* distinguished by the type of RCU, the new grace-period number, the
* rcu_node structure level, the starting and ending CPU covered by the
* rcu_node structure, and the mask of CPUs that will be waited for.
* All but the type of RCU are extracted from the rcu_node structure.
*/
TRACE_EVENT(rcu_grace_period_init,

	/*
	 * rcuname: type of RCU, gpnum: new grace-period number,
	 * level: rcu_node level, grplo/grphi: first and last CPU covered,
	 * qsmask: mask of CPUs that will be waited for.
	 */
	TP_PROTO(char *rcuname, unsigned long gpnum, u8 level,
		 int grplo, int grphi, unsigned long qsmask),

	TP_ARGS(rcuname, gpnum, level, grplo, grphi, qsmask),

	/* rcuname is recorded as a pointer; everything else by value. */
	TP_STRUCT__entry(
		__field(char *, rcuname)
		__field(unsigned long, gpnum)
		__field(u8, level)
		__field(int, grplo)
		__field(int, grphi)
		__field(unsigned long, qsmask)
	),

	TP_fast_assign(
		__entry->rcuname = rcuname;
		__entry->gpnum = gpnum;
		__entry->level = level;
		__entry->grplo = grplo;
		__entry->grphi = grphi;
		__entry->qsmask = qsmask;
	),

	/* qsmask is the only field printed in hexadecimal. */
	TP_printk("%s %lu %u %d %d %lx",
		  __entry->rcuname, __entry->gpnum, __entry->level,
		  __entry->grplo, __entry->grphi, __entry->qsmask)
);
/*
* Tracepoint for tasks blocking within preemptible-RCU read-side
* critical sections. Track the type of RCU (which one day might
* include SRCU), the grace-period number that the task is blocking
* (the current or the next), and the task's PID.
*/
TRACE_EVENT(rcu_preempt_task,

	/* Note the argument order: the PID comes before the grace-period number. */
	TP_PROTO(char *rcuname, int pid, unsigned long gpnum),

	TP_ARGS(rcuname, pid, gpnum),

	TP_STRUCT__entry(
		__field(char *, rcuname)
		__field(unsigned long, gpnum)
		__field(int, pid)
	),

	TP_fast_assign(
		__entry->rcuname = rcuname;
		__entry->gpnum = gpnum;
		__entry->pid = pid;
	),

	/*
	 * Printed as "<rcuname> <gpnum> <pid>": the grace-period number
	 * precedes the PID here, unlike in the prototype.
	 */
	TP_printk("%s %lu %d",
		  __entry->rcuname, __entry->gpnum, __entry->pid)
);
/*
* Tracepoint for tasks that blocked within a given preemptible-RCU
* read-side critical section exiting that critical section. Track the
* type of RCU (which one day might include SRCU), the grace-period number,
* and the task's PID.
*/
TRACE_EVENT(rcu_unlock_preempted_task,

	/* rcuname: type of RCU, gpnum: grace-period number, pid: task PID. */
	TP_PROTO(char *rcuname, unsigned long gpnum, int pid),

	TP_ARGS(rcuname, gpnum, pid),

	/* rcuname is recorded as a pointer, not copied. */
	TP_STRUCT__entry(
		__field(char *, rcuname)
		__field(unsigned long, gpnum)
		__field(int, pid)
	),

	TP_fast_assign(
		__entry->rcuname = rcuname;
		__entry->gpnum = gpnum;
		__entry->pid = pid;
	),

	/* Output: "<rcuname> <gpnum> <pid>". */
	TP_printk("%s %lu %d",
		  __entry->rcuname, __entry->gpnum, __entry->pid)
);
/*
* Tracepoint for quiescent-state-reporting events. These are
* distinguished by the type of RCU, the grace-period number, the
* mask of quiescent lower-level entities, the rcu_node structure level,
* the starting and ending CPU covered by the rcu_node structure, and
* whether there are any blocked tasks blocking the current grace period.
* All but the type of RCU are extracted from the rcu_node structure.
*/
TRACE_EVENT(rcu_quiescent_state_report,

	/*
	 * rcuname: type of RCU, gpnum: grace-period number,
	 * mask: quiescent lower-level entities, qsmask: the rcu_node mask,
	 * level: rcu_node level, grplo/grphi: first and last CPU covered,
	 * gp_tasks: whether blocked tasks are blocking the grace period.
	 */
	TP_PROTO(char *rcuname, unsigned long gpnum,
		 unsigned long mask, unsigned long qsmask,
		 u8 level, int grplo, int grphi, int gp_tasks),

	TP_ARGS(rcuname, gpnum, mask, qsmask, level, grplo, grphi, gp_tasks),

	/* gp_tasks arrives as an int but is recorded in a u8 field. */
	TP_STRUCT__entry(
		__field(char *, rcuname)
		__field(unsigned long, gpnum)
		__field(unsigned long, mask)
		__field(unsigned long, qsmask)
		__field(u8, level)
		__field(int, grplo)
		__field(int, grphi)
		__field(u8, gp_tasks)
	),

	TP_fast_assign(
		__entry->rcuname = rcuname;
		__entry->gpnum = gpnum;
		__entry->mask = mask;
		__entry->qsmask = qsmask;
		__entry->level = level;
		__entry->grplo = grplo;
		__entry->grphi = grphi;
		__entry->gp_tasks = gp_tasks;
	),

	/* The two masks are printed in hex as "<mask>><qsmask>". */
	TP_printk("%s %lu %lx>%lx %u %d %d %u",
		  __entry->rcuname, __entry->gpnum,
		  __entry->mask, __entry->qsmask, __entry->level,
		  __entry->grplo, __entry->grphi, __entry->gp_tasks)
);
/*
* Tracepoint for quiescent states detected by force_quiescent_state().
* These trace events include the type of RCU, the grace-period number
* that was blocked by the CPU, the CPU itself, and the type of quiescent
* state, which can be "dti" for dyntick-idle mode, "ofl" for CPU offline,
* or "kick" when kicking a CPU that has been in dyntick-idle mode for
* too long.
*/
TRACE_EVENT(rcu_fqs,

	/*
	 * rcuname: type of RCU, gpnum: grace-period number, cpu: the CPU,
	 * qsevent: quiescent-state type string ("dti", "ofl" or "kick").
	 */
	TP_PROTO(char *rcuname, unsigned long gpnum, int cpu, char *qsevent),

	TP_ARGS(rcuname, gpnum, cpu, qsevent),

	/* Both strings are recorded as pointers, not copied. */
	TP_STRUCT__entry(
		__field(char *, rcuname)
		__field(unsigned long, gpnum)
		__field(int, cpu)
		__field(char *, qsevent)
	),

	TP_fast_assign(
		__entry->rcuname = rcuname;
		__entry->gpnum = gpnum;
		__entry->cpu = cpu;
		__entry->qsevent = qsevent;
	),

	/* Output: "<rcuname> <gpnum> <cpu> <qsevent>". */
	TP_printk("%s %lu %d %s",
		  __entry->rcuname, __entry->gpnum,
		  __entry->cpu, __entry->qsevent)
);
#endif /* #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) */
/*
* Tracepoint for dyntick-idle entry/exit events. These take a string
* as argument: "Start" for entering dyntick-idle mode, "End" for
* leaving it, "--=" for events moving towards idle, and "++=" for events
* moving away from idle. "Error on entry: not idle task" and "Error on
* exit: not idle task" indicate that a non-idle task is erroneously
* toying with the idle loop.
*
* These events also take a pair of numbers, which indicate the nesting
* depth before and after the event of interest. Note that task-related
* events use the upper bits of each number, while interrupt-related
* events use the lower bits.
*/
TRACE_EVENT(rcu_dyntick,

	/*
	 * polarity: event string such as "Start", "End", "--=" or "++=",
	 * oldnesting/newnesting: nesting depth before and after the event.
	 */
	TP_PROTO(char *polarity, long long oldnesting, long long newnesting),

	TP_ARGS(polarity, oldnesting, newnesting),

	/* polarity is recorded as a pointer, not copied. */
	TP_STRUCT__entry(
		__field(char *, polarity)
		__field(long long, oldnesting)
		__field(long long, newnesting)
	),

	TP_fast_assign(
		__entry->polarity = polarity;
		__entry->oldnesting = oldnesting;
		__entry->newnesting = newnesting;
	),

	/* Both nesting values are printed in hexadecimal. */
	TP_printk("%s %llx %llx", __entry->polarity,
		  __entry->oldnesting, __entry->newnesting)
);
/*
* Tracepoint for RCU preparation for idle, the goal being to get RCU
* processing done so that the current CPU can shut off its scheduling
* clock and enter dyntick-idle mode. One way to accomplish this is
* to drain all RCU callbacks from this CPU, and the other is to have
* done everything RCU requires for the current grace period. In this
* latter case, the CPU will be awakened at the end of the current grace
* period in order to process the remainder of its callbacks.
*
* These tracepoints take a string as argument:
*
* "No callbacks": Nothing to do, no callbacks on this CPU.
* "In holdoff": Nothing to do, holding off after unsuccessful attempt.
* "Begin holdoff": Attempt failed, don't retry until next jiffy.
* "More callbacks": Still more callbacks, try again to clear them out.
* "Callbacks drained": All callbacks processed, off to dyntick idle!
* "CPU awakened at GP end":
*/
TRACE_EVENT(rcu_prep_idle,

	/* reason: one of the strings listed above, e.g. "No callbacks". */
	TP_PROTO(char *reason),

	TP_ARGS(reason),

	/* Only the pointer is recorded; the string contents are not copied. */
	TP_STRUCT__entry(
		__field(char *, reason)
	),

	TP_fast_assign(
		__entry->reason = reason;
	),

	/* The recorded pointer is formatted with %s. */
	TP_printk("%s", __entry->reason)
);
/*
* Tracepoint for the registration of a single RCU callback function.
* The first argument is the type of RCU, the second argument is
* a pointer to the RCU callback itself, and the third element is the
* new RCU callback queue length for the current CPU.
*/
TRACE_EVENT(rcu_callback,

	/*
	 * rcuname: type of RCU, rhp: the callback being registered,
	 * qlen: new callback queue length.
	 */
	TP_PROTO(char *rcuname, struct rcu_head *rhp, long qlen),

	TP_ARGS(rcuname, rhp, qlen),

	/* The callback and its function are both recorded as void pointers. */
	TP_STRUCT__entry(
		__field(char *, rcuname)
		__field(void *, rhp)
		__field(void *, func)
		__field(long, qlen)
	),

	/* rhp is dereferenced here to capture the callback's function pointer. */
	TP_fast_assign(
		__entry->rcuname = rcuname;
		__entry->rhp = rhp;
		__entry->func = rhp->func;
		__entry->qlen = qlen;
	),

	/* Output: "<rcuname> rhp=<pointer> func=<function> <qlen>". */
	TP_printk("%s rhp=%p func=%pf %ld",
		  __entry->rcuname, __entry->rhp, __entry->func, __entry->qlen)
);
/*
* Tracepoint for the registration of a single RCU callback of the special
* kfree() form. The first argument is the RCU type, the second argument
* is a pointer to the RCU callback, the third argument is the offset
* of the callback within the enclosing RCU-protected data structure,
* and the fourth argument is the new RCU callback queue length for the
* current CPU.
*/
TRACE_EVENT(rcu_kfree_callback,

	/*
	 * rcuname: type of RCU, rhp: the callback being registered,
	 * offset: offset of the callback within its enclosing structure,
	 * qlen: new callback queue length.
	 */
	TP_PROTO(char *rcuname, struct rcu_head *rhp, unsigned long offset,
		 long qlen),

	TP_ARGS(rcuname, rhp, offset, qlen),

	/* rhp is recorded as a void pointer and is not dereferenced. */
	TP_STRUCT__entry(
		__field(char *, rcuname)
		__field(void *, rhp)
		__field(unsigned long, offset)
		__field(long, qlen)
	),

	TP_fast_assign(
		__entry->rcuname = rcuname;
		__entry->rhp = rhp;
		__entry->offset = offset;
		__entry->qlen = qlen;
	),

	/*
	 * offset is an unsigned long and is therefore printed with %lu;
	 * qlen is a signed long and keeps %ld.  The offset appears under
	 * the "func=" label.
	 */
	TP_printk("%s rhp=%p func=%lu %ld",
		  __entry->rcuname, __entry->rhp, __entry->offset,
		  __entry->qlen)
);
/*
* Tracepoint for marking the beginning of rcu_do_batch, performed to start
* RCU callback invocation. The first argument is the RCU flavor,
* the second is the total number of callbacks (including those that
* are not yet ready to be invoked), and the third argument is the
* current RCU-callback batch limit.
*/
TRACE_EVENT(rcu_batch_start,

	/*
	 * rcuname: RCU flavor, qlen: total number of callbacks,
	 * blimit: current callback batch limit.
	 */
	TP_PROTO(char *rcuname, long qlen, int blimit),

	TP_ARGS(rcuname, qlen, blimit),

	/* rcuname is recorded as a pointer, not copied. */
	TP_STRUCT__entry(
		__field(char *, rcuname)
		__field(long, qlen)
		__field(int, blimit)
	),

	TP_fast_assign(
		__entry->rcuname = rcuname;
		__entry->qlen = qlen;
		__entry->blimit = blimit;
	),

	/* Output: "<rcuname> CBs=<qlen> bl=<blimit>". */
	TP_printk("%s CBs=%ld bl=%d",
		  __entry->rcuname, __entry->qlen, __entry->blimit)
);
/*
* Tracepoint for the invocation of a single RCU callback function.
* The first argument is the type of RCU, and the second argument is
* a pointer to the RCU callback itself.
*/
TRACE_EVENT(rcu_invoke_callback,

	/* rcuname: type of RCU, rhp: the callback being invoked. */
	TP_PROTO(char *rcuname, struct rcu_head *rhp),

	TP_ARGS(rcuname, rhp),

	/* The callback and its function are both recorded as void pointers. */
	TP_STRUCT__entry(
		__field(char *, rcuname)
		__field(void *, rhp)
		__field(void *, func)
	),

	/* rhp is dereferenced here to capture the callback's function pointer. */
	TP_fast_assign(
		__entry->rcuname = rcuname;
		__entry->rhp = rhp;
		__entry->func = rhp->func;
	),

	/* Output: "<rcuname> rhp=<pointer> func=<function>". */
	TP_printk("%s rhp=%p func=%pf",
		  __entry->rcuname, __entry->rhp, __entry->func)
);
/*
* Tracepoint for the invocation of a single RCU callback of the special
* kfree() form. The first argument is the RCU flavor, the second
* argument is a pointer to the RCU callback, and the third argument
* is the offset of the callback within the enclosing RCU-protected
* data structure.
*/
TRACE_EVENT(rcu_invoke_kfree_callback,

	/*
	 * rcuname: RCU flavor, rhp: the callback being invoked,
	 * offset: offset of the callback within its enclosing structure.
	 */
	TP_PROTO(char *rcuname, struct rcu_head *rhp, unsigned long offset),

	TP_ARGS(rcuname, rhp, offset),

	/* rhp is recorded as a void pointer and is not dereferenced. */
	TP_STRUCT__entry(
		__field(char *, rcuname)
		__field(void *, rhp)
		__field(unsigned long, offset)
	),

	TP_fast_assign(
		__entry->rcuname = rcuname;
		__entry->rhp = rhp;
		__entry->offset = offset;
	),

	/*
	 * offset is an unsigned long and is therefore printed with %lu.
	 * It appears under the "func=" label.
	 */
	TP_printk("%s rhp=%p func=%lu",
		  __entry->rcuname, __entry->rhp, __entry->offset)
);
/*
* Tracepoint for exiting rcu_do_batch after RCU callbacks have been
* invoked. The first argument is the name of the RCU flavor and
* the second argument is the number of callbacks actually invoked.
*/
TRACE_EVENT(rcu_batch_end,

	/* rcuname: RCU flavor, callbacks_invoked: number of callbacks invoked. */
	TP_PROTO(char *rcuname, int callbacks_invoked),

	TP_ARGS(rcuname, callbacks_invoked),

	/* rcuname is recorded as a pointer, not copied. */
	TP_STRUCT__entry(
		__field(char *, rcuname)
		__field(int, callbacks_invoked)
	),

	TP_fast_assign(
		__entry->rcuname = rcuname;
		__entry->callbacks_invoked = callbacks_invoked;
	),

	/* Output: "<rcuname> CBs-invoked=<callbacks_invoked>". */
	TP_printk("%s CBs-invoked=%d",
		  __entry->rcuname, __entry->callbacks_invoked)
);
/*
* Tracepoint for rcutorture readers. The first argument is the name
* of the RCU flavor from rcutorture's viewpoint and the second argument
* is the callback address.
*/
TRACE_EVENT(rcu_torture_read,

	/* rcutorturename: RCU flavor as rcutorture names it, rhp: callback address. */
	TP_PROTO(char *rcutorturename, struct rcu_head *rhp),

	TP_ARGS(rcutorturename, rhp),

	/* Unlike the callback events, rhp keeps its struct rcu_head * type here. */
	TP_STRUCT__entry(
		__field(char *, rcutorturename)
		__field(struct rcu_head *, rhp)
	),

	/* rhp is recorded but never dereferenced. */
	TP_fast_assign(
		__entry->rcutorturename = rcutorturename;
		__entry->rhp = rhp;
	),

	/* Output: "<rcutorturename> torture read <pointer>". */
	TP_printk("%s torture read %p",
		  __entry->rcutorturename, __entry->rhp)
);
#else /* #ifdef CONFIG_RCU_TRACE */
/*
 * CONFIG_RCU_TRACE is not defined on this branch: each tracepoint below
 * expands to an empty statement, so call sites still compile and their
 * arguments are never evaluated.
 */
#define trace_rcu_grace_period(rcuname, gpnum, gpevent) \
	do { } while (0)
#define trace_rcu_grace_period_init(rcuname, gpnum, level, grplo, grphi, qsmask) \
	do { } while (0)
#define trace_rcu_preempt_task(rcuname, pid, gpnum) \
	do { } while (0)
#define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) \
	do { } while (0)
#define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, grplo, grphi, gp_tasks) \
	do { } while (0)
#define trace_rcu_fqs(rcuname, gpnum, cpu, qsevent) \
	do { } while (0)
#define trace_rcu_dyntick(polarity, oldnesting, newnesting) \
	do { } while (0)
#define trace_rcu_prep_idle(reason) \
	do { } while (0)
#define trace_rcu_callback(rcuname, rhp, qlen) \
	do { } while (0)
#define trace_rcu_kfree_callback(rcuname, rhp, offset, qlen) \
	do { } while (0)
#define trace_rcu_batch_start(rcuname, qlen, blimit) \
	do { } while (0)
#define trace_rcu_invoke_callback(rcuname, rhp) \
	do { } while (0)
#define trace_rcu_invoke_kfree_callback(rcuname, rhp, offset) \
	do { } while (0)
#define trace_rcu_batch_end(rcuname, callbacks_invoked) \
	do { } while (0)
#define trace_rcu_torture_read(rcutorturename, rhp) \
	do { } while (0)
#endif /* #else #ifdef CONFIG_RCU_TRACE */
#endif /* _TRACE_RCU_H */
/* This part must be outside protection */
#include <trace/define_trace.h>