oom: select task from tasklist for mempolicy ooms

The oom killer presently kills current whenever there is no more memory
free or reclaimable on its mempolicy's nodes.  There is no guarantee that
current is a memory-hogging task or that killing it will free any
substantial amount of memory, however.

In such situations, it is better to scan the tasklist for tasks that are
allowed to allocate on current's set of nodes and kill the task with the
highest badness() score.  This ensures that the most memory-hogging task,
or the one configured by the user with /proc/pid/oom_adj, is always
selected in such scenarios.

Signed-off-by: David Rientjes <rientjes@google.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
David Rientjes 2010-08-09 17:18:52 -07:00 committed by Linus Torvalds
parent 5e9d834a0e
commit 6f48d0ebd9
3 changed files with 124 additions and 37 deletions

View File

@ -210,6 +210,8 @@ extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
unsigned long addr, gfp_t gfp_flags, unsigned long addr, gfp_t gfp_flags,
struct mempolicy **mpol, nodemask_t **nodemask); struct mempolicy **mpol, nodemask_t **nodemask);
extern bool init_nodemask_of_mempolicy(nodemask_t *mask); extern bool init_nodemask_of_mempolicy(nodemask_t *mask);
extern bool mempolicy_nodemask_intersects(struct task_struct *tsk,
const nodemask_t *mask);
extern unsigned slab_node(struct mempolicy *policy); extern unsigned slab_node(struct mempolicy *policy);
extern enum zone_type policy_zone; extern enum zone_type policy_zone;
@ -338,7 +340,16 @@ static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
return node_zonelist(0, gfp_flags); return node_zonelist(0, gfp_flags);
} }
static inline bool init_nodemask_of_mempolicy(nodemask_t *m) { return false; } static inline bool init_nodemask_of_mempolicy(nodemask_t *m)
{
return false;
}
/*
 * Stub variant of mempolicy_nodemask_intersects(): ignores both @tsk and
 * @mask and always reports that there is no intersection.
 * NOTE(review): presumably this is the variant built when NUMA mempolicy
 * support is compiled out -- confirm against the enclosing #ifdef.
 */
static inline bool mempolicy_nodemask_intersects(struct task_struct *tsk,
const nodemask_t *mask)
{
return false;
}
static inline int do_migrate_pages(struct mm_struct *mm, static inline int do_migrate_pages(struct mm_struct *mm,
const nodemask_t *from_nodes, const nodemask_t *from_nodes,

View File

@ -1712,6 +1712,50 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
} }
#endif #endif
/*
* mempolicy_nodemask_intersects
*
* If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
* policy. Otherwise, check for intersection between mask and the policy
* nodemask for 'bind' or 'interleave' policy. For 'perferred' or 'local'
* policy, always return true since it may allocate elsewhere on fallback.
*
* Takes task_lock(tsk) to prevent freeing of its mempolicy.
*/
bool mempolicy_nodemask_intersects(struct task_struct *tsk,
const nodemask_t *mask)
{
struct mempolicy *mempolicy;
bool ret = true;
if (!mask)
return ret;
task_lock(tsk);
mempolicy = tsk->mempolicy;
if (!mempolicy)
goto out;
switch (mempolicy->mode) {
case MPOL_PREFERRED:
/*
* MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
* allocate from, they may fallback to other nodes when oom.
* Thus, it's possible for tsk to have allocated memory from
* nodes in mask.
*/
break;
case MPOL_BIND:
case MPOL_INTERLEAVE:
ret = nodes_intersects(mempolicy->v.nodes, *mask);
break;
default:
BUG();
}
out:
task_unlock(tsk);
return ret;
}
/* Allocate a page in interleaved policy. /* Allocate a page in interleaved policy.
Own path because it needs to do special accounting. */ Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,

View File

@ -27,6 +27,7 @@
#include <linux/module.h> #include <linux/module.h>
#include <linux/notifier.h> #include <linux/notifier.h>
#include <linux/memcontrol.h> #include <linux/memcontrol.h>
#include <linux/mempolicy.h>
#include <linux/security.h> #include <linux/security.h>
int sysctl_panic_on_oom; int sysctl_panic_on_oom;
@ -35,23 +36,57 @@ int sysctl_oom_dump_tasks;
static DEFINE_SPINLOCK(zone_scan_lock); static DEFINE_SPINLOCK(zone_scan_lock);
/* #define DEBUG */ /* #define DEBUG */
/* #ifdef CONFIG_NUMA
* Is all threads of the target process nodes overlap ours? /**
* has_intersects_mems_allowed() - check task eligibility for kill
* @tsk: task struct of which task to consider
* @mask: nodemask passed to page allocator for mempolicy ooms
*
* Task eligibility is determined by whether or not a candidate task, @tsk,
* shares the same mempolicy nodes as current if it is bound by such a policy
* and whether or not it has the same set of allowed cpuset nodes.
*/ */
static int has_intersects_mems_allowed(struct task_struct *tsk) static bool has_intersects_mems_allowed(struct task_struct *tsk,
const nodemask_t *mask)
{ {
struct task_struct *t; struct task_struct *start = tsk;
t = tsk;
do { do {
if (cpuset_mems_allowed_intersects(current, t)) if (mask) {
return 1; /*
t = next_thread(t); * If this is a mempolicy constrained oom, tsk's
} while (t != tsk); * cpuset is irrelevant. Only return true if its
* mempolicy intersects current, otherwise it may be
return 0; * needlessly killed.
*/
if (mempolicy_nodemask_intersects(tsk, mask))
return true;
} else {
/*
* This is not a mempolicy constrained oom, so only
* check the mems of tsk's cpuset.
*/
if (cpuset_mems_allowed_intersects(current, tsk))
return true;
}
tsk = next_thread(tsk);
} while (tsk != start);
return false;
} }
#else
/*
 * Variant used when CONFIG_NUMA is not set: @tsk and @mask are ignored and
 * every task is reported as eligible.
 */
static bool has_intersects_mems_allowed(struct task_struct *tsk,
const nodemask_t *mask)
{
return true;
}
#endif /* CONFIG_NUMA */
/*
* The process p may have detached its own ->mm while exiting or through
* use_mm(), but one or more of its subthreads may still have a valid
* pointer. Return p, or any of its subthreads with a valid ->mm, with
* task_lock() held.
*/
static struct task_struct *find_lock_task_mm(struct task_struct *p) static struct task_struct *find_lock_task_mm(struct task_struct *p)
{ {
struct task_struct *t = p; struct task_struct *t = p;
@ -106,10 +141,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
* The memory size of the process is the basis for the badness. * The memory size of the process is the basis for the badness.
*/ */
points = p->mm->total_vm; points = p->mm->total_vm;
/*
* After this unlock we can no longer dereference local variable `mm'
*/
task_unlock(p); task_unlock(p);
/* /*
@ -253,7 +284,8 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
* (not docbooked, we don't want this one cluttering up the manual) * (not docbooked, we don't want this one cluttering up the manual)
*/ */
static struct task_struct *select_bad_process(unsigned long *ppoints, static struct task_struct *select_bad_process(unsigned long *ppoints,
struct mem_cgroup *mem) struct mem_cgroup *mem, enum oom_constraint constraint,
const nodemask_t *mask)
{ {
struct task_struct *p; struct task_struct *p;
struct task_struct *chosen = NULL; struct task_struct *chosen = NULL;
@ -269,7 +301,9 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
continue; continue;
if (mem && !task_in_mem_cgroup(p, mem)) if (mem && !task_in_mem_cgroup(p, mem))
continue; continue;
if (!has_intersects_mems_allowed(p)) if (!has_intersects_mems_allowed(p,
constraint == CONSTRAINT_MEMORY_POLICY ? mask :
NULL))
continue; continue;
/* /*
@ -497,7 +531,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
panic("out of memory(memcg). panic_on_oom is selected.\n"); panic("out of memory(memcg). panic_on_oom is selected.\n");
read_lock(&tasklist_lock); read_lock(&tasklist_lock);
retry: retry:
p = select_bad_process(&points, mem); p = select_bad_process(&points, mem, CONSTRAINT_NONE, NULL);
if (!p || PTR_ERR(p) == -1UL) if (!p || PTR_ERR(p) == -1UL)
goto out; goto out;
@ -576,7 +610,8 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
/* /*
* Must be called with tasklist_lock held for read. * Must be called with tasklist_lock held for read.
*/ */
static void __out_of_memory(gfp_t gfp_mask, int order) static void __out_of_memory(gfp_t gfp_mask, int order,
enum oom_constraint constraint, const nodemask_t *mask)
{ {
struct task_struct *p; struct task_struct *p;
unsigned long points; unsigned long points;
@ -590,7 +625,7 @@ retry:
* Rambo mode: Shoot down a process and hope it solves whatever * Rambo mode: Shoot down a process and hope it solves whatever
* issues we may have. * issues we may have.
*/ */
p = select_bad_process(&points, NULL); p = select_bad_process(&points, NULL, constraint, mask);
if (PTR_ERR(p) == -1UL) if (PTR_ERR(p) == -1UL)
return; return;
@ -624,7 +659,8 @@ void pagefault_out_of_memory(void)
panic("out of memory from page fault. panic_on_oom is selected.\n"); panic("out of memory from page fault. panic_on_oom is selected.\n");
read_lock(&tasklist_lock); read_lock(&tasklist_lock);
__out_of_memory(0, 0); /* unknown gfp_mask and order */ /* unknown gfp_mask and order */
__out_of_memory(0, 0, CONSTRAINT_NONE, NULL);
read_unlock(&tasklist_lock); read_unlock(&tasklist_lock);
/* /*
@ -640,6 +676,7 @@ void pagefault_out_of_memory(void)
* @zonelist: zonelist pointer * @zonelist: zonelist pointer
* @gfp_mask: memory allocation flags * @gfp_mask: memory allocation flags
* @order: amount of memory being requested as a power of 2 * @order: amount of memory being requested as a power of 2
* @nodemask: nodemask passed to page allocator
* *
* If we run out of memory, we have the choice between either * If we run out of memory, we have the choice between either
* killing a random task (bad), letting the system crash (worse) * killing a random task (bad), letting the system crash (worse)
@ -678,24 +715,19 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
*/ */
constraint = constrained_alloc(zonelist, gfp_mask, nodemask); constraint = constrained_alloc(zonelist, gfp_mask, nodemask);
read_lock(&tasklist_lock); read_lock(&tasklist_lock);
if (unlikely(sysctl_panic_on_oom)) {
switch (constraint) { /*
case CONSTRAINT_MEMORY_POLICY: * panic_on_oom only affects CONSTRAINT_NONE, the kernel
oom_kill_process(current, gfp_mask, order, 0, NULL, * should not panic for cpuset or mempolicy induced memory
"No available memory (MPOL_BIND)"); * failures.
break; */
if (constraint == CONSTRAINT_NONE) {
case CONSTRAINT_NONE:
if (sysctl_panic_on_oom) {
dump_header(NULL, gfp_mask, order, NULL); dump_header(NULL, gfp_mask, order, NULL);
panic("out of memory. panic_on_oom is selected\n"); read_unlock(&tasklist_lock);
panic("Out of memory: panic_on_oom is enabled\n");
} }
/* Fall-through */
case CONSTRAINT_CPUSET:
__out_of_memory(gfp_mask, order);
break;
} }
__out_of_memory(gfp_mask, order, constraint, nodemask);
read_unlock(&tasklist_lock); read_unlock(&tasklist_lock);
/* /*