Revert "Squash revert 3.18 binder/lmk changes"

This reverts commit 3b278f89ee.
TARKZiM 2022-01-21 11:12:33 +08:00
parent 8ba63ccfde
commit 9a3c5248a3
31 changed files with 8713 additions and 5992 deletions

View File

@ -40,7 +40,6 @@ Features:
- soft limit
- moving (recharging) account at moving a task is selectable.
- usage threshold notifier
- memory pressure notifier
- oom-killer disable knob and oom-notifier
- Root cgroup has no limit controls.
@ -66,7 +65,6 @@ Brief summary of control files.
memory.stat # show various statistics
memory.use_hierarchy # set/show hierarchical account enabled
memory.force_empty # trigger forced move charge to parent
memory.pressure_level # set memory pressure notifications
memory.swappiness # set/show swappiness parameter of vmscan
(See sysctl's vm.swappiness)
memory.move_charge_at_immigrate # set/show controls of moving charges
@ -766,73 +764,7 @@ At reading, current status of OOM is shown.
under_oom 0 or 1 (if 1, the memory cgroup is under OOM, tasks may
be stopped.)
11. Memory Pressure
The pressure level notifications can be used to monitor the memory
allocation cost; based on the pressure, applications can implement
different strategies of managing their memory resources. The pressure
levels are defined as follows:
The "low" level means that the system is reclaiming memory for new
allocations. Monitoring this reclaiming activity might be useful for
maintaining cache level. Upon notification, the program (typically
"Activity Manager") might analyze vmstat and act in advance (i.e.
prematurely shutdown unimportant services).
The "medium" level means that the system is experiencing medium memory
pressure; the system might be swapping, paging out active file caches,
etc. Upon this event, applications may decide to further analyze
vmstat/zoneinfo/memcg or internal memory usage statistics and free any
resources that can be easily reconstructed or re-read from a disk.
The "critical" level means that the system is actively thrashing, it is
about to run out of memory (OOM) or even the in-kernel OOM killer is on its
way to trigger. Applications should do whatever they can to help the
system. It might be too late to consult with vmstat or any other
statistics, so it is advisable to take immediate action.
The events are propagated upward until the event is handled, i.e. the
events are not pass-through. Here is what this means: for example you have
three cgroups: A->B->C. Now you set up an event listener on cgroups A, B
and C, and suppose group C experiences some pressure. In this situation,
only group C will receive the notification, i.e. groups A and B will not
receive it. This is done to avoid excessive "broadcasting" of messages,
which disturbs the system and which is especially bad if we are low on
memory or thrashing. So, organize the cgroups wisely, or propagate the
events manually (or, ask us to implement the pass-through events,
explaining why you would need them).
The file memory.pressure_level is only used to set up an eventfd. To
register a notification, an application must:
- create an eventfd using eventfd(2);
- open memory.pressure_level;
- write string like "<event_fd> <fd of memory.pressure_level> <level>"
to cgroup.event_control.
The application will be notified through the eventfd when memory pressure is
at the specified level (or higher). Read/write operations on
memory.pressure_level are not implemented.
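A minimal C sketch of this registration sequence (error handling omitted;
the "foo" cgroup path matches the test below and is only an example):

	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/eventfd.h>

	int main(void)
	{
		char line[64];
		uint64_t count;
		/* 1. create an eventfd */
		int efd = eventfd(0, 0);
		/* 2. open memory.pressure_level */
		int pfd = open("/sys/fs/cgroup/memory/foo/memory.pressure_level",
			       O_RDONLY);
		/* 3. write "<event_fd> <fd of memory.pressure_level> <level>"
		 *    to cgroup.event_control
		 */
		int cfd = open("/sys/fs/cgroup/memory/foo/cgroup.event_control",
			       O_WRONLY);
		snprintf(line, sizeof(line), "%d %d low", efd, pfd);
		write(cfd, line, strlen(line));
		/* block until the "low" (or higher) pressure level fires */
		read(efd, &count, sizeof(count));
		printf("memory pressure event received\n");
		return 0;
	}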
Test:
Here is a small script example that makes a new cgroup, sets up a
memory limit, sets up a notification in the cgroup, and then makes the child
cgroup experience critical pressure:
# cd /sys/fs/cgroup/memory/
# mkdir foo
# cd foo
# cgroup_event_listener memory.pressure_level low &
# echo 8000000 > memory.limit_in_bytes
# echo 8000000 > memory.memsw.limit_in_bytes
# echo $$ > tasks
# dd if=/dev/zero | read x
(Expect a bunch of notifications, and eventually, the oom-killer will
trigger.)
12. TODO
11. TODO
1. Add support for accounting huge pages (as a separate controller)
2. Make per-cgroup scanner reclaim not-shared pages first

View File

@ -174,8 +174,6 @@ source "drivers/ipack/Kconfig"
source "drivers/reset/Kconfig"
source "drivers/android/Kconfig"
source "drivers/coresight/Kconfig"
source "drivers/bif/Kconfig"

View File

@ -158,7 +158,6 @@ obj-$(CONFIG_IIO) += iio/
obj-$(CONFIG_VME_BUS) += vme/
obj-$(CONFIG_IPACK_BUS) += ipack/
obj-$(CONFIG_NTB) += ntb/
obj-$(CONFIG_ANDROID) += android/
obj-$(CONFIG_CORESIGHT) += coresight/
obj-$(CONFIG_ESOC) += esoc/

View File

@ -1,49 +0,0 @@
menu "Android"
config ANDROID
bool "Android Drivers"
---help---
Enable support for various drivers needed on the Android platform
if ANDROID
config ANDROID_BINDER_IPC
bool "Android Binder IPC Driver"
depends on MMU
default n
---help---
Binder is used in Android for both communication between processes,
and remote method invocation.
This means one Android process can call a method/routine in another
Android process, using Binder to identify, invoke and pass arguments
between said processes.
config ANDROID_BINDER_DEVICES
string "Android Binder devices"
depends on ANDROID_BINDER_IPC
default "binder,hwbinder,vndbinder"
---help---
Default value for the binder.devices parameter.
The binder.devices parameter is a comma-separated list of strings
that specifies the names of the binder device nodes that will be
created. Each binder device has its own context manager, and is
therefore logically separated from the other devices.
config ANDROID_BINDER_IPC_32BIT
bool
depends on !64BIT && ANDROID_BINDER_IPC
default y
---help---
The Binder API has been changed to support both 32 and 64bit
applications in a mixed environment.
Enable this to support an old 32-bit Android user-space (v4.4 and
earlier).
Note that enabling this will break newer Android user-space.
endif # if ANDROID
endmenu

View File

@ -1,3 +0,0 @@
ccflags-y += -I$(src) # needed for trace events
obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o

File diff suppressed because it is too large

View File

@ -8,6 +8,53 @@ config ANDROID
if ANDROID
config ANDROID_BINDER_IPC
bool "Android Binder IPC Driver"
depends on MMU
default n
---help---
Binder is used in Android for both communication between processes,
and remote method invocation.
This means one Android process can call a method/routine in another
Android process, using Binder to identify, invoke and pass arguments
between said processes.
config ANDROID_BINDER_IPC_32BIT
bool "Use old (Android 4.4 and earlier) 32-bit binder API"
depends on !64BIT && ANDROID_BINDER_IPC
default y
---help---
The Binder API has been changed to support both 32 and 64bit
applications in a mixed environment.
Enable this to support an old 32-bit Android user-space (v4.4 and
earlier).
Note that enabling this will break newer Android user-space.
config ANDROID_BINDER_DEVICES
string "Android Binder devices"
depends on ANDROID_BINDER_IPC
default "binder,hwbinder,vndbinder"
---help---
Default value for the binder.devices parameter.
The binder.devices parameter is a comma-separated list of strings
that specifies the names of the binder device nodes that will be
created. Each binder device has its own context manager, and is
therefore logically separated from the other devices.
config ANDROID_BINDER_IPC_SELFTEST
bool "Android Binder IPC Driver Selftest"
depends on ANDROID_BINDER_IPC
---help---
This feature allows binder selftest to run.
Binder selftest checks the allocation and free of binder buffers
exhaustively with combinations of various buffer sizes and
alignments.
config ASHMEM
bool "Enable the Anonymous Shared Memory Subsystem"
default n

View File

@ -3,6 +3,8 @@ ccflags-y += -I$(src) # needed for trace events
obj-y += ion/
obj-$(CONFIG_FIQ_DEBUGGER) += fiq_debugger/
obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o binder_alloc.o
obj-$(CONFIG_ANDROID_BINDER_IPC_SELFTEST) += binder_alloc_selftest.o
obj-$(CONFIG_ASHMEM) += ashmem.o
obj-$(CONFIG_ANDROID_TIMED_OUTPUT) += timed_output.o
obj-$(CONFIG_ANDROID_TIMED_GPIO) += timed_gpio.o

View File

@ -268,24 +268,23 @@ static loff_t ashmem_llseek(struct file *file, loff_t offset, int origin)
mutex_lock(&ashmem_mutex);
if (asma->size == 0) {
ret = -EINVAL;
goto out;
mutex_unlock(&ashmem_mutex);
return -EINVAL;
}
if (!asma->file) {
ret = -EBADF;
goto out;
mutex_unlock(&ashmem_mutex);
return -EBADF;
}
ret = asma->file->f_op->llseek(asma->file, offset, origin);
mutex_unlock(&ashmem_mutex);
ret = vfs_llseek(asma->file, offset, origin);
if (ret < 0)
goto out;
return ret;
/** Copy f_pos from backing file, since f_ops->llseek() sets it */
file->f_pos = asma->file->f_pos;
out:
mutex_unlock(&ashmem_mutex);
return ret;
}
@ -309,6 +308,12 @@ static int ashmem_mmap(struct file *file, struct vm_area_struct *vma)
goto out;
}
/* requested mapping size larger than object size */
if (vma->vm_end - vma->vm_start > PAGE_ALIGN(asma->size)) {
ret = -EINVAL;
goto out;
}
/* requested protection bits must match our allowed protection mask */
if (unlikely((vma->vm_flags & ~calc_vm_prot_bits(asma->prot_mask)) &
calc_vm_prot_bits(PROT_MASK))) {
@ -330,6 +335,7 @@ static int ashmem_mmap(struct file *file, struct vm_area_struct *vma)
ret = PTR_ERR(vmfile);
goto out;
}
vmfile->f_mode |= FMODE_LSEEK;
asma->file = vmfile;
}
get_file(asma->file);

File diff suppressed because it is too large

View File

@ -0,0 +1,30 @@
/*
* Copyright (C) 2008 Google, Inc.
*
* Based on, but no longer compatible with, the original
* OpenBinder.org binder driver interface, which is:
*
* Copyright (c) 2005 Palmsource, Inc.
*
* This software is licensed under the terms of the GNU General Public
* License version 2, as published by the Free Software Foundation, and
* may be copied, distributed, and modified under those terms.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
*/
#ifndef _LINUX_BINDER_H
#define _LINUX_BINDER_H
#ifdef CONFIG_ANDROID_BINDER_IPC_32BIT
#define BINDER_IPC_32BIT 1
#endif
#include "uapi/binder.h"
#endif /* _LINUX_BINDER_H */

File diff suppressed because it is too large

View File

@ -0,0 +1,186 @@
/*
* Copyright (C) 2017 Google, Inc.
*
* This software is licensed under the terms of the GNU General Public
* License version 2, as published by the Free Software Foundation, and
* may be copied, distributed, and modified under those terms.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
*/
#ifndef _LINUX_BINDER_ALLOC_H
#define _LINUX_BINDER_ALLOC_H
#include <linux/rbtree.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/rtmutex.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/list_lru.h>
extern struct list_lru binder_alloc_lru;
struct binder_transaction;
/**
* struct binder_buffer - buffer used for binder transactions
* @entry: entry alloc->buffers
* @rb_node: node for allocated_buffers/free_buffers rb trees
* @free: true if buffer is free
* @allow_user_free: true if the buffer may be freed by userspace
* @async_transaction: true if the buffer is in use for an async transaction
* @debug_id: unique ID for debugging
* @transaction: pointer to the associated struct binder_transaction
* @target_node: struct binder_node associated with this buffer
* @data_size: size of @transaction data
* @offsets_size: size of the array of object offsets in the buffer
* @extra_buffers_size: size of space for other objects (like sg lists)
* @data: pointer to the start of the buffer's data
*
* Bookkeeping structure for binder transaction buffers
*/
struct binder_buffer {
struct list_head entry; /* free and allocated entries by address */
struct rb_node rb_node; /* free entry by size or allocated entry */
/* by address */
unsigned free:1;
unsigned allow_user_free:1;
unsigned async_transaction:1;
unsigned debug_id:29;
struct binder_transaction *transaction;
struct binder_node *target_node;
size_t data_size;
size_t offsets_size;
size_t extra_buffers_size;
void *data;
};
/**
* struct binder_lru_page - page object used for binder shrinker
* @page_ptr: pointer to physical page in mmap'd space
* @lru: entry in binder_alloc_lru
* @alloc: binder_alloc for a proc
*/
struct binder_lru_page {
struct list_head lru;
struct page *page_ptr;
struct binder_alloc *alloc;
};
/**
* struct binder_alloc - per-binder proc state for binder allocator
* @vma: vm_area_struct passed to mmap_handler
* (invariant after mmap)
* @tsk: tid for task that called init for this proc
* (invariant after init)
* @vma_vm_mm: copy of vma->vm_mm (invariant after mmap)
* @buffer: base of per-proc address space mapped via mmap
* @user_buffer_offset: offset between user and kernel VAs for buffer
* @buffers: list of all buffers for this proc
* @free_buffers: rb tree of buffers available for allocation
* sorted by size
* @allocated_buffers: rb tree of allocated buffers sorted by address
* @free_async_space: VA space available for async buffers. This is
* initialized at mmap time to 1/2 the full VA space
* @pages: array of binder_lru_page
* @buffer_size: size of address space specified via mmap
* @pid: pid for associated binder_proc (invariant after init)
* @pages_high: high watermark of offset in @pages
*
* Bookkeeping structure for per-proc address space management for binder
* buffers. It is normally initialized during binder_init() and binder_mmap()
* calls. The address space is used for both user-visible buffers and for
* struct binder_buffer objects used to track the user buffers
*/
struct binder_alloc {
struct mutex mutex;
struct vm_area_struct *vma;
struct mm_struct *vma_vm_mm;
void *buffer;
ptrdiff_t user_buffer_offset;
struct list_head buffers;
struct rb_root free_buffers;
struct rb_root allocated_buffers;
size_t free_async_space;
struct binder_lru_page *pages;
size_t buffer_size;
uint32_t buffer_free;
int pid;
size_t pages_high;
};
#ifdef CONFIG_ANDROID_BINDER_IPC_SELFTEST
void binder_selftest_alloc(struct binder_alloc *alloc);
#else
static inline void binder_selftest_alloc(struct binder_alloc *alloc) {}
#endif
enum lru_status binder_alloc_free_page(struct list_head *item,
spinlock_t *lock, void *cb_arg);
extern struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc,
size_t data_size,
size_t offsets_size,
size_t extra_buffers_size,
int is_async);
extern void binder_alloc_init(struct binder_alloc *alloc);
void binder_alloc_shrinker_init(void);
extern void binder_alloc_vma_close(struct binder_alloc *alloc);
extern struct binder_buffer *
binder_alloc_prepare_to_free(struct binder_alloc *alloc,
uintptr_t user_ptr);
extern void binder_alloc_free_buf(struct binder_alloc *alloc,
struct binder_buffer *buffer);
extern int binder_alloc_mmap_handler(struct binder_alloc *alloc,
struct vm_area_struct *vma);
extern void binder_alloc_deferred_release(struct binder_alloc *alloc);
extern int binder_alloc_get_allocated_count(struct binder_alloc *alloc);
extern void binder_alloc_print_allocated(struct seq_file *m,
struct binder_alloc *alloc);
void binder_alloc_print_pages(struct seq_file *m,
struct binder_alloc *alloc);
/**
* binder_alloc_get_free_async_space() - get free space available for async
* @alloc: binder_alloc for this proc
*
* Return: the bytes remaining in the address-space for async transactions
*/
static inline size_t
binder_alloc_get_free_async_space(struct binder_alloc *alloc)
{
size_t free_async_space;
mutex_lock(&alloc->mutex);
free_async_space = alloc->free_async_space;
mutex_unlock(&alloc->mutex);
return free_async_space;
}
/**
* binder_alloc_get_user_buffer_offset() - get offset between kernel/user addrs
* @alloc: binder_alloc for this proc
*
* Return: the offset between kernel and user-space addresses to use for
* virtual address conversion
*/
static inline ptrdiff_t
binder_alloc_get_user_buffer_offset(struct binder_alloc *alloc)
{
/*
* user_buffer_offset is constant if vma is set and
* undefined if vma is not set. It is possible to
* get here with !alloc->vma if the target process
* is dying while a transaction is being initiated.
* Returning the old value is ok in this case and
* the transaction will fail.
*/
return alloc->user_buffer_offset;
}
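/*
 * A hedged usage sketch (the caller code below is illustrative, not taken
 * from the driver): the offset translates an address inside the mmap'd
 * kernel buffer space into the matching userspace address, e.g.
 *
 *	struct binder_buffer *buf;
 *	void __user *uptr;
 *
 *	buf = binder_alloc_new_buf(alloc, size, 0, 0, 0);
 *	uptr = (void __user *)((uintptr_t)buf->data +
 *			       binder_alloc_get_user_buffer_offset(alloc));
 */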
#endif /* _LINUX_BINDER_ALLOC_H */

View File

@ -0,0 +1,310 @@
/* binder_alloc_selftest.c
*
* Android IPC Subsystem
*
* Copyright (C) 2017 Google, Inc.
*
* This software is licensed under the terms of the GNU General Public
* License version 2, as published by the Free Software Foundation, and
* may be copied, distributed, and modified under those terms.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/mm_types.h>
#include <linux/err.h>
#include "binder_alloc.h"
#define BUFFER_NUM 5
#define BUFFER_MIN_SIZE (PAGE_SIZE / 8)
static bool binder_selftest_run = true;
static int binder_selftest_failures;
static DEFINE_MUTEX(binder_selftest_lock);
/**
* enum buf_end_align_type - Page alignment of a buffer
* end with regard to the end of the previous buffer.
*
* In the pictures below, buf2 refers to the buffer we
* are aligning. buf1 refers to previous buffer by addr.
* Symbol [ means the start of a buffer, ] means the end
* of a buffer, and | means page boundaries.
*/
enum buf_end_align_type {
/**
* @SAME_PAGE_UNALIGNED: The end of this buffer is on
* the same page as the end of the previous buffer and
* is not page aligned. Examples:
* buf1 ][ buf2 ][ ...
* buf1 ]|[ buf2 ][ ...
*/
SAME_PAGE_UNALIGNED = 0,
/**
* @SAME_PAGE_ALIGNED: When the end of the previous buffer
* is not page aligned, the end of this buffer is on the
* same page as the end of the previous buffer and is page
* aligned. When the previous buffer is page aligned, the
* end of this buffer is aligned to the next page boundary.
* Examples:
* buf1 ][ buf2 ]| ...
* buf1 ]|[ buf2 ]| ...
*/
SAME_PAGE_ALIGNED,
/**
* @NEXT_PAGE_UNALIGNED: The end of this buffer is on
* the page next to the end of the previous buffer and
* is not page aligned. Examples:
* buf1 ][ buf2 | buf2 ][ ...
* buf1 ]|[ buf2 | buf2 ][ ...
*/
NEXT_PAGE_UNALIGNED,
/**
* @NEXT_PAGE_ALIGNED: The end of this buffer is on
* the page next to the end of the previous buffer and
* is page aligned. Examples:
* buf1 ][ buf2 | buf2 ]| ...
* buf1 ]|[ buf2 | buf2 ]| ...
*/
NEXT_PAGE_ALIGNED,
/**
* @NEXT_NEXT_UNALIGNED: The end of this buffer is on
* the page that follows the page after the end of the
* previous buffer and is not page aligned. Examples:
* buf1 ][ buf2 | buf2 | buf2 ][ ...
* buf1 ]|[ buf2 | buf2 | buf2 ][ ...
*/
NEXT_NEXT_UNALIGNED,
LOOP_END,
};
static void pr_err_size_seq(size_t *sizes, int *seq)
{
int i;
pr_err("alloc sizes: ");
for (i = 0; i < BUFFER_NUM; i++)
pr_cont("[%zu]", sizes[i]);
pr_cont("\n");
pr_err("free seq: ");
for (i = 0; i < BUFFER_NUM; i++)
pr_cont("[%d]", seq[i]);
pr_cont("\n");
}
static bool check_buffer_pages_allocated(struct binder_alloc *alloc,
struct binder_buffer *buffer,
size_t size)
{
void *page_addr, *end;
int page_index;
end = (void *)PAGE_ALIGN((uintptr_t)buffer->data + size);
page_addr = buffer->data;
for (; page_addr < end; page_addr += PAGE_SIZE) {
page_index = (page_addr - alloc->buffer) / PAGE_SIZE;
if (!alloc->pages[page_index].page_ptr ||
!list_empty(&alloc->pages[page_index].lru)) {
pr_err("expect alloc but is %s at page index %d\n",
alloc->pages[page_index].page_ptr ?
"lru" : "free", page_index);
return false;
}
}
return true;
}
static void binder_selftest_alloc_buf(struct binder_alloc *alloc,
struct binder_buffer *buffers[],
size_t *sizes, int *seq)
{
int i;
for (i = 0; i < BUFFER_NUM; i++) {
buffers[i] = binder_alloc_new_buf(alloc, sizes[i], 0, 0, 0);
if (IS_ERR(buffers[i]) ||
!check_buffer_pages_allocated(alloc, buffers[i],
sizes[i])) {
pr_err_size_seq(sizes, seq);
binder_selftest_failures++;
}
}
}
static void binder_selftest_free_buf(struct binder_alloc *alloc,
struct binder_buffer *buffers[],
size_t *sizes, int *seq, size_t end)
{
int i;
for (i = 0; i < BUFFER_NUM; i++)
binder_alloc_free_buf(alloc, buffers[seq[i]]);
for (i = 0; i < end / PAGE_SIZE; i++) {
/**
* Error message on a free page can be false positive
* if binder shrinker ran during binder_alloc_free_buf
* calls above.
*/
if (list_empty(&alloc->pages[i].lru)) {
pr_err_size_seq(sizes, seq);
pr_err("expect lru but is %s at page index %d\n",
alloc->pages[i].page_ptr ? "alloc" : "free", i);
binder_selftest_failures++;
}
}
}
static void binder_selftest_free_page(struct binder_alloc *alloc)
{
int i;
unsigned long count;
while ((count = list_lru_count(&binder_alloc_lru))) {
list_lru_walk(&binder_alloc_lru, binder_alloc_free_page,
NULL, count);
}
for (i = 0; i < (alloc->buffer_size / PAGE_SIZE); i++) {
if (alloc->pages[i].page_ptr) {
pr_err("expect free but is %s at page index %d\n",
list_empty(&alloc->pages[i].lru) ?
"alloc" : "lru", i);
binder_selftest_failures++;
}
}
}
static void binder_selftest_alloc_free(struct binder_alloc *alloc,
size_t *sizes, int *seq, size_t end)
{
struct binder_buffer *buffers[BUFFER_NUM];
binder_selftest_alloc_buf(alloc, buffers, sizes, seq);
binder_selftest_free_buf(alloc, buffers, sizes, seq, end);
/* Allocate from lru. */
binder_selftest_alloc_buf(alloc, buffers, sizes, seq);
if (list_lru_count(&binder_alloc_lru))
pr_err("lru list should be empty but is not\n");
binder_selftest_free_buf(alloc, buffers, sizes, seq, end);
binder_selftest_free_page(alloc);
}
static bool is_dup(int *seq, int index, int val)
{
int i;
for (i = 0; i < index; i++) {
if (seq[i] == val)
return true;
}
return false;
}
/* Generate BUFFER_NUM factorial free orders. */
static void binder_selftest_free_seq(struct binder_alloc *alloc,
size_t *sizes, int *seq,
int index, size_t end)
{
int i;
if (index == BUFFER_NUM) {
binder_selftest_alloc_free(alloc, sizes, seq, end);
return;
}
for (i = 0; i < BUFFER_NUM; i++) {
if (is_dup(seq, index, i))
continue;
seq[index] = i;
binder_selftest_free_seq(alloc, sizes, seq, index + 1, end);
}
}
static void binder_selftest_alloc_size(struct binder_alloc *alloc,
size_t *end_offset)
{
int i;
int seq[BUFFER_NUM] = {0};
size_t front_sizes[BUFFER_NUM];
size_t back_sizes[BUFFER_NUM];
size_t last_offset, offset = 0;
for (i = 0; i < BUFFER_NUM; i++) {
last_offset = offset;
offset = end_offset[i];
front_sizes[i] = offset - last_offset;
back_sizes[BUFFER_NUM - i - 1] = front_sizes[i];
}
/*
* Buffers share the first or last few pages.
* Only BUFFER_NUM - 1 buffer sizes are adjustable since
* we need one giant buffer before getting to the last page.
*/
back_sizes[0] += alloc->buffer_size - end_offset[BUFFER_NUM - 1];
binder_selftest_free_seq(alloc, front_sizes, seq, 0,
end_offset[BUFFER_NUM - 1]);
binder_selftest_free_seq(alloc, back_sizes, seq, 0, alloc->buffer_size);
}
static void binder_selftest_alloc_offset(struct binder_alloc *alloc,
size_t *end_offset, int index)
{
int align;
size_t end, prev;
if (index == BUFFER_NUM) {
binder_selftest_alloc_size(alloc, end_offset);
return;
}
prev = index == 0 ? 0 : end_offset[index - 1];
end = prev;
BUILD_BUG_ON(BUFFER_MIN_SIZE * BUFFER_NUM >= PAGE_SIZE);
for (align = SAME_PAGE_UNALIGNED; align < LOOP_END; align++) {
if (align % 2)
end = ALIGN(end, PAGE_SIZE);
else
end += BUFFER_MIN_SIZE;
end_offset[index] = end;
binder_selftest_alloc_offset(alloc, end_offset, index + 1);
}
}
/**
* binder_selftest_alloc() - Test alloc and free of buffer pages.
* @alloc: Pointer to alloc struct.
*
* Allocate BUFFER_NUM buffers to cover all page alignment cases,
* then free them in all orders possible. Check that pages are
* correctly allocated, put onto lru when buffers are freed, and
* are freed when binder_alloc_free_page is called.
*/
void binder_selftest_alloc(struct binder_alloc *alloc)
{
size_t end_offset[BUFFER_NUM];
if (!binder_selftest_run)
return;
mutex_lock(&binder_selftest_lock);
if (!binder_selftest_run || !alloc->vma)
goto done;
pr_info("STARTED\n");
binder_selftest_alloc_offset(alloc, end_offset, 0);
binder_selftest_run = false;
if (binder_selftest_failures > 0)
pr_info("%d tests FAILED\n", binder_selftest_failures);
else
pr_info("PASSED\n");
done:
mutex_unlock(&binder_selftest_lock);
}

View File

@ -23,7 +23,8 @@
struct binder_buffer;
struct binder_node;
struct binder_proc;
struct binder_ref;
struct binder_alloc;
struct binder_ref_data;
struct binder_thread;
struct binder_transaction;
@ -84,6 +85,30 @@ DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_ioctl_done);
DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_write_done);
DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_read_done);
TRACE_EVENT(binder_set_priority,
TP_PROTO(int proc, int thread, unsigned int old_prio,
unsigned int desired_prio, unsigned int new_prio),
TP_ARGS(proc, thread, old_prio, new_prio, desired_prio),
TP_STRUCT__entry(
__field(int, proc)
__field(int, thread)
__field(unsigned int, old_prio)
__field(unsigned int, new_prio)
__field(unsigned int, desired_prio)
),
TP_fast_assign(
__entry->proc = proc;
__entry->thread = thread;
__entry->old_prio = old_prio;
__entry->new_prio = new_prio;
__entry->desired_prio = desired_prio;
),
TP_printk("proc=%d thread=%d old=%d => new=%d desired=%d",
__entry->proc, __entry->thread, __entry->old_prio,
__entry->new_prio, __entry->desired_prio)
);
TRACE_EVENT(binder_wait_for_work,
TP_PROTO(bool proc_work, bool transaction_stack, bool thread_todo),
TP_ARGS(proc_work, transaction_stack, thread_todo),
@ -146,8 +171,8 @@ TRACE_EVENT(binder_transaction_received,
TRACE_EVENT(binder_transaction_node_to_ref,
TP_PROTO(struct binder_transaction *t, struct binder_node *node,
struct binder_ref *ref),
TP_ARGS(t, node, ref),
struct binder_ref_data *rdata),
TP_ARGS(t, node, rdata),
TP_STRUCT__entry(
__field(int, debug_id)
@ -160,8 +185,8 @@ TRACE_EVENT(binder_transaction_node_to_ref,
__entry->debug_id = t->debug_id;
__entry->node_debug_id = node->debug_id;
__entry->node_ptr = node->ptr;
__entry->ref_debug_id = ref->debug_id;
__entry->ref_desc = ref->desc;
__entry->ref_debug_id = rdata->debug_id;
__entry->ref_desc = rdata->desc;
),
TP_printk("transaction=%d node=%d src_ptr=0x%016llx ==> dest_ref=%d dest_desc=%d",
__entry->debug_id, __entry->node_debug_id,
@ -170,8 +195,9 @@ TRACE_EVENT(binder_transaction_node_to_ref,
);
TRACE_EVENT(binder_transaction_ref_to_node,
TP_PROTO(struct binder_transaction *t, struct binder_ref *ref),
TP_ARGS(t, ref),
TP_PROTO(struct binder_transaction *t, struct binder_node *node,
struct binder_ref_data *rdata),
TP_ARGS(t, node, rdata),
TP_STRUCT__entry(
__field(int, debug_id)
@ -182,10 +208,10 @@ TRACE_EVENT(binder_transaction_ref_to_node,
),
TP_fast_assign(
__entry->debug_id = t->debug_id;
__entry->ref_debug_id = ref->debug_id;
__entry->ref_desc = ref->desc;
__entry->node_debug_id = ref->node->debug_id;
__entry->node_ptr = ref->node->ptr;
__entry->ref_debug_id = rdata->debug_id;
__entry->ref_desc = rdata->desc;
__entry->node_debug_id = node->debug_id;
__entry->node_ptr = node->ptr;
),
TP_printk("transaction=%d node=%d src_ref=%d src_desc=%d ==> dest_ptr=0x%016llx",
__entry->debug_id, __entry->node_debug_id,
@ -194,9 +220,10 @@ TRACE_EVENT(binder_transaction_ref_to_node,
);
TRACE_EVENT(binder_transaction_ref_to_ref,
TP_PROTO(struct binder_transaction *t, struct binder_ref *src_ref,
struct binder_ref *dest_ref),
TP_ARGS(t, src_ref, dest_ref),
TP_PROTO(struct binder_transaction *t, struct binder_node *node,
struct binder_ref_data *src_ref,
struct binder_ref_data *dest_ref),
TP_ARGS(t, node, src_ref, dest_ref),
TP_STRUCT__entry(
__field(int, debug_id)
@ -208,7 +235,7 @@ TRACE_EVENT(binder_transaction_ref_to_ref,
),
TP_fast_assign(
__entry->debug_id = t->debug_id;
__entry->node_debug_id = src_ref->node->debug_id;
__entry->node_debug_id = node->debug_id;
__entry->src_ref_debug_id = src_ref->debug_id;
__entry->src_ref_desc = src_ref->desc;
__entry->dest_ref_debug_id = dest_ref->debug_id;
@ -268,9 +295,9 @@ DEFINE_EVENT(binder_buffer_class, binder_transaction_failed_buffer_release,
TP_ARGS(buffer));
TRACE_EVENT(binder_update_page_range,
TP_PROTO(struct binder_proc *proc, bool allocate,
TP_PROTO(struct binder_alloc *alloc, bool allocate,
void *start, void *end),
TP_ARGS(proc, allocate, start, end),
TP_ARGS(alloc, allocate, start, end),
TP_STRUCT__entry(
__field(int, proc)
__field(bool, allocate)
@ -278,9 +305,9 @@ TRACE_EVENT(binder_update_page_range,
__field(size_t, size)
),
TP_fast_assign(
__entry->proc = proc->pid;
__entry->proc = alloc->pid;
__entry->allocate = allocate;
__entry->offset = start - proc->buffer;
__entry->offset = start - alloc->buffer;
__entry->size = end - start;
),
TP_printk("proc=%d allocate=%d offset=%zu size=%zu",
@ -288,6 +315,61 @@ TRACE_EVENT(binder_update_page_range,
__entry->offset, __entry->size)
);
DECLARE_EVENT_CLASS(binder_lru_page_class,
TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
TP_ARGS(alloc, page_index),
TP_STRUCT__entry(
__field(int, proc)
__field(size_t, page_index)
),
TP_fast_assign(
__entry->proc = alloc->pid;
__entry->page_index = page_index;
),
TP_printk("proc=%d page_index=%zu",
__entry->proc, __entry->page_index)
);
DEFINE_EVENT(binder_lru_page_class, binder_alloc_lru_start,
TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
TP_ARGS(alloc, page_index));
DEFINE_EVENT(binder_lru_page_class, binder_alloc_lru_end,
TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
TP_ARGS(alloc, page_index));
DEFINE_EVENT(binder_lru_page_class, binder_free_lru_start,
TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
TP_ARGS(alloc, page_index));
DEFINE_EVENT(binder_lru_page_class, binder_free_lru_end,
TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
TP_ARGS(alloc, page_index));
DEFINE_EVENT(binder_lru_page_class, binder_alloc_page_start,
TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
TP_ARGS(alloc, page_index));
DEFINE_EVENT(binder_lru_page_class, binder_alloc_page_end,
TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
TP_ARGS(alloc, page_index));
DEFINE_EVENT(binder_lru_page_class, binder_unmap_user_start,
TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
TP_ARGS(alloc, page_index));
DEFINE_EVENT(binder_lru_page_class, binder_unmap_user_end,
TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
TP_ARGS(alloc, page_index));
DEFINE_EVENT(binder_lru_page_class, binder_unmap_kernel_start,
TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
TP_ARGS(alloc, page_index));
DEFINE_EVENT(binder_lru_page_class, binder_unmap_kernel_end,
TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
TP_ARGS(alloc, page_index));
TRACE_EVENT(binder_command,
TP_PROTO(uint32_t cmd),
TP_ARGS(cmd),

View File

@ -13,7 +13,9 @@
* drops below 4096 pages and kill processes with a oom_score_adj value of 0 or
* higher when the free memory drops below 1024 pages.
*
* The driver considers memory used for caches to be free.
* The driver considers memory used for caches to be free, but if a large
* percentage of the cached memory is locked this can be very inaccurate
* and processes may not get killed until the normal oom killer is triggered.
*
* Copyright (C) 2007-2008 Google, Inc.
*
@ -27,11 +29,6 @@
* GNU General Public License for more details.
*
*/
/*
* NOTE: This file has been modified by Sony Mobile Communications Inc.
* Modifications are Copyright (c) 2015 Sony Mobile Communications Inc,
* and licensed under the license of the file.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@ -40,27 +37,9 @@
#include <linux/mm.h>
#include <linux/oom.h>
#include <linux/sched.h>
#include <linux/swap.h>
#include <linux/rcupdate.h>
#include <linux/notifier.h>
#include <linux/mutex.h>
#include <linux/delay.h>
#include <linux/swap.h>
#include <linux/fs.h>
#include <linux/cpuset.h>
#include <linux/show_mem_notifier.h>
#include <linux/vmpressure.h>
#define CREATE_TRACE_POINTS
#include <trace/events/almk.h>
#define CREATE_TRACE_POINTS
#include <trace/events/lmk.h>
#ifdef CONFIG_HIGHMEM
#define _ZONE ZONE_HIGHMEM
#else
#define _ZONE ZONE_NORMAL
#endif
#define CREATE_TRACE_POINTS
#include "trace/lowmemorykiller.h"
@ -80,7 +59,6 @@ static int lowmem_minfree[6] = {
16 * 1024, /* 64MB */
};
static int lowmem_minfree_size = 4;
static int lmk_fast_run = 1;
static unsigned long lowmem_deathpending_timeout;
@ -90,380 +68,32 @@ static unsigned long lowmem_deathpending_timeout;
pr_info(x); \
} while (0)
static atomic_t shift_adj = ATOMIC_INIT(0);
static short adj_max_shift = 353;
/* User knob to enable/disable adaptive lmk feature */
static int enable_adaptive_lmk;
module_param_named(enable_adaptive_lmk, enable_adaptive_lmk, int,
S_IRUGO | S_IWUSR);
/*
* This parameter controls the behaviour of LMK when vmpressure is in
* the range of 90-94. Adaptive lmk triggers based on number of file
* pages wrt vmpressure_file_min, when vmpressure is in the range of
* 90-94. Usually this is a pseudo minfree value, higher than the
* highest configured value in minfree array.
*/
static int vmpressure_file_min;
module_param_named(vmpressure_file_min, vmpressure_file_min, int,
S_IRUGO | S_IWUSR);
enum {
VMPRESSURE_NO_ADJUST = 0,
VMPRESSURE_ADJUST_ENCROACH,
VMPRESSURE_ADJUST_NORMAL,
};
int adjust_minadj(short *min_score_adj)
static unsigned long lowmem_count(struct shrinker *s,
struct shrink_control *sc)
{
int ret = VMPRESSURE_NO_ADJUST;
if (!enable_adaptive_lmk)
return 0;
if (atomic_read(&shift_adj) &&
(*min_score_adj > adj_max_shift)) {
if (*min_score_adj == OOM_SCORE_ADJ_MAX + 1)
ret = VMPRESSURE_ADJUST_ENCROACH;
else
ret = VMPRESSURE_ADJUST_NORMAL;
*min_score_adj = adj_max_shift;
}
atomic_set(&shift_adj, 0);
return ret;
return global_page_state(NR_ACTIVE_ANON) +
global_page_state(NR_ACTIVE_FILE) +
global_page_state(NR_INACTIVE_ANON) +
global_page_state(NR_INACTIVE_FILE);
}
static int lmk_vmpressure_notifier(struct notifier_block *nb,
unsigned long action, void *data)
{
int other_free, other_file;
unsigned long pressure = action;
int array_size = ARRAY_SIZE(lowmem_adj);
if (!enable_adaptive_lmk)
return 0;
if (pressure >= 95) {
other_file = global_page_state(NR_FILE_PAGES) -
global_page_state(NR_SHMEM) -
total_swapcache_pages();
other_free = global_page_state(NR_FREE_PAGES);
atomic_set(&shift_adj, 1);
trace_almk_vmpressure(pressure, other_free, other_file);
} else if (pressure >= 90) {
if (lowmem_adj_size < array_size)
array_size = lowmem_adj_size;
if (lowmem_minfree_size < array_size)
array_size = lowmem_minfree_size;
other_file = global_page_state(NR_FILE_PAGES) -
global_page_state(NR_SHMEM) -
total_swapcache_pages();
other_free = global_page_state(NR_FREE_PAGES);
if ((other_free < lowmem_minfree[array_size - 1]) &&
(other_file < vmpressure_file_min)) {
atomic_set(&shift_adj, 1);
trace_almk_vmpressure(pressure, other_free,
other_file);
}
} else if (atomic_read(&shift_adj)) {
/*
* shift_adj would have been set by a previous invocation
* of notifier, which is not followed by a lowmem_shrink yet.
* Since vmpressure has improved, reset shift_adj to avoid
* false adaptive LMK trigger.
*/
trace_almk_vmpressure(pressure, other_free, other_file);
atomic_set(&shift_adj, 0);
}
return 0;
}
static struct notifier_block lmk_vmpr_nb = {
.notifier_call = lmk_vmpressure_notifier,
};
static int test_task_flag(struct task_struct *p, int flag)
{
struct task_struct *t;
for_each_thread(p, t) {
task_lock(t);
if (test_tsk_thread_flag(t, flag)) {
task_unlock(t);
return 1;
}
task_unlock(t);
}
return 0;
}
static DEFINE_MUTEX(scan_mutex);
int can_use_cma_pages(gfp_t gfp_mask)
{
int can_use = 0;
int mtype = allocflags_to_migratetype(gfp_mask);
int i = 0;
int *mtype_fallbacks = get_migratetype_fallbacks(mtype);
if (is_migrate_cma(mtype)) {
can_use = 1;
} else {
for (i = 0;; i++) {
int fallbacktype = mtype_fallbacks[i];
if (is_migrate_cma(fallbacktype)) {
can_use = 1;
break;
}
if (fallbacktype == MIGRATE_RESERVE)
break;
}
}
return can_use;
}
void tune_lmk_zone_param(struct zonelist *zonelist, int classzone_idx,
int *other_free, int *other_file,
int use_cma_pages)
{
struct zone *zone;
struct zoneref *zoneref;
int zone_idx;
for_each_zone_zonelist(zone, zoneref, zonelist, MAX_NR_ZONES) {
zone_idx = zonelist_zone_idx(zoneref);
if (zone_idx == ZONE_MOVABLE) {
if (!use_cma_pages)
*other_free -=
zone_page_state(zone, NR_FREE_CMA_PAGES);
continue;
}
if (zone_idx > classzone_idx) {
if (other_free != NULL)
*other_free -= zone_page_state(zone,
NR_FREE_PAGES);
if (other_file != NULL)
*other_file -= zone_page_state(zone,
NR_FILE_PAGES)
- zone_page_state(zone, NR_SHMEM)
- zone_page_state(zone, NR_SWAPCACHE);
} else if (zone_idx < classzone_idx) {
if (zone_watermark_ok(zone, 0, 0, classzone_idx, 0)) {
if (!use_cma_pages) {
*other_free -= min(
zone->lowmem_reserve[classzone_idx] +
zone_page_state(
zone, NR_FREE_CMA_PAGES),
zone_page_state(
zone, NR_FREE_PAGES));
} else {
*other_free -=
zone->lowmem_reserve[classzone_idx];
}
} else {
*other_free -=
zone_page_state(zone, NR_FREE_PAGES);
}
}
}
}
#ifdef CONFIG_HIGHMEM
void adjust_gfp_mask(gfp_t *gfp_mask)
{
struct zone *preferred_zone;
struct zonelist *zonelist;
enum zone_type high_zoneidx;
if (current_is_kswapd()) {
zonelist = node_zonelist(0, *gfp_mask);
high_zoneidx = gfp_zone(*gfp_mask);
first_zones_zonelist(zonelist, high_zoneidx, NULL,
&preferred_zone);
if (high_zoneidx == ZONE_NORMAL) {
if (zone_watermark_ok_safe(preferred_zone, 0,
high_wmark_pages(preferred_zone), 0,
0))
*gfp_mask |= __GFP_HIGHMEM;
} else if (high_zoneidx == ZONE_HIGHMEM) {
*gfp_mask |= __GFP_HIGHMEM;
}
}
}
#else
void adjust_gfp_mask(gfp_t *unused)
{
}
#endif
void tune_lmk_param(int *other_free, int *other_file, struct shrink_control *sc)
{
gfp_t gfp_mask;
struct zone *preferred_zone;
struct zonelist *zonelist;
enum zone_type high_zoneidx, classzone_idx;
unsigned long balance_gap;
int use_cma_pages;
gfp_mask = sc->gfp_mask;
adjust_gfp_mask(&gfp_mask);
zonelist = node_zonelist(0, gfp_mask);
high_zoneidx = gfp_zone(gfp_mask);
first_zones_zonelist(zonelist, high_zoneidx, NULL, &preferred_zone);
classzone_idx = zone_idx(preferred_zone);
use_cma_pages = can_use_cma_pages(gfp_mask);
balance_gap = min(low_wmark_pages(preferred_zone),
(preferred_zone->present_pages +
KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
KSWAPD_ZONE_BALANCE_GAP_RATIO);
if (likely(current_is_kswapd() && zone_watermark_ok(preferred_zone, 0,
high_wmark_pages(preferred_zone) + SWAP_CLUSTER_MAX +
balance_gap, 0, 0))) {
if (lmk_fast_run)
tune_lmk_zone_param(zonelist, classzone_idx, other_free,
other_file, use_cma_pages);
else
tune_lmk_zone_param(zonelist, classzone_idx, other_free,
NULL, use_cma_pages);
if (zone_watermark_ok(preferred_zone, 0, 0, _ZONE, 0)) {
if (!use_cma_pages) {
*other_free -= min(
preferred_zone->lowmem_reserve[_ZONE]
+ zone_page_state(
preferred_zone, NR_FREE_CMA_PAGES),
zone_page_state(
preferred_zone, NR_FREE_PAGES));
} else {
*other_free -=
preferred_zone->lowmem_reserve[_ZONE];
}
} else {
*other_free -= zone_page_state(preferred_zone,
NR_FREE_PAGES);
}
lowmem_print(4, "lowmem_shrink of kswapd tunning for highmem "
"ofree %d, %d\n", *other_free, *other_file);
} else {
tune_lmk_zone_param(zonelist, classzone_idx, other_free,
other_file, use_cma_pages);
if (!use_cma_pages) {
*other_free -=
zone_page_state(preferred_zone, NR_FREE_CMA_PAGES);
}
lowmem_print(4, "lowmem_shrink tunning for others ofree %d, "
"%d\n", *other_free, *other_file);
}
#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER_CONSIDER_SWAP
if (zone_watermark_ok(preferred_zone, 0,
low_wmark_pages(preferred_zone), 0, 0)) {
struct sysinfo si;
si_swapinfo(&si);
*other_free += si.freeswap;
#ifdef CONFIG_ZRAM
/* If swap is actually residing in RAM (e. g. the swap device
* is a ZRAM device), we need to subtract the amount of RAM
* that will be occupied by compressed data. To play on the
* safe side, it's better to subtract too much than too few,
* otherwise LMK may not be triggered when it has to be. ZRAM
* compression ratio is at least 2, so we subtract half of the
* reported freeswap.
*/
*other_free -= si.freeswap >> 1;
#endif
lowmem_print(4, "lowmem_shrink tunning for swap "
"ofree %d, %d\n", *other_free, *other_file);
}
#endif
}
#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER_CONSIDER_SWAP
static void lowmem_wakeup_kswapd(struct shrink_control *sc, int minfree)
{
gfp_t gfp_mask;
struct zone *preferred_zone;
struct zonelist *zonelist;
enum zone_type high_zoneidx, classzone_idx;
int order = 0;
if (likely(current_is_kswapd()))
return;
gfp_mask = sc->gfp_mask;
adjust_gfp_mask(&gfp_mask);
zonelist = node_zonelist(0, gfp_mask);
high_zoneidx = gfp_zone(gfp_mask);
first_zones_zonelist(zonelist, high_zoneidx, NULL, &preferred_zone);
classzone_idx = zone_idx(preferred_zone);
for (minfree >>= 13; order < 7; order++) {
if (minfree <= (1 << order))
break;
}
lowmem_print(4, "lowmem_wakeup_kswapd order %d\n", order);
wakeup_kswapd(preferred_zone, order, classzone_idx);
}
#endif
static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)
{
struct task_struct *tsk;
struct task_struct *selected = NULL;
int rem = 0;
unsigned long rem = 0;
int tasksize;
int i;
int ret = 0;
short min_score_adj = OOM_SCORE_ADJ_MAX + 1;
int minfree = 0;
#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER_CONSIDER_SWAP
int max_minfree = 0;
#endif
int selected_tasksize = 0;
short selected_oom_score_adj;
int array_size = ARRAY_SIZE(lowmem_adj);
int other_free;
int other_file;
unsigned long nr_to_scan = sc->nr_to_scan;
if (nr_to_scan > 0) {
if (mutex_lock_interruptible(&scan_mutex) < 0) {
trace_lmk_remain_scan(0, nr_to_scan, sc->gfp_mask);
return 0;
};
}
other_free = global_page_state(NR_FREE_PAGES);
if (global_page_state(NR_SHMEM) + global_page_state(NR_MLOCK_FILE) +
total_swapcache_pages() < global_page_state(NR_FILE_PAGES))
other_file = global_page_state(NR_FILE_PAGES) -
int other_free = global_page_state(NR_FREE_PAGES) - totalreserve_pages;
int other_file = global_page_state(NR_FILE_PAGES) -
global_page_state(NR_SHMEM) -
global_page_state(NR_MLOCK_FILE) -
global_page_state(NR_UNEVICTABLE) -
total_swapcache_pages();
else
other_file = 0;
tune_lmk_param(&other_free, &other_file, sc);
if (lowmem_adj_size < array_size)
array_size = lowmem_adj_size;
@ -471,43 +101,22 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
array_size = lowmem_minfree_size;
for (i = 0; i < array_size; i++) {
minfree = lowmem_minfree[i];
#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER_CONSIDER_SWAP
if (max_minfree < minfree)
max_minfree = minfree;
#endif
if (other_free < minfree && other_file < minfree) {
min_score_adj = lowmem_adj[i];
break;
}
}
if (nr_to_scan > 0) {
ret = adjust_minadj(&min_score_adj);
lowmem_print(3, "lowmem_shrink %lu, %x, ofree %d %d, ma %hd\n",
nr_to_scan, sc->gfp_mask, other_free,
other_file, min_score_adj);
lowmem_print(3, "lowmem_scan %lu, %x, ofree %d %d, ma %hd\n",
sc->nr_to_scan, sc->gfp_mask, other_free,
other_file, min_score_adj);
if (min_score_adj == OOM_SCORE_ADJ_MAX + 1) {
lowmem_print(5, "lowmem_scan %lu, %x, return 0\n",
sc->nr_to_scan, sc->gfp_mask);
return 0;
}
rem = global_page_state(NR_ACTIVE_ANON) +
global_page_state(NR_ACTIVE_FILE) +
global_page_state(NR_INACTIVE_ANON) +
global_page_state(NR_INACTIVE_FILE);
if (nr_to_scan <= 0 || min_score_adj == OOM_SCORE_ADJ_MAX + 1) {
lowmem_print(5, "lowmem_shrink %lu, %x, return %d\n",
nr_to_scan, sc->gfp_mask, rem);
if (nr_to_scan > 0)
mutex_unlock(&scan_mutex);
if ((min_score_adj == OOM_SCORE_ADJ_MAX + 1) &&
(nr_to_scan > 0))
trace_almk_shrink(0, ret, other_free, other_file, 0);
trace_lmk_remain_scan(rem, nr_to_scan, sc->gfp_mask);
#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER_CONSIDER_SWAP
lowmem_wakeup_kswapd(sc, max_minfree);
#endif
return rem;
}
selected_oom_score_adj = min_score_adj;
rcu_read_lock();
@ -518,30 +127,16 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
if (tsk->flags & PF_KTHREAD)
continue;
/* if task no longer has any memory ignore it */
if (test_task_flag(tsk, TIF_MM_RELEASED))
continue;
/* Ignore task if coredump in progress */
if (tsk->mm && tsk->mm->core_state)
continue;
if (time_before_eq(jiffies, lowmem_deathpending_timeout)) {
if (test_task_flag(tsk, TIF_MEMDIE)) {
rcu_read_unlock();
/* give the system time to free up the memory */
msleep_interruptible(20);
mutex_unlock(&scan_mutex);
trace_lmk_remain_scan(rem, nr_to_scan,
sc->gfp_mask);
return 0;
}
}
p = find_lock_task_mm(tsk);
if (!p)
continue;
if (test_tsk_thread_flag(p, TIF_MEMDIE) &&
time_before_eq(jiffies, lowmem_deathpending_timeout)) {
task_unlock(p);
rcu_read_unlock();
return 0;
}
oom_score_adj = p->signal->oom_score_adj;
if (oom_score_adj < min_score_adj) {
task_unlock(p);
@ -561,7 +156,7 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
selected = p;
selected_tasksize = tasksize;
selected_oom_score_adj = oom_score_adj;
lowmem_print(3, "select '%s' (%d), adj %hd, size %d, to kill\n",
lowmem_print(2, "select '%s' (%d), adj %hd, size %d, to kill\n",
p->comm, p->pid, oom_score_adj, tasksize);
}
if (selected) {
@ -572,83 +167,35 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
lowmem_print(1, "Killing '%s' (%d), adj %hd,\n" \
" to free %ldkB on behalf of '%s' (%d) because\n" \
" cache %ldkB is below limit %ldkB for oom_score_adj %hd\n" \
" Free memory is %ldkB above reserved.\n" \
" Free CMA is %ldkB\n" \
" Total reserve is %ldkB\n" \
" Total free pages is %ldkB\n" \
" Total file cache is %ldkB\n" \
" Slab Reclaimable is %ldkB\n" \
" Slab UnReclaimable is %ldkB\n" \
" Total Slab is %ldkB\n" \
" GFP mask is 0x%x\n",
" Free memory is %ldkB above reserved\n",
selected->comm, selected->pid,
selected_oom_score_adj,
selected_tasksize * (long)(PAGE_SIZE / 1024),
current->comm, current->pid,
cache_size, cache_limit,
min_score_adj,
free ,
global_page_state(NR_FREE_CMA_PAGES) *
(long)(PAGE_SIZE / 1024),
totalreserve_pages * (long)(PAGE_SIZE / 1024),
global_page_state(NR_FREE_PAGES) *
(long)(PAGE_SIZE / 1024),
global_page_state(NR_FILE_PAGES) *
(long)(PAGE_SIZE / 1024),
global_page_state(NR_SLAB_RECLAIMABLE) *
(long)(PAGE_SIZE / 1024),
global_page_state(NR_SLAB_UNRECLAIMABLE) *
(long)(PAGE_SIZE / 1024),
global_page_state(NR_SLAB_RECLAIMABLE) *
(long)(PAGE_SIZE / 1024) +
global_page_state(NR_SLAB_UNRECLAIMABLE) *
(long)(PAGE_SIZE / 1024),
sc->gfp_mask);
if (lowmem_debug_level >= 2 && selected_oom_score_adj == 0) {
show_mem(SHOW_MEM_FILTER_NODES);
dump_tasks(NULL, NULL);
show_mem_call_notifiers();
}
free);
lowmem_deathpending_timeout = jiffies + HZ;
/*
* FIXME: lowmemorykiller shouldn't abuse global OOM killer
* infrastructure. There is no real reason why the selected
* task should have access to the memory reserves.
*/
mark_tsk_oom_victim(selected);
set_tsk_thread_flag(selected, TIF_MEMDIE);
send_sig(SIGKILL, selected, 0);
rem -= selected_tasksize;
rcu_read_unlock();
trace_lmk_sigkill(selected->pid, selected->comm,
selected_oom_score_adj, selected_tasksize,
sc->gfp_mask);
/* give the system time to free up the memory */
msleep_interruptible(20);
trace_almk_shrink(selected_tasksize, ret,
other_free, other_file, selected_oom_score_adj);
} else {
trace_almk_shrink(1, ret, other_free, other_file, 0);
rcu_read_unlock();
rem += selected_tasksize;
}
lowmem_print(4, "lowmem_shrink %lu, %x, return %d\n",
nr_to_scan, sc->gfp_mask, rem);
mutex_unlock(&scan_mutex);
trace_lmk_remain_scan(rem, nr_to_scan, sc->gfp_mask);
lowmem_print(4, "lowmem_scan %lu, %x, return %lu\n",
sc->nr_to_scan, sc->gfp_mask, rem);
rcu_read_unlock();
return rem;
}
static struct shrinker lowmem_shrinker = {
.shrink = lowmem_shrink,
.scan_objects = lowmem_scan,
.count_objects = lowmem_count,
.seeks = DEFAULT_SEEKS * 16
};
static int __init lowmem_init(void)
{
register_shrinker(&lowmem_shrinker);
vmpressure_notifier_register(&lmk_vmpr_nb);
return 0;
}
@ -736,10 +283,8 @@ static const struct kparam_array __param_arr_adj = {
module_param_named(cost, lowmem_shrinker.seeks, int, S_IRUGO | S_IWUSR);
#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER_AUTODETECT_OOM_ADJ_VALUES
__module_param_call(MODULE_PARAM_PREFIX, adj,
&lowmem_adj_array_ops,
.arr = &__param_arr_adj,
S_IRUGO | S_IWUSR, -1);
module_param_cb(adj, &lowmem_adj_array_ops,
.arr = &__param_arr_adj, S_IRUGO | S_IWUSR);
__MODULE_PARM_TYPE(adj, "array of short");
#else
module_param_array_named(adj, lowmem_adj, short, &lowmem_adj_size,
@ -748,7 +293,6 @@ module_param_array_named(adj, lowmem_adj, short, &lowmem_adj_size,
module_param_array_named(minfree, lowmem_minfree, uint, &lowmem_minfree_size,
S_IRUGO | S_IWUSR);
module_param_named(debug_level, lowmem_debug_level, uint, S_IRUGO | S_IWUSR);
module_param_named(lmk_fast_run, lmk_fast_run, int, S_IRUGO | S_IWUSR);
module_init(lowmem_init);
module_exit(lowmem_exit);

View File

@ -0,0 +1,541 @@
/*
* Copyright (C) 2008 Google, Inc.
*
* Based on, but no longer compatible with, the original
* OpenBinder.org binder driver interface, which is:
*
* Copyright (c) 2005 Palmsource, Inc.
*
* This software is licensed under the terms of the GNU General Public
* License version 2, as published by the Free Software Foundation, and
* may be copied, distributed, and modified under those terms.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
*/
#ifndef _UAPI_LINUX_BINDER_H
#define _UAPI_LINUX_BINDER_H
#include <linux/ioctl.h>
#define B_PACK_CHARS(c1, c2, c3, c4) \
((((c1)<<24)) | (((c2)<<16)) | (((c3)<<8)) | (c4))
#define B_TYPE_LARGE 0x85
enum {
BINDER_TYPE_BINDER = B_PACK_CHARS('s', 'b', '*', B_TYPE_LARGE),
BINDER_TYPE_WEAK_BINDER = B_PACK_CHARS('w', 'b', '*', B_TYPE_LARGE),
BINDER_TYPE_HANDLE = B_PACK_CHARS('s', 'h', '*', B_TYPE_LARGE),
BINDER_TYPE_WEAK_HANDLE = B_PACK_CHARS('w', 'h', '*', B_TYPE_LARGE),
BINDER_TYPE_FD = B_PACK_CHARS('f', 'd', '*', B_TYPE_LARGE),
BINDER_TYPE_FDA = B_PACK_CHARS('f', 'd', 'a', B_TYPE_LARGE),
BINDER_TYPE_PTR = B_PACK_CHARS('p', 't', '*', B_TYPE_LARGE),
};
/**
* enum flat_binder_object_shifts: shift values for flat_binder_object_flags
* @FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT: shift for getting scheduler policy.
*
*/
enum flat_binder_object_shifts {
FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT = 9,
};
/**
* enum flat_binder_object_flags - flags for use in flat_binder_object.flags
*/
enum flat_binder_object_flags {
/**
* @FLAT_BINDER_FLAG_PRIORITY_MASK: bit-mask for min scheduler priority
*
* These bits can be used to set the minimum scheduler priority
* at which transactions into this node should run. Valid values
* in these bits depend on the scheduler policy encoded in
* @FLAT_BINDER_FLAG_SCHED_POLICY_MASK.
*
* For SCHED_NORMAL/SCHED_BATCH, the valid range is between [-20..19]
* For SCHED_FIFO/SCHED_RR, the value can run between [1..99]
*/
FLAT_BINDER_FLAG_PRIORITY_MASK = 0xff,
/**
* @FLAT_BINDER_FLAG_ACCEPTS_FDS: whether the node accepts fds.
*/
FLAT_BINDER_FLAG_ACCEPTS_FDS = 0x100,
/**
* @FLAT_BINDER_FLAG_SCHED_POLICY_MASK: bit-mask for scheduling policy
*
* These two bits can be used to set the min scheduling policy at which
* transactions on this node should run. These match the UAPI
* scheduler policy values, eg:
* 00b: SCHED_NORMAL
* 01b: SCHED_FIFO
* 10b: SCHED_RR
* 11b: SCHED_BATCH
*/
FLAT_BINDER_FLAG_SCHED_POLICY_MASK =
3U << FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT,
/**
* @FLAT_BINDER_FLAG_INHERIT_RT: whether the node inherits RT policy
*
* Only when set, calls into this node will inherit a real-time
* scheduling policy from the caller (for synchronous transactions).
*/
FLAT_BINDER_FLAG_INHERIT_RT = 0x800,
/**
* @FLAT_BINDER_FLAG_TXN_SECURITY_CTX: request security contexts
*
* Only when set, causes senders to include their security
* context
*/
FLAT_BINDER_FLAG_TXN_SECURITY_CTX = 0x1000,
};
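/*
 * A hedged example of composing these flags for a node that accepts file
 * descriptors and requests SCHED_FIFO at real-time priority 50 (SCHED_FIFO
 * comes from <sched.h>; the values follow the ranges documented above):
 *
 *	__u32 flags = FLAT_BINDER_FLAG_ACCEPTS_FDS |
 *		      (SCHED_FIFO << FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT) |
 *		      (50 & FLAT_BINDER_FLAG_PRIORITY_MASK);
 */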
#ifdef BINDER_IPC_32BIT
typedef __u32 binder_size_t;
typedef __u32 binder_uintptr_t;
#else
typedef __u64 binder_size_t;
typedef __u64 binder_uintptr_t;
#endif
/**
* struct binder_object_header - header shared by all binder metadata objects.
* @type: type of the object
*/
struct binder_object_header {
__u32 type;
};
/*
* This is the flattened representation of a Binder object for transfer
* between processes. The 'offsets' supplied as part of a binder transaction
* contains offsets into the data where these structures occur. The Binder
* driver takes care of re-writing the structure type and data as it moves
* between processes.
*/
struct flat_binder_object {
struct binder_object_header hdr;
__u32 flags;
/* 8 bytes of data. */
union {
binder_uintptr_t binder; /* local object */
__u32 handle; /* remote object */
};
/* extra data associated with local object */
binder_uintptr_t cookie;
};
/**
* struct binder_fd_object - describes a filedescriptor to be fixed up.
* @hdr: common header structure
* @pad_flags: padding to remain compatible with old userspace code
* @pad_binder: padding to remain compatible with old userspace code
* @fd: file descriptor
* @cookie: opaque data, used by user-space
*/
struct binder_fd_object {
struct binder_object_header hdr;
__u32 pad_flags;
union {
binder_uintptr_t pad_binder;
__u32 fd;
};
binder_uintptr_t cookie;
};
/* struct binder_buffer_object - object describing a userspace buffer
* @hdr: common header structure
* @flags: one or more BINDER_BUFFER_* flags
* @buffer: address of the buffer
* @length: length of the buffer
* @parent: index in offset array pointing to parent buffer
* @parent_offset: offset in @parent pointing to this buffer
*
* A binder_buffer object represents an object that the
* binder kernel driver can copy verbatim to the target
* address space. A buffer itself may be pointed to from
* within another buffer, meaning that the pointer inside
* that other buffer needs to be fixed up as well. This
* can be done by setting the BINDER_BUFFER_FLAG_HAS_PARENT
* flag in @flags, by setting @parent buffer to the index
* in the offset array pointing to the parent binder_buffer_object,
* and by setting @parent_offset to the offset in the parent buffer
* at which the pointer to this buffer is located.
*/
struct binder_buffer_object {
struct binder_object_header hdr;
__u32 flags;
binder_uintptr_t buffer;
binder_size_t length;
binder_size_t parent;
binder_size_t parent_offset;
};
enum {
BINDER_BUFFER_FLAG_HAS_PARENT = 0x01,
};
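/*
 * A hedged example of the parent fixup described above ("payload" and the
 * numeric values are illustrative): if the parent binder_buffer_object is
 * at index 1 in the offsets array and stores the pointer to this buffer at
 * byte offset 16, the nested buffer would be described as
 *
 *	struct binder_buffer_object bbo = {
 *		.hdr.type      = BINDER_TYPE_PTR,
 *		.flags         = BINDER_BUFFER_FLAG_HAS_PARENT,
 *		.buffer        = (binder_uintptr_t)payload,
 *		.length        = payload_len,
 *		.parent        = 1,
 *		.parent_offset = 16,
 *	};
 */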
/* struct binder_fd_array_object - object describing an array of fds in a buffer
* @hdr: common header structure
* @pad: padding to ensure correct alignment
* @num_fds: number of file descriptors in the buffer
* @parent: index in offset array to buffer holding the fd array
* @parent_offset: start offset of fd array in the buffer
*
* A binder_fd_array object represents an array of file
* descriptors embedded in a binder_buffer_object. It is
* different from a regular binder_buffer_object because it
* describes a list of file descriptors to fix up, not an opaque
* blob of memory, and hence the kernel needs to treat it differently.
*
* An example of how this would be used is with Android's
* native_handle_t object, which is a struct with a list of integers
* and a list of file descriptors. The native_handle_t struct itself
will be represented by a struct binder_buffer_object, whereas the
* embedded list of file descriptors is represented by a
* struct binder_fd_array_object with that binder_buffer_object as
* a parent.
*/
struct binder_fd_array_object {
struct binder_object_header hdr;
__u32 pad;
binder_size_t num_fds;
binder_size_t parent;
binder_size_t parent_offset;
};
/*
* On 64-bit platforms where user code may run in 32-bits the driver must
* translate the buffer (and local binder) addresses appropriately.
*/
struct binder_write_read {
binder_size_t write_size; /* bytes to write */
binder_size_t write_consumed; /* bytes consumed by driver */
binder_uintptr_t write_buffer;
binder_size_t read_size; /* bytes to read */
binder_size_t read_consumed; /* bytes consumed by driver */
binder_uintptr_t read_buffer;
};
/* Use with BINDER_VERSION, driver fills in fields. */
struct binder_version {
/* driver protocol version -- increment with incompatible change */
__s32 protocol_version;
};
/* This is the current protocol version. */
#ifdef BINDER_IPC_32BIT
#define BINDER_CURRENT_PROTOCOL_VERSION 7
#else
#define BINDER_CURRENT_PROTOCOL_VERSION 8
#endif
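/*
 * A minimal userspace sketch of checking the protocol version (the
 * /dev/binder path is the conventional device node, not defined in this
 * header; BINDER_VERSION is defined further below):
 *
 *	struct binder_version vers;
 *	int fd = open("/dev/binder", O_RDWR | O_CLOEXEC);
 *
 *	if (fd < 0 || ioctl(fd, BINDER_VERSION, &vers) < 0 ||
 *	    vers.protocol_version != BINDER_CURRENT_PROTOCOL_VERSION)
 *		... bail out: driver and header disagree ...
 */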
/*
* Use with BINDER_GET_NODE_DEBUG_INFO, driver reads ptr, writes to all fields.
* Set ptr to NULL for the first call to get the info for the first node, and
* then repeat the call passing the previously returned value to get the next
* nodes. ptr will be 0 when there are no more nodes.
*/
struct binder_node_debug_info {
binder_uintptr_t ptr;
binder_uintptr_t cookie;
__u32 has_strong_ref;
__u32 has_weak_ref;
};
struct binder_node_info_for_ref {
__u32 handle;
__u32 strong_count;
__u32 weak_count;
__u32 reserved1;
__u32 reserved2;
__u32 reserved3;
};
#define BINDER_WRITE_READ _IOWR('b', 1, struct binder_write_read)
#define BINDER_SET_IDLE_TIMEOUT _IOW('b', 3, __s64)
#define BINDER_SET_MAX_THREADS _IOW('b', 5, __u32)
#define BINDER_SET_IDLE_PRIORITY _IOW('b', 6, __s32)
#define BINDER_SET_CONTEXT_MGR _IOW('b', 7, __s32)
#define BINDER_THREAD_EXIT _IOW('b', 8, __s32)
#define BINDER_VERSION _IOWR('b', 9, struct binder_version)
#define BINDER_GET_NODE_DEBUG_INFO _IOWR('b', 11, struct binder_node_debug_info)
#define BINDER_GET_NODE_INFO_FOR_REF _IOWR('b', 12, struct binder_node_info_for_ref)
#define BINDER_SET_CONTEXT_MGR_EXT _IOW('b', 13, struct flat_binder_object)
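/*
 * Illustrative sketch, not part of this header: walking all nodes with
 * BINDER_GET_NODE_DEBUG_INFO as described above. Assumes "binder_fd" is an
 * open /dev/binder descriptor and that <stdio.h> and <sys/ioctl.h> are
 * available; dump_binder_nodes() is a made-up name.
 */
static inline void dump_binder_nodes(int binder_fd)
{
	struct binder_node_debug_info info = { .ptr = 0 };

	do {
		if (ioctl(binder_fd, BINDER_GET_NODE_DEBUG_INFO, &info) < 0)
			break;
		if (info.ptr)
			printf("node %llx strong=%u weak=%u\n",
			       (unsigned long long)info.ptr,
			       info.has_strong_ref, info.has_weak_ref);
	} while (info.ptr);	/* ptr is 0 once every node has been reported */
}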
/*
* NOTE: Two special error codes you should check for when calling
* in to the driver are:
*
 * EINTR -- The operation has been interrupted. This should be
* handled by retrying the ioctl() until a different error code
* is returned.
*
* ECONNREFUSED -- The driver is no longer accepting operations
* from your process. That is, the process is being destroyed.
* You should handle this by exiting from your process. Note
* that once this error code is returned, all further calls to
* the driver from any thread will return this same code.
*/
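/*
 * Illustrative sketch, not part of this header: the usual write/read loop,
 * retrying on EINTR and giving up on ECONNREFUSED as described in the note
 * above. Assumes <errno.h>, <stdint.h> and <sys/ioctl.h>; "binder_fd" and
 * binder_talk() are made-up names and error handling is trimmed.
 */
static inline int binder_talk(int binder_fd,
			      void *write_buf, binder_size_t write_size,
			      void *read_buf, binder_size_t read_size)
{
	struct binder_write_read bwr;

	bwr.write_buffer = (binder_uintptr_t)(uintptr_t)write_buf;
	bwr.write_size = write_size;
	bwr.write_consumed = 0;
	bwr.read_buffer = (binder_uintptr_t)(uintptr_t)read_buf;
	bwr.read_size = read_size;
	bwr.read_consumed = 0;

	while (ioctl(binder_fd, BINDER_WRITE_READ, &bwr) < 0) {
		if (errno == EINTR)
			continue;	/* interrupted: retry the same ioctl */
		return -errno;		/* e.g. ECONNREFUSED: caller should bail out */
	}
	return 0;
}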
enum transaction_flags {
TF_ONE_WAY = 0x01, /* this is a one-way call: async, no return */
TF_ROOT_OBJECT = 0x04, /* contents are the component's root object */
TF_STATUS_CODE = 0x08, /* contents are a 32-bit status code */
TF_ACCEPT_FDS = 0x10, /* allow replies with file descriptors */
};
struct binder_transaction_data {
/* The first two are only used for bcTRANSACTION and brTRANSACTION,
* identifying the target and contents of the transaction.
*/
union {
/* target descriptor of command transaction */
__u32 handle;
/* target descriptor of return transaction */
binder_uintptr_t ptr;
} target;
binder_uintptr_t cookie; /* target object cookie */
__u32 code; /* transaction command */
/* General information about the transaction. */
__u32 flags;
pid_t sender_pid;
uid_t sender_euid;
binder_size_t data_size; /* number of bytes of data */
binder_size_t offsets_size; /* number of bytes of offsets */
/* If this transaction is inline, the data immediately
* follows here; otherwise, it ends with a pointer to
* the data buffer.
*/
union {
struct {
/* transaction data */
binder_uintptr_t buffer;
/* offsets from buffer to flat_binder_object structs */
binder_uintptr_t offsets;
} ptr;
__u8 buf[8];
} data;
};
struct binder_transaction_data_secctx {
struct binder_transaction_data transaction_data;
binder_uintptr_t secctx;
};
struct binder_transaction_data_sg {
struct binder_transaction_data transaction_data;
binder_size_t buffers_size;
};
struct binder_ptr_cookie {
binder_uintptr_t ptr;
binder_uintptr_t cookie;
};
struct binder_handle_cookie {
__u32 handle;
binder_uintptr_t cookie;
} __packed;
struct binder_pri_desc {
__s32 priority;
__u32 desc;
};
struct binder_pri_ptr_cookie {
__s32 priority;
binder_uintptr_t ptr;
binder_uintptr_t cookie;
};
enum binder_driver_return_protocol {
BR_ERROR = _IOR('r', 0, __s32),
/*
* int: error code
*/
BR_OK = _IO('r', 1),
/* No parameters! */
BR_TRANSACTION_SEC_CTX = _IOR('r', 2,
struct binder_transaction_data_secctx),
/*
* binder_transaction_data_secctx: the received command.
*/
BR_TRANSACTION = _IOR('r', 2, struct binder_transaction_data),
BR_REPLY = _IOR('r', 3, struct binder_transaction_data),
/*
* binder_transaction_data: the received command.
*/
BR_ACQUIRE_RESULT = _IOR('r', 4, __s32),
/*
* not currently supported
* int: 0 if the last bcATTEMPT_ACQUIRE was not successful.
* Else the remote object has acquired a primary reference.
*/
BR_DEAD_REPLY = _IO('r', 5),
/*
* The target of the last transaction (either a bcTRANSACTION or
* a bcATTEMPT_ACQUIRE) is no longer with us. No parameters.
*/
BR_TRANSACTION_COMPLETE = _IO('r', 6),
/*
* No parameters... always refers to the last transaction requested
* (including replies). Note that this will be sent even for
* asynchronous transactions.
*/
BR_INCREFS = _IOR('r', 7, struct binder_ptr_cookie),
BR_ACQUIRE = _IOR('r', 8, struct binder_ptr_cookie),
BR_RELEASE = _IOR('r', 9, struct binder_ptr_cookie),
BR_DECREFS = _IOR('r', 10, struct binder_ptr_cookie),
/*
* void *: ptr to binder
* void *: cookie for binder
*/
BR_ATTEMPT_ACQUIRE = _IOR('r', 11, struct binder_pri_ptr_cookie),
/*
* not currently supported
* int: priority
* void *: ptr to binder
* void *: cookie for binder
*/
BR_NOOP = _IO('r', 12),
/*
* No parameters. Do nothing and examine the next command. It exists
* primarily so that we can replace it with a BR_SPAWN_LOOPER command.
*/
BR_SPAWN_LOOPER = _IO('r', 13),
/*
* No parameters. The driver has determined that a process has no
* threads waiting to service incoming transactions. When a process
* receives this command, it must spawn a new service thread and
* register it via bcENTER_LOOPER.
*/
BR_FINISHED = _IO('r', 14),
/*
* not currently supported
* stop threadpool thread
*/
BR_DEAD_BINDER = _IOR('r', 15, binder_uintptr_t),
/*
* void *: cookie
*/
BR_CLEAR_DEATH_NOTIFICATION_DONE = _IOR('r', 16, binder_uintptr_t),
/*
* void *: cookie
*/
BR_FAILED_REPLY = _IO('r', 17),
/*
 * The last transaction (either a bcTRANSACTION or
* a bcATTEMPT_ACQUIRE) failed (e.g. out of memory). No parameters.
*/
};
enum binder_driver_command_protocol {
BC_TRANSACTION = _IOW('c', 0, struct binder_transaction_data),
BC_REPLY = _IOW('c', 1, struct binder_transaction_data),
/*
* binder_transaction_data: the sent command.
*/
BC_ACQUIRE_RESULT = _IOW('c', 2, __s32),
/*
* not currently supported
* int: 0 if the last BR_ATTEMPT_ACQUIRE was not successful.
* Else you have acquired a primary reference on the object.
*/
BC_FREE_BUFFER = _IOW('c', 3, binder_uintptr_t),
/*
* void *: ptr to transaction data received on a read
*/
BC_INCREFS = _IOW('c', 4, __u32),
BC_ACQUIRE = _IOW('c', 5, __u32),
BC_RELEASE = _IOW('c', 6, __u32),
BC_DECREFS = _IOW('c', 7, __u32),
/*
* int: descriptor
*/
BC_INCREFS_DONE = _IOW('c', 8, struct binder_ptr_cookie),
BC_ACQUIRE_DONE = _IOW('c', 9, struct binder_ptr_cookie),
/*
* void *: ptr to binder
* void *: cookie for binder
*/
BC_ATTEMPT_ACQUIRE = _IOW('c', 10, struct binder_pri_desc),
/*
* not currently supported
* int: priority
* int: descriptor
*/
BC_REGISTER_LOOPER = _IO('c', 11),
/*
* No parameters.
* Register a spawned looper thread with the device.
*/
BC_ENTER_LOOPER = _IO('c', 12),
BC_EXIT_LOOPER = _IO('c', 13),
/*
* No parameters.
* These two commands are sent as an application-level thread
* enters and exits the binder loop, respectively. They are
* used so the binder can have an accurate count of the number
* of looping threads it has available.
*/
BC_REQUEST_DEATH_NOTIFICATION = _IOW('c', 14,
struct binder_handle_cookie),
/*
* int: handle
* void *: cookie
*/
BC_CLEAR_DEATH_NOTIFICATION = _IOW('c', 15,
struct binder_handle_cookie),
/*
* int: handle
* void *: cookie
*/
BC_DEAD_BINDER_DONE = _IOW('c', 16, binder_uintptr_t),
/*
* void *: cookie
*/
BC_TRANSACTION_SG = _IOW('c', 17, struct binder_transaction_data_sg),
BC_REPLY_SG = _IOW('c', 18, struct binder_transaction_data_sg),
/*
* binder_transaction_data_sg: the sent command.
*/
};
#endif /* _UAPI_LINUX_BINDER_H */

View File

@@ -181,17 +181,7 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
#include <uapi/linux/types.h>
static __always_inline void data_access_exceeds_word_size(void)
#ifdef __compiletime_warning
__compiletime_warning("data access exceeds word size and won't be atomic")
#endif
;
static __always_inline void data_access_exceeds_word_size(void)
{
}
static __always_inline void __read_once_size(volatile void *p, void *res, int size)
static __always_inline void __read_once_size(const volatile void *p, void *res, int size)
{
switch (size) {
case 1: *(__u8 *)res = *(volatile __u8 *)p; break;
@@ -203,7 +193,6 @@ static __always_inline void __read_once_size(volatile void *p, void *res, int si
default:
barrier();
__builtin_memcpy((void *)res, (const void *)p, size);
data_access_exceeds_word_size();
barrier();
}
}
@@ -220,7 +209,6 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
default:
barrier();
__builtin_memcpy((void *)p, (const void *)res, size);
data_access_exceeds_word_size();
barrier();
}
}
@@ -248,10 +236,10 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
*/
#define READ_ONCE(x) \
({ typeof(x) __val; __read_once_size(&x, &__val, sizeof(__val)); __val; })
({ union { typeof(x) __val; char __c[1]; } __u; __read_once_size(&(x), __u.__c, sizeof(x)); __u.__val; })
#define WRITE_ONCE(x, val) \
({ typeof(x) __val; __val = val; __write_once_size(&x, &__val, sizeof(__val)); __val; })
({ typeof(x) __val = (val); __write_once_size(&(x), &__val, sizeof(__val)); __val; })
#endif /* __KERNEL__ */
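/*
 * Illustrative sketch, not part of this hunk: the lockless pattern the
 * reworked READ_ONCE()/WRITE_ONCE() forms are aimed at. "demo_flag",
 * demo_publish() and demo_poll() are made-up names; the union-based macro
 * ensures each access is a single, non-torn load or store.
 */
static int demo_flag;

static inline void demo_publish(void)
{
	WRITE_ONCE(demo_flag, 1);	/* emitted as exactly one store */
}

static inline int demo_poll(void)
{
	return READ_ONCE(demo_flag);	/* emitted as exactly one load */
}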

View File

@@ -388,6 +388,17 @@ static inline void list_splice_tail_init(struct list_head *list,
#define list_prev_entry(pos, member) \
list_entry((pos)->member.prev, typeof(*(pos)), member)
/**
* list_first_entry_or_null - get the first element from a list
* @ptr: the list head to take the element from.
* @type: the type of the struct this is embedded in.
* @member: the name of the list_struct within the struct.
*
* Note that if the list is empty, it returns NULL.
*/
#define list_first_entry_or_null(ptr, type, member) \
(!list_empty(ptr) ? list_first_entry(ptr, type, member) : NULL)
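/*
 * Illustrative sketch, not part of this hunk: using the new helper to fetch
 * the head element without a separate list_empty() check. "struct demo_item"
 * and demo_peek_first() are made-up names.
 */
struct demo_item {
	struct list_head link;
	int value;
};

static inline struct demo_item *demo_peek_first(struct list_head *head)
{
	/* NULL when the list is empty, per the comment above. */
	return list_first_entry_or_null(head, struct demo_item, link);
}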
/**
* list_for_each - iterate over a list
* @pos: the &struct list_head to use as a loop cursor.

137
include/linux/list_lru.h Normal file
View File

@@ -0,0 +1,137 @@
/*
* Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved.
* Authors: David Chinner and Glauber Costa
*
* Generic LRU infrastructure
*/
#ifndef _LRU_LIST_H
#define _LRU_LIST_H
#include <linux/list.h>
#include <linux/nodemask.h>
/* list_lru_walk_cb has to always return one of those */
enum lru_status {
LRU_REMOVED, /* item removed from list */
LRU_REMOVED_RETRY, /* item removed, but lock has been
dropped and reacquired */
LRU_ROTATE, /* item referenced, give another pass */
LRU_SKIP, /* item cannot be locked, skip */
LRU_RETRY, /* item not freeable. May drop the lock
internally, but has to return locked. */
};
struct list_lru_node {
spinlock_t lock;
struct list_head list;
/* kept as signed so we can catch imbalance bugs */
long nr_items;
} ____cacheline_aligned_in_smp;
struct list_lru {
struct list_lru_node *node;
nodemask_t active_nodes;
};
void list_lru_destroy(struct list_lru *lru);
int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key);
static inline int list_lru_init(struct list_lru *lru)
{
return list_lru_init_key(lru, NULL);
}
/**
* list_lru_add: add an element to the lru list's tail
* @list_lru: the lru pointer
* @item: the item to be added.
*
 * If the element is already part of a list, this function returns without
 * doing anything. Therefore the caller does not need to keep state about
 * whether or not the element already belongs in the list and is allowed to
 * lazily update it. Note however that this is valid for *a* list, not *this*
 * list. If the caller organizes itself in a way that elements can be in more
 * than one type of list, it is up to the caller to fully remove the item from
 * the previous list (with list_lru_del() for instance) before moving it
 * to @list_lru
*
* Return value: true if the list was updated, false otherwise
*/
bool list_lru_add(struct list_lru *lru, struct list_head *item);
/**
* list_lru_del: delete an element to the lru list
* @list_lru: the lru pointer
* @item: the item to be deleted.
*
 * This function works analogously to list_lru_add in terms of list
 * manipulation. The comments about an element already belonging to
 * a list are also valid for list_lru_del.
*
* Return value: true if the list was updated, false otherwise
*/
bool list_lru_del(struct list_lru *lru, struct list_head *item);
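/*
 * Illustrative sketch, not part of this header: an object with an embedded
 * list_head moving on and off a list_lru. "struct demo_obj", demo_lru and the
 * helpers are made-up names; the object is assumed to be kmalloc()-backed (so
 * virt_to_page() works) and its lru_node initialised with INIT_LIST_HEAD().
 */
struct demo_obj {
	struct list_head lru_node;
	unsigned long payload;
};

static inline void demo_obj_mark_unused(struct list_lru *demo_lru,
					struct demo_obj *obj)
{
	/* Returns false (does nothing) if the object is already on a list. */
	list_lru_add(demo_lru, &obj->lru_node);
}

static inline void demo_obj_mark_used(struct list_lru *demo_lru,
				      struct demo_obj *obj)
{
	/* Returns false (does nothing) if the object is not on the list. */
	list_lru_del(demo_lru, &obj->lru_node);
}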
/**
* list_lru_count_node: return the number of objects currently held by @lru
* @lru: the lru pointer.
* @nid: the node id to count from.
*
* Always return a non-negative number, 0 for empty lists. There is no
* guarantee that the list is not updated while the count is being computed.
* Callers that want such a guarantee need to provide an outer lock.
*/
unsigned long list_lru_count_node(struct list_lru *lru, int nid);
static inline unsigned long list_lru_count(struct list_lru *lru)
{
long count = 0;
int nid;
for_each_node_mask(nid, lru->active_nodes)
count += list_lru_count_node(lru, nid);
return count;
}
typedef enum lru_status
(*list_lru_walk_cb)(struct list_head *item, spinlock_t *lock, void *cb_arg);
/**
* list_lru_walk_node: walk a list_lru, isolating and disposing freeable items.
* @lru: the lru pointer.
* @nid: the node id to scan from.
 * @isolate: callback function that is responsible for deciding what to do with
* the item currently being scanned
* @cb_arg: opaque type that will be passed to @isolate
* @nr_to_walk: how many items to scan.
*
* This function will scan all elements in a particular list_lru, calling the
* @isolate callback for each of those items, along with the current list
 * spinlock and a caller-provided opaque argument. The @isolate callback can choose to
* drop the lock internally, but *must* return with the lock held. The callback
* will return an enum lru_status telling the list_lru infrastructure what to
* do with the object being scanned.
*
* Please note that nr_to_walk does not mean how many objects will be freed,
* just how many objects will be scanned.
*
* Return value: the number of objects effectively removed from the LRU.
*/
unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
list_lru_walk_cb isolate, void *cb_arg,
unsigned long *nr_to_walk);
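/*
 * Illustrative sketch, not part of this header: a minimal @isolate callback
 * that frees every object it is handed, reusing "struct demo_obj" from the
 * sketch above (kfree() comes from <linux/slab.h>). The callback itself
 * removes the item and reports that with LRU_REMOVED; kfree() does not
 * sleep, so calling it under the lru lock is fine here.
 */
static inline enum lru_status demo_isolate(struct list_head *item,
					   spinlock_t *lock, void *cb_arg)
{
	struct demo_obj *obj = container_of(item, struct demo_obj, lru_node);

	list_del_init(item);	/* take the object off the lru list */
	kfree(obj);		/* dispose of it */
	return LRU_REMOVED;	/* nr_items is decremented by the walker */
}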
static inline unsigned long
list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate,
void *cb_arg, unsigned long nr_to_walk)
{
long isolated = 0;
int nid;
for_each_node_mask(nid, lru->active_nodes) {
isolated += list_lru_walk_node(lru, nid, isolate,
cb_arg, &nr_to_walk);
if (nr_to_walk <= 0)
break;
}
return isolated;
}
#endif /* _LRU_LIST_H */

View File

@@ -1560,6 +1560,62 @@ struct task_struct {
/* Future-safe accessor for struct task_struct's cpus_allowed. */
#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
#define MAX_NICE 19
#define MIN_NICE -20
#define NICE_WIDTH (MAX_NICE - MIN_NICE + 1)
/*
* Priority of a process goes from 0..MAX_PRIO-1, valid RT
* priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
* tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
* values are inverted: lower p->prio value means higher priority.
*
* The MAX_USER_RT_PRIO value allows the actual maximum
* RT priority to be separate from the value exported to
* user-space. This allows kernel threads to set their
* priority to a value higher than any user task. Note:
* MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
*/
#define MAX_USER_RT_PRIO 100
#define MAX_RT_PRIO MAX_USER_RT_PRIO
#define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH)
#define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2)
/*
* Convert user-nice values [ -20 ... 0 ... 19 ]
* to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
* and back.
*/
#define NICE_TO_PRIO(nice) ((nice) + DEFAULT_PRIO)
#define PRIO_TO_NICE(prio) ((prio) - DEFAULT_PRIO)
#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
/*
* 'User priority' is the nice value converted to something we
* can work with better when scaling various scheduler parameters,
* it's a [ 0 ... 39 ] range.
*/
#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
/*
* Convert nice value [19,-20] to rlimit style value [1,40].
*/
static inline long nice_to_rlimit(long nice)
{
return (MAX_NICE - nice + 1);
}
/*
* Convert rlimit style value [1,40] to nice value [-20, 19].
*/
static inline long rlimit_to_nice(long prio)
{
return (MAX_NICE - prio + 1);
}
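/*
 * Illustrative worked example, not part of this hunk: with MAX_RT_PRIO == 100
 * and NICE_WIDTH == 40, DEFAULT_PRIO is 120, so NICE_TO_PRIO(0) == 120,
 * NICE_TO_PRIO(-20) == 100 and NICE_TO_PRIO(19) == 139. Likewise
 * nice_to_rlimit(-20) == 40 and nice_to_rlimit(19) == 1, and
 * rlimit_to_nice() maps those values straight back.
 */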
#ifdef CONFIG_NUMA_BALANCING
extern void task_numa_fault(int node, int pages, bool migrated);
extern void set_numabalancing_state(bool enabled);

View File

@@ -1,24 +1,9 @@
#ifndef _SCHED_RT_H
#define _SCHED_RT_H
/*
* Priority of a process goes from 0..MAX_PRIO-1, valid RT
* priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
* tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
* values are inverted: lower p->prio value means higher priority.
*
* The MAX_USER_RT_PRIO value allows the actual maximum
* RT priority to be separate from the value exported to
* user-space. This allows kernel threads to set their
* priority to a value higher than any user task. Note:
* MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
*/
#define MAX_USER_RT_PRIO 100
#define MAX_RT_PRIO MAX_USER_RT_PRIO
#define MAX_PRIO (MAX_RT_PRIO + 40)
#define DEFAULT_PRIO (MAX_RT_PRIO + 20)
#define MAX_NICE 19
#define MIN_NICE -20
#define NICE_WIDTH (MAX_NICE - MIN_NICE + 1)
static inline int rt_prio(int prio)
{

View File

@@ -4,6 +4,12 @@
/*
* This struct is used to pass information from page reclaim to the shrinkers.
 * We consolidate the values for easier extension later.
*
* The 'gfpmask' refers to the allocation we are currently trying to
* fulfil.
*
* Note that 'shrink' will be passed nr_to_scan == 0 when the VM is
* querying the cache size, so a fastpath for that case is appropriate.
*/
struct shrink_control {
gfp_t gfp_mask;
@@ -12,23 +18,37 @@ struct shrink_control {
unsigned long nr_to_scan;
};
#define SHRINK_STOP (~0UL)
/*
* A callback you can register to apply pressure to ageable caches.
*
* 'sc' is passed shrink_control which includes a count 'nr_to_scan'
* and a 'gfpmask'. It should look through the least-recently-used
* 'nr_to_scan' entries and attempt to free them up. It should return
* the number of objects which remain in the cache. If it returns -1, it means
* it cannot do any scanning at this time (eg. there is a risk of deadlock).
* @shrink() should look through the least-recently-used 'nr_to_scan' entries
* and attempt to free them up. It should return the number of objects which
* remain in the cache. If it returns -1, it means it cannot do any scanning at
* this time (eg. there is a risk of deadlock).
*
* The 'gfpmask' refers to the allocation we are currently trying to
* fulfil.
* @count_objects should return the number of freeable items in the cache. If
* there are no objects to free or the number of freeable items cannot be
* determined, it should return 0. No deadlock checks should be done during the
 * count callback - the shrinker relies on aggregating the scan counts that
 * could not be executed due to potential deadlocks so that they can be run
 * at a later call, once the deadlock condition is no longer pending.
*
* Note that 'shrink' will be passed nr_to_scan == 0 when the VM is
* querying the cache size, so a fastpath for that case is appropriate.
* @scan_objects will only be called if @count_objects returned a non-zero
* value for the number of freeable objects. The callout should scan the cache
* and attempt to free items from the cache. It should then return the number
* of objects freed during the scan, or SHRINK_STOP if progress cannot be made
* due to potential deadlocks. If SHRINK_STOP is returned, then no further
* attempts to call the @scan_objects will be made from the current reclaim
* context.
*/
struct shrinker {
int (*shrink)(struct shrinker *, struct shrink_control *sc);
unsigned long (*count_objects)(struct shrinker *,
struct shrink_control *sc);
unsigned long (*scan_objects)(struct shrinker *,
struct shrink_control *sc);
int seeks; /* seeks to recreate an obj */
long batch; /* reclaim batch size, 0 = default */
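/*
 * Illustrative sketch, not part of this header: a shrinker built on the new
 * count_objects/scan_objects pair, backed by the list_lru and demo_isolate()
 * sketched in the list_lru.h excerpt above. "demo_lru", the callbacks and
 * demo_shrinker are made-up names; the shrinker would be registered with
 * register_shrinker(&demo_shrinker).
 */
static struct list_lru demo_lru;

static unsigned long demo_count_objects(struct shrinker *s,
					struct shrink_control *sc)
{
	/* 0 tells the VM there is currently nothing to free. */
	return list_lru_count(&demo_lru);
}

static unsigned long demo_scan_objects(struct shrinker *s,
				       struct shrink_control *sc)
{
	/* Report how many objects this pass actually freed. */
	return list_lru_walk(&demo_lru, demo_isolate, NULL, sc->nr_to_scan);
}

static struct shrinker demo_shrinker = {
	.count_objects	= demo_count_objects,
	.scan_objects	= demo_scan_objects,
	.seeks		= DEFAULT_SEEKS,
};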

View File

@@ -1,50 +0,0 @@
#ifndef __LINUX_VMPRESSURE_H
#define __LINUX_VMPRESSURE_H
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/workqueue.h>
#include <linux/gfp.h>
#include <linux/types.h>
#include <linux/cgroup.h>
struct vmpressure {
unsigned long scanned;
unsigned long reclaimed;
unsigned long stall;
/* The lock is used to keep the scanned/reclaimed above in sync. */
struct mutex sr_lock;
/* The list of vmpressure_event structs. */
struct list_head events;
/* Have to grab the lock on events traversal or modifications. */
struct mutex events_lock;
struct work_struct work;
};
struct mem_cgroup;
extern int vmpressure_notifier_register(struct notifier_block *nb);
extern int vmpressure_notifier_unregister(struct notifier_block *nb);
extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
unsigned long scanned, unsigned long reclaimed);
extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio);
#ifdef CONFIG_MEMCG
extern void vmpressure_init(struct vmpressure *vmpr);
extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg);
extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr);
extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css);
extern int vmpressure_register_event(struct cgroup *cg, struct cftype *cft,
struct eventfd_ctx *eventfd,
const char *args);
extern void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft,
struct eventfd_ctx *eventfd);
#else
static inline struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
return NULL;
}
#endif /* CONFIG_MEMCG */
#endif /* __LINUX_VMPRESSURE_H */

View File

@@ -1,84 +0,0 @@
/* Copyright (c) 2015, The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
* only version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#undef TRACE_SYSTEM
#define TRACE_SYSTEM almk
#if !defined(_TRACE_EVENT_ALMK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_EVENT_ALMK_H
#include <linux/tracepoint.h>
#include <linux/types.h>
TRACE_EVENT(almk_vmpressure,
TP_PROTO(unsigned long pressure,
int other_free,
int other_file),
TP_ARGS(pressure, other_free, other_file),
TP_STRUCT__entry(
__field(unsigned long, pressure)
__field(int, other_free)
__field(int, other_file)
),
TP_fast_assign(
__entry->pressure = pressure;
__entry->other_free = other_free;
__entry->other_file = other_file;
),
TP_printk("%lu, %d, %d",
__entry->pressure, __entry->other_free,
__entry->other_file)
);
TRACE_EVENT(almk_shrink,
TP_PROTO(int tsize,
int vmp,
int other_free,
int other_file,
short adj),
TP_ARGS(tsize, vmp, other_free, other_file, adj),
TP_STRUCT__entry(
__field(int, tsize)
__field(int, vmp)
__field(int, other_free)
__field(int, other_file)
__field(short, adj)
),
TP_fast_assign(
__entry->tsize = tsize;
__entry->vmp = vmp;
__entry->other_free = other_free;
__entry->other_file = other_file;
__entry->adj = adj;
),
TP_printk("%d, %d, %d, %d, %d",
__entry->tsize,
__entry->vmp,
__entry->other_free,
__entry->other_file,
__entry->adj)
);
#endif
#include <trace/define_trace.h>

View File

@@ -17,24 +17,6 @@
extern __read_mostly int scheduler_running;
/*
* Convert user-nice values [ -20 ... 0 ... 19 ]
* to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
* and back.
*/
#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
/*
* 'User priority' is the nice value converted to something we
* can work with better when scaling various scheduler parameters,
* it's a [ 0 ... 39 ] range.
*/
#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
/*
* Helpers for converting nanosecond timing to jiffy resolution
*/

View File

@@ -17,8 +17,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
compaction.o balloon_compaction.o \
interval_tree.o $(mmu-y) \
showmem.o vmpressure.o
interval_tree.o showmem.o list_lru.o $(mmu-y)
obj-y += init-mm.o

152
mm/list_lru.c Normal file
View File

@@ -0,0 +1,152 @@
/*
* Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved.
* Authors: David Chinner and Glauber Costa
*
* Generic LRU infrastructure
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/list_lru.h>
#include <linux/slab.h>
bool list_lru_add(struct list_lru *lru, struct list_head *item)
{
int nid = page_to_nid(virt_to_page(item));
struct list_lru_node *nlru = &lru->node[nid];
spin_lock(&nlru->lock);
WARN_ON_ONCE(nlru->nr_items < 0);
if (list_empty(item)) {
list_add_tail(item, &nlru->list);
if (nlru->nr_items++ == 0)
node_set(nid, lru->active_nodes);
spin_unlock(&nlru->lock);
return true;
}
spin_unlock(&nlru->lock);
return false;
}
EXPORT_SYMBOL_GPL(list_lru_add);
bool list_lru_del(struct list_lru *lru, struct list_head *item)
{
int nid = page_to_nid(virt_to_page(item));
struct list_lru_node *nlru = &lru->node[nid];
spin_lock(&nlru->lock);
if (!list_empty(item)) {
list_del_init(item);
if (--nlru->nr_items == 0)
node_clear(nid, lru->active_nodes);
WARN_ON_ONCE(nlru->nr_items < 0);
spin_unlock(&nlru->lock);
return true;
}
spin_unlock(&nlru->lock);
return false;
}
EXPORT_SYMBOL_GPL(list_lru_del);
unsigned long
list_lru_count_node(struct list_lru *lru, int nid)
{
unsigned long count = 0;
struct list_lru_node *nlru = &lru->node[nid];
spin_lock(&nlru->lock);
WARN_ON_ONCE(nlru->nr_items < 0);
count += nlru->nr_items;
spin_unlock(&nlru->lock);
return count;
}
EXPORT_SYMBOL_GPL(list_lru_count_node);
unsigned long
list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate,
void *cb_arg, unsigned long *nr_to_walk)
{
struct list_lru_node *nlru = &lru->node[nid];
struct list_head *item, *n;
unsigned long isolated = 0;
spin_lock(&nlru->lock);
restart:
list_for_each_safe(item, n, &nlru->list) {
enum lru_status ret;
/*
* decrement nr_to_walk first so that we don't livelock if we
 * get stuck on large numbers of LRU_RETRY items
*/
if (!*nr_to_walk)
break;
--*nr_to_walk;
ret = isolate(item, &nlru->lock, cb_arg);
switch (ret) {
case LRU_REMOVED_RETRY:
assert_spin_locked(&nlru->lock);
case LRU_REMOVED:
if (--nlru->nr_items == 0)
node_clear(nid, lru->active_nodes);
WARN_ON_ONCE(nlru->nr_items < 0);
isolated++;
/*
* If the lru lock has been dropped, our list
* traversal is now invalid and so we have to
* restart from scratch.
*/
if (ret == LRU_REMOVED_RETRY)
goto restart;
break;
case LRU_ROTATE:
list_move_tail(item, &nlru->list);
break;
case LRU_SKIP:
break;
case LRU_RETRY:
/*
* The lru lock has been dropped, our list traversal is
* now invalid and so we have to restart from scratch.
*/
assert_spin_locked(&nlru->lock);
goto restart;
default:
BUG();
}
}
spin_unlock(&nlru->lock);
return isolated;
}
EXPORT_SYMBOL_GPL(list_lru_walk_node);
int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key)
{
int i;
size_t size = sizeof(*lru->node) * nr_node_ids;
lru->node = kzalloc(size, GFP_KERNEL);
if (!lru->node)
return -ENOMEM;
nodes_clear(lru->active_nodes);
for (i = 0; i < nr_node_ids; i++) {
spin_lock_init(&lru->node[i].lock);
if (key)
lockdep_set_class(&lru->node[i].lock, key);
INIT_LIST_HEAD(&lru->node[i].list);
lru->node[i].nr_items = 0;
}
return 0;
}
EXPORT_SYMBOL_GPL(list_lru_init_key);
void list_lru_destroy(struct list_lru *lru)
{
kfree(lru->node);
}
EXPORT_SYMBOL_GPL(list_lru_destroy);

View File

@@ -49,7 +49,6 @@
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
@@ -264,9 +263,6 @@ struct mem_cgroup {
*/
struct res_counter res;
/* vmpressure notifications */
struct vmpressure vmpressure;
union {
/*
* the counter to account for mem+swap usage.
@@ -366,7 +362,6 @@ struct mem_cgroup {
atomic_t numainfo_events;
atomic_t numainfo_updating;
#endif
/*
* Per cgroup active and inactive list, similar to the
* per zone LRU lists.
@@ -518,24 +513,6 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
return container_of(s, struct mem_cgroup, css);
}
/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
if (!memcg)
memcg = root_mem_cgroup;
return &memcg->vmpressure;
}
struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
{
return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}
struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
{
return &mem_cgroup_from_css(css)->vmpressure;
}
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
return (memcg == root_mem_cgroup);
@@ -6001,11 +5978,6 @@ static struct cftype mem_cgroup_files[] = {
.unregister_event = mem_cgroup_oom_unregister_event,
.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
},
{
.name = "pressure_level",
.register_event = vmpressure_register_event,
.unregister_event = vmpressure_unregister_event,
},
#ifdef CONFIG_NUMA
{
.name = "numa_stat",
@@ -6287,7 +6259,6 @@ mem_cgroup_css_alloc(struct cgroup *cont)
memcg->move_charge_at_immigrate = 0;
mutex_init(&memcg->thresholds_lock);
spin_lock_init(&memcg->move_lock);
vmpressure_init(&memcg->vmpressure);
return &memcg->css;

View File

@@ -1,502 +0,0 @@
/*
* Linux VM pressure
*
* Copyright 2012 Linaro Ltd.
* Anton Vorontsov <anton.vorontsov@linaro.org>
*
* Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro,
* Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation.
*/
#include <linux/cgroup.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/vmstat.h>
#include <linux/eventfd.h>
#include <linux/swap.h>
#include <linux/printk.h>
#include <linux/notifier.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/vmpressure.h>
/*
* The window size (vmpressure_win) is the number of scanned pages before
* we try to analyze scanned/reclaimed ratio. So the window is used as a
* rate-limit tunable for the "low" level notification, and also for
* averaging the ratio for medium/critical levels. Using small window
 * sizes can cause a lot of false positives, but too big a window size will
* delay the notifications.
*
* As the vmscan reclaimer logic works with chunks which are multiple of
* SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well.
*
* TODO: Make the window size depend on machine size, as we do for vmstat
* thresholds. Currently we set it to 512 pages (2MB for 4KB pages).
*/
static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
/*
* These thresholds are used when we account memory pressure through
* scanned/reclaimed ratio. The current values were chosen empirically. In
 * essence, they are percentages: the higher the value, the more
 * unsuccessful reclaims there were.
*/
static const unsigned int vmpressure_level_med = 60;
static const unsigned int vmpressure_level_critical = 95;
static unsigned long vmpressure_scale_max = 100;
module_param_named(vmpressure_scale_max, vmpressure_scale_max,
ulong, S_IRUGO | S_IWUSR);
/* vmpressure values >= this will be scaled based on allocstalls */
static unsigned long allocstall_threshold = 70;
module_param_named(allocstall_threshold, allocstall_threshold,
ulong, S_IRUGO | S_IWUSR);
static struct vmpressure global_vmpressure;
BLOCKING_NOTIFIER_HEAD(vmpressure_notifier);
int vmpressure_notifier_register(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&vmpressure_notifier, nb);
}
int vmpressure_notifier_unregister(struct notifier_block *nb)
{
return blocking_notifier_chain_unregister(&vmpressure_notifier, nb);
}
void vmpressure_notify(unsigned long pressure)
{
blocking_notifier_call_chain(&vmpressure_notifier, pressure, NULL);
}
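/*
 * Illustrative sketch, not part of this file: an in-kernel consumer of the
 * global notifier chain above. "demo_vmpressure_cb" and demo_vmpressure_nb
 * are made-up names; "action" carries the pressure value passed to
 * vmpressure_notify(), and registration would be done from the consumer's
 * init path with vmpressure_notifier_register(&demo_vmpressure_nb).
 */
static int demo_vmpressure_cb(struct notifier_block *nb,
			      unsigned long action, void *data)
{
	unsigned long pressure = action;

	pr_debug("vmpressure event: %lu\n", pressure);
	return NOTIFY_OK;
}

static struct notifier_block demo_vmpressure_nb = {
	.notifier_call = demo_vmpressure_cb,
};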
/*
 * When there are too few pages left to scan, vmpressure() may miss the
* critical pressure as number of pages will be less than "window size".
 * However, in that case the vmscan priority will rise quickly as the
 * reclaimer tries to scan LRUs more deeply.
*
* The vmscan logic considers these special priorities:
*
* prio == DEF_PRIORITY (12): reclaimer starts with that value
* prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed
* prio == 0 : close to OOM, kernel scans every page in an lru
*
* Any value in this range is acceptable for this tunable (i.e. from 12 to
* 0). Current value for the vmpressure_level_critical_prio is chosen
* empirically, but the number, in essence, means that we consider
* critical level when scanning depth is ~10% of the lru size (vmscan
* scans 'lru_size >> prio' pages, so it is actually 12.5%, or one
 * eighth).
*/
static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10);
static struct vmpressure *work_to_vmpressure(struct work_struct *work)
{
return container_of(work, struct vmpressure, work);
}
#ifdef CONFIG_MEMCG
static struct vmpressure *cg_to_vmpressure(struct cgroup *cg)
{
return css_to_vmpressure(cgroup_subsys_state(cg, mem_cgroup_subsys_id));
}
static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
{
struct cgroup *cg = vmpressure_to_css(vmpr)->cgroup;
struct mem_cgroup *memcg = mem_cgroup_from_cont(cg);
memcg = parent_mem_cgroup(memcg);
if (!memcg)
return NULL;
return memcg_to_vmpressure(memcg);
}
#else
static struct vmpressure *cg_to_vmpressure(struct cgroup *cg)
{
return NULL;
}
static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
{
return NULL;
}
#endif
enum vmpressure_levels {
VMPRESSURE_LOW = 0,
VMPRESSURE_MEDIUM,
VMPRESSURE_CRITICAL,
VMPRESSURE_NUM_LEVELS,
};
static const char * const vmpressure_str_levels[] = {
[VMPRESSURE_LOW] = "low",
[VMPRESSURE_MEDIUM] = "medium",
[VMPRESSURE_CRITICAL] = "critical",
};
static enum vmpressure_levels vmpressure_level(unsigned long pressure)
{
if (pressure >= vmpressure_level_critical)
return VMPRESSURE_CRITICAL;
else if (pressure >= vmpressure_level_med)
return VMPRESSURE_MEDIUM;
return VMPRESSURE_LOW;
}
static unsigned long vmpressure_calc_pressure(unsigned long scanned,
unsigned long reclaimed)
{
unsigned long scale = scanned + reclaimed;
unsigned long pressure = 0;
/*
* reclaimed can be greater than scanned in cases
* like THP, where the scanned is 1 and reclaimed
* could be 512
*/
if (reclaimed >= scanned)
goto out;
/*
* We calculate the ratio (in percents) of how many pages were
* scanned vs. reclaimed in a given time frame (window). Note that
* time is in VM reclaimer's "ticks", i.e. number of pages
* scanned. This makes it possible to set desired reaction time
* and serves as a ratelimit.
*/
pressure = scale - (reclaimed * scale / scanned);
pressure = pressure * 100 / scale;
out:
pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure,
scanned, reclaimed);
return pressure;
}
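/*
 * Illustrative worked example, not part of this file: with scanned == 400 and
 * reclaimed == 100, scale is 500, so pressure becomes
 * 500 - (100 * 500 / 400) = 375 and then 375 * 100 / 500 = 75, i.e. 75% of
 * the scanned pages were not reclaimed, which vmpressure_level() above maps
 * to the "medium" level.
 */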
static unsigned long vmpressure_account_stall(unsigned long pressure,
unsigned long stall, unsigned long scanned)
{
unsigned long scale;
if (pressure < allocstall_threshold)
return pressure;
scale = ((vmpressure_scale_max - pressure) * stall) / scanned;
return pressure + scale;
}
struct vmpressure_event {
struct eventfd_ctx *efd;
enum vmpressure_levels level;
struct list_head node;
};
static bool vmpressure_event(struct vmpressure *vmpr,
unsigned long scanned, unsigned long reclaimed)
{
struct vmpressure_event *ev;
enum vmpressure_levels level;
unsigned long pressure;
bool signalled = false;
pressure = vmpressure_calc_pressure(scanned, reclaimed);
level = vmpressure_level(pressure);
mutex_lock(&vmpr->events_lock);
list_for_each_entry(ev, &vmpr->events, node) {
if (level >= ev->level) {
eventfd_signal(ev->efd, 1);
signalled = true;
}
}
mutex_unlock(&vmpr->events_lock);
return signalled;
}
static void vmpressure_work_fn(struct work_struct *work)
{
struct vmpressure *vmpr = work_to_vmpressure(work);
unsigned long scanned;
unsigned long reclaimed;
/*
* Several contexts might be calling vmpressure(), so it is
* possible that the work was rescheduled again before the old
* work context cleared the counters. In that case we will run
* just after the old work returns, but then scanned might be zero
* here. No need for any locks here since we don't care if
* vmpr->reclaimed is in sync.
*/
if (!vmpr->scanned)
return;
mutex_lock(&vmpr->sr_lock);
scanned = vmpr->scanned;
reclaimed = vmpr->reclaimed;
vmpr->scanned = 0;
vmpr->reclaimed = 0;
mutex_unlock(&vmpr->sr_lock);
do {
if (vmpressure_event(vmpr, scanned, reclaimed))
break;
/*
* If not handled, propagate the event upward into the
* hierarchy.
*/
} while ((vmpr = vmpressure_parent(vmpr)));
}
void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg,
unsigned long scanned, unsigned long reclaimed)
{
struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
BUG_ON(!vmpr);
/*
* Here we only want to account pressure that userland is able to
* help us with. For example, suppose that DMA zone is under
* pressure; if we notify userland about that kind of pressure,
* then it will be mostly a waste as it will trigger unnecessary
* freeing of memory by userland (since userland is more likely to
* have HIGHMEM/MOVABLE pages instead of the DMA fallback). That
* is why we include only movable, highmem and FS/IO pages.
* Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so
* we account it too.
*/
if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
return;
/*
* If we got here with no pages scanned, then that is an indicator
* that reclaimer was unable to find any shrinkable LRUs at the
* current scanning depth. But it does not mean that we should
* report the critical pressure, yet. If the scanning priority
* (scanning depth) goes too high (deep), we will be notified
* through vmpressure_prio(). But so far, keep calm.
*/
if (!scanned)
return;
mutex_lock(&vmpr->sr_lock);
vmpr->scanned += scanned;
vmpr->reclaimed += reclaimed;
scanned = vmpr->scanned;
mutex_unlock(&vmpr->sr_lock);
if (scanned < vmpressure_win || work_pending(&vmpr->work))
return;
schedule_work(&vmpr->work);
}
void vmpressure_global(gfp_t gfp, unsigned long scanned,
unsigned long reclaimed)
{
struct vmpressure *vmpr = &global_vmpressure;
unsigned long pressure;
unsigned long stall;
if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
return;
if (!scanned)
return;
mutex_lock(&vmpr->sr_lock);
vmpr->scanned += scanned;
vmpr->reclaimed += reclaimed;
if (!current_is_kswapd())
vmpr->stall += scanned;
stall = vmpr->stall;
scanned = vmpr->scanned;
reclaimed = vmpr->reclaimed;
mutex_unlock(&vmpr->sr_lock);
if (scanned < vmpressure_win)
return;
mutex_lock(&vmpr->sr_lock);
vmpr->scanned = 0;
vmpr->reclaimed = 0;
vmpr->stall = 0;
mutex_unlock(&vmpr->sr_lock);
pressure = vmpressure_calc_pressure(scanned, reclaimed);
pressure = vmpressure_account_stall(pressure, stall, scanned);
vmpressure_notify(pressure);
}
/**
* vmpressure() - Account memory pressure through scanned/reclaimed ratio
* @gfp: reclaimer's gfp mask
* @memcg: cgroup memory controller handle
* @scanned: number of pages scanned
* @reclaimed: number of pages reclaimed
*
* This function should be called from the vmscan reclaim path to account
* "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
* pressure index is then further refined and averaged over time.
*
* This function does not return any value.
*/
void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
unsigned long scanned, unsigned long reclaimed)
{
if (!memcg)
vmpressure_global(gfp, scanned, reclaimed);
if (IS_ENABLED(CONFIG_MEMCG))
vmpressure_memcg(gfp, memcg, scanned, reclaimed);
}
/**
* vmpressure_prio() - Account memory pressure through reclaimer priority level
* @gfp: reclaimer's gfp mask
* @memcg: cgroup memory controller handle
* @prio: reclaimer's priority
*
* This function should be called from the reclaim path every time when
* the vmscan's reclaiming priority (scanning depth) changes.
*
* This function does not return any value.
*/
void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
{
/*
* We only use prio for accounting critical level. For more info
* see comment for vmpressure_level_critical_prio variable above.
*/
if (prio > vmpressure_level_critical_prio)
return;
/*
	 * OK, the prio is below the threshold; update the vmpressure
	 * information before the shrinker dives into a long round of
	 * vmscan shrinking. Passing scanned = vmpressure_win, reclaimed = 0
* to the vmpressure() basically means that we signal 'critical'
* level.
*/
vmpressure(gfp, memcg, vmpressure_win, 0);
}
/**
* vmpressure_register_event() - Bind vmpressure notifications to an eventfd
* @cg: cgroup that is interested in vmpressure notifications
* @cft: cgroup control files handle
* @eventfd: eventfd context to link notifications with
* @args: event arguments (used to set up a pressure level threshold)
*
* This function associates eventfd context with the vmpressure
* infrastructure, so that the notifications will be delivered to the
* @eventfd. The @args parameter is a string that denotes pressure level
* threshold (one of vmpressure_str_levels, i.e. "low", "medium", or
* "critical").
*
* This function should not be used directly, just pass it to (struct
* cftype).register_event, and then cgroup core will handle everything by
* itself.
*/
int vmpressure_register_event(struct cgroup *cg, struct cftype *cft,
struct eventfd_ctx *eventfd, const char *args)
{
struct vmpressure *vmpr = cg_to_vmpressure(cg);
struct vmpressure_event *ev;
int level;
BUG_ON(!vmpr);
for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) {
if (!strcmp(vmpressure_str_levels[level], args))
break;
}
if (level >= VMPRESSURE_NUM_LEVELS)
return -EINVAL;
ev = kzalloc(sizeof(*ev), GFP_KERNEL);
if (!ev)
return -ENOMEM;
ev->efd = eventfd;
ev->level = level;
mutex_lock(&vmpr->events_lock);
list_add(&ev->node, &vmpr->events);
mutex_unlock(&vmpr->events_lock);
return 0;
}
/**
* vmpressure_unregister_event() - Unbind eventfd from vmpressure
* @cg: cgroup handle
* @cft: cgroup control files handle
* @eventfd: eventfd context that was used to link vmpressure with the @cg
*
* This function does internal manipulations to detach the @eventfd from
* the vmpressure notifications, and then frees internal resources
* associated with the @eventfd (but the @eventfd itself is not freed).
*
* This function should not be used directly, just pass it to (struct
* cftype).unregister_event, and then cgroup core will handle everything
* by itself.
*/
void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft,
struct eventfd_ctx *eventfd)
{
struct vmpressure *vmpr = cg_to_vmpressure(cg);
struct vmpressure_event *ev;
if (!vmpr)
BUG();
mutex_lock(&vmpr->events_lock);
list_for_each_entry(ev, &vmpr->events, node) {
if (ev->efd != eventfd)
continue;
list_del(&ev->node);
kfree(ev);
break;
}
mutex_unlock(&vmpr->events_lock);
}
/**
* vmpressure_init() - Initialize vmpressure control structure
* @vmpr: Structure to be initialized
*
* This function should be called on every allocated vmpressure structure
* before any usage.
*/
void vmpressure_init(struct vmpressure *vmpr)
{
mutex_init(&vmpr->sr_lock);
mutex_init(&vmpr->events_lock);
INIT_LIST_HEAD(&vmpr->events);
INIT_WORK(&vmpr->work, vmpressure_work_fn);
}
int vmpressure_global_init(void)
{
vmpressure_init(&global_vmpressure);
return 0;
}
late_initcall(vmpressure_global_init);

View File

@@ -24,7 +24,6 @@
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/vmpressure.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
@@ -283,19 +282,24 @@ static inline int do_shrinker_shrink(struct shrinker *shrinker,
*
* Returns the number of slab objects which we shrunk.
*/
unsigned long shrink_slab(struct shrink_control *shrink,
unsigned long shrink_slab(struct shrink_control *shrinkctl,
unsigned long nr_pages_scanned,
unsigned long lru_pages)
{
struct shrinker *shrinker;
unsigned long ret = 0;
unsigned long freed = 0;
if (nr_pages_scanned == 0)
nr_pages_scanned = SWAP_CLUSTER_MAX;
if (!down_read_trylock(&shrinker_rwsem)) {
/* Assume we'll be able to shrink next time */
ret = 1;
/*
* If we would return 0, our callers would understand that we
* have nothing else to shrink and give up trying. By returning
* 1 we keep it going and assume we'll be able to shrink next
* time.
*/
freed = 1;
goto out;
}
@@ -303,7 +307,6 @@ unsigned long shrink_slab(struct shrink_control *shrink,
unsigned long long delta;
long total_scan;
long max_pass;
int shrink_ret = 0;
long nr;
long new_nr;
long batch_size = shrinker->batch ? shrinker->batch
@@ -313,8 +316,11 @@ unsigned long shrink_slab(struct shrink_control *shrink,
if (current_is_kswapd())
min_cache_size = 0;
max_pass = do_shrinker_shrink(shrinker, shrink, 0);
if (max_pass <= 0)
if (shrinker->count_objects)
max_pass = shrinker->count_objects(shrinker, shrinkctl);
else
max_pass = do_shrinker_shrink(shrinker, shrinkctl, 0);
if (max_pass == 0)
continue;
/*
@@ -330,8 +336,8 @@ unsigned long shrink_slab(struct shrink_control *shrink,
do_div(delta, lru_pages + 1);
total_scan += delta;
if (total_scan < 0) {
printk(KERN_ERR "shrink_slab: %pF negative objects to "
"delete nr=%ld\n",
printk(KERN_ERR
"shrink_slab: %pF negative objects to delete nr=%ld\n",
shrinker->shrink, total_scan);
total_scan = max_pass;
}
@@ -359,23 +365,36 @@ unsigned long shrink_slab(struct shrink_control *shrink,
if (total_scan > max_pass * 2)
total_scan = max_pass * 2;
trace_mm_shrink_slab_start(shrinker, shrink, nr,
trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
nr_pages_scanned, lru_pages,
max_pass, delta, total_scan);
while (total_scan > min_cache_size) {
int nr_before;
if (total_scan < batch_size)
batch_size = total_scan;
nr_before = do_shrinker_shrink(shrinker, shrink, 0);
shrink_ret = do_shrinker_shrink(shrinker, shrink,
batch_size);
if (shrink_ret == -1)
break;
if (shrink_ret < nr_before)
ret += nr_before - shrink_ret;
if (shrinker->scan_objects) {
unsigned long ret;
shrinkctl->nr_to_scan = batch_size;
ret = shrinker->scan_objects(shrinker, shrinkctl);
if (ret == SHRINK_STOP)
break;
freed += ret;
} else {
int nr_before;
long ret;
nr_before = do_shrinker_shrink(shrinker, shrinkctl, 0);
ret = do_shrinker_shrink(shrinker, shrinkctl,
batch_size);
if (ret == -1)
break;
if (ret < nr_before)
freed += nr_before - ret;
}
count_vm_events(SLABS_SCANNED, batch_size);
total_scan -= batch_size;
@@ -393,12 +412,12 @@ unsigned long shrink_slab(struct shrink_control *shrink,
else
new_nr = atomic_long_read(&shrinker->nr_in_batch);
trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
}
up_read(&shrinker_rwsem);
out:
cond_resched();
return ret;
return freed;
}
static inline int is_page_cache_freeable(struct page *page)
@@ -2312,11 +2331,6 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
}
memcg = mem_cgroup_iter(root, memcg, &reclaim);
} while (memcg);
vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
sc->nr_scanned - nr_scanned,
sc->nr_reclaimed - nr_reclaimed);
} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
sc->nr_scanned - nr_scanned, sc));
}
@@ -2497,8 +2511,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
count_vm_event(ALLOCSTALL);
do {
vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
sc->priority);
sc->nr_scanned = 0;
aborted_reclaim = shrink_zones(zonelist, sc);