Mirror of https://github.com/joel16/android_kernel_sony_msm8994_rework.git (synced 2024-11-23 03:50:08 +00:00)

Revert "Squash revert 3.18 binder/lmk changes"

This reverts commit 3b278f89ee.

parent 8ba63ccfde
commit 9a3c5248a3
@@ -40,7 +40,6 @@ Features:
 - soft limit
 - moving (recharging) account at moving a task is selectable.
 - usage threshold notifier
 - memory pressure notifier
 - oom-killer disable knob and oom-notifier
 - Root cgroup has no limit controls.

@@ -66,7 +65,6 @@ Brief summary of control files.
 memory.stat			 # show various statistics
 memory.use_hierarchy		 # set/show hierarchical account enabled
 memory.force_empty		 # trigger forced move charge to parent
 memory.pressure_level		 # set memory pressure notifications
 memory.swappiness		 # set/show swappiness parameter of vmscan
				   (See sysctl's vm.swappiness)
 memory.move_charge_at_immigrate # set/show controls of moving charges
@@ -766,73 +764,7 @@ At reading, current status of OOM is shown.
	under_oom	 0 or 1 (if 1, the memory cgroup is under OOM, tasks may
			 be stopped.)

11. Memory Pressure

The pressure level notifications can be used to monitor the memory
allocation cost; based on the pressure, applications can implement
different strategies of managing their memory resources. The pressure
levels are defined as follows:

The "low" level means that the system is reclaiming memory for new
allocations. Monitoring this reclaiming activity might be useful for
maintaining the cache level. Upon notification, the program (typically
"Activity Manager") might analyze vmstat and act in advance (e.g.
prematurely shut down unimportant services).

The "medium" level means that the system is experiencing medium memory
pressure: it might be swapping, paging out active file caches, and so on.
Upon this event, applications may decide to further analyze
vmstat/zoneinfo/memcg or internal memory usage statistics and free any
resources that can be easily reconstructed or re-read from disk.

The "critical" level means that the system is actively thrashing, is
about to run out of memory (OOM), or the in-kernel OOM killer is already
on its way to triggering. Applications should do whatever they can to
help the system. It might be too late to consult vmstat or any other
statistics, so it's advisable to take immediate action.

The events are propagated upward until the event is handled, i.e. the
events are not pass-through. Here is what this means: suppose you have
three cgroups, A->B->C, with an event listener set up on each of A, B
and C, and group C experiences some pressure. In this situation, only
group C will receive the notification; groups A and B will not. This is
done to avoid excessive "broadcasting" of messages, which disturbs the
system and which is especially bad if we are low on memory or thrashing.
So, organize the cgroups wisely, or propagate the events manually (or
ask us to implement pass-through events, explaining why you would need
them).

The file memory.pressure_level is only used to set up an eventfd. To
register a notification, an application must:

- create an eventfd using eventfd(2);
- open memory.pressure_level;
- write a string like "<event_fd> <fd of memory.pressure_level> <level>"
  to cgroup.event_control.

The application will then be notified through the eventfd when memory
pressure is at the specific level (or higher). Read/write operations on
memory.pressure_level itself are not implemented. A minimal user-space
sketch of this registration sequence is shown below.
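The following is a minimal C sketch of that sequence. It is illustrative
only and not part of the kernel sources; the cgroup path
/sys/fs/cgroup/memory/foo is an assumption and should be replaced with
whatever memory cgroup you want to watch.

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>
    #include <unistd.h>
    #include <fcntl.h>
    #include <sys/eventfd.h>

    int main(void)
    {
            const char *cg = "/sys/fs/cgroup/memory/foo";   /* assumed path */
            char path[256], line[64];
            uint64_t count;
            int efd, pfd, cfd;

            efd = eventfd(0, 0);                    /* 1. create an eventfd */

            snprintf(path, sizeof(path), "%s/memory.pressure_level", cg);
            pfd = open(path, O_RDONLY);             /* 2. open memory.pressure_level */

            snprintf(path, sizeof(path), "%s/cgroup.event_control", cg);
            cfd = open(path, O_WRONLY);
            if (efd < 0 || pfd < 0 || cfd < 0) {
                    perror("setup");
                    return 1;
            }

            /* 3. "<event_fd> <fd of memory.pressure_level> <level>" */
            snprintf(line, sizeof(line), "%d %d low", efd, pfd);
            if (write(cfd, line, strlen(line)) < 0) {
                    perror("cgroup.event_control");
                    return 1;
            }

            /* Each read blocks until at least one "low" (or higher)
             * pressure notification has fired since the last read. */
            while (read(efd, &count, sizeof(count)) == sizeof(count))
                    printf("memory pressure event, count=%llu\n",
                           (unsigned long long)count);
            return 0;
    }

The cgroup_event_listener helper used in the test script below wraps
essentially the same sequence.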
Test:

Here is a small script example that makes a new cgroup, sets up a
memory limit, sets up a notification in the cgroup, and then makes the
child cgroup experience critical pressure:

    # cd /sys/fs/cgroup/memory/
    # mkdir foo
    # cd foo
    # cgroup_event_listener memory.pressure_level low &
    # echo 8000000 > memory.limit_in_bytes
    # echo 8000000 > memory.memsw.limit_in_bytes
    # echo $$ > tasks
    # dd if=/dev/zero | read x

(Expect a bunch of notifications, and eventually, the oom-killer will
trigger.)
12. TODO
11. TODO

1. Add support for accounting huge pages (as a separate controller)
2. Make per-cgroup scanner reclaim not-shared pages first
@@ -174,8 +174,6 @@ source "drivers/ipack/Kconfig"

source "drivers/reset/Kconfig"

source "drivers/android/Kconfig"

source "drivers/coresight/Kconfig"

source "drivers/bif/Kconfig"
@@ -158,7 +158,6 @@ obj-$(CONFIG_IIO) += iio/
obj-$(CONFIG_VME_BUS) += vme/
obj-$(CONFIG_IPACK_BUS) += ipack/
obj-$(CONFIG_NTB) += ntb/
obj-$(CONFIG_ANDROID) += android/

obj-$(CONFIG_CORESIGHT) += coresight/
obj-$(CONFIG_ESOC) += esoc/
@@ -1,49 +0,0 @@
menu "Android"

config ANDROID
	bool "Android Drivers"
	---help---
	  Enable support for various drivers needed on the Android platform

if ANDROID

config ANDROID_BINDER_IPC
	bool "Android Binder IPC Driver"
	depends on MMU
	default n
	---help---
	  Binder is used in Android for both communication between processes,
	  and remote method invocation.

	  This means one Android process can call a method/routine in another
	  Android process, using Binder to identify, invoke and pass arguments
	  between said processes.

config ANDROID_BINDER_DEVICES
	string "Android Binder devices"
	depends on ANDROID_BINDER_IPC
	default "binder,hwbinder,vndbinder"
	---help---
	  Default value for the binder.devices parameter.

	  The binder.devices parameter is a comma-separated list of strings
	  that specifies the names of the binder device nodes that will be
	  created. Each binder device has its own context manager, and is
	  therefore logically separated from the other devices.

config ANDROID_BINDER_IPC_32BIT
	bool
	depends on !64BIT && ANDROID_BINDER_IPC
	default y
	---help---
	  The Binder API has been changed to support both 32 and 64bit
	  applications in a mixed environment.

	  Enable this to support an old 32-bit Android user-space (v4.4 and
	  earlier).

	  Note that enabling this will break newer Android user-space.

endif # if ANDROID

endmenu
@@ -1,3 +0,0 @@
ccflags-y += -I$(src)   # needed for trace events

obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o
File diff suppressed because it is too large
@@ -8,6 +8,53 @@ config ANDROID

if ANDROID

config ANDROID_BINDER_IPC
	bool "Android Binder IPC Driver"
	depends on MMU
	default n
	---help---
	  Binder is used in Android for both communication between processes,
	  and remote method invocation.

	  This means one Android process can call a method/routine in another
	  Android process, using Binder to identify, invoke and pass arguments
	  between said processes.

config ANDROID_BINDER_IPC_32BIT
	bool "Use old (Android 4.4 and earlier) 32-bit binder API"
	depends on !64BIT && ANDROID_BINDER_IPC
	default y
	---help---
	  The Binder API has been changed to support both 32 and 64bit
	  applications in a mixed environment.

	  Enable this to support an old 32-bit Android user-space (v4.4 and
	  earlier).

	  Note that enabling this will break newer Android user-space.

config ANDROID_BINDER_DEVICES
	string "Android Binder devices"
	depends on ANDROID_BINDER_IPC
	default "binder,hwbinder,vndbinder"
	---help---
	  Default value for the binder.devices parameter.

	  The binder.devices parameter is a comma-separated list of strings
	  that specifies the names of the binder device nodes that will be
	  created. Each binder device has its own context manager, and is
	  therefore logically separated from the other devices.
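As an aside on the help text above, each node listed in binder.devices
appears as its own character device with an independent context manager.
The short user-space sketch below is not part of this commit; it assumes
the binder UAPI header is exported to user-space as
<linux/android/binder.h>, whereas in this staging tree it lives under
drivers/staging/android/uapi/. It simply opens one of those nodes and
queries its protocol version:

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/android/binder.h>

    int main(void)
    {
            struct binder_version vers;
            int fd = open("/dev/hwbinder", O_RDWR | O_CLOEXEC);

            if (fd < 0 || ioctl(fd, BINDER_VERSION, &vers) < 0) {
                    perror("hwbinder");
                    return 1;
            }
            /* Services registered via /dev/binder are not visible here,
             * because each device node has its own context manager. */
            printf("hwbinder protocol version %d\n", vers.protocol_version);
            close(fd);
            return 0;
    }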
config ANDROID_BINDER_IPC_SELFTEST
	bool "Android Binder IPC Driver Selftest"
	depends on ANDROID_BINDER_IPC
	---help---
	  This feature allows binder selftest to run.

	  Binder selftest checks the allocation and free of binder buffers
	  exhaustively with combinations of various buffer sizes and
	  alignments.

config ASHMEM
	bool "Enable the Anonymous Shared Memory Subsystem"
	default n
@@ -3,6 +3,8 @@ ccflags-y += -I$(src) # needed for trace events
obj-y += ion/
obj-$(CONFIG_FIQ_DEBUGGER) += fiq_debugger/

obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o binder_alloc.o
obj-$(CONFIG_ANDROID_BINDER_IPC_SELFTEST) += binder_alloc_selftest.o
obj-$(CONFIG_ASHMEM) += ashmem.o
obj-$(CONFIG_ANDROID_TIMED_OUTPUT) += timed_output.o
obj-$(CONFIG_ANDROID_TIMED_GPIO) += timed_gpio.o
@@ -268,24 +268,23 @@ static loff_t ashmem_llseek(struct file *file, loff_t offset, int origin)
	mutex_lock(&ashmem_mutex);

	if (asma->size == 0) {
		ret = -EINVAL;
		goto out;
		mutex_unlock(&ashmem_mutex);
		return -EINVAL;
	}

	if (!asma->file) {
		ret = -EBADF;
		goto out;
		mutex_unlock(&ashmem_mutex);
		return -EBADF;
	}

	ret = asma->file->f_op->llseek(asma->file, offset, origin);
	mutex_unlock(&ashmem_mutex);

	ret = vfs_llseek(asma->file, offset, origin);
	if (ret < 0)
		goto out;
		return ret;

	/** Copy f_pos from backing file, since f_ops->llseek() sets it */
	file->f_pos = asma->file->f_pos;

out:
	mutex_unlock(&ashmem_mutex);
	return ret;
}

@@ -309,6 +308,12 @@ static int ashmem_mmap(struct file *file, struct vm_area_struct *vma)
		goto out;
	}

	/* requested mapping size larger than object size */
	if (vma->vm_end - vma->vm_start > PAGE_ALIGN(asma->size)) {
		ret = -EINVAL;
		goto out;
	}

	/* requested protection bits must match our allowed protection mask */
	if (unlikely((vma->vm_flags & ~calc_vm_prot_bits(asma->prot_mask)) &
		     calc_vm_prot_bits(PROT_MASK))) {

@@ -330,6 +335,7 @@ static int ashmem_mmap(struct file *file, struct vm_area_struct *vma)
			ret = PTR_ERR(vmfile);
			goto out;
		}
		vmfile->f_mode |= FMODE_LSEEK;
		asma->file = vmfile;
	}
	get_file(asma->file);
drivers/staging/android/binder.c (new file, 5994 lines)
File diff suppressed because it is too large
drivers/staging/android/binder.h (new file, 30 lines)
@@ -0,0 +1,30 @@
/*
 * Copyright (C) 2008 Google, Inc.
 *
 * Based on, but no longer compatible with, the original
 * OpenBinder.org binder driver interface, which is:
 *
 * Copyright (c) 2005 Palmsource, Inc.
 *
 * This software is licensed under the terms of the GNU General Public
 * License version 2, as published by the Free Software Foundation, and
 * may be copied, distributed, and modified under those terms.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 */

#ifndef _LINUX_BINDER_H
#define _LINUX_BINDER_H

#ifdef CONFIG_ANDROID_BINDER_IPC_32BIT
#define BINDER_IPC_32BIT 1
#endif

#include "uapi/binder.h"

#endif /* _LINUX_BINDER_H */
drivers/staging/android/binder_alloc.c (new file, 1012 lines)
File diff suppressed because it is too large
drivers/staging/android/binder_alloc.h (new file, 186 lines)
@ -0,0 +1,186 @@
|
||||
/*
|
||||
* Copyright (C) 2017 Google, Inc.
|
||||
*
|
||||
* This software is licensed under the terms of the GNU General Public
|
||||
* License version 2, as published by the Free Software Foundation, and
|
||||
* may be copied, distributed, and modified under those terms.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef _LINUX_BINDER_ALLOC_H
|
||||
#define _LINUX_BINDER_ALLOC_H
|
||||
|
||||
#include <linux/rbtree.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/rtmutex.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/list_lru.h>
|
||||
|
||||
extern struct list_lru binder_alloc_lru;
|
||||
struct binder_transaction;
|
||||
|
||||
/**
|
||||
* struct binder_buffer - buffer used for binder transactions
|
||||
* @entry: entry alloc->buffers
|
||||
* @rb_node: node for allocated_buffers/free_buffers rb trees
|
||||
* @free: true if buffer is free
|
||||
* @allow_user_free:    userspace may free the buffer via BC_FREE_BUFFER
* @async_transaction:  buffer was allocated for an async (one-way) transaction
* @debug_id:           unique ID for debugging
* @transaction:        transaction the buffer is currently in use for
* @target_node:        binder_node this buffer is targeting
* @data_size:          size of the transaction data
* @offsets_size:       size of the array of object offsets
* @extra_buffers_size: size of space for other objects (like sg lists)
* @data:               pointer to the base of the buffer space
|
||||
*
|
||||
* Bookkeeping structure for binder transaction buffers
|
||||
*/
|
||||
struct binder_buffer {
|
||||
struct list_head entry; /* free and allocated entries by address */
|
||||
struct rb_node rb_node; /* free entry by size or allocated entry */
|
||||
/* by address */
|
||||
unsigned free:1;
|
||||
unsigned allow_user_free:1;
|
||||
unsigned async_transaction:1;
|
||||
unsigned debug_id:29;
|
||||
|
||||
struct binder_transaction *transaction;
|
||||
|
||||
struct binder_node *target_node;
|
||||
size_t data_size;
|
||||
size_t offsets_size;
|
||||
size_t extra_buffers_size;
|
||||
void *data;
|
||||
};
|
||||
|
||||
/**
|
||||
* struct binder_lru_page - page object used for binder shrinker
|
||||
* @page_ptr: pointer to physical page in mmap'd space
|
||||
* @lru: entry in binder_alloc_lru
|
||||
* @alloc: binder_alloc for a proc
|
||||
*/
|
||||
struct binder_lru_page {
|
||||
struct list_head lru;
|
||||
struct page *page_ptr;
|
||||
struct binder_alloc *alloc;
|
||||
};
|
||||
|
||||
/**
|
||||
* struct binder_alloc - per-binder proc state for binder allocator
|
||||
* @vma: vm_area_struct passed to mmap_handler
|
||||
* (invariant after mmap)
|
||||
* @tsk: tid for task that called init for this proc
|
||||
* (invariant after init)
|
||||
* @vma_vm_mm: copy of vma->vm_mm (invariant after mmap)
|
||||
* @buffer: base of per-proc address space mapped via mmap
|
||||
* @user_buffer_offset: offset between user and kernel VAs for buffer
|
||||
* @buffers: list of all buffers for this proc
|
||||
* @free_buffers: rb tree of buffers available for allocation
|
||||
* sorted by size
|
||||
* @allocated_buffers: rb tree of allocated buffers sorted by address
|
||||
* @free_async_space: VA space available for async buffers. This is
|
||||
* initialized at mmap time to 1/2 the full VA space
|
||||
* @pages: array of binder_lru_page
|
||||
* @buffer_size: size of address space specified via mmap
|
||||
* @pid: pid for associated binder_proc (invariant after init)
|
||||
* @pages_high: high watermark of offset in @pages
|
||||
*
|
||||
* Bookkeeping structure for per-proc address space management for binder
|
||||
* buffers. It is normally initialized during binder_init() and binder_mmap()
|
||||
* calls. The address space is used for both user-visible buffers and for
|
||||
* struct binder_buffer objects used to track the user buffers
|
||||
*/
|
||||
struct binder_alloc {
|
||||
struct mutex mutex;
|
||||
struct vm_area_struct *vma;
|
||||
struct mm_struct *vma_vm_mm;
|
||||
void *buffer;
|
||||
ptrdiff_t user_buffer_offset;
|
||||
struct list_head buffers;
|
||||
struct rb_root free_buffers;
|
||||
struct rb_root allocated_buffers;
|
||||
size_t free_async_space;
|
||||
struct binder_lru_page *pages;
|
||||
size_t buffer_size;
|
||||
uint32_t buffer_free;
|
||||
int pid;
|
||||
size_t pages_high;
|
||||
};
|
||||
|
||||
#ifdef CONFIG_ANDROID_BINDER_IPC_SELFTEST
|
||||
void binder_selftest_alloc(struct binder_alloc *alloc);
|
||||
#else
|
||||
static inline void binder_selftest_alloc(struct binder_alloc *alloc) {}
|
||||
#endif
|
||||
enum lru_status binder_alloc_free_page(struct list_head *item,
|
||||
spinlock_t *lock, void *cb_arg);
|
||||
extern struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc,
|
||||
size_t data_size,
|
||||
size_t offsets_size,
|
||||
size_t extra_buffers_size,
|
||||
int is_async);
|
||||
extern void binder_alloc_init(struct binder_alloc *alloc);
|
||||
void binder_alloc_shrinker_init(void);
|
||||
extern void binder_alloc_vma_close(struct binder_alloc *alloc);
|
||||
extern struct binder_buffer *
|
||||
binder_alloc_prepare_to_free(struct binder_alloc *alloc,
|
||||
uintptr_t user_ptr);
|
||||
extern void binder_alloc_free_buf(struct binder_alloc *alloc,
|
||||
struct binder_buffer *buffer);
|
||||
extern int binder_alloc_mmap_handler(struct binder_alloc *alloc,
|
||||
struct vm_area_struct *vma);
|
||||
extern void binder_alloc_deferred_release(struct binder_alloc *alloc);
|
||||
extern int binder_alloc_get_allocated_count(struct binder_alloc *alloc);
|
||||
extern void binder_alloc_print_allocated(struct seq_file *m,
|
||||
struct binder_alloc *alloc);
|
||||
void binder_alloc_print_pages(struct seq_file *m,
|
||||
struct binder_alloc *alloc);
|
||||
|
||||
/**
|
||||
* binder_alloc_get_free_async_space() - get free space available for async
|
||||
* @alloc: binder_alloc for this proc
|
||||
*
|
||||
* Return: the bytes remaining in the address-space for async transactions
|
||||
*/
|
||||
static inline size_t
|
||||
binder_alloc_get_free_async_space(struct binder_alloc *alloc)
|
||||
{
|
||||
size_t free_async_space;
|
||||
|
||||
mutex_lock(&alloc->mutex);
|
||||
free_async_space = alloc->free_async_space;
|
||||
mutex_unlock(&alloc->mutex);
|
||||
return free_async_space;
|
||||
}
|
||||
|
||||
/**
|
||||
* binder_alloc_get_user_buffer_offset() - get offset between kernel/user addrs
|
||||
* @alloc: binder_alloc for this proc
|
||||
*
|
||||
* Return: the offset between kernel and user-space addresses to use for
|
||||
* virtual address conversion
|
||||
*/
|
||||
static inline ptrdiff_t
|
||||
binder_alloc_get_user_buffer_offset(struct binder_alloc *alloc)
|
||||
{
|
||||
/*
|
||||
* user_buffer_offset is constant if vma is set and
|
||||
* undefined if vma is not set. It is possible to
|
||||
* get here with !alloc->vma if the target process
|
||||
* is dying while a transaction is being initiated.
|
||||
* Returning the old value is ok in this case and
|
||||
* the transaction will fail.
|
||||
*/
|
||||
return alloc->user_buffer_offset;
|
||||
}
|
||||
|
||||
#endif /* _LINUX_BINDER_ALLOC_H */
|
||||
|
drivers/staging/android/binder_alloc_selftest.c (new file, 310 lines)
@ -0,0 +1,310 @@
|
||||
/* binder_alloc_selftest.c
|
||||
*
|
||||
* Android IPC Subsystem
|
||||
*
|
||||
* Copyright (C) 2017 Google, Inc.
|
||||
*
|
||||
* This software is licensed under the terms of the GNU General Public
|
||||
* License version 2, as published by the Free Software Foundation, and
|
||||
* may be copied, distributed, and modified under those terms.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
*/
|
||||
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
|
||||
#include <linux/mm_types.h>
|
||||
#include <linux/err.h>
|
||||
#include "binder_alloc.h"
|
||||
|
||||
#define BUFFER_NUM 5
|
||||
#define BUFFER_MIN_SIZE (PAGE_SIZE / 8)
|
||||
|
||||
static bool binder_selftest_run = true;
|
||||
static int binder_selftest_failures;
|
||||
static DEFINE_MUTEX(binder_selftest_lock);
|
||||
|
||||
/**
|
||||
* enum buf_end_align_type - Page alignment of a buffer
|
||||
* end with regard to the end of the previous buffer.
|
||||
*
|
||||
* In the pictures below, buf2 refers to the buffer we
|
||||
* are aligning. buf1 refers to previous buffer by addr.
|
||||
* Symbol [ means the start of a buffer, ] means the end
|
||||
* of a buffer, and | means page boundaries.
|
||||
*/
|
||||
enum buf_end_align_type {
|
||||
/**
|
||||
* @SAME_PAGE_UNALIGNED: The end of this buffer is on
|
||||
* the same page as the end of the previous buffer and
|
||||
* is not page aligned. Examples:
|
||||
* buf1 ][ buf2 ][ ...
|
||||
* buf1 ]|[ buf2 ][ ...
|
||||
*/
|
||||
SAME_PAGE_UNALIGNED = 0,
|
||||
/**
|
||||
* @SAME_PAGE_ALIGNED: When the end of the previous buffer
|
||||
* is not page aligned, the end of this buffer is on the
|
||||
* same page as the end of the previous buffer and is page
|
||||
* aligned. When the previous buffer is page aligned, the
|
||||
* end of this buffer is aligned to the next page boundary.
|
||||
* Examples:
|
||||
* buf1 ][ buf2 ]| ...
|
||||
* buf1 ]|[ buf2 ]| ...
|
||||
*/
|
||||
SAME_PAGE_ALIGNED,
|
||||
/**
|
||||
* @NEXT_PAGE_UNALIGNED: The end of this buffer is on
|
||||
* the page next to the end of the previous buffer and
|
||||
* is not page aligned. Examples:
|
||||
* buf1 ][ buf2 | buf2 ][ ...
|
||||
* buf1 ]|[ buf2 | buf2 ][ ...
|
||||
*/
|
||||
NEXT_PAGE_UNALIGNED,
|
||||
/**
|
||||
* @NEXT_PAGE_ALIGNED: The end of this buffer is on
|
||||
* the page next to the end of the previous buffer and
|
||||
* is page aligned. Examples:
|
||||
* buf1 ][ buf2 | buf2 ]| ...
|
||||
* buf1 ]|[ buf2 | buf2 ]| ...
|
||||
*/
|
||||
NEXT_PAGE_ALIGNED,
|
||||
/**
|
||||
* @NEXT_NEXT_UNALIGNED: The end of this buffer is on
|
||||
* the page that follows the page after the end of the
|
||||
* previous buffer and is not page aligned. Examples:
|
||||
* buf1 ][ buf2 | buf2 | buf2 ][ ...
|
||||
* buf1 ]|[ buf2 | buf2 | buf2 ][ ...
|
||||
*/
|
||||
NEXT_NEXT_UNALIGNED,
|
||||
LOOP_END,
|
||||
};
|
||||
|
||||
static void pr_err_size_seq(size_t *sizes, int *seq)
|
||||
{
|
||||
int i;
|
||||
|
||||
pr_err("alloc sizes: ");
|
||||
for (i = 0; i < BUFFER_NUM; i++)
|
||||
pr_cont("[%zu]", sizes[i]);
|
||||
pr_cont("\n");
|
||||
pr_err("free seq: ");
|
||||
for (i = 0; i < BUFFER_NUM; i++)
|
||||
pr_cont("[%d]", seq[i]);
|
||||
pr_cont("\n");
|
||||
}
|
||||
|
||||
static bool check_buffer_pages_allocated(struct binder_alloc *alloc,
|
||||
struct binder_buffer *buffer,
|
||||
size_t size)
|
||||
{
|
||||
void *page_addr, *end;
|
||||
int page_index;
|
||||
|
||||
end = (void *)PAGE_ALIGN((uintptr_t)buffer->data + size);
|
||||
page_addr = buffer->data;
|
||||
for (; page_addr < end; page_addr += PAGE_SIZE) {
|
||||
page_index = (page_addr - alloc->buffer) / PAGE_SIZE;
|
||||
if (!alloc->pages[page_index].page_ptr ||
|
||||
!list_empty(&alloc->pages[page_index].lru)) {
|
||||
pr_err("expect alloc but is %s at page index %d\n",
|
||||
alloc->pages[page_index].page_ptr ?
|
||||
"lru" : "free", page_index);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static void binder_selftest_alloc_buf(struct binder_alloc *alloc,
|
||||
struct binder_buffer *buffers[],
|
||||
size_t *sizes, int *seq)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < BUFFER_NUM; i++) {
|
||||
buffers[i] = binder_alloc_new_buf(alloc, sizes[i], 0, 0, 0);
|
||||
if (IS_ERR(buffers[i]) ||
|
||||
!check_buffer_pages_allocated(alloc, buffers[i],
|
||||
sizes[i])) {
|
||||
pr_err_size_seq(sizes, seq);
|
||||
binder_selftest_failures++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void binder_selftest_free_buf(struct binder_alloc *alloc,
|
||||
struct binder_buffer *buffers[],
|
||||
size_t *sizes, int *seq, size_t end)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < BUFFER_NUM; i++)
|
||||
binder_alloc_free_buf(alloc, buffers[seq[i]]);
|
||||
|
||||
for (i = 0; i < end / PAGE_SIZE; i++) {
|
||||
/**
|
||||
* Error message on a free page can be false positive
|
||||
* if binder shrinker ran during binder_alloc_free_buf
|
||||
* calls above.
|
||||
*/
|
||||
if (list_empty(&alloc->pages[i].lru)) {
|
||||
pr_err_size_seq(sizes, seq);
|
||||
pr_err("expect lru but is %s at page index %d\n",
|
||||
alloc->pages[i].page_ptr ? "alloc" : "free", i);
|
||||
binder_selftest_failures++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void binder_selftest_free_page(struct binder_alloc *alloc)
|
||||
{
|
||||
int i;
|
||||
unsigned long count;
|
||||
|
||||
while ((count = list_lru_count(&binder_alloc_lru))) {
|
||||
list_lru_walk(&binder_alloc_lru, binder_alloc_free_page,
|
||||
NULL, count);
|
||||
}
|
||||
|
||||
for (i = 0; i < (alloc->buffer_size / PAGE_SIZE); i++) {
|
||||
if (alloc->pages[i].page_ptr) {
|
||||
pr_err("expect free but is %s at page index %d\n",
|
||||
list_empty(&alloc->pages[i].lru) ?
|
||||
"alloc" : "lru", i);
|
||||
binder_selftest_failures++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void binder_selftest_alloc_free(struct binder_alloc *alloc,
|
||||
size_t *sizes, int *seq, size_t end)
|
||||
{
|
||||
struct binder_buffer *buffers[BUFFER_NUM];
|
||||
|
||||
binder_selftest_alloc_buf(alloc, buffers, sizes, seq);
|
||||
binder_selftest_free_buf(alloc, buffers, sizes, seq, end);
|
||||
|
||||
/* Allocate from lru. */
|
||||
binder_selftest_alloc_buf(alloc, buffers, sizes, seq);
|
||||
if (list_lru_count(&binder_alloc_lru))
|
||||
pr_err("lru list should be empty but is not\n");
|
||||
|
||||
binder_selftest_free_buf(alloc, buffers, sizes, seq, end);
|
||||
binder_selftest_free_page(alloc);
|
||||
}
|
||||
|
||||
static bool is_dup(int *seq, int index, int val)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < index; i++) {
|
||||
if (seq[i] == val)
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Generate BUFFER_NUM factorial free orders. */
|
||||
static void binder_selftest_free_seq(struct binder_alloc *alloc,
|
||||
size_t *sizes, int *seq,
|
||||
int index, size_t end)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (index == BUFFER_NUM) {
|
||||
binder_selftest_alloc_free(alloc, sizes, seq, end);
|
||||
return;
|
||||
}
|
||||
for (i = 0; i < BUFFER_NUM; i++) {
|
||||
if (is_dup(seq, index, i))
|
||||
continue;
|
||||
seq[index] = i;
|
||||
binder_selftest_free_seq(alloc, sizes, seq, index + 1, end);
|
||||
}
|
||||
}
|
||||
|
||||
static void binder_selftest_alloc_size(struct binder_alloc *alloc,
|
||||
size_t *end_offset)
|
||||
{
|
||||
int i;
|
||||
int seq[BUFFER_NUM] = {0};
|
||||
size_t front_sizes[BUFFER_NUM];
|
||||
size_t back_sizes[BUFFER_NUM];
|
||||
size_t last_offset, offset = 0;
|
||||
|
||||
for (i = 0; i < BUFFER_NUM; i++) {
|
||||
last_offset = offset;
|
||||
offset = end_offset[i];
|
||||
front_sizes[i] = offset - last_offset;
|
||||
back_sizes[BUFFER_NUM - i - 1] = front_sizes[i];
|
||||
}
|
||||
/*
|
||||
* Buffers share the first or last few pages.
|
||||
* Only BUFFER_NUM - 1 buffer sizes are adjustable since
|
||||
* we need one giant buffer before getting to the last page.
|
||||
*/
|
||||
back_sizes[0] += alloc->buffer_size - end_offset[BUFFER_NUM - 1];
|
||||
binder_selftest_free_seq(alloc, front_sizes, seq, 0,
|
||||
end_offset[BUFFER_NUM - 1]);
|
||||
binder_selftest_free_seq(alloc, back_sizes, seq, 0, alloc->buffer_size);
|
||||
}
|
||||
|
||||
static void binder_selftest_alloc_offset(struct binder_alloc *alloc,
|
||||
size_t *end_offset, int index)
|
||||
{
|
||||
int align;
|
||||
size_t end, prev;
|
||||
|
||||
if (index == BUFFER_NUM) {
|
||||
binder_selftest_alloc_size(alloc, end_offset);
|
||||
return;
|
||||
}
|
||||
prev = index == 0 ? 0 : end_offset[index - 1];
|
||||
end = prev;
|
||||
|
||||
BUILD_BUG_ON(BUFFER_MIN_SIZE * BUFFER_NUM >= PAGE_SIZE);
|
||||
|
||||
for (align = SAME_PAGE_UNALIGNED; align < LOOP_END; align++) {
|
||||
if (align % 2)
|
||||
end = ALIGN(end, PAGE_SIZE);
|
||||
else
|
||||
end += BUFFER_MIN_SIZE;
|
||||
end_offset[index] = end;
|
||||
binder_selftest_alloc_offset(alloc, end_offset, index + 1);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* binder_selftest_alloc() - Test alloc and free of buffer pages.
|
||||
* @alloc: Pointer to alloc struct.
|
||||
*
|
||||
* Allocate BUFFER_NUM buffers to cover all page alignment cases,
|
||||
* then free them in all orders possible. Check that pages are
|
||||
* correctly allocated, put onto lru when buffers are freed, and
|
||||
* are freed when binder_alloc_free_page is called.
|
||||
*/
|
||||
void binder_selftest_alloc(struct binder_alloc *alloc)
|
||||
{
|
||||
size_t end_offset[BUFFER_NUM];
|
||||
|
||||
if (!binder_selftest_run)
|
||||
return;
|
||||
mutex_lock(&binder_selftest_lock);
|
||||
if (!binder_selftest_run || !alloc->vma)
|
||||
goto done;
|
||||
pr_info("STARTED\n");
|
||||
binder_selftest_alloc_offset(alloc, end_offset, 0);
|
||||
binder_selftest_run = false;
|
||||
if (binder_selftest_failures > 0)
|
||||
pr_info("%d tests FAILED\n", binder_selftest_failures);
|
||||
else
|
||||
pr_info("PASSED\n");
|
||||
|
||||
done:
|
||||
mutex_unlock(&binder_selftest_lock);
|
||||
}
|
@ -23,7 +23,8 @@
|
||||
struct binder_buffer;
|
||||
struct binder_node;
|
||||
struct binder_proc;
|
||||
struct binder_ref;
|
||||
struct binder_alloc;
|
||||
struct binder_ref_data;
|
||||
struct binder_thread;
|
||||
struct binder_transaction;
|
||||
|
||||
@ -84,6 +85,30 @@ DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_ioctl_done);
|
||||
DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_write_done);
|
||||
DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_read_done);
|
||||
|
||||
TRACE_EVENT(binder_set_priority,
|
||||
TP_PROTO(int proc, int thread, unsigned int old_prio,
|
||||
unsigned int desired_prio, unsigned int new_prio),
|
||||
TP_ARGS(proc, thread, old_prio, new_prio, desired_prio),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(int, proc)
|
||||
__field(int, thread)
|
||||
__field(unsigned int, old_prio)
|
||||
__field(unsigned int, new_prio)
|
||||
__field(unsigned int, desired_prio)
|
||||
),
|
||||
TP_fast_assign(
|
||||
__entry->proc = proc;
|
||||
__entry->thread = thread;
|
||||
__entry->old_prio = old_prio;
|
||||
__entry->new_prio = new_prio;
|
||||
__entry->desired_prio = desired_prio;
|
||||
),
|
||||
TP_printk("proc=%d thread=%d old=%d => new=%d desired=%d",
|
||||
__entry->proc, __entry->thread, __entry->old_prio,
|
||||
__entry->new_prio, __entry->desired_prio)
|
||||
);
|
||||
|
||||
TRACE_EVENT(binder_wait_for_work,
|
||||
TP_PROTO(bool proc_work, bool transaction_stack, bool thread_todo),
|
||||
TP_ARGS(proc_work, transaction_stack, thread_todo),
|
||||
@ -146,8 +171,8 @@ TRACE_EVENT(binder_transaction_received,
|
||||
|
||||
TRACE_EVENT(binder_transaction_node_to_ref,
|
||||
TP_PROTO(struct binder_transaction *t, struct binder_node *node,
|
||||
struct binder_ref *ref),
|
||||
TP_ARGS(t, node, ref),
|
||||
struct binder_ref_data *rdata),
|
||||
TP_ARGS(t, node, rdata),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(int, debug_id)
|
||||
@ -160,8 +185,8 @@ TRACE_EVENT(binder_transaction_node_to_ref,
|
||||
__entry->debug_id = t->debug_id;
|
||||
__entry->node_debug_id = node->debug_id;
|
||||
__entry->node_ptr = node->ptr;
|
||||
__entry->ref_debug_id = ref->debug_id;
|
||||
__entry->ref_desc = ref->desc;
|
||||
__entry->ref_debug_id = rdata->debug_id;
|
||||
__entry->ref_desc = rdata->desc;
|
||||
),
|
||||
TP_printk("transaction=%d node=%d src_ptr=0x%016llx ==> dest_ref=%d dest_desc=%d",
|
||||
__entry->debug_id, __entry->node_debug_id,
|
||||
@ -170,8 +195,9 @@ TRACE_EVENT(binder_transaction_node_to_ref,
|
||||
);
|
||||
|
||||
TRACE_EVENT(binder_transaction_ref_to_node,
|
||||
TP_PROTO(struct binder_transaction *t, struct binder_ref *ref),
|
||||
TP_ARGS(t, ref),
|
||||
TP_PROTO(struct binder_transaction *t, struct binder_node *node,
|
||||
struct binder_ref_data *rdata),
|
||||
TP_ARGS(t, node, rdata),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(int, debug_id)
|
||||
@ -182,10 +208,10 @@ TRACE_EVENT(binder_transaction_ref_to_node,
|
||||
),
|
||||
TP_fast_assign(
|
||||
__entry->debug_id = t->debug_id;
|
||||
__entry->ref_debug_id = ref->debug_id;
|
||||
__entry->ref_desc = ref->desc;
|
||||
__entry->node_debug_id = ref->node->debug_id;
|
||||
__entry->node_ptr = ref->node->ptr;
|
||||
__entry->ref_debug_id = rdata->debug_id;
|
||||
__entry->ref_desc = rdata->desc;
|
||||
__entry->node_debug_id = node->debug_id;
|
||||
__entry->node_ptr = node->ptr;
|
||||
),
|
||||
TP_printk("transaction=%d node=%d src_ref=%d src_desc=%d ==> dest_ptr=0x%016llx",
|
||||
__entry->debug_id, __entry->node_debug_id,
|
||||
@ -194,9 +220,10 @@ TRACE_EVENT(binder_transaction_ref_to_node,
|
||||
);
|
||||
|
||||
TRACE_EVENT(binder_transaction_ref_to_ref,
|
||||
TP_PROTO(struct binder_transaction *t, struct binder_ref *src_ref,
|
||||
struct binder_ref *dest_ref),
|
||||
TP_ARGS(t, src_ref, dest_ref),
|
||||
TP_PROTO(struct binder_transaction *t, struct binder_node *node,
|
||||
struct binder_ref_data *src_ref,
|
||||
struct binder_ref_data *dest_ref),
|
||||
TP_ARGS(t, node, src_ref, dest_ref),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(int, debug_id)
|
||||
@ -208,7 +235,7 @@ TRACE_EVENT(binder_transaction_ref_to_ref,
|
||||
),
|
||||
TP_fast_assign(
|
||||
__entry->debug_id = t->debug_id;
|
||||
__entry->node_debug_id = src_ref->node->debug_id;
|
||||
__entry->node_debug_id = node->debug_id;
|
||||
__entry->src_ref_debug_id = src_ref->debug_id;
|
||||
__entry->src_ref_desc = src_ref->desc;
|
||||
__entry->dest_ref_debug_id = dest_ref->debug_id;
|
||||
@ -268,9 +295,9 @@ DEFINE_EVENT(binder_buffer_class, binder_transaction_failed_buffer_release,
|
||||
TP_ARGS(buffer));
|
||||
|
||||
TRACE_EVENT(binder_update_page_range,
|
||||
TP_PROTO(struct binder_proc *proc, bool allocate,
|
||||
TP_PROTO(struct binder_alloc *alloc, bool allocate,
|
||||
void *start, void *end),
|
||||
TP_ARGS(proc, allocate, start, end),
|
||||
TP_ARGS(alloc, allocate, start, end),
|
||||
TP_STRUCT__entry(
|
||||
__field(int, proc)
|
||||
__field(bool, allocate)
|
||||
@ -278,9 +305,9 @@ TRACE_EVENT(binder_update_page_range,
|
||||
__field(size_t, size)
|
||||
),
|
||||
TP_fast_assign(
|
||||
__entry->proc = proc->pid;
|
||||
__entry->proc = alloc->pid;
|
||||
__entry->allocate = allocate;
|
||||
__entry->offset = start - proc->buffer;
|
||||
__entry->offset = start - alloc->buffer;
|
||||
__entry->size = end - start;
|
||||
),
|
||||
TP_printk("proc=%d allocate=%d offset=%zu size=%zu",
|
||||
@ -288,6 +315,61 @@ TRACE_EVENT(binder_update_page_range,
|
||||
__entry->offset, __entry->size)
|
||||
);
|
||||
|
||||
DECLARE_EVENT_CLASS(binder_lru_page_class,
|
||||
TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
|
||||
TP_ARGS(alloc, page_index),
|
||||
TP_STRUCT__entry(
|
||||
__field(int, proc)
|
||||
__field(size_t, page_index)
|
||||
),
|
||||
TP_fast_assign(
|
||||
__entry->proc = alloc->pid;
|
||||
__entry->page_index = page_index;
|
||||
),
|
||||
TP_printk("proc=%d page_index=%zu",
|
||||
__entry->proc, __entry->page_index)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(binder_lru_page_class, binder_alloc_lru_start,
|
||||
TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
|
||||
TP_ARGS(alloc, page_index));
|
||||
|
||||
DEFINE_EVENT(binder_lru_page_class, binder_alloc_lru_end,
|
||||
TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
|
||||
TP_ARGS(alloc, page_index));
|
||||
|
||||
DEFINE_EVENT(binder_lru_page_class, binder_free_lru_start,
|
||||
TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
|
||||
TP_ARGS(alloc, page_index));
|
||||
|
||||
DEFINE_EVENT(binder_lru_page_class, binder_free_lru_end,
|
||||
TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
|
||||
TP_ARGS(alloc, page_index));
|
||||
|
||||
DEFINE_EVENT(binder_lru_page_class, binder_alloc_page_start,
|
||||
TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
|
||||
TP_ARGS(alloc, page_index));
|
||||
|
||||
DEFINE_EVENT(binder_lru_page_class, binder_alloc_page_end,
|
||||
TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
|
||||
TP_ARGS(alloc, page_index));
|
||||
|
||||
DEFINE_EVENT(binder_lru_page_class, binder_unmap_user_start,
|
||||
TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
|
||||
TP_ARGS(alloc, page_index));
|
||||
|
||||
DEFINE_EVENT(binder_lru_page_class, binder_unmap_user_end,
|
||||
TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
|
||||
TP_ARGS(alloc, page_index));
|
||||
|
||||
DEFINE_EVENT(binder_lru_page_class, binder_unmap_kernel_start,
|
||||
TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
|
||||
TP_ARGS(alloc, page_index));
|
||||
|
||||
DEFINE_EVENT(binder_lru_page_class, binder_unmap_kernel_end,
|
||||
TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
|
||||
TP_ARGS(alloc, page_index));
|
||||
|
||||
TRACE_EVENT(binder_command,
|
||||
TP_PROTO(uint32_t cmd),
|
||||
TP_ARGS(cmd),
|
@ -13,7 +13,9 @@
|
||||
* drops below 4096 pages and kill processes with a oom_score_adj value of 0 or
|
||||
* higher when the free memory drops below 1024 pages.
|
||||
*
|
||||
* The driver considers memory used for caches to be free.
|
||||
* The driver considers memory used for caches to be free, but if a large
|
||||
* percentage of the cached memory is locked this can be very inaccurate
|
||||
* and processes may not get killed until the normal oom killer is triggered.
|
||||
*
|
||||
* Copyright (C) 2007-2008 Google, Inc.
|
||||
*
|
||||
@ -27,11 +29,6 @@
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
*/
|
||||
/*
|
||||
* NOTE: This file has been modified by Sony Mobile Communications Inc.
|
||||
* Modifications are Copyright (c) 2015 Sony Mobile Communications Inc,
|
||||
* and licensed under the license of the file.
|
||||
*/
|
||||
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
|
||||
@ -40,27 +37,9 @@
|
||||
#include <linux/mm.h>
|
||||
#include <linux/oom.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/rcupdate.h>
|
||||
#include <linux/notifier.h>
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/delay.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/cpuset.h>
|
||||
#include <linux/show_mem_notifier.h>
|
||||
#include <linux/vmpressure.h>
|
||||
|
||||
#define CREATE_TRACE_POINTS
|
||||
#include <trace/events/almk.h>
|
||||
|
||||
#define CREATE_TRACE_POINTS
|
||||
#include <trace/events/lmk.h>
|
||||
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
#define _ZONE ZONE_HIGHMEM
|
||||
#else
|
||||
#define _ZONE ZONE_NORMAL
|
||||
#endif
|
||||
|
||||
#define CREATE_TRACE_POINTS
|
||||
#include "trace/lowmemorykiller.h"
|
||||
@ -80,7 +59,6 @@ static int lowmem_minfree[6] = {
|
||||
16 * 1024, /* 64MB */
|
||||
};
|
||||
static int lowmem_minfree_size = 4;
|
||||
static int lmk_fast_run = 1;
|
||||
|
||||
static unsigned long lowmem_deathpending_timeout;
|
||||
|
||||
@ -90,380 +68,32 @@ static unsigned long lowmem_deathpending_timeout;
|
||||
pr_info(x); \
|
||||
} while (0)
|
||||
|
||||
static atomic_t shift_adj = ATOMIC_INIT(0);
|
||||
static short adj_max_shift = 353;
|
||||
|
||||
/* User knob to enable/disable adaptive lmk feature */
|
||||
static int enable_adaptive_lmk;
|
||||
module_param_named(enable_adaptive_lmk, enable_adaptive_lmk, int,
|
||||
S_IRUGO | S_IWUSR);
|
||||
|
||||
/*
|
||||
* This parameter controls the behaviour of LMK when vmpressure is in
|
||||
* the range of 90-94. Adaptive lmk triggers based on number of file
|
||||
* pages wrt vmpressure_file_min, when vmpressure is in the range of
|
||||
* 90-94. Usually this is a pseudo minfree value, higher than the
|
||||
* highest configured value in minfree array.
|
||||
*/
|
||||
static int vmpressure_file_min;
|
||||
module_param_named(vmpressure_file_min, vmpressure_file_min, int,
|
||||
S_IRUGO | S_IWUSR);
|
||||
|
||||
enum {
|
||||
VMPRESSURE_NO_ADJUST = 0,
|
||||
VMPRESSURE_ADJUST_ENCROACH,
|
||||
VMPRESSURE_ADJUST_NORMAL,
|
||||
};
|
||||
|
||||
int adjust_minadj(short *min_score_adj)
|
||||
static unsigned long lowmem_count(struct shrinker *s,
|
||||
struct shrink_control *sc)
|
||||
{
|
||||
int ret = VMPRESSURE_NO_ADJUST;
|
||||
|
||||
if (!enable_adaptive_lmk)
|
||||
return 0;
|
||||
|
||||
if (atomic_read(&shift_adj) &&
|
||||
(*min_score_adj > adj_max_shift)) {
|
||||
if (*min_score_adj == OOM_SCORE_ADJ_MAX + 1)
|
||||
ret = VMPRESSURE_ADJUST_ENCROACH;
|
||||
else
|
||||
ret = VMPRESSURE_ADJUST_NORMAL;
|
||||
*min_score_adj = adj_max_shift;
|
||||
}
|
||||
atomic_set(&shift_adj, 0);
|
||||
|
||||
return ret;
|
||||
return global_page_state(NR_ACTIVE_ANON) +
|
||||
global_page_state(NR_ACTIVE_FILE) +
|
||||
global_page_state(NR_INACTIVE_ANON) +
|
||||
global_page_state(NR_INACTIVE_FILE);
|
||||
}
|
||||
|
||||
static int lmk_vmpressure_notifier(struct notifier_block *nb,
|
||||
unsigned long action, void *data)
|
||||
{
|
||||
int other_free, other_file;
|
||||
unsigned long pressure = action;
|
||||
int array_size = ARRAY_SIZE(lowmem_adj);
|
||||
|
||||
if (!enable_adaptive_lmk)
|
||||
return 0;
|
||||
|
||||
if (pressure >= 95) {
|
||||
other_file = global_page_state(NR_FILE_PAGES) -
|
||||
global_page_state(NR_SHMEM) -
|
||||
total_swapcache_pages();
|
||||
other_free = global_page_state(NR_FREE_PAGES);
|
||||
|
||||
atomic_set(&shift_adj, 1);
|
||||
trace_almk_vmpressure(pressure, other_free, other_file);
|
||||
} else if (pressure >= 90) {
|
||||
if (lowmem_adj_size < array_size)
|
||||
array_size = lowmem_adj_size;
|
||||
if (lowmem_minfree_size < array_size)
|
||||
array_size = lowmem_minfree_size;
|
||||
|
||||
other_file = global_page_state(NR_FILE_PAGES) -
|
||||
global_page_state(NR_SHMEM) -
|
||||
total_swapcache_pages();
|
||||
|
||||
other_free = global_page_state(NR_FREE_PAGES);
|
||||
|
||||
if ((other_free < lowmem_minfree[array_size - 1]) &&
|
||||
(other_file < vmpressure_file_min)) {
|
||||
atomic_set(&shift_adj, 1);
|
||||
trace_almk_vmpressure(pressure, other_free,
|
||||
other_file);
|
||||
}
|
||||
} else if (atomic_read(&shift_adj)) {
|
||||
/*
|
||||
* shift_adj would have been set by a previous invocation
|
||||
* of notifier, which is not followed by a lowmem_shrink yet.
|
||||
* Since vmpressure has improved, reset shift_adj to avoid
|
||||
* false adaptive LMK trigger.
|
||||
*/
|
||||
trace_almk_vmpressure(pressure, other_free, other_file);
|
||||
atomic_set(&shift_adj, 0);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct notifier_block lmk_vmpr_nb = {
|
||||
.notifier_call = lmk_vmpressure_notifier,
|
||||
};
|
||||
|
||||
static int test_task_flag(struct task_struct *p, int flag)
|
||||
{
|
||||
struct task_struct *t;
|
||||
|
||||
for_each_thread(p, t) {
|
||||
task_lock(t);
|
||||
if (test_tsk_thread_flag(t, flag)) {
|
||||
task_unlock(t);
|
||||
return 1;
|
||||
}
|
||||
task_unlock(t);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static DEFINE_MUTEX(scan_mutex);
|
||||
|
||||
int can_use_cma_pages(gfp_t gfp_mask)
|
||||
{
|
||||
int can_use = 0;
|
||||
int mtype = allocflags_to_migratetype(gfp_mask);
|
||||
int i = 0;
|
||||
int *mtype_fallbacks = get_migratetype_fallbacks(mtype);
|
||||
|
||||
if (is_migrate_cma(mtype)) {
|
||||
can_use = 1;
|
||||
} else {
|
||||
for (i = 0;; i++) {
|
||||
int fallbacktype = mtype_fallbacks[i];
|
||||
|
||||
if (is_migrate_cma(fallbacktype)) {
|
||||
can_use = 1;
|
||||
break;
|
||||
}
|
||||
|
||||
if (fallbacktype == MIGRATE_RESERVE)
|
||||
break;
|
||||
}
|
||||
}
|
||||
return can_use;
|
||||
}
|
||||
|
||||
void tune_lmk_zone_param(struct zonelist *zonelist, int classzone_idx,
|
||||
int *other_free, int *other_file,
|
||||
int use_cma_pages)
|
||||
{
|
||||
struct zone *zone;
|
||||
struct zoneref *zoneref;
|
||||
int zone_idx;
|
||||
|
||||
for_each_zone_zonelist(zone, zoneref, zonelist, MAX_NR_ZONES) {
|
||||
zone_idx = zonelist_zone_idx(zoneref);
|
||||
if (zone_idx == ZONE_MOVABLE) {
|
||||
if (!use_cma_pages)
|
||||
*other_free -=
|
||||
zone_page_state(zone, NR_FREE_CMA_PAGES);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (zone_idx > classzone_idx) {
|
||||
if (other_free != NULL)
|
||||
*other_free -= zone_page_state(zone,
|
||||
NR_FREE_PAGES);
|
||||
if (other_file != NULL)
|
||||
*other_file -= zone_page_state(zone,
|
||||
NR_FILE_PAGES)
|
||||
- zone_page_state(zone, NR_SHMEM)
|
||||
- zone_page_state(zone, NR_SWAPCACHE);
|
||||
} else if (zone_idx < classzone_idx) {
|
||||
if (zone_watermark_ok(zone, 0, 0, classzone_idx, 0)) {
|
||||
if (!use_cma_pages) {
|
||||
*other_free -= min(
|
||||
zone->lowmem_reserve[classzone_idx] +
|
||||
zone_page_state(
|
||||
zone, NR_FREE_CMA_PAGES),
|
||||
zone_page_state(
|
||||
zone, NR_FREE_PAGES));
|
||||
} else {
|
||||
*other_free -=
|
||||
zone->lowmem_reserve[classzone_idx];
|
||||
}
|
||||
} else {
|
||||
*other_free -=
|
||||
zone_page_state(zone, NR_FREE_PAGES);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
void adjust_gfp_mask(gfp_t *gfp_mask)
|
||||
{
|
||||
struct zone *preferred_zone;
|
||||
struct zonelist *zonelist;
|
||||
enum zone_type high_zoneidx;
|
||||
|
||||
if (current_is_kswapd()) {
|
||||
zonelist = node_zonelist(0, *gfp_mask);
|
||||
high_zoneidx = gfp_zone(*gfp_mask);
|
||||
first_zones_zonelist(zonelist, high_zoneidx, NULL,
|
||||
&preferred_zone);
|
||||
|
||||
if (high_zoneidx == ZONE_NORMAL) {
|
||||
if (zone_watermark_ok_safe(preferred_zone, 0,
|
||||
high_wmark_pages(preferred_zone), 0,
|
||||
0))
|
||||
*gfp_mask |= __GFP_HIGHMEM;
|
||||
} else if (high_zoneidx == ZONE_HIGHMEM) {
|
||||
*gfp_mask |= __GFP_HIGHMEM;
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
void adjust_gfp_mask(gfp_t *unused)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
void tune_lmk_param(int *other_free, int *other_file, struct shrink_control *sc)
|
||||
{
|
||||
gfp_t gfp_mask;
|
||||
struct zone *preferred_zone;
|
||||
struct zonelist *zonelist;
|
||||
enum zone_type high_zoneidx, classzone_idx;
|
||||
unsigned long balance_gap;
|
||||
int use_cma_pages;
|
||||
|
||||
gfp_mask = sc->gfp_mask;
|
||||
adjust_gfp_mask(&gfp_mask);
|
||||
|
||||
zonelist = node_zonelist(0, gfp_mask);
|
||||
high_zoneidx = gfp_zone(gfp_mask);
|
||||
first_zones_zonelist(zonelist, high_zoneidx, NULL, &preferred_zone);
|
||||
classzone_idx = zone_idx(preferred_zone);
|
||||
use_cma_pages = can_use_cma_pages(gfp_mask);
|
||||
|
||||
balance_gap = min(low_wmark_pages(preferred_zone),
|
||||
(preferred_zone->present_pages +
|
||||
KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
|
||||
KSWAPD_ZONE_BALANCE_GAP_RATIO);
|
||||
|
||||
if (likely(current_is_kswapd() && zone_watermark_ok(preferred_zone, 0,
|
||||
high_wmark_pages(preferred_zone) + SWAP_CLUSTER_MAX +
|
||||
balance_gap, 0, 0))) {
|
||||
if (lmk_fast_run)
|
||||
tune_lmk_zone_param(zonelist, classzone_idx, other_free,
|
||||
other_file, use_cma_pages);
|
||||
else
|
||||
tune_lmk_zone_param(zonelist, classzone_idx, other_free,
|
||||
NULL, use_cma_pages);
|
||||
|
||||
if (zone_watermark_ok(preferred_zone, 0, 0, _ZONE, 0)) {
|
||||
if (!use_cma_pages) {
|
||||
*other_free -= min(
|
||||
preferred_zone->lowmem_reserve[_ZONE]
|
||||
+ zone_page_state(
|
||||
preferred_zone, NR_FREE_CMA_PAGES),
|
||||
zone_page_state(
|
||||
preferred_zone, NR_FREE_PAGES));
|
||||
} else {
|
||||
*other_free -=
|
||||
preferred_zone->lowmem_reserve[_ZONE];
|
||||
}
|
||||
} else {
|
||||
*other_free -= zone_page_state(preferred_zone,
|
||||
NR_FREE_PAGES);
|
||||
}
|
||||
|
||||
lowmem_print(4, "lowmem_shrink of kswapd tunning for highmem "
|
||||
"ofree %d, %d\n", *other_free, *other_file);
|
||||
} else {
|
||||
tune_lmk_zone_param(zonelist, classzone_idx, other_free,
|
||||
other_file, use_cma_pages);
|
||||
|
||||
if (!use_cma_pages) {
|
||||
*other_free -=
|
||||
zone_page_state(preferred_zone, NR_FREE_CMA_PAGES);
|
||||
}
|
||||
|
||||
lowmem_print(4, "lowmem_shrink tunning for others ofree %d, "
|
||||
"%d\n", *other_free, *other_file);
|
||||
}
|
||||
#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER_CONSIDER_SWAP
|
||||
if (zone_watermark_ok(preferred_zone, 0,
|
||||
low_wmark_pages(preferred_zone), 0, 0)) {
|
||||
struct sysinfo si;
|
||||
si_swapinfo(&si);
|
||||
*other_free += si.freeswap;
|
||||
#ifdef CONFIG_ZRAM
|
||||
/* If swap is actually residing in RAM (e. g. the swap device
|
||||
* is a ZRAM device), we need to subtract the amount of RAM
|
||||
* that will be occupied by compressed data. To play on the
|
||||
* safe side, it's better to subtract too much than too few,
|
||||
* otherwise LMK may not be triggered when it has to be. ZRAM
|
||||
* compression ratio is at least 2, so we subtract half of the
|
||||
* reported freeswap.
|
||||
*/
|
||||
*other_free -= si.freeswap >> 1;
|
||||
#endif
|
||||
lowmem_print(4, "lowmem_shrink tunning for swap "
|
||||
"ofree %d, %d\n", *other_free, *other_file);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER_CONSIDER_SWAP
|
||||
static void lowmem_wakeup_kswapd(struct shrink_control *sc, int minfree)
|
||||
{
|
||||
gfp_t gfp_mask;
|
||||
struct zone *preferred_zone;
|
||||
struct zonelist *zonelist;
|
||||
enum zone_type high_zoneidx, classzone_idx;
|
||||
int order = 0;
|
||||
|
||||
if (likely(current_is_kswapd()))
|
||||
return;
|
||||
|
||||
gfp_mask = sc->gfp_mask;
|
||||
adjust_gfp_mask(&gfp_mask);
|
||||
|
||||
zonelist = node_zonelist(0, gfp_mask);
|
||||
high_zoneidx = gfp_zone(gfp_mask);
|
||||
first_zones_zonelist(zonelist, high_zoneidx, NULL, &preferred_zone);
|
||||
classzone_idx = zone_idx(preferred_zone);
|
||||
|
||||
for (minfree >>= 13; order < 7; order++) {
|
||||
if (minfree <= (1 << order))
|
||||
break;
|
||||
}
|
||||
|
||||
lowmem_print(4, "lowmem_wakeup_kswapd order %d\n", order);
|
||||
wakeup_kswapd(preferred_zone, order, classzone_idx);
|
||||
}
|
||||
#endif
|
||||
|
||||
static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
|
||||
static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)
|
||||
{
|
||||
struct task_struct *tsk;
|
||||
struct task_struct *selected = NULL;
|
||||
int rem = 0;
|
||||
unsigned long rem = 0;
|
||||
int tasksize;
|
||||
int i;
|
||||
int ret = 0;
|
||||
short min_score_adj = OOM_SCORE_ADJ_MAX + 1;
|
||||
int minfree = 0;
|
||||
#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER_CONSIDER_SWAP
|
||||
int max_minfree = 0;
|
||||
#endif
|
||||
int selected_tasksize = 0;
|
||||
short selected_oom_score_adj;
|
||||
int array_size = ARRAY_SIZE(lowmem_adj);
|
||||
int other_free;
|
||||
int other_file;
|
||||
unsigned long nr_to_scan = sc->nr_to_scan;
|
||||
|
||||
if (nr_to_scan > 0) {
|
||||
if (mutex_lock_interruptible(&scan_mutex) < 0) {
|
||||
trace_lmk_remain_scan(0, nr_to_scan, sc->gfp_mask);
|
||||
return 0;
|
||||
};
|
||||
}
|
||||
|
||||
other_free = global_page_state(NR_FREE_PAGES);
|
||||
|
||||
if (global_page_state(NR_SHMEM) + global_page_state(NR_MLOCK_FILE) +
|
||||
total_swapcache_pages() < global_page_state(NR_FILE_PAGES))
|
||||
other_file = global_page_state(NR_FILE_PAGES) -
|
||||
int other_free = global_page_state(NR_FREE_PAGES) - totalreserve_pages;
|
||||
int other_file = global_page_state(NR_FILE_PAGES) -
|
||||
global_page_state(NR_SHMEM) -
|
||||
global_page_state(NR_MLOCK_FILE) -
|
||||
global_page_state(NR_UNEVICTABLE) -
|
||||
total_swapcache_pages();
|
||||
else
|
||||
other_file = 0;
|
||||
|
||||
tune_lmk_param(&other_free, &other_file, sc);
|
||||
|
||||
if (lowmem_adj_size < array_size)
|
||||
array_size = lowmem_adj_size;
|
||||
@ -471,43 +101,22 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
|
||||
array_size = lowmem_minfree_size;
|
||||
for (i = 0; i < array_size; i++) {
|
||||
minfree = lowmem_minfree[i];
|
||||
#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER_CONSIDER_SWAP
|
||||
if (max_minfree < minfree)
|
||||
max_minfree = minfree;
|
||||
#endif
|
||||
if (other_free < minfree && other_file < minfree) {
|
||||
min_score_adj = lowmem_adj[i];
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (nr_to_scan > 0) {
|
||||
ret = adjust_minadj(&min_score_adj);
|
||||
lowmem_print(3, "lowmem_shrink %lu, %x, ofree %d %d, ma %hd\n",
|
||||
nr_to_scan, sc->gfp_mask, other_free,
|
||||
other_file, min_score_adj);
|
||||
|
||||
lowmem_print(3, "lowmem_scan %lu, %x, ofree %d %d, ma %hd\n",
|
||||
sc->nr_to_scan, sc->gfp_mask, other_free,
|
||||
other_file, min_score_adj);
|
||||
|
||||
if (min_score_adj == OOM_SCORE_ADJ_MAX + 1) {
|
||||
lowmem_print(5, "lowmem_scan %lu, %x, return 0\n",
|
||||
sc->nr_to_scan, sc->gfp_mask);
|
||||
return 0;
|
||||
}
|
||||
|
||||
rem = global_page_state(NR_ACTIVE_ANON) +
|
||||
global_page_state(NR_ACTIVE_FILE) +
|
||||
global_page_state(NR_INACTIVE_ANON) +
|
||||
global_page_state(NR_INACTIVE_FILE);
|
||||
if (nr_to_scan <= 0 || min_score_adj == OOM_SCORE_ADJ_MAX + 1) {
|
||||
lowmem_print(5, "lowmem_shrink %lu, %x, return %d\n",
|
||||
nr_to_scan, sc->gfp_mask, rem);
|
||||
|
||||
if (nr_to_scan > 0)
|
||||
mutex_unlock(&scan_mutex);
|
||||
|
||||
if ((min_score_adj == OOM_SCORE_ADJ_MAX + 1) &&
|
||||
(nr_to_scan > 0))
|
||||
trace_almk_shrink(0, ret, other_free, other_file, 0);
|
||||
|
||||
trace_lmk_remain_scan(rem, nr_to_scan, sc->gfp_mask);
|
||||
#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER_CONSIDER_SWAP
|
||||
lowmem_wakeup_kswapd(sc, max_minfree);
|
||||
#endif
|
||||
return rem;
|
||||
}
|
||||
selected_oom_score_adj = min_score_adj;
|
||||
|
||||
rcu_read_lock();
|
||||
@ -518,30 +127,16 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
|
||||
if (tsk->flags & PF_KTHREAD)
|
||||
continue;
|
||||
|
||||
/* if task no longer has any memory ignore it */
|
||||
if (test_task_flag(tsk, TIF_MM_RELEASED))
|
||||
continue;
|
||||
|
||||
/* Ignore task if coredump in progress */
|
||||
if (tsk->mm && tsk->mm->core_state)
|
||||
continue;
|
||||
|
||||
if (time_before_eq(jiffies, lowmem_deathpending_timeout)) {
|
||||
if (test_task_flag(tsk, TIF_MEMDIE)) {
|
||||
rcu_read_unlock();
|
||||
/* give the system time to free up the memory */
|
||||
msleep_interruptible(20);
|
||||
mutex_unlock(&scan_mutex);
|
||||
trace_lmk_remain_scan(rem, nr_to_scan,
|
||||
sc->gfp_mask);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
p = find_lock_task_mm(tsk);
|
||||
if (!p)
|
||||
continue;
|
||||
|
||||
if (test_tsk_thread_flag(p, TIF_MEMDIE) &&
|
||||
time_before_eq(jiffies, lowmem_deathpending_timeout)) {
|
||||
task_unlock(p);
|
||||
rcu_read_unlock();
|
||||
return 0;
|
||||
}
|
||||
oom_score_adj = p->signal->oom_score_adj;
|
||||
if (oom_score_adj < min_score_adj) {
|
||||
task_unlock(p);
|
||||
@ -561,7 +156,7 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
|
||||
selected = p;
|
||||
selected_tasksize = tasksize;
|
||||
selected_oom_score_adj = oom_score_adj;
|
||||
lowmem_print(3, "select '%s' (%d), adj %hd, size %d, to kill\n",
|
||||
lowmem_print(2, "select '%s' (%d), adj %hd, size %d, to kill\n",
|
||||
p->comm, p->pid, oom_score_adj, tasksize);
|
||||
}
|
||||
if (selected) {
|
||||
@ -572,83 +167,35 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
|
||||
lowmem_print(1, "Killing '%s' (%d), adj %hd,\n" \
|
||||
" to free %ldkB on behalf of '%s' (%d) because\n" \
|
||||
" cache %ldkB is below limit %ldkB for oom_score_adj %hd\n" \
|
||||
" Free memory is %ldkB above reserved.\n" \
|
||||
" Free CMA is %ldkB\n" \
|
||||
" Total reserve is %ldkB\n" \
|
||||
" Total free pages is %ldkB\n" \
|
||||
" Total file cache is %ldkB\n" \
|
||||
" Slab Reclaimable is %ldkB\n" \
|
||||
" Slab UnReclaimable is %ldkB\n" \
|
||||
" Total Slab is %ldkB\n" \
|
||||
" GFP mask is 0x%x\n",
|
||||
" Free memory is %ldkB above reserved\n",
|
||||
selected->comm, selected->pid,
|
||||
selected_oom_score_adj,
|
||||
selected_tasksize * (long)(PAGE_SIZE / 1024),
|
||||
current->comm, current->pid,
|
||||
cache_size, cache_limit,
|
||||
min_score_adj,
|
||||
free ,
|
||||
global_page_state(NR_FREE_CMA_PAGES) *
|
||||
(long)(PAGE_SIZE / 1024),
|
||||
totalreserve_pages * (long)(PAGE_SIZE / 1024),
|
||||
global_page_state(NR_FREE_PAGES) *
|
||||
(long)(PAGE_SIZE / 1024),
|
||||
global_page_state(NR_FILE_PAGES) *
|
||||
(long)(PAGE_SIZE / 1024),
|
||||
global_page_state(NR_SLAB_RECLAIMABLE) *
|
||||
(long)(PAGE_SIZE / 1024),
|
||||
global_page_state(NR_SLAB_UNRECLAIMABLE) *
|
||||
(long)(PAGE_SIZE / 1024),
|
||||
global_page_state(NR_SLAB_RECLAIMABLE) *
|
||||
(long)(PAGE_SIZE / 1024) +
|
||||
global_page_state(NR_SLAB_UNRECLAIMABLE) *
|
||||
(long)(PAGE_SIZE / 1024),
|
||||
sc->gfp_mask);
|
||||
|
||||
if (lowmem_debug_level >= 2 && selected_oom_score_adj == 0) {
|
||||
show_mem(SHOW_MEM_FILTER_NODES);
|
||||
dump_tasks(NULL, NULL);
|
||||
show_mem_call_notifiers();
|
||||
}
|
||||
|
||||
free);
|
||||
lowmem_deathpending_timeout = jiffies + HZ;
|
||||
/*
|
||||
* FIXME: lowmemorykiller shouldn't abuse global OOM killer
|
||||
* infrastructure. There is no real reason why the selected
|
||||
* task should have access to the memory reserves.
|
||||
*/
|
||||
mark_tsk_oom_victim(selected);
|
||||
set_tsk_thread_flag(selected, TIF_MEMDIE);
|
||||
send_sig(SIGKILL, selected, 0);
|
||||
rem -= selected_tasksize;
|
||||
rcu_read_unlock();
|
||||
trace_lmk_sigkill(selected->pid, selected->comm,
|
||||
selected_oom_score_adj, selected_tasksize,
|
||||
sc->gfp_mask);
|
||||
/* give the system time to free up the memory */
|
||||
msleep_interruptible(20);
|
||||
trace_almk_shrink(selected_tasksize, ret,
|
||||
other_free, other_file, selected_oom_score_adj);
|
||||
} else {
|
||||
trace_almk_shrink(1, ret, other_free, other_file, 0);
|
||||
rcu_read_unlock();
|
||||
rem += selected_tasksize;
|
||||
}
|
||||
|
||||
lowmem_print(4, "lowmem_shrink %lu, %x, return %d\n",
|
||||
nr_to_scan, sc->gfp_mask, rem);
|
||||
mutex_unlock(&scan_mutex);
|
||||
trace_lmk_remain_scan(rem, nr_to_scan, sc->gfp_mask);
|
||||
lowmem_print(4, "lowmem_scan %lu, %x, return %lu\n",
|
||||
sc->nr_to_scan, sc->gfp_mask, rem);
|
||||
rcu_read_unlock();
|
||||
return rem;
|
||||
}
|
||||
|
||||
static struct shrinker lowmem_shrinker = {
|
||||
.shrink = lowmem_shrink,
|
||||
.scan_objects = lowmem_scan,
|
||||
.count_objects = lowmem_count,
|
||||
.seeks = DEFAULT_SEEKS * 16
|
||||
};
|
||||
|
||||
static int __init lowmem_init(void)
|
||||
{
|
||||
register_shrinker(&lowmem_shrinker);
|
||||
vmpressure_notifier_register(&lmk_vmpr_nb);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -736,10 +283,8 @@ static const struct kparam_array __param_arr_adj = {
|
||||
|
||||
module_param_named(cost, lowmem_shrinker.seeks, int, S_IRUGO | S_IWUSR);
|
||||
#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER_AUTODETECT_OOM_ADJ_VALUES
|
||||
__module_param_call(MODULE_PARAM_PREFIX, adj,
|
||||
&lowmem_adj_array_ops,
|
||||
.arr = &__param_arr_adj,
|
||||
S_IRUGO | S_IWUSR, -1);
|
||||
module_param_cb(adj, &lowmem_adj_array_ops,
|
||||
.arr = &__param_arr_adj, S_IRUGO | S_IWUSR);
|
||||
__MODULE_PARM_TYPE(adj, "array of short");
|
||||
#else
|
||||
module_param_array_named(adj, lowmem_adj, short, &lowmem_adj_size,
|
||||
@ -748,7 +293,6 @@ module_param_array_named(adj, lowmem_adj, short, &lowmem_adj_size,
|
||||
module_param_array_named(minfree, lowmem_minfree, uint, &lowmem_minfree_size,
|
||||
S_IRUGO | S_IWUSR);
|
||||
module_param_named(debug_level, lowmem_debug_level, uint, S_IRUGO | S_IWUSR);
|
||||
module_param_named(lmk_fast_run, lmk_fast_run, int, S_IRUGO | S_IWUSR);
|
||||
|
||||
module_init(lowmem_init);
|
||||
module_exit(lowmem_exit);
|
||||
|
541
drivers/staging/android/uapi/binder.h
Normal file
@ -0,0 +1,541 @@
|
||||
/*
|
||||
* Copyright (C) 2008 Google, Inc.
|
||||
*
|
||||
* Based on, but no longer compatible with, the original
|
||||
* OpenBinder.org binder driver interface, which is:
|
||||
*
|
||||
* Copyright (c) 2005 Palmsource, Inc.
|
||||
*
|
||||
* This software is licensed under the terms of the GNU General Public
|
||||
* License version 2, as published by the Free Software Foundation, and
|
||||
* may be copied, distributed, and modified under those terms.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef _UAPI_LINUX_BINDER_H
|
||||
#define _UAPI_LINUX_BINDER_H
|
||||
|
||||
#include <linux/ioctl.h>
|
||||
|
||||
#define B_PACK_CHARS(c1, c2, c3, c4) \
|
||||
((((c1)<<24)) | (((c2)<<16)) | (((c3)<<8)) | (c4))
|
||||
#define B_TYPE_LARGE 0x85
|
||||
|
||||
enum {
|
||||
BINDER_TYPE_BINDER = B_PACK_CHARS('s', 'b', '*', B_TYPE_LARGE),
|
||||
BINDER_TYPE_WEAK_BINDER = B_PACK_CHARS('w', 'b', '*', B_TYPE_LARGE),
|
||||
BINDER_TYPE_HANDLE = B_PACK_CHARS('s', 'h', '*', B_TYPE_LARGE),
|
||||
BINDER_TYPE_WEAK_HANDLE = B_PACK_CHARS('w', 'h', '*', B_TYPE_LARGE),
|
||||
BINDER_TYPE_FD = B_PACK_CHARS('f', 'd', '*', B_TYPE_LARGE),
|
||||
BINDER_TYPE_FDA = B_PACK_CHARS('f', 'd', 'a', B_TYPE_LARGE),
|
||||
BINDER_TYPE_PTR = B_PACK_CHARS('p', 't', '*', B_TYPE_LARGE),
|
||||
};
|
||||
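
B_PACK_CHARS packs four 8-bit values into a single 32-bit type tag, with B_TYPE_LARGE in the low byte. A small standalone sketch of the arithmetic (user-space, values recomputed here only for illustration):

#include <stdint.h>
#include <stdio.h>

#define B_TYPE_LARGE 0x85
#define B_PACK_CHARS(c1, c2, c3, c4) \
    ((((c1)<<24)) | (((c2)<<16)) | (((c3)<<8)) | (c4))

int main(void)
{
    /* 's' = 0x73, 'b' = 0x62, '*' = 0x2a, B_TYPE_LARGE = 0x85 */
    uint32_t tag = B_PACK_CHARS('s', 'b', '*', B_TYPE_LARGE);
    printf("BINDER_TYPE_BINDER = 0x%08x\n", tag); /* 0x73622a85 */
    return 0;
}
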
|
||||
/**
|
||||
* enum flat_binder_object_shifts: shift values for flat_binder_object_flags
|
||||
* @FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT: shift for getting scheduler policy.
|
||||
*
|
||||
*/
|
||||
enum flat_binder_object_shifts {
|
||||
FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT = 9,
|
||||
};
|
||||
|
||||
/**
|
||||
* enum flat_binder_object_flags - flags for use in flat_binder_object.flags
|
||||
*/
|
||||
enum flat_binder_object_flags {
|
||||
/**
|
||||
* @FLAT_BINDER_FLAG_PRIORITY_MASK: bit-mask for min scheduler priority
|
||||
*
|
||||
* These bits can be used to set the minimum scheduler priority
|
||||
* at which transactions into this node should run. Valid values
|
||||
* in these bits depend on the scheduler policy encoded in
|
||||
* @FLAT_BINDER_FLAG_SCHED_POLICY_MASK.
|
||||
*
|
||||
* For SCHED_NORMAL/SCHED_BATCH, the valid range is between [-20..19]
|
||||
* For SCHED_FIFO/SCHED_RR, the value can run between [1..99]
|
||||
*/
|
||||
FLAT_BINDER_FLAG_PRIORITY_MASK = 0xff,
|
||||
/**
|
||||
* @FLAT_BINDER_FLAG_ACCEPTS_FDS: whether the node accepts fds.
|
||||
*/
|
||||
FLAT_BINDER_FLAG_ACCEPTS_FDS = 0x100,
|
||||
/**
|
||||
* @FLAT_BINDER_FLAG_SCHED_POLICY_MASK: bit-mask for scheduling policy
|
||||
*
|
||||
* These two bits can be used to set the min scheduling policy at which
|
||||
* transactions on this node should run. These match the UAPI
|
||||
* scheduler policy values, eg:
|
||||
* 00b: SCHED_NORMAL
|
||||
* 01b: SCHED_FIFO
|
||||
* 10b: SCHED_RR
|
||||
* 11b: SCHED_BATCH
|
||||
*/
|
||||
FLAT_BINDER_FLAG_SCHED_POLICY_MASK =
|
||||
3U << FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT,
|
||||
|
||||
/**
|
||||
* @FLAT_BINDER_FLAG_INHERIT_RT: whether the node inherits RT policy
|
||||
*
|
||||
* Only when set, calls into this node will inherit a real-time
|
||||
* scheduling policy from the caller (for synchronous transactions).
|
||||
*/
|
||||
FLAT_BINDER_FLAG_INHERIT_RT = 0x800,
|
||||
|
||||
/**
|
||||
* @FLAT_BINDER_FLAG_TXN_SECURITY_CTX: request security contexts
|
||||
*
|
||||
* Only when set, causes senders to include their security
|
||||
* context
|
||||
*/
|
||||
FLAT_BINDER_FLAG_TXN_SECURITY_CTX = 0x1000,
|
||||
};
|
||||
|
||||
#ifdef BINDER_IPC_32BIT
|
||||
typedef __u32 binder_size_t;
|
||||
typedef __u32 binder_uintptr_t;
|
||||
#else
|
||||
typedef __u64 binder_size_t;
|
||||
typedef __u64 binder_uintptr_t;
|
||||
#endif
|
||||
|
||||
/**
|
||||
* struct binder_object_header - header shared by all binder metadata objects.
|
||||
* @type: type of the object
|
||||
*/
|
||||
struct binder_object_header {
|
||||
__u32 type;
|
||||
};
|
||||
|
||||
/*
|
||||
* This is the flattened representation of a Binder object for transfer
|
||||
* between processes. The 'offsets' supplied as part of a binder transaction
|
||||
* contains offsets into the data where these structures occur. The Binder
|
||||
* driver takes care of re-writing the structure type and data as it moves
|
||||
* between processes.
|
||||
*/
|
||||
struct flat_binder_object {
|
||||
struct binder_object_header hdr;
|
||||
__u32 flags;
|
||||
|
||||
/* 8 bytes of data. */
|
||||
union {
|
||||
binder_uintptr_t binder; /* local object */
|
||||
__u32 handle; /* remote object */
|
||||
};
|
||||
|
||||
/* extra data associated with local object */
|
||||
binder_uintptr_t cookie;
|
||||
};
|
||||
|
||||
/**
|
||||
* struct binder_fd_object - describes a filedescriptor to be fixed up.
|
||||
* @hdr: common header structure
|
||||
* @pad_flags: padding to remain compatible with old userspace code
|
||||
* @pad_binder: padding to remain compatible with old userspace code
|
||||
* @fd: file descriptor
|
||||
* @cookie: opaque data, used by user-space
|
||||
*/
|
||||
struct binder_fd_object {
|
||||
struct binder_object_header hdr;
|
||||
__u32 pad_flags;
|
||||
union {
|
||||
binder_uintptr_t pad_binder;
|
||||
__u32 fd;
|
||||
};
|
||||
|
||||
binder_uintptr_t cookie;
|
||||
};
|
||||
|
||||
/* struct binder_buffer_object - object describing a userspace buffer
|
||||
* @hdr: common header structure
|
||||
* @flags: one or more BINDER_BUFFER_* flags
|
||||
* @buffer: address of the buffer
|
||||
* @length: length of the buffer
|
||||
* @parent: index in offset array pointing to parent buffer
|
||||
* @parent_offset: offset in @parent pointing to this buffer
|
||||
*
|
||||
* A binder_buffer object represents an object that the
|
||||
* binder kernel driver can copy verbatim to the target
|
||||
* address space. A buffer itself may be pointed to from
|
||||
* within another buffer, meaning that the pointer inside
|
||||
* that other buffer needs to be fixed up as well. This
|
||||
* can be done by setting the BINDER_BUFFER_FLAG_HAS_PARENT
|
||||
* flag in @flags, by setting @parent buffer to the index
|
||||
* in the offset array pointing to the parent binder_buffer_object,
|
||||
* and by setting @parent_offset to the offset in the parent buffer
|
||||
* at which the pointer to this buffer is located.
|
||||
*/
|
||||
struct binder_buffer_object {
|
||||
struct binder_object_header hdr;
|
||||
__u32 flags;
|
||||
binder_uintptr_t buffer;
|
||||
binder_size_t length;
|
||||
binder_size_t parent;
|
||||
binder_size_t parent_offset;
|
||||
};
|
||||
|
||||
enum {
|
||||
BINDER_BUFFER_FLAG_HAS_PARENT = 0x01,
|
||||
};
|
||||
|
||||
/* struct binder_fd_array_object - object describing an array of fds in a buffer
|
||||
* @hdr: common header structure
|
||||
* @pad: padding to ensure correct alignment
|
||||
* @num_fds: number of file descriptors in the buffer
|
||||
* @parent: index in offset array to buffer holding the fd array
|
||||
* @parent_offset: start offset of fd array in the buffer
|
||||
*
|
||||
* A binder_fd_array object represents an array of file
|
||||
* descriptors embedded in a binder_buffer_object. It is
|
||||
* different from a regular binder_buffer_object because it
|
||||
* describes a list of file descriptors to fix up, not an opaque
|
||||
* blob of memory, and hence the kernel needs to treat it differently.
|
||||
*
|
||||
* An example of how this would be used is with Android's
|
||||
* native_handle_t object, which is a struct with a list of integers
|
||||
* and a list of file descriptors. The native_handle_t struct itself
|
||||
* will be represented by a struct binder_buffer_object, whereas the
|
||||
* embedded list of file descriptors is represented by a
|
||||
* struct binder_fd_array_object with that binder_buffer_object as
|
||||
* a parent.
|
||||
*/
|
||||
struct binder_fd_array_object {
|
||||
struct binder_object_header hdr;
|
||||
__u32 pad;
|
||||
binder_size_t num_fds;
|
||||
binder_size_t parent;
|
||||
binder_size_t parent_offset;
|
||||
};
|
||||
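
To ship file descriptors inside a buffer, userspace describes the carrying buffer with a binder_buffer_object and points a binder_fd_array_object at the offset of the fd array within it. A hedged sketch of filling in the two objects; struct my_obj, the offsets-array index 0 and the surrounding transaction setup are assumptions, not taken from this header:

#include <stddef.h>     /* offsetof */
#include <stdint.h>     /* uintptr_t */

struct my_obj {         /* hypothetical userspace payload */
    int count;
    int fds[2];
};

static void describe_fd_array(struct binder_buffer_object *bbo,
                              struct binder_fd_array_object *fda,
                              struct my_obj *obj)
{
    bbo->hdr.type = BINDER_TYPE_PTR;
    bbo->flags = 0;
    bbo->buffer = (binder_uintptr_t)(uintptr_t)obj;
    bbo->length = sizeof(*obj);

    fda->hdr.type = BINDER_TYPE_FDA;
    fda->num_fds = 2;
    fda->parent = 0;        /* index of bbo in the offsets array */
    fda->parent_offset = offsetof(struct my_obj, fds);
}
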
|
||||
/*
|
||||
* On 64-bit platforms where user code may run in 32-bits the driver must
|
||||
* translate the buffer (and local binder) addresses appropriately.
|
||||
*/
|
||||
|
||||
struct binder_write_read {
|
||||
binder_size_t write_size; /* bytes to write */
|
||||
binder_size_t write_consumed; /* bytes consumed by driver */
|
||||
binder_uintptr_t write_buffer;
|
||||
binder_size_t read_size; /* bytes to read */
|
||||
binder_size_t read_consumed; /* bytes consumed by driver */
|
||||
binder_uintptr_t read_buffer;
|
||||
};
|
||||
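
The two halves of binder_write_read are independent: the driver first consumes up to write_size bytes of commands from write_buffer, then fills up to read_size bytes of return commands into read_buffer. A minimal user-space sketch of opening the driver, checking the protocol version and entering the looper; the /dev/binder path and the "binder.h" include location are assumptions about the build environment:

#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include "binder.h"         /* the UAPI header added above */

int main(void)
{
    struct binder_write_read bwr;
    struct binder_version vers;
    uint32_t cmd = BC_ENTER_LOOPER;
    int fd = open("/dev/binder", O_RDWR | O_CLOEXEC);

    if (fd < 0)
        return 1;
    if (ioctl(fd, BINDER_VERSION, &vers) < 0 ||
        vers.protocol_version != BINDER_CURRENT_PROTOCOL_VERSION)
        return 1;

    memset(&bwr, 0, sizeof(bwr));
    bwr.write_buffer = (binder_uintptr_t)(uintptr_t)&cmd;
    bwr.write_size = sizeof(cmd);           /* no read side on this pass */
    ioctl(fd, BINDER_WRITE_READ, &bwr);     /* driver consumes BC_ENTER_LOOPER */

    close(fd);
    return 0;
}
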
|
||||
/* Use with BINDER_VERSION, driver fills in fields. */
|
||||
struct binder_version {
|
||||
/* driver protocol version -- increment with incompatible change */
|
||||
__s32 protocol_version;
|
||||
};
|
||||
|
||||
/* This is the current protocol version. */
|
||||
#ifdef BINDER_IPC_32BIT
|
||||
#define BINDER_CURRENT_PROTOCOL_VERSION 7
|
||||
#else
|
||||
#define BINDER_CURRENT_PROTOCOL_VERSION 8
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Use with BINDER_GET_NODE_DEBUG_INFO, driver reads ptr, writes to all fields.
|
||||
* Set ptr to NULL for the first call to get the info for the first node, and
|
||||
* then repeat the call passing the previously returned value to get the next
|
||||
* nodes. ptr will be 0 when there are no more nodes.
|
||||
*/
|
||||
struct binder_node_debug_info {
|
||||
binder_uintptr_t ptr;
|
||||
binder_uintptr_t cookie;
|
||||
__u32 has_strong_ref;
|
||||
__u32 has_weak_ref;
|
||||
};
|
||||
|
||||
struct binder_node_info_for_ref {
|
||||
__u32 handle;
|
||||
__u32 strong_count;
|
||||
__u32 weak_count;
|
||||
__u32 reserved1;
|
||||
__u32 reserved2;
|
||||
__u32 reserved3;
|
||||
};
|
||||
|
||||
#define BINDER_WRITE_READ _IOWR('b', 1, struct binder_write_read)
|
||||
#define BINDER_SET_IDLE_TIMEOUT _IOW('b', 3, __s64)
|
||||
#define BINDER_SET_MAX_THREADS _IOW('b', 5, __u32)
|
||||
#define BINDER_SET_IDLE_PRIORITY _IOW('b', 6, __s32)
|
||||
#define BINDER_SET_CONTEXT_MGR _IOW('b', 7, __s32)
|
||||
#define BINDER_THREAD_EXIT _IOW('b', 8, __s32)
|
||||
#define BINDER_VERSION _IOWR('b', 9, struct binder_version)
|
||||
#define BINDER_GET_NODE_DEBUG_INFO _IOWR('b', 11, struct binder_node_debug_info)
|
||||
#define BINDER_GET_NODE_INFO_FOR_REF _IOWR('b', 12, struct binder_node_info_for_ref)
|
||||
#define BINDER_SET_CONTEXT_MGR_EXT _IOW('b', 13, struct flat_binder_object)
|
||||
|
||||
/*
|
||||
* NOTE: Two special error codes you should check for when calling
|
||||
* in to the driver are:
|
||||
*
|
||||
* EINTR -- The operation has been interrupted. This should be
|
||||
* handled by retrying the ioctl() until a different error code
|
||||
* is returned.
|
||||
*
|
||||
* ECONNREFUSED -- The driver is no longer accepting operations
|
||||
* from your process. That is, the process is being destroyed.
|
||||
* You should handle this by exiting from your process. Note
|
||||
* that once this error code is returned, all further calls to
|
||||
* the driver from any thread will return this same code.
|
||||
*/
|
||||
|
||||
enum transaction_flags {
|
||||
TF_ONE_WAY = 0x01, /* this is a one-way call: async, no return */
|
||||
TF_ROOT_OBJECT = 0x04, /* contents are the component's root object */
|
||||
TF_STATUS_CODE = 0x08, /* contents are a 32-bit status code */
|
||||
TF_ACCEPT_FDS = 0x10, /* allow replies with file descriptors */
|
||||
};
|
||||
|
||||
struct binder_transaction_data {
|
||||
/* The first two are only used for bcTRANSACTION and brTRANSACTION,
|
||||
* identifying the target and contents of the transaction.
|
||||
*/
|
||||
union {
|
||||
/* target descriptor of command transaction */
|
||||
__u32 handle;
|
||||
/* target descriptor of return transaction */
|
||||
binder_uintptr_t ptr;
|
||||
} target;
|
||||
binder_uintptr_t cookie; /* target object cookie */
|
||||
__u32 code; /* transaction command */
|
||||
|
||||
/* General information about the transaction. */
|
||||
__u32 flags;
|
||||
pid_t sender_pid;
|
||||
uid_t sender_euid;
|
||||
binder_size_t data_size; /* number of bytes of data */
|
||||
binder_size_t offsets_size; /* number of bytes of offsets */
|
||||
|
||||
/* If this transaction is inline, the data immediately
|
||||
* follows here; otherwise, it ends with a pointer to
|
||||
* the data buffer.
|
||||
*/
|
||||
union {
|
||||
struct {
|
||||
/* transaction data */
|
||||
binder_uintptr_t buffer;
|
||||
/* offsets from buffer to flat_binder_object structs */
|
||||
binder_uintptr_t offsets;
|
||||
} ptr;
|
||||
__u8 buf[8];
|
||||
} data;
|
||||
};
|
||||
|
||||
struct binder_transaction_data_secctx {
|
||||
struct binder_transaction_data transaction_data;
|
||||
binder_uintptr_t secctx;
|
||||
};
|
||||
|
||||
struct binder_transaction_data_sg {
|
||||
struct binder_transaction_data transaction_data;
|
||||
binder_size_t buffers_size;
|
||||
};
|
||||
|
||||
struct binder_ptr_cookie {
|
||||
binder_uintptr_t ptr;
|
||||
binder_uintptr_t cookie;
|
||||
};
|
||||
|
||||
struct binder_handle_cookie {
|
||||
__u32 handle;
|
||||
binder_uintptr_t cookie;
|
||||
} __packed;
|
||||
|
||||
struct binder_pri_desc {
|
||||
__s32 priority;
|
||||
__u32 desc;
|
||||
};
|
||||
|
||||
struct binder_pri_ptr_cookie {
|
||||
__s32 priority;
|
||||
binder_uintptr_t ptr;
|
||||
binder_uintptr_t cookie;
|
||||
};
|
||||
|
||||
enum binder_driver_return_protocol {
|
||||
BR_ERROR = _IOR('r', 0, __s32),
|
||||
/*
|
||||
* int: error code
|
||||
*/
|
||||
|
||||
BR_OK = _IO('r', 1),
|
||||
/* No parameters! */
|
||||
|
||||
BR_TRANSACTION_SEC_CTX = _IOR('r', 2,
|
||||
struct binder_transaction_data_secctx),
|
||||
/*
|
||||
* binder_transaction_data_secctx: the received command.
|
||||
*/
|
||||
BR_TRANSACTION = _IOR('r', 2, struct binder_transaction_data),
|
||||
BR_REPLY = _IOR('r', 3, struct binder_transaction_data),
|
||||
/*
|
||||
* binder_transaction_data: the received command.
|
||||
*/
|
||||
|
||||
BR_ACQUIRE_RESULT = _IOR('r', 4, __s32),
|
||||
/*
|
||||
* not currently supported
|
||||
* int: 0 if the last bcATTEMPT_ACQUIRE was not successful.
|
||||
* Else the remote object has acquired a primary reference.
|
||||
*/
|
||||
|
||||
BR_DEAD_REPLY = _IO('r', 5),
|
||||
/*
|
||||
* The target of the last transaction (either a bcTRANSACTION or
|
||||
* a bcATTEMPT_ACQUIRE) is no longer with us. No parameters.
|
||||
*/
|
||||
|
||||
BR_TRANSACTION_COMPLETE = _IO('r', 6),
|
||||
/*
|
||||
* No parameters... always refers to the last transaction requested
|
||||
* (including replies). Note that this will be sent even for
|
||||
* asynchronous transactions.
|
||||
*/
|
||||
|
||||
BR_INCREFS = _IOR('r', 7, struct binder_ptr_cookie),
|
||||
BR_ACQUIRE = _IOR('r', 8, struct binder_ptr_cookie),
|
||||
BR_RELEASE = _IOR('r', 9, struct binder_ptr_cookie),
|
||||
BR_DECREFS = _IOR('r', 10, struct binder_ptr_cookie),
|
||||
/*
|
||||
* void *: ptr to binder
|
||||
* void *: cookie for binder
|
||||
*/
|
||||
|
||||
BR_ATTEMPT_ACQUIRE = _IOR('r', 11, struct binder_pri_ptr_cookie),
|
||||
/*
|
||||
* not currently supported
|
||||
* int: priority
|
||||
* void *: ptr to binder
|
||||
* void *: cookie for binder
|
||||
*/
|
||||
|
||||
BR_NOOP = _IO('r', 12),
|
||||
/*
|
||||
* No parameters. Do nothing and examine the next command. It exists
|
||||
* primarily so that we can replace it with a BR_SPAWN_LOOPER command.
|
||||
*/
|
||||
|
||||
BR_SPAWN_LOOPER = _IO('r', 13),
|
||||
/*
|
||||
* No parameters. The driver has determined that a process has no
|
||||
* threads waiting to service incoming transactions. When a process
|
||||
* receives this command, it must spawn a new service thread and
|
||||
* register it via bcENTER_LOOPER.
|
||||
*/
|
||||
|
||||
BR_FINISHED = _IO('r', 14),
|
||||
/*
|
||||
* not currently supported
|
||||
* stop threadpool thread
|
||||
*/
|
||||
|
||||
BR_DEAD_BINDER = _IOR('r', 15, binder_uintptr_t),
|
||||
/*
|
||||
* void *: cookie
|
||||
*/
|
||||
BR_CLEAR_DEATH_NOTIFICATION_DONE = _IOR('r', 16, binder_uintptr_t),
|
||||
/*
|
||||
* void *: cookie
|
||||
*/
|
||||
|
||||
BR_FAILED_REPLY = _IO('r', 17),
|
||||
/*
|
||||
* The last transaction (either a bcTRANSACTION or
|
||||
* a bcATTEMPT_ACQUIRE) failed (e.g. out of memory). No parameters.
|
||||
*/
|
||||
};
|
||||
|
||||
enum binder_driver_command_protocol {
|
||||
BC_TRANSACTION = _IOW('c', 0, struct binder_transaction_data),
|
||||
BC_REPLY = _IOW('c', 1, struct binder_transaction_data),
|
||||
/*
|
||||
* binder_transaction_data: the sent command.
|
||||
*/
|
||||
|
||||
BC_ACQUIRE_RESULT = _IOW('c', 2, __s32),
|
||||
/*
|
||||
* not currently supported
|
||||
* int: 0 if the last BR_ATTEMPT_ACQUIRE was not successful.
|
||||
* Else you have acquired a primary reference on the object.
|
||||
*/
|
||||
|
||||
BC_FREE_BUFFER = _IOW('c', 3, binder_uintptr_t),
|
||||
/*
|
||||
* void *: ptr to transaction data received on a read
|
||||
*/
|
||||
|
||||
BC_INCREFS = _IOW('c', 4, __u32),
|
||||
BC_ACQUIRE = _IOW('c', 5, __u32),
|
||||
BC_RELEASE = _IOW('c', 6, __u32),
|
||||
BC_DECREFS = _IOW('c', 7, __u32),
|
||||
/*
|
||||
* int: descriptor
|
||||
*/
|
||||
|
||||
BC_INCREFS_DONE = _IOW('c', 8, struct binder_ptr_cookie),
|
||||
BC_ACQUIRE_DONE = _IOW('c', 9, struct binder_ptr_cookie),
|
||||
/*
|
||||
* void *: ptr to binder
|
||||
* void *: cookie for binder
|
||||
*/
|
||||
|
||||
BC_ATTEMPT_ACQUIRE = _IOW('c', 10, struct binder_pri_desc),
|
||||
/*
|
||||
* not currently supported
|
||||
* int: priority
|
||||
* int: descriptor
|
||||
*/
|
||||
|
||||
BC_REGISTER_LOOPER = _IO('c', 11),
|
||||
/*
|
||||
* No parameters.
|
||||
* Register a spawned looper thread with the device.
|
||||
*/
|
||||
|
||||
BC_ENTER_LOOPER = _IO('c', 12),
|
||||
BC_EXIT_LOOPER = _IO('c', 13),
|
||||
/*
|
||||
* No parameters.
|
||||
* These two commands are sent as an application-level thread
|
||||
* enters and exits the binder loop, respectively. They are
|
||||
* used so the binder can have an accurate count of the number
|
||||
* of looping threads it has available.
|
||||
*/
|
||||
|
||||
BC_REQUEST_DEATH_NOTIFICATION = _IOW('c', 14,
|
||||
struct binder_handle_cookie),
|
||||
/*
|
||||
* int: handle
|
||||
* void *: cookie
|
||||
*/
|
||||
|
||||
BC_CLEAR_DEATH_NOTIFICATION = _IOW('c', 15,
|
||||
struct binder_handle_cookie),
|
||||
/*
|
||||
* int: handle
|
||||
* void *: cookie
|
||||
*/
|
||||
|
||||
BC_DEAD_BINDER_DONE = _IOW('c', 16, binder_uintptr_t),
|
||||
/*
|
||||
* void *: cookie
|
||||
*/
|
||||
|
||||
BC_TRANSACTION_SG = _IOW('c', 17, struct binder_transaction_data_sg),
|
||||
BC_REPLY_SG = _IOW('c', 18, struct binder_transaction_data_sg),
|
||||
/*
|
||||
* binder_transaction_data_sg: the sent command.
|
||||
*/
|
||||
};
|
||||
|
||||
#endif /* _UAPI_LINUX_BINDER_H */
|
||||
|
@ -181,17 +181,7 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);

#include <uapi/linux/types.h>

static __always_inline void data_access_exceeds_word_size(void)
#ifdef __compiletime_warning
__compiletime_warning("data access exceeds word size and won't be atomic")
#endif
;

static __always_inline void data_access_exceeds_word_size(void)
{
}

static __always_inline void __read_once_size(volatile void *p, void *res, int size)
static __always_inline void __read_once_size(const volatile void *p, void *res, int size)
{
switch (size) {
case 1: *(__u8 *)res = *(volatile __u8 *)p; break;
@ -203,7 +193,6 @@ static __always_inline void __read_once_size(volatile void *p, void *res, int si
default:
barrier();
__builtin_memcpy((void *)res, (const void *)p, size);
data_access_exceeds_word_size();
barrier();
}
}
@ -220,7 +209,6 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
default:
barrier();
__builtin_memcpy((void *)p, (const void *)res, size);
data_access_exceeds_word_size();
barrier();
}
}
@ -248,10 +236,10 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
*/

#define READ_ONCE(x) \
({ typeof(x) __val; __read_once_size(&x, &__val, sizeof(__val)); __val; })
({ union { typeof(x) __val; char __c[1]; } __u; __read_once_size(&(x), __u.__c, sizeof(x)); __u.__val; })

#define WRITE_ONCE(x, val) \
({ typeof(x) __val; __val = val; __write_once_size(&x, &__val, sizeof(__val)); __val; })
({ typeof(x) __val = (val); __write_once_size(&(x), &__val, sizeof(__val)); __val; })

#endif /* __KERNEL__ */
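
The union/__c trick above lets READ_ONCE return a value of any type through a single full-width access, and WRITE_ONCE forces one untorn store. An illustrative use for a flag shared between two contexts (worker_state and the helpers are made-up names):

struct worker_state {
    int should_stop;
};

static void worker_loop(struct worker_state *ws)
{
    /* exactly one load per iteration; the compiler cannot cache the value */
    while (!READ_ONCE(ws->should_stop))
        cpu_relax();
}

static void worker_request_stop(struct worker_state *ws)
{
    /* single store, visible to the polling side as a whole */
    WRITE_ONCE(ws->should_stop, 1);
}
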
@ -388,6 +388,17 @@ static inline void list_splice_tail_init(struct list_head *list,
#define list_prev_entry(pos, member) \
list_entry((pos)->member.prev, typeof(*(pos)), member)

/**
* list_first_entry_or_null - get the first element from a list
* @ptr: the list head to take the element from.
* @type: the type of the struct this is embedded in.
* @member: the name of the list_struct within the struct.
*
* Note that if the list is empty, it returns NULL.
*/
#define list_first_entry_or_null(ptr, type, member) \
(!list_empty(ptr) ? list_first_entry(ptr, type, member) : NULL)

/**
* list_for_each - iterate over a list
* @pos: the &struct list_head to use as a loop cursor.
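
list_first_entry_or_null collapses the usual "check for empty, then take the first entry" pattern into one expression. A short illustrative helper (struct job and its queue are hypothetical; the caller is assumed to hold whatever lock protects the list):

struct job {
    struct list_head node;
    int id;
};

/* pop the oldest job, or return NULL if the queue is empty */
static struct job *job_dequeue(struct list_head *queue)
{
    struct job *j = list_first_entry_or_null(queue, struct job, node);

    if (j)
        list_del_init(&j->node);
    return j;
}
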
137
include/linux/list_lru.h
Normal file
@ -0,0 +1,137 @@
|
||||
/*
|
||||
* Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved.
|
||||
* Authors: David Chinner and Glauber Costa
|
||||
*
|
||||
* Generic LRU infrastructure
|
||||
*/
|
||||
#ifndef _LRU_LIST_H
|
||||
#define _LRU_LIST_H
|
||||
|
||||
#include <linux/list.h>
|
||||
#include <linux/nodemask.h>
|
||||
|
||||
/* list_lru_walk_cb has to always return one of those */
|
||||
enum lru_status {
|
||||
LRU_REMOVED, /* item removed from list */
|
||||
LRU_REMOVED_RETRY, /* item removed, but lock has been
|
||||
dropped and reacquired */
|
||||
LRU_ROTATE, /* item referenced, give another pass */
|
||||
LRU_SKIP, /* item cannot be locked, skip */
|
||||
LRU_RETRY, /* item not freeable. May drop the lock
|
||||
internally, but has to return locked. */
|
||||
};
|
||||
|
||||
struct list_lru_node {
|
||||
spinlock_t lock;
|
||||
struct list_head list;
|
||||
/* kept as signed so we can catch imbalance bugs */
|
||||
long nr_items;
|
||||
} ____cacheline_aligned_in_smp;
|
||||
|
||||
struct list_lru {
|
||||
struct list_lru_node *node;
|
||||
nodemask_t active_nodes;
|
||||
};
|
||||
|
||||
void list_lru_destroy(struct list_lru *lru);
|
||||
int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key);
|
||||
static inline int list_lru_init(struct list_lru *lru)
|
||||
{
|
||||
return list_lru_init_key(lru, NULL);
|
||||
}
|
||||
|
||||
/**
|
||||
* list_lru_add: add an element to the lru list's tail
|
||||
* @list_lru: the lru pointer
|
||||
* @item: the item to be added.
|
||||
*
|
||||
* If the element is already part of a list, this function returns doing
|
||||
* nothing. Therefore the caller does not need to keep state about whether or
|
||||
* not the element already belongs in the list and is allowed to lazy update
|
||||
* it. Note however that this is valid for *a* list, not *this* list. If
|
||||
* the caller organizes itself in a way that elements can be in more than
|
||||
* one type of list, it is up to the caller to fully remove the item from
|
||||
* the previous list (with list_lru_del() for instance) before moving it
|
||||
* to @list_lru
|
||||
*
|
||||
* Return value: true if the list was updated, false otherwise
|
||||
*/
|
||||
bool list_lru_add(struct list_lru *lru, struct list_head *item);
|
||||
|
||||
/**
|
||||
* list_lru_del: delete an element to the lru list
|
||||
* @list_lru: the lru pointer
|
||||
* @item: the item to be deleted.
|
||||
*
|
||||
* This function works analogously as list_lru_add in terms of list
|
||||
* manipulation. The comments about an element already pertaining to
|
||||
* a list are also valid for list_lru_del.
|
||||
*
|
||||
* Return value: true if the list was updated, false otherwise
|
||||
*/
|
||||
bool list_lru_del(struct list_lru *lru, struct list_head *item);
|
||||
|
||||
/**
|
||||
* list_lru_count_node: return the number of objects currently held by @lru
|
||||
* @lru: the lru pointer.
|
||||
* @nid: the node id to count from.
|
||||
*
|
||||
* Always return a non-negative number, 0 for empty lists. There is no
|
||||
* guarantee that the list is not updated while the count is being computed.
|
||||
* Callers that want such a guarantee need to provide an outer lock.
|
||||
*/
|
||||
unsigned long list_lru_count_node(struct list_lru *lru, int nid);
|
||||
static inline unsigned long list_lru_count(struct list_lru *lru)
|
||||
{
|
||||
long count = 0;
|
||||
int nid;
|
||||
|
||||
for_each_node_mask(nid, lru->active_nodes)
|
||||
count += list_lru_count_node(lru, nid);
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
typedef enum lru_status
|
||||
(*list_lru_walk_cb)(struct list_head *item, spinlock_t *lock, void *cb_arg);
|
||||
/**
|
||||
* list_lru_walk_node: walk a list_lru, isolating and disposing freeable items.
|
||||
* @lru: the lru pointer.
|
||||
* @nid: the node id to scan from.
|
||||
* @isolate: callback function that is responsible for deciding what to do with
|
||||
* the item currently being scanned
|
||||
* @cb_arg: opaque type that will be passed to @isolate
|
||||
* @nr_to_walk: how many items to scan.
|
||||
*
|
||||
* This function will scan all elements in a particular list_lru, calling the
|
||||
* @isolate callback for each of those items, along with the current list
|
||||
* spinlock and a caller-provided opaque. The @isolate callback can choose to
|
||||
* drop the lock internally, but *must* return with the lock held. The callback
|
||||
* will return an enum lru_status telling the list_lru infrastructure what to
|
||||
* do with the object being scanned.
|
||||
*
|
||||
* Please note that nr_to_walk does not mean how many objects will be freed,
|
||||
* just how many objects will be scanned.
|
||||
*
|
||||
* Return value: the number of objects effectively removed from the LRU.
|
||||
*/
|
||||
unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
|
||||
list_lru_walk_cb isolate, void *cb_arg,
|
||||
unsigned long *nr_to_walk);
|
||||
|
||||
static inline unsigned long
|
||||
list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate,
|
||||
void *cb_arg, unsigned long nr_to_walk)
|
||||
{
|
||||
long isolated = 0;
|
||||
int nid;
|
||||
|
||||
for_each_node_mask(nid, lru->active_nodes) {
|
||||
isolated += list_lru_walk_node(lru, nid, isolate,
|
||||
cb_arg, &nr_to_walk);
|
||||
if (nr_to_walk <= 0)
|
||||
break;
|
||||
}
|
||||
return isolated;
|
||||
}
|
||||
#endif /* _LRU_LIST_H */
|
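
Putting the API together: a cache initializes one list_lru, adds objects as they become reclaimable, and disposes of them from a walk whose callback decides each item's fate. A hedged sketch; struct cache_obj, the kfree policy and the absence of per-object locking are illustrative assumptions:

struct cache_obj {
    struct list_head lru;
    /* ... payload ... */
};

static struct list_lru cache_lru;

static enum lru_status cache_isolate(struct list_head *item,
                                     spinlock_t *lock, void *cb_arg)
{
    struct cache_obj *obj = container_of(item, struct cache_obj, lru);

    /* the walk holds 'lock'; the callback removes the item itself */
    list_del_init(item);
    kfree(obj);
    return LRU_REMOVED;
}

static int cache_setup(void)
{
    return list_lru_init(&cache_lru);   /* 0 on success, -ENOMEM otherwise */
}

static unsigned long cache_shrink_some(unsigned long nr_to_walk)
{
    return list_lru_walk(&cache_lru, cache_isolate, NULL, nr_to_walk);
}
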
@ -1560,6 +1560,62 @@ struct task_struct {
/* Future-safe accessor for struct task_struct's cpus_allowed. */
#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)

#define MAX_NICE 19
#define MIN_NICE -20
#define NICE_WIDTH (MAX_NICE - MIN_NICE + 1)

/*
* Priority of a process goes from 0..MAX_PRIO-1, valid RT
* priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
* tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
* values are inverted: lower p->prio value means higher priority.
*
* The MAX_USER_RT_PRIO value allows the actual maximum
* RT priority to be separate from the value exported to
* user-space. This allows kernel threads to set their
* priority to a value higher than any user task. Note:
* MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
*/
#define MAX_USER_RT_PRIO 100
#define MAX_RT_PRIO MAX_USER_RT_PRIO

#define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH)
#define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2)

/*
* Convert user-nice values [ -20 ... 0 ... 19 ]
* to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
* and back.
*/
#define NICE_TO_PRIO(nice) ((nice) + DEFAULT_PRIO)
#define PRIO_TO_NICE(prio) ((prio) - DEFAULT_PRIO)
#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)

/*
* 'User priority' is the nice value converted to something we
* can work with better when scaling various scheduler parameters,
* it's a [ 0 ... 39 ] range.
*/
#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))

/*
* Convert nice value [19,-20] to rlimit style value [1,40].
*/
static inline long nice_to_rlimit(long nice)
{
return (MAX_NICE - nice + 1);
}

/*
* Convert rlimit style value [1,40] to nice value [-20, 19].
*/
static inline long rlimit_to_nice(long prio)
{
return (MAX_NICE - prio + 1);
}

#ifdef CONFIG_NUMA_BALANCING
extern void task_numa_fault(int node, int pages, bool migrated);
extern void set_numabalancing_state(bool enabled);
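
With MAX_RT_PRIO = 100 and NICE_WIDTH = 40, the constants above give MAX_PRIO = 140 and DEFAULT_PRIO = 120, so the conversions work out as follows (spot checks for illustration only):

NICE_TO_PRIO(-20) == 100    /* first non-RT priority */
NICE_TO_PRIO(0)   == 120    /* DEFAULT_PRIO */
NICE_TO_PRIO(19)  == 139    /* MAX_PRIO - 1 */
PRIO_TO_NICE(120) == 0
nice_to_rlimit(-20) == 40   /* rlimit scale runs 1..40 */
nice_to_rlimit(19)  == 1
rlimit_to_nice(40)  == -20
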
@ -1,24 +1,9 @@
#ifndef _SCHED_RT_H
#define _SCHED_RT_H

/*
* Priority of a process goes from 0..MAX_PRIO-1, valid RT
* priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
* tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
* values are inverted: lower p->prio value means higher priority.
*
* The MAX_USER_RT_PRIO value allows the actual maximum
* RT priority to be separate from the value exported to
* user-space. This allows kernel threads to set their
* priority to a value higher than any user task. Note:
* MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
*/

#define MAX_USER_RT_PRIO 100
#define MAX_RT_PRIO MAX_USER_RT_PRIO

#define MAX_PRIO (MAX_RT_PRIO + 40)
#define DEFAULT_PRIO (MAX_RT_PRIO + 20)
#define MAX_NICE 19
#define MIN_NICE -20
#define NICE_WIDTH (MAX_NICE - MIN_NICE + 1)

static inline int rt_prio(int prio)
{
@ -4,6 +4,12 @@
/*
* This struct is used to pass information from page reclaim to the shrinkers.
* We consolidate the values for easier extension later.
*
* The 'gfpmask' refers to the allocation we are currently trying to
* fulfil.
*
* Note that 'shrink' will be passed nr_to_scan == 0 when the VM is
* querying the cache size, so a fastpath for that case is appropriate.
*/
struct shrink_control {
gfp_t gfp_mask;
@ -12,23 +18,37 @@ struct shrink_control {
unsigned long nr_to_scan;
};

#define SHRINK_STOP (~0UL)
/*
* A callback you can register to apply pressure to ageable caches.
*
* 'sc' is passed shrink_control which includes a count 'nr_to_scan'
* and a 'gfpmask'. It should look through the least-recently-used
* 'nr_to_scan' entries and attempt to free them up. It should return
* the number of objects which remain in the cache. If it returns -1, it means
* it cannot do any scanning at this time (eg. there is a risk of deadlock).
* @shrink() should look through the least-recently-used 'nr_to_scan' entries
* and attempt to free them up. It should return the number of objects which
* remain in the cache. If it returns -1, it means it cannot do any scanning at
* this time (eg. there is a risk of deadlock).
*
* The 'gfpmask' refers to the allocation we are currently trying to
* fulfil.
* @count_objects should return the number of freeable items in the cache. If
* there are no objects to free or the number of freeable items cannot be
* determined, it should return 0. No deadlock checks should be done during the
* count callback - the shrinker relies on aggregating scan counts that couldn't
* be executed due to potential deadlocks to be run at a later call when the
* deadlock condition is no longer pending.
*
* Note that 'shrink' will be passed nr_to_scan == 0 when the VM is
* querying the cache size, so a fastpath for that case is appropriate.
* @scan_objects will only be called if @count_objects returned a non-zero
* value for the number of freeable objects. The callout should scan the cache
* and attempt to free items from the cache. It should then return the number
* of objects freed during the scan, or SHRINK_STOP if progress cannot be made
* due to potential deadlocks. If SHRINK_STOP is returned, then no further
* attempts to call the @scan_objects will be made from the current reclaim
* context.
*/
struct shrinker {
int (*shrink)(struct shrinker *, struct shrink_control *sc);
unsigned long (*count_objects)(struct shrinker *,
struct shrink_control *sc);
unsigned long (*scan_objects)(struct shrinker *,
struct shrink_control *sc);

int seeks; /* seeks to recreate an obj */
long batch; /* reclaim batch size, 0 = default */
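
Under the split interface a shrinker pairs a cheap count with a scan that either frees objects or bails out with SHRINK_STOP. A minimal hedged sketch; my_cache_nr_freeable, my_cache_lock and my_cache_evict() are placeholders for a real cache's state:

static unsigned long my_cache_count(struct shrinker *s,
                                    struct shrink_control *sc)
{
    return my_cache_nr_freeable;        /* 0 means "skip the scan callback" */
}

static unsigned long my_cache_scan(struct shrinker *s,
                                   struct shrink_control *sc)
{
    unsigned long freed;

    if (!spin_trylock(&my_cache_lock))
        return SHRINK_STOP;             /* cannot make progress right now */

    freed = my_cache_evict(sc->nr_to_scan);
    spin_unlock(&my_cache_lock);
    return freed;
}

static struct shrinker my_cache_shrinker = {
    .count_objects = my_cache_count,
    .scan_objects  = my_cache_scan,
    .seeks         = DEFAULT_SEEKS,
};

/* registered once at init: register_shrinker(&my_cache_shrinker); */
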
@ -1,50 +0,0 @@
|
||||
#ifndef __LINUX_VMPRESSURE_H
|
||||
#define __LINUX_VMPRESSURE_H
|
||||
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/workqueue.h>
|
||||
#include <linux/gfp.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/cgroup.h>
|
||||
|
||||
struct vmpressure {
|
||||
unsigned long scanned;
|
||||
unsigned long reclaimed;
|
||||
unsigned long stall;
|
||||
/* The lock is used to keep the scanned/reclaimed above in sync. */
|
||||
struct mutex sr_lock;
|
||||
|
||||
/* The list of vmpressure_event structs. */
|
||||
struct list_head events;
|
||||
/* Have to grab the lock on events traversal or modifications. */
|
||||
struct mutex events_lock;
|
||||
|
||||
struct work_struct work;
|
||||
};
|
||||
|
||||
struct mem_cgroup;
|
||||
|
||||
extern int vmpressure_notifier_register(struct notifier_block *nb);
|
||||
extern int vmpressure_notifier_unregister(struct notifier_block *nb);
|
||||
extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
|
||||
unsigned long scanned, unsigned long reclaimed);
|
||||
extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio);
|
||||
|
||||
#ifdef CONFIG_MEMCG
|
||||
extern void vmpressure_init(struct vmpressure *vmpr);
|
||||
extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg);
|
||||
extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr);
|
||||
extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css);
|
||||
extern int vmpressure_register_event(struct cgroup *cg, struct cftype *cft,
|
||||
struct eventfd_ctx *eventfd,
|
||||
const char *args);
|
||||
extern void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft,
|
||||
struct eventfd_ctx *eventfd);
|
||||
#else
|
||||
static inline struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
#endif /* CONFIG_MEMCG */
|
||||
#endif /* __LINUX_VMPRESSURE_H */
|
@ -1,84 +0,0 @@
|
||||
/* Copyright (c) 2015, The Linux Foundation. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 and
|
||||
* only version 2 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*/
|
||||
|
||||
#undef TRACE_SYSTEM
|
||||
#define TRACE_SYSTEM almk
|
||||
|
||||
#if !defined(_TRACE_EVENT_ALMK_H) || defined(TRACE_HEADER_MULTI_READ)
|
||||
#define _TRACE_EVENT_ALMK_H
|
||||
|
||||
#include <linux/tracepoint.h>
|
||||
#include <linux/types.h>
|
||||
|
||||
TRACE_EVENT(almk_vmpressure,
|
||||
|
||||
TP_PROTO(unsigned long pressure,
|
||||
int other_free,
|
||||
int other_file),
|
||||
|
||||
TP_ARGS(pressure, other_free, other_file),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(unsigned long, pressure)
|
||||
__field(int, other_free)
|
||||
__field(int, other_file)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->pressure = pressure;
|
||||
__entry->other_free = other_free;
|
||||
__entry->other_file = other_file;
|
||||
),
|
||||
|
||||
TP_printk("%lu, %d, %d",
|
||||
__entry->pressure, __entry->other_free,
|
||||
__entry->other_file)
|
||||
);
|
||||
|
||||
TRACE_EVENT(almk_shrink,
|
||||
|
||||
TP_PROTO(int tsize,
|
||||
int vmp,
|
||||
int other_free,
|
||||
int other_file,
|
||||
short adj),
|
||||
|
||||
TP_ARGS(tsize, vmp, other_free, other_file, adj),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(int, tsize)
|
||||
__field(int, vmp)
|
||||
__field(int, other_free)
|
||||
__field(int, other_file)
|
||||
__field(short, adj)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->tsize = tsize;
|
||||
__entry->vmp = vmp;
|
||||
__entry->other_free = other_free;
|
||||
__entry->other_file = other_file;
|
||||
__entry->adj = adj;
|
||||
),
|
||||
|
||||
TP_printk("%d, %d, %d, %d, %d",
|
||||
__entry->tsize,
|
||||
__entry->vmp,
|
||||
__entry->other_free,
|
||||
__entry->other_file,
|
||||
__entry->adj)
|
||||
);
|
||||
|
||||
#endif
|
||||
|
||||
#include <trace/define_trace.h>
|
||||
|
@ -17,24 +17,6 @@

extern __read_mostly int scheduler_running;

/*
* Convert user-nice values [ -20 ... 0 ... 19 ]
* to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
* and back.
*/
#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)

/*
* 'User priority' is the nice value converted to something we
* can work with better when scaling various scheduler parameters,
* it's a [ 0 ... 39 ] range.
*/
#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))

/*
* Helpers for converting nanosecond timing to jiffy resolution
*/
@ -17,8 +17,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
compaction.o balloon_compaction.o \
interval_tree.o $(mmu-y) \
showmem.o vmpressure.o
interval_tree.o showmem.o list_lru.o $(mmu-y)

obj-y += init-mm.o
152
mm/list_lru.c
Normal file
@ -0,0 +1,152 @@
|
||||
/*
|
||||
* Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved.
|
||||
* Authors: David Chinner and Glauber Costa
|
||||
*
|
||||
* Generic LRU infrastructure
|
||||
*/
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/list_lru.h>
|
||||
#include <linux/slab.h>
|
||||
|
||||
bool list_lru_add(struct list_lru *lru, struct list_head *item)
|
||||
{
|
||||
int nid = page_to_nid(virt_to_page(item));
|
||||
struct list_lru_node *nlru = &lru->node[nid];
|
||||
|
||||
spin_lock(&nlru->lock);
|
||||
WARN_ON_ONCE(nlru->nr_items < 0);
|
||||
if (list_empty(item)) {
|
||||
list_add_tail(item, &nlru->list);
|
||||
if (nlru->nr_items++ == 0)
|
||||
node_set(nid, lru->active_nodes);
|
||||
spin_unlock(&nlru->lock);
|
||||
return true;
|
||||
}
|
||||
spin_unlock(&nlru->lock);
|
||||
return false;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(list_lru_add);
|
||||
|
||||
bool list_lru_del(struct list_lru *lru, struct list_head *item)
|
||||
{
|
||||
int nid = page_to_nid(virt_to_page(item));
|
||||
struct list_lru_node *nlru = &lru->node[nid];
|
||||
|
||||
spin_lock(&nlru->lock);
|
||||
if (!list_empty(item)) {
|
||||
list_del_init(item);
|
||||
if (--nlru->nr_items == 0)
|
||||
node_clear(nid, lru->active_nodes);
|
||||
WARN_ON_ONCE(nlru->nr_items < 0);
|
||||
spin_unlock(&nlru->lock);
|
||||
return true;
|
||||
}
|
||||
spin_unlock(&nlru->lock);
|
||||
return false;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(list_lru_del);
|
||||
|
||||
unsigned long
|
||||
list_lru_count_node(struct list_lru *lru, int nid)
|
||||
{
|
||||
unsigned long count = 0;
|
||||
struct list_lru_node *nlru = &lru->node[nid];
|
||||
|
||||
spin_lock(&nlru->lock);
|
||||
WARN_ON_ONCE(nlru->nr_items < 0);
|
||||
count += nlru->nr_items;
|
||||
spin_unlock(&nlru->lock);
|
||||
|
||||
return count;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(list_lru_count_node);
|
||||
|
||||
unsigned long
|
||||
list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate,
|
||||
void *cb_arg, unsigned long *nr_to_walk)
|
||||
{
|
||||
|
||||
struct list_lru_node *nlru = &lru->node[nid];
|
||||
struct list_head *item, *n;
|
||||
unsigned long isolated = 0;
|
||||
|
||||
spin_lock(&nlru->lock);
|
||||
restart:
|
||||
list_for_each_safe(item, n, &nlru->list) {
|
||||
enum lru_status ret;
|
||||
|
||||
/*
|
||||
* decrement nr_to_walk first so that we don't livelock if we
|
||||
* get stuck on large numbers of LRU_RETRY items
|
||||
*/
|
||||
if (!*nr_to_walk)
|
||||
break;
|
||||
--*nr_to_walk;
|
||||
|
||||
ret = isolate(item, &nlru->lock, cb_arg);
|
||||
switch (ret) {
|
||||
case LRU_REMOVED_RETRY:
|
||||
assert_spin_locked(&nlru->lock);
|
||||
case LRU_REMOVED:
|
||||
if (--nlru->nr_items == 0)
|
||||
node_clear(nid, lru->active_nodes);
|
||||
WARN_ON_ONCE(nlru->nr_items < 0);
|
||||
isolated++;
|
||||
/*
|
||||
* If the lru lock has been dropped, our list
|
||||
* traversal is now invalid and so we have to
|
||||
* restart from scratch.
|
||||
*/
|
||||
if (ret == LRU_REMOVED_RETRY)
|
||||
goto restart;
|
||||
break;
|
||||
case LRU_ROTATE:
|
||||
list_move_tail(item, &nlru->list);
|
||||
break;
|
||||
case LRU_SKIP:
|
||||
break;
|
||||
case LRU_RETRY:
|
||||
/*
|
||||
* The lru lock has been dropped, our list traversal is
|
||||
* now invalid and so we have to restart from scratch.
|
||||
*/
|
||||
assert_spin_locked(&nlru->lock);
|
||||
goto restart;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
spin_unlock(&nlru->lock);
|
||||
return isolated;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(list_lru_walk_node);
|
||||
|
||||
int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key)
|
||||
{
|
||||
int i;
|
||||
size_t size = sizeof(*lru->node) * nr_node_ids;
|
||||
|
||||
lru->node = kzalloc(size, GFP_KERNEL);
|
||||
if (!lru->node)
|
||||
return -ENOMEM;
|
||||
|
||||
nodes_clear(lru->active_nodes);
|
||||
for (i = 0; i < nr_node_ids; i++) {
|
||||
spin_lock_init(&lru->node[i].lock);
|
||||
if (key)
|
||||
lockdep_set_class(&lru->node[i].lock, key);
|
||||
INIT_LIST_HEAD(&lru->node[i].list);
|
||||
lru->node[i].nr_items = 0;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(list_lru_init_key);
|
||||
|
||||
void list_lru_destroy(struct list_lru *lru)
|
||||
{
|
||||
kfree(lru->node);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(list_lru_destroy);
|
@ -49,7 +49,6 @@
|
||||
#include <linux/fs.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <linux/vmpressure.h>
|
||||
#include <linux/mm_inline.h>
|
||||
#include <linux/page_cgroup.h>
|
||||
#include <linux/cpu.h>
|
||||
@ -264,9 +263,6 @@ struct mem_cgroup {
|
||||
*/
|
||||
struct res_counter res;
|
||||
|
||||
/* vmpressure notifications */
|
||||
struct vmpressure vmpressure;
|
||||
|
||||
union {
|
||||
/*
|
||||
* the counter to account for mem+swap usage.
|
||||
@ -366,7 +362,6 @@ struct mem_cgroup {
|
||||
atomic_t numainfo_events;
|
||||
atomic_t numainfo_updating;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Per cgroup active and inactive list, similar to the
|
||||
* per zone LRU lists.
|
||||
@ -518,24 +513,6 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
|
||||
return container_of(s, struct mem_cgroup, css);
|
||||
}
|
||||
|
||||
/* Some nice accessors for the vmpressure. */
|
||||
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
|
||||
{
|
||||
if (!memcg)
|
||||
memcg = root_mem_cgroup;
|
||||
return &memcg->vmpressure;
|
||||
}
|
||||
|
||||
struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
|
||||
{
|
||||
return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
|
||||
}
|
||||
|
||||
struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
|
||||
{
|
||||
return &mem_cgroup_from_css(css)->vmpressure;
|
||||
}
|
||||
|
||||
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
|
||||
{
|
||||
return (memcg == root_mem_cgroup);
|
||||
@ -6001,11 +5978,6 @@ static struct cftype mem_cgroup_files[] = {
|
||||
.unregister_event = mem_cgroup_oom_unregister_event,
|
||||
.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
|
||||
},
|
||||
{
|
||||
.name = "pressure_level",
|
||||
.register_event = vmpressure_register_event,
|
||||
.unregister_event = vmpressure_unregister_event,
|
||||
},
|
||||
#ifdef CONFIG_NUMA
|
||||
{
|
||||
.name = "numa_stat",
|
||||
@ -6287,7 +6259,6 @@ mem_cgroup_css_alloc(struct cgroup *cont)
|
||||
memcg->move_charge_at_immigrate = 0;
|
||||
mutex_init(&memcg->thresholds_lock);
|
||||
spin_lock_init(&memcg->move_lock);
|
||||
vmpressure_init(&memcg->vmpressure);
|
||||
|
||||
return &memcg->css;
|
||||
|
||||
|
502
mm/vmpressure.c
@ -1,502 +0,0 @@
|
||||
/*
|
||||
* Linux VM pressure
|
||||
*
|
||||
* Copyright 2012 Linaro Ltd.
|
||||
* Anton Vorontsov <anton.vorontsov@linaro.org>
|
||||
*
|
||||
* Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro,
|
||||
* Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published
|
||||
* by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/cgroup.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/log2.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/vmstat.h>
|
||||
#include <linux/eventfd.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/printk.h>
|
||||
#include <linux/notifier.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/vmpressure.h>
|
||||
|
||||
/*
|
||||
* The window size (vmpressure_win) is the number of scanned pages before
|
||||
* we try to analyze scanned/reclaimed ratio. So the window is used as a
|
||||
* rate-limit tunable for the "low" level notification, and also for
|
||||
* averaging the ratio for medium/critical levels. Using small window
|
||||
* sizes can cause lot of false positives, but too big window size will
|
||||
* delay the notifications.
|
||||
*
|
||||
* As the vmscan reclaimer logic works with chunks which are multiple of
|
||||
* SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well.
|
||||
*
|
||||
* TODO: Make the window size depend on machine size, as we do for vmstat
|
||||
* thresholds. Currently we set it to 512 pages (2MB for 4KB pages).
|
||||
*/
|
||||
static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
|
||||
|
||||
/*
|
||||
* These thresholds are used when we account memory pressure through
|
||||
* scanned/reclaimed ratio. The current values were chosen empirically. In
|
||||
* essence, they are percents: the higher the value, the more
|
||||
* unsuccessful reclaims there were.
|
||||
*/
|
||||
static const unsigned int vmpressure_level_med = 60;
|
||||
static const unsigned int vmpressure_level_critical = 95;
|
||||
|
||||
static unsigned long vmpressure_scale_max = 100;
|
||||
module_param_named(vmpressure_scale_max, vmpressure_scale_max,
|
||||
ulong, S_IRUGO | S_IWUSR);
|
||||
|
||||
/* vmpressure values >= this will be scaled based on allocstalls */
|
||||
static unsigned long allocstall_threshold = 70;
|
||||
module_param_named(allocstall_threshold, allocstall_threshold,
|
||||
ulong, S_IRUGO | S_IWUSR);
|
||||
|
||||
static struct vmpressure global_vmpressure;
|
||||
BLOCKING_NOTIFIER_HEAD(vmpressure_notifier);
|
||||
|
||||
int vmpressure_notifier_register(struct notifier_block *nb)
|
||||
{
|
||||
return blocking_notifier_chain_register(&vmpressure_notifier, nb);
|
||||
}
|
||||
|
||||
int vmpressure_notifier_unregister(struct notifier_block *nb)
|
||||
{
|
||||
return blocking_notifier_chain_unregister(&vmpressure_notifier, nb);
|
||||
}
|
||||
|
||||
void vmpressure_notify(unsigned long pressure)
|
||||
{
|
||||
blocking_notifier_call_chain(&vmpressure_notifier, pressure, NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* When there are too little pages left to scan, vmpressure() may miss the
|
||||
* critical pressure as number of pages will be less than "window size".
|
||||
* However, in that case the vmscan priority will raise fast as the
|
||||
* reclaimer will try to scan LRUs more deeply.
|
||||
*
|
||||
* The vmscan logic considers these special priorities:
|
||||
*
|
||||
* prio == DEF_PRIORITY (12): reclaimer starts with that value
|
||||
* prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed
|
||||
* prio == 0 : close to OOM, kernel scans every page in an lru
|
||||
*
|
||||
* Any value in this range is acceptable for this tunable (i.e. from 12 to
|
||||
* 0). Current value for the vmpressure_level_critical_prio is chosen
|
||||
* empirically, but the number, in essence, means that we consider
|
||||
* critical level when scanning depth is ~10% of the lru size (vmscan
|
||||
* scans 'lru_size >> prio' pages, so it is actually 12.5%, or one
|
||||
* eighth).
|
||||
*/
|
||||
static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10);
|
||||
|
||||
static struct vmpressure *work_to_vmpressure(struct work_struct *work)
|
||||
{
|
||||
return container_of(work, struct vmpressure, work);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMCG
|
||||
static struct vmpressure *cg_to_vmpressure(struct cgroup *cg)
|
||||
{
|
||||
return css_to_vmpressure(cgroup_subsys_state(cg, mem_cgroup_subsys_id));
|
||||
}
|
||||
|
||||
static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
|
||||
{
|
||||
struct cgroup *cg = vmpressure_to_css(vmpr)->cgroup;
|
||||
struct mem_cgroup *memcg = mem_cgroup_from_cont(cg);
|
||||
|
||||
memcg = parent_mem_cgroup(memcg);
|
||||
if (!memcg)
|
||||
return NULL;
|
||||
return memcg_to_vmpressure(memcg);
|
||||
}
|
||||
#else
|
||||
static struct vmpressure *cg_to_vmpressure(struct cgroup *cg)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
#endif
|
||||
|
||||
enum vmpressure_levels {
|
||||
VMPRESSURE_LOW = 0,
|
||||
VMPRESSURE_MEDIUM,
|
||||
VMPRESSURE_CRITICAL,
|
||||
VMPRESSURE_NUM_LEVELS,
|
||||
};
|
||||
|
||||
static const char * const vmpressure_str_levels[] = {
|
||||
[VMPRESSURE_LOW] = "low",
|
||||
[VMPRESSURE_MEDIUM] = "medium",
|
||||
[VMPRESSURE_CRITICAL] = "critical",
|
||||
};
|
||||
|
||||
static enum vmpressure_levels vmpressure_level(unsigned long pressure)
|
||||
{
|
||||
if (pressure >= vmpressure_level_critical)
|
||||
return VMPRESSURE_CRITICAL;
|
||||
else if (pressure >= vmpressure_level_med)
|
||||
return VMPRESSURE_MEDIUM;
|
||||
return VMPRESSURE_LOW;
|
||||
}

static unsigned long vmpressure_calc_pressure(unsigned long scanned,
					      unsigned long reclaimed)
{
	unsigned long scale = scanned + reclaimed;
	unsigned long pressure = 0;

	/*
	 * reclaimed can be greater than scanned in cases
	 * like THP, where scanned is 1 and reclaimed
	 * could be 512
	 */
	if (reclaimed >= scanned)
		goto out;
	/*
	 * We calculate the ratio (in percent) of how many pages were
	 * scanned vs. reclaimed in a given time frame (window). Note that
	 * time is in VM reclaimer's "ticks", i.e. the number of pages
	 * scanned. This makes it possible to set a desired reaction time
	 * and serves as a ratelimit.
	 */
	pressure = scale - (reclaimed * scale / scanned);
	pressure = pressure * 100 / scale;

out:
	pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure,
		 scanned, reclaimed);

	return pressure;
}
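
To make the ratio concrete, here is a worked example with made-up numbers:

/*
 * Worked example: scanned = 1000, reclaimed = 250 gives scale = 1250,
 * pressure = 1250 - (250 * 1250 / 1000) = 938, then 938 * 100 / 1250 = 75.
 * Reclaim keeping up with scanning, e.g. scanned = 1000, reclaimed = 900,
 * yields scale = 1900, pressure = 190, then 190 * 100 / 1900 = 10.
 */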

static unsigned long vmpressure_account_stall(unsigned long pressure,
				unsigned long stall, unsigned long scanned)
{
	unsigned long scale;

	if (pressure < allocstall_threshold)
		return pressure;

	scale = ((vmpressure_scale_max - pressure) * stall) / scanned;

	return pressure + scale;
}
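
The stall term boosts the reported pressure by the remaining headroom, scaled by how much of the scanning came from directly stalling (non-kswapd) tasks. A worked example, assuming vmpressure_scale_max == 100 and an allocstall_threshold below 80 (both are tunables defined outside this hunk):

/*
 * Worked example (assumed tunables): pressure = 80, stall = 500,
 * scanned = 1000 gives scale = (100 - 80) * 500 / 1000 = 10, so the
 * reported pressure becomes 90.  Because stall never exceeds scanned,
 * the result stays capped at vmpressure_scale_max.
 */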

struct vmpressure_event {
	struct eventfd_ctx *efd;
	enum vmpressure_levels level;
	struct list_head node;
};

static bool vmpressure_event(struct vmpressure *vmpr,
			     unsigned long scanned, unsigned long reclaimed)
{
	struct vmpressure_event *ev;
	enum vmpressure_levels level;
	unsigned long pressure;
	bool signalled = false;

	pressure = vmpressure_calc_pressure(scanned, reclaimed);
	level = vmpressure_level(pressure);

	mutex_lock(&vmpr->events_lock);

	list_for_each_entry(ev, &vmpr->events, node) {
		if (level >= ev->level) {
			eventfd_signal(ev->efd, 1);
			signalled = true;
		}
	}

	mutex_unlock(&vmpr->events_lock);

	return signalled;
}

static void vmpressure_work_fn(struct work_struct *work)
{
	struct vmpressure *vmpr = work_to_vmpressure(work);
	unsigned long scanned;
	unsigned long reclaimed;

	/*
	 * Several contexts might be calling vmpressure(), so it is
	 * possible that the work was rescheduled again before the old
	 * work context cleared the counters. In that case we will run
	 * just after the old work returns, but then scanned might be zero
	 * here. No need for any locks here since we don't care if
	 * vmpr->reclaimed is in sync.
	 */
	if (!vmpr->scanned)
		return;

	mutex_lock(&vmpr->sr_lock);
	scanned = vmpr->scanned;
	reclaimed = vmpr->reclaimed;
	vmpr->scanned = 0;
	vmpr->reclaimed = 0;
	mutex_unlock(&vmpr->sr_lock);

	do {
		if (vmpressure_event(vmpr, scanned, reclaimed))
			break;
		/*
		 * If not handled, propagate the event upward into the
		 * hierarchy.
		 */
	} while ((vmpr = vmpressure_parent(vmpr)));
}

void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg,
		      unsigned long scanned, unsigned long reclaimed)
{
	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);

	BUG_ON(!vmpr);

	/*
	 * Here we only want to account pressure that userland is able to
	 * help us with. For example, suppose that the DMA zone is under
	 * pressure; if we notify userland about that kind of pressure,
	 * then it will be mostly a waste as it will trigger unnecessary
	 * freeing of memory by userland (since userland is more likely to
	 * have HIGHMEM/MOVABLE pages instead of the DMA fallback). That
	 * is why we include only movable, highmem and FS/IO pages.
	 * Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so
	 * we account it too.
	 */
	if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
		return;

	/*
	 * If we got here with no pages scanned, then that is an indicator
	 * that the reclaimer was unable to find any shrinkable LRUs at the
	 * current scanning depth. But it does not mean that we should
	 * report the critical pressure, yet. If the scanning priority
	 * (scanning depth) goes too high (deep), we will be notified
	 * through vmpressure_prio(). But so far, keep calm.
	 */
	if (!scanned)
		return;

	mutex_lock(&vmpr->sr_lock);
	vmpr->scanned += scanned;
	vmpr->reclaimed += reclaimed;
	scanned = vmpr->scanned;
	mutex_unlock(&vmpr->sr_lock);

	if (scanned < vmpressure_win || work_pending(&vmpr->work))
		return;
	schedule_work(&vmpr->work);
}

void vmpressure_global(gfp_t gfp, unsigned long scanned,
		       unsigned long reclaimed)
{
	struct vmpressure *vmpr = &global_vmpressure;
	unsigned long pressure;
	unsigned long stall;

	if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
		return;

	if (!scanned)
		return;

	mutex_lock(&vmpr->sr_lock);
	vmpr->scanned += scanned;
	vmpr->reclaimed += reclaimed;

	if (!current_is_kswapd())
		vmpr->stall += scanned;

	stall = vmpr->stall;
	scanned = vmpr->scanned;
	reclaimed = vmpr->reclaimed;
	mutex_unlock(&vmpr->sr_lock);

	if (scanned < vmpressure_win)
		return;

	mutex_lock(&vmpr->sr_lock);
	vmpr->scanned = 0;
	vmpr->reclaimed = 0;
	vmpr->stall = 0;
	mutex_unlock(&vmpr->sr_lock);

	pressure = vmpressure_calc_pressure(scanned, reclaimed);
	pressure = vmpressure_account_stall(pressure, stall, scanned);
	vmpressure_notify(pressure);
}

/**
 * vmpressure() - Account memory pressure through scanned/reclaimed ratio
 * @gfp: reclaimer's gfp mask
 * @memcg: cgroup memory controller handle
 * @scanned: number of pages scanned
 * @reclaimed: number of pages reclaimed
 *
 * This function should be called from the vmscan reclaim path to account
 * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
 * pressure index is then further refined and averaged over time.
 *
 * This function does not return any value.
 */
void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
		unsigned long scanned, unsigned long reclaimed)
{
	if (!memcg)
		vmpressure_global(gfp, scanned, reclaimed);

	if (IS_ENABLED(CONFIG_MEMCG))
		vmpressure_memcg(gfp, memcg, scanned, reclaimed);
}

/**
 * vmpressure_prio() - Account memory pressure through reclaimer priority level
 * @gfp: reclaimer's gfp mask
 * @memcg: cgroup memory controller handle
 * @prio: reclaimer's priority
 *
 * This function should be called from the reclaim path every time the
 * vmscan reclaim priority (scanning depth) changes.
 *
 * This function does not return any value.
 */
void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
{
	/*
	 * We only use prio for accounting the critical level. For more info
	 * see the comment for the vmpressure_level_critical_prio variable
	 * above.
	 */
	if (prio > vmpressure_level_critical_prio)
		return;

	/*
	 * OK, the prio is below the threshold, so update the vmpressure
	 * information before the shrinker dives into a long, deep vmscan
	 * pass. Passing scanned = vmpressure_win, reclaimed = 0 to
	 * vmpressure() basically means that we signal the 'critical' level.
	 */
	vmpressure(gfp, memcg, vmpressure_win, 0);
}

/**
 * vmpressure_register_event() - Bind vmpressure notifications to an eventfd
 * @cg: cgroup that is interested in vmpressure notifications
 * @cft: cgroup control files handle
 * @eventfd: eventfd context to link notifications with
 * @args: event arguments (used to set up a pressure level threshold)
 *
 * This function associates the eventfd context with the vmpressure
 * infrastructure, so that the notifications will be delivered to the
 * @eventfd. The @args parameter is a string that denotes a pressure level
 * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or
 * "critical").
 *
 * This function should not be used directly, just pass it to (struct
 * cftype).register_event, and then cgroup core will handle everything by
 * itself.
 */
int vmpressure_register_event(struct cgroup *cg, struct cftype *cft,
			      struct eventfd_ctx *eventfd, const char *args)
{
	struct vmpressure *vmpr = cg_to_vmpressure(cg);
	struct vmpressure_event *ev;
	int level;

	BUG_ON(!vmpr);

	for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) {
		if (!strcmp(vmpressure_str_levels[level], args))
			break;
	}

	if (level >= VMPRESSURE_NUM_LEVELS)
		return -EINVAL;

	ev = kzalloc(sizeof(*ev), GFP_KERNEL);
	if (!ev)
		return -ENOMEM;

	ev->efd = eventfd;
	ev->level = level;

	mutex_lock(&vmpr->events_lock);
	list_add(&ev->node, &vmpr->events);
	mutex_unlock(&vmpr->events_lock);

	return 0;
}
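
From userspace this registration is normally driven through the cgroup core, as the kernel-doc above notes. A minimal sketch of a listener on a cgroup-v1 memcg follows; the mount point and the lack of error handling are assumptions for illustration, not part of this patch:

/* Userspace sketch: wait for "medium" (or worse) pressure on a v1 memcg. */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
	char cmd[64];
	uint64_t ticks;
	int efd = eventfd(0, 0);
	int pfd = open("/sys/fs/cgroup/memory/memory.pressure_level", O_RDONLY);
	int cfd = open("/sys/fs/cgroup/memory/cgroup.event_control", O_WRONLY);

	/* "<eventfd> <fd of memory.pressure_level> <level>" */
	snprintf(cmd, sizeof(cmd), "%d %d medium", efd, pfd);
	write(cfd, cmd, strlen(cmd));

	while (read(efd, &ticks, sizeof(ticks)) == sizeof(ticks))
		fprintf(stderr, "memory pressure: medium or worse\n");
	return 0;
}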

/**
 * vmpressure_unregister_event() - Unbind eventfd from vmpressure
 * @cg: cgroup handle
 * @cft: cgroup control files handle
 * @eventfd: eventfd context that was used to link vmpressure with the @cg
 *
 * This function does internal manipulations to detach the @eventfd from
 * the vmpressure notifications, and then frees internal resources
 * associated with the @eventfd (but the @eventfd itself is not freed).
 *
 * This function should not be used directly, just pass it to (struct
 * cftype).unregister_event, and then cgroup core will handle everything
 * by itself.
 */
void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft,
				 struct eventfd_ctx *eventfd)
{
	struct vmpressure *vmpr = cg_to_vmpressure(cg);
	struct vmpressure_event *ev;

	if (!vmpr)
		BUG();

	mutex_lock(&vmpr->events_lock);
	list_for_each_entry(ev, &vmpr->events, node) {
		if (ev->efd != eventfd)
			continue;
		list_del(&ev->node);
		kfree(ev);
		break;
	}
	mutex_unlock(&vmpr->events_lock);
}

/**
 * vmpressure_init() - Initialize vmpressure control structure
 * @vmpr: Structure to be initialized
 *
 * This function should be called on every allocated vmpressure structure
 * before any usage.
 */
void vmpressure_init(struct vmpressure *vmpr)
{
	mutex_init(&vmpr->sr_lock);
	mutex_init(&vmpr->events_lock);
	INIT_LIST_HEAD(&vmpr->events);
	INIT_WORK(&vmpr->work, vmpressure_work_fn);
}

int vmpressure_global_init(void)
{
	vmpressure_init(&global_vmpressure);
	return 0;
}
late_initcall(vmpressure_global_init);

68	mm/vmscan.c
@ -24,7 +24,6 @@
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/vmpressure.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
@ -283,19 +282,24 @@ static inline int do_shrinker_shrink(struct shrinker *shrinker,
 *
 * Returns the number of slab objects which we shrunk.
 */
unsigned long shrink_slab(struct shrink_control *shrink,
unsigned long shrink_slab(struct shrink_control *shrinkctl,
			  unsigned long nr_pages_scanned,
			  unsigned long lru_pages)
{
	struct shrinker *shrinker;
	unsigned long ret = 0;
	unsigned long freed = 0;

	if (nr_pages_scanned == 0)
		nr_pages_scanned = SWAP_CLUSTER_MAX;

	if (!down_read_trylock(&shrinker_rwsem)) {
		/* Assume we'll be able to shrink next time */
		ret = 1;
		/*
		 * If we would return 0, our callers would understand that we
		 * have nothing else to shrink and give up trying. By returning
		 * 1 we keep it going and assume we'll be able to shrink next
		 * time.
		 */
		freed = 1;
		goto out;
	}

@ -303,7 +307,6 @@ unsigned long shrink_slab(struct shrink_control *shrink,
		unsigned long long delta;
		long total_scan;
		long max_pass;
		int shrink_ret = 0;
		long nr;
		long new_nr;
		long batch_size = shrinker->batch ? shrinker->batch
@ -313,8 +316,11 @@ unsigned long shrink_slab(struct shrink_control *shrink,
		if (current_is_kswapd())
			min_cache_size = 0;

		max_pass = do_shrinker_shrink(shrinker, shrink, 0);
		if (max_pass <= 0)
		if (shrinker->count_objects)
			max_pass = shrinker->count_objects(shrinker, shrinkctl);
		else
			max_pass = do_shrinker_shrink(shrinker, shrinkctl, 0);
		if (max_pass == 0)
			continue;

		/*
@ -330,8 +336,8 @@ unsigned long shrink_slab(struct shrink_control *shrink,
		do_div(delta, lru_pages + 1);
		total_scan += delta;
		if (total_scan < 0) {
			printk(KERN_ERR "shrink_slab: %pF negative objects to "
			       "delete nr=%ld\n",
			printk(KERN_ERR
			       "shrink_slab: %pF negative objects to delete nr=%ld\n",
			       shrinker->shrink, total_scan);
			total_scan = max_pass;
		}
@ -359,23 +365,36 @@ unsigned long shrink_slab(struct shrink_control *shrink,
		if (total_scan > max_pass * 2)
			total_scan = max_pass * 2;

		trace_mm_shrink_slab_start(shrinker, shrink, nr,
		trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
					   nr_pages_scanned, lru_pages,
					   max_pass, delta, total_scan);

		while (total_scan > min_cache_size) {
			int nr_before;

			if (total_scan < batch_size)
				batch_size = total_scan;

			nr_before = do_shrinker_shrink(shrinker, shrink, 0);
			shrink_ret = do_shrinker_shrink(shrinker, shrink,
							batch_size);
			if (shrink_ret == -1)
				break;
			if (shrink_ret < nr_before)
				ret += nr_before - shrink_ret;
			if (shrinker->scan_objects) {
				unsigned long ret;
				shrinkctl->nr_to_scan = batch_size;
				ret = shrinker->scan_objects(shrinker, shrinkctl);

				if (ret == SHRINK_STOP)
					break;
				freed += ret;
			} else {
				int nr_before;
				long ret;

				nr_before = do_shrinker_shrink(shrinker, shrinkctl, 0);
				ret = do_shrinker_shrink(shrinker, shrinkctl,
							 batch_size);
				if (ret == -1)
					break;
				if (ret < nr_before)
					freed += nr_before - ret;
			}

			count_vm_events(SLABS_SCANNED, batch_size);
			total_scan -= batch_size;

@ -393,12 +412,12 @@ unsigned long shrink_slab(struct shrink_control *shrink,
		else
			new_nr = atomic_long_read(&shrinker->nr_in_batch);

		trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
		trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
	}
	up_read(&shrinker_rwsem);
out:
	cond_resched();
	return ret;
	return freed;
}
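
For reference, the count_objects/scan_objects pair that this loop now prefers over the legacy ->shrink callback is implemented by a shrinker roughly as in the sketch below (a minimal, illustrative example; the demo_* names and the counter are made up):

#include <linux/shrinker.h>
#include <linux/atomic.h>

static atomic_long_t demo_nr_cached = ATOMIC_LONG_INIT(0);

static unsigned long demo_count_objects(struct shrinker *s,
					struct shrink_control *sc)
{
	/* Cheap estimate of freeable objects; 0 means "nothing to do". */
	return atomic_long_read(&demo_nr_cached);
}

static unsigned long demo_scan_objects(struct shrinker *s,
				       struct shrink_control *sc)
{
	unsigned long freed = 0;

	/*
	 * Free up to sc->nr_to_scan objects and count what was actually
	 * freed; return SHRINK_STOP instead when no progress is possible.
	 */
	return freed;
}

static struct shrinker demo_shrinker = {
	.count_objects	= demo_count_objects,
	.scan_objects	= demo_scan_objects,
	.seeks		= DEFAULT_SEEKS,
};

/* register_shrinker(&demo_shrinker) at init, unregister_shrinker() at exit. */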

static inline int is_page_cache_freeable(struct page *page)

@ -2312,11 +2331,6 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
			}
			memcg = mem_cgroup_iter(root, memcg, &reclaim);
		} while (memcg);

		vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
			   sc->nr_scanned - nr_scanned,
			   sc->nr_reclaimed - nr_reclaimed);

	} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
					 sc->nr_scanned - nr_scanned, sc));
}

@ -2497,8 +2511,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
	count_vm_event(ALLOCSTALL);

	do {
		vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
				sc->priority);
		sc->nr_scanned = 0;
		aborted_reclaim = shrink_zones(zonelist, sc);