Revert "Squash revert 3.18 binder/lmk changes"

This reverts commit 3b278f89ee.
TARKZiM 2022-01-21 11:12:33 +08:00
parent 8ba63ccfde
commit 9a3c5248a3
31 changed files with 8713 additions and 5992 deletions

View File

@ -40,7 +40,6 @@ Features:
- soft limit
- moving (recharging) account at moving a task is selectable.
- usage threshold notifier
- memory pressure notifier
- oom-killer disable knob and oom-notifier
- Root cgroup has no limit controls.
@ -66,7 +65,6 @@ Brief summary of control files.
memory.stat # show various statistics
memory.use_hierarchy # set/show hierarchical account enabled
memory.force_empty # trigger forced move charge to parent
memory.pressure_level # set memory pressure notifications
memory.swappiness # set/show swappiness parameter of vmscan
(See sysctl's vm.swappiness)
memory.move_charge_at_immigrate # set/show controls of moving charges
@ -766,73 +764,7 @@ At reading, current status of OOM is shown.
under_oom 0 or 1 (if 1, the memory cgroup is under OOM, tasks may
be stopped.)
11. Memory Pressure
The pressure level notifications can be used to monitor the memory
allocation cost; based on the pressure, applications can implement
different strategies of managing their memory resources. The pressure
levels are defined as follows:
The "low" level means that the system is reclaiming memory for new
allocations. Monitoring this reclaiming activity might be useful for
maintaining cache level. Upon notification, the program (typically
"Activity Manager") might analyze vmstat and act in advance (i.e.
prematurely shutdown unimportant services).
The "medium" level means that the system is experiencing medium memory
pressure; the system might be swapping, paging out active file caches,
etc. Upon this event, applications may decide to further analyze
vmstat/zoneinfo/memcg or internal memory usage statistics and free any
resources that can be easily reconstructed or re-read from a disk.
The "critical" level means that the system is actively thrashing, it is
about to run out of memory (OOM) or even the in-kernel OOM killer is on its
way to trigger. Applications should do whatever they can to help the
system. It might be too late to consult with vmstat or any other
statistics, so it is advisable to take immediate action.
The events are propagated upward until the event is handled, i.e. the
events are not pass-through. Here is what this means: for example you have
three cgroups: A->B->C. Now you set up an event listener on cgroups A, B
and C, and suppose group C experiences some pressure. In this situation,
only group C will receive the notification, i.e. groups A and B will not
receive it. This is done to avoid excessive "broadcasting" of messages,
which disturbs the system and which is especially bad if we are low on
memory or thrashing. So, organize the cgroups wisely, or propagate the
events manually (or, ask us to implement the pass-through events,
explaining why you would need them).
The file memory.pressure_level is only used to set up an eventfd. To
register a notification, an application must:
- create an eventfd using eventfd(2);
- open memory.pressure_level;
- write string like "<event_fd> <fd of memory.pressure_level> <level>"
to cgroup.event_control.
The application will be notified through the eventfd when memory pressure is
at the specified level (or higher). Read/write operations on
memory.pressure_level are not implemented.
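A minimal C sketch of this registration sequence (error handling omitted;
the "foo" cgroup path matches the test below and is only an example):

	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/eventfd.h>

	int main(void)
	{
		char line[64];
		uint64_t count;
		/* 1. create an eventfd */
		int efd = eventfd(0, 0);
		/* 2. open memory.pressure_level */
		int pfd = open("/sys/fs/cgroup/memory/foo/memory.pressure_level",
			       O_RDONLY);
		/* 3. write "<event_fd> <fd of memory.pressure_level> <level>"
		 *    to cgroup.event_control
		 */
		int cfd = open("/sys/fs/cgroup/memory/foo/cgroup.event_control",
			       O_WRONLY);
		snprintf(line, sizeof(line), "%d %d low", efd, pfd);
		write(cfd, line, strlen(line));
		/* block until the "low" (or higher) pressure level fires */
		read(efd, &count, sizeof(count));
		printf("memory pressure event received\n");
		return 0;
	}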
Test:
Here is a small script example that makes a new cgroup, sets up a
memory limit, sets up a notification in the cgroup, and then makes the child
cgroup experience critical pressure:
# cd /sys/fs/cgroup/memory/
# mkdir foo
# cd foo
# cgroup_event_listener memory.pressure_level low &
# echo 8000000 > memory.limit_in_bytes
# echo 8000000 > memory.memsw.limit_in_bytes
# echo $$ > tasks
# dd if=/dev/zero | read x
(Expect a bunch of notifications, and eventually, the oom-killer will
trigger.)
12. TODO
11. TODO
1. Add support for accounting huge pages (as a separate controller)
2. Make per-cgroup scanner reclaim not-shared pages first

View File

@ -174,8 +174,6 @@ source "drivers/ipack/Kconfig"
source "drivers/reset/Kconfig"
source "drivers/android/Kconfig"
source "drivers/coresight/Kconfig"
source "drivers/bif/Kconfig"

View File

@ -158,7 +158,6 @@ obj-$(CONFIG_IIO) += iio/
obj-$(CONFIG_VME_BUS) += vme/
obj-$(CONFIG_IPACK_BUS) += ipack/
obj-$(CONFIG_NTB) += ntb/
obj-$(CONFIG_ANDROID) += android/
obj-$(CONFIG_CORESIGHT) += coresight/
obj-$(CONFIG_ESOC) += esoc/

View File

@ -1,49 +0,0 @@
menu "Android"
config ANDROID
bool "Android Drivers"
---help---
Enable support for various drivers needed on the Android platform
if ANDROID
config ANDROID_BINDER_IPC
bool "Android Binder IPC Driver"
depends on MMU
default n
---help---
Binder is used in Android for both communication between processes,
and remote method invocation.
This means one Android process can call a method/routine in another
Android process, using Binder to identify, invoke and pass arguments
between said processes.
config ANDROID_BINDER_DEVICES
string "Android Binder devices"
depends on ANDROID_BINDER_IPC
default "binder,hwbinder,vndbinder"
---help---
Default value for the binder.devices parameter.
The binder.devices parameter is a comma-separated list of strings
that specifies the names of the binder device nodes that will be
created. Each binder device has its own context manager, and is
therefore logically separated from the other devices.
config ANDROID_BINDER_IPC_32BIT
bool
depends on !64BIT && ANDROID_BINDER_IPC
default y
---help---
The Binder API has been changed to support both 32 and 64bit
applications in a mixed environment.
Enable this to support an old 32-bit Android user-space (v4.4 and
earlier).
Note that enabling this will break newer Android user-space.
endif # if ANDROID
endmenu

View File

@ -1,3 +0,0 @@
ccflags-y += -I$(src) # needed for trace events
obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o

File diff suppressed because it is too large

View File

@ -8,6 +8,53 @@ config ANDROID
if ANDROID
config ANDROID_BINDER_IPC
bool "Android Binder IPC Driver"
depends on MMU
default n
---help---
Binder is used in Android for both communication between processes,
and remote method invocation.
This means one Android process can call a method/routine in another
Android process, using Binder to identify, invoke and pass arguments
between said processes.
config ANDROID_BINDER_IPC_32BIT
bool "Use old (Android 4.4 and earlier) 32-bit binder API"
depends on !64BIT && ANDROID_BINDER_IPC
default y
---help---
The Binder API has been changed to support both 32 and 64bit
applications in a mixed environment.
Enable this to support an old 32-bit Android user-space (v4.4 and
earlier).
Note that enabling this will break newer Android user-space.
config ANDROID_BINDER_DEVICES
string "Android Binder devices"
depends on ANDROID_BINDER_IPC
default "binder,hwbinder,vndbinder"
---help---
Default value for the binder.devices parameter.
The binder.devices parameter is a comma-separated list of strings
that specifies the names of the binder device nodes that will be
created. Each binder device has its own context manager, and is
therefore logically separated from the other devices.
config ANDROID_BINDER_IPC_SELFTEST
bool "Android Binder IPC Driver Selftest"
depends on ANDROID_BINDER_IPC
---help---
This feature allows binder selftest to run.
Binder selftest checks the allocation and free of binder buffers
exhaustively with combinations of various buffer sizes and
alignments.
config ASHMEM
bool "Enable the Anonymous Shared Memory Subsystem"
default n

View File

@ -3,6 +3,8 @@ ccflags-y += -I$(src) # needed for trace events
obj-y += ion/
obj-$(CONFIG_FIQ_DEBUGGER) += fiq_debugger/
obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o binder_alloc.o
obj-$(CONFIG_ANDROID_BINDER_IPC_SELFTEST) += binder_alloc_selftest.o
obj-$(CONFIG_ASHMEM) += ashmem.o
obj-$(CONFIG_ANDROID_TIMED_OUTPUT) += timed_output.o
obj-$(CONFIG_ANDROID_TIMED_GPIO) += timed_gpio.o

View File

@ -268,24 +268,23 @@ static loff_t ashmem_llseek(struct file *file, loff_t offset, int origin)
mutex_lock(&ashmem_mutex);
if (asma->size == 0) {
ret = -EINVAL;
goto out;
mutex_unlock(&ashmem_mutex);
return -EINVAL;
}
if (!asma->file) {
ret = -EBADF;
goto out;
mutex_unlock(&ashmem_mutex);
return -EBADF;
}
ret = asma->file->f_op->llseek(asma->file, offset, origin);
mutex_unlock(&ashmem_mutex);
ret = vfs_llseek(asma->file, offset, origin);
if (ret < 0)
goto out;
return ret;
/** Copy f_pos from backing file, since f_ops->llseek() sets it */
file->f_pos = asma->file->f_pos;
out:
mutex_unlock(&ashmem_mutex);
return ret;
}
@ -309,6 +308,12 @@ static int ashmem_mmap(struct file *file, struct vm_area_struct *vma)
goto out;
}
/* requested mapping size larger than object size */
if (vma->vm_end - vma->vm_start > PAGE_ALIGN(asma->size)) {
ret = -EINVAL;
goto out;
}
/* requested protection bits must match our allowed protection mask */
if (unlikely((vma->vm_flags & ~calc_vm_prot_bits(asma->prot_mask)) &
calc_vm_prot_bits(PROT_MASK))) {
@ -330,6 +335,7 @@ static int ashmem_mmap(struct file *file, struct vm_area_struct *vma)
ret = PTR_ERR(vmfile);
goto out;
}
vmfile->f_mode |= FMODE_LSEEK;
asma->file = vmfile;
}
get_file(asma->file);

File diff suppressed because it is too large

View File

@ -0,0 +1,30 @@
/*
* Copyright (C) 2008 Google, Inc.
*
* Based on, but no longer compatible with, the original
* OpenBinder.org binder driver interface, which is:
*
* Copyright (c) 2005 Palmsource, Inc.
*
* This software is licensed under the terms of the GNU General Public
* License version 2, as published by the Free Software Foundation, and
* may be copied, distributed, and modified under those terms.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
*/
#ifndef _LINUX_BINDER_H
#define _LINUX_BINDER_H
#ifdef CONFIG_ANDROID_BINDER_IPC_32BIT
#define BINDER_IPC_32BIT 1
#endif
#include "uapi/binder.h"
#endif /* _LINUX_BINDER_H */

File diff suppressed because it is too large

View File

@ -0,0 +1,186 @@
/*
* Copyright (C) 2017 Google, Inc.
*
* This software is licensed under the terms of the GNU General Public
* License version 2, as published by the Free Software Foundation, and
* may be copied, distributed, and modified under those terms.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
*/
#ifndef _LINUX_BINDER_ALLOC_H
#define _LINUX_BINDER_ALLOC_H
#include <linux/rbtree.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/rtmutex.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/list_lru.h>
extern struct list_lru binder_alloc_lru;
struct binder_transaction;
/**
* struct binder_buffer - buffer used for binder transactions
* @entry: entry alloc->buffers
* @rb_node: node for allocated_buffers/free_buffers rb trees
* @free: true if buffer is free
* @allow_user_free: true if the buffer may be freed by userspace
* @async_transaction: true if the buffer is in use for an async transaction
* @debug_id: unique ID for debugging
* @transaction: pointer to the associated struct binder_transaction
* @target_node: struct binder_node associated with this buffer
* @data_size: size of @transaction data
* @offsets_size: size of the array of object offsets in the buffer
* @extra_buffers_size: size of space for other objects (like sg lists)
* @data: pointer to the start of the buffer's data
*
* Bookkeeping structure for binder transaction buffers
*/
struct binder_buffer {
struct list_head entry; /* free and allocated entries by address */
struct rb_node rb_node; /* free entry by size or allocated entry */
/* by address */
unsigned free:1;
unsigned allow_user_free:1;
unsigned async_transaction:1;
unsigned debug_id:29;
struct binder_transaction *transaction;
struct binder_node *target_node;
size_t data_size;
size_t offsets_size;
size_t extra_buffers_size;
void *data;
};
/**
* struct binder_lru_page - page object used for binder shrinker
* @page_ptr: pointer to physical page in mmap'd space
* @lru: entry in binder_alloc_lru
* @alloc: binder_alloc for a proc
*/
struct binder_lru_page {
struct list_head lru;
struct page *page_ptr;
struct binder_alloc *alloc;
};
/**
* struct binder_alloc - per-binder proc state for binder allocator
* @vma: vm_area_struct passed to mmap_handler
* (invariant after mmap)
* @tsk: tid for task that called init for this proc
* (invariant after init)
* @vma_vm_mm: copy of vma->vm_mm (invariant after mmap)
* @buffer: base of per-proc address space mapped via mmap
* @user_buffer_offset: offset between user and kernel VAs for buffer
* @buffers: list of all buffers for this proc
* @free_buffers: rb tree of buffers available for allocation
* sorted by size
* @allocated_buffers: rb tree of allocated buffers sorted by address
* @free_async_space: VA space available for async buffers. This is
* initialized at mmap time to 1/2 the full VA space
* @pages: array of binder_lru_page
* @buffer_size: size of address space specified via mmap
* @pid: pid for associated binder_proc (invariant after init)
* @pages_high: high watermark of offset in @pages
*
* Bookkeeping structure for per-proc address space management for binder
* buffers. It is normally initialized during binder_init() and binder_mmap()
* calls. The address space is used for both user-visible buffers and for
* struct binder_buffer objects used to track the user buffers
*/
struct binder_alloc {
struct mutex mutex;
struct vm_area_struct *vma;
struct mm_struct *vma_vm_mm;
void *buffer;
ptrdiff_t user_buffer_offset;
struct list_head buffers;
struct rb_root free_buffers;
struct rb_root allocated_buffers;
size_t free_async_space;
struct binder_lru_page *pages;
size_t buffer_size;
uint32_t buffer_free;
int pid;
size_t pages_high;
};
#ifdef CONFIG_ANDROID_BINDER_IPC_SELFTEST
void binder_selftest_alloc(struct binder_alloc *alloc);
#else
static inline void binder_selftest_alloc(struct binder_alloc *alloc) {}
#endif
enum lru_status binder_alloc_free_page(struct list_head *item,
spinlock_t *lock, void *cb_arg);
extern struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc,
size_t data_size,
size_t offsets_size,
size_t extra_buffers_size,
int is_async);
extern void binder_alloc_init(struct binder_alloc *alloc);
void binder_alloc_shrinker_init(void);
extern void binder_alloc_vma_close(struct binder_alloc *alloc);
extern struct binder_buffer *
binder_alloc_prepare_to_free(struct binder_alloc *alloc,
uintptr_t user_ptr);
extern void binder_alloc_free_buf(struct binder_alloc *alloc,
struct binder_buffer *buffer);
extern int binder_alloc_mmap_handler(struct binder_alloc *alloc,
struct vm_area_struct *vma);
extern void binder_alloc_deferred_release(struct binder_alloc *alloc);
extern int binder_alloc_get_allocated_count(struct binder_alloc *alloc);
extern void binder_alloc_print_allocated(struct seq_file *m,
struct binder_alloc *alloc);
void binder_alloc_print_pages(struct seq_file *m,
struct binder_alloc *alloc);
/**
* binder_alloc_get_free_async_space() - get free space available for async
* @alloc: binder_alloc for this proc
*
* Return: the bytes remaining in the address-space for async transactions
*/
static inline size_t
binder_alloc_get_free_async_space(struct binder_alloc *alloc)
{
size_t free_async_space;
mutex_lock(&alloc->mutex);
free_async_space = alloc->free_async_space;
mutex_unlock(&alloc->mutex);
return free_async_space;
}
/**
* binder_alloc_get_user_buffer_offset() - get offset between kernel/user addrs
* @alloc: binder_alloc for this proc
*
* Return: the offset between kernel and user-space addresses to use for
* virtual address conversion
*/
static inline ptrdiff_t
binder_alloc_get_user_buffer_offset(struct binder_alloc *alloc)
{
/*
* user_buffer_offset is constant if vma is set and
* undefined if vma is not set. It is possible to
* get here with !alloc->vma if the target process
* is dying while a transaction is being initiated.
* Returning the old value is ok in this case and
* the transaction will fail.
*/
return alloc->user_buffer_offset;
}
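/*
 * A hedged usage sketch (the caller code below is illustrative, not taken
 * from the driver): the offset translates an address inside the mmap'd
 * kernel buffer space into the matching userspace address, e.g.
 *
 *	struct binder_buffer *buf;
 *	void __user *uptr;
 *
 *	buf = binder_alloc_new_buf(alloc, size, 0, 0, 0);
 *	uptr = (void __user *)((uintptr_t)buf->data +
 *			       binder_alloc_get_user_buffer_offset(alloc));
 */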
#endif /* _LINUX_BINDER_ALLOC_H */

View File

@ -0,0 +1,310 @@
/* binder_alloc_selftest.c
*
* Android IPC Subsystem
*
* Copyright (C) 2017 Google, Inc.
*
* This software is licensed under the terms of the GNU General Public
* License version 2, as published by the Free Software Foundation, and
* may be copied, distributed, and modified under those terms.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/mm_types.h>
#include <linux/err.h>
#include "binder_alloc.h"
#define BUFFER_NUM 5
#define BUFFER_MIN_SIZE (PAGE_SIZE / 8)
static bool binder_selftest_run = true;
static int binder_selftest_failures;
static DEFINE_MUTEX(binder_selftest_lock);
/**
* enum buf_end_align_type - Page alignment of a buffer
* end with regard to the end of the previous buffer.
*
* In the pictures below, buf2 refers to the buffer we
* are aligning. buf1 refers to previous buffer by addr.
* Symbol [ means the start of a buffer, ] means the end
* of a buffer, and | means page boundaries.
*/
enum buf_end_align_type {
/**
* @SAME_PAGE_UNALIGNED: The end of this buffer is on
* the same page as the end of the previous buffer and
* is not page aligned. Examples:
* buf1 ][ buf2 ][ ...
* buf1 ]|[ buf2 ][ ...
*/
SAME_PAGE_UNALIGNED = 0,
/**
* @SAME_PAGE_ALIGNED: When the end of the previous buffer
* is not page aligned, the end of this buffer is on the
* same page as the end of the previous buffer and is page
* aligned. When the previous buffer is page aligned, the
* end of this buffer is aligned to the next page boundary.
* Examples:
* buf1 ][ buf2 ]| ...
* buf1 ]|[ buf2 ]| ...
*/
SAME_PAGE_ALIGNED,
/**
* @NEXT_PAGE_UNALIGNED: The end of this buffer is on
* the page next to the end of the previous buffer and
* is not page aligned. Examples:
* buf1 ][ buf2 | buf2 ][ ...
* buf1 ]|[ buf2 | buf2 ][ ...
*/
NEXT_PAGE_UNALIGNED,
/**
* @NEXT_PAGE_ALIGNED: The end of this buffer is on
* the page next to the end of the previous buffer and
* is page aligned. Examples:
* buf1 ][ buf2 | buf2 ]| ...
* buf1 ]|[ buf2 | buf2 ]| ...
*/
NEXT_PAGE_ALIGNED,
/**
* @NEXT_NEXT_UNALIGNED: The end of this buffer is on
* the page that follows the page after the end of the
* previous buffer and is not page aligned. Examples:
* buf1 ][ buf2 | buf2 | buf2 ][ ...
* buf1 ]|[ buf2 | buf2 | buf2 ][ ...
*/
NEXT_NEXT_UNALIGNED,
LOOP_END,
};
static void pr_err_size_seq(size_t *sizes, int *seq)
{
int i;
pr_err("alloc sizes: ");
for (i = 0; i < BUFFER_NUM; i++)
pr_cont("[%zu]", sizes[i]);
pr_cont("\n");
pr_err("free seq: ");
for (i = 0; i < BUFFER_NUM; i++)
pr_cont("[%d]", seq[i]);
pr_cont("\n");
}
static bool check_buffer_pages_allocated(struct binder_alloc *alloc,
struct binder_buffer *buffer,
size_t size)
{
void *page_addr, *end;
int page_index;
end = (void *)PAGE_ALIGN((uintptr_t)buffer->data + size);
page_addr = buffer->data;
for (; page_addr < end; page_addr += PAGE_SIZE) {
page_index = (page_addr - alloc->buffer) / PAGE_SIZE;
if (!alloc->pages[page_index].page_ptr ||
!list_empty(&alloc->pages[page_index].lru)) {
pr_err("expect alloc but is %s at page index %d\n",
alloc->pages[page_index].page_ptr ?
"lru" : "free", page_index);
return false;
}
}
return true;
}
static void binder_selftest_alloc_buf(struct binder_alloc *alloc,
struct binder_buffer *buffers[],
size_t *sizes, int *seq)
{
int i;
for (i = 0; i < BUFFER_NUM; i++) {
buffers[i] = binder_alloc_new_buf(alloc, sizes[i], 0, 0, 0);
if (IS_ERR(buffers[i]) ||
!check_buffer_pages_allocated(alloc, buffers[i],
sizes[i])) {
pr_err_size_seq(sizes, seq);
binder_selftest_failures++;
}
}
}
static void binder_selftest_free_buf(struct binder_alloc *alloc,
struct binder_buffer *buffers[],
size_t *sizes, int *seq, size_t end)
{
int i;
for (i = 0; i < BUFFER_NUM; i++)
binder_alloc_free_buf(alloc, buffers[seq[i]]);
for (i = 0; i < end / PAGE_SIZE; i++) {
/**
* Error message on a free page can be false positive
* if binder shrinker ran during binder_alloc_free_buf
* calls above.
*/
if (list_empty(&alloc->pages[i].lru)) {
pr_err_size_seq(sizes, seq);
pr_err("expect lru but is %s at page index %d\n",
alloc->pages[i].page_ptr ? "alloc" : "free", i);
binder_selftest_failures++;
}
}
}
static void binder_selftest_free_page(struct binder_alloc *alloc)
{
int i;
unsigned long count;
while ((count = list_lru_count(&binder_alloc_lru))) {
list_lru_walk(&binder_alloc_lru, binder_alloc_free_page,
NULL, count);
}
for (i = 0; i < (alloc->buffer_size / PAGE_SIZE); i++) {
if (alloc->pages[i].page_ptr) {
pr_err("expect free but is %s at page index %d\n",
list_empty(&alloc->pages[i].lru) ?
"alloc" : "lru", i);
binder_selftest_failures++;
}
}
}
static void binder_selftest_alloc_free(struct binder_alloc *alloc,
size_t *sizes, int *seq, size_t end)
{
struct binder_buffer *buffers[BUFFER_NUM];
binder_selftest_alloc_buf(alloc, buffers, sizes, seq);
binder_selftest_free_buf(alloc, buffers, sizes, seq, end);
/* Allocate from lru. */
binder_selftest_alloc_buf(alloc, buffers, sizes, seq);
if (list_lru_count(&binder_alloc_lru))
pr_err("lru list should be empty but is not\n");
binder_selftest_free_buf(alloc, buffers, sizes, seq, end);
binder_selftest_free_page(alloc);
}
static bool is_dup(int *seq, int index, int val)
{
int i;
for (i = 0; i < index; i++) {
if (seq[i] == val)
return true;
}
return false;
}
/* Generate BUFFER_NUM factorial free orders. */
static void binder_selftest_free_seq(struct binder_alloc *alloc,
size_t *sizes, int *seq,
int index, size_t end)
{
int i;
if (index == BUFFER_NUM) {
binder_selftest_alloc_free(alloc, sizes, seq, end);
return;
}
for (i = 0; i < BUFFER_NUM; i++) {
if (is_dup(seq, index, i))
continue;
seq[index] = i;
binder_selftest_free_seq(alloc, sizes, seq, index + 1, end);
}
}
static void binder_selftest_alloc_size(struct binder_alloc *alloc,
size_t *end_offset)
{
int i;
int seq[BUFFER_NUM] = {0};
size_t front_sizes[BUFFER_NUM];
size_t back_sizes[BUFFER_NUM];
size_t last_offset, offset = 0;
for (i = 0; i < BUFFER_NUM; i++) {
last_offset = offset;
offset = end_offset[i];
front_sizes[i] = offset - last_offset;
back_sizes[BUFFER_NUM - i - 1] = front_sizes[i];
}
/*
* Buffers share the first or last few pages.
* Only BUFFER_NUM - 1 buffer sizes are adjustable since
* we need one giant buffer before getting to the last page.
*/
back_sizes[0] += alloc->buffer_size - end_offset[BUFFER_NUM - 1];
binder_selftest_free_seq(alloc, front_sizes, seq, 0,
end_offset[BUFFER_NUM - 1]);
binder_selftest_free_seq(alloc, back_sizes, seq, 0, alloc->buffer_size);
}
static void binder_selftest_alloc_offset(struct binder_alloc *alloc,
size_t *end_offset, int index)
{
int align;
size_t end, prev;
if (index == BUFFER_NUM) {
binder_selftest_alloc_size(alloc, end_offset);
return;
}
prev = index == 0 ? 0 : end_offset[index - 1];
end = prev;
BUILD_BUG_ON(BUFFER_MIN_SIZE * BUFFER_NUM >= PAGE_SIZE);
for (align = SAME_PAGE_UNALIGNED; align < LOOP_END; align++) {
if (align % 2)
end = ALIGN(end, PAGE_SIZE);
else
end += BUFFER_MIN_SIZE;
end_offset[index] = end;
binder_selftest_alloc_offset(alloc, end_offset, index + 1);
}
}
/**
* binder_selftest_alloc() - Test alloc and free of buffer pages.
* @alloc: Pointer to alloc struct.
*
* Allocate BUFFER_NUM buffers to cover all page alignment cases,
* then free them in all orders possible. Check that pages are
* correctly allocated, put onto lru when buffers are freed, and
* are freed when binder_alloc_free_page is called.
*/
void binder_selftest_alloc(struct binder_alloc *alloc)
{
size_t end_offset[BUFFER_NUM];
if (!binder_selftest_run)
return;
mutex_lock(&binder_selftest_lock);
if (!binder_selftest_run || !alloc->vma)
goto done;
pr_info("STARTED\n");
binder_selftest_alloc_offset(alloc, end_offset, 0);
binder_selftest_run = false;
if (binder_selftest_failures > 0)
pr_info("%d tests FAILED\n", binder_selftest_failures);
else
pr_info("PASSED\n");
done:
mutex_unlock(&binder_selftest_lock);
}

View File

@ -23,7 +23,8 @@
struct binder_buffer;
struct binder_node;
struct binder_proc;
struct binder_ref;
struct binder_alloc;
struct binder_ref_data;
struct binder_thread;
struct binder_transaction;
@ -84,6 +85,30 @@ DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_ioctl_done);
DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_write_done);
DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_read_done);
TRACE_EVENT(binder_set_priority,
TP_PROTO(int proc, int thread, unsigned int old_prio,
unsigned int desired_prio, unsigned int new_prio),
TP_ARGS(proc, thread, old_prio, new_prio, desired_prio),
TP_STRUCT__entry(
__field(int, proc)
__field(int, thread)
__field(unsigned int, old_prio)
__field(unsigned int, new_prio)
__field(unsigned int, desired_prio)
),
TP_fast_assign(
__entry->proc = proc;
__entry->thread = thread;
__entry->old_prio = old_prio;
__entry->new_prio = new_prio;
__entry->desired_prio = desired_prio;
),
TP_printk("proc=%d thread=%d old=%d => new=%d desired=%d",
__entry->proc, __entry->thread, __entry->old_prio,
__entry->new_prio, __entry->desired_prio)
);
TRACE_EVENT(binder_wait_for_work,
TP_PROTO(bool proc_work, bool transaction_stack, bool thread_todo),
TP_ARGS(proc_work, transaction_stack, thread_todo),
@ -146,8 +171,8 @@ TRACE_EVENT(binder_transaction_received,
TRACE_EVENT(binder_transaction_node_to_ref,
TP_PROTO(struct binder_transaction *t, struct binder_node *node,
struct binder_ref *ref),
TP_ARGS(t, node, ref),
struct binder_ref_data *rdata),
TP_ARGS(t, node, rdata),
TP_STRUCT__entry(
__field(int, debug_id)
@ -160,8 +185,8 @@ TRACE_EVENT(binder_transaction_node_to_ref,
__entry->debug_id = t->debug_id;
__entry->node_debug_id = node->debug_id;
__entry->node_ptr = node->ptr;
__entry->ref_debug_id = ref->debug_id;
__entry->ref_desc = ref->desc;
__entry->ref_debug_id = rdata->debug_id;
__entry->ref_desc = rdata->desc;
),
TP_printk("transaction=%d node=%d src_ptr=0x%016llx ==> dest_ref=%d dest_desc=%d",
__entry->debug_id, __entry->node_debug_id,
@ -170,8 +195,9 @@ TRACE_EVENT(binder_transaction_node_to_ref,
);
TRACE_EVENT(binder_transaction_ref_to_node,
TP_PROTO(struct binder_transaction *t, struct binder_ref *ref),
TP_ARGS(t, ref),
TP_PROTO(struct binder_transaction *t, struct binder_node *node,
struct binder_ref_data *rdata),
TP_ARGS(t, node, rdata),
TP_STRUCT__entry(
__field(int, debug_id)
@ -182,10 +208,10 @@ TRACE_EVENT(binder_transaction_ref_to_node,
),
TP_fast_assign(
__entry->debug_id = t->debug_id;
__entry->ref_debug_id = ref->debug_id;
__entry->ref_desc = ref->desc;
__entry->node_debug_id = ref->node->debug_id;
__entry->node_ptr = ref->node->ptr;
__entry->ref_debug_id = rdata->debug_id;
__entry->ref_desc = rdata->desc;
__entry->node_debug_id = node->debug_id;
__entry->node_ptr = node->ptr;
),
TP_printk("transaction=%d node=%d src_ref=%d src_desc=%d ==> dest_ptr=0x%016llx",
__entry->debug_id, __entry->node_debug_id,
@ -194,9 +220,10 @@ TRACE_EVENT(binder_transaction_ref_to_node,
);
TRACE_EVENT(binder_transaction_ref_to_ref,
TP_PROTO(struct binder_transaction *t, struct binder_ref *src_ref,
struct binder_ref *dest_ref),
TP_ARGS(t, src_ref, dest_ref),
TP_PROTO(struct binder_transaction *t, struct binder_node *node,
struct binder_ref_data *src_ref,
struct binder_ref_data *dest_ref),
TP_ARGS(t, node, src_ref, dest_ref),
TP_STRUCT__entry(
__field(int, debug_id)
@ -208,7 +235,7 @@ TRACE_EVENT(binder_transaction_ref_to_ref,
),
TP_fast_assign(
__entry->debug_id = t->debug_id;
__entry->node_debug_id = src_ref->node->debug_id;
__entry->node_debug_id = node->debug_id;
__entry->src_ref_debug_id = src_ref->debug_id;
__entry->src_ref_desc = src_ref->desc;
__entry->dest_ref_debug_id = dest_ref->debug_id;
@ -268,9 +295,9 @@ DEFINE_EVENT(binder_buffer_class, binder_transaction_failed_buffer_release,
TP_ARGS(buffer));
TRACE_EVENT(binder_update_page_range,
TP_PROTO(struct binder_proc *proc, bool allocate,
TP_PROTO(struct binder_alloc *alloc, bool allocate,
void *start, void *end),
TP_ARGS(proc, allocate, start, end),
TP_ARGS(alloc, allocate, start, end),
TP_STRUCT__entry(
__field(int, proc)
__field(bool, allocate)
@ -278,9 +305,9 @@ TRACE_EVENT(binder_update_page_range,
__field(size_t, size)
),
TP_fast_assign(
__entry->proc = proc->pid;
__entry->proc = alloc->pid;
__entry->allocate = allocate;
__entry->offset = start - proc->buffer;
__entry->offset = start - alloc->buffer;
__entry->size = end - start;
),
TP_printk("proc=%d allocate=%d offset=%zu size=%zu",
@ -288,6 +315,61 @@ TRACE_EVENT(binder_update_page_range,
__entry->offset, __entry->size)
);
DECLARE_EVENT_CLASS(binder_lru_page_class,
TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
TP_ARGS(alloc, page_index),
TP_STRUCT__entry(
__field(int, proc)
__field(size_t, page_index)
),
TP_fast_assign(
__entry->proc = alloc->pid;
__entry->page_index = page_index;
),
TP_printk("proc=%d page_index=%zu",
__entry->proc, __entry->page_index)
);
DEFINE_EVENT(binder_lru_page_class, binder_alloc_lru_start,
TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
TP_ARGS(alloc, page_index));
DEFINE_EVENT(binder_lru_page_class, binder_alloc_lru_end,
TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
TP_ARGS(alloc, page_index));
DEFINE_EVENT(binder_lru_page_class, binder_free_lru_start,
TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
TP_ARGS(alloc, page_index));
DEFINE_EVENT(binder_lru_page_class, binder_free_lru_end,
TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
TP_ARGS(alloc, page_index));
DEFINE_EVENT(binder_lru_page_class, binder_alloc_page_start,
TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
TP_ARGS(alloc, page_index));
DEFINE_EVENT(binder_lru_page_class, binder_alloc_page_end,
TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
TP_ARGS(alloc, page_index));
DEFINE_EVENT(binder_lru_page_class, binder_unmap_user_start,
TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
TP_ARGS(alloc, page_index));
DEFINE_EVENT(binder_lru_page_class, binder_unmap_user_end,
TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
TP_ARGS(alloc, page_index));
DEFINE_EVENT(binder_lru_page_class, binder_unmap_kernel_start,
TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
TP_ARGS(alloc, page_index));
DEFINE_EVENT(binder_lru_page_class, binder_unmap_kernel_end,
TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
TP_ARGS(alloc, page_index));
TRACE_EVENT(binder_command,
TP_PROTO(uint32_t cmd),
TP_ARGS(cmd),

View File

@ -13,7 +13,9 @@
* drops below 4096 pages and kill processes with a oom_score_adj value of 0 or
* higher when the free memory drops below 1024 pages.
*
* The driver considers memory used for caches to be free.
* The driver considers memory used for caches to be free, but if a large
* percentage of the cached memory is locked this can be very inaccurate
* and processes may not get killed until the normal oom killer is triggered.
*
* Copyright (C) 2007-2008 Google, Inc.
*
@ -27,11 +29,6 @@
* GNU General Public License for more details.
*
*/
/*
* NOTE: This file has been modified by Sony Mobile Communications Inc.
* Modifications are Copyright (c) 2015 Sony Mobile Communications Inc,
* and licensed under the license of the file.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@ -40,27 +37,9 @@
#include <linux/mm.h>
#include <linux/oom.h>
#include <linux/sched.h>
#include <linux/swap.h>
#include <linux/rcupdate.h>
#include <linux/notifier.h>
#include <linux/mutex.h>
#include <linux/delay.h>
#include <linux/swap.h>
#include <linux/fs.h>
#include <linux/cpuset.h>
#include <linux/show_mem_notifier.h>
#include <linux/vmpressure.h>
#define CREATE_TRACE_POINTS
#include <trace/events/almk.h>
#define CREATE_TRACE_POINTS
#include <trace/events/lmk.h>
#ifdef CONFIG_HIGHMEM
#define _ZONE ZONE_HIGHMEM
#else
#define _ZONE ZONE_NORMAL
#endif
#define CREATE_TRACE_POINTS
#include "trace/lowmemorykiller.h"
@ -80,7 +59,6 @@ static int lowmem_minfree[6] = {
16 * 1024, /* 64MB */
};
static int lowmem_minfree_size = 4;
static int lmk_fast_run = 1;
static unsigned long lowmem_deathpending_timeout;
@ -90,380 +68,32 @@ static unsigned long lowmem_deathpending_timeout;
pr_info(x); \
} while (0)
static atomic_t shift_adj = ATOMIC_INIT(0);
static short adj_max_shift = 353;
/* User knob to enable/disable adaptive lmk feature */
static int enable_adaptive_lmk;
module_param_named(enable_adaptive_lmk, enable_adaptive_lmk, int,
S_IRUGO | S_IWUSR);
/*
* This parameter controls the behaviour of LMK when vmpressure is in
* the range of 90-94. Adaptive lmk triggers based on number of file
* pages wrt vmpressure_file_min, when vmpressure is in the range of
* 90-94. Usually this is a pseudo minfree value, higher than the
* highest configured value in minfree array.
*/
static int vmpressure_file_min;
module_param_named(vmpressure_file_min, vmpressure_file_min, int,
S_IRUGO | S_IWUSR);
enum {
VMPRESSURE_NO_ADJUST = 0,
VMPRESSURE_ADJUST_ENCROACH,
VMPRESSURE_ADJUST_NORMAL,
};
int adjust_minadj(short *min_score_adj)
static unsigned long lowmem_count(struct shrinker *s,
struct shrink_control *sc)
{
int ret = VMPRESSURE_NO_ADJUST;
if (!enable_adaptive_lmk)
return 0;
if (atomic_read(&shift_adj) &&
(*min_score_adj > adj_max_shift)) {
if (*min_score_adj == OOM_SCORE_ADJ_MAX + 1)
ret = VMPRESSURE_ADJUST_ENCROACH;
else
ret = VMPRESSURE_ADJUST_NORMAL;
*min_score_adj = adj_max_shift;
}
atomic_set(&shift_adj, 0);
return ret;
return global_page_state(NR_ACTIVE_ANON) +
global_page_state(NR_ACTIVE_FILE) +
global_page_state(NR_INACTIVE_ANON) +
global_page_state(NR_INACTIVE_FILE);
}
static int lmk_vmpressure_notifier(struct notifier_block *nb,
unsigned long action, void *data)
{
int other_free, other_file;
unsigned long pressure = action;
int array_size = ARRAY_SIZE(lowmem_adj);
if (!enable_adaptive_lmk)
return 0;
if (pressure >= 95) {
other_file = global_page_state(NR_FILE_PAGES) -
global_page_state(NR_SHMEM) -
total_swapcache_pages();
other_free = global_page_state(NR_FREE_PAGES);
atomic_set(&shift_adj, 1);
trace_almk_vmpressure(pressure, other_free, other_file);
} else if (pressure >= 90) {
if (lowmem_adj_size < array_size)
array_size = lowmem_adj_size;
if (lowmem_minfree_size < array_size)
array_size = lowmem_minfree_size;
other_file = global_page_state(NR_FILE_PAGES) -
global_page_state(NR_SHMEM) -
total_swapcache_pages();
other_free = global_page_state(NR_FREE_PAGES);
if ((other_free < lowmem_minfree[array_size - 1]) &&
(other_file < vmpressure_file_min)) {
atomic_set(&shift_adj, 1);
trace_almk_vmpressure(pressure, other_free,
other_file);
}
} else if (atomic_read(&shift_adj)) {
/*
* shift_adj would have been set by a previous invocation
* of notifier, which is not followed by a lowmem_shrink yet.
* Since vmpressure has improved, reset shift_adj to avoid
* false adaptive LMK trigger.
*/
trace_almk_vmpressure(pressure, other_free, other_file);
atomic_set(&shift_adj, 0);
}
return 0;
}
static struct notifier_block lmk_vmpr_nb = {
.notifier_call = lmk_vmpressure_notifier,
};
static int test_task_flag(struct task_struct *p, int flag)
{
struct task_struct *t;
for_each_thread(p, t) {
task_lock(t);
if (test_tsk_thread_flag(t, flag)) {
task_unlock(t);
return 1;
}
task_unlock(t);
}
return 0;
}
static DEFINE_MUTEX(scan_mutex);
int can_use_cma_pages(gfp_t gfp_mask)
{
int can_use = 0;
int mtype = allocflags_to_migratetype(gfp_mask);
int i = 0;
int *mtype_fallbacks = get_migratetype_fallbacks(mtype);
if (is_migrate_cma(mtype)) {
can_use = 1;
} else {
for (i = 0;; i++) {
int fallbacktype = mtype_fallbacks[i];
if (is_migrate_cma(fallbacktype)) {
can_use = 1;
break;
}
if (fallbacktype == MIGRATE_RESERVE)
break;
}
}
return can_use;
}
void tune_lmk_zone_param(struct zonelist *zonelist, int classzone_idx,
int *other_free, int *other_file,
int use_cma_pages)
{
struct zone *zone;
struct zoneref *zoneref;
int zone_idx;
for_each_zone_zonelist(zone, zoneref, zonelist, MAX_NR_ZONES) {
zone_idx = zonelist_zone_idx(zoneref);
if (zone_idx == ZONE_MOVABLE) {
if (!use_cma_pages)
*other_free -=
zone_page_state(zone, NR_FREE_CMA_PAGES);
continue;
}
if (zone_idx > classzone_idx) {
if (other_free != NULL)
*other_free -= zone_page_state(zone,
NR_FREE_PAGES);
if (other_file != NULL)
*other_file -= zone_page_state(zone,
NR_FILE_PAGES)
- zone_page_state(zone, NR_SHMEM)
- zone_page_state(zone, NR_SWAPCACHE);
} else if (zone_idx < classzone_idx) {
if (zone_watermark_ok(zone, 0, 0, classzone_idx, 0)) {
if (!use_cma_pages) {
*other_free -= min(
zone->lowmem_reserve[classzone_idx] +
zone_page_state(
zone, NR_FREE_CMA_PAGES),
zone_page_state(
zone, NR_FREE_PAGES));
} else {
*other_free -=
zone->lowmem_reserve[classzone_idx];
}
} else {
*other_free -=
zone_page_state(zone, NR_FREE_PAGES);
}
}
}
}
#ifdef CONFIG_HIGHMEM
void adjust_gfp_mask(gfp_t *gfp_mask)
{
struct zone *preferred_zone;
struct zonelist *zonelist;
enum zone_type high_zoneidx;
if (current_is_kswapd()) {
zonelist = node_zonelist(0, *gfp_mask);
high_zoneidx = gfp_zone(*gfp_mask);
first_zones_zonelist(zonelist, high_zoneidx, NULL,
&preferred_zone);
if (high_zoneidx == ZONE_NORMAL) {
if (zone_watermark_ok_safe(preferred_zone, 0,
high_wmark_pages(preferred_zone), 0,
0))
*gfp_mask |= __GFP_HIGHMEM;
} else if (high_zoneidx == ZONE_HIGHMEM) {
*gfp_mask |= __GFP_HIGHMEM;
}
}
}
#else
void adjust_gfp_mask(gfp_t *unused)
{
}
#endif
void tune_lmk_param(int *other_free, int *other_file, struct shrink_control *sc)
{
gfp_t gfp_mask;
struct zone *preferred_zone;
struct zonelist *zonelist;
enum zone_type high_zoneidx, classzone_idx;
unsigned long balance_gap;
int use_cma_pages;
gfp_mask = sc->gfp_mask;
adjust_gfp_mask(&gfp_mask);
zonelist = node_zonelist(0, gfp_mask);
high_zoneidx = gfp_zone(gfp_mask);
first_zones_zonelist(zonelist, high_zoneidx, NULL, &preferred_zone);
classzone_idx = zone_idx(preferred_zone);
use_cma_pages = can_use_cma_pages(gfp_mask);
balance_gap = min(low_wmark_pages(preferred_zone),
(preferred_zone->present_pages +
KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
KSWAPD_ZONE_BALANCE_GAP_RATIO);
if (likely(current_is_kswapd() && zone_watermark_ok(preferred_zone, 0,
high_wmark_pages(preferred_zone) + SWAP_CLUSTER_MAX +
balance_gap, 0, 0))) {
if (lmk_fast_run)
tune_lmk_zone_param(zonelist, classzone_idx, other_free,
other_file, use_cma_pages);
else
tune_lmk_zone_param(zonelist, classzone_idx, other_free,
NULL, use_cma_pages);
if (zone_watermark_ok(preferred_zone, 0, 0, _ZONE, 0)) {
if (!use_cma_pages) {
*other_free -= min(
preferred_zone->lowmem_reserve[_ZONE]
+ zone_page_state(
preferred_zone, NR_FREE_CMA_PAGES),
zone_page_state(
preferred_zone, NR_FREE_PAGES));
} else {
*other_free -=
preferred_zone->lowmem_reserve[_ZONE];
}
} else {
*other_free -= zone_page_state(preferred_zone,
NR_FREE_PAGES);
}
lowmem_print(4, "lowmem_shrink of kswapd tunning for highmem "
"ofree %d, %d\n", *other_free, *other_file);
} else {
tune_lmk_zone_param(zonelist, classzone_idx, other_free,
other_file, use_cma_pages);
if (!use_cma_pages) {
*other_free -=
zone_page_state(preferred_zone, NR_FREE_CMA_PAGES);
}
lowmem_print(4, "lowmem_shrink tunning for others ofree %d, "
"%d\n", *other_free, *other_file);
}
#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER_CONSIDER_SWAP
if (zone_watermark_ok(preferred_zone, 0,
low_wmark_pages(preferred_zone), 0, 0)) {
struct sysinfo si;
si_swapinfo(&si);
*other_free += si.freeswap;
#ifdef CONFIG_ZRAM
/* If swap is actually residing in RAM (e. g. the swap device
* is a ZRAM device), we need to subtract the amount of RAM
* that will be occupied by compressed data. To play on the
* safe side, it's better to subtract too much than too few,
* otherwise LMK may not be triggered when it has to be. ZRAM
* compression ratio is at least 2, so we subtract half of the
* reported freeswap.
*/
*other_free -= si.freeswap >> 1;
#endif
lowmem_print(4, "lowmem_shrink tunning for swap "
"ofree %d, %d\n", *other_free, *other_file);
}
#endif
}
#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER_CONSIDER_SWAP
static void lowmem_wakeup_kswapd(struct shrink_control *sc, int minfree)
{
gfp_t gfp_mask;
struct zone *preferred_zone;
struct zonelist *zonelist;
enum zone_type high_zoneidx, classzone_idx;
int order = 0;
if (likely(current_is_kswapd()))
return;
gfp_mask = sc->gfp_mask;
adjust_gfp_mask(&gfp_mask);
zonelist = node_zonelist(0, gfp_mask);
high_zoneidx = gfp_zone(gfp_mask);
first_zones_zonelist(zonelist, high_zoneidx, NULL, &preferred_zone);
classzone_idx = zone_idx(preferred_zone);
for (minfree >>= 13; order < 7; order++) {
if (minfree <= (1 << order))
break;
}
lowmem_print(4, "lowmem_wakeup_kswapd order %d\n", order);
wakeup_kswapd(preferred_zone, order, classzone_idx);
}
#endif
static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)
{
struct task_struct *tsk;
struct task_struct *selected = NULL;
int rem = 0;
unsigned long rem = 0;
int tasksize;
int i;
int ret = 0;
short min_score_adj = OOM_SCORE_ADJ_MAX + 1;
int minfree = 0;
#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER_CONSIDER_SWAP
int max_minfree = 0;
#endif
int selected_tasksize = 0;
short selected_oom_score_adj;
int array_size = ARRAY_SIZE(lowmem_adj);
int other_free;
int other_file;
unsigned long nr_to_scan = sc->nr_to_scan;
if (nr_to_scan > 0) {
if (mutex_lock_interruptible(&scan_mutex) < 0) {
trace_lmk_remain_scan(0, nr_to_scan, sc->gfp_mask);
return 0;
};
}
other_free = global_page_state(NR_FREE_PAGES);
if (global_page_state(NR_SHMEM) + global_page_state(NR_MLOCK_FILE) +
total_swapcache_pages() < global_page_state(NR_FILE_PAGES))
other_file = global_page_state(NR_FILE_PAGES) -
int other_free = global_page_state(NR_FREE_PAGES) - totalreserve_pages;
int other_file = global_page_state(NR_FILE_PAGES) -
global_page_state(NR_SHMEM) -
global_page_state(NR_MLOCK_FILE) -
global_page_state(NR_UNEVICTABLE) -
total_swapcache_pages();
else
other_file = 0;
tune_lmk_param(&other_free, &other_file, sc);
if (lowmem_adj_size < array_size)
array_size = lowmem_adj_size;
@ -471,43 +101,22 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
array_size = lowmem_minfree_size;
for (i = 0; i < array_size; i++) {
minfree = lowmem_minfree[i];
#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER_CONSIDER_SWAP
if (max_minfree < minfree)
max_minfree = minfree;
#endif
if (other_free < minfree && other_file < minfree) {
min_score_adj = lowmem_adj[i];
break;
}
}
if (nr_to_scan > 0) {
ret = adjust_minadj(&min_score_adj);
lowmem_print(3, "lowmem_shrink %lu, %x, ofree %d %d, ma %hd\n",
nr_to_scan, sc->gfp_mask, other_free,
other_file, min_score_adj);
lowmem_print(3, "lowmem_scan %lu, %x, ofree %d %d, ma %hd\n",
sc->nr_to_scan, sc->gfp_mask, other_free,
other_file, min_score_adj);
if (min_score_adj == OOM_SCORE_ADJ_MAX + 1) {
lowmem_print(5, "lowmem_scan %lu, %x, return 0\n",
sc->nr_to_scan, sc->gfp_mask);
return 0;
}
rem = global_page_state(NR_ACTIVE_ANON) +
global_page_state(NR_ACTIVE_FILE) +
global_page_state(NR_INACTIVE_ANON) +
global_page_state(NR_INACTIVE_FILE);
if (nr_to_scan <= 0 || min_score_adj == OOM_SCORE_ADJ_MAX + 1) {
lowmem_print(5, "lowmem_shrink %lu, %x, return %d\n",
nr_to_scan, sc->gfp_mask, rem);
if (nr_to_scan > 0)
mutex_unlock(&scan_mutex);
if ((min_score_adj == OOM_SCORE_ADJ_MAX + 1) &&
(nr_to_scan > 0))
trace_almk_shrink(0, ret, other_free, other_file, 0);
trace_lmk_remain_scan(rem, nr_to_scan, sc->gfp_mask);
#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER_CONSIDER_SWAP
lowmem_wakeup_kswapd(sc, max_minfree);
#endif
return rem;
}
selected_oom_score_adj = min_score_adj;
rcu_read_lock();
@ -518,30 +127,16 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
if (tsk->flags & PF_KTHREAD)
continue;
/* if task no longer has any memory ignore it */
if (test_task_flag(tsk, TIF_MM_RELEASED))
continue;
/* Ignore task if coredump in progress */
if (tsk->mm && tsk->mm->core_state)
continue;
if (time_before_eq(jiffies, lowmem_deathpending_timeout)) {
if (test_task_flag(tsk, TIF_MEMDIE)) {
rcu_read_unlock();
/* give the system time to free up the memory */
msleep_interruptible(20);
mutex_unlock(&scan_mutex);
trace_lmk_remain_scan(rem, nr_to_scan,
sc->gfp_mask);
return 0;
}
}
p = find_lock_task_mm(tsk);
if (!p)
continue;
if (test_tsk_thread_flag(p, TIF_MEMDIE) &&
time_before_eq(jiffies, lowmem_deathpending_timeout)) {
task_unlock(p);
rcu_read_unlock();
return 0;
}
oom_score_adj = p->signal->oom_score_adj;
if (oom_score_adj < min_score_adj) {
task_unlock(p);
@ -561,7 +156,7 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
selected = p;
selected_tasksize = tasksize;
selected_oom_score_adj = oom_score_adj;
lowmem_print(3, "select '%s' (%d), adj %hd, size %d, to kill\n",
lowmem_print(2, "select '%s' (%d), adj %hd, size %d, to kill\n",
p->comm, p->pid, oom_score_adj, tasksize);
}
if (selected) {
@ -572,83 +167,35 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
lowmem_print(1, "Killing '%s' (%d), adj %hd,\n" \
" to free %ldkB on behalf of '%s' (%d) because\n" \
" cache %ldkB is below limit %ldkB for oom_score_adj %hd\n" \
" Free memory is %ldkB above reserved.\n" \
" Free CMA is %ldkB\n" \
" Total reserve is %ldkB\n" \
" Total free pages is %ldkB\n" \
" Total file cache is %ldkB\n" \
" Slab Reclaimable is %ldkB\n" \
" Slab UnReclaimable is %ldkB\n" \
" Total Slab is %ldkB\n" \
" GFP mask is 0x%x\n",
" Free memory is %ldkB above reserved\n",
selected->comm, selected->pid,
selected_oom_score_adj,
selected_tasksize * (long)(PAGE_SIZE / 1024),
current->comm, current->pid,
cache_size, cache_limit,
min_score_adj,
free ,
global_page_state(NR_FREE_CMA_PAGES) *
(long)(PAGE_SIZE / 1024),
totalreserve_pages * (long)(PAGE_SIZE / 1024),
global_page_state(NR_FREE_PAGES) *
(long)(PAGE_SIZE / 1024),
global_page_state(NR_FILE_PAGES) *
(long)(PAGE_SIZE / 1024),
global_page_state(NR_SLAB_RECLAIMABLE) *
(long)(PAGE_SIZE / 1024),
global_page_state(NR_SLAB_UNRECLAIMABLE) *
(long)(PAGE_SIZE / 1024),
global_page_state(NR_SLAB_RECLAIMABLE) *
(long)(PAGE_SIZE / 1024) +
global_page_state(NR_SLAB_UNRECLAIMABLE) *
(long)(PAGE_SIZE / 1024),
sc->gfp_mask);
if (lowmem_debug_level >= 2 && selected_oom_score_adj == 0) {
show_mem(SHOW_MEM_FILTER_NODES);
dump_tasks(NULL, NULL);
show_mem_call_notifiers();
}
free);
lowmem_deathpending_timeout = jiffies + HZ;
/*
* FIXME: lowmemorykiller shouldn't abuse global OOM killer
* infrastructure. There is no real reason why the selected
* task should have access to the memory reserves.
*/
mark_tsk_oom_victim(selected);
set_tsk_thread_flag(selected, TIF_MEMDIE);
send_sig(SIGKILL, selected, 0);
rem -= selected_tasksize;
rcu_read_unlock();
trace_lmk_sigkill(selected->pid, selected->comm,
selected_oom_score_adj, selected_tasksize,
sc->gfp_mask);
/* give the system time to free up the memory */
msleep_interruptible(20);
trace_almk_shrink(selected_tasksize, ret,
other_free, other_file, selected_oom_score_adj);
} else {
trace_almk_shrink(1, ret, other_free, other_file, 0);
rcu_read_unlock();
rem += selected_tasksize;
}
lowmem_print(4, "lowmem_shrink %lu, %x, return %d\n",
nr_to_scan, sc->gfp_mask, rem);
mutex_unlock(&scan_mutex);
trace_lmk_remain_scan(rem, nr_to_scan, sc->gfp_mask);
lowmem_print(4, "lowmem_scan %lu, %x, return %lu\n",
sc->nr_to_scan, sc->gfp_mask, rem);
rcu_read_unlock();
return rem;
}
static struct shrinker lowmem_shrinker = {
.shrink = lowmem_shrink,
.scan_objects = lowmem_scan,
.count_objects = lowmem_count,
.seeks = DEFAULT_SEEKS * 16
};
static int __init lowmem_init(void)
{
register_shrinker(&lowmem_shrinker);
vmpressure_notifier_register(&lmk_vmpr_nb);
return 0;
}
@ -736,10 +283,8 @@ static const struct kparam_array __param_arr_adj = {
module_param_named(cost, lowmem_shrinker.seeks, int, S_IRUGO | S_IWUSR);
#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER_AUTODETECT_OOM_ADJ_VALUES
__module_param_call(MODULE_PARAM_PREFIX, adj,
&lowmem_adj_array_ops,
.arr = &__param_arr_adj,
S_IRUGO | S_IWUSR, -1);
module_param_cb(adj, &lowmem_adj_array_ops,
.arr = &__param_arr_adj, S_IRUGO | S_IWUSR);
__MODULE_PARM_TYPE(adj, "array of short");
#else
module_param_array_named(adj, lowmem_adj, short, &lowmem_adj_size,
@ -748,7 +293,6 @@ module_param_array_named(adj, lowmem_adj, short, &lowmem_adj_size,
module_param_array_named(minfree, lowmem_minfree, uint, &lowmem_minfree_size,
S_IRUGO | S_IWUSR);
module_param_named(debug_level, lowmem_debug_level, uint, S_IRUGO | S_IWUSR);
module_param_named(lmk_fast_run, lmk_fast_run, int, S_IRUGO | S_IWUSR);
module_init(lowmem_init);
module_exit(lowmem_exit);

View File

@ -0,0 +1,541 @@
/*
* Copyright (C) 2008 Google, Inc.
*
* Based on, but no longer compatible with, the original
* OpenBinder.org binder driver interface, which is:
*
* Copyright (c) 2005 Palmsource, Inc.
*
* This software is licensed under the terms of the GNU General Public
* License version 2, as published by the Free Software Foundation, and
* may be copied, distributed, and modified under those terms.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
*/
#ifndef _UAPI_LINUX_BINDER_H
#define _UAPI_LINUX_BINDER_H
#include <linux/ioctl.h>
#define B_PACK_CHARS(c1, c2, c3, c4) \
((((c1)<<24)) | (((c2)<<16)) | (((c3)<<8)) | (c4))
#define B_TYPE_LARGE 0x85
enum {
BINDER_TYPE_BINDER = B_PACK_CHARS('s', 'b', '*', B_TYPE_LARGE),
BINDER_TYPE_WEAK_BINDER = B_PACK_CHARS('w', 'b', '*', B_TYPE_LARGE),
BINDER_TYPE_HANDLE = B_PACK_CHARS('s', 'h', '*', B_TYPE_LARGE),
BINDER_TYPE_WEAK_HANDLE = B_PACK_CHARS('w', 'h', '*', B_TYPE_LARGE),
BINDER_TYPE_FD = B_PACK_CHARS('f', 'd', '*', B_TYPE_LARGE),
BINDER_TYPE_FDA = B_PACK_CHARS('f', 'd', 'a', B_TYPE_LARGE),
BINDER_TYPE_PTR = B_PACK_CHARS('p', 't', '*', B_TYPE_LARGE),
};
/**
* enum flat_binder_object_shifts: shift values for flat_binder_object_flags
* @FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT: shift for getting scheduler policy.
*
*/
enum flat_binder_object_shifts {
FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT = 9,
};
/**
* enum flat_binder_object_flags - flags for use in flat_binder_object.flags
*/
enum flat_binder_object_flags {
/**
* @FLAT_BINDER_FLAG_PRIORITY_MASK: bit-mask for min scheduler priority
*
* These bits can be used to set the minimum scheduler priority
* at which transactions into this node should run. Valid values
* in these bits depend on the scheduler policy encoded in
* @FLAT_BINDER_FLAG_SCHED_POLICY_MASK.
*
* For SCHED_NORMAL/SCHED_BATCH, the valid range is between [-20..19]
* For SCHED_FIFO/SCHED_RR, the value can run between [1..99]
*/
FLAT_BINDER_FLAG_PRIORITY_MASK = 0xff,
/**
* @FLAT_BINDER_FLAG_ACCEPTS_FDS: whether the node accepts fds.
*/
FLAT_BINDER_FLAG_ACCEPTS_FDS = 0x100,
/**
* @FLAT_BINDER_FLAG_SCHED_POLICY_MASK: bit-mask for scheduling policy
*
* These two bits can be used to set the min scheduling policy at which
* transactions on this node should run. These match the UAPI
* scheduler policy values, eg:
* 00b: SCHED_NORMAL
* 01b: SCHED_FIFO
* 10b: SCHED_RR
* 11b: SCHED_BATCH
*/
FLAT_BINDER_FLAG_SCHED_POLICY_MASK =
3U << FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT,
/**
* @FLAT_BINDER_FLAG_INHERIT_RT: whether the node inherits RT policy
*
* Only when set, calls into this node will inherit a real-time
* scheduling policy from the caller (for synchronous transactions).
*/
FLAT_BINDER_FLAG_INHERIT_RT = 0x800,
/**
* @FLAT_BINDER_FLAG_TXN_SECURITY_CTX: request security contexts
*
* Only when set, causes senders to include their security
* context
*/
FLAT_BINDER_FLAG_TXN_SECURITY_CTX = 0x1000,
};
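/*
 * A hedged example of composing these flags for a node that accepts file
 * descriptors and requests SCHED_FIFO at real-time priority 50 (SCHED_FIFO
 * comes from <sched.h>; the values follow the ranges documented above):
 *
 *	__u32 flags = FLAT_BINDER_FLAG_ACCEPTS_FDS |
 *		      (SCHED_FIFO << FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT) |
 *		      (50 & FLAT_BINDER_FLAG_PRIORITY_MASK);
 */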
#ifdef BINDER_IPC_32BIT
typedef __u32 binder_size_t;
typedef __u32 binder_uintptr_t;
#else
typedef __u64 binder_size_t;
typedef __u64 binder_uintptr_t;
#endif
/**
* struct binder_object_header - header shared by all binder metadata objects.
* @type: type of the object
*/
struct binder_object_header {
__u32 type;
};
/*
* This is the flattened representation of a Binder object for transfer
* between processes. The 'offsets' supplied as part of a binder transaction
* contains offsets into the data where these structures occur. The Binder
* driver takes care of re-writing the structure type and data as it moves
* between processes.
*/
struct flat_binder_object {
struct binder_object_header hdr;
__u32 flags;
/* 8 bytes of data. */
union {
binder_uintptr_t binder; /* local object */
__u32 handle; /* remote object */
};
/* extra data associated with local object */
binder_uintptr_t cookie;
};
/**
* struct binder_fd_object - describes a filedescriptor to be fixed up.
* @hdr: common header structure
* @pad_flags: padding to remain compatible with old userspace code
* @pad_binder: padding to remain compatible with old userspace code
* @fd: file descriptor
* @cookie: opaque data, used by user-space
*/
struct binder_fd_object {
struct binder_object_header hdr;
__u32 pad_flags;
union {
binder_uintptr_t pad_binder;
__u32 fd;
};
binder_uintptr_t cookie;
};
/* struct binder_buffer_object - object describing a userspace buffer
* @hdr: common header structure
* @flags: one or more BINDER_BUFFER_* flags
* @buffer: address of the buffer
* @length: length of the buffer
* @parent: index in offset array pointing to parent buffer
* @parent_offset: offset in @parent pointing to this buffer
*
* A binder_buffer object represents an object that the
* binder kernel driver can copy verbatim to the target
* address space. A buffer itself may be pointed to from
* within another buffer, meaning that the pointer inside
* that other buffer needs to be fixed up as well. This
* can be done by setting the BINDER_BUFFER_FLAG_HAS_PARENT
* flag in @flags, by setting @parent buffer to the index
* in the offset array pointing to the parent binder_buffer_object,
* and by setting @parent_offset to the offset in the parent buffer
* at which the pointer to this buffer is located.
*/
struct binder_buffer_object {
struct binder_object_header hdr;
__u32 flags;
binder_uintptr_t buffer;
binder_size_t length;
binder_size_t parent;
binder_size_t parent_offset;
};
enum {
BINDER_BUFFER_FLAG_HAS_PARENT = 0x01,
};
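/*
 * A hedged example of the parent fixup described above ("payload" and the
 * numeric values are illustrative): if the parent binder_buffer_object is
 * at index 1 in the offsets array and stores the pointer to this buffer at
 * byte offset 16, the nested buffer would be described as
 *
 *	struct binder_buffer_object bbo = {
 *		.hdr.type      = BINDER_TYPE_PTR,
 *		.flags         = BINDER_BUFFER_FLAG_HAS_PARENT,
 *		.buffer        = (binder_uintptr_t)payload,
 *		.length        = payload_len,
 *		.parent        = 1,
 *		.parent_offset = 16,
 *	};
 */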
/* struct binder_fd_array_object - object describing an array of fds in a buffer
* @hdr: common header structure
* @pad: padding to ensure correct alignment
* @num_fds: number of file descriptors in the buffer
* @parent: index in offset array to buffer holding the fd array
* @parent_offset: start offset of fd array in the buffer
*
* A binder_fd_array object represents an array of file
* descriptors embedded in a binder_buffer_object. It is
* different from a regular binder_buffer_object because it
* describes a list of file descriptors to fix up, not an opaque
* blob of memory, and hence the kernel needs to treat it differently.
*
* An example of how this would be used is with Android's
* native_handle_t object, which is a struct with a list of integers
* and a list of file descriptors. The native_handle_t struct itself
will be represented by a struct binder_buffer_object, whereas the
* embedded list of file descriptors is represented by a
* struct binder_fd_array_object with that binder_buffer_object as
* a parent.
*/
struct binder_fd_array_object {
struct binder_object_header hdr;
__u32 pad;
binder_size_t num_fds;
binder_size_t parent;
binder_size_t parent_offset;
};
/*
* On 64-bit platforms where user code may run in 32-bits the driver must
* translate the buffer (and local binder) addresses appropriately.
*/
struct binder_write_read {
binder_size_t write_size; /* bytes to write */
binder_size_t write_consumed; /* bytes consumed by driver */
binder_uintptr_t write_buffer;
binder_size_t read_size; /* bytes to read */
binder_size_t read_consumed; /* bytes consumed by driver */
binder_uintptr_t read_buffer;
};
/* Use with BINDER_VERSION, driver fills in fields. */
struct binder_version {
/* driver protocol version -- increment with incompatible change */
__s32 protocol_version;
};
/* This is the current protocol version. */
#ifdef BINDER_IPC_32BIT
#define BINDER_CURRENT_PROTOCOL_VERSION 7
#else
#define BINDER_CURRENT_PROTOCOL_VERSION 8
#endif
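/*
 * A minimal userspace sketch of checking the protocol version (the
 * /dev/binder path is the conventional device node, not defined in this
 * header; BINDER_VERSION is defined further below):
 *
 *	struct binder_version vers;
 *	int fd = open("/dev/binder", O_RDWR | O_CLOEXEC);
 *
 *	if (fd < 0 || ioctl(fd, BINDER_VERSION, &vers) < 0 ||
 *	    vers.protocol_version != BINDER_CURRENT_PROTOCOL_VERSION)
 *		... bail out: driver and header disagree ...
 */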
/*
* Use with BINDER_GET_NODE_DEBUG_INFO, driver reads ptr, writes to all fields.
* Set ptr to NULL for the first call to get the info for the first node, and
* then repeat the call passing the previously returned value to get the next
* nodes. ptr will be 0 when there are no more nodes.
*/
struct binder_node_debug_info {
binder_uintptr_t ptr;
binder_uintptr_t cookie;
__u32 has_strong_ref;
__u32 has_weak_ref;
};
struct binder_node_info_for_ref {
__u32 handle;
__u32 strong_count;
__u32 weak_count;
__u32 reserved1;
__u32 reserved2;
__u32 reserved3;
};
#define BINDER_WRITE_READ _IOWR('b', 1, struct binder_write_read)
#define BINDER_SET_IDLE_TIMEOUT _IOW('b', 3, __s64)
#define BINDER_SET_MAX_THREADS _IOW('b', 5, __u32)
#define BINDER_SET_IDLE_PRIORITY _IOW('b', 6, __s32)
#define BINDER_SET_CONTEXT_MGR _IOW('b', 7, __s32)
#define BINDER_THREAD_EXIT _IOW('b', 8, __s32)
#define BINDER_VERSION _IOWR('b', 9, struct binder_version)
#define BINDER_GET_NODE_DEBUG_INFO _IOWR('b', 11, struct binder_node_debug_info)
#define BINDER_GET_NODE_INFO_FOR_REF _IOWR('b', 12, struct binder_node_info_for_ref)
#define BINDER_SET_CONTEXT_MGR_EXT _IOW('b', 13, struct flat_binder_object)
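/*
 * Illustrative sketch, not part of this header: walking all nodes with
 * BINDER_GET_NODE_DEBUG_INFO as described above. Assumes "binder_fd" is an
 * open /dev/binder descriptor and that <stdio.h> and <sys/ioctl.h> are
 * available; dump_binder_nodes() is a made-up name.
 */
static inline void dump_binder_nodes(int binder_fd)
{
	struct binder_node_debug_info info = { .ptr = 0 };

	do {
		if (ioctl(binder_fd, BINDER_GET_NODE_DEBUG_INFO, &info) < 0)
			break;
		if (info.ptr)
			printf("node %llx strong=%u weak=%u\n",
			       (unsigned long long)info.ptr,
			       info.has_strong_ref, info.has_weak_ref);
	} while (info.ptr);	/* ptr is 0 once every node has been reported */
}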
/*
* NOTE: Two special error codes you should check for when calling
* in to the driver are:
*
 * EINTR -- The operation has been interrupted. This should be
* handled by retrying the ioctl() until a different error code
* is returned.
*
* ECONNREFUSED -- The driver is no longer accepting operations
* from your process. That is, the process is being destroyed.
* You should handle this by exiting from your process. Note
* that once this error code is returned, all further calls to
* the driver from any thread will return this same code.
*/
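/*
 * Illustrative sketch, not part of this header: the usual write/read loop,
 * retrying on EINTR and giving up on ECONNREFUSED as described in the note
 * above. Assumes <errno.h>, <stdint.h> and <sys/ioctl.h>; "binder_fd" and
 * binder_talk() are made-up names and error handling is trimmed.
 */
static inline int binder_talk(int binder_fd,
			      void *write_buf, binder_size_t write_size,
			      void *read_buf, binder_size_t read_size)
{
	struct binder_write_read bwr;

	bwr.write_buffer = (binder_uintptr_t)(uintptr_t)write_buf;
	bwr.write_size = write_size;
	bwr.write_consumed = 0;
	bwr.read_buffer = (binder_uintptr_t)(uintptr_t)read_buf;
	bwr.read_size = read_size;
	bwr.read_consumed = 0;

	while (ioctl(binder_fd, BINDER_WRITE_READ, &bwr) < 0) {
		if (errno == EINTR)
			continue;	/* interrupted: retry the same ioctl */
		return -errno;		/* e.g. ECONNREFUSED: caller should bail out */
	}
	return 0;
}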
enum transaction_flags {
TF_ONE_WAY = 0x01, /* this is a one-way call: async, no return */
TF_ROOT_OBJECT = 0x04, /* contents are the component's root object */
TF_STATUS_CODE = 0x08, /* contents are a 32-bit status code */
TF_ACCEPT_FDS = 0x10, /* allow replies with file descriptors */
};
struct binder_transaction_data {
/* The first two are only used for bcTRANSACTION and brTRANSACTION,
* identifying the target and contents of the transaction.
*/
union {
/* target descriptor of command transaction */
__u32 handle;
/* target descriptor of return transaction */
binder_uintptr_t ptr;
} target;
binder_uintptr_t cookie; /* target object cookie */
__u32 code; /* transaction command */
/* General information about the transaction. */
__u32 flags;
pid_t sender_pid;
uid_t sender_euid;
binder_size_t data_size; /* number of bytes of data */
binder_size_t offsets_size; /* number of bytes of offsets */
/* If this transaction is inline, the data immediately
* follows here; otherwise, it ends with a pointer to
* the data buffer.
*/
union {
struct {
/* transaction data */
binder_uintptr_t buffer;
/* offsets from buffer to flat_binder_object structs */
binder_uintptr_t offsets;
} ptr;
__u8 buf[8];
} data;
};
struct binder_transaction_data_secctx {
struct binder_transaction_data transaction_data;
binder_uintptr_t secctx;
};
struct binder_transaction_data_sg {
struct binder_transaction_data transaction_data;
binder_size_t buffers_size;
};
struct binder_ptr_cookie {
binder_uintptr_t ptr;
binder_uintptr_t cookie;
};
struct binder_handle_cookie {
__u32 handle;
binder_uintptr_t cookie;
} __packed;
struct binder_pri_desc {
__s32 priority;
__u32 desc;
};
struct binder_pri_ptr_cookie {
__s32 priority;
binder_uintptr_t ptr;
binder_uintptr_t cookie;
};
enum binder_driver_return_protocol {
BR_ERROR = _IOR('r', 0, __s32),
/*
* int: error code
*/
BR_OK = _IO('r', 1),
/* No parameters! */
BR_TRANSACTION_SEC_CTX = _IOR('r', 2,
struct binder_transaction_data_secctx),
/*
* binder_transaction_data_secctx: the received command.
*/
BR_TRANSACTION = _IOR('r', 2, struct binder_transaction_data),
BR_REPLY = _IOR('r', 3, struct binder_transaction_data),
/*
* binder_transaction_data: the received command.
*/
BR_ACQUIRE_RESULT = _IOR('r', 4, __s32),
/*
* not currently supported
* int: 0 if the last bcATTEMPT_ACQUIRE was not successful.
* Else the remote object has acquired a primary reference.
*/
BR_DEAD_REPLY = _IO('r', 5),
/*
* The target of the last transaction (either a bcTRANSACTION or
* a bcATTEMPT_ACQUIRE) is no longer with us. No parameters.
*/
BR_TRANSACTION_COMPLETE = _IO('r', 6),
/*
* No parameters... always refers to the last transaction requested
* (including replies). Note that this will be sent even for
* asynchronous transactions.
*/
BR_INCREFS = _IOR('r', 7, struct binder_ptr_cookie),
BR_ACQUIRE = _IOR('r', 8, struct binder_ptr_cookie),
BR_RELEASE = _IOR('r', 9, struct binder_ptr_cookie),
BR_DECREFS = _IOR('r', 10, struct binder_ptr_cookie),
/*
* void *: ptr to binder
* void *: cookie for binder
*/
BR_ATTEMPT_ACQUIRE = _IOR('r', 11, struct binder_pri_ptr_cookie),
/*
* not currently supported
* int: priority
* void *: ptr to binder
* void *: cookie for binder
*/
BR_NOOP = _IO('r', 12),
/*
* No parameters. Do nothing and examine the next command. It exists
* primarily so that we can replace it with a BR_SPAWN_LOOPER command.
*/
BR_SPAWN_LOOPER = _IO('r', 13),
/*
* No parameters. The driver has determined that a process has no
* threads waiting to service incoming transactions. When a process
* receives this command, it must spawn a new service thread and
* register it via bcENTER_LOOPER.
*/
BR_FINISHED = _IO('r', 14),
/*
* not currently supported
* stop threadpool thread
*/
BR_DEAD_BINDER = _IOR('r', 15, binder_uintptr_t),
/*
* void *: cookie
*/
BR_CLEAR_DEATH_NOTIFICATION_DONE = _IOR('r', 16, binder_uintptr_t),
/*
* void *: cookie
*/
BR_FAILED_REPLY = _IO('r', 17),
/*
 * The last transaction (either a bcTRANSACTION or
* a bcATTEMPT_ACQUIRE) failed (e.g. out of memory). No parameters.
*/
};
enum binder_driver_command_protocol {
BC_TRANSACTION = _IOW('c', 0, struct binder_transaction_data),
BC_REPLY = _IOW('c', 1, struct binder_transaction_data),
/*
* binder_transaction_data: the sent command.
*/
BC_ACQUIRE_RESULT = _IOW('c', 2, __s32),
/*
* not currently supported
* int: 0 if the last BR_ATTEMPT_ACQUIRE was not successful.
* Else you have acquired a primary reference on the object.
*/
BC_FREE_BUFFER = _IOW('c', 3, binder_uintptr_t),
/*
* void *: ptr to transaction data received on a read
*/
BC_INCREFS = _IOW('c', 4, __u32),
BC_ACQUIRE = _IOW('c', 5, __u32),
BC_RELEASE = _IOW('c', 6, __u32),
BC_DECREFS = _IOW('c', 7, __u32),
/*
* int: descriptor
*/
BC_INCREFS_DONE = _IOW('c', 8, struct binder_ptr_cookie),
BC_ACQUIRE_DONE = _IOW('c', 9, struct binder_ptr_cookie),
/*
* void *: ptr to binder
* void *: cookie for binder
*/
BC_ATTEMPT_ACQUIRE = _IOW('c', 10, struct binder_pri_desc),
/*
* not currently supported
* int: priority
* int: descriptor
*/
BC_REGISTER_LOOPER = _IO('c', 11),
/*
* No parameters.
* Register a spawned looper thread with the device.
*/
BC_ENTER_LOOPER = _IO('c', 12),
BC_EXIT_LOOPER = _IO('c', 13),
/*
* No parameters.
* These two commands are sent as an application-level thread
* enters and exits the binder loop, respectively. They are
* used so the binder can have an accurate count of the number
* of looping threads it has available.
*/
BC_REQUEST_DEATH_NOTIFICATION = _IOW('c', 14,
struct binder_handle_cookie),
/*
* int: handle
* void *: cookie
*/
BC_CLEAR_DEATH_NOTIFICATION = _IOW('c', 15,
struct binder_handle_cookie),
/*
* int: handle
* void *: cookie
*/
BC_DEAD_BINDER_DONE = _IOW('c', 16, binder_uintptr_t),
/*
* void *: cookie
*/
BC_TRANSACTION_SG = _IOW('c', 17, struct binder_transaction_data_sg),
BC_REPLY_SG = _IOW('c', 18, struct binder_transaction_data_sg),
/*
* binder_transaction_data_sg: the sent command.
*/
};
#endif /* _UAPI_LINUX_BINDER_H */

View File

@@ -181,17 +181,7 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
#include <uapi/linux/types.h>
static __always_inline void data_access_exceeds_word_size(void)
#ifdef __compiletime_warning
__compiletime_warning("data access exceeds word size and won't be atomic")
#endif
;
static __always_inline void data_access_exceeds_word_size(void)
{
}
static __always_inline void __read_once_size(volatile void *p, void *res, int size)
static __always_inline void __read_once_size(const volatile void *p, void *res, int size)
{
switch (size) {
case 1: *(__u8 *)res = *(volatile __u8 *)p; break;
@@ -203,7 +193,6 @@ static __always_inline void __read_once_size(volatile void *p, void *res, int si
default:
barrier();
__builtin_memcpy((void *)res, (const void *)p, size);
data_access_exceeds_word_size();
barrier();
}
}
@@ -220,7 +209,6 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
default:
barrier();
__builtin_memcpy((void *)p, (const void *)res, size);
data_access_exceeds_word_size();
barrier();
}
}
@@ -248,10 +236,10 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
*/
#define READ_ONCE(x) \
({ typeof(x) __val; __read_once_size(&x, &__val, sizeof(__val)); __val; })
({ union { typeof(x) __val; char __c[1]; } __u; __read_once_size(&(x), __u.__c, sizeof(x)); __u.__val; })
#define WRITE_ONCE(x, val) \
({ typeof(x) __val; __val = val; __write_once_size(&x, &__val, sizeof(__val)); __val; })
({ typeof(x) __val = (val); __write_once_size(&(x), &__val, sizeof(__val)); __val; })
#endif /* __KERNEL__ */
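/*
 * Illustrative sketch, not part of this hunk: the lockless pattern the
 * reworked READ_ONCE()/WRITE_ONCE() forms are aimed at. "demo_flag",
 * demo_publish() and demo_poll() are made-up names; the union-based macro
 * ensures each access is a single, non-torn load or store.
 */
static int demo_flag;

static inline void demo_publish(void)
{
	WRITE_ONCE(demo_flag, 1);	/* emitted as exactly one store */
}

static inline int demo_poll(void)
{
	return READ_ONCE(demo_flag);	/* emitted as exactly one load */
}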

View File

@@ -388,6 +388,17 @@ static inline void list_splice_tail_init(struct list_head *list,
#define list_prev_entry(pos, member) \
list_entry((pos)->member.prev, typeof(*(pos)), member)
/**
* list_first_entry_or_null - get the first element from a list
* @ptr: the list head to take the element from.
* @type: the type of the struct this is embedded in.
* @member: the name of the list_struct within the struct.
*
* Note that if the list is empty, it returns NULL.
*/
#define list_first_entry_or_null(ptr, type, member) \
(!list_empty(ptr) ? list_first_entry(ptr, type, member) : NULL)
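/*
 * Illustrative sketch, not part of this hunk: using the new helper to fetch
 * the head element without a separate list_empty() check. "struct demo_item"
 * and demo_peek_first() are made-up names.
 */
struct demo_item {
	struct list_head link;
	int value;
};

static inline struct demo_item *demo_peek_first(struct list_head *head)
{
	/* NULL when the list is empty, per the comment above. */
	return list_first_entry_or_null(head, struct demo_item, link);
}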
/**
* list_for_each - iterate over a list
* @pos: the &struct list_head to use as a loop cursor.

137
include/linux/list_lru.h Normal file
View File

@@ -0,0 +1,137 @@
/*
* Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved.
* Authors: David Chinner and Glauber Costa
*
* Generic LRU infrastructure
*/
#ifndef _LRU_LIST_H
#define _LRU_LIST_H
#include <linux/list.h>
#include <linux/nodemask.h>
/* list_lru_walk_cb has to always return one of those */
enum lru_status {
LRU_REMOVED, /* item removed from list */
LRU_REMOVED_RETRY, /* item removed, but lock has been
dropped and reacquired */
LRU_ROTATE, /* item referenced, give another pass */
LRU_SKIP, /* item cannot be locked, skip */
LRU_RETRY, /* item not freeable. May drop the lock
internally, but has to return locked. */
};
struct list_lru_node {
spinlock_t lock;
struct list_head list;
/* kept as signed so we can catch imbalance bugs */
long nr_items;
} ____cacheline_aligned_in_smp;
struct list_lru {
struct list_lru_node *node;
nodemask_t active_nodes;
};
void list_lru_destroy(struct list_lru *lru);
int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key);
static inline int list_lru_init(struct list_lru *lru)
{
return list_lru_init_key(lru, NULL);
}
/**
* list_lru_add: add an element to the lru list's tail
* @list_lru: the lru pointer
* @item: the item to be added.
*
 * If the element is already part of a list, this function returns without
 * doing anything. Therefore the caller does not need to keep state about
 * whether or not the element already belongs in the list and is allowed to
 * lazily update it. Note however that this is valid for *a* list, not *this*
 * list. If the caller organizes itself in a way that elements can be in more
 * than one type of list, it is up to the caller to fully remove the item from
 * the previous list (with list_lru_del() for instance) before moving it
 * to @list_lru
*
* Return value: true if the list was updated, false otherwise
*/
bool list_lru_add(struct list_lru *lru, struct list_head *item);
/**
* list_lru_del: delete an element to the lru list
* @list_lru: the lru pointer
* @item: the item to be deleted.
*
 * This function works analogously to list_lru_add in terms of list
 * manipulation. The comments about an element already belonging to
 * a list are also valid for list_lru_del.
*
* Return value: true if the list was updated, false otherwise
*/
bool list_lru_del(struct list_lru *lru, struct list_head *item);
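/*
 * Illustrative sketch, not part of this header: an object with an embedded
 * list_head moving on and off a list_lru. "struct demo_obj", demo_lru and the
 * helpers are made-up names; the object is assumed to be kmalloc()-backed (so
 * virt_to_page() works) and its lru_node initialised with INIT_LIST_HEAD().
 */
struct demo_obj {
	struct list_head lru_node;
	unsigned long payload;
};

static inline void demo_obj_mark_unused(struct list_lru *demo_lru,
					struct demo_obj *obj)
{
	/* Returns false (does nothing) if the object is already on a list. */
	list_lru_add(demo_lru, &obj->lru_node);
}

static inline void demo_obj_mark_used(struct list_lru *demo_lru,
				      struct demo_obj *obj)
{
	/* Returns false (does nothing) if the object is not on the list. */
	list_lru_del(demo_lru, &obj->lru_node);
}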
/**
* list_lru_count_node: return the number of objects currently held by @lru
* @lru: the lru pointer.
* @nid: the node id to count from.
*
* Always return a non-negative number, 0 for empty lists. There is no
* guarantee that the list is not updated while the count is being computed.
* Callers that want such a guarantee need to provide an outer lock.
*/
unsigned long list_lru_count_node(struct list_lru *lru, int nid);
static inline unsigned long list_lru_count(struct list_lru *lru)
{
long count = 0;
int nid;
for_each_node_mask(nid, lru->active_nodes)
count += list_lru_count_node(lru, nid);
return count;
}
typedef enum lru_status
(*list_lru_walk_cb)(struct list_head *item, spinlock_t *lock, void *cb_arg);
/**
* list_lru_walk_node: walk a list_lru, isolating and disposing freeable items.
* @lru: the lru pointer.
* @nid: the node id to scan from.
 * @isolate: callback function that is responsible for deciding what to do with
* the item currently being scanned
* @cb_arg: opaque type that will be passed to @isolate
* @nr_to_walk: how many items to scan.
*
* This function will scan all elements in a particular list_lru, calling the
* @isolate callback for each of those items, along with the current list
 * spinlock and a caller-provided opaque argument. The @isolate callback can choose to
* drop the lock internally, but *must* return with the lock held. The callback
* will return an enum lru_status telling the list_lru infrastructure what to
* do with the object being scanned.
*
* Please note that nr_to_walk does not mean how many objects will be freed,
* just how many objects will be scanned.
*
* Return value: the number of objects effectively removed from the LRU.
*/
unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
list_lru_walk_cb isolate, void *cb_arg,
unsigned long *nr_to_walk);
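/*
 * Illustrative sketch, not part of this header: a minimal @isolate callback
 * that frees every object it is handed, reusing "struct demo_obj" from the
 * sketch above (kfree() comes from <linux/slab.h>). The callback itself
 * removes the item and reports that with LRU_REMOVED; kfree() does not
 * sleep, so calling it under the lru lock is fine here.
 */
static inline enum lru_status demo_isolate(struct list_head *item,
					   spinlock_t *lock, void *cb_arg)
{
	struct demo_obj *obj = container_of(item, struct demo_obj, lru_node);

	list_del_init(item);	/* take the object off the lru list */
	kfree(obj);		/* dispose of it */
	return LRU_REMOVED;	/* nr_items is decremented by the walker */
}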
static inline unsigned long
list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate,
void *cb_arg, unsigned long nr_to_walk)
{
long isolated = 0;
int nid;
for_each_node_mask(nid, lru->active_nodes) {
isolated += list_lru_walk_node(lru, nid, isolate,
cb_arg, &nr_to_walk);
if (nr_to_walk <= 0)
break;
}
return isolated;
}
#endif /* _LRU_LIST_H */

View File

@@ -1560,6 +1560,62 @@ struct task_struct {
/* Future-safe accessor for struct task_struct's cpus_allowed. */
#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
#define MAX_NICE 19
#define MIN_NICE -20
#define NICE_WIDTH (MAX_NICE - MIN_NICE + 1)
/*
* Priority of a process goes from 0..MAX_PRIO-1, valid RT
* priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
* tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
* values are inverted: lower p->prio value means higher priority.
*
* The MAX_USER_RT_PRIO value allows the actual maximum
* RT priority to be separate from the value exported to
* user-space. This allows kernel threads to set their
* priority to a value higher than any user task. Note:
* MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
*/
#define MAX_USER_RT_PRIO 100
#define MAX_RT_PRIO MAX_USER_RT_PRIO
#define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH)
#define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2)
/*
* Convert user-nice values [ -20 ... 0 ... 19 ]
* to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
* and back.
*/
#define NICE_TO_PRIO(nice) ((nice) + DEFAULT_PRIO)
#define PRIO_TO_NICE(prio) ((prio) - DEFAULT_PRIO)
#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
/*
* 'User priority' is the nice value converted to something we
* can work with better when scaling various scheduler parameters,
* it's a [ 0 ... 39 ] range.
*/
#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
/*
* Convert nice value [19,-20] to rlimit style value [1,40].
*/
static inline long nice_to_rlimit(long nice)
{
return (MAX_NICE - nice + 1);
}
/*
* Convert rlimit style value [1,40] to nice value [-20, 19].
*/
static inline long rlimit_to_nice(long prio)
{
return (MAX_NICE - prio + 1);
}
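/*
 * Illustrative worked example, not part of this hunk: with MAX_RT_PRIO == 100
 * and NICE_WIDTH == 40, DEFAULT_PRIO is 120, so NICE_TO_PRIO(0) == 120,
 * NICE_TO_PRIO(-20) == 100 and NICE_TO_PRIO(19) == 139. Likewise
 * nice_to_rlimit(-20) == 40 and nice_to_rlimit(19) == 1, and
 * rlimit_to_nice() maps those values straight back.
 */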
#ifdef CONFIG_NUMA_BALANCING
extern void task_numa_fault(int node, int pages, bool migrated);
extern void set_numabalancing_state(bool enabled);

View File

@@ -1,24 +1,9 @@
#ifndef _SCHED_RT_H
#define _SCHED_RT_H
/*
* Priority of a process goes from 0..MAX_PRIO-1, valid RT
* priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
* tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
* values are inverted: lower p->prio value means higher priority.
*
* The MAX_USER_RT_PRIO value allows the actual maximum
* RT priority to be separate from the value exported to
* user-space. This allows kernel threads to set their
* priority to a value higher than any user task. Note:
* MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
*/
#define MAX_USER_RT_PRIO 100
#define MAX_RT_PRIO MAX_USER_RT_PRIO
#define MAX_PRIO (MAX_RT_PRIO + 40)
#define DEFAULT_PRIO (MAX_RT_PRIO + 20)
#define MAX_NICE 19
#define MIN_NICE -20
#define NICE_WIDTH (MAX_NICE - MIN_NICE + 1)
static inline int rt_prio(int prio)
{

View File

@@ -4,6 +4,12 @@
/*
* This struct is used to pass information from page reclaim to the shrinkers.
 * We consolidate the values for easier extension later.
*
* The 'gfpmask' refers to the allocation we are currently trying to
* fulfil.
*
* Note that 'shrink' will be passed nr_to_scan == 0 when the VM is
* querying the cache size, so a fastpath for that case is appropriate.
*/
struct shrink_control {
gfp_t gfp_mask;
@@ -12,23 +18,37 @@ struct shrink_control {
unsigned long nr_to_scan;
};
#define SHRINK_STOP (~0UL)
/*
* A callback you can register to apply pressure to ageable caches.
*
* 'sc' is passed shrink_control which includes a count 'nr_to_scan'
* and a 'gfpmask'. It should look through the least-recently-used
* 'nr_to_scan' entries and attempt to free them up. It should return
* the number of objects which remain in the cache. If it returns -1, it means
* it cannot do any scanning at this time (eg. there is a risk of deadlock).
* @shrink() should look through the least-recently-used 'nr_to_scan' entries
* and attempt to free them up. It should return the number of objects which
* remain in the cache. If it returns -1, it means it cannot do any scanning at
* this time (eg. there is a risk of deadlock).
*
* The 'gfpmask' refers to the allocation we are currently trying to
* fulfil.
* @count_objects should return the number of freeable items in the cache. If
* there are no objects to free or the number of freeable items cannot be
* determined, it should return 0. No deadlock checks should be done during the
 * count callback - the shrinker relies on aggregating the scan counts that
 * could not be executed due to potential deadlocks so that they can be run
 * at a later call, once the deadlock condition is no longer pending.
*
* Note that 'shrink' will be passed nr_to_scan == 0 when the VM is
* querying the cache size, so a fastpath for that case is appropriate.
* @scan_objects will only be called if @count_objects returned a non-zero
* value for the number of freeable objects. The callout should scan the cache
* and attempt to free items from the cache. It should then return the number
* of objects freed during the scan, or SHRINK_STOP if progress cannot be made
* due to potential deadlocks. If SHRINK_STOP is returned, then no further
* attempts to call the @scan_objects will be made from the current reclaim
* context.
*/
struct shrinker {
int (*shrink)(struct shrinker *, struct shrink_control *sc);
unsigned long (*count_objects)(struct shrinker *,
struct shrink_control *sc);
unsigned long (*scan_objects)(struct shrinker *,
struct shrink_control *sc);
int seeks; /* seeks to recreate an obj */
long batch; /* reclaim batch size, 0 = default */
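/*
 * Illustrative sketch, not part of this header: a shrinker built on the new
 * count_objects/scan_objects pair, backed by the list_lru and demo_isolate()
 * sketched in the list_lru.h excerpt above. "demo_lru", the callbacks and
 * demo_shrinker are made-up names; the shrinker would be registered with
 * register_shrinker(&demo_shrinker).
 */
static struct list_lru demo_lru;

static unsigned long demo_count_objects(struct shrinker *s,
					struct shrink_control *sc)
{
	/* 0 tells the VM there is currently nothing to free. */
	return list_lru_count(&demo_lru);
}

static unsigned long demo_scan_objects(struct shrinker *s,
				       struct shrink_control *sc)
{
	/* Report how many objects this pass actually freed. */
	return list_lru_walk(&demo_lru, demo_isolate, NULL, sc->nr_to_scan);
}

static struct shrinker demo_shrinker = {
	.count_objects	= demo_count_objects,
	.scan_objects	= demo_scan_objects,
	.seeks		= DEFAULT_SEEKS,
};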

View File

@@ -1,50 +0,0 @@
#ifndef __LINUX_VMPRESSURE_H
#define __LINUX_VMPRESSURE_H
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/workqueue.h>
#include <linux/gfp.h>
#include <linux/types.h>
#include <linux/cgroup.h>
struct vmpressure {
unsigned long scanned;
unsigned long reclaimed;
unsigned long stall;
/* The lock is used to keep the scanned/reclaimed above in sync. */
struct mutex sr_lock;
/* The list of vmpressure_event structs. */
struct list_head events;
/* Have to grab the lock on events traversal or modifications. */
struct mutex events_lock;
struct work_struct work;
};
struct mem_cgroup;
extern int vmpressure_notifier_register(struct notifier_block *nb);
extern int vmpressure_notifier_unregister(struct notifier_block *nb);
extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
unsigned long scanned, unsigned long reclaimed);
extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio);
#ifdef CONFIG_MEMCG
extern void vmpressure_init(struct vmpressure *vmpr);
extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg);
extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr);
extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css);
extern int vmpressure_register_event(struct cgroup *cg, struct cftype *cft,
struct eventfd_ctx *eventfd,
const char *args);
extern void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft,
struct eventfd_ctx *eventfd);
#else
static inline struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
return NULL;
}
#endif /* CONFIG_MEMCG */
#endif /* __LINUX_VMPRESSURE_H */

View File

@@ -1,84 +0,0 @@
/* Copyright (c) 2015, The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
* only version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#undef TRACE_SYSTEM
#define TRACE_SYSTEM almk
#if !defined(_TRACE_EVENT_ALMK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_EVENT_ALMK_H
#include <linux/tracepoint.h>
#include <linux/types.h>
TRACE_EVENT(almk_vmpressure,
TP_PROTO(unsigned long pressure,
int other_free,
int other_file),
TP_ARGS(pressure, other_free, other_file),
TP_STRUCT__entry(
__field(unsigned long, pressure)
__field(int, other_free)
__field(int, other_file)
),
TP_fast_assign(
__entry->pressure = pressure;
__entry->other_free = other_free;
__entry->other_file = other_file;
),
TP_printk("%lu, %d, %d",
__entry->pressure, __entry->other_free,
__entry->other_file)
);
TRACE_EVENT(almk_shrink,
TP_PROTO(int tsize,
int vmp,
int other_free,
int other_file,
short adj),
TP_ARGS(tsize, vmp, other_free, other_file, adj),
TP_STRUCT__entry(
__field(int, tsize)
__field(int, vmp)
__field(int, other_free)
__field(int, other_file)
__field(short, adj)
),
TP_fast_assign(
__entry->tsize = tsize;
__entry->vmp = vmp;
__entry->other_free = other_free;
__entry->other_file = other_file;
__entry->adj = adj;
),
TP_printk("%d, %d, %d, %d, %d",
__entry->tsize,
__entry->vmp,
__entry->other_free,
__entry->other_file,
__entry->adj)
);
#endif
#include <trace/define_trace.h>

View File

@@ -17,24 +17,6 @@
extern __read_mostly int scheduler_running;
/*
* Convert user-nice values [ -20 ... 0 ... 19 ]
* to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
* and back.
*/
#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
/*
* 'User priority' is the nice value converted to something we
* can work with better when scaling various scheduler parameters,
* it's a [ 0 ... 39 ] range.
*/
#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
/*
* Helpers for converting nanosecond timing to jiffy resolution
*/

View File

@@ -17,8 +17,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
compaction.o balloon_compaction.o \
interval_tree.o $(mmu-y) \
showmem.o vmpressure.o
interval_tree.o showmem.o list_lru.o $(mmu-y)
obj-y += init-mm.o

152
mm/list_lru.c Normal file
View File

@@ -0,0 +1,152 @@
/*
* Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved.
* Authors: David Chinner and Glauber Costa
*
* Generic LRU infrastructure
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/list_lru.h>
#include <linux/slab.h>
bool list_lru_add(struct list_lru *lru, struct list_head *item)
{
int nid = page_to_nid(virt_to_page(item));
struct list_lru_node *nlru = &lru->node[nid];
spin_lock(&nlru->lock);
WARN_ON_ONCE(nlru->nr_items < 0);
if (list_empty(item)) {
list_add_tail(item, &nlru->list);
if (nlru->nr_items++ == 0)
node_set(nid, lru->active_nodes);
spin_unlock(&nlru->lock);
return true;
}
spin_unlock(&nlru->lock);
return false;
}
EXPORT_SYMBOL_GPL(list_lru_add);
bool list_lru_del(struct list_lru *lru, struct list_head *item)
{
int nid = page_to_nid(virt_to_page(item));
struct list_lru_node *nlru = &lru->node[nid];
spin_lock(&nlru->lock);
if (!list_empty(item)) {
list_del_init(item);
if (--nlru->nr_items == 0)
node_clear(nid, lru->active_nodes);
WARN_ON_ONCE(nlru->nr_items < 0);
spin_unlock(&nlru->lock);
return true;
}
spin_unlock(&nlru->lock);
return false;
}
EXPORT_SYMBOL_GPL(list_lru_del);
unsigned long
list_lru_count_node(struct list_lru *lru, int nid)
{
unsigned long count = 0;
struct list_lru_node *nlru = &lru->node[nid];
spin_lock(&nlru->lock);
WARN_ON_ONCE(nlru->nr_items < 0);
count += nlru->nr_items;
spin_unlock(&nlru->lock);
return count;
}
EXPORT_SYMBOL_GPL(list_lru_count_node);
unsigned long
list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate,
void *cb_arg, unsigned long *nr_to_walk)
{
struct list_lru_node *nlru = &lru->node[nid];
struct list_head *item, *n;
unsigned long isolated = 0;
spin_lock(&nlru->lock);
restart:
list_for_each_safe(item, n, &nlru->list) {
enum lru_status ret;
/*
* decrement nr_to_walk first so that we don't livelock if we
 * get stuck on large numbers of LRU_RETRY items
*/
if (!*nr_to_walk)
break;
--*nr_to_walk;
ret = isolate(item, &nlru->lock, cb_arg);
switch (ret) {
case LRU_REMOVED_RETRY:
assert_spin_locked(&nlru->lock);
case LRU_REMOVED:
if (--nlru->nr_items == 0)
node_clear(nid, lru->active_nodes);
WARN_ON_ONCE(nlru->nr_items < 0);
isolated++;
/*
* If the lru lock has been dropped, our list
* traversal is now invalid and so we have to
* restart from scratch.
*/
if (ret == LRU_REMOVED_RETRY)
goto restart;
break;
case LRU_ROTATE:
list_move_tail(item, &nlru->list);
break;
case LRU_SKIP:
break;
case LRU_RETRY:
/*
* The lru lock has been dropped, our list traversal is
* now invalid and so we have to restart from scratch.
*/
assert_spin_locked(&nlru->lock);
goto restart;
default:
BUG();
}
}
spin_unlock(&nlru->lock);
return isolated;
}
EXPORT_SYMBOL_GPL(list_lru_walk_node);
int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key)
{
int i;
size_t size = sizeof(*lru->node) * nr_node_ids;
lru->node = kzalloc(size, GFP_KERNEL);
if (!lru->node)
return -ENOMEM;
nodes_clear(lru->active_nodes);
for (i = 0; i < nr_node_ids; i++) {
spin_lock_init(&lru->node[i].lock);
if (key)
lockdep_set_class(&lru->node[i].lock, key);
INIT_LIST_HEAD(&lru->node[i].list);
lru->node[i].nr_items = 0;
}
return 0;
}
EXPORT_SYMBOL_GPL(list_lru_init_key);
void list_lru_destroy(struct list_lru *lru)
{
kfree(lru->node);
}
EXPORT_SYMBOL_GPL(list_lru_destroy);

View File

@@ -49,7 +49,6 @@
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
@@ -264,9 +263,6 @@ struct mem_cgroup {
*/
struct res_counter res;
/* vmpressure notifications */
struct vmpressure vmpressure;
union {
/*
* the counter to account for mem+swap usage.
@@ -366,7 +362,6 @@ struct mem_cgroup {
atomic_t numainfo_events;
atomic_t numainfo_updating;
#endif
/*
* Per cgroup active and inactive list, similar to the
* per zone LRU lists.
@@ -518,24 +513,6 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
return container_of(s, struct mem_cgroup, css);
}
/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
if (!memcg)
memcg = root_mem_cgroup;
return &memcg->vmpressure;
}
struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
{
return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}
struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
{
return &mem_cgroup_from_css(css)->vmpressure;
}
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
return (memcg == root_mem_cgroup);
@@ -6001,11 +5978,6 @@ static struct cftype mem_cgroup_files[] = {
.unregister_event = mem_cgroup_oom_unregister_event,
.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
},
{
.name = "pressure_level",
.register_event = vmpressure_register_event,
.unregister_event = vmpressure_unregister_event,
},
#ifdef CONFIG_NUMA
{
.name = "numa_stat",
@@ -6287,7 +6259,6 @@ mem_cgroup_css_alloc(struct cgroup *cont)
memcg->move_charge_at_immigrate = 0;
mutex_init(&memcg->thresholds_lock);
spin_lock_init(&memcg->move_lock);
vmpressure_init(&memcg->vmpressure);
return &memcg->css;

View File

@@ -1,502 +0,0 @@
/*
* Linux VM pressure
*
* Copyright 2012 Linaro Ltd.
* Anton Vorontsov <anton.vorontsov@linaro.org>
*
* Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro,
* Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation.
*/
#include <linux/cgroup.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/vmstat.h>
#include <linux/eventfd.h>
#include <linux/swap.h>
#include <linux/printk.h>
#include <linux/notifier.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/vmpressure.h>
/*
* The window size (vmpressure_win) is the number of scanned pages before
* we try to analyze scanned/reclaimed ratio. So the window is used as a
* rate-limit tunable for the "low" level notification, and also for
* averaging the ratio for medium/critical levels. Using small window
 * sizes can cause a lot of false positives, but too big a window size will
* delay the notifications.
*
* As the vmscan reclaimer logic works with chunks which are multiple of
* SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well.
*
* TODO: Make the window size depend on machine size, as we do for vmstat
* thresholds. Currently we set it to 512 pages (2MB for 4KB pages).
*/
static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
/*
* These thresholds are used when we account memory pressure through
* scanned/reclaimed ratio. The current values were chosen empirically. In
 * essence, they are percentages: the higher the value, the more
 * unsuccessful reclaims there were.
*/
static const unsigned int vmpressure_level_med = 60;
static const unsigned int vmpressure_level_critical = 95;
static unsigned long vmpressure_scale_max = 100;
module_param_named(vmpressure_scale_max, vmpressure_scale_max,
ulong, S_IRUGO | S_IWUSR);
/* vmpressure values >= this will be scaled based on allocstalls */
static unsigned long allocstall_threshold = 70;
module_param_named(allocstall_threshold, allocstall_threshold,
ulong, S_IRUGO | S_IWUSR);
static struct vmpressure global_vmpressure;
BLOCKING_NOTIFIER_HEAD(vmpressure_notifier);
int vmpressure_notifier_register(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&vmpressure_notifier, nb);
}
int vmpressure_notifier_unregister(struct notifier_block *nb)
{
return blocking_notifier_chain_unregister(&vmpressure_notifier, nb);
}
void vmpressure_notify(unsigned long pressure)
{
blocking_notifier_call_chain(&vmpressure_notifier, pressure, NULL);
}
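/*
 * Illustrative sketch, not part of this file: an in-kernel consumer of the
 * global notifier chain above. "demo_vmpressure_cb" and demo_vmpressure_nb
 * are made-up names; "action" carries the pressure value passed to
 * vmpressure_notify(), and registration would be done from the consumer's
 * init path with vmpressure_notifier_register(&demo_vmpressure_nb).
 */
static int demo_vmpressure_cb(struct notifier_block *nb,
			      unsigned long action, void *data)
{
	unsigned long pressure = action;

	pr_debug("vmpressure event: %lu\n", pressure);
	return NOTIFY_OK;
}

static struct notifier_block demo_vmpressure_nb = {
	.notifier_call = demo_vmpressure_cb,
};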
/*
 * When there are too few pages left to scan, vmpressure() may miss the
* critical pressure as number of pages will be less than "window size".
 * However, in that case the vmscan priority will rise quickly as the
 * reclaimer tries to scan LRUs more deeply.
*
* The vmscan logic considers these special priorities:
*
* prio == DEF_PRIORITY (12): reclaimer starts with that value
* prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed
* prio == 0 : close to OOM, kernel scans every page in an lru
*
* Any value in this range is acceptable for this tunable (i.e. from 12 to
* 0). Current value for the vmpressure_level_critical_prio is chosen
* empirically, but the number, in essence, means that we consider
* critical level when scanning depth is ~10% of the lru size (vmscan
* scans 'lru_size >> prio' pages, so it is actually 12.5%, or one
 * eighth).
*/
static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10);
static struct vmpressure *work_to_vmpressure(struct work_struct *work)
{
return container_of(work, struct vmpressure, work);
}
#ifdef CONFIG_MEMCG
static struct vmpressure *cg_to_vmpressure(struct cgroup *cg)
{
return css_to_vmpressure(cgroup_subsys_state(cg, mem_cgroup_subsys_id));
}
static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
{
struct cgroup *cg = vmpressure_to_css(vmpr)->cgroup;
struct mem_cgroup *memcg = mem_cgroup_from_cont(cg);
memcg = parent_mem_cgroup(memcg);
if (!memcg)
return NULL;
return memcg_to_vmpressure(memcg);
}
#else
static struct vmpressure *cg_to_vmpressure(struct cgroup *cg)
{
return NULL;
}
static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
{
return NULL;
}
#endif
enum vmpressure_levels {
VMPRESSURE_LOW = 0,
VMPRESSURE_MEDIUM,
VMPRESSURE_CRITICAL,
VMPRESSURE_NUM_LEVELS,
};
static const char * const vmpressure_str_levels[] = {
[VMPRESSURE_LOW] = "low",
[VMPRESSURE_MEDIUM] = "medium",
[VMPRESSURE_CRITICAL] = "critical",
};
static enum vmpressure_levels vmpressure_level(unsigned long pressure)
{
if (pressure >= vmpressure_level_critical)
return VMPRESSURE_CRITICAL;
else if (pressure >= vmpressure_level_med)
return VMPRESSURE_MEDIUM;
return VMPRESSURE_LOW;
}
static unsigned long vmpressure_calc_pressure(unsigned long scanned,
unsigned long reclaimed)
{
unsigned long scale = scanned + reclaimed;
unsigned long pressure = 0;
/*
* reclaimed can be greater than scanned in cases
* like THP, where the scanned is 1 and reclaimed
* could be 512
*/
if (reclaimed >= scanned)
goto out;
/*
* We calculate the ratio (in percents) of how many pages were
* scanned vs. reclaimed in a given time frame (window). Note that
* time is in VM reclaimer's "ticks", i.e. number of pages
* scanned. This makes it possible to set desired reaction time
* and serves as a ratelimit.
*/
pressure = scale - (reclaimed * scale / scanned);
pressure = pressure * 100 / scale;
out:
pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure,
scanned, reclaimed);
return pressure;
}
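/*
 * Illustrative worked example, not part of this file: with scanned == 400 and
 * reclaimed == 100, scale is 500, so pressure becomes
 * 500 - (100 * 500 / 400) = 375 and then 375 * 100 / 500 = 75, i.e. 75% of
 * the scanned pages were not reclaimed, which vmpressure_level() above maps
 * to the "medium" level.
 */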
static unsigned long vmpressure_account_stall(unsigned long pressure,
unsigned long stall, unsigned long scanned)
{
unsigned long scale;
if (pressure < allocstall_threshold)
return pressure;
scale = ((vmpressure_scale_max - pressure) * stall) / scanned;
return pressure + scale;
}
struct vmpressure_event {
struct eventfd_ctx *efd;
enum vmpressure_levels level;
struct list_head node;
};
static bool vmpressure_event(struct vmpressure *vmpr,
unsigned long scanned, unsigned long reclaimed)
{
struct vmpressure_event *ev;
enum vmpressure_levels level;
unsigned long pressure;
bool signalled = false;
pressure = vmpressure_calc_pressure(scanned, reclaimed);
level = vmpressure_level(pressure);
mutex_lock(&vmpr->events_lock);
list_for_each_entry(ev, &vmpr->events, node) {
if (level >= ev->level) {
eventfd_signal(ev->efd, 1);
signalled = true;
}
}
mutex_unlock(&vmpr->events_lock);
return signalled;
}
static void vmpressure_work_fn(struct work_struct *work)
{
struct vmpressure *vmpr = work_to_vmpressure(work);
unsigned long scanned;
unsigned long reclaimed;
/*
* Several contexts might be calling vmpressure(), so it is
* possible that the work was rescheduled again before the old
* work context cleared the counters. In that case we will run
* just after the old work returns, but then scanned might be zero
* here. No need for any locks here since we don't care if
* vmpr->reclaimed is in sync.
*/
if (!vmpr->scanned)
return;
mutex_lock(&vmpr->sr_lock);
scanned = vmpr->scanned;
reclaimed = vmpr->reclaimed;
vmpr->scanned = 0;
vmpr->reclaimed = 0;
mutex_unlock(&vmpr->sr_lock);
do {
if (vmpressure_event(vmpr, scanned, reclaimed))
break;
/*
* If not handled, propagate the event upward into the
* hierarchy.
*/
} while ((vmpr = vmpressure_parent(vmpr)));
}
void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg,
unsigned long scanned, unsigned long reclaimed)
{
struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
BUG_ON(!vmpr);
/*
* Here we only want to account pressure that userland is able to
* help us with. For example, suppose that DMA zone is under
* pressure; if we notify userland about that kind of pressure,
* then it will be mostly a waste as it will trigger unnecessary
* freeing of memory by userland (since userland is more likely to
* have HIGHMEM/MOVABLE pages instead of the DMA fallback). That
* is why we include only movable, highmem and FS/IO pages.
* Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so
* we account it too.
*/
if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
return;
/*
* If we got here with no pages scanned, then that is an indicator
* that reclaimer was unable to find any shrinkable LRUs at the
* current scanning depth. But it does not mean that we should
* report the critical pressure, yet. If the scanning priority
* (scanning depth) goes too high (deep), we will be notified
* through vmpressure_prio(). But so far, keep calm.
*/
if (!scanned)
return;
mutex_lock(&vmpr->sr_lock);
vmpr->scanned += scanned;
vmpr->reclaimed += reclaimed;
scanned = vmpr->scanned;
mutex_unlock(&vmpr->sr_lock);
if (scanned < vmpressure_win || work_pending(&vmpr->work))
return;
schedule_work(&vmpr->work);
}
void vmpressure_global(gfp_t gfp, unsigned long scanned,
unsigned long reclaimed)
{
struct vmpressure *vmpr = &global_vmpressure;
unsigned long pressure;
unsigned long stall;
if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
return;
if (!scanned)
return;
mutex_lock(&vmpr->sr_lock);
vmpr->scanned += scanned;
vmpr->reclaimed += reclaimed;
if (!current_is_kswapd())
vmpr->stall += scanned;
stall = vmpr->stall;
scanned = vmpr->scanned;
reclaimed = vmpr->reclaimed;
mutex_unlock(&vmpr->sr_lock);
if (scanned < vmpressure_win)
return;
mutex_lock(&vmpr->sr_lock);
vmpr->scanned = 0;
vmpr->reclaimed = 0;
vmpr->stall = 0;
mutex_unlock(&vmpr->sr_lock);
pressure = vmpressure_calc_pressure(scanned, reclaimed);
pressure = vmpressure_account_stall(pressure, stall, scanned);
vmpressure_notify(pressure);
}
/**
* vmpressure() - Account memory pressure through scanned/reclaimed ratio
* @gfp: reclaimer's gfp mask
* @memcg: cgroup memory controller handle
* @scanned: number of pages scanned
* @reclaimed: number of pages reclaimed
*
* This function should be called from the vmscan reclaim path to account
* "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
* pressure index is then further refined and averaged over time.
*
* This function does not return any value.
*/
void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
unsigned long scanned, unsigned long reclaimed)
{
if (!memcg)
vmpressure_global(gfp, scanned, reclaimed);
if (IS_ENABLED(CONFIG_MEMCG))
vmpressure_memcg(gfp, memcg, scanned, reclaimed);
}
/**
* vmpressure_prio() - Account memory pressure through reclaimer priority level
* @gfp: reclaimer's gfp mask
* @memcg: cgroup memory controller handle
* @prio: reclaimer's priority
*
* This function should be called from the reclaim path every time when
* the vmscan's reclaiming priority (scanning depth) changes.
*
* This function does not return any value.
*/
void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
{
/*
* We only use prio for accounting critical level. For more info
* see comment for vmpressure_level_critical_prio variable above.
*/
if (prio > vmpressure_level_critical_prio)
return;
/*
	 * OK, the prio is below the threshold; update the vmpressure
	 * information before the shrinker dives into a long round of
	 * vmscan shrinking. Passing scanned = vmpressure_win, reclaimed = 0
* to the vmpressure() basically means that we signal 'critical'
* level.
*/
vmpressure(gfp, memcg, vmpressure_win, 0);
}
/**
* vmpressure_register_event() - Bind vmpressure notifications to an eventfd
* @cg: cgroup that is interested in vmpressure notifications
* @cft: cgroup control files handle
* @eventfd: eventfd context to link notifications with
* @args: event arguments (used to set up a pressure level threshold)
*
* This function associates eventfd context with the vmpressure
* infrastructure, so that the notifications will be delivered to the
* @eventfd. The @args parameter is a string that denotes pressure level
* threshold (one of vmpressure_str_levels, i.e. "low", "medium", or
* "critical").
*
* This function should not be used directly, just pass it to (struct
* cftype).register_event, and then cgroup core will handle everything by
* itself.
*/
int vmpressure_register_event(struct cgroup *cg, struct cftype *cft,
struct eventfd_ctx *eventfd, const char *args)
{
struct vmpressure *vmpr = cg_to_vmpressure(cg);
struct vmpressure_event *ev;
int level;
BUG_ON(!vmpr);
for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) {
if (!strcmp(vmpressure_str_levels[level], args))
break;
}
if (level >= VMPRESSURE_NUM_LEVELS)
return -EINVAL;
ev = kzalloc(sizeof(*ev), GFP_KERNEL);
if (!ev)
return -ENOMEM;
ev->efd = eventfd;
ev->level = level;
mutex_lock(&vmpr->events_lock);
list_add(&ev->node, &vmpr->events);
mutex_unlock(&vmpr->events_lock);
return 0;
}
/**
* vmpressure_unregister_event() - Unbind eventfd from vmpressure
* @cg: cgroup handle
* @cft: cgroup control files handle
* @eventfd: eventfd context that was used to link vmpressure with the @cg
*
* This function does internal manipulations to detach the @eventfd from
* the vmpressure notifications, and then frees internal resources
* associated with the @eventfd (but the @eventfd itself is not freed).
*
* This function should not be used directly, just pass it to (struct
* cftype).unregister_event, and then cgroup core will handle everything
* by itself.
*/
void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft,
struct eventfd_ctx *eventfd)
{
struct vmpressure *vmpr = cg_to_vmpressure(cg);
struct vmpressure_event *ev;
if (!vmpr)
BUG();
mutex_lock(&vmpr->events_lock);
list_for_each_entry(ev, &vmpr->events, node) {
if (ev->efd != eventfd)
continue;
list_del(&ev->node);
kfree(ev);
break;
}
mutex_unlock(&vmpr->events_lock);
}
/**
* vmpressure_init() - Initialize vmpressure control structure
* @vmpr: Structure to be initialized
*
* This function should be called on every allocated vmpressure structure
* before any usage.
*/
void vmpressure_init(struct vmpressure *vmpr)
{
mutex_init(&vmpr->sr_lock);
mutex_init(&vmpr->events_lock);
INIT_LIST_HEAD(&vmpr->events);
INIT_WORK(&vmpr->work, vmpressure_work_fn);
}
int vmpressure_global_init(void)
{
vmpressure_init(&global_vmpressure);
return 0;
}
late_initcall(vmpressure_global_init);

View File

@@ -24,7 +24,6 @@
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/vmpressure.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
@@ -283,19 +282,24 @@ static inline int do_shrinker_shrink(struct shrinker *shrinker,
*
* Returns the number of slab objects which we shrunk.
*/
unsigned long shrink_slab(struct shrink_control *shrink,
unsigned long shrink_slab(struct shrink_control *shrinkctl,
unsigned long nr_pages_scanned,
unsigned long lru_pages)
{
struct shrinker *shrinker;
unsigned long ret = 0;
unsigned long freed = 0;
if (nr_pages_scanned == 0)
nr_pages_scanned = SWAP_CLUSTER_MAX;
if (!down_read_trylock(&shrinker_rwsem)) {
/* Assume we'll be able to shrink next time */
ret = 1;
/*
* If we would return 0, our callers would understand that we
* have nothing else to shrink and give up trying. By returning
* 1 we keep it going and assume we'll be able to shrink next
* time.
*/
freed = 1;
goto out;
}
@@ -303,7 +307,6 @@ unsigned long shrink_slab(struct shrink_control *shrink,
unsigned long long delta;
long total_scan;
long max_pass;
int shrink_ret = 0;
long nr;
long new_nr;
long batch_size = shrinker->batch ? shrinker->batch
@@ -313,8 +316,11 @@ unsigned long shrink_slab(struct shrink_control *shrink,
if (current_is_kswapd())
min_cache_size = 0;
max_pass = do_shrinker_shrink(shrinker, shrink, 0);
if (max_pass <= 0)
if (shrinker->count_objects)
max_pass = shrinker->count_objects(shrinker, shrinkctl);
else
max_pass = do_shrinker_shrink(shrinker, shrinkctl, 0);
if (max_pass == 0)
continue;
/*
@@ -330,8 +336,8 @@ unsigned long shrink_slab(struct shrink_control *shrink,
do_div(delta, lru_pages + 1);
total_scan += delta;
if (total_scan < 0) {
printk(KERN_ERR "shrink_slab: %pF negative objects to "
"delete nr=%ld\n",
printk(KERN_ERR
"shrink_slab: %pF negative objects to delete nr=%ld\n",
shrinker->shrink, total_scan);
total_scan = max_pass;
}
@@ -359,23 +365,36 @@ unsigned long shrink_slab(struct shrink_control *shrink,
if (total_scan > max_pass * 2)
total_scan = max_pass * 2;
trace_mm_shrink_slab_start(shrinker, shrink, nr,
trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
nr_pages_scanned, lru_pages,
max_pass, delta, total_scan);
while (total_scan > min_cache_size) {
int nr_before;
if (total_scan < batch_size)
batch_size = total_scan;
nr_before = do_shrinker_shrink(shrinker, shrink, 0);
shrink_ret = do_shrinker_shrink(shrinker, shrink,
batch_size);
if (shrink_ret == -1)
break;
if (shrink_ret < nr_before)
ret += nr_before - shrink_ret;
if (shrinker->scan_objects) {
unsigned long ret;
shrinkctl->nr_to_scan = batch_size;
ret = shrinker->scan_objects(shrinker, shrinkctl);
if (ret == SHRINK_STOP)
break;
freed += ret;
} else {
int nr_before;
long ret;
nr_before = do_shrinker_shrink(shrinker, shrinkctl, 0);
ret = do_shrinker_shrink(shrinker, shrinkctl,
batch_size);
if (ret == -1)
break;
if (ret < nr_before)
freed += nr_before - ret;
}
count_vm_events(SLABS_SCANNED, batch_size);
total_scan -= batch_size;
@@ -393,12 +412,12 @@ unsigned long shrink_slab(struct shrink_control *shrink,
else
new_nr = atomic_long_read(&shrinker->nr_in_batch);
trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
}
up_read(&shrinker_rwsem);
out:
cond_resched();
return ret;
return freed;
}
static inline int is_page_cache_freeable(struct page *page)
@@ -2312,11 +2331,6 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
}
memcg = mem_cgroup_iter(root, memcg, &reclaim);
} while (memcg);
vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
sc->nr_scanned - nr_scanned,
sc->nr_reclaimed - nr_reclaimed);
} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
sc->nr_scanned - nr_scanned, sc));
}
@@ -2497,8 +2511,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
count_vm_event(ALLOCSTALL);
do {
vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
sc->priority);
sc->nr_scanned = 0;
aborted_reclaim = shrink_zones(zonelist, sc);