uksm: Ported Ultra-KSM 0.1.0

Experimental port for the stock ICS semc-msm7x30 kernel.

This is an improvement upon KSM. Its features:
 1. Full system scan:
      It automatically scans all user processes' anonymous VMAs. Kernel-user
      interaction to submit a memory area to KSM is no longer needed.

 2. Rich area detection based on random sampling:
      It automatically detects rich areas, i.e. areas containing abundant
      duplicated pages, based on their randomly sampled history. Rich areas
      are scanned at full speed; poor areas are sampled at a reduced rate
      with very low CPU consumption (a rough sketch of this idea appears
      after this list).

 3. Per-page scan speed improvement:
      A new hash algorithm (random_sample_hash) is introduced. In most cases
      it is enough to distinguish pages by hashing part of their content
      rather than the whole page, and this algorithm adapts to that
      automatically. In the best case, a single 32-bit word per page is
      enough to produce a distinguishing hash value; in the worst case it is
      as fast as SuperFastHash (a second sketch after this list illustrates
      the idea).

 4. Thrashing area avoidance:
      A thrashing area (a VMA with frequent KSM page break-outs) can be
      filtered out. My benchmarks show this is more efficient than KSM's
      per-page, hash-value-based volatile page detection.

 5. Hash-value-based identical page detection:
      It no longer uses "memcmp"-based page detection.

 6. Misc changes upon KSM:
      * A fully x86-optimized memcmp dedicated to 4-byte-aligned page
        comparison; it's much faster than the default C version on x86.
      * rmap_item now has a struct page * member that loosely caches the
        address->page mapping, greatly reducing the number of costly
        follow_page() calls.
      * The VMA creation/exit paths are hooked so that Ultra-KSM is
        notified.
      * try_to_merge_two_pages() can now revert a pte if it fails, so no
        break_ksm() is needed in this case.
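
Below is a rough, hypothetical sketch of the rich-area idea from item 2:
classify an area by the duplication ratio of its randomly sampled pages and
pick a scan speed accordingly. The names, thresholds and field semantics are
illustrative only and are not taken from the uksm code (which keeps this
state in struct vma_slot and struct scan_rung, added in ksm.h below).

/*
 * Illustrative sketch only, not the uksm implementation: classify an
 * area from its randomly sampled history.  Thresholds are made up.
 */
#define RICH_RATIO	50	/* >= 50% of sampled pages merged => rich */
#define POOR_RATIO	5	/* <   5% of sampled pages merged => poor */

enum area_class { AREA_POOR, AREA_NORMAL, AREA_RICH };

static enum area_class classify_area(unsigned long sampled_pages,
				     unsigned long merged_pages)
{
	unsigned long ratio;

	if (!sampled_pages)
		return AREA_NORMAL;	/* no sampling history yet */

	ratio = merged_pages * 100 / sampled_pages;
	if (ratio >= RICH_RATIO)
		return AREA_RICH;	/* move to a full-speed rung */
	if (ratio < POOR_RATIO)
		return AREA_POOR;	/* keep it on a slow-sampling rung */
	return AREA_NORMAL;
}

A rich slot would then be moved to a rung with a larger pages_to_scan budget,
while a poor slot stays on a rung that only samples it occasionally.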

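A minimal sketch of the partial-content hashing idea from item 3 follows; it
is illustrative rather than the real random_sample_hash(): only a randomly
pre-selected subset of the page's 32-bit words is mixed into the hash, and
the caller can grow the sample size if partial hashes collide too often.
All names below are hypothetical.

#include <linux/types.h>
#include <linux/mm.h>		/* for PAGE_SIZE */

#define PAGE_WORDS	(PAGE_SIZE / sizeof(u32))

/*
 * Hash only 'sample_num' words of the page, chosen via 'random_index',
 * an assumed precomputed random permutation of 0..PAGE_WORDS-1.
 */
static u32 sample_hash(const u32 *page_words, const u16 *random_index,
		       unsigned long sample_num)
{
	u32 hash = 0x9e3779b9;	/* arbitrary seed */
	unsigned long i;

	for (i = 0; i < sample_num && i < PAGE_WORDS; i++) {
		hash += page_words[random_index[i]];
		hash += hash << 10;
		hash ^= hash >> 6;
	}
	/* final mixing, Jenkins one-at-a-time style */
	hash += hash << 3;
	hash ^= hash >> 11;
	hash += hash << 15;
	return hash;
}

With sample_num == 1 this corresponds to the best case described above (one
32-bit word per page); pushing sample_num towards PAGE_WORDS approaches the
cost of hashing the full page.
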
Signed-off-by: CTCaer <ctcaer@gmail.com>
Author: CTCaer
Date:   2013-05-30 04:05:03 +03:00
Commit: b8944a4e5f (parent 5eeb44b75c)
13 changed files with 5046 additions and 1359 deletions

include/linux/ksm.h

@ -9,72 +9,295 @@
#include <linux/bitops.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/sched.h>
#include <linux/vmstat.h>
struct stable_node;
struct mem_cgroup;
struct page *ksm_does_need_to_copy(struct page *page,
struct vm_area_struct *vma, unsigned long address);
#ifdef CONFIG_KSM
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
unsigned long end, int advice, unsigned long *vm_flags);
int __ksm_enter(struct mm_struct *mm);
void __ksm_exit(struct mm_struct *mm);
static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
return __ksm_enter(mm);
return 0;
}
static inline void ksm_exit(struct mm_struct *mm)
{
if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
__ksm_exit(mm);
}
/*
* A KSM page is one of those write-protected "shared pages" or "merged pages"
* which KSM maps into multiple mms, wherever identical anonymous page content
* is found in VM_MERGEABLE vmas. It's a PageAnon page, with NULL anon_vma.
* is found in VM_MERGEABLE vmas. It's a PageAnon page, pointing not to any
* anon_vma, but to that page's node of the stable tree.
*/
static inline int PageKsm(struct page *page)
{
return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
}
static inline struct stable_node *page_stable_node(struct page *page)
{
return PageKsm(page) ? page_rmapping(page) : NULL;
}
static inline void set_page_stable_node(struct page *page,
struct stable_node *stable_node)
{
page->mapping = (void *)stable_node +
(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
}
/* must be done before linked to mm */
extern inline void ksm_vma_add_new(struct vm_area_struct *vma);
extern void ksm_remove_vma(struct vm_area_struct *vma);
extern inline int unmerge_ksm_pages(struct vm_area_struct *vma,
unsigned long start, unsigned long end);
/*
* But we have to avoid the checking which page_add_anon_rmap() performs.
* When do_swap_page() first faults in from swap what used to be a KSM page,
* no problem, it will be assigned to this vma's anon_vma; but thereafter,
* it might be faulted into a different anon_vma (or perhaps to a different
* offset in the same anon_vma). do_swap_page() cannot do all the locking
* needed to reconstitute a cross-anon_vma KSM page: for now it has to make
* a copy, and leave remerging the pages to a later pass of ksmd.
*
* We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE,
* but what if the vma was unmerged while the page was swapped out?
*/
static inline void page_add_ksm_rmap(struct page *page)
static inline int ksm_might_need_to_copy(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
if (atomic_inc_and_test(&page->_mapcount)) {
page->mapping = (void *) (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
__inc_zone_page_state(page, NR_ANON_PAGES);
}
struct anon_vma *anon_vma = page_anon_vma(page);
return anon_vma &&
(anon_vma != vma->anon_vma ||
page->index != linear_page_index(vma, address));
}
int page_referenced_ksm(struct page *page,
struct mem_cgroup *memcg, unsigned long *vm_flags);
int try_to_unmap_ksm(struct page *page, enum ttu_flags flags);
int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
struct vm_area_struct *, unsigned long, void *), void *arg);
void ksm_migrate_page(struct page *newpage, struct page *oldpage);
/* Each rung of this ladder is a list of VMAs having a same scan ratio */
struct scan_rung {
struct list_head vma_list;
//spinlock_t vma_list_lock;
//struct semaphore sem;
struct list_head *current_scan;
unsigned int pages_to_scan;
unsigned char round_finished; /* rung is ready for the next round */
unsigned char busy_searched;
unsigned long fully_scanned_slots;
unsigned long scan_ratio;
unsigned long vma_num;
//unsigned long vma_finished;
unsigned long scan_turn;
};
struct vma_slot {
struct list_head ksm_list;
struct list_head slot_list;
unsigned long dedup_ratio;
unsigned long dedup_num;
int ksm_index; /* -1 if vma is not in inter-table,
positive otherwise */
unsigned long pages_scanned;
unsigned long last_scanned;
unsigned long pages_to_scan;
struct scan_rung *rung;
struct page **rmap_list_pool;
unsigned long *pool_counts;
unsigned long pool_size;
struct vm_area_struct *vma;
struct mm_struct *mm;
unsigned long ctime_j;
unsigned long pages;
unsigned char need_sort;
unsigned char need_rerand;
unsigned long slot_scanned; /* It's scanned in this round */
unsigned long fully_scanned; /* the above four to be merged to status bits */
unsigned long pages_cowed; /* pages cowed this round */
unsigned long pages_merged; /* pages merged this round */
/* used for dup vma pair */
struct radix_tree_root dup_tree;
};
/*
* A few notes about the KSM scanning process,
* to make it easier to understand the data structures below:
*
* In order to reduce excessive scanning, KSM sorts the memory pages by their
* contents into a data structure that holds pointers to the pages' locations.
*
* Since the contents of the pages may change at any moment, KSM cannot just
* insert the pages into a normal sorted tree and expect it to find anything.
* Therefore KSM uses two data structures - the stable and the unstable tree.
*
* The stable tree holds pointers to all the merged pages (ksm pages), sorted
* by their contents. Because each such page is write-protected, searching on
* this tree is fully assured to be working (except when pages are unmapped),
* and therefore this tree is called the stable tree.
*
* In addition to the stable tree, KSM uses a second data structure called the
* unstable tree: this tree holds pointers to pages which have been found to
* be "unchanged for a period of time". The unstable tree sorts these pages
* by their contents, but since they are not write-protected, KSM cannot rely
* upon the unstable tree to work correctly - the unstable tree is liable to
* be corrupted as its contents are modified, and so it is called unstable.
*
* KSM solves this problem by several techniques:
*
* 1) The unstable tree is flushed every time KSM completes scanning all
* memory areas, and then the tree is rebuilt again from the beginning.
* 2) KSM will only insert into the unstable tree, pages whose hash value
* has not changed since the previous scan of all memory areas.
* 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
* colors of the nodes and not on their contents, assuring that even when
* the tree gets "corrupted" it won't get out of balance, so scanning time
* remains the same (also, searching and inserting nodes in an rbtree uses
* the same algorithm, so we have no overhead when we flush and rebuild).
* 4) KSM never flushes the stable tree, which means that even if it were to
* take 10 attempts to find a page in the unstable tree, once it is found,
* it is secured in the stable tree. (When we scan a new page, we first
* compare it against the stable tree, and then against the unstable tree.)
*/
/**
* node of either the stable or unstable rbtree
*
*/
struct tree_node {
struct rb_node node; /* link in the main (un)stable rbtree */
struct rb_root sub_root; /* rb_root for sublevel collision rbtree */
u32 hash;
unsigned long count; /* how many sublevel tree nodes */
struct list_head all_list; /* all tree nodes in stable/unstable tree */
};
/**
* struct stable_node - node of the stable rbtree
* @node: rb node of this ksm page in the stable tree
* @hlist: hlist head of rmap_items using this ksm page
* @kpfn: page frame number of this ksm page
*/
struct stable_node {
struct rb_node node; /* link in sub-rbtree */
struct tree_node *tree_node; /* its tree_node root in the stable tree, NULL if it's in the hell list */
struct hlist_head hlist;
unsigned long kpfn;
u32 hash_max; /* if ==0 then it's not been calculated yet */
//struct vm_area_struct *old_vma;
struct list_head all_list; /* in a list for all stable nodes */
};
/**
* struct node_vma - group rmap_items linked in a same stable
* node together.
*/
struct node_vma {
union {
struct vma_slot *slot;
unsigned long key; /* slot is used as key sorted on hlist */
};
struct hlist_node hlist;
struct hlist_head rmap_hlist;
struct stable_node *head;
unsigned long last_update;
};
/**
* struct rmap_item - reverse mapping item for virtual addresses
* @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
* @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
* @mm: the memory structure this rmap_item is pointing into
* @address: the virtual address this rmap_item tracks (+ flags in low bits)
* @node: rb node of this rmap_item in the unstable tree
* @head: pointer to stable_node heading this list in the stable tree
* @hlist: link into hlist of rmap_items hanging off that stable_node
*/
struct rmap_item {
struct vma_slot *slot;
struct page *page;
unsigned long address; /* + low bits used for flags below */
/* Appended to (un)stable tree on which scan round */
unsigned long append_round;
/* Which rung scan turn it was last scanned */
//unsigned long last_scan;
unsigned long entry_index;
union {
struct {/* when in unstable tree */
struct rb_node node;
struct tree_node *tree_node;
u32 hash_max;
};
struct { /* when in stable tree */
struct node_vma *head;
struct hlist_node hlist;
struct anon_vma *anon_vma;
};
};
} __attribute__((aligned(4)));
struct rmap_list_entry {
union {
struct rmap_item *item;
unsigned long addr;
};
// lowest bit is used for is_addr tag
//unsigned char is_addr;
} __attribute__((aligned(4))); // 4 aligned to fit in to pages
//extern struct semaphore ksm_scan_sem;
#else /* !CONFIG_KSM */
static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
unsigned long end, int advice, unsigned long *vm_flags)
{
return 0;
}
static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
return 0;
}
static inline void ksm_exit(struct mm_struct *mm)
{
}
static inline int PageKsm(struct page *page)
{
return 0;
}
/* No stub required for page_add_ksm_rmap(page) */
#ifdef CONFIG_MMU
extern inline int unmerge_ksm_pages(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
return 0;
}
static inline int ksm_might_need_to_copy(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
return 0;
}
static inline int page_referenced_ksm(struct page *page,
struct mem_cgroup *memcg, unsigned long *vm_flags)
{
return 0;
}
static inline int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
{
return 0;
}
static inline int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page*,
struct vm_area_struct *, unsigned long, void *), void *arg)
{
return 0;
}
static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage)
{
}
#endif /* CONFIG_MMU */
#endif /* !CONFIG_KSM */
#endif
#endif /* __LINUX_KSM_H */

include/linux/mm_types.h

@ -186,6 +186,9 @@ struct vm_area_struct {
#ifdef CONFIG_NUMA
struct mempolicy *vm_policy; /* NUMA policy for the VMA */
#endif
#ifdef CONFIG_KSM
struct vma_slot *ksm_vma_slot;
#endif
};
struct core_thread {

include/linux/rmap.h

@ -26,6 +26,9 @@
*/
struct anon_vma {
spinlock_t lock; /* Serialize access to vma list */
#ifdef CONFIG_KSM
atomic_t ksm_refcount;
#endif
/*
* NOTE: the LSB of the head.next is set by
* mm_take_all_locks() _after_ taking the above lock. So the
@ -47,6 +50,27 @@ static inline struct anon_vma *page_anon_vma(struct page *page)
return page_rmapping(page);
}
#ifdef CONFIG_KSM
static inline void ksm_refcount_init(struct anon_vma *anon_vma)
{
atomic_set(&anon_vma->ksm_refcount, 0);
}
static inline int ksm_refcount(struct anon_vma *anon_vma)
{
return atomic_read(&anon_vma->ksm_refcount);
}
#else
static inline void ksm_refcount_init(struct anon_vma *anon_vma)
{
}
static inline int ksm_refcount(struct anon_vma *anon_vma)
{
return 0;
}
#endif /* CONFIG_KSM */
static inline void anon_vma_lock(struct vm_area_struct *vma)
{
struct anon_vma *anon_vma = vma->anon_vma;
@ -70,6 +94,7 @@ void __anon_vma_merge(struct vm_area_struct *, struct vm_area_struct *);
void anon_vma_unlink(struct vm_area_struct *);
void anon_vma_link(struct vm_area_struct *);
void __anon_vma_link(struct vm_area_struct *);
void anon_vma_free(struct anon_vma *);
/*
* rmap interfaces called when adding or removing pte of page
@ -89,6 +114,9 @@ static inline void page_dup_rmap(struct page *page)
*/
int page_referenced(struct page *, int is_locked,
struct mem_cgroup *cnt, unsigned long *vm_flags);
int page_referenced_one(struct page *, struct vm_area_struct *,
unsigned long address, unsigned int *mapcount, unsigned long *vm_flags);
enum ttu_flags {
TTU_UNMAP = 0, /* unmap mode */
TTU_MIGRATION = 1, /* migration mode */
@ -102,6 +130,9 @@ enum ttu_flags {
#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)
int try_to_unmap(struct page *, enum ttu_flags flags);
int try_to_unmap_one(struct page *, struct vm_area_struct *,
unsigned long address, enum ttu_flags flags);
/*
* Called from mm/filemap_xip.c to unmap empty zero page

kernel/fork.c

@ -317,9 +317,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
rb_link = &mm->mm_rb.rb_node;
rb_parent = NULL;
pprev = &mm->mmap;
retval = ksm_fork(mm, oldmm);
if (retval)
goto out;
for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
struct file *file;
@ -338,7 +335,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
goto fail_nomem;
charge = len;
}
tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
tmp = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
if (!tmp)
goto fail_nomem;
*tmp = *mpnt;
@ -387,7 +384,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
__vma_link_rb(mm, tmp, rb_link, rb_parent);
rb_link = &tmp->vm_rb.rb_right;
rb_parent = &tmp->vm_rb;
#ifdef CONFIG_KSM
ksm_vma_add_new(tmp);
#endif
mm->map_count++;
retval = copy_page_range(mm, oldmm, mpnt);
@ -525,7 +524,6 @@ void mmput(struct mm_struct *mm)
if (atomic_dec_and_test(&mm->mm_users)) {
exit_aio(mm);
ksm_exit(mm);
exit_mmap(mm);
set_mm_exe_file(mm, NULL);
if (!list_empty(&mm->mmlist)) {

mm/internal.h

@ -152,6 +152,7 @@ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
}
static inline void clear_page_mlock(struct page *page) { }
static inline void mlock_vma_page(struct page *page) { }
static inline void munlock_vma_page(struct page *page) { }
static inline void mlock_migrate_page(struct page *new, struct page *old) { }
#endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */

mm/ksm.c (5794 changed lines)

File diff suppressed because it is too large.

mm/madvise.c

@ -64,12 +64,6 @@ static long madvise_behavior(struct vm_area_struct * vma,
}
new_flags &= ~VM_DONTCOPY;
break;
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
error = ksm_madvise(vma, start, end, behavior, &new_flags);
if (error)
goto out;
break;
}
if (new_flags == vma->vm_flags) {

mm/memcontrol.c

@ -1747,7 +1747,7 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
* to go on to do_swap_page()'s pte_same() test, which should fail.
*/
if (!PageSwapCache(page))
return 0;
goto charge_cur_mm;
mem = try_get_mem_cgroup_from_swapcache(page);
if (!mem)
goto charge_cur_mm;

mm/memory.c

@ -1241,7 +1241,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
/*
/*
* Require read or write permissions.
* If FOLL_FORCE is set, we only require the "MAY" flags.
*/
@ -1963,8 +1963,13 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
memset(kaddr, 0, PAGE_SIZE);
kunmap_atomic(kaddr, KM_USER0);
flush_dcache_page(dst);
} else
} else {
copy_user_highpage(dst, src, va, vma);
#ifdef CONFIG_KSM
if (vma->ksm_vma_slot && PageKsm(src))
vma->ksm_vma_slot->pages_cowed++;
#endif
}
}
/*
@ -2497,10 +2502,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned int flags, pte_t orig_pte)
{
spinlock_t *ptl;
struct page *page;
struct page *page, *swapcache = NULL;
swp_entry_t entry;
pte_t pte;
struct mem_cgroup *ptr = NULL;
int exclusive = 0;
int ret = 0;
if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
@ -2548,6 +2554,18 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
lock_page(page);
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
if (ksm_might_need_to_copy(page, vma, address)) {
swapcache = page;
page = ksm_does_need_to_copy(page, vma, address);
if (unlikely(!page)) {
ret = VM_FAULT_OOM;
page = swapcache;
swapcache = NULL;
goto out_page;
}
}
if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
ret = VM_FAULT_OOM;
goto out_page;
@ -2584,6 +2602,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
flags &= ~FAULT_FLAG_WRITE;
ret |= VM_FAULT_WRITE;
exclusive = 1;
}
flush_icache_page(vma, page);
set_pte_at(mm, address, page_table, pte);
@ -2595,6 +2615,18 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
try_to_free_swap(page);
unlock_page(page);
if (swapcache) {
/*
* Hold the lock to avoid the swap entry to be reused
* until we take the PT lock for the pte_same() check
* (to avoid false positives from pte_same). For
* further safety release the lock after the swap_free
* so that the swap count won't change under a
* parallel locked swapcache.
*/
unlock_page(swapcache);
page_cache_release(swapcache);
}
if (flags & FAULT_FLAG_WRITE) {
ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
@ -2616,6 +2648,10 @@ out_page:
unlock_page(page);
out_release:
page_cache_release(page);
if (swapcache) {
unlock_page(swapcache);
page_cache_release(swapcache);
}
return ret;
}

mm/mmap.c

@ -29,6 +29,7 @@
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/perf_event.h>
#include <linux/ksm.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@ -64,7 +65,7 @@ static void unmap_region(struct mm_struct *mm,
* MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes
* w: (no) no w: (no) no w: (yes) yes w: (no) no
* x: (no) no x: (no) yes x: (no) yes x: (yes) yes
*
*
* MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes
* w: (no) no w: (no) no w: (copy) copy w: (no) no
* x: (no) no x: (no) yes x: (no) yes x: (yes) yes
@ -231,6 +232,9 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
removed_exe_file_vma(vma->vm_mm);
}
mpol_put(vma_policy(vma));
#ifdef CONFIG_KSM
ksm_remove_vma(vma);
#endif
kmem_cache_free(vm_area_cachep, vma);
return next;
}
@ -505,7 +509,19 @@ void vma_adjust(struct vm_area_struct *vma, unsigned long start,
long adjust_next = 0;
int remove_next = 0;
/*
* to avoid deadlock, ksm_remove_vma must be done before any spin_lock is
* acquired
*/
#ifdef CONFIG_KSM
ksm_remove_vma(vma);
#endif
if (next && !insert) {
#ifdef CONFIG_KSM
ksm_remove_vma(next);
#endif
if (end >= next->vm_end) {
/*
* vma expands, overlapping all the next, and
@ -585,10 +601,10 @@ again: remove_next = 1 + (end > next->vm_end);
if (adjust_next)
vma_prio_tree_remove(next, root);
}
vma->vm_start = start;
vma->vm_end = end;
vma->vm_pgoff = pgoff;
if (adjust_next) {
next->vm_start += adjust_next << PAGE_SHIFT;
next->vm_pgoff += adjust_next;
@ -641,10 +657,22 @@ again: remove_next = 1 + (end > next->vm_end);
*/
if (remove_next == 2) {
next = vma->vm_next;
#ifdef CONFIG_KSM
ksm_remove_vma(next);
#endif
goto again;
}
} else {
#ifdef CONFIG_KSM
if (next && !insert)
ksm_vma_add_new(next);
#endif
}
#ifdef CONFIG_KSM
ksm_vma_add_new(vma);
#endif
validate_mm(mm);
}
@ -1199,6 +1227,9 @@ munmap_back:
vma_link(mm, vma, prev, rb_link, rb_parent);
file = vma->vm_file;
#ifdef CONFIG_KSM
ksm_vma_add_new(vma);
#endif
/* Once vma denies write, undo our temporary denial count */
if (correct_wcount)
@ -1230,6 +1261,9 @@ unmap_and_free_vma:
unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
charged = 0;
free_vma:
#ifdef CONFIG_KSM
ksm_remove_vma(vma);
#endif
kmem_cache_free(vm_area_cachep, vma);
unacct_error:
if (charged)
@ -1305,7 +1339,7 @@ full_search:
addr = vma->vm_end;
}
}
#endif
#endif
void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
{
@ -1859,6 +1893,10 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
else
vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
#ifdef CONFIG_KSM
ksm_vma_add_new(new);
#endif
return 0;
}
@ -2053,6 +2091,9 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
vma->vm_flags = flags;
vma->vm_page_prot = vm_get_page_prot(flags);
vma_link(mm, vma, prev, rb_link, rb_parent);
#ifdef CONFIG_KSM
ksm_vma_add_new(vma);
#endif
out:
mm->total_vm += len >> PAGE_SHIFT;
if (flags & VM_LOCKED) {
@ -2075,6 +2116,12 @@ void exit_mmap(struct mm_struct *mm)
/* mm's last user has gone, and its about to be pulled down */
mmu_notifier_release(mm);
/*
* Taking write lock on mmap_sem does not harm others,
* but it's crucial for uksm to avoid races.
*/
down_write(&mm->mmap_sem);
if (mm->locked_vm) {
vma = mm->mmap;
while (vma) {
@ -2108,6 +2155,11 @@ void exit_mmap(struct mm_struct *mm)
while (vma)
vma = remove_vma(vma);
mm->mmap = NULL;
mm->mm_rb = RB_ROOT;
mm->mmap_cache = NULL;
up_write(&mm->mmap_sem);
BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
}
@ -2198,6 +2250,9 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
if (new_vma->vm_ops && new_vma->vm_ops->open)
new_vma->vm_ops->open(new_vma);
vma_link(mm, new_vma, prev, rb_link, rb_parent);
#ifdef CONFIG_KSM
ksm_vma_add_new(new_vma);
#endif
}
}
return new_vma;
@ -2297,6 +2352,10 @@ int install_special_mapping(struct mm_struct *mm,
perf_event_mmap(vma);
#ifdef CONFIG_KSM
ksm_vma_add_new(vma);
#endif
return 0;
}

mm/mremap.c

@ -191,8 +191,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
* pages recently unmapped. But leave vma->vm_flags as it was,
* so KSM can come around to merge on vma and new_vma afterwards.
*/
err = ksm_madvise(vma, old_addr, old_addr + old_len,
MADV_UNMERGEABLE, &vm_flags);
err = unmerge_ksm_pages(vma, old_addr, old_addr + old_len);
if (err)
return err;

mm/rmap.c (124 changed lines)

@ -55,6 +55,7 @@
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <asm/tlbflush.h>
@ -67,7 +68,7 @@ static inline struct anon_vma *anon_vma_alloc(void)
return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
}
static inline void anon_vma_free(struct anon_vma *anon_vma)
inline void anon_vma_free(struct anon_vma *anon_vma)
{
kmem_cache_free(anon_vma_cachep, anon_vma);
}
@ -171,7 +172,7 @@ void anon_vma_unlink(struct vm_area_struct *vma)
list_del(&vma->anon_vma_node);
/* We must garbage collect the anon_vma if it's empty */
empty = list_empty(&anon_vma->head);
empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma);
spin_unlock(&anon_vma->lock);
if (empty)
@ -184,6 +185,7 @@ static void anon_vma_ctor(void *data)
spin_lock_init(&anon_vma->lock);
INIT_LIST_HEAD(&anon_vma->head);
ksm_refcount_init(anon_vma);
}
void __init anon_vma_init(void)
@ -248,7 +250,13 @@ vma_address(struct page *page, struct vm_area_struct *vma)
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
if (PageAnon(page)) {
if (vma->anon_vma != page_anon_vma(page))
struct anon_vma *page__anon_vma = page_anon_vma(page);
/*
* Note: swapoff's unuse_vma() is more efficient with this
* check, and needs it to match anon_vma when KSM is active.
*/
if (!vma->anon_vma || !page__anon_vma ||
vma->anon_vma != page__anon_vma)
return -EFAULT;
} else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
if (!vma->vm_file ||
@ -336,21 +344,15 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
* Subfunctions of page_referenced: page_referenced_one called
* repeatedly from either page_referenced_anon or page_referenced_file.
*/
static int page_referenced_one(struct page *page,
struct vm_area_struct *vma,
unsigned int *mapcount,
unsigned long *vm_flags)
int page_referenced_one(struct page *page, struct vm_area_struct *vma,
unsigned long address, unsigned int *mapcount,
unsigned long *vm_flags)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
pte_t *pte;
spinlock_t *ptl;
int referenced = 0;
address = vma_address(page, vma);
if (address == -EFAULT)
goto out;
pte = page_check_address(page, mm, address, &ptl, 0);
if (!pte)
goto out;
@ -408,6 +410,9 @@ static int page_referenced_anon(struct page *page,
mapcount = page_mapcount(page);
list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
unsigned long address = vma_address(page, vma);
if (address == -EFAULT)
continue;
/*
* If we are reclaiming on behalf of a cgroup, skip
* counting on behalf of references from different
@ -415,7 +420,7 @@ static int page_referenced_anon(struct page *page,
*/
if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
continue;
referenced += page_referenced_one(page, vma,
referenced += page_referenced_one(page, vma, address,
&mapcount, vm_flags);
if (!mapcount)
break;
@ -473,6 +478,9 @@ static int page_referenced_file(struct page *page,
mapcount = page_mapcount(page);
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
unsigned long address = vma_address(page, vma);
if (address == -EFAULT)
continue;
/*
* If we are reclaiming on behalf of a cgroup, skip
* counting on behalf of references from different
@ -480,7 +488,7 @@ static int page_referenced_file(struct page *page,
*/
if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
continue;
referenced += page_referenced_one(page, vma,
referenced += page_referenced_one(page, vma, address,
&mapcount, vm_flags);
if (!mapcount)
break;
@ -506,46 +514,44 @@ int page_referenced(struct page *page,
unsigned long *vm_flags)
{
int referenced = 0;
if (TestClearPageReferenced(page))
referenced++;
int we_locked = 0;
*vm_flags = 0;
if (page_mapped(page) && page_rmapping(page)) {
if (PageAnon(page))
if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
we_locked = trylock_page(page);
if (!we_locked) {
referenced++;
goto out;
}
}
if (unlikely(PageKsm(page)))
referenced += page_referenced_ksm(page, mem_cont,
vm_flags);
else if (PageAnon(page))
referenced += page_referenced_anon(page, mem_cont,
vm_flags);
else if (is_locked)
else if (page->mapping)
referenced += page_referenced_file(page, mem_cont,
vm_flags);
else if (!trylock_page(page))
referenced++;
else {
if (page->mapping)
referenced += page_referenced_file(page,
mem_cont, vm_flags);
if (we_locked)
unlock_page(page);
}
}
out:
if (page_test_and_clear_young(page))
referenced++;
return referenced;
}
static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
pte_t *pte;
spinlock_t *ptl;
int ret = 0;
address = vma_address(page, vma);
if (address == -EFAULT)
goto out;
pte = page_check_address(page, mm, address, &ptl, 1);
if (!pte)
goto out;
@ -577,8 +583,12 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page)
spin_lock(&mapping->i_mmap_lock);
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
if (vma->vm_flags & VM_SHARED)
ret += page_mkclean_one(page, vma);
if (vma->vm_flags & VM_SHARED) {
unsigned long address = vma_address(page, vma);
if (address == -EFAULT)
continue;
ret += page_mkclean_one(page, vma, address);
}
}
spin_unlock(&mapping->i_mmap_lock);
return ret;
@ -619,14 +629,7 @@ static void __page_set_anon_rmap(struct page *page,
BUG_ON(!anon_vma);
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
page->mapping = (struct address_space *) anon_vma;
page->index = linear_page_index(vma, address);
/*
* nr_mapped state can be updated without turning off
* interrupts because it is not modified via interrupt.
*/
__inc_zone_page_state(page, NR_ANON_PAGES);
}
/**
@ -669,9 +672,15 @@ static void __page_check_anon_rmap(struct page *page,
void page_add_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
int first = atomic_inc_and_test(&page->_mapcount);
if (first)
__inc_zone_page_state(page, NR_ANON_PAGES);
if (unlikely(PageKsm(page)))
return;
VM_BUG_ON(!PageLocked(page));
VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
if (atomic_inc_and_test(&page->_mapcount))
if (first)
__page_set_anon_rmap(page, vma, address);
else
__page_check_anon_rmap(page, vma, address);
@ -693,6 +702,7 @@ void page_add_new_anon_rmap(struct page *page,
VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
SetPageSwapBacked(page);
atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
__inc_zone_page_state(page, NR_ANON_PAGES);
__page_set_anon_rmap(page, vma, address);
if (page_evictable(page, vma))
lru_cache_add_lru(page, LRU_ACTIVE_ANON);
@ -759,20 +769,15 @@ void page_remove_rmap(struct page *page)
* Subfunctions of try_to_unmap: try_to_unmap_one called
* repeatedly from either try_to_unmap_anon or try_to_unmap_file.
*/
static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
enum ttu_flags flags)
int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
unsigned long address, enum ttu_flags flags)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
pte_t *pte;
pte_t pteval;
spinlock_t *ptl;
int ret = SWAP_AGAIN;
address = vma_address(page, vma);
if (address == -EFAULT)
goto out;
pte = page_check_address(page, mm, address, &ptl, 0);
if (!pte)
goto out;
@ -1031,7 +1036,10 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
continue; /* must visit all unlocked vmas */
ret = SWAP_MLOCK; /* saw at least one mlocked vma */
} else {
ret = try_to_unmap_one(page, vma, flags);
unsigned long address = vma_address(page, vma);
if (address == -EFAULT)
continue;
ret = try_to_unmap_one(page, vma, address, flags);
if (ret == SWAP_FAIL || !page_mapped(page))
break;
}
@ -1092,7 +1100,10 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
continue; /* must visit all vmas */
ret = SWAP_MLOCK;
} else {
ret = try_to_unmap_one(page, vma, flags);
unsigned long address = vma_address(page, vma);
if (address == -EFAULT)
continue;
ret = try_to_unmap_one(page, vma, address, flags);
if (ret == SWAP_FAIL || !page_mapped(page))
goto out;
}
@ -1209,7 +1220,9 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
BUG_ON(!PageLocked(page));
if (PageAnon(page))
if (unlikely(PageKsm(page)))
ret = try_to_unmap_ksm(page, flags);
else if (PageAnon(page))
ret = try_to_unmap_anon(page, flags);
else
ret = try_to_unmap_file(page, flags);
@ -1230,13 +1243,16 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
*
* SWAP_SUCCESS - no vma's holding page mlocked.
* SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem
* SWAP_FAIL - page cannot be located at present
* SWAP_MLOCK - page is now mlocked.
*/
int try_to_munlock(struct page *page)
{
VM_BUG_ON(!PageLocked(page) || PageLRU(page));
if (PageAnon(page))
if (unlikely(PageKsm(page)))
return try_to_unmap_ksm(page, TTU_MUNLOCK);
else if (PageAnon(page))
return try_to_unmap_anon(page, TTU_MUNLOCK);
else
return try_to_unmap_file(page, TTU_MUNLOCK);

mm/swapfile.c

@ -29,6 +29,7 @@
#include <linux/capability.h>
#include <linux/syscalls.h>
#include <linux/memcontrol.h>
#include <linux/ksm.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
@ -663,6 +664,8 @@ int reuse_swap_page(struct page *page)
int count;
VM_BUG_ON(!PageLocked(page));
if (unlikely(PageKsm(page)))
return 0;
count = page_mapcount(page);
if (count <= 1 && PageSwapCache(page)) {
count += page_swapcount(page);
@ -671,7 +674,7 @@ int reuse_swap_page(struct page *page)
SetPageDirty(page);
}
}
return count == 1;
return count <= 1;
}
/*
@ -1217,7 +1220,13 @@ static int try_to_unuse(unsigned int type)
* read from disk into another page. Splitting into two
* pages would be incorrect if swap supported "shared
* private" pages, but they are handled by tmpfs files.
*/
*
* Given how unuse_vma() targets one particular offset
* in an anon_vma, once the anon_vma has been determined,
* this splitting happens to be just what is needed to
* handle where KSM pages have been swapped out: re-reading
* is unnecessarily slow, but we can fix that later on.
*/
if (swap_count(*swap_map) &&
PageDirty(page) && PageSwapCache(page)) {
struct writeback_control wbc = {