author     antman666   2021-07-06 17:15:37 +0800
committer  antman666   2021-07-06 17:15:37 +0800
commit     8782ea33567efa1b67c9fd54ca51bdefe596f89a
tree       63d19e5daab3fe80230ea602f38a86f2dcaaf677
parent     7f8e8c158ef86353e2613129a89f5681f6093f34
download   aur-8782ea33567efa1b67c9fd54ca51bdefe596f89a.tar.gz
remove useless patches
-rw-r--r--  .SRCINFO         |   10
-rw-r--r--  0008-UKSM.patch  | 6970
-rw-r--r--  0009-bbr2.patch  | 3347
-rw-r--r--  0010-btrfs.patch | 2157
-rw-r--r--  PKGBUILD         |    7

5 files changed, 11 insertions(+), 12480 deletions(-)
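The .SRCINFO hunk below is the substance of this commit: the three bundled patch files are dropped in favor of "name::url" source entries (plus two new block/bfq patches), so makepkg now downloads each patch from the kernel-patches GitLab tree instead of carrying a copy in the AUR repo. In a PKGBUILD, a source entry of the form "localname::url" tells makepkg to fetch the URL and save it under localname, and each b2sums entry pairs positionally with its source. A minimal sketch of the mechanism follows; the prepare() loop and source directory name are hypothetical illustrations, not this package's actual PKGBUILD:

# Hypothetical PKGBUILD fragment illustrating "localname::url" sources.
# makepkg fetches each URL and stores it under the name left of "::",
# so later steps can refer to a patch by a stable local file name.
source=('0008-UKSM.patch::https://gitlab.com/sirlucjan/kernel-patches/-/raw/master/5.12/uksm-patches/0001-UKSM-for-5.12.patch'
        '0009-bbr2.patch::https://gitlab.com/sirlucjan/kernel-patches/-/raw/master/5.12/bbr2-patches-v2/0001-bbr2-5.12-introduce-BBRv2.patch')
# b2sums pairs index-for-index with source; after editing source,
# regenerate the sums with "updpkgsums" or "makepkg -g". "SKIP" is a
# placeholder here; the real sums are recorded in the .SRCINFO below.
b2sums=('SKIP'
        'SKIP')

prepare() {
  cd "$srcdir/linux-5.12"    # illustrative source directory name
  local src
  for src in "${source[@]}"; do
    src="${src%%::*}"        # keep the local name, drop the URL part
    src="${src##*/}"         # basename, for entries without a rename
    [[ $src == *.patch ]] || continue
    patch -Np1 -i "../$src"  # apply each downloaded patch in order
  done
}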
@@ -24,9 +24,11 @@ pkgbase = linux-ck-uksm
 	source = 0005-x86-setup-always-reserve-the-first-1M-of-RAM.patch
 	source = 0006-x86-setup-remove-CONFIG_X86_RESERVE_LOW-and-reservel.patch
 	source = 0007-x86-crash-remove-crash_reserve_low_1M.patch
-	source = 0008-UKSM.patch
-	source = 0009-bbr2.patch
-	source = 0010-btrfs.patch
+	source = 0008-UKSM.patch::https://gitlab.com/sirlucjan/kernel-patches/-/raw/master/5.12/uksm-patches/0001-UKSM-for-5.12.patch
+	source = 0009-bbr2.patch::https://gitlab.com/sirlucjan/kernel-patches/-/raw/master/5.12/bbr2-patches-v2/0001-bbr2-5.12-introduce-BBRv2.patch
+	source = 0010-btrfs.patch::https://gitlab.com/sirlucjan/kernel-patches/-/raw/master/5.12/btrfs-patches-v13/0001-btrfs-patches.patch
+	source = 0011-block.patch::https://gitlab.com/sirlucjan/kernel-patches/-/raw/master/5.12/block-patches-v6/0001-block-patches.patch
+	source = 0012-bfq.patch::https://gitlab.com/sirlucjan/kernel-patches/-/raw/master/5.12/bfq-patches-v15/0001-bfq-patches.patch
 	validpgpkeys = ABAF11C65A2970B130ABE3C479BE3E4300411886
 	validpgpkeys = 647F28654894E3BD457199BE38DBBDC86092693E
 	b2sums = 3bc213b432d61c358f85b932dec8bd44a1ef73442f20424ad5ce374b6982a6909c5b318d5e9848996989d5e421ab6c2128cdb51a3724adc95222f96a859486a1
@@ -44,6 +46,8 @@ pkgbase = linux-ck-uksm
 	b2sums = 14f45171afc3b15488b40a05e58b352c5057da3a5782e13527392f7750d8e45a8db54f9b50b218fedb8bf679de3b4e5d78e230a44f7b1aa482f7b3aa831bd641
 	b2sums = 0c5f2e21e27aee6c8d8eaa07daa111ff2687756413f8a909cf03acc8f836367c6b27050966f9b7bf1521ad11b84fe94fb42d70c33693c80a674ef223cf2cfc00
 	b2sums = 705a8f2037eef3afdd0f2a7648cc8d00bfc03112385b44a8907182812b6aed075519a9236909c0e3ba09df887381dd76cb01c601e0df05119136f7318587a416
+	b2sums = 67067d624711d663c1be1d35c5e59cb588faba1769b27443a3a13b44dbe9e627edd054a4fd122d04d587e21b25be5520fffb61cfc7538aee77c33a1a8cb1b97a
+	b2sums = 9aba508592818a4b4f000fc1bd471ec74687c8f0f972f330e851bd2364eaf30cff4d5012f843625ca025bc2478a2c76e0d082d43f33358ab18ce829fab4f0c2b
 
 pkgname = linux-ck-uksm
 	pkgdesc = The Linux-ck-uksm kernel and modules with the ck1 and uksm patchesset featuring MuQSS CPU scheduler
diff --git a/0008-UKSM.patch b/0008-UKSM.patch
deleted file mode 100644
index 3321eaa8ee58..000000000000
--- a/0008-UKSM.patch
+++ /dev/null
@@ -1,6970 +0,0 @@
-From 9a42006b641bc8e0c333174a9bf269ac9450d521 Mon Sep 17 00:00:00 2001
-From: Piotr Gorski <lucjan.lucjanov@gmail.com>
-Date: Tue, 13 Apr 2021 16:27:12 +0200
-Subject: [PATCH] UKSM for 5.12
-
-Signed-off-by: Piotr Gorski <lucjan.lucjanov@gmail.com>
---
- Documentation/vm/uksm.txt   |   61 +
- fs/exec.c                   |    1 +
- fs/proc/meminfo.c           |    4 +
- include/linux/ksm.h         |   43 +-
- include/linux/mm_types.h    |    3 +
- include/linux/mmzone.h      |    3 +
- include/linux/pgtable.h     |   17 +-
- include/linux/sradix-tree.h |   77 +
- include/linux/uksm.h        |  149 +
- kernel/fork.c               |    2 +-
- lib/Makefile                |    2 +-
- lib/sradix-tree.c           |  476 +++
- mm/Kconfig                  |   26 +
- mm/Makefile                 |    3 +-
- mm/ksm.c                    |   11 -
- mm/memory.c                 |   33 +-
- mm/mmap.c                   |   37 +
- mm/uksm.c                   | 5614 +++++++++++++++++++++++++++++++++++
- mm/vmstat.c                 |    3 +
- 19 files changed, 6539 insertions(+), 26 deletions(-)
- create mode 100644 Documentation/vm/uksm.txt
- create mode 100644 include/linux/sradix-tree.h
- create mode 100644 include/linux/uksm.h
- create mode 100644 lib/sradix-tree.c
- create mode 100644 mm/uksm.c
-
-diff --git a/Documentation/vm/uksm.txt b/Documentation/vm/uksm.txt
-new file mode 100644
-index 000000000..be19a3127
---- /dev/null
-+++ b/Documentation/vm/uksm.txt
-@@ -0,0 +1,61 @@
-+The Ultra Kernel Samepage Merging feature
-+----------------------------------------------
-+/*
-+ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
-+ *
-+ * This is an improvement upon KSM. Some basic data structures and routines
-+ * are borrowed from ksm.c .
-+ *
-+ * Its new features:
-+ * 1. Full system scan:
-+ *      It automatically scans all user processes' anonymous VMAs. Kernel-user
-+ *      interaction to submit a memory area to KSM is no longer needed.
-+ *
-+ * 2. Rich area detection:
-+ *      It automatically detects rich areas containing abundant duplicated
-+ *      pages based. Rich areas are given a full scan speed. Poor areas are
-+ *      sampled at a reasonable speed with very low CPU consumption.
-+ *
-+ * 3. Ultra Per-page scan speed improvement:
-+ *      A new hash algorithm is proposed. As a result, on a machine with
-+ *      Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it
-+ *      can scan memory areas that does not contain duplicated pages at speed of
-+ *      627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of
-+ *      477MB/sec ~ 923MB/sec.
-+ *
-+ * 4. Thrashing area avoidance:
-+ *      Thrashing area(an VMA that has frequent Ksm page break-out) can be
-+ *      filtered out. My benchmark shows it's more efficient than KSM's per-page
-+ *      hash value based volatile page detection.
-+ *
-+ *
-+ * 5. Misc changes upon KSM:
-+ *      * It has a fully x86-opitmized memcmp dedicated for 4-byte-aligned page
-+ *        comparison. It's much faster than default C version on x86.
-+ *      * rmap_item now has an struct *page member to loosely cache a
-+ *        address-->page mapping, which reduces too much time-costly
-+ *        follow_page().
-+ *      * The VMA creation/exit procedures are hooked to let the Ultra KSM know.
-+ *      * try_to_merge_two_pages() now can revert a pte if it fails. No break_
-+ *        ksm is needed for this case.
-+ *
-+ * 6. Full Zero Page consideration(contributed by Figo Zhang)
-+ *    Now uksmd consider full zero pages as special pages and merge them to an
-+ *    special unswappable uksm zero page.
-+ */
-+
-+ChangeLog:
-+
-+2012-05-05 The creation of this Doc
-+2012-05-08 UKSM 0.1.1.1 libc crash bug fix, api clean up, doc clean up.
-+2012-05-28 UKSM 0.1.1.2 bug fix release
-+2012-06-26 UKSM 0.1.2-beta1 first beta release for 0.1.2
-+2012-07-2 UKSM 0.1.2-beta2
-+2012-07-10 UKSM 0.1.2-beta3
-+2012-07-26 UKSM 0.1.2 Fine grained speed control, more scan optimization.
-+2012-10-13 UKSM 0.1.2.1 Bug fixes.
-+2012-12-31 UKSM 0.1.2.2 Minor bug fixes.
-+2014-07-02 UKSM 0.1.2.3 Fix a " __this_cpu_read() in preemptible bug".
-+2015-04-22 UKSM 0.1.2.4 Fix a race condition that can sometimes trigger anonying warnings.
-+2016-09-10 UKSM 0.1.2.5 Fix a bug in dedup ratio calculation.
-+2017-02-26 UKSM 0.1.2.6 Fix a bug in hugetlbpage handling and a race bug with page migration.
-diff --git a/fs/exec.c b/fs/exec.c -index 18594f11c..aee636fd4 100644 ---- a/fs/exec.c -+++ b/fs/exec.c -@@ -65,6 +65,7 @@ - #include <linux/vmalloc.h> - #include <linux/io_uring.h> - #include <linux/syscall_user_dispatch.h> -+#include <linux/ksm.h> - - #include <linux/uaccess.h> - #include <asm/mmu_context.h> -diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c -index 6fa761c9c..45fd59a0d 100644 ---- a/fs/proc/meminfo.c -+++ b/fs/proc/meminfo.c -@@ -108,6 +108,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v) - #endif - show_val_kb(m, "PageTables: ", - global_node_page_state(NR_PAGETABLE)); -+#ifdef CONFIG_UKSM -+ show_val_kb(m, "KsmZeroPages: ", -+ global_zone_page_state(NR_UKSM_ZERO_PAGES)); -+#endif - - show_val_kb(m, "NFS_Unstable: ", 0); - show_val_kb(m, "Bounce: ", -diff --git a/include/linux/ksm.h b/include/linux/ksm.h -index 161e8164a..f0dbdf3c9 100644 ---- a/include/linux/ksm.h -+++ b/include/linux/ksm.h -@@ -21,20 +21,16 @@ struct mem_cgroup; - #ifdef CONFIG_KSM - int ksm_madvise(struct vm_area_struct *vma, unsigned long start, - unsigned long end, int advice, unsigned long *vm_flags); --int __ksm_enter(struct mm_struct *mm); --void __ksm_exit(struct mm_struct *mm); - --static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) -+static inline struct stable_node *page_stable_node(struct page *page) - { -- if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) -- return __ksm_enter(mm); -- return 0; -+ return PageKsm(page) ? page_rmapping(page) : NULL; - } - --static inline void ksm_exit(struct mm_struct *mm) -+static inline void set_page_stable_node(struct page *page, -+ struct stable_node *stable_node) - { -- if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) -- __ksm_exit(mm); -+ page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); - } - - /* -@@ -54,6 +50,33 @@ struct page *ksm_might_need_to_copy(struct page *page, - void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); - void ksm_migrate_page(struct page *newpage, struct page *oldpage); - -+#ifdef CONFIG_KSM_LEGACY -+int __ksm_enter(struct mm_struct *mm); -+void __ksm_exit(struct mm_struct *mm); -+static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) -+{ -+ if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) -+ return __ksm_enter(mm); -+ return 0; -+} -+ -+static inline void ksm_exit(struct mm_struct *mm) -+{ -+ if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) -+ __ksm_exit(mm); -+} -+ -+#elif defined(CONFIG_UKSM) -+static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) -+{ -+ return 0; -+} -+ -+static inline void ksm_exit(struct mm_struct *mm) -+{ -+} -+#endif /* !CONFIG_UKSM */ -+ - #else /* !CONFIG_KSM */ - - static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) -@@ -89,4 +112,6 @@ static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage) - #endif /* CONFIG_MMU */ - #endif /* !CONFIG_KSM */ - -+#include <linux/uksm.h> -+ - #endif /* __LINUX_KSM_H */ -diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h -index 6613b26a8..82e18e41b 100644 ---- a/include/linux/mm_types.h -+++ b/include/linux/mm_types.h -@@ -370,6 +370,9 @@ struct vm_area_struct { - struct mempolicy *vm_policy; /* NUMA policy for the VMA */ - #endif - struct vm_userfaultfd_ctx vm_userfaultfd_ctx; -+#ifdef CONFIG_UKSM -+ struct vma_slot *uksm_vma_slot; -+#endif - } __randomize_layout; - - struct core_thread { -diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h -index 47946cec7..a6ce64844 100644 ---- 
a/include/linux/mmzone.h -+++ b/include/linux/mmzone.h -@@ -157,6 +157,9 @@ enum zone_stat_item { - NR_ZSPAGES, /* allocated in zsmalloc */ - #endif - NR_FREE_CMA_PAGES, -+#ifdef CONFIG_UKSM -+ NR_UKSM_ZERO_PAGES, -+#endif - NR_VM_ZONE_STAT_ITEMS }; - - enum node_stat_item { -diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h -index 5e772392a..9d733540d 100644 ---- a/include/linux/pgtable.h -+++ b/include/linux/pgtable.h -@@ -1111,12 +1111,25 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, - extern void untrack_pfn_moved(struct vm_area_struct *vma); - #endif - -+#ifdef CONFIG_UKSM -+static inline int is_uksm_zero_pfn(unsigned long pfn) -+{ -+ extern unsigned long uksm_zero_pfn; -+ return pfn == uksm_zero_pfn; -+} -+#else -+static inline int is_uksm_zero_pfn(unsigned long pfn) -+{ -+ return 0; -+} -+#endif -+ - #ifdef __HAVE_COLOR_ZERO_PAGE - static inline int is_zero_pfn(unsigned long pfn) - { - extern unsigned long zero_pfn; - unsigned long offset_from_zero_pfn = pfn - zero_pfn; -- return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT); -+ return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT) || is_uksm_zero_pfn(pfn); - } - - #define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr)) -@@ -1125,7 +1138,7 @@ static inline int is_zero_pfn(unsigned long pfn) - static inline int is_zero_pfn(unsigned long pfn) - { - extern unsigned long zero_pfn; -- return pfn == zero_pfn; -+ return (pfn == zero_pfn) || (is_uksm_zero_pfn(pfn)); - } - - static inline unsigned long my_zero_pfn(unsigned long addr) -diff --git a/include/linux/sradix-tree.h b/include/linux/sradix-tree.h -new file mode 100644 -index 000000000..d71edba6b ---- /dev/null -+++ b/include/linux/sradix-tree.h -@@ -0,0 +1,77 @@ -+#ifndef _LINUX_SRADIX_TREE_H -+#define _LINUX_SRADIX_TREE_H -+ -+ -+#define INIT_SRADIX_TREE(root, mask) \ -+do { \ -+ (root)->height = 0; \ -+ (root)->gfp_mask = (mask); \ -+ (root)->rnode = NULL; \ -+} while (0) -+ -+#define ULONG_BITS (sizeof(unsigned long) * 8) -+#define SRADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) -+//#define SRADIX_TREE_MAP_SHIFT 6 -+//#define SRADIX_TREE_MAP_SIZE (1UL << SRADIX_TREE_MAP_SHIFT) -+//#define SRADIX_TREE_MAP_MASK (SRADIX_TREE_MAP_SIZE-1) -+ -+struct sradix_tree_node { -+ unsigned int height; /* Height from the bottom */ -+ unsigned int count; -+ unsigned int fulls; /* Number of full sublevel trees */ -+ struct sradix_tree_node *parent; -+ void *stores[0]; -+}; -+ -+/* A simple radix tree implementation */ -+struct sradix_tree_root { -+ unsigned int height; -+ struct sradix_tree_node *rnode; -+ -+ /* Where found to have available empty stores in its sublevels */ -+ struct sradix_tree_node *enter_node; -+ unsigned int shift; -+ unsigned int stores_size; -+ unsigned int mask; -+ unsigned long min; /* The first hole index */ -+ unsigned long num; -+ //unsigned long *height_to_maxindex; -+ -+ /* How the node is allocated and freed. 
*/ -+ struct sradix_tree_node *(*alloc)(void); -+ void (*free)(struct sradix_tree_node *node); -+ -+ /* When a new node is added and removed */ -+ void (*extend)(struct sradix_tree_node *parent, struct sradix_tree_node *child); -+ void (*assign)(struct sradix_tree_node *node, unsigned int index, void *item); -+ void (*rm)(struct sradix_tree_node *node, unsigned int offset); -+}; -+ -+struct sradix_tree_path { -+ struct sradix_tree_node *node; -+ int offset; -+}; -+ -+static inline -+void init_sradix_tree_root(struct sradix_tree_root *root, unsigned long shift) -+{ -+ root->height = 0; -+ root->rnode = NULL; -+ root->shift = shift; -+ root->stores_size = 1UL << shift; -+ root->mask = root->stores_size - 1; -+} -+ -+ -+extern void *sradix_tree_next(struct sradix_tree_root *root, -+ struct sradix_tree_node *node, unsigned long index, -+ int (*iter)(void *, unsigned long)); -+ -+extern int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num); -+ -+extern void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, -+ struct sradix_tree_node *node, unsigned long index); -+ -+extern void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index); -+ -+#endif /* _LINUX_SRADIX_TREE_H */ -diff --git a/include/linux/uksm.h b/include/linux/uksm.h -new file mode 100644 -index 000000000..bb8651f53 ---- /dev/null -+++ b/include/linux/uksm.h -@@ -0,0 +1,149 @@ -+#ifndef __LINUX_UKSM_H -+#define __LINUX_UKSM_H -+/* -+ * Memory merging support. -+ * -+ * This code enables dynamic sharing of identical pages found in different -+ * memory areas, even if they are not shared by fork(). -+ */ -+ -+/* if !CONFIG_UKSM this file should not be compiled at all. */ -+#ifdef CONFIG_UKSM -+ -+#include <linux/bitops.h> -+#include <linux/mm.h> -+#include <linux/pagemap.h> -+#include <linux/rmap.h> -+#include <linux/sched.h> -+ -+extern unsigned long zero_pfn __read_mostly; -+extern unsigned long uksm_zero_pfn __read_mostly; -+extern struct page *empty_uksm_zero_page; -+ -+/* must be done before linked to mm */ -+extern void uksm_vma_add_new(struct vm_area_struct *vma); -+extern void uksm_remove_vma(struct vm_area_struct *vma); -+ -+#define UKSM_SLOT_NEED_SORT (1 << 0) -+#define UKSM_SLOT_NEED_RERAND (1 << 1) -+#define UKSM_SLOT_SCANNED (1 << 2) /* It's scanned in this round */ -+#define UKSM_SLOT_FUL_SCANNED (1 << 3) -+#define UKSM_SLOT_IN_UKSM (1 << 4) -+ -+struct vma_slot { -+ struct sradix_tree_node *snode; -+ unsigned long sindex; -+ -+ struct list_head slot_list; -+ unsigned long fully_scanned_round; -+ unsigned long dedup_num; -+ unsigned long pages_scanned; -+ unsigned long this_sampled; -+ unsigned long last_scanned; -+ unsigned long pages_to_scan; -+ struct scan_rung *rung; -+ struct page **rmap_list_pool; -+ unsigned int *pool_counts; -+ unsigned long pool_size; -+ struct vm_area_struct *vma; -+ struct mm_struct *mm; -+ unsigned long ctime_j; -+ unsigned long pages; -+ unsigned long flags; -+ unsigned long pages_cowed; /* pages cowed this round */ -+ unsigned long pages_merged; /* pages merged this round */ -+ unsigned long pages_bemerged; -+ -+ /* when it has page merged in this eval round */ -+ struct list_head dedup_list; -+}; -+ -+static inline void uksm_unmap_zero_page(pte_t pte) -+{ -+ if (pte_pfn(pte) == uksm_zero_pfn) -+ __dec_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES); -+} -+ -+static inline void uksm_map_zero_page(pte_t pte) -+{ -+ if (pte_pfn(pte) == uksm_zero_pfn) -+ __inc_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES); -+} -+ -+static 
inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page) -+{ -+ if (vma->uksm_vma_slot && PageKsm(page)) -+ vma->uksm_vma_slot->pages_cowed++; -+} -+ -+static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte) -+{ -+ if (vma->uksm_vma_slot && pte_pfn(pte) == uksm_zero_pfn) -+ vma->uksm_vma_slot->pages_cowed++; -+} -+ -+static inline int uksm_flags_can_scan(unsigned long vm_flags) -+{ -+#ifdef VM_SAO -+ if (vm_flags & VM_SAO) -+ return 0; -+#endif -+ -+ return !(vm_flags & (VM_PFNMAP | VM_IO | VM_DONTEXPAND | -+ VM_HUGETLB | VM_MIXEDMAP | VM_SHARED -+ | VM_MAYSHARE | VM_GROWSUP | VM_GROWSDOWN)); -+} -+ -+static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p) -+{ -+ if (uksm_flags_can_scan(*vm_flags_p)) -+ *vm_flags_p |= VM_MERGEABLE; -+} -+ -+/* -+ * Just a wrapper for BUG_ON for where ksm_zeropage must not be. TODO: it will -+ * be removed when uksm zero page patch is stable enough. -+ */ -+static inline void uksm_bugon_zeropage(pte_t pte) -+{ -+ BUG_ON(pte_pfn(pte) == uksm_zero_pfn); -+} -+#else -+static inline void uksm_vma_add_new(struct vm_area_struct *vma) -+{ -+} -+ -+static inline void uksm_remove_vma(struct vm_area_struct *vma) -+{ -+} -+ -+static inline void uksm_unmap_zero_page(pte_t pte) -+{ -+} -+ -+static inline void uksm_map_zero_page(pte_t pte) -+{ -+} -+ -+static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page) -+{ -+} -+ -+static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte) -+{ -+} -+ -+static inline int uksm_flags_can_scan(unsigned long vm_flags) -+{ -+ return 0; -+} -+ -+static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p) -+{ -+} -+ -+static inline void uksm_bugon_zeropage(pte_t pte) -+{ -+} -+#endif /* !CONFIG_UKSM */ -+#endif /* __LINUX_UKSM_H */ -diff --git a/kernel/fork.c b/kernel/fork.c -index 426cd0c51..5fd356ca7 100644 ---- a/kernel/fork.c -+++ b/kernel/fork.c -@@ -588,7 +588,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, - __vma_link_rb(mm, tmp, rb_link, rb_parent); - rb_link = &tmp->vm_rb.rb_right; - rb_parent = &tmp->vm_rb; -- -+ uksm_vma_add_new(tmp); - mm->map_count++; - if (!(tmp->vm_flags & VM_WIPEONFORK)) - retval = copy_page_range(tmp, mpnt); -diff --git a/lib/Makefile b/lib/Makefile -index b5307d3ee..480b099e1 100644 ---- a/lib/Makefile -+++ b/lib/Makefile -@@ -28,7 +28,7 @@ CFLAGS_string.o += -fno-stack-protector - endif - - lib-y := ctype.o string.o vsprintf.o cmdline.o \ -- rbtree.o radix-tree.o timerqueue.o xarray.o \ -+ rbtree.o radix-tree.o sradix-tree.o timerqueue.o xarray.o \ - idr.o extable.o sha1.o irq_regs.o argv_split.o \ - flex_proportions.o ratelimit.o show_mem.o \ - is_single_threaded.o plist.o decompress.o kobject_uevent.o \ -diff --git a/lib/sradix-tree.c b/lib/sradix-tree.c -new file mode 100644 -index 000000000..ab21e6309 ---- /dev/null -+++ b/lib/sradix-tree.c -@@ -0,0 +1,476 @@ -+#include <linux/errno.h> -+#include <linux/mm.h> -+#include <linux/mman.h> -+#include <linux/spinlock.h> -+#include <linux/slab.h> -+#include <linux/gcd.h> -+#include <linux/sradix-tree.h> -+ -+static inline int sradix_node_full(struct sradix_tree_root *root, struct sradix_tree_node *node) -+{ -+ return node->fulls == root->stores_size || -+ (node->height == 1 && node->count == root->stores_size); -+} -+ -+/* -+ * Extend a sradix tree so it can store key @index. 
-+ */ -+static int sradix_tree_extend(struct sradix_tree_root *root, unsigned long index) -+{ -+ struct sradix_tree_node *node; -+ unsigned int height; -+ -+ if (unlikely(root->rnode == NULL)) { -+ if (!(node = root->alloc())) -+ return -ENOMEM; -+ -+ node->height = 1; -+ root->rnode = node; -+ root->height = 1; -+ } -+ -+ /* Figure out what the height should be. */ -+ height = root->height; -+ index >>= root->shift * height; -+ -+ while (index) { -+ index >>= root->shift; -+ height++; -+ } -+ -+ while (height > root->height) { -+ unsigned int newheight; -+ -+ if (!(node = root->alloc())) -+ return -ENOMEM; -+ -+ /* Increase the height. */ -+ node->stores[0] = root->rnode; -+ root->rnode->parent = node; -+ if (root->extend) -+ root->extend(node, root->rnode); -+ -+ newheight = root->height + 1; -+ node->height = newheight; -+ node->count = 1; -+ if (sradix_node_full(root, root->rnode)) -+ node->fulls = 1; -+ -+ root->rnode = node; -+ root->height = newheight; -+ } -+ -+ return 0; -+} -+ -+/* -+ * Search the next item from the current node, that is not NULL -+ * and can satify root->iter(). -+ */ -+void *sradix_tree_next(struct sradix_tree_root *root, -+ struct sradix_tree_node *node, unsigned long index, -+ int (*iter)(void *item, unsigned long height)) -+{ -+ unsigned long offset; -+ void *item; -+ -+ if (unlikely(node == NULL)) { -+ node = root->rnode; -+ for (offset = 0; offset < root->stores_size; offset++) { -+ item = node->stores[offset]; -+ if (item && (!iter || iter(item, node->height))) -+ break; -+ } -+ -+ if (unlikely(offset >= root->stores_size)) -+ return NULL; -+ -+ if (node->height == 1) -+ return item; -+ else -+ goto go_down; -+ } -+ -+ while (node) { -+ offset = (index & root->mask) + 1; -+ for (; offset < root->stores_size; offset++) { -+ item = node->stores[offset]; -+ if (item && (!iter || iter(item, node->height))) -+ break; -+ } -+ -+ if (offset < root->stores_size) -+ break; -+ -+ node = node->parent; -+ index >>= root->shift; -+ } -+ -+ if (!node) -+ return NULL; -+ -+ while (node->height > 1) { -+go_down: -+ node = item; -+ for (offset = 0; offset < root->stores_size; offset++) { -+ item = node->stores[offset]; -+ if (item && (!iter || iter(item, node->height))) -+ break; -+ } -+ -+ if (unlikely(offset >= root->stores_size)) -+ return NULL; -+ } -+ -+ BUG_ON(offset > root->stores_size); -+ -+ return item; -+} -+ -+/* -+ * Blindly insert the item to the tree. Typically, we reuse the -+ * first empty store item. 
-+ */ -+int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num) -+{ -+ unsigned long index; -+ unsigned int height; -+ struct sradix_tree_node *node, *tmp = NULL; -+ int offset, offset_saved; -+ void **store = NULL; -+ int error, i, j, shift; -+ -+go_on: -+ index = root->min; -+ -+ if (root->enter_node && !sradix_node_full(root, root->enter_node)) { -+ node = root->enter_node; -+ BUG_ON((index >> (root->shift * root->height))); -+ } else { -+ node = root->rnode; -+ if (node == NULL || (index >> (root->shift * root->height)) -+ || sradix_node_full(root, node)) { -+ error = sradix_tree_extend(root, index); -+ if (error) -+ return error; -+ -+ node = root->rnode; -+ } -+ } -+ -+ -+ height = node->height; -+ shift = (height - 1) * root->shift; -+ offset = (index >> shift) & root->mask; -+ while (shift > 0) { -+ offset_saved = offset; -+ for (; offset < root->stores_size; offset++) { -+ store = &node->stores[offset]; -+ tmp = *store; -+ -+ if (!tmp || !sradix_node_full(root, tmp)) -+ break; -+ } -+ BUG_ON(offset >= root->stores_size); -+ -+ if (offset != offset_saved) { -+ index += (offset - offset_saved) << shift; -+ index &= ~((1UL << shift) - 1); -+ } -+ -+ if (!tmp) { -+ if (!(tmp = root->alloc())) -+ return -ENOMEM; -+ -+ tmp->height = shift / root->shift; -+ *store = tmp; -+ tmp->parent = node; -+ node->count++; -+// if (root->extend) -+// root->extend(node, tmp); -+ } -+ -+ node = tmp; -+ shift -= root->shift; -+ offset = (index >> shift) & root->mask; -+ } -+ -+ BUG_ON(node->height != 1); -+ -+ -+ store = &node->stores[offset]; -+ for (i = 0, j = 0; -+ j < root->stores_size - node->count && -+ i < root->stores_size - offset && j < num; i++) { -+ if (!store[i]) { -+ store[i] = item[j]; -+ if (root->assign) -+ root->assign(node, index + i, item[j]); -+ j++; -+ } -+ } -+ -+ node->count += j; -+ root->num += j; -+ num -= j; -+ -+ while (sradix_node_full(root, node)) { -+ node = node->parent; -+ if (!node) -+ break; -+ -+ node->fulls++; -+ } -+ -+ if (unlikely(!node)) { -+ /* All nodes are full */ -+ root->min = 1 << (root->height * root->shift); -+ root->enter_node = NULL; -+ } else { -+ root->min = index + i - 1; -+ root->min |= (1UL << (node->height - 1)) - 1; -+ root->min++; -+ root->enter_node = node; -+ } -+ -+ if (num) { -+ item += j; -+ goto go_on; -+ } -+ -+ return 0; -+} -+ -+ -+/** -+ * sradix_tree_shrink - shrink height of a sradix tree to minimal -+ * @root sradix tree root -+ * -+ */ -+static inline void sradix_tree_shrink(struct sradix_tree_root *root) -+{ -+ /* try to shrink tree height */ -+ while (root->height > 1) { -+ struct sradix_tree_node *to_free = root->rnode; -+ -+ /* -+ * The candidate node has more than one child, or its child -+ * is not at the leftmost store, we cannot shrink. 
-+ */ -+ if (to_free->count != 1 || !to_free->stores[0]) -+ break; -+ -+ root->rnode = to_free->stores[0]; -+ root->rnode->parent = NULL; -+ root->height--; -+ if (unlikely(root->enter_node == to_free)) -+ root->enter_node = NULL; -+ root->free(to_free); -+ } -+} -+ -+/* -+ * Del the item on the known leaf node and index -+ */ -+void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, -+ struct sradix_tree_node *node, unsigned long index) -+{ -+ unsigned int offset; -+ struct sradix_tree_node *start, *end; -+ -+ BUG_ON(node->height != 1); -+ -+ start = node; -+ while (node && !(--node->count)) -+ node = node->parent; -+ -+ end = node; -+ if (!node) { -+ root->rnode = NULL; -+ root->height = 0; -+ root->min = 0; -+ root->num = 0; -+ root->enter_node = NULL; -+ } else { -+ offset = (index >> (root->shift * (node->height - 1))) & root->mask; -+ if (root->rm) -+ root->rm(node, offset); -+ node->stores[offset] = NULL; -+ root->num--; -+ if (root->min > index) { -+ root->min = index; -+ root->enter_node = node; -+ } -+ } -+ -+ if (start != end) { -+ do { -+ node = start; -+ start = start->parent; -+ if (unlikely(root->enter_node == node)) -+ root->enter_node = end; -+ root->free(node); -+ } while (start != end); -+ -+ /* -+ * Note that shrink may free "end", so enter_node still need to -+ * be checked inside. -+ */ -+ sradix_tree_shrink(root); -+ } else if (node->count == root->stores_size - 1) { -+ /* It WAS a full leaf node. Update the ancestors */ -+ node = node->parent; -+ while (node) { -+ node->fulls--; -+ if (node->fulls != root->stores_size - 1) -+ break; -+ -+ node = node->parent; -+ } -+ } -+} -+ -+void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index) -+{ -+ unsigned int height, offset; -+ struct sradix_tree_node *node; -+ int shift; -+ -+ node = root->rnode; -+ if (node == NULL || (index >> (root->shift * root->height))) -+ return NULL; -+ -+ height = root->height; -+ shift = (height - 1) * root->shift; -+ -+ do { -+ offset = (index >> shift) & root->mask; -+ node = node->stores[offset]; -+ if (!node) -+ return NULL; -+ -+ shift -= root->shift; -+ } while (shift >= 0); -+ -+ return node; -+} -+ -+/* -+ * Return the item if it exists, otherwise create it in place -+ * and return the created item. -+ */ -+void *sradix_tree_lookup_create(struct sradix_tree_root *root, -+ unsigned long index, void *(*item_alloc)(void)) -+{ -+ unsigned int height, offset; -+ struct sradix_tree_node *node, *tmp; -+ void *item; -+ int shift, error; -+ -+ if (root->rnode == NULL || (index >> (root->shift * root->height))) { -+ if (item_alloc) { -+ error = sradix_tree_extend(root, index); -+ if (error) -+ return NULL; -+ } else { -+ return NULL; -+ } -+ } -+ -+ node = root->rnode; -+ height = root->height; -+ shift = (height - 1) * root->shift; -+ -+ do { -+ offset = (index >> shift) & root->mask; -+ if (!node->stores[offset]) { -+ if (!(tmp = root->alloc())) -+ return NULL; -+ -+ tmp->height = shift / root->shift; -+ node->stores[offset] = tmp; -+ tmp->parent = node; -+ node->count++; -+ node = tmp; -+ } else { -+ node = node->stores[offset]; -+ } -+ -+ shift -= root->shift; -+ } while (shift > 0); -+ -+ BUG_ON(node->height != 1); -+ offset = index & root->mask; -+ if (node->stores[offset]) { -+ return node->stores[offset]; -+ } else if (item_alloc) { -+ if (!(item = item_alloc())) -+ return NULL; -+ -+ node->stores[offset] = item; -+ -+ /* -+ * NOTE: we do NOT call root->assign here, since this item is -+ * newly created by us having no meaning. 
Caller can call this -+ * if it's necessary to do so. -+ */ -+ -+ node->count++; -+ root->num++; -+ -+ while (sradix_node_full(root, node)) { -+ node = node->parent; -+ if (!node) -+ break; -+ -+ node->fulls++; -+ } -+ -+ if (unlikely(!node)) { -+ /* All nodes are full */ -+ root->min = 1 << (root->height * root->shift); -+ } else { -+ if (root->min == index) { -+ root->min |= (1UL << (node->height - 1)) - 1; -+ root->min++; -+ root->enter_node = node; -+ } -+ } -+ -+ return item; -+ } else { -+ return NULL; -+ } -+ -+} -+ -+int sradix_tree_delete(struct sradix_tree_root *root, unsigned long index) -+{ -+ unsigned int height, offset; -+ struct sradix_tree_node *node; -+ int shift; -+ -+ node = root->rnode; -+ if (node == NULL || (index >> (root->shift * root->height))) -+ return -ENOENT; -+ -+ height = root->height; -+ shift = (height - 1) * root->shift; -+ -+ do { -+ offset = (index >> shift) & root->mask; -+ node = node->stores[offset]; -+ if (!node) -+ return -ENOENT; -+ -+ shift -= root->shift; -+ } while (shift > 0); -+ -+ offset = index & root->mask; -+ if (!node->stores[offset]) -+ return -ENOENT; -+ -+ sradix_tree_delete_from_leaf(root, node, index); -+ -+ return 0; -+} -diff --git a/mm/Kconfig b/mm/Kconfig -index 24c045b24..3ce98ecc2 100644 ---- a/mm/Kconfig -+++ b/mm/Kconfig -@@ -317,6 +317,32 @@ config KSM - See Documentation/vm/ksm.rst for more information: KSM is inactive - until a program has madvised that an area is MADV_MERGEABLE, and - root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set). -+choice -+ prompt "Choose UKSM/KSM strategy" -+ default UKSM -+ depends on KSM -+ help -+ This option allows to select a UKSM/KSM stragety. -+ -+config UKSM -+ bool "Ultra-KSM for page merging" -+ depends on KSM -+ help -+ UKSM is inspired by the Linux kernel project \u2014 KSM(Kernel Same -+ page Merging), but with a fundamentally rewritten core algorithm. With -+ an advanced algorithm, UKSM now can transparently scans all anonymously -+ mapped user space applications with an significantly improved scan speed -+ and CPU efficiency. Since KVM is friendly to KSM, KVM can also benefit from -+ UKSM. Now UKSM has its first stable release and first real world enterprise user. -+ For more information, please goto its project page. -+ (github.com/dolohow/uksm) -+ -+config KSM_LEGACY -+ bool "Legacy KSM implementation" -+ depends on KSM -+ help -+ The legacy KSM implementation from Red Hat. -+endchoice - - config DEFAULT_MMAP_MIN_ADDR - int "Low address space to protect from user allocation" -diff --git a/mm/Makefile b/mm/Makefile -index 72227b24a..fd50a3a51 100644 ---- a/mm/Makefile -+++ b/mm/Makefile -@@ -76,7 +76,8 @@ obj-$(CONFIG_SPARSEMEM) += sparse.o - obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o - obj-$(CONFIG_SLOB) += slob.o - obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o --obj-$(CONFIG_KSM) += ksm.o -+obj-$(CONFIG_KSM_LEGACY) += ksm.o -+obj-$(CONFIG_UKSM) += uksm.o - obj-$(CONFIG_PAGE_POISONING) += page_poison.o - obj-$(CONFIG_SLAB) += slab.o - obj-$(CONFIG_SLUB) += slub.o -diff --git a/mm/ksm.c b/mm/ksm.c -index 9694ee2c7..63af6a528 100644 ---- a/mm/ksm.c -+++ b/mm/ksm.c -@@ -858,17 +858,6 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma, - return err; - } - --static inline struct stable_node *page_stable_node(struct page *page) --{ -- return PageKsm(page) ? 
page_rmapping(page) : NULL; --} -- --static inline void set_page_stable_node(struct page *page, -- struct stable_node *stable_node) --{ -- page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); --} -- - #ifdef CONFIG_SYSFS - /* - * Only called through the sysfs control interface: -diff --git a/mm/memory.c b/mm/memory.c -index 550405fc3..b4005b195 100644 ---- a/mm/memory.c -+++ b/mm/memory.c -@@ -158,6 +158,25 @@ EXPORT_SYMBOL(zero_pfn); - - unsigned long highest_memmap_pfn __read_mostly; - -+#ifdef CONFIG_UKSM -+unsigned long uksm_zero_pfn __read_mostly; -+EXPORT_SYMBOL_GPL(uksm_zero_pfn); -+struct page *empty_uksm_zero_page; -+ -+static int __init setup_uksm_zero_page(void) -+{ -+ empty_uksm_zero_page = alloc_pages(__GFP_ZERO & ~__GFP_MOVABLE, 0); -+ if (!empty_uksm_zero_page) -+ panic("Oh boy, that early out of memory?"); -+ -+ SetPageReserved(empty_uksm_zero_page); -+ uksm_zero_pfn = page_to_pfn(empty_uksm_zero_page); -+ -+ return 0; -+} -+core_initcall(setup_uksm_zero_page); -+#endif -+ - /* - * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() - */ -@@ -173,6 +192,7 @@ void mm_trace_rss_stat(struct mm_struct *mm, int member, long count) - trace_rss_stat(mm, member, count); - } - -+ - #if defined(SPLIT_RSS_COUNTING) - - void sync_mm_rss(struct mm_struct *mm) -@@ -875,6 +895,11 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, - get_page(page); - page_dup_rmap(page, false); - rss[mm_counter(page)]++; -+ -+ /* Should return NULL in vm_normal_page() */ -+ uksm_bugon_zeropage(pte); -+ } else { -+ uksm_map_zero_page(pte); - } - - /* -@@ -1254,8 +1279,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, - ptent = ptep_get_and_clear_full(mm, addr, pte, - tlb->fullmm); - tlb_remove_tlb_entry(tlb, pte, addr); -- if (unlikely(!page)) -+ if (unlikely(!page)) { -+ uksm_unmap_zero_page(ptent); - continue; -+ } - - if (!PageAnon(page)) { - if (pte_dirty(ptent)) { -@@ -2603,6 +2630,7 @@ static inline bool cow_user_page(struct page *dst, struct page *src, - - if (likely(src)) { - copy_user_highpage(dst, src, addr, vma); -+ uksm_cow_page(vma, src); - return true; - } - -@@ -2849,6 +2877,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) - vmf->address); - if (!new_page) - goto oom; -+ uksm_cow_pte(vma, vmf->orig_pte); - } else { - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, - vmf->address); -@@ -2891,7 +2920,9 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) - mm_counter_file(old_page)); - inc_mm_counter_fast(mm, MM_ANONPAGES); - } -+ uksm_bugon_zeropage(vmf->orig_pte); - } else { -+ uksm_unmap_zero_page(vmf->orig_pte); - inc_mm_counter_fast(mm, MM_ANONPAGES); - } - flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); -diff --git a/mm/mmap.c b/mm/mmap.c -index 3f287599a..dc719db43 100644 ---- a/mm/mmap.c -+++ b/mm/mmap.c -@@ -46,6 +46,7 @@ - #include <linux/moduleparam.h> - #include <linux/pkeys.h> - #include <linux/oom.h> -+#include <linux/ksm.h> - #include <linux/sched/mm.h> - - #include <linux/uaccess.h> -@@ -181,6 +182,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) - if (vma->vm_file) - fput(vma->vm_file); - mpol_put(vma_policy(vma)); -+ uksm_remove_vma(vma); - vm_area_free(vma); - return next; - } -@@ -748,9 +750,16 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, - long adjust_next = 0; - int remove_next = 0; - -+/* -+ * to avoid deadlock, ksm_remove_vma must be done before any spin_lock is -+ * acquired -+ */ -+ uksm_remove_vma(vma); -+ - if 
(next && !insert) { - struct vm_area_struct *exporter = NULL, *importer = NULL; - -+ uksm_remove_vma(next); - if (end >= next->vm_end) { - /* - * vma expands, overlapping all the next, and -@@ -881,6 +890,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, - end_changed = true; - } - vma->vm_pgoff = pgoff; -+ - if (adjust_next) { - next->vm_start += adjust_next; - next->vm_pgoff += adjust_next >> PAGE_SHIFT; -@@ -985,6 +995,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, - if (remove_next == 2) { - remove_next = 1; - end = next->vm_end; -+ uksm_remove_vma(next); - goto again; - } - else if (next) -@@ -1011,10 +1022,14 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, - */ - VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma)); - } -+ } else { -+ if (next && !insert) -+ uksm_vma_add_new(next); - } - if (insert && file) - uprobe_mmap(insert); - -+ uksm_vma_add_new(vma); - validate_mm(mm); - - return 0; -@@ -1470,6 +1485,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr, - vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) | - mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; - -+ /* If uksm is enabled, we add VM_MERGEABLE to new VMAs. */ -+ uksm_vm_flags_mod(&vm_flags); -+ - if (flags & MAP_LOCKED) - if (!can_do_mlock()) - return -EPERM; -@@ -1865,6 +1883,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, - allow_write_access(file); - } - file = vma->vm_file; -+ uksm_vma_add_new(vma); - out: - perf_event_mmap(vma); - -@@ -1907,6 +1926,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, - if (vm_flags & VM_DENYWRITE) - allow_write_access(file); - free_vma: -+ uksm_remove_vma(vma); - vm_area_free(vma); - unacct_error: - if (charged) -@@ -2766,6 +2786,8 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, - else - err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); - -+ uksm_vma_add_new(new); -+ - /* Success. */ - if (!err) - return 0; -@@ -3073,6 +3095,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla - if ((flags & (~VM_EXEC)) != 0) - return -EINVAL; - flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; -+ uksm_vm_flags_mod(&flags); - - mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); - if (IS_ERR_VALUE(mapped_addr)) -@@ -3118,6 +3141,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla - vma->vm_flags = flags; - vma->vm_page_prot = vm_get_page_prot(flags); - vma_link(mm, vma, prev, rb_link, rb_parent); -+ uksm_vma_add_new(vma); - out: - perf_event_mmap(vma); - mm->total_vm += len >> PAGE_SHIFT; -@@ -3195,6 +3219,12 @@ void exit_mmap(struct mm_struct *mm) - mmap_write_unlock(mm); - } - -+ /* -+ * Taking write lock on mmap does not harm others, -+ * but it's crucial for uksm to avoid races. 
-+ */ -+ mmap_write_lock(mm); -+ - if (mm->locked_vm) { - vma = mm->mmap; - while (vma) { -@@ -3230,6 +3260,11 @@ void exit_mmap(struct mm_struct *mm) - cond_resched(); - } - vm_unacct_memory(nr_accounted); -+ -+ mm->mmap = NULL; -+ mm->mm_rb = RB_ROOT; -+ vmacache_invalidate(mm); -+ mmap_write_unlock(mm); - } - - /* Insert vm structure into process list sorted by address -@@ -3337,6 +3372,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, - new_vma->vm_ops->open(new_vma); - vma_link(mm, new_vma, prev, rb_link, rb_parent); - *need_rmap_locks = false; -+ uksm_vma_add_new(new_vma); - } - return new_vma; - -@@ -3505,6 +3541,7 @@ static struct vm_area_struct *__install_special_mapping( - vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT); - - perf_event_mmap(vma); -+ uksm_vma_add_new(vma); - - return vma; - -diff --git a/mm/uksm.c b/mm/uksm.c -new file mode 100644 -index 000000000..e4732c00b ---- /dev/null -+++ b/mm/uksm.c -@@ -0,0 +1,5614 @@ -+/* -+ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia -+ * -+ * This is an improvement upon KSM. Some basic data structures and routines -+ * are borrowed from ksm.c . -+ * -+ * Its new features: -+ * 1. Full system scan: -+ * It automatically scans all user processes' anonymous VMAs. Kernel-user -+ * interaction to submit a memory area to KSM is no longer needed. -+ * -+ * 2. Rich area detection: -+ * It automatically detects rich areas containing abundant duplicated -+ * pages based. Rich areas are given a full scan speed. Poor areas are -+ * sampled at a reasonable speed with very low CPU consumption. -+ * -+ * 3. Ultra Per-page scan speed improvement: -+ * A new hash algorithm is proposed. As a result, on a machine with -+ * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it -+ * can scan memory areas that does not contain duplicated pages at speed of -+ * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of -+ * 477MB/sec ~ 923MB/sec. -+ * -+ * 4. Thrashing area avoidance: -+ * Thrashing area(an VMA that has frequent Ksm page break-out) can be -+ * filtered out. My benchmark shows it's more efficient than KSM's per-page -+ * hash value based volatile page detection. -+ * -+ * -+ * 5. Misc changes upon KSM: -+ * * It has a fully x86-opitmized memcmp dedicated for 4-byte-aligned page -+ * comparison. It's much faster than default C version on x86. -+ * * rmap_item now has an struct *page member to loosely cache a -+ * address-->page mapping, which reduces too much time-costly -+ * follow_page(). -+ * * The VMA creation/exit procedures are hooked to let the Ultra KSM know. -+ * * try_to_merge_two_pages() now can revert a pte if it fails. No break_ -+ * ksm is needed for this case. -+ * -+ * 6. Full Zero Page consideration(contributed by Figo Zhang) -+ * Now uksmd consider full zero pages as special pages and merge them to an -+ * special unswappable uksm zero page. 
-+ */ -+ -+#include <linux/errno.h> -+#include <linux/mm.h> -+#include <linux/fs.h> -+#include <linux/mman.h> -+#include <linux/sched.h> -+#include <linux/sched/mm.h> -+#include <linux/sched/coredump.h> -+#include <linux/sched/cputime.h> -+#include <linux/rwsem.h> -+#include <linux/pagemap.h> -+#include <linux/rmap.h> -+#include <linux/spinlock.h> -+#include <linux/jhash.h> -+#include <linux/delay.h> -+#include <linux/kthread.h> -+#include <linux/wait.h> -+#include <linux/slab.h> -+#include <linux/rbtree.h> -+#include <linux/memory.h> -+#include <linux/mmu_notifier.h> -+#include <linux/swap.h> -+#include <linux/ksm.h> -+#include <linux/crypto.h> -+#include <linux/scatterlist.h> -+#include <crypto/hash.h> -+#include <linux/random.h> -+#include <linux/math64.h> -+#include <linux/gcd.h> -+#include <linux/freezer.h> -+#include <linux/oom.h> -+#include <linux/numa.h> -+#include <linux/sradix-tree.h> -+ -+#include <asm/tlbflush.h> -+#include "internal.h" -+ -+#ifdef CONFIG_X86 -+#undef memcmp -+ -+#ifdef CONFIG_X86_32 -+#define memcmp memcmpx86_32 -+/* -+ * Compare 4-byte-aligned address s1 and s2, with length n -+ */ -+int memcmpx86_32(void *s1, void *s2, size_t n) -+{ -+ size_t num = n / 4; -+ register int res; -+ -+ __asm__ __volatile__ -+ ( -+ "testl %3,%3\n\t" -+ "repe; cmpsd\n\t" -+ "je 1f\n\t" -+ "sbbl %0,%0\n\t" -+ "orl $1,%0\n" -+ "1:" -+ : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num) -+ : "0" (0) -+ : "cc"); -+ -+ return res; -+} -+ -+/* -+ * Check the page is all zero ? -+ */ -+static int is_full_zero(const void *s1, size_t len) -+{ -+ unsigned char same; -+ -+ len /= 4; -+ -+ __asm__ __volatile__ -+ ("repe; scasl;" -+ "sete %0" -+ : "=qm" (same), "+D" (s1), "+c" (len) -+ : "a" (0) -+ : "cc"); -+ -+ return same; -+} -+ -+ -+#elif defined(CONFIG_X86_64) -+#define memcmp memcmpx86_64 -+/* -+ * Compare 8-byte-aligned address s1 and s2, with length n -+ */ -+int memcmpx86_64(void *s1, void *s2, size_t n) -+{ -+ size_t num = n / 8; -+ register int res; -+ -+ __asm__ __volatile__ -+ ( -+ "testq %q3,%q3\n\t" -+ "repe; cmpsq\n\t" -+ "je 1f\n\t" -+ "sbbq %q0,%q0\n\t" -+ "orq $1,%q0\n" -+ "1:" -+ : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num) -+ : "0" (0) -+ : "cc"); -+ -+ return res; -+} -+ -+static int is_full_zero(const void *s1, size_t len) -+{ -+ unsigned char same; -+ -+ len /= 8; -+ -+ __asm__ __volatile__ -+ ("repe; scasq;" -+ "sete %0" -+ : "=qm" (same), "+D" (s1), "+c" (len) -+ : "a" (0) -+ : "cc"); -+ -+ return same; -+} -+ -+#endif -+#else -+static int is_full_zero(const void *s1, size_t len) -+{ -+ unsigned long *src = s1; -+ int i; -+ -+ len /= sizeof(*src); -+ -+ for (i = 0; i < len; i++) { -+ if (src[i]) -+ return 0; -+ } -+ -+ return 1; -+} -+#endif -+ -+#define UKSM_RUNG_ROUND_FINISHED (1 << 0) -+#define TIME_RATIO_SCALE 10000 -+ -+#define SLOT_TREE_NODE_SHIFT 8 -+#define SLOT_TREE_NODE_STORE_SIZE (1UL << SLOT_TREE_NODE_SHIFT) -+struct slot_tree_node { -+ unsigned long size; -+ struct sradix_tree_node snode; -+ void *stores[SLOT_TREE_NODE_STORE_SIZE]; -+}; -+ -+static struct kmem_cache *slot_tree_node_cachep; -+ -+static struct sradix_tree_node *slot_tree_node_alloc(void) -+{ -+ struct slot_tree_node *p; -+ -+ p = kmem_cache_zalloc(slot_tree_node_cachep, GFP_KERNEL | -+ __GFP_NORETRY | __GFP_NOWARN); -+ if (!p) -+ return NULL; -+ -+ return &p->snode; -+} -+ -+static void slot_tree_node_free(struct sradix_tree_node *node) -+{ -+ struct slot_tree_node *p; -+ -+ p = container_of(node, struct slot_tree_node, snode); -+ kmem_cache_free(slot_tree_node_cachep, p); -+} -+ 
-+static void slot_tree_node_extend(struct sradix_tree_node *parent, -+ struct sradix_tree_node *child) -+{ -+ struct slot_tree_node *p, *c; -+ -+ p = container_of(parent, struct slot_tree_node, snode); -+ c = container_of(child, struct slot_tree_node, snode); -+ -+ p->size += c->size; -+} -+ -+void slot_tree_node_assign(struct sradix_tree_node *node, -+ unsigned int index, void *item) -+{ -+ struct vma_slot *slot = item; -+ struct slot_tree_node *cur; -+ -+ slot->snode = node; -+ slot->sindex = index; -+ -+ while (node) { -+ cur = container_of(node, struct slot_tree_node, snode); -+ cur->size += slot->pages; -+ node = node->parent; -+ } -+} -+ -+void slot_tree_node_rm(struct sradix_tree_node *node, unsigned int offset) -+{ -+ struct vma_slot *slot; -+ struct slot_tree_node *cur; -+ unsigned long pages; -+ -+ if (node->height == 1) { -+ slot = node->stores[offset]; -+ pages = slot->pages; -+ } else { -+ cur = container_of(node->stores[offset], -+ struct slot_tree_node, snode); -+ pages = cur->size; -+ } -+ -+ while (node) { -+ cur = container_of(node, struct slot_tree_node, snode); -+ cur->size -= pages; -+ node = node->parent; -+ } -+} -+ -+unsigned long slot_iter_index; -+int slot_iter(void *item, unsigned long height) -+{ -+ struct slot_tree_node *node; -+ struct vma_slot *slot; -+ -+ if (height == 1) { -+ slot = item; -+ if (slot_iter_index < slot->pages) { -+ /*in this one*/ -+ return 1; -+ } else { -+ slot_iter_index -= slot->pages; -+ return 0; -+ } -+ -+ } else { -+ node = container_of(item, struct slot_tree_node, snode); -+ if (slot_iter_index < node->size) { -+ /*in this one*/ -+ return 1; -+ } else { -+ slot_iter_index -= node->size; -+ return 0; -+ } -+ } -+} -+ -+ -+static inline void slot_tree_init_root(struct sradix_tree_root *root) -+{ -+ init_sradix_tree_root(root, SLOT_TREE_NODE_SHIFT); -+ root->alloc = slot_tree_node_alloc; -+ root->free = slot_tree_node_free; -+ root->extend = slot_tree_node_extend; -+ root->assign = slot_tree_node_assign; -+ root->rm = slot_tree_node_rm; -+} -+ -+void slot_tree_init(void) -+{ -+ slot_tree_node_cachep = kmem_cache_create("slot_tree_node", -+ sizeof(struct slot_tree_node), 0, -+ SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, -+ NULL); -+} -+ -+ -+/* Each rung of this ladder is a list of VMAs having a same scan ratio */ -+struct scan_rung { -+ //struct list_head scanned_list; -+ struct sradix_tree_root vma_root; -+ struct sradix_tree_root vma_root2; -+ -+ struct vma_slot *current_scan; -+ unsigned long current_offset; -+ -+ /* -+ * The initial value for current_offset, it should loop over -+ * [0~ step - 1] to let all slot have its chance to be scanned. -+ */ -+ unsigned long offset_init; -+ unsigned long step; /* dynamic step for current_offset */ -+ unsigned int flags; -+ unsigned long pages_to_scan; -+ //unsigned long fully_scanned_slots; -+ /* -+ * a little bit tricky - if cpu_time_ratio > 0, then the value is the -+ * the cpu time ratio it can spend in rung_i for every scan -+ * period. if < 0, then it is the cpu time ratio relative to the -+ * max cpu percentage user specified. Both in unit of -+ * 1/TIME_RATIO_SCALE -+ */ -+ int cpu_ratio; -+ -+ /* -+ * How long it will take for all slots in this rung to be fully -+ * scanned? If it's zero, we don't care about the cover time: -+ * it's fully scanned. 
-+ */ -+ unsigned int cover_msecs; -+ //unsigned long vma_num; -+ //unsigned long pages; /* Sum of all slot's pages in rung */ -+}; -+ -+/** -+ * node of either the stable or unstale rbtree -+ * -+ */ -+struct tree_node { -+ struct rb_node node; /* link in the main (un)stable rbtree */ -+ struct rb_root sub_root; /* rb_root for sublevel collision rbtree */ -+ u32 hash; -+ unsigned long count; /* TODO: merged with sub_root */ -+ struct list_head all_list; /* all tree nodes in stable/unstable tree */ -+}; -+ -+/** -+ * struct stable_node - node of the stable rbtree -+ * @node: rb node of this ksm page in the stable tree -+ * @hlist: hlist head of rmap_items using this ksm page -+ * @kpfn: page frame number of this ksm page -+ */ -+struct stable_node { -+ struct rb_node node; /* link in sub-rbtree */ -+ struct tree_node *tree_node; /* it's tree node root in stable tree, NULL if it's in hell list */ -+ struct hlist_head hlist; -+ unsigned long kpfn; -+ u32 hash_max; /* if ==0 then it's not been calculated yet */ -+ struct list_head all_list; /* in a list for all stable nodes */ -+}; -+ -+/** -+ * struct node_vma - group rmap_items linked in a same stable -+ * node together. -+ */ -+struct node_vma { -+ union { -+ struct vma_slot *slot; -+ unsigned long key; /* slot is used as key sorted on hlist */ -+ }; -+ struct hlist_node hlist; -+ struct hlist_head rmap_hlist; -+ struct stable_node *head; -+}; -+ -+/** -+ * struct rmap_item - reverse mapping item for virtual addresses -+ * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list -+ * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree -+ * @mm: the memory structure this rmap_item is pointing into -+ * @address: the virtual address this rmap_item tracks (+ flags in low bits) -+ * @node: rb node of this rmap_item in the unstable tree -+ * @head: pointer to stable_node heading this list in the stable tree -+ * @hlist: link into hlist of rmap_items hanging off that stable_node -+ */ -+struct rmap_item { -+ struct vma_slot *slot; -+ struct page *page; -+ unsigned long address; /* + low bits used for flags below */ -+ unsigned long hash_round; -+ unsigned long entry_index; -+ union { -+ struct {/* when in unstable tree */ -+ struct rb_node node; -+ struct tree_node *tree_node; -+ u32 hash_max; -+ }; -+ struct { /* when in stable tree */ -+ struct node_vma *head; -+ struct hlist_node hlist; -+ struct anon_vma *anon_vma; -+ }; -+ }; -+} __aligned(4); -+ -+struct rmap_list_entry { -+ union { -+ struct rmap_item *item; -+ unsigned long addr; -+ }; -+ /* lowest bit is used for is_addr tag */ -+} __aligned(4); /* 4 aligned to fit in to pages*/ -+ -+ -+/* Basic data structure definition ends */ -+ -+ -+/* -+ * Flags for rmap_item to judge if it's listed in the stable/unstable tree. 
-+ * The flags use the low bits of rmap_item.address -+ */ -+#define UNSTABLE_FLAG 0x1 -+#define STABLE_FLAG 0x2 -+#define get_rmap_addr(x) ((x)->address & PAGE_MASK) -+ -+/* -+ * rmap_list_entry helpers -+ */ -+#define IS_ADDR_FLAG 1 -+#define is_addr(ptr) ((unsigned long)(ptr) & IS_ADDR_FLAG) -+#define set_is_addr(ptr) ((ptr) |= IS_ADDR_FLAG) -+#define get_clean_addr(ptr) (((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG)) -+ -+ -+/* -+ * High speed caches for frequently allocated and freed structs -+ */ -+static struct kmem_cache *rmap_item_cache; -+static struct kmem_cache *stable_node_cache; -+static struct kmem_cache *node_vma_cache; -+static struct kmem_cache *vma_slot_cache; -+static struct kmem_cache *tree_node_cache; -+#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\ -+ sizeof(struct __struct), __alignof__(struct __struct),\ -+ (__flags), NULL) -+ -+/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */ -+#define SCAN_LADDER_SIZE 4 -+static struct scan_rung uksm_scan_ladder[SCAN_LADDER_SIZE]; -+ -+/* The evaluation rounds uksmd has finished */ -+static unsigned long long uksm_eval_round = 1; -+ -+/* -+ * we add 1 to this var when we consider we should rebuild the whole -+ * unstable tree. -+ */ -+static unsigned long uksm_hash_round = 1; -+ -+/* -+ * How many times the whole memory is scanned. -+ */ -+static unsigned long long fully_scanned_round = 1; -+ -+/* The total number of virtual pages of all vma slots */ -+static u64 uksm_pages_total; -+ -+/* The number of pages has been scanned since the start up */ -+static u64 uksm_pages_scanned; -+ -+static u64 scanned_virtual_pages; -+ -+/* The number of pages has been scanned since last encode_benefit call */ -+static u64 uksm_pages_scanned_last; -+ -+/* If the scanned number is tooo large, we encode it here */ -+static u64 pages_scanned_stored; -+ -+static unsigned long pages_scanned_base; -+ -+/* The number of nodes in the stable tree */ -+static unsigned long uksm_pages_shared; -+ -+/* The number of page slots additionally sharing those nodes */ -+static unsigned long uksm_pages_sharing; -+ -+/* The number of nodes in the unstable tree */ -+static unsigned long uksm_pages_unshared; -+ -+/* -+ * Milliseconds ksmd should sleep between scans, -+ * >= 100ms to be consistent with -+ * scan_time_to_sleep_msec() -+ */ -+static unsigned int uksm_sleep_jiffies; -+ -+/* The real value for the uksmd next sleep */ -+static unsigned int uksm_sleep_real; -+ -+/* Saved value for user input uksm_sleep_jiffies when it's enlarged */ -+static unsigned int uksm_sleep_saved; -+ -+/* Max percentage of cpu utilization ksmd can take to scan in one batch */ -+static unsigned int uksm_max_cpu_percentage; -+ -+static int uksm_cpu_governor; -+ -+static char *uksm_cpu_governor_str[4] = { "full", "medium", "low", "quiet" }; -+ -+struct uksm_cpu_preset_s { -+ int cpu_ratio[SCAN_LADDER_SIZE]; -+ unsigned int cover_msecs[SCAN_LADDER_SIZE]; -+ unsigned int max_cpu; /* percentage */ -+}; -+ -+struct uksm_cpu_preset_s uksm_cpu_preset[4] = { -+ { {20, 40, -2500, -10000}, {1000, 500, 200, 50}, 95}, -+ { {20, 30, -2500, -10000}, {1000, 500, 400, 100}, 50}, -+ { {10, 20, -5000, -10000}, {1500, 1000, 1000, 250}, 20}, -+ { {10, 20, 40, 75}, {2000, 1000, 1000, 1000}, 1}, -+}; -+ -+/* The default value for uksm_ema_page_time if it's not initialized */ -+#define UKSM_PAGE_TIME_DEFAULT 500 -+ -+/*cost to scan one page by expotional moving average in nsecs */ -+static unsigned long uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT; -+ 
-+/* The expotional moving average alpha weight, in percentage. */ -+#define EMA_ALPHA 20 -+ -+/* -+ * The threshold used to filter out thrashing areas, -+ * If it == 0, filtering is disabled, otherwise it's the percentage up-bound -+ * of the thrashing ratio of all areas. Any area with a bigger thrashing ratio -+ * will be considered as having a zero duplication ratio. -+ */ -+static unsigned int uksm_thrash_threshold = 50; -+ -+/* How much dedup ratio is considered to be abundant*/ -+static unsigned int uksm_abundant_threshold = 10; -+ -+/* All slots having merged pages in this eval round. */ -+struct list_head vma_slot_dedup = LIST_HEAD_INIT(vma_slot_dedup); -+ -+/* How many times the ksmd has slept since startup */ -+static unsigned long long uksm_sleep_times; -+ -+#define UKSM_RUN_STOP 0 -+#define UKSM_RUN_MERGE 1 -+static unsigned int uksm_run = 1; -+ -+static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait); -+static DEFINE_MUTEX(uksm_thread_mutex); -+ -+/* -+ * List vma_slot_new is for newly created vma_slot waiting to be added by -+ * ksmd. If one cannot be added(e.g. due to it's too small), it's moved to -+ * vma_slot_noadd. vma_slot_del is the list for vma_slot whose corresponding -+ * VMA has been removed/freed. -+ */ -+struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new); -+struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd); -+struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del); -+static DEFINE_SPINLOCK(vma_slot_list_lock); -+ -+/* The unstable tree heads */ -+static struct rb_root root_unstable_tree = RB_ROOT; -+ -+/* -+ * All tree_nodes are in a list to be freed at once when unstable tree is -+ * freed after each scan round. -+ */ -+static struct list_head unstable_tree_node_list = -+ LIST_HEAD_INIT(unstable_tree_node_list); -+ -+/* List contains all stable nodes */ -+static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list); -+ -+/* -+ * When the hash strength is changed, the stable tree must be delta_hashed and -+ * re-structured. We use two set of below structs to speed up the -+ * re-structuring of stable tree. 
-+ */ -+static struct list_head -+stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]), -+ LIST_HEAD_INIT(stable_tree_node_list[1])}; -+ -+static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0]; -+static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT}; -+static struct rb_root *root_stable_treep = &root_stable_tree[0]; -+static unsigned long stable_tree_index; -+ -+/* The hash strength needed to hash a full page */ -+#define HASH_STRENGTH_FULL (PAGE_SIZE / sizeof(u32)) -+ -+/* The hash strength needed for loop-back hashing */ -+#define HASH_STRENGTH_MAX (HASH_STRENGTH_FULL + 10) -+ -+/* The random offsets in a page */ -+static u32 *random_nums; -+ -+/* The hash strength */ -+static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4; -+ -+/* The delta value each time the hash strength increases or decreases */ -+static unsigned long hash_strength_delta; -+#define HASH_STRENGTH_DELTA_MAX 5 -+ -+/* The time we have saved due to random_sample_hash */ -+static u64 rshash_pos; -+ -+/* The time we have wasted due to hash collision */ -+static u64 rshash_neg; -+ -+struct uksm_benefit { -+ u64 pos; -+ u64 neg; -+ u64 scanned; -+ unsigned long base; -+} benefit; -+ -+/* -+ * The relative cost of memcmp, compared to 1 time unit of random sample -+ * hash, this value is tested when ksm module is initialized -+ */ -+static unsigned long memcmp_cost; -+ -+static unsigned long rshash_neg_cont_zero; -+static unsigned long rshash_cont_obscure; -+ -+/* The possible states of hash strength adjustment heuristic */ -+enum rshash_states { -+ RSHASH_STILL, -+ RSHASH_TRYUP, -+ RSHASH_TRYDOWN, -+ RSHASH_NEW, -+ RSHASH_PRE_STILL, -+}; -+ -+/* The possible direction we are about to adjust hash strength */ -+enum rshash_direct { -+ GO_UP, -+ GO_DOWN, -+ OBSCURE, -+ STILL, -+}; -+ -+/* random sampling hash state machine */ -+static struct { -+ enum rshash_states state; -+ enum rshash_direct pre_direct; -+ u8 below_count; -+ /* Keep a lookup window of size 5, iff above_count/below_count > 3 -+ * in this window we stop trying. -+ */ -+ u8 lookup_window_index; -+ u64 stable_benefit; -+ unsigned long turn_point_down; -+ unsigned long turn_benefit_down; -+ unsigned long turn_point_up; -+ unsigned long turn_benefit_up; -+ unsigned long stable_point; -+} rshash_state; -+ -+/*zero page hash table, hash_strength [0 ~ HASH_STRENGTH_MAX]*/ -+static u32 *zero_hash_table; -+ -+static inline struct node_vma *alloc_node_vma(void) -+{ -+ struct node_vma *node_vma; -+ -+ node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL | -+ __GFP_NORETRY | __GFP_NOWARN); -+ if (node_vma) { -+ INIT_HLIST_HEAD(&node_vma->rmap_hlist); -+ INIT_HLIST_NODE(&node_vma->hlist); -+ } -+ return node_vma; -+} -+ -+static inline void free_node_vma(struct node_vma *node_vma) -+{ -+ kmem_cache_free(node_vma_cache, node_vma); -+} -+ -+ -+static inline struct vma_slot *alloc_vma_slot(void) -+{ -+ struct vma_slot *slot; -+ -+ /* -+ * In case ksm is not initialized by now. -+ * Oops, we need to consider the call site of uksm_init() in the future. 
-+ */ -+ if (!vma_slot_cache) -+ return NULL; -+ -+ slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL | -+ __GFP_NORETRY | __GFP_NOWARN); -+ if (slot) { -+ INIT_LIST_HEAD(&slot->slot_list); -+ INIT_LIST_HEAD(&slot->dedup_list); -+ slot->flags |= UKSM_SLOT_NEED_RERAND; -+ } -+ return slot; -+} -+ -+static inline void free_vma_slot(struct vma_slot *vma_slot) -+{ -+ kmem_cache_free(vma_slot_cache, vma_slot); -+} -+ -+ -+ -+static inline struct rmap_item *alloc_rmap_item(void) -+{ -+ struct rmap_item *rmap_item; -+ -+ rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL | -+ __GFP_NORETRY | __GFP_NOWARN); -+ if (rmap_item) { -+ /* bug on lowest bit is not clear for flag use */ -+ BUG_ON(is_addr(rmap_item)); -+ } -+ return rmap_item; -+} -+ -+static inline void free_rmap_item(struct rmap_item *rmap_item) -+{ -+ rmap_item->slot = NULL; /* debug safety */ -+ kmem_cache_free(rmap_item_cache, rmap_item); -+} -+ -+static inline struct stable_node *alloc_stable_node(void) -+{ -+ struct stable_node *node; -+ -+ node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL | -+ __GFP_NORETRY | __GFP_NOWARN); -+ if (!node) -+ return NULL; -+ -+ INIT_HLIST_HEAD(&node->hlist); -+ list_add(&node->all_list, &stable_node_list); -+ return node; -+} -+ -+static inline void free_stable_node(struct stable_node *stable_node) -+{ -+ list_del(&stable_node->all_list); -+ kmem_cache_free(stable_node_cache, stable_node); -+} -+ -+static inline struct tree_node *alloc_tree_node(struct list_head *list) -+{ -+ struct tree_node *node; -+ -+ node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL | -+ __GFP_NORETRY | __GFP_NOWARN); -+ if (!node) -+ return NULL; -+ -+ list_add(&node->all_list, list); -+ return node; -+} -+ -+static inline void free_tree_node(struct tree_node *node) -+{ -+ list_del(&node->all_list); -+ kmem_cache_free(tree_node_cache, node); -+} -+ -+static void uksm_drop_anon_vma(struct rmap_item *rmap_item) -+{ -+ struct anon_vma *anon_vma = rmap_item->anon_vma; -+ -+ put_anon_vma(anon_vma); -+} -+ -+ -+/** -+ * Remove a stable node from stable_tree, may unlink from its tree_node and -+ * may remove its parent tree_node if no other stable node is pending. -+ * -+ * @stable_node The node need to be removed -+ * @unlink_rb Will this node be unlinked from the rbtree? -+ * @remove_tree_ node Will its tree_node be removed if empty? 
-+ */ -+static void remove_node_from_stable_tree(struct stable_node *stable_node, -+ int unlink_rb, int remove_tree_node) -+{ -+ struct node_vma *node_vma; -+ struct rmap_item *rmap_item; -+ struct hlist_node *n; -+ -+ if (!hlist_empty(&stable_node->hlist)) { -+ hlist_for_each_entry_safe(node_vma, n, -+ &stable_node->hlist, hlist) { -+ hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) { -+ uksm_pages_sharing--; -+ -+ uksm_drop_anon_vma(rmap_item); -+ rmap_item->address &= PAGE_MASK; -+ } -+ free_node_vma(node_vma); -+ cond_resched(); -+ } -+ -+ /* the last one is counted as shared */ -+ uksm_pages_shared--; -+ uksm_pages_sharing++; -+ } -+ -+ if (stable_node->tree_node && unlink_rb) { -+ rb_erase(&stable_node->node, -+ &stable_node->tree_node->sub_root); -+ -+ if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) && -+ remove_tree_node) { -+ rb_erase(&stable_node->tree_node->node, -+ root_stable_treep); -+ free_tree_node(stable_node->tree_node); -+ } else { -+ stable_node->tree_node->count--; -+ } -+ } -+ -+ free_stable_node(stable_node); -+} -+ -+ -+/* -+ * get_uksm_page: checks if the page indicated by the stable node -+ * is still its ksm page, despite having held no reference to it. -+ * In which case we can trust the content of the page, and it -+ * returns the gotten page; but if the page has now been zapped, -+ * remove the stale node from the stable tree and return NULL. -+ * -+ * You would expect the stable_node to hold a reference to the ksm page. -+ * But if it increments the page's count, swapping out has to wait for -+ * ksmd to come around again before it can free the page, which may take -+ * seconds or even minutes: much too unresponsive. So instead we use a -+ * "keyhole reference": access to the ksm page from the stable node peeps -+ * out through its keyhole to see if that page still holds the right key, -+ * pointing back to this stable node. This relies on freeing a PageAnon -+ * page to reset its page->mapping to NULL, and relies on no other use of -+ * a page to put something that might look like our key in page->mapping. -+ * -+ * include/linux/pagemap.h page_cache_get_speculative() is a good reference, -+ * but this is different - made simpler by uksm_thread_mutex being held, but -+ * interesting for assuming that no other use of the struct page could ever -+ * put our expected_mapping into page->mapping (or a field of the union which -+ * coincides with page->mapping). The RCU calls are not for KSM at all, but -+ * to keep the page_count protocol described with page_cache_get_speculative. -+ * -+ * Note: it is possible that get_uksm_page() will return NULL one moment, -+ * then page the next, if the page is in between page_freeze_refs() and -+ * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page -+ * is on its way to being freed; but it is an anomaly to bear in mind. -+ * -+ * @unlink_rb: if the removal of this node will firstly unlink from -+ * its rbtree. stable_node_reinsert will prevent this when restructuring the -+ * node from its old tree. -+ * -+ * @remove_tree_node: if this is the last one of its tree_node, will the -+ * tree_node be freed ? If we are inserting stable node, this tree_node may -+ * be reused, so don't free it. 
-+ */ -+static struct page *get_uksm_page(struct stable_node *stable_node, -+ int unlink_rb, int remove_tree_node) -+{ -+ struct page *page; -+ void *expected_mapping; -+ unsigned long kpfn; -+ -+ expected_mapping = (void *)((unsigned long)stable_node | -+ PAGE_MAPPING_KSM); -+again: -+ kpfn = READ_ONCE(stable_node->kpfn); -+ page = pfn_to_page(kpfn); -+ -+ /* -+ * page is computed from kpfn, so on most architectures reading -+ * page->mapping is naturally ordered after reading node->kpfn, -+ * but on Alpha we need to be more careful. -+ */ -+ smp_rmb(); -+ -+ if (READ_ONCE(page->mapping) != expected_mapping) -+ goto stale; -+ -+ /* -+ * We cannot do anything with the page while its refcount is 0. -+ * Usually 0 means free, or tail of a higher-order page: in which -+ * case this node is no longer referenced, and should be freed; -+ * however, it might mean that the page is under page_freeze_refs(). -+ * The __remove_mapping() case is easy, again the node is now stale; -+ * but if page is swapcache in migrate_page_move_mapping(), it might -+ * still be our page, in which case it's essential to keep the node. -+ */ -+ while (!get_page_unless_zero(page)) { -+ /* -+ * Another check for page->mapping != expected_mapping would -+ * work here too. We have chosen the !PageSwapCache test to -+ * optimize the common case, when the page is or is about to -+ * be freed: PageSwapCache is cleared (under spin_lock_irq) -+ * in the freeze_refs section of __remove_mapping(); but Anon -+ * page->mapping reset to NULL later, in free_pages_prepare(). -+ */ -+ if (!PageSwapCache(page)) -+ goto stale; -+ cpu_relax(); -+ } -+ -+ if (READ_ONCE(page->mapping) != expected_mapping) { -+ put_page(page); -+ goto stale; -+ } -+ -+ lock_page(page); -+ if (READ_ONCE(page->mapping) != expected_mapping) { -+ unlock_page(page); -+ put_page(page); -+ goto stale; -+ } -+ unlock_page(page); -+ return page; -+stale: -+ /* -+ * We come here from above when page->mapping or !PageSwapCache -+ * suggests that the node is stale; but it might be under migration. -+ * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(), -+ * before checking whether node->kpfn has been changed. -+ */ -+ smp_rmb(); -+ if (stable_node->kpfn != kpfn) -+ goto again; -+ -+ remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node); -+ -+ return NULL; -+} -+ -+/* -+ * Removing rmap_item from stable or unstable tree. -+ * This function will clean the information from the stable/unstable tree. -+ */ -+static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item) -+{ -+ if (rmap_item->address & STABLE_FLAG) { -+ struct stable_node *stable_node; -+ struct node_vma *node_vma; -+ struct page *page; -+ -+ node_vma = rmap_item->head; -+ stable_node = node_vma->head; -+ page = get_uksm_page(stable_node, 1, 1); -+ if (!page) -+ goto out; -+ -+ /* -+ * page lock is needed because it's racing with -+ * try_to_unmap_ksm(), etc. -+ */ -+ lock_page(page); -+ hlist_del(&rmap_item->hlist); -+ -+ if (hlist_empty(&node_vma->rmap_hlist)) { -+ hlist_del(&node_vma->hlist); -+ free_node_vma(node_vma); -+ } -+ unlock_page(page); -+ -+ put_page(page); -+ if (hlist_empty(&stable_node->hlist)) { -+ /* do NOT call remove_node_from_stable_tree() here, -+ * it's possible for a forked rmap_item not in -+ * stable tree while the in-tree rmap_items were -+ * deleted. 
-+ */ -+ uksm_pages_shared--; -+ } else -+ uksm_pages_sharing--; -+ -+ -+ uksm_drop_anon_vma(rmap_item); -+ } else if (rmap_item->address & UNSTABLE_FLAG) { -+ if (rmap_item->hash_round == uksm_hash_round) { -+ -+ rb_erase(&rmap_item->node, -+ &rmap_item->tree_node->sub_root); -+ if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) { -+ rb_erase(&rmap_item->tree_node->node, -+ &root_unstable_tree); -+ -+ free_tree_node(rmap_item->tree_node); -+ } else -+ rmap_item->tree_node->count--; -+ } -+ uksm_pages_unshared--; -+ } -+ -+ rmap_item->address &= PAGE_MASK; -+ rmap_item->hash_max = 0; -+ -+out: -+ cond_resched(); /* we're called from many long loops */ -+} -+ -+static inline int slot_in_uksm(struct vma_slot *slot) -+{ -+ return list_empty(&slot->slot_list); -+} -+ -+/* -+ * Test if the mm is exiting -+ */ -+static inline bool uksm_test_exit(struct mm_struct *mm) -+{ -+ return atomic_read(&mm->mm_users) == 0; -+} -+ -+static inline unsigned long vma_pool_size(struct vma_slot *slot) -+{ -+ return round_up(sizeof(struct rmap_list_entry) * slot->pages, -+ PAGE_SIZE) >> PAGE_SHIFT; -+} -+ -+#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta)) -+ -+/* must be done with sem locked */ -+static int slot_pool_alloc(struct vma_slot *slot) -+{ -+ unsigned long pool_size; -+ -+ if (slot->rmap_list_pool) -+ return 0; -+ -+ pool_size = vma_pool_size(slot); -+ slot->rmap_list_pool = kcalloc(pool_size, sizeof(struct page *), -+ GFP_KERNEL); -+ if (!slot->rmap_list_pool) -+ return -ENOMEM; -+ -+ slot->pool_counts = kcalloc(pool_size, sizeof(unsigned int), -+ GFP_KERNEL); -+ if (!slot->pool_counts) { -+ kfree(slot->rmap_list_pool); -+ return -ENOMEM; -+ } -+ -+ slot->pool_size = pool_size; -+ BUG_ON(CAN_OVERFLOW_U64(uksm_pages_total, slot->pages)); -+ slot->flags |= UKSM_SLOT_IN_UKSM; -+ uksm_pages_total += slot->pages; -+ -+ return 0; -+} -+ -+/* -+ * Called after vma is unlinked from its mm -+ */ -+void uksm_remove_vma(struct vm_area_struct *vma) -+{ -+ struct vma_slot *slot; -+ -+ if (!vma->uksm_vma_slot) -+ return; -+ -+ spin_lock(&vma_slot_list_lock); -+ slot = vma->uksm_vma_slot; -+ if (!slot) -+ goto out; -+ -+ if (slot_in_uksm(slot)) { -+ /** -+ * This slot has been added by ksmd, so move to the del list -+ * waiting ksmd to free it. -+ */ -+ list_add_tail(&slot->slot_list, &vma_slot_del); -+ } else { -+ /** -+ * It's still on new list. It's ok to free slot directly. -+ */ -+ list_del(&slot->slot_list); -+ free_vma_slot(slot); -+ } -+out: -+ vma->uksm_vma_slot = NULL; -+ spin_unlock(&vma_slot_list_lock); -+} -+ -+/** -+ * Need to do two things: -+ * 1. check if slot was moved to del list -+ * 2. make sure the mmap_sem is manipulated under valid vma. -+ * -+ * My concern here is that in some cases, this may make -+ * vma_slot_list_lock() waiters to serialized further by some -+ * sem->wait_lock, can this really be expensive? -+ * -+ * -+ * @return -+ * 0: if successfully locked mmap_sem -+ * -ENOENT: this slot was moved to del list -+ * -EBUSY: vma lock failed -+ */ -+static int try_down_read_slot_mmap_sem(struct vma_slot *slot) -+{ -+ struct vm_area_struct *vma; -+ struct mm_struct *mm; -+ struct rw_semaphore *sem; -+ -+ spin_lock(&vma_slot_list_lock); -+ -+ /* the slot_list was removed and inited from new list, when it enters -+ * uksm_list. 
If it is not empty now, then it must have been moved to the del list
-+ */
-+ if (!slot_in_uksm(slot)) {
-+ spin_unlock(&vma_slot_list_lock);
-+ return -ENOENT;
-+ }
-+
-+ BUG_ON(slot->pages != vma_pages(slot->vma));
-+ /* Ok, vma still valid */
-+ vma = slot->vma;
-+ mm = vma->vm_mm;
-+ sem = &mm->mmap_lock;
-+
-+ if (uksm_test_exit(mm)) {
-+ spin_unlock(&vma_slot_list_lock);
-+ return -ENOENT;
-+ }
-+
-+ if (down_read_trylock(sem)) {
-+ spin_unlock(&vma_slot_list_lock);
-+ if (slot_pool_alloc(slot)) {
-+ uksm_remove_vma(vma);
-+ up_read(sem);
-+ return -ENOENT;
-+ }
-+ return 0;
-+ }
-+
-+ spin_unlock(&vma_slot_list_lock);
-+ return -EBUSY;
-+}
-+
-+static inline unsigned long
-+vma_page_address(struct page *page, struct vm_area_struct *vma)
-+{
-+ pgoff_t pgoff = page->index;
-+ unsigned long address;
-+
-+ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-+ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
-+ /* page should be within @vma mapping range */
-+ return -EFAULT;
-+ }
-+ return address;
-+}
-+
-+
-+/* return 0 on success with the item's mmap_sem locked */
-+static inline int get_mergeable_page_lock_mmap(struct rmap_item *item)
-+{
-+ struct mm_struct *mm;
-+ struct vma_slot *slot = item->slot;
-+ int err = -EINVAL;
-+
-+ struct page *page;
-+
-+ /*
-+ * try_down_read_slot_mmap_sem() returns non-zero if the slot
-+ * has been removed by uksm_remove_vma().
-+ */
-+ if (try_down_read_slot_mmap_sem(slot))
-+ return -EBUSY;
-+
-+ mm = slot->vma->vm_mm;
-+
-+ if (uksm_test_exit(mm))
-+ goto failout_up;
-+
-+ page = item->page;
-+ rcu_read_lock();
-+ if (!get_page_unless_zero(page)) {
-+ rcu_read_unlock();
-+ goto failout_up;
-+ }
-+
-+ /* No need to consider huge page here. */
-+ if (item->slot->vma->anon_vma != page_anon_vma(page) ||
-+ vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) {
-+ /*
-+ * TODO:
-+ * should we release this item because of its stale page
-+ * mapping?
-+ */
-+ put_page(page);
-+ rcu_read_unlock();
-+ goto failout_up;
-+ }
-+ rcu_read_unlock();
-+ return 0;
-+
-+failout_up:
-+ mmap_read_unlock(mm);
-+ return err;
-+}
-+
-+/*
-+ * What kind of VMA is considered?
-+ */
-+static inline int vma_can_enter(struct vm_area_struct *vma)
-+{
-+ return uksm_flags_can_scan(vma->vm_flags);
-+}
-+
-+/*
-+ * Called whenever a fresh new vma is created. A new vma_slot
-+ * is created and inserted into a global list. Must be called
-+ * after the vma is inserted into its mm.
-+ */ -+void uksm_vma_add_new(struct vm_area_struct *vma) -+{ -+ struct vma_slot *slot; -+ -+ if (!vma_can_enter(vma)) { -+ vma->uksm_vma_slot = NULL; -+ return; -+ } -+ -+ slot = alloc_vma_slot(); -+ if (!slot) { -+ vma->uksm_vma_slot = NULL; -+ return; -+ } -+ -+ vma->uksm_vma_slot = slot; -+ vma->vm_flags |= VM_MERGEABLE; -+ slot->vma = vma; -+ slot->mm = vma->vm_mm; -+ slot->ctime_j = jiffies; -+ slot->pages = vma_pages(vma); -+ spin_lock(&vma_slot_list_lock); -+ list_add_tail(&slot->slot_list, &vma_slot_new); -+ spin_unlock(&vma_slot_list_lock); -+} -+ -+/* 32/3 < they < 32/2 */ -+#define shiftl 8 -+#define shiftr 12 -+ -+#define HASH_FROM_TO(from, to) \ -+for (index = from; index < to; index++) { \ -+ pos = random_nums[index]; \ -+ hash += key[pos]; \ -+ hash += (hash << shiftl); \ -+ hash ^= (hash >> shiftr); \ -+} -+ -+ -+#define HASH_FROM_DOWN_TO(from, to) \ -+for (index = from - 1; index >= to; index--) { \ -+ hash ^= (hash >> shiftr); \ -+ hash ^= (hash >> (shiftr*2)); \ -+ hash -= (hash << shiftl); \ -+ hash += (hash << (shiftl*2)); \ -+ pos = random_nums[index]; \ -+ hash -= key[pos]; \ -+} -+ -+/* -+ * The main random sample hash function. -+ */ -+static u32 random_sample_hash(void *addr, u32 hash_strength) -+{ -+ u32 hash = 0xdeadbeef; -+ int index, pos, loop = hash_strength; -+ u32 *key = (u32 *)addr; -+ -+ if (loop > HASH_STRENGTH_FULL) -+ loop = HASH_STRENGTH_FULL; -+ -+ HASH_FROM_TO(0, loop); -+ -+ if (hash_strength > HASH_STRENGTH_FULL) { -+ loop = hash_strength - HASH_STRENGTH_FULL; -+ HASH_FROM_TO(0, loop); -+ } -+ -+ return hash; -+} -+ -+ -+/** -+ * It's used when hash strength is adjusted -+ * -+ * @addr The page's virtual address -+ * @from The original hash strength -+ * @to The hash strength changed to -+ * @hash The hash value generated with "from" hash value -+ * -+ * return the hash value -+ */ -+static u32 delta_hash(void *addr, int from, int to, u32 hash) -+{ -+ u32 *key = (u32 *)addr; -+ int index, pos; /* make sure they are int type */ -+ -+ if (to > from) { -+ if (from >= HASH_STRENGTH_FULL) { -+ from -= HASH_STRENGTH_FULL; -+ to -= HASH_STRENGTH_FULL; -+ HASH_FROM_TO(from, to); -+ } else if (to <= HASH_STRENGTH_FULL) { -+ HASH_FROM_TO(from, to); -+ } else { -+ HASH_FROM_TO(from, HASH_STRENGTH_FULL); -+ HASH_FROM_TO(0, to - HASH_STRENGTH_FULL); -+ } -+ } else { -+ if (from <= HASH_STRENGTH_FULL) { -+ HASH_FROM_DOWN_TO(from, to); -+ } else if (to >= HASH_STRENGTH_FULL) { -+ from -= HASH_STRENGTH_FULL; -+ to -= HASH_STRENGTH_FULL; -+ HASH_FROM_DOWN_TO(from, to); -+ } else { -+ HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0); -+ HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to); -+ } -+ } -+ -+ return hash; -+} -+ -+/** -+ * -+ * Called when: rshash_pos or rshash_neg is about to overflow or a scan round -+ * has finished. -+ * -+ * return 0 if no page has been scanned since last call, 1 otherwise. 
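-+ *
-+ * Note on the encoding (added for clarity): benefit.pos, benefit.neg and
-+ * benefit.scanned are all kept scaled down by 2^benefit.base, so the true
-+ * accumulated value is approximately (stored value << base). Whenever an
-+ * addition would overflow a u64, every stored value is halved and base is
-+ * incremented by one.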
-+ */ -+static inline int encode_benefit(void) -+{ -+ u64 scanned_delta, pos_delta, neg_delta; -+ unsigned long base = benefit.base; -+ -+ scanned_delta = uksm_pages_scanned - uksm_pages_scanned_last; -+ -+ if (!scanned_delta) -+ return 0; -+ -+ scanned_delta >>= base; -+ pos_delta = rshash_pos >> base; -+ neg_delta = rshash_neg >> base; -+ -+ if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) || -+ CAN_OVERFLOW_U64(benefit.neg, neg_delta) || -+ CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) { -+ benefit.scanned >>= 1; -+ benefit.neg >>= 1; -+ benefit.pos >>= 1; -+ benefit.base++; -+ scanned_delta >>= 1; -+ pos_delta >>= 1; -+ neg_delta >>= 1; -+ } -+ -+ benefit.pos += pos_delta; -+ benefit.neg += neg_delta; -+ benefit.scanned += scanned_delta; -+ -+ BUG_ON(!benefit.scanned); -+ -+ rshash_pos = rshash_neg = 0; -+ uksm_pages_scanned_last = uksm_pages_scanned; -+ -+ return 1; -+} -+ -+static inline void reset_benefit(void) -+{ -+ benefit.pos = 0; -+ benefit.neg = 0; -+ benefit.base = 0; -+ benefit.scanned = 0; -+} -+ -+static inline void inc_rshash_pos(unsigned long delta) -+{ -+ if (CAN_OVERFLOW_U64(rshash_pos, delta)) -+ encode_benefit(); -+ -+ rshash_pos += delta; -+} -+ -+static inline void inc_rshash_neg(unsigned long delta) -+{ -+ if (CAN_OVERFLOW_U64(rshash_neg, delta)) -+ encode_benefit(); -+ -+ rshash_neg += delta; -+} -+ -+ -+static inline u32 page_hash(struct page *page, unsigned long hash_strength, -+ int cost_accounting) -+{ -+ u32 val; -+ unsigned long delta; -+ -+ void *addr = kmap_atomic(page); -+ -+ val = random_sample_hash(addr, hash_strength); -+ kunmap_atomic(addr); -+ -+ if (cost_accounting) { -+ if (hash_strength < HASH_STRENGTH_FULL) -+ delta = HASH_STRENGTH_FULL - hash_strength; -+ else -+ delta = 0; -+ -+ inc_rshash_pos(delta); -+ } -+ -+ return val; -+} -+ -+static int memcmp_pages_with_cost(struct page *page1, struct page *page2, -+ int cost_accounting) -+{ -+ char *addr1, *addr2; -+ int ret; -+ -+ addr1 = kmap_atomic(page1); -+ addr2 = kmap_atomic(page2); -+ ret = memcmp(addr1, addr2, PAGE_SIZE); -+ kunmap_atomic(addr2); -+ kunmap_atomic(addr1); -+ -+ if (cost_accounting) -+ inc_rshash_neg(memcmp_cost); -+ -+ return ret; -+} -+ -+static inline int pages_identical_with_cost(struct page *page1, struct page *page2) -+{ -+ return !memcmp_pages_with_cost(page1, page2, 0); -+} -+ -+static inline int is_page_full_zero(struct page *page) -+{ -+ char *addr; -+ int ret; -+ -+ addr = kmap_atomic(page); -+ ret = is_full_zero(addr, PAGE_SIZE); -+ kunmap_atomic(addr); -+ -+ return ret; -+} -+ -+static int write_protect_page(struct vm_area_struct *vma, struct page *page, -+ pte_t *orig_pte, pte_t *old_pte) -+{ -+ struct mm_struct *mm = vma->vm_mm; -+ struct page_vma_mapped_walk pvmw = { -+ .page = page, -+ .vma = vma, -+ }; -+ struct mmu_notifier_range range; -+ int swapped; -+ int err = -EFAULT; -+ -+ pvmw.address = page_address_in_vma(page, vma); -+ if (pvmw.address == -EFAULT) -+ goto out; -+ -+ BUG_ON(PageTransCompound(page)); -+ -+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, pvmw.address, -+ pvmw.address + PAGE_SIZE); -+ mmu_notifier_invalidate_range_start(&range); -+ -+ if (!page_vma_mapped_walk(&pvmw)) -+ goto out_mn; -+ if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?")) -+ goto out_unlock; -+ -+ if (old_pte) -+ *old_pte = *pvmw.pte; -+ -+ if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) || -+ (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) || mm_tlb_flush_pending(mm)) { -+ pte_t entry; -+ -+ swapped = PageSwapCache(page); -+ flush_cache_page(vma, 
pvmw.address, page_to_pfn(page)); -+ /* -+ * Ok this is tricky, when get_user_pages_fast() run it doesn't -+ * take any lock, therefore the check that we are going to make -+ * with the pagecount against the mapcount is racey and -+ * O_DIRECT can happen right after the check. -+ * So we clear the pte and flush the tlb before the check -+ * this assure us that no O_DIRECT can happen after the check -+ * or in the middle of the check. -+ */ -+ entry = ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte); -+ /* -+ * Check that no O_DIRECT or similar I/O is in progress on the -+ * page -+ */ -+ if (page_mapcount(page) + 1 + swapped != page_count(page)) { -+ set_pte_at(mm, pvmw.address, pvmw.pte, entry); -+ goto out_unlock; -+ } -+ if (pte_dirty(entry)) -+ set_page_dirty(page); -+ -+ if (pte_protnone(entry)) -+ entry = pte_mkclean(pte_clear_savedwrite(entry)); -+ else -+ entry = pte_mkclean(pte_wrprotect(entry)); -+ -+ set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry); -+ } -+ *orig_pte = *pvmw.pte; -+ err = 0; -+ -+out_unlock: -+ page_vma_mapped_walk_done(&pvmw); -+out_mn: -+ mmu_notifier_invalidate_range_end(&range); -+out: -+ return err; -+} -+ -+#define MERGE_ERR_PGERR 1 /* the page is invalid cannot continue */ -+#define MERGE_ERR_COLLI 2 /* there is a collision */ -+#define MERGE_ERR_COLLI_MAX 3 /* collision at the max hash strength */ -+#define MERGE_ERR_CHANGED 4 /* the page has changed since last hash */ -+ -+ -+/** -+ * replace_page - replace page in vma by new ksm page -+ * @vma: vma that holds the pte pointing to page -+ * @page: the page we are replacing by kpage -+ * @kpage: the ksm page we replace page by -+ * @orig_pte: the original value of the pte -+ * -+ * Returns 0 on success, MERGE_ERR_PGERR on failure. -+ */ -+static int replace_page(struct vm_area_struct *vma, struct page *page, -+ struct page *kpage, pte_t orig_pte) -+{ -+ struct mm_struct *mm = vma->vm_mm; -+ struct mmu_notifier_range range; -+ pgd_t *pgd; -+ p4d_t *p4d; -+ pud_t *pud; -+ pmd_t *pmd; -+ pte_t *ptep; -+ spinlock_t *ptl; -+ pte_t entry; -+ -+ unsigned long addr; -+ int err = MERGE_ERR_PGERR; -+ -+ addr = page_address_in_vma(page, vma); -+ if (addr == -EFAULT) -+ goto out; -+ -+ pgd = pgd_offset(mm, addr); -+ if (!pgd_present(*pgd)) -+ goto out; -+ -+ p4d = p4d_offset(pgd, addr); -+ pud = pud_offset(p4d, addr); -+ if (!pud_present(*pud)) -+ goto out; -+ -+ pmd = pmd_offset(pud, addr); -+ BUG_ON(pmd_trans_huge(*pmd)); -+ if (!pmd_present(*pmd)) -+ goto out; -+ -+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, -+ addr + PAGE_SIZE); -+ mmu_notifier_invalidate_range_start(&range); -+ -+ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); -+ if (!pte_same(*ptep, orig_pte)) { -+ pte_unmap_unlock(ptep, ptl); -+ goto out_mn; -+ } -+ -+ flush_cache_page(vma, addr, pte_pfn(*ptep)); -+ ptep_clear_flush_notify(vma, addr, ptep); -+ entry = mk_pte(kpage, vma->vm_page_prot); -+ -+ /* special treatment is needed for zero_page */ -+ if ((page_to_pfn(kpage) == uksm_zero_pfn) || -+ (page_to_pfn(kpage) == zero_pfn)) { -+ entry = pte_mkspecial(entry); -+ dec_mm_counter(mm, MM_ANONPAGES); -+ inc_zone_page_state(page, NR_UKSM_ZERO_PAGES); -+ } else { -+ get_page(kpage); -+ page_add_anon_rmap(kpage, vma, addr, false); -+ } -+ -+ set_pte_at_notify(mm, addr, ptep, entry); -+ -+ page_remove_rmap(page, false); -+ if (!page_mapped(page)) -+ try_to_free_swap(page); -+ put_page(page); -+ -+ pte_unmap_unlock(ptep, ptl); -+ err = 0; -+out_mn: -+ mmu_notifier_invalidate_range_end(&range); -+out: -+ return err; -+} -+ 
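-+/*
-+ * Aside (not part of the original patch): the sampling idea behind
-+ * random_sample_hash() above, in minimal form. Only `strength` of the
-+ * page's PAGE_SIZE / 4 32-bit words are mixed into the hash, chosen via
-+ * the precomputed random_nums permutation, so hashing cost scales with
-+ * the adaptive hash strength rather than with the page size:
-+ *
-+ *	u32 hash = 0xdeadbeef;
-+ *	for (i = 0; i < strength; i++) {
-+ *		hash += key[random_nums[i]];
-+ *		hash += (hash << shiftl);
-+ *		hash ^= (hash >> shiftr);
-+ *	}
-+ */
-+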
-+
-+/**
-+ * Fully hash a page with HASH_STRENGTH_MAX and return a non-zero hash
-+ * value. The zero hash value at HASH_STRENGTH_MAX is used to indicate
-+ * that its hash_max member has not been calculated.
-+ *
-+ * @page The page needs to be hashed
-+ * @hash_old The hash value calculated with the current hash strength
-+ *
-+ * return the new hash value calculated at HASH_STRENGTH_MAX
-+ */
-+static inline u32 page_hash_max(struct page *page, u32 hash_old)
-+{
-+ u32 hash_max = 0;
-+ void *addr;
-+
-+ addr = kmap_atomic(page);
-+ hash_max = delta_hash(addr, hash_strength,
-+ HASH_STRENGTH_MAX, hash_old);
-+
-+ kunmap_atomic(addr);
-+
-+ if (!hash_max)
-+ hash_max = 1;
-+
-+ inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
-+ return hash_max;
-+}
-+
-+/*
-+ * We compare the hash again, to ensure that it is really a hash collision
-+ * instead of being caused by page write.
-+ */
-+static inline int check_collision(struct rmap_item *rmap_item,
-+ u32 hash)
-+{
-+ int err;
-+ struct page *page = rmap_item->page;
-+
-+ /* if this rmap_item has already been hash_maxed, then the collision
-+ * must appear in the second-level rbtree search. In this case we check
-+ * if its hash_max value has been changed. Otherwise, the collision
-+ * happens in the first-level rbtree search, so we check against its
-+ * current hash value.
-+ */
-+ if (rmap_item->hash_max) {
-+ inc_rshash_neg(memcmp_cost);
-+ inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
-+
-+ if (rmap_item->hash_max == page_hash_max(page, hash))
-+ err = MERGE_ERR_COLLI;
-+ else
-+ err = MERGE_ERR_CHANGED;
-+ } else {
-+ inc_rshash_neg(memcmp_cost + hash_strength);
-+
-+ if (page_hash(page, hash_strength, 0) == hash)
-+ err = MERGE_ERR_COLLI;
-+ else
-+ err = MERGE_ERR_CHANGED;
-+ }
-+
-+ return err;
-+}
-+
-+/**
-+ * Try to merge a rmap_item.page with a kpage in a stable node. kpage must
-+ * already be a ksm page.
-+ *
-+ * @return 0 if the pages were merged, -EFAULT otherwise.
-+ */
-+static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item,
-+ struct page *kpage, u32 hash)
-+{
-+ struct vm_area_struct *vma = rmap_item->slot->vma;
-+ struct mm_struct *mm = vma->vm_mm;
-+ pte_t orig_pte = __pte(0);
-+ int err = MERGE_ERR_PGERR;
-+ struct page *page;
-+
-+ if (uksm_test_exit(mm))
-+ goto out;
-+
-+ page = rmap_item->page;
-+
-+ if (page == kpage) { /* ksm page forked */
-+ err = 0;
-+ goto out;
-+ }
-+
-+ /*
-+ * We need the page lock to read a stable PageSwapCache in
-+ * write_protect_page(). We use trylock_page() instead of
-+ * lock_page() because we don't want to wait here - we
-+ * prefer to continue scanning and merging different pages,
-+ * then come back to this page when it is unlocked.
-+ */
-+ if (!trylock_page(page))
-+ goto out;
-+
-+ if (!PageAnon(page) || !PageKsm(kpage))
-+ goto out_unlock;
-+
-+ if (PageTransCompound(page)) {
-+ err = split_huge_page(page);
-+ if (err)
-+ goto out_unlock;
-+ }
-+
-+ /*
-+ * If this anonymous page is mapped only here, its pte may need
-+ * to be write-protected. If it's mapped elsewhere, all of its
-+ * ptes are necessarily already write-protected. But in either
-+ * case, we need to lock and check page_count is not raised.
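-+ * (write_protect_page() does that check by comparing
-+ * page_mapcount(page) + 1 + swapped against page_count(page):
-+ * any extra reference, e.g. a transient O_DIRECT pin, makes the
-+ * counts disagree and the merge is abandoned.)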
-+ */ -+ if (write_protect_page(vma, page, &orig_pte, NULL) == 0) { -+ if (pages_identical_with_cost(page, kpage)) -+ err = replace_page(vma, page, kpage, orig_pte); -+ else -+ err = check_collision(rmap_item, hash); -+ } -+ -+ if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { -+ munlock_vma_page(page); -+ if (!PageMlocked(kpage)) { -+ unlock_page(page); -+ lock_page(kpage); -+ mlock_vma_page(kpage); -+ page = kpage; /* for final unlock */ -+ } -+ } -+ -+out_unlock: -+ unlock_page(page); -+out: -+ return err; -+} -+ -+ -+ -+/** -+ * If two pages fail to merge in try_to_merge_two_pages, then we have a chance -+ * to restore a page mapping that has been changed in try_to_merge_two_pages. -+ * -+ * @return 0 on success. -+ */ -+static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr, -+ pte_t orig_pte, pte_t wprt_pte) -+{ -+ struct mm_struct *mm = vma->vm_mm; -+ pgd_t *pgd; -+ p4d_t *p4d; -+ pud_t *pud; -+ pmd_t *pmd; -+ pte_t *ptep; -+ spinlock_t *ptl; -+ -+ int err = -EFAULT; -+ -+ pgd = pgd_offset(mm, addr); -+ if (!pgd_present(*pgd)) -+ goto out; -+ -+ p4d = p4d_offset(pgd, addr); -+ pud = pud_offset(p4d, addr); -+ if (!pud_present(*pud)) -+ goto out; -+ -+ pmd = pmd_offset(pud, addr); -+ if (!pmd_present(*pmd)) -+ goto out; -+ -+ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); -+ if (!pte_same(*ptep, wprt_pte)) { -+ /* already copied, let it be */ -+ pte_unmap_unlock(ptep, ptl); -+ goto out; -+ } -+ -+ /* -+ * Good boy, still here. When we still get the ksm page, it does not -+ * return to the free page pool, there is no way that a pte was changed -+ * to other page and gets back to this page. And remind that ksm page -+ * do not reuse in do_wp_page(). So it's safe to restore the original -+ * pte. -+ */ -+ flush_cache_page(vma, addr, pte_pfn(*ptep)); -+ ptep_clear_flush_notify(vma, addr, ptep); -+ set_pte_at_notify(mm, addr, ptep, orig_pte); -+ -+ pte_unmap_unlock(ptep, ptl); -+ err = 0; -+out: -+ return err; -+} -+ -+/** -+ * try_to_merge_two_pages() - take two identical pages and prepare -+ * them to be merged into one page(rmap_item->page) -+ * -+ * @return 0 if we successfully merged two identical pages into -+ * one ksm page. MERGE_ERR_COLLI if it's only a hash collision -+ * search in rbtree. MERGE_ERR_CHANGED if rmap_item has been -+ * changed since it's hashed. MERGE_ERR_PGERR otherwise. -+ * -+ */ -+static int try_to_merge_two_pages(struct rmap_item *rmap_item, -+ struct rmap_item *tree_rmap_item, -+ u32 hash) -+{ -+ pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0); -+ pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0); -+ struct vm_area_struct *vma1 = rmap_item->slot->vma; -+ struct vm_area_struct *vma2 = tree_rmap_item->slot->vma; -+ struct page *page = rmap_item->page; -+ struct page *tree_page = tree_rmap_item->page; -+ int err = MERGE_ERR_PGERR; -+ struct address_space *saved_mapping; -+ -+ -+ if (rmap_item->page == tree_rmap_item->page) -+ goto out; -+ -+ if (!trylock_page(page)) -+ goto out; -+ -+ if (!PageAnon(page)) -+ goto out_unlock; -+ -+ if (PageTransCompound(page)) { -+ err = split_huge_page(page); -+ if (err) -+ goto out_unlock; -+ } -+ -+ if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) { -+ unlock_page(page); -+ goto out; -+ } -+ -+ /* -+ * While we hold page lock, upgrade page from -+ * PageAnon+anon_vma to PageKsm+NULL stable_node: -+ * stable_tree_insert() will update stable_node. 
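-+ * (PageKsm is encoded by the PAGE_MAPPING_KSM bits in page->mapping,
-+ * which is why saved_mapping is kept below: the original mapping can
-+ * then be restored if the merge falls through.)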
-+ */ -+ saved_mapping = page->mapping; -+ set_page_stable_node(page, NULL); -+ mark_page_accessed(page); -+ if (!PageDirty(page)) -+ SetPageDirty(page); -+ -+ unlock_page(page); -+ -+ if (!trylock_page(tree_page)) -+ goto restore_out; -+ -+ if (!PageAnon(tree_page)) { -+ unlock_page(tree_page); -+ goto restore_out; -+ } -+ -+ if (PageTransCompound(tree_page)) { -+ err = split_huge_page(tree_page); -+ if (err) { -+ unlock_page(tree_page); -+ goto restore_out; -+ } -+ } -+ -+ if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) { -+ unlock_page(tree_page); -+ goto restore_out; -+ } -+ -+ if (pages_identical_with_cost(page, tree_page)) { -+ err = replace_page(vma2, tree_page, page, wprt_pte2); -+ if (err) { -+ unlock_page(tree_page); -+ goto restore_out; -+ } -+ -+ if ((vma2->vm_flags & VM_LOCKED)) { -+ munlock_vma_page(tree_page); -+ if (!PageMlocked(page)) { -+ unlock_page(tree_page); -+ lock_page(page); -+ mlock_vma_page(page); -+ tree_page = page; /* for final unlock */ -+ } -+ } -+ -+ unlock_page(tree_page); -+ -+ goto out; /* success */ -+ -+ } else { -+ if (tree_rmap_item->hash_max && -+ tree_rmap_item->hash_max == rmap_item->hash_max) { -+ err = MERGE_ERR_COLLI_MAX; -+ } else if (page_hash(page, hash_strength, 0) == -+ page_hash(tree_page, hash_strength, 0)) { -+ inc_rshash_neg(memcmp_cost + hash_strength * 2); -+ err = MERGE_ERR_COLLI; -+ } else { -+ err = MERGE_ERR_CHANGED; -+ } -+ -+ unlock_page(tree_page); -+ } -+ -+restore_out: -+ lock_page(page); -+ if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item), -+ orig_pte1, wprt_pte1)) -+ page->mapping = saved_mapping; -+ -+out_unlock: -+ unlock_page(page); -+out: -+ return err; -+} -+ -+static inline int hash_cmp(u32 new_val, u32 node_val) -+{ -+ if (new_val > node_val) -+ return 1; -+ else if (new_val < node_val) -+ return -1; -+ else -+ return 0; -+} -+ -+static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash) -+{ -+ u32 hash_max = item->hash_max; -+ -+ if (!hash_max) { -+ hash_max = page_hash_max(item->page, hash); -+ -+ item->hash_max = hash_max; -+ } -+ -+ return hash_max; -+} -+ -+ -+ -+/** -+ * stable_tree_search() - search the stable tree for a page -+ * -+ * @item: the rmap_item we are comparing with -+ * @hash: the hash value of this item->page already calculated -+ * -+ * @return the page we have found, NULL otherwise. The page returned has -+ * been gotten. -+ */ -+static struct page *stable_tree_search(struct rmap_item *item, u32 hash) -+{ -+ struct rb_node *node = root_stable_treep->rb_node; -+ struct tree_node *tree_node; -+ unsigned long hash_max; -+ struct page *page = item->page; -+ struct stable_node *stable_node; -+ -+ stable_node = page_stable_node(page); -+ if (stable_node) { -+ /* ksm page forked, that is -+ * if (PageKsm(page) && !in_stable_tree(rmap_item)) -+ * it's actually gotten once outside. -+ */ -+ get_page(page); -+ return page; -+ } -+ -+ while (node) { -+ int cmp; -+ -+ tree_node = rb_entry(node, struct tree_node, node); -+ -+ cmp = hash_cmp(hash, tree_node->hash); -+ -+ if (cmp < 0) -+ node = node->rb_left; -+ else if (cmp > 0) -+ node = node->rb_right; -+ else -+ break; -+ } -+ -+ if (!node) -+ return NULL; -+ -+ if (tree_node->count == 1) { -+ stable_node = rb_entry(tree_node->sub_root.rb_node, -+ struct stable_node, node); -+ BUG_ON(!stable_node); -+ -+ goto get_page_out; -+ } -+ -+ /* -+ * ok, we have to search the second -+ * level subtree, hash the page to a -+ * full strength. 
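-+ * (The stable tree is two-level: first-level tree_nodes are keyed by
-+ * the sampled hash, and each holds a sub-rbtree of stable_nodes keyed
-+ * by the full-strength hash_max.)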
-+ */ -+ node = tree_node->sub_root.rb_node; -+ BUG_ON(!node); -+ hash_max = rmap_item_hash_max(item, hash); -+ -+ while (node) { -+ int cmp; -+ -+ stable_node = rb_entry(node, struct stable_node, node); -+ -+ cmp = hash_cmp(hash_max, stable_node->hash_max); -+ -+ if (cmp < 0) -+ node = node->rb_left; -+ else if (cmp > 0) -+ node = node->rb_right; -+ else -+ goto get_page_out; -+ } -+ -+ return NULL; -+ -+get_page_out: -+ page = get_uksm_page(stable_node, 1, 1); -+ return page; -+} -+ -+static int try_merge_rmap_item(struct rmap_item *item, -+ struct page *kpage, -+ struct page *tree_page) -+{ -+ struct vm_area_struct *vma = item->slot->vma; -+ struct page_vma_mapped_walk pvmw = { -+ .page = kpage, -+ .vma = vma, -+ }; -+ -+ pvmw.address = get_rmap_addr(item); -+ if (!page_vma_mapped_walk(&pvmw)) -+ return 0; -+ -+ if (pte_write(*pvmw.pte)) { -+ /* has changed, abort! */ -+ page_vma_mapped_walk_done(&pvmw); -+ return 0; -+ } -+ -+ get_page(tree_page); -+ page_add_anon_rmap(tree_page, vma, pvmw.address, false); -+ -+ flush_cache_page(vma, pvmw.address, page_to_pfn(kpage)); -+ ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte); -+ set_pte_at_notify(vma->vm_mm, pvmw.address, pvmw.pte, -+ mk_pte(tree_page, vma->vm_page_prot)); -+ -+ page_remove_rmap(kpage, false); -+ put_page(kpage); -+ -+ page_vma_mapped_walk_done(&pvmw); -+ -+ return 1; -+} -+ -+/** -+ * try_to_merge_with_stable_page() - when two rmap_items need to be inserted -+ * into stable tree, the page was found to be identical to a stable ksm page, -+ * this is the last chance we can merge them into one. -+ * -+ * @item1: the rmap_item holding the page which we wanted to insert -+ * into stable tree. -+ * @item2: the other rmap_item we found when unstable tree search -+ * @oldpage: the page currently mapped by the two rmap_items -+ * @tree_page: the page we found identical in stable tree node -+ * @success1: return if item1 is successfully merged -+ * @success2: return if item2 is successfully merged -+ */ -+static void try_merge_with_stable(struct rmap_item *item1, -+ struct rmap_item *item2, -+ struct page **kpage, -+ struct page *tree_page, -+ int *success1, int *success2) -+{ -+ struct vm_area_struct *vma1 = item1->slot->vma; -+ struct vm_area_struct *vma2 = item2->slot->vma; -+ *success1 = 0; -+ *success2 = 0; -+ -+ if (unlikely(*kpage == tree_page)) { -+ /* I don't think this can really happen */ -+ pr_warn("UKSM: unexpected condition detected in " -+ "%s -- *kpage == tree_page !\n", __func__); -+ *success1 = 1; -+ *success2 = 1; -+ return; -+ } -+ -+ if (!PageAnon(*kpage) || !PageKsm(*kpage)) -+ goto failed; -+ -+ if (!trylock_page(tree_page)) -+ goto failed; -+ -+ /* If the oldpage is still ksm and still pointed -+ * to in the right place, and still write protected, -+ * we are confident it's not changed, no need to -+ * memcmp anymore. -+ * be ware, we cannot take nested pte locks, -+ * deadlock risk. -+ */ -+ if (!try_merge_rmap_item(item1, *kpage, tree_page)) -+ goto unlock_failed; -+ -+ /* ok, then vma2, remind that pte1 already set */ -+ if (!try_merge_rmap_item(item2, *kpage, tree_page)) -+ goto success_1; -+ -+ *success2 = 1; -+success_1: -+ *success1 = 1; -+ -+ -+ if ((*success1 && vma1->vm_flags & VM_LOCKED) || -+ (*success2 && vma2->vm_flags & VM_LOCKED)) { -+ munlock_vma_page(*kpage); -+ if (!PageMlocked(tree_page)) -+ mlock_vma_page(tree_page); -+ } -+ -+ /* -+ * We do not need oldpage any more in the caller, so can break the lock -+ * now. 
-+ */ -+ unlock_page(*kpage); -+ *kpage = tree_page; /* Get unlocked outside. */ -+ return; -+ -+unlock_failed: -+ unlock_page(tree_page); -+failed: -+ return; -+} -+ -+static inline void stable_node_hash_max(struct stable_node *node, -+ struct page *page, u32 hash) -+{ -+ u32 hash_max = node->hash_max; -+ -+ if (!hash_max) { -+ hash_max = page_hash_max(page, hash); -+ node->hash_max = hash_max; -+ } -+} -+ -+static inline -+struct stable_node *new_stable_node(struct tree_node *tree_node, -+ struct page *kpage, u32 hash_max) -+{ -+ struct stable_node *new_stable_node; -+ -+ new_stable_node = alloc_stable_node(); -+ if (!new_stable_node) -+ return NULL; -+ -+ new_stable_node->kpfn = page_to_pfn(kpage); -+ new_stable_node->hash_max = hash_max; -+ new_stable_node->tree_node = tree_node; -+ set_page_stable_node(kpage, new_stable_node); -+ -+ return new_stable_node; -+} -+ -+static inline -+struct stable_node *first_level_insert(struct tree_node *tree_node, -+ struct rmap_item *rmap_item, -+ struct rmap_item *tree_rmap_item, -+ struct page **kpage, u32 hash, -+ int *success1, int *success2) -+{ -+ int cmp; -+ struct page *tree_page; -+ u32 hash_max = 0; -+ struct stable_node *stable_node, *new_snode; -+ struct rb_node *parent = NULL, **new; -+ -+ /* this tree node contains no sub-tree yet */ -+ stable_node = rb_entry(tree_node->sub_root.rb_node, -+ struct stable_node, node); -+ -+ tree_page = get_uksm_page(stable_node, 1, 0); -+ if (tree_page) { -+ cmp = memcmp_pages_with_cost(*kpage, tree_page, 1); -+ if (!cmp) { -+ try_merge_with_stable(rmap_item, tree_rmap_item, kpage, -+ tree_page, success1, success2); -+ put_page(tree_page); -+ if (!*success1 && !*success2) -+ goto failed; -+ -+ return stable_node; -+ -+ } else { -+ /* -+ * collision in first level try to create a subtree. -+ * A new node need to be created. -+ */ -+ put_page(tree_page); -+ -+ stable_node_hash_max(stable_node, tree_page, -+ tree_node->hash); -+ hash_max = rmap_item_hash_max(rmap_item, hash); -+ cmp = hash_cmp(hash_max, stable_node->hash_max); -+ -+ parent = &stable_node->node; -+ if (cmp < 0) -+ new = &parent->rb_left; -+ else if (cmp > 0) -+ new = &parent->rb_right; -+ else -+ goto failed; -+ } -+ -+ } else { -+ /* the only stable_node deleted, we reuse its tree_node. 
-+ */ -+ parent = NULL; -+ new = &tree_node->sub_root.rb_node; -+ } -+ -+ new_snode = new_stable_node(tree_node, *kpage, hash_max); -+ if (!new_snode) -+ goto failed; -+ -+ rb_link_node(&new_snode->node, parent, new); -+ rb_insert_color(&new_snode->node, &tree_node->sub_root); -+ tree_node->count++; -+ *success1 = *success2 = 1; -+ -+ return new_snode; -+ -+failed: -+ return NULL; -+} -+ -+static inline -+struct stable_node *stable_subtree_insert(struct tree_node *tree_node, -+ struct rmap_item *rmap_item, -+ struct rmap_item *tree_rmap_item, -+ struct page **kpage, u32 hash, -+ int *success1, int *success2) -+{ -+ struct page *tree_page; -+ u32 hash_max; -+ struct stable_node *stable_node, *new_snode; -+ struct rb_node *parent, **new; -+ -+research: -+ parent = NULL; -+ new = &tree_node->sub_root.rb_node; -+ BUG_ON(!*new); -+ hash_max = rmap_item_hash_max(rmap_item, hash); -+ while (*new) { -+ int cmp; -+ -+ stable_node = rb_entry(*new, struct stable_node, node); -+ -+ cmp = hash_cmp(hash_max, stable_node->hash_max); -+ -+ if (cmp < 0) { -+ parent = *new; -+ new = &parent->rb_left; -+ } else if (cmp > 0) { -+ parent = *new; -+ new = &parent->rb_right; -+ } else { -+ tree_page = get_uksm_page(stable_node, 1, 0); -+ if (tree_page) { -+ cmp = memcmp_pages_with_cost(*kpage, tree_page, 1); -+ if (!cmp) { -+ try_merge_with_stable(rmap_item, -+ tree_rmap_item, kpage, -+ tree_page, success1, success2); -+ -+ put_page(tree_page); -+ if (!*success1 && !*success2) -+ goto failed; -+ /* -+ * successfully merged with a stable -+ * node -+ */ -+ return stable_node; -+ } else { -+ put_page(tree_page); -+ goto failed; -+ } -+ } else { -+ /* -+ * stable node may be deleted, -+ * and subtree maybe -+ * restructed, cannot -+ * continue, research it. -+ */ -+ if (tree_node->count) { -+ goto research; -+ } else { -+ /* reuse the tree node*/ -+ parent = NULL; -+ new = &tree_node->sub_root.rb_node; -+ } -+ } -+ } -+ } -+ -+ new_snode = new_stable_node(tree_node, *kpage, hash_max); -+ if (!new_snode) -+ goto failed; -+ -+ rb_link_node(&new_snode->node, parent, new); -+ rb_insert_color(&new_snode->node, &tree_node->sub_root); -+ tree_node->count++; -+ *success1 = *success2 = 1; -+ -+ return new_snode; -+ -+failed: -+ return NULL; -+} -+ -+ -+/** -+ * stable_tree_insert() - try to insert a merged page in unstable tree to -+ * the stable tree -+ * -+ * @kpage: the page need to be inserted -+ * @hash: the current hash of this page -+ * @rmap_item: the rmap_item being scanned -+ * @tree_rmap_item: the rmap_item found on unstable tree -+ * @success1: return if rmap_item is merged -+ * @success2: return if tree_rmap_item is merged -+ * -+ * @return the stable_node on stable tree if at least one -+ * rmap_item is inserted into stable tree, NULL -+ * otherwise. 
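-+ *
-+ * Note (added for clarity): when no first-level tree_node matches, a new
-+ * tree_node and stable_node are created together; the stable node's
-+ * hash_max is left at 0 in that path and is only computed lazily, via
-+ * stable_node_hash_max(), on the first second-level collision.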
-+ */ -+static struct stable_node * -+stable_tree_insert(struct page **kpage, u32 hash, -+ struct rmap_item *rmap_item, -+ struct rmap_item *tree_rmap_item, -+ int *success1, int *success2) -+{ -+ struct rb_node **new = &root_stable_treep->rb_node; -+ struct rb_node *parent = NULL; -+ struct stable_node *stable_node; -+ struct tree_node *tree_node; -+ u32 hash_max = 0; -+ -+ *success1 = *success2 = 0; -+ -+ while (*new) { -+ int cmp; -+ -+ tree_node = rb_entry(*new, struct tree_node, node); -+ -+ cmp = hash_cmp(hash, tree_node->hash); -+ -+ if (cmp < 0) { -+ parent = *new; -+ new = &parent->rb_left; -+ } else if (cmp > 0) { -+ parent = *new; -+ new = &parent->rb_right; -+ } else -+ break; -+ } -+ -+ if (*new) { -+ if (tree_node->count == 1) { -+ stable_node = first_level_insert(tree_node, rmap_item, -+ tree_rmap_item, kpage, -+ hash, success1, success2); -+ } else { -+ stable_node = stable_subtree_insert(tree_node, -+ rmap_item, tree_rmap_item, kpage, -+ hash, success1, success2); -+ } -+ } else { -+ -+ /* no tree node found */ -+ tree_node = alloc_tree_node(stable_tree_node_listp); -+ if (!tree_node) { -+ stable_node = NULL; -+ goto out; -+ } -+ -+ stable_node = new_stable_node(tree_node, *kpage, hash_max); -+ if (!stable_node) { -+ free_tree_node(tree_node); -+ goto out; -+ } -+ -+ tree_node->hash = hash; -+ rb_link_node(&tree_node->node, parent, new); -+ rb_insert_color(&tree_node->node, root_stable_treep); -+ parent = NULL; -+ new = &tree_node->sub_root.rb_node; -+ -+ rb_link_node(&stable_node->node, parent, new); -+ rb_insert_color(&stable_node->node, &tree_node->sub_root); -+ tree_node->count++; -+ *success1 = *success2 = 1; -+ } -+ -+out: -+ return stable_node; -+} -+ -+ -+/** -+ * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem -+ * -+ * @return 0 on success, -EBUSY if unable to lock the mmap_sem, -+ * -EINVAL if the page mapping has been changed. -+ */ -+static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item) -+{ -+ int err; -+ -+ err = get_mergeable_page_lock_mmap(tree_rmap_item); -+ -+ if (err == -EINVAL) { -+ /* its page map has been changed, remove it */ -+ remove_rmap_item_from_tree(tree_rmap_item); -+ } -+ -+ /* The page is gotten and mmap_sem is locked now. */ -+ return err; -+} -+ -+ -+/** -+ * unstable_tree_search_insert() - search an unstable tree rmap_item with the -+ * same hash value. 
Get its page and trylock the mmap_sem.
-+ */
-+static inline
-+struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
-+ u32 hash)
-+
-+{
-+ struct rb_node **new = &root_unstable_tree.rb_node;
-+ struct rb_node *parent = NULL;
-+ struct tree_node *tree_node;
-+ u32 hash_max;
-+ struct rmap_item *tree_rmap_item;
-+
-+ while (*new) {
-+ int cmp;
-+
-+ tree_node = rb_entry(*new, struct tree_node, node);
-+
-+ cmp = hash_cmp(hash, tree_node->hash);
-+
-+ if (cmp < 0) {
-+ parent = *new;
-+ new = &parent->rb_left;
-+ } else if (cmp > 0) {
-+ parent = *new;
-+ new = &parent->rb_right;
-+ } else
-+ break;
-+ }
-+
-+ if (*new) {
-+ /* got the tree_node */
-+ if (tree_node->count == 1) {
-+ tree_rmap_item = rb_entry(tree_node->sub_root.rb_node,
-+ struct rmap_item, node);
-+ BUG_ON(!tree_rmap_item);
-+
-+ goto get_page_out;
-+ }
-+
-+ /* well, search the collision subtree */
-+ new = &tree_node->sub_root.rb_node;
-+ BUG_ON(!*new);
-+ hash_max = rmap_item_hash_max(rmap_item, hash);
-+
-+ while (*new) {
-+ int cmp;
-+
-+ tree_rmap_item = rb_entry(*new, struct rmap_item,
-+ node);
-+
-+ cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
-+ parent = *new;
-+ if (cmp < 0)
-+ new = &parent->rb_left;
-+ else if (cmp > 0)
-+ new = &parent->rb_right;
-+ else
-+ goto get_page_out;
-+ }
-+ } else {
-+ /* alloc a new tree_node */
-+ tree_node = alloc_tree_node(&unstable_tree_node_list);
-+ if (!tree_node)
-+ return NULL;
-+
-+ tree_node->hash = hash;
-+ rb_link_node(&tree_node->node, parent, new);
-+ rb_insert_color(&tree_node->node, &root_unstable_tree);
-+ parent = NULL;
-+ new = &tree_node->sub_root.rb_node;
-+ }
-+
-+ /* not found even in the sub-tree */
-+ rmap_item->tree_node = tree_node;
-+ rmap_item->address |= UNSTABLE_FLAG;
-+ rmap_item->hash_round = uksm_hash_round;
-+ rb_link_node(&rmap_item->node, parent, new);
-+ rb_insert_color(&rmap_item->node, &tree_node->sub_root);
-+
-+ uksm_pages_unshared++;
-+ return NULL;
-+
-+get_page_out:
-+ if (tree_rmap_item->page == rmap_item->page)
-+ return NULL;
-+
-+ if (get_tree_rmap_item_page(tree_rmap_item))
-+ return NULL;
-+
-+ return tree_rmap_item;
-+}
-+
-+static void hold_anon_vma(struct rmap_item *rmap_item,
-+ struct anon_vma *anon_vma)
-+{
-+ rmap_item->anon_vma = anon_vma;
-+ get_anon_vma(anon_vma);
-+}
-+
-+
-+/**
-+ * stable_tree_append() - append a rmap_item to a stable node. Deduplication
-+ * ratio statistics are done in this function.
-+ * -+ */ -+static void stable_tree_append(struct rmap_item *rmap_item, -+ struct stable_node *stable_node, int logdedup) -+{ -+ struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_cont = NULL; -+ unsigned long key = (unsigned long)rmap_item->slot; -+ unsigned long factor = rmap_item->slot->rung->step; -+ -+ BUG_ON(!stable_node); -+ rmap_item->address |= STABLE_FLAG; -+ -+ if (hlist_empty(&stable_node->hlist)) { -+ uksm_pages_shared++; -+ goto node_vma_new; -+ } else { -+ uksm_pages_sharing++; -+ } -+ -+ hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) { -+ if (node_vma->key >= key) -+ break; -+ -+ if (logdedup) { -+ node_vma->slot->pages_bemerged += factor; -+ if (list_empty(&node_vma->slot->dedup_list)) -+ list_add(&node_vma->slot->dedup_list, -+ &vma_slot_dedup); -+ } -+ } -+ -+ if (node_vma) { -+ if (node_vma->key == key) { -+ node_vma_cont = hlist_entry_safe(node_vma->hlist.next, struct node_vma, hlist); -+ goto node_vma_ok; -+ } else if (node_vma->key > key) { -+ node_vma_cont = node_vma; -+ } -+ } -+ -+node_vma_new: -+ /* no same vma already in node, alloc a new node_vma */ -+ new_node_vma = alloc_node_vma(); -+ BUG_ON(!new_node_vma); -+ new_node_vma->head = stable_node; -+ new_node_vma->slot = rmap_item->slot; -+ -+ if (!node_vma) { -+ hlist_add_head(&new_node_vma->hlist, &stable_node->hlist); -+ } else if (node_vma->key != key) { -+ if (node_vma->key < key) -+ hlist_add_behind(&new_node_vma->hlist, &node_vma->hlist); -+ else { -+ hlist_add_before(&new_node_vma->hlist, -+ &node_vma->hlist); -+ } -+ -+ } -+ node_vma = new_node_vma; -+ -+node_vma_ok: /* ok, ready to add to the list */ -+ rmap_item->head = node_vma; -+ hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist); -+ hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma); -+ if (logdedup) { -+ rmap_item->slot->pages_merged++; -+ if (node_vma_cont) { -+ node_vma = node_vma_cont; -+ hlist_for_each_entry_continue(node_vma, hlist) { -+ node_vma->slot->pages_bemerged += factor; -+ if (list_empty(&node_vma->slot->dedup_list)) -+ list_add(&node_vma->slot->dedup_list, -+ &vma_slot_dedup); -+ } -+ } -+ } -+} -+ -+/* -+ * We use break_ksm to break COW on a ksm page: it's a stripped down -+ * -+ * if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1) -+ * put_page(page); -+ * -+ * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, -+ * in case the application has unmapped and remapped mm,addr meanwhile. -+ * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP -+ * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. -+ */ -+static int break_ksm(struct vm_area_struct *vma, unsigned long addr) -+{ -+ struct page *page; -+ int ret = 0; -+ -+ do { -+ cond_resched(); -+ page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE); -+ if (IS_ERR_OR_NULL(page)) -+ break; -+ if (PageKsm(page)) { -+ ret = handle_mm_fault(vma, addr, -+ FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE, -+ NULL); -+ } else -+ ret = VM_FAULT_WRITE; -+ put_page(page); -+ } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); -+ /* -+ * We must loop because handle_mm_fault() may back out if there's -+ * any difficulty e.g. if pte accessed bit gets updated concurrently. -+ * -+ * VM_FAULT_WRITE is what we have been hoping for: it indicates that -+ * COW has been broken, even if the vma does not permit VM_WRITE; -+ * but note that a concurrent fault might break PageKsm for us. 
-+ * -+ * VM_FAULT_SIGBUS could occur if we race with truncation of the -+ * backing file, which also invalidates anonymous pages: that's -+ * okay, that truncation will have unmapped the PageKsm for us. -+ * -+ * VM_FAULT_OOM: at the time of writing (late July 2009), setting -+ * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the -+ * current task has TIF_MEMDIE set, and will be OOM killed on return -+ * to user; and ksmd, having no mm, would never be chosen for that. -+ * -+ * But if the mm is in a limited mem_cgroup, then the fault may fail -+ * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and -+ * even ksmd can fail in this way - though it's usually breaking ksm -+ * just to undo a merge it made a moment before, so unlikely to oom. -+ * -+ * That's a pity: we might therefore have more kernel pages allocated -+ * than we're counting as nodes in the stable tree; but uksm_do_scan -+ * will retry to break_cow on each pass, so should recover the page -+ * in due course. The important thing is to not let VM_MERGEABLE -+ * be cleared while any such pages might remain in the area. -+ */ -+ return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; -+} -+ -+static void break_cow(struct rmap_item *rmap_item) -+{ -+ struct vm_area_struct *vma = rmap_item->slot->vma; -+ struct mm_struct *mm = vma->vm_mm; -+ unsigned long addr = get_rmap_addr(rmap_item); -+ -+ if (uksm_test_exit(mm)) -+ goto out; -+ -+ break_ksm(vma, addr); -+out: -+ return; -+} -+ -+/* -+ * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather -+ * than check every pte of a given vma, the locking doesn't quite work for -+ * that - an rmap_item is assigned to the stable tree after inserting ksm -+ * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing -+ * rmap_items from parent to child at fork time (so as not to waste time -+ * if exit comes before the next scan reaches it). -+ * -+ * Similarly, although we'd like to remove rmap_items (so updating counts -+ * and freeing memory) when unmerging an area, it's easier to leave that -+ * to the next pass of ksmd - consider, for example, how ksmd might be -+ * in cmp_and_merge_page on one of the rmap_items we would be removing. 
-+ */
-+inline int unmerge_uksm_pages(struct vm_area_struct *vma,
-+ unsigned long start, unsigned long end)
-+{
-+ unsigned long addr;
-+ int err = 0;
-+
-+ for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
-+ if (uksm_test_exit(vma->vm_mm))
-+ break;
-+ if (signal_pending(current))
-+ err = -ERESTARTSYS;
-+ else
-+ err = break_ksm(vma, addr);
-+ }
-+ return err;
-+}
-+
-+static inline void inc_uksm_pages_scanned(void)
-+{
-+ u64 delta;
-+
-+
-+ if (uksm_pages_scanned == U64_MAX) {
-+ encode_benefit();
-+
-+ delta = uksm_pages_scanned >> pages_scanned_base;
-+
-+ if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) {
-+ pages_scanned_stored >>= 1;
-+ delta >>= 1;
-+ pages_scanned_base++;
-+ }
-+
-+ pages_scanned_stored += delta;
-+
-+ uksm_pages_scanned = uksm_pages_scanned_last = 0;
-+ }
-+
-+ uksm_pages_scanned++;
-+}
-+
-+static inline int find_zero_page_hash(int strength, u32 hash)
-+{
-+ return (zero_hash_table[strength] == hash);
-+}
-+
-+static
-+int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page)
-+{
-+ struct page *zero_page = empty_uksm_zero_page;
-+ struct mm_struct *mm = vma->vm_mm;
-+ pte_t orig_pte = __pte(0);
-+ int err = -EFAULT;
-+
-+ if (uksm_test_exit(mm))
-+ goto out;
-+
-+ if (!trylock_page(page))
-+ goto out;
-+
-+ if (!PageAnon(page))
-+ goto out_unlock;
-+
-+ if (PageTransCompound(page)) {
-+ err = split_huge_page(page);
-+ if (err)
-+ goto out_unlock;
-+ }
-+
-+ if (write_protect_page(vma, page, &orig_pte, 0) == 0) {
-+ if (is_page_full_zero(page))
-+ err = replace_page(vma, page, zero_page, orig_pte);
-+ }
-+
-+out_unlock:
-+ unlock_page(page);
-+out:
-+ return err;
-+}
-+
-+/*
-+ * cmp_and_merge_page() - first see if the page can be merged into the stable
-+ * tree; if not, compare its hash to the previous one and, if they are the
-+ * same, see if the page can be inserted into the unstable tree, or merged
-+ * with a page already there and both transferred to the stable tree.
-+ *
-+ * @page: the page for which we are searching an identical page
-+ * @rmap_item: the reverse mapping into the virtual address of this page
-+ */
-+static void cmp_and_merge_page(struct rmap_item *rmap_item, u32 hash)
-+{
-+ struct rmap_item *tree_rmap_item;
-+ struct page *page;
-+ struct page *kpage = NULL;
-+ u32 hash_max;
-+ int err;
-+ unsigned int success1, success2;
-+ struct stable_node *snode;
-+ int cmp;
-+ struct rb_node *parent = NULL, **new;
-+
-+ remove_rmap_item_from_tree(rmap_item);
-+ page = rmap_item->page;
-+
-+ /* We first start with searching the page inside the stable tree */
-+ kpage = stable_tree_search(rmap_item, hash);
-+ if (kpage) {
-+ err = try_to_merge_with_uksm_page(rmap_item, kpage,
-+ hash);
-+ if (!err) {
-+ /*
-+ * The page was successfully merged, add
-+ * its rmap_item to the stable tree.
-+ * page lock is needed because it's
-+ * racing with try_to_unmap_ksm(), etc.
-+ */
-+ lock_page(kpage);
-+ snode = page_stable_node(kpage);
-+ stable_tree_append(rmap_item, snode, 1);
-+ unlock_page(kpage);
-+ put_page(kpage);
-+ return; /* success */
-+ }
-+ put_page(kpage);
-+
-+ /*
-+ * If it's a collision and it has already been searched in the
-+ * sub-rbtree (hash_max != 0), we want to abort, because if it
-+ * is successfully merged in the unstable tree, the collision
-+ * tends to happen again.
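-+ *
-+ * Illustrative example (hypothetical values): two distinct pages
-+ * can both yield the sampled hash 0x1234 and therefore meet in the
-+ * same tree_node; only hash_max, computed over the whole page, can
-+ * tell them apart. Once they have collided at hash_max, a later
-+ * merge attempt would almost certainly fail the final memcmp again,
-+ * so we give up early.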
-+ */
-+ if (err == MERGE_ERR_COLLI && rmap_item->hash_max)
-+ return;
-+ }
-+
-+ tree_rmap_item =
-+ unstable_tree_search_insert(rmap_item, hash);
-+ if (tree_rmap_item) {
-+ err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash);
-+ /*
-+ * As soon as we merge this page, we want to remove the
-+ * rmap_item of the page we have merged with from the unstable
-+ * tree, and insert it instead as new node in the stable tree.
-+ */
-+ if (!err) {
-+ kpage = page;
-+ remove_rmap_item_from_tree(tree_rmap_item);
-+ lock_page(kpage);
-+ snode = stable_tree_insert(&kpage, hash,
-+ rmap_item, tree_rmap_item,
-+ &success1, &success2);
-+
-+ /*
-+ * Do not log dedup for the tree item, it's not counted
-+ * as scanned in this round.
-+ */
-+ if (success2)
-+ stable_tree_append(tree_rmap_item, snode, 0);
-+
-+ /*
-+ * The order of these two stable tree appends is
-+ * important: we are scanning rmap_item.
-+ */
-+ if (success1)
-+ stable_tree_append(rmap_item, snode, 1);
-+
-+ /*
-+ * The original kpage may be unlocked inside
-+ * stable_tree_insert() already. This page
-+ * should be unlocked before doing
-+ * break_cow().
-+ */
-+ unlock_page(kpage);
-+
-+ if (!success1)
-+ break_cow(rmap_item);
-+
-+ if (!success2)
-+ break_cow(tree_rmap_item);
-+
-+ } else if (err == MERGE_ERR_COLLI) {
-+ BUG_ON(tree_rmap_item->tree_node->count > 1);
-+
-+ rmap_item_hash_max(tree_rmap_item,
-+ tree_rmap_item->tree_node->hash);
-+
-+ hash_max = rmap_item_hash_max(rmap_item, hash);
-+ cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
-+ parent = &tree_rmap_item->node;
-+ if (cmp < 0)
-+ new = &parent->rb_left;
-+ else if (cmp > 0)
-+ new = &parent->rb_right;
-+ else
-+ goto put_up_out;
-+
-+ rmap_item->tree_node = tree_rmap_item->tree_node;
-+ rmap_item->address |= UNSTABLE_FLAG;
-+ rmap_item->hash_round = uksm_hash_round;
-+ rb_link_node(&rmap_item->node, parent, new);
-+ rb_insert_color(&rmap_item->node,
-+ &tree_rmap_item->tree_node->sub_root);
-+ rmap_item->tree_node->count++;
-+ } else {
-+ /*
-+ * Either one of the pages has changed or they collide
-+ * at the max hash; we consider them ill items.
-+ */ -+ remove_rmap_item_from_tree(tree_rmap_item); -+ } -+put_up_out: -+ put_page(tree_rmap_item->page); -+ mmap_read_unlock(tree_rmap_item->slot->vma->vm_mm); -+ } -+} -+ -+ -+ -+ -+static inline unsigned long get_pool_index(struct vma_slot *slot, -+ unsigned long index) -+{ -+ unsigned long pool_index; -+ -+ pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT; -+ if (pool_index >= slot->pool_size) -+ BUG(); -+ return pool_index; -+} -+ -+static inline unsigned long index_page_offset(unsigned long index) -+{ -+ return offset_in_page(sizeof(struct rmap_list_entry *) * index); -+} -+ -+static inline -+struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot, -+ unsigned long index, int need_alloc) -+{ -+ unsigned long pool_index; -+ struct page *page; -+ void *addr; -+ -+ -+ pool_index = get_pool_index(slot, index); -+ if (!slot->rmap_list_pool[pool_index]) { -+ if (!need_alloc) -+ return NULL; -+ -+ page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); -+ if (!page) -+ return NULL; -+ -+ slot->rmap_list_pool[pool_index] = page; -+ } -+ -+ addr = kmap(slot->rmap_list_pool[pool_index]); -+ addr += index_page_offset(index); -+ -+ return addr; -+} -+ -+static inline void put_rmap_list_entry(struct vma_slot *slot, -+ unsigned long index) -+{ -+ unsigned long pool_index; -+ -+ pool_index = get_pool_index(slot, index); -+ BUG_ON(!slot->rmap_list_pool[pool_index]); -+ kunmap(slot->rmap_list_pool[pool_index]); -+} -+ -+static inline int entry_is_new(struct rmap_list_entry *entry) -+{ -+ return !entry->item; -+} -+ -+static inline unsigned long get_index_orig_addr(struct vma_slot *slot, -+ unsigned long index) -+{ -+ return slot->vma->vm_start + (index << PAGE_SHIFT); -+} -+ -+static inline unsigned long get_entry_address(struct rmap_list_entry *entry) -+{ -+ unsigned long addr; -+ -+ if (is_addr(entry->addr)) -+ addr = get_clean_addr(entry->addr); -+ else if (entry->item) -+ addr = get_rmap_addr(entry->item); -+ else -+ BUG(); -+ -+ return addr; -+} -+ -+static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry) -+{ -+ if (is_addr(entry->addr)) -+ return NULL; -+ -+ return entry->item; -+} -+ -+static inline void inc_rmap_list_pool_count(struct vma_slot *slot, -+ unsigned long index) -+{ -+ unsigned long pool_index; -+ -+ pool_index = get_pool_index(slot, index); -+ BUG_ON(!slot->rmap_list_pool[pool_index]); -+ slot->pool_counts[pool_index]++; -+} -+ -+static inline void dec_rmap_list_pool_count(struct vma_slot *slot, -+ unsigned long index) -+{ -+ unsigned long pool_index; -+ -+ pool_index = get_pool_index(slot, index); -+ BUG_ON(!slot->rmap_list_pool[pool_index]); -+ BUG_ON(!slot->pool_counts[pool_index]); -+ slot->pool_counts[pool_index]--; -+} -+ -+static inline int entry_has_rmap(struct rmap_list_entry *entry) -+{ -+ return !is_addr(entry->addr) && entry->item; -+} -+ -+static inline void swap_entries(struct rmap_list_entry *entry1, -+ unsigned long index1, -+ struct rmap_list_entry *entry2, -+ unsigned long index2) -+{ -+ struct rmap_list_entry tmp; -+ -+ /* swapping two new entries is meaningless */ -+ BUG_ON(entry_is_new(entry1) && entry_is_new(entry2)); -+ -+ tmp = *entry1; -+ *entry1 = *entry2; -+ *entry2 = tmp; -+ -+ if (entry_has_rmap(entry1)) -+ entry1->item->entry_index = index1; -+ -+ if (entry_has_rmap(entry2)) -+ entry2->item->entry_index = index2; -+ -+ if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) { -+ inc_rmap_list_pool_count(entry1->item->slot, index1); -+ dec_rmap_list_pool_count(entry1->item->slot, index2); -+ } 
else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) {
-+ inc_rmap_list_pool_count(entry2->item->slot, index2);
-+ dec_rmap_list_pool_count(entry2->item->slot, index1);
-+ }
-+}
-+
-+static inline void free_entry_item(struct rmap_list_entry *entry)
-+{
-+ unsigned long index;
-+ struct rmap_item *item;
-+
-+ if (!is_addr(entry->addr)) {
-+ BUG_ON(!entry->item);
-+ item = entry->item;
-+ entry->addr = get_rmap_addr(item);
-+ set_is_addr(entry->addr);
-+ index = item->entry_index;
-+ remove_rmap_item_from_tree(item);
-+ dec_rmap_list_pool_count(item->slot, index);
-+ free_rmap_item(item);
-+ }
-+}
-+
-+static inline int pool_entry_boundary(unsigned long index)
-+{
-+ unsigned long linear_addr;
-+
-+ linear_addr = sizeof(struct rmap_list_entry *) * index;
-+ return index && !offset_in_page(linear_addr);
-+}
-+
-+static inline void try_free_last_pool(struct vma_slot *slot,
-+ unsigned long index)
-+{
-+ unsigned long pool_index;
-+
-+ pool_index = get_pool_index(slot, index);
-+ if (slot->rmap_list_pool[pool_index] &&
-+ !slot->pool_counts[pool_index]) {
-+ __free_page(slot->rmap_list_pool[pool_index]);
-+ slot->rmap_list_pool[pool_index] = NULL;
-+ slot->flags |= UKSM_SLOT_NEED_SORT;
-+ }
-+
-+}
-+
-+static inline unsigned long vma_item_index(struct vm_area_struct *vma,
-+ struct rmap_item *item)
-+{
-+ return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT;
-+}
-+
-+static int within_same_pool(struct vma_slot *slot,
-+ unsigned long i, unsigned long j)
-+{
-+ unsigned long pool_i, pool_j;
-+
-+ pool_i = get_pool_index(slot, i);
-+ pool_j = get_pool_index(slot, j);
-+
-+ return (pool_i == pool_j);
-+}
-+
-+static void sort_rmap_entry_list(struct vma_slot *slot)
-+{
-+ unsigned long i, j;
-+ struct rmap_list_entry *entry, *swap_entry;
-+
-+ entry = get_rmap_list_entry(slot, 0, 0);
-+ for (i = 0; i < slot->pages; ) {
-+
-+ if (!entry)
-+ goto skip_whole_pool;
-+
-+ if (entry_is_new(entry))
-+ goto next_entry;
-+
-+ if (is_addr(entry->addr)) {
-+ entry->addr = 0;
-+ goto next_entry;
-+ }
-+
-+ j = vma_item_index(slot->vma, entry->item);
-+ if (j == i)
-+ goto next_entry;
-+
-+ if (within_same_pool(slot, i, j))
-+ swap_entry = entry + j - i;
-+ else
-+ swap_entry = get_rmap_list_entry(slot, j, 1);
-+
-+ swap_entries(entry, i, swap_entry, j);
-+ if (!within_same_pool(slot, i, j))
-+ put_rmap_list_entry(slot, j);
-+ continue;
-+
-+skip_whole_pool:
-+ i += PAGE_SIZE / sizeof(*entry);
-+ if (i < slot->pages)
-+ entry = get_rmap_list_entry(slot, i, 0);
-+ continue;
-+
-+next_entry:
-+ if (i >= slot->pages - 1 ||
-+ !within_same_pool(slot, i, i + 1)) {
-+ put_rmap_list_entry(slot, i);
-+ if (i + 1 < slot->pages)
-+ entry = get_rmap_list_entry(slot, i + 1, 0);
-+ } else
-+ entry++;
-+ i++;
-+ continue;
-+ }
-+
-+ /* free empty pool entries which contain no rmap_item */
-+ /* This can be simplified to rely on pool_counts alone once the
-+ * underlying bug is fixed.
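-+ *
-+ * Illustration with assumed sizes (4 KiB pages, pointer-sized
-+ * entries): one pool page holds 512 entries; if every one of them
-+ * is is_addr() or NULL, the page backs no rmap_item, so
-+ * pool_counts[i] should already be zero and the kmap() scan below
-+ * would be redundant.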
*/
-+ for (i = 0; i < slot->pool_size; i++) {
-+ unsigned char has_rmap;
-+ void *addr;
-+
-+ if (!slot->rmap_list_pool[i])
-+ continue;
-+
-+ has_rmap = 0;
-+ addr = kmap(slot->rmap_list_pool[i]);
-+ BUG_ON(!addr);
-+ for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
-+ entry = (struct rmap_list_entry *)addr + j;
-+ if (is_addr(entry->addr))
-+ continue;
-+ if (!entry->item)
-+ continue;
-+ has_rmap = 1;
-+ }
-+ kunmap(slot->rmap_list_pool[i]);
-+ if (!has_rmap) {
-+ BUG_ON(slot->pool_counts[i]);
-+ __free_page(slot->rmap_list_pool[i]);
-+ slot->rmap_list_pool[i] = NULL;
-+ }
-+ }
-+
-+ slot->flags &= ~UKSM_SLOT_NEED_SORT;
-+}
-+
-+/*
-+ * vma_fully_scanned() - whether all the pages in this slot have been scanned.
-+ */
-+static inline int vma_fully_scanned(struct vma_slot *slot)
-+{
-+ return slot->pages_scanned == slot->pages;
-+}
-+
-+/**
-+ * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to
-+ * its random permutation. The random-permutation index management code is
-+ * embedded in this function.
-+ */
-+static struct rmap_item *get_next_rmap_item(struct vma_slot *slot, u32 *hash)
-+{
-+ unsigned long rand_range, addr, swap_index, scan_index;
-+ struct rmap_item *item = NULL;
-+ struct rmap_list_entry *scan_entry, *swap_entry = NULL;
-+ struct page *page;
-+
-+ scan_index = swap_index = slot->pages_scanned % slot->pages;
-+
-+ if (pool_entry_boundary(scan_index))
-+ try_free_last_pool(slot, scan_index - 1);
-+
-+ if (vma_fully_scanned(slot)) {
-+ if (slot->flags & UKSM_SLOT_NEED_SORT)
-+ slot->flags |= UKSM_SLOT_NEED_RERAND;
-+ else
-+ slot->flags &= ~UKSM_SLOT_NEED_RERAND;
-+ if (slot->flags & UKSM_SLOT_NEED_SORT)
-+ sort_rmap_entry_list(slot);
-+ }
-+
-+ scan_entry = get_rmap_list_entry(slot, scan_index, 1);
-+ if (!scan_entry)
-+ return NULL;
-+
-+ if (entry_is_new(scan_entry)) {
-+ scan_entry->addr = get_index_orig_addr(slot, scan_index);
-+ set_is_addr(scan_entry->addr);
-+ }
-+
-+ if (slot->flags & UKSM_SLOT_NEED_RERAND) {
-+ rand_range = slot->pages - scan_index;
-+ BUG_ON(!rand_range);
-+ swap_index = scan_index + (prandom_u32() % rand_range);
-+ }
-+
-+ if (swap_index != scan_index) {
-+ swap_entry = get_rmap_list_entry(slot, swap_index, 1);
-+
-+ if (!swap_entry)
-+ return NULL;
-+
-+ if (entry_is_new(swap_entry)) {
-+ swap_entry->addr = get_index_orig_addr(slot,
-+ swap_index);
-+ set_is_addr(swap_entry->addr);
-+ }
-+ swap_entries(scan_entry, scan_index, swap_entry, swap_index);
-+ }
-+
-+ addr = get_entry_address(scan_entry);
-+ item = get_entry_item(scan_entry);
-+ BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start);
-+
-+ page = follow_page(slot->vma, addr, FOLL_GET);
-+ if (IS_ERR_OR_NULL(page))
-+ goto nopage;
-+
-+ if (!PageAnon(page))
-+ goto putpage;
-+
-+ /* check whether this is the zero page pfn or the uksm zero page */
-+ if ((page_to_pfn(page) == zero_pfn)
-+ || (page_to_pfn(page) == uksm_zero_pfn))
-+ goto putpage;
-+
-+ flush_anon_page(slot->vma, page, addr);
-+ flush_dcache_page(page);
-+
-+
-+ *hash = page_hash(page, hash_strength, 1);
-+ inc_uksm_pages_scanned();
-+ /* if the page content is all zero, re-map it to the zero page */
-+ if (find_zero_page_hash(hash_strength, *hash)) {
-+ if (!cmp_and_merge_zero_page(slot->vma, page)) {
-+ slot->pages_merged++;
-+
-+ /* For full-zero pages, no need to create rmap item */
-+ goto putpage;
-+ } else {
-+ inc_rshash_neg(memcmp_cost / 2);
-+ }
-+ }
-+
-+ if (!item) {
-+ item = alloc_rmap_item();
-+ if (item) {
-+ /* It has already been zeroed */
-+ item->slot = slot;
-+ item->address = addr;
-+ item->entry_index =
scan_index;
-+ scan_entry->item = item;
-+ inc_rmap_list_pool_count(slot, scan_index);
-+ } else
-+ goto putpage;
-+ }
-+
-+ BUG_ON(item->slot != slot);
-+ /* the page may have changed */
-+ item->page = page;
-+ put_rmap_list_entry(slot, scan_index);
-+ if (swap_entry)
-+ put_rmap_list_entry(slot, swap_index);
-+ return item;
-+
-+putpage:
-+ put_page(page);
-+ page = NULL;
-+nopage:
-+ /* no page, store addr back and free rmap_item if possible */
-+ free_entry_item(scan_entry);
-+ put_rmap_list_entry(slot, scan_index);
-+ if (swap_entry)
-+ put_rmap_list_entry(slot, swap_index);
-+ return NULL;
-+}
-+
-+static inline int in_stable_tree(struct rmap_item *rmap_item)
-+{
-+ return rmap_item->address & STABLE_FLAG;
-+}
-+
-+/**
-+ * scan_vma_one_page() - scan the next page in a vma_slot. Called with
-+ * mmap_sem locked.
-+ */
-+static noinline void scan_vma_one_page(struct vma_slot *slot)
-+{
-+ u32 hash;
-+ struct mm_struct *mm;
-+ struct rmap_item *rmap_item = NULL;
-+ struct vm_area_struct *vma = slot->vma;
-+
-+ mm = vma->vm_mm;
-+ BUG_ON(!mm);
-+ BUG_ON(!slot);
-+
-+ rmap_item = get_next_rmap_item(slot, &hash);
-+ if (!rmap_item)
-+ goto out1;
-+
-+ if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item))
-+ goto out2;
-+
-+ cmp_and_merge_page(rmap_item, hash);
-+out2:
-+ put_page(rmap_item->page);
-+out1:
-+ slot->pages_scanned++;
-+ slot->this_sampled++;
-+ if (slot->fully_scanned_round != fully_scanned_round)
-+ scanned_virtual_pages++;
-+
-+ if (vma_fully_scanned(slot))
-+ slot->fully_scanned_round = fully_scanned_round;
-+}
-+
-+static inline unsigned long rung_get_pages(struct scan_rung *rung)
-+{
-+ struct slot_tree_node *node;
-+
-+ if (!rung->vma_root.rnode)
-+ return 0;
-+
-+ node = container_of(rung->vma_root.rnode, struct slot_tree_node, snode);
-+
-+ return node->size;
-+}
-+
-+#define RUNG_SAMPLED_MIN 3
-+
-+static inline
-+void uksm_calc_rung_step(struct scan_rung *rung,
-+ unsigned long page_time, unsigned long ratio)
-+{
-+ unsigned long sampled, pages;
-+
-+ /* will be fully scanned ? */
-+ if (!rung->cover_msecs) {
-+ rung->step = 1;
-+ return;
-+ }
-+
-+ sampled = rung->cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE)
-+ * ratio / page_time;
-+
-+ /*
-+ * Before we finish a scan round and its expensive per-round jobs,
-+ * we need to have a chance to estimate the per-page time. So
-+ * the sampled number cannot be too small.
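-+ *
-+ * Worked example (all numbers hypothetical, and TIME_RATIO_SCALE
-+ * assumed to be 1000): cover_msecs = 100, ratio = 200 (20% cpu) and
-+ * page_time = 1000ns give
-+ *	sampled = 100 * (1000000 / 1000) * 200 / 1000 = 20000,
-+ * so a rung holding 1000000 pages is scanned with step = 50.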
-+ */
-+ if (sampled < RUNG_SAMPLED_MIN)
-+ sampled = RUNG_SAMPLED_MIN;
-+
-+ pages = rung_get_pages(rung);
-+ if (likely(pages > sampled))
-+ rung->step = pages / sampled;
-+ else
-+ rung->step = 1;
-+}
-+
-+static inline int step_need_recalc(struct scan_rung *rung)
-+{
-+ unsigned long pages, stepmax;
-+
-+ pages = rung_get_pages(rung);
-+ stepmax = pages / RUNG_SAMPLED_MIN;
-+
-+ return pages && (rung->step > pages ||
-+ (stepmax && rung->step > stepmax));
-+}
-+
-+static inline
-+void reset_current_scan(struct scan_rung *rung, int finished, int step_recalc)
-+{
-+ struct vma_slot *slot;
-+
-+ if (finished)
-+ rung->flags |= UKSM_RUNG_ROUND_FINISHED;
-+
-+ if (step_recalc || step_need_recalc(rung)) {
-+ uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
-+ BUG_ON(step_need_recalc(rung));
-+ }
-+
-+ slot_iter_index = prandom_u32() % rung->step;
-+ BUG_ON(!rung->vma_root.rnode);
-+ slot = sradix_tree_next(&rung->vma_root, NULL, 0, slot_iter);
-+ BUG_ON(!slot);
-+
-+ rung->current_scan = slot;
-+ rung->current_offset = slot_iter_index;
-+}
-+
-+static inline struct sradix_tree_root *slot_get_root(struct vma_slot *slot)
-+{
-+ return &slot->rung->vma_root;
-+}
-+
-+/*
-+ * Return 1 if the scan position was reset.
-+ */
-+static int advance_current_scan(struct scan_rung *rung)
-+{
-+ unsigned short n;
-+ struct vma_slot *slot, *next = NULL;
-+
-+ BUG_ON(!rung->vma_root.num);
-+
-+ slot = rung->current_scan;
-+ n = (slot->pages - rung->current_offset) % rung->step;
-+ slot_iter_index = rung->step - n;
-+ next = sradix_tree_next(&rung->vma_root, slot->snode,
-+ slot->sindex, slot_iter);
-+
-+ if (next) {
-+ rung->current_offset = slot_iter_index;
-+ rung->current_scan = next;
-+ return 0;
-+ } else {
-+ reset_current_scan(rung, 1, 0);
-+ return 1;
-+ }
-+}
-+
-+static inline void rung_rm_slot(struct vma_slot *slot)
-+{
-+ struct scan_rung *rung = slot->rung;
-+ struct sradix_tree_root *root;
-+
-+ if (rung->current_scan == slot)
-+ advance_current_scan(rung);
-+
-+ root = slot_get_root(slot);
-+ sradix_tree_delete_from_leaf(root, slot->snode, slot->sindex);
-+ slot->snode = NULL;
-+ if (step_need_recalc(rung)) {
-+ uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
-+ BUG_ON(step_need_recalc(rung));
-+ }
-+
-+ /* In case advance_current_scan loops back to this slot again */
-+ if (rung->vma_root.num && rung->current_scan == slot)
-+ reset_current_scan(slot->rung, 1, 0);
-+}
-+
-+static inline void rung_add_new_slots(struct scan_rung *rung,
-+ struct vma_slot **slots, unsigned long num)
-+{
-+ int err;
-+ struct vma_slot *slot;
-+ unsigned long i;
-+ struct sradix_tree_root *root = &rung->vma_root;
-+
-+ err = sradix_tree_enter(root, (void **)slots, num);
-+ BUG_ON(err);
-+
-+ for (i = 0; i < num; i++) {
-+ slot = slots[i];
-+ slot->rung = rung;
-+ BUG_ON(vma_fully_scanned(slot));
-+ }
-+
-+ if (rung->vma_root.num == num)
-+ reset_current_scan(rung, 0, 1);
-+}
-+
-+static inline int rung_add_one_slot(struct scan_rung *rung,
-+ struct vma_slot *slot)
-+{
-+ int err;
-+
-+ err = sradix_tree_enter(&rung->vma_root, (void **)&slot, 1);
-+ if (err)
-+ return err;
-+
-+ slot->rung = rung;
-+ if (rung->vma_root.num == 1)
-+ reset_current_scan(rung, 0, 1);
-+
-+ return 0;
-+}
-+
-+/*
-+ * Return true if the slot is deleted from its rung.
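-+ *
-+ * Illustration (hypothetical thresholds): with uksm_abundant_threshold
-+ * set to 10, a slot whose sampled dedup ratio reaches 10% is promoted
-+ * one rung up by vma_rung_up() and scanned more often, while a cold
-+ * slot drifts down through vma_rung_down(); both saturate at the ends
-+ * of uksm_scan_ladder[] instead of wrapping around.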
-+ */
-+static inline int vma_rung_enter(struct vma_slot *slot, struct scan_rung *rung)
-+{
-+ struct scan_rung *old_rung = slot->rung;
-+ int err;
-+
-+ if (old_rung == rung)
-+ return 0;
-+
-+ rung_rm_slot(slot);
-+ err = rung_add_one_slot(rung, slot);
-+ if (err) {
-+ err = rung_add_one_slot(old_rung, slot);
-+ WARN_ON(err); /* OOPS, badly OOM, we lost this slot */
-+ }
-+
-+ return 1;
-+}
-+
-+static inline int vma_rung_up(struct vma_slot *slot)
-+{
-+ struct scan_rung *rung;
-+
-+ rung = slot->rung;
-+ if (slot->rung != &uksm_scan_ladder[SCAN_LADDER_SIZE-1])
-+ rung++;
-+
-+ return vma_rung_enter(slot, rung);
-+}
-+
-+static inline int vma_rung_down(struct vma_slot *slot)
-+{
-+ struct scan_rung *rung;
-+
-+ rung = slot->rung;
-+ if (slot->rung != &uksm_scan_ladder[0])
-+ rung--;
-+
-+ return vma_rung_enter(slot, rung);
-+}
-+
-+/**
-+ * cal_dedup_ratio() - Calculate the deduplication ratio for this slot.
-+ */
-+static unsigned long cal_dedup_ratio(struct vma_slot *slot)
-+{
-+ unsigned long ret;
-+ unsigned long pages;
-+
-+ pages = slot->this_sampled;
-+ if (!pages)
-+ return 0;
-+
-+ BUG_ON(slot->pages_scanned == slot->last_scanned);
-+
-+ ret = slot->pages_merged;
-+
-+ /* Thrashing area filtering */
-+ if (ret && uksm_thrash_threshold) {
-+ if (slot->pages_cowed * 100 / slot->pages_merged
-+ > uksm_thrash_threshold) {
-+ ret = 0;
-+ } else {
-+ ret = slot->pages_merged - slot->pages_cowed;
-+ }
-+ }
-+
-+ return ret * 100 / pages;
-+}
-+
-+/**
-+ * cal_dedup_ratio_old() - Calculate the deduplication ratio for this slot.
-+ */
-+static unsigned long cal_dedup_ratio_old(struct vma_slot *slot)
-+{
-+ unsigned long ret;
-+ unsigned long pages;
-+
-+ pages = slot->pages;
-+ if (!pages)
-+ return 0;
-+
-+ ret = slot->pages_bemerged;
-+
-+ /* Thrashing area filtering */
-+ if (ret && uksm_thrash_threshold) {
-+ if (slot->pages_cowed * 100 / slot->pages_bemerged
-+ > uksm_thrash_threshold) {
-+ ret = 0;
-+ } else {
-+ ret = slot->pages_bemerged - slot->pages_cowed;
-+ }
-+ }
-+
-+ return ret * 100 / pages;
-+}
-+
-+/**
-+ * stable_node_reinsert() - When the hash_strength has been adjusted, the
-+ * stable tree needs to be restructured; this is the function that
-+ * re-inserts the stable nodes.
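-+ *
-+ * The stable tree is two levels deep. A sketch of the layout, using
-+ * only names that appear in this file:
-+ *
-+ *	root_treep: rbtree of tree_node, keyed by the sampled hash
-+ *	tree_node->sub_root: rbtree of stable_node, keyed by hash_max
-+ *
-+ * so re-insertion below first walks by hash and then, on a
-+ * first-level match, resolves collisions by hash_max.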
-+ */
-+static inline void stable_node_reinsert(struct stable_node *new_node,
-+ struct page *page,
-+ struct rb_root *root_treep,
-+ struct list_head *tree_node_listp,
-+ u32 hash)
-+{
-+ struct rb_node **new = &root_treep->rb_node;
-+ struct rb_node *parent = NULL;
-+ struct stable_node *stable_node;
-+ struct tree_node *tree_node;
-+ struct page *tree_page;
-+ int cmp;
-+
-+ while (*new) {
-+ int cmp;
-+
-+ tree_node = rb_entry(*new, struct tree_node, node);
-+
-+ cmp = hash_cmp(hash, tree_node->hash);
-+
-+ if (cmp < 0) {
-+ parent = *new;
-+ new = &parent->rb_left;
-+ } else if (cmp > 0) {
-+ parent = *new;
-+ new = &parent->rb_right;
-+ } else
-+ break;
-+ }
-+
-+ if (*new) {
-+ /* found a stable tree node with the same first-level hash */
-+ stable_node_hash_max(new_node, page, hash);
-+ if (tree_node->count == 1) {
-+ stable_node = rb_entry(tree_node->sub_root.rb_node,
-+ struct stable_node, node);
-+ tree_page = get_uksm_page(stable_node, 1, 0);
-+ if (tree_page) {
-+ stable_node_hash_max(stable_node,
-+ tree_page, hash);
-+ put_page(tree_page);
-+
-+ /* prepare for stable node insertion */
-+
-+ cmp = hash_cmp(new_node->hash_max,
-+ stable_node->hash_max);
-+ parent = &stable_node->node;
-+ if (cmp < 0)
-+ new = &parent->rb_left;
-+ else if (cmp > 0)
-+ new = &parent->rb_right;
-+ else
-+ goto failed;
-+
-+ goto add_node;
-+ } else {
-+ /* the only stable_node deleted, the tree node
-+ * was not deleted.
-+ */
-+ goto tree_node_reuse;
-+ }
-+ }
-+
-+ /* well, search the collision subtree */
-+ new = &tree_node->sub_root.rb_node;
-+ parent = NULL;
-+ BUG_ON(!*new);
-+ while (*new) {
-+ int cmp;
-+
-+ stable_node = rb_entry(*new, struct stable_node, node);
-+
-+ cmp = hash_cmp(new_node->hash_max,
-+ stable_node->hash_max);
-+
-+ if (cmp < 0) {
-+ parent = *new;
-+ new = &parent->rb_left;
-+ } else if (cmp > 0) {
-+ parent = *new;
-+ new = &parent->rb_right;
-+ } else {
-+ /* oh, no, still a collision */
-+ goto failed;
-+ }
-+ }
-+
-+ goto add_node;
-+ }
-+
-+ /* no tree node found */
-+ tree_node = alloc_tree_node(tree_node_listp);
-+ if (!tree_node) {
-+ pr_err("UKSM: memory allocation error!\n");
-+ goto failed;
-+ } else {
-+ tree_node->hash = hash;
-+ rb_link_node(&tree_node->node, parent, new);
-+ rb_insert_color(&tree_node->node, root_treep);
-+
-+tree_node_reuse:
-+ /* prepare for stable node insertion */
-+ parent = NULL;
-+ new = &tree_node->sub_root.rb_node;
-+ }
-+
-+add_node:
-+ rb_link_node(&new_node->node, parent, new);
-+ rb_insert_color(&new_node->node, &tree_node->sub_root);
-+ new_node->tree_node = tree_node;
-+ tree_node->count++;
-+ return;
-+
-+failed:
-+ /* This can only happen when two nodes have collided
-+ * in two levels.
-+ */
-+ new_node->tree_node = NULL;
-+ return;
-+}
-+
-+static inline void free_all_tree_nodes(struct list_head *list)
-+{
-+ struct tree_node *node, *tmp;
-+
-+ list_for_each_entry_safe(node, tmp, list, all_list) {
-+ free_tree_node(node);
-+ }
-+}
-+
-+/**
-+ * stable_tree_delta_hash() - Delta hash the stable tree from the previous
-+ * hash strength to the current hash_strength. It re-structures the whole
-+ * tree.
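-+ *
-+ * Illustrative cost argument (hypothetical strengths): growing
-+ * hash_strength from 128 to 160 lets delta_hash() fold just the newly
-+ * sampled words into each stored hash instead of rehashing every
-+ * stable page from scratch; only nodes that never entered the rbtree
-+ * (no tree_node) fall back to a full page_hash() below.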
-+ */
-+static inline void stable_tree_delta_hash(u32 prev_hash_strength)
-+{
-+ struct stable_node *node, *tmp;
-+ struct rb_root *root_new_treep;
-+ struct list_head *new_tree_node_listp;
-+
-+ stable_tree_index = (stable_tree_index + 1) % 2;
-+ root_new_treep = &root_stable_tree[stable_tree_index];
-+ new_tree_node_listp = &stable_tree_node_list[stable_tree_index];
-+ *root_new_treep = RB_ROOT;
-+ BUG_ON(!list_empty(new_tree_node_listp));
-+
-+ /*
-+ * we need to be safe, the node could be removed by get_uksm_page()
-+ */
-+ list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) {
-+ void *addr;
-+ struct page *node_page;
-+ u32 hash;
-+
-+ /*
-+ * We are completely re-structuring the stable nodes to a new
-+ * stable tree. We don't want to touch the old tree unlinks and
-+ * old tree_nodes. The old tree_nodes will be freed at once.
-+ */
-+ node_page = get_uksm_page(node, 0, 0);
-+ if (!node_page)
-+ continue;
-+
-+ if (node->tree_node) {
-+ hash = node->tree_node->hash;
-+
-+ addr = kmap_atomic(node_page);
-+
-+ hash = delta_hash(addr, prev_hash_strength,
-+ hash_strength, hash);
-+ kunmap_atomic(addr);
-+ } else {
-+ /*
-+ * It was not inserted into the rbtree due to a
-+ * collision in the last round of scanning.
-+ */
-+ hash = page_hash(node_page, hash_strength, 0);
-+ }
-+
-+ stable_node_reinsert(node, node_page, root_new_treep,
-+ new_tree_node_listp, hash);
-+ put_page(node_page);
-+ }
-+
-+ root_stable_treep = root_new_treep;
-+ free_all_tree_nodes(stable_tree_node_listp);
-+ BUG_ON(!list_empty(stable_tree_node_listp));
-+ stable_tree_node_listp = new_tree_node_listp;
-+}
-+
-+static inline void inc_hash_strength(unsigned long delta)
-+{
-+ hash_strength += 1 << delta;
-+ if (hash_strength > HASH_STRENGTH_MAX)
-+ hash_strength = HASH_STRENGTH_MAX;
-+}
-+
-+static inline void dec_hash_strength(unsigned long delta)
-+{
-+ unsigned long change = 1 << delta;
-+
-+ if (hash_strength <= change + 1)
-+ hash_strength = 1;
-+ else
-+ hash_strength -= change;
-+}
-+
-+static inline void inc_hash_strength_delta(void)
-+{
-+ hash_strength_delta++;
-+ if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX)
-+ hash_strength_delta = HASH_STRENGTH_DELTA_MAX;
-+}
-+
-+static inline unsigned long get_current_neg_ratio(void)
-+{
-+ u64 pos = benefit.pos;
-+ u64 neg = benefit.neg;
-+
-+ if (!neg)
-+ return 0;
-+
-+ if (!pos || neg > pos)
-+ return 100;
-+
-+ if (neg > div64_u64(U64_MAX, 100))
-+ pos = div64_u64(pos, 100);
-+ else
-+ neg *= 100;
-+
-+ return div64_u64(neg, pos);
-+}
-+
-+static inline unsigned long get_current_benefit(void)
-+{
-+ u64 pos = benefit.pos;
-+ u64 neg = benefit.neg;
-+ u64 scanned = benefit.scanned;
-+
-+ if (neg > pos)
-+ return 0;
-+
-+ return div64_u64((pos - neg), scanned);
-+}
-+
-+static inline int judge_rshash_direction(void)
-+{
-+ u64 current_neg_ratio, stable_benefit;
-+ u64 current_benefit, delta = 0;
-+ int ret = STILL;
-+
-+ /*
-+ * Try to probe a value shortly after boot, and again when the
-+ * system has stayed still for a long time.
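-+ *
-+ * Worked example (hypothetical counters): benefit.pos = 1000 and
-+ * benefit.neg = 100 give a negative ratio of 100 * 100 / 1000 = 10%,
-+ * far below the 90% that forces GO_UP; with benefit.scanned = 300
-+ * the current benefit is (1000 - 100) / 300 = 3, which is then
-+ * compared against the recorded stable_benefit.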
-+ */ -+ if ((fully_scanned_round & 0xFFULL) == 10) { -+ ret = OBSCURE; -+ goto out; -+ } -+ -+ current_neg_ratio = get_current_neg_ratio(); -+ -+ if (current_neg_ratio == 0) { -+ rshash_neg_cont_zero++; -+ if (rshash_neg_cont_zero > 2) -+ return GO_DOWN; -+ else -+ return STILL; -+ } -+ rshash_neg_cont_zero = 0; -+ -+ if (current_neg_ratio > 90) { -+ ret = GO_UP; -+ goto out; -+ } -+ -+ current_benefit = get_current_benefit(); -+ stable_benefit = rshash_state.stable_benefit; -+ -+ if (!stable_benefit) { -+ ret = OBSCURE; -+ goto out; -+ } -+ -+ if (current_benefit > stable_benefit) -+ delta = current_benefit - stable_benefit; -+ else if (current_benefit < stable_benefit) -+ delta = stable_benefit - current_benefit; -+ -+ delta = div64_u64(100 * delta, stable_benefit); -+ -+ if (delta > 50) { -+ rshash_cont_obscure++; -+ if (rshash_cont_obscure > 2) -+ return OBSCURE; -+ else -+ return STILL; -+ } -+ -+out: -+ rshash_cont_obscure = 0; -+ return ret; -+} -+ -+/** -+ * rshash_adjust() - The main function to control the random sampling state -+ * machine for hash strength adapting. -+ * -+ * return true if hash_strength has changed. -+ */ -+static inline int rshash_adjust(void) -+{ -+ unsigned long prev_hash_strength = hash_strength; -+ -+ if (!encode_benefit()) -+ return 0; -+ -+ switch (rshash_state.state) { -+ case RSHASH_STILL: -+ switch (judge_rshash_direction()) { -+ case GO_UP: -+ if (rshash_state.pre_direct == GO_DOWN) -+ hash_strength_delta = 0; -+ -+ inc_hash_strength(hash_strength_delta); -+ inc_hash_strength_delta(); -+ rshash_state.stable_benefit = get_current_benefit(); -+ rshash_state.pre_direct = GO_UP; -+ break; -+ -+ case GO_DOWN: -+ if (rshash_state.pre_direct == GO_UP) -+ hash_strength_delta = 0; -+ -+ dec_hash_strength(hash_strength_delta); -+ inc_hash_strength_delta(); -+ rshash_state.stable_benefit = get_current_benefit(); -+ rshash_state.pre_direct = GO_DOWN; -+ break; -+ -+ case OBSCURE: -+ rshash_state.stable_point = hash_strength; -+ rshash_state.turn_point_down = hash_strength; -+ rshash_state.turn_point_up = hash_strength; -+ rshash_state.turn_benefit_down = get_current_benefit(); -+ rshash_state.turn_benefit_up = get_current_benefit(); -+ rshash_state.lookup_window_index = 0; -+ rshash_state.state = RSHASH_TRYDOWN; -+ dec_hash_strength(hash_strength_delta); -+ inc_hash_strength_delta(); -+ break; -+ -+ case STILL: -+ break; -+ default: -+ BUG(); -+ } -+ break; -+ -+ case RSHASH_TRYDOWN: -+ if (rshash_state.lookup_window_index++ % 5 == 0) -+ rshash_state.below_count = 0; -+ -+ if (get_current_benefit() < rshash_state.stable_benefit) -+ rshash_state.below_count++; -+ else if (get_current_benefit() > -+ rshash_state.turn_benefit_down) { -+ rshash_state.turn_point_down = hash_strength; -+ rshash_state.turn_benefit_down = get_current_benefit(); -+ } -+ -+ if (rshash_state.below_count >= 3 || -+ judge_rshash_direction() == GO_UP || -+ hash_strength == 1) { -+ hash_strength = rshash_state.stable_point; -+ hash_strength_delta = 0; -+ inc_hash_strength(hash_strength_delta); -+ inc_hash_strength_delta(); -+ rshash_state.lookup_window_index = 0; -+ rshash_state.state = RSHASH_TRYUP; -+ hash_strength_delta = 0; -+ } else { -+ dec_hash_strength(hash_strength_delta); -+ inc_hash_strength_delta(); -+ } -+ break; -+ -+ case RSHASH_TRYUP: -+ if (rshash_state.lookup_window_index++ % 5 == 0) -+ rshash_state.below_count = 0; -+ -+ if (get_current_benefit() < rshash_state.turn_benefit_down) -+ rshash_state.below_count++; -+ else if (get_current_benefit() > 
rshash_state.turn_benefit_up) { -+ rshash_state.turn_point_up = hash_strength; -+ rshash_state.turn_benefit_up = get_current_benefit(); -+ } -+ -+ if (rshash_state.below_count >= 3 || -+ judge_rshash_direction() == GO_DOWN || -+ hash_strength == HASH_STRENGTH_MAX) { -+ hash_strength = rshash_state.turn_benefit_up > -+ rshash_state.turn_benefit_down ? -+ rshash_state.turn_point_up : -+ rshash_state.turn_point_down; -+ -+ rshash_state.state = RSHASH_PRE_STILL; -+ } else { -+ inc_hash_strength(hash_strength_delta); -+ inc_hash_strength_delta(); -+ } -+ -+ break; -+ -+ case RSHASH_NEW: -+ case RSHASH_PRE_STILL: -+ rshash_state.stable_benefit = get_current_benefit(); -+ rshash_state.state = RSHASH_STILL; -+ hash_strength_delta = 0; -+ break; -+ default: -+ BUG(); -+ } -+ -+ /* rshash_neg = rshash_pos = 0; */ -+ reset_benefit(); -+ -+ if (prev_hash_strength != hash_strength) -+ stable_tree_delta_hash(prev_hash_strength); -+ -+ return prev_hash_strength != hash_strength; -+} -+ -+/** -+ * round_update_ladder() - The main function to do update of all the -+ * adjustments whenever a scan round is finished. -+ */ -+static noinline void round_update_ladder(void) -+{ -+ int i; -+ unsigned long dedup; -+ struct vma_slot *slot, *tmp_slot; -+ -+ for (i = 0; i < SCAN_LADDER_SIZE; i++) -+ uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED; -+ -+ list_for_each_entry_safe(slot, tmp_slot, &vma_slot_dedup, dedup_list) { -+ -+ /* slot may be rung_rm_slot() when mm exits */ -+ if (slot->snode) { -+ dedup = cal_dedup_ratio_old(slot); -+ if (dedup && dedup >= uksm_abundant_threshold) -+ vma_rung_up(slot); -+ } -+ -+ slot->pages_bemerged = 0; -+ slot->pages_cowed = 0; -+ -+ list_del_init(&slot->dedup_list); -+ } -+} -+ -+static void uksm_del_vma_slot(struct vma_slot *slot) -+{ -+ int i, j; -+ struct rmap_list_entry *entry; -+ -+ if (slot->snode) { -+ /* -+ * In case it just failed when entering the rung, it's not -+ * necessary. 
-+ */
-+ rung_rm_slot(slot);
-+ }
-+
-+ if (!list_empty(&slot->dedup_list))
-+ list_del(&slot->dedup_list);
-+
-+ if (!slot->rmap_list_pool || !slot->pool_counts) {
-+ /* In case it OOMed in uksm_vma_enter() */
-+ goto out;
-+ }
-+
-+ for (i = 0; i < slot->pool_size; i++) {
-+ void *addr;
-+
-+ if (!slot->rmap_list_pool[i])
-+ continue;
-+
-+ addr = kmap(slot->rmap_list_pool[i]);
-+ for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
-+ entry = (struct rmap_list_entry *)addr + j;
-+ if (is_addr(entry->addr))
-+ continue;
-+ if (!entry->item)
-+ continue;
-+
-+ remove_rmap_item_from_tree(entry->item);
-+ free_rmap_item(entry->item);
-+ slot->pool_counts[i]--;
-+ }
-+ BUG_ON(slot->pool_counts[i]);
-+ kunmap(slot->rmap_list_pool[i]);
-+ __free_page(slot->rmap_list_pool[i]);
-+ }
-+ kfree(slot->rmap_list_pool);
-+ kfree(slot->pool_counts);
-+
-+out:
-+ slot->rung = NULL;
-+ if (slot->flags & UKSM_SLOT_IN_UKSM) {
-+ BUG_ON(uksm_pages_total < slot->pages);
-+ uksm_pages_total -= slot->pages;
-+ }
-+
-+ if (slot->fully_scanned_round == fully_scanned_round)
-+ scanned_virtual_pages -= slot->pages;
-+ else
-+ scanned_virtual_pages -= slot->pages_scanned;
-+ free_vma_slot(slot);
-+}
-+
-+
-+#define SPIN_LOCK_PERIOD 32
-+static struct vma_slot *cleanup_slots[SPIN_LOCK_PERIOD];
-+static inline void cleanup_vma_slots(void)
-+{
-+ struct vma_slot *slot;
-+ int i;
-+
-+ i = 0;
-+ spin_lock(&vma_slot_list_lock);
-+ while (!list_empty(&vma_slot_del)) {
-+ slot = list_entry(vma_slot_del.next,
-+ struct vma_slot, slot_list);
-+ list_del(&slot->slot_list);
-+ cleanup_slots[i++] = slot;
-+ if (i == SPIN_LOCK_PERIOD) {
-+ spin_unlock(&vma_slot_list_lock);
-+ while (--i >= 0)
-+ uksm_del_vma_slot(cleanup_slots[i]);
-+ i = 0;
-+ spin_lock(&vma_slot_list_lock);
-+ }
-+ }
-+ spin_unlock(&vma_slot_list_lock);
-+
-+ while (--i >= 0)
-+ uksm_del_vma_slot(cleanup_slots[i]);
-+}
-+
-+/*
-+ * Exponential moving average formula
-+ */
-+static inline unsigned long ema(unsigned long curr, unsigned long last_ema)
-+{
-+ /*
-+ * For a very high burst, even the EMA cannot work well: a falsely
-+ * high per-page time estimate feeds back into a very high context
-+ * switch and rung update overhead, which raises the per-page time
-+ * further, so the estimate may never converge.
-+ *
-+ * Instead, we approach such a value in a binary manner: e.g. a curr
-+ * more than ten times last_ema only doubles the estimate.
-+ */
-+ if (curr > last_ema * 10)
-+ return last_ema * 2;
-+
-+ return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100;
-+}
-+
-+/*
-+ * convert cpu ratio in 1/TIME_RATIO_SCALE configured by user to
-+ * nanoseconds based on current uksm_sleep_jiffies.
-+ */
-+static inline unsigned long cpu_ratio_to_nsec(unsigned int ratio)
-+{
-+ return NSEC_PER_USEC * jiffies_to_usecs(uksm_sleep_jiffies) /
-+ (TIME_RATIO_SCALE - ratio) * ratio;
-+}
-+
-+
-+static inline unsigned long rung_real_ratio(int cpu_time_ratio)
-+{
-+ unsigned long ret;
-+
-+ BUG_ON(!cpu_time_ratio);
-+
-+ if (cpu_time_ratio > 0)
-+ ret = cpu_time_ratio;
-+ else
-+ ret = (unsigned long)(-cpu_time_ratio) *
-+ uksm_max_cpu_percentage / 100UL;
-+
-+ return ret ?
ret : 1; -+} -+ -+static noinline void uksm_calc_scan_pages(void) -+{ -+ struct scan_rung *ladder = uksm_scan_ladder; -+ unsigned long sleep_usecs, nsecs; -+ unsigned long ratio; -+ int i; -+ unsigned long per_page; -+ -+ if (uksm_ema_page_time > 100000 || -+ (((unsigned long) uksm_eval_round & (256UL - 1)) == 0UL)) -+ uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT; -+ -+ per_page = uksm_ema_page_time; -+ BUG_ON(!per_page); -+ -+ /* -+ * For every 8 eval round, we try to probe a uksm_sleep_jiffies value -+ * based on saved user input. -+ */ -+ if (((unsigned long) uksm_eval_round & (8UL - 1)) == 0UL) -+ uksm_sleep_jiffies = uksm_sleep_saved; -+ -+ /* We require a rung scan at least 1 page in a period. */ -+ nsecs = per_page; -+ ratio = rung_real_ratio(ladder[0].cpu_ratio); -+ if (cpu_ratio_to_nsec(ratio) < nsecs) { -+ sleep_usecs = nsecs * (TIME_RATIO_SCALE - ratio) / ratio -+ / NSEC_PER_USEC; -+ uksm_sleep_jiffies = usecs_to_jiffies(sleep_usecs) + 1; -+ } -+ -+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { -+ ratio = rung_real_ratio(ladder[i].cpu_ratio); -+ ladder[i].pages_to_scan = cpu_ratio_to_nsec(ratio) / -+ per_page; -+ BUG_ON(!ladder[i].pages_to_scan); -+ uksm_calc_rung_step(&ladder[i], per_page, ratio); -+ } -+} -+ -+/* -+ * From the scan time of this round (ns) to next expected min sleep time -+ * (ms), be careful of the possible overflows. ratio is taken from -+ * rung_real_ratio() -+ */ -+static inline -+unsigned int scan_time_to_sleep(unsigned long long scan_time, unsigned long ratio) -+{ -+ scan_time >>= 20; /* to msec level now */ -+ BUG_ON(scan_time > (ULONG_MAX / TIME_RATIO_SCALE)); -+ -+ return (unsigned int) ((unsigned long) scan_time * -+ (TIME_RATIO_SCALE - ratio) / ratio); -+} -+ -+#define __round_mask(x, y) ((__typeof__(x))((y)-1)) -+#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) -+ -+static void uksm_vma_enter(struct vma_slot **slots, unsigned long num) -+{ -+ struct scan_rung *rung; -+ -+ rung = &uksm_scan_ladder[0]; -+ rung_add_new_slots(rung, slots, num); -+} -+ -+static struct vma_slot *batch_slots[SLOT_TREE_NODE_STORE_SIZE]; -+ -+static void uksm_enter_all_slots(void) -+{ -+ struct vma_slot *slot; -+ unsigned long index; -+ struct list_head empty_vma_list; -+ int i; -+ -+ i = 0; -+ index = 0; -+ INIT_LIST_HEAD(&empty_vma_list); -+ -+ spin_lock(&vma_slot_list_lock); -+ while (!list_empty(&vma_slot_new)) { -+ slot = list_entry(vma_slot_new.next, -+ struct vma_slot, slot_list); -+ -+ if (!slot->vma->anon_vma) { -+ list_move(&slot->slot_list, &empty_vma_list); -+ } else if (vma_can_enter(slot->vma)) { -+ batch_slots[index++] = slot; -+ list_del_init(&slot->slot_list); -+ } else { -+ list_move(&slot->slot_list, &vma_slot_noadd); -+ } -+ -+ if (++i == SPIN_LOCK_PERIOD || -+ (index && !(index % SLOT_TREE_NODE_STORE_SIZE))) { -+ spin_unlock(&vma_slot_list_lock); -+ -+ if (index && !(index % SLOT_TREE_NODE_STORE_SIZE)) { -+ uksm_vma_enter(batch_slots, index); -+ index = 0; -+ } -+ i = 0; -+ cond_resched(); -+ spin_lock(&vma_slot_list_lock); -+ } -+ } -+ -+ list_splice(&empty_vma_list, &vma_slot_new); -+ -+ spin_unlock(&vma_slot_list_lock); -+ -+ if (index) -+ uksm_vma_enter(batch_slots, index); -+ -+} -+ -+static inline int rung_round_finished(struct scan_rung *rung) -+{ -+ return rung->flags & UKSM_RUNG_ROUND_FINISHED; -+} -+ -+static inline void judge_slot(struct vma_slot *slot) -+{ -+ struct scan_rung *rung = slot->rung; -+ unsigned long dedup; -+ int deleted; -+ -+ dedup = cal_dedup_ratio(slot); -+ if (vma_fully_scanned(slot) && uksm_thrash_threshold) -+ deleted = 
vma_rung_enter(slot, &uksm_scan_ladder[0]); -+ else if (dedup && dedup >= uksm_abundant_threshold) -+ deleted = vma_rung_up(slot); -+ else -+ deleted = vma_rung_down(slot); -+ -+ slot->pages_merged = 0; -+ slot->pages_cowed = 0; -+ slot->this_sampled = 0; -+ -+ if (vma_fully_scanned(slot)) -+ slot->pages_scanned = 0; -+ -+ slot->last_scanned = slot->pages_scanned; -+ -+ /* If its deleted in above, then rung was already advanced. */ -+ if (!deleted) -+ advance_current_scan(rung); -+} -+ -+ -+static inline int hash_round_finished(void) -+{ -+ if (scanned_virtual_pages > (uksm_pages_total >> 2)) { -+ scanned_virtual_pages = 0; -+ if (uksm_pages_scanned) -+ fully_scanned_round++; -+ -+ return 1; -+ } else { -+ return 0; -+ } -+} -+ -+#define UKSM_MMSEM_BATCH 5 -+#define BUSY_RETRY 100 -+ -+/** -+ * uksm_do_scan() - the main worker function. -+ */ -+static noinline void uksm_do_scan(void) -+{ -+ struct vma_slot *slot, *iter; -+ struct mm_struct *busy_mm; -+ unsigned char round_finished, all_rungs_emtpy; -+ int i, err, mmsem_batch; -+ unsigned long pcost; -+ long long delta_exec; -+ unsigned long vpages, max_cpu_ratio; -+ unsigned long long start_time, end_time, scan_time; -+ unsigned int expected_jiffies; -+ -+ might_sleep(); -+ -+ vpages = 0; -+ -+ start_time = task_sched_runtime(current); -+ max_cpu_ratio = 0; -+ mmsem_batch = 0; -+ -+ for (i = 0; i < SCAN_LADDER_SIZE;) { -+ struct scan_rung *rung = &uksm_scan_ladder[i]; -+ unsigned long ratio; -+ int busy_retry; -+ -+ if (!rung->pages_to_scan) { -+ i++; -+ continue; -+ } -+ -+ if (!rung->vma_root.num) { -+ rung->pages_to_scan = 0; -+ i++; -+ continue; -+ } -+ -+ ratio = rung_real_ratio(rung->cpu_ratio); -+ if (ratio > max_cpu_ratio) -+ max_cpu_ratio = ratio; -+ -+ busy_retry = BUSY_RETRY; -+ /* -+ * Do not consider rung_round_finished() here, just used up the -+ * rung->pages_to_scan quota. 
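-+ *
-+ * Walkthrough of the batching below (hypothetical slot): once a
-+ * slot's mmap_sem is taken, up to UKSM_MMSEM_BATCH (5) consecutive
-+ * pages of that slot are scanned under the one read lock before it
-+ * is dropped, unless the slot is fully scanned or its mm exits
-+ * first; this bounds both the locking traffic and the lock hold
-+ * time.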
-+ */
-+ while (rung->pages_to_scan && rung->vma_root.num &&
-+ likely(!freezing(current))) {
-+ int reset = 0;
-+
-+ slot = rung->current_scan;
-+
-+ BUG_ON(vma_fully_scanned(slot));
-+
-+ if (mmsem_batch)
-+ err = 0;
-+ else
-+ err = try_down_read_slot_mmap_sem(slot);
-+
-+ if (err == -ENOENT) {
-+rm_slot:
-+ rung_rm_slot(slot);
-+ continue;
-+ }
-+
-+ busy_mm = slot->mm;
-+
-+ if (err == -EBUSY) {
-+ /* skip other vmas on the same mm */
-+ do {
-+ reset = advance_current_scan(rung);
-+ iter = rung->current_scan;
-+ busy_retry--;
-+ if (iter->vma->vm_mm != busy_mm ||
-+ !busy_retry || reset)
-+ break;
-+ } while (1);
-+
-+ if (iter->vma->vm_mm != busy_mm) {
-+ continue;
-+ } else {
-+ /* scan round finished */
-+ break;
-+ }
-+ }
-+
-+ BUG_ON(!vma_can_enter(slot->vma));
-+ if (uksm_test_exit(slot->vma->vm_mm)) {
-+ mmsem_batch = 0;
-+ mmap_read_unlock(slot->vma->vm_mm);
-+ goto rm_slot;
-+ }
-+
-+ if (mmsem_batch)
-+ mmsem_batch--;
-+ else
-+ mmsem_batch = UKSM_MMSEM_BATCH;
-+
-+ /* Ok, we have taken the mmap_sem, ready to scan */
-+ scan_vma_one_page(slot);
-+ rung->pages_to_scan--;
-+ vpages++;
-+
-+ if (rung->current_offset + rung->step > slot->pages - 1
-+ || vma_fully_scanned(slot)) {
-+ mmap_read_unlock(slot->vma->vm_mm);
-+ judge_slot(slot);
-+ mmsem_batch = 0;
-+ } else {
-+ rung->current_offset += rung->step;
-+ if (!mmsem_batch)
-+ mmap_read_unlock(slot->vma->vm_mm);
-+ }
-+
-+ busy_retry = BUSY_RETRY;
-+ cond_resched();
-+ }
-+
-+ if (mmsem_batch) {
-+ mmap_read_unlock(slot->vma->vm_mm);
-+ mmsem_batch = 0;
-+ }
-+
-+ if (freezing(current))
-+ break;
-+
-+ cond_resched();
-+ }
-+ end_time = task_sched_runtime(current);
-+ delta_exec = end_time - start_time;
-+
-+ if (freezing(current))
-+ return;
-+
-+ cleanup_vma_slots();
-+ uksm_enter_all_slots();
-+
-+ round_finished = 1;
-+ all_rungs_emtpy = 1;
-+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
-+ struct scan_rung *rung = &uksm_scan_ladder[i];
-+
-+ if (rung->vma_root.num) {
-+ all_rungs_emtpy = 0;
-+ if (!rung_round_finished(rung))
-+ round_finished = 0;
-+ }
-+ }
-+
-+ if (all_rungs_emtpy)
-+ round_finished = 0;
-+
-+ if (round_finished) {
-+ round_update_ladder();
-+ uksm_eval_round++;
-+
-+ if (hash_round_finished() && rshash_adjust()) {
-+ /* Reset the unstable root iff hash strength changed */
-+ uksm_hash_round++;
-+ root_unstable_tree = RB_ROOT;
-+ free_all_tree_nodes(&unstable_tree_node_list);
-+ }
-+
-+ /*
-+ * A number of pages can hang around indefinitely on per-cpu
-+ * pagevecs, raised page count preventing write_protect_page
-+ * from merging them. Though it doesn't really matter much,
-+ * it is puzzling to see some stuck in pages_volatile until
-+ * other activity jostles them out, and they also prevented
-+ * LTP's KSM test from succeeding deterministically; so drain
-+ * them here (here rather than on entry to uksm_do_scan(),
-+ * so we don't IPI too often when pages_to_scan is set low).
-+ */
-+ lru_add_drain_all();
-+ }
-+
-+
-+ if (vpages && delta_exec > 0) {
-+ pcost = (unsigned long) delta_exec / vpages;
-+ if (likely(uksm_ema_page_time))
-+ uksm_ema_page_time = ema(pcost, uksm_ema_page_time);
-+ else
-+ uksm_ema_page_time = pcost;
-+ }
-+
-+ uksm_calc_scan_pages();
-+ uksm_sleep_real = uksm_sleep_jiffies;
-+ /* in case of radical cpu bursts, apply the upper bound */
-+ end_time = task_sched_runtime(current);
-+ if (max_cpu_ratio && end_time > start_time) {
-+ scan_time = end_time - start_time;
-+ expected_jiffies = msecs_to_jiffies(
-+ scan_time_to_sleep(scan_time, max_cpu_ratio));
-+
-+ if (expected_jiffies > uksm_sleep_real)
-+ uksm_sleep_real = expected_jiffies;
-+
-+ /* We have a one second upper bound for responsiveness. */
-+ if (jiffies_to_msecs(uksm_sleep_real) > MSEC_PER_SEC)
-+ uksm_sleep_real = msecs_to_jiffies(1000);
-+ }
-+
-+ return;
-+}
-+
-+static int ksmd_should_run(void)
-+{
-+ return uksm_run & UKSM_RUN_MERGE;
-+}
-+
-+static int uksm_scan_thread(void *nothing)
-+{
-+ set_freezable();
-+ set_user_nice(current, 5);
-+
-+ while (!kthread_should_stop()) {
-+ mutex_lock(&uksm_thread_mutex);
-+ if (ksmd_should_run())
-+ uksm_do_scan();
-+ mutex_unlock(&uksm_thread_mutex);
-+
-+ try_to_freeze();
-+
-+ if (ksmd_should_run()) {
-+ schedule_timeout_interruptible(uksm_sleep_real);
-+ uksm_sleep_times++;
-+ } else {
-+ wait_event_freezable(uksm_thread_wait,
-+ ksmd_should_run() || kthread_should_stop());
-+ }
-+ }
-+ return 0;
-+}
-+
-+void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
-+{
-+ struct stable_node *stable_node;
-+ struct node_vma *node_vma;
-+ struct rmap_item *rmap_item;
-+ int search_new_forks = 0;
-+ unsigned long address;
-+
-+ VM_BUG_ON_PAGE(!PageKsm(page), page);
-+ VM_BUG_ON_PAGE(!PageLocked(page), page);
-+
-+ stable_node = page_stable_node(page);
-+ if (!stable_node)
-+ return;
-+again:
-+ hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) {
-+ hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) {
-+ struct anon_vma *anon_vma = rmap_item->anon_vma;
-+ struct anon_vma_chain *vmac;
-+ struct vm_area_struct *vma;
-+
-+ cond_resched();
-+ anon_vma_lock_read(anon_vma);
-+ anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
-+ 0, ULONG_MAX) {
-+ cond_resched();
-+ vma = vmac->vma;
-+ address = get_rmap_addr(rmap_item);
-+
-+ if (address < vma->vm_start ||
-+ address >= vma->vm_end)
-+ continue;
-+
-+ if ((rmap_item->slot->vma == vma) ==
-+ search_new_forks)
-+ continue;
-+
-+ if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
-+ continue;
-+
-+ if (!rwc->rmap_one(page, vma, address, rwc->arg)) {
-+ anon_vma_unlock_read(anon_vma);
-+ return;
-+ }
-+
-+ if (rwc->done && rwc->done(page)) {
-+ anon_vma_unlock_read(anon_vma);
-+ return;
-+ }
-+ }
-+ anon_vma_unlock_read(anon_vma);
-+ }
-+ }
-+ if (!search_new_forks++)
-+ goto again;
-+}
-+
-+#ifdef CONFIG_MIGRATION
-+/* Common ksm interface but may be specific to uksm */
-+void ksm_migrate_page(struct page *newpage, struct page *oldpage)
-+{
-+ struct stable_node *stable_node;
-+
-+ VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
-+ VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
-+ VM_BUG_ON(newpage->mapping != oldpage->mapping);
-+
-+ stable_node = page_stable_node(newpage);
-+ if (stable_node) {
-+ VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
-+ stable_node->kpfn = page_to_pfn(newpage);
-+ /*
-+ * newpage->mapping was set in advance; now we need smp_wmb()
-+ * to make sure that the new stable_node->kpfn is visible
-+ * to get_ksm_page() before it
can see that oldpage->mapping -+ * has gone stale (or that PageSwapCache has been cleared). -+ */ -+ smp_wmb(); -+ set_page_stable_node(oldpage, NULL); -+ } -+} -+#endif /* CONFIG_MIGRATION */ -+ -+#ifdef CONFIG_MEMORY_HOTREMOVE -+static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn, -+ unsigned long end_pfn) -+{ -+ struct rb_node *node; -+ -+ for (node = rb_first(root_stable_treep); node; node = rb_next(node)) { -+ struct stable_node *stable_node; -+ -+ stable_node = rb_entry(node, struct stable_node, node); -+ if (stable_node->kpfn >= start_pfn && -+ stable_node->kpfn < end_pfn) -+ return stable_node; -+ } -+ return NULL; -+} -+ -+static int uksm_memory_callback(struct notifier_block *self, -+ unsigned long action, void *arg) -+{ -+ struct memory_notify *mn = arg; -+ struct stable_node *stable_node; -+ -+ switch (action) { -+ case MEM_GOING_OFFLINE: -+ /* -+ * Keep it very simple for now: just lock out ksmd and -+ * MADV_UNMERGEABLE while any memory is going offline. -+ * mutex_lock_nested() is necessary because lockdep was alarmed -+ * that here we take uksm_thread_mutex inside notifier chain -+ * mutex, and later take notifier chain mutex inside -+ * uksm_thread_mutex to unlock it. But that's safe because both -+ * are inside mem_hotplug_mutex. -+ */ -+ mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING); -+ break; -+ -+ case MEM_OFFLINE: -+ /* -+ * Most of the work is done by page migration; but there might -+ * be a few stable_nodes left over, still pointing to struct -+ * pages which have been offlined: prune those from the tree. -+ */ -+ while ((stable_node = uksm_check_stable_tree(mn->start_pfn, -+ mn->start_pfn + mn->nr_pages)) != NULL) -+ remove_node_from_stable_tree(stable_node, 1, 1); -+ /* fallthrough */ -+ -+ case MEM_CANCEL_OFFLINE: -+ mutex_unlock(&uksm_thread_mutex); -+ break; -+ } -+ return NOTIFY_OK; -+} -+#endif /* CONFIG_MEMORY_HOTREMOVE */ -+ -+#ifdef CONFIG_SYSFS -+/* -+ * This all compiles without CONFIG_SYSFS, but is a waste of space. 
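-+ *
-+ * Illustrative usage from userspace; the /sys/kernel/mm/uksm path
-+ * is an assumption taken from the attribute group name below, not
-+ * something this excerpt shows being registered:
-+ *
-+ *	echo 1 > /sys/kernel/mm/uksm/run
-+ *	echo 20 > /sys/kernel/mm/uksm/max_cpu_percentage
-+ *	cat /sys/kernel/mm/uksm/pages_sharing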
-+ */ -+ -+#define UKSM_ATTR_RO(_name) \ -+ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) -+#define UKSM_ATTR(_name) \ -+ static struct kobj_attribute _name##_attr = \ -+ __ATTR(_name, 0644, _name##_show, _name##_store) -+ -+static ssize_t max_cpu_percentage_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%u\n", uksm_max_cpu_percentage); -+} -+ -+static ssize_t max_cpu_percentage_store(struct kobject *kobj, -+ struct kobj_attribute *attr, -+ const char *buf, size_t count) -+{ -+ unsigned long max_cpu_percentage; -+ int err; -+ -+ err = kstrtoul(buf, 10, &max_cpu_percentage); -+ if (err || max_cpu_percentage > 100) -+ return -EINVAL; -+ -+ if (max_cpu_percentage == 100) -+ max_cpu_percentage = 99; -+ else if (max_cpu_percentage < 10) -+ max_cpu_percentage = 10; -+ -+ uksm_max_cpu_percentage = max_cpu_percentage; -+ -+ return count; -+} -+UKSM_ATTR(max_cpu_percentage); -+ -+static ssize_t sleep_millisecs_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies)); -+} -+ -+static ssize_t sleep_millisecs_store(struct kobject *kobj, -+ struct kobj_attribute *attr, -+ const char *buf, size_t count) -+{ -+ unsigned long msecs; -+ int err; -+ -+ err = kstrtoul(buf, 10, &msecs); -+ if (err || msecs > MSEC_PER_SEC) -+ return -EINVAL; -+ -+ uksm_sleep_jiffies = msecs_to_jiffies(msecs); -+ uksm_sleep_saved = uksm_sleep_jiffies; -+ -+ return count; -+} -+UKSM_ATTR(sleep_millisecs); -+ -+ -+static ssize_t cpu_governor_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); -+ int i; -+ -+ buf[0] = '\0'; -+ for (i = 0; i < n ; i++) { -+ if (uksm_cpu_governor == i) -+ strcat(buf, "["); -+ -+ strcat(buf, uksm_cpu_governor_str[i]); -+ -+ if (uksm_cpu_governor == i) -+ strcat(buf, "]"); -+ -+ strcat(buf, " "); -+ } -+ strcat(buf, "\n"); -+ -+ return strlen(buf); -+} -+ -+static inline void init_performance_values(void) -+{ -+ int i; -+ struct scan_rung *rung; -+ struct uksm_cpu_preset_s *preset = uksm_cpu_preset + uksm_cpu_governor; -+ -+ -+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { -+ rung = uksm_scan_ladder + i; -+ rung->cpu_ratio = preset->cpu_ratio[i]; -+ rung->cover_msecs = preset->cover_msecs[i]; -+ } -+ -+ uksm_max_cpu_percentage = preset->max_cpu; -+} -+ -+static ssize_t cpu_governor_store(struct kobject *kobj, -+ struct kobj_attribute *attr, -+ const char *buf, size_t count) -+{ -+ int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); -+ -+ for (n--; n >= 0 ; n--) { -+ if (!strncmp(buf, uksm_cpu_governor_str[n], -+ strlen(uksm_cpu_governor_str[n]))) -+ break; -+ } -+ -+ if (n < 0) -+ return -EINVAL; -+ else -+ uksm_cpu_governor = n; -+ -+ init_performance_values(); -+ -+ return count; -+} -+UKSM_ATTR(cpu_governor); -+ -+static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, -+ char *buf) -+{ -+ return sprintf(buf, "%u\n", uksm_run); -+} -+ -+static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, -+ const char *buf, size_t count) -+{ -+ int err; -+ unsigned long flags; -+ -+ err = kstrtoul(buf, 10, &flags); -+ if (err || flags > UINT_MAX) -+ return -EINVAL; -+ if (flags > UKSM_RUN_MERGE) -+ return -EINVAL; -+ -+ mutex_lock(&uksm_thread_mutex); -+ if (uksm_run != flags) -+ uksm_run = flags; -+ mutex_unlock(&uksm_thread_mutex); -+ -+ if (flags & UKSM_RUN_MERGE) -+ wake_up_interruptible(&uksm_thread_wait); -+ -+ return count; -+} 
-+UKSM_ATTR(run); -+ -+static ssize_t abundant_threshold_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%u\n", uksm_abundant_threshold); -+} -+ -+static ssize_t abundant_threshold_store(struct kobject *kobj, -+ struct kobj_attribute *attr, -+ const char *buf, size_t count) -+{ -+ int err; -+ unsigned long flags; -+ -+ err = kstrtoul(buf, 10, &flags); -+ if (err || flags > 99) -+ return -EINVAL; -+ -+ uksm_abundant_threshold = flags; -+ -+ return count; -+} -+UKSM_ATTR(abundant_threshold); -+ -+static ssize_t thrash_threshold_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%u\n", uksm_thrash_threshold); -+} -+ -+static ssize_t thrash_threshold_store(struct kobject *kobj, -+ struct kobj_attribute *attr, -+ const char *buf, size_t count) -+{ -+ int err; -+ unsigned long flags; -+ -+ err = kstrtoul(buf, 10, &flags); -+ if (err || flags > 99) -+ return -EINVAL; -+ -+ uksm_thrash_threshold = flags; -+ -+ return count; -+} -+UKSM_ATTR(thrash_threshold); -+ -+static ssize_t cpu_ratios_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ int i, size; -+ struct scan_rung *rung; -+ char *p = buf; -+ -+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { -+ rung = &uksm_scan_ladder[i]; -+ -+ if (rung->cpu_ratio > 0) -+ size = sprintf(p, "%d ", rung->cpu_ratio); -+ else -+ size = sprintf(p, "MAX/%d ", -+ TIME_RATIO_SCALE / -rung->cpu_ratio); -+ -+ p += size; -+ } -+ -+ *p++ = '\n'; -+ *p = '\0'; -+ -+ return p - buf; -+} -+ -+static ssize_t cpu_ratios_store(struct kobject *kobj, -+ struct kobj_attribute *attr, -+ const char *buf, size_t count) -+{ -+ int i, cpuratios[SCAN_LADDER_SIZE], err; -+ unsigned long value; -+ struct scan_rung *rung; -+ char *p, *end = NULL; -+ -+ p = kzalloc(count, GFP_KERNEL); -+ if (!p) -+ return -ENOMEM; -+ -+ memcpy(p, buf, count); -+ -+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { -+ if (i != SCAN_LADDER_SIZE - 1) { -+ end = strchr(p, ' '); -+ if (!end) -+ return -EINVAL; -+ -+ *end = '\0'; -+ } -+ -+ if (strstr(p, "MAX/")) { -+ p = strchr(p, '/') + 1; -+ err = kstrtoul(p, 10, &value); -+ if (err || value > TIME_RATIO_SCALE || !value) -+ return -EINVAL; -+ -+ cpuratios[i] = -(int) (TIME_RATIO_SCALE / value); -+ } else { -+ err = kstrtoul(p, 10, &value); -+ if (err || value > TIME_RATIO_SCALE || !value) -+ return -EINVAL; -+ -+ cpuratios[i] = value; -+ } -+ -+ p = end + 1; -+ } -+ -+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { -+ rung = &uksm_scan_ladder[i]; -+ -+ rung->cpu_ratio = cpuratios[i]; -+ } -+ -+ return count; -+} -+UKSM_ATTR(cpu_ratios); -+ -+static ssize_t eval_intervals_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ int i, size; -+ struct scan_rung *rung; -+ char *p = buf; -+ -+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { -+ rung = &uksm_scan_ladder[i]; -+ size = sprintf(p, "%u ", rung->cover_msecs); -+ p += size; -+ } -+ -+ *p++ = '\n'; -+ *p = '\0'; -+ -+ return p - buf; -+} -+ -+static ssize_t eval_intervals_store(struct kobject *kobj, -+ struct kobj_attribute *attr, -+ const char *buf, size_t count) -+{ -+ int i, err; -+ unsigned long values[SCAN_LADDER_SIZE]; -+ struct scan_rung *rung; -+ char *p, *end = NULL; -+ ssize_t ret = count; -+ -+ p = kzalloc(count + 2, GFP_KERNEL); -+ if (!p) -+ return -ENOMEM; -+ -+ memcpy(p, buf, count); -+ -+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { -+ if (i != SCAN_LADDER_SIZE - 1) { -+ end = strchr(p, ' '); -+ if (!end) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ *end = '\0'; -+ } -+ -+ err = 
kstrtoul(p, 10, &values[i]); -+ if (err) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ p = end + 1; -+ } -+ -+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { -+ rung = &uksm_scan_ladder[i]; -+ -+ rung->cover_msecs = values[i]; -+ } -+ -+out: -+ kfree(p); -+ return ret; -+} -+UKSM_ATTR(eval_intervals); -+ -+static ssize_t ema_per_page_time_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%lu\n", uksm_ema_page_time); -+} -+UKSM_ATTR_RO(ema_per_page_time); -+ -+static ssize_t pages_shared_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%lu\n", uksm_pages_shared); -+} -+UKSM_ATTR_RO(pages_shared); -+ -+static ssize_t pages_sharing_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%lu\n", uksm_pages_sharing); -+} -+UKSM_ATTR_RO(pages_sharing); -+ -+static ssize_t pages_unshared_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%lu\n", uksm_pages_unshared); -+} -+UKSM_ATTR_RO(pages_unshared); -+ -+static ssize_t full_scans_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%llu\n", fully_scanned_round); -+} -+UKSM_ATTR_RO(full_scans); -+ -+static ssize_t pages_scanned_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ unsigned long base = 0; -+ u64 delta, ret; -+ -+ if (pages_scanned_stored) { -+ base = pages_scanned_base; -+ ret = pages_scanned_stored; -+ delta = uksm_pages_scanned >> base; -+ if (CAN_OVERFLOW_U64(ret, delta)) { -+ ret >>= 1; -+ delta >>= 1; -+ base++; -+ ret += delta; -+ } -+ } else { -+ ret = uksm_pages_scanned; -+ } -+ -+ while (ret > ULONG_MAX) { -+ ret >>= 1; -+ base++; -+ } -+ -+ if (base) -+ return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base); -+ else -+ return sprintf(buf, "%lu\n", (unsigned long)ret); -+} -+UKSM_ATTR_RO(pages_scanned); -+ -+static ssize_t hash_strength_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%lu\n", hash_strength); -+} -+UKSM_ATTR_RO(hash_strength); -+ -+static ssize_t sleep_times_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%llu\n", uksm_sleep_times); -+} -+UKSM_ATTR_RO(sleep_times); -+ -+ -+static struct attribute *uksm_attrs[] = { -+ &max_cpu_percentage_attr.attr, -+ &sleep_millisecs_attr.attr, -+ &cpu_governor_attr.attr, -+ &run_attr.attr, -+ &ema_per_page_time_attr.attr, -+ &pages_shared_attr.attr, -+ &pages_sharing_attr.attr, -+ &pages_unshared_attr.attr, -+ &full_scans_attr.attr, -+ &pages_scanned_attr.attr, -+ &hash_strength_attr.attr, -+ &sleep_times_attr.attr, -+ &thrash_threshold_attr.attr, -+ &abundant_threshold_attr.attr, -+ &cpu_ratios_attr.attr, -+ &eval_intervals_attr.attr, -+ NULL, -+}; -+ -+static struct attribute_group uksm_attr_group = { -+ .attrs = uksm_attrs, -+ .name = "uksm", -+}; -+#endif /* CONFIG_SYSFS */ -+ -+static inline void init_scan_ladder(void) -+{ -+ int i; -+ struct scan_rung *rung; -+ -+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { -+ rung = uksm_scan_ladder + i; -+ slot_tree_init_root(&rung->vma_root); -+ } -+ -+ init_performance_values(); -+ uksm_calc_scan_pages(); -+} -+ -+static inline int cal_positive_negative_costs(void) -+{ -+ struct page *p1, *p2; -+ unsigned char *addr1, *addr2; -+ unsigned long i, time_start, hash_cost; -+ unsigned long loopnum = 0; -+ -+ /*IMPORTANT: volatile is needed to prevent over-optimization by gcc. 
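/*
 * [Editor's illustration, not part of the patch] pages_scanned_show()
 * above reports a counter kept as "value * 2^base": whenever adding a
 * delta would overflow the u64, both terms are halved and the exponent
 * is bumped (see the CAN_OVERFLOW_U64 test above). A minimal user-space
 * analogue of that bookkeeping, with names of the editor's own:
 */
#include <stdint.h>

struct scaled_counter {
	uint64_t value;		/* mantissa */
	unsigned int base;	/* exponent: total == value * 2^base */
};

static void scaled_counter_add(struct scaled_counter *c, uint64_t delta)
{
	delta >>= c->base;		/* rescale the delta to match */
	while (c->value > UINT64_MAX - delta) {
		c->value >>= 1;		/* halve both terms to make room... */
		delta >>= 1;
		c->base++;		/* ...and record the dropped bit */
	}
	c->value += delta;
}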
*/ -+ volatile u32 hash; -+ volatile int ret; -+ -+ p1 = alloc_page(GFP_KERNEL); -+ if (!p1) -+ return -ENOMEM; -+ -+ p2 = alloc_page(GFP_KERNEL); -+ if (!p2) -+ return -ENOMEM; -+ -+ addr1 = kmap_atomic(p1); -+ addr2 = kmap_atomic(p2); -+ memset(addr1, prandom_u32(), PAGE_SIZE); -+ memcpy(addr2, addr1, PAGE_SIZE); -+ -+ /* make sure that the two pages differ in last byte */ -+ addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1]; -+ kunmap_atomic(addr2); -+ kunmap_atomic(addr1); -+ -+ time_start = jiffies; -+ while (jiffies - time_start < 100) { -+ for (i = 0; i < 100; i++) -+ hash = page_hash(p1, HASH_STRENGTH_FULL, 0); -+ loopnum += 100; -+ } -+ hash_cost = (jiffies - time_start); -+ -+ time_start = jiffies; -+ for (i = 0; i < loopnum; i++) -+ ret = pages_identical_with_cost(p1, p2); -+ memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start); -+ memcmp_cost /= hash_cost; -+ pr_info("UKSM: relative memcmp_cost = %lu " -+ "hash=%u cmp_ret=%d.\n", -+ memcmp_cost, hash, ret); -+ -+ __free_page(p1); -+ __free_page(p2); -+ return 0; -+} -+ -+static int init_zeropage_hash_table(void) -+{ -+ struct page *page; -+ char *addr; -+ int i; -+ -+ page = alloc_page(GFP_KERNEL); -+ if (!page) -+ return -ENOMEM; -+ -+ addr = kmap_atomic(page); -+ memset(addr, 0, PAGE_SIZE); -+ kunmap_atomic(addr); -+ -+ zero_hash_table = kmalloc_array(HASH_STRENGTH_MAX, sizeof(u32), -+ GFP_KERNEL); -+ if (!zero_hash_table) -+ return -ENOMEM; -+ -+ for (i = 0; i < HASH_STRENGTH_MAX; i++) -+ zero_hash_table[i] = page_hash(page, i, 0); -+ -+ __free_page(page); -+ -+ return 0; -+} -+ -+static inline int init_random_sampling(void) -+{ -+ unsigned long i; -+ -+ random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL); -+ if (!random_nums) -+ return -ENOMEM; -+ -+ for (i = 0; i < HASH_STRENGTH_FULL; i++) -+ random_nums[i] = i; -+ -+ for (i = 0; i < HASH_STRENGTH_FULL; i++) { -+ unsigned long rand_range, swap_index, tmp; -+ -+ rand_range = HASH_STRENGTH_FULL - i; -+ swap_index = i + prandom_u32() % rand_range; -+ tmp = random_nums[i]; -+ random_nums[i] = random_nums[swap_index]; -+ random_nums[swap_index] = tmp; -+ } -+ -+ rshash_state.state = RSHASH_NEW; -+ rshash_state.below_count = 0; -+ rshash_state.lookup_window_index = 0; -+ -+ return cal_positive_negative_costs(); -+} -+ -+static int __init uksm_slab_init(void) -+{ -+ rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0); -+ if (!rmap_item_cache) -+ goto out; -+ -+ stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0); -+ if (!stable_node_cache) -+ goto out_free1; -+ -+ node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0); -+ if (!node_vma_cache) -+ goto out_free2; -+ -+ vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0); -+ if (!vma_slot_cache) -+ goto out_free3; -+ -+ tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0); -+ if (!tree_node_cache) -+ goto out_free4; -+ -+ return 0; -+ -+out_free4: -+ kmem_cache_destroy(vma_slot_cache); -+out_free3: -+ kmem_cache_destroy(node_vma_cache); -+out_free2: -+ kmem_cache_destroy(stable_node_cache); -+out_free1: -+ kmem_cache_destroy(rmap_item_cache); -+out: -+ return -ENOMEM; -+} -+ -+static void __init uksm_slab_free(void) -+{ -+ kmem_cache_destroy(stable_node_cache); -+ kmem_cache_destroy(rmap_item_cache); -+ kmem_cache_destroy(node_vma_cache); -+ kmem_cache_destroy(vma_slot_cache); -+ kmem_cache_destroy(tree_node_cache); -+} -+ -+/* Common interface to ksm, different to it. 
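/*
 * [Editor's illustration, not part of the patch] init_random_sampling()
 * above fills random_nums with 0..HASH_STRENGTH_FULL-1 and then runs a
 * classic Fisher-Yates shuffle over it. The same idea in isolation,
 * with rand() standing in for the kernel's prandom_u32():
 */
#include <stdlib.h>

static void fisher_yates_shuffle(unsigned long *a, unsigned long n)
{
	unsigned long i;

	for (i = 0; i < n; i++) {
		/* uniform pick from the not-yet-fixed tail [i, n) */
		unsigned long j = i + (unsigned long)rand() % (n - i);
		unsigned long tmp = a[i];

		a[i] = a[j];
		a[j] = tmp;
	}
}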
*/ -+int ksm_madvise(struct vm_area_struct *vma, unsigned long start, -+ unsigned long end, int advice, unsigned long *vm_flags) -+{ -+ int err; -+ -+ switch (advice) { -+ case MADV_MERGEABLE: -+ return 0; /* just ignore the advice */ -+ -+ case MADV_UNMERGEABLE: -+ if (!(*vm_flags & VM_MERGEABLE) || !uksm_flags_can_scan(*vm_flags)) -+ return 0; /* just ignore the advice */ -+ -+ if (vma->anon_vma) { -+ err = unmerge_uksm_pages(vma, start, end); -+ if (err) -+ return err; -+ } -+ -+ uksm_remove_vma(vma); -+ *vm_flags &= ~VM_MERGEABLE; -+ break; -+ } -+ -+ return 0; -+} -+ -+/* Common interface to ksm, actually the same. */ -+struct page *ksm_might_need_to_copy(struct page *page, -+ struct vm_area_struct *vma, unsigned long address) -+{ -+ struct anon_vma *anon_vma = page_anon_vma(page); -+ struct page *new_page; -+ -+ if (PageKsm(page)) { -+ if (page_stable_node(page)) -+ return page; /* no need to copy it */ -+ } else if (!anon_vma) { -+ return page; /* no need to copy it */ -+ } else if (anon_vma->root == vma->anon_vma->root && -+ page->index == linear_page_index(vma, address)) { -+ return page; /* still no need to copy it */ -+ } -+ if (!PageUptodate(page)) -+ return page; /* let do_swap_page report the error */ -+ -+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); -+ if (new_page) { -+ copy_user_highpage(new_page, page, address, vma); -+ -+ SetPageDirty(new_page); -+ __SetPageUptodate(new_page); -+ __SetPageLocked(new_page); -+ } -+ -+ return new_page; -+} -+ -+/* Copied from mm/ksm.c and required from 5.1 */ -+bool reuse_ksm_page(struct page *page, -+ struct vm_area_struct *vma, -+ unsigned long address) -+{ -+#ifdef CONFIG_DEBUG_VM -+ if (WARN_ON(is_zero_pfn(page_to_pfn(page))) || -+ WARN_ON(!page_mapped(page)) || -+ WARN_ON(!PageLocked(page))) { -+ dump_page(page, "reuse_ksm_page"); -+ return false; -+ } -+#endif -+ -+ if (PageSwapCache(page) || !page_stable_node(page)) -+ return false; -+ /* Prohibit parallel get_ksm_page() */ -+ if (!page_ref_freeze(page, 1)) -+ return false; -+ -+ page_move_anon_rmap(page, vma); -+ page->index = linear_page_index(vma, address); -+ page_ref_unfreeze(page, 1); -+ -+ return true; -+} -+ -+static int __init uksm_init(void) -+{ -+ struct task_struct *uksm_thread; -+ int err; -+ -+ uksm_sleep_jiffies = msecs_to_jiffies(100); -+ uksm_sleep_saved = uksm_sleep_jiffies; -+ -+ slot_tree_init(); -+ init_scan_ladder(); -+ -+ -+ err = init_random_sampling(); -+ if (err) -+ goto out_free2; -+ -+ err = uksm_slab_init(); -+ if (err) -+ goto out_free1; -+ -+ err = init_zeropage_hash_table(); -+ if (err) -+ goto out_free0; -+ -+ uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd"); -+ if (IS_ERR(uksm_thread)) { -+ pr_err("uksm: creating kthread failed\n"); -+ err = PTR_ERR(uksm_thread); -+ goto out_free; -+ } -+ -+#ifdef CONFIG_SYSFS -+ err = sysfs_create_group(mm_kobj, &uksm_attr_group); -+ if (err) { -+ pr_err("uksm: register sysfs failed\n"); -+ kthread_stop(uksm_thread); -+ goto out_free; -+ } -+#else -+ uksm_run = UKSM_RUN_MERGE; /* no way for user to start it */ -+ -+#endif /* CONFIG_SYSFS */ -+ -+#ifdef CONFIG_MEMORY_HOTREMOVE -+ /* -+ * Choose a high priority since the callback takes uksm_thread_mutex: -+ * later callbacks could only be taking locks which nest within that. 
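/*
 * [Editor's note with illustration, not part of the patch] Observe the
 * semantics of ksm_madvise() above: UKSM scans eligible anonymous VMAs
 * on its own, so MADV_MERGEABLE is accepted but ignored, and only
 * MADV_UNMERGEABLE does real work. The user-space interface is the
 * same one mainline KSM uses:
 */
#include <sys/mman.h>

static int opt_out_of_merging(void *addr, size_t len)
{
	/* break and exclude any pages UKSM merged in this range */
	return madvise(addr, len, MADV_UNMERGEABLE);
}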
-+ */ -+ hotplug_memory_notifier(uksm_memory_callback, 100); -+#endif -+ return 0; -+ -+out_free: -+ kfree(zero_hash_table); -+out_free0: -+ uksm_slab_free(); -+out_free1: -+ kfree(random_nums); -+out_free2: -+ kfree(uksm_scan_ladder); -+ return err; -+} -+ -+#ifdef MODULE -+subsys_initcall(ksm_init); -+#else -+late_initcall(uksm_init); -+#endif -+ -diff --git a/mm/vmstat.c b/mm/vmstat.c -index 74b2c374b..ae42103a8 100644 ---- a/mm/vmstat.c -+++ b/mm/vmstat.c -@@ -1231,6 +1231,9 @@ const char * const vmstat_text[] = { - "nr_swapcached", - #endif - -+#ifdef CONFIG_UKSM -+ "nr_uksm_zero_pages", -+#endif - /* enum writeback_stat_item counters */ - "nr_dirty_threshold", - "nr_dirty_background_threshold", --- -2.31.1.305.gd1b10fc6d8 - diff --git a/0009-bbr2.patch b/0009-bbr2.patch deleted file mode 100644 index 5a257ca22043..000000000000 --- a/0009-bbr2.patch +++ /dev/null @@ -1,3347 +0,0 @@ -From f3d069e2cafed9758d66fcfc42447b028d42493f Mon Sep 17 00:00:00 2001 -From: Piotr Gorski <lucjan.lucjanov@gmail.com> -Date: Mon, 26 Apr 2021 21:14:18 +0200 -Subject: [PATCH] bbr2-5.12: introduce BBRv2 - -Signed-off-by: Piotr Gorski <lucjan.lucjanov@gmail.com> ---- - include/linux/tcp.h | 3 +- - include/net/inet_connection_sock.h | 5 +- - include/net/tcp.h | 48 +- - include/uapi/linux/inet_diag.h | 33 + - net/ipv4/Kconfig | 22 + - net/ipv4/Makefile | 1 + - net/ipv4/bpf_tcp_ca.c | 2 +- - net/ipv4/tcp.c | 1 + - net/ipv4/tcp_bbr.c | 38 +- - net/ipv4/tcp_bbr2.c | 2671 ++++++++++++++++++++++++++++ - net/ipv4/tcp_cong.c | 1 + - net/ipv4/tcp_input.c | 38 +- - net/ipv4/tcp_output.c | 25 +- - net/ipv4/tcp_rate.c | 36 +- - net/ipv4/tcp_timer.c | 1 + - 15 files changed, 2879 insertions(+), 46 deletions(-) - create mode 100644 net/ipv4/tcp_bbr2.c - -diff --git a/include/linux/tcp.h b/include/linux/tcp.h -index 48d8a3633..1bd559c69 100644 ---- a/include/linux/tcp.h -+++ b/include/linux/tcp.h -@@ -225,7 +225,8 @@ struct tcp_sock { - u8 compressed_ack; - u8 dup_ack_counter:2, - tlp_retrans:1, /* TLP is a retransmission */ -- unused:5; -+ fast_ack_mode:2, /* which fast ack mode ? 
*/ -+ unused:3; - u32 chrono_start; /* Start time in jiffies of a TCP chrono */ - u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ - u8 chrono_type:2, /* current chronograph type */ -diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h -index 3c8c59471..2cdc5a070 100644 ---- a/include/net/inet_connection_sock.h -+++ b/include/net/inet_connection_sock.h -@@ -134,8 +134,9 @@ struct inet_connection_sock { - u32 icsk_probes_tstamp; - u32 icsk_user_timeout; - -- u64 icsk_ca_priv[104 / sizeof(u64)]; --#define ICSK_CA_PRIV_SIZE (13 * sizeof(u64)) -+/* XXX inflated by temporary internal debugging info */ -+#define ICSK_CA_PRIV_SIZE (216) -+ u64 icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)]; - }; - - #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ -diff --git a/include/net/tcp.h b/include/net/tcp.h -index 963cd86d1..5a86fa1d2 100644 ---- a/include/net/tcp.h -+++ b/include/net/tcp.h -@@ -799,6 +799,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) - return max_t(s64, t1 - t0, 0); - } - -+static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0) -+{ -+ return max_t(s32, t1 - t0, 0); -+} -+ - static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) - { - return tcp_ns_to_ts(skb->skb_mstamp_ns); -@@ -866,16 +871,22 @@ struct tcp_skb_cb { - __u32 ack_seq; /* Sequence number ACK'd */ - union { - struct { -+#define TCPCB_DELIVERED_CE_MASK ((1U<<20) - 1) - /* There is space for up to 24 bytes */ -- __u32 in_flight:30,/* Bytes in flight at transmit */ -- is_app_limited:1, /* cwnd not fully used? */ -- unused:1; -+ __u32 is_app_limited:1, /* cwnd not fully used? */ -+ delivered_ce:20, -+ unused:11; - /* pkts S/ACKed so far upon tx of skb, incl retrans: */ - __u32 delivered; - /* start of send pipeline phase */ -- u64 first_tx_mstamp; -+ u32 first_tx_mstamp; - /* when we reached the "delivered" count */ -- u64 delivered_mstamp; -+ u32 delivered_mstamp; -+#define TCPCB_IN_FLIGHT_BITS 20 -+#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) -+ u32 in_flight:20, /* packets in flight at transmit */ -+ unused2:12; -+ u32 lost; /* packets lost so far upon tx of skb */ - } tx; /* only used for outgoing skbs */ - union { - struct inet_skb_parm h4; -@@ -1025,7 +1036,11 @@ enum tcp_ca_ack_event_flags { - #define TCP_CONG_NON_RESTRICTED 0x1 - /* Requires ECN/ECT set on all packets */ - #define TCP_CONG_NEEDS_ECN 0x2 --#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) -+/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). 
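/*
 * [Editor's illustration, not part of the patch]
 * tcp_stamp32_us_delta() above relies on 32-bit modular arithmetic:
 * (u32)(t1 - t0) is the true elapsed time even across a clock wrap,
 * provided the real gap is under 2^31 us (about 36 minutes), and
 * reading the difference as s32 turns "t1 older than t0" into a
 * negative value that max_t() clamps to zero. Self-contained check:
 */
#include <assert.h>
#include <stdint.h>

static uint32_t delta_us(uint32_t t1, uint32_t t0)
{
	int32_t d = (int32_t)(t1 - t0);	/* wrap-safe signed difference */

	return d > 0 ? (uint32_t)d : 0;	/* clamp reordering to zero */
}

static void delta_us_selftest(void)
{
	assert(delta_us(5u, UINT32_MAX - 4u) == 10);	/* across the wrap */
	assert(delta_us(100u, 300u) == 0);		/* stale t1 */
}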
*/ -+#define TCP_CONG_WANTS_CE_EVENTS 0x4 -+#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \ -+ TCP_CONG_NEEDS_ECN | \ -+ TCP_CONG_WANTS_CE_EVENTS) - - union tcp_cc_info; - -@@ -1045,8 +1060,13 @@ struct ack_sample { - */ - struct rate_sample { - u64 prior_mstamp; /* starting timestamp for interval */ -+ u32 prior_lost; /* tp->lost at "prior_mstamp" */ - u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ -+ u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ -+ u32 tx_in_flight; /* packets in flight at starting timestamp */ -+ s32 lost; /* number of packets lost over interval */ - s32 delivered; /* number of packets delivered over interval */ -+ s32 delivered_ce; /* packets delivered w/ CE mark over interval */ - long interval_us; /* time for tp->delivered to incr "delivered" */ - u32 snd_interval_us; /* snd interval for delivered packets */ - u32 rcv_interval_us; /* rcv interval for delivered packets */ -@@ -1057,6 +1077,7 @@ struct rate_sample { - bool is_app_limited; /* is sample from packet with bubble in pipe? */ - bool is_retrans; /* is sample from retransmission? */ - bool is_ack_delayed; /* is this (likely) a delayed ACK? */ -+ bool is_ece; /* did this ACK have ECN marked? */ - }; - - struct tcp_congestion_ops { -@@ -1083,10 +1104,12 @@ struct tcp_congestion_ops { - u32 (*undo_cwnd)(struct sock *sk); - /* hook for packet ack accounting (optional) */ - void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); -- /* override sysctl_tcp_min_tso_segs */ -- u32 (*min_tso_segs)(struct sock *sk); -+ /* pick target number of segments per TSO/GSO skb (optional): */ -+ u32 (*tso_segs)(struct sock *sk, unsigned int mss_now); - /* returns the multiplier used in tcp_sndbuf_expand (optional) */ - u32 (*sndbuf_expand)(struct sock *sk); -+ /* react to a specific lost skb (optional) */ -+ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); - /* call when packets are delivered to update cwnd and pacing rate, - * after all the ca_state processing. (optional) - */ -@@ -1132,6 +1155,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) - } - #endif - -+static inline bool tcp_ca_wants_ce_events(const struct sock *sk) -+{ -+ const struct inet_connection_sock *icsk = inet_csk(sk); -+ -+ return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN | -+ TCP_CONG_WANTS_CE_EVENTS); -+} -+ - static inline bool tcp_ca_needs_ecn(const struct sock *sk) - { - const struct inet_connection_sock *icsk = inet_csk(sk); -@@ -1157,6 +1188,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) - } - - /* From tcp_rate.c */ -+void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb); - void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); - void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, - struct rate_sample *rs); -diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h -index 20ee93f0f..96d52dd9c 100644 ---- a/include/uapi/linux/inet_diag.h -+++ b/include/uapi/linux/inet_diag.h -@@ -231,9 +231,42 @@ struct tcp_bbr_info { - __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ - }; - -+/* Phase as reported in netlink/ss stats. 
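/*
 * [Editor's sketch, not part of the patch] A congestion-control module
 * opts in to the new CA_EVENT_ECN_IS_CE / CA_EVENT_ECN_NO_CE callbacks
 * by setting TCP_CONG_WANTS_CE_EVENTS; tcp_ca_wants_ce_events() above
 * is what the stack tests. A hypothetical minimal module (the "toy_*"
 * names are the editor's own and the handlers are deliberately inert):
 */
static u32 toy_ssthresh(struct sock *sk)
{
	return tcp_sk(sk)->snd_ssthresh;
}

static u32 toy_undo_cwnd(struct sock *sk)
{
	return tcp_sk(sk)->snd_cwnd;
}

static void toy_cong_control(struct sock *sk, const struct rate_sample *rs)
{
}

static void toy_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
{
	/* CA_EVENT_ECN_IS_CE / CA_EVENT_ECN_NO_CE arrive here */
}

static struct tcp_congestion_ops toy_ce_ops __read_mostly = {
	.flags		= TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS,
	.name		= "toy_ce",
	.owner		= THIS_MODULE,
	.ssthresh	= toy_ssthresh,
	.undo_cwnd	= toy_undo_cwnd,
	.cong_control	= toy_cong_control,
	.cwnd_event	= toy_cwnd_event,
};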
*/ -+enum tcp_bbr2_phase { -+ BBR2_PHASE_INVALID = 0, -+ BBR2_PHASE_STARTUP = 1, -+ BBR2_PHASE_DRAIN = 2, -+ BBR2_PHASE_PROBE_RTT = 3, -+ BBR2_PHASE_PROBE_BW_UP = 4, -+ BBR2_PHASE_PROBE_BW_DOWN = 5, -+ BBR2_PHASE_PROBE_BW_CRUISE = 6, -+ BBR2_PHASE_PROBE_BW_REFILL = 7 -+}; -+ -+struct tcp_bbr2_info { -+ /* u64 bw: bandwidth (app throughput) estimate in Byte per sec: */ -+ __u32 bbr_bw_lsb; /* lower 32 bits of bw */ -+ __u32 bbr_bw_msb; /* upper 32 bits of bw */ -+ __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ -+ __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ -+ __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ -+ __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ -+ __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ -+ __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ -+ __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */ -+ __u8 bbr_mode; /* current bbr_mode in state machine */ -+ __u8 bbr_phase; /* current state machine phase */ -+ __u8 unused1; /* alignment padding; not used yet */ -+ __u8 bbr_version; /* MUST be at this offset in struct */ -+ __u32 bbr_inflight_lo; /* lower/short-term data volume bound */ -+ __u32 bbr_inflight_hi; /* higher/long-term data volume bound */ -+ __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ -+}; -+ - union tcp_cc_info { - struct tcpvegas_info vegas; - struct tcp_dctcp_info dctcp; - struct tcp_bbr_info bbr; -+ struct tcp_bbr2_info bbr2; - }; - #endif /* _UAPI_INET_DIAG_H_ */ -diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig -index 87983e70f..a833a7a67 100644 ---- a/net/ipv4/Kconfig -+++ b/net/ipv4/Kconfig -@@ -669,6 +669,24 @@ config TCP_CONG_BBR - AQM schemes that do not provide a delay signal. It requires the fq - ("Fair Queue") pacing packet scheduler. - -+config TCP_CONG_BBR2 -+ tristate "BBR2 TCP" -+ default n -+ help -+ -+ BBR2 TCP congestion control is a model-based congestion control -+ algorithm that aims to maximize network utilization, keep queues and -+ retransmit rates low, and to be able to coexist with Reno/CUBIC in -+ common scenarios. It builds an explicit model of the network path. It -+ tolerates a targeted degree of random packet loss and delay that are -+ unrelated to congestion. It can operate over LAN, WAN, cellular, wifi, -+ or cable modem links, and can use DCTCP-L4S-style ECN signals. It can -+ coexist with flows that use loss-based congestion control, and can -+ operate with shallow buffers, deep buffers, bufferbloat, policers, or -+ AQM schemes that do not provide a delay signal. It requires pacing, -+ using either TCP internal pacing or the fq ("Fair Queue") pacing packet -+ scheduler. 
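/*
 * [Editor's illustration, not part of the patch] Besides the Kconfig
 * default below, an application can select the new algorithm per
 * connection with the long-standing TCP_CONGESTION socket option;
 * "bbr2" is the name this patch registers:
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>
#include <sys/socket.h>

static int switch_to_bbr2(int fd)
{
	static const char name[] = "bbr2";

	return setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
			  name, strlen(name));
}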
-+ - choice - prompt "Default TCP congestion control" - default DEFAULT_CUBIC -@@ -706,6 +724,9 @@ choice - config DEFAULT_BBR - bool "BBR" if TCP_CONG_BBR=y - -+ config DEFAULT_BBR2 -+ bool "BBR2" if TCP_CONG_BBR2=y -+ - config DEFAULT_RENO - bool "Reno" - endchoice -@@ -730,6 +751,7 @@ config DEFAULT_TCP_CONG - default "dctcp" if DEFAULT_DCTCP - default "cdg" if DEFAULT_CDG - default "bbr" if DEFAULT_BBR -+ default "bbr2" if DEFAULT_BBR2 - default "cubic" - - config TCP_MD5SIG -diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile -index 5b77a4688..8c5779dba 100644 ---- a/net/ipv4/Makefile -+++ b/net/ipv4/Makefile -@@ -46,6 +46,7 @@ obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o - obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o - obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o - obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o -+obj-$(CONFIG_TCP_CONG_BBR2) += tcp_bbr2.o - obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o - obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o - obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o -diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c -index d520e6164..22129c1c5 100644 ---- a/net/ipv4/bpf_tcp_ca.c -+++ b/net/ipv4/bpf_tcp_ca.c -@@ -16,7 +16,7 @@ static u32 optional_ops[] = { - offsetof(struct tcp_congestion_ops, cwnd_event), - offsetof(struct tcp_congestion_ops, in_ack_event), - offsetof(struct tcp_congestion_ops, pkts_acked), -- offsetof(struct tcp_congestion_ops, min_tso_segs), -+ offsetof(struct tcp_congestion_ops, tso_segs), - offsetof(struct tcp_congestion_ops, sndbuf_expand), - offsetof(struct tcp_congestion_ops, cong_control), - }; -diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c -index de7cc8445..521f310f2 100644 ---- a/net/ipv4/tcp.c -+++ b/net/ipv4/tcp.c -@@ -3033,6 +3033,7 @@ int tcp_disconnect(struct sock *sk, int flags) - tp->rx_opt.dsack = 0; - tp->rx_opt.num_sacks = 0; - tp->rcv_ooopack = 0; -+ tp->fast_ack_mode = 0; - - - /* Clean up fastopen related fields */ -diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c -index 6ea3dc2e4..8ef512fef 100644 ---- a/net/ipv4/tcp_bbr.c -+++ b/net/ipv4/tcp_bbr.c -@@ -292,26 +292,40 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) - sk->sk_pacing_rate = rate; - } - --/* override sysctl_tcp_min_tso_segs */ - static u32 bbr_min_tso_segs(struct sock *sk) - { - return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; - } - -+/* Return the number of segments BBR would like in a TSO/GSO skb, given -+ * a particular max gso size as a constraint. -+ */ -+static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, -+ u32 gso_max_size) -+{ -+ u32 segs; -+ u64 bytes; -+ -+ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ -+ bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift; -+ -+ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); -+ segs = max_t(u32, bytes / mss_now, bbr_min_tso_segs(sk)); -+ return segs; -+} -+ -+/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ -+static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) -+{ -+ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); -+} -+ -+/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */ - static u32 bbr_tso_segs_goal(struct sock *sk) - { - struct tcp_sock *tp = tcp_sk(sk); -- u32 segs, bytes; -- -- /* Sort of tcp_tso_autosize() but ignoring -- * driver provided sk_gso_max_size. 
-- */ -- bytes = min_t(unsigned long, -- sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift), -- GSO_MAX_SIZE - 1 - MAX_TCP_HEADER); -- segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); - -- return min(segs, 0x7FU); -+ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_MAX_SIZE); - } - - /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ -@@ -1147,7 +1161,7 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { - .undo_cwnd = bbr_undo_cwnd, - .cwnd_event = bbr_cwnd_event, - .ssthresh = bbr_ssthresh, -- .min_tso_segs = bbr_min_tso_segs, -+ .tso_segs = bbr_tso_segs, - .get_info = bbr_get_info, - .set_state = bbr_set_state, - }; -diff --git a/net/ipv4/tcp_bbr2.c b/net/ipv4/tcp_bbr2.c -new file mode 100644 -index 000000000..17d6a059d ---- /dev/null -+++ b/net/ipv4/tcp_bbr2.c -@@ -0,0 +1,2671 @@ -+/* BBR (Bottleneck Bandwidth and RTT) congestion control, v2 -+ * -+ * BBRv2 is a model-based congestion control algorithm that aims for low -+ * queues, low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model -+ * of the network path, it uses measurements of bandwidth and RTT, as well as -+ * (if they occur) packet loss and/or DCTCP/L4S-style ECN signals. Note that -+ * although it can use ECN or loss signals explicitly, it does not require -+ * either; it can bound its in-flight data based on its estimate of the BDP. -+ * -+ * The model has both higher and lower bounds for the operating range: -+ * lo: bw_lo, inflight_lo: conservative short-term lower bound -+ * hi: bw_hi, inflight_hi: robust long-term upper bound -+ * The bandwidth-probing time scale is (a) extended dynamically based on -+ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by -+ * an interactive wall-clock time-scale to be more scalable and responsive -+ * than Reno and CUBIC. -+ * -+ * Here is a state transition diagram for BBR: -+ * -+ * | -+ * V -+ * +---> STARTUP ----+ -+ * | | | -+ * | V | -+ * | DRAIN ----+ -+ * | | | -+ * | V | -+ * +---> PROBE_BW ----+ -+ * | ^ | | -+ * | | | | -+ * | +----+ | -+ * | | -+ * +---- PROBE_RTT <--+ -+ * -+ * A BBR flow starts in STARTUP, and ramps up its sending rate quickly. -+ * When it estimates the pipe is full, it enters DRAIN to drain the queue. -+ * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT. -+ * A long-lived BBR flow spends the vast majority of its time remaining -+ * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth -+ * in a fair manner, with a small, bounded queue. *If* a flow has been -+ * continuously sending for the entire min_rtt window, and hasn't seen an RTT -+ * sample that matches or decreases its min_rtt estimate for 10 seconds, then -+ * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe -+ * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if -+ * we estimated that we reached the full bw of the pipe then we enter PROBE_BW; -+ * otherwise we enter STARTUP to try to fill the pipe. -+ * -+ * BBR is described in detail in: -+ * "BBR: Congestion-Based Congestion Control", -+ * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh, -+ * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016. 
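/*
 * [Editor's check, not part of the patch] On the BW_SCALE unit defined
 * shortly after this header: one unit of "pkt/us << 24" with 1500 B
 * packets is (1500 * 8 bit * 1e6 per sec) / 2^24 = 1.2e10 / 16777216,
 * which is ~715 bit/s, matching the "~= 715 bps" figure in that
 * comment.
 */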
-+ * -+ * There is a public e-mail list for discussing BBR development and testing: -+ * https://groups.google.com/forum/#!forum/bbr-dev -+ * -+ * NOTE: BBR might be used with the fq qdisc ("man tc-fq") with pacing enabled, -+ * otherwise TCP stack falls back to an internal pacing using one high -+ * resolution timer per TCP socket and may use more resources. -+ */ -+#include <linux/module.h> -+#include <net/tcp.h> -+#include <linux/inet_diag.h> -+#include <linux/inet.h> -+#include <linux/random.h> -+ -+#include "tcp_dctcp.h" -+ -+/* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth -+ * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. -+ * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. -+ * Since the minimum window is >=4 packets, the lower bound isn't -+ * an issue. The upper bound isn't an issue with existing technologies. -+ */ -+#define BW_SCALE 24 -+#define BW_UNIT (1 << BW_SCALE) -+ -+#define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g. gains) */ -+#define BBR_UNIT (1 << BBR_SCALE) -+ -+#define FLAG_DEBUG_VERBOSE 0x1 /* Verbose debugging messages */ -+#define FLAG_DEBUG_LOOPBACK 0x2 /* Do NOT skip loopback addr */ -+ -+#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ -+ -+/* BBR has the following modes for deciding how fast to send: */ -+enum bbr_mode { -+ BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */ -+ BBR_DRAIN, /* drain any queue created during startup */ -+ BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */ -+ BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ -+}; -+ -+/* How does the incoming ACK stream relate to our bandwidth probing? */ -+enum bbr_ack_phase { -+ BBR_ACKS_INIT, /* not probing; not getting probe feedback */ -+ BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ -+ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ -+ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ -+ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ -+}; -+ -+/* BBR congestion control block */ -+struct bbr { -+ u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ -+ u32 min_rtt_stamp; /* timestamp of min_rtt_us */ -+ u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ -+ u32 probe_rtt_min_us; /* min RTT in bbr_probe_rtt_win_ms window */ -+ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ -+ u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ -+ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ -+ u64 cycle_mstamp; /* time of this cycle phase start */ -+ u32 mode:3, /* current bbr_mode in state machine */ -+ prev_ca_state:3, /* CA state on previous ACK */ -+ packet_conservation:1, /* use packet conservation? */ -+ round_start:1, /* start of packet-timed tx->ack round? */ -+ ce_state:1, /* If most recent data has CE bit set */ -+ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ -+ try_fast_path:1, /* can we take fast path? */ -+ unused2:11, -+ idle_restart:1, /* restarting after idle? */ -+ probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ -+ cycle_idx:3, /* current index in pacing_gain cycle array */ -+ has_seen_rtt:1; /* have we seen an RTT sample yet? */ -+ u32 pacing_gain:11, /* current gain for setting pacing rate */ -+ cwnd_gain:11, /* current gain for setting cwnd */ -+ full_bw_reached:1, /* reached full bw in Startup? 
*/ -+ full_bw_cnt:2, /* number of rounds without large bw gains */ -+ init_cwnd:7; /* initial cwnd */ -+ u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ -+ u32 full_bw; /* recent bw, to estimate if pipe is full */ -+ -+ /* For tracking ACK aggregation: */ -+ u64 ack_epoch_mstamp; /* start of ACK sampling epoch */ -+ u16 extra_acked[2]; /* max excess data ACKed in epoch */ -+ u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ -+ extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ -+ extra_acked_win_idx:1, /* current index in extra_acked array */ -+ /* BBR v2 state: */ -+ unused1:2, -+ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ -+ loss_in_cycle:1, /* packet loss in this cycle? */ -+ ecn_in_cycle:1; /* ECN in this cycle? */ -+ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ -+ u32 undo_bw_lo; /* bw_lo before latest losses */ -+ u32 undo_inflight_lo; /* inflight_lo before latest losses */ -+ u32 undo_inflight_hi; /* inflight_hi before latest losses */ -+ u32 bw_latest; /* max delivered bw in last round trip */ -+ u32 bw_lo; /* lower bound on sending bandwidth */ -+ u32 bw_hi[2]; /* upper bound of sending bandwidth range*/ -+ u32 inflight_latest; /* max delivered data in last round trip */ -+ u32 inflight_lo; /* lower bound of inflight data range */ -+ u32 inflight_hi; /* upper bound of inflight data range */ -+ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ -+ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ -+ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ -+ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ -+ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ -+ bw_probe_samples:1, /* rate samples reflect bw probing? */ -+ prev_probe_too_high:1, /* did last PROBE_UP go too high? */ -+ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ -+ rounds_since_probe:8, /* packet-timed rounds since probed bw */ -+ loss_round_start:1, /* loss_round_delivered round trip? */ -+ loss_in_round:1, /* loss marked in this round trip? */ -+ ecn_in_round:1, /* ECN marked in this round trip? */ -+ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ -+ loss_events_in_round:4,/* losses in STARTUP round */ -+ initialized:1; /* has bbr_init() been called? */ -+ u32 alpha_last_delivered; /* tp->delivered at alpha update */ -+ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ -+ -+ /* Params configurable using setsockopt. Refer to correspoding -+ * module param for detailed description of params. 
-+ */ -+ struct bbr_params { -+ u32 high_gain:11, /* max allowed value: 2047 */ -+ drain_gain:10, /* max allowed value: 1023 */ -+ cwnd_gain:11; /* max allowed value: 2047 */ -+ u32 cwnd_min_target:4, /* max allowed value: 15 */ -+ min_rtt_win_sec:5, /* max allowed value: 31 */ -+ probe_rtt_mode_ms:9, /* max allowed value: 511 */ -+ full_bw_cnt:3, /* max allowed value: 7 */ -+ cwnd_tso_budget:1, /* allowed values: {0, 1} */ -+ unused3:6, -+ drain_to_target:1, /* boolean */ -+ precise_ece_ack:1, /* boolean */ -+ extra_acked_in_startup:1, /* allowed values: {0, 1} */ -+ fast_path:1; /* boolean */ -+ u32 full_bw_thresh:10, /* max allowed value: 1023 */ -+ startup_cwnd_gain:11, /* max allowed value: 2047 */ -+ bw_probe_pif_gain:9, /* max allowed value: 511 */ -+ usage_based_cwnd:1, /* boolean */ -+ unused2:1; -+ u16 probe_rtt_win_ms:14, /* max allowed value: 16383 */ -+ refill_add_inc:2; /* max allowed value: 3 */ -+ u16 extra_acked_gain:11, /* max allowed value: 2047 */ -+ extra_acked_win_rtts:5; /* max allowed value: 31*/ -+ u16 pacing_gain[CYCLE_LEN]; /* max allowed value: 1023 */ -+ /* Mostly BBR v2 parameters below here: */ -+ u32 ecn_alpha_gain:8, /* max allowed value: 255 */ -+ ecn_factor:8, /* max allowed value: 255 */ -+ ecn_thresh:8, /* max allowed value: 255 */ -+ beta:8; /* max allowed value: 255 */ -+ u32 ecn_max_rtt_us:19, /* max allowed value: 524287 */ -+ bw_probe_reno_gain:9, /* max allowed value: 511 */ -+ full_loss_cnt:4; /* max allowed value: 15 */ -+ u32 probe_rtt_cwnd_gain:8, /* max allowed value: 255 */ -+ inflight_headroom:8, /* max allowed value: 255 */ -+ loss_thresh:8, /* max allowed value: 255 */ -+ bw_probe_max_rounds:8; /* max allowed value: 255 */ -+ u32 bw_probe_rand_rounds:4, /* max allowed value: 15 */ -+ bw_probe_base_us:26, /* usecs: 0..2^26-1 (67 secs) */ -+ full_ecn_cnt:2; /* max allowed value: 3 */ -+ u32 bw_probe_rand_us:26, /* usecs: 0..2^26-1 (67 secs) */ -+ undo:1, /* boolean */ -+ tso_rtt_shift:4, /* max allowed value: 15 */ -+ unused5:1; -+ u32 ecn_reprobe_gain:9, /* max allowed value: 511 */ -+ unused1:14, -+ ecn_alpha_init:9; /* max allowed value: 256 */ -+ } params; -+ -+ struct { -+ u32 snd_isn; /* Initial sequence number */ -+ u32 rs_bw; /* last valid rate sample bw */ -+ u32 target_cwnd; /* target cwnd, based on BDP */ -+ u8 undo:1, /* Undo even happened but not yet logged */ -+ unused:7; -+ char event; /* single-letter event debug codes */ -+ u16 unused2; -+ } debug; -+}; -+ -+struct bbr_context { -+ u32 sample_bw; -+ u32 target_cwnd; -+ u32 log:1; -+}; -+ -+/* Window length of min_rtt filter (in sec). Max allowed value is 31 (0x1F) */ -+static u32 bbr_min_rtt_win_sec = 10; -+/* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode. -+ * Max allowed value is 511 (0x1FF). -+ */ -+static u32 bbr_probe_rtt_mode_ms = 200; -+/* Window length of probe_rtt_min_us filter (in ms), and consequently the -+ * typical interval between PROBE_RTT mode entries. -+ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC -+ */ -+static u32 bbr_probe_rtt_win_ms = 5000; -+/* Skip TSO below the following bandwidth (bits/sec): */ -+static int bbr_min_tso_rate = 1200000; -+ -+/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting -+ * in bigger TSO bursts. By default we cut the RTT-based allowance in half -+ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance -+ * is below 1500 bytes after 6 * ~500 usec = 3ms. 
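/*
 * [Editor's check, not part of the patch] With the 64 KByte GSO ceiling
 * and the default shift of 9, the RTT-based budget is 64 KB / 2^r where
 * r = min_rtt_us / 512. It first drops below 1500 bytes at r = 6
 * (65536 >> 6 == 1024), i.e. once min_rtt reaches 6 * 512 us ~= 3 ms,
 * matching the comment above:
 */
static u32 rtt_burst_allowance(u32 min_rtt_us, u32 tso_rtt_shift)
{
	u32 r = min_rtt_us >> tso_rtt_shift;	/* number of halvings */

	return r < 32 ? 65536U >> r : 0;	/* guard an oversize shift */
}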
-+ */ -+static u32 bbr_tso_rtt_shift = 9; /* halve allowance per 2^9 usecs, 512us */ -+ -+/* Select cwnd TSO budget approach: -+ * 0: padding -+ * 1: flooring -+ */ -+static uint bbr_cwnd_tso_budget = 1; -+ -+/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. -+ * In order to help drive the network toward lower queues and low latency while -+ * maintaining high utilization, the average pacing rate aims to be slightly -+ * lower than the estimated bandwidth. This is an important aspect of the -+ * design. -+ */ -+static const int bbr_pacing_margin_percent = 1; -+ -+/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain -+ * that will allow a smoothly increasing pacing rate that will double each RTT -+ * and send the same number of packets per RTT that an un-paced, slow-starting -+ * Reno or CUBIC flow would. Max allowed value is 2047 (0x7FF). -+ */ -+static int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; -+/* The gain for deriving startup cwnd. Max allowed value is 2047 (0x7FF). */ -+static int bbr_startup_cwnd_gain = BBR_UNIT * 2885 / 1000 + 1; -+/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain -+ * the queue created in BBR_STARTUP in a single round. Max allowed value -+ * is 1023 (0x3FF). -+ */ -+static int bbr_drain_gain = BBR_UNIT * 1000 / 2885; -+/* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs. -+ * Max allowed value is 2047 (0x7FF). -+ */ -+static int bbr_cwnd_gain = BBR_UNIT * 2; -+/* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw. -+ * Max allowed value for each element is 1023 (0x3FF). -+ */ -+enum bbr_pacing_gain_phase { -+ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ -+ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ -+ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ -+ BBR_BW_PROBE_REFILL = 3, /* v2: refill the pipe again to 100% */ -+}; -+static int bbr_pacing_gain[] = { -+ BBR_UNIT * 5 / 4, /* probe for more available bw */ -+ BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ -+ BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ -+ BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ -+}; -+ -+/* Try to keep at least this many packets in flight, if things go smoothly. For -+ * smooth functioning, a sliding window protocol ACKing every other packet -+ * needs at least 4 packets in flight. Max allowed value is 15 (0xF). -+ */ -+static u32 bbr_cwnd_min_target = 4; -+ -+/* Cwnd to BDP proportion in PROBE_RTT mode scaled by BBR_UNIT. Default: 50%. -+ * Use 0 to disable. Max allowed value is 255. -+ */ -+static u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; -+ -+/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ -+/* If bw has increased significantly (1.25x), there may be more bw available. -+ * Max allowed value is 1023 (0x3FF). -+ */ -+static u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; -+/* But after 3 rounds w/o significant bw growth, estimate pipe is full. -+ * Max allowed value is 7 (0x7). -+ */ -+static u32 bbr_full_bw_cnt = 3; -+ -+static u32 bbr_flags; /* Debugging related stuff */ -+ -+/* Whether to debug using printk. -+ */ -+static bool bbr_debug_with_printk; -+ -+/* Whether to debug using ftrace event tcp:tcp_bbr_event. -+ * Ignored when bbr_debug_with_printk is set. -+ */ -+static bool bbr_debug_ftrace; -+ -+/* Experiment: each cycle, try to hold sub-unity gain until inflight <= BDP. 
*/ -+static bool bbr_drain_to_target = true; /* default: enabled */ -+ -+/* Experiment: Flags to control BBR with ECN behavior. -+ */ -+static bool bbr_precise_ece_ack = true; /* default: enabled */ -+ -+/* The max rwin scaling shift factor is 14 (RFC 1323), so the max sane rwin is -+ * (2^(16+14) B)/(1024 B/packet) = 1M packets. -+ */ -+static u32 bbr_cwnd_warn_val = 1U << 20; -+ -+static u16 bbr_debug_port_mask; -+ -+/* BBR module parameters. These are module parameters only in Google prod. -+ * Upstream these are intentionally not module parameters. -+ */ -+static int bbr_pacing_gain_size = CYCLE_LEN; -+ -+/* Gain factor for adding extra_acked to target cwnd: */ -+static int bbr_extra_acked_gain = 256; -+ -+/* Window length of extra_acked window. Max allowed val is 31. */ -+static u32 bbr_extra_acked_win_rtts = 5; -+ -+/* Max allowed val for ack_epoch_acked, after which sampling epoch is reset */ -+static u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; -+ -+/* Time period for clamping cwnd increment due to ack aggregation */ -+static u32 bbr_extra_acked_max_us = 100 * 1000; -+ -+/* Use extra acked in startup ? -+ * 0: disabled -+ * 1: use latest extra_acked value from 1-2 rtt in startup -+ */ -+static int bbr_extra_acked_in_startup = 1; /* default: enabled */ -+ -+/* Experiment: don't grow cwnd beyond twice of what we just probed. */ -+static bool bbr_usage_based_cwnd; /* default: disabled */ -+ -+/* For lab testing, researchers can enable BBRv2 ECN support with this flag, -+ * when they know that any ECN marks that the connections experience will be -+ * DCTCP/L4S-style ECN marks, rather than RFC3168 ECN marks. -+ * TODO(ncardwell): Production use of the BBRv2 ECN functionality depends on -+ * negotiation or configuration that is outside the scope of the BBRv2 -+ * alpha release. 
-+ */ -+static bool bbr_ecn_enable = false; -+ -+module_param_named(min_tso_rate, bbr_min_tso_rate, int, 0644); -+module_param_named(tso_rtt_shift, bbr_tso_rtt_shift, int, 0644); -+module_param_named(high_gain, bbr_high_gain, int, 0644); -+module_param_named(drain_gain, bbr_drain_gain, int, 0644); -+module_param_named(startup_cwnd_gain, bbr_startup_cwnd_gain, int, 0644); -+module_param_named(cwnd_gain, bbr_cwnd_gain, int, 0644); -+module_param_array_named(pacing_gain, bbr_pacing_gain, int, -+ &bbr_pacing_gain_size, 0644); -+module_param_named(cwnd_min_target, bbr_cwnd_min_target, uint, 0644); -+module_param_named(probe_rtt_cwnd_gain, -+ bbr_probe_rtt_cwnd_gain, uint, 0664); -+module_param_named(cwnd_warn_val, bbr_cwnd_warn_val, uint, 0664); -+module_param_named(debug_port_mask, bbr_debug_port_mask, ushort, 0644); -+module_param_named(flags, bbr_flags, uint, 0644); -+module_param_named(debug_ftrace, bbr_debug_ftrace, bool, 0644); -+module_param_named(debug_with_printk, bbr_debug_with_printk, bool, 0644); -+module_param_named(min_rtt_win_sec, bbr_min_rtt_win_sec, uint, 0644); -+module_param_named(probe_rtt_mode_ms, bbr_probe_rtt_mode_ms, uint, 0644); -+module_param_named(probe_rtt_win_ms, bbr_probe_rtt_win_ms, uint, 0644); -+module_param_named(full_bw_thresh, bbr_full_bw_thresh, uint, 0644); -+module_param_named(full_bw_cnt, bbr_full_bw_cnt, uint, 0644); -+module_param_named(cwnd_tso_bduget, bbr_cwnd_tso_budget, uint, 0664); -+module_param_named(extra_acked_gain, bbr_extra_acked_gain, int, 0664); -+module_param_named(extra_acked_win_rtts, -+ bbr_extra_acked_win_rtts, uint, 0664); -+module_param_named(extra_acked_max_us, -+ bbr_extra_acked_max_us, uint, 0664); -+module_param_named(ack_epoch_acked_reset_thresh, -+ bbr_ack_epoch_acked_reset_thresh, uint, 0664); -+module_param_named(drain_to_target, bbr_drain_to_target, bool, 0664); -+module_param_named(precise_ece_ack, bbr_precise_ece_ack, bool, 0664); -+module_param_named(extra_acked_in_startup, -+ bbr_extra_acked_in_startup, int, 0664); -+module_param_named(usage_based_cwnd, bbr_usage_based_cwnd, bool, 0664); -+module_param_named(ecn_enable, bbr_ecn_enable, bool, 0664); -+ -+static void bbr2_exit_probe_rtt(struct sock *sk); -+static void bbr2_reset_congestion_signals(struct sock *sk); -+ -+static void bbr_check_probe_rtt_done(struct sock *sk); -+ -+/* Do we estimate that STARTUP filled the pipe? */ -+static bool bbr_full_bw_reached(const struct sock *sk) -+{ -+ const struct bbr *bbr = inet_csk_ca(sk); -+ -+ return bbr->full_bw_reached; -+} -+ -+/* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ -+static u32 bbr_max_bw(const struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ return max(bbr->bw_hi[0], bbr->bw_hi[1]); -+} -+ -+/* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ -+static u32 bbr_bw(const struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ return min(bbr_max_bw(sk), bbr->bw_lo); -+} -+ -+/* Return maximum extra acked in past k-2k round trips, -+ * where k = bbr_extra_acked_win_rtts. -+ */ -+static u16 bbr_extra_acked(const struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ return max(bbr->extra_acked[0], bbr->extra_acked[1]); -+} -+ -+/* Return rate in bytes per second, optionally with a gain. -+ * The order here is chosen carefully to avoid overflow of u64. This should -+ * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. 
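/*
 * [Editor's check, not part of the patch] Rough verification of the
 * overflow claim above: at 2.9 Tbit/s with mss 1500, rate is about
 * 242 pkt/us, i.e. ~4.06e9 after the <<24 scaling. Then
 * rate * mss * gain >> BBR_SCALE, with gain = 2.89 * 256 ~= 740,
 * gives ~1.76e13, and the final multiply by USEC_PER_SEC * 99 / 100
 * (~9.9e5) lands at ~1.74e19, just under the u64 limit of ~1.84e19.
 * Hence the careful ordering of the multiplies below, and the stated
 * bound of 2.9 Tbit/s at 2.89x gain.
 */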
-+ */ -+static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, -+ int margin) -+{ -+ unsigned int mss = tcp_sk(sk)->mss_cache; -+ -+ rate *= mss; -+ rate *= gain; -+ rate >>= BBR_SCALE; -+ rate *= USEC_PER_SEC / 100 * (100 - margin); -+ rate >>= BW_SCALE; -+ rate = max(rate, 1ULL); -+ return rate; -+} -+ -+static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) -+{ -+ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); -+} -+ -+static u64 bbr_rate_kbps(struct sock *sk, u64 rate) -+{ -+ rate = bbr_bw_bytes_per_sec(sk, rate); -+ rate *= 8; -+ do_div(rate, 1000); -+ return rate; -+} -+ -+static u32 bbr_tso_segs_goal(struct sock *sk); -+static void bbr_debug(struct sock *sk, u32 acked, -+ const struct rate_sample *rs, struct bbr_context *ctx) -+{ -+ static const char ca_states[] = { -+ [TCP_CA_Open] = 'O', -+ [TCP_CA_Disorder] = 'D', -+ [TCP_CA_CWR] = 'C', -+ [TCP_CA_Recovery] = 'R', -+ [TCP_CA_Loss] = 'L', -+ }; -+ static const char mode[] = { -+ 'G', /* Growing - BBR_STARTUP */ -+ 'D', /* Drain - BBR_DRAIN */ -+ 'W', /* Window - BBR_PROBE_BW */ -+ 'M', /* Min RTT - BBR_PROBE_RTT */ -+ }; -+ static const char ack_phase[] = { /* bbr_ack_phase strings */ -+ 'I', /* BBR_ACKS_INIT - 'Init' */ -+ 'R', /* BBR_ACKS_REFILLING - 'Refilling' */ -+ 'B', /* BBR_ACKS_PROBE_STARTING - 'Before' */ -+ 'F', /* BBR_ACKS_PROBE_FEEDBACK - 'Feedback' */ -+ 'A', /* BBR_ACKS_PROBE_STOPPING - 'After' */ -+ }; -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ const u32 una = tp->snd_una - bbr->debug.snd_isn; -+ const u32 fack = tcp_highest_sack_seq(tp); -+ const u16 dport = ntohs(inet_sk(sk)->inet_dport); -+ bool is_port_match = (bbr_debug_port_mask && -+ ((dport & bbr_debug_port_mask) == 0)); -+ char debugmsg[320]; -+ -+ if (sk->sk_state == TCP_SYN_SENT) -+ return; /* no bbr_init() yet if SYN retransmit -> CA_Loss */ -+ -+ if (!tp->snd_cwnd || tp->snd_cwnd > bbr_cwnd_warn_val) { -+ char addr[INET6_ADDRSTRLEN + 10] = { 0 }; -+ -+ if (sk->sk_family == AF_INET) -+ snprintf(addr, sizeof(addr), "%pI4:%u", -+ &inet_sk(sk)->inet_daddr, dport); -+ else if (sk->sk_family == AF_INET6) -+ snprintf(addr, sizeof(addr), "%pI6:%u", -+ &sk->sk_daddr, dport); -+ -+ WARN_ONCE(1, -+ "BBR %s cwnd alert: %u " -+ "snd_una: %u ca: %d pacing_gain: %u cwnd_gain: %u " -+ "bw: %u rtt: %u min_rtt: %u " -+ "acked: %u tso_segs: %u " -+ "bw: %d %ld %d pif: %u\n", -+ addr, tp->snd_cwnd, -+ una, inet_csk(sk)->icsk_ca_state, -+ bbr->pacing_gain, bbr->cwnd_gain, -+ bbr_max_bw(sk), (tp->srtt_us >> 3), bbr->min_rtt_us, -+ acked, bbr_tso_segs_goal(sk), -+ rs->delivered, rs->interval_us, rs->is_retrans, -+ tcp_packets_in_flight(tp)); -+ } -+ -+ if (likely(!bbr_debug_with_printk && !bbr_debug_ftrace)) -+ return; -+ -+ if (!sock_flag(sk, SOCK_DBG) && !is_port_match) -+ return; -+ -+ if (!ctx->log && !tp->app_limited && !(bbr_flags & FLAG_DEBUG_VERBOSE)) -+ return; -+ -+ if (ipv4_is_loopback(inet_sk(sk)->inet_daddr) && -+ !(bbr_flags & FLAG_DEBUG_LOOPBACK)) -+ return; -+ -+ snprintf(debugmsg, sizeof(debugmsg) - 1, -+ "BBR %pI4:%-5u %5u,%03u:%-7u %c " -+ "%c %2u br %2u cr %2d rtt %5ld d %2d i %5ld mrtt %d %cbw %llu " -+ "bw %llu lb %llu ib %llu qb %llu " -+ "a %u if %2u %c %c dl %u l %u al %u # %u t %u %c %c " -+ "lr %d er %d ea %d bwl %lld il %d ih %d c %d " -+ "v %d %c %u %c %s\n", -+ &inet_sk(sk)->inet_daddr, dport, -+ una / 1000, una % 1000, fack - tp->snd_una, -+ ca_states[inet_csk(sk)->icsk_ca_state], -+ bbr->debug.undo ? 
'@' : mode[bbr->mode], -+ tp->snd_cwnd, -+ bbr_extra_acked(sk), /* br (legacy): extra_acked */ -+ rs->tx_in_flight, /* cr (legacy): tx_inflight */ -+ rs->rtt_us, -+ rs->delivered, -+ rs->interval_us, -+ bbr->min_rtt_us, -+ rs->is_app_limited ? '_' : 'l', -+ bbr_rate_kbps(sk, ctx->sample_bw), /* lbw: latest sample bw */ -+ bbr_rate_kbps(sk, bbr_max_bw(sk)), /* bw: max bw */ -+ 0ULL, /* lb: [obsolete] */ -+ 0ULL, /* ib: [obsolete] */ -+ (u64)sk->sk_pacing_rate * 8 / 1000, -+ acked, -+ tcp_packets_in_flight(tp), -+ rs->is_ack_delayed ? 'd' : '.', -+ bbr->round_start ? '*' : '.', -+ tp->delivered, tp->lost, -+ tp->app_limited, -+ 0, /* #: [obsolete] */ -+ ctx->target_cwnd, -+ tp->reord_seen ? 'r' : '.', /* r: reordering seen? */ -+ ca_states[bbr->prev_ca_state], -+ (rs->lost + rs->delivered) > 0 ? -+ (1000 * rs->lost / -+ (rs->lost + rs->delivered)) : 0, /* lr: loss rate x1000 */ -+ (rs->delivered) > 0 ? -+ (1000 * rs->delivered_ce / -+ (rs->delivered)) : 0, /* er: ECN rate x1000 */ -+ 1000 * bbr->ecn_alpha >> BBR_SCALE, /* ea: ECN alpha x1000 */ -+ bbr->bw_lo == ~0U ? -+ -1 : (s64)bbr_rate_kbps(sk, bbr->bw_lo), /* bwl */ -+ bbr->inflight_lo, /* il */ -+ bbr->inflight_hi, /* ih */ -+ bbr->bw_probe_up_cnt, /* c */ -+ 2, /* v: version */ -+ bbr->debug.event, -+ bbr->cycle_idx, -+ ack_phase[bbr->ack_phase], -+ bbr->bw_probe_samples ? "Y" : "N"); -+ debugmsg[sizeof(debugmsg) - 1] = 0; -+ -+ /* printk takes a higher precedence. */ -+ if (bbr_debug_with_printk) -+ printk(KERN_DEBUG "%s", debugmsg); -+ -+ if (unlikely(bbr->debug.undo)) -+ bbr->debug.undo = 0; -+} -+ -+/* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ -+static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) -+{ -+ u64 rate = bw; -+ -+ rate = bbr_rate_bytes_per_sec(sk, rate, gain, -+ bbr_pacing_margin_percent); -+ rate = min_t(u64, rate, sk->sk_max_pacing_rate); -+ return rate; -+} -+ -+/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ -+static void bbr_init_pacing_rate_from_rtt(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ u64 bw; -+ u32 rtt_us; -+ -+ if (tp->srtt_us) { /* any RTT sample yet? */ -+ rtt_us = max(tp->srtt_us >> 3, 1U); -+ bbr->has_seen_rtt = 1; -+ } else { /* no RTT sample yet */ -+ rtt_us = USEC_PER_MSEC; /* use nominal default RTT */ -+ } -+ bw = (u64)tp->snd_cwnd * BW_UNIT; -+ do_div(bw, rtt_us); -+ sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr->params.high_gain); -+} -+ -+/* Pace using current bw estimate and a gain factor. */ -+static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain); -+ -+ if (unlikely(!bbr->has_seen_rtt && tp->srtt_us)) -+ bbr_init_pacing_rate_from_rtt(sk); -+ if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate) -+ sk->sk_pacing_rate = rate; -+} -+ -+static u32 bbr_min_tso_segs(struct sock *sk) -+{ -+ return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; -+} -+ -+/* Return the number of segments BBR would like in a TSO/GSO skb, given -+ * a particular max gso size as a constraint. -+ */ -+static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, -+ u32 gso_max_size) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 segs, r; -+ u64 bytes; -+ -+ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). 
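/*
 * [Editor's illustration, not part of the patch] Worked numbers for
 * bbr_init_pacing_rate_from_rtt() above, ignoring the 1% pacing margin:
 * with snd_cwnd = 10, mss = 1448 and srtt = 10 ms, bw is 10 pkt per
 * 10000 us, and the initial rate is high_gain (~2.885) times that:
 * 2.885 * 1000 pkt/s * 1448 B ~= 4.2 MB/s (~33 Mbit/s).
 */
static u64 init_pacing_bytes_per_sec(u32 cwnd, u32 mss, u32 srtt_us)
{
	u64 pkts_per_sec = div_u64((u64)cwnd * USEC_PER_SEC, srtt_us);

	return pkts_per_sec * mss * 2885 / 1000;	/* gain ~= 2/ln(2) */
}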
*/ -+ bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift; -+ -+ /* Budget a TSO/GSO burst size allowance based on min_rtt. For every -+ * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. -+ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) -+ */ -+ if (bbr->params.tso_rtt_shift) { -+ r = bbr->min_rtt_us >> bbr->params.tso_rtt_shift; -+ if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ -+ bytes += GSO_MAX_SIZE >> r; -+ } -+ -+ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); -+ segs = max_t(u32, bytes / mss_now, bbr_min_tso_segs(sk)); -+ return segs; -+} -+ -+/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ -+static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) -+{ -+ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); -+} -+ -+/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */ -+static u32 bbr_tso_segs_goal(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ -+ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_MAX_SIZE); -+} -+ -+/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ -+static void bbr_save_cwnd(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT) -+ bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */ -+ else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */ -+ bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd); -+} -+ -+static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (event == CA_EVENT_TX_START && tp->app_limited) { -+ bbr->idle_restart = 1; -+ bbr->ack_epoch_mstamp = tp->tcp_mstamp; -+ bbr->ack_epoch_acked = 0; -+ /* Avoid pointless buffer overflows: pace at est. bw if we don't -+ * need more speed (we're restarting from idle and app-limited). -+ */ -+ if (bbr->mode == BBR_PROBE_BW) -+ bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); -+ else if (bbr->mode == BBR_PROBE_RTT) -+ bbr_check_probe_rtt_done(sk); -+ } else if ((event == CA_EVENT_ECN_IS_CE || -+ event == CA_EVENT_ECN_NO_CE) && -+ bbr_ecn_enable && -+ bbr->params.precise_ece_ack) { -+ u32 state = bbr->ce_state; -+ dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); -+ bbr->ce_state = state; -+ if (tp->fast_ack_mode == 2 && event == CA_EVENT_ECN_IS_CE) -+ tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); -+ } -+} -+ -+/* Calculate bdp based on min RTT and the estimated bottleneck bandwidth: -+ * -+ * bdp = ceil(bw * min_rtt * gain) -+ * -+ * The key factor, gain, controls the amount of queue. While a small gain -+ * builds a smaller queue, it becomes more vulnerable to noise in RTT -+ * measurements (e.g., delayed ACKs or other ACK compression effects). This -+ * noise may cause BBR to under-estimate the rate. -+ */ -+static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 bdp; -+ u64 w; -+ -+ /* If we've never had a valid RTT sample, cap cwnd at the initial -+ * default. This should only happen when the connection is not using TCP -+ * timestamps and has retransmitted all of the SYN/SYNACK/data packets -+ * ACKed so far. In this case, an RTO can cut cwnd to 1, in which -+ * case we need to slow-start up toward something safe: initial cwnd. -+ */ -+ if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? 
*/ -+ return bbr->init_cwnd; /* be safe: cap at initial cwnd */ -+ -+ w = (u64)bw * bbr->min_rtt_us; -+ -+ /* Apply a gain to the given value, remove the BW_SCALE shift, and -+ * round the value up to avoid a negative feedback loop. -+ */ -+ bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; -+ -+ return bdp; -+} -+ -+/* To achieve full performance in high-speed paths, we budget enough cwnd to -+ * fit full-sized skbs in-flight on both end hosts to fully utilize the path: -+ * - one skb in sending host Qdisc, -+ * - one skb in sending host TSO/GSO engine -+ * - one skb being received by receiver host LRO/GRO/delayed-ACK engine -+ * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because -+ * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, -+ * which allows 2 outstanding 2-packet sequences, to try to keep pipe -+ * full even with ACK-every-other-packet delayed ACKs. -+ */ -+static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 tso_segs_goal; -+ -+ tso_segs_goal = 3 * bbr_tso_segs_goal(sk); -+ -+ /* Allow enough full-sized skbs in flight to utilize end systems. */ -+ if (bbr->params.cwnd_tso_budget == 1) { -+ cwnd = max_t(u32, cwnd, tso_segs_goal); -+ cwnd = max_t(u32, cwnd, bbr->params.cwnd_min_target); -+ } else { -+ cwnd += tso_segs_goal; -+ cwnd = (cwnd + 1) & ~1U; -+ } -+ /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ -+ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) -+ cwnd += 2; -+ -+ return cwnd; -+} -+ -+/* Find inflight based on min RTT and the estimated bottleneck bandwidth. */ -+static u32 bbr_inflight(struct sock *sk, u32 bw, int gain) -+{ -+ u32 inflight; -+ -+ inflight = bbr_bdp(sk, bw, gain); -+ inflight = bbr_quantization_budget(sk, inflight); -+ -+ return inflight; -+} -+ -+/* With pacing at lower layers, there's often less data "in the network" than -+ * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq), -+ * we often have several skbs queued in the pacing layer with a pre-scheduled -+ * earliest departure time (EDT). BBR adapts its pacing rate based on the -+ * inflight level that it estimates has already been "baked in" by previous -+ * departure time decisions. We calculate a rough estimate of the number of our -+ * packets that might be in the network at the earliest departure time for the -+ * next skb scheduled: -+ * in_network_at_edt = inflight_at_edt - (EDT - now) * bw -+ * If we're increasing inflight, then we want to know if the transmit of the -+ * EDT skb will push inflight above the target, so inflight_at_edt includes -+ * bbr_tso_segs_goal() from the skb departing at EDT. If decreasing inflight, -+ * then estimate if inflight will sink too low just before the EDT transmit. 
-+ */ -+static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ u64 now_ns, edt_ns, interval_us; -+ u32 interval_delivered, inflight_at_edt; -+ -+ now_ns = tp->tcp_clock_cache; -+ edt_ns = max(tp->tcp_wstamp_ns, now_ns); -+ interval_us = div_u64(edt_ns - now_ns, NSEC_PER_USEC); -+ interval_delivered = (u64)bbr_bw(sk) * interval_us >> BW_SCALE; -+ inflight_at_edt = inflight_now; -+ if (bbr->pacing_gain > BBR_UNIT) /* increasing inflight */ -+ inflight_at_edt += bbr_tso_segs_goal(sk); /* include EDT skb */ -+ if (interval_delivered >= inflight_at_edt) -+ return 0; -+ return inflight_at_edt - interval_delivered; -+} -+ -+/* Find the cwnd increment based on estimate of ack aggregation */ -+static u32 bbr_ack_aggregation_cwnd(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 max_aggr_cwnd, aggr_cwnd = 0; -+ -+ if (bbr->params.extra_acked_gain && -+ (bbr_full_bw_reached(sk) || bbr->params.extra_acked_in_startup)) { -+ max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) -+ / BW_UNIT; -+ aggr_cwnd = (bbr->params.extra_acked_gain * bbr_extra_acked(sk)) -+ >> BBR_SCALE; -+ aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); -+ } -+ -+ return aggr_cwnd; -+} -+ -+/* Returns the cwnd for PROBE_RTT mode. */ -+static u32 bbr_probe_rtt_cwnd(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (bbr->params.probe_rtt_cwnd_gain == 0) -+ return bbr->params.cwnd_min_target; -+ return max_t(u32, bbr->params.cwnd_min_target, -+ bbr_bdp(sk, bbr_bw(sk), bbr->params.probe_rtt_cwnd_gain)); -+} -+ -+/* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss -+ * has drawn us down below target), or snap down to target if we're above it. -+ */ -+static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, -+ u32 acked, u32 bw, int gain, u32 cwnd, -+ struct bbr_context *ctx) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 target_cwnd = 0, prev_cwnd = tp->snd_cwnd, max_probe; -+ -+ if (!acked) -+ goto done; /* no packet fully ACKed; just apply caps */ -+ -+ target_cwnd = bbr_bdp(sk, bw, gain); -+ -+ /* Increment the cwnd to account for excess ACKed data that seems -+ * due to aggregation (of data and/or ACKs) visible in the ACK stream. -+ */ -+ target_cwnd += bbr_ack_aggregation_cwnd(sk); -+ target_cwnd = bbr_quantization_budget(sk, target_cwnd); -+ -+ /* If we're below target cwnd, slow start cwnd toward target cwnd. */ -+ bbr->debug.target_cwnd = target_cwnd; -+ -+ /* Update cwnd and enable fast path if cwnd reaches target_cwnd. */ -+ bbr->try_fast_path = 0; -+ if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ -+ cwnd += acked; -+ if (cwnd >= target_cwnd) { -+ cwnd = target_cwnd; -+ bbr->try_fast_path = 1; -+ } -+ } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) { -+ cwnd += acked; -+ } else { -+ bbr->try_fast_path = 1; -+ } -+ -+ /* When growing cwnd, don't grow beyond twice what we just probed. 
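-+ * For instance, with illustrative values: if max_packets_out is 10 and -+ * snd_cwnd is 15, growth is capped at max(2 * 10, 15) = 20 packets.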
*/ -+ if (bbr->params.usage_based_cwnd) { -+ max_probe = max(2 * tp->max_packets_out, tp->snd_cwnd); -+ cwnd = min(cwnd, max_probe); -+ } -+ -+ cwnd = max_t(u32, cwnd, bbr->params.cwnd_min_target); -+done: -+ tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */ -+ if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ -+ tp->snd_cwnd = min_t(u32, tp->snd_cwnd, bbr_probe_rtt_cwnd(sk)); -+ -+ ctx->target_cwnd = target_cwnd; -+ ctx->log = (tp->snd_cwnd != prev_cwnd); -+} -+ -+/* See if we have reached next round trip */ -+static void bbr_update_round_start(struct sock *sk, -+ const struct rate_sample *rs, struct bbr_context *ctx) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr->round_start = 0; -+ -+ /* See if we've reached the next RTT */ -+ if (rs->interval_us > 0 && -+ !before(rs->prior_delivered, bbr->next_rtt_delivered)) { -+ bbr->next_rtt_delivered = tp->delivered; -+ bbr->round_start = 1; -+ } -+} -+ -+/* Calculate the bandwidth based on how fast packets are delivered */ -+static void bbr_calculate_bw_sample(struct sock *sk, -+ const struct rate_sample *rs, struct bbr_context *ctx) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u64 bw = 0; -+ -+ /* Divide delivered by the interval to find a (lower bound) bottleneck -+ * bandwidth sample. Delivered is in packets and interval_us in uS and -+ * ratio will be <<1 for most connections. So delivered is first scaled. -+ * Round up to allow growth at low rates, even with integer division. -+ */ -+ if (rs->interval_us > 0) { -+ if (WARN_ONCE(rs->delivered < 0, -+ "negative delivered: %d interval_us: %ld\n", -+ rs->delivered, rs->interval_us)) -+ return; -+ -+ bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); -+ } -+ -+ ctx->sample_bw = bw; -+ bbr->debug.rs_bw = bw; -+} -+ -+/* Estimates the windowed max degree of ack aggregation. -+ * This is used to provision extra in-flight data to keep sending during -+ * inter-ACK silences. -+ * -+ * Degree of ack aggregation is estimated as extra data acked beyond expected. -+ * -+ * max_extra_acked = "maximum recent excess data ACKed beyond max_bw * interval" -+ * cwnd += max_extra_acked -+ * -+ * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). -+ * Max filter is an approximate sliding window of 5-10 (packet timed) round -+ * trips for non-startup phase, and 1-2 round trips for startup. -+ */ -+static void bbr_update_ack_aggregation(struct sock *sk, -+ const struct rate_sample *rs) -+{ -+ u32 epoch_us, expected_acked, extra_acked; -+ struct bbr *bbr = inet_csk_ca(sk); -+ struct tcp_sock *tp = tcp_sk(sk); -+ u32 extra_acked_win_rtts_thresh = bbr->params.extra_acked_win_rtts; -+ -+ if (!bbr->params.extra_acked_gain || rs->acked_sacked <= 0 || -+ rs->delivered < 0 || rs->interval_us <= 0) -+ return; -+ -+ if (bbr->round_start) { -+ bbr->extra_acked_win_rtts = min(0x1F, -+ bbr->extra_acked_win_rtts + 1); -+ if (bbr->params.extra_acked_in_startup && -+ !bbr_full_bw_reached(sk)) -+ extra_acked_win_rtts_thresh = 1; -+ if (bbr->extra_acked_win_rtts >= -+ extra_acked_win_rtts_thresh) { -+ bbr->extra_acked_win_rtts = 0; -+ bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? -+ 0 : 1; -+ bbr->extra_acked[bbr->extra_acked_win_idx] = 0; -+ } -+ } -+ -+ /* Compute how many packets we expected to be delivered over epoch. 
*/ -+ epoch_us = tcp_stamp_us_delta(tp->delivered_mstamp, -+ bbr->ack_epoch_mstamp); -+ expected_acked = ((u64)bbr_bw(sk) * epoch_us) / BW_UNIT; -+ -+ /* Reset the aggregation epoch if ACK rate is below expected rate or -+ * a significantly large number of ACKs has been received since the -+ * epoch began (i.e. the epoch is potentially quite old). -+ */ -+ if (bbr->ack_epoch_acked <= expected_acked || -+ (bbr->ack_epoch_acked + rs->acked_sacked >= -+ bbr_ack_epoch_acked_reset_thresh)) { -+ bbr->ack_epoch_acked = 0; -+ bbr->ack_epoch_mstamp = tp->delivered_mstamp; -+ expected_acked = 0; -+ } -+ -+ /* Compute excess data delivered, beyond what was expected. */ -+ bbr->ack_epoch_acked = min_t(u32, 0xFFFFF, -+ bbr->ack_epoch_acked + rs->acked_sacked); -+ extra_acked = bbr->ack_epoch_acked - expected_acked; -+ extra_acked = min(extra_acked, tp->snd_cwnd); -+ if (extra_acked > bbr->extra_acked[bbr->extra_acked_win_idx]) -+ bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; -+} -+ -+/* Estimate when the pipe is full, using the change in delivery rate: BBR -+ * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by -+ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited -+ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the -+ * higher rwin, 3: we get higher delivery rate samples. Or transient -+ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar -+ * design goal, but uses delay and inter-ACK spacing instead of bandwidth. -+ */ -+static void bbr_check_full_bw_reached(struct sock *sk, -+ const struct rate_sample *rs) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 bw_thresh; -+ -+ if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) -+ return; -+ -+ bw_thresh = (u64)bbr->full_bw * bbr->params.full_bw_thresh >> BBR_SCALE; -+ if (bbr_max_bw(sk) >= bw_thresh) { -+ bbr->full_bw = bbr_max_bw(sk); -+ bbr->full_bw_cnt = 0; -+ return; -+ } -+ ++bbr->full_bw_cnt; -+ bbr->full_bw_reached = bbr->full_bw_cnt >= bbr->params.full_bw_cnt; -+} -+ -+/* If pipe is probably full, drain the queue and then enter steady-state. */ -+static bool bbr_check_drain(struct sock *sk, const struct rate_sample *rs, -+ struct bbr_context *ctx) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { -+ bbr->mode = BBR_DRAIN; /* drain queue we created */ -+ tcp_sk(sk)->snd_ssthresh = -+ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); -+ bbr2_reset_congestion_signals(sk); -+ } /* fall through to check if in-flight is already small: */ -+ if (bbr->mode == BBR_DRAIN && -+ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= -+ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) -+ return true; /* exiting DRAIN now */ -+ return false; -+} -+ -+static void bbr_check_probe_rtt_done(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (!(bbr->probe_rtt_done_stamp && -+ after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) -+ return; -+ -+ bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ -+ tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd); -+ bbr2_exit_probe_rtt(sk); -+} -+ -+/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and -+ * periodically drain the bottleneck queue, to converge to measure the true -+ * min_rtt (unloaded propagation delay). This allows the flows to keep queues -+ * small (reducing queuing delay and packet loss) and achieve fairness among -+ * BBR flows. 
-+ * -+ * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires, -+ * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets. -+ * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed -+ * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and -+ * re-enter the previous mode. BBR uses 200ms to approximately bound the -+ * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s). -+ * -+ * Note that flows need only pay 2% if they are busy sending over the last 10 -+ * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have -+ * natural silences or low-rate periods within 10 seconds where the rate is low -+ * enough for long enough to drain its queue in the bottleneck. We pick up -+ * these min RTT measurements opportunistically with our min_rtt filter. :-) -+ */ -+static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ bool probe_rtt_expired, min_rtt_expired; -+ u32 expire; -+ -+ /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */ -+ expire = bbr->probe_rtt_min_stamp + -+ msecs_to_jiffies(bbr->params.probe_rtt_win_ms); -+ probe_rtt_expired = after(tcp_jiffies32, expire); -+ if (rs->rtt_us >= 0 && -+ (rs->rtt_us <= bbr->probe_rtt_min_us || -+ (probe_rtt_expired && !rs->is_ack_delayed))) { -+ bbr->probe_rtt_min_us = rs->rtt_us; -+ bbr->probe_rtt_min_stamp = tcp_jiffies32; -+ } -+ /* Track min RTT seen in the min_rtt_win_sec filter window: */ -+ expire = bbr->min_rtt_stamp + bbr->params.min_rtt_win_sec * HZ; -+ min_rtt_expired = after(tcp_jiffies32, expire); -+ if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || -+ min_rtt_expired) { -+ bbr->min_rtt_us = bbr->probe_rtt_min_us; -+ bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; -+ } -+ -+ if (bbr->params.probe_rtt_mode_ms > 0 && probe_rtt_expired && -+ !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { -+ bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ -+ bbr_save_cwnd(sk); /* note cwnd so we can restore it */ -+ bbr->probe_rtt_done_stamp = 0; -+ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; -+ bbr->next_rtt_delivered = tp->delivered; -+ } -+ -+ if (bbr->mode == BBR_PROBE_RTT) { -+ /* Ignore low rate samples during this mode. */ -+ tp->app_limited = -+ (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; -+ /* Maintain min packets in flight for max(200 ms, 1 round). 
*/ -+ if (!bbr->probe_rtt_done_stamp && -+ tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { -+ bbr->probe_rtt_done_stamp = tcp_jiffies32 + -+ msecs_to_jiffies(bbr->params.probe_rtt_mode_ms); -+ bbr->probe_rtt_round_done = 0; -+ bbr->next_rtt_delivered = tp->delivered; -+ } else if (bbr->probe_rtt_done_stamp) { -+ if (bbr->round_start) -+ bbr->probe_rtt_round_done = 1; -+ if (bbr->probe_rtt_round_done) -+ bbr_check_probe_rtt_done(sk); -+ } -+ } -+ /* Restart after idle ends only once we process a new S/ACK for data */ -+ if (rs->delivered > 0) -+ bbr->idle_restart = 0; -+} -+ -+static void bbr_update_gains(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ switch (bbr->mode) { -+ case BBR_STARTUP: -+ bbr->pacing_gain = bbr->params.high_gain; -+ bbr->cwnd_gain = bbr->params.startup_cwnd_gain; -+ break; -+ case BBR_DRAIN: -+ bbr->pacing_gain = bbr->params.drain_gain; /* slow, to drain */ -+ bbr->cwnd_gain = bbr->params.startup_cwnd_gain; /* keep cwnd */ -+ break; -+ case BBR_PROBE_BW: -+ bbr->pacing_gain = bbr->params.pacing_gain[bbr->cycle_idx]; -+ bbr->cwnd_gain = bbr->params.cwnd_gain; -+ break; -+ case BBR_PROBE_RTT: -+ bbr->pacing_gain = BBR_UNIT; -+ bbr->cwnd_gain = BBR_UNIT; -+ break; -+ default: -+ WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode); -+ break; -+ } -+} -+ -+static void bbr_init(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ int i; -+ -+ WARN_ON_ONCE(tp->snd_cwnd >= bbr_cwnd_warn_val); -+ -+ bbr->initialized = 1; -+ bbr->params.high_gain = min(0x7FF, bbr_high_gain); -+ bbr->params.drain_gain = min(0x3FF, bbr_drain_gain); -+ bbr->params.startup_cwnd_gain = min(0x7FF, bbr_startup_cwnd_gain); -+ bbr->params.cwnd_gain = min(0x7FF, bbr_cwnd_gain); -+ bbr->params.cwnd_tso_budget = min(0x1U, bbr_cwnd_tso_budget); -+ bbr->params.cwnd_min_target = min(0xFU, bbr_cwnd_min_target); -+ bbr->params.min_rtt_win_sec = min(0x1FU, bbr_min_rtt_win_sec); -+ bbr->params.probe_rtt_mode_ms = min(0x1FFU, bbr_probe_rtt_mode_ms); -+ bbr->params.full_bw_cnt = min(0x7U, bbr_full_bw_cnt); -+ bbr->params.full_bw_thresh = min(0x3FFU, bbr_full_bw_thresh); -+ bbr->params.extra_acked_gain = min(0x7FF, bbr_extra_acked_gain); -+ bbr->params.extra_acked_win_rtts = min(0x1FU, bbr_extra_acked_win_rtts); -+ bbr->params.drain_to_target = bbr_drain_to_target ? 1 : 0; -+ bbr->params.precise_ece_ack = bbr_precise_ece_ack ? 1 : 0; -+ bbr->params.extra_acked_in_startup = bbr_extra_acked_in_startup ? 1 : 0; -+ bbr->params.probe_rtt_cwnd_gain = min(0xFFU, bbr_probe_rtt_cwnd_gain); -+ bbr->params.probe_rtt_win_ms = -+ min(0x3FFFU, -+ min_t(u32, bbr_probe_rtt_win_ms, -+ bbr->params.min_rtt_win_sec * MSEC_PER_SEC)); -+ for (i = 0; i < CYCLE_LEN; i++) -+ bbr->params.pacing_gain[i] = min(0x3FF, bbr_pacing_gain[i]); -+ bbr->params.usage_based_cwnd = bbr_usage_based_cwnd ? 
1 : 0; -+ bbr->params.tso_rtt_shift = min(0xFU, bbr_tso_rtt_shift); -+ -+ bbr->debug.snd_isn = tp->snd_una; -+ bbr->debug.target_cwnd = 0; -+ bbr->debug.undo = 0; -+ -+ bbr->init_cwnd = min(0x7FU, tp->snd_cwnd); -+ bbr->prior_cwnd = tp->prior_cwnd; -+ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; -+ bbr->next_rtt_delivered = 0; -+ bbr->prev_ca_state = TCP_CA_Open; -+ bbr->packet_conservation = 0; -+ -+ bbr->probe_rtt_done_stamp = 0; -+ bbr->probe_rtt_round_done = 0; -+ bbr->probe_rtt_min_us = tcp_min_rtt(tp); -+ bbr->probe_rtt_min_stamp = tcp_jiffies32; -+ bbr->min_rtt_us = tcp_min_rtt(tp); -+ bbr->min_rtt_stamp = tcp_jiffies32; -+ -+ bbr->has_seen_rtt = 0; -+ bbr_init_pacing_rate_from_rtt(sk); -+ -+ bbr->round_start = 0; -+ bbr->idle_restart = 0; -+ bbr->full_bw_reached = 0; -+ bbr->full_bw = 0; -+ bbr->full_bw_cnt = 0; -+ bbr->cycle_mstamp = 0; -+ bbr->cycle_idx = 0; -+ bbr->mode = BBR_STARTUP; -+ bbr->debug.rs_bw = 0; -+ -+ bbr->ack_epoch_mstamp = tp->tcp_mstamp; -+ bbr->ack_epoch_acked = 0; -+ bbr->extra_acked_win_rtts = 0; -+ bbr->extra_acked_win_idx = 0; -+ bbr->extra_acked[0] = 0; -+ bbr->extra_acked[1] = 0; -+ -+ bbr->ce_state = 0; -+ bbr->prior_rcv_nxt = tp->rcv_nxt; -+ bbr->try_fast_path = 0; -+ -+ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); -+} -+ -+static u32 bbr_sndbuf_expand(struct sock *sk) -+{ -+ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ -+ return 3; -+} -+ -+/* __________________________________________________________________________ -+ * -+ * Functions new to BBR v2 ("bbr") congestion control are below here. -+ * __________________________________________________________________________ -+ */ -+ -+/* Incorporate a new bw sample into the current window of our max filter. */ -+static void bbr2_take_bw_hi_sample(struct sock *sk, u32 bw) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); -+} -+ -+/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ -+static void bbr2_advance_bw_hi_filter(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (!bbr->bw_hi[1]) -+ return; /* no samples in this window; remember old window */ -+ bbr->bw_hi[0] = bbr->bw_hi[1]; -+ bbr->bw_hi[1] = 0; -+} -+ -+/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ -+static u32 bbr2_target_inflight(struct sock *sk) -+{ -+ u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); -+ -+ return min(bdp, tcp_sk(sk)->snd_cwnd); -+} -+ -+static bool bbr2_is_probing_bandwidth(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ return (bbr->mode == BBR_STARTUP) || -+ (bbr->mode == BBR_PROBE_BW && -+ (bbr->cycle_idx == BBR_BW_PROBE_REFILL || -+ bbr->cycle_idx == BBR_BW_PROBE_UP)); -+} -+ -+/* Has the given amount of time elapsed since we marked the phase start? */ -+static bool bbr2_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) -+{ -+ const struct tcp_sock *tp = tcp_sk(sk); -+ const struct bbr *bbr = inet_csk_ca(sk); -+ -+ return tcp_stamp_us_delta(tp->tcp_mstamp, -+ bbr->cycle_mstamp + interval_us) > 0; -+} -+ -+static void bbr2_handle_queue_too_high_in_startup(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr->full_bw_reached = 1; -+ bbr->inflight_hi = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); -+} -+ -+/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. 
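-+ * For example, with the defaults declared later in this file (ecn_thresh = -+ * BBR_UNIT/2, full_ecn_cnt = 2): for an ECN-eligible flow, two consecutive -+ * rounds in which at least half of the delivered packets are CE-marked end -+ * STARTUP and cap inflight_hi at the current estimated BDP (bbr_inflight() -+ * at BBR_UNIT gain).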
*/ -+static void bbr2_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || -+ !bbr->params.full_ecn_cnt || !bbr->params.ecn_thresh) -+ return; -+ -+ if (ce_ratio >= bbr->params.ecn_thresh) -+ bbr->startup_ecn_rounds++; -+ else -+ bbr->startup_ecn_rounds = 0; -+ -+ if (bbr->startup_ecn_rounds >= bbr->params.full_ecn_cnt) { -+ bbr->debug.event = 'E'; /* ECN caused STARTUP exit */ -+ bbr2_handle_queue_too_high_in_startup(sk); -+ return; -+ } -+} -+ -+static void bbr2_update_ecn_alpha(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ s32 delivered, delivered_ce; -+ u64 alpha, ce_ratio; -+ u32 gain; -+ -+ if (bbr->params.ecn_factor == 0) -+ return; -+ -+ delivered = tp->delivered - bbr->alpha_last_delivered; -+ delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; -+ -+ if (delivered == 0 || /* avoid divide by zero */ -+ WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ -+ return; -+ -+ /* See if we should use ECN sender logic for this connection. */ -+ if (!bbr->ecn_eligible && bbr_ecn_enable && -+ (bbr->min_rtt_us <= bbr->params.ecn_max_rtt_us || -+ !bbr->params.ecn_max_rtt_us)) -+ bbr->ecn_eligible = 1; -+ -+ ce_ratio = (u64)delivered_ce << BBR_SCALE; -+ do_div(ce_ratio, delivered); -+ gain = bbr->params.ecn_alpha_gain; -+ alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE; -+ alpha += (gain * ce_ratio) >> BBR_SCALE; -+ bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT); -+ -+ bbr->alpha_last_delivered = tp->delivered; -+ bbr->alpha_last_delivered_ce = tp->delivered_ce; -+ -+ bbr2_check_ecn_too_high_in_startup(sk, ce_ratio); -+} -+ -+/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */ -+static void bbr2_raise_inflight_hi_slope(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 growth_this_round, cnt; -+ -+ /* Calculate "slope": packets S/Acked per inflight_hi increment. */ -+ growth_this_round = 1 << bbr->bw_probe_up_rounds; -+ bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30); -+ cnt = tp->snd_cwnd / growth_this_round; -+ cnt = max(cnt, 1U); -+ bbr->bw_probe_up_cnt = cnt; -+ bbr->debug.event = 'G'; /* Grow inflight_hi slope */ -+} -+ -+/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */ -+static void bbr2_probe_inflight_hi_upward(struct sock *sk, -+ const struct rate_sample *rs) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 delta; -+ -+ if (!tp->is_cwnd_limited || tp->snd_cwnd < bbr->inflight_hi) { -+ bbr->bw_probe_up_acks = 0; /* don't accumulate unused credits */ -+ return; /* not fully using inflight_hi, so don't grow it */ -+ } -+ -+ /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */ -+ bbr->bw_probe_up_acks += rs->acked_sacked; -+ if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) { -+ delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt; -+ bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt; -+ bbr->inflight_hi += delta; -+ bbr->debug.event = 'I'; /* Increment inflight_hi */ -+ } -+ -+ if (bbr->round_start) -+ bbr2_raise_inflight_hi_slope(sk); -+} -+ -+/* Does loss/ECN rate for this sample say inflight is "too high"? 
-+ * This is used by both the bbr2_check_loss_too_high_in_startup() function, -+ * which can be used in either v1 or v2, and the PROBE_UP phase of v2, which -+ * uses it to notice when loss/ECN rates suggest inflight is too high. -+ */ -+static bool bbr2_is_inflight_too_high(const struct sock *sk, -+ const struct rate_sample *rs) -+{ -+ const struct bbr *bbr = inet_csk_ca(sk); -+ u32 loss_thresh, ecn_thresh; -+ -+ if (rs->lost > 0 && rs->tx_in_flight) { -+ loss_thresh = (u64)rs->tx_in_flight * bbr->params.loss_thresh >> -+ BBR_SCALE; -+ if (rs->lost > loss_thresh) -+ return true; -+ } -+ -+ if (rs->delivered_ce > 0 && rs->delivered > 0 && -+ bbr->ecn_eligible && bbr->params.ecn_thresh) { -+ ecn_thresh = (u64)rs->delivered * bbr->params.ecn_thresh >> -+ BBR_SCALE; -+ if (rs->delivered_ce >= ecn_thresh) -+ return true; -+ } -+ -+ return false; -+} -+ -+/* Calculate the tx_in_flight level that corresponded to excessive loss. -+ * We find "lost_prefix" segs of the skb where loss rate went too high, -+ * by solving for "lost_prefix" in the following equation: -+ * lost / inflight >= loss_thresh -+ * (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh -+ * Then we take that equation, convert it to fixed point, and -+ * round up to the nearest packet. -+ */ -+static u32 bbr2_inflight_hi_from_lost_skb(const struct sock *sk, -+ const struct rate_sample *rs, -+ const struct sk_buff *skb) -+{ -+ const struct bbr *bbr = inet_csk_ca(sk); -+ u32 loss_thresh = bbr->params.loss_thresh; -+ u32 pcount, divisor, inflight_hi; -+ s32 inflight_prev, lost_prev; -+ u64 loss_budget, lost_prefix; -+ -+ pcount = tcp_skb_pcount(skb); -+ -+ /* How much data was in flight before this skb? */ -+ inflight_prev = rs->tx_in_flight - pcount; -+ if (WARN_ONCE(inflight_prev < 0, -+ "tx_in_flight: %u pcount: %u reneg: %u", -+ rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg)) -+ return ~0U; -+ -+ /* How much inflight data was marked lost before this skb? */ -+ lost_prev = rs->lost - pcount; -+ if (WARN_ON_ONCE(lost_prev < 0)) -+ return ~0U; -+ -+ /* At what prefix of this lost skb did loss rate exceed loss_thresh? */ -+ loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1; -+ loss_budget >>= BBR_SCALE; -+ if (lost_prev >= loss_budget) { -+ lost_prefix = 0; /* previous losses crossed loss_thresh */ -+ } else { -+ lost_prefix = loss_budget - lost_prev; -+ lost_prefix <<= BBR_SCALE; -+ divisor = BBR_UNIT - loss_thresh; -+ if (WARN_ON_ONCE(!divisor)) /* loss_thresh is 8 bits */ -+ return ~0U; -+ do_div(lost_prefix, divisor); -+ } -+ -+ inflight_hi = inflight_prev + lost_prefix; -+ return inflight_hi; -+} -+ -+/* If loss/ECN rates during probing indicated we may have overfilled a -+ * buffer, return an operating point that tries to leave unutilized headroom in -+ * the path for other flows, for fairness convergence and lower RTTs and loss. -+ */ -+static u32 bbr2_inflight_with_headroom(const struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 headroom, headroom_fraction; -+ -+ if (bbr->inflight_hi == ~0U) -+ return ~0U; -+ -+ headroom_fraction = bbr->params.inflight_headroom; -+ headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE; -+ headroom = max(headroom, 1U); -+ return max_t(s32, bbr->inflight_hi - headroom, -+ bbr->params.cwnd_min_target); -+} -+ -+/* Bound cwnd to a sensible level, based on our current probing state -+ * machine phase and model of a good inflight level (inflight_lo, inflight_hi). 
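-+ * Concretely: in PROBE_BW phases other than CRUISE the cap is inflight_hi; -+ * in PROBE_RTT and PROBE_BW/CRUISE it is bbr2_inflight_with_headroom(); and -+ * in all cases the result is further bounded by inflight_lo and floored at -+ * cwnd_min_target, as the code below shows.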
-+ */ -+static void bbr2_bound_cwnd_for_inflight_model(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 cap; -+ -+ /* tcp_rcv_synsent_state_process() currently calls tcp_ack() -+ * and thus cong_control() without first initializing us(!). -+ */ -+ if (!bbr->initialized) -+ return; -+ -+ cap = ~0U; -+ if (bbr->mode == BBR_PROBE_BW && -+ bbr->cycle_idx != BBR_BW_PROBE_CRUISE) { -+ /* Probe to see if more packets fit in the path. */ -+ cap = bbr->inflight_hi; -+ } else { -+ if (bbr->mode == BBR_PROBE_RTT || -+ (bbr->mode == BBR_PROBE_BW && -+ bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) -+ cap = bbr2_inflight_with_headroom(sk); -+ } -+ /* Adapt to any loss/ECN since our last bw probe. */ -+ cap = min(cap, bbr->inflight_lo); -+ -+ cap = max_t(u32, cap, bbr->params.cwnd_min_target); -+ tp->snd_cwnd = min(cap, tp->snd_cwnd); -+} -+ -+/* Estimate a short-term lower bound on the capacity available now, based -+ * on measurements of the current delivery process and recent history. When we -+ * are seeing loss/ECN at times when we are not probing bw, then conservatively -+ * move toward flow balance by multiplicatively cutting our short-term -+ * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a -+ * multiplicative decrease in order to converge to a lower capacity in time -+ * logarithmic in the magnitude of the decrease. -+ * -+ * However, we do not cut our short-term estimates lower than the current rate -+ * and volume of delivered data from this round trip, since from the current -+ * delivery process we can estimate the measured capacity available now. -+ * -+ * Anything faster than that approach would knowingly risk high loss, which can -+ * cause low bw for Reno/CUBIC and high loss recovery latency for -+ * request/response flows using any congestion control. -+ */ -+static void bbr2_adapt_lower_bounds(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 ecn_cut, ecn_inflight_lo, beta; -+ -+ /* We only use lower-bound estimates when not probing bw. -+ * When probing we need to push inflight higher to probe bw. -+ */ -+ if (bbr2_is_probing_bandwidth(sk)) -+ return; -+ -+ /* ECN response. */ -+ if (bbr->ecn_in_round && bbr->ecn_eligible && bbr->params.ecn_factor) { -+ /* Reduce inflight to (1 - alpha*ecn_factor). */ -+ ecn_cut = (BBR_UNIT - -+ ((bbr->ecn_alpha * bbr->params.ecn_factor) >> -+ BBR_SCALE)); -+ if (bbr->inflight_lo == ~0U) -+ bbr->inflight_lo = tp->snd_cwnd; -+ ecn_inflight_lo = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; -+ } else { -+ ecn_inflight_lo = ~0U; -+ } -+ -+ /* Loss response. */ -+ if (bbr->loss_in_round) { -+ /* Reduce bw and inflight to (1 - beta). */ -+ if (bbr->bw_lo == ~0U) -+ bbr->bw_lo = bbr_max_bw(sk); -+ if (bbr->inflight_lo == ~0U) -+ bbr->inflight_lo = tp->snd_cwnd; -+ beta = bbr->params.beta; -+ bbr->bw_lo = -+ max_t(u32, bbr->bw_latest, -+ (u64)bbr->bw_lo * -+ (BBR_UNIT - beta) >> BBR_SCALE); -+ bbr->inflight_lo = -+ max_t(u32, bbr->inflight_latest, -+ (u64)bbr->inflight_lo * -+ (BBR_UNIT - beta) >> BBR_SCALE); -+ } -+ -+ /* Adjust to the lower of the levels implied by loss or ECN. */ -+ bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); -+} -+ -+/* Reset any short-term lower-bound adaptation to congestion, so that we can -+ * push our inflight up. 
-+ */ -+static void bbr2_reset_lower_bounds(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr->bw_lo = ~0U; -+ bbr->inflight_lo = ~0U; -+} -+ -+/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state -+ * machine phase where we adapt our lower bound based on congestion signals. -+ */ -+static void bbr2_reset_congestion_signals(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr->loss_in_round = 0; -+ bbr->ecn_in_round = 0; -+ bbr->loss_in_cycle = 0; -+ bbr->ecn_in_cycle = 0; -+ bbr->bw_latest = 0; -+ bbr->inflight_latest = 0; -+} -+ -+/* Update (most of) our congestion signals: track the recent rate and volume of -+ * delivered data, presence of loss, and EWMA degree of ECN marking. -+ */ -+static void bbr2_update_congestion_signals( -+ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ u64 bw; -+ -+ bbr->loss_round_start = 0; -+ if (rs->interval_us <= 0 || !rs->acked_sacked) -+ return; /* Not a valid observation */ -+ bw = ctx->sample_bw; -+ -+ if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) -+ bbr2_take_bw_hi_sample(sk, bw); -+ -+ bbr->loss_in_round |= (rs->losses > 0); -+ -+ /* Update rate and volume of delivered data from latest round trip: */ -+ bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); -+ bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); -+ -+ if (before(rs->prior_delivered, bbr->loss_round_delivered)) -+ return; /* skip the per-round-trip updates */ -+ /* Now do per-round-trip updates. */ -+ bbr->loss_round_delivered = tp->delivered; /* mark round trip */ -+ bbr->loss_round_start = 1; -+ bbr2_adapt_lower_bounds(sk); -+ -+ /* Update windowed "latest" (single-round-trip) filters. */ -+ bbr->loss_in_round = 0; -+ bbr->ecn_in_round = 0; -+ bbr->bw_latest = ctx->sample_bw; -+ bbr->inflight_latest = rs->delivered; -+} -+ -+/* Bandwidth probing can cause loss. To help coexistence with loss-based -+ * congestion control we spread out our probing in a Reno-conscious way. Due to -+ * the shape of the Reno sawtooth, the time required between loss epochs for an -+ * idealized Reno flow is a number of round trips that is the BDP of that -+ * flow. We count packet-timed round trips directly, since measured RTT can -+ * vary widely, and Reno is driven by packet-timed round trips. -+ */ -+static bool bbr2_is_reno_coexistence_probe_time(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 inflight, rounds, reno_gain, reno_rounds; -+ -+ /* Random loss can shave some small percentage off of our inflight -+ * in each round. To survive this, flows need robust periodic probes. -+ */ -+ rounds = bbr->params.bw_probe_max_rounds; -+ -+ reno_gain = bbr->params.bw_probe_reno_gain; -+ if (reno_gain) { -+ inflight = bbr2_target_inflight(sk); -+ reno_rounds = ((u64)inflight * reno_gain) >> BBR_SCALE; -+ rounds = min(rounds, reno_rounds); -+ } -+ return bbr->rounds_since_probe >= rounds; -+} -+ -+/* How long do we want to wait before probing for bandwidth (and risking -+ * loss)? We randomize the wait, for better mixing and fairness convergence. -+ * -+ * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips. 
-+ * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow, -+ * (eg 4K video to a broadband user): -+ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets -+ * -+ * We bound the BBR-native inter-bw-probe wall clock time to be: -+ * (a) higher than 2 sec: to try to avoid causing loss for a long enough time -+ * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must -+ * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs -+ * (b) lower than 3 sec: to ensure flows can start probing in a reasonable -+ * amount of time to discover unutilized bw on human-scale interactive -+ * time-scales (e.g. perhaps traffic from a web page download that we -+ * were competing with is now complete). -+ */ -+static void bbr2_pick_probe_wait(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ /* Decide the random round-trip bound for wait until probe: */ -+ bbr->rounds_since_probe = -+ prandom_u32_max(bbr->params.bw_probe_rand_rounds); -+ /* Decide the random wall clock bound for wait until probe: */ -+ bbr->probe_wait_us = bbr->params.bw_probe_base_us + -+ prandom_u32_max(bbr->params.bw_probe_rand_us); -+} -+ -+static void bbr2_set_cycle_idx(struct sock *sk, int cycle_idx) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr->cycle_idx = cycle_idx; -+ /* New phase, so need to update cwnd and pacing rate. */ -+ bbr->try_fast_path = 0; -+} -+ -+/* Send at estimated bw to fill the pipe, but not queue. We need this phase -+ * before PROBE_UP, because as soon as we send faster than the available bw -+ * we will start building a queue, and if the buffer is shallow we can cause -+ * loss. If we do not fill the pipe before we cause this loss, our bw_hi and -+ * inflight_hi estimates will underestimate. -+ */ -+static void bbr2_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr2_reset_lower_bounds(sk); -+ if (bbr->inflight_hi != ~0U) -+ bbr->inflight_hi += bbr->params.refill_add_inc; -+ bbr->bw_probe_up_rounds = bw_probe_up_rounds; -+ bbr->bw_probe_up_acks = 0; -+ bbr->stopped_risky_probe = 0; -+ bbr->ack_phase = BBR_ACKS_REFILLING; -+ bbr->next_rtt_delivered = tp->delivered; -+ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); -+} -+ -+/* Now probe max deliverable data rate and volume. */ -+static void bbr2_start_bw_probe_up(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr->ack_phase = BBR_ACKS_PROBE_STARTING; -+ bbr->next_rtt_delivered = tp->delivered; -+ bbr->cycle_mstamp = tp->tcp_mstamp; -+ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_UP); -+ bbr2_raise_inflight_hi_slope(sk); -+} -+ -+/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall -+ * clock time at which to probe beyond an inflight that we think to be -+ * safe. This will knowingly risk packet loss, so we want to do this rarely, to -+ * keep packet loss rates low. Also start a round-trip counter, to probe faster -+ * if we estimate a Reno flow at our BDP would probe faster. 
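-+ * Putting the numbers above together (illustrative path): a 25Mbps, 30ms -+ * flow has a BDP of about 62 packets, so with bw_probe_reno_gain = BBR_UNIT -+ * the round-trip bound is min(bw_probe_max_rounds, 62) packet-timed rounds, -+ * while the wall clock bound is 2-3 secs; whichever trips first starts the -+ * next probe.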
-+ */ -+static void bbr2_start_bw_probe_down(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr2_reset_congestion_signals(sk); -+ bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ -+ bbr2_pick_probe_wait(sk); -+ bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ -+ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; -+ bbr->next_rtt_delivered = tp->delivered; -+ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); -+} -+ -+/* Cruise: maintain what we estimate to be a neutral, conservative -+ * operating point, without attempting to probe up for bandwidth or down for -+ * RTT, and only reducing inflight in response to loss/ECN signals. -+ */ -+static void bbr2_start_bw_probe_cruise(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (bbr->inflight_lo != ~0U) -+ bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); -+ -+ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); -+} -+ -+/* Loss and/or ECN rate is too high while probing. -+ * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. -+ */ -+static void bbr2_handle_inflight_too_high(struct sock *sk, -+ const struct rate_sample *rs) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ const u32 beta = bbr->params.beta; -+ -+ bbr->prev_probe_too_high = 1; -+ bbr->bw_probe_samples = 0; /* only react once per probe */ -+ bbr->debug.event = 'L'; /* Loss/ECN too high */ -+ /* If we are app-limited then we are not robustly -+ * probing the max volume of inflight data we think -+ * might be safe (analogous to how app-limited bw -+ * samples are not known to be robustly probing bw). -+ */ -+ if (!rs->is_app_limited) -+ bbr->inflight_hi = max_t(u32, rs->tx_in_flight, -+ (u64)bbr2_target_inflight(sk) * -+ (BBR_UNIT - beta) >> BBR_SCALE); -+ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) -+ bbr2_start_bw_probe_down(sk); -+} -+ -+/* If we're seeing bw and loss samples reflecting our bw probing, adapt -+ * using the signals we see. If loss or ECN mark rate gets too high, then adapt -+ * inflight_hi downward. If we're able to push inflight higher without such -+ * signals, push higher: adapt inflight_hi upward. -+ */ -+static bool bbr2_adapt_upper_bounds(struct sock *sk, -+ const struct rate_sample *rs) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ /* Track when we'll see bw/loss samples resulting from our bw probes. */ -+ if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start) -+ bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK; -+ if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) { -+ /* End of samples from bw probing phase. */ -+ bbr->bw_probe_samples = 0; -+ bbr->ack_phase = BBR_ACKS_INIT; -+ /* At this point in the cycle, our current bw sample is also -+ * our best recent chance at finding the highest available bw -+ * for this flow. So now is the best time to forget the bw -+ * samples from the previous cycle, by advancing the window. -+ */ -+ if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) -+ bbr2_advance_bw_hi_filter(sk); -+ /* If we had an inflight_hi, then probed and pushed inflight all -+ * the way up to hit that inflight_hi without seeing any -+ * high loss/ECN in all the resulting ACKs from that probing, -+ * then probe up again, this time letting inflight persist at -+ * inflight_hi for a round trip, then accelerating beyond. 
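-+ * In the code below this is the stopped_risky_probe && !prev_probe_too_high -+ * check: rather than waiting out the usual probe_wait_us, we go straight -+ * back to BBR_BW_PROBE_REFILL via bbr2_start_bw_probe_refill(sk, 0).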
-+ */ -+ if (bbr->mode == BBR_PROBE_BW && -+ bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { -+ bbr->debug.event = 'R'; /* reprobe */ -+ bbr2_start_bw_probe_refill(sk, 0); -+ return true; /* yes, decided state transition */ -+ } -+ } -+ -+ if (bbr2_is_inflight_too_high(sk, rs)) { -+ if (bbr->bw_probe_samples) /* sample is from bw probing? */ -+ bbr2_handle_inflight_too_high(sk, rs); -+ } else { -+ /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ -+ if (bbr->inflight_hi == ~0U) /* no excess queue signals yet? */ -+ return false; -+ -+ /* To be resilient to random loss, we must raise inflight_hi -+ * if we observe in any phase that a higher level is safe. -+ */ -+ if (rs->tx_in_flight > bbr->inflight_hi) { -+ bbr->inflight_hi = rs->tx_in_flight; -+ bbr->debug.event = 'U'; /* raise up inflight_hi */ -+ } -+ -+ if (bbr->mode == BBR_PROBE_BW && -+ bbr->cycle_idx == BBR_BW_PROBE_UP) -+ bbr2_probe_inflight_hi_upward(sk, rs); -+ } -+ -+ return false; -+} -+ -+/* Check if it's time to probe for bandwidth now, and if so, kick it off. */ -+static bool bbr2_check_time_to_probe_bw(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 n; -+ -+ /* If we seem to be at an operating point where we are not seeing loss -+ * but we are seeing ECN marks, then when the ECN marks cease we reprobe -+ * quickly (in case a burst of cross-traffic has ceased and freed up bw, -+ * or in case we are sharing with multiplicatively probing traffic). -+ */ -+ if (bbr->params.ecn_reprobe_gain && bbr->ecn_eligible && -+ bbr->ecn_in_cycle && !bbr->loss_in_cycle && -+ inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { -+ bbr->debug.event = 'A'; /* *A*ll clear to probe *A*gain */ -+ /* Calculate n so that when bbr2_raise_inflight_hi_slope() -+ * computes growth_this_round as 2^n it will be roughly the -+ * desired volume of data (inflight_hi*ecn_reprobe_gain). -+ */ -+ n = ilog2((((u64)bbr->inflight_hi * -+ bbr->params.ecn_reprobe_gain) >> BBR_SCALE)); -+ bbr2_start_bw_probe_refill(sk, n); -+ return true; -+ } -+ -+ if (bbr2_has_elapsed_in_phase(sk, bbr->probe_wait_us) || -+ bbr2_is_reno_coexistence_probe_time(sk)) { -+ bbr2_start_bw_probe_refill(sk, 0); -+ return true; -+ } -+ return false; -+} -+ -+/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ -+static bool bbr2_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ bool is_under_bdp, is_long_enough; -+ -+ /* Always need to pull inflight down to leave headroom in queue. */ -+ if (inflight > bbr2_inflight_with_headroom(sk)) -+ return false; -+ -+ is_under_bdp = inflight <= bbr_inflight(sk, bw, BBR_UNIT); -+ if (bbr->params.drain_to_target) -+ return is_under_bdp; -+ -+ is_long_enough = bbr2_has_elapsed_in_phase(sk, bbr->min_rtt_us); -+ return is_under_bdp || is_long_enough; -+} -+ -+/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ -+static void bbr2_update_cycle_phase(struct sock *sk, -+ const struct rate_sample *rs) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ bool is_risky = false, is_queuing = false; -+ u32 inflight, bw; -+ -+ if (!bbr_full_bw_reached(sk)) -+ return; -+ -+ /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. 
*/ -+ if (bbr2_adapt_upper_bounds(sk, rs)) -+ return; /* already decided state transition */ -+ -+ if (bbr->mode != BBR_PROBE_BW) -+ return; -+ -+ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); -+ bw = bbr_max_bw(sk); -+ -+ switch (bbr->cycle_idx) { -+ /* First we spend most of our time cruising with a pacing_gain of 1.0, -+ * which paces at the estimated bw, to try to fully use the pipe -+ * without building queue. If we encounter loss/ECN marks, we adapt -+ * by slowing down. -+ */ -+ case BBR_BW_PROBE_CRUISE: -+ if (bbr2_check_time_to_probe_bw(sk)) -+ return; /* already decided state transition */ -+ break; -+ -+ /* After cruising, when it's time to probe, we first "refill": we send -+ * at the estimated bw to fill the pipe, before probing higher and -+ * knowingly risking overflowing the bottleneck buffer (causing loss). -+ */ -+ case BBR_BW_PROBE_REFILL: -+ if (bbr->round_start) { -+ /* After one full round trip of sending in REFILL, we -+ * start to see bw samples reflecting our REFILL, which -+ * may be putting too much data in flight. -+ */ -+ bbr->bw_probe_samples = 1; -+ bbr2_start_bw_probe_up(sk); -+ } -+ break; -+ -+ /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to -+ * probe for bw. If we have not seen loss/ECN, we try to raise inflight -+ * to at least pacing_gain*BDP; note that this may take more than -+ * min_rtt if min_rtt is small (e.g. on a LAN). -+ * -+ * We terminate PROBE_UP bandwidth probing upon any of the following: -+ * -+ * (1) We've pushed inflight up to hit the inflight_hi target set in the -+ * most recent previous bw probe phase. Thus we want to start -+ * draining the queue immediately because it's very likely the most -+ * recently sent packets will fill the queue and cause drops. -+ * (checked here) -+ * (2) We have probed for at least 1*min_rtt_us, and the -+ * estimated queue is high enough (inflight > 1.25 * estimated_bdp). -+ * (checked here) -+ * (3) Loss filter says loss rate is "too high". -+ * (checked in bbr2_is_inflight_too_high()) -+ * (4) ECN filter says ECN mark rate is "too high". -+ * (checked in bbr2_is_inflight_too_high()) -+ */ -+ case BBR_BW_PROBE_UP: -+ if (bbr->prev_probe_too_high && -+ inflight >= bbr->inflight_hi) { -+ bbr->stopped_risky_probe = 1; -+ is_risky = true; -+ bbr->debug.event = 'D'; /* D for danger */ -+ } else if (bbr2_has_elapsed_in_phase(sk, bbr->min_rtt_us) && -+ inflight >= -+ bbr_inflight(sk, bw, -+ bbr->params.bw_probe_pif_gain)) { -+ is_queuing = true; -+ bbr->debug.event = 'Q'; /* building Queue */ -+ } -+ if (is_risky || is_queuing) { -+ bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */ -+ bbr2_start_bw_probe_down(sk); /* restart w/ down */ -+ } -+ break; -+ -+ /* After probing in PROBE_UP, we have usually accumulated some data in -+ * the bottleneck buffer (if bw probing didn't find more bw). We next -+ * enter PROBE_DOWN to try to drain any excess data from the queue. To -+ * do this, we use a pacing_gain < 1.0. We hold this pacing gain until -+ * our inflight is less than that target cruising point, which is the -+ * minimum of (a) the amount needed to leave headroom, and (b) the -+ * estimated BDP. Once inflight falls to match the target, we estimate -+ * the queue is drained; persisting would underutilize the pipe. 
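-+ * For example (illustrative sizes): with inflight_hi = 100 and the default -+ * inflight_headroom of 15%, cruising can begin once inflight has fallen to -+ * 85 packets or fewer, and either inflight is at or below the estimated BDP -+ * or at least min_rtt has elapsed in this phase (with drain_to_target set, -+ * only the BDP test applies).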
-+ */ -+ case BBR_BW_PROBE_DOWN: -+ if (bbr2_check_time_to_probe_bw(sk)) -+ return; /* already decided state transition */ -+ if (bbr2_check_time_to_cruise(sk, inflight, bw)) -+ bbr2_start_bw_probe_cruise(sk); -+ break; -+ -+ default: -+ WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx); -+ } -+} -+ -+/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */ -+static void bbr2_exit_probe_rtt(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr2_reset_lower_bounds(sk); -+ if (bbr_full_bw_reached(sk)) { -+ bbr->mode = BBR_PROBE_BW; -+ /* Raising inflight after PROBE_RTT may cause loss, so reset -+ * the PROBE_BW clock and schedule the next bandwidth probe for -+ * a friendly and randomized future point in time. -+ */ -+ bbr2_start_bw_probe_down(sk); -+ /* Since we are exiting PROBE_RTT, we know inflight is -+ * below our estimated BDP, so it is reasonable to cruise. -+ */ -+ bbr2_start_bw_probe_cruise(sk); -+ } else { -+ bbr->mode = BBR_STARTUP; -+ } -+} -+ -+/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until -+ * the end of the round in recovery to get a good estimate of how many packets -+ * have been lost, and how many we need to drain with a low pacing rate. -+ */ -+static void bbr2_check_loss_too_high_in_startup(struct sock *sk, -+ const struct rate_sample *rs) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (bbr_full_bw_reached(sk)) -+ return; -+ -+ /* For STARTUP exit, check the loss rate at the end of each round trip -+ * of Recovery episodes in STARTUP. We check the loss rate at the end -+ * of the round trip to filter out noisy/low loss and have a better -+ * sense of inflight (extent of loss), so we can drain more accurately. -+ */ -+ if (rs->losses && bbr->loss_events_in_round < 0xf) -+ bbr->loss_events_in_round++; /* update saturating counter */ -+ if (bbr->params.full_loss_cnt && bbr->loss_round_start && -+ inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && -+ bbr->loss_events_in_round >= bbr->params.full_loss_cnt && -+ bbr2_is_inflight_too_high(sk, rs)) { -+ bbr->debug.event = 'P'; /* Packet loss caused STARTUP exit */ -+ bbr2_handle_queue_too_high_in_startup(sk); -+ return; -+ } -+ if (bbr->loss_round_start) -+ bbr->loss_events_in_round = 0; -+} -+ -+/* If we are done draining, advance into steady state operation in PROBE_BW. */ -+static void bbr2_check_drain(struct sock *sk, const struct rate_sample *rs, -+ struct bbr_context *ctx) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (bbr_check_drain(sk, rs, ctx)) { -+ bbr->mode = BBR_PROBE_BW; -+ bbr2_start_bw_probe_down(sk); -+ } -+} -+ -+static void bbr2_update_model(struct sock *sk, const struct rate_sample *rs, -+ struct bbr_context *ctx) -+{ -+ bbr2_update_congestion_signals(sk, rs, ctx); -+ bbr_update_ack_aggregation(sk, rs); -+ bbr2_check_loss_too_high_in_startup(sk, rs); -+ bbr_check_full_bw_reached(sk, rs); -+ bbr2_check_drain(sk, rs, ctx); -+ bbr2_update_cycle_phase(sk, rs); -+ bbr_update_min_rtt(sk, rs); -+} -+ -+/* Fast path for app-limited case. -+ * -+ * On each ack, we execute bbr state machine, which primarily consists of: -+ * 1) update model based on new rate sample, and -+ * 2) update control based on updated model or state change. -+ * -+ * There are certain workload/scenarios, e.g. app-limited case, where -+ * either we can skip updating model or we can skip update of both model -+ * as well as control. This provides significant softirq cpu savings for -+ * processing incoming acks. 
-+ * -+ * In case of app-limited, if there is no congestion (loss/ecn) and -+ * if observed bw sample is less than current estimated bw, then we can -+ * skip some of the computation in bbr state processing: -+ * -+ * - if there is no rtt/mode/phase change: In this case, since all the -+ * parameters of the network model are constant, we can skip the model -+ * as well as the control update. -+ * -+ * - else we can skip rest of the model update. But we still need to -+ * update the control to account for the new rtt/mode/phase. -+ * -+ * Returns whether we can take fast path or not. -+ */ -+static bool bbr2_fast_path(struct sock *sk, bool *update_model, -+ const struct rate_sample *rs, struct bbr_context *ctx) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 prev_min_rtt_us, prev_mode; -+ -+ if (bbr->params.fast_path && bbr->try_fast_path && -+ rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && -+ !bbr->loss_in_round && !bbr->ecn_in_round) { -+ prev_mode = bbr->mode; -+ prev_min_rtt_us = bbr->min_rtt_us; -+ bbr2_check_drain(sk, rs, ctx); -+ bbr2_update_cycle_phase(sk, rs); -+ bbr_update_min_rtt(sk, rs); -+ -+ if (bbr->mode == prev_mode && -+ bbr->min_rtt_us == prev_min_rtt_us && -+ bbr->try_fast_path) -+ return true; -+ -+ /* Skip model update, but control still needs to be updated */ -+ *update_model = false; -+ } -+ return false; -+} -+ -+static void bbr2_main(struct sock *sk, const struct rate_sample *rs) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ struct bbr_context ctx = { 0 }; -+ bool update_model = true; -+ u32 bw; -+ -+ bbr->debug.event = '.'; /* init to default NOP (no event yet) */ -+ -+ bbr_update_round_start(sk, rs, &ctx); -+ if (bbr->round_start) { -+ bbr->rounds_since_probe = -+ min_t(s32, bbr->rounds_since_probe + 1, 0xFF); -+ bbr2_update_ecn_alpha(sk); -+ } -+ -+ bbr->ecn_in_round |= rs->is_ece; -+ bbr_calculate_bw_sample(sk, rs, &ctx); -+ -+ if (bbr2_fast_path(sk, &update_model, rs, &ctx)) -+ goto out; -+ -+ if (update_model) -+ bbr2_update_model(sk, rs, &ctx); -+ -+ bbr_update_gains(sk); -+ bw = bbr_bw(sk); -+ bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); -+ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain, -+ tp->snd_cwnd, &ctx); -+ bbr2_bound_cwnd_for_inflight_model(sk); -+ -+out: -+ bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state; -+ bbr->loss_in_cycle |= rs->lost > 0; -+ bbr->ecn_in_cycle |= rs->delivered_ce > 0; -+ -+ bbr_debug(sk, rs->acked_sacked, rs, &ctx); -+} -+ -+/* Module parameters that are settable by TCP_CONGESTION_PARAMS are declared -+ * down here, so that the algorithm functions that use the parameters must use -+ * the per-socket parameters; if they accidentally use the global version -+ * then there will be a compile error. -+ * TODO(ncardwell): move all per-socket parameters down to this section. -+ */ -+ -+/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. -+ * No loss response when 0. Max allowed value is 255. -+ */ -+static u32 bbr_beta = BBR_UNIT * 30 / 100; -+ -+/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE. -+ * Max allowed value is 255. -+ */ -+static u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; /* 1/16 = 6.25% */ -+ -+/* The initial value for the ecn_alpha state variable. Default and max -+ * BBR_UNIT (256), representing 1.0. This allows a flow to respond quickly -+ * to congestion if the bottleneck is congested when the flow starts up. 
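-+ * Once samples arrive, bbr2_update_ecn_alpha() decays this value: with the -+ * default ecn_alpha_gain of 1/16, each round computes roughly -+ * alpha = (15/16) * alpha + (1/16) * ce_ratio, -+ * so a fully CE-marked round moves alpha about 6% of the way toward 1.0.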
-+ */ -+static u32 bbr_ecn_alpha_init = BBR_UNIT; /* 1.0, to respond quickly */ -+ -+/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. -+ * No ECN based bounding when 0. Max allowed value is 255. -+ */ -+static u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ -+ -+/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. -+ * Scaled by BBR_SCALE. Disabled when 0. Max allowed is 255. -+ */ -+static u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ -+ -+/* Max RTT (in usec) at which to use sender-side ECN logic. -+ * Disabled when 0 (ECN allowed at any RTT). -+ * Max allowed for the parameter is 524287 (0x7ffff) us, ~524 ms. -+ */ -+static u32 bbr_ecn_max_rtt_us = 5000; -+ -+/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN -+ * clears then use a multiplicative increase to quickly reprobe bw by -+ * starting inflight probing at the given multiple of inflight_hi. -+ * Default for this experimental knob is 0 (disabled). -+ * Planned value for experiments: BBR_UNIT * 1 / 2 = 128, representing 0.5. -+ */ -+static u32 bbr_ecn_reprobe_gain; -+ -+/* Estimate bw probing has gone too far if loss rate exceeds this level. */ -+static u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ -+ -+/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, -+ * and loss rate is higher than bbr_loss_thresh. -+ * Disabled if 0. Max allowed value is 15 (0xF). -+ */ -+static u32 bbr_full_loss_cnt = 8; -+ -+/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh -+ * meets this count. Max allowed value is 3. -+ */ -+static u32 bbr_full_ecn_cnt = 2; -+ -+/* Fraction of unutilized headroom to try to leave in path upon high loss. */ -+static u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; -+ -+/* Multiplier to get target inflight (as multiple of BDP) for PROBE_UP phase. -+ * Default is 1.25x, as in BBR v1. Max allowed is 511. -+ */ -+static u32 bbr_bw_probe_pif_gain = BBR_UNIT * 5 / 4; -+ -+/* Multiplier to get Reno-style probe epoch duration as: k * BDP round trips. -+ * If zero, disables this BBR v2 Reno-style BDP-scaled coexistence mechanism. -+ * Max allowed is 511. -+ */ -+static u32 bbr_bw_probe_reno_gain = BBR_UNIT; -+ -+/* Max number of packet-timed rounds to wait before probing for bandwidth. If -+ * we want to tolerate 1% random loss per round, and not have this cut our -+ * inflight too much, we must probe for bw periodically on roughly this scale. -+ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. -+ * We aim to be fair with Reno/CUBIC up to a BDP of at least: -+ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets -+ */ -+static u32 bbr_bw_probe_max_rounds = 63; -+ -+/* Max amount of randomness to inject in round counting for Reno-coexistence. -+ * Max value is 15. -+ */ -+static u32 bbr_bw_probe_rand_rounds = 2; -+ -+/* Use BBR-native probe time scale starting at this many usec. -+ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: -+ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs -+ */ -+static u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ -+ -+/* Use BBR-native probes spread over this many usec: */ -+static u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 sec */ -+ -+/* Undo the model changes made in loss recovery if recovery was spurious? */ -+static bool bbr_undo = true; -+ -+/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? 
*/ -+static bool bbr_fast_path = true; /* default: enabled */ -+ -+/* Use fast ack mode ? */ -+static int bbr_fast_ack_mode = 1; /* default: rwnd check off */ -+ -+/* How much to additively increase inflight_hi when entering REFILL? */ -+static u32 bbr_refill_add_inc; /* default: disabled */ -+ -+module_param_named(beta, bbr_beta, uint, 0644); -+module_param_named(ecn_alpha_gain, bbr_ecn_alpha_gain, uint, 0644); -+module_param_named(ecn_alpha_init, bbr_ecn_alpha_init, uint, 0644); -+module_param_named(ecn_factor, bbr_ecn_factor, uint, 0644); -+module_param_named(ecn_thresh, bbr_ecn_thresh, uint, 0644); -+module_param_named(ecn_max_rtt_us, bbr_ecn_max_rtt_us, uint, 0644); -+module_param_named(ecn_reprobe_gain, bbr_ecn_reprobe_gain, uint, 0644); -+module_param_named(loss_thresh, bbr_loss_thresh, uint, 0664); -+module_param_named(full_loss_cnt, bbr_full_loss_cnt, uint, 0664); -+module_param_named(full_ecn_cnt, bbr_full_ecn_cnt, uint, 0664); -+module_param_named(inflight_headroom, bbr_inflight_headroom, uint, 0664); -+module_param_named(bw_probe_pif_gain, bbr_bw_probe_pif_gain, uint, 0664); -+module_param_named(bw_probe_reno_gain, bbr_bw_probe_reno_gain, uint, 0664); -+module_param_named(bw_probe_max_rounds, bbr_bw_probe_max_rounds, uint, 0664); -+module_param_named(bw_probe_rand_rounds, bbr_bw_probe_rand_rounds, uint, 0664); -+module_param_named(bw_probe_base_us, bbr_bw_probe_base_us, uint, 0664); -+module_param_named(bw_probe_rand_us, bbr_bw_probe_rand_us, uint, 0664); -+module_param_named(undo, bbr_undo, bool, 0664); -+module_param_named(fast_path, bbr_fast_path, bool, 0664); -+module_param_named(fast_ack_mode, bbr_fast_ack_mode, uint, 0664); -+module_param_named(refill_add_inc, bbr_refill_add_inc, uint, 0664); -+ -+static void bbr2_init(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr_init(sk); /* run shared init code for v1 and v2 */ -+ -+ /* BBR v2 parameters: */ -+ bbr->params.beta = min_t(u32, 0xFFU, bbr_beta); -+ bbr->params.ecn_alpha_gain = min_t(u32, 0xFFU, bbr_ecn_alpha_gain); -+ bbr->params.ecn_alpha_init = min_t(u32, BBR_UNIT, bbr_ecn_alpha_init); -+ bbr->params.ecn_factor = min_t(u32, 0xFFU, bbr_ecn_factor); -+ bbr->params.ecn_thresh = min_t(u32, 0xFFU, bbr_ecn_thresh); -+ bbr->params.ecn_max_rtt_us = min_t(u32, 0x7ffffU, bbr_ecn_max_rtt_us); -+ bbr->params.ecn_reprobe_gain = min_t(u32, 0x1FF, bbr_ecn_reprobe_gain); -+ bbr->params.loss_thresh = min_t(u32, 0xFFU, bbr_loss_thresh); -+ bbr->params.full_loss_cnt = min_t(u32, 0xFU, bbr_full_loss_cnt); -+ bbr->params.full_ecn_cnt = min_t(u32, 0x3U, bbr_full_ecn_cnt); -+ bbr->params.inflight_headroom = -+ min_t(u32, 0xFFU, bbr_inflight_headroom); -+ bbr->params.bw_probe_pif_gain = -+ min_t(u32, 0x1FFU, bbr_bw_probe_pif_gain); -+ bbr->params.bw_probe_reno_gain = -+ min_t(u32, 0x1FFU, bbr_bw_probe_reno_gain); -+ bbr->params.bw_probe_max_rounds = -+ min_t(u32, 0xFFU, bbr_bw_probe_max_rounds); -+ bbr->params.bw_probe_rand_rounds = -+ min_t(u32, 0xFU, bbr_bw_probe_rand_rounds); -+ bbr->params.bw_probe_base_us = -+ min_t(u32, (1 << 26) - 1, bbr_bw_probe_base_us); -+ bbr->params.bw_probe_rand_us = -+ min_t(u32, (1 << 26) - 1, bbr_bw_probe_rand_us); -+ bbr->params.undo = bbr_undo; -+ bbr->params.fast_path = bbr_fast_path ? 
1 : 0; -+ bbr->params.refill_add_inc = min_t(u32, 0x3U, bbr_refill_add_inc); -+ -+ /* BBR v2 state: */ -+ bbr->initialized = 1; -+ /* Start sampling ECN mark rate after first full flight is ACKed: */ -+ bbr->loss_round_delivered = tp->delivered + 1; -+ bbr->loss_round_start = 0; -+ bbr->undo_bw_lo = 0; -+ bbr->undo_inflight_lo = 0; -+ bbr->undo_inflight_hi = 0; -+ bbr->loss_events_in_round = 0; -+ bbr->startup_ecn_rounds = 0; -+ bbr2_reset_congestion_signals(sk); -+ bbr->bw_lo = ~0U; -+ bbr->bw_hi[0] = 0; -+ bbr->bw_hi[1] = 0; -+ bbr->inflight_lo = ~0U; -+ bbr->inflight_hi = ~0U; -+ bbr->bw_probe_up_cnt = ~0U; -+ bbr->bw_probe_up_acks = 0; -+ bbr->bw_probe_up_rounds = 0; -+ bbr->probe_wait_us = 0; -+ bbr->stopped_risky_probe = 0; -+ bbr->ack_phase = BBR_ACKS_INIT; -+ bbr->rounds_since_probe = 0; -+ bbr->bw_probe_samples = 0; -+ bbr->prev_probe_too_high = 0; -+ bbr->ecn_eligible = 0; -+ bbr->ecn_alpha = bbr->params.ecn_alpha_init; -+ bbr->alpha_last_delivered = 0; -+ bbr->alpha_last_delivered_ce = 0; -+ -+ tp->fast_ack_mode = min_t(u32, 0x2U, bbr_fast_ack_mode); -+} -+ -+/* Core TCP stack informs us that the given skb was just marked lost. */ -+static void bbr2_skb_marked_lost(struct sock *sk, const struct sk_buff *skb) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ struct tcp_skb_cb *scb = TCP_SKB_CB(skb); -+ struct rate_sample rs; -+ -+ /* Capture "current" data over the full round trip of loss, -+ * to have a better chance to see the full capacity of the path. -+ */ -+ if (!bbr->loss_in_round) /* first loss in this round trip? */ -+ bbr->loss_round_delivered = tp->delivered; /* set round trip */ -+ bbr->loss_in_round = 1; -+ bbr->loss_in_cycle = 1; -+ -+ if (!bbr->bw_probe_samples) -+ return; /* not an skb sent while probing for bandwidth */ -+ if (unlikely(!scb->tx.delivered_mstamp)) -+ return; /* skb was SACKed, reneged, marked lost; ignore it */ -+ /* We are probing for bandwidth. Construct a rate sample that -+ * estimates what happened in the flight leading up to this lost skb, -+ * then see if the loss rate went too high, and if so at which packet. -+ */ -+ memset(&rs, 0, sizeof(rs)); -+ rs.tx_in_flight = scb->tx.in_flight; -+ rs.lost = tp->lost - scb->tx.lost; -+ rs.is_app_limited = scb->tx.is_app_limited; -+ if (bbr2_is_inflight_too_high(sk, &rs)) { -+ rs.tx_in_flight = bbr2_inflight_hi_from_lost_skb(sk, &rs, skb); -+ bbr2_handle_inflight_too_high(sk, &rs); -+ } -+} -+ -+/* Revert short-term model if current loss recovery event was spurious. */ -+static u32 bbr2_undo_cwnd(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr->debug.undo = 1; -+ bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ -+ bbr->full_bw_cnt = 0; -+ bbr->loss_in_round = 0; -+ -+ if (!bbr->params.undo) -+ return tp->snd_cwnd; -+ -+ /* Revert to cwnd and other state saved before loss episode. */ -+ bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); -+ bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); -+ bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); -+ return bbr->prior_cwnd; -+} -+ -+/* Entering loss recovery, so save state for when we undo recovery. */ -+static u32 bbr2_ssthresh(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr_save_cwnd(sk); -+ /* For undo, save state that adapts based on loss signal. 
*/ -+ bbr->undo_bw_lo = bbr->bw_lo; -+ bbr->undo_inflight_lo = bbr->inflight_lo; -+ bbr->undo_inflight_hi = bbr->inflight_hi; -+ return tcp_sk(sk)->snd_ssthresh; -+} -+ -+static enum tcp_bbr2_phase bbr2_get_phase(struct bbr *bbr) -+{ -+ switch (bbr->mode) { -+ case BBR_STARTUP: -+ return BBR2_PHASE_STARTUP; -+ case BBR_DRAIN: -+ return BBR2_PHASE_DRAIN; -+ case BBR_PROBE_BW: -+ break; -+ case BBR_PROBE_RTT: -+ return BBR2_PHASE_PROBE_RTT; -+ default: -+ return BBR2_PHASE_INVALID; -+ } -+ switch (bbr->cycle_idx) { -+ case BBR_BW_PROBE_UP: -+ return BBR2_PHASE_PROBE_BW_UP; -+ case BBR_BW_PROBE_DOWN: -+ return BBR2_PHASE_PROBE_BW_DOWN; -+ case BBR_BW_PROBE_CRUISE: -+ return BBR2_PHASE_PROBE_BW_CRUISE; -+ case BBR_BW_PROBE_REFILL: -+ return BBR2_PHASE_PROBE_BW_REFILL; -+ default: -+ return BBR2_PHASE_INVALID; -+ } -+} -+ -+static size_t bbr2_get_info(struct sock *sk, u32 ext, int *attr, -+ union tcp_cc_info *info) -+{ -+ if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || -+ ext & (1 << (INET_DIAG_VEGASINFO - 1))) { -+ struct bbr *bbr = inet_csk_ca(sk); -+ u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); -+ u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); -+ u64 bw_lo = bbr->bw_lo == ~0U ? -+ ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); -+ -+ memset(&info->bbr2, 0, sizeof(info->bbr2)); -+ info->bbr2.bbr_bw_lsb = (u32)bw; -+ info->bbr2.bbr_bw_msb = (u32)(bw >> 32); -+ info->bbr2.bbr_min_rtt = bbr->min_rtt_us; -+ info->bbr2.bbr_pacing_gain = bbr->pacing_gain; -+ info->bbr2.bbr_cwnd_gain = bbr->cwnd_gain; -+ info->bbr2.bbr_bw_hi_lsb = (u32)bw_hi; -+ info->bbr2.bbr_bw_hi_msb = (u32)(bw_hi >> 32); -+ info->bbr2.bbr_bw_lo_lsb = (u32)bw_lo; -+ info->bbr2.bbr_bw_lo_msb = (u32)(bw_lo >> 32); -+ info->bbr2.bbr_mode = bbr->mode; -+ info->bbr2.bbr_phase = (__u8)bbr2_get_phase(bbr); -+ info->bbr2.bbr_version = (__u8)2; -+ info->bbr2.bbr_inflight_lo = bbr->inflight_lo; -+ info->bbr2.bbr_inflight_hi = bbr->inflight_hi; -+ info->bbr2.bbr_extra_acked = bbr_extra_acked(sk); -+ *attr = INET_DIAG_BBRINFO; -+ return sizeof(info->bbr2); -+ } -+ return 0; -+} -+ -+static void bbr2_set_state(struct sock *sk, u8 new_state) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (new_state == TCP_CA_Loss) { -+ struct rate_sample rs = { .losses = 1 }; -+ struct bbr_context ctx = { 0 }; -+ -+ bbr->prev_ca_state = TCP_CA_Loss; -+ bbr->full_bw = 0; -+ if (!bbr2_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { -+ /* bbr_adapt_lower_bounds() needs cwnd before -+ * we suffered an RTO, to update inflight_lo: -+ */ -+ bbr->inflight_lo = -+ max(tp->snd_cwnd, bbr->prior_cwnd); -+ } -+ bbr_debug(sk, 0, &rs, &ctx); -+ } else if (bbr->prev_ca_state == TCP_CA_Loss && -+ new_state != TCP_CA_Loss) { -+ tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd); -+ bbr->try_fast_path = 0; /* bound cwnd using latest model */ -+ } -+} -+ -+static struct tcp_congestion_ops tcp_bbr2_cong_ops __read_mostly = { -+ .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, -+ .name = "bbr2", -+ .owner = THIS_MODULE, -+ .init = bbr2_init, -+ .cong_control = bbr2_main, -+ .sndbuf_expand = bbr_sndbuf_expand, -+ .skb_marked_lost = bbr2_skb_marked_lost, -+ .undo_cwnd = bbr2_undo_cwnd, -+ .cwnd_event = bbr_cwnd_event, -+ .ssthresh = bbr2_ssthresh, -+ .tso_segs = bbr_tso_segs, -+ .get_info = bbr2_get_info, -+ .set_state = bbr2_set_state, -+}; -+ -+static int __init bbr_register(void) -+{ -+ BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE); -+ return tcp_register_congestion_control(&tcp_bbr2_cong_ops); -+} -+ -+static 
void __exit bbr_unregister(void) -+{ -+ tcp_unregister_congestion_control(&tcp_bbr2_cong_ops); -+} -+ -+module_init(bbr_register); -+module_exit(bbr_unregister); -+ -+MODULE_AUTHOR("Van Jacobson <vanj@google.com>"); -+MODULE_AUTHOR("Neal Cardwell <ncardwell@google.com>"); -+MODULE_AUTHOR("Yuchung Cheng <ycheng@google.com>"); -+MODULE_AUTHOR("Soheil Hassas Yeganeh <soheil@google.com>"); -+MODULE_AUTHOR("Priyaranjan Jha <priyarjha@google.com>"); -+MODULE_AUTHOR("Yousuk Seung <ysseung@google.com>"); -+MODULE_AUTHOR("Kevin Yang <yyd@google.com>"); -+MODULE_AUTHOR("Arjun Roy <arjunroy@google.com>"); -+ -+MODULE_LICENSE("Dual BSD/GPL"); -+MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); -diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c -index 563d016e7..1c94abb6f 100644 ---- a/net/ipv4/tcp_cong.c -+++ b/net/ipv4/tcp_cong.c -@@ -179,6 +179,7 @@ void tcp_init_congestion_control(struct sock *sk) - struct inet_connection_sock *icsk = inet_csk(sk); - - tcp_sk(sk)->prior_ssthresh = 0; -+ tcp_sk(sk)->fast_ack_mode = 0; - if (icsk->icsk_ca_ops->init) - icsk->icsk_ca_ops->init(sk); - if (tcp_ca_needs_ecn(sk)) -diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c -index 69a545db8..45aaba87c 100644 ---- a/net/ipv4/tcp_input.c -+++ b/net/ipv4/tcp_input.c -@@ -348,7 +348,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) - tcp_enter_quickack_mode(sk, 2); - break; - case INET_ECN_CE: -- if (tcp_ca_needs_ecn(sk)) -+ if (tcp_ca_wants_ce_events(sk)) - tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); - - if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { -@@ -359,7 +359,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) - tp->ecn_flags |= TCP_ECN_SEEN; - break; - default: -- if (tcp_ca_needs_ecn(sk)) -+ if (tcp_ca_wants_ce_events(sk)) - tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); - tp->ecn_flags |= TCP_ECN_SEEN; - break; -@@ -1039,7 +1039,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) - */ - static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) - { -+ struct sock *sk = (struct sock *)tp; -+ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; -+ - tp->lost += tcp_skb_pcount(skb); -+ if (ca_ops->skb_marked_lost) -+ ca_ops->skb_marked_lost(sk, skb); - } - - void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) -@@ -1420,6 +1425,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, - WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); - tcp_skb_pcount_add(skb, -pcount); - -+ /* Adjust tx.in_flight as pcount is shifted from skb to prev. 
*/ -+ if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount, -+ "prev in_flight: %u skb in_flight: %u pcount: %u", -+ TCP_SKB_CB(prev)->tx.in_flight, -+ TCP_SKB_CB(skb)->tx.in_flight, -+ pcount)) -+ TCP_SKB_CB(skb)->tx.in_flight = 0; -+ else -+ TCP_SKB_CB(skb)->tx.in_flight -= pcount; -+ TCP_SKB_CB(prev)->tx.in_flight += pcount; -+ - /* When we're adding to gso_segs == 1, gso_size will be zero, - * in theory this shouldn't be necessary but as long as DSACK - * code can come after this skb later on it's better to keep -@@ -3182,7 +3198,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, - long seq_rtt_us = -1L; - long ca_rtt_us = -1L; - u32 pkts_acked = 0; -- u32 last_in_flight = 0; - bool rtt_update; - int flag = 0; - -@@ -3218,7 +3233,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, - if (!first_ackt) - first_ackt = last_ackt; - -- last_in_flight = TCP_SKB_CB(skb)->tx.in_flight; - if (before(start_seq, reord)) - reord = start_seq; - if (!after(scb->end_seq, tp->high_seq)) -@@ -3284,8 +3298,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, - seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt); - ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt); - -- if (pkts_acked == 1 && last_in_flight < tp->mss_cache && -- last_in_flight && !prior_sacked && fully_acked && -+ if (pkts_acked == 1 && fully_acked && !prior_sacked && -+ (tp->snd_una - prior_snd_una) < tp->mss_cache && - sack->rate->prior_delivered + 1 == tp->delivered && - !(flag & (FLAG_CA_ALERT | FLAG_SYN_ACKED))) { - /* Conservatively mark a delayed ACK. It's typically -@@ -3342,9 +3356,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, - - if (icsk->icsk_ca_ops->pkts_acked) { - struct ack_sample sample = { .pkts_acked = pkts_acked, -- .rtt_us = sack->rate->rtt_us, -- .in_flight = last_in_flight }; -+ .rtt_us = sack->rate->rtt_us }; - -+ sample.in_flight = tp->mss_cache * -+ (tp->delivered - sack->rate->prior_delivered); - icsk->icsk_ca_ops->pkts_acked(sk, &sample); - } - -@@ -3742,6 +3757,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) - - prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; - rs.prior_in_flight = tcp_packets_in_flight(tp); -+ tcp_rate_check_app_limited(sk); - - /* ts_recent update must be made after we are sure that the packet - * is in window. -@@ -3839,6 +3855,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) - delivered = tcp_newly_delivered(sk, delivered, flag); - lost = tp->lost - lost; /* freshly marked lost */ - rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); -+ rs.is_ece = !!(flag & FLAG_ECE); - tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); - tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); - tcp_xmit_recovery(sk, rexmit); -@@ -5399,13 +5416,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) - - /* More than one full frame received... */ - if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && -+ (tp->fast_ack_mode == 1 || - /* ... and right edge of window advances far enough. - * (tcp_recvmsg() will send ACK otherwise). - * If application uses SO_RCVLOWAT, we want send ack now if - * we have not received enough bytes to satisfy the condition. 
- */ -- (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || -- __tcp_select_window(sk) >= tp->rcv_wnd)) || -+ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || -+ __tcp_select_window(sk) >= tp->rcv_wnd))) || - /* We ACK each frame or... */ - tcp_in_quickack_mode(sk) || - /* Protocol state mandates a one-time immediate ACK */ -diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c -index fbf140a77..90d939375 100644 ---- a/net/ipv4/tcp_output.c -+++ b/net/ipv4/tcp_output.c -@@ -1256,8 +1256,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, - tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); - skb->skb_mstamp_ns = tp->tcp_wstamp_ns; - if (clone_it) { -- TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq -- - tp->snd_una; - oskb = skb; - - tcp_skb_tsorted_save(oskb) { -@@ -1536,7 +1534,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, - { - struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *buff; -- int nsize, old_factor; -+ int nsize, old_factor, inflight_prev; - long limit; - int nlen; - u8 flags; -@@ -1615,6 +1613,15 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, - - if (diff) - tcp_adjust_pcount(sk, skb, diff); -+ -+ /* Set buff tx.in_flight as if buff were sent by itself. */ -+ inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor; -+ if (WARN_ONCE(inflight_prev < 0, -+ "inconsistent: tx.in_flight: %u old_factor: %d", -+ TCP_SKB_CB(skb)->tx.in_flight, old_factor)) -+ inflight_prev = 0; -+ TCP_SKB_CB(buff)->tx.in_flight = inflight_prev + -+ tcp_skb_pcount(buff); - } - - /* Link BUFF into the send queue. */ -@@ -1982,13 +1989,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, - static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) - { - const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; -- u32 min_tso, tso_segs; -- -- min_tso = ca_ops->min_tso_segs ? -- ca_ops->min_tso_segs(sk) : -- sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs; -+ u32 tso_segs; - -- tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); -+ tso_segs = ca_ops->tso_segs ? -+ ca_ops->tso_segs(sk, mss_now) : -+ tcp_tso_autosize(sk, mss_now, -+ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); - return min_t(u32, tso_segs, sk->sk_gso_max_segs); - } - -@@ -2628,6 +2634,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, - skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache; - list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); - tcp_init_tso_segs(skb, mss_now); -+ tcp_set_tx_in_flight(sk, skb); - goto repair; /* Skip network transmission */ - } - -diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c -index 0de693565..796fa6e53 100644 ---- a/net/ipv4/tcp_rate.c -+++ b/net/ipv4/tcp_rate.c -@@ -34,6 +34,24 @@ - * ready to send in the write queue. - */ - -+void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ u32 in_flight; -+ -+ /* Check, sanitize, and record packets in flight after skb was sent. 
*/ -+ in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); -+ if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, -+ "insane in_flight %u cc %s mss %u " -+ "cwnd %u pif %u %u %u %u\n", -+ in_flight, inet_csk(sk)->icsk_ca_ops->name, -+ tp->mss_cache, tp->snd_cwnd, -+ tp->packets_out, tp->retrans_out, -+ tp->sacked_out, tp->lost_out)) -+ in_flight = TCPCB_IN_FLIGHT_MAX; -+ TCP_SKB_CB(skb)->tx.in_flight = in_flight; -+} -+ - /* Snapshot the current delivery information in the skb, to generate - * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). - */ -@@ -65,7 +83,10 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) - TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp; - TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; - TCP_SKB_CB(skb)->tx.delivered = tp->delivered; -+ TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; -+ TCP_SKB_CB(skb)->tx.lost = tp->lost; - TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0; -+ tcp_set_tx_in_flight(sk, skb); - } - - /* When an skb is sacked or acked, we fill in the rate sample with the (prior) -@@ -86,16 +107,20 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, - - if (!rs->prior_delivered || - after(scb->tx.delivered, rs->prior_delivered)) { -+ rs->prior_lost = scb->tx.lost; -+ rs->prior_delivered_ce = scb->tx.delivered_ce; - rs->prior_delivered = scb->tx.delivered; - rs->prior_mstamp = scb->tx.delivered_mstamp; - rs->is_app_limited = scb->tx.is_app_limited; - rs->is_retrans = scb->sacked & TCPCB_RETRANS; -+ rs->tx_in_flight = scb->tx.in_flight; - - /* Record send time of most recently ACKed packet: */ - tp->first_tx_mstamp = tcp_skb_timestamp_us(skb); - /* Find the duration of the "send phase" of this window: */ -- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, -- scb->tx.first_tx_mstamp); -+ rs->interval_us = tcp_stamp32_us_delta( -+ tp->first_tx_mstamp, -+ scb->tx.first_tx_mstamp); - - } - /* Mark off the skb delivered once it's sacked to avoid being -@@ -137,6 +162,11 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, - return; - } - rs->delivered = tp->delivered - rs->prior_delivered; -+ rs->lost = tp->lost - rs->prior_lost; -+ -+ rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; -+ /* delivered_ce occupies less than 32 bits in the skb control block */ -+ rs->delivered_ce &= TCPCB_DELIVERED_CE_MASK; - - /* Model sending data and receiving ACKs as separate pipeline phases - * for a window. Usually the ACK phase is longer, but with ACK -@@ -144,7 +174,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, - * longer phase. 
- */ - snd_us = rs->interval_us; /* send phase */ -- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, -+ ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp, - rs->prior_mstamp); /* ack phase */ - rs->interval_us = max(snd_us, ack_us); - -diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c -index 4ef08079c..b5b24caa8 100644 ---- a/net/ipv4/tcp_timer.c -+++ b/net/ipv4/tcp_timer.c -@@ -607,6 +607,7 @@ void tcp_write_timer_handler(struct sock *sk) - goto out; - } - -+ tcp_rate_check_app_limited(sk); - tcp_mstamp_refresh(tcp_sk(sk)); - event = icsk->icsk_pending; - --- -2.31.1.305.gd1b10fc6d8 - diff --git a/0010-btrfs.patch b/0010-btrfs.patch deleted file mode 100644 index 457e2445824d..000000000000 --- a/0010-btrfs.patch +++ /dev/null @@ -1,2157 +0,0 @@ -From dfe89528bf8d093c1df80ea3fea2a50d3dc4a302 Mon Sep 17 00:00:00 2001 -From: Josef Bacik <josef@toxicpanda.com> -Date: Mon, 5 Aug 2019 14:31:53 -0400 -Subject: [PATCH 01/22] btrfs: add a force_chunk_alloc to space_info's sysfs - -In testing various things such as the btrfsck patch to detect over -allocation of chunks, empty block group deletion, and balance I've had -various ways to force chunk allocations for debug purposes. Add a sysfs -file to enable forcing of chunk allocation for the owning space info in -order to enable us to add testcases in the future to test these various -features easier. - -[HH: rebased for 5.4] -Signed-off-by: Josef Bacik <josef@toxicpanda.com> ---- - fs/btrfs/sysfs.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 64 insertions(+) - -diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c -index 6eb1c50fa..9372ef191 100644 ---- a/fs/btrfs/sysfs.c -+++ b/fs/btrfs/sysfs.c -@@ -72,6 +72,7 @@ static struct btrfs_feature_attr btrfs_attr_features_##_name = { \ - - static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); - static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj); -+static inline struct kobject *get_btrfs_kobj(struct kobject *kobj); - - static struct btrfs_feature_attr *to_btrfs_feature_attr(struct kobj_attribute *a) - { -@@ -640,6 +641,58 @@ static struct kobj_type btrfs_raid_ktype = { - .default_groups = raid_groups, - }; - -+static ssize_t btrfs_space_info_force_chunk_alloc_show(struct kobject *kobj, -+ struct kobj_attribute *a, -+ char *buf) -+{ -+ return snprintf(buf, PAGE_SIZE, "0\n"); -+} -+ -+static ssize_t btrfs_space_info_force_chunk_alloc(struct kobject *kobj, -+ struct kobj_attribute *a, -+ const char *buf, size_t len) -+{ -+ struct btrfs_space_info *space_info = to_space_info(kobj); -+ struct btrfs_fs_info *fs_info = to_fs_info(get_btrfs_kobj(kobj)); -+ struct btrfs_trans_handle *trans; -+ unsigned long val; -+ int ret; -+ -+ if (!fs_info) { -+ printk(KERN_ERR "couldn't get fs_info\n"); -+ return -EPERM; -+ } -+ -+ if (!capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ if (sb_rdonly(fs_info->sb)) -+ return -EROFS; -+ -+ ret = kstrtoul(buf, 10, &val); -+ if (ret) -+ return ret; -+ -+ /* -+ * We don't really care, but if we echo 0 > force it seems silly to do -+ * anything. 
-+	 */
-+	if (val == 0)
-+		return -EINVAL;
-+
-+	trans = btrfs_start_transaction(fs_info->extent_root, 0);
-+	if (IS_ERR(trans))
-+		return PTR_ERR(trans);
-+	ret = btrfs_force_chunk_alloc(trans, space_info->flags);
-+	btrfs_end_transaction(trans);
-+	if (ret == 1)
-+		return len;
-+	return -ENOSPC;
-+}
-+BTRFS_ATTR_RW(space_info, force_chunk_alloc,
-+	      btrfs_space_info_force_chunk_alloc_show,
-+	      btrfs_space_info_force_chunk_alloc);
-+
- #define SPACE_INFO_ATTR(field)					\
- static ssize_t btrfs_space_info_show_##field(struct kobject *kobj,	\
- 					     struct kobj_attribute *a,	\
-@@ -684,6 +737,7 @@ static struct attribute *space_info_attrs[] = {
- 	BTRFS_ATTR_PTR(space_info, disk_used),
- 	BTRFS_ATTR_PTR(space_info, disk_total),
- 	BTRFS_ATTR_PTR(space_info, total_bytes_pinned),
-+	BTRFS_ATTR_PTR(space_info, force_chunk_alloc),
- 	NULL,
- };
- ATTRIBUTE_GROUPS(space_info);
-@@ -1006,6 +1060,16 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
- 	return to_fs_devs(kobj)->fs_info;
- }
- 
-+static inline struct kobject *get_btrfs_kobj(struct kobject *kobj)
-+{
-+	while (kobj) {
-+		if (kobj->ktype == &btrfs_ktype)
-+			return kobj;
-+		kobj = kobj->parent;
-+	}
-+	return NULL;
-+}
-+
- #define NUM_FEATURE_BITS 64
- #define BTRFS_FEATURE_NAME_MAX 13
- static char btrfs_unknown_feature_names[FEAT_MAX][NUM_FEATURE_BITS][BTRFS_FEATURE_NAME_MAX];
---
-2.32.0
-
-
-From e104f0dda22a999ddd5f0be76ffc62637b411a3f Mon Sep 17 00:00:00 2001
-From: Josef Bacik <josef@toxicpanda.com>
-Date: Fri, 24 Jul 2020 12:41:47 -0400
-Subject: [PATCH 02/22] btrfs: do not evaluate the expression with
- !CONFIG_BTRFS_ASSERT
-
-While investigating a performance issue I noticed that turning off
-CONFIG_BTRFS_ASSERT had no effect in what I was seeing in perf,
-specifically check_setget_bounds() was around 5% for this workload.
-Upon investigation I realized that I made a mistake when I added
-ASSERT(), I would still evaluate the expression, but simply ignore the
-result.
-
-This is useless, and has a marked impact on performance. This
-microbenchmark is the watered down version of an application that is
-experiencing performance issues, and does renames and creates over and
-over again. Doing these operations 200k times without this patch takes
-13 seconds on my machine. With this patch it takes 7 seconds.
-
-[HH: removed the second hunk for 5.7.x]
-Signed-off-by: Josef Bacik <josef@toxicpanda.com>
-Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
----
- fs/btrfs/ctree.h | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
-index 29ef96903..12921830e 100644
---- a/fs/btrfs/ctree.h
-+++ b/fs/btrfs/ctree.h
-@@ -3402,7 +3402,7 @@ static inline void assertfail(const char *expr, const char *file, int line)
- 
- #else
- static inline void assertfail(const char *expr, const char* file, int line) { }
--#define ASSERT(expr)	(void)(expr)
-+#define ASSERT(expr)	((void)0)
- #endif
- 
- /*
---
-2.32.0
-
-
-From a8c8b6d8a9763fe25f616567c8005318d3cbd948 Mon Sep 17 00:00:00 2001
-From: Josef Bacik <josef@toxicpanda.com>
-Date: Fri, 20 Mar 2020 14:34:36 -0400
-Subject: [PATCH 03/22] btrfs: restart snapshot delete if we have to end the
- transaction
-
-This is to fully fix the deadlock described in
-
-btrfs: do not resolve backrefs for roots that are being deleted
-
-Holding write locks on our deleted snapshot across trans handles will
-just lead to sadness, and our backref lookup code is going to want to
-still process dropped snapshots for things like qgroup accounting.
- -Fix this by simply dropping our path before we restart our transaction, -and picking back up from our drop_progress key. This is less efficient -obviously, but it also doesn't deadlock, so it feels like a reasonable -trade off. - -Signed-off-by: Josef Bacik <josef@toxicpanda.com> ---- - fs/btrfs/extent-tree.c | 16 ++++++++++++++++ - 1 file changed, 16 insertions(+) - -diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c -index 27c368007..1aff5769c 100644 ---- a/fs/btrfs/extent-tree.c -+++ b/fs/btrfs/extent-tree.c -@@ -5563,6 +5563,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) - * already dropped. - */ - set_bit(BTRFS_ROOT_DELETING, &root->state); -+again: - if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { - level = btrfs_header_level(root->node); - path->nodes[level] = btrfs_lock_root_node(root); -@@ -5574,7 +5575,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) - btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); - memcpy(&wc->update_progress, &key, - sizeof(wc->update_progress)); -+ memcpy(&wc->drop_progress, &key, sizeof(key)); - -+ wc->drop_level = btrfs_root_drop_level(root_item); - level = btrfs_root_drop_level(root_item); - BUG_ON(level == 0); - path->lowest_level = level; -@@ -5666,6 +5669,18 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) - goto out_end_trans; - } - -+ /* -+ * We used to keep the path open until we completed the -+ * snapshot delete. However this can deadlock with -+ * things like backref walking that may want to resolve -+ * references that still point to this deleted root. We -+ * already have the ability to restart snapshot -+ * deletions on mount, so just clear our walk_control, -+ * drop the path, and go to the beginning and re-lookup -+ * our drop_progress key and continue from there. -+ */ -+ memset(wc, 0, sizeof(*wc)); -+ btrfs_release_path(path); - btrfs_end_transaction_throttle(trans); - if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) { - btrfs_debug(fs_info, -@@ -5687,6 +5702,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) - err = PTR_ERR(trans); - goto out_free; - } -+ goto again; - } - } - btrfs_release_path(path); --- -2.32.0 - - -From f241ea708d4c4da7800436d3c74d0cd7836c75e8 Mon Sep 17 00:00:00 2001 -From: Josef Bacik <josef@toxicpanda.com> -Date: Wed, 24 Mar 2021 09:44:21 -0400 -Subject: [PATCH 04/22] btrfs: use percpu_read_positive instead of sum_positive - for need_preempt - -Looking at perf data for a fio workload I noticed that we were spending -a pretty large chunk of time (around 5%) doing percpu_counter_sum() in -need_preemptive_reclaim. This is silly, as we only want to know if we -have more ordered than delalloc to see if we should be counting the -delayed items in our threshold calculation. Change this to -percpu_read_positive() to avoid the overhead. - -I ran this through fsperf to validate the changes, obviously the latency -numbers in dbench and fio are quite jittery, so take them as you wish, -but overall the improvements on throughput, iops, and bw are all -positive. Each test was run two times, the given value is the average -of both runs for their respective column. 
- -btrfs ssd normal test results - -bufferedrandwrite16g results - metric baseline current diff -========================================================== -write_io_kbytes 16777216 16777216 0.00% -read_clat_ns_p99 0 0 0.00% -write_bw_bytes 1.04e+08 1.05e+08 1.12% -read_iops 0 0 0.00% -write_clat_ns_p50 13888 11840 -14.75% -read_io_kbytes 0 0 0.00% -read_io_bytes 0 0 0.00% -write_clat_ns_p99 35008 29312 -16.27% -read_bw_bytes 0 0 0.00% -elapsed 170 167 -1.76% -write_lat_ns_min 4221.50 3762.50 -10.87% -sys_cpu 39.65 35.37 -10.79% -write_lat_ns_max 2.67e+10 2.50e+10 -6.63% -read_lat_ns_min 0 0 0.00% -write_iops 25270.10 25553.43 1.12% -read_lat_ns_max 0 0 0.00% -read_clat_ns_p50 0 0 0.00% - -dbench60 results - metric baseline current diff -================================================== -qpathinfo 11.12 12.73 14.52% -throughput 416.09 445.66 7.11% -flush 3485.63 1887.55 -45.85% -qfileinfo 0.70 1.92 173.86% -ntcreatex 992.60 695.76 -29.91% -qfsinfo 2.43 3.71 52.48% -close 1.67 3.14 88.09% -sfileinfo 66.54 105.20 58.10% -rename 809.23 619.59 -23.43% -find 16.88 15.46 -8.41% -unlink 820.54 670.86 -18.24% -writex 3375.20 2637.91 -21.84% -deltree 386.33 449.98 16.48% -readx 3.43 3.41 -0.60% -mkdir 0.05 0.03 -38.46% -lockx 0.26 0.26 -0.76% -unlockx 0.81 0.32 -60.33% - -dio4kbs16threads results - metric baseline current diff -================================================================ -write_io_kbytes 5249676 3357150 -36.05% -read_clat_ns_p99 0 0 0.00% -write_bw_bytes 89583501.50 57291192.50 -36.05% -read_iops 0 0 0.00% -write_clat_ns_p50 242688 263680 8.65% -read_io_kbytes 0 0 0.00% -read_io_bytes 0 0 0.00% -write_clat_ns_p99 15826944 36732928 132.09% -read_bw_bytes 0 0 0.00% -elapsed 61 61 0.00% -write_lat_ns_min 42704 42095 -1.43% -sys_cpu 5.27 3.45 -34.52% -write_lat_ns_max 7.43e+08 9.27e+08 24.71% -read_lat_ns_min 0 0 0.00% -write_iops 21870.97 13987.11 -36.05% -read_lat_ns_max 0 0 0.00% -read_clat_ns_p50 0 0 0.00% - -randwrite2xram results - metric baseline current diff -================================================================ -write_io_kbytes 24831972 28876262 16.29% -read_clat_ns_p99 0 0 0.00% -write_bw_bytes 83745273.50 92182192.50 10.07% -read_iops 0 0 0.00% -write_clat_ns_p50 13952 11648 -16.51% -read_io_kbytes 0 0 0.00% -read_io_bytes 0 0 0.00% -write_clat_ns_p99 50176 52992 5.61% -read_bw_bytes 0 0 0.00% -elapsed 314 332 5.73% -write_lat_ns_min 5920.50 5127 -13.40% -sys_cpu 7.82 7.35 -6.07% -write_lat_ns_max 5.27e+10 3.88e+10 -26.44% -read_lat_ns_min 0 0 0.00% -write_iops 20445.62 22505.42 10.07% -read_lat_ns_max 0 0 0.00% -read_clat_ns_p50 0 0 0.00% - -untarfirefox results -metric baseline current diff -============================================== -elapsed 47.41 47.40 -0.03% - -Signed-off-by: Josef Bacik <josef@toxicpanda.com> ---- - fs/btrfs/space-info.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c -index 2da6177f4..2dc674b7c 100644 ---- a/fs/btrfs/space-info.c -+++ b/fs/btrfs/space-info.c -@@ -861,8 +861,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, - * of heavy DIO or ordered reservations, preemptive flushing will just - * waste time and cause us to slow down. 
- */ -- ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes); -- delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes); -+ ordered = percpu_counter_read_positive(&fs_info->ordered_bytes); -+ delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes); - if (ordered >= delalloc) - used += fs_info->delayed_refs_rsv.reserved + - fs_info->delayed_block_rsv.reserved; --- -2.32.0 - - -From aafa62450e938fae02462b7d47e055eb6307c57d Mon Sep 17 00:00:00 2001 -From: Filipe Manana <fdmanana@suse.com> -Date: Mon, 1 Mar 2021 09:26:42 +0000 -Subject: [PATCH 05/22] btrfs: add btree read ahead for full send operations - -When doing a full send we know that we are going to be reading every node -and leaf of the send root, so we benefit from enabling read ahead for the -btree. - -This change enables read ahead for full send operations only, incremental -sends will have read ahead enabled in a different way by a separate patch. - -The following test script was used to measure the improvement on a box -using an average, consumer grade, spinning disk and with 16Gb of ram: - - $ cat test.sh - #!/bin/bash - - DEV=/dev/sdj - MNT=/mnt/sdj - MKFS_OPTIONS="--nodesize 16384" # default, just to be explicit - MOUNT_OPTIONS="-o max_inline=2048" # default, just to be explicit - - mkfs.btrfs -f $MKFS_OPTIONS $DEV > /dev/null - mount $MOUNT_OPTIONS $DEV $MNT - - # Create files with inline data to make it easier and faster to create - # large btrees. - add_files() - { - local total=$1 - local start_offset=$2 - local number_jobs=$3 - local total_per_job=$(($total / $number_jobs)) - - echo "Creating $total new files using $number_jobs jobs" - for ((n = 0; n < $number_jobs; n++)); do - ( - local start_num=$(($start_offset + $n * $total_per_job)) - for ((i = 1; i <= $total_per_job; i++)); do - local file_num=$((start_num + $i)) - local file_path="$MNT/file_${file_num}" - xfs_io -f -c "pwrite -S 0xab 0 2000" $file_path > /dev/null - if [ $? -ne 0 ]; then - echo "Failed creating file $file_path" - break - fi - done - ) & - worker_pids[$n]=$! - done - - wait ${worker_pids[@]} - - sync - echo - echo "btree node/leaf count: $(btrfs inspect-internal dump-tree -t 5 $DEV | egrep '^(node|leaf) ' | wc -l)" - } - - initial_file_count=500000 - add_files $initial_file_count 0 4 - - echo - echo "Creating first snapshot..." - btrfs subvolume snapshot -r $MNT $MNT/snap1 - - echo - echo "Adding more files..." - add_files $((initial_file_count / 4)) $initial_file_count 4 - - echo - echo "Updating 1/50th of the initial files..." - for ((i = 1; i < $initial_file_count; i += 50)); do - xfs_io -c "pwrite -S 0xcd 0 20" $MNT/file_$i > /dev/null - done - - echo - echo "Creating second snapshot..." - btrfs subvolume snapshot -r $MNT $MNT/snap2 - - umount $MNT - - echo 3 > /proc/sys/vm/drop_caches - blockdev --flushbufs $DEV &> /dev/null - hdparm -F $DEV &> /dev/null - - mount $MOUNT_OPTIONS $DEV $MNT - - echo - echo "Testing full send..." - start=$(date +%s) - btrfs send $MNT/snap1 > /dev/null - end=$(date +%s) - echo - echo "Full send took $((end - start)) seconds" - - umount $MNT - - echo 3 > /proc/sys/vm/drop_caches - blockdev --flushbufs $DEV &> /dev/null - hdparm -F $DEV &> /dev/null - - mount $MOUNT_OPTIONS $DEV $MNT - - echo - echo "Testing incremental send..." 
- start=$(date +%s)
- btrfs send -p $MNT/snap1 $MNT/snap2 > /dev/null
- end=$(date +%s)
- echo
- echo "Incremental send took $((end - start)) seconds"
-
- umount $MNT
-
-Before this change, full send duration:
-
-with $initial_file_count == 200000: 165 seconds
-with $initial_file_count == 500000: 407 seconds
-
-After this change, full send duration:
-
-with $initial_file_count == 200000: 149 seconds (-10.2%)
-with $initial_file_count == 500000: 353 seconds (-14.2%)
-
-For $initial_file_count == 200000 there are 62600 nodes and leaves in the
-btree of the first snapshot, while for $initial_file_count == 500000 there
-are 152476 nodes and leaves. The roots were at level 2.
-
-Signed-off-by: Filipe Manana <fdmanana@suse.com>
----
- fs/btrfs/send.c | 1 +
- 1 file changed, 1 insertion(+)
-
-diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
-index 8ae8f1732..9817da145 100644
---- a/fs/btrfs/send.c
-+++ b/fs/btrfs/send.c
-@@ -6650,6 +6650,7 @@ static int full_send_tree(struct send_ctx *sctx)
- 	path = alloc_path_for_send();
- 	if (!path)
- 		return -ENOMEM;
-+	path->reada = READA_FORWARD;
- 
- 	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
- 	key.type = BTRFS_INODE_ITEM_KEY;
---
-2.32.0
-
-
-From a1df61cf5c2efa2298869acced2eb5f6a51e27ed Mon Sep 17 00:00:00 2001
-From: Filipe Manana <fdmanana@suse.com>
-Date: Mon, 1 Mar 2021 09:26:43 +0000
-Subject: [PATCH 06/22] btrfs: add btree read ahead for incremental send
- operations
-
-Currently we do not do btree read ahead when doing an incremental send,
-however we know that we will read and process any node or leaf in the
-send root that has a generation greater than the generation of the parent
-root. So triggering read ahead for such nodes and leaves is beneficial
-for an incremental send.
-
-This change does that, triggers read ahead of any node or leaf in the
-send root that has a generation greater than the generation of the
-parent root. As for the parent root, no readahead is triggered because
-knowing in advance which nodes/leaves are going to be read is not so
-linear and there's often a large time window between visiting nodes or
-leaves of the parent root. So I opted to leave out the parent root,
-and triggering read ahead for its nodes/leaves seemed to have not made
-a significant difference.
-
-The following test script was used to measure the improvement on a box
-using an average, consumer grade, spinning disk and with 16Gb of ram:
-
- $ cat test.sh
- #!/bin/bash
-
- DEV=/dev/sdj
- MNT=/mnt/sdj
- MKFS_OPTIONS="--nodesize 16384" # default, just to be explicit
- MOUNT_OPTIONS="-o max_inline=2048" # default, just to be explicit
-
- mkfs.btrfs -f $MKFS_OPTIONS $DEV > /dev/null
- mount $MOUNT_OPTIONS $DEV $MNT
-
- # Create files with inline data to make it easier and faster to create
- # large btrees.
- add_files()
- {
-     local total=$1
-     local start_offset=$2
-     local number_jobs=$3
-     local total_per_job=$(($total / $number_jobs))
-
-     echo "Creating $total new files using $number_jobs jobs"
-     for ((n = 0; n < $number_jobs; n++)); do
-         (
-             local start_num=$(($start_offset + $n * $total_per_job))
-             for ((i = 1; i <= $total_per_job; i++)); do
-                 local file_num=$((start_num + $i))
-                 local file_path="$MNT/file_${file_num}"
-                 xfs_io -f -c "pwrite -S 0xab 0 2000" $file_path > /dev/null
-                 if [ $? -ne 0 ]; then
-                     echo "Failed creating file $file_path"
-                     break
-                 fi
-             done
-         ) &
-         worker_pids[$n]=$!
- done - - wait ${worker_pids[@]} - - sync - echo - echo "btree node/leaf count: $(btrfs inspect-internal dump-tree -t 5 $DEV | egrep '^(node|leaf) ' | wc -l)" - } - - initial_file_count=500000 - add_files $initial_file_count 0 4 - - echo - echo "Creating first snapshot..." - btrfs subvolume snapshot -r $MNT $MNT/snap1 - - echo - echo "Adding more files..." - add_files $((initial_file_count / 4)) $initial_file_count 4 - - echo - echo "Updating 1/50th of the initial files..." - for ((i = 1; i < $initial_file_count; i += 50)); do - xfs_io -c "pwrite -S 0xcd 0 20" $MNT/file_$i > /dev/null - done - - echo - echo "Creating second snapshot..." - btrfs subvolume snapshot -r $MNT $MNT/snap2 - - umount $MNT - - echo 3 > /proc/sys/vm/drop_caches - blockdev --flushbufs $DEV &> /dev/null - hdparm -F $DEV &> /dev/null - - mount $MOUNT_OPTIONS $DEV $MNT - - echo - echo "Testing full send..." - start=$(date +%s) - btrfs send $MNT/snap1 > /dev/null - end=$(date +%s) - echo - echo "Full send took $((end - start)) seconds" - - umount $MNT - - echo 3 > /proc/sys/vm/drop_caches - blockdev --flushbufs $DEV &> /dev/null - hdparm -F $DEV &> /dev/null - - mount $MOUNT_OPTIONS $DEV $MNT - - echo - echo "Testing incremental send..." - start=$(date +%s) - btrfs send -p $MNT/snap1 $MNT/snap2 > /dev/null - end=$(date +%s) - echo - echo "Incremental send took $((end - start)) seconds" - - umount $MNT - -Before this change, incremental send duration: - -with $initial_file_count == 200000: 51 seconds -with $initial_file_count == 500000: 168 seconds - -After this change, incremental send duration: - -with $initial_file_count == 200000: 39 seconds (-26.7%) -with $initial_file_count == 500000: 125 seconds (-29.4%) - -For $initial_file_count == 200000 there are 62600 nodes and leaves in the -btree of the first snapshot, and 77759 nodes and leaves in the btree of -the second snapshot. The root nodes were at level 2. - -While for $initial_file_count == 500000 there are 152476 nodes and leaves -in the btree of the first snapshot, and 190511 nodes and leaves in the -btree of the second snapshot. The root nodes were at level 2 as well. - -Signed-off-by: Filipe Manana <fdmanana@suse.com> ---- - fs/btrfs/send.c | 42 ++++++++++++++++++++++++++++++++++++------ - 1 file changed, 36 insertions(+), 6 deletions(-) - -diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c -index 9817da145..ed1310e38 100644 ---- a/fs/btrfs/send.c -+++ b/fs/btrfs/send.c -@@ -6689,15 +6689,35 @@ static int full_send_tree(struct send_ctx *sctx) - return ret; - } - --static int tree_move_down(struct btrfs_path *path, int *level) -+static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen) - { - struct extent_buffer *eb; -+ struct extent_buffer *parent = path->nodes[*level]; -+ int slot = path->slots[*level]; -+ const int nritems = btrfs_header_nritems(parent); -+ u64 reada_max; -+ u64 reada_done = 0; - - BUG_ON(*level == 0); -- eb = btrfs_read_node_slot(path->nodes[*level], path->slots[*level]); -+ eb = btrfs_read_node_slot(parent, slot); - if (IS_ERR(eb)) - return PTR_ERR(eb); - -+ /* -+ * Trigger readahead for the next leaves we will process, so that it is -+ * very likely that when we need them they are already in memory and we -+ * will not block on disk IO. For nodes we only do readahead for one, -+ * since the time window between processing nodes is typically larger. -+ */ -+ reada_max = *level == 1 ? 
SZ_128K : eb->fs_info->nodesize; -+ -+ for (slot++; slot < nritems && reada_done < reada_max; slot++) { -+ if (btrfs_node_ptr_generation(parent, slot) > reada_min_gen) { -+ btrfs_readahead_node_child(parent, slot); -+ reada_done += eb->fs_info->nodesize; -+ } -+ } -+ - path->nodes[*level - 1] = eb; - path->slots[*level - 1] = 0; - (*level)--; -@@ -6737,14 +6757,15 @@ static int tree_move_next_or_upnext(struct btrfs_path *path, - static int tree_advance(struct btrfs_path *path, - int *level, int root_level, - int allow_down, -- struct btrfs_key *key) -+ struct btrfs_key *key, -+ u64 reada_min_gen) - { - int ret; - - if (*level == 0 || !allow_down) { - ret = tree_move_next_or_upnext(path, level, root_level); - } else { -- ret = tree_move_down(path, level); -+ ret = tree_move_down(path, level, reada_min_gen); - } - if (ret >= 0) { - if (*level == 0) -@@ -6818,6 +6839,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, - u64 right_blockptr; - u64 left_gen; - u64 right_gen; -+ u64 reada_min_gen; - - left_path = btrfs_alloc_path(); - if (!left_path) { -@@ -6897,6 +6919,14 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, - ret = -ENOMEM; - goto out; - } -+ /* -+ * Our right root is the parent root, while the left root is the "send" -+ * root. We know that all new nodes/leaves in the left root must have -+ * a generation greater than the right root's generation, so we trigger -+ * readahead for those nodes and leaves of the left root, as we know we -+ * will need to read them at some point. -+ */ -+ reada_min_gen = btrfs_header_generation(right_root->commit_root); - up_read(&fs_info->commit_root_sem); - - if (left_level == 0) -@@ -6921,7 +6951,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, - ret = tree_advance(left_path, &left_level, - left_root_level, - advance_left != ADVANCE_ONLY_NEXT, -- &left_key); -+ &left_key, reada_min_gen); - if (ret == -1) - left_end_reached = ADVANCE; - else if (ret < 0) -@@ -6932,7 +6962,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, - ret = tree_advance(right_path, &right_level, - right_root_level, - advance_right != ADVANCE_ONLY_NEXT, -- &right_key); -+ &right_key, reada_min_gen); - if (ret == -1) - right_end_reached = ADVANCE; - else if (ret < 0) --- -2.32.0 - - -From 54ee4da43eb299229eced65ffd9097b16003d1a3 Mon Sep 17 00:00:00 2001 -From: Josef Bacik <josef@toxicpanda.com> -Date: Wed, 28 Apr 2021 13:38:42 -0400 -Subject: [PATCH 07/22] btrfs: check worker before need_preemptive_reclaim - -need_preemptive_reclaim() does some calculations, which aren't heavy, -but if we're already running preemptive reclaim there's no reason to do -them at all, so re-order the checks so that we don't do the calculation -if we're already doing reclaim. - -Signed-off-by: Josef Bacik <josef@toxicpanda.com> ---- - fs/btrfs/space-info.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c -index 2dc674b7c..c9a5e003b 100644 ---- a/fs/btrfs/space-info.c -+++ b/fs/btrfs/space-info.c -@@ -1588,8 +1588,8 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, - * the async reclaim as we will panic. 
- */ - if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && -- need_preemptive_reclaim(fs_info, space_info) && -- !work_busy(&fs_info->preempt_reclaim_work)) { -+ !work_busy(&fs_info->preempt_reclaim_work) && -+ need_preemptive_reclaim(fs_info, space_info)) { - trace_btrfs_trigger_flush(fs_info, space_info->flags, - orig_bytes, flush, "preempt"); - queue_work(system_unbound_wq, --- -2.32.0 - - -From 76efa702dc8f1626c48eb0836205b7fb5b0ea94d Mon Sep 17 00:00:00 2001 -From: Josef Bacik <josef@toxicpanda.com> -Date: Wed, 28 Apr 2021 13:38:43 -0400 -Subject: [PATCH 08/22] btrfs: only clamp the first time we have to start - flushing - -We were clamping the threshold for preemptive reclaim any time we added -a ticket to wait on, which if we have a lot of threads means we'd -essentially max out the clamp the first time we start to flush. Instead -of doing this, simply do it every time we have to start flushing, this -will make us ramp up gradually instead of going to max clamping as soon -as we start needing to do flushing. - -Signed-off-by: Josef Bacik <josef@toxicpanda.com> ---- - fs/btrfs/space-info.c | 17 +++++++++-------- - 1 file changed, 9 insertions(+), 8 deletions(-) - -diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c -index c9a5e003b..33edab17a 100644 ---- a/fs/btrfs/space-info.c -+++ b/fs/btrfs/space-info.c -@@ -1561,6 +1561,15 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, - flush == BTRFS_RESERVE_FLUSH_DATA) { - list_add_tail(&ticket.list, &space_info->tickets); - if (!space_info->flush) { -+ /* -+ * We were forced to add a reserve ticket, so -+ * our preemptive flushing is unable to keep -+ * up. Clamp down on the threshold for the -+ * preemptive flushing in order to keep up with -+ * the workload. -+ */ -+ maybe_clamp_preempt(fs_info, space_info); -+ - space_info->flush = 1; - trace_btrfs_trigger_flush(fs_info, - space_info->flags, -@@ -1572,14 +1581,6 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, - list_add_tail(&ticket.list, - &space_info->priority_tickets); - } -- -- /* -- * We were forced to add a reserve ticket, so our preemptive -- * flushing is unable to keep up. Clamp down on the threshold -- * for the preemptive flushing in order to keep up with the -- * workload. -- */ -- maybe_clamp_preempt(fs_info, space_info); - } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { - used += orig_bytes; - /* --- -2.32.0 - - -From 9844f3f192822e44c70ee90722b704d785f4884e Mon Sep 17 00:00:00 2001 -From: Josef Bacik <josef@toxicpanda.com> -Date: Wed, 28 Apr 2021 13:38:46 -0400 -Subject: [PATCH 09/22] btrfs: don't include the global rsv size in the - preemptive used amount - -When deciding if we should preemptively flush space, we will add in the -amount of space used by all block rsvs. However this also includes the -global block rsv, which isn't flushable so shouldn't be accounted for in -this calculation. If we decide to use ->bytes_may_use in our used -calculation we need to subtract the global rsv size from this amount so -it most closely matches the flushable space. 
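-
-As a rough, invented illustration (not numbers from a real run): if
-bytes_may_use is 2GiB and the global rsv holds 512MiB of that, only
-about 1.5GiB is actually flushable, so counting the full 2GiB would
-overstate the reclaimable space and could keep the preemptive flusher
-spinning with nothing useful left to flush.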
- -Signed-off-by: Josef Bacik <josef@toxicpanda.com> ---- - fs/btrfs/space-info.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c -index 33edab17a..52e3bfedc 100644 ---- a/fs/btrfs/space-info.c -+++ b/fs/btrfs/space-info.c -@@ -867,7 +867,7 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, - used += fs_info->delayed_refs_rsv.reserved + - fs_info->delayed_block_rsv.reserved; - else -- used += space_info->bytes_may_use; -+ used += space_info->bytes_may_use - global_rsv_size; - - return (used >= thresh && !btrfs_fs_closing(fs_info) && - !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); --- -2.32.0 - - -From 1d309a9a237acff83ae53ba25b79dc21e2454b3d Mon Sep 17 00:00:00 2001 -From: Josef Bacik <josef@toxicpanda.com> -Date: Wed, 28 Apr 2021 13:38:44 -0400 -Subject: [PATCH 10/22] btrfs: take into account global rsv in - need_preemptive_reclaim - -Global rsv can't be used for normal allocations, and for very full file -systems we can decide to try and async flush constantly even though -there's really not a lot of space to reclaim. Deal with this by -including the global block rsv size in the "total used" calculation. - -[HH: small context fix for 5.10.x] -Signed-off-by: Josef Bacik <josef@toxicpanda.com> ---- - fs/btrfs/space-info.c | 4 +++- - 1 file changed, 3 insertions(+), 1 deletion(-) - -diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c -index 52e3bfedc..aeb1f0b7b 100644 ---- a/fs/btrfs/space-info.c -+++ b/fs/btrfs/space-info.c -@@ -792,12 +792,14 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, - static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) - { -+ u64 global_rsv_size = fs_info->global_block_rsv.reserved; - u64 ordered, delalloc; - u64 thresh = div_factor_fine(space_info->total_bytes, 98); - u64 used; - - /* If we're just plain full then async reclaim just slows us down. */ -- if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) -+ if ((space_info->bytes_used + space_info->bytes_reserved + -+ global_rsv_size) >= thresh) - return false; - - /* --- -2.32.0 - - -From fa148091ff63557e1d123194069b3d418d6129e4 Mon Sep 17 00:00:00 2001 -From: Josef Bacik <josef@toxicpanda.com> -Date: Wed, 28 Apr 2021 13:38:45 -0400 -Subject: [PATCH 11/22] btrfs: use the global rsv size in the preemptive thresh - calculation - -We calculate the amount of "free" space available for normal -reservations by taking the total space and subtracting out the hard used -space, which is readonly, used, and reserved space. However we weren't -taking into account the global block rsv, which is essentially hard used -space. Handle this by subtracting it from the available free space, so -that our threshold more closely mirrors reality. 
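-
-A hypothetical example (numbers invented for illustration): with
-total_bytes = 10GiB, bytes_used + bytes_reserved + bytes_readonly = 8GiB
-and a 512MiB global rsv, the old calculation credited 2GiB of "free"
-space to the threshold, while only about 1.5GiB can actually back normal
-reservations.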
-
-[HH: small context fix for 5.10.x]
-Signed-off-by: Josef Bacik <josef@toxicpanda.com>
----
- fs/btrfs/space-info.c | 6 ++++--
- 1 file changed, 4 insertions(+), 2 deletions(-)
-
-diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
-index aeb1f0b7b..4e3857474 100644
---- a/fs/btrfs/space-info.c
-+++ b/fs/btrfs/space-info.c
-@@ -840,8 +840,10 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
- 
- 	thresh = calc_available_free_space(fs_info, space_info,
- 					   BTRFS_RESERVE_FLUSH_ALL);
--	thresh += (space_info->total_bytes - space_info->bytes_used -
--		   space_info->bytes_reserved - space_info->bytes_readonly);
-+	used = space_info->bytes_used + space_info->bytes_reserved +
-+	       space_info->bytes_readonly + global_rsv_size;
-+	if (used < space_info->total_bytes)
-+		thresh += space_info->total_bytes - used;
- 	thresh >>= space_info->clamp;
- 
- 	used = space_info->bytes_pinned;
---
-2.32.0
-
-
-From 5270bca333d3c4f4bf045f224f6b94fe12528086 Mon Sep 17 00:00:00 2001
-From: Josef Bacik <josef@toxicpanda.com>
-Date: Wed, 28 Apr 2021 13:38:47 -0400
-Subject: [PATCH 12/22] btrfs: only ignore delalloc if delalloc is much smaller
- than ordered
-
-While testing heavy delalloc workloads I noticed that sometimes we'd
-just stop preemptively flushing when we had loads of delalloc available
-to flush. This is because we skip preemptive flushing if delalloc <=
-ordered. However if we start with say 4GiB of delalloc, and we flush
-2GiB of that, we'll stop flushing there, when we still have 2GiB of
-delalloc to flush.
-
-Instead adjust the ordered bytes down by half, this way if 2/3 of our
-outstanding delalloc reservations are tied up by ordered extents we
-don't bother preemptive flushing, as we're getting close to the state
-where we need to wait on ordered extents.
-
-Signed-off-by: Josef Bacik <josef@toxicpanda.com>
----
- fs/btrfs/space-info.c | 8 +++++++-
- 1 file changed, 7 insertions(+), 1 deletion(-)
-
-diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
-index 4e3857474..cf09b23f3 100644
---- a/fs/btrfs/space-info.c
-+++ b/fs/btrfs/space-info.c
-@@ -864,8 +864,14 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
- 	 * clearly be heavy enough to warrant preemptive flushing. In the case
- 	 * of heavy DIO or ordered reservations, preemptive flushing will just
- 	 * waste time and cause us to slow down.
-+	 *
-+	 * We want to make sure we truly are maxed out on ordered however, so
-+	 * cut ordered in half, and if it's still higher than delalloc then we
-+	 * can keep flushing. This is to avoid the case where we start
-+	 * flushing, and now delalloc == ordered and we stop preemptively
-+	 * flushing when we could still have several gigs of delalloc to flush.
- 	 */
--	ordered = percpu_counter_read_positive(&fs_info->ordered_bytes);
-+	ordered = percpu_counter_read_positive(&fs_info->ordered_bytes) >> 1;
- 	delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes);
- 	if (ordered >= delalloc)
- 		used += fs_info->delayed_refs_rsv.reserved +
---
-2.32.0
-
-
-From 7713c31e0c5896f22358551aff967cf9f7dfe91c Mon Sep 17 00:00:00 2001
-From: Josef Bacik <josef@toxicpanda.com>
-Date: Wed, 28 Apr 2021 13:38:48 -0400
-Subject: [PATCH 13/22] btrfs: handle preemptive delalloc flushing slightly
- differently
-
-If we decide to flush delalloc from the preemptive flusher, we really do
-not want to wait on ordered extents, as it gains us nothing. However
-there was logic to go ahead and wait on ordered extents if there were
-more ordered bytes than delalloc bytes. We do not want this behavior,
-so pass through whether this flushing is for preemption, and do not wait
-for ordered extents if that's the case. Also break out of the shrink
-loop after the first flushing, as we just want to one-shot shrink
-delalloc.
-
-Signed-off-by: Josef Bacik <josef@toxicpanda.com>
----
- fs/btrfs/space-info.c | 15 ++++++++++++---
- 1 file changed, 12 insertions(+), 3 deletions(-)
-
-diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
-index cf09b23f3..b2d834b92 100644
---- a/fs/btrfs/space-info.c
-+++ b/fs/btrfs/space-info.c
-@@ -495,7 +495,8 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
-  */
- static void shrink_delalloc(struct btrfs_fs_info *fs_info,
- 			    struct btrfs_space_info *space_info,
--			    u64 to_reclaim, bool wait_ordered)
-+			    u64 to_reclaim, bool wait_ordered,
-+			    bool for_preempt)
- {
- 	struct btrfs_trans_handle *trans;
- 	u64 delalloc_bytes;
-@@ -532,7 +533,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
- 	 * ordered extents, otherwise we'll waste time trying to flush delalloc
- 	 * that likely won't give us the space back we need.
- 	 */
--	if (ordered_bytes > delalloc_bytes)
-+	if (ordered_bytes > delalloc_bytes && !for_preempt)
- 		wait_ordered = true;
- 
- 	loops = 0;
-@@ -551,6 +552,14 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
- 			break;
- 		}
- 
-+		/*
-+		 * If we are for preemption we just want a one-shot of delalloc
-+		 * flushing so we can stop flushing if we decide we don't need
-+		 * to anymore.
-+		 */
-+		if (for_preempt)
-+			break;
-+
- 		spin_lock(&space_info->lock);
- 		if (list_empty(&space_info->tickets) &&
- 		    list_empty(&space_info->priority_tickets)) {
-@@ -702,7 +711,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
- 	case FLUSH_DELALLOC:
- 	case FLUSH_DELALLOC_WAIT:
- 		shrink_delalloc(fs_info, space_info, num_bytes,
--				state == FLUSH_DELALLOC_WAIT);
-+				state == FLUSH_DELALLOC_WAIT, for_preempt);
- 		break;
- 	case FLUSH_DELAYED_REFS_NR:
- 	case FLUSH_DELAYED_REFS:
---
-2.32.0
-
-
-From b1fa125daba80334e1efdc50e5a2e70f8585e755 Mon Sep 17 00:00:00 2001
-From: David Sterba <dsterba@suse.com>
-Date: Tue, 18 May 2021 16:49:35 +0200
-Subject: [PATCH 14/22] btrfs: scrub: per-device bandwidth control
-
-Add sysfs interface to limit io during scrub. We relied on the ionice
-interface to do that, eg. the idle class kept the system usable while
-scrub was running. This has changed when mq-deadline got widespread and
-did not implement the scheduling classes. That was a CFQ thing that got
-deleted. We've got numerous complaints from users about degraded
-performance.
-
-Currently only BFQ supports that but it's not a common scheduler and we
-can't ask everybody to switch to it.
-
-Alternatively the cgroup io limiting can be used but that is also a
-non-trivial setup (v2 required, the controller must be enabled on the
-system). This can still be used if desired.
-
-Other ideas that have been explored: piggy-back on ionice (that is set
-per-process and is accessible) and interpret the class and classdata as
-bandwidth limits, but this does not have enough flexibility as there are
-only 8 allowed and we'd have to map fixed limits to each value. Also
-adjusting the value would need to look up the process that currently runs
-scrub on the given device, and the value is not sticky so would have to
-be adjusted each time scrub runs.
- -Running out of options, sysfs does not look that bad: - -- it's accessible from scripts, or udev rules -- the name is similar to what MD-RAID has - (/proc/sys/dev/raid/speed_limit_max or /sys/block/mdX/md/sync_speed_max) -- the value is sticky at least for filesystem mount time -- adjusting the value has immediate effect -- sysfs is available in constrained environments (eg. system rescue) -- the limit also applies to device replace - -Sysfs: - -- raw value is in bytes -- values written to the file accept suffixes like K, M -- file is in the per-device directory /sys/fs/btrfs/FSID/devinfo/DEVID/scrub_speed_max -- 0 means use default priority of IO - -The scheduler is a simple deadline one and the accuracy is up to nearest -128K. - -[HH: trivial context fix in hunk #1 for 5.10.x] -Signed-off-by: David Sterba <dsterba@suse.com> ---- - fs/btrfs/scrub.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++ - fs/btrfs/sysfs.c | 28 +++++++++++++++++++++ - fs/btrfs/volumes.h | 3 +++ - 3 files changed, 92 insertions(+) - -diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c -index b9202a1f1..adc8cf404 100644 ---- a/fs/btrfs/scrub.c -+++ b/fs/btrfs/scrub.c -@@ -165,6 +165,10 @@ struct scrub_ctx { - int readonly; - int pages_per_rd_bio; - -+ /* State of IO submission throttling affecting the associated device */ -+ ktime_t throttle_deadline; -+ u64 throttle_sent; -+ - int is_dev_replace; - u64 write_pointer; - -@@ -613,6 +617,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( - spin_lock_init(&sctx->list_lock); - spin_lock_init(&sctx->stat_lock); - init_waitqueue_head(&sctx->list_wait); -+ sctx->throttle_deadline = 0; - - WARN_ON(sctx->wr_curr_bio != NULL); - mutex_init(&sctx->wr_lock); -@@ -1996,6 +2001,60 @@ static void scrub_page_put(struct scrub_page *spage) - } - } - -+/* -+ * Throttling of IO submission, bandwidth-limit based, the timeslice is 1 -+ * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max. -+ */ -+static void scrub_throttle(struct scrub_ctx *sctx) -+{ -+ const int time_slice = 1000; -+ struct scrub_bio *sbio; -+ struct btrfs_device *device; -+ s64 delta; -+ ktime_t now; -+ u32 div; -+ u64 bwlimit; -+ -+ sbio = sctx->bios[sctx->curr]; -+ device = sbio->dev; -+ bwlimit = READ_ONCE(device->scrub_speed_max); -+ if (bwlimit == 0) -+ return; -+ -+ /* -+ * Slice is divided into intervals when the IO is submitted, adjust by -+ * bwlimit and maximum of 64 intervals. -+ */ -+ div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024))); -+ div = min_t(u32, 64, div); -+ -+ /* Start new epoch, set deadline */ -+ now = ktime_get(); -+ if (sctx->throttle_deadline == 0) { -+ sctx->throttle_deadline = ktime_add_ms(now, time_slice / div); -+ sctx->throttle_sent = 0; -+ } -+ -+ /* Still in the time to send? 
*/ -+ if (ktime_before(now, sctx->throttle_deadline)) { -+ /* If current bio is within the limit, send it */ -+ sctx->throttle_sent += sbio->bio->bi_iter.bi_size; -+ if (sctx->throttle_sent <= bwlimit / div) -+ return; -+ -+ /* We're over the limit, sleep until the rest of the slice */ -+ delta = ktime_ms_delta(sctx->throttle_deadline, now); -+ } else { -+ /* New request after deadline, start new epoch */ -+ delta = 0; -+ } -+ -+ if (delta) -+ schedule_timeout_interruptible(delta * HZ / 1000); -+ /* Next call will start the deadline period */ -+ sctx->throttle_deadline = 0; -+} -+ - static void scrub_submit(struct scrub_ctx *sctx) - { - struct scrub_bio *sbio; -@@ -2003,6 +2062,8 @@ static void scrub_submit(struct scrub_ctx *sctx) - if (sctx->curr == -1) - return; - -+ scrub_throttle(sctx); -+ - sbio = sctx->bios[sctx->curr]; - sctx->curr = -1; - scrub_pending_bio_inc(sctx); -diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c -index 9372ef191..9dda3feda 100644 ---- a/fs/btrfs/sysfs.c -+++ b/fs/btrfs/sysfs.c -@@ -1469,6 +1469,33 @@ static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj, - } - BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show); - -+static ssize_t btrfs_devinfo_scrub_speed_max_show(struct kobject *kobj, -+ struct kobj_attribute *a, -+ char *buf) -+{ -+ struct btrfs_device *device = container_of(kobj, struct btrfs_device, -+ devid_kobj); -+ -+ return scnprintf(buf, PAGE_SIZE, "%llu\n", -+ READ_ONCE(device->scrub_speed_max)); -+} -+ -+static ssize_t btrfs_devinfo_scrub_speed_max_store(struct kobject *kobj, -+ struct kobj_attribute *a, -+ const char *buf, size_t len) -+{ -+ struct btrfs_device *device = container_of(kobj, struct btrfs_device, -+ devid_kobj); -+ char *endptr; -+ unsigned long long limit; -+ -+ limit = memparse(buf, &endptr); -+ WRITE_ONCE(device->scrub_speed_max, limit); -+ return len; -+} -+BTRFS_ATTR_RW(devid, scrub_speed_max, btrfs_devinfo_scrub_speed_max_show, -+ btrfs_devinfo_scrub_speed_max_store); -+ - static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj, - struct kobj_attribute *a, char *buf) - { -@@ -1486,6 +1513,7 @@ static struct attribute *devid_attrs[] = { - BTRFS_ATTR_PTR(devid, in_fs_metadata), - BTRFS_ATTR_PTR(devid, missing), - BTRFS_ATTR_PTR(devid, replace_target), -+ BTRFS_ATTR_PTR(devid, scrub_speed_max), - BTRFS_ATTR_PTR(devid, writeable), - NULL - }; -diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h -index d4c3e0dd3..be7932d9b 100644 ---- a/fs/btrfs/volumes.h -+++ b/fs/btrfs/volumes.h -@@ -143,6 +143,9 @@ struct btrfs_device { - struct completion kobj_unregister; - /* For sysfs/FSID/devinfo/devid/ */ - struct kobject devid_kobj; -+ -+ /* Bandwidth limit for scrub, in bytes */ -+ u64 scrub_speed_max; - }; - - /* --- -2.32.0 - - -From 69dcc9b1b2ab6c0f2be5fd2020651ddb63e00f9c Mon Sep 17 00:00:00 2001 -From: Josef Bacik <josef@toxicpanda.com> -Date: Wed, 19 May 2021 11:29:03 -0400 -Subject: [PATCH 15/22] btrfs: abort the transaction if we fail to replay log - trees - -During inspection of the return path for replay I noticed that we don't -actually abort the transaction if we get a failure during replay. This -isn't a problem necessarily, as we properly return the error and will -fail to mount. However we still leave this dangling transaction that -could conceivably be committed without thinking there was an error. -Handle this by making sure we abort the transaction on error to -safeguard us from any problems in the future. 
- -Signed-off-by: Josef Bacik <josef@toxicpanda.com> ---- - fs/btrfs/tree-log.c | 4 +++- - 1 file changed, 3 insertions(+), 1 deletion(-) - -diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c -index 276b5511f..f9332bb84 100644 ---- a/fs/btrfs/tree-log.c -+++ b/fs/btrfs/tree-log.c -@@ -6363,8 +6363,10 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) - - return 0; - error: -- if (wc.trans) -+ if (wc.trans) { -+ btrfs_abort_transaction(wc.trans, ret); - btrfs_end_transaction(wc.trans); -+ } - btrfs_free_path(path); - return ret; - } --- -2.32.0 - - -From df6c67b60e03836e5deabb8d8ea76cfe5f4c5886 Mon Sep 17 00:00:00 2001 -From: Josef Bacik <josef@toxicpanda.com> -Date: Wed, 19 May 2021 11:45:16 -0400 -Subject: [PATCH 16/22] btrfs: do not infinite loop in data reclaim if we - aborted - -Error injection stressing uncovered a busy loop in our data reclaim -loop. There are two cases here, one where we loop creating block groups -until space_info->full is set, or in the main loop we will skip erroring -out any tickets if space_info->full == 0. Unfortunately if we aborted -the transaction then we will never allocate chunks or reclaim any space -and thus never get ->full, and you'll see stack traces like this - -watchdog: BUG: soft lockup - CPU#0 stuck for 26s! [kworker/u4:4:139] -CPU: 0 PID: 139 Comm: kworker/u4:4 Tainted: G W 5.13.0-rc1+ #328 -Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 -Workqueue: events_unbound btrfs_async_reclaim_data_space -RIP: 0010:btrfs_join_transaction+0x12/0x20 -RSP: 0018:ffffb2b780b77de0 EFLAGS: 00000246 -RAX: ffffb2b781863d58 RBX: 0000000000000000 RCX: 0000000000000000 -RDX: 0000000000000801 RSI: ffff987952b57400 RDI: ffff987940aa3000 -RBP: ffff987954d55000 R08: 0000000000000001 R09: ffff98795539e8f0 -R10: 000000000000000f R11: 000000000000000f R12: ffffffffffffffff -R13: ffff987952b574c8 R14: ffff987952b57400 R15: 0000000000000008 -FS: 0000000000000000(0000) GS:ffff9879bbc00000(0000) knlGS:0000000000000000 -CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 -CR2: 00007f0703da4000 CR3: 0000000113398004 CR4: 0000000000370ef0 -Call Trace: - flush_space+0x4a8/0x660 - btrfs_async_reclaim_data_space+0x55/0x130 - process_one_work+0x1e9/0x380 - worker_thread+0x53/0x3e0 - ? process_one_work+0x380/0x380 - kthread+0x118/0x140 - ? __kthread_bind_mask+0x60/0x60 - ret_from_fork+0x1f/0x30 - -Fix this by checking to see if we have BTRFS_FS_STATE_TRANS_ABORTED in -either of the reclaim loops, and if so fail the tickets and bail. In -addition to this, fix maybe_fail_all_tickets() to not try to grant -tickets if we've aborted, simply fail everything. 
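
To make the failure mode concrete, here is a toy model of that busy
loop. It is illustrative only, every identifier in it is invented, and
the real fix is the BTRFS_FS_STATE_TRANS_ABORTED checks in the hunks
below:

  /* reclaim_demo.c - once the aborted flag latches, no flush can set
   * ->full or drain the tickets, so a loop gated only on progress
   * spins forever; the fixed loop fails the waiters and bails. */
  #include <stdbool.h>
  #include <stdio.h>

  #define EIO 5

  struct space_info_model {
      int tickets;        /* reservations still waiting for space */
      bool full;          /* set once chunk allocation has given up */
      bool trans_aborted; /* latched when the transaction aborts */
  };

  static void flush_space_model(struct space_info_model *s)
  {
      /* After an abort nothing makes progress: no chunks are
       * allocated and no space is reclaimed, so neither ->full nor
       * the ticket count ever changes. */
      if (s->trans_aborted)
          return;
      s->tickets = 0; /* pretend flushing satisfied the waiters */
      s->full = true;
  }

  static int reclaim(struct space_info_model *s)
  {
      while (s->tickets > 0) {
          /* The fix: fail everything and bail on abort instead of
           * waiting for ->full, which can no longer be set. */
          if (s->trans_aborted) {
              s->tickets = 0; /* tickets are woken with -EIO */
              return -EIO;
          }
          flush_space_model(s);
          if (s->full)
              break; /* the genuine ENOSPC path */
      }
      return 0;
  }

  int main(void)
  {
      struct space_info_model s = { .tickets = 3, .trans_aborted = true };

      printf("reclaim() = %d\n", reclaim(&s)); /* -5, not a soft lockup */
      return 0;
  }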
- -Signed-off-by: Josef Bacik <josef@toxicpanda.com> ---- - fs/btrfs/space-info.c | 35 ++++++++++++++++++++++++++++++----- - 1 file changed, 30 insertions(+), 5 deletions(-) - -diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c -index b2d834b92..208f47e60 100644 ---- a/fs/btrfs/space-info.c -+++ b/fs/btrfs/space-info.c -@@ -941,6 +941,8 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, - struct reserve_ticket *ticket; - u64 tickets_id = space_info->tickets_id; - u64 first_ticket_bytes = 0; -+ bool aborted = test_bit(BTRFS_FS_STATE_TRANS_ABORTED, -+ &fs_info->fs_state); - - if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { - btrfs_info(fs_info, "cannot satisfy tickets, dumping space info"); -@@ -952,7 +954,7 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, - ticket = list_first_entry(&space_info->tickets, - struct reserve_ticket, list); - -- if (ticket->steal && -+ if (!aborted && ticket->steal && - steal_from_global_rsv(fs_info, space_info, ticket)) - return true; - -@@ -968,15 +970,18 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, - */ - if (first_ticket_bytes == 0) - first_ticket_bytes = ticket->bytes; -- else if (first_ticket_bytes > ticket->bytes) -+ else if (!aborted && first_ticket_bytes > ticket->bytes) - return true; - -- if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) -+ if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) - btrfs_info(fs_info, "failing ticket with %llu bytes", - ticket->bytes); - - remove_ticket(space_info, ticket); -- ticket->error = -ENOSPC; -+ if (aborted) -+ ticket->error = -EIO; -+ else -+ ticket->error = -ENOSPC; - wake_up(&ticket->wait); - - /* -@@ -985,7 +990,8 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, - * here to see if we can make progress with the next ticket in - * the list. - */ -- btrfs_try_granting_tickets(fs_info, space_info); -+ if (!aborted) -+ btrfs_try_granting_tickets(fs_info, space_info); - } - return (tickets_id != space_info->tickets_id); - } -@@ -1253,6 +1259,15 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work) - spin_unlock(&space_info->lock); - return; - } -+ -+ /* Something happened, fail everything and bail. */ -+ if (test_bit(BTRFS_FS_STATE_TRANS_ABORTED, -+ &fs_info->fs_state)) { -+ maybe_fail_all_tickets(fs_info, space_info); -+ space_info->flush = 0; -+ spin_unlock(&space_info->lock); -+ return; -+ } - last_tickets_id = space_info->tickets_id; - spin_unlock(&space_info->lock); - } -@@ -1283,6 +1298,16 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work) - } else { - flush_state = 0; - } -+ -+ /* Something happened, fail everything and bail. */ -+ if (test_bit(BTRFS_FS_STATE_TRANS_ABORTED, -+ &fs_info->fs_state)) { -+ maybe_fail_all_tickets(fs_info, space_info); -+ space_info->flush = 0; -+ spin_unlock(&space_info->lock); -+ return; -+ } -+ - } - spin_unlock(&space_info->lock); - } --- -2.32.0 - - -From c7c8879b24c228034f942521af8011a600b273ed Mon Sep 17 00:00:00 2001 -From: Josef Bacik <josef@toxicpanda.com> -Date: Fri, 12 Mar 2021 15:25:05 -0500 -Subject: [PATCH 17/22] btrfs: handle btrfs_record_root_in_trans failure in - btrfs_recover_log_trees - -btrfs_record_root_in_trans will return errors in the future, so handle -the error properly in btrfs_recover_log_trees. - -This appears tricky, however we have a reference count on the -destination root, so if this fails we need to continue on in the loop to -make sure the proper cleanup is done. 
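
The error handling pattern described above, reduced to a hedged
standalone sketch (all names are invented; in the real function the
per-iteration cleanup drops the root reference that was taken earlier
in the loop body):

  /* replay_refs_demo.c - note the first failure but still run each
   * iteration's cleanup, because every iteration holds a reference. */
  #include <stdio.h>

  struct root { int refs; };

  static int record_root(struct root *r, int should_fail)
  {
      (void)r;
      return should_fail ? -5 : 0; /* -EIO stand-in */
  }

  static int replay_all(struct root *roots, int n, int fail_at)
  {
      int ret = 0;

      for (int i = 0; i < n; i++) {
          struct root *r = &roots[i];

          r->refs++; /* reference taken for this iteration */
          int err = record_root(r, i == fail_at);
          if (err)
              ret = err; /* remember the failure instead of jumping
                          * straight out of the loop... */
          r->refs--; /* ...so this cleanup runs on both paths */
          if (ret)
              break; /* stop replaying with balanced references */
      }
      return ret;
  }

  int main(void)
  {
      struct root roots[3] = { {0}, {0}, {0} };
      int ret = replay_all(roots, 3, 1);

      /* prints ret=-5 refs=0,0,0: the error is reported and no
       * reference is leaked */
      printf("ret=%d refs=%d,%d,%d\n", ret,
             roots[0].refs, roots[1].refs, roots[2].refs);
      return 0;
  }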
- -Reviewed-by: Qu Wenruo <wqu@suse.com> -Signed-off-by: Josef Bacik <josef@toxicpanda.com> -Reviewed-by: David Sterba <dsterba@suse.com> -[ add comment ] -Signed-off-by: David Sterba <dsterba@suse.com> ---- - fs/btrfs/tree-log.c | 9 +++++++-- - 1 file changed, 7 insertions(+), 2 deletions(-) - -diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c -index f9332bb84..1e3cfc935 100644 ---- a/fs/btrfs/tree-log.c -+++ b/fs/btrfs/tree-log.c -@@ -6300,8 +6300,13 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) - } - - wc.replay_dest->log_root = log; -- btrfs_record_root_in_trans(trans, wc.replay_dest); -- ret = walk_log_tree(trans, log, &wc); -+ ret = btrfs_record_root_in_trans(trans, wc.replay_dest); -+ if (ret) -+ /* The loop needs to continue due to the root refs */ -+ btrfs_handle_fs_error(fs_info, ret, -+ "failed to record the log root in transaction"); -+ else -+ ret = walk_log_tree(trans, log, &wc); - - if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { - ret = fixup_inode_link_counts(trans, wc.replay_dest, --- -2.32.0 - - -From c2a7ee7bf274fa4b307dc78e36ac32fde7ad9e91 Mon Sep 17 00:00:00 2001 -From: Josef Bacik <josef () toxicpanda ! com> -Date: Thu, 20 May 2021 14:46:01 +0000 -Subject: [PATCH 18/22] btrfs: change handle_fs_error in recover_log_trees to - aborts - -During inspection of the return path for replay I noticed that we don't -actually abort the transaction if we get a failure during replay. This -isn't a problem necessarily, as we properly return the error and will -fail to mount. However we still leave this dangling transaction that -could conceivably be committed without thinking there was an error. -We were using btrfs_handle_fs_error() here, but that pre-dates the -transaction abort code. Simply replace the btrfs_handle_fs_error() -calls with transaction aborts, so we still know where exactly things -went wrong, and add a few in some other un-handled error cases. 
-
-Signed-off-by: Josef Bacik <josef@toxicpanda.com>
----
- fs/btrfs/tree-log.c | 16 ++++++++--------
- 1 file changed, 8 insertions(+), 8 deletions(-)
-
-diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
-index 1e3cfc935..876445337 100644
---- a/fs/btrfs/tree-log.c
-+++ b/fs/btrfs/tree-log.c
-@@ -6247,8 +6247,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
- ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
-
- if (ret < 0) {
-- btrfs_handle_fs_error(fs_info, ret,
-- "Couldn't find tree log root.");
-+ btrfs_abort_transaction(trans, ret);
- goto error;
- }
- if (ret > 0) {
-@@ -6265,8 +6264,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
- log = btrfs_read_tree_root(log_root_tree, &found_key);
- if (IS_ERR(log)) {
- ret = PTR_ERR(log);
-- btrfs_handle_fs_error(fs_info, ret,
-- "Couldn't read tree log root.");
-+ btrfs_abort_transaction(trans, ret);
- goto error;
- }
-
-@@ -6294,8 +6292,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
-
- if (!ret)
- goto next;
-- btrfs_handle_fs_error(fs_info, ret,
-- "Couldn't read target root for tree log recovery.");
-+ btrfs_abort_transaction(trans, ret);
- goto error;
- }
-
-@@ -6303,14 +6300,15 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
- ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
- if (ret)
- /* The loop needs to continue due to the root refs */
-- btrfs_handle_fs_error(fs_info, ret,
-- "failed to record the log root in transaction");
-+ btrfs_abort_transaction(trans, ret);
- else
- ret = walk_log_tree(trans, log, &wc);
-
- if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
- ret = fixup_inode_link_counts(trans, wc.replay_dest,
- path);
-+ if (ret)
-+ btrfs_abort_transaction(trans, ret);
- }
-
- if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
-@@ -6327,6 +6325,8 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
- * could only happen during mount.
- */
- ret = btrfs_init_root_free_objectid(root);
-+ if (ret)
-+ btrfs_abort_transaction(trans, ret);
- }
-
- wc.replay_dest->log_root = NULL;
---
-2.32.0
-
-
-From 03c9f21055825cd463b214cf8341e9c4907525f0 Mon Sep 17 00:00:00 2001
-From: Filipe Manana <fdmanana@suse.com>
-Date: Fri, 28 May 2021 11:37:32 +0100
-Subject: [PATCH 19/22] btrfs: avoid unnecessary logging of xattrs during fast
- fsyncs
-
-When logging an inode we always log all its xattrs, so that we are able
-to figure out which ones should be deleted during log replay. However,
-this is unnecessary when we are doing a fast fsync and no xattrs were
-added, changed or deleted since the last time we logged the inode in the
-current transaction.
-
-So skip the logging of xattrs when the inode was previously logged in the
-current transaction and no xattrs were added, changed or deleted. If any
-changes to xattrs happened, then the inode has BTRFS_INODE_COPY_EVERYTHING
-set in its runtime flags and the xattrs get logged. This saves time on
-scanning for xattrs, allocating memory, COWing log tree extent buffers and
-adding more lock contention on the extent buffers when there are multiple
-tasks logging in parallel.
-
-The use of xattrs is common when using ACLs, in some applications, or
-when using security modules like SELinux, where every inode gets a
-security xattr added to it.
-
-The following test script, using fio, was used on a box with 12 cores,
-64G of RAM, an NVMe device and the default non-debug kernel config from
-Debian.
-It uses 8 concurrent jobs each writing in blocks of 64K to its own 4G file, -each file with a single xattr of 50 bytes (about the same size for an ACL -or SELinux xattr), doing random buffered writes with an fsync after each -write. - - $ cat test.sh - #!/bin/bash - - DEV=/dev/nvme0n1 - MNT=/mnt/test - MOUNT_OPTIONS="-o ssd" - MKFS_OPTIONS="-d single -m single" - - NUM_JOBS=8 - FILE_SIZE=4G - - cat <<EOF > /tmp/fio-job.ini - [writers] - rw=randwrite - fsync=1 - fallocate=none - group_reporting=1 - direct=0 - bs=64K - ioengine=sync - size=$FILE_SIZE - directory=$MNT - numjobs=$NUM_JOBS - EOF - - echo "performance" | \ - tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor - - mkfs.btrfs -f $MKFS_OPTIONS $DEV > /dev/null - mount $MOUNT_OPTIONS $DEV $MNT - - echo "Creating files before fio runs, each with 1 xattr of 50 bytes" - for ((i = 0; i < $NUM_JOBS; i++)); do - path="$MNT/writers.$i.0" - truncate -s $FILE_SIZE $path - setfattr -n user.xa1 -v $(printf '%0.sX' $(seq 50)) $path - done - - fio /tmp/fio-job.ini - umount $MNT - -fio output before this change: - -WRITE: bw=120MiB/s (126MB/s), 120MiB/s-120MiB/s (126MB/s-126MB/s), io=32.0GiB (34.4GB), run=272145-272145msec - -fio output after this change: - -WRITE: bw=142MiB/s (149MB/s), 142MiB/s-142MiB/s (149MB/s-149MB/s), io=32.0GiB (34.4GB), run=230408-230408msec - -+16.8% throughput, -16.6% runtime - -Signed-off-by: Filipe Manana <fdmanana@suse.com> ---- - fs/btrfs/tree-log.c | 16 +++++++++++++--- - 1 file changed, 13 insertions(+), 3 deletions(-) - -diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c -index 876445337..fb4704ed9 100644 ---- a/fs/btrfs/tree-log.c -+++ b/fs/btrfs/tree-log.c -@@ -5465,13 +5465,23 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, - btrfs_release_path(dst_path); - if (need_log_inode_item) { - err = log_inode_item(trans, log, dst_path, inode); -- if (!err && !xattrs_logged) { -+ if (err) -+ goto out_unlock; -+ /* -+ * If we are doing a fast fsync and the inode was logged before -+ * in this transaction, we don't need to log the xattrs because -+ * they were logged before. If xattrs were added, changed or -+ * deleted since the last time we logged the inode, then we have -+ * already logged them because the inode had the runtime flag -+ * BTRFS_INODE_COPY_EVERYTHING set. -+ */ -+ if (!xattrs_logged && inode->logged_trans < trans->transid) { - err = btrfs_log_all_xattrs(trans, root, inode, path, - dst_path); -+ if (err) -+ goto out_unlock; - btrfs_release_path(path); - } -- if (err) -- goto out_unlock; - } - if (fast_search) { - ret = btrfs_log_changed_extents(trans, root, inode, dst_path, --- -2.32.0 - - -From 5615df097f5c4d07edf14bb40292f26043b405f5 Mon Sep 17 00:00:00 2001 -From: Josef Bacik <josef@toxicpanda.com> -Date: Tue, 1 Jun 2021 15:45:08 -0400 -Subject: [PATCH 20/22] btrfs: handle shrink_delalloc pages calculation - differently - -We have been hitting some early ENOSPC issues in production with more -recent kernels, and I tracked it down to us simply not flushing delalloc -as aggressively as we should be. With tracing I was seeing us failing -all tickets with all of the block rsvs at or around 0, with very little -pinned space, but still around 120mib of outstanding bytes_may_used. -Upon further investigation I saw that we were flushing around 14 pages -per shrink call for delalloc, despite having around 2gib of delalloc -outstanding. - -Consider the example of a 8 way machine, all cpu's trying to create a -file in parallel, which at the time of this commit requires 5 items to -do. 
Assuming a 16k leaf size, we have 10mib of total metadata reclaim -size waiting on reservations. Now assume we have 128mib of delalloc -outstanding. With our current math we would set items to 20, and then -set to_reclaim to 20 * 256k, or 5mib. - -Assuming that we went through this loop all 3 times, for both -FLUSH_DELALLOC and FLUSH_DELALLOC_WAIT, and then did the full loop -twice, we'd only flush 60mib of the 128mib delalloc space. This could -leave a fair bit of delalloc reservations still hanging around by the -time we go to ENOSPC out all the remaining tickets. - -Fix this two ways. First, change the calculations to be a fraction of -the total delalloc bytes on the system. Prior to my change we were -calculating based on dirty inodes so our math made more sense, now it's -just completely unrelated to what we're actually doing. - -Second add a FLUSH_DELALLOC_FULL state, that we hold off until we've -gone through the flush states at least once. This will empty the system -of all delalloc so we're sure to be truly out of space when we start -failing tickets. - -I'm tagging stable 5.10 and forward, because this is where we started -using the page stuff heavily again. This affects earlier kernel -versions as well, but would be a pain to backport to them as the -flushing mechanisms aren't the same. - -CC: stable@vger.kernel.org # 5.10 -Signed-off-by: Josef Bacik <josef@toxicpanda.com> ---- - fs/btrfs/ctree.h | 11 ++++++----- - fs/btrfs/space-info.c | 36 +++++++++++++++++++++++++++--------- - include/trace/events/btrfs.h | 1 + - 3 files changed, 34 insertions(+), 14 deletions(-) - -diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h -index 12921830e..75ba87dcc 100644 ---- a/fs/btrfs/ctree.h -+++ b/fs/btrfs/ctree.h -@@ -2746,11 +2746,12 @@ enum btrfs_flush_state { - FLUSH_DELAYED_REFS = 4, - FLUSH_DELALLOC = 5, - FLUSH_DELALLOC_WAIT = 6, -- ALLOC_CHUNK = 7, -- ALLOC_CHUNK_FORCE = 8, -- RUN_DELAYED_IPUTS = 9, -- COMMIT_TRANS = 10, -- FORCE_COMMIT_TRANS = 11, -+ FLUSH_DELALLOC_FULL = 7, -+ ALLOC_CHUNK = 8, -+ ALLOC_CHUNK_FORCE = 9, -+ RUN_DELAYED_IPUTS = 10, -+ COMMIT_TRANS = 11, -+ FORCE_COMMIT_TRANS = 12, - }; - - int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, -diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c -index 208f47e60..88bac64d5 100644 ---- a/fs/btrfs/space-info.c -+++ b/fs/btrfs/space-info.c -@@ -505,6 +505,10 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, - long time_left; - int loops; - -+ delalloc_bytes = percpu_counter_sum_positive( -+ &fs_info->delalloc_bytes); -+ ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes); -+ - /* Calc the number of the pages we need flush for space reservation */ - if (to_reclaim == U64_MAX) { - items = U64_MAX; -@@ -512,19 +516,21 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, - /* - * to_reclaim is set to however much metadata we need to - * reclaim, but reclaiming that much data doesn't really track -- * exactly, so increase the amount to reclaim by 2x in order to -- * make sure we're flushing enough delalloc to hopefully reclaim -- * some metadata reservations. -+ * exactly. What we really want to do is reclaim full inode's -+ * worth of reservations, however that's not available to us -+ * here. We will take a fraction of the delalloc bytes for our -+ * flushing loops and hope for the best. Delalloc will expand -+ * the amount we write to cover an entire dirty extent, which -+ * will reclaim the metadata reservation for that range. 
If -+ * it's not enough subsequent flush stages will be more -+ * aggressive. - */ -+ to_reclaim = max(to_reclaim, delalloc_bytes >> 3); - items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2; -- to_reclaim = items * EXTENT_SIZE_PER_ITEM; - } - - trans = (struct btrfs_trans_handle *)current->journal_info; - -- delalloc_bytes = percpu_counter_sum_positive( -- &fs_info->delalloc_bytes); -- ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes); - if (delalloc_bytes == 0 && ordered_bytes == 0) - return; - -@@ -710,8 +716,11 @@ static void flush_space(struct btrfs_fs_info *fs_info, - break; - case FLUSH_DELALLOC: - case FLUSH_DELALLOC_WAIT: -+ case FLUSH_DELALLOC_FULL: -+ if (state == FLUSH_DELALLOC_FULL) -+ num_bytes = U64_MAX; - shrink_delalloc(fs_info, space_info, num_bytes, -- state == FLUSH_DELALLOC_WAIT, for_preempt); -+ state != FLUSH_DELALLOC, for_preempt); - break; - case FLUSH_DELAYED_REFS_NR: - case FLUSH_DELAYED_REFS: -@@ -1043,6 +1052,14 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) - commit_cycles--; - } - -+ /* -+ * We do not want to empty the system of delalloc unless we're -+ * under heavy pressure, so allow one trip through the flushing -+ * logic before we start doing a FLUSH_DELALLOC_FULL. -+ */ -+ if (flush_state == FLUSH_DELALLOC_FULL && !commit_cycles) -+ flush_state++; -+ - /* - * We don't want to force a chunk allocation until we've tried - * pretty hard to reclaim space. Think of the case where we -@@ -1225,7 +1242,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) - * so if we now have space to allocate do the force chunk allocation. - */ - static const enum btrfs_flush_state data_flush_states[] = { -- FLUSH_DELALLOC_WAIT, -+ FLUSH_DELALLOC_FULL, - RUN_DELAYED_IPUTS, - FLUSH_DELAYED_REFS, - COMMIT_TRANS, -@@ -1334,6 +1351,7 @@ static const enum btrfs_flush_state evict_flush_states[] = { - FLUSH_DELAYED_REFS, - FLUSH_DELALLOC, - FLUSH_DELALLOC_WAIT, -+ FLUSH_DELALLOC_FULL, - ALLOC_CHUNK, - COMMIT_TRANS, - }; -diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h -index 0551ea653..7cda6c3d7 100644 ---- a/include/trace/events/btrfs.h -+++ b/include/trace/events/btrfs.h -@@ -94,6 +94,7 @@ struct btrfs_space_info; - EM( FLUSH_DELAYED_ITEMS, "FLUSH_DELAYED_ITEMS") \ - EM( FLUSH_DELALLOC, "FLUSH_DELALLOC") \ - EM( FLUSH_DELALLOC_WAIT, "FLUSH_DELALLOC_WAIT") \ -+ EM( FLUSH_DELALLOC_FULL, "FLUSH_DELALLOC_FULL") \ - EM( FLUSH_DELAYED_REFS_NR, "FLUSH_DELAYED_REFS_NR") \ - EM( FLUSH_DELAYED_REFS, "FLUSH_ELAYED_REFS") \ - EM( ALLOC_CHUNK, "ALLOC_CHUNK") \ --- -2.32.0 - - -From ec26628891bae6cd63bcd934dc3b13157cbc1024 Mon Sep 17 00:00:00 2001 -From: Filipe Manana <fdmanana@suse.com> -Date: Wed, 9 Jun 2021 11:25:03 +0100 -Subject: [PATCH 21/22] btrfs: send: fix invalid path for unlink operations - after parent orphanization - -During an incremental send operation, when processing the new references -for the current inode, we might send an unlink operation for another inode -that has a conflicting path and has more than one hard link. However this -path was computed and cached before we processed previous new references -for the current inode. We may have orphanized a directory of that path -while processing a previous new reference, in which case the path will -be invalid and cause the receiver process to fail. 
-
-The following reproducer triggers the problem and explains how/why it
-happens in its comments:
-
- $ cat test-send-unlink.sh
- #!/bin/bash
-
- DEV=/dev/sdi
- MNT=/mnt/sdi
-
- mkfs.btrfs -f $DEV >/dev/null
- mount $DEV $MNT
-
- # Create our test files and directory. Inode 259 (file3) has two hard
- # links.
- touch $MNT/file1
- touch $MNT/file2
- touch $MNT/file3
-
- mkdir $MNT/A
- ln $MNT/file3 $MNT/A/hard_link
-
- # Filesystem looks like:
- #
- # .                        (ino 256)
- # |----- file1             (ino 257)
- # |----- file2             (ino 258)
- # |----- file3             (ino 259)
- # |----- A/                (ino 260)
- #        |---- hard_link   (ino 259)
- #
-
- # Now create the base snapshot, which is going to be the parent snapshot
- # for a later incremental send.
- btrfs subvolume snapshot -r $MNT $MNT/snap1
- btrfs send -f /tmp/snap1.send $MNT/snap1
-
- # Move inode 257 into directory inode 260. This results in computing the
- # path for inode 260 as "/A" and caching it.
- mv $MNT/file1 $MNT/A/file1
-
- # Move inode 258 (file2) into directory inode 260, with a name of
- # "hard_link", first moving inode 259 away since it currently has that
- # location and name.
- mv $MNT/A/hard_link $MNT/tmp
- mv $MNT/file2 $MNT/A/hard_link
-
- # Now rename inode 260 to something else (B for example) and then create
- # a hard link for inode 258 that has the old name and location of inode
- # 260 ("/A").
- mv $MNT/A $MNT/B
- ln $MNT/B/hard_link $MNT/A
-
- # Filesystem now looks like:
- #
- # .                        (ino 256)
- # |----- tmp               (ino 259)
- # |----- file3             (ino 259)
- # |----- B/                (ino 260)
- # |      |---- file1       (ino 257)
- # |      |---- hard_link   (ino 258)
- # |
- # |----- A                 (ino 258)
-
- # Create another snapshot of our subvolume and use it for an incremental
- # send.
- btrfs subvolume snapshot -r $MNT $MNT/snap2
- btrfs send -f /tmp/snap2.send -p $MNT/snap1 $MNT/snap2
-
- # Now unmount the filesystem, create a new one, mount it and try to
- # apply both send streams to recreate both snapshots.
- umount $DEV
-
- mkfs.btrfs -f $DEV >/dev/null
-
- mount $DEV $MNT
-
- # First add the first snapshot to the new filesystem by applying the
- # first send stream.
- btrfs receive -f /tmp/snap1.send $MNT
-
- # The incremental receive operation below used to fail with the
- # following error:
- #
- # ERROR: unlink A/hard_link failed: No such file or directory
- #
- # This is because when send is processing inode 257, it generates the
- # path for inode 260 as "/A", since that inode is its parent in the send
- # snapshot, and caches that path.
- #
- # Later, when processing inode 258, it first processes its new reference
- # that has the path of "/A", which results in orphanizing inode 260
- # because there is a path collision. This results in issuing a rename
- # operation from "/A" to "/o260-6-0".
- #
- # Finally, when processing the new reference "B/hard_link" for inode 258,
- # it notices that it collides with inode 259 (not yet processed, because
- # it has a higher inode number), since that inode has the name
- # "hard_link" under the directory inode 260. It also checks that inode
- # 259 has two hard links, so it decides to issue an unlink operation for
- # the name "hard_link" for inode 259. However, the path passed to the
- # unlink operation is "/A/hard_link", which is incorrect since currently
- # "/A" does not exist, due to the orphanization of inode 260 mentioned
- # before. The path is incorrect because it was computed and cached
- # before the orphanization. This results in the receiver failing with
- # the above error.
- btrfs receive -f /tmp/snap2.send $MNT - - umount $MNT - -When running the test, it fails like this: - - $ ./test-send-unlink.sh - Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap1' - At subvol /mnt/sdi/snap1 - Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap2' - At subvol /mnt/sdi/snap2 - At subvol snap1 - At snapshot snap2 - ERROR: unlink A/hard_link failed: No such file or directory - -Fix this by recomputing a path before issuing an unlink operation when -processing the new references for the current inode if we previously -have orphanized a directory. - -A test case for fstests will follow soon. - -Signed-off-by: Filipe Manana <fdmanana@suse.com> ---- - fs/btrfs/send.c | 11 +++++++++++ - 1 file changed, 11 insertions(+) - -diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c -index ed1310e38..f61ababf8 100644 ---- a/fs/btrfs/send.c -+++ b/fs/btrfs/send.c -@@ -4064,6 +4064,17 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) - if (ret < 0) - goto out; - } else { -+ /* -+ * If we previously orphanized a directory that -+ * collided with a new reference that we already -+ * processed, recompute the current path because -+ * that directory may be part of the path. -+ */ -+ if (orphanized_dir) { -+ ret = refresh_ref_path(sctx, cur); -+ if (ret < 0) -+ goto out; -+ } - ret = send_unlink(sctx, cur->full_path); - if (ret < 0) - goto out; --- -2.32.0 - - -From 3a07c030c466316a5f74cb9f320d7b9df985ec1c Mon Sep 17 00:00:00 2001 -From: David Sterba <dsterba () suse ! com> -Date: Fri, 11 Jun 2021 13:36:22 +0000 -Subject: [PATCH 22/22] btrfs: sysfs: export dev stats in devinfo directory - -The device stats can be read by ioctl, wrapped by command 'btrfs device -stats'. Provide another source where to read the information in -/sys/fs/btrfs/FSID/devinfo/DEVID/stats . The format is a list of -'key value' pairs one per line, which is common in other stat files. -The names are the same as used in other device stat outputs. - -The stats are all in one file as it's the snapshot of all available -stats. The 'one value per file' is not very suitable here. The stats -should be valid right after the stats item is read from disk, shortly -after initializing the device. - -In case the stats are not yet valid, print just 'invalid' as the file -contents. - -Signed-off-by: David Sterba <dsterba@suse.com> ---- - fs/btrfs/sysfs.c | 29 +++++++++++++++++++++++++++++ - 1 file changed, 29 insertions(+) - -diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c -index 9dda3feda..5c50fd77f 100644 ---- a/fs/btrfs/sysfs.c -+++ b/fs/btrfs/sysfs.c -@@ -1509,7 +1509,36 @@ static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj, - } - BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show); - -+static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj, -+ struct kobj_attribute *a, char *buf) -+{ -+ struct btrfs_device *device = container_of(kobj, struct btrfs_device, -+ devid_kobj); -+ -+ if (!device->dev_stats_valid) -+ return scnprintf(buf, PAGE_SIZE, "invalid\n"); -+ -+ /* -+ * Print all at once so we get a snapshot of all values from the same -+ * time. Keep them in sync and in order of definition of -+ * btrfs_dev_stat_values. 
-+ */ -+ return scnprintf(buf, PAGE_SIZE, -+ "write_errs %d\n" -+ "read_errs %d\n" -+ "flush_errs %d\n" -+ "corruption_errs %d\n" -+ "generation_errs %d\n", -+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_WRITE_ERRS), -+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_READ_ERRS), -+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_FLUSH_ERRS), -+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_CORRUPTION_ERRS), -+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_GENERATION_ERRS)); -+} -+BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show); -+ - static struct attribute *devid_attrs[] = { -+ BTRFS_ATTR_PTR(devid, error_stats), - BTRFS_ATTR_PTR(devid, in_fs_metadata), - BTRFS_ATTR_PTR(devid, missing), - BTRFS_ATTR_PTR(devid, replace_target), --- -2.32.0 - @@ -74,6 +74,7 @@ _major=5.12 _ckpatchversion=1 _ckpatch="patch-${_major}-ck${_ckpatchversion}" _gcc_more_v=20210610 +_patches_url="https://gitlab.com/sirlucjan/kernel-patches/-/raw/master/${_major}" arch=(x86_64) url="https://wiki.archlinux.org/index.php/Linux-ck" license=(GPL2) @@ -93,9 +94,9 @@ source=( 0005-x86-setup-always-reserve-the-first-1M-of-RAM.patch 0006-x86-setup-remove-CONFIG_X86_RESERVE_LOW-and-reservel.patch 0007-x86-crash-remove-crash_reserve_low_1M.patch - 0008-UKSM.patch - 0009-bbr2.patch - 0010-btrfs.patch + "0008-UKSM.patch::${_patches_url}/uksm-patches/0001-UKSM-for-5.12.patch" + "0009-bbr2.patch::${_patches_url}/bbr2-patches-v2/0001-bbr2-5.12-introduce-BBRv2.patch" + "0010-btrfs.patch::${_patches_url}/btrfs-patches-v13/0001-btrfs-patches.patch" "0011-block.patch::${_patches_url}/block-patches-v6/0001-block-patches.patch" "0012-bfq.patch::${_patches_url}/bfq-patches-v15/0001-bfq-patches.patch" ) |
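
Closing out the btrfs series above, here is a hedged consumer for the
per-device error_stats file added in patch 22. The key/value-per-line
format and the literal "invalid" sentinel come from that patch; the
program itself is illustrative and the default path below is a
placeholder (substitute a real filesystem UUID and devid):

  /* read_error_stats.c - parse /sys/fs/btrfs/<FSID>/devinfo/<DEVID>/error_stats */
  #include <stdio.h>
  #include <string.h>

  int main(int argc, char **argv)
  {
      const char *path = argc > 1 ? argv[1]
          : "/sys/fs/btrfs/<FSID>/devinfo/1/error_stats"; /* placeholder */
      char key[64];
      long long value;
      FILE *f = fopen(path, "r");

      if (!f) {
          perror(path);
          return 1;
      }
      /* Until the stats item has been read from disk the file prints
       * just "invalid", which a consumer has to tolerate. */
      if (fscanf(f, "%63s", key) == 1 && strcmp(key, "invalid") == 0) {
          puts("stats not yet valid");
          fclose(f);
          return 0;
      }
      rewind(f);
      /* write_errs, read_errs, flush_errs, corruption_errs,
       * generation_errs: one "key value" pair per line */
      while (fscanf(f, "%63s %lld", key, &value) == 2)
          printf("%-18s %lld\n", key, value);
      fclose(f);
      return 0;
  }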