author     antman666    2021-07-06 17:15:37 +0800
committer  antman666    2021-07-06 17:15:37 +0800
commit     8782ea33567efa1b67c9fd54ca51bdefe596f89a (patch)
tree       63d19e5daab3fe80230ea602f38a86f2dcaaf677
parent     7f8e8c158ef86353e2613129a89f5681f6093f34 (diff)
download   aur-8782ea33567efa1b67c9fd54ca51bdefe596f89a.tar.gz
remove useless patches
-rw-r--r--  .SRCINFO          |   10
-rw-r--r--  0008-UKSM.patch   | 6970
-rw-r--r--  0009-bbr2.patch   | 3347
-rw-r--r--  0010-btrfs.patch  | 2157
-rw-r--r--  PKGBUILD          |    7
5 files changed, 11 insertions, 12480 deletions
diff --git a/.SRCINFO b/.SRCINFO
index 7dd6b27c69ea..b706f61897ab 100644
--- a/.SRCINFO
+++ b/.SRCINFO
@@ -24,9 +24,11 @@ pkgbase = linux-ck-uksm
source = 0005-x86-setup-always-reserve-the-first-1M-of-RAM.patch
source = 0006-x86-setup-remove-CONFIG_X86_RESERVE_LOW-and-reservel.patch
source = 0007-x86-crash-remove-crash_reserve_low_1M.patch
- source = 0008-UKSM.patch
- source = 0009-bbr2.patch
- source = 0010-btrfs.patch
+ source = 0008-UKSM.patch::https://gitlab.com/sirlucjan/kernel-patches/-/raw/master/5.12/uksm-patches/0001-UKSM-for-5.12.patch
+ source = 0009-bbr2.patch::https://gitlab.com/sirlucjan/kernel-patches/-/raw/master/5.12/bbr2-patches-v2/0001-bbr2-5.12-introduce-BBRv2.patch
+ source = 0010-btrfs.patch::https://gitlab.com/sirlucjan/kernel-patches/-/raw/master/5.12/btrfs-patches-v13/0001-btrfs-patches.patch
+ source = 0011-block.patch::https://gitlab.com/sirlucjan/kernel-patches/-/raw/master/5.12/block-patches-v6/0001-block-patches.patch
+ source = 0012-bfq.patch::https://gitlab.com/sirlucjan/kernel-patches/-/raw/master/5.12/bfq-patches-v15/0001-bfq-patches.patch
validpgpkeys = ABAF11C65A2970B130ABE3C479BE3E4300411886
validpgpkeys = 647F28654894E3BD457199BE38DBBDC86092693E
b2sums = 3bc213b432d61c358f85b932dec8bd44a1ef73442f20424ad5ce374b6982a6909c5b318d5e9848996989d5e421ab6c2128cdb51a3724adc95222f96a859486a1
@@ -44,6 +46,8 @@ pkgbase = linux-ck-uksm
b2sums = 14f45171afc3b15488b40a05e58b352c5057da3a5782e13527392f7750d8e45a8db54f9b50b218fedb8bf679de3b4e5d78e230a44f7b1aa482f7b3aa831bd641
b2sums = 0c5f2e21e27aee6c8d8eaa07daa111ff2687756413f8a909cf03acc8f836367c6b27050966f9b7bf1521ad11b84fe94fb42d70c33693c80a674ef223cf2cfc00
b2sums = 705a8f2037eef3afdd0f2a7648cc8d00bfc03112385b44a8907182812b6aed075519a9236909c0e3ba09df887381dd76cb01c601e0df05119136f7318587a416
+ b2sums = 67067d624711d663c1be1d35c5e59cb588faba1769b27443a3a13b44dbe9e627edd054a4fd122d04d587e21b25be5520fffb61cfc7538aee77c33a1a8cb1b97a
+ b2sums = 9aba508592818a4b4f000fc1bd471ec74687c8f0f972f330e851bd2364eaf30cff4d5012f843625ca025bc2478a2c76e0d082d43f33358ab18ce829fab4f0c2b
pkgname = linux-ck-uksm
pkgdesc = The Linux-ck-uksm kernel and modules with the ck1 and uksm patchesset featuring MuQSS CPU scheduler
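
A note on the source lines added above: makepkg's "localname::URL" syntax downloads the remote file and saves it under the given local name, so the three bundled patches (0008-0010) can be deleted from the repository below and all five patches fetched from GitLab at build time. Because .SRCINFO is generated from the PKGBUILD, the matching PKGBUILD entries presumably look roughly like the following sketch; the URLs are copied from the hunk above, while the array layout itself is an assumption rather than the package's actual PKGBUILD.

    source=(
      # ... earlier source entries (kernel tarball, other patches) omitted here ...
      '0008-UKSM.patch::https://gitlab.com/sirlucjan/kernel-patches/-/raw/master/5.12/uksm-patches/0001-UKSM-for-5.12.patch'
      '0009-bbr2.patch::https://gitlab.com/sirlucjan/kernel-patches/-/raw/master/5.12/bbr2-patches-v2/0001-bbr2-5.12-introduce-BBRv2.patch'
      '0010-btrfs.patch::https://gitlab.com/sirlucjan/kernel-patches/-/raw/master/5.12/btrfs-patches-v13/0001-btrfs-patches.patch'
      '0011-block.patch::https://gitlab.com/sirlucjan/kernel-patches/-/raw/master/5.12/block-patches-v6/0001-block-patches.patch'
      '0012-bfq.patch::https://gitlab.com/sirlucjan/kernel-patches/-/raw/master/5.12/bfq-patches-v15/0001-bfq-patches.patch'
    )
    # The name before '::' is the filename makepkg saves the download as;
    # the b2sums array must list checksums in the same order as source=().
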
diff --git a/0008-UKSM.patch b/0008-UKSM.patch
deleted file mode 100644
index 3321eaa8ee58..000000000000
--- a/0008-UKSM.patch
+++ /dev/null
@@ -1,6970 +0,0 @@
-From 9a42006b641bc8e0c333174a9bf269ac9450d521 Mon Sep 17 00:00:00 2001
-From: Piotr Gorski <lucjan.lucjanov@gmail.com>
-Date: Tue, 13 Apr 2021 16:27:12 +0200
-Subject: [PATCH] UKSM for 5.12
-
-Signed-off-by: Piotr Gorski <lucjan.lucjanov@gmail.com>
----
- Documentation/vm/uksm.txt | 61 +
- fs/exec.c | 1 +
- fs/proc/meminfo.c | 4 +
- include/linux/ksm.h | 43 +-
- include/linux/mm_types.h | 3 +
- include/linux/mmzone.h | 3 +
- include/linux/pgtable.h | 17 +-
- include/linux/sradix-tree.h | 77 +
- include/linux/uksm.h | 149 +
- kernel/fork.c | 2 +-
- lib/Makefile | 2 +-
- lib/sradix-tree.c | 476 +++
- mm/Kconfig | 26 +
- mm/Makefile | 3 +-
- mm/ksm.c | 11 -
- mm/memory.c | 33 +-
- mm/mmap.c | 37 +
- mm/uksm.c | 5614 +++++++++++++++++++++++++++++++++++
- mm/vmstat.c | 3 +
- 19 files changed, 6539 insertions(+), 26 deletions(-)
- create mode 100644 Documentation/vm/uksm.txt
- create mode 100644 include/linux/sradix-tree.h
- create mode 100644 include/linux/uksm.h
- create mode 100644 lib/sradix-tree.c
- create mode 100644 mm/uksm.c
-
-diff --git a/Documentation/vm/uksm.txt b/Documentation/vm/uksm.txt
-new file mode 100644
-index 000000000..be19a3127
---- /dev/null
-+++ b/Documentation/vm/uksm.txt
-@@ -0,0 +1,61 @@
-+The Ultra Kernel Samepage Merging feature
-+----------------------------------------------
-+/*
-+ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
-+ *
-+ * This is an improvement upon KSM. Some basic data structures and routines
-+ * are borrowed from ksm.c .
-+ *
-+ * Its new features:
-+ * 1. Full system scan:
-+ * It automatically scans all user processes' anonymous VMAs. Kernel-user
-+ * interaction to submit a memory area to KSM is no longer needed.
-+ *
-+ * 2. Rich area detection:
-+ * It automatically detects rich areas containing abundant duplicated
-+ * pages based. Rich areas are given a full scan speed. Poor areas are
-+ * sampled at a reasonable speed with very low CPU consumption.
-+ *
-+ * 3. Ultra Per-page scan speed improvement:
-+ * A new hash algorithm is proposed. As a result, on a machine with
-+ * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it
-+ * can scan memory areas that does not contain duplicated pages at speed of
-+ * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of
-+ * 477MB/sec ~ 923MB/sec.
-+ *
-+ * 4. Thrashing area avoidance:
-+ * Thrashing area(an VMA that has frequent Ksm page break-out) can be
-+ * filtered out. My benchmark shows it's more efficient than KSM's per-page
-+ * hash value based volatile page detection.
-+ *
-+ *
-+ * 5. Misc changes upon KSM:
-+ * * It has a fully x86-opitmized memcmp dedicated for 4-byte-aligned page
-+ * comparison. It's much faster than default C version on x86.
-+ * * rmap_item now has an struct *page member to loosely cache a
-+ * address-->page mapping, which reduces too much time-costly
-+ * follow_page().
-+ * * The VMA creation/exit procedures are hooked to let the Ultra KSM know.
-+ * * try_to_merge_two_pages() now can revert a pte if it fails. No break_
-+ * ksm is needed for this case.
-+ *
-+ * 6. Full Zero Page consideration(contributed by Figo Zhang)
-+ * Now uksmd consider full zero pages as special pages and merge them to an
-+ * special unswappable uksm zero page.
-+ */
-+
-+ChangeLog:
-+
-+2012-05-05 The creation of this Doc
-+2012-05-08 UKSM 0.1.1.1 libc crash bug fix, api clean up, doc clean up.
-+2012-05-28 UKSM 0.1.1.2 bug fix release
-+2012-06-26 UKSM 0.1.2-beta1 first beta release for 0.1.2
-+2012-07-2 UKSM 0.1.2-beta2
-+2012-07-10 UKSM 0.1.2-beta3
-+2012-07-26 UKSM 0.1.2 Fine grained speed control, more scan optimization.
-+2012-10-13 UKSM 0.1.2.1 Bug fixes.
-+2012-12-31 UKSM 0.1.2.2 Minor bug fixes.
-+2014-07-02 UKSM 0.1.2.3 Fix a " __this_cpu_read() in preemptible bug".
-+2015-04-22 UKSM 0.1.2.4 Fix a race condition that can sometimes trigger anonying warnings.
-+2016-09-10 UKSM 0.1.2.5 Fix a bug in dedup ratio calculation.
-+2017-02-26 UKSM 0.1.2.6 Fix a bug in hugetlbpage handling and a race bug with page migration.
-diff --git a/fs/exec.c b/fs/exec.c
-index 18594f11c..aee636fd4 100644
---- a/fs/exec.c
-+++ b/fs/exec.c
-@@ -65,6 +65,7 @@
- #include <linux/vmalloc.h>
- #include <linux/io_uring.h>
- #include <linux/syscall_user_dispatch.h>
-+#include <linux/ksm.h>
-
- #include <linux/uaccess.h>
- #include <asm/mmu_context.h>
-diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
-index 6fa761c9c..45fd59a0d 100644
---- a/fs/proc/meminfo.c
-+++ b/fs/proc/meminfo.c
-@@ -108,6 +108,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
- #endif
- show_val_kb(m, "PageTables: ",
- global_node_page_state(NR_PAGETABLE));
-+#ifdef CONFIG_UKSM
-+ show_val_kb(m, "KsmZeroPages: ",
-+ global_zone_page_state(NR_UKSM_ZERO_PAGES));
-+#endif
-
- show_val_kb(m, "NFS_Unstable: ", 0);
- show_val_kb(m, "Bounce: ",
-diff --git a/include/linux/ksm.h b/include/linux/ksm.h
-index 161e8164a..f0dbdf3c9 100644
---- a/include/linux/ksm.h
-+++ b/include/linux/ksm.h
-@@ -21,20 +21,16 @@ struct mem_cgroup;
- #ifdef CONFIG_KSM
- int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, int advice, unsigned long *vm_flags);
--int __ksm_enter(struct mm_struct *mm);
--void __ksm_exit(struct mm_struct *mm);
-
--static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
-+static inline struct stable_node *page_stable_node(struct page *page)
- {
-- if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
-- return __ksm_enter(mm);
-- return 0;
-+ return PageKsm(page) ? page_rmapping(page) : NULL;
- }
-
--static inline void ksm_exit(struct mm_struct *mm)
-+static inline void set_page_stable_node(struct page *page,
-+ struct stable_node *stable_node)
- {
-- if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
-- __ksm_exit(mm);
-+ page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
- }
-
- /*
-@@ -54,6 +50,33 @@ struct page *ksm_might_need_to_copy(struct page *page,
- void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc);
- void ksm_migrate_page(struct page *newpage, struct page *oldpage);
-
-+#ifdef CONFIG_KSM_LEGACY
-+int __ksm_enter(struct mm_struct *mm);
-+void __ksm_exit(struct mm_struct *mm);
-+static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
-+{
-+ if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
-+ return __ksm_enter(mm);
-+ return 0;
-+}
-+
-+static inline void ksm_exit(struct mm_struct *mm)
-+{
-+ if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
-+ __ksm_exit(mm);
-+}
-+
-+#elif defined(CONFIG_UKSM)
-+static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
-+{
-+ return 0;
-+}
-+
-+static inline void ksm_exit(struct mm_struct *mm)
-+{
-+}
-+#endif /* !CONFIG_UKSM */
-+
- #else /* !CONFIG_KSM */
-
- static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
-@@ -89,4 +112,6 @@ static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage)
- #endif /* CONFIG_MMU */
- #endif /* !CONFIG_KSM */
-
-+#include <linux/uksm.h>
-+
- #endif /* __LINUX_KSM_H */
-diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
-index 6613b26a8..82e18e41b 100644
---- a/include/linux/mm_types.h
-+++ b/include/linux/mm_types.h
-@@ -370,6 +370,9 @@ struct vm_area_struct {
- struct mempolicy *vm_policy; /* NUMA policy for the VMA */
- #endif
- struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
-+#ifdef CONFIG_UKSM
-+ struct vma_slot *uksm_vma_slot;
-+#endif
- } __randomize_layout;
-
- struct core_thread {
-diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
-index 47946cec7..a6ce64844 100644
---- a/include/linux/mmzone.h
-+++ b/include/linux/mmzone.h
-@@ -157,6 +157,9 @@ enum zone_stat_item {
- NR_ZSPAGES, /* allocated in zsmalloc */
- #endif
- NR_FREE_CMA_PAGES,
-+#ifdef CONFIG_UKSM
-+ NR_UKSM_ZERO_PAGES,
-+#endif
- NR_VM_ZONE_STAT_ITEMS };
-
- enum node_stat_item {
-diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
-index 5e772392a..9d733540d 100644
---- a/include/linux/pgtable.h
-+++ b/include/linux/pgtable.h
-@@ -1111,12 +1111,25 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
- extern void untrack_pfn_moved(struct vm_area_struct *vma);
- #endif
-
-+#ifdef CONFIG_UKSM
-+static inline int is_uksm_zero_pfn(unsigned long pfn)
-+{
-+ extern unsigned long uksm_zero_pfn;
-+ return pfn == uksm_zero_pfn;
-+}
-+#else
-+static inline int is_uksm_zero_pfn(unsigned long pfn)
-+{
-+ return 0;
-+}
-+#endif
-+
- #ifdef __HAVE_COLOR_ZERO_PAGE
- static inline int is_zero_pfn(unsigned long pfn)
- {
- extern unsigned long zero_pfn;
- unsigned long offset_from_zero_pfn = pfn - zero_pfn;
-- return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT);
-+ return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT) || is_uksm_zero_pfn(pfn);
- }
-
- #define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr))
-@@ -1125,7 +1138,7 @@ static inline int is_zero_pfn(unsigned long pfn)
- static inline int is_zero_pfn(unsigned long pfn)
- {
- extern unsigned long zero_pfn;
-- return pfn == zero_pfn;
-+ return (pfn == zero_pfn) || (is_uksm_zero_pfn(pfn));
- }
-
- static inline unsigned long my_zero_pfn(unsigned long addr)
-diff --git a/include/linux/sradix-tree.h b/include/linux/sradix-tree.h
-new file mode 100644
-index 000000000..d71edba6b
---- /dev/null
-+++ b/include/linux/sradix-tree.h
-@@ -0,0 +1,77 @@
-+#ifndef _LINUX_SRADIX_TREE_H
-+#define _LINUX_SRADIX_TREE_H
-+
-+
-+#define INIT_SRADIX_TREE(root, mask) \
-+do { \
-+ (root)->height = 0; \
-+ (root)->gfp_mask = (mask); \
-+ (root)->rnode = NULL; \
-+} while (0)
-+
-+#define ULONG_BITS (sizeof(unsigned long) * 8)
-+#define SRADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long))
-+//#define SRADIX_TREE_MAP_SHIFT 6
-+//#define SRADIX_TREE_MAP_SIZE (1UL << SRADIX_TREE_MAP_SHIFT)
-+//#define SRADIX_TREE_MAP_MASK (SRADIX_TREE_MAP_SIZE-1)
-+
-+struct sradix_tree_node {
-+ unsigned int height; /* Height from the bottom */
-+ unsigned int count;
-+ unsigned int fulls; /* Number of full sublevel trees */
-+ struct sradix_tree_node *parent;
-+ void *stores[0];
-+};
-+
-+/* A simple radix tree implementation */
-+struct sradix_tree_root {
-+ unsigned int height;
-+ struct sradix_tree_node *rnode;
-+
-+ /* Where found to have available empty stores in its sublevels */
-+ struct sradix_tree_node *enter_node;
-+ unsigned int shift;
-+ unsigned int stores_size;
-+ unsigned int mask;
-+ unsigned long min; /* The first hole index */
-+ unsigned long num;
-+ //unsigned long *height_to_maxindex;
-+
-+ /* How the node is allocated and freed. */
-+ struct sradix_tree_node *(*alloc)(void);
-+ void (*free)(struct sradix_tree_node *node);
-+
-+ /* When a new node is added and removed */
-+ void (*extend)(struct sradix_tree_node *parent, struct sradix_tree_node *child);
-+ void (*assign)(struct sradix_tree_node *node, unsigned int index, void *item);
-+ void (*rm)(struct sradix_tree_node *node, unsigned int offset);
-+};
-+
-+struct sradix_tree_path {
-+ struct sradix_tree_node *node;
-+ int offset;
-+};
-+
-+static inline
-+void init_sradix_tree_root(struct sradix_tree_root *root, unsigned long shift)
-+{
-+ root->height = 0;
-+ root->rnode = NULL;
-+ root->shift = shift;
-+ root->stores_size = 1UL << shift;
-+ root->mask = root->stores_size - 1;
-+}
-+
-+
-+extern void *sradix_tree_next(struct sradix_tree_root *root,
-+ struct sradix_tree_node *node, unsigned long index,
-+ int (*iter)(void *, unsigned long));
-+
-+extern int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num);
-+
-+extern void sradix_tree_delete_from_leaf(struct sradix_tree_root *root,
-+ struct sradix_tree_node *node, unsigned long index);
-+
-+extern void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index);
-+
-+#endif /* _LINUX_SRADIX_TREE_H */
-diff --git a/include/linux/uksm.h b/include/linux/uksm.h
-new file mode 100644
-index 000000000..bb8651f53
---- /dev/null
-+++ b/include/linux/uksm.h
-@@ -0,0 +1,149 @@
-+#ifndef __LINUX_UKSM_H
-+#define __LINUX_UKSM_H
-+/*
-+ * Memory merging support.
-+ *
-+ * This code enables dynamic sharing of identical pages found in different
-+ * memory areas, even if they are not shared by fork().
-+ */
-+
-+/* if !CONFIG_UKSM this file should not be compiled at all. */
-+#ifdef CONFIG_UKSM
-+
-+#include <linux/bitops.h>
-+#include <linux/mm.h>
-+#include <linux/pagemap.h>
-+#include <linux/rmap.h>
-+#include <linux/sched.h>
-+
-+extern unsigned long zero_pfn __read_mostly;
-+extern unsigned long uksm_zero_pfn __read_mostly;
-+extern struct page *empty_uksm_zero_page;
-+
-+/* must be done before linked to mm */
-+extern void uksm_vma_add_new(struct vm_area_struct *vma);
-+extern void uksm_remove_vma(struct vm_area_struct *vma);
-+
-+#define UKSM_SLOT_NEED_SORT (1 << 0)
-+#define UKSM_SLOT_NEED_RERAND (1 << 1)
-+#define UKSM_SLOT_SCANNED (1 << 2) /* It's scanned in this round */
-+#define UKSM_SLOT_FUL_SCANNED (1 << 3)
-+#define UKSM_SLOT_IN_UKSM (1 << 4)
-+
-+struct vma_slot {
-+ struct sradix_tree_node *snode;
-+ unsigned long sindex;
-+
-+ struct list_head slot_list;
-+ unsigned long fully_scanned_round;
-+ unsigned long dedup_num;
-+ unsigned long pages_scanned;
-+ unsigned long this_sampled;
-+ unsigned long last_scanned;
-+ unsigned long pages_to_scan;
-+ struct scan_rung *rung;
-+ struct page **rmap_list_pool;
-+ unsigned int *pool_counts;
-+ unsigned long pool_size;
-+ struct vm_area_struct *vma;
-+ struct mm_struct *mm;
-+ unsigned long ctime_j;
-+ unsigned long pages;
-+ unsigned long flags;
-+ unsigned long pages_cowed; /* pages cowed this round */
-+ unsigned long pages_merged; /* pages merged this round */
-+ unsigned long pages_bemerged;
-+
-+ /* when it has page merged in this eval round */
-+ struct list_head dedup_list;
-+};
-+
-+static inline void uksm_unmap_zero_page(pte_t pte)
-+{
-+ if (pte_pfn(pte) == uksm_zero_pfn)
-+ __dec_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES);
-+}
-+
-+static inline void uksm_map_zero_page(pte_t pte)
-+{
-+ if (pte_pfn(pte) == uksm_zero_pfn)
-+ __inc_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES);
-+}
-+
-+static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page)
-+{
-+ if (vma->uksm_vma_slot && PageKsm(page))
-+ vma->uksm_vma_slot->pages_cowed++;
-+}
-+
-+static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte)
-+{
-+ if (vma->uksm_vma_slot && pte_pfn(pte) == uksm_zero_pfn)
-+ vma->uksm_vma_slot->pages_cowed++;
-+}
-+
-+static inline int uksm_flags_can_scan(unsigned long vm_flags)
-+{
-+#ifdef VM_SAO
-+ if (vm_flags & VM_SAO)
-+ return 0;
-+#endif
-+
-+ return !(vm_flags & (VM_PFNMAP | VM_IO | VM_DONTEXPAND |
-+ VM_HUGETLB | VM_MIXEDMAP | VM_SHARED
-+ | VM_MAYSHARE | VM_GROWSUP | VM_GROWSDOWN));
-+}
-+
-+static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p)
-+{
-+ if (uksm_flags_can_scan(*vm_flags_p))
-+ *vm_flags_p |= VM_MERGEABLE;
-+}
-+
-+/*
-+ * Just a wrapper for BUG_ON for where ksm_zeropage must not be. TODO: it will
-+ * be removed when uksm zero page patch is stable enough.
-+ */
-+static inline void uksm_bugon_zeropage(pte_t pte)
-+{
-+ BUG_ON(pte_pfn(pte) == uksm_zero_pfn);
-+}
-+#else
-+static inline void uksm_vma_add_new(struct vm_area_struct *vma)
-+{
-+}
-+
-+static inline void uksm_remove_vma(struct vm_area_struct *vma)
-+{
-+}
-+
-+static inline void uksm_unmap_zero_page(pte_t pte)
-+{
-+}
-+
-+static inline void uksm_map_zero_page(pte_t pte)
-+{
-+}
-+
-+static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page)
-+{
-+}
-+
-+static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte)
-+{
-+}
-+
-+static inline int uksm_flags_can_scan(unsigned long vm_flags)
-+{
-+ return 0;
-+}
-+
-+static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p)
-+{
-+}
-+
-+static inline void uksm_bugon_zeropage(pte_t pte)
-+{
-+}
-+#endif /* !CONFIG_UKSM */
-+#endif /* __LINUX_UKSM_H */
-diff --git a/kernel/fork.c b/kernel/fork.c
-index 426cd0c51..5fd356ca7 100644
---- a/kernel/fork.c
-+++ b/kernel/fork.c
-@@ -588,7 +588,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
- __vma_link_rb(mm, tmp, rb_link, rb_parent);
- rb_link = &tmp->vm_rb.rb_right;
- rb_parent = &tmp->vm_rb;
--
-+ uksm_vma_add_new(tmp);
- mm->map_count++;
- if (!(tmp->vm_flags & VM_WIPEONFORK))
- retval = copy_page_range(tmp, mpnt);
-diff --git a/lib/Makefile b/lib/Makefile
-index b5307d3ee..480b099e1 100644
---- a/lib/Makefile
-+++ b/lib/Makefile
-@@ -28,7 +28,7 @@ CFLAGS_string.o += -fno-stack-protector
- endif
-
- lib-y := ctype.o string.o vsprintf.o cmdline.o \
-- rbtree.o radix-tree.o timerqueue.o xarray.o \
-+ rbtree.o radix-tree.o sradix-tree.o timerqueue.o xarray.o \
- idr.o extable.o sha1.o irq_regs.o argv_split.o \
- flex_proportions.o ratelimit.o show_mem.o \
- is_single_threaded.o plist.o decompress.o kobject_uevent.o \
-diff --git a/lib/sradix-tree.c b/lib/sradix-tree.c
-new file mode 100644
-index 000000000..ab21e6309
---- /dev/null
-+++ b/lib/sradix-tree.c
-@@ -0,0 +1,476 @@
-+#include <linux/errno.h>
-+#include <linux/mm.h>
-+#include <linux/mman.h>
-+#include <linux/spinlock.h>
-+#include <linux/slab.h>
-+#include <linux/gcd.h>
-+#include <linux/sradix-tree.h>
-+
-+static inline int sradix_node_full(struct sradix_tree_root *root, struct sradix_tree_node *node)
-+{
-+ return node->fulls == root->stores_size ||
-+ (node->height == 1 && node->count == root->stores_size);
-+}
-+
-+/*
-+ * Extend a sradix tree so it can store key @index.
-+ */
-+static int sradix_tree_extend(struct sradix_tree_root *root, unsigned long index)
-+{
-+ struct sradix_tree_node *node;
-+ unsigned int height;
-+
-+ if (unlikely(root->rnode == NULL)) {
-+ if (!(node = root->alloc()))
-+ return -ENOMEM;
-+
-+ node->height = 1;
-+ root->rnode = node;
-+ root->height = 1;
-+ }
-+
-+ /* Figure out what the height should be. */
-+ height = root->height;
-+ index >>= root->shift * height;
-+
-+ while (index) {
-+ index >>= root->shift;
-+ height++;
-+ }
-+
-+ while (height > root->height) {
-+ unsigned int newheight;
-+
-+ if (!(node = root->alloc()))
-+ return -ENOMEM;
-+
-+ /* Increase the height. */
-+ node->stores[0] = root->rnode;
-+ root->rnode->parent = node;
-+ if (root->extend)
-+ root->extend(node, root->rnode);
-+
-+ newheight = root->height + 1;
-+ node->height = newheight;
-+ node->count = 1;
-+ if (sradix_node_full(root, root->rnode))
-+ node->fulls = 1;
-+
-+ root->rnode = node;
-+ root->height = newheight;
-+ }
-+
-+ return 0;
-+}
-+
-+/*
-+ * Search the next item from the current node, that is not NULL
-+ * and can satify root->iter().
-+ */
-+void *sradix_tree_next(struct sradix_tree_root *root,
-+ struct sradix_tree_node *node, unsigned long index,
-+ int (*iter)(void *item, unsigned long height))
-+{
-+ unsigned long offset;
-+ void *item;
-+
-+ if (unlikely(node == NULL)) {
-+ node = root->rnode;
-+ for (offset = 0; offset < root->stores_size; offset++) {
-+ item = node->stores[offset];
-+ if (item && (!iter || iter(item, node->height)))
-+ break;
-+ }
-+
-+ if (unlikely(offset >= root->stores_size))
-+ return NULL;
-+
-+ if (node->height == 1)
-+ return item;
-+ else
-+ goto go_down;
-+ }
-+
-+ while (node) {
-+ offset = (index & root->mask) + 1;
-+ for (; offset < root->stores_size; offset++) {
-+ item = node->stores[offset];
-+ if (item && (!iter || iter(item, node->height)))
-+ break;
-+ }
-+
-+ if (offset < root->stores_size)
-+ break;
-+
-+ node = node->parent;
-+ index >>= root->shift;
-+ }
-+
-+ if (!node)
-+ return NULL;
-+
-+ while (node->height > 1) {
-+go_down:
-+ node = item;
-+ for (offset = 0; offset < root->stores_size; offset++) {
-+ item = node->stores[offset];
-+ if (item && (!iter || iter(item, node->height)))
-+ break;
-+ }
-+
-+ if (unlikely(offset >= root->stores_size))
-+ return NULL;
-+ }
-+
-+ BUG_ON(offset > root->stores_size);
-+
-+ return item;
-+}
-+
-+/*
-+ * Blindly insert the item to the tree. Typically, we reuse the
-+ * first empty store item.
-+ */
-+int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num)
-+{
-+ unsigned long index;
-+ unsigned int height;
-+ struct sradix_tree_node *node, *tmp = NULL;
-+ int offset, offset_saved;
-+ void **store = NULL;
-+ int error, i, j, shift;
-+
-+go_on:
-+ index = root->min;
-+
-+ if (root->enter_node && !sradix_node_full(root, root->enter_node)) {
-+ node = root->enter_node;
-+ BUG_ON((index >> (root->shift * root->height)));
-+ } else {
-+ node = root->rnode;
-+ if (node == NULL || (index >> (root->shift * root->height))
-+ || sradix_node_full(root, node)) {
-+ error = sradix_tree_extend(root, index);
-+ if (error)
-+ return error;
-+
-+ node = root->rnode;
-+ }
-+ }
-+
-+
-+ height = node->height;
-+ shift = (height - 1) * root->shift;
-+ offset = (index >> shift) & root->mask;
-+ while (shift > 0) {
-+ offset_saved = offset;
-+ for (; offset < root->stores_size; offset++) {
-+ store = &node->stores[offset];
-+ tmp = *store;
-+
-+ if (!tmp || !sradix_node_full(root, tmp))
-+ break;
-+ }
-+ BUG_ON(offset >= root->stores_size);
-+
-+ if (offset != offset_saved) {
-+ index += (offset - offset_saved) << shift;
-+ index &= ~((1UL << shift) - 1);
-+ }
-+
-+ if (!tmp) {
-+ if (!(tmp = root->alloc()))
-+ return -ENOMEM;
-+
-+ tmp->height = shift / root->shift;
-+ *store = tmp;
-+ tmp->parent = node;
-+ node->count++;
-+// if (root->extend)
-+// root->extend(node, tmp);
-+ }
-+
-+ node = tmp;
-+ shift -= root->shift;
-+ offset = (index >> shift) & root->mask;
-+ }
-+
-+ BUG_ON(node->height != 1);
-+
-+
-+ store = &node->stores[offset];
-+ for (i = 0, j = 0;
-+ j < root->stores_size - node->count &&
-+ i < root->stores_size - offset && j < num; i++) {
-+ if (!store[i]) {
-+ store[i] = item[j];
-+ if (root->assign)
-+ root->assign(node, index + i, item[j]);
-+ j++;
-+ }
-+ }
-+
-+ node->count += j;
-+ root->num += j;
-+ num -= j;
-+
-+ while (sradix_node_full(root, node)) {
-+ node = node->parent;
-+ if (!node)
-+ break;
-+
-+ node->fulls++;
-+ }
-+
-+ if (unlikely(!node)) {
-+ /* All nodes are full */
-+ root->min = 1 << (root->height * root->shift);
-+ root->enter_node = NULL;
-+ } else {
-+ root->min = index + i - 1;
-+ root->min |= (1UL << (node->height - 1)) - 1;
-+ root->min++;
-+ root->enter_node = node;
-+ }
-+
-+ if (num) {
-+ item += j;
-+ goto go_on;
-+ }
-+
-+ return 0;
-+}
-+
-+
-+/**
-+ * sradix_tree_shrink - shrink height of a sradix tree to minimal
-+ * @root sradix tree root
-+ *
-+ */
-+static inline void sradix_tree_shrink(struct sradix_tree_root *root)
-+{
-+ /* try to shrink tree height */
-+ while (root->height > 1) {
-+ struct sradix_tree_node *to_free = root->rnode;
-+
-+ /*
-+ * The candidate node has more than one child, or its child
-+ * is not at the leftmost store, we cannot shrink.
-+ */
-+ if (to_free->count != 1 || !to_free->stores[0])
-+ break;
-+
-+ root->rnode = to_free->stores[0];
-+ root->rnode->parent = NULL;
-+ root->height--;
-+ if (unlikely(root->enter_node == to_free))
-+ root->enter_node = NULL;
-+ root->free(to_free);
-+ }
-+}
-+
-+/*
-+ * Del the item on the known leaf node and index
-+ */
-+void sradix_tree_delete_from_leaf(struct sradix_tree_root *root,
-+ struct sradix_tree_node *node, unsigned long index)
-+{
-+ unsigned int offset;
-+ struct sradix_tree_node *start, *end;
-+
-+ BUG_ON(node->height != 1);
-+
-+ start = node;
-+ while (node && !(--node->count))
-+ node = node->parent;
-+
-+ end = node;
-+ if (!node) {
-+ root->rnode = NULL;
-+ root->height = 0;
-+ root->min = 0;
-+ root->num = 0;
-+ root->enter_node = NULL;
-+ } else {
-+ offset = (index >> (root->shift * (node->height - 1))) & root->mask;
-+ if (root->rm)
-+ root->rm(node, offset);
-+ node->stores[offset] = NULL;
-+ root->num--;
-+ if (root->min > index) {
-+ root->min = index;
-+ root->enter_node = node;
-+ }
-+ }
-+
-+ if (start != end) {
-+ do {
-+ node = start;
-+ start = start->parent;
-+ if (unlikely(root->enter_node == node))
-+ root->enter_node = end;
-+ root->free(node);
-+ } while (start != end);
-+
-+ /*
-+ * Note that shrink may free "end", so enter_node still need to
-+ * be checked inside.
-+ */
-+ sradix_tree_shrink(root);
-+ } else if (node->count == root->stores_size - 1) {
-+ /* It WAS a full leaf node. Update the ancestors */
-+ node = node->parent;
-+ while (node) {
-+ node->fulls--;
-+ if (node->fulls != root->stores_size - 1)
-+ break;
-+
-+ node = node->parent;
-+ }
-+ }
-+}
-+
-+void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index)
-+{
-+ unsigned int height, offset;
-+ struct sradix_tree_node *node;
-+ int shift;
-+
-+ node = root->rnode;
-+ if (node == NULL || (index >> (root->shift * root->height)))
-+ return NULL;
-+
-+ height = root->height;
-+ shift = (height - 1) * root->shift;
-+
-+ do {
-+ offset = (index >> shift) & root->mask;
-+ node = node->stores[offset];
-+ if (!node)
-+ return NULL;
-+
-+ shift -= root->shift;
-+ } while (shift >= 0);
-+
-+ return node;
-+}
-+
-+/*
-+ * Return the item if it exists, otherwise create it in place
-+ * and return the created item.
-+ */
-+void *sradix_tree_lookup_create(struct sradix_tree_root *root,
-+ unsigned long index, void *(*item_alloc)(void))
-+{
-+ unsigned int height, offset;
-+ struct sradix_tree_node *node, *tmp;
-+ void *item;
-+ int shift, error;
-+
-+ if (root->rnode == NULL || (index >> (root->shift * root->height))) {
-+ if (item_alloc) {
-+ error = sradix_tree_extend(root, index);
-+ if (error)
-+ return NULL;
-+ } else {
-+ return NULL;
-+ }
-+ }
-+
-+ node = root->rnode;
-+ height = root->height;
-+ shift = (height - 1) * root->shift;
-+
-+ do {
-+ offset = (index >> shift) & root->mask;
-+ if (!node->stores[offset]) {
-+ if (!(tmp = root->alloc()))
-+ return NULL;
-+
-+ tmp->height = shift / root->shift;
-+ node->stores[offset] = tmp;
-+ tmp->parent = node;
-+ node->count++;
-+ node = tmp;
-+ } else {
-+ node = node->stores[offset];
-+ }
-+
-+ shift -= root->shift;
-+ } while (shift > 0);
-+
-+ BUG_ON(node->height != 1);
-+ offset = index & root->mask;
-+ if (node->stores[offset]) {
-+ return node->stores[offset];
-+ } else if (item_alloc) {
-+ if (!(item = item_alloc()))
-+ return NULL;
-+
-+ node->stores[offset] = item;
-+
-+ /*
-+ * NOTE: we do NOT call root->assign here, since this item is
-+ * newly created by us having no meaning. Caller can call this
-+ * if it's necessary to do so.
-+ */
-+
-+ node->count++;
-+ root->num++;
-+
-+ while (sradix_node_full(root, node)) {
-+ node = node->parent;
-+ if (!node)
-+ break;
-+
-+ node->fulls++;
-+ }
-+
-+ if (unlikely(!node)) {
-+ /* All nodes are full */
-+ root->min = 1 << (root->height * root->shift);
-+ } else {
-+ if (root->min == index) {
-+ root->min |= (1UL << (node->height - 1)) - 1;
-+ root->min++;
-+ root->enter_node = node;
-+ }
-+ }
-+
-+ return item;
-+ } else {
-+ return NULL;
-+ }
-+
-+}
-+
-+int sradix_tree_delete(struct sradix_tree_root *root, unsigned long index)
-+{
-+ unsigned int height, offset;
-+ struct sradix_tree_node *node;
-+ int shift;
-+
-+ node = root->rnode;
-+ if (node == NULL || (index >> (root->shift * root->height)))
-+ return -ENOENT;
-+
-+ height = root->height;
-+ shift = (height - 1) * root->shift;
-+
-+ do {
-+ offset = (index >> shift) & root->mask;
-+ node = node->stores[offset];
-+ if (!node)
-+ return -ENOENT;
-+
-+ shift -= root->shift;
-+ } while (shift > 0);
-+
-+ offset = index & root->mask;
-+ if (!node->stores[offset])
-+ return -ENOENT;
-+
-+ sradix_tree_delete_from_leaf(root, node, index);
-+
-+ return 0;
-+}
-diff --git a/mm/Kconfig b/mm/Kconfig
-index 24c045b24..3ce98ecc2 100644
---- a/mm/Kconfig
-+++ b/mm/Kconfig
-@@ -317,6 +317,32 @@ config KSM
- See Documentation/vm/ksm.rst for more information: KSM is inactive
- until a program has madvised that an area is MADV_MERGEABLE, and
- root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
-+choice
-+ prompt "Choose UKSM/KSM strategy"
-+ default UKSM
-+ depends on KSM
-+ help
-+ This option allows to select a UKSM/KSM stragety.
-+
-+config UKSM
-+ bool "Ultra-KSM for page merging"
-+ depends on KSM
-+ help
-+	  UKSM is inspired by the Linux kernel project — KSM(Kernel Same
-+ page Merging), but with a fundamentally rewritten core algorithm. With
-+ an advanced algorithm, UKSM now can transparently scans all anonymously
-+ mapped user space applications with an significantly improved scan speed
-+ and CPU efficiency. Since KVM is friendly to KSM, KVM can also benefit from
-+ UKSM. Now UKSM has its first stable release and first real world enterprise user.
-+ For more information, please goto its project page.
-+ (github.com/dolohow/uksm)
-+
-+config KSM_LEGACY
-+ bool "Legacy KSM implementation"
-+ depends on KSM
-+ help
-+ The legacy KSM implementation from Red Hat.
-+endchoice
-
- config DEFAULT_MMAP_MIN_ADDR
- int "Low address space to protect from user allocation"
-diff --git a/mm/Makefile b/mm/Makefile
-index 72227b24a..fd50a3a51 100644
---- a/mm/Makefile
-+++ b/mm/Makefile
-@@ -76,7 +76,8 @@ obj-$(CONFIG_SPARSEMEM) += sparse.o
- obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
- obj-$(CONFIG_SLOB) += slob.o
- obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
--obj-$(CONFIG_KSM) += ksm.o
-+obj-$(CONFIG_KSM_LEGACY) += ksm.o
-+obj-$(CONFIG_UKSM) += uksm.o
- obj-$(CONFIG_PAGE_POISONING) += page_poison.o
- obj-$(CONFIG_SLAB) += slab.o
- obj-$(CONFIG_SLUB) += slub.o
-diff --git a/mm/ksm.c b/mm/ksm.c
-index 9694ee2c7..63af6a528 100644
---- a/mm/ksm.c
-+++ b/mm/ksm.c
-@@ -858,17 +858,6 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
- return err;
- }
-
--static inline struct stable_node *page_stable_node(struct page *page)
--{
-- return PageKsm(page) ? page_rmapping(page) : NULL;
--}
--
--static inline void set_page_stable_node(struct page *page,
-- struct stable_node *stable_node)
--{
-- page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
--}
--
- #ifdef CONFIG_SYSFS
- /*
- * Only called through the sysfs control interface:
-diff --git a/mm/memory.c b/mm/memory.c
-index 550405fc3..b4005b195 100644
---- a/mm/memory.c
-+++ b/mm/memory.c
-@@ -158,6 +158,25 @@ EXPORT_SYMBOL(zero_pfn);
-
- unsigned long highest_memmap_pfn __read_mostly;
-
-+#ifdef CONFIG_UKSM
-+unsigned long uksm_zero_pfn __read_mostly;
-+EXPORT_SYMBOL_GPL(uksm_zero_pfn);
-+struct page *empty_uksm_zero_page;
-+
-+static int __init setup_uksm_zero_page(void)
-+{
-+ empty_uksm_zero_page = alloc_pages(__GFP_ZERO & ~__GFP_MOVABLE, 0);
-+ if (!empty_uksm_zero_page)
-+ panic("Oh boy, that early out of memory?");
-+
-+ SetPageReserved(empty_uksm_zero_page);
-+ uksm_zero_pfn = page_to_pfn(empty_uksm_zero_page);
-+
-+ return 0;
-+}
-+core_initcall(setup_uksm_zero_page);
-+#endif
-+
- /*
- * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
- */
-@@ -173,6 +192,7 @@ void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
- trace_rss_stat(mm, member, count);
- }
-
-+
- #if defined(SPLIT_RSS_COUNTING)
-
- void sync_mm_rss(struct mm_struct *mm)
-@@ -875,6 +895,11 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
- get_page(page);
- page_dup_rmap(page, false);
- rss[mm_counter(page)]++;
-+
-+ /* Should return NULL in vm_normal_page() */
-+ uksm_bugon_zeropage(pte);
-+ } else {
-+ uksm_map_zero_page(pte);
- }
-
- /*
-@@ -1254,8 +1279,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
- ptent = ptep_get_and_clear_full(mm, addr, pte,
- tlb->fullmm);
- tlb_remove_tlb_entry(tlb, pte, addr);
-- if (unlikely(!page))
-+ if (unlikely(!page)) {
-+ uksm_unmap_zero_page(ptent);
- continue;
-+ }
-
- if (!PageAnon(page)) {
- if (pte_dirty(ptent)) {
-@@ -2603,6 +2630,7 @@ static inline bool cow_user_page(struct page *dst, struct page *src,
-
- if (likely(src)) {
- copy_user_highpage(dst, src, addr, vma);
-+ uksm_cow_page(vma, src);
- return true;
- }
-
-@@ -2849,6 +2877,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
- vmf->address);
- if (!new_page)
- goto oom;
-+ uksm_cow_pte(vma, vmf->orig_pte);
- } else {
- new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
- vmf->address);
-@@ -2891,7 +2920,9 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
- mm_counter_file(old_page));
- inc_mm_counter_fast(mm, MM_ANONPAGES);
- }
-+ uksm_bugon_zeropage(vmf->orig_pte);
- } else {
-+ uksm_unmap_zero_page(vmf->orig_pte);
- inc_mm_counter_fast(mm, MM_ANONPAGES);
- }
- flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
-diff --git a/mm/mmap.c b/mm/mmap.c
-index 3f287599a..dc719db43 100644
---- a/mm/mmap.c
-+++ b/mm/mmap.c
-@@ -46,6 +46,7 @@
- #include <linux/moduleparam.h>
- #include <linux/pkeys.h>
- #include <linux/oom.h>
-+#include <linux/ksm.h>
- #include <linux/sched/mm.h>
-
- #include <linux/uaccess.h>
-@@ -181,6 +182,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
- if (vma->vm_file)
- fput(vma->vm_file);
- mpol_put(vma_policy(vma));
-+ uksm_remove_vma(vma);
- vm_area_free(vma);
- return next;
- }
-@@ -748,9 +750,16 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
- long adjust_next = 0;
- int remove_next = 0;
-
-+/*
-+ * to avoid deadlock, ksm_remove_vma must be done before any spin_lock is
-+ * acquired
-+ */
-+ uksm_remove_vma(vma);
-+
- if (next && !insert) {
- struct vm_area_struct *exporter = NULL, *importer = NULL;
-
-+ uksm_remove_vma(next);
- if (end >= next->vm_end) {
- /*
- * vma expands, overlapping all the next, and
-@@ -881,6 +890,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
- end_changed = true;
- }
- vma->vm_pgoff = pgoff;
-+
- if (adjust_next) {
- next->vm_start += adjust_next;
- next->vm_pgoff += adjust_next >> PAGE_SHIFT;
-@@ -985,6 +995,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
- if (remove_next == 2) {
- remove_next = 1;
- end = next->vm_end;
-+ uksm_remove_vma(next);
- goto again;
- }
- else if (next)
-@@ -1011,10 +1022,14 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
- */
- VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma));
- }
-+ } else {
-+ if (next && !insert)
-+ uksm_vma_add_new(next);
- }
- if (insert && file)
- uprobe_mmap(insert);
-
-+ uksm_vma_add_new(vma);
- validate_mm(mm);
-
- return 0;
-@@ -1470,6 +1485,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
- vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
- mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
-
-+ /* If uksm is enabled, we add VM_MERGEABLE to new VMAs. */
-+ uksm_vm_flags_mod(&vm_flags);
-+
- if (flags & MAP_LOCKED)
- if (!can_do_mlock())
- return -EPERM;
-@@ -1865,6 +1883,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
- allow_write_access(file);
- }
- file = vma->vm_file;
-+ uksm_vma_add_new(vma);
- out:
- perf_event_mmap(vma);
-
-@@ -1907,6 +1926,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
- if (vm_flags & VM_DENYWRITE)
- allow_write_access(file);
- free_vma:
-+ uksm_remove_vma(vma);
- vm_area_free(vma);
- unacct_error:
- if (charged)
-@@ -2766,6 +2786,8 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
- else
- err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
-
-+ uksm_vma_add_new(new);
-+
- /* Success. */
- if (!err)
- return 0;
-@@ -3073,6 +3095,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
- if ((flags & (~VM_EXEC)) != 0)
- return -EINVAL;
- flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
-+ uksm_vm_flags_mod(&flags);
-
- mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
- if (IS_ERR_VALUE(mapped_addr))
-@@ -3118,6 +3141,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
- vma->vm_flags = flags;
- vma->vm_page_prot = vm_get_page_prot(flags);
- vma_link(mm, vma, prev, rb_link, rb_parent);
-+ uksm_vma_add_new(vma);
- out:
- perf_event_mmap(vma);
- mm->total_vm += len >> PAGE_SHIFT;
-@@ -3195,6 +3219,12 @@ void exit_mmap(struct mm_struct *mm)
- mmap_write_unlock(mm);
- }
-
-+ /*
-+ * Taking write lock on mmap does not harm others,
-+ * but it's crucial for uksm to avoid races.
-+ */
-+ mmap_write_lock(mm);
-+
- if (mm->locked_vm) {
- vma = mm->mmap;
- while (vma) {
-@@ -3230,6 +3260,11 @@ void exit_mmap(struct mm_struct *mm)
- cond_resched();
- }
- vm_unacct_memory(nr_accounted);
-+
-+ mm->mmap = NULL;
-+ mm->mm_rb = RB_ROOT;
-+ vmacache_invalidate(mm);
-+ mmap_write_unlock(mm);
- }
-
- /* Insert vm structure into process list sorted by address
-@@ -3337,6 +3372,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
- new_vma->vm_ops->open(new_vma);
- vma_link(mm, new_vma, prev, rb_link, rb_parent);
- *need_rmap_locks = false;
-+ uksm_vma_add_new(new_vma);
- }
- return new_vma;
-
-@@ -3505,6 +3541,7 @@ static struct vm_area_struct *__install_special_mapping(
- vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT);
-
- perf_event_mmap(vma);
-+ uksm_vma_add_new(vma);
-
- return vma;
-
-diff --git a/mm/uksm.c b/mm/uksm.c
-new file mode 100644
-index 000000000..e4732c00b
---- /dev/null
-+++ b/mm/uksm.c
-@@ -0,0 +1,5614 @@
-+/*
-+ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
-+ *
-+ * This is an improvement upon KSM. Some basic data structures and routines
-+ * are borrowed from ksm.c .
-+ *
-+ * Its new features:
-+ * 1. Full system scan:
-+ * It automatically scans all user processes' anonymous VMAs. Kernel-user
-+ * interaction to submit a memory area to KSM is no longer needed.
-+ *
-+ * 2. Rich area detection:
-+ * It automatically detects rich areas containing abundant duplicated
-+ * pages based. Rich areas are given a full scan speed. Poor areas are
-+ * sampled at a reasonable speed with very low CPU consumption.
-+ *
-+ * 3. Ultra Per-page scan speed improvement:
-+ * A new hash algorithm is proposed. As a result, on a machine with
-+ * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it
-+ * can scan memory areas that does not contain duplicated pages at speed of
-+ * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of
-+ * 477MB/sec ~ 923MB/sec.
-+ *
-+ * 4. Thrashing area avoidance:
-+ * Thrashing area(an VMA that has frequent Ksm page break-out) can be
-+ * filtered out. My benchmark shows it's more efficient than KSM's per-page
-+ * hash value based volatile page detection.
-+ *
-+ *
-+ * 5. Misc changes upon KSM:
-+ * * It has a fully x86-opitmized memcmp dedicated for 4-byte-aligned page
-+ * comparison. It's much faster than default C version on x86.
-+ * * rmap_item now has an struct *page member to loosely cache a
-+ * address-->page mapping, which reduces too much time-costly
-+ * follow_page().
-+ * * The VMA creation/exit procedures are hooked to let the Ultra KSM know.
-+ * * try_to_merge_two_pages() now can revert a pte if it fails. No break_
-+ * ksm is needed for this case.
-+ *
-+ * 6. Full Zero Page consideration(contributed by Figo Zhang)
-+ * Now uksmd consider full zero pages as special pages and merge them to an
-+ * special unswappable uksm zero page.
-+ */
-+
-+#include <linux/errno.h>
-+#include <linux/mm.h>
-+#include <linux/fs.h>
-+#include <linux/mman.h>
-+#include <linux/sched.h>
-+#include <linux/sched/mm.h>
-+#include <linux/sched/coredump.h>
-+#include <linux/sched/cputime.h>
-+#include <linux/rwsem.h>
-+#include <linux/pagemap.h>
-+#include <linux/rmap.h>
-+#include <linux/spinlock.h>
-+#include <linux/jhash.h>
-+#include <linux/delay.h>
-+#include <linux/kthread.h>
-+#include <linux/wait.h>
-+#include <linux/slab.h>
-+#include <linux/rbtree.h>
-+#include <linux/memory.h>
-+#include <linux/mmu_notifier.h>
-+#include <linux/swap.h>
-+#include <linux/ksm.h>
-+#include <linux/crypto.h>
-+#include <linux/scatterlist.h>
-+#include <crypto/hash.h>
-+#include <linux/random.h>
-+#include <linux/math64.h>
-+#include <linux/gcd.h>
-+#include <linux/freezer.h>
-+#include <linux/oom.h>
-+#include <linux/numa.h>
-+#include <linux/sradix-tree.h>
-+
-+#include <asm/tlbflush.h>
-+#include "internal.h"
-+
-+#ifdef CONFIG_X86
-+#undef memcmp
-+
-+#ifdef CONFIG_X86_32
-+#define memcmp memcmpx86_32
-+/*
-+ * Compare 4-byte-aligned address s1 and s2, with length n
-+ */
-+int memcmpx86_32(void *s1, void *s2, size_t n)
-+{
-+ size_t num = n / 4;
-+ register int res;
-+
-+ __asm__ __volatile__
-+ (
-+ "testl %3,%3\n\t"
-+ "repe; cmpsd\n\t"
-+ "je 1f\n\t"
-+ "sbbl %0,%0\n\t"
-+ "orl $1,%0\n"
-+ "1:"
-+ : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
-+ : "0" (0)
-+ : "cc");
-+
-+ return res;
-+}
-+
-+/*
-+ * Check the page is all zero ?
-+ */
-+static int is_full_zero(const void *s1, size_t len)
-+{
-+ unsigned char same;
-+
-+ len /= 4;
-+
-+ __asm__ __volatile__
-+ ("repe; scasl;"
-+ "sete %0"
-+ : "=qm" (same), "+D" (s1), "+c" (len)
-+ : "a" (0)
-+ : "cc");
-+
-+ return same;
-+}
-+
-+
-+#elif defined(CONFIG_X86_64)
-+#define memcmp memcmpx86_64
-+/*
-+ * Compare 8-byte-aligned address s1 and s2, with length n
-+ */
-+int memcmpx86_64(void *s1, void *s2, size_t n)
-+{
-+ size_t num = n / 8;
-+ register int res;
-+
-+ __asm__ __volatile__
-+ (
-+ "testq %q3,%q3\n\t"
-+ "repe; cmpsq\n\t"
-+ "je 1f\n\t"
-+ "sbbq %q0,%q0\n\t"
-+ "orq $1,%q0\n"
-+ "1:"
-+ : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
-+ : "0" (0)
-+ : "cc");
-+
-+ return res;
-+}
-+
-+static int is_full_zero(const void *s1, size_t len)
-+{
-+ unsigned char same;
-+
-+ len /= 8;
-+
-+ __asm__ __volatile__
-+ ("repe; scasq;"
-+ "sete %0"
-+ : "=qm" (same), "+D" (s1), "+c" (len)
-+ : "a" (0)
-+ : "cc");
-+
-+ return same;
-+}
-+
-+#endif
-+#else
-+static int is_full_zero(const void *s1, size_t len)
-+{
-+ unsigned long *src = s1;
-+ int i;
-+
-+ len /= sizeof(*src);
-+
-+ for (i = 0; i < len; i++) {
-+ if (src[i])
-+ return 0;
-+ }
-+
-+ return 1;
-+}
-+#endif
-+
-+#define UKSM_RUNG_ROUND_FINISHED (1 << 0)
-+#define TIME_RATIO_SCALE 10000
-+
-+#define SLOT_TREE_NODE_SHIFT 8
-+#define SLOT_TREE_NODE_STORE_SIZE (1UL << SLOT_TREE_NODE_SHIFT)
-+struct slot_tree_node {
-+ unsigned long size;
-+ struct sradix_tree_node snode;
-+ void *stores[SLOT_TREE_NODE_STORE_SIZE];
-+};
-+
-+static struct kmem_cache *slot_tree_node_cachep;
-+
-+static struct sradix_tree_node *slot_tree_node_alloc(void)
-+{
-+ struct slot_tree_node *p;
-+
-+ p = kmem_cache_zalloc(slot_tree_node_cachep, GFP_KERNEL |
-+ __GFP_NORETRY | __GFP_NOWARN);
-+ if (!p)
-+ return NULL;
-+
-+ return &p->snode;
-+}
-+
-+static void slot_tree_node_free(struct sradix_tree_node *node)
-+{
-+ struct slot_tree_node *p;
-+
-+ p = container_of(node, struct slot_tree_node, snode);
-+ kmem_cache_free(slot_tree_node_cachep, p);
-+}
-+
-+static void slot_tree_node_extend(struct sradix_tree_node *parent,
-+ struct sradix_tree_node *child)
-+{
-+ struct slot_tree_node *p, *c;
-+
-+ p = container_of(parent, struct slot_tree_node, snode);
-+ c = container_of(child, struct slot_tree_node, snode);
-+
-+ p->size += c->size;
-+}
-+
-+void slot_tree_node_assign(struct sradix_tree_node *node,
-+ unsigned int index, void *item)
-+{
-+ struct vma_slot *slot = item;
-+ struct slot_tree_node *cur;
-+
-+ slot->snode = node;
-+ slot->sindex = index;
-+
-+ while (node) {
-+ cur = container_of(node, struct slot_tree_node, snode);
-+ cur->size += slot->pages;
-+ node = node->parent;
-+ }
-+}
-+
-+void slot_tree_node_rm(struct sradix_tree_node *node, unsigned int offset)
-+{
-+ struct vma_slot *slot;
-+ struct slot_tree_node *cur;
-+ unsigned long pages;
-+
-+ if (node->height == 1) {
-+ slot = node->stores[offset];
-+ pages = slot->pages;
-+ } else {
-+ cur = container_of(node->stores[offset],
-+ struct slot_tree_node, snode);
-+ pages = cur->size;
-+ }
-+
-+ while (node) {
-+ cur = container_of(node, struct slot_tree_node, snode);
-+ cur->size -= pages;
-+ node = node->parent;
-+ }
-+}
-+
-+unsigned long slot_iter_index;
-+int slot_iter(void *item, unsigned long height)
-+{
-+ struct slot_tree_node *node;
-+ struct vma_slot *slot;
-+
-+ if (height == 1) {
-+ slot = item;
-+ if (slot_iter_index < slot->pages) {
-+ /*in this one*/
-+ return 1;
-+ } else {
-+ slot_iter_index -= slot->pages;
-+ return 0;
-+ }
-+
-+ } else {
-+ node = container_of(item, struct slot_tree_node, snode);
-+ if (slot_iter_index < node->size) {
-+ /*in this one*/
-+ return 1;
-+ } else {
-+ slot_iter_index -= node->size;
-+ return 0;
-+ }
-+ }
-+}
-+
-+
-+static inline void slot_tree_init_root(struct sradix_tree_root *root)
-+{
-+ init_sradix_tree_root(root, SLOT_TREE_NODE_SHIFT);
-+ root->alloc = slot_tree_node_alloc;
-+ root->free = slot_tree_node_free;
-+ root->extend = slot_tree_node_extend;
-+ root->assign = slot_tree_node_assign;
-+ root->rm = slot_tree_node_rm;
-+}
-+
-+void slot_tree_init(void)
-+{
-+ slot_tree_node_cachep = kmem_cache_create("slot_tree_node",
-+ sizeof(struct slot_tree_node), 0,
-+ SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
-+ NULL);
-+}
-+
-+
-+/* Each rung of this ladder is a list of VMAs having a same scan ratio */
-+struct scan_rung {
-+ //struct list_head scanned_list;
-+ struct sradix_tree_root vma_root;
-+ struct sradix_tree_root vma_root2;
-+
-+ struct vma_slot *current_scan;
-+ unsigned long current_offset;
-+
-+ /*
-+ * The initial value for current_offset, it should loop over
-+ * [0~ step - 1] to let all slot have its chance to be scanned.
-+ */
-+ unsigned long offset_init;
-+ unsigned long step; /* dynamic step for current_offset */
-+ unsigned int flags;
-+ unsigned long pages_to_scan;
-+ //unsigned long fully_scanned_slots;
-+ /*
-+ * a little bit tricky - if cpu_time_ratio > 0, then the value is the
-+ * the cpu time ratio it can spend in rung_i for every scan
-+ * period. if < 0, then it is the cpu time ratio relative to the
-+ * max cpu percentage user specified. Both in unit of
-+ * 1/TIME_RATIO_SCALE
-+ */
-+ int cpu_ratio;
-+
-+ /*
-+ * How long it will take for all slots in this rung to be fully
-+ * scanned? If it's zero, we don't care about the cover time:
-+ * it's fully scanned.
-+ */
-+ unsigned int cover_msecs;
-+ //unsigned long vma_num;
-+ //unsigned long pages; /* Sum of all slot's pages in rung */
-+};
-+
-+/**
-+ * node of either the stable or unstale rbtree
-+ *
-+ */
-+struct tree_node {
-+ struct rb_node node; /* link in the main (un)stable rbtree */
-+ struct rb_root sub_root; /* rb_root for sublevel collision rbtree */
-+ u32 hash;
-+ unsigned long count; /* TODO: merged with sub_root */
-+ struct list_head all_list; /* all tree nodes in stable/unstable tree */
-+};
-+
-+/**
-+ * struct stable_node - node of the stable rbtree
-+ * @node: rb node of this ksm page in the stable tree
-+ * @hlist: hlist head of rmap_items using this ksm page
-+ * @kpfn: page frame number of this ksm page
-+ */
-+struct stable_node {
-+ struct rb_node node; /* link in sub-rbtree */
-+ struct tree_node *tree_node; /* it's tree node root in stable tree, NULL if it's in hell list */
-+ struct hlist_head hlist;
-+ unsigned long kpfn;
-+ u32 hash_max; /* if ==0 then it's not been calculated yet */
-+ struct list_head all_list; /* in a list for all stable nodes */
-+};
-+
-+/**
-+ * struct node_vma - group rmap_items linked in a same stable
-+ * node together.
-+ */
-+struct node_vma {
-+ union {
-+ struct vma_slot *slot;
-+ unsigned long key; /* slot is used as key sorted on hlist */
-+ };
-+ struct hlist_node hlist;
-+ struct hlist_head rmap_hlist;
-+ struct stable_node *head;
-+};
-+
-+/**
-+ * struct rmap_item - reverse mapping item for virtual addresses
-+ * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
-+ * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
-+ * @mm: the memory structure this rmap_item is pointing into
-+ * @address: the virtual address this rmap_item tracks (+ flags in low bits)
-+ * @node: rb node of this rmap_item in the unstable tree
-+ * @head: pointer to stable_node heading this list in the stable tree
-+ * @hlist: link into hlist of rmap_items hanging off that stable_node
-+ */
-+struct rmap_item {
-+ struct vma_slot *slot;
-+ struct page *page;
-+ unsigned long address; /* + low bits used for flags below */
-+ unsigned long hash_round;
-+ unsigned long entry_index;
-+ union {
-+ struct {/* when in unstable tree */
-+ struct rb_node node;
-+ struct tree_node *tree_node;
-+ u32 hash_max;
-+ };
-+ struct { /* when in stable tree */
-+ struct node_vma *head;
-+ struct hlist_node hlist;
-+ struct anon_vma *anon_vma;
-+ };
-+ };
-+} __aligned(4);
-+
-+struct rmap_list_entry {
-+ union {
-+ struct rmap_item *item;
-+ unsigned long addr;
-+ };
-+ /* lowest bit is used for is_addr tag */
-+} __aligned(4); /* 4 aligned to fit in to pages*/
-+
-+
-+/* Basic data structure definition ends */
-+
-+
-+/*
-+ * Flags for rmap_item to judge if it's listed in the stable/unstable tree.
-+ * The flags use the low bits of rmap_item.address
-+ */
-+#define UNSTABLE_FLAG 0x1
-+#define STABLE_FLAG 0x2
-+#define get_rmap_addr(x) ((x)->address & PAGE_MASK)
-+
-+/*
-+ * rmap_list_entry helpers
-+ */
-+#define IS_ADDR_FLAG 1
-+#define is_addr(ptr) ((unsigned long)(ptr) & IS_ADDR_FLAG)
-+#define set_is_addr(ptr) ((ptr) |= IS_ADDR_FLAG)
-+#define get_clean_addr(ptr) (((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG))
-+
-+
-+/*
-+ * High speed caches for frequently allocated and freed structs
-+ */
-+static struct kmem_cache *rmap_item_cache;
-+static struct kmem_cache *stable_node_cache;
-+static struct kmem_cache *node_vma_cache;
-+static struct kmem_cache *vma_slot_cache;
-+static struct kmem_cache *tree_node_cache;
-+#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\
-+ sizeof(struct __struct), __alignof__(struct __struct),\
-+ (__flags), NULL)
-+
-+/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */
-+#define SCAN_LADDER_SIZE 4
-+static struct scan_rung uksm_scan_ladder[SCAN_LADDER_SIZE];
-+
-+/* The evaluation rounds uksmd has finished */
-+static unsigned long long uksm_eval_round = 1;
-+
-+/*
-+ * we add 1 to this var when we consider we should rebuild the whole
-+ * unstable tree.
-+ */
-+static unsigned long uksm_hash_round = 1;
-+
-+/*
-+ * How many times the whole memory is scanned.
-+ */
-+static unsigned long long fully_scanned_round = 1;
-+
-+/* The total number of virtual pages of all vma slots */
-+static u64 uksm_pages_total;
-+
-+/* The number of pages has been scanned since the start up */
-+static u64 uksm_pages_scanned;
-+
-+static u64 scanned_virtual_pages;
-+
-+/* The number of pages has been scanned since last encode_benefit call */
-+static u64 uksm_pages_scanned_last;
-+
-+/* If the scanned number is tooo large, we encode it here */
-+static u64 pages_scanned_stored;
-+
-+static unsigned long pages_scanned_base;
-+
-+/* The number of nodes in the stable tree */
-+static unsigned long uksm_pages_shared;
-+
-+/* The number of page slots additionally sharing those nodes */
-+static unsigned long uksm_pages_sharing;
-+
-+/* The number of nodes in the unstable tree */
-+static unsigned long uksm_pages_unshared;
-+
-+/*
-+ * Milliseconds ksmd should sleep between scans,
-+ * >= 100ms to be consistent with
-+ * scan_time_to_sleep_msec()
-+ */
-+static unsigned int uksm_sleep_jiffies;
-+
-+/* The real value for the uksmd next sleep */
-+static unsigned int uksm_sleep_real;
-+
-+/* Saved value for user input uksm_sleep_jiffies when it's enlarged */
-+static unsigned int uksm_sleep_saved;
-+
-+/* Max percentage of cpu utilization ksmd can take to scan in one batch */
-+static unsigned int uksm_max_cpu_percentage;
-+
-+static int uksm_cpu_governor;
-+
-+static char *uksm_cpu_governor_str[4] = { "full", "medium", "low", "quiet" };
-+
-+struct uksm_cpu_preset_s {
-+ int cpu_ratio[SCAN_LADDER_SIZE];
-+ unsigned int cover_msecs[SCAN_LADDER_SIZE];
-+ unsigned int max_cpu; /* percentage */
-+};
-+
-+struct uksm_cpu_preset_s uksm_cpu_preset[4] = {
-+ { {20, 40, -2500, -10000}, {1000, 500, 200, 50}, 95},
-+ { {20, 30, -2500, -10000}, {1000, 500, 400, 100}, 50},
-+ { {10, 20, -5000, -10000}, {1500, 1000, 1000, 250}, 20},
-+ { {10, 20, 40, 75}, {2000, 1000, 1000, 1000}, 1},
-+};
-+
-+/* The default value for uksm_ema_page_time if it's not initialized */
-+#define UKSM_PAGE_TIME_DEFAULT 500
-+
-+/*cost to scan one page by expotional moving average in nsecs */
-+static unsigned long uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;
-+
-+/* The expotional moving average alpha weight, in percentage. */
-+#define EMA_ALPHA 20
-+
-+/*
-+ * The threshold used to filter out thrashing areas,
-+ * If it == 0, filtering is disabled, otherwise it's the percentage up-bound
-+ * of the thrashing ratio of all areas. Any area with a bigger thrashing ratio
-+ * will be considered as having a zero duplication ratio.
-+ */
-+static unsigned int uksm_thrash_threshold = 50;
-+
-+/* How much dedup ratio is considered to be abundant*/
-+static unsigned int uksm_abundant_threshold = 10;
-+
-+/* All slots having merged pages in this eval round. */
-+struct list_head vma_slot_dedup = LIST_HEAD_INIT(vma_slot_dedup);
-+
-+/* How many times the ksmd has slept since startup */
-+static unsigned long long uksm_sleep_times;
-+
-+#define UKSM_RUN_STOP 0
-+#define UKSM_RUN_MERGE 1
-+static unsigned int uksm_run = 1;
-+
-+static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait);
-+static DEFINE_MUTEX(uksm_thread_mutex);
-+
-+/*
-+ * List vma_slot_new is for newly created vma_slot waiting to be added by
-+ * ksmd. If one cannot be added(e.g. due to it's too small), it's moved to
-+ * vma_slot_noadd. vma_slot_del is the list for vma_slot whose corresponding
-+ * VMA has been removed/freed.
-+ */
-+struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new);
-+struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd);
-+struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del);
-+static DEFINE_SPINLOCK(vma_slot_list_lock);
-+
-+/* The unstable tree heads */
-+static struct rb_root root_unstable_tree = RB_ROOT;
-+
-+/*
-+ * All tree_nodes are in a list to be freed at once when unstable tree is
-+ * freed after each scan round.
-+ */
-+static struct list_head unstable_tree_node_list =
-+ LIST_HEAD_INIT(unstable_tree_node_list);
-+
-+/* List contains all stable nodes */
-+static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list);
-+
-+/*
-+ * When the hash strength is changed, the stable tree must be delta-hashed and
-+ * re-structured. We use two sets of the structs below to speed up the
-+ * re-structuring of the stable tree.
-+ */
-+static struct list_head
-+stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]),
-+ LIST_HEAD_INIT(stable_tree_node_list[1])};
-+
-+static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0];
-+static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT};
-+static struct rb_root *root_stable_treep = &root_stable_tree[0];
-+static unsigned long stable_tree_index;
-+
-+/* The hash strength needed to hash a full page */
-+#define HASH_STRENGTH_FULL (PAGE_SIZE / sizeof(u32))
-+
-+/* The hash strength needed for loop-back hashing */
-+#define HASH_STRENGTH_MAX (HASH_STRENGTH_FULL + 10)
-+
-+/* The random offsets in a page */
-+static u32 *random_nums;
-+
-+/* The hash strength */
-+static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4;
-+
-+/* The delta value each time the hash strength increases or decreases */
-+static unsigned long hash_strength_delta;
-+#define HASH_STRENGTH_DELTA_MAX 5
-+
-+/* The time we have saved due to random_sample_hash */
-+static u64 rshash_pos;
-+
-+/* The time we have wasted due to hash collision */
-+static u64 rshash_neg;
-+
-+struct uksm_benefit {
-+ u64 pos;
-+ u64 neg;
-+ u64 scanned;
-+ unsigned long base;
-+} benefit;
-+
-+/*
-+ * The relative cost of memcmp, compared to 1 time unit of random sample
-+ * hash; this value is measured when the ksm module is initialized.
-+ */
-+static unsigned long memcmp_cost;
-+
-+static unsigned long rshash_neg_cont_zero;
-+static unsigned long rshash_cont_obscure;
-+
-+/* The possible states of hash strength adjustment heuristic */
-+enum rshash_states {
-+ RSHASH_STILL,
-+ RSHASH_TRYUP,
-+ RSHASH_TRYDOWN,
-+ RSHASH_NEW,
-+ RSHASH_PRE_STILL,
-+};
-+
-+/* The possible direction we are about to adjust hash strength */
-+enum rshash_direct {
-+ GO_UP,
-+ GO_DOWN,
-+ OBSCURE,
-+ STILL,
-+};
-+
-+/* random sampling hash state machine */
-+static struct {
-+ enum rshash_states state;
-+ enum rshash_direct pre_direct;
-+ u8 below_count;
-+ /* Keep a lookup window of size 5; if above_count/below_count > 3
-+ * within this window, we stop trying.
-+ */
-+ u8 lookup_window_index;
-+ u64 stable_benefit;
-+ unsigned long turn_point_down;
-+ unsigned long turn_benefit_down;
-+ unsigned long turn_point_up;
-+ unsigned long turn_benefit_up;
-+ unsigned long stable_point;
-+} rshash_state;
-+
-+/* Zero page hash table, indexed by hash_strength in [0, HASH_STRENGTH_MAX] */
-+static u32 *zero_hash_table;
-+
-+static inline struct node_vma *alloc_node_vma(void)
-+{
-+ struct node_vma *node_vma;
-+
-+ node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL |
-+ __GFP_NORETRY | __GFP_NOWARN);
-+ if (node_vma) {
-+ INIT_HLIST_HEAD(&node_vma->rmap_hlist);
-+ INIT_HLIST_NODE(&node_vma->hlist);
-+ }
-+ return node_vma;
-+}
-+
-+static inline void free_node_vma(struct node_vma *node_vma)
-+{
-+ kmem_cache_free(node_vma_cache, node_vma);
-+}
-+
-+
-+static inline struct vma_slot *alloc_vma_slot(void)
-+{
-+ struct vma_slot *slot;
-+
-+ /*
-+ * Bail out in case uksm is not initialized yet; the call site of
-+ * uksm_init() may need to be reconsidered in the future.
-+ */
-+ if (!vma_slot_cache)
-+ return NULL;
-+
-+ slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL |
-+ __GFP_NORETRY | __GFP_NOWARN);
-+ if (slot) {
-+ INIT_LIST_HEAD(&slot->slot_list);
-+ INIT_LIST_HEAD(&slot->dedup_list);
-+ slot->flags |= UKSM_SLOT_NEED_RERAND;
-+ }
-+ return slot;
-+}
-+
-+static inline void free_vma_slot(struct vma_slot *vma_slot)
-+{
-+ kmem_cache_free(vma_slot_cache, vma_slot);
-+}
-+
-+
-+
-+static inline struct rmap_item *alloc_rmap_item(void)
-+{
-+ struct rmap_item *rmap_item;
-+
-+ rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
-+ __GFP_NORETRY | __GFP_NOWARN);
-+ if (rmap_item) {
-+ /* BUG if the lowest bit is not clear; it is reserved for flag use */
-+ BUG_ON(is_addr(rmap_item));
-+ }
-+ return rmap_item;
-+}
-+
-+static inline void free_rmap_item(struct rmap_item *rmap_item)
-+{
-+ rmap_item->slot = NULL; /* debug safety */
-+ kmem_cache_free(rmap_item_cache, rmap_item);
-+}
-+
-+static inline struct stable_node *alloc_stable_node(void)
-+{
-+ struct stable_node *node;
-+
-+ node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL |
-+ __GFP_NORETRY | __GFP_NOWARN);
-+ if (!node)
-+ return NULL;
-+
-+ INIT_HLIST_HEAD(&node->hlist);
-+ list_add(&node->all_list, &stable_node_list);
-+ return node;
-+}
-+
-+static inline void free_stable_node(struct stable_node *stable_node)
-+{
-+ list_del(&stable_node->all_list);
-+ kmem_cache_free(stable_node_cache, stable_node);
-+}
-+
-+static inline struct tree_node *alloc_tree_node(struct list_head *list)
-+{
-+ struct tree_node *node;
-+
-+ node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL |
-+ __GFP_NORETRY | __GFP_NOWARN);
-+ if (!node)
-+ return NULL;
-+
-+ list_add(&node->all_list, list);
-+ return node;
-+}
-+
-+static inline void free_tree_node(struct tree_node *node)
-+{
-+ list_del(&node->all_list);
-+ kmem_cache_free(tree_node_cache, node);
-+}
-+
-+static void uksm_drop_anon_vma(struct rmap_item *rmap_item)
-+{
-+ struct anon_vma *anon_vma = rmap_item->anon_vma;
-+
-+ put_anon_vma(anon_vma);
-+}
-+
-+
-+/**
-+ * Remove a stable node from the stable tree; it may be unlinked from its
-+ * tree_node, and its parent tree_node may be removed if no other stable node
-+ * is pending on it.
-+ *
-+ * @stable_node The node to be removed
-+ * @unlink_rb Will this node be unlinked from the rbtree?
-+ * @remove_tree_node Will its tree_node be removed if empty?
-+ */
-+static void remove_node_from_stable_tree(struct stable_node *stable_node,
-+ int unlink_rb, int remove_tree_node)
-+{
-+ struct node_vma *node_vma;
-+ struct rmap_item *rmap_item;
-+ struct hlist_node *n;
-+
-+ if (!hlist_empty(&stable_node->hlist)) {
-+ hlist_for_each_entry_safe(node_vma, n,
-+ &stable_node->hlist, hlist) {
-+ hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) {
-+ uksm_pages_sharing--;
-+
-+ uksm_drop_anon_vma(rmap_item);
-+ rmap_item->address &= PAGE_MASK;
-+ }
-+ free_node_vma(node_vma);
-+ cond_resched();
-+ }
-+
-+ /* the last one is counted as shared */
-+ uksm_pages_shared--;
-+ uksm_pages_sharing++;
-+ }
-+
-+ if (stable_node->tree_node && unlink_rb) {
-+ rb_erase(&stable_node->node,
-+ &stable_node->tree_node->sub_root);
-+
-+ if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) &&
-+ remove_tree_node) {
-+ rb_erase(&stable_node->tree_node->node,
-+ root_stable_treep);
-+ free_tree_node(stable_node->tree_node);
-+ } else {
-+ stable_node->tree_node->count--;
-+ }
-+ }
-+
-+ free_stable_node(stable_node);
-+}
-+
-+
-+/*
-+ * get_uksm_page: checks if the page indicated by the stable node
-+ * is still its ksm page, despite having held no reference to it.
-+ * In which case we can trust the content of the page, and it
-+ * returns the gotten page; but if the page has now been zapped,
-+ * remove the stale node from the stable tree and return NULL.
-+ *
-+ * You would expect the stable_node to hold a reference to the ksm page.
-+ * But if it increments the page's count, swapping out has to wait for
-+ * ksmd to come around again before it can free the page, which may take
-+ * seconds or even minutes: much too unresponsive. So instead we use a
-+ * "keyhole reference": access to the ksm page from the stable node peeps
-+ * out through its keyhole to see if that page still holds the right key,
-+ * pointing back to this stable node. This relies on freeing a PageAnon
-+ * page to reset its page->mapping to NULL, and relies on no other use of
-+ * a page to put something that might look like our key in page->mapping.
-+ *
-+ * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
-+ * but this is different - made simpler by uksm_thread_mutex being held, but
-+ * interesting for assuming that no other use of the struct page could ever
-+ * put our expected_mapping into page->mapping (or a field of the union which
-+ * coincides with page->mapping). The RCU calls are not for KSM at all, but
-+ * to keep the page_count protocol described with page_cache_get_speculative.
-+ *
-+ * Note: it is possible that get_uksm_page() will return NULL one moment,
-+ * then page the next, if the page is in between page_freeze_refs() and
-+ * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
-+ * is on its way to being freed; but it is an anomaly to bear in mind.
-+ *
-+ * @unlink_rb: whether removing this node should first unlink it from
-+ * its rbtree. stable_node_reinsert will prevent this when restructuring the
-+ * node from its old tree.
-+ *
-+ * @remove_tree_node: if this is the last one of its tree_node, will the
-+ * tree_node be freed? If we are inserting a stable node, this tree_node may
-+ * be reused, so don't free it.
-+ */
-+static struct page *get_uksm_page(struct stable_node *stable_node,
-+ int unlink_rb, int remove_tree_node)
-+{
-+ struct page *page;
-+ void *expected_mapping;
-+ unsigned long kpfn;
-+
-+ expected_mapping = (void *)((unsigned long)stable_node |
-+ PAGE_MAPPING_KSM);
-+again:
-+ kpfn = READ_ONCE(stable_node->kpfn);
-+ page = pfn_to_page(kpfn);
-+
-+ /*
-+ * page is computed from kpfn, so on most architectures reading
-+ * page->mapping is naturally ordered after reading node->kpfn,
-+ * but on Alpha we need to be more careful.
-+ */
-+ smp_rmb();
-+
-+ if (READ_ONCE(page->mapping) != expected_mapping)
-+ goto stale;
-+
-+ /*
-+ * We cannot do anything with the page while its refcount is 0.
-+ * Usually 0 means free, or tail of a higher-order page: in which
-+ * case this node is no longer referenced, and should be freed;
-+ * however, it might mean that the page is under page_freeze_refs().
-+ * The __remove_mapping() case is easy, again the node is now stale;
-+ * but if page is swapcache in migrate_page_move_mapping(), it might
-+ * still be our page, in which case it's essential to keep the node.
-+ */
-+ while (!get_page_unless_zero(page)) {
-+ /*
-+ * Another check for page->mapping != expected_mapping would
-+ * work here too. We have chosen the !PageSwapCache test to
-+ * optimize the common case, when the page is or is about to
-+ * be freed: PageSwapCache is cleared (under spin_lock_irq)
-+ * in the freeze_refs section of __remove_mapping(); but Anon
-+ * page->mapping reset to NULL later, in free_pages_prepare().
-+ */
-+ if (!PageSwapCache(page))
-+ goto stale;
-+ cpu_relax();
-+ }
-+
-+ if (READ_ONCE(page->mapping) != expected_mapping) {
-+ put_page(page);
-+ goto stale;
-+ }
-+
-+ lock_page(page);
-+ if (READ_ONCE(page->mapping) != expected_mapping) {
-+ unlock_page(page);
-+ put_page(page);
-+ goto stale;
-+ }
-+ unlock_page(page);
-+ return page;
-+stale:
-+ /*
-+ * We come here from above when page->mapping or !PageSwapCache
-+ * suggests that the node is stale; but it might be under migration.
-+ * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
-+ * before checking whether node->kpfn has been changed.
-+ */
-+ smp_rmb();
-+ if (stable_node->kpfn != kpfn)
-+ goto again;
-+
-+ remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node);
-+
-+ return NULL;
-+}
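The "keyhole reference" described above boils down to a speculative refcount grab followed by an identity re-check. As a rough userspace analogue (C11 atomics and a toy struct standing in for the kernel's struct page machinery; names and the single-threaded main are purely illustrative), the pattern looks like this:

#include <stdatomic.h>
#include <stddef.h>

struct fake_page {
	atomic_int refcount;
	void *mapping;             /* stands in for page->mapping */
};

/* Analogue of get_page_unless_zero(): take a reference only if one exists. */
static int get_ref_unless_zero(struct fake_page *page)
{
	int old = atomic_load(&page->refcount);

	while (old != 0) {
		if (atomic_compare_exchange_weak(&page->refcount, &old, old + 1))
			return 1;
	}
	return 0;
}

/*
 * Keyhole lookup: peek at the page through the node, grab a speculative
 * reference, then re-check that the page still points back at our node.
 * If either step fails, the node is stale and the caller must clean it up.
 */
static struct fake_page *keyhole_get(struct fake_page *page, void *expected_mapping)
{
	if (page->mapping != expected_mapping)
		return NULL;                       /* stale before we even tried */
	if (!get_ref_unless_zero(page))
		return NULL;                       /* page was being freed */
	if (page->mapping != expected_mapping) {   /* re-check under the reference */
		atomic_fetch_sub(&page->refcount, 1);
		return NULL;
	}
	return page;
}

int main(void)
{
	struct fake_page page = { .refcount = 1, .mapping = &page };

	return keyhole_get(&page, &page) ? 0 : 1;
}

The real function adds the swapcache and page-lock re-checks because a kernel page can be half-way through freeing or migration while this dance is in progress.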
-+
-+/*
-+ * Removing rmap_item from stable or unstable tree.
-+ * This function will clean the information from the stable/unstable tree.
-+ */
-+static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
-+{
-+ if (rmap_item->address & STABLE_FLAG) {
-+ struct stable_node *stable_node;
-+ struct node_vma *node_vma;
-+ struct page *page;
-+
-+ node_vma = rmap_item->head;
-+ stable_node = node_vma->head;
-+ page = get_uksm_page(stable_node, 1, 1);
-+ if (!page)
-+ goto out;
-+
-+ /*
-+ * page lock is needed because it's racing with
-+ * try_to_unmap_ksm(), etc.
-+ */
-+ lock_page(page);
-+ hlist_del(&rmap_item->hlist);
-+
-+ if (hlist_empty(&node_vma->rmap_hlist)) {
-+ hlist_del(&node_vma->hlist);
-+ free_node_vma(node_vma);
-+ }
-+ unlock_page(page);
-+
-+ put_page(page);
-+ if (hlist_empty(&stable_node->hlist)) {
-+ /* do NOT call remove_node_from_stable_tree() here:
-+ * a forked rmap_item may be outside the stable
-+ * tree while the in-tree rmap_items have already
-+ * been deleted.
-+ */
-+ uksm_pages_shared--;
-+ } else
-+ uksm_pages_sharing--;
-+
-+
-+ uksm_drop_anon_vma(rmap_item);
-+ } else if (rmap_item->address & UNSTABLE_FLAG) {
-+ if (rmap_item->hash_round == uksm_hash_round) {
-+
-+ rb_erase(&rmap_item->node,
-+ &rmap_item->tree_node->sub_root);
-+ if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) {
-+ rb_erase(&rmap_item->tree_node->node,
-+ &root_unstable_tree);
-+
-+ free_tree_node(rmap_item->tree_node);
-+ } else
-+ rmap_item->tree_node->count--;
-+ }
-+ uksm_pages_unshared--;
-+ }
-+
-+ rmap_item->address &= PAGE_MASK;
-+ rmap_item->hash_max = 0;
-+
-+out:
-+ cond_resched(); /* we're called from many long loops */
-+}
-+
-+static inline int slot_in_uksm(struct vma_slot *slot)
-+{
-+ return list_empty(&slot->slot_list);
-+}
-+
-+/*
-+ * Test if the mm is exiting
-+ */
-+static inline bool uksm_test_exit(struct mm_struct *mm)
-+{
-+ return atomic_read(&mm->mm_users) == 0;
-+}
-+
-+static inline unsigned long vma_pool_size(struct vma_slot *slot)
-+{
-+ return round_up(sizeof(struct rmap_list_entry) * slot->pages,
-+ PAGE_SIZE) >> PAGE_SHIFT;
-+}
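vma_pool_size() simply asks how many whole pages are needed to hold one rmap_list_entry per page of the VMA. A quick worked example (the 16-byte entry size below is an assumption for illustration, not taken from the patch):

#include <stdio.h>

#define PAGE_SIZE_BYTES 4096UL
#define ENTRY_SIZE      16UL   /* assumed sizeof(struct rmap_list_entry) */

int main(void)
{
	unsigned long pages = 1000;  /* pages spanned by the VMA */
	unsigned long pool  = (ENTRY_SIZE * pages + PAGE_SIZE_BYTES - 1)
			      / PAGE_SIZE_BYTES;

	printf("%lu pool pages needed for a %lu-page VMA\n", pool, pages);  /* 4 */
	return 0;
}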
-+
-+#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta))
-+
-+/* must be done with sem locked */
-+static int slot_pool_alloc(struct vma_slot *slot)
-+{
-+ unsigned long pool_size;
-+
-+ if (slot->rmap_list_pool)
-+ return 0;
-+
-+ pool_size = vma_pool_size(slot);
-+ slot->rmap_list_pool = kcalloc(pool_size, sizeof(struct page *),
-+ GFP_KERNEL);
-+ if (!slot->rmap_list_pool)
-+ return -ENOMEM;
-+
-+ slot->pool_counts = kcalloc(pool_size, sizeof(unsigned int),
-+ GFP_KERNEL);
-+ if (!slot->pool_counts) {
-+ kfree(slot->rmap_list_pool);
-+ return -ENOMEM;
-+ }
-+
-+ slot->pool_size = pool_size;
-+ BUG_ON(CAN_OVERFLOW_U64(uksm_pages_total, slot->pages));
-+ slot->flags |= UKSM_SLOT_IN_UKSM;
-+ uksm_pages_total += slot->pages;
-+
-+ return 0;
-+}
-+
-+/*
-+ * Called after vma is unlinked from its mm
-+ */
-+void uksm_remove_vma(struct vm_area_struct *vma)
-+{
-+ struct vma_slot *slot;
-+
-+ if (!vma->uksm_vma_slot)
-+ return;
-+
-+ spin_lock(&vma_slot_list_lock);
-+ slot = vma->uksm_vma_slot;
-+ if (!slot)
-+ goto out;
-+
-+ if (slot_in_uksm(slot)) {
-+ /**
-+ * This slot has been added by ksmd, so move it to the del list
-+ * and wait for ksmd to free it.
-+ */
-+ list_add_tail(&slot->slot_list, &vma_slot_del);
-+ } else {
-+ /**
-+ * It's still on the new list, so it's OK to free the slot directly.
-+ */
-+ list_del(&slot->slot_list);
-+ free_vma_slot(slot);
-+ }
-+out:
-+ vma->uksm_vma_slot = NULL;
-+ spin_unlock(&vma_slot_list_lock);
-+}
-+
-+/**
-+ * Need to do two things:
-+ * 1. check if the slot was moved to the del list
-+ * 2. make sure the mmap_sem is manipulated under a valid vma.
-+ *
-+ * A concern here is that, in some cases, this may cause waiters on
-+ * vma_slot_list_lock to be further serialized by some sem->wait_lock;
-+ * can this really be expensive?
-+ *
-+ * @return
-+ * 0: successfully locked mmap_sem
-+ * -ENOENT: this slot was moved to the del list
-+ * -EBUSY: vma lock failed
-+ */
-+static int try_down_read_slot_mmap_sem(struct vma_slot *slot)
-+{
-+ struct vm_area_struct *vma;
-+ struct mm_struct *mm;
-+ struct rw_semaphore *sem;
-+
-+ spin_lock(&vma_slot_list_lock);
-+
-+ /* The slot_list entry was removed from the new list and re-inited when
-+ * the slot entered uksm. If it is now non-empty, the slot must have been
-+ * moved to the del list.
-+ */
-+ if (!slot_in_uksm(slot)) {
-+ spin_unlock(&vma_slot_list_lock);
-+ return -ENOENT;
-+ }
-+
-+ BUG_ON(slot->pages != vma_pages(slot->vma));
-+ /* Ok, vma still valid */
-+ vma = slot->vma;
-+ mm = vma->vm_mm;
-+ sem = &mm->mmap_lock;
-+
-+ if (uksm_test_exit(mm)) {
-+ spin_unlock(&vma_slot_list_lock);
-+ return -ENOENT;
-+ }
-+
-+ if (down_read_trylock(sem)) {
-+ spin_unlock(&vma_slot_list_lock);
-+ if (slot_pool_alloc(slot)) {
-+ uksm_remove_vma(vma);
-+ up_read(sem);
-+ return -ENOENT;
-+ }
-+ return 0;
-+ }
-+
-+ spin_unlock(&vma_slot_list_lock);
-+ return -EBUSY;
-+}
-+
-+static inline unsigned long
-+vma_page_address(struct page *page, struct vm_area_struct *vma)
-+{
-+ pgoff_t pgoff = page->index;
-+ unsigned long address;
-+
-+ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-+ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
-+ /* page should be within @vma mapping range */
-+ return -EFAULT;
-+ }
-+ return address;
-+}
-+
-+
-+/* return 0 on success with the item's mmap_sem locked */
-+static inline int get_mergeable_page_lock_mmap(struct rmap_item *item)
-+{
-+ struct mm_struct *mm;
-+ struct vma_slot *slot = item->slot;
-+ int err = -EINVAL;
-+
-+ struct page *page;
-+
-+ /*
-+ * try_down_read_slot_mmap_sem() returns non-zero if the slot
-+ * has been removed by uksm_remove_vma() or its mmap_sem cannot be taken.
-+ */
-+ if (try_down_read_slot_mmap_sem(slot))
-+ return -EBUSY;
-+
-+ mm = slot->vma->vm_mm;
-+
-+ if (uksm_test_exit(mm))
-+ goto failout_up;
-+
-+ page = item->page;
-+ rcu_read_lock();
-+ if (!get_page_unless_zero(page)) {
-+ rcu_read_unlock();
-+ goto failout_up;
-+ }
-+
-+ /* No need to consider huge page here. */
-+ if (item->slot->vma->anon_vma != page_anon_vma(page) ||
-+ vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) {
-+ /*
-+ * TODO:
-+ * should we release this item because of its stale page
-+ * mapping?
-+ */
-+ put_page(page);
-+ rcu_read_unlock();
-+ goto failout_up;
-+ }
-+ rcu_read_unlock();
-+ return 0;
-+
-+failout_up:
-+ mmap_read_unlock(mm);
-+ return err;
-+}
-+
-+/*
-+ * What kind of VMA is considered ?
-+ */
-+static inline int vma_can_enter(struct vm_area_struct *vma)
-+{
-+ return uksm_flags_can_scan(vma->vm_flags);
-+}
-+
-+/*
-+ * Called whenever a fresh new vma is created. A new vma_slot
-+ * is created and inserted into a global list. Must be called
-+ * after the vma is inserted into its mm.
-+ */
-+void uksm_vma_add_new(struct vm_area_struct *vma)
-+{
-+ struct vma_slot *slot;
-+
-+ if (!vma_can_enter(vma)) {
-+ vma->uksm_vma_slot = NULL;
-+ return;
-+ }
-+
-+ slot = alloc_vma_slot();
-+ if (!slot) {
-+ vma->uksm_vma_slot = NULL;
-+ return;
-+ }
-+
-+ vma->uksm_vma_slot = slot;
-+ vma->vm_flags |= VM_MERGEABLE;
-+ slot->vma = vma;
-+ slot->mm = vma->vm_mm;
-+ slot->ctime_j = jiffies;
-+ slot->pages = vma_pages(vma);
-+ spin_lock(&vma_slot_list_lock);
-+ list_add_tail(&slot->slot_list, &vma_slot_new);
-+ spin_unlock(&vma_slot_list_lock);
-+}
-+
-+/* 32/3 < they < 32/2 */
-+#define shiftl 8
-+#define shiftr 12
-+
-+#define HASH_FROM_TO(from, to) \
-+for (index = from; index < to; index++) { \
-+ pos = random_nums[index]; \
-+ hash += key[pos]; \
-+ hash += (hash << shiftl); \
-+ hash ^= (hash >> shiftr); \
-+}
-+
-+
-+#define HASH_FROM_DOWN_TO(from, to) \
-+for (index = from - 1; index >= to; index--) { \
-+ hash ^= (hash >> shiftr); \
-+ hash ^= (hash >> (shiftr*2)); \
-+ hash -= (hash << shiftl); \
-+ hash += (hash << (shiftl*2)); \
-+ pos = random_nums[index]; \
-+ hash -= key[pos]; \
-+}
-+
-+/*
-+ * The main random sample hash function.
-+ */
-+static u32 random_sample_hash(void *addr, u32 hash_strength)
-+{
-+ u32 hash = 0xdeadbeef;
-+ int index, pos, loop = hash_strength;
-+ u32 *key = (u32 *)addr;
-+
-+ if (loop > HASH_STRENGTH_FULL)
-+ loop = HASH_STRENGTH_FULL;
-+
-+ HASH_FROM_TO(0, loop);
-+
-+ if (hash_strength > HASH_STRENGTH_FULL) {
-+ loop = hash_strength - HASH_STRENGTH_FULL;
-+ HASH_FROM_TO(0, loop);
-+ }
-+
-+ return hash;
-+}
-+
-+
-+/**
-+ * Used when the hash strength is adjusted.
-+ *
-+ * @addr The page's virtual address
-+ * @from The original hash strength
-+ * @to The hash strength to change to
-+ * @hash The hash value generated at the "from" hash strength
-+ *
-+ * return the hash value re-keyed to the "to" strength
-+ */
-+static u32 delta_hash(void *addr, int from, int to, u32 hash)
-+{
-+ u32 *key = (u32 *)addr;
-+ int index, pos; /* make sure they are int type */
-+
-+ if (to > from) {
-+ if (from >= HASH_STRENGTH_FULL) {
-+ from -= HASH_STRENGTH_FULL;
-+ to -= HASH_STRENGTH_FULL;
-+ HASH_FROM_TO(from, to);
-+ } else if (to <= HASH_STRENGTH_FULL) {
-+ HASH_FROM_TO(from, to);
-+ } else {
-+ HASH_FROM_TO(from, HASH_STRENGTH_FULL);
-+ HASH_FROM_TO(0, to - HASH_STRENGTH_FULL);
-+ }
-+ } else {
-+ if (from <= HASH_STRENGTH_FULL) {
-+ HASH_FROM_DOWN_TO(from, to);
-+ } else if (to >= HASH_STRENGTH_FULL) {
-+ from -= HASH_STRENGTH_FULL;
-+ to -= HASH_STRENGTH_FULL;
-+ HASH_FROM_DOWN_TO(from, to);
-+ } else {
-+ HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0);
-+ HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to);
-+ }
-+ }
-+
-+ return hash;
-+}
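delta_hash() relies on HASH_FROM_DOWN_TO being the exact inverse of HASH_FROM_TO, so a hash computed at one strength can be re-keyed up or down without rereading every sampled word. The standalone sketch below (made-up key contents, an identity sampling order, and hypothetical helper names; not part of the patch) checks that round-trip property using the same shift constants:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define shiftl 8
#define shiftr 12
#define WORDS  1024                  /* PAGE_SIZE / sizeof(u32) on 4K pages */

static uint32_t key[WORDS];          /* stands in for the page contents */
static uint32_t random_nums[WORDS];  /* sampling order */

static uint32_t hash_step_up(uint32_t hash, int from, int to)
{
	int index, pos;

	for (index = from; index < to; index++) {      /* mirrors HASH_FROM_TO */
		pos = random_nums[index];
		hash += key[pos];
		hash += (hash << shiftl);
		hash ^= (hash >> shiftr);
	}
	return hash;
}

static uint32_t hash_step_down(uint32_t hash, int from, int to)
{
	int index, pos;

	for (index = from - 1; index >= to; index--) { /* mirrors HASH_FROM_DOWN_TO */
		hash ^= (hash >> shiftr);
		hash ^= (hash >> (shiftr * 2));
		hash -= (hash << shiftl);
		hash += (hash << (shiftl * 2));
		pos = random_nums[index];
		hash -= key[pos];
	}
	return hash;
}

int main(void)
{
	uint32_t h_low, h_high;
	int i;

	for (i = 0; i < WORDS; i++) {
		key[i] = 0x9e3779b9u * (i + 1);  /* arbitrary page content */
		random_nums[i] = i;              /* identity sampling order */
	}

	h_low  = hash_step_up(0xdeadbeef, 0, 128);   /* hash at strength 128 */
	h_high = hash_step_up(h_low, 128, 512);      /* raise strength to 512 */

	/* stepping back down must reproduce the strength-128 hash exactly */
	assert(hash_step_down(h_high, 512, 128) == h_low);
	printf("delta hash round-trip ok: 0x%08x\n", (unsigned)h_low);
	return 0;
}

Each step is exactly invertible modulo 2^32 (the xor-shift pair and the add-shift pair both have exact inverses), which is what lets the scanner re-structure the stable tree after a strength change instead of rehashing every page from scratch.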
-+
-+/**
-+ * Called when rshash_pos or rshash_neg is about to overflow, or when a scan
-+ * round has finished.
-+ *
-+ * return 0 if no page has been scanned since the last call, 1 otherwise.
-+ */
-+static inline int encode_benefit(void)
-+{
-+ u64 scanned_delta, pos_delta, neg_delta;
-+ unsigned long base = benefit.base;
-+
-+ scanned_delta = uksm_pages_scanned - uksm_pages_scanned_last;
-+
-+ if (!scanned_delta)
-+ return 0;
-+
-+ scanned_delta >>= base;
-+ pos_delta = rshash_pos >> base;
-+ neg_delta = rshash_neg >> base;
-+
-+ if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) ||
-+ CAN_OVERFLOW_U64(benefit.neg, neg_delta) ||
-+ CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) {
-+ benefit.scanned >>= 1;
-+ benefit.neg >>= 1;
-+ benefit.pos >>= 1;
-+ benefit.base++;
-+ scanned_delta >>= 1;
-+ pos_delta >>= 1;
-+ neg_delta >>= 1;
-+ }
-+
-+ benefit.pos += pos_delta;
-+ benefit.neg += neg_delta;
-+ benefit.scanned += scanned_delta;
-+
-+ BUG_ON(!benefit.scanned);
-+
-+ rshash_pos = rshash_neg = 0;
-+ uksm_pages_scanned_last = uksm_pages_scanned;
-+
-+ return 1;
-+}
-+
-+static inline void reset_benefit(void)
-+{
-+ benefit.pos = 0;
-+ benefit.neg = 0;
-+ benefit.base = 0;
-+ benefit.scanned = 0;
-+}
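Both the benefit counters above and the pages-scanned counters near the top of the file use the same overflow trick: keep the running sum scaled down by 2^base, and whenever an addition would overflow, halve everything and bump base by one. A compact sketch of that scheme (hypothetical names, not the patch's code):

#include <stdint.h>
#include <stdio.h>

struct scaled_sum {
	uint64_t value;      /* approximates true_sum >> base */
	unsigned int base;   /* number of halvings applied so far */
};

/* Same overflow test as CAN_OVERFLOW_U64() in the patch. */
static int would_overflow(uint64_t x, uint64_t delta)
{
	return UINT64_MAX - x < delta;
}

static void scaled_add(struct scaled_sum *s, uint64_t delta)
{
	delta >>= s->base;                /* scale the new sample down */
	if (would_overflow(s->value, delta)) {
		s->value >>= 1;           /* lose one bit of precision ... */
		delta >>= 1;
		s->base++;                /* ... and remember that we did */
	}
	s->value += delta;
}

int main(void)
{
	struct scaled_sum s = { 0, 0 };

	scaled_add(&s, UINT64_MAX - 5);   /* nearly full */
	scaled_add(&s, 100);              /* forces a halving, base becomes 1 */
	printf("value=%llu base=%u\n", (unsigned long long)s.value, s.base);
	return 0;
}

The counters only need to stay proportionally correct, so trading one low-order bit per overflow for unbounded accumulation is a reasonable deal.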
-+
-+static inline void inc_rshash_pos(unsigned long delta)
-+{
-+ if (CAN_OVERFLOW_U64(rshash_pos, delta))
-+ encode_benefit();
-+
-+ rshash_pos += delta;
-+}
-+
-+static inline void inc_rshash_neg(unsigned long delta)
-+{
-+ if (CAN_OVERFLOW_U64(rshash_neg, delta))
-+ encode_benefit();
-+
-+ rshash_neg += delta;
-+}
-+
-+
-+static inline u32 page_hash(struct page *page, unsigned long hash_strength,
-+ int cost_accounting)
-+{
-+ u32 val;
-+ unsigned long delta;
-+
-+ void *addr = kmap_atomic(page);
-+
-+ val = random_sample_hash(addr, hash_strength);
-+ kunmap_atomic(addr);
-+
-+ if (cost_accounting) {
-+ if (hash_strength < HASH_STRENGTH_FULL)
-+ delta = HASH_STRENGTH_FULL - hash_strength;
-+ else
-+ delta = 0;
-+
-+ inc_rshash_pos(delta);
-+ }
-+
-+ return val;
-+}
-+
-+static int memcmp_pages_with_cost(struct page *page1, struct page *page2,
-+ int cost_accounting)
-+{
-+ char *addr1, *addr2;
-+ int ret;
-+
-+ addr1 = kmap_atomic(page1);
-+ addr2 = kmap_atomic(page2);
-+ ret = memcmp(addr1, addr2, PAGE_SIZE);
-+ kunmap_atomic(addr2);
-+ kunmap_atomic(addr1);
-+
-+ if (cost_accounting)
-+ inc_rshash_neg(memcmp_cost);
-+
-+ return ret;
-+}
-+
-+static inline int pages_identical_with_cost(struct page *page1, struct page *page2)
-+{
-+ return !memcmp_pages_with_cost(page1, page2, 0);
-+}
-+
-+static inline int is_page_full_zero(struct page *page)
-+{
-+ char *addr;
-+ int ret;
-+
-+ addr = kmap_atomic(page);
-+ ret = is_full_zero(addr, PAGE_SIZE);
-+ kunmap_atomic(addr);
-+
-+ return ret;
-+}
-+
-+static int write_protect_page(struct vm_area_struct *vma, struct page *page,
-+ pte_t *orig_pte, pte_t *old_pte)
-+{
-+ struct mm_struct *mm = vma->vm_mm;
-+ struct page_vma_mapped_walk pvmw = {
-+ .page = page,
-+ .vma = vma,
-+ };
-+ struct mmu_notifier_range range;
-+ int swapped;
-+ int err = -EFAULT;
-+
-+ pvmw.address = page_address_in_vma(page, vma);
-+ if (pvmw.address == -EFAULT)
-+ goto out;
-+
-+ BUG_ON(PageTransCompound(page));
-+
-+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, pvmw.address,
-+ pvmw.address + PAGE_SIZE);
-+ mmu_notifier_invalidate_range_start(&range);
-+
-+ if (!page_vma_mapped_walk(&pvmw))
-+ goto out_mn;
-+ if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
-+ goto out_unlock;
-+
-+ if (old_pte)
-+ *old_pte = *pvmw.pte;
-+
-+ if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
-+ (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) || mm_tlb_flush_pending(mm)) {
-+ pte_t entry;
-+
-+ swapped = PageSwapCache(page);
-+ flush_cache_page(vma, pvmw.address, page_to_pfn(page));
-+ /*
-+ * Ok this is tricky: when get_user_pages_fast() runs it doesn't
-+ * take any lock, therefore the check that we are going to make
-+ * of the page count against the map count is racy and
-+ * O_DIRECT can happen right after the check.
-+ * So we clear the pte and flush the tlb before the check;
-+ * this assures us that no O_DIRECT can happen after the check
-+ * or in the middle of the check.
-+ */
-+ entry = ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte);
-+ /*
-+ * Check that no O_DIRECT or similar I/O is in progress on the
-+ * page
-+ */
-+ if (page_mapcount(page) + 1 + swapped != page_count(page)) {
-+ set_pte_at(mm, pvmw.address, pvmw.pte, entry);
-+ goto out_unlock;
-+ }
-+ if (pte_dirty(entry))
-+ set_page_dirty(page);
-+
-+ if (pte_protnone(entry))
-+ entry = pte_mkclean(pte_clear_savedwrite(entry));
-+ else
-+ entry = pte_mkclean(pte_wrprotect(entry));
-+
-+ set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
-+ }
-+ *orig_pte = *pvmw.pte;
-+ err = 0;
-+
-+out_unlock:
-+ page_vma_mapped_walk_done(&pvmw);
-+out_mn:
-+ mmu_notifier_invalidate_range_end(&range);
-+out:
-+ return err;
-+}
-+
-+#define MERGE_ERR_PGERR 1 /* the page is invalid, cannot continue */
-+#define MERGE_ERR_COLLI 2 /* there is a collision */
-+#define MERGE_ERR_COLLI_MAX 3 /* collision at the max hash strength */
-+#define MERGE_ERR_CHANGED 4 /* the page has changed since last hash */
-+
-+
-+/**
-+ * replace_page - replace page in vma by new ksm page
-+ * @vma: vma that holds the pte pointing to page
-+ * @page: the page we are replacing by kpage
-+ * @kpage: the ksm page we replace page by
-+ * @orig_pte: the original value of the pte
-+ *
-+ * Returns 0 on success, MERGE_ERR_PGERR on failure.
-+ */
-+static int replace_page(struct vm_area_struct *vma, struct page *page,
-+ struct page *kpage, pte_t orig_pte)
-+{
-+ struct mm_struct *mm = vma->vm_mm;
-+ struct mmu_notifier_range range;
-+ pgd_t *pgd;
-+ p4d_t *p4d;
-+ pud_t *pud;
-+ pmd_t *pmd;
-+ pte_t *ptep;
-+ spinlock_t *ptl;
-+ pte_t entry;
-+
-+ unsigned long addr;
-+ int err = MERGE_ERR_PGERR;
-+
-+ addr = page_address_in_vma(page, vma);
-+ if (addr == -EFAULT)
-+ goto out;
-+
-+ pgd = pgd_offset(mm, addr);
-+ if (!pgd_present(*pgd))
-+ goto out;
-+
-+ p4d = p4d_offset(pgd, addr);
-+ pud = pud_offset(p4d, addr);
-+ if (!pud_present(*pud))
-+ goto out;
-+
-+ pmd = pmd_offset(pud, addr);
-+ BUG_ON(pmd_trans_huge(*pmd));
-+ if (!pmd_present(*pmd))
-+ goto out;
-+
-+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
-+ addr + PAGE_SIZE);
-+ mmu_notifier_invalidate_range_start(&range);
-+
-+ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
-+ if (!pte_same(*ptep, orig_pte)) {
-+ pte_unmap_unlock(ptep, ptl);
-+ goto out_mn;
-+ }
-+
-+ flush_cache_page(vma, addr, pte_pfn(*ptep));
-+ ptep_clear_flush_notify(vma, addr, ptep);
-+ entry = mk_pte(kpage, vma->vm_page_prot);
-+
-+ /* special treatment is needed for zero_page */
-+ if ((page_to_pfn(kpage) == uksm_zero_pfn) ||
-+ (page_to_pfn(kpage) == zero_pfn)) {
-+ entry = pte_mkspecial(entry);
-+ dec_mm_counter(mm, MM_ANONPAGES);
-+ inc_zone_page_state(page, NR_UKSM_ZERO_PAGES);
-+ } else {
-+ get_page(kpage);
-+ page_add_anon_rmap(kpage, vma, addr, false);
-+ }
-+
-+ set_pte_at_notify(mm, addr, ptep, entry);
-+
-+ page_remove_rmap(page, false);
-+ if (!page_mapped(page))
-+ try_to_free_swap(page);
-+ put_page(page);
-+
-+ pte_unmap_unlock(ptep, ptl);
-+ err = 0;
-+out_mn:
-+ mmu_notifier_invalidate_range_end(&range);
-+out:
-+ return err;
-+}
-+
-+
-+/**
-+ * Fully hash a page with HASH_STRENGTH_MAX and return a non-zero hash value.
-+ * A zero hash value at HASH_STRENGTH_MAX is used to indicate that the
-+ * hash_max member has not been calculated yet.
-+ *
-+ * @page The page to be hashed
-+ * @hash_old The hash value calculated with the current hash strength
-+ *
-+ * return the new hash value calculated at HASH_STRENGTH_MAX
-+ */
-+static inline u32 page_hash_max(struct page *page, u32 hash_old)
-+{
-+ u32 hash_max = 0;
-+ void *addr;
-+
-+ addr = kmap_atomic(page);
-+ hash_max = delta_hash(addr, hash_strength,
-+ HASH_STRENGTH_MAX, hash_old);
-+
-+ kunmap_atomic(addr);
-+
-+ if (!hash_max)
-+ hash_max = 1;
-+
-+ inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
-+ return hash_max;
-+}
-+
-+/*
-+ * We compare the hash again, to ensure that it is really a hash collision
-+ * instead of being caused by a page write.
-+ */
-+static inline int check_collision(struct rmap_item *rmap_item,
-+ u32 hash)
-+{
-+ int err;
-+ struct page *page = rmap_item->page;
-+
-+ /* If this rmap_item has already been hash_maxed, then the collision
-+ * must appear in the second-level rbtree search. In this case we check
-+ * if its hash_max value has been changed. Otherwise, the collision
-+ * happens in the first-level rbtree search, so we check against its
-+ * current hash value.
-+ */
-+ if (rmap_item->hash_max) {
-+ inc_rshash_neg(memcmp_cost);
-+ inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
-+
-+ if (rmap_item->hash_max == page_hash_max(page, hash))
-+ err = MERGE_ERR_COLLI;
-+ else
-+ err = MERGE_ERR_CHANGED;
-+ } else {
-+ inc_rshash_neg(memcmp_cost + hash_strength);
-+
-+ if (page_hash(page, hash_strength, 0) == hash)
-+ err = MERGE_ERR_COLLI;
-+ else
-+ err = MERGE_ERR_CHANGED;
-+ }
-+
-+ return err;
-+}
-+
-+/**
-+ * Try to merge a rmap_item.page with a kpage in a stable node. kpage must
-+ * already be a ksm page.
-+ *
-+ * @return 0 if the pages were merged, a MERGE_ERR_* code otherwise.
-+ */
-+static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item,
-+ struct page *kpage, u32 hash)
-+{
-+ struct vm_area_struct *vma = rmap_item->slot->vma;
-+ struct mm_struct *mm = vma->vm_mm;
-+ pte_t orig_pte = __pte(0);
-+ int err = MERGE_ERR_PGERR;
-+ struct page *page;
-+
-+ if (uksm_test_exit(mm))
-+ goto out;
-+
-+ page = rmap_item->page;
-+
-+ if (page == kpage) { /* ksm page forked */
-+ err = 0;
-+ goto out;
-+ }
-+
-+ /*
-+ * We need the page lock to read a stable PageSwapCache in
-+ * write_protect_page(). We use trylock_page() instead of
-+ * lock_page() because we don't want to wait here - we
-+ * prefer to continue scanning and merging different pages,
-+ * then come back to this page when it is unlocked.
-+ */
-+ if (!trylock_page(page))
-+ goto out;
-+
-+ if (!PageAnon(page) || !PageKsm(kpage))
-+ goto out_unlock;
-+
-+ if (PageTransCompound(page)) {
-+ err = split_huge_page(page);
-+ if (err)
-+ goto out_unlock;
-+ }
-+
-+ /*
-+ * If this anonymous page is mapped only here, its pte may need
-+ * to be write-protected. If it's mapped elsewhere, all of its
-+ * ptes are necessarily already write-protected. But in either
-+ * case, we need to lock and check page_count is not raised.
-+ */
-+ if (write_protect_page(vma, page, &orig_pte, NULL) == 0) {
-+ if (pages_identical_with_cost(page, kpage))
-+ err = replace_page(vma, page, kpage, orig_pte);
-+ else
-+ err = check_collision(rmap_item, hash);
-+ }
-+
-+ if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
-+ munlock_vma_page(page);
-+ if (!PageMlocked(kpage)) {
-+ unlock_page(page);
-+ lock_page(kpage);
-+ mlock_vma_page(kpage);
-+ page = kpage; /* for final unlock */
-+ }
-+ }
-+
-+out_unlock:
-+ unlock_page(page);
-+out:
-+ return err;
-+}
-+
-+
-+
-+/**
-+ * If two pages fail to merge in try_to_merge_two_pages, then we have a chance
-+ * to restore a page mapping that has been changed in try_to_merge_two_pages.
-+ *
-+ * @return 0 on success.
-+ */
-+static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr,
-+ pte_t orig_pte, pte_t wprt_pte)
-+{
-+ struct mm_struct *mm = vma->vm_mm;
-+ pgd_t *pgd;
-+ p4d_t *p4d;
-+ pud_t *pud;
-+ pmd_t *pmd;
-+ pte_t *ptep;
-+ spinlock_t *ptl;
-+
-+ int err = -EFAULT;
-+
-+ pgd = pgd_offset(mm, addr);
-+ if (!pgd_present(*pgd))
-+ goto out;
-+
-+ p4d = p4d_offset(pgd, addr);
-+ pud = pud_offset(p4d, addr);
-+ if (!pud_present(*pud))
-+ goto out;
-+
-+ pmd = pmd_offset(pud, addr);
-+ if (!pmd_present(*pmd))
-+ goto out;
-+
-+ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
-+ if (!pte_same(*ptep, wprt_pte)) {
-+ /* already copied, let it be */
-+ pte_unmap_unlock(ptep, ptl);
-+ goto out;
-+ }
-+
-+ /*
-+ * Still here: while we still hold the ksm page it cannot return
-+ * to the free page pool, so there is no way a pte could have been
-+ * changed to another page and back to this one. Remember also that
-+ * ksm pages are not reused in do_wp_page(). So it's safe to restore
-+ * the original pte.
-+ */
-+ flush_cache_page(vma, addr, pte_pfn(*ptep));
-+ ptep_clear_flush_notify(vma, addr, ptep);
-+ set_pte_at_notify(mm, addr, ptep, orig_pte);
-+
-+ pte_unmap_unlock(ptep, ptl);
-+ err = 0;
-+out:
-+ return err;
-+}
-+
-+/**
-+ * try_to_merge_two_pages() - take two identical pages and prepare
-+ * them to be merged into one page (rmap_item->page)
-+ *
-+ * @return 0 if we successfully merged two identical pages into
-+ * one ksm page. MERGE_ERR_COLLI if it was only a hash collision
-+ * during the rbtree search. MERGE_ERR_CHANGED if the page has
-+ * changed since it was hashed. MERGE_ERR_PGERR otherwise.
-+ *
-+ */
-+static int try_to_merge_two_pages(struct rmap_item *rmap_item,
-+ struct rmap_item *tree_rmap_item,
-+ u32 hash)
-+{
-+ pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0);
-+ pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0);
-+ struct vm_area_struct *vma1 = rmap_item->slot->vma;
-+ struct vm_area_struct *vma2 = tree_rmap_item->slot->vma;
-+ struct page *page = rmap_item->page;
-+ struct page *tree_page = tree_rmap_item->page;
-+ int err = MERGE_ERR_PGERR;
-+ struct address_space *saved_mapping;
-+
-+
-+ if (rmap_item->page == tree_rmap_item->page)
-+ goto out;
-+
-+ if (!trylock_page(page))
-+ goto out;
-+
-+ if (!PageAnon(page))
-+ goto out_unlock;
-+
-+ if (PageTransCompound(page)) {
-+ err = split_huge_page(page);
-+ if (err)
-+ goto out_unlock;
-+ }
-+
-+ if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) {
-+ unlock_page(page);
-+ goto out;
-+ }
-+
-+ /*
-+ * While we hold page lock, upgrade page from
-+ * PageAnon+anon_vma to PageKsm+NULL stable_node:
-+ * stable_tree_insert() will update stable_node.
-+ */
-+ saved_mapping = page->mapping;
-+ set_page_stable_node(page, NULL);
-+ mark_page_accessed(page);
-+ if (!PageDirty(page))
-+ SetPageDirty(page);
-+
-+ unlock_page(page);
-+
-+ if (!trylock_page(tree_page))
-+ goto restore_out;
-+
-+ if (!PageAnon(tree_page)) {
-+ unlock_page(tree_page);
-+ goto restore_out;
-+ }
-+
-+ if (PageTransCompound(tree_page)) {
-+ err = split_huge_page(tree_page);
-+ if (err) {
-+ unlock_page(tree_page);
-+ goto restore_out;
-+ }
-+ }
-+
-+ if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) {
-+ unlock_page(tree_page);
-+ goto restore_out;
-+ }
-+
-+ if (pages_identical_with_cost(page, tree_page)) {
-+ err = replace_page(vma2, tree_page, page, wprt_pte2);
-+ if (err) {
-+ unlock_page(tree_page);
-+ goto restore_out;
-+ }
-+
-+ if ((vma2->vm_flags & VM_LOCKED)) {
-+ munlock_vma_page(tree_page);
-+ if (!PageMlocked(page)) {
-+ unlock_page(tree_page);
-+ lock_page(page);
-+ mlock_vma_page(page);
-+ tree_page = page; /* for final unlock */
-+ }
-+ }
-+
-+ unlock_page(tree_page);
-+
-+ goto out; /* success */
-+
-+ } else {
-+ if (tree_rmap_item->hash_max &&
-+ tree_rmap_item->hash_max == rmap_item->hash_max) {
-+ err = MERGE_ERR_COLLI_MAX;
-+ } else if (page_hash(page, hash_strength, 0) ==
-+ page_hash(tree_page, hash_strength, 0)) {
-+ inc_rshash_neg(memcmp_cost + hash_strength * 2);
-+ err = MERGE_ERR_COLLI;
-+ } else {
-+ err = MERGE_ERR_CHANGED;
-+ }
-+
-+ unlock_page(tree_page);
-+ }
-+
-+restore_out:
-+ lock_page(page);
-+ if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item),
-+ orig_pte1, wprt_pte1))
-+ page->mapping = saved_mapping;
-+
-+out_unlock:
-+ unlock_page(page);
-+out:
-+ return err;
-+}
-+
-+static inline int hash_cmp(u32 new_val, u32 node_val)
-+{
-+ if (new_val > node_val)
-+ return 1;
-+ else if (new_val < node_val)
-+ return -1;
-+ else
-+ return 0;
-+}
-+
-+static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash)
-+{
-+ u32 hash_max = item->hash_max;
-+
-+ if (!hash_max) {
-+ hash_max = page_hash_max(item->page, hash);
-+
-+ item->hash_max = hash_max;
-+ }
-+
-+ return hash_max;
-+}
-+
-+
-+
-+/**
-+ * stable_tree_search() - search the stable tree for a page
-+ *
-+ * @item: the rmap_item we are comparing with
-+ * @hash: the hash value of this item->page already calculated
-+ *
-+ * @return the page we have found, NULL otherwise. The page returned has
-+ * been gotten.
-+ */
-+static struct page *stable_tree_search(struct rmap_item *item, u32 hash)
-+{
-+ struct rb_node *node = root_stable_treep->rb_node;
-+ struct tree_node *tree_node;
-+ unsigned long hash_max;
-+ struct page *page = item->page;
-+ struct stable_node *stable_node;
-+
-+ stable_node = page_stable_node(page);
-+ if (stable_node) {
-+ /* ksm page forked, that is
-+ * if (PageKsm(page) && !in_stable_tree(rmap_item))
-+ * it's actually gotten once outside.
-+ */
-+ get_page(page);
-+ return page;
-+ }
-+
-+ while (node) {
-+ int cmp;
-+
-+ tree_node = rb_entry(node, struct tree_node, node);
-+
-+ cmp = hash_cmp(hash, tree_node->hash);
-+
-+ if (cmp < 0)
-+ node = node->rb_left;
-+ else if (cmp > 0)
-+ node = node->rb_right;
-+ else
-+ break;
-+ }
-+
-+ if (!node)
-+ return NULL;
-+
-+ if (tree_node->count == 1) {
-+ stable_node = rb_entry(tree_node->sub_root.rb_node,
-+ struct stable_node, node);
-+ BUG_ON(!stable_node);
-+
-+ goto get_page_out;
-+ }
-+
-+ /*
-+ * ok, we have to search the second
-+ * level subtree, hash the page to a
-+ * full strength.
-+ */
-+ node = tree_node->sub_root.rb_node;
-+ BUG_ON(!node);
-+ hash_max = rmap_item_hash_max(item, hash);
-+
-+ while (node) {
-+ int cmp;
-+
-+ stable_node = rb_entry(node, struct stable_node, node);
-+
-+ cmp = hash_cmp(hash_max, stable_node->hash_max);
-+
-+ if (cmp < 0)
-+ node = node->rb_left;
-+ else if (cmp > 0)
-+ node = node->rb_right;
-+ else
-+ goto get_page_out;
-+ }
-+
-+ return NULL;
-+
-+get_page_out:
-+ page = get_uksm_page(stable_node, 1, 1);
-+ return page;
-+}
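The stable tree is two levels deep: the first level is keyed by the hash at the current (partial) strength, and only on a first-level match is the more expensive full-strength hash_max computed to resolve collisions in the second level. The sketch below mimics that lookup with flat arrays and linear scans standing in for the kernel's rbtrees; all names and data are illustrative, not from the patch.

#include <stdint.h>
#include <stdio.h>

struct stable_entry {            /* second level: keyed by full-strength hash */
	uint32_t hash_max;
	unsigned long kpfn;
};

struct tree_entry {              /* first level: keyed by partial-strength hash */
	uint32_t hash;
	struct stable_entry *sub;
	int count;
};

static struct stable_entry *two_level_search(struct tree_entry *tree, int n,
					     uint32_t hash,
					     uint32_t (*hash_max_fn)(void))
{
	int i, j;

	for (i = 0; i < n; i++) {
		if (tree[i].hash != hash)
			continue;
		if (tree[i].count == 1)          /* no collision: done cheaply */
			return &tree[i].sub[0];
		/* collision: only now pay for the full-strength hash */
		uint32_t hash_max = hash_max_fn();
		for (j = 0; j < tree[i].count; j++)
			if (tree[i].sub[j].hash_max == hash_max)
				return &tree[i].sub[j];
		return NULL;
	}
	return NULL;
}

static uint32_t fake_hash_max(void) { return 0xabcd; }

int main(void)
{
	struct stable_entry sub[] = { { 0x1111, 42 }, { 0xabcd, 43 } };
	struct tree_entry tree[]  = { { 0xfeed, sub, 2 } };
	struct stable_entry *hit  = two_level_search(tree, 1, 0xfeed, fake_hash_max);

	printf("found kpfn %lu\n", hit ? hit->kpfn : 0UL);
	return 0;
}

Deferring the hash_max calculation to the collision path is what keeps the common, collision-free lookup as cheap as the partial-strength hash allows.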
-+
-+static int try_merge_rmap_item(struct rmap_item *item,
-+ struct page *kpage,
-+ struct page *tree_page)
-+{
-+ struct vm_area_struct *vma = item->slot->vma;
-+ struct page_vma_mapped_walk pvmw = {
-+ .page = kpage,
-+ .vma = vma,
-+ };
-+
-+ pvmw.address = get_rmap_addr(item);
-+ if (!page_vma_mapped_walk(&pvmw))
-+ return 0;
-+
-+ if (pte_write(*pvmw.pte)) {
-+ /* has changed, abort! */
-+ page_vma_mapped_walk_done(&pvmw);
-+ return 0;
-+ }
-+
-+ get_page(tree_page);
-+ page_add_anon_rmap(tree_page, vma, pvmw.address, false);
-+
-+ flush_cache_page(vma, pvmw.address, page_to_pfn(kpage));
-+ ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte);
-+ set_pte_at_notify(vma->vm_mm, pvmw.address, pvmw.pte,
-+ mk_pte(tree_page, vma->vm_page_prot));
-+
-+ page_remove_rmap(kpage, false);
-+ put_page(kpage);
-+
-+ page_vma_mapped_walk_done(&pvmw);
-+
-+ return 1;
-+}
-+
-+/**
-+ * try_merge_with_stable() - when two rmap_items need to be inserted
-+ * into the stable tree and the page was found to be identical to a stable
-+ * ksm page, this is the last chance we can merge them into one.
-+ *
-+ * @item1: the rmap_item holding the page which we wanted to insert
-+ * into the stable tree.
-+ * @item2: the other rmap_item we found in the unstable tree search
-+ * @kpage: the page currently mapped by the two rmap_items
-+ * @tree_page: the page we found identical in the stable tree node
-+ * @success1: set if item1 is successfully merged
-+ * @success2: set if item2 is successfully merged
-+ */
-+static void try_merge_with_stable(struct rmap_item *item1,
-+ struct rmap_item *item2,
-+ struct page **kpage,
-+ struct page *tree_page,
-+ int *success1, int *success2)
-+{
-+ struct vm_area_struct *vma1 = item1->slot->vma;
-+ struct vm_area_struct *vma2 = item2->slot->vma;
-+ *success1 = 0;
-+ *success2 = 0;
-+
-+ if (unlikely(*kpage == tree_page)) {
-+ /* I don't think this can really happen */
-+ pr_warn("UKSM: unexpected condition detected in "
-+ "%s -- *kpage == tree_page !\n", __func__);
-+ *success1 = 1;
-+ *success2 = 1;
-+ return;
-+ }
-+
-+ if (!PageAnon(*kpage) || !PageKsm(*kpage))
-+ goto failed;
-+
-+ if (!trylock_page(tree_page))
-+ goto failed;
-+
-+ /* If the old page is still ksm, still pointed
-+ * to in the right place, and still write protected,
-+ * we are confident it has not changed, so there is no
-+ * need to memcmp anymore.
-+ * Beware: we cannot take nested pte locks,
-+ * deadlock risk.
-+ */
-+ if (!try_merge_rmap_item(item1, *kpage, tree_page))
-+ goto unlock_failed;
-+
-+ /* ok, then vma2, remind that pte1 already set */
-+ if (!try_merge_rmap_item(item2, *kpage, tree_page))
-+ goto success_1;
-+
-+ *success2 = 1;
-+success_1:
-+ *success1 = 1;
-+
-+
-+ if ((*success1 && vma1->vm_flags & VM_LOCKED) ||
-+ (*success2 && vma2->vm_flags & VM_LOCKED)) {
-+ munlock_vma_page(*kpage);
-+ if (!PageMlocked(tree_page))
-+ mlock_vma_page(tree_page);
-+ }
-+
-+ /*
-+ * We do not need oldpage any more in the caller, so can break the lock
-+ * now.
-+ */
-+ unlock_page(*kpage);
-+ *kpage = tree_page; /* Get unlocked outside. */
-+ return;
-+
-+unlock_failed:
-+ unlock_page(tree_page);
-+failed:
-+ return;
-+}
-+
-+static inline void stable_node_hash_max(struct stable_node *node,
-+ struct page *page, u32 hash)
-+{
-+ u32 hash_max = node->hash_max;
-+
-+ if (!hash_max) {
-+ hash_max = page_hash_max(page, hash);
-+ node->hash_max = hash_max;
-+ }
-+}
-+
-+static inline
-+struct stable_node *new_stable_node(struct tree_node *tree_node,
-+ struct page *kpage, u32 hash_max)
-+{
-+ struct stable_node *new_stable_node;
-+
-+ new_stable_node = alloc_stable_node();
-+ if (!new_stable_node)
-+ return NULL;
-+
-+ new_stable_node->kpfn = page_to_pfn(kpage);
-+ new_stable_node->hash_max = hash_max;
-+ new_stable_node->tree_node = tree_node;
-+ set_page_stable_node(kpage, new_stable_node);
-+
-+ return new_stable_node;
-+}
-+
-+static inline
-+struct stable_node *first_level_insert(struct tree_node *tree_node,
-+ struct rmap_item *rmap_item,
-+ struct rmap_item *tree_rmap_item,
-+ struct page **kpage, u32 hash,
-+ int *success1, int *success2)
-+{
-+ int cmp;
-+ struct page *tree_page;
-+ u32 hash_max = 0;
-+ struct stable_node *stable_node, *new_snode;
-+ struct rb_node *parent = NULL, **new;
-+
-+ /* this tree node contains no sub-tree yet */
-+ stable_node = rb_entry(tree_node->sub_root.rb_node,
-+ struct stable_node, node);
-+
-+ tree_page = get_uksm_page(stable_node, 1, 0);
-+ if (tree_page) {
-+ cmp = memcmp_pages_with_cost(*kpage, tree_page, 1);
-+ if (!cmp) {
-+ try_merge_with_stable(rmap_item, tree_rmap_item, kpage,
-+ tree_page, success1, success2);
-+ put_page(tree_page);
-+ if (!*success1 && !*success2)
-+ goto failed;
-+
-+ return stable_node;
-+
-+ } else {
-+ /*
-+ * Collision at the first level; try to create a subtree.
-+ * A new node needs to be created.
-+ */
-+ put_page(tree_page);
-+
-+ stable_node_hash_max(stable_node, tree_page,
-+ tree_node->hash);
-+ hash_max = rmap_item_hash_max(rmap_item, hash);
-+ cmp = hash_cmp(hash_max, stable_node->hash_max);
-+
-+ parent = &stable_node->node;
-+ if (cmp < 0)
-+ new = &parent->rb_left;
-+ else if (cmp > 0)
-+ new = &parent->rb_right;
-+ else
-+ goto failed;
-+ }
-+
-+ } else {
-+ /* The only stable_node was deleted; we reuse its tree_node.
-+ */
-+ parent = NULL;
-+ new = &tree_node->sub_root.rb_node;
-+ }
-+
-+ new_snode = new_stable_node(tree_node, *kpage, hash_max);
-+ if (!new_snode)
-+ goto failed;
-+
-+ rb_link_node(&new_snode->node, parent, new);
-+ rb_insert_color(&new_snode->node, &tree_node->sub_root);
-+ tree_node->count++;
-+ *success1 = *success2 = 1;
-+
-+ return new_snode;
-+
-+failed:
-+ return NULL;
-+}
-+
-+static inline
-+struct stable_node *stable_subtree_insert(struct tree_node *tree_node,
-+ struct rmap_item *rmap_item,
-+ struct rmap_item *tree_rmap_item,
-+ struct page **kpage, u32 hash,
-+ int *success1, int *success2)
-+{
-+ struct page *tree_page;
-+ u32 hash_max;
-+ struct stable_node *stable_node, *new_snode;
-+ struct rb_node *parent, **new;
-+
-+research:
-+ parent = NULL;
-+ new = &tree_node->sub_root.rb_node;
-+ BUG_ON(!*new);
-+ hash_max = rmap_item_hash_max(rmap_item, hash);
-+ while (*new) {
-+ int cmp;
-+
-+ stable_node = rb_entry(*new, struct stable_node, node);
-+
-+ cmp = hash_cmp(hash_max, stable_node->hash_max);
-+
-+ if (cmp < 0) {
-+ parent = *new;
-+ new = &parent->rb_left;
-+ } else if (cmp > 0) {
-+ parent = *new;
-+ new = &parent->rb_right;
-+ } else {
-+ tree_page = get_uksm_page(stable_node, 1, 0);
-+ if (tree_page) {
-+ cmp = memcmp_pages_with_cost(*kpage, tree_page, 1);
-+ if (!cmp) {
-+ try_merge_with_stable(rmap_item,
-+ tree_rmap_item, kpage,
-+ tree_page, success1, success2);
-+
-+ put_page(tree_page);
-+ if (!*success1 && !*success2)
-+ goto failed;
-+ /*
-+ * successfully merged with a stable
-+ * node
-+ */
-+ return stable_node;
-+ } else {
-+ put_page(tree_page);
-+ goto failed;
-+ }
-+ } else {
-+ /*
-+ * The stable node may have been
-+ * deleted and the subtree may have
-+ * been restructured; we cannot
-+ * continue, so re-search it.
-+ */
-+ if (tree_node->count) {
-+ goto research;
-+ } else {
-+ /* reuse the tree node*/
-+ parent = NULL;
-+ new = &tree_node->sub_root.rb_node;
-+ }
-+ }
-+ }
-+ }
-+
-+ new_snode = new_stable_node(tree_node, *kpage, hash_max);
-+ if (!new_snode)
-+ goto failed;
-+
-+ rb_link_node(&new_snode->node, parent, new);
-+ rb_insert_color(&new_snode->node, &tree_node->sub_root);
-+ tree_node->count++;
-+ *success1 = *success2 = 1;
-+
-+ return new_snode;
-+
-+failed:
-+ return NULL;
-+}
-+
-+
-+/**
-+ * stable_tree_insert() - try to insert a page merged in the unstable tree
-+ * into the stable tree
-+ *
-+ * @kpage: the page to be inserted
-+ * @hash: the current hash of this page
-+ * @rmap_item: the rmap_item being scanned
-+ * @tree_rmap_item: the rmap_item found in the unstable tree
-+ * @success1: set if rmap_item is merged
-+ * @success2: set if tree_rmap_item is merged
-+ *
-+ * @return the stable_node in the stable tree if at least one
-+ * rmap_item is inserted into the stable tree, NULL
-+ * otherwise.
-+ */
-+static struct stable_node *
-+stable_tree_insert(struct page **kpage, u32 hash,
-+ struct rmap_item *rmap_item,
-+ struct rmap_item *tree_rmap_item,
-+ int *success1, int *success2)
-+{
-+ struct rb_node **new = &root_stable_treep->rb_node;
-+ struct rb_node *parent = NULL;
-+ struct stable_node *stable_node;
-+ struct tree_node *tree_node;
-+ u32 hash_max = 0;
-+
-+ *success1 = *success2 = 0;
-+
-+ while (*new) {
-+ int cmp;
-+
-+ tree_node = rb_entry(*new, struct tree_node, node);
-+
-+ cmp = hash_cmp(hash, tree_node->hash);
-+
-+ if (cmp < 0) {
-+ parent = *new;
-+ new = &parent->rb_left;
-+ } else if (cmp > 0) {
-+ parent = *new;
-+ new = &parent->rb_right;
-+ } else
-+ break;
-+ }
-+
-+ if (*new) {
-+ if (tree_node->count == 1) {
-+ stable_node = first_level_insert(tree_node, rmap_item,
-+ tree_rmap_item, kpage,
-+ hash, success1, success2);
-+ } else {
-+ stable_node = stable_subtree_insert(tree_node,
-+ rmap_item, tree_rmap_item, kpage,
-+ hash, success1, success2);
-+ }
-+ } else {
-+
-+ /* no tree node found */
-+ tree_node = alloc_tree_node(stable_tree_node_listp);
-+ if (!tree_node) {
-+ stable_node = NULL;
-+ goto out;
-+ }
-+
-+ stable_node = new_stable_node(tree_node, *kpage, hash_max);
-+ if (!stable_node) {
-+ free_tree_node(tree_node);
-+ goto out;
-+ }
-+
-+ tree_node->hash = hash;
-+ rb_link_node(&tree_node->node, parent, new);
-+ rb_insert_color(&tree_node->node, root_stable_treep);
-+ parent = NULL;
-+ new = &tree_node->sub_root.rb_node;
-+
-+ rb_link_node(&stable_node->node, parent, new);
-+ rb_insert_color(&stable_node->node, &tree_node->sub_root);
-+ tree_node->count++;
-+ *success1 = *success2 = 1;
-+ }
-+
-+out:
-+ return stable_node;
-+}
-+
-+
-+/**
-+ * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem
-+ *
-+ * @return 0 on success, -EBUSY if unable to lock the mmap_sem,
-+ * -EINVAL if the page mapping has been changed.
-+ */
-+static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item)
-+{
-+ int err;
-+
-+ err = get_mergeable_page_lock_mmap(tree_rmap_item);
-+
-+ if (err == -EINVAL) {
-+ /* its page map has been changed, remove it */
-+ remove_rmap_item_from_tree(tree_rmap_item);
-+ }
-+
-+ /* The page is gotten and mmap_sem is locked now. */
-+ return err;
-+}
-+
-+
-+/**
-+ * unstable_tree_search_insert() - search an unstable tree rmap_item with the
-+ * same hash value. Get its page and trylock the mmap_sem
-+ */
-+static inline
-+struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
-+ u32 hash)
-+
-+{
-+ struct rb_node **new = &root_unstable_tree.rb_node;
-+ struct rb_node *parent = NULL;
-+ struct tree_node *tree_node;
-+ u32 hash_max;
-+ struct rmap_item *tree_rmap_item;
-+
-+ while (*new) {
-+ int cmp;
-+
-+ tree_node = rb_entry(*new, struct tree_node, node);
-+
-+ cmp = hash_cmp(hash, tree_node->hash);
-+
-+ if (cmp < 0) {
-+ parent = *new;
-+ new = &parent->rb_left;
-+ } else if (cmp > 0) {
-+ parent = *new;
-+ new = &parent->rb_right;
-+ } else
-+ break;
-+ }
-+
-+ if (*new) {
-+ /* got the tree_node */
-+ if (tree_node->count == 1) {
-+ tree_rmap_item = rb_entry(tree_node->sub_root.rb_node,
-+ struct rmap_item, node);
-+ BUG_ON(!tree_rmap_item);
-+
-+ goto get_page_out;
-+ }
-+
-+ /* well, search the collision subtree */
-+ new = &tree_node->sub_root.rb_node;
-+ BUG_ON(!*new);
-+ hash_max = rmap_item_hash_max(rmap_item, hash);
-+
-+ while (*new) {
-+ int cmp;
-+
-+ tree_rmap_item = rb_entry(*new, struct rmap_item,
-+ node);
-+
-+ cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
-+ parent = *new;
-+ if (cmp < 0)
-+ new = &parent->rb_left;
-+ else if (cmp > 0)
-+ new = &parent->rb_right;
-+ else
-+ goto get_page_out;
-+ }
-+ } else {
-+ /* alloc a new tree_node */
-+ tree_node = alloc_tree_node(&unstable_tree_node_list);
-+ if (!tree_node)
-+ return NULL;
-+
-+ tree_node->hash = hash;
-+ rb_link_node(&tree_node->node, parent, new);
-+ rb_insert_color(&tree_node->node, &root_unstable_tree);
-+ parent = NULL;
-+ new = &tree_node->sub_root.rb_node;
-+ }
-+
-+ /* not found even in the sub-tree */
-+ rmap_item->tree_node = tree_node;
-+ rmap_item->address |= UNSTABLE_FLAG;
-+ rmap_item->hash_round = uksm_hash_round;
-+ rb_link_node(&rmap_item->node, parent, new);
-+ rb_insert_color(&rmap_item->node, &tree_node->sub_root);
-+
-+ uksm_pages_unshared++;
-+ return NULL;
-+
-+get_page_out:
-+ if (tree_rmap_item->page == rmap_item->page)
-+ return NULL;
-+
-+ if (get_tree_rmap_item_page(tree_rmap_item))
-+ return NULL;
-+
-+ return tree_rmap_item;
-+}
-+
-+static void hold_anon_vma(struct rmap_item *rmap_item,
-+ struct anon_vma *anon_vma)
-+{
-+ rmap_item->anon_vma = anon_vma;
-+ get_anon_vma(anon_vma);
-+}
-+
-+
-+/**
-+ * stable_tree_append() - append a rmap_item to a stable node. Deduplication
-+ * ratio statistics are gathered in this function.
-+ *
-+ */
-+static void stable_tree_append(struct rmap_item *rmap_item,
-+ struct stable_node *stable_node, int logdedup)
-+{
-+ struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_cont = NULL;
-+ unsigned long key = (unsigned long)rmap_item->slot;
-+ unsigned long factor = rmap_item->slot->rung->step;
-+
-+ BUG_ON(!stable_node);
-+ rmap_item->address |= STABLE_FLAG;
-+
-+ if (hlist_empty(&stable_node->hlist)) {
-+ uksm_pages_shared++;
-+ goto node_vma_new;
-+ } else {
-+ uksm_pages_sharing++;
-+ }
-+
-+ hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) {
-+ if (node_vma->key >= key)
-+ break;
-+
-+ if (logdedup) {
-+ node_vma->slot->pages_bemerged += factor;
-+ if (list_empty(&node_vma->slot->dedup_list))
-+ list_add(&node_vma->slot->dedup_list,
-+ &vma_slot_dedup);
-+ }
-+ }
-+
-+ if (node_vma) {
-+ if (node_vma->key == key) {
-+ node_vma_cont = hlist_entry_safe(node_vma->hlist.next, struct node_vma, hlist);
-+ goto node_vma_ok;
-+ } else if (node_vma->key > key) {
-+ node_vma_cont = node_vma;
-+ }
-+ }
-+
-+node_vma_new:
-+ /* no same vma already in node, alloc a new node_vma */
-+ new_node_vma = alloc_node_vma();
-+ BUG_ON(!new_node_vma);
-+ new_node_vma->head = stable_node;
-+ new_node_vma->slot = rmap_item->slot;
-+
-+ if (!node_vma) {
-+ hlist_add_head(&new_node_vma->hlist, &stable_node->hlist);
-+ } else if (node_vma->key != key) {
-+ if (node_vma->key < key)
-+ hlist_add_behind(&new_node_vma->hlist, &node_vma->hlist);
-+ else {
-+ hlist_add_before(&new_node_vma->hlist,
-+ &node_vma->hlist);
-+ }
-+
-+ }
-+ node_vma = new_node_vma;
-+
-+node_vma_ok: /* ok, ready to add to the list */
-+ rmap_item->head = node_vma;
-+ hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist);
-+ hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma);
-+ if (logdedup) {
-+ rmap_item->slot->pages_merged++;
-+ if (node_vma_cont) {
-+ node_vma = node_vma_cont;
-+ hlist_for_each_entry_continue(node_vma, hlist) {
-+ node_vma->slot->pages_bemerged += factor;
-+ if (list_empty(&node_vma->slot->dedup_list))
-+ list_add(&node_vma->slot->dedup_list,
-+ &vma_slot_dedup);
-+ }
-+ }
-+ }
-+}
-+
-+/*
-+ * We use break_ksm to break COW on a ksm page: it's a stripped down
-+ *
-+ * if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1)
-+ * put_page(page);
-+ *
-+ * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
-+ * in case the application has unmapped and remapped mm,addr meanwhile.
-+ * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
-+ * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
-+ */
-+static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
-+{
-+ struct page *page;
-+ int ret = 0;
-+
-+ do {
-+ cond_resched();
-+ page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
-+ if (IS_ERR_OR_NULL(page))
-+ break;
-+ if (PageKsm(page)) {
-+ ret = handle_mm_fault(vma, addr,
-+ FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
-+ NULL);
-+ } else
-+ ret = VM_FAULT_WRITE;
-+ put_page(page);
-+ } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
-+ /*
-+ * We must loop because handle_mm_fault() may back out if there's
-+ * any difficulty e.g. if pte accessed bit gets updated concurrently.
-+ *
-+ * VM_FAULT_WRITE is what we have been hoping for: it indicates that
-+ * COW has been broken, even if the vma does not permit VM_WRITE;
-+ * but note that a concurrent fault might break PageKsm for us.
-+ *
-+ * VM_FAULT_SIGBUS could occur if we race with truncation of the
-+ * backing file, which also invalidates anonymous pages: that's
-+ * okay, that truncation will have unmapped the PageKsm for us.
-+ *
-+ * VM_FAULT_OOM: at the time of writing (late July 2009), setting
-+ * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
-+ * current task has TIF_MEMDIE set, and will be OOM killed on return
-+ * to user; and ksmd, having no mm, would never be chosen for that.
-+ *
-+ * But if the mm is in a limited mem_cgroup, then the fault may fail
-+ * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
-+ * even ksmd can fail in this way - though it's usually breaking ksm
-+ * just to undo a merge it made a moment before, so unlikely to oom.
-+ *
-+ * That's a pity: we might therefore have more kernel pages allocated
-+ * than we're counting as nodes in the stable tree; but uksm_do_scan
-+ * will retry to break_cow on each pass, so should recover the page
-+ * in due course. The important thing is to not let VM_MERGEABLE
-+ * be cleared while any such pages might remain in the area.
-+ */
-+ return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
-+}
-+
-+static void break_cow(struct rmap_item *rmap_item)
-+{
-+ struct vm_area_struct *vma = rmap_item->slot->vma;
-+ struct mm_struct *mm = vma->vm_mm;
-+ unsigned long addr = get_rmap_addr(rmap_item);
-+
-+ if (uksm_test_exit(mm))
-+ goto out;
-+
-+ break_ksm(vma, addr);
-+out:
-+ return;
-+}
-+
-+/*
-+ * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
-+ * than check every pte of a given vma, the locking doesn't quite work for
-+ * that - an rmap_item is assigned to the stable tree after inserting ksm
-+ * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
-+ * rmap_items from parent to child at fork time (so as not to waste time
-+ * if exit comes before the next scan reaches it).
-+ *
-+ * Similarly, although we'd like to remove rmap_items (so updating counts
-+ * and freeing memory) when unmerging an area, it's easier to leave that
-+ * to the next pass of ksmd - consider, for example, how ksmd might be
-+ * in cmp_and_merge_page on one of the rmap_items we would be removing.
-+ */
-+inline int unmerge_uksm_pages(struct vm_area_struct *vma,
-+ unsigned long start, unsigned long end)
-+{
-+ unsigned long addr;
-+ int err = 0;
-+
-+ for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
-+ if (uksm_test_exit(vma->vm_mm))
-+ break;
-+ if (signal_pending(current))
-+ err = -ERESTARTSYS;
-+ else
-+ err = break_ksm(vma, addr);
-+ }
-+ return err;
-+}
-+
-+static inline void inc_uksm_pages_scanned(void)
-+{
-+ u64 delta;
-+
-+
-+ if (uksm_pages_scanned == U64_MAX) {
-+ encode_benefit();
-+
-+ delta = uksm_pages_scanned >> pages_scanned_base;
-+
-+ if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) {
-+ pages_scanned_stored >>= 1;
-+ delta >>= 1;
-+ pages_scanned_base++;
-+ }
-+
-+ pages_scanned_stored += delta;
-+
-+ uksm_pages_scanned = uksm_pages_scanned_last = 0;
-+ }
-+
-+ uksm_pages_scanned++;
-+}
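When the live 64-bit counter saturates, the function above folds it into the (pages_scanned_stored, pages_scanned_base) pair. A minimal standalone sketch of that fold, under the illustrative assumption that the logical total is later read back as roughly stored shifted left by base plus the live counter, and with a plain overflow check standing in for CAN_OVERFLOW_U64():

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t stored = UINT64_MAX - 5;  /* pretend the stored part is nearly full too */
	uint64_t live = UINT64_MAX;        /* the live counter just saturated */
	unsigned int base = 0;
	uint64_t delta;

	delta = live >> base;              /* express the live counter in "stored" units */
	if (stored > UINT64_MAX - delta) { /* adding would overflow: halve both, bump base */
		stored >>= 1;
		delta >>= 1;
		base++;
	}
	stored += delta;
	printf("stored=%llu base=%u, live counter resets to 0\n",
	       (unsigned long long)stored, base);
	return 0;
}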
-+
-+static inline int find_zero_page_hash(int strength, u32 hash)
-+{
-+ return (zero_hash_table[strength] == hash);
-+}
-+
-+static
-+int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page)
-+{
-+ struct page *zero_page = empty_uksm_zero_page;
-+ struct mm_struct *mm = vma->vm_mm;
-+ pte_t orig_pte = __pte(0);
-+ int err = -EFAULT;
-+
-+ if (uksm_test_exit(mm))
-+ goto out;
-+
-+ if (!trylock_page(page))
-+ goto out;
-+
-+ if (!PageAnon(page))
-+ goto out_unlock;
-+
-+ if (PageTransCompound(page)) {
-+ err = split_huge_page(page);
-+ if (err)
-+ goto out_unlock;
-+ }
-+
-+ if (write_protect_page(vma, page, &orig_pte, 0) == 0) {
-+ if (is_page_full_zero(page))
-+ err = replace_page(vma, page, zero_page, orig_pte);
-+ }
-+
-+out_unlock:
-+ unlock_page(page);
-+out:
-+ return err;
-+}
-+
-+/*
-+ * cmp_and_merge_page() - first see if page can be merged into the stable
-+ * tree; if not, compare hash to previous and if it's the same, see if page
-+ * can be inserted into the unstable tree, or merged with a page already there
-+ * and both transferred to the stable tree.
-+ *
-+ * @page: the page for which we are searching an identical page.
-+ * @rmap_item: the reverse mapping into the virtual address of this page
-+ */
-+static void cmp_and_merge_page(struct rmap_item *rmap_item, u32 hash)
-+{
-+ struct rmap_item *tree_rmap_item;
-+ struct page *page;
-+ struct page *kpage = NULL;
-+ u32 hash_max;
-+ int err;
-+ unsigned int success1, success2;
-+ struct stable_node *snode;
-+ int cmp;
-+ struct rb_node *parent = NULL, **new;
-+
-+ remove_rmap_item_from_tree(rmap_item);
-+ page = rmap_item->page;
-+
-+ /* We first start with searching the page inside the stable tree */
-+ kpage = stable_tree_search(rmap_item, hash);
-+ if (kpage) {
-+ err = try_to_merge_with_uksm_page(rmap_item, kpage,
-+ hash);
-+ if (!err) {
-+ /*
-+ * The page was successfully merged, add
-+ * its rmap_item to the stable tree.
-+ * page lock is needed because it's
-+ * racing with try_to_unmap_ksm(), etc.
-+ */
-+ lock_page(kpage);
-+ snode = page_stable_node(kpage);
-+ stable_tree_append(rmap_item, snode, 1);
-+ unlock_page(kpage);
-+ put_page(kpage);
-+ return; /* success */
-+ }
-+ put_page(kpage);
-+
-+ /*
-+		 * If it's a collision and it has been searched in the sub-rbtree
-+		 * (hash_max != 0), we want to abort, because if it is
-+		 * successfully merged in the unstable tree, the collision tends
-+		 * to happen again.
-+ */
-+ if (err == MERGE_ERR_COLLI && rmap_item->hash_max)
-+ return;
-+ }
-+
-+ tree_rmap_item =
-+ unstable_tree_search_insert(rmap_item, hash);
-+ if (tree_rmap_item) {
-+ err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash);
-+ /*
-+ * As soon as we merge this page, we want to remove the
-+ * rmap_item of the page we have merged with from the unstable
-+ * tree, and insert it instead as new node in the stable tree.
-+ */
-+ if (!err) {
-+ kpage = page;
-+ remove_rmap_item_from_tree(tree_rmap_item);
-+ lock_page(kpage);
-+ snode = stable_tree_insert(&kpage, hash,
-+ rmap_item, tree_rmap_item,
-+ &success1, &success2);
-+
-+ /*
-+ * Do not log dedup for tree item, it's not counted as
-+ * scanned in this round.
-+ */
-+ if (success2)
-+ stable_tree_append(tree_rmap_item, snode, 0);
-+
-+ /*
-+			 * The order of these two stable_tree_append() calls is
-+			 * important: we are scanning rmap_item.
-+ */
-+ if (success1)
-+ stable_tree_append(rmap_item, snode, 1);
-+
-+ /*
-+ * The original kpage may be unlocked inside
-+ * stable_tree_insert() already. This page
-+ * should be unlocked before doing
-+ * break_cow().
-+ */
-+ unlock_page(kpage);
-+
-+ if (!success1)
-+ break_cow(rmap_item);
-+
-+ if (!success2)
-+ break_cow(tree_rmap_item);
-+
-+ } else if (err == MERGE_ERR_COLLI) {
-+ BUG_ON(tree_rmap_item->tree_node->count > 1);
-+
-+ rmap_item_hash_max(tree_rmap_item,
-+ tree_rmap_item->tree_node->hash);
-+
-+ hash_max = rmap_item_hash_max(rmap_item, hash);
-+ cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
-+ parent = &tree_rmap_item->node;
-+ if (cmp < 0)
-+ new = &parent->rb_left;
-+ else if (cmp > 0)
-+ new = &parent->rb_right;
-+ else
-+ goto put_up_out;
-+
-+ rmap_item->tree_node = tree_rmap_item->tree_node;
-+ rmap_item->address |= UNSTABLE_FLAG;
-+ rmap_item->hash_round = uksm_hash_round;
-+ rb_link_node(&rmap_item->node, parent, new);
-+ rb_insert_color(&rmap_item->node,
-+ &tree_rmap_item->tree_node->sub_root);
-+ rmap_item->tree_node->count++;
-+ } else {
-+ /*
-+ * either one of the page has changed or they collide
-+ * at the max hash, we consider them as ill items.
-+ */
-+ remove_rmap_item_from_tree(tree_rmap_item);
-+ }
-+put_up_out:
-+ put_page(tree_rmap_item->page);
-+ mmap_read_unlock(tree_rmap_item->slot->vma->vm_mm);
-+ }
-+}
-+
-+
-+
-+
-+static inline unsigned long get_pool_index(struct vma_slot *slot,
-+ unsigned long index)
-+{
-+ unsigned long pool_index;
-+
-+ pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT;
-+ if (pool_index >= slot->pool_size)
-+ BUG();
-+ return pool_index;
-+}
-+
-+static inline unsigned long index_page_offset(unsigned long index)
-+{
-+ return offset_in_page(sizeof(struct rmap_list_entry *) * index);
-+}
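The two helpers above map a per-VMA page index onto a pool page plus an offset inside it. A minimal standalone sketch of that arithmetic, assuming 4 KiB pages and 8-byte pointers purely for illustration, under which each pool page holds 512 entries and index 1000 lands in pool page 1 at byte offset 3904:

#include <stdio.h>

#define PAGE_SHIFT 12                       /* assumed 4 KiB pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define ENTRY_SIZE 8UL                      /* assumed sizeof(struct rmap_list_entry *) */

int main(void)
{
	unsigned long index = 1000;
	unsigned long byte_off = ENTRY_SIZE * index;
	unsigned long pool_idx = byte_off >> PAGE_SHIFT;     /* which pool page */
	unsigned long page_off = byte_off & (PAGE_SIZE - 1); /* offset inside it */

	printf("index %lu -> pool page %lu, offset %lu (%lu entries per pool page)\n",
	       index, pool_idx, page_off, PAGE_SIZE / ENTRY_SIZE);
	return 0;
}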
-+
-+static inline
-+struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot,
-+ unsigned long index, int need_alloc)
-+{
-+ unsigned long pool_index;
-+ struct page *page;
-+ void *addr;
-+
-+
-+ pool_index = get_pool_index(slot, index);
-+ if (!slot->rmap_list_pool[pool_index]) {
-+ if (!need_alloc)
-+ return NULL;
-+
-+ page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN);
-+ if (!page)
-+ return NULL;
-+
-+ slot->rmap_list_pool[pool_index] = page;
-+ }
-+
-+ addr = kmap(slot->rmap_list_pool[pool_index]);
-+ addr += index_page_offset(index);
-+
-+ return addr;
-+}
-+
-+static inline void put_rmap_list_entry(struct vma_slot *slot,
-+ unsigned long index)
-+{
-+ unsigned long pool_index;
-+
-+ pool_index = get_pool_index(slot, index);
-+ BUG_ON(!slot->rmap_list_pool[pool_index]);
-+ kunmap(slot->rmap_list_pool[pool_index]);
-+}
-+
-+static inline int entry_is_new(struct rmap_list_entry *entry)
-+{
-+ return !entry->item;
-+}
-+
-+static inline unsigned long get_index_orig_addr(struct vma_slot *slot,
-+ unsigned long index)
-+{
-+ return slot->vma->vm_start + (index << PAGE_SHIFT);
-+}
-+
-+static inline unsigned long get_entry_address(struct rmap_list_entry *entry)
-+{
-+ unsigned long addr;
-+
-+ if (is_addr(entry->addr))
-+ addr = get_clean_addr(entry->addr);
-+ else if (entry->item)
-+ addr = get_rmap_addr(entry->item);
-+ else
-+ BUG();
-+
-+ return addr;
-+}
-+
-+static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry)
-+{
-+ if (is_addr(entry->addr))
-+ return NULL;
-+
-+ return entry->item;
-+}
-+
-+static inline void inc_rmap_list_pool_count(struct vma_slot *slot,
-+ unsigned long index)
-+{
-+ unsigned long pool_index;
-+
-+ pool_index = get_pool_index(slot, index);
-+ BUG_ON(!slot->rmap_list_pool[pool_index]);
-+ slot->pool_counts[pool_index]++;
-+}
-+
-+static inline void dec_rmap_list_pool_count(struct vma_slot *slot,
-+ unsigned long index)
-+{
-+ unsigned long pool_index;
-+
-+ pool_index = get_pool_index(slot, index);
-+ BUG_ON(!slot->rmap_list_pool[pool_index]);
-+ BUG_ON(!slot->pool_counts[pool_index]);
-+ slot->pool_counts[pool_index]--;
-+}
-+
-+static inline int entry_has_rmap(struct rmap_list_entry *entry)
-+{
-+ return !is_addr(entry->addr) && entry->item;
-+}
-+
-+static inline void swap_entries(struct rmap_list_entry *entry1,
-+ unsigned long index1,
-+ struct rmap_list_entry *entry2,
-+ unsigned long index2)
-+{
-+ struct rmap_list_entry tmp;
-+
-+ /* swapping two new entries is meaningless */
-+ BUG_ON(entry_is_new(entry1) && entry_is_new(entry2));
-+
-+ tmp = *entry1;
-+ *entry1 = *entry2;
-+ *entry2 = tmp;
-+
-+ if (entry_has_rmap(entry1))
-+ entry1->item->entry_index = index1;
-+
-+ if (entry_has_rmap(entry2))
-+ entry2->item->entry_index = index2;
-+
-+ if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) {
-+ inc_rmap_list_pool_count(entry1->item->slot, index1);
-+ dec_rmap_list_pool_count(entry1->item->slot, index2);
-+ } else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) {
-+ inc_rmap_list_pool_count(entry2->item->slot, index2);
-+ dec_rmap_list_pool_count(entry2->item->slot, index1);
-+ }
-+}
-+
-+static inline void free_entry_item(struct rmap_list_entry *entry)
-+{
-+ unsigned long index;
-+ struct rmap_item *item;
-+
-+ if (!is_addr(entry->addr)) {
-+ BUG_ON(!entry->item);
-+ item = entry->item;
-+ entry->addr = get_rmap_addr(item);
-+ set_is_addr(entry->addr);
-+ index = item->entry_index;
-+ remove_rmap_item_from_tree(item);
-+ dec_rmap_list_pool_count(item->slot, index);
-+ free_rmap_item(item);
-+ }
-+}
-+
-+static inline int pool_entry_boundary(unsigned long index)
-+{
-+ unsigned long linear_addr;
-+
-+ linear_addr = sizeof(struct rmap_list_entry *) * index;
-+ return index && !offset_in_page(linear_addr);
-+}
-+
-+static inline void try_free_last_pool(struct vma_slot *slot,
-+ unsigned long index)
-+{
-+ unsigned long pool_index;
-+
-+ pool_index = get_pool_index(slot, index);
-+ if (slot->rmap_list_pool[pool_index] &&
-+ !slot->pool_counts[pool_index]) {
-+ __free_page(slot->rmap_list_pool[pool_index]);
-+ slot->rmap_list_pool[pool_index] = NULL;
-+ slot->flags |= UKSM_SLOT_NEED_SORT;
-+ }
-+
-+}
-+
-+static inline unsigned long vma_item_index(struct vm_area_struct *vma,
-+ struct rmap_item *item)
-+{
-+ return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT;
-+}
-+
-+static int within_same_pool(struct vma_slot *slot,
-+ unsigned long i, unsigned long j)
-+{
-+ unsigned long pool_i, pool_j;
-+
-+ pool_i = get_pool_index(slot, i);
-+ pool_j = get_pool_index(slot, j);
-+
-+ return (pool_i == pool_j);
-+}
-+
-+static void sort_rmap_entry_list(struct vma_slot *slot)
-+{
-+ unsigned long i, j;
-+ struct rmap_list_entry *entry, *swap_entry;
-+
-+ entry = get_rmap_list_entry(slot, 0, 0);
-+ for (i = 0; i < slot->pages; ) {
-+
-+ if (!entry)
-+ goto skip_whole_pool;
-+
-+ if (entry_is_new(entry))
-+ goto next_entry;
-+
-+ if (is_addr(entry->addr)) {
-+ entry->addr = 0;
-+ goto next_entry;
-+ }
-+
-+ j = vma_item_index(slot->vma, entry->item);
-+ if (j == i)
-+ goto next_entry;
-+
-+ if (within_same_pool(slot, i, j))
-+ swap_entry = entry + j - i;
-+ else
-+ swap_entry = get_rmap_list_entry(slot, j, 1);
-+
-+ swap_entries(entry, i, swap_entry, j);
-+ if (!within_same_pool(slot, i, j))
-+ put_rmap_list_entry(slot, j);
-+ continue;
-+
-+skip_whole_pool:
-+ i += PAGE_SIZE / sizeof(*entry);
-+ if (i < slot->pages)
-+ entry = get_rmap_list_entry(slot, i, 0);
-+ continue;
-+
-+next_entry:
-+ if (i >= slot->pages - 1 ||
-+ !within_same_pool(slot, i, i + 1)) {
-+ put_rmap_list_entry(slot, i);
-+ if (i + 1 < slot->pages)
-+ entry = get_rmap_list_entry(slot, i + 1, 0);
-+ } else
-+ entry++;
-+ i++;
-+ continue;
-+ }
-+
-+ /* free empty pool entries which contain no rmap_item */
-+	/* CAN be simplified to rely only on pool_counts once this is bug-free !!!!! */
-+ for (i = 0; i < slot->pool_size; i++) {
-+ unsigned char has_rmap;
-+ void *addr;
-+
-+ if (!slot->rmap_list_pool[i])
-+ continue;
-+
-+ has_rmap = 0;
-+ addr = kmap(slot->rmap_list_pool[i]);
-+ BUG_ON(!addr);
-+ for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
-+ entry = (struct rmap_list_entry *)addr + j;
-+ if (is_addr(entry->addr))
-+ continue;
-+ if (!entry->item)
-+ continue;
-+ has_rmap = 1;
-+ }
-+ kunmap(slot->rmap_list_pool[i]);
-+ if (!has_rmap) {
-+ BUG_ON(slot->pool_counts[i]);
-+ __free_page(slot->rmap_list_pool[i]);
-+ slot->rmap_list_pool[i] = NULL;
-+ }
-+ }
-+
-+ slot->flags &= ~UKSM_SLOT_NEED_SORT;
-+}
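sort_rmap_entry_list() above is essentially an in-place permutation sort: each entry knows its target index (vma_item_index()), so repeatedly swapping the current entry to where it belongs needs only O(n) swaps. A minimal standalone sketch of the same idea, with a plain int array standing in for the pooled entries:

#include <stdio.h>

int main(void)
{
	int a[6] = { 3, 0, 4, 1, 5, 2 };	/* a[i] doubles as the target index of a[i] */
	int i = 0, j, tmp;

	while (i < 6) {
		j = a[i];			/* where this element wants to live */
		if (j == i) {
			i++;
			continue;
		}
		tmp = a[i];
		a[i] = a[j];
		a[j] = tmp;
	}
	for (i = 0; i < 6; i++)
		printf("%d ", a[i]);		/* prints 0 1 2 3 4 5 */
	printf("\n");
	return 0;
}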
-+
-+/*
-+ * vma_fully_scanned() - return whether all the pages in this slot have been scanned.
-+ */
-+static inline int vma_fully_scanned(struct vma_slot *slot)
-+{
-+ return slot->pages_scanned == slot->pages;
-+}
-+
-+/**
-+ * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to
-+ * its random permutation. This function also embeds the random-permutation
-+ * index management code.
-+ */
-+static struct rmap_item *get_next_rmap_item(struct vma_slot *slot, u32 *hash)
-+{
-+ unsigned long rand_range, addr, swap_index, scan_index;
-+ struct rmap_item *item = NULL;
-+ struct rmap_list_entry *scan_entry, *swap_entry = NULL;
-+ struct page *page;
-+
-+ scan_index = swap_index = slot->pages_scanned % slot->pages;
-+
-+ if (pool_entry_boundary(scan_index))
-+ try_free_last_pool(slot, scan_index - 1);
-+
-+ if (vma_fully_scanned(slot)) {
-+ if (slot->flags & UKSM_SLOT_NEED_SORT)
-+ slot->flags |= UKSM_SLOT_NEED_RERAND;
-+ else
-+ slot->flags &= ~UKSM_SLOT_NEED_RERAND;
-+ if (slot->flags & UKSM_SLOT_NEED_SORT)
-+ sort_rmap_entry_list(slot);
-+ }
-+
-+ scan_entry = get_rmap_list_entry(slot, scan_index, 1);
-+ if (!scan_entry)
-+ return NULL;
-+
-+ if (entry_is_new(scan_entry)) {
-+ scan_entry->addr = get_index_orig_addr(slot, scan_index);
-+ set_is_addr(scan_entry->addr);
-+ }
-+
-+ if (slot->flags & UKSM_SLOT_NEED_RERAND) {
-+ rand_range = slot->pages - scan_index;
-+ BUG_ON(!rand_range);
-+ swap_index = scan_index + (prandom_u32() % rand_range);
-+ }
-+
-+ if (swap_index != scan_index) {
-+ swap_entry = get_rmap_list_entry(slot, swap_index, 1);
-+
-+ if (!swap_entry)
-+ return NULL;
-+
-+ if (entry_is_new(swap_entry)) {
-+ swap_entry->addr = get_index_orig_addr(slot,
-+ swap_index);
-+ set_is_addr(swap_entry->addr);
-+ }
-+ swap_entries(scan_entry, scan_index, swap_entry, swap_index);
-+ }
-+
-+ addr = get_entry_address(scan_entry);
-+ item = get_entry_item(scan_entry);
-+ BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start);
-+
-+ page = follow_page(slot->vma, addr, FOLL_GET);
-+ if (IS_ERR_OR_NULL(page))
-+ goto nopage;
-+
-+ if (!PageAnon(page))
-+ goto putpage;
-+
-+	/* check whether this is the zero_page pfn or the uksm zero page */
-+ if ((page_to_pfn(page) == zero_pfn)
-+ || (page_to_pfn(page) == uksm_zero_pfn))
-+ goto putpage;
-+
-+ flush_anon_page(slot->vma, page, addr);
-+ flush_dcache_page(page);
-+
-+
-+ *hash = page_hash(page, hash_strength, 1);
-+ inc_uksm_pages_scanned();
-+	/* if the page content is all zero, re-map it to the zero page */
-+ if (find_zero_page_hash(hash_strength, *hash)) {
-+ if (!cmp_and_merge_zero_page(slot->vma, page)) {
-+ slot->pages_merged++;
-+
-+ /* For full-zero pages, no need to create rmap item */
-+ goto putpage;
-+ } else {
-+ inc_rshash_neg(memcmp_cost / 2);
-+ }
-+ }
-+
-+ if (!item) {
-+ item = alloc_rmap_item();
-+ if (item) {
-+ /* It has already been zeroed */
-+ item->slot = slot;
-+ item->address = addr;
-+ item->entry_index = scan_index;
-+ scan_entry->item = item;
-+ inc_rmap_list_pool_count(slot, scan_index);
-+ } else
-+ goto putpage;
-+ }
-+
-+ BUG_ON(item->slot != slot);
-+ /* the page may have changed */
-+ item->page = page;
-+ put_rmap_list_entry(slot, scan_index);
-+ if (swap_entry)
-+ put_rmap_list_entry(slot, swap_index);
-+ return item;
-+
-+putpage:
-+ put_page(page);
-+ page = NULL;
-+nopage:
-+ /* no page, store addr back and free rmap_item if possible */
-+ free_entry_item(scan_entry);
-+ put_rmap_list_entry(slot, scan_index);
-+ if (swap_entry)
-+ put_rmap_list_entry(slot, swap_index);
-+ return NULL;
-+}
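The scan_index/swap_index dance in get_next_rmap_item() above is an incremental Fisher-Yates shuffle: each call swaps the next sequential slot with a randomly chosen not-yet-scanned slot, so every page offset in the VMA is visited exactly once per pass, in random order. A minimal standalone sketch, with rand() standing in for prandom_u32():

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	int order[8];
	int i, j, tmp;

	for (i = 0; i < 8; i++)
		order[i] = i;			/* identity permutation to start */

	for (i = 0; i < 8; i++) {
		j = i + rand() % (8 - i);	/* pick a not-yet-scanned slot at random */
		tmp = order[i];
		order[i] = order[j];
		order[j] = tmp;
		printf("scan page index %d\n", order[i]);
	}
	return 0;
}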
-+
-+static inline int in_stable_tree(struct rmap_item *rmap_item)
-+{
-+ return rmap_item->address & STABLE_FLAG;
-+}
-+
-+/**
-+ * scan_vma_one_page() - scan the next page in a vma_slot. Called with
-+ * mmap_sem locked.
-+ */
-+static noinline void scan_vma_one_page(struct vma_slot *slot)
-+{
-+ u32 hash;
-+ struct mm_struct *mm;
-+ struct rmap_item *rmap_item = NULL;
-+ struct vm_area_struct *vma = slot->vma;
-+
-+ mm = vma->vm_mm;
-+ BUG_ON(!mm);
-+ BUG_ON(!slot);
-+
-+ rmap_item = get_next_rmap_item(slot, &hash);
-+ if (!rmap_item)
-+ goto out1;
-+
-+ if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item))
-+ goto out2;
-+
-+ cmp_and_merge_page(rmap_item, hash);
-+out2:
-+ put_page(rmap_item->page);
-+out1:
-+ slot->pages_scanned++;
-+ slot->this_sampled++;
-+ if (slot->fully_scanned_round != fully_scanned_round)
-+ scanned_virtual_pages++;
-+
-+ if (vma_fully_scanned(slot))
-+ slot->fully_scanned_round = fully_scanned_round;
-+}
-+
-+static inline unsigned long rung_get_pages(struct scan_rung *rung)
-+{
-+ struct slot_tree_node *node;
-+
-+ if (!rung->vma_root.rnode)
-+ return 0;
-+
-+ node = container_of(rung->vma_root.rnode, struct slot_tree_node, snode);
-+
-+ return node->size;
-+}
-+
-+#define RUNG_SAMPLED_MIN 3
-+
-+static inline
-+void uksm_calc_rung_step(struct scan_rung *rung,
-+ unsigned long page_time, unsigned long ratio)
-+{
-+ unsigned long sampled, pages;
-+
-+	/* will it be fully scanned? */
-+ if (!rung->cover_msecs) {
-+ rung->step = 1;
-+ return;
-+ }
-+
-+ sampled = rung->cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE)
-+ * ratio / page_time;
-+
-+ /*
-+	 * Before we finish a scan round and its expensive per-round jobs,
-+	 * we need a chance to estimate the per-page time. So the sampled
-+	 * number cannot be too small.
-+ */
-+ if (sampled < RUNG_SAMPLED_MIN)
-+ sampled = RUNG_SAMPLED_MIN;
-+
-+ pages = rung_get_pages(rung);
-+ if (likely(pages > sampled))
-+ rung->step = pages / sampled;
-+ else
-+ rung->step = 1;
-+}
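A minimal standalone sketch of the sampling arithmetic above, with assumed values (TIME_RATIO_SCALE of 1000, a 500 ms cover interval, a ~20% ratio, ~2 us per scanned page, a one-million-page rung): the rung then samples 50,000 pages per interval, i.e. every 20th page.

#include <stdio.h>

#define TIME_RATIO_SCALE 1000               /* assumed scale */
#define NSEC_PER_MSEC    1000000UL
#define RUNG_SAMPLED_MIN 3

int main(void)
{
	unsigned long cover_msecs = 500;    /* desired eval interval for the rung */
	unsigned long ratio = 200;          /* ~20% cpu */
	unsigned long page_time = 2000;     /* ~2 us spent per scanned page */
	unsigned long pages = 1000000;      /* pages currently on the rung */
	unsigned long sampled, step;

	sampled = cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE) * ratio / page_time;
	if (sampled < RUNG_SAMPLED_MIN)
		sampled = RUNG_SAMPLED_MIN;

	step = pages > sampled ? pages / sampled : 1;
	printf("sample %lu pages per interval -> scan every %luth page\n", sampled, step);
	return 0;
}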
-+
-+static inline int step_need_recalc(struct scan_rung *rung)
-+{
-+ unsigned long pages, stepmax;
-+
-+ pages = rung_get_pages(rung);
-+ stepmax = pages / RUNG_SAMPLED_MIN;
-+
-+ return pages && (rung->step > pages ||
-+ (stepmax && rung->step > stepmax));
-+}
-+
-+static inline
-+void reset_current_scan(struct scan_rung *rung, int finished, int step_recalc)
-+{
-+ struct vma_slot *slot;
-+
-+ if (finished)
-+ rung->flags |= UKSM_RUNG_ROUND_FINISHED;
-+
-+ if (step_recalc || step_need_recalc(rung)) {
-+ uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
-+ BUG_ON(step_need_recalc(rung));
-+ }
-+
-+ slot_iter_index = prandom_u32() % rung->step;
-+ BUG_ON(!rung->vma_root.rnode);
-+ slot = sradix_tree_next(&rung->vma_root, NULL, 0, slot_iter);
-+ BUG_ON(!slot);
-+
-+ rung->current_scan = slot;
-+ rung->current_offset = slot_iter_index;
-+}
-+
-+static inline struct sradix_tree_root *slot_get_root(struct vma_slot *slot)
-+{
-+ return &slot->rung->vma_root;
-+}
-+
-+/*
-+ * Return whether the scan position was reset.
-+ */
-+static int advance_current_scan(struct scan_rung *rung)
-+{
-+ unsigned short n;
-+ struct vma_slot *slot, *next = NULL;
-+
-+ BUG_ON(!rung->vma_root.num);
-+
-+ slot = rung->current_scan;
-+ n = (slot->pages - rung->current_offset) % rung->step;
-+ slot_iter_index = rung->step - n;
-+ next = sradix_tree_next(&rung->vma_root, slot->snode,
-+ slot->sindex, slot_iter);
-+
-+ if (next) {
-+ rung->current_offset = slot_iter_index;
-+ rung->current_scan = next;
-+ return 0;
-+ } else {
-+ reset_current_scan(rung, 1, 0);
-+ return 1;
-+ }
-+}
-+
-+static inline void rung_rm_slot(struct vma_slot *slot)
-+{
-+ struct scan_rung *rung = slot->rung;
-+ struct sradix_tree_root *root;
-+
-+ if (rung->current_scan == slot)
-+ advance_current_scan(rung);
-+
-+ root = slot_get_root(slot);
-+ sradix_tree_delete_from_leaf(root, slot->snode, slot->sindex);
-+ slot->snode = NULL;
-+ if (step_need_recalc(rung)) {
-+ uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
-+ BUG_ON(step_need_recalc(rung));
-+ }
-+
-+	/* In case advance_current_scan() looped back to this slot again */
-+ if (rung->vma_root.num && rung->current_scan == slot)
-+ reset_current_scan(slot->rung, 1, 0);
-+}
-+
-+static inline void rung_add_new_slots(struct scan_rung *rung,
-+ struct vma_slot **slots, unsigned long num)
-+{
-+ int err;
-+ struct vma_slot *slot;
-+ unsigned long i;
-+ struct sradix_tree_root *root = &rung->vma_root;
-+
-+ err = sradix_tree_enter(root, (void **)slots, num);
-+ BUG_ON(err);
-+
-+ for (i = 0; i < num; i++) {
-+ slot = slots[i];
-+ slot->rung = rung;
-+ BUG_ON(vma_fully_scanned(slot));
-+ }
-+
-+ if (rung->vma_root.num == num)
-+ reset_current_scan(rung, 0, 1);
-+}
-+
-+static inline int rung_add_one_slot(struct scan_rung *rung,
-+ struct vma_slot *slot)
-+{
-+ int err;
-+
-+ err = sradix_tree_enter(&rung->vma_root, (void **)&slot, 1);
-+ if (err)
-+ return err;
-+
-+ slot->rung = rung;
-+ if (rung->vma_root.num == 1)
-+ reset_current_scan(rung, 0, 1);
-+
-+ return 0;
-+}
-+
-+/*
-+ * Return true if the slot is deleted from its rung.
-+ */
-+static inline int vma_rung_enter(struct vma_slot *slot, struct scan_rung *rung)
-+{
-+ struct scan_rung *old_rung = slot->rung;
-+ int err;
-+
-+ if (old_rung == rung)
-+ return 0;
-+
-+ rung_rm_slot(slot);
-+ err = rung_add_one_slot(rung, slot);
-+ if (err) {
-+ err = rung_add_one_slot(old_rung, slot);
-+ WARN_ON(err); /* OOPS, badly OOM, we lost this slot */
-+ }
-+
-+ return 1;
-+}
-+
-+static inline int vma_rung_up(struct vma_slot *slot)
-+{
-+ struct scan_rung *rung;
-+
-+ rung = slot->rung;
-+ if (slot->rung != &uksm_scan_ladder[SCAN_LADDER_SIZE-1])
-+ rung++;
-+
-+ return vma_rung_enter(slot, rung);
-+}
-+
-+static inline int vma_rung_down(struct vma_slot *slot)
-+{
-+ struct scan_rung *rung;
-+
-+ rung = slot->rung;
-+ if (slot->rung != &uksm_scan_ladder[0])
-+ rung--;
-+
-+ return vma_rung_enter(slot, rung);
-+}
-+
-+/**
-+ * cal_dedup_ratio() - Calculate the deduplication ratio for this slot.
-+ */
-+static unsigned long cal_dedup_ratio(struct vma_slot *slot)
-+{
-+ unsigned long ret;
-+ unsigned long pages;
-+
-+ pages = slot->this_sampled;
-+ if (!pages)
-+ return 0;
-+
-+ BUG_ON(slot->pages_scanned == slot->last_scanned);
-+
-+ ret = slot->pages_merged;
-+
-+ /* Thrashing area filtering */
-+ if (ret && uksm_thrash_threshold) {
-+ if (slot->pages_cowed * 100 / slot->pages_merged
-+ > uksm_thrash_threshold) {
-+ ret = 0;
-+ } else {
-+ ret = slot->pages_merged - slot->pages_cowed;
-+ }
-+ }
-+
-+ return ret * 100 / pages;
-+}
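A minimal standalone sketch of the ratio and thrash filtering above, with made-up sample counts and an assumed uksm_thrash_threshold of 10 percent:

#include <stdio.h>

static unsigned long dedup_ratio(unsigned long sampled, unsigned long merged,
				 unsigned long cowed, unsigned long thrash)
{
	unsigned long ret = merged;

	if (!sampled)
		return 0;
	if (ret && thrash) {
		if (cowed * 100 / merged > thrash)
			ret = 0;		/* the area mostly thrashes, ignore it */
		else
			ret = merged - cowed;
	}
	return ret * 100 / sampled;
}

int main(void)
{
	/* 300 of 1000 sampled pages merged, threshold 10% */
	printf("%lu%%\n", dedup_ratio(1000, 300, 60, 10));	/* 60*100/300 = 20 > 10 -> 0 */
	printf("%lu%%\n", dedup_ratio(1000, 300, 20, 10));	/* (300 - 20) * 100 / 1000 = 28 */
	return 0;
}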
-+
-+/**
-+ * cal_dedup_ratio_old() - Calculate the deduplication ratio for this slot.
-+ */
-+static unsigned long cal_dedup_ratio_old(struct vma_slot *slot)
-+{
-+ unsigned long ret;
-+ unsigned long pages;
-+
-+ pages = slot->pages;
-+ if (!pages)
-+ return 0;
-+
-+ ret = slot->pages_bemerged;
-+
-+ /* Thrashing area filtering */
-+ if (ret && uksm_thrash_threshold) {
-+ if (slot->pages_cowed * 100 / slot->pages_bemerged
-+ > uksm_thrash_threshold) {
-+ ret = 0;
-+ } else {
-+ ret = slot->pages_bemerged - slot->pages_cowed;
-+ }
-+ }
-+
-+ return ret * 100 / pages;
-+}
-+
-+/**
-+ * stable_node_reinsert() - When the hash_strength has been adjusted, the
-+ * stable tree needs to be restructured; this is the function re-inserting
-+ * the stable node.
-+ */
-+static inline void stable_node_reinsert(struct stable_node *new_node,
-+ struct page *page,
-+ struct rb_root *root_treep,
-+ struct list_head *tree_node_listp,
-+ u32 hash)
-+{
-+ struct rb_node **new = &root_treep->rb_node;
-+ struct rb_node *parent = NULL;
-+ struct stable_node *stable_node;
-+ struct tree_node *tree_node;
-+ struct page *tree_page;
-+ int cmp;
-+
-+ while (*new) {
-+ int cmp;
-+
-+ tree_node = rb_entry(*new, struct tree_node, node);
-+
-+ cmp = hash_cmp(hash, tree_node->hash);
-+
-+ if (cmp < 0) {
-+ parent = *new;
-+ new = &parent->rb_left;
-+ } else if (cmp > 0) {
-+ parent = *new;
-+ new = &parent->rb_right;
-+ } else
-+ break;
-+ }
-+
-+ if (*new) {
-+ /* find a stable tree node with same first level hash value */
-+ stable_node_hash_max(new_node, page, hash);
-+ if (tree_node->count == 1) {
-+ stable_node = rb_entry(tree_node->sub_root.rb_node,
-+ struct stable_node, node);
-+ tree_page = get_uksm_page(stable_node, 1, 0);
-+ if (tree_page) {
-+ stable_node_hash_max(stable_node,
-+ tree_page, hash);
-+ put_page(tree_page);
-+
-+ /* prepare for stable node insertion */
-+
-+ cmp = hash_cmp(new_node->hash_max,
-+ stable_node->hash_max);
-+ parent = &stable_node->node;
-+ if (cmp < 0)
-+ new = &parent->rb_left;
-+ else if (cmp > 0)
-+ new = &parent->rb_right;
-+ else
-+ goto failed;
-+
-+ goto add_node;
-+ } else {
-+ /* the only stable_node deleted, the tree node
-+				/* the only stable_node was deleted, but the
-+				 * tree node was not.
-+ goto tree_node_reuse;
-+ }
-+ }
-+
-+ /* well, search the collision subtree */
-+ new = &tree_node->sub_root.rb_node;
-+ parent = NULL;
-+ BUG_ON(!*new);
-+ while (*new) {
-+ int cmp;
-+
-+ stable_node = rb_entry(*new, struct stable_node, node);
-+
-+ cmp = hash_cmp(new_node->hash_max,
-+ stable_node->hash_max);
-+
-+ if (cmp < 0) {
-+ parent = *new;
-+ new = &parent->rb_left;
-+ } else if (cmp > 0) {
-+ parent = *new;
-+ new = &parent->rb_right;
-+ } else {
-+ /* oh, no, still a collision */
-+ goto failed;
-+ }
-+ }
-+
-+ goto add_node;
-+ }
-+
-+ /* no tree node found */
-+ tree_node = alloc_tree_node(tree_node_listp);
-+ if (!tree_node) {
-+ pr_err("UKSM: memory allocation error!\n");
-+ goto failed;
-+ } else {
-+ tree_node->hash = hash;
-+ rb_link_node(&tree_node->node, parent, new);
-+ rb_insert_color(&tree_node->node, root_treep);
-+
-+tree_node_reuse:
-+ /* prepare for stable node insertion */
-+ parent = NULL;
-+ new = &tree_node->sub_root.rb_node;
-+ }
-+
-+add_node:
-+ rb_link_node(&new_node->node, parent, new);
-+ rb_insert_color(&new_node->node, &tree_node->sub_root);
-+ new_node->tree_node = tree_node;
-+ tree_node->count++;
-+ return;
-+
-+failed:
-+ /* This can only happen when two nodes have collided
-+ * in two levels.
-+ */
-+ new_node->tree_node = NULL;
-+ return;
-+}
-+
-+static inline void free_all_tree_nodes(struct list_head *list)
-+{
-+ struct tree_node *node, *tmp;
-+
-+ list_for_each_entry_safe(node, tmp, list, all_list) {
-+ free_tree_node(node);
-+ }
-+}
-+
-+/**
-+ * stable_tree_delta_hash() - Delta hash the stable tree from previous hash
-+ * strength to the current hash_strength. It re-structures the whole tree.
-+ */
-+static inline void stable_tree_delta_hash(u32 prev_hash_strength)
-+{
-+ struct stable_node *node, *tmp;
-+ struct rb_root *root_new_treep;
-+ struct list_head *new_tree_node_listp;
-+
-+ stable_tree_index = (stable_tree_index + 1) % 2;
-+ root_new_treep = &root_stable_tree[stable_tree_index];
-+ new_tree_node_listp = &stable_tree_node_list[stable_tree_index];
-+ *root_new_treep = RB_ROOT;
-+ BUG_ON(!list_empty(new_tree_node_listp));
-+
-+ /*
-+ * we need to be safe, the node could be removed by get_uksm_page()
-+ */
-+ list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) {
-+ void *addr;
-+ struct page *node_page;
-+ u32 hash;
-+
-+ /*
-+ * We are completely re-structuring the stable nodes to a new
-+ * stable tree. We don't want to touch the old tree unlinks and
-+ * old tree_nodes. The old tree_nodes will be freed at once.
-+ */
-+ node_page = get_uksm_page(node, 0, 0);
-+ if (!node_page)
-+ continue;
-+
-+ if (node->tree_node) {
-+ hash = node->tree_node->hash;
-+
-+ addr = kmap_atomic(node_page);
-+
-+ hash = delta_hash(addr, prev_hash_strength,
-+ hash_strength, hash);
-+ kunmap_atomic(addr);
-+ } else {
-+ /*
-+			 * It was not inserted into the rbtree due to a
-+			 * collision in the last round's scan.
-+ */
-+ hash = page_hash(node_page, hash_strength, 0);
-+ }
-+
-+ stable_node_reinsert(node, node_page, root_new_treep,
-+ new_tree_node_listp, hash);
-+ put_page(node_page);
-+ }
-+
-+ root_stable_treep = root_new_treep;
-+ free_all_tree_nodes(stable_tree_node_listp);
-+ BUG_ON(!list_empty(stable_tree_node_listp));
-+ stable_tree_node_listp = new_tree_node_listp;
-+}
-+
-+static inline void inc_hash_strength(unsigned long delta)
-+{
-+ hash_strength += 1 << delta;
-+ if (hash_strength > HASH_STRENGTH_MAX)
-+ hash_strength = HASH_STRENGTH_MAX;
-+}
-+
-+static inline void dec_hash_strength(unsigned long delta)
-+{
-+ unsigned long change = 1 << delta;
-+
-+ if (hash_strength <= change + 1)
-+ hash_strength = 1;
-+ else
-+ hash_strength -= change;
-+}
-+
-+static inline void inc_hash_strength_delta(void)
-+{
-+ hash_strength_delta++;
-+ if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX)
-+ hash_strength_delta = HASH_STRENGTH_DELTA_MAX;
-+}
-+
-+static inline unsigned long get_current_neg_ratio(void)
-+{
-+ u64 pos = benefit.pos;
-+ u64 neg = benefit.neg;
-+
-+ if (!neg)
-+ return 0;
-+
-+ if (!pos || neg > pos)
-+ return 100;
-+
-+ if (neg > div64_u64(U64_MAX, 100))
-+ pos = div64_u64(pos, 100);
-+ else
-+ neg *= 100;
-+
-+ return div64_u64(neg, pos);
-+}
-+
-+static inline unsigned long get_current_benefit(void)
-+{
-+ u64 pos = benefit.pos;
-+ u64 neg = benefit.neg;
-+ u64 scanned = benefit.scanned;
-+
-+ if (neg > pos)
-+ return 0;
-+
-+ return div64_u64((pos - neg), scanned);
-+}
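A minimal standalone sketch of the benefit bookkeeping that the two helpers above read, with made-up totals; the real code additionally guards this arithmetic against 64-bit overflow with div64_u64() and the neg/pos checks:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t pos = 4000000;      /* accumulated positive benefit */
	uint64_t neg = 500000;       /* accumulated hashing/memcmp cost */
	uint64_t scanned = 100000;   /* pages scanned since the last reset */

	uint64_t neg_ratio = neg * 100 / pos;        /* 12, well below the 90% "go up" mark */
	uint64_t benefit = (pos - neg) / scanned;    /* 35 benefit units per scanned page */

	printf("neg ratio %llu%%, benefit %llu\n",
	       (unsigned long long)neg_ratio, (unsigned long long)benefit);
	return 0;
}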
-+
-+static inline int judge_rshash_direction(void)
-+{
-+ u64 current_neg_ratio, stable_benefit;
-+ u64 current_benefit, delta = 0;
-+ int ret = STILL;
-+
-+ /*
-+	 * Try to probe a value after boot, and in case the system
-+	 * stays still for a long time.
-+ */
-+ if ((fully_scanned_round & 0xFFULL) == 10) {
-+ ret = OBSCURE;
-+ goto out;
-+ }
-+
-+ current_neg_ratio = get_current_neg_ratio();
-+
-+ if (current_neg_ratio == 0) {
-+ rshash_neg_cont_zero++;
-+ if (rshash_neg_cont_zero > 2)
-+ return GO_DOWN;
-+ else
-+ return STILL;
-+ }
-+ rshash_neg_cont_zero = 0;
-+
-+ if (current_neg_ratio > 90) {
-+ ret = GO_UP;
-+ goto out;
-+ }
-+
-+ current_benefit = get_current_benefit();
-+ stable_benefit = rshash_state.stable_benefit;
-+
-+ if (!stable_benefit) {
-+ ret = OBSCURE;
-+ goto out;
-+ }
-+
-+ if (current_benefit > stable_benefit)
-+ delta = current_benefit - stable_benefit;
-+ else if (current_benefit < stable_benefit)
-+ delta = stable_benefit - current_benefit;
-+
-+ delta = div64_u64(100 * delta, stable_benefit);
-+
-+ if (delta > 50) {
-+ rshash_cont_obscure++;
-+ if (rshash_cont_obscure > 2)
-+ return OBSCURE;
-+ else
-+ return STILL;
-+ }
-+
-+out:
-+ rshash_cont_obscure = 0;
-+ return ret;
-+}
-+
-+/**
-+ * rshash_adjust() - The main function to control the random sampling state
-+ * machine for hash strength adaptation.
-+ *
-+ * Return true if hash_strength has changed.
-+ */
-+static inline int rshash_adjust(void)
-+{
-+ unsigned long prev_hash_strength = hash_strength;
-+
-+ if (!encode_benefit())
-+ return 0;
-+
-+ switch (rshash_state.state) {
-+ case RSHASH_STILL:
-+ switch (judge_rshash_direction()) {
-+ case GO_UP:
-+ if (rshash_state.pre_direct == GO_DOWN)
-+ hash_strength_delta = 0;
-+
-+ inc_hash_strength(hash_strength_delta);
-+ inc_hash_strength_delta();
-+ rshash_state.stable_benefit = get_current_benefit();
-+ rshash_state.pre_direct = GO_UP;
-+ break;
-+
-+ case GO_DOWN:
-+ if (rshash_state.pre_direct == GO_UP)
-+ hash_strength_delta = 0;
-+
-+ dec_hash_strength(hash_strength_delta);
-+ inc_hash_strength_delta();
-+ rshash_state.stable_benefit = get_current_benefit();
-+ rshash_state.pre_direct = GO_DOWN;
-+ break;
-+
-+ case OBSCURE:
-+ rshash_state.stable_point = hash_strength;
-+ rshash_state.turn_point_down = hash_strength;
-+ rshash_state.turn_point_up = hash_strength;
-+ rshash_state.turn_benefit_down = get_current_benefit();
-+ rshash_state.turn_benefit_up = get_current_benefit();
-+ rshash_state.lookup_window_index = 0;
-+ rshash_state.state = RSHASH_TRYDOWN;
-+ dec_hash_strength(hash_strength_delta);
-+ inc_hash_strength_delta();
-+ break;
-+
-+ case STILL:
-+ break;
-+ default:
-+ BUG();
-+ }
-+ break;
-+
-+ case RSHASH_TRYDOWN:
-+ if (rshash_state.lookup_window_index++ % 5 == 0)
-+ rshash_state.below_count = 0;
-+
-+ if (get_current_benefit() < rshash_state.stable_benefit)
-+ rshash_state.below_count++;
-+ else if (get_current_benefit() >
-+ rshash_state.turn_benefit_down) {
-+ rshash_state.turn_point_down = hash_strength;
-+ rshash_state.turn_benefit_down = get_current_benefit();
-+ }
-+
-+ if (rshash_state.below_count >= 3 ||
-+ judge_rshash_direction() == GO_UP ||
-+ hash_strength == 1) {
-+ hash_strength = rshash_state.stable_point;
-+ hash_strength_delta = 0;
-+ inc_hash_strength(hash_strength_delta);
-+ inc_hash_strength_delta();
-+ rshash_state.lookup_window_index = 0;
-+ rshash_state.state = RSHASH_TRYUP;
-+ hash_strength_delta = 0;
-+ } else {
-+ dec_hash_strength(hash_strength_delta);
-+ inc_hash_strength_delta();
-+ }
-+ break;
-+
-+ case RSHASH_TRYUP:
-+ if (rshash_state.lookup_window_index++ % 5 == 0)
-+ rshash_state.below_count = 0;
-+
-+ if (get_current_benefit() < rshash_state.turn_benefit_down)
-+ rshash_state.below_count++;
-+ else if (get_current_benefit() > rshash_state.turn_benefit_up) {
-+ rshash_state.turn_point_up = hash_strength;
-+ rshash_state.turn_benefit_up = get_current_benefit();
-+ }
-+
-+ if (rshash_state.below_count >= 3 ||
-+ judge_rshash_direction() == GO_DOWN ||
-+ hash_strength == HASH_STRENGTH_MAX) {
-+ hash_strength = rshash_state.turn_benefit_up >
-+ rshash_state.turn_benefit_down ?
-+ rshash_state.turn_point_up :
-+ rshash_state.turn_point_down;
-+
-+ rshash_state.state = RSHASH_PRE_STILL;
-+ } else {
-+ inc_hash_strength(hash_strength_delta);
-+ inc_hash_strength_delta();
-+ }
-+
-+ break;
-+
-+ case RSHASH_NEW:
-+ case RSHASH_PRE_STILL:
-+ rshash_state.stable_benefit = get_current_benefit();
-+ rshash_state.state = RSHASH_STILL;
-+ hash_strength_delta = 0;
-+ break;
-+ default:
-+ BUG();
-+ }
-+
-+ /* rshash_neg = rshash_pos = 0; */
-+ reset_benefit();
-+
-+ if (prev_hash_strength != hash_strength)
-+ stable_tree_delta_hash(prev_hash_strength);
-+
-+ return prev_hash_strength != hash_strength;
-+}
-+
-+/**
-+ * round_update_ladder() - The main function that updates all the
-+ * adjustments whenever a scan round is finished.
-+ */
-+static noinline void round_update_ladder(void)
-+{
-+ int i;
-+ unsigned long dedup;
-+ struct vma_slot *slot, *tmp_slot;
-+
-+ for (i = 0; i < SCAN_LADDER_SIZE; i++)
-+ uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED;
-+
-+ list_for_each_entry_safe(slot, tmp_slot, &vma_slot_dedup, dedup_list) {
-+
-+		/* the slot may have been removed by rung_rm_slot() when its mm exits */
-+ if (slot->snode) {
-+ dedup = cal_dedup_ratio_old(slot);
-+ if (dedup && dedup >= uksm_abundant_threshold)
-+ vma_rung_up(slot);
-+ }
-+
-+ slot->pages_bemerged = 0;
-+ slot->pages_cowed = 0;
-+
-+ list_del_init(&slot->dedup_list);
-+ }
-+}
-+
-+static void uksm_del_vma_slot(struct vma_slot *slot)
-+{
-+ int i, j;
-+ struct rmap_list_entry *entry;
-+
-+ if (slot->snode) {
-+ /*
-+		 * In case it just failed when entering the rung, removal is
-+		 * not necessary.
-+ */
-+ rung_rm_slot(slot);
-+ }
-+
-+ if (!list_empty(&slot->dedup_list))
-+ list_del(&slot->dedup_list);
-+
-+ if (!slot->rmap_list_pool || !slot->pool_counts) {
-+ /* In case it OOMed in uksm_vma_enter() */
-+ goto out;
-+ }
-+
-+ for (i = 0; i < slot->pool_size; i++) {
-+ void *addr;
-+
-+ if (!slot->rmap_list_pool[i])
-+ continue;
-+
-+ addr = kmap(slot->rmap_list_pool[i]);
-+ for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
-+ entry = (struct rmap_list_entry *)addr + j;
-+ if (is_addr(entry->addr))
-+ continue;
-+ if (!entry->item)
-+ continue;
-+
-+ remove_rmap_item_from_tree(entry->item);
-+ free_rmap_item(entry->item);
-+ slot->pool_counts[i]--;
-+ }
-+ BUG_ON(slot->pool_counts[i]);
-+ kunmap(slot->rmap_list_pool[i]);
-+ __free_page(slot->rmap_list_pool[i]);
-+ }
-+ kfree(slot->rmap_list_pool);
-+ kfree(slot->pool_counts);
-+
-+out:
-+ slot->rung = NULL;
-+ if (slot->flags & UKSM_SLOT_IN_UKSM) {
-+ BUG_ON(uksm_pages_total < slot->pages);
-+ uksm_pages_total -= slot->pages;
-+ }
-+
-+ if (slot->fully_scanned_round == fully_scanned_round)
-+ scanned_virtual_pages -= slot->pages;
-+ else
-+ scanned_virtual_pages -= slot->pages_scanned;
-+ free_vma_slot(slot);
-+}
-+
-+
-+#define SPIN_LOCK_PERIOD 32
-+static struct vma_slot *cleanup_slots[SPIN_LOCK_PERIOD];
-+static inline void cleanup_vma_slots(void)
-+{
-+ struct vma_slot *slot;
-+ int i;
-+
-+ i = 0;
-+ spin_lock(&vma_slot_list_lock);
-+ while (!list_empty(&vma_slot_del)) {
-+ slot = list_entry(vma_slot_del.next,
-+ struct vma_slot, slot_list);
-+ list_del(&slot->slot_list);
-+ cleanup_slots[i++] = slot;
-+ if (i == SPIN_LOCK_PERIOD) {
-+ spin_unlock(&vma_slot_list_lock);
-+ while (--i >= 0)
-+ uksm_del_vma_slot(cleanup_slots[i]);
-+ i = 0;
-+ spin_lock(&vma_slot_list_lock);
-+ }
-+ }
-+ spin_unlock(&vma_slot_list_lock);
-+
-+ while (--i >= 0)
-+ uksm_del_vma_slot(cleanup_slots[i]);
-+}
-+
-+/*
-+ * Exponential moving average formula
-+ */
-+static inline unsigned long ema(unsigned long curr, unsigned long last_ema)
-+{
-+ /*
-+	 * For a very high burst, even the EMA cannot work well: a falsely
-+	 * high per-page time estimate feeds back as very high context-switch
-+	 * and rung-update overhead, which in turn raises the per-page time
-+	 * further, so the estimate may never converge.
-+	 *
-+	 * Instead, we approach such a value in a binary (doubling) manner.
-+ */
-+ if (curr > last_ema * 10)
-+ return last_ema * 2;
-+
-+ return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100;
-+}
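A minimal standalone sketch of this smoothing, assuming an illustrative EMA_ALPHA of 20 (the real value is defined elsewhere in the patch): a moderate change is blended in, while a radical burst only doubles the estimate.

#include <stdio.h>

#define EMA_ALPHA 20	/* assumption for illustration; the real value is set elsewhere in the patch */

static unsigned long ema_demo(unsigned long curr, unsigned long last_ema)
{
	if (curr > last_ema * 10)
		return last_ema * 2;	/* radical burst: only double, approach it binarily */
	return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100;
}

int main(void)
{
	printf("%lu\n", ema_demo(500, 100));	/* (20*500 + 80*100) / 100 = 180 */
	printf("%lu\n", ema_demo(5000, 100));	/* 5000 > 10*100, so just 2*100 = 200 */
	return 0;
}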
-+
-+/*
-+ * Convert a cpu ratio, given in units of 1/TIME_RATIO_SCALE as configured
-+ * by the user, to nanoseconds based on the current uksm_sleep_jiffies.
-+ */
-+static inline unsigned long cpu_ratio_to_nsec(unsigned int ratio)
-+{
-+ return NSEC_PER_USEC * jiffies_to_usecs(uksm_sleep_jiffies) /
-+ (TIME_RATIO_SCALE - ratio) * ratio;
-+}
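A minimal standalone sketch of that conversion, assuming TIME_RATIO_SCALE is 1000 and a ~20 ms sleep period purely for illustration: a ratio of 200/1000 then yields a 5 ms scan budget per 20 ms sleep, i.e. roughly 20% CPU.

#include <stdio.h>

#define TIME_RATIO_SCALE 1000	/* assumed scale; the real constant lives elsewhere in the patch */
#define NSEC_PER_USEC 1000UL

int main(void)
{
	unsigned long sleep_usecs = 20000;	/* pretend uksm_sleep_jiffies is ~20 ms */
	unsigned int ratio = 200;		/* 200/1000, i.e. budget ~20% of the time for scanning */
	unsigned long scan_nsecs;

	scan_nsecs = NSEC_PER_USEC * sleep_usecs / (TIME_RATIO_SCALE - ratio) * ratio;
	printf("scan budget per period: %lu ns\n", scan_nsecs);	/* 20000000/800*200 = 5000000 ns */
	return 0;
}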
-+
-+
-+static inline unsigned long rung_real_ratio(int cpu_time_ratio)
-+{
-+ unsigned long ret;
-+
-+ BUG_ON(!cpu_time_ratio);
-+
-+ if (cpu_time_ratio > 0)
-+ ret = cpu_time_ratio;
-+ else
-+ ret = (unsigned long)(-cpu_time_ratio) *
-+ uksm_max_cpu_percentage / 100UL;
-+
-+ return ret ? ret : 1;
-+}
-+
-+static noinline void uksm_calc_scan_pages(void)
-+{
-+ struct scan_rung *ladder = uksm_scan_ladder;
-+ unsigned long sleep_usecs, nsecs;
-+ unsigned long ratio;
-+ int i;
-+ unsigned long per_page;
-+
-+ if (uksm_ema_page_time > 100000 ||
-+ (((unsigned long) uksm_eval_round & (256UL - 1)) == 0UL))
-+ uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;
-+
-+ per_page = uksm_ema_page_time;
-+ BUG_ON(!per_page);
-+
-+ /*
-+	 * Every 8 eval rounds, re-probe uksm_sleep_jiffies from the value
-+	 * saved from user input.
-+ */
-+ if (((unsigned long) uksm_eval_round & (8UL - 1)) == 0UL)
-+ uksm_sleep_jiffies = uksm_sleep_saved;
-+
-+	/* We require a rung to scan at least 1 page in a period. */
-+ nsecs = per_page;
-+ ratio = rung_real_ratio(ladder[0].cpu_ratio);
-+ if (cpu_ratio_to_nsec(ratio) < nsecs) {
-+ sleep_usecs = nsecs * (TIME_RATIO_SCALE - ratio) / ratio
-+ / NSEC_PER_USEC;
-+ uksm_sleep_jiffies = usecs_to_jiffies(sleep_usecs) + 1;
-+ }
-+
-+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
-+ ratio = rung_real_ratio(ladder[i].cpu_ratio);
-+ ladder[i].pages_to_scan = cpu_ratio_to_nsec(ratio) /
-+ per_page;
-+ BUG_ON(!ladder[i].pages_to_scan);
-+ uksm_calc_rung_step(&ladder[i], per_page, ratio);
-+ }
-+}
-+
-+/*
-+ * Convert the scan time of this round (ns) to the next expected minimum
-+ * sleep time (ms), being careful of possible overflows. ratio is taken
-+ * from rung_real_ratio().
-+ */
-+static inline
-+unsigned int scan_time_to_sleep(unsigned long long scan_time, unsigned long ratio)
-+{
-+ scan_time >>= 20; /* to msec level now */
-+ BUG_ON(scan_time > (ULONG_MAX / TIME_RATIO_SCALE));
-+
-+ return (unsigned int) ((unsigned long) scan_time *
-+ (TIME_RATIO_SCALE - ratio) / ratio);
-+}
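A minimal standalone sketch of this inverse conversion, again assuming TIME_RATIO_SCALE is 1000: a round that burned roughly 8 ms of CPU at a ~20% target ratio asks for about 28 ms of sleep.

#include <stdio.h>

#define TIME_RATIO_SCALE 1000	/* assumed scale, as above */

int main(void)
{
	unsigned long long scan_time_ns = 8ULL * 1000 * 1000;	/* this round burned ~8 ms of CPU */
	unsigned long ratio = 200;				/* target ~20% CPU */
	unsigned long long scan_ms;
	unsigned int sleep_ms;

	scan_ms = scan_time_ns >> 20;	/* cheap ns -> ~ms conversion, as in the code above */
	sleep_ms = (unsigned int)(scan_ms * (TIME_RATIO_SCALE - ratio) / ratio);
	printf("scanned ~%llu ms, sleep at least %u ms\n", scan_ms, sleep_ms);	/* ~7 ms -> 28 ms */
	return 0;
}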
-+
-+#define __round_mask(x, y) ((__typeof__(x))((y)-1))
-+#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
-+
-+static void uksm_vma_enter(struct vma_slot **slots, unsigned long num)
-+{
-+ struct scan_rung *rung;
-+
-+ rung = &uksm_scan_ladder[0];
-+ rung_add_new_slots(rung, slots, num);
-+}
-+
-+static struct vma_slot *batch_slots[SLOT_TREE_NODE_STORE_SIZE];
-+
-+static void uksm_enter_all_slots(void)
-+{
-+ struct vma_slot *slot;
-+ unsigned long index;
-+ struct list_head empty_vma_list;
-+ int i;
-+
-+ i = 0;
-+ index = 0;
-+ INIT_LIST_HEAD(&empty_vma_list);
-+
-+ spin_lock(&vma_slot_list_lock);
-+ while (!list_empty(&vma_slot_new)) {
-+ slot = list_entry(vma_slot_new.next,
-+ struct vma_slot, slot_list);
-+
-+ if (!slot->vma->anon_vma) {
-+ list_move(&slot->slot_list, &empty_vma_list);
-+ } else if (vma_can_enter(slot->vma)) {
-+ batch_slots[index++] = slot;
-+ list_del_init(&slot->slot_list);
-+ } else {
-+ list_move(&slot->slot_list, &vma_slot_noadd);
-+ }
-+
-+ if (++i == SPIN_LOCK_PERIOD ||
-+ (index && !(index % SLOT_TREE_NODE_STORE_SIZE))) {
-+ spin_unlock(&vma_slot_list_lock);
-+
-+ if (index && !(index % SLOT_TREE_NODE_STORE_SIZE)) {
-+ uksm_vma_enter(batch_slots, index);
-+ index = 0;
-+ }
-+ i = 0;
-+ cond_resched();
-+ spin_lock(&vma_slot_list_lock);
-+ }
-+ }
-+
-+ list_splice(&empty_vma_list, &vma_slot_new);
-+
-+ spin_unlock(&vma_slot_list_lock);
-+
-+ if (index)
-+ uksm_vma_enter(batch_slots, index);
-+
-+}
-+
-+static inline int rung_round_finished(struct scan_rung *rung)
-+{
-+ return rung->flags & UKSM_RUNG_ROUND_FINISHED;
-+}
-+
-+static inline void judge_slot(struct vma_slot *slot)
-+{
-+ struct scan_rung *rung = slot->rung;
-+ unsigned long dedup;
-+ int deleted;
-+
-+ dedup = cal_dedup_ratio(slot);
-+ if (vma_fully_scanned(slot) && uksm_thrash_threshold)
-+ deleted = vma_rung_enter(slot, &uksm_scan_ladder[0]);
-+ else if (dedup && dedup >= uksm_abundant_threshold)
-+ deleted = vma_rung_up(slot);
-+ else
-+ deleted = vma_rung_down(slot);
-+
-+ slot->pages_merged = 0;
-+ slot->pages_cowed = 0;
-+ slot->this_sampled = 0;
-+
-+ if (vma_fully_scanned(slot))
-+ slot->pages_scanned = 0;
-+
-+ slot->last_scanned = slot->pages_scanned;
-+
-+	/* If it was deleted above, then the rung was already advanced. */
-+ if (!deleted)
-+ advance_current_scan(rung);
-+}
-+
-+
-+static inline int hash_round_finished(void)
-+{
-+ if (scanned_virtual_pages > (uksm_pages_total >> 2)) {
-+ scanned_virtual_pages = 0;
-+ if (uksm_pages_scanned)
-+ fully_scanned_round++;
-+
-+ return 1;
-+ } else {
-+ return 0;
-+ }
-+}
-+
-+#define UKSM_MMSEM_BATCH 5
-+#define BUSY_RETRY 100
-+
-+/**
-+ * uksm_do_scan() - the main worker function.
-+ */
-+static noinline void uksm_do_scan(void)
-+{
-+ struct vma_slot *slot, *iter;
-+ struct mm_struct *busy_mm;
-+ unsigned char round_finished, all_rungs_emtpy;
-+ int i, err, mmsem_batch;
-+ unsigned long pcost;
-+ long long delta_exec;
-+ unsigned long vpages, max_cpu_ratio;
-+ unsigned long long start_time, end_time, scan_time;
-+ unsigned int expected_jiffies;
-+
-+ might_sleep();
-+
-+ vpages = 0;
-+
-+ start_time = task_sched_runtime(current);
-+ max_cpu_ratio = 0;
-+ mmsem_batch = 0;
-+
-+ for (i = 0; i < SCAN_LADDER_SIZE;) {
-+ struct scan_rung *rung = &uksm_scan_ladder[i];
-+ unsigned long ratio;
-+ int busy_retry;
-+
-+ if (!rung->pages_to_scan) {
-+ i++;
-+ continue;
-+ }
-+
-+ if (!rung->vma_root.num) {
-+ rung->pages_to_scan = 0;
-+ i++;
-+ continue;
-+ }
-+
-+ ratio = rung_real_ratio(rung->cpu_ratio);
-+ if (ratio > max_cpu_ratio)
-+ max_cpu_ratio = ratio;
-+
-+ busy_retry = BUSY_RETRY;
-+ /*
-+		 * Do not consider rung_round_finished() here, just use up the
-+		 * rung->pages_to_scan quota.
-+ */
-+ while (rung->pages_to_scan && rung->vma_root.num &&
-+ likely(!freezing(current))) {
-+ int reset = 0;
-+
-+ slot = rung->current_scan;
-+
-+ BUG_ON(vma_fully_scanned(slot));
-+
-+ if (mmsem_batch)
-+ err = 0;
-+ else
-+ err = try_down_read_slot_mmap_sem(slot);
-+
-+ if (err == -ENOENT) {
-+rm_slot:
-+ rung_rm_slot(slot);
-+ continue;
-+ }
-+
-+ busy_mm = slot->mm;
-+
-+ if (err == -EBUSY) {
-+ /* skip other vmas on the same mm */
-+ do {
-+ reset = advance_current_scan(rung);
-+ iter = rung->current_scan;
-+ busy_retry--;
-+ if (iter->vma->vm_mm != busy_mm ||
-+ !busy_retry || reset)
-+ break;
-+ } while (1);
-+
-+ if (iter->vma->vm_mm != busy_mm) {
-+ continue;
-+ } else {
-+					/* scan round finished */
-+ break;
-+ }
-+ }
-+
-+ BUG_ON(!vma_can_enter(slot->vma));
-+ if (uksm_test_exit(slot->vma->vm_mm)) {
-+ mmsem_batch = 0;
-+ mmap_read_unlock(slot->vma->vm_mm);
-+ goto rm_slot;
-+ }
-+
-+ if (mmsem_batch)
-+ mmsem_batch--;
-+ else
-+ mmsem_batch = UKSM_MMSEM_BATCH;
-+
-+			/* OK, we have taken the mmap_sem, ready to scan */
-+ scan_vma_one_page(slot);
-+ rung->pages_to_scan--;
-+ vpages++;
-+
-+ if (rung->current_offset + rung->step > slot->pages - 1
-+ || vma_fully_scanned(slot)) {
-+ mmap_read_unlock(slot->vma->vm_mm);
-+ judge_slot(slot);
-+ mmsem_batch = 0;
-+ } else {
-+ rung->current_offset += rung->step;
-+ if (!mmsem_batch)
-+ mmap_read_unlock(slot->vma->vm_mm);
-+ }
-+
-+ busy_retry = BUSY_RETRY;
-+ cond_resched();
-+ }
-+
-+ if (mmsem_batch) {
-+ mmap_read_unlock(slot->vma->vm_mm);
-+ mmsem_batch = 0;
-+ }
-+
-+ if (freezing(current))
-+ break;
-+
-+ cond_resched();
-+ }
-+ end_time = task_sched_runtime(current);
-+ delta_exec = end_time - start_time;
-+
-+ if (freezing(current))
-+ return;
-+
-+ cleanup_vma_slots();
-+ uksm_enter_all_slots();
-+
-+ round_finished = 1;
-+ all_rungs_emtpy = 1;
-+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
-+ struct scan_rung *rung = &uksm_scan_ladder[i];
-+
-+ if (rung->vma_root.num) {
-+ all_rungs_emtpy = 0;
-+ if (!rung_round_finished(rung))
-+ round_finished = 0;
-+ }
-+ }
-+
-+ if (all_rungs_emtpy)
-+ round_finished = 0;
-+
-+ if (round_finished) {
-+ round_update_ladder();
-+ uksm_eval_round++;
-+
-+ if (hash_round_finished() && rshash_adjust()) {
-+ /* Reset the unstable root iff hash strength changed */
-+ uksm_hash_round++;
-+ root_unstable_tree = RB_ROOT;
-+ free_all_tree_nodes(&unstable_tree_node_list);
-+ }
-+
-+ /*
-+ * A number of pages can hang around indefinitely on per-cpu
-+ * pagevecs, raised page count preventing write_protect_page
-+ * from merging them. Though it doesn't really matter much,
-+ * it is puzzling to see some stuck in pages_volatile until
-+ * other activity jostles them out, and they also prevented
-+ * LTP's KSM test from succeeding deterministically; so drain
-+ * them here (here rather than on entry to uksm_do_scan(),
-+ * so we don't IPI too often when pages_to_scan is set low).
-+ */
-+ lru_add_drain_all();
-+ }
-+
-+
-+ if (vpages && delta_exec > 0) {
-+ pcost = (unsigned long) delta_exec / vpages;
-+ if (likely(uksm_ema_page_time))
-+ uksm_ema_page_time = ema(pcost, uksm_ema_page_time);
-+ else
-+ uksm_ema_page_time = pcost;
-+ }
-+
-+ uksm_calc_scan_pages();
-+ uksm_sleep_real = uksm_sleep_jiffies;
-+ /* in case of radical cpu bursts, apply the upper bound */
-+ end_time = task_sched_runtime(current);
-+ if (max_cpu_ratio && end_time > start_time) {
-+ scan_time = end_time - start_time;
-+ expected_jiffies = msecs_to_jiffies(
-+ scan_time_to_sleep(scan_time, max_cpu_ratio));
-+
-+ if (expected_jiffies > uksm_sleep_real)
-+ uksm_sleep_real = expected_jiffies;
-+
-+		/* We have a 1 second upper bound for responsiveness. */
-+ if (jiffies_to_msecs(uksm_sleep_real) > MSEC_PER_SEC)
-+ uksm_sleep_real = msecs_to_jiffies(1000);
-+ }
-+
-+ return;
-+}
-+
-+static int ksmd_should_run(void)
-+{
-+ return uksm_run & UKSM_RUN_MERGE;
-+}
-+
-+static int uksm_scan_thread(void *nothing)
-+{
-+ set_freezable();
-+ set_user_nice(current, 5);
-+
-+ while (!kthread_should_stop()) {
-+ mutex_lock(&uksm_thread_mutex);
-+ if (ksmd_should_run())
-+ uksm_do_scan();
-+ mutex_unlock(&uksm_thread_mutex);
-+
-+ try_to_freeze();
-+
-+ if (ksmd_should_run()) {
-+ schedule_timeout_interruptible(uksm_sleep_real);
-+ uksm_sleep_times++;
-+ } else {
-+ wait_event_freezable(uksm_thread_wait,
-+ ksmd_should_run() || kthread_should_stop());
-+ }
-+ }
-+ return 0;
-+}
-+
-+void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
-+{
-+ struct stable_node *stable_node;
-+ struct node_vma *node_vma;
-+ struct rmap_item *rmap_item;
-+ int search_new_forks = 0;
-+ unsigned long address;
-+
-+ VM_BUG_ON_PAGE(!PageKsm(page), page);
-+ VM_BUG_ON_PAGE(!PageLocked(page), page);
-+
-+ stable_node = page_stable_node(page);
-+ if (!stable_node)
-+ return;
-+again:
-+ hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) {
-+ hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) {
-+ struct anon_vma *anon_vma = rmap_item->anon_vma;
-+ struct anon_vma_chain *vmac;
-+ struct vm_area_struct *vma;
-+
-+ cond_resched();
-+ anon_vma_lock_read(anon_vma);
-+ anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
-+ 0, ULONG_MAX) {
-+ cond_resched();
-+ vma = vmac->vma;
-+ address = get_rmap_addr(rmap_item);
-+
-+ if (address < vma->vm_start ||
-+ address >= vma->vm_end)
-+ continue;
-+
-+ if ((rmap_item->slot->vma == vma) ==
-+ search_new_forks)
-+ continue;
-+
-+ if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
-+ continue;
-+
-+ if (!rwc->rmap_one(page, vma, address, rwc->arg)) {
-+ anon_vma_unlock_read(anon_vma);
-+ return;
-+ }
-+
-+ if (rwc->done && rwc->done(page)) {
-+ anon_vma_unlock_read(anon_vma);
-+ return;
-+ }
-+ }
-+ anon_vma_unlock_read(anon_vma);
-+ }
-+ }
-+ if (!search_new_forks++)
-+ goto again;
-+}
-+
-+#ifdef CONFIG_MIGRATION
-+/* Common ksm interface but may be specific to uksm */
-+void ksm_migrate_page(struct page *newpage, struct page *oldpage)
-+{
-+ struct stable_node *stable_node;
-+
-+ VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
-+ VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
-+ VM_BUG_ON(newpage->mapping != oldpage->mapping);
-+
-+ stable_node = page_stable_node(newpage);
-+ if (stable_node) {
-+ VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
-+ stable_node->kpfn = page_to_pfn(newpage);
-+ /*
-+ * newpage->mapping was set in advance; now we need smp_wmb()
-+ * to make sure that the new stable_node->kpfn is visible
-+ * to get_ksm_page() before it can see that oldpage->mapping
-+ * has gone stale (or that PageSwapCache has been cleared).
-+ */
-+ smp_wmb();
-+ set_page_stable_node(oldpage, NULL);
-+ }
-+}
-+#endif /* CONFIG_MIGRATION */
-+
-+#ifdef CONFIG_MEMORY_HOTREMOVE
-+static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn,
-+ unsigned long end_pfn)
-+{
-+ struct rb_node *node;
-+
-+ for (node = rb_first(root_stable_treep); node; node = rb_next(node)) {
-+ struct stable_node *stable_node;
-+
-+ stable_node = rb_entry(node, struct stable_node, node);
-+ if (stable_node->kpfn >= start_pfn &&
-+ stable_node->kpfn < end_pfn)
-+ return stable_node;
-+ }
-+ return NULL;
-+}
-+
-+static int uksm_memory_callback(struct notifier_block *self,
-+ unsigned long action, void *arg)
-+{
-+ struct memory_notify *mn = arg;
-+ struct stable_node *stable_node;
-+
-+ switch (action) {
-+ case MEM_GOING_OFFLINE:
-+ /*
-+ * Keep it very simple for now: just lock out ksmd and
-+ * MADV_UNMERGEABLE while any memory is going offline.
-+ * mutex_lock_nested() is necessary because lockdep was alarmed
-+ * that here we take uksm_thread_mutex inside notifier chain
-+ * mutex, and later take notifier chain mutex inside
-+ * uksm_thread_mutex to unlock it. But that's safe because both
-+ * are inside mem_hotplug_mutex.
-+ */
-+ mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING);
-+ break;
-+
-+ case MEM_OFFLINE:
-+ /*
-+ * Most of the work is done by page migration; but there might
-+ * be a few stable_nodes left over, still pointing to struct
-+ * pages which have been offlined: prune those from the tree.
-+ */
-+ while ((stable_node = uksm_check_stable_tree(mn->start_pfn,
-+ mn->start_pfn + mn->nr_pages)) != NULL)
-+ remove_node_from_stable_tree(stable_node, 1, 1);
-+ /* fallthrough */
-+
-+ case MEM_CANCEL_OFFLINE:
-+ mutex_unlock(&uksm_thread_mutex);
-+ break;
-+ }
-+ return NOTIFY_OK;
-+}
-+#endif /* CONFIG_MEMORY_HOTREMOVE */
-+
-+#ifdef CONFIG_SYSFS
-+/*
-+ * This all compiles without CONFIG_SYSFS, but is a waste of space.
-+ */
-+
-+#define UKSM_ATTR_RO(_name) \
-+ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
-+#define UKSM_ATTR(_name) \
-+ static struct kobj_attribute _name##_attr = \
-+ __ATTR(_name, 0644, _name##_show, _name##_store)
-+
-+static ssize_t max_cpu_percentage_show(struct kobject *kobj,
-+ struct kobj_attribute *attr, char *buf)
-+{
-+ return sprintf(buf, "%u\n", uksm_max_cpu_percentage);
-+}
-+
-+static ssize_t max_cpu_percentage_store(struct kobject *kobj,
-+ struct kobj_attribute *attr,
-+ const char *buf, size_t count)
-+{
-+ unsigned long max_cpu_percentage;
-+ int err;
-+
-+ err = kstrtoul(buf, 10, &max_cpu_percentage);
-+ if (err || max_cpu_percentage > 100)
-+ return -EINVAL;
-+
-+ if (max_cpu_percentage == 100)
-+ max_cpu_percentage = 99;
-+ else if (max_cpu_percentage < 10)
-+ max_cpu_percentage = 10;
-+
-+ uksm_max_cpu_percentage = max_cpu_percentage;
-+
-+ return count;
-+}
-+UKSM_ATTR(max_cpu_percentage);
-+
-+static ssize_t sleep_millisecs_show(struct kobject *kobj,
-+ struct kobj_attribute *attr, char *buf)
-+{
-+ return sprintf(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies));
-+}
-+
-+static ssize_t sleep_millisecs_store(struct kobject *kobj,
-+ struct kobj_attribute *attr,
-+ const char *buf, size_t count)
-+{
-+ unsigned long msecs;
-+ int err;
-+
-+ err = kstrtoul(buf, 10, &msecs);
-+ if (err || msecs > MSEC_PER_SEC)
-+ return -EINVAL;
-+
-+ uksm_sleep_jiffies = msecs_to_jiffies(msecs);
-+ uksm_sleep_saved = uksm_sleep_jiffies;
-+
-+ return count;
-+}
-+UKSM_ATTR(sleep_millisecs);
-+
-+
-+static ssize_t cpu_governor_show(struct kobject *kobj,
-+ struct kobj_attribute *attr, char *buf)
-+{
-+ int n = sizeof(uksm_cpu_governor_str) / sizeof(char *);
-+ int i;
-+
-+ buf[0] = '\0';
-+ for (i = 0; i < n ; i++) {
-+ if (uksm_cpu_governor == i)
-+ strcat(buf, "[");
-+
-+ strcat(buf, uksm_cpu_governor_str[i]);
-+
-+ if (uksm_cpu_governor == i)
-+ strcat(buf, "]");
-+
-+ strcat(buf, " ");
-+ }
-+ strcat(buf, "\n");
-+
-+ return strlen(buf);
-+}
-+
-+static inline void init_performance_values(void)
-+{
-+ int i;
-+ struct scan_rung *rung;
-+ struct uksm_cpu_preset_s *preset = uksm_cpu_preset + uksm_cpu_governor;
-+
-+
-+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
-+ rung = uksm_scan_ladder + i;
-+ rung->cpu_ratio = preset->cpu_ratio[i];
-+ rung->cover_msecs = preset->cover_msecs[i];
-+ }
-+
-+ uksm_max_cpu_percentage = preset->max_cpu;
-+}
-+
-+static ssize_t cpu_governor_store(struct kobject *kobj,
-+ struct kobj_attribute *attr,
-+ const char *buf, size_t count)
-+{
-+ int n = sizeof(uksm_cpu_governor_str) / sizeof(char *);
-+
-+ for (n--; n >= 0 ; n--) {
-+ if (!strncmp(buf, uksm_cpu_governor_str[n],
-+ strlen(uksm_cpu_governor_str[n])))
-+ break;
-+ }
-+
-+ if (n < 0)
-+ return -EINVAL;
-+ else
-+ uksm_cpu_governor = n;
-+
-+ init_performance_values();
-+
-+ return count;
-+}
-+UKSM_ATTR(cpu_governor);
-+
-+static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
-+ char *buf)
-+{
-+ return sprintf(buf, "%u\n", uksm_run);
-+}
-+
-+static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
-+ const char *buf, size_t count)
-+{
-+ int err;
-+ unsigned long flags;
-+
-+ err = kstrtoul(buf, 10, &flags);
-+ if (err || flags > UINT_MAX)
-+ return -EINVAL;
-+ if (flags > UKSM_RUN_MERGE)
-+ return -EINVAL;
-+
-+ mutex_lock(&uksm_thread_mutex);
-+ if (uksm_run != flags)
-+ uksm_run = flags;
-+ mutex_unlock(&uksm_thread_mutex);
-+
-+ if (flags & UKSM_RUN_MERGE)
-+ wake_up_interruptible(&uksm_thread_wait);
-+
-+ return count;
-+}
-+UKSM_ATTR(run);
-+
-+static ssize_t abundant_threshold_show(struct kobject *kobj,
-+ struct kobj_attribute *attr, char *buf)
-+{
-+ return sprintf(buf, "%u\n", uksm_abundant_threshold);
-+}
-+
-+static ssize_t abundant_threshold_store(struct kobject *kobj,
-+ struct kobj_attribute *attr,
-+ const char *buf, size_t count)
-+{
-+ int err;
-+ unsigned long flags;
-+
-+ err = kstrtoul(buf, 10, &flags);
-+ if (err || flags > 99)
-+ return -EINVAL;
-+
-+ uksm_abundant_threshold = flags;
-+
-+ return count;
-+}
-+UKSM_ATTR(abundant_threshold);
-+
-+static ssize_t thrash_threshold_show(struct kobject *kobj,
-+ struct kobj_attribute *attr, char *buf)
-+{
-+ return sprintf(buf, "%u\n", uksm_thrash_threshold);
-+}
-+
-+static ssize_t thrash_threshold_store(struct kobject *kobj,
-+ struct kobj_attribute *attr,
-+ const char *buf, size_t count)
-+{
-+ int err;
-+ unsigned long flags;
-+
-+ err = kstrtoul(buf, 10, &flags);
-+ if (err || flags > 99)
-+ return -EINVAL;
-+
-+ uksm_thrash_threshold = flags;
-+
-+ return count;
-+}
-+UKSM_ATTR(thrash_threshold);
-+
-+static ssize_t cpu_ratios_show(struct kobject *kobj,
-+ struct kobj_attribute *attr, char *buf)
-+{
-+ int i, size;
-+ struct scan_rung *rung;
-+ char *p = buf;
-+
-+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
-+ rung = &uksm_scan_ladder[i];
-+
-+ if (rung->cpu_ratio > 0)
-+ size = sprintf(p, "%d ", rung->cpu_ratio);
-+ else
-+ size = sprintf(p, "MAX/%d ",
-+ TIME_RATIO_SCALE / -rung->cpu_ratio);
-+
-+ p += size;
-+ }
-+
-+ *p++ = '\n';
-+ *p = '\0';
-+
-+ return p - buf;
-+}
-+
-+static ssize_t cpu_ratios_store(struct kobject *kobj,
-+ struct kobj_attribute *attr,
-+ const char *buf, size_t count)
-+{
-+ int i, cpuratios[SCAN_LADDER_SIZE], err;
-+ unsigned long value;
-+ struct scan_rung *rung;
-+ char *p, *end = NULL;
-+
-+ p = kzalloc(count, GFP_KERNEL);
-+ if (!p)
-+ return -ENOMEM;
-+
-+ memcpy(p, buf, count);
-+
-+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
-+ if (i != SCAN_LADDER_SIZE - 1) {
-+ end = strchr(p, ' ');
-+ if (!end)
-+ return -EINVAL;
-+
-+ *end = '\0';
-+ }
-+
-+ if (strstr(p, "MAX/")) {
-+ p = strchr(p, '/') + 1;
-+ err = kstrtoul(p, 10, &value);
-+ if (err || value > TIME_RATIO_SCALE || !value)
-+ return -EINVAL;
-+
-+ cpuratios[i] = -(int) (TIME_RATIO_SCALE / value);
-+ } else {
-+ err = kstrtoul(p, 10, &value);
-+ if (err || value > TIME_RATIO_SCALE || !value)
-+ return -EINVAL;
-+
-+ cpuratios[i] = value;
-+ }
-+
-+ p = end + 1;
-+ }
-+
-+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
-+ rung = &uksm_scan_ladder[i];
-+
-+ rung->cpu_ratio = cpuratios[i];
-+ }
-+
-+ return count;
-+}
-+UKSM_ATTR(cpu_ratios);
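/*
 * Illustrative note on the cpu_ratios encoding (example values assumed,
 * not taken from the patch): the pair of handlers above store each rung's
 * cpu_ratio as a signed value.  A positive v is printed and parsed
 * verbatim and represents v / TIME_RATIO_SCALE; a field written as
 * "MAX/n" is stored as -(TIME_RATIO_SCALE / n) and printed back as
 * "MAX/n", i.e. one n-th of the maximum.  Writing "MAX/4 MAX/4 MAX/2
 * MAX/1", for instance, would give the rungs a quarter, a quarter, half
 * and the whole ratio respectively, assuming exactly SCAN_LADDER_SIZE
 * space-separated fields are supplied (the constant is defined outside
 * this hunk).
 */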
-+
-+static ssize_t eval_intervals_show(struct kobject *kobj,
-+ struct kobj_attribute *attr, char *buf)
-+{
-+ int i, size;
-+ struct scan_rung *rung;
-+ char *p = buf;
-+
-+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
-+ rung = &uksm_scan_ladder[i];
-+ size = sprintf(p, "%u ", rung->cover_msecs);
-+ p += size;
-+ }
-+
-+ *p++ = '\n';
-+ *p = '\0';
-+
-+ return p - buf;
-+}
-+
-+static ssize_t eval_intervals_store(struct kobject *kobj,
-+ struct kobj_attribute *attr,
-+ const char *buf, size_t count)
-+{
-+ int i, err;
-+ unsigned long values[SCAN_LADDER_SIZE];
-+ struct scan_rung *rung;
-+ char *p, *end = NULL;
-+ ssize_t ret = count;
-+
-+ p = kzalloc(count + 2, GFP_KERNEL);
-+ if (!p)
-+ return -ENOMEM;
-+
-+ memcpy(p, buf, count);
-+
-+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
-+ if (i != SCAN_LADDER_SIZE - 1) {
-+ end = strchr(p, ' ');
-+ if (!end) {
-+ ret = -EINVAL;
-+ goto out;
-+ }
-+
-+ *end = '\0';
-+ }
-+
-+ err = kstrtoul(p, 10, &values[i]);
-+ if (err) {
-+ ret = -EINVAL;
-+ goto out;
-+ }
-+
-+ p = end + 1;
-+ }
-+
-+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
-+ rung = &uksm_scan_ladder[i];
-+
-+ rung->cover_msecs = values[i];
-+ }
-+
-+out:
-+ kfree(p);
-+ return ret;
-+}
-+UKSM_ATTR(eval_intervals);
-+
-+static ssize_t ema_per_page_time_show(struct kobject *kobj,
-+ struct kobj_attribute *attr, char *buf)
-+{
-+ return sprintf(buf, "%lu\n", uksm_ema_page_time);
-+}
-+UKSM_ATTR_RO(ema_per_page_time);
-+
-+static ssize_t pages_shared_show(struct kobject *kobj,
-+ struct kobj_attribute *attr, char *buf)
-+{
-+ return sprintf(buf, "%lu\n", uksm_pages_shared);
-+}
-+UKSM_ATTR_RO(pages_shared);
-+
-+static ssize_t pages_sharing_show(struct kobject *kobj,
-+ struct kobj_attribute *attr, char *buf)
-+{
-+ return sprintf(buf, "%lu\n", uksm_pages_sharing);
-+}
-+UKSM_ATTR_RO(pages_sharing);
-+
-+static ssize_t pages_unshared_show(struct kobject *kobj,
-+ struct kobj_attribute *attr, char *buf)
-+{
-+ return sprintf(buf, "%lu\n", uksm_pages_unshared);
-+}
-+UKSM_ATTR_RO(pages_unshared);
-+
-+static ssize_t full_scans_show(struct kobject *kobj,
-+ struct kobj_attribute *attr, char *buf)
-+{
-+ return sprintf(buf, "%llu\n", fully_scanned_round);
-+}
-+UKSM_ATTR_RO(full_scans);
-+
-+static ssize_t pages_scanned_show(struct kobject *kobj,
-+ struct kobj_attribute *attr, char *buf)
-+{
-+ unsigned long base = 0;
-+ u64 delta, ret;
-+
-+ if (pages_scanned_stored) {
-+ base = pages_scanned_base;
-+ ret = pages_scanned_stored;
-+ delta = uksm_pages_scanned >> base;
-+ if (CAN_OVERFLOW_U64(ret, delta)) {
-+ ret >>= 1;
-+ delta >>= 1;
-+ base++;
-+ ret += delta;
-+ }
-+ } else {
-+ ret = uksm_pages_scanned;
-+ }
-+
-+ while (ret > ULONG_MAX) {
-+ ret >>= 1;
-+ base++;
-+ }
-+
-+ if (base)
-+ return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base);
-+ else
-+ return sprintf(buf, "%lu\n", (unsigned long)ret);
-+}
-+UKSM_ATTR_RO(pages_scanned);
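/*
 * Illustrative decoding of the pages_scanned output (numbers invented):
 * while the running total still fits in an unsigned long, the file prints
 * it directly, e.g. "123456\n".  Once it would overflow, the value is kept
 * as mantissa * 2^base and printed as, say, "123456 * 2^8\n", i.e. roughly
 * 123456 << 8 = 31604736 pages scanned; the scaled form keeps the counter
 * printable with "%lu" even on 32-bit hosts after long uptimes.
 */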
-+
-+static ssize_t hash_strength_show(struct kobject *kobj,
-+ struct kobj_attribute *attr, char *buf)
-+{
-+ return sprintf(buf, "%lu\n", hash_strength);
-+}
-+UKSM_ATTR_RO(hash_strength);
-+
-+static ssize_t sleep_times_show(struct kobject *kobj,
-+ struct kobj_attribute *attr, char *buf)
-+{
-+ return sprintf(buf, "%llu\n", uksm_sleep_times);
-+}
-+UKSM_ATTR_RO(sleep_times);
-+
-+
-+static struct attribute *uksm_attrs[] = {
-+ &max_cpu_percentage_attr.attr,
-+ &sleep_millisecs_attr.attr,
-+ &cpu_governor_attr.attr,
-+ &run_attr.attr,
-+ &ema_per_page_time_attr.attr,
-+ &pages_shared_attr.attr,
-+ &pages_sharing_attr.attr,
-+ &pages_unshared_attr.attr,
-+ &full_scans_attr.attr,
-+ &pages_scanned_attr.attr,
-+ &hash_strength_attr.attr,
-+ &sleep_times_attr.attr,
-+ &thrash_threshold_attr.attr,
-+ &abundant_threshold_attr.attr,
-+ &cpu_ratios_attr.attr,
-+ &eval_intervals_attr.attr,
-+ NULL,
-+};
-+
-+static struct attribute_group uksm_attr_group = {
-+ .attrs = uksm_attrs,
-+ .name = "uksm",
-+};
-+#endif /* CONFIG_SYSFS */
-+
-+static inline void init_scan_ladder(void)
-+{
-+ int i;
-+ struct scan_rung *rung;
-+
-+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
-+ rung = uksm_scan_ladder + i;
-+ slot_tree_init_root(&rung->vma_root);
-+ }
-+
-+ init_performance_values();
-+ uksm_calc_scan_pages();
-+}
-+
-+static inline int cal_positive_negative_costs(void)
-+{
-+ struct page *p1, *p2;
-+ unsigned char *addr1, *addr2;
-+ unsigned long i, time_start, hash_cost;
-+ unsigned long loopnum = 0;
-+
-+ /*IMPORTANT: volatile is needed to prevent over-optimization by gcc. */
-+ volatile u32 hash;
-+ volatile int ret;
-+
-+ p1 = alloc_page(GFP_KERNEL);
-+ if (!p1)
-+ return -ENOMEM;
-+
-+	p2 = alloc_page(GFP_KERNEL);
-+	if (!p2) {
-+		__free_page(p1);
-+		return -ENOMEM;
-+	}
-+
-+ addr1 = kmap_atomic(p1);
-+ addr2 = kmap_atomic(p2);
-+ memset(addr1, prandom_u32(), PAGE_SIZE);
-+ memcpy(addr2, addr1, PAGE_SIZE);
-+
-+ /* make sure that the two pages differ in last byte */
-+ addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1];
-+ kunmap_atomic(addr2);
-+ kunmap_atomic(addr1);
-+
-+ time_start = jiffies;
-+ while (jiffies - time_start < 100) {
-+ for (i = 0; i < 100; i++)
-+ hash = page_hash(p1, HASH_STRENGTH_FULL, 0);
-+ loopnum += 100;
-+ }
-+ hash_cost = (jiffies - time_start);
-+
-+ time_start = jiffies;
-+ for (i = 0; i < loopnum; i++)
-+ ret = pages_identical_with_cost(p1, p2);
-+ memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start);
-+ memcmp_cost /= hash_cost;
-+ pr_info("UKSM: relative memcmp_cost = %lu "
-+ "hash=%u cmp_ret=%d.\n",
-+ memcmp_cost, hash, ret);
-+
-+ __free_page(p1);
-+ __free_page(p2);
-+ return 0;
-+}
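/*
 * Calibration sketch with assumed timings (not from the patch): if hashing
 * loopnum pages at HASH_STRENGTH_FULL above took hash_cost = 100 jiffies
 * and memcmp-ing the same number of page pairs took 25 jiffies, then
 *     memcmp_cost = HASH_STRENGTH_FULL * 25 / 100,
 * i.e. one full-page memcmp is charged like hashing at a quarter of the
 * full strength, a relative cost used elsewhere in the patch when the
 * sampling hash strength is tuned.
 */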
-+
-+static int init_zeropage_hash_table(void)
-+{
-+ struct page *page;
-+ char *addr;
-+ int i;
-+
-+ page = alloc_page(GFP_KERNEL);
-+ if (!page)
-+ return -ENOMEM;
-+
-+ addr = kmap_atomic(page);
-+ memset(addr, 0, PAGE_SIZE);
-+ kunmap_atomic(addr);
-+
-+ zero_hash_table = kmalloc_array(HASH_STRENGTH_MAX, sizeof(u32),
-+ GFP_KERNEL);
-+ if (!zero_hash_table)
-+ return -ENOMEM;
-+
-+ for (i = 0; i < HASH_STRENGTH_MAX; i++)
-+ zero_hash_table[i] = page_hash(page, i, 0);
-+
-+ __free_page(page);
-+
-+ return 0;
-+}
-+
-+static inline int init_random_sampling(void)
-+{
-+ unsigned long i;
-+
-+ random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL);
-+ if (!random_nums)
-+ return -ENOMEM;
-+
-+ for (i = 0; i < HASH_STRENGTH_FULL; i++)
-+ random_nums[i] = i;
-+
-+ for (i = 0; i < HASH_STRENGTH_FULL; i++) {
-+ unsigned long rand_range, swap_index, tmp;
-+
-+ rand_range = HASH_STRENGTH_FULL - i;
-+ swap_index = i + prandom_u32() % rand_range;
-+ tmp = random_nums[i];
-+ random_nums[i] = random_nums[swap_index];
-+ random_nums[swap_index] = tmp;
-+ }
-+
-+ rshash_state.state = RSHASH_NEW;
-+ rshash_state.below_count = 0;
-+ rshash_state.lookup_window_index = 0;
-+
-+ return cal_positive_negative_costs();
-+}
-+
-+static int __init uksm_slab_init(void)
-+{
-+ rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0);
-+ if (!rmap_item_cache)
-+ goto out;
-+
-+ stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0);
-+ if (!stable_node_cache)
-+ goto out_free1;
-+
-+ node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0);
-+ if (!node_vma_cache)
-+ goto out_free2;
-+
-+ vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0);
-+ if (!vma_slot_cache)
-+ goto out_free3;
-+
-+ tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0);
-+ if (!tree_node_cache)
-+ goto out_free4;
-+
-+ return 0;
-+
-+out_free4:
-+ kmem_cache_destroy(vma_slot_cache);
-+out_free3:
-+ kmem_cache_destroy(node_vma_cache);
-+out_free2:
-+ kmem_cache_destroy(stable_node_cache);
-+out_free1:
-+ kmem_cache_destroy(rmap_item_cache);
-+out:
-+ return -ENOMEM;
-+}
-+
-+static void __init uksm_slab_free(void)
-+{
-+ kmem_cache_destroy(stable_node_cache);
-+ kmem_cache_destroy(rmap_item_cache);
-+ kmem_cache_destroy(node_vma_cache);
-+ kmem_cache_destroy(vma_slot_cache);
-+ kmem_cache_destroy(tree_node_cache);
-+}
-+
-+/* Common interface with ksm, but the behavior here differs. */
-+int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
-+ unsigned long end, int advice, unsigned long *vm_flags)
-+{
-+ int err;
-+
-+ switch (advice) {
-+ case MADV_MERGEABLE:
-+ return 0; /* just ignore the advice */
-+
-+ case MADV_UNMERGEABLE:
-+ if (!(*vm_flags & VM_MERGEABLE) || !uksm_flags_can_scan(*vm_flags))
-+ return 0; /* just ignore the advice */
-+
-+ if (vma->anon_vma) {
-+ err = unmerge_uksm_pages(vma, start, end);
-+ if (err)
-+ return err;
-+ }
-+
-+ uksm_remove_vma(vma);
-+ *vm_flags &= ~VM_MERGEABLE;
-+ break;
-+ }
-+
-+ return 0;
-+}
-+
-+/* Common interface to ksm, actually the same. */
-+struct page *ksm_might_need_to_copy(struct page *page,
-+ struct vm_area_struct *vma, unsigned long address)
-+{
-+ struct anon_vma *anon_vma = page_anon_vma(page);
-+ struct page *new_page;
-+
-+ if (PageKsm(page)) {
-+ if (page_stable_node(page))
-+ return page; /* no need to copy it */
-+ } else if (!anon_vma) {
-+ return page; /* no need to copy it */
-+ } else if (anon_vma->root == vma->anon_vma->root &&
-+ page->index == linear_page_index(vma, address)) {
-+ return page; /* still no need to copy it */
-+ }
-+ if (!PageUptodate(page))
-+ return page; /* let do_swap_page report the error */
-+
-+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
-+ if (new_page) {
-+ copy_user_highpage(new_page, page, address, vma);
-+
-+ SetPageDirty(new_page);
-+ __SetPageUptodate(new_page);
-+ __SetPageLocked(new_page);
-+ }
-+
-+ return new_page;
-+}
-+
-+/* Copied from mm/ksm.c and required from 5.1 */
-+bool reuse_ksm_page(struct page *page,
-+ struct vm_area_struct *vma,
-+ unsigned long address)
-+{
-+#ifdef CONFIG_DEBUG_VM
-+ if (WARN_ON(is_zero_pfn(page_to_pfn(page))) ||
-+ WARN_ON(!page_mapped(page)) ||
-+ WARN_ON(!PageLocked(page))) {
-+ dump_page(page, "reuse_ksm_page");
-+ return false;
-+ }
-+#endif
-+
-+ if (PageSwapCache(page) || !page_stable_node(page))
-+ return false;
-+ /* Prohibit parallel get_ksm_page() */
-+ if (!page_ref_freeze(page, 1))
-+ return false;
-+
-+ page_move_anon_rmap(page, vma);
-+ page->index = linear_page_index(vma, address);
-+ page_ref_unfreeze(page, 1);
-+
-+ return true;
-+}
-+
-+static int __init uksm_init(void)
-+{
-+ struct task_struct *uksm_thread;
-+ int err;
-+
-+ uksm_sleep_jiffies = msecs_to_jiffies(100);
-+ uksm_sleep_saved = uksm_sleep_jiffies;
-+
-+ slot_tree_init();
-+ init_scan_ladder();
-+
-+
-+ err = init_random_sampling();
-+ if (err)
-+ goto out_free2;
-+
-+ err = uksm_slab_init();
-+ if (err)
-+ goto out_free1;
-+
-+ err = init_zeropage_hash_table();
-+ if (err)
-+ goto out_free0;
-+
-+ uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd");
-+ if (IS_ERR(uksm_thread)) {
-+ pr_err("uksm: creating kthread failed\n");
-+ err = PTR_ERR(uksm_thread);
-+ goto out_free;
-+ }
-+
-+#ifdef CONFIG_SYSFS
-+ err = sysfs_create_group(mm_kobj, &uksm_attr_group);
-+ if (err) {
-+ pr_err("uksm: register sysfs failed\n");
-+ kthread_stop(uksm_thread);
-+ goto out_free;
-+ }
-+#else
-+ uksm_run = UKSM_RUN_MERGE; /* no way for user to start it */
-+
-+#endif /* CONFIG_SYSFS */
-+
-+#ifdef CONFIG_MEMORY_HOTREMOVE
-+ /*
-+ * Choose a high priority since the callback takes uksm_thread_mutex:
-+ * later callbacks could only be taking locks which nest within that.
-+ */
-+ hotplug_memory_notifier(uksm_memory_callback, 100);
-+#endif
-+ return 0;
-+
-+out_free:
-+ kfree(zero_hash_table);
-+out_free0:
-+ uksm_slab_free();
-+out_free1:
-+ kfree(random_nums);
-+out_free2:
-+ kfree(uksm_scan_ladder);
-+ return err;
-+}
-+
-+#ifdef MODULE
-+subsys_initcall(ksm_init);
-+#else
-+late_initcall(uksm_init);
-+#endif
-+
-diff --git a/mm/vmstat.c b/mm/vmstat.c
-index 74b2c374b..ae42103a8 100644
---- a/mm/vmstat.c
-+++ b/mm/vmstat.c
-@@ -1231,6 +1231,9 @@ const char * const vmstat_text[] = {
- "nr_swapcached",
- #endif
-
-+#ifdef CONFIG_UKSM
-+ "nr_uksm_zero_pages",
-+#endif
- /* enum writeback_stat_item counters */
- "nr_dirty_threshold",
- "nr_dirty_background_threshold",
---
-2.31.1.305.gd1b10fc6d8
-
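As a usage sketch for the sysfs group registered by uksm_init() above: the /sys/kernel/mm/uksm/ path follows from sysfs_create_group(mm_kobj, &uksm_attr_group) with .name = "uksm", while the specific values written below (the numeric meaning of UKSM_RUN_MERGE and the "quiet" governor name) are assumptions based on the code rather than something shown in this diff. A minimal userspace program might tune the scanner like this:

#include <stdio.h>

/* Write one value into a knob under /sys/kernel/mm/uksm/ (path assumed,
 * derived from the sysfs group name "uksm" registered on mm_kobj). */
static int uksm_write(const char *knob, const char *val)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/kernel/mm/uksm/%s", knob);
	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return -1;
	}
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	/* "1" is assumed to be UKSM_RUN_MERGE, mirroring KSM's run knob. */
	uksm_write("run", "1");
	/* Wake the scanner every 250 ms (handled by sleep_millisecs_store). */
	uksm_write("sleep_millisecs", "250");
	/* Pick a preset by name; cpu_governor_store() matches by prefix.
	 * "quiet" is an assumed governor name from uksm_cpu_governor_str,
	 * which is defined outside this hunk. */
	uksm_write("cpu_governor", "quiet");
	return 0;
}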
diff --git a/0009-bbr2.patch b/0009-bbr2.patch
deleted file mode 100644
index 5a257ca22043..000000000000
--- a/0009-bbr2.patch
+++ /dev/null
@@ -1,3347 +0,0 @@
-From f3d069e2cafed9758d66fcfc42447b028d42493f Mon Sep 17 00:00:00 2001
-From: Piotr Gorski <lucjan.lucjanov@gmail.com>
-Date: Mon, 26 Apr 2021 21:14:18 +0200
-Subject: [PATCH] bbr2-5.12: introduce BBRv2
-
-Signed-off-by: Piotr Gorski <lucjan.lucjanov@gmail.com>
----
- include/linux/tcp.h | 3 +-
- include/net/inet_connection_sock.h | 5 +-
- include/net/tcp.h | 48 +-
- include/uapi/linux/inet_diag.h | 33 +
- net/ipv4/Kconfig | 22 +
- net/ipv4/Makefile | 1 +
- net/ipv4/bpf_tcp_ca.c | 2 +-
- net/ipv4/tcp.c | 1 +
- net/ipv4/tcp_bbr.c | 38 +-
- net/ipv4/tcp_bbr2.c | 2671 ++++++++++++++++++++++++++++
- net/ipv4/tcp_cong.c | 1 +
- net/ipv4/tcp_input.c | 38 +-
- net/ipv4/tcp_output.c | 25 +-
- net/ipv4/tcp_rate.c | 36 +-
- net/ipv4/tcp_timer.c | 1 +
- 15 files changed, 2879 insertions(+), 46 deletions(-)
- create mode 100644 net/ipv4/tcp_bbr2.c
-
-diff --git a/include/linux/tcp.h b/include/linux/tcp.h
-index 48d8a3633..1bd559c69 100644
---- a/include/linux/tcp.h
-+++ b/include/linux/tcp.h
-@@ -225,7 +225,8 @@ struct tcp_sock {
- u8 compressed_ack;
- u8 dup_ack_counter:2,
- tlp_retrans:1, /* TLP is a retransmission */
-- unused:5;
-+ fast_ack_mode:2, /* which fast ack mode ? */
-+ unused:3;
- u32 chrono_start; /* Start time in jiffies of a TCP chrono */
- u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */
- u8 chrono_type:2, /* current chronograph type */
-diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
-index 3c8c59471..2cdc5a070 100644
---- a/include/net/inet_connection_sock.h
-+++ b/include/net/inet_connection_sock.h
-@@ -134,8 +134,9 @@ struct inet_connection_sock {
- u32 icsk_probes_tstamp;
- u32 icsk_user_timeout;
-
-- u64 icsk_ca_priv[104 / sizeof(u64)];
--#define ICSK_CA_PRIV_SIZE (13 * sizeof(u64))
-+/* XXX inflated by temporary internal debugging info */
-+#define ICSK_CA_PRIV_SIZE (216)
-+ u64 icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)];
- };
-
- #define ICSK_TIME_RETRANS 1 /* Retransmit timer */
-diff --git a/include/net/tcp.h b/include/net/tcp.h
-index 963cd86d1..5a86fa1d2 100644
---- a/include/net/tcp.h
-+++ b/include/net/tcp.h
-@@ -799,6 +799,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)
- return max_t(s64, t1 - t0, 0);
- }
-
-+static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0)
-+{
-+ return max_t(s32, t1 - t0, 0);
-+}
-+
- static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
- {
- return tcp_ns_to_ts(skb->skb_mstamp_ns);
-@@ -866,16 +871,22 @@ struct tcp_skb_cb {
- __u32 ack_seq; /* Sequence number ACK'd */
- union {
- struct {
-+#define TCPCB_DELIVERED_CE_MASK ((1U<<20) - 1)
- /* There is space for up to 24 bytes */
-- __u32 in_flight:30,/* Bytes in flight at transmit */
-- is_app_limited:1, /* cwnd not fully used? */
-- unused:1;
-+ __u32 is_app_limited:1, /* cwnd not fully used? */
-+ delivered_ce:20,
-+ unused:11;
- /* pkts S/ACKed so far upon tx of skb, incl retrans: */
- __u32 delivered;
- /* start of send pipeline phase */
-- u64 first_tx_mstamp;
-+ u32 first_tx_mstamp;
- /* when we reached the "delivered" count */
-- u64 delivered_mstamp;
-+ u32 delivered_mstamp;
-+#define TCPCB_IN_FLIGHT_BITS 20
-+#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1)
-+ u32 in_flight:20, /* packets in flight at transmit */
-+ unused2:12;
-+ u32 lost; /* packets lost so far upon tx of skb */
- } tx; /* only used for outgoing skbs */
- union {
- struct inet_skb_parm h4;
-@@ -1025,7 +1036,11 @@ enum tcp_ca_ack_event_flags {
- #define TCP_CONG_NON_RESTRICTED 0x1
- /* Requires ECN/ECT set on all packets */
- #define TCP_CONG_NEEDS_ECN 0x2
--#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN)
-+/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). */
-+#define TCP_CONG_WANTS_CE_EVENTS 0x4
-+#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \
-+ TCP_CONG_NEEDS_ECN | \
-+ TCP_CONG_WANTS_CE_EVENTS)
-
- union tcp_cc_info;
-
-@@ -1045,8 +1060,13 @@ struct ack_sample {
- */
- struct rate_sample {
- u64 prior_mstamp; /* starting timestamp for interval */
-+ u32 prior_lost; /* tp->lost at "prior_mstamp" */
- u32 prior_delivered; /* tp->delivered at "prior_mstamp" */
-+ u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */
-+ u32 tx_in_flight; /* packets in flight at starting timestamp */
-+ s32 lost; /* number of packets lost over interval */
- s32 delivered; /* number of packets delivered over interval */
-+ s32 delivered_ce; /* packets delivered w/ CE mark over interval */
- long interval_us; /* time for tp->delivered to incr "delivered" */
- u32 snd_interval_us; /* snd interval for delivered packets */
- u32 rcv_interval_us; /* rcv interval for delivered packets */
-@@ -1057,6 +1077,7 @@ struct rate_sample {
- bool is_app_limited; /* is sample from packet with bubble in pipe? */
- bool is_retrans; /* is sample from retransmission? */
- bool is_ack_delayed; /* is this (likely) a delayed ACK? */
-+ bool is_ece; /* did this ACK have ECN marked? */
- };
-
- struct tcp_congestion_ops {
-@@ -1083,10 +1104,12 @@ struct tcp_congestion_ops {
- u32 (*undo_cwnd)(struct sock *sk);
- /* hook for packet ack accounting (optional) */
- void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
-- /* override sysctl_tcp_min_tso_segs */
-- u32 (*min_tso_segs)(struct sock *sk);
-+ /* pick target number of segments per TSO/GSO skb (optional): */
-+ u32 (*tso_segs)(struct sock *sk, unsigned int mss_now);
- /* returns the multiplier used in tcp_sndbuf_expand (optional) */
- u32 (*sndbuf_expand)(struct sock *sk);
-+ /* react to a specific lost skb (optional) */
-+ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb);
- /* call when packets are delivered to update cwnd and pacing rate,
- * after all the ca_state processing. (optional)
- */
-@@ -1132,6 +1155,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer)
- }
- #endif
-
-+static inline bool tcp_ca_wants_ce_events(const struct sock *sk)
-+{
-+ const struct inet_connection_sock *icsk = inet_csk(sk);
-+
-+ return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN |
-+ TCP_CONG_WANTS_CE_EVENTS);
-+}
-+
- static inline bool tcp_ca_needs_ecn(const struct sock *sk)
- {
- const struct inet_connection_sock *icsk = inet_csk(sk);
-@@ -1157,6 +1188,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
- }
-
- /* From tcp_rate.c */
-+void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb);
- void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb);
- void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
- struct rate_sample *rs);
-diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
-index 20ee93f0f..96d52dd9c 100644
---- a/include/uapi/linux/inet_diag.h
-+++ b/include/uapi/linux/inet_diag.h
-@@ -231,9 +231,42 @@ struct tcp_bbr_info {
- __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */
- };
-
-+/* Phase as reported in netlink/ss stats. */
-+enum tcp_bbr2_phase {
-+ BBR2_PHASE_INVALID = 0,
-+ BBR2_PHASE_STARTUP = 1,
-+ BBR2_PHASE_DRAIN = 2,
-+ BBR2_PHASE_PROBE_RTT = 3,
-+ BBR2_PHASE_PROBE_BW_UP = 4,
-+ BBR2_PHASE_PROBE_BW_DOWN = 5,
-+ BBR2_PHASE_PROBE_BW_CRUISE = 6,
-+ BBR2_PHASE_PROBE_BW_REFILL = 7
-+};
-+
-+struct tcp_bbr2_info {
-+ /* u64 bw: bandwidth (app throughput) estimate in Byte per sec: */
-+ __u32 bbr_bw_lsb; /* lower 32 bits of bw */
-+ __u32 bbr_bw_msb; /* upper 32 bits of bw */
-+ __u32 bbr_min_rtt; /* min-filtered RTT in uSec */
-+ __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */
-+ __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */
-+ __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */
-+ __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */
-+ __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */
-+ __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */
-+ __u8 bbr_mode; /* current bbr_mode in state machine */
-+ __u8 bbr_phase; /* current state machine phase */
-+ __u8 unused1; /* alignment padding; not used yet */
-+ __u8 bbr_version; /* MUST be at this offset in struct */
-+ __u32 bbr_inflight_lo; /* lower/short-term data volume bound */
-+ __u32 bbr_inflight_hi; /* higher/long-term data volume bound */
-+ __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */
-+};
-+
- union tcp_cc_info {
- struct tcpvegas_info vegas;
- struct tcp_dctcp_info dctcp;
- struct tcp_bbr_info bbr;
-+ struct tcp_bbr2_info bbr2;
- };
- #endif /* _UAPI_INET_DIAG_H_ */
-diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
-index 87983e70f..a833a7a67 100644
---- a/net/ipv4/Kconfig
-+++ b/net/ipv4/Kconfig
-@@ -669,6 +669,24 @@ config TCP_CONG_BBR
- AQM schemes that do not provide a delay signal. It requires the fq
- ("Fair Queue") pacing packet scheduler.
-
-+config TCP_CONG_BBR2
-+ tristate "BBR2 TCP"
-+ default n
-+ help
-+
-+ BBR2 TCP congestion control is a model-based congestion control
-+ algorithm that aims to maximize network utilization, keep queues and
-+ retransmit rates low, and to be able to coexist with Reno/CUBIC in
-+ common scenarios. It builds an explicit model of the network path. It
-+ tolerates a targeted degree of random packet loss and delay that are
-+ unrelated to congestion. It can operate over LAN, WAN, cellular, wifi,
-+ or cable modem links, and can use DCTCP-L4S-style ECN signals. It can
-+ coexist with flows that use loss-based congestion control, and can
-+ operate with shallow buffers, deep buffers, bufferbloat, policers, or
-+ AQM schemes that do not provide a delay signal. It requires pacing,
-+ using either TCP internal pacing or the fq ("Fair Queue") pacing packet
-+ scheduler.
-+
- choice
- prompt "Default TCP congestion control"
- default DEFAULT_CUBIC
-@@ -706,6 +724,9 @@ choice
- config DEFAULT_BBR
- bool "BBR" if TCP_CONG_BBR=y
-
-+ config DEFAULT_BBR2
-+ bool "BBR2" if TCP_CONG_BBR2=y
-+
- config DEFAULT_RENO
- bool "Reno"
- endchoice
-@@ -730,6 +751,7 @@ config DEFAULT_TCP_CONG
- default "dctcp" if DEFAULT_DCTCP
- default "cdg" if DEFAULT_CDG
- default "bbr" if DEFAULT_BBR
-+ default "bbr2" if DEFAULT_BBR2
- default "cubic"
-
- config TCP_MD5SIG
-diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
-index 5b77a4688..8c5779dba 100644
---- a/net/ipv4/Makefile
-+++ b/net/ipv4/Makefile
-@@ -46,6 +46,7 @@ obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
- obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
- obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o
- obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o
-+obj-$(CONFIG_TCP_CONG_BBR2) += tcp_bbr2.o
- obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
- obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o
- obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
-diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
-index d520e6164..22129c1c5 100644
---- a/net/ipv4/bpf_tcp_ca.c
-+++ b/net/ipv4/bpf_tcp_ca.c
-@@ -16,7 +16,7 @@ static u32 optional_ops[] = {
- offsetof(struct tcp_congestion_ops, cwnd_event),
- offsetof(struct tcp_congestion_ops, in_ack_event),
- offsetof(struct tcp_congestion_ops, pkts_acked),
-- offsetof(struct tcp_congestion_ops, min_tso_segs),
-+ offsetof(struct tcp_congestion_ops, tso_segs),
- offsetof(struct tcp_congestion_ops, sndbuf_expand),
- offsetof(struct tcp_congestion_ops, cong_control),
- };
-diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
-index de7cc8445..521f310f2 100644
---- a/net/ipv4/tcp.c
-+++ b/net/ipv4/tcp.c
-@@ -3033,6 +3033,7 @@ int tcp_disconnect(struct sock *sk, int flags)
- tp->rx_opt.dsack = 0;
- tp->rx_opt.num_sacks = 0;
- tp->rcv_ooopack = 0;
-+ tp->fast_ack_mode = 0;
-
-
- /* Clean up fastopen related fields */
-diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
-index 6ea3dc2e4..8ef512fef 100644
---- a/net/ipv4/tcp_bbr.c
-+++ b/net/ipv4/tcp_bbr.c
-@@ -292,26 +292,40 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
- sk->sk_pacing_rate = rate;
- }
-
--/* override sysctl_tcp_min_tso_segs */
- static u32 bbr_min_tso_segs(struct sock *sk)
- {
- return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2;
- }
-
-+/* Return the number of segments BBR would like in a TSO/GSO skb, given
-+ * a particular max gso size as a constraint.
-+ */
-+static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now,
-+ u32 gso_max_size)
-+{
-+ u32 segs;
-+ u64 bytes;
-+
-+ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */
-+ bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift;
-+
-+ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER);
-+ segs = max_t(u32, bytes / mss_now, bbr_min_tso_segs(sk));
-+ return segs;
-+}
-+
-+/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */
-+static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now)
-+{
-+ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size);
-+}
-+
-+/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */
- static u32 bbr_tso_segs_goal(struct sock *sk)
- {
- struct tcp_sock *tp = tcp_sk(sk);
-- u32 segs, bytes;
--
-- /* Sort of tcp_tso_autosize() but ignoring
-- * driver provided sk_gso_max_size.
-- */
-- bytes = min_t(unsigned long,
-- sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
-- GSO_MAX_SIZE - 1 - MAX_TCP_HEADER);
-- segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk));
-
-- return min(segs, 0x7FU);
-+ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_MAX_SIZE);
- }
-
- /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */
-@@ -1147,7 +1161,7 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
- .undo_cwnd = bbr_undo_cwnd,
- .cwnd_event = bbr_cwnd_event,
- .ssthresh = bbr_ssthresh,
-- .min_tso_segs = bbr_min_tso_segs,
-+ .tso_segs = bbr_tso_segs,
- .get_info = bbr_get_info,
- .set_state = bbr_set_state,
- };
-diff --git a/net/ipv4/tcp_bbr2.c b/net/ipv4/tcp_bbr2.c
-new file mode 100644
-index 000000000..17d6a059d
---- /dev/null
-+++ b/net/ipv4/tcp_bbr2.c
-@@ -0,0 +1,2671 @@
-+/* BBR (Bottleneck Bandwidth and RTT) congestion control, v2
-+ *
-+ * BBRv2 is a model-based congestion control algorithm that aims for low
-+ * queues, low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model
-+ * of the network path, it uses measurements of bandwidth and RTT, as well as
-+ * (if they occur) packet loss and/or DCTCP/L4S-style ECN signals. Note that
-+ * although it can use ECN or loss signals explicitly, it does not require
-+ * either; it can bound its in-flight data based on its estimate of the BDP.
-+ *
-+ * The model has both higher and lower bounds for the operating range:
-+ * lo: bw_lo, inflight_lo: conservative short-term lower bound
-+ * hi: bw_hi, inflight_hi: robust long-term upper bound
-+ * The bandwidth-probing time scale is (a) extended dynamically based on
-+ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by
-+ * an interactive wall-clock time-scale to be more scalable and responsive
-+ * than Reno and CUBIC.
-+ *
-+ * Here is a state transition diagram for BBR:
-+ *
-+ * |
-+ * V
-+ * +---> STARTUP ----+
-+ * | | |
-+ * | V |
-+ * | DRAIN ----+
-+ * | | |
-+ * | V |
-+ * +---> PROBE_BW ----+
-+ * | ^ | |
-+ * | | | |
-+ * | +----+ |
-+ * | |
-+ * +---- PROBE_RTT <--+
-+ *
-+ * A BBR flow starts in STARTUP, and ramps up its sending rate quickly.
-+ * When it estimates the pipe is full, it enters DRAIN to drain the queue.
-+ * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT.
-+ * A long-lived BBR flow spends the vast majority of its time remaining
-+ * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth
-+ * in a fair manner, with a small, bounded queue. *If* a flow has been
-+ * continuously sending for the entire min_rtt window, and hasn't seen an RTT
-+ * sample that matches or decreases its min_rtt estimate for 10 seconds, then
-+ * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe
-+ * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if
-+ * we estimated that we reached the full bw of the pipe then we enter PROBE_BW;
-+ * otherwise we enter STARTUP to try to fill the pipe.
-+ *
-+ * BBR is described in detail in:
-+ * "BBR: Congestion-Based Congestion Control",
-+ * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh,
-+ * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016.
-+ *
-+ * There is a public e-mail list for discussing BBR development and testing:
-+ * https://groups.google.com/forum/#!forum/bbr-dev
-+ *
-+ * NOTE: BBR might be used with the fq qdisc ("man tc-fq") with pacing enabled,
-+ * otherwise TCP stack falls back to an internal pacing using one high
-+ * resolution timer per TCP socket and may use more resources.
-+ */
-+#include <linux/module.h>
-+#include <net/tcp.h>
-+#include <linux/inet_diag.h>
-+#include <linux/inet.h>
-+#include <linux/random.h>
-+
-+#include "tcp_dctcp.h"
-+
-+/* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth
-+ * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps.
-+ * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32.
-+ * Since the minimum window is >=4 packets, the lower bound isn't
-+ * an issue. The upper bound isn't an issue with existing technologies.
-+ */
-+#define BW_SCALE 24
-+#define BW_UNIT (1 << BW_SCALE)
-+
-+#define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g. gains) */
-+#define BBR_UNIT (1 << BBR_SCALE)
-+
-+#define FLAG_DEBUG_VERBOSE 0x1 /* Verbose debugging messages */
-+#define FLAG_DEBUG_LOOPBACK 0x2 /* Do NOT skip loopback addr */
-+
-+#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */
-+
-+/* BBR has the following modes for deciding how fast to send: */
-+enum bbr_mode {
-+ BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */
-+ BBR_DRAIN, /* drain any queue created during startup */
-+ BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */
-+ BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */
-+};
-+
-+/* How does the incoming ACK stream relate to our bandwidth probing? */
-+enum bbr_ack_phase {
-+ BBR_ACKS_INIT, /* not probing; not getting probe feedback */
-+ BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */
-+ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */
-+ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */
-+ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */
-+};
-+
-+/* BBR congestion control block */
-+struct bbr {
-+ u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */
-+ u32 min_rtt_stamp; /* timestamp of min_rtt_us */
-+ u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */
-+ u32 probe_rtt_min_us; /* min RTT in bbr_probe_rtt_win_ms window */
-+ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/
-+ u32 next_rtt_delivered; /* scb->tx.delivered at end of round */
-+ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */
-+ u64 cycle_mstamp; /* time of this cycle phase start */
-+ u32 mode:3, /* current bbr_mode in state machine */
-+ prev_ca_state:3, /* CA state on previous ACK */
-+ packet_conservation:1, /* use packet conservation? */
-+ round_start:1, /* start of packet-timed tx->ack round? */
-+ ce_state:1, /* If most recent data has CE bit set */
-+ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */
-+ try_fast_path:1, /* can we take fast path? */
-+ unused2:11,
-+ idle_restart:1, /* restarting after idle? */
-+ probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */
-+ cycle_idx:3, /* current index in pacing_gain cycle array */
-+ has_seen_rtt:1; /* have we seen an RTT sample yet? */
-+ u32 pacing_gain:11, /* current gain for setting pacing rate */
-+ cwnd_gain:11, /* current gain for setting cwnd */
-+ full_bw_reached:1, /* reached full bw in Startup? */
-+ full_bw_cnt:2, /* number of rounds without large bw gains */
-+ init_cwnd:7; /* initial cwnd */
-+ u32 prior_cwnd; /* prior cwnd upon entering loss recovery */
-+ u32 full_bw; /* recent bw, to estimate if pipe is full */
-+
-+ /* For tracking ACK aggregation: */
-+ u64 ack_epoch_mstamp; /* start of ACK sampling epoch */
-+ u16 extra_acked[2]; /* max excess data ACKed in epoch */
-+ u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */
-+ extra_acked_win_rtts:5, /* age of extra_acked, in round trips */
-+ extra_acked_win_idx:1, /* current index in extra_acked array */
-+ /* BBR v2 state: */
-+ unused1:2,
-+ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */
-+ loss_in_cycle:1, /* packet loss in this cycle? */
-+ ecn_in_cycle:1; /* ECN in this cycle? */
-+ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */
-+ u32 undo_bw_lo; /* bw_lo before latest losses */
-+ u32 undo_inflight_lo; /* inflight_lo before latest losses */
-+ u32 undo_inflight_hi; /* inflight_hi before latest losses */
-+ u32 bw_latest; /* max delivered bw in last round trip */
-+ u32 bw_lo; /* lower bound on sending bandwidth */
-+ u32 bw_hi[2]; /* upper bound of sending bandwidth range*/
-+ u32 inflight_latest; /* max delivered data in last round trip */
-+ u32 inflight_lo; /* lower bound of inflight data range */
-+ u32 inflight_hi; /* upper bound of inflight data range */
-+ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */
-+ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */
-+ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */
-+ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */
-+ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */
-+ bw_probe_samples:1, /* rate samples reflect bw probing? */
-+ prev_probe_too_high:1, /* did last PROBE_UP go too high? */
-+ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */
-+ rounds_since_probe:8, /* packet-timed rounds since probed bw */
-+ loss_round_start:1, /* loss_round_delivered round trip? */
-+ loss_in_round:1, /* loss marked in this round trip? */
-+ ecn_in_round:1, /* ECN marked in this round trip? */
-+ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */
-+ loss_events_in_round:4,/* losses in STARTUP round */
-+ initialized:1; /* has bbr_init() been called? */
-+ u32 alpha_last_delivered; /* tp->delivered at alpha update */
-+ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */
-+
-+	/* Params configurable using setsockopt. Refer to corresponding
-+ * module param for detailed description of params.
-+ */
-+ struct bbr_params {
-+ u32 high_gain:11, /* max allowed value: 2047 */
-+ drain_gain:10, /* max allowed value: 1023 */
-+ cwnd_gain:11; /* max allowed value: 2047 */
-+ u32 cwnd_min_target:4, /* max allowed value: 15 */
-+ min_rtt_win_sec:5, /* max allowed value: 31 */
-+ probe_rtt_mode_ms:9, /* max allowed value: 511 */
-+ full_bw_cnt:3, /* max allowed value: 7 */
-+ cwnd_tso_budget:1, /* allowed values: {0, 1} */
-+ unused3:6,
-+ drain_to_target:1, /* boolean */
-+ precise_ece_ack:1, /* boolean */
-+ extra_acked_in_startup:1, /* allowed values: {0, 1} */
-+ fast_path:1; /* boolean */
-+ u32 full_bw_thresh:10, /* max allowed value: 1023 */
-+ startup_cwnd_gain:11, /* max allowed value: 2047 */
-+ bw_probe_pif_gain:9, /* max allowed value: 511 */
-+ usage_based_cwnd:1, /* boolean */
-+ unused2:1;
-+ u16 probe_rtt_win_ms:14, /* max allowed value: 16383 */
-+ refill_add_inc:2; /* max allowed value: 3 */
-+ u16 extra_acked_gain:11, /* max allowed value: 2047 */
-+ extra_acked_win_rtts:5; /* max allowed value: 31*/
-+ u16 pacing_gain[CYCLE_LEN]; /* max allowed value: 1023 */
-+ /* Mostly BBR v2 parameters below here: */
-+ u32 ecn_alpha_gain:8, /* max allowed value: 255 */
-+ ecn_factor:8, /* max allowed value: 255 */
-+ ecn_thresh:8, /* max allowed value: 255 */
-+ beta:8; /* max allowed value: 255 */
-+ u32 ecn_max_rtt_us:19, /* max allowed value: 524287 */
-+ bw_probe_reno_gain:9, /* max allowed value: 511 */
-+ full_loss_cnt:4; /* max allowed value: 15 */
-+ u32 probe_rtt_cwnd_gain:8, /* max allowed value: 255 */
-+ inflight_headroom:8, /* max allowed value: 255 */
-+ loss_thresh:8, /* max allowed value: 255 */
-+ bw_probe_max_rounds:8; /* max allowed value: 255 */
-+ u32 bw_probe_rand_rounds:4, /* max allowed value: 15 */
-+ bw_probe_base_us:26, /* usecs: 0..2^26-1 (67 secs) */
-+ full_ecn_cnt:2; /* max allowed value: 3 */
-+ u32 bw_probe_rand_us:26, /* usecs: 0..2^26-1 (67 secs) */
-+ undo:1, /* boolean */
-+ tso_rtt_shift:4, /* max allowed value: 15 */
-+ unused5:1;
-+ u32 ecn_reprobe_gain:9, /* max allowed value: 511 */
-+ unused1:14,
-+ ecn_alpha_init:9; /* max allowed value: 256 */
-+ } params;
-+
-+ struct {
-+ u32 snd_isn; /* Initial sequence number */
-+ u32 rs_bw; /* last valid rate sample bw */
-+ u32 target_cwnd; /* target cwnd, based on BDP */
-+ u8 undo:1, /* Undo even happened but not yet logged */
-+ unused:7;
-+ char event; /* single-letter event debug codes */
-+ u16 unused2;
-+ } debug;
-+};
-+
-+struct bbr_context {
-+ u32 sample_bw;
-+ u32 target_cwnd;
-+ u32 log:1;
-+};
-+
-+/* Window length of min_rtt filter (in sec). Max allowed value is 31 (0x1F) */
-+static u32 bbr_min_rtt_win_sec = 10;
-+/* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode.
-+ * Max allowed value is 511 (0x1FF).
-+ */
-+static u32 bbr_probe_rtt_mode_ms = 200;
-+/* Window length of probe_rtt_min_us filter (in ms), and consequently the
-+ * typical interval between PROBE_RTT mode entries.
-+ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC
-+ */
-+static u32 bbr_probe_rtt_win_ms = 5000;
-+/* Skip TSO below the following bandwidth (bits/sec): */
-+static int bbr_min_tso_rate = 1200000;
-+
-+/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting
-+ * in bigger TSO bursts. By default we cut the RTT-based allowance in half
-+ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance
-+ * is below 1500 bytes after 6 * ~500 usec = 3ms.
-+ */
-+static u32 bbr_tso_rtt_shift = 9; /* halve allowance per 2^9 usecs, 512us */
-+
-+/* Select cwnd TSO budget approach:
-+ * 0: padding
-+ * 1: flooring
-+ */
-+static uint bbr_cwnd_tso_budget = 1;
-+
-+/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck.
-+ * In order to help drive the network toward lower queues and low latency while
-+ * maintaining high utilization, the average pacing rate aims to be slightly
-+ * lower than the estimated bandwidth. This is an important aspect of the
-+ * design.
-+ */
-+static const int bbr_pacing_margin_percent = 1;
-+
-+/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain
-+ * that will allow a smoothly increasing pacing rate that will double each RTT
-+ * and send the same number of packets per RTT that an un-paced, slow-starting
-+ * Reno or CUBIC flow would. Max allowed value is 2047 (0x7FF).
-+ */
-+static int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1;
-+/* The gain for deriving startup cwnd. Max allowed value is 2047 (0x7FF). */
-+static int bbr_startup_cwnd_gain = BBR_UNIT * 2885 / 1000 + 1;
-+/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain
-+ * the queue created in BBR_STARTUP in a single round. Max allowed value
-+ * is 1023 (0x3FF).
-+ */
-+static int bbr_drain_gain = BBR_UNIT * 1000 / 2885;
-+/* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs.
-+ * Max allowed value is 2047 (0x7FF).
-+ */
-+static int bbr_cwnd_gain = BBR_UNIT * 2;
-+/* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw.
-+ * Max allowed value for each element is 1023 (0x3FF).
-+ */
-+enum bbr_pacing_gain_phase {
-+ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */
-+ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */
-+ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */
-+ BBR_BW_PROBE_REFILL = 3, /* v2: refill the pipe again to 100% */
-+};
-+static int bbr_pacing_gain[] = {
-+ BBR_UNIT * 5 / 4, /* probe for more available bw */
-+ BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */
-+ BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */
-+ BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */
-+};
-+
-+/* Try to keep at least this many packets in flight, if things go smoothly. For
-+ * smooth functioning, a sliding window protocol ACKing every other packet
-+ * needs at least 4 packets in flight. Max allowed value is 15 (0xF).
-+ */
-+static u32 bbr_cwnd_min_target = 4;
-+
-+/* Cwnd to BDP proportion in PROBE_RTT mode scaled by BBR_UNIT. Default: 50%.
-+ * Use 0 to disable. Max allowed value is 255.
-+ */
-+static u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2;
-+
-+/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */
-+/* If bw has increased significantly (1.25x), there may be more bw available.
-+ * Max allowed value is 1023 (0x3FF).
-+ */
-+static u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4;
-+/* But after 3 rounds w/o significant bw growth, estimate pipe is full.
-+ * Max allowed value is 7 (0x7).
-+ */
-+static u32 bbr_full_bw_cnt = 3;
-+
-+static u32 bbr_flags; /* Debugging related stuff */
-+
-+/* Whether to debug using printk.
-+ */
-+static bool bbr_debug_with_printk;
-+
-+/* Whether to debug using ftrace event tcp:tcp_bbr_event.
-+ * Ignored when bbr_debug_with_printk is set.
-+ */
-+static bool bbr_debug_ftrace;
-+
-+/* Experiment: each cycle, try to hold sub-unity gain until inflight <= BDP. */
-+static bool bbr_drain_to_target = true; /* default: enabled */
-+
-+/* Experiment: Flags to control BBR with ECN behavior.
-+ */
-+static bool bbr_precise_ece_ack = true; /* default: enabled */
-+
-+/* The max rwin scaling shift factor is 14 (RFC 1323), so the max sane rwin is
-+ * (2^(16+14) B)/(1024 B/packet) = 1M packets.
-+ */
-+static u32 bbr_cwnd_warn_val = 1U << 20;
-+
-+static u16 bbr_debug_port_mask;
-+
-+/* BBR module parameters. These are module parameters only in Google prod.
-+ * Upstream these are intentionally not module parameters.
-+ */
-+static int bbr_pacing_gain_size = CYCLE_LEN;
-+
-+/* Gain factor for adding extra_acked to target cwnd: */
-+static int bbr_extra_acked_gain = 256;
-+
-+/* Window length of extra_acked window. Max allowed val is 31. */
-+static u32 bbr_extra_acked_win_rtts = 5;
-+
-+/* Max allowed val for ack_epoch_acked, after which sampling epoch is reset */
-+static u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20;
-+
-+/* Time period for clamping cwnd increment due to ack aggregation */
-+static u32 bbr_extra_acked_max_us = 100 * 1000;
-+
-+/* Use extra acked in startup ?
-+ * 0: disabled
-+ * 1: use latest extra_acked value from 1-2 rtt in startup
-+ */
-+static int bbr_extra_acked_in_startup = 1; /* default: enabled */
-+
-+/* Experiment: don't grow cwnd beyond twice of what we just probed. */
-+static bool bbr_usage_based_cwnd; /* default: disabled */
-+
-+/* For lab testing, researchers can enable BBRv2 ECN support with this flag,
-+ * when they know that any ECN marks that the connections experience will be
-+ * DCTCP/L4S-style ECN marks, rather than RFC3168 ECN marks.
-+ * TODO(ncardwell): Production use of the BBRv2 ECN functionality depends on
-+ * negotiation or configuration that is outside the scope of the BBRv2
-+ * alpha release.
-+ */
-+static bool bbr_ecn_enable = false;
-+
-+module_param_named(min_tso_rate, bbr_min_tso_rate, int, 0644);
-+module_param_named(tso_rtt_shift, bbr_tso_rtt_shift, int, 0644);
-+module_param_named(high_gain, bbr_high_gain, int, 0644);
-+module_param_named(drain_gain, bbr_drain_gain, int, 0644);
-+module_param_named(startup_cwnd_gain, bbr_startup_cwnd_gain, int, 0644);
-+module_param_named(cwnd_gain, bbr_cwnd_gain, int, 0644);
-+module_param_array_named(pacing_gain, bbr_pacing_gain, int,
-+ &bbr_pacing_gain_size, 0644);
-+module_param_named(cwnd_min_target, bbr_cwnd_min_target, uint, 0644);
-+module_param_named(probe_rtt_cwnd_gain,
-+ bbr_probe_rtt_cwnd_gain, uint, 0664);
-+module_param_named(cwnd_warn_val, bbr_cwnd_warn_val, uint, 0664);
-+module_param_named(debug_port_mask, bbr_debug_port_mask, ushort, 0644);
-+module_param_named(flags, bbr_flags, uint, 0644);
-+module_param_named(debug_ftrace, bbr_debug_ftrace, bool, 0644);
-+module_param_named(debug_with_printk, bbr_debug_with_printk, bool, 0644);
-+module_param_named(min_rtt_win_sec, bbr_min_rtt_win_sec, uint, 0644);
-+module_param_named(probe_rtt_mode_ms, bbr_probe_rtt_mode_ms, uint, 0644);
-+module_param_named(probe_rtt_win_ms, bbr_probe_rtt_win_ms, uint, 0644);
-+module_param_named(full_bw_thresh, bbr_full_bw_thresh, uint, 0644);
-+module_param_named(full_bw_cnt, bbr_full_bw_cnt, uint, 0644);
-+module_param_named(cwnd_tso_bduget, bbr_cwnd_tso_budget, uint, 0664);
-+module_param_named(extra_acked_gain, bbr_extra_acked_gain, int, 0664);
-+module_param_named(extra_acked_win_rtts,
-+ bbr_extra_acked_win_rtts, uint, 0664);
-+module_param_named(extra_acked_max_us,
-+ bbr_extra_acked_max_us, uint, 0664);
-+module_param_named(ack_epoch_acked_reset_thresh,
-+ bbr_ack_epoch_acked_reset_thresh, uint, 0664);
-+module_param_named(drain_to_target, bbr_drain_to_target, bool, 0664);
-+module_param_named(precise_ece_ack, bbr_precise_ece_ack, bool, 0664);
-+module_param_named(extra_acked_in_startup,
-+ bbr_extra_acked_in_startup, int, 0664);
-+module_param_named(usage_based_cwnd, bbr_usage_based_cwnd, bool, 0664);
-+module_param_named(ecn_enable, bbr_ecn_enable, bool, 0664);
-+
-+static void bbr2_exit_probe_rtt(struct sock *sk);
-+static void bbr2_reset_congestion_signals(struct sock *sk);
-+
-+static void bbr_check_probe_rtt_done(struct sock *sk);
-+
-+/* Do we estimate that STARTUP filled the pipe? */
-+static bool bbr_full_bw_reached(const struct sock *sk)
-+{
-+ const struct bbr *bbr = inet_csk_ca(sk);
-+
-+ return bbr->full_bw_reached;
-+}
-+
-+/* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */
-+static u32 bbr_max_bw(const struct sock *sk)
-+{
-+ struct bbr *bbr = inet_csk_ca(sk);
-+
-+ return max(bbr->bw_hi[0], bbr->bw_hi[1]);
-+}
-+
-+/* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */
-+static u32 bbr_bw(const struct sock *sk)
-+{
-+ struct bbr *bbr = inet_csk_ca(sk);
-+
-+ return min(bbr_max_bw(sk), bbr->bw_lo);
-+}
-+
-+/* Return maximum extra acked in past k-2k round trips,
-+ * where k = bbr_extra_acked_win_rtts.
-+ */
-+static u16 bbr_extra_acked(const struct sock *sk)
-+{
-+ struct bbr *bbr = inet_csk_ca(sk);
-+
-+ return max(bbr->extra_acked[0], bbr->extra_acked[1]);
-+}
-+
-+/* Return rate in bytes per second, optionally with a gain.
-+ * The order here is chosen carefully to avoid overflow of u64. This should
-+ * work for input rates of up to 2.9Tbit/sec and gain of 2.89x.
-+ */
-+static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain,
-+ int margin)
-+{
-+ unsigned int mss = tcp_sk(sk)->mss_cache;
-+
-+ rate *= mss;
-+ rate *= gain;
-+ rate >>= BBR_SCALE;
-+ rate *= USEC_PER_SEC / 100 * (100 - margin);
-+ rate >>= BW_SCALE;
-+ rate = max(rate, 1ULL);
-+ return rate;
-+}
-+
-+static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate)
-+{
-+ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0);
-+}
-+
-+static u64 bbr_rate_kbps(struct sock *sk, u64 rate)
-+{
-+ rate = bbr_bw_bytes_per_sec(sk, rate);
-+ rate *= 8;
-+ do_div(rate, 1000);
-+ return rate;
-+}
-+
-+static u32 bbr_tso_segs_goal(struct sock *sk);
-+static void bbr_debug(struct sock *sk, u32 acked,
-+ const struct rate_sample *rs, struct bbr_context *ctx)
-+{
-+ static const char ca_states[] = {
-+ [TCP_CA_Open] = 'O',
-+ [TCP_CA_Disorder] = 'D',
-+ [TCP_CA_CWR] = 'C',
-+ [TCP_CA_Recovery] = 'R',
-+ [TCP_CA_Loss] = 'L',
-+ };
-+ static const char mode[] = {
-+ 'G', /* Growing - BBR_STARTUP */
-+ 'D', /* Drain - BBR_DRAIN */
-+ 'W', /* Window - BBR_PROBE_BW */
-+ 'M', /* Min RTT - BBR_PROBE_RTT */
-+ };
-+ static const char ack_phase[] = { /* bbr_ack_phase strings */
-+ 'I', /* BBR_ACKS_INIT - 'Init' */
-+ 'R', /* BBR_ACKS_REFILLING - 'Refilling' */
-+ 'B', /* BBR_ACKS_PROBE_STARTING - 'Before' */
-+ 'F', /* BBR_ACKS_PROBE_FEEDBACK - 'Feedback' */
-+ 'A', /* BBR_ACKS_PROBE_STOPPING - 'After' */
-+ };
-+ struct tcp_sock *tp = tcp_sk(sk);
-+ struct bbr *bbr = inet_csk_ca(sk);
-+ const u32 una = tp->snd_una - bbr->debug.snd_isn;
-+ const u32 fack = tcp_highest_sack_seq(tp);
-+ const u16 dport = ntohs(inet_sk(sk)->inet_dport);
-+ bool is_port_match = (bbr_debug_port_mask &&
-+ ((dport & bbr_debug_port_mask) == 0));
-+ char debugmsg[320];
-+
-+ if (sk->sk_state == TCP_SYN_SENT)
-+ return; /* no bbr_init() yet if SYN retransmit -> CA_Loss */
-+
-+ if (!tp->snd_cwnd || tp->snd_cwnd > bbr_cwnd_warn_val) {
-+ char addr[INET6_ADDRSTRLEN + 10] = { 0 };
-+
-+ if (sk->sk_family == AF_INET)
-+ snprintf(addr, sizeof(addr), "%pI4:%u",
-+ &inet_sk(sk)->inet_daddr, dport);
-+ else if (sk->sk_family == AF_INET6)
-+ snprintf(addr, sizeof(addr), "%pI6:%u",
-+ &sk->sk_daddr, dport);
-+
-+ WARN_ONCE(1,
-+ "BBR %s cwnd alert: %u "
-+ "snd_una: %u ca: %d pacing_gain: %u cwnd_gain: %u "
-+ "bw: %u rtt: %u min_rtt: %u "
-+ "acked: %u tso_segs: %u "
-+ "bw: %d %ld %d pif: %u\n",
-+ addr, tp->snd_cwnd,
-+ una, inet_csk(sk)->icsk_ca_state,
-+ bbr->pacing_gain, bbr->cwnd_gain,
-+ bbr_max_bw(sk), (tp->srtt_us >> 3), bbr->min_rtt_us,
-+ acked, bbr_tso_segs_goal(sk),
-+ rs->delivered, rs->interval_us, rs->is_retrans,
-+ tcp_packets_in_flight(tp));
-+ }
-+
-+ if (likely(!bbr_debug_with_printk && !bbr_debug_ftrace))
-+ return;
-+
-+ if (!sock_flag(sk, SOCK_DBG) && !is_port_match)
-+ return;
-+
-+ if (!ctx->log && !tp->app_limited && !(bbr_flags & FLAG_DEBUG_VERBOSE))
-+ return;
-+
-+ if (ipv4_is_loopback(inet_sk(sk)->inet_daddr) &&
-+ !(bbr_flags & FLAG_DEBUG_LOOPBACK))
-+ return;
-+
-+ snprintf(debugmsg, sizeof(debugmsg) - 1,
-+ "BBR %pI4:%-5u %5u,%03u:%-7u %c "
-+ "%c %2u br %2u cr %2d rtt %5ld d %2d i %5ld mrtt %d %cbw %llu "
-+ "bw %llu lb %llu ib %llu qb %llu "
-+ "a %u if %2u %c %c dl %u l %u al %u # %u t %u %c %c "
-+ "lr %d er %d ea %d bwl %lld il %d ih %d c %d "
-+ "v %d %c %u %c %s\n",
-+ &inet_sk(sk)->inet_daddr, dport,
-+ una / 1000, una % 1000, fack - tp->snd_una,
-+ ca_states[inet_csk(sk)->icsk_ca_state],
-+ bbr->debug.undo ? '@' : mode[bbr->mode],
-+ tp->snd_cwnd,
-+ bbr_extra_acked(sk), /* br (legacy): extra_acked */
-+ rs->tx_in_flight, /* cr (legacy): tx_inflight */
-+ rs->rtt_us,
-+ rs->delivered,
-+ rs->interval_us,
-+ bbr->min_rtt_us,
-+ rs->is_app_limited ? '_' : 'l',
-+ bbr_rate_kbps(sk, ctx->sample_bw), /* lbw: latest sample bw */
-+ bbr_rate_kbps(sk, bbr_max_bw(sk)), /* bw: max bw */
-+ 0ULL, /* lb: [obsolete] */
-+ 0ULL, /* ib: [obsolete] */
-+ (u64)sk->sk_pacing_rate * 8 / 1000,
-+ acked,
-+ tcp_packets_in_flight(tp),
-+ rs->is_ack_delayed ? 'd' : '.',
-+ bbr->round_start ? '*' : '.',
-+ tp->delivered, tp->lost,
-+ tp->app_limited,
-+ 0, /* #: [obsolete] */
-+ ctx->target_cwnd,
-+ tp->reord_seen ? 'r' : '.', /* r: reordering seen? */
-+ ca_states[bbr->prev_ca_state],
-+ (rs->lost + rs->delivered) > 0 ?
-+ (1000 * rs->lost /
-+ (rs->lost + rs->delivered)) : 0, /* lr: loss rate x1000 */
-+ (rs->delivered) > 0 ?
-+ (1000 * rs->delivered_ce /
-+ (rs->delivered)) : 0, /* er: ECN rate x1000 */
-+ 1000 * bbr->ecn_alpha >> BBR_SCALE, /* ea: ECN alpha x1000 */
-+ bbr->bw_lo == ~0U ?
-+ -1 : (s64)bbr_rate_kbps(sk, bbr->bw_lo), /* bwl */
-+ bbr->inflight_lo, /* il */
-+ bbr->inflight_hi, /* ih */
-+ bbr->bw_probe_up_cnt, /* c */
-+ 2, /* v: version */
-+ bbr->debug.event,
-+ bbr->cycle_idx,
-+ ack_phase[bbr->ack_phase],
-+ bbr->bw_probe_samples ? "Y" : "N");
-+ debugmsg[sizeof(debugmsg) - 1] = 0;
-+
-+ /* printk takes a higher precedence. */
-+ if (bbr_debug_with_printk)
-+ printk(KERN_DEBUG "%s", debugmsg);
-+
-+ if (unlikely(bbr->debug.undo))
-+ bbr->debug.undo = 0;
-+}
-+
-+/* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */
-+static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
-+{
-+ u64 rate = bw;
-+
-+ rate = bbr_rate_bytes_per_sec(sk, rate, gain,
-+ bbr_pacing_margin_percent);
-+ rate = min_t(u64, rate, sk->sk_max_pacing_rate);
-+ return rate;
-+}
-+
-+/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */
-+static void bbr_init_pacing_rate_from_rtt(struct sock *sk)
-+{
-+ struct tcp_sock *tp = tcp_sk(sk);
-+ struct bbr *bbr = inet_csk_ca(sk);
-+ u64 bw;
-+ u32 rtt_us;
-+
-+ if (tp->srtt_us) { /* any RTT sample yet? */
-+ rtt_us = max(tp->srtt_us >> 3, 1U);
-+ bbr->has_seen_rtt = 1;
-+ } else { /* no RTT sample yet */
-+ rtt_us = USEC_PER_MSEC; /* use nominal default RTT */
-+ }
-+ bw = (u64)tp->snd_cwnd * BW_UNIT;
-+ do_div(bw, rtt_us);
-+ sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr->params.high_gain);
-+}
-+
-+/* Pace using current bw estimate and a gain factor. */
-+static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
-+{
-+ struct tcp_sock *tp = tcp_sk(sk);
-+ struct bbr *bbr = inet_csk_ca(sk);
-+ unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain);
-+
-+ if (unlikely(!bbr->has_seen_rtt && tp->srtt_us))
-+ bbr_init_pacing_rate_from_rtt(sk);
-+ if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate)
-+ sk->sk_pacing_rate = rate;
-+}
-+
-+static u32 bbr_min_tso_segs(struct sock *sk)
-+{
-+ return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2;
-+}
-+
-+/* Return the number of segments BBR would like in a TSO/GSO skb, given
-+ * a particular max gso size as a constraint.
-+ */
-+static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now,
-+ u32 gso_max_size)
-+{
-+ struct bbr *bbr = inet_csk_ca(sk);
-+ u32 segs, r;
-+ u64 bytes;
-+
-+ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */
-+ bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift;
-+
-+ /* Budget a TSO/GSO burst size allowance based on min_rtt. For every
-+ * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst.
-+ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K)
-+ */
-+ if (bbr->params.tso_rtt_shift) {
-+ r = bbr->min_rtt_us >> bbr->params.tso_rtt_shift;
-+ if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */
-+ bytes += GSO_MAX_SIZE >> r;
-+ }
-+
-+ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER);
-+ segs = max_t(u32, bytes / mss_now, bbr_min_tso_segs(sk));
-+ return segs;
-+}
-+
-+/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */
-+static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now)
-+{
-+ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size);
-+}
-+
-+/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */
-+static u32 bbr_tso_segs_goal(struct sock *sk)
-+{
-+ struct tcp_sock *tp = tcp_sk(sk);
-+
-+ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_MAX_SIZE);
-+}
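/*
 * Worked example for the min_rtt-based budget in bbr_tso_segs_generic()
 * (numbers assumed, and assuming params.tso_rtt_shift carries the default
 * of 9 set by bbr_tso_rtt_shift above): with min_rtt_us = 2048,
 * r = 2048 >> 9 = 4, so GSO_MAX_SIZE >> 4 = 65536 / 16 = 4096 extra bytes
 * are added on top of the pacing-rate-based allowance.  A ~50 us LAN RTT
 * gives r = 0 and the full 64 KB, while a min_rtt above ~3 ms (r >= 6)
 * shrinks the extra allowance below a single 1500-byte frame, matching
 * the comment at the definition of bbr_tso_rtt_shift.
 */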
-+
-+/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */
-+static void bbr_save_cwnd(struct sock *sk)
-+{
-+ struct tcp_sock *tp = tcp_sk(sk);
-+ struct bbr *bbr = inet_csk_ca(sk);
-+
-+ if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT)
-+ bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */
-+ else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */
-+ bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd);
-+}
-+
-+static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
-+{
-+ struct tcp_sock *tp = tcp_sk(sk);
-+ struct bbr *bbr = inet_csk_ca(sk);
-+
-+ if (event == CA_EVENT_TX_START && tp->app_limited) {
-+ bbr->idle_restart = 1;
-+ bbr->ack_epoch_mstamp = tp->tcp_mstamp;
-+ bbr->ack_epoch_acked = 0;
-+ /* Avoid pointless buffer overflows: pace at est. bw if we don't
-+ * need more speed (we're restarting from idle and app-limited).
-+ */
-+ if (bbr->mode == BBR_PROBE_BW)
-+ bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT);
-+ else if (bbr->mode == BBR_PROBE_RTT)
-+ bbr_check_probe_rtt_done(sk);
-+ } else if ((event == CA_EVENT_ECN_IS_CE ||
-+ event == CA_EVENT_ECN_NO_CE) &&
-+ bbr_ecn_enable &&
-+ bbr->params.precise_ece_ack) {
-+ u32 state = bbr->ce_state;
-+ dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state);
-+ bbr->ce_state = state;
-+ if (tp->fast_ack_mode == 2 && event == CA_EVENT_ECN_IS_CE)
-+ tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
-+ }
-+}
-+
-+/* Calculate bdp based on min RTT and the estimated bottleneck bandwidth:
-+ *
-+ * bdp = ceil(bw * min_rtt * gain)
-+ *
-+ * The key factor, gain, controls the amount of queue. While a small gain
-+ * builds a smaller queue, it becomes more vulnerable to noise in RTT
-+ * measurements (e.g., delayed ACKs or other ACK compression effects). This
-+ * noise may cause BBR to under-estimate the rate.
-+ */
-+static u32 bbr_bdp(struct sock *sk, u32 bw, int gain)
-+{
-+ struct bbr *bbr = inet_csk_ca(sk);
-+ u32 bdp;
-+ u64 w;
-+
-+ /* If we've never had a valid RTT sample, cap cwnd at the initial
-+ * default. This should only happen when the connection is not using TCP
-+ * timestamps and has retransmitted all of the SYN/SYNACK/data packets
-+ * ACKed so far. In this case, an RTO can cut cwnd to 1, in which
-+ * case we need to slow-start up toward something safe: initial cwnd.
-+ */
-+ if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */
-+ return bbr->init_cwnd; /* be safe: cap at initial cwnd */
-+
-+ w = (u64)bw * bbr->min_rtt_us;
-+
-+ /* Apply a gain to the given value, remove the BW_SCALE shift, and
-+ * round the value up to avoid a negative feedback loop.
-+ */
-+ bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;
-+
-+ return bdp;
-+}
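-+
-+/* For illustration with assumed numbers: a path delivering 10,000 pkts/sec
-+ * with a 20 ms min_rtt has a BDP of 200 packets; applying a gain of 2x
-+ * (2 * BBR_UNIT) yields a target of 400 packets, rounded up by the
-+ * fixed-point arithmetic above.
-+ */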
-+
-+/* To achieve full performance in high-speed paths, we budget enough cwnd to
-+ * fit full-sized skbs in-flight on both end hosts to fully utilize the path:
-+ * - one skb in sending host Qdisc,
-+ * - one skb in sending host TSO/GSO engine
-+ * - one skb being received by receiver host LRO/GRO/delayed-ACK engine
-+ * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because
-+ * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets,
-+ * which allows 2 outstanding 2-packet sequences, to try to keep pipe
-+ * full even with ACK-every-other-packet delayed ACKs.
-+ */
-+static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd)
-+{
-+ struct bbr *bbr = inet_csk_ca(sk);
-+ u32 tso_segs_goal;
-+
-+ tso_segs_goal = 3 * bbr_tso_segs_goal(sk);
-+
-+ /* Allow enough full-sized skbs in flight to utilize end systems. */
-+ if (bbr->params.cwnd_tso_budget == 1) {
-+ cwnd = max_t(u32, cwnd, tso_segs_goal);
-+ cwnd = max_t(u32, cwnd, bbr->params.cwnd_min_target);
-+ } else {
-+ cwnd += tso_segs_goal;
-+ cwnd = (cwnd + 1) & ~1U;
-+ }
-+ /* Ensure gain cycling gets inflight above BDP even for small BDPs. */
-+ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP)
-+ cwnd += 2;
-+
-+ return cwnd;
-+}
-+
-+/* Find inflight based on min RTT and the estimated bottleneck bandwidth. */
-+static u32 bbr_inflight(struct sock *sk, u32 bw, int gain)
-+{
-+ u32 inflight;
-+
-+ inflight = bbr_bdp(sk, bw, gain);
-+ inflight = bbr_quantization_budget(sk, inflight);
-+
-+ return inflight;
-+}
-+
-+/* With pacing at lower layers, there's often less data "in the network" than
-+ * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq),
-+ * we often have several skbs queued in the pacing layer with a pre-scheduled
-+ * earliest departure time (EDT). BBR adapts its pacing rate based on the
-+ * inflight level that it estimates has already been "baked in" by previous
-+ * departure time decisions. We calculate a rough estimate of the number of our
-+ * packets that might be in the network at the earliest departure time for the
-+ * next skb scheduled:
-+ * in_network_at_edt = inflight_at_edt - (EDT - now) * bw
-+ * If we're increasing inflight, then we want to know if the transmit of the
-+ * EDT skb will push inflight above the target, so inflight_at_edt includes
-+ * bbr_tso_segs_goal() from the skb departing at EDT. If decreasing inflight,
-+ * then estimate if inflight will sink too low just before the EDT transmit.
-+ */
-+static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now)
-+{
-+ struct tcp_sock *tp = tcp_sk(sk);
-+ struct bbr *bbr = inet_csk_ca(sk);
-+ u64 now_ns, edt_ns, interval_us;
-+ u32 interval_delivered, inflight_at_edt;
-+
-+ now_ns = tp->tcp_clock_cache;
-+ edt_ns = max(tp->tcp_wstamp_ns, now_ns);
-+ interval_us = div_u64(edt_ns - now_ns, NSEC_PER_USEC);
-+ interval_delivered = (u64)bbr_bw(sk) * interval_us >> BW_SCALE;
-+ inflight_at_edt = inflight_now;
-+ if (bbr->pacing_gain > BBR_UNIT) /* increasing inflight */
-+ inflight_at_edt += bbr_tso_segs_goal(sk); /* include EDT skb */
-+ if (interval_delivered >= inflight_at_edt)
-+ return 0;
-+ return inflight_at_edt - interval_delivered;
-+}
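-+
-+/* For illustration with assumed numbers: with 100 packets in flight, an EDT
-+ * 10 ms in the future, and an estimated bw of 2,000 pkts/sec, roughly 20
-+ * packets will drain from the network before the next transmit, so about 80
-+ * packets (plus the EDT skb's segments when probing upward) are expected to
-+ * still be in the network at that point.
-+ */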
-+
-+/* Find the cwnd increment based on estimate of ack aggregation */
-+static u32 bbr_ack_aggregation_cwnd(struct sock *sk)
-+{
-+ struct bbr *bbr = inet_csk_ca(sk);
-+ u32 max_aggr_cwnd, aggr_cwnd = 0;
-+
-+ if (bbr->params.extra_acked_gain &&
-+ (bbr_full_bw_reached(sk) || bbr->params.extra_acked_in_startup)) {
-+ max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us)
-+ / BW_UNIT;
-+ aggr_cwnd = (bbr->params.extra_acked_gain * bbr_extra_acked(sk))
-+ >> BBR_SCALE;
-+ aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd);
-+ }
-+
-+ return aggr_cwnd;
-+}
-+
-+/* Returns the cwnd for PROBE_RTT mode. */
-+static u32 bbr_probe_rtt_cwnd(struct sock *sk)
-+{
-+ struct bbr *bbr = inet_csk_ca(sk);
-+
-+ if (bbr->params.probe_rtt_cwnd_gain == 0)
-+ return bbr->params.cwnd_min_target;
-+ return max_t(u32, bbr->params.cwnd_min_target,
-+ bbr_bdp(sk, bbr_bw(sk), bbr->params.probe_rtt_cwnd_gain));
-+}
-+
-+/* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss
-+ * has drawn us down below target), or snap down to target if we're above it.
-+ */
-+static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
-+ u32 acked, u32 bw, int gain, u32 cwnd,
-+ struct bbr_context *ctx)
-+{
-+ struct tcp_sock *tp = tcp_sk(sk);
-+ struct bbr *bbr = inet_csk_ca(sk);
-+ u32 target_cwnd = 0, prev_cwnd = tp->snd_cwnd, max_probe;
-+
-+ if (!acked)
-+ goto done; /* no packet fully ACKed; just apply caps */
-+
-+ target_cwnd = bbr_bdp(sk, bw, gain);
-+
-+ /* Increment the cwnd to account for excess ACKed data that seems
-+ * due to aggregation (of data and/or ACKs) visible in the ACK stream.
-+ */
-+ target_cwnd += bbr_ack_aggregation_cwnd(sk);
-+ target_cwnd = bbr_quantization_budget(sk, target_cwnd);
-+
-+ /* If we're below target cwnd, slow start cwnd toward target cwnd. */
-+ bbr->debug.target_cwnd = target_cwnd;
-+
-+ /* Update cwnd and enable fast path if cwnd reaches target_cwnd. */
-+ bbr->try_fast_path = 0;
-+ if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */
-+ cwnd += acked;
-+ if (cwnd >= target_cwnd) {
-+ cwnd = target_cwnd;
-+ bbr->try_fast_path = 1;
-+ }
-+ } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) {
-+ cwnd += acked;
-+ } else {
-+ bbr->try_fast_path = 1;
-+ }
-+
-+ /* When growing cwnd, don't grow beyond twice what we just probed. */
-+ if (bbr->params.usage_based_cwnd) {
-+ max_probe = max(2 * tp->max_packets_out, tp->snd_cwnd);
-+ cwnd = min(cwnd, max_probe);
-+ }
-+
-+ cwnd = max_t(u32, cwnd, bbr->params.cwnd_min_target);
-+done:
-+ tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */
-+ if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */
-+ tp->snd_cwnd = min_t(u32, tp->snd_cwnd, bbr_probe_rtt_cwnd(sk));
-+
-+ ctx->target_cwnd = target_cwnd;
-+ ctx->log = (tp->snd_cwnd != prev_cwnd);
-+}
-+
-+/* See if we have reached next round trip */
-+static void bbr_update_round_start(struct sock *sk,
-+ const struct rate_sample *rs, struct bbr_context *ctx)
-+{
-+ struct tcp_sock *tp = tcp_sk(sk);
-+ struct bbr *bbr = inet_csk_ca(sk);
-+
-+ bbr->round_start = 0;
-+
-+ /* See if we've reached the next RTT */
-+ if (rs->interval_us > 0 &&
-+ !before(rs->prior_delivered, bbr->next_rtt_delivered)) {
-+ bbr->next_rtt_delivered = tp->delivered;
-+ bbr->round_start = 1;
-+ }
-+}
-+
-+/* Calculate the bandwidth based on how fast packets are delivered */
-+static void bbr_calculate_bw_sample(struct sock *sk,
-+ const struct rate_sample *rs, struct bbr_context *ctx)
-+{
-+ struct bbr *bbr = inet_csk_ca(sk);
-+ u64 bw = 0;
-+
-+ /* Divide delivered by the interval to find a (lower bound) bottleneck
-+ * bandwidth sample. Delivered is in packets and interval_us in uS and
-+ * ratio will be <<1 for most connections. So delivered is first scaled.
-+ * Round up to allow growth at low rates, even with integer division.
-+ */
-+ if (rs->interval_us > 0) {
-+ if (WARN_ONCE(rs->delivered < 0,
-+ "negative delivered: %d interval_us: %ld\n",
-+ rs->delivered, rs->interval_us))
-+ return;
-+
-+ bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us);
-+ }
-+
-+ ctx->sample_bw = bw;
-+ bbr->debug.rs_bw = bw;
-+}
-+
-+/* Estimates the windowed max degree of ack aggregation.
-+ * This is used to provision extra in-flight data to keep sending during
-+ * inter-ACK silences.
-+ *
-+ * Degree of ack aggregation is estimated as extra data acked beyond expected.
-+ *
-+ * max_extra_acked = "maximum recent excess data ACKed beyond max_bw * interval"
-+ * cwnd += max_extra_acked
-+ *
-+ * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms).
-+ * Max filter is an approximate sliding window of 5-10 (packet timed) round
-+ * trips for non-startup phase, and 1-2 round trips for startup.
-+ */
-+static void bbr_update_ack_aggregation(struct sock *sk,
-+ const struct rate_sample *rs)
-+{
-+ u32 epoch_us, expected_acked, extra_acked;
-+ struct bbr *bbr = inet_csk_ca(sk);
-+ struct tcp_sock *tp = tcp_sk(sk);
-+ u32 extra_acked_win_rtts_thresh = bbr->params.extra_acked_win_rtts;
-+
-+ if (!bbr->params.extra_acked_gain || rs->acked_sacked <= 0 ||
-+ rs->delivered < 0 || rs->interval_us <= 0)
-+ return;
-+
-+ if (bbr->round_start) {
-+ bbr->extra_acked_win_rtts = min(0x1F,
-+ bbr->extra_acked_win_rtts + 1);
-+ if (bbr->params.extra_acked_in_startup &&
-+ !bbr_full_bw_reached(sk))
-+ extra_acked_win_rtts_thresh = 1;
-+ if (bbr->extra_acked_win_rtts >=
-+ extra_acked_win_rtts_thresh) {
-+ bbr->extra_acked_win_rtts = 0;
-+ bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ?
-+ 0 : 1;
-+ bbr->extra_acked[bbr->extra_acked_win_idx] = 0;
-+ }
-+ }
-+
-+ /* Compute how many packets we expected to be delivered over epoch. */
-+ epoch_us = tcp_stamp_us_delta(tp->delivered_mstamp,
-+ bbr->ack_epoch_mstamp);
-+ expected_acked = ((u64)bbr_bw(sk) * epoch_us) / BW_UNIT;
-+
-+ /* Reset the aggregation epoch if the ACK rate is below the expected
-+ * rate, or if a large number of ACKs have been received since the
-+ * epoch started (i.e. the epoch is potentially quite old).
-+ */
-+ if (bbr->ack_epoch_acked <= expected_acked ||
-+ (bbr->ack_epoch_acked + rs->acked_sacked >=
-+ bbr_ack_epoch_acked_reset_thresh)) {
-+ bbr->ack_epoch_acked = 0;
-+ bbr->ack_epoch_mstamp = tp->delivered_mstamp;
-+ expected_acked = 0;
-+ }
-+
-+ /* Compute excess data delivered, beyond what was expected. */
-+ bbr->ack_epoch_acked = min_t(u32, 0xFFFFF,
-+ bbr->ack_epoch_acked + rs->acked_sacked);
-+ extra_acked = bbr->ack_epoch_acked - expected_acked;
-+ extra_acked = min(extra_acked, tp->snd_cwnd);
-+ if (extra_acked > bbr->extra_acked[bbr->extra_acked_win_idx])
-+ bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked;
-+}
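-+
-+/* For illustration with assumed numbers: if the bw estimate is 1,000
-+ * pkts/sec and 30 ms have elapsed in the current epoch, expected_acked is
-+ * 30 packets; if 45 packets were actually (S)ACKed in that time,
-+ * extra_acked is 15, so up to 15 extra packets of cwnd headroom may be
-+ * provisioned (still clamped by cwnd and by bw * bbr_extra_acked_max_us).
-+ */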
-+
-+/* Estimate when the pipe is full, using the change in delivery rate: BBR
-+ * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by
-+ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited
-+ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the
-+ * higher rwin, 3: we get higher delivery rate samples. Or transient
-+ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar
-+ * design goal, but uses delay and inter-ACK spacing instead of bandwidth.
-+ */
-+static void bbr_check_full_bw_reached(struct sock *sk,
-+ const struct rate_sample *rs)
-+{
-+ struct bbr *bbr = inet_csk_ca(sk);
-+ u32 bw_thresh;
-+
-+ if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited)
-+ return;
-+
-+ bw_thresh = (u64)bbr->full_bw * bbr->params.full_bw_thresh >> BBR_SCALE;
-+ if (bbr_max_bw(sk) >= bw_thresh) {
-+ bbr->full_bw = bbr_max_bw(sk);
-+ bbr->full_bw_cnt = 0;
-+ return;
-+ }
-+ ++bbr->full_bw_cnt;
-+ bbr->full_bw_reached = bbr->full_bw_cnt >= bbr->params.full_bw_cnt;
-+}
-+
-+/* If pipe is probably full, drain the queue and then enter steady-state. */
-+static bool bbr_check_drain(struct sock *sk, const struct rate_sample *rs,
-+ struct bbr_context *ctx)
-+{
-+ struct bbr *bbr = inet_csk_ca(sk);
-+
-+ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
-+ bbr->mode = BBR_DRAIN; /* drain queue we created */
-+ tcp_sk(sk)->snd_ssthresh =
-+ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT);
-+ bbr2_reset_congestion_signals(sk);
-+ } /* fall through to check if in-flight is already small: */
-+ if (bbr->mode == BBR_DRAIN &&
-+ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <=
-+ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT))
-+ return true; /* exiting DRAIN now */
-+ return false;
-+}
-+
-+static void bbr_check_probe_rtt_done(struct sock *sk)
-+{
-+ struct tcp_sock *tp = tcp_sk(sk);
-+ struct bbr *bbr = inet_csk_ca(sk);
-+
-+ if (!(bbr->probe_rtt_done_stamp &&
-+ after(tcp_jiffies32, bbr->probe_rtt_done_stamp)))
-+ return;
-+
-+ bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */
-+ tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd);
-+ bbr2_exit_probe_rtt(sk);
-+}
-+
-+/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and
-+ * periodically drain the bottleneck queue, to converge to measure the true
-+ * min_rtt (unloaded propagation delay). This allows the flows to keep queues
-+ * small (reducing queuing delay and packet loss) and achieve fairness among
-+ * BBR flows.
-+ *
-+ * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires,
-+ * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets.
-+ * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed
-+ * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and
-+ * re-enter the previous mode. BBR uses 200ms to approximately bound the
-+ * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s).
-+ *
-+ * Note that flows need only pay 2% if they are busy sending over the last 10
-+ * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have
-+ * natural silences or low-rate periods within 10 seconds where the rate is low
-+ * enough for long enough to drain its queue in the bottleneck. We pick up
-+ * these min RTT measurements opportunistically with our min_rtt filter. :-)
-+ */
-+static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
-+{
-+ struct tcp_sock *tp = tcp_sk(sk);
-+ struct bbr *bbr = inet_csk_ca(sk);
-+ bool probe_rtt_expired, min_rtt_expired;
-+ u32 expire;
-+
-+ /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */
-+ expire = bbr->probe_rtt_min_stamp +
-+ msecs_to_jiffies(bbr->params.probe_rtt_win_ms);
-+ probe_rtt_expired = after(tcp_jiffies32, expire);
-+ if (rs->rtt_us >= 0 &&
-+ (rs->rtt_us <= bbr->probe_rtt_min_us ||
-+ (probe_rtt_expired && !rs->is_ack_delayed))) {
-+ bbr->probe_rtt_min_us = rs->rtt_us;
-+ bbr->probe_rtt_min_stamp = tcp_jiffies32;
-+ }
-+ /* Track min RTT seen in the min_rtt_win_sec filter window: */
-+ expire = bbr->min_rtt_stamp + bbr->params.min_rtt_win_sec * HZ;
-+ min_rtt_expired = after(tcp_jiffies32, expire);
-+ if (bbr->probe_rtt_min_us <= bbr->min_rtt_us ||
-+ min_rtt_expired) {
-+ bbr->min_rtt_us = bbr->probe_rtt_min_us;
-+ bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp;
-+ }
-+
-+ if (bbr->params.probe_rtt_mode_ms > 0 && probe_rtt_expired &&
-+ !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {
-+ bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */
-+ bbr_save_cwnd(sk); /* note cwnd so we can restore it */
-+ bbr->probe_rtt_done_stamp = 0;
-+ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING;
-+ bbr->next_rtt_delivered = tp->delivered;
-+ }
-+
-+ if (bbr->mode == BBR_PROBE_RTT) {
-+ /* Ignore low rate samples during this mode. */
-+ tp->app_limited =
-+ (tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
-+ /* Maintain min packets in flight for max(200 ms, 1 round). */
-+ if (!bbr->probe_rtt_done_stamp &&
-+ tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) {
-+ bbr->probe_rtt_done_stamp = tcp_jiffies32 +
-+ msecs_to_jiffies(bbr->params.probe_rtt_mode_ms);
-+ bbr->probe_rtt_round_done = 0;
-+ bbr->next_rtt_delivered = tp->delivered;
-+ } else if (bbr->probe_rtt_done_stamp) {
-+ if (bbr->round_start)
-+ bbr->probe_rtt_round_done = 1;
-+ if (bbr->probe_rtt_round_done)
-+ bbr_check_probe_rtt_done(sk);
-+ }
-+ }
-+ /* Restart after idle ends only once we process a new S/ACK for data */
-+ if (rs->delivered > 0)
-+ bbr->idle_restart = 0;
-+}
-+
-+static void bbr_update_gains(struct sock *sk)
-+{
-+ struct bbr *bbr = inet_csk_ca(sk);
-+
-+ switch (bbr->mode) {
-+ case BBR_STARTUP:
-+ bbr->pacing_gain = bbr->params.high_gain;
-+ bbr->cwnd_gain = bbr->params.startup_cwnd_gain;
-+ break;
-+ case BBR_DRAIN:
-+ bbr->pacing_gain = bbr->params.drain_gain; /* slow, to drain */
-+ bbr->cwnd_gain = bbr->params.startup_cwnd_gain; /* keep cwnd */
-+ break;
-+ case BBR_PROBE_BW:
-+ bbr->pacing_gain = bbr->params.pacing_gain[bbr->cycle_idx];
-+ bbr->cwnd_gain = bbr->params.cwnd_gain;
-+ break;
-+ case BBR_PROBE_RTT:
-+ bbr->pacing_gain = BBR_UNIT;
-+ bbr->cwnd_gain = BBR_UNIT;
-+ break;
-+ default:
-+ WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode);
-+ break;
-+ }
-+}
-+
-+static void bbr_init(struct sock *sk)
-+{
-+ struct tcp_sock *tp = tcp_sk(sk);
-+ struct bbr *bbr = inet_csk_ca(sk);
-+ int i;
-+
-+ WARN_ON_ONCE(tp->snd_cwnd >= bbr_cwnd_warn_val);
-+
-+ bbr->initialized = 1;
-+ bbr->params.high_gain = min(0x7FF, bbr_high_gain);
-+ bbr->params.drain_gain = min(0x3FF, bbr_drain_gain);
-+ bbr->params.startup_cwnd_gain = min(0x7FF, bbr_startup_cwnd_gain);
-+ bbr->params.cwnd_gain = min(0x7FF, bbr_cwnd_gain);
-+ bbr->params.cwnd_tso_budget = min(0x1U, bbr_cwnd_tso_budget);
-+ bbr->params.cwnd_min_target = min(0xFU, bbr_cwnd_min_target);
-+ bbr->params.min_rtt_win_sec = min(0x1FU, bbr_min_rtt_win_sec);
-+ bbr->params.probe_rtt_mode_ms = min(0x1FFU, bbr_probe_rtt_mode_ms);
-+ bbr->params.full_bw_cnt = min(0x7U, bbr_full_bw_cnt);
-+ bbr->params.full_bw_thresh = min(0x3FFU, bbr_full_bw_thresh);
-+ bbr->params.extra_acked_gain = min(0x7FF, bbr_extra_acked_gain);
-+ bbr->params.extra_acked_win_rtts = min(0x1FU, bbr_extra_acked_win_rtts);
-+ bbr->params.drain_to_target = bbr_drain_to_target ? 1 : 0;
-+ bbr->params.precise_ece_ack = bbr_precise_ece_ack ? 1 : 0;
-+ bbr->params.extra_acked_in_startup = bbr_extra_acked_in_startup ? 1 : 0;
-+ bbr->params.probe_rtt_cwnd_gain = min(0xFFU, bbr_probe_rtt_cwnd_gain);
-+ bbr->params.probe_rtt_win_ms =
-+ min(0x3FFFU,
-+ min_t(u32, bbr_probe_rtt_win_ms,
-+ bbr->params.min_rtt_win_sec * MSEC_PER_SEC));
-+ for (i = 0; i < CYCLE_LEN; i++)
-+ bbr->params.pacing_gain[i] = min(0x3FF, bbr_pacing_gain[i]);
-+ bbr->params.usage_based_cwnd = bbr_usage_based_cwnd ? 1 : 0;
-+ bbr->params.tso_rtt_shift = min(0xFU, bbr_tso_rtt_shift);
-+
-+ bbr->debug.snd_isn = tp->snd_una;
-+ bbr->debug.target_cwnd = 0;
-+ bbr->debug.undo = 0;
-+
-+ bbr->init_cwnd = min(0x7FU, tp->snd_cwnd);
-+ bbr->prior_cwnd = tp->prior_cwnd;
-+ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
-+ bbr->next_rtt_delivered = 0;
-+ bbr->prev_ca_state = TCP_CA_Open;
-+ bbr->packet_conservation = 0;
-+
-+ bbr->probe_rtt_done_stamp = 0;
-+ bbr->probe_rtt_round_done = 0;
-+ bbr->probe_rtt_min_us = tcp_min_rtt(tp);
-+ bbr->probe_rtt_min_stamp = tcp_jiffies32;
-+ bbr->min_rtt_us = tcp_min_rtt(tp);
-+ bbr->min_rtt_stamp = tcp_jiffies32;
-+
-+ bbr->has_seen_rtt = 0;
-+ bbr_init_pacing_rate_from_rtt(sk);
-+
-+ bbr->round_start = 0;
-+ bbr->idle_restart = 0;
-+ bbr->full_bw_reached = 0;
-+ bbr->full_bw = 0;
-+ bbr->full_bw_cnt = 0;
-+ bbr->cycle_mstamp = 0;
-+ bbr->cycle_idx = 0;
-+ bbr->mode = BBR_STARTUP;
-+ bbr->debug.rs_bw = 0;
-+
-+ bbr->ack_epoch_mstamp = tp->tcp_mstamp;
-+ bbr->ack_epoch_acked = 0;
-+ bbr->extra_acked_win_rtts = 0;
-+ bbr->extra_acked_win_idx = 0;
-+ bbr->extra_acked[0] = 0;
-+ bbr->extra_acked[1] = 0;
-+
-+ bbr->ce_state = 0;
-+ bbr->prior_rcv_nxt = tp->rcv_nxt;
-+ bbr->try_fast_path = 0;
-+
-+ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED);
-+}
-+
-+static u32 bbr_sndbuf_expand(struct sock *sk)
-+{
-+ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */
-+ return 3;
-+}
-+
-+/* __________________________________________________________________________
-+ *
-+ * Functions new to BBR v2 ("bbr") congestion control are below here.
-+ * __________________________________________________________________________
-+ */
-+
-+/* Incorporate a new bw sample into the current window of our max filter. */
-+static void bbr2_take_bw_hi_sample(struct sock *sk, u32 bw)
-+{
-+ struct bbr *bbr = inet_csk_ca(sk);
-+
-+ bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]);
-+}
-+
-+/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */
-+static void bbr2_advance_bw_hi_filter(struct sock *sk)
-+{
-+ struct bbr *bbr = inet_csk_ca(sk);
-+
-+ if (!bbr->bw_hi[1])
-+ return; /* no samples in this window; remember old window */
-+ bbr->bw_hi[0] = bbr->bw_hi[1];
-+ bbr->bw_hi[1] = 0;
-+}
-+
-+/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */
-+static u32 bbr2_target_inflight(struct sock *sk)
-+{
-+ u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT);
-+
-+ return min(bdp, tcp_sk(sk)->snd_cwnd);
-+}
-+
-+static bool bbr2_is_probing_bandwidth(struct sock *sk)
-+{
-+ struct bbr *bbr = inet_csk_ca(sk);
-+
-+ return (bbr->mode == BBR_STARTUP) ||
-+ (bbr->mode == BBR_PROBE_BW &&
-+ (bbr->cycle_idx == BBR_BW_PROBE_REFILL ||
-+ bbr->cycle_idx == BBR_BW_PROBE_UP));
-+}
-+
-+/* Has the given amount of time elapsed since we marked the phase start? */
-+static bool bbr2_has_elapsed_in_phase(const struct sock *sk, u32 interval_us)
-+{
-+ const struct tcp_sock *tp = tcp_sk(sk);
-+ const struct bbr *bbr = inet_csk_ca(sk);
-+
-+ return tcp_stamp_us_delta(tp->tcp_mstamp,
-+ bbr->cycle_mstamp + interval_us) > 0;
-+}
-+
-+static void bbr2_handle_queue_too_high_in_startup(struct sock *sk)
-+{
-+ struct bbr *bbr = inet_csk_ca(sk);
-+
-+ bbr->full_bw_reached = 1;
-+ bbr->inflight_hi = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT);
-+}
-+
-+/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. */
-+static void bbr2_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio)
-+{
-+ struct bbr *bbr = inet_csk_ca(sk);
-+
-+ if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible ||
-+ !bbr->params.full_ecn_cnt || !bbr->params.ecn_thresh)
-+ return;
-+
-+ if (ce_ratio >= bbr->params.ecn_thresh)
-+ bbr->startup_ecn_rounds++;
-+ else
-+ bbr->startup_ecn_rounds = 0;
-+
-+ if (bbr->startup_ecn_rounds >= bbr->params.full_ecn_cnt) {
-+ bbr->debug.event = 'E'; /* ECN caused STARTUP exit */
-+ bbr2_handle_queue_too_high_in_startup(sk);
-+ return;
-+ }
-+}
-+
-+static void bbr2_update_ecn_alpha(struct sock *sk)
-+{
-+ struct tcp_sock *tp = tcp_sk(sk);
-+ struct bbr *bbr = inet_csk_ca(sk);
-+ s32 delivered, delivered_ce;
-+ u64 alpha, ce_ratio;
-+ u32 gain;
-+
-+ if (bbr->params.ecn_factor == 0)
-+ return;
-+
-+ delivered = tp->delivered - bbr->alpha_last_delivered;
-+ delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce;
-+
-+ if (delivered == 0 || /* avoid divide by zero */
-+ WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */
-+ return;
-+
-+ /* See if we should use ECN sender logic for this connection. */
-+ if (!bbr->ecn_eligible && bbr_ecn_enable &&
-+ (bbr->min_rtt_us <= bbr->params.ecn_max_rtt_us ||
-+ !bbr->params.ecn_max_rtt_us))
-+ bbr->ecn_eligible = 1;
-+
-+ ce_ratio = (u64)delivered_ce << BBR_SCALE;
-+ do_div(ce_ratio, delivered);
-+ gain = bbr->params.ecn_alpha_gain;
-+ alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE;
-+ alpha += (gain * ce_ratio) >> BBR_SCALE;
-+ bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT);
-+
-+ bbr->alpha_last_delivered = tp->delivered;
-+ bbr->alpha_last_delivered_ce = tp->delivered_ce;
-+
-+ bbr2_check_ecn_too_high_in_startup(sk, ce_ratio);
-+}
-+
-+/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */
-+static void bbr2_raise_inflight_hi_slope(struct sock *sk)
-+{
-+ struct tcp_sock *tp = tcp_sk(sk);
-+ struct bbr *bbr = inet_csk_ca(sk);
-+ u32 growth_this_round, cnt;
-+
-+ /* Calculate "slope": packets S/Acked per inflight_hi increment. */
-+ growth_this_round = 1 << bbr->bw_probe_up_rounds;
-+ bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30);
-+ cnt = tp->snd_cwnd / growth_this_round;
-+ cnt = max(cnt, 1U);
-+ bbr->bw_probe_up_cnt = cnt;
-+ bbr->debug.event = 'G'; /* Grow inflight_hi slope */
-+}
-+
-+/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */
-+static void bbr2_probe_inflight_hi_upward(struct sock *sk,
-+ const struct rate_sample *rs)
-+{
-+ struct tcp_sock *tp = tcp_sk(sk);
-+ struct bbr *bbr = inet_csk_ca(sk);
-+ u32 delta;
-+
-+ if (!tp->is_cwnd_limited || tp->snd_cwnd < bbr->inflight_hi) {
-+ bbr->bw_probe_up_acks = 0; /* don't accumulate unused credits */
-+ return; /* not fully using inflight_hi, so don't grow it */
-+ }
-+
-+ /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */
-+ bbr->bw_probe_up_acks += rs->acked_sacked;
-+ if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) {
-+ delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt;
-+ bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt;
-+ bbr->inflight_hi += delta;
-+ bbr->debug.event = 'I'; /* Increment inflight_hi */
-+ }
-+
-+ if (bbr->round_start)
-+ bbr2_raise_inflight_hi_slope(sk);
-+}
-+
-+/* Does loss/ECN rate for this sample say inflight is "too high"?
-+ * This is used by both the bbr_check_loss_too_high_in_startup() function,
-+ * which can be used in either v1 or v2, and the PROBE_UP phase of v2, which
-+ * uses it to notice when loss/ECN rates suggest inflight is too high.
-+ */
-+static bool bbr2_is_inflight_too_high(const struct sock *sk,
-+ const struct rate_sample *rs)
-+{
-+ const struct bbr *bbr = inet_csk_ca(sk);
-+ u32 loss_thresh, ecn_thresh;
-+
-+ if (rs->lost > 0 && rs->tx_in_flight) {
-+ loss_thresh = (u64)rs->tx_in_flight * bbr->params.loss_thresh >>
-+ BBR_SCALE;
-+ if (rs->lost > loss_thresh)
-+ return true;
-+ }
-+
-+ if (rs->delivered_ce > 0 && rs->delivered > 0 &&
-+ bbr->ecn_eligible && bbr->params.ecn_thresh) {
-+ ecn_thresh = (u64)rs->delivered * bbr->params.ecn_thresh >>
-+ BBR_SCALE;
-+ if (rs->delivered_ce >= ecn_thresh)
-+ return true;
-+ }
-+
-+ return false;
-+}
-+
-+/* Calculate the tx_in_flight level that corresponded to excessive loss.
-+ * We find "lost_prefix" segs of the skb where loss rate went too high,
-+ * by solving for "lost_prefix" in the following equation:
-+ * lost / inflight >= loss_thresh
-+ * (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh
-+ * Then we take that equation, convert it to fixed point, and
-+ * round up to the nearest packet.
-+ */
-+static u32 bbr2_inflight_hi_from_lost_skb(const struct sock *sk,
-+ const struct rate_sample *rs,
-+ const struct sk_buff *skb)
-+{
-+ const struct bbr *bbr = inet_csk_ca(sk);
-+ u32 loss_thresh = bbr->params.loss_thresh;
-+ u32 pcount, divisor, inflight_hi;
-+ s32 inflight_prev, lost_prev;
-+ u64 loss_budget, lost_prefix;
-+
-+ pcount = tcp_skb_pcount(skb);
-+
-+ /* How much data was in flight before this skb? */
-+ inflight_prev = rs->tx_in_flight - pcount;
-+ if (WARN_ONCE(inflight_prev < 0,
-+ "tx_in_flight: %u pcount: %u reneg: %u",
-+ rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg))
-+ return ~0U;
-+
-+ /* How much inflight data was marked lost before this skb? */
-+ lost_prev = rs->lost - pcount;
-+ if (WARN_ON_ONCE(lost_prev < 0))
-+ return ~0U;
-+
-+ /* At what prefix of this lost skb did loss rate exceed loss_thresh? */
-+ loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1;
-+ loss_budget >>= BBR_SCALE;
-+ if (lost_prev >= loss_budget) {
-+ lost_prefix = 0; /* previous losses crossed loss_thresh */
-+ } else {
-+ lost_prefix = loss_budget - lost_prev;
-+ lost_prefix <<= BBR_SCALE;
-+ divisor = BBR_UNIT - loss_thresh;
-+ if (WARN_ON_ONCE(!divisor)) /* loss_thresh is 8 bits */
-+ return ~0U;
-+ do_div(lost_prefix, divisor);
-+ }
-+
-+ inflight_hi = inflight_prev + lost_prefix;
-+ return inflight_hi;
-+}
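-+
-+/* For illustration with assumed numbers: with loss_thresh = 2%, an skb of
-+ * 10 segments, tx_in_flight = 310 and rs->lost = 12, we get
-+ * inflight_prev = 300, lost_prev = 2 and a loss budget of
-+ * ceil(0.02 * 300) = 6 packets; solving the equation above gives
-+ * lost_prefix = (6 - 2) / (1 - 0.02) ~= 4, so the new inflight_hi is
-+ * about 304 packets.
-+ */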
-+
-+/* If loss/ECN rates during probing indicated we may have overfilled a
-+ * buffer, return an operating point that tries to leave unutilized headroom in
-+ * the path for other flows, for fairness convergence and lower RTTs and loss.
-+ */
-+static u32 bbr2_inflight_with_headroom(const struct sock *sk)
-+{
-+ struct bbr *bbr = inet_csk_ca(sk);
-+ u32 headroom, headroom_fraction;
-+
-+ if (bbr->inflight_hi == ~0U)
-+ return ~0U;
-+
-+ headroom_fraction = bbr->params.inflight_headroom;
-+ headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE;
-+ headroom = max(headroom, 1U);
-+ return max_t(s32, bbr->inflight_hi - headroom,
-+ bbr->params.cwnd_min_target);
-+}
-+
-+/* Bound cwnd to a sensible level, based on our current probing state
-+ * machine phase and model of a good inflight level (inflight_lo, inflight_hi).
-+ */
-+static void bbr2_bound_cwnd_for_inflight_model(struct sock *sk)
-+{
-+ struct tcp_sock *tp = tcp_sk(sk);
-+ struct bbr *bbr = inet_csk_ca(sk);
-+ u32 cap;
-+
-+ /* tcp_rcv_synsent_state_process() currently calls tcp_ack()
-+ * and thus cong_control() without first initializing us(!).
-+ */
-+ if (!bbr->initialized)
-+ return;
-+
-+ cap = ~0U;
-+ if (bbr->mode == BBR_PROBE_BW &&
-+ bbr->cycle_idx != BBR_BW_PROBE_CRUISE) {
-+ /* Probe to see if more packets fit in the path. */
-+ cap = bbr->inflight_hi;
-+ } else {
-+ if (bbr->mode == BBR_PROBE_RTT ||
-+ (bbr->mode == BBR_PROBE_BW &&
-+ bbr->cycle_idx == BBR_BW_PROBE_CRUISE))
-+ cap = bbr2_inflight_with_headroom(sk);
-+ }
-+ /* Adapt to any loss/ECN since our last bw probe. */
-+ cap = min(cap, bbr->inflight_lo);
-+
-+ cap = max_t(u32, cap, bbr->params.cwnd_min_target);
-+ tp->snd_cwnd = min(cap, tp->snd_cwnd);
-+}
-+
-+/* Estimate a short-term lower bound on the capacity available now, based
-+ * on measurements of the current delivery process and recent history. When we
-+ * are seeing loss/ECN at times when we are not probing bw, then conservatively
-+ * move toward flow balance by multiplicatively cutting our short-term
-+ * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a
-+ * multiplicative decrease in order to converge to a lower capacity in time
-+ * logarithmic in the magnitude of the decrease.
-+ *
-+ * However, we do not cut our short-term estimates lower than the current rate
-+ * and volume of delivered data from this round trip, since from the current
-+ * delivery process we can estimate the measured capacity available now.
-+ *
-+ * Anything faster than that approach would knowingly risk high loss, which can
-+ * cause low bw for Reno/CUBIC and high loss recovery latency for
-+ * request/response flows using any congestion control.
-+ */
-+static void bbr2_adapt_lower_bounds(struct sock *sk)
-+{
-+ struct tcp_sock *tp = tcp_sk(sk);
-+ struct bbr *bbr = inet_csk_ca(sk);
-+ u32 ecn_cut, ecn_inflight_lo, beta;
-+
-+ /* We only use lower-bound estimates when not probing bw.
-+ * When probing we need to push inflight higher to probe bw.
-+ */
-+ if (bbr2_is_probing_bandwidth(sk))
-+ return;
-+
-+ /* ECN response. */
-+ if (bbr->ecn_in_round && bbr->ecn_eligible && bbr->params.ecn_factor) {
-+ /* Reduce inflight to (1 - alpha*ecn_factor). */
-+ ecn_cut = (BBR_UNIT -
-+ ((bbr->ecn_alpha * bbr->params.ecn_factor) >>
-+ BBR_SCALE));
-+ if (bbr->inflight_lo == ~0U)
-+ bbr->inflight_lo = tp->snd_cwnd;
-+ ecn_inflight_lo = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE;
-+ } else {
-+ ecn_inflight_lo = ~0U;
-+ }
-+
-+ /* Loss response. */
-+ if (bbr->loss_in_round) {
-+ /* Reduce bw and inflight to (1 - beta). */
-+ if (bbr->bw_lo == ~0U)
-+ bbr->bw_lo = bbr_max_bw(sk);
-+ if (bbr->inflight_lo == ~0U)
-+ bbr->inflight_lo = tp->snd_cwnd;
-+ beta = bbr->params.beta;
-+ bbr->bw_lo =
-+ max_t(u32, bbr->bw_latest,
-+ (u64)bbr->bw_lo *
-+ (BBR_UNIT - beta) >> BBR_SCALE);
-+ bbr->inflight_lo =
-+ max_t(u32, bbr->inflight_latest,
-+ (u64)bbr->inflight_lo *
-+ (BBR_UNIT - beta) >> BBR_SCALE);
-+ }
-+
-+ /* Adjust to the lower of the levels implied by loss or ECN. */
-+ bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo);
-+}
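-+
-+/* For illustration with the default beta of 0.3 (BBR_UNIT * 30 / 100): a
-+ * round trip containing loss cuts bw_lo and inflight_lo to 70% of their
-+ * previous values, but never below bw_latest / inflight_latest, i.e. never
-+ * below what the current round actually delivered.
-+ */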
-+
-+/* Reset any short-term lower-bound adaptation to congestion, so that we can
-+ * push our inflight up.
-+ */
-+static void bbr2_reset_lower_bounds(struct sock *sk)
-+{
-+ struct bbr *bbr = inet_csk_ca(sk);
-+
-+ bbr->bw_lo = ~0U;
-+ bbr->inflight_lo = ~0U;
-+}
-+
-+/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state
-+ * machine phase where we adapt our lower bound based on congestion signals.
-+ */
-+static void bbr2_reset_congestion_signals(struct sock *sk)
-+{
-+ struct bbr *bbr = inet_csk_ca(sk);
-+
-+ bbr->loss_in_round = 0;
-+ bbr->ecn_in_round = 0;
-+ bbr->loss_in_cycle = 0;
-+ bbr->ecn_in_cycle = 0;
-+ bbr->bw_latest = 0;
-+ bbr->inflight_latest = 0;
-+}
-+
-+/* Update (most of) our congestion signals: track the recent rate and volume of
-+ * delivered data, presence of loss, and EWMA degree of ECN marking.
-+ */
-+static void bbr2_update_congestion_signals(
-+ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx)
-+{
-+ struct tcp_sock *tp = tcp_sk(sk);
-+ struct bbr *bbr = inet_csk_ca(sk);
-+ u64 bw;
-+
-+ bbr->loss_round_start = 0;
-+ if (rs->interval_us <= 0 || !rs->acked_sacked)
-+ return; /* Not a valid observation */
-+ bw = ctx->sample_bw;
-+
-+ if (!rs->is_app_limited || bw >= bbr_max_bw(sk))
-+ bbr2_take_bw_hi_sample(sk, bw);
-+
-+ bbr->loss_in_round |= (rs->losses > 0);
-+
-+ /* Update rate and volume of delivered data from latest round trip: */
-+ bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw);
-+ bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered);
-+
-+ if (before(rs->prior_delivered, bbr->loss_round_delivered))
-+ return; /* skip the per-round-trip updates */
-+ /* Now do per-round-trip updates. */
-+ bbr->loss_round_delivered = tp->delivered; /* mark round trip */
-+ bbr->loss_round_start = 1;
-+ bbr2_adapt_lower_bounds(sk);
-+
-+ /* Update windowed "latest" (single-round-trip) filters. */
-+ bbr->loss_in_round = 0;
-+ bbr->ecn_in_round = 0;
-+ bbr->bw_latest = ctx->sample_bw;
-+ bbr->inflight_latest = rs->delivered;
-+}
-+
-+/* Bandwidth probing can cause loss. To help coexistence with loss-based
-+ * congestion control we spread out our probing in a Reno-conscious way. Due to
-+ * the shape of the Reno sawtooth, the time required between loss epochs for an
-+ * idealized Reno flow is a number of round trips that is the BDP of that
-+ * flow. We count packet-timed round trips directly, since measured RTT can
-+ * vary widely, and Reno is driven by packet-timed round trips.
-+ */
-+static bool bbr2_is_reno_coexistence_probe_time(struct sock *sk)
-+{
-+ struct bbr *bbr = inet_csk_ca(sk);
-+ u32 inflight, rounds, reno_gain, reno_rounds;
-+
-+ /* Random loss can shave some small percentage off of our inflight
-+ * in each round. To survive this, flows need robust periodic probes.
-+ */
-+ rounds = bbr->params.bw_probe_max_rounds;
-+
-+ reno_gain = bbr->params.bw_probe_reno_gain;
-+ if (reno_gain) {
-+ inflight = bbr2_target_inflight(sk);
-+ reno_rounds = ((u64)inflight * reno_gain) >> BBR_SCALE;
-+ rounds = min(rounds, reno_rounds);
-+ }
-+ return bbr->rounds_since_probe >= rounds;
-+}
-+
-+/* How long do we want to wait before probing for bandwidth (and risking
-+ * loss)? We randomize the wait, for better mixing and fairness convergence.
-+ *
-+ * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips.
-+ * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow
-+ * (e.g. 4K video to a broadband user):
-+ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets
-+ *
-+ * We bound the BBR-native inter-bw-probe wall clock time to be:
-+ * (a) higher than 2 sec: to try to avoid causing loss for a long enough time
-+ * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must
-+ * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs
-+ * (b) lower than 3 sec: to ensure flows can start probing in a reasonable
-+ * amount of time to discover unutilized bw on human-scale interactive
-+ * time-scales (e.g. perhaps traffic from a web page download that we
-+ * were competing with is now complete).
-+ */
-+static void bbr2_pick_probe_wait(struct sock *sk)
-+{
-+ struct bbr *bbr = inet_csk_ca(sk);
-+
-+ /* Decide the random round-trip bound for wait until probe: */
-+ bbr->rounds_since_probe =
-+ prandom_u32_max(bbr->params.bw_probe_rand_rounds);
-+ /* Decide the random wall clock bound for wait until probe: */
-+ bbr->probe_wait_us = bbr->params.bw_probe_base_us +
-+ prandom_u32_max(bbr->params.bw_probe_rand_us);
-+}
-+
-+static void bbr2_set_cycle_idx(struct sock *sk, int cycle_idx)
-+{
-+ struct bbr *bbr = inet_csk_ca(sk);
-+
-+ bbr->cycle_idx = cycle_idx;
-+ /* New phase, so need to update cwnd and pacing rate. */
-+ bbr->try_fast_path = 0;
-+}
-+
-+/* Send at estimated bw to fill the pipe, but not queue. We need this phase
-+ * before PROBE_UP, because as soon as we send faster than the available bw
-+ * we will start building a queue, and if the buffer is shallow we can cause
-+ * loss. If we do not fill the pipe before we cause this loss, our bw_hi and
-+ * inflight_hi estimates will underestimate.
-+ */
-+static void bbr2_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds)
-+{
-+ struct tcp_sock *tp = tcp_sk(sk);
-+ struct bbr *bbr = inet_csk_ca(sk);
-+
-+ bbr2_reset_lower_bounds(sk);
-+ if (bbr->inflight_hi != ~0U)
-+ bbr->inflight_hi += bbr->params.refill_add_inc;
-+ bbr->bw_probe_up_rounds = bw_probe_up_rounds;
-+ bbr->bw_probe_up_acks = 0;
-+ bbr->stopped_risky_probe = 0;
-+ bbr->ack_phase = BBR_ACKS_REFILLING;
-+ bbr->next_rtt_delivered = tp->delivered;
-+ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_REFILL);
-+}
-+
-+/* Now probe max deliverable data rate and volume. */
-+static void bbr2_start_bw_probe_up(struct sock *sk)
-+{
-+ struct tcp_sock *tp = tcp_sk(sk);
-+ struct bbr *bbr = inet_csk_ca(sk);
-+
-+ bbr->ack_phase = BBR_ACKS_PROBE_STARTING;
-+ bbr->next_rtt_delivered = tp->delivered;
-+ bbr->cycle_mstamp = tp->tcp_mstamp;
-+ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_UP);
-+ bbr2_raise_inflight_hi_slope(sk);
-+}
-+
-+/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall
-+ * clock time at which to probe beyond an inflight that we think to be
-+ * safe. This will knowingly risk packet loss, so we want to do this rarely, to
-+ * keep packet loss rates low. Also start a round-trip counter, to probe faster
-+ * if we estimate a Reno flow at our BDP would probe faster.
-+ */
-+static void bbr2_start_bw_probe_down(struct sock *sk)
-+{
-+ struct tcp_sock *tp = tcp_sk(sk);
-+ struct bbr *bbr = inet_csk_ca(sk);
-+
-+ bbr2_reset_congestion_signals(sk);
-+ bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */
-+ bbr2_pick_probe_wait(sk);
-+ bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */
-+ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING;
-+ bbr->next_rtt_delivered = tp->delivered;
-+ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_DOWN);
-+}
-+
-+/* Cruise: maintain what we estimate to be a neutral, conservative
-+ * operating point, without attempting to probe up for bandwidth or down for
-+ * RTT, and only reducing inflight in response to loss/ECN signals.
-+ */
-+static void bbr2_start_bw_probe_cruise(struct sock *sk)
-+{
-+ struct bbr *bbr = inet_csk_ca(sk);
-+
-+ if (bbr->inflight_lo != ~0U)
-+ bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi);
-+
-+ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE);
-+}
-+
-+/* Loss and/or ECN rate is too high while probing.
-+ * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle.
-+ */
-+static void bbr2_handle_inflight_too_high(struct sock *sk,
-+ const struct rate_sample *rs)
-+{
-+ struct bbr *bbr = inet_csk_ca(sk);
-+ const u32 beta = bbr->params.beta;
-+
-+ bbr->prev_probe_too_high = 1;
-+ bbr->bw_probe_samples = 0; /* only react once per probe */
-+ bbr->debug.event = 'L'; /* Loss/ECN too high */
-+ /* If we are app-limited then we are not robustly
-+ * probing the max volume of inflight data we think
-+ * might be safe (analogous to how app-limited bw
-+ * samples are not known to be robustly probing bw).
-+ */
-+ if (!rs->is_app_limited)
-+ bbr->inflight_hi = max_t(u32, rs->tx_in_flight,
-+ (u64)bbr2_target_inflight(sk) *
-+ (BBR_UNIT - beta) >> BBR_SCALE);
-+ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP)
-+ bbr2_start_bw_probe_down(sk);
-+}
-+
-+/* If we're seeing bw and loss samples reflecting our bw probing, adapt
-+ * using the signals we see. If loss or ECN mark rate gets too high, then adapt
-+ * inflight_hi downward. If we're able to push inflight higher without such
-+ * signals, push higher: adapt inflight_hi upward.
-+ */
-+static bool bbr2_adapt_upper_bounds(struct sock *sk,
-+ const struct rate_sample *rs)
-+{
-+ struct bbr *bbr = inet_csk_ca(sk);
-+
-+ /* Track when we'll see bw/loss samples resulting from our bw probes. */
-+ if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start)
-+ bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK;
-+ if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) {
-+ /* End of samples from bw probing phase. */
-+ bbr->bw_probe_samples = 0;
-+ bbr->ack_phase = BBR_ACKS_INIT;
-+ /* At this point in the cycle, our current bw sample is also
-+ * our best recent chance at finding the highest available bw
-+ * for this flow. So now is the best time to forget the bw
-+ * samples from the previous cycle, by advancing the window.
-+ */
-+ if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited)
-+ bbr2_advance_bw_hi_filter(sk);
-+ /* If we had an inflight_hi, then probed and pushed inflight all
-+ * the way up to hit that inflight_hi without seeing any
-+ * high loss/ECN in all the resulting ACKs from that probing,
-+ * then probe up again, this time letting inflight persist at
-+ * inflight_hi for a round trip, then accelerating beyond.
-+ */
-+ if (bbr->mode == BBR_PROBE_BW &&
-+ bbr->stopped_risky_probe && !bbr->prev_probe_too_high) {
-+ bbr->debug.event = 'R'; /* reprobe */
-+ bbr2_start_bw_probe_refill(sk, 0);
-+ return true; /* yes, decided state transition */
-+ }
-+ }
-+
-+ if (bbr2_is_inflight_too_high(sk, rs)) {
-+ if (bbr->bw_probe_samples) /* sample is from bw probing? */
-+ bbr2_handle_inflight_too_high(sk, rs);
-+ } else {
-+ /* Loss/ECN rate is declared safe. Adjust upper bound upward. */
-+ if (bbr->inflight_hi == ~0U) /* no excess queue signals yet? */
-+ return false;
-+
-+ /* To be resilient to random loss, we must raise inflight_hi
-+ * if we observe in any phase that a higher level is safe.
-+ */
-+ if (rs->tx_in_flight > bbr->inflight_hi) {
-+ bbr->inflight_hi = rs->tx_in_flight;
-+ bbr->debug.event = 'U'; /* raise up inflight_hi */
-+ }
-+
-+ if (bbr->mode == BBR_PROBE_BW &&
-+ bbr->cycle_idx == BBR_BW_PROBE_UP)
-+ bbr2_probe_inflight_hi_upward(sk, rs);
-+ }
-+
-+ return false;
-+}
-+
-+/* Check if it's time to probe for bandwidth now, and if so, kick it off. */
-+static bool bbr2_check_time_to_probe_bw(struct sock *sk)
-+{
-+ struct bbr *bbr = inet_csk_ca(sk);
-+ u32 n;
-+
-+ /* If we seem to be at an operating point where we are not seeing loss
-+ * but we are seeing ECN marks, then when the ECN marks cease we reprobe
-+ * quickly (in case a burst of cross-traffic has ceased and freed up bw,
-+ * or in case we are sharing with multiplicatively probing traffic).
-+ */
-+ if (bbr->params.ecn_reprobe_gain && bbr->ecn_eligible &&
-+ bbr->ecn_in_cycle && !bbr->loss_in_cycle &&
-+ inet_csk(sk)->icsk_ca_state == TCP_CA_Open) {
-+ bbr->debug.event = 'A'; /* *A*ll clear to probe *A*gain */
-+ /* Calculate n so that when bbr2_raise_inflight_hi_slope()
-+ * computes growth_this_round as 2^n it will be roughly the
-+ * desired volume of data (inflight_hi*ecn_reprobe_gain).
-+ */
-+ n = ilog2((((u64)bbr->inflight_hi *
-+ bbr->params.ecn_reprobe_gain) >> BBR_SCALE));
-+ bbr2_start_bw_probe_refill(sk, n);
-+ return true;
-+ }
-+
-+ if (bbr2_has_elapsed_in_phase(sk, bbr->probe_wait_us) ||
-+ bbr2_is_reno_coexistence_probe_time(sk)) {
-+ bbr2_start_bw_probe_refill(sk, 0);
-+ return true;
-+ }
-+ return false;
-+}
-+
-+/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */
-+static bool bbr2_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw)
-+{
-+ struct bbr *bbr = inet_csk_ca(sk);
-+ bool is_under_bdp, is_long_enough;
-+
-+ /* Always need to pull inflight down to leave headroom in queue. */
-+ if (inflight > bbr2_inflight_with_headroom(sk))
-+ return false;
-+
-+ is_under_bdp = inflight <= bbr_inflight(sk, bw, BBR_UNIT);
-+ if (bbr->params.drain_to_target)
-+ return is_under_bdp;
-+
-+ is_long_enough = bbr2_has_elapsed_in_phase(sk, bbr->min_rtt_us);
-+ return is_under_bdp || is_long_enough;
-+}
-+
-+/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */
-+static void bbr2_update_cycle_phase(struct sock *sk,
-+ const struct rate_sample *rs)
-+{
-+ struct bbr *bbr = inet_csk_ca(sk);
-+ bool is_risky = false, is_queuing = false;
-+ u32 inflight, bw;
-+
-+ if (!bbr_full_bw_reached(sk))
-+ return;
-+
-+ /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. */
-+ if (bbr2_adapt_upper_bounds(sk, rs))
-+ return; /* already decided state transition */
-+
-+ if (bbr->mode != BBR_PROBE_BW)
-+ return;
-+
-+ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight);
-+ bw = bbr_max_bw(sk);
-+
-+ switch (bbr->cycle_idx) {
-+ /* First we spend most of our time cruising with a pacing_gain of 1.0,
-+ * which paces at the estimated bw, to try to fully use the pipe
-+ * without building queue. If we encounter loss/ECN marks, we adapt
-+ * by slowing down.
-+ */
-+ case BBR_BW_PROBE_CRUISE:
-+ if (bbr2_check_time_to_probe_bw(sk))
-+ return; /* already decided state transition */
-+ break;
-+
-+ /* After cruising, when it's time to probe, we first "refill": we send
-+ * at the estimated bw to fill the pipe, before probing higher and
-+ * knowingly risking overflowing the bottleneck buffer (causing loss).
-+ */
-+ case BBR_BW_PROBE_REFILL:
-+ if (bbr->round_start) {
-+ /* After one full round trip of sending in REFILL, we
-+ * start to see bw samples reflecting our REFILL, which
-+ * may be putting too much data in flight.
-+ */
-+ bbr->bw_probe_samples = 1;
-+ bbr2_start_bw_probe_up(sk);
-+ }
-+ break;
-+
-+ /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to
-+ * probe for bw. If we have not seen loss/ECN, we try to raise inflight
-+ * to at least pacing_gain*BDP; note that this may take more than
-+ * min_rtt if min_rtt is small (e.g. on a LAN).
-+ *
-+ * We terminate PROBE_UP bandwidth probing upon any of the following:
-+ *
-+ * (1) We've pushed inflight up to hit the inflight_hi target set in the
-+ * most recent previous bw probe phase. Thus we want to start
-+ * draining the queue immediately because it's very likely the most
-+ * recently sent packets will fill the queue and cause drops.
-+ * (checked here)
-+ * (2) We have probed for at least 1*min_rtt_us, and the
-+ * estimated queue is high enough (inflight > 1.25 * estimated_bdp).
-+ * (checked here)
-+ * (3) Loss filter says loss rate is "too high".
-+ * (checked in bbr_is_inflight_too_high())
-+ * (4) ECN filter says ECN mark rate is "too high".
-+ * (checked in bbr_is_inflight_too_high())
-+ */
-+ case BBR_BW_PROBE_UP:
-+ if (bbr->prev_probe_too_high &&
-+ inflight >= bbr->inflight_hi) {
-+ bbr->stopped_risky_probe = 1;
-+ is_risky = true;
-+ bbr->debug.event = 'D'; /* D for danger */
-+ } else if (bbr2_has_elapsed_in_phase(sk, bbr->min_rtt_us) &&
-+ inflight >=
-+ bbr_inflight(sk, bw,
-+ bbr->params.bw_probe_pif_gain)) {
-+ is_queuing = true;
-+ bbr->debug.event = 'Q'; /* building Queue */
-+ }
-+ if (is_risky || is_queuing) {
-+ bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */
-+ bbr2_start_bw_probe_down(sk); /* restart w/ down */
-+ }
-+ break;
-+
-+ /* After probing in PROBE_UP, we have usually accumulated some data in
-+ * the bottleneck buffer (if bw probing didn't find more bw). We next
-+ * enter PROBE_DOWN to try to drain any excess data from the queue. To
-+ * do this, we use a pacing_gain < 1.0. We hold this pacing gain until
-+ * our inflight is less than that target cruising point, which is the
-+ * minimum of (a) the amount needed to leave headroom, and (b) the
-+ * estimated BDP. Once inflight falls to match the target, we estimate
-+ * the queue is drained; persisting would underutilize the pipe.
-+ */
-+ case BBR_BW_PROBE_DOWN:
-+ if (bbr2_check_time_to_probe_bw(sk))
-+ return; /* already decided state transition */
-+ if (bbr2_check_time_to_cruise(sk, inflight, bw))
-+ bbr2_start_bw_probe_cruise(sk);
-+ break;
-+
-+ default:
-+ WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx);
-+ }
-+}
-+
-+/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */
-+static void bbr2_exit_probe_rtt(struct sock *sk)
-+{
-+ struct bbr *bbr = inet_csk_ca(sk);
-+
-+ bbr2_reset_lower_bounds(sk);
-+ if (bbr_full_bw_reached(sk)) {
-+ bbr->mode = BBR_PROBE_BW;
-+ /* Raising inflight after PROBE_RTT may cause loss, so reset
-+ * the PROBE_BW clock and schedule the next bandwidth probe for
-+ * a friendly and randomized future point in time.
-+ */
-+ bbr2_start_bw_probe_down(sk);
-+ /* Since we are exiting PROBE_RTT, we know inflight is
-+ * below our estimated BDP, so it is reasonable to cruise.
-+ */
-+ bbr2_start_bw_probe_cruise(sk);
-+ } else {
-+ bbr->mode = BBR_STARTUP;
-+ }
-+}
-+
-+/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until
-+ * the end of the round in recovery to get a good estimate of how many packets
-+ * have been lost, and how many we need to drain with a low pacing rate.
-+ */
-+static void bbr2_check_loss_too_high_in_startup(struct sock *sk,
-+ const struct rate_sample *rs)
-+{
-+ struct bbr *bbr = inet_csk_ca(sk);
-+
-+ if (bbr_full_bw_reached(sk))
-+ return;
-+
-+ /* For STARTUP exit, check the loss rate at the end of each round trip
-+ * of Recovery episodes in STARTUP. We check the loss rate at the end
-+ * of the round trip to filter out noisy/low loss and have a better
-+ * sense of inflight (extent of loss), so we can drain more accurately.
-+ */
-+ if (rs->losses && bbr->loss_events_in_round < 0xf)
-+ bbr->loss_events_in_round++; /* update saturating counter */
-+ if (bbr->params.full_loss_cnt && bbr->loss_round_start &&
-+ inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery &&
-+ bbr->loss_events_in_round >= bbr->params.full_loss_cnt &&
-+ bbr2_is_inflight_too_high(sk, rs)) {
-+ bbr->debug.event = 'P'; /* Packet loss caused STARTUP exit */
-+ bbr2_handle_queue_too_high_in_startup(sk);
-+ return;
-+ }
-+ if (bbr->loss_round_start)
-+ bbr->loss_events_in_round = 0;
-+}
-+
-+/* If we are done draining, advance into steady state operation in PROBE_BW. */
-+static void bbr2_check_drain(struct sock *sk, const struct rate_sample *rs,
-+ struct bbr_context *ctx)
-+{
-+ struct bbr *bbr = inet_csk_ca(sk);
-+
-+ if (bbr_check_drain(sk, rs, ctx)) {
-+ bbr->mode = BBR_PROBE_BW;
-+ bbr2_start_bw_probe_down(sk);
-+ }
-+}
-+
-+static void bbr2_update_model(struct sock *sk, const struct rate_sample *rs,
-+ struct bbr_context *ctx)
-+{
-+ bbr2_update_congestion_signals(sk, rs, ctx);
-+ bbr_update_ack_aggregation(sk, rs);
-+ bbr2_check_loss_too_high_in_startup(sk, rs);
-+ bbr_check_full_bw_reached(sk, rs);
-+ bbr2_check_drain(sk, rs, ctx);
-+ bbr2_update_cycle_phase(sk, rs);
-+ bbr_update_min_rtt(sk, rs);
-+}
-+
-+/* Fast path for app-limited case.
-+ *
-+ * On each ack, we execute bbr state machine, which primarily consists of:
-+ * 1) update model based on new rate sample, and
-+ * 2) update control based on updated model or state change.
-+ *
-+ * There are certain workloads/scenarios, e.g. the app-limited case, where
-+ * we can either skip updating the model, or skip updating both the model
-+ * and the control. This provides significant softirq CPU savings for
-+ * processing incoming acks.
-+ *
-+ * In case of app-limited, if there is no congestion (loss/ecn) and
-+ * if observed bw sample is less than current estimated bw, then we can
-+ * skip some of the computation in bbr state processing:
-+ *
-+ * - if there is no rtt/mode/phase change: in this case, since all the
-+ * parameters of the network model are constant, we can skip the model
-+ * update as well as the control update.
-+ *
-+ * - else we can skip the rest of the model update. But we still need to
-+ * update the control to account for the new rtt/mode/phase.
-+ *
-+ * Returns whether we can take fast path or not.
-+ */
-+static bool bbr2_fast_path(struct sock *sk, bool *update_model,
-+ const struct rate_sample *rs, struct bbr_context *ctx)
-+{
-+ struct bbr *bbr = inet_csk_ca(sk);
-+ u32 prev_min_rtt_us, prev_mode;
-+
-+ if (bbr->params.fast_path && bbr->try_fast_path &&
-+ rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) &&
-+ !bbr->loss_in_round && !bbr->ecn_in_round) {
-+ prev_mode = bbr->mode;
-+ prev_min_rtt_us = bbr->min_rtt_us;
-+ bbr2_check_drain(sk, rs, ctx);
-+ bbr2_update_cycle_phase(sk, rs);
-+ bbr_update_min_rtt(sk, rs);
-+
-+ if (bbr->mode == prev_mode &&
-+ bbr->min_rtt_us == prev_min_rtt_us &&
-+ bbr->try_fast_path)
-+ return true;
-+
-+ /* Skip model update, but control still needs to be updated */
-+ *update_model = false;
-+ }
-+ return false;
-+}
-+
-+static void bbr2_main(struct sock *sk, const struct rate_sample *rs)
-+{
-+ struct tcp_sock *tp = tcp_sk(sk);
-+ struct bbr *bbr = inet_csk_ca(sk);
-+ struct bbr_context ctx = { 0 };
-+ bool update_model = true;
-+ u32 bw;
-+
-+ bbr->debug.event = '.'; /* init to default NOP (no event yet) */
-+
-+ bbr_update_round_start(sk, rs, &ctx);
-+ if (bbr->round_start) {
-+ bbr->rounds_since_probe =
-+ min_t(s32, bbr->rounds_since_probe + 1, 0xFF);
-+ bbr2_update_ecn_alpha(sk);
-+ }
-+
-+ bbr->ecn_in_round |= rs->is_ece;
-+ bbr_calculate_bw_sample(sk, rs, &ctx);
-+
-+ if (bbr2_fast_path(sk, &update_model, rs, &ctx))
-+ goto out;
-+
-+ if (update_model)
-+ bbr2_update_model(sk, rs, &ctx);
-+
-+ bbr_update_gains(sk);
-+ bw = bbr_bw(sk);
-+ bbr_set_pacing_rate(sk, bw, bbr->pacing_gain);
-+ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain,
-+ tp->snd_cwnd, &ctx);
-+ bbr2_bound_cwnd_for_inflight_model(sk);
-+
-+out:
-+ bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state;
-+ bbr->loss_in_cycle |= rs->lost > 0;
-+ bbr->ecn_in_cycle |= rs->delivered_ce > 0;
-+
-+ bbr_debug(sk, rs->acked_sacked, rs, &ctx);
-+}
-+
-+/* Module parameters that are settable by TCP_CONGESTION_PARAMS are declared
-+ * down here, so that the algorithm functions that use the parameters must use
-+ * the per-socket parameters; if they accidentally use the global version
-+ * then there will be a compile error.
-+ * TODO(ncardwell): move all per-socket parameters down to this section.
-+ */
-+
-+/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE.
-+ * No loss response when 0. Max allowed value is 255.
-+ */
-+static u32 bbr_beta = BBR_UNIT * 30 / 100;
-+
-+/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE.
-+ * Max allowed value is 255.
-+ */
-+static u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; /* 1/16 = 6.25% */
-+
-+/* The initial value for the ecn_alpha state variable. Default and max
-+ * BBR_UNIT (256), representing 1.0. This allows a flow to respond quickly
-+ * to congestion if the bottleneck is congested when the flow starts up.
-+ */
-+static u32 bbr_ecn_alpha_init = BBR_UNIT; /* 1.0, to respond quickly */
-+
-+/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE.
-+ * No ECN based bounding when 0. Max allowed value is 255.
-+ */
-+static u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */
-+
-+/* Estimate bw probing has gone too far if CE ratio exceeds this threshold.
-+ * Scaled by BBR_SCALE. Disabled when 0. Max allowed is 255.
-+ */
-+static u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */
-+
-+/* Max RTT (in usec) at which to use sender-side ECN logic.
-+ * Disabled when 0 (ECN allowed at any RTT).
-+ * Max allowed for the parameter is 524287 (0x7ffff) us, ~524 ms.
-+ */
-+static u32 bbr_ecn_max_rtt_us = 5000;
-+
-+/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN
-+ * clears then use a multiplicative increase to quickly reprobe bw by
-+ * starting inflight probing at the given multiple of inflight_hi.
-+ * Default for this experimental knob is 0 (disabled).
-+ * Planned value for experiments: BBR_UNIT * 1 / 2 = 128, representing 0.5.
-+ */
-+static u32 bbr_ecn_reprobe_gain;
-+
-+/* Estimate bw probing has gone too far if loss rate exceeds this level. */
-+static u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */
-+
-+/* Exit STARTUP if number of loss marking events in a Recovery round is >= N,
-+ * and loss rate is higher than bbr_loss_thresh.
-+ * Disabled if 0. Max allowed value is 15 (0xF).
-+ */
-+static u32 bbr_full_loss_cnt = 8;
-+
-+/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh
-+ * meets this count. Max allowed value is 3.
-+ */
-+static u32 bbr_full_ecn_cnt = 2;
-+
-+/* Fraction of unutilized headroom to try to leave in path upon high loss. */
-+static u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100;
-+
-+/* Multiplier to get target inflight (as multiple of BDP) for PROBE_UP phase.
-+ * Default is 1.25x, as in BBR v1. Max allowed is 511.
-+ */
-+static u32 bbr_bw_probe_pif_gain = BBR_UNIT * 5 / 4;
-+
-+/* Multiplier to get Reno-style probe epoch duration as: k * BDP round trips.
-+ * If zero, disables this BBR v2 Reno-style BDP-scaled coexistence mechanism.
-+ * Max allowed is 511.
-+ */
-+static u32 bbr_bw_probe_reno_gain = BBR_UNIT;
-+
-+/* Max number of packet-timed rounds to wait before probing for bandwidth. If
-+ * we want to tolerate 1% random loss per round, and not have this cut our
-+ * inflight too much, we must probe for bw periodically on roughly this scale.
-+ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance.
-+ * We aim to be fair with Reno/CUBIC up to a BDP of at least:
-+ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets
-+ */
-+static u32 bbr_bw_probe_max_rounds = 63;
-+
-+/* Max amount of randomness to inject in round counting for Reno-coexistence.
-+ * Max value is 15.
-+ */
-+static u32 bbr_bw_probe_rand_rounds = 2;
-+
-+/* Use BBR-native probe time scale starting at this many usec.
-+ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least:
-+ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs
-+ */
-+static u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */
-+
-+/* Use BBR-native probes spread over this many usec: */
-+static u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */
-+
-+/* Undo the model changes made in loss recovery if recovery was spurious? */
-+static bool bbr_undo = true;
-+
-+/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */
-+static bool bbr_fast_path = true; /* default: enabled */
-+
-+/* Use fast ack mode? */
-+static int bbr_fast_ack_mode = 1; /* default: rwnd check off */
-+
-+/* How much to additively increase inflight_hi when entering REFILL? */
-+static u32 bbr_refill_add_inc; /* default: disabled */
-+
-+module_param_named(beta, bbr_beta, uint, 0644);
-+module_param_named(ecn_alpha_gain, bbr_ecn_alpha_gain, uint, 0644);
-+module_param_named(ecn_alpha_init, bbr_ecn_alpha_init, uint, 0644);
-+module_param_named(ecn_factor, bbr_ecn_factor, uint, 0644);
-+module_param_named(ecn_thresh, bbr_ecn_thresh, uint, 0644);
-+module_param_named(ecn_max_rtt_us, bbr_ecn_max_rtt_us, uint, 0644);
-+module_param_named(ecn_reprobe_gain, bbr_ecn_reprobe_gain, uint, 0644);
-+module_param_named(loss_thresh, bbr_loss_thresh, uint, 0664);
-+module_param_named(full_loss_cnt, bbr_full_loss_cnt, uint, 0664);
-+module_param_named(full_ecn_cnt, bbr_full_ecn_cnt, uint, 0664);
-+module_param_named(inflight_headroom, bbr_inflight_headroom, uint, 0664);
-+module_param_named(bw_probe_pif_gain, bbr_bw_probe_pif_gain, uint, 0664);
-+module_param_named(bw_probe_reno_gain, bbr_bw_probe_reno_gain, uint, 0664);
-+module_param_named(bw_probe_max_rounds, bbr_bw_probe_max_rounds, uint, 0664);
-+module_param_named(bw_probe_rand_rounds, bbr_bw_probe_rand_rounds, uint, 0664);
-+module_param_named(bw_probe_base_us, bbr_bw_probe_base_us, uint, 0664);
-+module_param_named(bw_probe_rand_us, bbr_bw_probe_rand_us, uint, 0664);
-+module_param_named(undo, bbr_undo, bool, 0664);
-+module_param_named(fast_path, bbr_fast_path, bool, 0664);
-+module_param_named(fast_ack_mode, bbr_fast_ack_mode, uint, 0664);
-+module_param_named(refill_add_inc, bbr_refill_add_inc, uint, 0664);
-+
-+static void bbr2_init(struct sock *sk)
-+{
-+ struct tcp_sock *tp = tcp_sk(sk);
-+ struct bbr *bbr = inet_csk_ca(sk);
-+
-+ bbr_init(sk); /* run shared init code for v1 and v2 */
-+
-+ /* BBR v2 parameters: */
-+ bbr->params.beta = min_t(u32, 0xFFU, bbr_beta);
-+ bbr->params.ecn_alpha_gain = min_t(u32, 0xFFU, bbr_ecn_alpha_gain);
-+ bbr->params.ecn_alpha_init = min_t(u32, BBR_UNIT, bbr_ecn_alpha_init);
-+ bbr->params.ecn_factor = min_t(u32, 0xFFU, bbr_ecn_factor);
-+ bbr->params.ecn_thresh = min_t(u32, 0xFFU, bbr_ecn_thresh);
-+ bbr->params.ecn_max_rtt_us = min_t(u32, 0x7ffffU, bbr_ecn_max_rtt_us);
-+ bbr->params.ecn_reprobe_gain = min_t(u32, 0x1FF, bbr_ecn_reprobe_gain);
-+ bbr->params.loss_thresh = min_t(u32, 0xFFU, bbr_loss_thresh);
-+ bbr->params.full_loss_cnt = min_t(u32, 0xFU, bbr_full_loss_cnt);
-+ bbr->params.full_ecn_cnt = min_t(u32, 0x3U, bbr_full_ecn_cnt);
-+ bbr->params.inflight_headroom =
-+ min_t(u32, 0xFFU, bbr_inflight_headroom);
-+ bbr->params.bw_probe_pif_gain =
-+ min_t(u32, 0x1FFU, bbr_bw_probe_pif_gain);
-+ bbr->params.bw_probe_reno_gain =
-+ min_t(u32, 0x1FFU, bbr_bw_probe_reno_gain);
-+ bbr->params.bw_probe_max_rounds =
-+ min_t(u32, 0xFFU, bbr_bw_probe_max_rounds);
-+ bbr->params.bw_probe_rand_rounds =
-+ min_t(u32, 0xFU, bbr_bw_probe_rand_rounds);
-+ bbr->params.bw_probe_base_us =
-+ min_t(u32, (1 << 26) - 1, bbr_bw_probe_base_us);
-+ bbr->params.bw_probe_rand_us =
-+ min_t(u32, (1 << 26) - 1, bbr_bw_probe_rand_us);
-+ bbr->params.undo = bbr_undo;
-+ bbr->params.fast_path = bbr_fast_path ? 1 : 0;
-+ bbr->params.refill_add_inc = min_t(u32, 0x3U, bbr_refill_add_inc);
-+
-+ /* BBR v2 state: */
-+ bbr->initialized = 1;
-+ /* Start sampling ECN mark rate after first full flight is ACKed: */
-+ bbr->loss_round_delivered = tp->delivered + 1;
-+ bbr->loss_round_start = 0;
-+ bbr->undo_bw_lo = 0;
-+ bbr->undo_inflight_lo = 0;
-+ bbr->undo_inflight_hi = 0;
-+ bbr->loss_events_in_round = 0;
-+ bbr->startup_ecn_rounds = 0;
-+ bbr2_reset_congestion_signals(sk);
-+ bbr->bw_lo = ~0U;
-+ bbr->bw_hi[0] = 0;
-+ bbr->bw_hi[1] = 0;
-+ bbr->inflight_lo = ~0U;
-+ bbr->inflight_hi = ~0U;
-+ bbr->bw_probe_up_cnt = ~0U;
-+ bbr->bw_probe_up_acks = 0;
-+ bbr->bw_probe_up_rounds = 0;
-+ bbr->probe_wait_us = 0;
-+ bbr->stopped_risky_probe = 0;
-+ bbr->ack_phase = BBR_ACKS_INIT;
-+ bbr->rounds_since_probe = 0;
-+ bbr->bw_probe_samples = 0;
-+ bbr->prev_probe_too_high = 0;
-+ bbr->ecn_eligible = 0;
-+ bbr->ecn_alpha = bbr->params.ecn_alpha_init;
-+ bbr->alpha_last_delivered = 0;
-+ bbr->alpha_last_delivered_ce = 0;
-+
-+ tp->fast_ack_mode = min_t(u32, 0x2U, bbr_fast_ack_mode);
-+}
-+
-+/* Core TCP stack informs us that the given skb was just marked lost. */
-+static void bbr2_skb_marked_lost(struct sock *sk, const struct sk_buff *skb)
-+{
-+ struct tcp_sock *tp = tcp_sk(sk);
-+ struct bbr *bbr = inet_csk_ca(sk);
-+ struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
-+ struct rate_sample rs;
-+
-+ /* Capture "current" data over the full round trip of loss,
-+ * to have a better chance to see the full capacity of the path.
-+ */
-+ if (!bbr->loss_in_round) /* first loss in this round trip? */
-+ bbr->loss_round_delivered = tp->delivered; /* set round trip */
-+ bbr->loss_in_round = 1;
-+ bbr->loss_in_cycle = 1;
-+
-+ if (!bbr->bw_probe_samples)
-+ return; /* not an skb sent while probing for bandwidth */
-+ if (unlikely(!scb->tx.delivered_mstamp))
-+ return; /* skb was SACKed, reneged, marked lost; ignore it */
-+ /* We are probing for bandwidth. Construct a rate sample that
-+ * estimates what happened in the flight leading up to this lost skb,
-+ * then see if the loss rate went too high, and if so at which packet.
-+ */
-+ memset(&rs, 0, sizeof(rs));
-+ rs.tx_in_flight = scb->tx.in_flight;
-+ rs.lost = tp->lost - scb->tx.lost;
-+ rs.is_app_limited = scb->tx.is_app_limited;
-+ if (bbr2_is_inflight_too_high(sk, &rs)) {
-+ rs.tx_in_flight = bbr2_inflight_hi_from_lost_skb(sk, &rs, skb);
-+ bbr2_handle_inflight_too_high(sk, &rs);
-+ }
-+}
-+
-+/* Revert short-term model if current loss recovery event was spurious. */
-+static u32 bbr2_undo_cwnd(struct sock *sk)
-+{
-+ struct tcp_sock *tp = tcp_sk(sk);
-+ struct bbr *bbr = inet_csk_ca(sk);
-+
-+ bbr->debug.undo = 1;
-+ bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */
-+ bbr->full_bw_cnt = 0;
-+ bbr->loss_in_round = 0;
-+
-+ if (!bbr->params.undo)
-+ return tp->snd_cwnd;
-+
-+ /* Revert to cwnd and other state saved before loss episode. */
-+ bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo);
-+ bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo);
-+ bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi);
-+ return bbr->prior_cwnd;
-+}
-+
-+/* Entering loss recovery, so save state for when we undo recovery. */
-+static u32 bbr2_ssthresh(struct sock *sk)
-+{
-+ struct bbr *bbr = inet_csk_ca(sk);
-+
-+ bbr_save_cwnd(sk);
-+ /* For undo, save state that adapts based on loss signal. */
-+ bbr->undo_bw_lo = bbr->bw_lo;
-+ bbr->undo_inflight_lo = bbr->inflight_lo;
-+ bbr->undo_inflight_hi = bbr->inflight_hi;
-+ return tcp_sk(sk)->snd_ssthresh;
-+}
-+
-+static enum tcp_bbr2_phase bbr2_get_phase(struct bbr *bbr)
-+{
-+ switch (bbr->mode) {
-+ case BBR_STARTUP:
-+ return BBR2_PHASE_STARTUP;
-+ case BBR_DRAIN:
-+ return BBR2_PHASE_DRAIN;
-+ case BBR_PROBE_BW:
-+ break;
-+ case BBR_PROBE_RTT:
-+ return BBR2_PHASE_PROBE_RTT;
-+ default:
-+ return BBR2_PHASE_INVALID;
-+ }
-+ switch (bbr->cycle_idx) {
-+ case BBR_BW_PROBE_UP:
-+ return BBR2_PHASE_PROBE_BW_UP;
-+ case BBR_BW_PROBE_DOWN:
-+ return BBR2_PHASE_PROBE_BW_DOWN;
-+ case BBR_BW_PROBE_CRUISE:
-+ return BBR2_PHASE_PROBE_BW_CRUISE;
-+ case BBR_BW_PROBE_REFILL:
-+ return BBR2_PHASE_PROBE_BW_REFILL;
-+ default:
-+ return BBR2_PHASE_INVALID;
-+ }
-+}
-+
-+static size_t bbr2_get_info(struct sock *sk, u32 ext, int *attr,
-+ union tcp_cc_info *info)
-+{
-+ if (ext & (1 << (INET_DIAG_BBRINFO - 1)) ||
-+ ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
-+ struct bbr *bbr = inet_csk_ca(sk);
-+ u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk));
-+ u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk));
-+ u64 bw_lo = bbr->bw_lo == ~0U ?
-+ ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo);
-+
-+ memset(&info->bbr2, 0, sizeof(info->bbr2));
-+ info->bbr2.bbr_bw_lsb = (u32)bw;
-+ info->bbr2.bbr_bw_msb = (u32)(bw >> 32);
-+ info->bbr2.bbr_min_rtt = bbr->min_rtt_us;
-+ info->bbr2.bbr_pacing_gain = bbr->pacing_gain;
-+ info->bbr2.bbr_cwnd_gain = bbr->cwnd_gain;
-+ info->bbr2.bbr_bw_hi_lsb = (u32)bw_hi;
-+ info->bbr2.bbr_bw_hi_msb = (u32)(bw_hi >> 32);
-+ info->bbr2.bbr_bw_lo_lsb = (u32)bw_lo;
-+ info->bbr2.bbr_bw_lo_msb = (u32)(bw_lo >> 32);
-+ info->bbr2.bbr_mode = bbr->mode;
-+ info->bbr2.bbr_phase = (__u8)bbr2_get_phase(bbr);
-+ info->bbr2.bbr_version = (__u8)2;
-+ info->bbr2.bbr_inflight_lo = bbr->inflight_lo;
-+ info->bbr2.bbr_inflight_hi = bbr->inflight_hi;
-+ info->bbr2.bbr_extra_acked = bbr_extra_acked(sk);
-+ *attr = INET_DIAG_BBRINFO;
-+ return sizeof(info->bbr2);
-+ }
-+ return 0;
-+}
-+
-+static void bbr2_set_state(struct sock *sk, u8 new_state)
-+{
-+ struct tcp_sock *tp = tcp_sk(sk);
-+ struct bbr *bbr = inet_csk_ca(sk);
-+
-+ if (new_state == TCP_CA_Loss) {
-+ struct rate_sample rs = { .losses = 1 };
-+ struct bbr_context ctx = { 0 };
-+
-+ bbr->prev_ca_state = TCP_CA_Loss;
-+ bbr->full_bw = 0;
-+ if (!bbr2_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) {
-+ /* bbr_adapt_lower_bounds() needs cwnd before
-+ * we suffered an RTO, to update inflight_lo:
-+ */
-+ bbr->inflight_lo =
-+ max(tp->snd_cwnd, bbr->prior_cwnd);
-+ }
-+ bbr_debug(sk, 0, &rs, &ctx);
-+ } else if (bbr->prev_ca_state == TCP_CA_Loss &&
-+ new_state != TCP_CA_Loss) {
-+ tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd);
-+ bbr->try_fast_path = 0; /* bound cwnd using latest model */
-+ }
-+}
-+
-+static struct tcp_congestion_ops tcp_bbr2_cong_ops __read_mostly = {
-+ .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS,
-+ .name = "bbr2",
-+ .owner = THIS_MODULE,
-+ .init = bbr2_init,
-+ .cong_control = bbr2_main,
-+ .sndbuf_expand = bbr_sndbuf_expand,
-+ .skb_marked_lost = bbr2_skb_marked_lost,
-+ .undo_cwnd = bbr2_undo_cwnd,
-+ .cwnd_event = bbr_cwnd_event,
-+ .ssthresh = bbr2_ssthresh,
-+ .tso_segs = bbr_tso_segs,
-+ .get_info = bbr2_get_info,
-+ .set_state = bbr2_set_state,
-+};
-+
-+static int __init bbr_register(void)
-+{
-+ BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE);
-+ return tcp_register_congestion_control(&tcp_bbr2_cong_ops);
-+}
-+
-+static void __exit bbr_unregister(void)
-+{
-+ tcp_unregister_congestion_control(&tcp_bbr2_cong_ops);
-+}
-+
-+module_init(bbr_register);
-+module_exit(bbr_unregister);
-+
-+MODULE_AUTHOR("Van Jacobson <vanj@google.com>");
-+MODULE_AUTHOR("Neal Cardwell <ncardwell@google.com>");
-+MODULE_AUTHOR("Yuchung Cheng <ycheng@google.com>");
-+MODULE_AUTHOR("Soheil Hassas Yeganeh <soheil@google.com>");
-+MODULE_AUTHOR("Priyaranjan Jha <priyarjha@google.com>");
-+MODULE_AUTHOR("Yousuk Seung <ysseung@google.com>");
-+MODULE_AUTHOR("Kevin Yang <yyd@google.com>");
-+MODULE_AUTHOR("Arjun Roy <arjunroy@google.com>");
-+
-+MODULE_LICENSE("Dual BSD/GPL");
-+MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)");
-diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
-index 563d016e7..1c94abb6f 100644
---- a/net/ipv4/tcp_cong.c
-+++ b/net/ipv4/tcp_cong.c
-@@ -179,6 +179,7 @@ void tcp_init_congestion_control(struct sock *sk)
- struct inet_connection_sock *icsk = inet_csk(sk);
-
- tcp_sk(sk)->prior_ssthresh = 0;
-+ tcp_sk(sk)->fast_ack_mode = 0;
- if (icsk->icsk_ca_ops->init)
- icsk->icsk_ca_ops->init(sk);
- if (tcp_ca_needs_ecn(sk))
-diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
-index 69a545db8..45aaba87c 100644
---- a/net/ipv4/tcp_input.c
-+++ b/net/ipv4/tcp_input.c
-@@ -348,7 +348,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
- tcp_enter_quickack_mode(sk, 2);
- break;
- case INET_ECN_CE:
-- if (tcp_ca_needs_ecn(sk))
-+ if (tcp_ca_wants_ce_events(sk))
- tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);
-
- if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
-@@ -359,7 +359,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
- tp->ecn_flags |= TCP_ECN_SEEN;
- break;
- default:
-- if (tcp_ca_needs_ecn(sk))
-+ if (tcp_ca_wants_ce_events(sk))
- tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
- tp->ecn_flags |= TCP_ECN_SEEN;
- break;
-@@ -1039,7 +1039,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
- */
- static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb)
- {
-+ struct sock *sk = (struct sock *)tp;
-+ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
-+
- tp->lost += tcp_skb_pcount(skb);
-+ if (ca_ops->skb_marked_lost)
-+ ca_ops->skb_marked_lost(sk, skb);
- }
-
- void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
-@@ -1420,6 +1425,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
- WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount);
- tcp_skb_pcount_add(skb, -pcount);
-
-+ /* Adjust tx.in_flight as pcount is shifted from skb to prev. */
-+ if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount,
-+ "prev in_flight: %u skb in_flight: %u pcount: %u",
-+ TCP_SKB_CB(prev)->tx.in_flight,
-+ TCP_SKB_CB(skb)->tx.in_flight,
-+ pcount))
-+ TCP_SKB_CB(skb)->tx.in_flight = 0;
-+ else
-+ TCP_SKB_CB(skb)->tx.in_flight -= pcount;
-+ TCP_SKB_CB(prev)->tx.in_flight += pcount;
-+
- /* When we're adding to gso_segs == 1, gso_size will be zero,
- * in theory this shouldn't be necessary but as long as DSACK
- * code can come after this skb later on it's better to keep
-@@ -3182,7 +3198,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb,
- long seq_rtt_us = -1L;
- long ca_rtt_us = -1L;
- u32 pkts_acked = 0;
-- u32 last_in_flight = 0;
- bool rtt_update;
- int flag = 0;
-
-@@ -3218,7 +3233,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb,
- if (!first_ackt)
- first_ackt = last_ackt;
-
-- last_in_flight = TCP_SKB_CB(skb)->tx.in_flight;
- if (before(start_seq, reord))
- reord = start_seq;
- if (!after(scb->end_seq, tp->high_seq))
-@@ -3284,8 +3298,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb,
- seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt);
- ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt);
-
-- if (pkts_acked == 1 && last_in_flight < tp->mss_cache &&
-- last_in_flight && !prior_sacked && fully_acked &&
-+ if (pkts_acked == 1 && fully_acked && !prior_sacked &&
-+ (tp->snd_una - prior_snd_una) < tp->mss_cache &&
- sack->rate->prior_delivered + 1 == tp->delivered &&
- !(flag & (FLAG_CA_ALERT | FLAG_SYN_ACKED))) {
- /* Conservatively mark a delayed ACK. It's typically
-@@ -3342,9 +3356,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb,
-
- if (icsk->icsk_ca_ops->pkts_acked) {
- struct ack_sample sample = { .pkts_acked = pkts_acked,
-- .rtt_us = sack->rate->rtt_us,
-- .in_flight = last_in_flight };
-+ .rtt_us = sack->rate->rtt_us };
-
-+ sample.in_flight = tp->mss_cache *
-+ (tp->delivered - sack->rate->prior_delivered);
- icsk->icsk_ca_ops->pkts_acked(sk, &sample);
- }
-
-@@ -3742,6 +3757,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
-
- prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una;
- rs.prior_in_flight = tcp_packets_in_flight(tp);
-+ tcp_rate_check_app_limited(sk);
-
- /* ts_recent update must be made after we are sure that the packet
- * is in window.
-@@ -3839,6 +3855,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
- delivered = tcp_newly_delivered(sk, delivered, flag);
- lost = tp->lost - lost; /* freshly marked lost */
- rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
-+ rs.is_ece = !!(flag & FLAG_ECE);
- tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
- tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
- tcp_xmit_recovery(sk, rexmit);
-@@ -5399,13 +5416,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
-
- /* More than one full frame received... */
- if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
-+ (tp->fast_ack_mode == 1 ||
- /* ... and right edge of window advances far enough.
- * (tcp_recvmsg() will send ACK otherwise).
- * If application uses SO_RCVLOWAT, we want send ack now if
- * we have not received enough bytes to satisfy the condition.
- */
-- (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
-- __tcp_select_window(sk) >= tp->rcv_wnd)) ||
-+ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
-+ __tcp_select_window(sk) >= tp->rcv_wnd))) ||
- /* We ACK each frame or... */
- tcp_in_quickack_mode(sk) ||
- /* Protocol state mandates a one-time immediate ACK */
-diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
-index fbf140a77..90d939375 100644
---- a/net/ipv4/tcp_output.c
-+++ b/net/ipv4/tcp_output.c
-@@ -1256,8 +1256,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
- tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
- skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
- if (clone_it) {
-- TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
-- - tp->snd_una;
- oskb = skb;
-
- tcp_skb_tsorted_save(oskb) {
-@@ -1536,7 +1534,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
- {
- struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *buff;
-- int nsize, old_factor;
-+ int nsize, old_factor, inflight_prev;
- long limit;
- int nlen;
- u8 flags;
-@@ -1615,6 +1613,15 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
-
- if (diff)
- tcp_adjust_pcount(sk, skb, diff);
-+
-+ /* Set buff tx.in_flight as if buff were sent by itself. */
-+ inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor;
-+ if (WARN_ONCE(inflight_prev < 0,
-+ "inconsistent: tx.in_flight: %u old_factor: %d",
-+ TCP_SKB_CB(skb)->tx.in_flight, old_factor))
-+ inflight_prev = 0;
-+ TCP_SKB_CB(buff)->tx.in_flight = inflight_prev +
-+ tcp_skb_pcount(buff);
- }
-
- /* Link BUFF into the send queue. */
-@@ -1982,13 +1989,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
- static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
- {
- const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
-- u32 min_tso, tso_segs;
--
-- min_tso = ca_ops->min_tso_segs ?
-- ca_ops->min_tso_segs(sk) :
-- sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs;
-+ u32 tso_segs;
-
-- tso_segs = tcp_tso_autosize(sk, mss_now, min_tso);
-+ tso_segs = ca_ops->tso_segs ?
-+ ca_ops->tso_segs(sk, mss_now) :
-+ tcp_tso_autosize(sk, mss_now,
-+ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
- return min_t(u32, tso_segs, sk->sk_gso_max_segs);
- }
-
-@@ -2628,6 +2634,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
- skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache;
- list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
- tcp_init_tso_segs(skb, mss_now);
-+ tcp_set_tx_in_flight(sk, skb);
- goto repair; /* Skip network transmission */
- }
-
-diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
-index 0de693565..796fa6e53 100644
---- a/net/ipv4/tcp_rate.c
-+++ b/net/ipv4/tcp_rate.c
-@@ -34,6 +34,24 @@
- * ready to send in the write queue.
- */
-
-+void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb)
-+{
-+ struct tcp_sock *tp = tcp_sk(sk);
-+ u32 in_flight;
-+
-+ /* Check, sanitize, and record packets in flight after skb was sent. */
-+ in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb);
-+ if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX,
-+ "insane in_flight %u cc %s mss %u "
-+ "cwnd %u pif %u %u %u %u\n",
-+ in_flight, inet_csk(sk)->icsk_ca_ops->name,
-+ tp->mss_cache, tp->snd_cwnd,
-+ tp->packets_out, tp->retrans_out,
-+ tp->sacked_out, tp->lost_out))
-+ in_flight = TCPCB_IN_FLIGHT_MAX;
-+ TCP_SKB_CB(skb)->tx.in_flight = in_flight;
-+}
-+
- /* Snapshot the current delivery information in the skb, to generate
- * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered().
- */
-@@ -65,7 +83,10 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
- TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp;
- TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp;
- TCP_SKB_CB(skb)->tx.delivered = tp->delivered;
-+ TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce;
-+ TCP_SKB_CB(skb)->tx.lost = tp->lost;
- TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0;
-+ tcp_set_tx_in_flight(sk, skb);
- }
-
- /* When an skb is sacked or acked, we fill in the rate sample with the (prior)
-@@ -86,16 +107,20 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
-
- if (!rs->prior_delivered ||
- after(scb->tx.delivered, rs->prior_delivered)) {
-+ rs->prior_lost = scb->tx.lost;
-+ rs->prior_delivered_ce = scb->tx.delivered_ce;
- rs->prior_delivered = scb->tx.delivered;
- rs->prior_mstamp = scb->tx.delivered_mstamp;
- rs->is_app_limited = scb->tx.is_app_limited;
- rs->is_retrans = scb->sacked & TCPCB_RETRANS;
-+ rs->tx_in_flight = scb->tx.in_flight;
-
- /* Record send time of most recently ACKed packet: */
- tp->first_tx_mstamp = tcp_skb_timestamp_us(skb);
- /* Find the duration of the "send phase" of this window: */
-- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp,
-- scb->tx.first_tx_mstamp);
-+ rs->interval_us = tcp_stamp32_us_delta(
-+ tp->first_tx_mstamp,
-+ scb->tx.first_tx_mstamp);
-
- }
- /* Mark off the skb delivered once it's sacked to avoid being
-@@ -137,6 +162,11 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
- return;
- }
- rs->delivered = tp->delivered - rs->prior_delivered;
-+ rs->lost = tp->lost - rs->prior_lost;
-+
-+ rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce;
-+ /* delivered_ce occupies less than 32 bits in the skb control block */
-+ rs->delivered_ce &= TCPCB_DELIVERED_CE_MASK;
-
- /* Model sending data and receiving ACKs as separate pipeline phases
- * for a window. Usually the ACK phase is longer, but with ACK
-@@ -144,7 +174,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
- * longer phase.
- */
- snd_us = rs->interval_us; /* send phase */
-- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp,
-+ ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp,
- rs->prior_mstamp); /* ack phase */
- rs->interval_us = max(snd_us, ack_us);
-
-diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
-index 4ef08079c..b5b24caa8 100644
---- a/net/ipv4/tcp_timer.c
-+++ b/net/ipv4/tcp_timer.c
-@@ -607,6 +607,7 @@ void tcp_write_timer_handler(struct sock *sk)
- goto out;
- }
-
-+ tcp_rate_check_app_limited(sk);
- tcp_mstamp_refresh(tcp_sk(sk));
- event = icsk->icsk_pending;
-
---
-2.31.1.305.gd1b10fc6d8
-
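For reference, a minimal userspace sketch of the parameter-clamping pattern used in bbr2_init() above, where each module parameter is bounded with min_t() to the bit width of its per-socket field. The struct and values below are hypothetical stand-ins, not part of the patch; they only show why saturating with a clamp beats letting a narrow bitfield silently truncate an out-of-range value.

#include <stdio.h>

/* Stand-in for the kernel's min_t(u32, max, val). */
static unsigned int clamp_u32(unsigned int max, unsigned int val)
{
	return val < max ? val : max;
}

/* Hypothetical stand-in for the width-limited fields of struct bbr. */
struct demo_params {
	unsigned int beta:8;		/* 8 bits, so legal values are 0..255 */
	unsigned int full_loss_cnt:4;	/* 4 bits, so legal values are 0..15 */
};

int main(void)
{
	unsigned int module_beta = 300;		/* out-of-range module parameter */
	struct demo_params raw = { 0 }, clamped = { 0 };

	raw.beta = module_beta;				/* truncates: 300 % 256 = 44 */
	clamped.beta = clamp_u32(0xFFU, module_beta);	/* saturates at 255 */
	clamped.full_loss_cnt = clamp_u32(0xFU, 20);	/* saturates at 15 */

	printf("raw beta=%u clamped beta=%u full_loss_cnt=%u\n",
	       (unsigned int)raw.beta, (unsigned int)clamped.beta,
	       (unsigned int)clamped.full_loss_cnt);
	return 0;
}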
diff --git a/0010-btrfs.patch b/0010-btrfs.patch
deleted file mode 100644
index 457e2445824d..000000000000
--- a/0010-btrfs.patch
+++ /dev/null
@@ -1,2157 +0,0 @@
-From dfe89528bf8d093c1df80ea3fea2a50d3dc4a302 Mon Sep 17 00:00:00 2001
-From: Josef Bacik <josef@toxicpanda.com>
-Date: Mon, 5 Aug 2019 14:31:53 -0400
-Subject: [PATCH 01/22] btrfs: add a force_chunk_alloc to space_info's sysfs
-
-In testing various things such as the btrfsck patch to detect over
-allocation of chunks, empty block group deletion, and balance I've had
-various ways to force chunk allocations for debug purposes. Add a sysfs
-file to enable forcing of chunk allocation for the owning space info in
-order to enable us to add testcases in the future to test these various
-features easier.
-
-[HH: rebased for 5.4]
-Signed-off-by: Josef Bacik <josef@toxicpanda.com>
----
- fs/btrfs/sysfs.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++
- 1 file changed, 64 insertions(+)
-
-diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
-index 6eb1c50fa..9372ef191 100644
---- a/fs/btrfs/sysfs.c
-+++ b/fs/btrfs/sysfs.c
-@@ -72,6 +72,7 @@ static struct btrfs_feature_attr btrfs_attr_features_##_name = { \
-
- static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj);
- static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj);
-+static inline struct kobject *get_btrfs_kobj(struct kobject *kobj);
-
- static struct btrfs_feature_attr *to_btrfs_feature_attr(struct kobj_attribute *a)
- {
-@@ -640,6 +641,58 @@ static struct kobj_type btrfs_raid_ktype = {
- .default_groups = raid_groups,
- };
-
-+static ssize_t btrfs_space_info_force_chunk_alloc_show(struct kobject *kobj,
-+ struct kobj_attribute *a,
-+ char *buf)
-+{
-+ return snprintf(buf, PAGE_SIZE, "0\n");
-+}
-+
-+static ssize_t btrfs_space_info_force_chunk_alloc(struct kobject *kobj,
-+ struct kobj_attribute *a,
-+ const char *buf, size_t len)
-+{
-+ struct btrfs_space_info *space_info = to_space_info(kobj);
-+ struct btrfs_fs_info *fs_info = to_fs_info(get_btrfs_kobj(kobj));
-+ struct btrfs_trans_handle *trans;
-+ unsigned long val;
-+ int ret;
-+
-+ if (!fs_info) {
-+ printk(KERN_ERR "couldn't get fs_info\n");
-+ return -EPERM;
-+ }
-+
-+ if (!capable(CAP_SYS_ADMIN))
-+ return -EPERM;
-+
-+ if (sb_rdonly(fs_info->sb))
-+ return -EROFS;
-+
-+ ret = kstrtoul(buf, 10, &val);
-+ if (ret)
-+ return ret;
-+
-+ /*
-+ * We don't really care, but if we echo 0 > force it seems silly to do
-+ * anything.
-+ */
-+ if (val == 0)
-+ return -EINVAL;
-+
-+ trans = btrfs_start_transaction(fs_info->extent_root, 0);
-+ if (IS_ERR(trans))
-+ return PTR_ERR(trans);
-+ ret = btrfs_force_chunk_alloc(trans, space_info->flags);
-+ btrfs_end_transaction(trans);
-+ if (ret == 1)
-+ return len;
-+ return -ENOSPC;
-+}
-+BTRFS_ATTR_RW(space_info, force_chunk_alloc,
-+ btrfs_space_info_force_chunk_alloc_show,
-+ btrfs_space_info_force_chunk_alloc);
-+
- #define SPACE_INFO_ATTR(field) \
- static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \
- struct kobj_attribute *a, \
-@@ -684,6 +737,7 @@ static struct attribute *space_info_attrs[] = {
- BTRFS_ATTR_PTR(space_info, disk_used),
- BTRFS_ATTR_PTR(space_info, disk_total),
- BTRFS_ATTR_PTR(space_info, total_bytes_pinned),
-+ BTRFS_ATTR_PTR(space_info, force_chunk_alloc),
- NULL,
- };
- ATTRIBUTE_GROUPS(space_info);
-@@ -1006,6 +1060,16 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
- return to_fs_devs(kobj)->fs_info;
- }
-
-+static inline struct kobject *get_btrfs_kobj(struct kobject *kobj)
-+{
-+ while (kobj) {
-+ if (kobj->ktype == &btrfs_ktype)
-+ return kobj;
-+ kobj = kobj->parent;
-+ }
-+ return NULL;
-+}
-+
- #define NUM_FEATURE_BITS 64
- #define BTRFS_FEATURE_NAME_MAX 13
- static char btrfs_unknown_feature_names[FEAT_MAX][NUM_FEATURE_BITS][BTRFS_FEATURE_NAME_MAX];
---
-2.32.0
-
-
-From e104f0dda22a999ddd5f0be76ffc62637b411a3f Mon Sep 17 00:00:00 2001
-From: Josef Bacik <josef@toxicpanda.com>
-Date: Fri, 24 Jul 2020 12:41:47 -0400
-Subject: [PATCH 02/22] btrfs: do not evaluate the expression with
- !CONFIG_BTRFS_ASSERT
-
-While investigating a performance issue I noticed that turning off
-CONFIG_BTRFS_ASSERT had no effect in what I was seeing in perf,
-specifically check_setget_bounds() was around 5% for this workload.
-Upon investigation I realized that I made a mistake when I added
-ASSERT(), I would still evaluate the expression, but simply ignore the
-result.
-
-This is useless, and has a marked impact on performance. This
-microbenchmark is the watered down version of an application that is
-experiencing performance issues, and does renames and creates over and
-over again. Doing these operations 200k times without this patch takes
-13 seconds on my machine. With this patch it takes 7 seconds.
-
-[HH: removed the second hunk for 5.7.x]
-Signed-off-by: Josef Bacik <josef@toxicpanda.com>
-Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
----
- fs/btrfs/ctree.h | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
-index 29ef96903..12921830e 100644
---- a/fs/btrfs/ctree.h
-+++ b/fs/btrfs/ctree.h
-@@ -3402,7 +3402,7 @@ static inline void assertfail(const char *expr, const char *file, int line)
-
- #else
- static inline void assertfail(const char *expr, const char* file, int line) { }
--#define ASSERT(expr) (void)(expr)
-+#define ASSERT(expr) ((void)0)
- #endif
-
- /*
---
-2.32.0
-
-
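A minimal userspace illustration of the ASSERT() change above; the macros and helper below are hypothetical stand-ins, not the kernel definitions. The old form still evaluates its argument and merely discards the result, while the new form compiles the argument away entirely, which is where the reported 13s to 7s improvement in the commit message comes from.

#include <stdio.h>

static int calls;

static int expensive_check(void)
{
	calls++;		/* stands in for check_setget_bounds() */
	return 1;
}

#define ASSERT_OLD(expr)	(void)(expr)	/* evaluates expr, discards the result */
#define ASSERT_NEW(expr)	((void)0)	/* expr is never evaluated at all */

int main(void)
{
	ASSERT_OLD(expensive_check());
	ASSERT_NEW(expensive_check());
	printf("calls = %d\n", calls);	/* prints 1: only the old form ran the check */
	return 0;
}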
-From a8c8b6d8a9763fe25f616567c8005318d3cbd948 Mon Sep 17 00:00:00 2001
-From: Josef Bacik <josef@toxicpanda.com>
-Date: Fri, 20 Mar 2020 14:34:36 -0400
-Subject: [PATCH 03/22] btrfs: restart snapshot delete if we have to end the
- transaction
-
-This is to fully fix the deadlock described in
-
-btrfs: do not resolve backrefs for roots that are being deleted
-
-Holding write locks on our deleted snapshot across trans handles will
-just lead to sadness, and our backref lookup code is going to want to
-still process dropped snapshots for things like qgroup accounting.
-
-Fix this by simply dropping our path before we restart our transaction,
-and picking back up from our drop_progress key. This is less efficient
-obviously, but it also doesn't deadlock, so it feels like a reasonable
-trade off.
-
-Signed-off-by: Josef Bacik <josef@toxicpanda.com>
----
- fs/btrfs/extent-tree.c | 16 ++++++++++++++++
- 1 file changed, 16 insertions(+)
-
-diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
-index 27c368007..1aff5769c 100644
---- a/fs/btrfs/extent-tree.c
-+++ b/fs/btrfs/extent-tree.c
-@@ -5563,6 +5563,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
- * already dropped.
- */
- set_bit(BTRFS_ROOT_DELETING, &root->state);
-+again:
- if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
- level = btrfs_header_level(root->node);
- path->nodes[level] = btrfs_lock_root_node(root);
-@@ -5574,7 +5575,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
- btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
- memcpy(&wc->update_progress, &key,
- sizeof(wc->update_progress));
-+ memcpy(&wc->drop_progress, &key, sizeof(key));
-
-+ wc->drop_level = btrfs_root_drop_level(root_item);
- level = btrfs_root_drop_level(root_item);
- BUG_ON(level == 0);
- path->lowest_level = level;
-@@ -5666,6 +5669,18 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
- goto out_end_trans;
- }
-
-+ /*
-+ * We used to keep the path open until we completed the
-+ * snapshot delete. However this can deadlock with
-+ * things like backref walking that may want to resolve
-+ * references that still point to this deleted root. We
-+ * already have the ability to restart snapshot
-+ * deletions on mount, so just clear our walk_control,
-+ * drop the path, and go to the beginning and re-lookup
-+ * our drop_progress key and continue from there.
-+ */
-+ memset(wc, 0, sizeof(*wc));
-+ btrfs_release_path(path);
- btrfs_end_transaction_throttle(trans);
- if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
- btrfs_debug(fs_info,
-@@ -5687,6 +5702,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
- err = PTR_ERR(trans);
- goto out_free;
- }
-+ goto again;
- }
- }
- btrfs_release_path(path);
---
-2.32.0
-
-
-From f241ea708d4c4da7800436d3c74d0cd7836c75e8 Mon Sep 17 00:00:00 2001
-From: Josef Bacik <josef@toxicpanda.com>
-Date: Wed, 24 Mar 2021 09:44:21 -0400
-Subject: [PATCH 04/22] btrfs: use percpu_read_positive instead of sum_positive
- for need_preempt
-
-Looking at perf data for a fio workload I noticed that we were spending
-a pretty large chunk of time (around 5%) doing percpu_counter_sum() in
-need_preemptive_reclaim. This is silly, as we only want to know if we
-have more ordered than delalloc to see if we should be counting the
-delayed items in our threshold calculation. Change this to
-percpu_read_positive() to avoid the overhead.
-
-I ran this through fsperf to validate the changes, obviously the latency
-numbers in dbench and fio are quite jittery, so take them as you wish,
-but overall the improvements on throughput, iops, and bw are all
-positive. Each test was run two times, the given value is the average
-of both runs for their respective column.
-
-btrfs ssd normal test results
-
-bufferedrandwrite16g results
- metric baseline current diff
-==========================================================
-write_io_kbytes 16777216 16777216 0.00%
-read_clat_ns_p99 0 0 0.00%
-write_bw_bytes 1.04e+08 1.05e+08 1.12%
-read_iops 0 0 0.00%
-write_clat_ns_p50 13888 11840 -14.75%
-read_io_kbytes 0 0 0.00%
-read_io_bytes 0 0 0.00%
-write_clat_ns_p99 35008 29312 -16.27%
-read_bw_bytes 0 0 0.00%
-elapsed 170 167 -1.76%
-write_lat_ns_min 4221.50 3762.50 -10.87%
-sys_cpu 39.65 35.37 -10.79%
-write_lat_ns_max 2.67e+10 2.50e+10 -6.63%
-read_lat_ns_min 0 0 0.00%
-write_iops 25270.10 25553.43 1.12%
-read_lat_ns_max 0 0 0.00%
-read_clat_ns_p50 0 0 0.00%
-
-dbench60 results
- metric baseline current diff
-==================================================
-qpathinfo 11.12 12.73 14.52%
-throughput 416.09 445.66 7.11%
-flush 3485.63 1887.55 -45.85%
-qfileinfo 0.70 1.92 173.86%
-ntcreatex 992.60 695.76 -29.91%
-qfsinfo 2.43 3.71 52.48%
-close 1.67 3.14 88.09%
-sfileinfo 66.54 105.20 58.10%
-rename 809.23 619.59 -23.43%
-find 16.88 15.46 -8.41%
-unlink 820.54 670.86 -18.24%
-writex 3375.20 2637.91 -21.84%
-deltree 386.33 449.98 16.48%
-readx 3.43 3.41 -0.60%
-mkdir 0.05 0.03 -38.46%
-lockx 0.26 0.26 -0.76%
-unlockx 0.81 0.32 -60.33%
-
-dio4kbs16threads results
- metric baseline current diff
-================================================================
-write_io_kbytes 5249676 3357150 -36.05%
-read_clat_ns_p99 0 0 0.00%
-write_bw_bytes 89583501.50 57291192.50 -36.05%
-read_iops 0 0 0.00%
-write_clat_ns_p50 242688 263680 8.65%
-read_io_kbytes 0 0 0.00%
-read_io_bytes 0 0 0.00%
-write_clat_ns_p99 15826944 36732928 132.09%
-read_bw_bytes 0 0 0.00%
-elapsed 61 61 0.00%
-write_lat_ns_min 42704 42095 -1.43%
-sys_cpu 5.27 3.45 -34.52%
-write_lat_ns_max 7.43e+08 9.27e+08 24.71%
-read_lat_ns_min 0 0 0.00%
-write_iops 21870.97 13987.11 -36.05%
-read_lat_ns_max 0 0 0.00%
-read_clat_ns_p50 0 0 0.00%
-
-randwrite2xram results
- metric baseline current diff
-================================================================
-write_io_kbytes 24831972 28876262 16.29%
-read_clat_ns_p99 0 0 0.00%
-write_bw_bytes 83745273.50 92182192.50 10.07%
-read_iops 0 0 0.00%
-write_clat_ns_p50 13952 11648 -16.51%
-read_io_kbytes 0 0 0.00%
-read_io_bytes 0 0 0.00%
-write_clat_ns_p99 50176 52992 5.61%
-read_bw_bytes 0 0 0.00%
-elapsed 314 332 5.73%
-write_lat_ns_min 5920.50 5127 -13.40%
-sys_cpu 7.82 7.35 -6.07%
-write_lat_ns_max 5.27e+10 3.88e+10 -26.44%
-read_lat_ns_min 0 0 0.00%
-write_iops 20445.62 22505.42 10.07%
-read_lat_ns_max 0 0 0.00%
-read_clat_ns_p50 0 0 0.00%
-
-untarfirefox results
-metric baseline current diff
-==============================================
-elapsed 47.41 47.40 -0.03%
-
-Signed-off-by: Josef Bacik <josef@toxicpanda.com>
----
- fs/btrfs/space-info.c | 4 ++--
- 1 file changed, 2 insertions(+), 2 deletions(-)
-
-diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
-index 2da6177f4..2dc674b7c 100644
---- a/fs/btrfs/space-info.c
-+++ b/fs/btrfs/space-info.c
-@@ -861,8 +861,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
- * of heavy DIO or ordered reservations, preemptive flushing will just
- * waste time and cause us to slow down.
- */
-- ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes);
-- delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
-+ ordered = percpu_counter_read_positive(&fs_info->ordered_bytes);
-+ delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes);
- if (ordered >= delalloc)
- used += fs_info->delayed_refs_rsv.reserved +
- fs_info->delayed_block_rsv.reserved;
---
-2.32.0
-
-
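A rough userspace model of the percpu_counter trade-off described above; the types and numbers are made up, not the kernel API (which lives in lib/percpu_counter.c). Summing walks every per-CPU slot on each call, while reading returns a cheap, possibly slightly stale, batched total, which is accurate enough for the coarse ordered-vs-delalloc comparison in need_preemptive_reclaim().

#include <stdio.h>

#define NCPUS 64

/* Made-up model, not the kernel's struct percpu_counter. */
struct fake_percpu_counter {
	long long global;		/* batched, approximate total */
	long long percpu[NCPUS];	/* per-CPU deltas not yet folded in */
};

static long long fake_sum_positive(const struct fake_percpu_counter *c)
{
	long long total = c->global;

	for (int i = 0; i < NCPUS; i++)		/* O(nr_cpus) work per call */
		total += c->percpu[i];
	return total > 0 ? total : 0;
}

static long long fake_read_positive(const struct fake_percpu_counter *c)
{
	return c->global > 0 ? c->global : 0;	/* O(1), may lag by the batch size */
}

int main(void)
{
	struct fake_percpu_counter c = { .global = 1000, .percpu = { [3] = 7 } };

	printf("sum=%lld read=%lld\n", fake_sum_positive(&c), fake_read_positive(&c));
	return 0;
}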
-From aafa62450e938fae02462b7d47e055eb6307c57d Mon Sep 17 00:00:00 2001
-From: Filipe Manana <fdmanana@suse.com>
-Date: Mon, 1 Mar 2021 09:26:42 +0000
-Subject: [PATCH 05/22] btrfs: add btree read ahead for full send operations
-
-When doing a full send we know that we are going to be reading every node
-and leaf of the send root, so we benefit from enabling read ahead for the
-btree.
-
-This change enables read ahead for full send operations only, incremental
-sends will have read ahead enabled in a different way by a separate patch.
-
-The following test script was used to measure the improvement on a box
-using an average, consumer grade, spinning disk and with 16Gb of ram:
-
- $ cat test.sh
- #!/bin/bash
-
- DEV=/dev/sdj
- MNT=/mnt/sdj
- MKFS_OPTIONS="--nodesize 16384" # default, just to be explicit
- MOUNT_OPTIONS="-o max_inline=2048" # default, just to be explicit
-
- mkfs.btrfs -f $MKFS_OPTIONS $DEV > /dev/null
- mount $MOUNT_OPTIONS $DEV $MNT
-
- # Create files with inline data to make it easier and faster to create
- # large btrees.
- add_files()
- {
- local total=$1
- local start_offset=$2
- local number_jobs=$3
- local total_per_job=$(($total / $number_jobs))
-
- echo "Creating $total new files using $number_jobs jobs"
- for ((n = 0; n < $number_jobs; n++)); do
- (
- local start_num=$(($start_offset + $n * $total_per_job))
- for ((i = 1; i <= $total_per_job; i++)); do
- local file_num=$((start_num + $i))
- local file_path="$MNT/file_${file_num}"
- xfs_io -f -c "pwrite -S 0xab 0 2000" $file_path > /dev/null
- if [ $? -ne 0 ]; then
- echo "Failed creating file $file_path"
- break
- fi
- done
- ) &
- worker_pids[$n]=$!
- done
-
- wait ${worker_pids[@]}
-
- sync
- echo
- echo "btree node/leaf count: $(btrfs inspect-internal dump-tree -t 5 $DEV | egrep '^(node|leaf) ' | wc -l)"
- }
-
- initial_file_count=500000
- add_files $initial_file_count 0 4
-
- echo
- echo "Creating first snapshot..."
- btrfs subvolume snapshot -r $MNT $MNT/snap1
-
- echo
- echo "Adding more files..."
- add_files $((initial_file_count / 4)) $initial_file_count 4
-
- echo
- echo "Updating 1/50th of the initial files..."
- for ((i = 1; i < $initial_file_count; i += 50)); do
- xfs_io -c "pwrite -S 0xcd 0 20" $MNT/file_$i > /dev/null
- done
-
- echo
- echo "Creating second snapshot..."
- btrfs subvolume snapshot -r $MNT $MNT/snap2
-
- umount $MNT
-
- echo 3 > /proc/sys/vm/drop_caches
- blockdev --flushbufs $DEV &> /dev/null
- hdparm -F $DEV &> /dev/null
-
- mount $MOUNT_OPTIONS $DEV $MNT
-
- echo
- echo "Testing full send..."
- start=$(date +%s)
- btrfs send $MNT/snap1 > /dev/null
- end=$(date +%s)
- echo
- echo "Full send took $((end - start)) seconds"
-
- umount $MNT
-
- echo 3 > /proc/sys/vm/drop_caches
- blockdev --flushbufs $DEV &> /dev/null
- hdparm -F $DEV &> /dev/null
-
- mount $MOUNT_OPTIONS $DEV $MNT
-
- echo
- echo "Testing incremental send..."
- start=$(date +%s)
- btrfs send -p $MNT/snap1 $MNT/snap2 > /dev/null
- end=$(date +%s)
- echo
- echo "Incremental send took $((end - start)) seconds"
-
- umount $MNT
-
-Before this change, full send duration:
-
-with $initial_file_count == 200000: 165 seconds
-with $initial_file_count == 500000: 407 seconds
-
-After this change, full send duration:
-
-with $initial_file_count == 200000: 149 seconds (-10.2%)
-with $initial_file_count == 500000: 353 seconds (-14.2%)
-
-For $initial_file_count == 200000 there are 62600 nodes and leaves in the
-btree of the first snapshot, while for $initial_file_count == 500000 there
-are 152476 nodes and leaves. The roots were at level 2.
-
-Signed-off-by: Filipe Manana <fdmanana@suse.com>
----
- fs/btrfs/send.c | 1 +
- 1 file changed, 1 insertion(+)
-
-diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
-index 8ae8f1732..9817da145 100644
---- a/fs/btrfs/send.c
-+++ b/fs/btrfs/send.c
-@@ -6650,6 +6650,7 @@ static int full_send_tree(struct send_ctx *sctx)
- path = alloc_path_for_send();
- if (!path)
- return -ENOMEM;
-+ path->reada = READA_FORWARD;
-
- key.objectid = BTRFS_FIRST_FREE_OBJECTID;
- key.type = BTRFS_INODE_ITEM_KEY;
---
-2.32.0
-
-
-From a1df61cf5c2efa2298869acced2eb5f6a51e27ed Mon Sep 17 00:00:00 2001
-From: Filipe Manana <fdmanana@suse.com>
-Date: Mon, 1 Mar 2021 09:26:43 +0000
-Subject: [PATCH 06/22] btrfs: add btree read ahead for incremental send
- operations
-
-Currently we do not do btree read ahead when doing an incremental send,
-however we know that we will read and process any node or leaf in the
-send root that has a generation greater than the generation of the parent
-root. So triggering read ahead for such nodes and leafs is beneficial
-for an incremental send.
-
-This change does that, triggers read ahead of any node or leaf in the
-send root that has a generation greater then the generation of the
-parent root. As for the parent root, no readahead is triggered because
-knowing in advance which nodes/leaves are going to be read is not so
-linear and there's often a large time window between visiting nodes or
-leaves of the parent root. So I opted to leave out the parent root,
-and triggering read ahead for its nodes/leaves seemed to have not made
-significant difference.
-
-The following test script was used to measure the improvement on a box
-using an average, consumer grade, spinning disk and with 16Gb of ram:
-
- $ cat test.sh
- #!/bin/bash
-
- DEV=/dev/sdj
- MNT=/mnt/sdj
- MKFS_OPTIONS="--nodesize 16384" # default, just to be explicit
- MOUNT_OPTIONS="-o max_inline=2048" # default, just to be explicit
-
- mkfs.btrfs -f $MKFS_OPTIONS $DEV > /dev/null
- mount $MOUNT_OPTIONS $DEV $MNT
-
- # Create files with inline data to make it easier and faster to create
- # large btrees.
- add_files()
- {
- local total=$1
- local start_offset=$2
- local number_jobs=$3
- local total_per_job=$(($total / $number_jobs))
-
- echo "Creating $total new files using $number_jobs jobs"
- for ((n = 0; n < $number_jobs; n++)); do
- (
- local start_num=$(($start_offset + $n * $total_per_job))
- for ((i = 1; i <= $total_per_job; i++)); do
- local file_num=$((start_num + $i))
- local file_path="$MNT/file_${file_num}"
- xfs_io -f -c "pwrite -S 0xab 0 2000" $file_path > /dev/null
- if [ $? -ne 0 ]; then
- echo "Failed creating file $file_path"
- break
- fi
- done
- ) &
- worker_pids[$n]=$!
- done
-
- wait ${worker_pids[@]}
-
- sync
- echo
- echo "btree node/leaf count: $(btrfs inspect-internal dump-tree -t 5 $DEV | egrep '^(node|leaf) ' | wc -l)"
- }
-
- initial_file_count=500000
- add_files $initial_file_count 0 4
-
- echo
- echo "Creating first snapshot..."
- btrfs subvolume snapshot -r $MNT $MNT/snap1
-
- echo
- echo "Adding more files..."
- add_files $((initial_file_count / 4)) $initial_file_count 4
-
- echo
- echo "Updating 1/50th of the initial files..."
- for ((i = 1; i < $initial_file_count; i += 50)); do
- xfs_io -c "pwrite -S 0xcd 0 20" $MNT/file_$i > /dev/null
- done
-
- echo
- echo "Creating second snapshot..."
- btrfs subvolume snapshot -r $MNT $MNT/snap2
-
- umount $MNT
-
- echo 3 > /proc/sys/vm/drop_caches
- blockdev --flushbufs $DEV &> /dev/null
- hdparm -F $DEV &> /dev/null
-
- mount $MOUNT_OPTIONS $DEV $MNT
-
- echo
- echo "Testing full send..."
- start=$(date +%s)
- btrfs send $MNT/snap1 > /dev/null
- end=$(date +%s)
- echo
- echo "Full send took $((end - start)) seconds"
-
- umount $MNT
-
- echo 3 > /proc/sys/vm/drop_caches
- blockdev --flushbufs $DEV &> /dev/null
- hdparm -F $DEV &> /dev/null
-
- mount $MOUNT_OPTIONS $DEV $MNT
-
- echo
- echo "Testing incremental send..."
- start=$(date +%s)
- btrfs send -p $MNT/snap1 $MNT/snap2 > /dev/null
- end=$(date +%s)
- echo
- echo "Incremental send took $((end - start)) seconds"
-
- umount $MNT
-
-Before this change, incremental send duration:
-
-with $initial_file_count == 200000: 51 seconds
-with $initial_file_count == 500000: 168 seconds
-
-After this change, incremental send duration:
-
-with $initial_file_count == 200000: 39 seconds (-26.7%)
-with $initial_file_count == 500000: 125 seconds (-29.4%)
-
-For $initial_file_count == 200000 there are 62600 nodes and leaves in the
-btree of the first snapshot, and 77759 nodes and leaves in the btree of
-the second snapshot. The root nodes were at level 2.
-
-While for $initial_file_count == 500000 there are 152476 nodes and leaves
-in the btree of the first snapshot, and 190511 nodes and leaves in the
-btree of the second snapshot. The root nodes were at level 2 as well.
-
-Signed-off-by: Filipe Manana <fdmanana@suse.com>
----
- fs/btrfs/send.c | 42 ++++++++++++++++++++++++++++++++++++------
- 1 file changed, 36 insertions(+), 6 deletions(-)
-
-diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
-index 9817da145..ed1310e38 100644
---- a/fs/btrfs/send.c
-+++ b/fs/btrfs/send.c
-@@ -6689,15 +6689,35 @@ static int full_send_tree(struct send_ctx *sctx)
- return ret;
- }
-
--static int tree_move_down(struct btrfs_path *path, int *level)
-+static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen)
- {
- struct extent_buffer *eb;
-+ struct extent_buffer *parent = path->nodes[*level];
-+ int slot = path->slots[*level];
-+ const int nritems = btrfs_header_nritems(parent);
-+ u64 reada_max;
-+ u64 reada_done = 0;
-
- BUG_ON(*level == 0);
-- eb = btrfs_read_node_slot(path->nodes[*level], path->slots[*level]);
-+ eb = btrfs_read_node_slot(parent, slot);
- if (IS_ERR(eb))
- return PTR_ERR(eb);
-
-+ /*
-+ * Trigger readahead for the next leaves we will process, so that it is
-+ * very likely that when we need them they are already in memory and we
-+ * will not block on disk IO. For nodes we only do readahead for one,
-+ * since the time window between processing nodes is typically larger.
-+ */
-+ reada_max = *level == 1 ? SZ_128K : eb->fs_info->nodesize;
-+
-+ for (slot++; slot < nritems && reada_done < reada_max; slot++) {
-+ if (btrfs_node_ptr_generation(parent, slot) > reada_min_gen) {
-+ btrfs_readahead_node_child(parent, slot);
-+ reada_done += eb->fs_info->nodesize;
-+ }
-+ }
-+
- path->nodes[*level - 1] = eb;
- path->slots[*level - 1] = 0;
- (*level)--;
-@@ -6737,14 +6757,15 @@ static int tree_move_next_or_upnext(struct btrfs_path *path,
- static int tree_advance(struct btrfs_path *path,
- int *level, int root_level,
- int allow_down,
-- struct btrfs_key *key)
-+ struct btrfs_key *key,
-+ u64 reada_min_gen)
- {
- int ret;
-
- if (*level == 0 || !allow_down) {
- ret = tree_move_next_or_upnext(path, level, root_level);
- } else {
-- ret = tree_move_down(path, level);
-+ ret = tree_move_down(path, level, reada_min_gen);
- }
- if (ret >= 0) {
- if (*level == 0)
-@@ -6818,6 +6839,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
- u64 right_blockptr;
- u64 left_gen;
- u64 right_gen;
-+ u64 reada_min_gen;
-
- left_path = btrfs_alloc_path();
- if (!left_path) {
-@@ -6897,6 +6919,14 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
- ret = -ENOMEM;
- goto out;
- }
-+ /*
-+ * Our right root is the parent root, while the left root is the "send"
-+ * root. We know that all new nodes/leaves in the left root must have
-+ * a generation greater than the right root's generation, so we trigger
-+ * readahead for those nodes and leaves of the left root, as we know we
-+ * will need to read them at some point.
-+ */
-+ reada_min_gen = btrfs_header_generation(right_root->commit_root);
- up_read(&fs_info->commit_root_sem);
-
- if (left_level == 0)
-@@ -6921,7 +6951,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
- ret = tree_advance(left_path, &left_level,
- left_root_level,
- advance_left != ADVANCE_ONLY_NEXT,
-- &left_key);
-+ &left_key, reada_min_gen);
- if (ret == -1)
- left_end_reached = ADVANCE;
- else if (ret < 0)
-@@ -6932,7 +6962,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
- ret = tree_advance(right_path, &right_level,
- right_root_level,
- advance_right != ADVANCE_ONLY_NEXT,
-- &right_key);
-+ &right_key, reada_min_gen);
- if (ret == -1)
- right_end_reached = ADVANCE;
- else if (ret < 0)
---
-2.32.0
-
-
-From 54ee4da43eb299229eced65ffd9097b16003d1a3 Mon Sep 17 00:00:00 2001
-From: Josef Bacik <josef@toxicpanda.com>
-Date: Wed, 28 Apr 2021 13:38:42 -0400
-Subject: [PATCH 07/22] btrfs: check worker before need_preemptive_reclaim
-
-need_preemptive_reclaim() does some calculations, which aren't heavy,
-but if we're already running preemptive reclaim there's no reason to do
-them at all, so re-order the checks so that we don't do the calculation
-if we're already doing reclaim.
-
-Signed-off-by: Josef Bacik <josef@toxicpanda.com>
----
- fs/btrfs/space-info.c | 4 ++--
- 1 file changed, 2 insertions(+), 2 deletions(-)
-
-diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
-index 2dc674b7c..c9a5e003b 100644
---- a/fs/btrfs/space-info.c
-+++ b/fs/btrfs/space-info.c
-@@ -1588,8 +1588,8 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
- * the async reclaim as we will panic.
- */
- if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
-- need_preemptive_reclaim(fs_info, space_info) &&
-- !work_busy(&fs_info->preempt_reclaim_work)) {
-+ !work_busy(&fs_info->preempt_reclaim_work) &&
-+ need_preemptive_reclaim(fs_info, space_info)) {
- trace_btrfs_trigger_flush(fs_info, space_info->flags,
- orig_bytes, flush, "preempt");
- queue_work(system_unbound_wq,
---
-2.32.0
-
-
-From 76efa702dc8f1626c48eb0836205b7fb5b0ea94d Mon Sep 17 00:00:00 2001
-From: Josef Bacik <josef@toxicpanda.com>
-Date: Wed, 28 Apr 2021 13:38:43 -0400
-Subject: [PATCH 08/22] btrfs: only clamp the first time we have to start
- flushing
-
-We were clamping the threshold for preemptive reclaim any time we added
-a ticket to wait on, which if we have a lot of threads means we'd
-essentially max out the clamp the first time we start to flush. Instead
-of doing this, simply do it every time we have to start flushing, this
-will make us ramp up gradually instead of going to max clamping as soon
-as we start needing to do flushing.
-
-Signed-off-by: Josef Bacik <josef@toxicpanda.com>
----
- fs/btrfs/space-info.c | 17 +++++++++--------
- 1 file changed, 9 insertions(+), 8 deletions(-)
-
-diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
-index c9a5e003b..33edab17a 100644
---- a/fs/btrfs/space-info.c
-+++ b/fs/btrfs/space-info.c
-@@ -1561,6 +1561,15 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
- flush == BTRFS_RESERVE_FLUSH_DATA) {
- list_add_tail(&ticket.list, &space_info->tickets);
- if (!space_info->flush) {
-+ /*
-+ * We were forced to add a reserve ticket, so
-+ * our preemptive flushing is unable to keep
-+ * up. Clamp down on the threshold for the
-+ * preemptive flushing in order to keep up with
-+ * the workload.
-+ */
-+ maybe_clamp_preempt(fs_info, space_info);
-+
- space_info->flush = 1;
- trace_btrfs_trigger_flush(fs_info,
- space_info->flags,
-@@ -1572,14 +1581,6 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
- list_add_tail(&ticket.list,
- &space_info->priority_tickets);
- }
--
-- /*
-- * We were forced to add a reserve ticket, so our preemptive
-- * flushing is unable to keep up. Clamp down on the threshold
-- * for the preemptive flushing in order to keep up with the
-- * workload.
-- */
-- maybe_clamp_preempt(fs_info, space_info);
- } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
- used += orig_bytes;
- /*
---
-2.32.0
-
-
-From 9844f3f192822e44c70ee90722b704d785f4884e Mon Sep 17 00:00:00 2001
-From: Josef Bacik <josef@toxicpanda.com>
-Date: Wed, 28 Apr 2021 13:38:46 -0400
-Subject: [PATCH 09/22] btrfs: don't include the global rsv size in the
- preemptive used amount
-
-When deciding if we should preemptively flush space, we will add in the
-amount of space used by all block rsvs. However this also includes the
-global block rsv, which isn't flushable so shouldn't be accounted for in
-this calculation. If we decide to use ->bytes_may_use in our used
-calculation we need to subtract the global rsv size from this amount so
-it most closely matches the flushable space.
-
-Signed-off-by: Josef Bacik <josef@toxicpanda.com>
----
- fs/btrfs/space-info.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
-index 33edab17a..52e3bfedc 100644
---- a/fs/btrfs/space-info.c
-+++ b/fs/btrfs/space-info.c
-@@ -867,7 +867,7 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
- used += fs_info->delayed_refs_rsv.reserved +
- fs_info->delayed_block_rsv.reserved;
- else
-- used += space_info->bytes_may_use;
-+ used += space_info->bytes_may_use - global_rsv_size;
-
- return (used >= thresh && !btrfs_fs_closing(fs_info) &&
- !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
---
-2.32.0
-
-
-From 1d309a9a237acff83ae53ba25b79dc21e2454b3d Mon Sep 17 00:00:00 2001
-From: Josef Bacik <josef@toxicpanda.com>
-Date: Wed, 28 Apr 2021 13:38:44 -0400
-Subject: [PATCH 10/22] btrfs: take into account global rsv in
- need_preemptive_reclaim
-
-Global rsv can't be used for normal allocations, and for very full file
-systems we can decide to try and async flush constantly even though
-there's really not a lot of space to reclaim. Deal with this by
-including the global block rsv size in the "total used" calculation.
-
-[HH: small context fix for 5.10.x]
-Signed-off-by: Josef Bacik <josef@toxicpanda.com>
----
- fs/btrfs/space-info.c | 4 +++-
- 1 file changed, 3 insertions(+), 1 deletion(-)
-
-diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
-index 52e3bfedc..aeb1f0b7b 100644
---- a/fs/btrfs/space-info.c
-+++ b/fs/btrfs/space-info.c
-@@ -792,12 +792,14 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
- static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info)
- {
-+ u64 global_rsv_size = fs_info->global_block_rsv.reserved;
- u64 ordered, delalloc;
- u64 thresh = div_factor_fine(space_info->total_bytes, 98);
- u64 used;
-
- /* If we're just plain full then async reclaim just slows us down. */
-- if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
-+ if ((space_info->bytes_used + space_info->bytes_reserved +
-+ global_rsv_size) >= thresh)
- return false;
-
- /*
---
-2.32.0
-
-
-From fa148091ff63557e1d123194069b3d418d6129e4 Mon Sep 17 00:00:00 2001
-From: Josef Bacik <josef@toxicpanda.com>
-Date: Wed, 28 Apr 2021 13:38:45 -0400
-Subject: [PATCH 11/22] btrfs: use the global rsv size in the preemptive thresh
- calculation
-
-We calculate the amount of "free" space available for normal
-reservations by taking the total space and subtracting out the hard used
-space, which is readonly, used, and reserved space. However we weren't
-taking into account the global block rsv, which is essentially hard used
-space. Handle this by subtracting it from the available free space, so
-that our threshold more closely mirrors reality.
-
-[HH: small context fix for 5.10.x]
-Signed-off-by: Josef Bacik <josef@toxicpanda.com>
----
- fs/btrfs/space-info.c | 6 ++++--
- 1 file changed, 4 insertions(+), 2 deletions(-)
-
-diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
-index aeb1f0b7b..4e3857474 100644
---- a/fs/btrfs/space-info.c
-+++ b/fs/btrfs/space-info.c
-@@ -840,8 +840,10 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
-
- thresh = calc_available_free_space(fs_info, space_info,
- BTRFS_RESERVE_FLUSH_ALL);
-- thresh += (space_info->total_bytes - space_info->bytes_used -
-- space_info->bytes_reserved - space_info->bytes_readonly);
-+ used = space_info->bytes_used + space_info->bytes_reserved +
-+ space_info->bytes_readonly + global_rsv_size;
-+ if (used < space_info->total_bytes)
-+ thresh += space_info->total_bytes - used;
- thresh >>= space_info->clamp;
-
- used = space_info->bytes_pinned;
---
-2.32.0
-
-
-From 5270bca333d3c4f4bf045f224f6b94fe12528086 Mon Sep 17 00:00:00 2001
-From: Josef Bacik <josef@toxicpanda.com>
-Date: Wed, 28 Apr 2021 13:38:47 -0400
-Subject: [PATCH 12/22] btrfs: only ignore delalloc if delalloc is much smaller
- than ordered
-
-While testing heavy delalloc workloads I noticed that sometimes we'd
-just stop preemptively flushing when we had loads of delalloc available
-to flush. This is because we skip preemptive flushing if delalloc <=
-ordered. However if we start with say 4gib of delalloc, and we flush
-2gib of that, we'll stop flushing there, when we still have 2gib of
-delalloc to flush.
-
-Instead adjust the ordered bytes down by half, this way if 2/3 of our
-outstanding delalloc reservations are tied up by ordered extents we
-don't bother preemptive flushing, as we're getting close to the state
-where we need to wait on ordered extents.
-
-Signed-off-by: Josef Bacik <josef@toxicpanda.com>
----
- fs/btrfs/space-info.c | 8 +++++++-
- 1 file changed, 7 insertions(+), 1 deletion(-)
-
-diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
-index 4e3857474..cf09b23f3 100644
---- a/fs/btrfs/space-info.c
-+++ b/fs/btrfs/space-info.c
-@@ -864,8 +864,14 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
- * clearly be heavy enough to warrant preemptive flushing. In the case
- * of heavy DIO or ordered reservations, preemptive flushing will just
- * waste time and cause us to slow down.
-+ *
-+ * We want to make sure we truly are maxed out on ordered however, so
-+ * cut ordered in half, and if it's still higher than delalloc then we
-+ * can keep flushing. This is to avoid the case where we start
-+ * flushing, and now delalloc == ordered and we stop preemptively
-+ * flushing when we could still have several gigs of delalloc to flush.
- */
-- ordered = percpu_counter_read_positive(&fs_info->ordered_bytes);
-+ ordered = percpu_counter_read_positive(&fs_info->ordered_bytes) >> 1;
- delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes);
- if (ordered >= delalloc)
- used += fs_info->delayed_refs_rsv.reserved +
---
-2.32.0
-
-
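A quick way to replay the scenario from the patch above is plain shell arithmetic (a sketch only; the 4 GiB / 2 GiB figures come from the commit message, the variable names are invented here):

  # 4 GiB of delalloc to start with; 2 GiB has been flushed and is now ordered.
  delalloc=$((2 * 1024 * 1024 * 1024))   # delalloc still outstanding
  ordered=$((2 * 1024 * 1024 * 1024))    # bytes already turned into ordered extents
  # Old check: stop preemptive flushing as soon as ordered >= delalloc.
  [ "$ordered" -ge "$delalloc" ] && echo "old check: stop flushing"
  # New check: only stop once half of ordered still exceeds delalloc.
  [ $((ordered / 2)) -ge "$delalloc" ] || echo "new check: keep flushing"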
-From 7713c31e0c5896f22358551aff967cf9f7dfe91c Mon Sep 17 00:00:00 2001
-From: Josef Bacik <josef@toxicpanda.com>
-Date: Wed, 28 Apr 2021 13:38:48 -0400
-Subject: [PATCH 13/22] btrfs: handle preemptive delalloc flushing slightly
- differently
-
-If we decide to flush delalloc from the preemptive flusher, we really do
-not want to wait on ordered extents, as it gains us nothing. However
-there was logic to go ahead and wait on ordered extents if there was
-more ordered bytes than delalloc bytes. We do not want this behavior,
-so pass through whether this flushing is for preemption, and do not wait
-for ordered extents if that's the case. Also break out of the shrink
-loop after the first flushing, as we just want to one shot shrink
-delalloc.
-
-Signed-off-by: Josef Bacik <josef@toxicpanda.com>
----
- fs/btrfs/space-info.c | 15 ++++++++++++---
- 1 file changed, 12 insertions(+), 3 deletions(-)
-
-diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
-index cf09b23f3..b2d834b92 100644
---- a/fs/btrfs/space-info.c
-+++ b/fs/btrfs/space-info.c
-@@ -495,7 +495,8 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
- */
- static void shrink_delalloc(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
-- u64 to_reclaim, bool wait_ordered)
-+ u64 to_reclaim, bool wait_ordered,
-+ bool for_preempt)
- {
- struct btrfs_trans_handle *trans;
- u64 delalloc_bytes;
-@@ -532,7 +533,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
- * ordered extents, otherwise we'll waste time trying to flush delalloc
- * that likely won't give us the space back we need.
- */
-- if (ordered_bytes > delalloc_bytes)
-+ if (ordered_bytes > delalloc_bytes && !for_preempt)
- wait_ordered = true;
-
- loops = 0;
-@@ -551,6 +552,14 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
- break;
- }
-
-+ /*
-+ * If we are for preemption we just want a one-shot of delalloc
-+ * flushing so we can stop flushing if we decide we don't need
-+ * to anymore.
-+ */
-+ if (for_preempt)
-+ break;
-+
- spin_lock(&space_info->lock);
- if (list_empty(&space_info->tickets) &&
- list_empty(&space_info->priority_tickets)) {
-@@ -702,7 +711,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
- case FLUSH_DELALLOC:
- case FLUSH_DELALLOC_WAIT:
- shrink_delalloc(fs_info, space_info, num_bytes,
-- state == FLUSH_DELALLOC_WAIT);
-+ state == FLUSH_DELALLOC_WAIT, for_preempt);
- break;
- case FLUSH_DELAYED_REFS_NR:
- case FLUSH_DELAYED_REFS:
---
-2.32.0
-
-
-From b1fa125daba80334e1efdc50e5a2e70f8585e755 Mon Sep 17 00:00:00 2001
-From: David Sterba <dsterba@suse.com>
-Date: Tue, 18 May 2021 16:49:35 +0200
-Subject: [PATCH 14/22] btrfs: scrub: per-device bandwidth control
-
-Add sysfs interface to limit io during scrub. We relied on the ionice
-interface to do that, eg. the idle class let the system usable while
-scrub was running. This has changed when mq-deadline got widespread and
-did not implement the scheduling classes. That was a CFQ thing that got
-deleted. We've got numerous complaints from users about degraded
-performance.
-
-Currently only BFQ supports that but it's not a common scheduler and we
-can't ask everybody to switch to it.
-
-Alternatively the cgroup io limiting can be used but that is also a
-non-trivial setup (v2 required, the controller must be enabled on the
-system). This can still be used if desired.
-
-Other ideas that have been explored: piggy-back on ionice (that is set
-per-process and is accessible) and interpret the class and classdata as
-bandwidth limits, but this does not have enough flexibility as there are
-only 8 allowed and we'd have to map fixed limits to each value. Also
-adjusting the value would need to lookup the process that currently runs
-scrub on the given device, and the value is not sticky so would have to
-be adjusted each time scrub runs.
-
-Running out of options, sysfs does not look that bad:
-
-- it's accessible from scripts, or udev rules
-- the name is similar to what MD-RAID has
- (/proc/sys/dev/raid/speed_limit_max or /sys/block/mdX/md/sync_speed_max)
-- the value is sticky at least for filesystem mount time
-- adjusting the value has immediate effect
-- sysfs is available in constrained environments (eg. system rescue)
-- the limit also applies to device replace
-
-Sysfs:
-
-- raw value is in bytes
-- values written to the file accept suffixes like K, M
-- file is in the per-device directory /sys/fs/btrfs/FSID/devinfo/DEVID/scrub_speed_max
-- 0 means use default priority of IO
-
-The scheduler is a simple deadline one and the accuracy is up to nearest
-128K.
-
-[HH: trivial context fix in hunk #1 for 5.10.x]
-Signed-off-by: David Sterba <dsterba@suse.com>
----
- fs/btrfs/scrub.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++
- fs/btrfs/sysfs.c | 28 +++++++++++++++++++++
- fs/btrfs/volumes.h | 3 +++
- 3 files changed, 92 insertions(+)
-
-diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
-index b9202a1f1..adc8cf404 100644
---- a/fs/btrfs/scrub.c
-+++ b/fs/btrfs/scrub.c
-@@ -165,6 +165,10 @@ struct scrub_ctx {
- int readonly;
- int pages_per_rd_bio;
-
-+ /* State of IO submission throttling affecting the associated device */
-+ ktime_t throttle_deadline;
-+ u64 throttle_sent;
-+
- int is_dev_replace;
- u64 write_pointer;
-
-@@ -613,6 +617,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
- spin_lock_init(&sctx->list_lock);
- spin_lock_init(&sctx->stat_lock);
- init_waitqueue_head(&sctx->list_wait);
-+ sctx->throttle_deadline = 0;
-
- WARN_ON(sctx->wr_curr_bio != NULL);
- mutex_init(&sctx->wr_lock);
-@@ -1996,6 +2001,60 @@ static void scrub_page_put(struct scrub_page *spage)
- }
- }
-
-+/*
-+ * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
-+ * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max.
-+ */
-+static void scrub_throttle(struct scrub_ctx *sctx)
-+{
-+ const int time_slice = 1000;
-+ struct scrub_bio *sbio;
-+ struct btrfs_device *device;
-+ s64 delta;
-+ ktime_t now;
-+ u32 div;
-+ u64 bwlimit;
-+
-+ sbio = sctx->bios[sctx->curr];
-+ device = sbio->dev;
-+ bwlimit = READ_ONCE(device->scrub_speed_max);
-+ if (bwlimit == 0)
-+ return;
-+
-+ /*
-+ * Slice is divided into intervals when the IO is submitted, adjust by
-+ * bwlimit and maximum of 64 intervals.
-+ */
-+ div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
-+ div = min_t(u32, 64, div);
-+
-+ /* Start new epoch, set deadline */
-+ now = ktime_get();
-+ if (sctx->throttle_deadline == 0) {
-+ sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
-+ sctx->throttle_sent = 0;
-+ }
-+
-+ /* Still in the time to send? */
-+ if (ktime_before(now, sctx->throttle_deadline)) {
-+ /* If current bio is within the limit, send it */
-+ sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
-+ if (sctx->throttle_sent <= bwlimit / div)
-+ return;
-+
-+ /* We're over the limit, sleep until the rest of the slice */
-+ delta = ktime_ms_delta(sctx->throttle_deadline, now);
-+ } else {
-+ /* New request after deadline, start new epoch */
-+ delta = 0;
-+ }
-+
-+ if (delta)
-+ schedule_timeout_interruptible(delta * HZ / 1000);
-+ /* Next call will start the deadline period */
-+ sctx->throttle_deadline = 0;
-+}
-+
- static void scrub_submit(struct scrub_ctx *sctx)
- {
- struct scrub_bio *sbio;
-@@ -2003,6 +2062,8 @@ static void scrub_submit(struct scrub_ctx *sctx)
- if (sctx->curr == -1)
- return;
-
-+ scrub_throttle(sctx);
-+
- sbio = sctx->bios[sctx->curr];
- sctx->curr = -1;
- scrub_pending_bio_inc(sctx);
-diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
-index 9372ef191..9dda3feda 100644
---- a/fs/btrfs/sysfs.c
-+++ b/fs/btrfs/sysfs.c
-@@ -1469,6 +1469,33 @@ static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj,
- }
- BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show);
-
-+static ssize_t btrfs_devinfo_scrub_speed_max_show(struct kobject *kobj,
-+ struct kobj_attribute *a,
-+ char *buf)
-+{
-+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
-+ devid_kobj);
-+
-+ return scnprintf(buf, PAGE_SIZE, "%llu\n",
-+ READ_ONCE(device->scrub_speed_max));
-+}
-+
-+static ssize_t btrfs_devinfo_scrub_speed_max_store(struct kobject *kobj,
-+ struct kobj_attribute *a,
-+ const char *buf, size_t len)
-+{
-+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
-+ devid_kobj);
-+ char *endptr;
-+ unsigned long long limit;
-+
-+ limit = memparse(buf, &endptr);
-+ WRITE_ONCE(device->scrub_speed_max, limit);
-+ return len;
-+}
-+BTRFS_ATTR_RW(devid, scrub_speed_max, btrfs_devinfo_scrub_speed_max_show,
-+ btrfs_devinfo_scrub_speed_max_store);
-+
- static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj,
- struct kobj_attribute *a, char *buf)
- {
-@@ -1486,6 +1513,7 @@ static struct attribute *devid_attrs[] = {
- BTRFS_ATTR_PTR(devid, in_fs_metadata),
- BTRFS_ATTR_PTR(devid, missing),
- BTRFS_ATTR_PTR(devid, replace_target),
-+ BTRFS_ATTR_PTR(devid, scrub_speed_max),
- BTRFS_ATTR_PTR(devid, writeable),
- NULL
- };
-diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
-index d4c3e0dd3..be7932d9b 100644
---- a/fs/btrfs/volumes.h
-+++ b/fs/btrfs/volumes.h
-@@ -143,6 +143,9 @@ struct btrfs_device {
- struct completion kobj_unregister;
- /* For sysfs/FSID/devinfo/devid/ */
- struct kobject devid_kobj;
-+
-+ /* Bandwidth limit for scrub, in bytes */
-+ u64 scrub_speed_max;
- };
-
- /*
---
-2.32.0
-
-
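The knob added by the scrub bandwidth patch above is plain sysfs, so it can be driven from a shell or a udev rule. A minimal sketch, assuming a mounted filesystem whose UUID is FSID and whose device id is 1 (both placeholders):

  # Cap scrub/replace IO on devid 1 at roughly 100 MiB/s; K and M suffixes are accepted.
  echo 100M > /sys/fs/btrfs/FSID/devinfo/1/scrub_speed_max
  # Read the current limit back; the raw value is reported in bytes.
  cat /sys/fs/btrfs/FSID/devinfo/1/scrub_speed_max
  # A value of 0 restores the default behaviour (no throttling).
  echo 0 > /sys/fs/btrfs/FSID/devinfo/1/scrub_speed_max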
-From 69dcc9b1b2ab6c0f2be5fd2020651ddb63e00f9c Mon Sep 17 00:00:00 2001
-From: Josef Bacik <josef@toxicpanda.com>
-Date: Wed, 19 May 2021 11:29:03 -0400
-Subject: [PATCH 15/22] btrfs: abort the transaction if we fail to replay log
- trees
-
-During inspection of the return path for replay I noticed that we don't
-actually abort the transaction if we get a failure during replay. This
-isn't a problem necessarily, as we properly return the error and will
-fail to mount. However we still leave this dangling transaction that
-could conceivably be committed without thinking there was an error.
-Handle this by making sure we abort the transaction on error to
-safeguard us from any problems in the future.
-
-Signed-off-by: Josef Bacik <josef@toxicpanda.com>
----
- fs/btrfs/tree-log.c | 4 +++-
- 1 file changed, 3 insertions(+), 1 deletion(-)
-
-diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
-index 276b5511f..f9332bb84 100644
---- a/fs/btrfs/tree-log.c
-+++ b/fs/btrfs/tree-log.c
-@@ -6363,8 +6363,10 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
-
- return 0;
- error:
-- if (wc.trans)
-+ if (wc.trans) {
-+ btrfs_abort_transaction(wc.trans, ret);
- btrfs_end_transaction(wc.trans);
-+ }
- btrfs_free_path(path);
- return ret;
- }
---
-2.32.0
-
-
-From df6c67b60e03836e5deabb8d8ea76cfe5f4c5886 Mon Sep 17 00:00:00 2001
-From: Josef Bacik <josef@toxicpanda.com>
-Date: Wed, 19 May 2021 11:45:16 -0400
-Subject: [PATCH 16/22] btrfs: do not infinite loop in data reclaim if we
- aborted
-
-Error injection stressing uncovered a busy loop in our data reclaim
-loop. There are two cases here, one where we loop creating block groups
-until space_info->full is set, or in the main loop we will skip erroring
-out any tickets if space_info->full == 0. Unfortunately if we aborted
-the transaction then we will never allocate chunks or reclaim any space
-and thus never get ->full, and you'll see stack traces like this
-
-watchdog: BUG: soft lockup - CPU#0 stuck for 26s! [kworker/u4:4:139]
-CPU: 0 PID: 139 Comm: kworker/u4:4 Tainted: G W 5.13.0-rc1+ #328
-Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014
-Workqueue: events_unbound btrfs_async_reclaim_data_space
-RIP: 0010:btrfs_join_transaction+0x12/0x20
-RSP: 0018:ffffb2b780b77de0 EFLAGS: 00000246
-RAX: ffffb2b781863d58 RBX: 0000000000000000 RCX: 0000000000000000
-RDX: 0000000000000801 RSI: ffff987952b57400 RDI: ffff987940aa3000
-RBP: ffff987954d55000 R08: 0000000000000001 R09: ffff98795539e8f0
-R10: 000000000000000f R11: 000000000000000f R12: ffffffffffffffff
-R13: ffff987952b574c8 R14: ffff987952b57400 R15: 0000000000000008
-FS: 0000000000000000(0000) GS:ffff9879bbc00000(0000) knlGS:0000000000000000
-CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
-CR2: 00007f0703da4000 CR3: 0000000113398004 CR4: 0000000000370ef0
-Call Trace:
- flush_space+0x4a8/0x660
- btrfs_async_reclaim_data_space+0x55/0x130
- process_one_work+0x1e9/0x380
- worker_thread+0x53/0x3e0
- ? process_one_work+0x380/0x380
- kthread+0x118/0x140
- ? __kthread_bind_mask+0x60/0x60
- ret_from_fork+0x1f/0x30
-
-Fix this by checking to see if we have BTRFS_FS_STATE_TRANS_ABORTED in
-either of the reclaim loops, and if so fail the tickets and bail. In
-addition to this, fix maybe_fail_all_tickets() to not try to grant
-tickets if we've aborted, simply fail everything.
-
-Signed-off-by: Josef Bacik <josef@toxicpanda.com>
----
- fs/btrfs/space-info.c | 35 ++++++++++++++++++++++++++++++-----
- 1 file changed, 30 insertions(+), 5 deletions(-)
-
-diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
-index b2d834b92..208f47e60 100644
---- a/fs/btrfs/space-info.c
-+++ b/fs/btrfs/space-info.c
-@@ -941,6 +941,8 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
- struct reserve_ticket *ticket;
- u64 tickets_id = space_info->tickets_id;
- u64 first_ticket_bytes = 0;
-+ bool aborted = test_bit(BTRFS_FS_STATE_TRANS_ABORTED,
-+ &fs_info->fs_state);
-
- if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
- btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
-@@ -952,7 +954,7 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
- ticket = list_first_entry(&space_info->tickets,
- struct reserve_ticket, list);
-
-- if (ticket->steal &&
-+ if (!aborted && ticket->steal &&
- steal_from_global_rsv(fs_info, space_info, ticket))
- return true;
-
-@@ -968,15 +970,18 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
- */
- if (first_ticket_bytes == 0)
- first_ticket_bytes = ticket->bytes;
-- else if (first_ticket_bytes > ticket->bytes)
-+ else if (!aborted && first_ticket_bytes > ticket->bytes)
- return true;
-
-- if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
-+ if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG))
- btrfs_info(fs_info, "failing ticket with %llu bytes",
- ticket->bytes);
-
- remove_ticket(space_info, ticket);
-- ticket->error = -ENOSPC;
-+ if (aborted)
-+ ticket->error = -EIO;
-+ else
-+ ticket->error = -ENOSPC;
- wake_up(&ticket->wait);
-
- /*
-@@ -985,7 +990,8 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
- * here to see if we can make progress with the next ticket in
- * the list.
- */
-- btrfs_try_granting_tickets(fs_info, space_info);
-+ if (!aborted)
-+ btrfs_try_granting_tickets(fs_info, space_info);
- }
- return (tickets_id != space_info->tickets_id);
- }
-@@ -1253,6 +1259,15 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
- spin_unlock(&space_info->lock);
- return;
- }
-+
-+ /* Something happened, fail everything and bail. */
-+ if (test_bit(BTRFS_FS_STATE_TRANS_ABORTED,
-+ &fs_info->fs_state)) {
-+ maybe_fail_all_tickets(fs_info, space_info);
-+ space_info->flush = 0;
-+ spin_unlock(&space_info->lock);
-+ return;
-+ }
- last_tickets_id = space_info->tickets_id;
- spin_unlock(&space_info->lock);
- }
-@@ -1283,6 +1298,16 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
- } else {
- flush_state = 0;
- }
-+
-+ /* Something happened, fail everything and bail. */
-+ if (test_bit(BTRFS_FS_STATE_TRANS_ABORTED,
-+ &fs_info->fs_state)) {
-+ maybe_fail_all_tickets(fs_info, space_info);
-+ space_info->flush = 0;
-+ spin_unlock(&space_info->lock);
-+ return;
-+ }
-+
- }
- spin_unlock(&space_info->lock);
- }
---
-2.32.0
-
-
-From c7c8879b24c228034f942521af8011a600b273ed Mon Sep 17 00:00:00 2001
-From: Josef Bacik <josef@toxicpanda.com>
-Date: Fri, 12 Mar 2021 15:25:05 -0500
-Subject: [PATCH 17/22] btrfs: handle btrfs_record_root_in_trans failure in
- btrfs_recover_log_trees
-
-btrfs_record_root_in_trans will return errors in the future, so handle
-the error properly in btrfs_recover_log_trees.
-
-This appears tricky, however we have a reference count on the
-destination root, so if this fails we need to continue on in the loop to
-make sure the proper cleanup is done.
-
-Reviewed-by: Qu Wenruo <wqu@suse.com>
-Signed-off-by: Josef Bacik <josef@toxicpanda.com>
-Reviewed-by: David Sterba <dsterba@suse.com>
-[ add comment ]
-Signed-off-by: David Sterba <dsterba@suse.com>
----
- fs/btrfs/tree-log.c | 9 +++++++--
- 1 file changed, 7 insertions(+), 2 deletions(-)
-
-diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
-index f9332bb84..1e3cfc935 100644
---- a/fs/btrfs/tree-log.c
-+++ b/fs/btrfs/tree-log.c
-@@ -6300,8 +6300,13 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
- }
-
- wc.replay_dest->log_root = log;
-- btrfs_record_root_in_trans(trans, wc.replay_dest);
-- ret = walk_log_tree(trans, log, &wc);
-+ ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
-+ if (ret)
-+ /* The loop needs to continue due to the root refs */
-+ btrfs_handle_fs_error(fs_info, ret,
-+ "failed to record the log root in transaction");
-+ else
-+ ret = walk_log_tree(trans, log, &wc);
-
- if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
- ret = fixup_inode_link_counts(trans, wc.replay_dest,
---
-2.32.0
-
-
-From c2a7ee7bf274fa4b307dc78e36ac32fde7ad9e91 Mon Sep 17 00:00:00 2001
-From: Josef Bacik <josef@toxicpanda.com>
-Date: Thu, 20 May 2021 14:46:01 +0000
-Subject: [PATCH 18/22] btrfs: change handle_fs_error in recover_log_trees to
- aborts
-
-During inspection of the return path for replay I noticed that we don't
-actually abort the transaction if we get a failure during replay. This
-isn't a problem necessarily, as we properly return the error and will
-fail to mount. However we still leave this dangling transaction that
-could conceivably be committed without thinking there was an error.
-We were using btrfs_handle_fs_error() here, but that pre-dates the
-transaction abort code. Simply replace the btrfs_handle_fs_error()
-calls with transaction aborts, so we still know where exactly things
-went wrong, and add a few in some other un-handled error cases.
-
-Signed-off-by: Josef Bacik <josef@toxicpanda.com>
----
- fs/btrfs/tree-log.c | 16 ++++++++--------
- 1 file changed, 8 insertions(+), 8 deletions(-)
-
-diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
-index 1e3cfc935..876445337 100644
---- a/fs/btrfs/tree-log.c
-+++ b/fs/btrfs/tree-log.c
-@@ -6247,8 +6247,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
- ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
-
- if (ret < 0) {
-- btrfs_handle_fs_error(fs_info, ret,
-- "Couldn't find tree log root.");
-+ btrfs_abort_transaction(trans, ret);
- goto error;
- }
- if (ret > 0) {
-@@ -6265,8 +6264,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
- log = btrfs_read_tree_root(log_root_tree, &found_key);
- if (IS_ERR(log)) {
- ret = PTR_ERR(log);
-- btrfs_handle_fs_error(fs_info, ret,
-- "Couldn't read tree log root.");
-+ btrfs_abort_transaction(trans, ret);
- goto error;
- }
-
-@@ -6294,8 +6292,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
-
- if (!ret)
- goto next;
-- btrfs_handle_fs_error(fs_info, ret,
-- "Couldn't read target root for tree log recovery.");
-+ btrfs_abort_transaction(trans, ret);
- goto error;
- }
-
-@@ -6303,14 +6300,15 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
- ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
- if (ret)
- /* The loop needs to continue due to the root refs */
-- btrfs_handle_fs_error(fs_info, ret,
-- "failed to record the log root in transaction");
-+ btrfs_abort_transaction(trans, ret);
- else
- ret = walk_log_tree(trans, log, &wc);
-
- if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
- ret = fixup_inode_link_counts(trans, wc.replay_dest,
- path);
-+ if (ret)
-+ btrfs_abort_transaction(trans, ret);
- }
-
- if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
-@@ -6327,6 +6325,8 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
- * could only happen during mount.
- */
- ret = btrfs_init_root_free_objectid(root);
-+ if (ret)
-+ btrfs_abort_transaction(trans, ret);
- }
-
- wc.replay_dest->log_root = NULL;
---
-2.32.0
-
-
-From 03c9f21055825cd463b214cf8341e9c4907525f0 Mon Sep 17 00:00:00 2001
-From: Filipe Manana <fdmanana@suse.com>
-Date: Fri, 28 May 2021 11:37:32 +0100
-Subject: [PATCH 19/22] btrfs: avoid unnecessary logging of xattrs during fast
- fsyncs
-
-When logging an inode we always log all its xattrs, so that we are able
-to figure out which ones should be deleted during log replay. However this
-is unnecessary when we are doing a fast fsync and no xattrs were added,
-changed or deleted since the last time we logged the inode in the current
-transaction.
-
-So skip the logging of xattrs when the inode was previously logged in the
-current transaction and no xattrs were added, changed or deleted. If any
-changes to xattrs happened, then the inode has BTRFS_INODE_COPY_EVERYTHING
-set in its runtime flags and the xattrs get logged. This saves time on
-scanning for xattrs, allocating memory, COWing log tree extent buffers and
-adding more lock contention on the extent buffers when there are multiple
-tasks logging in parallel.
-
-The use of xattrs is common when using ACLs, some applications, or when
-using security modules like SELinux where every inode gets a security
-xattr added to it.
-
-The following test script, using fio, was used on a box with 12 cores, 64G
-of RAM, a NVMe device and the default non-debug kernel config from Debian.
-It uses 8 concurrent jobs each writing in blocks of 64K to its own 4G file,
-each file with a single xattr of 50 bytes (about the same size for an ACL
-or SELinux xattr), doing random buffered writes with an fsync after each
-write.
-
- $ cat test.sh
- #!/bin/bash
-
- DEV=/dev/nvme0n1
- MNT=/mnt/test
- MOUNT_OPTIONS="-o ssd"
- MKFS_OPTIONS="-d single -m single"
-
- NUM_JOBS=8
- FILE_SIZE=4G
-
- cat <<EOF > /tmp/fio-job.ini
- [writers]
- rw=randwrite
- fsync=1
- fallocate=none
- group_reporting=1
- direct=0
- bs=64K
- ioengine=sync
- size=$FILE_SIZE
- directory=$MNT
- numjobs=$NUM_JOBS
- EOF
-
- echo "performance" | \
- tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
-
- mkfs.btrfs -f $MKFS_OPTIONS $DEV > /dev/null
- mount $MOUNT_OPTIONS $DEV $MNT
-
- echo "Creating files before fio runs, each with 1 xattr of 50 bytes"
- for ((i = 0; i < $NUM_JOBS; i++)); do
- path="$MNT/writers.$i.0"
- truncate -s $FILE_SIZE $path
- setfattr -n user.xa1 -v $(printf '%0.sX' $(seq 50)) $path
- done
-
- fio /tmp/fio-job.ini
- umount $MNT
-
-fio output before this change:
-
-WRITE: bw=120MiB/s (126MB/s), 120MiB/s-120MiB/s (126MB/s-126MB/s), io=32.0GiB (34.4GB), run=272145-272145msec
-
-fio output after this change:
-
-WRITE: bw=142MiB/s (149MB/s), 142MiB/s-142MiB/s (149MB/s-149MB/s), io=32.0GiB (34.4GB), run=230408-230408msec
-
-+16.8% throughput, -16.6% runtime
-
-Signed-off-by: Filipe Manana <fdmanana@suse.com>
----
- fs/btrfs/tree-log.c | 16 +++++++++++++---
- 1 file changed, 13 insertions(+), 3 deletions(-)
-
-diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
-index 876445337..fb4704ed9 100644
---- a/fs/btrfs/tree-log.c
-+++ b/fs/btrfs/tree-log.c
-@@ -5465,13 +5465,23 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
- btrfs_release_path(dst_path);
- if (need_log_inode_item) {
- err = log_inode_item(trans, log, dst_path, inode);
-- if (!err && !xattrs_logged) {
-+ if (err)
-+ goto out_unlock;
-+ /*
-+ * If we are doing a fast fsync and the inode was logged before
-+ * in this transaction, we don't need to log the xattrs because
-+ * they were logged before. If xattrs were added, changed or
-+ * deleted since the last time we logged the inode, then we have
-+ * already logged them because the inode had the runtime flag
-+ * BTRFS_INODE_COPY_EVERYTHING set.
-+ */
-+ if (!xattrs_logged && inode->logged_trans < trans->transid) {
- err = btrfs_log_all_xattrs(trans, root, inode, path,
- dst_path);
-+ if (err)
-+ goto out_unlock;
- btrfs_release_path(path);
- }
-- if (err)
-- goto out_unlock;
- }
- if (fast_search) {
- ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
---
-2.32.0
-
-
-From 5615df097f5c4d07edf14bb40292f26043b405f5 Mon Sep 17 00:00:00 2001
-From: Josef Bacik <josef@toxicpanda.com>
-Date: Tue, 1 Jun 2021 15:45:08 -0400
-Subject: [PATCH 20/22] btrfs: handle shrink_delalloc pages calculation
- differently
-
-We have been hitting some early ENOSPC issues in production with more
-recent kernels, and I tracked it down to us simply not flushing delalloc
-as aggressively as we should be. With tracing I was seeing us failing
-all tickets with all of the block rsvs at or around 0, with very little
-pinned space, but still around 120mib of outstanding bytes_may_used.
-Upon further investigation I saw that we were flushing around 14 pages
-per shrink call for delalloc, despite having around 2gib of delalloc
-outstanding.
-
-Consider the example of an 8-way machine, all CPUs trying to create a
-file in parallel, which at the time of this commit requires 5 items to
-do. Assuming a 16k leaf size, we have 10mib of total metadata reclaim
-size waiting on reservations. Now assume we have 128mib of delalloc
-outstanding. With our current math we would set items to 20, and then
-set to_reclaim to 20 * 256k, or 5mib.
-
-Assuming that we went through this loop all 3 times, for both
-FLUSH_DELALLOC and FLUSH_DELALLOC_WAIT, and then did the full loop
-twice, we'd only flush 60mib of the 128mib delalloc space. This could
-leave a fair bit of delalloc reservations still hanging around by the
-time we go to ENOSPC out all the remaining tickets.
-
-Fix this two ways. First, change the calculations to be a fraction of
-the total delalloc bytes on the system. Prior to my change we were
-calculating based on dirty inodes so our math made more sense, now it's
-just completely unrelated to what we're actually doing.
-
-Second add a FLUSH_DELALLOC_FULL state, that we hold off until we've
-gone through the flush states at least once. This will empty the system
-of all delalloc so we're sure to be truly out of space when we start
-failing tickets.
-
-I'm tagging stable 5.10 and forward, because this is where we started
-using the page stuff heavily again. This affects earlier kernel
-versions as well, but would be a pain to backport to them as the
-flushing mechanisms aren't the same.
-
-CC: stable@vger.kernel.org # 5.10
-Signed-off-by: Josef Bacik <josef@toxicpanda.com>
----
- fs/btrfs/ctree.h | 11 ++++++-----
- fs/btrfs/space-info.c | 36 +++++++++++++++++++++++++++---------
- include/trace/events/btrfs.h | 1 +
- 3 files changed, 34 insertions(+), 14 deletions(-)
-
-diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
-index 12921830e..75ba87dcc 100644
---- a/fs/btrfs/ctree.h
-+++ b/fs/btrfs/ctree.h
-@@ -2746,11 +2746,12 @@ enum btrfs_flush_state {
- FLUSH_DELAYED_REFS = 4,
- FLUSH_DELALLOC = 5,
- FLUSH_DELALLOC_WAIT = 6,
-- ALLOC_CHUNK = 7,
-- ALLOC_CHUNK_FORCE = 8,
-- RUN_DELAYED_IPUTS = 9,
-- COMMIT_TRANS = 10,
-- FORCE_COMMIT_TRANS = 11,
-+ FLUSH_DELALLOC_FULL = 7,
-+ ALLOC_CHUNK = 8,
-+ ALLOC_CHUNK_FORCE = 9,
-+ RUN_DELAYED_IPUTS = 10,
-+ COMMIT_TRANS = 11,
-+ FORCE_COMMIT_TRANS = 12,
- };
-
- int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
-diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
-index 208f47e60..88bac64d5 100644
---- a/fs/btrfs/space-info.c
-+++ b/fs/btrfs/space-info.c
-@@ -505,6 +505,10 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
- long time_left;
- int loops;
-
-+ delalloc_bytes = percpu_counter_sum_positive(
-+ &fs_info->delalloc_bytes);
-+ ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes);
-+
- /* Calc the number of the pages we need flush for space reservation */
- if (to_reclaim == U64_MAX) {
- items = U64_MAX;
-@@ -512,19 +516,21 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
- /*
- * to_reclaim is set to however much metadata we need to
- * reclaim, but reclaiming that much data doesn't really track
-- * exactly, so increase the amount to reclaim by 2x in order to
-- * make sure we're flushing enough delalloc to hopefully reclaim
-- * some metadata reservations.
-+ * exactly. What we really want to do is reclaim full inode's
-+ * worth of reservations, however that's not available to us
-+ * here. We will take a fraction of the delalloc bytes for our
-+ * flushing loops and hope for the best. Delalloc will expand
-+ * the amount we write to cover an entire dirty extent, which
-+ * will reclaim the metadata reservation for that range. If
-+ * it's not enough subsequent flush stages will be more
-+ * aggressive.
- */
-+ to_reclaim = max(to_reclaim, delalloc_bytes >> 3);
- items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2;
-- to_reclaim = items * EXTENT_SIZE_PER_ITEM;
- }
-
- trans = (struct btrfs_trans_handle *)current->journal_info;
-
-- delalloc_bytes = percpu_counter_sum_positive(
-- &fs_info->delalloc_bytes);
-- ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes);
- if (delalloc_bytes == 0 && ordered_bytes == 0)
- return;
-
-@@ -710,8 +716,11 @@ static void flush_space(struct btrfs_fs_info *fs_info,
- break;
- case FLUSH_DELALLOC:
- case FLUSH_DELALLOC_WAIT:
-+ case FLUSH_DELALLOC_FULL:
-+ if (state == FLUSH_DELALLOC_FULL)
-+ num_bytes = U64_MAX;
- shrink_delalloc(fs_info, space_info, num_bytes,
-- state == FLUSH_DELALLOC_WAIT, for_preempt);
-+ state != FLUSH_DELALLOC, for_preempt);
- break;
- case FLUSH_DELAYED_REFS_NR:
- case FLUSH_DELAYED_REFS:
-@@ -1043,6 +1052,14 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
- commit_cycles--;
- }
-
-+ /*
-+ * We do not want to empty the system of delalloc unless we're
-+ * under heavy pressure, so allow one trip through the flushing
-+ * logic before we start doing a FLUSH_DELALLOC_FULL.
-+ */
-+ if (flush_state == FLUSH_DELALLOC_FULL && !commit_cycles)
-+ flush_state++;
-+
- /*
- * We don't want to force a chunk allocation until we've tried
- * pretty hard to reclaim space. Think of the case where we
-@@ -1225,7 +1242,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
- * so if we now have space to allocate do the force chunk allocation.
- */
- static const enum btrfs_flush_state data_flush_states[] = {
-- FLUSH_DELALLOC_WAIT,
-+ FLUSH_DELALLOC_FULL,
- RUN_DELAYED_IPUTS,
- FLUSH_DELAYED_REFS,
- COMMIT_TRANS,
-@@ -1334,6 +1351,7 @@ static const enum btrfs_flush_state evict_flush_states[] = {
- FLUSH_DELAYED_REFS,
- FLUSH_DELALLOC,
- FLUSH_DELALLOC_WAIT,
-+ FLUSH_DELALLOC_FULL,
- ALLOC_CHUNK,
- COMMIT_TRANS,
- };
-diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
-index 0551ea653..7cda6c3d7 100644
---- a/include/trace/events/btrfs.h
-+++ b/include/trace/events/btrfs.h
-@@ -94,6 +94,7 @@ struct btrfs_space_info;
- EM( FLUSH_DELAYED_ITEMS, "FLUSH_DELAYED_ITEMS") \
- EM( FLUSH_DELALLOC, "FLUSH_DELALLOC") \
- EM( FLUSH_DELALLOC_WAIT, "FLUSH_DELALLOC_WAIT") \
-+ EM( FLUSH_DELALLOC_FULL, "FLUSH_DELALLOC_FULL") \
- EM( FLUSH_DELAYED_REFS_NR, "FLUSH_DELAYED_REFS_NR") \
- EM( FLUSH_DELAYED_REFS, "FLUSH_ELAYED_REFS") \
- EM( ALLOC_CHUNK, "ALLOC_CHUNK") \
---
-2.32.0
-
-
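The arithmetic in the changelog above can be checked with shell arithmetic (a sketch; the 8-way, 5-item, 16k-leaf, 128 MiB-delalloc numbers are the ones the commit message assumes):

  item=$((256 * 1024))            # per-item metadata reservation at a 16k leaf size
  to_reclaim=$((8 * 5 * item))    # 8 CPUs x 5 items = 10 MiB waiting on reservations
  delalloc=$((128 * 1024 * 1024)) # 128 MiB of outstanding delalloc
  # Old behaviour: items = 20, so each flush pass only targets 20 * 256k = 5 MiB.
  echo "old pass target: $((20 * item)) bytes"
  # New behaviour: target at least 1/8 of the delalloc bytes per pass (16 MiB here).
  echo "new pass target: $(( to_reclaim > (delalloc >> 3) ? to_reclaim : (delalloc >> 3) )) bytes"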
-From ec26628891bae6cd63bcd934dc3b13157cbc1024 Mon Sep 17 00:00:00 2001
-From: Filipe Manana <fdmanana@suse.com>
-Date: Wed, 9 Jun 2021 11:25:03 +0100
-Subject: [PATCH 21/22] btrfs: send: fix invalid path for unlink operations
- after parent orphanization
-
-During an incremental send operation, when processing the new references
-for the current inode, we might send an unlink operation for another inode
-that has a conflicting path and has more than one hard link. However this
-path was computed and cached before we processed previous new references
-for the current inode. We may have orphanized a directory of that path
-while processing a previous new reference, in which case the path will
-be invalid and cause the receiver process to fail.
-
-The following reproducer triggers the problem and explains how/why it
-happens in its comments:
-
- $ cat test-send-unlink.sh
- #!/bin/bash
-
- DEV=/dev/sdi
- MNT=/mnt/sdi
-
- mkfs.btrfs -f $DEV >/dev/null
- mount $DEV $MNT
-
- # Create our test files and directory. Inode 259 (file3) has two hard
- # links.
- touch $MNT/file1
- touch $MNT/file2
- touch $MNT/file3
-
- mkdir $MNT/A
- ln $MNT/file3 $MNT/A/hard_link
-
- # Filesystem looks like:
- #
- # . (ino 256)
- # |----- file1 (ino 257)
- # |----- file2 (ino 258)
- # |----- file3 (ino 259)
- # |----- A/ (ino 260)
- # |---- hard_link (ino 259)
- #
-
- # Now create the base snapshot, which is going to be the parent snapshot
- # for a later incremental send.
- btrfs subvolume snapshot -r $MNT $MNT/snap1
- btrfs send -f /tmp/snap1.send $MNT/snap1
-
- # Move inode 257 into directory inode 260. This results in computing the
- # path for inode 260 as "/A" and caching it.
- mv $MNT/file1 $MNT/A/file1
-
- # Move inode 258 (file2) into directory inode 260, with a name of
- # "hard_link", moving first inode 259 away since it currently has that
- # location and name.
- mv $MNT/A/hard_link $MNT/tmp
- mv $MNT/file2 $MNT/A/hard_link
-
- # Now rename inode 260 to something else (B for example) and then create
- # a hard link for inode 258 that has the old name and location of inode
- # 260 ("/A").
- mv $MNT/A $MNT/B
- ln $MNT/B/hard_link $MNT/A
-
- # Filesystem now looks like:
- #
- # . (ino 256)
- # |----- tmp (ino 259)
- # |----- file3 (ino 259)
- # |----- B/ (ino 260)
- # | |---- file1 (ino 257)
- # | |---- hard_link (ino 258)
- # |
- # |----- A (ino 258)
-
- # Create another snapshot of our subvolume and use it for an incremental
- # send.
- btrfs subvolume snapshot -r $MNT $MNT/snap2
- btrfs send -f /tmp/snap2.send -p $MNT/snap1 $MNT/snap2
-
- # Now unmount the filesystem, create a new one, mount it and try to
- # apply both send streams to recreate both snapshots.
- umount $DEV
-
- mkfs.btrfs -f $DEV >/dev/null
-
- mount $DEV $MNT
-
- # First add the first snapshot to the new filesystem by applying the
- # first send stream.
- btrfs receive -f /tmp/snap1.send $MNT
-
- # The incremental receive operation below used to fail with the
- # following error:
- #
- # ERROR: unlink A/hard_link failed: No such file or directory
- #
- # This is because when send is processing inode 257, it generates the
- # path for inode 260 as "/A", since that inode is its parent in the send
- # snapshot, and caches that path.
- #
- # Later when processing inode 258, it first processes its new reference
- # that has the path of "/A", which results in orphanizing inode 260
- # because there is a a path collision. This results in issuing a rename
- # operation from "/A" to "/o260-6-0".
- #
- # Finally when processing the new reference "B/hard_link" for inode 258,
- # it notices that it collides with inode 259 (not yet processed, because
- # it has a higher inode number), since that inode has the name
- # "hard_link" under the directory inode 260. It also checks that inode
- # 259 has two hardlinks, so it decides to issue a unlink operation for
- # the name "hard_link" for inode 259. However the path passed to the
- # unlink operation is "/A/hard_link", which is incorrect since currently
- # "/A" does not exists, due to the orphanization of inode 260 mentioned
- # before. The path is incorrect because it was computed and cached
- # before the orphanization. This results in the receiver to fail with
- # the above error.
- btrfs receive -f /tmp/snap2.send $MNT
-
- umount $MNT
-
-When running the test, it fails like this:
-
- $ ./test-send-unlink.sh
- Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap1'
- At subvol /mnt/sdi/snap1
- Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap2'
- At subvol /mnt/sdi/snap2
- At subvol snap1
- At snapshot snap2
- ERROR: unlink A/hard_link failed: No such file or directory
-
-Fix this by recomputing a path before issuing an unlink operation when
-processing the new references for the current inode if we previously
-have orphanized a directory.
-
-A test case for fstests will follow soon.
-
-Signed-off-by: Filipe Manana <fdmanana@suse.com>
----
- fs/btrfs/send.c | 11 +++++++++++
- 1 file changed, 11 insertions(+)
-
-diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
-index ed1310e38..f61ababf8 100644
---- a/fs/btrfs/send.c
-+++ b/fs/btrfs/send.c
-@@ -4064,6 +4064,17 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
- if (ret < 0)
- goto out;
- } else {
-+ /*
-+ * If we previously orphanized a directory that
-+ * collided with a new reference that we already
-+ * processed, recompute the current path because
-+ * that directory may be part of the path.
-+ */
-+ if (orphanized_dir) {
-+ ret = refresh_ref_path(sctx, cur);
-+ if (ret < 0)
-+ goto out;
-+ }
- ret = send_unlink(sctx, cur->full_path);
- if (ret < 0)
- goto out;
---
-2.32.0
-
-
-From 3a07c030c466316a5f74cb9f320d7b9df985ec1c Mon Sep 17 00:00:00 2001
-From: David Sterba <dsterba@suse.com>
-Date: Fri, 11 Jun 2021 13:36:22 +0000
-Subject: [PATCH 22/22] btrfs: sysfs: export dev stats in devinfo directory
-
-The device stats can be read by ioctl, wrapped by command 'btrfs device
-stats'. Provide another source where to read the information in
-/sys/fs/btrfs/FSID/devinfo/DEVID/stats . The format is a list of
-'key value' pairs one per line, which is common in other stat files.
-The names are the same as used in other device stat outputs.
-
-The stats are all in one file as it's the snapshot of all available
-stats. The 'one value per file' is not very suitable here. The stats
-should be valid right after the stats item is read from disk, shortly
-after initializing the device.
-
-In case the stats are not yet valid, print just 'invalid' as the file
-contents.
-
-Signed-off-by: David Sterba <dsterba@suse.com>
----
- fs/btrfs/sysfs.c | 29 +++++++++++++++++++++++++++++
- 1 file changed, 29 insertions(+)
-
-diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
-index 9dda3feda..5c50fd77f 100644
---- a/fs/btrfs/sysfs.c
-+++ b/fs/btrfs/sysfs.c
-@@ -1509,7 +1509,36 @@ static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj,
- }
- BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show);
-
-+static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj,
-+ struct kobj_attribute *a, char *buf)
-+{
-+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
-+ devid_kobj);
-+
-+ if (!device->dev_stats_valid)
-+ return scnprintf(buf, PAGE_SIZE, "invalid\n");
-+
-+ /*
-+ * Print all at once so we get a snapshot of all values from the same
-+ * time. Keep them in sync and in order of definition of
-+ * btrfs_dev_stat_values.
-+ */
-+ return scnprintf(buf, PAGE_SIZE,
-+ "write_errs %d\n"
-+ "read_errs %d\n"
-+ "flush_errs %d\n"
-+ "corruption_errs %d\n"
-+ "generation_errs %d\n",
-+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_WRITE_ERRS),
-+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_READ_ERRS),
-+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_FLUSH_ERRS),
-+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_CORRUPTION_ERRS),
-+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_GENERATION_ERRS));
-+}
-+BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show);
-+
- static struct attribute *devid_attrs[] = {
-+ BTRFS_ATTR_PTR(devid, error_stats),
- BTRFS_ATTR_PTR(devid, in_fs_metadata),
- BTRFS_ATTR_PTR(devid, missing),
- BTRFS_ATTR_PTR(devid, replace_target),
---
-2.32.0
-
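The per-device stats file added by the last patch in the series above can be read straight from sysfs. A minimal sketch (FSID is a placeholder for the filesystem UUID; note that the code registers the attribute as error_stats even though the changelog refers to it as stats):

  # Print the error counters for every device of a mounted filesystem.
  for f in /sys/fs/btrfs/FSID/devinfo/*/error_stats; do
      echo "== $f =="
      cat "$f"   # "key value" pairs, or "invalid" before the stats item is loaded
  done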
diff --git a/PKGBUILD b/PKGBUILD
index 8def1d039c87..b709f1c3f928 100644
--- a/PKGBUILD
+++ b/PKGBUILD
@@ -74,6 +74,7 @@ _major=5.12
_ckpatchversion=1
_ckpatch="patch-${_major}-ck${_ckpatchversion}"
_gcc_more_v=20210610
+_patches_url="https://gitlab.com/sirlucjan/kernel-patches/-/raw/master/${_major}"
arch=(x86_64)
url="https://wiki.archlinux.org/index.php/Linux-ck"
license=(GPL2)
@@ -93,9 +94,9 @@ source=(
0005-x86-setup-always-reserve-the-first-1M-of-RAM.patch
0006-x86-setup-remove-CONFIG_X86_RESERVE_LOW-and-reservel.patch
0007-x86-crash-remove-crash_reserve_low_1M.patch
- 0008-UKSM.patch
- 0009-bbr2.patch
- 0010-btrfs.patch
+ "0008-UKSM.patch::${_patches_url}/uksm-patches/0001-UKSM-for-5.12.patch"
+ "0009-bbr2.patch::${_patches_url}/bbr2-patches-v2/0001-bbr2-5.12-introduce-BBRv2.patch"
+ "0010-btrfs.patch::${_patches_url}/btrfs-patches-v13/0001-btrfs-patches.patch"
"0011-block.patch::${_patches_url}/block-patches-v6/0001-block-patches.patch"
"0012-bfq.patch::${_patches_url}/bfq-patches-v15/0001-bfq-patches.patch"
)
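For context on the PKGBUILD hunk above: the name::url form in source=() tells makepkg to download the remote file and save it under the local name, so the patches keep their 00XX-*.patch filenames while being fetched from _patches_url. After changing the array the b2sums block has to be regenerated; two common ways to do that are sketched below (updpkgsums ships with pacman-contrib):

  # Rewrite the checksum arrays in the PKGBUILD in place.
  updpkgsums
  # Or print fresh integrity checks and paste them in manually.
  makepkg -g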