author     antman666   2021-07-06 17:15:37 +0800
committer  antman666   2021-07-06 17:15:37 +0800
commit     8782ea33567efa1b67c9fd54ca51bdefe596f89a
tree       63d19e5daab3fe80230ea602f38a86f2dcaaf677
parent     7f8e8c158ef86353e2613129a89f5681f6093f34
download   aur-8782ea33567efa1b67c9fd54ca51bdefe596f89a.tar.gz
remove useless patches
-rw-r--r--  .SRCINFO         |   10
-rw-r--r--  0008-UKSM.patch  | 6970
-rw-r--r--  0009-bbr2.patch  | 3347
-rw-r--r--  0010-btrfs.patch | 2157
-rw-r--r--  PKGBUILD         |    7

5 files changed, 11 insertions(+), 12480 deletions(-)
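The .SRCINFO hunk below is the substance of this commit: the three bundled patch files are dropped in favor of "name::url" source entries (plus two new block/bfq patches), so makepkg now downloads each patch from the kernel-patches GitLab tree instead of carrying a copy in the AUR repo. In a PKGBUILD, a source entry of the form "localname::url" tells makepkg to fetch the URL and save it under localname, and each b2sums entry pairs positionally with its source. A minimal sketch of the mechanism follows; the prepare() loop and source directory name are hypothetical illustrations, not this package's actual PKGBUILD:

# Hypothetical PKGBUILD fragment illustrating "localname::url" sources.
# makepkg fetches each URL and stores it under the name left of "::",
# so later steps can refer to a patch by a stable local file name.
source=('0008-UKSM.patch::https://gitlab.com/sirlucjan/kernel-patches/-/raw/master/5.12/uksm-patches/0001-UKSM-for-5.12.patch'
        '0009-bbr2.patch::https://gitlab.com/sirlucjan/kernel-patches/-/raw/master/5.12/bbr2-patches-v2/0001-bbr2-5.12-introduce-BBRv2.patch')
# b2sums pairs index-for-index with source; after editing source,
# regenerate the sums with "updpkgsums" or "makepkg -g". "SKIP" is a
# placeholder here; the real sums are recorded in the .SRCINFO below.
b2sums=('SKIP'
        'SKIP')

prepare() {
  cd "$srcdir/linux-5.12"    # illustrative source directory name
  local src
  for src in "${source[@]}"; do
    src="${src%%::*}"        # keep the local name, drop the URL part
    src="${src##*/}"         # basename, for entries without a rename
    [[ $src == *.patch ]] || continue
    patch -Np1 -i "../$src"  # apply each downloaded patch in order
  done
}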
@@ -24,9 +24,11 @@ pkgbase = linux-ck-uksm
 	source = 0005-x86-setup-always-reserve-the-first-1M-of-RAM.patch
 	source = 0006-x86-setup-remove-CONFIG_X86_RESERVE_LOW-and-reservel.patch
 	source = 0007-x86-crash-remove-crash_reserve_low_1M.patch
-	source = 0008-UKSM.patch
-	source = 0009-bbr2.patch
-	source = 0010-btrfs.patch
+	source = 0008-UKSM.patch::https://gitlab.com/sirlucjan/kernel-patches/-/raw/master/5.12/uksm-patches/0001-UKSM-for-5.12.patch
+	source = 0009-bbr2.patch::https://gitlab.com/sirlucjan/kernel-patches/-/raw/master/5.12/bbr2-patches-v2/0001-bbr2-5.12-introduce-BBRv2.patch
+	source = 0010-btrfs.patch::https://gitlab.com/sirlucjan/kernel-patches/-/raw/master/5.12/btrfs-patches-v13/0001-btrfs-patches.patch
+	source = 0011-block.patch::https://gitlab.com/sirlucjan/kernel-patches/-/raw/master/5.12/block-patches-v6/0001-block-patches.patch
+	source = 0012-bfq.patch::https://gitlab.com/sirlucjan/kernel-patches/-/raw/master/5.12/bfq-patches-v15/0001-bfq-patches.patch
 	validpgpkeys = ABAF11C65A2970B130ABE3C479BE3E4300411886
 	validpgpkeys = 647F28654894E3BD457199BE38DBBDC86092693E
 	b2sums = 3bc213b432d61c358f85b932dec8bd44a1ef73442f20424ad5ce374b6982a6909c5b318d5e9848996989d5e421ab6c2128cdb51a3724adc95222f96a859486a1
@@ -44,6 +46,8 @@ pkgbase = linux-ck-uksm
 	b2sums = 14f45171afc3b15488b40a05e58b352c5057da3a5782e13527392f7750d8e45a8db54f9b50b218fedb8bf679de3b4e5d78e230a44f7b1aa482f7b3aa831bd641
 	b2sums = 0c5f2e21e27aee6c8d8eaa07daa111ff2687756413f8a909cf03acc8f836367c6b27050966f9b7bf1521ad11b84fe94fb42d70c33693c80a674ef223cf2cfc00
 	b2sums = 705a8f2037eef3afdd0f2a7648cc8d00bfc03112385b44a8907182812b6aed075519a9236909c0e3ba09df887381dd76cb01c601e0df05119136f7318587a416
+	b2sums = 67067d624711d663c1be1d35c5e59cb588faba1769b27443a3a13b44dbe9e627edd054a4fd122d04d587e21b25be5520fffb61cfc7538aee77c33a1a8cb1b97a
+	b2sums = 9aba508592818a4b4f000fc1bd471ec74687c8f0f972f330e851bd2364eaf30cff4d5012f843625ca025bc2478a2c76e0d082d43f33358ab18ce829fab4f0c2b
 
 pkgname = linux-ck-uksm
 	pkgdesc = The Linux-ck-uksm kernel and modules with the ck1 and uksm patchesset featuring MuQSS CPU scheduler
diff --git a/0008-UKSM.patch b/0008-UKSM.patch
deleted file mode 100644
index 3321eaa8ee58..000000000000
--- a/0008-UKSM.patch
+++ /dev/null
@@ -1,6970 +0,0 @@
-From 9a42006b641bc8e0c333174a9bf269ac9450d521 Mon Sep 17 00:00:00 2001
-From: Piotr Gorski <lucjan.lucjanov@gmail.com>
-Date: Tue, 13 Apr 2021 16:27:12 +0200
-Subject: [PATCH] UKSM for 5.12
-
-Signed-off-by: Piotr Gorski <lucjan.lucjanov@gmail.com>
---
- Documentation/vm/uksm.txt   |   61 +
- fs/exec.c                   |    1 +
- fs/proc/meminfo.c           |    4 +
- include/linux/ksm.h         |   43 +-
- include/linux/mm_types.h    |    3 +
- include/linux/mmzone.h      |    3 +
- include/linux/pgtable.h     |   17 +-
- include/linux/sradix-tree.h |   77 +
- include/linux/uksm.h        |  149 +
- kernel/fork.c               |    2 +-
- lib/Makefile                |    2 +-
- lib/sradix-tree.c           |  476 +++
- mm/Kconfig                  |   26 +
- mm/Makefile                 |    3 +-
- mm/ksm.c                    |   11 -
- mm/memory.c                 |   33 +-
- mm/mmap.c                   |   37 +
- mm/uksm.c                   | 5614 +++++++++++++++++++++++++++++++++++
- mm/vmstat.c                 |    3 +
- 19 files changed, 6539 insertions(+), 26 deletions(-)
- create mode 100644 Documentation/vm/uksm.txt
- create mode 100644 include/linux/sradix-tree.h
- create mode 100644 include/linux/uksm.h
- create mode 100644 lib/sradix-tree.c
- create mode 100644 mm/uksm.c
-
-diff --git a/Documentation/vm/uksm.txt b/Documentation/vm/uksm.txt
-new file mode 100644
-index 000000000..be19a3127
---- /dev/null
-+++ b/Documentation/vm/uksm.txt
-@@ -0,0 +1,61 @@
-+The Ultra Kernel Samepage Merging feature
-+----------------------------------------------
-+/*
-+ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
-+ *
-+ * This is an improvement upon KSM. Some basic data structures and routines
-+ * are borrowed from ksm.c .
-+ *
-+ * Its new features:
-+ * 1. Full system scan:
-+ *      It automatically scans all user processes' anonymous VMAs. Kernel-user
-+ *      interaction to submit a memory area to KSM is no longer needed.
-+ *
-+ * 2. Rich area detection:
-+ *      It automatically detects rich areas containing abundant duplicated
-+ *      pages based. Rich areas are given a full scan speed. Poor areas are
-+ *      sampled at a reasonable speed with very low CPU consumption.
-+ *
-+ * 3. Ultra Per-page scan speed improvement:
-+ *      A new hash algorithm is proposed. As a result, on a machine with
-+ *      Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it
-+ *      can scan memory areas that does not contain duplicated pages at speed of
-+ *      627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of
-+ *      477MB/sec ~ 923MB/sec.
-+ *
-+ * 4. Thrashing area avoidance:
-+ *      Thrashing area(an VMA that has frequent Ksm page break-out) can be
-+ *      filtered out. My benchmark shows it's more efficient than KSM's per-page
-+ *      hash value based volatile page detection.
-+ *
-+ *
-+ * 5. Misc changes upon KSM:
-+ *      * It has a fully x86-opitmized memcmp dedicated for 4-byte-aligned page
-+ *        comparison. It's much faster than default C version on x86.
-+ *      * rmap_item now has an struct *page member to loosely cache a
-+ *        address-->page mapping, which reduces too much time-costly
-+ *        follow_page().
-+ *      * The VMA creation/exit procedures are hooked to let the Ultra KSM know.
-+ *      * try_to_merge_two_pages() now can revert a pte if it fails. No break_
-+ *        ksm is needed for this case.
-+ *
-+ * 6. Full Zero Page consideration(contributed by Figo Zhang)
-+ *    Now uksmd consider full zero pages as special pages and merge them to an
-+ *    special unswappable uksm zero page.
-+ */
-+
-+ChangeLog:
-+
-+2012-05-05 The creation of this Doc
-+2012-05-08 UKSM 0.1.1.1 libc crash bug fix, api clean up, doc clean up.
-+2012-05-28 UKSM 0.1.1.2 bug fix release
-+2012-06-26 UKSM 0.1.2-beta1 first beta release for 0.1.2
-+2012-07-2 UKSM 0.1.2-beta2
-+2012-07-10 UKSM 0.1.2-beta3
-+2012-07-26 UKSM 0.1.2 Fine grained speed control, more scan optimization.
-+2012-10-13 UKSM 0.1.2.1 Bug fixes.
-+2012-12-31 UKSM 0.1.2.2 Minor bug fixes.
-+2014-07-02 UKSM 0.1.2.3 Fix a " __this_cpu_read() in preemptible bug".
-+2015-04-22 UKSM 0.1.2.4 Fix a race condition that can sometimes trigger anonying warnings.
-+2016-09-10 UKSM 0.1.2.5 Fix a bug in dedup ratio calculation.
-+2017-02-26 UKSM 0.1.2.6 Fix a bug in hugetlbpage handling and a race bug with page migration.
-diff --git a/fs/exec.c b/fs/exec.c -index 18594f11c..aee636fd4 100644 ---- a/fs/exec.c -+++ b/fs/exec.c -@@ -65,6 +65,7 @@ - #include <linux/vmalloc.h> - #include <linux/io_uring.h> - #include <linux/syscall_user_dispatch.h> -+#include <linux/ksm.h> - - #include <linux/uaccess.h> - #include <asm/mmu_context.h> -diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c -index 6fa761c9c..45fd59a0d 100644 ---- a/fs/proc/meminfo.c -+++ b/fs/proc/meminfo.c -@@ -108,6 +108,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v) - #endif - show_val_kb(m, "PageTables: ", - global_node_page_state(NR_PAGETABLE)); -+#ifdef CONFIG_UKSM -+ show_val_kb(m, "KsmZeroPages: ", -+ global_zone_page_state(NR_UKSM_ZERO_PAGES)); -+#endif - - show_val_kb(m, "NFS_Unstable: ", 0); - show_val_kb(m, "Bounce: ", -diff --git a/include/linux/ksm.h b/include/linux/ksm.h -index 161e8164a..f0dbdf3c9 100644 ---- a/include/linux/ksm.h -+++ b/include/linux/ksm.h -@@ -21,20 +21,16 @@ struct mem_cgroup; - #ifdef CONFIG_KSM - int ksm_madvise(struct vm_area_struct *vma, unsigned long start, - unsigned long end, int advice, unsigned long *vm_flags); --int __ksm_enter(struct mm_struct *mm); --void __ksm_exit(struct mm_struct *mm); - --static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) -+static inline struct stable_node *page_stable_node(struct page *page) - { -- if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) -- return __ksm_enter(mm); -- return 0; -+ return PageKsm(page) ? page_rmapping(page) : NULL; - } - --static inline void ksm_exit(struct mm_struct *mm) -+static inline void set_page_stable_node(struct page *page, -+ struct stable_node *stable_node) - { -- if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) -- __ksm_exit(mm); -+ page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); - } - - /* -@@ -54,6 +50,33 @@ struct page *ksm_might_need_to_copy(struct page *page, - void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); - void ksm_migrate_page(struct page *newpage, struct page *oldpage); - -+#ifdef CONFIG_KSM_LEGACY -+int __ksm_enter(struct mm_struct *mm); -+void __ksm_exit(struct mm_struct *mm); -+static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) -+{ -+ if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) -+ return __ksm_enter(mm); -+ return 0; -+} -+ -+static inline void ksm_exit(struct mm_struct *mm) -+{ -+ if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) -+ __ksm_exit(mm); -+} -+ -+#elif defined(CONFIG_UKSM) -+static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) -+{ -+ return 0; -+} -+ -+static inline void ksm_exit(struct mm_struct *mm) -+{ -+} -+#endif /* !CONFIG_UKSM */ -+ - #else /* !CONFIG_KSM */ - - static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) -@@ -89,4 +112,6 @@ static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage) - #endif /* CONFIG_MMU */ - #endif /* !CONFIG_KSM */ - -+#include <linux/uksm.h> -+ - #endif /* __LINUX_KSM_H */ -diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h -index 6613b26a8..82e18e41b 100644 ---- a/include/linux/mm_types.h -+++ b/include/linux/mm_types.h -@@ -370,6 +370,9 @@ struct vm_area_struct { - struct mempolicy *vm_policy; /* NUMA policy for the VMA */ - #endif - struct vm_userfaultfd_ctx vm_userfaultfd_ctx; -+#ifdef CONFIG_UKSM -+ struct vma_slot *uksm_vma_slot; -+#endif - } __randomize_layout; - - struct core_thread { -diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h -index 47946cec7..a6ce64844 100644 ---- 
a/include/linux/mmzone.h -+++ b/include/linux/mmzone.h -@@ -157,6 +157,9 @@ enum zone_stat_item { - NR_ZSPAGES, /* allocated in zsmalloc */ - #endif - NR_FREE_CMA_PAGES, -+#ifdef CONFIG_UKSM -+ NR_UKSM_ZERO_PAGES, -+#endif - NR_VM_ZONE_STAT_ITEMS }; - - enum node_stat_item { -diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h -index 5e772392a..9d733540d 100644 ---- a/include/linux/pgtable.h -+++ b/include/linux/pgtable.h -@@ -1111,12 +1111,25 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, - extern void untrack_pfn_moved(struct vm_area_struct *vma); - #endif - -+#ifdef CONFIG_UKSM -+static inline int is_uksm_zero_pfn(unsigned long pfn) -+{ -+ extern unsigned long uksm_zero_pfn; -+ return pfn == uksm_zero_pfn; -+} -+#else -+static inline int is_uksm_zero_pfn(unsigned long pfn) -+{ -+ return 0; -+} -+#endif -+ - #ifdef __HAVE_COLOR_ZERO_PAGE - static inline int is_zero_pfn(unsigned long pfn) - { - extern unsigned long zero_pfn; - unsigned long offset_from_zero_pfn = pfn - zero_pfn; -- return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT); -+ return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT) || is_uksm_zero_pfn(pfn); - } - - #define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr)) -@@ -1125,7 +1138,7 @@ static inline int is_zero_pfn(unsigned long pfn) - static inline int is_zero_pfn(unsigned long pfn) - { - extern unsigned long zero_pfn; -- return pfn == zero_pfn; -+ return (pfn == zero_pfn) || (is_uksm_zero_pfn(pfn)); - } - - static inline unsigned long my_zero_pfn(unsigned long addr) -diff --git a/include/linux/sradix-tree.h b/include/linux/sradix-tree.h -new file mode 100644 -index 000000000..d71edba6b ---- /dev/null -+++ b/include/linux/sradix-tree.h -@@ -0,0 +1,77 @@ -+#ifndef _LINUX_SRADIX_TREE_H -+#define _LINUX_SRADIX_TREE_H -+ -+ -+#define INIT_SRADIX_TREE(root, mask) \ -+do { \ -+ (root)->height = 0; \ -+ (root)->gfp_mask = (mask); \ -+ (root)->rnode = NULL; \ -+} while (0) -+ -+#define ULONG_BITS (sizeof(unsigned long) * 8) -+#define SRADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) -+//#define SRADIX_TREE_MAP_SHIFT 6 -+//#define SRADIX_TREE_MAP_SIZE (1UL << SRADIX_TREE_MAP_SHIFT) -+//#define SRADIX_TREE_MAP_MASK (SRADIX_TREE_MAP_SIZE-1) -+ -+struct sradix_tree_node { -+ unsigned int height; /* Height from the bottom */ -+ unsigned int count; -+ unsigned int fulls; /* Number of full sublevel trees */ -+ struct sradix_tree_node *parent; -+ void *stores[0]; -+}; -+ -+/* A simple radix tree implementation */ -+struct sradix_tree_root { -+ unsigned int height; -+ struct sradix_tree_node *rnode; -+ -+ /* Where found to have available empty stores in its sublevels */ -+ struct sradix_tree_node *enter_node; -+ unsigned int shift; -+ unsigned int stores_size; -+ unsigned int mask; -+ unsigned long min; /* The first hole index */ -+ unsigned long num; -+ //unsigned long *height_to_maxindex; -+ -+ /* How the node is allocated and freed. 
*/ -+ struct sradix_tree_node *(*alloc)(void); -+ void (*free)(struct sradix_tree_node *node); -+ -+ /* When a new node is added and removed */ -+ void (*extend)(struct sradix_tree_node *parent, struct sradix_tree_node *child); -+ void (*assign)(struct sradix_tree_node *node, unsigned int index, void *item); -+ void (*rm)(struct sradix_tree_node *node, unsigned int offset); -+}; -+ -+struct sradix_tree_path { -+ struct sradix_tree_node *node; -+ int offset; -+}; -+ -+static inline -+void init_sradix_tree_root(struct sradix_tree_root *root, unsigned long shift) -+{ -+ root->height = 0; -+ root->rnode = NULL; -+ root->shift = shift; -+ root->stores_size = 1UL << shift; -+ root->mask = root->stores_size - 1; -+} -+ -+ -+extern void *sradix_tree_next(struct sradix_tree_root *root, -+ struct sradix_tree_node *node, unsigned long index, -+ int (*iter)(void *, unsigned long)); -+ -+extern int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num); -+ -+extern void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, -+ struct sradix_tree_node *node, unsigned long index); -+ -+extern void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index); -+ -+#endif /* _LINUX_SRADIX_TREE_H */ -diff --git a/include/linux/uksm.h b/include/linux/uksm.h -new file mode 100644 -index 000000000..bb8651f53 ---- /dev/null -+++ b/include/linux/uksm.h -@@ -0,0 +1,149 @@ -+#ifndef __LINUX_UKSM_H -+#define __LINUX_UKSM_H -+/* -+ * Memory merging support. -+ * -+ * This code enables dynamic sharing of identical pages found in different -+ * memory areas, even if they are not shared by fork(). -+ */ -+ -+/* if !CONFIG_UKSM this file should not be compiled at all. */ -+#ifdef CONFIG_UKSM -+ -+#include <linux/bitops.h> -+#include <linux/mm.h> -+#include <linux/pagemap.h> -+#include <linux/rmap.h> -+#include <linux/sched.h> -+ -+extern unsigned long zero_pfn __read_mostly; -+extern unsigned long uksm_zero_pfn __read_mostly; -+extern struct page *empty_uksm_zero_page; -+ -+/* must be done before linked to mm */ -+extern void uksm_vma_add_new(struct vm_area_struct *vma); -+extern void uksm_remove_vma(struct vm_area_struct *vma); -+ -+#define UKSM_SLOT_NEED_SORT (1 << 0) -+#define UKSM_SLOT_NEED_RERAND (1 << 1) -+#define UKSM_SLOT_SCANNED (1 << 2) /* It's scanned in this round */ -+#define UKSM_SLOT_FUL_SCANNED (1 << 3) -+#define UKSM_SLOT_IN_UKSM (1 << 4) -+ -+struct vma_slot { -+ struct sradix_tree_node *snode; -+ unsigned long sindex; -+ -+ struct list_head slot_list; -+ unsigned long fully_scanned_round; -+ unsigned long dedup_num; -+ unsigned long pages_scanned; -+ unsigned long this_sampled; -+ unsigned long last_scanned; -+ unsigned long pages_to_scan; -+ struct scan_rung *rung; -+ struct page **rmap_list_pool; -+ unsigned int *pool_counts; -+ unsigned long pool_size; -+ struct vm_area_struct *vma; -+ struct mm_struct *mm; -+ unsigned long ctime_j; -+ unsigned long pages; -+ unsigned long flags; -+ unsigned long pages_cowed; /* pages cowed this round */ -+ unsigned long pages_merged; /* pages merged this round */ -+ unsigned long pages_bemerged; -+ -+ /* when it has page merged in this eval round */ -+ struct list_head dedup_list; -+}; -+ -+static inline void uksm_unmap_zero_page(pte_t pte) -+{ -+ if (pte_pfn(pte) == uksm_zero_pfn) -+ __dec_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES); -+} -+ -+static inline void uksm_map_zero_page(pte_t pte) -+{ -+ if (pte_pfn(pte) == uksm_zero_pfn) -+ __inc_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES); -+} -+ -+static 
inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page) -+{ -+ if (vma->uksm_vma_slot && PageKsm(page)) -+ vma->uksm_vma_slot->pages_cowed++; -+} -+ -+static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte) -+{ -+ if (vma->uksm_vma_slot && pte_pfn(pte) == uksm_zero_pfn) -+ vma->uksm_vma_slot->pages_cowed++; -+} -+ -+static inline int uksm_flags_can_scan(unsigned long vm_flags) -+{ -+#ifdef VM_SAO -+ if (vm_flags & VM_SAO) -+ return 0; -+#endif -+ -+ return !(vm_flags & (VM_PFNMAP | VM_IO | VM_DONTEXPAND | -+ VM_HUGETLB | VM_MIXEDMAP | VM_SHARED -+ | VM_MAYSHARE | VM_GROWSUP | VM_GROWSDOWN)); -+} -+ -+static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p) -+{ -+ if (uksm_flags_can_scan(*vm_flags_p)) -+ *vm_flags_p |= VM_MERGEABLE; -+} -+ -+/* -+ * Just a wrapper for BUG_ON for where ksm_zeropage must not be. TODO: it will -+ * be removed when uksm zero page patch is stable enough. -+ */ -+static inline void uksm_bugon_zeropage(pte_t pte) -+{ -+ BUG_ON(pte_pfn(pte) == uksm_zero_pfn); -+} -+#else -+static inline void uksm_vma_add_new(struct vm_area_struct *vma) -+{ -+} -+ -+static inline void uksm_remove_vma(struct vm_area_struct *vma) -+{ -+} -+ -+static inline void uksm_unmap_zero_page(pte_t pte) -+{ -+} -+ -+static inline void uksm_map_zero_page(pte_t pte) -+{ -+} -+ -+static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page) -+{ -+} -+ -+static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte) -+{ -+} -+ -+static inline int uksm_flags_can_scan(unsigned long vm_flags) -+{ -+ return 0; -+} -+ -+static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p) -+{ -+} -+ -+static inline void uksm_bugon_zeropage(pte_t pte) -+{ -+} -+#endif /* !CONFIG_UKSM */ -+#endif /* __LINUX_UKSM_H */ -diff --git a/kernel/fork.c b/kernel/fork.c -index 426cd0c51..5fd356ca7 100644 ---- a/kernel/fork.c -+++ b/kernel/fork.c -@@ -588,7 +588,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, - __vma_link_rb(mm, tmp, rb_link, rb_parent); - rb_link = &tmp->vm_rb.rb_right; - rb_parent = &tmp->vm_rb; -- -+ uksm_vma_add_new(tmp); - mm->map_count++; - if (!(tmp->vm_flags & VM_WIPEONFORK)) - retval = copy_page_range(tmp, mpnt); -diff --git a/lib/Makefile b/lib/Makefile -index b5307d3ee..480b099e1 100644 ---- a/lib/Makefile -+++ b/lib/Makefile -@@ -28,7 +28,7 @@ CFLAGS_string.o += -fno-stack-protector - endif - - lib-y := ctype.o string.o vsprintf.o cmdline.o \ -- rbtree.o radix-tree.o timerqueue.o xarray.o \ -+ rbtree.o radix-tree.o sradix-tree.o timerqueue.o xarray.o \ - idr.o extable.o sha1.o irq_regs.o argv_split.o \ - flex_proportions.o ratelimit.o show_mem.o \ - is_single_threaded.o plist.o decompress.o kobject_uevent.o \ -diff --git a/lib/sradix-tree.c b/lib/sradix-tree.c -new file mode 100644 -index 000000000..ab21e6309 ---- /dev/null -+++ b/lib/sradix-tree.c -@@ -0,0 +1,476 @@ -+#include <linux/errno.h> -+#include <linux/mm.h> -+#include <linux/mman.h> -+#include <linux/spinlock.h> -+#include <linux/slab.h> -+#include <linux/gcd.h> -+#include <linux/sradix-tree.h> -+ -+static inline int sradix_node_full(struct sradix_tree_root *root, struct sradix_tree_node *node) -+{ -+ return node->fulls == root->stores_size || -+ (node->height == 1 && node->count == root->stores_size); -+} -+ -+/* -+ * Extend a sradix tree so it can store key @index. 
-+ */ -+static int sradix_tree_extend(struct sradix_tree_root *root, unsigned long index) -+{ -+ struct sradix_tree_node *node; -+ unsigned int height; -+ -+ if (unlikely(root->rnode == NULL)) { -+ if (!(node = root->alloc())) -+ return -ENOMEM; -+ -+ node->height = 1; -+ root->rnode = node; -+ root->height = 1; -+ } -+ -+ /* Figure out what the height should be. */ -+ height = root->height; -+ index >>= root->shift * height; -+ -+ while (index) { -+ index >>= root->shift; -+ height++; -+ } -+ -+ while (height > root->height) { -+ unsigned int newheight; -+ -+ if (!(node = root->alloc())) -+ return -ENOMEM; -+ -+ /* Increase the height. */ -+ node->stores[0] = root->rnode; -+ root->rnode->parent = node; -+ if (root->extend) -+ root->extend(node, root->rnode); -+ -+ newheight = root->height + 1; -+ node->height = newheight; -+ node->count = 1; -+ if (sradix_node_full(root, root->rnode)) -+ node->fulls = 1; -+ -+ root->rnode = node; -+ root->height = newheight; -+ } -+ -+ return 0; -+} -+ -+/* -+ * Search the next item from the current node, that is not NULL -+ * and can satify root->iter(). -+ */ -+void *sradix_tree_next(struct sradix_tree_root *root, -+ struct sradix_tree_node *node, unsigned long index, -+ int (*iter)(void *item, unsigned long height)) -+{ -+ unsigned long offset; -+ void *item; -+ -+ if (unlikely(node == NULL)) { -+ node = root->rnode; -+ for (offset = 0; offset < root->stores_size; offset++) { -+ item = node->stores[offset]; -+ if (item && (!iter || iter(item, node->height))) -+ break; -+ } -+ -+ if (unlikely(offset >= root->stores_size)) -+ return NULL; -+ -+ if (node->height == 1) -+ return item; -+ else -+ goto go_down; -+ } -+ -+ while (node) { -+ offset = (index & root->mask) + 1; -+ for (; offset < root->stores_size; offset++) { -+ item = node->stores[offset]; -+ if (item && (!iter || iter(item, node->height))) -+ break; -+ } -+ -+ if (offset < root->stores_size) -+ break; -+ -+ node = node->parent; -+ index >>= root->shift; -+ } -+ -+ if (!node) -+ return NULL; -+ -+ while (node->height > 1) { -+go_down: -+ node = item; -+ for (offset = 0; offset < root->stores_size; offset++) { -+ item = node->stores[offset]; -+ if (item && (!iter || iter(item, node->height))) -+ break; -+ } -+ -+ if (unlikely(offset >= root->stores_size)) -+ return NULL; -+ } -+ -+ BUG_ON(offset > root->stores_size); -+ -+ return item; -+} -+ -+/* -+ * Blindly insert the item to the tree. Typically, we reuse the -+ * first empty store item. 
-+ */ -+int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num) -+{ -+ unsigned long index; -+ unsigned int height; -+ struct sradix_tree_node *node, *tmp = NULL; -+ int offset, offset_saved; -+ void **store = NULL; -+ int error, i, j, shift; -+ -+go_on: -+ index = root->min; -+ -+ if (root->enter_node && !sradix_node_full(root, root->enter_node)) { -+ node = root->enter_node; -+ BUG_ON((index >> (root->shift * root->height))); -+ } else { -+ node = root->rnode; -+ if (node == NULL || (index >> (root->shift * root->height)) -+ || sradix_node_full(root, node)) { -+ error = sradix_tree_extend(root, index); -+ if (error) -+ return error; -+ -+ node = root->rnode; -+ } -+ } -+ -+ -+ height = node->height; -+ shift = (height - 1) * root->shift; -+ offset = (index >> shift) & root->mask; -+ while (shift > 0) { -+ offset_saved = offset; -+ for (; offset < root->stores_size; offset++) { -+ store = &node->stores[offset]; -+ tmp = *store; -+ -+ if (!tmp || !sradix_node_full(root, tmp)) -+ break; -+ } -+ BUG_ON(offset >= root->stores_size); -+ -+ if (offset != offset_saved) { -+ index += (offset - offset_saved) << shift; -+ index &= ~((1UL << shift) - 1); -+ } -+ -+ if (!tmp) { -+ if (!(tmp = root->alloc())) -+ return -ENOMEM; -+ -+ tmp->height = shift / root->shift; -+ *store = tmp; -+ tmp->parent = node; -+ node->count++; -+// if (root->extend) -+// root->extend(node, tmp); -+ } -+ -+ node = tmp; -+ shift -= root->shift; -+ offset = (index >> shift) & root->mask; -+ } -+ -+ BUG_ON(node->height != 1); -+ -+ -+ store = &node->stores[offset]; -+ for (i = 0, j = 0; -+ j < root->stores_size - node->count && -+ i < root->stores_size - offset && j < num; i++) { -+ if (!store[i]) { -+ store[i] = item[j]; -+ if (root->assign) -+ root->assign(node, index + i, item[j]); -+ j++; -+ } -+ } -+ -+ node->count += j; -+ root->num += j; -+ num -= j; -+ -+ while (sradix_node_full(root, node)) { -+ node = node->parent; -+ if (!node) -+ break; -+ -+ node->fulls++; -+ } -+ -+ if (unlikely(!node)) { -+ /* All nodes are full */ -+ root->min = 1 << (root->height * root->shift); -+ root->enter_node = NULL; -+ } else { -+ root->min = index + i - 1; -+ root->min |= (1UL << (node->height - 1)) - 1; -+ root->min++; -+ root->enter_node = node; -+ } -+ -+ if (num) { -+ item += j; -+ goto go_on; -+ } -+ -+ return 0; -+} -+ -+ -+/** -+ * sradix_tree_shrink - shrink height of a sradix tree to minimal -+ * @root sradix tree root -+ * -+ */ -+static inline void sradix_tree_shrink(struct sradix_tree_root *root) -+{ -+ /* try to shrink tree height */ -+ while (root->height > 1) { -+ struct sradix_tree_node *to_free = root->rnode; -+ -+ /* -+ * The candidate node has more than one child, or its child -+ * is not at the leftmost store, we cannot shrink. 
-+ */ -+ if (to_free->count != 1 || !to_free->stores[0]) -+ break; -+ -+ root->rnode = to_free->stores[0]; -+ root->rnode->parent = NULL; -+ root->height--; -+ if (unlikely(root->enter_node == to_free)) -+ root->enter_node = NULL; -+ root->free(to_free); -+ } -+} -+ -+/* -+ * Del the item on the known leaf node and index -+ */ -+void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, -+ struct sradix_tree_node *node, unsigned long index) -+{ -+ unsigned int offset; -+ struct sradix_tree_node *start, *end; -+ -+ BUG_ON(node->height != 1); -+ -+ start = node; -+ while (node && !(--node->count)) -+ node = node->parent; -+ -+ end = node; -+ if (!node) { -+ root->rnode = NULL; -+ root->height = 0; -+ root->min = 0; -+ root->num = 0; -+ root->enter_node = NULL; -+ } else { -+ offset = (index >> (root->shift * (node->height - 1))) & root->mask; -+ if (root->rm) -+ root->rm(node, offset); -+ node->stores[offset] = NULL; -+ root->num--; -+ if (root->min > index) { -+ root->min = index; -+ root->enter_node = node; -+ } -+ } -+ -+ if (start != end) { -+ do { -+ node = start; -+ start = start->parent; -+ if (unlikely(root->enter_node == node)) -+ root->enter_node = end; -+ root->free(node); -+ } while (start != end); -+ -+ /* -+ * Note that shrink may free "end", so enter_node still need to -+ * be checked inside. -+ */ -+ sradix_tree_shrink(root); -+ } else if (node->count == root->stores_size - 1) { -+ /* It WAS a full leaf node. Update the ancestors */ -+ node = node->parent; -+ while (node) { -+ node->fulls--; -+ if (node->fulls != root->stores_size - 1) -+ break; -+ -+ node = node->parent; -+ } -+ } -+} -+ -+void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index) -+{ -+ unsigned int height, offset; -+ struct sradix_tree_node *node; -+ int shift; -+ -+ node = root->rnode; -+ if (node == NULL || (index >> (root->shift * root->height))) -+ return NULL; -+ -+ height = root->height; -+ shift = (height - 1) * root->shift; -+ -+ do { -+ offset = (index >> shift) & root->mask; -+ node = node->stores[offset]; -+ if (!node) -+ return NULL; -+ -+ shift -= root->shift; -+ } while (shift >= 0); -+ -+ return node; -+} -+ -+/* -+ * Return the item if it exists, otherwise create it in place -+ * and return the created item. -+ */ -+void *sradix_tree_lookup_create(struct sradix_tree_root *root, -+ unsigned long index, void *(*item_alloc)(void)) -+{ -+ unsigned int height, offset; -+ struct sradix_tree_node *node, *tmp; -+ void *item; -+ int shift, error; -+ -+ if (root->rnode == NULL || (index >> (root->shift * root->height))) { -+ if (item_alloc) { -+ error = sradix_tree_extend(root, index); -+ if (error) -+ return NULL; -+ } else { -+ return NULL; -+ } -+ } -+ -+ node = root->rnode; -+ height = root->height; -+ shift = (height - 1) * root->shift; -+ -+ do { -+ offset = (index >> shift) & root->mask; -+ if (!node->stores[offset]) { -+ if (!(tmp = root->alloc())) -+ return NULL; -+ -+ tmp->height = shift / root->shift; -+ node->stores[offset] = tmp; -+ tmp->parent = node; -+ node->count++; -+ node = tmp; -+ } else { -+ node = node->stores[offset]; -+ } -+ -+ shift -= root->shift; -+ } while (shift > 0); -+ -+ BUG_ON(node->height != 1); -+ offset = index & root->mask; -+ if (node->stores[offset]) { -+ return node->stores[offset]; -+ } else if (item_alloc) { -+ if (!(item = item_alloc())) -+ return NULL; -+ -+ node->stores[offset] = item; -+ -+ /* -+ * NOTE: we do NOT call root->assign here, since this item is -+ * newly created by us having no meaning. 
Caller can call this -+ * if it's necessary to do so. -+ */ -+ -+ node->count++; -+ root->num++; -+ -+ while (sradix_node_full(root, node)) { -+ node = node->parent; -+ if (!node) -+ break; -+ -+ node->fulls++; -+ } -+ -+ if (unlikely(!node)) { -+ /* All nodes are full */ -+ root->min = 1 << (root->height * root->shift); -+ } else { -+ if (root->min == index) { -+ root->min |= (1UL << (node->height - 1)) - 1; -+ root->min++; -+ root->enter_node = node; -+ } -+ } -+ -+ return item; -+ } else { -+ return NULL; -+ } -+ -+} -+ -+int sradix_tree_delete(struct sradix_tree_root *root, unsigned long index) -+{ -+ unsigned int height, offset; -+ struct sradix_tree_node *node; -+ int shift; -+ -+ node = root->rnode; -+ if (node == NULL || (index >> (root->shift * root->height))) -+ return -ENOENT; -+ -+ height = root->height; -+ shift = (height - 1) * root->shift; -+ -+ do { -+ offset = (index >> shift) & root->mask; -+ node = node->stores[offset]; -+ if (!node) -+ return -ENOENT; -+ -+ shift -= root->shift; -+ } while (shift > 0); -+ -+ offset = index & root->mask; -+ if (!node->stores[offset]) -+ return -ENOENT; -+ -+ sradix_tree_delete_from_leaf(root, node, index); -+ -+ return 0; -+} -diff --git a/mm/Kconfig b/mm/Kconfig -index 24c045b24..3ce98ecc2 100644 ---- a/mm/Kconfig -+++ b/mm/Kconfig -@@ -317,6 +317,32 @@ config KSM - See Documentation/vm/ksm.rst for more information: KSM is inactive - until a program has madvised that an area is MADV_MERGEABLE, and - root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set). -+choice -+ prompt "Choose UKSM/KSM strategy" -+ default UKSM -+ depends on KSM -+ help -+ This option allows to select a UKSM/KSM stragety. -+ -+config UKSM -+ bool "Ultra-KSM for page merging" -+ depends on KSM -+ help -+ UKSM is inspired by the Linux kernel project \u2014 KSM(Kernel Same -+ page Merging), but with a fundamentally rewritten core algorithm. With -+ an advanced algorithm, UKSM now can transparently scans all anonymously -+ mapped user space applications with an significantly improved scan speed -+ and CPU efficiency. Since KVM is friendly to KSM, KVM can also benefit from -+ UKSM. Now UKSM has its first stable release and first real world enterprise user. -+ For more information, please goto its project page. -+ (github.com/dolohow/uksm) -+ -+config KSM_LEGACY -+ bool "Legacy KSM implementation" -+ depends on KSM -+ help -+ The legacy KSM implementation from Red Hat. -+endchoice - - config DEFAULT_MMAP_MIN_ADDR - int "Low address space to protect from user allocation" -diff --git a/mm/Makefile b/mm/Makefile -index 72227b24a..fd50a3a51 100644 ---- a/mm/Makefile -+++ b/mm/Makefile -@@ -76,7 +76,8 @@ obj-$(CONFIG_SPARSEMEM) += sparse.o - obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o - obj-$(CONFIG_SLOB) += slob.o - obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o --obj-$(CONFIG_KSM) += ksm.o -+obj-$(CONFIG_KSM_LEGACY) += ksm.o -+obj-$(CONFIG_UKSM) += uksm.o - obj-$(CONFIG_PAGE_POISONING) += page_poison.o - obj-$(CONFIG_SLAB) += slab.o - obj-$(CONFIG_SLUB) += slub.o -diff --git a/mm/ksm.c b/mm/ksm.c -index 9694ee2c7..63af6a528 100644 ---- a/mm/ksm.c -+++ b/mm/ksm.c -@@ -858,17 +858,6 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma, - return err; - } - --static inline struct stable_node *page_stable_node(struct page *page) --{ -- return PageKsm(page) ? 
page_rmapping(page) : NULL; --} -- --static inline void set_page_stable_node(struct page *page, -- struct stable_node *stable_node) --{ -- page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); --} -- - #ifdef CONFIG_SYSFS - /* - * Only called through the sysfs control interface: -diff --git a/mm/memory.c b/mm/memory.c -index 550405fc3..b4005b195 100644 ---- a/mm/memory.c -+++ b/mm/memory.c -@@ -158,6 +158,25 @@ EXPORT_SYMBOL(zero_pfn); - - unsigned long highest_memmap_pfn __read_mostly; - -+#ifdef CONFIG_UKSM -+unsigned long uksm_zero_pfn __read_mostly; -+EXPORT_SYMBOL_GPL(uksm_zero_pfn); -+struct page *empty_uksm_zero_page; -+ -+static int __init setup_uksm_zero_page(void) -+{ -+ empty_uksm_zero_page = alloc_pages(__GFP_ZERO & ~__GFP_MOVABLE, 0); -+ if (!empty_uksm_zero_page) -+ panic("Oh boy, that early out of memory?"); -+ -+ SetPageReserved(empty_uksm_zero_page); -+ uksm_zero_pfn = page_to_pfn(empty_uksm_zero_page); -+ -+ return 0; -+} -+core_initcall(setup_uksm_zero_page); -+#endif -+ - /* - * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() - */ -@@ -173,6 +192,7 @@ void mm_trace_rss_stat(struct mm_struct *mm, int member, long count) - trace_rss_stat(mm, member, count); - } - -+ - #if defined(SPLIT_RSS_COUNTING) - - void sync_mm_rss(struct mm_struct *mm) -@@ -875,6 +895,11 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, - get_page(page); - page_dup_rmap(page, false); - rss[mm_counter(page)]++; -+ -+ /* Should return NULL in vm_normal_page() */ -+ uksm_bugon_zeropage(pte); -+ } else { -+ uksm_map_zero_page(pte); - } - - /* -@@ -1254,8 +1279,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, - ptent = ptep_get_and_clear_full(mm, addr, pte, - tlb->fullmm); - tlb_remove_tlb_entry(tlb, pte, addr); -- if (unlikely(!page)) -+ if (unlikely(!page)) { -+ uksm_unmap_zero_page(ptent); - continue; -+ } - - if (!PageAnon(page)) { - if (pte_dirty(ptent)) { -@@ -2603,6 +2630,7 @@ static inline bool cow_user_page(struct page *dst, struct page *src, - - if (likely(src)) { - copy_user_highpage(dst, src, addr, vma); -+ uksm_cow_page(vma, src); - return true; - } - -@@ -2849,6 +2877,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) - vmf->address); - if (!new_page) - goto oom; -+ uksm_cow_pte(vma, vmf->orig_pte); - } else { - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, - vmf->address); -@@ -2891,7 +2920,9 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) - mm_counter_file(old_page)); - inc_mm_counter_fast(mm, MM_ANONPAGES); - } -+ uksm_bugon_zeropage(vmf->orig_pte); - } else { -+ uksm_unmap_zero_page(vmf->orig_pte); - inc_mm_counter_fast(mm, MM_ANONPAGES); - } - flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); -diff --git a/mm/mmap.c b/mm/mmap.c -index 3f287599a..dc719db43 100644 ---- a/mm/mmap.c -+++ b/mm/mmap.c -@@ -46,6 +46,7 @@ - #include <linux/moduleparam.h> - #include <linux/pkeys.h> - #include <linux/oom.h> -+#include <linux/ksm.h> - #include <linux/sched/mm.h> - - #include <linux/uaccess.h> -@@ -181,6 +182,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) - if (vma->vm_file) - fput(vma->vm_file); - mpol_put(vma_policy(vma)); -+ uksm_remove_vma(vma); - vm_area_free(vma); - return next; - } -@@ -748,9 +750,16 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, - long adjust_next = 0; - int remove_next = 0; - -+/* -+ * to avoid deadlock, ksm_remove_vma must be done before any spin_lock is -+ * acquired -+ */ -+ uksm_remove_vma(vma); -+ - if 
(next && !insert) { - struct vm_area_struct *exporter = NULL, *importer = NULL; - -+ uksm_remove_vma(next); - if (end >= next->vm_end) { - /* - * vma expands, overlapping all the next, and -@@ -881,6 +890,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, - end_changed = true; - } - vma->vm_pgoff = pgoff; -+ - if (adjust_next) { - next->vm_start += adjust_next; - next->vm_pgoff += adjust_next >> PAGE_SHIFT; -@@ -985,6 +995,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, - if (remove_next == 2) { - remove_next = 1; - end = next->vm_end; -+ uksm_remove_vma(next); - goto again; - } - else if (next) -@@ -1011,10 +1022,14 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, - */ - VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma)); - } -+ } else { -+ if (next && !insert) -+ uksm_vma_add_new(next); - } - if (insert && file) - uprobe_mmap(insert); - -+ uksm_vma_add_new(vma); - validate_mm(mm); - - return 0; -@@ -1470,6 +1485,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr, - vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) | - mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; - -+ /* If uksm is enabled, we add VM_MERGEABLE to new VMAs. */ -+ uksm_vm_flags_mod(&vm_flags); -+ - if (flags & MAP_LOCKED) - if (!can_do_mlock()) - return -EPERM; -@@ -1865,6 +1883,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, - allow_write_access(file); - } - file = vma->vm_file; -+ uksm_vma_add_new(vma); - out: - perf_event_mmap(vma); - -@@ -1907,6 +1926,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, - if (vm_flags & VM_DENYWRITE) - allow_write_access(file); - free_vma: -+ uksm_remove_vma(vma); - vm_area_free(vma); - unacct_error: - if (charged) -@@ -2766,6 +2786,8 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, - else - err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); - -+ uksm_vma_add_new(new); -+ - /* Success. */ - if (!err) - return 0; -@@ -3073,6 +3095,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla - if ((flags & (~VM_EXEC)) != 0) - return -EINVAL; - flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; -+ uksm_vm_flags_mod(&flags); - - mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); - if (IS_ERR_VALUE(mapped_addr)) -@@ -3118,6 +3141,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla - vma->vm_flags = flags; - vma->vm_page_prot = vm_get_page_prot(flags); - vma_link(mm, vma, prev, rb_link, rb_parent); -+ uksm_vma_add_new(vma); - out: - perf_event_mmap(vma); - mm->total_vm += len >> PAGE_SHIFT; -@@ -3195,6 +3219,12 @@ void exit_mmap(struct mm_struct *mm) - mmap_write_unlock(mm); - } - -+ /* -+ * Taking write lock on mmap does not harm others, -+ * but it's crucial for uksm to avoid races. 
-+ */ -+ mmap_write_lock(mm); -+ - if (mm->locked_vm) { - vma = mm->mmap; - while (vma) { -@@ -3230,6 +3260,11 @@ void exit_mmap(struct mm_struct *mm) - cond_resched(); - } - vm_unacct_memory(nr_accounted); -+ -+ mm->mmap = NULL; -+ mm->mm_rb = RB_ROOT; -+ vmacache_invalidate(mm); -+ mmap_write_unlock(mm); - } - - /* Insert vm structure into process list sorted by address -@@ -3337,6 +3372,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, - new_vma->vm_ops->open(new_vma); - vma_link(mm, new_vma, prev, rb_link, rb_parent); - *need_rmap_locks = false; -+ uksm_vma_add_new(new_vma); - } - return new_vma; - -@@ -3505,6 +3541,7 @@ static struct vm_area_struct *__install_special_mapping( - vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT); - - perf_event_mmap(vma); -+ uksm_vma_add_new(vma); - - return vma; - -diff --git a/mm/uksm.c b/mm/uksm.c -new file mode 100644 -index 000000000..e4732c00b ---- /dev/null -+++ b/mm/uksm.c -@@ -0,0 +1,5614 @@ -+/* -+ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia -+ * -+ * This is an improvement upon KSM. Some basic data structures and routines -+ * are borrowed from ksm.c . -+ * -+ * Its new features: -+ * 1. Full system scan: -+ * It automatically scans all user processes' anonymous VMAs. Kernel-user -+ * interaction to submit a memory area to KSM is no longer needed. -+ * -+ * 2. Rich area detection: -+ * It automatically detects rich areas containing abundant duplicated -+ * pages based. Rich areas are given a full scan speed. Poor areas are -+ * sampled at a reasonable speed with very low CPU consumption. -+ * -+ * 3. Ultra Per-page scan speed improvement: -+ * A new hash algorithm is proposed. As a result, on a machine with -+ * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it -+ * can scan memory areas that does not contain duplicated pages at speed of -+ * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of -+ * 477MB/sec ~ 923MB/sec. -+ * -+ * 4. Thrashing area avoidance: -+ * Thrashing area(an VMA that has frequent Ksm page break-out) can be -+ * filtered out. My benchmark shows it's more efficient than KSM's per-page -+ * hash value based volatile page detection. -+ * -+ * -+ * 5. Misc changes upon KSM: -+ * * It has a fully x86-opitmized memcmp dedicated for 4-byte-aligned page -+ * comparison. It's much faster than default C version on x86. -+ * * rmap_item now has an struct *page member to loosely cache a -+ * address-->page mapping, which reduces too much time-costly -+ * follow_page(). -+ * * The VMA creation/exit procedures are hooked to let the Ultra KSM know. -+ * * try_to_merge_two_pages() now can revert a pte if it fails. No break_ -+ * ksm is needed for this case. -+ * -+ * 6. Full Zero Page consideration(contributed by Figo Zhang) -+ * Now uksmd consider full zero pages as special pages and merge them to an -+ * special unswappable uksm zero page. 
-+ */ -+ -+#include <linux/errno.h> -+#include <linux/mm.h> -+#include <linux/fs.h> -+#include <linux/mman.h> -+#include <linux/sched.h> -+#include <linux/sched/mm.h> -+#include <linux/sched/coredump.h> -+#include <linux/sched/cputime.h> -+#include <linux/rwsem.h> -+#include <linux/pagemap.h> -+#include <linux/rmap.h> -+#include <linux/spinlock.h> -+#include <linux/jhash.h> -+#include <linux/delay.h> -+#include <linux/kthread.h> -+#include <linux/wait.h> -+#include <linux/slab.h> -+#include <linux/rbtree.h> -+#include <linux/memory.h> -+#include <linux/mmu_notifier.h> -+#include <linux/swap.h> -+#include <linux/ksm.h> -+#include <linux/crypto.h> -+#include <linux/scatterlist.h> -+#include <crypto/hash.h> -+#include <linux/random.h> -+#include <linux/math64.h> -+#include <linux/gcd.h> -+#include <linux/freezer.h> -+#include <linux/oom.h> -+#include <linux/numa.h> -+#include <linux/sradix-tree.h> -+ -+#include <asm/tlbflush.h> -+#include "internal.h" -+ -+#ifdef CONFIG_X86 -+#undef memcmp -+ -+#ifdef CONFIG_X86_32 -+#define memcmp memcmpx86_32 -+/* -+ * Compare 4-byte-aligned address s1 and s2, with length n -+ */ -+int memcmpx86_32(void *s1, void *s2, size_t n) -+{ -+ size_t num = n / 4; -+ register int res; -+ -+ __asm__ __volatile__ -+ ( -+ "testl %3,%3\n\t" -+ "repe; cmpsd\n\t" -+ "je 1f\n\t" -+ "sbbl %0,%0\n\t" -+ "orl $1,%0\n" -+ "1:" -+ : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num) -+ : "0" (0) -+ : "cc"); -+ -+ return res; -+} -+ -+/* -+ * Check the page is all zero ? -+ */ -+static int is_full_zero(const void *s1, size_t len) -+{ -+ unsigned char same; -+ -+ len /= 4; -+ -+ __asm__ __volatile__ -+ ("repe; scasl;" -+ "sete %0" -+ : "=qm" (same), "+D" (s1), "+c" (len) -+ : "a" (0) -+ : "cc"); -+ -+ return same; -+} -+ -+ -+#elif defined(CONFIG_X86_64) -+#define memcmp memcmpx86_64 -+/* -+ * Compare 8-byte-aligned address s1 and s2, with length n -+ */ -+int memcmpx86_64(void *s1, void *s2, size_t n) -+{ -+ size_t num = n / 8; -+ register int res; -+ -+ __asm__ __volatile__ -+ ( -+ "testq %q3,%q3\n\t" -+ "repe; cmpsq\n\t" -+ "je 1f\n\t" -+ "sbbq %q0,%q0\n\t" -+ "orq $1,%q0\n" -+ "1:" -+ : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num) -+ : "0" (0) -+ : "cc"); -+ -+ return res; -+} -+ -+static int is_full_zero(const void *s1, size_t len) -+{ -+ unsigned char same; -+ -+ len /= 8; -+ -+ __asm__ __volatile__ -+ ("repe; scasq;" -+ "sete %0" -+ : "=qm" (same), "+D" (s1), "+c" (len) -+ : "a" (0) -+ : "cc"); -+ -+ return same; -+} -+ -+#endif -+#else -+static int is_full_zero(const void *s1, size_t len) -+{ -+ unsigned long *src = s1; -+ int i; -+ -+ len /= sizeof(*src); -+ -+ for (i = 0; i < len; i++) { -+ if (src[i]) -+ return 0; -+ } -+ -+ return 1; -+} -+#endif -+ -+#define UKSM_RUNG_ROUND_FINISHED (1 << 0) -+#define TIME_RATIO_SCALE 10000 -+ -+#define SLOT_TREE_NODE_SHIFT 8 -+#define SLOT_TREE_NODE_STORE_SIZE (1UL << SLOT_TREE_NODE_SHIFT) -+struct slot_tree_node { -+ unsigned long size; -+ struct sradix_tree_node snode; -+ void *stores[SLOT_TREE_NODE_STORE_SIZE]; -+}; -+ -+static struct kmem_cache *slot_tree_node_cachep; -+ -+static struct sradix_tree_node *slot_tree_node_alloc(void) -+{ -+ struct slot_tree_node *p; -+ -+ p = kmem_cache_zalloc(slot_tree_node_cachep, GFP_KERNEL | -+ __GFP_NORETRY | __GFP_NOWARN); -+ if (!p) -+ return NULL; -+ -+ return &p->snode; -+} -+ -+static void slot_tree_node_free(struct sradix_tree_node *node) -+{ -+ struct slot_tree_node *p; -+ -+ p = container_of(node, struct slot_tree_node, snode); -+ kmem_cache_free(slot_tree_node_cachep, p); -+} -+ 
-+static void slot_tree_node_extend(struct sradix_tree_node *parent, -+ struct sradix_tree_node *child) -+{ -+ struct slot_tree_node *p, *c; -+ -+ p = container_of(parent, struct slot_tree_node, snode); -+ c = container_of(child, struct slot_tree_node, snode); -+ -+ p->size += c->size; -+} -+ -+void slot_tree_node_assign(struct sradix_tree_node *node, -+ unsigned int index, void *item) -+{ -+ struct vma_slot *slot = item; -+ struct slot_tree_node *cur; -+ -+ slot->snode = node; -+ slot->sindex = index; -+ -+ while (node) { -+ cur = container_of(node, struct slot_tree_node, snode); -+ cur->size += slot->pages; -+ node = node->parent; -+ } -+} -+ -+void slot_tree_node_rm(struct sradix_tree_node *node, unsigned int offset) -+{ -+ struct vma_slot *slot; -+ struct slot_tree_node *cur; -+ unsigned long pages; -+ -+ if (node->height == 1) { -+ slot = node->stores[offset]; -+ pages = slot->pages; -+ } else { -+ cur = container_of(node->stores[offset], -+ struct slot_tree_node, snode); -+ pages = cur->size; -+ } -+ -+ while (node) { -+ cur = container_of(node, struct slot_tree_node, snode); -+ cur->size -= pages; -+ node = node->parent; -+ } -+} -+ -+unsigned long slot_iter_index; -+int slot_iter(void *item, unsigned long height) -+{ -+ struct slot_tree_node *node; -+ struct vma_slot *slot; -+ -+ if (height == 1) { -+ slot = item; -+ if (slot_iter_index < slot->pages) { -+ /*in this one*/ -+ return 1; -+ } else { -+ slot_iter_index -= slot->pages; -+ return 0; -+ } -+ -+ } else { -+ node = container_of(item, struct slot_tree_node, snode); -+ if (slot_iter_index < node->size) { -+ /*in this one*/ -+ return 1; -+ } else { -+ slot_iter_index -= node->size; -+ return 0; -+ } -+ } -+} -+ -+ -+static inline void slot_tree_init_root(struct sradix_tree_root *root) -+{ -+ init_sradix_tree_root(root, SLOT_TREE_NODE_SHIFT); -+ root->alloc = slot_tree_node_alloc; -+ root->free = slot_tree_node_free; -+ root->extend = slot_tree_node_extend; -+ root->assign = slot_tree_node_assign; -+ root->rm = slot_tree_node_rm; -+} -+ -+void slot_tree_init(void) -+{ -+ slot_tree_node_cachep = kmem_cache_create("slot_tree_node", -+ sizeof(struct slot_tree_node), 0, -+ SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, -+ NULL); -+} -+ -+ -+/* Each rung of this ladder is a list of VMAs having a same scan ratio */ -+struct scan_rung { -+ //struct list_head scanned_list; -+ struct sradix_tree_root vma_root; -+ struct sradix_tree_root vma_root2; -+ -+ struct vma_slot *current_scan; -+ unsigned long current_offset; -+ -+ /* -+ * The initial value for current_offset, it should loop over -+ * [0~ step - 1] to let all slot have its chance to be scanned. -+ */ -+ unsigned long offset_init; -+ unsigned long step; /* dynamic step for current_offset */ -+ unsigned int flags; -+ unsigned long pages_to_scan; -+ //unsigned long fully_scanned_slots; -+ /* -+ * a little bit tricky - if cpu_time_ratio > 0, then the value is the -+ * the cpu time ratio it can spend in rung_i for every scan -+ * period. if < 0, then it is the cpu time ratio relative to the -+ * max cpu percentage user specified. Both in unit of -+ * 1/TIME_RATIO_SCALE -+ */ -+ int cpu_ratio; -+ -+ /* -+ * How long it will take for all slots in this rung to be fully -+ * scanned? If it's zero, we don't care about the cover time: -+ * it's fully scanned. 
-+ */ -+ unsigned int cover_msecs; -+ //unsigned long vma_num; -+ //unsigned long pages; /* Sum of all slot's pages in rung */ -+}; -+ -+/** -+ * node of either the stable or unstale rbtree -+ * -+ */ -+struct tree_node { -+ struct rb_node node; /* link in the main (un)stable rbtree */ -+ struct rb_root sub_root; /* rb_root for sublevel collision rbtree */ -+ u32 hash; -+ unsigned long count; /* TODO: merged with sub_root */ -+ struct list_head all_list; /* all tree nodes in stable/unstable tree */ -+}; -+ -+/** -+ * struct stable_node - node of the stable rbtree -+ * @node: rb node of this ksm page in the stable tree -+ * @hlist: hlist head of rmap_items using this ksm page -+ * @kpfn: page frame number of this ksm page -+ */ -+struct stable_node { -+ struct rb_node node; /* link in sub-rbtree */ -+ struct tree_node *tree_node; /* it's tree node root in stable tree, NULL if it's in hell list */ -+ struct hlist_head hlist; -+ unsigned long kpfn; -+ u32 hash_max; /* if ==0 then it's not been calculated yet */ -+ struct list_head all_list; /* in a list for all stable nodes */ -+}; -+ -+/** -+ * struct node_vma - group rmap_items linked in a same stable -+ * node together. -+ */ -+struct node_vma { -+ union { -+ struct vma_slot *slot; -+ unsigned long key; /* slot is used as key sorted on hlist */ -+ }; -+ struct hlist_node hlist; -+ struct hlist_head rmap_hlist; -+ struct stable_node *head; -+}; -+ -+/** -+ * struct rmap_item - reverse mapping item for virtual addresses -+ * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list -+ * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree -+ * @mm: the memory structure this rmap_item is pointing into -+ * @address: the virtual address this rmap_item tracks (+ flags in low bits) -+ * @node: rb node of this rmap_item in the unstable tree -+ * @head: pointer to stable_node heading this list in the stable tree -+ * @hlist: link into hlist of rmap_items hanging off that stable_node -+ */ -+struct rmap_item { -+ struct vma_slot *slot; -+ struct page *page; -+ unsigned long address; /* + low bits used for flags below */ -+ unsigned long hash_round; -+ unsigned long entry_index; -+ union { -+ struct {/* when in unstable tree */ -+ struct rb_node node; -+ struct tree_node *tree_node; -+ u32 hash_max; -+ }; -+ struct { /* when in stable tree */ -+ struct node_vma *head; -+ struct hlist_node hlist; -+ struct anon_vma *anon_vma; -+ }; -+ }; -+} __aligned(4); -+ -+struct rmap_list_entry { -+ union { -+ struct rmap_item *item; -+ unsigned long addr; -+ }; -+ /* lowest bit is used for is_addr tag */ -+} __aligned(4); /* 4 aligned to fit in to pages*/ -+ -+ -+/* Basic data structure definition ends */ -+ -+ -+/* -+ * Flags for rmap_item to judge if it's listed in the stable/unstable tree. 
-+ * The flags use the low bits of rmap_item.address -+ */ -+#define UNSTABLE_FLAG 0x1 -+#define STABLE_FLAG 0x2 -+#define get_rmap_addr(x) ((x)->address & PAGE_MASK) -+ -+/* -+ * rmap_list_entry helpers -+ */ -+#define IS_ADDR_FLAG 1 -+#define is_addr(ptr) ((unsigned long)(ptr) & IS_ADDR_FLAG) -+#define set_is_addr(ptr) ((ptr) |= IS_ADDR_FLAG) -+#define get_clean_addr(ptr) (((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG)) -+ -+ -+/* -+ * High speed caches for frequently allocated and freed structs -+ */ -+static struct kmem_cache *rmap_item_cache; -+static struct kmem_cache *stable_node_cache; -+static struct kmem_cache *node_vma_cache; -+static struct kmem_cache *vma_slot_cache; -+static struct kmem_cache *tree_node_cache; -+#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\ -+ sizeof(struct __struct), __alignof__(struct __struct),\ -+ (__flags), NULL) -+ -+/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */ -+#define SCAN_LADDER_SIZE 4 -+static struct scan_rung uksm_scan_ladder[SCAN_LADDER_SIZE]; -+ -+/* The evaluation rounds uksmd has finished */ -+static unsigned long long uksm_eval_round = 1; -+ -+/* -+ * we add 1 to this var when we consider we should rebuild the whole -+ * unstable tree. -+ */ -+static unsigned long uksm_hash_round = 1; -+ -+/* -+ * How many times the whole memory is scanned. -+ */ -+static unsigned long long fully_scanned_round = 1; -+ -+/* The total number of virtual pages of all vma slots */ -+static u64 uksm_pages_total; -+ -+/* The number of pages has been scanned since the start up */ -+static u64 uksm_pages_scanned; -+ -+static u64 scanned_virtual_pages; -+ -+/* The number of pages has been scanned since last encode_benefit call */ -+static u64 uksm_pages_scanned_last; -+ -+/* If the scanned number is tooo large, we encode it here */ -+static u64 pages_scanned_stored; -+ -+static unsigned long pages_scanned_base; -+ -+/* The number of nodes in the stable tree */ -+static unsigned long uksm_pages_shared; -+ -+/* The number of page slots additionally sharing those nodes */ -+static unsigned long uksm_pages_sharing; -+ -+/* The number of nodes in the unstable tree */ -+static unsigned long uksm_pages_unshared; -+ -+/* -+ * Milliseconds ksmd should sleep between scans, -+ * >= 100ms to be consistent with -+ * scan_time_to_sleep_msec() -+ */ -+static unsigned int uksm_sleep_jiffies; -+ -+/* The real value for the uksmd next sleep */ -+static unsigned int uksm_sleep_real; -+ -+/* Saved value for user input uksm_sleep_jiffies when it's enlarged */ -+static unsigned int uksm_sleep_saved; -+ -+/* Max percentage of cpu utilization ksmd can take to scan in one batch */ -+static unsigned int uksm_max_cpu_percentage; -+ -+static int uksm_cpu_governor; -+ -+static char *uksm_cpu_governor_str[4] = { "full", "medium", "low", "quiet" }; -+ -+struct uksm_cpu_preset_s { -+ int cpu_ratio[SCAN_LADDER_SIZE]; -+ unsigned int cover_msecs[SCAN_LADDER_SIZE]; -+ unsigned int max_cpu; /* percentage */ -+}; -+ -+struct uksm_cpu_preset_s uksm_cpu_preset[4] = { -+ { {20, 40, -2500, -10000}, {1000, 500, 200, 50}, 95}, -+ { {20, 30, -2500, -10000}, {1000, 500, 400, 100}, 50}, -+ { {10, 20, -5000, -10000}, {1500, 1000, 1000, 250}, 20}, -+ { {10, 20, 40, 75}, {2000, 1000, 1000, 1000}, 1}, -+}; -+ -+/* The default value for uksm_ema_page_time if it's not initialized */ -+#define UKSM_PAGE_TIME_DEFAULT 500 -+ -+/*cost to scan one page by expotional moving average in nsecs */ -+static unsigned long uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT; -+ 
-+/* The expotional moving average alpha weight, in percentage. */ -+#define EMA_ALPHA 20 -+ -+/* -+ * The threshold used to filter out thrashing areas, -+ * If it == 0, filtering is disabled, otherwise it's the percentage up-bound -+ * of the thrashing ratio of all areas. Any area with a bigger thrashing ratio -+ * will be considered as having a zero duplication ratio. -+ */ -+static unsigned int uksm_thrash_threshold = 50; -+ -+/* How much dedup ratio is considered to be abundant*/ -+static unsigned int uksm_abundant_threshold = 10; -+ -+/* All slots having merged pages in this eval round. */ -+struct list_head vma_slot_dedup = LIST_HEAD_INIT(vma_slot_dedup); -+ -+/* How many times the ksmd has slept since startup */ -+static unsigned long long uksm_sleep_times; -+ -+#define UKSM_RUN_STOP 0 -+#define UKSM_RUN_MERGE 1 -+static unsigned int uksm_run = 1; -+ -+static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait); -+static DEFINE_MUTEX(uksm_thread_mutex); -+ -+/* -+ * List vma_slot_new is for newly created vma_slot waiting to be added by -+ * ksmd. If one cannot be added(e.g. due to it's too small), it's moved to -+ * vma_slot_noadd. vma_slot_del is the list for vma_slot whose corresponding -+ * VMA has been removed/freed. -+ */ -+struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new); -+struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd); -+struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del); -+static DEFINE_SPINLOCK(vma_slot_list_lock); -+ -+/* The unstable tree heads */ -+static struct rb_root root_unstable_tree = RB_ROOT; -+ -+/* -+ * All tree_nodes are in a list to be freed at once when unstable tree is -+ * freed after each scan round. -+ */ -+static struct list_head unstable_tree_node_list = -+ LIST_HEAD_INIT(unstable_tree_node_list); -+ -+/* List contains all stable nodes */ -+static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list); -+ -+/* -+ * When the hash strength is changed, the stable tree must be delta_hashed and -+ * re-structured. We use two set of below structs to speed up the -+ * re-structuring of stable tree. 
-+ */ -+static struct list_head -+stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]), -+ LIST_HEAD_INIT(stable_tree_node_list[1])}; -+ -+static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0]; -+static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT}; -+static struct rb_root *root_stable_treep = &root_stable_tree[0]; -+static unsigned long stable_tree_index; -+ -+/* The hash strength needed to hash a full page */ -+#define HASH_STRENGTH_FULL (PAGE_SIZE / sizeof(u32)) -+ -+/* The hash strength needed for loop-back hashing */ -+#define HASH_STRENGTH_MAX (HASH_STRENGTH_FULL + 10) -+ -+/* The random offsets in a page */ -+static u32 *random_nums; -+ -+/* The hash strength */ -+static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4; -+ -+/* The delta value each time the hash strength increases or decreases */ -+static unsigned long hash_strength_delta; -+#define HASH_STRENGTH_DELTA_MAX 5 -+ -+/* The time we have saved due to random_sample_hash */ -+static u64 rshash_pos; -+ -+/* The time we have wasted due to hash collision */ -+static u64 rshash_neg; -+ -+struct uksm_benefit { -+ u64 pos; -+ u64 neg; -+ u64 scanned; -+ unsigned long base; -+} benefit; -+ -+/* -+ * The relative cost of memcmp, compared to 1 time unit of random sample -+ * hash, this value is tested when ksm module is initialized -+ */ -+static unsigned long memcmp_cost; -+ -+static unsigned long rshash_neg_cont_zero; -+static unsigned long rshash_cont_obscure; -+ -+/* The possible states of hash strength adjustment heuristic */ -+enum rshash_states { -+ RSHASH_STILL, -+ RSHASH_TRYUP, -+ RSHASH_TRYDOWN, -+ RSHASH_NEW, -+ RSHASH_PRE_STILL, -+}; -+ -+/* The possible direction we are about to adjust hash strength */ -+enum rshash_direct { -+ GO_UP, -+ GO_DOWN, -+ OBSCURE, -+ STILL, -+}; -+ -+/* random sampling hash state machine */ -+static struct { -+ enum rshash_states state; -+ enum rshash_direct pre_direct; -+ u8 below_count; -+ /* Keep a lookup window of size 5, iff above_count/below_count > 3 -+ * in this window we stop trying. -+ */ -+ u8 lookup_window_index; -+ u64 stable_benefit; -+ unsigned long turn_point_down; -+ unsigned long turn_benefit_down; -+ unsigned long turn_point_up; -+ unsigned long turn_benefit_up; -+ unsigned long stable_point; -+} rshash_state; -+ -+/*zero page hash table, hash_strength [0 ~ HASH_STRENGTH_MAX]*/ -+static u32 *zero_hash_table; -+ -+static inline struct node_vma *alloc_node_vma(void) -+{ -+ struct node_vma *node_vma; -+ -+ node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL | -+ __GFP_NORETRY | __GFP_NOWARN); -+ if (node_vma) { -+ INIT_HLIST_HEAD(&node_vma->rmap_hlist); -+ INIT_HLIST_NODE(&node_vma->hlist); -+ } -+ return node_vma; -+} -+ -+static inline void free_node_vma(struct node_vma *node_vma) -+{ -+ kmem_cache_free(node_vma_cache, node_vma); -+} -+ -+ -+static inline struct vma_slot *alloc_vma_slot(void) -+{ -+ struct vma_slot *slot; -+ -+ /* -+ * In case ksm is not initialized by now. -+ * Oops, we need to consider the call site of uksm_init() in the future. 
-+ */ -+ if (!vma_slot_cache) -+ return NULL; -+ -+ slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL | -+ __GFP_NORETRY | __GFP_NOWARN); -+ if (slot) { -+ INIT_LIST_HEAD(&slot->slot_list); -+ INIT_LIST_HEAD(&slot->dedup_list); -+ slot->flags |= UKSM_SLOT_NEED_RERAND; -+ } -+ return slot; -+} -+ -+static inline void free_vma_slot(struct vma_slot *vma_slot) -+{ -+ kmem_cache_free(vma_slot_cache, vma_slot); -+} -+ -+ -+ -+static inline struct rmap_item *alloc_rmap_item(void) -+{ -+ struct rmap_item *rmap_item; -+ -+ rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL | -+ __GFP_NORETRY | __GFP_NOWARN); -+ if (rmap_item) { -+ /* bug on lowest bit is not clear for flag use */ -+ BUG_ON(is_addr(rmap_item)); -+ } -+ return rmap_item; -+} -+ -+static inline void free_rmap_item(struct rmap_item *rmap_item) -+{ -+ rmap_item->slot = NULL; /* debug safety */ -+ kmem_cache_free(rmap_item_cache, rmap_item); -+} -+ -+static inline struct stable_node *alloc_stable_node(void) -+{ -+ struct stable_node *node; -+ -+ node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL | -+ __GFP_NORETRY | __GFP_NOWARN); -+ if (!node) -+ return NULL; -+ -+ INIT_HLIST_HEAD(&node->hlist); -+ list_add(&node->all_list, &stable_node_list); -+ return node; -+} -+ -+static inline void free_stable_node(struct stable_node *stable_node) -+{ -+ list_del(&stable_node->all_list); -+ kmem_cache_free(stable_node_cache, stable_node); -+} -+ -+static inline struct tree_node *alloc_tree_node(struct list_head *list) -+{ -+ struct tree_node *node; -+ -+ node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL | -+ __GFP_NORETRY | __GFP_NOWARN); -+ if (!node) -+ return NULL; -+ -+ list_add(&node->all_list, list); -+ return node; -+} -+ -+static inline void free_tree_node(struct tree_node *node) -+{ -+ list_del(&node->all_list); -+ kmem_cache_free(tree_node_cache, node); -+} -+ -+static void uksm_drop_anon_vma(struct rmap_item *rmap_item) -+{ -+ struct anon_vma *anon_vma = rmap_item->anon_vma; -+ -+ put_anon_vma(anon_vma); -+} -+ -+ -+/** -+ * Remove a stable node from stable_tree, may unlink from its tree_node and -+ * may remove its parent tree_node if no other stable node is pending. -+ * -+ * @stable_node The node need to be removed -+ * @unlink_rb Will this node be unlinked from the rbtree? -+ * @remove_tree_ node Will its tree_node be removed if empty? 
-+ */ -+static void remove_node_from_stable_tree(struct stable_node *stable_node, -+ int unlink_rb, int remove_tree_node) -+{ -+ struct node_vma *node_vma; -+ struct rmap_item *rmap_item; -+ struct hlist_node *n; -+ -+ if (!hlist_empty(&stable_node->hlist)) { -+ hlist_for_each_entry_safe(node_vma, n, -+ &stable_node->hlist, hlist) { -+ hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) { -+ uksm_pages_sharing--; -+ -+ uksm_drop_anon_vma(rmap_item); -+ rmap_item->address &= PAGE_MASK; -+ } -+ free_node_vma(node_vma); -+ cond_resched(); -+ } -+ -+ /* the last one is counted as shared */ -+ uksm_pages_shared--; -+ uksm_pages_sharing++; -+ } -+ -+ if (stable_node->tree_node && unlink_rb) { -+ rb_erase(&stable_node->node, -+ &stable_node->tree_node->sub_root); -+ -+ if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) && -+ remove_tree_node) { -+ rb_erase(&stable_node->tree_node->node, -+ root_stable_treep); -+ free_tree_node(stable_node->tree_node); -+ } else { -+ stable_node->tree_node->count--; -+ } -+ } -+ -+ free_stable_node(stable_node); -+} -+ -+ -+/* -+ * get_uksm_page: checks if the page indicated by the stable node -+ * is still its ksm page, despite having held no reference to it. -+ * In which case we can trust the content of the page, and it -+ * returns the gotten page; but if the page has now been zapped, -+ * remove the stale node from the stable tree and return NULL. -+ * -+ * You would expect the stable_node to hold a reference to the ksm page. -+ * But if it increments the page's count, swapping out has to wait for -+ * ksmd to come around again before it can free the page, which may take -+ * seconds or even minutes: much too unresponsive. So instead we use a -+ * "keyhole reference": access to the ksm page from the stable node peeps -+ * out through its keyhole to see if that page still holds the right key, -+ * pointing back to this stable node. This relies on freeing a PageAnon -+ * page to reset its page->mapping to NULL, and relies on no other use of -+ * a page to put something that might look like our key in page->mapping. -+ * -+ * include/linux/pagemap.h page_cache_get_speculative() is a good reference, -+ * but this is different - made simpler by uksm_thread_mutex being held, but -+ * interesting for assuming that no other use of the struct page could ever -+ * put our expected_mapping into page->mapping (or a field of the union which -+ * coincides with page->mapping). The RCU calls are not for KSM at all, but -+ * to keep the page_count protocol described with page_cache_get_speculative. -+ * -+ * Note: it is possible that get_uksm_page() will return NULL one moment, -+ * then page the next, if the page is in between page_freeze_refs() and -+ * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page -+ * is on its way to being freed; but it is an anomaly to bear in mind. -+ * -+ * @unlink_rb: if the removal of this node will firstly unlink from -+ * its rbtree. stable_node_reinsert will prevent this when restructuring the -+ * node from its old tree. -+ * -+ * @remove_tree_node: if this is the last one of its tree_node, will the -+ * tree_node be freed ? If we are inserting stable node, this tree_node may -+ * be reused, so don't free it. 
-+ */ -+static struct page *get_uksm_page(struct stable_node *stable_node, -+ int unlink_rb, int remove_tree_node) -+{ -+ struct page *page; -+ void *expected_mapping; -+ unsigned long kpfn; -+ -+ expected_mapping = (void *)((unsigned long)stable_node | -+ PAGE_MAPPING_KSM); -+again: -+ kpfn = READ_ONCE(stable_node->kpfn); -+ page = pfn_to_page(kpfn); -+ -+ /* -+ * page is computed from kpfn, so on most architectures reading -+ * page->mapping is naturally ordered after reading node->kpfn, -+ * but on Alpha we need to be more careful. -+ */ -+ smp_rmb(); -+ -+ if (READ_ONCE(page->mapping) != expected_mapping) -+ goto stale; -+ -+ /* -+ * We cannot do anything with the page while its refcount is 0. -+ * Usually 0 means free, or tail of a higher-order page: in which -+ * case this node is no longer referenced, and should be freed; -+ * however, it might mean that the page is under page_freeze_refs(). -+ * The __remove_mapping() case is easy, again the node is now stale; -+ * but if page is swapcache in migrate_page_move_mapping(), it might -+ * still be our page, in which case it's essential to keep the node. -+ */ -+ while (!get_page_unless_zero(page)) { -+ /* -+ * Another check for page->mapping != expected_mapping would -+ * work here too. We have chosen the !PageSwapCache test to -+ * optimize the common case, when the page is or is about to -+ * be freed: PageSwapCache is cleared (under spin_lock_irq) -+ * in the freeze_refs section of __remove_mapping(); but Anon -+ * page->mapping reset to NULL later, in free_pages_prepare(). -+ */ -+ if (!PageSwapCache(page)) -+ goto stale; -+ cpu_relax(); -+ } -+ -+ if (READ_ONCE(page->mapping) != expected_mapping) { -+ put_page(page); -+ goto stale; -+ } -+ -+ lock_page(page); -+ if (READ_ONCE(page->mapping) != expected_mapping) { -+ unlock_page(page); -+ put_page(page); -+ goto stale; -+ } -+ unlock_page(page); -+ return page; -+stale: -+ /* -+ * We come here from above when page->mapping or !PageSwapCache -+ * suggests that the node is stale; but it might be under migration. -+ * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(), -+ * before checking whether node->kpfn has been changed. -+ */ -+ smp_rmb(); -+ if (stable_node->kpfn != kpfn) -+ goto again; -+ -+ remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node); -+ -+ return NULL; -+} -+ -+/* -+ * Removing rmap_item from stable or unstable tree. -+ * This function will clean the information from the stable/unstable tree. -+ */ -+static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item) -+{ -+ if (rmap_item->address & STABLE_FLAG) { -+ struct stable_node *stable_node; -+ struct node_vma *node_vma; -+ struct page *page; -+ -+ node_vma = rmap_item->head; -+ stable_node = node_vma->head; -+ page = get_uksm_page(stable_node, 1, 1); -+ if (!page) -+ goto out; -+ -+ /* -+ * page lock is needed because it's racing with -+ * try_to_unmap_ksm(), etc. -+ */ -+ lock_page(page); -+ hlist_del(&rmap_item->hlist); -+ -+ if (hlist_empty(&node_vma->rmap_hlist)) { -+ hlist_del(&node_vma->hlist); -+ free_node_vma(node_vma); -+ } -+ unlock_page(page); -+ -+ put_page(page); -+ if (hlist_empty(&stable_node->hlist)) { -+ /* do NOT call remove_node_from_stable_tree() here, -+ * it's possible for a forked rmap_item not in -+ * stable tree while the in-tree rmap_items were -+ * deleted. 
-+ */ -+ uksm_pages_shared--; -+ } else -+ uksm_pages_sharing--; -+ -+ -+ uksm_drop_anon_vma(rmap_item); -+ } else if (rmap_item->address & UNSTABLE_FLAG) { -+ if (rmap_item->hash_round == uksm_hash_round) { -+ -+ rb_erase(&rmap_item->node, -+ &rmap_item->tree_node->sub_root); -+ if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) { -+ rb_erase(&rmap_item->tree_node->node, -+ &root_unstable_tree); -+ -+ free_tree_node(rmap_item->tree_node); -+ } else -+ rmap_item->tree_node->count--; -+ } -+ uksm_pages_unshared--; -+ } -+ -+ rmap_item->address &= PAGE_MASK; -+ rmap_item->hash_max = 0; -+ -+out: -+ cond_resched(); /* we're called from many long loops */ -+} -+ -+static inline int slot_in_uksm(struct vma_slot *slot) -+{ -+ return list_empty(&slot->slot_list); -+} -+ -+/* -+ * Test if the mm is exiting -+ */ -+static inline bool uksm_test_exit(struct mm_struct *mm) -+{ -+ return atomic_read(&mm->mm_users) == 0; -+} -+ -+static inline unsigned long vma_pool_size(struct vma_slot *slot) -+{ -+ return round_up(sizeof(struct rmap_list_entry) * slot->pages, -+ PAGE_SIZE) >> PAGE_SHIFT; -+} -+ -+#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta)) -+ -+/* must be done with sem locked */ -+static int slot_pool_alloc(struct vma_slot *slot) -+{ -+ unsigned long pool_size; -+ -+ if (slot->rmap_list_pool) -+ return 0; -+ -+ pool_size = vma_pool_size(slot); -+ slot->rmap_list_pool = kcalloc(pool_size, sizeof(struct page *), -+ GFP_KERNEL); -+ if (!slot->rmap_list_pool) -+ return -ENOMEM; -+ -+ slot->pool_counts = kcalloc(pool_size, sizeof(unsigned int), -+ GFP_KERNEL); -+ if (!slot->pool_counts) { -+ kfree(slot->rmap_list_pool); -+ return -ENOMEM; -+ } -+ -+ slot->pool_size = pool_size; -+ BUG_ON(CAN_OVERFLOW_U64(uksm_pages_total, slot->pages)); -+ slot->flags |= UKSM_SLOT_IN_UKSM; -+ uksm_pages_total += slot->pages; -+ -+ return 0; -+} -+ -+/* -+ * Called after vma is unlinked from its mm -+ */ -+void uksm_remove_vma(struct vm_area_struct *vma) -+{ -+ struct vma_slot *slot; -+ -+ if (!vma->uksm_vma_slot) -+ return; -+ -+ spin_lock(&vma_slot_list_lock); -+ slot = vma->uksm_vma_slot; -+ if (!slot) -+ goto out; -+ -+ if (slot_in_uksm(slot)) { -+ /** -+ * This slot has been added by ksmd, so move to the del list -+ * waiting ksmd to free it. -+ */ -+ list_add_tail(&slot->slot_list, &vma_slot_del); -+ } else { -+ /** -+ * It's still on new list. It's ok to free slot directly. -+ */ -+ list_del(&slot->slot_list); -+ free_vma_slot(slot); -+ } -+out: -+ vma->uksm_vma_slot = NULL; -+ spin_unlock(&vma_slot_list_lock); -+} -+ -+/** -+ * Need to do two things: -+ * 1. check if slot was moved to del list -+ * 2. make sure the mmap_sem is manipulated under valid vma. -+ * -+ * My concern here is that in some cases, this may make -+ * vma_slot_list_lock() waiters to serialized further by some -+ * sem->wait_lock, can this really be expensive? -+ * -+ * -+ * @return -+ * 0: if successfully locked mmap_sem -+ * -ENOENT: this slot was moved to del list -+ * -EBUSY: vma lock failed -+ */ -+static int try_down_read_slot_mmap_sem(struct vma_slot *slot) -+{ -+ struct vm_area_struct *vma; -+ struct mm_struct *mm; -+ struct rw_semaphore *sem; -+ -+ spin_lock(&vma_slot_list_lock); -+ -+ /* the slot_list was removed and inited from new list, when it enters -+ * uksm_list. 
If it is not empty now, then it must have been moved to the del list
-+ */
-+ if (!slot_in_uksm(slot)) {
-+ spin_unlock(&vma_slot_list_lock);
-+ return -ENOENT;
-+ }
-+
-+ BUG_ON(slot->pages != vma_pages(slot->vma));
-+ /* Ok, vma still valid */
-+ vma = slot->vma;
-+ mm = vma->vm_mm;
-+ sem = &mm->mmap_lock;
-+
-+ if (uksm_test_exit(mm)) {
-+ spin_unlock(&vma_slot_list_lock);
-+ return -ENOENT;
-+ }
-+
-+ if (down_read_trylock(sem)) {
-+ spin_unlock(&vma_slot_list_lock);
-+ if (slot_pool_alloc(slot)) {
-+ uksm_remove_vma(vma);
-+ up_read(sem);
-+ return -ENOENT;
-+ }
-+ return 0;
-+ }
-+
-+ spin_unlock(&vma_slot_list_lock);
-+ return -EBUSY;
-+}
-+
-+static inline unsigned long
-+vma_page_address(struct page *page, struct vm_area_struct *vma)
-+{
-+ pgoff_t pgoff = page->index;
-+ unsigned long address;
-+
-+ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-+ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
-+ /* page should be within @vma mapping range */
-+ return -EFAULT;
-+ }
-+ return address;
-+}
-+
-+
-+/* return 0 on success with the item's mmap_sem locked */
-+static inline int get_mergeable_page_lock_mmap(struct rmap_item *item)
-+{
-+ struct mm_struct *mm;
-+ struct vma_slot *slot = item->slot;
-+ int err = -EINVAL;
-+
-+ struct page *page;
-+
-+ /*
-+ * try_down_read_slot_mmap_sem() returns non-zero if the slot
-+ * has been removed by uksm_remove_vma().
-+ */
-+ if (try_down_read_slot_mmap_sem(slot))
-+ return -EBUSY;
-+
-+ mm = slot->vma->vm_mm;
-+
-+ if (uksm_test_exit(mm))
-+ goto failout_up;
-+
-+ page = item->page;
-+ rcu_read_lock();
-+ if (!get_page_unless_zero(page)) {
-+ rcu_read_unlock();
-+ goto failout_up;
-+ }
-+
-+ /* No need to consider huge page here. */
-+ if (item->slot->vma->anon_vma != page_anon_vma(page) ||
-+ vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) {
-+ /*
-+ * TODO:
-+ * should we release this item because of its stale page
-+ * mapping?
-+ */
-+ put_page(page);
-+ rcu_read_unlock();
-+ goto failout_up;
-+ }
-+ rcu_read_unlock();
-+ return 0;
-+
-+failout_up:
-+ mmap_read_unlock(mm);
-+ return err;
-+}
-+
-+/*
-+ * What kind of VMA is considered?
-+ */
-+static inline int vma_can_enter(struct vm_area_struct *vma)
-+{
-+ return uksm_flags_can_scan(vma->vm_flags);
-+}
-+
-+/*
-+ * Called whenever a fresh new vma is created. A new vma_slot
-+ * is created and inserted into a global list. Must be called
-+ * after the vma is inserted into its mm.
-+ */ -+void uksm_vma_add_new(struct vm_area_struct *vma) -+{ -+ struct vma_slot *slot; -+ -+ if (!vma_can_enter(vma)) { -+ vma->uksm_vma_slot = NULL; -+ return; -+ } -+ -+ slot = alloc_vma_slot(); -+ if (!slot) { -+ vma->uksm_vma_slot = NULL; -+ return; -+ } -+ -+ vma->uksm_vma_slot = slot; -+ vma->vm_flags |= VM_MERGEABLE; -+ slot->vma = vma; -+ slot->mm = vma->vm_mm; -+ slot->ctime_j = jiffies; -+ slot->pages = vma_pages(vma); -+ spin_lock(&vma_slot_list_lock); -+ list_add_tail(&slot->slot_list, &vma_slot_new); -+ spin_unlock(&vma_slot_list_lock); -+} -+ -+/* 32/3 < they < 32/2 */ -+#define shiftl 8 -+#define shiftr 12 -+ -+#define HASH_FROM_TO(from, to) \ -+for (index = from; index < to; index++) { \ -+ pos = random_nums[index]; \ -+ hash += key[pos]; \ -+ hash += (hash << shiftl); \ -+ hash ^= (hash >> shiftr); \ -+} -+ -+ -+#define HASH_FROM_DOWN_TO(from, to) \ -+for (index = from - 1; index >= to; index--) { \ -+ hash ^= (hash >> shiftr); \ -+ hash ^= (hash >> (shiftr*2)); \ -+ hash -= (hash << shiftl); \ -+ hash += (hash << (shiftl*2)); \ -+ pos = random_nums[index]; \ -+ hash -= key[pos]; \ -+} -+ -+/* -+ * The main random sample hash function. -+ */ -+static u32 random_sample_hash(void *addr, u32 hash_strength) -+{ -+ u32 hash = 0xdeadbeef; -+ int index, pos, loop = hash_strength; -+ u32 *key = (u32 *)addr; -+ -+ if (loop > HASH_STRENGTH_FULL) -+ loop = HASH_STRENGTH_FULL; -+ -+ HASH_FROM_TO(0, loop); -+ -+ if (hash_strength > HASH_STRENGTH_FULL) { -+ loop = hash_strength - HASH_STRENGTH_FULL; -+ HASH_FROM_TO(0, loop); -+ } -+ -+ return hash; -+} -+ -+ -+/** -+ * It's used when hash strength is adjusted -+ * -+ * @addr The page's virtual address -+ * @from The original hash strength -+ * @to The hash strength changed to -+ * @hash The hash value generated with "from" hash value -+ * -+ * return the hash value -+ */ -+static u32 delta_hash(void *addr, int from, int to, u32 hash) -+{ -+ u32 *key = (u32 *)addr; -+ int index, pos; /* make sure they are int type */ -+ -+ if (to > from) { -+ if (from >= HASH_STRENGTH_FULL) { -+ from -= HASH_STRENGTH_FULL; -+ to -= HASH_STRENGTH_FULL; -+ HASH_FROM_TO(from, to); -+ } else if (to <= HASH_STRENGTH_FULL) { -+ HASH_FROM_TO(from, to); -+ } else { -+ HASH_FROM_TO(from, HASH_STRENGTH_FULL); -+ HASH_FROM_TO(0, to - HASH_STRENGTH_FULL); -+ } -+ } else { -+ if (from <= HASH_STRENGTH_FULL) { -+ HASH_FROM_DOWN_TO(from, to); -+ } else if (to >= HASH_STRENGTH_FULL) { -+ from -= HASH_STRENGTH_FULL; -+ to -= HASH_STRENGTH_FULL; -+ HASH_FROM_DOWN_TO(from, to); -+ } else { -+ HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0); -+ HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to); -+ } -+ } -+ -+ return hash; -+} -+ -+/** -+ * -+ * Called when: rshash_pos or rshash_neg is about to overflow or a scan round -+ * has finished. -+ * -+ * return 0 if no page has been scanned since last call, 1 otherwise. 
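-+ *
-+ * Note on the encoding (added for clarity): benefit.pos, benefit.neg and
-+ * benefit.scanned are all kept scaled down by 2^benefit.base, so the true
-+ * accumulated value is approximately (stored value << base). Whenever an
-+ * addition would overflow a u64, every stored value is halved and base is
-+ * incremented by one.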
-+ */ -+static inline int encode_benefit(void) -+{ -+ u64 scanned_delta, pos_delta, neg_delta; -+ unsigned long base = benefit.base; -+ -+ scanned_delta = uksm_pages_scanned - uksm_pages_scanned_last; -+ -+ if (!scanned_delta) -+ return 0; -+ -+ scanned_delta >>= base; -+ pos_delta = rshash_pos >> base; -+ neg_delta = rshash_neg >> base; -+ -+ if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) || -+ CAN_OVERFLOW_U64(benefit.neg, neg_delta) || -+ CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) { -+ benefit.scanned >>= 1; -+ benefit.neg >>= 1; -+ benefit.pos >>= 1; -+ benefit.base++; -+ scanned_delta >>= 1; -+ pos_delta >>= 1; -+ neg_delta >>= 1; -+ } -+ -+ benefit.pos += pos_delta; -+ benefit.neg += neg_delta; -+ benefit.scanned += scanned_delta; -+ -+ BUG_ON(!benefit.scanned); -+ -+ rshash_pos = rshash_neg = 0; -+ uksm_pages_scanned_last = uksm_pages_scanned; -+ -+ return 1; -+} -+ -+static inline void reset_benefit(void) -+{ -+ benefit.pos = 0; -+ benefit.neg = 0; -+ benefit.base = 0; -+ benefit.scanned = 0; -+} -+ -+static inline void inc_rshash_pos(unsigned long delta) -+{ -+ if (CAN_OVERFLOW_U64(rshash_pos, delta)) -+ encode_benefit(); -+ -+ rshash_pos += delta; -+} -+ -+static inline void inc_rshash_neg(unsigned long delta) -+{ -+ if (CAN_OVERFLOW_U64(rshash_neg, delta)) -+ encode_benefit(); -+ -+ rshash_neg += delta; -+} -+ -+ -+static inline u32 page_hash(struct page *page, unsigned long hash_strength, -+ int cost_accounting) -+{ -+ u32 val; -+ unsigned long delta; -+ -+ void *addr = kmap_atomic(page); -+ -+ val = random_sample_hash(addr, hash_strength); -+ kunmap_atomic(addr); -+ -+ if (cost_accounting) { -+ if (hash_strength < HASH_STRENGTH_FULL) -+ delta = HASH_STRENGTH_FULL - hash_strength; -+ else -+ delta = 0; -+ -+ inc_rshash_pos(delta); -+ } -+ -+ return val; -+} -+ -+static int memcmp_pages_with_cost(struct page *page1, struct page *page2, -+ int cost_accounting) -+{ -+ char *addr1, *addr2; -+ int ret; -+ -+ addr1 = kmap_atomic(page1); -+ addr2 = kmap_atomic(page2); -+ ret = memcmp(addr1, addr2, PAGE_SIZE); -+ kunmap_atomic(addr2); -+ kunmap_atomic(addr1); -+ -+ if (cost_accounting) -+ inc_rshash_neg(memcmp_cost); -+ -+ return ret; -+} -+ -+static inline int pages_identical_with_cost(struct page *page1, struct page *page2) -+{ -+ return !memcmp_pages_with_cost(page1, page2, 0); -+} -+ -+static inline int is_page_full_zero(struct page *page) -+{ -+ char *addr; -+ int ret; -+ -+ addr = kmap_atomic(page); -+ ret = is_full_zero(addr, PAGE_SIZE); -+ kunmap_atomic(addr); -+ -+ return ret; -+} -+ -+static int write_protect_page(struct vm_area_struct *vma, struct page *page, -+ pte_t *orig_pte, pte_t *old_pte) -+{ -+ struct mm_struct *mm = vma->vm_mm; -+ struct page_vma_mapped_walk pvmw = { -+ .page = page, -+ .vma = vma, -+ }; -+ struct mmu_notifier_range range; -+ int swapped; -+ int err = -EFAULT; -+ -+ pvmw.address = page_address_in_vma(page, vma); -+ if (pvmw.address == -EFAULT) -+ goto out; -+ -+ BUG_ON(PageTransCompound(page)); -+ -+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, pvmw.address, -+ pvmw.address + PAGE_SIZE); -+ mmu_notifier_invalidate_range_start(&range); -+ -+ if (!page_vma_mapped_walk(&pvmw)) -+ goto out_mn; -+ if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?")) -+ goto out_unlock; -+ -+ if (old_pte) -+ *old_pte = *pvmw.pte; -+ -+ if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) || -+ (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) || mm_tlb_flush_pending(mm)) { -+ pte_t entry; -+ -+ swapped = PageSwapCache(page); -+ flush_cache_page(vma, 
pvmw.address, page_to_pfn(page)); -+ /* -+ * Ok this is tricky, when get_user_pages_fast() run it doesn't -+ * take any lock, therefore the check that we are going to make -+ * with the pagecount against the mapcount is racey and -+ * O_DIRECT can happen right after the check. -+ * So we clear the pte and flush the tlb before the check -+ * this assure us that no O_DIRECT can happen after the check -+ * or in the middle of the check. -+ */ -+ entry = ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte); -+ /* -+ * Check that no O_DIRECT or similar I/O is in progress on the -+ * page -+ */ -+ if (page_mapcount(page) + 1 + swapped != page_count(page)) { -+ set_pte_at(mm, pvmw.address, pvmw.pte, entry); -+ goto out_unlock; -+ } -+ if (pte_dirty(entry)) -+ set_page_dirty(page); -+ -+ if (pte_protnone(entry)) -+ entry = pte_mkclean(pte_clear_savedwrite(entry)); -+ else -+ entry = pte_mkclean(pte_wrprotect(entry)); -+ -+ set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry); -+ } -+ *orig_pte = *pvmw.pte; -+ err = 0; -+ -+out_unlock: -+ page_vma_mapped_walk_done(&pvmw); -+out_mn: -+ mmu_notifier_invalidate_range_end(&range); -+out: -+ return err; -+} -+ -+#define MERGE_ERR_PGERR 1 /* the page is invalid cannot continue */ -+#define MERGE_ERR_COLLI 2 /* there is a collision */ -+#define MERGE_ERR_COLLI_MAX 3 /* collision at the max hash strength */ -+#define MERGE_ERR_CHANGED 4 /* the page has changed since last hash */ -+ -+ -+/** -+ * replace_page - replace page in vma by new ksm page -+ * @vma: vma that holds the pte pointing to page -+ * @page: the page we are replacing by kpage -+ * @kpage: the ksm page we replace page by -+ * @orig_pte: the original value of the pte -+ * -+ * Returns 0 on success, MERGE_ERR_PGERR on failure. -+ */ -+static int replace_page(struct vm_area_struct *vma, struct page *page, -+ struct page *kpage, pte_t orig_pte) -+{ -+ struct mm_struct *mm = vma->vm_mm; -+ struct mmu_notifier_range range; -+ pgd_t *pgd; -+ p4d_t *p4d; -+ pud_t *pud; -+ pmd_t *pmd; -+ pte_t *ptep; -+ spinlock_t *ptl; -+ pte_t entry; -+ -+ unsigned long addr; -+ int err = MERGE_ERR_PGERR; -+ -+ addr = page_address_in_vma(page, vma); -+ if (addr == -EFAULT) -+ goto out; -+ -+ pgd = pgd_offset(mm, addr); -+ if (!pgd_present(*pgd)) -+ goto out; -+ -+ p4d = p4d_offset(pgd, addr); -+ pud = pud_offset(p4d, addr); -+ if (!pud_present(*pud)) -+ goto out; -+ -+ pmd = pmd_offset(pud, addr); -+ BUG_ON(pmd_trans_huge(*pmd)); -+ if (!pmd_present(*pmd)) -+ goto out; -+ -+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, -+ addr + PAGE_SIZE); -+ mmu_notifier_invalidate_range_start(&range); -+ -+ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); -+ if (!pte_same(*ptep, orig_pte)) { -+ pte_unmap_unlock(ptep, ptl); -+ goto out_mn; -+ } -+ -+ flush_cache_page(vma, addr, pte_pfn(*ptep)); -+ ptep_clear_flush_notify(vma, addr, ptep); -+ entry = mk_pte(kpage, vma->vm_page_prot); -+ -+ /* special treatment is needed for zero_page */ -+ if ((page_to_pfn(kpage) == uksm_zero_pfn) || -+ (page_to_pfn(kpage) == zero_pfn)) { -+ entry = pte_mkspecial(entry); -+ dec_mm_counter(mm, MM_ANONPAGES); -+ inc_zone_page_state(page, NR_UKSM_ZERO_PAGES); -+ } else { -+ get_page(kpage); -+ page_add_anon_rmap(kpage, vma, addr, false); -+ } -+ -+ set_pte_at_notify(mm, addr, ptep, entry); -+ -+ page_remove_rmap(page, false); -+ if (!page_mapped(page)) -+ try_to_free_swap(page); -+ put_page(page); -+ -+ pte_unmap_unlock(ptep, ptl); -+ err = 0; -+out_mn: -+ mmu_notifier_invalidate_range_end(&range); -+out: -+ return err; -+} -+ 
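-+/*
-+ * Aside (not part of the original patch): the sampling idea behind
-+ * random_sample_hash() above, in minimal form. Only `strength` of the
-+ * page's PAGE_SIZE / 4 32-bit words are mixed into the hash, chosen via
-+ * the precomputed random_nums permutation, so hashing cost scales with
-+ * the adaptive hash strength rather than with the page size:
-+ *
-+ *	u32 hash = 0xdeadbeef;
-+ *	for (i = 0; i < strength; i++) {
-+ *		hash += key[random_nums[i]];
-+ *		hash += (hash << shiftl);
-+ *		hash ^= (hash >> shiftr);
-+ *	}
-+ */
-+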
-+
-+/**
-+ * Fully hash a page with HASH_STRENGTH_MAX and return a non-zero hash
-+ * value. The zero hash value at HASH_STRENGTH_MAX is used to indicate
-+ * that its hash_max member has not been calculated.
-+ *
-+ * @page The page needs to be hashed
-+ * @hash_old The hash value calculated with the current hash strength
-+ *
-+ * return the new hash value calculated at HASH_STRENGTH_MAX
-+ */
-+static inline u32 page_hash_max(struct page *page, u32 hash_old)
-+{
-+ u32 hash_max = 0;
-+ void *addr;
-+
-+ addr = kmap_atomic(page);
-+ hash_max = delta_hash(addr, hash_strength,
-+ HASH_STRENGTH_MAX, hash_old);
-+
-+ kunmap_atomic(addr);
-+
-+ if (!hash_max)
-+ hash_max = 1;
-+
-+ inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
-+ return hash_max;
-+}
-+
-+/*
-+ * We compare the hash again, to ensure that it is really a hash collision
-+ * instead of being caused by page write.
-+ */
-+static inline int check_collision(struct rmap_item *rmap_item,
-+ u32 hash)
-+{
-+ int err;
-+ struct page *page = rmap_item->page;
-+
-+ /* if this rmap_item has already been hash_maxed, then the collision
-+ * must appear in the second-level rbtree search. In this case we check
-+ * if its hash_max value has been changed. Otherwise, the collision
-+ * happens in the first-level rbtree search, so we check against its
-+ * current hash value.
-+ */
-+ if (rmap_item->hash_max) {
-+ inc_rshash_neg(memcmp_cost);
-+ inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
-+
-+ if (rmap_item->hash_max == page_hash_max(page, hash))
-+ err = MERGE_ERR_COLLI;
-+ else
-+ err = MERGE_ERR_CHANGED;
-+ } else {
-+ inc_rshash_neg(memcmp_cost + hash_strength);
-+
-+ if (page_hash(page, hash_strength, 0) == hash)
-+ err = MERGE_ERR_COLLI;
-+ else
-+ err = MERGE_ERR_CHANGED;
-+ }
-+
-+ return err;
-+}
-+
-+/**
-+ * Try to merge a rmap_item.page with a kpage in a stable node. kpage must
-+ * already be a ksm page.
-+ *
-+ * @return 0 if the pages were merged, -EFAULT otherwise.
-+ */
-+static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item,
-+ struct page *kpage, u32 hash)
-+{
-+ struct vm_area_struct *vma = rmap_item->slot->vma;
-+ struct mm_struct *mm = vma->vm_mm;
-+ pte_t orig_pte = __pte(0);
-+ int err = MERGE_ERR_PGERR;
-+ struct page *page;
-+
-+ if (uksm_test_exit(mm))
-+ goto out;
-+
-+ page = rmap_item->page;
-+
-+ if (page == kpage) { /* ksm page forked */
-+ err = 0;
-+ goto out;
-+ }
-+
-+ /*
-+ * We need the page lock to read a stable PageSwapCache in
-+ * write_protect_page(). We use trylock_page() instead of
-+ * lock_page() because we don't want to wait here - we
-+ * prefer to continue scanning and merging different pages,
-+ * then come back to this page when it is unlocked.
-+ */
-+ if (!trylock_page(page))
-+ goto out;
-+
-+ if (!PageAnon(page) || !PageKsm(kpage))
-+ goto out_unlock;
-+
-+ if (PageTransCompound(page)) {
-+ err = split_huge_page(page);
-+ if (err)
-+ goto out_unlock;
-+ }
-+
-+ /*
-+ * If this anonymous page is mapped only here, its pte may need
-+ * to be write-protected. If it's mapped elsewhere, all of its
-+ * ptes are necessarily already write-protected. But in either
-+ * case, we need to lock and check page_count is not raised.
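-+ * (write_protect_page() does that check by comparing
-+ * page_mapcount(page) + 1 + swapped against page_count(page):
-+ * any extra reference, e.g. a transient O_DIRECT pin, makes the
-+ * counts disagree and the merge is abandoned.)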
-+ */ -+ if (write_protect_page(vma, page, &orig_pte, NULL) == 0) { -+ if (pages_identical_with_cost(page, kpage)) -+ err = replace_page(vma, page, kpage, orig_pte); -+ else -+ err = check_collision(rmap_item, hash); -+ } -+ -+ if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { -+ munlock_vma_page(page); -+ if (!PageMlocked(kpage)) { -+ unlock_page(page); -+ lock_page(kpage); -+ mlock_vma_page(kpage); -+ page = kpage; /* for final unlock */ -+ } -+ } -+ -+out_unlock: -+ unlock_page(page); -+out: -+ return err; -+} -+ -+ -+ -+/** -+ * If two pages fail to merge in try_to_merge_two_pages, then we have a chance -+ * to restore a page mapping that has been changed in try_to_merge_two_pages. -+ * -+ * @return 0 on success. -+ */ -+static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr, -+ pte_t orig_pte, pte_t wprt_pte) -+{ -+ struct mm_struct *mm = vma->vm_mm; -+ pgd_t *pgd; -+ p4d_t *p4d; -+ pud_t *pud; -+ pmd_t *pmd; -+ pte_t *ptep; -+ spinlock_t *ptl; -+ -+ int err = -EFAULT; -+ -+ pgd = pgd_offset(mm, addr); -+ if (!pgd_present(*pgd)) -+ goto out; -+ -+ p4d = p4d_offset(pgd, addr); -+ pud = pud_offset(p4d, addr); -+ if (!pud_present(*pud)) -+ goto out; -+ -+ pmd = pmd_offset(pud, addr); -+ if (!pmd_present(*pmd)) -+ goto out; -+ -+ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); -+ if (!pte_same(*ptep, wprt_pte)) { -+ /* already copied, let it be */ -+ pte_unmap_unlock(ptep, ptl); -+ goto out; -+ } -+ -+ /* -+ * Good boy, still here. When we still get the ksm page, it does not -+ * return to the free page pool, there is no way that a pte was changed -+ * to other page and gets back to this page. And remind that ksm page -+ * do not reuse in do_wp_page(). So it's safe to restore the original -+ * pte. -+ */ -+ flush_cache_page(vma, addr, pte_pfn(*ptep)); -+ ptep_clear_flush_notify(vma, addr, ptep); -+ set_pte_at_notify(mm, addr, ptep, orig_pte); -+ -+ pte_unmap_unlock(ptep, ptl); -+ err = 0; -+out: -+ return err; -+} -+ -+/** -+ * try_to_merge_two_pages() - take two identical pages and prepare -+ * them to be merged into one page(rmap_item->page) -+ * -+ * @return 0 if we successfully merged two identical pages into -+ * one ksm page. MERGE_ERR_COLLI if it's only a hash collision -+ * search in rbtree. MERGE_ERR_CHANGED if rmap_item has been -+ * changed since it's hashed. MERGE_ERR_PGERR otherwise. -+ * -+ */ -+static int try_to_merge_two_pages(struct rmap_item *rmap_item, -+ struct rmap_item *tree_rmap_item, -+ u32 hash) -+{ -+ pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0); -+ pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0); -+ struct vm_area_struct *vma1 = rmap_item->slot->vma; -+ struct vm_area_struct *vma2 = tree_rmap_item->slot->vma; -+ struct page *page = rmap_item->page; -+ struct page *tree_page = tree_rmap_item->page; -+ int err = MERGE_ERR_PGERR; -+ struct address_space *saved_mapping; -+ -+ -+ if (rmap_item->page == tree_rmap_item->page) -+ goto out; -+ -+ if (!trylock_page(page)) -+ goto out; -+ -+ if (!PageAnon(page)) -+ goto out_unlock; -+ -+ if (PageTransCompound(page)) { -+ err = split_huge_page(page); -+ if (err) -+ goto out_unlock; -+ } -+ -+ if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) { -+ unlock_page(page); -+ goto out; -+ } -+ -+ /* -+ * While we hold page lock, upgrade page from -+ * PageAnon+anon_vma to PageKsm+NULL stable_node: -+ * stable_tree_insert() will update stable_node. 
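-+ * (PageKsm is encoded by the PAGE_MAPPING_KSM bits in page->mapping,
-+ * which is why saved_mapping is kept below: the original mapping can
-+ * then be restored if the merge falls through.)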
-+ */ -+ saved_mapping = page->mapping; -+ set_page_stable_node(page, NULL); -+ mark_page_accessed(page); -+ if (!PageDirty(page)) -+ SetPageDirty(page); -+ -+ unlock_page(page); -+ -+ if (!trylock_page(tree_page)) -+ goto restore_out; -+ -+ if (!PageAnon(tree_page)) { -+ unlock_page(tree_page); -+ goto restore_out; -+ } -+ -+ if (PageTransCompound(tree_page)) { -+ err = split_huge_page(tree_page); -+ if (err) { -+ unlock_page(tree_page); -+ goto restore_out; -+ } -+ } -+ -+ if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) { -+ unlock_page(tree_page); -+ goto restore_out; -+ } -+ -+ if (pages_identical_with_cost(page, tree_page)) { -+ err = replace_page(vma2, tree_page, page, wprt_pte2); -+ if (err) { -+ unlock_page(tree_page); -+ goto restore_out; -+ } -+ -+ if ((vma2->vm_flags & VM_LOCKED)) { -+ munlock_vma_page(tree_page); -+ if (!PageMlocked(page)) { -+ unlock_page(tree_page); -+ lock_page(page); -+ mlock_vma_page(page); -+ tree_page = page; /* for final unlock */ -+ } -+ } -+ -+ unlock_page(tree_page); -+ -+ goto out; /* success */ -+ -+ } else { -+ if (tree_rmap_item->hash_max && -+ tree_rmap_item->hash_max == rmap_item->hash_max) { -+ err = MERGE_ERR_COLLI_MAX; -+ } else if (page_hash(page, hash_strength, 0) == -+ page_hash(tree_page, hash_strength, 0)) { -+ inc_rshash_neg(memcmp_cost + hash_strength * 2); -+ err = MERGE_ERR_COLLI; -+ } else { -+ err = MERGE_ERR_CHANGED; -+ } -+ -+ unlock_page(tree_page); -+ } -+ -+restore_out: -+ lock_page(page); -+ if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item), -+ orig_pte1, wprt_pte1)) -+ page->mapping = saved_mapping; -+ -+out_unlock: -+ unlock_page(page); -+out: -+ return err; -+} -+ -+static inline int hash_cmp(u32 new_val, u32 node_val) -+{ -+ if (new_val > node_val) -+ return 1; -+ else if (new_val < node_val) -+ return -1; -+ else -+ return 0; -+} -+ -+static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash) -+{ -+ u32 hash_max = item->hash_max; -+ -+ if (!hash_max) { -+ hash_max = page_hash_max(item->page, hash); -+ -+ item->hash_max = hash_max; -+ } -+ -+ return hash_max; -+} -+ -+ -+ -+/** -+ * stable_tree_search() - search the stable tree for a page -+ * -+ * @item: the rmap_item we are comparing with -+ * @hash: the hash value of this item->page already calculated -+ * -+ * @return the page we have found, NULL otherwise. The page returned has -+ * been gotten. -+ */ -+static struct page *stable_tree_search(struct rmap_item *item, u32 hash) -+{ -+ struct rb_node *node = root_stable_treep->rb_node; -+ struct tree_node *tree_node; -+ unsigned long hash_max; -+ struct page *page = item->page; -+ struct stable_node *stable_node; -+ -+ stable_node = page_stable_node(page); -+ if (stable_node) { -+ /* ksm page forked, that is -+ * if (PageKsm(page) && !in_stable_tree(rmap_item)) -+ * it's actually gotten once outside. -+ */ -+ get_page(page); -+ return page; -+ } -+ -+ while (node) { -+ int cmp; -+ -+ tree_node = rb_entry(node, struct tree_node, node); -+ -+ cmp = hash_cmp(hash, tree_node->hash); -+ -+ if (cmp < 0) -+ node = node->rb_left; -+ else if (cmp > 0) -+ node = node->rb_right; -+ else -+ break; -+ } -+ -+ if (!node) -+ return NULL; -+ -+ if (tree_node->count == 1) { -+ stable_node = rb_entry(tree_node->sub_root.rb_node, -+ struct stable_node, node); -+ BUG_ON(!stable_node); -+ -+ goto get_page_out; -+ } -+ -+ /* -+ * ok, we have to search the second -+ * level subtree, hash the page to a -+ * full strength. 
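-+ * (The stable tree is two-level: first-level tree_nodes are keyed by
-+ * the sampled hash, and each holds a sub-rbtree of stable_nodes keyed
-+ * by the full-strength hash_max.)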
-+ */ -+ node = tree_node->sub_root.rb_node; -+ BUG_ON(!node); -+ hash_max = rmap_item_hash_max(item, hash); -+ -+ while (node) { -+ int cmp; -+ -+ stable_node = rb_entry(node, struct stable_node, node); -+ -+ cmp = hash_cmp(hash_max, stable_node->hash_max); -+ -+ if (cmp < 0) -+ node = node->rb_left; -+ else if (cmp > 0) -+ node = node->rb_right; -+ else -+ goto get_page_out; -+ } -+ -+ return NULL; -+ -+get_page_out: -+ page = get_uksm_page(stable_node, 1, 1); -+ return page; -+} -+ -+static int try_merge_rmap_item(struct rmap_item *item, -+ struct page *kpage, -+ struct page *tree_page) -+{ -+ struct vm_area_struct *vma = item->slot->vma; -+ struct page_vma_mapped_walk pvmw = { -+ .page = kpage, -+ .vma = vma, -+ }; -+ -+ pvmw.address = get_rmap_addr(item); -+ if (!page_vma_mapped_walk(&pvmw)) -+ return 0; -+ -+ if (pte_write(*pvmw.pte)) { -+ /* has changed, abort! */ -+ page_vma_mapped_walk_done(&pvmw); -+ return 0; -+ } -+ -+ get_page(tree_page); -+ page_add_anon_rmap(tree_page, vma, pvmw.address, false); -+ -+ flush_cache_page(vma, pvmw.address, page_to_pfn(kpage)); -+ ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte); -+ set_pte_at_notify(vma->vm_mm, pvmw.address, pvmw.pte, -+ mk_pte(tree_page, vma->vm_page_prot)); -+ -+ page_remove_rmap(kpage, false); -+ put_page(kpage); -+ -+ page_vma_mapped_walk_done(&pvmw); -+ -+ return 1; -+} -+ -+/** -+ * try_to_merge_with_stable_page() - when two rmap_items need to be inserted -+ * into stable tree, the page was found to be identical to a stable ksm page, -+ * this is the last chance we can merge them into one. -+ * -+ * @item1: the rmap_item holding the page which we wanted to insert -+ * into stable tree. -+ * @item2: the other rmap_item we found when unstable tree search -+ * @oldpage: the page currently mapped by the two rmap_items -+ * @tree_page: the page we found identical in stable tree node -+ * @success1: return if item1 is successfully merged -+ * @success2: return if item2 is successfully merged -+ */ -+static void try_merge_with_stable(struct rmap_item *item1, -+ struct rmap_item *item2, -+ struct page **kpage, -+ struct page *tree_page, -+ int *success1, int *success2) -+{ -+ struct vm_area_struct *vma1 = item1->slot->vma; -+ struct vm_area_struct *vma2 = item2->slot->vma; -+ *success1 = 0; -+ *success2 = 0; -+ -+ if (unlikely(*kpage == tree_page)) { -+ /* I don't think this can really happen */ -+ pr_warn("UKSM: unexpected condition detected in " -+ "%s -- *kpage == tree_page !\n", __func__); -+ *success1 = 1; -+ *success2 = 1; -+ return; -+ } -+ -+ if (!PageAnon(*kpage) || !PageKsm(*kpage)) -+ goto failed; -+ -+ if (!trylock_page(tree_page)) -+ goto failed; -+ -+ /* If the oldpage is still ksm and still pointed -+ * to in the right place, and still write protected, -+ * we are confident it's not changed, no need to -+ * memcmp anymore. -+ * be ware, we cannot take nested pte locks, -+ * deadlock risk. -+ */ -+ if (!try_merge_rmap_item(item1, *kpage, tree_page)) -+ goto unlock_failed; -+ -+ /* ok, then vma2, remind that pte1 already set */ -+ if (!try_merge_rmap_item(item2, *kpage, tree_page)) -+ goto success_1; -+ -+ *success2 = 1; -+success_1: -+ *success1 = 1; -+ -+ -+ if ((*success1 && vma1->vm_flags & VM_LOCKED) || -+ (*success2 && vma2->vm_flags & VM_LOCKED)) { -+ munlock_vma_page(*kpage); -+ if (!PageMlocked(tree_page)) -+ mlock_vma_page(tree_page); -+ } -+ -+ /* -+ * We do not need oldpage any more in the caller, so can break the lock -+ * now. 
-+ */ -+ unlock_page(*kpage); -+ *kpage = tree_page; /* Get unlocked outside. */ -+ return; -+ -+unlock_failed: -+ unlock_page(tree_page); -+failed: -+ return; -+} -+ -+static inline void stable_node_hash_max(struct stable_node *node, -+ struct page *page, u32 hash) -+{ -+ u32 hash_max = node->hash_max; -+ -+ if (!hash_max) { -+ hash_max = page_hash_max(page, hash); -+ node->hash_max = hash_max; -+ } -+} -+ -+static inline -+struct stable_node *new_stable_node(struct tree_node *tree_node, -+ struct page *kpage, u32 hash_max) -+{ -+ struct stable_node *new_stable_node; -+ -+ new_stable_node = alloc_stable_node(); -+ if (!new_stable_node) -+ return NULL; -+ -+ new_stable_node->kpfn = page_to_pfn(kpage); -+ new_stable_node->hash_max = hash_max; -+ new_stable_node->tree_node = tree_node; -+ set_page_stable_node(kpage, new_stable_node); -+ -+ return new_stable_node; -+} -+ -+static inline -+struct stable_node *first_level_insert(struct tree_node *tree_node, -+ struct rmap_item *rmap_item, -+ struct rmap_item *tree_rmap_item, -+ struct page **kpage, u32 hash, -+ int *success1, int *success2) -+{ -+ int cmp; -+ struct page *tree_page; -+ u32 hash_max = 0; -+ struct stable_node *stable_node, *new_snode; -+ struct rb_node *parent = NULL, **new; -+ -+ /* this tree node contains no sub-tree yet */ -+ stable_node = rb_entry(tree_node->sub_root.rb_node, -+ struct stable_node, node); -+ -+ tree_page = get_uksm_page(stable_node, 1, 0); -+ if (tree_page) { -+ cmp = memcmp_pages_with_cost(*kpage, tree_page, 1); -+ if (!cmp) { -+ try_merge_with_stable(rmap_item, tree_rmap_item, kpage, -+ tree_page, success1, success2); -+ put_page(tree_page); -+ if (!*success1 && !*success2) -+ goto failed; -+ -+ return stable_node; -+ -+ } else { -+ /* -+ * collision in first level try to create a subtree. -+ * A new node need to be created. -+ */ -+ put_page(tree_page); -+ -+ stable_node_hash_max(stable_node, tree_page, -+ tree_node->hash); -+ hash_max = rmap_item_hash_max(rmap_item, hash); -+ cmp = hash_cmp(hash_max, stable_node->hash_max); -+ -+ parent = &stable_node->node; -+ if (cmp < 0) -+ new = &parent->rb_left; -+ else if (cmp > 0) -+ new = &parent->rb_right; -+ else -+ goto failed; -+ } -+ -+ } else { -+ /* the only stable_node deleted, we reuse its tree_node. 
-+ */ -+ parent = NULL; -+ new = &tree_node->sub_root.rb_node; -+ } -+ -+ new_snode = new_stable_node(tree_node, *kpage, hash_max); -+ if (!new_snode) -+ goto failed; -+ -+ rb_link_node(&new_snode->node, parent, new); -+ rb_insert_color(&new_snode->node, &tree_node->sub_root); -+ tree_node->count++; -+ *success1 = *success2 = 1; -+ -+ return new_snode; -+ -+failed: -+ return NULL; -+} -+ -+static inline -+struct stable_node *stable_subtree_insert(struct tree_node *tree_node, -+ struct rmap_item *rmap_item, -+ struct rmap_item *tree_rmap_item, -+ struct page **kpage, u32 hash, -+ int *success1, int *success2) -+{ -+ struct page *tree_page; -+ u32 hash_max; -+ struct stable_node *stable_node, *new_snode; -+ struct rb_node *parent, **new; -+ -+research: -+ parent = NULL; -+ new = &tree_node->sub_root.rb_node; -+ BUG_ON(!*new); -+ hash_max = rmap_item_hash_max(rmap_item, hash); -+ while (*new) { -+ int cmp; -+ -+ stable_node = rb_entry(*new, struct stable_node, node); -+ -+ cmp = hash_cmp(hash_max, stable_node->hash_max); -+ -+ if (cmp < 0) { -+ parent = *new; -+ new = &parent->rb_left; -+ } else if (cmp > 0) { -+ parent = *new; -+ new = &parent->rb_right; -+ } else { -+ tree_page = get_uksm_page(stable_node, 1, 0); -+ if (tree_page) { -+ cmp = memcmp_pages_with_cost(*kpage, tree_page, 1); -+ if (!cmp) { -+ try_merge_with_stable(rmap_item, -+ tree_rmap_item, kpage, -+ tree_page, success1, success2); -+ -+ put_page(tree_page); -+ if (!*success1 && !*success2) -+ goto failed; -+ /* -+ * successfully merged with a stable -+ * node -+ */ -+ return stable_node; -+ } else { -+ put_page(tree_page); -+ goto failed; -+ } -+ } else { -+ /* -+ * stable node may be deleted, -+ * and subtree maybe -+ * restructed, cannot -+ * continue, research it. -+ */ -+ if (tree_node->count) { -+ goto research; -+ } else { -+ /* reuse the tree node*/ -+ parent = NULL; -+ new = &tree_node->sub_root.rb_node; -+ } -+ } -+ } -+ } -+ -+ new_snode = new_stable_node(tree_node, *kpage, hash_max); -+ if (!new_snode) -+ goto failed; -+ -+ rb_link_node(&new_snode->node, parent, new); -+ rb_insert_color(&new_snode->node, &tree_node->sub_root); -+ tree_node->count++; -+ *success1 = *success2 = 1; -+ -+ return new_snode; -+ -+failed: -+ return NULL; -+} -+ -+ -+/** -+ * stable_tree_insert() - try to insert a merged page in unstable tree to -+ * the stable tree -+ * -+ * @kpage: the page need to be inserted -+ * @hash: the current hash of this page -+ * @rmap_item: the rmap_item being scanned -+ * @tree_rmap_item: the rmap_item found on unstable tree -+ * @success1: return if rmap_item is merged -+ * @success2: return if tree_rmap_item is merged -+ * -+ * @return the stable_node on stable tree if at least one -+ * rmap_item is inserted into stable tree, NULL -+ * otherwise. 
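-+ *
-+ * Note (added for clarity): when no first-level tree_node matches, a new
-+ * tree_node and stable_node are created together; the stable node's
-+ * hash_max is left at 0 in that path and is only computed lazily, via
-+ * stable_node_hash_max(), on the first second-level collision.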
-+ */ -+static struct stable_node * -+stable_tree_insert(struct page **kpage, u32 hash, -+ struct rmap_item *rmap_item, -+ struct rmap_item *tree_rmap_item, -+ int *success1, int *success2) -+{ -+ struct rb_node **new = &root_stable_treep->rb_node; -+ struct rb_node *parent = NULL; -+ struct stable_node *stable_node; -+ struct tree_node *tree_node; -+ u32 hash_max = 0; -+ -+ *success1 = *success2 = 0; -+ -+ while (*new) { -+ int cmp; -+ -+ tree_node = rb_entry(*new, struct tree_node, node); -+ -+ cmp = hash_cmp(hash, tree_node->hash); -+ -+ if (cmp < 0) { -+ parent = *new; -+ new = &parent->rb_left; -+ } else if (cmp > 0) { -+ parent = *new; -+ new = &parent->rb_right; -+ } else -+ break; -+ } -+ -+ if (*new) { -+ if (tree_node->count == 1) { -+ stable_node = first_level_insert(tree_node, rmap_item, -+ tree_rmap_item, kpage, -+ hash, success1, success2); -+ } else { -+ stable_node = stable_subtree_insert(tree_node, -+ rmap_item, tree_rmap_item, kpage, -+ hash, success1, success2); -+ } -+ } else { -+ -+ /* no tree node found */ -+ tree_node = alloc_tree_node(stable_tree_node_listp); -+ if (!tree_node) { -+ stable_node = NULL; -+ goto out; -+ } -+ -+ stable_node = new_stable_node(tree_node, *kpage, hash_max); -+ if (!stable_node) { -+ free_tree_node(tree_node); -+ goto out; -+ } -+ -+ tree_node->hash = hash; -+ rb_link_node(&tree_node->node, parent, new); -+ rb_insert_color(&tree_node->node, root_stable_treep); -+ parent = NULL; -+ new = &tree_node->sub_root.rb_node; -+ -+ rb_link_node(&stable_node->node, parent, new); -+ rb_insert_color(&stable_node->node, &tree_node->sub_root); -+ tree_node->count++; -+ *success1 = *success2 = 1; -+ } -+ -+out: -+ return stable_node; -+} -+ -+ -+/** -+ * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem -+ * -+ * @return 0 on success, -EBUSY if unable to lock the mmap_sem, -+ * -EINVAL if the page mapping has been changed. -+ */ -+static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item) -+{ -+ int err; -+ -+ err = get_mergeable_page_lock_mmap(tree_rmap_item); -+ -+ if (err == -EINVAL) { -+ /* its page map has been changed, remove it */ -+ remove_rmap_item_from_tree(tree_rmap_item); -+ } -+ -+ /* The page is gotten and mmap_sem is locked now. */ -+ return err; -+} -+ -+ -+/** -+ * unstable_tree_search_insert() - search an unstable tree rmap_item with the -+ * same hash value. 
Get its page and trylock the mmap_sem.
-+ */
-+static inline
-+struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
-+ u32 hash)
-+
-+{
-+ struct rb_node **new = &root_unstable_tree.rb_node;
-+ struct rb_node *parent = NULL;
-+ struct tree_node *tree_node;
-+ u32 hash_max;
-+ struct rmap_item *tree_rmap_item;
-+
-+ while (*new) {
-+ int cmp;
-+
-+ tree_node = rb_entry(*new, struct tree_node, node);
-+
-+ cmp = hash_cmp(hash, tree_node->hash);
-+
-+ if (cmp < 0) {
-+ parent = *new;
-+ new = &parent->rb_left;
-+ } else if (cmp > 0) {
-+ parent = *new;
-+ new = &parent->rb_right;
-+ } else
-+ break;
-+ }
-+
-+ if (*new) {
-+ /* got the tree_node */
-+ if (tree_node->count == 1) {
-+ tree_rmap_item = rb_entry(tree_node->sub_root.rb_node,
-+ struct rmap_item, node);
-+ BUG_ON(!tree_rmap_item);
-+
-+ goto get_page_out;
-+ }
-+
-+ /* well, search the collision subtree */
-+ new = &tree_node->sub_root.rb_node;
-+ BUG_ON(!*new);
-+ hash_max = rmap_item_hash_max(rmap_item, hash);
-+
-+ while (*new) {
-+ int cmp;
-+
-+ tree_rmap_item = rb_entry(*new, struct rmap_item,
-+ node);
-+
-+ cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
-+ parent = *new;
-+ if (cmp < 0)
-+ new = &parent->rb_left;
-+ else if (cmp > 0)
-+ new = &parent->rb_right;
-+ else
-+ goto get_page_out;
-+ }
-+ } else {
-+ /* alloc a new tree_node */
-+ tree_node = alloc_tree_node(&unstable_tree_node_list);
-+ if (!tree_node)
-+ return NULL;
-+
-+ tree_node->hash = hash;
-+ rb_link_node(&tree_node->node, parent, new);
-+ rb_insert_color(&tree_node->node, &root_unstable_tree);
-+ parent = NULL;
-+ new = &tree_node->sub_root.rb_node;
-+ }
-+
-+ /* not found even in the sub-tree */
-+ rmap_item->tree_node = tree_node;
-+ rmap_item->address |= UNSTABLE_FLAG;
-+ rmap_item->hash_round = uksm_hash_round;
-+ rb_link_node(&rmap_item->node, parent, new);
-+ rb_insert_color(&rmap_item->node, &tree_node->sub_root);
-+
-+ uksm_pages_unshared++;
-+ return NULL;
-+
-+get_page_out:
-+ if (tree_rmap_item->page == rmap_item->page)
-+ return NULL;
-+
-+ if (get_tree_rmap_item_page(tree_rmap_item))
-+ return NULL;
-+
-+ return tree_rmap_item;
-+}
-+
-+static void hold_anon_vma(struct rmap_item *rmap_item,
-+ struct anon_vma *anon_vma)
-+{
-+ rmap_item->anon_vma = anon_vma;
-+ get_anon_vma(anon_vma);
-+}
-+
-+
-+/**
-+ * stable_tree_append() - append a rmap_item to a stable node. Deduplication
-+ * ratio statistics are done in this function.
-+ * -+ */ -+static void stable_tree_append(struct rmap_item *rmap_item, -+ struct stable_node *stable_node, int logdedup) -+{ -+ struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_cont = NULL; -+ unsigned long key = (unsigned long)rmap_item->slot; -+ unsigned long factor = rmap_item->slot->rung->step; -+ -+ BUG_ON(!stable_node); -+ rmap_item->address |= STABLE_FLAG; -+ -+ if (hlist_empty(&stable_node->hlist)) { -+ uksm_pages_shared++; -+ goto node_vma_new; -+ } else { -+ uksm_pages_sharing++; -+ } -+ -+ hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) { -+ if (node_vma->key >= key) -+ break; -+ -+ if (logdedup) { -+ node_vma->slot->pages_bemerged += factor; -+ if (list_empty(&node_vma->slot->dedup_list)) -+ list_add(&node_vma->slot->dedup_list, -+ &vma_slot_dedup); -+ } -+ } -+ -+ if (node_vma) { -+ if (node_vma->key == key) { -+ node_vma_cont = hlist_entry_safe(node_vma->hlist.next, struct node_vma, hlist); -+ goto node_vma_ok; -+ } else if (node_vma->key > key) { -+ node_vma_cont = node_vma; -+ } -+ } -+ -+node_vma_new: -+ /* no same vma already in node, alloc a new node_vma */ -+ new_node_vma = alloc_node_vma(); -+ BUG_ON(!new_node_vma); -+ new_node_vma->head = stable_node; -+ new_node_vma->slot = rmap_item->slot; -+ -+ if (!node_vma) { -+ hlist_add_head(&new_node_vma->hlist, &stable_node->hlist); -+ } else if (node_vma->key != key) { -+ if (node_vma->key < key) -+ hlist_add_behind(&new_node_vma->hlist, &node_vma->hlist); -+ else { -+ hlist_add_before(&new_node_vma->hlist, -+ &node_vma->hlist); -+ } -+ -+ } -+ node_vma = new_node_vma; -+ -+node_vma_ok: /* ok, ready to add to the list */ -+ rmap_item->head = node_vma; -+ hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist); -+ hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma); -+ if (logdedup) { -+ rmap_item->slot->pages_merged++; -+ if (node_vma_cont) { -+ node_vma = node_vma_cont; -+ hlist_for_each_entry_continue(node_vma, hlist) { -+ node_vma->slot->pages_bemerged += factor; -+ if (list_empty(&node_vma->slot->dedup_list)) -+ list_add(&node_vma->slot->dedup_list, -+ &vma_slot_dedup); -+ } -+ } -+ } -+} -+ -+/* -+ * We use break_ksm to break COW on a ksm page: it's a stripped down -+ * -+ * if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1) -+ * put_page(page); -+ * -+ * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, -+ * in case the application has unmapped and remapped mm,addr meanwhile. -+ * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP -+ * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. -+ */ -+static int break_ksm(struct vm_area_struct *vma, unsigned long addr) -+{ -+ struct page *page; -+ int ret = 0; -+ -+ do { -+ cond_resched(); -+ page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE); -+ if (IS_ERR_OR_NULL(page)) -+ break; -+ if (PageKsm(page)) { -+ ret = handle_mm_fault(vma, addr, -+ FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE, -+ NULL); -+ } else -+ ret = VM_FAULT_WRITE; -+ put_page(page); -+ } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); -+ /* -+ * We must loop because handle_mm_fault() may back out if there's -+ * any difficulty e.g. if pte accessed bit gets updated concurrently. -+ * -+ * VM_FAULT_WRITE is what we have been hoping for: it indicates that -+ * COW has been broken, even if the vma does not permit VM_WRITE; -+ * but note that a concurrent fault might break PageKsm for us. 
-+ * -+ * VM_FAULT_SIGBUS could occur if we race with truncation of the -+ * backing file, which also invalidates anonymous pages: that's -+ * okay, that truncation will have unmapped the PageKsm for us. -+ * -+ * VM_FAULT_OOM: at the time of writing (late July 2009), setting -+ * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the -+ * current task has TIF_MEMDIE set, and will be OOM killed on return -+ * to user; and ksmd, having no mm, would never be chosen for that. -+ * -+ * But if the mm is in a limited mem_cgroup, then the fault may fail -+ * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and -+ * even ksmd can fail in this way - though it's usually breaking ksm -+ * just to undo a merge it made a moment before, so unlikely to oom. -+ * -+ * That's a pity: we might therefore have more kernel pages allocated -+ * than we're counting as nodes in the stable tree; but uksm_do_scan -+ * will retry to break_cow on each pass, so should recover the page -+ * in due course. The important thing is to not let VM_MERGEABLE -+ * be cleared while any such pages might remain in the area. -+ */ -+ return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; -+} -+ -+static void break_cow(struct rmap_item *rmap_item) -+{ -+ struct vm_area_struct *vma = rmap_item->slot->vma; -+ struct mm_struct *mm = vma->vm_mm; -+ unsigned long addr = get_rmap_addr(rmap_item); -+ -+ if (uksm_test_exit(mm)) -+ goto out; -+ -+ break_ksm(vma, addr); -+out: -+ return; -+} -+ -+/* -+ * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather -+ * than check every pte of a given vma, the locking doesn't quite work for -+ * that - an rmap_item is assigned to the stable tree after inserting ksm -+ * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing -+ * rmap_items from parent to child at fork time (so as not to waste time -+ * if exit comes before the next scan reaches it). -+ * -+ * Similarly, although we'd like to remove rmap_items (so updating counts -+ * and freeing memory) when unmerging an area, it's easier to leave that -+ * to the next pass of ksmd - consider, for example, how ksmd might be -+ * in cmp_and_merge_page on one of the rmap_items we would be removing. 
-+ */
-+inline int unmerge_uksm_pages(struct vm_area_struct *vma,
-+ unsigned long start, unsigned long end)
-+{
-+ unsigned long addr;
-+ int err = 0;
-+
-+ for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
-+ if (uksm_test_exit(vma->vm_mm))
-+ break;
-+ if (signal_pending(current))
-+ err = -ERESTARTSYS;
-+ else
-+ err = break_ksm(vma, addr);
-+ }
-+ return err;
-+}
-+
-+static inline void inc_uksm_pages_scanned(void)
-+{
-+ u64 delta;
-+
-+
-+ if (uksm_pages_scanned == U64_MAX) {
-+ encode_benefit();
-+
-+ delta = uksm_pages_scanned >> pages_scanned_base;
-+
-+ if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) {
-+ pages_scanned_stored >>= 1;
-+ delta >>= 1;
-+ pages_scanned_base++;
-+ }
-+
-+ pages_scanned_stored += delta;
-+
-+ uksm_pages_scanned = uksm_pages_scanned_last = 0;
-+ }
-+
-+ uksm_pages_scanned++;
-+}
-+
-+static inline int find_zero_page_hash(int strength, u32 hash)
-+{
-+ return (zero_hash_table[strength] == hash);
-+}
-+
-+static
-+int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page)
-+{
-+ struct page *zero_page = empty_uksm_zero_page;
-+ struct mm_struct *mm = vma->vm_mm;
-+ pte_t orig_pte = __pte(0);
-+ int err = -EFAULT;
-+
-+ if (uksm_test_exit(mm))
-+ goto out;
-+
-+ if (!trylock_page(page))
-+ goto out;
-+
-+ if (!PageAnon(page))
-+ goto out_unlock;
-+
-+ if (PageTransCompound(page)) {
-+ err = split_huge_page(page);
-+ if (err)
-+ goto out_unlock;
-+ }
-+
-+ if (write_protect_page(vma, page, &orig_pte, 0) == 0) {
-+ if (is_page_full_zero(page))
-+ err = replace_page(vma, page, zero_page, orig_pte);
-+ }
-+
-+out_unlock:
-+ unlock_page(page);
-+out:
-+ return err;
-+}
-+
-+/*
-+ * cmp_and_merge_page() - first see if the page can be merged into the stable
-+ * tree; if not, compare its hash to the previous one and, if they are the
-+ * same, see if the page can be inserted into the unstable tree, or merged
-+ * with a page already there and both transferred to the stable tree.
-+ *
-+ * @page: the page for which we are searching an identical page
-+ * @rmap_item: the reverse mapping into the virtual address of this page
-+ */
-+static void cmp_and_merge_page(struct rmap_item *rmap_item, u32 hash)
-+{
-+ struct rmap_item *tree_rmap_item;
-+ struct page *page;
-+ struct page *kpage = NULL;
-+ u32 hash_max;
-+ int err;
-+ unsigned int success1, success2;
-+ struct stable_node *snode;
-+ int cmp;
-+ struct rb_node *parent = NULL, **new;
-+
-+ remove_rmap_item_from_tree(rmap_item);
-+ page = rmap_item->page;
-+
-+ /* We first start with searching the page inside the stable tree */
-+ kpage = stable_tree_search(rmap_item, hash);
-+ if (kpage) {
-+ err = try_to_merge_with_uksm_page(rmap_item, kpage,
-+ hash);
-+ if (!err) {
-+ /*
-+ * The page was successfully merged, add
-+ * its rmap_item to the stable tree.
-+ * page lock is needed because it's
-+ * racing with try_to_unmap_ksm(), etc.
-+ */
-+ lock_page(kpage);
-+ snode = page_stable_node(kpage);
-+ stable_tree_append(rmap_item, snode, 1);
-+ unlock_page(kpage);
-+ put_page(kpage);
-+ return; /* success */
-+ }
-+ put_page(kpage);
-+
-+ /*
-+ * If it's a collision and it has already been searched in the
-+ * sub-rbtree (hash_max != 0), we want to abort, because if it
-+ * is successfully merged in the unstable tree, the collision
-+ * tends to happen again.
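-+ *
-+ * Illustrative example (hypothetical values): two distinct pages
-+ * can both yield the sampled hash 0x1234 and therefore meet in the
-+ * same tree_node; only hash_max, computed over the whole page, can
-+ * tell them apart. Once they have collided at hash_max, a later
-+ * merge attempt would almost certainly fail the final memcmp again,
-+ * so we give up early.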
-+ */
-+ if (err == MERGE_ERR_COLLI && rmap_item->hash_max)
-+ return;
-+ }
-+
-+ tree_rmap_item =
-+ unstable_tree_search_insert(rmap_item, hash);
-+ if (tree_rmap_item) {
-+ err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash);
-+ /*
-+ * As soon as we merge this page, we want to remove the
-+ * rmap_item of the page we have merged with from the unstable
-+ * tree, and insert it instead as new node in the stable tree.
-+ */
-+ if (!err) {
-+ kpage = page;
-+ remove_rmap_item_from_tree(tree_rmap_item);
-+ lock_page(kpage);
-+ snode = stable_tree_insert(&kpage, hash,
-+ rmap_item, tree_rmap_item,
-+ &success1, &success2);
-+
-+ /*
-+ * Do not log dedup for the tree item, it's not counted
-+ * as scanned in this round.
-+ */
-+ if (success2)
-+ stable_tree_append(tree_rmap_item, snode, 0);
-+
-+ /*
-+ * The order of these two stable tree appends is
-+ * important: we are scanning rmap_item.
-+ */
-+ if (success1)
-+ stable_tree_append(rmap_item, snode, 1);
-+
-+ /*
-+ * The original kpage may be unlocked inside
-+ * stable_tree_insert() already. This page
-+ * should be unlocked before doing
-+ * break_cow().
-+ */
-+ unlock_page(kpage);
-+
-+ if (!success1)
-+ break_cow(rmap_item);
-+
-+ if (!success2)
-+ break_cow(tree_rmap_item);
-+
-+ } else if (err == MERGE_ERR_COLLI) {
-+ BUG_ON(tree_rmap_item->tree_node->count > 1);
-+
-+ rmap_item_hash_max(tree_rmap_item,
-+ tree_rmap_item->tree_node->hash);
-+
-+ hash_max = rmap_item_hash_max(rmap_item, hash);
-+ cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
-+ parent = &tree_rmap_item->node;
-+ if (cmp < 0)
-+ new = &parent->rb_left;
-+ else if (cmp > 0)
-+ new = &parent->rb_right;
-+ else
-+ goto put_up_out;
-+
-+ rmap_item->tree_node = tree_rmap_item->tree_node;
-+ rmap_item->address |= UNSTABLE_FLAG;
-+ rmap_item->hash_round = uksm_hash_round;
-+ rb_link_node(&rmap_item->node, parent, new);
-+ rb_insert_color(&rmap_item->node,
-+ &tree_rmap_item->tree_node->sub_root);
-+ rmap_item->tree_node->count++;
-+ } else {
-+ /*
-+ * Either one of the pages has changed or they collide
-+ * at the max hash; we consider them ill items.
-+ */ -+ remove_rmap_item_from_tree(tree_rmap_item); -+ } -+put_up_out: -+ put_page(tree_rmap_item->page); -+ mmap_read_unlock(tree_rmap_item->slot->vma->vm_mm); -+ } -+} -+ -+ -+ -+ -+static inline unsigned long get_pool_index(struct vma_slot *slot, -+ unsigned long index) -+{ -+ unsigned long pool_index; -+ -+ pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT; -+ if (pool_index >= slot->pool_size) -+ BUG(); -+ return pool_index; -+} -+ -+static inline unsigned long index_page_offset(unsigned long index) -+{ -+ return offset_in_page(sizeof(struct rmap_list_entry *) * index); -+} -+ -+static inline -+struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot, -+ unsigned long index, int need_alloc) -+{ -+ unsigned long pool_index; -+ struct page *page; -+ void *addr; -+ -+ -+ pool_index = get_pool_index(slot, index); -+ if (!slot->rmap_list_pool[pool_index]) { -+ if (!need_alloc) -+ return NULL; -+ -+ page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); -+ if (!page) -+ return NULL; -+ -+ slot->rmap_list_pool[pool_index] = page; -+ } -+ -+ addr = kmap(slot->rmap_list_pool[pool_index]); -+ addr += index_page_offset(index); -+ -+ return addr; -+} -+ -+static inline void put_rmap_list_entry(struct vma_slot *slot, -+ unsigned long index) -+{ -+ unsigned long pool_index; -+ -+ pool_index = get_pool_index(slot, index); -+ BUG_ON(!slot->rmap_list_pool[pool_index]); -+ kunmap(slot->rmap_list_pool[pool_index]); -+} -+ -+static inline int entry_is_new(struct rmap_list_entry *entry) -+{ -+ return !entry->item; -+} -+ -+static inline unsigned long get_index_orig_addr(struct vma_slot *slot, -+ unsigned long index) -+{ -+ return slot->vma->vm_start + (index << PAGE_SHIFT); -+} -+ -+static inline unsigned long get_entry_address(struct rmap_list_entry *entry) -+{ -+ unsigned long addr; -+ -+ if (is_addr(entry->addr)) -+ addr = get_clean_addr(entry->addr); -+ else if (entry->item) -+ addr = get_rmap_addr(entry->item); -+ else -+ BUG(); -+ -+ return addr; -+} -+ -+static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry) -+{ -+ if (is_addr(entry->addr)) -+ return NULL; -+ -+ return entry->item; -+} -+ -+static inline void inc_rmap_list_pool_count(struct vma_slot *slot, -+ unsigned long index) -+{ -+ unsigned long pool_index; -+ -+ pool_index = get_pool_index(slot, index); -+ BUG_ON(!slot->rmap_list_pool[pool_index]); -+ slot->pool_counts[pool_index]++; -+} -+ -+static inline void dec_rmap_list_pool_count(struct vma_slot *slot, -+ unsigned long index) -+{ -+ unsigned long pool_index; -+ -+ pool_index = get_pool_index(slot, index); -+ BUG_ON(!slot->rmap_list_pool[pool_index]); -+ BUG_ON(!slot->pool_counts[pool_index]); -+ slot->pool_counts[pool_index]--; -+} -+ -+static inline int entry_has_rmap(struct rmap_list_entry *entry) -+{ -+ return !is_addr(entry->addr) && entry->item; -+} -+ -+static inline void swap_entries(struct rmap_list_entry *entry1, -+ unsigned long index1, -+ struct rmap_list_entry *entry2, -+ unsigned long index2) -+{ -+ struct rmap_list_entry tmp; -+ -+ /* swapping two new entries is meaningless */ -+ BUG_ON(entry_is_new(entry1) && entry_is_new(entry2)); -+ -+ tmp = *entry1; -+ *entry1 = *entry2; -+ *entry2 = tmp; -+ -+ if (entry_has_rmap(entry1)) -+ entry1->item->entry_index = index1; -+ -+ if (entry_has_rmap(entry2)) -+ entry2->item->entry_index = index2; -+ -+ if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) { -+ inc_rmap_list_pool_count(entry1->item->slot, index1); -+ dec_rmap_list_pool_count(entry1->item->slot, index2); -+ } 
else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) {
-+ inc_rmap_list_pool_count(entry2->item->slot, index2);
-+ dec_rmap_list_pool_count(entry2->item->slot, index1);
-+ }
-+}
-+
-+static inline void free_entry_item(struct rmap_list_entry *entry)
-+{
-+ unsigned long index;
-+ struct rmap_item *item;
-+
-+ if (!is_addr(entry->addr)) {
-+ BUG_ON(!entry->item);
-+ item = entry->item;
-+ entry->addr = get_rmap_addr(item);
-+ set_is_addr(entry->addr);
-+ index = item->entry_index;
-+ remove_rmap_item_from_tree(item);
-+ dec_rmap_list_pool_count(item->slot, index);
-+ free_rmap_item(item);
-+ }
-+}
-+
-+static inline int pool_entry_boundary(unsigned long index)
-+{
-+ unsigned long linear_addr;
-+
-+ linear_addr = sizeof(struct rmap_list_entry *) * index;
-+ return index && !offset_in_page(linear_addr);
-+}
-+
-+static inline void try_free_last_pool(struct vma_slot *slot,
-+ unsigned long index)
-+{
-+ unsigned long pool_index;
-+
-+ pool_index = get_pool_index(slot, index);
-+ if (slot->rmap_list_pool[pool_index] &&
-+ !slot->pool_counts[pool_index]) {
-+ __free_page(slot->rmap_list_pool[pool_index]);
-+ slot->rmap_list_pool[pool_index] = NULL;
-+ slot->flags |= UKSM_SLOT_NEED_SORT;
-+ }
-+
-+}
-+
-+static inline unsigned long vma_item_index(struct vm_area_struct *vma,
-+ struct rmap_item *item)
-+{
-+ return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT;
-+}
-+
-+static int within_same_pool(struct vma_slot *slot,
-+ unsigned long i, unsigned long j)
-+{
-+ unsigned long pool_i, pool_j;
-+
-+ pool_i = get_pool_index(slot, i);
-+ pool_j = get_pool_index(slot, j);
-+
-+ return (pool_i == pool_j);
-+}
-+
-+static void sort_rmap_entry_list(struct vma_slot *slot)
-+{
-+ unsigned long i, j;
-+ struct rmap_list_entry *entry, *swap_entry;
-+
-+ entry = get_rmap_list_entry(slot, 0, 0);
-+ for (i = 0; i < slot->pages; ) {
-+
-+ if (!entry)
-+ goto skip_whole_pool;
-+
-+ if (entry_is_new(entry))
-+ goto next_entry;
-+
-+ if (is_addr(entry->addr)) {
-+ entry->addr = 0;
-+ goto next_entry;
-+ }
-+
-+ j = vma_item_index(slot->vma, entry->item);
-+ if (j == i)
-+ goto next_entry;
-+
-+ if (within_same_pool(slot, i, j))
-+ swap_entry = entry + j - i;
-+ else
-+ swap_entry = get_rmap_list_entry(slot, j, 1);
-+
-+ swap_entries(entry, i, swap_entry, j);
-+ if (!within_same_pool(slot, i, j))
-+ put_rmap_list_entry(slot, j);
-+ continue;
-+
-+skip_whole_pool:
-+ i += PAGE_SIZE / sizeof(*entry);
-+ if (i < slot->pages)
-+ entry = get_rmap_list_entry(slot, i, 0);
-+ continue;
-+
-+next_entry:
-+ if (i >= slot->pages - 1 ||
-+ !within_same_pool(slot, i, i + 1)) {
-+ put_rmap_list_entry(slot, i);
-+ if (i + 1 < slot->pages)
-+ entry = get_rmap_list_entry(slot, i + 1, 0);
-+ } else
-+ entry++;
-+ i++;
-+ continue;
-+ }
-+
-+ /* free empty pool entries which contain no rmap_item */
-+ /* This can be simplified to rely on pool_counts alone once the
-+ * underlying bug is fixed.
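-+ *
-+ * Illustration with assumed sizes (4 KiB pages, pointer-sized
-+ * entries): one pool page holds 512 entries; if every one of them
-+ * is is_addr() or NULL, the page backs no rmap_item, so
-+ * pool_counts[i] should already be zero and the kmap() scan below
-+ * would be redundant.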
*/
-+ for (i = 0; i < slot->pool_size; i++) {
-+ unsigned char has_rmap;
-+ void *addr;
-+
-+ if (!slot->rmap_list_pool[i])
-+ continue;
-+
-+ has_rmap = 0;
-+ addr = kmap(slot->rmap_list_pool[i]);
-+ BUG_ON(!addr);
-+ for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
-+ entry = (struct rmap_list_entry *)addr + j;
-+ if (is_addr(entry->addr))
-+ continue;
-+ if (!entry->item)
-+ continue;
-+ has_rmap = 1;
-+ }
-+ kunmap(slot->rmap_list_pool[i]);
-+ if (!has_rmap) {
-+ BUG_ON(slot->pool_counts[i]);
-+ __free_page(slot->rmap_list_pool[i]);
-+ slot->rmap_list_pool[i] = NULL;
-+ }
-+ }
-+
-+ slot->flags &= ~UKSM_SLOT_NEED_SORT;
-+}
-+
-+/*
-+ * vma_fully_scanned() - whether all the pages in this slot have been scanned.
-+ */
-+static inline int vma_fully_scanned(struct vma_slot *slot)
-+{
-+ return slot->pages_scanned == slot->pages;
-+}
-+
-+/**
-+ * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to
-+ * its random permutation. The random-permutation index management code is
-+ * embedded in this function.
-+ */
-+static struct rmap_item *get_next_rmap_item(struct vma_slot *slot, u32 *hash)
-+{
-+ unsigned long rand_range, addr, swap_index, scan_index;
-+ struct rmap_item *item = NULL;
-+ struct rmap_list_entry *scan_entry, *swap_entry = NULL;
-+ struct page *page;
-+
-+ scan_index = swap_index = slot->pages_scanned % slot->pages;
-+
-+ if (pool_entry_boundary(scan_index))
-+ try_free_last_pool(slot, scan_index - 1);
-+
-+ if (vma_fully_scanned(slot)) {
-+ if (slot->flags & UKSM_SLOT_NEED_SORT)
-+ slot->flags |= UKSM_SLOT_NEED_RERAND;
-+ else
-+ slot->flags &= ~UKSM_SLOT_NEED_RERAND;
-+ if (slot->flags & UKSM_SLOT_NEED_SORT)
-+ sort_rmap_entry_list(slot);
-+ }
-+
-+ scan_entry = get_rmap_list_entry(slot, scan_index, 1);
-+ if (!scan_entry)
-+ return NULL;
-+
-+ if (entry_is_new(scan_entry)) {
-+ scan_entry->addr = get_index_orig_addr(slot, scan_index);
-+ set_is_addr(scan_entry->addr);
-+ }
-+
-+ if (slot->flags & UKSM_SLOT_NEED_RERAND) {
-+ rand_range = slot->pages - scan_index;
-+ BUG_ON(!rand_range);
-+ swap_index = scan_index + (prandom_u32() % rand_range);
-+ }
-+
-+ if (swap_index != scan_index) {
-+ swap_entry = get_rmap_list_entry(slot, swap_index, 1);
-+
-+ if (!swap_entry)
-+ return NULL;
-+
-+ if (entry_is_new(swap_entry)) {
-+ swap_entry->addr = get_index_orig_addr(slot,
-+ swap_index);
-+ set_is_addr(swap_entry->addr);
-+ }
-+ swap_entries(scan_entry, scan_index, swap_entry, swap_index);
-+ }
-+
-+ addr = get_entry_address(scan_entry);
-+ item = get_entry_item(scan_entry);
-+ BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start);
-+
-+ page = follow_page(slot->vma, addr, FOLL_GET);
-+ if (IS_ERR_OR_NULL(page))
-+ goto nopage;
-+
-+ if (!PageAnon(page))
-+ goto putpage;
-+
-+ /* check whether this is the zero page pfn or the uksm zero page */
-+ if ((page_to_pfn(page) == zero_pfn)
-+ || (page_to_pfn(page) == uksm_zero_pfn))
-+ goto putpage;
-+
-+ flush_anon_page(slot->vma, page, addr);
-+ flush_dcache_page(page);
-+
-+
-+ *hash = page_hash(page, hash_strength, 1);
-+ inc_uksm_pages_scanned();
-+ /* if the page content is all zero, re-map it to the zero page */
-+ if (find_zero_page_hash(hash_strength, *hash)) {
-+ if (!cmp_and_merge_zero_page(slot->vma, page)) {
-+ slot->pages_merged++;
-+
-+ /* For full-zero pages, no need to create rmap item */
-+ goto putpage;
-+ } else {
-+ inc_rshash_neg(memcmp_cost / 2);
-+ }
-+ }
-+
-+ if (!item) {
-+ item = alloc_rmap_item();
-+ if (item) {
-+ /* It has already been zeroed */
-+ item->slot = slot;
-+ item->address = addr;
-+ item->entry_index =
scan_index;
-+ scan_entry->item = item;
-+ inc_rmap_list_pool_count(slot, scan_index);
-+ } else
-+ goto putpage;
-+ }
-+
-+ BUG_ON(item->slot != slot);
-+ /* the page may have changed */
-+ item->page = page;
-+ put_rmap_list_entry(slot, scan_index);
-+ if (swap_entry)
-+ put_rmap_list_entry(slot, swap_index);
-+ return item;
-+
-+putpage:
-+ put_page(page);
-+ page = NULL;
-+nopage:
-+ /* no page, store addr back and free rmap_item if possible */
-+ free_entry_item(scan_entry);
-+ put_rmap_list_entry(slot, scan_index);
-+ if (swap_entry)
-+ put_rmap_list_entry(slot, swap_index);
-+ return NULL;
-+}
-+
-+static inline int in_stable_tree(struct rmap_item *rmap_item)
-+{
-+ return rmap_item->address & STABLE_FLAG;
-+}
-+
-+/**
-+ * scan_vma_one_page() - scan the next page in a vma_slot. Called with
-+ * mmap_sem locked.
-+ */
-+static noinline void scan_vma_one_page(struct vma_slot *slot)
-+{
-+ u32 hash;
-+ struct mm_struct *mm;
-+ struct rmap_item *rmap_item = NULL;
-+ struct vm_area_struct *vma = slot->vma;
-+
-+ mm = vma->vm_mm;
-+ BUG_ON(!mm);
-+ BUG_ON(!slot);
-+
-+ rmap_item = get_next_rmap_item(slot, &hash);
-+ if (!rmap_item)
-+ goto out1;
-+
-+ if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item))
-+ goto out2;
-+
-+ cmp_and_merge_page(rmap_item, hash);
-+out2:
-+ put_page(rmap_item->page);
-+out1:
-+ slot->pages_scanned++;
-+ slot->this_sampled++;
-+ if (slot->fully_scanned_round != fully_scanned_round)
-+ scanned_virtual_pages++;
-+
-+ if (vma_fully_scanned(slot))
-+ slot->fully_scanned_round = fully_scanned_round;
-+}
-+
-+static inline unsigned long rung_get_pages(struct scan_rung *rung)
-+{
-+ struct slot_tree_node *node;
-+
-+ if (!rung->vma_root.rnode)
-+ return 0;
-+
-+ node = container_of(rung->vma_root.rnode, struct slot_tree_node, snode);
-+
-+ return node->size;
-+}
-+
-+#define RUNG_SAMPLED_MIN 3
-+
-+static inline
-+void uksm_calc_rung_step(struct scan_rung *rung,
-+ unsigned long page_time, unsigned long ratio)
-+{
-+ unsigned long sampled, pages;
-+
-+ /* will be fully scanned ? */
-+ if (!rung->cover_msecs) {
-+ rung->step = 1;
-+ return;
-+ }
-+
-+ sampled = rung->cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE)
-+ * ratio / page_time;
-+
-+ /*
-+ * Before we finish a scan round and its expensive per-round jobs,
-+ * we need to have a chance to estimate the per-page time. So
-+ * the sampled number cannot be too small.
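-+ *
-+ * Worked example (all numbers hypothetical, and TIME_RATIO_SCALE
-+ * assumed to be 1000): cover_msecs = 100, ratio = 200 (20% cpu) and
-+ * page_time = 1000ns give
-+ *	sampled = 100 * (1000000 / 1000) * 200 / 1000 = 20000,
-+ * so a rung holding 1000000 pages is scanned with step = 50.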
-+ */
-+ if (sampled < RUNG_SAMPLED_MIN)
-+ sampled = RUNG_SAMPLED_MIN;
-+
-+ pages = rung_get_pages(rung);
-+ if (likely(pages > sampled))
-+ rung->step = pages / sampled;
-+ else
-+ rung->step = 1;
-+}
-+
-+static inline int step_need_recalc(struct scan_rung *rung)
-+{
-+ unsigned long pages, stepmax;
-+
-+ pages = rung_get_pages(rung);
-+ stepmax = pages / RUNG_SAMPLED_MIN;
-+
-+ return pages && (rung->step > pages ||
-+ (stepmax && rung->step > stepmax));
-+}
-+
-+static inline
-+void reset_current_scan(struct scan_rung *rung, int finished, int step_recalc)
-+{
-+ struct vma_slot *slot;
-+
-+ if (finished)
-+ rung->flags |= UKSM_RUNG_ROUND_FINISHED;
-+
-+ if (step_recalc || step_need_recalc(rung)) {
-+ uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
-+ BUG_ON(step_need_recalc(rung));
-+ }
-+
-+ slot_iter_index = prandom_u32() % rung->step;
-+ BUG_ON(!rung->vma_root.rnode);
-+ slot = sradix_tree_next(&rung->vma_root, NULL, 0, slot_iter);
-+ BUG_ON(!slot);
-+
-+ rung->current_scan = slot;
-+ rung->current_offset = slot_iter_index;
-+}
-+
-+static inline struct sradix_tree_root *slot_get_root(struct vma_slot *slot)
-+{
-+ return &slot->rung->vma_root;
-+}
-+
-+/*
-+ * Return 1 if the scan position was reset.
-+ */
-+static int advance_current_scan(struct scan_rung *rung)
-+{
-+ unsigned short n;
-+ struct vma_slot *slot, *next = NULL;
-+
-+ BUG_ON(!rung->vma_root.num);
-+
-+ slot = rung->current_scan;
-+ n = (slot->pages - rung->current_offset) % rung->step;
-+ slot_iter_index = rung->step - n;
-+ next = sradix_tree_next(&rung->vma_root, slot->snode,
-+ slot->sindex, slot_iter);
-+
-+ if (next) {
-+ rung->current_offset = slot_iter_index;
-+ rung->current_scan = next;
-+ return 0;
-+ } else {
-+ reset_current_scan(rung, 1, 0);
-+ return 1;
-+ }
-+}
-+
-+static inline void rung_rm_slot(struct vma_slot *slot)
-+{
-+ struct scan_rung *rung = slot->rung;
-+ struct sradix_tree_root *root;
-+
-+ if (rung->current_scan == slot)
-+ advance_current_scan(rung);
-+
-+ root = slot_get_root(slot);
-+ sradix_tree_delete_from_leaf(root, slot->snode, slot->sindex);
-+ slot->snode = NULL;
-+ if (step_need_recalc(rung)) {
-+ uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
-+ BUG_ON(step_need_recalc(rung));
-+ }
-+
-+ /* In case advance_current_scan loops back to this slot again */
-+ if (rung->vma_root.num && rung->current_scan == slot)
-+ reset_current_scan(slot->rung, 1, 0);
-+}
-+
-+static inline void rung_add_new_slots(struct scan_rung *rung,
-+ struct vma_slot **slots, unsigned long num)
-+{
-+ int err;
-+ struct vma_slot *slot;
-+ unsigned long i;
-+ struct sradix_tree_root *root = &rung->vma_root;
-+
-+ err = sradix_tree_enter(root, (void **)slots, num);
-+ BUG_ON(err);
-+
-+ for (i = 0; i < num; i++) {
-+ slot = slots[i];
-+ slot->rung = rung;
-+ BUG_ON(vma_fully_scanned(slot));
-+ }
-+
-+ if (rung->vma_root.num == num)
-+ reset_current_scan(rung, 0, 1);
-+}
-+
-+static inline int rung_add_one_slot(struct scan_rung *rung,
-+ struct vma_slot *slot)
-+{
-+ int err;
-+
-+ err = sradix_tree_enter(&rung->vma_root, (void **)&slot, 1);
-+ if (err)
-+ return err;
-+
-+ slot->rung = rung;
-+ if (rung->vma_root.num == 1)
-+ reset_current_scan(rung, 0, 1);
-+
-+ return 0;
-+}
-+
-+/*
-+ * Return true if the slot is deleted from its rung.
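-+ *
-+ * Illustration (hypothetical thresholds): with uksm_abundant_threshold
-+ * set to 10, a slot whose sampled dedup ratio reaches 10% is promoted
-+ * one rung up by vma_rung_up() and scanned more often, while a cold
-+ * slot drifts down through vma_rung_down(); both saturate at the ends
-+ * of uksm_scan_ladder[] instead of wrapping around.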
-+ */
-+static inline int vma_rung_enter(struct vma_slot *slot, struct scan_rung *rung)
-+{
-+ struct scan_rung *old_rung = slot->rung;
-+ int err;
-+
-+ if (old_rung == rung)
-+ return 0;
-+
-+ rung_rm_slot(slot);
-+ err = rung_add_one_slot(rung, slot);
-+ if (err) {
-+ err = rung_add_one_slot(old_rung, slot);
-+ WARN_ON(err); /* OOPS, badly OOM, we lost this slot */
-+ }
-+
-+ return 1;
-+}
-+
-+static inline int vma_rung_up(struct vma_slot *slot)
-+{
-+ struct scan_rung *rung;
-+
-+ rung = slot->rung;
-+ if (slot->rung != &uksm_scan_ladder[SCAN_LADDER_SIZE-1])
-+ rung++;
-+
-+ return vma_rung_enter(slot, rung);
-+}
-+
-+static inline int vma_rung_down(struct vma_slot *slot)
-+{
-+ struct scan_rung *rung;
-+
-+ rung = slot->rung;
-+ if (slot->rung != &uksm_scan_ladder[0])
-+ rung--;
-+
-+ return vma_rung_enter(slot, rung);
-+}
-+
-+/**
-+ * cal_dedup_ratio() - Calculate the deduplication ratio for this slot.
-+ */
-+static unsigned long cal_dedup_ratio(struct vma_slot *slot)
-+{
-+ unsigned long ret;
-+ unsigned long pages;
-+
-+ pages = slot->this_sampled;
-+ if (!pages)
-+ return 0;
-+
-+ BUG_ON(slot->pages_scanned == slot->last_scanned);
-+
-+ ret = slot->pages_merged;
-+
-+ /* Thrashing area filtering */
-+ if (ret && uksm_thrash_threshold) {
-+ if (slot->pages_cowed * 100 / slot->pages_merged
-+ > uksm_thrash_threshold) {
-+ ret = 0;
-+ } else {
-+ ret = slot->pages_merged - slot->pages_cowed;
-+ }
-+ }
-+
-+ return ret * 100 / pages;
-+}
-+
-+/**
-+ * cal_dedup_ratio_old() - Calculate the deduplication ratio for this slot.
-+ */
-+static unsigned long cal_dedup_ratio_old(struct vma_slot *slot)
-+{
-+ unsigned long ret;
-+ unsigned long pages;
-+
-+ pages = slot->pages;
-+ if (!pages)
-+ return 0;
-+
-+ ret = slot->pages_bemerged;
-+
-+ /* Thrashing area filtering */
-+ if (ret && uksm_thrash_threshold) {
-+ if (slot->pages_cowed * 100 / slot->pages_bemerged
-+ > uksm_thrash_threshold) {
-+ ret = 0;
-+ } else {
-+ ret = slot->pages_bemerged - slot->pages_cowed;
-+ }
-+ }
-+
-+ return ret * 100 / pages;
-+}
-+
-+/**
-+ * stable_node_reinsert() - When the hash_strength has been adjusted, the
-+ * stable tree needs to be restructured; this is the function that
-+ * re-inserts the stable nodes.
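-+ *
-+ * The stable tree is two levels deep. A sketch of the layout, using
-+ * only names that appear in this file:
-+ *
-+ *	root_treep: rbtree of tree_node, keyed by the sampled hash
-+ *	tree_node->sub_root: rbtree of stable_node, keyed by hash_max
-+ *
-+ * so re-insertion below first walks by hash and then, on a
-+ * first-level match, resolves collisions by hash_max.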
-+ */
-+static inline void stable_node_reinsert(struct stable_node *new_node,
-+ struct page *page,
-+ struct rb_root *root_treep,
-+ struct list_head *tree_node_listp,
-+ u32 hash)
-+{
-+ struct rb_node **new = &root_treep->rb_node;
-+ struct rb_node *parent = NULL;
-+ struct stable_node *stable_node;
-+ struct tree_node *tree_node;
-+ struct page *tree_page;
-+ int cmp;
-+
-+ while (*new) {
-+ int cmp;
-+
-+ tree_node = rb_entry(*new, struct tree_node, node);
-+
-+ cmp = hash_cmp(hash, tree_node->hash);
-+
-+ if (cmp < 0) {
-+ parent = *new;
-+ new = &parent->rb_left;
-+ } else if (cmp > 0) {
-+ parent = *new;
-+ new = &parent->rb_right;
-+ } else
-+ break;
-+ }
-+
-+ if (*new) {
-+ /* found a stable tree node with the same first-level hash */
-+ stable_node_hash_max(new_node, page, hash);
-+ if (tree_node->count == 1) {
-+ stable_node = rb_entry(tree_node->sub_root.rb_node,
-+ struct stable_node, node);
-+ tree_page = get_uksm_page(stable_node, 1, 0);
-+ if (tree_page) {
-+ stable_node_hash_max(stable_node,
-+ tree_page, hash);
-+ put_page(tree_page);
-+
-+ /* prepare for stable node insertion */
-+
-+ cmp = hash_cmp(new_node->hash_max,
-+ stable_node->hash_max);
-+ parent = &stable_node->node;
-+ if (cmp < 0)
-+ new = &parent->rb_left;
-+ else if (cmp > 0)
-+ new = &parent->rb_right;
-+ else
-+ goto failed;
-+
-+ goto add_node;
-+ } else {
-+ /* the only stable_node deleted, the tree node
-+ * was not deleted.
-+ */
-+ goto tree_node_reuse;
-+ }
-+ }
-+
-+ /* well, search the collision subtree */
-+ new = &tree_node->sub_root.rb_node;
-+ parent = NULL;
-+ BUG_ON(!*new);
-+ while (*new) {
-+ int cmp;
-+
-+ stable_node = rb_entry(*new, struct stable_node, node);
-+
-+ cmp = hash_cmp(new_node->hash_max,
-+ stable_node->hash_max);
-+
-+ if (cmp < 0) {
-+ parent = *new;
-+ new = &parent->rb_left;
-+ } else if (cmp > 0) {
-+ parent = *new;
-+ new = &parent->rb_right;
-+ } else {
-+ /* oh, no, still a collision */
-+ goto failed;
-+ }
-+ }
-+
-+ goto add_node;
-+ }
-+
-+ /* no tree node found */
-+ tree_node = alloc_tree_node(tree_node_listp);
-+ if (!tree_node) {
-+ pr_err("UKSM: memory allocation error!\n");
-+ goto failed;
-+ } else {
-+ tree_node->hash = hash;
-+ rb_link_node(&tree_node->node, parent, new);
-+ rb_insert_color(&tree_node->node, root_treep);
-+
-+tree_node_reuse:
-+ /* prepare for stable node insertion */
-+ parent = NULL;
-+ new = &tree_node->sub_root.rb_node;
-+ }
-+
-+add_node:
-+ rb_link_node(&new_node->node, parent, new);
-+ rb_insert_color(&new_node->node, &tree_node->sub_root);
-+ new_node->tree_node = tree_node;
-+ tree_node->count++;
-+ return;
-+
-+failed:
-+ /* This can only happen when two nodes have collided
-+ * in two levels.
-+ */
-+ new_node->tree_node = NULL;
-+ return;
-+}
-+
-+static inline void free_all_tree_nodes(struct list_head *list)
-+{
-+ struct tree_node *node, *tmp;
-+
-+ list_for_each_entry_safe(node, tmp, list, all_list) {
-+ free_tree_node(node);
-+ }
-+}
-+
-+/**
-+ * stable_tree_delta_hash() - Delta hash the stable tree from the previous
-+ * hash strength to the current hash_strength. It re-structures the whole
-+ * tree.
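-+ *
-+ * Illustrative cost argument (hypothetical strengths): growing
-+ * hash_strength from 128 to 160 lets delta_hash() fold just the newly
-+ * sampled words into each stored hash instead of rehashing every
-+ * stable page from scratch; only nodes that never entered the rbtree
-+ * (no tree_node) fall back to a full page_hash() below.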
-+ */
-+static inline void stable_tree_delta_hash(u32 prev_hash_strength)
-+{
-+ struct stable_node *node, *tmp;
-+ struct rb_root *root_new_treep;
-+ struct list_head *new_tree_node_listp;
-+
-+ stable_tree_index = (stable_tree_index + 1) % 2;
-+ root_new_treep = &root_stable_tree[stable_tree_index];
-+ new_tree_node_listp = &stable_tree_node_list[stable_tree_index];
-+ *root_new_treep = RB_ROOT;
-+ BUG_ON(!list_empty(new_tree_node_listp));
-+
-+ /*
-+ * we need to be safe, the node could be removed by get_uksm_page()
-+ */
-+ list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) {
-+ void *addr;
-+ struct page *node_page;
-+ u32 hash;
-+
-+ /*
-+ * We are completely re-structuring the stable nodes to a new
-+ * stable tree. We don't want to touch the old tree unlinks and
-+ * old tree_nodes. The old tree_nodes will be freed at once.
-+ */
-+ node_page = get_uksm_page(node, 0, 0);
-+ if (!node_page)
-+ continue;
-+
-+ if (node->tree_node) {
-+ hash = node->tree_node->hash;
-+
-+ addr = kmap_atomic(node_page);
-+
-+ hash = delta_hash(addr, prev_hash_strength,
-+ hash_strength, hash);
-+ kunmap_atomic(addr);
-+ } else {
-+ /*
-+ * It was not inserted into the rbtree due to a
-+ * collision in the last round of scanning.
-+ */
-+ hash = page_hash(node_page, hash_strength, 0);
-+ }
-+
-+ stable_node_reinsert(node, node_page, root_new_treep,
-+ new_tree_node_listp, hash);
-+ put_page(node_page);
-+ }
-+
-+ root_stable_treep = root_new_treep;
-+ free_all_tree_nodes(stable_tree_node_listp);
-+ BUG_ON(!list_empty(stable_tree_node_listp));
-+ stable_tree_node_listp = new_tree_node_listp;
-+}
-+
-+static inline void inc_hash_strength(unsigned long delta)
-+{
-+ hash_strength += 1 << delta;
-+ if (hash_strength > HASH_STRENGTH_MAX)
-+ hash_strength = HASH_STRENGTH_MAX;
-+}
-+
-+static inline void dec_hash_strength(unsigned long delta)
-+{
-+ unsigned long change = 1 << delta;
-+
-+ if (hash_strength <= change + 1)
-+ hash_strength = 1;
-+ else
-+ hash_strength -= change;
-+}
-+
-+static inline void inc_hash_strength_delta(void)
-+{
-+ hash_strength_delta++;
-+ if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX)
-+ hash_strength_delta = HASH_STRENGTH_DELTA_MAX;
-+}
-+
-+static inline unsigned long get_current_neg_ratio(void)
-+{
-+ u64 pos = benefit.pos;
-+ u64 neg = benefit.neg;
-+
-+ if (!neg)
-+ return 0;
-+
-+ if (!pos || neg > pos)
-+ return 100;
-+
-+ if (neg > div64_u64(U64_MAX, 100))
-+ pos = div64_u64(pos, 100);
-+ else
-+ neg *= 100;
-+
-+ return div64_u64(neg, pos);
-+}
-+
-+static inline unsigned long get_current_benefit(void)
-+{
-+ u64 pos = benefit.pos;
-+ u64 neg = benefit.neg;
-+ u64 scanned = benefit.scanned;
-+
-+ if (neg > pos)
-+ return 0;
-+
-+ return div64_u64((pos - neg), scanned);
-+}
-+
-+static inline int judge_rshash_direction(void)
-+{
-+ u64 current_neg_ratio, stable_benefit;
-+ u64 current_benefit, delta = 0;
-+ int ret = STILL;
-+
-+ /*
-+ * Try to probe a value shortly after boot, and again when the
-+ * system has stayed still for a long time.
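-+ *
-+ * Worked example (hypothetical counters): benefit.pos = 1000 and
-+ * benefit.neg = 100 give a negative ratio of 100 * 100 / 1000 = 10%,
-+ * far below the 90% that forces GO_UP; with benefit.scanned = 300
-+ * the current benefit is (1000 - 100) / 300 = 3, which is then
-+ * compared against the recorded stable_benefit.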
-+ */ -+ if ((fully_scanned_round & 0xFFULL) == 10) { -+ ret = OBSCURE; -+ goto out; -+ } -+ -+ current_neg_ratio = get_current_neg_ratio(); -+ -+ if (current_neg_ratio == 0) { -+ rshash_neg_cont_zero++; -+ if (rshash_neg_cont_zero > 2) -+ return GO_DOWN; -+ else -+ return STILL; -+ } -+ rshash_neg_cont_zero = 0; -+ -+ if (current_neg_ratio > 90) { -+ ret = GO_UP; -+ goto out; -+ } -+ -+ current_benefit = get_current_benefit(); -+ stable_benefit = rshash_state.stable_benefit; -+ -+ if (!stable_benefit) { -+ ret = OBSCURE; -+ goto out; -+ } -+ -+ if (current_benefit > stable_benefit) -+ delta = current_benefit - stable_benefit; -+ else if (current_benefit < stable_benefit) -+ delta = stable_benefit - current_benefit; -+ -+ delta = div64_u64(100 * delta, stable_benefit); -+ -+ if (delta > 50) { -+ rshash_cont_obscure++; -+ if (rshash_cont_obscure > 2) -+ return OBSCURE; -+ else -+ return STILL; -+ } -+ -+out: -+ rshash_cont_obscure = 0; -+ return ret; -+} -+ -+/** -+ * rshash_adjust() - The main function to control the random sampling state -+ * machine for hash strength adapting. -+ * -+ * return true if hash_strength has changed. -+ */ -+static inline int rshash_adjust(void) -+{ -+ unsigned long prev_hash_strength = hash_strength; -+ -+ if (!encode_benefit()) -+ return 0; -+ -+ switch (rshash_state.state) { -+ case RSHASH_STILL: -+ switch (judge_rshash_direction()) { -+ case GO_UP: -+ if (rshash_state.pre_direct == GO_DOWN) -+ hash_strength_delta = 0; -+ -+ inc_hash_strength(hash_strength_delta); -+ inc_hash_strength_delta(); -+ rshash_state.stable_benefit = get_current_benefit(); -+ rshash_state.pre_direct = GO_UP; -+ break; -+ -+ case GO_DOWN: -+ if (rshash_state.pre_direct == GO_UP) -+ hash_strength_delta = 0; -+ -+ dec_hash_strength(hash_strength_delta); -+ inc_hash_strength_delta(); -+ rshash_state.stable_benefit = get_current_benefit(); -+ rshash_state.pre_direct = GO_DOWN; -+ break; -+ -+ case OBSCURE: -+ rshash_state.stable_point = hash_strength; -+ rshash_state.turn_point_down = hash_strength; -+ rshash_state.turn_point_up = hash_strength; -+ rshash_state.turn_benefit_down = get_current_benefit(); -+ rshash_state.turn_benefit_up = get_current_benefit(); -+ rshash_state.lookup_window_index = 0; -+ rshash_state.state = RSHASH_TRYDOWN; -+ dec_hash_strength(hash_strength_delta); -+ inc_hash_strength_delta(); -+ break; -+ -+ case STILL: -+ break; -+ default: -+ BUG(); -+ } -+ break; -+ -+ case RSHASH_TRYDOWN: -+ if (rshash_state.lookup_window_index++ % 5 == 0) -+ rshash_state.below_count = 0; -+ -+ if (get_current_benefit() < rshash_state.stable_benefit) -+ rshash_state.below_count++; -+ else if (get_current_benefit() > -+ rshash_state.turn_benefit_down) { -+ rshash_state.turn_point_down = hash_strength; -+ rshash_state.turn_benefit_down = get_current_benefit(); -+ } -+ -+ if (rshash_state.below_count >= 3 || -+ judge_rshash_direction() == GO_UP || -+ hash_strength == 1) { -+ hash_strength = rshash_state.stable_point; -+ hash_strength_delta = 0; -+ inc_hash_strength(hash_strength_delta); -+ inc_hash_strength_delta(); -+ rshash_state.lookup_window_index = 0; -+ rshash_state.state = RSHASH_TRYUP; -+ hash_strength_delta = 0; -+ } else { -+ dec_hash_strength(hash_strength_delta); -+ inc_hash_strength_delta(); -+ } -+ break; -+ -+ case RSHASH_TRYUP: -+ if (rshash_state.lookup_window_index++ % 5 == 0) -+ rshash_state.below_count = 0; -+ -+ if (get_current_benefit() < rshash_state.turn_benefit_down) -+ rshash_state.below_count++; -+ else if (get_current_benefit() > 
rshash_state.turn_benefit_up) { -+ rshash_state.turn_point_up = hash_strength; -+ rshash_state.turn_benefit_up = get_current_benefit(); -+ } -+ -+ if (rshash_state.below_count >= 3 || -+ judge_rshash_direction() == GO_DOWN || -+ hash_strength == HASH_STRENGTH_MAX) { -+ hash_strength = rshash_state.turn_benefit_up > -+ rshash_state.turn_benefit_down ? -+ rshash_state.turn_point_up : -+ rshash_state.turn_point_down; -+ -+ rshash_state.state = RSHASH_PRE_STILL; -+ } else { -+ inc_hash_strength(hash_strength_delta); -+ inc_hash_strength_delta(); -+ } -+ -+ break; -+ -+ case RSHASH_NEW: -+ case RSHASH_PRE_STILL: -+ rshash_state.stable_benefit = get_current_benefit(); -+ rshash_state.state = RSHASH_STILL; -+ hash_strength_delta = 0; -+ break; -+ default: -+ BUG(); -+ } -+ -+ /* rshash_neg = rshash_pos = 0; */ -+ reset_benefit(); -+ -+ if (prev_hash_strength != hash_strength) -+ stable_tree_delta_hash(prev_hash_strength); -+ -+ return prev_hash_strength != hash_strength; -+} -+ -+/** -+ * round_update_ladder() - The main function to do update of all the -+ * adjustments whenever a scan round is finished. -+ */ -+static noinline void round_update_ladder(void) -+{ -+ int i; -+ unsigned long dedup; -+ struct vma_slot *slot, *tmp_slot; -+ -+ for (i = 0; i < SCAN_LADDER_SIZE; i++) -+ uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED; -+ -+ list_for_each_entry_safe(slot, tmp_slot, &vma_slot_dedup, dedup_list) { -+ -+ /* slot may be rung_rm_slot() when mm exits */ -+ if (slot->snode) { -+ dedup = cal_dedup_ratio_old(slot); -+ if (dedup && dedup >= uksm_abundant_threshold) -+ vma_rung_up(slot); -+ } -+ -+ slot->pages_bemerged = 0; -+ slot->pages_cowed = 0; -+ -+ list_del_init(&slot->dedup_list); -+ } -+} -+ -+static void uksm_del_vma_slot(struct vma_slot *slot) -+{ -+ int i, j; -+ struct rmap_list_entry *entry; -+ -+ if (slot->snode) { -+ /* -+ * In case it just failed when entering the rung, it's not -+ * necessary. 
-+ */
-+ rung_rm_slot(slot);
-+ }
-+
-+ if (!list_empty(&slot->dedup_list))
-+ list_del(&slot->dedup_list);
-+
-+ if (!slot->rmap_list_pool || !slot->pool_counts) {
-+ /* In case it OOMed in uksm_vma_enter() */
-+ goto out;
-+ }
-+
-+ for (i = 0; i < slot->pool_size; i++) {
-+ void *addr;
-+
-+ if (!slot->rmap_list_pool[i])
-+ continue;
-+
-+ addr = kmap(slot->rmap_list_pool[i]);
-+ for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
-+ entry = (struct rmap_list_entry *)addr + j;
-+ if (is_addr(entry->addr))
-+ continue;
-+ if (!entry->item)
-+ continue;
-+
-+ remove_rmap_item_from_tree(entry->item);
-+ free_rmap_item(entry->item);
-+ slot->pool_counts[i]--;
-+ }
-+ BUG_ON(slot->pool_counts[i]);
-+ kunmap(slot->rmap_list_pool[i]);
-+ __free_page(slot->rmap_list_pool[i]);
-+ }
-+ kfree(slot->rmap_list_pool);
-+ kfree(slot->pool_counts);
-+
-+out:
-+ slot->rung = NULL;
-+ if (slot->flags & UKSM_SLOT_IN_UKSM) {
-+ BUG_ON(uksm_pages_total < slot->pages);
-+ uksm_pages_total -= slot->pages;
-+ }
-+
-+ if (slot->fully_scanned_round == fully_scanned_round)
-+ scanned_virtual_pages -= slot->pages;
-+ else
-+ scanned_virtual_pages -= slot->pages_scanned;
-+ free_vma_slot(slot);
-+}
-+
-+
-+#define SPIN_LOCK_PERIOD 32
-+static struct vma_slot *cleanup_slots[SPIN_LOCK_PERIOD];
-+static inline void cleanup_vma_slots(void)
-+{
-+ struct vma_slot *slot;
-+ int i;
-+
-+ i = 0;
-+ spin_lock(&vma_slot_list_lock);
-+ while (!list_empty(&vma_slot_del)) {
-+ slot = list_entry(vma_slot_del.next,
-+ struct vma_slot, slot_list);
-+ list_del(&slot->slot_list);
-+ cleanup_slots[i++] = slot;
-+ if (i == SPIN_LOCK_PERIOD) {
-+ spin_unlock(&vma_slot_list_lock);
-+ while (--i >= 0)
-+ uksm_del_vma_slot(cleanup_slots[i]);
-+ i = 0;
-+ spin_lock(&vma_slot_list_lock);
-+ }
-+ }
-+ spin_unlock(&vma_slot_list_lock);
-+
-+ while (--i >= 0)
-+ uksm_del_vma_slot(cleanup_slots[i]);
-+}
-+
-+/*
-+ * Exponential moving average formula
-+ */
-+static inline unsigned long ema(unsigned long curr, unsigned long last_ema)
-+{
-+ /*
-+ * For a very high burst, even the EMA cannot work well: a falsely
-+ * high per-page time estimate feeds back into a very high context
-+ * switch and rung update overhead, which raises the per-page time
-+ * further, so the estimate may never converge.
-+ *
-+ * Instead, we approach such a value in a binary manner: e.g. a curr
-+ * more than ten times last_ema only doubles the estimate.
-+ */
-+ if (curr > last_ema * 10)
-+ return last_ema * 2;
-+
-+ return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100;
-+}
-+
-+/*
-+ * convert cpu ratio in 1/TIME_RATIO_SCALE configured by user to
-+ * nanoseconds based on current uksm_sleep_jiffies.
-+ */
-+static inline unsigned long cpu_ratio_to_nsec(unsigned int ratio)
-+{
-+ return NSEC_PER_USEC * jiffies_to_usecs(uksm_sleep_jiffies) /
-+ (TIME_RATIO_SCALE - ratio) * ratio;
-+}
-+
-+
-+static inline unsigned long rung_real_ratio(int cpu_time_ratio)
-+{
-+ unsigned long ret;
-+
-+ BUG_ON(!cpu_time_ratio);
-+
-+ if (cpu_time_ratio > 0)
-+ ret = cpu_time_ratio;
-+ else
-+ ret = (unsigned long)(-cpu_time_ratio) *
-+ uksm_max_cpu_percentage / 100UL;
-+
-+ return ret ?
ret : 1; -+} -+ -+static noinline void uksm_calc_scan_pages(void) -+{ -+ struct scan_rung *ladder = uksm_scan_ladder; -+ unsigned long sleep_usecs, nsecs; -+ unsigned long ratio; -+ int i; -+ unsigned long per_page; -+ -+ if (uksm_ema_page_time > 100000 || -+ (((unsigned long) uksm_eval_round & (256UL - 1)) == 0UL)) -+ uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT; -+ -+ per_page = uksm_ema_page_time; -+ BUG_ON(!per_page); -+ -+ /* -+ * For every 8 eval round, we try to probe a uksm_sleep_jiffies value -+ * based on saved user input. -+ */ -+ if (((unsigned long) uksm_eval_round & (8UL - 1)) == 0UL) -+ uksm_sleep_jiffies = uksm_sleep_saved; -+ -+ /* We require a rung scan at least 1 page in a period. */ -+ nsecs = per_page; -+ ratio = rung_real_ratio(ladder[0].cpu_ratio); -+ if (cpu_ratio_to_nsec(ratio) < nsecs) { -+ sleep_usecs = nsecs * (TIME_RATIO_SCALE - ratio) / ratio -+ / NSEC_PER_USEC; -+ uksm_sleep_jiffies = usecs_to_jiffies(sleep_usecs) + 1; -+ } -+ -+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { -+ ratio = rung_real_ratio(ladder[i].cpu_ratio); -+ ladder[i].pages_to_scan = cpu_ratio_to_nsec(ratio) / -+ per_page; -+ BUG_ON(!ladder[i].pages_to_scan); -+ uksm_calc_rung_step(&ladder[i], per_page, ratio); -+ } -+} -+ -+/* -+ * From the scan time of this round (ns) to next expected min sleep time -+ * (ms), be careful of the possible overflows. ratio is taken from -+ * rung_real_ratio() -+ */ -+static inline -+unsigned int scan_time_to_sleep(unsigned long long scan_time, unsigned long ratio) -+{ -+ scan_time >>= 20; /* to msec level now */ -+ BUG_ON(scan_time > (ULONG_MAX / TIME_RATIO_SCALE)); -+ -+ return (unsigned int) ((unsigned long) scan_time * -+ (TIME_RATIO_SCALE - ratio) / ratio); -+} -+ -+#define __round_mask(x, y) ((__typeof__(x))((y)-1)) -+#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) -+ -+static void uksm_vma_enter(struct vma_slot **slots, unsigned long num) -+{ -+ struct scan_rung *rung; -+ -+ rung = &uksm_scan_ladder[0]; -+ rung_add_new_slots(rung, slots, num); -+} -+ -+static struct vma_slot *batch_slots[SLOT_TREE_NODE_STORE_SIZE]; -+ -+static void uksm_enter_all_slots(void) -+{ -+ struct vma_slot *slot; -+ unsigned long index; -+ struct list_head empty_vma_list; -+ int i; -+ -+ i = 0; -+ index = 0; -+ INIT_LIST_HEAD(&empty_vma_list); -+ -+ spin_lock(&vma_slot_list_lock); -+ while (!list_empty(&vma_slot_new)) { -+ slot = list_entry(vma_slot_new.next, -+ struct vma_slot, slot_list); -+ -+ if (!slot->vma->anon_vma) { -+ list_move(&slot->slot_list, &empty_vma_list); -+ } else if (vma_can_enter(slot->vma)) { -+ batch_slots[index++] = slot; -+ list_del_init(&slot->slot_list); -+ } else { -+ list_move(&slot->slot_list, &vma_slot_noadd); -+ } -+ -+ if (++i == SPIN_LOCK_PERIOD || -+ (index && !(index % SLOT_TREE_NODE_STORE_SIZE))) { -+ spin_unlock(&vma_slot_list_lock); -+ -+ if (index && !(index % SLOT_TREE_NODE_STORE_SIZE)) { -+ uksm_vma_enter(batch_slots, index); -+ index = 0; -+ } -+ i = 0; -+ cond_resched(); -+ spin_lock(&vma_slot_list_lock); -+ } -+ } -+ -+ list_splice(&empty_vma_list, &vma_slot_new); -+ -+ spin_unlock(&vma_slot_list_lock); -+ -+ if (index) -+ uksm_vma_enter(batch_slots, index); -+ -+} -+ -+static inline int rung_round_finished(struct scan_rung *rung) -+{ -+ return rung->flags & UKSM_RUNG_ROUND_FINISHED; -+} -+ -+static inline void judge_slot(struct vma_slot *slot) -+{ -+ struct scan_rung *rung = slot->rung; -+ unsigned long dedup; -+ int deleted; -+ -+ dedup = cal_dedup_ratio(slot); -+ if (vma_fully_scanned(slot) && uksm_thrash_threshold) -+ deleted = 
vma_rung_enter(slot, &uksm_scan_ladder[0]); -+ else if (dedup && dedup >= uksm_abundant_threshold) -+ deleted = vma_rung_up(slot); -+ else -+ deleted = vma_rung_down(slot); -+ -+ slot->pages_merged = 0; -+ slot->pages_cowed = 0; -+ slot->this_sampled = 0; -+ -+ if (vma_fully_scanned(slot)) -+ slot->pages_scanned = 0; -+ -+ slot->last_scanned = slot->pages_scanned; -+ -+ /* If its deleted in above, then rung was already advanced. */ -+ if (!deleted) -+ advance_current_scan(rung); -+} -+ -+ -+static inline int hash_round_finished(void) -+{ -+ if (scanned_virtual_pages > (uksm_pages_total >> 2)) { -+ scanned_virtual_pages = 0; -+ if (uksm_pages_scanned) -+ fully_scanned_round++; -+ -+ return 1; -+ } else { -+ return 0; -+ } -+} -+ -+#define UKSM_MMSEM_BATCH 5 -+#define BUSY_RETRY 100 -+ -+/** -+ * uksm_do_scan() - the main worker function. -+ */ -+static noinline void uksm_do_scan(void) -+{ -+ struct vma_slot *slot, *iter; -+ struct mm_struct *busy_mm; -+ unsigned char round_finished, all_rungs_emtpy; -+ int i, err, mmsem_batch; -+ unsigned long pcost; -+ long long delta_exec; -+ unsigned long vpages, max_cpu_ratio; -+ unsigned long long start_time, end_time, scan_time; -+ unsigned int expected_jiffies; -+ -+ might_sleep(); -+ -+ vpages = 0; -+ -+ start_time = task_sched_runtime(current); -+ max_cpu_ratio = 0; -+ mmsem_batch = 0; -+ -+ for (i = 0; i < SCAN_LADDER_SIZE;) { -+ struct scan_rung *rung = &uksm_scan_ladder[i]; -+ unsigned long ratio; -+ int busy_retry; -+ -+ if (!rung->pages_to_scan) { -+ i++; -+ continue; -+ } -+ -+ if (!rung->vma_root.num) { -+ rung->pages_to_scan = 0; -+ i++; -+ continue; -+ } -+ -+ ratio = rung_real_ratio(rung->cpu_ratio); -+ if (ratio > max_cpu_ratio) -+ max_cpu_ratio = ratio; -+ -+ busy_retry = BUSY_RETRY; -+ /* -+ * Do not consider rung_round_finished() here, just used up the -+ * rung->pages_to_scan quota. 
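-+ *
-+ * Walkthrough of the batching below (hypothetical slot): once a
-+ * slot's mmap_sem is taken, up to UKSM_MMSEM_BATCH (5) consecutive
-+ * pages of that slot are scanned under the one read lock before it
-+ * is dropped, unless the slot is fully scanned or its mm exits
-+ * first; this bounds both the locking traffic and the lock hold
-+ * time.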
-+ */
-+ while (rung->pages_to_scan && rung->vma_root.num &&
-+ likely(!freezing(current))) {
-+ int reset = 0;
-+
-+ slot = rung->current_scan;
-+
-+ BUG_ON(vma_fully_scanned(slot));
-+
-+ if (mmsem_batch)
-+ err = 0;
-+ else
-+ err = try_down_read_slot_mmap_sem(slot);
-+
-+ if (err == -ENOENT) {
-+rm_slot:
-+ rung_rm_slot(slot);
-+ continue;
-+ }
-+
-+ busy_mm = slot->mm;
-+
-+ if (err == -EBUSY) {
-+ /* skip other vmas on the same mm */
-+ do {
-+ reset = advance_current_scan(rung);
-+ iter = rung->current_scan;
-+ busy_retry--;
-+ if (iter->vma->vm_mm != busy_mm ||
-+ !busy_retry || reset)
-+ break;
-+ } while (1);
-+
-+ if (iter->vma->vm_mm != busy_mm) {
-+ continue;
-+ } else {
-+ /* scan round finished */
-+ break;
-+ }
-+ }
-+
-+ BUG_ON(!vma_can_enter(slot->vma));
-+ if (uksm_test_exit(slot->vma->vm_mm)) {
-+ mmsem_batch = 0;
-+ mmap_read_unlock(slot->vma->vm_mm);
-+ goto rm_slot;
-+ }
-+
-+ if (mmsem_batch)
-+ mmsem_batch--;
-+ else
-+ mmsem_batch = UKSM_MMSEM_BATCH;
-+
-+ /* Ok, we have taken the mmap_sem, ready to scan */
-+ scan_vma_one_page(slot);
-+ rung->pages_to_scan--;
-+ vpages++;
-+
-+ if (rung->current_offset + rung->step > slot->pages - 1
-+ || vma_fully_scanned(slot)) {
-+ mmap_read_unlock(slot->vma->vm_mm);
-+ judge_slot(slot);
-+ mmsem_batch = 0;
-+ } else {
-+ rung->current_offset += rung->step;
-+ if (!mmsem_batch)
-+ mmap_read_unlock(slot->vma->vm_mm);
-+ }
-+
-+ busy_retry = BUSY_RETRY;
-+ cond_resched();
-+ }
-+
-+ if (mmsem_batch) {
-+ mmap_read_unlock(slot->vma->vm_mm);
-+ mmsem_batch = 0;
-+ }
-+
-+ if (freezing(current))
-+ break;
-+
-+ cond_resched();
-+ }
-+ end_time = task_sched_runtime(current);
-+ delta_exec = end_time - start_time;
-+
-+ if (freezing(current))
-+ return;
-+
-+ cleanup_vma_slots();
-+ uksm_enter_all_slots();
-+
-+ round_finished = 1;
-+ all_rungs_emtpy = 1;
-+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
-+ struct scan_rung *rung = &uksm_scan_ladder[i];
-+
-+ if (rung->vma_root.num) {
-+ all_rungs_emtpy = 0;
-+ if (!rung_round_finished(rung))
-+ round_finished = 0;
-+ }
-+ }
-+
-+ if (all_rungs_emtpy)
-+ round_finished = 0;
-+
-+ if (round_finished) {
-+ round_update_ladder();
-+ uksm_eval_round++;
-+
-+ if (hash_round_finished() && rshash_adjust()) {
-+ /* Reset the unstable root iff hash strength changed */
-+ uksm_hash_round++;
-+ root_unstable_tree = RB_ROOT;
-+ free_all_tree_nodes(&unstable_tree_node_list);
-+ }
-+
-+ /*
-+ * A number of pages can hang around indefinitely on per-cpu
-+ * pagevecs, raised page count preventing write_protect_page
-+ * from merging them. Though it doesn't really matter much,
-+ * it is puzzling to see some stuck in pages_volatile until
-+ * other activity jostles them out, and they also prevented
-+ * LTP's KSM test from succeeding deterministically; so drain
-+ * them here (here rather than on entry to uksm_do_scan(),
-+ * so we don't IPI too often when pages_to_scan is set low).
-+ */
-+ lru_add_drain_all();
-+ }
-+
-+
-+ if (vpages && delta_exec > 0) {
-+ pcost = (unsigned long) delta_exec / vpages;
-+ if (likely(uksm_ema_page_time))
-+ uksm_ema_page_time = ema(pcost, uksm_ema_page_time);
-+ else
-+ uksm_ema_page_time = pcost;
-+ }
-+
-+ uksm_calc_scan_pages();
-+ uksm_sleep_real = uksm_sleep_jiffies;
-+ /* in case of radical cpu bursts, apply the upper bound */
-+ end_time = task_sched_runtime(current);
-+ if (max_cpu_ratio && end_time > start_time) {
-+ scan_time = end_time - start_time;
-+ expected_jiffies = msecs_to_jiffies(
-+ scan_time_to_sleep(scan_time, max_cpu_ratio));
-+
-+ if (expected_jiffies > uksm_sleep_real)
-+ uksm_sleep_real = expected_jiffies;
-+
-+ /* We have a one second upper bound for responsiveness. */
-+ if (jiffies_to_msecs(uksm_sleep_real) > MSEC_PER_SEC)
-+ uksm_sleep_real = msecs_to_jiffies(1000);
-+ }
-+
-+ return;
-+}
-+
-+static int ksmd_should_run(void)
-+{
-+ return uksm_run & UKSM_RUN_MERGE;
-+}
-+
-+static int uksm_scan_thread(void *nothing)
-+{
-+ set_freezable();
-+ set_user_nice(current, 5);
-+
-+ while (!kthread_should_stop()) {
-+ mutex_lock(&uksm_thread_mutex);
-+ if (ksmd_should_run())
-+ uksm_do_scan();
-+ mutex_unlock(&uksm_thread_mutex);
-+
-+ try_to_freeze();
-+
-+ if (ksmd_should_run()) {
-+ schedule_timeout_interruptible(uksm_sleep_real);
-+ uksm_sleep_times++;
-+ } else {
-+ wait_event_freezable(uksm_thread_wait,
-+ ksmd_should_run() || kthread_should_stop());
-+ }
-+ }
-+ return 0;
-+}
-+
-+void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
-+{
-+ struct stable_node *stable_node;
-+ struct node_vma *node_vma;
-+ struct rmap_item *rmap_item;
-+ int search_new_forks = 0;
-+ unsigned long address;
-+
-+ VM_BUG_ON_PAGE(!PageKsm(page), page);
-+ VM_BUG_ON_PAGE(!PageLocked(page), page);
-+
-+ stable_node = page_stable_node(page);
-+ if (!stable_node)
-+ return;
-+again:
-+ hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) {
-+ hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) {
-+ struct anon_vma *anon_vma = rmap_item->anon_vma;
-+ struct anon_vma_chain *vmac;
-+ struct vm_area_struct *vma;
-+
-+ cond_resched();
-+ anon_vma_lock_read(anon_vma);
-+ anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
-+ 0, ULONG_MAX) {
-+ cond_resched();
-+ vma = vmac->vma;
-+ address = get_rmap_addr(rmap_item);
-+
-+ if (address < vma->vm_start ||
-+ address >= vma->vm_end)
-+ continue;
-+
-+ if ((rmap_item->slot->vma == vma) ==
-+ search_new_forks)
-+ continue;
-+
-+ if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
-+ continue;
-+
-+ if (!rwc->rmap_one(page, vma, address, rwc->arg)) {
-+ anon_vma_unlock_read(anon_vma);
-+ return;
-+ }
-+
-+ if (rwc->done && rwc->done(page)) {
-+ anon_vma_unlock_read(anon_vma);
-+ return;
-+ }
-+ }
-+ anon_vma_unlock_read(anon_vma);
-+ }
-+ }
-+ if (!search_new_forks++)
-+ goto again;
-+}
-+
-+#ifdef CONFIG_MIGRATION
-+/* Common ksm interface but may be specific to uksm */
-+void ksm_migrate_page(struct page *newpage, struct page *oldpage)
-+{
-+ struct stable_node *stable_node;
-+
-+ VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
-+ VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
-+ VM_BUG_ON(newpage->mapping != oldpage->mapping);
-+
-+ stable_node = page_stable_node(newpage);
-+ if (stable_node) {
-+ VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
-+ stable_node->kpfn = page_to_pfn(newpage);
-+ /*
-+ * newpage->mapping was set in advance; now we need smp_wmb()
-+ * to make sure that the new stable_node->kpfn is visible
-+ * to get_ksm_page() before it
can see that oldpage->mapping -+ * has gone stale (or that PageSwapCache has been cleared). -+ */ -+ smp_wmb(); -+ set_page_stable_node(oldpage, NULL); -+ } -+} -+#endif /* CONFIG_MIGRATION */ -+ -+#ifdef CONFIG_MEMORY_HOTREMOVE -+static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn, -+ unsigned long end_pfn) -+{ -+ struct rb_node *node; -+ -+ for (node = rb_first(root_stable_treep); node; node = rb_next(node)) { -+ struct stable_node *stable_node; -+ -+ stable_node = rb_entry(node, struct stable_node, node); -+ if (stable_node->kpfn >= start_pfn && -+ stable_node->kpfn < end_pfn) -+ return stable_node; -+ } -+ return NULL; -+} -+ -+static int uksm_memory_callback(struct notifier_block *self, -+ unsigned long action, void *arg) -+{ -+ struct memory_notify *mn = arg; -+ struct stable_node *stable_node; -+ -+ switch (action) { -+ case MEM_GOING_OFFLINE: -+ /* -+ * Keep it very simple for now: just lock out ksmd and -+ * MADV_UNMERGEABLE while any memory is going offline. -+ * mutex_lock_nested() is necessary because lockdep was alarmed -+ * that here we take uksm_thread_mutex inside notifier chain -+ * mutex, and later take notifier chain mutex inside -+ * uksm_thread_mutex to unlock it. But that's safe because both -+ * are inside mem_hotplug_mutex. -+ */ -+ mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING); -+ break; -+ -+ case MEM_OFFLINE: -+ /* -+ * Most of the work is done by page migration; but there might -+ * be a few stable_nodes left over, still pointing to struct -+ * pages which have been offlined: prune those from the tree. -+ */ -+ while ((stable_node = uksm_check_stable_tree(mn->start_pfn, -+ mn->start_pfn + mn->nr_pages)) != NULL) -+ remove_node_from_stable_tree(stable_node, 1, 1); -+ /* fallthrough */ -+ -+ case MEM_CANCEL_OFFLINE: -+ mutex_unlock(&uksm_thread_mutex); -+ break; -+ } -+ return NOTIFY_OK; -+} -+#endif /* CONFIG_MEMORY_HOTREMOVE */ -+ -+#ifdef CONFIG_SYSFS -+/* -+ * This all compiles without CONFIG_SYSFS, but is a waste of space. 
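-+ *
-+ * Illustrative usage from userspace; the /sys/kernel/mm/uksm path
-+ * is an assumption taken from the attribute group name below, not
-+ * something this excerpt shows being registered:
-+ *
-+ *	echo 1 > /sys/kernel/mm/uksm/run
-+ *	echo 20 > /sys/kernel/mm/uksm/max_cpu_percentage
-+ *	cat /sys/kernel/mm/uksm/pages_sharing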
-+ */ -+ -+#define UKSM_ATTR_RO(_name) \ -+ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) -+#define UKSM_ATTR(_name) \ -+ static struct kobj_attribute _name##_attr = \ -+ __ATTR(_name, 0644, _name##_show, _name##_store) -+ -+static ssize_t max_cpu_percentage_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%u\n", uksm_max_cpu_percentage); -+} -+ -+static ssize_t max_cpu_percentage_store(struct kobject *kobj, -+ struct kobj_attribute *attr, -+ const char *buf, size_t count) -+{ -+ unsigned long max_cpu_percentage; -+ int err; -+ -+ err = kstrtoul(buf, 10, &max_cpu_percentage); -+ if (err || max_cpu_percentage > 100) -+ return -EINVAL; -+ -+ if (max_cpu_percentage == 100) -+ max_cpu_percentage = 99; -+ else if (max_cpu_percentage < 10) -+ max_cpu_percentage = 10; -+ -+ uksm_max_cpu_percentage = max_cpu_percentage; -+ -+ return count; -+} -+UKSM_ATTR(max_cpu_percentage); -+ -+static ssize_t sleep_millisecs_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies)); -+} -+ -+static ssize_t sleep_millisecs_store(struct kobject *kobj, -+ struct kobj_attribute *attr, -+ const char *buf, size_t count) -+{ -+ unsigned long msecs; -+ int err; -+ -+ err = kstrtoul(buf, 10, &msecs); -+ if (err || msecs > MSEC_PER_SEC) -+ return -EINVAL; -+ -+ uksm_sleep_jiffies = msecs_to_jiffies(msecs); -+ uksm_sleep_saved = uksm_sleep_jiffies; -+ -+ return count; -+} -+UKSM_ATTR(sleep_millisecs); -+ -+ -+static ssize_t cpu_governor_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); -+ int i; -+ -+ buf[0] = '\0'; -+ for (i = 0; i < n ; i++) { -+ if (uksm_cpu_governor == i) -+ strcat(buf, "["); -+ -+ strcat(buf, uksm_cpu_governor_str[i]); -+ -+ if (uksm_cpu_governor == i) -+ strcat(buf, "]"); -+ -+ strcat(buf, " "); -+ } -+ strcat(buf, "\n"); -+ -+ return strlen(buf); -+} -+ -+static inline void init_performance_values(void) -+{ -+ int i; -+ struct scan_rung *rung; -+ struct uksm_cpu_preset_s *preset = uksm_cpu_preset + uksm_cpu_governor; -+ -+ -+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { -+ rung = uksm_scan_ladder + i; -+ rung->cpu_ratio = preset->cpu_ratio[i]; -+ rung->cover_msecs = preset->cover_msecs[i]; -+ } -+ -+ uksm_max_cpu_percentage = preset->max_cpu; -+} -+ -+static ssize_t cpu_governor_store(struct kobject *kobj, -+ struct kobj_attribute *attr, -+ const char *buf, size_t count) -+{ -+ int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); -+ -+ for (n--; n >= 0 ; n--) { -+ if (!strncmp(buf, uksm_cpu_governor_str[n], -+ strlen(uksm_cpu_governor_str[n]))) -+ break; -+ } -+ -+ if (n < 0) -+ return -EINVAL; -+ else -+ uksm_cpu_governor = n; -+ -+ init_performance_values(); -+ -+ return count; -+} -+UKSM_ATTR(cpu_governor); -+ -+static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, -+ char *buf) -+{ -+ return sprintf(buf, "%u\n", uksm_run); -+} -+ -+static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, -+ const char *buf, size_t count) -+{ -+ int err; -+ unsigned long flags; -+ -+ err = kstrtoul(buf, 10, &flags); -+ if (err || flags > UINT_MAX) -+ return -EINVAL; -+ if (flags > UKSM_RUN_MERGE) -+ return -EINVAL; -+ -+ mutex_lock(&uksm_thread_mutex); -+ if (uksm_run != flags) -+ uksm_run = flags; -+ mutex_unlock(&uksm_thread_mutex); -+ -+ if (flags & UKSM_RUN_MERGE) -+ wake_up_interruptible(&uksm_thread_wait); -+ -+ return count; -+} 
-+UKSM_ATTR(run); -+ -+static ssize_t abundant_threshold_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%u\n", uksm_abundant_threshold); -+} -+ -+static ssize_t abundant_threshold_store(struct kobject *kobj, -+ struct kobj_attribute *attr, -+ const char *buf, size_t count) -+{ -+ int err; -+ unsigned long flags; -+ -+ err = kstrtoul(buf, 10, &flags); -+ if (err || flags > 99) -+ return -EINVAL; -+ -+ uksm_abundant_threshold = flags; -+ -+ return count; -+} -+UKSM_ATTR(abundant_threshold); -+ -+static ssize_t thrash_threshold_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%u\n", uksm_thrash_threshold); -+} -+ -+static ssize_t thrash_threshold_store(struct kobject *kobj, -+ struct kobj_attribute *attr, -+ const char *buf, size_t count) -+{ -+ int err; -+ unsigned long flags; -+ -+ err = kstrtoul(buf, 10, &flags); -+ if (err || flags > 99) -+ return -EINVAL; -+ -+ uksm_thrash_threshold = flags; -+ -+ return count; -+} -+UKSM_ATTR(thrash_threshold); -+ -+static ssize_t cpu_ratios_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ int i, size; -+ struct scan_rung *rung; -+ char *p = buf; -+ -+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { -+ rung = &uksm_scan_ladder[i]; -+ -+ if (rung->cpu_ratio > 0) -+ size = sprintf(p, "%d ", rung->cpu_ratio); -+ else -+ size = sprintf(p, "MAX/%d ", -+ TIME_RATIO_SCALE / -rung->cpu_ratio); -+ -+ p += size; -+ } -+ -+ *p++ = '\n'; -+ *p = '\0'; -+ -+ return p - buf; -+} -+ -+static ssize_t cpu_ratios_store(struct kobject *kobj, -+ struct kobj_attribute *attr, -+ const char *buf, size_t count) -+{ -+ int i, cpuratios[SCAN_LADDER_SIZE], err; -+ unsigned long value; -+ struct scan_rung *rung; -+ char *p, *end = NULL; -+ -+ p = kzalloc(count, GFP_KERNEL); -+ if (!p) -+ return -ENOMEM; -+ -+ memcpy(p, buf, count); -+ -+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { -+ if (i != SCAN_LADDER_SIZE - 1) { -+ end = strchr(p, ' '); -+ if (!end) -+ return -EINVAL; -+ -+ *end = '\0'; -+ } -+ -+ if (strstr(p, "MAX/")) { -+ p = strchr(p, '/') + 1; -+ err = kstrtoul(p, 10, &value); -+ if (err || value > TIME_RATIO_SCALE || !value) -+ return -EINVAL; -+ -+ cpuratios[i] = -(int) (TIME_RATIO_SCALE / value); -+ } else { -+ err = kstrtoul(p, 10, &value); -+ if (err || value > TIME_RATIO_SCALE || !value) -+ return -EINVAL; -+ -+ cpuratios[i] = value; -+ } -+ -+ p = end + 1; -+ } -+ -+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { -+ rung = &uksm_scan_ladder[i]; -+ -+ rung->cpu_ratio = cpuratios[i]; -+ } -+ -+ return count; -+} -+UKSM_ATTR(cpu_ratios); -+ -+static ssize_t eval_intervals_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ int i, size; -+ struct scan_rung *rung; -+ char *p = buf; -+ -+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { -+ rung = &uksm_scan_ladder[i]; -+ size = sprintf(p, "%u ", rung->cover_msecs); -+ p += size; -+ } -+ -+ *p++ = '\n'; -+ *p = '\0'; -+ -+ return p - buf; -+} -+ -+static ssize_t eval_intervals_store(struct kobject *kobj, -+ struct kobj_attribute *attr, -+ const char *buf, size_t count) -+{ -+ int i, err; -+ unsigned long values[SCAN_LADDER_SIZE]; -+ struct scan_rung *rung; -+ char *p, *end = NULL; -+ ssize_t ret = count; -+ -+ p = kzalloc(count + 2, GFP_KERNEL); -+ if (!p) -+ return -ENOMEM; -+ -+ memcpy(p, buf, count); -+ -+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { -+ if (i != SCAN_LADDER_SIZE - 1) { -+ end = strchr(p, ' '); -+ if (!end) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ *end = '\0'; -+ } -+ -+ err = 
kstrtoul(p, 10, &values[i]); -+ if (err) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ p = end + 1; -+ } -+ -+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { -+ rung = &uksm_scan_ladder[i]; -+ -+ rung->cover_msecs = values[i]; -+ } -+ -+out: -+ kfree(p); -+ return ret; -+} -+UKSM_ATTR(eval_intervals); -+ -+static ssize_t ema_per_page_time_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%lu\n", uksm_ema_page_time); -+} -+UKSM_ATTR_RO(ema_per_page_time); -+ -+static ssize_t pages_shared_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%lu\n", uksm_pages_shared); -+} -+UKSM_ATTR_RO(pages_shared); -+ -+static ssize_t pages_sharing_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%lu\n", uksm_pages_sharing); -+} -+UKSM_ATTR_RO(pages_sharing); -+ -+static ssize_t pages_unshared_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%lu\n", uksm_pages_unshared); -+} -+UKSM_ATTR_RO(pages_unshared); -+ -+static ssize_t full_scans_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%llu\n", fully_scanned_round); -+} -+UKSM_ATTR_RO(full_scans); -+ -+static ssize_t pages_scanned_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ unsigned long base = 0; -+ u64 delta, ret; -+ -+ if (pages_scanned_stored) { -+ base = pages_scanned_base; -+ ret = pages_scanned_stored; -+ delta = uksm_pages_scanned >> base; -+ if (CAN_OVERFLOW_U64(ret, delta)) { -+ ret >>= 1; -+ delta >>= 1; -+ base++; -+ ret += delta; -+ } -+ } else { -+ ret = uksm_pages_scanned; -+ } -+ -+ while (ret > ULONG_MAX) { -+ ret >>= 1; -+ base++; -+ } -+ -+ if (base) -+ return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base); -+ else -+ return sprintf(buf, "%lu\n", (unsigned long)ret); -+} -+UKSM_ATTR_RO(pages_scanned); -+ -+static ssize_t hash_strength_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%lu\n", hash_strength); -+} -+UKSM_ATTR_RO(hash_strength); -+ -+static ssize_t sleep_times_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%llu\n", uksm_sleep_times); -+} -+UKSM_ATTR_RO(sleep_times); -+ -+ -+static struct attribute *uksm_attrs[] = { -+ &max_cpu_percentage_attr.attr, -+ &sleep_millisecs_attr.attr, -+ &cpu_governor_attr.attr, -+ &run_attr.attr, -+ &ema_per_page_time_attr.attr, -+ &pages_shared_attr.attr, -+ &pages_sharing_attr.attr, -+ &pages_unshared_attr.attr, -+ &full_scans_attr.attr, -+ &pages_scanned_attr.attr, -+ &hash_strength_attr.attr, -+ &sleep_times_attr.attr, -+ &thrash_threshold_attr.attr, -+ &abundant_threshold_attr.attr, -+ &cpu_ratios_attr.attr, -+ &eval_intervals_attr.attr, -+ NULL, -+}; -+ -+static struct attribute_group uksm_attr_group = { -+ .attrs = uksm_attrs, -+ .name = "uksm", -+}; -+#endif /* CONFIG_SYSFS */ -+ -+static inline void init_scan_ladder(void) -+{ -+ int i; -+ struct scan_rung *rung; -+ -+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { -+ rung = uksm_scan_ladder + i; -+ slot_tree_init_root(&rung->vma_root); -+ } -+ -+ init_performance_values(); -+ uksm_calc_scan_pages(); -+} -+ -+static inline int cal_positive_negative_costs(void) -+{ -+ struct page *p1, *p2; -+ unsigned char *addr1, *addr2; -+ unsigned long i, time_start, hash_cost; -+ unsigned long loopnum = 0; -+ -+ /*IMPORTANT: volatile is needed to prevent over-optimization by gcc. 
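/*
 * [Editor's illustration, not part of the patch] pages_scanned_show()
 * above reports a counter kept as "value * 2^base": whenever adding a
 * delta would overflow the u64, both terms are halved and the exponent
 * is bumped (see the CAN_OVERFLOW_U64 test above). A minimal user-space
 * analogue of that bookkeeping, with names of the editor's own:
 */
#include <stdint.h>

struct scaled_counter {
	uint64_t value;		/* mantissa */
	unsigned int base;	/* exponent: total == value * 2^base */
};

static void scaled_counter_add(struct scaled_counter *c, uint64_t delta)
{
	delta >>= c->base;		/* rescale the delta to match */
	while (c->value > UINT64_MAX - delta) {
		c->value >>= 1;		/* halve both terms to make room... */
		delta >>= 1;
		c->base++;		/* ...and record the dropped bit */
	}
	c->value += delta;
}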
*/ -+ volatile u32 hash; -+ volatile int ret; -+ -+ p1 = alloc_page(GFP_KERNEL); -+ if (!p1) -+ return -ENOMEM; -+ -+ p2 = alloc_page(GFP_KERNEL); -+ if (!p2) -+ return -ENOMEM; -+ -+ addr1 = kmap_atomic(p1); -+ addr2 = kmap_atomic(p2); -+ memset(addr1, prandom_u32(), PAGE_SIZE); -+ memcpy(addr2, addr1, PAGE_SIZE); -+ -+ /* make sure that the two pages differ in last byte */ -+ addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1]; -+ kunmap_atomic(addr2); -+ kunmap_atomic(addr1); -+ -+ time_start = jiffies; -+ while (jiffies - time_start < 100) { -+ for (i = 0; i < 100; i++) -+ hash = page_hash(p1, HASH_STRENGTH_FULL, 0); -+ loopnum += 100; -+ } -+ hash_cost = (jiffies - time_start); -+ -+ time_start = jiffies; -+ for (i = 0; i < loopnum; i++) -+ ret = pages_identical_with_cost(p1, p2); -+ memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start); -+ memcmp_cost /= hash_cost; -+ pr_info("UKSM: relative memcmp_cost = %lu " -+ "hash=%u cmp_ret=%d.\n", -+ memcmp_cost, hash, ret); -+ -+ __free_page(p1); -+ __free_page(p2); -+ return 0; -+} -+ -+static int init_zeropage_hash_table(void) -+{ -+ struct page *page; -+ char *addr; -+ int i; -+ -+ page = alloc_page(GFP_KERNEL); -+ if (!page) -+ return -ENOMEM; -+ -+ addr = kmap_atomic(page); -+ memset(addr, 0, PAGE_SIZE); -+ kunmap_atomic(addr); -+ -+ zero_hash_table = kmalloc_array(HASH_STRENGTH_MAX, sizeof(u32), -+ GFP_KERNEL); -+ if (!zero_hash_table) -+ return -ENOMEM; -+ -+ for (i = 0; i < HASH_STRENGTH_MAX; i++) -+ zero_hash_table[i] = page_hash(page, i, 0); -+ -+ __free_page(page); -+ -+ return 0; -+} -+ -+static inline int init_random_sampling(void) -+{ -+ unsigned long i; -+ -+ random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL); -+ if (!random_nums) -+ return -ENOMEM; -+ -+ for (i = 0; i < HASH_STRENGTH_FULL; i++) -+ random_nums[i] = i; -+ -+ for (i = 0; i < HASH_STRENGTH_FULL; i++) { -+ unsigned long rand_range, swap_index, tmp; -+ -+ rand_range = HASH_STRENGTH_FULL - i; -+ swap_index = i + prandom_u32() % rand_range; -+ tmp = random_nums[i]; -+ random_nums[i] = random_nums[swap_index]; -+ random_nums[swap_index] = tmp; -+ } -+ -+ rshash_state.state = RSHASH_NEW; -+ rshash_state.below_count = 0; -+ rshash_state.lookup_window_index = 0; -+ -+ return cal_positive_negative_costs(); -+} -+ -+static int __init uksm_slab_init(void) -+{ -+ rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0); -+ if (!rmap_item_cache) -+ goto out; -+ -+ stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0); -+ if (!stable_node_cache) -+ goto out_free1; -+ -+ node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0); -+ if (!node_vma_cache) -+ goto out_free2; -+ -+ vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0); -+ if (!vma_slot_cache) -+ goto out_free3; -+ -+ tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0); -+ if (!tree_node_cache) -+ goto out_free4; -+ -+ return 0; -+ -+out_free4: -+ kmem_cache_destroy(vma_slot_cache); -+out_free3: -+ kmem_cache_destroy(node_vma_cache); -+out_free2: -+ kmem_cache_destroy(stable_node_cache); -+out_free1: -+ kmem_cache_destroy(rmap_item_cache); -+out: -+ return -ENOMEM; -+} -+ -+static void __init uksm_slab_free(void) -+{ -+ kmem_cache_destroy(stable_node_cache); -+ kmem_cache_destroy(rmap_item_cache); -+ kmem_cache_destroy(node_vma_cache); -+ kmem_cache_destroy(vma_slot_cache); -+ kmem_cache_destroy(tree_node_cache); -+} -+ -+/* Common interface to ksm, different to it. 
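/*
 * [Editor's illustration, not part of the patch] init_random_sampling()
 * above fills random_nums with 0..HASH_STRENGTH_FULL-1 and then runs a
 * classic Fisher-Yates shuffle over it. The same idea in isolation,
 * with rand() standing in for the kernel's prandom_u32():
 */
#include <stdlib.h>

static void fisher_yates_shuffle(unsigned long *a, unsigned long n)
{
	unsigned long i;

	for (i = 0; i < n; i++) {
		/* uniform pick from the not-yet-fixed tail [i, n) */
		unsigned long j = i + (unsigned long)rand() % (n - i);
		unsigned long tmp = a[i];

		a[i] = a[j];
		a[j] = tmp;
	}
}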
*/ -+int ksm_madvise(struct vm_area_struct *vma, unsigned long start, -+ unsigned long end, int advice, unsigned long *vm_flags) -+{ -+ int err; -+ -+ switch (advice) { -+ case MADV_MERGEABLE: -+ return 0; /* just ignore the advice */ -+ -+ case MADV_UNMERGEABLE: -+ if (!(*vm_flags & VM_MERGEABLE) || !uksm_flags_can_scan(*vm_flags)) -+ return 0; /* just ignore the advice */ -+ -+ if (vma->anon_vma) { -+ err = unmerge_uksm_pages(vma, start, end); -+ if (err) -+ return err; -+ } -+ -+ uksm_remove_vma(vma); -+ *vm_flags &= ~VM_MERGEABLE; -+ break; -+ } -+ -+ return 0; -+} -+ -+/* Common interface to ksm, actually the same. */ -+struct page *ksm_might_need_to_copy(struct page *page, -+ struct vm_area_struct *vma, unsigned long address) -+{ -+ struct anon_vma *anon_vma = page_anon_vma(page); -+ struct page *new_page; -+ -+ if (PageKsm(page)) { -+ if (page_stable_node(page)) -+ return page; /* no need to copy it */ -+ } else if (!anon_vma) { -+ return page; /* no need to copy it */ -+ } else if (anon_vma->root == vma->anon_vma->root && -+ page->index == linear_page_index(vma, address)) { -+ return page; /* still no need to copy it */ -+ } -+ if (!PageUptodate(page)) -+ return page; /* let do_swap_page report the error */ -+ -+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); -+ if (new_page) { -+ copy_user_highpage(new_page, page, address, vma); -+ -+ SetPageDirty(new_page); -+ __SetPageUptodate(new_page); -+ __SetPageLocked(new_page); -+ } -+ -+ return new_page; -+} -+ -+/* Copied from mm/ksm.c and required from 5.1 */ -+bool reuse_ksm_page(struct page *page, -+ struct vm_area_struct *vma, -+ unsigned long address) -+{ -+#ifdef CONFIG_DEBUG_VM -+ if (WARN_ON(is_zero_pfn(page_to_pfn(page))) || -+ WARN_ON(!page_mapped(page)) || -+ WARN_ON(!PageLocked(page))) { -+ dump_page(page, "reuse_ksm_page"); -+ return false; -+ } -+#endif -+ -+ if (PageSwapCache(page) || !page_stable_node(page)) -+ return false; -+ /* Prohibit parallel get_ksm_page() */ -+ if (!page_ref_freeze(page, 1)) -+ return false; -+ -+ page_move_anon_rmap(page, vma); -+ page->index = linear_page_index(vma, address); -+ page_ref_unfreeze(page, 1); -+ -+ return true; -+} -+ -+static int __init uksm_init(void) -+{ -+ struct task_struct *uksm_thread; -+ int err; -+ -+ uksm_sleep_jiffies = msecs_to_jiffies(100); -+ uksm_sleep_saved = uksm_sleep_jiffies; -+ -+ slot_tree_init(); -+ init_scan_ladder(); -+ -+ -+ err = init_random_sampling(); -+ if (err) -+ goto out_free2; -+ -+ err = uksm_slab_init(); -+ if (err) -+ goto out_free1; -+ -+ err = init_zeropage_hash_table(); -+ if (err) -+ goto out_free0; -+ -+ uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd"); -+ if (IS_ERR(uksm_thread)) { -+ pr_err("uksm: creating kthread failed\n"); -+ err = PTR_ERR(uksm_thread); -+ goto out_free; -+ } -+ -+#ifdef CONFIG_SYSFS -+ err = sysfs_create_group(mm_kobj, &uksm_attr_group); -+ if (err) { -+ pr_err("uksm: register sysfs failed\n"); -+ kthread_stop(uksm_thread); -+ goto out_free; -+ } -+#else -+ uksm_run = UKSM_RUN_MERGE; /* no way for user to start it */ -+ -+#endif /* CONFIG_SYSFS */ -+ -+#ifdef CONFIG_MEMORY_HOTREMOVE -+ /* -+ * Choose a high priority since the callback takes uksm_thread_mutex: -+ * later callbacks could only be taking locks which nest within that. 
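/*
 * [Editor's note with illustration, not part of the patch] Observe the
 * semantics of ksm_madvise() above: UKSM scans eligible anonymous VMAs
 * on its own, so MADV_MERGEABLE is accepted but ignored, and only
 * MADV_UNMERGEABLE does real work. The user-space interface is the
 * same one mainline KSM uses:
 */
#include <sys/mman.h>

static int opt_out_of_merging(void *addr, size_t len)
{
	/* break and exclude any pages UKSM merged in this range */
	return madvise(addr, len, MADV_UNMERGEABLE);
}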
-+ */ -+ hotplug_memory_notifier(uksm_memory_callback, 100); -+#endif -+ return 0; -+ -+out_free: -+ kfree(zero_hash_table); -+out_free0: -+ uksm_slab_free(); -+out_free1: -+ kfree(random_nums); -+out_free2: -+ kfree(uksm_scan_ladder); -+ return err; -+} -+ -+#ifdef MODULE -+subsys_initcall(ksm_init); -+#else -+late_initcall(uksm_init); -+#endif -+ -diff --git a/mm/vmstat.c b/mm/vmstat.c -index 74b2c374b..ae42103a8 100644 ---- a/mm/vmstat.c -+++ b/mm/vmstat.c -@@ -1231,6 +1231,9 @@ const char * const vmstat_text[] = { - "nr_swapcached", - #endif - -+#ifdef CONFIG_UKSM -+ "nr_uksm_zero_pages", -+#endif - /* enum writeback_stat_item counters */ - "nr_dirty_threshold", - "nr_dirty_background_threshold", --- -2.31.1.305.gd1b10fc6d8 - diff --git a/0009-bbr2.patch b/0009-bbr2.patch deleted file mode 100644 index 5a257ca22043..000000000000 --- a/0009-bbr2.patch +++ /dev/null @@ -1,3347 +0,0 @@ -From f3d069e2cafed9758d66fcfc42447b028d42493f Mon Sep 17 00:00:00 2001 -From: Piotr Gorski <lucjan.lucjanov@gmail.com> -Date: Mon, 26 Apr 2021 21:14:18 +0200 -Subject: [PATCH] bbr2-5.12: introduce BBRv2 - -Signed-off-by: Piotr Gorski <lucjan.lucjanov@gmail.com> ---- - include/linux/tcp.h | 3 +- - include/net/inet_connection_sock.h | 5 +- - include/net/tcp.h | 48 +- - include/uapi/linux/inet_diag.h | 33 + - net/ipv4/Kconfig | 22 + - net/ipv4/Makefile | 1 + - net/ipv4/bpf_tcp_ca.c | 2 +- - net/ipv4/tcp.c | 1 + - net/ipv4/tcp_bbr.c | 38 +- - net/ipv4/tcp_bbr2.c | 2671 ++++++++++++++++++++++++++++ - net/ipv4/tcp_cong.c | 1 + - net/ipv4/tcp_input.c | 38 +- - net/ipv4/tcp_output.c | 25 +- - net/ipv4/tcp_rate.c | 36 +- - net/ipv4/tcp_timer.c | 1 + - 15 files changed, 2879 insertions(+), 46 deletions(-) - create mode 100644 net/ipv4/tcp_bbr2.c - -diff --git a/include/linux/tcp.h b/include/linux/tcp.h -index 48d8a3633..1bd559c69 100644 ---- a/include/linux/tcp.h -+++ b/include/linux/tcp.h -@@ -225,7 +225,8 @@ struct tcp_sock { - u8 compressed_ack; - u8 dup_ack_counter:2, - tlp_retrans:1, /* TLP is a retransmission */ -- unused:5; -+ fast_ack_mode:2, /* which fast ack mode ? 
*/ -+ unused:3; - u32 chrono_start; /* Start time in jiffies of a TCP chrono */ - u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ - u8 chrono_type:2, /* current chronograph type */ -diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h -index 3c8c59471..2cdc5a070 100644 ---- a/include/net/inet_connection_sock.h -+++ b/include/net/inet_connection_sock.h -@@ -134,8 +134,9 @@ struct inet_connection_sock { - u32 icsk_probes_tstamp; - u32 icsk_user_timeout; - -- u64 icsk_ca_priv[104 / sizeof(u64)]; --#define ICSK_CA_PRIV_SIZE (13 * sizeof(u64)) -+/* XXX inflated by temporary internal debugging info */ -+#define ICSK_CA_PRIV_SIZE (216) -+ u64 icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)]; - }; - - #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ -diff --git a/include/net/tcp.h b/include/net/tcp.h -index 963cd86d1..5a86fa1d2 100644 ---- a/include/net/tcp.h -+++ b/include/net/tcp.h -@@ -799,6 +799,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) - return max_t(s64, t1 - t0, 0); - } - -+static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0) -+{ -+ return max_t(s32, t1 - t0, 0); -+} -+ - static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) - { - return tcp_ns_to_ts(skb->skb_mstamp_ns); -@@ -866,16 +871,22 @@ struct tcp_skb_cb { - __u32 ack_seq; /* Sequence number ACK'd */ - union { - struct { -+#define TCPCB_DELIVERED_CE_MASK ((1U<<20) - 1) - /* There is space for up to 24 bytes */ -- __u32 in_flight:30,/* Bytes in flight at transmit */ -- is_app_limited:1, /* cwnd not fully used? */ -- unused:1; -+ __u32 is_app_limited:1, /* cwnd not fully used? */ -+ delivered_ce:20, -+ unused:11; - /* pkts S/ACKed so far upon tx of skb, incl retrans: */ - __u32 delivered; - /* start of send pipeline phase */ -- u64 first_tx_mstamp; -+ u32 first_tx_mstamp; - /* when we reached the "delivered" count */ -- u64 delivered_mstamp; -+ u32 delivered_mstamp; -+#define TCPCB_IN_FLIGHT_BITS 20 -+#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) -+ u32 in_flight:20, /* packets in flight at transmit */ -+ unused2:12; -+ u32 lost; /* packets lost so far upon tx of skb */ - } tx; /* only used for outgoing skbs */ - union { - struct inet_skb_parm h4; -@@ -1025,7 +1036,11 @@ enum tcp_ca_ack_event_flags { - #define TCP_CONG_NON_RESTRICTED 0x1 - /* Requires ECN/ECT set on all packets */ - #define TCP_CONG_NEEDS_ECN 0x2 --#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) -+/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). 
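/*
 * [Editor's illustration, not part of the patch]
 * tcp_stamp32_us_delta() above relies on 32-bit modular arithmetic:
 * (u32)(t1 - t0) is the true elapsed time even across a clock wrap,
 * provided the real gap is under 2^31 us (about 36 minutes), and
 * reading the difference as s32 turns "t1 older than t0" into a
 * negative value that max_t() clamps to zero. Self-contained check:
 */
#include <assert.h>
#include <stdint.h>

static uint32_t delta_us(uint32_t t1, uint32_t t0)
{
	int32_t d = (int32_t)(t1 - t0);	/* wrap-safe signed difference */

	return d > 0 ? (uint32_t)d : 0;	/* clamp reordering to zero */
}

static void delta_us_selftest(void)
{
	assert(delta_us(5u, UINT32_MAX - 4u) == 10);	/* across the wrap */
	assert(delta_us(100u, 300u) == 0);		/* stale t1 */
}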
*/ -+#define TCP_CONG_WANTS_CE_EVENTS 0x4 -+#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \ -+ TCP_CONG_NEEDS_ECN | \ -+ TCP_CONG_WANTS_CE_EVENTS) - - union tcp_cc_info; - -@@ -1045,8 +1060,13 @@ struct ack_sample { - */ - struct rate_sample { - u64 prior_mstamp; /* starting timestamp for interval */ -+ u32 prior_lost; /* tp->lost at "prior_mstamp" */ - u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ -+ u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ -+ u32 tx_in_flight; /* packets in flight at starting timestamp */ -+ s32 lost; /* number of packets lost over interval */ - s32 delivered; /* number of packets delivered over interval */ -+ s32 delivered_ce; /* packets delivered w/ CE mark over interval */ - long interval_us; /* time for tp->delivered to incr "delivered" */ - u32 snd_interval_us; /* snd interval for delivered packets */ - u32 rcv_interval_us; /* rcv interval for delivered packets */ -@@ -1057,6 +1077,7 @@ struct rate_sample { - bool is_app_limited; /* is sample from packet with bubble in pipe? */ - bool is_retrans; /* is sample from retransmission? */ - bool is_ack_delayed; /* is this (likely) a delayed ACK? */ -+ bool is_ece; /* did this ACK have ECN marked? */ - }; - - struct tcp_congestion_ops { -@@ -1083,10 +1104,12 @@ struct tcp_congestion_ops { - u32 (*undo_cwnd)(struct sock *sk); - /* hook for packet ack accounting (optional) */ - void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); -- /* override sysctl_tcp_min_tso_segs */ -- u32 (*min_tso_segs)(struct sock *sk); -+ /* pick target number of segments per TSO/GSO skb (optional): */ -+ u32 (*tso_segs)(struct sock *sk, unsigned int mss_now); - /* returns the multiplier used in tcp_sndbuf_expand (optional) */ - u32 (*sndbuf_expand)(struct sock *sk); -+ /* react to a specific lost skb (optional) */ -+ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); - /* call when packets are delivered to update cwnd and pacing rate, - * after all the ca_state processing. (optional) - */ -@@ -1132,6 +1155,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) - } - #endif - -+static inline bool tcp_ca_wants_ce_events(const struct sock *sk) -+{ -+ const struct inet_connection_sock *icsk = inet_csk(sk); -+ -+ return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN | -+ TCP_CONG_WANTS_CE_EVENTS); -+} -+ - static inline bool tcp_ca_needs_ecn(const struct sock *sk) - { - const struct inet_connection_sock *icsk = inet_csk(sk); -@@ -1157,6 +1188,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) - } - - /* From tcp_rate.c */ -+void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb); - void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); - void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, - struct rate_sample *rs); -diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h -index 20ee93f0f..96d52dd9c 100644 ---- a/include/uapi/linux/inet_diag.h -+++ b/include/uapi/linux/inet_diag.h -@@ -231,9 +231,42 @@ struct tcp_bbr_info { - __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ - }; - -+/* Phase as reported in netlink/ss stats. 
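/*
 * [Editor's sketch, not part of the patch] A congestion-control module
 * opts in to the new CA_EVENT_ECN_IS_CE / CA_EVENT_ECN_NO_CE callbacks
 * by setting TCP_CONG_WANTS_CE_EVENTS; tcp_ca_wants_ce_events() above
 * is what the stack tests. A hypothetical minimal module (the "toy_*"
 * names are the editor's own and the handlers are deliberately inert):
 */
static u32 toy_ssthresh(struct sock *sk)
{
	return tcp_sk(sk)->snd_ssthresh;
}

static u32 toy_undo_cwnd(struct sock *sk)
{
	return tcp_sk(sk)->snd_cwnd;
}

static void toy_cong_control(struct sock *sk, const struct rate_sample *rs)
{
}

static void toy_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
{
	/* CA_EVENT_ECN_IS_CE / CA_EVENT_ECN_NO_CE arrive here */
}

static struct tcp_congestion_ops toy_ce_ops __read_mostly = {
	.flags		= TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS,
	.name		= "toy_ce",
	.owner		= THIS_MODULE,
	.ssthresh	= toy_ssthresh,
	.undo_cwnd	= toy_undo_cwnd,
	.cong_control	= toy_cong_control,
	.cwnd_event	= toy_cwnd_event,
};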
*/ -+enum tcp_bbr2_phase { -+ BBR2_PHASE_INVALID = 0, -+ BBR2_PHASE_STARTUP = 1, -+ BBR2_PHASE_DRAIN = 2, -+ BBR2_PHASE_PROBE_RTT = 3, -+ BBR2_PHASE_PROBE_BW_UP = 4, -+ BBR2_PHASE_PROBE_BW_DOWN = 5, -+ BBR2_PHASE_PROBE_BW_CRUISE = 6, -+ BBR2_PHASE_PROBE_BW_REFILL = 7 -+}; -+ -+struct tcp_bbr2_info { -+ /* u64 bw: bandwidth (app throughput) estimate in Byte per sec: */ -+ __u32 bbr_bw_lsb; /* lower 32 bits of bw */ -+ __u32 bbr_bw_msb; /* upper 32 bits of bw */ -+ __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ -+ __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ -+ __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ -+ __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ -+ __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ -+ __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ -+ __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */ -+ __u8 bbr_mode; /* current bbr_mode in state machine */ -+ __u8 bbr_phase; /* current state machine phase */ -+ __u8 unused1; /* alignment padding; not used yet */ -+ __u8 bbr_version; /* MUST be at this offset in struct */ -+ __u32 bbr_inflight_lo; /* lower/short-term data volume bound */ -+ __u32 bbr_inflight_hi; /* higher/long-term data volume bound */ -+ __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ -+}; -+ - union tcp_cc_info { - struct tcpvegas_info vegas; - struct tcp_dctcp_info dctcp; - struct tcp_bbr_info bbr; -+ struct tcp_bbr2_info bbr2; - }; - #endif /* _UAPI_INET_DIAG_H_ */ -diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig -index 87983e70f..a833a7a67 100644 ---- a/net/ipv4/Kconfig -+++ b/net/ipv4/Kconfig -@@ -669,6 +669,24 @@ config TCP_CONG_BBR - AQM schemes that do not provide a delay signal. It requires the fq - ("Fair Queue") pacing packet scheduler. - -+config TCP_CONG_BBR2 -+ tristate "BBR2 TCP" -+ default n -+ help -+ -+ BBR2 TCP congestion control is a model-based congestion control -+ algorithm that aims to maximize network utilization, keep queues and -+ retransmit rates low, and to be able to coexist with Reno/CUBIC in -+ common scenarios. It builds an explicit model of the network path. It -+ tolerates a targeted degree of random packet loss and delay that are -+ unrelated to congestion. It can operate over LAN, WAN, cellular, wifi, -+ or cable modem links, and can use DCTCP-L4S-style ECN signals. It can -+ coexist with flows that use loss-based congestion control, and can -+ operate with shallow buffers, deep buffers, bufferbloat, policers, or -+ AQM schemes that do not provide a delay signal. It requires pacing, -+ using either TCP internal pacing or the fq ("Fair Queue") pacing packet -+ scheduler. 
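/*
 * [Editor's illustration, not part of the patch] Besides the Kconfig
 * default below, an application can select the new algorithm per
 * connection with the long-standing TCP_CONGESTION socket option;
 * "bbr2" is the name this patch registers:
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>
#include <sys/socket.h>

static int switch_to_bbr2(int fd)
{
	static const char name[] = "bbr2";

	return setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
			  name, strlen(name));
}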
-+ - choice - prompt "Default TCP congestion control" - default DEFAULT_CUBIC -@@ -706,6 +724,9 @@ choice - config DEFAULT_BBR - bool "BBR" if TCP_CONG_BBR=y - -+ config DEFAULT_BBR2 -+ bool "BBR2" if TCP_CONG_BBR2=y -+ - config DEFAULT_RENO - bool "Reno" - endchoice -@@ -730,6 +751,7 @@ config DEFAULT_TCP_CONG - default "dctcp" if DEFAULT_DCTCP - default "cdg" if DEFAULT_CDG - default "bbr" if DEFAULT_BBR -+ default "bbr2" if DEFAULT_BBR2 - default "cubic" - - config TCP_MD5SIG -diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile -index 5b77a4688..8c5779dba 100644 ---- a/net/ipv4/Makefile -+++ b/net/ipv4/Makefile -@@ -46,6 +46,7 @@ obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o - obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o - obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o - obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o -+obj-$(CONFIG_TCP_CONG_BBR2) += tcp_bbr2.o - obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o - obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o - obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o -diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c -index d520e6164..22129c1c5 100644 ---- a/net/ipv4/bpf_tcp_ca.c -+++ b/net/ipv4/bpf_tcp_ca.c -@@ -16,7 +16,7 @@ static u32 optional_ops[] = { - offsetof(struct tcp_congestion_ops, cwnd_event), - offsetof(struct tcp_congestion_ops, in_ack_event), - offsetof(struct tcp_congestion_ops, pkts_acked), -- offsetof(struct tcp_congestion_ops, min_tso_segs), -+ offsetof(struct tcp_congestion_ops, tso_segs), - offsetof(struct tcp_congestion_ops, sndbuf_expand), - offsetof(struct tcp_congestion_ops, cong_control), - }; -diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c -index de7cc8445..521f310f2 100644 ---- a/net/ipv4/tcp.c -+++ b/net/ipv4/tcp.c -@@ -3033,6 +3033,7 @@ int tcp_disconnect(struct sock *sk, int flags) - tp->rx_opt.dsack = 0; - tp->rx_opt.num_sacks = 0; - tp->rcv_ooopack = 0; -+ tp->fast_ack_mode = 0; - - - /* Clean up fastopen related fields */ -diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c -index 6ea3dc2e4..8ef512fef 100644 ---- a/net/ipv4/tcp_bbr.c -+++ b/net/ipv4/tcp_bbr.c -@@ -292,26 +292,40 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) - sk->sk_pacing_rate = rate; - } - --/* override sysctl_tcp_min_tso_segs */ - static u32 bbr_min_tso_segs(struct sock *sk) - { - return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; - } - -+/* Return the number of segments BBR would like in a TSO/GSO skb, given -+ * a particular max gso size as a constraint. -+ */ -+static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, -+ u32 gso_max_size) -+{ -+ u32 segs; -+ u64 bytes; -+ -+ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ -+ bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift; -+ -+ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); -+ segs = max_t(u32, bytes / mss_now, bbr_min_tso_segs(sk)); -+ return segs; -+} -+ -+/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ -+static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) -+{ -+ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); -+} -+ -+/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */ - static u32 bbr_tso_segs_goal(struct sock *sk) - { - struct tcp_sock *tp = tcp_sk(sk); -- u32 segs, bytes; -- -- /* Sort of tcp_tso_autosize() but ignoring -- * driver provided sk_gso_max_size. 
-- */ -- bytes = min_t(unsigned long, -- sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift), -- GSO_MAX_SIZE - 1 - MAX_TCP_HEADER); -- segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); - -- return min(segs, 0x7FU); -+ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_MAX_SIZE); - } - - /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ -@@ -1147,7 +1161,7 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { - .undo_cwnd = bbr_undo_cwnd, - .cwnd_event = bbr_cwnd_event, - .ssthresh = bbr_ssthresh, -- .min_tso_segs = bbr_min_tso_segs, -+ .tso_segs = bbr_tso_segs, - .get_info = bbr_get_info, - .set_state = bbr_set_state, - }; -diff --git a/net/ipv4/tcp_bbr2.c b/net/ipv4/tcp_bbr2.c -new file mode 100644 -index 000000000..17d6a059d ---- /dev/null -+++ b/net/ipv4/tcp_bbr2.c -@@ -0,0 +1,2671 @@ -+/* BBR (Bottleneck Bandwidth and RTT) congestion control, v2 -+ * -+ * BBRv2 is a model-based congestion control algorithm that aims for low -+ * queues, low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model -+ * of the network path, it uses measurements of bandwidth and RTT, as well as -+ * (if they occur) packet loss and/or DCTCP/L4S-style ECN signals. Note that -+ * although it can use ECN or loss signals explicitly, it does not require -+ * either; it can bound its in-flight data based on its estimate of the BDP. -+ * -+ * The model has both higher and lower bounds for the operating range: -+ * lo: bw_lo, inflight_lo: conservative short-term lower bound -+ * hi: bw_hi, inflight_hi: robust long-term upper bound -+ * The bandwidth-probing time scale is (a) extended dynamically based on -+ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by -+ * an interactive wall-clock time-scale to be more scalable and responsive -+ * than Reno and CUBIC. -+ * -+ * Here is a state transition diagram for BBR: -+ * -+ * | -+ * V -+ * +---> STARTUP ----+ -+ * | | | -+ * | V | -+ * | DRAIN ----+ -+ * | | | -+ * | V | -+ * +---> PROBE_BW ----+ -+ * | ^ | | -+ * | | | | -+ * | +----+ | -+ * | | -+ * +---- PROBE_RTT <--+ -+ * -+ * A BBR flow starts in STARTUP, and ramps up its sending rate quickly. -+ * When it estimates the pipe is full, it enters DRAIN to drain the queue. -+ * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT. -+ * A long-lived BBR flow spends the vast majority of its time remaining -+ * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth -+ * in a fair manner, with a small, bounded queue. *If* a flow has been -+ * continuously sending for the entire min_rtt window, and hasn't seen an RTT -+ * sample that matches or decreases its min_rtt estimate for 10 seconds, then -+ * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe -+ * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if -+ * we estimated that we reached the full bw of the pipe then we enter PROBE_BW; -+ * otherwise we enter STARTUP to try to fill the pipe. -+ * -+ * BBR is described in detail in: -+ * "BBR: Congestion-Based Congestion Control", -+ * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh, -+ * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016. 
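/*
 * [Editor's check, not part of the patch] On the BW_SCALE unit defined
 * shortly after this header: one unit of "pkt/us << 24" with 1500 B
 * packets is (1500 * 8 bit * 1e6 per sec) / 2^24 = 1.2e10 / 16777216,
 * which is ~715 bit/s, matching the "~= 715 bps" figure in that
 * comment.
 */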
-+ * -+ * There is a public e-mail list for discussing BBR development and testing: -+ * https://groups.google.com/forum/#!forum/bbr-dev -+ * -+ * NOTE: BBR might be used with the fq qdisc ("man tc-fq") with pacing enabled, -+ * otherwise TCP stack falls back to an internal pacing using one high -+ * resolution timer per TCP socket and may use more resources. -+ */ -+#include <linux/module.h> -+#include <net/tcp.h> -+#include <linux/inet_diag.h> -+#include <linux/inet.h> -+#include <linux/random.h> -+ -+#include "tcp_dctcp.h" -+ -+/* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth -+ * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. -+ * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. -+ * Since the minimum window is >=4 packets, the lower bound isn't -+ * an issue. The upper bound isn't an issue with existing technologies. -+ */ -+#define BW_SCALE 24 -+#define BW_UNIT (1 << BW_SCALE) -+ -+#define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g. gains) */ -+#define BBR_UNIT (1 << BBR_SCALE) -+ -+#define FLAG_DEBUG_VERBOSE 0x1 /* Verbose debugging messages */ -+#define FLAG_DEBUG_LOOPBACK 0x2 /* Do NOT skip loopback addr */ -+ -+#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ -+ -+/* BBR has the following modes for deciding how fast to send: */ -+enum bbr_mode { -+ BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */ -+ BBR_DRAIN, /* drain any queue created during startup */ -+ BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */ -+ BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ -+}; -+ -+/* How does the incoming ACK stream relate to our bandwidth probing? */ -+enum bbr_ack_phase { -+ BBR_ACKS_INIT, /* not probing; not getting probe feedback */ -+ BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ -+ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ -+ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ -+ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ -+}; -+ -+/* BBR congestion control block */ -+struct bbr { -+ u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ -+ u32 min_rtt_stamp; /* timestamp of min_rtt_us */ -+ u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ -+ u32 probe_rtt_min_us; /* min RTT in bbr_probe_rtt_win_ms window */ -+ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ -+ u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ -+ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ -+ u64 cycle_mstamp; /* time of this cycle phase start */ -+ u32 mode:3, /* current bbr_mode in state machine */ -+ prev_ca_state:3, /* CA state on previous ACK */ -+ packet_conservation:1, /* use packet conservation? */ -+ round_start:1, /* start of packet-timed tx->ack round? */ -+ ce_state:1, /* If most recent data has CE bit set */ -+ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ -+ try_fast_path:1, /* can we take fast path? */ -+ unused2:11, -+ idle_restart:1, /* restarting after idle? */ -+ probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ -+ cycle_idx:3, /* current index in pacing_gain cycle array */ -+ has_seen_rtt:1; /* have we seen an RTT sample yet? */ -+ u32 pacing_gain:11, /* current gain for setting pacing rate */ -+ cwnd_gain:11, /* current gain for setting cwnd */ -+ full_bw_reached:1, /* reached full bw in Startup? 
*/ -+ full_bw_cnt:2, /* number of rounds without large bw gains */ -+ init_cwnd:7; /* initial cwnd */ -+ u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ -+ u32 full_bw; /* recent bw, to estimate if pipe is full */ -+ -+ /* For tracking ACK aggregation: */ -+ u64 ack_epoch_mstamp; /* start of ACK sampling epoch */ -+ u16 extra_acked[2]; /* max excess data ACKed in epoch */ -+ u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ -+ extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ -+ extra_acked_win_idx:1, /* current index in extra_acked array */ -+ /* BBR v2 state: */ -+ unused1:2, -+ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ -+ loss_in_cycle:1, /* packet loss in this cycle? */ -+ ecn_in_cycle:1; /* ECN in this cycle? */ -+ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ -+ u32 undo_bw_lo; /* bw_lo before latest losses */ -+ u32 undo_inflight_lo; /* inflight_lo before latest losses */ -+ u32 undo_inflight_hi; /* inflight_hi before latest losses */ -+ u32 bw_latest; /* max delivered bw in last round trip */ -+ u32 bw_lo; /* lower bound on sending bandwidth */ -+ u32 bw_hi[2]; /* upper bound of sending bandwidth range*/ -+ u32 inflight_latest; /* max delivered data in last round trip */ -+ u32 inflight_lo; /* lower bound of inflight data range */ -+ u32 inflight_hi; /* upper bound of inflight data range */ -+ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ -+ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ -+ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ -+ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ -+ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ -+ bw_probe_samples:1, /* rate samples reflect bw probing? */ -+ prev_probe_too_high:1, /* did last PROBE_UP go too high? */ -+ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ -+ rounds_since_probe:8, /* packet-timed rounds since probed bw */ -+ loss_round_start:1, /* loss_round_delivered round trip? */ -+ loss_in_round:1, /* loss marked in this round trip? */ -+ ecn_in_round:1, /* ECN marked in this round trip? */ -+ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ -+ loss_events_in_round:4,/* losses in STARTUP round */ -+ initialized:1; /* has bbr_init() been called? */ -+ u32 alpha_last_delivered; /* tp->delivered at alpha update */ -+ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ -+ -+ /* Params configurable using setsockopt. Refer to correspoding -+ * module param for detailed description of params. 
-+ */ -+ struct bbr_params { -+ u32 high_gain:11, /* max allowed value: 2047 */ -+ drain_gain:10, /* max allowed value: 1023 */ -+ cwnd_gain:11; /* max allowed value: 2047 */ -+ u32 cwnd_min_target:4, /* max allowed value: 15 */ -+ min_rtt_win_sec:5, /* max allowed value: 31 */ -+ probe_rtt_mode_ms:9, /* max allowed value: 511 */ -+ full_bw_cnt:3, /* max allowed value: 7 */ -+ cwnd_tso_budget:1, /* allowed values: {0, 1} */ -+ unused3:6, -+ drain_to_target:1, /* boolean */ -+ precise_ece_ack:1, /* boolean */ -+ extra_acked_in_startup:1, /* allowed values: {0, 1} */ -+ fast_path:1; /* boolean */ -+ u32 full_bw_thresh:10, /* max allowed value: 1023 */ -+ startup_cwnd_gain:11, /* max allowed value: 2047 */ -+ bw_probe_pif_gain:9, /* max allowed value: 511 */ -+ usage_based_cwnd:1, /* boolean */ -+ unused2:1; -+ u16 probe_rtt_win_ms:14, /* max allowed value: 16383 */ -+ refill_add_inc:2; /* max allowed value: 3 */ -+ u16 extra_acked_gain:11, /* max allowed value: 2047 */ -+ extra_acked_win_rtts:5; /* max allowed value: 31*/ -+ u16 pacing_gain[CYCLE_LEN]; /* max allowed value: 1023 */ -+ /* Mostly BBR v2 parameters below here: */ -+ u32 ecn_alpha_gain:8, /* max allowed value: 255 */ -+ ecn_factor:8, /* max allowed value: 255 */ -+ ecn_thresh:8, /* max allowed value: 255 */ -+ beta:8; /* max allowed value: 255 */ -+ u32 ecn_max_rtt_us:19, /* max allowed value: 524287 */ -+ bw_probe_reno_gain:9, /* max allowed value: 511 */ -+ full_loss_cnt:4; /* max allowed value: 15 */ -+ u32 probe_rtt_cwnd_gain:8, /* max allowed value: 255 */ -+ inflight_headroom:8, /* max allowed value: 255 */ -+ loss_thresh:8, /* max allowed value: 255 */ -+ bw_probe_max_rounds:8; /* max allowed value: 255 */ -+ u32 bw_probe_rand_rounds:4, /* max allowed value: 15 */ -+ bw_probe_base_us:26, /* usecs: 0..2^26-1 (67 secs) */ -+ full_ecn_cnt:2; /* max allowed value: 3 */ -+ u32 bw_probe_rand_us:26, /* usecs: 0..2^26-1 (67 secs) */ -+ undo:1, /* boolean */ -+ tso_rtt_shift:4, /* max allowed value: 15 */ -+ unused5:1; -+ u32 ecn_reprobe_gain:9, /* max allowed value: 511 */ -+ unused1:14, -+ ecn_alpha_init:9; /* max allowed value: 256 */ -+ } params; -+ -+ struct { -+ u32 snd_isn; /* Initial sequence number */ -+ u32 rs_bw; /* last valid rate sample bw */ -+ u32 target_cwnd; /* target cwnd, based on BDP */ -+ u8 undo:1, /* Undo even happened but not yet logged */ -+ unused:7; -+ char event; /* single-letter event debug codes */ -+ u16 unused2; -+ } debug; -+}; -+ -+struct bbr_context { -+ u32 sample_bw; -+ u32 target_cwnd; -+ u32 log:1; -+}; -+ -+/* Window length of min_rtt filter (in sec). Max allowed value is 31 (0x1F) */ -+static u32 bbr_min_rtt_win_sec = 10; -+/* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode. -+ * Max allowed value is 511 (0x1FF). -+ */ -+static u32 bbr_probe_rtt_mode_ms = 200; -+/* Window length of probe_rtt_min_us filter (in ms), and consequently the -+ * typical interval between PROBE_RTT mode entries. -+ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC -+ */ -+static u32 bbr_probe_rtt_win_ms = 5000; -+/* Skip TSO below the following bandwidth (bits/sec): */ -+static int bbr_min_tso_rate = 1200000; -+ -+/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting -+ * in bigger TSO bursts. By default we cut the RTT-based allowance in half -+ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance -+ * is below 1500 bytes after 6 * ~500 usec = 3ms. 
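/*
 * [Editor's check, not part of the patch] With the 64 KByte GSO ceiling
 * and the default shift of 9, the RTT-based budget is 64 KB / 2^r where
 * r = min_rtt_us / 512. It first drops below 1500 bytes at r = 6
 * (65536 >> 6 == 1024), i.e. once min_rtt reaches 6 * 512 us ~= 3 ms,
 * matching the comment above:
 */
static u32 rtt_burst_allowance(u32 min_rtt_us, u32 tso_rtt_shift)
{
	u32 r = min_rtt_us >> tso_rtt_shift;	/* number of halvings */

	return r < 32 ? 65536U >> r : 0;	/* guard an oversize shift */
}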
-+ */ -+static u32 bbr_tso_rtt_shift = 9; /* halve allowance per 2^9 usecs, 512us */ -+ -+/* Select cwnd TSO budget approach: -+ * 0: padding -+ * 1: flooring -+ */ -+static uint bbr_cwnd_tso_budget = 1; -+ -+/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. -+ * In order to help drive the network toward lower queues and low latency while -+ * maintaining high utilization, the average pacing rate aims to be slightly -+ * lower than the estimated bandwidth. This is an important aspect of the -+ * design. -+ */ -+static const int bbr_pacing_margin_percent = 1; -+ -+/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain -+ * that will allow a smoothly increasing pacing rate that will double each RTT -+ * and send the same number of packets per RTT that an un-paced, slow-starting -+ * Reno or CUBIC flow would. Max allowed value is 2047 (0x7FF). -+ */ -+static int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; -+/* The gain for deriving startup cwnd. Max allowed value is 2047 (0x7FF). */ -+static int bbr_startup_cwnd_gain = BBR_UNIT * 2885 / 1000 + 1; -+/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain -+ * the queue created in BBR_STARTUP in a single round. Max allowed value -+ * is 1023 (0x3FF). -+ */ -+static int bbr_drain_gain = BBR_UNIT * 1000 / 2885; -+/* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs. -+ * Max allowed value is 2047 (0x7FF). -+ */ -+static int bbr_cwnd_gain = BBR_UNIT * 2; -+/* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw. -+ * Max allowed value for each element is 1023 (0x3FF). -+ */ -+enum bbr_pacing_gain_phase { -+ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ -+ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ -+ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ -+ BBR_BW_PROBE_REFILL = 3, /* v2: refill the pipe again to 100% */ -+}; -+static int bbr_pacing_gain[] = { -+ BBR_UNIT * 5 / 4, /* probe for more available bw */ -+ BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ -+ BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ -+ BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ -+}; -+ -+/* Try to keep at least this many packets in flight, if things go smoothly. For -+ * smooth functioning, a sliding window protocol ACKing every other packet -+ * needs at least 4 packets in flight. Max allowed value is 15 (0xF). -+ */ -+static u32 bbr_cwnd_min_target = 4; -+ -+/* Cwnd to BDP proportion in PROBE_RTT mode scaled by BBR_UNIT. Default: 50%. -+ * Use 0 to disable. Max allowed value is 255. -+ */ -+static u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; -+ -+/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ -+/* If bw has increased significantly (1.25x), there may be more bw available. -+ * Max allowed value is 1023 (0x3FF). -+ */ -+static u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; -+/* But after 3 rounds w/o significant bw growth, estimate pipe is full. -+ * Max allowed value is 7 (0x7). -+ */ -+static u32 bbr_full_bw_cnt = 3; -+ -+static u32 bbr_flags; /* Debugging related stuff */ -+ -+/* Whether to debug using printk. -+ */ -+static bool bbr_debug_with_printk; -+ -+/* Whether to debug using ftrace event tcp:tcp_bbr_event. -+ * Ignored when bbr_debug_with_printk is set. -+ */ -+static bool bbr_debug_ftrace; -+ -+/* Experiment: each cycle, try to hold sub-unity gain until inflight <= BDP. 
*/ -+static bool bbr_drain_to_target = true; /* default: enabled */ -+ -+/* Experiment: Flags to control BBR with ECN behavior. -+ */ -+static bool bbr_precise_ece_ack = true; /* default: enabled */ -+ -+/* The max rwin scaling shift factor is 14 (RFC 1323), so the max sane rwin is -+ * (2^(16+14) B)/(1024 B/packet) = 1M packets. -+ */ -+static u32 bbr_cwnd_warn_val = 1U << 20; -+ -+static u16 bbr_debug_port_mask; -+ -+/* BBR module parameters. These are module parameters only in Google prod. -+ * Upstream these are intentionally not module parameters. -+ */ -+static int bbr_pacing_gain_size = CYCLE_LEN; -+ -+/* Gain factor for adding extra_acked to target cwnd: */ -+static int bbr_extra_acked_gain = 256; -+ -+/* Window length of extra_acked window. Max allowed val is 31. */ -+static u32 bbr_extra_acked_win_rtts = 5; -+ -+/* Max allowed val for ack_epoch_acked, after which sampling epoch is reset */ -+static u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; -+ -+/* Time period for clamping cwnd increment due to ack aggregation */ -+static u32 bbr_extra_acked_max_us = 100 * 1000; -+ -+/* Use extra acked in startup ? -+ * 0: disabled -+ * 1: use latest extra_acked value from 1-2 rtt in startup -+ */ -+static int bbr_extra_acked_in_startup = 1; /* default: enabled */ -+ -+/* Experiment: don't grow cwnd beyond twice of what we just probed. */ -+static bool bbr_usage_based_cwnd; /* default: disabled */ -+ -+/* For lab testing, researchers can enable BBRv2 ECN support with this flag, -+ * when they know that any ECN marks that the connections experience will be -+ * DCTCP/L4S-style ECN marks, rather than RFC3168 ECN marks. -+ * TODO(ncardwell): Production use of the BBRv2 ECN functionality depends on -+ * negotiation or configuration that is outside the scope of the BBRv2 -+ * alpha release. 
-+ */ -+static bool bbr_ecn_enable = false; -+ -+module_param_named(min_tso_rate, bbr_min_tso_rate, int, 0644); -+module_param_named(tso_rtt_shift, bbr_tso_rtt_shift, int, 0644); -+module_param_named(high_gain, bbr_high_gain, int, 0644); -+module_param_named(drain_gain, bbr_drain_gain, int, 0644); -+module_param_named(startup_cwnd_gain, bbr_startup_cwnd_gain, int, 0644); -+module_param_named(cwnd_gain, bbr_cwnd_gain, int, 0644); -+module_param_array_named(pacing_gain, bbr_pacing_gain, int, -+ &bbr_pacing_gain_size, 0644); -+module_param_named(cwnd_min_target, bbr_cwnd_min_target, uint, 0644); -+module_param_named(probe_rtt_cwnd_gain, -+ bbr_probe_rtt_cwnd_gain, uint, 0664); -+module_param_named(cwnd_warn_val, bbr_cwnd_warn_val, uint, 0664); -+module_param_named(debug_port_mask, bbr_debug_port_mask, ushort, 0644); -+module_param_named(flags, bbr_flags, uint, 0644); -+module_param_named(debug_ftrace, bbr_debug_ftrace, bool, 0644); -+module_param_named(debug_with_printk, bbr_debug_with_printk, bool, 0644); -+module_param_named(min_rtt_win_sec, bbr_min_rtt_win_sec, uint, 0644); -+module_param_named(probe_rtt_mode_ms, bbr_probe_rtt_mode_ms, uint, 0644); -+module_param_named(probe_rtt_win_ms, bbr_probe_rtt_win_ms, uint, 0644); -+module_param_named(full_bw_thresh, bbr_full_bw_thresh, uint, 0644); -+module_param_named(full_bw_cnt, bbr_full_bw_cnt, uint, 0644); -+module_param_named(cwnd_tso_bduget, bbr_cwnd_tso_budget, uint, 0664); -+module_param_named(extra_acked_gain, bbr_extra_acked_gain, int, 0664); -+module_param_named(extra_acked_win_rtts, -+ bbr_extra_acked_win_rtts, uint, 0664); -+module_param_named(extra_acked_max_us, -+ bbr_extra_acked_max_us, uint, 0664); -+module_param_named(ack_epoch_acked_reset_thresh, -+ bbr_ack_epoch_acked_reset_thresh, uint, 0664); -+module_param_named(drain_to_target, bbr_drain_to_target, bool, 0664); -+module_param_named(precise_ece_ack, bbr_precise_ece_ack, bool, 0664); -+module_param_named(extra_acked_in_startup, -+ bbr_extra_acked_in_startup, int, 0664); -+module_param_named(usage_based_cwnd, bbr_usage_based_cwnd, bool, 0664); -+module_param_named(ecn_enable, bbr_ecn_enable, bool, 0664); -+ -+static void bbr2_exit_probe_rtt(struct sock *sk); -+static void bbr2_reset_congestion_signals(struct sock *sk); -+ -+static void bbr_check_probe_rtt_done(struct sock *sk); -+ -+/* Do we estimate that STARTUP filled the pipe? */ -+static bool bbr_full_bw_reached(const struct sock *sk) -+{ -+ const struct bbr *bbr = inet_csk_ca(sk); -+ -+ return bbr->full_bw_reached; -+} -+ -+/* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ -+static u32 bbr_max_bw(const struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ return max(bbr->bw_hi[0], bbr->bw_hi[1]); -+} -+ -+/* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ -+static u32 bbr_bw(const struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ return min(bbr_max_bw(sk), bbr->bw_lo); -+} -+ -+/* Return maximum extra acked in past k-2k round trips, -+ * where k = bbr_extra_acked_win_rtts. -+ */ -+static u16 bbr_extra_acked(const struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ return max(bbr->extra_acked[0], bbr->extra_acked[1]); -+} -+ -+/* Return rate in bytes per second, optionally with a gain. -+ * The order here is chosen carefully to avoid overflow of u64. This should -+ * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. 
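/*
 * [Editor's check, not part of the patch] Rough verification of the
 * overflow claim above: at 2.9 Tbit/s with mss 1500, rate is about
 * 242 pkt/us, i.e. ~4.06e9 after the <<24 scaling. Then
 * rate * mss * gain >> BBR_SCALE, with gain = 2.89 * 256 ~= 740,
 * gives ~1.76e13, and the final multiply by USEC_PER_SEC * 99 / 100
 * (~9.9e5) lands at ~1.74e19, just under the u64 limit of ~1.84e19.
 * Hence the careful ordering of the multiplies below, and the stated
 * bound of 2.9 Tbit/s at 2.89x gain.
 */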
-+ */ -+static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, -+ int margin) -+{ -+ unsigned int mss = tcp_sk(sk)->mss_cache; -+ -+ rate *= mss; -+ rate *= gain; -+ rate >>= BBR_SCALE; -+ rate *= USEC_PER_SEC / 100 * (100 - margin); -+ rate >>= BW_SCALE; -+ rate = max(rate, 1ULL); -+ return rate; -+} -+ -+static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) -+{ -+ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); -+} -+ -+static u64 bbr_rate_kbps(struct sock *sk, u64 rate) -+{ -+ rate = bbr_bw_bytes_per_sec(sk, rate); -+ rate *= 8; -+ do_div(rate, 1000); -+ return rate; -+} -+ -+static u32 bbr_tso_segs_goal(struct sock *sk); -+static void bbr_debug(struct sock *sk, u32 acked, -+ const struct rate_sample *rs, struct bbr_context *ctx) -+{ -+ static const char ca_states[] = { -+ [TCP_CA_Open] = 'O', -+ [TCP_CA_Disorder] = 'D', -+ [TCP_CA_CWR] = 'C', -+ [TCP_CA_Recovery] = 'R', -+ [TCP_CA_Loss] = 'L', -+ }; -+ static const char mode[] = { -+ 'G', /* Growing - BBR_STARTUP */ -+ 'D', /* Drain - BBR_DRAIN */ -+ 'W', /* Window - BBR_PROBE_BW */ -+ 'M', /* Min RTT - BBR_PROBE_RTT */ -+ }; -+ static const char ack_phase[] = { /* bbr_ack_phase strings */ -+ 'I', /* BBR_ACKS_INIT - 'Init' */ -+ 'R', /* BBR_ACKS_REFILLING - 'Refilling' */ -+ 'B', /* BBR_ACKS_PROBE_STARTING - 'Before' */ -+ 'F', /* BBR_ACKS_PROBE_FEEDBACK - 'Feedback' */ -+ 'A', /* BBR_ACKS_PROBE_STOPPING - 'After' */ -+ }; -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ const u32 una = tp->snd_una - bbr->debug.snd_isn; -+ const u32 fack = tcp_highest_sack_seq(tp); -+ const u16 dport = ntohs(inet_sk(sk)->inet_dport); -+ bool is_port_match = (bbr_debug_port_mask && -+ ((dport & bbr_debug_port_mask) == 0)); -+ char debugmsg[320]; -+ -+ if (sk->sk_state == TCP_SYN_SENT) -+ return; /* no bbr_init() yet if SYN retransmit -> CA_Loss */ -+ -+ if (!tp->snd_cwnd || tp->snd_cwnd > bbr_cwnd_warn_val) { -+ char addr[INET6_ADDRSTRLEN + 10] = { 0 }; -+ -+ if (sk->sk_family == AF_INET) -+ snprintf(addr, sizeof(addr), "%pI4:%u", -+ &inet_sk(sk)->inet_daddr, dport); -+ else if (sk->sk_family == AF_INET6) -+ snprintf(addr, sizeof(addr), "%pI6:%u", -+ &sk->sk_daddr, dport); -+ -+ WARN_ONCE(1, -+ "BBR %s cwnd alert: %u " -+ "snd_una: %u ca: %d pacing_gain: %u cwnd_gain: %u " -+ "bw: %u rtt: %u min_rtt: %u " -+ "acked: %u tso_segs: %u " -+ "bw: %d %ld %d pif: %u\n", -+ addr, tp->snd_cwnd, -+ una, inet_csk(sk)->icsk_ca_state, -+ bbr->pacing_gain, bbr->cwnd_gain, -+ bbr_max_bw(sk), (tp->srtt_us >> 3), bbr->min_rtt_us, -+ acked, bbr_tso_segs_goal(sk), -+ rs->delivered, rs->interval_us, rs->is_retrans, -+ tcp_packets_in_flight(tp)); -+ } -+ -+ if (likely(!bbr_debug_with_printk && !bbr_debug_ftrace)) -+ return; -+ -+ if (!sock_flag(sk, SOCK_DBG) && !is_port_match) -+ return; -+ -+ if (!ctx->log && !tp->app_limited && !(bbr_flags & FLAG_DEBUG_VERBOSE)) -+ return; -+ -+ if (ipv4_is_loopback(inet_sk(sk)->inet_daddr) && -+ !(bbr_flags & FLAG_DEBUG_LOOPBACK)) -+ return; -+ -+ snprintf(debugmsg, sizeof(debugmsg) - 1, -+ "BBR %pI4:%-5u %5u,%03u:%-7u %c " -+ "%c %2u br %2u cr %2d rtt %5ld d %2d i %5ld mrtt %d %cbw %llu " -+ "bw %llu lb %llu ib %llu qb %llu " -+ "a %u if %2u %c %c dl %u l %u al %u # %u t %u %c %c " -+ "lr %d er %d ea %d bwl %lld il %d ih %d c %d " -+ "v %d %c %u %c %s\n", -+ &inet_sk(sk)->inet_daddr, dport, -+ una / 1000, una % 1000, fack - tp->snd_una, -+ ca_states[inet_csk(sk)->icsk_ca_state], -+ bbr->debug.undo ? 
'@' : mode[bbr->mode], -+ tp->snd_cwnd, -+ bbr_extra_acked(sk), /* br (legacy): extra_acked */ -+ rs->tx_in_flight, /* cr (legacy): tx_inflight */ -+ rs->rtt_us, -+ rs->delivered, -+ rs->interval_us, -+ bbr->min_rtt_us, -+ rs->is_app_limited ? '_' : 'l', -+ bbr_rate_kbps(sk, ctx->sample_bw), /* lbw: latest sample bw */ -+ bbr_rate_kbps(sk, bbr_max_bw(sk)), /* bw: max bw */ -+ 0ULL, /* lb: [obsolete] */ -+ 0ULL, /* ib: [obsolete] */ -+ (u64)sk->sk_pacing_rate * 8 / 1000, -+ acked, -+ tcp_packets_in_flight(tp), -+ rs->is_ack_delayed ? 'd' : '.', -+ bbr->round_start ? '*' : '.', -+ tp->delivered, tp->lost, -+ tp->app_limited, -+ 0, /* #: [obsolete] */ -+ ctx->target_cwnd, -+ tp->reord_seen ? 'r' : '.', /* r: reordering seen? */ -+ ca_states[bbr->prev_ca_state], -+ (rs->lost + rs->delivered) > 0 ? -+ (1000 * rs->lost / -+ (rs->lost + rs->delivered)) : 0, /* lr: loss rate x1000 */ -+ (rs->delivered) > 0 ? -+ (1000 * rs->delivered_ce / -+ (rs->delivered)) : 0, /* er: ECN rate x1000 */ -+ 1000 * bbr->ecn_alpha >> BBR_SCALE, /* ea: ECN alpha x1000 */ -+ bbr->bw_lo == ~0U ? -+ -1 : (s64)bbr_rate_kbps(sk, bbr->bw_lo), /* bwl */ -+ bbr->inflight_lo, /* il */ -+ bbr->inflight_hi, /* ih */ -+ bbr->bw_probe_up_cnt, /* c */ -+ 2, /* v: version */ -+ bbr->debug.event, -+ bbr->cycle_idx, -+ ack_phase[bbr->ack_phase], -+ bbr->bw_probe_samples ? "Y" : "N"); -+ debugmsg[sizeof(debugmsg) - 1] = 0; -+ -+ /* printk takes a higher precedence. */ -+ if (bbr_debug_with_printk) -+ printk(KERN_DEBUG "%s", debugmsg); -+ -+ if (unlikely(bbr->debug.undo)) -+ bbr->debug.undo = 0; -+} -+ -+/* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ -+static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) -+{ -+ u64 rate = bw; -+ -+ rate = bbr_rate_bytes_per_sec(sk, rate, gain, -+ bbr_pacing_margin_percent); -+ rate = min_t(u64, rate, sk->sk_max_pacing_rate); -+ return rate; -+} -+ -+/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ -+static void bbr_init_pacing_rate_from_rtt(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ u64 bw; -+ u32 rtt_us; -+ -+ if (tp->srtt_us) { /* any RTT sample yet? */ -+ rtt_us = max(tp->srtt_us >> 3, 1U); -+ bbr->has_seen_rtt = 1; -+ } else { /* no RTT sample yet */ -+ rtt_us = USEC_PER_MSEC; /* use nominal default RTT */ -+ } -+ bw = (u64)tp->snd_cwnd * BW_UNIT; -+ do_div(bw, rtt_us); -+ sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr->params.high_gain); -+} -+ -+/* Pace using current bw estimate and a gain factor. */ -+static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain); -+ -+ if (unlikely(!bbr->has_seen_rtt && tp->srtt_us)) -+ bbr_init_pacing_rate_from_rtt(sk); -+ if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate) -+ sk->sk_pacing_rate = rate; -+} -+ -+static u32 bbr_min_tso_segs(struct sock *sk) -+{ -+ return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; -+} -+ -+/* Return the number of segments BBR would like in a TSO/GSO skb, given -+ * a particular max gso size as a constraint. -+ */ -+static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, -+ u32 gso_max_size) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 segs, r; -+ u64 bytes; -+ -+ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). 
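/*
 * [Editor's illustration, not part of the patch] Worked numbers for
 * bbr_init_pacing_rate_from_rtt() above, ignoring the 1% pacing margin:
 * with snd_cwnd = 10, mss = 1448 and srtt = 10 ms, bw is 10 pkt per
 * 10000 us, and the initial rate is high_gain (~2.885) times that:
 * 2.885 * 1000 pkt/s * 1448 B ~= 4.2 MB/s (~33 Mbit/s).
 */
static u64 init_pacing_bytes_per_sec(u32 cwnd, u32 mss, u32 srtt_us)
{
	u64 pkts_per_sec = div_u64((u64)cwnd * USEC_PER_SEC, srtt_us);

	return pkts_per_sec * mss * 2885 / 1000;	/* gain ~= 2/ln(2) */
}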
*/ -+ bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift; -+ -+ /* Budget a TSO/GSO burst size allowance based on min_rtt. For every -+ * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. -+ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) -+ */ -+ if (bbr->params.tso_rtt_shift) { -+ r = bbr->min_rtt_us >> bbr->params.tso_rtt_shift; -+ if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ -+ bytes += GSO_MAX_SIZE >> r; -+ } -+ -+ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); -+ segs = max_t(u32, bytes / mss_now, bbr_min_tso_segs(sk)); -+ return segs; -+} -+ -+/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ -+static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) -+{ -+ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); -+} -+ -+/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */ -+static u32 bbr_tso_segs_goal(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ -+ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_MAX_SIZE); -+} -+ -+/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ -+static void bbr_save_cwnd(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT) -+ bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */ -+ else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */ -+ bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd); -+} -+ -+static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (event == CA_EVENT_TX_START && tp->app_limited) { -+ bbr->idle_restart = 1; -+ bbr->ack_epoch_mstamp = tp->tcp_mstamp; -+ bbr->ack_epoch_acked = 0; -+ /* Avoid pointless buffer overflows: pace at est. bw if we don't -+ * need more speed (we're restarting from idle and app-limited). -+ */ -+ if (bbr->mode == BBR_PROBE_BW) -+ bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); -+ else if (bbr->mode == BBR_PROBE_RTT) -+ bbr_check_probe_rtt_done(sk); -+ } else if ((event == CA_EVENT_ECN_IS_CE || -+ event == CA_EVENT_ECN_NO_CE) && -+ bbr_ecn_enable && -+ bbr->params.precise_ece_ack) { -+ u32 state = bbr->ce_state; -+ dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); -+ bbr->ce_state = state; -+ if (tp->fast_ack_mode == 2 && event == CA_EVENT_ECN_IS_CE) -+ tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); -+ } -+} -+ -+/* Calculate bdp based on min RTT and the estimated bottleneck bandwidth: -+ * -+ * bdp = ceil(bw * min_rtt * gain) -+ * -+ * The key factor, gain, controls the amount of queue. While a small gain -+ * builds a smaller queue, it becomes more vulnerable to noise in RTT -+ * measurements (e.g., delayed ACKs or other ACK compression effects). This -+ * noise may cause BBR to under-estimate the rate. -+ */ -+static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 bdp; -+ u64 w; -+ -+ /* If we've never had a valid RTT sample, cap cwnd at the initial -+ * default. This should only happen when the connection is not using TCP -+ * timestamps and has retransmitted all of the SYN/SYNACK/data packets -+ * ACKed so far. In this case, an RTO can cut cwnd to 1, in which -+ * case we need to slow-start up toward something safe: initial cwnd. -+ */ -+ if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? 
*/ -+ return bbr->init_cwnd; /* be safe: cap at initial cwnd */ -+ -+ w = (u64)bw * bbr->min_rtt_us; -+ -+ /* Apply a gain to the given value, remove the BW_SCALE shift, and -+ * round the value up to avoid a negative feedback loop. -+ */ -+ bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; -+ -+ return bdp; -+} -+ -+/* To achieve full performance in high-speed paths, we budget enough cwnd to -+ * fit full-sized skbs in-flight on both end hosts to fully utilize the path: -+ * - one skb in sending host Qdisc, -+ * - one skb in sending host TSO/GSO engine -+ * - one skb being received by receiver host LRO/GRO/delayed-ACK engine -+ * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because -+ * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, -+ * which allows 2 outstanding 2-packet sequences, to try to keep pipe -+ * full even with ACK-every-other-packet delayed ACKs. -+ */ -+static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 tso_segs_goal; -+ -+ tso_segs_goal = 3 * bbr_tso_segs_goal(sk); -+ -+ /* Allow enough full-sized skbs in flight to utilize end systems. */ -+ if (bbr->params.cwnd_tso_budget == 1) { -+ cwnd = max_t(u32, cwnd, tso_segs_goal); -+ cwnd = max_t(u32, cwnd, bbr->params.cwnd_min_target); -+ } else { -+ cwnd += tso_segs_goal; -+ cwnd = (cwnd + 1) & ~1U; -+ } -+ /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ -+ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) -+ cwnd += 2; -+ -+ return cwnd; -+} -+ -+/* Find inflight based on min RTT and the estimated bottleneck bandwidth. */ -+static u32 bbr_inflight(struct sock *sk, u32 bw, int gain) -+{ -+ u32 inflight; -+ -+ inflight = bbr_bdp(sk, bw, gain); -+ inflight = bbr_quantization_budget(sk, inflight); -+ -+ return inflight; -+} -+ -+/* With pacing at lower layers, there's often less data "in the network" than -+ * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq), -+ * we often have several skbs queued in the pacing layer with a pre-scheduled -+ * earliest departure time (EDT). BBR adapts its pacing rate based on the -+ * inflight level that it estimates has already been "baked in" by previous -+ * departure time decisions. We calculate a rough estimate of the number of our -+ * packets that might be in the network at the earliest departure time for the -+ * next skb scheduled: -+ * in_network_at_edt = inflight_at_edt - (EDT - now) * bw -+ * If we're increasing inflight, then we want to know if the transmit of the -+ * EDT skb will push inflight above the target, so inflight_at_edt includes -+ * bbr_tso_segs_goal() from the skb departing at EDT. If decreasing inflight, -+ * then estimate if inflight will sink too low just before the EDT transmit. 
-+ */ -+static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ u64 now_ns, edt_ns, interval_us; -+ u32 interval_delivered, inflight_at_edt; -+ -+ now_ns = tp->tcp_clock_cache; -+ edt_ns = max(tp->tcp_wstamp_ns, now_ns); -+ interval_us = div_u64(edt_ns - now_ns, NSEC_PER_USEC); -+ interval_delivered = (u64)bbr_bw(sk) * interval_us >> BW_SCALE; -+ inflight_at_edt = inflight_now; -+ if (bbr->pacing_gain > BBR_UNIT) /* increasing inflight */ -+ inflight_at_edt += bbr_tso_segs_goal(sk); /* include EDT skb */ -+ if (interval_delivered >= inflight_at_edt) -+ return 0; -+ return inflight_at_edt - interval_delivered; -+} -+ -+/* Find the cwnd increment based on estimate of ack aggregation */ -+static u32 bbr_ack_aggregation_cwnd(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 max_aggr_cwnd, aggr_cwnd = 0; -+ -+ if (bbr->params.extra_acked_gain && -+ (bbr_full_bw_reached(sk) || bbr->params.extra_acked_in_startup)) { -+ max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) -+ / BW_UNIT; -+ aggr_cwnd = (bbr->params.extra_acked_gain * bbr_extra_acked(sk)) -+ >> BBR_SCALE; -+ aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); -+ } -+ -+ return aggr_cwnd; -+} -+ -+/* Returns the cwnd for PROBE_RTT mode. */ -+static u32 bbr_probe_rtt_cwnd(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (bbr->params.probe_rtt_cwnd_gain == 0) -+ return bbr->params.cwnd_min_target; -+ return max_t(u32, bbr->params.cwnd_min_target, -+ bbr_bdp(sk, bbr_bw(sk), bbr->params.probe_rtt_cwnd_gain)); -+} -+ -+/* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss -+ * has drawn us down below target), or snap down to target if we're above it. -+ */ -+static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, -+ u32 acked, u32 bw, int gain, u32 cwnd, -+ struct bbr_context *ctx) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 target_cwnd = 0, prev_cwnd = tp->snd_cwnd, max_probe; -+ -+ if (!acked) -+ goto done; /* no packet fully ACKed; just apply caps */ -+ -+ target_cwnd = bbr_bdp(sk, bw, gain); -+ -+ /* Increment the cwnd to account for excess ACKed data that seems -+ * due to aggregation (of data and/or ACKs) visible in the ACK stream. -+ */ -+ target_cwnd += bbr_ack_aggregation_cwnd(sk); -+ target_cwnd = bbr_quantization_budget(sk, target_cwnd); -+ -+ /* If we're below target cwnd, slow start cwnd toward target cwnd. */ -+ bbr->debug.target_cwnd = target_cwnd; -+ -+ /* Update cwnd and enable fast path if cwnd reaches target_cwnd. */ -+ bbr->try_fast_path = 0; -+ if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ -+ cwnd += acked; -+ if (cwnd >= target_cwnd) { -+ cwnd = target_cwnd; -+ bbr->try_fast_path = 1; -+ } -+ } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) { -+ cwnd += acked; -+ } else { -+ bbr->try_fast_path = 1; -+ } -+ -+ /* When growing cwnd, don't grow beyond twice what we just probed. 
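-+ * For instance, with illustrative values: if max_packets_out is 10 and -+ * snd_cwnd is 15, growth is capped at max(2 * 10, 15) = 20 packets.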
*/ -+ if (bbr->params.usage_based_cwnd) { -+ max_probe = max(2 * tp->max_packets_out, tp->snd_cwnd); -+ cwnd = min(cwnd, max_probe); -+ } -+ -+ cwnd = max_t(u32, cwnd, bbr->params.cwnd_min_target); -+done: -+ tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */ -+ if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ -+ tp->snd_cwnd = min_t(u32, tp->snd_cwnd, bbr_probe_rtt_cwnd(sk)); -+ -+ ctx->target_cwnd = target_cwnd; -+ ctx->log = (tp->snd_cwnd != prev_cwnd); -+} -+ -+/* See if we have reached next round trip */ -+static void bbr_update_round_start(struct sock *sk, -+ const struct rate_sample *rs, struct bbr_context *ctx) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr->round_start = 0; -+ -+ /* See if we've reached the next RTT */ -+ if (rs->interval_us > 0 && -+ !before(rs->prior_delivered, bbr->next_rtt_delivered)) { -+ bbr->next_rtt_delivered = tp->delivered; -+ bbr->round_start = 1; -+ } -+} -+ -+/* Calculate the bandwidth based on how fast packets are delivered */ -+static void bbr_calculate_bw_sample(struct sock *sk, -+ const struct rate_sample *rs, struct bbr_context *ctx) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u64 bw = 0; -+ -+ /* Divide delivered by the interval to find a (lower bound) bottleneck -+ * bandwidth sample. Delivered is in packets and interval_us in uS and -+ * ratio will be <<1 for most connections. So delivered is first scaled. -+ * Round up to allow growth at low rates, even with integer division. -+ */ -+ if (rs->interval_us > 0) { -+ if (WARN_ONCE(rs->delivered < 0, -+ "negative delivered: %d interval_us: %ld\n", -+ rs->delivered, rs->interval_us)) -+ return; -+ -+ bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); -+ } -+ -+ ctx->sample_bw = bw; -+ bbr->debug.rs_bw = bw; -+} -+ -+/* Estimates the windowed max degree of ack aggregation. -+ * This is used to provision extra in-flight data to keep sending during -+ * inter-ACK silences. -+ * -+ * Degree of ack aggregation is estimated as extra data acked beyond expected. -+ * -+ * max_extra_acked = "maximum recent excess data ACKed beyond max_bw * interval" -+ * cwnd += max_extra_acked -+ * -+ * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). -+ * Max filter is an approximate sliding window of 5-10 (packet timed) round -+ * trips for non-startup phase, and 1-2 round trips for startup. -+ */ -+static void bbr_update_ack_aggregation(struct sock *sk, -+ const struct rate_sample *rs) -+{ -+ u32 epoch_us, expected_acked, extra_acked; -+ struct bbr *bbr = inet_csk_ca(sk); -+ struct tcp_sock *tp = tcp_sk(sk); -+ u32 extra_acked_win_rtts_thresh = bbr->params.extra_acked_win_rtts; -+ -+ if (!bbr->params.extra_acked_gain || rs->acked_sacked <= 0 || -+ rs->delivered < 0 || rs->interval_us <= 0) -+ return; -+ -+ if (bbr->round_start) { -+ bbr->extra_acked_win_rtts = min(0x1F, -+ bbr->extra_acked_win_rtts + 1); -+ if (bbr->params.extra_acked_in_startup && -+ !bbr_full_bw_reached(sk)) -+ extra_acked_win_rtts_thresh = 1; -+ if (bbr->extra_acked_win_rtts >= -+ extra_acked_win_rtts_thresh) { -+ bbr->extra_acked_win_rtts = 0; -+ bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? -+ 0 : 1; -+ bbr->extra_acked[bbr->extra_acked_win_idx] = 0; -+ } -+ } -+ -+ /* Compute how many packets we expected to be delivered over epoch. 
*/ -+ epoch_us = tcp_stamp_us_delta(tp->delivered_mstamp, -+ bbr->ack_epoch_mstamp); -+ expected_acked = ((u64)bbr_bw(sk) * epoch_us) / BW_UNIT; -+ -+ /* Reset the aggregation epoch if ACK rate is below expected rate or -+ * a significantly large number of ACKs has been received since the -+ * epoch began (i.e. the epoch is potentially quite old). -+ */ -+ if (bbr->ack_epoch_acked <= expected_acked || -+ (bbr->ack_epoch_acked + rs->acked_sacked >= -+ bbr_ack_epoch_acked_reset_thresh)) { -+ bbr->ack_epoch_acked = 0; -+ bbr->ack_epoch_mstamp = tp->delivered_mstamp; -+ expected_acked = 0; -+ } -+ -+ /* Compute excess data delivered, beyond what was expected. */ -+ bbr->ack_epoch_acked = min_t(u32, 0xFFFFF, -+ bbr->ack_epoch_acked + rs->acked_sacked); -+ extra_acked = bbr->ack_epoch_acked - expected_acked; -+ extra_acked = min(extra_acked, tp->snd_cwnd); -+ if (extra_acked > bbr->extra_acked[bbr->extra_acked_win_idx]) -+ bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; -+} -+ -+/* Estimate when the pipe is full, using the change in delivery rate: BBR -+ * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by -+ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited -+ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the -+ * higher rwin, 3: we get higher delivery rate samples. Or transient -+ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar -+ * design goal, but uses delay and inter-ACK spacing instead of bandwidth. -+ */ -+static void bbr_check_full_bw_reached(struct sock *sk, -+ const struct rate_sample *rs) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 bw_thresh; -+ -+ if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) -+ return; -+ -+ bw_thresh = (u64)bbr->full_bw * bbr->params.full_bw_thresh >> BBR_SCALE; -+ if (bbr_max_bw(sk) >= bw_thresh) { -+ bbr->full_bw = bbr_max_bw(sk); -+ bbr->full_bw_cnt = 0; -+ return; -+ } -+ ++bbr->full_bw_cnt; -+ bbr->full_bw_reached = bbr->full_bw_cnt >= bbr->params.full_bw_cnt; -+} -+ -+/* If pipe is probably full, drain the queue and then enter steady-state. */ -+static bool bbr_check_drain(struct sock *sk, const struct rate_sample *rs, -+ struct bbr_context *ctx) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { -+ bbr->mode = BBR_DRAIN; /* drain queue we created */ -+ tcp_sk(sk)->snd_ssthresh = -+ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); -+ bbr2_reset_congestion_signals(sk); -+ } /* fall through to check if in-flight is already small: */ -+ if (bbr->mode == BBR_DRAIN && -+ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= -+ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) -+ return true; /* exiting DRAIN now */ -+ return false; -+} -+ -+static void bbr_check_probe_rtt_done(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (!(bbr->probe_rtt_done_stamp && -+ after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) -+ return; -+ -+ bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ -+ tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd); -+ bbr2_exit_probe_rtt(sk); -+} -+ -+/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and -+ * periodically drain the bottleneck queue, to converge to measure the true -+ * min_rtt (unloaded propagation delay). This allows the flows to keep queues -+ * small (reducing queuing delay and packet loss) and achieve fairness among -+ * BBR flows. 
-+ * -+ * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires, -+ * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets. -+ * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed -+ * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and -+ * re-enter the previous mode. BBR uses 200ms to approximately bound the -+ * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s). -+ * -+ * Note that flows need only pay 2% if they are busy sending over the last 10 -+ * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have -+ * natural silences or low-rate periods within 10 seconds where the rate is low -+ * enough for long enough to drain its queue in the bottleneck. We pick up -+ * these min RTT measurements opportunistically with our min_rtt filter. :-) -+ */ -+static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ bool probe_rtt_expired, min_rtt_expired; -+ u32 expire; -+ -+ /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */ -+ expire = bbr->probe_rtt_min_stamp + -+ msecs_to_jiffies(bbr->params.probe_rtt_win_ms); -+ probe_rtt_expired = after(tcp_jiffies32, expire); -+ if (rs->rtt_us >= 0 && -+ (rs->rtt_us <= bbr->probe_rtt_min_us || -+ (probe_rtt_expired && !rs->is_ack_delayed))) { -+ bbr->probe_rtt_min_us = rs->rtt_us; -+ bbr->probe_rtt_min_stamp = tcp_jiffies32; -+ } -+ /* Track min RTT seen in the min_rtt_win_sec filter window: */ -+ expire = bbr->min_rtt_stamp + bbr->params.min_rtt_win_sec * HZ; -+ min_rtt_expired = after(tcp_jiffies32, expire); -+ if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || -+ min_rtt_expired) { -+ bbr->min_rtt_us = bbr->probe_rtt_min_us; -+ bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; -+ } -+ -+ if (bbr->params.probe_rtt_mode_ms > 0 && probe_rtt_expired && -+ !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { -+ bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ -+ bbr_save_cwnd(sk); /* note cwnd so we can restore it */ -+ bbr->probe_rtt_done_stamp = 0; -+ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; -+ bbr->next_rtt_delivered = tp->delivered; -+ } -+ -+ if (bbr->mode == BBR_PROBE_RTT) { -+ /* Ignore low rate samples during this mode. */ -+ tp->app_limited = -+ (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; -+ /* Maintain min packets in flight for max(200 ms, 1 round). 
*/ -+ if (!bbr->probe_rtt_done_stamp && -+ tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { -+ bbr->probe_rtt_done_stamp = tcp_jiffies32 + -+ msecs_to_jiffies(bbr->params.probe_rtt_mode_ms); -+ bbr->probe_rtt_round_done = 0; -+ bbr->next_rtt_delivered = tp->delivered; -+ } else if (bbr->probe_rtt_done_stamp) { -+ if (bbr->round_start) -+ bbr->probe_rtt_round_done = 1; -+ if (bbr->probe_rtt_round_done) -+ bbr_check_probe_rtt_done(sk); -+ } -+ } -+ /* Restart after idle ends only once we process a new S/ACK for data */ -+ if (rs->delivered > 0) -+ bbr->idle_restart = 0; -+} -+ -+static void bbr_update_gains(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ switch (bbr->mode) { -+ case BBR_STARTUP: -+ bbr->pacing_gain = bbr->params.high_gain; -+ bbr->cwnd_gain = bbr->params.startup_cwnd_gain; -+ break; -+ case BBR_DRAIN: -+ bbr->pacing_gain = bbr->params.drain_gain; /* slow, to drain */ -+ bbr->cwnd_gain = bbr->params.startup_cwnd_gain; /* keep cwnd */ -+ break; -+ case BBR_PROBE_BW: -+ bbr->pacing_gain = bbr->params.pacing_gain[bbr->cycle_idx]; -+ bbr->cwnd_gain = bbr->params.cwnd_gain; -+ break; -+ case BBR_PROBE_RTT: -+ bbr->pacing_gain = BBR_UNIT; -+ bbr->cwnd_gain = BBR_UNIT; -+ break; -+ default: -+ WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode); -+ break; -+ } -+} -+ -+static void bbr_init(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ int i; -+ -+ WARN_ON_ONCE(tp->snd_cwnd >= bbr_cwnd_warn_val); -+ -+ bbr->initialized = 1; -+ bbr->params.high_gain = min(0x7FF, bbr_high_gain); -+ bbr->params.drain_gain = min(0x3FF, bbr_drain_gain); -+ bbr->params.startup_cwnd_gain = min(0x7FF, bbr_startup_cwnd_gain); -+ bbr->params.cwnd_gain = min(0x7FF, bbr_cwnd_gain); -+ bbr->params.cwnd_tso_budget = min(0x1U, bbr_cwnd_tso_budget); -+ bbr->params.cwnd_min_target = min(0xFU, bbr_cwnd_min_target); -+ bbr->params.min_rtt_win_sec = min(0x1FU, bbr_min_rtt_win_sec); -+ bbr->params.probe_rtt_mode_ms = min(0x1FFU, bbr_probe_rtt_mode_ms); -+ bbr->params.full_bw_cnt = min(0x7U, bbr_full_bw_cnt); -+ bbr->params.full_bw_thresh = min(0x3FFU, bbr_full_bw_thresh); -+ bbr->params.extra_acked_gain = min(0x7FF, bbr_extra_acked_gain); -+ bbr->params.extra_acked_win_rtts = min(0x1FU, bbr_extra_acked_win_rtts); -+ bbr->params.drain_to_target = bbr_drain_to_target ? 1 : 0; -+ bbr->params.precise_ece_ack = bbr_precise_ece_ack ? 1 : 0; -+ bbr->params.extra_acked_in_startup = bbr_extra_acked_in_startup ? 1 : 0; -+ bbr->params.probe_rtt_cwnd_gain = min(0xFFU, bbr_probe_rtt_cwnd_gain); -+ bbr->params.probe_rtt_win_ms = -+ min(0x3FFFU, -+ min_t(u32, bbr_probe_rtt_win_ms, -+ bbr->params.min_rtt_win_sec * MSEC_PER_SEC)); -+ for (i = 0; i < CYCLE_LEN; i++) -+ bbr->params.pacing_gain[i] = min(0x3FF, bbr_pacing_gain[i]); -+ bbr->params.usage_based_cwnd = bbr_usage_based_cwnd ? 
1 : 0; -+ bbr->params.tso_rtt_shift = min(0xFU, bbr_tso_rtt_shift); -+ -+ bbr->debug.snd_isn = tp->snd_una; -+ bbr->debug.target_cwnd = 0; -+ bbr->debug.undo = 0; -+ -+ bbr->init_cwnd = min(0x7FU, tp->snd_cwnd); -+ bbr->prior_cwnd = tp->prior_cwnd; -+ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; -+ bbr->next_rtt_delivered = 0; -+ bbr->prev_ca_state = TCP_CA_Open; -+ bbr->packet_conservation = 0; -+ -+ bbr->probe_rtt_done_stamp = 0; -+ bbr->probe_rtt_round_done = 0; -+ bbr->probe_rtt_min_us = tcp_min_rtt(tp); -+ bbr->probe_rtt_min_stamp = tcp_jiffies32; -+ bbr->min_rtt_us = tcp_min_rtt(tp); -+ bbr->min_rtt_stamp = tcp_jiffies32; -+ -+ bbr->has_seen_rtt = 0; -+ bbr_init_pacing_rate_from_rtt(sk); -+ -+ bbr->round_start = 0; -+ bbr->idle_restart = 0; -+ bbr->full_bw_reached = 0; -+ bbr->full_bw = 0; -+ bbr->full_bw_cnt = 0; -+ bbr->cycle_mstamp = 0; -+ bbr->cycle_idx = 0; -+ bbr->mode = BBR_STARTUP; -+ bbr->debug.rs_bw = 0; -+ -+ bbr->ack_epoch_mstamp = tp->tcp_mstamp; -+ bbr->ack_epoch_acked = 0; -+ bbr->extra_acked_win_rtts = 0; -+ bbr->extra_acked_win_idx = 0; -+ bbr->extra_acked[0] = 0; -+ bbr->extra_acked[1] = 0; -+ -+ bbr->ce_state = 0; -+ bbr->prior_rcv_nxt = tp->rcv_nxt; -+ bbr->try_fast_path = 0; -+ -+ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); -+} -+ -+static u32 bbr_sndbuf_expand(struct sock *sk) -+{ -+ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ -+ return 3; -+} -+ -+/* __________________________________________________________________________ -+ * -+ * Functions new to BBR v2 ("bbr") congestion control are below here. -+ * __________________________________________________________________________ -+ */ -+ -+/* Incorporate a new bw sample into the current window of our max filter. */ -+static void bbr2_take_bw_hi_sample(struct sock *sk, u32 bw) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); -+} -+ -+/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ -+static void bbr2_advance_bw_hi_filter(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (!bbr->bw_hi[1]) -+ return; /* no samples in this window; remember old window */ -+ bbr->bw_hi[0] = bbr->bw_hi[1]; -+ bbr->bw_hi[1] = 0; -+} -+ -+/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ -+static u32 bbr2_target_inflight(struct sock *sk) -+{ -+ u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); -+ -+ return min(bdp, tcp_sk(sk)->snd_cwnd); -+} -+ -+static bool bbr2_is_probing_bandwidth(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ return (bbr->mode == BBR_STARTUP) || -+ (bbr->mode == BBR_PROBE_BW && -+ (bbr->cycle_idx == BBR_BW_PROBE_REFILL || -+ bbr->cycle_idx == BBR_BW_PROBE_UP)); -+} -+ -+/* Has the given amount of time elapsed since we marked the phase start? */ -+static bool bbr2_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) -+{ -+ const struct tcp_sock *tp = tcp_sk(sk); -+ const struct bbr *bbr = inet_csk_ca(sk); -+ -+ return tcp_stamp_us_delta(tp->tcp_mstamp, -+ bbr->cycle_mstamp + interval_us) > 0; -+} -+ -+static void bbr2_handle_queue_too_high_in_startup(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr->full_bw_reached = 1; -+ bbr->inflight_hi = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); -+} -+ -+/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. 
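-+ * For example, with the defaults declared later in this file (ecn_thresh = -+ * BBR_UNIT/2, full_ecn_cnt = 2): for an ECN-eligible flow, two consecutive -+ * rounds in which at least half of the delivered packets are CE-marked end -+ * STARTUP and cap inflight_hi at the current estimated BDP (bbr_inflight() -+ * at BBR_UNIT gain).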
*/ -+static void bbr2_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || -+ !bbr->params.full_ecn_cnt || !bbr->params.ecn_thresh) -+ return; -+ -+ if (ce_ratio >= bbr->params.ecn_thresh) -+ bbr->startup_ecn_rounds++; -+ else -+ bbr->startup_ecn_rounds = 0; -+ -+ if (bbr->startup_ecn_rounds >= bbr->params.full_ecn_cnt) { -+ bbr->debug.event = 'E'; /* ECN caused STARTUP exit */ -+ bbr2_handle_queue_too_high_in_startup(sk); -+ return; -+ } -+} -+ -+static void bbr2_update_ecn_alpha(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ s32 delivered, delivered_ce; -+ u64 alpha, ce_ratio; -+ u32 gain; -+ -+ if (bbr->params.ecn_factor == 0) -+ return; -+ -+ delivered = tp->delivered - bbr->alpha_last_delivered; -+ delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; -+ -+ if (delivered == 0 || /* avoid divide by zero */ -+ WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ -+ return; -+ -+ /* See if we should use ECN sender logic for this connection. */ -+ if (!bbr->ecn_eligible && bbr_ecn_enable && -+ (bbr->min_rtt_us <= bbr->params.ecn_max_rtt_us || -+ !bbr->params.ecn_max_rtt_us)) -+ bbr->ecn_eligible = 1; -+ -+ ce_ratio = (u64)delivered_ce << BBR_SCALE; -+ do_div(ce_ratio, delivered); -+ gain = bbr->params.ecn_alpha_gain; -+ alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE; -+ alpha += (gain * ce_ratio) >> BBR_SCALE; -+ bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT); -+ -+ bbr->alpha_last_delivered = tp->delivered; -+ bbr->alpha_last_delivered_ce = tp->delivered_ce; -+ -+ bbr2_check_ecn_too_high_in_startup(sk, ce_ratio); -+} -+ -+/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */ -+static void bbr2_raise_inflight_hi_slope(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 growth_this_round, cnt; -+ -+ /* Calculate "slope": packets S/Acked per inflight_hi increment. */ -+ growth_this_round = 1 << bbr->bw_probe_up_rounds; -+ bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30); -+ cnt = tp->snd_cwnd / growth_this_round; -+ cnt = max(cnt, 1U); -+ bbr->bw_probe_up_cnt = cnt; -+ bbr->debug.event = 'G'; /* Grow inflight_hi slope */ -+} -+ -+/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */ -+static void bbr2_probe_inflight_hi_upward(struct sock *sk, -+ const struct rate_sample *rs) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 delta; -+ -+ if (!tp->is_cwnd_limited || tp->snd_cwnd < bbr->inflight_hi) { -+ bbr->bw_probe_up_acks = 0; /* don't accumulate unused credits */ -+ return; /* not fully using inflight_hi, so don't grow it */ -+ } -+ -+ /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */ -+ bbr->bw_probe_up_acks += rs->acked_sacked; -+ if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) { -+ delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt; -+ bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt; -+ bbr->inflight_hi += delta; -+ bbr->debug.event = 'I'; /* Increment inflight_hi */ -+ } -+ -+ if (bbr->round_start) -+ bbr2_raise_inflight_hi_slope(sk); -+} -+ -+/* Does loss/ECN rate for this sample say inflight is "too high"? 
-+ * This is used by both the bbr2_check_loss_too_high_in_startup() function, -+ * which can be used in either v1 or v2, and the PROBE_UP phase of v2, which -+ * uses it to notice when loss/ECN rates suggest inflight is too high. -+ */ -+static bool bbr2_is_inflight_too_high(const struct sock *sk, -+ const struct rate_sample *rs) -+{ -+ const struct bbr *bbr = inet_csk_ca(sk); -+ u32 loss_thresh, ecn_thresh; -+ -+ if (rs->lost > 0 && rs->tx_in_flight) { -+ loss_thresh = (u64)rs->tx_in_flight * bbr->params.loss_thresh >> -+ BBR_SCALE; -+ if (rs->lost > loss_thresh) -+ return true; -+ } -+ -+ if (rs->delivered_ce > 0 && rs->delivered > 0 && -+ bbr->ecn_eligible && bbr->params.ecn_thresh) { -+ ecn_thresh = (u64)rs->delivered * bbr->params.ecn_thresh >> -+ BBR_SCALE; -+ if (rs->delivered_ce >= ecn_thresh) -+ return true; -+ } -+ -+ return false; -+} -+ -+/* Calculate the tx_in_flight level that corresponded to excessive loss. -+ * We find "lost_prefix" segs of the skb where loss rate went too high, -+ * by solving for "lost_prefix" in the following equation: -+ * lost / inflight >= loss_thresh -+ * (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh -+ * Then we take that equation, convert it to fixed point, and -+ * round up to the nearest packet. -+ */ -+static u32 bbr2_inflight_hi_from_lost_skb(const struct sock *sk, -+ const struct rate_sample *rs, -+ const struct sk_buff *skb) -+{ -+ const struct bbr *bbr = inet_csk_ca(sk); -+ u32 loss_thresh = bbr->params.loss_thresh; -+ u32 pcount, divisor, inflight_hi; -+ s32 inflight_prev, lost_prev; -+ u64 loss_budget, lost_prefix; -+ -+ pcount = tcp_skb_pcount(skb); -+ -+ /* How much data was in flight before this skb? */ -+ inflight_prev = rs->tx_in_flight - pcount; -+ if (WARN_ONCE(inflight_prev < 0, -+ "tx_in_flight: %u pcount: %u reneg: %u", -+ rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg)) -+ return ~0U; -+ -+ /* How much inflight data was marked lost before this skb? */ -+ lost_prev = rs->lost - pcount; -+ if (WARN_ON_ONCE(lost_prev < 0)) -+ return ~0U; -+ -+ /* At what prefix of this lost skb did loss rate exceed loss_thresh? */ -+ loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1; -+ loss_budget >>= BBR_SCALE; -+ if (lost_prev >= loss_budget) { -+ lost_prefix = 0; /* previous losses crossed loss_thresh */ -+ } else { -+ lost_prefix = loss_budget - lost_prev; -+ lost_prefix <<= BBR_SCALE; -+ divisor = BBR_UNIT - loss_thresh; -+ if (WARN_ON_ONCE(!divisor)) /* loss_thresh is 8 bits */ -+ return ~0U; -+ do_div(lost_prefix, divisor); -+ } -+ -+ inflight_hi = inflight_prev + lost_prefix; -+ return inflight_hi; -+} -+ -+/* If loss/ECN rates during probing indicated we may have overfilled a -+ * buffer, return an operating point that tries to leave unutilized headroom in -+ * the path for other flows, for fairness convergence and lower RTTs and loss. -+ */ -+static u32 bbr2_inflight_with_headroom(const struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 headroom, headroom_fraction; -+ -+ if (bbr->inflight_hi == ~0U) -+ return ~0U; -+ -+ headroom_fraction = bbr->params.inflight_headroom; -+ headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE; -+ headroom = max(headroom, 1U); -+ return max_t(s32, bbr->inflight_hi - headroom, -+ bbr->params.cwnd_min_target); -+} -+ -+/* Bound cwnd to a sensible level, based on our current probing state -+ * machine phase and model of a good inflight level (inflight_lo, inflight_hi). 
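-+ * Concretely: in PROBE_BW phases other than CRUISE the cap is inflight_hi; -+ * in PROBE_RTT and PROBE_BW/CRUISE it is bbr2_inflight_with_headroom(); and -+ * in all cases the result is further bounded by inflight_lo and floored at -+ * cwnd_min_target, as the code below shows.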
-+ */ -+static void bbr2_bound_cwnd_for_inflight_model(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 cap; -+ -+ /* tcp_rcv_synsent_state_process() currently calls tcp_ack() -+ * and thus cong_control() without first initializing us(!). -+ */ -+ if (!bbr->initialized) -+ return; -+ -+ cap = ~0U; -+ if (bbr->mode == BBR_PROBE_BW && -+ bbr->cycle_idx != BBR_BW_PROBE_CRUISE) { -+ /* Probe to see if more packets fit in the path. */ -+ cap = bbr->inflight_hi; -+ } else { -+ if (bbr->mode == BBR_PROBE_RTT || -+ (bbr->mode == BBR_PROBE_BW && -+ bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) -+ cap = bbr2_inflight_with_headroom(sk); -+ } -+ /* Adapt to any loss/ECN since our last bw probe. */ -+ cap = min(cap, bbr->inflight_lo); -+ -+ cap = max_t(u32, cap, bbr->params.cwnd_min_target); -+ tp->snd_cwnd = min(cap, tp->snd_cwnd); -+} -+ -+/* Estimate a short-term lower bound on the capacity available now, based -+ * on measurements of the current delivery process and recent history. When we -+ * are seeing loss/ECN at times when we are not probing bw, then conservatively -+ * move toward flow balance by multiplicatively cutting our short-term -+ * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a -+ * multiplicative decrease in order to converge to a lower capacity in time -+ * logarithmic in the magnitude of the decrease. -+ * -+ * However, we do not cut our short-term estimates lower than the current rate -+ * and volume of delivered data from this round trip, since from the current -+ * delivery process we can estimate the measured capacity available now. -+ * -+ * Anything faster than that approach would knowingly risk high loss, which can -+ * cause low bw for Reno/CUBIC and high loss recovery latency for -+ * request/response flows using any congestion control. -+ */ -+static void bbr2_adapt_lower_bounds(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 ecn_cut, ecn_inflight_lo, beta; -+ -+ /* We only use lower-bound estimates when not probing bw. -+ * When probing we need to push inflight higher to probe bw. -+ */ -+ if (bbr2_is_probing_bandwidth(sk)) -+ return; -+ -+ /* ECN response. */ -+ if (bbr->ecn_in_round && bbr->ecn_eligible && bbr->params.ecn_factor) { -+ /* Reduce inflight to (1 - alpha*ecn_factor). */ -+ ecn_cut = (BBR_UNIT - -+ ((bbr->ecn_alpha * bbr->params.ecn_factor) >> -+ BBR_SCALE)); -+ if (bbr->inflight_lo == ~0U) -+ bbr->inflight_lo = tp->snd_cwnd; -+ ecn_inflight_lo = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; -+ } else { -+ ecn_inflight_lo = ~0U; -+ } -+ -+ /* Loss response. */ -+ if (bbr->loss_in_round) { -+ /* Reduce bw and inflight to (1 - beta). */ -+ if (bbr->bw_lo == ~0U) -+ bbr->bw_lo = bbr_max_bw(sk); -+ if (bbr->inflight_lo == ~0U) -+ bbr->inflight_lo = tp->snd_cwnd; -+ beta = bbr->params.beta; -+ bbr->bw_lo = -+ max_t(u32, bbr->bw_latest, -+ (u64)bbr->bw_lo * -+ (BBR_UNIT - beta) >> BBR_SCALE); -+ bbr->inflight_lo = -+ max_t(u32, bbr->inflight_latest, -+ (u64)bbr->inflight_lo * -+ (BBR_UNIT - beta) >> BBR_SCALE); -+ } -+ -+ /* Adjust to the lower of the levels implied by loss or ECN. */ -+ bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); -+} -+ -+/* Reset any short-term lower-bound adaptation to congestion, so that we can -+ * push our inflight up. 
-+ */ -+static void bbr2_reset_lower_bounds(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr->bw_lo = ~0U; -+ bbr->inflight_lo = ~0U; -+} -+ -+/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state -+ * machine phase where we adapt our lower bound based on congestion signals. -+ */ -+static void bbr2_reset_congestion_signals(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr->loss_in_round = 0; -+ bbr->ecn_in_round = 0; -+ bbr->loss_in_cycle = 0; -+ bbr->ecn_in_cycle = 0; -+ bbr->bw_latest = 0; -+ bbr->inflight_latest = 0; -+} -+ -+/* Update (most of) our congestion signals: track the recent rate and volume of -+ * delivered data, presence of loss, and EWMA degree of ECN marking. -+ */ -+static void bbr2_update_congestion_signals( -+ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ u64 bw; -+ -+ bbr->loss_round_start = 0; -+ if (rs->interval_us <= 0 || !rs->acked_sacked) -+ return; /* Not a valid observation */ -+ bw = ctx->sample_bw; -+ -+ if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) -+ bbr2_take_bw_hi_sample(sk, bw); -+ -+ bbr->loss_in_round |= (rs->losses > 0); -+ -+ /* Update rate and volume of delivered data from latest round trip: */ -+ bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); -+ bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); -+ -+ if (before(rs->prior_delivered, bbr->loss_round_delivered)) -+ return; /* skip the per-round-trip updates */ -+ /* Now do per-round-trip updates. */ -+ bbr->loss_round_delivered = tp->delivered; /* mark round trip */ -+ bbr->loss_round_start = 1; -+ bbr2_adapt_lower_bounds(sk); -+ -+ /* Update windowed "latest" (single-round-trip) filters. */ -+ bbr->loss_in_round = 0; -+ bbr->ecn_in_round = 0; -+ bbr->bw_latest = ctx->sample_bw; -+ bbr->inflight_latest = rs->delivered; -+} -+ -+/* Bandwidth probing can cause loss. To help coexistence with loss-based -+ * congestion control we spread out our probing in a Reno-conscious way. Due to -+ * the shape of the Reno sawtooth, the time required between loss epochs for an -+ * idealized Reno flow is a number of round trips that is the BDP of that -+ * flow. We count packet-timed round trips directly, since measured RTT can -+ * vary widely, and Reno is driven by packet-timed round trips. -+ */ -+static bool bbr2_is_reno_coexistence_probe_time(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 inflight, rounds, reno_gain, reno_rounds; -+ -+ /* Random loss can shave some small percentage off of our inflight -+ * in each round. To survive this, flows need robust periodic probes. -+ */ -+ rounds = bbr->params.bw_probe_max_rounds; -+ -+ reno_gain = bbr->params.bw_probe_reno_gain; -+ if (reno_gain) { -+ inflight = bbr2_target_inflight(sk); -+ reno_rounds = ((u64)inflight * reno_gain) >> BBR_SCALE; -+ rounds = min(rounds, reno_rounds); -+ } -+ return bbr->rounds_since_probe >= rounds; -+} -+ -+/* How long do we want to wait before probing for bandwidth (and risking -+ * loss)? We randomize the wait, for better mixing and fairness convergence. -+ * -+ * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips. 
-+ * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow, -+ * (eg 4K video to a broadband user): -+ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets -+ * -+ * We bound the BBR-native inter-bw-probe wall clock time to be: -+ * (a) higher than 2 sec: to try to avoid causing loss for a long enough time -+ * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must -+ * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs -+ * (b) lower than 3 sec: to ensure flows can start probing in a reasonable -+ * amount of time to discover unutilized bw on human-scale interactive -+ * time-scales (e.g. perhaps traffic from a web page download that we -+ * were competing with is now complete). -+ */ -+static void bbr2_pick_probe_wait(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ /* Decide the random round-trip bound for wait until probe: */ -+ bbr->rounds_since_probe = -+ prandom_u32_max(bbr->params.bw_probe_rand_rounds); -+ /* Decide the random wall clock bound for wait until probe: */ -+ bbr->probe_wait_us = bbr->params.bw_probe_base_us + -+ prandom_u32_max(bbr->params.bw_probe_rand_us); -+} -+ -+static void bbr2_set_cycle_idx(struct sock *sk, int cycle_idx) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr->cycle_idx = cycle_idx; -+ /* New phase, so need to update cwnd and pacing rate. */ -+ bbr->try_fast_path = 0; -+} -+ -+/* Send at estimated bw to fill the pipe, but not queue. We need this phase -+ * before PROBE_UP, because as soon as we send faster than the available bw -+ * we will start building a queue, and if the buffer is shallow we can cause -+ * loss. If we do not fill the pipe before we cause this loss, our bw_hi and -+ * inflight_hi estimates will underestimate. -+ */ -+static void bbr2_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr2_reset_lower_bounds(sk); -+ if (bbr->inflight_hi != ~0U) -+ bbr->inflight_hi += bbr->params.refill_add_inc; -+ bbr->bw_probe_up_rounds = bw_probe_up_rounds; -+ bbr->bw_probe_up_acks = 0; -+ bbr->stopped_risky_probe = 0; -+ bbr->ack_phase = BBR_ACKS_REFILLING; -+ bbr->next_rtt_delivered = tp->delivered; -+ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); -+} -+ -+/* Now probe max deliverable data rate and volume. */ -+static void bbr2_start_bw_probe_up(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr->ack_phase = BBR_ACKS_PROBE_STARTING; -+ bbr->next_rtt_delivered = tp->delivered; -+ bbr->cycle_mstamp = tp->tcp_mstamp; -+ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_UP); -+ bbr2_raise_inflight_hi_slope(sk); -+} -+ -+/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall -+ * clock time at which to probe beyond an inflight that we think to be -+ * safe. This will knowingly risk packet loss, so we want to do this rarely, to -+ * keep packet loss rates low. Also start a round-trip counter, to probe faster -+ * if we estimate a Reno flow at our BDP would probe faster. 
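-+ * Putting the numbers above together (illustrative path): a 25Mbps, 30ms -+ * flow has a BDP of about 62 packets, so with bw_probe_reno_gain = BBR_UNIT -+ * the round-trip bound is min(bw_probe_max_rounds, 62) packet-timed rounds, -+ * while the wall clock bound is 2-3 secs; whichever trips first starts the -+ * next probe.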
-+ */ -+static void bbr2_start_bw_probe_down(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr2_reset_congestion_signals(sk); -+ bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ -+ bbr2_pick_probe_wait(sk); -+ bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ -+ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; -+ bbr->next_rtt_delivered = tp->delivered; -+ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); -+} -+ -+/* Cruise: maintain what we estimate to be a neutral, conservative -+ * operating point, without attempting to probe up for bandwidth or down for -+ * RTT, and only reducing inflight in response to loss/ECN signals. -+ */ -+static void bbr2_start_bw_probe_cruise(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (bbr->inflight_lo != ~0U) -+ bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); -+ -+ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); -+} -+ -+/* Loss and/or ECN rate is too high while probing. -+ * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. -+ */ -+static void bbr2_handle_inflight_too_high(struct sock *sk, -+ const struct rate_sample *rs) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ const u32 beta = bbr->params.beta; -+ -+ bbr->prev_probe_too_high = 1; -+ bbr->bw_probe_samples = 0; /* only react once per probe */ -+ bbr->debug.event = 'L'; /* Loss/ECN too high */ -+ /* If we are app-limited then we are not robustly -+ * probing the max volume of inflight data we think -+ * might be safe (analogous to how app-limited bw -+ * samples are not known to be robustly probing bw). -+ */ -+ if (!rs->is_app_limited) -+ bbr->inflight_hi = max_t(u32, rs->tx_in_flight, -+ (u64)bbr2_target_inflight(sk) * -+ (BBR_UNIT - beta) >> BBR_SCALE); -+ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) -+ bbr2_start_bw_probe_down(sk); -+} -+ -+/* If we're seeing bw and loss samples reflecting our bw probing, adapt -+ * using the signals we see. If loss or ECN mark rate gets too high, then adapt -+ * inflight_hi downward. If we're able to push inflight higher without such -+ * signals, push higher: adapt inflight_hi upward. -+ */ -+static bool bbr2_adapt_upper_bounds(struct sock *sk, -+ const struct rate_sample *rs) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ /* Track when we'll see bw/loss samples resulting from our bw probes. */ -+ if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start) -+ bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK; -+ if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) { -+ /* End of samples from bw probing phase. */ -+ bbr->bw_probe_samples = 0; -+ bbr->ack_phase = BBR_ACKS_INIT; -+ /* At this point in the cycle, our current bw sample is also -+ * our best recent chance at finding the highest available bw -+ * for this flow. So now is the best time to forget the bw -+ * samples from the previous cycle, by advancing the window. -+ */ -+ if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) -+ bbr2_advance_bw_hi_filter(sk); -+ /* If we had an inflight_hi, then probed and pushed inflight all -+ * the way up to hit that inflight_hi without seeing any -+ * high loss/ECN in all the resulting ACKs from that probing, -+ * then probe up again, this time letting inflight persist at -+ * inflight_hi for a round trip, then accelerating beyond. 
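-+ * In the code below this is the stopped_risky_probe && !prev_probe_too_high -+ * check: rather than waiting out the usual probe_wait_us, we go straight -+ * back to BBR_BW_PROBE_REFILL via bbr2_start_bw_probe_refill(sk, 0).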
-+ */ -+ if (bbr->mode == BBR_PROBE_BW && -+ bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { -+ bbr->debug.event = 'R'; /* reprobe */ -+ bbr2_start_bw_probe_refill(sk, 0); -+ return true; /* yes, decided state transition */ -+ } -+ } -+ -+ if (bbr2_is_inflight_too_high(sk, rs)) { -+ if (bbr->bw_probe_samples) /* sample is from bw probing? */ -+ bbr2_handle_inflight_too_high(sk, rs); -+ } else { -+ /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ -+ if (bbr->inflight_hi == ~0U) /* no excess queue signals yet? */ -+ return false; -+ -+ /* To be resilient to random loss, we must raise inflight_hi -+ * if we observe in any phase that a higher level is safe. -+ */ -+ if (rs->tx_in_flight > bbr->inflight_hi) { -+ bbr->inflight_hi = rs->tx_in_flight; -+ bbr->debug.event = 'U'; /* raise up inflight_hi */ -+ } -+ -+ if (bbr->mode == BBR_PROBE_BW && -+ bbr->cycle_idx == BBR_BW_PROBE_UP) -+ bbr2_probe_inflight_hi_upward(sk, rs); -+ } -+ -+ return false; -+} -+ -+/* Check if it's time to probe for bandwidth now, and if so, kick it off. */ -+static bool bbr2_check_time_to_probe_bw(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 n; -+ -+ /* If we seem to be at an operating point where we are not seeing loss -+ * but we are seeing ECN marks, then when the ECN marks cease we reprobe -+ * quickly (in case a burst of cross-traffic has ceased and freed up bw, -+ * or in case we are sharing with multiplicatively probing traffic). -+ */ -+ if (bbr->params.ecn_reprobe_gain && bbr->ecn_eligible && -+ bbr->ecn_in_cycle && !bbr->loss_in_cycle && -+ inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { -+ bbr->debug.event = 'A'; /* *A*ll clear to probe *A*gain */ -+ /* Calculate n so that when bbr2_raise_inflight_hi_slope() -+ * computes growth_this_round as 2^n it will be roughly the -+ * desired volume of data (inflight_hi*ecn_reprobe_gain). -+ */ -+ n = ilog2((((u64)bbr->inflight_hi * -+ bbr->params.ecn_reprobe_gain) >> BBR_SCALE)); -+ bbr2_start_bw_probe_refill(sk, n); -+ return true; -+ } -+ -+ if (bbr2_has_elapsed_in_phase(sk, bbr->probe_wait_us) || -+ bbr2_is_reno_coexistence_probe_time(sk)) { -+ bbr2_start_bw_probe_refill(sk, 0); -+ return true; -+ } -+ return false; -+} -+ -+/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ -+static bool bbr2_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ bool is_under_bdp, is_long_enough; -+ -+ /* Always need to pull inflight down to leave headroom in queue. */ -+ if (inflight > bbr2_inflight_with_headroom(sk)) -+ return false; -+ -+ is_under_bdp = inflight <= bbr_inflight(sk, bw, BBR_UNIT); -+ if (bbr->params.drain_to_target) -+ return is_under_bdp; -+ -+ is_long_enough = bbr2_has_elapsed_in_phase(sk, bbr->min_rtt_us); -+ return is_under_bdp || is_long_enough; -+} -+ -+/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ -+static void bbr2_update_cycle_phase(struct sock *sk, -+ const struct rate_sample *rs) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ bool is_risky = false, is_queuing = false; -+ u32 inflight, bw; -+ -+ if (!bbr_full_bw_reached(sk)) -+ return; -+ -+ /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. 
*/ -+ if (bbr2_adapt_upper_bounds(sk, rs)) -+ return; /* already decided state transition */ -+ -+ if (bbr->mode != BBR_PROBE_BW) -+ return; -+ -+ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); -+ bw = bbr_max_bw(sk); -+ -+ switch (bbr->cycle_idx) { -+ /* First we spend most of our time cruising with a pacing_gain of 1.0, -+ * which paces at the estimated bw, to try to fully use the pipe -+ * without building queue. If we encounter loss/ECN marks, we adapt -+ * by slowing down. -+ */ -+ case BBR_BW_PROBE_CRUISE: -+ if (bbr2_check_time_to_probe_bw(sk)) -+ return; /* already decided state transition */ -+ break; -+ -+ /* After cruising, when it's time to probe, we first "refill": we send -+ * at the estimated bw to fill the pipe, before probing higher and -+ * knowingly risking overflowing the bottleneck buffer (causing loss). -+ */ -+ case BBR_BW_PROBE_REFILL: -+ if (bbr->round_start) { -+ /* After one full round trip of sending in REFILL, we -+ * start to see bw samples reflecting our REFILL, which -+ * may be putting too much data in flight. -+ */ -+ bbr->bw_probe_samples = 1; -+ bbr2_start_bw_probe_up(sk); -+ } -+ break; -+ -+ /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to -+ * probe for bw. If we have not seen loss/ECN, we try to raise inflight -+ * to at least pacing_gain*BDP; note that this may take more than -+ * min_rtt if min_rtt is small (e.g. on a LAN). -+ * -+ * We terminate PROBE_UP bandwidth probing upon any of the following: -+ * -+ * (1) We've pushed inflight up to hit the inflight_hi target set in the -+ * most recent previous bw probe phase. Thus we want to start -+ * draining the queue immediately because it's very likely the most -+ * recently sent packets will fill the queue and cause drops. -+ * (checked here) -+ * (2) We have probed for at least 1*min_rtt_us, and the -+ * estimated queue is high enough (inflight > 1.25 * estimated_bdp). -+ * (checked here) -+ * (3) Loss filter says loss rate is "too high". -+ * (checked in bbr2_is_inflight_too_high()) -+ * (4) ECN filter says ECN mark rate is "too high". -+ * (checked in bbr2_is_inflight_too_high()) -+ */ -+ case BBR_BW_PROBE_UP: -+ if (bbr->prev_probe_too_high && -+ inflight >= bbr->inflight_hi) { -+ bbr->stopped_risky_probe = 1; -+ is_risky = true; -+ bbr->debug.event = 'D'; /* D for danger */ -+ } else if (bbr2_has_elapsed_in_phase(sk, bbr->min_rtt_us) && -+ inflight >= -+ bbr_inflight(sk, bw, -+ bbr->params.bw_probe_pif_gain)) { -+ is_queuing = true; -+ bbr->debug.event = 'Q'; /* building Queue */ -+ } -+ if (is_risky || is_queuing) { -+ bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */ -+ bbr2_start_bw_probe_down(sk); /* restart w/ down */ -+ } -+ break; -+ -+ /* After probing in PROBE_UP, we have usually accumulated some data in -+ * the bottleneck buffer (if bw probing didn't find more bw). We next -+ * enter PROBE_DOWN to try to drain any excess data from the queue. To -+ * do this, we use a pacing_gain < 1.0. We hold this pacing gain until -+ * our inflight is less than that target cruising point, which is the -+ * minimum of (a) the amount needed to leave headroom, and (b) the -+ * estimated BDP. Once inflight falls to match the target, we estimate -+ * the queue is drained; persisting would underutilize the pipe. 
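-+ * For example (illustrative sizes): with inflight_hi = 100 and the default -+ * inflight_headroom of 15%, cruising can begin once inflight has fallen to -+ * 85 packets or fewer, and either inflight is at or below the estimated BDP -+ * or at least min_rtt has elapsed in this phase (with drain_to_target set, -+ * only the BDP test applies).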
-+ */ -+ case BBR_BW_PROBE_DOWN: -+ if (bbr2_check_time_to_probe_bw(sk)) -+ return; /* already decided state transition */ -+ if (bbr2_check_time_to_cruise(sk, inflight, bw)) -+ bbr2_start_bw_probe_cruise(sk); -+ break; -+ -+ default: -+ WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx); -+ } -+} -+ -+/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */ -+static void bbr2_exit_probe_rtt(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr2_reset_lower_bounds(sk); -+ if (bbr_full_bw_reached(sk)) { -+ bbr->mode = BBR_PROBE_BW; -+ /* Raising inflight after PROBE_RTT may cause loss, so reset -+ * the PROBE_BW clock and schedule the next bandwidth probe for -+ * a friendly and randomized future point in time. -+ */ -+ bbr2_start_bw_probe_down(sk); -+ /* Since we are exiting PROBE_RTT, we know inflight is -+ * below our estimated BDP, so it is reasonable to cruise. -+ */ -+ bbr2_start_bw_probe_cruise(sk); -+ } else { -+ bbr->mode = BBR_STARTUP; -+ } -+} -+ -+/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until -+ * the end of the round in recovery to get a good estimate of how many packets -+ * have been lost, and how many we need to drain with a low pacing rate. -+ */ -+static void bbr2_check_loss_too_high_in_startup(struct sock *sk, -+ const struct rate_sample *rs) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (bbr_full_bw_reached(sk)) -+ return; -+ -+ /* For STARTUP exit, check the loss rate at the end of each round trip -+ * of Recovery episodes in STARTUP. We check the loss rate at the end -+ * of the round trip to filter out noisy/low loss and have a better -+ * sense of inflight (extent of loss), so we can drain more accurately. -+ */ -+ if (rs->losses && bbr->loss_events_in_round < 0xf) -+ bbr->loss_events_in_round++; /* update saturating counter */ -+ if (bbr->params.full_loss_cnt && bbr->loss_round_start && -+ inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && -+ bbr->loss_events_in_round >= bbr->params.full_loss_cnt && -+ bbr2_is_inflight_too_high(sk, rs)) { -+ bbr->debug.event = 'P'; /* Packet loss caused STARTUP exit */ -+ bbr2_handle_queue_too_high_in_startup(sk); -+ return; -+ } -+ if (bbr->loss_round_start) -+ bbr->loss_events_in_round = 0; -+} -+ -+/* If we are done draining, advance into steady state operation in PROBE_BW. */ -+static void bbr2_check_drain(struct sock *sk, const struct rate_sample *rs, -+ struct bbr_context *ctx) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (bbr_check_drain(sk, rs, ctx)) { -+ bbr->mode = BBR_PROBE_BW; -+ bbr2_start_bw_probe_down(sk); -+ } -+} -+ -+static void bbr2_update_model(struct sock *sk, const struct rate_sample *rs, -+ struct bbr_context *ctx) -+{ -+ bbr2_update_congestion_signals(sk, rs, ctx); -+ bbr_update_ack_aggregation(sk, rs); -+ bbr2_check_loss_too_high_in_startup(sk, rs); -+ bbr_check_full_bw_reached(sk, rs); -+ bbr2_check_drain(sk, rs, ctx); -+ bbr2_update_cycle_phase(sk, rs); -+ bbr_update_min_rtt(sk, rs); -+} -+ -+/* Fast path for app-limited case. -+ * -+ * On each ack, we execute bbr state machine, which primarily consists of: -+ * 1) update model based on new rate sample, and -+ * 2) update control based on updated model or state change. -+ * -+ * There are certain workload/scenarios, e.g. app-limited case, where -+ * either we can skip updating model or we can skip update of both model -+ * as well as control. This provides significant softirq cpu savings for -+ * processing incoming acks. 
-+ * -+ * In case of app-limited, if there is no congestion (loss/ecn) and -+ * if observed bw sample is less than current estimated bw, then we can -+ * skip some of the computation in bbr state processing: -+ * -+ * - if there is no rtt/mode/phase change: In this case, since all the -+ * parameters of the network model are constant, we can skip the model -+ * as well as the control update. -+ * -+ * - else we can skip rest of the model update. But we still need to -+ * update the control to account for the new rtt/mode/phase. -+ * -+ * Returns whether we can take fast path or not. -+ */ -+static bool bbr2_fast_path(struct sock *sk, bool *update_model, -+ const struct rate_sample *rs, struct bbr_context *ctx) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 prev_min_rtt_us, prev_mode; -+ -+ if (bbr->params.fast_path && bbr->try_fast_path && -+ rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && -+ !bbr->loss_in_round && !bbr->ecn_in_round) { -+ prev_mode = bbr->mode; -+ prev_min_rtt_us = bbr->min_rtt_us; -+ bbr2_check_drain(sk, rs, ctx); -+ bbr2_update_cycle_phase(sk, rs); -+ bbr_update_min_rtt(sk, rs); -+ -+ if (bbr->mode == prev_mode && -+ bbr->min_rtt_us == prev_min_rtt_us && -+ bbr->try_fast_path) -+ return true; -+ -+ /* Skip model update, but control still needs to be updated */ -+ *update_model = false; -+ } -+ return false; -+} -+ -+static void bbr2_main(struct sock *sk, const struct rate_sample *rs) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ struct bbr_context ctx = { 0 }; -+ bool update_model = true; -+ u32 bw; -+ -+ bbr->debug.event = '.'; /* init to default NOP (no event yet) */ -+ -+ bbr_update_round_start(sk, rs, &ctx); -+ if (bbr->round_start) { -+ bbr->rounds_since_probe = -+ min_t(s32, bbr->rounds_since_probe + 1, 0xFF); -+ bbr2_update_ecn_alpha(sk); -+ } -+ -+ bbr->ecn_in_round |= rs->is_ece; -+ bbr_calculate_bw_sample(sk, rs, &ctx); -+ -+ if (bbr2_fast_path(sk, &update_model, rs, &ctx)) -+ goto out; -+ -+ if (update_model) -+ bbr2_update_model(sk, rs, &ctx); -+ -+ bbr_update_gains(sk); -+ bw = bbr_bw(sk); -+ bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); -+ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain, -+ tp->snd_cwnd, &ctx); -+ bbr2_bound_cwnd_for_inflight_model(sk); -+ -+out: -+ bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state; -+ bbr->loss_in_cycle |= rs->lost > 0; -+ bbr->ecn_in_cycle |= rs->delivered_ce > 0; -+ -+ bbr_debug(sk, rs->acked_sacked, rs, &ctx); -+} -+ -+/* Module parameters that are settable by TCP_CONGESTION_PARAMS are declared -+ * down here, so that the algorithm functions that use the parameters must use -+ * the per-socket parameters; if they accidentally use the global version -+ * then there will be a compile error. -+ * TODO(ncardwell): move all per-socket parameters down to this section. -+ */ -+ -+/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. -+ * No loss response when 0. Max allowed value is 255. -+ */ -+static u32 bbr_beta = BBR_UNIT * 30 / 100; -+ -+/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE. -+ * Max allowed value is 255. -+ */ -+static u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; /* 1/16 = 6.25% */ -+ -+/* The initial value for the ecn_alpha state variable. Default and max -+ * BBR_UNIT (256), representing 1.0. This allows a flow to respond quickly -+ * to congestion if the bottleneck is congested when the flow starts up. 
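-+ * Once samples arrive, bbr2_update_ecn_alpha() decays this value: with the -+ * default ecn_alpha_gain of 1/16, each round computes roughly -+ * alpha = (15/16) * alpha + (1/16) * ce_ratio, -+ * so a fully CE-marked round moves alpha about 6% of the way toward 1.0.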
-+ */ -+static u32 bbr_ecn_alpha_init = BBR_UNIT; /* 1.0, to respond quickly */ -+ -+/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. -+ * No ECN based bounding when 0. Max allowed value is 255. -+ */ -+static u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ -+ -+/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. -+ * Scaled by BBR_SCALE. Disabled when 0. Max allowed is 255. -+ */ -+static u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ -+ -+/* Max RTT (in usec) at which to use sender-side ECN logic. -+ * Disabled when 0 (ECN allowed at any RTT). -+ * Max allowed for the parameter is 524287 (0x7ffff) us, ~524 ms. -+ */ -+static u32 bbr_ecn_max_rtt_us = 5000; -+ -+/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN -+ * clears then use a multiplicative increase to quickly reprobe bw by -+ * starting inflight probing at the given multiple of inflight_hi. -+ * Default for this experimental knob is 0 (disabled). -+ * Planned value for experiments: BBR_UNIT * 1 / 2 = 128, representing 0.5. -+ */ -+static u32 bbr_ecn_reprobe_gain; -+ -+/* Estimate bw probing has gone too far if loss rate exceeds this level. */ -+static u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ -+ -+/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, -+ * and loss rate is higher than bbr_loss_thresh. -+ * Disabled if 0. Max allowed value is 15 (0xF). -+ */ -+static u32 bbr_full_loss_cnt = 8; -+ -+/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh -+ * meets this count. Max allowed value is 3. -+ */ -+static u32 bbr_full_ecn_cnt = 2; -+ -+/* Fraction of unutilized headroom to try to leave in path upon high loss. */ -+static u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; -+ -+/* Multiplier to get target inflight (as multiple of BDP) for PROBE_UP phase. -+ * Default is 1.25x, as in BBR v1. Max allowed is 511. -+ */ -+static u32 bbr_bw_probe_pif_gain = BBR_UNIT * 5 / 4; -+ -+/* Multiplier to get Reno-style probe epoch duration as: k * BDP round trips. -+ * If zero, disables this BBR v2 Reno-style BDP-scaled coexistence mechanism. -+ * Max allowed is 511. -+ */ -+static u32 bbr_bw_probe_reno_gain = BBR_UNIT; -+ -+/* Max number of packet-timed rounds to wait before probing for bandwidth. If -+ * we want to tolerate 1% random loss per round, and not have this cut our -+ * inflight too much, we must probe for bw periodically on roughly this scale. -+ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. -+ * We aim to be fair with Reno/CUBIC up to a BDP of at least: -+ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets -+ */ -+static u32 bbr_bw_probe_max_rounds = 63; -+ -+/* Max amount of randomness to inject in round counting for Reno-coexistence. -+ * Max value is 15. -+ */ -+static u32 bbr_bw_probe_rand_rounds = 2; -+ -+/* Use BBR-native probe time scale starting at this many usec. -+ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: -+ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs -+ */ -+static u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ -+ -+/* Use BBR-native probes spread over this many usec: */ -+static u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 sec */ -+ -+/* Undo the model changes made in loss recovery if recovery was spurious? */ -+static bool bbr_undo = true; -+ -+/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? 
*/ -+static bool bbr_fast_path = true; /* default: enabled */ -+ -+/* Use fast ack mode ? */ -+static int bbr_fast_ack_mode = 1; /* default: rwnd check off */ -+ -+/* How much to additively increase inflight_hi when entering REFILL? */ -+static u32 bbr_refill_add_inc; /* default: disabled */ -+ -+module_param_named(beta, bbr_beta, uint, 0644); -+module_param_named(ecn_alpha_gain, bbr_ecn_alpha_gain, uint, 0644); -+module_param_named(ecn_alpha_init, bbr_ecn_alpha_init, uint, 0644); -+module_param_named(ecn_factor, bbr_ecn_factor, uint, 0644); -+module_param_named(ecn_thresh, bbr_ecn_thresh, uint, 0644); -+module_param_named(ecn_max_rtt_us, bbr_ecn_max_rtt_us, uint, 0644); -+module_param_named(ecn_reprobe_gain, bbr_ecn_reprobe_gain, uint, 0644); -+module_param_named(loss_thresh, bbr_loss_thresh, uint, 0664); -+module_param_named(full_loss_cnt, bbr_full_loss_cnt, uint, 0664); -+module_param_named(full_ecn_cnt, bbr_full_ecn_cnt, uint, 0664); -+module_param_named(inflight_headroom, bbr_inflight_headroom, uint, 0664); -+module_param_named(bw_probe_pif_gain, bbr_bw_probe_pif_gain, uint, 0664); -+module_param_named(bw_probe_reno_gain, bbr_bw_probe_reno_gain, uint, 0664); -+module_param_named(bw_probe_max_rounds, bbr_bw_probe_max_rounds, uint, 0664); -+module_param_named(bw_probe_rand_rounds, bbr_bw_probe_rand_rounds, uint, 0664); -+module_param_named(bw_probe_base_us, bbr_bw_probe_base_us, uint, 0664); -+module_param_named(bw_probe_rand_us, bbr_bw_probe_rand_us, uint, 0664); -+module_param_named(undo, bbr_undo, bool, 0664); -+module_param_named(fast_path, bbr_fast_path, bool, 0664); -+module_param_named(fast_ack_mode, bbr_fast_ack_mode, uint, 0664); -+module_param_named(refill_add_inc, bbr_refill_add_inc, uint, 0664); -+ -+static void bbr2_init(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr_init(sk); /* run shared init code for v1 and v2 */ -+ -+ /* BBR v2 parameters: */ -+ bbr->params.beta = min_t(u32, 0xFFU, bbr_beta); -+ bbr->params.ecn_alpha_gain = min_t(u32, 0xFFU, bbr_ecn_alpha_gain); -+ bbr->params.ecn_alpha_init = min_t(u32, BBR_UNIT, bbr_ecn_alpha_init); -+ bbr->params.ecn_factor = min_t(u32, 0xFFU, bbr_ecn_factor); -+ bbr->params.ecn_thresh = min_t(u32, 0xFFU, bbr_ecn_thresh); -+ bbr->params.ecn_max_rtt_us = min_t(u32, 0x7ffffU, bbr_ecn_max_rtt_us); -+ bbr->params.ecn_reprobe_gain = min_t(u32, 0x1FF, bbr_ecn_reprobe_gain); -+ bbr->params.loss_thresh = min_t(u32, 0xFFU, bbr_loss_thresh); -+ bbr->params.full_loss_cnt = min_t(u32, 0xFU, bbr_full_loss_cnt); -+ bbr->params.full_ecn_cnt = min_t(u32, 0x3U, bbr_full_ecn_cnt); -+ bbr->params.inflight_headroom = -+ min_t(u32, 0xFFU, bbr_inflight_headroom); -+ bbr->params.bw_probe_pif_gain = -+ min_t(u32, 0x1FFU, bbr_bw_probe_pif_gain); -+ bbr->params.bw_probe_reno_gain = -+ min_t(u32, 0x1FFU, bbr_bw_probe_reno_gain); -+ bbr->params.bw_probe_max_rounds = -+ min_t(u32, 0xFFU, bbr_bw_probe_max_rounds); -+ bbr->params.bw_probe_rand_rounds = -+ min_t(u32, 0xFU, bbr_bw_probe_rand_rounds); -+ bbr->params.bw_probe_base_us = -+ min_t(u32, (1 << 26) - 1, bbr_bw_probe_base_us); -+ bbr->params.bw_probe_rand_us = -+ min_t(u32, (1 << 26) - 1, bbr_bw_probe_rand_us); -+ bbr->params.undo = bbr_undo; -+ bbr->params.fast_path = bbr_fast_path ? 
1 : 0; -+ bbr->params.refill_add_inc = min_t(u32, 0x3U, bbr_refill_add_inc); -+ -+ /* BBR v2 state: */ -+ bbr->initialized = 1; -+ /* Start sampling ECN mark rate after first full flight is ACKed: */ -+ bbr->loss_round_delivered = tp->delivered + 1; -+ bbr->loss_round_start = 0; -+ bbr->undo_bw_lo = 0; -+ bbr->undo_inflight_lo = 0; -+ bbr->undo_inflight_hi = 0; -+ bbr->loss_events_in_round = 0; -+ bbr->startup_ecn_rounds = 0; -+ bbr2_reset_congestion_signals(sk); -+ bbr->bw_lo = ~0U; -+ bbr->bw_hi[0] = 0; -+ bbr->bw_hi[1] = 0; -+ bbr->inflight_lo = ~0U; -+ bbr->inflight_hi = ~0U; -+ bbr->bw_probe_up_cnt = ~0U; -+ bbr->bw_probe_up_acks = 0; -+ bbr->bw_probe_up_rounds = 0; -+ bbr->probe_wait_us = 0; -+ bbr->stopped_risky_probe = 0; -+ bbr->ack_phase = BBR_ACKS_INIT; -+ bbr->rounds_since_probe = 0; -+ bbr->bw_probe_samples = 0; -+ bbr->prev_probe_too_high = 0; -+ bbr->ecn_eligible = 0; -+ bbr->ecn_alpha = bbr->params.ecn_alpha_init; -+ bbr->alpha_last_delivered = 0; -+ bbr->alpha_last_delivered_ce = 0; -+ -+ tp->fast_ack_mode = min_t(u32, 0x2U, bbr_fast_ack_mode); -+} -+ -+/* Core TCP stack informs us that the given skb was just marked lost. */ -+static void bbr2_skb_marked_lost(struct sock *sk, const struct sk_buff *skb) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ struct tcp_skb_cb *scb = TCP_SKB_CB(skb); -+ struct rate_sample rs; -+ -+ /* Capture "current" data over the full round trip of loss, -+ * to have a better chance to see the full capacity of the path. -+ */ -+ if (!bbr->loss_in_round) /* first loss in this round trip? */ -+ bbr->loss_round_delivered = tp->delivered; /* set round trip */ -+ bbr->loss_in_round = 1; -+ bbr->loss_in_cycle = 1; -+ -+ if (!bbr->bw_probe_samples) -+ return; /* not an skb sent while probing for bandwidth */ -+ if (unlikely(!scb->tx.delivered_mstamp)) -+ return; /* skb was SACKed, reneged, marked lost; ignore it */ -+ /* We are probing for bandwidth. Construct a rate sample that -+ * estimates what happened in the flight leading up to this lost skb, -+ * then see if the loss rate went too high, and if so at which packet. -+ */ -+ memset(&rs, 0, sizeof(rs)); -+ rs.tx_in_flight = scb->tx.in_flight; -+ rs.lost = tp->lost - scb->tx.lost; -+ rs.is_app_limited = scb->tx.is_app_limited; -+ if (bbr2_is_inflight_too_high(sk, &rs)) { -+ rs.tx_in_flight = bbr2_inflight_hi_from_lost_skb(sk, &rs, skb); -+ bbr2_handle_inflight_too_high(sk, &rs); -+ } -+} -+ -+/* Revert short-term model if current loss recovery event was spurious. */ -+static u32 bbr2_undo_cwnd(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr->debug.undo = 1; -+ bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ -+ bbr->full_bw_cnt = 0; -+ bbr->loss_in_round = 0; -+ -+ if (!bbr->params.undo) -+ return tp->snd_cwnd; -+ -+ /* Revert to cwnd and other state saved before loss episode. */ -+ bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); -+ bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); -+ bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); -+ return bbr->prior_cwnd; -+} -+ -+/* Entering loss recovery, so save state for when we undo recovery. */ -+static u32 bbr2_ssthresh(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr_save_cwnd(sk); -+ /* For undo, save state that adapts based on loss signal. 
*/ -+ bbr->undo_bw_lo = bbr->bw_lo; -+ bbr->undo_inflight_lo = bbr->inflight_lo; -+ bbr->undo_inflight_hi = bbr->inflight_hi; -+ return tcp_sk(sk)->snd_ssthresh; -+} -+ -+static enum tcp_bbr2_phase bbr2_get_phase(struct bbr *bbr) -+{ -+ switch (bbr->mode) { -+ case BBR_STARTUP: -+ return BBR2_PHASE_STARTUP; -+ case BBR_DRAIN: -+ return BBR2_PHASE_DRAIN; -+ case BBR_PROBE_BW: -+ break; -+ case BBR_PROBE_RTT: -+ return BBR2_PHASE_PROBE_RTT; -+ default: -+ return BBR2_PHASE_INVALID; -+ } -+ switch (bbr->cycle_idx) { -+ case BBR_BW_PROBE_UP: -+ return BBR2_PHASE_PROBE_BW_UP; -+ case BBR_BW_PROBE_DOWN: -+ return BBR2_PHASE_PROBE_BW_DOWN; -+ case BBR_BW_PROBE_CRUISE: -+ return BBR2_PHASE_PROBE_BW_CRUISE; -+ case BBR_BW_PROBE_REFILL: -+ return BBR2_PHASE_PROBE_BW_REFILL; -+ default: -+ return BBR2_PHASE_INVALID; -+ } -+} -+ -+static size_t bbr2_get_info(struct sock *sk, u32 ext, int *attr, -+ union tcp_cc_info *info) -+{ -+ if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || -+ ext & (1 << (INET_DIAG_VEGASINFO - 1))) { -+ struct bbr *bbr = inet_csk_ca(sk); -+ u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); -+ u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); -+ u64 bw_lo = bbr->bw_lo == ~0U ? -+ ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); -+ -+ memset(&info->bbr2, 0, sizeof(info->bbr2)); -+ info->bbr2.bbr_bw_lsb = (u32)bw; -+ info->bbr2.bbr_bw_msb = (u32)(bw >> 32); -+ info->bbr2.bbr_min_rtt = bbr->min_rtt_us; -+ info->bbr2.bbr_pacing_gain = bbr->pacing_gain; -+ info->bbr2.bbr_cwnd_gain = bbr->cwnd_gain; -+ info->bbr2.bbr_bw_hi_lsb = (u32)bw_hi; -+ info->bbr2.bbr_bw_hi_msb = (u32)(bw_hi >> 32); -+ info->bbr2.bbr_bw_lo_lsb = (u32)bw_lo; -+ info->bbr2.bbr_bw_lo_msb = (u32)(bw_lo >> 32); -+ info->bbr2.bbr_mode = bbr->mode; -+ info->bbr2.bbr_phase = (__u8)bbr2_get_phase(bbr); -+ info->bbr2.bbr_version = (__u8)2; -+ info->bbr2.bbr_inflight_lo = bbr->inflight_lo; -+ info->bbr2.bbr_inflight_hi = bbr->inflight_hi; -+ info->bbr2.bbr_extra_acked = bbr_extra_acked(sk); -+ *attr = INET_DIAG_BBRINFO; -+ return sizeof(info->bbr2); -+ } -+ return 0; -+} -+ -+static void bbr2_set_state(struct sock *sk, u8 new_state) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (new_state == TCP_CA_Loss) { -+ struct rate_sample rs = { .losses = 1 }; -+ struct bbr_context ctx = { 0 }; -+ -+ bbr->prev_ca_state = TCP_CA_Loss; -+ bbr->full_bw = 0; -+ if (!bbr2_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { -+ /* bbr_adapt_lower_bounds() needs cwnd before -+ * we suffered an RTO, to update inflight_lo: -+ */ -+ bbr->inflight_lo = -+ max(tp->snd_cwnd, bbr->prior_cwnd); -+ } -+ bbr_debug(sk, 0, &rs, &ctx); -+ } else if (bbr->prev_ca_state == TCP_CA_Loss && -+ new_state != TCP_CA_Loss) { -+ tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd); -+ bbr->try_fast_path = 0; /* bound cwnd using latest model */ -+ } -+} -+ -+static struct tcp_congestion_ops tcp_bbr2_cong_ops __read_mostly = { -+ .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, -+ .name = "bbr2", -+ .owner = THIS_MODULE, -+ .init = bbr2_init, -+ .cong_control = bbr2_main, -+ .sndbuf_expand = bbr_sndbuf_expand, -+ .skb_marked_lost = bbr2_skb_marked_lost, -+ .undo_cwnd = bbr2_undo_cwnd, -+ .cwnd_event = bbr_cwnd_event, -+ .ssthresh = bbr2_ssthresh, -+ .tso_segs = bbr_tso_segs, -+ .get_info = bbr2_get_info, -+ .set_state = bbr2_set_state, -+}; -+ -+static int __init bbr_register(void) -+{ -+ BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE); -+ return tcp_register_congestion_control(&tcp_bbr2_cong_ops); -+} -+ -+static 
void __exit bbr_unregister(void) -+{ -+ tcp_unregister_congestion_control(&tcp_bbr2_cong_ops); -+} -+ -+module_init(bbr_register); -+module_exit(bbr_unregister); -+ -+MODULE_AUTHOR("Van Jacobson <vanj@google.com>"); -+MODULE_AUTHOR("Neal Cardwell <ncardwell@google.com>"); -+MODULE_AUTHOR("Yuchung Cheng <ycheng@google.com>"); -+MODULE_AUTHOR("Soheil Hassas Yeganeh <soheil@google.com>"); -+MODULE_AUTHOR("Priyaranjan Jha <priyarjha@google.com>"); -+MODULE_AUTHOR("Yousuk Seung <ysseung@google.com>"); -+MODULE_AUTHOR("Kevin Yang <yyd@google.com>"); -+MODULE_AUTHOR("Arjun Roy <arjunroy@google.com>"); -+ -+MODULE_LICENSE("Dual BSD/GPL"); -+MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); -diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c -index 563d016e7..1c94abb6f 100644 ---- a/net/ipv4/tcp_cong.c -+++ b/net/ipv4/tcp_cong.c -@@ -179,6 +179,7 @@ void tcp_init_congestion_control(struct sock *sk) - struct inet_connection_sock *icsk = inet_csk(sk); - - tcp_sk(sk)->prior_ssthresh = 0; -+ tcp_sk(sk)->fast_ack_mode = 0; - if (icsk->icsk_ca_ops->init) - icsk->icsk_ca_ops->init(sk); - if (tcp_ca_needs_ecn(sk)) -diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c -index 69a545db8..45aaba87c 100644 ---- a/net/ipv4/tcp_input.c -+++ b/net/ipv4/tcp_input.c -@@ -348,7 +348,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) - tcp_enter_quickack_mode(sk, 2); - break; - case INET_ECN_CE: -- if (tcp_ca_needs_ecn(sk)) -+ if (tcp_ca_wants_ce_events(sk)) - tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); - - if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { -@@ -359,7 +359,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) - tp->ecn_flags |= TCP_ECN_SEEN; - break; - default: -- if (tcp_ca_needs_ecn(sk)) -+ if (tcp_ca_wants_ce_events(sk)) - tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); - tp->ecn_flags |= TCP_ECN_SEEN; - break; -@@ -1039,7 +1039,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) - */ - static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) - { -+ struct sock *sk = (struct sock *)tp; -+ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; -+ - tp->lost += tcp_skb_pcount(skb); -+ if (ca_ops->skb_marked_lost) -+ ca_ops->skb_marked_lost(sk, skb); - } - - void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) -@@ -1420,6 +1425,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, - WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); - tcp_skb_pcount_add(skb, -pcount); - -+ /* Adjust tx.in_flight as pcount is shifted from skb to prev. 
*/ -+ if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount, -+ "prev in_flight: %u skb in_flight: %u pcount: %u", -+ TCP_SKB_CB(prev)->tx.in_flight, -+ TCP_SKB_CB(skb)->tx.in_flight, -+ pcount)) -+ TCP_SKB_CB(skb)->tx.in_flight = 0; -+ else -+ TCP_SKB_CB(skb)->tx.in_flight -= pcount; -+ TCP_SKB_CB(prev)->tx.in_flight += pcount; -+ - /* When we're adding to gso_segs == 1, gso_size will be zero, - * in theory this shouldn't be necessary but as long as DSACK - * code can come after this skb later on it's better to keep -@@ -3182,7 +3198,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, - long seq_rtt_us = -1L; - long ca_rtt_us = -1L; - u32 pkts_acked = 0; -- u32 last_in_flight = 0; - bool rtt_update; - int flag = 0; - -@@ -3218,7 +3233,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, - if (!first_ackt) - first_ackt = last_ackt; - -- last_in_flight = TCP_SKB_CB(skb)->tx.in_flight; - if (before(start_seq, reord)) - reord = start_seq; - if (!after(scb->end_seq, tp->high_seq)) -@@ -3284,8 +3298,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, - seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt); - ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt); - -- if (pkts_acked == 1 && last_in_flight < tp->mss_cache && -- last_in_flight && !prior_sacked && fully_acked && -+ if (pkts_acked == 1 && fully_acked && !prior_sacked && -+ (tp->snd_una - prior_snd_una) < tp->mss_cache && - sack->rate->prior_delivered + 1 == tp->delivered && - !(flag & (FLAG_CA_ALERT | FLAG_SYN_ACKED))) { - /* Conservatively mark a delayed ACK. It's typically -@@ -3342,9 +3356,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, - - if (icsk->icsk_ca_ops->pkts_acked) { - struct ack_sample sample = { .pkts_acked = pkts_acked, -- .rtt_us = sack->rate->rtt_us, -- .in_flight = last_in_flight }; -+ .rtt_us = sack->rate->rtt_us }; - -+ sample.in_flight = tp->mss_cache * -+ (tp->delivered - sack->rate->prior_delivered); - icsk->icsk_ca_ops->pkts_acked(sk, &sample); - } - -@@ -3742,6 +3757,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) - - prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; - rs.prior_in_flight = tcp_packets_in_flight(tp); -+ tcp_rate_check_app_limited(sk); - - /* ts_recent update must be made after we are sure that the packet - * is in window. -@@ -3839,6 +3855,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) - delivered = tcp_newly_delivered(sk, delivered, flag); - lost = tp->lost - lost; /* freshly marked lost */ - rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); -+ rs.is_ece = !!(flag & FLAG_ECE); - tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); - tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); - tcp_xmit_recovery(sk, rexmit); -@@ -5399,13 +5416,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) - - /* More than one full frame received... */ - if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && -+ (tp->fast_ack_mode == 1 || - /* ... and right edge of window advances far enough. - * (tcp_recvmsg() will send ACK otherwise). - * If application uses SO_RCVLOWAT, we want send ack now if - * we have not received enough bytes to satisfy the condition. 
- */ -- (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || -- __tcp_select_window(sk) >= tp->rcv_wnd)) || -+ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || -+ __tcp_select_window(sk) >= tp->rcv_wnd))) || - /* We ACK each frame or... */ - tcp_in_quickack_mode(sk) || - /* Protocol state mandates a one-time immediate ACK */ -diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c -index fbf140a77..90d939375 100644 ---- a/net/ipv4/tcp_output.c -+++ b/net/ipv4/tcp_output.c -@@ -1256,8 +1256,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, - tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); - skb->skb_mstamp_ns = tp->tcp_wstamp_ns; - if (clone_it) { -- TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq -- - tp->snd_una; - oskb = skb; - - tcp_skb_tsorted_save(oskb) { -@@ -1536,7 +1534,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, - { - struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *buff; -- int nsize, old_factor; -+ int nsize, old_factor, inflight_prev; - long limit; - int nlen; - u8 flags; -@@ -1615,6 +1613,15 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, - - if (diff) - tcp_adjust_pcount(sk, skb, diff); -+ -+ /* Set buff tx.in_flight as if buff were sent by itself. */ -+ inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor; -+ if (WARN_ONCE(inflight_prev < 0, -+ "inconsistent: tx.in_flight: %u old_factor: %d", -+ TCP_SKB_CB(skb)->tx.in_flight, old_factor)) -+ inflight_prev = 0; -+ TCP_SKB_CB(buff)->tx.in_flight = inflight_prev + -+ tcp_skb_pcount(buff); - } - - /* Link BUFF into the send queue. */ -@@ -1982,13 +1989,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, - static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) - { - const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; -- u32 min_tso, tso_segs; -- -- min_tso = ca_ops->min_tso_segs ? -- ca_ops->min_tso_segs(sk) : -- sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs; -+ u32 tso_segs; - -- tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); -+ tso_segs = ca_ops->tso_segs ? -+ ca_ops->tso_segs(sk, mss_now) : -+ tcp_tso_autosize(sk, mss_now, -+ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); - return min_t(u32, tso_segs, sk->sk_gso_max_segs); - } - -@@ -2628,6 +2634,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, - skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache; - list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); - tcp_init_tso_segs(skb, mss_now); -+ tcp_set_tx_in_flight(sk, skb); - goto repair; /* Skip network transmission */ - } - -diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c -index 0de693565..796fa6e53 100644 ---- a/net/ipv4/tcp_rate.c -+++ b/net/ipv4/tcp_rate.c -@@ -34,6 +34,24 @@ - * ready to send in the write queue. - */ - -+void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ u32 in_flight; -+ -+ /* Check, sanitize, and record packets in flight after skb was sent. 
*/ -+ in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); -+ if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, -+ "insane in_flight %u cc %s mss %u " -+ "cwnd %u pif %u %u %u %u\n", -+ in_flight, inet_csk(sk)->icsk_ca_ops->name, -+ tp->mss_cache, tp->snd_cwnd, -+ tp->packets_out, tp->retrans_out, -+ tp->sacked_out, tp->lost_out)) -+ in_flight = TCPCB_IN_FLIGHT_MAX; -+ TCP_SKB_CB(skb)->tx.in_flight = in_flight; -+} -+ - /* Snapshot the current delivery information in the skb, to generate - * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). - */ -@@ -65,7 +83,10 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) - TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp; - TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; - TCP_SKB_CB(skb)->tx.delivered = tp->delivered; -+ TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; -+ TCP_SKB_CB(skb)->tx.lost = tp->lost; - TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0; -+ tcp_set_tx_in_flight(sk, skb); - } - - /* When an skb is sacked or acked, we fill in the rate sample with the (prior) -@@ -86,16 +107,20 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, - - if (!rs->prior_delivered || - after(scb->tx.delivered, rs->prior_delivered)) { -+ rs->prior_lost = scb->tx.lost; -+ rs->prior_delivered_ce = scb->tx.delivered_ce; - rs->prior_delivered = scb->tx.delivered; - rs->prior_mstamp = scb->tx.delivered_mstamp; - rs->is_app_limited = scb->tx.is_app_limited; - rs->is_retrans = scb->sacked & TCPCB_RETRANS; -+ rs->tx_in_flight = scb->tx.in_flight; - - /* Record send time of most recently ACKed packet: */ - tp->first_tx_mstamp = tcp_skb_timestamp_us(skb); - /* Find the duration of the "send phase" of this window: */ -- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, -- scb->tx.first_tx_mstamp); -+ rs->interval_us = tcp_stamp32_us_delta( -+ tp->first_tx_mstamp, -+ scb->tx.first_tx_mstamp); - - } - /* Mark off the skb delivered once it's sacked to avoid being -@@ -137,6 +162,11 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, - return; - } - rs->delivered = tp->delivered - rs->prior_delivered; -+ rs->lost = tp->lost - rs->prior_lost; -+ -+ rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; -+ /* delivered_ce occupies less than 32 bits in the skb control block */ -+ rs->delivered_ce &= TCPCB_DELIVERED_CE_MASK; - - /* Model sending data and receiving ACKs as separate pipeline phases - * for a window. Usually the ACK phase is longer, but with ACK -@@ -144,7 +174,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, - * longer phase. 
- */ - snd_us = rs->interval_us; /* send phase */ -- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, -+ ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp, - rs->prior_mstamp); /* ack phase */ - rs->interval_us = max(snd_us, ack_us); - -diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c -index 4ef08079c..b5b24caa8 100644 ---- a/net/ipv4/tcp_timer.c -+++ b/net/ipv4/tcp_timer.c -@@ -607,6 +607,7 @@ void tcp_write_timer_handler(struct sock *sk) - goto out; - } - -+ tcp_rate_check_app_limited(sk); - tcp_mstamp_refresh(tcp_sk(sk)); - event = icsk->icsk_pending; - --- -2.31.1.305.gd1b10fc6d8 - diff --git a/0010-btrfs.patch b/0010-btrfs.patch deleted file mode 100644 index 457e2445824d..000000000000 --- a/0010-btrfs.patch +++ /dev/null @@ -1,2157 +0,0 @@ -From dfe89528bf8d093c1df80ea3fea2a50d3dc4a302 Mon Sep 17 00:00:00 2001 -From: Josef Bacik <josef@toxicpanda.com> -Date: Mon, 5 Aug 2019 14:31:53 -0400 -Subject: [PATCH 01/22] btrfs: add a force_chunk_alloc to space_info's sysfs - -In testing various things such as the btrfsck patch to detect over -allocation of chunks, empty block group deletion, and balance I've had -various ways to force chunk allocations for debug purposes. Add a sysfs -file to enable forcing of chunk allocation for the owning space info in -order to enable us to add testcases in the future to test these various -features easier. - -[HH: rebased for 5.4] -Signed-off-by: Josef Bacik <josef@toxicpanda.com> ---- - fs/btrfs/sysfs.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 64 insertions(+) - -diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c -index 6eb1c50fa..9372ef191 100644 ---- a/fs/btrfs/sysfs.c -+++ b/fs/btrfs/sysfs.c -@@ -72,6 +72,7 @@ static struct btrfs_feature_attr btrfs_attr_features_##_name = { \ - - static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); - static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj); -+static inline struct kobject *get_btrfs_kobj(struct kobject *kobj); - - static struct btrfs_feature_attr *to_btrfs_feature_attr(struct kobj_attribute *a) - { -@@ -640,6 +641,58 @@ static struct kobj_type btrfs_raid_ktype = { - .default_groups = raid_groups, - }; - -+static ssize_t btrfs_space_info_force_chunk_alloc_show(struct kobject *kobj, -+ struct kobj_attribute *a, -+ char *buf) -+{ -+ return snprintf(buf, PAGE_SIZE, "0\n"); -+} -+ -+static ssize_t btrfs_space_info_force_chunk_alloc(struct kobject *kobj, -+ struct kobj_attribute *a, -+ const char *buf, size_t len) -+{ -+ struct btrfs_space_info *space_info = to_space_info(kobj); -+ struct btrfs_fs_info *fs_info = to_fs_info(get_btrfs_kobj(kobj)); -+ struct btrfs_trans_handle *trans; -+ unsigned long val; -+ int ret; -+ -+ if (!fs_info) { -+ printk(KERN_ERR "couldn't get fs_info\n"); -+ return -EPERM; -+ } -+ -+ if (!capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ if (sb_rdonly(fs_info->sb)) -+ return -EROFS; -+ -+ ret = kstrtoul(buf, 10, &val); -+ if (ret) -+ return ret; -+ -+ /* -+ * We don't really care, but if we echo 0 > force it seems silly to do -+ * anything. 
-+	 */
-+	if (val == 0)
-+		return -EINVAL;
-+
-+	trans = btrfs_start_transaction(fs_info->extent_root, 0);
-+	if (IS_ERR(trans))
-+		return PTR_ERR(trans);
-+	ret = btrfs_force_chunk_alloc(trans, space_info->flags);
-+	btrfs_end_transaction(trans);
-+	if (ret == 1)
-+		return len;
-+	return -ENOSPC;
-+}
-+BTRFS_ATTR_RW(space_info, force_chunk_alloc,
-+	      btrfs_space_info_force_chunk_alloc_show,
-+	      btrfs_space_info_force_chunk_alloc);
-+
- #define SPACE_INFO_ATTR(field)					\
- static ssize_t btrfs_space_info_show_##field(struct kobject *kobj,	\
- 					     struct kobj_attribute *a,	\
-@@ -684,6 +737,7 @@ static struct attribute *space_info_attrs[] = {
- 	BTRFS_ATTR_PTR(space_info, disk_used),
- 	BTRFS_ATTR_PTR(space_info, disk_total),
- 	BTRFS_ATTR_PTR(space_info, total_bytes_pinned),
-+	BTRFS_ATTR_PTR(space_info, force_chunk_alloc),
- 	NULL,
- };
- ATTRIBUTE_GROUPS(space_info);
-@@ -1006,6 +1060,16 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
- 	return to_fs_devs(kobj)->fs_info;
- }
- 
-+static inline struct kobject *get_btrfs_kobj(struct kobject *kobj)
-+{
-+	while (kobj) {
-+		if (kobj->ktype == &btrfs_ktype)
-+			return kobj;
-+		kobj = kobj->parent;
-+	}
-+	return NULL;
-+}
-+
- #define NUM_FEATURE_BITS 64
- #define BTRFS_FEATURE_NAME_MAX 13
- static char btrfs_unknown_feature_names[FEAT_MAX][NUM_FEATURE_BITS][BTRFS_FEATURE_NAME_MAX];
---
-2.32.0
-
-
-From e104f0dda22a999ddd5f0be76ffc62637b411a3f Mon Sep 17 00:00:00 2001
-From: Josef Bacik <josef@toxicpanda.com>
-Date: Fri, 24 Jul 2020 12:41:47 -0400
-Subject: [PATCH 02/22] btrfs: do not evaluate the expression with
- !CONFIG_BTRFS_ASSERT
-
-While investigating a performance issue I noticed that turning off
-CONFIG_BTRFS_ASSERT had no effect in what I was seeing in perf,
-specifically check_setget_bounds() was around 5% for this workload.
-Upon investigation I realized that I made a mistake when I added
-ASSERT(), I would still evaluate the expression, but simply ignore the
-result.
-
-This is useless, and has a marked impact on performance. This
-microbenchmark is the watered down version of an application that is
-experiencing performance issues, and does renames and creates over and
-over again. Doing these operations 200k times without this patch takes
-13 seconds on my machine. With this patch it takes 7 seconds.
-
-[HH: removed the second hunk for 5.7.x]
-Signed-off-by: Josef Bacik <josef@toxicpanda.com>
-Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
----
- fs/btrfs/ctree.h | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
-index 29ef96903..12921830e 100644
---- a/fs/btrfs/ctree.h
-+++ b/fs/btrfs/ctree.h
-@@ -3402,7 +3402,7 @@ static inline void assertfail(const char *expr, const char *file, int line)
- 
- #else
- static inline void assertfail(const char *expr, const char* file, int line) { }
--#define ASSERT(expr)	(void)(expr)
-+#define ASSERT(expr)	((void)0)
- #endif
- 
- /*
---
-2.32.0
-
-
-From a8c8b6d8a9763fe25f616567c8005318d3cbd948 Mon Sep 17 00:00:00 2001
-From: Josef Bacik <josef@toxicpanda.com>
-Date: Fri, 20 Mar 2020 14:34:36 -0400
-Subject: [PATCH 03/22] btrfs: restart snapshot delete if we have to end the
- transaction
-
-This is to fully fix the deadlock described in
-
-btrfs: do not resolve backrefs for roots that are being deleted
-
-Holding write locks on our deleted snapshot across trans handles will
-just lead to sadness, and our backref lookup code is going to want to
-still process dropped snapshots for things like qgroup accounting.
- -Fix this by simply dropping our path before we restart our transaction, -and picking back up from our drop_progress key. This is less efficient -obviously, but it also doesn't deadlock, so it feels like a reasonable -trade off. - -Signed-off-by: Josef Bacik <josef@toxicpanda.com> ---- - fs/btrfs/extent-tree.c | 16 ++++++++++++++++ - 1 file changed, 16 insertions(+) - -diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c -index 27c368007..1aff5769c 100644 ---- a/fs/btrfs/extent-tree.c -+++ b/fs/btrfs/extent-tree.c -@@ -5563,6 +5563,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) - * already dropped. - */ - set_bit(BTRFS_ROOT_DELETING, &root->state); -+again: - if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { - level = btrfs_header_level(root->node); - path->nodes[level] = btrfs_lock_root_node(root); -@@ -5574,7 +5575,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) - btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); - memcpy(&wc->update_progress, &key, - sizeof(wc->update_progress)); -+ memcpy(&wc->drop_progress, &key, sizeof(key)); - -+ wc->drop_level = btrfs_root_drop_level(root_item); - level = btrfs_root_drop_level(root_item); - BUG_ON(level == 0); - path->lowest_level = level; -@@ -5666,6 +5669,18 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) - goto out_end_trans; - } - -+ /* -+ * We used to keep the path open until we completed the -+ * snapshot delete. However this can deadlock with -+ * things like backref walking that may want to resolve -+ * references that still point to this deleted root. We -+ * already have the ability to restart snapshot -+ * deletions on mount, so just clear our walk_control, -+ * drop the path, and go to the beginning and re-lookup -+ * our drop_progress key and continue from there. -+ */ -+ memset(wc, 0, sizeof(*wc)); -+ btrfs_release_path(path); - btrfs_end_transaction_throttle(trans); - if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) { - btrfs_debug(fs_info, -@@ -5687,6 +5702,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) - err = PTR_ERR(trans); - goto out_free; - } -+ goto again; - } - } - btrfs_release_path(path); --- -2.32.0 - - -From f241ea708d4c4da7800436d3c74d0cd7836c75e8 Mon Sep 17 00:00:00 2001 -From: Josef Bacik <josef@toxicpanda.com> -Date: Wed, 24 Mar 2021 09:44:21 -0400 -Subject: [PATCH 04/22] btrfs: use percpu_read_positive instead of sum_positive - for need_preempt - -Looking at perf data for a fio workload I noticed that we were spending -a pretty large chunk of time (around 5%) doing percpu_counter_sum() in -need_preemptive_reclaim. This is silly, as we only want to know if we -have more ordered than delalloc to see if we should be counting the -delayed items in our threshold calculation. Change this to -percpu_read_positive() to avoid the overhead. - -I ran this through fsperf to validate the changes, obviously the latency -numbers in dbench and fio are quite jittery, so take them as you wish, -but overall the improvements on throughput, iops, and bw are all -positive. Each test was run two times, the given value is the average -of both runs for their respective column. 
- -btrfs ssd normal test results - -bufferedrandwrite16g results - metric baseline current diff -========================================================== -write_io_kbytes 16777216 16777216 0.00% -read_clat_ns_p99 0 0 0.00% -write_bw_bytes 1.04e+08 1.05e+08 1.12% -read_iops 0 0 0.00% -write_clat_ns_p50 13888 11840 -14.75% -read_io_kbytes 0 0 0.00% -read_io_bytes 0 0 0.00% -write_clat_ns_p99 35008 29312 -16.27% -read_bw_bytes 0 0 0.00% -elapsed 170 167 -1.76% -write_lat_ns_min 4221.50 3762.50 -10.87% -sys_cpu 39.65 35.37 -10.79% -write_lat_ns_max 2.67e+10 2.50e+10 -6.63% -read_lat_ns_min 0 0 0.00% -write_iops 25270.10 25553.43 1.12% -read_lat_ns_max 0 0 0.00% -read_clat_ns_p50 0 0 0.00% - -dbench60 results - metric baseline current diff -================================================== -qpathinfo 11.12 12.73 14.52% -throughput 416.09 445.66 7.11% -flush 3485.63 1887.55 -45.85% -qfileinfo 0.70 1.92 173.86% -ntcreatex 992.60 695.76 -29.91% -qfsinfo 2.43 3.71 52.48% -close 1.67 3.14 88.09% -sfileinfo 66.54 105.20 58.10% -rename 809.23 619.59 -23.43% -find 16.88 15.46 -8.41% -unlink 820.54 670.86 -18.24% -writex 3375.20 2637.91 -21.84% -deltree 386.33 449.98 16.48% -readx 3.43 3.41 -0.60% -mkdir 0.05 0.03 -38.46% -lockx 0.26 0.26 -0.76% -unlockx 0.81 0.32 -60.33% - -dio4kbs16threads results - metric baseline current diff -================================================================ -write_io_kbytes 5249676 3357150 -36.05% -read_clat_ns_p99 0 0 0.00% -write_bw_bytes 89583501.50 57291192.50 -36.05% -read_iops 0 0 0.00% -write_clat_ns_p50 242688 263680 8.65% -read_io_kbytes 0 0 0.00% -read_io_bytes 0 0 0.00% -write_clat_ns_p99 15826944 36732928 132.09% -read_bw_bytes 0 0 0.00% -elapsed 61 61 0.00% -write_lat_ns_min 42704 42095 -1.43% -sys_cpu 5.27 3.45 -34.52% -write_lat_ns_max 7.43e+08 9.27e+08 24.71% -read_lat_ns_min 0 0 0.00% -write_iops 21870.97 13987.11 -36.05% -read_lat_ns_max 0 0 0.00% -read_clat_ns_p50 0 0 0.00% - -randwrite2xram results - metric baseline current diff -================================================================ -write_io_kbytes 24831972 28876262 16.29% -read_clat_ns_p99 0 0 0.00% -write_bw_bytes 83745273.50 92182192.50 10.07% -read_iops 0 0 0.00% -write_clat_ns_p50 13952 11648 -16.51% -read_io_kbytes 0 0 0.00% -read_io_bytes 0 0 0.00% -write_clat_ns_p99 50176 52992 5.61% -read_bw_bytes 0 0 0.00% -elapsed 314 332 5.73% -write_lat_ns_min 5920.50 5127 -13.40% -sys_cpu 7.82 7.35 -6.07% -write_lat_ns_max 5.27e+10 3.88e+10 -26.44% -read_lat_ns_min 0 0 0.00% -write_iops 20445.62 22505.42 10.07% -read_lat_ns_max 0 0 0.00% -read_clat_ns_p50 0 0 0.00% - -untarfirefox results -metric baseline current diff -============================================== -elapsed 47.41 47.40 -0.03% - -Signed-off-by: Josef Bacik <josef@toxicpanda.com> ---- - fs/btrfs/space-info.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c -index 2da6177f4..2dc674b7c 100644 ---- a/fs/btrfs/space-info.c -+++ b/fs/btrfs/space-info.c -@@ -861,8 +861,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, - * of heavy DIO or ordered reservations, preemptive flushing will just - * waste time and cause us to slow down. 
- */ -- ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes); -- delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes); -+ ordered = percpu_counter_read_positive(&fs_info->ordered_bytes); -+ delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes); - if (ordered >= delalloc) - used += fs_info->delayed_refs_rsv.reserved + - fs_info->delayed_block_rsv.reserved; --- -2.32.0 - - -From aafa62450e938fae02462b7d47e055eb6307c57d Mon Sep 17 00:00:00 2001 -From: Filipe Manana <fdmanana@suse.com> -Date: Mon, 1 Mar 2021 09:26:42 +0000 -Subject: [PATCH 05/22] btrfs: add btree read ahead for full send operations - -When doing a full send we know that we are going to be reading every node -and leaf of the send root, so we benefit from enabling read ahead for the -btree. - -This change enables read ahead for full send operations only, incremental -sends will have read ahead enabled in a different way by a separate patch. - -The following test script was used to measure the improvement on a box -using an average, consumer grade, spinning disk and with 16Gb of ram: - - $ cat test.sh - #!/bin/bash - - DEV=/dev/sdj - MNT=/mnt/sdj - MKFS_OPTIONS="--nodesize 16384" # default, just to be explicit - MOUNT_OPTIONS="-o max_inline=2048" # default, just to be explicit - - mkfs.btrfs -f $MKFS_OPTIONS $DEV > /dev/null - mount $MOUNT_OPTIONS $DEV $MNT - - # Create files with inline data to make it easier and faster to create - # large btrees. - add_files() - { - local total=$1 - local start_offset=$2 - local number_jobs=$3 - local total_per_job=$(($total / $number_jobs)) - - echo "Creating $total new files using $number_jobs jobs" - for ((n = 0; n < $number_jobs; n++)); do - ( - local start_num=$(($start_offset + $n * $total_per_job)) - for ((i = 1; i <= $total_per_job; i++)); do - local file_num=$((start_num + $i)) - local file_path="$MNT/file_${file_num}" - xfs_io -f -c "pwrite -S 0xab 0 2000" $file_path > /dev/null - if [ $? -ne 0 ]; then - echo "Failed creating file $file_path" - break - fi - done - ) & - worker_pids[$n]=$! - done - - wait ${worker_pids[@]} - - sync - echo - echo "btree node/leaf count: $(btrfs inspect-internal dump-tree -t 5 $DEV | egrep '^(node|leaf) ' | wc -l)" - } - - initial_file_count=500000 - add_files $initial_file_count 0 4 - - echo - echo "Creating first snapshot..." - btrfs subvolume snapshot -r $MNT $MNT/snap1 - - echo - echo "Adding more files..." - add_files $((initial_file_count / 4)) $initial_file_count 4 - - echo - echo "Updating 1/50th of the initial files..." - for ((i = 1; i < $initial_file_count; i += 50)); do - xfs_io -c "pwrite -S 0xcd 0 20" $MNT/file_$i > /dev/null - done - - echo - echo "Creating second snapshot..." - btrfs subvolume snapshot -r $MNT $MNT/snap2 - - umount $MNT - - echo 3 > /proc/sys/vm/drop_caches - blockdev --flushbufs $DEV &> /dev/null - hdparm -F $DEV &> /dev/null - - mount $MOUNT_OPTIONS $DEV $MNT - - echo - echo "Testing full send..." - start=$(date +%s) - btrfs send $MNT/snap1 > /dev/null - end=$(date +%s) - echo - echo "Full send took $((end - start)) seconds" - - umount $MNT - - echo 3 > /proc/sys/vm/drop_caches - blockdev --flushbufs $DEV &> /dev/null - hdparm -F $DEV &> /dev/null - - mount $MOUNT_OPTIONS $DEV $MNT - - echo - echo "Testing incremental send..." 
- start=$(date +%s)
- btrfs send -p $MNT/snap1 $MNT/snap2 > /dev/null
- end=$(date +%s)
- echo
- echo "Incremental send took $((end - start)) seconds"
-
- umount $MNT
-
-Before this change, full send duration:
-
-with $initial_file_count == 200000: 165 seconds
-with $initial_file_count == 500000: 407 seconds
-
-After this change, full send duration:
-
-with $initial_file_count == 200000: 149 seconds (-10.2%)
-with $initial_file_count == 500000: 353 seconds (-14.2%)
-
-For $initial_file_count == 200000 there are 62600 nodes and leaves in the
-btree of the first snapshot, while for $initial_file_count == 500000 there
-are 152476 nodes and leaves. The roots were at level 2.
-
-Signed-off-by: Filipe Manana <fdmanana@suse.com>
----
- fs/btrfs/send.c | 1 +
- 1 file changed, 1 insertion(+)
-
-diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
-index 8ae8f1732..9817da145 100644
---- a/fs/btrfs/send.c
-+++ b/fs/btrfs/send.c
-@@ -6650,6 +6650,7 @@ static int full_send_tree(struct send_ctx *sctx)
- 	path = alloc_path_for_send();
- 	if (!path)
- 		return -ENOMEM;
-+	path->reada = READA_FORWARD;
- 
- 	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
- 	key.type = BTRFS_INODE_ITEM_KEY;
---
-2.32.0
-
-
-From a1df61cf5c2efa2298869acced2eb5f6a51e27ed Mon Sep 17 00:00:00 2001
-From: Filipe Manana <fdmanana@suse.com>
-Date: Mon, 1 Mar 2021 09:26:43 +0000
-Subject: [PATCH 06/22] btrfs: add btree read ahead for incremental send
- operations
-
-Currently we do not do btree read ahead when doing an incremental send,
-however we know that we will read and process any node or leaf in the
-send root that has a generation greater than the generation of the parent
-root. So triggering read ahead for such nodes and leaves is beneficial
-for an incremental send.
-
-This change does that, triggers read ahead of any node or leaf in the
-send root that has a generation greater than the generation of the
-parent root. As for the parent root, no readahead is triggered because
-knowing in advance which nodes/leaves are going to be read is not so
-linear and there's often a large time window between visiting nodes or
-leaves of the parent root. So I opted to leave out the parent root,
-and triggering read ahead for its nodes/leaves seemed to have not made
-a significant difference.
-
-The following test script was used to measure the improvement on a box
-using an average, consumer grade, spinning disk and with 16Gb of ram:
-
- $ cat test.sh
- #!/bin/bash
-
- DEV=/dev/sdj
- MNT=/mnt/sdj
- MKFS_OPTIONS="--nodesize 16384" # default, just to be explicit
- MOUNT_OPTIONS="-o max_inline=2048" # default, just to be explicit
-
- mkfs.btrfs -f $MKFS_OPTIONS $DEV > /dev/null
- mount $MOUNT_OPTIONS $DEV $MNT
-
- # Create files with inline data to make it easier and faster to create
- # large btrees.
- add_files()
- {
-     local total=$1
-     local start_offset=$2
-     local number_jobs=$3
-     local total_per_job=$(($total / $number_jobs))
-
-     echo "Creating $total new files using $number_jobs jobs"
-     for ((n = 0; n < $number_jobs; n++)); do
-         (
-             local start_num=$(($start_offset + $n * $total_per_job))
-             for ((i = 1; i <= $total_per_job; i++)); do
-                 local file_num=$((start_num + $i))
-                 local file_path="$MNT/file_${file_num}"
-                 xfs_io -f -c "pwrite -S 0xab 0 2000" $file_path > /dev/null
-                 if [ $? -ne 0 ]; then
-                     echo "Failed creating file $file_path"
-                     break
-                 fi
-             done
-         ) &
-         worker_pids[$n]=$!
- done - - wait ${worker_pids[@]} - - sync - echo - echo "btree node/leaf count: $(btrfs inspect-internal dump-tree -t 5 $DEV | egrep '^(node|leaf) ' | wc -l)" - } - - initial_file_count=500000 - add_files $initial_file_count 0 4 - - echo - echo "Creating first snapshot..." - btrfs subvolume snapshot -r $MNT $MNT/snap1 - - echo - echo "Adding more files..." - add_files $((initial_file_count / 4)) $initial_file_count 4 - - echo - echo "Updating 1/50th of the initial files..." - for ((i = 1; i < $initial_file_count; i += 50)); do - xfs_io -c "pwrite -S 0xcd 0 20" $MNT/file_$i > /dev/null - done - - echo - echo "Creating second snapshot..." - btrfs subvolume snapshot -r $MNT $MNT/snap2 - - umount $MNT - - echo 3 > /proc/sys/vm/drop_caches - blockdev --flushbufs $DEV &> /dev/null - hdparm -F $DEV &> /dev/null - - mount $MOUNT_OPTIONS $DEV $MNT - - echo - echo "Testing full send..." - start=$(date +%s) - btrfs send $MNT/snap1 > /dev/null - end=$(date +%s) - echo - echo "Full send took $((end - start)) seconds" - - umount $MNT - - echo 3 > /proc/sys/vm/drop_caches - blockdev --flushbufs $DEV &> /dev/null - hdparm -F $DEV &> /dev/null - - mount $MOUNT_OPTIONS $DEV $MNT - - echo - echo "Testing incremental send..." - start=$(date +%s) - btrfs send -p $MNT/snap1 $MNT/snap2 > /dev/null - end=$(date +%s) - echo - echo "Incremental send took $((end - start)) seconds" - - umount $MNT - -Before this change, incremental send duration: - -with $initial_file_count == 200000: 51 seconds -with $initial_file_count == 500000: 168 seconds - -After this change, incremental send duration: - -with $initial_file_count == 200000: 39 seconds (-26.7%) -with $initial_file_count == 500000: 125 seconds (-29.4%) - -For $initial_file_count == 200000 there are 62600 nodes and leaves in the -btree of the first snapshot, and 77759 nodes and leaves in the btree of -the second snapshot. The root nodes were at level 2. - -While for $initial_file_count == 500000 there are 152476 nodes and leaves -in the btree of the first snapshot, and 190511 nodes and leaves in the -btree of the second snapshot. The root nodes were at level 2 as well. - -Signed-off-by: Filipe Manana <fdmanana@suse.com> ---- - fs/btrfs/send.c | 42 ++++++++++++++++++++++++++++++++++++------ - 1 file changed, 36 insertions(+), 6 deletions(-) - -diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c -index 9817da145..ed1310e38 100644 ---- a/fs/btrfs/send.c -+++ b/fs/btrfs/send.c -@@ -6689,15 +6689,35 @@ static int full_send_tree(struct send_ctx *sctx) - return ret; - } - --static int tree_move_down(struct btrfs_path *path, int *level) -+static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen) - { - struct extent_buffer *eb; -+ struct extent_buffer *parent = path->nodes[*level]; -+ int slot = path->slots[*level]; -+ const int nritems = btrfs_header_nritems(parent); -+ u64 reada_max; -+ u64 reada_done = 0; - - BUG_ON(*level == 0); -- eb = btrfs_read_node_slot(path->nodes[*level], path->slots[*level]); -+ eb = btrfs_read_node_slot(parent, slot); - if (IS_ERR(eb)) - return PTR_ERR(eb); - -+ /* -+ * Trigger readahead for the next leaves we will process, so that it is -+ * very likely that when we need them they are already in memory and we -+ * will not block on disk IO. For nodes we only do readahead for one, -+ * since the time window between processing nodes is typically larger. -+ */ -+ reada_max = *level == 1 ? 
SZ_128K : eb->fs_info->nodesize; -+ -+ for (slot++; slot < nritems && reada_done < reada_max; slot++) { -+ if (btrfs_node_ptr_generation(parent, slot) > reada_min_gen) { -+ btrfs_readahead_node_child(parent, slot); -+ reada_done += eb->fs_info->nodesize; -+ } -+ } -+ - path->nodes[*level - 1] = eb; - path->slots[*level - 1] = 0; - (*level)--; -@@ -6737,14 +6757,15 @@ static int tree_move_next_or_upnext(struct btrfs_path *path, - static int tree_advance(struct btrfs_path *path, - int *level, int root_level, - int allow_down, -- struct btrfs_key *key) -+ struct btrfs_key *key, -+ u64 reada_min_gen) - { - int ret; - - if (*level == 0 || !allow_down) { - ret = tree_move_next_or_upnext(path, level, root_level); - } else { -- ret = tree_move_down(path, level); -+ ret = tree_move_down(path, level, reada_min_gen); - } - if (ret >= 0) { - if (*level == 0) -@@ -6818,6 +6839,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, - u64 right_blockptr; - u64 left_gen; - u64 right_gen; -+ u64 reada_min_gen; - - left_path = btrfs_alloc_path(); - if (!left_path) { -@@ -6897,6 +6919,14 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, - ret = -ENOMEM; - goto out; - } -+ /* -+ * Our right root is the parent root, while the left root is the "send" -+ * root. We know that all new nodes/leaves in the left root must have -+ * a generation greater than the right root's generation, so we trigger -+ * readahead for those nodes and leaves of the left root, as we know we -+ * will need to read them at some point. -+ */ -+ reada_min_gen = btrfs_header_generation(right_root->commit_root); - up_read(&fs_info->commit_root_sem); - - if (left_level == 0) -@@ -6921,7 +6951,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, - ret = tree_advance(left_path, &left_level, - left_root_level, - advance_left != ADVANCE_ONLY_NEXT, -- &left_key); -+ &left_key, reada_min_gen); - if (ret == -1) - left_end_reached = ADVANCE; - else if (ret < 0) -@@ -6932,7 +6962,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, - ret = tree_advance(right_path, &right_level, - right_root_level, - advance_right != ADVANCE_ONLY_NEXT, -- &right_key); -+ &right_key, reada_min_gen); - if (ret == -1) - right_end_reached = ADVANCE; - else if (ret < 0) --- -2.32.0 - - -From 54ee4da43eb299229eced65ffd9097b16003d1a3 Mon Sep 17 00:00:00 2001 -From: Josef Bacik <josef@toxicpanda.com> -Date: Wed, 28 Apr 2021 13:38:42 -0400 -Subject: [PATCH 07/22] btrfs: check worker before need_preemptive_reclaim - -need_preemptive_reclaim() does some calculations, which aren't heavy, -but if we're already running preemptive reclaim there's no reason to do -them at all, so re-order the checks so that we don't do the calculation -if we're already doing reclaim. - -Signed-off-by: Josef Bacik <josef@toxicpanda.com> ---- - fs/btrfs/space-info.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c -index 2dc674b7c..c9a5e003b 100644 ---- a/fs/btrfs/space-info.c -+++ b/fs/btrfs/space-info.c -@@ -1588,8 +1588,8 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, - * the async reclaim as we will panic. 
- */ - if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && -- need_preemptive_reclaim(fs_info, space_info) && -- !work_busy(&fs_info->preempt_reclaim_work)) { -+ !work_busy(&fs_info->preempt_reclaim_work) && -+ need_preemptive_reclaim(fs_info, space_info)) { - trace_btrfs_trigger_flush(fs_info, space_info->flags, - orig_bytes, flush, "preempt"); - queue_work(system_unbound_wq, --- -2.32.0 - - -From 76efa702dc8f1626c48eb0836205b7fb5b0ea94d Mon Sep 17 00:00:00 2001 -From: Josef Bacik <josef@toxicpanda.com> -Date: Wed, 28 Apr 2021 13:38:43 -0400 -Subject: [PATCH 08/22] btrfs: only clamp the first time we have to start - flushing - -We were clamping the threshold for preemptive reclaim any time we added -a ticket to wait on, which if we have a lot of threads means we'd -essentially max out the clamp the first time we start to flush. Instead -of doing this, simply do it every time we have to start flushing, this -will make us ramp up gradually instead of going to max clamping as soon -as we start needing to do flushing. - -Signed-off-by: Josef Bacik <josef@toxicpanda.com> ---- - fs/btrfs/space-info.c | 17 +++++++++-------- - 1 file changed, 9 insertions(+), 8 deletions(-) - -diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c -index c9a5e003b..33edab17a 100644 ---- a/fs/btrfs/space-info.c -+++ b/fs/btrfs/space-info.c -@@ -1561,6 +1561,15 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, - flush == BTRFS_RESERVE_FLUSH_DATA) { - list_add_tail(&ticket.list, &space_info->tickets); - if (!space_info->flush) { -+ /* -+ * We were forced to add a reserve ticket, so -+ * our preemptive flushing is unable to keep -+ * up. Clamp down on the threshold for the -+ * preemptive flushing in order to keep up with -+ * the workload. -+ */ -+ maybe_clamp_preempt(fs_info, space_info); -+ - space_info->flush = 1; - trace_btrfs_trigger_flush(fs_info, - space_info->flags, -@@ -1572,14 +1581,6 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, - list_add_tail(&ticket.list, - &space_info->priority_tickets); - } -- -- /* -- * We were forced to add a reserve ticket, so our preemptive -- * flushing is unable to keep up. Clamp down on the threshold -- * for the preemptive flushing in order to keep up with the -- * workload. -- */ -- maybe_clamp_preempt(fs_info, space_info); - } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { - used += orig_bytes; - /* --- -2.32.0 - - -From 9844f3f192822e44c70ee90722b704d785f4884e Mon Sep 17 00:00:00 2001 -From: Josef Bacik <josef@toxicpanda.com> -Date: Wed, 28 Apr 2021 13:38:46 -0400 -Subject: [PATCH 09/22] btrfs: don't include the global rsv size in the - preemptive used amount - -When deciding if we should preemptively flush space, we will add in the -amount of space used by all block rsvs. However this also includes the -global block rsv, which isn't flushable so shouldn't be accounted for in -this calculation. If we decide to use ->bytes_may_use in our used -calculation we need to subtract the global rsv size from this amount so -it most closely matches the flushable space. 
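-
-As a rough, invented illustration (not numbers from a real run): if
-bytes_may_use is 2GiB and the global rsv holds 512MiB of that, only
-about 1.5GiB is actually flushable, so counting the full 2GiB would
-overstate the reclaimable space and could keep the preemptive flusher
-spinning with nothing useful left to flush.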
- -Signed-off-by: Josef Bacik <josef@toxicpanda.com> ---- - fs/btrfs/space-info.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c -index 33edab17a..52e3bfedc 100644 ---- a/fs/btrfs/space-info.c -+++ b/fs/btrfs/space-info.c -@@ -867,7 +867,7 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, - used += fs_info->delayed_refs_rsv.reserved + - fs_info->delayed_block_rsv.reserved; - else -- used += space_info->bytes_may_use; -+ used += space_info->bytes_may_use - global_rsv_size; - - return (used >= thresh && !btrfs_fs_closing(fs_info) && - !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); --- -2.32.0 - - -From 1d309a9a237acff83ae53ba25b79dc21e2454b3d Mon Sep 17 00:00:00 2001 -From: Josef Bacik <josef@toxicpanda.com> -Date: Wed, 28 Apr 2021 13:38:44 -0400 -Subject: [PATCH 10/22] btrfs: take into account global rsv in - need_preemptive_reclaim - -Global rsv can't be used for normal allocations, and for very full file -systems we can decide to try and async flush constantly even though -there's really not a lot of space to reclaim. Deal with this by -including the global block rsv size in the "total used" calculation. - -[HH: small context fix for 5.10.x] -Signed-off-by: Josef Bacik <josef@toxicpanda.com> ---- - fs/btrfs/space-info.c | 4 +++- - 1 file changed, 3 insertions(+), 1 deletion(-) - -diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c -index 52e3bfedc..aeb1f0b7b 100644 ---- a/fs/btrfs/space-info.c -+++ b/fs/btrfs/space-info.c -@@ -792,12 +792,14 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, - static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) - { -+ u64 global_rsv_size = fs_info->global_block_rsv.reserved; - u64 ordered, delalloc; - u64 thresh = div_factor_fine(space_info->total_bytes, 98); - u64 used; - - /* If we're just plain full then async reclaim just slows us down. */ -- if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) -+ if ((space_info->bytes_used + space_info->bytes_reserved + -+ global_rsv_size) >= thresh) - return false; - - /* --- -2.32.0 - - -From fa148091ff63557e1d123194069b3d418d6129e4 Mon Sep 17 00:00:00 2001 -From: Josef Bacik <josef@toxicpanda.com> -Date: Wed, 28 Apr 2021 13:38:45 -0400 -Subject: [PATCH 11/22] btrfs: use the global rsv size in the preemptive thresh - calculation - -We calculate the amount of "free" space available for normal -reservations by taking the total space and subtracting out the hard used -space, which is readonly, used, and reserved space. However we weren't -taking into account the global block rsv, which is essentially hard used -space. Handle this by subtracting it from the available free space, so -that our threshold more closely mirrors reality. 
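-
-A hypothetical example (numbers invented for illustration): with
-total_bytes = 10GiB, bytes_used + bytes_reserved + bytes_readonly = 8GiB
-and a 512MiB global rsv, the old calculation credited 2GiB of "free"
-space to the threshold, while only about 1.5GiB can actually back normal
-reservations.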
-
-[HH: small context fix for 5.10.x]
-Signed-off-by: Josef Bacik <josef@toxicpanda.com>
----
- fs/btrfs/space-info.c | 6 ++++--
- 1 file changed, 4 insertions(+), 2 deletions(-)
-
-diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
-index aeb1f0b7b..4e3857474 100644
---- a/fs/btrfs/space-info.c
-+++ b/fs/btrfs/space-info.c
-@@ -840,8 +840,10 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
- 
- 	thresh = calc_available_free_space(fs_info, space_info,
- 					   BTRFS_RESERVE_FLUSH_ALL);
--	thresh += (space_info->total_bytes - space_info->bytes_used -
--		   space_info->bytes_reserved - space_info->bytes_readonly);
-+	used = space_info->bytes_used + space_info->bytes_reserved +
-+	       space_info->bytes_readonly + global_rsv_size;
-+	if (used < space_info->total_bytes)
-+		thresh += space_info->total_bytes - used;
- 	thresh >>= space_info->clamp;
- 
- 	used = space_info->bytes_pinned;
---
-2.32.0
-
-
-From 5270bca333d3c4f4bf045f224f6b94fe12528086 Mon Sep 17 00:00:00 2001
-From: Josef Bacik <josef@toxicpanda.com>
-Date: Wed, 28 Apr 2021 13:38:47 -0400
-Subject: [PATCH 12/22] btrfs: only ignore delalloc if delalloc is much smaller
- than ordered
-
-While testing heavy delalloc workloads I noticed that sometimes we'd
-just stop preemptively flushing when we had loads of delalloc available
-to flush. This is because we skip preemptive flushing if delalloc <=
-ordered. However if we start with say 4GiB of delalloc, and we flush
-2GiB of that, we'll stop flushing there, when we still have 2GiB of
-delalloc to flush.
-
-Instead adjust the ordered bytes down by half, this way if 2/3 of our
-outstanding delalloc reservations are tied up by ordered extents we
-don't bother preemptive flushing, as we're getting close to the state
-where we need to wait on ordered extents.
-
-Signed-off-by: Josef Bacik <josef@toxicpanda.com>
----
- fs/btrfs/space-info.c | 8 +++++++-
- 1 file changed, 7 insertions(+), 1 deletion(-)
-
-diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
-index 4e3857474..cf09b23f3 100644
---- a/fs/btrfs/space-info.c
-+++ b/fs/btrfs/space-info.c
-@@ -864,8 +864,14 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
- 	 * clearly be heavy enough to warrant preemptive flushing. In the case
- 	 * of heavy DIO or ordered reservations, preemptive flushing will just
- 	 * waste time and cause us to slow down.
-+	 *
-+	 * We want to make sure we truly are maxed out on ordered however, so
-+	 * cut ordered in half, and if it's still higher than delalloc then we
-+	 * can keep flushing. This is to avoid the case where we start
-+	 * flushing, and now delalloc == ordered and we stop preemptively
-+	 * flushing when we could still have several gigs of delalloc to flush.
- 	 */
--	ordered = percpu_counter_read_positive(&fs_info->ordered_bytes);
-+	ordered = percpu_counter_read_positive(&fs_info->ordered_bytes) >> 1;
- 	delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes);
- 	if (ordered >= delalloc)
- 		used += fs_info->delayed_refs_rsv.reserved +
---
-2.32.0
-
-
-From 7713c31e0c5896f22358551aff967cf9f7dfe91c Mon Sep 17 00:00:00 2001
-From: Josef Bacik <josef@toxicpanda.com>
-Date: Wed, 28 Apr 2021 13:38:48 -0400
-Subject: [PATCH 13/22] btrfs: handle preemptive delalloc flushing slightly
- differently
-
-If we decide to flush delalloc from the preemptive flusher, we really do
-not want to wait on ordered extents, as it gains us nothing. However
-there was logic to go ahead and wait on ordered extents if there were
-more ordered bytes than delalloc bytes. We do not want this behavior,
-so pass through whether this flushing is for preemption, and do not wait
-for ordered extents if that's the case. Also break out of the shrink
-loop after the first flushing, as we just want to one-shot shrink
-delalloc.
-
-Signed-off-by: Josef Bacik <josef@toxicpanda.com>
----
- fs/btrfs/space-info.c | 15 ++++++++++++---
- 1 file changed, 12 insertions(+), 3 deletions(-)
-
-diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
-index cf09b23f3..b2d834b92 100644
---- a/fs/btrfs/space-info.c
-+++ b/fs/btrfs/space-info.c
-@@ -495,7 +495,8 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
-  */
- static void shrink_delalloc(struct btrfs_fs_info *fs_info,
- 			    struct btrfs_space_info *space_info,
--			    u64 to_reclaim, bool wait_ordered)
-+			    u64 to_reclaim, bool wait_ordered,
-+			    bool for_preempt)
- {
- 	struct btrfs_trans_handle *trans;
- 	u64 delalloc_bytes;
-@@ -532,7 +533,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
- 	 * ordered extents, otherwise we'll waste time trying to flush delalloc
- 	 * that likely won't give us the space back we need.
- 	 */
--	if (ordered_bytes > delalloc_bytes)
-+	if (ordered_bytes > delalloc_bytes && !for_preempt)
- 		wait_ordered = true;
- 
- 	loops = 0;
-@@ -551,6 +552,14 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
- 			break;
- 		}
- 
-+		/*
-+		 * If we are for preemption we just want a one-shot of delalloc
-+		 * flushing so we can stop flushing if we decide we don't need
-+		 * to anymore.
-+		 */
-+		if (for_preempt)
-+			break;
-+
- 		spin_lock(&space_info->lock);
- 		if (list_empty(&space_info->tickets) &&
- 		    list_empty(&space_info->priority_tickets)) {
-@@ -702,7 +711,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
- 	case FLUSH_DELALLOC:
- 	case FLUSH_DELALLOC_WAIT:
- 		shrink_delalloc(fs_info, space_info, num_bytes,
--				state == FLUSH_DELALLOC_WAIT);
-+				state == FLUSH_DELALLOC_WAIT, for_preempt);
- 		break;
- 	case FLUSH_DELAYED_REFS_NR:
- 	case FLUSH_DELAYED_REFS:
---
-2.32.0
-
-
-From b1fa125daba80334e1efdc50e5a2e70f8585e755 Mon Sep 17 00:00:00 2001
-From: David Sterba <dsterba@suse.com>
-Date: Tue, 18 May 2021 16:49:35 +0200
-Subject: [PATCH 14/22] btrfs: scrub: per-device bandwidth control
-
-Add sysfs interface to limit io during scrub. We relied on the ionice
-interface to do that, eg. the idle class kept the system usable while
-scrub was running. This has changed when mq-deadline got widespread and
-did not implement the scheduling classes. That was a CFQ thing that got
-deleted. We've got numerous complaints from users about degraded
-performance.
-
-Currently only BFQ supports that but it's not a common scheduler and we
-can't ask everybody to switch to it.
-
-Alternatively the cgroup io limiting can be used but that is also a
-non-trivial setup (v2 required, the controller must be enabled on the
-system). This can still be used if desired.
-
-Other ideas that have been explored: piggy-back on ionice (that is set
-per-process and is accessible) and interpret the class and classdata as
-bandwidth limits, but this does not have enough flexibility as there are
-only 8 allowed and we'd have to map fixed limits to each value. Also
-adjusting the value would need to look up the process that currently runs
-scrub on the given device, and the value is not sticky so would have to
-be adjusted each time scrub runs.
- -Running out of options, sysfs does not look that bad: - -- it's accessible from scripts, or udev rules -- the name is similar to what MD-RAID has - (/proc/sys/dev/raid/speed_limit_max or /sys/block/mdX/md/sync_speed_max) -- the value is sticky at least for filesystem mount time -- adjusting the value has immediate effect -- sysfs is available in constrained environments (eg. system rescue) -- the limit also applies to device replace - -Sysfs: - -- raw value is in bytes -- values written to the file accept suffixes like K, M -- file is in the per-device directory /sys/fs/btrfs/FSID/devinfo/DEVID/scrub_speed_max -- 0 means use default priority of IO - -The scheduler is a simple deadline one and the accuracy is up to nearest -128K. - -[HH: trivial context fix in hunk #1 for 5.10.x] -Signed-off-by: David Sterba <dsterba@suse.com> ---- - fs/btrfs/scrub.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++ - fs/btrfs/sysfs.c | 28 +++++++++++++++++++++ - fs/btrfs/volumes.h | 3 +++ - 3 files changed, 92 insertions(+) - -diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c -index b9202a1f1..adc8cf404 100644 ---- a/fs/btrfs/scrub.c -+++ b/fs/btrfs/scrub.c -@@ -165,6 +165,10 @@ struct scrub_ctx { - int readonly; - int pages_per_rd_bio; - -+ /* State of IO submission throttling affecting the associated device */ -+ ktime_t throttle_deadline; -+ u64 throttle_sent; -+ - int is_dev_replace; - u64 write_pointer; - -@@ -613,6 +617,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( - spin_lock_init(&sctx->list_lock); - spin_lock_init(&sctx->stat_lock); - init_waitqueue_head(&sctx->list_wait); -+ sctx->throttle_deadline = 0; - - WARN_ON(sctx->wr_curr_bio != NULL); - mutex_init(&sctx->wr_lock); -@@ -1996,6 +2001,60 @@ static void scrub_page_put(struct scrub_page *spage) - } - } - -+/* -+ * Throttling of IO submission, bandwidth-limit based, the timeslice is 1 -+ * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max. -+ */ -+static void scrub_throttle(struct scrub_ctx *sctx) -+{ -+ const int time_slice = 1000; -+ struct scrub_bio *sbio; -+ struct btrfs_device *device; -+ s64 delta; -+ ktime_t now; -+ u32 div; -+ u64 bwlimit; -+ -+ sbio = sctx->bios[sctx->curr]; -+ device = sbio->dev; -+ bwlimit = READ_ONCE(device->scrub_speed_max); -+ if (bwlimit == 0) -+ return; -+ -+ /* -+ * Slice is divided into intervals when the IO is submitted, adjust by -+ * bwlimit and maximum of 64 intervals. -+ */ -+ div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024))); -+ div = min_t(u32, 64, div); -+ -+ /* Start new epoch, set deadline */ -+ now = ktime_get(); -+ if (sctx->throttle_deadline == 0) { -+ sctx->throttle_deadline = ktime_add_ms(now, time_slice / div); -+ sctx->throttle_sent = 0; -+ } -+ -+ /* Still in the time to send? 
*/ -+ if (ktime_before(now, sctx->throttle_deadline)) { -+ /* If current bio is within the limit, send it */ -+ sctx->throttle_sent += sbio->bio->bi_iter.bi_size; -+ if (sctx->throttle_sent <= bwlimit / div) -+ return; -+ -+ /* We're over the limit, sleep until the rest of the slice */ -+ delta = ktime_ms_delta(sctx->throttle_deadline, now); -+ } else { -+ /* New request after deadline, start new epoch */ -+ delta = 0; -+ } -+ -+ if (delta) -+ schedule_timeout_interruptible(delta * HZ / 1000); -+ /* Next call will start the deadline period */ -+ sctx->throttle_deadline = 0; -+} -+ - static void scrub_submit(struct scrub_ctx *sctx) - { - struct scrub_bio *sbio; -@@ -2003,6 +2062,8 @@ static void scrub_submit(struct scrub_ctx *sctx) - if (sctx->curr == -1) - return; - -+ scrub_throttle(sctx); -+ - sbio = sctx->bios[sctx->curr]; - sctx->curr = -1; - scrub_pending_bio_inc(sctx); -diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c -index 9372ef191..9dda3feda 100644 ---- a/fs/btrfs/sysfs.c -+++ b/fs/btrfs/sysfs.c -@@ -1469,6 +1469,33 @@ static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj, - } - BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show); - -+static ssize_t btrfs_devinfo_scrub_speed_max_show(struct kobject *kobj, -+ struct kobj_attribute *a, -+ char *buf) -+{ -+ struct btrfs_device *device = container_of(kobj, struct btrfs_device, -+ devid_kobj); -+ -+ return scnprintf(buf, PAGE_SIZE, "%llu\n", -+ READ_ONCE(device->scrub_speed_max)); -+} -+ -+static ssize_t btrfs_devinfo_scrub_speed_max_store(struct kobject *kobj, -+ struct kobj_attribute *a, -+ const char *buf, size_t len) -+{ -+ struct btrfs_device *device = container_of(kobj, struct btrfs_device, -+ devid_kobj); -+ char *endptr; -+ unsigned long long limit; -+ -+ limit = memparse(buf, &endptr); -+ WRITE_ONCE(device->scrub_speed_max, limit); -+ return len; -+} -+BTRFS_ATTR_RW(devid, scrub_speed_max, btrfs_devinfo_scrub_speed_max_show, -+ btrfs_devinfo_scrub_speed_max_store); -+ - static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj, - struct kobj_attribute *a, char *buf) - { -@@ -1486,6 +1513,7 @@ static struct attribute *devid_attrs[] = { - BTRFS_ATTR_PTR(devid, in_fs_metadata), - BTRFS_ATTR_PTR(devid, missing), - BTRFS_ATTR_PTR(devid, replace_target), -+ BTRFS_ATTR_PTR(devid, scrub_speed_max), - BTRFS_ATTR_PTR(devid, writeable), - NULL - }; -diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h -index d4c3e0dd3..be7932d9b 100644 ---- a/fs/btrfs/volumes.h -+++ b/fs/btrfs/volumes.h -@@ -143,6 +143,9 @@ struct btrfs_device { - struct completion kobj_unregister; - /* For sysfs/FSID/devinfo/devid/ */ - struct kobject devid_kobj; -+ -+ /* Bandwidth limit for scrub, in bytes */ -+ u64 scrub_speed_max; - }; - - /* --- -2.32.0 - - -From 69dcc9b1b2ab6c0f2be5fd2020651ddb63e00f9c Mon Sep 17 00:00:00 2001 -From: Josef Bacik <josef@toxicpanda.com> -Date: Wed, 19 May 2021 11:29:03 -0400 -Subject: [PATCH 15/22] btrfs: abort the transaction if we fail to replay log - trees - -During inspection of the return path for replay I noticed that we don't -actually abort the transaction if we get a failure during replay. This -isn't a problem necessarily, as we properly return the error and will -fail to mount. However we still leave this dangling transaction that -could conceivably be committed without thinking there was an error. -Handle this by making sure we abort the transaction on error to -safeguard us from any problems in the future. 
- -Signed-off-by: Josef Bacik <josef@toxicpanda.com> ---- - fs/btrfs/tree-log.c | 4 +++- - 1 file changed, 3 insertions(+), 1 deletion(-) - -diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c -index 276b5511f..f9332bb84 100644 ---- a/fs/btrfs/tree-log.c -+++ b/fs/btrfs/tree-log.c -@@ -6363,8 +6363,10 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) - - return 0; - error: -- if (wc.trans) -+ if (wc.trans) { -+ btrfs_abort_transaction(wc.trans, ret); - btrfs_end_transaction(wc.trans); -+ } - btrfs_free_path(path); - return ret; - } --- -2.32.0 - - -From df6c67b60e03836e5deabb8d8ea76cfe5f4c5886 Mon Sep 17 00:00:00 2001 -From: Josef Bacik <josef@toxicpanda.com> -Date: Wed, 19 May 2021 11:45:16 -0400 -Subject: [PATCH 16/22] btrfs: do not infinite loop in data reclaim if we - aborted - -Error injection stressing uncovered a busy loop in our data reclaim -loop. There are two cases here, one where we loop creating block groups -until space_info->full is set, or in the main loop we will skip erroring -out any tickets if space_info->full == 0. Unfortunately if we aborted -the transaction then we will never allocate chunks or reclaim any space -and thus never get ->full, and you'll see stack traces like this - -watchdog: BUG: soft lockup - CPU#0 stuck for 26s! [kworker/u4:4:139] -CPU: 0 PID: 139 Comm: kworker/u4:4 Tainted: G W 5.13.0-rc1+ #328 -Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 -Workqueue: events_unbound btrfs_async_reclaim_data_space -RIP: 0010:btrfs_join_transaction+0x12/0x20 -RSP: 0018:ffffb2b780b77de0 EFLAGS: 00000246 -RAX: ffffb2b781863d58 RBX: 0000000000000000 RCX: 0000000000000000 -RDX: 0000000000000801 RSI: ffff987952b57400 RDI: ffff987940aa3000 -RBP: ffff987954d55000 R08: 0000000000000001 R09: ffff98795539e8f0 -R10: 000000000000000f R11: 000000000000000f R12: ffffffffffffffff -R13: ffff987952b574c8 R14: ffff987952b57400 R15: 0000000000000008 -FS: 0000000000000000(0000) GS:ffff9879bbc00000(0000) knlGS:0000000000000000 -CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 -CR2: 00007f0703da4000 CR3: 0000000113398004 CR4: 0000000000370ef0 -Call Trace: - flush_space+0x4a8/0x660 - btrfs_async_reclaim_data_space+0x55/0x130 - process_one_work+0x1e9/0x380 - worker_thread+0x53/0x3e0 - ? process_one_work+0x380/0x380 - kthread+0x118/0x140 - ? __kthread_bind_mask+0x60/0x60 - ret_from_fork+0x1f/0x30 - -Fix this by checking to see if we have BTRFS_FS_STATE_TRANS_ABORTED in -either of the reclaim loops, and if so fail the tickets and bail. In -addition to this, fix maybe_fail_all_tickets() to not try to grant -tickets if we've aborted, simply fail everything. 
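
To make the failure mode concrete, here is a toy model of that busy
loop. It is illustrative only, every identifier in it is invented, and
the real fix is the BTRFS_FS_STATE_TRANS_ABORTED checks in the hunks
below:

  /* reclaim_demo.c - once the aborted flag latches, no flush can set
   * ->full or drain the tickets, so a loop gated only on progress
   * spins forever; the fixed loop fails the waiters and bails. */
  #include <stdbool.h>
  #include <stdio.h>

  #define EIO 5

  struct space_info_model {
      int tickets;        /* reservations still waiting for space */
      bool full;          /* set once chunk allocation has given up */
      bool trans_aborted; /* latched when the transaction aborts */
  };

  static void flush_space_model(struct space_info_model *s)
  {
      /* After an abort nothing makes progress: no chunks are
       * allocated and no space is reclaimed, so neither ->full nor
       * the ticket count ever changes. */
      if (s->trans_aborted)
          return;
      s->tickets = 0; /* pretend flushing satisfied the waiters */
      s->full = true;
  }

  static int reclaim(struct space_info_model *s)
  {
      while (s->tickets > 0) {
          /* The fix: fail everything and bail on abort instead of
           * waiting for ->full, which can no longer be set. */
          if (s->trans_aborted) {
              s->tickets = 0; /* tickets are woken with -EIO */
              return -EIO;
          }
          flush_space_model(s);
          if (s->full)
              break; /* the genuine ENOSPC path */
      }
      return 0;
  }

  int main(void)
  {
      struct space_info_model s = { .tickets = 3, .trans_aborted = true };

      printf("reclaim() = %d\n", reclaim(&s)); /* -5, not a soft lockup */
      return 0;
  }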
- -Signed-off-by: Josef Bacik <josef@toxicpanda.com> ---- - fs/btrfs/space-info.c | 35 ++++++++++++++++++++++++++++++----- - 1 file changed, 30 insertions(+), 5 deletions(-) - -diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c -index b2d834b92..208f47e60 100644 ---- a/fs/btrfs/space-info.c -+++ b/fs/btrfs/space-info.c -@@ -941,6 +941,8 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, - struct reserve_ticket *ticket; - u64 tickets_id = space_info->tickets_id; - u64 first_ticket_bytes = 0; -+ bool aborted = test_bit(BTRFS_FS_STATE_TRANS_ABORTED, -+ &fs_info->fs_state); - - if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { - btrfs_info(fs_info, "cannot satisfy tickets, dumping space info"); -@@ -952,7 +954,7 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, - ticket = list_first_entry(&space_info->tickets, - struct reserve_ticket, list); - -- if (ticket->steal && -+ if (!aborted && ticket->steal && - steal_from_global_rsv(fs_info, space_info, ticket)) - return true; - -@@ -968,15 +970,18 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, - */ - if (first_ticket_bytes == 0) - first_ticket_bytes = ticket->bytes; -- else if (first_ticket_bytes > ticket->bytes) -+ else if (!aborted && first_ticket_bytes > ticket->bytes) - return true; - -- if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) -+ if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) - btrfs_info(fs_info, "failing ticket with %llu bytes", - ticket->bytes); - - remove_ticket(space_info, ticket); -- ticket->error = -ENOSPC; -+ if (aborted) -+ ticket->error = -EIO; -+ else -+ ticket->error = -ENOSPC; - wake_up(&ticket->wait); - - /* -@@ -985,7 +990,8 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, - * here to see if we can make progress with the next ticket in - * the list. - */ -- btrfs_try_granting_tickets(fs_info, space_info); -+ if (!aborted) -+ btrfs_try_granting_tickets(fs_info, space_info); - } - return (tickets_id != space_info->tickets_id); - } -@@ -1253,6 +1259,15 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work) - spin_unlock(&space_info->lock); - return; - } -+ -+ /* Something happened, fail everything and bail. */ -+ if (test_bit(BTRFS_FS_STATE_TRANS_ABORTED, -+ &fs_info->fs_state)) { -+ maybe_fail_all_tickets(fs_info, space_info); -+ space_info->flush = 0; -+ spin_unlock(&space_info->lock); -+ return; -+ } - last_tickets_id = space_info->tickets_id; - spin_unlock(&space_info->lock); - } -@@ -1283,6 +1298,16 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work) - } else { - flush_state = 0; - } -+ -+ /* Something happened, fail everything and bail. */ -+ if (test_bit(BTRFS_FS_STATE_TRANS_ABORTED, -+ &fs_info->fs_state)) { -+ maybe_fail_all_tickets(fs_info, space_info); -+ space_info->flush = 0; -+ spin_unlock(&space_info->lock); -+ return; -+ } -+ - } - spin_unlock(&space_info->lock); - } --- -2.32.0 - - -From c7c8879b24c228034f942521af8011a600b273ed Mon Sep 17 00:00:00 2001 -From: Josef Bacik <josef@toxicpanda.com> -Date: Fri, 12 Mar 2021 15:25:05 -0500 -Subject: [PATCH 17/22] btrfs: handle btrfs_record_root_in_trans failure in - btrfs_recover_log_trees - -btrfs_record_root_in_trans will return errors in the future, so handle -the error properly in btrfs_recover_log_trees. - -This appears tricky, however we have a reference count on the -destination root, so if this fails we need to continue on in the loop to -make sure the proper cleanup is done. 
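
The error handling pattern described above, reduced to a hedged
standalone sketch (all names are invented; in the real function the
per-iteration cleanup drops the root reference that was taken earlier
in the loop body):

  /* replay_refs_demo.c - note the first failure but still run each
   * iteration's cleanup, because every iteration holds a reference. */
  #include <stdio.h>

  struct root { int refs; };

  static int record_root(struct root *r, int should_fail)
  {
      (void)r;
      return should_fail ? -5 : 0; /* -EIO stand-in */
  }

  static int replay_all(struct root *roots, int n, int fail_at)
  {
      int ret = 0;

      for (int i = 0; i < n; i++) {
          struct root *r = &roots[i];

          r->refs++; /* reference taken for this iteration */
          int err = record_root(r, i == fail_at);
          if (err)
              ret = err; /* remember the failure instead of jumping
                          * straight out of the loop... */
          r->refs--; /* ...so this cleanup runs on both paths */
          if (ret)
              break; /* stop replaying with balanced references */
      }
      return ret;
  }

  int main(void)
  {
      struct root roots[3] = { {0}, {0}, {0} };
      int ret = replay_all(roots, 3, 1);

      /* prints ret=-5 refs=0,0,0: the error is reported and no
       * reference is leaked */
      printf("ret=%d refs=%d,%d,%d\n", ret,
             roots[0].refs, roots[1].refs, roots[2].refs);
      return 0;
  }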
- -Reviewed-by: Qu Wenruo <wqu@suse.com> -Signed-off-by: Josef Bacik <josef@toxicpanda.com> -Reviewed-by: David Sterba <dsterba@suse.com> -[ add comment ] -Signed-off-by: David Sterba <dsterba@suse.com> ---- - fs/btrfs/tree-log.c | 9 +++++++-- - 1 file changed, 7 insertions(+), 2 deletions(-) - -diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c -index f9332bb84..1e3cfc935 100644 ---- a/fs/btrfs/tree-log.c -+++ b/fs/btrfs/tree-log.c -@@ -6300,8 +6300,13 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) - } - - wc.replay_dest->log_root = log; -- btrfs_record_root_in_trans(trans, wc.replay_dest); -- ret = walk_log_tree(trans, log, &wc); -+ ret = btrfs_record_root_in_trans(trans, wc.replay_dest); -+ if (ret) -+ /* The loop needs to continue due to the root refs */ -+ btrfs_handle_fs_error(fs_info, ret, -+ "failed to record the log root in transaction"); -+ else -+ ret = walk_log_tree(trans, log, &wc); - - if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { - ret = fixup_inode_link_counts(trans, wc.replay_dest, --- -2.32.0 - - -From c2a7ee7bf274fa4b307dc78e36ac32fde7ad9e91 Mon Sep 17 00:00:00 2001 -From: Josef Bacik <josef () toxicpanda ! com> -Date: Thu, 20 May 2021 14:46:01 +0000 -Subject: [PATCH 18/22] btrfs: change handle_fs_error in recover_log_trees to - aborts - -During inspection of the return path for replay I noticed that we don't -actually abort the transaction if we get a failure during replay. This -isn't a problem necessarily, as we properly return the error and will -fail to mount. However we still leave this dangling transaction that -could conceivably be committed without thinking there was an error. -We were using btrfs_handle_fs_error() here, but that pre-dates the -transaction abort code. Simply replace the btrfs_handle_fs_error() -calls with transaction aborts, so we still know where exactly things -went wrong, and add a few in some other un-handled error cases. 
-
-Signed-off-by: Josef Bacik <josef@toxicpanda.com>
----
- fs/btrfs/tree-log.c | 16 ++++++++--------
- 1 file changed, 8 insertions(+), 8 deletions(-)
-
-diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
-index 1e3cfc935..876445337 100644
---- a/fs/btrfs/tree-log.c
-+++ b/fs/btrfs/tree-log.c
-@@ -6247,8 +6247,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
- ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
-
- if (ret < 0) {
-- btrfs_handle_fs_error(fs_info, ret,
-- "Couldn't find tree log root.");
-+ btrfs_abort_transaction(trans, ret);
- goto error;
- }
- if (ret > 0) {
-@@ -6265,8 +6264,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
- log = btrfs_read_tree_root(log_root_tree, &found_key);
- if (IS_ERR(log)) {
- ret = PTR_ERR(log);
-- btrfs_handle_fs_error(fs_info, ret,
-- "Couldn't read tree log root.");
-+ btrfs_abort_transaction(trans, ret);
- goto error;
- }
-
-@@ -6294,8 +6292,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
-
- if (!ret)
- goto next;
-- btrfs_handle_fs_error(fs_info, ret,
-- "Couldn't read target root for tree log recovery.");
-+ btrfs_abort_transaction(trans, ret);
- goto error;
- }
-
-@@ -6303,14 +6300,15 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
- ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
- if (ret)
- /* The loop needs to continue due to the root refs */
-- btrfs_handle_fs_error(fs_info, ret,
-- "failed to record the log root in transaction");
-+ btrfs_abort_transaction(trans, ret);
- else
- ret = walk_log_tree(trans, log, &wc);
-
- if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
- ret = fixup_inode_link_counts(trans, wc.replay_dest,
- path);
-+ if (ret)
-+ btrfs_abort_transaction(trans, ret);
- }
-
- if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
-@@ -6327,6 +6325,8 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
- * could only happen during mount.
- */
- ret = btrfs_init_root_free_objectid(root);
-+ if (ret)
-+ btrfs_abort_transaction(trans, ret);
- }
-
- wc.replay_dest->log_root = NULL;
---
-2.32.0
-
-
-From 03c9f21055825cd463b214cf8341e9c4907525f0 Mon Sep 17 00:00:00 2001
-From: Filipe Manana <fdmanana@suse.com>
-Date: Fri, 28 May 2021 11:37:32 +0100
-Subject: [PATCH 19/22] btrfs: avoid unnecessary logging of xattrs during fast
- fsyncs
-
-When logging an inode we always log all its xattrs, so that we are able
-to figure out which ones should be deleted during log replay. However,
-this is unnecessary when we are doing a fast fsync and no xattrs were
-added, changed or deleted since the last time we logged the inode in the
-current transaction.
-
-So skip the logging of xattrs when the inode was previously logged in the
-current transaction and no xattrs were added, changed or deleted. If any
-changes to xattrs happened, then the inode has BTRFS_INODE_COPY_EVERYTHING
-set in its runtime flags and the xattrs get logged. This saves time on
-scanning for xattrs, allocating memory, COWing log tree extent buffers and
-adding more lock contention on the extent buffers when there are multiple
-tasks logging in parallel.
-
-The use of xattrs is common when using ACLs, in some applications, or
-when using security modules like SELinux, where every inode gets a
-security xattr added to it.
-
-The following test script, using fio, was used on a box with 12 cores,
-64G of RAM, an NVMe device and the default non-debug kernel config from
-Debian.
-It uses 8 concurrent jobs each writing in blocks of 64K to its own 4G file, -each file with a single xattr of 50 bytes (about the same size for an ACL -or SELinux xattr), doing random buffered writes with an fsync after each -write. - - $ cat test.sh - #!/bin/bash - - DEV=/dev/nvme0n1 - MNT=/mnt/test - MOUNT_OPTIONS="-o ssd" - MKFS_OPTIONS="-d single -m single" - - NUM_JOBS=8 - FILE_SIZE=4G - - cat <<EOF > /tmp/fio-job.ini - [writers] - rw=randwrite - fsync=1 - fallocate=none - group_reporting=1 - direct=0 - bs=64K - ioengine=sync - size=$FILE_SIZE - directory=$MNT - numjobs=$NUM_JOBS - EOF - - echo "performance" | \ - tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor - - mkfs.btrfs -f $MKFS_OPTIONS $DEV > /dev/null - mount $MOUNT_OPTIONS $DEV $MNT - - echo "Creating files before fio runs, each with 1 xattr of 50 bytes" - for ((i = 0; i < $NUM_JOBS; i++)); do - path="$MNT/writers.$i.0" - truncate -s $FILE_SIZE $path - setfattr -n user.xa1 -v $(printf '%0.sX' $(seq 50)) $path - done - - fio /tmp/fio-job.ini - umount $MNT - -fio output before this change: - -WRITE: bw=120MiB/s (126MB/s), 120MiB/s-120MiB/s (126MB/s-126MB/s), io=32.0GiB (34.4GB), run=272145-272145msec - -fio output after this change: - -WRITE: bw=142MiB/s (149MB/s), 142MiB/s-142MiB/s (149MB/s-149MB/s), io=32.0GiB (34.4GB), run=230408-230408msec - -+16.8% throughput, -16.6% runtime - -Signed-off-by: Filipe Manana <fdmanana@suse.com> ---- - fs/btrfs/tree-log.c | 16 +++++++++++++--- - 1 file changed, 13 insertions(+), 3 deletions(-) - -diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c -index 876445337..fb4704ed9 100644 ---- a/fs/btrfs/tree-log.c -+++ b/fs/btrfs/tree-log.c -@@ -5465,13 +5465,23 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, - btrfs_release_path(dst_path); - if (need_log_inode_item) { - err = log_inode_item(trans, log, dst_path, inode); -- if (!err && !xattrs_logged) { -+ if (err) -+ goto out_unlock; -+ /* -+ * If we are doing a fast fsync and the inode was logged before -+ * in this transaction, we don't need to log the xattrs because -+ * they were logged before. If xattrs were added, changed or -+ * deleted since the last time we logged the inode, then we have -+ * already logged them because the inode had the runtime flag -+ * BTRFS_INODE_COPY_EVERYTHING set. -+ */ -+ if (!xattrs_logged && inode->logged_trans < trans->transid) { - err = btrfs_log_all_xattrs(trans, root, inode, path, - dst_path); -+ if (err) -+ goto out_unlock; - btrfs_release_path(path); - } -- if (err) -- goto out_unlock; - } - if (fast_search) { - ret = btrfs_log_changed_extents(trans, root, inode, dst_path, --- -2.32.0 - - -From 5615df097f5c4d07edf14bb40292f26043b405f5 Mon Sep 17 00:00:00 2001 -From: Josef Bacik <josef@toxicpanda.com> -Date: Tue, 1 Jun 2021 15:45:08 -0400 -Subject: [PATCH 20/22] btrfs: handle shrink_delalloc pages calculation - differently - -We have been hitting some early ENOSPC issues in production with more -recent kernels, and I tracked it down to us simply not flushing delalloc -as aggressively as we should be. With tracing I was seeing us failing -all tickets with all of the block rsvs at or around 0, with very little -pinned space, but still around 120mib of outstanding bytes_may_used. -Upon further investigation I saw that we were flushing around 14 pages -per shrink call for delalloc, despite having around 2gib of delalloc -outstanding. - -Consider the example of a 8 way machine, all cpu's trying to create a -file in parallel, which at the time of this commit requires 5 items to -do. 
Assuming a 16k leaf size, we have 10mib of total metadata reclaim -size waiting on reservations. Now assume we have 128mib of delalloc -outstanding. With our current math we would set items to 20, and then -set to_reclaim to 20 * 256k, or 5mib. - -Assuming that we went through this loop all 3 times, for both -FLUSH_DELALLOC and FLUSH_DELALLOC_WAIT, and then did the full loop -twice, we'd only flush 60mib of the 128mib delalloc space. This could -leave a fair bit of delalloc reservations still hanging around by the -time we go to ENOSPC out all the remaining tickets. - -Fix this two ways. First, change the calculations to be a fraction of -the total delalloc bytes on the system. Prior to my change we were -calculating based on dirty inodes so our math made more sense, now it's -just completely unrelated to what we're actually doing. - -Second add a FLUSH_DELALLOC_FULL state, that we hold off until we've -gone through the flush states at least once. This will empty the system -of all delalloc so we're sure to be truly out of space when we start -failing tickets. - -I'm tagging stable 5.10 and forward, because this is where we started -using the page stuff heavily again. This affects earlier kernel -versions as well, but would be a pain to backport to them as the -flushing mechanisms aren't the same. - -CC: stable@vger.kernel.org # 5.10 -Signed-off-by: Josef Bacik <josef@toxicpanda.com> ---- - fs/btrfs/ctree.h | 11 ++++++----- - fs/btrfs/space-info.c | 36 +++++++++++++++++++++++++++--------- - include/trace/events/btrfs.h | 1 + - 3 files changed, 34 insertions(+), 14 deletions(-) - -diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h -index 12921830e..75ba87dcc 100644 ---- a/fs/btrfs/ctree.h -+++ b/fs/btrfs/ctree.h -@@ -2746,11 +2746,12 @@ enum btrfs_flush_state { - FLUSH_DELAYED_REFS = 4, - FLUSH_DELALLOC = 5, - FLUSH_DELALLOC_WAIT = 6, -- ALLOC_CHUNK = 7, -- ALLOC_CHUNK_FORCE = 8, -- RUN_DELAYED_IPUTS = 9, -- COMMIT_TRANS = 10, -- FORCE_COMMIT_TRANS = 11, -+ FLUSH_DELALLOC_FULL = 7, -+ ALLOC_CHUNK = 8, -+ ALLOC_CHUNK_FORCE = 9, -+ RUN_DELAYED_IPUTS = 10, -+ COMMIT_TRANS = 11, -+ FORCE_COMMIT_TRANS = 12, - }; - - int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, -diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c -index 208f47e60..88bac64d5 100644 ---- a/fs/btrfs/space-info.c -+++ b/fs/btrfs/space-info.c -@@ -505,6 +505,10 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, - long time_left; - int loops; - -+ delalloc_bytes = percpu_counter_sum_positive( -+ &fs_info->delalloc_bytes); -+ ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes); -+ - /* Calc the number of the pages we need flush for space reservation */ - if (to_reclaim == U64_MAX) { - items = U64_MAX; -@@ -512,19 +516,21 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, - /* - * to_reclaim is set to however much metadata we need to - * reclaim, but reclaiming that much data doesn't really track -- * exactly, so increase the amount to reclaim by 2x in order to -- * make sure we're flushing enough delalloc to hopefully reclaim -- * some metadata reservations. -+ * exactly. What we really want to do is reclaim full inode's -+ * worth of reservations, however that's not available to us -+ * here. We will take a fraction of the delalloc bytes for our -+ * flushing loops and hope for the best. Delalloc will expand -+ * the amount we write to cover an entire dirty extent, which -+ * will reclaim the metadata reservation for that range. 
If -+ * it's not enough subsequent flush stages will be more -+ * aggressive. - */ -+ to_reclaim = max(to_reclaim, delalloc_bytes >> 3); - items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2; -- to_reclaim = items * EXTENT_SIZE_PER_ITEM; - } - - trans = (struct btrfs_trans_handle *)current->journal_info; - -- delalloc_bytes = percpu_counter_sum_positive( -- &fs_info->delalloc_bytes); -- ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes); - if (delalloc_bytes == 0 && ordered_bytes == 0) - return; - -@@ -710,8 +716,11 @@ static void flush_space(struct btrfs_fs_info *fs_info, - break; - case FLUSH_DELALLOC: - case FLUSH_DELALLOC_WAIT: -+ case FLUSH_DELALLOC_FULL: -+ if (state == FLUSH_DELALLOC_FULL) -+ num_bytes = U64_MAX; - shrink_delalloc(fs_info, space_info, num_bytes, -- state == FLUSH_DELALLOC_WAIT, for_preempt); -+ state != FLUSH_DELALLOC, for_preempt); - break; - case FLUSH_DELAYED_REFS_NR: - case FLUSH_DELAYED_REFS: -@@ -1043,6 +1052,14 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) - commit_cycles--; - } - -+ /* -+ * We do not want to empty the system of delalloc unless we're -+ * under heavy pressure, so allow one trip through the flushing -+ * logic before we start doing a FLUSH_DELALLOC_FULL. -+ */ -+ if (flush_state == FLUSH_DELALLOC_FULL && !commit_cycles) -+ flush_state++; -+ - /* - * We don't want to force a chunk allocation until we've tried - * pretty hard to reclaim space. Think of the case where we -@@ -1225,7 +1242,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) - * so if we now have space to allocate do the force chunk allocation. - */ - static const enum btrfs_flush_state data_flush_states[] = { -- FLUSH_DELALLOC_WAIT, -+ FLUSH_DELALLOC_FULL, - RUN_DELAYED_IPUTS, - FLUSH_DELAYED_REFS, - COMMIT_TRANS, -@@ -1334,6 +1351,7 @@ static const enum btrfs_flush_state evict_flush_states[] = { - FLUSH_DELAYED_REFS, - FLUSH_DELALLOC, - FLUSH_DELALLOC_WAIT, -+ FLUSH_DELALLOC_FULL, - ALLOC_CHUNK, - COMMIT_TRANS, - }; -diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h -index 0551ea653..7cda6c3d7 100644 ---- a/include/trace/events/btrfs.h -+++ b/include/trace/events/btrfs.h -@@ -94,6 +94,7 @@ struct btrfs_space_info; - EM( FLUSH_DELAYED_ITEMS, "FLUSH_DELAYED_ITEMS") \ - EM( FLUSH_DELALLOC, "FLUSH_DELALLOC") \ - EM( FLUSH_DELALLOC_WAIT, "FLUSH_DELALLOC_WAIT") \ -+ EM( FLUSH_DELALLOC_FULL, "FLUSH_DELALLOC_FULL") \ - EM( FLUSH_DELAYED_REFS_NR, "FLUSH_DELAYED_REFS_NR") \ - EM( FLUSH_DELAYED_REFS, "FLUSH_ELAYED_REFS") \ - EM( ALLOC_CHUNK, "ALLOC_CHUNK") \ --- -2.32.0 - - -From ec26628891bae6cd63bcd934dc3b13157cbc1024 Mon Sep 17 00:00:00 2001 -From: Filipe Manana <fdmanana@suse.com> -Date: Wed, 9 Jun 2021 11:25:03 +0100 -Subject: [PATCH 21/22] btrfs: send: fix invalid path for unlink operations - after parent orphanization - -During an incremental send operation, when processing the new references -for the current inode, we might send an unlink operation for another inode -that has a conflicting path and has more than one hard link. However this -path was computed and cached before we processed previous new references -for the current inode. We may have orphanized a directory of that path -while processing a previous new reference, in which case the path will -be invalid and cause the receiver process to fail. 
-
-The following reproducer triggers the problem and explains how/why it
-happens in its comments:
-
- $ cat test-send-unlink.sh
- #!/bin/bash
-
- DEV=/dev/sdi
- MNT=/mnt/sdi
-
- mkfs.btrfs -f $DEV >/dev/null
- mount $DEV $MNT
-
- # Create our test files and directory. Inode 259 (file3) has two hard
- # links.
- touch $MNT/file1
- touch $MNT/file2
- touch $MNT/file3
-
- mkdir $MNT/A
- ln $MNT/file3 $MNT/A/hard_link
-
- # Filesystem looks like:
- #
- # .                        (ino 256)
- # |----- file1             (ino 257)
- # |----- file2             (ino 258)
- # |----- file3             (ino 259)
- # |----- A/                (ino 260)
- #        |---- hard_link   (ino 259)
- #
-
- # Now create the base snapshot, which is going to be the parent snapshot
- # for a later incremental send.
- btrfs subvolume snapshot -r $MNT $MNT/snap1
- btrfs send -f /tmp/snap1.send $MNT/snap1
-
- # Move inode 257 into directory inode 260. This results in computing the
- # path for inode 260 as "/A" and caching it.
- mv $MNT/file1 $MNT/A/file1
-
- # Move inode 258 (file2) into directory inode 260, with a name of
- # "hard_link", first moving inode 259 away since it currently has that
- # location and name.
- mv $MNT/A/hard_link $MNT/tmp
- mv $MNT/file2 $MNT/A/hard_link
-
- # Now rename inode 260 to something else (B for example) and then create
- # a hard link for inode 258 that has the old name and location of inode
- # 260 ("/A").
- mv $MNT/A $MNT/B
- ln $MNT/B/hard_link $MNT/A
-
- # Filesystem now looks like:
- #
- # .                        (ino 256)
- # |----- tmp               (ino 259)
- # |----- file3             (ino 259)
- # |----- B/                (ino 260)
- # |      |---- file1       (ino 257)
- # |      |---- hard_link   (ino 258)
- # |
- # |----- A                 (ino 258)
-
- # Create another snapshot of our subvolume and use it for an incremental
- # send.
- btrfs subvolume snapshot -r $MNT $MNT/snap2
- btrfs send -f /tmp/snap2.send -p $MNT/snap1 $MNT/snap2
-
- # Now unmount the filesystem, create a new one, mount it and try to
- # apply both send streams to recreate both snapshots.
- umount $DEV
-
- mkfs.btrfs -f $DEV >/dev/null
-
- mount $DEV $MNT
-
- # First add the first snapshot to the new filesystem by applying the
- # first send stream.
- btrfs receive -f /tmp/snap1.send $MNT
-
- # The incremental receive operation below used to fail with the
- # following error:
- #
- # ERROR: unlink A/hard_link failed: No such file or directory
- #
- # This is because when send is processing inode 257, it generates the
- # path for inode 260 as "/A", since that inode is its parent in the send
- # snapshot, and caches that path.
- #
- # Later, when processing inode 258, it first processes its new reference
- # that has the path of "/A", which results in orphanizing inode 260
- # because there is a path collision. This results in issuing a rename
- # operation from "/A" to "/o260-6-0".
- #
- # Finally, when processing the new reference "B/hard_link" for inode 258,
- # it notices that it collides with inode 259 (not yet processed, because
- # it has a higher inode number), since that inode has the name
- # "hard_link" under the directory inode 260. It also checks that inode
- # 259 has two hard links, so it decides to issue an unlink operation for
- # the name "hard_link" for inode 259. However, the path passed to the
- # unlink operation is "/A/hard_link", which is incorrect since currently
- # "/A" does not exist, due to the orphanization of inode 260 mentioned
- # before. The path is incorrect because it was computed and cached
- # before the orphanization. This results in the receiver failing with
- # the above error.
- btrfs receive -f /tmp/snap2.send $MNT - - umount $MNT - -When running the test, it fails like this: - - $ ./test-send-unlink.sh - Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap1' - At subvol /mnt/sdi/snap1 - Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap2' - At subvol /mnt/sdi/snap2 - At subvol snap1 - At snapshot snap2 - ERROR: unlink A/hard_link failed: No such file or directory - -Fix this by recomputing a path before issuing an unlink operation when -processing the new references for the current inode if we previously -have orphanized a directory. - -A test case for fstests will follow soon. - -Signed-off-by: Filipe Manana <fdmanana@suse.com> ---- - fs/btrfs/send.c | 11 +++++++++++ - 1 file changed, 11 insertions(+) - -diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c -index ed1310e38..f61ababf8 100644 ---- a/fs/btrfs/send.c -+++ b/fs/btrfs/send.c -@@ -4064,6 +4064,17 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) - if (ret < 0) - goto out; - } else { -+ /* -+ * If we previously orphanized a directory that -+ * collided with a new reference that we already -+ * processed, recompute the current path because -+ * that directory may be part of the path. -+ */ -+ if (orphanized_dir) { -+ ret = refresh_ref_path(sctx, cur); -+ if (ret < 0) -+ goto out; -+ } - ret = send_unlink(sctx, cur->full_path); - if (ret < 0) - goto out; --- -2.32.0 - - -From 3a07c030c466316a5f74cb9f320d7b9df985ec1c Mon Sep 17 00:00:00 2001 -From: David Sterba <dsterba () suse ! com> -Date: Fri, 11 Jun 2021 13:36:22 +0000 -Subject: [PATCH 22/22] btrfs: sysfs: export dev stats in devinfo directory - -The device stats can be read by ioctl, wrapped by command 'btrfs device -stats'. Provide another source where to read the information in -/sys/fs/btrfs/FSID/devinfo/DEVID/stats . The format is a list of -'key value' pairs one per line, which is common in other stat files. -The names are the same as used in other device stat outputs. - -The stats are all in one file as it's the snapshot of all available -stats. The 'one value per file' is not very suitable here. The stats -should be valid right after the stats item is read from disk, shortly -after initializing the device. - -In case the stats are not yet valid, print just 'invalid' as the file -contents. - -Signed-off-by: David Sterba <dsterba@suse.com> ---- - fs/btrfs/sysfs.c | 29 +++++++++++++++++++++++++++++ - 1 file changed, 29 insertions(+) - -diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c -index 9dda3feda..5c50fd77f 100644 ---- a/fs/btrfs/sysfs.c -+++ b/fs/btrfs/sysfs.c -@@ -1509,7 +1509,36 @@ static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj, - } - BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show); - -+static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj, -+ struct kobj_attribute *a, char *buf) -+{ -+ struct btrfs_device *device = container_of(kobj, struct btrfs_device, -+ devid_kobj); -+ -+ if (!device->dev_stats_valid) -+ return scnprintf(buf, PAGE_SIZE, "invalid\n"); -+ -+ /* -+ * Print all at once so we get a snapshot of all values from the same -+ * time. Keep them in sync and in order of definition of -+ * btrfs_dev_stat_values. 
-+ */ -+ return scnprintf(buf, PAGE_SIZE, -+ "write_errs %d\n" -+ "read_errs %d\n" -+ "flush_errs %d\n" -+ "corruption_errs %d\n" -+ "generation_errs %d\n", -+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_WRITE_ERRS), -+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_READ_ERRS), -+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_FLUSH_ERRS), -+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_CORRUPTION_ERRS), -+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_GENERATION_ERRS)); -+} -+BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show); -+ - static struct attribute *devid_attrs[] = { -+ BTRFS_ATTR_PTR(devid, error_stats), - BTRFS_ATTR_PTR(devid, in_fs_metadata), - BTRFS_ATTR_PTR(devid, missing), - BTRFS_ATTR_PTR(devid, replace_target), --- -2.32.0 - @@ -74,6 +74,7 @@ _major=5.12 _ckpatchversion=1 _ckpatch="patch-${_major}-ck${_ckpatchversion}" _gcc_more_v=20210610 +_patches_url="https://gitlab.com/sirlucjan/kernel-patches/-/raw/master/${_major}" arch=(x86_64) url="https://wiki.archlinux.org/index.php/Linux-ck" license=(GPL2) @@ -93,9 +94,9 @@ source=( 0005-x86-setup-always-reserve-the-first-1M-of-RAM.patch 0006-x86-setup-remove-CONFIG_X86_RESERVE_LOW-and-reservel.patch 0007-x86-crash-remove-crash_reserve_low_1M.patch - 0008-UKSM.patch - 0009-bbr2.patch - 0010-btrfs.patch + "0008-UKSM.patch::${_patches_url}/uksm-patches/0001-UKSM-for-5.12.patch" + "0009-bbr2.patch::${_patches_url}/bbr2-patches-v2/0001-bbr2-5.12-introduce-BBRv2.patch" + "0010-btrfs.patch::${_patches_url}/btrfs-patches-v13/0001-btrfs-patches.patch" "0011-block.patch::${_patches_url}/block-patches-v6/0001-block-patches.patch" "0012-bfq.patch::${_patches_url}/bfq-patches-v15/0001-bfq-patches.patch" ) |
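
Closing out the btrfs series above, here is a hedged consumer for the
per-device error_stats file added in patch 22. The key/value-per-line
format and the literal "invalid" sentinel come from that patch; the
program itself is illustrative and the default path below is a
placeholder (substitute a real filesystem UUID and devid):

  /* read_error_stats.c - parse /sys/fs/btrfs/<FSID>/devinfo/<DEVID>/error_stats */
  #include <stdio.h>
  #include <string.h>

  int main(int argc, char **argv)
  {
      const char *path = argc > 1 ? argv[1]
          : "/sys/fs/btrfs/<FSID>/devinfo/1/error_stats"; /* placeholder */
      char key[64];
      long long value;
      FILE *f = fopen(path, "r");

      if (!f) {
          perror(path);
          return 1;
      }
      /* Until the stats item has been read from disk the file prints
       * just "invalid", which a consumer has to tolerate. */
      if (fscanf(f, "%63s", key) == 1 && strcmp(key, "invalid") == 0) {
          puts("stats not yet valid");
          fclose(f);
          return 0;
      }
      rewind(f);
      /* write_errs, read_errs, flush_errs, corruption_errs,
       * generation_errs: one "key value" pair per line */
      while (fscanf(f, "%63s %lld", key, &value) == 2)
          printf("%-18s %lld\n", key, value);
      fclose(f);
      return 0;
  }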