diff options
author | Scott B | 2021-11-19 11:49:20 -0800 |
---|---|---|
committer | Scott B | 2021-11-19 11:49:37 -0800 |
commit | 09b48c99189e48476f88b254f1c477192094ce04 (patch) | |
tree | 47ca4abce8b07ac519989512ce72eae9dc9b7151 | |
parent | dc9629e44241e52ccea16daba7137098b7a71e98 (diff) | |
download | aur-09b48c99189e48476f88b254f1c477192094ce04.tar.gz |
patch: drop lru_gen v5, included in xanmod
-rw-r--r-- | .SRCINFO | 2 | ||||
-rw-r--r-- | PKGBUILD | 8 | ||||
-rw-r--r-- | squashed-mm-multigenerational-lru-v5-for-5.15.y.patch | 4992 |
3 files changed, 3 insertions, 4999 deletions
@@ -24,7 +24,6 @@ pkgbase = linux-xanmod-rog source = https://cdn.kernel.org/pub/linux/kernel/v5.x/incr/patch-5.15.2-3.xz source = revert-amd-pstate-v3.patch source = squashed-amd-pstate-v4-for-5.15.patch - source = squashed-mm-multigenerational-lru-v5-for-5.15.y.patch source = x86-ACPI-State-Optimize-C3-entry-on-AMD-CPUs.patch source = acpi-battery-Always-read-fresh-battery-state-on-update.patch source = x86-change-default-to-spec_store_bypass_disable-prct.patch @@ -54,7 +53,6 @@ pkgbase = linux-xanmod-rog sha256sums = 074f5dd036079f81fcc4a239b6d159528fa22a57e06d5658f8b5c4970de65c26 sha256sums = 12d78853f582ccf8027dbd2cbc27b0fcde106202958c6be31054815bae7752ae sha256sums = 8592fc434cc7d52b0fefe478de19d0af3d7e06406d8d09bd1bc85e6805738e68 - sha256sums = 7675116fc7da55cfb9c6ceaaf120c5ebd285a13b1f1641ca810dd05516c88e0b sha256sums = 923230ed8367e28adfdeed75d3cdba9eec6b781818c37f6f3d3eb64101d2e716 sha256sums = f7a4bf6293912bfc4a20743e58a5a266be8c4dbe3c1862d196d3a3b45f2f7c90 sha256sums = cc401107f1bf7b7d8e8a78ee594f9db4b6fa252b7239b6aa88f678aef84d935c @@ -127,8 +127,9 @@ source=("https://cdn.kernel.org/pub/linux/kernel/v${_branch}/linux-${_major}.tar "revert-amd-pstate-v3.patch" "squashed-amd-pstate-v4-for-5.15.patch" - # multigenerational lru v5 - "squashed-mm-multigenerational-lru-v5-for-5.15.y.patch" + # multigenerational lru v5 included in Xanmod + + # 5.15: k10temp support for Zen3 APUs # 5.16: don't drop shared caches on C3 state transitions "x86-ACPI-State-Optimize-C3-entry-on-AMD-CPUs.patch" @@ -136,8 +137,6 @@ source=("https://cdn.kernel.org/pub/linux/kernel/v${_branch}/linux-${_major}.tar # -- patch from Chromium developers; more accurately report battery state changes "acpi-battery-Always-read-fresh-battery-state-on-update.patch" - # 5.15: k10temp support for Zen3 APUs - # 5.16 spectre defaults "x86-change-default-to-spec_store_bypass_disable-prct.patch" @@ -184,7 +183,6 @@ sha256sums=('57b2cf6991910e3b67a1b3490022e8a0674b6965c74c12da1e99d138d1991ee8' '074f5dd036079f81fcc4a239b6d159528fa22a57e06d5658f8b5c4970de65c26' '12d78853f582ccf8027dbd2cbc27b0fcde106202958c6be31054815bae7752ae' '8592fc434cc7d52b0fefe478de19d0af3d7e06406d8d09bd1bc85e6805738e68' - '7675116fc7da55cfb9c6ceaaf120c5ebd285a13b1f1641ca810dd05516c88e0b' '923230ed8367e28adfdeed75d3cdba9eec6b781818c37f6f3d3eb64101d2e716' 'f7a4bf6293912bfc4a20743e58a5a266be8c4dbe3c1862d196d3a3b45f2f7c90' 'cc401107f1bf7b7d8e8a78ee594f9db4b6fa252b7239b6aa88f678aef84d935c' diff --git a/squashed-mm-multigenerational-lru-v5-for-5.15.y.patch b/squashed-mm-multigenerational-lru-v5-for-5.15.y.patch deleted file mode 100644 index ac999b039a6b..000000000000 --- a/squashed-mm-multigenerational-lru-v5-for-5.15.y.patch +++ /dev/null @@ -1,4992 +0,0 @@ -From 7a36e5837e0242a27b8f8b27b2ed46617678d62b Mon Sep 17 00:00:00 2001 -From: Scott B <arglebargle@arglebargle.dev> -Date: Sun, 14 Nov 2021 04:20:28 -0800 -Subject: [PATCH] squashed mm multigenerational lru v5 for 5.15.y - -Squashed commit of the following: - -commit 19e8961fca725ce8ee2643cb2ff4a818c864ee30 -Author: Yu Zhao <yuzhao@google.com> -Date: Wed Nov 10 21:15:10 2021 -0700 - - mm: multigenerational lru: documentation - - Add Documentation/vm/multigen_lru.rst. - - Signed-off-by: Yu Zhao <yuzhao@google.com> - Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> - -commit 4326538c0c4edf38c30156b2ff956bc7df448249 -Author: Yu Zhao <yuzhao@google.com> -Date: Wed Nov 10 21:15:09 2021 -0700 - - mm: multigenerational lru: Kconfig - - Add configuration options for the multigenerational lru. - - Signed-off-by: Yu Zhao <yuzhao@google.com> - Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> - -commit 1fbf643cde24bf5b7925a6dc2b6f04832519213a -Author: Yu Zhao <yuzhao@google.com> -Date: Wed Nov 10 21:15:08 2021 -0700 - - mm: multigenerational lru: user interface - - Add /sys/kernel/mm/lru_gen/enabled to enable and disable the - multigenerational lru at runtime. - - Add /sys/kernel/mm/lru_gen/min_ttl_ms to protect the working set of a - given number of milliseconds. The OOM killer is invoked if this - working set cannot be kept in memory. - - Add /sys/kernel/debug/lru_gen to monitor the multigenerational lru and - invoke the aging and the eviction. This file has the following output: - memcg memcg_id memcg_path - node node_id - min_gen birth_time anon_size file_size - ... - max_gen birth_time anon_size file_size - - min_gen is the oldest generation number and max_gen is the youngest - generation number. birth_time is in milliseconds. anon_size and - file_size are in pages. - - This file takes the following input: - + memcg_id node_id max_gen [swappiness] [use_bloom_filter] - - memcg_id node_id min_gen [swappiness] [nr_to_reclaim] - - The first command line invokes the aging, which scans PTEs for - accessed pages and then creates the next generation max_gen+1. A swap - file and a non-zero swappiness, which overrides vm.swappiness, are - required to scan PTEs mapping anon pages. The second command line - invokes the eviction, which evicts generations less than or equal to - min_gen. min_gen should be less than max_gen-1 as max_gen and - max_gen-1 are not fully aged and therefore cannot be evicted. - Setting nr_to_reclaim to N limits the number of pages to evict. - Setting use_bloom_filter to 0 overrides the default behavior which - only scans PTE tables found populated. Multiple command lines are - supported, as is concatenation with delimiters "," and ";". - - Signed-off-by: Yu Zhao <yuzhao@google.com> - Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> - -commit 273f053f193dcca266045fabef04e6083e65e7a7 -Author: Yu Zhao <yuzhao@google.com> -Date: Wed Nov 10 21:15:07 2021 -0700 - - mm: multigenerational lru: eviction - - The eviction consumes old generations. Given an lruvec, the eviction - scans pages on lrugen->lists indexed by anon and file min_seq[] - (modulo MAX_NR_GENS). It first tries to select a type based on the - values of min_seq[]. If they are equal, it selects the type that has - a lower refaulted %. The eviction sorts a page according to its - updated generation number if the aging has found this page accessed. - It also moves a page to the next generation if this page is from an - upper tier that has a higher refaulted % than the base tier. The - eviction increments min_seq[] of a selected type when it finds - lrugen->lists indexed by min_seq[] of this selected type are empty. - - Each generation is divided into multiple tiers. Tiers represent - different ranges of numbers of accesses from file descriptors only. - Pages accessed N times via file descriptors belong to tier - order_base_2(N). Each generation contains at most MAX_NR_TIERS tiers, - and they require additional MAX_NR_TIERS-2 bits in page->flags. In - contrast to moving between generations which requires list operations, - moving between tiers only involves operations on page->flags and - therefore has a negligible cost. A feedback loop modeled after the PID - controller monitors refaulted % across all tiers and decides when to - protect pages from which tiers. - - Unmapped pages are initially added to the oldest generation and then - conditionally protected by tiers. Each tier keeps track of how many - pages from it have refaulted. Tier 0 is the base tier and pages from - it are evicted unconditionally because there are no better candidates. - Pages from an upper tier are either evicted or moved to the next - generation, depending on whether this upper tier has a higher - refaulted % than the base tier. This model has the following - advantages: - 1) It removes the cost in the buffered access path and reduces the - overall cost of protection because pages are conditionally protected - in the reclaim path. - 2) It takes mapped pages into account and avoids overprotecting - pages accessed multiple times via file descriptors. - 3 Additional tiers improve the protection of pages accessed more - than twice. - - Signed-off-by: Yu Zhao <yuzhao@google.com> - Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> - -commit b8ff3328e2852e88df92042311f4169d231b8628 -Author: Yu Zhao <yuzhao@google.com> -Date: Wed Nov 10 21:15:06 2021 -0700 - - mm: multigenerational lru: aging - - The aging produces young generations. Given an lruvec, the aging - traverses lruvec_memcg()->mm_list and calls walk_page_range() to scan - PTEs for accessed pages. Upon finding one, the aging updates its - generation number to max_seq (modulo MAX_NR_GENS). After each round of - traversal, the aging increments max_seq. The aging is due when - min_seq[] reaches max_seq-1. - - The aging uses the following optimizations when walking page tables: - 1) It skips non-leaf PMD entries that have the accessed bit cleared - when CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y. - 2) It does not zigzag between a PGD table and the same PMD or PTE - table spanning multiple VMAs. In other words, it finishes all the - VMAs within the range of the same PMD or PTE table before it returns - to this PGD table. This optimizes workloads that have large numbers - of tiny VMAs, especially when CONFIG_PGTABLE_LEVELS=5. - - Signed-off-by: Yu Zhao <yuzhao@google.com> - Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> - -commit 18f292b52b7de4353ea1083a9f985fa69381ada8 -Author: Yu Zhao <yuzhao@google.com> -Date: Wed Nov 10 21:15:05 2021 -0700 - - mm: multigenerational lru: mm_struct list - - To scan PTEs for accessed pages, a mm_struct list is maintained for - each memcg. When multiple threads traverse the same memcg->mm_list, - each of them gets a unique mm_struct and therefore they can run - walk_page_range() concurrently to reach page tables of all processes - of this memcg. - - This infrastructure also provides the following optimizations: - 1) it allows walkers to skip processes that have been sleeping since - the last walk by tracking the usage of mm_struct between context - switches. - 2) it allows walkers to add interesting items they find during a - walk to a Bloom filter so that they can skip uninteresting items - during the next walk by testing whether an item is in this Bloom - filter. - - Signed-off-by: Yu Zhao <yuzhao@google.com> - Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> - -commit 089e38620dad513b66f01490392cb9dd40667853 -Author: Yu Zhao <yuzhao@google.com> -Date: Wed Nov 10 21:15:04 2021 -0700 - - mm: multigenerational lru: groundwork - - For each lruvec, evictable pages are divided into multiple - generations. The youngest generation number is stored in - lrugen->max_seq for both anon and file types as they are aged on an - equal footing. The oldest generation numbers are stored in - lrugen->min_seq[] separately for anon and file types as clean file - pages can be evicted regardless of swap constraints. These three - variables are monotonically increasing. Generation numbers are - truncated into order_base_2(MAX_NR_GENS+1) bits in order to fit into - page->flags. The sliding window technique is used to prevent truncated - generation numbers from overlapping. Each truncated generation number - is an index to - lrugen->lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]. - - The framework comprises two conceptually independent components: the - aging, which produces young generations, and the eviction, which - consumes old generations. Both can be invoked independently from user - space for the purpose of working set estimation and proactive reclaim. - - The protection of hot pages and the selection of cold pages are based - on page access types and patterns. There are two access types: one via - page tables and the other via file descriptors. The protection of the - former type is by design stronger because: - 1) The uncertainty in determining the access patterns of the former - type is higher due to the coalesced nature of the accessed bit. - 2) The cost of evicting the former type is higher due to the TLB - flushes required and the likelihood of involving I/O. - 3) The penalty of under-protecting the former type is higher because - applications usually do not prepare themselves for major faults like - they do for blocked I/O. For example, client applications commonly - dedicate blocked I/O to separate threads to avoid UI janks that - negatively affect user experience. - - There are also two access patterns: one with temporal locality and the - other without. The latter pattern, e.g., random and sequential, needs - to be explicitly excluded to avoid weakening the protection of the - former pattern. Generally the former type follows the former pattern - unless MADV_SEQUENTIAL is specified and the latter type follows the - latter pattern unless outlying refaults have been observed. - - Upon faulting, a page is added to the youngest generation, which - provides the strongest protection as the eviction will not consider - this page before the aging has scanned it at least twice. The first - scan clears the accessed bit set during the initial fault. And the - second scan makes sure this page has not been used since the first - scan. A page from any other generations is brought back to the - youngest generation whenever the aging finds the accessed bit set on - any of the PTEs mapping this page. - - Unmapped pages are initially added to the oldest generation and then - conditionally protected by tiers. This is done later [PATCH 07/10]. - - Signed-off-by: Yu Zhao <yuzhao@google.com> - Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> - -commit ece1399ffe2c76ce020e7041c4790f1ace13f77c -Author: Yu Zhao <yuzhao@google.com> -Date: Wed Nov 10 21:15:03 2021 -0700 - - mm/vmscan.c: refactor shrink_node() - - This patch refactors shrink_node(). This will make the upcoming - changes to mm/vmscan.c more readable. - - Signed-off-by: Yu Zhao <yuzhao@google.com> - Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> - -commit 103c3bb04169ccac581e524ca267961377e474fb -Author: Yu Zhao <yuzhao@google.com> -Date: Wed Nov 10 21:15:02 2021 -0700 - - mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG - - Some architectures support the accessed bit on non-leaf PMD entries, - e.g., x86_64 sets the accessed bit on a non-leaf PMD entry when using - it as part of linear address translation [1]. As an optimization, page - table walkers who are interested in the accessed bit can skip the PTEs - under a non-leaf PMD entry if the accessed bit is cleared on this PMD - entry. - - Although an inline function may be preferable, this capability is - added as a configuration option to look consistent when used with the - existing macros. - - [1]: Intel 64 and IA-32 Architectures Software Developer's Manual - Volume 3 (June 2021), section 4.8 - - Signed-off-by: Yu Zhao <yuzhao@google.com> - Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> - -commit fa446f371f7398741969012da6364558fe209d9a -Author: Yu Zhao <yuzhao@google.com> -Date: Wed Nov 10 21:15:01 2021 -0700 - - mm: x86, arm64: add arch_has_hw_pte_young() - - Some architectures automatically set the accessed bit in PTEs, e.g., - x86 and arm64 v8.2. On architectures that do not have this capability, - clearing the accessed bit in a PTE triggers a page fault following the - TLB miss of this PTE. - - Being aware of this capability can help make better decisions, i.e., - whether to limit the size of each batch of PTEs and the burst of - batches when clearing the accessed bit. - - Signed-off-by: Yu Zhao <yuzhao@google.com> - Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> ---- - Documentation/vm/index.rst | 1 + - Documentation/vm/multigen_lru.rst | 132 ++ - arch/Kconfig | 9 + - arch/arm64/include/asm/cpufeature.h | 5 + - arch/arm64/include/asm/pgtable.h | 13 +- - arch/arm64/kernel/cpufeature.c | 10 + - arch/arm64/tools/cpucaps | 1 + - arch/x86/Kconfig | 1 + - arch/x86/include/asm/pgtable.h | 9 +- - arch/x86/mm/pgtable.c | 5 +- - fs/exec.c | 2 + - fs/fuse/dev.c | 3 +- - include/linux/cgroup.h | 15 +- - include/linux/memcontrol.h | 7 + - include/linux/mm.h | 36 + - include/linux/mm_inline.h | 198 ++ - include/linux/mm_types.h | 106 ++ - include/linux/mmzone.h | 175 ++ - include/linux/nodemask.h | 1 + - include/linux/oom.h | 16 + - include/linux/page-flags-layout.h | 19 +- - include/linux/page-flags.h | 4 +- - include/linux/pgtable.h | 17 +- - include/linux/sched.h | 3 + - include/linux/swap.h | 3 + - kernel/bounds.c | 3 + - kernel/cgroup/cgroup-internal.h | 1 - - kernel/exit.c | 1 + - kernel/fork.c | 10 + - kernel/kthread.c | 1 + - kernel/sched/core.c | 2 + - mm/Kconfig | 59 + - mm/huge_memory.c | 3 +- - mm/memcontrol.c | 31 + - mm/memory.c | 21 +- - mm/mm_init.c | 6 +- - mm/oom_kill.c | 4 +- - mm/page_alloc.c | 1 + - mm/rmap.c | 8 + - mm/swap.c | 51 +- - mm/swapfile.c | 2 + - mm/vmscan.c | 2697 ++++++++++++++++++++++++++- - mm/workingset.c | 120 +- - 43 files changed, 3679 insertions(+), 133 deletions(-) - create mode 100644 Documentation/vm/multigen_lru.rst - -diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst -index b51f0d8992f8..779772a025a0 100644 ---- a/Documentation/vm/index.rst -+++ b/Documentation/vm/index.rst -@@ -17,6 +17,7 @@ various features of the Linux memory management - - swap_numa - zswap -+ multigen_lru - - Kernel developers MM documentation - ================================== -diff --git a/Documentation/vm/multigen_lru.rst b/Documentation/vm/multigen_lru.rst -new file mode 100644 -index 000000000000..7c064a378b85 ---- /dev/null -+++ b/Documentation/vm/multigen_lru.rst -@@ -0,0 +1,132 @@ -+.. SPDX-License-Identifier: GPL-2.0 -+ -+===================== -+Multigenerational LRU -+===================== -+ -+Quick Start -+=========== -+Build Configurations -+-------------------- -+:Required: Set ``CONFIG_LRU_GEN=y``. -+ -+:Optional: Set ``CONFIG_LRU_GEN_ENABLED=y`` to turn the feature on by -+ default. -+ -+Runtime Configurations -+---------------------- -+:Required: Write ``1`` to ``/sys/kernel/mm/lru_gen/enable`` if the -+ feature was not turned on by default. -+ -+:Optional: Write ``N`` to ``/sys/kernel/mm/lru_gen/min_ttl_ms`` to -+ protect the working set of ``N`` milliseconds. The OOM killer is -+ invoked if this working set cannot be kept in memory. -+ -+:Optional: Read ``/sys/kernel/debug/lru_gen`` to confirm the feature -+ is turned on. This file has the following output: -+ -+:: -+ -+ memcg memcg_id memcg_path -+ node node_id -+ min_gen birth_time anon_size file_size -+ ... -+ max_gen birth_time anon_size file_size -+ -+``min_gen`` is the oldest generation number and ``max_gen`` is the -+youngest generation number. ``birth_time`` is in milliseconds. -+``anon_size`` and ``file_size`` are in pages. -+ -+Phones/Laptops/Workstations -+--------------------------- -+No additional configurations required. -+ -+Servers/Data Centers -+-------------------- -+:To support more generations: Change ``CONFIG_NR_LRU_GENS`` to a -+ larger number. -+ -+:To support more tiers: Change ``CONFIG_TIERS_PER_GEN`` to a larger -+ number. -+ -+:To support full stats: Set ``CONFIG_LRU_GEN_STATS=y``. -+ -+:Working set estimation: Write ``+ memcg_id node_id max_gen -+ [swappiness] [use_bloom_filter]`` to ``/sys/kernel/debug/lru_gen`` to -+ invoke the aging, which scans PTEs for accessed pages and then -+ creates the next generation ``max_gen+1``. A swap file and a non-zero -+ ``swappiness``, which overrides ``vm.swappiness``, are required to -+ scan PTEs mapping anon pages. Set ``use_bloom_filter`` to 0 to -+ override the default behavior which only scans PTE tables found -+ populated. -+ -+:Proactive reclaim: Write ``- memcg_id node_id min_gen [swappiness] -+ [nr_to_reclaim]`` to ``/sys/kernel/debug/lru_gen`` to invoke the -+ eviction, which evicts generations less than or equal to ``min_gen``. -+ ``min_gen`` should be less than ``max_gen-1`` as ``max_gen`` and -+ ``max_gen-1`` are not fully aged and therefore cannot be evicted. -+ Use ``nr_to_reclaim`` to limit the number of pages to evict. Multiple -+ command lines are supported, so does concatenation with delimiters -+ ``,`` and ``;``. -+ -+Framework -+========= -+For each ``lruvec``, evictable pages are divided into multiple -+generations. The youngest generation number is stored in -+``lrugen->max_seq`` for both anon and file types as they are aged on -+an equal footing. The oldest generation numbers are stored in -+``lrugen->min_seq[]`` separately for anon and file types as clean -+file pages can be evicted regardless of swap and writeback -+constraints. These three variables are monotonically increasing. -+Generation numbers are truncated into -+``order_base_2(CONFIG_NR_LRU_GENS+1)`` bits in order to fit into -+``page->flags``. The sliding window technique is used to prevent -+truncated generation numbers from overlapping. Each truncated -+generation number is an index to an array of per-type and per-zone -+lists ``lrugen->lists``. -+ -+Each generation is divided into multiple tiers. Tiers represent -+different ranges of numbers of accesses from file descriptors only. -+Pages accessed ``N`` times via file descriptors belong to tier -+``order_base_2(N)``. Each generation contains at most -+``CONFIG_TIERS_PER_GEN`` tiers, and they require additional -+``CONFIG_TIERS_PER_GEN-2`` bits in ``page->flags``. In contrast to -+moving between generations which requires list operations, moving -+between tiers only involves operations on ``page->flags`` and -+therefore has a negligible cost. A feedback loop modeled after the PID -+controller monitors refaulted % across all tiers and decides when to -+protect pages from which tiers. -+ -+The framework comprises two conceptually independent components: the -+aging and the eviction, which can be invoked separately from user -+space for the purpose of working set estimation and proactive reclaim. -+ -+Aging -+----- -+The aging produces young generations. Given an ``lruvec``, the aging -+traverses ``lruvec_memcg()->mm_list`` and calls ``walk_page_range()`` -+to scan PTEs for accessed pages (a ``mm_struct`` list is maintained -+for each ``memcg``). Upon finding one, the aging updates its -+generation number to ``max_seq`` (modulo ``CONFIG_NR_LRU_GENS``). -+After each round of traversal, the aging increments ``max_seq``. The -+aging is due when ``min_seq[]`` reaches ``max_seq-1``. -+ -+Eviction -+-------- -+The eviction consumes old generations. Given an ``lruvec``, the -+eviction scans pages on the per-zone lists indexed by anon and file -+``min_seq[]`` (modulo ``CONFIG_NR_LRU_GENS``). It first tries to -+select a type based on the values of ``min_seq[]``. If they are -+equal, it selects the type that has a lower refaulted %. The eviction -+sorts a page according to its updated generation number if the aging -+has found this page accessed. It also moves a page to the next -+generation if this page is from an upper tier that has a higher -+refaulted % than the base tier. The eviction increments ``min_seq[]`` -+of a selected type when it finds all the per-zone lists indexed by -+``min_seq[]`` of this selected type are empty. -+ -+To-do List -+========== -+KVM Optimization -+---------------- -+Support shadow page table walk. -diff --git a/arch/Kconfig b/arch/Kconfig -index 8df1c7102643..7392fcc88777 100644 ---- a/arch/Kconfig -+++ b/arch/Kconfig -@@ -1288,6 +1288,15 @@ config ARCH_HAS_ELFCORE_COMPAT - config ARCH_HAS_PARANOID_L1D_FLUSH - bool - -+config ARCH_HAS_NONLEAF_PMD_YOUNG -+ bool -+ depends on PGTABLE_LEVELS > 2 -+ help -+ Architectures that select this are able to set the accessed bit on -+ non-leaf PMD entries in addition to leaf PTE entries where pages are -+ mapped. For them, page table walkers that clear the accessed bit may -+ stop at non-leaf PMD entries if they do not see the accessed bit. -+ - source "kernel/gcov/Kconfig" - - source "scripts/gcc-plugins/Kconfig" -diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h -index ef6be92b1921..99518b4b2a9e 100644 ---- a/arch/arm64/include/asm/cpufeature.h -+++ b/arch/arm64/include/asm/cpufeature.h -@@ -779,6 +779,11 @@ static inline bool system_supports_tlb_range(void) - cpus_have_const_cap(ARM64_HAS_TLB_RANGE); - } - -+static inline bool system_has_hw_af(void) -+{ -+ return IS_ENABLED(CONFIG_ARM64_HW_AFDBM) && cpus_have_const_cap(ARM64_HW_AF); -+} -+ - extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); - - static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange) -diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h -index dfa76afa0ccf..880ae658a69a 100644 ---- a/arch/arm64/include/asm/pgtable.h -+++ b/arch/arm64/include/asm/pgtable.h -@@ -993,13 +993,16 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, - * page after fork() + CoW for pfn mappings. We don't always have a - * hardware-managed access flag on arm64. - */ --static inline bool arch_faults_on_old_pte(void) -+static inline bool arch_has_hw_pte_young(bool local) - { -- WARN_ON(preemptible()); -+ if (local) { -+ WARN_ON(preemptible()); -+ return cpu_has_hw_af(); -+ } - -- return !cpu_has_hw_af(); -+ return system_has_hw_af(); - } --#define arch_faults_on_old_pte arch_faults_on_old_pte -+#define arch_has_hw_pte_young arch_has_hw_pte_young - - /* - * Experimentally, it's cheap to set the access flag in hardware and we -@@ -1007,7 +1010,7 @@ static inline bool arch_faults_on_old_pte(void) - */ - static inline bool arch_wants_old_prefaulted_pte(void) - { -- return !arch_faults_on_old_pte(); -+ return arch_has_hw_pte_young(true); - } - #define arch_wants_old_prefaulted_pte arch_wants_old_prefaulted_pte - -diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c -index 6ec7036ef7e1..940615d33845 100644 ---- a/arch/arm64/kernel/cpufeature.c -+++ b/arch/arm64/kernel/cpufeature.c -@@ -2157,6 +2157,16 @@ static const struct arm64_cpu_capabilities arm64_features[] = { - .matches = has_hw_dbm, - .cpu_enable = cpu_enable_hw_dbm, - }, -+ { -+ .desc = "Hardware update of the Access flag", -+ .type = ARM64_CPUCAP_SYSTEM_FEATURE, -+ .capability = ARM64_HW_AF, -+ .sys_reg = SYS_ID_AA64MMFR1_EL1, -+ .sign = FTR_UNSIGNED, -+ .field_pos = ID_AA64MMFR1_HADBS_SHIFT, -+ .min_field_value = 1, -+ .matches = has_cpuid_feature, -+ }, - #endif - { - .desc = "CRC32 instructions", -diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps -index 49305c2e6dfd..d52f50671e60 100644 ---- a/arch/arm64/tools/cpucaps -+++ b/arch/arm64/tools/cpucaps -@@ -35,6 +35,7 @@ HAS_STAGE2_FWB - HAS_SYSREG_GIC_CPUIF - HAS_TLB_RANGE - HAS_VIRT_HOST_EXTN -+HW_AF - HW_DBM - KVM_PROTECTED_MODE - MISMATCHED_CACHE_TYPE -diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig -index d9830e7e1060..245742b79be9 100644 ---- a/arch/x86/Kconfig -+++ b/arch/x86/Kconfig -@@ -84,6 +84,7 @@ config X86 - select ARCH_HAS_PMEM_API if X86_64 - select ARCH_HAS_PTE_DEVMAP if X86_64 - select ARCH_HAS_PTE_SPECIAL -+ select ARCH_HAS_NONLEAF_PMD_YOUNG if X86_64 - select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 - select ARCH_HAS_COPY_MC if X86_64 - select ARCH_HAS_SET_MEMORY -diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h -index 448cd01eb3ec..36205ec0acac 100644 ---- a/arch/x86/include/asm/pgtable.h -+++ b/arch/x86/include/asm/pgtable.h -@@ -817,7 +817,8 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) - - static inline int pmd_bad(pmd_t pmd) - { -- return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; -+ return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) != -+ (_KERNPG_TABLE & ~_PAGE_ACCESSED); - } - - static inline unsigned long pages_to_mb(unsigned long npg) -@@ -1397,10 +1398,10 @@ static inline bool arch_has_pfn_modify_check(void) - return boot_cpu_has_bug(X86_BUG_L1TF); - } - --#define arch_faults_on_old_pte arch_faults_on_old_pte --static inline bool arch_faults_on_old_pte(void) -+#define arch_has_hw_pte_young arch_has_hw_pte_young -+static inline bool arch_has_hw_pte_young(bool local) - { -- return false; -+ return true; - } - - #endif /* __ASSEMBLY__ */ -diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c -index 3481b35cb4ec..a224193d84bf 100644 ---- a/arch/x86/mm/pgtable.c -+++ b/arch/x86/mm/pgtable.c -@@ -550,7 +550,7 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma, - return ret; - } - --#ifdef CONFIG_TRANSPARENT_HUGEPAGE -+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) - int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp) - { -@@ -562,6 +562,9 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma, - - return ret; - } -+#endif -+ -+#ifdef CONFIG_TRANSPARENT_HUGEPAGE - int pudp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long addr, pud_t *pudp) - { -diff --git a/fs/exec.c b/fs/exec.c -index a098c133d8d7..c7e55b757e87 100644 ---- a/fs/exec.c -+++ b/fs/exec.c -@@ -1007,6 +1007,7 @@ static int exec_mmap(struct mm_struct *mm) - active_mm = tsk->active_mm; - tsk->active_mm = mm; - tsk->mm = mm; -+ lru_gen_add_mm(mm); - /* - * This prevents preemption while active_mm is being loaded and - * it and mm are being updated, which could cause problems for -@@ -1017,6 +1018,7 @@ static int exec_mmap(struct mm_struct *mm) - if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) - local_irq_enable(); - activate_mm(active_mm, mm); -+ lru_gen_switch_mm(active_mm, mm); - if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) - local_irq_enable(); - tsk->mm->vmacache_seqnum = 0; -diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c -index dde341a6388a..33a2fda7462e 100644 ---- a/fs/fuse/dev.c -+++ b/fs/fuse/dev.c -@@ -785,7 +785,8 @@ static int fuse_check_page(struct page *page) - 1 << PG_active | - 1 << PG_workingset | - 1 << PG_reclaim | -- 1 << PG_waiters))) { -+ 1 << PG_waiters | -+ LRU_GEN_MASK | LRU_REFS_MASK))) { - dump_page(page, "fuse: trying to steal weird page"); - return 1; - } -diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h -index 75c151413fda..b145025f3eac 100644 ---- a/include/linux/cgroup.h -+++ b/include/linux/cgroup.h -@@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgroup *cgrp) - css_put(&cgrp->self); - } - -+extern struct mutex cgroup_mutex; -+ -+static inline void cgroup_lock(void) -+{ -+ mutex_lock(&cgroup_mutex); -+} -+ -+static inline void cgroup_unlock(void) -+{ -+ mutex_unlock(&cgroup_mutex); -+} -+ - /** - * task_css_set_check - obtain a task's css_set with extra access conditions - * @task: the task to obtain css_set for -@@ -446,7 +458,6 @@ static inline void cgroup_put(struct cgroup *cgrp) - * as locks used during the cgroup_subsys::attach() methods. - */ - #ifdef CONFIG_PROVE_RCU --extern struct mutex cgroup_mutex; - extern spinlock_t css_set_lock; - #define task_css_set_check(task, __c) \ - rcu_dereference_check((task)->cgroups, \ -@@ -707,6 +718,8 @@ struct cgroup; - static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; } - static inline void css_get(struct cgroup_subsys_state *css) {} - static inline void css_put(struct cgroup_subsys_state *css) {} -+static inline void cgroup_lock(void) {} -+static inline void cgroup_unlock(void) {} - static inline int cgroup_attach_task_all(struct task_struct *from, - struct task_struct *t) { return 0; } - static inline int cgroupstats_build(struct cgroupstats *stats, -diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h -index 3096c9a0ee01..e284a4aa1bd8 100644 ---- a/include/linux/memcontrol.h -+++ b/include/linux/memcontrol.h -@@ -347,6 +347,10 @@ struct mem_cgroup { - struct deferred_split deferred_split_queue; - #endif - -+#ifdef CONFIG_LRU_GEN -+ struct lru_gen_mm_list mm_list; -+#endif -+ - struct mem_cgroup_per_node *nodeinfo[]; - }; - -@@ -1350,10 +1354,13 @@ mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) - - static inline void lock_page_memcg(struct page *page) - { -+ /* to match page_memcg_rcu() */ -+ rcu_read_lock(); - } - - static inline void unlock_page_memcg(struct page *page) - { -+ rcu_read_unlock(); - } - - static inline void mem_cgroup_handle_over_high(void) -diff --git a/include/linux/mm.h b/include/linux/mm.h -index 73a52aba448f..9d3bcc30dfda 100644 ---- a/include/linux/mm.h -+++ b/include/linux/mm.h -@@ -1093,6 +1093,8 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); - #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) - #define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) - #define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH) -+#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH) -+#define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH) - - /* - * Define the bit shifts to access each section. For non-existent -@@ -1800,6 +1802,40 @@ static inline void unmap_mapping_range(struct address_space *mapping, - loff_t const holebegin, loff_t const holelen, int even_cows) { } - #endif - -+#ifdef CONFIG_LRU_GEN -+static inline void task_enter_nonseq_fault(void) -+{ -+ WARN_ON(current->in_nonseq_fault); -+ -+ current->in_nonseq_fault = 1; -+} -+ -+static inline void task_exit_nonseq_fault(void) -+{ -+ WARN_ON(!current->in_nonseq_fault); -+ -+ current->in_nonseq_fault = 0; -+} -+ -+static inline bool task_in_nonseq_fault(void) -+{ -+ return current->in_nonseq_fault; -+} -+#else -+static inline void task_enter_nonseq_fault(void) -+{ -+} -+ -+static inline void task_exit_nonseq_fault(void) -+{ -+} -+ -+static inline bool task_in_nonseq_fault(void) -+{ -+ return false; -+} -+#endif /* CONFIG_LRU_GEN */ -+ - static inline void unmap_shared_mapping_range(struct address_space *mapping, - loff_t const holebegin, loff_t const holelen) - { -diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h -index 355ea1ee32bd..7d520f45e612 100644 ---- a/include/linux/mm_inline.h -+++ b/include/linux/mm_inline.h -@@ -79,11 +79,203 @@ static __always_inline enum lru_list page_lru(struct page *page) - return lru; - } - -+#ifdef CONFIG_LRU_GEN -+ -+static inline bool lru_gen_enabled(void) -+{ -+#ifdef CONFIG_LRU_GEN_ENABLED -+ DECLARE_STATIC_KEY_TRUE(lru_gen_static_key); -+ -+ return static_branch_likely(&lru_gen_static_key); -+#else -+ DECLARE_STATIC_KEY_FALSE(lru_gen_static_key); -+ -+ return static_branch_unlikely(&lru_gen_static_key); -+#endif -+} -+ -+/* Return an index within the sliding window that tracks MAX_NR_GENS generations. */ -+static inline int lru_gen_from_seq(unsigned long seq) -+{ -+ return seq % MAX_NR_GENS; -+} -+ -+/* Return a proper index regardless whether we keep stats for historical generations. */ -+static inline int lru_hist_from_seq(unsigned long seq) -+{ -+ return seq % NR_HIST_GENS; -+} -+ -+/* Convert the number of accesses to a tier. See the comment on MAX_NR_TIERS. */ -+static inline int lru_tier_from_refs(int refs) -+{ -+ VM_BUG_ON(refs > BIT(LRU_REFS_WIDTH)); -+ -+ return order_base_2(refs + 1); -+} -+ -+/* The youngest and the second youngest generations are counted as active. */ -+static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) -+{ -+ unsigned long max_seq = lruvec->evictable.max_seq; -+ -+ VM_BUG_ON(gen >= MAX_NR_GENS); -+ -+ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); -+} -+ -+/* Update the sizes of the multigenerational lru lists. */ -+static inline void lru_gen_update_size(struct page *page, struct lruvec *lruvec, -+ int old_gen, int new_gen) -+{ -+ int type = page_is_file_lru(page); -+ int zone = page_zonenum(page); -+ int delta = thp_nr_pages(page); -+ enum lru_list lru = type * LRU_FILE; -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ lockdep_assert_held(&lruvec->lru_lock); -+ VM_BUG_ON(old_gen != -1 && old_gen >= MAX_NR_GENS); -+ VM_BUG_ON(new_gen != -1 && new_gen >= MAX_NR_GENS); -+ VM_BUG_ON(old_gen == -1 && new_gen == -1); -+ -+ if (old_gen >= 0) -+ WRITE_ONCE(lrugen->sizes[old_gen][type][zone], -+ lrugen->sizes[old_gen][type][zone] - delta); -+ if (new_gen >= 0) -+ WRITE_ONCE(lrugen->sizes[new_gen][type][zone], -+ lrugen->sizes[new_gen][type][zone] + delta); -+ -+ if (old_gen < 0) { -+ if (lru_gen_is_active(lruvec, new_gen)) -+ lru += LRU_ACTIVE; -+ update_lru_size(lruvec, lru, zone, delta); -+ return; -+ } -+ -+ if (new_gen < 0) { -+ if (lru_gen_is_active(lruvec, old_gen)) -+ lru += LRU_ACTIVE; -+ update_lru_size(lruvec, lru, zone, -delta); -+ return; -+ } -+ -+ if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) { -+ update_lru_size(lruvec, lru, zone, -delta); -+ update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); -+ } -+ -+ VM_BUG_ON(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); -+} -+ -+/* Add a page to one of the multigenerational lru lists. Return true on success. */ -+static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming) -+{ -+ int gen; -+ unsigned long old_flags, new_flags; -+ int type = page_is_file_lru(page); -+ int zone = page_zonenum(page); -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ if (PageUnevictable(page) || !lrugen->enabled[type]) -+ return false; -+ /* -+ * If a page shouldn't be considered for eviction, i.e., a page mapped -+ * upon fault during which the accessed bit is set, add it to the -+ * youngest generation. -+ * -+ * If a page can't be evicted immediately, i.e., an anon page not in -+ * swap cache or a dirty page pending writeback, add it to the second -+ * oldest generation. -+ * -+ * If a page could be evicted immediately, e.g., a clean page, add it to -+ * the oldest generation. -+ */ -+ if (PageActive(page)) -+ gen = lru_gen_from_seq(lrugen->max_seq); -+ else if ((!type && !PageSwapCache(page)) || -+ (PageReclaim(page) && (PageDirty(page) || PageWriteback(page)))) -+ gen = lru_gen_from_seq(lrugen->min_seq[type] + 1); -+ else -+ gen = lru_gen_from_seq(lrugen->min_seq[type]); -+ -+ do { -+ new_flags = old_flags = READ_ONCE(page->flags); -+ VM_BUG_ON_PAGE(new_flags & LRU_GEN_MASK, page); -+ -+ new_flags &= ~(LRU_GEN_MASK | BIT(PG_active)); -+ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; -+ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags); -+ -+ lru_gen_update_size(page, lruvec, -1, gen); -+ /* for rotate_reclaimable_page() */ -+ if (reclaiming) -+ list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]); -+ else -+ list_add(&page->lru, &lrugen->lists[gen][type][zone]); -+ -+ return true; -+} -+ -+/* Delete a page from one of the multigenerational lru lists. Return true on success. */ -+static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming) -+{ -+ int gen; -+ unsigned long old_flags, new_flags; -+ -+ do { -+ new_flags = old_flags = READ_ONCE(page->flags); -+ if (!(new_flags & LRU_GEN_MASK)) -+ return false; -+ -+ VM_BUG_ON_PAGE(PageActive(page), page); -+ VM_BUG_ON_PAGE(PageUnevictable(page), page); -+ -+ gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; -+ -+ new_flags &= ~LRU_GEN_MASK; -+ if ((new_flags & LRU_REFS_FLAGS) != LRU_REFS_FLAGS) -+ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS); -+ /* for shrink_page_list() */ -+ if (reclaiming) -+ new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim)); -+ else if (lru_gen_is_active(lruvec, gen)) -+ new_flags |= BIT(PG_active); -+ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags); -+ -+ lru_gen_update_size(page, lruvec, gen, -1); -+ list_del(&page->lru); -+ -+ return true; -+} -+ -+#else -+ -+static inline bool lru_gen_enabled(void) -+{ -+ return false; -+} -+ -+static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming) -+{ -+ return false; -+} -+ -+static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming) -+{ -+ return false; -+} -+ -+#endif /* CONFIG_LRU_GEN */ -+ - static __always_inline void add_page_to_lru_list(struct page *page, - struct lruvec *lruvec) - { - enum lru_list lru = page_lru(page); - -+ if (lru_gen_add_page(page, lruvec, false)) -+ return; -+ - update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); - list_add(&page->lru, &lruvec->lists[lru]); - } -@@ -93,6 +285,9 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page, - { - enum lru_list lru = page_lru(page); - -+ if (lru_gen_add_page(page, lruvec, true)) -+ return; -+ - update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); - list_add_tail(&page->lru, &lruvec->lists[lru]); - } -@@ -100,6 +295,9 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page, - static __always_inline void del_page_from_lru_list(struct page *page, - struct lruvec *lruvec) - { -+ if (lru_gen_del_page(page, lruvec, false)) -+ return; -+ - list_del(&page->lru); - update_lru_size(lruvec, page_lru(page), page_zonenum(page), - -thp_nr_pages(page)); -diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h -index 7f8ee09c711f..a6ca0607c549 100644 ---- a/include/linux/mm_types.h -+++ b/include/linux/mm_types.h -@@ -15,6 +15,8 @@ - #include <linux/page-flags-layout.h> - #include <linux/workqueue.h> - #include <linux/seqlock.h> -+#include <linux/nodemask.h> -+#include <linux/mmdebug.h> - - #include <asm/mmu.h> - -@@ -580,6 +582,22 @@ struct mm_struct { - #ifdef CONFIG_IOMMU_SUPPORT - u32 pasid; - #endif -+#ifdef CONFIG_LRU_GEN -+ struct { -+ /* the node of a global or per-memcg mm_struct list */ -+ struct list_head list; -+#ifdef CONFIG_MEMCG -+ /* points to the memcg of the owner task above */ -+ struct mem_cgroup *memcg; -+#endif -+ /* whether this mm_struct has been used since the last walk */ -+ nodemask_t nodes; -+#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH -+ /* the number of CPUs using this mm_struct */ -+ atomic_t nr_cpus; -+#endif -+ } lrugen; -+#endif /* CONFIG_LRU_GEN */ - } __randomize_layout; - - /* -@@ -606,6 +624,94 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) - return (struct cpumask *)&mm->cpu_bitmap; - } - -+#ifdef CONFIG_LRU_GEN -+ -+struct lru_gen_mm_list { -+ /* a global or per-memcg mm_struct list */ -+ struct list_head fifo; -+ /* protects the list above */ -+ spinlock_t lock; -+}; -+ -+void lru_gen_add_mm(struct mm_struct *mm); -+void lru_gen_del_mm(struct mm_struct *mm); -+#ifdef CONFIG_MEMCG -+void lru_gen_migrate_mm(struct mm_struct *mm); -+#endif -+ -+static inline void lru_gen_init_mm(struct mm_struct *mm) -+{ -+ INIT_LIST_HEAD(&mm->lrugen.list); -+#ifdef CONFIG_MEMCG -+ mm->lrugen.memcg = NULL; -+#endif -+#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH -+ atomic_set(&mm->lrugen.nr_cpus, 0); -+#endif -+ nodes_clear(mm->lrugen.nodes); -+} -+ -+/* Track the usage of each mm_struct so that we can skip inactive ones. */ -+static inline void lru_gen_switch_mm(struct mm_struct *old, struct mm_struct *new) -+{ -+ /* exclude init_mm, efi_mm, etc. */ -+ if (!core_kernel_data((unsigned long)old)) { -+ nodes_setall(old->lrugen.nodes); -+#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH -+ atomic_dec(&old->lrugen.nr_cpus); -+#endif -+ } -+ -+ if (!core_kernel_data((unsigned long)new)) { -+ /* unlikely but not a bug when racing with lru_gen_migrate_mm() */ -+ VM_WARN_ON(list_empty(&new->lrugen.list)); -+#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH -+ atomic_inc(&new->lrugen.nr_cpus); -+#endif -+ } -+} -+ -+/* Return whether this mm_struct is being used on any CPUs. */ -+static inline bool lru_gen_mm_is_active(struct mm_struct *mm) -+{ -+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH -+ return !cpumask_empty(mm_cpumask(mm)); -+#else -+ return atomic_read(&mm->lrugen.nr_cpus); -+#endif -+} -+ -+#else /* !CONFIG_LRU_GEN */ -+ -+static inline void lru_gen_add_mm(struct mm_struct *mm) -+{ -+} -+ -+static inline void lru_gen_del_mm(struct mm_struct *mm) -+{ -+} -+ -+#ifdef CONFIG_MEMCG -+static inline void lru_gen_migrate_mm(struct mm_struct *mm) -+{ -+} -+#endif -+ -+static inline void lru_gen_init_mm(struct mm_struct *mm) -+{ -+} -+ -+static inline void lru_gen_switch_mm(struct mm_struct *old, struct mm_struct *new) -+{ -+} -+ -+static inline bool lru_gen_mm_is_active(struct mm_struct *mm) -+{ -+ return false; -+} -+ -+#endif /* CONFIG_LRU_GEN */ -+ - struct mmu_gather; - extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); - extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); -diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h -index 6a1d79d84675..a7544dc4c91b 100644 ---- a/include/linux/mmzone.h -+++ b/include/linux/mmzone.h -@@ -294,6 +294,172 @@ enum lruvec_flags { - */ - }; - -+struct lruvec; -+struct page_vma_mapped_walk; -+ -+#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) -+#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) -+ -+#ifdef CONFIG_LRU_GEN -+ -+/* -+ * For each lruvec, evictable pages are divided into multiple generations. The -+ * youngest and the oldest generation numbers, AKA max_seq and min_seq, are -+ * monotonically increasing. The sliding window technique is used to track at -+ * least MIN_NR_GENS and at most MAX_NR_GENS generations. An offset within the -+ * window, AKA gen, indexes an array of per-type and per-zone lists for the -+ * corresponding generation. The counter in page->flags stores gen+1 while a -+ * page is on one of the multigenerational lru lists. Otherwise, it stores 0. -+ * -+ * After a page is faulted in, the aging must check the accessed bit at least -+ * twice before the eviction would consider it. The first check clears the -+ * accessed bit set during the initial fault. The second check makes sure this -+ * page hasn't been used since then. -+ */ -+#define MIN_NR_GENS 2 -+#define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS) -+ -+/* -+ * Each generation is divided into multiple tiers. Tiers represent different -+ * ranges of numbers of accesses from file descriptors, i.e., -+ * mark_page_accessed(). In contrast to moving between generations which -+ * requires the lru lock, moving between tiers only involves an atomic -+ * operation on page->flags and therefore has a negligible cost. -+ * -+ * The purposes of tiers are to: -+ * 1) estimate whether pages accessed multiple times via file descriptors are -+ * more active than pages accessed only via page tables by separating the two -+ * access types into upper tiers and the base tier, and comparing refaulted % -+ * across all tiers. -+ * 2) improve buffered io performance by deferring the protection of pages -+ * accessed multiple times until the eviction. That is the protection happens -+ * in the reclaim path, not the access path. -+ * -+ * Pages accessed N times via file descriptors belong to tier order_base_2(N). -+ * The base tier may be marked by PageReferenced(). All upper tiers are marked -+ * by PageReferenced() && PageWorkingset(). Additional bits from page->flags are -+ * used to support more than one upper tier. -+ */ -+#define MAX_NR_TIERS ((unsigned int)CONFIG_TIERS_PER_GEN) -+#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) -+ -+/* Whether to keep stats for historical generations. */ -+#ifdef CONFIG_LRU_GEN_STATS -+#define NR_HIST_GENS ((unsigned int)CONFIG_NR_LRU_GENS) -+#else -+#define NR_HIST_GENS 1U -+#endif -+ -+struct lrugen { -+ /* the aging increments the max generation number */ -+ unsigned long max_seq; -+ /* the eviction increments the min generation numbers */ -+ unsigned long min_seq[ANON_AND_FILE]; -+ /* the birth time of each generation in jiffies */ -+ unsigned long timestamps[MAX_NR_GENS]; -+ /* the multigenerational lru lists */ -+ struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; -+ /* the sizes of the multigenerational lru lists in pages */ -+ unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; -+ /* the exponential moving average of refaulted */ -+ unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS]; -+ /* the exponential moving average of protected+evicted */ -+ unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS]; -+ /* the base tier isn't protected, hence the minus one */ -+ unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1]; -+ /* incremented without holding the lru lock */ -+ atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; -+ atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; -+ /* whether the multigenerational lru is enabled */ -+ bool enabled[ANON_AND_FILE]; -+}; -+ -+enum { -+ MM_LEAF_TOTAL, /* total leaf entries */ -+ MM_LEAF_OLD, /* old leaf entries */ -+ MM_LEAF_YOUNG, /* young leaf entries */ -+ MM_NONLEAF_TOTAL, /* total non-leaf entries */ -+ MM_NONLEAF_PREV, /* previously worthy non-leaf entries */ -+ MM_NONLEAF_CUR, /* currently worthy non-leaf entries */ -+ NR_MM_STATS -+}; -+ -+/* mnemonic codes for the stats above */ -+#define MM_STAT_CODES "toydpc" -+ -+/* double buffering bloom filters */ -+#define NR_BLOOM_FILTERS 2 -+ -+struct lru_gen_mm_walk { -+ /* set to max_seq after each round of walk */ -+ unsigned long seq; -+ /* the next mm_struct on the list to walk */ -+ struct list_head *head; -+ /* the first mm_struct never walked before */ -+ struct list_head *tail; -+ /* to wait for the last walker to finish */ -+ struct wait_queue_head wait; -+ /* bloom filters flip after each round of walk */ -+ unsigned long *filters[NR_BLOOM_FILTERS]; -+ /* page table stats for debugging */ -+ unsigned long stats[NR_HIST_GENS][NR_MM_STATS]; -+ /* the number of concurrent walkers */ -+ int nr_walkers; -+}; -+ -+#define MIN_BATCH_SIZE 64 -+#define MAX_BATCH_SIZE 8192 -+ -+struct mm_walk_args { -+ struct mem_cgroup *memcg; -+ unsigned long max_seq; -+ unsigned long start_pfn; -+ unsigned long end_pfn; -+ unsigned long next_addr; -+ unsigned long bitmap[BITS_TO_LONGS(MIN_BATCH_SIZE)]; -+ int node_id; -+ int swappiness; -+ int batch_size; -+ int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; -+ int mm_stats[NR_MM_STATS]; -+ bool use_filter; -+}; -+ -+void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec); -+void lru_gen_change_state(bool enable, bool main, bool swap); -+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); -+ -+#ifdef CONFIG_MEMCG -+void lru_gen_init_memcg(struct mem_cgroup *memcg); -+void lru_gen_free_memcg(struct mem_cgroup *memcg); -+#endif -+ -+#else /* !CONFIG_LRU_GEN */ -+ -+static inline void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec) -+{ -+} -+ -+static inline void lru_gen_change_state(bool enable, bool main, bool swap) -+{ -+} -+ -+static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) -+{ -+} -+ -+#ifdef CONFIG_MEMCG -+static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) -+{ -+} -+ -+static inline void lru_gen_free_memcg(struct mem_cgroup *memcg) -+{ -+} -+#endif -+ -+#endif /* CONFIG_LRU_GEN */ -+ - struct lruvec { - struct list_head lists[NR_LRU_LISTS]; - /* per lruvec lru_lock for memcg */ -@@ -311,6 +477,12 @@ struct lruvec { - unsigned long refaults[ANON_AND_FILE]; - /* Various lruvec state flags (enum lruvec_flags) */ - unsigned long flags; -+#ifdef CONFIG_LRU_GEN -+ /* unevictable pages are on LRU_UNEVICTABLE */ -+ struct lrugen evictable; -+ /* state for mm list and page table walks */ -+ struct lru_gen_mm_walk mm_walk; -+#endif - #ifdef CONFIG_MEMCG - struct pglist_data *pgdat; - #endif -@@ -895,6 +1067,9 @@ typedef struct pglist_data { - - unsigned long flags; - -+#ifdef CONFIG_LRU_GEN -+ struct mm_walk_args mm_walk_args; -+#endif - ZONE_PADDING(_pad2_) - - /* Per-node vmstats */ -diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h -index 567c3ddba2c4..90840c459abc 100644 ---- a/include/linux/nodemask.h -+++ b/include/linux/nodemask.h -@@ -486,6 +486,7 @@ static inline int num_node_state(enum node_states state) - #define first_online_node 0 - #define first_memory_node 0 - #define next_online_node(nid) (MAX_NUMNODES) -+#define next_memory_node(nid) (MAX_NUMNODES) - #define nr_node_ids 1U - #define nr_online_nodes 1U - -diff --git a/include/linux/oom.h b/include/linux/oom.h -index 2db9a1432511..c4c8c7e71099 100644 ---- a/include/linux/oom.h -+++ b/include/linux/oom.h -@@ -57,6 +57,22 @@ struct oom_control { - extern struct mutex oom_lock; - extern struct mutex oom_adj_mutex; - -+#ifdef CONFIG_MMU -+extern struct task_struct *oom_reaper_list; -+extern struct wait_queue_head oom_reaper_wait; -+ -+static inline bool oom_reaping_in_progress(void) -+{ -+ /* racy check to see if oom reaping could be in progress */ -+ return READ_ONCE(oom_reaper_list) || !waitqueue_active(&oom_reaper_wait); -+} -+#else -+static inline bool oom_reaping_in_progress(void) -+{ -+ return false; -+} -+#endif -+ - static inline void set_current_oom_origin(void) - { - current->signal->oom_flag_origin = true; -diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h -index ef1e3e736e14..8cdbbdccb5ad 100644 ---- a/include/linux/page-flags-layout.h -+++ b/include/linux/page-flags-layout.h -@@ -26,6 +26,14 @@ - - #define ZONES_WIDTH ZONES_SHIFT - -+#ifdef CONFIG_LRU_GEN -+/* LRU_GEN_WIDTH is generated from order_base_2(CONFIG_NR_LRU_GENS + 1). */ -+#define LRU_REFS_WIDTH (CONFIG_TIERS_PER_GEN - 2) -+#else -+#define LRU_GEN_WIDTH 0 -+#define LRU_REFS_WIDTH 0 -+#endif /* CONFIG_LRU_GEN */ -+ - #ifdef CONFIG_SPARSEMEM - #include <asm/sparsemem.h> - #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) -@@ -55,7 +63,8 @@ - #define SECTIONS_WIDTH 0 - #endif - --#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS -+#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \ -+ <= BITS_PER_LONG - NR_PAGEFLAGS - #define NODES_WIDTH NODES_SHIFT - #elif defined(CONFIG_SPARSEMEM_VMEMMAP) - #error "Vmemmap: No space for nodes field in page flags" -@@ -89,8 +98,8 @@ - #define LAST_CPUPID_SHIFT 0 - #endif - --#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \ -- <= BITS_PER_LONG - NR_PAGEFLAGS -+#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \ -+ KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS - #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT - #else - #define LAST_CPUPID_WIDTH 0 -@@ -100,8 +109,8 @@ - #define LAST_CPUPID_NOT_IN_PAGE_FLAGS - #endif - --#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \ -- > BITS_PER_LONG - NR_PAGEFLAGS -+#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \ -+ KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS - #error "Not enough bits in page flags" - #endif - -diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h -index fbfd3fad48f2..a7d7ff4c621d 100644 ---- a/include/linux/page-flags.h -+++ b/include/linux/page-flags.h -@@ -845,7 +845,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) - 1UL << PG_private | 1UL << PG_private_2 | \ - 1UL << PG_writeback | 1UL << PG_reserved | \ - 1UL << PG_slab | 1UL << PG_active | \ -- 1UL << PG_unevictable | __PG_MLOCKED) -+ 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK) - - /* - * Flags checked when a page is prepped for return by the page allocator. -@@ -856,7 +856,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) - * alloc-free cycle to prevent from reusing the page. - */ - #define PAGE_FLAGS_CHECK_AT_PREP \ -- (PAGEFLAGS_MASK & ~__PG_HWPOISON) -+ ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK) - - #define PAGE_FLAGS_PRIVATE \ - (1UL << PG_private | 1UL << PG_private_2) -diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h -index e24d2c992b11..afb9004b778a 100644 ---- a/include/linux/pgtable.h -+++ b/include/linux/pgtable.h -@@ -211,7 +211,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, - #endif - - #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG --#ifdef CONFIG_TRANSPARENT_HUGEPAGE -+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) - static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, - pmd_t *pmdp) -@@ -232,7 +232,7 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, - BUILD_BUG(); - return 0; - } --#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -+#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */ - #endif - - #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -@@ -258,6 +258,19 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, - #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - #endif - -+#ifndef arch_has_hw_pte_young -+/* -+ * Return whether the accessed bit is supported by the local CPU or all CPUs. -+ * -+ * Those arches which have hw access flag feature need to implement their own -+ * helper. By default, "false" means pagefault will be hit on old pte. -+ */ -+static inline bool arch_has_hw_pte_young(bool local) -+{ -+ return false; -+} -+#endif -+ - #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR - static inline pte_t ptep_get_and_clear(struct mm_struct *mm, - unsigned long address, -diff --git a/include/linux/sched.h b/include/linux/sched.h -index c1a927ddec64..556b3d66c70d 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -911,6 +911,9 @@ struct task_struct { - #ifdef CONFIG_MEMCG - unsigned in_user_fault:1; - #endif -+#ifdef CONFIG_LRU_GEN -+ unsigned in_nonseq_fault:1; -+#endif - #ifdef CONFIG_COMPAT_BRK - unsigned brk_randomized:1; - #endif -diff --git a/include/linux/swap.h b/include/linux/swap.h -index ba52f3a3478e..2c4a5a88f83c 100644 ---- a/include/linux/swap.h -+++ b/include/linux/swap.h -@@ -137,6 +137,9 @@ union swap_header { - */ - struct reclaim_state { - unsigned long reclaimed_slab; -+#ifdef CONFIG_LRU_GEN -+ struct mm_walk_args *mm_walk_args; -+#endif - }; - - #ifdef __KERNEL__ -diff --git a/kernel/bounds.c b/kernel/bounds.c -index 9795d75b09b2..aba13aa7336c 100644 ---- a/kernel/bounds.c -+++ b/kernel/bounds.c -@@ -22,6 +22,9 @@ int main(void) - DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); - #endif - DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); -+#ifdef CONFIG_LRU_GEN -+ DEFINE(LRU_GEN_WIDTH, order_base_2(CONFIG_NR_LRU_GENS + 1)); -+#endif - /* End of constants */ - - return 0; -diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h -index bfbeabc17a9d..bec59189e206 100644 ---- a/kernel/cgroup/cgroup-internal.h -+++ b/kernel/cgroup/cgroup-internal.h -@@ -146,7 +146,6 @@ struct cgroup_mgctx { - #define DEFINE_CGROUP_MGCTX(name) \ - struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name) - --extern struct mutex cgroup_mutex; - extern spinlock_t css_set_lock; - extern struct cgroup_subsys *cgroup_subsys[]; - extern struct list_head cgroup_roots; -diff --git a/kernel/exit.c b/kernel/exit.c -index 91a43e57a32e..788a299abb4e 100644 ---- a/kernel/exit.c -+++ b/kernel/exit.c -@@ -422,6 +422,7 @@ void mm_update_next_owner(struct mm_struct *mm) - goto retry; - } - WRITE_ONCE(mm->owner, c); -+ lru_gen_migrate_mm(mm); - task_unlock(c); - put_task_struct(c); - } -diff --git a/kernel/fork.c b/kernel/fork.c -index 38681ad44c76..8d6bb4e76904 100644 ---- a/kernel/fork.c -+++ b/kernel/fork.c -@@ -681,6 +681,7 @@ static void check_mm(struct mm_struct *mm) - #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS - VM_BUG_ON_MM(mm->pmd_huge_pte, mm); - #endif -+ VM_BUG_ON_MM(lru_gen_mm_is_active(mm), mm); - } - - #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) -@@ -1080,6 +1081,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, - goto fail_nocontext; - - mm->user_ns = get_user_ns(user_ns); -+ lru_gen_init_mm(mm); - return mm; - - fail_nocontext: -@@ -1122,6 +1124,7 @@ static inline void __mmput(struct mm_struct *mm) - } - if (mm->binfmt) - module_put(mm->binfmt->module); -+ lru_gen_del_mm(mm); - mmdrop(mm); - } - -@@ -2605,6 +2608,13 @@ pid_t kernel_clone(struct kernel_clone_args *args) - get_task_struct(p); - } - -+ if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) { -+ /* lock the task to synchronize with memcg migration */ -+ task_lock(p); -+ lru_gen_add_mm(p->mm); -+ task_unlock(p); -+ } -+ - wake_up_new_task(p); - - /* forking complete and child started to run, tell ptracer */ -diff --git a/kernel/kthread.c b/kernel/kthread.c -index 5b37a8567168..fd827fdad26b 100644 ---- a/kernel/kthread.c -+++ b/kernel/kthread.c -@@ -1361,6 +1361,7 @@ void kthread_use_mm(struct mm_struct *mm) - tsk->mm = mm; - membarrier_update_current_mm(mm); - switch_mm_irqs_off(active_mm, mm, tsk); -+ lru_gen_switch_mm(active_mm, mm); - local_irq_enable(); - task_unlock(tsk); - #ifdef finish_arch_post_lock_switch -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index f21714ea3db8..e5e4cc86e296 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -4924,6 +4924,7 @@ context_switch(struct rq *rq, struct task_struct *prev, - * finish_task_switch()'s mmdrop(). - */ - switch_mm_irqs_off(prev->active_mm, next->mm, next); -+ lru_gen_switch_mm(prev->active_mm, next->mm); - - if (!prev->mm) { // from kernel - /* will mmdrop() in finish_task_switch(). */ -@@ -8792,6 +8793,7 @@ void idle_task_exit(void) - - if (mm != &init_mm) { - switch_mm(mm, &init_mm, current); -+ lru_gen_switch_mm(mm, &init_mm); - finish_arch_post_lock_switch(); - } - -diff --git a/mm/Kconfig b/mm/Kconfig -index d16ba9249bc5..48e7babb22e5 100644 ---- a/mm/Kconfig -+++ b/mm/Kconfig -@@ -896,4 +896,63 @@ config SECRETMEM - - source "mm/damon/Kconfig" - -+# the multigenerational lru { -+config LRU_GEN -+ bool "Multigenerational LRU" -+ depends on MMU -+ # the following options may leave not enough spare bits in page->flags -+ depends on !MAXSMP && (64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP) -+ help -+ A high performance LRU implementation to heavily overcommit workloads -+ that are not IO bound. See Documentation/vm/multigen_lru.rst for -+ details. -+ -+ Warning: do not enable this option unless you plan to use it because -+ it introduces a small per-process and per-memcg and per-node memory -+ overhead. -+ -+config LRU_GEN_ENABLED -+ bool "Turn on by default" -+ depends on LRU_GEN -+ help -+ The default value of /sys/kernel/mm/lru_gen/enabled is 0. This option -+ changes it to 1. -+ -+ Warning: the default value is the fast path. See -+ Documentation/static-keys.txt for details. -+ -+config LRU_GEN_STATS -+ bool "Full stats for debugging" -+ depends on LRU_GEN -+ help -+ This option keeps full stats for each generation, which can be read -+ from /sys/kernel/debug/lru_gen_full. -+ -+ Warning: do not enable this option unless you plan to use it because -+ it introduces an additional small per-process and per-memcg and -+ per-node memory overhead. -+ -+config NR_LRU_GENS -+ int "Max number of generations" -+ depends on LRU_GEN -+ range 4 31 -+ default 7 -+ help -+ This will use order_base_2(N+1) spare bits from page flags. -+ -+ Warning: do not use numbers larger than necessary because each -+ generation introduces a small per-node and per-memcg memory overhead. -+ -+config TIERS_PER_GEN -+ int "Number of tiers per generation" -+ depends on LRU_GEN -+ range 2 5 -+ default 4 -+ help -+ This will use N-2 spare bits from page flags. -+ -+ Larger values generally offer better protection to active pages under -+ heavy buffered I/O workloads. -+# } -+ - endmenu -diff --git a/mm/huge_memory.c b/mm/huge_memory.c -index c5142d237e48..875ed5559d95 100644 ---- a/mm/huge_memory.c -+++ b/mm/huge_memory.c -@@ -2364,7 +2364,8 @@ static void __split_huge_page_tail(struct page *head, int tail, - #ifdef CONFIG_64BIT - (1L << PG_arch_2) | - #endif -- (1L << PG_dirty))); -+ (1L << PG_dirty) | -+ LRU_GEN_MASK | LRU_REFS_MASK)); - - /* ->mapping in first tail page is compound_mapcount */ - VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, -diff --git a/mm/memcontrol.c b/mm/memcontrol.c -index 6da5020a8656..49100d053e3b 100644 ---- a/mm/memcontrol.c -+++ b/mm/memcontrol.c -@@ -1239,12 +1239,17 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, - *lru_size += nr_pages; - - size = *lru_size; -+#ifdef CONFIG_LRU_GEN -+ /* unlikely but not a bug when reset_batch_size() is pending */ -+ VM_WARN_ON(size + MAX_BATCH_SIZE < 0); -+#else - if (WARN_ONCE(size < 0, - "%s(%p, %d, %d): lru_size %ld\n", - __func__, lruvec, lru, nr_pages, size)) { - VM_BUG_ON(1); - *lru_size = 0; - } -+#endif - - if (nr_pages > 0) - *lru_size += nr_pages; -@@ -5110,6 +5115,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) - - static void mem_cgroup_free(struct mem_cgroup *memcg) - { -+ lru_gen_free_memcg(memcg); - memcg_wb_domain_exit(memcg); - __mem_cgroup_free(memcg); - } -@@ -5173,6 +5179,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) - memcg->deferred_split_queue.split_queue_len = 0; - #endif - idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); -+ lru_gen_init_memcg(memcg); - return memcg; - fail: - mem_cgroup_id_remove(memcg); -@@ -6156,6 +6163,29 @@ static void mem_cgroup_move_task(void) - } - #endif - -+#ifdef CONFIG_LRU_GEN -+static void mem_cgroup_attach(struct cgroup_taskset *tset) -+{ -+ struct cgroup_subsys_state *css; -+ struct task_struct *task = NULL; -+ -+ cgroup_taskset_for_each_leader(task, css, tset) -+ break; -+ -+ if (!task) -+ return; -+ -+ task_lock(task); -+ if (task->mm && task->mm->owner == task) -+ lru_gen_migrate_mm(task->mm); -+ task_unlock(task); -+} -+#else -+static void mem_cgroup_attach(struct cgroup_taskset *tset) -+{ -+} -+#endif /* CONFIG_LRU_GEN */ -+ - static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) - { - if (value == PAGE_COUNTER_MAX) -@@ -6499,6 +6529,7 @@ struct cgroup_subsys memory_cgrp_subsys = { - .css_reset = mem_cgroup_css_reset, - .css_rstat_flush = mem_cgroup_css_rstat_flush, - .can_attach = mem_cgroup_can_attach, -+ .attach = mem_cgroup_attach, - .cancel_attach = mem_cgroup_cancel_attach, - .post_attach = mem_cgroup_move_task, - .dfl_cftypes = memory_files, -diff --git a/mm/memory.c b/mm/memory.c -index c52be6d6b605..6b2b665c0866 100644 ---- a/mm/memory.c -+++ b/mm/memory.c -@@ -121,18 +121,6 @@ int randomize_va_space __read_mostly = - 2; - #endif - --#ifndef arch_faults_on_old_pte --static inline bool arch_faults_on_old_pte(void) --{ -- /* -- * Those arches which don't have hw access flag feature need to -- * implement their own helper. By default, "true" means pagefault -- * will be hit on old pte. -- */ -- return true; --} --#endif -- - #ifndef arch_wants_old_prefaulted_pte - static inline bool arch_wants_old_prefaulted_pte(void) - { -@@ -2769,7 +2757,7 @@ static inline bool cow_user_page(struct page *dst, struct page *src, - * On architectures with software "accessed" bits, we would - * take a double page fault, so mark it accessed here. - */ -- if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) { -+ if (!arch_has_hw_pte_young(true) && !pte_young(vmf->orig_pte)) { - pte_t entry; - - vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); -@@ -4774,6 +4762,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, - unsigned int flags, struct pt_regs *regs) - { - vm_fault_t ret; -+ bool nonseq_fault = !(vma->vm_flags & VM_SEQ_READ); - - __set_current_state(TASK_RUNNING); - -@@ -4795,11 +4784,17 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, - if (flags & FAULT_FLAG_USER) - mem_cgroup_enter_user_fault(); - -+ if (nonseq_fault) -+ task_enter_nonseq_fault(); -+ - if (unlikely(is_vm_hugetlb_page(vma))) - ret = hugetlb_fault(vma->vm_mm, vma, address, flags); - else - ret = __handle_mm_fault(vma, address, flags); - -+ if (nonseq_fault) -+ task_exit_nonseq_fault(); -+ - if (flags & FAULT_FLAG_USER) { - mem_cgroup_exit_user_fault(); - /* -diff --git a/mm/mm_init.c b/mm/mm_init.c -index 9ddaf0e1b0ab..0d7b2bd2454a 100644 ---- a/mm/mm_init.c -+++ b/mm/mm_init.c -@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layout(void) - - shift = 8 * sizeof(unsigned long); - width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH -- - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH; -+ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH; - mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", -- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n", -+ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n", - SECTIONS_WIDTH, - NODES_WIDTH, - ZONES_WIDTH, - LAST_CPUPID_WIDTH, - KASAN_TAG_WIDTH, -+ LRU_GEN_WIDTH, -+ LRU_REFS_WIDTH, - NR_PAGEFLAGS); - mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", - "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n", -diff --git a/mm/oom_kill.c b/mm/oom_kill.c -index 989f35a2bbb1..72855b7bde09 100644 ---- a/mm/oom_kill.c -+++ b/mm/oom_kill.c -@@ -508,8 +508,8 @@ bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) - * victim (if that is possible) to help the OOM killer to move on. - */ - static struct task_struct *oom_reaper_th; --static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); --static struct task_struct *oom_reaper_list; -+DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); -+struct task_struct *oom_reaper_list; - static DEFINE_SPINLOCK(oom_reaper_lock); - - bool __oom_reap_task_mm(struct mm_struct *mm) -diff --git a/mm/page_alloc.c b/mm/page_alloc.c -index 23d3339ac4e8..5bee46999f60 100644 ---- a/mm/page_alloc.c -+++ b/mm/page_alloc.c -@@ -7405,6 +7405,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) - - pgdat_page_ext_init(pgdat); - lruvec_init(&pgdat->__lruvec); -+ lru_gen_init_state(NULL, &pgdat->__lruvec); - } - - static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, -diff --git a/mm/rmap.c b/mm/rmap.c -index 6aebd1747251..a6b522fe871a 100644 ---- a/mm/rmap.c -+++ b/mm/rmap.c -@@ -73,6 +73,7 @@ - #include <linux/page_idle.h> - #include <linux/memremap.h> - #include <linux/userfaultfd_k.h> -+#include <linux/mm_inline.h> - - #include <asm/tlbflush.h> - -@@ -790,6 +791,13 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, - } - - if (pvmw.pte) { -+ /* the multigenerational lru exploits the spatial locality */ -+ if (lru_gen_enabled() && pte_young(*pvmw.pte) && -+ !(vma->vm_flags & VM_SEQ_READ)) { -+ lru_gen_look_around(&pvmw); -+ referenced++; -+ } -+ - if (ptep_clear_flush_young_notify(vma, address, - pvmw.pte)) { - /* -diff --git a/mm/swap.c b/mm/swap.c -index af3cad4e5378..93f5fe5f99ca 100644 ---- a/mm/swap.c -+++ b/mm/swap.c -@@ -389,6 +389,43 @@ static void __lru_cache_activate_page(struct page *page) - local_unlock(&lru_pvecs.lock); - } - -+#ifdef CONFIG_LRU_GEN -+static void page_inc_refs(struct page *page) -+{ -+ unsigned long refs; -+ unsigned long old_flags, new_flags; -+ -+ if (PageUnevictable(page)) -+ return; -+ -+ /* see the comment on MAX_NR_TIERS */ -+ do { -+ new_flags = old_flags = READ_ONCE(page->flags); -+ -+ if (!(new_flags & BIT(PG_referenced))) { -+ new_flags |= BIT(PG_referenced); -+ continue; -+ } -+ -+ if (!(new_flags & BIT(PG_workingset))) { -+ new_flags |= BIT(PG_workingset); -+ continue; -+ } -+ -+ refs = new_flags & LRU_REFS_MASK; -+ refs = min(refs + BIT(LRU_REFS_PGOFF), LRU_REFS_MASK); -+ -+ new_flags &= ~LRU_REFS_MASK; -+ new_flags |= refs; -+ } while (new_flags != old_flags && -+ cmpxchg(&page->flags, old_flags, new_flags) != old_flags); -+} -+#else -+static void page_inc_refs(struct page *page) -+{ -+} -+#endif /* CONFIG_LRU_GEN */ -+ - /* - * Mark a page as having seen activity. - * -@@ -403,6 +440,11 @@ void mark_page_accessed(struct page *page) - { - page = compound_head(page); - -+ if (lru_gen_enabled()) { -+ page_inc_refs(page); -+ return; -+ } -+ - if (!PageReferenced(page)) { - SetPageReferenced(page); - } else if (PageUnevictable(page)) { -@@ -446,6 +488,11 @@ void lru_cache_add(struct page *page) - VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); - VM_BUG_ON_PAGE(PageLRU(page), page); - -+ /* see the comment in lru_gen_add_page() */ -+ if (lru_gen_enabled() && !PageUnevictable(page) && -+ task_in_nonseq_fault() && !(current->flags & PF_MEMALLOC)) -+ SetPageActive(page); -+ - get_page(page); - local_lock(&lru_pvecs.lock); - pvec = this_cpu_ptr(&lru_pvecs.lru_add); -@@ -547,7 +594,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec) - - static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec) - { -- if (PageActive(page) && !PageUnevictable(page)) { -+ if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) { - int nr_pages = thp_nr_pages(page); - - del_page_from_lru_list(page, lruvec); -@@ -661,7 +708,7 @@ void deactivate_file_page(struct page *page) - */ - void deactivate_page(struct page *page) - { -- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { -+ if (PageLRU(page) && !PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) { - struct pagevec *pvec; - - local_lock(&lru_pvecs.lock); -diff --git a/mm/swapfile.c b/mm/swapfile.c -index 22d10f713848..2ac9ac0b5ec3 100644 ---- a/mm/swapfile.c -+++ b/mm/swapfile.c -@@ -2688,6 +2688,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) - err = 0; - atomic_inc(&proc_poll_event); - wake_up_interruptible(&proc_poll_wait); -+ lru_gen_change_state(false, false, true); - - out_dput: - filp_close(victim, NULL); -@@ -3349,6 +3350,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) - mutex_unlock(&swapon_mutex); - atomic_inc(&proc_poll_event); - wake_up_interruptible(&proc_poll_wait); -+ lru_gen_change_state(true, false, true); - - error = 0; - goto out; -diff --git a/mm/vmscan.c b/mm/vmscan.c -index 74296c2d1fed..7c3a49173902 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -50,6 +50,11 @@ - #include <linux/printk.h> - #include <linux/dax.h> - #include <linux/psi.h> -+#include <linux/memory.h> -+#include <linux/pagewalk.h> -+#include <linux/shmem_fs.h> -+#include <linux/ctype.h> -+#include <linux/debugfs.h> - - #include <asm/tlbflush.h> - #include <asm/div64.h> -@@ -1142,9 +1147,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, - - if (PageSwapCache(page)) { - swp_entry_t swap = { .val = page_private(page) }; -- mem_cgroup_swapout(page, swap); -+ -+ /* get a shadow entry before page_memcg() is cleared */ - if (reclaimed && !mapping_exiting(mapping)) - shadow = workingset_eviction(page, target_memcg); -+ mem_cgroup_swapout(page, swap); - __delete_from_swap_cache(page, swap, shadow); - xa_unlock_irq(&mapping->i_pages); - put_swap_page(page, swap); -@@ -1407,6 +1414,11 @@ static unsigned int shrink_page_list(struct list_head *page_list, - if (!sc->may_unmap && page_mapped(page)) - goto keep_locked; - -+ /* lru_gen_look_around() has updated this page? */ -+ if (lru_gen_enabled() && !ignore_references && -+ page_mapped(page) && PageReferenced(page)) -+ goto keep_locked; -+ - may_enter_fs = (sc->gfp_mask & __GFP_FS) || - (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); - -@@ -2562,6 +2574,106 @@ enum scan_balance { - SCAN_FILE, - }; - -+static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) -+{ -+ unsigned long file; -+ struct lruvec *target_lruvec; -+ -+ if (lru_gen_enabled()) -+ return; -+ -+ target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); -+ -+ /* -+ * Determine the scan balance between anon and file LRUs. -+ */ -+ spin_lock_irq(&target_lruvec->lru_lock); -+ sc->anon_cost = target_lruvec->anon_cost; -+ sc->file_cost = target_lruvec->file_cost; -+ spin_unlock_irq(&target_lruvec->lru_lock); -+ -+ /* -+ * Target desirable inactive:active list ratios for the anon -+ * and file LRU lists. -+ */ -+ if (!sc->force_deactivate) { -+ unsigned long refaults; -+ -+ refaults = lruvec_page_state(target_lruvec, -+ WORKINGSET_ACTIVATE_ANON); -+ if (refaults != target_lruvec->refaults[0] || -+ inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) -+ sc->may_deactivate |= DEACTIVATE_ANON; -+ else -+ sc->may_deactivate &= ~DEACTIVATE_ANON; -+ -+ /* -+ * When refaults are being observed, it means a new -+ * workingset is being established. Deactivate to get -+ * rid of any stale active pages quickly. -+ */ -+ refaults = lruvec_page_state(target_lruvec, -+ WORKINGSET_ACTIVATE_FILE); -+ if (refaults != target_lruvec->refaults[1] || -+ inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) -+ sc->may_deactivate |= DEACTIVATE_FILE; -+ else -+ sc->may_deactivate &= ~DEACTIVATE_FILE; -+ } else -+ sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; -+ -+ /* -+ * If we have plenty of inactive file pages that aren't -+ * thrashing, try to reclaim those first before touching -+ * anonymous pages. -+ */ -+ file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); -+ if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) -+ sc->cache_trim_mode = 1; -+ else -+ sc->cache_trim_mode = 0; -+ -+ /* -+ * Prevent the reclaimer from falling into the cache trap: as -+ * cache pages start out inactive, every cache fault will tip -+ * the scan balance towards the file LRU. And as the file LRU -+ * shrinks, so does the window for rotation from references. -+ * This means we have a runaway feedback loop where a tiny -+ * thrashing file LRU becomes infinitely more attractive than -+ * anon pages. Try to detect this based on file LRU size. -+ */ -+ if (!cgroup_reclaim(sc)) { -+ unsigned long total_high_wmark = 0; -+ unsigned long free, anon; -+ int z; -+ -+ free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); -+ file = node_page_state(pgdat, NR_ACTIVE_FILE) + -+ node_page_state(pgdat, NR_INACTIVE_FILE); -+ -+ for (z = 0; z < MAX_NR_ZONES; z++) { -+ struct zone *zone = &pgdat->node_zones[z]; -+ -+ if (!managed_zone(zone)) -+ continue; -+ -+ total_high_wmark += high_wmark_pages(zone); -+ } -+ -+ /* -+ * Consider anon: if that's low too, this isn't a -+ * runaway file reclaim problem, but rather just -+ * extreme pressure. Reclaim as per usual then. -+ */ -+ anon = node_page_state(pgdat, NR_INACTIVE_ANON); -+ -+ sc->file_is_tiny = -+ file + free <= total_high_wmark && -+ !(sc->may_deactivate & DEACTIVATE_ANON) && -+ anon >> sc->priority; -+ } -+} -+ - /* - * Determine how aggressively the anon and file LRU lists should be - * scanned. The relative value of each set of LRU lists is determined -@@ -2783,6 +2895,2487 @@ static bool can_age_anon_pages(struct pglist_data *pgdat, - return can_demote(pgdat->node_id, sc); - } - -+#ifdef CONFIG_LRU_GEN -+ -+/****************************************************************************** -+ * shorthand helpers -+ ******************************************************************************/ -+ -+#define DEFINE_MAX_SEQ(lruvec) \ -+ unsigned long max_seq = READ_ONCE((lruvec)->evictable.max_seq) -+ -+#define DEFINE_MIN_SEQ(lruvec) \ -+ unsigned long min_seq[ANON_AND_FILE] = { \ -+ READ_ONCE((lruvec)->evictable.min_seq[0]), \ -+ READ_ONCE((lruvec)->evictable.min_seq[1]), \ -+ } -+ -+#define for_each_gen_type_zone(gen, type, zone) \ -+ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ -+ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ -+ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) -+ -+static int page_lru_gen(struct page *page) -+{ -+ unsigned long flags = READ_ONCE(page->flags); -+ -+ return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; -+} -+ -+static int page_lru_tier(struct page *page) -+{ -+ int refs; -+ unsigned long flags = READ_ONCE(page->flags); -+ -+ refs = (flags & LRU_REFS_FLAGS) == LRU_REFS_FLAGS ? -+ ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1 : 0; -+ -+ return lru_tier_from_refs(refs); -+} -+ -+static int get_swappiness(struct mem_cgroup *memcg) -+{ -+ return mem_cgroup_get_nr_swap_pages(memcg) >= MIN_BATCH_SIZE ? -+ mem_cgroup_swappiness(memcg) : 0; -+} -+ -+static struct lruvec *get_lruvec(int nid, struct mem_cgroup *memcg) -+{ -+ struct pglist_data *pgdat = NODE_DATA(nid); -+ -+#ifdef CONFIG_MEMCG -+ if (memcg) -+ return &memcg->nodeinfo[nid]->lruvec; -+#endif -+ return pgdat ? &pgdat->__lruvec : NULL; -+} -+ -+static int get_nr_gens(struct lruvec *lruvec, int type) -+{ -+ return lruvec->evictable.max_seq - lruvec->evictable.min_seq[type] + 1; -+} -+ -+static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) -+{ -+ return get_nr_gens(lruvec, 1) >= MIN_NR_GENS && -+ get_nr_gens(lruvec, 1) <= get_nr_gens(lruvec, 0) && -+ get_nr_gens(lruvec, 0) <= MAX_NR_GENS; -+} -+ -+/****************************************************************************** -+ * mm_struct list -+ ******************************************************************************/ -+ -+static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) -+{ -+ static struct lru_gen_mm_list mm_list = { -+ .fifo = LIST_HEAD_INIT(mm_list.fifo), -+ .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock), -+ }; -+ -+#ifdef CONFIG_MEMCG -+ if (memcg) -+ return &memcg->mm_list; -+#endif -+ return &mm_list; -+} -+ -+void lru_gen_add_mm(struct mm_struct *mm) -+{ -+ int nid; -+ struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); -+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg); -+ -+ VM_BUG_ON_MM(!list_empty(&mm->lrugen.list), mm); -+#ifdef CONFIG_MEMCG -+ VM_BUG_ON_MM(mm->lrugen.memcg, mm); -+ mm->lrugen.memcg = memcg; -+#endif -+ spin_lock(&mm_list->lock); -+ -+ list_add_tail(&mm->lrugen.list, &mm_list->fifo); -+ -+ for_each_node(nid) { -+ struct lruvec *lruvec = get_lruvec(nid, memcg); -+ -+ if (!lruvec) -+ continue; -+ -+ if (lruvec->mm_walk.tail == &mm_list->fifo) -+ lruvec->mm_walk.tail = lruvec->mm_walk.tail->prev; -+ } -+ -+ spin_unlock(&mm_list->lock); -+} -+ -+void lru_gen_del_mm(struct mm_struct *mm) -+{ -+ int nid; -+ struct lru_gen_mm_list *mm_list; -+ struct mem_cgroup *memcg = NULL; -+ -+ if (list_empty(&mm->lrugen.list)) -+ return; -+ -+#ifdef CONFIG_MEMCG -+ memcg = mm->lrugen.memcg; -+#endif -+ mm_list = get_mm_list(memcg); -+ -+ spin_lock(&mm_list->lock); -+ -+ for_each_node(nid) { -+ struct lruvec *lruvec = get_lruvec(nid, memcg); -+ -+ if (!lruvec) -+ continue; -+ -+ if (lruvec->mm_walk.tail == &mm->lrugen.list) -+ lruvec->mm_walk.tail = lruvec->mm_walk.tail->next; -+ -+ if (lruvec->mm_walk.head != &mm->lrugen.list) -+ continue; -+ -+ lruvec->mm_walk.head = lruvec->mm_walk.head->next; -+ if (lruvec->mm_walk.head == &mm_list->fifo) -+ WRITE_ONCE(lruvec->mm_walk.seq, lruvec->mm_walk.seq + 1); -+ } -+ -+ list_del_init(&mm->lrugen.list); -+ -+ spin_unlock(&mm_list->lock); -+ -+#ifdef CONFIG_MEMCG -+ mem_cgroup_put(mm->lrugen.memcg); -+ mm->lrugen.memcg = NULL; -+#endif -+} -+ -+#ifdef CONFIG_MEMCG -+void lru_gen_migrate_mm(struct mm_struct *mm) -+{ -+ struct mem_cgroup *memcg; -+ -+ lockdep_assert_held(&mm->owner->alloc_lock); -+ -+ if (mem_cgroup_disabled()) -+ return; -+ -+ rcu_read_lock(); -+ memcg = mem_cgroup_from_task(mm->owner); -+ rcu_read_unlock(); -+ if (memcg == mm->lrugen.memcg) -+ return; -+ -+ VM_BUG_ON_MM(!mm->lrugen.memcg, mm); -+ VM_BUG_ON_MM(list_empty(&mm->lrugen.list), mm); -+ -+ lru_gen_del_mm(mm); -+ lru_gen_add_mm(mm); -+} -+#endif -+ -+#define BLOOM_FILTER_SHIFT 15 -+ -+static inline int filter_gen_from_seq(unsigned long seq) -+{ -+ return seq % NR_BLOOM_FILTERS; -+} -+ -+static void get_item_key(void *item, int *key) -+{ -+ u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); -+ -+ BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); -+ -+ key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); -+ key[1] = hash >> BLOOM_FILTER_SHIFT; -+} -+ -+static void clear_bloom_filter(struct lruvec *lruvec, unsigned long seq) -+{ -+ unsigned long *filter; -+ int gen = filter_gen_from_seq(seq); -+ -+ lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); -+ -+ filter = lruvec->mm_walk.filters[gen]; -+ if (filter) { -+ bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); -+ return; -+ } -+ -+ filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), GFP_ATOMIC); -+ WRITE_ONCE(lruvec->mm_walk.filters[gen], filter); -+} -+ -+static void set_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) -+{ -+ int key[2]; -+ unsigned long *filter; -+ int gen = filter_gen_from_seq(seq); -+ -+ filter = READ_ONCE(lruvec->mm_walk.filters[gen]); -+ if (!filter) -+ return; -+ -+ get_item_key(item, key); -+ -+ if (!test_bit(key[0], filter)) -+ set_bit(key[0], filter); -+ if (!test_bit(key[1], filter)) -+ set_bit(key[1], filter); -+} -+ -+static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) -+{ -+ int key[2]; -+ unsigned long *filter; -+ int gen = filter_gen_from_seq(seq); -+ -+ filter = READ_ONCE(lruvec->mm_walk.filters[gen]); -+ if (!filter) -+ return false; -+ -+ get_item_key(item, key); -+ -+ return test_bit(key[0], filter) && test_bit(key[1], filter); -+} -+ -+static void reset_mm_stats(struct lruvec *lruvec, bool last, struct mm_walk_args *args) -+{ -+ int i; -+ int hist = lru_hist_from_seq(args->max_seq); -+ -+ lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); -+ -+ for (i = 0; i < NR_MM_STATS; i++) { -+ WRITE_ONCE(lruvec->mm_walk.stats[hist][i], -+ lruvec->mm_walk.stats[hist][i] + args->mm_stats[i]); -+ args->mm_stats[i] = 0; -+ } -+ -+ if (!last || NR_HIST_GENS == 1) -+ return; -+ -+ hist = lru_hist_from_seq(args->max_seq + 1); -+ for (i = 0; i < NR_MM_STATS; i++) -+ WRITE_ONCE(lruvec->mm_walk.stats[hist][i], 0); -+} -+ -+static bool should_skip_mm(struct mm_struct *mm, struct mm_walk_args *args) -+{ -+ int type; -+ unsigned long size = 0; -+ -+ if (!lru_gen_mm_is_active(mm) && !node_isset(args->node_id, mm->lrugen.nodes)) -+ return true; -+ -+ if (mm_is_oom_victim(mm)) -+ return true; -+ -+ for (type = !args->swappiness; type < ANON_AND_FILE; type++) { -+ size += type ? get_mm_counter(mm, MM_FILEPAGES) : -+ get_mm_counter(mm, MM_ANONPAGES) + -+ get_mm_counter(mm, MM_SHMEMPAGES); -+ } -+ -+ if (size < MIN_BATCH_SIZE) -+ return true; -+ -+ if (!mmget_not_zero(mm)) -+ return true; -+ -+ node_clear(args->node_id, mm->lrugen.nodes); -+ -+ return false; -+} -+ -+/* To support multiple walkers that concurrently walk an mm_struct list. */ -+static bool get_next_mm(struct lruvec *lruvec, struct mm_walk_args *args, -+ struct mm_struct **iter) -+{ -+ bool first = false; -+ bool last = true; -+ struct mm_struct *mm = NULL; -+ struct lru_gen_mm_walk *mm_walk = &lruvec->mm_walk; -+ struct lru_gen_mm_list *mm_list = get_mm_list(args->memcg); -+ -+ if (*iter) -+ mmput_async(*iter); -+ else if (args->max_seq <= READ_ONCE(mm_walk->seq)) -+ return false; -+ -+ spin_lock(&mm_list->lock); -+ -+ VM_BUG_ON(args->max_seq > mm_walk->seq + 1); -+ VM_BUG_ON(*iter && args->max_seq < mm_walk->seq); -+ VM_BUG_ON(*iter && !mm_walk->nr_walkers); -+ -+ if (args->max_seq <= mm_walk->seq) { -+ if (!*iter) -+ last = false; -+ goto done; -+ } -+ -+ if (mm_walk->head == &mm_list->fifo) { -+ VM_BUG_ON(mm_walk->nr_walkers); -+ mm_walk->head = mm_walk->head->next; -+ first = true; -+ } -+ -+ while (!mm && mm_walk->head != &mm_list->fifo) { -+ mm = list_entry(mm_walk->head, struct mm_struct, lrugen.list); -+ -+ mm_walk->head = mm_walk->head->next; -+ -+ if (mm_walk->tail == &mm->lrugen.list) { -+ mm_walk->tail = mm_walk->tail->next; -+ args->use_filter = false; -+ } -+ -+ if (should_skip_mm(mm, args)) -+ mm = NULL; -+ } -+ -+ if (mm_walk->head == &mm_list->fifo) -+ WRITE_ONCE(mm_walk->seq, mm_walk->seq + 1); -+done: -+ if (*iter && !mm) -+ mm_walk->nr_walkers--; -+ if (!*iter && mm) -+ mm_walk->nr_walkers++; -+ -+ if (mm_walk->nr_walkers) -+ last = false; -+ -+ if (mm && first) -+ clear_bloom_filter(lruvec, args->max_seq + 1); -+ -+ if (*iter || last) -+ reset_mm_stats(lruvec, last, args); -+ -+ spin_unlock(&mm_list->lock); -+ -+ *iter = mm; -+ -+ return last; -+} -+ -+/****************************************************************************** -+ * refault feedback loop -+ ******************************************************************************/ -+ -+/* -+ * A feedback loop modeled after the PID controller. Currently supports the -+ * proportional (P) and the integral (I) terms; the derivative (D) term can be -+ * added if necessary. The setpoint (SP) is the desired position; the process -+ * variable (PV) is the measured position. The error is the difference between -+ * the SP and the PV. A positive error results in a positive control output -+ * correction, which, in our case, is to allow eviction. -+ * -+ * The P term is refaulted % of the current generation being evicted. The I -+ * term is the exponential moving average of refaulted % of previously evicted -+ * generations, using the smoothing factor 1/2. -+ * -+ * Our goal is to maintain proportional refaulted % across all tiers. -+ */ -+struct ctrl_pos { -+ unsigned long refaulted; -+ unsigned long total; -+ int gain; -+}; -+ -+static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, -+ struct ctrl_pos *pos) -+{ -+ struct lrugen *lrugen = &lruvec->evictable; -+ int hist = lru_hist_from_seq(lrugen->min_seq[type]); -+ -+ pos->refaulted = lrugen->avg_refaulted[type][tier] + -+ atomic_long_read(&lrugen->refaulted[hist][type][tier]); -+ pos->total = lrugen->avg_total[type][tier] + -+ atomic_long_read(&lrugen->evicted[hist][type][tier]); -+ if (tier) -+ pos->total += lrugen->protected[hist][type][tier - 1]; -+ pos->gain = gain; -+} -+ -+static void reset_ctrl_pos(struct lruvec *lruvec, int gen, int type) -+{ -+ int tier; -+ int hist = lru_hist_from_seq(gen); -+ struct lrugen *lrugen = &lruvec->evictable; -+ bool carryover = gen == lru_gen_from_seq(lrugen->min_seq[type]); -+ bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1; -+ -+ if (!carryover && !clear) -+ return; -+ -+ for (tier = 0; tier < MAX_NR_TIERS; tier++) { -+ if (carryover) { -+ unsigned long sum; -+ -+ sum = lrugen->avg_refaulted[type][tier] + -+ atomic_long_read(&lrugen->refaulted[hist][type][tier]); -+ WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); -+ -+ sum = lrugen->avg_total[type][tier] + -+ atomic_long_read(&lrugen->evicted[hist][type][tier]); -+ if (tier) -+ sum += lrugen->protected[hist][type][tier - 1]; -+ WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); -+ } -+ -+ if (clear) { -+ atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); -+ atomic_long_set(&lrugen->evicted[hist][type][tier], 0); -+ if (tier) -+ WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0); -+ } -+ } -+} -+ -+static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) -+{ -+ /* -+ * Allow eviction if the PV has a limited number of refaulted pages or a -+ * lower refaulted % than the SP. -+ */ -+ return pv->refaulted < MIN_BATCH_SIZE || -+ pv->refaulted * max(sp->total, 1UL) * sp->gain <= -+ sp->refaulted * max(pv->total, 1UL) * pv->gain; -+} -+ -+/****************************************************************************** -+ * the aging -+ ******************************************************************************/ -+ -+static int page_update_gen(struct page *page, int gen) -+{ -+ unsigned long old_flags, new_flags; -+ -+ VM_BUG_ON(gen >= MAX_NR_GENS); -+ -+ do { -+ new_flags = old_flags = READ_ONCE(page->flags); -+ -+ if (!(new_flags & LRU_GEN_MASK)) { -+ new_flags |= BIT(PG_referenced); -+ continue; -+ } -+ -+ new_flags &= ~LRU_GEN_MASK; -+ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; -+ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS); -+ } while (new_flags != old_flags && -+ cmpxchg(&page->flags, old_flags, new_flags) != old_flags); -+ -+ return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; -+} -+ -+static void page_inc_gen(struct page *page, struct lruvec *lruvec, bool reclaiming) -+{ -+ int old_gen, new_gen; -+ unsigned long old_flags, new_flags; -+ int type = page_is_file_lru(page); -+ int zone = page_zonenum(page); -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ old_gen = lru_gen_from_seq(lrugen->min_seq[type]); -+ -+ do { -+ new_flags = old_flags = READ_ONCE(page->flags); -+ VM_BUG_ON_PAGE(!(new_flags & LRU_GEN_MASK), page); -+ -+ new_gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; -+ /* page_update_gen() has updated this page? */ -+ if (new_gen >= 0 && new_gen != old_gen) { -+ list_move(&page->lru, &lrugen->lists[new_gen][type][zone]); -+ return; -+ } -+ -+ new_gen = (old_gen + 1) % MAX_NR_GENS; -+ -+ new_flags &= ~LRU_GEN_MASK; -+ new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; -+ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS); -+ /* for end_page_writeback() */ -+ if (reclaiming) -+ new_flags |= BIT(PG_reclaim); -+ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags); -+ -+ lru_gen_update_size(page, lruvec, old_gen, new_gen); -+ if (reclaiming) -+ list_move(&page->lru, &lrugen->lists[new_gen][type][zone]); -+ else -+ list_move_tail(&page->lru, &lrugen->lists[new_gen][type][zone]); -+} -+ -+static void update_batch_size(struct page *page, int old_gen, int new_gen, -+ struct mm_walk_args *args) -+{ -+ int type = page_is_file_lru(page); -+ int zone = page_zonenum(page); -+ int delta = thp_nr_pages(page); -+ -+ VM_BUG_ON(old_gen >= MAX_NR_GENS); -+ VM_BUG_ON(new_gen >= MAX_NR_GENS); -+ -+ args->batch_size++; -+ -+ args->nr_pages[old_gen][type][zone] -= delta; -+ args->nr_pages[new_gen][type][zone] += delta; -+} -+ -+static void reset_batch_size(struct lruvec *lruvec, struct mm_walk_args *args) -+{ -+ int gen, type, zone; -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ args->batch_size = 0; -+ -+ for_each_gen_type_zone(gen, type, zone) { -+ enum lru_list lru = type * LRU_FILE; -+ int delta = args->nr_pages[gen][type][zone]; -+ -+ if (!delta) -+ continue; -+ -+ args->nr_pages[gen][type][zone] = 0; -+ WRITE_ONCE(lrugen->sizes[gen][type][zone], -+ lrugen->sizes[gen][type][zone] + delta); -+ -+ if (lru_gen_is_active(lruvec, gen)) -+ lru += LRU_ACTIVE; -+ update_lru_size(lruvec, lru, zone, delta); -+ } -+} -+ -+static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *walk) -+{ -+ struct address_space *mapping; -+ struct vm_area_struct *vma = walk->vma; -+ struct mm_walk_args *args = walk->private; -+ -+ if (!vma_is_accessible(vma) || is_vm_hugetlb_page(vma) || -+ (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ))) -+ return true; -+ -+ if (vma_is_anonymous(vma)) -+ return !args->swappiness; -+ -+ if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping)) -+ return true; -+ -+ mapping = vma->vm_file->f_mapping; -+ if (!mapping->a_ops->writepage) -+ return true; -+ -+ return (shmem_mapping(mapping) && !args->swappiness) || mapping_unevictable(mapping); -+} -+ -+/* -+ * Some userspace memory allocators create many single-page VMAs. So instead of -+ * returning back to the PGD table for each of such VMAs, we finish at least an -+ * entire PMD table and therefore avoid many zigzags. -+ */ -+static bool get_next_vma(struct mm_walk *walk, unsigned long mask, unsigned long size, -+ unsigned long *start, unsigned long *end) -+{ -+ unsigned long next = round_up(*end, size); -+ -+ VM_BUG_ON(mask & size); -+ VM_BUG_ON(*start >= *end); -+ VM_BUG_ON((next & mask) != (*start & mask)); -+ -+ while (walk->vma) { -+ if (next >= walk->vma->vm_end) { -+ walk->vma = walk->vma->vm_next; -+ continue; -+ } -+ -+ if ((next & mask) != (walk->vma->vm_start & mask)) -+ return false; -+ -+ if (should_skip_vma(walk->vma->vm_start, walk->vma->vm_end, walk)) { -+ walk->vma = walk->vma->vm_next; -+ continue; -+ } -+ -+ *start = max(next, walk->vma->vm_start); -+ next = (next | ~mask) + 1; -+ /* rounded-up boundaries can wrap to 0 */ -+ *end = next && next < walk->vma->vm_end ? next : walk->vma->vm_end; -+ -+ return true; -+ } -+ -+ return false; -+} -+ -+static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, -+ struct mm_walk *walk) -+{ -+ int i; -+ pte_t *pte; -+ spinlock_t *ptl; -+ unsigned long addr; -+ int worth = 0; -+ struct mm_walk_args *args = walk->private; -+ int old_gen, new_gen = lru_gen_from_seq(args->max_seq); -+ -+ VM_BUG_ON(pmd_leaf(*pmd)); -+ -+ pte = pte_offset_map_lock(walk->mm, pmd, start & PMD_MASK, &ptl); -+ arch_enter_lazy_mmu_mode(); -+restart: -+ for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { -+ struct page *page; -+ unsigned long pfn = pte_pfn(pte[i]); -+ -+ args->mm_stats[MM_LEAF_TOTAL]++; -+ -+ if (!pte_present(pte[i]) || is_zero_pfn(pfn)) -+ continue; -+ -+ if (WARN_ON_ONCE(pte_devmap(pte[i]) || pte_special(pte[i]))) -+ continue; -+ -+ if (!pte_young(pte[i])) { -+ args->mm_stats[MM_LEAF_OLD]++; -+ continue; -+ } -+ -+ VM_BUG_ON(!pfn_valid(pfn)); -+ if (pfn < args->start_pfn || pfn >= args->end_pfn) -+ continue; -+ -+ page = compound_head(pfn_to_page(pfn)); -+ if (page_to_nid(page) != args->node_id) -+ continue; -+ -+ if (page_memcg_rcu(page) != args->memcg) -+ continue; -+ -+ VM_BUG_ON(addr < walk->vma->vm_start || addr >= walk->vma->vm_end); -+ if (!ptep_test_and_clear_young(walk->vma, addr, pte + i)) -+ continue; -+ -+ args->mm_stats[MM_LEAF_YOUNG]++; -+ -+ if (pte_dirty(pte[i]) && !PageDirty(page) && -+ !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page))) -+ set_page_dirty(page); -+ -+ old_gen = page_update_gen(page, new_gen); -+ if (old_gen >= 0 && old_gen != new_gen) -+ update_batch_size(page, old_gen, new_gen, args); -+ -+ worth++; -+ } -+ -+ if (i < PTRS_PER_PTE && get_next_vma(walk, PMD_MASK, PAGE_SIZE, &start, &end)) -+ goto restart; -+ -+ arch_leave_lazy_mmu_mode(); -+ pte_unmap_unlock(pte, ptl); -+ -+ return worth >= MIN_BATCH_SIZE / 2; -+} -+ -+/* -+ * We scan PMD entries in two passes. The first pass reaches to PTE tables and -+ * doesn't take the PMD lock. The second pass clears the accessed bit on PMD -+ * entries and needs to take the PMD lock. -+ */ -+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) -+static void walk_pmd_range_locked(pud_t *pud, unsigned long start, int offset, -+ struct vm_area_struct *vma, struct mm_walk *walk) -+{ -+ int i; -+ pmd_t *pmd; -+ spinlock_t *ptl; -+ struct mm_walk_args *args = walk->private; -+ int old_gen, new_gen = lru_gen_from_seq(args->max_seq); -+ -+ VM_BUG_ON(pud_leaf(*pud)); -+ -+ start = (start & PUD_MASK) + offset * PMD_SIZE; -+ pmd = pmd_offset(pud, start); -+ ptl = pmd_lock(walk->mm, pmd); -+ arch_enter_lazy_mmu_mode(); -+ -+ for_each_set_bit(i, args->bitmap, MIN_BATCH_SIZE) { -+ struct page *page; -+ unsigned long pfn = pmd_pfn(pmd[i]); -+ unsigned long addr = start + i * PMD_SIZE; -+ -+ if (!pmd_present(pmd[i]) || is_huge_zero_pmd(pmd[i])) -+ continue; -+ -+ if (WARN_ON_ONCE(pmd_devmap(pmd[i]))) -+ continue; -+ -+ if (!pmd_trans_huge(pmd[i])) { -+ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)) -+ pmdp_test_and_clear_young(vma, addr, pmd + i); -+ continue; -+ } -+ -+ VM_BUG_ON(!pfn_valid(pfn)); -+ if (pfn < args->start_pfn || pfn >= args->end_pfn) -+ continue; -+ -+ page = pfn_to_page(pfn); -+ VM_BUG_ON_PAGE(PageTail(page), page); -+ if (page_to_nid(page) != args->node_id) -+ continue; -+ -+ if (page_memcg_rcu(page) != args->memcg) -+ continue; -+ -+ VM_BUG_ON(addr < vma->vm_start || addr >= vma->vm_end); -+ if (!pmdp_test_and_clear_young(vma, addr, pmd + i)) -+ continue; -+ -+ args->mm_stats[MM_LEAF_YOUNG]++; -+ -+ if (pmd_dirty(pmd[i]) && !PageDirty(page) && -+ !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page))) -+ set_page_dirty(page); -+ -+ old_gen = page_update_gen(page, new_gen); -+ if (old_gen >= 0 && old_gen != new_gen) -+ update_batch_size(page, old_gen, new_gen, args); -+ } -+ -+ arch_leave_lazy_mmu_mode(); -+ spin_unlock(ptl); -+ -+ bitmap_zero(args->bitmap, MIN_BATCH_SIZE); -+} -+#else -+static void walk_pmd_range_locked(pud_t *pud, unsigned long start, int offset, -+ struct vm_area_struct *vma, struct mm_walk *walk) -+{ -+} -+#endif -+ -+static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, -+ struct mm_walk *walk) -+{ -+ int i; -+ pmd_t *pmd; -+ unsigned long next; -+ unsigned long addr; -+ struct vm_area_struct *vma; -+ int offset = -1; -+ bool reset = false; -+ struct mm_walk_args *args = walk->private; -+ struct lruvec *lruvec = get_lruvec(args->node_id, args->memcg); -+ -+ VM_BUG_ON(pud_leaf(*pud)); -+ -+ pmd = pmd_offset(pud, start & PUD_MASK); -+restart: -+ vma = walk->vma; -+ for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) { -+ pmd_t val = pmd_read_atomic(pmd + i); -+ -+ /* for pmd_read_atomic() */ -+ barrier(); -+ -+ next = pmd_addr_end(addr, end); -+ -+ if (!pmd_present(val)) { -+ args->mm_stats[MM_LEAF_TOTAL]++; -+ continue; -+ } -+ -+#ifdef CONFIG_TRANSPARENT_HUGEPAGE -+ if (pmd_trans_huge(val)) { -+ unsigned long pfn = pmd_pfn(val); -+ -+ args->mm_stats[MM_LEAF_TOTAL]++; -+ -+ if (is_huge_zero_pmd(val)) -+ continue; -+ -+ if (!pmd_young(val)) { -+ args->mm_stats[MM_LEAF_OLD]++; -+ continue; -+ } -+ -+ if (pfn < args->start_pfn || pfn >= args->end_pfn) -+ continue; -+ -+ if (offset < 0) -+ offset = i; -+ else if (i - offset >= MIN_BATCH_SIZE) { -+ walk_pmd_range_locked(pud, start, offset, vma, walk); -+ offset = i; -+ } -+ __set_bit(i - offset, args->bitmap); -+ reset = true; -+ continue; -+ } -+#endif -+ args->mm_stats[MM_NONLEAF_TOTAL]++; -+ -+#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG -+ if (!pmd_young(val)) -+ continue; -+ -+ if (offset < 0) -+ offset = i; -+ else if (i - offset >= MIN_BATCH_SIZE) { -+ walk_pmd_range_locked(pud, start, offset, vma, walk); -+ offset = i; -+ reset = false; -+ } -+ __set_bit(i - offset, args->bitmap); -+#endif -+ if (args->use_filter && !test_bloom_filter(lruvec, args->max_seq, pmd + i)) -+ continue; -+ -+ args->mm_stats[MM_NONLEAF_PREV]++; -+ -+ if (!walk_pte_range(&val, addr, next, walk)) -+ continue; -+ -+ args->mm_stats[MM_NONLEAF_CUR]++; -+ -+ set_bloom_filter(lruvec, args->max_seq + 1, pmd + i); -+ } -+ -+ if (reset) { -+ walk_pmd_range_locked(pud, start, offset, vma, walk); -+ offset = -1; -+ reset = false; -+ } -+ -+ if (i < PTRS_PER_PMD && get_next_vma(walk, PUD_MASK, PMD_SIZE, &start, &end)) -+ goto restart; -+ -+ if (offset >= 0) -+ walk_pmd_range_locked(pud, start, offset, vma, walk); -+} -+ -+static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end, -+ struct mm_walk *walk) -+{ -+ int i; -+ pud_t *pud; -+ unsigned long addr; -+ unsigned long next; -+ struct mm_walk_args *args = walk->private; -+ -+ VM_BUG_ON(p4d_leaf(*p4d)); -+ -+ pud = pud_offset(p4d, start & P4D_MASK); -+restart: -+ for (i = pud_index(start), addr = start; addr != end; i++, addr = next) { -+ pud_t val = READ_ONCE(pud[i]); -+ -+ next = pud_addr_end(addr, end); -+ -+ if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val))) -+ continue; -+ -+ walk_pmd_range(&val, addr, next, walk); -+ -+ if (args->batch_size >= MAX_BATCH_SIZE) { -+ end = (addr | ~PUD_MASK) + 1; -+ goto done; -+ } -+ } -+ -+ if (i < PTRS_PER_PUD && get_next_vma(walk, P4D_MASK, PUD_SIZE, &start, &end)) -+ goto restart; -+ -+ end = round_up(end, P4D_SIZE); -+done: -+ /* rounded-up boundaries can wrap to 0 */ -+ args->next_addr = end && walk->vma ? max(end, walk->vma->vm_start) : 0; -+ -+ return -EAGAIN; -+} -+ -+static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct mm_walk_args *args) -+{ -+ static const struct mm_walk_ops mm_walk_ops = { -+ .test_walk = should_skip_vma, -+ .p4d_entry = walk_pud_range, -+ }; -+ -+ int err; -+ -+ args->next_addr = FIRST_USER_ADDRESS; -+ -+ do { -+ unsigned long start = args->next_addr; -+ unsigned long end = mm->highest_vm_end; -+ -+ err = -EBUSY; -+ -+ rcu_read_lock(); -+#ifdef CONFIG_MEMCG -+ if (args->memcg && atomic_read(&args->memcg->moving_account)) -+ goto contended; -+#endif -+ if (!mmap_read_trylock(mm)) -+ goto contended; -+ -+ err = walk_page_range(mm, start, end, &mm_walk_ops, args); -+ -+ mmap_read_unlock(mm); -+ -+ if (args->batch_size) { -+ spin_lock_irq(&lruvec->lru_lock); -+ reset_batch_size(lruvec, args); -+ spin_unlock_irq(&lruvec->lru_lock); -+ } -+contended: -+ rcu_read_unlock(); -+ -+ cond_resched(); -+ } while (err == -EAGAIN && args->next_addr && !mm_is_oom_victim(mm)); -+} -+ -+static struct mm_walk_args *alloc_mm_walk_args(void) -+{ -+ if (!current->reclaim_state || !current->reclaim_state->mm_walk_args) -+ return kvzalloc(sizeof(struct mm_walk_args), GFP_KERNEL); -+ -+ return current->reclaim_state->mm_walk_args; -+} -+ -+static void free_mm_walk_args(struct mm_walk_args *args) -+{ -+ if (!current->reclaim_state || !current->reclaim_state->mm_walk_args) -+ kvfree(args); -+} -+ -+static bool inc_min_seq(struct lruvec *lruvec, int type) -+{ -+ int gen, zone; -+ int remaining = MAX_BATCH_SIZE; -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ VM_BUG_ON(!seq_is_valid(lruvec)); -+ -+ if (get_nr_gens(lruvec, type) != MAX_NR_GENS) -+ return true; -+ -+ gen = lru_gen_from_seq(lrugen->min_seq[type]); -+ -+ for (zone = 0; zone < MAX_NR_ZONES; zone++) { -+ struct list_head *head = &lrugen->lists[gen][type][zone]; -+ -+ while (!list_empty(head)) { -+ struct page *page = lru_to_page(head); -+ -+ VM_BUG_ON_PAGE(PageTail(page), page); -+ VM_BUG_ON_PAGE(PageUnevictable(page), page); -+ VM_BUG_ON_PAGE(PageActive(page), page); -+ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); -+ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page); -+ -+ prefetchw_prev_lru_page(page, head, flags); -+ -+ page_inc_gen(page, lruvec, false); -+ -+ if (!--remaining) -+ return false; -+ } -+ } -+ -+ reset_ctrl_pos(lruvec, gen, type); -+ WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); -+ -+ return true; -+} -+ -+static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness) -+{ -+ int gen, type, zone; -+ bool success = false; -+ struct lrugen *lrugen = &lruvec->evictable; -+ DEFINE_MIN_SEQ(lruvec); -+ -+ VM_BUG_ON(!seq_is_valid(lruvec)); -+ -+ for (type = 0; type < ANON_AND_FILE; type++) { -+ while (lrugen->max_seq - min_seq[type] >= MIN_NR_GENS) { -+ gen = lru_gen_from_seq(min_seq[type]); -+ -+ for (zone = 0; zone < MAX_NR_ZONES; zone++) { -+ if (!list_empty(&lrugen->lists[gen][type][zone])) -+ goto next; -+ } -+ -+ min_seq[type]++; -+ } -+next: -+ ; -+ } -+ -+ min_seq[0] = min(min_seq[0], min_seq[1]); -+ if (swappiness) -+ min_seq[1] = max(min_seq[0], lrugen->min_seq[1]); -+ -+ for (type = 0; type < ANON_AND_FILE; type++) { -+ if (min_seq[type] == lrugen->min_seq[type]) -+ continue; -+ -+ gen = lru_gen_from_seq(lrugen->min_seq[type]); -+ reset_ctrl_pos(lruvec, gen, type); -+ WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); -+ success = true; -+ } -+ -+ return success; -+} -+ -+static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq) -+{ -+ int gen, type, zone; -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ spin_lock_irq(&lruvec->lru_lock); -+ -+ VM_BUG_ON(!seq_is_valid(lruvec)); -+ -+ if (max_seq != lrugen->max_seq) -+ goto unlock; -+ -+ if (!try_to_inc_min_seq(lruvec, true)) { -+ for (type = ANON_AND_FILE - 1; type >= 0; type--) { -+ while (!inc_min_seq(lruvec, type)) { -+ spin_unlock_irq(&lruvec->lru_lock); -+ cond_resched(); -+ spin_lock_irq(&lruvec->lru_lock); -+ } -+ } -+ } -+ -+ gen = lru_gen_from_seq(lrugen->max_seq - 1); -+ for (type = 0; type < ANON_AND_FILE; type++) { -+ for (zone = 0; zone < MAX_NR_ZONES; zone++) { -+ enum lru_list lru = type * LRU_FILE; -+ long delta = lrugen->sizes[gen][type][zone]; -+ -+ if (!delta) -+ continue; -+ -+ WARN_ON_ONCE(delta != (int)delta); -+ -+ update_lru_size(lruvec, lru, zone, delta); -+ update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta); -+ } -+ } -+ -+ gen = lru_gen_from_seq(lrugen->max_seq + 1); -+ for (type = 0; type < ANON_AND_FILE; type++) { -+ for (zone = 0; zone < MAX_NR_ZONES; zone++) { -+ enum lru_list lru = type * LRU_FILE; -+ long delta = lrugen->sizes[gen][type][zone]; -+ -+ if (!delta) -+ continue; -+ -+ WARN_ON_ONCE(delta != (int)delta); -+ -+ update_lru_size(lruvec, lru, zone, -delta); -+ update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); -+ } -+ } -+ -+ for (type = 0; type < ANON_AND_FILE; type++) -+ reset_ctrl_pos(lruvec, gen, type); -+ -+ WRITE_ONCE(lrugen->timestamps[gen], jiffies); -+ /* make sure all preceding modifications appear first */ -+ smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); -+unlock: -+ spin_unlock_irq(&lruvec->lru_lock); -+} -+ -+/* Main function used by the foreground, the background and the user-triggered aging. */ -+static bool try_to_inc_max_seq(struct lruvec *lruvec, struct scan_control *sc, int swappiness, -+ unsigned long max_seq, bool use_filter) -+{ -+ bool last; -+ struct mm_walk_args *args; -+ struct mm_struct *mm = NULL; -+ struct lrugen *lrugen = &lruvec->evictable; -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ struct pglist_data *pgdat = lruvec_pgdat(lruvec); -+ int nid = pgdat->node_id; -+ -+ VM_BUG_ON(max_seq > READ_ONCE(lrugen->max_seq)); -+ -+ /* -+ * If we are not from run_aging() and clearing the accessed bit may -+ * trigger page faults, then don't proceed to clearing all accessed -+ * PTEs. Instead, fallback to lru_gen_look_around(), which only clears a -+ * handful of accessed PTEs. This is less efficient but causes fewer -+ * page faults on CPUs that don't have the capability. -+ */ -+ if ((current->flags & PF_MEMALLOC) && !arch_has_hw_pte_young(false)) { -+ inc_max_seq(lruvec, max_seq); -+ return true; -+ } -+ -+ args = alloc_mm_walk_args(); -+ if (!args) -+ return false; -+ -+ args->memcg = memcg; -+ args->max_seq = max_seq; -+ args->start_pfn = pgdat->node_start_pfn; -+ args->end_pfn = pgdat_end_pfn(pgdat); -+ args->node_id = nid; -+ args->swappiness = swappiness; -+ args->use_filter = use_filter; -+ -+ do { -+ last = get_next_mm(lruvec, args, &mm); -+ if (mm) -+ walk_mm(lruvec, mm, args); -+ -+ cond_resched(); -+ } while (mm); -+ -+ free_mm_walk_args(args); -+ -+ if (!last) { -+ /* don't wait unless we may have trouble reclaiming */ -+ if (!current_is_kswapd() && sc->priority < DEF_PRIORITY - 2) -+ wait_event_killable(lruvec->mm_walk.wait, -+ max_seq < READ_ONCE(lrugen->max_seq)); -+ -+ return max_seq < READ_ONCE(lrugen->max_seq); -+ } -+ -+ VM_BUG_ON(max_seq != READ_ONCE(lrugen->max_seq)); -+ -+ inc_max_seq(lruvec, max_seq); -+ /* either we see any waiters or they will see updated max_seq */ -+ if (wq_has_sleeper(&lruvec->mm_walk.wait)) -+ wake_up_all(&lruvec->mm_walk.wait); -+ -+ wakeup_flusher_threads(WB_REASON_VMSCAN); -+ -+ return true; -+} -+ -+static long get_nr_evictable(struct lruvec *lruvec, struct scan_control *sc, int swappiness, -+ unsigned long max_seq, unsigned long *min_seq, bool *low) -+{ -+ int gen, type, zone; -+ long max = 0; -+ long min = 0; -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ for (type = !swappiness; type < ANON_AND_FILE; type++) { -+ unsigned long seq; -+ -+ for (seq = min_seq[type]; seq <= max_seq; seq++) { -+ long size = 0; -+ -+ gen = lru_gen_from_seq(seq); -+ -+ for (zone = 0; zone <= sc->reclaim_idx; zone++) -+ size += READ_ONCE(lrugen->sizes[gen][type][zone]); -+ -+ max += size; -+ if (type && max_seq - seq >= MIN_NR_GENS) -+ min += size; -+ } -+ } -+ -+ *low = max_seq - min_seq[1] <= MIN_NR_GENS && min < MIN_BATCH_SIZE; -+ -+ return max > 0 ? max : 0; -+} -+ -+static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, -+ unsigned long min_ttl) -+{ -+ bool low; -+ long nr_to_scan; -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ int swappiness = get_swappiness(memcg); -+ DEFINE_MAX_SEQ(lruvec); -+ DEFINE_MIN_SEQ(lruvec); -+ -+ if (mem_cgroup_below_min(memcg)) -+ return false; -+ -+ if (min_ttl) { -+ int gen = lru_gen_from_seq(min_seq[1]); -+ unsigned long birth = READ_ONCE(lruvec->evictable.timestamps[gen]); -+ -+ if (time_is_after_jiffies(birth + min_ttl)) -+ return false; -+ } -+ -+ nr_to_scan = get_nr_evictable(lruvec, sc, swappiness, max_seq, min_seq, &low); -+ if (!nr_to_scan) -+ return false; -+ -+ nr_to_scan >>= sc->priority; -+ -+ if (!mem_cgroup_online(memcg)) -+ nr_to_scan++; -+ -+ if (nr_to_scan && low && (!mem_cgroup_below_low(memcg) || sc->memcg_low_reclaim)) -+ try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, true); -+ -+ return true; -+} -+ -+/* Protect the working set accessed within the last N milliseconds. */ -+static unsigned long lru_gen_min_ttl __read_mostly; -+ -+static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) -+{ -+ struct mem_cgroup *memcg; -+ bool success = false; -+ unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl); -+ -+ VM_BUG_ON(!current_is_kswapd()); -+ -+ if (!sc->force_deactivate) { -+ sc->force_deactivate = 1; -+ return; -+ } -+ -+ current->reclaim_state->mm_walk_args = &pgdat->mm_walk_args; -+ -+ memcg = mem_cgroup_iter(NULL, NULL, NULL); -+ do { -+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); -+ -+ if (age_lruvec(lruvec, sc, min_ttl)) -+ success = true; -+ -+ cond_resched(); -+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); -+ -+ if (!success && mutex_trylock(&oom_lock)) { -+ struct oom_control oc = { -+ .gfp_mask = sc->gfp_mask, -+ .order = sc->order, -+ }; -+ -+ /* to avoid overkilling */ -+ if (!oom_reaping_in_progress()) -+ out_of_memory(&oc); -+ -+ mutex_unlock(&oom_lock); -+ } -+ -+ current->reclaim_state->mm_walk_args = NULL; -+} -+ -+/* Scan the vicinity of an accessed PTE when shrink_page_list() uses the rmap. */ -+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) -+{ -+ int i; -+ pte_t *pte; -+ struct page *page; -+ int old_gen, new_gen; -+ unsigned long start; -+ unsigned long end; -+ unsigned long addr; -+ struct mm_walk_args *args; -+ int worth = 0; -+ struct mem_cgroup *memcg = page_memcg(pvmw->page); -+ struct pglist_data *pgdat = page_pgdat(pvmw->page); -+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); -+ DEFINE_MAX_SEQ(lruvec); -+ -+ lockdep_assert_held(pvmw->ptl); -+ VM_BUG_ON_PAGE(PageLRU(pvmw->page), pvmw->page); -+ -+ args = current->reclaim_state ? current->reclaim_state->mm_walk_args : NULL; -+ if (!args) -+ return; -+ -+ start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); -+ end = pmd_addr_end(pvmw->address, pvmw->vma->vm_end); -+ -+ if (end - start > MIN_BATCH_SIZE * PAGE_SIZE) { -+ if (pvmw->address - start < MIN_BATCH_SIZE * PAGE_SIZE / 2) -+ end = start + MIN_BATCH_SIZE * PAGE_SIZE; -+ else if (end - pvmw->address < MIN_BATCH_SIZE * PAGE_SIZE / 2) -+ start = end - MIN_BATCH_SIZE * PAGE_SIZE; -+ else { -+ start = pvmw->address - MIN_BATCH_SIZE * PAGE_SIZE / 2; -+ end = pvmw->address + MIN_BATCH_SIZE * PAGE_SIZE / 2; -+ } -+ } -+ -+ pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE; -+ new_gen = lru_gen_from_seq(max_seq); -+ -+ lock_page_memcg(pvmw->page); -+ arch_enter_lazy_mmu_mode(); -+ -+ for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { -+ unsigned long pfn = pte_pfn(pte[i]); -+ -+ if (!pte_present(pte[i]) || is_zero_pfn(pfn)) -+ continue; -+ -+ if (WARN_ON_ONCE(pte_devmap(pte[i]) || pte_special(pte[i]))) -+ continue; -+ -+ VM_BUG_ON(!pfn_valid(pfn)); -+ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) -+ continue; -+ -+ worth++; -+ -+ if (!pte_young(pte[i])) -+ continue; -+ -+ page = compound_head(pfn_to_page(pfn)); -+ if (page_to_nid(page) != pgdat->node_id) -+ continue; -+ -+ if (page_memcg_rcu(page) != memcg) -+ continue; -+ -+ VM_BUG_ON(addr < pvmw->vma->vm_start || addr >= pvmw->vma->vm_end); -+ if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) -+ continue; -+ -+ if (pte_dirty(pte[i]) && !PageDirty(page) && -+ !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page))) -+ __set_bit(i, args->bitmap); -+ -+ old_gen = page_update_gen(page, new_gen); -+ if (old_gen >= 0 && old_gen != new_gen) -+ update_batch_size(page, old_gen, new_gen, args); -+ } -+ -+ arch_leave_lazy_mmu_mode(); -+ unlock_page_memcg(pvmw->page); -+ -+ if (worth >= MIN_BATCH_SIZE / 2) -+ set_bloom_filter(lruvec, max_seq, pvmw->pmd); -+ -+ for_each_set_bit(i, args->bitmap, MIN_BATCH_SIZE) -+ set_page_dirty(pte_page(pte[i])); -+ -+ bitmap_zero(args->bitmap, MIN_BATCH_SIZE); -+} -+ -+/****************************************************************************** -+ * the eviction -+ ******************************************************************************/ -+ -+static bool sort_page(struct page *page, struct lruvec *lruvec, int tier_idx) -+{ -+ bool success; -+ int gen = page_lru_gen(page); -+ int type = page_is_file_lru(page); -+ int zone = page_zonenum(page); -+ int tier = page_lru_tier(page); -+ int delta = thp_nr_pages(page); -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ VM_BUG_ON_PAGE(gen >= MAX_NR_GENS, page); -+ -+ /* an mlocked page? */ -+ if (!page_evictable(page)) { -+ success = lru_gen_del_page(page, lruvec, true); -+ VM_BUG_ON_PAGE(!success, page); -+ SetPageUnevictable(page); -+ add_page_to_lru_list(page, lruvec); -+ __count_vm_events(UNEVICTABLE_PGCULLED, delta); -+ return true; -+ } -+ -+ /* a lazy-free page that has been written into? */ -+ if (type && PageDirty(page) && PageAnon(page)) { -+ success = lru_gen_del_page(page, lruvec, true); -+ VM_BUG_ON_PAGE(!success, page); -+ SetPageSwapBacked(page); -+ add_page_to_lru_list_tail(page, lruvec); -+ return true; -+ } -+ -+ /* page_update_gen() has updated this page? */ -+ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { -+ list_move(&page->lru, &lrugen->lists[gen][type][zone]); -+ return true; -+ } -+ -+ /* protect this page if its tier has a higher refaulted % */ -+ if (tier > tier_idx) { -+ int hist = lru_hist_from_seq(gen); -+ -+ page_inc_gen(page, lruvec, false); -+ WRITE_ONCE(lrugen->protected[hist][type][tier - 1], -+ lrugen->protected[hist][type][tier - 1] + delta); -+ __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); -+ return true; -+ } -+ -+ /* mark this page for reclaim if it's pending writeback */ -+ if (PageWriteback(page) || (type && PageDirty(page))) { -+ page_inc_gen(page, lruvec, true); -+ return true; -+ } -+ -+ return false; -+} -+ -+static bool isolate_page(struct page *page, struct lruvec *lruvec, struct scan_control *sc) -+{ -+ bool success; -+ -+ if (!sc->may_unmap && page_mapped(page)) -+ return false; -+ -+ if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) && -+ (PageDirty(page) || (PageAnon(page) && !PageSwapCache(page)))) -+ return false; -+ -+ if (!get_page_unless_zero(page)) -+ return false; -+ -+ if (!TestClearPageLRU(page)) { -+ put_page(page); -+ return false; -+ } -+ -+ success = lru_gen_del_page(page, lruvec, true); -+ VM_BUG_ON_PAGE(!success, page); -+ -+ return true; -+} -+ -+static int scan_pages(struct lruvec *lruvec, struct scan_control *sc, -+ int type, int tier, struct list_head *list) -+{ -+ int gen, zone; -+ enum vm_event_item item; -+ int sorted = 0; -+ int scanned = 0; -+ int isolated = 0; -+ int remaining = MAX_BATCH_SIZE; -+ struct lrugen *lrugen = &lruvec->evictable; -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ -+ VM_BUG_ON(!list_empty(list)); -+ -+ if (get_nr_gens(lruvec, type) == MIN_NR_GENS) -+ return 0; -+ -+ gen = lru_gen_from_seq(lrugen->min_seq[type]); -+ -+ for (zone = sc->reclaim_idx; zone >= 0; zone--) { -+ LIST_HEAD(moved); -+ int skipped = 0; -+ struct list_head *head = &lrugen->lists[gen][type][zone]; -+ -+ while (!list_empty(head)) { -+ struct page *page = lru_to_page(head); -+ int delta = thp_nr_pages(page); -+ -+ VM_BUG_ON_PAGE(PageTail(page), page); -+ VM_BUG_ON_PAGE(PageUnevictable(page), page); -+ VM_BUG_ON_PAGE(PageActive(page), page); -+ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); -+ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page); -+ -+ prefetchw_prev_lru_page(page, head, flags); -+ -+ scanned += delta; -+ -+ if (sort_page(page, lruvec, tier)) -+ sorted += delta; -+ else if (isolate_page(page, lruvec, sc)) { -+ list_add(&page->lru, list); -+ isolated += delta; -+ } else { -+ list_move(&page->lru, &moved); -+ skipped += delta; -+ } -+ -+ if (!--remaining || max(isolated, skipped) >= MIN_BATCH_SIZE) -+ break; -+ } -+ -+ if (skipped) { -+ list_splice(&moved, head); -+ __count_zid_vm_events(PGSCAN_SKIP, zone, skipped); -+ } -+ -+ if (!remaining || isolated >= MIN_BATCH_SIZE) -+ break; -+ } -+ -+ item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; -+ if (!cgroup_reclaim(sc)) { -+ __count_vm_events(item, isolated); -+ __count_vm_events(PGREFILL, sorted); -+ } -+ __count_memcg_events(memcg, item, isolated); -+ __count_memcg_events(memcg, PGREFILL, sorted); -+ __count_vm_events(PGSCAN_ANON + type, isolated); -+ -+ /* -+ * We may have trouble finding eligible pages due to reclaim_idx, -+ * may_unmap and may_writepage. Check `remaining` to make sure we won't -+ * be stuck if we aren't making enough progress. -+ */ -+ return isolated || !remaining ? scanned : 0; -+} -+ -+static int get_tier_idx(struct lruvec *lruvec, int type) -+{ -+ int tier; -+ struct ctrl_pos sp, pv; -+ -+ /* -+ * Ideally we don't want to evict upper tiers that have higher refaulted -+ * %. However, we need to leave a margin for the fluctuation in -+ * refaulted %. So we use a larger gain factor to make sure upper tiers -+ * are indeed more active. We choose 2 because the lowest upper tier -+ * would have twice of refaulted % of the base tier, according to their -+ * numbers of accesses. -+ */ -+ read_ctrl_pos(lruvec, type, 0, 1, &sp); -+ for (tier = 1; tier < MAX_NR_TIERS; tier++) { -+ read_ctrl_pos(lruvec, type, tier, 2, &pv); -+ if (!positive_ctrl_err(&sp, &pv)) -+ break; -+ } -+ -+ return tier - 1; -+} -+ -+static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx) -+{ -+ int type, tier; -+ struct ctrl_pos sp, pv; -+ int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness }; -+ -+ /* -+ * Compare refaulted % between the base tiers of anon and file to -+ * determine which type to evict. Also need to compare refaulted % of -+ * the upper tiers of the selected type with that of the base tier of -+ * the other type to determine which tier of the selected type to evict. -+ */ -+ read_ctrl_pos(lruvec, 0, 0, gain[0], &sp); -+ read_ctrl_pos(lruvec, 1, 0, gain[1], &pv); -+ type = positive_ctrl_err(&sp, &pv); -+ -+ read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp); -+ for (tier = 1; tier < MAX_NR_TIERS; tier++) { -+ read_ctrl_pos(lruvec, type, tier, gain[type], &pv); -+ if (!positive_ctrl_err(&sp, &pv)) -+ break; -+ } -+ -+ *tier_idx = tier - 1; -+ -+ return type; -+} -+ -+static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness, -+ int *type_scanned, struct list_head *list) -+{ -+ int i; -+ int type; -+ int scanned; -+ int tier = -1; -+ DEFINE_MIN_SEQ(lruvec); -+ -+ VM_BUG_ON(!seq_is_valid(lruvec)); -+ -+ /* -+ * Try to select a type based on generations and swappiness, and if that -+ * fails, fall back to get_type_to_scan(). When anon and file are both -+ * available from the same generation, swappiness 200 is interpreted as -+ * anon first and swappiness 1 is interpreted as file first. -+ */ -+ if (!swappiness) -+ type = 1; -+ else if (min_seq[0] < min_seq[1]) -+ type = 0; -+ else if (swappiness == 1) -+ type = 1; -+ else if (swappiness == 200) -+ type = 0; -+ else -+ type = get_type_to_scan(lruvec, swappiness, &tier); -+ -+ for (i = !swappiness; i < ANON_AND_FILE; i++) { -+ if (tier < 0) -+ tier = get_tier_idx(lruvec, type); -+ -+ scanned = scan_pages(lruvec, sc, type, tier, list); -+ if (scanned) -+ break; -+ -+ type = !type; -+ tier = -1; -+ } -+ -+ *type_scanned = type; -+ -+ return scanned; -+} -+ -+/* Main function used by the foreground, the background and the user-triggered eviction. */ -+static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness) -+{ -+ int type; -+ int scanned; -+ int reclaimed; -+ LIST_HEAD(list); -+ struct page *page; -+ enum vm_event_item item; -+ struct reclaim_stat stat; -+ struct mm_walk_args *args; -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ struct pglist_data *pgdat = lruvec_pgdat(lruvec); -+ -+ spin_lock_irq(&lruvec->lru_lock); -+ -+ scanned = isolate_pages(lruvec, sc, swappiness, &type, &list); -+ -+ if (try_to_inc_min_seq(lruvec, swappiness)) -+ scanned++; -+ -+ if (get_nr_gens(lruvec, 1) == MIN_NR_GENS) -+ scanned = 0; -+ -+ spin_unlock_irq(&lruvec->lru_lock); -+ -+ if (list_empty(&list)) -+ return scanned; -+ -+ reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false); -+ /* -+ * We need to prevent rejected pages from being added back to the same -+ * lists they were isolated from. Otherwise we may risk looping on them -+ * forever. -+ */ -+ list_for_each_entry(page, &list, lru) { -+ if (!PageReclaim(page) || !(PageDirty(page) || PageWriteback(page))) -+ SetPageActive(page); -+ -+ ClearPageReferenced(page); -+ ClearPageWorkingset(page); -+ } -+ -+ spin_lock_irq(&lruvec->lru_lock); -+ -+ move_pages_to_lru(lruvec, &list); -+ -+ args = current->reclaim_state ? current->reclaim_state->mm_walk_args : NULL; -+ if (args && args->batch_size) -+ reset_batch_size(lruvec, args); -+ -+ item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; -+ if (!cgroup_reclaim(sc)) -+ __count_vm_events(item, reclaimed); -+ __count_memcg_events(memcg, item, reclaimed); -+ __count_vm_events(PGSTEAL_ANON + type, reclaimed); -+ -+ spin_unlock_irq(&lruvec->lru_lock); -+ -+ mem_cgroup_uncharge_list(&list); -+ free_unref_page_list(&list); -+ -+ sc->nr_reclaimed += reclaimed; -+ -+ return scanned; -+} -+ -+static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness) -+{ -+ bool low; -+ long nr_to_scan; -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ int priority = sc->priority; -+ DEFINE_MAX_SEQ(lruvec); -+ DEFINE_MIN_SEQ(lruvec); -+ -+ if (mem_cgroup_below_min(memcg) || -+ (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim)) -+ return 0; -+ -+ if (sc->nr_reclaimed >= sc->nr_to_reclaim) { -+ priority = DEF_PRIORITY; -+ sc->force_deactivate = 0; -+ } -+ -+ nr_to_scan = get_nr_evictable(lruvec, sc, swappiness, max_seq, min_seq, &low); -+ if (!nr_to_scan) -+ return 0; -+ -+ nr_to_scan >>= priority; -+ -+ if (!mem_cgroup_online(memcg)) -+ nr_to_scan++; -+ -+ if (!nr_to_scan) -+ return 0; -+ -+ if (current_is_kswapd()) { -+ /* leave the work to lru_gen_age_node() */ -+ if (max_seq - min_seq[1] < MIN_NR_GENS) -+ return 0; -+ -+ if (!low) -+ sc->force_deactivate = 0; -+ -+ return nr_to_scan; -+ } -+ -+ if (max_seq - min_seq[1] >= MIN_NR_GENS) -+ return nr_to_scan; -+ -+ /* move onto slab and other memcgs if we haven't tried them all */ -+ if (!sc->force_deactivate) { -+ sc->skipped_deactivate = 1; -+ return 0; -+ } -+ -+ return try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, true) ? nr_to_scan : 0; -+} -+ -+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) -+{ -+ struct blk_plug plug; -+ long scanned = 0; -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ struct pglist_data *pgdat = lruvec_pgdat(lruvec); -+ -+ lru_add_drain(); -+ -+ if (current_is_kswapd()) -+ current->reclaim_state->mm_walk_args = &pgdat->mm_walk_args; -+ -+ blk_start_plug(&plug); -+ -+ while (true) { -+ int delta; -+ int swappiness; -+ long nr_to_scan; -+ -+ if (sc->may_swap) -+ swappiness = get_swappiness(memcg); -+ else if (!cgroup_reclaim(sc) && get_swappiness(memcg)) -+ swappiness = 1; -+ else -+ swappiness = 0; -+ -+ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); -+ if (!nr_to_scan) -+ break; -+ -+ delta = evict_pages(lruvec, sc, swappiness); -+ if (!delta) -+ break; -+ -+ scanned += delta; -+ if (scanned >= nr_to_scan) -+ break; -+ -+ cond_resched(); -+ } -+ -+ blk_finish_plug(&plug); -+ -+ if (current_is_kswapd()) -+ current->reclaim_state->mm_walk_args = NULL; -+} -+ -+/****************************************************************************** -+ * state change -+ ******************************************************************************/ -+ -+#ifdef CONFIG_LRU_GEN_ENABLED -+DEFINE_STATIC_KEY_TRUE(lru_gen_static_key); -+#else -+DEFINE_STATIC_KEY_FALSE(lru_gen_static_key); -+#endif -+ -+static int lru_gen_nr_swapfiles; -+ -+static bool __maybe_unused state_is_valid(struct lruvec *lruvec) -+{ -+ int gen, type, zone; -+ enum lru_list lru; -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ for_each_evictable_lru(lru) { -+ type = is_file_lru(lru); -+ -+ if (lrugen->enabled[type] && !list_empty(&lruvec->lists[lru])) -+ return false; -+ } -+ -+ for_each_gen_type_zone(gen, type, zone) { -+ if (!lrugen->enabled[type] && !list_empty(&lrugen->lists[gen][type][zone])) -+ return false; -+ -+ /* unlikely but not a bug when reset_batch_size() is pending */ -+ VM_WARN_ON(!lrugen->enabled[type] && lrugen->sizes[gen][type][zone]); -+ } -+ -+ return true; -+} -+ -+static bool fill_lists(struct lruvec *lruvec) -+{ -+ enum lru_list lru; -+ int remaining = MAX_BATCH_SIZE; -+ -+ for_each_evictable_lru(lru) { -+ int type = is_file_lru(lru); -+ bool active = is_active_lru(lru); -+ struct list_head *head = &lruvec->lists[lru]; -+ -+ if (!lruvec->evictable.enabled[type]) -+ continue; -+ -+ while (!list_empty(head)) { -+ bool success; -+ struct page *page = lru_to_page(head); -+ -+ VM_BUG_ON_PAGE(PageTail(page), page); -+ VM_BUG_ON_PAGE(PageUnevictable(page), page); -+ VM_BUG_ON_PAGE(PageActive(page) != active, page); -+ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); -+ VM_BUG_ON_PAGE(page_lru_gen(page) < MAX_NR_GENS, page); -+ -+ prefetchw_prev_lru_page(page, head, flags); -+ -+ del_page_from_lru_list(page, lruvec); -+ success = lru_gen_add_page(page, lruvec, false); -+ VM_BUG_ON(!success); -+ -+ if (!--remaining) -+ return false; -+ } -+ } -+ -+ return true; -+} -+ -+static bool drain_lists(struct lruvec *lruvec) -+{ -+ int gen, type, zone; -+ int remaining = MAX_BATCH_SIZE; -+ -+ for_each_gen_type_zone(gen, type, zone) { -+ struct list_head *head = &lruvec->evictable.lists[gen][type][zone]; -+ -+ if (lruvec->evictable.enabled[type]) -+ continue; -+ -+ while (!list_empty(head)) { -+ bool success; -+ struct page *page = lru_to_page(head); -+ -+ VM_BUG_ON_PAGE(PageTail(page), page); -+ VM_BUG_ON_PAGE(PageUnevictable(page), page); -+ VM_BUG_ON_PAGE(PageActive(page), page); -+ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); -+ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page); -+ -+ prefetchw_prev_lru_page(page, head, flags); -+ -+ success = lru_gen_del_page(page, lruvec, false); -+ VM_BUG_ON(!success); -+ add_page_to_lru_list(page, lruvec); -+ -+ if (!--remaining) -+ return false; -+ } -+ } -+ -+ return true; -+} -+ -+/* -+ * For file page tracking, we enable/disable it according to the main switch. -+ * For anon page tracking, we only enabled it when the main switch is on and -+ * there is at least one swapfile; we disable it when there are no swapfiles -+ * regardless of the value of the main switch. Otherwise, we will eventually -+ * reach the max size of the sliding window and have to call inc_min_seq(). -+ */ -+void lru_gen_change_state(bool enable, bool main, bool swap) -+{ -+ static DEFINE_MUTEX(state_mutex); -+ -+ struct mem_cgroup *memcg; -+ -+ mem_hotplug_begin(); -+ cgroup_lock(); -+ mutex_lock(&state_mutex); -+ -+ if (swap) { -+ if (enable) -+ swap = !lru_gen_nr_swapfiles++; -+ else -+ swap = !--lru_gen_nr_swapfiles; -+ } -+ -+ if (main && enable != lru_gen_enabled()) { -+ if (enable) -+ static_branch_enable(&lru_gen_static_key); -+ else -+ static_branch_disable(&lru_gen_static_key); -+ } else if (!swap || !lru_gen_enabled()) -+ goto unlock; -+ -+ memcg = mem_cgroup_iter(NULL, NULL, NULL); -+ do { -+ int nid; -+ -+ for_each_node(nid) { -+ struct lruvec *lruvec = get_lruvec(nid, memcg); -+ -+ if (!lruvec) -+ continue; -+ -+ spin_lock_irq(&lruvec->lru_lock); -+ -+ VM_BUG_ON(!seq_is_valid(lruvec)); -+ VM_BUG_ON(!state_is_valid(lruvec)); -+ -+ lruvec->evictable.enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles; -+ lruvec->evictable.enabled[1] = lru_gen_enabled(); -+ -+ while (!(enable ? fill_lists(lruvec) : drain_lists(lruvec))) { -+ spin_unlock_irq(&lruvec->lru_lock); -+ cond_resched(); -+ spin_lock_irq(&lruvec->lru_lock); -+ } -+ -+ spin_unlock_irq(&lruvec->lru_lock); -+ } -+ -+ cond_resched(); -+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); -+unlock: -+ mutex_unlock(&state_mutex); -+ cgroup_unlock(); -+ mem_hotplug_done(); -+} -+ -+/****************************************************************************** -+ * sysfs interface -+ ******************************************************************************/ -+ -+static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl))); -+} -+ -+static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, -+ const char *buf, size_t len) -+{ -+ unsigned int msecs; -+ -+ if (kstrtouint(buf, 10, &msecs)) -+ return -EINVAL; -+ -+ WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs)); -+ -+ return len; -+} -+ -+static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR( -+ min_ttl_ms, 0644, show_min_ttl, store_min_ttl -+); -+ -+static ssize_t show_enable(struct kobject *kobj, struct kobj_attribute *attr, char *buf) -+{ -+ return snprintf(buf, PAGE_SIZE, "%d\n", lru_gen_enabled()); -+} -+ -+static ssize_t store_enable(struct kobject *kobj, struct kobj_attribute *attr, -+ const char *buf, size_t len) -+{ -+ bool enable; -+ -+ if (kstrtobool(buf, &enable)) -+ return -EINVAL; -+ -+ lru_gen_change_state(enable, true, false); -+ -+ return len; -+} -+ -+static struct kobj_attribute lru_gen_enabled_attr = __ATTR( -+ enabled, 0644, show_enable, store_enable -+); -+ -+static struct attribute *lru_gen_attrs[] = { -+ &lru_gen_min_ttl_attr.attr, -+ &lru_gen_enabled_attr.attr, -+ NULL -+}; -+ -+static struct attribute_group lru_gen_attr_group = { -+ .name = "lru_gen", -+ .attrs = lru_gen_attrs, -+}; -+ -+/****************************************************************************** -+ * debugfs interface -+ ******************************************************************************/ -+ -+static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos) -+{ -+ struct mem_cgroup *memcg; -+ loff_t nr_to_skip = *pos; -+ -+ m->private = kvmalloc(PATH_MAX, GFP_KERNEL); -+ if (!m->private) -+ return ERR_PTR(-ENOMEM); -+ -+ memcg = mem_cgroup_iter(NULL, NULL, NULL); -+ do { -+ int nid; -+ -+ for_each_node_state(nid, N_MEMORY) { -+ if (!nr_to_skip--) -+ return get_lruvec(nid, memcg); -+ } -+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); -+ -+ return NULL; -+} -+ -+static void lru_gen_seq_stop(struct seq_file *m, void *v) -+{ -+ if (!IS_ERR_OR_NULL(v)) -+ mem_cgroup_iter_break(NULL, lruvec_memcg(v)); -+ -+ kvfree(m->private); -+ m->private = NULL; -+} -+ -+static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos) -+{ -+ int nid = lruvec_pgdat(v)->node_id; -+ struct mem_cgroup *memcg = lruvec_memcg(v); -+ -+ ++*pos; -+ -+ nid = next_memory_node(nid); -+ if (nid == MAX_NUMNODES) { -+ memcg = mem_cgroup_iter(NULL, memcg, NULL); -+ if (!memcg) -+ return NULL; -+ -+ nid = first_memory_node; -+ } -+ -+ return get_lruvec(nid, memcg); -+} -+ -+static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, -+ unsigned long max_seq, unsigned long *min_seq, -+ unsigned long seq) -+{ -+ int i; -+ int type, tier; -+ int hist = lru_hist_from_seq(seq); -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ for (tier = 0; tier < MAX_NR_TIERS; tier++) { -+ seq_printf(m, " %10d", tier); -+ for (type = 0; type < ANON_AND_FILE; type++) { -+ unsigned long n[3] = {}; -+ -+ if (seq == max_seq) { -+ n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]); -+ n[1] = READ_ONCE(lrugen->avg_total[type][tier]); -+ -+ seq_printf(m, " %10luR %10luT %10lu ", n[0], n[1], n[2]); -+ } else if (seq == min_seq[type] || NR_HIST_GENS > 1) { -+ n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]); -+ n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]); -+ if (tier) -+ n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]); -+ -+ seq_printf(m, " %10lur %10lue %10lup", n[0], n[1], n[2]); -+ } else -+ seq_puts(m, " 0 0 0 "); -+ } -+ seq_putc(m, '\n'); -+ } -+ -+ seq_puts(m, " "); -+ for (i = 0; i < NR_MM_STATS; i++) { -+ if (seq == max_seq && NR_HIST_GENS == 1) -+ seq_printf(m, " %10lu%c", READ_ONCE(lruvec->mm_walk.stats[hist][i]), -+ toupper(MM_STAT_CODES[i])); -+ else if (seq != max_seq && NR_HIST_GENS > 1) -+ seq_printf(m, " %10lu%c", READ_ONCE(lruvec->mm_walk.stats[hist][i]), -+ MM_STAT_CODES[i]); -+ else -+ seq_puts(m, " 0 "); -+ } -+ seq_putc(m, '\n'); -+} -+ -+static int lru_gen_seq_show(struct seq_file *m, void *v) -+{ -+ unsigned long seq; -+ bool full = !debugfs_real_fops(m->file)->write; -+ struct lruvec *lruvec = v; -+ struct lrugen *lrugen = &lruvec->evictable; -+ int nid = lruvec_pgdat(lruvec)->node_id; -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ DEFINE_MAX_SEQ(lruvec); -+ DEFINE_MIN_SEQ(lruvec); -+ -+ if (nid == first_memory_node) { -+ const char *path = memcg ? m->private : ""; -+ -+#ifdef CONFIG_MEMCG -+ if (memcg) -+ cgroup_path(memcg->css.cgroup, m->private, PATH_MAX); -+#endif -+ seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path); -+ } -+ -+ seq_printf(m, " node %5d\n", nid); -+ -+ if (!full) -+ seq = min_seq[0]; -+ else if (max_seq >= MAX_NR_GENS) -+ seq = max_seq - MAX_NR_GENS + 1; -+ else -+ seq = 0; -+ -+ for (; seq <= max_seq; seq++) { -+ int gen, type, zone; -+ unsigned int msecs; -+ -+ gen = lru_gen_from_seq(seq); -+ msecs = jiffies_to_msecs(jiffies - READ_ONCE(lrugen->timestamps[gen])); -+ -+ seq_printf(m, " %10lu %10u", seq, msecs); -+ -+ for (type = 0; type < ANON_AND_FILE; type++) { -+ long size = 0; -+ -+ if (seq < min_seq[type]) { -+ seq_puts(m, " -0 "); -+ continue; -+ } -+ -+ for (zone = 0; zone < MAX_NR_ZONES; zone++) -+ size += READ_ONCE(lrugen->sizes[gen][type][zone]); -+ -+ seq_printf(m, " %10lu ", max(size, 0L)); -+ } -+ -+ seq_putc(m, '\n'); -+ -+ if (full) -+ lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq); -+ } -+ -+ return 0; -+} -+ -+static const struct seq_operations lru_gen_seq_ops = { -+ .start = lru_gen_seq_start, -+ .stop = lru_gen_seq_stop, -+ .next = lru_gen_seq_next, -+ .show = lru_gen_seq_show, -+}; -+ -+static int run_aging(struct lruvec *lruvec, struct scan_control *sc, int swappiness, -+ unsigned long seq, bool use_filter) -+{ -+ DEFINE_MAX_SEQ(lruvec); -+ -+ if (seq == max_seq) -+ try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, use_filter); -+ -+ return seq > max_seq ? -EINVAL : 0; -+} -+ -+static int run_eviction(struct lruvec *lruvec, struct scan_control *sc, int swappiness, -+ unsigned long seq, unsigned long nr_to_reclaim) -+{ -+ struct blk_plug plug; -+ int err = -EINTR; -+ DEFINE_MAX_SEQ(lruvec); -+ -+ if (seq >= max_seq - 1) -+ return -EINVAL; -+ -+ sc->nr_reclaimed = 0; -+ -+ blk_start_plug(&plug); -+ -+ while (!signal_pending(current)) { -+ DEFINE_MIN_SEQ(lruvec); -+ -+ if (seq < min_seq[!swappiness] || sc->nr_reclaimed >= nr_to_reclaim || -+ !evict_pages(lruvec, sc, swappiness)) { -+ err = 0; -+ break; -+ } -+ -+ cond_resched(); -+ } -+ -+ blk_finish_plug(&plug); -+ -+ return err; -+} -+ -+static int run_cmd(char cmd, int memcg_id, int nid, struct scan_control *sc, -+ int swappiness, unsigned long seq, unsigned long opt) -+{ -+ struct lruvec *lruvec; -+ int err = -EINVAL; -+ struct mem_cgroup *memcg = NULL; -+ -+ if (!mem_cgroup_disabled()) { -+ rcu_read_lock(); -+ memcg = mem_cgroup_from_id(memcg_id); -+#ifdef CONFIG_MEMCG -+ if (memcg && !css_tryget(&memcg->css)) -+ memcg = NULL; -+#endif -+ rcu_read_unlock(); -+ -+ if (!memcg) -+ goto done; -+ } -+ if (memcg_id != mem_cgroup_id(memcg)) -+ goto done; -+ -+ if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY)) -+ goto done; -+ -+ lruvec = get_lruvec(nid, memcg); -+ -+ if (swappiness < 0) -+ swappiness = get_swappiness(memcg); -+ else if (swappiness > 200) -+ goto done; -+ -+ switch (cmd) { -+ case '+': -+ err = run_aging(lruvec, sc, swappiness, seq, opt); -+ break; -+ case '-': -+ err = run_eviction(lruvec, sc, swappiness, seq, opt); -+ break; -+ } -+done: -+ mem_cgroup_put(memcg); -+ -+ return err; -+} -+ -+static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, -+ size_t len, loff_t *pos) -+{ -+ void *buf; -+ char *cur, *next; -+ unsigned int flags; -+ int err = 0; -+ struct scan_control sc = { -+ .may_writepage = 1, -+ .may_unmap = 1, -+ .may_swap = 1, -+ .reclaim_idx = MAX_NR_ZONES - 1, -+ .gfp_mask = GFP_KERNEL, -+ }; -+ -+ buf = kvmalloc(len + 1, GFP_KERNEL); -+ if (!buf) -+ return -ENOMEM; -+ -+ if (copy_from_user(buf, src, len)) { -+ kvfree(buf); -+ return -EFAULT; -+ } -+ -+ next = buf; -+ next[len] = '\0'; -+ -+ sc.reclaim_state.mm_walk_args = alloc_mm_walk_args(); -+ if (!sc.reclaim_state.mm_walk_args) { -+ kvfree(buf); -+ return -ENOMEM; -+ } -+ -+ flags = memalloc_noreclaim_save(); -+ set_task_reclaim_state(current, &sc.reclaim_state); -+ -+ while ((cur = strsep(&next, ",;\n"))) { -+ int n; -+ int end; -+ char cmd; -+ unsigned int memcg_id; -+ unsigned int nid; -+ unsigned long seq; -+ unsigned int swappiness = -1; -+ unsigned long opt = -1; -+ -+ cur = skip_spaces(cur); -+ if (!*cur) -+ continue; -+ -+ n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid, -+ &seq, &end, &swappiness, &end, &opt, &end); -+ if (n < 4 || cur[end]) { -+ err = -EINVAL; -+ break; -+ } -+ -+ err = run_cmd(cmd, memcg_id, nid, &sc, swappiness, seq, opt); -+ if (err) -+ break; -+ } -+ -+ set_task_reclaim_state(current, NULL); -+ memalloc_noreclaim_restore(flags); -+ -+ free_mm_walk_args(sc.reclaim_state.mm_walk_args); -+ kvfree(buf); -+ -+ return err ? : len; -+} -+ -+static int lru_gen_seq_open(struct inode *inode, struct file *file) -+{ -+ return seq_open(file, &lru_gen_seq_ops); -+} -+ -+static const struct file_operations lru_gen_rw_fops = { -+ .open = lru_gen_seq_open, -+ .read = seq_read, -+ .write = lru_gen_seq_write, -+ .llseek = seq_lseek, -+ .release = seq_release, -+}; -+ -+static const struct file_operations lru_gen_ro_fops = { -+ .open = lru_gen_seq_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = seq_release, -+}; -+ -+/****************************************************************************** -+ * initialization -+ ******************************************************************************/ -+ -+void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec) -+{ -+ int i; -+ int gen, type, zone; -+ struct lrugen *lrugen = &lruvec->evictable; -+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg); -+ -+ lrugen->max_seq = MIN_NR_GENS + 1; -+ lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles; -+ lrugen->enabled[1] = lru_gen_enabled(); -+ -+ for (i = 0; i <= MIN_NR_GENS + 1; i++) -+ lrugen->timestamps[i] = jiffies; -+ -+ for_each_gen_type_zone(gen, type, zone) -+ INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); -+ -+ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && !memcg) -+ spin_lock(&mm_list->lock); -+ -+ lruvec->mm_walk.seq = MIN_NR_GENS; -+ lruvec->mm_walk.head = &mm_list->fifo; -+ lruvec->mm_walk.tail = &mm_list->fifo; -+ init_waitqueue_head(&lruvec->mm_walk.wait); -+ -+ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && !memcg) -+ spin_unlock(&mm_list->lock); -+} -+ -+#ifdef CONFIG_MEMCG -+void lru_gen_init_memcg(struct mem_cgroup *memcg) -+{ -+ int nid; -+ -+ INIT_LIST_HEAD(&memcg->mm_list.fifo); -+ spin_lock_init(&memcg->mm_list.lock); -+ -+ for_each_node(nid) { -+ struct lruvec *lruvec = get_lruvec(nid, memcg); -+ -+ lru_gen_init_state(memcg, lruvec); -+ } -+} -+ -+void lru_gen_free_memcg(struct mem_cgroup *memcg) -+{ -+ int nid; -+ -+ for_each_node(nid) { -+ int i; -+ struct lruvec *lruvec = get_lruvec(nid, memcg); -+ -+ for (i = 0; i < NR_BLOOM_FILTERS; i++) { -+ bitmap_free(lruvec->mm_walk.filters[i]); -+ lruvec->mm_walk.filters[i] = NULL; -+ } -+ } -+} -+#endif -+ -+static int __init init_lru_gen(void) -+{ -+ BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); -+ BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); -+ BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1); -+ -+ if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) -+ pr_err("lru_gen: failed to create sysfs group\n"); -+ -+ debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops); -+ debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops); -+ -+ return 0; -+}; -+late_initcall(init_lru_gen); -+ -+#else -+ -+static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) -+{ -+} -+ -+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) -+{ -+} -+ -+#endif /* CONFIG_LRU_GEN */ -+ - static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) - { - unsigned long nr[NR_LRU_LISTS]; -@@ -2794,6 +5387,11 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) - struct blk_plug plug; - bool scan_adjusted; - -+ if (lru_gen_enabled()) { -+ lru_gen_shrink_lruvec(lruvec, sc); -+ return; -+ } -+ - get_scan_count(lruvec, sc, nr); - - /* Record the original scan target for proportional adjustments later */ -@@ -3032,7 +5630,6 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) - unsigned long nr_reclaimed, nr_scanned; - struct lruvec *target_lruvec; - bool reclaimable = false; -- unsigned long file; - - target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); - -@@ -3048,93 +5645,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) - nr_reclaimed = sc->nr_reclaimed; - nr_scanned = sc->nr_scanned; - -- /* -- * Determine the scan balance between anon and file LRUs. -- */ -- spin_lock_irq(&target_lruvec->lru_lock); -- sc->anon_cost = target_lruvec->anon_cost; -- sc->file_cost = target_lruvec->file_cost; -- spin_unlock_irq(&target_lruvec->lru_lock); -- -- /* -- * Target desirable inactive:active list ratios for the anon -- * and file LRU lists. -- */ -- if (!sc->force_deactivate) { -- unsigned long refaults; -- -- refaults = lruvec_page_state(target_lruvec, -- WORKINGSET_ACTIVATE_ANON); -- if (refaults != target_lruvec->refaults[0] || -- inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) -- sc->may_deactivate |= DEACTIVATE_ANON; -- else -- sc->may_deactivate &= ~DEACTIVATE_ANON; -- -- /* -- * When refaults are being observed, it means a new -- * workingset is being established. Deactivate to get -- * rid of any stale active pages quickly. -- */ -- refaults = lruvec_page_state(target_lruvec, -- WORKINGSET_ACTIVATE_FILE); -- if (refaults != target_lruvec->refaults[1] || -- inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) -- sc->may_deactivate |= DEACTIVATE_FILE; -- else -- sc->may_deactivate &= ~DEACTIVATE_FILE; -- } else -- sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; -- -- /* -- * If we have plenty of inactive file pages that aren't -- * thrashing, try to reclaim those first before touching -- * anonymous pages. -- */ -- file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); -- if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) -- sc->cache_trim_mode = 1; -- else -- sc->cache_trim_mode = 0; -- -- /* -- * Prevent the reclaimer from falling into the cache trap: as -- * cache pages start out inactive, every cache fault will tip -- * the scan balance towards the file LRU. And as the file LRU -- * shrinks, so does the window for rotation from references. -- * This means we have a runaway feedback loop where a tiny -- * thrashing file LRU becomes infinitely more attractive than -- * anon pages. Try to detect this based on file LRU size. -- */ -- if (!cgroup_reclaim(sc)) { -- unsigned long total_high_wmark = 0; -- unsigned long free, anon; -- int z; -- -- free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); -- file = node_page_state(pgdat, NR_ACTIVE_FILE) + -- node_page_state(pgdat, NR_INACTIVE_FILE); -- -- for (z = 0; z < MAX_NR_ZONES; z++) { -- struct zone *zone = &pgdat->node_zones[z]; -- if (!managed_zone(zone)) -- continue; -- -- total_high_wmark += high_wmark_pages(zone); -- } -- -- /* -- * Consider anon: if that's low too, this isn't a -- * runaway file reclaim problem, but rather just -- * extreme pressure. Reclaim as per usual then. -- */ -- anon = node_page_state(pgdat, NR_INACTIVE_ANON); -- -- sc->file_is_tiny = -- file + free <= total_high_wmark && -- !(sc->may_deactivate & DEACTIVATE_ANON) && -- anon >> sc->priority; -- } -+ prepare_scan_count(pgdat, sc); - - shrink_node_memcgs(pgdat, sc); - -@@ -3354,6 +5865,9 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) - struct lruvec *target_lruvec; - unsigned long refaults; - -+ if (lru_gen_enabled()) -+ return; -+ - target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); - refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); - target_lruvec->refaults[0] = refaults; -@@ -3724,6 +6238,11 @@ static void age_active_anon(struct pglist_data *pgdat, - struct mem_cgroup *memcg; - struct lruvec *lruvec; - -+ if (lru_gen_enabled()) { -+ lru_gen_age_node(pgdat, sc); -+ return; -+ } -+ - if (!can_age_anon_pages(pgdat, sc)) - return; - -diff --git a/mm/workingset.c b/mm/workingset.c -index d5b81e4f4cbe..27d504a5d998 100644 ---- a/mm/workingset.c -+++ b/mm/workingset.c -@@ -187,7 +187,6 @@ static unsigned int bucket_order __read_mostly; - static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, - bool workingset) - { -- eviction >>= bucket_order; - eviction &= EVICTION_MASK; - eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; - eviction = (eviction << NODES_SHIFT) | pgdat->node_id; -@@ -212,10 +211,117 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, - - *memcgidp = memcgid; - *pgdat = NODE_DATA(nid); -- *evictionp = entry << bucket_order; -+ *evictionp = entry; - *workingsetp = workingset; - } - -+#ifdef CONFIG_LRU_GEN -+ -+static int page_lru_refs(struct page *page) -+{ -+ unsigned long flags = READ_ONCE(page->flags); -+ -+ BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT); -+ -+ /* see the comment on MAX_NR_TIERS */ -+ return flags & BIT(PG_workingset) ? (flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF : 0; -+} -+ -+/* Return a token to be stored in the shadow entry of a page being evicted. */ -+static void *lru_gen_eviction(struct page *page) -+{ -+ int hist, tier; -+ unsigned long token; -+ unsigned long min_seq; -+ struct lruvec *lruvec; -+ struct lrugen *lrugen; -+ int type = page_is_file_lru(page); -+ int refs = page_lru_refs(page); -+ int delta = thp_nr_pages(page); -+ bool workingset = PageWorkingset(page); -+ struct mem_cgroup *memcg = page_memcg(page); -+ struct pglist_data *pgdat = page_pgdat(page); -+ -+ lruvec = mem_cgroup_lruvec(memcg, pgdat); -+ lrugen = &lruvec->evictable; -+ min_seq = READ_ONCE(lrugen->min_seq[type]); -+ token = (min_seq << LRU_REFS_WIDTH) | refs; -+ -+ hist = lru_hist_from_seq(min_seq); -+ tier = lru_tier_from_refs(refs + workingset); -+ atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); -+ -+ return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset); -+} -+ -+/* Count a refaulted page based on the token stored in its shadow entry. */ -+static void lru_gen_refault(struct page *page, void *shadow) -+{ -+ int hist, tier, refs; -+ int memcg_id; -+ bool workingset; -+ unsigned long token; -+ unsigned long min_seq; -+ struct lruvec *lruvec; -+ struct lrugen *lrugen; -+ struct mem_cgroup *memcg; -+ struct pglist_data *pgdat; -+ int type = page_is_file_lru(page); -+ int delta = thp_nr_pages(page); -+ -+ unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset); -+ if (page_pgdat(page) != pgdat) -+ return; -+ -+ rcu_read_lock(); -+ memcg = page_memcg_rcu(page); -+ if (mem_cgroup_id(memcg) != memcg_id) -+ goto unlock; -+ -+ refs = token & (BIT(LRU_REFS_WIDTH) - 1); -+ if (refs && !workingset) -+ goto unlock; -+ -+ token >>= LRU_REFS_WIDTH; -+ lruvec = mem_cgroup_lruvec(memcg, pgdat); -+ lrugen = &lruvec->evictable; -+ min_seq = READ_ONCE(lrugen->min_seq[type]); -+ if (token != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH))) -+ goto unlock; -+ -+ hist = lru_hist_from_seq(min_seq); -+ tier = lru_tier_from_refs(refs + workingset); -+ atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]); -+ mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta); -+ -+ /* -+ * Tiers don't offer any protection to pages accessed via page tables. -+ * That's what generations do. Tiers can't fully protect pages after -+ * their numbers of accesses has exceeded the max value. Conservatively -+ * count these two conditions as stalls even though they might not -+ * indicate any real memory pressure. -+ */ -+ if (task_in_nonseq_fault() || refs + workingset == BIT(LRU_REFS_WIDTH)) { -+ SetPageWorkingset(page); -+ mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); -+ } -+unlock: -+ rcu_read_unlock(); -+} -+ -+#else -+ -+static void *lru_gen_eviction(struct page *page) -+{ -+ return NULL; -+} -+ -+static void lru_gen_refault(struct page *page, void *shadow) -+{ -+} -+ -+#endif /* CONFIG_LRU_GEN */ -+ - /** - * workingset_age_nonresident - age non-resident entries as LRU ages - * @lruvec: the lruvec that was aged -@@ -264,10 +370,14 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) - VM_BUG_ON_PAGE(page_count(page), page); - VM_BUG_ON_PAGE(!PageLocked(page), page); - -+ if (lru_gen_enabled()) -+ return lru_gen_eviction(page); -+ - lruvec = mem_cgroup_lruvec(target_memcg, pgdat); - /* XXX: target_memcg can be NULL, go through lruvec */ - memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); - eviction = atomic_long_read(&lruvec->nonresident_age); -+ eviction >>= bucket_order; - workingset_age_nonresident(lruvec, thp_nr_pages(page)); - return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); - } -@@ -296,7 +406,13 @@ void workingset_refault(struct page *page, void *shadow) - bool workingset; - int memcgid; - -+ if (lru_gen_enabled()) { -+ lru_gen_refault(page, shadow); -+ return; -+ } -+ - unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); -+ eviction <<= bucket_order; - - rcu_read_lock(); - /* --- -2.33.1 - |