path: root/sys-kernel_arch-sources-g14_files-0005-lru-multi-generational.patch
author     dragonn  2021-07-20 20:33:28 +0200
committer  dragonn  2021-07-20 20:33:28 +0200
commit     b59bf3acd35fd937c8cc162243b70724e643294f (patch)
tree       c42e496dd99a5a83a7279086f4203fe3d9c5bc42 /sys-kernel_arch-sources-g14_files-0005-lru-multi-generational.patch
parent     3e86e006ef476d294fe1d93ec691a65dd144502c (diff)
download   aur-b59bf3acd35fd937c8cc162243b70724e643294f.tar.gz
5.13.4
Diffstat (limited to 'sys-kernel_arch-sources-g14_files-0005-lru-multi-generational.patch')
-rw-r--r--  sys-kernel_arch-sources-g14_files-0005-lru-multi-generational.patch  3458
1 files changed, 1630 insertions, 1828 deletions
diff --git a/sys-kernel_arch-sources-g14_files-0005-lru-multi-generational.patch b/sys-kernel_arch-sources-g14_files-0005-lru-multi-generational.patch
index 16b30ccf94ad..b85a0f064684 100644
--- a/sys-kernel_arch-sources-g14_files-0005-lru-multi-generational.patch
+++ b/sys-kernel_arch-sources-g14_files-0005-lru-multi-generational.patch
@@ -1,57 +1,165 @@
-diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
-index ac398e143c9a..89fe4e3592f9 100644
---- a/include/linux/nodemask.h
-+++ b/include/linux/nodemask.h
-@@ -486,6 +486,7 @@ static inline int num_node_state(enum node_states state)
- #define first_online_node 0
- #define first_memory_node 0
- #define next_online_node(nid) (MAX_NUMNODES)
-+#define next_memory_node(nid) (MAX_NUMNODES)
- #define nr_node_ids 1U
- #define nr_online_nodes 1U
-
-diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
-index 4f2f79de083e..bd5744360cfa 100644
---- a/include/linux/cgroup.h
-+++ b/include/linux/cgroup.h
-@@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgroup *cgrp)
- css_put(&cgrp->self);
- }
+diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst
+index eff5fbd492d0..c353b3f55924 100644
+--- a/Documentation/vm/index.rst
++++ b/Documentation/vm/index.rst
+@@ -17,6 +17,7 @@ various features of the Linux memory management
-+extern struct mutex cgroup_mutex;
+ swap_numa
+ zswap
++ multigen_lru
+
+ Kernel developers MM documentation
+ ==================================
+diff --git a/Documentation/vm/multigen_lru.rst b/Documentation/vm/multigen_lru.rst
+new file mode 100644
+index 000000000000..a18416ed7e92
+--- /dev/null
++++ b/Documentation/vm/multigen_lru.rst
+@@ -0,0 +1,143 @@
++.. SPDX-License-Identifier: GPL-2.0
+
-+static inline void cgroup_lock(void)
-+{
-+ mutex_lock(&cgroup_mutex);
-+}
++=====================
++Multigenerational LRU
++=====================
+
-+static inline void cgroup_unlock(void)
-+{
-+ mutex_unlock(&cgroup_mutex);
-+}
++Quick Start
++===========
++Build Options
++-------------
++:Required: Set ``CONFIG_LRU_GEN=y``.
+
- /**
- * task_css_set_check - obtain a task's css_set with extra access conditions
- * @task: the task to obtain css_set for
-@@ -446,7 +458,6 @@ static inline void cgroup_put(struct cgroup *cgrp)
- * as locks used during the cgroup_subsys::attach() methods.
- */
- #ifdef CONFIG_PROVE_RCU
--extern struct mutex cgroup_mutex;
- extern spinlock_t css_set_lock;
- #define task_css_set_check(task, __c) \
- rcu_dereference_check((task)->cgroups, \
-@@ -704,6 +715,8 @@ struct cgroup;
- static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
- static inline void css_get(struct cgroup_subsys_state *css) {}
- static inline void css_put(struct cgroup_subsys_state *css) {}
-+static inline void cgroup_lock(void) {}
-+static inline void cgroup_unlock(void) {}
- static inline int cgroup_attach_task_all(struct task_struct *from,
- struct task_struct *t) { return 0; }
- static inline int cgroupstats_build(struct cgroupstats *stats,
-
- diff --git a/arch/Kconfig b/arch/Kconfig
++:Optional: Set ``CONFIG_LRU_GEN_ENABLED=y`` to turn the feature on by
++ default.
++
++:Optional: Change ``CONFIG_NR_LRU_GENS`` to a number ``X`` to support
++ a maximum of ``X`` generations.
++
++:Optional: Change ``CONFIG_TIERS_PER_GEN`` to a number ``Y`` to
++ support a maximum of ``Y`` tiers per generation.
++
++Runtime Options
++---------------
++:Required: Write ``1`` to ``/sys/kernel/mm/lru_gen/enable`` if the
++ feature was not turned on by default.
++
++:Optional: Change ``/sys/kernel/mm/lru_gen/spread`` to a number ``N``
++ to spread pages out across ``N+1`` generations. ``N`` should be less
++ than ``X``. Larger values make the background aging more aggressive.
++
++:Optional: Read ``/sys/kernel/debug/lru_gen`` to verify the feature.
++ This file has the following output:
++
++::
++
++ memcg memcg_id memcg_path
++ node node_id
++ min_gen birth_time anon_size file_size
++ ...
++ max_gen birth_time anon_size file_size
++
++Given a memcg and a node, ``min_gen`` is the oldest generation
++(number) and ``max_gen`` is the youngest. Birth time is in
++milliseconds. The sizes of anon and file types are in pages.
++
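For example, a minimal user-space reader for this file could look like the
sketch below; it assumes debugfs is mounted at ``/sys/kernel/debug`` and only
distinguishes the per-generation rows from the ``memcg``/``node`` header lines
(the exact spacing of the fields is not guaranteed by the interface)::

  /* hedged sketch: dump /sys/kernel/debug/lru_gen, annotating generation rows */
  #include <stdio.h>
  #include <stdlib.h>

  int main(void)
  {
      char line[256];
      FILE *f = fopen("/sys/kernel/debug/lru_gen", "r");

      if (!f) {
          perror("fopen");
          return EXIT_FAILURE;
      }

      while (fgets(line, sizeof(line), f)) {
          unsigned long gen, birth, anon, file;

          /* generation rows: "gen birth_time anon_size file_size" */
          if (sscanf(line, " %lu %lu %lu %lu", &gen, &birth, &anon, &file) == 4)
              printf("gen %lu: birth %lu ms, anon %lu pages, file %lu pages\n",
                     gen, birth, anon, file);
          else
              fputs(line, stdout); /* "memcg ..." and "node ..." header lines */
      }

      fclose(f);
      return 0;
  }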
++Recipes
++-------
++:Android on ARMv8.1+: ``X=4``, ``Y=3`` and ``N=0``.
++
++:Android on pre-ARMv8.1 CPUs: Not recommended due to the lack of
++ ``ARM64_HW_AFDBM``.
++
++:Laptops and workstations running Chrome on x86_64: Use the default
++ values.
++
++:Working set estimation: Write ``+ memcg_id node_id gen [swappiness]``
++ to ``/sys/kernel/debug/lru_gen`` to account referenced pages to
++ generation ``max_gen`` and create the next generation ``max_gen+1``.
++ ``gen`` should be equal to ``max_gen``. A swap file and a non-zero
++ ``swappiness`` are required to scan anon type. If swapping is not
++ desired, set ``vm.swappiness`` to ``0``.
++
++:Proactive reclaim: Write ``- memcg_id node_id gen [swappiness]
++ [nr_to_reclaim]`` to ``/sys/kernel/debug/lru_gen`` to evict
++ generations less than or equal to ``gen``. ``gen`` should be less
++ than ``max_gen-1`` as ``max_gen`` and ``max_gen-1`` are active
++ generations and therefore protected from the eviction. Use
++ ``nr_to_reclaim`` to limit the number of pages to evict. Multiple
++ command lines are supported, as is concatenation with the delimiters
++ ``,`` and ``;``.
++
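Both recipes reduce to writing command strings into the same debugfs file. A
minimal sketch, assuming a memcg ID of 1, node 0, a ``max_gen`` of 4 and
made-up reclaim limits (real values would first be read from the file as shown
above)::

  /* hedged sketch: issue one aging ('+') and one eviction ('-') command */
  #include <stdio.h>
  #include <string.h>
  #include <fcntl.h>
  #include <unistd.h>

  static int lru_gen_cmd(const char *cmd)
  {
      int fd = open("/sys/kernel/debug/lru_gen", O_WRONLY);
      ssize_t ret;

      if (fd < 0)
          return -1;
      ret = write(fd, cmd, strlen(cmd));
      close(fd);
      return ret < 0 ? -1 : 0;
  }

  int main(void)
  {
      /* working set estimation: age generation 4 on memcg 1, node 0,
         with swappiness 0 so that only file pages are scanned */
      if (lru_gen_cmd("+ 1 0 4 0\n"))
          perror("aging");

      /* proactive reclaim: evict generations <= 2, at most 4096 pages */
      if (lru_gen_cmd("- 1 0 2 0 4096\n"))
          perror("eviction");

      return 0;
  }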
++Framework
++=========
++For each ``lruvec``, evictable pages are divided into multiple
++generations. The youngest generation number is stored in ``max_seq``
++for both anon and file types as they are aged on an equal footing. The
++oldest generation numbers are stored in ``min_seq[2]`` separately for
++anon and file types as clean file pages can be evicted regardless of
++swap and write-back constraints. These three variables are
++monotonically increasing. Generation numbers are truncated into
++``order_base_2(CONFIG_NR_LRU_GENS+1)`` bits in order to fit into
++``page->flags``. The sliding window technique is used to prevent
++truncated generation numbers from overlapping. Each truncated
++generation number is an index to an array of per-type and per-zone
++lists. Evictable pages are added to the per-zone lists indexed by
++``max_seq`` or ``min_seq[2]`` (modulo ``CONFIG_NR_LRU_GENS``),
++depending on their types.
++
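As a worked example of the indexing described above, the conversion from a
monotonically increasing sequence number to a list index is just a modulo; the
sketch below mirrors the ``lru_gen_from_seq()`` helper added later in this
patch and assumes the default of 7 generations::

  /* hedged sketch: sliding-window indexing of generation numbers */
  #include <stdio.h>

  #define MAX_NR_GENS 7 /* CONFIG_NR_LRU_GENS default */

  static int lru_gen_from_seq(unsigned long seq)
  {
      return seq % MAX_NR_GENS;
  }

  int main(void)
  {
      unsigned long min_seq = 99, max_seq = 103; /* arbitrary example values */
      unsigned long seq;

      for (seq = min_seq; seq <= max_seq; seq++)
          printf("seq %lu -> list index %d\n", seq, lru_gen_from_seq(seq));
      return 0;
  }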
++Each generation is then divided into multiple tiers. Tiers represent
++levels of usage from file descriptors only. Pages accessed N times via
++file descriptors belong to tier order_base_2(N). Each generation
++contains at most CONFIG_TIERS_PER_GEN tiers, and they require
++additional CONFIG_TIERS_PER_GEN-2 bits in page->flags. In contrast to
++moving across generations which requires the lru lock for the list
++operations, moving across tiers only involves an atomic operation on
++``page->flags`` and therefore has a negligible cost. A feedback loop
++modeled after the PID controller monitors the refault rates across all
++tiers and decides when to activate pages from which tiers in the
++reclaim path.
++
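The usage-to-tier mapping can be illustrated in a few lines of C; this mirrors
the ``lru_tier_from_usage()`` helper that appears further down in this patch
and assumes the default of 4 tiers (the stored counter saturates, see
``page_inc_usage()`` below, so the tier never exceeds
``CONFIG_TIERS_PER_GEN - 1``)::

  /* hedged sketch: tier = order_base_2(usage + 1), so usage 0 -> tier 0,
     1 -> tier 1, 2-3 -> tier 2, 4-7 -> tier 3 with the default of 4 tiers */
  #include <stdio.h>

  static int order_base_2(unsigned int n) /* ceil(log2(n)), 0 for n <= 1 */
  {
      int order = 0;

      while ((1U << order) < n)
          order++;
      return order;
  }

  static int lru_tier_from_usage(int usage)
  {
      return order_base_2(usage + 1);
  }

  int main(void)
  {
      int usage;

      for (usage = 0; usage <= 7; usage++)
          printf("usage %d -> tier %d\n", usage, lru_tier_from_usage(usage));
      return 0;
  }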
++The framework comprises two conceptually independent components: the
++aging and the eviction, which can be invoked separately from user
++space for the purpose of working set estimation and proactive reclaim.
++
++Aging
++-----
++The aging produces young generations. Given an ``lruvec``, the aging
++scans page tables for referenced pages of this ``lruvec``. Upon
++finding one, the aging updates its generation number to ``max_seq``.
++After each round of scan, the aging increments ``max_seq``.
++
++The aging maintains either a system-wide ``mm_struct`` list or
++per-memcg ``mm_struct`` lists, and it only scans page tables of
++processes that have been scheduled since the last scan.
++
++The aging is due when both of ``min_seq[2]`` reach ``max_seq-1``,
++assuming both anon and file types are reclaimable.
++
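Expressed as a predicate, the condition above amounts to checking whether only
the minimum number of generations is left; a simplified sketch (the real code
in this patch works with ``MIN_NR_GENS == 2`` and per-type sequence counters)::

  /* hedged sketch: aging is due once both min_seq values reach max_seq - 1 */
  #include <stdbool.h>
  #include <stdio.h>

  #define MIN_NR_GENS 2

  static bool aging_is_due(unsigned long max_seq, const unsigned long min_seq[2])
  {
      unsigned long oldest = min_seq[0] < min_seq[1] ? min_seq[0] : min_seq[1];

      return max_seq - oldest + 1 <= MIN_NR_GENS;
  }

  int main(void)
  {
      unsigned long min_seq[2] = { 102, 102 }; /* example values */

      printf("%d\n", aging_is_due(103, min_seq)); /* 1: both reached max_seq - 1 */
      min_seq[0] = 100;
      printf("%d\n", aging_is_due(103, min_seq)); /* 0: older generations remain */
      return 0;
  }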
++Eviction
++--------
++The eviction consumes old generations. Given an ``lruvec``, the
++eviction scans the pages on the per-zone lists indexed by either of
++``min_seq[2]``. It first tries to select a type based on the values of
++``min_seq[2]``. When anon and file types are both available from the
++same generation, it selects the one that has a lower refault rate.
++
++During a scan, the eviction sorts pages according to their new
++generation numbers, if the aging has found them referenced. It also
++moves pages from the tiers that have higher refault rates than tier 0
++to the next generation.
++
++When it finds all the per-zone lists of a selected type are empty, the
++eviction increments ``min_seq[2]`` indexed by this selected type.
++
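The type selection described above can be sketched as a two-step comparison:
pick the type whose oldest generation is older, and break ties with the
refault rate. The refault-rate representation below is a placeholder; the
patch itself tracks refaults per tier through the PID-controller feedback
loop::

  /* hedged sketch: choose which type the eviction scans first */
  #include <stdio.h>

  enum { TYPE_ANON, TYPE_FILE };

  static int select_type(const unsigned long min_seq[2], const double refault_rate[2])
  {
      if (min_seq[TYPE_ANON] != min_seq[TYPE_FILE])
          return min_seq[TYPE_ANON] < min_seq[TYPE_FILE] ? TYPE_ANON : TYPE_FILE;

      return refault_rate[TYPE_ANON] <= refault_rate[TYPE_FILE] ? TYPE_ANON : TYPE_FILE;
  }

  int main(void)
  {
      unsigned long min_seq[2] = { 100, 100 };
      double refault_rate[2] = { 0.12, 0.03 }; /* made-up numbers */

      printf("scan type %d first\n", select_type(min_seq, refault_rate)); /* 1: file */
      return 0;
  }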
++To-do List
++==========
++KVM Optimization
++----------------
++Support shadow page table scanning.
++
++NUMA Optimization
++-----------------
++Optimize page table scan for NUMA.
+diff --git a/arch/Kconfig b/arch/Kconfig
index c45b770d3579..e3812adc69f7 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -119,362 +227,26 @@ index d27cf69e811d..b968d6bd28b6 100644
int pudp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pud_t *pudp)
{
-diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
-index 46b13780c2c8..94ecc1d277a2 100644
---- a/include/linux/pgtable.h
-+++ b/include/linux/pgtable.h
-@@ -193,7 +193,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
- #endif
-
- #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
--#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG)
- static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
- unsigned long address,
- pmd_t *pmdp)
-@@ -214,7 +214,7 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
- BUILD_BUG();
- return 0;
- }
--#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-+#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG */
- #endif
-
- #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
-
-diff --git a/mm/vmscan.c b/mm/vmscan.c
-index 5199b9696bab..2339459c97d4 100644
---- a/mm/vmscan.c
-+++ b/mm/vmscan.c
-@@ -2421,6 +2421,103 @@ enum scan_balance {
- SCAN_FILE,
- };
-
-+static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
-+{
-+ unsigned long file;
-+ struct lruvec *target_lruvec;
-+
-+ target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
-+
-+ /*
-+ * Determine the scan balance between anon and file LRUs.
-+ */
-+ spin_lock_irq(&target_lruvec->lru_lock);
-+ sc->anon_cost = target_lruvec->anon_cost;
-+ sc->file_cost = target_lruvec->file_cost;
-+ spin_unlock_irq(&target_lruvec->lru_lock);
-+
-+ /*
-+ * Target desirable inactive:active list ratios for the anon
-+ * and file LRU lists.
-+ */
-+ if (!sc->force_deactivate) {
-+ unsigned long refaults;
-+
-+ refaults = lruvec_page_state(target_lruvec,
-+ WORKINGSET_ACTIVATE_ANON);
-+ if (refaults != target_lruvec->refaults[0] ||
-+ inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
-+ sc->may_deactivate |= DEACTIVATE_ANON;
-+ else
-+ sc->may_deactivate &= ~DEACTIVATE_ANON;
-+
-+ /*
-+ * When refaults are being observed, it means a new
-+ * workingset is being established. Deactivate to get
-+ * rid of any stale active pages quickly.
-+ */
-+ refaults = lruvec_page_state(target_lruvec,
-+ WORKINGSET_ACTIVATE_FILE);
-+ if (refaults != target_lruvec->refaults[1] ||
-+ inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
-+ sc->may_deactivate |= DEACTIVATE_FILE;
-+ else
-+ sc->may_deactivate &= ~DEACTIVATE_FILE;
-+ } else
-+ sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
-+
-+ /*
-+ * If we have plenty of inactive file pages that aren't
-+ * thrashing, try to reclaim those first before touching
-+ * anonymous pages.
-+ */
-+ file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
-+ if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
-+ sc->cache_trim_mode = 1;
-+ else
-+ sc->cache_trim_mode = 0;
-+
-+ /*
-+ * Prevent the reclaimer from falling into the cache trap: as
-+ * cache pages start out inactive, every cache fault will tip
-+ * the scan balance towards the file LRU. And as the file LRU
-+ * shrinks, so does the window for rotation from references.
-+ * This means we have a runaway feedback loop where a tiny
-+ * thrashing file LRU becomes infinitely more attractive than
-+ * anon pages. Try to detect this based on file LRU size.
-+ */
-+ if (!cgroup_reclaim(sc)) {
-+ unsigned long total_high_wmark = 0;
-+ unsigned long free, anon;
-+ int z;
-+
-+ free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
-+ file = node_page_state(pgdat, NR_ACTIVE_FILE) +
-+ node_page_state(pgdat, NR_INACTIVE_FILE);
-+
-+ for (z = 0; z < MAX_NR_ZONES; z++) {
-+ struct zone *zone = &pgdat->node_zones[z];
-+
-+ if (!managed_zone(zone))
-+ continue;
-+
-+ total_high_wmark += high_wmark_pages(zone);
-+ }
-+
-+ /*
-+ * Consider anon: if that's low too, this isn't a
-+ * runaway file reclaim problem, but rather just
-+ * extreme pressure. Reclaim as per usual then.
-+ */
-+ anon = node_page_state(pgdat, NR_INACTIVE_ANON);
-+
-+ sc->file_is_tiny =
-+ file + free <= total_high_wmark &&
-+ !(sc->may_deactivate & DEACTIVATE_ANON) &&
-+ anon >> sc->priority;
-+ }
-+}
-+
- /*
- * Determine how aggressively the anon and file LRU lists should be
- * scanned. The relative value of each set of LRU lists is determined
-@@ -2866,7 +2963,6 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
- unsigned long nr_reclaimed, nr_scanned;
- struct lruvec *target_lruvec;
- bool reclaimable = false;
-- unsigned long file;
-
- target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
-
-@@ -2876,93 +2972,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
- nr_reclaimed = sc->nr_reclaimed;
- nr_scanned = sc->nr_scanned;
-
-- /*
-- * Determine the scan balance between anon and file LRUs.
-- */
-- spin_lock_irq(&target_lruvec->lru_lock);
-- sc->anon_cost = target_lruvec->anon_cost;
-- sc->file_cost = target_lruvec->file_cost;
-- spin_unlock_irq(&target_lruvec->lru_lock);
--
-- /*
-- * Target desirable inactive:active list ratios for the anon
-- * and file LRU lists.
-- */
-- if (!sc->force_deactivate) {
-- unsigned long refaults;
--
-- refaults = lruvec_page_state(target_lruvec,
-- WORKINGSET_ACTIVATE_ANON);
-- if (refaults != target_lruvec->refaults[0] ||
-- inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
-- sc->may_deactivate |= DEACTIVATE_ANON;
-- else
-- sc->may_deactivate &= ~DEACTIVATE_ANON;
--
-- /*
-- * When refaults are being observed, it means a new
-- * workingset is being established. Deactivate to get
-- * rid of any stale active pages quickly.
-- */
-- refaults = lruvec_page_state(target_lruvec,
-- WORKINGSET_ACTIVATE_FILE);
-- if (refaults != target_lruvec->refaults[1] ||
-- inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
-- sc->may_deactivate |= DEACTIVATE_FILE;
-- else
-- sc->may_deactivate &= ~DEACTIVATE_FILE;
-- } else
-- sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
--
-- /*
-- * If we have plenty of inactive file pages that aren't
-- * thrashing, try to reclaim those first before touching
-- * anonymous pages.
-- */
-- file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
-- if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
-- sc->cache_trim_mode = 1;
-- else
-- sc->cache_trim_mode = 0;
--
-- /*
-- * Prevent the reclaimer from falling into the cache trap: as
-- * cache pages start out inactive, every cache fault will tip
-- * the scan balance towards the file LRU. And as the file LRU
-- * shrinks, so does the window for rotation from references.
-- * This means we have a runaway feedback loop where a tiny
-- * thrashing file LRU becomes infinitely more attractive than
-- * anon pages. Try to detect this based on file LRU size.
-- */
-- if (!cgroup_reclaim(sc)) {
-- unsigned long total_high_wmark = 0;
-- unsigned long free, anon;
-- int z;
--
-- free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
-- file = node_page_state(pgdat, NR_ACTIVE_FILE) +
-- node_page_state(pgdat, NR_INACTIVE_FILE);
--
-- for (z = 0; z < MAX_NR_ZONES; z++) {
-- struct zone *zone = &pgdat->node_zones[z];
-- if (!managed_zone(zone))
-- continue;
--
-- total_high_wmark += high_wmark_pages(zone);
-- }
--
-- /*
-- * Consider anon: if that's low too, this isn't a
-- * runaway file reclaim problem, but rather just
-- * extreme pressure. Reclaim as per usual then.
-- */
-- anon = node_page_state(pgdat, NR_INACTIVE_ANON);
--
-- sc->file_is_tiny =
-- file + free <= total_high_wmark &&
-- !(sc->may_deactivate & DEACTIVATE_ANON) &&
-- anon >> sc->priority;
-- }
-+ prepare_scan_count(pgdat, sc);
-
- shrink_node_memcgs(pgdat, sc);
-
-diff --git a/mm/workingset.c b/mm/workingset.c
-index b7cdeca5a76d..edb8aed2587e 100644
---- a/mm/workingset.c
-+++ b/mm/workingset.c
-@@ -168,9 +168,9 @@
- * refault distance will immediately activate the refaulting page.
- */
-
--#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \
-- 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT)
--#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
-+#define EVICTION_SHIFT (BITS_PER_XA_VALUE - MEM_CGROUP_ID_SHIFT - NODES_SHIFT)
-+#define EVICTION_MASK (BIT(EVICTION_SHIFT) - 1)
-+#define WORKINGSET_WIDTH 1
-
- /*
- * Eviction timestamps need to be able to cover the full range of
-@@ -182,36 +182,23 @@
- */
- static unsigned int bucket_order __read_mostly;
-
--static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
-- bool workingset)
-+static void *pack_shadow(int memcg_id, struct pglist_data *pgdat, unsigned long val)
- {
-- eviction >>= bucket_order;
-- eviction &= EVICTION_MASK;
-- eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
-- eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
-- eviction = (eviction << 1) | workingset;
-+ val = (val << MEM_CGROUP_ID_SHIFT) | memcg_id;
-+ val = (val << NODES_SHIFT) | pgdat->node_id;
-
-- return xa_mk_value(eviction);
-+ return xa_mk_value(val);
- }
-
--static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
-- unsigned long *evictionp, bool *workingsetp)
-+static unsigned long unpack_shadow(void *shadow, int *memcg_id, struct pglist_data **pgdat)
- {
-- unsigned long entry = xa_to_value(shadow);
-- int memcgid, nid;
-- bool workingset;
-+ unsigned long val = xa_to_value(shadow);
-
-- workingset = entry & 1;
-- entry >>= 1;
-- nid = entry & ((1UL << NODES_SHIFT) - 1);
-- entry >>= NODES_SHIFT;
-- memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
-- entry >>= MEM_CGROUP_ID_SHIFT;
-+ *pgdat = NODE_DATA(val & (BIT(NODES_SHIFT) - 1));
-+ val >>= NODES_SHIFT;
-+ *memcg_id = val & (BIT(MEM_CGROUP_ID_SHIFT) - 1);
-
-- *memcgidp = memcgid;
-- *pgdat = NODE_DATA(nid);
-- *evictionp = entry << bucket_order;
-- *workingsetp = workingset;
-+ return val >> MEM_CGROUP_ID_SHIFT;
- }
-
- /**
-@@ -266,8 +253,10 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
- /* XXX: target_memcg can be NULL, go through lruvec */
- memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
- eviction = atomic_long_read(&lruvec->nonresident_age);
-+ eviction >>= bucket_order;
-+ eviction = (eviction << WORKINGSET_WIDTH) | PageWorkingset(page);
- workingset_age_nonresident(lruvec, thp_nr_pages(page));
-- return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
-+ return pack_shadow(memcgid, pgdat, eviction);
- }
-
- /**
-@@ -294,7 +283,7 @@ void workingset_refault(struct page *page, void *shadow)
- bool workingset;
- int memcgid;
-
-- unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
-+ eviction = unpack_shadow(shadow, &memcgid, &pgdat);
-
- rcu_read_lock();
- /*
-@@ -318,6 +307,8 @@ void workingset_refault(struct page *page, void *shadow)
- goto out;
- eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
- refault = atomic_long_read(&eviction_lruvec->nonresident_age);
-+ workingset = eviction & (BIT(WORKINGSET_WIDTH) - 1);
-+ eviction = (eviction >> WORKINGSET_WIDTH) << bucket_order;
-
- /*
- * Calculate the refault distance
-@@ -335,7 +326,7 @@ void workingset_refault(struct page *page, void *shadow)
- * longest time, so the occasional inappropriate activation
- * leading to pressure on the active list is not a problem.
- */
-- refault_distance = (refault - eviction) & EVICTION_MASK;
-+ refault_distance = (refault - eviction) & (EVICTION_MASK >> WORKINGSET_WIDTH);
-
- /*
- * The activation decision for this page is made at the level
-@@ -593,7 +584,7 @@ static int __init workingset_init(void)
- unsigned int max_order;
- int ret;
-
-- BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
-+ BUILD_BUG_ON(EVICTION_SHIFT < WORKINGSET_WIDTH);
+diff --git a/fs/exec.c b/fs/exec.c
+index 18594f11c31f..c691d4d7720c 100644
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -1008,6 +1008,7 @@ static int exec_mmap(struct mm_struct *mm)
+ active_mm = tsk->active_mm;
+ tsk->active_mm = mm;
+ tsk->mm = mm;
++ lru_gen_add_mm(mm);
/*
- * Calculate the eviction bucket size to cover the longest
- * actionable refault distance, which is currently half of
-@@ -601,7 +592,7 @@ static int __init workingset_init(void)
- * some more pages at runtime, so keep working with up to
- * double the initial memory by using totalram_pages as-is.
- */
-- timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
-+ timestamp_bits = EVICTION_SHIFT - WORKINGSET_WIDTH;
- max_order = fls_long(totalram_pages() - 1);
- if (max_order > timestamp_bits)
- bucket_order = max_order - timestamp_bits;
-
+ * This prevents preemption while active_mm is being loaded and
+ * it and mm are being updated, which could cause problems for
+@@ -1018,6 +1019,7 @@ static int exec_mmap(struct mm_struct *mm)
+ if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
+ local_irq_enable();
+ activate_mm(active_mm, mm);
++ lru_gen_switch_mm(active_mm, mm);
+ if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
+ local_irq_enable();
+ tsk->mm->vmacache_seqnum = 0;
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index a5ceccc5ef00..f784c118f00f 100644
--- a/fs/fuse/dev.c
@@ -489,8 +261,80 @@ index a5ceccc5ef00..f784c118f00f 100644
dump_page(page, "fuse: trying to steal weird page");
return 1;
}
+diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
+index 6bc9c76680b2..e52e44af6810 100644
+--- a/include/linux/cgroup.h
++++ b/include/linux/cgroup.h
+@@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgroup *cgrp)
+ css_put(&cgrp->self);
+ }
+
++extern struct mutex cgroup_mutex;
++
++static inline void cgroup_lock(void)
++{
++ mutex_lock(&cgroup_mutex);
++}
++
++static inline void cgroup_unlock(void)
++{
++ mutex_unlock(&cgroup_mutex);
++}
++
+ /**
+ * task_css_set_check - obtain a task's css_set with extra access conditions
+ * @task: the task to obtain css_set for
+@@ -446,7 +458,6 @@ static inline void cgroup_put(struct cgroup *cgrp)
+ * as locks used during the cgroup_subsys::attach() methods.
+ */
+ #ifdef CONFIG_PROVE_RCU
+-extern struct mutex cgroup_mutex;
+ extern spinlock_t css_set_lock;
+ #define task_css_set_check(task, __c) \
+ rcu_dereference_check((task)->cgroups, \
+@@ -704,6 +715,8 @@ struct cgroup;
+ static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
+ static inline void css_get(struct cgroup_subsys_state *css) {}
+ static inline void css_put(struct cgroup_subsys_state *css) {}
++static inline void cgroup_lock(void) {}
++static inline void cgroup_unlock(void) {}
+ static inline int cgroup_attach_task_all(struct task_struct *from,
+ struct task_struct *t) { return 0; }
+ static inline int cgroupstats_build(struct cgroupstats *stats,
+diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
+index c193be760709..60601a997433 100644
+--- a/include/linux/memcontrol.h
++++ b/include/linux/memcontrol.h
+@@ -230,6 +230,8 @@ struct obj_cgroup {
+ };
+ };
+
++struct lru_gen_mm_list;
++
+ /*
+ * The memory controller data structure. The memory controller controls both
+ * page cache and RSS per cgroup. We would eventually like to provide
+@@ -349,6 +351,10 @@ struct mem_cgroup {
+ struct deferred_split deferred_split_queue;
+ #endif
+
++#ifdef CONFIG_LRU_GEN
++ struct lru_gen_mm_list *mm_list;
++#endif
++
+ struct mem_cgroup_per_node *nodeinfo[0];
+ /* WARNING: nodeinfo must be the last member here */
+ };
+@@ -1131,7 +1137,6 @@ static inline struct mem_cgroup *page_memcg(struct page *page)
+
+ static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
+ {
+- WARN_ON_ONCE(!rcu_read_lock_held());
+ return NULL;
+ }
+
diff --git a/include/linux/mm.h b/include/linux/mm.h
-index c274f75efcf9..e0c19a02db9d 100644
+index 8ae31622deef..d335b1c13cc2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1089,6 +1089,8 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
@@ -503,10 +347,10 @@ index c274f75efcf9..e0c19a02db9d 100644
/*
* Define the bit shifts to access each section. For non-existent
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
-index 355ea1ee32bd..ae3e3826dd7f 100644
+index 355ea1ee32bd..f3b99f65a652 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
-@@ -79,11 +79,199 @@ static __always_inline enum lru_list page_lru(struct page *page)
+@@ -79,11 +79,239 @@ static __always_inline enum lru_list page_lru(struct page *page)
return lru;
}
@@ -534,6 +378,12 @@ index 355ea1ee32bd..ae3e3826dd7f 100644
+ return seq % MAX_NR_GENS;
+}
+
++/* Convert the level of usage to a tier. See the comment on MAX_NR_TIERS. */
++static inline int lru_tier_from_usage(int usage)
++{
++ return order_base_2(usage + 1);
++}
++
+/* Return a proper index regardless whether we keep a full history of stats. */
+static inline int hist_from_seq_or_gen(int seq_or_gen)
+{
@@ -676,6 +526,36 @@ index 355ea1ee32bd..ae3e3826dd7f 100644
+ return true;
+}
+
++/* Return the level of usage of a page. See the comment on MAX_NR_TIERS. */
++static inline int page_tier_usage(struct page *page)
++{
++ unsigned long flags = READ_ONCE(page->flags);
++
++ return flags & BIT(PG_workingset) ?
++ ((flags & LRU_USAGE_MASK) >> LRU_USAGE_PGOFF) + 1 : 0;
++}
++
++/* Increment the usage counter after a page is accessed via file descriptors. */
++static inline void page_inc_usage(struct page *page)
++{
++ unsigned long usage;
++ unsigned long old_flags, new_flags;
++
++ do {
++ old_flags = READ_ONCE(page->flags);
++
++ if (!(old_flags & BIT(PG_workingset))) {
++ new_flags = old_flags | BIT(PG_workingset);
++ continue;
++ }
++
++ usage = (old_flags & LRU_USAGE_MASK) + BIT(LRU_USAGE_PGOFF);
++
++ new_flags = (old_flags & ~LRU_USAGE_MASK) | min(usage, LRU_USAGE_MASK);
++ } while (new_flags != old_flags &&
++ cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
++}
++
+#else /* CONFIG_LRU_GEN */
+
+static inline bool lru_gen_enabled(void)
@@ -693,6 +573,10 @@ index 355ea1ee32bd..ae3e3826dd7f 100644
+ return false;
+}
+
++static inline void page_inc_usage(struct page *page)
++{
++}
++
+#endif /* CONFIG_LRU_GEN */
+
static __always_inline void add_page_to_lru_list(struct page *page,
@@ -706,7 +590,7 @@ index 355ea1ee32bd..ae3e3826dd7f 100644
update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
list_add(&page->lru, &lruvec->lists[lru]);
}
-@@ -93,6 +281,9 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page,
+@@ -93,6 +321,9 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page,
{
enum lru_list lru = page_lru(page);
@@ -716,7 +600,7 @@ index 355ea1ee32bd..ae3e3826dd7f 100644
update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
list_add_tail(&page->lru, &lruvec->lists[lru]);
}
-@@ -100,6 +291,9 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page,
+@@ -100,6 +331,9 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page,
static __always_inline void del_page_from_lru_list(struct page *page,
struct lruvec *lruvec)
{
@@ -726,15 +610,148 @@ index 355ea1ee32bd..ae3e3826dd7f 100644
list_del(&page->lru);
update_lru_size(lruvec, page_lru(page), page_zonenum(page),
-thp_nr_pages(page));
+diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
+index 8f0fb62e8975..602901a0b1d0 100644
+--- a/include/linux/mm_types.h
++++ b/include/linux/mm_types.h
+@@ -15,6 +15,8 @@
+ #include <linux/page-flags-layout.h>
+ #include <linux/workqueue.h>
+ #include <linux/seqlock.h>
++#include <linux/nodemask.h>
++#include <linux/mmdebug.h>
+
+ #include <asm/mmu.h>
+
+@@ -574,6 +576,22 @@ struct mm_struct {
+
+ #ifdef CONFIG_IOMMU_SUPPORT
+ u32 pasid;
++#endif
++#ifdef CONFIG_LRU_GEN
++ struct {
++ /* the node of a global or per-memcg mm_struct list */
++ struct list_head list;
++#ifdef CONFIG_MEMCG
++ /* points to the memcg of the owner task above */
++ struct mem_cgroup *memcg;
++#endif
++ /* whether this mm_struct has been used since the last walk */
++ nodemask_t nodes;
++#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
++ /* the number of CPUs using this mm_struct */
++ atomic_t nr_cpus;
++#endif
++ } lrugen;
+ #endif
+ } __randomize_layout;
+
+@@ -601,6 +619,95 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
+ return (struct cpumask *)&mm->cpu_bitmap;
+ }
+
++#ifdef CONFIG_LRU_GEN
++
++void lru_gen_init_mm(struct mm_struct *mm);
++void lru_gen_add_mm(struct mm_struct *mm);
++void lru_gen_del_mm(struct mm_struct *mm);
++#ifdef CONFIG_MEMCG
++int lru_gen_alloc_mm_list(struct mem_cgroup *memcg);
++void lru_gen_free_mm_list(struct mem_cgroup *memcg);
++void lru_gen_migrate_mm(struct mm_struct *mm);
++#endif
++
++/* Track the usage of each mm_struct so that we can skip inactive ones. */
++static inline void lru_gen_switch_mm(struct mm_struct *old, struct mm_struct *new)
++{
++ /* exclude init_mm, efi_mm, etc. */
++ if (!core_kernel_data((unsigned long)old)) {
++ VM_BUG_ON(old == &init_mm);
++
++ nodes_setall(old->lrugen.nodes);
++#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
++ atomic_dec(&old->lrugen.nr_cpus);
++ VM_BUG_ON_MM(atomic_read(&old->lrugen.nr_cpus) < 0, old);
++#endif
++ } else
++ VM_BUG_ON_MM(READ_ONCE(old->lrugen.list.prev) ||
++ READ_ONCE(old->lrugen.list.next), old);
++
++ if (!core_kernel_data((unsigned long)new)) {
++ VM_BUG_ON(new == &init_mm);
++
++#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
++ atomic_inc(&new->lrugen.nr_cpus);
++ VM_BUG_ON_MM(atomic_read(&new->lrugen.nr_cpus) < 0, new);
++#endif
++ } else
++ VM_BUG_ON_MM(READ_ONCE(new->lrugen.list.prev) ||
++ READ_ONCE(new->lrugen.list.next), new);
++}
++
++/* Return whether this mm_struct is being used on any CPUs. */
++static inline bool lru_gen_mm_is_active(struct mm_struct *mm)
++{
++#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
++ return !cpumask_empty(mm_cpumask(mm));
++#else
++ return atomic_read(&mm->lrugen.nr_cpus);
++#endif
++}
++
++#else /* CONFIG_LRU_GEN */
++
++static inline void lru_gen_init_mm(struct mm_struct *mm)
++{
++}
++
++static inline void lru_gen_add_mm(struct mm_struct *mm)
++{
++}
++
++static inline void lru_gen_del_mm(struct mm_struct *mm)
++{
++}
++
++#ifdef CONFIG_MEMCG
++static inline int lru_gen_alloc_mm_list(struct mem_cgroup *memcg)
++{
++ return 0;
++}
++
++static inline void lru_gen_free_mm_list(struct mem_cgroup *memcg)
++{
++}
++
++static inline void lru_gen_migrate_mm(struct mm_struct *mm)
++{
++}
++#endif
++
++static inline void lru_gen_switch_mm(struct mm_struct *old, struct mm_struct *new)
++{
++}
++
++static inline bool lru_gen_mm_is_active(struct mm_struct *mm)
++{
++ return false;
++}
++
++#endif /* CONFIG_LRU_GEN */
++
+ struct mmu_gather;
+ extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
+ extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
-index 0d53eba1c383..e5deec17b4bd 100644
+index 0d53eba1c383..ded72f44d7e7 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
-@@ -293,6 +293,108 @@ enum lruvec_flags {
+@@ -293,6 +293,114 @@ enum lruvec_flags {
*/
};
+struct lruvec;
++struct page_vma_mapped_walk;
+
+#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
+#define LRU_USAGE_MASK ((BIT(LRU_USAGE_WIDTH) - 1) << LRU_USAGE_PGOFF)
@@ -823,6 +840,7 @@ index 0d53eba1c383..e5deec17b4bd 100644
+
+void lru_gen_init_lruvec(struct lruvec *lruvec);
+void lru_gen_set_state(bool enable, bool main, bool swap);
++void lru_gen_scan_around(struct page_vma_mapped_walk *pvmw);
+
+#else /* CONFIG_LRU_GEN */
+
@@ -834,12 +852,16 @@ index 0d53eba1c383..e5deec17b4bd 100644
+{
+}
+
++static inline void lru_gen_scan_around(struct page_vma_mapped_walk *pvmw)
++{
++}
++
+#endif /* CONFIG_LRU_GEN */
+
struct lruvec {
struct list_head lists[NR_LRU_LISTS];
/* per lruvec lru_lock for memcg */
-@@ -310,6 +412,10 @@ struct lruvec {
+@@ -310,6 +418,10 @@ struct lruvec {
unsigned long refaults[ANON_AND_FILE];
/* Various lruvec state flags (enum lruvec_flags) */
unsigned long flags;
@@ -850,6 +872,37 @@ index 0d53eba1c383..e5deec17b4bd 100644
#ifdef CONFIG_MEMCG
struct pglist_data *pgdat;
#endif
+@@ -751,6 +863,8 @@ struct deferred_split {
+ };
+ #endif
+
++struct mm_walk_args;
++
+ /*
+ * On NUMA machines, each NUMA node would have a pg_data_t to describe
+ * it's memory layout. On UMA machines there is a single pglist_data which
+@@ -856,6 +970,9 @@ typedef struct pglist_data {
+
+ unsigned long flags;
+
++#ifdef CONFIG_LRU_GEN
++ struct mm_walk_args *mm_walk_args;
++#endif
+ ZONE_PADDING(_pad2_)
+
+ /* Per-node vmstats */
+diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
+index ac398e143c9a..89fe4e3592f9 100644
+--- a/include/linux/nodemask.h
++++ b/include/linux/nodemask.h
+@@ -486,6 +486,7 @@ static inline int num_node_state(enum node_states state)
+ #define first_online_node 0
+ #define first_memory_node 0
+ #define next_online_node(nid) (MAX_NUMNODES)
++#define next_memory_node(nid) (MAX_NUMNODES)
+ #define nr_node_ids 1U
+ #define nr_online_nodes 1U
+
diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h
index ef1e3e736e14..ce8d5732a3aa 100644
--- a/include/linux/page-flags-layout.h
@@ -873,7 +926,7 @@ index ef1e3e736e14..ce8d5732a3aa 100644
#define SECTIONS_WIDTH 0
#endif
--#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
+-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_USAGE_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \
+ <= BITS_PER_LONG - NR_PAGEFLAGS
#define NODES_WIDTH NODES_SHIFT
@@ -883,7 +936,7 @@ index ef1e3e736e14..ce8d5732a3aa 100644
#define LAST_CPUPID_SHIFT 0
#endif
--#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_CPUPID_SHIFT+KASAN_TAG_WIDTH \
+-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \
- <= BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_USAGE_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
+ KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
@@ -894,7 +947,7 @@ index ef1e3e736e14..ce8d5732a3aa 100644
#define LAST_CPUPID_NOT_IN_PAGE_FLAGS
#endif
--#if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH+LAST_CPUPID_WIDTH+KASAN_TAG_WIDTH \
+-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \
- > BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_USAGE_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
+ KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
@@ -923,6 +976,43 @@ index 04a34c08e0a6..e58984fca32a 100644
#define PAGE_FLAGS_PRIVATE \
(1UL << PG_private | 1UL << PG_private_2)
+diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
+index a43047b1030d..47c2c39bafdf 100644
+--- a/include/linux/pgtable.h
++++ b/include/linux/pgtable.h
+@@ -193,7 +193,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
+ #endif
+
+ #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
+-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
++#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG)
+ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long address,
+ pmd_t *pmdp)
+@@ -214,7 +214,7 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ BUILD_BUG();
+ return 0;
+ }
+-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
++#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG */
+ #endif
+
+ #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+diff --git a/include/linux/swap.h b/include/linux/swap.h
+index 144727041e78..30b1f15f5c6e 100644
+--- a/include/linux/swap.h
++++ b/include/linux/swap.h
+@@ -365,8 +365,8 @@ extern void deactivate_page(struct page *page);
+ extern void mark_page_lazyfree(struct page *page);
+ extern void swap_setup(void);
+
+-extern void lru_cache_add_inactive_or_unevictable(struct page *page,
+- struct vm_area_struct *vma);
++extern void lru_cache_add_page_vma(struct page *page, struct vm_area_struct *vma,
++ bool faulting);
+
+ /* linux/mm/vmscan.c */
+ extern unsigned long zone_reclaimable_pages(struct zone *zone);
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 9795d75b09b2..a8cbf2d0b11a 100644
--- a/kernel/bounds.c
@@ -940,501 +1030,177 @@ index 9795d75b09b2..a8cbf2d0b11a 100644
/* End of constants */
return 0;
-diff --git a/mm/huge_memory.c b/mm/huge_memory.c
-index 63ed6b25deaa..8ac9093e5a0d 100644
---- a/mm/huge_memory.c
-+++ b/mm/huge_memory.c
-@@ -2410,7 +2410,8 @@ static void __split_huge_page_tail(struct page *head, int tail,
- #ifdef CONFIG_64BIT
- (1L << PG_arch_2) |
+diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
+index 6addc9780319..4e93e5602723 100644
+--- a/kernel/events/uprobes.c
++++ b/kernel/events/uprobes.c
+@@ -184,7 +184,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
+ if (new_page) {
+ get_page(new_page);
+ page_add_new_anon_rmap(new_page, vma, addr, false);
+- lru_cache_add_inactive_or_unevictable(new_page, vma);
++ lru_cache_add_page_vma(new_page, vma, false);
+ } else
+ /* no new page, just dec_mm_counter for old_page */
+ dec_mm_counter(mm, MM_ANONPAGES);
+diff --git a/kernel/exit.c b/kernel/exit.c
+index 65809fac3038..6e6d95b0462c 100644
+--- a/kernel/exit.c
++++ b/kernel/exit.c
+@@ -422,6 +422,7 @@ void mm_update_next_owner(struct mm_struct *mm)
+ goto retry;
+ }
+ WRITE_ONCE(mm->owner, c);
++ lru_gen_migrate_mm(mm);
+ task_unlock(c);
+ put_task_struct(c);
+ }
+diff --git a/kernel/fork.c b/kernel/fork.c
+index 03baafd70b98..7a72a9e17059 100644
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -673,6 +673,7 @@ static void check_mm(struct mm_struct *mm)
+ #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+ VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
#endif
-- (1L << PG_dirty)));
-+ (1L << PG_dirty) |
-+ LRU_GEN_MASK | LRU_USAGE_MASK));
++ VM_BUG_ON_MM(lru_gen_mm_is_active(mm), mm);
+ }
- /* ->mapping in first tail page is compound_mapcount */
- VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
-diff --git a/mm/mm_init.c b/mm/mm_init.c
-index 9ddaf0e1b0ab..ef0deadb90a7 100644
---- a/mm/mm_init.c
-+++ b/mm/mm_init.c
-@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layout(void)
+ #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
+@@ -1065,6 +1066,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
+ goto fail_nocontext;
- shift = 8 * sizeof(unsigned long);
- width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
-- - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH;
-+ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_USAGE_WIDTH;
- mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
-- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n",
-+ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
- SECTIONS_WIDTH,
- NODES_WIDTH,
- ZONES_WIDTH,
- LAST_CPUPID_WIDTH,
- KASAN_TAG_WIDTH,
-+ LRU_GEN_WIDTH,
-+ LRU_USAGE_WIDTH,
- NR_PAGEFLAGS);
- mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
-diff --git a/mm/mmzone.c b/mm/mmzone.c
-index eb89d6e018e2..2ec0d7793424 100644
---- a/mm/mmzone.c
-+++ b/mm/mmzone.c
-@@ -81,6 +81,8 @@ void lruvec_init(struct lruvec *lruvec)
+ mm->user_ns = get_user_ns(user_ns);
++ lru_gen_init_mm(mm);
+ return mm;
- for_each_lru(lru)
- INIT_LIST_HEAD(&lruvec->lists[lru]);
-+
-+ lru_gen_init_lruvec(lruvec);
+ fail_nocontext:
+@@ -1107,6 +1109,7 @@ static inline void __mmput(struct mm_struct *mm)
+ }
+ if (mm->binfmt)
+ module_put(mm->binfmt->module);
++ lru_gen_del_mm(mm);
+ mmdrop(mm);
}
- #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
-diff --git a/mm/swapfile.c b/mm/swapfile.c
-index 149e77454e3c..3598b668f533 100644
---- a/mm/swapfile.c
-+++ b/mm/swapfile.c
-@@ -2702,6 +2702,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
- err = 0;
- atomic_inc(&proc_poll_event);
- wake_up_interruptible(&proc_poll_wait);
-+ /* stop tracking anon if the multigenerational lru is turned off */
-+ lru_gen_set_state(false, false, true);
+@@ -2531,6 +2534,13 @@ pid_t kernel_clone(struct kernel_clone_args *args)
+ get_task_struct(p);
+ }
- out_dput:
- filp_close(victim, NULL);
-@@ -3348,6 +3350,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
- mutex_unlock(&swapon_mutex);
- atomic_inc(&proc_poll_event);
- wake_up_interruptible(&proc_poll_wait);
-+ /* start tracking anon if the multigenerational lru is turned on */
-+ lru_gen_set_state(true, false, true);
++ if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
++ /* lock the task to synchronize with memcg migration */
++ task_lock(p);
++ lru_gen_add_mm(p->mm);
++ task_unlock(p);
++ }
++
+ wake_up_new_task(p);
- error = 0;
- goto out;
-diff --git a/mm/vmscan.c b/mm/vmscan.c
-index 2339459c97d4..f7bbfc0b1ebd 100644
---- a/mm/vmscan.c
-+++ b/mm/vmscan.c
-@@ -49,6 +49,7 @@
- #include <linux/printk.h>
- #include <linux/dax.h>
- #include <linux/psi.h>
-+#include <linux/memory.h>
+ /* forking complete and child started to run, tell ptracer */
+diff --git a/kernel/kthread.c b/kernel/kthread.c
+index 0fccf7d0c6a1..42cea2a77273 100644
+--- a/kernel/kthread.c
++++ b/kernel/kthread.c
+@@ -1350,6 +1350,7 @@ void kthread_use_mm(struct mm_struct *mm)
+ tsk->mm = mm;
+ membarrier_update_current_mm(mm);
+ switch_mm_irqs_off(active_mm, mm, tsk);
++ lru_gen_switch_mm(active_mm, mm);
+ local_irq_enable();
+ task_unlock(tsk);
+ #ifdef finish_arch_post_lock_switch
+diff --git a/kernel/sched/core.c b/kernel/sched/core.c
+index 4ca80df205ce..68e6dc4ef643 100644
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -4323,6 +4323,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
+ * finish_task_switch()'s mmdrop().
+ */
+ switch_mm_irqs_off(prev->active_mm, next->mm, next);
++ lru_gen_switch_mm(prev->active_mm, next->mm);
- #include <asm/tlbflush.h>
- #include <asm/div64.h>
-@@ -2715,6 +2716,311 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
+ if (!prev->mm) { // from kernel
+ /* will mmdrop() in finish_task_switch(). */
+@@ -7602,6 +7603,7 @@ void idle_task_exit(void)
+
+ if (mm != &init_mm) {
+ switch_mm(mm, &init_mm, current);
++ lru_gen_switch_mm(mm, &init_mm);
+ finish_arch_post_lock_switch();
}
- }
-+#ifdef CONFIG_LRU_GEN
-+
-+/*
-+ * After pages are faulted in, the aging must scan them twice before the
-+ * eviction can consider them. The first scan clears the accessed bit set during
-+ * initial faults. And the second scan makes sure they haven't been used since
-+ * the first scan.
-+ */
-+#define MIN_NR_GENS 2
-+
-+#define MAX_BATCH_SIZE 8192
-+
-+/******************************************************************************
-+ * shorthand helpers
-+ ******************************************************************************/
-+
-+#define DEFINE_MAX_SEQ() \
-+ unsigned long max_seq = READ_ONCE(lruvec->evictable.max_seq)
-+
-+#define DEFINE_MIN_SEQ() \
-+ unsigned long min_seq[ANON_AND_FILE] = { \
-+ READ_ONCE(lruvec->evictable.min_seq[0]), \
-+ READ_ONCE(lruvec->evictable.min_seq[1]), \
-+ }
-+
-+#define for_each_type_zone(type, zone) \
-+ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
-+ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
-+
-+#define for_each_gen_type_zone(gen, type, zone) \
-+ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
-+ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
-+ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
-+
-+static int page_lru_gen(struct page *page)
-+{
-+ return ((page->flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
-+}
-+
-+static int get_nr_gens(struct lruvec *lruvec, int type)
-+{
-+ return lruvec->evictable.max_seq - lruvec->evictable.min_seq[type] + 1;
-+}
-+
-+static int min_nr_gens(unsigned long max_seq, unsigned long *min_seq, int swappiness)
-+{
-+ return max_seq - max(min_seq[!swappiness], min_seq[1]) + 1;
-+}
-+
-+static int max_nr_gens(unsigned long max_seq, unsigned long *min_seq, int swappiness)
-+{
-+ return max_seq - min(min_seq[!swappiness], min_seq[1]) + 1;
-+}
-+
-+static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
-+{
-+ lockdep_assert_held(&lruvec->lru_lock);
-+
-+ return get_nr_gens(lruvec, 0) >= MIN_NR_GENS &&
-+ get_nr_gens(lruvec, 0) <= MAX_NR_GENS &&
-+ get_nr_gens(lruvec, 1) >= MIN_NR_GENS &&
-+ get_nr_gens(lruvec, 1) <= MAX_NR_GENS;
-+}
-+
-+/******************************************************************************
-+ * state change
-+ ******************************************************************************/
-+
-+#ifdef CONFIG_LRU_GEN_ENABLED
-+DEFINE_STATIC_KEY_TRUE(lru_gen_static_key);
-+#else
-+DEFINE_STATIC_KEY_FALSE(lru_gen_static_key);
-+#endif
-+
-+static DEFINE_MUTEX(lru_gen_state_mutex);
-+static int lru_gen_nr_swapfiles __read_mostly;
-+
-+static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
-+{
-+ int gen, type, zone;
-+ enum lru_list lru;
-+ struct lrugen *lrugen = &lruvec->evictable;
-+
-+ for_each_evictable_lru(lru) {
-+ type = is_file_lru(lru);
-+
-+ if (lrugen->enabled[type] && !list_empty(&lruvec->lists[lru]))
-+ return false;
-+ }
-+
-+ for_each_gen_type_zone(gen, type, zone) {
-+ if (!lrugen->enabled[type] && !list_empty(&lrugen->lists[gen][type][zone]))
-+ return false;
-+
-+ VM_WARN_ON_ONCE(!lrugen->enabled[type] && lrugen->sizes[gen][type][zone]);
-+ }
-+
-+ return true;
-+}
-+
-+static bool fill_lru_gen_lists(struct lruvec *lruvec)
-+{
-+ enum lru_list lru;
-+ int batch_size = 0;
-+
-+ for_each_evictable_lru(lru) {
-+ int type = is_file_lru(lru);
-+ bool active = is_active_lru(lru);
-+ struct list_head *head = &lruvec->lists[lru];
-+
-+ if (!lruvec->evictable.enabled[type])
-+ continue;
-+
-+ while (!list_empty(head)) {
-+ bool success;
-+ struct page *page = lru_to_page(head);
-+
-+ VM_BUG_ON_PAGE(PageTail(page), page);
-+ VM_BUG_ON_PAGE(PageUnevictable(page), page);
-+ VM_BUG_ON_PAGE(PageActive(page) != active, page);
-+ VM_BUG_ON_PAGE(page_lru_gen(page) != -1, page);
-+ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
-+
-+ prefetchw_prev_lru_page(page, head, flags);
-+
-+ del_page_from_lru_list(page, lruvec);
-+ success = lru_gen_addition(page, lruvec, true);
-+ VM_BUG_ON(!success);
-+
-+ if (++batch_size == MAX_BATCH_SIZE)
-+ return false;
-+ }
-+ }
-+
-+ return true;
-+}
-+
-+static bool drain_lru_gen_lists(struct lruvec *lruvec)
-+{
-+ int gen, type, zone;
-+ int batch_size = 0;
-+
-+ for_each_gen_type_zone(gen, type, zone) {
-+ struct list_head *head = &lruvec->evictable.lists[gen][type][zone];
-+
-+ if (lruvec->evictable.enabled[type])
-+ continue;
-+
-+ while (!list_empty(head)) {
-+ bool success;
-+ struct page *page = lru_to_page(head);
-+
-+ VM_BUG_ON_PAGE(PageTail(page), page);
-+ VM_BUG_ON_PAGE(PageUnevictable(page), page);
-+ VM_BUG_ON_PAGE(PageActive(page), page);
-+ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
-+ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
-+
-+ prefetchw_prev_lru_page(page, head, flags);
-+
-+ success = lru_gen_deletion(page, lruvec);
-+ VM_BUG_ON(!success);
-+ add_page_to_lru_list(page, lruvec);
-+
-+ if (++batch_size == MAX_BATCH_SIZE)
-+ return false;
-+ }
-+ }
-+
-+ return true;
-+}
-+
-+/*
-+ * For file page tracking, we enable/disable it according to the main switch.
-+ * For anon page tracking, we only enabled it when the main switch is on and
-+ * there is at least one swapfile; we disable it when there are no swapfiles
-+ * regardless of the value of the main switch. Otherwise, we will eventually
-+ * reach the max size of the sliding window and have to call inc_min_seq(),
-+ * which brings an unnecessary overhead.
-+ */
-+void lru_gen_set_state(bool enable, bool main, bool swap)
-+{
-+ struct mem_cgroup *memcg;
-+
-+ mem_hotplug_begin();
-+ mutex_lock(&lru_gen_state_mutex);
-+ cgroup_lock();
-+
-+ main = main && enable != lru_gen_enabled();
-+ swap = swap && !(enable ? lru_gen_nr_swapfiles++ : --lru_gen_nr_swapfiles);
-+ swap = swap && lru_gen_enabled();
-+ if (!main && !swap)
-+ goto unlock;
-+
-+ if (main) {
-+ if (enable)
-+ static_branch_enable(&lru_gen_static_key);
-+ else
-+ static_branch_disable(&lru_gen_static_key);
-+ }
-+
-+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
-+ do {
-+ int nid;
-+
-+ for_each_node_state(nid, N_MEMORY) {
-+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
-+ struct lrugen *lrugen = &lruvec->evictable;
-+
-+ spin_lock_irq(&lruvec->lru_lock);
-+
-+ VM_BUG_ON(!seq_is_valid(lruvec));
-+ VM_BUG_ON(!state_is_valid(lruvec));
-+
-+ WRITE_ONCE(lrugen->enabled[0], lru_gen_enabled() && lru_gen_nr_swapfiles);
-+ WRITE_ONCE(lrugen->enabled[1], lru_gen_enabled());
-+
-+ while (!(enable ? fill_lru_gen_lists(lruvec) :
-+ drain_lru_gen_lists(lruvec))) {
-+ spin_unlock_irq(&lruvec->lru_lock);
-+ cond_resched();
-+ spin_lock_irq(&lruvec->lru_lock);
-+ }
-+
-+ spin_unlock_irq(&lruvec->lru_lock);
-+ }
-+
-+ cond_resched();
-+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
-+unlock:
-+ cgroup_unlock();
-+ mutex_unlock(&lru_gen_state_mutex);
-+ mem_hotplug_done();
-+}
-+
-+static int __meminit __maybe_unused lru_gen_online_mem(struct notifier_block *self,
-+ unsigned long action, void *arg)
-+{
-+ struct mem_cgroup *memcg;
-+ struct memory_notify *mnb = arg;
-+ int nid = mnb->status_change_nid;
-+
-+ if (action != MEM_GOING_ONLINE || nid == NUMA_NO_NODE)
-+ return NOTIFY_DONE;
-+
-+ mutex_lock(&lru_gen_state_mutex);
-+ cgroup_lock();
-+
-+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
-+ do {
-+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
-+ struct lrugen *lrugen = &lruvec->evictable;
-+
-+ VM_BUG_ON(!seq_is_valid(lruvec));
-+ VM_BUG_ON(!state_is_valid(lruvec));
-+
-+ WRITE_ONCE(lrugen->enabled[0], lru_gen_enabled() && lru_gen_nr_swapfiles);
-+ WRITE_ONCE(lrugen->enabled[1], lru_gen_enabled());
-+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
-+
-+ cgroup_unlock();
-+ mutex_unlock(&lru_gen_state_mutex);
-+
-+ return NOTIFY_DONE;
-+}
-+
-+/******************************************************************************
-+ * initialization
-+ ******************************************************************************/
-+
-+void lru_gen_init_lruvec(struct lruvec *lruvec)
-+{
-+ int i;
-+ int gen, type, zone;
-+ struct lrugen *lrugen = &lruvec->evictable;
-+
-+ lrugen->max_seq = MIN_NR_GENS + 1;
-+ lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles;
-+ lrugen->enabled[1] = lru_gen_enabled();
-+
-+ for (i = 0; i <= MIN_NR_GENS + 1; i++)
-+ lrugen->timestamps[i] = jiffies;
-+
-+ for_each_gen_type_zone(gen, type, zone)
-+ INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
-+}
-+
-+static int __init init_lru_gen(void)
-+{
-+ BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
-+ BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
-+
-+ if (hotplug_memory_notifier(lru_gen_online_mem, 0))
-+ pr_err("lru_gen: failed to subscribe hotplug notifications\n");
-+
-+ return 0;
-+};
-+/*
-+ * We want to run as early as possible because debug code may call mm_alloc()
-+ * and mmput(). Out only dependency mm_kobj is initialized one stage earlier.
-+ */
-+arch_initcall(init_lru_gen);
+diff --git a/mm/Kconfig b/mm/Kconfig
+index 02d44e3420f5..da125f145bc4 100644
+--- a/mm/Kconfig
++++ b/mm/Kconfig
+@@ -901,4 +901,62 @@ config KMAP_LOCAL
+ # struct io_mapping based helper. Selected by drivers that need them
+ config IO_MAPPING
+ bool
+
-+#endif /* CONFIG_LRU_GEN */
++# the multigenerational lru {
++config LRU_GEN
++ bool "Multigenerational LRU"
++ depends on MMU
++ help
++ A high performance LRU implementation to heavily overcommit workloads
++ that are not IO bound. See Documentation/vm/multigen_lru.rst for
++ details.
+
- static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
- {
- unsigned long nr[NR_LRU_LISTS];
-
-diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
-index ae3e3826dd7f..f3b99f65a652 100644
---- a/include/linux/mm_inline.h
-+++ b/include/linux/mm_inline.h
-@@ -103,6 +103,12 @@ static inline int lru_gen_from_seq(unsigned long seq)
- return seq % MAX_NR_GENS;
- }
-
-+/* Convert the level of usage to a tier. See the comment on MAX_NR_TIERS. */
-+static inline int lru_tier_from_usage(int usage)
-+{
-+ return order_base_2(usage + 1);
-+}
++ Warning: do not enable this option unless you plan to use it because
++ it introduces a small per-process and per-memcg and per-node memory
++ overhead.
+
- /* Return a proper index regardless whether we keep a full history of stats. */
- static inline int hist_from_seq_or_gen(int seq_or_gen)
- {
-@@ -245,6 +251,36 @@ static inline bool lru_gen_deletion(struct page *page, struct lruvec *lruvec)
- return true;
- }
-
-+/* Return the level of usage of a page. See the comment on MAX_NR_TIERS. */
-+static inline int page_tier_usage(struct page *page)
-+{
-+ unsigned long flags = READ_ONCE(page->flags);
++config LRU_GEN_ENABLED
++ bool "Turn on by default"
++ depends on LRU_GEN
++ help
++ The default value of /sys/kernel/mm/lru_gen/enabled is 0. This option
++ changes it to 1.
+
-+ return flags & BIT(PG_workingset) ?
-+ ((flags & LRU_USAGE_MASK) >> LRU_USAGE_PGOFF) + 1 : 0;
-+}
++ Warning: the default value is the fast path. See
++ Documentation/static-keys.txt for details.
+
-+/* Increment the usage counter after a page is accessed via file descriptors. */
-+static inline void page_inc_usage(struct page *page)
-+{
-+ unsigned long usage;
-+ unsigned long old_flags, new_flags;
++config LRU_GEN_STATS
++ bool "Full stats for debugging"
++ depends on LRU_GEN
++ help
++ This option keeps full stats for each generation, which can be read
++ from /sys/kernel/debug/lru_gen_full.
+
-+ do {
-+ old_flags = READ_ONCE(page->flags);
++ Warning: do not enable this option unless you plan to use it because
++ it introduces an additional small per-process and per-memcg and
++ per-node memory overhead.
+
-+ if (!(old_flags & BIT(PG_workingset))) {
-+ new_flags = old_flags | BIT(PG_workingset);
-+ continue;
-+ }
++config NR_LRU_GENS
++ int "Max number of generations"
++ depends on LRU_GEN
++ range 4 31
++ default 7
++ help
++ This will use order_base_2(N+1) spare bits from page flags.
+
-+ usage = (old_flags & LRU_USAGE_MASK) + BIT(LRU_USAGE_PGOFF);
++ Warning: do not use numbers larger than necessary because each
++ generation introduces a small per-node and per-memcg memory overhead.
+
-+ new_flags = (old_flags & ~LRU_USAGE_MASK) | min(usage, LRU_USAGE_MASK);
-+ } while (new_flags != old_flags &&
-+ cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
-+}
++config TIERS_PER_GEN
++ int "Number of tiers per generation"
++ depends on LRU_GEN
++ range 2 5
++ default 4
++ help
++ This will use N-2 spare bits from page flags.
+
- #else /* CONFIG_LRU_GEN */
-
- static inline bool lru_gen_enabled(void)
-@@ -262,6 +298,10 @@ static inline bool lru_gen_deletion(struct page *page, struct lruvec *lruvec)
- return false;
- }
-
-+static inline void page_inc_usage(struct page *page)
-+{
-+}
++ Larger values generally offer better protection to active pages under
++ heavy buffered I/O workloads.
++# }
+
- #endif /* CONFIG_LRU_GEN */
-
- static __always_inline void add_page_to_lru_list(struct page *page,
-diff --git a/include/linux/swap.h b/include/linux/swap.h
-index 144727041e78..30b1f15f5c6e 100644
---- a/include/linux/swap.h
-+++ b/include/linux/swap.h
-@@ -365,8 +365,8 @@ extern void deactivate_page(struct page *page);
- extern void mark_page_lazyfree(struct page *page);
- extern void swap_setup(void);
-
--extern void lru_cache_add_inactive_or_unevictable(struct page *page,
-- struct vm_area_struct *vma);
-+extern void lru_cache_add_page_vma(struct page *page, struct vm_area_struct *vma,
-+ bool faulting);
-
- /* linux/mm/vmscan.c */
- extern unsigned long zone_reclaimable_pages(struct zone *zone);
-diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
-index 6addc9780319..4e93e5602723 100644
---- a/kernel/events/uprobes.c
-+++ b/kernel/events/uprobes.c
-@@ -184,7 +184,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
- if (new_page) {
- get_page(new_page);
- page_add_new_anon_rmap(new_page, vma, addr, false);
-- lru_cache_add_inactive_or_unevictable(new_page, vma);
-+ lru_cache_add_page_vma(new_page, vma, false);
- } else
- /* no new page, just dec_mm_counter for old_page */
- dec_mm_counter(mm, MM_ANONPAGES);
+ endmenu
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
-index 8ac9093e5a0d..681da4a3cf61 100644
+index 6d2a0119fc58..64c70c322ac4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
-@@ -636,7 +636,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
+@@ -639,7 +639,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
entry = mk_huge_pmd(page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
page_add_new_anon_rmap(page, vma, haddr, true);
@@ -1443,6 +1209,16 @@ index 8ac9093e5a0d..681da4a3cf61 100644
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
+@@ -2422,7 +2422,8 @@ static void __split_huge_page_tail(struct page *head, int tail,
+ #ifdef CONFIG_64BIT
+ (1L << PG_arch_2) |
+ #endif
+- (1L << PG_dirty)));
++ (1L << PG_dirty) |
++ LRU_GEN_MASK | LRU_USAGE_MASK));
+
+ /* ->mapping in first tail page is compound_mapcount */
+ VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 6c0185fdd815..09e5346c2754 100644
--- a/mm/khugepaged.c
@@ -1456,8 +1232,68 @@ index 6c0185fdd815..09e5346c2754 100644
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
update_mmu_cache_pmd(vma, address, pmd);
+diff --git a/mm/memcontrol.c b/mm/memcontrol.c
+index 64ada9e650a5..58b610ffa0e0 100644
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -4981,6 +4981,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
+ for_each_node(node)
+ free_mem_cgroup_per_node_info(memcg, node);
+ free_percpu(memcg->vmstats_percpu);
++ lru_gen_free_mm_list(memcg);
+ kfree(memcg);
+ }
+
+@@ -5030,6 +5031,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
+ if (alloc_mem_cgroup_per_node_info(memcg, node))
+ goto fail;
+
++ if (lru_gen_alloc_mm_list(memcg))
++ goto fail;
++
+ if (memcg_wb_domain_init(memcg, GFP_KERNEL))
+ goto fail;
+
+@@ -5991,6 +5995,29 @@ static void mem_cgroup_move_task(void)
+ }
+ #endif
+
++#ifdef CONFIG_LRU_GEN
++static void mem_cgroup_attach(struct cgroup_taskset *tset)
++{
++ struct cgroup_subsys_state *css;
++ struct task_struct *task = NULL;
++
++ cgroup_taskset_for_each_leader(task, css, tset)
++ ;
++
++ if (!task)
++ return;
++
++ task_lock(task);
++ if (task->mm && task->mm->owner == task)
++ lru_gen_migrate_mm(task->mm);
++ task_unlock(task);
++}
++#else
++static void mem_cgroup_attach(struct cgroup_taskset *tset)
++{
++}
++#endif
++
+ static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
+ {
+ if (value == PAGE_COUNTER_MAX)
+@@ -6332,6 +6359,7 @@ struct cgroup_subsys memory_cgrp_subsys = {
+ .css_reset = mem_cgroup_css_reset,
+ .css_rstat_flush = mem_cgroup_css_rstat_flush,
+ .can_attach = mem_cgroup_can_attach,
++ .attach = mem_cgroup_attach,
+ .cancel_attach = mem_cgroup_cancel_attach,
+ .post_attach = mem_cgroup_move_task,
+ .dfl_cftypes = memory_files,
diff --git a/mm/memory.c b/mm/memory.c
-index 730daa00952b..a76196885f92 100644
+index 486f4a2874e7..c017bdac5fd1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -839,7 +839,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
@@ -1469,7 +1305,7 @@ index 730daa00952b..a76196885f92 100644
rss[mm_counter(new_page)]++;
/* All done, just insert the new page copy in the child */
-@@ -2950,7 +2950,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
+@@ -2962,7 +2962,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
*/
ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
page_add_new_anon_rmap(new_page, vma, vmf->address, false);
@@ -1478,7 +1314,7 @@ index 730daa00952b..a76196885f92 100644
/*
* We call the notify macro here because, when using secondary
* mmu page tables (such as kvm shadow page tables), we want the
-@@ -3479,7 +3479,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
+@@ -3521,7 +3521,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
/* ksm created a completely new copy */
if (unlikely(page != swapcache && swapcache)) {
page_add_new_anon_rmap(page, vma, vmf->address, false);
@@ -1487,7 +1323,7 @@ index 730daa00952b..a76196885f92 100644
} else {
do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
}
-@@ -3625,7 +3625,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
+@@ -3668,7 +3668,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address, false);
@@ -1496,7 +1332,7 @@ index 730daa00952b..a76196885f92 100644
setpte:
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
-@@ -3793,7 +3793,7 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
+@@ -3838,7 +3838,7 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
if (write && !(vma->vm_flags & VM_SHARED)) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, addr, false);
@@ -1506,10 +1342,10 @@ index 730daa00952b..a76196885f92 100644
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, false);
diff --git a/mm/migrate.c b/mm/migrate.c
-index b234c3f3acb7..d3307c9eced4 100644
+index 41ff2c9896c4..e103ab266d97 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
-@@ -2967,7 +2967,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
+@@ -2968,7 +2968,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
inc_mm_counter(mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, addr, false);
if (!is_zone_device_page(page))
@@ -1518,6 +1354,66 @@ index b234c3f3acb7..d3307c9eced4 100644
get_page(page);
if (flush) {
+diff --git a/mm/mm_init.c b/mm/mm_init.c
+index 9ddaf0e1b0ab..ef0deadb90a7 100644
+--- a/mm/mm_init.c
++++ b/mm/mm_init.c
+@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layout(void)
+
+ shift = 8 * sizeof(unsigned long);
+ width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
+- - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH;
++ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_USAGE_WIDTH;
+ mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
+- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n",
++ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
+ SECTIONS_WIDTH,
+ NODES_WIDTH,
+ ZONES_WIDTH,
+ LAST_CPUPID_WIDTH,
+ KASAN_TAG_WIDTH,
++ LRU_GEN_WIDTH,
++ LRU_USAGE_WIDTH,
+ NR_PAGEFLAGS);
+ mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
+ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
+diff --git a/mm/mmzone.c b/mm/mmzone.c
+index eb89d6e018e2..2ec0d7793424 100644
+--- a/mm/mmzone.c
++++ b/mm/mmzone.c
+@@ -81,6 +81,8 @@ void lruvec_init(struct lruvec *lruvec)
+
+ for_each_lru(lru)
+ INIT_LIST_HEAD(&lruvec->lists[lru]);
++
++ lru_gen_init_lruvec(lruvec);
+ }
+
+ #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
+diff --git a/mm/rmap.c b/mm/rmap.c
+index e05c300048e6..1a33e394f516 100644
+--- a/mm/rmap.c
++++ b/mm/rmap.c
+@@ -72,6 +72,7 @@
+ #include <linux/page_idle.h>
+ #include <linux/memremap.h>
+ #include <linux/userfaultfd_k.h>
++#include <linux/mm_inline.h>
+
+ #include <asm/tlbflush.h>
+
+@@ -789,6 +790,11 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
+ }
+
+ if (pvmw.pte) {
++ /* the multigenerational LRU exploits spatial locality */
++ if (lru_gen_enabled() && pte_young(*pvmw.pte)) {
++ lru_gen_scan_around(&pvmw);
++ referenced++;
++ }
+ if (ptep_clear_flush_young_notify(vma, address,
+ pvmw.pte)) {
+ /*
diff --git a/mm/swap.c b/mm/swap.c
index dfb48cf9c2c9..96ce95eeb2c9 100644
--- a/mm/swap.c
@@ -1591,7 +1487,7 @@ index dfb48cf9c2c9..96ce95eeb2c9 100644
local_lock(&lru_pvecs.lock);
diff --git a/mm/swapfile.c b/mm/swapfile.c
-index 3598b668f533..549e94318b2f 100644
+index 996afa8131c8..8b5ca15df123 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1936,7 +1936,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
@@ -1603,8 +1499,26 @@ index 3598b668f533..549e94318b2f 100644
}
swap_free(entry);
out:
+@@ -2702,6 +2702,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
+ err = 0;
+ atomic_inc(&proc_poll_event);
+ wake_up_interruptible(&proc_poll_wait);
++ /* stop tracking anon pages when swap is turned off */
++ lru_gen_set_state(false, false, true);
+
+ out_dput:
+ filp_close(victim, NULL);
+@@ -3348,6 +3350,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
+ mutex_unlock(&swapon_mutex);
+ atomic_inc(&proc_poll_event);
+ wake_up_interruptible(&proc_poll_wait);
++ /* start tracking anon pages when swap is turned on */
++ lru_gen_set_state(true, false, true);
+
+ error = 0;
+ goto out;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
-index e14b3820c6a8..175d55b4f594 100644
+index 63a73e164d55..747a2d7eb5b6 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -123,7 +123,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
@@ -1617,10 +1531,22 @@ index e14b3820c6a8..175d55b4f594 100644
set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
diff --git a/mm/vmscan.c b/mm/vmscan.c
-index f7bbfc0b1ebd..84d25079092e 100644
+index 5199b9696bab..ff2deec24c64 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
-@@ -1094,9 +1094,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
+@@ -49,6 +49,11 @@
+ #include <linux/printk.h>
+ #include <linux/dax.h>
+ #include <linux/psi.h>
++#include <linux/memory.h>
++#include <linux/pagewalk.h>
++#include <linux/shmem_fs.h>
++#include <linux/ctype.h>
++#include <linux/debugfs.h>
+
+ #include <asm/tlbflush.h>
+ #include <asm/div64.h>
+@@ -1093,9 +1098,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
if (PageSwapCache(page)) {
swp_entry_t swap = { .val = page_private(page) };
@@ -1633,10 +1559,193 @@ index f7bbfc0b1ebd..84d25079092e 100644
__delete_from_swap_cache(page, swap, shadow);
xa_unlock_irqrestore(&mapping->i_pages, flags);
put_swap_page(page, swap);
-@@ -2780,6 +2782,93 @@ static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
- get_nr_gens(lruvec, 1) <= MAX_NR_GENS;
+@@ -1306,6 +1313,11 @@ static unsigned int shrink_page_list(struct list_head *page_list,
+ if (!sc->may_unmap && page_mapped(page))
+ goto keep_locked;
+
++ /* in case the page was found accessed by lru_gen_scan_around() */
++ if (lru_gen_enabled() && !ignore_references &&
++ page_mapped(page) && PageReferenced(page))
++ goto keep_locked;
++
+ may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
+ (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
+
+@@ -2421,6 +2433,106 @@ enum scan_balance {
+ SCAN_FILE,
+ };
+
++static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
++{
++ unsigned long file;
++ struct lruvec *target_lruvec;
++
++ if (lru_gen_enabled())
++ return;
++
++ target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
++
++ /*
++ * Determine the scan balance between anon and file LRUs.
++ */
++ spin_lock_irq(&target_lruvec->lru_lock);
++ sc->anon_cost = target_lruvec->anon_cost;
++ sc->file_cost = target_lruvec->file_cost;
++ spin_unlock_irq(&target_lruvec->lru_lock);
++
++ /*
++ * Target desirable inactive:active list ratios for the anon
++ * and file LRU lists.
++ */
++ if (!sc->force_deactivate) {
++ unsigned long refaults;
++
++ refaults = lruvec_page_state(target_lruvec,
++ WORKINGSET_ACTIVATE_ANON);
++ if (refaults != target_lruvec->refaults[0] ||
++ inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
++ sc->may_deactivate |= DEACTIVATE_ANON;
++ else
++ sc->may_deactivate &= ~DEACTIVATE_ANON;
++
++ /*
++ * When refaults are being observed, it means a new
++ * workingset is being established. Deactivate to get
++ * rid of any stale active pages quickly.
++ */
++ refaults = lruvec_page_state(target_lruvec,
++ WORKINGSET_ACTIVATE_FILE);
++ if (refaults != target_lruvec->refaults[1] ||
++ inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
++ sc->may_deactivate |= DEACTIVATE_FILE;
++ else
++ sc->may_deactivate &= ~DEACTIVATE_FILE;
++ } else
++ sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
++
++ /*
++ * If we have plenty of inactive file pages that aren't
++ * thrashing, try to reclaim those first before touching
++ * anonymous pages.
++ */
++ file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
++ if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
++ sc->cache_trim_mode = 1;
++ else
++ sc->cache_trim_mode = 0;
++
++ /*
++ * Prevent the reclaimer from falling into the cache trap: as
++ * cache pages start out inactive, every cache fault will tip
++ * the scan balance towards the file LRU. And as the file LRU
++ * shrinks, so does the window for rotation from references.
++ * This means we have a runaway feedback loop where a tiny
++ * thrashing file LRU becomes infinitely more attractive than
++ * anon pages. Try to detect this based on file LRU size.
++ */
++ if (!cgroup_reclaim(sc)) {
++ unsigned long total_high_wmark = 0;
++ unsigned long free, anon;
++ int z;
++
++ free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
++ file = node_page_state(pgdat, NR_ACTIVE_FILE) +
++ node_page_state(pgdat, NR_INACTIVE_FILE);
++
++ for (z = 0; z < MAX_NR_ZONES; z++) {
++ struct zone *zone = &pgdat->node_zones[z];
++
++ if (!managed_zone(zone))
++ continue;
++
++ total_high_wmark += high_wmark_pages(zone);
++ }
++
++ /*
++ * Consider anon: if that's low too, this isn't a
++ * runaway file reclaim problem, but rather just
++ * extreme pressure. Reclaim as per usual then.
++ */
++ anon = node_page_state(pgdat, NR_INACTIVE_ANON);
++
++ sc->file_is_tiny =
++ file + free <= total_high_wmark &&
++ !(sc->may_deactivate & DEACTIVATE_ANON) &&
++ anon >> sc->priority;
++ }
++}
++
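
As a rough illustration of the file_is_tiny test at the end of prepare_scan_count() above, here is the same expression evaluated with made-up numbers; all values are invented for the example, whereas in the kernel they come from node and zone counters.

	#include <stdbool.h>
	#include <stdio.h>

	int main(void)
	{
		/* invented sample values, in pages */
		unsigned long free = 10000, file = 5000, anon = 400000;
		unsigned long total_high_wmark = 20000;
		int priority = 12;			/* DEF_PRIORITY */
		bool may_deactivate_anon = false;

		bool file_is_tiny = file + free <= total_high_wmark &&
				    !may_deactivate_anon &&
				    anon >> priority;

		printf("file_is_tiny = %d\n", file_is_tiny);	/* 1 */
		return 0;
	}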
+ /*
+ * Determine how aggressively the anon and file LRU lists should be
+ * scanned. The relative value of each set of LRU lists is determined
+@@ -2618,6 +2730,2425 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
+ }
}
++#ifdef CONFIG_LRU_GEN
++
++/*
++ * After pages are faulted in, the aging must scan them twice before the
++ * eviction can consider them. The first scan clears the accessed bit set during
++ * initial faults. And the second scan makes sure they haven't been used since
++ * the first scan.
++ */
++#define MIN_NR_GENS 2
++
++#define MAX_BATCH_SIZE 8192
++
++/******************************************************************************
++ * shorthand helpers
++ ******************************************************************************/
++
++#define DEFINE_MAX_SEQ() \
++ unsigned long max_seq = READ_ONCE(lruvec->evictable.max_seq)
++
++#define DEFINE_MIN_SEQ() \
++ unsigned long min_seq[ANON_AND_FILE] = { \
++ READ_ONCE(lruvec->evictable.min_seq[0]), \
++ READ_ONCE(lruvec->evictable.min_seq[1]), \
++ }
++
++#define for_each_type_zone(type, zone) \
++ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
++ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
++
++#define for_each_gen_type_zone(gen, type, zone) \
++ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
++ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
++ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
++
++static int page_lru_gen(struct page *page)
++{
++ return ((page->flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
++}
++
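
For reference, page_lru_gen() above decodes a generation number that is stored off-by-one in the LRU_GEN_MASK bits, so an all-zero field means the page is not on a multigenerational list. The sketch below mimics that decoding with illustrative LRU_GEN_PGOFF/LRU_GEN_WIDTH values; the real ones come from the page-flags layout.

	#include <stdio.h>

	/* illustrative values, not the real page-flags layout */
	#define LRU_GEN_PGOFF	20
	#define LRU_GEN_WIDTH	3
	#define LRU_GEN_MASK	(((1UL << LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)

	static int page_lru_gen(unsigned long flags)
	{
		return (int)((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
	}

	int main(void)
	{
		unsigned long flags = 0;

		printf("untracked page: gen %d\n", page_lru_gen(flags));	/* -1 */
		flags |= (2UL + 1) << LRU_GEN_PGOFF;				/* store gen 2 */
		printf("tracked page:   gen %d\n", page_lru_gen(flags));	/* 2 */
		return 0;
	}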
++static int get_nr_gens(struct lruvec *lruvec, int type)
++{
++ return lruvec->evictable.max_seq - lruvec->evictable.min_seq[type] + 1;
++}
++
++static int min_nr_gens(unsigned long max_seq, unsigned long *min_seq, int swappiness)
++{
++ return max_seq - max(min_seq[!swappiness], min_seq[1]) + 1;
++}
++
++static int max_nr_gens(unsigned long max_seq, unsigned long *min_seq, int swappiness)
++{
++ return max_seq - min(min_seq[!swappiness], min_seq[1]) + 1;
++}
++
++static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
++{
++ lockdep_assert_held(&lruvec->lru_lock);
++
++ return get_nr_gens(lruvec, 0) >= MIN_NR_GENS &&
++ get_nr_gens(lruvec, 0) <= MAX_NR_GENS &&
++ get_nr_gens(lruvec, 1) >= MIN_NR_GENS &&
++ get_nr_gens(lruvec, 1) <= MAX_NR_GENS;
++}
++
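
The helpers above treat max_seq/min_seq as a sliding window of generation numbers, and a generation's list index is its sequence number modulo MAX_NR_GENS, as in lru_gen_from_seq() earlier in the patch. A small sketch, assuming MAX_NR_GENS tracks the CONFIG_NR_LRU_GENS default of 7 and picking min_seq 3 and max_seq 6:

	#include <stdio.h>

	#define MAX_NR_GENS 7	/* assumed; follows CONFIG_NR_LRU_GENS */

	int main(void)
	{
		unsigned long min_seq = 3, max_seq = 6, seq;

		printf("generations in use: %lu\n", max_seq - min_seq + 1);	/* 4 */
		for (seq = min_seq; seq <= max_seq; seq++)
			printf("seq %lu -> lists[%lu]\n", seq, seq % MAX_NR_GENS);
		return 0;
	}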
+/******************************************************************************
+ * refault feedback loop
+ ******************************************************************************/
@@ -1724,477 +1833,6 @@ index f7bbfc0b1ebd..84d25079092e 100644
+ sp->refaulted * max(pv->total, 1UL) * pv->gain;
+}
+
- /******************************************************************************
- * state change
- ******************************************************************************/
-diff --git a/mm/workingset.c b/mm/workingset.c
-index edb8aed2587e..3f3f03d51ea7 100644
---- a/mm/workingset.c
-+++ b/mm/workingset.c
-@@ -201,6 +201,110 @@ static unsigned long unpack_shadow(void *shadow, int *memcg_id, struct pglist_da
- return val >> MEM_CGROUP_ID_SHIFT;
- }
-
-+#ifdef CONFIG_LRU_GEN
-+
-+#if LRU_GEN_SHIFT + LRU_USAGE_SHIFT >= EVICTION_SHIFT
-+#error "Please try smaller NODES_SHIFT, NR_LRU_GENS and TIERS_PER_GEN configurations"
-+#endif
-+
-+static void page_set_usage(struct page *page, int usage)
-+{
-+ unsigned long old_flags, new_flags;
-+
-+ VM_BUG_ON(usage > BIT(LRU_USAGE_WIDTH));
-+
-+ if (!usage)
-+ return;
-+
-+ do {
-+ old_flags = READ_ONCE(page->flags);
-+ new_flags = (old_flags & ~LRU_USAGE_MASK) | LRU_TIER_FLAGS |
-+ ((usage - 1UL) << LRU_USAGE_PGOFF);
-+ } while (new_flags != old_flags &&
-+ cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
-+}
-+
-+/* Return a token to be stored in the shadow entry of a page being evicted. */
-+static void *lru_gen_eviction(struct page *page)
-+{
-+ int hist, tier;
-+ unsigned long token;
-+ unsigned long min_seq;
-+ struct lruvec *lruvec;
-+ struct lrugen *lrugen;
-+ int type = page_is_file_lru(page);
-+ int usage = page_tier_usage(page);
-+ struct mem_cgroup *memcg = page_memcg(page);
-+ struct pglist_data *pgdat = page_pgdat(page);
-+
-+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
-+ lrugen = &lruvec->evictable;
-+ min_seq = READ_ONCE(lrugen->min_seq[type]);
-+ token = (min_seq << LRU_USAGE_SHIFT) | usage;
-+
-+ hist = hist_from_seq_or_gen(min_seq);
-+ tier = lru_tier_from_usage(usage);
-+ atomic_long_add(thp_nr_pages(page), &lrugen->evicted[hist][type][tier]);
-+
-+ return pack_shadow(mem_cgroup_id(memcg), pgdat, token);
-+}
-+
-+/* Account a refaulted page based on the token stored in its shadow entry. */
-+static void lru_gen_refault(struct page *page, void *shadow)
-+{
-+ int hist, tier, usage;
-+ int memcg_id;
-+ unsigned long token;
-+ unsigned long min_seq;
-+ struct lruvec *lruvec;
-+ struct lrugen *lrugen;
-+ struct pglist_data *pgdat;
-+ struct mem_cgroup *memcg;
-+ int type = page_is_file_lru(page);
-+
-+ token = unpack_shadow(shadow, &memcg_id, &pgdat);
-+ if (page_pgdat(page) != pgdat)
-+ return;
-+
-+ rcu_read_lock();
-+ memcg = page_memcg_rcu(page);
-+ if (mem_cgroup_id(memcg) != memcg_id)
-+ goto unlock;
-+
-+ usage = token & (BIT(LRU_USAGE_SHIFT) - 1);
-+ token >>= LRU_USAGE_SHIFT;
-+
-+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
-+ lrugen = &lruvec->evictable;
-+ min_seq = READ_ONCE(lrugen->min_seq[type]);
-+ if (token != (min_seq & (EVICTION_MASK >> LRU_USAGE_SHIFT)))
-+ goto unlock;
-+
-+ page_set_usage(page, usage);
-+
-+ hist = hist_from_seq_or_gen(min_seq);
-+ tier = lru_tier_from_usage(usage);
-+ atomic_long_add(thp_nr_pages(page), &lrugen->refaulted[hist][type][tier]);
-+ inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type);
-+ if (tier)
-+ inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type);
-+unlock:
-+ rcu_read_unlock();
-+}
-+
-+#else /* CONFIG_LRU_GEN */
-+
-+static void *lru_gen_eviction(struct page *page)
-+{
-+ return NULL;
-+}
-+
-+static void lru_gen_refault(struct page *page, void *shadow)
-+{
-+}
-+
-+#endif /* CONFIG_LRU_GEN */
-+
- /**
- * workingset_age_nonresident - age non-resident entries as LRU ages
- * @lruvec: the lruvec that was aged
-@@ -249,6 +353,9 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
- VM_BUG_ON_PAGE(page_count(page), page);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
-
-+ if (lru_gen_enabled())
-+ return lru_gen_eviction(page);
-+
- lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
- /* XXX: target_memcg can be NULL, go through lruvec */
- memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
-@@ -283,6 +390,11 @@ void workingset_refault(struct page *page, void *shadow)
- bool workingset;
- int memcgid;
-
-+ if (lru_gen_enabled()) {
-+ lru_gen_refault(page, shadow);
-+ return;
-+ }
-+
- eviction = unpack_shadow(shadow, &memcgid, &pgdat);
-
- rcu_read_lock();
-
-diff --git a/fs/exec.c b/fs/exec.c
-index 18594f11c31f..c691d4d7720c 100644
---- a/fs/exec.c
-+++ b/fs/exec.c
-@@ -1008,6 +1008,7 @@ static int exec_mmap(struct mm_struct *mm)
- active_mm = tsk->active_mm;
- tsk->active_mm = mm;
- tsk->mm = mm;
-+ lru_gen_add_mm(mm);
- /*
- * This prevents preemption while active_mm is being loaded and
- * it and mm are being updated, which could cause problems for
-@@ -1018,6 +1019,7 @@ static int exec_mmap(struct mm_struct *mm)
- if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
- local_irq_enable();
- activate_mm(active_mm, mm);
-+ lru_gen_switch_mm(active_mm, mm);
- if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
- local_irq_enable();
- tsk->mm->vmacache_seqnum = 0;
-diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
-index 6bcac3d91dd1..60601a997433 100644
---- a/include/linux/memcontrol.h
-+++ b/include/linux/memcontrol.h
-@@ -230,6 +230,8 @@ struct obj_cgroup {
- };
- };
-
-+struct lru_gen_mm_list;
-+
- /*
- * The memory controller data structure. The memory controller controls both
- * page cache and RSS per cgroup. We would eventually like to provide
-@@ -349,6 +351,10 @@ struct mem_cgroup {
- struct deferred_split deferred_split_queue;
- #endif
-
-+#ifdef CONFIG_LRU_GEN
-+ struct lru_gen_mm_list *mm_list;
-+#endif
-+
- struct mem_cgroup_per_node *nodeinfo[0];
- /* WARNING: nodeinfo must be the last member here */
- };
-diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
-index 5aacc1c10a45..b0f662555eae 100644
---- a/include/linux/mm_types.h
-+++ b/include/linux/mm_types.h
-@@ -15,6 +15,8 @@
- #include <linux/page-flags-layout.h>
- #include <linux/workqueue.h>
- #include <linux/seqlock.h>
-+#include <linux/nodemask.h>
-+#include <linux/mmdebug.h>
-
- #include <asm/mmu.h>
-
-@@ -561,6 +563,22 @@ struct mm_struct {
-
- #ifdef CONFIG_IOMMU_SUPPORT
- u32 pasid;
-+#endif
-+#ifdef CONFIG_LRU_GEN
-+ struct {
-+ /* the node of a global or per-memcg mm_struct list */
-+ struct list_head list;
-+#ifdef CONFIG_MEMCG
-+ /* points to the memcg of the owner task above */
-+ struct mem_cgroup *memcg;
-+#endif
-+ /* whether this mm_struct has been used since the last walk */
-+ nodemask_t nodes;
-+#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
-+ /* the number of CPUs using this mm_struct */
-+ atomic_t nr_cpus;
-+#endif
-+ } lrugen;
- #endif
- } __randomize_layout;
-
-@@ -588,6 +606,95 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
- return (struct cpumask *)&mm->cpu_bitmap;
- }
-
-+#ifdef CONFIG_LRU_GEN
-+
-+void lru_gen_init_mm(struct mm_struct *mm);
-+void lru_gen_add_mm(struct mm_struct *mm);
-+void lru_gen_del_mm(struct mm_struct *mm);
-+#ifdef CONFIG_MEMCG
-+int lru_gen_alloc_mm_list(struct mem_cgroup *memcg);
-+void lru_gen_free_mm_list(struct mem_cgroup *memcg);
-+void lru_gen_migrate_mm(struct mm_struct *mm);
-+#endif
-+
-+/* Track the usage of each mm_struct so that we can skip inactive ones. */
-+static inline void lru_gen_switch_mm(struct mm_struct *old, struct mm_struct *new)
-+{
-+ /* exclude init_mm, efi_mm, etc. */
-+ if (!core_kernel_data((unsigned long)old)) {
-+ VM_BUG_ON(old == &init_mm);
-+
-+ nodes_setall(old->lrugen.nodes);
-+#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
-+ atomic_dec(&old->lrugen.nr_cpus);
-+ VM_BUG_ON_MM(atomic_read(&old->lrugen.nr_cpus) < 0, old);
-+#endif
-+ } else
-+ VM_BUG_ON_MM(READ_ONCE(old->lrugen.list.prev) ||
-+ READ_ONCE(old->lrugen.list.next), old);
-+
-+ if (!core_kernel_data((unsigned long)new)) {
-+ VM_BUG_ON(new == &init_mm);
-+
-+#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
-+ atomic_inc(&new->lrugen.nr_cpus);
-+ VM_BUG_ON_MM(atomic_read(&new->lrugen.nr_cpus) < 0, new);
-+#endif
-+ } else
-+ VM_BUG_ON_MM(READ_ONCE(new->lrugen.list.prev) ||
-+ READ_ONCE(new->lrugen.list.next), new);
-+}
-+
-+/* Return whether this mm_struct is being used on any CPUs. */
-+static inline bool lru_gen_mm_is_active(struct mm_struct *mm)
-+{
-+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
-+ return !cpumask_empty(mm_cpumask(mm));
-+#else
-+ return atomic_read(&mm->lrugen.nr_cpus);
-+#endif
-+}
-+
-+#else /* CONFIG_LRU_GEN */
-+
-+static inline void lru_gen_init_mm(struct mm_struct *mm)
-+{
-+}
-+
-+static inline void lru_gen_add_mm(struct mm_struct *mm)
-+{
-+}
-+
-+static inline void lru_gen_del_mm(struct mm_struct *mm)
-+{
-+}
-+
-+#ifdef CONFIG_MEMCG
-+static inline int lru_gen_alloc_mm_list(struct mem_cgroup *memcg)
-+{
-+ return 0;
-+}
-+
-+static inline void lru_gen_free_mm_list(struct mem_cgroup *memcg)
-+{
-+}
-+
-+static inline void lru_gen_migrate_mm(struct mm_struct *mm)
-+{
-+}
-+#endif
-+
-+static inline void lru_gen_switch_mm(struct mm_struct *old, struct mm_struct *new)
-+{
-+}
-+
-+static inline bool lru_gen_mm_is_active(struct mm_struct *mm)
-+{
-+ return false;
-+}
-+
-+#endif /* CONFIG_LRU_GEN */
-+
- struct mmu_gather;
- extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
- extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
-diff --git a/kernel/exit.c b/kernel/exit.c
-index fd1c04193e18..b362179852f1 100644
---- a/kernel/exit.c
-+++ b/kernel/exit.c
-@@ -423,6 +423,7 @@ void mm_update_next_owner(struct mm_struct *mm)
- goto retry;
- }
- WRITE_ONCE(mm->owner, c);
-+ lru_gen_migrate_mm(mm);
- task_unlock(c);
- put_task_struct(c);
- }
-diff --git a/kernel/fork.c b/kernel/fork.c
-index dc06afd725cb..2fd7dae9afcb 100644
---- a/kernel/fork.c
-+++ b/kernel/fork.c
-@@ -669,6 +669,7 @@ static void check_mm(struct mm_struct *mm)
- #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
- VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
- #endif
-+ VM_BUG_ON_MM(lru_gen_mm_is_active(mm), mm);
- }
-
- #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
-@@ -1061,6 +1062,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
- goto fail_nocontext;
-
- mm->user_ns = get_user_ns(user_ns);
-+ lru_gen_init_mm(mm);
- return mm;
-
- fail_nocontext:
-@@ -1103,6 +1105,7 @@ static inline void __mmput(struct mm_struct *mm)
- }
- if (mm->binfmt)
- module_put(mm->binfmt->module);
-+ lru_gen_del_mm(mm);
- mmdrop(mm);
- }
-
-@@ -2524,6 +2527,13 @@ pid_t kernel_clone(struct kernel_clone_args *args)
- get_task_struct(p);
- }
-
-+ if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
-+ /* lock the task to synchronize with memcg migration */
-+ task_lock(p);
-+ lru_gen_add_mm(p->mm);
-+ task_unlock(p);
-+ }
-+
- wake_up_new_task(p);
-
- /* forking complete and child started to run, tell ptracer */
-diff --git a/kernel/kthread.c b/kernel/kthread.c
-index fe3f2a40d61e..b81e49ed31a7 100644
---- a/kernel/kthread.c
-+++ b/kernel/kthread.c
-@@ -1325,6 +1325,7 @@ void kthread_use_mm(struct mm_struct *mm)
- tsk->mm = mm;
- membarrier_update_current_mm(mm);
- switch_mm_irqs_off(active_mm, mm, tsk);
-+ lru_gen_switch_mm(active_mm, mm);
- local_irq_enable();
- task_unlock(tsk);
- #ifdef finish_arch_post_lock_switch
-diff --git a/kernel/sched/core.c b/kernel/sched/core.c
-index 5226cc26a095..2d4b77f173db 100644
---- a/kernel/sched/core.c
-+++ b/kernel/sched/core.c
-@@ -4323,6 +4323,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
- * finish_task_switch()'s mmdrop().
- */
- switch_mm_irqs_off(prev->active_mm, next->mm, next);
-+ lru_gen_switch_mm(prev->active_mm, next->mm);
-
- if (!prev->mm) { // from kernel
- /* will mmdrop() in finish_task_switch(). */
-@@ -7603,6 +7604,7 @@ void idle_task_exit(void)
-
- if (mm != &init_mm) {
- switch_mm(mm, &init_mm, current);
-+ lru_gen_switch_mm(mm, &init_mm);
- finish_arch_post_lock_switch();
- }
-
-diff --git a/mm/memcontrol.c b/mm/memcontrol.c
-index 64ada9e650a5..58b610ffa0e0 100644
---- a/mm/memcontrol.c
-+++ b/mm/memcontrol.c
-@@ -5214,6 +5214,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
- free_mem_cgroup_per_node_info(memcg, node);
- free_percpu(memcg->vmstats_percpu);
- free_percpu(memcg->vmstats_local);
-+ lru_gen_free_mm_list(memcg);
- kfree(memcg);
- }
-
-@@ -5266,6 +5267,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
- if (alloc_mem_cgroup_per_node_info(memcg, node))
- goto fail;
-
-+ if (lru_gen_alloc_mm_list(memcg))
-+ goto fail;
-+
- if (memcg_wb_domain_init(memcg, GFP_KERNEL))
- goto fail;
-
-@@ -5991,6 +5995,29 @@ static void mem_cgroup_move_task(void)
- }
- #endif
-
-+#ifdef CONFIG_LRU_GEN
-+static void mem_cgroup_attach(struct cgroup_taskset *tset)
-+{
-+ struct cgroup_subsys_state *css;
-+ struct task_struct *task = NULL;
-+
-+ cgroup_taskset_for_each_leader(task, css, tset)
-+ ;
-+
-+ if (!task)
-+ return;
-+
-+ task_lock(task);
-+ if (task->mm && task->mm->owner == task)
-+ lru_gen_migrate_mm(task->mm);
-+ task_unlock(task);
-+}
-+#else
-+static void mem_cgroup_attach(struct cgroup_taskset *tset)
-+{
-+}
-+#endif
-+
- static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
- {
- if (value == PAGE_COUNTER_MAX)
-@@ -6332,6 +6359,7 @@ struct cgroup_subsys memory_cgrp_subsys = {
- .css_reset = mem_cgroup_css_reset,
- .css_rstat_flush = mem_cgroup_css_rstat_flush,
- .can_attach = mem_cgroup_can_attach,
-+ .attach = mem_cgroup_attach,
- .cancel_attach = mem_cgroup_cancel_attach,
- .post_attach = mem_cgroup_move_task,
- .dfl_cftypes = memory_files,
-diff --git a/mm/vmscan.c b/mm/vmscan.c
-index 84d25079092e..d93d2272e475 100644
---- a/mm/vmscan.c
-+++ b/mm/vmscan.c
-@@ -2869,6 +2869,323 @@ static bool positive_ctrl_err(struct controller_pos *sp, struct controller_pos *
- sp->refaulted * max(pv->total, 1UL) * pv->gain;
- }
-
+/******************************************************************************
+ * mm_struct list
+ ******************************************************************************/
@@ -2512,96 +2150,6 @@ index 84d25079092e..d93d2272e475 100644
+ return last;
+}
+
- /******************************************************************************
- * state change
- ******************************************************************************/
-@@ -3096,6 +3413,13 @@ static int __init init_lru_gen(void)
- {
- BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
- BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
-+ BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1);
-+
-+ if (mem_cgroup_disabled()) {
-+ global_mm_list = alloc_mm_list();
-+ if (WARN_ON_ONCE(!global_mm_list))
-+ return -ENOMEM;
-+ }
-
- if (hotplug_memory_notifier(lru_gen_online_mem, 0))
- pr_err("lru_gen: failed to subscribe hotplug notifications\n");
-
-diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
-index e5deec17b4bd..38de59fcbe54 100644
---- a/include/linux/mmzone.h
-+++ b/include/linux/mmzone.h
-@@ -294,6 +294,7 @@ enum lruvec_flags {
- };
-
- struct lruvec;
-+struct page_vma_mapped_walk;
-
- #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
- #define LRU_USAGE_MASK ((BIT(LRU_USAGE_WIDTH) - 1) << LRU_USAGE_PGOFF)
-@@ -382,6 +383,7 @@ struct lrugen {
-
- void lru_gen_init_lruvec(struct lruvec *lruvec);
- void lru_gen_set_state(bool enable, bool main, bool swap);
-+void lru_gen_scan_around(struct page_vma_mapped_walk *pvmw);
-
- #else /* CONFIG_LRU_GEN */
-
-@@ -393,6 +395,10 @@ static inline void lru_gen_set_state(bool enable, bool main, bool swap)
- {
- }
-
-+static inline void lru_gen_scan_around(struct page_vma_mapped_walk *pvmw)
-+{
-+}
-+
- #endif /* CONFIG_LRU_GEN */
-
- struct lruvec {
-diff --git a/mm/rmap.c b/mm/rmap.c
-index 693a610e181d..985cf4ebd03c 100644
---- a/mm/rmap.c
-+++ b/mm/rmap.c
-@@ -72,6 +72,7 @@
- #include <linux/page_idle.h>
- #include <linux/memremap.h>
- #include <linux/userfaultfd_k.h>
-+#include <linux/mm_inline.h>
-
- #include <asm/tlbflush.h>
-
-@@ -792,6 +793,11 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
- }
-
- if (pvmw.pte) {
-+ /* the multigenerational lru exploits the spatial locality */
-+ if (lru_gen_enabled() && pte_young(*pvmw.pte)) {
-+ lru_gen_scan_around(&pvmw);
-+ referenced++;
-+ }
- if (ptep_clear_flush_young_notify(vma, address,
- pvmw.pte)) {
- /*
-diff --git a/mm/vmscan.c b/mm/vmscan.c
-index d93d2272e475..837d5e6a821e 100644
---- a/mm/vmscan.c
-+++ b/mm/vmscan.c
-@@ -50,6 +50,8 @@
- #include <linux/dax.h>
- #include <linux/psi.h>
- #include <linux/memory.h>
-+#include <linux/pagewalk.h>
-+#include <linux/shmem_fs.h>
-
- #include <asm/tlbflush.h>
- #include <asm/div64.h>
-@@ -3186,6 +3188,788 @@ static bool get_next_mm(struct mm_walk_args *args, struct mm_struct **iter)
- return last;
- }
-
+/******************************************************************************
+ * the aging
+ ******************************************************************************/
@@ -3384,74 +2932,6 @@ index d93d2272e475..837d5e6a821e 100644
+ set_page_dirty(pte_page(pte[i]));
+}
+
- /******************************************************************************
- * state change
- ******************************************************************************/
-@@ -3415,6 +4199,10 @@ static int __init init_lru_gen(void)
- BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
- BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1);
-
-+ VM_BUG_ON(PMD_SIZE / PAGE_SIZE != PTRS_PER_PTE);
-+ VM_BUG_ON(PUD_SIZE / PMD_SIZE != PTRS_PER_PMD);
-+ VM_BUG_ON(P4D_SIZE / PUD_SIZE != PTRS_PER_PUD);
-+
- if (mem_cgroup_disabled()) {
- global_mm_list = alloc_mm_list();
- if (WARN_ON_ONCE(!global_mm_list))
-
-diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
-index 38de59fcbe54..ded72f44d7e7 100644
---- a/include/linux/mmzone.h
-+++ b/include/linux/mmzone.h
-@@ -863,6 +863,8 @@ struct deferred_split {
- };
- #endif
-
-+struct mm_walk_args;
-+
- /*
- * On NUMA machines, each NUMA node would have a pg_data_t to describe
- * it's memory layout. On UMA machines there is a single pglist_data which
-@@ -968,6 +970,9 @@ typedef struct pglist_data {
-
- unsigned long flags;
-
-+#ifdef CONFIG_LRU_GEN
-+ struct mm_walk_args *mm_walk_args;
-+#endif
- ZONE_PADDING(_pad2_)
-
- /* Per-node vmstats */
-diff --git a/mm/vmscan.c b/mm/vmscan.c
-index 837d5e6a821e..2f86dcc04c56 100644
---- a/mm/vmscan.c
-+++ b/mm/vmscan.c
-@@ -1311,6 +1311,11 @@ static unsigned int shrink_page_list(struct list_head *page_list,
- if (!sc->may_unmap && page_mapped(page))
- goto keep_locked;
-
-+ /* in case the page was found accessed by lru_gen_scan_around() */
-+ if (lru_gen_enabled() && !ignore_references &&
-+ page_mapped(page) && PageReferenced(page))
-+ goto keep_locked;
-+
- may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
- (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
-
-@@ -2431,6 +2436,9 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
- unsigned long file;
- struct lruvec *target_lruvec;
-
-+ if (lru_gen_enabled())
-+ return;
-+
- target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
-
- /*
-@@ -3970,6 +3978,489 @@ void lru_gen_scan_around(struct page_vma_mapped_walk *pvmw)
- set_page_dirty(pte_page(pte[i]));
- }
-
+/******************************************************************************
+ * the eviction
+ ******************************************************************************/
@@ -3935,124 +3415,223 @@ index 837d5e6a821e..2f86dcc04c56 100644
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+}
+
- /******************************************************************************
- * state change
- ******************************************************************************/
-@@ -4172,6 +4663,21 @@ static int __meminit __maybe_unused lru_gen_online_mem(struct notifier_block *se
- return NOTIFY_DONE;
- }
-
-+static void lru_gen_start_kswapd(int nid)
++/******************************************************************************
++ * state change
++ ******************************************************************************/
++
++#ifdef CONFIG_LRU_GEN_ENABLED
++DEFINE_STATIC_KEY_TRUE(lru_gen_static_key);
++#else
++DEFINE_STATIC_KEY_FALSE(lru_gen_static_key);
++#endif
++
++static DEFINE_MUTEX(lru_gen_state_mutex);
++static int lru_gen_nr_swapfiles __read_mostly;
++
++static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
+{
-+ struct pglist_data *pgdat = NODE_DATA(nid);
++ int gen, type, zone;
++ enum lru_list lru;
++ struct lrugen *lrugen = &lruvec->evictable;
+
-+ pgdat->mm_walk_args = kvzalloc_node(size_of_mm_walk_args(), GFP_KERNEL, nid);
-+ WARN_ON_ONCE(!pgdat->mm_walk_args);
++ for_each_evictable_lru(lru) {
++ type = is_file_lru(lru);
++
++ if (lrugen->enabled[type] && !list_empty(&lruvec->lists[lru]))
++ return false;
++ }
++
++ for_each_gen_type_zone(gen, type, zone) {
++ if (!lrugen->enabled[type] && !list_empty(&lrugen->lists[gen][type][zone]))
++ return false;
++
++ VM_WARN_ON_ONCE(!lrugen->enabled[type] && lrugen->sizes[gen][type][zone]);
++ }
++
++ return true;
+}
+
-+static void lru_gen_stop_kswapd(int nid)
++static bool fill_lru_gen_lists(struct lruvec *lruvec)
+{
-+ struct pglist_data *pgdat = NODE_DATA(nid);
++ enum lru_list lru;
++ int batch_size = 0;
+
-+ kvfree(pgdat->mm_walk_args);
-+}
++ for_each_evictable_lru(lru) {
++ int type = is_file_lru(lru);
++ bool active = is_active_lru(lru);
++ struct list_head *head = &lruvec->lists[lru];
+
- /******************************************************************************
- * initialization
- ******************************************************************************/
-@@ -4220,6 +4726,24 @@ static int __init init_lru_gen(void)
- */
- arch_initcall(init_lru_gen);
-
-+#else /* CONFIG_LRU_GEN */
++ if (!lruvec->evictable.enabled[type])
++ continue;
+
-+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
-+{
++ while (!list_empty(head)) {
++ bool success;
++ struct page *page = lru_to_page(head);
++
++ VM_BUG_ON_PAGE(PageTail(page), page);
++ VM_BUG_ON_PAGE(PageUnevictable(page), page);
++ VM_BUG_ON_PAGE(PageActive(page) != active, page);
++ VM_BUG_ON_PAGE(page_lru_gen(page) != -1, page);
++ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
++
++ prefetchw_prev_lru_page(page, head, flags);
++
++ del_page_from_lru_list(page, lruvec);
++ success = lru_gen_addition(page, lruvec, true);
++ VM_BUG_ON(!success);
++
++ if (++batch_size == MAX_BATCH_SIZE)
++ return false;
++ }
++ }
++
++ return true;
+}
+
-+static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
++static bool drain_lru_gen_lists(struct lruvec *lruvec)
+{
++ int gen, type, zone;
++ int batch_size = 0;
++
++ for_each_gen_type_zone(gen, type, zone) {
++ struct list_head *head = &lruvec->evictable.lists[gen][type][zone];
++
++ if (lruvec->evictable.enabled[type])
++ continue;
++
++ while (!list_empty(head)) {
++ bool success;
++ struct page *page = lru_to_page(head);
++
++ VM_BUG_ON_PAGE(PageTail(page), page);
++ VM_BUG_ON_PAGE(PageUnevictable(page), page);
++ VM_BUG_ON_PAGE(PageActive(page), page);
++ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
++ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
++
++ prefetchw_prev_lru_page(page, head, flags);
++
++ success = lru_gen_deletion(page, lruvec);
++ VM_BUG_ON(!success);
++ add_page_to_lru_list(page, lruvec);
++
++ if (++batch_size == MAX_BATCH_SIZE)
++ return false;
++ }
++ }
++
++ return true;
+}
+
-+static void lru_gen_start_kswapd(int nid)
++/*
++ * For file page tracking, we enable/disable it according to the main switch.
++ * For anon page tracking, we only enable it when the main switch is on and
++ * there is at least one swapfile; we disable it when there are no swapfiles
++ * regardless of the value of the main switch. Otherwise, we will eventually
++ * reach the max size of the sliding window and have to call inc_min_seq(),
++ * which incurs unnecessary overhead.
++ */
++void lru_gen_set_state(bool enable, bool main, bool swap)
+{
++ struct mem_cgroup *memcg;
++
++ mem_hotplug_begin();
++ mutex_lock(&lru_gen_state_mutex);
++ cgroup_lock();
++
++ main = main && enable != lru_gen_enabled();
++ swap = swap && !(enable ? lru_gen_nr_swapfiles++ : --lru_gen_nr_swapfiles);
++ swap = swap && lru_gen_enabled();
++ if (!main && !swap)
++ goto unlock;
++
++ if (main) {
++ if (enable)
++ static_branch_enable(&lru_gen_static_key);
++ else
++ static_branch_disable(&lru_gen_static_key);
++ }
++
++ memcg = mem_cgroup_iter(NULL, NULL, NULL);
++ do {
++ int nid;
++
++ for_each_node_state(nid, N_MEMORY) {
++ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
++ struct lrugen *lrugen = &lruvec->evictable;
++
++ spin_lock_irq(&lruvec->lru_lock);
++
++ VM_BUG_ON(!seq_is_valid(lruvec));
++ VM_BUG_ON(!state_is_valid(lruvec));
++
++ WRITE_ONCE(lrugen->enabled[0], lru_gen_enabled() && lru_gen_nr_swapfiles);
++ WRITE_ONCE(lrugen->enabled[1], lru_gen_enabled());
++
++ while (!(enable ? fill_lru_gen_lists(lruvec) :
++ drain_lru_gen_lists(lruvec))) {
++ spin_unlock_irq(&lruvec->lru_lock);
++ cond_resched();
++ spin_lock_irq(&lruvec->lru_lock);
++ }
++
++ spin_unlock_irq(&lruvec->lru_lock);
++ }
++
++ cond_resched();
++ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
++unlock:
++ cgroup_unlock();
++ mutex_unlock(&lru_gen_state_mutex);
++ mem_hotplug_done();
+}
+
-+static void lru_gen_stop_kswapd(int nid)
++static int __meminit __maybe_unused lru_gen_online_mem(struct notifier_block *self,
++ unsigned long action, void *arg)
+{
++ struct mem_cgroup *memcg;
++ struct memory_notify *mnb = arg;
++ int nid = mnb->status_change_nid;
++
++ if (action != MEM_GOING_ONLINE || nid == NUMA_NO_NODE)
++ return NOTIFY_DONE;
++
++ mutex_lock(&lru_gen_state_mutex);
++ cgroup_lock();
++
++ memcg = mem_cgroup_iter(NULL, NULL, NULL);
++ do {
++ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
++ struct lrugen *lrugen = &lruvec->evictable;
++
++ VM_BUG_ON(!seq_is_valid(lruvec));
++ VM_BUG_ON(!state_is_valid(lruvec));
++
++ WRITE_ONCE(lrugen->enabled[0], lru_gen_enabled() && lru_gen_nr_swapfiles);
++ WRITE_ONCE(lrugen->enabled[1], lru_gen_enabled());
++ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
++
++ cgroup_unlock();
++ mutex_unlock(&lru_gen_state_mutex);
++
++ return NOTIFY_DONE;
+}
+
- #endif /* CONFIG_LRU_GEN */
-
- static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
-@@ -4233,6 +4757,11 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
- struct blk_plug plug;
- bool scan_adjusted;
-
-+ if (lru_gen_enabled()) {
-+ lru_gen_shrink_lruvec(lruvec, sc);
-+ return;
-+ }
++static void lru_gen_start_kswapd(int nid)
++{
++ struct pglist_data *pgdat = NODE_DATA(nid);
+
- get_scan_count(lruvec, sc, nr);
-
- /* Record the original scan target for proportional adjustments later */
-@@ -4699,6 +5228,9 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
- struct lruvec *target_lruvec;
- unsigned long refaults;
-
-+ if (lru_gen_enabled())
-+ return;
++ pgdat->mm_walk_args = kvzalloc_node(size_of_mm_walk_args(), GFP_KERNEL, nid);
++ WARN_ON_ONCE(!pgdat->mm_walk_args);
++}
+
- target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
- refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
- target_lruvec->refaults[0] = refaults;
-@@ -5073,6 +5605,11 @@ static void age_active_anon(struct pglist_data *pgdat,
- struct mem_cgroup *memcg;
- struct lruvec *lruvec;
-
-+ if (lru_gen_enabled()) {
-+ lru_gen_age_node(pgdat, sc);
-+ return;
-+ }
++static void lru_gen_stop_kswapd(int nid)
++{
++ struct pglist_data *pgdat = NODE_DATA(nid);
+
- if (!total_swap_pages)
- return;
-
-@@ -5753,6 +6290,8 @@ int kswapd_run(int nid)
- if (pgdat->kswapd)
- return 0;
-
-+ lru_gen_start_kswapd(nid);
++ kvfree(pgdat->mm_walk_args);
++}
+
- pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
- if (IS_ERR(pgdat->kswapd)) {
- /* failure at boot is fatal */
-@@ -5775,6 +6314,7 @@ void kswapd_stop(int nid)
- if (kswapd) {
- kthread_stop(kswapd);
- NODE_DATA(nid)->kswapd = NULL;
-+ lru_gen_stop_kswapd(nid);
- }
- }
-
-diff --git a/mm/vmscan.c b/mm/vmscan.c
-index 2f86dcc04c56..ff2deec24c64 100644
---- a/mm/vmscan.c
-+++ b/mm/vmscan.c
-@@ -52,6 +52,8 @@
- #include <linux/memory.h>
- #include <linux/pagewalk.h>
- #include <linux/shmem_fs.h>
-+#include <linux/ctype.h>
-+#include <linux/debugfs.h>
-
- #include <asm/tlbflush.h>
- #include <asm/div64.h>
-@@ -4678,6 +4680,401 @@ static void lru_gen_stop_kswapd(int nid)
- kvfree(pgdat->mm_walk_args);
- }
-
+/******************************************************************************
+ * sysfs interface
+ ******************************************************************************/
@@ -4448,250 +4027,473 @@ index 2f86dcc04c56..ff2deec24c64 100644
+ .release = seq_release,
+};
+
- /******************************************************************************
- * initialization
- ******************************************************************************/
-@@ -4718,6 +5115,12 @@ static int __init init_lru_gen(void)
- if (hotplug_memory_notifier(lru_gen_online_mem, 0))
- pr_err("lru_gen: failed to subscribe hotplug notifications\n");
-
++/******************************************************************************
++ * initialization
++ ******************************************************************************/
++
++void lru_gen_init_lruvec(struct lruvec *lruvec)
++{
++ int i;
++ int gen, type, zone;
++ struct lrugen *lrugen = &lruvec->evictable;
++
++ lrugen->max_seq = MIN_NR_GENS + 1;
++ lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles;
++ lrugen->enabled[1] = lru_gen_enabled();
++
++ for (i = 0; i <= MIN_NR_GENS + 1; i++)
++ lrugen->timestamps[i] = jiffies;
++
++ for_each_gen_type_zone(gen, type, zone)
++ INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
++}
++
++static int __init init_lru_gen(void)
++{
++ BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
++ BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
++ BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1);
++
++ VM_BUG_ON(PMD_SIZE / PAGE_SIZE != PTRS_PER_PTE);
++ VM_BUG_ON(PUD_SIZE / PMD_SIZE != PTRS_PER_PMD);
++ VM_BUG_ON(P4D_SIZE / PUD_SIZE != PTRS_PER_PUD);
++
++ if (mem_cgroup_disabled()) {
++ global_mm_list = alloc_mm_list();
++ if (WARN_ON_ONCE(!global_mm_list))
++ return -ENOMEM;
++ }
++
++ if (hotplug_memory_notifier(lru_gen_online_mem, 0))
++ pr_err("lru_gen: failed to subscribe hotplug notifications\n");
++
+ if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
+ pr_err("lru_gen: failed to create sysfs group\n");
+
+ debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
+ debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);
+
- return 0;
- };
- /*
-
-diff --git a/mm/Kconfig b/mm/Kconfig
-index 24c045b24b95..e82e6b92820c 100644
---- a/mm/Kconfig
-+++ b/mm/Kconfig
-@@ -872,4 +872,61 @@ config MAPPING_DIRTY_HELPERS
- config KMAP_LOCAL
- bool
-
-+# the multigenerational lru {
-+config LRU_GEN
-+ bool "Multigenerational LRU"
-+ depends on MMU
-+ help
-+ A high performance LRU implementation to heavily overcommit workloads
-+ that are not IO bound. See Documentation/vm/multigen_lru.rst for
-+ details.
-+
-+ Warning: do not enable this option unless you plan to use it because
-+ it introduces a small per-process and per-memcg and per-node memory
-+ overhead.
-+
-+config LRU_GEN_ENABLED
-+ bool "Turn on by default"
-+ depends on LRU_GEN
-+ help
-+ The default value of /sys/kernel/mm/lru_gen/enabled is 0. This option
-+ changes it to 1.
++ return 0;
++};
++/*
++ * We want to run as early as possible because debug code may call mm_alloc()
++ * and mmput(). Our only dependency, mm_kobj, is initialized one stage earlier.
++ */
++arch_initcall(init_lru_gen);
+
-+ Warning: the default value is the fast path. See
-+ Documentation/static-keys.txt for details.
++#else /* CONFIG_LRU_GEN */
+
-+config LRU_GEN_STATS
-+ bool "Full stats for debugging"
-+ depends on LRU_GEN
-+ help
-+ This option keeps full stats for each generation, which can be read
-+ from /sys/kernel/debug/lru_gen_full.
++static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
++{
++}
+
-+ Warning: do not enable this option unless you plan to use it because
-+ it introduces an additional small per-process and per-memcg and
-+ per-node memory overhead.
++static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
++{
++}
+
-+config NR_LRU_GENS
-+ int "Max number of generations"
-+ depends on LRU_GEN
-+ range 4 31
-+ default 7
-+ help
-+ This will use order_base_2(N+1) spare bits from page flags.
++static void lru_gen_start_kswapd(int nid)
++{
++}
+
-+ Warning: do not use numbers larger than necessary because each
-+ generation introduces a small per-node and per-memcg memory overhead.
++static void lru_gen_stop_kswapd(int nid)
++{
++}
+
-+config TIERS_PER_GEN
-+ int "Number of tiers per generation"
-+ depends on LRU_GEN
-+ range 2 5
-+ default 4
-+ help
-+ This will use N-2 spare bits from page flags.
++#endif /* CONFIG_LRU_GEN */
+
-+ Larger values generally offer better protection to active pages under
-+ heavy buffered I/O workloads.
-+# }
+ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+ {
+ unsigned long nr[NR_LRU_LISTS];
+@@ -2629,6 +5160,11 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+ struct blk_plug plug;
+ bool scan_adjusted;
+
++ if (lru_gen_enabled()) {
++ lru_gen_shrink_lruvec(lruvec, sc);
++ return;
++ }
+
- endmenu
-
-
-diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst
-index eff5fbd492d0..c353b3f55924 100644
---- a/Documentation/vm/index.rst
-+++ b/Documentation/vm/index.rst
-@@ -17,6 +17,7 @@ various features of the Linux memory management
+ get_scan_count(lruvec, sc, nr);
- swap_numa
- zswap
-+ multigen_lru
+ /* Record the original scan target for proportional adjustments later */
+@@ -2866,7 +5402,6 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
+ unsigned long nr_reclaimed, nr_scanned;
+ struct lruvec *target_lruvec;
+ bool reclaimable = false;
+- unsigned long file;
- Kernel developers MM documentation
- ==================================
-diff --git a/Documentation/vm/multigen_lru.rst b/Documentation/vm/multigen_lru.rst
-new file mode 100644
-index 000000000000..a18416ed7e92
---- /dev/null
-+++ b/Documentation/vm/multigen_lru.rst
-@@ -0,0 +1,143 @@
-+.. SPDX-License-Identifier: GPL-2.0
-+
-+=====================
-+Multigenerational LRU
-+=====================
+ target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
+
+@@ -2876,93 +5411,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
+ nr_reclaimed = sc->nr_reclaimed;
+ nr_scanned = sc->nr_scanned;
+
+- /*
+- * Determine the scan balance between anon and file LRUs.
+- */
+- spin_lock_irq(&target_lruvec->lru_lock);
+- sc->anon_cost = target_lruvec->anon_cost;
+- sc->file_cost = target_lruvec->file_cost;
+- spin_unlock_irq(&target_lruvec->lru_lock);
+-
+- /*
+- * Target desirable inactive:active list ratios for the anon
+- * and file LRU lists.
+- */
+- if (!sc->force_deactivate) {
+- unsigned long refaults;
+-
+- refaults = lruvec_page_state(target_lruvec,
+- WORKINGSET_ACTIVATE_ANON);
+- if (refaults != target_lruvec->refaults[0] ||
+- inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
+- sc->may_deactivate |= DEACTIVATE_ANON;
+- else
+- sc->may_deactivate &= ~DEACTIVATE_ANON;
+-
+- /*
+- * When refaults are being observed, it means a new
+- * workingset is being established. Deactivate to get
+- * rid of any stale active pages quickly.
+- */
+- refaults = lruvec_page_state(target_lruvec,
+- WORKINGSET_ACTIVATE_FILE);
+- if (refaults != target_lruvec->refaults[1] ||
+- inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
+- sc->may_deactivate |= DEACTIVATE_FILE;
+- else
+- sc->may_deactivate &= ~DEACTIVATE_FILE;
+- } else
+- sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
+-
+- /*
+- * If we have plenty of inactive file pages that aren't
+- * thrashing, try to reclaim those first before touching
+- * anonymous pages.
+- */
+- file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
+- if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
+- sc->cache_trim_mode = 1;
+- else
+- sc->cache_trim_mode = 0;
+-
+- /*
+- * Prevent the reclaimer from falling into the cache trap: as
+- * cache pages start out inactive, every cache fault will tip
+- * the scan balance towards the file LRU. And as the file LRU
+- * shrinks, so does the window for rotation from references.
+- * This means we have a runaway feedback loop where a tiny
+- * thrashing file LRU becomes infinitely more attractive than
+- * anon pages. Try to detect this based on file LRU size.
+- */
+- if (!cgroup_reclaim(sc)) {
+- unsigned long total_high_wmark = 0;
+- unsigned long free, anon;
+- int z;
+-
+- free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
+- file = node_page_state(pgdat, NR_ACTIVE_FILE) +
+- node_page_state(pgdat, NR_INACTIVE_FILE);
+-
+- for (z = 0; z < MAX_NR_ZONES; z++) {
+- struct zone *zone = &pgdat->node_zones[z];
+- if (!managed_zone(zone))
+- continue;
+-
+- total_high_wmark += high_wmark_pages(zone);
+- }
+-
+- /*
+- * Consider anon: if that's low too, this isn't a
+- * runaway file reclaim problem, but rather just
+- * extreme pressure. Reclaim as per usual then.
+- */
+- anon = node_page_state(pgdat, NR_INACTIVE_ANON);
+-
+- sc->file_is_tiny =
+- file + free <= total_high_wmark &&
+- !(sc->may_deactivate & DEACTIVATE_ANON) &&
+- anon >> sc->priority;
+- }
++ prepare_scan_count(pgdat, sc);
+
+ shrink_node_memcgs(pgdat, sc);
+
+@@ -3182,6 +5631,9 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
+ struct lruvec *target_lruvec;
+ unsigned long refaults;
+
++ if (lru_gen_enabled())
++ return;
+
-+Quick Start
-+===========
-+Build Options
-+-------------
-+:Required: Set ``CONFIG_LRU_GEN=y``.
+ target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
+ refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
+ target_lruvec->refaults[0] = refaults;
+@@ -3556,6 +6008,11 @@ static void age_active_anon(struct pglist_data *pgdat,
+ struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
+
++ if (lru_gen_enabled()) {
++ lru_gen_age_node(pgdat, sc);
++ return;
++ }
+
-+:Optional: Set ``CONFIG_LRU_GEN_ENABLED=y`` to turn the feature on by
-+ default.
+ if (!total_swap_pages)
+ return;
+
+@@ -4236,6 +6693,8 @@ int kswapd_run(int nid)
+ if (pgdat->kswapd)
+ return 0;
+
++ lru_gen_start_kswapd(nid);
+
-+:Optional: Change ``CONFIG_NR_LRU_GENS`` to a number ``X`` to support
-+ a maximum of ``X`` generations.
+ pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
+ if (IS_ERR(pgdat->kswapd)) {
+ /* failure at boot is fatal */
+@@ -4258,6 +6717,7 @@ void kswapd_stop(int nid)
+ if (kswapd) {
+ kthread_stop(kswapd);
+ NODE_DATA(nid)->kswapd = NULL;
++ lru_gen_stop_kswapd(nid);
+ }
+ }
+
+diff --git a/mm/workingset.c b/mm/workingset.c
+index b7cdeca5a76d..3f3f03d51ea7 100644
+--- a/mm/workingset.c
++++ b/mm/workingset.c
+@@ -168,9 +168,9 @@
+ * refault distance will immediately activate the refaulting page.
+ */
+
+-#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \
+- 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT)
+-#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
++#define EVICTION_SHIFT (BITS_PER_XA_VALUE - MEM_CGROUP_ID_SHIFT - NODES_SHIFT)
++#define EVICTION_MASK (BIT(EVICTION_SHIFT) - 1)
++#define WORKINGSET_WIDTH 1
+
+ /*
+ * Eviction timestamps need to be able to cover the full range of
+@@ -182,38 +182,129 @@
+ */
+ static unsigned int bucket_order __read_mostly;
+
+-static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
+- bool workingset)
++static void *pack_shadow(int memcg_id, struct pglist_data *pgdat, unsigned long val)
+ {
+- eviction >>= bucket_order;
+- eviction &= EVICTION_MASK;
+- eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
+- eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
+- eviction = (eviction << 1) | workingset;
++ val = (val << MEM_CGROUP_ID_SHIFT) | memcg_id;
++ val = (val << NODES_SHIFT) | pgdat->node_id;
+
+- return xa_mk_value(eviction);
++ return xa_mk_value(val);
+ }
+
+-static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
+- unsigned long *evictionp, bool *workingsetp)
++static unsigned long unpack_shadow(void *shadow, int *memcg_id, struct pglist_data **pgdat)
+ {
+- unsigned long entry = xa_to_value(shadow);
+- int memcgid, nid;
+- bool workingset;
++ unsigned long val = xa_to_value(shadow);
+
+- workingset = entry & 1;
+- entry >>= 1;
+- nid = entry & ((1UL << NODES_SHIFT) - 1);
+- entry >>= NODES_SHIFT;
+- memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
+- entry >>= MEM_CGROUP_ID_SHIFT;
++ *pgdat = NODE_DATA(val & (BIT(NODES_SHIFT) - 1));
++ val >>= NODES_SHIFT;
++ *memcg_id = val & (BIT(MEM_CGROUP_ID_SHIFT) - 1);
+
+- *memcgidp = memcgid;
+- *pgdat = NODE_DATA(nid);
+- *evictionp = entry << bucket_order;
+- *workingsetp = workingset;
++ return val >> MEM_CGROUP_ID_SHIFT;
+ }
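
The reworked ``pack_shadow()``/``unpack_shadow()`` above only fold the memcg id and the node id into the xarray value and hand the remaining bits back to the caller. A minimal userspace sketch of that bit layout, using made-up shift widths in place of the kernel's ``MEM_CGROUP_ID_SHIFT`` and ``NODES_SHIFT``::

    #include <assert.h>
    #include <stdio.h>

    /* Illustrative widths only; the kernel derives the real values from
     * MEM_CGROUP_ID_SHIFT and NODES_SHIFT. */
    #define MEMCG_ID_SHIFT 16
    #define NODES_SHIFT    4

    static unsigned long pack(int memcg_id, int nid, unsigned long val)
    {
            val = (val << MEMCG_ID_SHIFT) | memcg_id;
            val = (val << NODES_SHIFT) | nid;
            return val;
    }

    static unsigned long unpack(unsigned long val, int *memcg_id, int *nid)
    {
            *nid = val & ((1UL << NODES_SHIFT) - 1);
            val >>= NODES_SHIFT;
            *memcg_id = val & ((1UL << MEMCG_ID_SHIFT) - 1);
            return val >> MEMCG_ID_SHIFT;
    }

    int main(void)
    {
            int memcg_id, nid;
            unsigned long token = unpack(pack(42, 3, 0x123), &memcg_id, &nid);

            assert(memcg_id == 42 && nid == 3 && token == 0x123);
            printf("memcg=%d node=%d token=%#lx\n", memcg_id, nid, token);
            return 0;
    }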
+
++#ifdef CONFIG_LRU_GEN
+
-+:Optional: Change ``CONFIG_TIERS_PER_GEN`` to a number ``Y`` to
-+ support a maximum of ``Y`` tiers per generation.
++#if LRU_GEN_SHIFT + LRU_USAGE_SHIFT >= EVICTION_SHIFT
++#error "Please try smaller NODES_SHIFT, NR_LRU_GENS and TIERS_PER_GEN configurations"
++#endif
+
-+Runtime Options
-+---------------
-+:Required: Write ``1`` to ``/sys/kernel/mm/lru_gen/enable`` if the
-+ feature was not turned on by default.
++static void page_set_usage(struct page *page, int usage)
++{
++ unsigned long old_flags, new_flags;
+
-+:Optional: Change ``/sys/kernel/mm/lru_gen/spread`` to a number ``N``
-+ to spread pages out across ``N+1`` generations. ``N`` should be less
-+ than ``X``. Larger values make the background aging more aggressive.
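
The two runtime switches above can also be flipped from a program. A minimal sketch, assuming only the sysfs paths documented in this section and a kernel built with ``CONFIG_LRU_GEN=y``::

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    static int write_str(const char *path, const char *val)
    {
            int fd = open(path, O_WRONLY);

            if (fd < 0) {
                    perror(path);
                    return -1;
            }
            if (write(fd, val, strlen(val)) < 0)
                    perror(path);
            return close(fd);
    }

    int main(void)
    {
            /* Turn the feature on; keep the default spread of 0. */
            write_str("/sys/kernel/mm/lru_gen/enable", "1");
            write_str("/sys/kernel/mm/lru_gen/spread", "0");
            return 0;
    }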
++ VM_BUG_ON(usage > BIT(LRU_USAGE_WIDTH));
+
-+:Optional: Read ``/sys/kernel/debug/lru_gen`` to verify the feature.
-+ This file has the following output:
++ if (!usage)
++ return;
+
-+::
++ do {
++ old_flags = READ_ONCE(page->flags);
++ new_flags = (old_flags & ~LRU_USAGE_MASK) | LRU_TIER_FLAGS |
++ ((usage - 1UL) << LRU_USAGE_PGOFF);
++ } while (new_flags != old_flags &&
++ cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
++}
+
-+ memcg memcg_id memcg_path
-+ node node_id
-+ min_gen birth_time anon_size file_size
-+ ...
-+ max_gen birth_time anon_size file_size
++/* Return a token to be stored in the shadow entry of a page being evicted. */
++static void *lru_gen_eviction(struct page *page)
++{
++ int hist, tier;
++ unsigned long token;
++ unsigned long min_seq;
++ struct lruvec *lruvec;
++ struct lrugen *lrugen;
++ int type = page_is_file_lru(page);
++ int usage = page_tier_usage(page);
++ struct mem_cgroup *memcg = page_memcg(page);
++ struct pglist_data *pgdat = page_pgdat(page);
+
-+Given a memcg and a node, ``min_gen`` is the oldest generation
-+(number) and ``max_gen`` is the youngest. Birth time is in
-+milliseconds. The sizes of anon and file types are in pages.
++ lruvec = mem_cgroup_lruvec(memcg, pgdat);
++ lrugen = &lruvec->evictable;
++ min_seq = READ_ONCE(lrugen->min_seq[type]);
++ token = (min_seq << LRU_USAGE_SHIFT) | usage;
+
-+Recipes
-+-------
-+:Android on ARMv8.1+: ``X=4``, ``Y=3`` and ``N=0``.
++ hist = hist_from_seq_or_gen(min_seq);
++ tier = lru_tier_from_usage(usage);
++ atomic_long_add(thp_nr_pages(page), &lrugen->evicted[hist][type][tier]);
+
-+:Android on pre-ARMv8.1 CPUs: Not recommended due to the lack of
-+ ``ARM64_HW_AFDBM``.
++ return pack_shadow(mem_cgroup_id(memcg), pgdat, token);
++}
+
-+:Laptops and workstations running Chrome on x86_64: Use the default
-+ values.
++/* Account a refaulted page based on the token stored in its shadow entry. */
++static void lru_gen_refault(struct page *page, void *shadow)
++{
++ int hist, tier, usage;
++ int memcg_id;
++ unsigned long token;
++ unsigned long min_seq;
++ struct lruvec *lruvec;
++ struct lrugen *lrugen;
++ struct pglist_data *pgdat;
++ struct mem_cgroup *memcg;
++ int type = page_is_file_lru(page);
+
-+:Working set estimation: Write ``+ memcg_id node_id gen [swappiness]``
-+ to ``/sys/kernel/debug/lru_gen`` to account referenced pages to
-+ generation ``max_gen`` and create the next generation ``max_gen+1``.
-+ ``gen`` should be equal to ``max_gen``. A swap file and a non-zero
-+ ``swappiness`` are required to scan anon type. If swapping is not
-+ desired, set ``vm.swappiness`` to ``0``.
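
A minimal sketch of the aging command described in this recipe, with placeholder ids; in practice ``memcg_id``, ``node_id`` and ``max_gen`` have to be read from ``/sys/kernel/debug/lru_gen`` first::

    #include <stdio.h>

    int main(void)
    {
            /* Placeholder values: read the real memcg_id, node_id and
             * max_gen from /sys/kernel/debug/lru_gen before writing. */
            int memcg_id = 1, node_id = 0, max_gen = 3, swappiness = 0;
            FILE *f = fopen("/sys/kernel/debug/lru_gen", "w");

            if (!f) {
                    perror("lru_gen");
                    return 1;
            }
            /* "+ memcg_id node_id gen [swappiness]" ages generation max_gen. */
            fprintf(f, "+ %d %d %d %d\n", memcg_id, node_id, max_gen, swappiness);
            return fclose(f) ? 1 : 0;
    }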
++ token = unpack_shadow(shadow, &memcg_id, &pgdat);
++ if (page_pgdat(page) != pgdat)
++ return;
+
-+:Proactive reclaim: Write ``- memcg_id node_id gen [swappiness]
-+ [nr_to_reclaim]`` to ``/sys/kernel/debug/lru_gen`` to evict
-+ generations less than or equal to ``gen``. ``gen`` should be less
-+ than ``max_gen-1`` as ``max_gen`` and ``max_gen-1`` are active
-+ generations and therefore protected from the eviction. Use
-+ ``nr_to_reclaim`` to limit the number of pages to evict. Multiple
-+ command lines are supported, as is concatenation with the delimiters
-+ ``,`` and ``;``.
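
The eviction command can be driven the same way. The sketch below uses placeholder values and concatenates two command lines with ``;`` as described above::

    #include <stdio.h>

    int main(void)
    {
            /* Placeholder ids; gen must be less than max_gen-1 as
             * explained above. */
            int memcg_id = 1, node_id = 0, gen = 1;
            unsigned long nr_to_reclaim = 1024;
            FILE *f = fopen("/sys/kernel/debug/lru_gen", "w");

            if (!f) {
                    perror("lru_gen");
                    return 1;
            }
            /* Two "- memcg_id node_id gen swappiness nr_to_reclaim"
             * commands, concatenated with ';'. */
            fprintf(f, "- %d %d %d 0 %lu; - %d %d %d 0 %lu\n",
                    memcg_id, node_id, gen, nr_to_reclaim,
                    memcg_id + 1, node_id, gen, nr_to_reclaim);
            return fclose(f) ? 1 : 0;
    }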
++ rcu_read_lock();
++ memcg = page_memcg_rcu(page);
++ if (mem_cgroup_id(memcg) != memcg_id)
++ goto unlock;
+
-+Framework
-+=========
-+For each ``lruvec``, evictable pages are divided into multiple
-+generations. The youngest generation number is stored in ``max_seq``
-+for both anon and file types as they are aged on an equal footing. The
-+oldest generation numbers are stored in ``min_seq[2]`` separately for
-+anon and file types as clean file pages can be evicted regardless of
-+swap and write-back constraints. These three variables are
-+monotonically increasing. Generation numbers are truncated into
-+``order_base_2(CONFIG_NR_LRU_GENS+1)`` bits in order to fit into
-+``page->flags``. The sliding window technique is used to prevent
-+truncated generation numbers from overlapping. Each truncated
-+generation number is an index to an array of per-type and per-zone
-+lists. Evictable pages are added to the per-zone lists indexed by
-+``max_seq`` or ``min_seq[2]`` (modulo ``CONFIG_NR_LRU_GENS``),
-+depending on their types.
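
The sliding-window arithmetic can be pictured with a tiny standalone sketch; ``NR_GENS`` below is a stand-in for ``CONFIG_NR_LRU_GENS``, not the kernel's actual macro::

    #include <assert.h>

    #define NR_GENS 4       /* stand-in for CONFIG_NR_LRU_GENS */

    /* Truncated generation number, i.e. the list index derived from a
     * monotonically increasing sequence number. */
    static unsigned int gen_from_seq(unsigned long seq)
    {
            return seq % NR_GENS;
    }

    int main(void)
    {
            unsigned long max_seq = 7, min_seq = 5;

            /* Live generations span fewer than NR_GENS sequence numbers,
             * so their truncated values never collide (sliding window). */
            assert(max_seq - min_seq < NR_GENS);
            assert(gen_from_seq(max_seq) != gen_from_seq(min_seq));
            return 0;
    }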
++ usage = token & (BIT(LRU_USAGE_SHIFT) - 1);
++ token >>= LRU_USAGE_SHIFT;
+
-+Each generation is then divided into multiple tiers. Tiers represent
-+levels of usage from file descriptors only. Pages accessed ``N`` times
-+via file descriptors belong to tier ``order_base_2(N)``. Each generation
-+contains at most ``CONFIG_TIERS_PER_GEN`` tiers, and they require
-+additional ``CONFIG_TIERS_PER_GEN-2`` bits in ``page->flags``. In contrast
-+to moving across generations, which requires the lru lock for the list
-+operations, moving across tiers only involves an atomic operation on
-+``page->flags`` and therefore has a negligible cost. A feedback loop
-+modeled after the PID controller monitors the refault rates across all
-+tiers and decides when to activate pages from which tiers in the
-+reclaim path.
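
The mapping from access counts to tiers can likewise be sketched in userspace; ``order_base_2()`` is reimplemented here for illustration and ``MAX_TIERS`` stands in for ``CONFIG_TIERS_PER_GEN``::

    #include <assert.h>

    #define MAX_TIERS 4     /* stand-in for CONFIG_TIERS_PER_GEN */

    /* Smallest k with 2^k >= n, i.e. order_base_2(n), for n >= 1. */
    static int order_base_2(unsigned int n)
    {
            int k = 0;

            while ((1U << k) < n)
                    k++;
            return k;
    }

    /* A page accessed n times via file descriptors belongs to this tier;
     * the clamp to MAX_TIERS - 1 is purely illustrative. */
    static int tier_from_refs(unsigned int n)
    {
            int tier = order_base_2(n);

            return tier < MAX_TIERS ? tier : MAX_TIERS - 1;
    }

    int main(void)
    {
            assert(tier_from_refs(1) == 0);
            assert(tier_from_refs(2) == 1);
            assert(tier_from_refs(3) == 2 && tier_from_refs(4) == 2);
            return 0;
    }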
++ lruvec = mem_cgroup_lruvec(memcg, pgdat);
++ lrugen = &lruvec->evictable;
++ min_seq = READ_ONCE(lrugen->min_seq[type]);
++ if (token != (min_seq & (EVICTION_MASK >> LRU_USAGE_SHIFT)))
++ goto unlock;
+
-+The framework comprises two conceptually independent components: the
-+aging and the eviction, which can be invoked separately from user
-+space for the purpose of working set estimation and proactive reclaim.
++ page_set_usage(page, usage);
+
-+Aging
-+-----
-+The aging produces young generations. Given an ``lruvec``, the aging
-+scans page tables for referenced pages of this ``lruvec``. Upon
-+finding one, the aging updates its generation number to ``max_seq``.
-+After each round of scan, the aging increments ``max_seq``.
++ hist = hist_from_seq_or_gen(min_seq);
++ tier = lru_tier_from_usage(usage);
++ atomic_long_add(thp_nr_pages(page), &lrugen->refaulted[hist][type][tier]);
++ inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type);
++ if (tier)
++ inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type);
++unlock:
++ rcu_read_unlock();
++}
+
-+The aging maintains either a system-wide ``mm_struct`` list or
-+per-memcg ``mm_struct`` lists, and it only scans page tables of
-+processes that have been scheduled since the last scan.
++#else /* CONFIG_LRU_GEN */
+
-+The aging is due when both elements of ``min_seq[2]`` reach ``max_seq-1``,
-+assuming both anon and file types are reclaimable.
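
Very roughly, the aging described in this section boils down to the standalone sketch below; the types and the promotion loop are hypothetical reading aids, not the code in ``mm/vmscan.c``::

    #include <stdbool.h>
    #include <stdio.h>

    #define NR_PAGES 8

    struct page_sketch { bool referenced; unsigned long gen; };

    struct lruvec_sketch {
            unsigned long max_seq;
            struct page_sketch pages[NR_PAGES];
    };

    /* One aging pass: move referenced pages to the youngest generation,
     * then increment max_seq to open a new one. */
    static void age_lruvec(struct lruvec_sketch *lruvec)
    {
            for (int i = 0; i < NR_PAGES; i++) {
                    if (lruvec->pages[i].referenced) {
                            lruvec->pages[i].gen = lruvec->max_seq;
                            lruvec->pages[i].referenced = false;
                    }
            }
            lruvec->max_seq++;
    }

    int main(void)
    {
            struct lruvec_sketch lruvec = { .max_seq = 2 };

            lruvec.pages[0].referenced = true;
            age_lruvec(&lruvec);
            printf("page 0 is now in generation %lu, max_seq is %lu\n",
                   lruvec.pages[0].gen, lruvec.max_seq);
            return 0;
    }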
++static void *lru_gen_eviction(struct page *page)
++{
++ return NULL;
++}
+
-+Eviction
-+--------
-+The eviction consumes old generations. Given an ``lruvec``, the
-+eviction scans the pages on the per-zone lists indexed by either of
-+``min_seq[2]``. It first tries to select a type based on the values of
-+``min_seq[2]``. When anon and file types are both available from the
-+same generation, it selects the one that has a lower refault rate.
++static void lru_gen_refault(struct page *page, void *shadow)
++{
++}
+
-+During a scan, the eviction sorts pages according to their new
-+generation numbers, if the aging has found them referenced. It also
-+moves pages from the tiers that have higher refault rates than tier 0
-+to the next generation.
++#endif /* CONFIG_LRU_GEN */
+
-+When it finds all the per-zone lists of a selected type are empty, the
-+eviction increments ``min_seq[2]`` indexed by this selected type.
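
The type-selection rule of the eviction can be sketched the same way; again the helper below is hypothetical and only mirrors the prose::

    #include <stdio.h>

    enum { LRU_ANON, LRU_FILE };

    /* Pick the type to evict from: the older min_seq wins; when both
     * types share the same oldest generation, the lower refault rate
     * wins. */
    static int pick_type(const unsigned long min_seq[2],
                         const double refault_rate[2])
    {
            if (min_seq[LRU_ANON] != min_seq[LRU_FILE])
                    return min_seq[LRU_ANON] < min_seq[LRU_FILE] ?
                           LRU_ANON : LRU_FILE;
            return refault_rate[LRU_ANON] <= refault_rate[LRU_FILE] ?
                   LRU_ANON : LRU_FILE;
    }

    int main(void)
    {
            unsigned long min_seq[2] = { 4, 4 };
            double refault_rate[2] = { 0.10, 0.02 };

            printf("evict %s pages first\n",
                   pick_type(min_seq, refault_rate) == LRU_FILE ?
                   "file" : "anon");
            return 0;
    }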
+ /**
+ * workingset_age_nonresident - age non-resident entries as LRU ages
+ * @lruvec: the lruvec that was aged
+@@ -262,12 +353,17 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
+ VM_BUG_ON_PAGE(page_count(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+
++ if (lru_gen_enabled())
++ return lru_gen_eviction(page);
+
-+To-do List
-+==========
-+KVM Optimization
-+----------------
-+Support shadow page table scanning.
+ lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
+ /* XXX: target_memcg can be NULL, go through lruvec */
+ memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
+ eviction = atomic_long_read(&lruvec->nonresident_age);
++ eviction >>= bucket_order;
++ eviction = (eviction << WORKINGSET_WIDTH) | PageWorkingset(page);
+ workingset_age_nonresident(lruvec, thp_nr_pages(page));
+- return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
++ return pack_shadow(memcgid, pgdat, eviction);
+ }
+
+ /**
+@@ -294,7 +390,12 @@ void workingset_refault(struct page *page, void *shadow)
+ bool workingset;
+ int memcgid;
+
+- unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
++ if (lru_gen_enabled()) {
++ lru_gen_refault(page, shadow);
++ return;
++ }
+
-+NUMA Optimization
-+-----------------
-+Optimize page table scan for NUMA.
--- \ No newline at end of file
++ eviction = unpack_shadow(shadow, &memcgid, &pgdat);
+
+ rcu_read_lock();
+ /*
+@@ -318,6 +419,8 @@ void workingset_refault(struct page *page, void *shadow)
+ goto out;
+ eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
+ refault = atomic_long_read(&eviction_lruvec->nonresident_age);
++ workingset = eviction & (BIT(WORKINGSET_WIDTH) - 1);
++ eviction = (eviction >> WORKINGSET_WIDTH) << bucket_order;
+
+ /*
+ * Calculate the refault distance
+@@ -335,7 +438,7 @@ void workingset_refault(struct page *page, void *shadow)
+ * longest time, so the occasional inappropriate activation
+ * leading to pressure on the active list is not a problem.
+ */
+- refault_distance = (refault - eviction) & EVICTION_MASK;
++ refault_distance = (refault - eviction) & (EVICTION_MASK >> WORKINGSET_WIDTH);
+
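
The masked subtraction above stays correct when the ``nonresident_age`` counter wraps, because the distance is taken modulo the eviction space. A small standalone illustration with an arbitrary 8-bit mask instead of the kernel's ``EVICTION_MASK``::

    #include <assert.h>

    int main(void)
    {
            unsigned long mask = 0xff;      /* stand-in for the real mask */
            unsigned long eviction = 0xfe;  /* counter just before it wraps */
            unsigned long refault = 0x03;   /* counter after wrapping around */

            /* Five non-resident aging events happened in between. */
            assert(((refault - eviction) & mask) == 5);
            return 0;
    }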
+ /*
+ * The activation decision for this page is made at the level
+@@ -593,7 +696,7 @@ static int __init workingset_init(void)
+ unsigned int max_order;
+ int ret;
+
+- BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
++ BUILD_BUG_ON(EVICTION_SHIFT < WORKINGSET_WIDTH);
+ /*
+ * Calculate the eviction bucket size to cover the longest
+ * actionable refault distance, which is currently half of
+@@ -601,7 +704,7 @@ static int __init workingset_init(void)
+ * some more pages at runtime, so keep working with up to
+ * double the initial memory by using totalram_pages as-is.
+ */
+- timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
++ timestamp_bits = EVICTION_SHIFT - WORKINGSET_WIDTH;
+ max_order = fls_long(totalram_pages() - 1);
+ if (max_order > timestamp_bits)
+ bucket_order = max_order - timestamp_bits;