author     dragonn  2021-07-20 20:33:28 +0200
committer  dragonn  2021-07-20 20:33:28 +0200
commit     b59bf3acd35fd937c8cc162243b70724e643294f (patch)
tree       c42e496dd99a5a83a7279086f4203fe3d9c5bc42 /sys-kernel_arch-sources-g14_files-0005-lru-multi-generational.patch
parent     3e86e006ef476d294fe1d93ec691a65dd144502c (diff)
download   aur-b59bf3acd35fd937c8cc162243b70724e643294f.tar.gz

5.13.4
Diffstat (limited to 'sys-kernel_arch-sources-g14_files-0005-lru-multi-generational.patch')
-rw-r--r--  sys-kernel_arch-sources-g14_files-0005-lru-multi-generational.patch  3458
1 file changed, 1630 insertions(+), 1828 deletions(-)
diff --git a/sys-kernel_arch-sources-g14_files-0005-lru-multi-generational.patch b/sys-kernel_arch-sources-g14_files-0005-lru-multi-generational.patch index 16b30ccf94ad..b85a0f064684 100644 --- a/sys-kernel_arch-sources-g14_files-0005-lru-multi-generational.patch +++ b/sys-kernel_arch-sources-g14_files-0005-lru-multi-generational.patch @@ -1,57 +1,165 @@ -diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h -index ac398e143c9a..89fe4e3592f9 100644 ---- a/include/linux/nodemask.h -+++ b/include/linux/nodemask.h -@@ -486,6 +486,7 @@ static inline int num_node_state(enum node_states state) - #define first_online_node 0 - #define first_memory_node 0 - #define next_online_node(nid) (MAX_NUMNODES) -+#define next_memory_node(nid) (MAX_NUMNODES) - #define nr_node_ids 1U - #define nr_online_nodes 1U - -diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h -index 4f2f79de083e..bd5744360cfa 100644 ---- a/include/linux/cgroup.h -+++ b/include/linux/cgroup.h -@@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgroup *cgrp) - css_put(&cgrp->self); - } +diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst +index eff5fbd492d0..c353b3f55924 100644 +--- a/Documentation/vm/index.rst ++++ b/Documentation/vm/index.rst +@@ -17,6 +17,7 @@ various features of the Linux memory management -+extern struct mutex cgroup_mutex; + swap_numa + zswap ++ multigen_lru + + Kernel developers MM documentation + ================================== +diff --git a/Documentation/vm/multigen_lru.rst b/Documentation/vm/multigen_lru.rst +new file mode 100644 +index 000000000000..a18416ed7e92 +--- /dev/null ++++ b/Documentation/vm/multigen_lru.rst +@@ -0,0 +1,143 @@ ++.. SPDX-License-Identifier: GPL-2.0 + -+static inline void cgroup_lock(void) -+{ -+ mutex_lock(&cgroup_mutex); -+} ++===================== ++Multigenerational LRU ++===================== + -+static inline void cgroup_unlock(void) -+{ -+ mutex_unlock(&cgroup_mutex); -+} ++Quick Start ++=========== ++Build Options ++------------- ++:Required: Set ``CONFIG_LRU_GEN=y``. + - /** - * task_css_set_check - obtain a task's css_set with extra access conditions - * @task: the task to obtain css_set for -@@ -446,7 +458,6 @@ static inline void cgroup_put(struct cgroup *cgrp) - * as locks used during the cgroup_subsys::attach() methods. - */ - #ifdef CONFIG_PROVE_RCU --extern struct mutex cgroup_mutex; - extern spinlock_t css_set_lock; - #define task_css_set_check(task, __c) \ - rcu_dereference_check((task)->cgroups, \ -@@ -704,6 +715,8 @@ struct cgroup; - static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; } - static inline void css_get(struct cgroup_subsys_state *css) {} - static inline void css_put(struct cgroup_subsys_state *css) {} -+static inline void cgroup_lock(void) {} -+static inline void cgroup_unlock(void) {} - static inline int cgroup_attach_task_all(struct task_struct *from, - struct task_struct *t) { return 0; } - static inline int cgroupstats_build(struct cgroupstats *stats, - - diff --git a/arch/Kconfig b/arch/Kconfig ++:Optional: Set ``CONFIG_LRU_GEN_ENABLED=y`` to turn the feature on by ++ default. ++ ++:Optional: Change ``CONFIG_NR_LRU_GENS`` to a number ``X`` to support ++ a maximum of ``X`` generations. ++ ++:Optional: Change ``CONFIG_TIERS_PER_GEN`` to a number ``Y`` to ++ support a maximum of ``Y`` tiers per generation. ++ ++Runtime Options ++--------------- ++:Required: Write ``1`` to ``/sys/kernel/mm/lru_gen/enable`` if the ++ feature was not turned on by default. 
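++For illustration, the runtime switch above can also be flipped from a
++program rather than a shell. The following is a minimal user-space
++sketch (not part of this patch; it assumes only the sysfs path
++documented above and must run as root)::
++
++    /* Turn the multigenerational LRU on at runtime; equivalent to
++     * `echo 1 > /sys/kernel/mm/lru_gen/enable` as root. */
++    #include <stdio.h>
++
++    int main(void)
++    {
++        FILE *f = fopen("/sys/kernel/mm/lru_gen/enable", "w");
++
++        if (!f) {
++            perror("fopen");
++            return 1;
++        }
++        fputs("1", f);
++        return fclose(f) ? 1 : 0;
++    }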
++ ++:Optional: Change ``/sys/kernel/mm/lru_gen/spread`` to a number ``N`` ++ to spread pages out across ``N+1`` generations. ``N`` should be less ++ than ``X``. Larger values make the background aging more aggressive. ++ ++:Optional: Read ``/sys/kernel/debug/lru_gen`` to verify the feature. ++ This file has the following output: ++ ++:: ++ ++ memcg memcg_id memcg_path ++ node node_id ++ min_gen birth_time anon_size file_size ++ ... ++ max_gen birth_time anon_size file_size ++ ++Given a memcg and a node, ``min_gen`` is the oldest generation ++(number) and ``max_gen`` is the youngest. Birth time is in ++milliseconds. The sizes of anon and file types are in pages. ++ ++Recipes ++------- ++:Android on ARMv8.1+: ``X=4``, ``Y=3`` and ``N=0``. ++ ++:Android on pre-ARMv8.1 CPUs: Not recommended due to the lack of ++ ``ARM64_HW_AFDBM``. ++ ++:Laptops and workstations running Chrome on x86_64: Use the default ++ values. ++ ++:Working set estimation: Write ``+ memcg_id node_id gen [swappiness]`` ++ to ``/sys/kernel/debug/lru_gen`` to account referenced pages to ++ generation ``max_gen`` and create the next generation ``max_gen+1``. ++ ``gen`` should be equal to ``max_gen``. A swap file and a non-zero ++ ``swappiness`` are required to scan anon type. If swapping is not ++ desired, set ``vm.swappiness`` to ``0``. ++ ++:Proactive reclaim: Write ``- memcg_id node_id gen [swappiness] ++ [nr_to_reclaim]`` to ``/sys/kernel/debug/lru_gen`` to evict ++ generations less than or equal to ``gen``. ``gen`` should be less ++ than ``max_gen-1`` as ``max_gen`` and ``max_gen-1`` are active ++ generations and therefore protected from the eviction. Use ++ ``nr_to_reclaim`` to limit the number of pages to evict. Multiple ++ command lines are supported, so does concatenation with delimiters ++ ``,`` and ``;``. ++ ++Framework ++========= ++For each ``lruvec``, evictable pages are divided into multiple ++generations. The youngest generation number is stored in ``max_seq`` ++for both anon and file types as they are aged on an equal footing. The ++oldest generation numbers are stored in ``min_seq[2]`` separately for ++anon and file types as clean file pages can be evicted regardless of ++swap and write-back constraints. These three variables are ++monotonically increasing. Generation numbers are truncated into ++``order_base_2(CONFIG_NR_LRU_GENS+1)`` bits in order to fit into ++``page->flags``. The sliding window technique is used to prevent ++truncated generation numbers from overlapping. Each truncated ++generation number is an index to an array of per-type and per-zone ++lists. Evictable pages are added to the per-zone lists indexed by ++``max_seq`` or ``min_seq[2]`` (modulo ``CONFIG_NR_LRU_GENS``), ++depending on their types. ++ ++Each generation is then divided into multiple tiers. Tiers represent ++levels of usage from file descriptors only. Pages accessed N times via ++file descriptors belong to tier order_base_2(N). Each generation ++contains at most CONFIG_TIERS_PER_GEN tiers, and they require ++additional CONFIG_TIERS_PER_GEN-2 bits in page->flags. In contrast to ++moving across generations which requires the lru lock for the list ++operations, moving across tiers only involves an atomic operation on ++``page->flags`` and therefore has a negligible cost. A feedback loop ++modeled after the PID controller monitors the refault rates across all ++tiers and decides when to activate pages from which tiers in the ++reclaim path. 
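++To make the usage-to-tier mapping above concrete, here is a standalone
++user-space sketch of the ``order_base_2()`` rule (illustrative only and
++not part of this patch; it assumes the helper below mirrors the kernel's
++``order_base_2()`` as used by ``lru_tier_from_usage()`` later in this
++patch)::
++
++    #include <stdio.h>
++
++    /* Smallest b such that 2^b >= n, i.e. log2 of n rounded up to the
++     * nearest power of two. */
++    static int order_base_2(unsigned int n)
++    {
++        int order = 0;
++
++        while ((1u << order) < n)
++            order++;
++        return order;
++    }
++
++    int main(void)
++    {
++        /* Pages accessed N times via file descriptors land in tier
++         * order_base_2(N); pages seen only through page tables stay
++         * in tier 0. */
++        for (unsigned int n = 1; n <= 8; n++)
++            printf("N=%u accesses -> tier %d\n", n, order_base_2(n));
++        return 0;
++    }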
++ ++The framework comprises two conceptually independent components: the ++aging and the eviction, which can be invoked separately from user ++space for the purpose of working set estimation and proactive reclaim. ++ ++Aging ++----- ++The aging produces young generations. Given an ``lruvec``, the aging ++scans page tables for referenced pages of this ``lruvec``. Upon ++finding one, the aging updates its generation number to ``max_seq``. ++After each round of scan, the aging increments ``max_seq``. ++ ++The aging maintains either a system-wide ``mm_struct`` list or ++per-memcg ``mm_struct`` lists, and it only scans page tables of ++processes that have been scheduled since the last scan. ++ ++The aging is due when both of ``min_seq[2]`` reaches ``max_seq-1``, ++assuming both anon and file types are reclaimable. ++ ++Eviction ++-------- ++The eviction consumes old generations. Given an ``lruvec``, the ++eviction scans the pages on the per-zone lists indexed by either of ++``min_seq[2]``. It first tries to select a type based on the values of ++``min_seq[2]``. When anon and file types are both available from the ++same generation, it selects the one that has a lower refault rate. ++ ++During a scan, the eviction sorts pages according to their new ++generation numbers, if the aging has found them referenced. It also ++moves pages from the tiers that have higher refault rates than tier 0 ++to the next generation. ++ ++When it finds all the per-zone lists of a selected type are empty, the ++eviction increments ``min_seq[2]`` indexed by this selected type. ++ ++To-do List ++========== ++KVM Optimization ++---------------- ++Support shadow page table scanning. ++ ++NUMA Optimization ++----------------- ++Optimize page table scan for NUMA. +diff --git a/arch/Kconfig b/arch/Kconfig index c45b770d3579..e3812adc69f7 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -119,362 +227,26 @@ index d27cf69e811d..b968d6bd28b6 100644 int pudp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp) { -diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h -index 46b13780c2c8..94ecc1d277a2 100644 ---- a/include/linux/pgtable.h -+++ b/include/linux/pgtable.h -@@ -193,7 +193,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, - #endif - - #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG --#ifdef CONFIG_TRANSPARENT_HUGEPAGE -+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG) - static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, - pmd_t *pmdp) -@@ -214,7 +214,7 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, - BUILD_BUG(); - return 0; - } --#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -+#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG */ - #endif - - #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH - -diff --git a/mm/vmscan.c b/mm/vmscan.c -index 5199b9696bab..2339459c97d4 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -2421,6 +2421,103 @@ enum scan_balance { - SCAN_FILE, - }; - -+static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) -+{ -+ unsigned long file; -+ struct lruvec *target_lruvec; -+ -+ target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); -+ -+ /* -+ * Determine the scan balance between anon and file LRUs. 
-+ */ -+ spin_lock_irq(&target_lruvec->lru_lock); -+ sc->anon_cost = target_lruvec->anon_cost; -+ sc->file_cost = target_lruvec->file_cost; -+ spin_unlock_irq(&target_lruvec->lru_lock); -+ -+ /* -+ * Target desirable inactive:active list ratios for the anon -+ * and file LRU lists. -+ */ -+ if (!sc->force_deactivate) { -+ unsigned long refaults; -+ -+ refaults = lruvec_page_state(target_lruvec, -+ WORKINGSET_ACTIVATE_ANON); -+ if (refaults != target_lruvec->refaults[0] || -+ inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) -+ sc->may_deactivate |= DEACTIVATE_ANON; -+ else -+ sc->may_deactivate &= ~DEACTIVATE_ANON; -+ -+ /* -+ * When refaults are being observed, it means a new -+ * workingset is being established. Deactivate to get -+ * rid of any stale active pages quickly. -+ */ -+ refaults = lruvec_page_state(target_lruvec, -+ WORKINGSET_ACTIVATE_FILE); -+ if (refaults != target_lruvec->refaults[1] || -+ inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) -+ sc->may_deactivate |= DEACTIVATE_FILE; -+ else -+ sc->may_deactivate &= ~DEACTIVATE_FILE; -+ } else -+ sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; -+ -+ /* -+ * If we have plenty of inactive file pages that aren't -+ * thrashing, try to reclaim those first before touching -+ * anonymous pages. -+ */ -+ file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); -+ if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) -+ sc->cache_trim_mode = 1; -+ else -+ sc->cache_trim_mode = 0; -+ -+ /* -+ * Prevent the reclaimer from falling into the cache trap: as -+ * cache pages start out inactive, every cache fault will tip -+ * the scan balance towards the file LRU. And as the file LRU -+ * shrinks, so does the window for rotation from references. -+ * This means we have a runaway feedback loop where a tiny -+ * thrashing file LRU becomes infinitely more attractive than -+ * anon pages. Try to detect this based on file LRU size. -+ */ -+ if (!cgroup_reclaim(sc)) { -+ unsigned long total_high_wmark = 0; -+ unsigned long free, anon; -+ int z; -+ -+ free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); -+ file = node_page_state(pgdat, NR_ACTIVE_FILE) + -+ node_page_state(pgdat, NR_INACTIVE_FILE); -+ -+ for (z = 0; z < MAX_NR_ZONES; z++) { -+ struct zone *zone = &pgdat->node_zones[z]; -+ -+ if (!managed_zone(zone)) -+ continue; -+ -+ total_high_wmark += high_wmark_pages(zone); -+ } -+ -+ /* -+ * Consider anon: if that's low too, this isn't a -+ * runaway file reclaim problem, but rather just -+ * extreme pressure. Reclaim as per usual then. -+ */ -+ anon = node_page_state(pgdat, NR_INACTIVE_ANON); -+ -+ sc->file_is_tiny = -+ file + free <= total_high_wmark && -+ !(sc->may_deactivate & DEACTIVATE_ANON) && -+ anon >> sc->priority; -+ } -+} -+ - /* - * Determine how aggressively the anon and file LRU lists should be - * scanned. The relative value of each set of LRU lists is determined -@@ -2866,7 +2963,6 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) - unsigned long nr_reclaimed, nr_scanned; - struct lruvec *target_lruvec; - bool reclaimable = false; -- unsigned long file; - - target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); - -@@ -2876,93 +2972,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) - nr_reclaimed = sc->nr_reclaimed; - nr_scanned = sc->nr_scanned; - -- /* -- * Determine the scan balance between anon and file LRUs. 
-- */ -- spin_lock_irq(&target_lruvec->lru_lock); -- sc->anon_cost = target_lruvec->anon_cost; -- sc->file_cost = target_lruvec->file_cost; -- spin_unlock_irq(&target_lruvec->lru_lock); -- -- /* -- * Target desirable inactive:active list ratios for the anon -- * and file LRU lists. -- */ -- if (!sc->force_deactivate) { -- unsigned long refaults; -- -- refaults = lruvec_page_state(target_lruvec, -- WORKINGSET_ACTIVATE_ANON); -- if (refaults != target_lruvec->refaults[0] || -- inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) -- sc->may_deactivate |= DEACTIVATE_ANON; -- else -- sc->may_deactivate &= ~DEACTIVATE_ANON; -- -- /* -- * When refaults are being observed, it means a new -- * workingset is being established. Deactivate to get -- * rid of any stale active pages quickly. -- */ -- refaults = lruvec_page_state(target_lruvec, -- WORKINGSET_ACTIVATE_FILE); -- if (refaults != target_lruvec->refaults[1] || -- inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) -- sc->may_deactivate |= DEACTIVATE_FILE; -- else -- sc->may_deactivate &= ~DEACTIVATE_FILE; -- } else -- sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; -- -- /* -- * If we have plenty of inactive file pages that aren't -- * thrashing, try to reclaim those first before touching -- * anonymous pages. -- */ -- file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); -- if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) -- sc->cache_trim_mode = 1; -- else -- sc->cache_trim_mode = 0; -- -- /* -- * Prevent the reclaimer from falling into the cache trap: as -- * cache pages start out inactive, every cache fault will tip -- * the scan balance towards the file LRU. And as the file LRU -- * shrinks, so does the window for rotation from references. -- * This means we have a runaway feedback loop where a tiny -- * thrashing file LRU becomes infinitely more attractive than -- * anon pages. Try to detect this based on file LRU size. -- */ -- if (!cgroup_reclaim(sc)) { -- unsigned long total_high_wmark = 0; -- unsigned long free, anon; -- int z; -- -- free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); -- file = node_page_state(pgdat, NR_ACTIVE_FILE) + -- node_page_state(pgdat, NR_INACTIVE_FILE); -- -- for (z = 0; z < MAX_NR_ZONES; z++) { -- struct zone *zone = &pgdat->node_zones[z]; -- if (!managed_zone(zone)) -- continue; -- -- total_high_wmark += high_wmark_pages(zone); -- } -- -- /* -- * Consider anon: if that's low too, this isn't a -- * runaway file reclaim problem, but rather just -- * extreme pressure. Reclaim as per usual then. -- */ -- anon = node_page_state(pgdat, NR_INACTIVE_ANON); -- -- sc->file_is_tiny = -- file + free <= total_high_wmark && -- !(sc->may_deactivate & DEACTIVATE_ANON) && -- anon >> sc->priority; -- } -+ prepare_scan_count(pgdat, sc); - - shrink_node_memcgs(pgdat, sc); - -diff --git a/mm/workingset.c b/mm/workingset.c -index b7cdeca5a76d..edb8aed2587e 100644 ---- a/mm/workingset.c -+++ b/mm/workingset.c -@@ -168,9 +168,9 @@ - * refault distance will immediately activate the refaulting page. 
- */ - --#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \ -- 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT) --#define EVICTION_MASK (~0UL >> EVICTION_SHIFT) -+#define EVICTION_SHIFT (BITS_PER_XA_VALUE - MEM_CGROUP_ID_SHIFT - NODES_SHIFT) -+#define EVICTION_MASK (BIT(EVICTION_SHIFT) - 1) -+#define WORKINGSET_WIDTH 1 - - /* - * Eviction timestamps need to be able to cover the full range of -@@ -182,36 +182,23 @@ - */ - static unsigned int bucket_order __read_mostly; - --static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, -- bool workingset) -+static void *pack_shadow(int memcg_id, struct pglist_data *pgdat, unsigned long val) - { -- eviction >>= bucket_order; -- eviction &= EVICTION_MASK; -- eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; -- eviction = (eviction << NODES_SHIFT) | pgdat->node_id; -- eviction = (eviction << 1) | workingset; -+ val = (val << MEM_CGROUP_ID_SHIFT) | memcg_id; -+ val = (val << NODES_SHIFT) | pgdat->node_id; - -- return xa_mk_value(eviction); -+ return xa_mk_value(val); - } - --static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, -- unsigned long *evictionp, bool *workingsetp) -+static unsigned long unpack_shadow(void *shadow, int *memcg_id, struct pglist_data **pgdat) - { -- unsigned long entry = xa_to_value(shadow); -- int memcgid, nid; -- bool workingset; -+ unsigned long val = xa_to_value(shadow); - -- workingset = entry & 1; -- entry >>= 1; -- nid = entry & ((1UL << NODES_SHIFT) - 1); -- entry >>= NODES_SHIFT; -- memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); -- entry >>= MEM_CGROUP_ID_SHIFT; -+ *pgdat = NODE_DATA(val & (BIT(NODES_SHIFT) - 1)); -+ val >>= NODES_SHIFT; -+ *memcg_id = val & (BIT(MEM_CGROUP_ID_SHIFT) - 1); - -- *memcgidp = memcgid; -- *pgdat = NODE_DATA(nid); -- *evictionp = entry << bucket_order; -- *workingsetp = workingset; -+ return val >> MEM_CGROUP_ID_SHIFT; - } - - /** -@@ -266,8 +253,10 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) - /* XXX: target_memcg can be NULL, go through lruvec */ - memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); - eviction = atomic_long_read(&lruvec->nonresident_age); -+ eviction >>= bucket_order; -+ eviction = (eviction << WORKINGSET_WIDTH) | PageWorkingset(page); - workingset_age_nonresident(lruvec, thp_nr_pages(page)); -- return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); -+ return pack_shadow(memcgid, pgdat, eviction); - } - - /** -@@ -294,7 +283,7 @@ void workingset_refault(struct page *page, void *shadow) - bool workingset; - int memcgid; - -- unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); -+ eviction = unpack_shadow(shadow, &memcgid, &pgdat); - - rcu_read_lock(); - /* -@@ -318,6 +307,8 @@ void workingset_refault(struct page *page, void *shadow) - goto out; - eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); - refault = atomic_long_read(&eviction_lruvec->nonresident_age); -+ workingset = eviction & (BIT(WORKINGSET_WIDTH) - 1); -+ eviction = (eviction >> WORKINGSET_WIDTH) << bucket_order; - - /* - * Calculate the refault distance -@@ -335,7 +326,7 @@ void workingset_refault(struct page *page, void *shadow) - * longest time, so the occasional inappropriate activation - * leading to pressure on the active list is not a problem. 
- */ -- refault_distance = (refault - eviction) & EVICTION_MASK; -+ refault_distance = (refault - eviction) & (EVICTION_MASK >> WORKINGSET_WIDTH); - - /* - * The activation decision for this page is made at the level -@@ -593,7 +584,7 @@ static int __init workingset_init(void) - unsigned int max_order; - int ret; - -- BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT); -+ BUILD_BUG_ON(EVICTION_SHIFT < WORKINGSET_WIDTH); +diff --git a/fs/exec.c b/fs/exec.c +index 18594f11c31f..c691d4d7720c 100644 +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -1008,6 +1008,7 @@ static int exec_mmap(struct mm_struct *mm) + active_mm = tsk->active_mm; + tsk->active_mm = mm; + tsk->mm = mm; ++ lru_gen_add_mm(mm); /* - * Calculate the eviction bucket size to cover the longest - * actionable refault distance, which is currently half of -@@ -601,7 +592,7 @@ static int __init workingset_init(void) - * some more pages at runtime, so keep working with up to - * double the initial memory by using totalram_pages as-is. - */ -- timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; -+ timestamp_bits = EVICTION_SHIFT - WORKINGSET_WIDTH; - max_order = fls_long(totalram_pages() - 1); - if (max_order > timestamp_bits) - bucket_order = max_order - timestamp_bits; - + * This prevents preemption while active_mm is being loaded and + * it and mm are being updated, which could cause problems for +@@ -1018,6 +1019,7 @@ static int exec_mmap(struct mm_struct *mm) + if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) + local_irq_enable(); + activate_mm(active_mm, mm); ++ lru_gen_switch_mm(active_mm, mm); + if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) + local_irq_enable(); + tsk->mm->vmacache_seqnum = 0; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index a5ceccc5ef00..f784c118f00f 100644 --- a/fs/fuse/dev.c @@ -489,8 +261,80 @@ index a5ceccc5ef00..f784c118f00f 100644 dump_page(page, "fuse: trying to steal weird page"); return 1; } +diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h +index 6bc9c76680b2..e52e44af6810 100644 +--- a/include/linux/cgroup.h ++++ b/include/linux/cgroup.h +@@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgroup *cgrp) + css_put(&cgrp->self); + } + ++extern struct mutex cgroup_mutex; ++ ++static inline void cgroup_lock(void) ++{ ++ mutex_lock(&cgroup_mutex); ++} ++ ++static inline void cgroup_unlock(void) ++{ ++ mutex_unlock(&cgroup_mutex); ++} ++ + /** + * task_css_set_check - obtain a task's css_set with extra access conditions + * @task: the task to obtain css_set for +@@ -446,7 +458,6 @@ static inline void cgroup_put(struct cgroup *cgrp) + * as locks used during the cgroup_subsys::attach() methods. 
+ */ + #ifdef CONFIG_PROVE_RCU +-extern struct mutex cgroup_mutex; + extern spinlock_t css_set_lock; + #define task_css_set_check(task, __c) \ + rcu_dereference_check((task)->cgroups, \ +@@ -704,6 +715,8 @@ struct cgroup; + static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; } + static inline void css_get(struct cgroup_subsys_state *css) {} + static inline void css_put(struct cgroup_subsys_state *css) {} ++static inline void cgroup_lock(void) {} ++static inline void cgroup_unlock(void) {} + static inline int cgroup_attach_task_all(struct task_struct *from, + struct task_struct *t) { return 0; } + static inline int cgroupstats_build(struct cgroupstats *stats, +diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h +index c193be760709..60601a997433 100644 +--- a/include/linux/memcontrol.h ++++ b/include/linux/memcontrol.h +@@ -230,6 +230,8 @@ struct obj_cgroup { + }; + }; + ++struct lru_gen_mm_list; ++ + /* + * The memory controller data structure. The memory controller controls both + * page cache and RSS per cgroup. We would eventually like to provide +@@ -349,6 +351,10 @@ struct mem_cgroup { + struct deferred_split deferred_split_queue; + #endif + ++#ifdef CONFIG_LRU_GEN ++ struct lru_gen_mm_list *mm_list; ++#endif ++ + struct mem_cgroup_per_node *nodeinfo[0]; + /* WARNING: nodeinfo must be the last member here */ + }; +@@ -1131,7 +1137,6 @@ static inline struct mem_cgroup *page_memcg(struct page *page) + + static inline struct mem_cgroup *page_memcg_rcu(struct page *page) + { +- WARN_ON_ONCE(!rcu_read_lock_held()); + return NULL; + } + diff --git a/include/linux/mm.h b/include/linux/mm.h -index c274f75efcf9..e0c19a02db9d 100644 +index 8ae31622deef..d335b1c13cc2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1089,6 +1089,8 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); @@ -503,10 +347,10 @@ index c274f75efcf9..e0c19a02db9d 100644 /* * Define the bit shifts to access each section. For non-existent diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h -index 355ea1ee32bd..ae3e3826dd7f 100644 +index 355ea1ee32bd..f3b99f65a652 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h -@@ -79,11 +79,199 @@ static __always_inline enum lru_list page_lru(struct page *page) +@@ -79,11 +79,239 @@ static __always_inline enum lru_list page_lru(struct page *page) return lru; } @@ -534,6 +378,12 @@ index 355ea1ee32bd..ae3e3826dd7f 100644 + return seq % MAX_NR_GENS; +} + ++/* Convert the level of usage to a tier. See the comment on MAX_NR_TIERS. */ ++static inline int lru_tier_from_usage(int usage) ++{ ++ return order_base_2(usage + 1); ++} ++ +/* Return a proper index regardless whether we keep a full history of stats. */ +static inline int hist_from_seq_or_gen(int seq_or_gen) +{ @@ -676,6 +526,36 @@ index 355ea1ee32bd..ae3e3826dd7f 100644 + return true; +} + ++/* Return the level of usage of a page. See the comment on MAX_NR_TIERS. */ ++static inline int page_tier_usage(struct page *page) ++{ ++ unsigned long flags = READ_ONCE(page->flags); ++ ++ return flags & BIT(PG_workingset) ? ++ ((flags & LRU_USAGE_MASK) >> LRU_USAGE_PGOFF) + 1 : 0; ++} ++ ++/* Increment the usage counter after a page is accessed via file descriptors. 
*/ ++static inline void page_inc_usage(struct page *page) ++{ ++ unsigned long usage; ++ unsigned long old_flags, new_flags; ++ ++ do { ++ old_flags = READ_ONCE(page->flags); ++ ++ if (!(old_flags & BIT(PG_workingset))) { ++ new_flags = old_flags | BIT(PG_workingset); ++ continue; ++ } ++ ++ usage = (old_flags & LRU_USAGE_MASK) + BIT(LRU_USAGE_PGOFF); ++ ++ new_flags = (old_flags & ~LRU_USAGE_MASK) | min(usage, LRU_USAGE_MASK); ++ } while (new_flags != old_flags && ++ cmpxchg(&page->flags, old_flags, new_flags) != old_flags); ++} ++ +#else /* CONFIG_LRU_GEN */ + +static inline bool lru_gen_enabled(void) @@ -693,6 +573,10 @@ index 355ea1ee32bd..ae3e3826dd7f 100644 + return false; +} + ++static inline void page_inc_usage(struct page *page) ++{ ++} ++ +#endif /* CONFIG_LRU_GEN */ + static __always_inline void add_page_to_lru_list(struct page *page, @@ -706,7 +590,7 @@ index 355ea1ee32bd..ae3e3826dd7f 100644 update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); list_add(&page->lru, &lruvec->lists[lru]); } -@@ -93,6 +281,9 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page, +@@ -93,6 +321,9 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page, { enum lru_list lru = page_lru(page); @@ -716,7 +600,7 @@ index 355ea1ee32bd..ae3e3826dd7f 100644 update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); list_add_tail(&page->lru, &lruvec->lists[lru]); } -@@ -100,6 +291,9 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page, +@@ -100,6 +331,9 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page, static __always_inline void del_page_from_lru_list(struct page *page, struct lruvec *lruvec) { @@ -726,15 +610,148 @@ index 355ea1ee32bd..ae3e3826dd7f 100644 list_del(&page->lru); update_lru_size(lruvec, page_lru(page), page_zonenum(page), -thp_nr_pages(page)); +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 8f0fb62e8975..602901a0b1d0 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -15,6 +15,8 @@ + #include <linux/page-flags-layout.h> + #include <linux/workqueue.h> + #include <linux/seqlock.h> ++#include <linux/nodemask.h> ++#include <linux/mmdebug.h> + + #include <asm/mmu.h> + +@@ -574,6 +576,22 @@ struct mm_struct { + + #ifdef CONFIG_IOMMU_SUPPORT + u32 pasid; ++#endif ++#ifdef CONFIG_LRU_GEN ++ struct { ++ /* the node of a global or per-memcg mm_struct list */ ++ struct list_head list; ++#ifdef CONFIG_MEMCG ++ /* points to the memcg of the owner task above */ ++ struct mem_cgroup *memcg; ++#endif ++ /* whether this mm_struct has been used since the last walk */ ++ nodemask_t nodes; ++#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH ++ /* the number of CPUs using this mm_struct */ ++ atomic_t nr_cpus; ++#endif ++ } lrugen; + #endif + } __randomize_layout; + +@@ -601,6 +619,95 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) + return (struct cpumask *)&mm->cpu_bitmap; + } + ++#ifdef CONFIG_LRU_GEN ++ ++void lru_gen_init_mm(struct mm_struct *mm); ++void lru_gen_add_mm(struct mm_struct *mm); ++void lru_gen_del_mm(struct mm_struct *mm); ++#ifdef CONFIG_MEMCG ++int lru_gen_alloc_mm_list(struct mem_cgroup *memcg); ++void lru_gen_free_mm_list(struct mem_cgroup *memcg); ++void lru_gen_migrate_mm(struct mm_struct *mm); ++#endif ++ ++/* Track the usage of each mm_struct so that we can skip inactive ones. */ ++static inline void lru_gen_switch_mm(struct mm_struct *old, struct mm_struct *new) ++{ ++ /* exclude init_mm, efi_mm, etc. 
*/ ++ if (!core_kernel_data((unsigned long)old)) { ++ VM_BUG_ON(old == &init_mm); ++ ++ nodes_setall(old->lrugen.nodes); ++#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH ++ atomic_dec(&old->lrugen.nr_cpus); ++ VM_BUG_ON_MM(atomic_read(&old->lrugen.nr_cpus) < 0, old); ++#endif ++ } else ++ VM_BUG_ON_MM(READ_ONCE(old->lrugen.list.prev) || ++ READ_ONCE(old->lrugen.list.next), old); ++ ++ if (!core_kernel_data((unsigned long)new)) { ++ VM_BUG_ON(new == &init_mm); ++ ++#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH ++ atomic_inc(&new->lrugen.nr_cpus); ++ VM_BUG_ON_MM(atomic_read(&new->lrugen.nr_cpus) < 0, new); ++#endif ++ } else ++ VM_BUG_ON_MM(READ_ONCE(new->lrugen.list.prev) || ++ READ_ONCE(new->lrugen.list.next), new); ++} ++ ++/* Return whether this mm_struct is being used on any CPUs. */ ++static inline bool lru_gen_mm_is_active(struct mm_struct *mm) ++{ ++#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH ++ return !cpumask_empty(mm_cpumask(mm)); ++#else ++ return atomic_read(&mm->lrugen.nr_cpus); ++#endif ++} ++ ++#else /* CONFIG_LRU_GEN */ ++ ++static inline void lru_gen_init_mm(struct mm_struct *mm) ++{ ++} ++ ++static inline void lru_gen_add_mm(struct mm_struct *mm) ++{ ++} ++ ++static inline void lru_gen_del_mm(struct mm_struct *mm) ++{ ++} ++ ++#ifdef CONFIG_MEMCG ++static inline int lru_gen_alloc_mm_list(struct mem_cgroup *memcg) ++{ ++ return 0; ++} ++ ++static inline void lru_gen_free_mm_list(struct mem_cgroup *memcg) ++{ ++} ++ ++static inline void lru_gen_migrate_mm(struct mm_struct *mm) ++{ ++} ++#endif ++ ++static inline void lru_gen_switch_mm(struct mm_struct *old, struct mm_struct *new) ++{ ++} ++ ++static inline bool lru_gen_mm_is_active(struct mm_struct *mm) ++{ ++ return false; ++} ++ ++#endif /* CONFIG_LRU_GEN */ ++ + struct mmu_gather; + extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); + extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h -index 0d53eba1c383..e5deec17b4bd 100644 +index 0d53eba1c383..ded72f44d7e7 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h -@@ -293,6 +293,108 @@ enum lruvec_flags { +@@ -293,6 +293,114 @@ enum lruvec_flags { */ }; +struct lruvec; ++struct page_vma_mapped_walk; + +#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) +#define LRU_USAGE_MASK ((BIT(LRU_USAGE_WIDTH) - 1) << LRU_USAGE_PGOFF) @@ -823,6 +840,7 @@ index 0d53eba1c383..e5deec17b4bd 100644 + +void lru_gen_init_lruvec(struct lruvec *lruvec); +void lru_gen_set_state(bool enable, bool main, bool swap); ++void lru_gen_scan_around(struct page_vma_mapped_walk *pvmw); + +#else /* CONFIG_LRU_GEN */ + @@ -834,12 +852,16 @@ index 0d53eba1c383..e5deec17b4bd 100644 +{ +} + ++static inline void lru_gen_scan_around(struct page_vma_mapped_walk *pvmw) ++{ ++} ++ +#endif /* CONFIG_LRU_GEN */ + struct lruvec { struct list_head lists[NR_LRU_LISTS]; /* per lruvec lru_lock for memcg */ -@@ -310,6 +412,10 @@ struct lruvec { +@@ -310,6 +418,10 @@ struct lruvec { unsigned long refaults[ANON_AND_FILE]; /* Various lruvec state flags (enum lruvec_flags) */ unsigned long flags; @@ -850,6 +872,37 @@ index 0d53eba1c383..e5deec17b4bd 100644 #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif +@@ -751,6 +863,8 @@ struct deferred_split { + }; + #endif + ++struct mm_walk_args; ++ + /* + * On NUMA machines, each NUMA node would have a pg_data_t to describe + * it's memory layout. 
On UMA machines there is a single pglist_data which +@@ -856,6 +970,9 @@ typedef struct pglist_data { + + unsigned long flags; + ++#ifdef CONFIG_LRU_GEN ++ struct mm_walk_args *mm_walk_args; ++#endif + ZONE_PADDING(_pad2_) + + /* Per-node vmstats */ +diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h +index ac398e143c9a..89fe4e3592f9 100644 +--- a/include/linux/nodemask.h ++++ b/include/linux/nodemask.h +@@ -486,6 +486,7 @@ static inline int num_node_state(enum node_states state) + #define first_online_node 0 + #define first_memory_node 0 + #define next_online_node(nid) (MAX_NUMNODES) ++#define next_memory_node(nid) (MAX_NUMNODES) + #define nr_node_ids 1U + #define nr_online_nodes 1U + diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h index ef1e3e736e14..ce8d5732a3aa 100644 --- a/include/linux/page-flags-layout.h @@ -873,7 +926,7 @@ index ef1e3e736e14..ce8d5732a3aa 100644 #define SECTIONS_WIDTH 0 #endif --#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS +-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS +#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_USAGE_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \ + <= BITS_PER_LONG - NR_PAGEFLAGS #define NODES_WIDTH NODES_SHIFT @@ -883,7 +936,7 @@ index ef1e3e736e14..ce8d5732a3aa 100644 #define LAST_CPUPID_SHIFT 0 #endif --#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_CPUPID_SHIFT+KASAN_TAG_WIDTH \ +-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \ - <= BITS_PER_LONG - NR_PAGEFLAGS +#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_USAGE_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \ + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS @@ -894,7 +947,7 @@ index ef1e3e736e14..ce8d5732a3aa 100644 #define LAST_CPUPID_NOT_IN_PAGE_FLAGS #endif --#if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH+LAST_CPUPID_WIDTH+KASAN_TAG_WIDTH \ +-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \ - > BITS_PER_LONG - NR_PAGEFLAGS +#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_USAGE_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \ + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS @@ -923,6 +976,43 @@ index 04a34c08e0a6..e58984fca32a 100644 #define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) +diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h +index a43047b1030d..47c2c39bafdf 100644 +--- a/include/linux/pgtable.h ++++ b/include/linux/pgtable.h +@@ -193,7 +193,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, + #endif + + #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG +-#ifdef CONFIG_TRANSPARENT_HUGEPAGE ++#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG) + static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmdp) +@@ -214,7 +214,7 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, + BUILD_BUG(); + return 0; + } +-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ ++#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG */ + #endif + + #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH +diff --git a/include/linux/swap.h b/include/linux/swap.h +index 144727041e78..30b1f15f5c6e 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -365,8 +365,8 @@ extern void deactivate_page(struct page *page); + extern void mark_page_lazyfree(struct page *page); + extern void swap_setup(void); + +-extern void lru_cache_add_inactive_or_unevictable(struct 
page *page, +- struct vm_area_struct *vma); ++extern void lru_cache_add_page_vma(struct page *page, struct vm_area_struct *vma, ++ bool faulting); + + /* linux/mm/vmscan.c */ + extern unsigned long zone_reclaimable_pages(struct zone *zone); diff --git a/kernel/bounds.c b/kernel/bounds.c index 9795d75b09b2..a8cbf2d0b11a 100644 --- a/kernel/bounds.c @@ -940,501 +1030,177 @@ index 9795d75b09b2..a8cbf2d0b11a 100644 /* End of constants */ return 0; -diff --git a/mm/huge_memory.c b/mm/huge_memory.c -index 63ed6b25deaa..8ac9093e5a0d 100644 ---- a/mm/huge_memory.c -+++ b/mm/huge_memory.c -@@ -2410,7 +2410,8 @@ static void __split_huge_page_tail(struct page *head, int tail, - #ifdef CONFIG_64BIT - (1L << PG_arch_2) | +diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c +index 6addc9780319..4e93e5602723 100644 +--- a/kernel/events/uprobes.c ++++ b/kernel/events/uprobes.c +@@ -184,7 +184,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, + if (new_page) { + get_page(new_page); + page_add_new_anon_rmap(new_page, vma, addr, false); +- lru_cache_add_inactive_or_unevictable(new_page, vma); ++ lru_cache_add_page_vma(new_page, vma, false); + } else + /* no new page, just dec_mm_counter for old_page */ + dec_mm_counter(mm, MM_ANONPAGES); +diff --git a/kernel/exit.c b/kernel/exit.c +index 65809fac3038..6e6d95b0462c 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -422,6 +422,7 @@ void mm_update_next_owner(struct mm_struct *mm) + goto retry; + } + WRITE_ONCE(mm->owner, c); ++ lru_gen_migrate_mm(mm); + task_unlock(c); + put_task_struct(c); + } +diff --git a/kernel/fork.c b/kernel/fork.c +index 03baafd70b98..7a72a9e17059 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -673,6 +673,7 @@ static void check_mm(struct mm_struct *mm) + #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS + VM_BUG_ON_MM(mm->pmd_huge_pte, mm); #endif -- (1L << PG_dirty))); -+ (1L << PG_dirty) | -+ LRU_GEN_MASK | LRU_USAGE_MASK)); ++ VM_BUG_ON_MM(lru_gen_mm_is_active(mm), mm); + } - /* ->mapping in first tail page is compound_mapcount */ - VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, -diff --git a/mm/mm_init.c b/mm/mm_init.c -index 9ddaf0e1b0ab..ef0deadb90a7 100644 ---- a/mm/mm_init.c -+++ b/mm/mm_init.c -@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layout(void) + #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) +@@ -1065,6 +1066,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, + goto fail_nocontext; - shift = 8 * sizeof(unsigned long); - width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH -- - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH; -+ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_USAGE_WIDTH; - mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", -- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n", -+ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n", - SECTIONS_WIDTH, - NODES_WIDTH, - ZONES_WIDTH, - LAST_CPUPID_WIDTH, - KASAN_TAG_WIDTH, -+ LRU_GEN_WIDTH, -+ LRU_USAGE_WIDTH, - NR_PAGEFLAGS); - mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", - "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n", -diff --git a/mm/mmzone.c b/mm/mmzone.c -index eb89d6e018e2..2ec0d7793424 100644 ---- a/mm/mmzone.c -+++ b/mm/mmzone.c -@@ -81,6 +81,8 @@ void lruvec_init(struct lruvec *lruvec) + mm->user_ns = get_user_ns(user_ns); ++ lru_gen_init_mm(mm); + return mm; - for_each_lru(lru) - INIT_LIST_HEAD(&lruvec->lists[lru]); -+ -+ 
lru_gen_init_lruvec(lruvec); + fail_nocontext: +@@ -1107,6 +1109,7 @@ static inline void __mmput(struct mm_struct *mm) + } + if (mm->binfmt) + module_put(mm->binfmt->module); ++ lru_gen_del_mm(mm); + mmdrop(mm); } - #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) -diff --git a/mm/swapfile.c b/mm/swapfile.c -index 149e77454e3c..3598b668f533 100644 ---- a/mm/swapfile.c -+++ b/mm/swapfile.c -@@ -2702,6 +2702,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) - err = 0; - atomic_inc(&proc_poll_event); - wake_up_interruptible(&proc_poll_wait); -+ /* stop tracking anon if the multigenerational lru is turned off */ -+ lru_gen_set_state(false, false, true); +@@ -2531,6 +2534,13 @@ pid_t kernel_clone(struct kernel_clone_args *args) + get_task_struct(p); + } - out_dput: - filp_close(victim, NULL); -@@ -3348,6 +3350,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) - mutex_unlock(&swapon_mutex); - atomic_inc(&proc_poll_event); - wake_up_interruptible(&proc_poll_wait); -+ /* start tracking anon if the multigenerational lru is turned on */ -+ lru_gen_set_state(true, false, true); ++ if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) { ++ /* lock the task to synchronize with memcg migration */ ++ task_lock(p); ++ lru_gen_add_mm(p->mm); ++ task_unlock(p); ++ } ++ + wake_up_new_task(p); - error = 0; - goto out; -diff --git a/mm/vmscan.c b/mm/vmscan.c -index 2339459c97d4..f7bbfc0b1ebd 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -49,6 +49,7 @@ - #include <linux/printk.h> - #include <linux/dax.h> - #include <linux/psi.h> -+#include <linux/memory.h> + /* forking complete and child started to run, tell ptracer */ +diff --git a/kernel/kthread.c b/kernel/kthread.c +index 0fccf7d0c6a1..42cea2a77273 100644 +--- a/kernel/kthread.c ++++ b/kernel/kthread.c +@@ -1350,6 +1350,7 @@ void kthread_use_mm(struct mm_struct *mm) + tsk->mm = mm; + membarrier_update_current_mm(mm); + switch_mm_irqs_off(active_mm, mm, tsk); ++ lru_gen_switch_mm(active_mm, mm); + local_irq_enable(); + task_unlock(tsk); + #ifdef finish_arch_post_lock_switch +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 4ca80df205ce..68e6dc4ef643 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -4323,6 +4323,7 @@ context_switch(struct rq *rq, struct task_struct *prev, + * finish_task_switch()'s mmdrop(). + */ + switch_mm_irqs_off(prev->active_mm, next->mm, next); ++ lru_gen_switch_mm(prev->active_mm, next->mm); - #include <asm/tlbflush.h> - #include <asm/div64.h> -@@ -2715,6 +2716,311 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, + if (!prev->mm) { // from kernel + /* will mmdrop() in finish_task_switch(). */ +@@ -7602,6 +7603,7 @@ void idle_task_exit(void) + + if (mm != &init_mm) { + switch_mm(mm, &init_mm, current); ++ lru_gen_switch_mm(mm, &init_mm); + finish_arch_post_lock_switch(); } - } -+#ifdef CONFIG_LRU_GEN -+ -+/* -+ * After pages are faulted in, the aging must scan them twice before the -+ * eviction can consider them. The first scan clears the accessed bit set during -+ * initial faults. And the second scan makes sure they haven't been used since -+ * the first scan. 
-+ */ -+#define MIN_NR_GENS 2 -+ -+#define MAX_BATCH_SIZE 8192 -+ -+/****************************************************************************** -+ * shorthand helpers -+ ******************************************************************************/ -+ -+#define DEFINE_MAX_SEQ() \ -+ unsigned long max_seq = READ_ONCE(lruvec->evictable.max_seq) -+ -+#define DEFINE_MIN_SEQ() \ -+ unsigned long min_seq[ANON_AND_FILE] = { \ -+ READ_ONCE(lruvec->evictable.min_seq[0]), \ -+ READ_ONCE(lruvec->evictable.min_seq[1]), \ -+ } -+ -+#define for_each_type_zone(type, zone) \ -+ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ -+ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) -+ -+#define for_each_gen_type_zone(gen, type, zone) \ -+ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ -+ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ -+ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) -+ -+static int page_lru_gen(struct page *page) -+{ -+ return ((page->flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; -+} -+ -+static int get_nr_gens(struct lruvec *lruvec, int type) -+{ -+ return lruvec->evictable.max_seq - lruvec->evictable.min_seq[type] + 1; -+} -+ -+static int min_nr_gens(unsigned long max_seq, unsigned long *min_seq, int swappiness) -+{ -+ return max_seq - max(min_seq[!swappiness], min_seq[1]) + 1; -+} -+ -+static int max_nr_gens(unsigned long max_seq, unsigned long *min_seq, int swappiness) -+{ -+ return max_seq - min(min_seq[!swappiness], min_seq[1]) + 1; -+} -+ -+static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) -+{ -+ lockdep_assert_held(&lruvec->lru_lock); -+ -+ return get_nr_gens(lruvec, 0) >= MIN_NR_GENS && -+ get_nr_gens(lruvec, 0) <= MAX_NR_GENS && -+ get_nr_gens(lruvec, 1) >= MIN_NR_GENS && -+ get_nr_gens(lruvec, 1) <= MAX_NR_GENS; -+} -+ -+/****************************************************************************** -+ * state change -+ ******************************************************************************/ -+ -+#ifdef CONFIG_LRU_GEN_ENABLED -+DEFINE_STATIC_KEY_TRUE(lru_gen_static_key); -+#else -+DEFINE_STATIC_KEY_FALSE(lru_gen_static_key); -+#endif -+ -+static DEFINE_MUTEX(lru_gen_state_mutex); -+static int lru_gen_nr_swapfiles __read_mostly; -+ -+static bool __maybe_unused state_is_valid(struct lruvec *lruvec) -+{ -+ int gen, type, zone; -+ enum lru_list lru; -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ for_each_evictable_lru(lru) { -+ type = is_file_lru(lru); -+ -+ if (lrugen->enabled[type] && !list_empty(&lruvec->lists[lru])) -+ return false; -+ } -+ -+ for_each_gen_type_zone(gen, type, zone) { -+ if (!lrugen->enabled[type] && !list_empty(&lrugen->lists[gen][type][zone])) -+ return false; -+ -+ VM_WARN_ON_ONCE(!lrugen->enabled[type] && lrugen->sizes[gen][type][zone]); -+ } -+ -+ return true; -+} -+ -+static bool fill_lru_gen_lists(struct lruvec *lruvec) -+{ -+ enum lru_list lru; -+ int batch_size = 0; -+ -+ for_each_evictable_lru(lru) { -+ int type = is_file_lru(lru); -+ bool active = is_active_lru(lru); -+ struct list_head *head = &lruvec->lists[lru]; -+ -+ if (!lruvec->evictable.enabled[type]) -+ continue; -+ -+ while (!list_empty(head)) { -+ bool success; -+ struct page *page = lru_to_page(head); -+ -+ VM_BUG_ON_PAGE(PageTail(page), page); -+ VM_BUG_ON_PAGE(PageUnevictable(page), page); -+ VM_BUG_ON_PAGE(PageActive(page) != active, page); -+ VM_BUG_ON_PAGE(page_lru_gen(page) != -1, page); -+ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); -+ -+ prefetchw_prev_lru_page(page, head, flags); -+ -+ del_page_from_lru_list(page, lruvec); -+ 
success = lru_gen_addition(page, lruvec, true); -+ VM_BUG_ON(!success); -+ -+ if (++batch_size == MAX_BATCH_SIZE) -+ return false; -+ } -+ } -+ -+ return true; -+} -+ -+static bool drain_lru_gen_lists(struct lruvec *lruvec) -+{ -+ int gen, type, zone; -+ int batch_size = 0; -+ -+ for_each_gen_type_zone(gen, type, zone) { -+ struct list_head *head = &lruvec->evictable.lists[gen][type][zone]; -+ -+ if (lruvec->evictable.enabled[type]) -+ continue; -+ -+ while (!list_empty(head)) { -+ bool success; -+ struct page *page = lru_to_page(head); -+ -+ VM_BUG_ON_PAGE(PageTail(page), page); -+ VM_BUG_ON_PAGE(PageUnevictable(page), page); -+ VM_BUG_ON_PAGE(PageActive(page), page); -+ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); -+ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page); -+ -+ prefetchw_prev_lru_page(page, head, flags); -+ -+ success = lru_gen_deletion(page, lruvec); -+ VM_BUG_ON(!success); -+ add_page_to_lru_list(page, lruvec); -+ -+ if (++batch_size == MAX_BATCH_SIZE) -+ return false; -+ } -+ } -+ -+ return true; -+} -+ -+/* -+ * For file page tracking, we enable/disable it according to the main switch. -+ * For anon page tracking, we only enabled it when the main switch is on and -+ * there is at least one swapfile; we disable it when there are no swapfiles -+ * regardless of the value of the main switch. Otherwise, we will eventually -+ * reach the max size of the sliding window and have to call inc_min_seq(), -+ * which brings an unnecessary overhead. -+ */ -+void lru_gen_set_state(bool enable, bool main, bool swap) -+{ -+ struct mem_cgroup *memcg; -+ -+ mem_hotplug_begin(); -+ mutex_lock(&lru_gen_state_mutex); -+ cgroup_lock(); -+ -+ main = main && enable != lru_gen_enabled(); -+ swap = swap && !(enable ? lru_gen_nr_swapfiles++ : --lru_gen_nr_swapfiles); -+ swap = swap && lru_gen_enabled(); -+ if (!main && !swap) -+ goto unlock; -+ -+ if (main) { -+ if (enable) -+ static_branch_enable(&lru_gen_static_key); -+ else -+ static_branch_disable(&lru_gen_static_key); -+ } -+ -+ memcg = mem_cgroup_iter(NULL, NULL, NULL); -+ do { -+ int nid; -+ -+ for_each_node_state(nid, N_MEMORY) { -+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ spin_lock_irq(&lruvec->lru_lock); -+ -+ VM_BUG_ON(!seq_is_valid(lruvec)); -+ VM_BUG_ON(!state_is_valid(lruvec)); -+ -+ WRITE_ONCE(lrugen->enabled[0], lru_gen_enabled() && lru_gen_nr_swapfiles); -+ WRITE_ONCE(lrugen->enabled[1], lru_gen_enabled()); -+ -+ while (!(enable ? 
fill_lru_gen_lists(lruvec) : -+ drain_lru_gen_lists(lruvec))) { -+ spin_unlock_irq(&lruvec->lru_lock); -+ cond_resched(); -+ spin_lock_irq(&lruvec->lru_lock); -+ } -+ -+ spin_unlock_irq(&lruvec->lru_lock); -+ } -+ -+ cond_resched(); -+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); -+unlock: -+ cgroup_unlock(); -+ mutex_unlock(&lru_gen_state_mutex); -+ mem_hotplug_done(); -+} -+ -+static int __meminit __maybe_unused lru_gen_online_mem(struct notifier_block *self, -+ unsigned long action, void *arg) -+{ -+ struct mem_cgroup *memcg; -+ struct memory_notify *mnb = arg; -+ int nid = mnb->status_change_nid; -+ -+ if (action != MEM_GOING_ONLINE || nid == NUMA_NO_NODE) -+ return NOTIFY_DONE; -+ -+ mutex_lock(&lru_gen_state_mutex); -+ cgroup_lock(); -+ -+ memcg = mem_cgroup_iter(NULL, NULL, NULL); -+ do { -+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ VM_BUG_ON(!seq_is_valid(lruvec)); -+ VM_BUG_ON(!state_is_valid(lruvec)); -+ -+ WRITE_ONCE(lrugen->enabled[0], lru_gen_enabled() && lru_gen_nr_swapfiles); -+ WRITE_ONCE(lrugen->enabled[1], lru_gen_enabled()); -+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); -+ -+ cgroup_unlock(); -+ mutex_unlock(&lru_gen_state_mutex); -+ -+ return NOTIFY_DONE; -+} -+ -+/****************************************************************************** -+ * initialization -+ ******************************************************************************/ -+ -+void lru_gen_init_lruvec(struct lruvec *lruvec) -+{ -+ int i; -+ int gen, type, zone; -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ lrugen->max_seq = MIN_NR_GENS + 1; -+ lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles; -+ lrugen->enabled[1] = lru_gen_enabled(); -+ -+ for (i = 0; i <= MIN_NR_GENS + 1; i++) -+ lrugen->timestamps[i] = jiffies; -+ -+ for_each_gen_type_zone(gen, type, zone) -+ INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); -+} -+ -+static int __init init_lru_gen(void) -+{ -+ BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); -+ BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); -+ -+ if (hotplug_memory_notifier(lru_gen_online_mem, 0)) -+ pr_err("lru_gen: failed to subscribe hotplug notifications\n"); -+ -+ return 0; -+}; -+/* -+ * We want to run as early as possible because debug code may call mm_alloc() -+ * and mmput(). Out only dependency mm_kobj is initialized one stage earlier. -+ */ -+arch_initcall(init_lru_gen); +diff --git a/mm/Kconfig b/mm/Kconfig +index 02d44e3420f5..da125f145bc4 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -901,4 +901,62 @@ config KMAP_LOCAL + # struct io_mapping based helper. Selected by drivers that need them + config IO_MAPPING + bool + -+#endif /* CONFIG_LRU_GEN */ ++# the multigenerational lru { ++config LRU_GEN ++ bool "Multigenerational LRU" ++ depends on MMU ++ help ++ A high performance LRU implementation to heavily overcommit workloads ++ that are not IO bound. See Documentation/vm/multigen_lru.rst for ++ details. + - static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) - { - unsigned long nr[NR_LRU_LISTS]; - -diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h -index ae3e3826dd7f..f3b99f65a652 100644 ---- a/include/linux/mm_inline.h -+++ b/include/linux/mm_inline.h -@@ -103,6 +103,12 @@ static inline int lru_gen_from_seq(unsigned long seq) - return seq % MAX_NR_GENS; - } - -+/* Convert the level of usage to a tier. See the comment on MAX_NR_TIERS. 
*/ -+static inline int lru_tier_from_usage(int usage) -+{ -+ return order_base_2(usage + 1); -+} ++ Warning: do not enable this option unless you plan to use it because ++ it introduces a small per-process and per-memcg and per-node memory ++ overhead. + - /* Return a proper index regardless whether we keep a full history of stats. */ - static inline int hist_from_seq_or_gen(int seq_or_gen) - { -@@ -245,6 +251,36 @@ static inline bool lru_gen_deletion(struct page *page, struct lruvec *lruvec) - return true; - } - -+/* Return the level of usage of a page. See the comment on MAX_NR_TIERS. */ -+static inline int page_tier_usage(struct page *page) -+{ -+ unsigned long flags = READ_ONCE(page->flags); ++config LRU_GEN_ENABLED ++ bool "Turn on by default" ++ depends on LRU_GEN ++ help ++ The default value of /sys/kernel/mm/lru_gen/enabled is 0. This option ++ changes it to 1. + -+ return flags & BIT(PG_workingset) ? -+ ((flags & LRU_USAGE_MASK) >> LRU_USAGE_PGOFF) + 1 : 0; -+} ++ Warning: the default value is the fast path. See ++ Documentation/static-keys.txt for details. + -+/* Increment the usage counter after a page is accessed via file descriptors. */ -+static inline void page_inc_usage(struct page *page) -+{ -+ unsigned long usage; -+ unsigned long old_flags, new_flags; ++config LRU_GEN_STATS ++ bool "Full stats for debugging" ++ depends on LRU_GEN ++ help ++ This option keeps full stats for each generation, which can be read ++ from /sys/kernel/debug/lru_gen_full. + -+ do { -+ old_flags = READ_ONCE(page->flags); ++ Warning: do not enable this option unless you plan to use it because ++ it introduces an additional small per-process and per-memcg and ++ per-node memory overhead. + -+ if (!(old_flags & BIT(PG_workingset))) { -+ new_flags = old_flags | BIT(PG_workingset); -+ continue; -+ } ++config NR_LRU_GENS ++ int "Max number of generations" ++ depends on LRU_GEN ++ range 4 31 ++ default 7 ++ help ++ This will use order_base_2(N+1) spare bits from page flags. + -+ usage = (old_flags & LRU_USAGE_MASK) + BIT(LRU_USAGE_PGOFF); ++ Warning: do not use numbers larger than necessary because each ++ generation introduces a small per-node and per-memcg memory overhead. + -+ new_flags = (old_flags & ~LRU_USAGE_MASK) | min(usage, LRU_USAGE_MASK); -+ } while (new_flags != old_flags && -+ cmpxchg(&page->flags, old_flags, new_flags) != old_flags); -+} ++config TIERS_PER_GEN ++ int "Number of tiers per generation" ++ depends on LRU_GEN ++ range 2 5 ++ default 4 ++ help ++ This will use N-2 spare bits from page flags. + - #else /* CONFIG_LRU_GEN */ - - static inline bool lru_gen_enabled(void) -@@ -262,6 +298,10 @@ static inline bool lru_gen_deletion(struct page *page, struct lruvec *lruvec) - return false; - } - -+static inline void page_inc_usage(struct page *page) -+{ -+} ++ Larger values generally offer better protection to active pages under ++ heavy buffered I/O workloads. 
++# } + - #endif /* CONFIG_LRU_GEN */ - - static __always_inline void add_page_to_lru_list(struct page *page, -diff --git a/include/linux/swap.h b/include/linux/swap.h -index 144727041e78..30b1f15f5c6e 100644 ---- a/include/linux/swap.h -+++ b/include/linux/swap.h -@@ -365,8 +365,8 @@ extern void deactivate_page(struct page *page); - extern void mark_page_lazyfree(struct page *page); - extern void swap_setup(void); - --extern void lru_cache_add_inactive_or_unevictable(struct page *page, -- struct vm_area_struct *vma); -+extern void lru_cache_add_page_vma(struct page *page, struct vm_area_struct *vma, -+ bool faulting); - - /* linux/mm/vmscan.c */ - extern unsigned long zone_reclaimable_pages(struct zone *zone); -diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c -index 6addc9780319..4e93e5602723 100644 ---- a/kernel/events/uprobes.c -+++ b/kernel/events/uprobes.c -@@ -184,7 +184,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, - if (new_page) { - get_page(new_page); - page_add_new_anon_rmap(new_page, vma, addr, false); -- lru_cache_add_inactive_or_unevictable(new_page, vma); -+ lru_cache_add_page_vma(new_page, vma, false); - } else - /* no new page, just dec_mm_counter for old_page */ - dec_mm_counter(mm, MM_ANONPAGES); + endmenu diff --git a/mm/huge_memory.c b/mm/huge_memory.c -index 8ac9093e5a0d..681da4a3cf61 100644 +index 6d2a0119fc58..64c70c322ac4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c -@@ -636,7 +636,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, +@@ -639,7 +639,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); page_add_new_anon_rmap(page, vma, haddr, true); @@ -1443,6 +1209,16 @@ index 8ac9093e5a0d..681da4a3cf61 100644 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); +@@ -2422,7 +2422,8 @@ static void __split_huge_page_tail(struct page *head, int tail, + #ifdef CONFIG_64BIT + (1L << PG_arch_2) | + #endif +- (1L << PG_dirty))); ++ (1L << PG_dirty) | ++ LRU_GEN_MASK | LRU_USAGE_MASK)); + + /* ->mapping in first tail page is compound_mapcount */ + VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6c0185fdd815..09e5346c2754 100644 --- a/mm/khugepaged.c @@ -1456,8 +1232,68 @@ index 6c0185fdd815..09e5346c2754 100644 pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); +diff --git a/mm/memcontrol.c b/mm/memcontrol.c +index 64ada9e650a5..58b610ffa0e0 100644 +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -4981,6 +4981,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) + for_each_node(node) + free_mem_cgroup_per_node_info(memcg, node); + free_percpu(memcg->vmstats_percpu); ++ lru_gen_free_mm_list(memcg); + kfree(memcg); + } + +@@ -5030,6 +5031,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void) + if (alloc_mem_cgroup_per_node_info(memcg, node)) + goto fail; + ++ if (lru_gen_alloc_mm_list(memcg)) ++ goto fail; ++ + if (memcg_wb_domain_init(memcg, GFP_KERNEL)) + goto fail; + +@@ -5991,6 +5995,29 @@ static void mem_cgroup_move_task(void) + } + #endif + ++#ifdef CONFIG_LRU_GEN ++static void mem_cgroup_attach(struct cgroup_taskset *tset) ++{ ++ struct cgroup_subsys_state *css; ++ struct task_struct *task = NULL; ++ ++ 
cgroup_taskset_for_each_leader(task, css, tset) ++ ; ++ ++ if (!task) ++ return; ++ ++ task_lock(task); ++ if (task->mm && task->mm->owner == task) ++ lru_gen_migrate_mm(task->mm); ++ task_unlock(task); ++} ++#else ++static void mem_cgroup_attach(struct cgroup_taskset *tset) ++{ ++} ++#endif ++ + static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) + { + if (value == PAGE_COUNTER_MAX) +@@ -6332,6 +6359,7 @@ struct cgroup_subsys memory_cgrp_subsys = { + .css_reset = mem_cgroup_css_reset, + .css_rstat_flush = mem_cgroup_css_rstat_flush, + .can_attach = mem_cgroup_can_attach, ++ .attach = mem_cgroup_attach, + .cancel_attach = mem_cgroup_cancel_attach, + .post_attach = mem_cgroup_move_task, + .dfl_cftypes = memory_files, diff --git a/mm/memory.c b/mm/memory.c -index 730daa00952b..a76196885f92 100644 +index 486f4a2874e7..c017bdac5fd1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -839,7 +839,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma @@ -1469,7 +1305,7 @@ index 730daa00952b..a76196885f92 100644 rss[mm_counter(new_page)]++; /* All done, just insert the new page copy in the child */ -@@ -2950,7 +2950,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) +@@ -2962,7 +2962,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) */ ptep_clear_flush_notify(vma, vmf->address, vmf->pte); page_add_new_anon_rmap(new_page, vma, vmf->address, false); @@ -1478,7 +1314,7 @@ index 730daa00952b..a76196885f92 100644 /* * We call the notify macro here because, when using secondary * mmu page tables (such as kvm shadow page tables), we want the -@@ -3479,7 +3479,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) +@@ -3521,7 +3521,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* ksm created a completely new copy */ if (unlikely(page != swapcache && swapcache)) { page_add_new_anon_rmap(page, vma, vmf->address, false); @@ -1487,7 +1323,7 @@ index 730daa00952b..a76196885f92 100644 } else { do_page_add_anon_rmap(page, vma, vmf->address, exclusive); } -@@ -3625,7 +3625,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) +@@ -3668,7 +3668,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, vmf->address, false); @@ -1496,7 +1332,7 @@ index 730daa00952b..a76196885f92 100644 setpte: set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); -@@ -3793,7 +3793,7 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) +@@ -3838,7 +3838,7 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, addr, false); @@ -1506,10 +1342,10 @@ index 730daa00952b..a76196885f92 100644 inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page, false); diff --git a/mm/migrate.c b/mm/migrate.c -index b234c3f3acb7..d3307c9eced4 100644 +index 41ff2c9896c4..e103ab266d97 100644 --- a/mm/migrate.c +++ b/mm/migrate.c -@@ -2967,7 +2967,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, +@@ -2968,7 +2968,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, inc_mm_counter(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, addr, false); if (!is_zone_device_page(page)) @@ -1518,6 +1354,66 @@ index b234c3f3acb7..d3307c9eced4 100644 get_page(page); if (flush) { +diff --git a/mm/mm_init.c b/mm/mm_init.c +index 9ddaf0e1b0ab..ef0deadb90a7 100644 +--- 
a/mm/mm_init.c ++++ b/mm/mm_init.c +@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layout(void) + + shift = 8 * sizeof(unsigned long); + width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH +- - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH; ++ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_USAGE_WIDTH; + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", +- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n", ++ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n", + SECTIONS_WIDTH, + NODES_WIDTH, + ZONES_WIDTH, + LAST_CPUPID_WIDTH, + KASAN_TAG_WIDTH, ++ LRU_GEN_WIDTH, ++ LRU_USAGE_WIDTH, + NR_PAGEFLAGS); + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", + "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n", +diff --git a/mm/mmzone.c b/mm/mmzone.c +index eb89d6e018e2..2ec0d7793424 100644 +--- a/mm/mmzone.c ++++ b/mm/mmzone.c +@@ -81,6 +81,8 @@ void lruvec_init(struct lruvec *lruvec) + + for_each_lru(lru) + INIT_LIST_HEAD(&lruvec->lists[lru]); ++ ++ lru_gen_init_lruvec(lruvec); + } + + #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) +diff --git a/mm/rmap.c b/mm/rmap.c +index e05c300048e6..1a33e394f516 100644 +--- a/mm/rmap.c ++++ b/mm/rmap.c +@@ -72,6 +72,7 @@ + #include <linux/page_idle.h> + #include <linux/memremap.h> + #include <linux/userfaultfd_k.h> ++#include <linux/mm_inline.h> + + #include <asm/tlbflush.h> + +@@ -789,6 +790,11 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, + } + + if (pvmw.pte) { ++ /* the multigenerational lru exploits the spatial locality */ ++ if (lru_gen_enabled() && pte_young(*pvmw.pte)) { ++ lru_gen_scan_around(&pvmw); ++ referenced++; ++ } + if (ptep_clear_flush_young_notify(vma, address, + pvmw.pte)) { + /* diff --git a/mm/swap.c b/mm/swap.c index dfb48cf9c2c9..96ce95eeb2c9 100644 --- a/mm/swap.c @@ -1591,7 +1487,7 @@ index dfb48cf9c2c9..96ce95eeb2c9 100644 local_lock(&lru_pvecs.lock); diff --git a/mm/swapfile.c b/mm/swapfile.c -index 3598b668f533..549e94318b2f 100644 +index 996afa8131c8..8b5ca15df123 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1936,7 +1936,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, @@ -1603,8 +1499,26 @@ index 3598b668f533..549e94318b2f 100644 } swap_free(entry); out: +@@ -2702,6 +2702,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) + err = 0; + atomic_inc(&proc_poll_event); + wake_up_interruptible(&proc_poll_wait); ++ /* stop tracking anon if the multigenerational lru is turned off */ ++ lru_gen_set_state(false, false, true); + + out_dput: + filp_close(victim, NULL); +@@ -3348,6 +3350,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) + mutex_unlock(&swapon_mutex); + atomic_inc(&proc_poll_event); + wake_up_interruptible(&proc_poll_wait); ++ /* start tracking anon if the multigenerational lru is turned on */ ++ lru_gen_set_state(true, false, true); + + error = 0; + goto out; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c -index e14b3820c6a8..175d55b4f594 100644 +index 63a73e164d55..747a2d7eb5b6 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -123,7 +123,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, @@ -1617,10 +1531,22 @@ index e14b3820c6a8..175d55b4f594 100644 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); diff --git a/mm/vmscan.c b/mm/vmscan.c -index f7bbfc0b1ebd..84d25079092e 100644 +index 5199b9696bab..ff2deec24c64 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c -@@ -1094,9 +1094,11 @@ static int 
__remove_mapping(struct address_space *mapping, struct page *page, +@@ -49,6 +49,11 @@ + #include <linux/printk.h> + #include <linux/dax.h> + #include <linux/psi.h> ++#include <linux/memory.h> ++#include <linux/pagewalk.h> ++#include <linux/shmem_fs.h> ++#include <linux/ctype.h> ++#include <linux/debugfs.h> + + #include <asm/tlbflush.h> + #include <asm/div64.h> +@@ -1093,9 +1098,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, if (PageSwapCache(page)) { swp_entry_t swap = { .val = page_private(page) }; @@ -1633,10 +1559,193 @@ index f7bbfc0b1ebd..84d25079092e 100644 __delete_from_swap_cache(page, swap, shadow); xa_unlock_irqrestore(&mapping->i_pages, flags); put_swap_page(page, swap); -@@ -2780,6 +2782,93 @@ static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) - get_nr_gens(lruvec, 1) <= MAX_NR_GENS; +@@ -1306,6 +1313,11 @@ static unsigned int shrink_page_list(struct list_head *page_list, + if (!sc->may_unmap && page_mapped(page)) + goto keep_locked; + ++ /* in case the page was found accessed by lru_gen_scan_around() */ ++ if (lru_gen_enabled() && !ignore_references && ++ page_mapped(page) && PageReferenced(page)) ++ goto keep_locked; ++ + may_enter_fs = (sc->gfp_mask & __GFP_FS) || + (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); + +@@ -2421,6 +2433,106 @@ enum scan_balance { + SCAN_FILE, + }; + ++static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) ++{ ++ unsigned long file; ++ struct lruvec *target_lruvec; ++ ++ if (lru_gen_enabled()) ++ return; ++ ++ target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); ++ ++ /* ++ * Determine the scan balance between anon and file LRUs. ++ */ ++ spin_lock_irq(&target_lruvec->lru_lock); ++ sc->anon_cost = target_lruvec->anon_cost; ++ sc->file_cost = target_lruvec->file_cost; ++ spin_unlock_irq(&target_lruvec->lru_lock); ++ ++ /* ++ * Target desirable inactive:active list ratios for the anon ++ * and file LRU lists. ++ */ ++ if (!sc->force_deactivate) { ++ unsigned long refaults; ++ ++ refaults = lruvec_page_state(target_lruvec, ++ WORKINGSET_ACTIVATE_ANON); ++ if (refaults != target_lruvec->refaults[0] || ++ inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) ++ sc->may_deactivate |= DEACTIVATE_ANON; ++ else ++ sc->may_deactivate &= ~DEACTIVATE_ANON; ++ ++ /* ++ * When refaults are being observed, it means a new ++ * workingset is being established. Deactivate to get ++ * rid of any stale active pages quickly. ++ */ ++ refaults = lruvec_page_state(target_lruvec, ++ WORKINGSET_ACTIVATE_FILE); ++ if (refaults != target_lruvec->refaults[1] || ++ inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) ++ sc->may_deactivate |= DEACTIVATE_FILE; ++ else ++ sc->may_deactivate &= ~DEACTIVATE_FILE; ++ } else ++ sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; ++ ++ /* ++ * If we have plenty of inactive file pages that aren't ++ * thrashing, try to reclaim those first before touching ++ * anonymous pages. ++ */ ++ file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); ++ if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) ++ sc->cache_trim_mode = 1; ++ else ++ sc->cache_trim_mode = 0; ++ ++ /* ++ * Prevent the reclaimer from falling into the cache trap: as ++ * cache pages start out inactive, every cache fault will tip ++ * the scan balance towards the file LRU. And as the file LRU ++ * shrinks, so does the window for rotation from references. 
++ * This means we have a runaway feedback loop where a tiny ++ * thrashing file LRU becomes infinitely more attractive than ++ * anon pages. Try to detect this based on file LRU size. ++ */ ++ if (!cgroup_reclaim(sc)) { ++ unsigned long total_high_wmark = 0; ++ unsigned long free, anon; ++ int z; ++ ++ free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); ++ file = node_page_state(pgdat, NR_ACTIVE_FILE) + ++ node_page_state(pgdat, NR_INACTIVE_FILE); ++ ++ for (z = 0; z < MAX_NR_ZONES; z++) { ++ struct zone *zone = &pgdat->node_zones[z]; ++ ++ if (!managed_zone(zone)) ++ continue; ++ ++ total_high_wmark += high_wmark_pages(zone); ++ } ++ ++ /* ++ * Consider anon: if that's low too, this isn't a ++ * runaway file reclaim problem, but rather just ++ * extreme pressure. Reclaim as per usual then. ++ */ ++ anon = node_page_state(pgdat, NR_INACTIVE_ANON); ++ ++ sc->file_is_tiny = ++ file + free <= total_high_wmark && ++ !(sc->may_deactivate & DEACTIVATE_ANON) && ++ anon >> sc->priority; ++ } ++} ++ + /* + * Determine how aggressively the anon and file LRU lists should be + * scanned. The relative value of each set of LRU lists is determined +@@ -2618,6 +2730,2425 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, + } } ++#ifdef CONFIG_LRU_GEN ++ ++/* ++ * After pages are faulted in, the aging must scan them twice before the ++ * eviction can consider them. The first scan clears the accessed bit set during ++ * initial faults. And the second scan makes sure they haven't been used since ++ * the first scan. ++ */ ++#define MIN_NR_GENS 2 ++ ++#define MAX_BATCH_SIZE 8192 ++ ++/****************************************************************************** ++ * shorthand helpers ++ ******************************************************************************/ ++ ++#define DEFINE_MAX_SEQ() \ ++ unsigned long max_seq = READ_ONCE(lruvec->evictable.max_seq) ++ ++#define DEFINE_MIN_SEQ() \ ++ unsigned long min_seq[ANON_AND_FILE] = { \ ++ READ_ONCE(lruvec->evictable.min_seq[0]), \ ++ READ_ONCE(lruvec->evictable.min_seq[1]), \ ++ } ++ ++#define for_each_type_zone(type, zone) \ ++ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ ++ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) ++ ++#define for_each_gen_type_zone(gen, type, zone) \ ++ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ ++ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ ++ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) ++ ++static int page_lru_gen(struct page *page) ++{ ++ return ((page->flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; ++} ++ ++static int get_nr_gens(struct lruvec *lruvec, int type) ++{ ++ return lruvec->evictable.max_seq - lruvec->evictable.min_seq[type] + 1; ++} ++ ++static int min_nr_gens(unsigned long max_seq, unsigned long *min_seq, int swappiness) ++{ ++ return max_seq - max(min_seq[!swappiness], min_seq[1]) + 1; ++} ++ ++static int max_nr_gens(unsigned long max_seq, unsigned long *min_seq, int swappiness) ++{ ++ return max_seq - min(min_seq[!swappiness], min_seq[1]) + 1; ++} ++ ++static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) ++{ ++ lockdep_assert_held(&lruvec->lru_lock); ++ ++ return get_nr_gens(lruvec, 0) >= MIN_NR_GENS && ++ get_nr_gens(lruvec, 0) <= MAX_NR_GENS && ++ get_nr_gens(lruvec, 1) >= MIN_NR_GENS && ++ get_nr_gens(lruvec, 1) <= MAX_NR_GENS; ++} ++ +/****************************************************************************** + * refault feedback loop + ******************************************************************************/ @@ -1724,477 
+1833,6 @@ index f7bbfc0b1ebd..84d25079092e 100644 + sp->refaulted * max(pv->total, 1UL) * pv->gain; +} + - /****************************************************************************** - * state change - ******************************************************************************/ -diff --git a/mm/workingset.c b/mm/workingset.c -index edb8aed2587e..3f3f03d51ea7 100644 ---- a/mm/workingset.c -+++ b/mm/workingset.c -@@ -201,6 +201,110 @@ static unsigned long unpack_shadow(void *shadow, int *memcg_id, struct pglist_da - return val >> MEM_CGROUP_ID_SHIFT; - } - -+#ifdef CONFIG_LRU_GEN -+ -+#if LRU_GEN_SHIFT + LRU_USAGE_SHIFT >= EVICTION_SHIFT -+#error "Please try smaller NODES_SHIFT, NR_LRU_GENS and TIERS_PER_GEN configurations" -+#endif -+ -+static void page_set_usage(struct page *page, int usage) -+{ -+ unsigned long old_flags, new_flags; -+ -+ VM_BUG_ON(usage > BIT(LRU_USAGE_WIDTH)); -+ -+ if (!usage) -+ return; -+ -+ do { -+ old_flags = READ_ONCE(page->flags); -+ new_flags = (old_flags & ~LRU_USAGE_MASK) | LRU_TIER_FLAGS | -+ ((usage - 1UL) << LRU_USAGE_PGOFF); -+ } while (new_flags != old_flags && -+ cmpxchg(&page->flags, old_flags, new_flags) != old_flags); -+} -+ -+/* Return a token to be stored in the shadow entry of a page being evicted. */ -+static void *lru_gen_eviction(struct page *page) -+{ -+ int hist, tier; -+ unsigned long token; -+ unsigned long min_seq; -+ struct lruvec *lruvec; -+ struct lrugen *lrugen; -+ int type = page_is_file_lru(page); -+ int usage = page_tier_usage(page); -+ struct mem_cgroup *memcg = page_memcg(page); -+ struct pglist_data *pgdat = page_pgdat(page); -+ -+ lruvec = mem_cgroup_lruvec(memcg, pgdat); -+ lrugen = &lruvec->evictable; -+ min_seq = READ_ONCE(lrugen->min_seq[type]); -+ token = (min_seq << LRU_USAGE_SHIFT) | usage; -+ -+ hist = hist_from_seq_or_gen(min_seq); -+ tier = lru_tier_from_usage(usage); -+ atomic_long_add(thp_nr_pages(page), &lrugen->evicted[hist][type][tier]); -+ -+ return pack_shadow(mem_cgroup_id(memcg), pgdat, token); -+} -+ -+/* Account a refaulted page based on the token stored in its shadow entry. 
*/ -+static void lru_gen_refault(struct page *page, void *shadow) -+{ -+ int hist, tier, usage; -+ int memcg_id; -+ unsigned long token; -+ unsigned long min_seq; -+ struct lruvec *lruvec; -+ struct lrugen *lrugen; -+ struct pglist_data *pgdat; -+ struct mem_cgroup *memcg; -+ int type = page_is_file_lru(page); -+ -+ token = unpack_shadow(shadow, &memcg_id, &pgdat); -+ if (page_pgdat(page) != pgdat) -+ return; -+ -+ rcu_read_lock(); -+ memcg = page_memcg_rcu(page); -+ if (mem_cgroup_id(memcg) != memcg_id) -+ goto unlock; -+ -+ usage = token & (BIT(LRU_USAGE_SHIFT) - 1); -+ token >>= LRU_USAGE_SHIFT; -+ -+ lruvec = mem_cgroup_lruvec(memcg, pgdat); -+ lrugen = &lruvec->evictable; -+ min_seq = READ_ONCE(lrugen->min_seq[type]); -+ if (token != (min_seq & (EVICTION_MASK >> LRU_USAGE_SHIFT))) -+ goto unlock; -+ -+ page_set_usage(page, usage); -+ -+ hist = hist_from_seq_or_gen(min_seq); -+ tier = lru_tier_from_usage(usage); -+ atomic_long_add(thp_nr_pages(page), &lrugen->refaulted[hist][type][tier]); -+ inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type); -+ if (tier) -+ inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type); -+unlock: -+ rcu_read_unlock(); -+} -+ -+#else /* CONFIG_LRU_GEN */ -+ -+static void *lru_gen_eviction(struct page *page) -+{ -+ return NULL; -+} -+ -+static void lru_gen_refault(struct page *page, void *shadow) -+{ -+} -+ -+#endif /* CONFIG_LRU_GEN */ -+ - /** - * workingset_age_nonresident - age non-resident entries as LRU ages - * @lruvec: the lruvec that was aged -@@ -249,6 +353,9 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) - VM_BUG_ON_PAGE(page_count(page), page); - VM_BUG_ON_PAGE(!PageLocked(page), page); - -+ if (lru_gen_enabled()) -+ return lru_gen_eviction(page); -+ - lruvec = mem_cgroup_lruvec(target_memcg, pgdat); - /* XXX: target_memcg can be NULL, go through lruvec */ - memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); -@@ -283,6 +390,11 @@ void workingset_refault(struct page *page, void *shadow) - bool workingset; - int memcgid; - -+ if (lru_gen_enabled()) { -+ lru_gen_refault(page, shadow); -+ return; -+ } -+ - eviction = unpack_shadow(shadow, &memcgid, &pgdat); - - rcu_read_lock(); - -diff --git a/fs/exec.c b/fs/exec.c -index 18594f11c31f..c691d4d7720c 100644 ---- a/fs/exec.c -+++ b/fs/exec.c -@@ -1008,6 +1008,7 @@ static int exec_mmap(struct mm_struct *mm) - active_mm = tsk->active_mm; - tsk->active_mm = mm; - tsk->mm = mm; -+ lru_gen_add_mm(mm); - /* - * This prevents preemption while active_mm is being loaded and - * it and mm are being updated, which could cause problems for -@@ -1018,6 +1019,7 @@ static int exec_mmap(struct mm_struct *mm) - if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) - local_irq_enable(); - activate_mm(active_mm, mm); -+ lru_gen_switch_mm(active_mm, mm); - if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) - local_irq_enable(); - tsk->mm->vmacache_seqnum = 0; -diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h -index 6bcac3d91dd1..60601a997433 100644 ---- a/include/linux/memcontrol.h -+++ b/include/linux/memcontrol.h -@@ -230,6 +230,8 @@ struct obj_cgroup { - }; - }; - -+struct lru_gen_mm_list; -+ - /* - * The memory controller data structure. The memory controller controls both - * page cache and RSS per cgroup. 
We would eventually like to provide -@@ -349,6 +351,10 @@ struct mem_cgroup { - struct deferred_split deferred_split_queue; - #endif - -+#ifdef CONFIG_LRU_GEN -+ struct lru_gen_mm_list *mm_list; -+#endif -+ - struct mem_cgroup_per_node *nodeinfo[0]; - /* WARNING: nodeinfo must be the last member here */ - }; -diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h -index 5aacc1c10a45..b0f662555eae 100644 ---- a/include/linux/mm_types.h -+++ b/include/linux/mm_types.h -@@ -15,6 +15,8 @@ - #include <linux/page-flags-layout.h> - #include <linux/workqueue.h> - #include <linux/seqlock.h> -+#include <linux/nodemask.h> -+#include <linux/mmdebug.h> - - #include <asm/mmu.h> - -@@ -561,6 +563,22 @@ struct mm_struct { - - #ifdef CONFIG_IOMMU_SUPPORT - u32 pasid; -+#endif -+#ifdef CONFIG_LRU_GEN -+ struct { -+ /* the node of a global or per-memcg mm_struct list */ -+ struct list_head list; -+#ifdef CONFIG_MEMCG -+ /* points to the memcg of the owner task above */ -+ struct mem_cgroup *memcg; -+#endif -+ /* whether this mm_struct has been used since the last walk */ -+ nodemask_t nodes; -+#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH -+ /* the number of CPUs using this mm_struct */ -+ atomic_t nr_cpus; -+#endif -+ } lrugen; - #endif - } __randomize_layout; - -@@ -588,6 +606,95 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) - return (struct cpumask *)&mm->cpu_bitmap; - } - -+#ifdef CONFIG_LRU_GEN -+ -+void lru_gen_init_mm(struct mm_struct *mm); -+void lru_gen_add_mm(struct mm_struct *mm); -+void lru_gen_del_mm(struct mm_struct *mm); -+#ifdef CONFIG_MEMCG -+int lru_gen_alloc_mm_list(struct mem_cgroup *memcg); -+void lru_gen_free_mm_list(struct mem_cgroup *memcg); -+void lru_gen_migrate_mm(struct mm_struct *mm); -+#endif -+ -+/* Track the usage of each mm_struct so that we can skip inactive ones. */ -+static inline void lru_gen_switch_mm(struct mm_struct *old, struct mm_struct *new) -+{ -+ /* exclude init_mm, efi_mm, etc. */ -+ if (!core_kernel_data((unsigned long)old)) { -+ VM_BUG_ON(old == &init_mm); -+ -+ nodes_setall(old->lrugen.nodes); -+#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH -+ atomic_dec(&old->lrugen.nr_cpus); -+ VM_BUG_ON_MM(atomic_read(&old->lrugen.nr_cpus) < 0, old); -+#endif -+ } else -+ VM_BUG_ON_MM(READ_ONCE(old->lrugen.list.prev) || -+ READ_ONCE(old->lrugen.list.next), old); -+ -+ if (!core_kernel_data((unsigned long)new)) { -+ VM_BUG_ON(new == &init_mm); -+ -+#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH -+ atomic_inc(&new->lrugen.nr_cpus); -+ VM_BUG_ON_MM(atomic_read(&new->lrugen.nr_cpus) < 0, new); -+#endif -+ } else -+ VM_BUG_ON_MM(READ_ONCE(new->lrugen.list.prev) || -+ READ_ONCE(new->lrugen.list.next), new); -+} -+ -+/* Return whether this mm_struct is being used on any CPUs. 
*/ -+static inline bool lru_gen_mm_is_active(struct mm_struct *mm) -+{ -+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH -+ return !cpumask_empty(mm_cpumask(mm)); -+#else -+ return atomic_read(&mm->lrugen.nr_cpus); -+#endif -+} -+ -+#else /* CONFIG_LRU_GEN */ -+ -+static inline void lru_gen_init_mm(struct mm_struct *mm) -+{ -+} -+ -+static inline void lru_gen_add_mm(struct mm_struct *mm) -+{ -+} -+ -+static inline void lru_gen_del_mm(struct mm_struct *mm) -+{ -+} -+ -+#ifdef CONFIG_MEMCG -+static inline int lru_gen_alloc_mm_list(struct mem_cgroup *memcg) -+{ -+ return 0; -+} -+ -+static inline void lru_gen_free_mm_list(struct mem_cgroup *memcg) -+{ -+} -+ -+static inline void lru_gen_migrate_mm(struct mm_struct *mm) -+{ -+} -+#endif -+ -+static inline void lru_gen_switch_mm(struct mm_struct *old, struct mm_struct *new) -+{ -+} -+ -+static inline bool lru_gen_mm_is_active(struct mm_struct *mm) -+{ -+ return false; -+} -+ -+#endif /* CONFIG_LRU_GEN */ -+ - struct mmu_gather; - extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); - extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); -diff --git a/kernel/exit.c b/kernel/exit.c -index fd1c04193e18..b362179852f1 100644 ---- a/kernel/exit.c -+++ b/kernel/exit.c -@@ -423,6 +423,7 @@ void mm_update_next_owner(struct mm_struct *mm) - goto retry; - } - WRITE_ONCE(mm->owner, c); -+ lru_gen_migrate_mm(mm); - task_unlock(c); - put_task_struct(c); - } -diff --git a/kernel/fork.c b/kernel/fork.c -index dc06afd725cb..2fd7dae9afcb 100644 ---- a/kernel/fork.c -+++ b/kernel/fork.c -@@ -669,6 +669,7 @@ static void check_mm(struct mm_struct *mm) - #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS - VM_BUG_ON_MM(mm->pmd_huge_pte, mm); - #endif -+ VM_BUG_ON_MM(lru_gen_mm_is_active(mm), mm); - } - - #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) -@@ -1061,6 +1062,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, - goto fail_nocontext; - - mm->user_ns = get_user_ns(user_ns); -+ lru_gen_init_mm(mm); - return mm; - - fail_nocontext: -@@ -1103,6 +1105,7 @@ static inline void __mmput(struct mm_struct *mm) - } - if (mm->binfmt) - module_put(mm->binfmt->module); -+ lru_gen_del_mm(mm); - mmdrop(mm); - } - -@@ -2524,6 +2527,13 @@ pid_t kernel_clone(struct kernel_clone_args *args) - get_task_struct(p); - } - -+ if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) { -+ /* lock the task to synchronize with memcg migration */ -+ task_lock(p); -+ lru_gen_add_mm(p->mm); -+ task_unlock(p); -+ } -+ - wake_up_new_task(p); - - /* forking complete and child started to run, tell ptracer */ -diff --git a/kernel/kthread.c b/kernel/kthread.c -index fe3f2a40d61e..b81e49ed31a7 100644 ---- a/kernel/kthread.c -+++ b/kernel/kthread.c -@@ -1325,6 +1325,7 @@ void kthread_use_mm(struct mm_struct *mm) - tsk->mm = mm; - membarrier_update_current_mm(mm); - switch_mm_irqs_off(active_mm, mm, tsk); -+ lru_gen_switch_mm(active_mm, mm); - local_irq_enable(); - task_unlock(tsk); - #ifdef finish_arch_post_lock_switch -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 5226cc26a095..2d4b77f173db 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -4323,6 +4323,7 @@ context_switch(struct rq *rq, struct task_struct *prev, - * finish_task_switch()'s mmdrop(). - */ - switch_mm_irqs_off(prev->active_mm, next->mm, next); -+ lru_gen_switch_mm(prev->active_mm, next->mm); - - if (!prev->mm) { // from kernel - /* will mmdrop() in finish_task_switch(). 
*/ -@@ -7603,6 +7604,7 @@ void idle_task_exit(void) - - if (mm != &init_mm) { - switch_mm(mm, &init_mm, current); -+ lru_gen_switch_mm(mm, &init_mm); - finish_arch_post_lock_switch(); - } - -diff --git a/mm/memcontrol.c b/mm/memcontrol.c -index 64ada9e650a5..58b610ffa0e0 100644 ---- a/mm/memcontrol.c -+++ b/mm/memcontrol.c -@@ -5214,6 +5214,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) - free_mem_cgroup_per_node_info(memcg, node); - free_percpu(memcg->vmstats_percpu); - free_percpu(memcg->vmstats_local); -+ lru_gen_free_mm_list(memcg); - kfree(memcg); - } - -@@ -5266,6 +5267,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void) - if (alloc_mem_cgroup_per_node_info(memcg, node)) - goto fail; - -+ if (lru_gen_alloc_mm_list(memcg)) -+ goto fail; -+ - if (memcg_wb_domain_init(memcg, GFP_KERNEL)) - goto fail; - -@@ -5991,6 +5995,29 @@ static void mem_cgroup_move_task(void) - } - #endif - -+#ifdef CONFIG_LRU_GEN -+static void mem_cgroup_attach(struct cgroup_taskset *tset) -+{ -+ struct cgroup_subsys_state *css; -+ struct task_struct *task = NULL; -+ -+ cgroup_taskset_for_each_leader(task, css, tset) -+ ; -+ -+ if (!task) -+ return; -+ -+ task_lock(task); -+ if (task->mm && task->mm->owner == task) -+ lru_gen_migrate_mm(task->mm); -+ task_unlock(task); -+} -+#else -+static void mem_cgroup_attach(struct cgroup_taskset *tset) -+{ -+} -+#endif -+ - static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) - { - if (value == PAGE_COUNTER_MAX) -@@ -6332,6 +6359,7 @@ struct cgroup_subsys memory_cgrp_subsys = { - .css_reset = mem_cgroup_css_reset, - .css_rstat_flush = mem_cgroup_css_rstat_flush, - .can_attach = mem_cgroup_can_attach, -+ .attach = mem_cgroup_attach, - .cancel_attach = mem_cgroup_cancel_attach, - .post_attach = mem_cgroup_move_task, - .dfl_cftypes = memory_files, -diff --git a/mm/vmscan.c b/mm/vmscan.c -index 84d25079092e..d93d2272e475 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -2869,6 +2869,323 @@ static bool positive_ctrl_err(struct controller_pos *sp, struct controller_pos * - sp->refaulted * max(pv->total, 1UL) * pv->gain; - } - +/****************************************************************************** + * mm_struct list + ******************************************************************************/ @@ -2512,96 +2150,6 @@ index 84d25079092e..d93d2272e475 100644 + return last; +} + - /****************************************************************************** - * state change - ******************************************************************************/ -@@ -3096,6 +3413,13 @@ static int __init init_lru_gen(void) - { - BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); - BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); -+ BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1); -+ -+ if (mem_cgroup_disabled()) { -+ global_mm_list = alloc_mm_list(); -+ if (WARN_ON_ONCE(!global_mm_list)) -+ return -ENOMEM; -+ } - - if (hotplug_memory_notifier(lru_gen_online_mem, 0)) - pr_err("lru_gen: failed to subscribe hotplug notifications\n"); - -diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h -index e5deec17b4bd..38de59fcbe54 100644 ---- a/include/linux/mmzone.h -+++ b/include/linux/mmzone.h -@@ -294,6 +294,7 @@ enum lruvec_flags { - }; - - struct lruvec; -+struct page_vma_mapped_walk; - - #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) - #define LRU_USAGE_MASK ((BIT(LRU_USAGE_WIDTH) - 1) << LRU_USAGE_PGOFF) -@@ -382,6 +383,7 @@ struct lrugen { - - void lru_gen_init_lruvec(struct lruvec *lruvec); - void 
lru_gen_set_state(bool enable, bool main, bool swap); -+void lru_gen_scan_around(struct page_vma_mapped_walk *pvmw); - - #else /* CONFIG_LRU_GEN */ - -@@ -393,6 +395,10 @@ static inline void lru_gen_set_state(bool enable, bool main, bool swap) - { - } - -+static inline void lru_gen_scan_around(struct page_vma_mapped_walk *pvmw) -+{ -+} -+ - #endif /* CONFIG_LRU_GEN */ - - struct lruvec { -diff --git a/mm/rmap.c b/mm/rmap.c -index 693a610e181d..985cf4ebd03c 100644 ---- a/mm/rmap.c -+++ b/mm/rmap.c -@@ -72,6 +72,7 @@ - #include <linux/page_idle.h> - #include <linux/memremap.h> - #include <linux/userfaultfd_k.h> -+#include <linux/mm_inline.h> - - #include <asm/tlbflush.h> - -@@ -792,6 +793,11 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, - } - - if (pvmw.pte) { -+ /* the multigenerational lru exploits the spatial locality */ -+ if (lru_gen_enabled() && pte_young(*pvmw.pte)) { -+ lru_gen_scan_around(&pvmw); -+ referenced++; -+ } - if (ptep_clear_flush_young_notify(vma, address, - pvmw.pte)) { - /* -diff --git a/mm/vmscan.c b/mm/vmscan.c -index d93d2272e475..837d5e6a821e 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -50,6 +50,8 @@ - #include <linux/dax.h> - #include <linux/psi.h> - #include <linux/memory.h> -+#include <linux/pagewalk.h> -+#include <linux/shmem_fs.h> - - #include <asm/tlbflush.h> - #include <asm/div64.h> -@@ -3186,6 +3188,788 @@ static bool get_next_mm(struct mm_walk_args *args, struct mm_struct **iter) - return last; - } - +/****************************************************************************** + * the aging + ******************************************************************************/ @@ -3384,74 +2932,6 @@ index d93d2272e475..837d5e6a821e 100644 + set_page_dirty(pte_page(pte[i])); +} + - /****************************************************************************** - * state change - ******************************************************************************/ -@@ -3415,6 +4199,10 @@ static int __init init_lru_gen(void) - BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); - BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1); - -+ VM_BUG_ON(PMD_SIZE / PAGE_SIZE != PTRS_PER_PTE); -+ VM_BUG_ON(PUD_SIZE / PMD_SIZE != PTRS_PER_PMD); -+ VM_BUG_ON(P4D_SIZE / PUD_SIZE != PTRS_PER_PUD); -+ - if (mem_cgroup_disabled()) { - global_mm_list = alloc_mm_list(); - if (WARN_ON_ONCE(!global_mm_list)) - -diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h -index 38de59fcbe54..ded72f44d7e7 100644 ---- a/include/linux/mmzone.h -+++ b/include/linux/mmzone.h -@@ -863,6 +863,8 @@ struct deferred_split { - }; - #endif - -+struct mm_walk_args; -+ - /* - * On NUMA machines, each NUMA node would have a pg_data_t to describe - * it's memory layout. 
On UMA machines there is a single pglist_data which -@@ -968,6 +970,9 @@ typedef struct pglist_data { - - unsigned long flags; - -+#ifdef CONFIG_LRU_GEN -+ struct mm_walk_args *mm_walk_args; -+#endif - ZONE_PADDING(_pad2_) - - /* Per-node vmstats */ -diff --git a/mm/vmscan.c b/mm/vmscan.c -index 837d5e6a821e..2f86dcc04c56 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -1311,6 +1311,11 @@ static unsigned int shrink_page_list(struct list_head *page_list, - if (!sc->may_unmap && page_mapped(page)) - goto keep_locked; - -+ /* in case the page was found accessed by lru_gen_scan_around() */ -+ if (lru_gen_enabled() && !ignore_references && -+ page_mapped(page) && PageReferenced(page)) -+ goto keep_locked; -+ - may_enter_fs = (sc->gfp_mask & __GFP_FS) || - (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); - -@@ -2431,6 +2436,9 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) - unsigned long file; - struct lruvec *target_lruvec; - -+ if (lru_gen_enabled()) -+ return; -+ - target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); - - /* -@@ -3970,6 +3978,489 @@ void lru_gen_scan_around(struct page_vma_mapped_walk *pvmw) - set_page_dirty(pte_page(pte[i])); - } - +/****************************************************************************** + * the eviction + ******************************************************************************/ @@ -3935,124 +3415,223 @@ index 837d5e6a821e..2f86dcc04c56 100644 + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); +} + - /****************************************************************************** - * state change - ******************************************************************************/ -@@ -4172,6 +4663,21 @@ static int __meminit __maybe_unused lru_gen_online_mem(struct notifier_block *se - return NOTIFY_DONE; - } - -+static void lru_gen_start_kswapd(int nid) ++/****************************************************************************** ++ * state change ++ ******************************************************************************/ ++ ++#ifdef CONFIG_LRU_GEN_ENABLED ++DEFINE_STATIC_KEY_TRUE(lru_gen_static_key); ++#else ++DEFINE_STATIC_KEY_FALSE(lru_gen_static_key); ++#endif ++ ++static DEFINE_MUTEX(lru_gen_state_mutex); ++static int lru_gen_nr_swapfiles __read_mostly; ++ ++static bool __maybe_unused state_is_valid(struct lruvec *lruvec) +{ -+ struct pglist_data *pgdat = NODE_DATA(nid); ++ int gen, type, zone; ++ enum lru_list lru; ++ struct lrugen *lrugen = &lruvec->evictable; + -+ pgdat->mm_walk_args = kvzalloc_node(size_of_mm_walk_args(), GFP_KERNEL, nid); -+ WARN_ON_ONCE(!pgdat->mm_walk_args); ++ for_each_evictable_lru(lru) { ++ type = is_file_lru(lru); ++ ++ if (lrugen->enabled[type] && !list_empty(&lruvec->lists[lru])) ++ return false; ++ } ++ ++ for_each_gen_type_zone(gen, type, zone) { ++ if (!lrugen->enabled[type] && !list_empty(&lrugen->lists[gen][type][zone])) ++ return false; ++ ++ VM_WARN_ON_ONCE(!lrugen->enabled[type] && lrugen->sizes[gen][type][zone]); ++ } ++ ++ return true; +} + -+static void lru_gen_stop_kswapd(int nid) ++static bool fill_lru_gen_lists(struct lruvec *lruvec) +{ -+ struct pglist_data *pgdat = NODE_DATA(nid); ++ enum lru_list lru; ++ int batch_size = 0; + -+ kvfree(pgdat->mm_walk_args); -+} ++ for_each_evictable_lru(lru) { ++ int type = is_file_lru(lru); ++ bool active = is_active_lru(lru); ++ struct list_head *head = &lruvec->lists[lru]; + - /****************************************************************************** - * initialization - 
******************************************************************************/ -@@ -4220,6 +4726,24 @@ static int __init init_lru_gen(void) - */ - arch_initcall(init_lru_gen); - -+#else /* CONFIG_LRU_GEN */ ++ if (!lruvec->evictable.enabled[type]) ++ continue; + -+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) -+{ ++ while (!list_empty(head)) { ++ bool success; ++ struct page *page = lru_to_page(head); ++ ++ VM_BUG_ON_PAGE(PageTail(page), page); ++ VM_BUG_ON_PAGE(PageUnevictable(page), page); ++ VM_BUG_ON_PAGE(PageActive(page) != active, page); ++ VM_BUG_ON_PAGE(page_lru_gen(page) != -1, page); ++ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); ++ ++ prefetchw_prev_lru_page(page, head, flags); ++ ++ del_page_from_lru_list(page, lruvec); ++ success = lru_gen_addition(page, lruvec, true); ++ VM_BUG_ON(!success); ++ ++ if (++batch_size == MAX_BATCH_SIZE) ++ return false; ++ } ++ } ++ ++ return true; +} + -+static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) ++static bool drain_lru_gen_lists(struct lruvec *lruvec) +{ ++ int gen, type, zone; ++ int batch_size = 0; ++ ++ for_each_gen_type_zone(gen, type, zone) { ++ struct list_head *head = &lruvec->evictable.lists[gen][type][zone]; ++ ++ if (lruvec->evictable.enabled[type]) ++ continue; ++ ++ while (!list_empty(head)) { ++ bool success; ++ struct page *page = lru_to_page(head); ++ ++ VM_BUG_ON_PAGE(PageTail(page), page); ++ VM_BUG_ON_PAGE(PageUnevictable(page), page); ++ VM_BUG_ON_PAGE(PageActive(page), page); ++ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); ++ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page); ++ ++ prefetchw_prev_lru_page(page, head, flags); ++ ++ success = lru_gen_deletion(page, lruvec); ++ VM_BUG_ON(!success); ++ add_page_to_lru_list(page, lruvec); ++ ++ if (++batch_size == MAX_BATCH_SIZE) ++ return false; ++ } ++ } ++ ++ return true; +} + -+static void lru_gen_start_kswapd(int nid) ++/* ++ * For file page tracking, we enable/disable it according to the main switch. ++ * For anon page tracking, we only enabled it when the main switch is on and ++ * there is at least one swapfile; we disable it when there are no swapfiles ++ * regardless of the value of the main switch. Otherwise, we will eventually ++ * reach the max size of the sliding window and have to call inc_min_seq(), ++ * which brings an unnecessary overhead. ++ */ ++void lru_gen_set_state(bool enable, bool main, bool swap) +{ ++ struct mem_cgroup *memcg; ++ ++ mem_hotplug_begin(); ++ mutex_lock(&lru_gen_state_mutex); ++ cgroup_lock(); ++ ++ main = main && enable != lru_gen_enabled(); ++ swap = swap && !(enable ? lru_gen_nr_swapfiles++ : --lru_gen_nr_swapfiles); ++ swap = swap && lru_gen_enabled(); ++ if (!main && !swap) ++ goto unlock; ++ ++ if (main) { ++ if (enable) ++ static_branch_enable(&lru_gen_static_key); ++ else ++ static_branch_disable(&lru_gen_static_key); ++ } ++ ++ memcg = mem_cgroup_iter(NULL, NULL, NULL); ++ do { ++ int nid; ++ ++ for_each_node_state(nid, N_MEMORY) { ++ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ ++ VM_BUG_ON(!seq_is_valid(lruvec)); ++ VM_BUG_ON(!state_is_valid(lruvec)); ++ ++ WRITE_ONCE(lrugen->enabled[0], lru_gen_enabled() && lru_gen_nr_swapfiles); ++ WRITE_ONCE(lrugen->enabled[1], lru_gen_enabled()); ++ ++ while (!(enable ? 
fill_lru_gen_lists(lruvec) : ++ drain_lru_gen_lists(lruvec))) { ++ spin_unlock_irq(&lruvec->lru_lock); ++ cond_resched(); ++ spin_lock_irq(&lruvec->lru_lock); ++ } ++ ++ spin_unlock_irq(&lruvec->lru_lock); ++ } ++ ++ cond_resched(); ++ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); ++unlock: ++ cgroup_unlock(); ++ mutex_unlock(&lru_gen_state_mutex); ++ mem_hotplug_done(); +} + -+static void lru_gen_stop_kswapd(int nid) ++static int __meminit __maybe_unused lru_gen_online_mem(struct notifier_block *self, ++ unsigned long action, void *arg) +{ ++ struct mem_cgroup *memcg; ++ struct memory_notify *mnb = arg; ++ int nid = mnb->status_change_nid; ++ ++ if (action != MEM_GOING_ONLINE || nid == NUMA_NO_NODE) ++ return NOTIFY_DONE; ++ ++ mutex_lock(&lru_gen_state_mutex); ++ cgroup_lock(); ++ ++ memcg = mem_cgroup_iter(NULL, NULL, NULL); ++ do { ++ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ VM_BUG_ON(!seq_is_valid(lruvec)); ++ VM_BUG_ON(!state_is_valid(lruvec)); ++ ++ WRITE_ONCE(lrugen->enabled[0], lru_gen_enabled() && lru_gen_nr_swapfiles); ++ WRITE_ONCE(lrugen->enabled[1], lru_gen_enabled()); ++ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); ++ ++ cgroup_unlock(); ++ mutex_unlock(&lru_gen_state_mutex); ++ ++ return NOTIFY_DONE; +} + - #endif /* CONFIG_LRU_GEN */ - - static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) -@@ -4233,6 +4757,11 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) - struct blk_plug plug; - bool scan_adjusted; - -+ if (lru_gen_enabled()) { -+ lru_gen_shrink_lruvec(lruvec, sc); -+ return; -+ } ++static void lru_gen_start_kswapd(int nid) ++{ ++ struct pglist_data *pgdat = NODE_DATA(nid); + - get_scan_count(lruvec, sc, nr); - - /* Record the original scan target for proportional adjustments later */ -@@ -4699,6 +5228,9 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) - struct lruvec *target_lruvec; - unsigned long refaults; - -+ if (lru_gen_enabled()) -+ return; ++ pgdat->mm_walk_args = kvzalloc_node(size_of_mm_walk_args(), GFP_KERNEL, nid); ++ WARN_ON_ONCE(!pgdat->mm_walk_args); ++} + - target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); - refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); - target_lruvec->refaults[0] = refaults; -@@ -5073,6 +5605,11 @@ static void age_active_anon(struct pglist_data *pgdat, - struct mem_cgroup *memcg; - struct lruvec *lruvec; - -+ if (lru_gen_enabled()) { -+ lru_gen_age_node(pgdat, sc); -+ return; -+ } ++static void lru_gen_stop_kswapd(int nid) ++{ ++ struct pglist_data *pgdat = NODE_DATA(nid); + - if (!total_swap_pages) - return; - -@@ -5753,6 +6290,8 @@ int kswapd_run(int nid) - if (pgdat->kswapd) - return 0; - -+ lru_gen_start_kswapd(nid); ++ kvfree(pgdat->mm_walk_args); ++} + - pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); - if (IS_ERR(pgdat->kswapd)) { - /* failure at boot is fatal */ -@@ -5775,6 +6314,7 @@ void kswapd_stop(int nid) - if (kswapd) { - kthread_stop(kswapd); - NODE_DATA(nid)->kswapd = NULL; -+ lru_gen_stop_kswapd(nid); - } - } - -diff --git a/mm/vmscan.c b/mm/vmscan.c -index 2f86dcc04c56..ff2deec24c64 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -52,6 +52,8 @@ - #include <linux/memory.h> - #include <linux/pagewalk.h> - #include <linux/shmem_fs.h> -+#include <linux/ctype.h> -+#include <linux/debugfs.h> - - #include <asm/tlbflush.h> - #include <asm/div64.h> -@@ -4678,6 +4680,401 @@ static void 
lru_gen_stop_kswapd(int nid) - kvfree(pgdat->mm_walk_args); - } - +/****************************************************************************** + * sysfs interface + ******************************************************************************/ @@ -4448,250 +4027,473 @@ index 2f86dcc04c56..ff2deec24c64 100644 + .release = seq_release, +}; + - /****************************************************************************** - * initialization - ******************************************************************************/ -@@ -4718,6 +5115,12 @@ static int __init init_lru_gen(void) - if (hotplug_memory_notifier(lru_gen_online_mem, 0)) - pr_err("lru_gen: failed to subscribe hotplug notifications\n"); - ++/****************************************************************************** ++ * initialization ++ ******************************************************************************/ ++ ++void lru_gen_init_lruvec(struct lruvec *lruvec) ++{ ++ int i; ++ int gen, type, zone; ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ lrugen->max_seq = MIN_NR_GENS + 1; ++ lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles; ++ lrugen->enabled[1] = lru_gen_enabled(); ++ ++ for (i = 0; i <= MIN_NR_GENS + 1; i++) ++ lrugen->timestamps[i] = jiffies; ++ ++ for_each_gen_type_zone(gen, type, zone) ++ INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); ++} ++ ++static int __init init_lru_gen(void) ++{ ++ BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); ++ BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); ++ BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1); ++ ++ VM_BUG_ON(PMD_SIZE / PAGE_SIZE != PTRS_PER_PTE); ++ VM_BUG_ON(PUD_SIZE / PMD_SIZE != PTRS_PER_PMD); ++ VM_BUG_ON(P4D_SIZE / PUD_SIZE != PTRS_PER_PUD); ++ ++ if (mem_cgroup_disabled()) { ++ global_mm_list = alloc_mm_list(); ++ if (WARN_ON_ONCE(!global_mm_list)) ++ return -ENOMEM; ++ } ++ ++ if (hotplug_memory_notifier(lru_gen_online_mem, 0)) ++ pr_err("lru_gen: failed to subscribe hotplug notifications\n"); ++ + if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) + pr_err("lru_gen: failed to create sysfs group\n"); + + debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops); + debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops); + - return 0; - }; - /* - -diff --git a/mm/Kconfig b/mm/Kconfig -index 24c045b24b95..e82e6b92820c 100644 ---- a/mm/Kconfig -+++ b/mm/Kconfig -@@ -872,4 +872,61 @@ config MAPPING_DIRTY_HELPERS - config KMAP_LOCAL - bool - -+# the multigenerational lru { -+config LRU_GEN -+ bool "Multigenerational LRU" -+ depends on MMU -+ help -+ A high performance LRU implementation to heavily overcommit workloads -+ that are not IO bound. See Documentation/vm/multigen_lru.rst for -+ details. -+ -+ Warning: do not enable this option unless you plan to use it because -+ it introduces a small per-process and per-memcg and per-node memory -+ overhead. -+ -+config LRU_GEN_ENABLED -+ bool "Turn on by default" -+ depends on LRU_GEN -+ help -+ The default value of /sys/kernel/mm/lru_gen/enabled is 0. This option -+ changes it to 1. ++ return 0; ++}; ++/* ++ * We want to run as early as possible because debug code may call mm_alloc() ++ * and mmput(). Out only dependency mm_kobj is initialized one stage earlier. ++ */ ++arch_initcall(init_lru_gen); + -+ Warning: the default value is the fast path. See -+ Documentation/static-keys.txt for details. 
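The sysfs and debugfs entries registered in this section can be exercised from user space. A minimal sketch, assuming debugfs is mounted at /sys/kernel/debug and using only the paths created above (/sys/kernel/mm/lru_gen/enabled from the attr group on mm_kobj, /sys/kernel/debug/lru_gen from init_lru_gen()):

/*
 * Minimal userspace sketch (not part of the patch): turn the feature on
 * via sysfs, then dump the debugfs state file.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/mm/lru_gen/enabled", "w");
	char line[256];

	if (f) {
		fputs("1", f);	/* same effect as CONFIG_LRU_GEN_ENABLED=y */
		fclose(f);
	}

	f = fopen("/sys/kernel/debug/lru_gen", "r");
	if (!f) {
		perror("lru_gen");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}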
++#else /* CONFIG_LRU_GEN */ + -+config LRU_GEN_STATS -+ bool "Full stats for debugging" -+ depends on LRU_GEN -+ help -+ This option keeps full stats for each generation, which can be read -+ from /sys/kernel/debug/lru_gen_full. ++static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) ++{ ++} + -+ Warning: do not enable this option unless you plan to use it because -+ it introduces an additional small per-process and per-memcg and -+ per-node memory overhead. ++static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) ++{ ++} + -+config NR_LRU_GENS -+ int "Max number of generations" -+ depends on LRU_GEN -+ range 4 31 -+ default 7 -+ help -+ This will use order_base_2(N+1) spare bits from page flags. ++static void lru_gen_start_kswapd(int nid) ++{ ++} + -+ Warning: do not use numbers larger than necessary because each -+ generation introduces a small per-node and per-memcg memory overhead. ++static void lru_gen_stop_kswapd(int nid) ++{ ++} + -+config TIERS_PER_GEN -+ int "Number of tiers per generation" -+ depends on LRU_GEN -+ range 2 5 -+ default 4 -+ help -+ This will use N-2 spare bits from page flags. ++#endif /* CONFIG_LRU_GEN */ + -+ Larger values generally offer better protection to active pages under -+ heavy buffered I/O workloads. -+# } + static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) + { + unsigned long nr[NR_LRU_LISTS]; +@@ -2629,6 +5160,11 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) + struct blk_plug plug; + bool scan_adjusted; + ++ if (lru_gen_enabled()) { ++ lru_gen_shrink_lruvec(lruvec, sc); ++ return; ++ } + - endmenu - - -diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst -index eff5fbd492d0..c353b3f55924 100644 ---- a/Documentation/vm/index.rst -+++ b/Documentation/vm/index.rst -@@ -17,6 +17,7 @@ various features of the Linux memory management + get_scan_count(lruvec, sc, nr); - swap_numa - zswap -+ multigen_lru + /* Record the original scan target for proportional adjustments later */ +@@ -2866,7 +5402,6 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) + unsigned long nr_reclaimed, nr_scanned; + struct lruvec *target_lruvec; + bool reclaimable = false; +- unsigned long file; - Kernel developers MM documentation - ================================== -diff --git a/Documentation/vm/multigen_lru.rst b/Documentation/vm/multigen_lru.rst -new file mode 100644 -index 000000000000..a18416ed7e92 ---- /dev/null -+++ b/Documentation/vm/multigen_lru.rst -@@ -0,0 +1,143 @@ -+.. SPDX-License-Identifier: GPL-2.0 -+ -+===================== -+Multigenerational LRU -+===================== + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + +@@ -2876,93 +5411,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) + nr_reclaimed = sc->nr_reclaimed; + nr_scanned = sc->nr_scanned; + +- /* +- * Determine the scan balance between anon and file LRUs. +- */ +- spin_lock_irq(&target_lruvec->lru_lock); +- sc->anon_cost = target_lruvec->anon_cost; +- sc->file_cost = target_lruvec->file_cost; +- spin_unlock_irq(&target_lruvec->lru_lock); +- +- /* +- * Target desirable inactive:active list ratios for the anon +- * and file LRU lists. 
+- */ +- if (!sc->force_deactivate) { +- unsigned long refaults; +- +- refaults = lruvec_page_state(target_lruvec, +- WORKINGSET_ACTIVATE_ANON); +- if (refaults != target_lruvec->refaults[0] || +- inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) +- sc->may_deactivate |= DEACTIVATE_ANON; +- else +- sc->may_deactivate &= ~DEACTIVATE_ANON; +- +- /* +- * When refaults are being observed, it means a new +- * workingset is being established. Deactivate to get +- * rid of any stale active pages quickly. +- */ +- refaults = lruvec_page_state(target_lruvec, +- WORKINGSET_ACTIVATE_FILE); +- if (refaults != target_lruvec->refaults[1] || +- inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) +- sc->may_deactivate |= DEACTIVATE_FILE; +- else +- sc->may_deactivate &= ~DEACTIVATE_FILE; +- } else +- sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; +- +- /* +- * If we have plenty of inactive file pages that aren't +- * thrashing, try to reclaim those first before touching +- * anonymous pages. +- */ +- file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); +- if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) +- sc->cache_trim_mode = 1; +- else +- sc->cache_trim_mode = 0; +- +- /* +- * Prevent the reclaimer from falling into the cache trap: as +- * cache pages start out inactive, every cache fault will tip +- * the scan balance towards the file LRU. And as the file LRU +- * shrinks, so does the window for rotation from references. +- * This means we have a runaway feedback loop where a tiny +- * thrashing file LRU becomes infinitely more attractive than +- * anon pages. Try to detect this based on file LRU size. +- */ +- if (!cgroup_reclaim(sc)) { +- unsigned long total_high_wmark = 0; +- unsigned long free, anon; +- int z; +- +- free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); +- file = node_page_state(pgdat, NR_ACTIVE_FILE) + +- node_page_state(pgdat, NR_INACTIVE_FILE); +- +- for (z = 0; z < MAX_NR_ZONES; z++) { +- struct zone *zone = &pgdat->node_zones[z]; +- if (!managed_zone(zone)) +- continue; +- +- total_high_wmark += high_wmark_pages(zone); +- } +- +- /* +- * Consider anon: if that's low too, this isn't a +- * runaway file reclaim problem, but rather just +- * extreme pressure. Reclaim as per usual then. +- */ +- anon = node_page_state(pgdat, NR_INACTIVE_ANON); +- +- sc->file_is_tiny = +- file + free <= total_high_wmark && +- !(sc->may_deactivate & DEACTIVATE_ANON) && +- anon >> sc->priority; +- } ++ prepare_scan_count(pgdat, sc); + + shrink_node_memcgs(pgdat, sc); + +@@ -3182,6 +5631,9 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) + struct lruvec *target_lruvec; + unsigned long refaults; + ++ if (lru_gen_enabled()) ++ return; + -+Quick Start -+=========== -+Build Options -+------------- -+:Required: Set ``CONFIG_LRU_GEN=y``. + target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); + refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); + target_lruvec->refaults[0] = refaults; +@@ -3556,6 +6008,11 @@ static void age_active_anon(struct pglist_data *pgdat, + struct mem_cgroup *memcg; + struct lruvec *lruvec; + ++ if (lru_gen_enabled()) { ++ lru_gen_age_node(pgdat, sc); ++ return; ++ } + -+:Optional: Set ``CONFIG_LRU_GEN_ENABLED=y`` to turn the feature on by -+ default. 
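++	/*
++	 * Descriptive note: everything from here down is the classic
++	 * active/inactive aging path; when the multigenerational LRU is
++	 * enabled, aging was already handled by lru_gen_age_node() above.
++	 */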
+ if (!total_swap_pages) + return; + +@@ -4236,6 +6693,8 @@ int kswapd_run(int nid) + if (pgdat->kswapd) + return 0; + ++ lru_gen_start_kswapd(nid); + -+:Optional: Change ``CONFIG_NR_LRU_GENS`` to a number ``X`` to support -+ a maximum of ``X`` generations. + pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); + if (IS_ERR(pgdat->kswapd)) { + /* failure at boot is fatal */ +@@ -4258,6 +6717,7 @@ void kswapd_stop(int nid) + if (kswapd) { + kthread_stop(kswapd); + NODE_DATA(nid)->kswapd = NULL; ++ lru_gen_stop_kswapd(nid); + } + } + +diff --git a/mm/workingset.c b/mm/workingset.c +index b7cdeca5a76d..3f3f03d51ea7 100644 +--- a/mm/workingset.c ++++ b/mm/workingset.c +@@ -168,9 +168,9 @@ + * refault distance will immediately activate the refaulting page. + */ + +-#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \ +- 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT) +-#define EVICTION_MASK (~0UL >> EVICTION_SHIFT) ++#define EVICTION_SHIFT (BITS_PER_XA_VALUE - MEM_CGROUP_ID_SHIFT - NODES_SHIFT) ++#define EVICTION_MASK (BIT(EVICTION_SHIFT) - 1) ++#define WORKINGSET_WIDTH 1 + + /* + * Eviction timestamps need to be able to cover the full range of +@@ -182,38 +182,129 @@ + */ + static unsigned int bucket_order __read_mostly; + +-static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, +- bool workingset) ++static void *pack_shadow(int memcg_id, struct pglist_data *pgdat, unsigned long val) + { +- eviction >>= bucket_order; +- eviction &= EVICTION_MASK; +- eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; +- eviction = (eviction << NODES_SHIFT) | pgdat->node_id; +- eviction = (eviction << 1) | workingset; ++ val = (val << MEM_CGROUP_ID_SHIFT) | memcg_id; ++ val = (val << NODES_SHIFT) | pgdat->node_id; + +- return xa_mk_value(eviction); ++ return xa_mk_value(val); + } + +-static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, +- unsigned long *evictionp, bool *workingsetp) ++static unsigned long unpack_shadow(void *shadow, int *memcg_id, struct pglist_data **pgdat) + { +- unsigned long entry = xa_to_value(shadow); +- int memcgid, nid; +- bool workingset; ++ unsigned long val = xa_to_value(shadow); + +- workingset = entry & 1; +- entry >>= 1; +- nid = entry & ((1UL << NODES_SHIFT) - 1); +- entry >>= NODES_SHIFT; +- memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); +- entry >>= MEM_CGROUP_ID_SHIFT; ++ *pgdat = NODE_DATA(val & (BIT(NODES_SHIFT) - 1)); ++ val >>= NODES_SHIFT; ++ *memcg_id = val & (BIT(MEM_CGROUP_ID_SHIFT) - 1); + +- *memcgidp = memcgid; +- *pgdat = NODE_DATA(nid); +- *evictionp = entry << bucket_order; +- *workingsetp = workingset; ++ return val >> MEM_CGROUP_ID_SHIFT; + } + ++#ifdef CONFIG_LRU_GEN + -+:Optional: Change ``CONFIG_TIERS_PER_GEN`` to a number ``Y`` to -+ support a maximum of ``Y`` tiers per generation. ++#if LRU_GEN_SHIFT + LRU_USAGE_SHIFT >= EVICTION_SHIFT ++#error "Please try smaller NODES_SHIFT, NR_LRU_GENS and TIERS_PER_GEN configurations" ++#endif + -+Runtime Options -+--------------- -+:Required: Write ``1`` to ``/sys/kernel/mm/lru_gen/enable`` if the -+ feature was not turned on by default. ++static void page_set_usage(struct page *page, int usage) ++{ ++ unsigned long old_flags, new_flags; + -+:Optional: Change ``/sys/kernel/mm/lru_gen/spread`` to a number ``N`` -+ to spread pages out across ``N+1`` generations. ``N`` should be less -+ than ``X``. Larger values make the background aging more aggressive. 
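++	/*
++	 * Descriptive note: a non-zero usage is stored as (usage - 1) in the
++	 * spare LRU_USAGE page-flag bits, together with LRU_TIER_FLAGS so
++	 * the refaulted page lands in the matching tier; the cmpxchg loop
++	 * below simply retries if page->flags changed under us.
++	 */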
++ VM_BUG_ON(usage > BIT(LRU_USAGE_WIDTH)); + -+:Optional: Read ``/sys/kernel/debug/lru_gen`` to verify the feature. -+ This file has the following output: ++ if (!usage) ++ return; + -+:: ++ do { ++ old_flags = READ_ONCE(page->flags); ++ new_flags = (old_flags & ~LRU_USAGE_MASK) | LRU_TIER_FLAGS | ++ ((usage - 1UL) << LRU_USAGE_PGOFF); ++ } while (new_flags != old_flags && ++ cmpxchg(&page->flags, old_flags, new_flags) != old_flags); ++} + -+ memcg memcg_id memcg_path -+ node node_id -+ min_gen birth_time anon_size file_size -+ ... -+ max_gen birth_time anon_size file_size ++/* Return a token to be stored in the shadow entry of a page being evicted. */ ++static void *lru_gen_eviction(struct page *page) ++{ ++ int hist, tier; ++ unsigned long token; ++ unsigned long min_seq; ++ struct lruvec *lruvec; ++ struct lrugen *lrugen; ++ int type = page_is_file_lru(page); ++ int usage = page_tier_usage(page); ++ struct mem_cgroup *memcg = page_memcg(page); ++ struct pglist_data *pgdat = page_pgdat(page); + -+Given a memcg and a node, ``min_gen`` is the oldest generation -+(number) and ``max_gen`` is the youngest. Birth time is in -+milliseconds. The sizes of anon and file types are in pages. ++ lruvec = mem_cgroup_lruvec(memcg, pgdat); ++ lrugen = &lruvec->evictable; ++ min_seq = READ_ONCE(lrugen->min_seq[type]); ++ token = (min_seq << LRU_USAGE_SHIFT) | usage; + -+Recipes -+------- -+:Android on ARMv8.1+: ``X=4``, ``Y=3`` and ``N=0``. ++ hist = hist_from_seq_or_gen(min_seq); ++ tier = lru_tier_from_usage(usage); ++ atomic_long_add(thp_nr_pages(page), &lrugen->evicted[hist][type][tier]); + -+:Android on pre-ARMv8.1 CPUs: Not recommended due to the lack of -+ ``ARM64_HW_AFDBM``. ++ return pack_shadow(mem_cgroup_id(memcg), pgdat, token); ++} + -+:Laptops and workstations running Chrome on x86_64: Use the default -+ values. ++/* Account a refaulted page based on the token stored in its shadow entry. */ ++static void lru_gen_refault(struct page *page, void *shadow) ++{ ++ int hist, tier, usage; ++ int memcg_id; ++ unsigned long token; ++ unsigned long min_seq; ++ struct lruvec *lruvec; ++ struct lrugen *lrugen; ++ struct pglist_data *pgdat; ++ struct mem_cgroup *memcg; ++ int type = page_is_file_lru(page); + -+:Working set estimation: Write ``+ memcg_id node_id gen [swappiness]`` -+ to ``/sys/kernel/debug/lru_gen`` to account referenced pages to -+ generation ``max_gen`` and create the next generation ``max_gen+1``. -+ ``gen`` should be equal to ``max_gen``. A swap file and a non-zero -+ ``swappiness`` are required to scan anon type. If swapping is not -+ desired, set ``vm.swappiness`` to ``0``. ++ token = unpack_shadow(shadow, &memcg_id, &pgdat); ++ if (page_pgdat(page) != pgdat) ++ return; + -+:Proactive reclaim: Write ``- memcg_id node_id gen [swappiness] -+ [nr_to_reclaim]`` to ``/sys/kernel/debug/lru_gen`` to evict -+ generations less than or equal to ``gen``. ``gen`` should be less -+ than ``max_gen-1`` as ``max_gen`` and ``max_gen-1`` are active -+ generations and therefore protected from the eviction. Use -+ ``nr_to_reclaim`` to limit the number of pages to evict. Multiple -+ command lines are supported, so does concatenation with delimiters -+ ``,`` and ``;``. ++ rcu_read_lock(); ++ memcg = page_memcg_rcu(page); ++ if (mem_cgroup_id(memcg) != memcg_id) ++ goto unlock; + -+Framework -+========= -+For each ``lruvec``, evictable pages are divided into multiple -+generations. 
The youngest generation number is stored in ``max_seq`` -+for both anon and file types as they are aged on an equal footing. The -+oldest generation numbers are stored in ``min_seq[2]`` separately for -+anon and file types as clean file pages can be evicted regardless of -+swap and write-back constraints. These three variables are -+monotonically increasing. Generation numbers are truncated into -+``order_base_2(CONFIG_NR_LRU_GENS+1)`` bits in order to fit into -+``page->flags``. The sliding window technique is used to prevent -+truncated generation numbers from overlapping. Each truncated -+generation number is an index to an array of per-type and per-zone -+lists. Evictable pages are added to the per-zone lists indexed by -+``max_seq`` or ``min_seq[2]`` (modulo ``CONFIG_NR_LRU_GENS``), -+depending on their types. ++ usage = token & (BIT(LRU_USAGE_SHIFT) - 1); ++ token >>= LRU_USAGE_SHIFT; + -+Each generation is then divided into multiple tiers. Tiers represent -+levels of usage from file descriptors only. Pages accessed N times via -+file descriptors belong to tier order_base_2(N). Each generation -+contains at most CONFIG_TIERS_PER_GEN tiers, and they require -+additional CONFIG_TIERS_PER_GEN-2 bits in page->flags. In contrast to -+moving across generations which requires the lru lock for the list -+operations, moving across tiers only involves an atomic operation on -+``page->flags`` and therefore has a negligible cost. A feedback loop -+modeled after the PID controller monitors the refault rates across all -+tiers and decides when to activate pages from which tiers in the -+reclaim path. ++ lruvec = mem_cgroup_lruvec(memcg, pgdat); ++ lrugen = &lruvec->evictable; ++ min_seq = READ_ONCE(lrugen->min_seq[type]); ++ if (token != (min_seq & (EVICTION_MASK >> LRU_USAGE_SHIFT))) ++ goto unlock; + -+The framework comprises two conceptually independent components: the -+aging and the eviction, which can be invoked separately from user -+space for the purpose of working set estimation and proactive reclaim. ++ page_set_usage(page, usage); + -+Aging -+----- -+The aging produces young generations. Given an ``lruvec``, the aging -+scans page tables for referenced pages of this ``lruvec``. Upon -+finding one, the aging updates its generation number to ``max_seq``. -+After each round of scan, the aging increments ``max_seq``. ++ hist = hist_from_seq_or_gen(min_seq); ++ tier = lru_tier_from_usage(usage); ++ atomic_long_add(thp_nr_pages(page), &lrugen->refaulted[hist][type][tier]); ++ inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type); ++ if (tier) ++ inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type); ++unlock: ++ rcu_read_unlock(); ++} + -+The aging maintains either a system-wide ``mm_struct`` list or -+per-memcg ``mm_struct`` lists, and it only scans page tables of -+processes that have been scheduled since the last scan. ++#else /* CONFIG_LRU_GEN */ + -+The aging is due when both of ``min_seq[2]`` reaches ``max_seq-1``, -+assuming both anon and file types are reclaimable. ++static void *lru_gen_eviction(struct page *page) ++{ ++ return NULL; ++} + -+Eviction -+-------- -+The eviction consumes old generations. Given an ``lruvec``, the -+eviction scans the pages on the per-zone lists indexed by either of -+``min_seq[2]``. It first tries to select a type based on the values of -+``min_seq[2]``. When anon and file types are both available from the -+same generation, it selects the one that has a lower refault rate. 
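The truncation and tiering rules above reduce to a little index arithmetic. Here is a self-contained illustration with the example values ``CONFIG_NR_LRU_GENS=4`` and ``CONFIG_TIERS_PER_GEN=4``; ``order_base_2()`` is open-coded so the sketch builds outside the kernel::

  #include <assert.h>
  #include <stdio.h>

  #define NR_LRU_GENS 4   /* example CONFIG_NR_LRU_GENS */
  #define TIERS_PER_GEN 4 /* example CONFIG_TIERS_PER_GEN */

  /* order_base_2(n): smallest k with 2^k >= n */
  static int order_base_2(unsigned long n)
  {
      int k = 0;

      while ((1UL << k) < n)
          k++;
      return k;
  }

  /* A monotonically increasing seq is truncated to index the list array. */
  static int lru_gen_from_seq(unsigned long seq)
  {
      return seq % NR_LRU_GENS;
  }

  /* Pages accessed N times via file descriptors belong to tier order_base_2(N),
   * capped at the configured number of tiers per generation. */
  static int lru_tier_from_refs(int refs)
  {
      int tier = order_base_2(refs);

      return tier < TIERS_PER_GEN ? tier : TIERS_PER_GEN - 1;
  }

  int main(void)
  {
      /* seqs 613 and 617 share a slot; the sliding window keeps them apart */
      assert(lru_gen_from_seq(613) == lru_gen_from_seq(617));
      printf("1 access -> tier %d, 4 accesses -> tier %d\n",
             lru_tier_from_refs(1), lru_tier_from_refs(4));
      return 0;
  }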
++static void lru_gen_refault(struct page *page, void *shadow) ++{ ++} + -+During a scan, the eviction sorts pages according to their new -+generation numbers, if the aging has found them referenced. It also -+moves pages from the tiers that have higher refault rates than tier 0 -+to the next generation. ++#endif /* CONFIG_LRU_GEN */ + -+When it finds all the per-zone lists of a selected type are empty, the -+eviction increments ``min_seq[2]`` indexed by this selected type. + /** + * workingset_age_nonresident - age non-resident entries as LRU ages + * @lruvec: the lruvec that was aged +@@ -262,12 +353,17 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) + VM_BUG_ON_PAGE(page_count(page), page); + VM_BUG_ON_PAGE(!PageLocked(page), page); + ++ if (lru_gen_enabled()) ++ return lru_gen_eviction(page); + -+To-do List -+========== -+KVM Optimization -+---------------- -+Support shadow page table scanning. + lruvec = mem_cgroup_lruvec(target_memcg, pgdat); + /* XXX: target_memcg can be NULL, go through lruvec */ + memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); + eviction = atomic_long_read(&lruvec->nonresident_age); ++ eviction >>= bucket_order; ++ eviction = (eviction << WORKINGSET_WIDTH) | PageWorkingset(page); + workingset_age_nonresident(lruvec, thp_nr_pages(page)); +- return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); ++ return pack_shadow(memcgid, pgdat, eviction); + } + + /** +@@ -294,7 +390,12 @@ void workingset_refault(struct page *page, void *shadow) + bool workingset; + int memcgid; + +- unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); ++ if (lru_gen_enabled()) { ++ lru_gen_refault(page, shadow); ++ return; ++ } + -+NUMA Optimization -+----------------- -+Optimize page table scan for NUMA. ---
\ No newline at end of file ++ eviction = unpack_shadow(shadow, &memcgid, &pgdat); + + rcu_read_lock(); + /* +@@ -318,6 +419,8 @@ void workingset_refault(struct page *page, void *shadow) + goto out; + eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); + refault = atomic_long_read(&eviction_lruvec->nonresident_age); ++ workingset = eviction & (BIT(WORKINGSET_WIDTH) - 1); ++ eviction = (eviction >> WORKINGSET_WIDTH) << bucket_order; + + /* + * Calculate the refault distance +@@ -335,7 +438,7 @@ void workingset_refault(struct page *page, void *shadow) + * longest time, so the occasional inappropriate activation + * leading to pressure on the active list is not a problem. + */ +- refault_distance = (refault - eviction) & EVICTION_MASK; ++ refault_distance = (refault - eviction) & (EVICTION_MASK >> WORKINGSET_WIDTH); + + /* + * The activation decision for this page is made at the level +@@ -593,7 +696,7 @@ static int __init workingset_init(void) + unsigned int max_order; + int ret; + +- BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT); ++ BUILD_BUG_ON(EVICTION_SHIFT < WORKINGSET_WIDTH); + /* + * Calculate the eviction bucket size to cover the longest + * actionable refault distance, which is currently half of +@@ -601,7 +704,7 @@ static int __init workingset_init(void) + * some more pages at runtime, so keep working with up to + * double the initial memory by using totalram_pages as-is. + */ +- timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; ++ timestamp_bits = EVICTION_SHIFT - WORKINGSET_WIDTH; + max_order = fls_long(totalram_pages() - 1); + if (max_order > timestamp_bits) + bucket_order = max_order - timestamp_bits; |
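A closing note on the arithmetic in ``workingset_refault()`` above: the distance is computed modulo the truncated eviction width, so it stays meaningful even after the ``nonresident_age`` counter wraps. A standalone check, with a stand-in ``EVICTION_SHIFT`` (the real value depends on the configured shifts)::

  #include <assert.h>
  #include <stdio.h>

  #define EVICTION_SHIFT 43 /* stand-in; config-dependent in the kernel */
  #define WORKINGSET_WIDTH 1
  #define EVICTION_MASK ((1UL << EVICTION_SHIFT) - 1)
  #define DISTANCE_MASK (EVICTION_MASK >> WORKINGSET_WIDTH)

  static unsigned long refault_distance(unsigned long eviction,
                                        unsigned long refault)
  {
      /* same form as the patch: modular subtraction in the truncated space */
      return (refault - eviction) & DISTANCE_MASK;
  }

  int main(void)
  {
      /* the counter wrapped past the truncated width between the two events */
      unsigned long eviction = DISTANCE_MASK - 5; /* shortly before the wrap */
      unsigned long refault = 10;                 /* shortly after the wrap */

      assert(refault_distance(eviction, refault) == 16);
      printf("distance across wrap: %lu\n",
             refault_distance(eviction, refault));
      return 0;
  }

The companion change in ``workingset_init()`` sizes ``bucket_order`` against ``EVICTION_SHIFT - WORKINGSET_WIDTH`` rather than ``BITS_PER_LONG``, so the timestamp budget and this mask always agree.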