-rw-r--r-- | .SRCINFO | 26
-rw-r--r-- | 0001-ZEN-Add-sysctl-and-CONFIG-to-disallow-unprivileged-C.patch | 6
-rw-r--r-- | 0002-x86-setup-Consolidate-early-memory-reservations.patch | 188
-rw-r--r-- | 0003-bfq-lucjan-r2K210602.patch | 1534
-rw-r--r-- | 0003-x86-setup-Merge-several-reservations-of-start-of-mem.patch | 67
-rw-r--r-- | 0004-x86-setup-Move-trim_snb_memory-later-in-setup_arch-t.patch | 87
-rw-r--r-- | 0005-x86-setup-always-reserve-the-first-1M-of-RAM.patch | 170
-rw-r--r-- | 0006-x86-setup-remove-CONFIG_X86_RESERVE_LOW-and-reservel.patch | 114
-rw-r--r-- | 0007-x86-crash-remove-crash_reserve_low_1M.patch | 58
-rw-r--r-- | 0008-UKSM.patch (renamed from 0002-UKSM.patch) | 0
-rw-r--r-- | PKGBUILD | 24
-rw-r--r-- | config | 92
12 files changed, 793 insertions, 1573 deletions
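This rebase bumps the kernel tarball checksum in .SRCINFO, so a local checkout can be re-verified before building. A minimal sketch, assuming curl, b2sum (coreutils), and makepkg are available:

    curl -LO https://www.kernel.org/pub/linux/kernel/v5.x/linux-5.12.12.tar.xz
    b2sum linux-5.12.12.tar.xz   # should match the new f9aef3da... value in .SRCINFO below
    # or let makepkg fetch and verify every source entry in one step:
    makepkg --verifysource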
diff --git a/.SRCINFO b/.SRCINFO
--- a/.SRCINFO
+++ b/.SRCINFO
@@ -1,5 +1,5 @@
 pkgbase = linux-ck-uksm
-	pkgver = 5.12.10
+	pkgver = 5.12.12
 	pkgrel = 1
 	url = https://wiki.archlinux.org/index.php/Linux-ck
 	arch = x86_64
@@ -12,24 +12,34 @@ pkgbase = linux-ck-uksm
 	makedepends = tar
 	makedepends = xz
 	options = !strip
-	source = https://www.kernel.org/pub/linux/kernel/v5.x/linux-5.12.10.tar.xz
-	source = https://www.kernel.org/pub/linux/kernel/v5.x/linux-5.12.10.tar.sign
+	source = https://www.kernel.org/pub/linux/kernel/v5.x/linux-5.12.12.tar.xz
+	source = https://www.kernel.org/pub/linux/kernel/v5.x/linux-5.12.12.tar.sign
 	source = config
 	source = more-uarches-20210610.tar.gz::https://github.com/graysky2/kernel_compiler_patch/archive/20210610.tar.gz
 	source = http://ck.kolivas.org/patches/5.0/5.12/5.12-ck1/patch-5.12-ck1.xz
 	source = 0001-ZEN-Add-sysctl-and-CONFIG-to-disallow-unprivileged-C.patch
-	source = 0002-UKSM.patch
-	source = 0003-bfq-lucjan-r2K210602.patch
+	source = 0002-x86-setup-Consolidate-early-memory-reservations.patch
+	source = 0003-x86-setup-Merge-several-reservations-of-start-of-mem.patch
+	source = 0004-x86-setup-Move-trim_snb_memory-later-in-setup_arch-t.patch
+	source = 0005-x86-setup-always-reserve-the-first-1M-of-RAM.patch
+	source = 0006-x86-setup-remove-CONFIG_X86_RESERVE_LOW-and-reservel.patch
+	source = 0007-x86-crash-remove-crash_reserve_low_1M.patch
+	source = 0008-UKSM.patch
 	validpgpkeys = ABAF11C65A2970B130ABE3C479BE3E4300411886
 	validpgpkeys = 647F28654894E3BD457199BE38DBBDC86092693E
-	b2sums = b40ef5a11ca435299899e8131fa72af147455cd8ebee4c0e187572b1f628e66d2b6fbb318308bc911a598d8303d1ab3622d52966deaa5c48d59dcd65f4f58687
+	b2sums = f9aef3da2f65916cc30da9a066217d3379036e6a32a732224da7fee86c80810315484f48132b50b8cf8eb5e0b055ad1b7bbe63dadd0eb54b0b0626bc57c20963
 	b2sums = SKIP
 	b2sums = SKIP
 	b2sums = 30d1df754608bb423cbc99c2097ad521baa091b9a3b39df4bd5c2d50c57eec54d8fa0e4a4a04b847c3d1b87ba682cadc8db45fabeefdc9ad7caaf8e77b96e41a
 	b2sums = c9f729ba1efe6f04e7b2c57d3999bc9675b577596dccb2f227e5b6e444285e1fdd270bf67c0fcf9f5808a4c3a4b1c7a5c13a76f754ad9b9447243ccbaf2ce6a3
-	b2sums = e1eccb5b6b728e3852ade55dae7a53b8b6bd5f0fb2a330b99e85bfa64abaa430cb714d301ed169df14a1f302a75d952992f0d8fa6ab02fa6716165bdf23b63aa
+	b2sums = dda152592dec643bce44754bf5d2d43a5897cc57f8dc258b87857055a45abf903d619aba1de389228cb086a17fedea5458f8fe2c0993fa20213bb7c5bca331c8
+	b2sums = 13330cf57b5c6b928ea73bd30479010688cf8d2003107b041a7fdad33c1ac225c8c905bef235cd762d6ea76be754b5db6be769526bacf7333298f72d6afff535
+	b2sums = 381e0f177faa3090d1abf4d11a97db535712840870265dea167d7692dee7733a226d09c103d01705d5c0809fa66c7a23efea9da2473da672644b06e31db77083
+	b2sums = cd9da0dee048fc52a3032343f122c2055081eeedfc8a3e5227218f0f63fc7618e8fe744c8caa7e3a2ca844f4aaf7314b57a306d0d3b1849e97b24687b8c5a501
+	b2sums = 1810832172e1b006a5471d8e317573343884feed9abc9e7380a32d83c958b0e6aa68adf9a647c9b7b714783997591f5d80e754c6e7357279661eee998f22864c
+	b2sums = 4e7cb958f95d99bba9810e675d4f1b0b3c171f78e9fe96ff9d265f792f4ceb1367f2f4d238f36b5ca1c395e14abdabbf0f8ce2dc07c4fe567d822a8b629dfa05
+	b2sums = 2251f8bf84e141b4661f84cc2ce7b21783ac0a349b2651477dfcbc5383b796b2e588d85ee411398b15c820cb3672256be8ed281c8bccfad252c9dd5b0e1e0cd5
 	b2sums = 14f45171afc3b15488b40a05e58b352c5057da3a5782e13527392f7750d8e45a8db54f9b50b218fedb8bf679de3b4e5d78e230a44f7b1aa482f7b3aa831bd641
-	b2sums = 6a195695fcd207adbdea28dd2803b479f6e5dc478b56d5fce16a7600f719fa545ed0e468a26f9c94e982346fb803a0ff026abd0d70335e42027468475beb7cbb
 
 pkgname = linux-ck-uksm
 	pkgdesc = The Linux-ck-uksm kernel and modules with the ck1 patchset featuring MuQSS CPU scheduler
diff --git a/0001-ZEN-Add-sysctl-and-CONFIG-to-disallow-unprivileged-C.patch b/0001-ZEN-Add-sysctl-and-CONFIG-to-disallow-unprivileged-C.patch
index 79dab97ee81a..73e35ef52bf5 100644
--- a/0001-ZEN-Add-sysctl-and-CONFIG-to-disallow-unprivileged-C.patch
+++ b/0001-ZEN-Add-sysctl-and-CONFIG-to-disallow-unprivileged-C.patch
@@ -1,7 +1,7 @@
-From f8f830397db175f686669b8b36755a6e5d5c3f03 Mon Sep 17 00:00:00 2001
+From fa17daad7209d62169553ce6336ef29ba4748049 Mon Sep 17 00:00:00 2001
 From: "Jan Alexander Steffens (heftig)" <jan.steffens@gmail.com>
 Date: Mon, 16 Sep 2019 04:53:20 +0200
-Subject: [PATCH 1/2] ZEN: Add sysctl and CONFIG to disallow unprivileged
+Subject: [PATCH 1/8] ZEN: Add sysctl and CONFIG to disallow unprivileged
  CLONE_NEWUSER
 
 Our default behavior continues to match the vanilla kernel.
@@ -150,5 +150,5 @@ index 9a4b980d695b..4388ca13ea3f 100644
  static DEFINE_MUTEX(userns_state_mutex);
 -- 
-2.31.1
+2.32.0
diff --git a/0002-x86-setup-Consolidate-early-memory-reservations.patch b/0002-x86-setup-Consolidate-early-memory-reservations.patch
new file mode 100644
index 000000000000..20c380797611
--- /dev/null
+++ b/0002-x86-setup-Consolidate-early-memory-reservations.patch
@@ -0,0 +1,188 @@
+From 56e6bb0fe2b790adda81851794409faa533e521c Mon Sep 17 00:00:00 2001
+From: Mike Rapoport <rppt@linux.ibm.com>
+Date: Tue, 2 Mar 2021 12:04:05 +0200
+Subject: [PATCH 2/8] x86/setup: Consolidate early memory reservations
+
+The early reservations of memory areas used by the firmware, bootloader,
+kernel text and data are spread over setup_arch(). Moreover, some of them
+happen *after* memblock allocations, e.g trim_platform_memory_ranges() and
+trim_low_memory_range() are called after reserve_real_mode() that allocates
+memory.
+
+There was no corruption of these memory regions because memblock always
+allocates memory either from the end of memory (in top-down mode) or above
+the kernel image (in bottom-up mode). However, the bottom up mode is going
+to be updated to span the entire memory [1] to avoid limitations caused by
+KASLR.
+
+Consolidate early memory reservations in a dedicated function to improve
+robustness against future changes. Having the early reservations in one
+place also makes it clearer what memory must be reserved before memblock
+allocations are allowed.
+
+Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Reviewed-by: Baoquan He <bhe@redhat.com>
+Acked-by: Borislav Petkov <bp@suse.de>
+Acked-by: David Hildenbrand <david@redhat.com>
+Link: [1] https://lore.kernel.org/lkml/20201217201214.3414100-2-guro@fb.com
+Link: https://lkml.kernel.org/r/20210302100406.22059-2-rppt@kernel.org
+---
+ arch/x86/kernel/setup.c | 92 ++++++++++++++++++++---------------------
+ 1 file changed, 44 insertions(+), 48 deletions(-)
+
+diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
+index e79f21d13a0d..420d881da2bd 100644
+--- a/arch/x86/kernel/setup.c
++++ b/arch/x86/kernel/setup.c
+@@ -646,18 +646,6 @@ static void __init trim_snb_memory(void)
+ 	}
+ }
+ 
+-/*
+- * Here we put platform-specific memory range workarounds, i.e.
+- * memory known to be corrupt or otherwise in need to be reserved on
+- * specific platforms.
+- *
+- * If this gets used more widely it could use a real dispatch mechanism.
+- */
+-static void __init trim_platform_memory_ranges(void)
+-{
+-	trim_snb_memory();
+-}
+-
+ static void __init trim_bios_range(void)
+ {
+ 	/*
+@@ -730,7 +718,38 @@ static void __init trim_low_memory_range(void)
+ {
+ 	memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE));
+ }
+-
++
++static void __init early_reserve_memory(void)
++{
++	/*
++	 * Reserve the memory occupied by the kernel between _text and
++	 * __end_of_kernel_reserve symbols. Any kernel sections after the
++	 * __end_of_kernel_reserve symbol must be explicitly reserved with a
++	 * separate memblock_reserve() or they will be discarded.
++	 */
++	memblock_reserve(__pa_symbol(_text),
++			 (unsigned long)__end_of_kernel_reserve - (unsigned long)_text);
++
++	/*
++	 * Make sure page 0 is always reserved because on systems with
++	 * L1TF its contents can be leaked to user processes.
++	 */
++	memblock_reserve(0, PAGE_SIZE);
++
++	early_reserve_initrd();
++
++	if (efi_enabled(EFI_BOOT))
++		efi_memblock_x86_reserve_range();
++
++	memblock_x86_reserve_range_setup_data();
++
++	reserve_ibft_region();
++	reserve_bios_regions();
++
++	trim_snb_memory();
++	trim_low_memory_range();
++}
++
+ /*
+  * Dump out kernel offset information on panic.
+  */
+@@ -765,29 +784,6 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
+ 
+ void __init setup_arch(char **cmdline_p)
+ {
+-	/*
+-	 * Reserve the memory occupied by the kernel between _text and
+-	 * __end_of_kernel_reserve symbols. Any kernel sections after the
+-	 * __end_of_kernel_reserve symbol must be explicitly reserved with a
+-	 * separate memblock_reserve() or they will be discarded.
+-	 */
+-	memblock_reserve(__pa_symbol(_text),
+-			 (unsigned long)__end_of_kernel_reserve - (unsigned long)_text);
+-
+-	/*
+-	 * Make sure page 0 is always reserved because on systems with
+-	 * L1TF its contents can be leaked to user processes.
+-	 */
+-	memblock_reserve(0, PAGE_SIZE);
+-
+-	early_reserve_initrd();
+-
+-	/*
+-	 * At this point everything still needed from the boot loader
+-	 * or BIOS or kernel text should be early reserved or marked not
+-	 * RAM in e820. All other memory is free game.
+-	 */
+-
+ #ifdef CONFIG_X86_32
+ 	memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
+ 
+@@ -911,8 +907,18 @@
+ 
+ 	parse_early_param();
+ 
+-	if (efi_enabled(EFI_BOOT))
+-		efi_memblock_x86_reserve_range();
++	/*
++	 * Do some memory reservations *before* memory is added to
++	 * memblock, so memblock allocations won't overwrite it.
++	 * Do it after early param, so we could get (unlikely) panic from
++	 * serial.
++	 *
++	 * After this point everything still needed from the boot loader or
++	 * firmware or kernel text should be early reserved or marked not
++	 * RAM in e820. All other memory is free game.
++	 */
++	early_reserve_memory();
++
+ #ifdef CONFIG_MEMORY_HOTPLUG
+ 	/*
+ 	 * Memory used by the kernel cannot be hot-removed because Linux
+@@ -939,9 +945,6 @@
+ 
+ 	x86_report_nx();
+ 
+-	/* after early param, so could get panic from serial */
+-	memblock_x86_reserve_range_setup_data();
+-
+ 	if (acpi_mps_check()) {
+ #ifdef CONFIG_X86_LOCAL_APIC
+ 		disable_apic = 1;
+@@ -1033,8 +1036,6 @@
+ 	 */
+ 	find_smp_config();
+ 
+-	reserve_ibft_region();
+-
+ 	early_alloc_pgt_buf();
+ 
+ 	/*
+@@ -1055,8 +1056,6 @@
+ 	 */
+ 	sev_setup_arch();
+ 
+-	reserve_bios_regions();
+-
+ 	efi_fake_memmap();
+ 	efi_find_mirror();
+ 	efi_esrt_init();
+@@ -1082,9 +1081,6 @@
+ 
+ 	reserve_real_mode();
+ 
+-	trim_platform_memory_ranges();
+-	trim_low_memory_range();
+-
+ 	init_mem_mapping();
+ 
+ 	idt_setup_early_pf();
+-- 
+2.32.0
+
diff --git a/0003-bfq-lucjan-r2K210602.patch b/0003-bfq-lucjan-r2K210602.patch
deleted file mode 100644
index b1fbccc60879..000000000000
--- a/0003-bfq-lucjan-r2K210602.patch
+++ /dev/null
@@ -1,1534 +0,0 @@
-diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
-index b791e2041..ede8a0f0e 100644
---- a/block/bfq-cgroup.c
-+++ b/block/bfq-cgroup.c
-@@ -224,7 +224,7 @@ void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
- {
- 	blkg_rwstat_add(&bfqg->stats.queued, op, 1);
- 	bfqg_stats_end_empty_time(&bfqg->stats);
--	if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
-+	if (!(bfqq == bfqg->bfqd->in_service_queue))
- 		bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
- }
- 
-@@ -309,8 +309,7 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
- {
- 	struct bfq_entity *group_entity = bfqq->entity.parent;
- 
--	return group_entity ? container_of(group_entity, struct bfq_group,
--					   entity) :
-+	return group_entity ?
bfq_entity_to_bfqg(group_entity) : - bfqq->bfqd->root_group; - } - -@@ -427,6 +426,7 @@ void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg) - - entity->weight = entity->new_weight; - entity->orig_weight = entity->new_weight; -+ entity->prio_changed = 0; - if (bfqq) { - bfqq->ioprio = bfqq->new_ioprio; - bfqq->ioprio_class = bfqq->new_ioprio_class; -@@ -547,6 +547,8 @@ static void bfq_pd_init(struct blkg_policy_data *pd) - - entity->orig_weight = entity->weight = entity->new_weight = d->weight; - entity->my_sched_data = &bfqg->sched_data; -+ entity->last_bfqq_created = NULL; -+ - bfqg->my_entity = entity; /* - * the root_group's will be set to NULL - * in bfq_init_queue() -@@ -610,8 +612,7 @@ struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, - */ - entity = &bfqg->entity; - for_each_entity(entity) { -- struct bfq_group *curr_bfqg = container_of(entity, -- struct bfq_group, entity); -+ struct bfq_group *curr_bfqg = bfq_entity_to_bfqg(entity); - if (curr_bfqg != bfqd->root_group) { - parent = bfqg_parent(curr_bfqg); - if (!parent) -@@ -1431,15 +1432,11 @@ void bfqg_and_blkg_put(struct bfq_group *bfqg) {} - struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) - { - struct bfq_group *bfqg; -- int i; - - bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); - if (!bfqg) - return NULL; - -- for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) -- bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; -- - return bfqg; - } - #endif /* CONFIG_BFQ_GROUP_IOSCHED */ -diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c -index bc319931d..695421b08 100644 ---- a/block/bfq-iosched.c -+++ b/block/bfq-iosched.c -@@ -372,9 +372,38 @@ struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) - return bic->bfqq[is_sync]; - } - -+static void bfq_put_stable_ref(struct bfq_queue *bfqq); -+ - void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync) - { -+ /* -+ * If bfqq != NULL, then a non-stable queue merge between -+ * bic->bfqq and bfqq is happening here. This causes troubles -+ * in the following case: bic->bfqq has also been scheduled -+ * for a possible stable merge with bic->stable_merge_bfqq, -+ * and bic->stable_merge_bfqq == bfqq happens to -+ * hold. Troubles occur because bfqq may then undergo a split, -+ * thereby becoming eligible for a stable merge. Yet, if -+ * bic->stable_merge_bfqq points exactly to bfqq, then bfqq -+ * would be stably merged with itself. To avoid this anomaly, -+ * we cancel the stable merge if -+ * bic->stable_merge_bfqq == bfqq. -+ */ - bic->bfqq[is_sync] = bfqq; -+ -+ if (bfqq && bic->stable_merge_bfqq == bfqq) { -+ /* -+ * Actually, these same instructions are executed also -+ * in bfq_setup_cooperator, in case of abort or actual -+ * execution of a stable merge. We could avoid -+ * repeating these instructions there too, but if we -+ * did so, we would nest even more complexity in this -+ * function. 
-+ */ -+ bfq_put_stable_ref(bic->stable_merge_bfqq); -+ -+ bic->stable_merge_bfqq = NULL; -+ } - } - - struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) -@@ -1075,7 +1104,7 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - static int bfqq_process_refs(struct bfq_queue *bfqq) - { - return bfqq->ref - bfqq->allocated - bfqq->entity.on_st_or_in_serv - -- (bfqq->weight_counter != NULL); -+ (bfqq->weight_counter != NULL) - bfqq->stable_ref; - } - - /* Empty burst list and add just bfqq (see comments on bfq_handle_burst) */ -@@ -2220,7 +2249,7 @@ static void bfq_remove_request(struct request_queue *q, - bfqd->queued--; - elv_rb_del(&bfqq->sort_list, rq); - -- elv_rqhash_del(q, rq); -+ elv_rqhash_del(rq); - if (q->last_merge == rq) - q->last_merge = NULL; - -@@ -2288,9 +2317,9 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, - - ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); - -+ spin_unlock_irq(&bfqd->lock); - if (free) - blk_mq_free_request(free); -- spin_unlock_irq(&bfqd->lock); - - return ret; - } -@@ -2376,7 +2405,7 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, - *next_bfqq = bfq_init_rq(next); - - if (!bfqq) -- return; -+ goto remove; - - /* - * If next and rq belong to the same bfq_queue and next is older -@@ -2399,6 +2428,14 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, - bfqq->next_rq = rq; - - bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); -+remove: -+ /* Merged request may be in the IO scheduler. Remove it. */ -+ if (!RB_EMPTY_NODE(&next->rb_node)) { -+ bfq_remove_request(next->q, next); -+ if (next_bfqq) -+ bfqg_stats_update_io_remove(bfqq_group(next_bfqq), -+ next->cmd_flags); -+ } - } - - /* Must be called with bfqq != NULL */ -@@ -2627,6 +2664,9 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, - return true; - } - -+static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq); -+ - /* - * Attempt to schedule a merge of bfqq with the currently in-service - * queue or with a close queue among the scheduled queues. Return -@@ -2649,10 +2689,55 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, - */ - static struct bfq_queue * - bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, -- void *io_struct, bool request) -+ void *io_struct, bool request, struct bfq_io_cq *bic) - { - struct bfq_queue *in_service_bfqq, *new_bfqq; - -+ /* -+ * Check delayed stable merge for rotational or non-queueing -+ * devs. For this branch to be executed, bfqq must not be -+ * currently merged with some other queue (i.e., bfqq->bic -+ * must be non null). If we considered also merged queues, -+ * then we should also check whether bfqq has already been -+ * merged with bic->stable_merge_bfqq. But this would be -+ * costly and complicated. 
-+ */ -+ if (unlikely(!bfqd->nonrot_with_queueing)) { -+ /* -+ * Make sure also that bfqq is sync, because -+ * bic->stable_merge_bfqq may point to some queue (for -+ * stable merging) also if bic is associated with a -+ * sync queue, but bfqq is async -+ */ -+ if (bfq_bfqq_sync(bfqq) && bic->stable_merge_bfqq && -+ !bfq_bfqq_just_created(bfqq) && -+ time_is_before_jiffies(bfqq->split_time + -+ msecs_to_jiffies(200))) { -+ struct bfq_queue *stable_merge_bfqq = -+ bic->stable_merge_bfqq; -+ int proc_ref = min(bfqq_process_refs(bfqq), -+ bfqq_process_refs(stable_merge_bfqq)); -+ -+ /* deschedule stable merge, because done or aborted here */ -+ bfq_put_stable_ref(stable_merge_bfqq); -+ -+ bic->stable_merge_bfqq = NULL; -+ -+ if (!idling_boosts_thr_without_issues(bfqd, bfqq) && -+ proc_ref > 0) { -+ /* next function will take at least one ref */ -+ struct bfq_queue *new_bfqq = -+ bfq_setup_merge(bfqq, stable_merge_bfqq); -+ -+ bic->stably_merged = true; -+ if (new_bfqq && new_bfqq->bic) -+ new_bfqq->bic->stably_merged = true; -+ return new_bfqq; -+ } else -+ return NULL; -+ } -+ } -+ - /* - * Do not perform queue merging if the device is non - * rotational and performs internal queueing. In fact, such a -@@ -2794,6 +2879,17 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) - } - } - -+ -+static void -+bfq_reassign_last_bfqq(struct bfq_queue *cur_bfqq, struct bfq_queue *new_bfqq) -+{ -+ if (cur_bfqq->entity.parent && -+ cur_bfqq->entity.parent->last_bfqq_created == cur_bfqq) -+ cur_bfqq->entity.parent->last_bfqq_created = new_bfqq; -+ else if (cur_bfqq->bfqd && cur_bfqq->bfqd->last_bfqq_created == cur_bfqq) -+ cur_bfqq->bfqd->last_bfqq_created = new_bfqq; -+} -+ - void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq) - { - /* -@@ -2811,6 +2907,8 @@ void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfqq != bfqd->in_service_queue) - bfq_del_bfqq_busy(bfqd, bfqq, false); - -+ bfq_reassign_last_bfqq(bfqq, NULL); -+ - bfq_put_queue(bfqq); - } - -@@ -2827,6 +2925,29 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, - bfq_mark_bfqq_IO_bound(new_bfqq); - bfq_clear_bfqq_IO_bound(bfqq); - -+ /* -+ * The processes associated with bfqq are cooperators of the -+ * processes associated with new_bfqq. So, if bfqq has a -+ * waker, then assume that all these processes will be happy -+ * to let bfqq's waker freely inject I/O when they have no -+ * I/O. -+ */ -+ if (bfqq->waker_bfqq && !new_bfqq->waker_bfqq && -+ bfqq->waker_bfqq != new_bfqq) { -+ new_bfqq->waker_bfqq = bfqq->waker_bfqq; -+ new_bfqq->tentative_waker_bfqq = NULL; -+ -+ /* -+ * If the waker queue disappears, then -+ * new_bfqq->waker_bfqq must be reset. So insert -+ * new_bfqq into the woken_list of the waker. See -+ * bfq_check_waker for details. -+ */ -+ hlist_add_head(&new_bfqq->woken_list_node, -+ &new_bfqq->waker_bfqq->woken_list); -+ -+ } -+ - /* - * If bfqq is weight-raised, then let new_bfqq inherit - * weight-raising. To reduce false positives, neglect the case -@@ -2884,6 +3005,9 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, - */ - new_bfqq->pid = -1; - bfqq->bic = NULL; -+ -+ bfq_reassign_last_bfqq(bfqq, new_bfqq); -+ - bfq_release_process_ref(bfqd, bfqq); - } - -@@ -2911,7 +3035,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, - * We take advantage of this function to perform an early merge - * of the queues of possible cooperating processes. 
- */ -- new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false, bfqd->bio_bic); - if (new_bfqq) { - /* - * bic still points to bfqq, then it has not yet been -@@ -4496,9 +4620,15 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - bfq_bfqq_busy(bfqq->bic->bfqq[0]) && - bfqq->bic->bfqq[0]->next_rq ? - bfqq->bic->bfqq[0] : NULL; -+ struct bfq_queue *blocked_bfqq = -+ !hlist_empty(&bfqq->woken_list) ? -+ container_of(bfqq->woken_list.first, -+ struct bfq_queue, -+ woken_list_node) -+ : NULL; - - /* -- * The next three mutually-exclusive ifs decide -+ * The next four mutually-exclusive ifs decide - * whether to try injection, and choose the queue to - * pick an I/O request from. - * -@@ -4531,7 +4661,15 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - * next bfqq's I/O is brought forward dramatically, - * for it is not blocked for milliseconds. - * -- * The third if checks whether bfqq is a queue for -+ * The third if checks whether there is a queue woken -+ * by bfqq, and currently with pending I/O. Such a -+ * woken queue does not steal bandwidth from bfqq, -+ * because it remains soon without I/O if bfqq is not -+ * served. So there is virtually no risk of loss of -+ * bandwidth for bfqq if this woken queue has I/O -+ * dispatched while bfqq is waiting for new I/O. -+ * -+ * The fourth if checks whether bfqq is a queue for - * which it is better to avoid injection. It is so if - * bfqq delivers more throughput when served without - * any further I/O from other queues in the middle, or -@@ -4551,11 +4689,11 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - * bfq_update_has_short_ttime(), it is rather likely - * that, if I/O is being plugged for bfqq and the - * waker queue has pending I/O requests that are -- * blocking bfqq's I/O, then the third alternative -+ * blocking bfqq's I/O, then the fourth alternative - * above lets the waker queue get served before the - * I/O-plugging timeout fires. So one may deem the - * second alternative superfluous. It is not, because -- * the third alternative may be way less effective in -+ * the fourth alternative may be way less effective in - * case of a synchronization. For two main - * reasons. First, throughput may be low because the - * inject limit may be too low to guarantee the same -@@ -4564,7 +4702,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - * guarantees (the second alternative unconditionally - * injects a pending I/O request of the waker queue - * for each bfq_dispatch_request()). Second, with the -- * third alternative, the duration of the plugging, -+ * fourth alternative, the duration of the plugging, - * i.e., the time before bfqq finally receives new I/O, - * may not be minimized, because the waker queue may - * happen to be served only after other queues. 
-@@ -4582,6 +4720,14 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - bfq_bfqq_budget_left(bfqq->waker_bfqq) - ) - bfqq = bfqq->waker_bfqq; -+ else if (blocked_bfqq && -+ bfq_bfqq_busy(blocked_bfqq) && -+ blocked_bfqq->next_rq && -+ bfq_serv_to_charge(blocked_bfqq->next_rq, -+ blocked_bfqq) <= -+ bfq_bfqq_budget_left(blocked_bfqq) -+ ) -+ bfqq = blocked_bfqq; - else if (!idling_boosts_thr_without_issues(bfqd, bfqq) && - (bfqq->wr_coeff == 1 || bfqd->wr_busy_queues > 1 || - !bfq_bfqq_has_short_ttime(bfqq))) -@@ -4813,6 +4959,17 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - if (!bfqq) - goto exit; - -+ /* -+ * Here, the IO depth of queues belong to CLASS_IDLE is limited -+ * to 1, so that it can avoid introducing a larger tail latency -+ * under a device with a larger IO depth. Although limiting the -+ * IO depth may reduce the performance of idle_class, it is -+ * generally not a big problem, because idle_class usually -+ * does not have strict performance requirements. -+ */ -+ if (bfq_class_idle(bfqq) && bfqq->dispatched) -+ goto exit; -+ - rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq); - - if (rq) { -@@ -4988,6 +5145,12 @@ void bfq_put_queue(struct bfq_queue *bfqq) - bfqg_and_blkg_put(bfqg); - } - -+static void bfq_put_stable_ref(struct bfq_queue *bfqq) -+{ -+ bfqq->stable_ref--; -+ bfq_put_queue(bfqq); -+} -+ - static void bfq_put_cooperator(struct bfq_queue *bfqq) - { - struct bfq_queue *__bfqq, *next; -@@ -5044,6 +5207,24 @@ static void bfq_exit_icq(struct io_cq *icq) - { - struct bfq_io_cq *bic = icq_to_bic(icq); - -+ if (bic->stable_merge_bfqq) { -+ struct bfq_data *bfqd = bic->stable_merge_bfqq->bfqd; -+ -+ /* -+ * bfqd is NULL if scheduler already exited, and in -+ * that case this is the last time bfqq is accessed. 
-+ */ -+ if (bfqd) { -+ unsigned long flags; -+ -+ spin_lock_irqsave(&bfqd->lock, flags); -+ bfq_put_stable_ref(bic->stable_merge_bfqq); -+ spin_unlock_irqrestore(&bfqd->lock, flags); -+ } else { -+ bfq_put_stable_ref(bic->stable_merge_bfqq); -+ } -+ } -+ - bfq_exit_icq_bfqq(bic, true); - bfq_exit_icq_bfqq(bic, false); - } -@@ -5104,7 +5285,8 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) - - static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - struct bio *bio, bool is_sync, -- struct bfq_io_cq *bic); -+ struct bfq_io_cq *bic, -+ bool respawn); - - static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) - { -@@ -5124,7 +5306,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) - bfqq = bic_to_bfqq(bic, false); - if (bfqq) { - bfq_release_process_ref(bfqd, bfqq); -- bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); -+ bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic, true); - bic_set_bfqq(bic, bfqq, false); - } - -@@ -5167,6 +5349,8 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - /* set end request to minus infinity from now */ - bfqq->ttime.last_end_request = now_ns + 1; - -+ bfqq->creation_time = jiffies; -+ - bfqq->io_start_time = now_ns; - - bfq_mark_bfqq_IO_bound(bfqq); -@@ -5216,9 +5400,156 @@ static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, - } - } - -+static struct bfq_queue * -+bfq_do_early_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic, -+ struct bfq_queue *last_bfqq_created) -+{ -+ struct bfq_queue *new_bfqq = -+ bfq_setup_merge(bfqq, last_bfqq_created); -+ -+ if (!new_bfqq) -+ return bfqq; -+ -+ if (new_bfqq->bic) -+ new_bfqq->bic->stably_merged = true; -+ bic->stably_merged = true; -+ -+ /* -+ * Reusing merge functions. This implies that -+ * bfqq->bic must be set too, for -+ * bfq_merge_bfqqs to correctly save bfqq's -+ * state before killing it. -+ */ -+ bfqq->bic = bic; -+ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); -+ -+ return new_bfqq; -+} -+ -+/* -+ * Many throughput-sensitive workloads are made of several parallel -+ * I/O flows, with all flows generated by the same application, or -+ * more generically by the same task (e.g., system boot). The most -+ * counterproductive action with these workloads is plugging I/O -+ * dispatch when one of the bfq_queues associated with these flows -+ * remains temporarily empty. -+ * -+ * To avoid this plugging, BFQ has been using a burst-handling -+ * mechanism for years now. This mechanism has proven effective for -+ * throughput, and not detrimental for service guarantees. The -+ * following function pushes this mechanism a little bit further, -+ * basing on the following two facts. -+ * -+ * First, all the I/O flows of a the same application or task -+ * contribute to the execution/completion of that common application -+ * or task. So the performance figures that matter are total -+ * throughput of the flows and task-wide I/O latency. In particular, -+ * these flows do not need to be protected from each other, in terms -+ * of individual bandwidth or latency. -+ * -+ * Second, the above fact holds regardless of the number of flows. -+ * -+ * Putting these two facts together, this commits merges stably the -+ * bfq_queues associated with these I/O flows, i.e., with the -+ * processes that generate these IO/ flows, regardless of how many the -+ * involved processes are. 
-+ * -+ * To decide whether a set of bfq_queues is actually associated with -+ * the I/O flows of a common application or task, and to merge these -+ * queues stably, this function operates as follows: given a bfq_queue, -+ * say Q2, currently being created, and the last bfq_queue, say Q1, -+ * created before Q2, Q2 is merged stably with Q1 if -+ * - very little time has elapsed since when Q1 was created -+ * - Q2 has the same ioprio as Q1 -+ * - Q2 belongs to the same group as Q1 -+ * -+ * Merging bfq_queues also reduces scheduling overhead. A fio test -+ * with ten random readers on /dev/nullb shows a throughput boost of -+ * 40%, with a quadcore. Since BFQ's execution time amounts to ~50% of -+ * the total per-request processing time, the above throughput boost -+ * implies that BFQ's overhead is reduced by more than 50%. -+ * -+ * This new mechanism most certainly obsoletes the current -+ * burst-handling heuristics. We keep those heuristics for the moment. -+ */ -+static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic) -+{ -+ struct bfq_queue **source_bfqq = bfqq->entity.parent ? -+ &bfqq->entity.parent->last_bfqq_created : -+ &bfqd->last_bfqq_created; -+ -+ struct bfq_queue *last_bfqq_created = *source_bfqq; -+ -+ /* -+ * If last_bfqq_created has not been set yet, then init it. If -+ * it has been set already, but too long ago, then move it -+ * forward to bfqq. Finally, move also if bfqq belongs to a -+ * different group than last_bfqq_created, or if bfqq has a -+ * different ioprio or ioprio_class. If none of these -+ * conditions holds true, then try an early stable merge or -+ * schedule a delayed stable merge. -+ * -+ * A delayed merge is scheduled (instead of performing an -+ * early merge), in case bfqq might soon prove to be more -+ * throughput-beneficial if not merged. Currently this is -+ * possible only if bfqd is rotational with no queueing. For -+ * such a drive, not merging bfqq is better for throughput if -+ * bfqq happens to contain sequential I/O. So, we wait a -+ * little bit for enough I/O to flow through bfqq. After that, -+ * if such an I/O is sequential, then the merge is -+ * canceled. Otherwise the merge is finally performed. -+ */ -+ if (!last_bfqq_created || -+ time_before(last_bfqq_created->creation_time + -+ bfqd->bfq_burst_interval, -+ bfqq->creation_time) || -+ bfqq->entity.parent != last_bfqq_created->entity.parent || -+ bfqq->ioprio != last_bfqq_created->ioprio || -+ bfqq->ioprio_class != last_bfqq_created->ioprio_class) -+ *source_bfqq = bfqq; -+ else if (time_after_eq(last_bfqq_created->creation_time + -+ bfqd->bfq_burst_interval, -+ bfqq->creation_time)) { -+ if (likely(bfqd->nonrot_with_queueing)) -+ /* -+ * With this type of drive, leaving -+ * bfqq alone may provide no -+ * throughput benefits compared with -+ * merging bfqq. So merge bfqq now. -+ */ -+ bfqq = bfq_do_early_stable_merge(bfqd, bfqq, -+ bic, -+ last_bfqq_created); -+ else { /* schedule tentative stable merge */ -+ /* -+ * get reference on last_bfqq_created, -+ * to prevent it from being freed, -+ * until we decide whether to merge -+ */ -+ last_bfqq_created->ref++; -+ /* -+ * need to keep track of stable refs, to -+ * compute process refs correctly -+ */ -+ last_bfqq_created->stable_ref++; -+ /* -+ * Record the bfqq to merge to. 
-+ */ -+ bic->stable_merge_bfqq = last_bfqq_created; -+ } -+ } -+ -+ return bfqq; -+} -+ -+ - static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - struct bio *bio, bool is_sync, -- struct bfq_io_cq *bic) -+ struct bfq_io_cq *bic, -+ bool respawn) - { - const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); - const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); -@@ -5276,7 +5607,10 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - - out: - bfqq->ref++; /* get a process reference to this queue */ -- bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); -+ -+ if (bfqq != &bfqd->oom_bfqq && is_sync && !respawn) -+ bfqq = bfq_do_or_sched_stable_merge(bfqd, bfqq, bic); -+ - rcu_read_unlock(); - return bfqq; - } -@@ -5526,7 +5860,8 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - { - struct bfq_queue *bfqq = RQ_BFQQ(rq), -- *new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); -+ *new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true, -+ RQ_BIC(rq)); - bool waiting, idle_timer_disabled = false; - - if (new_bfqq) { -@@ -5615,14 +5950,16 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - struct bfq_queue *bfqq; - bool idle_timer_disabled = false; - unsigned int cmd_flags; -+ LIST_HEAD(free); - - #ifdef CONFIG_BFQ_GROUP_IOSCHED - if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio) - bfqg_stats_update_legacy_io(q, rq); - #endif - spin_lock_irq(&bfqd->lock); -- if (blk_mq_sched_try_insert_merge(q, rq)) { -+ if (blk_mq_sched_try_insert_merge(q, rq, &free)) { - spin_unlock_irq(&bfqd->lock); -+ blk_mq_free_requests(&free); - return; - } - -@@ -5632,7 +5969,48 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - - spin_lock_irq(&bfqd->lock); - bfqq = bfq_init_rq(rq); -- if (!bfqq || at_head || blk_rq_is_passthrough(rq)) { -+ -+ /* -+ * Reqs with at_head or passthrough flags set are to be put -+ * directly into dispatch list. Additional case for putting rq -+ * directly into the dispatch queue: the only active -+ * bfq_queues are bfqq and either its waker bfq_queue or one -+ * of its woken bfq_queues. The rationale behind this -+ * additional condition is as follows: -+ * - consider a bfq_queue, say Q1, detected as a waker of -+ * another bfq_queue, say Q2 -+ * - by definition of a waker, Q1 blocks the I/O of Q2, i.e., -+ * some I/O of Q1 needs to be completed for new I/O of Q2 -+ * to arrive. A notable example of waker is journald -+ * - so, Q1 and Q2 are in any respect the queues of two -+ * cooperating processes (or of two cooperating sets of -+ * processes): the goal of Q1's I/O is doing what needs to -+ * be done so that new Q2's I/O can finally be -+ * issued. Therefore, if the service of Q1's I/O is delayed, -+ * then Q2's I/O is delayed too. Conversely, if Q2's I/O is -+ * delayed, the goal of Q1's I/O is hindered. -+ * - as a consequence, if some I/O of Q1/Q2 arrives while -+ * Q2/Q1 is the only queue in service, there is absolutely -+ * no point in delaying the service of such an I/O. 
The -+ * only possible result is a throughput loss -+ * - so, when the above condition holds, the best option is to -+ * have the new I/O dispatched as soon as possible -+ * - the most effective and efficient way to attain the above -+ * goal is to put the new I/O directly in the dispatch -+ * list -+ * - as an additional restriction, Q1 and Q2 must be the only -+ * busy queues for this commit to put the I/O of Q2/Q1 in -+ * the dispatch list. This is necessary, because, if also -+ * other queues are waiting for service, then putting new -+ * I/O directly in the dispatch list may evidently cause a -+ * violation of service guarantees for the other queues -+ */ -+ if (!bfqq || -+ (bfqq != bfqd->in_service_queue && -+ bfqd->in_service_queue != NULL && -+ bfq_tot_busy_queues(bfqd) == 1 + bfq_bfqq_busy(bfqq) && -+ (bfqq->waker_bfqq == bfqd->in_service_queue || -+ bfqd->in_service_queue->waker_bfqq == bfqq)) || at_head) { - if (at_head) - list_add(&rq->queuelist, &bfqd->dispatch); - else -@@ -5772,7 +6150,17 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) - 1UL<<(BFQ_RATE_SHIFT - 10)) - bfq_update_rate_reset(bfqd, NULL); - bfqd->last_completion = now_ns; -- bfqd->last_completed_rq_bfqq = bfqq; -+ /* -+ * Shared queues are likely to receive I/O at a high -+ * rate. This may deceptively let them be considered as wakers -+ * of other queues. But a false waker will unjustly steal -+ * bandwidth to its supposedly woken queue. So considering -+ * also shared queues in the waking mechanism may cause more -+ * control troubles than throughput benefits. Then do not set -+ * last_completed_rq_bfqq to bfqq if bfqq is a shared queue. -+ */ -+ if (!bfq_bfqq_coop(bfqq)) -+ bfqd->last_completed_rq_bfqq = bfqq; - - /* - * If we are waiting to discover whether the request pattern -@@ -6015,6 +6403,7 @@ static void bfq_finish_requeue_request(struct request *rq) - { - struct bfq_queue *bfqq = RQ_BFQQ(rq); - struct bfq_data *bfqd; -+ unsigned long flags; - - /* - * rq either is not associated with any icq, or is an already -@@ -6032,39 +6421,15 @@ static void bfq_finish_requeue_request(struct request *rq) - rq->io_start_time_ns, - rq->cmd_flags); - -+ spin_lock_irqsave(&bfqd->lock, flags); - if (likely(rq->rq_flags & RQF_STARTED)) { -- unsigned long flags; -- -- spin_lock_irqsave(&bfqd->lock, flags); -- - if (rq == bfqd->waited_rq) - bfq_update_inject_limit(bfqd, bfqq); - - bfq_completed_request(bfqq, bfqd); -- bfq_finish_requeue_request_body(bfqq); -- -- spin_unlock_irqrestore(&bfqd->lock, flags); -- } else { -- /* -- * Request rq may be still/already in the scheduler, -- * in which case we need to remove it (this should -- * never happen in case of requeue). And we cannot -- * defer such a check and removal, to avoid -- * inconsistencies in the time interval from the end -- * of this function to the start of the deferred work. -- * This situation seems to occur only in process -- * context, as a consequence of a merge. In the -- * current version of the code, this implies that the -- * lock is held. -- */ -- -- if (!RB_EMPTY_NODE(&rq->rb_node)) { -- bfq_remove_request(rq->q, rq); -- bfqg_stats_update_io_remove(bfqq_group(bfqq), -- rq->cmd_flags); -- } -- bfq_finish_requeue_request_body(bfqq); - } -+ bfq_finish_requeue_request_body(bfqq); -+ spin_unlock_irqrestore(&bfqd->lock, flags); - - /* - * Reset private fields. 
In case of a requeue, this allows -@@ -6129,7 +6494,7 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, - - if (bfqq) - bfq_put_queue(bfqq); -- bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); -+ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, split); - - bic_set_bfqq(bic, bfqq, is_sync); - if (split && is_sync) { -@@ -6250,8 +6615,9 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) - - if (likely(!new_queue)) { - /* If the queue was seeky for too long, break it apart. */ -- if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { -- bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); -+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq) && -+ !bic->stably_merged) { -+ struct bfq_queue *old_bfqq = bfqq; - - /* Update bic before losing reference to bfqq */ - if (bfq_bfqq_in_large_burst(bfqq)) -@@ -6260,11 +6626,24 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) - bfqq = bfq_split_bfqq(bic, bfqq); - split = true; - -- if (!bfqq) -+ if (!bfqq) { - bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, - true, is_sync, - NULL); -- else -+ bfqq->waker_bfqq = old_bfqq->waker_bfqq; -+ bfqq->tentative_waker_bfqq = NULL; -+ -+ /* -+ * If the waker queue disappears, then -+ * new_bfqq->waker_bfqq must be -+ * reset. So insert new_bfqq into the -+ * woken_list of the waker. See -+ * bfq_check_waker for details. -+ */ -+ if (bfqq->waker_bfqq) -+ hlist_add_head(&bfqq->woken_list_node, -+ &bfqq->waker_bfqq->woken_list); -+ } else - bfqq_already_existing = true; - } - } -@@ -6531,9 +6910,11 @@ static void bfq_init_root_group(struct bfq_group *root_group, - root_group->bfqd = bfqd; - #endif - root_group->rq_pos_tree = RB_ROOT; -- for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { - root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; -- root_group->sched_data.bfq_class_idle_last_service = jiffies; -+ root_group->sched_data.bfq_class_last_service[i] = jiffies; -+ } -+ root_group->sched_data.class_timeout_last_check = jiffies; - } - - static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) -@@ -6926,6 +7307,7 @@ MODULE_ALIAS("bfq-iosched"); - static int __init bfq_init(void) - { - int ret; -+ char msg[60] = "BFQ I/O-scheduler: BFQ-lucjan v5.12"; - - #ifdef CONFIG_BFQ_GROUP_IOSCHED - ret = blkcg_policy_register(&blkcg_policy_bfq); -@@ -6957,6 +7339,11 @@ static int __init bfq_init(void) - if (ret) - goto slab_kill; - -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ strcat(msg, " (with cgroups support)"); -+#endif -+ pr_info("%s", msg); -+ - return 0; - - slab_kill: -diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h -index b8e793c34..a79796912 100644 ---- a/block/bfq-iosched.h -+++ b/block/bfq-iosched.h -@@ -13,7 +13,7 @@ - #include "blk-cgroup-rwstat.h" - - #define BFQ_IOPRIO_CLASSES 3 --#define BFQ_CL_IDLE_TIMEOUT (HZ/5) -+#define BFQ_CLASS_TIMEOUT (HZ/5) - - #define BFQ_MIN_WEIGHT 1 - #define BFQ_MAX_WEIGHT 1000 -@@ -22,7 +22,6 @@ - #define BFQ_DEFAULT_QUEUE_IOPRIO 4 - - #define BFQ_WEIGHT_LEGACY_DFL 100 --#define BFQ_DEFAULT_GRP_IOPRIO 0 - #define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE - - #define MAX_PID_STR_LENGTH 12 -@@ -97,9 +96,12 @@ struct bfq_sched_data { - struct bfq_entity *next_in_service; - /* array of service trees, one per ioprio_class */ - struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; -- /* last time CLASS_IDLE was served */ -- unsigned long bfq_class_idle_last_service; -- -+ /* last time the class was served */ -+ unsigned long bfq_class_last_service[BFQ_IOPRIO_CLASSES]; -+ /* last 
time class timeout was checked */ -+ unsigned long class_timeout_last_check; -+ /* next index to check class timeout */ -+ unsigned int next_class_index; - }; - - /** -@@ -197,6 +199,9 @@ struct bfq_entity { - - /* flag, set if the entity is counted in groups_with_pending_reqs */ - bool in_groups_with_pending_reqs; -+ -+ /* last child queue of entity created (for non-leaf entities) */ -+ struct bfq_queue *last_bfqq_created; - }; - - struct bfq_group; -@@ -230,6 +235,8 @@ struct bfq_ttime { - struct bfq_queue { - /* reference counter */ - int ref; -+ /* counter of references from other queues for delayed stable merge */ -+ int stable_ref; - /* parent bfq_data */ - struct bfq_data *bfqd; - -@@ -365,6 +372,8 @@ struct bfq_queue { - - unsigned long first_IO_time; /* time of first I/O for this queue */ - -+ unsigned long creation_time; /* when this queue is created */ -+ - /* max service rate measured so far */ - u32 max_service_rate; - -@@ -454,6 +463,11 @@ struct bfq_io_cq { - u64 saved_last_serv_time_ns; - unsigned int saved_inject_limit; - unsigned long saved_decrease_time_jif; -+ -+ /* candidate queue for a stable merge (due to close creation time) */ -+ struct bfq_queue *stable_merge_bfqq; -+ -+ bool stably_merged; /* non splittable if true */ - }; - - /** -@@ -578,6 +592,9 @@ struct bfq_data { - /* bfqq owning the last completed rq */ - struct bfq_queue *last_completed_rq_bfqq; - -+ /* last bfqq created, among those in the root group */ -+ struct bfq_queue *last_bfqq_created; -+ - /* time of last transition from empty to non-empty (ns) */ - u64 last_empty_occupied_ns; - -@@ -914,7 +931,7 @@ struct bfq_group { - struct bfq_entity entity; - struct bfq_sched_data sched_data; - -- void *bfqd; -+ struct bfq_data *bfqd; - - struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; - struct bfq_queue *async_idle_bfqq; -@@ -940,8 +957,6 @@ struct bfq_group { - }; - #endif - --struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); -- - /* --------------- main algorithm interface ----------------- */ - - #define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ -@@ -1036,6 +1051,7 @@ extern struct blkcg_policy blkcg_policy_bfq; - - struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq); - struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); -+struct bfq_group *bfq_entity_to_bfqg(struct bfq_entity *entity); - unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd); - struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity); - struct bfq_entity *bfq_entity_of(struct rb_node *node); -diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c -index 070e34a7f..7e48ed5b7 100644 ---- a/block/bfq-wf2q.c -+++ b/block/bfq-wf2q.c -@@ -149,7 +149,7 @@ struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) - if (!group_entity) - group_entity = &bfqq->bfqd->root_group->entity; - -- return container_of(group_entity, struct bfq_group, entity); -+ return bfq_entity_to_bfqg(group_entity); - } - - /* -@@ -208,7 +208,7 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) - if (bfq_entity_to_bfqq(entity)) - return true; - -- bfqg = container_of(entity, struct bfq_group, entity); -+ bfqg = bfq_entity_to_bfqg(entity); - - /* - * The field active_entities does not always contain the -@@ -266,6 +266,15 @@ struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) - return bfqq; - } - -+struct bfq_group *bfq_entity_to_bfqg(struct bfq_entity *entity) -+{ -+ struct bfq_group *bfqg = NULL; -+ -+ if (entity->my_sched_data) -+ bfqg = container_of(entity, struct bfq_group, 
entity); -+ -+ return bfqg; -+} - - /** - * bfq_delta - map service into the virtual time domain. -@@ -489,7 +498,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, - #ifdef CONFIG_BFQ_GROUP_IOSCHED - sd = entity->sched_data; - bfqg = container_of(sd, struct bfq_group, sched_data); -- bfqd = (struct bfq_data *)bfqg->bfqd; -+ bfqd = bfqg->bfqd; - #endif - if (bfqq) - list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); -@@ -518,8 +527,9 @@ unsigned short bfq_ioprio_to_weight(int ioprio) - */ - static unsigned short bfq_weight_to_ioprio(int weight) - { -- return max_t(int, 0, -- IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight); -+ int ioprio = IOPRIO_BE_NR - weight / BFQ_WEIGHT_CONVERSION_COEFF; -+ -+ return ioprio < 0 ? 0 : min_t(int, ioprio, IOPRIO_BE_NR - 1); - } - - static void bfq_get_entity(struct bfq_entity *entity) -@@ -588,7 +598,7 @@ static void bfq_active_extract(struct bfq_service_tree *st, - #ifdef CONFIG_BFQ_GROUP_IOSCHED - sd = entity->sched_data; - bfqg = container_of(sd, struct bfq_group, sched_data); -- bfqd = (struct bfq_data *)bfqg->bfqd; -+ bfqd = bfqg->bfqd; - #endif - if (bfqq) - list_del(&bfqq->bfqq_list); -@@ -734,7 +744,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, - else { - sd = entity->my_sched_data; - bfqg = container_of(sd, struct bfq_group, sched_data); -- bfqd = (struct bfq_data *)bfqg->bfqd; -+ bfqd = bfqg->bfqd; - } - #endif - -@@ -872,7 +882,7 @@ void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq, - unsigned long time_ms) - { - struct bfq_entity *entity = &bfqq->entity; -- unsigned long timeout_ms = jiffies_to_msecs(bfq_timeout); -+ unsigned long timeout_ms = jiffies_to_msecs(bfqd->bfq_timeout); - unsigned long bounded_time_ms = min(time_ms, timeout_ms); - int serv_to_charge_for_time = - (bfqd->bfq_max_budget * bounded_time_ms) / timeout_ms; -@@ -1001,8 +1011,7 @@ static void __bfq_activate_entity(struct bfq_entity *entity, - - #ifdef CONFIG_BFQ_GROUP_IOSCHED - if (!bfq_entity_to_bfqq(entity)) { /* bfq_group */ -- struct bfq_group *bfqg = -- container_of(entity, struct bfq_group, entity); -+ struct bfq_group *bfqg = bfq_entity_to_bfqg(entity); - struct bfq_data *bfqd = bfqg->bfqd; - - if (!entity->in_groups_with_pending_reqs) { -@@ -1160,6 +1169,7 @@ bool __bfq_deactivate_entity(struct bfq_entity *entity, bool ins_into_idle_tree) - { - struct bfq_sched_data *sd = entity->sched_data; - struct bfq_service_tree *st; -+ int idx = bfq_class_idx(entity); - bool is_in_service; - - if (!entity->on_st_or_in_serv) /* -@@ -1199,6 +1209,7 @@ bool __bfq_deactivate_entity(struct bfq_entity *entity, bool ins_into_idle_tree) - else - bfq_idle_insert(st, entity); - -+ sd->bfq_class_last_service[idx] = jiffies; - return true; - } - -@@ -1427,6 +1438,45 @@ __bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service) - return entity; - } - -+static int bfq_select_next_class(struct bfq_sched_data *sd) -+{ -+ struct bfq_service_tree *st = sd->service_tree; -+ unsigned long last_check, last_serve; -+ int i, class_idx, next_class = 0; -+ bool found = false; -+ -+ /* -+ * we needed to guarantee a minimum bandwidth for each class (if -+ * there is some active entity in this class). This should also -+ * mitigate priority-inversion problems in case a low priority -+ * task is holding file system resources. 
-+ */ -+ last_check = sd->class_timeout_last_check; -+ if (time_is_after_jiffies(last_check + BFQ_CLASS_TIMEOUT)) -+ return next_class; -+ -+ sd->class_timeout_last_check = jiffies; -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { -+ class_idx = (sd->next_class_index + i) % BFQ_IOPRIO_CLASSES; -+ last_serve = sd->bfq_class_last_service[class_idx]; -+ -+ if (time_is_after_jiffies(last_serve + BFQ_CLASS_TIMEOUT)) -+ continue; -+ -+ if (!RB_EMPTY_ROOT(&(st + class_idx)->active)) { -+ if (found) -+ continue; -+ -+ next_class = class_idx++; -+ class_idx %= BFQ_IOPRIO_CLASSES; -+ sd->next_class_index = class_idx; -+ found = true; -+ } -+ sd->bfq_class_last_service[class_idx] = jiffies; -+ } -+ return next_class; -+} -+ - /** - * bfq_lookup_next_entity - return the first eligible entity in @sd. - * @sd: the sched_data. -@@ -1440,24 +1490,8 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, - bool expiration) - { - struct bfq_service_tree *st = sd->service_tree; -- struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1); - struct bfq_entity *entity = NULL; -- int class_idx = 0; -- -- /* -- * Choose from idle class, if needed to guarantee a minimum -- * bandwidth to this class (and if there is some active entity -- * in idle class). This should also mitigate -- * priority-inversion problems in case a low priority task is -- * holding file system resources. -- */ -- if (time_is_before_jiffies(sd->bfq_class_idle_last_service + -- BFQ_CL_IDLE_TIMEOUT)) { -- if (!RB_EMPTY_ROOT(&idle_class_st->active)) -- class_idx = BFQ_IOPRIO_CLASSES - 1; -- /* About to be served if backlogged, or not yet backlogged */ -- sd->bfq_class_idle_last_service = jiffies; -- } -+ int class_idx = bfq_select_next_class(sd); - - /* - * Find the next entity to serve for the highest-priority -@@ -1706,4 +1740,12 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) - - if (bfqq->wr_coeff > 1) - bfqd->wr_busy_queues++; -+ -+ /* Move bfqq to the head of the woken list of its waker */ -+ if (!hlist_unhashed(&bfqq->woken_list_node) && -+ &bfqq->woken_list_node != bfqq->waker_bfqq->woken_list.first) { -+ hlist_del_init(&bfqq->woken_list_node); -+ hlist_add_head(&bfqq->woken_list_node, -+ &bfqq->waker_bfqq->woken_list); -+ } - } -diff --git a/block/blk-merge.c b/block/blk-merge.c -index 4d97fb6dd..1398b52a2 100644 ---- a/block/blk-merge.c -+++ b/block/blk-merge.c -@@ -846,18 +846,15 @@ static struct request *attempt_front_merge(struct request_queue *q, - return NULL; - } - --int blk_attempt_req_merge(struct request_queue *q, struct request *rq, -- struct request *next) -+/* -+ * Try to merge 'next' into 'rq'. Return true if the merge happened, false -+ * otherwise. The caller is responsible for freeing 'next' if the merge -+ * happened. -+ */ -+bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, -+ struct request *next) - { -- struct request *free; -- -- free = attempt_merge(q, rq, next); -- if (free) { -- blk_put_request(free); -- return 1; -- } -- -- return 0; -+ return attempt_merge(q, rq, next); - } - - bool blk_rq_merge_ok(struct request *rq, struct bio *bio) -diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c -index fdeb9773b..fcc9b5728 100644 ---- a/block/blk-mq-sched.c -+++ b/block/blk-mq-sched.c -@@ -163,9 +163,19 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) - * in blk_mq_dispatch_rq_list(). 
- */ - list_add_tail(&rq->queuelist, &rq_list); -+ count++; - if (rq->mq_hctx != hctx) - multi_hctxs = true; -- } while (++count < max_dispatch); -+ -+ /* -+ * If we cannot get tag for the request, stop dequeueing -+ * requests from the IO scheduler. We are unlikely to be able -+ * to submit them anyway and it creates false impression for -+ * scheduling heuristics that the device can take more IO. -+ */ -+ if (!blk_mq_get_driver_tag(rq)) -+ break; -+ } while (count < max_dispatch); - - if (!count) { - if (run_queue) -@@ -380,9 +390,10 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, - return ret; - } - --bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq) -+bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, -+ struct list_head *free) - { -- return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq); -+ return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq, free); - } - EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge); - -diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h -index 5b18ab915..8b70de4b8 100644 ---- a/block/blk-mq-sched.h -+++ b/block/blk-mq-sched.h -@@ -11,7 +11,8 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, - unsigned int nr_segs, struct request **merged_request); - bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, - unsigned int nr_segs); --bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); -+bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, -+ struct list_head *free); - void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx); - void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); - -diff --git a/block/blk-mq.c b/block/blk-mq.c -index 0e120547c..cadd12d68 100644 ---- a/block/blk-mq.c -+++ b/block/blk-mq.c -@@ -361,11 +361,12 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data) - - if (e) { - /* -- * Flush requests are special and go directly to the -+ * Flush/passthrough requests are special and go directly to the - * dispatch list. Don't include reserved tags in the - * limiting, as it isn't useful. 
- */ - if (!op_is_flush(data->cmd_flags) && -+ !blk_op_is_passthrough(data->cmd_flags) && - e->type->ops.limit_depth && - !(data->flags & BLK_MQ_REQ_RESERVED)) - e->type->ops.limit_depth(data->cmd_flags, data); -@@ -1099,7 +1100,7 @@ static bool __blk_mq_get_driver_tag(struct request *rq) - return true; - } - --static bool blk_mq_get_driver_tag(struct request *rq) -+bool blk_mq_get_driver_tag(struct request *rq) - { - struct blk_mq_hw_ctx *hctx = rq->mq_hctx; - -diff --git a/block/blk-mq.h b/block/blk-mq.h -index 3616453ca..d9ef3e4f3 100644 ---- a/block/blk-mq.h -+++ b/block/blk-mq.h -@@ -242,6 +242,8 @@ static inline void blk_mq_put_driver_tag(struct request *rq) - __blk_mq_put_driver_tag(rq->mq_hctx, rq); - } - -+bool blk_mq_get_driver_tag(struct request *rq); -+ - static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) - { - int cpu; -@@ -282,6 +284,17 @@ static inline struct blk_plug *blk_mq_plug(struct request_queue *q, - return NULL; - } - -+/* Free all requests on the list */ -+static inline void blk_mq_free_requests(struct list_head *list) -+{ -+ while (!list_empty(list)) { -+ struct request *rq = list_entry_rq(list->next); -+ -+ list_del_init(&rq->queuelist); -+ blk_mq_free_request(rq); -+ } -+} -+ - /* - * For shared tag users, we track the number of currently active users - * and attempt to provide a fair share of the tag depth for each of them. -diff --git a/block/blk.h b/block/blk.h -index 3b53e44b9..52ff5d4a3 100644 ---- a/block/blk.h -+++ b/block/blk.h -@@ -224,7 +224,7 @@ ssize_t part_timeout_store(struct device *, struct device_attribute *, - void __blk_queue_split(struct bio **bio, unsigned int *nr_segs); - int ll_back_merge_fn(struct request *req, struct bio *bio, - unsigned int nr_segs); --int blk_attempt_req_merge(struct request_queue *q, struct request *rq, -+bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, - struct request *next); - unsigned int blk_recalc_rq_segments(struct request *rq); - void blk_rq_set_mixed_merge(struct request *rq); -diff --git a/block/elevator.c b/block/elevator.c -index 293c5c813..151b30911 100644 ---- a/block/elevator.c -+++ b/block/elevator.c -@@ -203,7 +203,7 @@ static inline void __elv_rqhash_del(struct request *rq) - rq->rq_flags &= ~RQF_HASHED; - } - --void elv_rqhash_del(struct request_queue *q, struct request *rq) -+void elv_rqhash_del(struct request *rq) - { - if (ELV_ON_HASH(rq)) - __elv_rqhash_del(rq); -@@ -350,9 +350,11 @@ enum elv_merge elv_merge(struct request_queue *q, struct request **req, - * we can append 'rq' to an existing request, so we can throw 'rq' away - * afterwards. - * -- * Returns true if we merged, false otherwise -+ * Returns true if we merged, false otherwise. 'free' will contain all -+ * requests that need to be freed. - */ --bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq) -+bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq, -+ struct list_head *free) - { - struct request *__rq; - bool ret; -@@ -363,8 +365,10 @@ bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq) - /* - * First try one-hit cache. 
- */ -- if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq)) -+ if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq)) { -+ list_add(&rq->queuelist, free); - return true; -+ } - - if (blk_queue_noxmerges(q)) - return false; -@@ -378,6 +382,7 @@ bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq) - if (!__rq || !blk_attempt_req_merge(q, __rq, rq)) - break; - -+ list_add(&rq->queuelist, free); - /* The merged request could be merged with others, try again */ - ret = true; - rq = __rq; -@@ -417,7 +422,7 @@ struct request *elv_latter_request(struct request_queue *q, struct request *rq) - struct elevator_queue *e = q->elevator; - - if (e->type->ops.next_request) -- return e->type->ops.next_request(q, rq); -+ return e->type->ops.next_request(rq); - - return NULL; - } -@@ -427,7 +432,7 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq) - struct elevator_queue *e = q->elevator; - - if (e->type->ops.former_request) -- return e->type->ops.former_request(q, rq); -+ return e->type->ops.former_request(rq); - - return NULL; - } -@@ -616,15 +621,15 @@ static inline bool elv_support_iosched(struct request_queue *q) - } - - /* -- * For single queue devices, default to using mq-deadline. If we have multiple -- * queues or mq-deadline is not available, default to "none". -+ * For single queue devices, default to using bfq. If we have multiple -+ * queues or bfq is not available, default to "none". - */ - static struct elevator_type *elevator_get_default(struct request_queue *q) - { - if (q->nr_hw_queues != 1) - return NULL; - -- return elevator_get(q, "mq-deadline", false); -+ return elevator_get(q, "bfq", false); - } - - /* -@@ -802,8 +807,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) - return len; - } - --struct request *elv_rb_former_request(struct request_queue *q, -- struct request *rq) -+struct request *elv_rb_former_request(struct request *rq) - { - struct rb_node *rbprev = rb_prev(&rq->rb_node); - -@@ -814,8 +818,7 @@ struct request *elv_rb_former_request(struct request_queue *q, - } - EXPORT_SYMBOL(elv_rb_former_request); - --struct request *elv_rb_latter_request(struct request_queue *q, -- struct request *rq) -+struct request *elv_rb_latter_request(struct request *rq) - { - struct rb_node *rbnext = rb_next(&rq->rb_node); - -diff --git a/block/mq-deadline.c b/block/mq-deadline.c -index 3aabcd2a7..59178b7f5 100644 ---- a/block/mq-deadline.c -+++ b/block/mq-deadline.c -@@ -120,7 +120,7 @@ static void deadline_remove_request(struct request_queue *q, struct request *rq) - if (!RB_EMPTY_NODE(&rq->rb_node)) - deadline_del_rq_rb(dd, rq); - -- elv_rqhash_del(q, rq); -+ elv_rqhash_del(rq); - if (q->last_merge == rq) - q->last_merge = NULL; - } -@@ -487,6 +487,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - struct request_queue *q = hctx->queue; - struct deadline_data *dd = q->elevator->elevator_data; - const int data_dir = rq_data_dir(rq); -+ LIST_HEAD(free); - - /* - * This may be a requeue of a write request that has locked its -@@ -494,16 +495,15 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - */ - blk_req_zone_write_unlock(rq); - -- if (blk_mq_sched_try_insert_merge(q, rq)) -+ if (blk_mq_sched_try_insert_merge(q, rq, &free)) { -+ blk_mq_free_requests(&free); - return; -+ } - - trace_block_rq_insert(rq); - -- if (at_head || blk_rq_is_passthrough(rq)) { -- if (at_head) -- list_add(&rq->queuelist, &dd->dispatch); -- else -- 
list_add_tail(&rq->queuelist, &dd->dispatch); -+ if (at_head) { -+ list_add(&rq->queuelist, &dd->dispatch); - } else { - deadline_add_rq_rb(dd, rq); - -diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index 158aefae1..0d81eed39 100644 ---- a/include/linux/blkdev.h -+++ b/include/linux/blkdev.h -@@ -272,6 +272,12 @@ static inline bool bio_is_passthrough(struct bio *bio) - return blk_op_is_scsi(op) || blk_op_is_private(op); - } - -+static inline bool blk_op_is_passthrough(unsigned int op) -+{ -+ return (blk_op_is_scsi(op & REQ_OP_MASK) || -+ blk_op_is_private(op & REQ_OP_MASK)); -+} -+ - static inline unsigned short req_get_ioprio(struct request *req) - { - return req->ioprio; -diff --git a/include/linux/elevator.h b/include/linux/elevator.h -index dcb2f9022..fffc6218a 100644 ---- a/include/linux/elevator.h -+++ b/include/linux/elevator.h -@@ -46,8 +46,8 @@ struct elevator_mq_ops { - bool (*has_work)(struct blk_mq_hw_ctx *); - void (*completed_request)(struct request *, u64); - void (*requeue_request)(struct request *); -- struct request *(*former_request)(struct request_queue *, struct request *); -- struct request *(*next_request)(struct request_queue *, struct request *); -+ struct request *(*former_request)(struct request *); -+ struct request *(*next_request)(struct request *); - void (*init_icq)(struct io_cq *); - void (*exit_icq)(struct io_cq *); - }; -@@ -90,7 +90,7 @@ struct elevator_type - - #define ELV_HASH_BITS 6 - --void elv_rqhash_del(struct request_queue *q, struct request *rq); -+void elv_rqhash_del(struct request *rq); - void elv_rqhash_add(struct request_queue *q, struct request *rq); - void elv_rqhash_reposition(struct request_queue *q, struct request *rq); - struct request *elv_rqhash_find(struct request_queue *q, sector_t offset); -@@ -117,7 +117,8 @@ extern void elv_merge_requests(struct request_queue *, struct request *, - struct request *); - extern void elv_merged_request(struct request_queue *, struct request *, - enum elv_merge); --extern bool elv_attempt_insert_merge(struct request_queue *, struct request *); -+extern bool elv_attempt_insert_merge(struct request_queue *, struct request *, -+ struct list_head *); - extern struct request *elv_former_request(struct request_queue *, struct request *); - extern struct request *elv_latter_request(struct request_queue *, struct request *); - -@@ -140,8 +141,8 @@ extern struct elevator_queue *elevator_alloc(struct request_queue *, - /* - * Helper functions. - */ --extern struct request *elv_rb_former_request(struct request_queue *, struct request *); --extern struct request *elv_rb_latter_request(struct request_queue *, struct request *); -+extern struct request *elv_rb_former_request(struct request *); -+extern struct request *elv_rb_latter_request(struct request *); - - /* - * rb support functions. diff --git a/0003-x86-setup-Merge-several-reservations-of-start-of-mem.patch b/0003-x86-setup-Merge-several-reservations-of-start-of-mem.patch new file mode 100644 index 000000000000..eca80260ba10 --- /dev/null +++ b/0003-x86-setup-Merge-several-reservations-of-start-of-mem.patch @@ -0,0 +1,67 @@ +From e63cb4a867fe803dc90376af8b268ba1549ec36e Mon Sep 17 00:00:00 2001 +From: Mike Rapoport <rppt@linux.ibm.com> +Date: Tue, 2 Mar 2021 12:04:06 +0200 +Subject: [PATCH 3/8] x86/setup: Merge several reservations of start of memory + +Currently, the first several pages are reserved both to avoid leaking +their contents on systems with L1TF and to avoid corrupting BIOS memory. + +Merge the two memory reservations. 
+ +Signed-off-by: Mike Rapoport <rppt@linux.ibm.com> +Signed-off-by: Borislav Petkov <bp@suse.de> +Reviewed-by: David Hildenbrand <david@redhat.com> +Acked-by: Borislav Petkov <bp@suse.de> +Link: https://lkml.kernel.org/r/20210302100406.22059-3-rppt@kernel.org +--- + arch/x86/kernel/setup.c | 19 ++++++++++--------- + 1 file changed, 10 insertions(+), 9 deletions(-) + +diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c +index 420d881da2bd..282d572e49af 100644 +--- a/arch/x86/kernel/setup.c ++++ b/arch/x86/kernel/setup.c +@@ -714,11 +714,6 @@ static int __init parse_reservelow(char *p) + + early_param("reservelow", parse_reservelow); + +-static void __init trim_low_memory_range(void) +-{ +- memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE)); +-} +- + static void __init early_reserve_memory(void) + { + /* +@@ -731,10 +726,17 @@ static void __init early_reserve_memory(void) + (unsigned long)__end_of_kernel_reserve - (unsigned long)_text); + + /* +- * Make sure page 0 is always reserved because on systems with +- * L1TF its contents can be leaked to user processes. ++ * The first 4Kb of memory is a BIOS owned area, but generally it is ++ * not listed as such in the E820 table. ++ * ++ * Reserve the first memory page and typically some additional ++ * memory (64KiB by default) since some BIOSes are known to corrupt ++ * low memory. See the Kconfig help text for X86_RESERVE_LOW. ++ * ++ * In addition, make sure page 0 is always reserved because on ++ * systems with L1TF its contents can be leaked to user processes. + */ +- memblock_reserve(0, PAGE_SIZE); ++ memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE)); + + early_reserve_initrd(); + +@@ -747,7 +749,6 @@ static void __init early_reserve_memory(void) + reserve_bios_regions(); + + trim_snb_memory(); +- trim_low_memory_range(); + } + + /* +-- +2.32.0 + diff --git a/0004-x86-setup-Move-trim_snb_memory-later-in-setup_arch-t.patch b/0004-x86-setup-Move-trim_snb_memory-later-in-setup_arch-t.patch new file mode 100644 index 000000000000..8a8e4d194cc6 --- /dev/null +++ b/0004-x86-setup-Move-trim_snb_memory-later-in-setup_arch-t.patch @@ -0,0 +1,87 @@ +From c4b5e4bc8317ccb0a822429d87288d9f90453a04 Mon Sep 17 00:00:00 2001 +From: Mike Rapoport <rppt@linux.ibm.com> +Date: Tue, 13 Apr 2021 21:08:39 +0300 +Subject: [PATCH 4/8] x86/setup: Move trim_snb_memory() later in setup_arch() + to fix boot hangs + +Commit + + a799c2bd29d1 ("x86/setup: Consolidate early memory reservations") + +moved reservation of the memory inaccessible by Sandy Bride integrated +graphics very early, and, as a result, on systems with such devices +the first 1M was reserved by trim_snb_memory() which prevented the +allocation of the real mode trampoline and made the boot hang very +early. + +Since the purpose of trim_snb_memory() is to prevent problematic pages +ever reaching the graphics device, it is safe to reserve these pages +after memblock allocations are possible. + +Move trim_snb_memory() later in boot so that it will be called after +reserve_real_mode() and make comments describing trim_snb_memory() +operation more elaborate. + + [ bp: Massage a bit. 
] + +Fixes: a799c2bd29d1 ("x86/setup: Consolidate early memory reservations") +Reported-by: Randy Dunlap <rdunlap@infradead.org> +Signed-off-by: Mike Rapoport <rppt@linux.ibm.com> +Signed-off-by: Borislav Petkov <bp@suse.de> +Tested-by: Randy Dunlap <rdunlap@infradead.org> +Tested-by: Hugh Dickins <hughd@google.com> +Link: https://lkml.kernel.org/r/f67d3e03-af90-f790-baf4-8d412fe055af@infradead.org +--- + arch/x86/kernel/setup.c | 20 +++++++++++++++----- + 1 file changed, 15 insertions(+), 5 deletions(-) + +diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c +index 282d572e49af..7d466f51be1f 100644 +--- a/arch/x86/kernel/setup.c ++++ b/arch/x86/kernel/setup.c +@@ -634,11 +634,16 @@ static void __init trim_snb_memory(void) + printk(KERN_DEBUG "reserving inaccessible SNB gfx pages\n"); + + /* +- * Reserve all memory below the 1 MB mark that has not +- * already been reserved. ++ * SandyBridge integrated graphics devices have a bug that prevents ++ * them from accessing certain memory ranges, namely anything below ++ * 1M and in the pages listed in bad_pages[] above. ++ * ++ * To avoid these pages being ever accessed by SNB gfx devices ++ * reserve all memory below the 1 MB mark and bad_pages that have ++ * not already been reserved at boot time. + */ + memblock_reserve(0, 1<<20); +- ++ + for (i = 0; i < ARRAY_SIZE(bad_pages); i++) { + if (memblock_reserve(bad_pages[i], PAGE_SIZE)) + printk(KERN_WARNING "failed to reserve 0x%08lx\n", +@@ -747,8 +752,6 @@ static void __init early_reserve_memory(void) + + reserve_ibft_region(); + reserve_bios_regions(); +- +- trim_snb_memory(); + } + + /* +@@ -1082,6 +1085,13 @@ void __init setup_arch(char **cmdline_p) + + reserve_real_mode(); + ++ /* ++ * Reserving memory causing GPU hangs on Sandy Bridge integrated ++ * graphics devices should be done after we allocated memory under ++ * 1M for the real mode trampoline. ++ */ ++ trim_snb_memory(); ++ + init_mem_mapping(); + + idt_setup_early_pf(); +-- +2.32.0 + diff --git a/0005-x86-setup-always-reserve-the-first-1M-of-RAM.patch b/0005-x86-setup-always-reserve-the-first-1M-of-RAM.patch new file mode 100644 index 000000000000..169ba22ae2de --- /dev/null +++ b/0005-x86-setup-always-reserve-the-first-1M-of-RAM.patch @@ -0,0 +1,170 @@ +From 3ffe8ae29143ee20e01b0bc4a63774182b59daf9 Mon Sep 17 00:00:00 2001 +From: Mike Rapoport <rppt@linux.ibm.com> +Date: Tue, 1 Jun 2021 10:53:52 +0300 +Subject: [PATCH 5/8] x86/setup: always reserve the first 1M of RAM + +There are BIOSes that are known to corrupt the memory under 1M, or more +precisely under 640K because the memory above 640K is anyway reserved for +the EGA/VGA frame buffer and BIOS. + +To prevent usage of the memory that will be potentially clobbered by the +kernel, the beginning of the memory is always reserved. The exact size of +the reserved area is determined by CONFIG_X86_RESERVE_LOW build time and +reservelow command line option. The reserved range may be from 4K to 640K +with the default of 64K. There are also configurations that reserve the +entire 1M range, like machines with SandyBridge graphic devices or systems +that enable crash kernel. + +In addition to the potentially clobbered memory, EBDA of unknown size may +be as low as 128K and the memory above that EBDA start is also reserved +early. + +It would have been possible to reserve the entire range under 1M unless for +the real mode trampoline that must reside in that area. 
+ +To accommodate placement of the real mode trampoline and keep the memory +safe from being clobbered by BIOS reserve the first 64K of RAM before +memory allocations are possible and then, after the real mode trampoline is +allocated, reserve the entire range from 0 to 1M. + +Update trim_snb_memory() and reserve_real_mode() to avoid redundant +reservations of the same memory range. + +Also make sure the memory under 1M is not getting freed by +efi_free_boot_services(). + +Fixes: a799c2bd29d1 ("x86/setup: Consolidate early memory reservations") +Signed-off-by: Mike Rapoport <rppt@linux.ibm.com> +--- + arch/x86/kernel/setup.c | 35 ++++++++++++++++++++-------------- + arch/x86/platform/efi/quirks.c | 12 ++++++++++++ + arch/x86/realmode/init.c | 14 ++++++++------ + 3 files changed, 41 insertions(+), 20 deletions(-) + +diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c +index 7d466f51be1f..d7cfb927864f 100644 +--- a/arch/x86/kernel/setup.c ++++ b/arch/x86/kernel/setup.c +@@ -638,11 +638,11 @@ static void __init trim_snb_memory(void) + * them from accessing certain memory ranges, namely anything below + * 1M and in the pages listed in bad_pages[] above. + * +- * To avoid these pages being ever accessed by SNB gfx devices +- * reserve all memory below the 1 MB mark and bad_pages that have +- * not already been reserved at boot time. ++ * To avoid these pages being ever accessed by SNB gfx devices reserve ++ * bad_pages that have not already been reserved at boot time. ++ * All memory below the 1 MB mark is anyway reserved later during ++ * setup_arch(), so there is no need to reserve it here. + */ +- memblock_reserve(0, 1<<20); + + for (i = 0; i < ARRAY_SIZE(bad_pages); i++) { + if (memblock_reserve(bad_pages[i], PAGE_SIZE)) +@@ -734,14 +734,14 @@ static void __init early_reserve_memory(void) + * The first 4Kb of memory is a BIOS owned area, but generally it is + * not listed as such in the E820 table. + * +- * Reserve the first memory page and typically some additional +- * memory (64KiB by default) since some BIOSes are known to corrupt +- * low memory. See the Kconfig help text for X86_RESERVE_LOW. ++ * Reserve the first 64K of memory since some BIOSes are known to ++ * corrupt low memory. After the real mode trampoline is allocated the ++ * rest of the memory below 640k is reserved. + * + * In addition, make sure page 0 is always reserved because on + * systems with L1TF its contents can be leaked to user processes. + */ +- memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE)); ++ memblock_reserve(0, SZ_64K); + + early_reserve_initrd(); + +@@ -752,6 +752,7 @@ static void __init early_reserve_memory(void) + + reserve_ibft_region(); + reserve_bios_regions(); ++ trim_snb_memory(); + } + + /* +@@ -1083,14 +1084,20 @@ void __init setup_arch(char **cmdline_p) + (max_pfn_mapped<<PAGE_SHIFT) - 1); + #endif + +- reserve_real_mode(); +- + /* +- * Reserving memory causing GPU hangs on Sandy Bridge integrated +- * graphics devices should be done after we allocated memory under +- * 1M for the real mode trampoline. ++ * Find free memory for the real mode trampoline and place it ++ * there. ++ * If there is not enough free memory under 1M, on EFI-enabled ++ * systems there will be additional attempt to reclaim the memory ++ * for the real mode trampoline at efi_free_boot_services(). ++ * ++ * Unconditionally reserve the entire first 1M of RAM because ++ * BIOSes are know to corrupt low memory and several ++ * hundred kilobytes are not worth complex detection what memory gets ++ * clobbered. 
Moreover, on machines with SandyBridge graphics or in ++ * setups that use crashkernel the entire 1M is anyway reserved. + */ +- trim_snb_memory(); ++ reserve_real_mode(); + + init_mem_mapping(); + +diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c +index 67d93a243c35..27561b56a821 100644 +--- a/arch/x86/platform/efi/quirks.c ++++ b/arch/x86/platform/efi/quirks.c +@@ -450,6 +450,18 @@ void __init efi_free_boot_services(void) + size -= rm_size; + } + ++ /* ++ * Don't free memory under 1M for two reasons: ++ * - BIOS might clobber it ++ * - Crash kernel needs it to be reserved ++ */ ++ if (start + size < SZ_1M) ++ continue; ++ if (start < SZ_1M) { ++ size -= (SZ_1M - start); ++ start = SZ_1M; ++ } ++ + memblock_free_late(start, size); + } + +diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c +index 22fda7d99159..ea42630d4e2e 100644 +--- a/arch/x86/realmode/init.c ++++ b/arch/x86/realmode/init.c +@@ -29,14 +29,16 @@ void __init reserve_real_mode(void) + + /* Has to be under 1M so we can execute real-mode AP code. */ + mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); +- if (!mem) { ++ if (!mem) + pr_info("No sub-1M memory is available for the trampoline\n"); +- return; +- } ++ else ++ set_real_mode_mem(mem); + +- memblock_reserve(mem, size); +- set_real_mode_mem(mem); +- crash_reserve_low_1M(); ++ /* ++ * Unconditionally reserve the entire fisrt 1M, see comment in ++ * setup_arch() ++ */ ++ memblock_reserve(0, SZ_1M); + } + + static void sme_sev_setup_real_mode(struct trampoline_header *th) +-- +2.32.0 + diff --git a/0006-x86-setup-remove-CONFIG_X86_RESERVE_LOW-and-reservel.patch b/0006-x86-setup-remove-CONFIG_X86_RESERVE_LOW-and-reservel.patch new file mode 100644 index 000000000000..a49d92c2252b --- /dev/null +++ b/0006-x86-setup-remove-CONFIG_X86_RESERVE_LOW-and-reservel.patch @@ -0,0 +1,114 @@ +From 2e68d15d0a146e9b13bfbaba5f260c82b8c3d049 Mon Sep 17 00:00:00 2001 +From: Mike Rapoport <rppt@linux.ibm.com> +Date: Tue, 1 Jun 2021 10:53:53 +0300 +Subject: [PATCH 6/8] x86/setup: remove CONFIG_X86_RESERVE_LOW and reservelow + options + +The CONFIG_X86_RESERVE_LOW build time and reservelow command line option +allowed to control the amount of memory under 1M that would be reserved at +boot to avoid using memory that can be potentially clobbered by BIOS. + +Since the entire range under 1M is always reserved there is no need for +these options and they can be removed. + +Signed-off-by: Mike Rapoport <rppt@linux.ibm.com> +--- + .../admin-guide/kernel-parameters.txt | 5 ---- + arch/x86/Kconfig | 29 ------------------- + arch/x86/kernel/setup.c | 24 --------------- + 3 files changed, 58 deletions(-) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 835f810f2f26..479cc44cc4e2 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -4623,11 +4623,6 @@ + Reserves a hole at the top of the kernel virtual + address space. + +- reservelow= [X86] +- Format: nn[K] +- Set the amount of memory to reserve for BIOS at +- the bottom of the address space. +- + reset_devices [KNL] Force drivers to reset the underlying device + during initialization. + +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 861b1b794697..fc91be3b1bd1 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -1688,35 +1688,6 @@ config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK + Set whether the default state of memory_corruption_check is + on or off. 
+ +-config X86_RESERVE_LOW +- int "Amount of low memory, in kilobytes, to reserve for the BIOS" +- default 64 +- range 4 640 +- help +- Specify the amount of low memory to reserve for the BIOS. +- +- The first page contains BIOS data structures that the kernel +- must not use, so that page must always be reserved. +- +- By default we reserve the first 64K of physical RAM, as a +- number of BIOSes are known to corrupt that memory range +- during events such as suspend/resume or monitor cable +- insertion, so it must not be used by the kernel. +- +- You can set this to 4 if you are absolutely sure that you +- trust the BIOS to get all its memory reservations and usages +- right. If you know your BIOS have problems beyond the +- default 64K area, you can set this to 640 to avoid using the +- entire low memory range. +- +- If you have doubts about the BIOS (e.g. suspend/resume does +- not work or there's kernel crashes after certain hardware +- hotplug events) then you might want to enable +- X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check +- typical corruption patterns. +- +- Leave this to the default value of 64 if you are unsure. +- + config MATH_EMULATION + bool + depends on MODIFY_LDT_SYSCALL +diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c +index d7cfb927864f..fbda4bbf75c1 100644 +--- a/arch/x86/kernel/setup.c ++++ b/arch/x86/kernel/setup.c +@@ -695,30 +695,6 @@ static void __init e820_add_kernel_range(void) + e820__range_add(start, size, E820_TYPE_RAM); + } + +-static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; +- +-static int __init parse_reservelow(char *p) +-{ +- unsigned long long size; +- +- if (!p) +- return -EINVAL; +- +- size = memparse(p, &p); +- +- if (size < 4096) +- size = 4096; +- +- if (size > 640*1024) +- size = 640*1024; +- +- reserve_low = size; +- +- return 0; +-} +- +-early_param("reservelow", parse_reservelow); +- + static void __init early_reserve_memory(void) + { + /* +-- +2.32.0 + diff --git a/0007-x86-crash-remove-crash_reserve_low_1M.patch b/0007-x86-crash-remove-crash_reserve_low_1M.patch new file mode 100644 index 000000000000..903e5fa0969a --- /dev/null +++ b/0007-x86-crash-remove-crash_reserve_low_1M.patch @@ -0,0 +1,58 @@ +From bb4c1200fdfd6c17fff64e159e625c3678342b87 Mon Sep 17 00:00:00 2001 +From: Mike Rapoport <rppt@linux.ibm.com> +Date: Tue, 1 Jun 2021 10:53:54 +0300 +Subject: [PATCH 7/8] x86/crash: remove crash_reserve_low_1M() + +The entire memory range under 1M is unconditionally reserved at +setup_arch(), so there is no need for crash_reserve_low_1M() anymore. + +Remove this function. 
+ +Signed-off-by: Mike Rapoport <rppt@linux.ibm.com> +--- + arch/x86/include/asm/crash.h | 6 ------ + arch/x86/kernel/crash.c | 13 ------------- + 2 files changed, 19 deletions(-) + +diff --git a/arch/x86/include/asm/crash.h b/arch/x86/include/asm/crash.h +index f58de66091e5..8b6bd63530dc 100644 +--- a/arch/x86/include/asm/crash.h ++++ b/arch/x86/include/asm/crash.h +@@ -9,10 +9,4 @@ int crash_setup_memmap_entries(struct kimage *image, + struct boot_params *params); + void crash_smp_send_stop(void); + +-#ifdef CONFIG_KEXEC_CORE +-void __init crash_reserve_low_1M(void); +-#else +-static inline void __init crash_reserve_low_1M(void) { } +-#endif +- + #endif /* _ASM_X86_CRASH_H */ +diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c +index b1deacbeb266..e0b8d9662da5 100644 +--- a/arch/x86/kernel/crash.c ++++ b/arch/x86/kernel/crash.c +@@ -70,19 +70,6 @@ static inline void cpu_crash_vmclear_loaded_vmcss(void) + rcu_read_unlock(); + } + +-/* +- * When the crashkernel option is specified, only use the low +- * 1M for the real mode trampoline. +- */ +-void __init crash_reserve_low_1M(void) +-{ +- if (cmdline_find_option(boot_command_line, "crashkernel", NULL, 0) < 0) +- return; +- +- memblock_reserve(0, 1<<20); +- pr_info("Reserving the low 1M of memory for crashkernel\n"); +-} +- + #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) + + static void kdump_nmi_callback(int cpu, struct pt_regs *regs) +-- +2.32.0 + diff --git a/0002-UKSM.patch b/0008-UKSM.patch index 3321eaa8ee58..3321eaa8ee58 100644 --- a/0002-UKSM.patch +++ b/0008-UKSM.patch @@ -68,7 +68,7 @@ _subarch=36 ### IMPORTANT: Do no edit below this line unless you know what you're doing pkgbase=linux-ck-uksm -pkgver=5.12.10 +pkgver=5.12.12 pkgrel=1 _ckpatchversion=1 _ckpatch="patch-5.12-ck${_ckpatchversion}" @@ -87,21 +87,31 @@ source=( "more-uarches-$_gcc_more_v.tar.gz::https://github.com/graysky2/kernel_compiler_patch/archive/$_gcc_more_v.tar.gz" "http://ck.kolivas.org/patches/5.0/5.12/5.12-ck${_ckpatchversion}/$_ckpatch.xz" 0001-ZEN-Add-sysctl-and-CONFIG-to-disallow-unprivileged-C.patch - 0002-UKSM.patch - 0003-bfq-lucjan-r2K210602.patch + 0002-x86-setup-Consolidate-early-memory-reservations.patch + 0003-x86-setup-Merge-several-reservations-of-start-of-mem.patch + 0004-x86-setup-Move-trim_snb_memory-later-in-setup_arch-t.patch + 0005-x86-setup-always-reserve-the-first-1M-of-RAM.patch + 0006-x86-setup-remove-CONFIG_X86_RESERVE_LOW-and-reservel.patch + 0007-x86-crash-remove-crash_reserve_low_1M.patch + 0008-UKSM.patch ) validpgpkeys=( 'ABAF11C65A2970B130ABE3C479BE3E4300411886' # Linus Torvalds '647F28654894E3BD457199BE38DBBDC86092693E' # Greg Kroah-Hartman ) -b2sums=('b40ef5a11ca435299899e8131fa72af147455cd8ebee4c0e187572b1f628e66d2b6fbb318308bc911a598d8303d1ab3622d52966deaa5c48d59dcd65f4f58687' +b2sums=('f9aef3da2f65916cc30da9a066217d3379036e6a32a732224da7fee86c80810315484f48132b50b8cf8eb5e0b055ad1b7bbe63dadd0eb54b0b0626bc57c20963' 'SKIP' 'SKIP' '30d1df754608bb423cbc99c2097ad521baa091b9a3b39df4bd5c2d50c57eec54d8fa0e4a4a04b847c3d1b87ba682cadc8db45fabeefdc9ad7caaf8e77b96e41a' 'c9f729ba1efe6f04e7b2c57d3999bc9675b577596dccb2f227e5b6e444285e1fdd270bf67c0fcf9f5808a4c3a4b1c7a5c13a76f754ad9b9447243ccbaf2ce6a3' - 'e1eccb5b6b728e3852ade55dae7a53b8b6bd5f0fb2a330b99e85bfa64abaa430cb714d301ed169df14a1f302a75d952992f0d8fa6ab02fa6716165bdf23b63aa' - '14f45171afc3b15488b40a05e58b352c5057da3a5782e13527392f7750d8e45a8db54f9b50b218fedb8bf679de3b4e5d78e230a44f7b1aa482f7b3aa831bd641' - 
'6a195695fcd207adbdea28dd2803b479f6e5dc478b56d5fce16a7600f719fa545ed0e468a26f9c94e982346fb803a0ff026abd0d70335e42027468475beb7cbb') + 'dda152592dec643bce44754bf5d2d43a5897cc57f8dc258b87857055a45abf903d619aba1de389228cb086a17fedea5458f8fe2c0993fa20213bb7c5bca331c8' + '13330cf57b5c6b928ea73bd30479010688cf8d2003107b041a7fdad33c1ac225c8c905bef235cd762d6ea76be754b5db6be769526bacf7333298f72d6afff535' + '381e0f177faa3090d1abf4d11a97db535712840870265dea167d7692dee7733a226d09c103d01705d5c0809fa66c7a23efea9da2473da672644b06e31db77083' + 'cd9da0dee048fc52a3032343f122c2055081eeedfc8a3e5227218f0f63fc7618e8fe744c8caa7e3a2ca844f4aaf7314b57a306d0d3b1849e97b24687b8c5a501' + '1810832172e1b006a5471d8e317573343884feed9abc9e7380a32d83c958b0e6aa68adf9a647c9b7b714783997591f5d80e754c6e7357279661eee998f22864c' + '4e7cb958f95d99bba9810e675d4f1b0b3c171f78e9fe96ff9d265f792f4ceb1367f2f4d238f36b5ca1c395e14abdabbf0f8ce2dc07c4fe567d822a8b629dfa05' + '2251f8bf84e141b4661f84cc2ce7b21783ac0a349b2651477dfcbc5383b796b2e588d85ee411398b15c820cb3672256be8ed281c8bccfad252c9dd5b0e1e0cd5' + '14f45171afc3b15488b40a05e58b352c5057da3a5782e13527392f7750d8e45a8db54f9b50b218fedb8bf679de3b4e5d78e230a44f7b1aa482f7b3aa831bd641') export KBUILD_BUILD_HOST=archlinux export KBUILD_BUILD_USER=$pkgbase @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. -# Linux/x86 5.12.10 Kernel Configuration +# Linux/x86 5.12.12 Kernel Configuration # CONFIG_CC_VERSION_TEXT="gcc (GCC) 11.1.0" CONFIG_CC_IS_GCC=y @@ -488,7 +488,6 @@ CONFIG_X86_PMEM_LEGACY_DEVICE=y CONFIG_X86_PMEM_LEGACY=m CONFIG_X86_CHECK_BIOS_CORRUPTION=y CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y -CONFIG_X86_RESERVE_LOW=64 CONFIG_MTRR=y CONFIG_MTRR_SANITIZER=y CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT=1 @@ -2904,10 +2903,12 @@ CONFIG_NETCONSOLE_DYNAMIC=y CONFIG_NETPOLL=y CONFIG_NET_POLL_CONTROLLER=y CONFIG_NTB_NETDEV=m -# CONFIG_RIONET is not set +CONFIG_RIONET=m +CONFIG_RIONET_TX_SIZE=128 +CONFIG_RIONET_RX_SIZE=128 CONFIG_TUN=m CONFIG_TAP=m -# CONFIG_TUN_VNET_CROSS_LE is not set +CONFIG_TUN_VNET_CROSS_LE=y CONFIG_VETH=m CONFIG_VIRTIO_NET=m CONFIG_NLMON=m @@ -2915,7 +2916,17 @@ CONFIG_NET_VRF=m CONFIG_VSOCKMON=m CONFIG_MHI_NET=m CONFIG_SUNGEM_PHY=m -# CONFIG_ARCNET is not set +CONFIG_ARCNET=m +CONFIG_ARCNET_1201=m +CONFIG_ARCNET_1051=m +CONFIG_ARCNET_RAW=m +CONFIG_ARCNET_CAP=m +CONFIG_ARCNET_COM90xx=m +CONFIG_ARCNET_COM90xxIO=m +CONFIG_ARCNET_RIM_I=m +CONFIG_ARCNET_COM20020=m +CONFIG_ARCNET_COM20020_PCI=m +CONFIG_ARCNET_COM20020_CS=m CONFIG_ATM_DRIVERS=y # CONFIG_ATM_DUMMY is not set CONFIG_ATM_TCP=m @@ -3769,7 +3780,24 @@ CONFIG_PCMCIA_WL3501=m CONFIG_MAC80211_HWSIM=m CONFIG_USB_NET_RNDIS_WLAN=m CONFIG_VIRT_WIFI=m -# CONFIG_WAN is not set +CONFIG_WAN=y +CONFIG_LANMEDIA=m +CONFIG_HDLC=m +CONFIG_HDLC_RAW=m +CONFIG_HDLC_RAW_ETH=m +CONFIG_HDLC_CISCO=m +CONFIG_HDLC_FR=m +CONFIG_HDLC_PPP=m + +# +# X.25/LAPB support is disabled +# +CONFIG_PCI200SYN=m +CONFIG_WANXL=m +CONFIG_PC300TOO=m +CONFIG_FARSYNC=m +CONFIG_SBNI=m +CONFIG_SBNI_MULTILINE=y CONFIG_IEEE802154_DRIVERS=m CONFIG_IEEE802154_FAKELB=m CONFIG_IEEE802154_AT86RF230=m @@ -3787,7 +3815,7 @@ CONFIG_XEN_NETDEV_BACKEND=m CONFIG_VMXNET3=m CONFIG_FUJITSU_ES=m CONFIG_USB4_NET=m -# CONFIG_HYPERV_NET is not set +CONFIG_HYPERV_NET=m CONFIG_NETDEVSIM=m CONFIG_NET_FAILOVER=m CONFIG_ISDN=y @@ -6122,8 +6150,8 @@ CONFIG_DVB_DUMMY_FE=m CONFIG_AGP=m CONFIG_AGP_AMD64=m CONFIG_AGP_INTEL=m -# CONFIG_AGP_SIS is not set -# CONFIG_AGP_VIA is not set +CONFIG_AGP_SIS=m +CONFIG_AGP_VIA=m CONFIG_INTEL_GTT=m CONFIG_VGA_ARB=y CONFIG_VGA_ARB_MAX_GPUS=10 @@ 
-6132,7 +6160,7 @@ CONFIG_DRM=m CONFIG_DRM_MIPI_DBI=m CONFIG_DRM_MIPI_DSI=y CONFIG_DRM_DP_AUX_CHARDEV=y -# CONFIG_DRM_DEBUG_SELFTEST is not set +CONFIG_DRM_DEBUG_SELFTEST=m CONFIG_DRM_KMS_HELPER=m CONFIG_DRM_KMS_FB_HELPER=y CONFIG_DRM_FBDEV_EMULATION=y @@ -6185,7 +6213,14 @@ CONFIG_DRM_AMD_DC_SI=y # end of Display Engine Configuration CONFIG_HSA_AMD=y -# CONFIG_DRM_NOUVEAU is not set +CONFIG_DRM_NOUVEAU=m +CONFIG_NOUVEAU_LEGACY_CTX_SUPPORT=y +CONFIG_NOUVEAU_DEBUG=5 +CONFIG_NOUVEAU_DEBUG_DEFAULT=3 +# CONFIG_NOUVEAU_DEBUG_MMU is not set +# CONFIG_NOUVEAU_DEBUG_PUSH is not set +CONFIG_DRM_NOUVEAU_BACKLIGHT=y +CONFIG_DRM_NOUVEAU_SVM=y CONFIG_DRM_I915=m CONFIG_DRM_I915_FORCE_PROBE="" CONFIG_DRM_I915_CAPTURE_ERROR=y @@ -6244,8 +6279,16 @@ CONFIG_TINYDRM_ST7735R=m CONFIG_DRM_XEN=y CONFIG_DRM_XEN_FRONTEND=m CONFIG_DRM_VBOXVIDEO=m -# CONFIG_DRM_LEGACY is not set +CONFIG_DRM_LEGACY=y +# CONFIG_DRM_TDFX is not set +# CONFIG_DRM_R128 is not set +# CONFIG_DRM_MGA is not set +# CONFIG_DRM_SIS is not set +# CONFIG_DRM_VIA is not set +# CONFIG_DRM_SAVAGE is not set +CONFIG_DRM_EXPORT_FOR_TESTS=y CONFIG_DRM_PANEL_ORIENTATION_QUIRKS=y +CONFIG_DRM_LIB_RANDOM=y # # Frame buffer Devices @@ -6318,16 +6361,23 @@ CONFIG_FB_ATY_CT=y CONFIG_FB_ATY_GENERIC_LCD=y CONFIG_FB_ATY_GX=y CONFIG_FB_ATY_BACKLIGHT=y -# CONFIG_FB_S3 is not set -# CONFIG_FB_SAVAGE is not set -# CONFIG_FB_SIS is not set +CONFIG_FB_S3=m +CONFIG_FB_S3_DDC=y +CONFIG_FB_SAVAGE=m +CONFIG_FB_SAVAGE_I2C=y +CONFIG_FB_SAVAGE_ACCEL=y +CONFIG_FB_SIS=m +CONFIG_FB_SIS_300=y +CONFIG_FB_SIS_315=y CONFIG_FB_VIA=m # CONFIG_FB_VIA_DIRECT_PROCFS is not set CONFIG_FB_VIA_X_COMPATIBILITY=y CONFIG_FB_NEOMAGIC=m -# CONFIG_FB_KYRO is not set -# CONFIG_FB_3DFX is not set -# CONFIG_FB_VOODOO1 is not set +CONFIG_FB_KYRO=m +CONFIG_FB_3DFX=m +CONFIG_FB_3DFX_ACCEL=y +CONFIG_FB_3DFX_I2C=y +CONFIG_FB_VOODOO1=m CONFIG_FB_VT8623=m # CONFIG_FB_TRIDENT is not set # CONFIG_FB_ARK is not set @@ -6341,7 +6391,7 @@ CONFIG_FB_IBM_GXT4500=m CONFIG_XEN_FBDEV_FRONTEND=m # CONFIG_FB_METRONOME is not set # CONFIG_FB_MB862XX is not set -# CONFIG_FB_HYPERV is not set +CONFIG_FB_HYPERV=m CONFIG_FB_SIMPLE=y # CONFIG_FB_SM712 is not set # end of Frame buffer Devices @@ -6365,7 +6415,7 @@ CONFIG_LCD_OTM3225A=m CONFIG_BACKLIGHT_CLASS_DEVICE=y CONFIG_BACKLIGHT_KTD253=m CONFIG_BACKLIGHT_LM3533=m -# CONFIG_BACKLIGHT_CARILLO_RANCH is not set +CONFIG_BACKLIGHT_CARILLO_RANCH=m CONFIG_BACKLIGHT_PWM=m CONFIG_BACKLIGHT_DA903X=m CONFIG_BACKLIGHT_DA9052=m @@ -9515,7 +9565,7 @@ CONFIG_GENERIC_STRNLEN_USER=y CONFIG_GENERIC_NET_UTILS=y CONFIG_GENERIC_FIND_FIRST_BIT=y CONFIG_CORDIC=m -# CONFIG_PRIME_NUMBERS is not set +CONFIG_PRIME_NUMBERS=m CONFIG_RATIONAL=y CONFIG_GENERIC_PCI_IOMAP=y CONFIG_GENERIC_IOMAP=y |