-rw-r--r--  .SRCINFO                                                        |   26
-rw-r--r--  0001-ZEN-Add-sysctl-and-CONFIG-to-disallow-unprivileged-C.patch |    6
-rw-r--r--  0002-x86-setup-Consolidate-early-memory-reservations.patch      |  188
-rw-r--r--  0003-bfq-lucjan-r2K210602.patch                                 | 1534
-rw-r--r--  0003-x86-setup-Merge-several-reservations-of-start-of-mem.patch |   67
-rw-r--r--  0004-x86-setup-Move-trim_snb_memory-later-in-setup_arch-t.patch |   87
-rw-r--r--  0005-x86-setup-always-reserve-the-first-1M-of-RAM.patch         |  170
-rw-r--r--  0006-x86-setup-remove-CONFIG_X86_RESERVE_LOW-and-reservel.patch |  114
-rw-r--r--  0007-x86-crash-remove-crash_reserve_low_1M.patch                |   58
-rw-r--r--  0008-UKSM.patch (renamed from 0002-UKSM.patch)                  |    0
-rw-r--r--  PKGBUILD                                                        |   24
-rw-r--r--  config                                                          |   92
12 files changed, 793 insertions(+), 1573 deletions(-)
diff --git a/.SRCINFO b/.SRCINFO
index 227d593bc341..ebdb676050b7 100644
--- a/.SRCINFO
+++ b/.SRCINFO
@@ -1,5 +1,5 @@
pkgbase = linux-ck-uksm
- pkgver = 5.12.10
+ pkgver = 5.12.12
pkgrel = 1
url = https://wiki.archlinux.org/index.php/Linux-ck
arch = x86_64
@@ -12,24 +12,34 @@ pkgbase = linux-ck-uksm
makedepends = tar
makedepends = xz
options = !strip
- source = https://www.kernel.org/pub/linux/kernel/v5.x/linux-5.12.10.tar.xz
- source = https://www.kernel.org/pub/linux/kernel/v5.x/linux-5.12.10.tar.sign
+ source = https://www.kernel.org/pub/linux/kernel/v5.x/linux-5.12.12.tar.xz
+ source = https://www.kernel.org/pub/linux/kernel/v5.x/linux-5.12.12.tar.sign
source = config
source = more-uarches-20210610.tar.gz::https://github.com/graysky2/kernel_compiler_patch/archive/20210610.tar.gz
source = http://ck.kolivas.org/patches/5.0/5.12/5.12-ck1/patch-5.12-ck1.xz
source = 0001-ZEN-Add-sysctl-and-CONFIG-to-disallow-unprivileged-C.patch
- source = 0002-UKSM.patch
- source = 0003-bfq-lucjan-r2K210602.patch
+ source = 0002-x86-setup-Consolidate-early-memory-reservations.patch
+ source = 0003-x86-setup-Merge-several-reservations-of-start-of-mem.patch
+ source = 0004-x86-setup-Move-trim_snb_memory-later-in-setup_arch-t.patch
+ source = 0005-x86-setup-always-reserve-the-first-1M-of-RAM.patch
+ source = 0006-x86-setup-remove-CONFIG_X86_RESERVE_LOW-and-reservel.patch
+ source = 0007-x86-crash-remove-crash_reserve_low_1M.patch
+ source = 0008-UKSM.patch
validpgpkeys = ABAF11C65A2970B130ABE3C479BE3E4300411886
validpgpkeys = 647F28654894E3BD457199BE38DBBDC86092693E
- b2sums = b40ef5a11ca435299899e8131fa72af147455cd8ebee4c0e187572b1f628e66d2b6fbb318308bc911a598d8303d1ab3622d52966deaa5c48d59dcd65f4f58687
+ b2sums = f9aef3da2f65916cc30da9a066217d3379036e6a32a732224da7fee86c80810315484f48132b50b8cf8eb5e0b055ad1b7bbe63dadd0eb54b0b0626bc57c20963
b2sums = SKIP
b2sums = SKIP
b2sums = 30d1df754608bb423cbc99c2097ad521baa091b9a3b39df4bd5c2d50c57eec54d8fa0e4a4a04b847c3d1b87ba682cadc8db45fabeefdc9ad7caaf8e77b96e41a
b2sums = c9f729ba1efe6f04e7b2c57d3999bc9675b577596dccb2f227e5b6e444285e1fdd270bf67c0fcf9f5808a4c3a4b1c7a5c13a76f754ad9b9447243ccbaf2ce6a3
- b2sums = e1eccb5b6b728e3852ade55dae7a53b8b6bd5f0fb2a330b99e85bfa64abaa430cb714d301ed169df14a1f302a75d952992f0d8fa6ab02fa6716165bdf23b63aa
+ b2sums = dda152592dec643bce44754bf5d2d43a5897cc57f8dc258b87857055a45abf903d619aba1de389228cb086a17fedea5458f8fe2c0993fa20213bb7c5bca331c8
+ b2sums = 13330cf57b5c6b928ea73bd30479010688cf8d2003107b041a7fdad33c1ac225c8c905bef235cd762d6ea76be754b5db6be769526bacf7333298f72d6afff535
+ b2sums = 381e0f177faa3090d1abf4d11a97db535712840870265dea167d7692dee7733a226d09c103d01705d5c0809fa66c7a23efea9da2473da672644b06e31db77083
+ b2sums = cd9da0dee048fc52a3032343f122c2055081eeedfc8a3e5227218f0f63fc7618e8fe744c8caa7e3a2ca844f4aaf7314b57a306d0d3b1849e97b24687b8c5a501
+ b2sums = 1810832172e1b006a5471d8e317573343884feed9abc9e7380a32d83c958b0e6aa68adf9a647c9b7b714783997591f5d80e754c6e7357279661eee998f22864c
+ b2sums = 4e7cb958f95d99bba9810e675d4f1b0b3c171f78e9fe96ff9d265f792f4ceb1367f2f4d238f36b5ca1c395e14abdabbf0f8ce2dc07c4fe567d822a8b629dfa05
+ b2sums = 2251f8bf84e141b4661f84cc2ce7b21783ac0a349b2651477dfcbc5383b796b2e588d85ee411398b15c820cb3672256be8ed281c8bccfad252c9dd5b0e1e0cd5
b2sums = 14f45171afc3b15488b40a05e58b352c5057da3a5782e13527392f7750d8e45a8db54f9b50b218fedb8bf679de3b4e5d78e230a44f7b1aa482f7b3aa831bd641
- b2sums = 6a195695fcd207adbdea28dd2803b479f6e5dc478b56d5fce16a7600f719fa545ed0e468a26f9c94e982346fb803a0ff026abd0d70335e42027468475beb7cbb
pkgname = linux-ck-uksm
pkgdesc = The Linux-ck-uksm kernel and modules with the ck1 patchset featuring MuQSS CPU scheduler
diff --git a/0001-ZEN-Add-sysctl-and-CONFIG-to-disallow-unprivileged-C.patch b/0001-ZEN-Add-sysctl-and-CONFIG-to-disallow-unprivileged-C.patch
index 79dab97ee81a..73e35ef52bf5 100644
--- a/0001-ZEN-Add-sysctl-and-CONFIG-to-disallow-unprivileged-C.patch
+++ b/0001-ZEN-Add-sysctl-and-CONFIG-to-disallow-unprivileged-C.patch
@@ -1,7 +1,7 @@
-From f8f830397db175f686669b8b36755a6e5d5c3f03 Mon Sep 17 00:00:00 2001
+From fa17daad7209d62169553ce6336ef29ba4748049 Mon Sep 17 00:00:00 2001
From: "Jan Alexander Steffens (heftig)" <jan.steffens@gmail.com>
Date: Mon, 16 Sep 2019 04:53:20 +0200
-Subject: [PATCH 1/2] ZEN: Add sysctl and CONFIG to disallow unprivileged
+Subject: [PATCH 1/8] ZEN: Add sysctl and CONFIG to disallow unprivileged
CLONE_NEWUSER
Our default behavior continues to match the vanilla kernel.
@@ -150,5 +150,5 @@ index 9a4b980d695b..4388ca13ea3f 100644
static DEFINE_MUTEX(userns_state_mutex);
--
-2.31.1
+2.32.0
diff --git a/0002-x86-setup-Consolidate-early-memory-reservations.patch b/0002-x86-setup-Consolidate-early-memory-reservations.patch
new file mode 100644
index 000000000000..20c380797611
--- /dev/null
+++ b/0002-x86-setup-Consolidate-early-memory-reservations.patch
@@ -0,0 +1,188 @@
+From 56e6bb0fe2b790adda81851794409faa533e521c Mon Sep 17 00:00:00 2001
+From: Mike Rapoport <rppt@linux.ibm.com>
+Date: Tue, 2 Mar 2021 12:04:05 +0200
+Subject: [PATCH 2/8] x86/setup: Consolidate early memory reservations
+
+The early reservations of memory areas used by the firmware, bootloader,
+kernel text and data are spread over setup_arch(). Moreover, some of them
+happen *after* memblock allocations, e.g trim_platform_memory_ranges() and
+trim_low_memory_range() are called after reserve_real_mode() that allocates
+memory.
+
+There was no corruption of these memory regions because memblock always
+allocates memory either from the end of memory (in top-down mode) or above
+the kernel image (in bottom-up mode). However, the bottom up mode is going
+to be updated to span the entire memory [1] to avoid limitations caused by
+KASLR.
+
+Consolidate early memory reservations in a dedicated function to improve
+robustness against future changes. Having the early reservations in one
+place also makes it clearer what memory must be reserved before memblock
+allocations are allowed.
+
+Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Reviewed-by: Baoquan He <bhe@redhat.com>
+Acked-by: Borislav Petkov <bp@suse.de>
+Acked-by: David Hildenbrand <david@redhat.com>
+Link: [1] https://lore.kernel.org/lkml/20201217201214.3414100-2-guro@fb.com
+Link: https://lkml.kernel.org/r/20210302100406.22059-2-rppt@kernel.org
+---
+ arch/x86/kernel/setup.c | 92 ++++++++++++++++++++---------------------
+ 1 file changed, 44 insertions(+), 48 deletions(-)
+
+diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
+index e79f21d13a0d..420d881da2bd 100644
+--- a/arch/x86/kernel/setup.c
++++ b/arch/x86/kernel/setup.c
+@@ -646,18 +646,6 @@ static void __init trim_snb_memory(void)
+ }
+ }
+
+-/*
+- * Here we put platform-specific memory range workarounds, i.e.
+- * memory known to be corrupt or otherwise in need to be reserved on
+- * specific platforms.
+- *
+- * If this gets used more widely it could use a real dispatch mechanism.
+- */
+-static void __init trim_platform_memory_ranges(void)
+-{
+- trim_snb_memory();
+-}
+-
+ static void __init trim_bios_range(void)
+ {
+ /*
+@@ -730,7 +718,38 @@ static void __init trim_low_memory_range(void)
+ {
+ memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE));
+ }
+-
++
++static void __init early_reserve_memory(void)
++{
++ /*
++ * Reserve the memory occupied by the kernel between _text and
++ * __end_of_kernel_reserve symbols. Any kernel sections after the
++ * __end_of_kernel_reserve symbol must be explicitly reserved with a
++ * separate memblock_reserve() or they will be discarded.
++ */
++ memblock_reserve(__pa_symbol(_text),
++ (unsigned long)__end_of_kernel_reserve - (unsigned long)_text);
++
++ /*
++ * Make sure page 0 is always reserved because on systems with
++ * L1TF its contents can be leaked to user processes.
++ */
++ memblock_reserve(0, PAGE_SIZE);
++
++ early_reserve_initrd();
++
++ if (efi_enabled(EFI_BOOT))
++ efi_memblock_x86_reserve_range();
++
++ memblock_x86_reserve_range_setup_data();
++
++ reserve_ibft_region();
++ reserve_bios_regions();
++
++ trim_snb_memory();
++ trim_low_memory_range();
++}
++
+ /*
+ * Dump out kernel offset information on panic.
+ */
+@@ -765,29 +784,6 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
+
+ void __init setup_arch(char **cmdline_p)
+ {
+- /*
+- * Reserve the memory occupied by the kernel between _text and
+- * __end_of_kernel_reserve symbols. Any kernel sections after the
+- * __end_of_kernel_reserve symbol must be explicitly reserved with a
+- * separate memblock_reserve() or they will be discarded.
+- */
+- memblock_reserve(__pa_symbol(_text),
+- (unsigned long)__end_of_kernel_reserve - (unsigned long)_text);
+-
+- /*
+- * Make sure page 0 is always reserved because on systems with
+- * L1TF its contents can be leaked to user processes.
+- */
+- memblock_reserve(0, PAGE_SIZE);
+-
+- early_reserve_initrd();
+-
+- /*
+- * At this point everything still needed from the boot loader
+- * or BIOS or kernel text should be early reserved or marked not
+- * RAM in e820. All other memory is free game.
+- */
+-
+ #ifdef CONFIG_X86_32
+ memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
+
+@@ -911,8 +907,18 @@ void __init setup_arch(char **cmdline_p)
+
+ parse_early_param();
+
+- if (efi_enabled(EFI_BOOT))
+- efi_memblock_x86_reserve_range();
++ /*
++ * Do some memory reservations *before* memory is added to
++ * memblock, so memblock allocations won't overwrite it.
++ * Do it after early param, so we could get (unlikely) panic from
++ * serial.
++ *
++ * After this point everything still needed from the boot loader or
++ * firmware or kernel text should be early reserved or marked not
++ * RAM in e820. All other memory is free game.
++ */
++ early_reserve_memory();
++
+ #ifdef CONFIG_MEMORY_HOTPLUG
+ /*
+ * Memory used by the kernel cannot be hot-removed because Linux
+@@ -939,9 +945,6 @@ void __init setup_arch(char **cmdline_p)
+
+ x86_report_nx();
+
+- /* after early param, so could get panic from serial */
+- memblock_x86_reserve_range_setup_data();
+-
+ if (acpi_mps_check()) {
+ #ifdef CONFIG_X86_LOCAL_APIC
+ disable_apic = 1;
+@@ -1033,8 +1036,6 @@ void __init setup_arch(char **cmdline_p)
+ */
+ find_smp_config();
+
+- reserve_ibft_region();
+-
+ early_alloc_pgt_buf();
+
+ /*
+@@ -1055,8 +1056,6 @@ void __init setup_arch(char **cmdline_p)
+ */
+ sev_setup_arch();
+
+- reserve_bios_regions();
+-
+ efi_fake_memmap();
+ efi_find_mirror();
+ efi_esrt_init();
+@@ -1082,9 +1081,6 @@ void __init setup_arch(char **cmdline_p)
+
+ reserve_real_mode();
+
+- trim_platform_memory_ranges();
+- trim_low_memory_range();
+-
+ init_mem_mapping();
+
+ idt_setup_early_pf();
+--
+2.32.0
+
diff --git a/0003-bfq-lucjan-r2K210602.patch b/0003-bfq-lucjan-r2K210602.patch
deleted file mode 100644
index b1fbccc60879..000000000000
--- a/0003-bfq-lucjan-r2K210602.patch
+++ /dev/null
@@ -1,1534 +0,0 @@
-diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
-index b791e2041..ede8a0f0e 100644
---- a/block/bfq-cgroup.c
-+++ b/block/bfq-cgroup.c
-@@ -224,7 +224,7 @@ void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
- {
- blkg_rwstat_add(&bfqg->stats.queued, op, 1);
- bfqg_stats_end_empty_time(&bfqg->stats);
-- if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
-+ if (!(bfqq == bfqg->bfqd->in_service_queue))
- bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
- }
-
-@@ -309,8 +309,7 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
- {
- struct bfq_entity *group_entity = bfqq->entity.parent;
-
-- return group_entity ? container_of(group_entity, struct bfq_group,
-- entity) :
-+ return group_entity ? bfq_entity_to_bfqg(group_entity) :
- bfqq->bfqd->root_group;
- }
-
-@@ -427,6 +426,7 @@ void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg)
-
- entity->weight = entity->new_weight;
- entity->orig_weight = entity->new_weight;
-+ entity->prio_changed = 0;
- if (bfqq) {
- bfqq->ioprio = bfqq->new_ioprio;
- bfqq->ioprio_class = bfqq->new_ioprio_class;
-@@ -547,6 +547,8 @@ static void bfq_pd_init(struct blkg_policy_data *pd)
-
- entity->orig_weight = entity->weight = entity->new_weight = d->weight;
- entity->my_sched_data = &bfqg->sched_data;
-+ entity->last_bfqq_created = NULL;
-+
- bfqg->my_entity = entity; /*
- * the root_group's will be set to NULL
- * in bfq_init_queue()
-@@ -610,8 +612,7 @@ struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
- */
- entity = &bfqg->entity;
- for_each_entity(entity) {
-- struct bfq_group *curr_bfqg = container_of(entity,
-- struct bfq_group, entity);
-+ struct bfq_group *curr_bfqg = bfq_entity_to_bfqg(entity);
- if (curr_bfqg != bfqd->root_group) {
- parent = bfqg_parent(curr_bfqg);
- if (!parent)
-@@ -1431,15 +1432,11 @@ void bfqg_and_blkg_put(struct bfq_group *bfqg) {}
- struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
- {
- struct bfq_group *bfqg;
-- int i;
-
- bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
- if (!bfqg)
- return NULL;
-
-- for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
-- bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
--
- return bfqg;
- }
- #endif /* CONFIG_BFQ_GROUP_IOSCHED */
-diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
-index bc319931d..695421b08 100644
---- a/block/bfq-iosched.c
-+++ b/block/bfq-iosched.c
-@@ -372,9 +372,38 @@ struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)
- return bic->bfqq[is_sync];
- }
-
-+static void bfq_put_stable_ref(struct bfq_queue *bfqq);
-+
- void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync)
- {
-+ /*
-+ * If bfqq != NULL, then a non-stable queue merge between
-+ * bic->bfqq and bfqq is happening here. This causes troubles
-+ * in the following case: bic->bfqq has also been scheduled
-+ * for a possible stable merge with bic->stable_merge_bfqq,
-+ * and bic->stable_merge_bfqq == bfqq happens to
-+ * hold. Troubles occur because bfqq may then undergo a split,
-+ * thereby becoming eligible for a stable merge. Yet, if
-+ * bic->stable_merge_bfqq points exactly to bfqq, then bfqq
-+ * would be stably merged with itself. To avoid this anomaly,
-+ * we cancel the stable merge if
-+ * bic->stable_merge_bfqq == bfqq.
-+ */
- bic->bfqq[is_sync] = bfqq;
-+
-+ if (bfqq && bic->stable_merge_bfqq == bfqq) {
-+ /*
-+ * Actually, these same instructions are executed also
-+ * in bfq_setup_cooperator, in case of abort or actual
-+ * execution of a stable merge. We could avoid
-+ * repeating these instructions there too, but if we
-+ * did so, we would nest even more complexity in this
-+ * function.
-+ */
-+ bfq_put_stable_ref(bic->stable_merge_bfqq);
-+
-+ bic->stable_merge_bfqq = NULL;
-+ }
- }
-
- struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
-@@ -1075,7 +1104,7 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
- static int bfqq_process_refs(struct bfq_queue *bfqq)
- {
- return bfqq->ref - bfqq->allocated - bfqq->entity.on_st_or_in_serv -
-- (bfqq->weight_counter != NULL);
-+ (bfqq->weight_counter != NULL) - bfqq->stable_ref;
- }
-
- /* Empty burst list and add just bfqq (see comments on bfq_handle_burst) */
-@@ -2220,7 +2249,7 @@ static void bfq_remove_request(struct request_queue *q,
- bfqd->queued--;
- elv_rb_del(&bfqq->sort_list, rq);
-
-- elv_rqhash_del(q, rq);
-+ elv_rqhash_del(rq);
- if (q->last_merge == rq)
- q->last_merge = NULL;
-
-@@ -2288,9 +2317,9 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio,
-
- ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free);
-
-+ spin_unlock_irq(&bfqd->lock);
- if (free)
- blk_mq_free_request(free);
-- spin_unlock_irq(&bfqd->lock);
-
- return ret;
- }
-@@ -2376,7 +2405,7 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq,
- *next_bfqq = bfq_init_rq(next);
-
- if (!bfqq)
-- return;
-+ goto remove;
-
- /*
- * If next and rq belong to the same bfq_queue and next is older
-@@ -2399,6 +2428,14 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq,
- bfqq->next_rq = rq;
-
- bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);
-+remove:
-+ /* Merged request may be in the IO scheduler. Remove it. */
-+ if (!RB_EMPTY_NODE(&next->rb_node)) {
-+ bfq_remove_request(next->q, next);
-+ if (next_bfqq)
-+ bfqg_stats_update_io_remove(bfqq_group(next_bfqq),
-+ next->cmd_flags);
-+ }
- }
-
- /* Must be called with bfqq != NULL */
-@@ -2627,6 +2664,9 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
- return true;
- }
-
-+static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
-+ struct bfq_queue *bfqq);
-+
- /*
- * Attempt to schedule a merge of bfqq with the currently in-service
- * queue or with a close queue among the scheduled queues. Return
-@@ -2649,10 +2689,55 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
- */
- static struct bfq_queue *
- bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-- void *io_struct, bool request)
-+ void *io_struct, bool request, struct bfq_io_cq *bic)
- {
- struct bfq_queue *in_service_bfqq, *new_bfqq;
-
-+ /*
-+ * Check delayed stable merge for rotational or non-queueing
-+ * devs. For this branch to be executed, bfqq must not be
-+ * currently merged with some other queue (i.e., bfqq->bic
-+ * must be non null). If we considered also merged queues,
-+ * then we should also check whether bfqq has already been
-+ * merged with bic->stable_merge_bfqq. But this would be
-+ * costly and complicated.
-+ */
-+ if (unlikely(!bfqd->nonrot_with_queueing)) {
-+ /*
-+ * Make sure also that bfqq is sync, because
-+ * bic->stable_merge_bfqq may point to some queue (for
-+ * stable merging) also if bic is associated with a
-+ * sync queue, but bfqq is async
-+ */
-+ if (bfq_bfqq_sync(bfqq) && bic->stable_merge_bfqq &&
-+ !bfq_bfqq_just_created(bfqq) &&
-+ time_is_before_jiffies(bfqq->split_time +
-+ msecs_to_jiffies(200))) {
-+ struct bfq_queue *stable_merge_bfqq =
-+ bic->stable_merge_bfqq;
-+ int proc_ref = min(bfqq_process_refs(bfqq),
-+ bfqq_process_refs(stable_merge_bfqq));
-+
-+ /* deschedule stable merge, because done or aborted here */
-+ bfq_put_stable_ref(stable_merge_bfqq);
-+
-+ bic->stable_merge_bfqq = NULL;
-+
-+ if (!idling_boosts_thr_without_issues(bfqd, bfqq) &&
-+ proc_ref > 0) {
-+ /* next function will take at least one ref */
-+ struct bfq_queue *new_bfqq =
-+ bfq_setup_merge(bfqq, stable_merge_bfqq);
-+
-+ bic->stably_merged = true;
-+ if (new_bfqq && new_bfqq->bic)
-+ new_bfqq->bic->stably_merged = true;
-+ return new_bfqq;
-+ } else
-+ return NULL;
-+ }
-+ }
-+
- /*
- * Do not perform queue merging if the device is non
- * rotational and performs internal queueing. In fact, such a
-@@ -2794,6 +2879,17 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
- }
- }
-
-+
-+static void
-+bfq_reassign_last_bfqq(struct bfq_queue *cur_bfqq, struct bfq_queue *new_bfqq)
-+{
-+ if (cur_bfqq->entity.parent &&
-+ cur_bfqq->entity.parent->last_bfqq_created == cur_bfqq)
-+ cur_bfqq->entity.parent->last_bfqq_created = new_bfqq;
-+ else if (cur_bfqq->bfqd && cur_bfqq->bfqd->last_bfqq_created == cur_bfqq)
-+ cur_bfqq->bfqd->last_bfqq_created = new_bfqq;
-+}
-+
- void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq)
- {
- /*
-@@ -2811,6 +2907,8 @@ void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq)
- bfqq != bfqd->in_service_queue)
- bfq_del_bfqq_busy(bfqd, bfqq, false);
-
-+ bfq_reassign_last_bfqq(bfqq, NULL);
-+
- bfq_put_queue(bfqq);
- }
-
-@@ -2827,6 +2925,29 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
- bfq_mark_bfqq_IO_bound(new_bfqq);
- bfq_clear_bfqq_IO_bound(bfqq);
-
-+ /*
-+ * The processes associated with bfqq are cooperators of the
-+ * processes associated with new_bfqq. So, if bfqq has a
-+ * waker, then assume that all these processes will be happy
-+ * to let bfqq's waker freely inject I/O when they have no
-+ * I/O.
-+ */
-+ if (bfqq->waker_bfqq && !new_bfqq->waker_bfqq &&
-+ bfqq->waker_bfqq != new_bfqq) {
-+ new_bfqq->waker_bfqq = bfqq->waker_bfqq;
-+ new_bfqq->tentative_waker_bfqq = NULL;
-+
-+ /*
-+ * If the waker queue disappears, then
-+ * new_bfqq->waker_bfqq must be reset. So insert
-+ * new_bfqq into the woken_list of the waker. See
-+ * bfq_check_waker for details.
-+ */
-+ hlist_add_head(&new_bfqq->woken_list_node,
-+ &new_bfqq->waker_bfqq->woken_list);
-+
-+ }
-+
- /*
- * If bfqq is weight-raised, then let new_bfqq inherit
- * weight-raising. To reduce false positives, neglect the case
-@@ -2884,6 +3005,9 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
- */
- new_bfqq->pid = -1;
- bfqq->bic = NULL;
-+
-+ bfq_reassign_last_bfqq(bfqq, new_bfqq);
-+
- bfq_release_process_ref(bfqd, bfqq);
- }
-
-@@ -2911,7 +3035,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
- * We take advantage of this function to perform an early merge
- * of the queues of possible cooperating processes.
- */
-- new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
-+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false, bfqd->bio_bic);
- if (new_bfqq) {
- /*
- * bic still points to bfqq, then it has not yet been
-@@ -4496,9 +4620,15 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
- bfq_bfqq_busy(bfqq->bic->bfqq[0]) &&
- bfqq->bic->bfqq[0]->next_rq ?
- bfqq->bic->bfqq[0] : NULL;
-+ struct bfq_queue *blocked_bfqq =
-+ !hlist_empty(&bfqq->woken_list) ?
-+ container_of(bfqq->woken_list.first,
-+ struct bfq_queue,
-+ woken_list_node)
-+ : NULL;
-
- /*
-- * The next three mutually-exclusive ifs decide
-+ * The next four mutually-exclusive ifs decide
- * whether to try injection, and choose the queue to
- * pick an I/O request from.
- *
-@@ -4531,7 +4661,15 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
- * next bfqq's I/O is brought forward dramatically,
- * for it is not blocked for milliseconds.
- *
-- * The third if checks whether bfqq is a queue for
-+ * The third if checks whether there is a queue woken
-+ * by bfqq, and currently with pending I/O. Such a
-+ * woken queue does not steal bandwidth from bfqq,
-+ * because it remains soon without I/O if bfqq is not
-+ * served. So there is virtually no risk of loss of
-+ * bandwidth for bfqq if this woken queue has I/O
-+ * dispatched while bfqq is waiting for new I/O.
-+ *
-+ * The fourth if checks whether bfqq is a queue for
- * which it is better to avoid injection. It is so if
- * bfqq delivers more throughput when served without
- * any further I/O from other queues in the middle, or
-@@ -4551,11 +4689,11 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
- * bfq_update_has_short_ttime(), it is rather likely
- * that, if I/O is being plugged for bfqq and the
- * waker queue has pending I/O requests that are
-- * blocking bfqq's I/O, then the third alternative
-+ * blocking bfqq's I/O, then the fourth alternative
- * above lets the waker queue get served before the
- * I/O-plugging timeout fires. So one may deem the
- * second alternative superfluous. It is not, because
-- * the third alternative may be way less effective in
-+ * the fourth alternative may be way less effective in
- * case of a synchronization. For two main
- * reasons. First, throughput may be low because the
- * inject limit may be too low to guarantee the same
-@@ -4564,7 +4702,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
- * guarantees (the second alternative unconditionally
- * injects a pending I/O request of the waker queue
- * for each bfq_dispatch_request()). Second, with the
-- * third alternative, the duration of the plugging,
-+ * fourth alternative, the duration of the plugging,
- * i.e., the time before bfqq finally receives new I/O,
- * may not be minimized, because the waker queue may
- * happen to be served only after other queues.
-@@ -4582,6 +4720,14 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
- bfq_bfqq_budget_left(bfqq->waker_bfqq)
- )
- bfqq = bfqq->waker_bfqq;
-+ else if (blocked_bfqq &&
-+ bfq_bfqq_busy(blocked_bfqq) &&
-+ blocked_bfqq->next_rq &&
-+ bfq_serv_to_charge(blocked_bfqq->next_rq,
-+ blocked_bfqq) <=
-+ bfq_bfqq_budget_left(blocked_bfqq)
-+ )
-+ bfqq = blocked_bfqq;
- else if (!idling_boosts_thr_without_issues(bfqd, bfqq) &&
- (bfqq->wr_coeff == 1 || bfqd->wr_busy_queues > 1 ||
- !bfq_bfqq_has_short_ttime(bfqq)))
-@@ -4813,6 +4959,17 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
- if (!bfqq)
- goto exit;
-
-+ /*
-+ * Here, the IO depth of queues belong to CLASS_IDLE is limited
-+ * to 1, so that it can avoid introducing a larger tail latency
-+ * under a device with a larger IO depth. Although limiting the
-+ * IO depth may reduce the performance of idle_class, it is
-+ * generally not a big problem, because idle_class usually
-+ * does not have strict performance requirements.
-+ */
-+ if (bfq_class_idle(bfqq) && bfqq->dispatched)
-+ goto exit;
-+
- rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq);
-
- if (rq) {
-@@ -4988,6 +5145,12 @@ void bfq_put_queue(struct bfq_queue *bfqq)
- bfqg_and_blkg_put(bfqg);
- }
-
-+static void bfq_put_stable_ref(struct bfq_queue *bfqq)
-+{
-+ bfqq->stable_ref--;
-+ bfq_put_queue(bfqq);
-+}
-+
- static void bfq_put_cooperator(struct bfq_queue *bfqq)
- {
- struct bfq_queue *__bfqq, *next;
-@@ -5044,6 +5207,24 @@ static void bfq_exit_icq(struct io_cq *icq)
- {
- struct bfq_io_cq *bic = icq_to_bic(icq);
-
-+ if (bic->stable_merge_bfqq) {
-+ struct bfq_data *bfqd = bic->stable_merge_bfqq->bfqd;
-+
-+ /*
-+ * bfqd is NULL if scheduler already exited, and in
-+ * that case this is the last time bfqq is accessed.
-+ */
-+ if (bfqd) {
-+ unsigned long flags;
-+
-+ spin_lock_irqsave(&bfqd->lock, flags);
-+ bfq_put_stable_ref(bic->stable_merge_bfqq);
-+ spin_unlock_irqrestore(&bfqd->lock, flags);
-+ } else {
-+ bfq_put_stable_ref(bic->stable_merge_bfqq);
-+ }
-+ }
-+
- bfq_exit_icq_bfqq(bic, true);
- bfq_exit_icq_bfqq(bic, false);
- }
-@@ -5104,7 +5285,8 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
-
- static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
- struct bio *bio, bool is_sync,
-- struct bfq_io_cq *bic);
-+ struct bfq_io_cq *bic,
-+ bool respawn);
-
- static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
- {
-@@ -5124,7 +5306,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
- bfqq = bic_to_bfqq(bic, false);
- if (bfqq) {
- bfq_release_process_ref(bfqd, bfqq);
-- bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
-+ bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic, true);
- bic_set_bfqq(bic, bfqq, false);
- }
-
-@@ -5167,6 +5349,8 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
- /* set end request to minus infinity from now */
- bfqq->ttime.last_end_request = now_ns + 1;
-
-+ bfqq->creation_time = jiffies;
-+
- bfqq->io_start_time = now_ns;
-
- bfq_mark_bfqq_IO_bound(bfqq);
-@@ -5216,9 +5400,156 @@ static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
- }
- }
-
-+static struct bfq_queue *
-+bfq_do_early_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-+ struct bfq_io_cq *bic,
-+ struct bfq_queue *last_bfqq_created)
-+{
-+ struct bfq_queue *new_bfqq =
-+ bfq_setup_merge(bfqq, last_bfqq_created);
-+
-+ if (!new_bfqq)
-+ return bfqq;
-+
-+ if (new_bfqq->bic)
-+ new_bfqq->bic->stably_merged = true;
-+ bic->stably_merged = true;
-+
-+ /*
-+ * Reusing merge functions. This implies that
-+ * bfqq->bic must be set too, for
-+ * bfq_merge_bfqqs to correctly save bfqq's
-+ * state before killing it.
-+ */
-+ bfqq->bic = bic;
-+ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
-+
-+ return new_bfqq;
-+}
-+
-+/*
-+ * Many throughput-sensitive workloads are made of several parallel
-+ * I/O flows, with all flows generated by the same application, or
-+ * more generically by the same task (e.g., system boot). The most
-+ * counterproductive action with these workloads is plugging I/O
-+ * dispatch when one of the bfq_queues associated with these flows
-+ * remains temporarily empty.
-+ *
-+ * To avoid this plugging, BFQ has been using a burst-handling
-+ * mechanism for years now. This mechanism has proven effective for
-+ * throughput, and not detrimental for service guarantees. The
-+ * following function pushes this mechanism a little bit further,
-+ * basing on the following two facts.
-+ *
-+ * First, all the I/O flows of a the same application or task
-+ * contribute to the execution/completion of that common application
-+ * or task. So the performance figures that matter are total
-+ * throughput of the flows and task-wide I/O latency. In particular,
-+ * these flows do not need to be protected from each other, in terms
-+ * of individual bandwidth or latency.
-+ *
-+ * Second, the above fact holds regardless of the number of flows.
-+ *
-+ * Putting these two facts together, this commits merges stably the
-+ * bfq_queues associated with these I/O flows, i.e., with the
-+ * processes that generate these IO/ flows, regardless of how many the
-+ * involved processes are.
-+ *
-+ * To decide whether a set of bfq_queues is actually associated with
-+ * the I/O flows of a common application or task, and to merge these
-+ * queues stably, this function operates as follows: given a bfq_queue,
-+ * say Q2, currently being created, and the last bfq_queue, say Q1,
-+ * created before Q2, Q2 is merged stably with Q1 if
-+ * - very little time has elapsed since when Q1 was created
-+ * - Q2 has the same ioprio as Q1
-+ * - Q2 belongs to the same group as Q1
-+ *
-+ * Merging bfq_queues also reduces scheduling overhead. A fio test
-+ * with ten random readers on /dev/nullb shows a throughput boost of
-+ * 40%, with a quadcore. Since BFQ's execution time amounts to ~50% of
-+ * the total per-request processing time, the above throughput boost
-+ * implies that BFQ's overhead is reduced by more than 50%.
-+ *
-+ * This new mechanism most certainly obsoletes the current
-+ * burst-handling heuristics. We keep those heuristics for the moment.
-+ */
-+static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd,
-+ struct bfq_queue *bfqq,
-+ struct bfq_io_cq *bic)
-+{
-+ struct bfq_queue **source_bfqq = bfqq->entity.parent ?
-+ &bfqq->entity.parent->last_bfqq_created :
-+ &bfqd->last_bfqq_created;
-+
-+ struct bfq_queue *last_bfqq_created = *source_bfqq;
-+
-+ /*
-+ * If last_bfqq_created has not been set yet, then init it. If
-+ * it has been set already, but too long ago, then move it
-+ * forward to bfqq. Finally, move also if bfqq belongs to a
-+ * different group than last_bfqq_created, or if bfqq has a
-+ * different ioprio or ioprio_class. If none of these
-+ * conditions holds true, then try an early stable merge or
-+ * schedule a delayed stable merge.
-+ *
-+ * A delayed merge is scheduled (instead of performing an
-+ * early merge), in case bfqq might soon prove to be more
-+ * throughput-beneficial if not merged. Currently this is
-+ * possible only if bfqd is rotational with no queueing. For
-+ * such a drive, not merging bfqq is better for throughput if
-+ * bfqq happens to contain sequential I/O. So, we wait a
-+ * little bit for enough I/O to flow through bfqq. After that,
-+ * if such an I/O is sequential, then the merge is
-+ * canceled. Otherwise the merge is finally performed.
-+ */
-+ if (!last_bfqq_created ||
-+ time_before(last_bfqq_created->creation_time +
-+ bfqd->bfq_burst_interval,
-+ bfqq->creation_time) ||
-+ bfqq->entity.parent != last_bfqq_created->entity.parent ||
-+ bfqq->ioprio != last_bfqq_created->ioprio ||
-+ bfqq->ioprio_class != last_bfqq_created->ioprio_class)
-+ *source_bfqq = bfqq;
-+ else if (time_after_eq(last_bfqq_created->creation_time +
-+ bfqd->bfq_burst_interval,
-+ bfqq->creation_time)) {
-+ if (likely(bfqd->nonrot_with_queueing))
-+ /*
-+ * With this type of drive, leaving
-+ * bfqq alone may provide no
-+ * throughput benefits compared with
-+ * merging bfqq. So merge bfqq now.
-+ */
-+ bfqq = bfq_do_early_stable_merge(bfqd, bfqq,
-+ bic,
-+ last_bfqq_created);
-+ else { /* schedule tentative stable merge */
-+ /*
-+ * get reference on last_bfqq_created,
-+ * to prevent it from being freed,
-+ * until we decide whether to merge
-+ */
-+ last_bfqq_created->ref++;
-+ /*
-+ * need to keep track of stable refs, to
-+ * compute process refs correctly
-+ */
-+ last_bfqq_created->stable_ref++;
-+ /*
-+ * Record the bfqq to merge to.
-+ */
-+ bic->stable_merge_bfqq = last_bfqq_created;
-+ }
-+ }
-+
-+ return bfqq;
-+}
-+
-+
- static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
- struct bio *bio, bool is_sync,
-- struct bfq_io_cq *bic)
-+ struct bfq_io_cq *bic,
-+ bool respawn)
- {
- const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
- const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
-@@ -5276,7 +5607,10 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
-
- out:
- bfqq->ref++; /* get a process reference to this queue */
-- bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
-+
-+ if (bfqq != &bfqd->oom_bfqq && is_sync && !respawn)
-+ bfqq = bfq_do_or_sched_stable_merge(bfqd, bfqq, bic);
-+
- rcu_read_unlock();
- return bfqq;
- }
-@@ -5526,7 +5860,8 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
- static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
- {
- struct bfq_queue *bfqq = RQ_BFQQ(rq),
-- *new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
-+ *new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true,
-+ RQ_BIC(rq));
- bool waiting, idle_timer_disabled = false;
-
- if (new_bfqq) {
-@@ -5615,14 +5950,16 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
- struct bfq_queue *bfqq;
- bool idle_timer_disabled = false;
- unsigned int cmd_flags;
-+ LIST_HEAD(free);
-
- #ifdef CONFIG_BFQ_GROUP_IOSCHED
- if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio)
- bfqg_stats_update_legacy_io(q, rq);
- #endif
- spin_lock_irq(&bfqd->lock);
-- if (blk_mq_sched_try_insert_merge(q, rq)) {
-+ if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
- spin_unlock_irq(&bfqd->lock);
-+ blk_mq_free_requests(&free);
- return;
- }
-
-@@ -5632,7 +5969,48 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
-
- spin_lock_irq(&bfqd->lock);
- bfqq = bfq_init_rq(rq);
-- if (!bfqq || at_head || blk_rq_is_passthrough(rq)) {
-+
-+ /*
-+ * Reqs with at_head or passthrough flags set are to be put
-+ * directly into dispatch list. Additional case for putting rq
-+ * directly into the dispatch queue: the only active
-+ * bfq_queues are bfqq and either its waker bfq_queue or one
-+ * of its woken bfq_queues. The rationale behind this
-+ * additional condition is as follows:
-+ * - consider a bfq_queue, say Q1, detected as a waker of
-+ * another bfq_queue, say Q2
-+ * - by definition of a waker, Q1 blocks the I/O of Q2, i.e.,
-+ * some I/O of Q1 needs to be completed for new I/O of Q2
-+ * to arrive. A notable example of waker is journald
-+ * - so, Q1 and Q2 are in any respect the queues of two
-+ * cooperating processes (or of two cooperating sets of
-+ * processes): the goal of Q1's I/O is doing what needs to
-+ * be done so that new Q2's I/O can finally be
-+ * issued. Therefore, if the service of Q1's I/O is delayed,
-+ * then Q2's I/O is delayed too. Conversely, if Q2's I/O is
-+ * delayed, the goal of Q1's I/O is hindered.
-+ * - as a consequence, if some I/O of Q1/Q2 arrives while
-+ * Q2/Q1 is the only queue in service, there is absolutely
-+ * no point in delaying the service of such an I/O. The
-+ * only possible result is a throughput loss
-+ * - so, when the above condition holds, the best option is to
-+ * have the new I/O dispatched as soon as possible
-+ * - the most effective and efficient way to attain the above
-+ * goal is to put the new I/O directly in the dispatch
-+ * list
-+ * - as an additional restriction, Q1 and Q2 must be the only
-+ * busy queues for this commit to put the I/O of Q2/Q1 in
-+ * the dispatch list. This is necessary, because, if also
-+ * other queues are waiting for service, then putting new
-+ * I/O directly in the dispatch list may evidently cause a
-+ * violation of service guarantees for the other queues
-+ */
-+ if (!bfqq ||
-+ (bfqq != bfqd->in_service_queue &&
-+ bfqd->in_service_queue != NULL &&
-+ bfq_tot_busy_queues(bfqd) == 1 + bfq_bfqq_busy(bfqq) &&
-+ (bfqq->waker_bfqq == bfqd->in_service_queue ||
-+ bfqd->in_service_queue->waker_bfqq == bfqq)) || at_head) {
- if (at_head)
- list_add(&rq->queuelist, &bfqd->dispatch);
- else
-@@ -5772,7 +6150,17 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
- 1UL<<(BFQ_RATE_SHIFT - 10))
- bfq_update_rate_reset(bfqd, NULL);
- bfqd->last_completion = now_ns;
-- bfqd->last_completed_rq_bfqq = bfqq;
-+ /*
-+ * Shared queues are likely to receive I/O at a high
-+ * rate. This may deceptively let them be considered as wakers
-+ * of other queues. But a false waker will unjustly steal
-+ * bandwidth to its supposedly woken queue. So considering
-+ * also shared queues in the waking mechanism may cause more
-+ * control troubles than throughput benefits. Then do not set
-+ * last_completed_rq_bfqq to bfqq if bfqq is a shared queue.
-+ */
-+ if (!bfq_bfqq_coop(bfqq))
-+ bfqd->last_completed_rq_bfqq = bfqq;
-
- /*
- * If we are waiting to discover whether the request pattern
-@@ -6015,6 +6403,7 @@ static void bfq_finish_requeue_request(struct request *rq)
- {
- struct bfq_queue *bfqq = RQ_BFQQ(rq);
- struct bfq_data *bfqd;
-+ unsigned long flags;
-
- /*
- * rq either is not associated with any icq, or is an already
-@@ -6032,39 +6421,15 @@ static void bfq_finish_requeue_request(struct request *rq)
- rq->io_start_time_ns,
- rq->cmd_flags);
-
-+ spin_lock_irqsave(&bfqd->lock, flags);
- if (likely(rq->rq_flags & RQF_STARTED)) {
-- unsigned long flags;
--
-- spin_lock_irqsave(&bfqd->lock, flags);
--
- if (rq == bfqd->waited_rq)
- bfq_update_inject_limit(bfqd, bfqq);
-
- bfq_completed_request(bfqq, bfqd);
-- bfq_finish_requeue_request_body(bfqq);
--
-- spin_unlock_irqrestore(&bfqd->lock, flags);
-- } else {
-- /*
-- * Request rq may be still/already in the scheduler,
-- * in which case we need to remove it (this should
-- * never happen in case of requeue). And we cannot
-- * defer such a check and removal, to avoid
-- * inconsistencies in the time interval from the end
-- * of this function to the start of the deferred work.
-- * This situation seems to occur only in process
-- * context, as a consequence of a merge. In the
-- * current version of the code, this implies that the
-- * lock is held.
-- */
--
-- if (!RB_EMPTY_NODE(&rq->rb_node)) {
-- bfq_remove_request(rq->q, rq);
-- bfqg_stats_update_io_remove(bfqq_group(bfqq),
-- rq->cmd_flags);
-- }
-- bfq_finish_requeue_request_body(bfqq);
- }
-+ bfq_finish_requeue_request_body(bfqq);
-+ spin_unlock_irqrestore(&bfqd->lock, flags);
-
- /*
- * Reset private fields. In case of a requeue, this allows
-@@ -6129,7 +6494,7 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
-
- if (bfqq)
- bfq_put_queue(bfqq);
-- bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);
-+ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, split);
-
- bic_set_bfqq(bic, bfqq, is_sync);
- if (split && is_sync) {
-@@ -6250,8 +6615,9 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
-
- if (likely(!new_queue)) {
- /* If the queue was seeky for too long, break it apart. */
-- if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
-- bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
-+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq) &&
-+ !bic->stably_merged) {
-+ struct bfq_queue *old_bfqq = bfqq;
-
- /* Update bic before losing reference to bfqq */
- if (bfq_bfqq_in_large_burst(bfqq))
-@@ -6260,11 +6626,24 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
- bfqq = bfq_split_bfqq(bic, bfqq);
- split = true;
-
-- if (!bfqq)
-+ if (!bfqq) {
- bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio,
- true, is_sync,
- NULL);
-- else
-+ bfqq->waker_bfqq = old_bfqq->waker_bfqq;
-+ bfqq->tentative_waker_bfqq = NULL;
-+
-+ /*
-+ * If the waker queue disappears, then
-+ * new_bfqq->waker_bfqq must be
-+ * reset. So insert new_bfqq into the
-+ * woken_list of the waker. See
-+ * bfq_check_waker for details.
-+ */
-+ if (bfqq->waker_bfqq)
-+ hlist_add_head(&bfqq->woken_list_node,
-+ &bfqq->waker_bfqq->woken_list);
-+ } else
- bfqq_already_existing = true;
- }
- }
-@@ -6531,9 +6910,11 @@ static void bfq_init_root_group(struct bfq_group *root_group,
- root_group->bfqd = bfqd;
- #endif
- root_group->rq_pos_tree = RB_ROOT;
-- for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
-+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
- root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
-- root_group->sched_data.bfq_class_idle_last_service = jiffies;
-+ root_group->sched_data.bfq_class_last_service[i] = jiffies;
-+ }
-+ root_group->sched_data.class_timeout_last_check = jiffies;
- }
-
- static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
-@@ -6926,6 +7307,7 @@ MODULE_ALIAS("bfq-iosched");
- static int __init bfq_init(void)
- {
- int ret;
-+ char msg[60] = "BFQ I/O-scheduler: BFQ-lucjan v5.12";
-
- #ifdef CONFIG_BFQ_GROUP_IOSCHED
- ret = blkcg_policy_register(&blkcg_policy_bfq);
-@@ -6957,6 +7339,11 @@ static int __init bfq_init(void)
- if (ret)
- goto slab_kill;
-
-+#ifdef CONFIG_BFQ_GROUP_IOSCHED
-+ strcat(msg, " (with cgroups support)");
-+#endif
-+ pr_info("%s", msg);
-+
- return 0;
-
- slab_kill:
-diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
-index b8e793c34..a79796912 100644
---- a/block/bfq-iosched.h
-+++ b/block/bfq-iosched.h
-@@ -13,7 +13,7 @@
- #include "blk-cgroup-rwstat.h"
-
- #define BFQ_IOPRIO_CLASSES 3
--#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
-+#define BFQ_CLASS_TIMEOUT (HZ/5)
-
- #define BFQ_MIN_WEIGHT 1
- #define BFQ_MAX_WEIGHT 1000
-@@ -22,7 +22,6 @@
- #define BFQ_DEFAULT_QUEUE_IOPRIO 4
-
- #define BFQ_WEIGHT_LEGACY_DFL 100
--#define BFQ_DEFAULT_GRP_IOPRIO 0
- #define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
-
- #define MAX_PID_STR_LENGTH 12
-@@ -97,9 +96,12 @@ struct bfq_sched_data {
- struct bfq_entity *next_in_service;
- /* array of service trees, one per ioprio_class */
- struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
-- /* last time CLASS_IDLE was served */
-- unsigned long bfq_class_idle_last_service;
--
-+ /* last time the class was served */
-+ unsigned long bfq_class_last_service[BFQ_IOPRIO_CLASSES];
-+ /* last time class timeout was checked */
-+ unsigned long class_timeout_last_check;
-+ /* next index to check class timeout */
-+ unsigned int next_class_index;
- };
-
- /**
-@@ -197,6 +199,9 @@ struct bfq_entity {
-
- /* flag, set if the entity is counted in groups_with_pending_reqs */
- bool in_groups_with_pending_reqs;
-+
-+ /* last child queue of entity created (for non-leaf entities) */
-+ struct bfq_queue *last_bfqq_created;
- };
-
- struct bfq_group;
-@@ -230,6 +235,8 @@ struct bfq_ttime {
- struct bfq_queue {
- /* reference counter */
- int ref;
-+ /* counter of references from other queues for delayed stable merge */
-+ int stable_ref;
- /* parent bfq_data */
- struct bfq_data *bfqd;
-
-@@ -365,6 +372,8 @@ struct bfq_queue {
-
- unsigned long first_IO_time; /* time of first I/O for this queue */
-
-+ unsigned long creation_time; /* when this queue is created */
-+
- /* max service rate measured so far */
- u32 max_service_rate;
-
-@@ -454,6 +463,11 @@ struct bfq_io_cq {
- u64 saved_last_serv_time_ns;
- unsigned int saved_inject_limit;
- unsigned long saved_decrease_time_jif;
-+
-+ /* candidate queue for a stable merge (due to close creation time) */
-+ struct bfq_queue *stable_merge_bfqq;
-+
-+ bool stably_merged; /* non splittable if true */
- };
-
- /**
-@@ -578,6 +592,9 @@ struct bfq_data {
- /* bfqq owning the last completed rq */
- struct bfq_queue *last_completed_rq_bfqq;
-
-+ /* last bfqq created, among those in the root group */
-+ struct bfq_queue *last_bfqq_created;
-+
- /* time of last transition from empty to non-empty (ns) */
- u64 last_empty_occupied_ns;
-
-@@ -914,7 +931,7 @@ struct bfq_group {
- struct bfq_entity entity;
- struct bfq_sched_data sched_data;
-
-- void *bfqd;
-+ struct bfq_data *bfqd;
-
- struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
- struct bfq_queue *async_idle_bfqq;
-@@ -940,8 +957,6 @@ struct bfq_group {
- };
- #endif
-
--struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
--
- /* --------------- main algorithm interface ----------------- */
-
- #define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
-@@ -1036,6 +1051,7 @@ extern struct blkcg_policy blkcg_policy_bfq;
-
- struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq);
- struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
-+struct bfq_group *bfq_entity_to_bfqg(struct bfq_entity *entity);
- unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd);
- struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity);
- struct bfq_entity *bfq_entity_of(struct rb_node *node);
-diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
-index 070e34a7f..7e48ed5b7 100644
---- a/block/bfq-wf2q.c
-+++ b/block/bfq-wf2q.c
-@@ -149,7 +149,7 @@ struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
- if (!group_entity)
- group_entity = &bfqq->bfqd->root_group->entity;
-
-- return container_of(group_entity, struct bfq_group, entity);
-+ return bfq_entity_to_bfqg(group_entity);
- }
-
- /*
-@@ -208,7 +208,7 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
- if (bfq_entity_to_bfqq(entity))
- return true;
-
-- bfqg = container_of(entity, struct bfq_group, entity);
-+ bfqg = bfq_entity_to_bfqg(entity);
-
- /*
- * The field active_entities does not always contain the
-@@ -266,6 +266,15 @@ struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
- return bfqq;
- }
-
-+struct bfq_group *bfq_entity_to_bfqg(struct bfq_entity *entity)
-+{
-+ struct bfq_group *bfqg = NULL;
-+
-+ if (entity->my_sched_data)
-+ bfqg = container_of(entity, struct bfq_group, entity);
-+
-+ return bfqg;
-+}
-
- /**
- * bfq_delta - map service into the virtual time domain.
-@@ -489,7 +498,7 @@ static void bfq_active_insert(struct bfq_service_tree *st,
- #ifdef CONFIG_BFQ_GROUP_IOSCHED
- sd = entity->sched_data;
- bfqg = container_of(sd, struct bfq_group, sched_data);
-- bfqd = (struct bfq_data *)bfqg->bfqd;
-+ bfqd = bfqg->bfqd;
- #endif
- if (bfqq)
- list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
-@@ -518,8 +527,9 @@ unsigned short bfq_ioprio_to_weight(int ioprio)
- */
- static unsigned short bfq_weight_to_ioprio(int weight)
- {
-- return max_t(int, 0,
-- IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight);
-+ int ioprio = IOPRIO_BE_NR - weight / BFQ_WEIGHT_CONVERSION_COEFF;
-+
-+ return ioprio < 0 ? 0 : min_t(int, ioprio, IOPRIO_BE_NR - 1);
- }
-
- static void bfq_get_entity(struct bfq_entity *entity)
-@@ -588,7 +598,7 @@ static void bfq_active_extract(struct bfq_service_tree *st,
- #ifdef CONFIG_BFQ_GROUP_IOSCHED
- sd = entity->sched_data;
- bfqg = container_of(sd, struct bfq_group, sched_data);
-- bfqd = (struct bfq_data *)bfqg->bfqd;
-+ bfqd = bfqg->bfqd;
- #endif
- if (bfqq)
- list_del(&bfqq->bfqq_list);
-@@ -734,7 +744,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
- else {
- sd = entity->my_sched_data;
- bfqg = container_of(sd, struct bfq_group, sched_data);
-- bfqd = (struct bfq_data *)bfqg->bfqd;
-+ bfqd = bfqg->bfqd;
- }
- #endif
-
-@@ -872,7 +882,7 @@ void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
- unsigned long time_ms)
- {
- struct bfq_entity *entity = &bfqq->entity;
-- unsigned long timeout_ms = jiffies_to_msecs(bfq_timeout);
-+ unsigned long timeout_ms = jiffies_to_msecs(bfqd->bfq_timeout);
- unsigned long bounded_time_ms = min(time_ms, timeout_ms);
- int serv_to_charge_for_time =
- (bfqd->bfq_max_budget * bounded_time_ms) / timeout_ms;
-@@ -1001,8 +1011,7 @@ static void __bfq_activate_entity(struct bfq_entity *entity,
-
- #ifdef CONFIG_BFQ_GROUP_IOSCHED
- if (!bfq_entity_to_bfqq(entity)) { /* bfq_group */
-- struct bfq_group *bfqg =
-- container_of(entity, struct bfq_group, entity);
-+ struct bfq_group *bfqg = bfq_entity_to_bfqg(entity);
- struct bfq_data *bfqd = bfqg->bfqd;
-
- if (!entity->in_groups_with_pending_reqs) {
-@@ -1160,6 +1169,7 @@ bool __bfq_deactivate_entity(struct bfq_entity *entity, bool ins_into_idle_tree)
- {
- struct bfq_sched_data *sd = entity->sched_data;
- struct bfq_service_tree *st;
-+ int idx = bfq_class_idx(entity);
- bool is_in_service;
-
- if (!entity->on_st_or_in_serv) /*
-@@ -1199,6 +1209,7 @@ bool __bfq_deactivate_entity(struct bfq_entity *entity, bool ins_into_idle_tree)
- else
- bfq_idle_insert(st, entity);
-
-+ sd->bfq_class_last_service[idx] = jiffies;
- return true;
- }
-
-@@ -1427,6 +1438,45 @@ __bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service)
- return entity;
- }
-
-+static int bfq_select_next_class(struct bfq_sched_data *sd)
-+{
-+ struct bfq_service_tree *st = sd->service_tree;
-+ unsigned long last_check, last_serve;
-+ int i, class_idx, next_class = 0;
-+ bool found = false;
-+
-+ /*
-+ * we needed to guarantee a minimum bandwidth for each class (if
-+ * there is some active entity in this class). This should also
-+ * mitigate priority-inversion problems in case a low priority
-+ * task is holding file system resources.
-+ */
-+ last_check = sd->class_timeout_last_check;
-+ if (time_is_after_jiffies(last_check + BFQ_CLASS_TIMEOUT))
-+ return next_class;
-+
-+ sd->class_timeout_last_check = jiffies;
-+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
-+ class_idx = (sd->next_class_index + i) % BFQ_IOPRIO_CLASSES;
-+ last_serve = sd->bfq_class_last_service[class_idx];
-+
-+ if (time_is_after_jiffies(last_serve + BFQ_CLASS_TIMEOUT))
-+ continue;
-+
-+ if (!RB_EMPTY_ROOT(&(st + class_idx)->active)) {
-+ if (found)
-+ continue;
-+
-+ next_class = class_idx++;
-+ class_idx %= BFQ_IOPRIO_CLASSES;
-+ sd->next_class_index = class_idx;
-+ found = true;
-+ }
-+ sd->bfq_class_last_service[class_idx] = jiffies;
-+ }
-+ return next_class;
-+}
-+
- /**
- * bfq_lookup_next_entity - return the first eligible entity in @sd.
- * @sd: the sched_data.
-@@ -1440,24 +1490,8 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
- bool expiration)
- {
- struct bfq_service_tree *st = sd->service_tree;
-- struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1);
- struct bfq_entity *entity = NULL;
-- int class_idx = 0;
--
-- /*
-- * Choose from idle class, if needed to guarantee a minimum
-- * bandwidth to this class (and if there is some active entity
-- * in idle class). This should also mitigate
-- * priority-inversion problems in case a low priority task is
-- * holding file system resources.
-- */
-- if (time_is_before_jiffies(sd->bfq_class_idle_last_service +
-- BFQ_CL_IDLE_TIMEOUT)) {
-- if (!RB_EMPTY_ROOT(&idle_class_st->active))
-- class_idx = BFQ_IOPRIO_CLASSES - 1;
-- /* About to be served if backlogged, or not yet backlogged */
-- sd->bfq_class_idle_last_service = jiffies;
-- }
-+ int class_idx = bfq_select_next_class(sd);
-
- /*
- * Find the next entity to serve for the highest-priority
-@@ -1706,4 +1740,12 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
-
- if (bfqq->wr_coeff > 1)
- bfqd->wr_busy_queues++;
-+
-+ /* Move bfqq to the head of the woken list of its waker */
-+ if (!hlist_unhashed(&bfqq->woken_list_node) &&
-+ &bfqq->woken_list_node != bfqq->waker_bfqq->woken_list.first) {
-+ hlist_del_init(&bfqq->woken_list_node);
-+ hlist_add_head(&bfqq->woken_list_node,
-+ &bfqq->waker_bfqq->woken_list);
-+ }
- }
-diff --git a/block/blk-merge.c b/block/blk-merge.c
-index 4d97fb6dd..1398b52a2 100644
---- a/block/blk-merge.c
-+++ b/block/blk-merge.c
-@@ -846,18 +846,15 @@ static struct request *attempt_front_merge(struct request_queue *q,
- return NULL;
- }
-
--int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
-- struct request *next)
-+/*
-+ * Try to merge 'next' into 'rq'. Return true if the merge happened, false
-+ * otherwise. The caller is responsible for freeing 'next' if the merge
-+ * happened.
-+ */
-+bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
-+ struct request *next)
- {
-- struct request *free;
--
-- free = attempt_merge(q, rq, next);
-- if (free) {
-- blk_put_request(free);
-- return 1;
-- }
--
-- return 0;
-+ return attempt_merge(q, rq, next);
- }
-
- bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
-diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
-index fdeb9773b..fcc9b5728 100644
---- a/block/blk-mq-sched.c
-+++ b/block/blk-mq-sched.c
-@@ -163,9 +163,19 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
- * in blk_mq_dispatch_rq_list().
- */
- list_add_tail(&rq->queuelist, &rq_list);
-+ count++;
- if (rq->mq_hctx != hctx)
- multi_hctxs = true;
-- } while (++count < max_dispatch);
-+
-+ /*
-+ * If we cannot get tag for the request, stop dequeueing
-+ * requests from the IO scheduler. We are unlikely to be able
-+ * to submit them anyway and it creates false impression for
-+ * scheduling heuristics that the device can take more IO.
-+ */
-+ if (!blk_mq_get_driver_tag(rq))
-+ break;
-+ } while (count < max_dispatch);
-
- if (!count) {
- if (run_queue)
-@@ -380,9 +390,10 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
- return ret;
- }
-
--bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
-+bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
-+ struct list_head *free)
- {
-- return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq);
-+ return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq, free);
- }
- EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
-
-diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
-index 5b18ab915..8b70de4b8 100644
---- a/block/blk-mq-sched.h
-+++ b/block/blk-mq-sched.h
-@@ -11,7 +11,8 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
- unsigned int nr_segs, struct request **merged_request);
- bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
- unsigned int nr_segs);
--bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
-+bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
-+ struct list_head *free);
- void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx);
- void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
-
-diff --git a/block/blk-mq.c b/block/blk-mq.c
-index 0e120547c..cadd12d68 100644
---- a/block/blk-mq.c
-+++ b/block/blk-mq.c
-@@ -361,11 +361,12 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
-
- if (e) {
- /*
-- * Flush requests are special and go directly to the
-+ * Flush/passthrough requests are special and go directly to the
- * dispatch list. Don't include reserved tags in the
- * limiting, as it isn't useful.
- */
- if (!op_is_flush(data->cmd_flags) &&
-+ !blk_op_is_passthrough(data->cmd_flags) &&
- e->type->ops.limit_depth &&
- !(data->flags & BLK_MQ_REQ_RESERVED))
- e->type->ops.limit_depth(data->cmd_flags, data);
-@@ -1099,7 +1100,7 @@ static bool __blk_mq_get_driver_tag(struct request *rq)
- return true;
- }
-
--static bool blk_mq_get_driver_tag(struct request *rq)
-+bool blk_mq_get_driver_tag(struct request *rq)
- {
- struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
-
-diff --git a/block/blk-mq.h b/block/blk-mq.h
-index 3616453ca..d9ef3e4f3 100644
---- a/block/blk-mq.h
-+++ b/block/blk-mq.h
-@@ -242,6 +242,8 @@ static inline void blk_mq_put_driver_tag(struct request *rq)
- __blk_mq_put_driver_tag(rq->mq_hctx, rq);
- }
-
-+bool blk_mq_get_driver_tag(struct request *rq);
-+
- static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
- {
- int cpu;
-@@ -282,6 +284,17 @@ static inline struct blk_plug *blk_mq_plug(struct request_queue *q,
- return NULL;
- }
-
-+/* Free all requests on the list */
-+static inline void blk_mq_free_requests(struct list_head *list)
-+{
-+ while (!list_empty(list)) {
-+ struct request *rq = list_entry_rq(list->next);
-+
-+ list_del_init(&rq->queuelist);
-+ blk_mq_free_request(rq);
-+ }
-+}
-+
- /*
- * For shared tag users, we track the number of currently active users
- * and attempt to provide a fair share of the tag depth for each of them.
-diff --git a/block/blk.h b/block/blk.h
-index 3b53e44b9..52ff5d4a3 100644
---- a/block/blk.h
-+++ b/block/blk.h
-@@ -224,7 +224,7 @@ ssize_t part_timeout_store(struct device *, struct device_attribute *,
- void __blk_queue_split(struct bio **bio, unsigned int *nr_segs);
- int ll_back_merge_fn(struct request *req, struct bio *bio,
- unsigned int nr_segs);
--int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
-+bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
- struct request *next);
- unsigned int blk_recalc_rq_segments(struct request *rq);
- void blk_rq_set_mixed_merge(struct request *rq);
-diff --git a/block/elevator.c b/block/elevator.c
-index 293c5c813..151b30911 100644
---- a/block/elevator.c
-+++ b/block/elevator.c
-@@ -203,7 +203,7 @@ static inline void __elv_rqhash_del(struct request *rq)
- rq->rq_flags &= ~RQF_HASHED;
- }
-
--void elv_rqhash_del(struct request_queue *q, struct request *rq)
-+void elv_rqhash_del(struct request *rq)
- {
- if (ELV_ON_HASH(rq))
- __elv_rqhash_del(rq);
-@@ -350,9 +350,11 @@ enum elv_merge elv_merge(struct request_queue *q, struct request **req,
- * we can append 'rq' to an existing request, so we can throw 'rq' away
- * afterwards.
- *
-- * Returns true if we merged, false otherwise
-+ * Returns true if we merged, false otherwise. 'free' will contain all
-+ * requests that need to be freed.
- */
--bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq)
-+bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq,
-+ struct list_head *free)
- {
- struct request *__rq;
- bool ret;
-@@ -363,8 +365,10 @@ bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq)
- /*
- * First try one-hit cache.
- */
-- if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq))
-+ if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq)) {
-+ list_add(&rq->queuelist, free);
- return true;
-+ }
-
- if (blk_queue_noxmerges(q))
- return false;
-@@ -378,6 +382,7 @@ bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq)
- if (!__rq || !blk_attempt_req_merge(q, __rq, rq))
- break;
-
-+ list_add(&rq->queuelist, free);
- /* The merged request could be merged with others, try again */
- ret = true;
- rq = __rq;
-@@ -417,7 +422,7 @@ struct request *elv_latter_request(struct request_queue *q, struct request *rq)
- struct elevator_queue *e = q->elevator;
-
- if (e->type->ops.next_request)
-- return e->type->ops.next_request(q, rq);
-+ return e->type->ops.next_request(rq);
-
- return NULL;
- }
-@@ -427,7 +432,7 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq)
- struct elevator_queue *e = q->elevator;
-
- if (e->type->ops.former_request)
-- return e->type->ops.former_request(q, rq);
-+ return e->type->ops.former_request(rq);
-
- return NULL;
- }
-@@ -616,15 +621,15 @@ static inline bool elv_support_iosched(struct request_queue *q)
- }
-
- /*
-- * For single queue devices, default to using mq-deadline. If we have multiple
-- * queues or mq-deadline is not available, default to "none".
-+ * For single queue devices, default to using bfq. If we have multiple
-+ * queues or bfq is not available, default to "none".
- */
- static struct elevator_type *elevator_get_default(struct request_queue *q)
- {
- if (q->nr_hw_queues != 1)
- return NULL;
-
-- return elevator_get(q, "mq-deadline", false);
-+ return elevator_get(q, "bfq", false);
- }
-
- /*
-@@ -802,8 +807,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name)
- return len;
- }
-
--struct request *elv_rb_former_request(struct request_queue *q,
-- struct request *rq)
-+struct request *elv_rb_former_request(struct request *rq)
- {
- struct rb_node *rbprev = rb_prev(&rq->rb_node);
-
-@@ -814,8 +818,7 @@ struct request *elv_rb_former_request(struct request_queue *q,
- }
- EXPORT_SYMBOL(elv_rb_former_request);
-
--struct request *elv_rb_latter_request(struct request_queue *q,
-- struct request *rq)
-+struct request *elv_rb_latter_request(struct request *rq)
- {
- struct rb_node *rbnext = rb_next(&rq->rb_node);
-
-diff --git a/block/mq-deadline.c b/block/mq-deadline.c
-index 3aabcd2a7..59178b7f5 100644
---- a/block/mq-deadline.c
-+++ b/block/mq-deadline.c
-@@ -120,7 +120,7 @@ static void deadline_remove_request(struct request_queue *q, struct request *rq)
- if (!RB_EMPTY_NODE(&rq->rb_node))
- deadline_del_rq_rb(dd, rq);
-
-- elv_rqhash_del(q, rq);
-+ elv_rqhash_del(rq);
- if (q->last_merge == rq)
- q->last_merge = NULL;
- }
-@@ -487,6 +487,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
- struct request_queue *q = hctx->queue;
- struct deadline_data *dd = q->elevator->elevator_data;
- const int data_dir = rq_data_dir(rq);
-+ LIST_HEAD(free);
-
- /*
- * This may be a requeue of a write request that has locked its
-@@ -494,16 +495,15 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
- */
- blk_req_zone_write_unlock(rq);
-
-- if (blk_mq_sched_try_insert_merge(q, rq))
-+ if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
-+ blk_mq_free_requests(&free);
- return;
-+ }
-
- trace_block_rq_insert(rq);
-
-- if (at_head || blk_rq_is_passthrough(rq)) {
-- if (at_head)
-- list_add(&rq->queuelist, &dd->dispatch);
-- else
-- list_add_tail(&rq->queuelist, &dd->dispatch);
-+ if (at_head) {
-+ list_add(&rq->queuelist, &dd->dispatch);
- } else {
- deadline_add_rq_rb(dd, rq);
-
-diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
-index 158aefae1..0d81eed39 100644
---- a/include/linux/blkdev.h
-+++ b/include/linux/blkdev.h
-@@ -272,6 +272,12 @@ static inline bool bio_is_passthrough(struct bio *bio)
- return blk_op_is_scsi(op) || blk_op_is_private(op);
- }
-
-+static inline bool blk_op_is_passthrough(unsigned int op)
-+{
-+ return (blk_op_is_scsi(op & REQ_OP_MASK) ||
-+ blk_op_is_private(op & REQ_OP_MASK));
-+}
-+
- static inline unsigned short req_get_ioprio(struct request *req)
- {
- return req->ioprio;
-diff --git a/include/linux/elevator.h b/include/linux/elevator.h
-index dcb2f9022..fffc6218a 100644
---- a/include/linux/elevator.h
-+++ b/include/linux/elevator.h
-@@ -46,8 +46,8 @@ struct elevator_mq_ops {
- bool (*has_work)(struct blk_mq_hw_ctx *);
- void (*completed_request)(struct request *, u64);
- void (*requeue_request)(struct request *);
-- struct request *(*former_request)(struct request_queue *, struct request *);
-- struct request *(*next_request)(struct request_queue *, struct request *);
-+ struct request *(*former_request)(struct request *);
-+ struct request *(*next_request)(struct request *);
- void (*init_icq)(struct io_cq *);
- void (*exit_icq)(struct io_cq *);
- };
-@@ -90,7 +90,7 @@ struct elevator_type
-
- #define ELV_HASH_BITS 6
-
--void elv_rqhash_del(struct request_queue *q, struct request *rq);
-+void elv_rqhash_del(struct request *rq);
- void elv_rqhash_add(struct request_queue *q, struct request *rq);
- void elv_rqhash_reposition(struct request_queue *q, struct request *rq);
- struct request *elv_rqhash_find(struct request_queue *q, sector_t offset);
-@@ -117,7 +117,8 @@ extern void elv_merge_requests(struct request_queue *, struct request *,
- struct request *);
- extern void elv_merged_request(struct request_queue *, struct request *,
- enum elv_merge);
--extern bool elv_attempt_insert_merge(struct request_queue *, struct request *);
-+extern bool elv_attempt_insert_merge(struct request_queue *, struct request *,
-+ struct list_head *);
- extern struct request *elv_former_request(struct request_queue *, struct request *);
- extern struct request *elv_latter_request(struct request_queue *, struct request *);
-
-@@ -140,8 +141,8 @@ extern struct elevator_queue *elevator_alloc(struct request_queue *,
- /*
- * Helper functions.
- */
--extern struct request *elv_rb_former_request(struct request_queue *, struct request *);
--extern struct request *elv_rb_latter_request(struct request_queue *, struct request *);
-+extern struct request *elv_rb_former_request(struct request *);
-+extern struct request *elv_rb_latter_request(struct request *);
-
- /*
- * rb support functions.
diff --git a/0003-x86-setup-Merge-several-reservations-of-start-of-mem.patch b/0003-x86-setup-Merge-several-reservations-of-start-of-mem.patch
new file mode 100644
index 000000000000..eca80260ba10
--- /dev/null
+++ b/0003-x86-setup-Merge-several-reservations-of-start-of-mem.patch
@@ -0,0 +1,67 @@
+From e63cb4a867fe803dc90376af8b268ba1549ec36e Mon Sep 17 00:00:00 2001
+From: Mike Rapoport <rppt@linux.ibm.com>
+Date: Tue, 2 Mar 2021 12:04:06 +0200
+Subject: [PATCH 3/8] x86/setup: Merge several reservations of start of memory
+
+Currently, the first several pages are reserved both to avoid leaking
+their contents on systems with L1TF and to avoid corrupting BIOS memory.
+
+Merge the two memory reservations.
+
+Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Reviewed-by: David Hildenbrand <david@redhat.com>
+Acked-by: Borislav Petkov <bp@suse.de>
+Link: https://lkml.kernel.org/r/20210302100406.22059-3-rppt@kernel.org
+---
+ arch/x86/kernel/setup.c | 19 ++++++++++---------
+ 1 file changed, 10 insertions(+), 9 deletions(-)
+
+diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
+index 420d881da2bd..282d572e49af 100644
+--- a/arch/x86/kernel/setup.c
++++ b/arch/x86/kernel/setup.c
+@@ -714,11 +714,6 @@ static int __init parse_reservelow(char *p)
+
+ early_param("reservelow", parse_reservelow);
+
+-static void __init trim_low_memory_range(void)
+-{
+- memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE));
+-}
+-
+ static void __init early_reserve_memory(void)
+ {
+ /*
+@@ -731,10 +726,17 @@ static void __init early_reserve_memory(void)
+ (unsigned long)__end_of_kernel_reserve - (unsigned long)_text);
+
+ /*
+- * Make sure page 0 is always reserved because on systems with
+- * L1TF its contents can be leaked to user processes.
++ * The first 4Kb of memory is a BIOS owned area, but generally it is
++ * not listed as such in the E820 table.
++ *
++ * Reserve the first memory page and typically some additional
++ * memory (64KiB by default) since some BIOSes are known to corrupt
++ * low memory. See the Kconfig help text for X86_RESERVE_LOW.
++ *
++ * In addition, make sure page 0 is always reserved because on
++ * systems with L1TF its contents can be leaked to user processes.
+ */
+- memblock_reserve(0, PAGE_SIZE);
++ memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE));
+
+ early_reserve_initrd();
+
+@@ -747,7 +749,6 @@ static void __init early_reserve_memory(void)
+ reserve_bios_regions();
+
+ trim_snb_memory();
+- trim_low_memory_range();
+ }
+
+ /*
+--
+2.32.0
+
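
The merged reservation in the hunk above works because ALIGN() rounds the reservelow value up to a whole page, so memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE)) always covers at least page 0 and the separate trim_low_memory_range() call becomes redundant. As a rough illustration only — a minimal userspace sketch, not part of the patch, with PAGE_SIZE and ALIGN() redefined locally and made-up sample values:

#include <stdio.h>

#define PAGE_SIZE 4096UL
/* ALIGN() as in the kernel: round x up to a multiple of a (a power of two). */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	/* reservelow= accepts 4K..640K; 64K is the build-time default. */
	unsigned long samples[] = { 4096, 64 * 1024, 100 * 1024 + 1, 640 * 1024 };

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("reserve_low=%6lu -> memblock_reserve(0, %lu)\n",
		       samples[i], ALIGN(samples[i], PAGE_SIZE));
	return 0;
}
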
diff --git a/0004-x86-setup-Move-trim_snb_memory-later-in-setup_arch-t.patch b/0004-x86-setup-Move-trim_snb_memory-later-in-setup_arch-t.patch
new file mode 100644
index 000000000000..8a8e4d194cc6
--- /dev/null
+++ b/0004-x86-setup-Move-trim_snb_memory-later-in-setup_arch-t.patch
@@ -0,0 +1,87 @@
+From c4b5e4bc8317ccb0a822429d87288d9f90453a04 Mon Sep 17 00:00:00 2001
+From: Mike Rapoport <rppt@linux.ibm.com>
+Date: Tue, 13 Apr 2021 21:08:39 +0300
+Subject: [PATCH 4/8] x86/setup: Move trim_snb_memory() later in setup_arch()
+ to fix boot hangs
+
+Commit
+
+ a799c2bd29d1 ("x86/setup: Consolidate early memory reservations")
+
+moved reservation of the memory inaccessible by Sandy Bridge integrated
+graphics very early, and, as a result, on systems with such devices
+the first 1M was reserved by trim_snb_memory() which prevented the
+allocation of the real mode trampoline and made the boot hang very
+early.
+
+Since the purpose of trim_snb_memory() is to prevent problematic pages
+ever reaching the graphics device, it is safe to reserve these pages
+after memblock allocations are possible.
+
+Move trim_snb_memory() later in boot so that it will be called after
+reserve_real_mode() and make comments describing trim_snb_memory()
+operation more elaborate.
+
+ [ bp: Massage a bit. ]
+
+Fixes: a799c2bd29d1 ("x86/setup: Consolidate early memory reservations")
+Reported-by: Randy Dunlap <rdunlap@infradead.org>
+Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Tested-by: Randy Dunlap <rdunlap@infradead.org>
+Tested-by: Hugh Dickins <hughd@google.com>
+Link: https://lkml.kernel.org/r/f67d3e03-af90-f790-baf4-8d412fe055af@infradead.org
+---
+ arch/x86/kernel/setup.c | 20 +++++++++++++++-----
+ 1 file changed, 15 insertions(+), 5 deletions(-)
+
+diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
+index 282d572e49af..7d466f51be1f 100644
+--- a/arch/x86/kernel/setup.c
++++ b/arch/x86/kernel/setup.c
+@@ -634,11 +634,16 @@ static void __init trim_snb_memory(void)
+ printk(KERN_DEBUG "reserving inaccessible SNB gfx pages\n");
+
+ /*
+- * Reserve all memory below the 1 MB mark that has not
+- * already been reserved.
++ * SandyBridge integrated graphics devices have a bug that prevents
++ * them from accessing certain memory ranges, namely anything below
++ * 1M and in the pages listed in bad_pages[] above.
++ *
++ * To avoid these pages being ever accessed by SNB gfx devices
++ * reserve all memory below the 1 MB mark and bad_pages that have
++ * not already been reserved at boot time.
+ */
+ memblock_reserve(0, 1<<20);
+-
++
+ for (i = 0; i < ARRAY_SIZE(bad_pages); i++) {
+ if (memblock_reserve(bad_pages[i], PAGE_SIZE))
+ printk(KERN_WARNING "failed to reserve 0x%08lx\n",
+@@ -747,8 +752,6 @@ static void __init early_reserve_memory(void)
+
+ reserve_ibft_region();
+ reserve_bios_regions();
+-
+- trim_snb_memory();
+ }
+
+ /*
+@@ -1082,6 +1085,13 @@ void __init setup_arch(char **cmdline_p)
+
+ reserve_real_mode();
+
++ /*
++ * Reserving memory causing GPU hangs on Sandy Bridge integrated
++ * graphics devices should be done after we allocated memory under
++ * 1M for the real mode trampoline.
++ */
++ trim_snb_memory();
++
+ init_mem_mapping();
+
+ idt_setup_early_pf();
+--
+2.32.0
+
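
The ordering problem described in the patch above can be seen with a toy model of the low 1M: if everything under 1M is reserved before the real-mode trampoline is placed, memblock_find_in_range() has nowhere left to put it, whereas placing the trampoline first and reserving the rest afterwards still succeeds. A hedged userspace sketch, not kernel code — the page-granular bitmap, the 16-page trampoline size, and the 64K early reservation are made-up stand-ins:

#include <stdio.h>
#include <stdbool.h>
#include <string.h>

#define LOW_PAGES 256                     /* 1 MiB of 4 KiB pages */
static bool reserved[LOW_PAGES];

/* Mark [page, page + pages) as reserved. */
static void reserve(int page, int pages)
{
	for (int i = page; i < page + pages && i < LOW_PAGES; i++)
		reserved[i] = true;
}

/* Find a free run of 'pages' pages under 1 MiB, -1 if none; a rough
 * stand-in for memblock_find_in_range(0, 1<<20, ...). */
static int find_free(int pages)
{
	for (int start = 0; start + pages <= LOW_PAGES; start++) {
		bool ok = true;
		for (int i = start; i < start + pages; i++)
			if (reserved[i]) { ok = false; break; }
		if (ok)
			return start;
	}
	return -1;
}

int main(void)
{
	/* Order before the fix: trim_snb_memory() reserves all of 0-1M very
	 * early, so the real-mode trampoline cannot be placed (-1). */
	reserve(0, LOW_PAGES);
	printf("reserve low 1M first  : trampoline page = %d\n", find_free(16));

	/* Order after the fix: place the trampoline first, trim later. */
	memset(reserved, 0, sizeof(reserved));
	reserve(0, 16);                       /* early 64K reservation */
	int tramp = find_free(16);            /* reserve_real_mode()   */
	if (tramp >= 0)
		reserve(tramp, 16);
	reserve(0, LOW_PAGES);                /* trim_snb_memory() now */
	printf("place trampoline first: trampoline page = %d\n", tramp);
	return 0;
}
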
diff --git a/0005-x86-setup-always-reserve-the-first-1M-of-RAM.patch b/0005-x86-setup-always-reserve-the-first-1M-of-RAM.patch
new file mode 100644
index 000000000000..169ba22ae2de
--- /dev/null
+++ b/0005-x86-setup-always-reserve-the-first-1M-of-RAM.patch
@@ -0,0 +1,170 @@
+From 3ffe8ae29143ee20e01b0bc4a63774182b59daf9 Mon Sep 17 00:00:00 2001
+From: Mike Rapoport <rppt@linux.ibm.com>
+Date: Tue, 1 Jun 2021 10:53:52 +0300
+Subject: [PATCH 5/8] x86/setup: always reserve the first 1M of RAM
+
+There are BIOSes that are known to corrupt the memory under 1M, or more
+precisely under 640K because the memory above 640K is anyway reserved for
+the EGA/VGA frame buffer and BIOS.
+
+To prevent the kernel from using memory that may be clobbered this way, the
+beginning of the memory is always reserved. The exact size of
+the reserved area is determined by CONFIG_X86_RESERVE_LOW build time and
+reservelow command line option. The reserved range may be from 4K to 640K
+with the default of 64K. There are also configurations that reserve the
+entire 1M range, like machines with SandyBridge graphic devices or systems
+that enable crash kernel.
+
+In addition to the potentially clobbered memory, EBDA of unknown size may
+be as low as 128K and the memory above that EBDA start is also reserved
+early.
+
+It would have been possible to reserve the entire range under 1M were it not
+for the real mode trampoline that must reside in that area.
+
+To accommodate placement of the real mode trampoline and keep the memory
+safe from being clobbered by BIOS, reserve the first 64K of RAM before
+memory allocations are possible and then, after the real mode trampoline is
+allocated, reserve the entire range from 0 to 1M.
+
+Update trim_snb_memory() and reserve_real_mode() to avoid redundant
+reservations of the same memory range.
+
+Also make sure the memory under 1M is not getting freed by
+efi_free_boot_services().
+
+Fixes: a799c2bd29d1 ("x86/setup: Consolidate early memory reservations")
+Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
+---
+ arch/x86/kernel/setup.c | 35 ++++++++++++++++++++--------------
+ arch/x86/platform/efi/quirks.c | 12 ++++++++++++
+ arch/x86/realmode/init.c | 14 ++++++++------
+ 3 files changed, 41 insertions(+), 20 deletions(-)
+
+diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
+index 7d466f51be1f..d7cfb927864f 100644
+--- a/arch/x86/kernel/setup.c
++++ b/arch/x86/kernel/setup.c
+@@ -638,11 +638,11 @@ static void __init trim_snb_memory(void)
+ * them from accessing certain memory ranges, namely anything below
+ * 1M and in the pages listed in bad_pages[] above.
+ *
+- * To avoid these pages being ever accessed by SNB gfx devices
+- * reserve all memory below the 1 MB mark and bad_pages that have
+- * not already been reserved at boot time.
++ * To avoid these pages being ever accessed by SNB gfx devices reserve
++ * bad_pages that have not already been reserved at boot time.
++ * All memory below the 1 MB mark is anyway reserved later during
++ * setup_arch(), so there is no need to reserve it here.
+ */
+- memblock_reserve(0, 1<<20);
+
+ for (i = 0; i < ARRAY_SIZE(bad_pages); i++) {
+ if (memblock_reserve(bad_pages[i], PAGE_SIZE))
+@@ -734,14 +734,14 @@ static void __init early_reserve_memory(void)
+ * The first 4Kb of memory is a BIOS owned area, but generally it is
+ * not listed as such in the E820 table.
+ *
+- * Reserve the first memory page and typically some additional
+- * memory (64KiB by default) since some BIOSes are known to corrupt
+- * low memory. See the Kconfig help text for X86_RESERVE_LOW.
++ * Reserve the first 64K of memory since some BIOSes are known to
++ * corrupt low memory. After the real mode trampoline is allocated the
++ * rest of the memory below 640k is reserved.
+ *
+ * In addition, make sure page 0 is always reserved because on
+ * systems with L1TF its contents can be leaked to user processes.
+ */
+- memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE));
++ memblock_reserve(0, SZ_64K);
+
+ early_reserve_initrd();
+
+@@ -752,6 +752,7 @@ static void __init early_reserve_memory(void)
+
+ reserve_ibft_region();
+ reserve_bios_regions();
++ trim_snb_memory();
+ }
+
+ /*
+@@ -1083,14 +1084,20 @@ void __init setup_arch(char **cmdline_p)
+ (max_pfn_mapped<<PAGE_SHIFT) - 1);
+ #endif
+
+- reserve_real_mode();
+-
+ /*
+- * Reserving memory causing GPU hangs on Sandy Bridge integrated
+- * graphics devices should be done after we allocated memory under
+- * 1M for the real mode trampoline.
++ * Find free memory for the real mode trampoline and place it
++ * there.
++ * If there is not enough free memory under 1M, on EFI-enabled
++	 * systems there will be an additional attempt to reclaim the memory
++ * for the real mode trampoline at efi_free_boot_services().
++ *
++ * Unconditionally reserve the entire first 1M of RAM because
++	 * BIOSes are known to corrupt low memory and several
++	 * hundred kilobytes are not worth complex detection of what memory gets
++ * clobbered. Moreover, on machines with SandyBridge graphics or in
++ * setups that use crashkernel the entire 1M is anyway reserved.
+ */
+- trim_snb_memory();
++ reserve_real_mode();
+
+ init_mem_mapping();
+
+diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
+index 67d93a243c35..27561b56a821 100644
+--- a/arch/x86/platform/efi/quirks.c
++++ b/arch/x86/platform/efi/quirks.c
+@@ -450,6 +450,18 @@ void __init efi_free_boot_services(void)
+ size -= rm_size;
+ }
+
++ /*
++ * Don't free memory under 1M for two reasons:
++ * - BIOS might clobber it
++ * - Crash kernel needs it to be reserved
++ */
++ if (start + size < SZ_1M)
++ continue;
++ if (start < SZ_1M) {
++ size -= (SZ_1M - start);
++ start = SZ_1M;
++ }
++
+ memblock_free_late(start, size);
+ }
+
+diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
+index 22fda7d99159..ea42630d4e2e 100644
+--- a/arch/x86/realmode/init.c
++++ b/arch/x86/realmode/init.c
+@@ -29,14 +29,16 @@ void __init reserve_real_mode(void)
+
+ /* Has to be under 1M so we can execute real-mode AP code. */
+ mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
+- if (!mem) {
++ if (!mem)
+ pr_info("No sub-1M memory is available for the trampoline\n");
+- return;
+- }
++ else
++ set_real_mode_mem(mem);
+
+- memblock_reserve(mem, size);
+- set_real_mode_mem(mem);
+- crash_reserve_low_1M();
++ /*
++	 * Unconditionally reserve the entire first 1M, see comment in
++ * setup_arch()
++ */
++ memblock_reserve(0, SZ_1M);
+ }
+
+ static void sme_sev_setup_real_mode(struct trampoline_header *th)
+--
+2.32.0
+
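
The clipping arithmetic added to efi_free_boot_services() in the patch above — skip regions that end below 1M, trim regions that straddle the 1M boundary so only the part above 1M is freed — can be checked in isolation. A small userspace sketch, not the kernel function itself; free_boot_services_region() and the sample regions are invented for the illustration, while SZ_1M and the two if-checks mirror the hunk:

#include <stdio.h>

#define SZ_1M (1024UL * 1024UL)

/* Regions entirely below 1M stay reserved; regions straddling 1M are
 * trimmed so freeing starts at the 1M mark. */
static void free_boot_services_region(unsigned long start, unsigned long size)
{
	if (start + size < SZ_1M) {
		printf("[%#10lx, %#10lx) kept reserved (entirely under 1M)\n",
		       start, start + size);
		return;
	}
	if (start < SZ_1M) {
		size -= (SZ_1M - start);
		start = SZ_1M;
	}
	printf("[%#10lx, %#10lx) would be freed\n", start, start + size);
}

int main(void)
{
	free_boot_services_region(0x00000, 0x80000);    /* fully under 1M */
	free_boot_services_region(0xc0000, 0x80000);    /* straddles 1M   */
	free_boot_services_region(0x200000, 0x100000);  /* fully above 1M */
	return 0;
}
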
diff --git a/0006-x86-setup-remove-CONFIG_X86_RESERVE_LOW-and-reservel.patch b/0006-x86-setup-remove-CONFIG_X86_RESERVE_LOW-and-reservel.patch
new file mode 100644
index 000000000000..a49d92c2252b
--- /dev/null
+++ b/0006-x86-setup-remove-CONFIG_X86_RESERVE_LOW-and-reservel.patch
@@ -0,0 +1,114 @@
+From 2e68d15d0a146e9b13bfbaba5f260c82b8c3d049 Mon Sep 17 00:00:00 2001
+From: Mike Rapoport <rppt@linux.ibm.com>
+Date: Tue, 1 Jun 2021 10:53:53 +0300
+Subject: [PATCH 6/8] x86/setup: remove CONFIG_X86_RESERVE_LOW and reservelow
+ options
+
+The CONFIG_X86_RESERVE_LOW build time option and the reservelow command line
+option made it possible to control the amount of memory under 1M that would
+be reserved at boot to avoid using memory that can be potentially clobbered
+by the BIOS.
+
+Since the entire range under 1M is always reserved there is no need for
+these options and they can be removed.
+
+Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
+---
+ .../admin-guide/kernel-parameters.txt | 5 ----
+ arch/x86/Kconfig | 29 -------------------
+ arch/x86/kernel/setup.c | 24 ---------------
+ 3 files changed, 58 deletions(-)
+
+diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
+index 835f810f2f26..479cc44cc4e2 100644
+--- a/Documentation/admin-guide/kernel-parameters.txt
++++ b/Documentation/admin-guide/kernel-parameters.txt
+@@ -4623,11 +4623,6 @@
+ Reserves a hole at the top of the kernel virtual
+ address space.
+
+- reservelow= [X86]
+- Format: nn[K]
+- Set the amount of memory to reserve for BIOS at
+- the bottom of the address space.
+-
+ reset_devices [KNL] Force drivers to reset the underlying device
+ during initialization.
+
+diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
+index 861b1b794697..fc91be3b1bd1 100644
+--- a/arch/x86/Kconfig
++++ b/arch/x86/Kconfig
+@@ -1688,35 +1688,6 @@ config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
+ Set whether the default state of memory_corruption_check is
+ on or off.
+
+-config X86_RESERVE_LOW
+- int "Amount of low memory, in kilobytes, to reserve for the BIOS"
+- default 64
+- range 4 640
+- help
+- Specify the amount of low memory to reserve for the BIOS.
+-
+- The first page contains BIOS data structures that the kernel
+- must not use, so that page must always be reserved.
+-
+- By default we reserve the first 64K of physical RAM, as a
+- number of BIOSes are known to corrupt that memory range
+- during events such as suspend/resume or monitor cable
+- insertion, so it must not be used by the kernel.
+-
+- You can set this to 4 if you are absolutely sure that you
+- trust the BIOS to get all its memory reservations and usages
+- right. If you know your BIOS have problems beyond the
+- default 64K area, you can set this to 640 to avoid using the
+- entire low memory range.
+-
+- If you have doubts about the BIOS (e.g. suspend/resume does
+- not work or there's kernel crashes after certain hardware
+- hotplug events) then you might want to enable
+- X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check
+- typical corruption patterns.
+-
+- Leave this to the default value of 64 if you are unsure.
+-
+ config MATH_EMULATION
+ bool
+ depends on MODIFY_LDT_SYSCALL
+diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
+index d7cfb927864f..fbda4bbf75c1 100644
+--- a/arch/x86/kernel/setup.c
++++ b/arch/x86/kernel/setup.c
+@@ -695,30 +695,6 @@ static void __init e820_add_kernel_range(void)
+ e820__range_add(start, size, E820_TYPE_RAM);
+ }
+
+-static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
+-
+-static int __init parse_reservelow(char *p)
+-{
+- unsigned long long size;
+-
+- if (!p)
+- return -EINVAL;
+-
+- size = memparse(p, &p);
+-
+- if (size < 4096)
+- size = 4096;
+-
+- if (size > 640*1024)
+- size = 640*1024;
+-
+- reserve_low = size;
+-
+- return 0;
+-}
+-
+-early_param("reservelow", parse_reservelow);
+-
+ static void __init early_reserve_memory(void)
+ {
+ /*
+--
+2.32.0
+
diff --git a/0007-x86-crash-remove-crash_reserve_low_1M.patch b/0007-x86-crash-remove-crash_reserve_low_1M.patch
new file mode 100644
index 000000000000..903e5fa0969a
--- /dev/null
+++ b/0007-x86-crash-remove-crash_reserve_low_1M.patch
@@ -0,0 +1,58 @@
+From bb4c1200fdfd6c17fff64e159e625c3678342b87 Mon Sep 17 00:00:00 2001
+From: Mike Rapoport <rppt@linux.ibm.com>
+Date: Tue, 1 Jun 2021 10:53:54 +0300
+Subject: [PATCH 7/8] x86/crash: remove crash_reserve_low_1M()
+
+The entire memory range under 1M is unconditionally reserved at
+setup_arch(), so there is no need for crash_reserve_low_1M() anymore.
+
+Remove this function.
+
+Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
+---
+ arch/x86/include/asm/crash.h | 6 ------
+ arch/x86/kernel/crash.c | 13 -------------
+ 2 files changed, 19 deletions(-)
+
+diff --git a/arch/x86/include/asm/crash.h b/arch/x86/include/asm/crash.h
+index f58de66091e5..8b6bd63530dc 100644
+--- a/arch/x86/include/asm/crash.h
++++ b/arch/x86/include/asm/crash.h
+@@ -9,10 +9,4 @@ int crash_setup_memmap_entries(struct kimage *image,
+ struct boot_params *params);
+ void crash_smp_send_stop(void);
+
+-#ifdef CONFIG_KEXEC_CORE
+-void __init crash_reserve_low_1M(void);
+-#else
+-static inline void __init crash_reserve_low_1M(void) { }
+-#endif
+-
+ #endif /* _ASM_X86_CRASH_H */
+diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
+index b1deacbeb266..e0b8d9662da5 100644
+--- a/arch/x86/kernel/crash.c
++++ b/arch/x86/kernel/crash.c
+@@ -70,19 +70,6 @@ static inline void cpu_crash_vmclear_loaded_vmcss(void)
+ rcu_read_unlock();
+ }
+
+-/*
+- * When the crashkernel option is specified, only use the low
+- * 1M for the real mode trampoline.
+- */
+-void __init crash_reserve_low_1M(void)
+-{
+- if (cmdline_find_option(boot_command_line, "crashkernel", NULL, 0) < 0)
+- return;
+-
+- memblock_reserve(0, 1<<20);
+- pr_info("Reserving the low 1M of memory for crashkernel\n");
+-}
+-
+ #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
+
+ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
+--
+2.32.0
+
diff --git a/0002-UKSM.patch b/0008-UKSM.patch
index 3321eaa8ee58..3321eaa8ee58 100644
--- a/0002-UKSM.patch
+++ b/0008-UKSM.patch
diff --git a/PKGBUILD b/PKGBUILD
index e75e76e38a5e..11887d91837b 100644
--- a/PKGBUILD
+++ b/PKGBUILD
@@ -68,7 +68,7 @@ _subarch=36
### IMPORTANT: Do no edit below this line unless you know what you're doing
pkgbase=linux-ck-uksm
-pkgver=5.12.10
+pkgver=5.12.12
pkgrel=1
_ckpatchversion=1
_ckpatch="patch-5.12-ck${_ckpatchversion}"
@@ -87,21 +87,31 @@ source=(
"more-uarches-$_gcc_more_v.tar.gz::https://github.com/graysky2/kernel_compiler_patch/archive/$_gcc_more_v.tar.gz"
"http://ck.kolivas.org/patches/5.0/5.12/5.12-ck${_ckpatchversion}/$_ckpatch.xz"
0001-ZEN-Add-sysctl-and-CONFIG-to-disallow-unprivileged-C.patch
- 0002-UKSM.patch
- 0003-bfq-lucjan-r2K210602.patch
+ 0002-x86-setup-Consolidate-early-memory-reservations.patch
+ 0003-x86-setup-Merge-several-reservations-of-start-of-mem.patch
+ 0004-x86-setup-Move-trim_snb_memory-later-in-setup_arch-t.patch
+ 0005-x86-setup-always-reserve-the-first-1M-of-RAM.patch
+ 0006-x86-setup-remove-CONFIG_X86_RESERVE_LOW-and-reservel.patch
+ 0007-x86-crash-remove-crash_reserve_low_1M.patch
+ 0008-UKSM.patch
)
validpgpkeys=(
'ABAF11C65A2970B130ABE3C479BE3E4300411886' # Linus Torvalds
'647F28654894E3BD457199BE38DBBDC86092693E' # Greg Kroah-Hartman
)
-b2sums=('b40ef5a11ca435299899e8131fa72af147455cd8ebee4c0e187572b1f628e66d2b6fbb318308bc911a598d8303d1ab3622d52966deaa5c48d59dcd65f4f58687'
+b2sums=('f9aef3da2f65916cc30da9a066217d3379036e6a32a732224da7fee86c80810315484f48132b50b8cf8eb5e0b055ad1b7bbe63dadd0eb54b0b0626bc57c20963'
'SKIP'
'SKIP'
'30d1df754608bb423cbc99c2097ad521baa091b9a3b39df4bd5c2d50c57eec54d8fa0e4a4a04b847c3d1b87ba682cadc8db45fabeefdc9ad7caaf8e77b96e41a'
'c9f729ba1efe6f04e7b2c57d3999bc9675b577596dccb2f227e5b6e444285e1fdd270bf67c0fcf9f5808a4c3a4b1c7a5c13a76f754ad9b9447243ccbaf2ce6a3'
- 'e1eccb5b6b728e3852ade55dae7a53b8b6bd5f0fb2a330b99e85bfa64abaa430cb714d301ed169df14a1f302a75d952992f0d8fa6ab02fa6716165bdf23b63aa'
- '14f45171afc3b15488b40a05e58b352c5057da3a5782e13527392f7750d8e45a8db54f9b50b218fedb8bf679de3b4e5d78e230a44f7b1aa482f7b3aa831bd641'
- '6a195695fcd207adbdea28dd2803b479f6e5dc478b56d5fce16a7600f719fa545ed0e468a26f9c94e982346fb803a0ff026abd0d70335e42027468475beb7cbb')
+ 'dda152592dec643bce44754bf5d2d43a5897cc57f8dc258b87857055a45abf903d619aba1de389228cb086a17fedea5458f8fe2c0993fa20213bb7c5bca331c8'
+ '13330cf57b5c6b928ea73bd30479010688cf8d2003107b041a7fdad33c1ac225c8c905bef235cd762d6ea76be754b5db6be769526bacf7333298f72d6afff535'
+ '381e0f177faa3090d1abf4d11a97db535712840870265dea167d7692dee7733a226d09c103d01705d5c0809fa66c7a23efea9da2473da672644b06e31db77083'
+ 'cd9da0dee048fc52a3032343f122c2055081eeedfc8a3e5227218f0f63fc7618e8fe744c8caa7e3a2ca844f4aaf7314b57a306d0d3b1849e97b24687b8c5a501'
+ '1810832172e1b006a5471d8e317573343884feed9abc9e7380a32d83c958b0e6aa68adf9a647c9b7b714783997591f5d80e754c6e7357279661eee998f22864c'
+ '4e7cb958f95d99bba9810e675d4f1b0b3c171f78e9fe96ff9d265f792f4ceb1367f2f4d238f36b5ca1c395e14abdabbf0f8ce2dc07c4fe567d822a8b629dfa05'
+ '2251f8bf84e141b4661f84cc2ce7b21783ac0a349b2651477dfcbc5383b796b2e588d85ee411398b15c820cb3672256be8ed281c8bccfad252c9dd5b0e1e0cd5'
+ '14f45171afc3b15488b40a05e58b352c5057da3a5782e13527392f7750d8e45a8db54f9b50b218fedb8bf679de3b4e5d78e230a44f7b1aa482f7b3aa831bd641')
export KBUILD_BUILD_HOST=archlinux
export KBUILD_BUILD_USER=$pkgbase
diff --git a/config b/config
index b0287e597b21..b7636b896135 100644
--- a/config
+++ b/config
@@ -1,6 +1,6 @@
#
# Automatically generated file; DO NOT EDIT.
-# Linux/x86 5.12.10 Kernel Configuration
+# Linux/x86 5.12.12 Kernel Configuration
#
CONFIG_CC_VERSION_TEXT="gcc (GCC) 11.1.0"
CONFIG_CC_IS_GCC=y
@@ -488,7 +488,6 @@ CONFIG_X86_PMEM_LEGACY_DEVICE=y
CONFIG_X86_PMEM_LEGACY=m
CONFIG_X86_CHECK_BIOS_CORRUPTION=y
CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y
-CONFIG_X86_RESERVE_LOW=64
CONFIG_MTRR=y
CONFIG_MTRR_SANITIZER=y
CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT=1
@@ -2904,10 +2903,12 @@ CONFIG_NETCONSOLE_DYNAMIC=y
CONFIG_NETPOLL=y
CONFIG_NET_POLL_CONTROLLER=y
CONFIG_NTB_NETDEV=m
-# CONFIG_RIONET is not set
+CONFIG_RIONET=m
+CONFIG_RIONET_TX_SIZE=128
+CONFIG_RIONET_RX_SIZE=128
CONFIG_TUN=m
CONFIG_TAP=m
-# CONFIG_TUN_VNET_CROSS_LE is not set
+CONFIG_TUN_VNET_CROSS_LE=y
CONFIG_VETH=m
CONFIG_VIRTIO_NET=m
CONFIG_NLMON=m
@@ -2915,7 +2916,17 @@ CONFIG_NET_VRF=m
CONFIG_VSOCKMON=m
CONFIG_MHI_NET=m
CONFIG_SUNGEM_PHY=m
-# CONFIG_ARCNET is not set
+CONFIG_ARCNET=m
+CONFIG_ARCNET_1201=m
+CONFIG_ARCNET_1051=m
+CONFIG_ARCNET_RAW=m
+CONFIG_ARCNET_CAP=m
+CONFIG_ARCNET_COM90xx=m
+CONFIG_ARCNET_COM90xxIO=m
+CONFIG_ARCNET_RIM_I=m
+CONFIG_ARCNET_COM20020=m
+CONFIG_ARCNET_COM20020_PCI=m
+CONFIG_ARCNET_COM20020_CS=m
CONFIG_ATM_DRIVERS=y
# CONFIG_ATM_DUMMY is not set
CONFIG_ATM_TCP=m
@@ -3769,7 +3780,24 @@ CONFIG_PCMCIA_WL3501=m
CONFIG_MAC80211_HWSIM=m
CONFIG_USB_NET_RNDIS_WLAN=m
CONFIG_VIRT_WIFI=m
-# CONFIG_WAN is not set
+CONFIG_WAN=y
+CONFIG_LANMEDIA=m
+CONFIG_HDLC=m
+CONFIG_HDLC_RAW=m
+CONFIG_HDLC_RAW_ETH=m
+CONFIG_HDLC_CISCO=m
+CONFIG_HDLC_FR=m
+CONFIG_HDLC_PPP=m
+
+#
+# X.25/LAPB support is disabled
+#
+CONFIG_PCI200SYN=m
+CONFIG_WANXL=m
+CONFIG_PC300TOO=m
+CONFIG_FARSYNC=m
+CONFIG_SBNI=m
+CONFIG_SBNI_MULTILINE=y
CONFIG_IEEE802154_DRIVERS=m
CONFIG_IEEE802154_FAKELB=m
CONFIG_IEEE802154_AT86RF230=m
@@ -3787,7 +3815,7 @@ CONFIG_XEN_NETDEV_BACKEND=m
CONFIG_VMXNET3=m
CONFIG_FUJITSU_ES=m
CONFIG_USB4_NET=m
-# CONFIG_HYPERV_NET is not set
+CONFIG_HYPERV_NET=m
CONFIG_NETDEVSIM=m
CONFIG_NET_FAILOVER=m
CONFIG_ISDN=y
@@ -6122,8 +6150,8 @@ CONFIG_DVB_DUMMY_FE=m
CONFIG_AGP=m
CONFIG_AGP_AMD64=m
CONFIG_AGP_INTEL=m
-# CONFIG_AGP_SIS is not set
-# CONFIG_AGP_VIA is not set
+CONFIG_AGP_SIS=m
+CONFIG_AGP_VIA=m
CONFIG_INTEL_GTT=m
CONFIG_VGA_ARB=y
CONFIG_VGA_ARB_MAX_GPUS=10
@@ -6132,7 +6160,7 @@ CONFIG_DRM=m
CONFIG_DRM_MIPI_DBI=m
CONFIG_DRM_MIPI_DSI=y
CONFIG_DRM_DP_AUX_CHARDEV=y
-# CONFIG_DRM_DEBUG_SELFTEST is not set
+CONFIG_DRM_DEBUG_SELFTEST=m
CONFIG_DRM_KMS_HELPER=m
CONFIG_DRM_KMS_FB_HELPER=y
CONFIG_DRM_FBDEV_EMULATION=y
@@ -6185,7 +6213,14 @@ CONFIG_DRM_AMD_DC_SI=y
# end of Display Engine Configuration
CONFIG_HSA_AMD=y
-# CONFIG_DRM_NOUVEAU is not set
+CONFIG_DRM_NOUVEAU=m
+CONFIG_NOUVEAU_LEGACY_CTX_SUPPORT=y
+CONFIG_NOUVEAU_DEBUG=5
+CONFIG_NOUVEAU_DEBUG_DEFAULT=3
+# CONFIG_NOUVEAU_DEBUG_MMU is not set
+# CONFIG_NOUVEAU_DEBUG_PUSH is not set
+CONFIG_DRM_NOUVEAU_BACKLIGHT=y
+CONFIG_DRM_NOUVEAU_SVM=y
CONFIG_DRM_I915=m
CONFIG_DRM_I915_FORCE_PROBE=""
CONFIG_DRM_I915_CAPTURE_ERROR=y
@@ -6244,8 +6279,16 @@ CONFIG_TINYDRM_ST7735R=m
CONFIG_DRM_XEN=y
CONFIG_DRM_XEN_FRONTEND=m
CONFIG_DRM_VBOXVIDEO=m
-# CONFIG_DRM_LEGACY is not set
+CONFIG_DRM_LEGACY=y
+# CONFIG_DRM_TDFX is not set
+# CONFIG_DRM_R128 is not set
+# CONFIG_DRM_MGA is not set
+# CONFIG_DRM_SIS is not set
+# CONFIG_DRM_VIA is not set
+# CONFIG_DRM_SAVAGE is not set
+CONFIG_DRM_EXPORT_FOR_TESTS=y
CONFIG_DRM_PANEL_ORIENTATION_QUIRKS=y
+CONFIG_DRM_LIB_RANDOM=y
#
# Frame buffer Devices
@@ -6318,16 +6361,23 @@ CONFIG_FB_ATY_CT=y
CONFIG_FB_ATY_GENERIC_LCD=y
CONFIG_FB_ATY_GX=y
CONFIG_FB_ATY_BACKLIGHT=y
-# CONFIG_FB_S3 is not set
-# CONFIG_FB_SAVAGE is not set
-# CONFIG_FB_SIS is not set
+CONFIG_FB_S3=m
+CONFIG_FB_S3_DDC=y
+CONFIG_FB_SAVAGE=m
+CONFIG_FB_SAVAGE_I2C=y
+CONFIG_FB_SAVAGE_ACCEL=y
+CONFIG_FB_SIS=m
+CONFIG_FB_SIS_300=y
+CONFIG_FB_SIS_315=y
CONFIG_FB_VIA=m
# CONFIG_FB_VIA_DIRECT_PROCFS is not set
CONFIG_FB_VIA_X_COMPATIBILITY=y
CONFIG_FB_NEOMAGIC=m
-# CONFIG_FB_KYRO is not set
-# CONFIG_FB_3DFX is not set
-# CONFIG_FB_VOODOO1 is not set
+CONFIG_FB_KYRO=m
+CONFIG_FB_3DFX=m
+CONFIG_FB_3DFX_ACCEL=y
+CONFIG_FB_3DFX_I2C=y
+CONFIG_FB_VOODOO1=m
CONFIG_FB_VT8623=m
# CONFIG_FB_TRIDENT is not set
# CONFIG_FB_ARK is not set
@@ -6341,7 +6391,7 @@ CONFIG_FB_IBM_GXT4500=m
CONFIG_XEN_FBDEV_FRONTEND=m
# CONFIG_FB_METRONOME is not set
# CONFIG_FB_MB862XX is not set
-# CONFIG_FB_HYPERV is not set
+CONFIG_FB_HYPERV=m
CONFIG_FB_SIMPLE=y
# CONFIG_FB_SM712 is not set
# end of Frame buffer Devices
@@ -6365,7 +6415,7 @@ CONFIG_LCD_OTM3225A=m
CONFIG_BACKLIGHT_CLASS_DEVICE=y
CONFIG_BACKLIGHT_KTD253=m
CONFIG_BACKLIGHT_LM3533=m
-# CONFIG_BACKLIGHT_CARILLO_RANCH is not set
+CONFIG_BACKLIGHT_CARILLO_RANCH=m
CONFIG_BACKLIGHT_PWM=m
CONFIG_BACKLIGHT_DA903X=m
CONFIG_BACKLIGHT_DA9052=m
@@ -9515,7 +9565,7 @@ CONFIG_GENERIC_STRNLEN_USER=y
CONFIG_GENERIC_NET_UTILS=y
CONFIG_GENERIC_FIND_FIRST_BIT=y
CONFIG_CORDIC=m
-# CONFIG_PRIME_NUMBERS is not set
+CONFIG_PRIME_NUMBERS=m
CONFIG_RATIONAL=y
CONFIG_GENERIC_PCI_IOMAP=y
CONFIG_GENERIC_IOMAP=y