Diffstat (limited to '0005-btrfs-wait-on-uncached-block-groups-on-every-allocat.patch')
-rw-r--r--  0005-btrfs-wait-on-uncached-block-groups-on-every-allocat.patch | 255
1 file changed, 255 insertions, 0 deletions
diff --git a/0005-btrfs-wait-on-uncached-block-groups-on-every-allocat.patch b/0005-btrfs-wait-on-uncached-block-groups-on-every-allocat.patch
new file mode 100644
index 000000000000..493337c64d42
--- /dev/null
+++ b/0005-btrfs-wait-on-uncached-block-groups-on-every-allocat.patch
@@ -0,0 +1,255 @@
+From 8da079307d115705e243d226591dcb4388cef7e2 Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Mon, 31 Jul 2023 16:28:43 -0400
+Subject: [PATCH 5/8] btrfs: wait on uncached block groups on every allocation
+ loop
+
+My initial fix for the generic/475 hangs was related to metadata, but
+our CI testing uncovered another case where we hang for similar reasons.
+We again have a task with a plug holding an outstanding request that
+keeps the dm device from finishing its suspend, and that task is stuck
+in the allocator.
+
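+For illustration, the plugging pattern involved looks roughly like this
+(a sketch of the mechanism only, not the exact btrfs call sites):
+
+    struct blk_plug plug;
+
+    blk_start_plug(&plug);
+    /* bios submitted here can be held in a per-task plug list */
+    submit_bio(bio);
+    /*
+     * Held requests are only issued once the task calls
+     * blk_finish_plug() or actually sleeps (schedule() flushes the
+     * plug).  A task that loops without sleeping never releases its
+     * request, so the dm device cannot finish suspending.
+     */
+    blk_finish_plug(&plug);
+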
+This time it is stuck trying to allocate data, but we do not have a
+block group that matches the size class. The larger loop in the
+allocator looks like this (simplified, of course):
+
+ find_free_extent
+ for_each_block_group {
+     ffe_ctl->cached = btrfs_block_group_cache_done(bg)
+ if (!ffe_ctl->cached)
+ ffe_ctl->have_caching_bg = true;
+ do_allocation()
+ btrfs_wait_block_group_cache_progress();
+ }
+
+ if (loop == LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
+ go search again;
+
+In my earlier fix we were trying to allocate from the block group, but
+we weren't waiting for caching progress because we were only waiting
+for the free space to become >= the amount of free space we wanted. My
+fix made us wait for forward progress to be made as well, so we were
+sure to actually wait.
+
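+A simplified sketch of what that wait amounts to (condensed from the
+earlier fix; field and helper names abbreviated):
+
+    /* sleep until caching finishes or the caching thread advances */
+    progress = atomic_read(&caching_ctl->progress);
+    wait_event(caching_ctl->wait,
+               btrfs_block_group_cache_done(bg) ||
+               progress != atomic_read(&caching_ctl->progress));
+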
+This time, however, we did not have a block group that matched our size
+class, so what was happening was this:
+
+ find_free_extent
+ for_each_block_group {
+     ffe_ctl->cached = btrfs_block_group_cache_done(bg)
+ if (!ffe_ctl->cached)
+ ffe_ctl->have_caching_bg = true;
+ if (size_class_doesn't_match())
+ goto loop;
+ do_allocation()
+ btrfs_wait_block_group_cache_progress();
+ loop:
+ release_block_group(block_group);
+ }
+
+ if (loop == LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
+ go search again;
+
+The size_class_doesn't_match() part was true, so we'd just skip this
+block group and never wait for caching, and then, because we had found
+a caching block group, we'd just go back and do the loop again. We
+never slept and thus never flushed the plug, so we hit the same
+deadlock.
+
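+For background, data allocations are bucketed into size classes, and a
+block group prefers extents of its own class.  Roughly, with the
+thresholds here being illustrative rather than authoritative:
+
+    if (size <= SZ_128K)
+        size_class = small;
+    else if (size <= SZ_8M)
+        size_class = medium;
+    else
+        size_class = large;
+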
+Fix the logic for waiting on the block group caching to instead do it
+unconditionally when we goto loop. This takes the wait out of the
+allocation step, so now the loop looks more like this:
+
+ find_free_extent
+ for_each_block_group {
+     ffe_ctl->cached = btrfs_block_group_cache_done(bg)
+ if (!ffe_ctl->cached)
+ ffe_ctl->have_caching_bg = true;
+ if (size_class_doesn't_match())
+ goto loop;
+ do_allocation()
+ btrfs_wait_block_group_cache_progress();
+ loop:
+ if (loop > LOOP_CACHING_NOWAIT && !ffe_ctl->retry_uncached &&
+ !ffe_ctl->cached) {
+ ffe_ctl->retry_uncached = true;
+ btrfs_wait_block_group_cache_progress();
+ }
+
+ release_block_group(block_group);
+ }
+
+ if (loop == LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
+ go search again;
+
+This simplifies the logic a lot, and makes sure that if we're hitting
+uncached block groups we always wait on them at some point.
+
+I ran this through 100 iterations of generic/475, as this particular
+case was harder to hit than the previous one.
+
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+---
+ fs/btrfs/extent-tree.c | 61 +++++++++++++-----------------------------
+ fs/btrfs/extent-tree.h | 13 +++------
+ 2 files changed, 22 insertions(+), 52 deletions(-)
+
+diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
+index f396a9afa4032..6096bd98e6c70 100644
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -3427,7 +3427,6 @@ btrfs_release_block_group(struct btrfs_block_group *cache,
+ * Helper function for find_free_extent().
+ *
+ * Return -ENOENT to inform caller that we need fallback to unclustered mode.
+- * Return -EAGAIN to inform caller that we need to re-search this block group
+ * Return >0 to inform caller that we find nothing
+ * Return 0 means we have found a location and set ffe_ctl->found_offset.
+ */
+@@ -3508,14 +3507,6 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg,
+ trace_btrfs_reserve_extent_cluster(bg, ffe_ctl);
+ return 0;
+ }
+- } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
+- !ffe_ctl->retry_clustered) {
+- spin_unlock(&last_ptr->refill_lock);
+-
+- ffe_ctl->retry_clustered = true;
+- btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
+- ffe_ctl->empty_cluster + ffe_ctl->empty_size);
+- return -EAGAIN;
+ }
+ /*
+ * At this point we either didn't find a cluster or we weren't able to
+@@ -3530,7 +3521,6 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg,
+ /*
+ * Return >0 to inform caller that we find nothing
+ * Return 0 when we found an free extent and set ffe_ctrl->found_offset
+- * Return -EAGAIN to inform caller that we need to re-search this block group
+ */
+ static int find_free_extent_unclustered(struct btrfs_block_group *bg,
+ struct find_free_extent_ctl *ffe_ctl)
+@@ -3568,25 +3558,8 @@ static int find_free_extent_unclustered(struct btrfs_block_group *bg,
+ offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
+ ffe_ctl->num_bytes, ffe_ctl->empty_size,
+ &ffe_ctl->max_extent_size);
+-
+- /*
+- * If we didn't find a chunk, and we haven't failed on this block group
+- * before, and this block group is in the middle of caching and we are
+- * ok with waiting, then go ahead and wait for progress to be made, and
+- * set @retry_unclustered to true.
+- *
+- * If @retry_unclustered is true then we've already waited on this
+- * block group once and should move on to the next block group.
+- */
+- if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
+- ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
+- btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
+- ffe_ctl->empty_size);
+- ffe_ctl->retry_unclustered = true;
+- return -EAGAIN;
+- } else if (!offset) {
++ if (!offset)
+ return 1;
+- }
+ ffe_ctl->found_offset = offset;
+ return 0;
+ }
+@@ -3600,7 +3573,7 @@ static int do_allocation_clustered(struct btrfs_block_group *block_group,
+ /* We want to try and use the cluster allocator, so lets look there */
+ if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
+ ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
+- if (ret >= 0 || ret == -EAGAIN)
++ if (ret >= 0)
+ return ret;
+ /* ret == -ENOENT case falls through */
+ }
+@@ -3816,8 +3789,7 @@ static void release_block_group(struct btrfs_block_group *block_group,
+ {
+ switch (ffe_ctl->policy) {
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
+- ffe_ctl->retry_clustered = false;
+- ffe_ctl->retry_unclustered = false;
++ ffe_ctl->retry_uncached = false;
+ break;
+ case BTRFS_EXTENT_ALLOC_ZONED:
+ /* Nothing to do */
+@@ -4168,9 +4140,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
+ ffe_ctl->orig_have_caching_bg = false;
+ ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags);
+ ffe_ctl->loop = 0;
+- /* For clustered allocation */
+- ffe_ctl->retry_clustered = false;
+- ffe_ctl->retry_unclustered = false;
++ ffe_ctl->retry_uncached = false;
+ ffe_ctl->cached = 0;
+ ffe_ctl->max_extent_size = 0;
+ ffe_ctl->total_free_space = 0;
+@@ -4321,16 +4291,12 @@ static noinline int find_free_extent(struct btrfs_root *root,
+
+ bg_ret = NULL;
+ ret = do_allocation(block_group, ffe_ctl, &bg_ret);
+- if (ret == 0) {
+- if (bg_ret && bg_ret != block_group) {
+- btrfs_release_block_group(block_group,
+- ffe_ctl->delalloc);
+- block_group = bg_ret;
+- }
+- } else if (ret == -EAGAIN) {
+- goto have_block_group;
+- } else if (ret > 0) {
++ if (ret > 0)
+ goto loop;
++
++ if (bg_ret && bg_ret != block_group) {
++ btrfs_release_block_group(block_group, ffe_ctl->delalloc);
++ block_group = bg_ret;
+ }
+
+ /* Checks */
+@@ -4371,6 +4337,15 @@ static noinline int find_free_extent(struct btrfs_root *root,
+ btrfs_release_block_group(block_group, ffe_ctl->delalloc);
+ break;
+ loop:
++ if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
++ !ffe_ctl->retry_uncached) {
++ ffe_ctl->retry_uncached = true;
++ btrfs_wait_block_group_cache_progress(block_group,
++ ffe_ctl->num_bytes +
++ ffe_ctl->empty_cluster +
++ ffe_ctl->empty_size);
++ goto have_block_group;
++ }
+ release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc);
+ cond_resched();
+ }
+diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
+index 429d5c5700618..6bfba2f22fdd4 100644
+--- a/fs/btrfs/extent-tree.h
++++ b/fs/btrfs/extent-tree.h
+@@ -48,16 +48,11 @@ struct find_free_extent_ctl {
+ int loop;
+
+ /*
+- * Whether we're refilling a cluster, if true we need to re-search
+- * current block group but don't try to refill the cluster again.
++ * Set to true if we're retrying the allocation on this block group
++ * after waiting for caching progress; this is so that we retry only
++ * once before moving on to another block group.
+ */
+- bool retry_clustered;
+-
+- /*
+- * Whether we're updating free space cache, if true we need to re-search
+- * current block group but don't try updating free space cache again.
+- */
+- bool retry_unclustered;
++ bool retry_uncached;
+
+ /* If current block group is cached */
+ int cached;
+--
+2.41.0
+