diff options
Diffstat (limited to '0005-btrfs-wait-on-uncached-block-groups-on-every-allocat.patch')
-rw-r--r-- | 0005-btrfs-wait-on-uncached-block-groups-on-every-allocat.patch | 255 |
1 files changed, 255 insertions, 0 deletions
diff --git a/0005-btrfs-wait-on-uncached-block-groups-on-every-allocat.patch b/0005-btrfs-wait-on-uncached-block-groups-on-every-allocat.patch new file mode 100644 index 000000000000..493337c64d42 --- /dev/null +++ b/0005-btrfs-wait-on-uncached-block-groups-on-every-allocat.patch @@ -0,0 +1,255 @@ +From 8da079307d115705e243d226591dcb4388cef7e2 Mon Sep 17 00:00:00 2001 +From: Josef Bacik <josef@toxicpanda.com> +Date: Mon, 31 Jul 2023 16:28:43 -0400 +Subject: [PATCH 5/8] btrfs: wait on uncached block groups on every allocation + loop + +My initial fix for the generic/475 hangs was related to metadata, but +our CI testing uncovered another case where we hang for similar reasons. +We again have a task with a plug that is holding an outstanding request +that is keeping the dm device from finishing it's suspend, and that task +is stuck in the allocator. + +This time it is stuck trying to allocate data, but we do not have a +block group that matches the size class. The larger loop in the +allocator looks like this (simplified of course) + + find_free_extent + for_each_block_group { + ffe_ctl->cached == btrfs_block_group_cache_done(bg) + if (!ffe_ctl->cached) + ffe_ctl->have_caching_bg = true; + do_allocation() + btrfs_wait_block_group_cache_progress(); + } + + if (loop == LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg) + go search again; + +In my earlier fix we were trying to allocate from the block group, but +we weren't waiting for the progress because we were only waiting for the +free space to be >= the amount of free space we wanted. My fix made it +so we waited for forward progress to be made as well, so we would be +sure to wait. + +This time however we did not have a block group that matched our size +class, so what was happening was this + + find_free_extent + for_each_block_group { + ffe_ctl->cached == btrfs_block_group_cache_done(bg) + if (!ffe_ctl->cached) + ffe_ctl->have_caching_bg = true; + if (size_class_doesn't_match()) + goto loop; + do_allocation() + btrfs_wait_block_group_cache_progress(); + loop: + release_block_group(block_group); + } + + if (loop == LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg) + go search again; + +The size_class_doesn't_match() part was true, so we'd just skip this +block group and never wait for caching, and then because we found a +caching block group we'd just go back and do the loop again. We never +sleep and thus never flush the plug and we have the same deadlock. + +Fix the logic for waiting on the block group caching to instead do it +unconditionally when we goto loop. This takes the logic out of the +allocation step, so now the loop looks more like this + + find_free_extent + for_each_block_group { + ffe_ctl->cached == btrfs_block_group_cache_done(bg) + if (!ffe_ctl->cached) + ffe_ctl->have_caching_bg = true; + if (size_class_doesn't_match()) + goto loop; + do_allocation() + btrfs_wait_block_group_cache_progress(); + loop: + if (loop > LOOP_CACHING_NOWAIT && !ffe_ctl->retry_uncached && + !ffe_ctl->cached) { + ffe_ctl->retry_uncached = true; + btrfs_wait_block_group_cache_progress(); + } + + release_block_group(block_group); + } + + if (loop == LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg) + go search again; + +This simplifies the logic a lot, and makes sure that if we're hitting +uncached block groups we're always waiting on them at some point. + +I ran this through 100 iterations of generic/475, as this particular +case was harder to hit than the previous one. + +Signed-off-by: Josef Bacik <josef@toxicpanda.com> +Signed-off-by: David Sterba <dsterba@suse.com> +--- + fs/btrfs/extent-tree.c | 61 +++++++++++++----------------------------- + fs/btrfs/extent-tree.h | 13 +++------ + 2 files changed, 22 insertions(+), 52 deletions(-) + +diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c +index f396a9afa4032..6096bd98e6c70 100644 +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -3427,7 +3427,6 @@ btrfs_release_block_group(struct btrfs_block_group *cache, + * Helper function for find_free_extent(). + * + * Return -ENOENT to inform caller that we need fallback to unclustered mode. +- * Return -EAGAIN to inform caller that we need to re-search this block group + * Return >0 to inform caller that we find nothing + * Return 0 means we have found a location and set ffe_ctl->found_offset. + */ +@@ -3508,14 +3507,6 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg, + trace_btrfs_reserve_extent_cluster(bg, ffe_ctl); + return 0; + } +- } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && +- !ffe_ctl->retry_clustered) { +- spin_unlock(&last_ptr->refill_lock); +- +- ffe_ctl->retry_clustered = true; +- btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + +- ffe_ctl->empty_cluster + ffe_ctl->empty_size); +- return -EAGAIN; + } + /* + * At this point we either didn't find a cluster or we weren't able to +@@ -3530,7 +3521,6 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg, + /* + * Return >0 to inform caller that we find nothing + * Return 0 when we found an free extent and set ffe_ctrl->found_offset +- * Return -EAGAIN to inform caller that we need to re-search this block group + */ + static int find_free_extent_unclustered(struct btrfs_block_group *bg, + struct find_free_extent_ctl *ffe_ctl) +@@ -3568,25 +3558,8 @@ static int find_free_extent_unclustered(struct btrfs_block_group *bg, + offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start, + ffe_ctl->num_bytes, ffe_ctl->empty_size, + &ffe_ctl->max_extent_size); +- +- /* +- * If we didn't find a chunk, and we haven't failed on this block group +- * before, and this block group is in the middle of caching and we are +- * ok with waiting, then go ahead and wait for progress to be made, and +- * set @retry_unclustered to true. +- * +- * If @retry_unclustered is true then we've already waited on this +- * block group once and should move on to the next block group. +- */ +- if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached && +- ffe_ctl->loop > LOOP_CACHING_NOWAIT) { +- btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + +- ffe_ctl->empty_size); +- ffe_ctl->retry_unclustered = true; +- return -EAGAIN; +- } else if (!offset) { ++ if (!offset) + return 1; +- } + ffe_ctl->found_offset = offset; + return 0; + } +@@ -3600,7 +3573,7 @@ static int do_allocation_clustered(struct btrfs_block_group *block_group, + /* We want to try and use the cluster allocator, so lets look there */ + if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) { + ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret); +- if (ret >= 0 || ret == -EAGAIN) ++ if (ret >= 0) + return ret; + /* ret == -ENOENT case falls through */ + } +@@ -3816,8 +3789,7 @@ static void release_block_group(struct btrfs_block_group *block_group, + { + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: +- ffe_ctl->retry_clustered = false; +- ffe_ctl->retry_unclustered = false; ++ ffe_ctl->retry_uncached = false; + break; + case BTRFS_EXTENT_ALLOC_ZONED: + /* Nothing to do */ +@@ -4168,9 +4140,7 @@ static noinline int find_free_extent(struct btrfs_root *root, + ffe_ctl->orig_have_caching_bg = false; + ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags); + ffe_ctl->loop = 0; +- /* For clustered allocation */ +- ffe_ctl->retry_clustered = false; +- ffe_ctl->retry_unclustered = false; ++ ffe_ctl->retry_uncached = false; + ffe_ctl->cached = 0; + ffe_ctl->max_extent_size = 0; + ffe_ctl->total_free_space = 0; +@@ -4321,16 +4291,12 @@ static noinline int find_free_extent(struct btrfs_root *root, + + bg_ret = NULL; + ret = do_allocation(block_group, ffe_ctl, &bg_ret); +- if (ret == 0) { +- if (bg_ret && bg_ret != block_group) { +- btrfs_release_block_group(block_group, +- ffe_ctl->delalloc); +- block_group = bg_ret; +- } +- } else if (ret == -EAGAIN) { +- goto have_block_group; +- } else if (ret > 0) { ++ if (ret > 0) + goto loop; ++ ++ if (bg_ret && bg_ret != block_group) { ++ btrfs_release_block_group(block_group, ffe_ctl->delalloc); ++ block_group = bg_ret; + } + + /* Checks */ +@@ -4371,6 +4337,15 @@ static noinline int find_free_extent(struct btrfs_root *root, + btrfs_release_block_group(block_group, ffe_ctl->delalloc); + break; + loop: ++ if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && ++ !ffe_ctl->retry_uncached) { ++ ffe_ctl->retry_uncached = true; ++ btrfs_wait_block_group_cache_progress(block_group, ++ ffe_ctl->num_bytes + ++ ffe_ctl->empty_cluster + ++ ffe_ctl->empty_size); ++ goto have_block_group; ++ } + release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc); + cond_resched(); + } +diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h +index 429d5c5700618..6bfba2f22fdd4 100644 +--- a/fs/btrfs/extent-tree.h ++++ b/fs/btrfs/extent-tree.h +@@ -48,16 +48,11 @@ struct find_free_extent_ctl { + int loop; + + /* +- * Whether we're refilling a cluster, if true we need to re-search +- * current block group but don't try to refill the cluster again. ++ * Set to true if we're retrying the allocation on this block group ++ * after waiting for caching progress, this is so that we retry only ++ * once before moving on to another block group. + */ +- bool retry_clustered; +- +- /* +- * Whether we're updating free space cache, if true we need to re-search +- * current block group but don't try updating free space cache again. +- */ +- bool retry_unclustered; ++ bool retry_uncached; + + /* If current block group is cached */ + int cached; +-- +2.41.0 + |