Diffstat (limited to '0005-btrfs-wait-on-uncached-block-groups-on-every-allocat.patch')
-rw-r--r--  0005-btrfs-wait-on-uncached-block-groups-on-every-allocat.patch | 255
1 file changed, 255 insertions, 0 deletions
diff --git a/0005-btrfs-wait-on-uncached-block-groups-on-every-allocat.patch b/0005-btrfs-wait-on-uncached-block-groups-on-every-allocat.patch
new file mode 100644
index 000000000000..493337c64d42
--- /dev/null
+++ b/0005-btrfs-wait-on-uncached-block-groups-on-every-allocat.patch
@@ -0,0 +1,255 @@
+From 8da079307d115705e243d226591dcb4388cef7e2 Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Mon, 31 Jul 2023 16:28:43 -0400
+Subject: [PATCH 5/8] btrfs: wait on uncached block groups on every allocation
+ loop
+
+My initial fix for the generic/475 hangs was related to metadata, but
+our CI testing uncovered another case where we hang for similar reasons.
+We again have a task with a plug holding an outstanding request that
+keeps the dm device from finishing its suspend, and that task is stuck
+in the allocator.
+
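+For illustration, the plugging pattern involved looks roughly like this
+(a sketch of the mechanism only, not the exact btrfs call sites):
+
+    struct blk_plug plug;
+
+    blk_start_plug(&plug);
+    /* bios submitted here can be held in a per-task plug list */
+    submit_bio(bio);
+    /*
+     * Held requests are only issued once the task calls
+     * blk_finish_plug() or actually sleeps (schedule() flushes the
+     * plug).  A task that loops without sleeping never releases its
+     * request, so the dm device cannot finish suspending.
+     */
+    blk_finish_plug(&plug);
+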
+This time it is stuck trying to allocate data, but we do not have a
+block group that matches the size class. The larger loop in the
+allocator looks like this (simplified, of course):
+
+ find_free_extent
+ for_each_block_group {
+     ffe_ctl->cached = btrfs_block_group_cache_done(bg)
+ if (!ffe_ctl->cached)
+ ffe_ctl->have_caching_bg = true;
+ do_allocation()
+ btrfs_wait_block_group_cache_progress();
+ }
+
+ if (loop == LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
+ go search again;
+
+In my earlier fix we were trying to allocate from the block group, but
+we weren't waiting for caching progress because we were only waiting
+for the free space to become >= the amount of free space we wanted. My
+fix made us wait for forward progress to be made as well, so we were
+sure to actually wait.
+
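+A simplified sketch of what that wait amounts to (condensed from the
+earlier fix; field and helper names abbreviated):
+
+    /* sleep until caching finishes or the caching thread advances */
+    progress = atomic_read(&caching_ctl->progress);
+    wait_event(caching_ctl->wait,
+               btrfs_block_group_cache_done(bg) ||
+               progress != atomic_read(&caching_ctl->progress));
+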
+This time, however, we did not have a block group that matched our size
+class, so what was happening was this:
+
+ find_free_extent
+ for_each_block_group {
+     ffe_ctl->cached = btrfs_block_group_cache_done(bg)
+ if (!ffe_ctl->cached)
+ ffe_ctl->have_caching_bg = true;
+ if (size_class_doesn't_match())
+ goto loop;
+ do_allocation()
+ btrfs_wait_block_group_cache_progress();
+ loop:
+ release_block_group(block_group);
+ }
+
+ if (loop == LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
+ go search again;
+
+The size_class_doesn't_match() part was true, so we'd just skip this
+block group and never wait for caching, and then, because we had found
+a caching block group, we'd just go back and do the loop again. We
+never slept and thus never flushed the plug, so we hit the same
+deadlock.
+
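+For background, data allocations are bucketed into size classes, and a
+block group prefers extents of its own class.  Roughly, with the
+thresholds here being illustrative rather than authoritative:
+
+    if (size <= SZ_128K)
+        size_class = small;
+    else if (size <= SZ_8M)
+        size_class = medium;
+    else
+        size_class = large;
+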
+Fix the logic for waiting on the block group caching to instead do it
+unconditionally when we goto loop. This takes the wait out of the
+allocation step, so now the loop looks more like this:
+
+ find_free_extent
+ for_each_block_group {
+     ffe_ctl->cached = btrfs_block_group_cache_done(bg)
+ if (!ffe_ctl->cached)
+ ffe_ctl->have_caching_bg = true;
+ if (size_class_doesn't_match())
+ goto loop;
+ do_allocation()
+ btrfs_wait_block_group_cache_progress();
+ loop:
+ if (loop > LOOP_CACHING_NOWAIT && !ffe_ctl->retry_uncached &&
+ !ffe_ctl->cached) {
+ ffe_ctl->retry_uncached = true;
+ btrfs_wait_block_group_cache_progress();
+ }
+
+ release_block_group(block_group);
+ }
+
+ if (loop == LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
+ go search again;
+
+This simplifies the logic a lot, and makes sure that if we're hitting
+uncached block groups we always wait on them at some point.
+
+I ran this through 100 iterations of generic/475, as this particular
+case was harder to hit than the previous one.
+
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+---
+ fs/btrfs/extent-tree.c | 61 +++++++++++++-----------------------------
+ fs/btrfs/extent-tree.h | 13 +++------
+ 2 files changed, 22 insertions(+), 52 deletions(-)
+
+diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
+index f396a9afa4032..6096bd98e6c70 100644
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -3427,7 +3427,6 @@ btrfs_release_block_group(struct btrfs_block_group *cache,
+ * Helper function for find_free_extent().
+ *
+ * Return -ENOENT to inform caller that we need fallback to unclustered mode.
+- * Return -EAGAIN to inform caller that we need to re-search this block group
+ * Return >0 to inform caller that we find nothing
+ * Return 0 means we have found a location and set ffe_ctl->found_offset.
+ */
+@@ -3508,14 +3507,6 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg,
+ trace_btrfs_reserve_extent_cluster(bg, ffe_ctl);
+ return 0;
+ }
+- } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
+- !ffe_ctl->retry_clustered) {
+- spin_unlock(&last_ptr->refill_lock);
+-
+- ffe_ctl->retry_clustered = true;
+- btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
+- ffe_ctl->empty_cluster + ffe_ctl->empty_size);
+- return -EAGAIN;
+ }
+ /*
+ * At this point we either didn't find a cluster or we weren't able to
+@@ -3530,7 +3521,6 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg,
+ /*
+ * Return >0 to inform caller that we find nothing
+ * Return 0 when we found an free extent and set ffe_ctrl->found_offset
+- * Return -EAGAIN to inform caller that we need to re-search this block group
+ */
+ static int find_free_extent_unclustered(struct btrfs_block_group *bg,
+ struct find_free_extent_ctl *ffe_ctl)
+@@ -3568,25 +3558,8 @@ static int find_free_extent_unclustered(struct btrfs_block_group *bg,
+ offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
+ ffe_ctl->num_bytes, ffe_ctl->empty_size,
+ &ffe_ctl->max_extent_size);
+-
+- /*
+- * If we didn't find a chunk, and we haven't failed on this block group
+- * before, and this block group is in the middle of caching and we are
+- * ok with waiting, then go ahead and wait for progress to be made, and
+- * set @retry_unclustered to true.
+- *
+- * If @retry_unclustered is true then we've already waited on this
+- * block group once and should move on to the next block group.
+- */
+- if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
+- ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
+- btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
+- ffe_ctl->empty_size);
+- ffe_ctl->retry_unclustered = true;
+- return -EAGAIN;
+- } else if (!offset) {
++ if (!offset)
+ return 1;
+- }
+ ffe_ctl->found_offset = offset;
+ return 0;
+ }
+@@ -3600,7 +3573,7 @@ static int do_allocation_clustered(struct btrfs_block_group *block_group,
+ /* We want to try and use the cluster allocator, so lets look there */
+ if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
+ ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
+- if (ret >= 0 || ret == -EAGAIN)
++ if (ret >= 0)
+ return ret;
+ /* ret == -ENOENT case falls through */
+ }
+@@ -3816,8 +3789,7 @@ static void release_block_group(struct btrfs_block_group *block_group,
+ {
+ switch (ffe_ctl->policy) {
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
+- ffe_ctl->retry_clustered = false;
+- ffe_ctl->retry_unclustered = false;
++ ffe_ctl->retry_uncached = false;
+ break;
+ case BTRFS_EXTENT_ALLOC_ZONED:
+ /* Nothing to do */
+@@ -4168,9 +4140,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
+ ffe_ctl->orig_have_caching_bg = false;
+ ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags);
+ ffe_ctl->loop = 0;
+- /* For clustered allocation */
+- ffe_ctl->retry_clustered = false;
+- ffe_ctl->retry_unclustered = false;
++ ffe_ctl->retry_uncached = false;
+ ffe_ctl->cached = 0;
+ ffe_ctl->max_extent_size = 0;
+ ffe_ctl->total_free_space = 0;
+@@ -4321,16 +4291,12 @@ static noinline int find_free_extent(struct btrfs_root *root,
+
+ bg_ret = NULL;
+ ret = do_allocation(block_group, ffe_ctl, &bg_ret);
+- if (ret == 0) {
+- if (bg_ret && bg_ret != block_group) {
+- btrfs_release_block_group(block_group,
+- ffe_ctl->delalloc);
+- block_group = bg_ret;
+- }
+- } else if (ret == -EAGAIN) {
+- goto have_block_group;
+- } else if (ret > 0) {
++ if (ret > 0)
+ goto loop;
++
++ if (bg_ret && bg_ret != block_group) {
++ btrfs_release_block_group(block_group, ffe_ctl->delalloc);
++ block_group = bg_ret;
+ }
+
+ /* Checks */
+@@ -4371,6 +4337,15 @@ static noinline int find_free_extent(struct btrfs_root *root,
+ btrfs_release_block_group(block_group, ffe_ctl->delalloc);
+ break;
+ loop:
++ if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
++ !ffe_ctl->retry_uncached) {
++ ffe_ctl->retry_uncached = true;
++ btrfs_wait_block_group_cache_progress(block_group,
++ ffe_ctl->num_bytes +
++ ffe_ctl->empty_cluster +
++ ffe_ctl->empty_size);
++ goto have_block_group;
++ }
+ release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc);
+ cond_resched();
+ }
+diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
+index 429d5c5700618..6bfba2f22fdd4 100644
+--- a/fs/btrfs/extent-tree.h
++++ b/fs/btrfs/extent-tree.h
+@@ -48,16 +48,11 @@ struct find_free_extent_ctl {
+ int loop;
+
+ /*
+- * Whether we're refilling a cluster, if true we need to re-search
+- * current block group but don't try to refill the cluster again.
++ * Set to true if we're retrying the allocation on this block group
++ * after waiting for caching progress; this is so that we retry only
++ * once before moving on to another block group.
+ */
+- bool retry_clustered;
+-
+- /*
+- * Whether we're updating free space cache, if true we need to re-search
+- * current block group but don't try updating free space cache again.
+- */
+- bool retry_unclustered;
++ bool retry_uncached;
+
+ /* If current block group is cached */
+ int cached;
+--
+2.41.0
+