summarylogtreecommitdiffstats
path: root/0067-btrfs-fix-lockdep-splat-with-reloc-root-extent-buffe.patch
diff options
context:
space:
mode:
Diffstat (limited to '0067-btrfs-fix-lockdep-splat-with-reloc-root-extent-buffe.patch')
-rw-r--r--0067-btrfs-fix-lockdep-splat-with-reloc-root-extent-buffe.patch365
1 files changed, 365 insertions, 0 deletions
diff --git a/0067-btrfs-fix-lockdep-splat-with-reloc-root-extent-buffe.patch b/0067-btrfs-fix-lockdep-splat-with-reloc-root-extent-buffe.patch
new file mode 100644
index 000000000000..51f5185f8596
--- /dev/null
+++ b/0067-btrfs-fix-lockdep-splat-with-reloc-root-extent-buffe.patch
@@ -0,0 +1,365 @@
+From b83e1567af28055b9a0bb7b10fcdd58b02d00954 Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Tue, 26 Jul 2022 16:24:04 -0400
+Subject: [PATCH 67/73] btrfs: fix lockdep splat with reloc root extent buffers
+
+[ Upstream commit b40130b23ca4a08c5785d5a3559805916bddba3c ]
+
+We have been hitting the following lockdep splat with btrfs/187 recently
+
+ WARNING: possible circular locking dependency detected
+ 5.19.0-rc8+ #775 Not tainted
+ ------------------------------------------------------
+ btrfs/752500 is trying to acquire lock:
+ ffff97e1875a97b8 (btrfs-treloc-02#2){+.+.}-{3:3}, at: __btrfs_tree_lock+0x24/0x110
+
+ but task is already holding lock:
+ ffff97e1875a9278 (btrfs-tree-01/1){+.+.}-{3:3}, at: __btrfs_tree_lock+0x24/0x110
+
+ which lock already depends on the new lock.
+
+ the existing dependency chain (in reverse order) is:
+
+ -> #2 (btrfs-tree-01/1){+.+.}-{3:3}:
+ down_write_nested+0x41/0x80
+ __btrfs_tree_lock+0x24/0x110
+ btrfs_init_new_buffer+0x7d/0x2c0
+ btrfs_alloc_tree_block+0x120/0x3b0
+ __btrfs_cow_block+0x136/0x600
+ btrfs_cow_block+0x10b/0x230
+ btrfs_search_slot+0x53b/0xb70
+ btrfs_lookup_inode+0x2a/0xa0
+ __btrfs_update_delayed_inode+0x5f/0x280
+ btrfs_async_run_delayed_root+0x24c/0x290
+ btrfs_work_helper+0xf2/0x3e0
+ process_one_work+0x271/0x590
+ worker_thread+0x52/0x3b0
+ kthread+0xf0/0x120
+ ret_from_fork+0x1f/0x30
+
+ -> #1 (btrfs-tree-01){++++}-{3:3}:
+ down_write_nested+0x41/0x80
+ __btrfs_tree_lock+0x24/0x110
+ btrfs_search_slot+0x3c3/0xb70
+ do_relocation+0x10c/0x6b0
+ relocate_tree_blocks+0x317/0x6d0
+ relocate_block_group+0x1f1/0x560
+ btrfs_relocate_block_group+0x23e/0x400
+ btrfs_relocate_chunk+0x4c/0x140
+ btrfs_balance+0x755/0xe40
+ btrfs_ioctl+0x1ea2/0x2c90
+ __x64_sys_ioctl+0x88/0xc0
+ do_syscall_64+0x38/0x90
+ entry_SYSCALL_64_after_hwframe+0x63/0xcd
+
+ -> #0 (btrfs-treloc-02#2){+.+.}-{3:3}:
+ __lock_acquire+0x1122/0x1e10
+ lock_acquire+0xc2/0x2d0
+ down_write_nested+0x41/0x80
+ __btrfs_tree_lock+0x24/0x110
+ btrfs_lock_root_node+0x31/0x50
+ btrfs_search_slot+0x1cb/0xb70
+ replace_path+0x541/0x9f0
+ merge_reloc_root+0x1d6/0x610
+ merge_reloc_roots+0xe2/0x260
+ relocate_block_group+0x2c8/0x560
+ btrfs_relocate_block_group+0x23e/0x400
+ btrfs_relocate_chunk+0x4c/0x140
+ btrfs_balance+0x755/0xe40
+ btrfs_ioctl+0x1ea2/0x2c90
+ __x64_sys_ioctl+0x88/0xc0
+ do_syscall_64+0x38/0x90
+ entry_SYSCALL_64_after_hwframe+0x63/0xcd
+
+ other info that might help us debug this:
+
+ Chain exists of:
+ btrfs-treloc-02#2 --> btrfs-tree-01 --> btrfs-tree-01/1
+
+ Possible unsafe locking scenario:
+
+ CPU0 CPU1
+ ---- ----
+ lock(btrfs-tree-01/1);
+ lock(btrfs-tree-01);
+ lock(btrfs-tree-01/1);
+ lock(btrfs-treloc-02#2);
+
+ *** DEADLOCK ***
+
+ 7 locks held by btrfs/752500:
+ #0: ffff97e292fdf460 (sb_writers#12){.+.+}-{0:0}, at: btrfs_ioctl+0x208/0x2c90
+ #1: ffff97e284c02050 (&fs_info->reclaim_bgs_lock){+.+.}-{3:3}, at: btrfs_balance+0x55f/0xe40
+ #2: ffff97e284c00878 (&fs_info->cleaner_mutex){+.+.}-{3:3}, at: btrfs_relocate_block_group+0x236/0x400
+ #3: ffff97e292fdf650 (sb_internal#2){.+.+}-{0:0}, at: merge_reloc_root+0xef/0x610
+ #4: ffff97e284c02378 (btrfs_trans_num_writers){++++}-{0:0}, at: join_transaction+0x1a8/0x5a0
+ #5: ffff97e284c023a0 (btrfs_trans_num_extwriters){++++}-{0:0}, at: join_transaction+0x1a8/0x5a0
+ #6: ffff97e1875a9278 (btrfs-tree-01/1){+.+.}-{3:3}, at: __btrfs_tree_lock+0x24/0x110
+
+ stack backtrace:
+ CPU: 1 PID: 752500 Comm: btrfs Not tainted 5.19.0-rc8+ #775
+ Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014
+ Call Trace:
+
+ dump_stack_lvl+0x56/0x73
+ check_noncircular+0xd6/0x100
+ ? lock_is_held_type+0xe2/0x140
+ __lock_acquire+0x1122/0x1e10
+ lock_acquire+0xc2/0x2d0
+ ? __btrfs_tree_lock+0x24/0x110
+ down_write_nested+0x41/0x80
+ ? __btrfs_tree_lock+0x24/0x110
+ __btrfs_tree_lock+0x24/0x110
+ btrfs_lock_root_node+0x31/0x50
+ btrfs_search_slot+0x1cb/0xb70
+ ? lock_release+0x137/0x2d0
+ ? _raw_spin_unlock+0x29/0x50
+ ? release_extent_buffer+0x128/0x180
+ replace_path+0x541/0x9f0
+ merge_reloc_root+0x1d6/0x610
+ merge_reloc_roots+0xe2/0x260
+ relocate_block_group+0x2c8/0x560
+ btrfs_relocate_block_group+0x23e/0x400
+ btrfs_relocate_chunk+0x4c/0x140
+ btrfs_balance+0x755/0xe40
+ btrfs_ioctl+0x1ea2/0x2c90
+ ? lock_is_held_type+0xe2/0x140
+ ? lock_is_held_type+0xe2/0x140
+ ? __x64_sys_ioctl+0x88/0xc0
+ __x64_sys_ioctl+0x88/0xc0
+ do_syscall_64+0x38/0x90
+ entry_SYSCALL_64_after_hwframe+0x63/0xcd
+
+This isn't necessarily new, it's just tricky to hit in practice. There
+are two competing things going on here. With relocation we create a
+snapshot of every fs tree with a reloc tree. Any extent buffers that
+get initialized here are initialized with the reloc root lockdep key.
+However since it is a snapshot, any blocks that are currently in cache
+that originally belonged to the fs tree will have the normal tree
+lockdep key set. This creates the lock dependency of
+
+ reloc tree -> normal tree
+
+for the extent buffer locking during the first phase of the relocation
+as we walk down the reloc root to relocate blocks.
+
+However this is problematic because the final phase of the relocation is
+merging the reloc root into the original fs root. This involves
+searching down to any keys that exist in the original fs root and then
+swapping the relocated block and the original fs root block. We have to
+search down to the fs root first, and then go search the reloc root for
+the block we need to replace. This creates the dependency of
+
+ normal tree -> reloc tree
+
+which is why lockdep complains.
+
+Additionally even if we were to fix this particular mismatch with a
+different nesting for the merge case, we're still slotting in a block
+that has a owner of the reloc root objectid into a normal tree, so that
+block will have its lockdep key set to the tree reloc root, and create a
+lockdep splat later on when we wander into that block from the fs root.
+
+Unfortunately the only solution here is to make sure we do not set the
+lockdep key to the reloc tree lockdep key normally, and then reset any
+blocks we wander into from the reloc root when we're doing the merged.
+
+This solves the problem of having mixed tree reloc keys intermixed with
+normal tree keys, and then allows us to make sure in the merge case we
+maintain the lock order of
+
+ normal tree -> reloc tree
+
+We handle this by setting a bit on the reloc root when we do the search
+for the block we want to relocate, and any block we search into or COW
+at that point gets set to the reloc tree key. This works correctly
+because we only ever COW down to the parent node, so we aren't resetting
+the key for the block we're linking into the fs root.
+
+With this patch we no longer have the lockdep splat in btrfs/187.
+
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/ctree.c | 3 +++
+ fs/btrfs/ctree.h | 2 ++
+ fs/btrfs/extent-tree.c | 18 +++++++++++++++++-
+ fs/btrfs/extent_io.c | 11 ++++++++++-
+ fs/btrfs/locking.c | 11 +++++++++++
+ fs/btrfs/locking.h | 5 +++++
+ fs/btrfs/relocation.c | 2 ++
+ 7 files changed, 50 insertions(+), 2 deletions(-)
+
+diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
+index 6e556031a8f3..ebfa35fe1c38 100644
+--- a/fs/btrfs/ctree.c
++++ b/fs/btrfs/ctree.c
+@@ -2075,6 +2075,9 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+
+ if (!p->skip_locking) {
+ level = btrfs_header_level(b);
++
++ btrfs_maybe_reset_lockdep_class(root, b);
++
+ if (level <= write_lock_level) {
+ btrfs_tree_lock(b);
+ p->locks[level] = BTRFS_WRITE_LOCK;
+diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
+index 7d3ca3ea0bce..4d8acd7e63eb 100644
+--- a/fs/btrfs/ctree.h
++++ b/fs/btrfs/ctree.h
+@@ -1146,6 +1146,8 @@ enum {
+ BTRFS_ROOT_ORPHAN_CLEANUP,
+ /* This root has a drop operation that was started previously. */
+ BTRFS_ROOT_UNFINISHED_DROP,
++ /* This reloc root needs to have its buffers lockdep class reset. */
++ BTRFS_ROOT_RESET_LOCKDEP_CLASS,
+ };
+
+ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
+diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
+index ced3fc76063f..92f3f5ed8bf1 100644
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -4871,6 +4871,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ {
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_buffer *buf;
++ u64 lockdep_owner = owner;
+
+ buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level);
+ if (IS_ERR(buf))
+@@ -4889,12 +4890,27 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ return ERR_PTR(-EUCLEAN);
+ }
+
++ /*
++ * The reloc trees are just snapshots, so we need them to appear to be
++ * just like any other fs tree WRT lockdep.
++ *
++ * The exception however is in replace_path() in relocation, where we
++ * hold the lock on the original fs root and then search for the reloc
++ * root. At that point we need to make sure any reloc root buffers are
++ * set to the BTRFS_TREE_RELOC_OBJECTID lockdep class in order to make
++ * lockdep happy.
++ */
++ if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID &&
++ !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
++ lockdep_owner = BTRFS_FS_TREE_OBJECTID;
++
+ /*
+ * This needs to stay, because we could allocate a freed block from an
+ * old tree into a new tree, so we need to make sure this new block is
+ * set to the appropriate level and owner.
+ */
+- btrfs_set_buffer_lockdep_class(owner, buf, level);
++ btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level);
++
+ __btrfs_tree_lock(buf, nest);
+ btrfs_clean_tree_block(buf);
+ clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
+diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
+index cda25018ebd7..5785ed241f6f 100644
+--- a/fs/btrfs/extent_io.c
++++ b/fs/btrfs/extent_io.c
+@@ -6228,6 +6228,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *exists = NULL;
+ struct page *p;
+ struct address_space *mapping = fs_info->btree_inode->i_mapping;
++ u64 lockdep_owner = owner_root;
+ int uptodate = 1;
+ int ret;
+
+@@ -6252,7 +6253,15 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
+ eb = __alloc_extent_buffer(fs_info, start, len);
+ if (!eb)
+ return ERR_PTR(-ENOMEM);
+- btrfs_set_buffer_lockdep_class(owner_root, eb, level);
++
++ /*
++ * The reloc trees are just snapshots, so we need them to appear to be
++ * just like any other fs tree WRT lockdep.
++ */
++ if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID)
++ lockdep_owner = BTRFS_FS_TREE_OBJECTID;
++
++ btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level);
+
+ num_pages = num_extent_pages(eb);
+ for (i = 0; i < num_pages; i++, index++) {
+diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
+index 5747c63929df..9063072b399b 100644
+--- a/fs/btrfs/locking.c
++++ b/fs/btrfs/locking.c
+@@ -91,6 +91,13 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int
+ lockdep_set_class_and_name(&eb->lock, &ks->keys[level], ks->names[level]);
+ }
+
++void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buffer *eb)
++{
++ if (test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
++ btrfs_set_buffer_lockdep_class(root->root_key.objectid,
++ eb, btrfs_header_level(eb));
++}
++
+ #endif
+
+ /*
+@@ -244,6 +251,8 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
+
+ while (1) {
+ eb = btrfs_root_node(root);
++
++ btrfs_maybe_reset_lockdep_class(root, eb);
+ btrfs_tree_lock(eb);
+ if (eb == root->node)
+ break;
+@@ -265,6 +274,8 @@ struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
+
+ while (1) {
+ eb = btrfs_root_node(root);
++
++ btrfs_maybe_reset_lockdep_class(root, eb);
+ btrfs_tree_read_lock(eb);
+ if (eb == root->node)
+ break;
+diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
+index b21372cab840..ab268be09bb5 100644
+--- a/fs/btrfs/locking.h
++++ b/fs/btrfs/locking.h
+@@ -133,11 +133,16 @@ void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock);
+
+ #ifdef CONFIG_DEBUG_LOCK_ALLOC
+ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int level);
++void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buffer *eb);
+ #else
+ static inline void btrfs_set_buffer_lockdep_class(u64 objectid,
+ struct extent_buffer *eb, int level)
+ {
+ }
++static inline void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root,
++ struct extent_buffer *eb)
++{
++}
+ #endif
+
+ #endif
+diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
+index 33411baf5c7a..45c02aba2492 100644
+--- a/fs/btrfs/relocation.c
++++ b/fs/btrfs/relocation.c
+@@ -1326,7 +1326,9 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
+ btrfs_release_path(path);
+
+ path->lowest_level = level;
++ set_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
+ ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
++ clear_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
+ path->lowest_level = 0;
+ if (ret) {
+ if (ret > 0)
+--
+2.37.3
+