author     Scott B  2022-02-10 22:57:20 -0800
committer  Scott B  2022-02-12 00:57:42 -0800
commit     44db0f40320d2895d9f2438145152e329fb6dfb1 (patch)
tree       a2eecf6c2d0248a9a18806f9b4136f586e754c3b
parent     248c3c289b71536ece4f14f7bf753f14ce637696 (diff)
download   aur-44db0f40320d2895d9f2438145152e329fb6dfb1.tar.gz
hotfix: resolve btrfs autodefrag high utilization
-rw-r--r--  .SRCINFO                              |    2
-rw-r--r--  PKGBUILD                              |    4
-rw-r--r--  btrfs-fix-autodefrag-on-5.16.9.patch  | 6417
3 files changed, 6423 insertions, 0 deletions
@@ -25,6 +25,7 @@ pkgbase = linux-xanmod-rog
 	source = Bluetooth-btintel-Fix-bdaddress-comparison-with-garb.patch
 	source = Bluetooth-Read-codec-capabilities-only-if-supported.patch
 	source = Bluetooth-fix-deadlock-for-RFCOMM-sk-state-change.patch
+	source = btrfs-fix-autodefrag-on-5.16.9.patch
 	source = Revert-XANMOD-fair-Remove-all-energy-efficiency-functions.patch
 	source = cpufreq-CPPC-Fix-performance-frequency-conversion.patch
 	source = udp-ipv6-optimisations-v2-net-next.patch
@@ -56,6 +57,7 @@ pkgbase = linux-xanmod-rog
 	sha256sums = 241f01f06849fcec462d72355ca3ab6bd34931731dec89876d785912ac532398
 	sha256sums = dd01bd3f774c3a9af42b6d89f534f39c4a5f200db32cd6d4b72a29325645100e
 	sha256sums = a9647897e59b04cb883dcf649b3108e9397d5a6c672bc545ea0c6bb7bb30d5a9
+	sha256sums = cd2795ab2c355eb0182cba2940712552ff46eee95b04abb41327c208f7f3e546
 	sha256sums = 3bb1cf422c64b4eea324b71048d0bdee04b5f9132136c6a4774e5205e45c46f1
 	sha256sums = 5c6c7778bc2d873657a885272956e232138b8b4935c3a3d6b11ef1619d344b20
 	sha256sums = 56f8f93a38ed7236c2504c79645a33123ee7bdf3c0cbb97dfd90600df06be7dd
@@ -114,6 +114,9 @@ source=("https://cdn.kernel.org/pub/linux/kernel/v${_branch}/linux-${_major}.tar
         "Bluetooth-Read-codec-capabilities-only-if-supported.patch"
         "Bluetooth-fix-deadlock-for-RFCOMM-sk-state-change.patch"
 
+        # hotfix: address btrfs autodefrag excessive utilization
+        "btrfs-fix-autodefrag-on-5.16.9.patch"
+
         # Revert Xanmod scheduler power efficiency removal
         "Revert-XANMOD-fair-Remove-all-energy-efficiency-functions.patch"
 
@@ -180,6 +183,7 @@ sha256sums=('027d7e8988bb69ac12ee92406c3be1fe13f990b1ca2249e226225cd1573308bb'
             '241f01f06849fcec462d72355ca3ab6bd34931731dec89876d785912ac532398'
             'dd01bd3f774c3a9af42b6d89f534f39c4a5f200db32cd6d4b72a29325645100e'
             'a9647897e59b04cb883dcf649b3108e9397d5a6c672bc545ea0c6bb7bb30d5a9'
+            'cd2795ab2c355eb0182cba2940712552ff46eee95b04abb41327c208f7f3e546'
             '3bb1cf422c64b4eea324b71048d0bdee04b5f9132136c6a4774e5205e45c46f1'
             '5c6c7778bc2d873657a885272956e232138b8b4935c3a3d6b11ef1619d344b20'
             '56f8f93a38ed7236c2504c79645a33123ee7bdf3c0cbb97dfd90600df06be7dd'
diff --git a/btrfs-fix-autodefrag-on-5.16.9.patch b/btrfs-fix-autodefrag-on-5.16.9.patch
new file mode 100644
index 000000000000..33053ea7b449
--- /dev/null
+++ b/btrfs-fix-autodefrag-on-5.16.9.patch
@@ -0,0 +1,6417 @@
+From 6c67e14b140aba83be3aee93961ade179dbc2473 Mon Sep 17 00:00:00 2001
+From: Scott B <arglebargle@arglebargle.dev>
+Date: Fri, 11 Feb 2022 23:52:12 -0800
+Subject: [PATCH] btrfs fix autodefrag on 5.16.9
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Squashed commit of the following:
+
+commit 7af5a9b695e62bdb82b55cb255c448e3af3ac587
+Author: Qu Wenruo <wqu@suse.com>
+Date:   Fri Feb 11 14:41:43 2022 +0800
+
+    btrfs: defrag: make btrfs_defrag_file() report an accurate number of defragged sectors
+
+    The previous rework of btrfs_defrag_file() could only report the
+    number of sectors from the first run of defrag_collect_targets().
+
+    This number is not accurate: if holes are punched after the first
+    defrag_collect_targets() call, we will not choose to defrag the holes.
+
+    Originally this was done to avoid passing @sectors_defragged to every
+    involved function.
+
+    But now that we have btrfs_defrag_ctrl there is no need for such
+    inaccurate accounting; just update btrfs_defrag_ctrl::sectors_defragged
+    after a successful defrag_one_locked_target() call.
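As a plain-C illustration of the accounting pattern this first commit describes (a single control struct whose counter is bumped only after a range is successfully defragged), consider the following sketch. All names are simplified stand-ins for the kernel structures, not the actual btrfs code:

    #include <stdint.h>

    /* Hypothetical, simplified analogue of btrfs_defrag_ctrl. */
    struct defrag_ctrl {
        uint64_t last_scanned;      /* exclusive end of the scanned range */
        uint64_t sectors_defragged; /* accurate running total */
    };

    static int defrag_one_locked_target(struct defrag_ctrl *ctrl,
                                        uint64_t len, uint32_t sectorsize)
    {
        /* ... lock the pages and mark the range for defrag here ... */

        /* Count sectors only after the range really was processed, so
         * punched holes and skipped ranges are never counted. */
        ctrl->sectors_defragged += len / sectorsize;
        return 0;
    }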
+
+    Signed-off-by: Qu Wenruo <wqu@suse.com>
+    Reviewed-by: Filipe Manana <fdmanana@suse.com>
+
+commit 7d6ad9ac62135f86c190f4ccf1ea1e8bb2e13480
+Author: Qu Wenruo <wqu@suse.com>
+Date:   Fri Feb 11 14:41:42 2022 +0800
+
+    btrfs: defrag: use btrfs_defrag_ctrl to replace btrfs_ioctl_defrag_range_args for btrfs_defrag_file()
+
+    This brings the following benefits:
+
+    - No more strange range->start updates to indicate the last scanned
+      bytenr. We have btrfs_defrag_ctrl::last_scanned (exclusive) for it
+      directly.
+
+    - No more return value to indicate defragged sectors. Now
+      btrfs_defrag_file() will just return 0 if no error happened, and
+      btrfs_defrag_ctrl::sectors_defragged will hold that value.
+
+    - Fewer parameters to carry around. Now most defrag_* functions only
+      need to fetch their policy parameters from btrfs_defrag_ctrl
+      directly.
+
+    Signed-off-by: Qu Wenruo <wqu@suse.com>
+
+commit b19878cde4728eeb3b5e017a6718ffd9e263c1a2
+Author: Qu Wenruo <wqu@suse.com>
+Date:   Fri Feb 11 14:41:41 2022 +0800
+
+    btrfs: defrag: introduce btrfs_defrag_ctrl structure for later usage
+
+    Currently btrfs_defrag_file() accepts not only
+    btrfs_ioctl_defrag_range_args but also other parameters like
+    @newer_than and @max_sectors_to_defrag for extra policies.
+
+    Those extra values are hidden from the defrag ioctl and even caused
+    bugs in the past due to different behaviors based on those extra
+    values.
+
+    Here we introduce a new structure, btrfs_defrag_ctrl, to include:
+
+    - all members of btrfs_ioctl_defrag_range_args
+
+    - @max_sectors_to_defrag and @newer_than
+
+    - extra values which callers of btrfs_defrag_file() may care about,
+      like @sectors_defragged and @last_scanned
+
+    With the new structure, also introduce a new helper,
+    btrfs_defrag_ioctl_args_to_ctrl(), to:
+
+    - do an extra sanity check on @compress and @flags
+
+    - do range alignment when possible
+
+    - set default values
+
+    Signed-off-by: Qu Wenruo <wqu@suse.com>
+    Reviewed-by: Filipe Manana <fdmanana@suse.com>
+
+commit a510f6c16dbfead2fcf0b04489d676d16851ba9e
+Author: Qu Wenruo <wqu@suse.com>
+Date:   Fri Feb 11 14:41:39 2022 +0800
+
+    btrfs: defrag: allow defrag_one_cluster() to skip large extents which are not targets
+
+    In the rework of btrfs_defrag_file(), we always call
+    defrag_one_cluster() and increase the offset by the cluster size,
+    which is only 256K.
+
+    But there are cases where we have a large extent (e.g. 128M) which
+    doesn't need to be defragged at all.
+
+    Before the refactor, we could directly skip the range, but now we
+    have to scan that extent map again and again until the cluster moves
+    past the non-target extent.
+
+    Fix the problem by allowing defrag_one_cluster() to increase
+    btrfs_defrag_ctrl::last_scanned to the end of an extent, if and only
+    if the last extent of the cluster is not a target.
+
+    The test script looks like this:
+
+      mkfs.btrfs -f $dev > /dev/null
+
+      mount $dev $mnt
+
+      # As the btrfs ioctl uses 32M as the extent_threshold
+      xfs_io -f -c "pwrite 0 64M" $mnt/file1
+      sync
+      # Some fragmented range to defrag
+      xfs_io -s -c "pwrite 65548k 4k" \
+             -c "pwrite 65544k 4k" \
+             -c "pwrite 65540k 4k" \
+             -c "pwrite 65536k 4k" \
+             $mnt/file1
+      sync
+
+      echo "=== before ==="
+      xfs_io -c "fiemap -v" $mnt/file1
+      echo "=== after ==="
+      btrfs fi defrag $mnt/file1
+      sync
+      xfs_io -c "fiemap -v" $mnt/file1
+      umount $mnt
+
+    With extra ftrace put into defrag_one_cluster(), before the patch it
+    would result in tons of loops:
+
+    (As defrag_one_cluster() is inlined, the function name is its caller)
+
+      btrfs-126062 [005] .....  4682.816026: btrfs_defrag_file: r/i=5/257 start=0 len=262144
+      btrfs-126062 [005] .....  4682.816027: btrfs_defrag_file: r/i=5/257 start=262144 len=262144
+      btrfs-126062 [005] .....  4682.816028: btrfs_defrag_file: r/i=5/257 start=524288 len=262144
+      btrfs-126062 [005] .....  4682.816028: btrfs_defrag_file: r/i=5/257 start=786432 len=262144
+      btrfs-126062 [005] .....  4682.816028: btrfs_defrag_file: r/i=5/257 start=1048576 len=262144
+      ...
+      btrfs-126062 [005] .....  4682.816043: btrfs_defrag_file: r/i=5/257 start=67108864 len=262144
+
+    But with this patch there is just one loop, which then jumps directly
+    to the end of the extent:
+
+      btrfs-130471 [014] .....  5434.029558: defrag_one_cluster: r/i=5/257 start=0 len=262144
+      btrfs-130471 [014] .....  5434.029559: defrag_one_cluster: r/i=5/257 start=67108864 len=16384
+
+    Cc: stable@vger.kernel.org # 5.16
+    Signed-off-by: Qu Wenruo <wqu@suse.com>
+    Reviewed-by: Filipe Manana <fdmanana@suse.com>
+
+commit 3f2d69fc4a7a4ce3f389b9e84fa3c830f6a8b5c5
+Author: Qu Wenruo <wqu@suse.com>
+Date:   Fri Feb 11 14:41:40 2022 +0800
+
+    btrfs: uapi: introduce BTRFS_DEFRAG_RANGE_MASK for later sanity check
+
+    And since we're here, replace the hardcoded bit flags (1, 2) with
+    (1UL << 0) and (1UL << 1), respectively.
+
+    Signed-off-by: Qu Wenruo <wqu@suse.com>
+    Reviewed-by: Filipe Manana <fdmanana@suse.com>
+
+commit b6c665523425451af94eb3f044d4474c81f94b1e
+Author: Qu Wenruo <wqu@suse.com>
+Date:   Fri Jan 28 15:21:22 2022 +0800
+
+    btrfs: defrag: remove an ambiguous condition for rejection
+
+    From the very beginning of btrfs defrag, there has been a check to
+    reject extents which meet both of these conditions:
+
+    - Physically adjacent
+
+      We may want to defrag physically adjacent extents to reduce the
+      number of extents or the size of the subvolume tree.
+
+    - Larger than 128K
+
+      This may be there for compressed extents, but unfortunately 128K
+      is exactly the max capacity for compressed extents, and since the
+      check is > 128K, it never rejects compressed extents.
+
+    Furthermore, the compressed extent capacity bug is fixed by the
+    previous patch, so there is no reason for that check anymore.
+
+    The original check had only a very small range to reject (the target
+    extent size is > 128K, and the default extent threshold is 256K),
+    and for compressed extents it didn't work at all.
+
+    So it's better to just remove the rejection and allow us to defrag
+    physically adjacent extents.
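For clarity, the removed rejection amounted to a check of roughly this shape. This is a hedged editorial reconstruction in plain C with hypothetical names, not the kernel source:

    #include <stdbool.h>
    #include <stdint.h>

    #define SZ_128K (128 * 1024)

    /* Old behavior: skip defragging when the next extent was physically
     * adjacent AND larger than 128K. Compressed extents are capped at
     * exactly 128K, so the "> 128K" test could never reject them. */
    static bool old_should_reject(uint64_t cur_phys_end,
                                  uint64_t next_phys_start,
                                  uint64_t next_len)
    {
        return next_phys_start == cur_phys_end && next_len > SZ_128K;
    }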
+
+    Reviewed-by: Filipe Manana <fdmanana@suse.com>
+    Signed-off-by: Qu Wenruo <wqu@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit cb53ba48a2b6c9126d128f301b4ed8085dcbce7b
+Author: Qu Wenruo <wqu@suse.com>
+Date:   Fri Jan 28 15:21:21 2022 +0800
+
+    btrfs: defrag: don't defrag extents which are already at their max capacity
+
+    [BUG]
+    For compressed extents, the defrag ioctl will always try to defrag
+    them, wasting not only IO but also CPU time on compression and
+    decompression:
+
+      mkfs.btrfs -f $DEV
+      mount -o compress $DEV $MNT
+      xfs_io -f -c "pwrite -S 0xab 0 128K" $MNT/foobar
+      sync
+      xfs_io -f -c "pwrite -S 0xcd 128K 128K" $MNT/foobar
+      sync
+      echo "=== before ==="
+      xfs_io -c "fiemap -v" $MNT/foobar
+      btrfs filesystem defrag $MNT/foobar
+      sync
+      echo "=== after ==="
+      xfs_io -c "fiemap -v" $MNT/foobar
+
+    This shows the two 128K extents just getting COWed for no extra
+    benefit, with extra IO/CPU spent:
+
+      === before ===
+      /mnt/btrfs/file1:
+       EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
+         0: [0..255]:        26624..26879       256   0x8
+         1: [256..511]:      26632..26887       256   0x9
+      === after ===
+      /mnt/btrfs/file1:
+       EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
+         0: [0..255]:        26640..26895       256   0x8
+         1: [256..511]:      26648..26903       256   0x9
+
+    This affects not only v5.16 (after the defrag rework), but also
+    v5.15 (before the defrag rework).
+
+    [CAUSE]
+    From the very beginning, btrfs defrag never checks if an extent is
+    already at its max capacity (128K for compressed extents, 128M
+    otherwise).
+
+    And the default extent size threshold is 256K, which is already
+    beyond the compressed extent max size.
+
+    This means that, by default, the btrfs defrag ioctl will mark every
+    compressed extent which is not adjacent to a hole/preallocated range
+    for defrag.
+
+    [FIX]
+    Introduce a helper to grab the maximum extent size, and then in
+    defrag_collect_targets() and defrag_check_next_extent() reject
+    extents which are already at their max capacity.
+
+    Reported-by: Filipe Manana <fdmanana@suse.com>
+    Reviewed-by: Filipe Manana <fdmanana@suse.com>
+    Signed-off-by: Qu Wenruo <wqu@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit e5cf566d32d6f7b244d87ab0c0797b43d54b4c37
+Author: Qu Wenruo <wqu@suse.com>
+Date:   Fri Jan 28 15:21:20 2022 +0800
+
+    btrfs: defrag: don't try to merge regular extents with preallocated extents
+
+    [BUG]
+    Older kernels (before v5.16) will defrag preallocated extents. Newer
+    kernels (v5.16 and later) will not defrag preallocated extents, but
+    they will defrag the extent just before the preallocated extent,
+    even if it's just a single sector.
+
+    This can be exposed by the following small script:
+
+      mkfs.btrfs -f $dev > /dev/null
+
+      mount $dev $mnt
+      xfs_io -f -c "pwrite 0 4k" -c sync -c "falloc 4k 16K" $mnt/file
+      xfs_io -c "fiemap -v" $mnt/file
+      btrfs fi defrag $mnt/file
+      sync
+      xfs_io -c "fiemap -v" $mnt/file
+
+    The output looks like this on older kernels:
+
+      /mnt/btrfs/file:
+       EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
+         0: [0..7]:          26624..26631         8   0x0
+         1: [8..39]:         26632..26663        32 0x801
+      /mnt/btrfs/file:
+       EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
+         0: [0..39]:         26664..26703        40   0x1
+
+    This defrags the single sector along with the preallocated extent
+    and replaces them with a regular extent in a new location (caused by
+    data COW), wasting most of the data IO just for the preallocated
+    range.
+
+    On the other hand, v5.16 is slightly better:
+
+      /mnt/btrfs/file:
+       EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
+         0: [0..7]:          26624..26631         8   0x0
+         1: [8..39]:         26632..26663        32 0x801
+      /mnt/btrfs/file:
+       EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
+         0: [0..7]:          26664..26671         8   0x0
+         1: [8..39]:         26632..26663        32 0x801
+
+    The preallocated range is not defragged, but the sector before it
+    still gets defragged, needlessly.
+
+    [CAUSE]
+    One of the functions reused by both the old and new behavior is
+    defrag_check_next_extent(); it determines whether we should defrag
+    the current extent by checking the next one.
+
+    It only checks if the next extent is a hole or inlined, but it
+    doesn't check if it's preallocated.
+
+    Outside of that function, on the other hand, both old and new
+    kernels reject preallocated extents.
+
+    Such inconsistent behavior causes the bug above.
+
+    [FIX]
+    - Also check if the next extent is preallocated; if so, don't defrag
+      the current extent.
+
+    - Add comments to each branch explaining why we reject the extent.
+
+    This will reduce the IO caused by the defrag ioctl and autodefrag.
+
+    Reviewed-by: Filipe Manana <fdmanana@suse.com>
+    Signed-off-by: Qu Wenruo <wqu@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit f9649fa5a78f6e27a8bb4ec026efe3b4c1d64bc8
+Author: Sidong Yang <realwakka@gmail.com>
+Date:   Sun Feb 6 12:52:48 2022 +0000
+
+    btrfs: qgroup: remove duplicated check in adding qgroup relations
+
+    Remove a duplicated check made when adding qgroup relations.
+    btrfs_add_qgroup_relation() adds relations by calling
+    add_relation_rb(), which checks that the member/parent id exists in
+    the current qgroup_tree. But that was already checked before calling
+    the function, so there is no need to check it twice.
+
+    Add a new function, __add_relation_rb(), that adds relations given
+    the qgroup structures, and make the old function use it. This lets
+    btrfs_add_qgroup_relation() work without the double check by calling
+    the new function.
+
+    Signed-off-by: Sidong Yang <realwakka@gmail.com>
+    Reviewed-by: David Sterba <dsterba@suse.com>
+    [ add comments ]
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 843263d23fb3348d562a0e410d3e8e552e829ef3
+Author: Dāvis Mosāns <davispuh@gmail.com>
+Date:   Wed Feb 2 23:44:54 2022 +0200
+
+    btrfs: add lzo workspace buffer length constants
+
+    This makes the length checking more readable, and the constants are
+    used repeatedly.
+
+    Signed-off-by: Dāvis Mosāns <davispuh@gmail.com>
+    Reviewed-by: David Sterba <dsterba@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit b1794acfaaef72cc21c3ec3f92d63b1da0842f54
+Author: Dāvis Mosāns <davispuh@gmail.com>
+Date:   Wed Feb 2 23:44:55 2022 +0200
+
+    btrfs: prevent copying too big compressed lzo segment
+
+    The compressed length can be corrupted to be a lot larger than the
+    memory we have allocated for the buffer. This will cause the memcpy
+    in copy_compressed_segment() to write outside of the allocated
+    memory.
+
+    This mostly results in a stuck read syscall, but sometimes, when
+    using btrfs send, it can trigger a #GP:
+
+      kernel: general protection fault, probably for non-canonical address 0x841551d5c1000: 0000 [#1] PREEMPT SMP NOPTI
+      kernel: CPU: 17 PID: 264 Comm: kworker/u256:7 Tainted: P           OE     5.17.0-rc2-1 #12
+      kernel: Workqueue: btrfs-endio btrfs_work_helper [btrfs]
+      kernel: RIP: 0010:lzo_decompress_bio (./include/linux/fortify-string.h:225 fs/btrfs/lzo.c:322 fs/btrfs/lzo.c:394) btrfs
+      Code starting with the faulting instruction
+      ===========================================
+         0:* 48 8b 06              mov    (%rsi),%rax     <-- trapping instruction
+         3:  48 8d 79 08           lea    0x8(%rcx),%rdi
+         7:  48 83 e7 f8           and    $0xfffffffffffffff8,%rdi
+         b:  48 89 01              mov    %rax,(%rcx)
+         e:  44 89 f0              mov    %r14d,%eax
+        11:  48 8b 54 06 f8        mov    -0x8(%rsi,%rax,1),%rdx
+      kernel: RSP: 0018:ffffb110812efd50 EFLAGS: 00010212
+      kernel: RAX: 0000000000001000 RBX: 000000009ca264c8 RCX: ffff98996e6d8ff8
+      kernel: RDX: 0000000000000064 RSI: 000841551d5c1000 RDI: ffffffff9500435d
+      kernel: RBP: ffff989a3be856c0 R08: 0000000000000000 R09: 0000000000000000
+      kernel: R10: 0000000000000000 R11: 0000000000001000 R12: ffff98996e6d8000
+      kernel: R13: 0000000000000008 R14: 0000000000001000 R15: 000841551d5c1000
+      kernel: FS:  0000000000000000(0000) GS:ffff98a09d640000(0000) knlGS:0000000000000000
+      kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+      kernel: CR2: 00001e9f984d9ea8 CR3: 000000014971a000 CR4: 00000000003506e0
+      kernel: Call Trace:
+      kernel:  <TASK>
+      kernel:  end_compressed_bio_read (fs/btrfs/compression.c:104 fs/btrfs/compression.c:1363 fs/btrfs/compression.c:323) btrfs
+      kernel:  end_workqueue_fn (fs/btrfs/disk-io.c:1923) btrfs
+      kernel:  btrfs_work_helper (fs/btrfs/async-thread.c:326) btrfs
+      kernel:  process_one_work (./arch/x86/include/asm/jump_label.h:27 ./include/linux/jump_label.h:212 ./include/trace/events/workqueue.h:108 kernel/workqueue.c:2312)
+      kernel:  worker_thread (./include/linux/list.h:292 kernel/workqueue.c:2455)
+      kernel:  ? process_one_work (kernel/workqueue.c:2397)
+      kernel:  kthread (kernel/kthread.c:377)
+      kernel:  ? kthread_complete_and_exit (kernel/kthread.c:332)
+      kernel:  ret_from_fork (arch/x86/entry/entry_64.S:301)
+      kernel:  </TASK>
+
+    CC: stable@vger.kernel.org # 4.9+
+    Signed-off-by: Dāvis Mosāns <davispuh@gmail.com>
+    Reviewed-by: David Sterba <dsterba@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 09ad560a431c83f3741f1e545924c7fbb8957dd4
+Author: Dāvis Mosāns <davispuh@gmail.com>
+Date:   Sat Feb 5 20:48:23 2022 +0200
+
+    btrfs: send: log the error in case of IO failure
+
+    Currently, if we get an IO error while doing a send, we abort without
+    logging which file caused the issue. Log it to help with debugging.
+
+    CC: stable@vger.kernel.org # 4.9+
+    Signed-off-by: Dāvis Mosāns <davispuh@gmail.com>
+    Reviewed-by: David Sterba <dsterba@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 43b4cb906eef17f9a7ca8e660f3e9e44176082f6
+Author: Filipe Manana <fdmanana@suse.com>
+Date:   Wed Feb 2 15:26:09 2022 +0000
+
+    btrfs: get rid of warning on transaction commit when using flushoncommit
+
+    When using the flushoncommit mount option, during almost every
+    transaction commit we trigger a warning from
+    __writeback_inodes_sb_nr():
+
+      $ cat fs/fs-writeback.c:
+      (...)
+      static void __writeback_inodes_sb_nr(struct super_block *sb, ...
+      {
+      (...)
+              WARN_ON(!rwsem_is_locked(&sb->s_umount));
+      (...)
+      }
+      (...)
+
+    The trace produced in dmesg looks like the following:
+
+      [947.473890] WARNING: CPU: 5 PID: 930 at fs/fs-writeback.c:2610 __writeback_inodes_sb_nr+0x7e/0xb3
+      [947.481623] Modules linked in: nfsd nls_cp437 cifs asn1_decoder cifs_arc4 fscache cifs_md4 ipmi_ssif
+      [947.489571] CPU: 5 PID: 930 Comm: btrfs-transacti Not tainted 5.16.3-srb-asrock-00001-g36437ad63879 #186
+      [947.497969] RIP: 0010:__writeback_inodes_sb_nr+0x7e/0xb3
+      [947.502097] Code: 24 10 4c 89 44 24 18 c6 (...)
+      [947.519760] RSP: 0018:ffffc90000777e10 EFLAGS: 00010246
+      [947.523818] RAX: 0000000000000000 RBX: 0000000000963300 RCX: 0000000000000000
+      [947.529765] RDX: 0000000000000000 RSI: 000000000000fa51 RDI: ffffc90000777e50
+      [947.535740] RBP: ffff888101628a90 R08: ffff888100955800 R09: ffff888100956000
+      [947.541701] R10: 0000000000000002 R11: 0000000000000001 R12: ffff888100963488
+      [947.547645] R13: ffff888100963000 R14: ffff888112fb7200 R15: ffff888100963460
+      [947.553621] FS:  0000000000000000(0000) GS:ffff88841fd40000(0000) knlGS:0000000000000000
+      [947.560537] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+      [947.565122] CR2: 0000000008be50c4 CR3: 000000000220c000 CR4: 00000000001006e0
+      [947.571072] Call Trace:
+      [947.572354]  <TASK>
+      [947.573266]  btrfs_commit_transaction+0x1f1/0x998
+      [947.576785]  ? start_transaction+0x3ab/0x44e
+      [947.579867]  ? schedule_timeout+0x8a/0xdd
+      [947.582716]  transaction_kthread+0xe9/0x156
+      [947.585721]  ? btrfs_cleanup_transaction.isra.0+0x407/0x407
+      [947.590104]  kthread+0x131/0x139
+      [947.592168]  ? set_kthread_struct+0x32/0x32
+      [947.595174]  ret_from_fork+0x22/0x30
+      [947.597561]  </TASK>
+      [947.598553] ---[ end trace 644721052755541c ]---
+
+    This is because we started using writeback_inodes_sb() to flush
+    delalloc when committing a transaction (when using -o flushoncommit)
+    in order to avoid deadlocks with filesystem freeze operations. This
+    change was made by commit ce8ea7cc6eb313 ("btrfs: don't call
+    btrfs_start_delalloc_roots in flushoncommit"). After that change we
+    started producing that warning, and every now and then a user
+    reports it, since the warning happens too often, spams dmesg/syslog,
+    and the user is unsure whether it reflects a problem that might
+    compromise the filesystem's reliability.
+
+    We cannot just lock the sb->s_umount semaphore before calling
+    writeback_inodes_sb(), because that would at least deadlock with
+    filesystem freezing, since at fs/super.c:freeze_super()
+    sync_filesystem() is called while we are holding that semaphore in
+    write mode, and that can trigger a transaction commit, resulting in
+    a deadlock. It would also trigger the same type of deadlock in the
+    unmount path. Possibly, it could also introduce some other locking
+    dependencies that lockdep would report.
+
+    To fix this, call try_to_writeback_inodes_sb() instead of
+    writeback_inodes_sb(): it will try to read lock sb->s_umount and
+    then only call writeback_inodes_sb() if it was able to lock it. This
+    is fine because the only cases where it can't read lock sb->s_umount
+    are during a filesystem unmount or a filesystem freeze; in those
+    cases sb->s_umount is write locked and sync_filesystem() is called,
+    which calls writeback_inodes_sb(). In other words, in all cases
+    where we can't take a read lock on sb->s_umount, writeback is
+    already being triggered elsewhere.
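The try-lock idea at the heart of that fix can be sketched in userspace C, with a pthread rwlock standing in for sb->s_umount. This is an editorial illustration, not the kernel implementation:

    #include <pthread.h>

    static pthread_rwlock_t s_umount = PTHREAD_RWLOCK_INITIALIZER;

    static void writeback_inodes(void)
    {
        /* ... flush dirty inodes / delalloc here ... */
    }

    static void try_to_writeback_inodes(void)
    {
        /* Only flush if no freeze/unmount holds the lock in write
         * mode; otherwise sync_filesystem() will do the writeback. */
        if (pthread_rwlock_tryrdlock(&s_umount) == 0) {
            writeback_inodes();
            pthread_rwlock_unlock(&s_umount);
        }
    }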
+
+    An alternative would be to call btrfs_start_delalloc_roots() with a
+    number of pages different from LONG_MAX, for example matching the
+    number of delalloc bytes we currently have, in which case we would
+    end up starting all delalloc with filemap_fdatawrite_wbc() and not
+    with an async flush via filemap_flush(); that is only possible
+    after the rather recent commit e076ab2a2ca70a ("btrfs: shrink
+    delalloc pages instead of full inodes"). However, that creates a
+    whole new can of worms due to new lock dependencies, about which
+    lockdep complains, like for example:
+
+      [ 8948.247280] ======================================================
+      [ 8948.247823] WARNING: possible circular locking dependency detected
+      [ 8948.248353] 5.17.0-rc1-btrfs-next-111 #1 Not tainted
+      [ 8948.248786] ------------------------------------------------------
+      [ 8948.249320] kworker/u16:18/933570 is trying to acquire lock:
+      [ 8948.249812] ffff9b3de1591690 (sb_internal#2){.+.+}-{0:0}, at: find_free_extent+0x141e/0x1590 [btrfs]
+      [ 8948.250638]
+                     but task is already holding lock:
+      [ 8948.251140] ffff9b3e09c717d8 (&root->delalloc_mutex){+.+.}-{3:3}, at: start_delalloc_inodes+0x78/0x400 [btrfs]
+      [ 8948.252018]
+                     which lock already depends on the new lock.
+
+      [ 8948.252710]
+                     the existing dependency chain (in reverse order) is:
+      [ 8948.253343]
+                     -> #2 (&root->delalloc_mutex){+.+.}-{3:3}:
+      [ 8948.253950]        __mutex_lock+0x90/0x900
+      [ 8948.254354]        start_delalloc_inodes+0x78/0x400 [btrfs]
+      [ 8948.254859]        btrfs_start_delalloc_roots+0x194/0x2a0 [btrfs]
+      [ 8948.255408]        btrfs_commit_transaction+0x32f/0xc00 [btrfs]
+      [ 8948.255942]        btrfs_mksubvol+0x380/0x570 [btrfs]
+      [ 8948.256406]        btrfs_mksnapshot+0x81/0xb0 [btrfs]
+      [ 8948.256870]        __btrfs_ioctl_snap_create+0x17f/0x190 [btrfs]
+      [ 8948.257413]        btrfs_ioctl_snap_create_v2+0xbb/0x140 [btrfs]
+      [ 8948.257961]        btrfs_ioctl+0x1196/0x3630 [btrfs]
+      [ 8948.258418]        __x64_sys_ioctl+0x83/0xb0
+      [ 8948.258793]        do_syscall_64+0x3b/0xc0
+      [ 8948.259146]        entry_SYSCALL_64_after_hwframe+0x44/0xae
+      [ 8948.259709]
+                     -> #1 (&fs_info->delalloc_root_mutex){+.+.}-{3:3}:
+      [ 8948.260330]        __mutex_lock+0x90/0x900
+      [ 8948.260692]        btrfs_start_delalloc_roots+0x97/0x2a0 [btrfs]
+      [ 8948.261234]        btrfs_commit_transaction+0x32f/0xc00 [btrfs]
+      [ 8948.261766]        btrfs_set_free_space_cache_v1_active+0x38/0x60 [btrfs]
+      [ 8948.262379]        btrfs_start_pre_rw_mount+0x119/0x180 [btrfs]
+      [ 8948.262909]        open_ctree+0x1511/0x171e [btrfs]
+      [ 8948.263359]        btrfs_mount_root.cold+0x12/0xde [btrfs]
+      [ 8948.263863]        legacy_get_tree+0x30/0x50
+      [ 8948.264242]        vfs_get_tree+0x28/0xc0
+      [ 8948.264594]        vfs_kern_mount.part.0+0x71/0xb0
+      [ 8948.265017]        btrfs_mount+0x11d/0x3a0 [btrfs]
+      [ 8948.265462]        legacy_get_tree+0x30/0x50
+      [ 8948.265851]        vfs_get_tree+0x28/0xc0
+      [ 8948.266203]        path_mount+0x2d4/0xbe0
+      [ 8948.266554]        __x64_sys_mount+0x103/0x140
+      [ 8948.266940]        do_syscall_64+0x3b/0xc0
+      [ 8948.267300]        entry_SYSCALL_64_after_hwframe+0x44/0xae
+      [ 8948.267790]
+                     -> #0 (sb_internal#2){.+.+}-{0:0}:
+      [ 8948.268322]        __lock_acquire+0x12e8/0x2260
+      [ 8948.268733]        lock_acquire+0xd7/0x310
+      [ 8948.269092]        start_transaction+0x44c/0x6e0 [btrfs]
+      [ 8948.269591]        find_free_extent+0x141e/0x1590 [btrfs]
+      [ 8948.270087]        btrfs_reserve_extent+0x14b/0x280 [btrfs]
+      [ 8948.270588]        cow_file_range+0x17e/0x490 [btrfs]
+      [ 8948.271051]        btrfs_run_delalloc_range+0x345/0x7a0 [btrfs]
+      [ 8948.271586]        writepage_delalloc+0xb5/0x170 [btrfs]
+      [ 8948.272071]        __extent_writepage+0x156/0x3c0 [btrfs]
+      [ 8948.272579]        extent_write_cache_pages+0x263/0x460 [btrfs]
+      [ 8948.273113]        extent_writepages+0x76/0x130 [btrfs]
+      [ 8948.273573]        do_writepages+0xd2/0x1c0
+      [ 8948.273942]        filemap_fdatawrite_wbc+0x68/0x90
+      [ 8948.274371]        start_delalloc_inodes+0x17f/0x400 [btrfs]
+      [ 8948.274876]        btrfs_start_delalloc_roots+0x194/0x2a0 [btrfs]
+      [ 8948.275417]        flush_space+0x1f2/0x630 [btrfs]
+      [ 8948.275863]        btrfs_async_reclaim_data_space+0x108/0x1b0 [btrfs]
+      [ 8948.276438]        process_one_work+0x252/0x5a0
+      [ 8948.276829]        worker_thread+0x55/0x3b0
+      [ 8948.277189]        kthread+0xf2/0x120
+      [ 8948.277506]        ret_from_fork+0x22/0x30
+      [ 8948.277868]
+                     other info that might help us debug this:
+
+      [ 8948.278548] Chain exists of:
+                       sb_internal#2 --> &fs_info->delalloc_root_mutex --> &root->delalloc_mutex
+
+      [ 8948.279601]  Possible unsafe locking scenario:
+
+      [ 8948.280102]        CPU0                    CPU1
+      [ 8948.280508]        ----                    ----
+      [ 8948.280915]   lock(&root->delalloc_mutex);
+      [ 8948.281271]                                lock(&fs_info->delalloc_root_mutex);
+      [ 8948.281915]                                lock(&root->delalloc_mutex);
+      [ 8948.282487]   lock(sb_internal#2);
+      [ 8948.282800]
+                      *** DEADLOCK ***
+
+      [ 8948.283333] 4 locks held by kworker/u16:18/933570:
+      [ 8948.283750]  #0: ffff9b3dc00a9d48 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work+0x1d2/0x5a0
+      [ 8948.284609]  #1: ffffa90349dafe70 ((work_completion)(&fs_info->async_data_reclaim_work)){+.+.}-{0:0}, at: process_one_work+0x1d2/0x5a0
+      [ 8948.285637]  #2: ffff9b3e14db5040 (&fs_info->delalloc_root_mutex){+.+.}-{3:3}, at: btrfs_start_delalloc_roots+0x97/0x2a0 [btrfs]
+      [ 8948.286674]  #3: ffff9b3e09c717d8 (&root->delalloc_mutex){+.+.}-{3:3}, at: start_delalloc_inodes+0x78/0x400 [btrfs]
+      [ 8948.287596]
+                     stack backtrace:
+      [ 8948.287975] CPU: 3 PID: 933570 Comm: kworker/u16:18 Not tainted 5.17.0-rc1-btrfs-next-111 #1
+      [ 8948.288677] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
+      [ 8948.289649] Workqueue: events_unbound btrfs_async_reclaim_data_space [btrfs]
+      [ 8948.290298] Call Trace:
+      [ 8948.290517]  <TASK>
+      [ 8948.290700]  dump_stack_lvl+0x59/0x73
+      [ 8948.291026]  check_noncircular+0xf3/0x110
+      [ 8948.291375]  ? start_transaction+0x228/0x6e0 [btrfs]
+      [ 8948.291826]  __lock_acquire+0x12e8/0x2260
+      [ 8948.292241]  lock_acquire+0xd7/0x310
+      [ 8948.292714]  ? find_free_extent+0x141e/0x1590 [btrfs]
+      [ 8948.293241]  ? lock_is_held_type+0xea/0x140
+      [ 8948.293601]  start_transaction+0x44c/0x6e0 [btrfs]
+      [ 8948.294055]  ? find_free_extent+0x141e/0x1590 [btrfs]
+      [ 8948.294518]  find_free_extent+0x141e/0x1590 [btrfs]
+      [ 8948.294957]  ? _raw_spin_unlock+0x29/0x40
+      [ 8948.295312]  ? btrfs_get_alloc_profile+0x124/0x290 [btrfs]
+      [ 8948.295813]  btrfs_reserve_extent+0x14b/0x280 [btrfs]
+      [ 8948.296270]  cow_file_range+0x17e/0x490 [btrfs]
+      [ 8948.296691]  btrfs_run_delalloc_range+0x345/0x7a0 [btrfs]
+      [ 8948.297175]  ? find_lock_delalloc_range+0x247/0x270 [btrfs]
+      [ 8948.297678]  writepage_delalloc+0xb5/0x170 [btrfs]
+      [ 8948.298123]  __extent_writepage+0x156/0x3c0 [btrfs]
+      [ 8948.298570]  extent_write_cache_pages+0x263/0x460 [btrfs]
+      [ 8948.299061]  extent_writepages+0x76/0x130 [btrfs]
+      [ 8948.299495]  do_writepages+0xd2/0x1c0
+      [ 8948.299817]  ? sched_clock_cpu+0xd/0x110
+      [ 8948.300160]  ? lock_release+0x155/0x4a0
+      [ 8948.300494]  filemap_fdatawrite_wbc+0x68/0x90
+      [ 8948.300874]  ? do_raw_spin_unlock+0x4b/0xa0
+      [ 8948.301243]  start_delalloc_inodes+0x17f/0x400 [btrfs]
+      [ 8948.301706]  ? lock_release+0x155/0x4a0
+      [ 8948.302055]  btrfs_start_delalloc_roots+0x194/0x2a0 [btrfs]
+      [ 8948.302564]  flush_space+0x1f2/0x630 [btrfs]
+      [ 8948.302970]  btrfs_async_reclaim_data_space+0x108/0x1b0 [btrfs]
+      [ 8948.303510]  process_one_work+0x252/0x5a0
+      [ 8948.303860]  ? process_one_work+0x5a0/0x5a0
+      [ 8948.304221]  worker_thread+0x55/0x3b0
+      [ 8948.304543]  ? process_one_work+0x5a0/0x5a0
+      [ 8948.304904]  kthread+0xf2/0x120
+      [ 8948.305184]  ? kthread_complete_and_exit+0x20/0x20
+      [ 8948.305598]  ret_from_fork+0x22/0x30
+      [ 8948.305921]  </TASK>
+
+    It all comes from the fact that btrfs_start_delalloc_roots() takes
+    the delalloc_root_mutex while, in the transaction commit path, we
+    are holding a read lock on one of the superblock's freeze semaphores
+    (via sb_start_intwrite()). The async reclaim task can also call
+    btrfs_start_delalloc_roots(), which ends up triggering writeback
+    with calls to filemap_fdatawrite_wbc(), resulting in extent
+    allocation, which in turn can call btrfs_start_transaction(), which
+    takes the freeze semaphore via sb_start_intwrite(), forming a nasty
+    dependency on all those locks, which can be taken in different
+    orders by different code paths.
+
+    So just adopt the simple approach of calling
+    try_to_writeback_inodes_sb() at btrfs_start_delalloc_flush().
+
+    Link: https://lore.kernel.org/linux-btrfs/20220130005258.GA7465@cuci.nl/
+    Link: https://lore.kernel.org/linux-btrfs/43acc426-d683-d1b6-729d-c6bc4a2fff4d@gmail.com/
+    Link: https://lore.kernel.org/linux-btrfs/6833930a-08d7-6fbc-0141-eb9cdfd6bb4d@gmail.com/
+    Link: https://lore.kernel.org/linux-btrfs/20190322041731.GF16651@hungrycats.org/
+    Reviewed-by: Omar Sandoval <osandov@fb.com>
+    Signed-off-by: Filipe Manana <fdmanana@suse.com>
+    [ add more link reports ]
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit b805349fbdc9a47199d96bc193f64b9399ec6761
+Author: Qu Wenruo <wqu@suse.com>
+Date:   Tue Feb 8 14:54:05 2022 +0800
+
+    btrfs: defrag: don't try to defrag extents which are under writeback
+
+    Once we start writeback (have called btrfs_run_delalloc_range()), we
+    allocate an extent, create an extent map pointing to that extent
+    with a generation of (u64)-1, create the ordered extent, and then
+    clear the DELALLOC bit from the range in the inode's io tree.
+
+    Such an extent map can pass the first call of
+    defrag_collect_targets(), as its generation of (u64)-1 meets any
+    possible minimal generation check. And the range no longer has the
+    DELALLOC bit, so it also passes the DELALLOC bit check.
+
+    It will only be re-checked in the second call of
+    defrag_collect_targets(), which will wait for writeback.
+
+    But at that stage we have already spent our time waiting for some IO
+    we may or may not want to defrag.
+
+    Let's reject such extents early so we won't waste our time.
+
+    Reviewed-by: Filipe Manana <fdmanana@suse.com>
+    Signed-off-by: Qu Wenruo <wqu@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit e8a7717c71287a11dc81098199d7116d6a4b6006
+Author: Qu Wenruo <wqu@suse.com>
+Date:   Tue Feb 8 13:31:19 2022 +0800
+
+    btrfs: populate extent_map::generation when reading from disk
+
+    When btrfs_get_extent() tries to get some file extent from disk, it
+    never populates extent_map::generation, leaving the value at 0.
+
+    On the other hand, an extent map generated by IO will get its
+    generation properly set at finish_ordered_io():
+
+      finish_ordered_io()
+      |- unpin_extent_cache(gen = trans->transid)
+         |- em->generation = gen;
+
+    [CAUSE]
+    extent_map::generation is mostly used by the fsync code, which only
+    cares about modified extents, and those all have em::generation > 0.
+
+    Thus it's fine for fsync not to populate extent maps read from disk.
+
+    [CORNER CASE]
+    However autodefrag also relies on em::generation to determine if one
+    extent needs to be defragged.
+
+    This unpopulated extent_map::generation can prevent the following
+    autodefrag case from working:
+
+      mkfs.btrfs -f $dev
+      mount $dev $mnt -o autodefrag
+
+      # initial write to queue the inode for autodefrag
+      xfs_io -f -c "pwrite 0 4k" $mnt/file
+      sync
+
+      # Real fragmented write
+      xfs_io -f -s -c "pwrite -b 4096 0 32k" $mnt/file
+      sync
+      echo "=== before autodefrag ==="
+      xfs_io -c "fiemap -v" $mnt/file
+
+      # Drop cache to force the em to be read from disk
+      echo 3 > /proc/sys/vm/drop_caches
+      mount -o remount,commit=1 $mnt
+      sleep 3
+      sync
+
+      echo "=== After autodefrag ==="
+      xfs_io -c "fiemap -v" $mnt/file
+      umount $mnt
+
+    The result looks like this:
+
+      === before autodefrag ===
+      /mnt/btrfs/file:
+       EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
+         0: [0..15]:         26672..26687        16   0x0
+         1: [16..31]:        26656..26671        16   0x0
+         2: [32..47]:        26640..26655        16   0x0
+         3: [48..63]:        26624..26639        16   0x1
+      === After autodefrag ===
+      /mnt/btrfs/file:
+       EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
+         0: [0..15]:         26672..26687        16   0x0
+         1: [16..31]:        26656..26671        16   0x0
+         2: [32..47]:        26640..26655        16   0x0
+         3: [48..63]:        26624..26639        16   0x1
+
+    The fragmented 32K is not defragged by autodefrag.
+
+    [FIX]
+    To make things less weird, just populate extent_map::generation when
+    reading file extents from disk.
+
+    This makes the above fragmented extents get properly defragged:
+
+      === before autodefrag ===
+      /mnt/btrfs/file:
+       EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
+         0: [0..15]:         26672..26687        16   0x0
+         1: [16..31]:        26656..26671        16   0x0
+         2: [32..47]:        26640..26655        16   0x0
+         3: [48..63]:        26624..26639        16   0x1
+      === After autodefrag ===
+      /mnt/btrfs/file:
+       EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
+         0: [0..63]:         26688..26751        64   0x1
+
+    Reviewed-by: Filipe Manana <fdmanana@suse.com>
+    Signed-off-by: Qu Wenruo <wqu@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit df6d916f35305a85d6636256fbc9708a78df7465
+Author: Filipe Manana <fdmanana@suse.com>
+Date:   Thu Feb 3 14:55:50 2022 +0000
+
+    btrfs: prepare extents to be logged before locking a log tree path
+
+    When we want to log an extent, in the fast fsync path, we obtain a
+    path to the leaf that will hold the file extent item either through
+    a deletion search, via btrfs_drop_extents(), or through an insertion
+    search using btrfs_insert_empty_item(). After that we fill the file
+    extent item's fields one by one directly on the leaf.
+
+    Instead of doing that, we could prepare the file extent item before
+    obtaining a btree path, and then copy the prepared extent item with
+    a single operation once we get the path. This helps avoid some
+    contention on the log tree, since we are otherwise holding write
+    locks for longer than necessary, especially in the case where the
+    path is obtained via btrfs_drop_extents() through a deletion search,
+    which always keeps a write lock on the nodes at levels 1 and 2
+    (besides the leaf).
+
+    This change does that: we prepare the file extent item that is
+    going to be inserted before acquiring a path, and then copy it into
+    a leaf using a single copy operation once we get the path.
+
+    This change is part of a patchset comprised of the following
+    patches:
+
+      1/6 btrfs: remove unnecessary leaf free space checks when pushing items
+      2/6 btrfs: avoid unnecessary COW of leaves when deleting items from a leaf
+      3/6 btrfs: avoid unnecessary computation when deleting items from a leaf
+      4/6 btrfs: remove constraint on number of visited leaves when replacing extents
+      5/6 btrfs: remove useless path release in the fast fsync path
+      6/6 btrfs: prepare extents to be logged before locking a log tree path
+
+    The following test was run to measure the impact of the whole
+    patchset:
+
+      $ cat test.sh
+      #!/bin/bash
+
+      DEV=/dev/sdi
+      MNT=/mnt/sdi
+      MOUNT_OPTIONS="-o ssd"
+      MKFS_OPTIONS="-R free-space-tree -O no-holes"
+
+      NUM_JOBS=8
+      FILE_SIZE=128M
+      RUN_TIME=200
+
+      cat <<EOF > /tmp/fio-job.ini
+      [writers]
+      rw=randwrite
+      fsync=1
+      fallocate=none
+      group_reporting=1
+      direct=0
+      bssplit=4k/20:8k/20:16k/20:32k/10:64k/10:128k/5:256k/5:512k/5:1m/5
+      ioengine=sync
+      filesize=$FILE_SIZE
+      runtime=$RUN_TIME
+      time_based
+      directory=$MNT
+      numjobs=$NUM_JOBS
+      thread
+      EOF
+
+      echo "performance" | \
+          tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
+
+      echo
+      echo "Using config:"
+      echo
+      cat /tmp/fio-job.ini
+      echo
+
+      umount $MNT &> /dev/null
+      mkfs.btrfs -f $MKFS_OPTIONS $DEV
+      mount $MOUNT_OPTIONS $DEV $MNT
+
+      fio /tmp/fio-job.ini
+
+      umount $MNT
+
+    The test ran inside a VM (8 cores, 32G of RAM) with the target disk
+    mapping to a raw NVMe device, and using a non-debug kernel config
+    (Debian's default config).
+
+    Before the patchset:
+
+      WRITE: bw=116MiB/s (122MB/s), 116MiB/s-116MiB/s (122MB/s-122MB/s), io=22.7GiB (24.4GB), run=200013-200013msec
+
+    After the patchset:
+
+      WRITE: bw=125MiB/s (131MB/s), 125MiB/s-125MiB/s (131MB/s-131MB/s), io=24.3GiB (26.1GB), run=200007-200007msec
+
+    That is a 7.8% gain in throughput and 7.0% more IO done in the same
+    period of time (200 seconds).
+
+    Signed-off-by: Filipe Manana <fdmanana@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 901ebe7172fa1fd03e4cc43d9d5f6a191d2e6428
+Author: Filipe Manana <fdmanana@suse.com>
+Date:   Thu Feb 3 14:55:49 2022 +0000
+
+    btrfs: remove useless path release in the fast fsync path
+
+    There's no point in calling btrfs_release_path() after finishing the
+    loop that logs the modified extents, since log_one_extent() returns
+    with the path released. In case the list of extents is empty, the
+    path is already released, so there's no need for it in that case
+    either. So just remove that unnecessary btrfs_release_path() call.
+
+    This change is part of a patchset comprised of the following
+    patches:
+
+      1/6 btrfs: remove unnecessary leaf free space checks when pushing items
+      2/6 btrfs: avoid unnecessary COW of leaves when deleting items from a leaf
+      3/6 btrfs: avoid unnecessary computation when deleting items from a leaf
+      4/6 btrfs: remove constraint on number of visited leaves when replacing extents
+      5/6 btrfs: remove useless path release in the fast fsync path
+      6/6 btrfs: prepare extents to be logged before locking a log tree path
+
+    The last patch in the series has some performance test results in
+    its changelog.
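The prepare-before-locking idea from patch 6/6 above boils down to shrinking the critical section to a single copy. A minimal userspace sketch of that pattern (hypothetical names and types, not the btrfs code):

    #include <pthread.h>
    #include <stdint.h>
    #include <string.h>

    struct file_extent_item {      /* simplified stand-in */
        uint64_t disk_bytenr, disk_num_bytes, offset, num_bytes;
    };

    static pthread_mutex_t leaf_lock = PTHREAD_MUTEX_INITIALIZER;

    static void log_extent(void *leaf_dest,
                           const struct file_extent_item *prepared)
    {
        /* The item was fully filled in before taking the lock, so the
         * critical section is one memcpy instead of many field writes
         * performed while holding btree locks. */
        pthread_mutex_lock(&leaf_lock);
        memcpy(leaf_dest, prepared, sizeof(*prepared));
        pthread_mutex_unlock(&leaf_lock);
    }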
+
+    Signed-off-by: Filipe Manana <fdmanana@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 12d0362b8dad82e240e37aa43f7d344f7206c009
+Author: Filipe Manana <fdmanana@suse.com>
+Date:   Thu Feb 3 14:55:48 2022 +0000
+
+    btrfs: remove constraint on number of visited leaves when replacing extents
+
+    At btrfs_drop_extents(), we try to replace a range of file extent
+    items with a new file extent in a single btree search, to avoid the
+    need to do a search for deletion, followed by a path release,
+    followed by yet another search for insertion.
+
+    When I originally added that optimization, in commit 1acae57b161ef1
+    ("Btrfs: faster file extent item replace operations"), I left a
+    constraint to do the fast replace only if we visited a single leaf.
+    That was because in the most common case we find all file extent
+    items that need to be deleted (or trimmed) in a single leaf.
+    However, it can also work for other common cases, like when we need
+    to delete a few file extent items located at the end of a leaf and
+    a few more located at the beginning of the next leaf. The key for
+    the new file extent item is greater than the key of any deleted or
+    trimmed file extent item from previous leaves, so we are fine to use
+    the last leaf that we found as long as we are holding a write lock
+    on it; even if the new key ends up at slot 0, the btree search has
+    obtained a write lock on any upper nodes that need to have a key
+    pointer updated.
+
+    So remove the constraint that limits the optimization to the case
+    where we visited only a single leaf.
+
+    This change is part of a patchset comprised of the following
+    patches:
+
+      1/6 btrfs: remove unnecessary leaf free space checks when pushing items
+      2/6 btrfs: avoid unnecessary COW of leaves when deleting items from a leaf
+      3/6 btrfs: avoid unnecessary computation when deleting items from a leaf
+      4/6 btrfs: remove constraint on number of visited leaves when replacing extents
+      5/6 btrfs: remove useless path release in the fast fsync path
+      6/6 btrfs: prepare extents to be logged before locking a log tree path
+
+    The last patch in the series has some performance test results in
+    its changelog.
+
+    Signed-off-by: Filipe Manana <fdmanana@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 967aa565ee63760ea7f1c00be743f8e24ee83aa6
+Author: Filipe Manana <fdmanana@suse.com>
+Date:   Thu Feb 3 14:55:47 2022 +0000
+
+    btrfs: avoid unnecessary computation when deleting items from a leaf
+
+    When deleting items from a leaf, we always compute the sum of the
+    data sizes of the items that are going to be deleted. However we
+    only use that sum when the last item to delete is behind the last
+    item in the leaf. This unnecessarily wastes CPU time when we are
+    deleting either the whole leaf or from some slot > 0 up to the last
+    item in the leaf, and both of these cases are common (e.g. a
+    truncation operation, either as a result of truncate(2) or when
+    logging inodes, deleting checksums after removing a large enough
+    extent, etc).
+
+    So compute the sum of the data sizes only if the last item to be
+    deleted does not match the last item in the leaf.
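The optimization in that last commit is essentially a guard around the summation. A simplified C sketch with a hypothetical leaf layout, not the kernel structures:

    #include <stdint.h>

    struct leaf {
        uint32_t nritems;
        uint32_t item_size[256];
    };

    /* Sum the data sizes of items [slot, slot + nr) only when some item
     * data actually has to be moved, i.e. when the deletion does not
     * already end at the last item of the leaf. */
    static uint32_t del_data_size(const struct leaf *l, uint32_t slot,
                                  uint32_t nr)
    {
        uint32_t sum = 0;

        if (slot + nr == l->nritems)
            return 0; /* nothing after the deleted range: sum unused */
        for (uint32_t i = slot; i < slot + nr; i++)
            sum += l->item_size[i];
        return sum;
    }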
+
+    This change is part of a patchset comprised of the following
+    patches:
+
+      1/6 btrfs: remove unnecessary leaf free space checks when pushing items
+      2/6 btrfs: avoid unnecessary COW of leaves when deleting items from a leaf
+      3/6 btrfs: avoid unnecessary computation when deleting items from a leaf
+      4/6 btrfs: remove constraint on number of visited leaves when replacing extents
+      5/6 btrfs: remove useless path release in the fast fsync path
+      6/6 btrfs: prepare extents to be logged before locking a log tree path
+
+    The last patch in the series has some performance test results in
+    its changelog.
+
+    Signed-off-by: Filipe Manana <fdmanana@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 4fd3e4a94b70dfab57ce617a2c8196a77e8cc29d
+Author: Filipe Manana <fdmanana@suse.com>
+Date:   Thu Feb 3 14:55:46 2022 +0000
+
+    btrfs: avoid unnecessary COW of leaves when deleting items from a leaf
+
+    When we delete items from a leaf, if we end up with more than two
+    thirds of unused leaf space, we try to delete the leaf by moving all
+    its items into its left and right neighbour leaves. Sometimes that
+    is not possible because there is not enough free space in the left
+    and right leaves, and in that case we end up not deleting our leaf.
+
+    The way we are doing this is not ideal and can be improved in the
+    following ways:
+
+    1) When we call push_leaf_left(), we pass a value of 1 byte to its
+       data size parameter. This is not a realistic value because no
+       item can have a size less than 25 bytes, which is the size of
+       struct btrfs_item. This means that if the left leaf does not
+       have enough free space to push even one item, we end up COWing
+       it even though we do not change its content at all.
+
+       COWing that leaf means allocating a new metadata extent, marking
+       it dirty and doing more IO when committing a transaction or when
+       syncing a log tree. For the log tree case it's particularly
+       important to avoid the useless COW operation, as more IO can
+       imply higher latency for an fsync operation.
+
+       So instead of passing 1 as the minimum data size for
+       push_leaf_left(), pass the size of the first item in our leaf,
+       as we don't want to COW the left leaf if we can't push at least
+       the first item of our leaf;
+
+    2) When we call push_leaf_right(), we also pass a value of 1 byte
+       as the data size parameter. Like the previous case, this will
+       result in COWing the right leaf even if we are not able to move
+       any items into it, since there can't be any item with a size
+       smaller than 25 bytes (the size of struct btrfs_item).
+
+       So instead of passing 1 as the minimum data size to
+       push_leaf_right(), pass a size that corresponds to the sum of
+       the sizes of all the remaining items in our leaf. We are not
+       interested in moving less than that, because if we do, we are
+       not able to delete our leaf and we have COWed the right leaf
+       for nothing. Plus, moving only some of the items of our leaf
+       means an even less balanced tree.
+
+       Just like the previous case, we want to avoid the useless COW
+       of the right leaf; this way we don't have to spend time
+       allocating a new metadata extent and doing more IO when
+       committing a transaction or syncing a log tree. For the log
+       tree case it's especially important, because more IO can result
+       in higher latency for an fsync operation.
+
+    So adjust the minimum data size passed to push_leaf_left() and
+    push_leaf_right() as mentioned above.
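Those two adjusted minimums can be expressed compactly. A hedged C sketch with hypothetical types (the real push_leaf_left()/push_leaf_right() receive these values through their data size parameter):

    #include <stdint.h>

    struct leaf {
        uint32_t nritems;
        uint32_t item_size[256];
    };

    /* Pushing left is pointless unless at least our first item fits. */
    static uint32_t min_push_left(const struct leaf *l)
    {
        return l->item_size[0];
    }

    /* To delete the leaf we must move *all* remaining items right, so
     * anything less than the full sum is not worth a COW. */
    static uint32_t min_push_right(const struct leaf *l, uint32_t from)
    {
        uint32_t sum = 0;

        for (uint32_t i = from; i < l->nritems; i++)
            sum += l->item_size[i];
        return sum;
    }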
+
+    This change is part of a patchset comprised of the following
+    patches:
+
+      1/6 btrfs: remove unnecessary leaf free space checks when pushing items
+      2/6 btrfs: avoid unnecessary COW of leaves when deleting items from a leaf
+      3/6 btrfs: avoid unnecessary computation when deleting items from a leaf
+      4/6 btrfs: remove constraint on number of visited leaves when replacing extents
+      5/6 btrfs: remove useless path release in the fast fsync path
+      6/6 btrfs: prepare extents to be logged before locking a log tree path
+
+    Not being able to delete a leaf that became less than 1/3 full after
+    deleting items from it is actually common. For example, for the fio
+    test mentioned in the changelog of patch 6/6, we are only able to
+    delete a leaf at btrfs_del_items() about 5.3% of the time, due to
+    its left and right neighbour leaves not having enough free space to
+    push all the remaining items into them.
+
+    The last patch in the series has some performance test results in
+    its changelog.
+
+    Signed-off-by: Filipe Manana <fdmanana@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 1b956d9978687f9d463a39c0d66a5eab958b9f3a
+Author: Filipe Manana <fdmanana@suse.com>
+Date:   Thu Feb 3 14:55:45 2022 +0000
+
+    btrfs: remove unnecessary leaf free space checks when pushing items
+
+    When trying to push items from a leaf into its left and right
+    neighbours, we lock the left or right leaf, check if it has the
+    required minimum free space, COW the leaf and then check again if
+    it has the minimum required free space. This second check is
+    pointless:
+
+    1) First and foremost, it's not needed. We have a write lock on the
+       leaf and on its parent node, so no one can come in and change
+       either the pre-COW or post-COW version of the leaf for the whole
+       duration of the push_leaf_left() and push_leaf_right() calls;
+
+    2) The call to btrfs_leaf_free_space() is not trivial: it has a
+       fair amount of arithmetic and accesses fields in the leaf's
+       header and items, so it's not very cheap.
+
+    So remove the duplicated free space checks.
+
+    This change is part of a patchset comprised of the following
+    patches:
+
+      1/6 btrfs: remove unnecessary leaf free space checks when pushing items
+      2/6 btrfs: avoid unnecessary COW of leaves when deleting items from a leaf
+      3/6 btrfs: avoid unnecessary computation when deleting items from a leaf
+      4/6 btrfs: remove constraint on number of visited leaves when replacing extents
+      5/6 btrfs: remove useless path release in the fast fsync path
+      6/6 btrfs: prepare extents to be logged before locking a log tree path
+
+    The last patch in the series has some performance test results in
+    its changelog.
+
+    Signed-off-by: Filipe Manana <fdmanana@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 784a4d85814dea9bf1096e864d9368d032c0112a
+Author: David Sterba <dsterba@suse.com>
+Date:   Tue Feb 1 15:42:07 2022 +0100
+
+    btrfs: replace BUILD_BUG_ON by static_assert
+
+    The static_assert introduced in 6bab69c65013 ("build_bug.h: add
+    wrapper for _Static_assert") has been supported by compilers for a
+    long time (gcc 4.6, clang 3.0) and can be used in header files. We
+    don't need to put BUILD_BUG_ON into random functions; the assertion
+    can instead be kept next to the definition it guards.
+
+    The exception here is the UAPI header btrfs_tree.h, which could
+    potentially be included by userspace code where the static assert
+    is not defined (nor is it used in any other header).
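As a small self-contained example of the pattern this commit adopts: C11's static_assert (the _Static_assert keyword) can sit at file scope right next to the definition it guards, which is exactly what BUILD_BUG_ON inside a function body cannot do. Illustrative only; the struct below is made up:

    #include <assert.h>   /* C11 static_assert */
    #include <stdint.h>

    struct on_disk_key {          /* hypothetical on-disk structure */
        uint64_t objectid;
        uint8_t  type;
        uint64_t offset;
    } __attribute__((packed));

    /* Checked at compile time, right next to the definition. */
    static_assert(sizeof(struct on_disk_key) == 17,
                  "on-disk key must be exactly 17 bytes");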
+
+    Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 2e0a36dc72d2a11fad03763dbd2ac4da106cfd1f
+Author: Qu Wenruo <wqu@suse.com>
+Date:   Sun Jan 30 20:53:15 2022 +0800
+
+    btrfs: don't hold the CPU for too long when defragging a file
+
+    There is a user report about "btrfs filesystem defrag" causing a
+    120s timeout problem.
+
+    btrfs_defrag_file() will iterate over all file extents when called
+    from the defrag ioctl, so it can take a long time.
+
+    There is no reason not to release the CPU during such a long
+    operation.
+
+    Add cond_resched() after defragging one cluster.
+
+    CC: stable@vger.kernel.org # 5.16
+    Link: https://lore.kernel.org/linux-btrfs/10e51417-2203-f0a4-2021-86c8511cc367@gmx.com
+    Signed-off-by: Qu Wenruo <wqu@suse.com>
+    Reviewed-by: David Sterba <dsterba@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit cb792362499dacbdd3986d10ad109d0efd875eab
+Author: Josef Bacik <josef@toxicpanda.com>
+Date:   Fri Nov 5 16:45:28 2021 -0400
+
+    btrfs: rework async transaction committing
+
+    Currently we do this awful thing where we get another ref on a trans
+    handle, async off that handle and commit the transaction from that
+    work. Because we do this we have to mess with current->journal_info
+    and the freeze counting stuff.
+
+    We already have an async thing to kick for the transaction commit:
+    the transaction kthread. Replace this work struct with a flag on the
+    fs_info to tell the kthread to go ahead and commit even if it's
+    before our timeout. Then we can drastically simplify the async
+    transaction commit path.
+
+    Note: this can be simplified and functionality based on the pending
+    operation COMMIT.
+
+    Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+    [ add note ]
+    Reviewed-by: David Sterba <dsterba@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 42786287b2e3443e3a3c90d7305fdc9b4287f00b
+Author: Nikolay Borisov <nborisov@suse.com>
+Date:   Tue Nov 23 09:23:42 2021 +0200
+
+    btrfs: eliminate if in main loop in tree_search_offset
+
+    Reshuffle the code inside the first loop of tree_search_offset() so
+    that one if() is eliminated and the code becomes more linear.
+
+    Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+    Signed-off-by: Nikolay Borisov <nborisov@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit e8bbc534dbd1d0b0d17bf68d7615e644513e652a
+Author: Qu Wenruo <wqu@suse.com>
+Date:   Fri Nov 19 14:19:33 2021 +0800
+
+    btrfs: don't check stripe length if the profile is not stripe based
+
+    [BUG]
+    When debugging calc_bio_boundaries(), I found that even for RAID1
+    metadata we're following the stripe length to calculate the stripe
+    boundary.
+
+      # mkfs.btrfs -m raid1 -d raid1 /dev/test/scratch[12]
+      # mount /dev/test/scratch /mnt/btrfs
+      # xfs_io -f -c "pwrite 0 64K" /mnt/btrfs/file
+      # umount
+
+    These very basic operations make calc_bio_boundaries() report the
+    following result:
+
+      submit_extent_page: r/i=1/1 file_offset=22036480 len_to_stripe_boundary=49152
+      submit_extent_page: r/i=1/1 file_offset=30474240 len_to_stripe_boundary=65536
+      ...
+      submit_extent_page: r/i=1/1 file_offset=30523392 len_to_stripe_boundary=16384
+      submit_extent_page: r/i=1/1 file_offset=30457856 len_to_stripe_boundary=16384
+      submit_extent_page: r/i=5/257 file_offset=0 len_to_stripe_boundary=65536
+      submit_extent_page: r/i=5/257 file_offset=65536 len_to_stripe_boundary=65536
+      submit_extent_page: r/i=1/1 file_offset=30490624 len_to_stripe_boundary=49152
+      submit_extent_page: r/i=1/1 file_offset=30507008 len_to_stripe_boundary=32768
+
+    Here "r/i" is the rootid and the inode number; 1/1 means metadata.
+    The remaining names match the members used in the kernel.
+
+    Even though all data/metadata here uses RAID1, we're still following
+    the stripe length.
+
+    [CAUSE]
+    This behavior is caused by a wrong condition in
+    btrfs_get_io_geometry():
+
+      if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+              /* Fill using stripe_len */
+              len = min_t(u64, em->len - offset, max_len);
+      } else {
+              len = em->len - offset;
+      }
+
+    This means that only for SINGLE do we not follow stripe_len.
+
+    However, profiles like RAID1* and DUP don't need to bother with
+    stripe_len either.
+
+    This can lead to unnecessary bio splits for RAID1*/DUP profiles, and
+    can even be a blockage for future zoned RAID support.
+
+    [FIX]
+    Introduce a single-use macro, BTRFS_BLOCK_GROUP_STRIPE_MASK, and
+    change the condition to calculate the length using the stripe length
+    only for stripe-based profiles.
+
+    Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+    Reviewed-by: Anand Jain <anand.jain@oracle.com>
+    Signed-off-by: Qu Wenruo <wqu@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit b8cb209bdc55ae144881b2ae67dd36941813f970
+Author: Nikolay Borisov <nborisov@suse.com>
+Date:   Mon Nov 22 17:16:46 2021 +0200
+
+    btrfs: get next entry in tree_search_offset before doing checks
+
+    This is a small optimisation, since the current 'entry' is already
+    checked in the if () {} else if {} construct above the loop. In
+    essence, the first iteration of the final while loop is redundant.
+    To eliminate this extra check, simply get the next entry at the
+    beginning of the loop.
+
+    Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+    Signed-off-by: Nikolay Borisov <nborisov@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 0fe871af1b0056f912e654d5455486c9f76b0c5e
+Author: Josef Bacik <josef@toxicpanda.com>
+Date:   Thu Nov 18 16:33:15 2021 -0500
+
+    btrfs: index free space entries on size
+
+    Currently we index free space on offset only, because usually we
+    have a hint from the allocator that we want to honor for locality
+    reasons. However, if we fail to use this hint we have to go back to
+    a brute force search through the free space entries to find a large
+    enough extent.
+
+    With sufficiently fragmented free space this becomes quite
+    expensive, as we have to linearly search all of the free space
+    entries to find if we have a part that's long enough.
+
+    To fix this, add a cached rb tree indexed by free space entry bytes.
+    This will allow us to quickly look up the largest chunk in the free
+    space tree for this block group, and stop searching once we've
+    found an entry that is too small to satisfy our allocation. We
+    simply choose to use this tree if we're searching from the
+    beginning of the block group, as we know we do not care about
+    locality at that point.
+
+    I wrote an allocator test that creates a 10TiB RAM-backed null
+    block device and then fallocates random files until the file system
+    is full. I then go through and delete all of the odd files.
+
+    Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+    Reviewed-by: Anand Jain <anand.jain@oracle.com>
+    Signed-off-by: Qu Wenruo <wqu@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit b8cb209bdc55ae144881b2ae67dd36941813f970
+Author: Nikolay Borisov <nborisov@suse.com>
+Date:   Mon Nov 22 17:16:46 2021 +0200
+
+    btrfs: get next entry in tree_search_offset before doing checks
+
+    This is a small optimisation, since the current 'entry' is already
+    checked in the if () {} else if {} construct above the loop. In essence
+    the first iteration of the final while loop is redundant. To eliminate
+    this extra check, simply get the next entry at the beginning of the loop.
+
+    Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+    Signed-off-by: Nikolay Borisov <nborisov@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 0fe871af1b0056f912e654d5455486c9f76b0c5e
+Author: Josef Bacik <josef@toxicpanda.com>
+Date:   Thu Nov 18 16:33:15 2021 -0500
+
+    btrfs: index free space entries on size
+
+    Currently we index free space on offset only, because usually we have a
+    hint from the allocator that we want to honor for locality reasons.
+    However if we fail to use this hint we have to go back to a brute force
+    search through the free space entries to find a large enough extent.
+
+    With sufficiently fragmented free space this becomes quite expensive, as
+    we have to linearly search all of the free space entries to find if we
+    have a part that's long enough.
+
+    To fix this add a cached rb tree to index based on free space entry
+    bytes.  This will allow us to quickly look up the largest chunk in the
+    free space tree for this block group, and stop searching once we've
+    found an entry that is too small to satisfy our allocation.  We simply
+    choose to use this tree if we're searching from the beginning of the
+    block group, as we know we do not care about locality at that point.
+
+    I wrote an allocator test that creates a 10TiB ram backed null block
+    device and then fallocates random files until the file system is full.
+    I then go through and delete all of the odd files.  Then I spawn 8
+    threads that fallocate 64MiB files (1/2 our extent size cap) until the
+    file system is full again.  I use bcc's funclatency to measure the
+    latency of find_free_extent.  The baseline results are
+
+         nsecs               : count     distribution
+             0 -> 1          : 0        |                                        |
+             2 -> 3          : 0        |                                        |
+             4 -> 7          : 0        |                                        |
+             8 -> 15         : 0        |                                        |
+            16 -> 31         : 0        |                                        |
+            32 -> 63         : 0        |                                        |
+            64 -> 127        : 0        |                                        |
+           128 -> 255        : 0        |                                        |
+           256 -> 511        : 10356    |****                                    |
+           512 -> 1023       : 58242    |*************************               |
+          1024 -> 2047       : 74418    |********************************        |
+          2048 -> 4095       : 90393    |****************************************|
+          4096 -> 8191       : 79119    |***********************************     |
+          8192 -> 16383      : 35614    |***************                         |
+         16384 -> 32767      : 13418    |*****                                   |
+         32768 -> 65535      : 12811    |*****                                   |
+         65536 -> 131071     : 17090    |*******                                 |
+        131072 -> 262143     : 26465    |***********                             |
+        262144 -> 524287     : 40179    |*****************                       |
+        524288 -> 1048575    : 55469    |************************                |
+       1048576 -> 2097151    : 48807    |*********************                   |
+       2097152 -> 4194303    : 26744    |***********                             |
+       4194304 -> 8388607    : 35351    |***************                         |
+       8388608 -> 16777215   : 13918    |******                                  |
+      16777216 -> 33554431   : 21       |                                        |
+
+    avg = 908079 nsecs, total: 580889071441 nsecs, count: 639690
+
+    And the patch results are
+
+         nsecs               : count     distribution
+             0 -> 1          : 0        |                                        |
+             2 -> 3          : 0        |                                        |
+             4 -> 7          : 0        |                                        |
+             8 -> 15         : 0        |                                        |
+            16 -> 31         : 0        |                                        |
+            32 -> 63         : 0        |                                        |
+            64 -> 127        : 0        |                                        |
+           128 -> 255        : 0        |                                        |
+           256 -> 511        : 6883     |**                                      |
+           512 -> 1023       : 54346    |*********************                   |
+          1024 -> 2047       : 79170    |********************************        |
+          2048 -> 4095       : 98890    |****************************************|
+          4096 -> 8191       : 81911    |*********************************       |
+          8192 -> 16383      : 27075    |**********                              |
+         16384 -> 32767      : 14668    |*****                                   |
+         32768 -> 65535      : 13251    |*****                                   |
+         65536 -> 131071     : 15340    |******                                  |
+        131072 -> 262143     : 26715    |**********                              |
+        262144 -> 524287     : 43274    |*****************                       |
+        524288 -> 1048575    : 53870    |*********************                   |
+       1048576 -> 2097151    : 55368    |**********************                  |
+       2097152 -> 4194303    : 41036    |****************                        |
+       4194304 -> 8388607    : 24927    |**********                              |
+       8388608 -> 16777215   : 33       |                                        |
+      16777216 -> 33554431   : 9        |                                        |
+
+    avg = 623599 nsecs, total: 397259314759 nsecs, count: 637042
+
+    There's a little variation in the number of calls done because of the
+    timing of the threads with metadata requirements, but the avg, total,
+    and count are relatively consistent between runs (usually within 2-5% of
+    each other).  As you can see here we have around a 30% decrease in
+    average latency with a 30% decrease in overall time spent in
+    find_free_extent.
+
+    Reviewed-by: Filipe Manana <fdmanana@suse.com>
+    Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit db7a62b00a8ff463466ceb7c68728d8bfcc2d65d
+Author: Josef Bacik <josef@toxicpanda.com>
+Date:   Thu Nov 18 16:33:14 2021 -0500
+
+    btrfs: only use ->max_extent_size if it is set in the bitmap
+
+    While adding self tests for my space index change I was hitting a
+    problem where the space indexed tree wasn't returning the expected
+    ->max_extent_size.  This is because we will skip searching any entry
+    that doesn't have ->bytes >= the amount of bytes we want.  However we'll
+    still set the max_extent_size based on that entry.  The problem is if we
+    don't search the bitmap we won't have ->max_extent_size set properly, so
+    we can't really trust it.
+
+    This doesn't really result in a problem per se, it can just result in us
+    not finding a contiguous area that may exist.  Fix the max_extent_size
+    helper to return ->bytes if ->max_extent_size isn't set, and add a big
+    comment explaining why we're doing this.
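+
+    A minimal sketch of the fixed helper, assuming it keeps the
+    get_max_extent_size() name used in free-space-cache.c (the real hunk,
+    together with the comment mentioned above, appears later in this
+    patch):
+
+        static inline u64 get_max_extent_size(const struct btrfs_free_space *entry)
+        {
+                /* Only trust ->max_extent_size if the bitmap was searched. */
+                if (entry->bitmap && entry->max_extent_size)
+                        return entry->max_extent_size;
+                return entry->bytes;
+        }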
+
+    Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 1796c46d66a36d108c3d13292dd47020dd4e02e1
+Author: Josef Bacik <josef@toxicpanda.com>
+Date:   Thu Oct 21 14:58:37 2021 -0400
+
+    btrfs: rename btrfs_item_end_nr to btrfs_item_data_end
+
+    The name btrfs_item_end_nr() is a bit of a misnomer, as it's actually
+    the offset of the end of the data the item points to.  In fact all of
+    the helpers that we use btrfs_item_end_nr() with use data in their
+    name, like BTRFS_LEAF_DATA_SIZE() and leaf_data().  Rename to
+    btrfs_item_data_end() to make it clear what this helper is giving us.
+
+    Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+    Reviewed-by: David Sterba <dsterba@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 89b1779e5b64958f44df12119436d01621a6f87a
+Author: Josef Bacik <josef@toxicpanda.com>
+Date:   Thu Oct 21 14:58:36 2021 -0400
+
+    btrfs: remove the btrfs_item_end() helper
+
+    We're only using btrfs_item_end() from btrfs_item_end_nr(), so this can
+    be collapsed.
+
+    Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+    Reviewed-by: David Sterba <dsterba@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 8b3c0a73f1c9ae9506fb95f8581d35432f2bb8e8
+Author: Josef Bacik <josef@toxicpanda.com>
+Date:   Thu Oct 21 14:58:35 2021 -0400
+
+    btrfs: drop the _nr from the item helpers
+
+    Now that all call sites are using the slot number to modify item values,
+    rename the SETGET helpers to raw_item_*(), and then rework the _nr()
+    helpers to be the btrfs_item_*() and btrfs_set_item_*() helpers, and
+    then rename all of the callers to the new helpers.
+
+    Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+    Reviewed-by: David Sterba <dsterba@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit cb7d27e5e1205b9ca8f512a48f6772b2eb2b84f9
+Author: Josef Bacik <josef@toxicpanda.com>
+Date:   Thu Oct 21 14:58:34 2021 -0400
+
+    btrfs: introduce item_nr token variant helpers
+
+    The last remaining place where we have the pattern of
+
+        item = btrfs_item_nr(slot)
+        <do something with the item>
+
+    is the token helpers.  Handle this by introducing token helpers that
+    will do the btrfs_item_nr() work inside of the helper itself, and then
+    convert all users of the btrfs_item token helpers to the new _nr()
+    variants.
+
+    Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+    Reviewed-by: David Sterba <dsterba@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 18ba83dcd99b9619cfc1a246cfb84b1c9b530097
+Author: Josef Bacik <josef@toxicpanda.com>
+Date:   Thu Oct 21 14:58:33 2021 -0400
+
+    btrfs: make btrfs_file_extent_inline_item_len take a slot
+
+    Instead of getting the btrfs_item for this, simply pass in the slot of
+    the item and then use the btrfs_item_size_nr() helper inside of
+    btrfs_file_extent_inline_item_len().
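+
+    Call sites then go from the old pattern
+
+        btrfs_file_extent_inline_item_len(leaf, btrfs_item_nr(slot))
+
+    to simply
+
+        btrfs_file_extent_inline_item_len(leaf, slot)
+
+    (an illustration of the new calling convention; the reworked helper
+    itself is visible in the ctree.h hunk below).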
+
+    Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+    Reviewed-by: David Sterba <dsterba@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 898ecfdac6314335d8135f741414b0b6867ce1ab
+Author: Josef Bacik <josef@toxicpanda.com>
+Date:   Thu Oct 21 14:58:32 2021 -0400
+
+    btrfs: add btrfs_set_item_*_nr() helpers
+
+    We have the pattern of
+
+        item = btrfs_item_nr(slot);
+        btrfs_set_item_*(leaf, item);
+
+    in a bunch of places in our code.  Fix this by adding
+    btrfs_set_item_*_nr() helpers which will do the appropriate work, and
+    replace those calls with
+
+        btrfs_set_item_*_nr(leaf, slot);
+
+    Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+    Reviewed-by: David Sterba <dsterba@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 35a9546592ae8d0f3348cd0f776a12dbf3b81aa8
+Author: Josef Bacik <josef@toxicpanda.com>
+Date:   Thu Oct 21 14:58:31 2021 -0400
+
+    btrfs: use btrfs_item_size_nr/btrfs_item_offset_nr everywhere
+
+    We have this pattern in a lot of places
+
+        item = btrfs_item_nr(slot);
+        btrfs_item_size(leaf, item);
+
+    when we could simply use
+
+        btrfs_item_size(leaf, slot);
+
+    Fix all callers of btrfs_item_size() and btrfs_item_offset() to use the
+    _nr variation of the helpers.
+
+    Reviewed-by: Qu Wenruo <wqu@suse.com>
+    Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+    Reviewed-by: David Sterba <dsterba@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 8e279d85c2ac593b8caf53f3ac72d0b7047d96f5
+Author: Filipe Manana <fdmanana@suse.com>
+Date:   Mon Oct 25 17:31:54 2021 +0100
+
+    btrfs: remove no longer needed logic for replaying directory deletes
+
+    Now that we log only dir index keys when logging a directory, we no longer
+    need to deal with dir item keys in the log replay code for replaying
+    directory deletes.  This is also true for the case when we replay a log
+    tree created by a kernel that still logs dir items.
+
+    So remove the remaining code of the replay of directory deletes algorithm
+    that deals with dir item keys.
+
+    Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+    Signed-off-by: Filipe Manana <fdmanana@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit b8ac7a8b1dd9bba3b30bd034d754bb4932c4a970
+Author: Filipe Manana <fdmanana@suse.com>
+Date:   Mon Oct 25 17:31:53 2021 +0100
+
+    btrfs: only copy dir index keys when logging a directory
+
+    Currently, when logging a directory, we copy both dir items and dir index
+    items from the fs/subvolume tree to the log tree.  Both items have exactly
+    the same data (same struct btrfs_dir_item); the difference lies in the key
+    values, where a dir index key contains the index number of a directory
+    entry while the dir item key does not, as the latter is used for doing
+    fast lookups of an entry by name, while the former is used for sorting
+    entries when listing a directory.
+
+    We can exploit that and log only the dir index items, since they contain
+    all the information needed to correctly add, replace and delete directory
+    entries when replaying a log tree.  Logging only the dir index items is
+    also backward and forward compatible: an unpatched kernel (without this
+    change) can correctly replay a log tree generated by a patched kernel
+    (with this patch), and a patched kernel can correctly replay a log tree
+    generated by an unpatched kernel.
+
+    The backward compatibility is ensured because:
+
+    1) For inserting a new dentry: a dentry is only inserted when we find a
+       new dir index key - we can only insert if we know the dir index offset,
+       which is encoded in the dir index key's offset;
+
+    2) For deleting dentries: during log replay, before adding or replacing
+       dentries, we first replay dentry deletions.  Whenever we find a dir item
+       key or a dir index key in the subvolume/fs tree that is not logged in
+       a range for which the log tree is authoritative, we do the unlink of
+       the dentry, which removes both the existing dir item key and the dir
+       index key.  Therefore logging just dir index keys is enough to ensure
+       dentry deletions are correctly replayed;
+
+    3) For dentry replacements: they work when we log only dir index keys
+       and this is mostly due to a combination of 1) and 2).  If we replace a
+       dentry with name "foobar" to point from inode A to inode B, then we
+       know the dir index key for the new dentry is different from the old
+       one, as it has an index number (key offset) larger than the old one.
+       This results in replaying a deletion, through replay_dir_deletes(),
+       that causes the old dentry to be removed, both the dir item key and
+       the dir index key, as mentioned at 2).  Then when processing the new
+       dir index key, we add the new dentry, adding both a new dir item key
+       and a new index key pointing to inode B, as stated in 1).
+
+    The forward compatibility, the ability for a patched kernel to replay a
+    log created by an older, unpatched kernel, comes from the changes required
+    for making sure we are able to replay a log that only contains dir index
+    keys - we simply ignore every dir item key we find.
+
+    So modify directory logging to log only dir index items, and modify the
+    log replay process to ignore dir item keys (from log trees created by an
+    unpatched kernel) and process only dir index keys.  This reduces the
+    amount of logged metadata by about half, and therefore the time spent
+    logging or fsyncing large directories (less CPU time and less IO).
+
+    The following test script was used to measure this change:
+
+      #!/bin/bash
+
+      DEV=/dev/nvme0n1
+      MNT=/mnt/nvme0n1
+
+      NUM_NEW_FILES=1000000
+      NUM_FILE_DELETES=10000
+
+      mkfs.btrfs -f $DEV
+      mount -o ssd $DEV $MNT
+
+      mkdir $MNT/testdir
+
+      for ((i = 1; i <= $NUM_NEW_FILES; i++)); do
+          echo -n > $MNT/testdir/file_$i
+      done
+
+      start=$(date +%s%N)
+      xfs_io -c "fsync" $MNT/testdir
+      end=$(date +%s%N)
+
+      dur=$(( (end - start) / 1000000 ))
+      echo "dir fsync took $dur ms after adding $NUM_NEW_FILES files"
+
+      # sync to force transaction commit and wipe out the log.
+      sync
+
+      del_inc=$(( $NUM_NEW_FILES / $NUM_FILE_DELETES ))
+      for ((i = 1; i <= $NUM_NEW_FILES; i += $del_inc)); do
+          rm -f $MNT/testdir/file_$i
+      done
+
+      start=$(date +%s%N)
+      xfs_io -c "fsync" $MNT/testdir
+      end=$(date +%s%N)
+
+      dur=$(( (end - start) / 1000000 ))
+      echo "dir fsync took $dur ms after deleting $NUM_FILE_DELETES files"
+      echo
+
+      umount $MNT
+
+    The tests were run on a physical machine, with a non-debug kernel (Debian's
+    default kernel config), for different values of $NUM_NEW_FILES and
+    $NUM_FILE_DELETES, and the results were the following:
+
+    ** Before patch, NUM_NEW_FILES = 1 000 000, NUM_FILE_DELETES = 10 000 **
+
+    dir fsync took 8412 ms after adding 1000000 files
+    dir fsync took 500 ms after deleting 10000 files
+
+    ** After patch, NUM_NEW_FILES = 1 000 000, NUM_FILE_DELETES = 10 000 **
+
+    dir fsync took 4252 ms after adding 1000000 files (-49.5%)
+    dir fsync took 269 ms after deleting 10000 files (-46.2%)
+
+    ** Before patch, NUM_NEW_FILES = 100 000, NUM_FILE_DELETES = 1 000 **
+
+    dir fsync took 745 ms after adding 100000 files
+    dir fsync took 59 ms after deleting 1000 files
+
+    ** After patch, NUM_NEW_FILES = 100 000, NUM_FILE_DELETES = 1 000 **
+
+    dir fsync took 404 ms after adding 100000 files (-45.8%)
+    dir fsync took 31 ms after deleting 1000 files (-47.5%)
+
+    ** Before patch, NUM_NEW_FILES = 10 000, NUM_FILE_DELETES = 1 000 **
+
+    dir fsync took 67 ms after adding 10000 files
+    dir fsync took 9 ms after deleting 1000 files
+
+    ** After patch, NUM_NEW_FILES = 10 000, NUM_FILE_DELETES = 1 000 **
+
+    dir fsync took 36 ms after adding 10000 files (-46.3%)
+    dir fsync took 5 ms after deleting 1000 files (-44.4%)
+
+    ** Before patch, NUM_NEW_FILES = 1 000, NUM_FILE_DELETES = 100 **
+
+    dir fsync took 9 ms after adding 1000 files
+    dir fsync took 4 ms after deleting 100 files
+
+    ** After patch, NUM_NEW_FILES = 1 000, NUM_FILE_DELETES = 100 **
+
+    dir fsync took 7 ms after adding 1000 files (-22.2%)
+    dir fsync took 3 ms after deleting 100 files (-25.0%)
+
+    Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+    Signed-off-by: Filipe Manana <fdmanana@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 793440833c94fa896424e2ceef71d376a2ae2454
+Author: Nikolay Borisov <nborisov@suse.com>
+Date:   Thu Oct 14 10:03:11 2021 +0300
+
+    btrfs: remove spurious unlock/lock of unused_bgs_lock
+
+    Since both the unused block groups and reclaim bgs lists are protected
+    by unused_bgs_lock, free them in the same critical section without
+    doing an extra unlock/lock pair.
+
+    Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+    Signed-off-by: Nikolay Borisov <nborisov@suse.com>
+    Reviewed-by: David Sterba <dsterba@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 9f4889dbcf40db16d5cfd02dae54143ecfcf036a
+Author: Filipe Manana <fdmanana@suse.com>
+Date:   Thu Oct 28 16:03:41 2021 +0100
+
+    btrfs: fix ENOSPC failure when attempting direct IO write into NOCOW range
+
+    When doing a direct IO write against a file range that either has
+    preallocated extents in that range or has regular extents and the file
+    has the NOCOW attribute set, the write fails with -ENOSPC when all of
+    the following conditions are met:
+
+    1) There are no data block groups with enough free space matching
+       the size of the write;
+
+    2) There's not enough unallocated space for allocating a new data block
+       group;
+
+    3) The extents in the target file range are not shared, whether through
+       snapshots or through reflinks.
+
+    This is wrong because a NOCOW write can be done in such a case, and in
+    fact it's possible to do it using a buffered IO write, since when failing
+    to allocate data space, the buffered IO path checks if a NOCOW write is
+    possible.
+
+    The failure in the direct IO write path comes from the fact that early
+    on, at btrfs_dio_iomap_begin(), we try to allocate data space for the
+    write and if that fails we return the error and stop - we never check
+    if we can do NOCOW.  But later, at btrfs_get_blocks_direct_write(), we
+    check if we can do a NOCOW write into the range, or a subset of the
+    range, and then release the previously reserved data space.
+
+    Fix this by doing the data reservation only if needed, when we must COW,
+    at btrfs_get_blocks_direct_write() instead of doing it at
+    btrfs_dio_iomap_begin().  This also simplifies the logic a bit and
+    removes the inefficiency of doing unnecessary data reservations.
+
+    The following example test script reproduces the problem:
+
+      $ cat dio-nocow-enospc.sh
+      #!/bin/bash
+
+      DEV=/dev/sdj
+      MNT=/mnt/sdj
+
+      # Use a small fixed size (1G) filesystem so that it's quick to fill
+      # it up.
+      # Make sure the mixed block groups feature is not enabled because we
+      # later want to not have more space available for allocating data
+      # extents but still have enough metadata space free for the file writes.
+      mkfs.btrfs -f -b $((1024 * 1024 * 1024)) -O ^mixed-bg $DEV
+      mount $DEV $MNT
+
+      # Create our test file with the NOCOW attribute set.
+      touch $MNT/foobar
+      chattr +C $MNT/foobar
+
+      # Now fill in all unallocated space with data for our test file.
+      # This will allocate a data block group that will be full and leave
+      # no (or a very small amount of) unallocated space in the device, so
+      # that it will not be possible to allocate a new block group later.
+      echo
+      echo "Creating test file with initial data..."
+      xfs_io -c "pwrite -S 0xab -b 1M 0 900M" $MNT/foobar
+
+      # Now try a direct IO write against file range [0, 10M[.
+      # This should succeed since this is a NOCOW file and an extent for the
+      # range was previously allocated.
+      echo
+      echo "Trying direct IO write over allocated space..."
+      xfs_io -d -c "pwrite -S 0xcd -b 10M 0 10M" $MNT/foobar
+
+      umount $MNT
+
+    When running the test:
+
+      $ ./dio-nocow-enospc.sh
+      (...)
+
+      Creating test file with initial data...
+      wrote 943718400/943718400 bytes at offset 0
+      900 MiB, 900 ops; 0:00:01.43 (625.526 MiB/sec and 625.5265 ops/sec)
+
+      Trying direct IO write over allocated space...
+      pwrite: No space left on device
+
+    A test case for fstests will follow, testing both this direct IO write
+    scenario as well as the buffered IO write scenario to make it less likely
+    to get future regressions on the buffered IO case.
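+
+    A rough sketch of the reordered flow (an illustration, not the verbatim
+    hunk - the real change is in the inode.c hunks later in this patch, and
+    the NOCOW check below is a placeholder for the actual helpers):
+
+        /* In btrfs_get_blocks_direct_write(): */
+        if (<range can be written NOCOW>) {
+                /* NOCOW write: no data space reservation is needed. */
+        } else {
+                /* We must COW: reserve data space only now. */
+                ret = btrfs_delalloc_reserve_space(BTRFS_I(inode),
+                                                   &data_reserved,
+                                                   start, len);
+        }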
+ + Reviewed-by: Josef Bacik <josef@toxicpanda.com> + Signed-off-by: Filipe Manana <fdmanana@suse.com> + Signed-off-by: David Sterba <dsterba@suse.com> +--- + fs/btrfs/backref.c | 16 +- + fs/btrfs/block-group.c | 2 - + fs/btrfs/btrfs_inode.h | 18 +- + fs/btrfs/compression.h | 2 + + fs/btrfs/ctree.c | 208 ++++----- + fs/btrfs/ctree.h | 96 ++-- + fs/btrfs/dev-replace.c | 4 +- + fs/btrfs/dir-item.c | 12 +- + fs/btrfs/disk-io.c | 3 +- + fs/btrfs/extent-tree.c | 14 +- + fs/btrfs/file-item.c | 25 +- + fs/btrfs/file.c | 26 +- + fs/btrfs/free-space-cache.c | 177 ++++++-- + fs/btrfs/free-space-cache.h | 2 + + fs/btrfs/inode-item.c | 14 +- + fs/btrfs/inode.c | 172 +++---- + fs/btrfs/ioctl.c | 258 +++++++---- + fs/btrfs/lzo.c | 20 +- + fs/btrfs/print-tree.c | 8 +- + fs/btrfs/props.c | 2 +- + fs/btrfs/qgroup.c | 41 +- + fs/btrfs/ref-verify.c | 2 +- + fs/btrfs/reflink.c | 2 +- + fs/btrfs/relocation.c | 2 +- + fs/btrfs/root-tree.c | 4 +- + fs/btrfs/scrub.c | 2 +- + fs/btrfs/send.c | 22 +- + fs/btrfs/sysfs.c | 10 +- + fs/btrfs/tests/extent-buffer-tests.c | 17 +- + fs/btrfs/transaction.c | 76 +--- + fs/btrfs/transaction.h | 2 +- + fs/btrfs/tree-checker.c | 56 +-- + fs/btrfs/tree-log.c | 656 ++++++++++++--------------- + fs/btrfs/uuid-tree.c | 10 +- + fs/btrfs/verity.c | 2 +- + fs/btrfs/volumes.c | 13 +- + fs/btrfs/xattr.c | 8 +- + include/uapi/linux/btrfs.h | 6 +- + include/uapi/linux/btrfs_tree.h | 4 +- + 39 files changed, 1062 insertions(+), 952 deletions(-) + +diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c +index 8b090c40daf7..3b0c4bed242e 100644 +--- a/fs/btrfs/backref.c ++++ b/fs/btrfs/backref.c +@@ -950,7 +950,7 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, + leaf = path->nodes[0]; + slot = path->slots[0]; + +- item_size = btrfs_item_size_nr(leaf, slot); ++ item_size = btrfs_item_size(leaf, slot); + BUG_ON(item_size < sizeof(*ei)); + + ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); +@@ -1792,7 +1792,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, + } + + eb = path->nodes[0]; +- item_size = btrfs_item_size_nr(eb, path->slots[0]); ++ item_size = btrfs_item_size(eb, path->slots[0]); + BUG_ON(item_size < sizeof(*ei)); + + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); +@@ -2071,7 +2071,6 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, + u64 parent = 0; + int found = 0; + struct extent_buffer *eb; +- struct btrfs_item *item; + struct btrfs_inode_ref *iref; + struct btrfs_key found_key; + +@@ -2097,10 +2096,9 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, + } + btrfs_release_path(path); + +- item = btrfs_item_nr(slot); + iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); + +- for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) { ++ for (cur = 0; cur < btrfs_item_size(eb, slot); cur += len) { + name_len = btrfs_inode_ref_name_len(eb, iref); + /* path must be released before calling iterate()! 
*/ + btrfs_debug(fs_root->fs_info, +@@ -2156,7 +2154,7 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, + } + btrfs_release_path(path); + +- item_size = btrfs_item_size_nr(eb, slot); ++ item_size = btrfs_item_size(eb, slot); + ptr = btrfs_item_ptr_offset(eb, slot); + cur_offset = 0; + +@@ -2377,7 +2375,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) + iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]); + iter->end_ptr = (u32)(iter->item_ptr + +- btrfs_item_size_nr(path->nodes[0], path->slots[0])); ++ btrfs_item_size(path->nodes[0], path->slots[0])); + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + +@@ -2417,7 +2415,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) + iter->cur_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]); + iter->item_ptr = iter->cur_ptr; +- iter->end_ptr = (u32)(iter->item_ptr + btrfs_item_size_nr( ++ iter->end_ptr = (u32)(iter->item_ptr + btrfs_item_size( + path->nodes[0], path->slots[0])); + } + +@@ -2482,7 +2480,7 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter) + iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]); + iter->cur_ptr = iter->item_ptr; +- iter->end_ptr = iter->item_ptr + (u32)btrfs_item_size_nr(path->nodes[0], ++ iter->end_ptr = iter->item_ptr + (u32)btrfs_item_size(path->nodes[0], + path->slots[0]); + return 0; + } +diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c +index b67c965725ea..27da1dfbd626 100644 +--- a/fs/btrfs/block-group.c ++++ b/fs/btrfs/block-group.c +@@ -3924,9 +3924,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) + list_del_init(&block_group->bg_list); + btrfs_put_block_group(block_group); + } +- spin_unlock(&info->unused_bgs_lock); + +- spin_lock(&info->unused_bgs_lock); + while (!list_empty(&info->reclaim_bgs)) { + block_group = list_first_entry(&info->reclaim_bgs, + struct btrfs_block_group, +diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h +index ab2a4a52e0bb..b3e46aabc3d8 100644 +--- a/fs/btrfs/btrfs_inode.h ++++ b/fs/btrfs/btrfs_inode.h +@@ -138,19 +138,11 @@ struct btrfs_inode { + /* a local copy of root's last_log_commit */ + int last_log_commit; + +- union { +- /* +- * Total number of bytes pending delalloc, used by stat to +- * calculate the real block usage of the file. This is used +- * only for files. +- */ +- u64 delalloc_bytes; +- /* +- * The offset of the last dir item key that was logged. +- * This is used only for directories. +- */ +- u64 last_dir_item_offset; +- }; ++ /* ++ * Total number of bytes pending delalloc, used by stat to calculate the ++ * real block usage of the file. This is used only for files. 
++ */ ++ u64 delalloc_bytes; + + union { + /* +diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h +index 56eef0821e3e..7dbd14caab01 100644 +--- a/fs/btrfs/compression.h ++++ b/fs/btrfs/compression.h +@@ -22,6 +22,8 @@ struct btrfs_inode; + + /* Maximum length of compressed data stored on disk */ + #define BTRFS_MAX_COMPRESSED (SZ_128K) ++static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); ++ + /* Maximum size of data before compression */ + #define BTRFS_MAX_UNCOMPRESSED (SZ_128K) + +diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c +index 35660791e084..5ca7a535d109 100644 +--- a/fs/btrfs/ctree.c ++++ b/fs/btrfs/ctree.c +@@ -2627,19 +2627,14 @@ static noinline int split_node(struct btrfs_trans_handle *trans, + */ + static int leaf_space_used(struct extent_buffer *l, int start, int nr) + { +- struct btrfs_item *start_item; +- struct btrfs_item *end_item; + int data_len; + int nritems = btrfs_header_nritems(l); + int end = min(nritems, start + nr) - 1; + + if (!nr) + return 0; +- start_item = btrfs_item_nr(start); +- end_item = btrfs_item_nr(end); +- data_len = btrfs_item_offset(l, start_item) + +- btrfs_item_size(l, start_item); +- data_len = data_len - btrfs_item_offset(l, end_item); ++ data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start); ++ data_len = data_len - btrfs_item_offset(l, end); + data_len += sizeof(struct btrfs_item) * nr; + WARN_ON(data_len < 0); + return data_len; +@@ -2686,7 +2681,6 @@ static noinline int __push_leaf_right(struct btrfs_path *path, + u32 i; + int push_space = 0; + int push_items = 0; +- struct btrfs_item *item; + u32 nr; + u32 right_nritems; + u32 data_end; +@@ -2703,8 +2697,6 @@ static noinline int __push_leaf_right(struct btrfs_path *path, + slot = path->slots[1]; + i = left_nritems - 1; + while (i >= nr) { +- item = btrfs_item_nr(i); +- + if (!empty && push_items > 0) { + if (path->slots[0] > i) + break; +@@ -2719,12 +2711,13 @@ static noinline int __push_leaf_right(struct btrfs_path *path, + if (path->slots[0] == i) + push_space += data_size; + +- this_item_size = btrfs_item_size(left, item); +- if (this_item_size + sizeof(*item) + push_space > free_space) ++ this_item_size = btrfs_item_size(left, i); ++ if (this_item_size + sizeof(struct btrfs_item) + ++ push_space > free_space) + break; + + push_items++; +- push_space += this_item_size + sizeof(*item); ++ push_space += this_item_size + sizeof(struct btrfs_item); + if (i == 0) + break; + i--; +@@ -2738,7 +2731,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path, + /* push left to right */ + right_nritems = btrfs_header_nritems(right); + +- push_space = btrfs_item_end_nr(left, left_nritems - push_items); ++ push_space = btrfs_item_data_end(left, left_nritems - push_items); + push_space -= leaf_data_end(left); + + /* make room in the right data area */ +@@ -2769,9 +2762,8 @@ static noinline int __push_leaf_right(struct btrfs_path *path, + btrfs_set_header_nritems(right, right_nritems); + push_space = BTRFS_LEAF_DATA_SIZE(fs_info); + for (i = 0; i < right_nritems; i++) { +- item = btrfs_item_nr(i); +- push_space -= btrfs_token_item_size(&token, item); +- btrfs_set_token_item_offset(&token, item, push_space); ++ push_space -= btrfs_token_item_size(&token, i); ++ btrfs_set_token_item_offset(&token, i, push_space); + } + + left_nritems -= push_items; +@@ -2856,16 +2848,11 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root + if (free_space < data_size) + goto out_unlock; + +- /* cow and double check */ + ret = btrfs_cow_block(trans, root, 
right, upper, + slot + 1, &right, BTRFS_NESTING_RIGHT_COW); + if (ret) + goto out_unlock; + +- free_space = btrfs_leaf_free_space(right); +- if (free_space < data_size) +- goto out_unlock; +- + left_nritems = btrfs_header_nritems(left); + if (left_nritems == 0) + goto out_unlock; +@@ -2916,7 +2903,6 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, + int i; + int push_space = 0; + int push_items = 0; +- struct btrfs_item *item; + u32 old_left_nritems; + u32 nr; + int ret = 0; +@@ -2930,8 +2916,6 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, + nr = min(right_nritems - 1, max_slot); + + for (i = 0; i < nr; i++) { +- item = btrfs_item_nr(i); +- + if (!empty && push_items > 0) { + if (path->slots[0] < i) + break; +@@ -2946,12 +2930,13 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, + if (path->slots[0] == i) + push_space += data_size; + +- this_item_size = btrfs_item_size(right, item); +- if (this_item_size + sizeof(*item) + push_space > free_space) ++ this_item_size = btrfs_item_size(right, i); ++ if (this_item_size + sizeof(struct btrfs_item) + push_space > ++ free_space) + break; + + push_items++; +- push_space += this_item_size + sizeof(*item); ++ push_space += this_item_size + sizeof(struct btrfs_item); + } + + if (push_items == 0) { +@@ -2967,25 +2952,23 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, + push_items * sizeof(struct btrfs_item)); + + push_space = BTRFS_LEAF_DATA_SIZE(fs_info) - +- btrfs_item_offset_nr(right, push_items - 1); ++ btrfs_item_offset(right, push_items - 1); + + copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET + + leaf_data_end(left) - push_space, + BTRFS_LEAF_DATA_OFFSET + +- btrfs_item_offset_nr(right, push_items - 1), ++ btrfs_item_offset(right, push_items - 1), + push_space); + old_left_nritems = btrfs_header_nritems(left); + BUG_ON(old_left_nritems <= 0); + + btrfs_init_map_token(&token, left); +- old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1); ++ old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1); + for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { + u32 ioff; + +- item = btrfs_item_nr(i); +- +- ioff = btrfs_token_item_offset(&token, item); +- btrfs_set_token_item_offset(&token, item, ++ ioff = btrfs_token_item_offset(&token, i); ++ btrfs_set_token_item_offset(&token, i, + ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size)); + } + btrfs_set_header_nritems(left, old_left_nritems + push_items); +@@ -2996,7 +2979,7 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, + right_nritems); + + if (push_items < right_nritems) { +- push_space = btrfs_item_offset_nr(right, push_items - 1) - ++ push_space = btrfs_item_offset(right, push_items - 1) - + leaf_data_end(right); + memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET + + BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, +@@ -3014,10 +2997,8 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, + btrfs_set_header_nritems(right, right_nritems); + push_space = BTRFS_LEAF_DATA_SIZE(fs_info); + for (i = 0; i < right_nritems; i++) { +- item = btrfs_item_nr(i); +- +- push_space = push_space - btrfs_token_item_size(&token, item); +- btrfs_set_token_item_offset(&token, item, push_space); ++ push_space = push_space - btrfs_token_item_size(&token, i); ++ btrfs_set_token_item_offset(&token, i, push_space); + } + + btrfs_mark_buffer_dirty(left); +@@ -3096,7 +3077,6 @@ static int 
push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root + goto out; + } + +- /* cow and double check */ + ret = btrfs_cow_block(trans, root, left, + path->nodes[1], slot - 1, &left, + BTRFS_NESTING_LEFT_COW); +@@ -3107,12 +3087,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root + goto out; + } + +- free_space = btrfs_leaf_free_space(left); +- if (free_space < data_size) { +- ret = 1; +- goto out; +- } +- + if (check_sibling_keys(left, right)) { + ret = -EUCLEAN; + goto out; +@@ -3145,7 +3119,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, + + nritems = nritems - mid; + btrfs_set_header_nritems(right, nritems); +- data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l); ++ data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l); + + copy_extent_buffer(right, l, btrfs_item_nr_offset(0), + btrfs_item_nr_offset(mid), +@@ -3156,15 +3130,14 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, + data_copy_size, BTRFS_LEAF_DATA_OFFSET + + leaf_data_end(l), data_copy_size); + +- rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid); ++ rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid); + + btrfs_init_map_token(&token, right); + for (i = 0; i < nritems; i++) { +- struct btrfs_item *item = btrfs_item_nr(i); + u32 ioff; + +- ioff = btrfs_token_item_offset(&token, item); +- btrfs_set_token_item_offset(&token, item, ioff + rt_data_off); ++ ioff = btrfs_token_item_offset(&token, i); ++ btrfs_set_token_item_offset(&token, i, ioff + rt_data_off); + } + + btrfs_set_header_nritems(l, mid); +@@ -3280,7 +3253,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, + + l = path->nodes[0]; + slot = path->slots[0]; +- if (extend && data_size + btrfs_item_size_nr(l, slot) + ++ if (extend && data_size + btrfs_item_size(l, slot) + + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info)) + return -EOVERFLOW; + +@@ -3449,7 +3422,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, + if (btrfs_leaf_free_space(leaf) >= ins_len) + return 0; + +- item_size = btrfs_item_size_nr(leaf, path->slots[0]); ++ item_size = btrfs_item_size(leaf, path->slots[0]); + if (key.type == BTRFS_EXTENT_DATA_KEY) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); +@@ -3469,7 +3442,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, + ret = -EAGAIN; + leaf = path->nodes[0]; + /* if our item isn't there, return now */ +- if (item_size != btrfs_item_size_nr(leaf, path->slots[0])) ++ if (item_size != btrfs_item_size(leaf, path->slots[0])) + goto err; + + /* the leaf has changed, it now has room. 
return now */ +@@ -3500,9 +3473,7 @@ static noinline int split_item(struct btrfs_path *path, + unsigned long split_offset) + { + struct extent_buffer *leaf; +- struct btrfs_item *item; +- struct btrfs_item *new_item; +- int slot; ++ int orig_slot, slot; + char *buf; + u32 nritems; + u32 item_size; +@@ -3512,9 +3483,9 @@ static noinline int split_item(struct btrfs_path *path, + leaf = path->nodes[0]; + BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)); + +- item = btrfs_item_nr(path->slots[0]); +- orig_offset = btrfs_item_offset(leaf, item); +- item_size = btrfs_item_size(leaf, item); ++ orig_slot = path->slots[0]; ++ orig_offset = btrfs_item_offset(leaf, path->slots[0]); ++ item_size = btrfs_item_size(leaf, path->slots[0]); + + buf = kmalloc(item_size, GFP_NOFS); + if (!buf) +@@ -3535,14 +3506,12 @@ static noinline int split_item(struct btrfs_path *path, + btrfs_cpu_key_to_disk(&disk_key, new_key); + btrfs_set_item_key(leaf, &disk_key, slot); + +- new_item = btrfs_item_nr(slot); ++ btrfs_set_item_offset(leaf, slot, orig_offset); ++ btrfs_set_item_size(leaf, slot, item_size - split_offset); + +- btrfs_set_item_offset(leaf, new_item, orig_offset); +- btrfs_set_item_size(leaf, new_item, item_size - split_offset); +- +- btrfs_set_item_offset(leaf, item, +- orig_offset + item_size - split_offset); +- btrfs_set_item_size(leaf, item, split_offset); ++ btrfs_set_item_offset(leaf, orig_slot, ++ orig_offset + item_size - split_offset); ++ btrfs_set_item_size(leaf, orig_slot, split_offset); + + btrfs_set_header_nritems(leaf, nritems + 1); + +@@ -3603,7 +3572,6 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) + { + int slot; + struct extent_buffer *leaf; +- struct btrfs_item *item; + u32 nritems; + unsigned int data_end; + unsigned int old_data_start; +@@ -3615,14 +3583,14 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) + leaf = path->nodes[0]; + slot = path->slots[0]; + +- old_size = btrfs_item_size_nr(leaf, slot); ++ old_size = btrfs_item_size(leaf, slot); + if (old_size == new_size) + return; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(leaf); + +- old_data_start = btrfs_item_offset_nr(leaf, slot); ++ old_data_start = btrfs_item_offset(leaf, slot); + + size_diff = old_size - new_size; + +@@ -3636,10 +3604,9 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) + btrfs_init_map_token(&token, leaf); + for (i = slot; i < nritems; i++) { + u32 ioff; +- item = btrfs_item_nr(i); + +- ioff = btrfs_token_item_offset(&token, item); +- btrfs_set_token_item_offset(&token, item, ioff + size_diff); ++ ioff = btrfs_token_item_offset(&token, i); ++ btrfs_set_token_item_offset(&token, i, ioff + size_diff); + } + + /* shift the data */ +@@ -3682,8 +3649,7 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) + fixup_low_keys(path, &disk_key, 1); + } + +- item = btrfs_item_nr(slot); +- btrfs_set_item_size(leaf, item, new_size); ++ btrfs_set_item_size(leaf, slot, new_size); + btrfs_mark_buffer_dirty(leaf); + + if (btrfs_leaf_free_space(leaf) < 0) { +@@ -3699,7 +3665,6 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) + { + int slot; + struct extent_buffer *leaf; +- struct btrfs_item *item; + u32 nritems; + unsigned int data_end; + unsigned int old_data; +@@ -3717,7 +3682,7 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) + BUG(); + } + slot = path->slots[0]; +- old_data = btrfs_item_end_nr(leaf, slot); ++ old_data = 
btrfs_item_data_end(leaf, slot); + + BUG_ON(slot < 0); + if (slot >= nritems) { +@@ -3734,10 +3699,9 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) + btrfs_init_map_token(&token, leaf); + for (i = slot; i < nritems; i++) { + u32 ioff; +- item = btrfs_item_nr(i); + +- ioff = btrfs_token_item_offset(&token, item); +- btrfs_set_token_item_offset(&token, item, ioff - data_size); ++ ioff = btrfs_token_item_offset(&token, i); ++ btrfs_set_token_item_offset(&token, i, ioff - data_size); + } + + /* shift the data */ +@@ -3746,9 +3710,8 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) + data_end, old_data - data_end); + + data_end = old_data; +- old_size = btrfs_item_size_nr(leaf, slot); +- item = btrfs_item_nr(slot); +- btrfs_set_item_size(leaf, item, old_size + data_size); ++ old_size = btrfs_item_size(leaf, slot); ++ btrfs_set_item_size(leaf, slot, old_size + data_size); + btrfs_mark_buffer_dirty(leaf); + + if (btrfs_leaf_free_space(leaf) < 0) { +@@ -3770,7 +3733,6 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p + const struct btrfs_item_batch *batch) + { + struct btrfs_fs_info *fs_info = root->fs_info; +- struct btrfs_item *item; + int i; + u32 nritems; + unsigned int data_end; +@@ -3807,7 +3769,7 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p + + btrfs_init_map_token(&token, leaf); + if (slot != nritems) { +- unsigned int old_data = btrfs_item_end_nr(leaf, slot); ++ unsigned int old_data = btrfs_item_data_end(leaf, slot); + + if (old_data < data_end) { + btrfs_print_leaf(leaf); +@@ -3823,10 +3785,9 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p + for (i = slot; i < nritems; i++) { + u32 ioff; + +- item = btrfs_item_nr(i); +- ioff = btrfs_token_item_offset(&token, item); +- btrfs_set_token_item_offset(&token, item, +- ioff - batch->total_data_size); ++ ioff = btrfs_token_item_offset(&token, i); ++ btrfs_set_token_item_offset(&token, i, ++ ioff - batch->total_data_size); + } + /* shift the items */ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr), +@@ -3845,10 +3806,9 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p + for (i = 0; i < batch->nr; i++) { + btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]); + btrfs_set_item_key(leaf, &disk_key, slot + i); +- item = btrfs_item_nr(slot + i); + data_end -= batch->data_sizes[i]; +- btrfs_set_token_item_offset(&token, item, data_end); +- btrfs_set_token_item_size(&token, item, batch->data_sizes[i]); ++ btrfs_set_token_item_offset(&token, slot + i, data_end); ++ btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]); + } + + btrfs_set_header_nritems(leaf, nritems + batch->nr); +@@ -3955,7 +3915,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, + u32 item_size; + + leaf = path->nodes[0]; +- item_size = btrfs_item_size_nr(leaf, path->slots[0]); ++ item_size = btrfs_item_size(leaf, path->slots[0]); + ret = setup_leaf_for_split(trans, root, path, + item_size + sizeof(struct btrfs_item)); + if (ret) +@@ -4056,25 +4016,22 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, + { + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_buffer *leaf; +- struct btrfs_item *item; +- u32 last_off; +- u32 dsize = 0; + int ret = 0; + int wret; +- int i; + u32 nritems; + + leaf = path->nodes[0]; +- last_off = btrfs_item_offset_nr(leaf, slot + nr - 1); +- +- for (i = 0; i < nr; i++) +- dsize += 
btrfs_item_size_nr(leaf, slot + i); +- + nritems = btrfs_header_nritems(leaf); + + if (slot + nr != nritems) { +- int data_end = leaf_data_end(leaf); ++ const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1); ++ const int data_end = leaf_data_end(leaf); + struct btrfs_map_token token; ++ u32 dsize = 0; ++ int i; ++ ++ for (i = 0; i < nr; i++) ++ dsize += btrfs_item_size(leaf, slot + i); + + memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + + data_end + dsize, +@@ -4085,9 +4042,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, + for (i = slot + nr; i < nritems; i++) { + u32 ioff; + +- item = btrfs_item_nr(i); +- ioff = btrfs_token_item_offset(&token, item); +- btrfs_set_token_item_offset(&token, item, ioff + dsize); ++ ioff = btrfs_token_item_offset(&token, i); ++ btrfs_set_token_item_offset(&token, i, ioff + dsize); + } + + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), +@@ -4115,24 +4071,50 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, + fixup_low_keys(path, &disk_key, 1); + } + +- /* delete the leaf if it is mostly empty */ ++ /* ++ * Try to delete the leaf if it is mostly empty. We do this by ++ * trying to move all its items into its left and right neighbours. ++ * If we can't move all the items, then we don't delete it - it's ++ * not ideal, but future insertions might fill the leaf with more ++ * items, or items from other leaves might be moved later into our ++ * leaf due to deletions on those leaves. ++ */ + if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) { ++ u32 min_push_space; ++ + /* push_leaf_left fixes the path. + * make sure the path still points to our leaf + * for possible call to del_ptr below + */ + slot = path->slots[1]; + atomic_inc(&leaf->refs); +- +- wret = push_leaf_left(trans, root, path, 1, 1, +- 1, (u32)-1); ++ /* ++ * We want to be able to at least push one item to the ++ * left neighbour leaf, and that's the first item. ++ */ ++ min_push_space = sizeof(struct btrfs_item) + ++ btrfs_item_size(leaf, 0); ++ wret = push_leaf_left(trans, root, path, 0, ++ min_push_space, 1, (u32)-1); + if (wret < 0 && wret != -ENOSPC) + ret = wret; + + if (path->nodes[0] == leaf && + btrfs_header_nritems(leaf)) { +- wret = push_leaf_right(trans, root, path, 1, +- 1, 1, 0); ++ /* ++ * If we were not able to push all items from our ++ * leaf to its left neighbour, then attempt to ++ * either push all the remaining items to the ++ * right neighbour or none. There's no advantage ++ * in pushing only some items, instead of all, as ++ * it's pointless to end up with a leaf having ++ * too few items while the neighbours can be full ++ * or nearly full. ++ */ ++ nritems = btrfs_header_nritems(leaf); ++ min_push_space = leaf_space_used(leaf, 0, nritems); ++ wret = push_leaf_right(trans, root, path, 0, ++ min_push_space, 1, 0); + if (wret < 0 && wret != -ENOSPC) + ret = wret; + } +diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h +index 5fe5eccb3c87..223e9d9e1b8b 100644 +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -601,6 +601,9 @@ enum { + /* Indicate whether there are any tree modification log users */ + BTRFS_FS_TREE_MOD_LOG_USERS, + ++ /* Indicate that we want the transaction kthread to commit right now. 
*/ ++ BTRFS_FS_COMMIT_TRANS, ++ + #if BITS_PER_LONG == 32 + /* Indicate if we have error/warn message printed on 32bit systems */ + BTRFS_FS_32BIT_ERROR, +@@ -1603,25 +1606,25 @@ DECLARE_BTRFS_SETGET_BITS(64) + static inline u##bits btrfs_##name(const struct extent_buffer *eb, \ + const type *s) \ + { \ +- BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ ++ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + return btrfs_get_##bits(eb, s, offsetof(type, member)); \ + } \ + static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \ + u##bits val) \ + { \ +- BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ ++ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + btrfs_set_##bits(eb, s, offsetof(type, member), val); \ + } \ + static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \ + const type *s) \ + { \ +- BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ ++ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + return btrfs_get_token_##bits(token, s, offsetof(type, member));\ + } \ + static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ + type *s, u##bits val) \ + { \ +- BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ ++ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + btrfs_set_token_##bits(token, s, offsetof(type, member), val); \ + } + +@@ -1652,8 +1655,8 @@ static inline void btrfs_set_##name(type *s, u##bits val) \ + static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb, + struct btrfs_dev_item *s) + { +- BUILD_BUG_ON(sizeof(u64) != +- sizeof(((struct btrfs_dev_item *)0))->total_bytes); ++ static_assert(sizeof(u64) == ++ sizeof(((struct btrfs_dev_item *)0))->total_bytes); + return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, + total_bytes)); + } +@@ -1661,8 +1664,8 @@ static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb, + struct btrfs_dev_item *s, + u64 val) + { +- BUILD_BUG_ON(sizeof(u64) != +- sizeof(((struct btrfs_dev_item *)0))->total_bytes); ++ static_assert(sizeof(u64) == ++ sizeof(((struct btrfs_dev_item *)0))->total_bytes); + WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize)); + btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val); + } +@@ -1960,8 +1963,8 @@ static inline void btrfs_set_node_key(const struct extent_buffer *eb, + } + + /* struct btrfs_item */ +-BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32); +-BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32); ++BTRFS_SETGET_FUNCS(raw_item_offset, struct btrfs_item, offset, 32); ++BTRFS_SETGET_FUNCS(raw_item_size, struct btrfs_item, size, 32); + BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32); + BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32); + +@@ -1976,25 +1979,36 @@ static inline struct btrfs_item *btrfs_item_nr(int nr) + return (struct btrfs_item *)btrfs_item_nr_offset(nr); + } + +-static inline u32 btrfs_item_end(const struct extent_buffer *eb, +- struct btrfs_item *item) +-{ +- return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item); ++#define BTRFS_ITEM_SETGET_FUNCS(member) \ ++static inline u32 btrfs_item_##member(const struct extent_buffer *eb, \ ++ int slot) \ ++{ \ ++ return btrfs_raw_item_##member(eb, btrfs_item_nr(slot)); \ ++} \ ++static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \ ++ int slot, u32 val) \ ++{ \ ++ btrfs_set_raw_item_##member(eb, 
btrfs_item_nr(slot), val); \ ++} \ ++static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \ ++ int slot) \ ++{ \ ++ struct btrfs_item *item = btrfs_item_nr(slot); \ ++ return btrfs_token_raw_item_##member(token, item); \ ++} \ ++static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \ ++ int slot, u32 val) \ ++{ \ ++ struct btrfs_item *item = btrfs_item_nr(slot); \ ++ btrfs_set_token_raw_item_##member(token, item, val); \ + } + +-static inline u32 btrfs_item_end_nr(const struct extent_buffer *eb, int nr) +-{ +- return btrfs_item_end(eb, btrfs_item_nr(nr)); +-} ++BTRFS_ITEM_SETGET_FUNCS(offset) ++BTRFS_ITEM_SETGET_FUNCS(size); + +-static inline u32 btrfs_item_offset_nr(const struct extent_buffer *eb, int nr) ++static inline u32 btrfs_item_data_end(const struct extent_buffer *eb, int nr) + { +- return btrfs_item_offset(eb, btrfs_item_nr(nr)); +-} +- +-static inline u32 btrfs_item_size_nr(const struct extent_buffer *eb, int nr) +-{ +- return btrfs_item_size(eb, btrfs_item_nr(nr)); ++ return btrfs_item_offset(eb, nr) + btrfs_item_size(eb, nr); + } + + static inline void btrfs_item_key(const struct extent_buffer *eb, +@@ -2463,7 +2477,7 @@ static inline unsigned int leaf_data_end(const struct extent_buffer *leaf) + + if (nr == 0) + return BTRFS_LEAF_DATA_SIZE(leaf->fs_info); +- return btrfs_item_offset_nr(leaf, nr - 1); ++ return btrfs_item_offset(leaf, nr - 1); + } + + /* struct btrfs_file_extent_item */ +@@ -2522,9 +2536,9 @@ BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, + */ + static inline u32 btrfs_file_extent_inline_item_len( + const struct extent_buffer *eb, +- struct btrfs_item *e) ++ int nr) + { +- return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START; ++ return btrfs_item_size(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START; + } + + /* btrfs_qgroup_status_item */ +@@ -2616,11 +2630,11 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, + /* helper function to cast into the data area of the leaf. 
*/ + #define btrfs_item_ptr(leaf, slot, type) \ + ((type *)(BTRFS_LEAF_DATA_OFFSET + \ +- btrfs_item_offset_nr(leaf, slot))) ++ btrfs_item_offset(leaf, slot))) + + #define btrfs_item_ptr_offset(leaf, slot) \ + ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \ +- btrfs_item_offset_nr(leaf, slot))) ++ btrfs_item_offset(leaf, slot))) + + static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length) + { +@@ -3297,9 +3311,27 @@ int btrfs_fileattr_set(struct user_namespace *mnt_userns, + int btrfs_ioctl_get_supported_features(void __user *arg); + void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); + int __pure btrfs_is_empty_uuid(u8 *uuid); ++ ++struct btrfs_defrag_ctrl { ++ /* Input, read-only fields */ ++ u64 start; ++ u64 len; ++ u32 extent_thresh; ++ u64 newer_than; ++ u64 max_sectors_to_defrag; ++ u8 compress; ++ u8 flags; ++ ++ /* Output fields */ ++ u64 sectors_defragged; ++ u64 last_scanned; /* Exclusive bytenr */ ++}; ++int btrfs_defrag_ioctl_args_to_ctrl(struct btrfs_fs_info *fs_info, ++ struct btrfs_ioctl_defrag_range_args *args, ++ struct btrfs_defrag_ctrl *ctrl, ++ u64 max_sectors_to_defrag, u64 newer_than); + int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, +- struct btrfs_ioctl_defrag_range_args *range, +- u64 newer_than, unsigned long max_to_defrag); ++ struct btrfs_defrag_ctrl *ctrl); + void btrfs_get_block_group_info(struct list_head *groups_list, + struct btrfs_ioctl_space_info *space); + void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, +diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c +index e0238dd5f2f2..66fa61cb3f23 100644 +--- a/fs/btrfs/dev-replace.c ++++ b/fs/btrfs/dev-replace.c +@@ -128,7 +128,7 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) + } + slot = path->slots[0]; + eb = path->nodes[0]; +- item_size = btrfs_item_size_nr(eb, slot); ++ item_size = btrfs_item_size(eb, slot); + ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item); + + if (item_size != sizeof(struct btrfs_dev_replace_item)) { +@@ -381,7 +381,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) + } + + if (ret == 0 && +- btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { ++ btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { + /* + * need to delete old one and insert a new one. 
+ * Since no attempt is made to recover any old state, if the +diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c +index 7721ce0c0604..3b532bab0755 100644 +--- a/fs/btrfs/dir-item.c ++++ b/fs/btrfs/dir-item.c +@@ -27,7 +27,6 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + char *ptr; +- struct btrfs_item *item; + struct extent_buffer *leaf; + + ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); +@@ -41,10 +40,9 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle + return ERR_PTR(ret); + WARN_ON(ret > 0); + leaf = path->nodes[0]; +- item = btrfs_item_nr(path->slots[0]); + ptr = btrfs_item_ptr(leaf, path->slots[0], char); +- BUG_ON(data_size > btrfs_item_size(leaf, item)); +- ptr += btrfs_item_size(leaf, item) - data_size; ++ ASSERT(data_size <= btrfs_item_size(leaf, path->slots[0])); ++ ptr += btrfs_item_size(leaf, path->slots[0]) - data_size; + return (struct btrfs_dir_item *)ptr; + } + +@@ -271,7 +269,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, + data_size = sizeof(*di) + name_len; + leaf = path->nodes[0]; + slot = path->slots[0]; +- if (data_size + btrfs_item_size_nr(leaf, slot) + ++ if (data_size + btrfs_item_size(leaf, slot) + + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root->fs_info)) { + ret = -EOVERFLOW; + } else { +@@ -409,7 +407,7 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, + leaf = path->nodes[0]; + dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); + +- total_len = btrfs_item_size_nr(leaf, path->slots[0]); ++ total_len = btrfs_item_size(leaf, path->slots[0]); + while (cur < total_len) { + this_len = sizeof(*dir_item) + + btrfs_dir_name_len(leaf, dir_item) + +@@ -445,7 +443,7 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, + leaf = path->nodes[0]; + sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) + + btrfs_dir_data_len(leaf, di); +- item_len = btrfs_item_size_nr(leaf, path->slots[0]); ++ item_len = btrfs_item_size(leaf, path->slots[0]); + if (sub_item_len == item_len) { + ret = btrfs_del_item(trans, root, path); + } else { +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c +index 5f0a879c1043..e4275da0572c 100644 +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -1935,7 +1935,8 @@ static int transaction_kthread(void *arg) + } + + delta = ktime_get_seconds() - cur->start_time; +- if (cur->state < TRANS_STATE_COMMIT_START && ++ if (!test_and_clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags) && ++ cur->state < TRANS_STATE_COMMIT_START && + delta < fs_info->commit_interval) { + spin_unlock(&fs_info->trans_lock); + delay -= msecs_to_jiffies((delta - 1) * 1000); +diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c +index 7b4ee1b2d5d8..91ca32c9459a 100644 +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -171,7 +171,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, + + if (ret == 0) { + leaf = path->nodes[0]; +- item_size = btrfs_item_size_nr(leaf, path->slots[0]); ++ item_size = btrfs_item_size(leaf, path->slots[0]); + if (item_size >= sizeof(*ei)) { + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); +@@ -865,7 +865,7 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, + } + + leaf = path->nodes[0]; +- item_size = btrfs_item_size_nr(leaf, path->slots[0]); ++ item_size = btrfs_item_size(leaf, path->slots[0]); + if (unlikely(item_size 
< sizeof(*ei))) { + err = -EINVAL; + btrfs_print_v0_err(fs_info); +@@ -1007,7 +1007,7 @@ void setup_inline_extent_backref(struct btrfs_fs_info *fs_info, + __run_delayed_extent_op(extent_op, leaf, ei); + + ptr = (unsigned long)ei + item_offset; +- end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]); ++ end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]); + if (ptr < end - size) + memmove_extent_buffer(leaf, ptr + size, ptr, + end - size - ptr); +@@ -1119,7 +1119,7 @@ void update_inline_extent_backref(struct btrfs_path *path, + } else { + *last_ref = 1; + size = btrfs_extent_inline_ref_size(type); +- item_size = btrfs_item_size_nr(leaf, path->slots[0]); ++ item_size = btrfs_item_size(leaf, path->slots[0]); + ptr = (unsigned long)iref; + end = (unsigned long)ei + item_size; + if (ptr + size < end) +@@ -1634,7 +1634,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, + } + + leaf = path->nodes[0]; +- item_size = btrfs_item_size_nr(leaf, path->slots[0]); ++ item_size = btrfs_item_size(leaf, path->slots[0]); + + if (unlikely(item_size < sizeof(*ei))) { + err = -EINVAL; +@@ -2316,7 +2316,7 @@ static noinline int check_committed_ref(struct btrfs_root *root, + goto out; + + ret = 1; +- item_size = btrfs_item_size_nr(leaf, path->slots[0]); ++ item_size = btrfs_item_size(leaf, path->slots[0]); + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + + /* If extent item has more than 1 inline ref then it's shared */ +@@ -3068,7 +3068,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + } + + leaf = path->nodes[0]; +- item_size = btrfs_item_size_nr(leaf, extent_slot); ++ item_size = btrfs_item_size(leaf, extent_slot); + if (unlikely(item_size < sizeof(*ei))) { + ret = -EINVAL; + btrfs_print_v0_err(info); +diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c +index d1cbb64a78f3..107d6557ebc3 100644 +--- a/fs/btrfs/file-item.c ++++ b/fs/btrfs/file-item.c +@@ -208,7 +208,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans, + + csum_offset = (bytenr - found_key.offset) >> + fs_info->sectorsize_bits; +- csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]); ++ csums_in_item = btrfs_item_size(leaf, path->slots[0]); + csums_in_item /= csum_size; + + if (csum_offset == csums_in_item) { +@@ -274,7 +274,7 @@ static int search_csum_tree(struct btrfs_fs_info *fs_info, + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_csum_item); + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); +- itemsize = btrfs_item_size_nr(path->nodes[0], path->slots[0]); ++ itemsize = btrfs_item_size(path->nodes[0], path->slots[0]); + + csum_start = key.offset; + csum_len = (itemsize / csum_size) * sectorsize; +@@ -291,7 +291,7 @@ static int search_csum_tree(struct btrfs_fs_info *fs_info, + goto out; + } + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); +- itemsize = btrfs_item_size_nr(path->nodes[0], path->slots[0]); ++ itemsize = btrfs_item_size(path->nodes[0], path->slots[0]); + + csum_start = key.offset; + csum_len = (itemsize / csum_size) * sectorsize; +@@ -534,7 +534,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, + key.type == BTRFS_EXTENT_CSUM_KEY) { + offset = (start - key.offset) >> fs_info->sectorsize_bits; + if (offset * csum_size < +- btrfs_item_size_nr(leaf, path->slots[0] - 1)) ++ btrfs_item_size(leaf, path->slots[0] - 1)) + path->slots[0]--; + } + } +@@ -559,7 +559,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, + if (key.offset 
> start) + start = key.offset; + +- size = btrfs_item_size_nr(leaf, path->slots[0]); ++ size = btrfs_item_size(leaf, path->slots[0]); + csum_end = key.offset + (size / csum_size) * fs_info->sectorsize; + if (csum_end <= start) { + path->slots[0]++; +@@ -750,7 +750,7 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, + u32 blocksize_bits = fs_info->sectorsize_bits; + + leaf = path->nodes[0]; +- csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; ++ csum_end = btrfs_item_size(leaf, path->slots[0]) / csum_size; + csum_end <<= blocksize_bits; + csum_end += key->offset; + +@@ -834,7 +834,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, + if (key.offset >= end_byte) + break; + +- csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; ++ csum_end = btrfs_item_size(leaf, path->slots[0]) / csum_size; + csum_end <<= blocksize_bits; + csum_end += key.offset; + +@@ -1002,7 +1002,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + item_end = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_csum_item); + item_end = (struct btrfs_csum_item *)((char *)item_end + +- btrfs_item_size_nr(leaf, path->slots[0])); ++ btrfs_item_size(leaf, path->slots[0])); + goto found; + } + ret = PTR_ERR(item); +@@ -1013,7 +1013,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + u32 item_size; + /* we found one, but it isn't big enough yet */ + leaf = path->nodes[0]; +- item_size = btrfs_item_size_nr(leaf, path->slots[0]); ++ item_size = btrfs_item_size(leaf, path->slots[0]); + if ((item_size / csum_size) >= + MAX_CSUM_ITEMS(fs_info, csum_size)) { + /* already at max size, make a new one */ +@@ -1070,7 +1070,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + } + + extend_csum: +- if (csum_offset == btrfs_item_size_nr(leaf, path->slots[0]) / ++ if (csum_offset == btrfs_item_size(leaf, path->slots[0]) / + csum_size) { + int extend_nr; + u64 tmp; +@@ -1125,7 +1125,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + diff = min(diff, + MAX_CSUM_ITEMS(fs_info, csum_size) * csum_size); + +- diff = diff - btrfs_item_size_nr(leaf, path->slots[0]); ++ diff = diff - btrfs_item_size(leaf, path->slots[0]); + diff = min_t(u32, btrfs_leaf_free_space(leaf), diff); + diff /= csum_size; + diff *= csum_size; +@@ -1162,7 +1162,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + csum: + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); + item_end = (struct btrfs_csum_item *)((unsigned char *)item + +- btrfs_item_size_nr(leaf, path->slots[0])); ++ btrfs_item_size(leaf, path->slots[0])); + item = (struct btrfs_csum_item *)((unsigned char *)item + + csum_offset * csum_size); + found: +@@ -1208,6 +1208,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, + extent_start = key.offset; + extent_end = btrfs_file_extent_end(path); + em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); ++ em->generation = btrfs_file_extent_generation(leaf, fi); + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + em->start = extent_start; +diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c +index 11204dbbe053..12e63be6a35b 100644 +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -277,8 +277,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, + { + struct btrfs_root *inode_root; + struct inode *inode; +- struct btrfs_ioctl_defrag_range_args range; +- int num_defrag; ++ struct btrfs_defrag_ctrl ctrl = {0}; + int ret; + + /* get the inode */ +@@ -297,21 
+296,23 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, + + /* do a chunk of defrag */ + clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); +- memset(&range, 0, sizeof(range)); +- range.len = (u64)-1; +- range.start = defrag->last_offset; ++ ctrl.len = (u64)-1; ++ ctrl.start = defrag->last_offset; ++ ctrl.newer_than = defrag->transid; ++ ctrl.max_sectors_to_defrag = BTRFS_DEFRAG_BATCH; + + sb_start_write(fs_info->sb); +- num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, +- BTRFS_DEFRAG_BATCH); ++ ret = btrfs_defrag_file(inode, NULL, &ctrl); + sb_end_write(fs_info->sb); ++ if (ret < 0) ++ goto out; + /* + * if we filled the whole defrag batch, there + * must be more work to do. Queue this defrag + * again + */ +- if (num_defrag == BTRFS_DEFRAG_BATCH) { +- defrag->last_offset = range.start; ++ if (ctrl.sectors_defragged == BTRFS_DEFRAG_BATCH) { ++ defrag->last_offset = ctrl.last_scanned; + btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag); + } else if (defrag->last_offset && !defrag->cycled) { + /* +@@ -325,7 +326,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, + } else { + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + } +- ++out: + iput(inode); + return 0; + cleanup: +@@ -718,7 +719,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, + int modify_tree = -1; + int update_refs; + int found = 0; +- int leafs_visited = 0; + struct btrfs_path *path = args->path; + + args->bytes_found = 0; +@@ -756,7 +756,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, + path->slots[0]--; + } + ret = 0; +- leafs_visited++; + next_slot: + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { +@@ -768,7 +767,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, + ret = 0; + break; + } +- leafs_visited++; + leaf = path->nodes[0]; + recow = 1; + } +@@ -1014,7 +1012,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, + * which case it unlocked our path, so check path->locks[0] matches a + * write lock. + */ +- if (!ret && args->replace_extent && leafs_visited == 1 && ++ if (!ret && args->replace_extent && + path->locks[0] == BTRFS_WRITE_LOCK && + btrfs_leaf_free_space(leaf) >= + sizeof(struct btrfs_item) + args->extent_item_size) { +diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c +index f3fee88c8ee0..a45017b12185 100644 +--- a/fs/btrfs/free-space-cache.c ++++ b/fs/btrfs/free-space-cache.c +@@ -1580,6 +1580,50 @@ static int tree_insert_offset(struct rb_root *root, u64 offset, + return 0; + } + ++/* ++ * This is a little subtle. We *only* have ->max_extent_size set if we actually ++ * searched through the bitmap and figured out the largest ->max_extent_size, ++ * otherwise it's 0. In the case that it's 0 we don't want to tell the ++ * allocator the wrong thing, we want to use the actual real max_extent_size ++ * we've found already if it's larger, or we want to use ->bytes. ++ * ++ * This matters because find_free_space() will skip entries whose ->bytes is ++ * less than the required bytes. So if we didn't search down this bitmap, we ++ * may pick some previous entry that has a smaller ->max_extent_size than we ++ * have. For example, assume we have two entries, one that has ++ * ->max_extent_size set to 4K and ->bytes set to 1M. A second entry hasn't set ++ * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous.
We will ++ * call into find_free_space(), and return with max_extent_size == 4K, because ++ * that first bitmap entry had ->max_extent_size set, but the second one did ++ * not. If instead we returned 8K we'd come in searching for 8K, and find the ++ * 8K contiguous range. ++ * ++ * Consider the other case, we have 2 8K chunks in that second entry and still ++ * don't have ->max_extent_size set. We'll return 16K, and the next time the ++ * allocator comes in it'll fully search our second bitmap, and this time it'll ++ * get an uptodate value of 8K as the maximum chunk size. Then we'll get the ++ * right allocation the next loop through. ++ */ ++static inline u64 get_max_extent_size(const struct btrfs_free_space *entry) ++{ ++ if (entry->bitmap && entry->max_extent_size) ++ return entry->max_extent_size; ++ return entry->bytes; ++} ++ ++/* ++ * We want the largest entry to be leftmost, so this is inverted from what you'd ++ * normally expect. ++ */ ++static bool entry_less(struct rb_node *node, const struct rb_node *parent) ++{ ++ const struct btrfs_free_space *entry, *exist; ++ ++ entry = rb_entry(node, struct btrfs_free_space, bytes_index); ++ exist = rb_entry(parent, struct btrfs_free_space, bytes_index); ++ return get_max_extent_size(exist) < get_max_extent_size(entry); ++} ++ + /* + * searches the tree for the given offset. + * +@@ -1592,15 +1636,10 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, + u64 offset, int bitmap_only, int fuzzy) + { + struct rb_node *n = ctl->free_space_offset.rb_node; +- struct btrfs_free_space *entry, *prev = NULL; ++ struct btrfs_free_space *entry = NULL, *prev = NULL; + + /* find entry that is closest to the 'offset' */ +- while (1) { +- if (!n) { +- entry = NULL; +- break; +- } +- ++ while (n) { + entry = rb_entry(n, struct btrfs_free_space, offset_index); + prev = entry; + +@@ -1610,6 +1649,8 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, + n = n->rb_right; + else + break; ++ ++ entry = NULL; + } + + if (bitmap_only) { +@@ -1686,6 +1727,10 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, + return NULL; + + while (1) { ++ n = rb_next(&entry->offset_index); ++ if (!n) ++ return NULL; ++ entry = rb_entry(n, struct btrfs_free_space, offset_index); + if (entry->bitmap) { + if (entry->offset + BITS_PER_BITMAP * + ctl->unit > offset) +@@ -1694,11 +1739,6 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, + if (entry->offset + entry->bytes > offset) + break; + } +- +- n = rb_next(&entry->offset_index); +- if (!n) +- return NULL; +- entry = rb_entry(n, struct btrfs_free_space, offset_index); + } + return entry; + } +@@ -1708,6 +1748,7 @@ __unlink_free_space(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) + { + rb_erase(&info->offset_index, &ctl->free_space_offset); ++ rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); + ctl->free_extents--; + + if (!info->bitmap && !btrfs_free_space_trimmed(info)) { +@@ -1734,6 +1775,8 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, + if (ret) + return ret; + ++ rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); ++ + if (!info->bitmap && !btrfs_free_space_trimmed(info)) { + ctl->discardable_extents[BTRFS_STAT_CURR]++; + ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; +@@ -1744,6 +1787,22 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, + return ret; + } + ++static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl, ++ struct btrfs_free_space *info) ++{ ++ ASSERT(info->bitmap); ++ ++ /* ++ * If our entry is 
empty it's because we're on a cluster and we don't ++ * want to re-link it into our ctl bytes index. ++ */ ++ if (RB_EMPTY_NODE(&info->bytes_index)) ++ return; ++ ++ rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); ++ rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); ++} ++ + static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, + u64 offset, u64 bytes) +@@ -1762,6 +1821,8 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, + if (info->max_extent_size > ctl->unit) + info->max_extent_size = 0; + ++ relink_bitmap_entry(ctl, info); ++ + if (start && test_bit(start - 1, info->bitmap)) + extent_delta++; + +@@ -1797,9 +1858,16 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, + + bitmap_set(info->bitmap, start, count); + ++ /* ++ * We set some bytes, we have no idea what the max extent size is ++ * anymore. ++ */ ++ info->max_extent_size = 0; + info->bytes += bytes; + ctl->free_space += bytes; + ++ relink_bitmap_entry(ctl, info); ++ + if (start && test_bit(start - 1, info->bitmap)) + extent_delta--; + +@@ -1867,20 +1935,14 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, + + *bytes = (u64)(max_bits) * ctl->unit; + bitmap_info->max_extent_size = *bytes; ++ relink_bitmap_entry(ctl, bitmap_info); + return -1; + } + +-static inline u64 get_max_extent_size(struct btrfs_free_space *entry) +-{ +- if (entry->bitmap) +- return entry->max_extent_size; +- return entry->bytes; +-} +- + /* Cache the size of the max extent in bytes */ + static struct btrfs_free_space * + find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, +- unsigned long align, u64 *max_extent_size) ++ unsigned long align, u64 *max_extent_size, bool use_bytes_index) + { + struct btrfs_free_space *entry; + struct rb_node *node; +@@ -1890,16 +1952,38 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, + + if (!ctl->free_space_offset.rb_node) + goto out; ++again: ++ if (use_bytes_index) { ++ node = rb_first_cached(&ctl->free_space_bytes); ++ } else { ++ entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), ++ 0, 1); ++ if (!entry) ++ goto out; ++ node = &entry->offset_index; ++ } + +- entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1); +- if (!entry) +- goto out; ++ for (; node; node = rb_next(node)) { ++ if (use_bytes_index) ++ entry = rb_entry(node, struct btrfs_free_space, ++ bytes_index); ++ else ++ entry = rb_entry(node, struct btrfs_free_space, ++ offset_index); + +- for (node = &entry->offset_index; node; node = rb_next(node)) { +- entry = rb_entry(node, struct btrfs_free_space, offset_index); ++ /* ++ * If we are using the bytes index then all subsequent entries ++ * in this tree are going to be < bytes, so simply set the max ++ * extent size and exit the loop. ++ * ++ * If we're using the offset index then we need to keep going ++ * through the rest of the tree. ++ */ + if (entry->bytes < *bytes) { + *max_extent_size = max(get_max_extent_size(entry), + *max_extent_size); ++ if (use_bytes_index) ++ break; + continue; + } + +@@ -1916,6 +2000,13 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, + tmp = entry->offset; + } + ++ /* ++ * We don't break here if we're using the bytes index because we ++ * may have another entry that has the correct alignment that is ++ * the right size, so we don't want to miss that possibility. 
++ * At worst this adds another loop through the logic, but if we ++ * broke here we could prematurely ENOSPC. ++ */ + if (entry->bytes < *bytes + align_off) { + *max_extent_size = max(get_max_extent_size(entry), + *max_extent_size); +@@ -1923,6 +2014,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, + } + + if (entry->bitmap) { ++ struct rb_node *old_next = rb_next(node); + u64 size = *bytes; + + ret = search_bitmap(ctl, entry, &tmp, &size, true); +@@ -1935,6 +2027,15 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, + max(get_max_extent_size(entry), + *max_extent_size); + } ++ ++ /* ++ * The bitmap may have gotten re-arranged in the space ++ * index here because the max_extent_size may have been ++ * updated. Start from the beginning again if this ++ * happened. ++ */ ++ if (use_bytes_index && old_next != rb_next(node)) ++ goto again; + continue; + } + +@@ -2083,12 +2184,6 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, + + bitmap_set_bits(ctl, info, offset, bytes_to_set); + +- /* +- * We set some bytes, we have no idea what the max extent size is +- * anymore. +- */ +- info->max_extent_size = 0; +- + return bytes_to_set; + + } +@@ -2486,6 +2581,7 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, + info->bytes = bytes; + info->trim_state = trim_state; + RB_CLEAR_NODE(&info->offset_index); ++ RB_CLEAR_NODE(&info->bytes_index); + + spin_lock(&ctl->tree_lock); + +@@ -2799,6 +2895,7 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, + ctl->start = block_group->start; + ctl->private = block_group; + ctl->op = &free_space_op; ++ ctl->free_space_bytes = RB_ROOT_CACHED; + INIT_LIST_HEAD(&ctl->trimming_ranges); + mutex_init(&ctl->cache_writeout_mutex); + +@@ -2864,6 +2961,8 @@ static void __btrfs_return_cluster_to_free_space( + } + tree_insert_offset(&ctl->free_space_offset, + entry->offset, &entry->offset_index, bitmap); ++ rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes, ++ entry_less); + } + cluster->root = RB_ROOT; + spin_unlock(&cluster->lock); +@@ -2965,12 +3064,14 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, + u64 align_gap = 0; + u64 align_gap_len = 0; + enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED; ++ bool use_bytes_index = (offset == block_group->start); + + ASSERT(!btrfs_is_zoned(block_group->fs_info)); + + spin_lock(&ctl->tree_lock); + entry = find_free_space(ctl, &offset, &bytes_search, +- block_group->full_stripe_len, max_extent_size); ++ block_group->full_stripe_len, max_extent_size, ++ use_bytes_index); + if (!entry) + goto out; + +@@ -3254,6 +3355,17 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group, + + cluster->window_start = start * ctl->unit + entry->offset; + rb_erase(&entry->offset_index, &ctl->free_space_offset); ++ rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); ++ ++ /* ++ * We need to know if we're currently on the normal space index when we ++ * manipulate the bitmap so that we know we need to remove and re-insert ++ * it into the space_index tree. Clear the bytes_index node here so the ++ * bitmap manipulation helpers know not to mess with the space_index ++ * until this bitmap entry is added back into the normal cache. 
++ */ ++ RB_CLEAR_NODE(&entry->bytes_index); ++ + ret = tree_insert_offset(&cluster->root, entry->offset, + &entry->offset_index, 1); + ASSERT(!ret); /* -EEXIST; Logic error */ +@@ -3344,6 +3456,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group *block_group, + continue; + + rb_erase(&entry->offset_index, &ctl->free_space_offset); ++ rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); + ret = tree_insert_offset(&cluster->root, entry->offset, + &entry->offset_index, 0); + total_size += entry->bytes; +diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h +index 1f23088d43f9..dd982d204d2d 100644 +--- a/fs/btrfs/free-space-cache.h ++++ b/fs/btrfs/free-space-cache.h +@@ -22,6 +22,7 @@ enum btrfs_trim_state { + + struct btrfs_free_space { + struct rb_node offset_index; ++ struct rb_node bytes_index; + u64 offset; + u64 bytes; + u64 max_extent_size; +@@ -45,6 +46,7 @@ static inline bool btrfs_free_space_trimming_bitmap( + struct btrfs_free_space_ctl { + spinlock_t tree_lock; + struct rb_root free_space_offset; ++ struct rb_root_cached free_space_bytes; + u64 free_space; + int extents_thresh; + int free_extents; +diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c +index 37f36ffdaf6b..56755ce9a907 100644 +--- a/fs/btrfs/inode-item.c ++++ b/fs/btrfs/inode-item.c +@@ -19,7 +19,7 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, + u32 cur_offset = 0; + int len; + +- item_size = btrfs_item_size_nr(leaf, slot); ++ item_size = btrfs_item_size(leaf, slot); + ptr = btrfs_item_ptr_offset(leaf, slot); + while (cur_offset < item_size) { + ref = (struct btrfs_inode_ref *)(ptr + cur_offset); +@@ -45,7 +45,7 @@ struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( + u32 cur_offset = 0; + int ref_name_len; + +- item_size = btrfs_item_size_nr(leaf, slot); ++ item_size = btrfs_item_size(leaf, slot); + ptr = btrfs_item_ptr_offset(leaf, slot); + + /* +@@ -139,7 +139,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, + } + + leaf = path->nodes[0]; +- item_size = btrfs_item_size_nr(leaf, path->slots[0]); ++ item_size = btrfs_item_size(leaf, path->slots[0]); + if (index) + *index = btrfs_inode_extref_index(leaf, extref); + +@@ -208,7 +208,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, + goto out; + } + leaf = path->nodes[0]; +- item_size = btrfs_item_size_nr(leaf, path->slots[0]); ++ item_size = btrfs_item_size(leaf, path->slots[0]); + + if (index) + *index = btrfs_inode_ref_index(leaf, ref); +@@ -256,7 +256,6 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *leaf; +- struct btrfs_item *item; + + key.objectid = inode_objectid; + key.type = BTRFS_INODE_EXTREF_KEY; +@@ -282,9 +281,8 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, + goto out; + + leaf = path->nodes[0]; +- item = btrfs_item_nr(path->slots[0]); + ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char); +- ptr += btrfs_item_size(leaf, item) - ins_len; ++ ptr += btrfs_item_size(leaf, path->slots[0]) - ins_len; + extref = (struct btrfs_inode_extref *)ptr; + + btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len); +@@ -332,7 +330,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, + if (ref) + goto out; + +- old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); ++ old_size = btrfs_item_size(path->nodes[0], path->slots[0]); + btrfs_extend_item(path, ins_len); + ref = 
btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_ref); +diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c +index 39a674543461..0ed8cc6afa37 100644 +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -61,8 +61,6 @@ struct btrfs_iget_args { + }; + + struct btrfs_dio_data { +- u64 reserve; +- loff_t length; + ssize_t submitted; + struct extent_changeset *data_reserved; + }; +@@ -625,7 +623,6 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) + again: + will_compress = 0; + nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; +- BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0); + nr_pages = min_t(unsigned long, nr_pages, + BTRFS_MAX_COMPRESSED / PAGE_SIZE); + +@@ -5950,21 +5947,17 @@ static struct inode *new_simple_dir(struct super_block *s, + return inode; + } + ++static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN); ++static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE); ++static_assert(BTRFS_FT_DIR == FT_DIR); ++static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV); ++static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV); ++static_assert(BTRFS_FT_FIFO == FT_FIFO); ++static_assert(BTRFS_FT_SOCK == FT_SOCK); ++static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK); ++ + static inline u8 btrfs_inode_type(struct inode *inode) + { +- /* +- * Compile-time asserts that generic FT_* types still match +- * BTRFS_FT_* types +- */ +- BUILD_BUG_ON(BTRFS_FT_UNKNOWN != FT_UNKNOWN); +- BUILD_BUG_ON(BTRFS_FT_REG_FILE != FT_REG_FILE); +- BUILD_BUG_ON(BTRFS_FT_DIR != FT_DIR); +- BUILD_BUG_ON(BTRFS_FT_CHRDEV != FT_CHRDEV); +- BUILD_BUG_ON(BTRFS_FT_BLKDEV != FT_BLKDEV); +- BUILD_BUG_ON(BTRFS_FT_FIFO != FT_FIFO); +- BUILD_BUG_ON(BTRFS_FT_SOCK != FT_SOCK); +- BUILD_BUG_ON(BTRFS_FT_SYMLINK != FT_SYMLINK); +- + return fs_umode_to_ftype(inode->i_mode); + } + +@@ -6998,8 +6991,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, + WARN_ON(pg_offset != 0); + compress_type = btrfs_file_extent_compression(leaf, item); + max_size = btrfs_file_extent_ram_bytes(leaf, item); +- inline_size = btrfs_file_extent_inline_item_len(leaf, +- btrfs_item_nr(path->slots[0])); ++ inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]); + tmp = kmalloc(inline_size, GFP_NOFS); + if (!tmp) + return -ENOMEM; +@@ -7773,6 +7765,10 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, + { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct extent_map *em = *map; ++ int type; ++ u64 block_start, orig_start, orig_block_len, ram_bytes; ++ bool can_nocow = false; ++ bool space_reserved = false; + int ret = 0; + + /* +@@ -7787,9 +7783,6 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || + ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + em->block_start != EXTENT_MAP_HOLE)) { +- int type; +- u64 block_start, orig_start, orig_block_len, ram_bytes; +- + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + type = BTRFS_ORDERED_PREALLOC; + else +@@ -7799,53 +7792,92 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, + + if (can_nocow_extent(inode, start, &len, &orig_start, + &orig_block_len, &ram_bytes, false) == 1 && +- btrfs_inc_nocow_writers(fs_info, block_start)) { +- struct extent_map *em2; ++ btrfs_inc_nocow_writers(fs_info, block_start)) ++ can_nocow = true; ++ } + +- em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len, +- orig_start, block_start, +- len, orig_block_len, +- ram_bytes, type); ++ if (can_nocow) { ++ struct extent_map *em2; ++ ++ /* We can NOCOW, so only need to 
reserve metadata space. */ ++ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); ++ if (ret < 0) { ++ /* Our caller expects us to free the input extent map. */ ++ free_extent_map(em); ++ *map = NULL; + btrfs_dec_nocow_writers(fs_info, block_start); +- if (type == BTRFS_ORDERED_PREALLOC) { +- free_extent_map(em); +- *map = em = em2; +- } +- +- if (em2 && IS_ERR(em2)) { +- ret = PTR_ERR(em2); +- goto out; +- } +- /* +- * For inode marked NODATACOW or extent marked PREALLOC, +- * use the existing or preallocated extent, so does not +- * need to adjust btrfs_space_info's bytes_may_use. +- */ +- btrfs_free_reserved_data_space_noquota(fs_info, len); +- goto skip_cow; ++ goto out; + } ++ space_reserved = true; ++ ++ em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len, ++ orig_start, block_start, ++ len, orig_block_len, ++ ram_bytes, type); ++ btrfs_dec_nocow_writers(fs_info, block_start); ++ if (type == BTRFS_ORDERED_PREALLOC) { ++ free_extent_map(em); ++ *map = em = em2; ++ } ++ ++ if (IS_ERR(em2)) { ++ ret = PTR_ERR(em2); ++ goto out; ++ } ++ } else { ++ const u64 prev_len = len; ++ ++ /* Our caller expects us to free the input extent map. */ ++ free_extent_map(em); ++ *map = NULL; ++ ++ /* We have to COW, so need to reserve metadata and data space. */ ++ ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), ++ &dio_data->data_reserved, ++ start, len); ++ if (ret < 0) ++ goto out; ++ space_reserved = true; ++ ++ em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); ++ if (IS_ERR(em)) { ++ ret = PTR_ERR(em); ++ goto out; ++ } ++ *map = em; ++ len = min(len, em->len - (start - em->start)); ++ if (len < prev_len) ++ btrfs_delalloc_release_space(BTRFS_I(inode), ++ dio_data->data_reserved, ++ start + len, prev_len - len, ++ true); + } + +- /* this will cow the extent */ +- free_extent_map(em); +- *map = em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); +- if (IS_ERR(em)) { +- ret = PTR_ERR(em); +- goto out; +- } ++ /* ++ * We have created our ordered extent, so we can now release our reservation ++ * for an outstanding extent. ++ */ ++ btrfs_delalloc_release_extents(BTRFS_I(inode), len); + +- len = min(len, em->len - (start - em->start)); +- +-skip_cow: + /* + * Need to update the i_size under the extent lock so buffered + * readers will get the updated i_size when we unlock. 
+ */ + if (start + len > i_size_read(inode)) + i_size_write(inode, start + len); +- +- dio_data->reserve -= len; + out: ++ if (ret && space_reserved) { ++ btrfs_delalloc_release_extents(BTRFS_I(inode), len); ++ if (can_nocow) { ++ btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); ++ } else { ++ btrfs_delalloc_release_space(BTRFS_I(inode), ++ dio_data->data_reserved, ++ start, len, true); ++ extent_changeset_free(dio_data->data_reserved); ++ dio_data->data_reserved = NULL; ++ } ++ } + return ret; + } + +@@ -7887,18 +7919,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, + if (!dio_data) + return -ENOMEM; + +- dio_data->length = length; +- if (write) { +- dio_data->reserve = round_up(length, fs_info->sectorsize); +- ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), +- &dio_data->data_reserved, +- start, dio_data->reserve); +- if (ret) { +- extent_changeset_free(dio_data->data_reserved); +- kfree(dio_data); +- return ret; +- } +- } + iomap->private = dio_data; + + +@@ -7991,14 +8011,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state); + err: +- if (dio_data) { +- btrfs_delalloc_release_space(BTRFS_I(inode), +- dio_data->data_reserved, start, +- dio_data->reserve, true); +- btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve); +- extent_changeset_free(dio_data->data_reserved); +- kfree(dio_data); +- } ++ kfree(dio_data); ++ + return ret; + } + +@@ -8028,14 +8042,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, + ret = -ENOTBLK; + } + +- if (write) { +- if (dio_data->reserve) +- btrfs_delalloc_release_space(BTRFS_I(inode), +- dio_data->data_reserved, pos, +- dio_data->reserve, true); +- btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length); ++ if (write) + extent_changeset_free(dio_data->data_reserved); +- } + out: + kfree(dio_data); + iomap->private = NULL; +diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c +index 48e03e176f31..5de240144273 100644 +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -1020,23 +1020,37 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, + return em; + } + ++static u32 get_extent_max_capacity(const struct extent_map *em) ++{ ++ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) ++ return BTRFS_MAX_COMPRESSED; ++ return BTRFS_MAX_EXTENT_SIZE; ++} ++ + static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, + bool locked) + { + struct extent_map *next; +- bool ret = true; ++ bool ret = false; + + /* this is the last extent */ + if (em->start + em->len >= i_size_read(inode)) +- return false; ++ return ret; + + next = defrag_lookup_extent(inode, em->start + em->len, locked); ++ /* No more em or hole */ + if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) +- ret = false; +- else if ((em->block_start + em->block_len == next->block_start) && +- (em->block_len > SZ_128K && next->block_len > SZ_128K)) +- ret = false; +- ++ goto out; ++ if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags)) ++ goto out; ++ /* ++ * If the next extent is at its max capacity, defragging the current ++ * extent makes no sense, as the total number of extents won't change. ++ */ ++ if (next->len >= get_extent_max_capacity(em)) ++ goto out; ++ ret = true; ++out: + free_extent_map(next); + return ret; + } +@@ -1146,22 +1160,21 @@ struct defrag_target_range { + /* + * Collect all valid target extents.
+ * ++ * @ctrl: extra defrag policy control + * @start: file offset to lookup + * @len: length to lookup +- * @extent_thresh: file extent size threshold, any extent size >= this value +- * will be ignored +- * @newer_than: only defrag extents newer than this value +- * @do_compress: whether the defrag is doing compression +- * if true, @extent_thresh will be ignored and all regular +- * file extents meeting @newer_than will be targets. + * @locked: if the range has already held extent lock + * @target_list: list of targets file extents ++ * ++ * Will update ctrl::last_scanned. + */ + static int defrag_collect_targets(struct btrfs_inode *inode, +- u64 start, u64 len, u32 extent_thresh, +- u64 newer_than, bool do_compress, +- bool locked, struct list_head *target_list) ++ struct btrfs_defrag_ctrl *ctrl, ++ u64 start, u32 len, bool locked, ++ struct list_head *target_list) + { ++ bool do_compress = ctrl->flags & BTRFS_DEFRAG_RANGE_COMPRESS; ++ bool last_is_target = false; + u64 cur = start; + int ret = 0; + +@@ -1171,6 +1184,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, + bool next_mergeable = true; + u64 range_len; + ++ last_is_target = false; + em = defrag_lookup_extent(&inode->vfs_inode, cur, locked); + if (!em) + break; +@@ -1181,7 +1195,11 @@ static int defrag_collect_targets(struct btrfs_inode *inode, + goto next; + + /* Skip older extent */ +- if (em->generation < newer_than) ++ if (em->generation < ctrl->newer_than) ++ goto next; ++ ++ /* This em is under writeback, no need to defrag */ ++ if (em->generation == (u64)-1) + goto next; + + /* +@@ -1221,7 +1239,14 @@ static int defrag_collect_targets(struct btrfs_inode *inode, + goto add; + + /* Skip too large extent */ +- if (range_len >= extent_thresh) ++ if (range_len >= ctrl->extent_thresh) ++ goto next; ++ ++ /* ++ * Skip extents already at their max capacity; this is mostly for ++ * compressed extents, whose max capacity is only 128K. ++ */ ++ if (em->len >= get_extent_max_capacity(em)) + goto next; + + next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em, +@@ -1242,6 +1267,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, + } + + add: ++ last_is_target = true; + range_len = min(extent_map_end(em), start + len) - cur; + /* + * This one is a good target, check if it can be merged into +@@ -1285,10 +1311,27 @@ static int defrag_collect_targets(struct btrfs_inode *inode, + kfree(entry); + } + } ++ if (!ret) { ++ /* ++ * If the last extent is not a target, the caller can skip to ++ * the end of that extent. ++ * Otherwise, we can only go to the end of the specified range. ++ * ++ * And we may get a range smaller than the current ++ * ctrl->last_scanned (e.g. when executed in the defrag_one_range ++ * call), so we have to ensure we don't decrease ++ * ctrl->last_scanned. ++ */ ++ if (!last_is_target) ++ ctrl->last_scanned = max(cur, ctrl->last_scanned); ++ else ++ ctrl->last_scanned = max(start + len, ctrl->last_scanned); ++ } + return ret; + } + + #define CLUSTER_SIZE (SZ_256K) ++static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); + + /* + * Defrag one contiguous target range.
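The per-extent policy the hunks above give defrag_collect_targets() reduces to a short chain of skip checks: preallocated extents, extents older than the requested transid, extents still under writeback (generation == (u64)-1), extents already at or above the size threshold, and extents that already sit at their maximum possible size are all rejected. A minimal standalone C sketch of that decision follows; struct em_model, struct ctrl_model and GEN_WRITEBACK are simplified stand-ins for the kernel's extent_map and btrfs_defrag_ctrl, not the real definitions:

/*
 * Simplified model of the per-extent skip logic in defrag_collect_targets().
 * All types and names here are illustrative stand-ins, not kernel code.
 */
#include <stdbool.h>
#include <stdint.h>

#define SZ_128K                ((uint64_t)128 * 1024)
#define BTRFS_MAX_COMPRESSED   SZ_128K
#define BTRFS_MAX_EXTENT_SIZE  ((uint64_t)128 * 1024 * 1024)
#define GEN_WRITEBACK          ((uint64_t)-1)  /* generation while under writeback */

struct em_model {                /* stand-in for struct extent_map */
	uint64_t len;
	uint64_t generation;
	bool compressed;
	bool preallocated;
};

struct ctrl_model {              /* stand-in for struct btrfs_defrag_ctrl */
	uint64_t newer_than;     /* only defrag extents newer than this transid */
	uint32_t extent_thresh;  /* extents at least this large are left alone */
};

/*
 * Compressed extents can never grow past 128K, so one that is already
 * full cannot be merged into fewer extents and is pointless to defrag.
 */
static uint64_t max_capacity(const struct em_model *em)
{
	return em->compressed ? BTRFS_MAX_COMPRESSED : BTRFS_MAX_EXTENT_SIZE;
}

static bool is_defrag_target(const struct ctrl_model *ctrl,
			     const struct em_model *em)
{
	if (em->preallocated)                   /* no real data to relocate */
		return false;
	if (em->generation < ctrl->newer_than)  /* too old for this pass */
		return false;
	if (em->generation == GEN_WRITEBACK)    /* being written back right now */
		return false;
	if (em->len >= ctrl->extent_thresh)     /* already large enough */
		return false;
	if (em->len >= max_capacity(em))        /* cannot get any bigger */
		return false;
	return true;
}

In the patch itself the same decision additionally consults the neighbouring extent via defrag_check_next_extent() before a small extent is accepted, since merging is only worthwhile when the next extent can still grow; the sketch leaves that lookup out for brevity.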
+@@ -1342,8 +1385,8 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, + return ret; + } + +-static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, +- u32 extent_thresh, u64 newer_than, bool do_compress) ++static int defrag_one_range(struct btrfs_inode *inode, ++ struct btrfs_defrag_ctrl *ctrl, u64 start, u32 len) + { + struct extent_state *cached_state = NULL; + struct defrag_target_range *entry; +@@ -1387,8 +1430,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, + * And this time we have extent locked already, pass @locked = true + * so that we won't relock the extent range and cause deadlock. + */ +- ret = defrag_collect_targets(inode, start, len, extent_thresh, +- newer_than, do_compress, true, ++ ret = defrag_collect_targets(inode, ctrl, start, len, true, + &target_list); + if (ret < 0) + goto unlock_extent; +@@ -1398,6 +1440,8 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, + &cached_state); + if (ret < 0) + break; ++ ctrl->sectors_defragged += entry->len >> ++ inode->root->fs_info->sectorsize_bits; + } + + list_for_each_entry_safe(entry, tmp, &target_list, list) { +@@ -1419,12 +1463,17 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, + return ret; + } + ++/* ++ * Return <0 for error. ++ * Return >0 if we hit the ctrl->max_sectors_to_defrag limit ++ * Return 0 if we finished the range without error. ++ * ++ * For >= 0 case, ctrl->last_scanned and ctrl->sectors_defragged will be updated. ++ */ + static int defrag_one_cluster(struct btrfs_inode *inode, + struct file_ra_state *ra, +- u64 start, u32 len, u32 extent_thresh, +- u64 newer_than, bool do_compress, +- unsigned long *sectors_defragged, +- unsigned long max_sectors) ++ struct btrfs_defrag_ctrl *ctrl, ++ u64 start, u32 len) + { + const u32 sectorsize = inode->root->fs_info->sectorsize; + struct defrag_target_range *entry; +@@ -1432,9 +1481,7 @@ static int defrag_one_cluster(struct btrfs_inode *inode, + LIST_HEAD(target_list); + int ret; + +- BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); +- ret = defrag_collect_targets(inode, start, len, extent_thresh, +- newer_than, do_compress, false, ++ ret = defrag_collect_targets(inode, ctrl, start, len, false, + &target_list); + if (ret < 0) + goto out; +@@ -1443,32 +1490,25 @@ static int defrag_one_cluster(struct btrfs_inode *inode, + u32 range_len = entry->len; + + /* Reached or beyond the limit */ +- if (max_sectors && *sectors_defragged >= max_sectors) { ++ if (ctrl->max_sectors_to_defrag && ++ ctrl->sectors_defragged >= ctrl->max_sectors_to_defrag) { + ret = 1; + break; + } + +- if (max_sectors) ++ if (ctrl->max_sectors_to_defrag) + range_len = min_t(u32, range_len, +- (max_sectors - *sectors_defragged) * sectorsize); ++ (ctrl->max_sectors_to_defrag - ++ ctrl->sectors_defragged) * sectorsize); + + if (ra) + page_cache_sync_readahead(inode->vfs_inode.i_mapping, + ra, NULL, entry->start >> PAGE_SHIFT, + ((entry->start + range_len - 1) >> PAGE_SHIFT) - + (entry->start >> PAGE_SHIFT) + 1); +- /* +- * Here we may not defrag any range if holes are punched before +- * we locked the pages. +- * But that's fine, it only affects the @sectors_defragged +- * accounting. 
+- */ +- ret = defrag_one_range(inode, entry->start, range_len, +- extent_thresh, newer_than, do_compress); ++ ret = defrag_one_range(inode, ctrl, entry->start, range_len); + if (ret < 0) + break; +- *sectors_defragged += range_len >> +- inode->root->fs_info->sectorsize_bits; + } + out: + list_for_each_entry_safe(entry, tmp, &target_list, list) { +@@ -1478,64 +1518,93 @@ static int defrag_one_cluster(struct btrfs_inode *inode, + return ret; + } + ++/* ++ * Convert the old ioctl format to the new btrfs_defrag_ctrl structure. ++ * ++ * Will also do basic tasks like setting default values and sanity checks. ++ */ ++int btrfs_defrag_ioctl_args_to_ctrl(struct btrfs_fs_info *fs_info, ++ struct btrfs_ioctl_defrag_range_args *args, ++ struct btrfs_defrag_ctrl *ctrl, ++ u64 max_sectors_to_defrag, u64 newer_than) ++{ ++ u64 range_end; ++ ++ if (args->flags & ~BTRFS_DEFRAG_RANGE_FLAGS_MASK) ++ return -EOPNOTSUPP; ++ if (args->compress_type >= BTRFS_NR_COMPRESS_TYPES) ++ return -EOPNOTSUPP; ++ ++ ctrl->start = round_down(args->start, fs_info->sectorsize); ++ /* ++ * If @len does not overflow with @start nor is -1, align the length. ++ * Otherwise set it to (u64)-1 so later btrfs_defrag_file() will ++ * determine the length using isize. ++ */ ++ if (!check_add_overflow(args->start, args->len, &range_end) && ++ args->len != (u64)-1) ++ ctrl->len = round_up(range_end, fs_info->sectorsize) - ++ ctrl->start; ++ else ++ ctrl->len = -1; ++ ctrl->flags = args->flags; ++ ctrl->compress = args->compress_type; ++ if (args->extent_thresh == 0) ++ ctrl->extent_thresh = SZ_256K; ++ else ++ ctrl->extent_thresh = args->extent_thresh; ++ ctrl->newer_than = newer_than; ++ ctrl->last_scanned = 0; ++ ctrl->sectors_defragged = 0; ++ return 0; ++} ++ + /* + * Entry point to file defragmentation. + * + * @inode: inode to be defragged + * @ra: readahead state (can be NULL) +- * @range: defrag options including range and flags +- * @newer_than: minimum transid to defrag +- * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode +- * will be defragged. ++ * @ctrl: defrag options including range and various policy parameters + * + * Return <0 for error. +- * Return >=0 for the number of sectors defragged, and range->start will be updated +- * to indicate the file offset where next defrag should be started at. +- * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without +- * defragging all the range). ++ * Return 0 if the defrag is done without error; ctrl->last_scanned and ++ * ctrl->sectors_defragged will be updated.
+ */ + int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, +- struct btrfs_ioctl_defrag_range_args *range, +- u64 newer_than, unsigned long max_to_defrag) ++ struct btrfs_defrag_ctrl *ctrl) + { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); +- unsigned long sectors_defragged = 0; + u64 isize = i_size_read(inode); + u64 cur; + u64 last_byte; +- bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS; ++ bool do_compress = ctrl->flags & BTRFS_DEFRAG_RANGE_COMPRESS; + bool ra_allocated = false; +- int compress_type = BTRFS_COMPRESS_ZLIB; + int ret = 0; +- u32 extent_thresh = range->extent_thresh; + pgoff_t start_index; + + if (isize == 0) + return 0; + +- if (range->start >= isize) ++ if (ctrl->start >= isize) + return -EINVAL; + +- if (do_compress) { +- if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES) +- return -EINVAL; +- if (range->compress_type) +- compress_type = range->compress_type; +- } ++ if (do_compress) ++ ASSERT(ctrl->compress < BTRFS_NR_COMPRESS_TYPES); + +- if (extent_thresh == 0) +- extent_thresh = SZ_256K; ++ if (ctrl->extent_thresh == 0) ++ ctrl->extent_thresh = SZ_256K; + +- if (range->start + range->len > range->start) { ++ if (ctrl->start + ctrl->len > ctrl->start) { + /* Got a specific range */ +- last_byte = min(isize, range->start + range->len); ++ last_byte = min(isize, ctrl->start + ctrl->len); + } else { + /* Defrag until file end */ + last_byte = isize; + } + + /* Align the range */ +- cur = round_down(range->start, fs_info->sectorsize); ++ cur = round_down(ctrl->start, fs_info->sectorsize); ++ ctrl->last_scanned = cur; + last_byte = round_up(last_byte, fs_info->sectorsize) - 1; + + /* +@@ -1559,12 +1628,9 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, + inode->i_mapping->writeback_index = start_index; + + while (cur < last_byte) { +- const unsigned long prev_sectors_defragged = sectors_defragged; ++ const unsigned long prev_sectors_defragged = ctrl->sectors_defragged; + u64 cluster_end; + +- /* The cluster size 256K should always be page aligned */ +- BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); +- + if (btrfs_defrag_cancelled(fs_info)) { + ret = -EAGAIN; + break; +@@ -1586,48 +1652,41 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, + break; + } + if (do_compress) +- BTRFS_I(inode)->defrag_compress = compress_type; +- ret = defrag_one_cluster(BTRFS_I(inode), ra, cur, +- cluster_end + 1 - cur, extent_thresh, +- newer_than, do_compress, +- &sectors_defragged, max_to_defrag); ++ BTRFS_I(inode)->defrag_compress = ctrl->compress; ++ ret = defrag_one_cluster(BTRFS_I(inode), ra, ctrl, cur, ++ cluster_end + 1 - cur); + +- if (sectors_defragged > prev_sectors_defragged) ++ if (ctrl->sectors_defragged > prev_sectors_defragged) + balance_dirty_pages_ratelimited(inode->i_mapping); + + btrfs_inode_unlock(inode, 0); + if (ret < 0) + break; +- cur = cluster_end + 1; ++ cur = max(cluster_end + 1, ctrl->last_scanned); + if (ret > 0) { + ret = 0; + break; + } ++ cond_resched(); + } + + if (ra_allocated) + kfree(ra); +- /* +- * Update range.start for autodefrag, this will indicate where to start +- * in next run. +- */ +- range->start = cur; +- if (sectors_defragged) { ++ if (ctrl->sectors_defragged) { + /* + * We have defragged some sectors, for compression case they + * need to be written back immediately.
+ */ +- if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) { ++ if (ctrl->flags & BTRFS_DEFRAG_RANGE_START_IO) { + filemap_flush(inode->i_mapping); + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + filemap_flush(inode->i_mapping); + } +- if (range->compress_type == BTRFS_COMPRESS_LZO) ++ if (ctrl->compress == BTRFS_COMPRESS_LZO) + btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); +- else if (range->compress_type == BTRFS_COMPRESS_ZSTD) ++ else if (ctrl->compress == BTRFS_COMPRESS_ZSTD) + btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); +- ret = sectors_defragged; + } + if (do_compress) { + btrfs_inode_lock(inode, 0); +@@ -2147,7 +2206,7 @@ static noinline int copy_to_sk(struct btrfs_path *path, + + for (i = slot; i < nritems; i++) { + item_off = btrfs_item_ptr_offset(leaf, i); +- item_len = btrfs_item_size_nr(leaf, i); ++ item_len = btrfs_item_size(leaf, i); + + btrfs_item_key_to_cpu(leaf, key, i); + if (!key_in_sk(key, sk)) +@@ -2601,7 +2660,7 @@ static int btrfs_search_path_in_tree_user(struct user_namespace *mnt_userns, + btrfs_item_key_to_cpu(leaf, &key, slot); + + item_off = btrfs_item_ptr_offset(leaf, slot); +- item_len = btrfs_item_size_nr(leaf, slot); ++ item_len = btrfs_item_size(leaf, slot); + /* Check if dirid in ROOT_REF corresponds to passed dirid */ + rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref); + if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) { +@@ -2803,7 +2862,7 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) + + item_off = btrfs_item_ptr_offset(leaf, slot) + + sizeof(struct btrfs_root_ref); +- item_len = btrfs_item_size_nr(leaf, slot) ++ item_len = btrfs_item_size(leaf, slot) + - sizeof(struct btrfs_root_ref); + read_extent_buffer(leaf, subvol_info->name, + item_off, item_len); +@@ -3148,6 +3207,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) + struct inode *inode = file_inode(file); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_ioctl_defrag_range_args range = {0}; ++ struct btrfs_defrag_ctrl ctrl = {0}; + int ret; + + ret = mnt_want_write_file(file); +@@ -3193,8 +3253,11 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) + /* the rest are all set to zero by kzalloc */ + range.len = (u64)-1; + } +- ret = btrfs_defrag_file(file_inode(file), &file->f_ra, +- &range, BTRFS_OLDEST_GENERATION, 0); ++ ret = btrfs_defrag_ioctl_args_to_ctrl(root->fs_info, &range, ++ &ctrl, 0, BTRFS_OLDEST_GENERATION); ++ if (ret < 0) ++ break; ++ ret = btrfs_defrag_file(file_inode(file), &file->f_ra, &ctrl); + if (ret > 0) + ret = 0; + break; +@@ -3683,7 +3746,6 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, + { + struct btrfs_trans_handle *trans; + u64 transid; +- int ret; + + trans = btrfs_attach_transaction_barrier(root); + if (IS_ERR(trans)) { +@@ -3695,11 +3757,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, + goto out; + } + transid = trans->transid; +- ret = btrfs_commit_transaction_async(trans); +- if (ret) { +- btrfs_end_transaction(trans); +- return ret; +- } ++ btrfs_commit_transaction_async(trans); + out: + if (argp) + if (copy_to_user(argp, &transid, sizeof(transid))) +diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c +index 0fb90cbe7669..430ad36b8b08 100644 +--- a/fs/btrfs/lzo.c ++++ b/fs/btrfs/lzo.c +@@ -55,6 +55,9 @@ + * 0x1000 | SegHdr N+1| Data payload N+1 ... 
| + */ + ++#define WORKSPACE_BUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE)) ++#define WORKSPACE_CBUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE)) ++ + struct workspace { + void *mem; + void *buf; /* where decompressed data goes */ +@@ -83,8 +86,8 @@ struct list_head *lzo_alloc_workspace(unsigned int level) + return ERR_PTR(-ENOMEM); + + workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); +- workspace->buf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL); +- workspace->cbuf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL); ++ workspace->buf = kvmalloc(WORKSPACE_BUF_LENGTH, GFP_KERNEL); ++ workspace->cbuf = kvmalloc(WORKSPACE_CBUF_LENGTH, GFP_KERNEL); + if (!workspace->mem || !workspace->buf || !workspace->cbuf) + goto fail; + +@@ -380,6 +383,17 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) + kunmap(cur_page); + cur_in += LZO_LEN; + ++ if (seg_len > WORKSPACE_CBUF_LENGTH) { ++ /* ++ * seg_len shouldn't be larger than we have allocated ++ * for workspace->cbuf ++ */ ++ btrfs_err(fs_info, "unexpectedly large lzo segment len %u", ++ seg_len); ++ ret = -EIO; ++ goto out; ++ } ++ + /* Copy the compressed segment payload into workspace */ + copy_compressed_segment(cb, workspace->cbuf, seg_len, &cur_in); + +@@ -422,7 +436,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in, + struct workspace *workspace = list_entry(ws, struct workspace, list); + size_t in_len; + size_t out_len; +- size_t max_segment_len = lzo1x_worst_compress(PAGE_SIZE); ++ size_t max_segment_len = WORKSPACE_BUF_LENGTH; + int ret = 0; + char *kaddr; + unsigned long bytes; +diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c +index aae1027bd76a..0775ae9f4419 100644 +--- a/fs/btrfs/print-tree.c ++++ b/fs/btrfs/print-tree.c +@@ -85,7 +85,7 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type) + struct btrfs_disk_key key; + unsigned long end; + unsigned long ptr; +- u32 item_size = btrfs_item_size_nr(eb, slot); ++ u32 item_size = btrfs_item_size(eb, slot); + u64 flags; + u64 offset; + int ref_index = 0; +@@ -200,7 +200,6 @@ void btrfs_print_leaf(struct extent_buffer *l) + struct btrfs_fs_info *fs_info; + int i; + u32 type, nr; +- struct btrfs_item *item; + struct btrfs_root_item *ri; + struct btrfs_dir_item *di; + struct btrfs_inode_item *ii; +@@ -224,12 +223,11 @@ void btrfs_print_leaf(struct extent_buffer *l) + btrfs_leaf_free_space(l), btrfs_header_owner(l)); + print_eb_refs_lock(l); + for (i = 0 ; i < nr ; i++) { +- item = btrfs_item_nr(i); + btrfs_item_key_to_cpu(l, &key, i); + type = key.type; + pr_info("\titem %d key (%llu %u %llu) itemoff %d itemsize %d\n", + i, key.objectid, type, key.offset, +- btrfs_item_offset(l, item), btrfs_item_size(l, item)); ++ btrfs_item_offset(l, i), btrfs_item_size(l, i)); + switch (type) { + case BTRFS_INODE_ITEM_KEY: + ii = btrfs_item_ptr(l, i, struct btrfs_inode_item); +@@ -347,7 +345,7 @@ void btrfs_print_leaf(struct extent_buffer *l) + case BTRFS_UUID_KEY_SUBVOL: + case BTRFS_UUID_KEY_RECEIVED_SUBVOL: + print_uuid_item(l, btrfs_item_ptr_offset(l, i), +- btrfs_item_size_nr(l, i)); ++ btrfs_item_size(l, i)); + break; + } + } +diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c +index b1cb5a8c2999..a978676aa627 100644 +--- a/fs/btrfs/props.c ++++ b/fs/btrfs/props.c +@@ -158,7 +158,7 @@ static int iterate_object_props(struct btrfs_root *root, + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + cur = 0; +- total_len = btrfs_item_size_nr(leaf, slot); ++ total_len = btrfs_item_size(leaf, slot); + + 
while (cur < total_len) { + u32 name_len = btrfs_dir_name_len(leaf, di); +diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c +index 26134b7476a2..3712cd5fdbfe 100644 +--- a/fs/btrfs/qgroup.c ++++ b/fs/btrfs/qgroup.c +@@ -258,16 +258,19 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) + return 0; + } + +-/* must be called with qgroup_lock held */ +-static int add_relation_rb(struct btrfs_fs_info *fs_info, +- u64 memberid, u64 parentid) ++/* ++ * Add relation specified by two qgroups. ++ * ++ * Must be called with qgroup_lock held. ++ * ++ * Return: 0 on success ++ * -ENOENT if one of the qgroups is NULL ++ * <0 other errors ++ */ ++static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *parent) + { +- struct btrfs_qgroup *member; +- struct btrfs_qgroup *parent; + struct btrfs_qgroup_list *list; + +- member = find_qgroup_rb(fs_info, memberid); +- parent = find_qgroup_rb(fs_info, parentid); + if (!member || !parent) + return -ENOENT; + +@@ -283,7 +286,27 @@ static int add_relation_rb(struct btrfs_fs_info *fs_info, + return 0; + } + +-/* must be called with qgroup_lock held */ ++/* ++ * Add relation specified by two qgroup ids. ++ * ++ * Must be called with qgroup_lock held. ++ * ++ * Return: 0 on success ++ * -ENOENT if one of the ids does not exist ++ * <0 other errors ++ */ ++static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid) ++{ ++ struct btrfs_qgroup *member; ++ struct btrfs_qgroup *parent; ++ ++ member = find_qgroup_rb(fs_info, memberid); ++ parent = find_qgroup_rb(fs_info, parentid); ++ ++ return __add_relation_rb(member, parent); ++} ++ ++/* Must be called with qgroup_lock held */ + static int del_relation_rb(struct btrfs_fs_info *fs_info, + u64 memberid, u64 parentid) + { +@@ -1444,7 +1467,7 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, + } + + spin_lock(&fs_info->qgroup_lock); +- ret = add_relation_rb(fs_info, src, dst); ++ ret = __add_relation_rb(member, parent); + if (ret < 0) { + spin_unlock(&fs_info->qgroup_lock); + goto out; + } +diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c +index e2b9f8616501..f34130d90dee 100644 +--- a/fs/btrfs/ref-verify.c ++++ b/fs/btrfs/ref-verify.c +@@ -435,7 +435,7 @@ static int process_extent_item(struct btrfs_fs_info *fs_info, + struct btrfs_extent_data_ref *dref; + struct btrfs_shared_data_ref *sref; + struct extent_buffer *leaf = path->nodes[0]; +- u32 item_size = btrfs_item_size_nr(leaf, slot); ++ u32 item_size = btrfs_item_size(leaf, slot); + unsigned long end, ptr; + u64 offset, flags, count; + int type, ret; +diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c +index e0f93b357548..a3930da4eb3f 100644 +--- a/fs/btrfs/reflink.c ++++ b/fs/btrfs/reflink.c +@@ -439,7 +439,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, + break; + } + next_key_min_offset = key.offset + datal; +- size = btrfs_item_size_nr(leaf, slot); ++ size = btrfs_item_size(leaf, slot); + read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot), + size); + +diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c +index 33a0ee7ac590..ee0a0efc7efd 100644 +--- a/fs/btrfs/relocation.c ++++ b/fs/btrfs/relocation.c +@@ -3149,7 +3149,7 @@ static int add_tree_block(struct reloc_control *rc, + u64 owner = 0; + + eb = path->nodes[0]; +- item_size = btrfs_item_size_nr(eb, path->slots[0]); ++ item_size = btrfs_item_size(eb, path->slots[0]); + + if (extent_key->type == BTRFS_METADATA_ITEM_KEY || + item_size >= sizeof(*ei) + sizeof(*bi)) { +diff
--git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c +index d20166336557..3297368aa359 100644 +--- a/fs/btrfs/root-tree.c ++++ b/fs/btrfs/root-tree.c +@@ -25,7 +25,7 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot, + u32 len; + int need_reset = 0; + +- len = btrfs_item_size_nr(eb, slot); ++ len = btrfs_item_size(eb, slot); + read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot), + min_t(u32, len, sizeof(*item))); + if (len < sizeof(*item)) +@@ -146,7 +146,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root + l = path->nodes[0]; + slot = path->slots[0]; + ptr = btrfs_item_ptr_offset(l, slot); +- old_len = btrfs_item_size_nr(l, slot); ++ old_len = btrfs_item_size(l, slot); + + /* + * If this is the first time we update the root item which originated +diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c +index 8f6ceea33969..d175c5ab1134 100644 +--- a/fs/btrfs/scrub.c ++++ b/fs/btrfs/scrub.c +@@ -758,7 +758,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) + + eb = path->nodes[0]; + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); +- item_size = btrfs_item_size_nr(eb, path->slots[0]); ++ item_size = btrfs_item_size(eb, path->slots[0]); + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + do { +diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c +index 040324d71118..93b9fe2dca67 100644 +--- a/fs/btrfs/send.c ++++ b/fs/btrfs/send.c +@@ -898,7 +898,6 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, + iterate_inode_ref_t iterate, void *ctx) + { + struct extent_buffer *eb = path->nodes[0]; +- struct btrfs_item *item; + struct btrfs_inode_ref *iref; + struct btrfs_inode_extref *extref; + struct btrfs_path *tmp_path; +@@ -930,12 +929,11 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, + if (found_key->type == BTRFS_INODE_REF_KEY) { + ptr = (unsigned long)btrfs_item_ptr(eb, slot, + struct btrfs_inode_ref); +- item = btrfs_item_nr(slot); +- total = btrfs_item_size(eb, item); ++ total = btrfs_item_size(eb, slot); + elem_size = sizeof(*iref); + } else { + ptr = btrfs_item_ptr_offset(eb, slot); +- total = btrfs_item_size_nr(eb, slot); ++ total = btrfs_item_size(eb, slot); + elem_size = sizeof(*extref); + } + +@@ -1018,7 +1016,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, + { + int ret = 0; + struct extent_buffer *eb; +- struct btrfs_item *item; + struct btrfs_dir_item *di; + struct btrfs_key di_key; + char *buf = NULL; +@@ -1047,11 +1044,10 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, + + eb = path->nodes[0]; + slot = path->slots[0]; +- item = btrfs_item_nr(slot); + di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); + cur = 0; + len = 0; +- total = btrfs_item_size(eb, item); ++ total = btrfs_item_size(eb, slot); + + num = 0; + while (cur < total) { +@@ -3622,7 +3618,7 @@ static int is_ancestor(struct btrfs_root *root, + key.type != BTRFS_INODE_EXTREF_KEY) + break; + +- item_size = btrfs_item_size_nr(leaf, slot); ++ item_size = btrfs_item_size(leaf, slot); + while (cur_offset < item_size) { + u64 parent; + u64 parent_gen; +@@ -4983,6 +4979,10 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); ++ btrfs_err(fs_info, ++ "send: IO error at offset %llu for inode %llu root %llu", ++ page_offset(page), sctx->cur_ino, ++ sctx->send_root->root_key.objectid); + put_page(page); + ret = -EIO; + 
break; +@@ -6566,7 +6566,7 @@ static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path, + } + + leaf = path->nodes[0]; +- item_size = btrfs_item_size_nr(leaf, path->slots[0]); ++ item_size = btrfs_item_size(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + while (cur_offset < item_size) { + extref = (struct btrfs_inode_extref *)(ptr + +@@ -6791,8 +6791,8 @@ static int tree_compare_item(struct btrfs_path *left_path, + int len1, len2; + unsigned long off1, off2; + +- len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]); +- len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]); ++ len1 = btrfs_item_size(left_path->nodes[0], left_path->slots[0]); ++ len2 = btrfs_item_size(right_path->nodes[0], right_path->slots[0]); + if (len1 != len2) + return 1; + +diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c +index f9eff3b0f77c..836a20fdfca1 100644 +--- a/fs/btrfs/sysfs.c ++++ b/fs/btrfs/sysfs.c +@@ -1104,6 +1104,11 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) + static char btrfs_unknown_feature_names[FEAT_MAX][NUM_FEATURE_BITS][BTRFS_FEATURE_NAME_MAX]; + static struct btrfs_feature_attr btrfs_feature_attrs[FEAT_MAX][NUM_FEATURE_BITS]; + ++static_assert(ARRAY_SIZE(btrfs_unknown_feature_names) == ++ ARRAY_SIZE(btrfs_feature_attrs)); ++static_assert(ARRAY_SIZE(btrfs_unknown_feature_names[0]) == ++ ARRAY_SIZE(btrfs_feature_attrs[0])); ++ + static const u64 supported_feature_masks[FEAT_MAX] = { + [FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP, + [FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP, +@@ -1272,11 +1277,6 @@ static void init_feature_attrs(void) + struct btrfs_feature_attr *fa; + int set, i; + +- BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names) != +- ARRAY_SIZE(btrfs_feature_attrs)); +- BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names[0]) != +- ARRAY_SIZE(btrfs_feature_attrs[0])); +- + memset(btrfs_feature_attrs, 0, sizeof(btrfs_feature_attrs)); + memset(btrfs_unknown_feature_names, 0, + sizeof(btrfs_unknown_feature_names)); +diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c +index 2a95f7224e18..51a8b075c259 100644 +--- a/fs/btrfs/tests/extent-buffer-tests.c ++++ b/fs/btrfs/tests/extent-buffer-tests.c +@@ -15,7 +15,6 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) + struct btrfs_path *path = NULL; + struct btrfs_root *root = NULL; + struct extent_buffer *eb; +- struct btrfs_item *item; + char *value = "mary had a little lamb"; + char *split1 = "mary had a little"; + char *split2 = " lamb"; +@@ -61,7 +60,6 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) + key.offset = 0; + + btrfs_setup_item_for_insert(root, path, &key, value_len); +- item = btrfs_item_nr(0); + write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0), + value_len); + +@@ -90,8 +88,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) + goto out; + } + +- item = btrfs_item_nr(0); +- if (btrfs_item_size(eb, item) != strlen(split1)) { ++ if (btrfs_item_size(eb, 0) != strlen(split1)) { + test_err("invalid len in the first split"); + ret = -EINVAL; + goto out; +@@ -115,8 +112,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) + goto out; + } + +- item = btrfs_item_nr(1); +- if (btrfs_item_size(eb, item) != strlen(split2)) { ++ if (btrfs_item_size(eb, 1) != strlen(split2)) { + test_err("invalid len in the second split"); + ret = -EINVAL; + goto out; +@@ -147,8 +143,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) 
+ goto out; + } + +- item = btrfs_item_nr(0); +- if (btrfs_item_size(eb, item) != strlen(split3)) { ++ if (btrfs_item_size(eb, 0) != strlen(split3)) { + test_err("invalid len in the first split"); + ret = -EINVAL; + goto out; +@@ -171,8 +166,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) + goto out; + } + +- item = btrfs_item_nr(1); +- if (btrfs_item_size(eb, item) != strlen(split4)) { ++ if (btrfs_item_size(eb, 1) != strlen(split4)) { + test_err("invalid len in the second split"); + ret = -EINVAL; + goto out; +@@ -195,8 +189,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) + goto out; + } + +- item = btrfs_item_nr(2); +- if (btrfs_item_size(eb, item) != strlen(split2)) { ++ if (btrfs_item_size(eb, 2) != strlen(split2)) { + test_err("invalid len in the second split"); + ret = -EINVAL; + goto out; +diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c +index 27b93a6c41bb..f3c094af9283 100644 +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -1861,50 +1861,14 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info) + return ret; + } + +-/* +- * commit transactions asynchronously. once btrfs_commit_transaction_async +- * returns, any subsequent transaction will not be allowed to join. +- */ +-struct btrfs_async_commit { +- struct btrfs_trans_handle *newtrans; +- struct work_struct work; +-}; +- +-static void do_async_commit(struct work_struct *work) +-{ +- struct btrfs_async_commit *ac = +- container_of(work, struct btrfs_async_commit, work); +- +- /* +- * We've got freeze protection passed with the transaction. +- * Tell lockdep about it. +- */ +- if (ac->newtrans->type & __TRANS_FREEZABLE) +- __sb_writers_acquired(ac->newtrans->fs_info->sb, SB_FREEZE_FS); +- +- current->journal_info = ac->newtrans; +- +- btrfs_commit_transaction(ac->newtrans); +- kfree(ac); +-} +- +-int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) ++void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) + { + struct btrfs_fs_info *fs_info = trans->fs_info; +- struct btrfs_async_commit *ac; + struct btrfs_transaction *cur_trans; + +- ac = kmalloc(sizeof(*ac), GFP_NOFS); +- if (!ac) +- return -ENOMEM; +- +- INIT_WORK(&ac->work, do_async_commit); +- ac->newtrans = btrfs_join_transaction(trans->root); +- if (IS_ERR(ac->newtrans)) { +- int err = PTR_ERR(ac->newtrans); +- kfree(ac); +- return err; +- } ++ /* Kick the transaction kthread. */ ++ set_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags); ++ wake_up_process(fs_info->transaction_kthread); + + /* take transaction reference */ + cur_trans = trans->transaction; +@@ -1912,14 +1876,6 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) + + btrfs_end_transaction(trans); + +- /* +- * Tell lockdep we've released the freeze rwsem, since the +- * async commit thread will be the one to unlock it. 
+- */ +- if (ac->newtrans->type & __TRANS_FREEZABLE) +- __sb_writers_release(fs_info->sb, SB_FREEZE_FS); +- +- schedule_work(&ac->work); + /* + * Wait for the current transaction commit to start and block + * subsequent transaction joins +@@ -1927,14 +1883,9 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) + wait_event(fs_info->transaction_blocked_wait, + cur_trans->state >= TRANS_STATE_COMMIT_START || + TRANS_ABORTED(cur_trans)); +- if (current->journal_info == trans) +- current->journal_info = NULL; +- + btrfs_put_transaction(cur_trans); +- return 0; + } + +- + static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) + { + struct btrfs_fs_info *fs_info = trans->fs_info; +@@ -2013,16 +1964,24 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans) + static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) + { + /* +- * We use writeback_inodes_sb here because if we used ++ * We use try_to_writeback_inodes_sb() here because if we used + * btrfs_start_delalloc_roots we would deadlock with fs freeze. + * Currently are holding the fs freeze lock, if we do an async flush + * we'll do btrfs_join_transaction() and deadlock because we need to + * wait for the fs freeze lock. Using the direct flushing we benefit + * from already being in a transaction and our join_transaction doesn't + * have to re-take the fs freeze lock. ++ * ++ * Note that try_to_writeback_inodes_sb() will only trigger writeback ++ * if it can read lock sb->s_umount. It will always be able to lock it, ++ * except when the filesystem is being unmounted or being frozen, but in ++ * those cases sync_filesystem() is called, which results in calling ++ * writeback_inodes_sb() while holding a write lock on sb->s_umount. ++ * Note that we don't call writeback_inodes_sb() directly, because it ++ * will emit a warning if sb->s_umount is not locked. + */ + if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) +- writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC); ++ try_to_writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC); + return 0; + } + +@@ -2224,6 +2183,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) + wait_event(cur_trans->writer_wait, + atomic_read(&cur_trans->num_writers) == 1); + ++ /* ++ * We've started the commit, clear the flag in case we were triggered to ++ * do an async commit but somebody else started before the transaction ++ * kthread could do the work. 
++ */ ++ clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags); ++ + if (TRANS_ABORTED(cur_trans)) { + ret = cur_trans->aborted; + goto scrub_continue; +diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h +index eba07b8119bb..d0705485f5c8 100644 +--- a/fs/btrfs/transaction.h ++++ b/fs/btrfs/transaction.h +@@ -219,7 +219,7 @@ void btrfs_add_dead_root(struct btrfs_root *root); + int btrfs_defrag_root(struct btrfs_root *root); + int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); + int btrfs_commit_transaction(struct btrfs_trans_handle *trans); +-int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans); ++void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans); + int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans); + bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans); + void btrfs_throttle(struct btrfs_fs_info *fs_info); +diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c +index 7733e8ac0a69..72e1c942197d 100644 +--- a/fs/btrfs/tree-checker.c ++++ b/fs/btrfs/tree-checker.c +@@ -202,7 +202,7 @@ static int check_extent_data_item(struct extent_buffer *leaf, + struct btrfs_fs_info *fs_info = leaf->fs_info; + struct btrfs_file_extent_item *fi; + u32 sectorsize = fs_info->sectorsize; +- u32 item_size = btrfs_item_size_nr(leaf, slot); ++ u32 item_size = btrfs_item_size(leaf, slot); + u64 extent_end; + + if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) { +@@ -354,17 +354,17 @@ static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key, + key->offset, sectorsize); + return -EUCLEAN; + } +- if (unlikely(!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize))) { ++ if (unlikely(!IS_ALIGNED(btrfs_item_size(leaf, slot), csumsize))) { + generic_err(leaf, slot, + "unaligned item size for csum item, have %u should be aligned to %u", +- btrfs_item_size_nr(leaf, slot), csumsize); ++ btrfs_item_size(leaf, slot), csumsize); + return -EUCLEAN; + } + if (slot > 0 && prev_key->type == BTRFS_EXTENT_CSUM_KEY) { + u64 prev_csum_end; + u32 prev_item_size; + +- prev_item_size = btrfs_item_size_nr(leaf, slot - 1); ++ prev_item_size = btrfs_item_size(leaf, slot - 1); + prev_csum_end = (prev_item_size / csumsize) * sectorsize; + prev_csum_end += prev_key->offset; + if (unlikely(prev_csum_end > key->offset)) { +@@ -483,7 +483,7 @@ static int check_dir_item(struct extent_buffer *leaf, + { + struct btrfs_fs_info *fs_info = leaf->fs_info; + struct btrfs_dir_item *di; +- u32 item_size = btrfs_item_size_nr(leaf, slot); ++ u32 item_size = btrfs_item_size(leaf, slot); + u32 cur = 0; + + if (unlikely(!check_prev_ino(leaf, key, slot, prev_key))) +@@ -640,7 +640,7 @@ static int check_block_group_item(struct extent_buffer *leaf, + struct btrfs_key *key, int slot) + { + struct btrfs_block_group_item bgi; +- u32 item_size = btrfs_item_size_nr(leaf, slot); ++ u32 item_size = btrfs_item_size(leaf, slot); + u64 flags; + u64 type; + +@@ -912,10 +912,10 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf, + { + int num_stripes; + +- if (unlikely(btrfs_item_size_nr(leaf, slot) < sizeof(struct btrfs_chunk))) { ++ if (unlikely(btrfs_item_size(leaf, slot) < sizeof(struct btrfs_chunk))) { + chunk_err(leaf, chunk, key->offset, + "invalid chunk item size: have %u expect [%zu, %u)", +- btrfs_item_size_nr(leaf, slot), ++ btrfs_item_size(leaf, slot), + sizeof(struct btrfs_chunk), + BTRFS_LEAF_DATA_SIZE(leaf->fs_info)); + return -EUCLEAN; +@@ -927,10 +927,10 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf, + goto out; + 
+ if (unlikely(btrfs_chunk_item_size(num_stripes) != +- btrfs_item_size_nr(leaf, slot))) { ++ btrfs_item_size(leaf, slot))) { + chunk_err(leaf, chunk, key->offset, + "invalid chunk item size: have %u expect %lu", +- btrfs_item_size_nr(leaf, slot), ++ btrfs_item_size(leaf, slot), + btrfs_chunk_item_size(num_stripes)); + return -EUCLEAN; + } +@@ -1095,12 +1095,12 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, + if (unlikely(ret < 0)) + return ret; + +- if (unlikely(btrfs_item_size_nr(leaf, slot) != sizeof(ri) && +- btrfs_item_size_nr(leaf, slot) != ++ if (unlikely(btrfs_item_size(leaf, slot) != sizeof(ri) && ++ btrfs_item_size(leaf, slot) != + btrfs_legacy_root_item_size())) { + generic_err(leaf, slot, + "invalid root item size, have %u expect %zu or %u", +- btrfs_item_size_nr(leaf, slot), sizeof(ri), ++ btrfs_item_size(leaf, slot), sizeof(ri), + btrfs_legacy_root_item_size()); + return -EUCLEAN; + } +@@ -1111,7 +1111,7 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, + * And since we allow geneartion_v2 as 0, it will still pass the check. + */ + read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot), +- btrfs_item_size_nr(leaf, slot)); ++ btrfs_item_size(leaf, slot)); + + /* Generation related */ + if (unlikely(btrfs_root_generation(&ri) > +@@ -1208,7 +1208,7 @@ static int check_extent_item(struct extent_buffer *leaf, + bool is_tree_block = false; + unsigned long ptr; /* Current pointer inside inline refs */ + unsigned long end; /* Extent item end */ +- const u32 item_size = btrfs_item_size_nr(leaf, slot); ++ const u32 item_size = btrfs_item_size(leaf, slot); + u64 flags; + u64 generation; + u64 total_refs; /* Total refs in btrfs_extent_item */ +@@ -1432,10 +1432,10 @@ static int check_simple_keyed_refs(struct extent_buffer *leaf, + if (key->type == BTRFS_SHARED_DATA_REF_KEY) + expect_item_size = sizeof(struct btrfs_shared_data_ref); + +- if (unlikely(btrfs_item_size_nr(leaf, slot) != expect_item_size)) { ++ if (unlikely(btrfs_item_size(leaf, slot) != expect_item_size)) { + generic_err(leaf, slot, + "invalid item size, have %u expect %u for key type %u", +- btrfs_item_size_nr(leaf, slot), ++ btrfs_item_size(leaf, slot), + expect_item_size, key->type); + return -EUCLEAN; + } +@@ -1460,12 +1460,12 @@ static int check_extent_data_ref(struct extent_buffer *leaf, + { + struct btrfs_extent_data_ref *dref; + unsigned long ptr = btrfs_item_ptr_offset(leaf, slot); +- const unsigned long end = ptr + btrfs_item_size_nr(leaf, slot); ++ const unsigned long end = ptr + btrfs_item_size(leaf, slot); + +- if (unlikely(btrfs_item_size_nr(leaf, slot) % sizeof(*dref) != 0)) { ++ if (unlikely(btrfs_item_size(leaf, slot) % sizeof(*dref) != 0)) { + generic_err(leaf, slot, + "invalid item size, have %u expect aligned to %zu for key type %u", +- btrfs_item_size_nr(leaf, slot), ++ btrfs_item_size(leaf, slot), + sizeof(*dref), key->type); + return -EUCLEAN; + } +@@ -1507,16 +1507,16 @@ static int check_inode_ref(struct extent_buffer *leaf, + if (unlikely(!check_prev_ino(leaf, key, slot, prev_key))) + return -EUCLEAN; + /* namelen can't be 0, so item_size == sizeof() is also invalid */ +- if (unlikely(btrfs_item_size_nr(leaf, slot) <= sizeof(*iref))) { ++ if (unlikely(btrfs_item_size(leaf, slot) <= sizeof(*iref))) { + inode_ref_err(leaf, slot, + "invalid item size, have %u expect (%zu, %u)", +- btrfs_item_size_nr(leaf, slot), ++ btrfs_item_size(leaf, slot), + sizeof(*iref), BTRFS_LEAF_DATA_SIZE(leaf->fs_info)); + return -EUCLEAN; + } + + ptr = 
btrfs_item_ptr_offset(leaf, slot); +- end = ptr + btrfs_item_size_nr(leaf, slot); ++ end = ptr + btrfs_item_size(leaf, slot); + while (ptr < end) { + u16 namelen; + +@@ -1689,12 +1689,12 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) + if (slot == 0) + item_end_expected = BTRFS_LEAF_DATA_SIZE(fs_info); + else +- item_end_expected = btrfs_item_offset_nr(leaf, ++ item_end_expected = btrfs_item_offset(leaf, + slot - 1); +- if (unlikely(btrfs_item_end_nr(leaf, slot) != item_end_expected)) { ++ if (unlikely(btrfs_item_data_end(leaf, slot) != item_end_expected)) { + generic_err(leaf, slot, + "unexpected item end, have %u expect %u", +- btrfs_item_end_nr(leaf, slot), ++ btrfs_item_data_end(leaf, slot), + item_end_expected); + return -EUCLEAN; + } +@@ -1704,11 +1704,11 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) + * just in case all the items are consistent to each other, but + * all point outside of the leaf. + */ +- if (unlikely(btrfs_item_end_nr(leaf, slot) > ++ if (unlikely(btrfs_item_data_end(leaf, slot) > + BTRFS_LEAF_DATA_SIZE(fs_info))) { + generic_err(leaf, slot, + "slot end outside of leaf, have %u expect range [0, %u]", +- btrfs_item_end_nr(leaf, slot), ++ btrfs_item_data_end(leaf, slot), + BTRFS_LEAF_DATA_SIZE(fs_info)); + return -EUCLEAN; + } +diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c +index 6993dcdba6f1..cc3a8d8a3841 100644 +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -386,7 +386,7 @@ static int do_overwrite_item(struct btrfs_trans_handle *trans, + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + overwrite_root = 1; + +- item_size = btrfs_item_size_nr(eb, slot); ++ item_size = btrfs_item_size(eb, slot); + src_ptr = btrfs_item_ptr_offset(eb, slot); + + /* Our caller must have done a search for the key for us. 
*/ +@@ -409,7 +409,7 @@ static int do_overwrite_item(struct btrfs_trans_handle *trans, + if (ret == 0) { + char *src_copy; + char *dst_copy; +- u32 dst_size = btrfs_item_size_nr(path->nodes[0], ++ u32 dst_size = btrfs_item_size(path->nodes[0], + path->slots[0]); + if (dst_size != item_size) + goto insert; +@@ -503,7 +503,7 @@ static int do_overwrite_item(struct btrfs_trans_handle *trans, + /* make sure any existing item is the correct size */ + if (ret == -EEXIST || ret == -EOVERFLOW) { + u32 found_size; +- found_size = btrfs_item_size_nr(path->nodes[0], ++ found_size = btrfs_item_size(path->nodes[0], + path->slots[0]); + if (found_size > item_size) + btrfs_truncate_item(path, item_size, 1); +@@ -1096,7 +1096,7 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, + * otherwise they must be unlinked as a conflict + */ + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); +- ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); ++ ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]); + while (ptr < ptr_end) { + victim_ref = (struct btrfs_inode_ref *)ptr; + victim_name_len = btrfs_inode_ref_name_len(leaf, +@@ -1155,7 +1155,7 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, + + leaf = path->nodes[0]; + +- item_size = btrfs_item_size_nr(leaf, path->slots[0]); ++ item_size = btrfs_item_size(leaf, path->slots[0]); + base = btrfs_item_ptr_offset(leaf, path->slots[0]); + + while (cur_offset < item_size) { +@@ -1318,7 +1318,7 @@ static int unlink_old_inode_refs(struct btrfs_trans_handle *trans, + + eb = path->nodes[0]; + ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]); +- ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]); ++ ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]); + while (ref_ptr < ref_end) { + char *name = NULL; + int namelen; +@@ -1504,7 +1504,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, + int ref_struct_size; + + ref_ptr = btrfs_item_ptr_offset(eb, slot); +- ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); ++ ref_end = ref_ptr + btrfs_item_size(eb, slot); + + if (key->type == BTRFS_INODE_EXTREF_KEY) { + struct btrfs_inode_extref *r; +@@ -1678,7 +1678,7 @@ static int count_inode_extrefs(struct btrfs_root *root, + break; + + leaf = path->nodes[0]; +- item_size = btrfs_item_size_nr(leaf, path->slots[0]); ++ item_size = btrfs_item_size(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + cur_offset = 0; + +@@ -1732,7 +1732,7 @@ static int count_inode_refs(struct btrfs_root *root, + key.type != BTRFS_INODE_REF_KEY) + break; + ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); +- ptr_end = ptr + btrfs_item_size_nr(path->nodes[0], ++ ptr_end = ptr + btrfs_item_size(path->nodes[0], + path->slots[0]); + while (ptr < ptr_end) { + struct btrfs_inode_ref *ref; +@@ -1950,6 +1950,34 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, + return ret; + } + ++static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, ++ struct btrfs_inode *dir, ++ struct btrfs_path *path, ++ struct btrfs_dir_item *dst_di, ++ const struct btrfs_key *log_key, ++ u8 log_type, ++ bool exists) ++{ ++ struct btrfs_key found_key; ++ ++ btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); ++ /* The existing dentry points to the same inode, don't delete it. 
*/ ++ if (found_key.objectid == log_key->objectid && ++ found_key.type == log_key->type && ++ found_key.offset == log_key->offset && ++ btrfs_dir_type(path->nodes[0], dst_di) == log_type) ++ return 1; ++ ++ /* ++ * Don't drop the conflicting directory entry if the inode for the new ++ * entry doesn't exist. ++ */ ++ if (!exists) ++ return 0; ++ ++ return drop_one_dir_item(trans, path, dir, dst_di); ++} ++ + /* + * take a single entry in a log directory item and replay it into + * the subvolume. +@@ -1975,14 +2003,17 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, + { + char *name; + int name_len; +- struct btrfs_dir_item *dst_di; +- struct btrfs_key found_key; ++ struct btrfs_dir_item *dir_dst_di; ++ struct btrfs_dir_item *index_dst_di; ++ bool dir_dst_matches = false; ++ bool index_dst_matches = false; + struct btrfs_key log_key; ++ struct btrfs_key search_key; + struct inode *dir; + u8 log_type; + bool exists; + int ret; +- bool update_size = (key->type == BTRFS_DIR_INDEX_KEY); ++ bool update_size = true; + bool name_added = false; + + dir = read_one_inode(root, key->objectid); +@@ -2008,76 +2039,53 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, + exists = (ret == 0); + ret = 0; + +- if (key->type == BTRFS_DIR_ITEM_KEY) { +- dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, +- name, name_len, 1); +- } else if (key->type == BTRFS_DIR_INDEX_KEY) { +- dst_di = btrfs_lookup_dir_index_item(trans, root, path, +- key->objectid, +- key->offset, name, +- name_len, 1); +- } else { +- /* Corruption */ +- ret = -EINVAL; ++ dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, ++ name, name_len, 1); ++ if (IS_ERR(dir_dst_di)) { ++ ret = PTR_ERR(dir_dst_di); + goto out; +- } +- +- if (IS_ERR(dst_di)) { +- ret = PTR_ERR(dst_di); +- goto out; +- } else if (!dst_di) { +- /* we need a sequence number to insert, so we only +- * do inserts for the BTRFS_DIR_INDEX_KEY types +- */ +- if (key->type != BTRFS_DIR_INDEX_KEY) ++ } else if (dir_dst_di) { ++ ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, ++ dir_dst_di, &log_key, log_type, ++ exists); ++ if (ret < 0) + goto out; +- goto insert; ++ dir_dst_matches = (ret == 1); + } + +- btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); +- /* the existing item matches the logged item */ +- if (found_key.objectid == log_key.objectid && +- found_key.type == log_key.type && +- found_key.offset == log_key.offset && +- btrfs_dir_type(path->nodes[0], dst_di) == log_type) { ++ btrfs_release_path(path); ++ ++ index_dst_di = btrfs_lookup_dir_index_item(trans, root, path, ++ key->objectid, key->offset, ++ name, name_len, 1); ++ if (IS_ERR(index_dst_di)) { ++ ret = PTR_ERR(index_dst_di); ++ goto out; ++ } else if (index_dst_di) { ++ ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, ++ index_dst_di, &log_key, ++ log_type, exists); ++ if (ret < 0) ++ goto out; ++ index_dst_matches = (ret == 1); ++ } ++ ++ btrfs_release_path(path); ++ ++ if (dir_dst_matches && index_dst_matches) { ++ ret = 0; + update_size = false; + goto out; + } + +- /* +- * don't drop the conflicting directory entry if the inode +- * for the new entry doesn't exist +- */ +- if (!exists) +- goto out; +- +- ret = drop_one_dir_item(trans, path, BTRFS_I(dir), dst_di); +- if (ret) +- goto out; +- +- if (key->type == BTRFS_DIR_INDEX_KEY) +- goto insert; +-out: +- btrfs_release_path(path); +- if (!ret && update_size) { +- btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2); +- ret = 
btrfs_update_inode(trans, root, BTRFS_I(dir)); +- } +- kfree(name); +- iput(dir); +- if (!ret && name_added) +- ret = 1; +- return ret; +- +-insert: + /* + * Check if the inode reference exists in the log for the given name, + * inode and parent inode + */ +- found_key.objectid = log_key.objectid; +- found_key.type = BTRFS_INODE_REF_KEY; +- found_key.offset = key->objectid; +- ret = backref_in_log(root->log_root, &found_key, 0, name, name_len); ++ search_key.objectid = log_key.objectid; ++ search_key.type = BTRFS_INODE_REF_KEY; ++ search_key.offset = key->objectid; ++ ret = backref_in_log(root->log_root, &search_key, 0, name, name_len); + if (ret < 0) { + goto out; + } else if (ret) { +@@ -2087,10 +2095,10 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, + goto out; + } + +- found_key.objectid = log_key.objectid; +- found_key.type = BTRFS_INODE_EXTREF_KEY; +- found_key.offset = key->objectid; +- ret = backref_in_log(root->log_root, &found_key, key->objectid, name, ++ search_key.objectid = log_key.objectid; ++ search_key.type = BTRFS_INODE_EXTREF_KEY; ++ search_key.offset = key->objectid; ++ ret = backref_in_log(root->log_root, &search_key, key->objectid, name, + name_len); + if (ret < 0) { + goto out; +@@ -2109,87 +2117,76 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, + name_added = true; + update_size = false; + ret = 0; +- goto out; ++ ++out: ++ if (!ret && update_size) { ++ btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2); ++ ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); ++ } ++ kfree(name); ++ iput(dir); ++ if (!ret && name_added) ++ ret = 1; ++ return ret; + } + +-/* +- * find all the names in a directory item and reconcile them into +- * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than +- * one name in a directory item, but the same code gets used for +- * both directory index types +- */ ++/* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */ + static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) + { +- int ret = 0; +- u32 item_size = btrfs_item_size_nr(eb, slot); ++ int ret; + struct btrfs_dir_item *di; +- int name_len; +- unsigned long ptr; +- unsigned long ptr_end; +- struct btrfs_path *fixup_path = NULL; + +- ptr = btrfs_item_ptr_offset(eb, slot); +- ptr_end = ptr + item_size; +- while (ptr < ptr_end) { +- di = (struct btrfs_dir_item *)ptr; +- name_len = btrfs_dir_name_len(eb, di); +- ret = replay_one_name(trans, root, path, eb, di, key); +- if (ret < 0) +- break; +- ptr = (unsigned long)(di + 1); +- ptr += name_len; ++ /* We only log dir index keys, which only contain a single dir item. */ ++ ASSERT(key->type == BTRFS_DIR_INDEX_KEY); + +- /* +- * If this entry refers to a non-directory (directories can not +- * have a link count > 1) and it was added in the transaction +- * that was not committed, make sure we fixup the link count of +- * the inode it the entry points to. 
Otherwise something like +- * the following would result in a directory pointing to an +- * inode with a wrong link that does not account for this dir +- * entry: +- * +- * mkdir testdir +- * touch testdir/foo +- * touch testdir/bar +- * sync +- * +- * ln testdir/bar testdir/bar_link +- * ln testdir/foo testdir/foo_link +- * xfs_io -c "fsync" testdir/bar +- * +- * <power failure> +- * +- * mount fs, log replay happens +- * +- * File foo would remain with a link count of 1 when it has two +- * entries pointing to it in the directory testdir. This would +- * make it impossible to ever delete the parent directory has +- * it would result in stale dentries that can never be deleted. +- */ +- if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) { +- struct btrfs_key di_key; ++ di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); ++ ret = replay_one_name(trans, root, path, eb, di, key); ++ if (ret < 0) ++ return ret; + +- if (!fixup_path) { +- fixup_path = btrfs_alloc_path(); +- if (!fixup_path) { +- ret = -ENOMEM; +- break; +- } +- } ++ /* ++ * If this entry refers to a non-directory (directories can not have a ++ * link count > 1) and it was added in the transaction that was not ++ * committed, make sure we fixup the link count of the inode the entry ++ * points to. Otherwise something like the following would result in a ++ * directory pointing to an inode with a wrong link that does not account ++ * for this dir entry: ++ * ++ * mkdir testdir ++ * touch testdir/foo ++ * touch testdir/bar ++ * sync ++ * ++ * ln testdir/bar testdir/bar_link ++ * ln testdir/foo testdir/foo_link ++ * xfs_io -c "fsync" testdir/bar ++ * ++ * <power failure> ++ * ++ * mount fs, log replay happens ++ * ++ * File foo would remain with a link count of 1 when it has two entries ++ * pointing to it in the directory testdir. This would make it impossible ++ * to ever delete the parent directory has it would result in stale ++ * dentries that can never be deleted. 
++ */ ++ if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) { ++ struct btrfs_path *fixup_path; ++ struct btrfs_key di_key; + +- btrfs_dir_item_key_to_cpu(eb, di, &di_key); +- ret = link_to_fixup_dir(trans, root, fixup_path, +- di_key.objectid); +- if (ret) +- break; +- } +- ret = 0; ++ fixup_path = btrfs_alloc_path(); ++ if (!fixup_path) ++ return -ENOMEM; ++ ++ btrfs_dir_item_key_to_cpu(eb, di, &di_key); ++ ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid); ++ btrfs_free_path(fixup_path); + } +- btrfs_free_path(fixup_path); ++ + return ret; + } + +@@ -2206,7 +2203,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, + */ + static noinline int find_dir_range(struct btrfs_root *root, + struct btrfs_path *path, +- u64 dirid, int key_type, ++ u64 dirid, + u64 *start_ret, u64 *end_ret) + { + struct btrfs_key key; +@@ -2219,7 +2216,7 @@ static noinline int find_dir_range(struct btrfs_root *root, + return 1; + + key.objectid = dirid; +- key.type = key_type; ++ key.type = BTRFS_DIR_LOG_INDEX_KEY; + key.offset = *start_ret; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); +@@ -2233,7 +2230,7 @@ static noinline int find_dir_range(struct btrfs_root *root, + if (ret != 0) + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + +- if (key.type != key_type || key.objectid != dirid) { ++ if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) { + ret = 1; + goto next; + } +@@ -2260,7 +2257,7 @@ static noinline int find_dir_range(struct btrfs_root *root, + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + +- if (key.type != key_type || key.objectid != dirid) { ++ if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) { + ret = 1; + goto out; + } +@@ -2291,95 +2288,82 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, + int ret; + struct extent_buffer *eb; + int slot; +- u32 item_size; + struct btrfs_dir_item *di; +- struct btrfs_dir_item *log_di; + int name_len; +- unsigned long ptr; +- unsigned long ptr_end; + char *name; +- struct inode *inode; ++ struct inode *inode = NULL; + struct btrfs_key location; + +-again: ++ /* ++ * Currenly we only log dir index keys. Even if we replay a log created ++ * by an older kernel that logged both dir index and dir item keys, all ++ * we need to do is process the dir index keys, we (and our caller) can ++ * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY). 
++ */ ++ ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY); ++ + eb = path->nodes[0]; + slot = path->slots[0]; +- item_size = btrfs_item_size_nr(eb, slot); +- ptr = btrfs_item_ptr_offset(eb, slot); +- ptr_end = ptr + item_size; +- while (ptr < ptr_end) { +- di = (struct btrfs_dir_item *)ptr; +- name_len = btrfs_dir_name_len(eb, di); +- name = kmalloc(name_len, GFP_NOFS); +- if (!name) { +- ret = -ENOMEM; +- goto out; +- } +- read_extent_buffer(eb, name, (unsigned long)(di + 1), +- name_len); +- log_di = NULL; +- if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { +- log_di = btrfs_lookup_dir_item(trans, log, log_path, +- dir_key->objectid, +- name, name_len, 0); +- } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { +- log_di = btrfs_lookup_dir_index_item(trans, log, +- log_path, ++ di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); ++ name_len = btrfs_dir_name_len(eb, di); ++ name = kmalloc(name_len, GFP_NOFS); ++ if (!name) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len); ++ ++ if (log) { ++ struct btrfs_dir_item *log_di; ++ ++ log_di = btrfs_lookup_dir_index_item(trans, log, log_path, + dir_key->objectid, + dir_key->offset, + name, name_len, 0); +- } +- if (!log_di) { +- btrfs_dir_item_key_to_cpu(eb, di, &location); +- btrfs_release_path(path); +- btrfs_release_path(log_path); +- inode = read_one_inode(root, location.objectid); +- if (!inode) { +- kfree(name); +- return -EIO; +- } +- +- ret = link_to_fixup_dir(trans, root, +- path, location.objectid); +- if (ret) { +- kfree(name); +- iput(inode); +- goto out; +- } +- +- inc_nlink(inode); +- ret = btrfs_unlink_inode(trans, BTRFS_I(dir), +- BTRFS_I(inode), name, name_len); +- if (!ret) +- ret = btrfs_run_delayed_items(trans); +- kfree(name); +- iput(inode); +- if (ret) +- goto out; +- +- /* there might still be more names under this key +- * check and repeat if required +- */ +- ret = btrfs_search_slot(NULL, root, dir_key, path, +- 0, 0); +- if (ret == 0) +- goto again; ++ if (IS_ERR(log_di)) { ++ ret = PTR_ERR(log_di); ++ goto out; ++ } else if (log_di) { ++ /* The dentry exists in the log, we have nothing to do. */ + ret = 0; + goto out; +- } else if (IS_ERR(log_di)) { +- kfree(name); +- return PTR_ERR(log_di); + } +- btrfs_release_path(log_path); +- kfree(name); +- +- ptr = (unsigned long)(di + 1); +- ptr += name_len; + } +- ret = 0; ++ ++ btrfs_dir_item_key_to_cpu(eb, di, &location); ++ btrfs_release_path(path); ++ btrfs_release_path(log_path); ++ inode = read_one_inode(root, location.objectid); ++ if (!inode) { ++ ret = -EIO; ++ goto out; ++ } ++ ++ ret = link_to_fixup_dir(trans, root, path, location.objectid); ++ if (ret) ++ goto out; ++ ++ inc_nlink(inode); ++ ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(inode), name, ++ name_len); ++ if (ret) ++ goto out; ++ ++ ret = btrfs_run_delayed_items(trans); ++ if (ret) ++ goto out; ++ ++ /* ++ * Unlike dir item keys, dir index keys can only have one name (entry) in ++ * them, as there are no key collisions since each key has a unique offset ++ * (an index number), so we're done. 
++ */ + out: + btrfs_release_path(path); + btrfs_release_path(log_path); ++ kfree(name); ++ iput(inode); + return ret; + } + +@@ -2422,7 +2406,7 @@ static int replay_xattr_deletes(struct btrfs_trans_handle *trans, + } + + di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); +- total_size = btrfs_item_size_nr(path->nodes[0], i); ++ total_size = btrfs_item_size(path->nodes[0], i); + cur = 0; + while (cur < total_size) { + u16 name_len = btrfs_dir_name_len(path->nodes[0], di); +@@ -2499,7 +2483,6 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, + { + u64 range_start; + u64 range_end; +- int key_type = BTRFS_DIR_LOG_ITEM_KEY; + int ret = 0; + struct btrfs_key dir_key; + struct btrfs_key found_key; +@@ -2507,7 +2490,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, + struct inode *dir; + + dir_key.objectid = dirid; +- dir_key.type = BTRFS_DIR_ITEM_KEY; ++ dir_key.type = BTRFS_DIR_INDEX_KEY; + log_path = btrfs_alloc_path(); + if (!log_path) + return -ENOMEM; +@@ -2521,14 +2504,14 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, + btrfs_free_path(log_path); + return 0; + } +-again: ++ + range_start = 0; + range_end = 0; + while (1) { + if (del_all) + range_end = (u64)-1; + else { +- ret = find_dir_range(log, path, dirid, key_type, ++ ret = find_dir_range(log, path, dirid, + &range_start, &range_end); + if (ret < 0) + goto out; +@@ -2555,8 +2538,10 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (found_key.objectid != dirid || +- found_key.type != dir_key.type) +- goto next_type; ++ found_key.type != dir_key.type) { ++ ret = 0; ++ goto out; ++ } + + if (found_key.offset > range_end) + break; +@@ -2575,15 +2560,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, + break; + range_start = range_end + 1; + } +- +-next_type: + ret = 0; +- if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { +- key_type = BTRFS_DIR_LOG_INDEX_KEY; +- dir_key.type = BTRFS_DIR_INDEX_KEY; +- btrfs_release_path(path); +- goto again; +- } + out: + btrfs_release_path(path); + btrfs_free_path(log_path); +@@ -2743,12 +2720,13 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, + eb, i, &key); + if (ret) + break; +- } else if (key.type == BTRFS_DIR_ITEM_KEY) { +- ret = replay_one_dir_item(wc->trans, root, path, +- eb, i, &key); +- if (ret) +- break; + } ++ /* ++ * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the ++ * BTRFS_DIR_INDEX_KEY items which we use to derive the ++ * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an ++ * older kernel with such keys, ignore them. ++ */ + } + btrfs_free_path(path); + return ret; +@@ -3551,20 +3529,10 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, + goto out_unlock; + } + +- di = btrfs_lookup_dir_item(trans, log, path, dir_ino, +- name, name_len, -1); +- if (IS_ERR(di)) { +- err = PTR_ERR(di); +- goto fail; +- } +- if (di) { +- ret = btrfs_delete_one_dir_name(trans, log, path, di); +- if (ret) { +- err = ret; +- goto fail; +- } +- } +- btrfs_release_path(path); ++ /* ++ * We only log dir index items of a directory, so we don't need to look ++ * for dir item keys. 
++ */ + di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, + index, name, name_len, -1); + if (IS_ERR(di)) { +@@ -3628,7 +3596,7 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, + static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *path, +- int key_type, u64 dirid, ++ u64 dirid, + u64 first_offset, u64 last_offset) + { + int ret; +@@ -3637,10 +3605,7 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, + + key.objectid = dirid; + key.offset = first_offset; +- if (key_type == BTRFS_DIR_ITEM_KEY) +- key.type = BTRFS_DIR_LOG_ITEM_KEY; +- else +- key.type = BTRFS_DIR_LOG_INDEX_KEY; ++ key.type = BTRFS_DIR_LOG_INDEX_KEY; + ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); + if (ret) + return ret; +@@ -3675,7 +3640,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, + + if (count == 1) { + btrfs_item_key_to_cpu(src, &key, start_slot); +- item_size = btrfs_item_size_nr(src, start_slot); ++ item_size = btrfs_item_size(src, start_slot); + batch.keys = &key; + batch.data_sizes = &item_size; + batch.total_data_size = item_size; +@@ -3698,7 +3663,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, + const int slot = start_slot + i; + + btrfs_item_key_to_cpu(src, &ins_keys[i], slot); +- ins_sizes[i] = btrfs_item_size_nr(src, slot); ++ ins_sizes[i] = btrfs_item_size(src, slot); + batch.total_data_size += ins_sizes[i]; + } + } +@@ -3732,7 +3697,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path, +- int key_type, + struct btrfs_log_ctx *ctx) + { + struct btrfs_root *log = inode->root->log_root; +@@ -3740,24 +3704,18 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, + const int nritems = btrfs_header_nritems(src); + const u64 ino = btrfs_ino(inode); + const bool inode_logged_before = inode_logged(trans, inode); +- u64 last_logged_key_offset; + bool last_found = false; + int batch_start = 0; + int batch_size = 0; + int i; + +- if (key_type == BTRFS_DIR_ITEM_KEY) +- last_logged_key_offset = inode->last_dir_item_offset; +- else +- last_logged_key_offset = inode->last_dir_index_offset; +- + for (i = path->slots[0]; i < nritems; i++) { + struct btrfs_key key; + int ret; + + btrfs_item_key_to_cpu(src, &key, i); + +- if (key.objectid != ino || key.type != key_type) { ++ if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) { + last_found = true; + break; + } +@@ -3806,7 +3764,7 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, + * we logged is in the log tree, saving time and avoiding adding + * contention on the log tree. + */ +- if (key.offset > last_logged_key_offset) ++ if (key.offset > inode->last_dir_index_offset) + goto add_to_batch; + /* + * Check if the key was already logged before. 
If not we can add +@@ -3865,7 +3823,7 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, + static noinline int log_dir_items(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, +- struct btrfs_path *dst_path, int key_type, ++ struct btrfs_path *dst_path, + struct btrfs_log_ctx *ctx, + u64 min_offset, u64 *last_offset_ret) + { +@@ -3879,7 +3837,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, + u64 ino = btrfs_ino(inode); + + min_key.objectid = ino; +- min_key.type = key_type; ++ min_key.type = BTRFS_DIR_INDEX_KEY; + min_key.offset = min_offset; + + ret = btrfs_search_forward(root, &min_key, path, trans->transid); +@@ -3888,9 +3846,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, + * we didn't find anything from this transaction, see if there + * is anything at all + */ +- if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { ++ if (ret != 0 || min_key.objectid != ino || ++ min_key.type != BTRFS_DIR_INDEX_KEY) { + min_key.objectid = ino; +- min_key.type = key_type; ++ min_key.type = BTRFS_DIR_INDEX_KEY; + min_key.offset = (u64)-1; + btrfs_release_path(path); + ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); +@@ -3898,7 +3857,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, + btrfs_release_path(path); + return ret; + } +- ret = btrfs_previous_item(root, path, ino, key_type); ++ ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); + + /* if ret == 0 there are items for this type, + * create a range to tell us the last key of this type. +@@ -3909,18 +3868,18 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, + struct btrfs_key tmp; + btrfs_item_key_to_cpu(path->nodes[0], &tmp, + path->slots[0]); +- if (key_type == tmp.type) ++ if (tmp.type == BTRFS_DIR_INDEX_KEY) + first_offset = max(min_offset, tmp.offset) + 1; + } + goto done; + } + + /* go backward to find any previous key */ +- ret = btrfs_previous_item(root, path, ino, key_type); ++ ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); + if (ret == 0) { + struct btrfs_key tmp; + btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); +- if (key_type == tmp.type) { ++ if (tmp.type == BTRFS_DIR_INDEX_KEY) { + first_offset = tmp.offset; + ret = overwrite_item(trans, log, dst_path, + path->nodes[0], path->slots[0], +@@ -3951,8 +3910,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, + * from our directory + */ + while (1) { +- ret = process_dir_items_leaf(trans, inode, path, dst_path, +- key_type, ctx); ++ ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx); + if (ret != 0) { + if (ret < 0) + err = ret; +@@ -3973,7 +3931,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, + goto done; + } + btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]); +- if (min_key.objectid != ino || min_key.type != key_type) { ++ if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) { + last_offset = (u64)-1; + goto done; + } +@@ -4004,8 +3962,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, + * insert the log range keys to indicate where the log + * is valid + */ +- ret = insert_dir_log_key(trans, log, path, key_type, +- ino, first_offset, last_offset); ++ ret = insert_dir_log_key(trans, log, path, ino, first_offset, ++ last_offset); + if (ret) + err = ret; + } +@@ -4033,35 +3991,28 @@ static noinline int log_directory_changes(struct btrfs_trans_handle 
*trans, + u64 min_key; + u64 max_key; + int ret; +- int key_type = BTRFS_DIR_ITEM_KEY; + + /* + * If this is the first time we are being logged in the current + * transaction, or we were logged before but the inode was evicted and +- * reloaded later, in which case its logged_trans is 0, reset the values +- * of the last logged key offsets. Note that we don't use the helper ++ * reloaded later, in which case its logged_trans is 0, reset the value ++ * of the last logged key offset. Note that we don't use the helper + * function inode_logged() here - that is because the function returns + * true after an inode eviction, assuming the worst case as it can not + * know for sure if the inode was logged before. So we can not skip key + * searches in the case the inode was evicted, because it may not have + * been logged in this transaction and may have been logged in a past +- * transaction, so we need to reset the last dir item and index offsets +- * to (u64)-1. ++ * transaction, so we need to reset the last dir index offset to (u64)-1. + */ +- if (inode->logged_trans != trans->transid) { +- inode->last_dir_item_offset = (u64)-1; ++ if (inode->logged_trans != trans->transid) + inode->last_dir_index_offset = (u64)-1; +- } +-again: ++ + min_key = 0; + max_key = 0; +- if (key_type == BTRFS_DIR_ITEM_KEY) +- ctx->last_dir_item_offset = inode->last_dir_item_offset; +- else +- ctx->last_dir_item_offset = inode->last_dir_index_offset; ++ ctx->last_dir_item_offset = inode->last_dir_index_offset; + + while (1) { +- ret = log_dir_items(trans, inode, path, dst_path, key_type, ++ ret = log_dir_items(trans, inode, path, dst_path, + ctx, min_key, &max_key); + if (ret) + return ret; +@@ -4070,13 +4021,8 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, + min_key = max_key + 1; + } + +- if (key_type == BTRFS_DIR_ITEM_KEY) { +- inode->last_dir_item_offset = ctx->last_dir_item_offset; +- key_type = BTRFS_DIR_INDEX_KEY; +- goto again; +- } else { +- inode->last_dir_index_offset = ctx->last_dir_item_offset; +- } ++ inode->last_dir_index_offset = ctx->last_dir_item_offset; ++ + return 0; + } + +@@ -4350,7 +4296,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, + batch.nr = nr; + + for (i = 0; i < nr; i++) { +- ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); ++ ins_sizes[i] = btrfs_item_size(src, i + start_slot); + batch.total_data_size += ins_sizes[i]; + btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); + } +@@ -4573,14 +4519,34 @@ static int log_one_extent(struct btrfs_trans_handle *trans, + { + struct btrfs_drop_extents_args drop_args = { 0 }; + struct btrfs_root *log = inode->root->log_root; +- struct btrfs_file_extent_item *fi; ++ struct btrfs_file_extent_item fi = { 0 }; + struct extent_buffer *leaf; +- struct btrfs_map_token token; + struct btrfs_key key; + u64 extent_offset = em->start - em->orig_start; + u64 block_len; + int ret; + ++ btrfs_set_stack_file_extent_generation(&fi, trans->transid); ++ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) ++ btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC); ++ else ++ btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG); ++ ++ block_len = max(em->block_len, em->orig_block_len); ++ if (em->compress_type != BTRFS_COMPRESS_NONE) { ++ btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start); ++ btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); ++ } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { ++ btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start - ++ 
extent_offset); ++ btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); ++ } ++ ++ btrfs_set_stack_file_extent_offset(&fi, extent_offset); ++ btrfs_set_stack_file_extent_num_bytes(&fi, em->len); ++ btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes); ++ btrfs_set_stack_file_extent_compression(&fi, em->compress_type); ++ + ret = log_extent_csums(trans, inode, log, em, ctx); + if (ret) + return ret; +@@ -4599,7 +4565,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, + drop_args.start = em->start; + drop_args.end = em->start + em->len; + drop_args.replace_extent = true; +- drop_args.extent_item_size = sizeof(*fi); ++ drop_args.extent_item_size = sizeof(fi); + ret = btrfs_drop_extents(trans, log, inode, &drop_args); + if (ret) + return ret; +@@ -4611,44 +4577,14 @@ static int log_one_extent(struct btrfs_trans_handle *trans, + key.offset = em->start; + + ret = btrfs_insert_empty_item(trans, log, path, &key, +- sizeof(*fi)); ++ sizeof(fi)); + if (ret) + return ret; + } + leaf = path->nodes[0]; +- btrfs_init_map_token(&token, leaf); +- fi = btrfs_item_ptr(leaf, path->slots[0], +- struct btrfs_file_extent_item); +- +- btrfs_set_token_file_extent_generation(&token, fi, trans->transid); +- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) +- btrfs_set_token_file_extent_type(&token, fi, +- BTRFS_FILE_EXTENT_PREALLOC); +- else +- btrfs_set_token_file_extent_type(&token, fi, +- BTRFS_FILE_EXTENT_REG); +- +- block_len = max(em->block_len, em->orig_block_len); +- if (em->compress_type != BTRFS_COMPRESS_NONE) { +- btrfs_set_token_file_extent_disk_bytenr(&token, fi, +- em->block_start); +- btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len); +- } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { +- btrfs_set_token_file_extent_disk_bytenr(&token, fi, +- em->block_start - +- extent_offset); +- btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len); +- } else { +- btrfs_set_token_file_extent_disk_bytenr(&token, fi, 0); +- btrfs_set_token_file_extent_disk_num_bytes(&token, fi, 0); +- } +- +- btrfs_set_token_file_extent_offset(&token, fi, extent_offset); +- btrfs_set_token_file_extent_num_bytes(&token, fi, em->len); +- btrfs_set_token_file_extent_ram_bytes(&token, fi, em->ram_bytes); +- btrfs_set_token_file_extent_compression(&token, fi, em->compress_type); +- btrfs_set_token_file_extent_encryption(&token, fi, 0); +- btrfs_set_token_file_extent_other_encoding(&token, fi, 0); ++ write_extent_buffer(leaf, &fi, ++ btrfs_item_ptr_offset(leaf, path->slots[0]), ++ sizeof(fi)); + btrfs_mark_buffer_dirty(leaf); + + btrfs_release_path(path); +@@ -4862,7 +4798,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, + WARN_ON(!list_empty(&extents)); + write_unlock(&tree->lock); + +- btrfs_release_path(path); + if (!ret) + ret = btrfs_log_prealloc_extents(trans, inode, path); + if (ret) +@@ -5166,7 +5101,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, + struct btrfs_path *search_path; + char *name = NULL; + u32 name_len = 0; +- u32 item_size = btrfs_item_size_nr(eb, slot); ++ u32 item_size = btrfs_item_size(eb, slot); + u32 cur_offset = 0; + unsigned long ptr = btrfs_item_ptr_offset(eb, slot); + +@@ -5899,18 +5834,12 @@ struct btrfs_dir_list { + * link_to_fixup_dir()); + * + * 2) For directories we log with a mode of LOG_INODE_ALL. 
It's possible that +- * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and +- * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item ++ * while logging the inode's items new index items (key type ++ * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item + * has a size that doesn't match the sum of the lengths of all the logged +- * names. This does not result in a problem because if a dir_item key is +- * logged but its matching dir_index key is not logged, at log replay time we +- * don't use it to replay the respective name (see replay_one_name()). On the +- * other hand if only the dir_index key ends up being logged, the respective +- * name is added to the fs/subvol tree with both the dir_item and dir_index +- * keys created (see replay_one_name()). +- * The directory's inode item with a wrong i_size is not a problem as well, +- * since we don't use it at log replay time to set the i_size in the inode +- * item of the fs/subvol tree (see overwrite_item()). ++ * names - this is ok, not a problem, because at log replay time we set the ++ * directory's i_size to the correct value (see replay_one_name() and ++ * do_overwrite_item()). + */ + static int log_new_dir_dentries(struct btrfs_trans_handle *trans, + struct btrfs_root *root, +@@ -5956,7 +5885,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, + goto next_dir_inode; + + min_key.objectid = dir_elem->ino; +- min_key.type = BTRFS_DIR_ITEM_KEY; ++ min_key.type = BTRFS_DIR_INDEX_KEY; + min_key.offset = 0; + again: + btrfs_release_path(path); +@@ -5981,7 +5910,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, + + btrfs_item_key_to_cpu(leaf, &min_key, i); + if (min_key.objectid != dir_elem->ino || +- min_key.type != BTRFS_DIR_ITEM_KEY) ++ min_key.type != BTRFS_DIR_INDEX_KEY) + goto next_dir_inode; + + di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); +@@ -6093,7 +6022,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, + if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) + break; + +- item_size = btrfs_item_size_nr(leaf, slot); ++ item_size = btrfs_item_size(leaf, slot); + ptr = btrfs_item_ptr_offset(leaf, slot); + while (cur_offset < item_size) { + struct btrfs_key inode_key; +@@ -6795,15 +6724,14 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, + * was previously logged, make sure the next log attempt on the directory + * is not skipped and logs the inode again. This is because the log may + * not currently be authoritative for a range including the old +- * BTRFS_DIR_ITEM_KEY and BTRFS_DIR_INDEX_KEY keys, so we want to make +- * sure after a log replay we do not end up with both the new and old +- * dentries around (in case the inode is a directory we would have a +- * directory with two hard links and 2 inode references for different +- * parents). The next log attempt of old_dir will happen at +- * btrfs_log_all_parents(), called through btrfs_log_inode_parent() +- * below, because we have previously set inode->last_unlink_trans to the +- * current transaction ID, either here or at btrfs_record_unlink_dir() in +- * case inode is a directory. ++ * BTRFS_DIR_INDEX_KEY key, so we want to make sure after a log replay we ++ * do not end up with both the new and old dentries around (in case the ++ * inode is a directory we would have a directory with two hard links and ++ * 2 inode references for different parents). 
The next log attempt of ++ * old_dir will happen at btrfs_log_all_parents(), called through ++ * btrfs_log_inode_parent() below, because we have previously set ++ * inode->last_unlink_trans to the current transaction ID, either here or ++ * at btrfs_record_unlink_dir() in case the inode is a directory. + */ + if (old_dir) + old_dir->logged_trans = 0; +diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c +index 74023c8a783f..b458452a1aaf 100644 +--- a/fs/btrfs/uuid-tree.c ++++ b/fs/btrfs/uuid-tree.c +@@ -52,7 +52,7 @@ static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, u8 *uuid, + + eb = path->nodes[0]; + slot = path->slots[0]; +- item_size = btrfs_item_size_nr(eb, slot); ++ item_size = btrfs_item_size(eb, slot); + offset = btrfs_item_ptr_offset(eb, slot); + ret = -ENOENT; + +@@ -125,7 +125,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, + eb = path->nodes[0]; + slot = path->slots[0]; + offset = btrfs_item_ptr_offset(eb, slot); +- offset += btrfs_item_size_nr(eb, slot) - sizeof(subid_le); ++ offset += btrfs_item_size(eb, slot) - sizeof(subid_le); + } else { + btrfs_warn(fs_info, + "insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!", +@@ -186,7 +186,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, + eb = path->nodes[0]; + slot = path->slots[0]; + offset = btrfs_item_ptr_offset(eb, slot); +- item_size = btrfs_item_size_nr(eb, slot); ++ item_size = btrfs_item_size(eb, slot); + if (!IS_ALIGNED(item_size, sizeof(u64))) { + btrfs_warn(fs_info, "uuid item with illegal size %lu!", + (unsigned long)item_size); +@@ -208,7 +208,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, + goto out; + } + +- item_size = btrfs_item_size_nr(eb, slot); ++ item_size = btrfs_item_size(eb, slot); + if (item_size == sizeof(subid)) { + ret = btrfs_del_item(trans, uuid_root, path); + goto out; +@@ -331,7 +331,7 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info) + goto skip; + + offset = btrfs_item_ptr_offset(leaf, slot); +- item_size = btrfs_item_size_nr(leaf, slot); ++ item_size = btrfs_item_size(leaf, slot); + if (!IS_ALIGNED(item_size, sizeof(u64))) { + btrfs_warn(fs_info, + "uuid item with illegal size %lu!", +diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c +index 4968535dfff0..90eb5c2830a9 100644 +--- a/fs/btrfs/verity.c ++++ b/fs/btrfs/verity.c +@@ -333,7 +333,7 @@ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, + if (key.objectid != btrfs_ino(inode) || key.type != key_type) + break; + +- item_end = btrfs_item_size_nr(leaf, path->slots[0]) + key.offset; ++ item_end = btrfs_item_size(leaf, path->slots[0]) + key.offset; + + if (copied > 0) { + /* +diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c +index 42391d4aeb11..5f4ac1a2e1f3 100644 +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -34,6 +34,10 @@ + #include "discard.h" + #include "zoned.h" + ++#define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ ++ BTRFS_BLOCK_GROUP_RAID10 | \ ++ BTRFS_BLOCK_GROUP_RAID56_MASK) ++ + const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { + [BTRFS_RAID_RAID10] = { + .sub_stripes = 2, +@@ -4643,7 +4647,7 @@ int btrfs_uuid_scan_kthread(void *data) + + eb = path->nodes[0]; + slot = path->slots[0]; +- item_size = btrfs_item_size_nr(eb, slot); ++ item_size = btrfs_item_size(eb, slot); + if (item_size < sizeof(root_item)) + goto skip; + +@@ -6314,7 +6318,8 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct 
extent_map *em, + stripe_offset = offset - stripe_offset; + data_stripes = nr_data_stripes(map); + +- if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { ++ /* Only stripe based profiles needs to check against stripe length. */ ++ if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) { + u64 max_len = stripe_len - stripe_offset; + + /* +@@ -7730,7 +7735,7 @@ static int btrfs_device_init_dev_stats(struct btrfs_device *device, + } + slot = path->slots[0]; + eb = path->nodes[0]; +- item_size = btrfs_item_size_nr(eb, slot); ++ item_size = btrfs_item_size(eb, slot); + + ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item); + +@@ -7808,7 +7813,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, + } + + if (ret == 0 && +- btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { ++ btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { + /* need to delete old one and insert a new one */ + ret = btrfs_del_item(trans, dev_root, path); + if (ret != 0) { +diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c +index 2837b4c8424d..99abf41b89b9 100644 +--- a/fs/btrfs/xattr.c ++++ b/fs/btrfs/xattr.c +@@ -168,9 +168,8 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, + const int slot = path->slots[0]; + struct extent_buffer *leaf = path->nodes[0]; + const u16 old_data_len = btrfs_dir_data_len(leaf, di); +- const u32 item_size = btrfs_item_size_nr(leaf, slot); ++ const u32 item_size = btrfs_item_size(leaf, slot); + const u32 data_size = sizeof(*di) + name_len + size; +- struct btrfs_item *item; + unsigned long data_ptr; + char *ptr; + +@@ -196,9 +195,8 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, + btrfs_extend_item(path, data_size); + } + +- item = btrfs_item_nr(slot); + ptr = btrfs_item_ptr(leaf, slot, char); +- ptr += btrfs_item_size(leaf, item) - data_size; ++ ptr += btrfs_item_size(leaf, slot) - data_size; + di = (struct btrfs_dir_item *)ptr; + btrfs_set_dir_data_len(leaf, di, size); + data_ptr = ((unsigned long)(di + 1)) + name_len; +@@ -335,7 +333,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) + goto next_item; + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); +- item_size = btrfs_item_size_nr(leaf, slot); ++ item_size = btrfs_item_size(leaf, slot); + cur = 0; + while (cur < item_size) { + u16 name_len = btrfs_dir_name_len(leaf, di); +diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h +index 738619994e26..012a71ab5d8e 100644 +--- a/include/uapi/linux/btrfs.h ++++ b/include/uapi/linux/btrfs.h +@@ -575,8 +575,10 @@ struct btrfs_ioctl_clone_range_args { + * Used by: + * struct btrfs_ioctl_defrag_range_args.flags + */ +-#define BTRFS_DEFRAG_RANGE_COMPRESS 1 +-#define BTRFS_DEFRAG_RANGE_START_IO 2 ++#define BTRFS_DEFRAG_RANGE_COMPRESS (1UL << 0) ++#define BTRFS_DEFRAG_RANGE_START_IO (1UL << 1) ++#define BTRFS_DEFRAG_RANGE_FLAGS_MASK (BTRFS_DEFRAG_RANGE_COMPRESS |\ ++ BTRFS_DEFRAG_RANGE_START_IO) + struct btrfs_ioctl_defrag_range_args { + /* start of the defrag operation */ + __u64 start; +diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h +index e1c4c732aaba..5416f1f1a77a 100644 +--- a/include/uapi/linux/btrfs_tree.h ++++ b/include/uapi/linux/btrfs_tree.h +@@ -146,7 +146,9 @@ + + /* + * dir items are the name -> inode pointers in a directory. There is one +- * for every name in a directory. ++ * for every name in a directory. 
BTRFS_DIR_LOG_ITEM_KEY is no longer used ++ * but it's still defined here for documentation purposes and to help avoid ++ * having its numerical value reused in the future. + */ + #define BTRFS_DIR_LOG_ITEM_KEY 60 + #define BTRFS_DIR_LOG_INDEX_KEY 72 +-- +2.35.1 + |