author     Scott B  2022-02-10 22:57:20 -0800
committer  Scott B  2022-02-12 00:57:42 -0800
commit     44db0f40320d2895d9f2438145152e329fb6dfb1 (patch)
tree       a2eecf6c2d0248a9a18806f9b4136f586e754c3b
parent     248c3c289b71536ece4f14f7bf753f14ce637696 (diff)
download   aur-44db0f40320d2895d9f2438145152e329fb6dfb1.tar.gz
hotfix: resolve btrfs autodefrag high utilization
-rw-r--r--  .SRCINFO                              |    2
-rw-r--r--  PKGBUILD                              |    4
-rw-r--r--  btrfs-fix-autodefrag-on-5.16.9.patch  | 6417
3 files changed, 6423 insertions, 0 deletions
diff --git a/.SRCINFO b/.SRCINFO
index 98a9d050de5b..2d11f2665c78 100644
--- a/.SRCINFO
+++ b/.SRCINFO
@@ -25,6 +25,7 @@ pkgbase = linux-xanmod-rog
source = Bluetooth-btintel-Fix-bdaddress-comparison-with-garb.patch
source = Bluetooth-Read-codec-capabilities-only-if-supported.patch
source = Bluetooth-fix-deadlock-for-RFCOMM-sk-state-change.patch
+ source = btrfs-fix-autodefrag-on-5.16.9.patch
source = Revert-XANMOD-fair-Remove-all-energy-efficiency-functions.patch
source = cpufreq-CPPC-Fix-performance-frequency-conversion.patch
source = udp-ipv6-optimisations-v2-net-next.patch
@@ -56,6 +57,7 @@ pkgbase = linux-xanmod-rog
sha256sums = 241f01f06849fcec462d72355ca3ab6bd34931731dec89876d785912ac532398
sha256sums = dd01bd3f774c3a9af42b6d89f534f39c4a5f200db32cd6d4b72a29325645100e
sha256sums = a9647897e59b04cb883dcf649b3108e9397d5a6c672bc545ea0c6bb7bb30d5a9
+ sha256sums = cd2795ab2c355eb0182cba2940712552ff46eee95b04abb41327c208f7f3e546
sha256sums = 3bb1cf422c64b4eea324b71048d0bdee04b5f9132136c6a4774e5205e45c46f1
sha256sums = 5c6c7778bc2d873657a885272956e232138b8b4935c3a3d6b11ef1619d344b20
sha256sums = 56f8f93a38ed7236c2504c79645a33123ee7bdf3c0cbb97dfd90600df06be7dd
diff --git a/PKGBUILD b/PKGBUILD
index c821e026bb5b..910eea0a98a4 100644
--- a/PKGBUILD
+++ b/PKGBUILD
@@ -114,6 +114,9 @@ source=("https://cdn.kernel.org/pub/linux/kernel/v${_branch}/linux-${_major}.tar
"Bluetooth-Read-codec-capabilities-only-if-supported.patch"
"Bluetooth-fix-deadlock-for-RFCOMM-sk-state-change.patch"
+ # hotfix: address btrfs autodefrag excessive utilization
+ "btrfs-fix-autodefrag-on-5.16.9.patch"
+
# Revert Xanmod scheduler power efficiency removal
"Revert-XANMOD-fair-Remove-all-energy-efficiency-functions.patch"
@@ -180,6 +183,7 @@ sha256sums=('027d7e8988bb69ac12ee92406c3be1fe13f990b1ca2249e226225cd1573308bb'
'241f01f06849fcec462d72355ca3ab6bd34931731dec89876d785912ac532398'
'dd01bd3f774c3a9af42b6d89f534f39c4a5f200db32cd6d4b72a29325645100e'
'a9647897e59b04cb883dcf649b3108e9397d5a6c672bc545ea0c6bb7bb30d5a9'
+ 'cd2795ab2c355eb0182cba2940712552ff46eee95b04abb41327c208f7f3e546'
'3bb1cf422c64b4eea324b71048d0bdee04b5f9132136c6a4774e5205e45c46f1'
'5c6c7778bc2d873657a885272956e232138b8b4935c3a3d6b11ef1619d344b20'
'56f8f93a38ed7236c2504c79645a33123ee7bdf3c0cbb97dfd90600df06be7dd'
diff --git a/btrfs-fix-autodefrag-on-5.16.9.patch b/btrfs-fix-autodefrag-on-5.16.9.patch
new file mode 100644
index 000000000000..33053ea7b449
--- /dev/null
+++ b/btrfs-fix-autodefrag-on-5.16.9.patch
@@ -0,0 +1,6417 @@
+From 6c67e14b140aba83be3aee93961ade179dbc2473 Mon Sep 17 00:00:00 2001
+From: Scott B <arglebargle@arglebargle.dev>
+Date: Fri, 11 Feb 2022 23:52:12 -0800
+Subject: [PATCH] btrfs fix autodefrag on 5.16.9
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Squashed commit of the following:
+
+commit 7af5a9b695e62bdb82b55cb255c448e3af3ac587
+Author: Qu Wenruo <wqu@suse.com>
+Date: Fri Feb 11 14:41:43 2022 +0800
+
+ btrfs: defrag: make btrfs_defrag_file() to report accurate number of defragged sectors
+
+ Previously the reworked btrfs_defrag_file() could only report the number of
+ sectors from the first run of defrag_collect_targets().
+
+ This number is not accurate as if holes are punched after the first
+ defrag_collect_targets() call, we will not choose to defrag the holes.
+
+ Originally this was done to avoid passing @sectors_defragged to every
+ involved function.
+
+ But now since we have btrfs_defrag_ctrl, there is no need to do such
+ inaccurate accounting, just update btrfs_defrag_ctrl::sectors_defragged
+ after a successful defrag_one_locked_target() call.
+
+ Signed-off-by: Qu Wenruo <wqu@suse.com>
+ Reviewed-by: Filipe Manana <fdmanana@suse.com>
+
+commit 7d6ad9ac62135f86c190f4ccf1ea1e8bb2e13480
+Author: Qu Wenruo <wqu@suse.com>
+Date: Fri Feb 11 14:41:42 2022 +0800
+
+ btrfs: defrag: use btrfs_defrag_ctrl to replace btrfs_ioctl_defrag_range_args for btrfs_defrag_file()
+
+ This brings the following benefits:
+
+ - No more strange range->start update to indicate last scanned bytenr
+ We have btrfs_defrag_ctrl::last_scanned (exclusive) for it directly.
+
+ - No more return value to indicate defragged sectors
+ Now btrfs_defrag_file() will just return 0 if no error happened.
+ And btrfs_defrag_ctrl::sectors_defragged will show that value.
+
+ - Less parameters to carry around
+ Now most defrag_* functions only need to fetch their policy parameters
+ from btrfs_defrag_ctrl directly.
+
+ Signed-off-by: Qu Wenruo <wqu@suse.com>
+
+commit b19878cde4728eeb3b5e017a6718ffd9e263c1a2
+Author: Qu Wenruo <wqu@suse.com>
+Date: Fri Feb 11 14:41:41 2022 +0800
+
+ btrfs: defrag: introduce btrfs_defrag_ctrl structure for later usage
+
+ Currently btrfs_defrag_file() accepts not only
+ btrfs_ioctl_defrag_range_args but also other parameters like @newer_than
+ and @max_sectors_to_defrag for extra policies.
+
+ Those extra values are hidden from defrag ioctl and even caused bugs in
+ the past due to different behaviors based on those extra values.
+
+ Here we introduce a new structure, btrfs_defrag_ctrl, to include:
+
+ - all members in btrfs_ioctl_defrag_range_args
+
+ - @max_sectors_to_defrag and @newer_than
+
+ - Extra values which callers of btrfs_defrag_file() may care about,
+ like @sectors_defragged and @last_scanned.
+
+ With the new structure, also introduce a new helper,
+ btrfs_defrag_ioctl_args_to_ctrl() to:
+
+ - Do extra sanity check on @compress and @flags
+
+ - Do range alignment when possible
+
+ - Set default values.
+
+ Signed-off-by: Qu Wenruo <wqu@suse.com>
+ Reviewed-by: Filipe Manana <fdmanana@suse.com>
+
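+    A minimal sketch (not a hunk from this patch) of what a control structure
+    like btrfs_defrag_ctrl could look like, inferred only from the member list
+    above; the exact field types, names and ordering in the real patch may
+    differ:
+
+      struct btrfs_defrag_ctrl {
+              /* Input: range and policy, mirroring btrfs_ioctl_defrag_range_args */
+              u64 start;
+              u64 len;
+              u32 extent_thresh;
+              u32 compress_type;
+              u64 newer_than;
+              u64 max_sectors_to_defrag;
+              /* Output for callers of btrfs_defrag_file() */
+              u64 sectors_defragged;
+              u64 last_scanned;       /* exclusive */
+      };
+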
+commit a510f6c16dbfead2fcf0b04489d676d16851ba9e
+Author: Qu Wenruo <wqu@suse.com>
+Date: Fri Feb 11 14:41:39 2022 +0800
+
+ btrfs: defrag: allow defrag_one_cluster() to skip large extent which is not a target
+
+ In the rework of btrfs_defrag_file(), we always call
+ defrag_one_cluster() and increase the offset by cluster size, which is
+ only 256K.
+
+ But there are cases where we have a large extent (e.g. 128M) which
+ doesn't need to be defragged at all.
+
+ Before the refactor, we could directly skip the range, but now we have to
+ scan that extent map again and again until the cluster moves past the
+ non-target extent.
+
+ Fix the problem by allowing defrag_one_cluster() to increase
+ btrfs_defrag_ctrl::last_scanned to the end of an extent, if and only if
+ the last extent of the cluster is not a target.
+
+ The test script looks like this:
+
+ mkfs.btrfs -f $dev > /dev/null
+
+ mount $dev $mnt
+
+ # As btrfs ioctl uses 32M as extent_threshold
+ xfs_io -f -c "pwrite 0 64M" $mnt/file1
+ sync
+ # Some fragmented ranges to defrag
+ xfs_io -s -c "pwrite 65548k 4k" \
+ -c "pwrite 65544k 4k" \
+ -c "pwrite 65540k 4k" \
+ -c "pwrite 65536k 4k" \
+ $mnt/file1
+ sync
+
+ echo "=== before ==="
+ xfs_io -c "fiemap -v" $mnt/file1
+ echo "=== after ==="
+ btrfs fi defrag $mnt/file1
+ sync
+ xfs_io -c "fiemap -v" $mnt/file1
+ umount $mnt
+
+ With extra ftrace put into defrag_one_cluster(), before the patch it
+ would result in tons of loops:
+
+ (As defrag_one_cluster() is inlined, the function name is its caller)
+
+ btrfs-126062 [005] ..... 4682.816026: btrfs_defrag_file: r/i=5/257 start=0 len=262144
+ btrfs-126062 [005] ..... 4682.816027: btrfs_defrag_file: r/i=5/257 start=262144 len=262144
+ btrfs-126062 [005] ..... 4682.816028: btrfs_defrag_file: r/i=5/257 start=524288 len=262144
+ btrfs-126062 [005] ..... 4682.816028: btrfs_defrag_file: r/i=5/257 start=786432 len=262144
+ btrfs-126062 [005] ..... 4682.816028: btrfs_defrag_file: r/i=5/257 start=1048576 len=262144
+ ...
+ btrfs-126062 [005] ..... 4682.816043: btrfs_defrag_file: r/i=5/257 start=67108864 len=262144
+
+ But with this patch there will be just one loop, then directly to the
+ end of the extent:
+
+ btrfs-130471 [014] ..... 5434.029558: defrag_one_cluster: r/i=5/257 start=0 len=262144
+ btrfs-130471 [014] ..... 5434.029559: defrag_one_cluster: r/i=5/257 start=67108864 len=16384
+
+ Cc: stable@vger.kernel.org # 5.16
+ Signed-off-by: Qu Wenruo <wqu@suse.com>
+ Reviewed-by: Filipe Manana <fdmanana@suse.com>
+
+commit 3f2d69fc4a7a4ce3f389b9e84fa3c830f6a8b5c5
+Author: Qu Wenruo <wqu@suse.com>
+Date: Fri Feb 11 14:41:40 2022 +0800
+
+ btrfs: uapi: introduce BTRFS_DEFRAG_RANGE_MASK for later sanity check
+
+ And since we're here, replace the hardcoded bit flags (1, 2) with
+ (1UL << 0) and (1UL << 1), respectively.
+
+ Signed-off-by: Qu Wenruo <wqu@suse.com>
+ Reviewed-by: Filipe Manana <fdmanana@suse.com>
+
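+    For illustration, and assuming the existing uapi flag values (compress = 1,
+    start-io = 2), the rewritten flags and a plausible composition of the new
+    mask could look like this (the exact definition in the patch may differ):
+
+      #define BTRFS_DEFRAG_RANGE_COMPRESS (1UL << 0)
+      #define BTRFS_DEFRAG_RANGE_START_IO (1UL << 1)
+      /* reject ioctl callers passing any unknown bits */
+      #define BTRFS_DEFRAG_RANGE_MASK (BTRFS_DEFRAG_RANGE_COMPRESS | \
+                                       BTRFS_DEFRAG_RANGE_START_IO)
+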
+commit b6c665523425451af94eb3f044d4474c81f94b1e
+Author: Qu Wenruo <wqu@suse.com>
+Date: Fri Jan 28 15:21:22 2022 +0800
+
+ btrfs: defrag: remove an ambiguous condition for rejection
+
+ From the very beginning of btrfs defrag, there is a check to reject
+ extents which meet both conditions:
+
+ - Physically adjacent
+
+ We may want to defrag physically adjacent extents to reduce the number
+ of extents or the size of subvolume tree.
+
+ - Larger than 128K
+
+ This may be there for compressed extents, but unfortunately 128K is
+ exactly the max capacity for compressed extents.
+ And the check is > 128K, thus it never rejects compressed extents.
+
+ Furthermore, the compressed extent capacity bug is fixed by the previous
+ patch, so there is no reason for that check anymore.
+
+ The original check has only a very small range to reject (the target extent
+ size is > 128K, and the default extent threshold is 256K), and for
+ compressed extents it doesn't work at all.
+
+ So it's better just to remove the rejection, and allow us to defrag
+ physically adjacent extents.
+
+ Reviewed-by: Filipe Manana <fdmanana@suse.com>
+ Signed-off-by: Qu Wenruo <wqu@suse.com>
+ Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit cb53ba48a2b6c9126d128f301b4ed8085dcbce7b
+Author: Qu Wenruo <wqu@suse.com>
+Date: Fri Jan 28 15:21:21 2022 +0800
+
+ btrfs: defrag: don't defrag extents which is already at its max capacity
+
+ [BUG]
+ For compressed extents, defrag ioctl will always try to defrag any
+ compressed extents, wasting not only IO but also CPU time to
+ compress/decompress:
+
+ mkfs.btrfs -f $DEV
+ mount -o compress $DEV $MNT
+ xfs_io -f -c "pwrite -S 0xab 0 128K" $MNT/foobar
+ sync
+ xfs_io -f -c "pwrite -S 0xcd 128K 128K" $MNT/foobar
+ sync
+ echo "=== before ==="
+ xfs_io -c "fiemap -v" $MNT/foobar
+ btrfs filesystem defrag $MNT/foobar
+ sync
+ echo "=== after ==="
+ xfs_io -c "fiemap -v" $MNT/foobar
+
+ Then it shows the 2 128K extents just get CoW for no extra benefit, with
+ extra IO/CPU spent:
+
+ === before ===
+ /mnt/btrfs/file1:
+ EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS
+ 0: [0..255]: 26624..26879 256 0x8
+ 1: [256..511]: 26632..26887 256 0x9
+ === after ===
+ /mnt/btrfs/file1:
+ EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS
+ 0: [0..255]: 26640..26895 256 0x8
+ 1: [256..511]: 26648..26903 256 0x9
+
+ This affects not only v5.16 (after the defrag rework), but also v5.15
+ (before the defrag rework).
+
+ [CAUSE]
+ From the very beginning, btrfs defrag never checks if one extent is
+ already at its max capacity (128K for compressed extents, 128M
+ otherwise).
+
+ And the default extent size threshold is 256K, which is already beyond
+ the compressed extent max size.
+
+ This means that by default the btrfs defrag ioctl will mark all compressed
+ extents which are not adjacent to a hole/preallocated range for defrag.
+
+ [FIX]
+ Introduce a helper to grab the maximum extent size, and then in
+ defrag_collect_targets() and defrag_check_next_extent(), reject extents
+ which are already at their max capacity.
+
+ Reported-by: Filipe Manana <fdmanana@suse.com>
+ Reviewed-by: Filipe Manana <fdmanana@suse.com>
+ Signed-off-by: Qu Wenruo <wqu@suse.com>
+ Signed-off-by: David Sterba <dsterba@suse.com>
+
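+    A sketch of the helper described in [FIX], assuming the existing btrfs
+    constants BTRFS_MAX_COMPRESSED (128K) and BTRFS_MAX_EXTENT_SIZE (128M);
+    the helper name and exact placement are illustrative:
+
+      static u64 extent_max_capacity(const struct extent_map *em)
+      {
+              /* compressed extents can never grow past 128K */
+              if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+                      return BTRFS_MAX_COMPRESSED;
+              return BTRFS_MAX_EXTENT_SIZE;
+      }
+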
+commit e5cf566d32d6f7b244d87ab0c0797b43d54b4c37
+Author: Qu Wenruo <wqu@suse.com>
+Date: Fri Jan 28 15:21:20 2022 +0800
+
+ btrfs: defrag: don't try to merge regular extents with preallocated extents
+
+ [BUG]
+ With older kernels (before v5.16), btrfs will defrag preallocated extents.
+ With newer kernels (v5.16 and newer) btrfs will not defrag preallocated
+ extents, but it will still defrag the extent just before the preallocated
+ extent, even if it's just a single sector.
+
+ This can be exposed by the following small script:
+
+ mkfs.btrfs -f $dev > /dev/null
+
+ mount $dev $mnt
+ xfs_io -f -c "pwrite 0 4k" -c sync -c "falloc 4k 16K" $mnt/file
+ xfs_io -c "fiemap -v" $mnt/file
+ btrfs fi defrag $mnt/file
+ sync
+ xfs_io -c "fiemap -v" $mnt/file
+
+ The output looks like this on older kernels:
+
+ /mnt/btrfs/file:
+ EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS
+ 0: [0..7]: 26624..26631 8 0x0
+ 1: [8..39]: 26632..26663 32 0x801
+ /mnt/btrfs/file:
+ EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS
+ 0: [0..39]: 26664..26703 40 0x1
+
+ This defrags the single sector along with the preallocated extent, and
+ replaces them with a regular extent in a new location (caused by data
+ COW).
+ This wastes most of the data IO just for the preallocated range.
+
+ On the other hand, v5.16 is slightly better:
+
+ /mnt/btrfs/file:
+ EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS
+ 0: [0..7]: 26624..26631 8 0x0
+ 1: [8..39]: 26632..26663 32 0x801
+ /mnt/btrfs/file:
+ EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS
+ 0: [0..7]: 26664..26671 8 0x0
+ 1: [8..39]: 26632..26663 32 0x801
+
+ The preallocated range is not defragged, but the sector before it still
+ gets defragged, which is unnecessary.
+
+ [CAUSE]
+ One of the functions reused by the old and new behavior is
+ defrag_check_next_extent(); it determines whether we should defrag the
+ current extent by checking the next one.
+
+ It only checks if the next extent is a hole or inlined, but it doesn't
+ check if it's preallocated.
+
+ On the other hand, outside that function, both old and new kernels
+ reject preallocated extents.
+
+ This inconsistency causes the behavior above.
+
+ [FIX]
+ - Also check if next extent is preallocated
+ If so, don't defrag current extent.
+
+ - Add comments for each branch why we reject the extent
+
+ This will reduce the IO caused by defrag ioctl and autodefrag.
+
+ Reviewed-by: Filipe Manana <fdmanana@suse.com>
+ Signed-off-by: Qu Wenruo <wqu@suse.com>
+ Signed-off-by: David Sterba <dsterba@suse.com>
+
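+    A sketch of the extra rejection inside defrag_check_next_extent(), using
+    the existing EXTENT_FLAG_PREALLOC extent map flag; the variable name and
+    surrounding code are assumptions:
+
+      /* preallocated extents are never defrag targets, so do not defrag
+       * the current extent only to merge it with a preallocated one */
+      if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
+              return false;
+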
+commit f9649fa5a78f6e27a8bb4ec026efe3b4c1d64bc8
+Author: Sidong Yang <realwakka@gmail.com>
+Date: Sun Feb 6 12:52:48 2022 +0000
+
+ btrfs: qgroup: remove duplicated check in adding qgroup relations
+
+ Remove a duplicated check when adding qgroup relations.
+ The btrfs_add_qgroup_relation() function adds relations by calling
+ add_relation_rb(), which checks that the member/parent id exists in the
+ current qgroup tree. But that was already checked before calling the
+ function, so there is no need to check it twice.
+
+ Add a new function, __add_relation_rb(), that adds relations given
+ qgroup structures, and make the old function use it. This lets
+ btrfs_add_qgroup_relation() work without the double check by
+ calling the new function.
+
+ Signed-off-by: Sidong Yang <realwakka@gmail.com>
+ Reviewed-by: David Sterba <dsterba@suse.com>
+ [ add comments ]
+ Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 843263d23fb3348d562a0e410d3e8e552e829ef3
+Author: Dāvis Mosāns <davispuh@gmail.com>
+Date: Wed Feb 2 23:44:54 2022 +0200
+
+ btrfs: add lzo workspace buffer length constants
+
+ It makes the length checking more readable, and the constants are used repeatedly.
+
+ Signed-off-by: Dāvis Mosāns <davispuh@gmail.com>
+ Reviewed-by: David Sterba <dsterba@suse.com>
+ Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit b1794acfaaef72cc21c3ec3f92d63b1da0842f54
+Author: Dāvis Mosāns <davispuh@gmail.com>
+Date: Wed Feb 2 23:44:55 2022 +0200
+
+ btrfs: prevent copying too big compressed lzo segment
+
+ The compressed length can be corrupted to be a lot larger than the memory
+ we have allocated for the buffer.
+ This causes the memcpy in copy_compressed_segment() to write outside
+ of the allocated memory.
+
+ This mostly results in a stuck read syscall, but sometimes when using
+ btrfs send it can trigger a #GP:
+
+ kernel: general protection fault, probably for non-canonical address 0x841551d5c1000: 0000 [#1] PREEMPT SMP NOPTI
+ kernel: CPU: 17 PID: 264 Comm: kworker/u256:7 Tainted: P OE 5.17.0-rc2-1 #12
+ kernel: Workqueue: btrfs-endio btrfs_work_helper [btrfs]
+ kernel: RIP: 0010:lzo_decompress_bio (./include/linux/fortify-string.h:225 fs/btrfs/lzo.c:322 fs/btrfs/lzo.c:394) btrfs
+ Code starting with the faulting instruction
+ ===========================================
+ 0:* 48 8b 06 mov (%rsi),%rax <-- trapping instruction
+ 3: 48 8d 79 08 lea 0x8(%rcx),%rdi
+ 7: 48 83 e7 f8 and $0xfffffffffffffff8,%rdi
+ b: 48 89 01 mov %rax,(%rcx)
+ e: 44 89 f0 mov %r14d,%eax
+ 11: 48 8b 54 06 f8 mov -0x8(%rsi,%rax,1),%rdx
+ kernel: RSP: 0018:ffffb110812efd50 EFLAGS: 00010212
+ kernel: RAX: 0000000000001000 RBX: 000000009ca264c8 RCX: ffff98996e6d8ff8
+ kernel: RDX: 0000000000000064 RSI: 000841551d5c1000 RDI: ffffffff9500435d
+ kernel: RBP: ffff989a3be856c0 R08: 0000000000000000 R09: 0000000000000000
+ kernel: R10: 0000000000000000 R11: 0000000000001000 R12: ffff98996e6d8000
+ kernel: R13: 0000000000000008 R14: 0000000000001000 R15: 000841551d5c1000
+ kernel: FS: 0000000000000000(0000) GS:ffff98a09d640000(0000) knlGS:0000000000000000
+ kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+ kernel: CR2: 00001e9f984d9ea8 CR3: 000000014971a000 CR4: 00000000003506e0
+ kernel: Call Trace:
+ kernel: <TASK>
+ kernel: end_compressed_bio_read (fs/btrfs/compression.c:104 fs/btrfs/compression.c:1363 fs/btrfs/compression.c:323) btrfs
+ kernel: end_workqueue_fn (fs/btrfs/disk-io.c:1923) btrfs
+ kernel: btrfs_work_helper (fs/btrfs/async-thread.c:326) btrfs
+ kernel: process_one_work (./arch/x86/include/asm/jump_label.h:27 ./include/linux/jump_label.h:212 ./include/trace/events/workqueue.h:108 kernel/workqueue.c:2312)
+ kernel: worker_thread (./include/linux/list.h:292 kernel/workqueue.c:2455)
+ kernel: ? process_one_work (kernel/workqueue.c:2397)
+ kernel: kthread (kernel/kthread.c:377)
+ kernel: ? kthread_complete_and_exit (kernel/kthread.c:332)
+ kernel: ret_from_fork (arch/x86/entry/entry_64.S:301)
+ kernel: </TASK>
+
+ CC: stable@vger.kernel.org # 4.9+
+ Signed-off-by: Dāvis Mosāns <davispuh@gmail.com>
+ Reviewed-by: David Sterba <dsterba@suse.com>
+ Signed-off-by: David Sterba <dsterba@suse.com>
+
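+    A sketch of the guard this fix adds before copying a segment; the constant
+    name (presumably the workspace buffer length introduced by the earlier
+    "lzo workspace buffer length constants" commit), the variable name and the
+    label are assumptions:
+
+      /* a corrupted segment length would overflow the workspace buffer */
+      if (seg_len > WORKSPACE_CBUF_LENGTH) {
+              ret = -EIO;
+              goto out;
+      }
+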
+commit 09ad560a431c83f3741f1e545924c7fbb8957dd4
+Author: Dāvis Mosāns <davispuh@gmail.com>
+Date: Sat Feb 5 20:48:23 2022 +0200
+
+ btrfs: send: in case of IO error log it
+
+ Currently, if we get an IO error while doing a send, we abort without
+ logging information about which file caused the issue. So log it to help
+ with debugging.
+
+ CC: stable@vger.kernel.org # 4.9+
+ Signed-off-by: Dāvis Mosāns <davispuh@gmail.com>
+ Reviewed-by: David Sterba <dsterba@suse.com>
+ Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 43b4cb906eef17f9a7ca8e660f3e9e44176082f6
+Author: Filipe Manana <fdmanana@suse.com>
+Date: Wed Feb 2 15:26:09 2022 +0000
+
+ btrfs: get rid of warning on transaction commit when using flushoncommit
+
+ When using the flushoncommit mount option, during almost every transaction
+ commit we trigger a warning from __writeback_inodes_sb_nr():
+
+ $ cat fs/fs-writeback.c:
+ (...)
+ static void __writeback_inodes_sb_nr(struct super_block *sb, ...
+ {
+ (...)
+ WARN_ON(!rwsem_is_locked(&sb->s_umount));
+ (...)
+ }
+ (...)
+
+ The trace produced in dmesg looks like the following:
+
+ [947.473890] WARNING: CPU: 5 PID: 930 at fs/fs-writeback.c:2610 __writeback_inodes_sb_nr+0x7e/0xb3
+ [947.481623] Modules linked in: nfsd nls_cp437 cifs asn1_decoder cifs_arc4 fscache cifs_md4 ipmi_ssif
+ [947.489571] CPU: 5 PID: 930 Comm: btrfs-transacti Not tainted 95.16.3-srb-asrock-00001-g36437ad63879 #186
+ [947.497969] RIP: 0010:__writeback_inodes_sb_nr+0x7e/0xb3
+ [947.502097] Code: 24 10 4c 89 44 24 18 c6 (...)
+ [947.519760] RSP: 0018:ffffc90000777e10 EFLAGS: 00010246
+ [947.523818] RAX: 0000000000000000 RBX: 0000000000963300 RCX: 0000000000000000
+ [947.529765] RDX: 0000000000000000 RSI: 000000000000fa51 RDI: ffffc90000777e50
+ [947.535740] RBP: ffff888101628a90 R08: ffff888100955800 R09: ffff888100956000
+ [947.541701] R10: 0000000000000002 R11: 0000000000000001 R12: ffff888100963488
+ [947.547645] R13: ffff888100963000 R14: ffff888112fb7200 R15: ffff888100963460
+ [947.553621] FS: 0000000000000000(0000) GS:ffff88841fd40000(0000) knlGS:0000000000000000
+ [947.560537] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+ [947.565122] CR2: 0000000008be50c4 CR3: 000000000220c000 CR4: 00000000001006e0
+ [947.571072] Call Trace:
+ [947.572354] <TASK>
+ [947.573266] btrfs_commit_transaction+0x1f1/0x998
+ [947.576785] ? start_transaction+0x3ab/0x44e
+ [947.579867] ? schedule_timeout+0x8a/0xdd
+ [947.582716] transaction_kthread+0xe9/0x156
+ [947.585721] ? btrfs_cleanup_transaction.isra.0+0x407/0x407
+ [947.590104] kthread+0x131/0x139
+ [947.592168] ? set_kthread_struct+0x32/0x32
+ [947.595174] ret_from_fork+0x22/0x30
+ [947.597561] </TASK>
+ [947.598553] ---[ end trace 644721052755541c ]---
+
+ This is because we started using writeback_inodes_sb() to flush delalloc
+ when committing a transaction (when using -o flushoncommit), in order to
+ avoid deadlocks with filesystem freeze operations. This change was made
+ by commit ce8ea7cc6eb313 ("btrfs: don't call btrfs_start_delalloc_roots
+ in flushoncommit"). After that change we started producing that warning,
+ and every now and then a user reports this since the warning happens too
+ and every now and then a user reports it, since the warning happens too
+ often, spams dmesg/syslog, and leaves users unsure whether it reflects any
+ problem that might compromise the filesystem's reliability.
+ We can not just lock the sb->s_umount semaphore before calling
+ writeback_inodes_sb(), because that would at least deadlock with
+ filesystem freezing, since at fs/super.c:freeze_super() sync_filesystem()
+ is called while we are holding that semaphore in write mode, and that can
+ trigger a transaction commit, resulting in a deadlock. It would also
+ trigger the same type of deadlock in the unmount path. Possibly, it could
+ also introduce some other locking dependencies that lockdep would report.
+
+ To fix this call try_to_writeback_inodes_sb() instead of
+ writeback_inodes_sb(), because that will try to read lock sb->s_umount
+ and then will only call writeback_inodes_sb() if it was able to lock it.
+ This is fine because the cases where it can't read lock sb->s_umount
+ are during a filesystem unmount or during a filesystem freeze - in those
+ cases sb->s_umount is write locked and sync_filesystem() is called, which
+ calls writeback_inodes_sb(). In other words, in all cases where we can't
+ take a read lock on sb->s_umount, writeback is already being triggered
+ elsewhere.
+
+ An alternative would be to call btrfs_start_delalloc_roots() with a
+ number of pages different from LONG_MAX, for example matching the number
+ of delalloc bytes we currently have, in which case we would end up
+ starting all delalloc with filemap_fdatawrite_wbc() and not with an
+ async flush via filemap_flush() - that is only possible after the rather
+ recent commit e076ab2a2ca70a ("btrfs: shrink delalloc pages instead of
+ full inodes"). However that creates a whole new can of worms due to new
+ lock dependencies, which lockdep complains, like for example:
+
+ [ 8948.247280] ======================================================
+ [ 8948.247823] WARNING: possible circular locking dependency detected
+ [ 8948.248353] 5.17.0-rc1-btrfs-next-111 #1 Not tainted
+ [ 8948.248786] ------------------------------------------------------
+ [ 8948.249320] kworker/u16:18/933570 is trying to acquire lock:
+ [ 8948.249812] ffff9b3de1591690 (sb_internal#2){.+.+}-{0:0}, at: find_free_extent+0x141e/0x1590 [btrfs]
+ [ 8948.250638]
+ but task is already holding lock:
+ [ 8948.251140] ffff9b3e09c717d8 (&root->delalloc_mutex){+.+.}-{3:3}, at: start_delalloc_inodes+0x78/0x400 [btrfs]
+ [ 8948.252018]
+ which lock already depends on the new lock.
+
+ [ 8948.252710]
+ the existing dependency chain (in reverse order) is:
+ [ 8948.253343]
+ -> #2 (&root->delalloc_mutex){+.+.}-{3:3}:
+ [ 8948.253950] __mutex_lock+0x90/0x900
+ [ 8948.254354] start_delalloc_inodes+0x78/0x400 [btrfs]
+ [ 8948.254859] btrfs_start_delalloc_roots+0x194/0x2a0 [btrfs]
+ [ 8948.255408] btrfs_commit_transaction+0x32f/0xc00 [btrfs]
+ [ 8948.255942] btrfs_mksubvol+0x380/0x570 [btrfs]
+ [ 8948.256406] btrfs_mksnapshot+0x81/0xb0 [btrfs]
+ [ 8948.256870] __btrfs_ioctl_snap_create+0x17f/0x190 [btrfs]
+ [ 8948.257413] btrfs_ioctl_snap_create_v2+0xbb/0x140 [btrfs]
+ [ 8948.257961] btrfs_ioctl+0x1196/0x3630 [btrfs]
+ [ 8948.258418] __x64_sys_ioctl+0x83/0xb0
+ [ 8948.258793] do_syscall_64+0x3b/0xc0
+ [ 8948.259146] entry_SYSCALL_64_after_hwframe+0x44/0xae
+ [ 8948.259709]
+ -> #1 (&fs_info->delalloc_root_mutex){+.+.}-{3:3}:
+ [ 8948.260330] __mutex_lock+0x90/0x900
+ [ 8948.260692] btrfs_start_delalloc_roots+0x97/0x2a0 [btrfs]
+ [ 8948.261234] btrfs_commit_transaction+0x32f/0xc00 [btrfs]
+ [ 8948.261766] btrfs_set_free_space_cache_v1_active+0x38/0x60 [btrfs]
+ [ 8948.262379] btrfs_start_pre_rw_mount+0x119/0x180 [btrfs]
+ [ 8948.262909] open_ctree+0x1511/0x171e [btrfs]
+ [ 8948.263359] btrfs_mount_root.cold+0x12/0xde [btrfs]
+ [ 8948.263863] legacy_get_tree+0x30/0x50
+ [ 8948.264242] vfs_get_tree+0x28/0xc0
+ [ 8948.264594] vfs_kern_mount.part.0+0x71/0xb0
+ [ 8948.265017] btrfs_mount+0x11d/0x3a0 [btrfs]
+ [ 8948.265462] legacy_get_tree+0x30/0x50
+ [ 8948.265851] vfs_get_tree+0x28/0xc0
+ [ 8948.266203] path_mount+0x2d4/0xbe0
+ [ 8948.266554] __x64_sys_mount+0x103/0x140
+ [ 8948.266940] do_syscall_64+0x3b/0xc0
+ [ 8948.267300] entry_SYSCALL_64_after_hwframe+0x44/0xae
+ [ 8948.267790]
+ -> #0 (sb_internal#2){.+.+}-{0:0}:
+ [ 8948.268322] __lock_acquire+0x12e8/0x2260
+ [ 8948.268733] lock_acquire+0xd7/0x310
+ [ 8948.269092] start_transaction+0x44c/0x6e0 [btrfs]
+ [ 8948.269591] find_free_extent+0x141e/0x1590 [btrfs]
+ [ 8948.270087] btrfs_reserve_extent+0x14b/0x280 [btrfs]
+ [ 8948.270588] cow_file_range+0x17e/0x490 [btrfs]
+ [ 8948.271051] btrfs_run_delalloc_range+0x345/0x7a0 [btrfs]
+ [ 8948.271586] writepage_delalloc+0xb5/0x170 [btrfs]
+ [ 8948.272071] __extent_writepage+0x156/0x3c0 [btrfs]
+ [ 8948.272579] extent_write_cache_pages+0x263/0x460 [btrfs]
+ [ 8948.273113] extent_writepages+0x76/0x130 [btrfs]
+ [ 8948.273573] do_writepages+0xd2/0x1c0
+ [ 8948.273942] filemap_fdatawrite_wbc+0x68/0x90
+ [ 8948.274371] start_delalloc_inodes+0x17f/0x400 [btrfs]
+ [ 8948.274876] btrfs_start_delalloc_roots+0x194/0x2a0 [btrfs]
+ [ 8948.275417] flush_space+0x1f2/0x630 [btrfs]
+ [ 8948.275863] btrfs_async_reclaim_data_space+0x108/0x1b0 [btrfs]
+ [ 8948.276438] process_one_work+0x252/0x5a0
+ [ 8948.276829] worker_thread+0x55/0x3b0
+ [ 8948.277189] kthread+0xf2/0x120
+ [ 8948.277506] ret_from_fork+0x22/0x30
+ [ 8948.277868]
+ other info that might help us debug this:
+
+ [ 8948.278548] Chain exists of:
+ sb_internal#2 --> &fs_info->delalloc_root_mutex --> &root->delalloc_mutex
+
+ [ 8948.279601] Possible unsafe locking scenario:
+
+ [ 8948.280102] CPU0 CPU1
+ [ 8948.280508] ---- ----
+ [ 8948.280915] lock(&root->delalloc_mutex);
+ [ 8948.281271] lock(&fs_info->delalloc_root_mutex);
+ [ 8948.281915] lock(&root->delalloc_mutex);
+ [ 8948.282487] lock(sb_internal#2);
+ [ 8948.282800]
+ *** DEADLOCK ***
+
+ [ 8948.283333] 4 locks held by kworker/u16:18/933570:
+ [ 8948.283750] #0: ffff9b3dc00a9d48 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work+0x1d2/0x5a0
+ [ 8948.284609] #1: ffffa90349dafe70 ((work_completion)(&fs_info->async_data_reclaim_work)){+.+.}-{0:0}, at: process_one_work+0x1d2/0x5a0
+ [ 8948.285637] #2: ffff9b3e14db5040 (&fs_info->delalloc_root_mutex){+.+.}-{3:3}, at: btrfs_start_delalloc_roots+0x97/0x2a0 [btrfs]
+ [ 8948.286674] #3: ffff9b3e09c717d8 (&root->delalloc_mutex){+.+.}-{3:3}, at: start_delalloc_inodes+0x78/0x400 [btrfs]
+ [ 8948.287596]
+ stack backtrace:
+ [ 8948.287975] CPU: 3 PID: 933570 Comm: kworker/u16:18 Not tainted 5.17.0-rc1-btrfs-next-111 #1
+ [ 8948.288677] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
+ [ 8948.289649] Workqueue: events_unbound btrfs_async_reclaim_data_space [btrfs]
+ [ 8948.290298] Call Trace:
+ [ 8948.290517] <TASK>
+ [ 8948.290700] dump_stack_lvl+0x59/0x73
+ [ 8948.291026] check_noncircular+0xf3/0x110
+ [ 8948.291375] ? start_transaction+0x228/0x6e0 [btrfs]
+ [ 8948.291826] __lock_acquire+0x12e8/0x2260
+ [ 8948.292241] lock_acquire+0xd7/0x310
+ [ 8948.292714] ? find_free_extent+0x141e/0x1590 [btrfs]
+ [ 8948.293241] ? lock_is_held_type+0xea/0x140
+ [ 8948.293601] start_transaction+0x44c/0x6e0 [btrfs]
+ [ 8948.294055] ? find_free_extent+0x141e/0x1590 [btrfs]
+ [ 8948.294518] find_free_extent+0x141e/0x1590 [btrfs]
+ [ 8948.294957] ? _raw_spin_unlock+0x29/0x40
+ [ 8948.295312] ? btrfs_get_alloc_profile+0x124/0x290 [btrfs]
+ [ 8948.295813] btrfs_reserve_extent+0x14b/0x280 [btrfs]
+ [ 8948.296270] cow_file_range+0x17e/0x490 [btrfs]
+ [ 8948.296691] btrfs_run_delalloc_range+0x345/0x7a0 [btrfs]
+ [ 8948.297175] ? find_lock_delalloc_range+0x247/0x270 [btrfs]
+ [ 8948.297678] writepage_delalloc+0xb5/0x170 [btrfs]
+ [ 8948.298123] __extent_writepage+0x156/0x3c0 [btrfs]
+ [ 8948.298570] extent_write_cache_pages+0x263/0x460 [btrfs]
+ [ 8948.299061] extent_writepages+0x76/0x130 [btrfs]
+ [ 8948.299495] do_writepages+0xd2/0x1c0
+ [ 8948.299817] ? sched_clock_cpu+0xd/0x110
+ [ 8948.300160] ? lock_release+0x155/0x4a0
+ [ 8948.300494] filemap_fdatawrite_wbc+0x68/0x90
+ [ 8948.300874] ? do_raw_spin_unlock+0x4b/0xa0
+ [ 8948.301243] start_delalloc_inodes+0x17f/0x400 [btrfs]
+ [ 8948.301706] ? lock_release+0x155/0x4a0
+ [ 8948.302055] btrfs_start_delalloc_roots+0x194/0x2a0 [btrfs]
+ [ 8948.302564] flush_space+0x1f2/0x630 [btrfs]
+ [ 8948.302970] btrfs_async_reclaim_data_space+0x108/0x1b0 [btrfs]
+ [ 8948.303510] process_one_work+0x252/0x5a0
+ [ 8948.303860] ? process_one_work+0x5a0/0x5a0
+ [ 8948.304221] worker_thread+0x55/0x3b0
+ [ 8948.304543] ? process_one_work+0x5a0/0x5a0
+ [ 8948.304904] kthread+0xf2/0x120
+ [ 8948.305184] ? kthread_complete_and_exit+0x20/0x20
+ [ 8948.305598] ret_from_fork+0x22/0x30
+ [ 8948.305921] </TASK>
+
+ It all comes from the fact that btrfs_start_delalloc_roots() takes the
+ delalloc_root_mutex, in the transaction commit path we are holding a
+ read lock on one of the superblock's freeze semaphores (via
+ sb_start_intwrite()), the async reclaim task can also do a call to
+ btrfs_start_delalloc_roots(), which ends up triggering writeback with
+ calls to filemap_fdatawrite_wbc(), resulting in extent allocation which
+ in turn can call btrfs_start_transaction(), which will result in taking
+ the freeze semaphore via sb_start_intwrite(), forming a nasty dependency
+ on all those locks which can be taken in different orders by different
+ code paths.
+
+ So just adopt the simple approach of calling try_to_writeback_inodes_sb()
+ at btrfs_start_delalloc_flush().
+
+ Link: https://lore.kernel.org/linux-btrfs/20220130005258.GA7465@cuci.nl/
+ Link: https://lore.kernel.org/linux-btrfs/43acc426-d683-d1b6-729d-c6bc4a2fff4d@gmail.com/
+ Link: https://lore.kernel.org/linux-btrfs/6833930a-08d7-6fbc-0141-eb9cdfd6bb4d@gmail.com/
+ Link: https://lore.kernel.org/linux-btrfs/20190322041731.GF16651@hungrycats.org/
+ Reviewed-by: Omar Sandoval <osandov@fb.com>
+ Signed-off-by: Filipe Manana <fdmanana@suse.com>
+ [ add more link reports ]
+ Signed-off-by: David Sterba <dsterba@suse.com>
+
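+    A sketch of the resulting flush helper, assuming the existing
+    btrfs_test_opt()/FLUSHONCOMMIT and WB_REASON_SYNC symbols; the function
+    shape is illustrative, not the literal hunk:
+
+      static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
+      {
+              if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
+                      /* only flushes if sb->s_umount can be read locked */
+                      try_to_writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC);
+              return 0;
+      }
+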
+commit b805349fbdc9a47199d96bc193f64b9399ec6761
+Author: Qu Wenruo <wqu@suse.com>
+Date: Tue Feb 8 14:54:05 2022 +0800
+
+ btrfs: defrag: don't try to defrag extents which are under writeback
+
+ Once we start writeback (have called btrfs_run_delalloc_range()), we
+ allocate an extent, create an extent map pointing to that extent with a
+ generation of (u64)-1, create the ordered extent, and then clear the
+ DELALLOC bit from the range in the inode's io tree.
+
+ Such an extent map can pass the first call of defrag_collect_targets(), as
+ its generation of (u64)-1 meets any possible minimal generation check.
+ And the range no longer has the DELALLOC bit, so it also passes the
+ DELALLOC bit check.
+
+ It will only be re-checked in the second call of
+ defrag_collect_targets(), which will wait for writeback.
+
+ But at that stage we have already spent our time waiting for some IO we
+ may or may not want to defrag.
+
+ Let's reject such extents early so we won't waste our time.
+
+ Reviewed-by: Filipe Manana <fdmanana@suse.com>
+ Signed-off-by: Qu Wenruo <wqu@suse.com>
+ Signed-off-by: David Sterba <dsterba@suse.com>
+
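+    A heavily simplified sketch of the early rejection in
+    defrag_collect_targets(), based on the (u64)-1 sentinel generation
+    described above; the real check in the patch may be expressed differently:
+
+      /* an em created for writeback carries the (u64)-1 generation; skip it
+       * instead of waiting for IO we may not even want to defrag */
+      if (em->generation == (u64)-1)
+              goto next;
+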
+commit e8a7717c71287a11dc81098199d7116d6a4b6006
+Author: Qu Wenruo <wqu@suse.com>
+Date: Tue Feb 8 13:31:19 2022 +0800
+
+ btrfs: populate extent_map::generation when reading from disk
+
+ When btrfs_get_extent() tries to get some file extent from disk, it
+ never populates extent_map::generation, leaving the value at 0.
+
+ On the other hand, an extent map generated by IO gets its
+ generation properly set at finish_ordered_io():
+
+ finish_ordered_io()
+ |- unpin_extent_cache(gen = trans->transid)
+ |- em->generation = gen;
+
+ [CAUSE]
+ extent_map::generation is mostly used by the fsync code, and fsync only
+ cares about modified extents, which all have their
+ em::generation > 0.
+
+ Thus it's fine not to populate it for extent maps read from disk, as far
+ as fsync is concerned.
+
+ [CORNER CASE]
+ However autodefrag also relies on em::generation to determine if one
+ extent needs to be defragged.
+
+ This unpopulated extent_map::generation can prevent the following
+ autodefrag case from working:
+
+ mkfs.btrfs -f $dev
+ mount $dev $mnt -o autodefrag
+
+ # initial write to queue the inode for autodefrag
+ xfs_io -f -c "pwrite 0 4k" $mnt/file
+ sync
+
+ # Real fragmented write
+ xfs_io -f -s -c "pwrite -b 4096 0 32k" $mnt/file
+ sync
+ echo "=== before autodefrag ==="
+ xfs_io -c "fiemap -v" $mnt/file
+
+ # Drop cache to force em to be read from disk
+ echo 3 > /proc/sys/vm/drop_caches
+ mount -o remount,commit=1 $mnt
+ sleep 3
+ sync
+
+ echo "=== After autodefrag ==="
+ xfs_io -c "fiemap -v" $mnt/file
+ umount $mnt
+
+ The result looks like this:
+
+ === before autodefrag ===
+ /mnt/btrfs/file:
+ EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS
+ 0: [0..15]: 26672..26687 16 0x0
+ 1: [16..31]: 26656..26671 16 0x0
+ 2: [32..47]: 26640..26655 16 0x0
+ 3: [48..63]: 26624..26639 16 0x1
+ === After autodefrag ===
+ /mnt/btrfs/file:
+ EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS
+ 0: [0..15]: 26672..26687 16 0x0
+ 1: [16..31]: 26656..26671 16 0x0
+ 2: [32..47]: 26640..26655 16 0x0
+ 3: [48..63]: 26624..26639 16 0x1
+
+ This fragmented 32K will not be defragged by autodefrag.
+
+ [FIX]
+ To make things less weird, just populate extent_map::generation when
+ reading file extents from disk.
+
+ This makes the above fragmented extents get properly defragged:
+
+ === before autodefrag ===
+ /mnt/btrfs/file:
+ EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS
+ 0: [0..15]: 26672..26687 16 0x0
+ 1: [16..31]: 26656..26671 16 0x0
+ 2: [32..47]: 26640..26655 16 0x0
+ 3: [48..63]: 26624..26639 16 0x1
+ === After autodefrag ===
+ /mnt/btrfs/file:
+ EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS
+ 0: [0..63]: 26688..26751 64 0x1
+
+ Reviewed-by: Filipe Manana <fdmanana@suse.com>
+ Signed-off-by: Qu Wenruo <wqu@suse.com>
+ Signed-off-by: David Sterba <dsterba@suse.com>
+
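+    The fix amounts to one assignment when btrfs_get_extent() fills the extent
+    map from the on-disk file extent item; a sketch using the existing
+    btrfs_file_extent_generation() accessor (variable names assumed):
+
+      em->generation = btrfs_file_extent_generation(leaf, fi);
+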
+commit df6d916f35305a85d6636256fbc9708a78df7465
+Author: Filipe Manana <fdmanana@suse.com>
+Date: Thu Feb 3 14:55:50 2022 +0000
+
+ btrfs: prepare extents to be logged before locking a log tree path
+
+ When we want to log an extent, in the fast fsync path, we obtain a path
+ to the leaf that will hold the file extent item either through a deletion
+ search, via btrfs_drop_extents(), or through an insertion search using
+ btrfs_insert_empty_item(). After that we fill the file extent item's
+ fields one by one directly on the leaf.
+
+ Instead of doing that, we could prepare the file extent item before
+ obtaining a btree path, and then copy the prepared extent item with a
+ single operation once we get the path. This helps avoid some contention
+ on the log tree, since we are holding write locks for longer than
+ necessary, especially in the case where the path is obtained via
+ btrfs_drop_extents() through a deletion search, which always keeps a
+ write lock on the nodes at levels 1 and 2 (besides the leaf).
+
+ This change does that, we prepare the file extent item that is going to
+ be inserted before acquiring a path, and then copy it into a leaf using
+ a single copy operation once we get a path.
+
+ This change is part of a patchset that is comprised of the following
+ patches:
+
+ 1/6 btrfs: remove unnecessary leaf free space checks when pushing items
+ 2/6 btrfs: avoid unnecessary COW of leaves when deleting items from a leaf
+ 3/6 btrfs: avoid unnecessary computation when deleting items from a leaf
+ 4/6 btrfs: remove constraint on number of visited leaves when replacing extents
+ 5/6 btrfs: remove useless path release in the fast fsync path
+ 6/6 btrfs: prepare extents to be logged before locking a log tree path
+
+ The following test was run to measure the impact of the whole patchset:
+
+ $ cat test.sh
+ #!/bin/bash
+
+ DEV=/dev/sdi
+ MNT=/mnt/sdi
+ MOUNT_OPTIONS="-o ssd"
+ MKFS_OPTIONS="-R free-space-tree -O no-holes"
+
+ NUM_JOBS=8
+ FILE_SIZE=128M
+ RUN_TIME=200
+
+ cat <<EOF > /tmp/fio-job.ini
+ [writers]
+ rw=randwrite
+ fsync=1
+ fallocate=none
+ group_reporting=1
+ direct=0
+ bssplit=4k/20:8k/20:16k/20:32k/10:64k/10:128k/5:256k/5:512k/5:1m/5
+ ioengine=sync
+ filesize=$FILE_SIZE
+ runtime=$RUN_TIME
+ time_based
+ directory=$MNT
+ numjobs=$NUM_JOBS
+ thread
+ EOF
+
+ echo "performance" | \
+ tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
+
+ echo
+ echo "Using config:"
+ echo
+ cat /tmp/fio-job.ini
+ echo
+
+ umount $MNT &> /dev/null
+ mkfs.btrfs -f $MKFS_OPTIONS $DEV
+ mount $MOUNT_OPTIONS $DEV $MNT
+
+ fio /tmp/fio-job.ini
+
+ umount $MNT
+
+ The test ran inside a VM (8 cores, 32G of RAM) with the target disk
+ mapping to a raw NVMe device, and using a non-debug kernel config
+ (Debian's default config).
+
+ Before the patchset:
+
+ WRITE: bw=116MiB/s (122MB/s), 116MiB/s-116MiB/s (122MB/s-122MB/s), io=22.7GiB (24.4GB), run=200013-200013msec
+
+ After the patchset:
+
+ WRITE: bw=125MiB/s (131MB/s), 125MiB/s-125MiB/s (131MB/s-131MB/s), io=24.3GiB (26.1GB), run=200007-200007msec
+
+ A 7.8% gain on throughput and +7.0% more IO done in the same period of
+ time (200 seconds).
+
+ Signed-off-by: Filipe Manana <fdmanana@suse.com>
+ Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 901ebe7172fa1fd03e4cc43d9d5f6a191d2e6428
+Author: Filipe Manana <fdmanana@suse.com>
+Date: Thu Feb 3 14:55:49 2022 +0000
+
+ btrfs: remove useless path release in the fast fsync path
+
+ There's no point in calling btrfs_release_path() after finishing the loop
+ that logs the modified extents, since log_one_extent() returns with the
+ path released. In case the list of extents is empty, the path is already
+ released, so there's no need for that case as well.
+ So just remove that unnecessary btrfs_release_path() call.
+
+ This change is part of a patchset that is comprised of the following
+ patches:
+
+ 1/6 btrfs: remove unnecessary leaf free space checks when pushing items
+ 2/6 btrfs: avoid unnecessary COW of leaves when deleting items from a leaf
+ 3/6 btrfs: avoid unnecessary computation when deleting items from a leaf
+ 4/6 btrfs: remove constraint on number of visited leaves when replacing extents
+ 5/6 btrfs: remove useless path release in the fast fsync path
+ 6/6 btrfs: prepare extents to be logged before locking a log tree path
+
+ The last patch in the series has some performance test result in its
+ changelog.
+
+ Signed-off-by: Filipe Manana <fdmanana@suse.com>
+ Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 12d0362b8dad82e240e37aa43f7d344f7206c009
+Author: Filipe Manana <fdmanana@suse.com>
+Date: Thu Feb 3 14:55:48 2022 +0000
+
+ btrfs: remove constraint on number of visited leaves when replacing extents
+
+ At btrfs_drop_extents(), we try to replace a range of file extent items
+ with a new file extent in a single btree search, to avoid the need to do
+ a search for deletion, followed by a path release and followed by yet
+ another search for insertion.
+
+ When I originally added that optimization, in commit 1acae57b161ef1
+ ("Btrfs: faster file extent item replace operations"), I left a constraint
+ to do the fast replace only if we visited a single leaf. That was because
+ in the most common case we find all file extent items that need to be
+ deleted (or trimmed) in a single leaf, however it can work for other
+ common cases like when we need to delete a few file extent items located
+ at the end of a leaf and a few more located at the beginning of the next
+ leaf. The key for the new file extent item is greater than the key of
+ any deleted or trimmed file extent item from previous leaves, so we are
+ fine to use the last leaf that we found as long as we are holding a
+ write lock on it - even if the new key ends up at slot 0, as if that's
+ the case, the btree search has obtained a write lock on any upper nodes
+ that need to have a key pointer updated.
+
+ So remove the constraint that limits the optimization to the case where
+ we visited only a single leaf.
+
+ This change is part of a patchset that is comprised of the following
+ patches:
+
+ 1/6 btrfs: remove unnecessary leaf free space checks when pushing items
+ 2/6 btrfs: avoid unnecessary COW of leaves when deleting items from a leaf
+ 3/6 btrfs: avoid unnecessary computation when deleting items from a leaf
+ 4/6 btrfs: remove constraint on number of visited leaves when replacing extents
+ 5/6 btrfs: remove useless path release in the fast fsync path
+ 6/6 btrfs: prepare extents to be logged before locking a log tree path
+
+ The last patch in the series has some performance test result in its
+ changelog.
+
+ Signed-off-by: Filipe Manana <fdmanana@suse.com>
+ Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 967aa565ee63760ea7f1c00be743f8e24ee83aa6
+Author: Filipe Manana <fdmanana@suse.com>
+Date: Thu Feb 3 14:55:47 2022 +0000
+
+ btrfs: avoid unnecessary computation when deleting items from a leaf
+
+ When deleting items from a leaf, we always compute the sum of the data
+ sizes of the items that are going to be deleted. However we only use
+ that sum when the last item to delete is behind the last item in the
+ leaf. This unnecessarily wastes CPU time when we are deleting either
+ the whole leaf or from some slot > 0 up to the last item in the leaf,
+ and both of these cases are common (e.g. truncation operation, either
+ as a result of truncate(2) or when logging inodes, deleting checksums
+ after removing a large enough extent, etc).
+
+ So compute only the sum of the data sizes if the last item to be
+ deleted does not match the last item in the leaf.
+
+ This change is part of a patchset that is comprised of the following
+ patches:
+
+ 1/6 btrfs: remove unnecessary leaf free space checks when pushing items
+ 2/6 btrfs: avoid unnecessary COW of leaves when deleting items from a leaf
+ 3/6 btrfs: avoid unnecessary computation when deleting items from a leaf
+ 4/6 btrfs: remove constraint on number of visited leaves when replacing extents
+ 5/6 btrfs: remove useless path release in the fast fsync path
+ 6/6 btrfs: prepare extents to be logged before locking a log tree path
+
+ The last patch in the series has some performance test result in its
+ changelog.
+
+ Signed-off-by: Filipe Manana <fdmanana@suse.com>
+ Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 4fd3e4a94b70dfab57ce617a2c8196a77e8cc29d
+Author: Filipe Manana <fdmanana@suse.com>
+Date: Thu Feb 3 14:55:46 2022 +0000
+
+ btrfs: avoid unnecessary COW of leaves when deleting items from a leaf
+
+ When we delete items from a leaf, if we end up with more than two thirds
+ of unused leaf space, we try to delete the leaf by moving all its items
+ into its left and right neighbour leaves. Sometimes that is not possible
+ because there is not enough free space in the left and right leaves, and
+ in that case we end up not deleting our leaf.
+
+ The way we are doing this is not ideal and can be improved in the
+ following ways:
+
+ 1) When we call push_leaf_left(), we pass a value of 1 byte to the data
+ size parameter of push_leaf_left(). This is not a realistic value because
+ no item can have a size less than 25 bytes, which is the size of struct
+ btrfs_item. This means that if the left leaf does not have enough
+ free space to push any item, we end up COWing it even if we end up not
+ changing its content at all.
+
+ COWing that leaf means allocating a new metadata extent, marking it
+ dirty and doing more IO when committing a transaction or when syncing a
+ log tree. For a log tree case, it's particularly more important to
+ avoid the useless COW operation, as more IO can imply a higher latency
+ for an fsync operation.
+
+ So instead of passing 1 as the minimum data size for push_leaf_left(),
+ pass the size of the first item in our leaf, as we don't want to COW
+ the left leaf if we can't at least push the first item of our leaf;
+
+ 2) When we call push_leaf_right(), we also pass a value of 1 byte as the
+ data size parameter of push_leaf_right(). Like the previous case, it
+ will also result in COWing the right leaf even if we are not able to
+ move any items into it, since there can't be any item with a size
+ smaller than 25 bytes (the size of struct btrfs_item).
+
+ So instead of passing 1 as the minimum data size to push_leaf_right(),
+ pass a size that corresponds to the sum of the size of all the
+ remaining items in our leaf. We are not interested in moving less than
+ that, because if we do, we are not able to delete our leaf and we have
+ COWed the right leaf for nothing. Plus, moving only some of the items
+ of our leaf means an even less balanced tree.
+
+ Just like the previous case, we want to avoid the useless COW of the
+ right leaf, this way we don't have to spend time allocating one new
+ metadata extent, and doing more IO when committing a transaction or
+ syncing a log tree. For the log tree case it's especially important
+ because more IO can result in a higher latency for an fsync operation.
+
+ So adjust the minimum data size passed to push_leaf_left() and
+ push_leaf_right() as mentioned above.
+
+ This change is part of a patchset that is comprised of the following
+ patches:
+
+ 1/6 btrfs: remove unnecessary leaf free space checks when pushing items
+ 2/6 btrfs: avoid unnecessary COW of leaves when deleting items from a leaf
+ 3/6 btrfs: avoid unnecessary computation when deleting items from a leaf
+ 4/6 btrfs: remove constraint on number of visited leaves when replacing extents
+ 5/6 btrfs: remove useless path release in the fast fsync path
+ 6/6 btrfs: prepare extents to be logged before locking a log tree path
+
+ Not being able to delete a leaf that became less than 1/3 full after
+ deleting items from it is actually common. For example, for the fio test
+ mentioned in the changelog of patch 6/6, we are only able to delete a
+ leaf at btrfs_del_items() about 5.3% of the time, due to its left and
+ right neighbour leaves not having enough free space to push all the
+ remaining items into them.
+
+ The last patch in the series has some performance test result in its
+ changelog.
+
+ Signed-off-by: Filipe Manana <fdmanana@suse.com>
+ Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 1b956d9978687f9d463a39c0d66a5eab958b9f3a
+Author: Filipe Manana <fdmanana@suse.com>
+Date: Thu Feb 3 14:55:45 2022 +0000
+
+ btrfs: remove unnecessary leaf free space checks when pushing items
+
+ When trying to push items from a leaf into its left and right neighbours,
+ we lock the left or right leaf, check if it has the required minimum free
+ space, COW the leaf and then check again if it has the minimum required
+ free space. This second check is pointless:
+
+ 1) First and foremost, because it's not needed. We have a write lock on the
+ leaf and on its parent node, so no one can come in and change either
+ the pre-COW or post-COW version of the leaf for the whole duration of
+ the push_leaf_left() and push_leaf_right() calls;
+
+ 2) The call to btrfs_leaf_free_space() is not trivial, it has a fair
+ amount of arithmetic operations and access to fields in the leaf's
+ header and items, so it's not very cheap.
+
+ So remove the duplicated free space checks.
+
+ This change is part of a patchset that is comprised of the following
+ patches:
+
+ 1/6 btrfs: remove unnecessary leaf free space checks when pushing items
+ 2/6 btrfs: avoid unnecessary COW of leaves when deleting items from a leaf
+ 3/6 btrfs: avoid unnecessary computation when deleting items from a leaf
+ 4/6 btrfs: remove constraint on number of visited leaves when replacing extents
+ 5/6 btrfs: remove useless path release in the fast fsync path
+ 6/6 btrfs: prepare extents to be logged before locking a log tree path
+
+ The last patch in the series has some performance test result in its
+ changelog.
+
+ Signed-off-by: Filipe Manana <fdmanana@suse.com>
+ Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 784a4d85814dea9bf1096e864d9368d032c0112a
+Author: David Sterba <dsterba@suse.com>
+Date: Tue Feb 1 15:42:07 2022 +0100
+
+ btrfs: replace BUILD_BUG_ON by static_assert
+
+ The static_assert introduced in 6bab69c65013 ("build_bug.h: add wrapper
+ for _Static_assert") has been supported by compilers for a long time
+ (gcc 4.6, clang 3.0) and can be used in header files. We don't need to
+ put BUILD_BUG_ON into random functions but can rather keep it next to the
+ definition.
+
+ The exception here is the UAPI header btrfs_tree.h that could be
+ potentially included by userspace code and the static assert is not
+ defined (nor used in any other header).
+
+ Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+ Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 2e0a36dc72d2a11fad03763dbd2ac4da106cfd1f
+Author: Qu Wenruo <wqu@suse.com>
+Date: Sun Jan 30 20:53:15 2022 +0800
+
+ btrfs: don't hold CPU for too long when defragging a file
+
+ There is a user report about "btrfs filesystem defrag" causing a 120s
+ timeout problem.
+
+ btrfs_defrag_file() will iterate over all file extents if called from the
+ defrag ioctl, and thus can take a long time.
+
+ There is no reason not to release the CPU during such a long operation.
+
+ Add cond_resched() after defragging one cluster.
+
+ CC: stable@vger.kernel.org # 5.16
+ Link: https://lore.kernel.org/linux-btrfs/10e51417-2203-f0a4-2021-86c8511cc367@gmx.com
+ Signed-off-by: Qu Wenruo <wqu@suse.com>
+ Reviewed-by: David Sterba <dsterba@suse.com>
+ Signed-off-by: David Sterba <dsterba@suse.com>
+
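+    A sketch of the loop shape with the added reschedule point; the function
+    and variable names here are illustrative:
+
+      while (cur < last_byte) {
+              ret = defrag_one_cluster(inode, ra, cur, cluster_size);
+              if (ret < 0)
+                      break;
+              cur += cluster_size;
+              /* give other tasks a chance to run between clusters */
+              cond_resched();
+      }
+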
+commit cb792362499dacbdd3986d10ad109d0efd875eab
+Author: Josef Bacik <josef@toxicpanda.com>
+Date: Fri Nov 5 16:45:28 2021 -0400
+
+ btrfs: rework async transaction committing
+
+ Currently we do this awful thing where we get another ref on a trans
+ handle, async off that handle and commit the transaction from that work.
+ Because we do this we have to mess with current->journal_info and the
+ freeze counting stuff.
+
+ We already have an async thing to kick for the transaction commit, the
+ transaction kthread. Replace this work struct with a flag on the
+ fs_info to tell the kthread to go ahead and commit even if it's before
+ our timeout. Then we can drastically simplify the async transaction
+ commit path.
+
+ Note: this can be simplified and functionality based on the pending
+ operation COMMIT.
+
+ Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+ [ add note ]
+ Reviewed-by: David Sterba <dsterba@suse.com>
+ Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 42786287b2e3443e3a3c90d7305fdc9b4287f00b
+Author: Nikolay Borisov <nborisov@suse.com>
+Date: Tue Nov 23 09:23:42 2021 +0200
+
+ btrfs: eliminate if in main loop in tree_search_offset
+
+ Reshuffle the code inside the first loop of tree_search_offset so that
+ one if() is eliminated and the code becomes more linear.
+
+ Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+ Signed-off-by: Nikolay Borisov <nborisov@suse.com>
+ Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit e8bbc534dbd1d0b0d17bf68d7615e644513e652a
+Author: Qu Wenruo <wqu@suse.com>
+Date: Fri Nov 19 14:19:33 2021 +0800
+
+ btrfs: don't check stripe length if the profile is not stripe based
+
+ [BUG]
+ When debugging calc_bio_boundaries(), I found that even for RAID1
+ metadata, we're following stripe length to calculate stripe boundary.
+
+ # mkfs.btrfs -m raid1 -d raid1 /dev/test/scratch[12]
+ # mount /dev/test/scratch /mnt/btrfs
+ # xfs_io -f -c "pwrite 0 64K" /mnt/btrfs/file
+ # umount
+
+ The above very basic operations will make calc_bio_boundaries() report
+ the following result:
+
+ submit_extent_page: r/i=1/1 file_offset=22036480 len_to_stripe_boundary=49152
+ submit_extent_page: r/i=1/1 file_offset=30474240 len_to_stripe_boundary=65536
+ ...
+ submit_extent_page: r/i=1/1 file_offset=30523392 len_to_stripe_boundary=16384
+ submit_extent_page: r/i=1/1 file_offset=30457856 len_to_stripe_boundary=16384
+ submit_extent_page: r/i=5/257 file_offset=0 len_to_stripe_boundary=65536
+ submit_extent_page: r/i=5/257 file_offset=65536 len_to_stripe_boundary=65536
+ submit_extent_page: r/i=1/1 file_offset=30490624 len_to_stripe_boundary=49152
+ submit_extent_page: r/i=1/1 file_offset=30507008 len_to_stripe_boundary=32768
+
+ Where "r/i" is the root id and inode number; 1/1 means it's metadata.
+ The remaining names match the members used in the kernel.
+
+ Even though all data/metadata are using RAID1, we're still following stripe
+ length.
+
+ [CAUSE]
+ This behavior is caused by a wrong condition in btrfs_get_io_geometry():
+
+ if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ /* Fill using stripe_len */
+ len = min_t(u64, em->len - offset, max_len);
+ } else {
+ len = em->len - offset;
+ }
+
+ This means that only for SINGLE will we not follow stripe_len.
+
+ However, profiles like RAID1* and DUP don't need to bother with
+ stripe_len.
+
+ This can lead to unnecessary bio split for RAID1*/DUP profiles, and can
+ even be a blockage for future zoned RAID support.
+
+ [FIX]
+ Introduce one single-use macro, BTRFS_BLOCK_GROUP_STRIPE_MASK, and
+ change the condition to only calculate the length using stripe length
+ for stripe based profiles.
+
+ Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+ Reviewed-by: Anand Jain <anand.jain@oracle.com>
+ Signed-off-by: Qu Wenruo <wqu@suse.com>
+ Signed-off-by: David Sterba <dsterba@suse.com>
+
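+    A sketch of the mask and the adjusted condition, mirroring the snippet
+    quoted in [CAUSE]; the exact composition of the mask is an assumption
+    built from the existing RAID0/RAID10/RAID5/RAID6 block group flags:
+
+      #define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
+                                             BTRFS_BLOCK_GROUP_RAID10 | \
+                                             BTRFS_BLOCK_GROUP_RAID5 | \
+                                             BTRFS_BLOCK_GROUP_RAID6)
+
+      if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) {
+              /* only stripe based profiles are bound by stripe_len */
+              len = min_t(u64, em->len - offset, max_len);
+      } else {
+              len = em->len - offset;
+      }
+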
+commit b8cb209bdc55ae144881b2ae67dd36941813f970
+Author: Nikolay Borisov <nborisov@suse.com>
+Date: Mon Nov 22 17:16:46 2021 +0200
+
+ btrfs: get next entry in tree_search_offset before doing checks
+
+ This is a small optimisation since the current 'entry' is already
+ checked in the if () {} else if {} construct above the loop. In essence
+ the first iteration of the final while loop is redundant. To eliminate
+ this extra check simply get the next entry at the beginning of the loop.
+
+ Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+ Signed-off-by: Nikolay Borisov <nborisov@suse.com>
+ Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 0fe871af1b0056f912e654d5455486c9f76b0c5e
+Author: Josef Bacik <josef@toxicpanda.com>
+Date: Thu Nov 18 16:33:15 2021 -0500
+
+ btrfs: index free space entries on size
+
+ Currently we index free space on offset only, because usually we have a
+ hint from the allocator that we want to honor for locality reasons.
+ However if we fail to use this hint we have to go back to a brute force
+ search through the free space entries to find a large enough extent.
+
+ With sufficiently fragmented free space this becomes quite expensive, as
+ we have to linearly search all of the free space entries to find if we
+ have a part that's long enough.
+
+ To fix this add a cached rb tree to index based on free space entry
+ bytes. This will allow us to quickly look up the largest chunk in the
+ free space tree for this block group, and stop searching once we've
+ found an entry that is too small to satisfy our allocation. We simply
+ choose to use this tree if we're searching from the beginning of the
+ block group, as we know we do not care about locality at that point.
+
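+ Conceptually, a lookup on the new tree looks like this (a sketch only;
+ the field names free_space_bytes and bytes_index are assumed, and the
+ tree is assumed to be ordered from largest to smallest entry):
+
+	/* sketch: 'free_space_bytes' and 'bytes_index' are assumed names */
+	for (node = rb_first_cached(&ctl->free_space_bytes); node;
+	     node = rb_next(node)) {
+		entry = rb_entry(node, struct btrfs_free_space, bytes_index);
+		if (entry->bytes < bytes_wanted)
+			break;	/* every following entry is even smaller */
+		/* otherwise try to allocate from 'entry' */
+	}
+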
+ I wrote an allocator test that creates a 10TiB ram backed null block
+ device and then fallocates random files until the file system is full.
+ I then go through and delete all of the odd files. Then I spawn 8
+ threads that fallocate 64MiB files (1/2 our extent size cap) until the
+ file system is full again. I use bcc's funclatency to measure the
+ latency of find_free_extent. The baseline results are
+
+ nsecs : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 0 | |
+ 8 -> 15 : 0 | |
+ 16 -> 31 : 0 | |
+ 32 -> 63 : 0 | |
+ 64 -> 127 : 0 | |
+ 128 -> 255 : 0 | |
+ 256 -> 511 : 10356 |**** |
+ 512 -> 1023 : 58242 |************************* |
+ 1024 -> 2047 : 74418 |******************************** |
+ 2048 -> 4095 : 90393 |****************************************|
+ 4096 -> 8191 : 79119 |*********************************** |
+ 8192 -> 16383 : 35614 |*************** |
+ 16384 -> 32767 : 13418 |***** |
+ 32768 -> 65535 : 12811 |***** |
+ 65536 -> 131071 : 17090 |******* |
+ 131072 -> 262143 : 26465 |*********** |
+ 262144 -> 524287 : 40179 |***************** |
+ 524288 -> 1048575 : 55469 |************************ |
+ 1048576 -> 2097151 : 48807 |********************* |
+ 2097152 -> 4194303 : 26744 |*********** |
+ 4194304 -> 8388607 : 35351 |*************** |
+ 8388608 -> 16777215 : 13918 |****** |
+ 16777216 -> 33554431 : 21 | |
+
+ avg = 908079 nsecs, total: 580889071441 nsecs, count: 639690
+
+ And the patch results are
+
+ nsecs : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 0 | |
+ 8 -> 15 : 0 | |
+ 16 -> 31 : 0 | |
+ 32 -> 63 : 0 | |
+ 64 -> 127 : 0 | |
+ 128 -> 255 : 0 | |
+ 256 -> 511 : 6883 |** |
+ 512 -> 1023 : 54346 |********************* |
+ 1024 -> 2047 : 79170 |******************************** |
+ 2048 -> 4095 : 98890 |****************************************|
+ 4096 -> 8191 : 81911 |********************************* |
+ 8192 -> 16383 : 27075 |********** |
+ 16384 -> 32767 : 14668 |***** |
+ 32768 -> 65535 : 13251 |***** |
+ 65536 -> 131071 : 15340 |****** |
+ 131072 -> 262143 : 26715 |********** |
+ 262144 -> 524287 : 43274 |***************** |
+ 524288 -> 1048575 : 53870 |********************* |
+ 1048576 -> 2097151 : 55368 |********************** |
+ 2097152 -> 4194303 : 41036 |**************** |
+ 4194304 -> 8388607 : 24927 |********** |
+ 8388608 -> 16777215 : 33 | |
+ 16777216 -> 33554431 : 9 | |
+
+ avg = 623599 nsecs, total: 397259314759 nsecs, count: 637042
+
+ There's a little variation in the amount of calls done because of timing
+ of the threads with metadata requirements, but the avg, total, and
+ counts are relatively consistent between runs (usually within 2-5% of
+ each other). As you can see here we have around a 30% decrease in
+ average latency with a 30% decrease in overall time spent in
+ find_free_extent.
+
+ Reviewed-by: Filipe Manana <fdmanana@suse.com>
+ Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+ Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit db7a62b00a8ff463466ceb7c68728d8bfcc2d65d
+Author: Josef Bacik <josef@toxicpanda.com>
+Date: Thu Nov 18 16:33:14 2021 -0500
+
+ btrfs: only use ->max_extent_size if it is set in the bitmap
+
+ While adding self tests for my space index change I was hitting a
+ problem where the space indexed tree wasn't returning the expected
+ ->max_extent_size. This is because we will skip searching any entry
+ that doesn't have ->bytes >= the amount of bytes we want. However we'll
+ still set the max_extent_size based on that entry. The problem is if we
+ don't search the bitmap we won't have ->max_extent_size set properly, so
+ we can't really trust it.
+
+ This doesn't really result in a problem per se, it can just result in us
+ not finding a contiguous area that may exist.
+ helper to return ->bytes if ->max_extent_size isn't set, and add a big
+ comment explaining why we're doing this.
+
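+ A minimal sketch of the adjusted helper (the name and exact shape are
+ illustrative, not necessarily what the code uses):
+
+	/* illustrative helper - the in-tree name/shape may differ */
+	static inline u64 get_max_extent_size(const struct btrfs_free_space *entry)
+	{
+		/* a bitmap we never searched has no max_extent_size recorded yet */
+		if (entry->bitmap && entry->max_extent_size)
+			return entry->max_extent_size;
+		return entry->bytes;
+	}
+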
+ Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+ Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 1796c46d66a36d108c3d13292dd47020dd4e02e1
+Author: Josef Bacik <josef@toxicpanda.com>
+Date: Thu Oct 21 14:58:37 2021 -0400
+
+ btrfs: rename btrfs_item_end_nr to btrfs_item_data_end
+
+ The name btrfs_item_end_nr() is a bit of a misnomer, as it's actually
+ the offset of the end of the data the item points to. In fact all of
+ the helpers that we use btrfs_item_end_nr() with have "data" in their
+ name, like BTRFS_LEAF_DATA_SIZE() and leaf_data(). Rename it to
+ btrfs_item_data_end() to make it clear what this helper is giving us.
+
+ Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+ Reviewed-by: David Sterba <dsterba@suse.com>
+ Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 89b1779e5b64958f44df12119436d01621a6f87a
+Author: Josef Bacik <josef@toxicpanda.com>
+Date: Thu Oct 21 14:58:36 2021 -0400
+
+ btrfs: remove the btrfs_item_end() helper
+
+ We're only using btrfs_item_end() from btrfs_item_end_nr(), so this can
+ be collapsed.
+
+ Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+ Reviewed-by: David Sterba <dsterba@suse.com>
+ Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 8b3c0a73f1c9ae9506fb95f8581d35432f2bb8e8
+Author: Josef Bacik <josef@toxicpanda.com>
+Date: Thu Oct 21 14:58:35 2021 -0400
+
+ btrfs: drop the _nr from the item helpers
+
+ Now that all call sites are using the slot number to modify item values,
+ rename the SETGET helpers to raw_item_*(), and then rework the _nr()
+ helpers to be the btrfs_item_*() and btrfs_set_item_*() helpers, and then
+ rename all of the callers to the new helpers.
+
+ Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+ Reviewed-by: David Sterba <dsterba@suse.com>
+ Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit cb7d27e5e1205b9ca8f512a48f6772b2eb2b84f9
+Author: Josef Bacik <josef@toxicpanda.com>
+Date: Thu Oct 21 14:58:34 2021 -0400
+
+ btrfs: introduce item_nr token variant helpers
+
+ The last remaining place where we have the pattern of
+
+ item = btrfs_item_nr(slot)
+ <do something with the item>
+
+ are the token helpers. Handle this by introducing token helpers that
+ will do the btrfs_item_nr() work inside of the helper itself, and then
+ convert all users of the btrfs_item token helpers to the new _nr()
+ variants.
+
+ Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+ Reviewed-by: David Sterba <dsterba@suse.com>
+ Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 18ba83dcd99b9619cfc1a246cfb84b1c9b530097
+Author: Josef Bacik <josef@toxicpanda.com>
+Date: Thu Oct 21 14:58:33 2021 -0400
+
+ btrfs: make btrfs_file_extent_inline_item_len take a slot
+
+ Instead of getting the btrfs_item for this, simply pass in the slot of
+ the item and then use the btrfs_item_size_nr() helper inside of
+ btrfs_file_extent_inline_item_len().
+
+ Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+ Reviewed-by: David Sterba <dsterba@suse.com>
+ Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 898ecfdac6314335d8135f741414b0b6867ce1ab
+Author: Josef Bacik <josef@toxicpanda.com>
+Date: Thu Oct 21 14:58:32 2021 -0400
+
+ btrfs: add btrfs_set_item_*_nr() helpers
+
+ We have the pattern of
+
+ item = btrfs_item_nr(slot);
+ btrfs_set_item_*(leaf, item);
+
+ in a bunch of places in our code. Fix this by adding
+ btrfs_set_item_*_nr() helpers which will do the appropriate work, and
+ replace those calls with
+
+ btrfs_set_item_*_nr(leaf, slot);
+
+ Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+ Reviewed-by: David Sterba <dsterba@suse.com>
+ Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 35a9546592ae8d0f3348cd0f776a12dbf3b81aa8
+Author: Josef Bacik <josef@toxicpanda.com>
+Date: Thu Oct 21 14:58:31 2021 -0400
+
+ btrfs: use btrfs_item_size_nr/btrfs_item_offset_nr everywhere
+
+ We have this pattern in a lot of places
+
+ item = btrfs_item_nr(slot);
+ btrfs_item_size(leaf, item);
+
+ when we could simply use
+
+ btrfs_item_size(leaf, slot);
+
+ Fix all callers of btrfs_item_size() and btrfs_item_offset() to use the
+ _nr variation of the helpers.
+
+ Reviewed-by: Qu Wenruo <wqu@suse.com>
+ Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+ Reviewed-by: David Sterba <dsterba@suse.com>
+ Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 8e279d85c2ac593b8caf53f3ac72d0b7047d96f5
+Author: Filipe Manana <fdmanana@suse.com>
+Date: Mon Oct 25 17:31:54 2021 +0100
+
+ btrfs: remove no longer needed logic for replaying directory deletes
+
+ Now that we log only dir index keys when logging a directory, we no longer
+ need to deal with dir item keys in the log replay code for replaying
+ directory deletes. This is also true for the case when we replay a log
+ tree created by a kernel that still logs dir items.
+
+ So remove the remaining code of the replay of directory deletes algorithm
+ that deals with dir item keys.
+
+ Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+ Signed-off-by: Filipe Manana <fdmanana@suse.com>
+ Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit b8ac7a8b1dd9bba3b30bd034d754bb4932c4a970
+Author: Filipe Manana <fdmanana@suse.com>
+Date: Mon Oct 25 17:31:53 2021 +0100
+
+ btrfs: only copy dir index keys when logging a directory
+
+ Currently, when logging a directory, we copy both dir items and dir index
+ items from the fs/subvolume tree to the log tree. Both items have exactly
+ the same data (same struct btrfs_dir_item); the difference lies in the key
+ values: a dir index key contains the index number of a directory entry
+ while the dir item key does not. The dir item key is used for doing fast
+ lookups of an entry by name, while the dir index key is used for sorting
+ entries when listing a directory.
+
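+ For reference, the two key forms for a single directory entry look
+ roughly like this (layout simplified):
+
+	(dir_ino, BTRFS_DIR_ITEM_KEY,  hash(name))    -> struct btrfs_dir_item
+	(dir_ino, BTRFS_DIR_INDEX_KEY, index_number)  -> struct btrfs_dir_item
+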
+ We can exploit that and log only the dir index items, since they contain
+ all the information needed to correctly add, replace and delete directory
+ entries when replaying a log tree. Logging only the dir index items is
+ also backward and forward compatible: an unpatched kernel (without this
+ change) can correctly replay a log tree generated by a patched kernel
+ (with this patch), and a patched kernel can correctly replay a log tree
+ generated by an unpatched kernel.
+
+ The backward compatibility is ensured because:
+
+ 1) For inserting a new dentry: a dentry is only inserted when we find a
+ new dir index key - we can only insert if we know the dir index offset,
+ which is encoded in the dir index key's offset;
+
+ 2) For deleting dentries: during log replay, before adding or replacing
+ dentries, we first replay dentry deletions. Whenever we find a dir item
+ key or a dir index key in the subvolume/fs tree that is not logged in
+ a range for which the log tree is authoritative, we do the unlink of
+ the dentry, which removes both the existing dir item key and the dir
+ index key. Therefore logging just dir index keys is enough to ensure
+ dentry deletions are correctly replayed;
+
+ 3) For dentry replacements: they work when we log only dir index keys
+ and this is mostly due to a combination of 1) and 2). If we replace a
+ dentry with name "foobar" to point from inode A to inode B, then we
+ know the dir index key for the new dentry is different from the old
+ one, as it has an index number (key offset) larger than the old one.
+ This results in replaying a deletion, through replay_dir_deletes(),
+ that causes the old dentry to be removed, both the dir item key and
+ the dir index key, as mentioned at 2). Then when processing the new
+ dir index key, we add the new dentry, adding both a new dir item key
+ and a new index key pointing to inode B, as stated in 1).
+
+ The forward compatibility, the ability for a patched kernel to replay a
+ log created by an older, unpatched kernel, comes from the changes required
+ for making sure we are able to replay a log that only contains dir index
+ keys - we simply ignore every dir item key we find.
+
+ So modify directory logging to log only dir index items, and modify the
+ log replay process to ignore dir item keys (from log trees created by an
+ unpatched kernel) and process only dir index keys. This reduces the
+ amount of logged metadata by about half, and therefore the time spent
+ logging or fsyncing large directories (less CPU time and less IO).
+
+ The following test script was used to measure this change:
+
+ #!/bin/bash
+
+ DEV=/dev/nvme0n1
+ MNT=/mnt/nvme0n1
+
+ NUM_NEW_FILES=1000000
+ NUM_FILE_DELETES=10000
+
+ mkfs.btrfs -f $DEV
+ mount -o ssd $DEV $MNT
+
+ mkdir $MNT/testdir
+
+ for ((i = 1; i <= $NUM_NEW_FILES; i++)); do
+ echo -n > $MNT/testdir/file_$i
+ done
+
+ start=$(date +%s%N)
+ xfs_io -c "fsync" $MNT/testdir
+ end=$(date +%s%N)
+
+ dur=$(( (end - start) / 1000000 ))
+ echo "dir fsync took $dur ms after adding $NUM_NEW_FILES files"
+
+ # sync to force transaction commit and wipeout the log.
+ sync
+
+ del_inc=$(( $NUM_NEW_FILES / $NUM_FILE_DELETES ))
+ for ((i = 1; i <= $NUM_NEW_FILES; i += $del_inc)); do
+ rm -f $MNT/testdir/file_$i
+ done
+
+ start=$(date +%s%N)
+ xfs_io -c "fsync" $MNT/testdir
+ end=$(date +%s%N)
+
+ dur=$(( (end - start) / 1000000 ))
+ echo "dir fsync took $dur ms after deleting $NUM_FILE_DELETES files"
+ echo
+
+ umount $MNT
+
+ The tests were run on a physical machine, with a non-debug kernel (Debian's
+ default kernel config), for different values of $NUM_NEW_FILES and
+ $NUM_FILE_DELETES, and the results were the following:
+
+ ** Before patch, NUM_NEW_FILES = 1 000 000, NUM_DELETE_FILES = 10 000 **
+
+ dir fsync took 8412 ms after adding 1000000 files
+ dir fsync took 500 ms after deleting 10000 files
+
+ ** After patch, NUM_NEW_FILES = 1 000 000, NUM_DELETE_FILES = 10 000 **
+
+ dir fsync took 4252 ms after adding 1000000 files (-49.5%)
+ dir fsync took 269 ms after deleting 10000 files (-46.2%)
+
+ ** Before patch, NUM_NEW_FILES = 100 000, NUM_DELETE_FILES = 1 000 **
+
+ dir fsync took 745 ms after adding 100000 files
+ dir fsync took 59 ms after deleting 1000 files
+
+ ** After patch, NUM_NEW_FILES = 100 000, NUM_DELETE_FILES = 1 000 **
+
+ dir fsync took 404 ms after adding 100000 files (-45.8%)
+ dir fsync took 31 ms after deleting 1000 files (-47.5%)
+
+ ** Before patch, NUM_NEW_FILES = 10 000, NUM_DELETE_FILES = 1 000 **
+
+ dir fsync took 67 ms after adding 10000 files
+ dir fsync took 9 ms after deleting 1000 files
+
+ ** After patch, NUM_NEW_FILES = 10 000, NUM_DELETE_FILES = 1 000 **
+
+ dir fsync took 36 ms after adding 10000 files (-46.3%)
+ dir fsync took 5 ms after deleting 1000 files (-44.4%)
+
+ ** Before patch, NUM_NEW_FILES = 1 000, NUM_DELETE_FILES = 100 **
+
+ dir fsync took 9 ms after adding 1000 files
+ dir fsync took 4 ms after deleting 100 files
+
+ ** After patch, NUM_NEW_FILES = 1 000, NUM_DELETE_FILES = 100 **
+
+ dir fsync took 7 ms after adding 1000 files (-22.2%)
+ dir fsync took 3 ms after deleting 100 files (-25.0%)
+
+ Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+ Signed-off-by: Filipe Manana <fdmanana@suse.com>
+ Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 793440833c94fa896424e2ceef71d376a2ae2454
+Author: Nikolay Borisov <nborisov@suse.com>
+Date: Thu Oct 14 10:03:11 2021 +0300
+
+ btrfs: remove spurious unlock/lock of unused_bgs_lock
+
+ Since both the unused block groups and reclaim bgs lists are protected
+ by unused_bgs_lock, free them in the same critical section without
+ doing an extra unlock/lock pair.
+
+ Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+ Signed-off-by: Nikolay Borisov <nborisov@suse.com>
+ Reviewed-by: David Sterba <dsterba@suse.com>
+ Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 9f4889dbcf40db16d5cfd02dae54143ecfcf036a
+Author: Filipe Manana <fdmanana@suse.com>
+Date: Thu Oct 28 16:03:41 2021 +0100
+
+ btrfs: fix ENOSPC failure when attempting direct IO write into NOCOW range
+
+ When doing a direct IO write against a file range that either has
+ preallocated extents in that range or has regular extents and the file
+ has the NOCOW attribute set, the write fails with -ENOSPC when all of
+ the following conditions are met:
+
+ 1) There are no data block groups with enough free space matching
+ the size of the write;
+
+ 2) There's not enough unallocated space for allocating a new data block
+ group;
+
+ 3) The extents in the target file range are not shared, neither through
+ snapshots nor through reflinks.
+
+ This is wrong because a NOCOW write can be done in such a case, and in fact
+ it's possible to do it using a buffered IO write, since when failing to
+ allocate data space, the buffered IO path checks if a NOCOW write is
+ possible.
+
+ The failure in the direct IO write path comes from the fact that early on,
+ at btrfs_dio_iomap_begin(), we try to allocate data space for the write,
+ and if that fails we return the error and stop - we never check if we
+ can do NOCOW. But later, at btrfs_get_blocks_direct_write(), we check
+ if we can do a NOCOW write into the range, or a subset of the range, and
+ then release the previously reserved data space.
+
+ Fix this by doing the data reservation only if needed, when we must COW,
+ at btrfs_get_blocks_direct_write() instead of doing it at
+ btrfs_dio_iomap_begin(). This also simplifies the logic a bit and removes
+ the inefficiency of doing unnecessary data reservations.
+
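+ Schematically, the reworked write path looks like this (a sketch only;
+ argument lists are elided and the exact call sites may differ):
+
+	/* sketch: in btrfs_get_blocks_direct_write(), no upfront reservation */
+	if (can_nocow_extent(...) > 0) {
+		/* NOCOW: write into the existing extent, no data space needed */
+	} else {
+		/* COW: reserve data space only now that we know we need it */
+		ret = btrfs_delalloc_reserve_space(...);
+		if (ret)
+			goto out;
+	}
+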
+ The following example test script reproduces the problem:
+
+ $ cat dio-nocow-enospc.sh
+ #!/bin/bash
+
+ DEV=/dev/sdj
+ MNT=/mnt/sdj
+
+ # Use a small fixed size (1G) filesystem so that it's quick to fill
+ # it up.
+ # Make sure the mixed block groups feature is not enabled because we
+ # later want to not have more space available for allocating data
+ # extents but still have enough metadata space free for the file writes.
+ mkfs.btrfs -f -b $((1024 * 1024 * 1024)) -O ^mixed-bg $DEV
+ mount $DEV $MNT
+
+ # Create our test file with the NOCOW attribute set.
+ touch $MNT/foobar
+ chattr +C $MNT/foobar
+
+ # Now fill in all unallocated space with data for our test file.
+ # This will allocate a data block group that will be full and leave
+ # no (or a very small amount of) unallocated space in the device, so
+ # that it will not be possible to allocate a new block group later.
+ echo
+ echo "Creating test file with initial data..."
+ xfs_io -c "pwrite -S 0xab -b 1M 0 900M" $MNT/foobar
+
+ # Now try a direct IO write against file range [0, 10M[.
+ # This should succeed since this is a NOCOW file and an extent for the
+ # range was previously allocated.
+ echo
+ echo "Trying direct IO write over allocated space..."
+ xfs_io -d -c "pwrite -S 0xcd -b 10M 0 10M" $MNT/foobar
+
+ umount $MNT
+
+ When running the test:
+
+ $ ./dio-nocow-enospc.sh
+ (...)
+
+ Creating test file with initial data...
+ wrote 943718400/943718400 bytes at offset 0
+ 900 MiB, 900 ops; 0:00:01.43 (625.526 MiB/sec and 625.5265 ops/sec)
+
+ Trying direct IO write over allocated space...
+ pwrite: No space left on device
+
+ A test case for fstests will follow, testing both this direct IO write
+ scenario and the buffered IO write scenario, to make it less likely
+ to get future regressions on the buffered IO case.
+
+ Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+ Signed-off-by: Filipe Manana <fdmanana@suse.com>
+ Signed-off-by: David Sterba <dsterba@suse.com>
+---
+ fs/btrfs/backref.c | 16 +-
+ fs/btrfs/block-group.c | 2 -
+ fs/btrfs/btrfs_inode.h | 18 +-
+ fs/btrfs/compression.h | 2 +
+ fs/btrfs/ctree.c | 208 ++++-----
+ fs/btrfs/ctree.h | 96 ++--
+ fs/btrfs/dev-replace.c | 4 +-
+ fs/btrfs/dir-item.c | 12 +-
+ fs/btrfs/disk-io.c | 3 +-
+ fs/btrfs/extent-tree.c | 14 +-
+ fs/btrfs/file-item.c | 25 +-
+ fs/btrfs/file.c | 26 +-
+ fs/btrfs/free-space-cache.c | 177 ++++++--
+ fs/btrfs/free-space-cache.h | 2 +
+ fs/btrfs/inode-item.c | 14 +-
+ fs/btrfs/inode.c | 172 +++----
+ fs/btrfs/ioctl.c | 258 +++++++----
+ fs/btrfs/lzo.c | 20 +-
+ fs/btrfs/print-tree.c | 8 +-
+ fs/btrfs/props.c | 2 +-
+ fs/btrfs/qgroup.c | 41 +-
+ fs/btrfs/ref-verify.c | 2 +-
+ fs/btrfs/reflink.c | 2 +-
+ fs/btrfs/relocation.c | 2 +-
+ fs/btrfs/root-tree.c | 4 +-
+ fs/btrfs/scrub.c | 2 +-
+ fs/btrfs/send.c | 22 +-
+ fs/btrfs/sysfs.c | 10 +-
+ fs/btrfs/tests/extent-buffer-tests.c | 17 +-
+ fs/btrfs/transaction.c | 76 +---
+ fs/btrfs/transaction.h | 2 +-
+ fs/btrfs/tree-checker.c | 56 +--
+ fs/btrfs/tree-log.c | 656 ++++++++++++---------------
+ fs/btrfs/uuid-tree.c | 10 +-
+ fs/btrfs/verity.c | 2 +-
+ fs/btrfs/volumes.c | 13 +-
+ fs/btrfs/xattr.c | 8 +-
+ include/uapi/linux/btrfs.h | 6 +-
+ include/uapi/linux/btrfs_tree.h | 4 +-
+ 39 files changed, 1062 insertions(+), 952 deletions(-)
+
+diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
+index 8b090c40daf7..3b0c4bed242e 100644
+--- a/fs/btrfs/backref.c
++++ b/fs/btrfs/backref.c
+@@ -950,7 +950,7 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+- item_size = btrfs_item_size_nr(leaf, slot);
++ item_size = btrfs_item_size(leaf, slot);
+ BUG_ON(item_size < sizeof(*ei));
+
+ ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+@@ -1792,7 +1792,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
+ }
+
+ eb = path->nodes[0];
+- item_size = btrfs_item_size_nr(eb, path->slots[0]);
++ item_size = btrfs_item_size(eb, path->slots[0]);
+ BUG_ON(item_size < sizeof(*ei));
+
+ ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
+@@ -2071,7 +2071,6 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
+ u64 parent = 0;
+ int found = 0;
+ struct extent_buffer *eb;
+- struct btrfs_item *item;
+ struct btrfs_inode_ref *iref;
+ struct btrfs_key found_key;
+
+@@ -2097,10 +2096,9 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
+ }
+ btrfs_release_path(path);
+
+- item = btrfs_item_nr(slot);
+ iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
+
+- for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
++ for (cur = 0; cur < btrfs_item_size(eb, slot); cur += len) {
+ name_len = btrfs_inode_ref_name_len(eb, iref);
+ /* path must be released before calling iterate()! */
+ btrfs_debug(fs_root->fs_info,
+@@ -2156,7 +2154,7 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
+ }
+ btrfs_release_path(path);
+
+- item_size = btrfs_item_size_nr(eb, slot);
++ item_size = btrfs_item_size(eb, slot);
+ ptr = btrfs_item_ptr_offset(eb, slot);
+ cur_offset = 0;
+
+@@ -2377,7 +2375,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
+ iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0],
+ path->slots[0]);
+ iter->end_ptr = (u32)(iter->item_ptr +
+- btrfs_item_size_nr(path->nodes[0], path->slots[0]));
++ btrfs_item_size(path->nodes[0], path->slots[0]));
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_extent_item);
+
+@@ -2417,7 +2415,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
+ iter->cur_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0],
+ path->slots[0]);
+ iter->item_ptr = iter->cur_ptr;
+- iter->end_ptr = (u32)(iter->item_ptr + btrfs_item_size_nr(
++ iter->end_ptr = (u32)(iter->item_ptr + btrfs_item_size(
+ path->nodes[0], path->slots[0]));
+ }
+
+@@ -2482,7 +2480,7 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter)
+ iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0],
+ path->slots[0]);
+ iter->cur_ptr = iter->item_ptr;
+- iter->end_ptr = iter->item_ptr + (u32)btrfs_item_size_nr(path->nodes[0],
++ iter->end_ptr = iter->item_ptr + (u32)btrfs_item_size(path->nodes[0],
+ path->slots[0]);
+ return 0;
+ }
+diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
+index b67c965725ea..27da1dfbd626 100644
+--- a/fs/btrfs/block-group.c
++++ b/fs/btrfs/block-group.c
+@@ -3924,9 +3924,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
+ list_del_init(&block_group->bg_list);
+ btrfs_put_block_group(block_group);
+ }
+- spin_unlock(&info->unused_bgs_lock);
+
+- spin_lock(&info->unused_bgs_lock);
+ while (!list_empty(&info->reclaim_bgs)) {
+ block_group = list_first_entry(&info->reclaim_bgs,
+ struct btrfs_block_group,
+diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
+index ab2a4a52e0bb..b3e46aabc3d8 100644
+--- a/fs/btrfs/btrfs_inode.h
++++ b/fs/btrfs/btrfs_inode.h
+@@ -138,19 +138,11 @@ struct btrfs_inode {
+ /* a local copy of root's last_log_commit */
+ int last_log_commit;
+
+- union {
+- /*
+- * Total number of bytes pending delalloc, used by stat to
+- * calculate the real block usage of the file. This is used
+- * only for files.
+- */
+- u64 delalloc_bytes;
+- /*
+- * The offset of the last dir item key that was logged.
+- * This is used only for directories.
+- */
+- u64 last_dir_item_offset;
+- };
++ /*
++ * Total number of bytes pending delalloc, used by stat to calculate the
++ * real block usage of the file. This is used only for files.
++ */
++ u64 delalloc_bytes;
+
+ union {
+ /*
+diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
+index 56eef0821e3e..7dbd14caab01 100644
+--- a/fs/btrfs/compression.h
++++ b/fs/btrfs/compression.h
+@@ -22,6 +22,8 @@ struct btrfs_inode;
+
+ /* Maximum length of compressed data stored on disk */
+ #define BTRFS_MAX_COMPRESSED (SZ_128K)
++static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0);
++
+ /* Maximum size of data before compression */
+ #define BTRFS_MAX_UNCOMPRESSED (SZ_128K)
+
+diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
+index 35660791e084..5ca7a535d109 100644
+--- a/fs/btrfs/ctree.c
++++ b/fs/btrfs/ctree.c
+@@ -2627,19 +2627,14 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
+ */
+ static int leaf_space_used(struct extent_buffer *l, int start, int nr)
+ {
+- struct btrfs_item *start_item;
+- struct btrfs_item *end_item;
+ int data_len;
+ int nritems = btrfs_header_nritems(l);
+ int end = min(nritems, start + nr) - 1;
+
+ if (!nr)
+ return 0;
+- start_item = btrfs_item_nr(start);
+- end_item = btrfs_item_nr(end);
+- data_len = btrfs_item_offset(l, start_item) +
+- btrfs_item_size(l, start_item);
+- data_len = data_len - btrfs_item_offset(l, end_item);
++ data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start);
++ data_len = data_len - btrfs_item_offset(l, end);
+ data_len += sizeof(struct btrfs_item) * nr;
+ WARN_ON(data_len < 0);
+ return data_len;
+@@ -2686,7 +2681,6 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
+ u32 i;
+ int push_space = 0;
+ int push_items = 0;
+- struct btrfs_item *item;
+ u32 nr;
+ u32 right_nritems;
+ u32 data_end;
+@@ -2703,8 +2697,6 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
+ slot = path->slots[1];
+ i = left_nritems - 1;
+ while (i >= nr) {
+- item = btrfs_item_nr(i);
+-
+ if (!empty && push_items > 0) {
+ if (path->slots[0] > i)
+ break;
+@@ -2719,12 +2711,13 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
+ if (path->slots[0] == i)
+ push_space += data_size;
+
+- this_item_size = btrfs_item_size(left, item);
+- if (this_item_size + sizeof(*item) + push_space > free_space)
++ this_item_size = btrfs_item_size(left, i);
++ if (this_item_size + sizeof(struct btrfs_item) +
++ push_space > free_space)
+ break;
+
+ push_items++;
+- push_space += this_item_size + sizeof(*item);
++ push_space += this_item_size + sizeof(struct btrfs_item);
+ if (i == 0)
+ break;
+ i--;
+@@ -2738,7 +2731,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
+ /* push left to right */
+ right_nritems = btrfs_header_nritems(right);
+
+- push_space = btrfs_item_end_nr(left, left_nritems - push_items);
++ push_space = btrfs_item_data_end(left, left_nritems - push_items);
+ push_space -= leaf_data_end(left);
+
+ /* make room in the right data area */
+@@ -2769,9 +2762,8 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
+ btrfs_set_header_nritems(right, right_nritems);
+ push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
+ for (i = 0; i < right_nritems; i++) {
+- item = btrfs_item_nr(i);
+- push_space -= btrfs_token_item_size(&token, item);
+- btrfs_set_token_item_offset(&token, item, push_space);
++ push_space -= btrfs_token_item_size(&token, i);
++ btrfs_set_token_item_offset(&token, i, push_space);
+ }
+
+ left_nritems -= push_items;
+@@ -2856,16 +2848,11 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
+ if (free_space < data_size)
+ goto out_unlock;
+
+- /* cow and double check */
+ ret = btrfs_cow_block(trans, root, right, upper,
+ slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
+ if (ret)
+ goto out_unlock;
+
+- free_space = btrfs_leaf_free_space(right);
+- if (free_space < data_size)
+- goto out_unlock;
+-
+ left_nritems = btrfs_header_nritems(left);
+ if (left_nritems == 0)
+ goto out_unlock;
+@@ -2916,7 +2903,6 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
+ int i;
+ int push_space = 0;
+ int push_items = 0;
+- struct btrfs_item *item;
+ u32 old_left_nritems;
+ u32 nr;
+ int ret = 0;
+@@ -2930,8 +2916,6 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
+ nr = min(right_nritems - 1, max_slot);
+
+ for (i = 0; i < nr; i++) {
+- item = btrfs_item_nr(i);
+-
+ if (!empty && push_items > 0) {
+ if (path->slots[0] < i)
+ break;
+@@ -2946,12 +2930,13 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
+ if (path->slots[0] == i)
+ push_space += data_size;
+
+- this_item_size = btrfs_item_size(right, item);
+- if (this_item_size + sizeof(*item) + push_space > free_space)
++ this_item_size = btrfs_item_size(right, i);
++ if (this_item_size + sizeof(struct btrfs_item) + push_space >
++ free_space)
+ break;
+
+ push_items++;
+- push_space += this_item_size + sizeof(*item);
++ push_space += this_item_size + sizeof(struct btrfs_item);
+ }
+
+ if (push_items == 0) {
+@@ -2967,25 +2952,23 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
+ push_items * sizeof(struct btrfs_item));
+
+ push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
+- btrfs_item_offset_nr(right, push_items - 1);
++ btrfs_item_offset(right, push_items - 1);
+
+ copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
+ leaf_data_end(left) - push_space,
+ BTRFS_LEAF_DATA_OFFSET +
+- btrfs_item_offset_nr(right, push_items - 1),
++ btrfs_item_offset(right, push_items - 1),
+ push_space);
+ old_left_nritems = btrfs_header_nritems(left);
+ BUG_ON(old_left_nritems <= 0);
+
+ btrfs_init_map_token(&token, left);
+- old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
++ old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1);
+ for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
+ u32 ioff;
+
+- item = btrfs_item_nr(i);
+-
+- ioff = btrfs_token_item_offset(&token, item);
+- btrfs_set_token_item_offset(&token, item,
++ ioff = btrfs_token_item_offset(&token, i);
++ btrfs_set_token_item_offset(&token, i,
+ ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
+ }
+ btrfs_set_header_nritems(left, old_left_nritems + push_items);
+@@ -2996,7 +2979,7 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
+ right_nritems);
+
+ if (push_items < right_nritems) {
+- push_space = btrfs_item_offset_nr(right, push_items - 1) -
++ push_space = btrfs_item_offset(right, push_items - 1) -
+ leaf_data_end(right);
+ memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
+ BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
+@@ -3014,10 +2997,8 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
+ btrfs_set_header_nritems(right, right_nritems);
+ push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
+ for (i = 0; i < right_nritems; i++) {
+- item = btrfs_item_nr(i);
+-
+- push_space = push_space - btrfs_token_item_size(&token, item);
+- btrfs_set_token_item_offset(&token, item, push_space);
++ push_space = push_space - btrfs_token_item_size(&token, i);
++ btrfs_set_token_item_offset(&token, i, push_space);
+ }
+
+ btrfs_mark_buffer_dirty(left);
+@@ -3096,7 +3077,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
+ goto out;
+ }
+
+- /* cow and double check */
+ ret = btrfs_cow_block(trans, root, left,
+ path->nodes[1], slot - 1, &left,
+ BTRFS_NESTING_LEFT_COW);
+@@ -3107,12 +3087,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
+ goto out;
+ }
+
+- free_space = btrfs_leaf_free_space(left);
+- if (free_space < data_size) {
+- ret = 1;
+- goto out;
+- }
+-
+ if (check_sibling_keys(left, right)) {
+ ret = -EUCLEAN;
+ goto out;
+@@ -3145,7 +3119,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
+
+ nritems = nritems - mid;
+ btrfs_set_header_nritems(right, nritems);
+- data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l);
++ data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l);
+
+ copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
+ btrfs_item_nr_offset(mid),
+@@ -3156,15 +3130,14 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
+ data_copy_size, BTRFS_LEAF_DATA_OFFSET +
+ leaf_data_end(l), data_copy_size);
+
+- rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
++ rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);
+
+ btrfs_init_map_token(&token, right);
+ for (i = 0; i < nritems; i++) {
+- struct btrfs_item *item = btrfs_item_nr(i);
+ u32 ioff;
+
+- ioff = btrfs_token_item_offset(&token, item);
+- btrfs_set_token_item_offset(&token, item, ioff + rt_data_off);
++ ioff = btrfs_token_item_offset(&token, i);
++ btrfs_set_token_item_offset(&token, i, ioff + rt_data_off);
+ }
+
+ btrfs_set_header_nritems(l, mid);
+@@ -3280,7 +3253,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+- if (extend && data_size + btrfs_item_size_nr(l, slot) +
++ if (extend && data_size + btrfs_item_size(l, slot) +
+ sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
+ return -EOVERFLOW;
+
+@@ -3449,7 +3422,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
+ if (btrfs_leaf_free_space(leaf) >= ins_len)
+ return 0;
+
+- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
++ item_size = btrfs_item_size(leaf, path->slots[0]);
+ if (key.type == BTRFS_EXTENT_DATA_KEY) {
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+@@ -3469,7 +3442,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
+ ret = -EAGAIN;
+ leaf = path->nodes[0];
+ /* if our item isn't there, return now */
+- if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
++ if (item_size != btrfs_item_size(leaf, path->slots[0]))
+ goto err;
+
+ /* the leaf has changed, it now has room. return now */
+@@ -3500,9 +3473,7 @@ static noinline int split_item(struct btrfs_path *path,
+ unsigned long split_offset)
+ {
+ struct extent_buffer *leaf;
+- struct btrfs_item *item;
+- struct btrfs_item *new_item;
+- int slot;
++ int orig_slot, slot;
+ char *buf;
+ u32 nritems;
+ u32 item_size;
+@@ -3512,9 +3483,9 @@ static noinline int split_item(struct btrfs_path *path,
+ leaf = path->nodes[0];
+ BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
+
+- item = btrfs_item_nr(path->slots[0]);
+- orig_offset = btrfs_item_offset(leaf, item);
+- item_size = btrfs_item_size(leaf, item);
++ orig_slot = path->slots[0];
++ orig_offset = btrfs_item_offset(leaf, path->slots[0]);
++ item_size = btrfs_item_size(leaf, path->slots[0]);
+
+ buf = kmalloc(item_size, GFP_NOFS);
+ if (!buf)
+@@ -3535,14 +3506,12 @@ static noinline int split_item(struct btrfs_path *path,
+ btrfs_cpu_key_to_disk(&disk_key, new_key);
+ btrfs_set_item_key(leaf, &disk_key, slot);
+
+- new_item = btrfs_item_nr(slot);
++ btrfs_set_item_offset(leaf, slot, orig_offset);
++ btrfs_set_item_size(leaf, slot, item_size - split_offset);
+
+- btrfs_set_item_offset(leaf, new_item, orig_offset);
+- btrfs_set_item_size(leaf, new_item, item_size - split_offset);
+-
+- btrfs_set_item_offset(leaf, item,
+- orig_offset + item_size - split_offset);
+- btrfs_set_item_size(leaf, item, split_offset);
++ btrfs_set_item_offset(leaf, orig_slot,
++ orig_offset + item_size - split_offset);
++ btrfs_set_item_size(leaf, orig_slot, split_offset);
+
+ btrfs_set_header_nritems(leaf, nritems + 1);
+
+@@ -3603,7 +3572,6 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
+ {
+ int slot;
+ struct extent_buffer *leaf;
+- struct btrfs_item *item;
+ u32 nritems;
+ unsigned int data_end;
+ unsigned int old_data_start;
+@@ -3615,14 +3583,14 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+- old_size = btrfs_item_size_nr(leaf, slot);
++ old_size = btrfs_item_size(leaf, slot);
+ if (old_size == new_size)
+ return;
+
+ nritems = btrfs_header_nritems(leaf);
+ data_end = leaf_data_end(leaf);
+
+- old_data_start = btrfs_item_offset_nr(leaf, slot);
++ old_data_start = btrfs_item_offset(leaf, slot);
+
+ size_diff = old_size - new_size;
+
+@@ -3636,10 +3604,9 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
+ btrfs_init_map_token(&token, leaf);
+ for (i = slot; i < nritems; i++) {
+ u32 ioff;
+- item = btrfs_item_nr(i);
+
+- ioff = btrfs_token_item_offset(&token, item);
+- btrfs_set_token_item_offset(&token, item, ioff + size_diff);
++ ioff = btrfs_token_item_offset(&token, i);
++ btrfs_set_token_item_offset(&token, i, ioff + size_diff);
+ }
+
+ /* shift the data */
+@@ -3682,8 +3649,7 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
+ fixup_low_keys(path, &disk_key, 1);
+ }
+
+- item = btrfs_item_nr(slot);
+- btrfs_set_item_size(leaf, item, new_size);
++ btrfs_set_item_size(leaf, slot, new_size);
+ btrfs_mark_buffer_dirty(leaf);
+
+ if (btrfs_leaf_free_space(leaf) < 0) {
+@@ -3699,7 +3665,6 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
+ {
+ int slot;
+ struct extent_buffer *leaf;
+- struct btrfs_item *item;
+ u32 nritems;
+ unsigned int data_end;
+ unsigned int old_data;
+@@ -3717,7 +3682,7 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
+ BUG();
+ }
+ slot = path->slots[0];
+- old_data = btrfs_item_end_nr(leaf, slot);
++ old_data = btrfs_item_data_end(leaf, slot);
+
+ BUG_ON(slot < 0);
+ if (slot >= nritems) {
+@@ -3734,10 +3699,9 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
+ btrfs_init_map_token(&token, leaf);
+ for (i = slot; i < nritems; i++) {
+ u32 ioff;
+- item = btrfs_item_nr(i);
+
+- ioff = btrfs_token_item_offset(&token, item);
+- btrfs_set_token_item_offset(&token, item, ioff - data_size);
++ ioff = btrfs_token_item_offset(&token, i);
++ btrfs_set_token_item_offset(&token, i, ioff - data_size);
+ }
+
+ /* shift the data */
+@@ -3746,9 +3710,8 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
+ data_end, old_data - data_end);
+
+ data_end = old_data;
+- old_size = btrfs_item_size_nr(leaf, slot);
+- item = btrfs_item_nr(slot);
+- btrfs_set_item_size(leaf, item, old_size + data_size);
++ old_size = btrfs_item_size(leaf, slot);
++ btrfs_set_item_size(leaf, slot, old_size + data_size);
+ btrfs_mark_buffer_dirty(leaf);
+
+ if (btrfs_leaf_free_space(leaf) < 0) {
+@@ -3770,7 +3733,6 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p
+ const struct btrfs_item_batch *batch)
+ {
+ struct btrfs_fs_info *fs_info = root->fs_info;
+- struct btrfs_item *item;
+ int i;
+ u32 nritems;
+ unsigned int data_end;
+@@ -3807,7 +3769,7 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p
+
+ btrfs_init_map_token(&token, leaf);
+ if (slot != nritems) {
+- unsigned int old_data = btrfs_item_end_nr(leaf, slot);
++ unsigned int old_data = btrfs_item_data_end(leaf, slot);
+
+ if (old_data < data_end) {
+ btrfs_print_leaf(leaf);
+@@ -3823,10 +3785,9 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p
+ for (i = slot; i < nritems; i++) {
+ u32 ioff;
+
+- item = btrfs_item_nr(i);
+- ioff = btrfs_token_item_offset(&token, item);
+- btrfs_set_token_item_offset(&token, item,
+- ioff - batch->total_data_size);
++ ioff = btrfs_token_item_offset(&token, i);
++ btrfs_set_token_item_offset(&token, i,
++ ioff - batch->total_data_size);
+ }
+ /* shift the items */
+ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr),
+@@ -3845,10 +3806,9 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p
+ for (i = 0; i < batch->nr; i++) {
+ btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
+ btrfs_set_item_key(leaf, &disk_key, slot + i);
+- item = btrfs_item_nr(slot + i);
+ data_end -= batch->data_sizes[i];
+- btrfs_set_token_item_offset(&token, item, data_end);
+- btrfs_set_token_item_size(&token, item, batch->data_sizes[i]);
++ btrfs_set_token_item_offset(&token, slot + i, data_end);
++ btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]);
+ }
+
+ btrfs_set_header_nritems(leaf, nritems + batch->nr);
+@@ -3955,7 +3915,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
+ u32 item_size;
+
+ leaf = path->nodes[0];
+- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
++ item_size = btrfs_item_size(leaf, path->slots[0]);
+ ret = setup_leaf_for_split(trans, root, path,
+ item_size + sizeof(struct btrfs_item));
+ if (ret)
+@@ -4056,25 +4016,22 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ {
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_buffer *leaf;
+- struct btrfs_item *item;
+- u32 last_off;
+- u32 dsize = 0;
+ int ret = 0;
+ int wret;
+- int i;
+ u32 nritems;
+
+ leaf = path->nodes[0];
+- last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
+-
+- for (i = 0; i < nr; i++)
+- dsize += btrfs_item_size_nr(leaf, slot + i);
+-
+ nritems = btrfs_header_nritems(leaf);
+
+ if (slot + nr != nritems) {
+- int data_end = leaf_data_end(leaf);
++ const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1);
++ const int data_end = leaf_data_end(leaf);
+ struct btrfs_map_token token;
++ u32 dsize = 0;
++ int i;
++
++ for (i = 0; i < nr; i++)
++ dsize += btrfs_item_size(leaf, slot + i);
+
+ memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
+ data_end + dsize,
+@@ -4085,9 +4042,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ for (i = slot + nr; i < nritems; i++) {
+ u32 ioff;
+
+- item = btrfs_item_nr(i);
+- ioff = btrfs_token_item_offset(&token, item);
+- btrfs_set_token_item_offset(&token, item, ioff + dsize);
++ ioff = btrfs_token_item_offset(&token, i);
++ btrfs_set_token_item_offset(&token, i, ioff + dsize);
+ }
+
+ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
+@@ -4115,24 +4071,50 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ fixup_low_keys(path, &disk_key, 1);
+ }
+
+- /* delete the leaf if it is mostly empty */
++ /*
++ * Try to delete the leaf if it is mostly empty. We do this by
++ * trying to move all its items into its left and right neighbours.
++ * If we can't move all the items, then we don't delete it - it's
++ * not ideal, but future insertions might fill the leaf with more
++ * items, or items from other leaves might be moved later into our
++ * leaf due to deletions on those leaves.
++ */
+ if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
++ u32 min_push_space;
++
+ /* push_leaf_left fixes the path.
+ * make sure the path still points to our leaf
+ * for possible call to del_ptr below
+ */
+ slot = path->slots[1];
+ atomic_inc(&leaf->refs);
+-
+- wret = push_leaf_left(trans, root, path, 1, 1,
+- 1, (u32)-1);
++ /*
++ * We want to be able to at least push one item to the
++ * left neighbour leaf, and that's the first item.
++ */
++ min_push_space = sizeof(struct btrfs_item) +
++ btrfs_item_size(leaf, 0);
++ wret = push_leaf_left(trans, root, path, 0,
++ min_push_space, 1, (u32)-1);
+ if (wret < 0 && wret != -ENOSPC)
+ ret = wret;
+
+ if (path->nodes[0] == leaf &&
+ btrfs_header_nritems(leaf)) {
+- wret = push_leaf_right(trans, root, path, 1,
+- 1, 1, 0);
++ /*
++ * If we were not able to push all items from our
++ * leaf to its left neighbour, then attempt to
++ * either push all the remaining items to the
++ * right neighbour or none. There's no advantage
++ * in pushing only some items, instead of all, as
++ * it's pointless to end up with a leaf having
++ * too few items while the neighbours can be full
++ * or nearly full.
++ */
++ nritems = btrfs_header_nritems(leaf);
++ min_push_space = leaf_space_used(leaf, 0, nritems);
++ wret = push_leaf_right(trans, root, path, 0,
++ min_push_space, 1, 0);
+ if (wret < 0 && wret != -ENOSPC)
+ ret = wret;
+ }
+diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
+index 5fe5eccb3c87..223e9d9e1b8b 100644
+--- a/fs/btrfs/ctree.h
++++ b/fs/btrfs/ctree.h
+@@ -601,6 +601,9 @@ enum {
+ /* Indicate whether there are any tree modification log users */
+ BTRFS_FS_TREE_MOD_LOG_USERS,
+
++ /* Indicate that we want the transaction kthread to commit right now. */
++ BTRFS_FS_COMMIT_TRANS,
++
+ #if BITS_PER_LONG == 32
+ /* Indicate if we have error/warn message printed on 32bit systems */
+ BTRFS_FS_32BIT_ERROR,
+@@ -1603,25 +1606,25 @@ DECLARE_BTRFS_SETGET_BITS(64)
+ static inline u##bits btrfs_##name(const struct extent_buffer *eb, \
+ const type *s) \
+ { \
+- BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
++ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
+ return btrfs_get_##bits(eb, s, offsetof(type, member)); \
+ } \
+ static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \
+ u##bits val) \
+ { \
+- BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
++ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
+ btrfs_set_##bits(eb, s, offsetof(type, member), val); \
+ } \
+ static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \
+ const type *s) \
+ { \
+- BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
++ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
+ return btrfs_get_token_##bits(token, s, offsetof(type, member));\
+ } \
+ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\
+ type *s, u##bits val) \
+ { \
+- BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
++ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
+ btrfs_set_token_##bits(token, s, offsetof(type, member), val); \
+ }
+
+@@ -1652,8 +1655,8 @@ static inline void btrfs_set_##name(type *s, u##bits val) \
+ static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb,
+ struct btrfs_dev_item *s)
+ {
+- BUILD_BUG_ON(sizeof(u64) !=
+- sizeof(((struct btrfs_dev_item *)0))->total_bytes);
++ static_assert(sizeof(u64) ==
++ sizeof(((struct btrfs_dev_item *)0))->total_bytes);
+ return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item,
+ total_bytes));
+ }
+@@ -1661,8 +1664,8 @@ static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb,
+ struct btrfs_dev_item *s,
+ u64 val)
+ {
+- BUILD_BUG_ON(sizeof(u64) !=
+- sizeof(((struct btrfs_dev_item *)0))->total_bytes);
++ static_assert(sizeof(u64) ==
++ sizeof(((struct btrfs_dev_item *)0))->total_bytes);
+ WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize));
+ btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val);
+ }
+@@ -1960,8 +1963,8 @@ static inline void btrfs_set_node_key(const struct extent_buffer *eb,
+ }
+
+ /* struct btrfs_item */
+-BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32);
+-BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32);
++BTRFS_SETGET_FUNCS(raw_item_offset, struct btrfs_item, offset, 32);
++BTRFS_SETGET_FUNCS(raw_item_size, struct btrfs_item, size, 32);
+ BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32);
+ BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32);
+
+@@ -1976,25 +1979,36 @@ static inline struct btrfs_item *btrfs_item_nr(int nr)
+ return (struct btrfs_item *)btrfs_item_nr_offset(nr);
+ }
+
+-static inline u32 btrfs_item_end(const struct extent_buffer *eb,
+- struct btrfs_item *item)
+-{
+- return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item);
++#define BTRFS_ITEM_SETGET_FUNCS(member) \
++static inline u32 btrfs_item_##member(const struct extent_buffer *eb, \
++ int slot) \
++{ \
++ return btrfs_raw_item_##member(eb, btrfs_item_nr(slot)); \
++} \
++static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \
++ int slot, u32 val) \
++{ \
++ btrfs_set_raw_item_##member(eb, btrfs_item_nr(slot), val); \
++} \
++static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \
++ int slot) \
++{ \
++ struct btrfs_item *item = btrfs_item_nr(slot); \
++ return btrfs_token_raw_item_##member(token, item); \
++} \
++static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \
++ int slot, u32 val) \
++{ \
++ struct btrfs_item *item = btrfs_item_nr(slot); \
++ btrfs_set_token_raw_item_##member(token, item, val); \
+ }
+
+-static inline u32 btrfs_item_end_nr(const struct extent_buffer *eb, int nr)
+-{
+- return btrfs_item_end(eb, btrfs_item_nr(nr));
+-}
++BTRFS_ITEM_SETGET_FUNCS(offset)
++BTRFS_ITEM_SETGET_FUNCS(size);
+
+-static inline u32 btrfs_item_offset_nr(const struct extent_buffer *eb, int nr)
++static inline u32 btrfs_item_data_end(const struct extent_buffer *eb, int nr)
+ {
+- return btrfs_item_offset(eb, btrfs_item_nr(nr));
+-}
+-
+-static inline u32 btrfs_item_size_nr(const struct extent_buffer *eb, int nr)
+-{
+- return btrfs_item_size(eb, btrfs_item_nr(nr));
++ return btrfs_item_offset(eb, nr) + btrfs_item_size(eb, nr);
+ }
+
+ static inline void btrfs_item_key(const struct extent_buffer *eb,
+@@ -2463,7 +2477,7 @@ static inline unsigned int leaf_data_end(const struct extent_buffer *leaf)
+
+ if (nr == 0)
+ return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
+- return btrfs_item_offset_nr(leaf, nr - 1);
++ return btrfs_item_offset(leaf, nr - 1);
+ }
+
+ /* struct btrfs_file_extent_item */
+@@ -2522,9 +2536,9 @@ BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
+ */
+ static inline u32 btrfs_file_extent_inline_item_len(
+ const struct extent_buffer *eb,
+- struct btrfs_item *e)
++ int nr)
+ {
+- return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
++ return btrfs_item_size(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
+ }
+
+ /* btrfs_qgroup_status_item */
+@@ -2616,11 +2630,11 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
+ /* helper function to cast into the data area of the leaf. */
+ #define btrfs_item_ptr(leaf, slot, type) \
+ ((type *)(BTRFS_LEAF_DATA_OFFSET + \
+- btrfs_item_offset_nr(leaf, slot)))
++ btrfs_item_offset(leaf, slot)))
+
+ #define btrfs_item_ptr_offset(leaf, slot) \
+ ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \
+- btrfs_item_offset_nr(leaf, slot)))
++ btrfs_item_offset(leaf, slot)))
+
+ static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length)
+ {
+@@ -3297,9 +3311,27 @@ int btrfs_fileattr_set(struct user_namespace *mnt_userns,
+ int btrfs_ioctl_get_supported_features(void __user *arg);
+ void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
+ int __pure btrfs_is_empty_uuid(u8 *uuid);
++
++struct btrfs_defrag_ctrl {
++ /* Input, read-only fields */
++ u64 start;
++ u64 len;
++ u32 extent_thresh;
++ u64 newer_than;
++ u64 max_sectors_to_defrag;
++ u8 compress;
++ u8 flags;
++
++ /* Output fields */
++ u64 sectors_defragged;
++ u64 last_scanned; /* Exclusive bytenr */
++};
++int btrfs_defrag_ioctl_args_to_ctrl(struct btrfs_fs_info *fs_info,
++ struct btrfs_ioctl_defrag_range_args *args,
++ struct btrfs_defrag_ctrl *ctrl,
++ u64 max_sectors_to_defrag, u64 newer_than);
+ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
+- struct btrfs_ioctl_defrag_range_args *range,
+- u64 newer_than, unsigned long max_to_defrag);
++ struct btrfs_defrag_ctrl *ctrl);
+ void btrfs_get_block_group_info(struct list_head *groups_list,
+ struct btrfs_ioctl_space_info *space);
+ void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
+diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
+index e0238dd5f2f2..66fa61cb3f23 100644
+--- a/fs/btrfs/dev-replace.c
++++ b/fs/btrfs/dev-replace.c
+@@ -128,7 +128,7 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
+ }
+ slot = path->slots[0];
+ eb = path->nodes[0];
+- item_size = btrfs_item_size_nr(eb, slot);
++ item_size = btrfs_item_size(eb, slot);
+ ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
+
+ if (item_size != sizeof(struct btrfs_dev_replace_item)) {
+@@ -381,7 +381,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
+ }
+
+ if (ret == 0 &&
+- btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
++ btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
+ /*
+ * need to delete old one and insert a new one.
+ * Since no attempt is made to recover any old state, if the
+diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
+index 7721ce0c0604..3b532bab0755 100644
+--- a/fs/btrfs/dir-item.c
++++ b/fs/btrfs/dir-item.c
+@@ -27,7 +27,6 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret;
+ char *ptr;
+- struct btrfs_item *item;
+ struct extent_buffer *leaf;
+
+ ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
+@@ -41,10 +40,9 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
+ return ERR_PTR(ret);
+ WARN_ON(ret > 0);
+ leaf = path->nodes[0];
+- item = btrfs_item_nr(path->slots[0]);
+ ptr = btrfs_item_ptr(leaf, path->slots[0], char);
+- BUG_ON(data_size > btrfs_item_size(leaf, item));
+- ptr += btrfs_item_size(leaf, item) - data_size;
++ ASSERT(data_size <= btrfs_item_size(leaf, path->slots[0]));
++ ptr += btrfs_item_size(leaf, path->slots[0]) - data_size;
+ return (struct btrfs_dir_item *)ptr;
+ }
+
+@@ -271,7 +269,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
+ data_size = sizeof(*di) + name_len;
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+- if (data_size + btrfs_item_size_nr(leaf, slot) +
++ if (data_size + btrfs_item_size(leaf, slot) +
+ sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root->fs_info)) {
+ ret = -EOVERFLOW;
+ } else {
+@@ -409,7 +407,7 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
+ leaf = path->nodes[0];
+ dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
+
+- total_len = btrfs_item_size_nr(leaf, path->slots[0]);
++ total_len = btrfs_item_size(leaf, path->slots[0]);
+ while (cur < total_len) {
+ this_len = sizeof(*dir_item) +
+ btrfs_dir_name_len(leaf, dir_item) +
+@@ -445,7 +443,7 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
+ leaf = path->nodes[0];
+ sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) +
+ btrfs_dir_data_len(leaf, di);
+- item_len = btrfs_item_size_nr(leaf, path->slots[0]);
++ item_len = btrfs_item_size(leaf, path->slots[0]);
+ if (sub_item_len == item_len) {
+ ret = btrfs_del_item(trans, root, path);
+ } else {
+diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
+index 5f0a879c1043..e4275da0572c 100644
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -1935,7 +1935,8 @@ static int transaction_kthread(void *arg)
+ }
+
+ delta = ktime_get_seconds() - cur->start_time;
+- if (cur->state < TRANS_STATE_COMMIT_START &&
++ if (!test_and_clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags) &&
++ cur->state < TRANS_STATE_COMMIT_START &&
+ delta < fs_info->commit_interval) {
+ spin_unlock(&fs_info->trans_lock);
+ delay -= msecs_to_jiffies((delta - 1) * 1000);
+diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
+index 7b4ee1b2d5d8..91ca32c9459a 100644
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -171,7 +171,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
+
+ if (ret == 0) {
+ leaf = path->nodes[0];
+- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
++ item_size = btrfs_item_size(leaf, path->slots[0]);
+ if (item_size >= sizeof(*ei)) {
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_item);
+@@ -865,7 +865,7 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
+ }
+
+ leaf = path->nodes[0];
+- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
++ item_size = btrfs_item_size(leaf, path->slots[0]);
+ if (unlikely(item_size < sizeof(*ei))) {
+ err = -EINVAL;
+ btrfs_print_v0_err(fs_info);
+@@ -1007,7 +1007,7 @@ void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
+ __run_delayed_extent_op(extent_op, leaf, ei);
+
+ ptr = (unsigned long)ei + item_offset;
+- end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
++ end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]);
+ if (ptr < end - size)
+ memmove_extent_buffer(leaf, ptr + size, ptr,
+ end - size - ptr);
+@@ -1119,7 +1119,7 @@ void update_inline_extent_backref(struct btrfs_path *path,
+ } else {
+ *last_ref = 1;
+ size = btrfs_extent_inline_ref_size(type);
+- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
++ item_size = btrfs_item_size(leaf, path->slots[0]);
+ ptr = (unsigned long)iref;
+ end = (unsigned long)ei + item_size;
+ if (ptr + size < end)
+@@ -1634,7 +1634,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
+ }
+
+ leaf = path->nodes[0];
+- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
++ item_size = btrfs_item_size(leaf, path->slots[0]);
+
+ if (unlikely(item_size < sizeof(*ei))) {
+ err = -EINVAL;
+@@ -2316,7 +2316,7 @@ static noinline int check_committed_ref(struct btrfs_root *root,
+ goto out;
+
+ ret = 1;
+- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
++ item_size = btrfs_item_size(leaf, path->slots[0]);
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+
+ /* If extent item has more than 1 inline ref then it's shared */
+@@ -3068,7 +3068,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+ }
+
+ leaf = path->nodes[0];
+- item_size = btrfs_item_size_nr(leaf, extent_slot);
++ item_size = btrfs_item_size(leaf, extent_slot);
+ if (unlikely(item_size < sizeof(*ei))) {
+ ret = -EINVAL;
+ btrfs_print_v0_err(info);
+diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
+index d1cbb64a78f3..107d6557ebc3 100644
+--- a/fs/btrfs/file-item.c
++++ b/fs/btrfs/file-item.c
+@@ -208,7 +208,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans,
+
+ csum_offset = (bytenr - found_key.offset) >>
+ fs_info->sectorsize_bits;
+- csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]);
++ csums_in_item = btrfs_item_size(leaf, path->slots[0]);
+ csums_in_item /= csum_size;
+
+ if (csum_offset == csums_in_item) {
+@@ -274,7 +274,7 @@ static int search_csum_tree(struct btrfs_fs_info *fs_info,
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_csum_item);
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+- itemsize = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
++ itemsize = btrfs_item_size(path->nodes[0], path->slots[0]);
+
+ csum_start = key.offset;
+ csum_len = (itemsize / csum_size) * sectorsize;
+@@ -291,7 +291,7 @@ static int search_csum_tree(struct btrfs_fs_info *fs_info,
+ goto out;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+- itemsize = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
++ itemsize = btrfs_item_size(path->nodes[0], path->slots[0]);
+
+ csum_start = key.offset;
+ csum_len = (itemsize / csum_size) * sectorsize;
+@@ -534,7 +534,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
+ key.type == BTRFS_EXTENT_CSUM_KEY) {
+ offset = (start - key.offset) >> fs_info->sectorsize_bits;
+ if (offset * csum_size <
+- btrfs_item_size_nr(leaf, path->slots[0] - 1))
++ btrfs_item_size(leaf, path->slots[0] - 1))
+ path->slots[0]--;
+ }
+ }
+@@ -559,7 +559,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
+ if (key.offset > start)
+ start = key.offset;
+
+- size = btrfs_item_size_nr(leaf, path->slots[0]);
++ size = btrfs_item_size(leaf, path->slots[0]);
+ csum_end = key.offset + (size / csum_size) * fs_info->sectorsize;
+ if (csum_end <= start) {
+ path->slots[0]++;
+@@ -750,7 +750,7 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info,
+ u32 blocksize_bits = fs_info->sectorsize_bits;
+
+ leaf = path->nodes[0];
+- csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
++ csum_end = btrfs_item_size(leaf, path->slots[0]) / csum_size;
+ csum_end <<= blocksize_bits;
+ csum_end += key->offset;
+
+@@ -834,7 +834,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
+ if (key.offset >= end_byte)
+ break;
+
+- csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
++ csum_end = btrfs_item_size(leaf, path->slots[0]) / csum_size;
+ csum_end <<= blocksize_bits;
+ csum_end += key.offset;
+
+@@ -1002,7 +1002,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
+ item_end = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_csum_item);
+ item_end = (struct btrfs_csum_item *)((char *)item_end +
+- btrfs_item_size_nr(leaf, path->slots[0]));
++ btrfs_item_size(leaf, path->slots[0]));
+ goto found;
+ }
+ ret = PTR_ERR(item);
+@@ -1013,7 +1013,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
+ u32 item_size;
+ /* we found one, but it isn't big enough yet */
+ leaf = path->nodes[0];
+- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
++ item_size = btrfs_item_size(leaf, path->slots[0]);
+ if ((item_size / csum_size) >=
+ MAX_CSUM_ITEMS(fs_info, csum_size)) {
+ /* already at max size, make a new one */
+@@ -1070,7 +1070,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
+ }
+
+ extend_csum:
+- if (csum_offset == btrfs_item_size_nr(leaf, path->slots[0]) /
++ if (csum_offset == btrfs_item_size(leaf, path->slots[0]) /
+ csum_size) {
+ int extend_nr;
+ u64 tmp;
+@@ -1125,7 +1125,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
+ diff = min(diff,
+ MAX_CSUM_ITEMS(fs_info, csum_size) * csum_size);
+
+- diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
++ diff = diff - btrfs_item_size(leaf, path->slots[0]);
+ diff = min_t(u32, btrfs_leaf_free_space(leaf), diff);
+ diff /= csum_size;
+ diff *= csum_size;
+@@ -1162,7 +1162,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
+ csum:
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
+ item_end = (struct btrfs_csum_item *)((unsigned char *)item +
+- btrfs_item_size_nr(leaf, path->slots[0]));
++ btrfs_item_size(leaf, path->slots[0]));
+ item = (struct btrfs_csum_item *)((unsigned char *)item +
+ csum_offset * csum_size);
+ found:
+@@ -1208,6 +1208,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
+ extent_start = key.offset;
+ extent_end = btrfs_file_extent_end(path);
+ em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
++ em->generation = btrfs_file_extent_generation(leaf, fi);
+ if (type == BTRFS_FILE_EXTENT_REG ||
+ type == BTRFS_FILE_EXTENT_PREALLOC) {
+ em->start = extent_start;
+diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
+index 11204dbbe053..12e63be6a35b 100644
+--- a/fs/btrfs/file.c
++++ b/fs/btrfs/file.c
+@@ -277,8 +277,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
+ {
+ struct btrfs_root *inode_root;
+ struct inode *inode;
+- struct btrfs_ioctl_defrag_range_args range;
+- int num_defrag;
++ struct btrfs_defrag_ctrl ctrl = {0};
+ int ret;
+
+ /* get the inode */
+@@ -297,21 +296,23 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
+
+ /* do a chunk of defrag */
+ clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
+- memset(&range, 0, sizeof(range));
+- range.len = (u64)-1;
+- range.start = defrag->last_offset;
++ ctrl.len = (u64)-1;
++ ctrl.start = defrag->last_offset;
++ ctrl.newer_than = defrag->transid;
++ ctrl.max_sectors_to_defrag = BTRFS_DEFRAG_BATCH;
+
+ sb_start_write(fs_info->sb);
+- num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
+- BTRFS_DEFRAG_BATCH);
++ ret = btrfs_defrag_file(inode, NULL, &ctrl);
+ sb_end_write(fs_info->sb);
++ if (ret < 0)
++ goto out;
+ /*
+ * if we filled the whole defrag batch, there
+ * must be more work to do. Queue this defrag
+ * again
+ */
+- if (num_defrag == BTRFS_DEFRAG_BATCH) {
+- defrag->last_offset = range.start;
++ if (ctrl.sectors_defragged == BTRFS_DEFRAG_BATCH) {
++ defrag->last_offset = ctrl.last_scanned;
+ btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
+ } else if (defrag->last_offset && !defrag->cycled) {
+ /*
+@@ -325,7 +326,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
+ } else {
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ }
+-
++out:
+ iput(inode);
+ return 0;
+ cleanup:
+@@ -718,7 +719,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ int modify_tree = -1;
+ int update_refs;
+ int found = 0;
+- int leafs_visited = 0;
+ struct btrfs_path *path = args->path;
+
+ args->bytes_found = 0;
+@@ -756,7 +756,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ path->slots[0]--;
+ }
+ ret = 0;
+- leafs_visited++;
+ next_slot:
+ leaf = path->nodes[0];
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+@@ -768,7 +767,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ ret = 0;
+ break;
+ }
+- leafs_visited++;
+ leaf = path->nodes[0];
+ recow = 1;
+ }
+@@ -1014,7 +1012,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ * which case it unlocked our path, so check path->locks[0] matches a
+ * write lock.
+ */
+- if (!ret && args->replace_extent && leafs_visited == 1 &&
++ if (!ret && args->replace_extent &&
+ path->locks[0] == BTRFS_WRITE_LOCK &&
+ btrfs_leaf_free_space(leaf) >=
+ sizeof(struct btrfs_item) + args->extent_item_size) {
+diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
+index f3fee88c8ee0..a45017b12185 100644
+--- a/fs/btrfs/free-space-cache.c
++++ b/fs/btrfs/free-space-cache.c
+@@ -1580,6 +1580,50 @@ static int tree_insert_offset(struct rb_root *root, u64 offset,
+ return 0;
+ }
+
++/*
++ * This is a little subtle. We *only* have ->max_extent_size set if we actually
++ * searched through the bitmap and figured out the largest ->max_extent_size,
++ * otherwise it's 0. In the case that it's 0 we don't want to tell the
++ * allocator the wrong thing, we want to use the actual real max_extent_size
++ * we've found already if it's larger, or we want to use ->bytes.
++ *
++ * This matters because find_free_space() will skip entries whose ->bytes is
++ * less than the required bytes. So if we didn't search down this bitmap, we
++ * may pick some previous entry that has a smaller ->max_extent_size than we
++ * have. For example, assume we have two entries, one that has
++ * ->max_extent_size set to 4K and ->bytes set to 1M. A second entry hasn't set
++ * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous. We will
++ * call into find_free_space(), and return with max_extent_size == 4K, because
++ * that first bitmap entry had ->max_extent_size set, but the second one did
++ * not. If instead we returned 8K we'd come in searching for 8K, and find the
++ * 8K contiguous range.
++ *
++ * Consider the other case, we have 2 8K chunks in that second entry and still
++ * don't have ->max_extent_size set. We'll return 16K, and the next time the
++ * allocator comes in it'll fully search our second bitmap, and this time it'll
++ * get an uptodate value of 8K as the maximum chunk size. Then we'll get the
++ * right allocation the next loop through.
++ */
++static inline u64 get_max_extent_size(const struct btrfs_free_space *entry)
++{
++ if (entry->bitmap && entry->max_extent_size)
++ return entry->max_extent_size;
++ return entry->bytes;
++}
++
++/*
++ * We want the largest entry to be leftmost, so this is inverted from what you'd
++ * normally expect.
++ */
++static bool entry_less(struct rb_node *node, const struct rb_node *parent)
++{
++ const struct btrfs_free_space *entry, *exist;
++
++ entry = rb_entry(node, struct btrfs_free_space, bytes_index);
++ exist = rb_entry(parent, struct btrfs_free_space, bytes_index);
++ return get_max_extent_size(exist) < get_max_extent_size(entry);
++}
++
+ /*
+ * searches the tree for the given offset.
+ *
+@@ -1592,15 +1636,10 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
+ u64 offset, int bitmap_only, int fuzzy)
+ {
+ struct rb_node *n = ctl->free_space_offset.rb_node;
+- struct btrfs_free_space *entry, *prev = NULL;
++ struct btrfs_free_space *entry = NULL, *prev = NULL;
+
+ /* find entry that is closest to the 'offset' */
+- while (1) {
+- if (!n) {
+- entry = NULL;
+- break;
+- }
+-
++ while (n) {
+ entry = rb_entry(n, struct btrfs_free_space, offset_index);
+ prev = entry;
+
+@@ -1610,6 +1649,8 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
+ n = n->rb_right;
+ else
+ break;
++
++ entry = NULL;
+ }
+
+ if (bitmap_only) {
+@@ -1686,6 +1727,10 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
+ return NULL;
+
+ while (1) {
++ n = rb_next(&entry->offset_index);
++ if (!n)
++ return NULL;
++ entry = rb_entry(n, struct btrfs_free_space, offset_index);
+ if (entry->bitmap) {
+ if (entry->offset + BITS_PER_BITMAP *
+ ctl->unit > offset)
+@@ -1694,11 +1739,6 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
+ if (entry->offset + entry->bytes > offset)
+ break;
+ }
+-
+- n = rb_next(&entry->offset_index);
+- if (!n)
+- return NULL;
+- entry = rb_entry(n, struct btrfs_free_space, offset_index);
+ }
+ return entry;
+ }
+@@ -1708,6 +1748,7 @@ __unlink_free_space(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info)
+ {
+ rb_erase(&info->offset_index, &ctl->free_space_offset);
++ rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
+ ctl->free_extents--;
+
+ if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
+@@ -1734,6 +1775,8 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl,
+ if (ret)
+ return ret;
+
++ rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);
++
+ if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
+ ctl->discardable_extents[BTRFS_STAT_CURR]++;
+ ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
+@@ -1744,6 +1787,22 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl,
+ return ret;
+ }
+
++static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl,
++ struct btrfs_free_space *info)
++{
++ ASSERT(info->bitmap);
++
++ /*
++ * If our entry is empty it's because we're on a cluster and we don't
++ * want to re-link it into our ctl bytes index.
++ */
++ if (RB_EMPTY_NODE(&info->bytes_index))
++ return;
++
++ rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
++ rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);
++}
++
+ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info,
+ u64 offset, u64 bytes)
+@@ -1762,6 +1821,8 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
+ if (info->max_extent_size > ctl->unit)
+ info->max_extent_size = 0;
+
++ relink_bitmap_entry(ctl, info);
++
+ if (start && test_bit(start - 1, info->bitmap))
+ extent_delta++;
+
+@@ -1797,9 +1858,16 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
+
+ bitmap_set(info->bitmap, start, count);
+
++ /*
++ * We set some bytes, we have no idea what the max extent size is
++ * anymore.
++ */
++ info->max_extent_size = 0;
+ info->bytes += bytes;
+ ctl->free_space += bytes;
+
++ relink_bitmap_entry(ctl, info);
++
+ if (start && test_bit(start - 1, info->bitmap))
+ extent_delta--;
+
+@@ -1867,20 +1935,14 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
+
+ *bytes = (u64)(max_bits) * ctl->unit;
+ bitmap_info->max_extent_size = *bytes;
++ relink_bitmap_entry(ctl, bitmap_info);
+ return -1;
+ }
+
+-static inline u64 get_max_extent_size(struct btrfs_free_space *entry)
+-{
+- if (entry->bitmap)
+- return entry->max_extent_size;
+- return entry->bytes;
+-}
+-
+ /* Cache the size of the max extent in bytes */
+ static struct btrfs_free_space *
+ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
+- unsigned long align, u64 *max_extent_size)
++ unsigned long align, u64 *max_extent_size, bool use_bytes_index)
+ {
+ struct btrfs_free_space *entry;
+ struct rb_node *node;
+@@ -1890,16 +1952,38 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
+
+ if (!ctl->free_space_offset.rb_node)
+ goto out;
++again:
++ if (use_bytes_index) {
++ node = rb_first_cached(&ctl->free_space_bytes);
++ } else {
++ entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset),
++ 0, 1);
++ if (!entry)
++ goto out;
++ node = &entry->offset_index;
++ }
+
+- entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
+- if (!entry)
+- goto out;
++ for (; node; node = rb_next(node)) {
++ if (use_bytes_index)
++ entry = rb_entry(node, struct btrfs_free_space,
++ bytes_index);
++ else
++ entry = rb_entry(node, struct btrfs_free_space,
++ offset_index);
+
+- for (node = &entry->offset_index; node; node = rb_next(node)) {
+- entry = rb_entry(node, struct btrfs_free_space, offset_index);
++ /*
++ * If we are using the bytes index then all subsequent entries
++ * in this tree are going to be < bytes, so simply set the max
++ * extent size and exit the loop.
++ *
++ * If we're using the offset index then we need to keep going
++ * through the rest of the tree.
++ */
+ if (entry->bytes < *bytes) {
+ *max_extent_size = max(get_max_extent_size(entry),
+ *max_extent_size);
++ if (use_bytes_index)
++ break;
+ continue;
+ }
+
+@@ -1916,6 +2000,13 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
+ tmp = entry->offset;
+ }
+
++ /*
++ * We don't break here if we're using the bytes index because we
++ * may have another entry that has the correct alignment and is
++ * the right size, so we don't want to miss that possibility.
++ * At worst this adds another loop through the logic, but if we
++ * broke here we could prematurely ENOSPC.
++ */
+ if (entry->bytes < *bytes + align_off) {
+ *max_extent_size = max(get_max_extent_size(entry),
+ *max_extent_size);
+@@ -1923,6 +2014,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
+ }
+
+ if (entry->bitmap) {
++ struct rb_node *old_next = rb_next(node);
+ u64 size = *bytes;
+
+ ret = search_bitmap(ctl, entry, &tmp, &size, true);
+@@ -1935,6 +2027,15 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
+ max(get_max_extent_size(entry),
+ *max_extent_size);
+ }
++
++ /*
++ * The bitmap may have gotten re-arranged in the space
++ * index here because the max_extent_size may have been
++ * updated. Start from the beginning again if this
++ * happened.
++ */
++ if (use_bytes_index && old_next != rb_next(node))
++ goto again;
+ continue;
+ }
+
+@@ -2083,12 +2184,6 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
+
+ bitmap_set_bits(ctl, info, offset, bytes_to_set);
+
+- /*
+- * We set some bytes, we have no idea what the max extent size is
+- * anymore.
+- */
+- info->max_extent_size = 0;
+-
+ return bytes_to_set;
+
+ }
+@@ -2486,6 +2581,7 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
+ info->bytes = bytes;
+ info->trim_state = trim_state;
+ RB_CLEAR_NODE(&info->offset_index);
++ RB_CLEAR_NODE(&info->bytes_index);
+
+ spin_lock(&ctl->tree_lock);
+
+@@ -2799,6 +2895,7 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
+ ctl->start = block_group->start;
+ ctl->private = block_group;
+ ctl->op = &free_space_op;
++ ctl->free_space_bytes = RB_ROOT_CACHED;
+ INIT_LIST_HEAD(&ctl->trimming_ranges);
+ mutex_init(&ctl->cache_writeout_mutex);
+
+@@ -2864,6 +2961,8 @@ static void __btrfs_return_cluster_to_free_space(
+ }
+ tree_insert_offset(&ctl->free_space_offset,
+ entry->offset, &entry->offset_index, bitmap);
++ rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes,
++ entry_less);
+ }
+ cluster->root = RB_ROOT;
+ spin_unlock(&cluster->lock);
+@@ -2965,12 +3064,14 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
+ u64 align_gap = 0;
+ u64 align_gap_len = 0;
+ enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
++ bool use_bytes_index = (offset == block_group->start);
+
+ ASSERT(!btrfs_is_zoned(block_group->fs_info));
+
+ spin_lock(&ctl->tree_lock);
+ entry = find_free_space(ctl, &offset, &bytes_search,
+- block_group->full_stripe_len, max_extent_size);
++ block_group->full_stripe_len, max_extent_size,
++ use_bytes_index);
+ if (!entry)
+ goto out;
+
+@@ -3254,6 +3355,17 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group,
+
+ cluster->window_start = start * ctl->unit + entry->offset;
+ rb_erase(&entry->offset_index, &ctl->free_space_offset);
++ rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes);
++
++ /*
++ * We need to know if we're currently on the normal space index when we
++ * manipulate the bitmap so that we know we need to remove and re-insert
++ * it into the space_index tree. Clear the bytes_index node here so the
++ * bitmap manipulation helpers know not to mess with the space_index
++ * until this bitmap entry is added back into the normal cache.
++ */
++ RB_CLEAR_NODE(&entry->bytes_index);
++
+ ret = tree_insert_offset(&cluster->root, entry->offset,
+ &entry->offset_index, 1);
+ ASSERT(!ret); /* -EEXIST; Logic error */
+@@ -3344,6 +3456,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group *block_group,
+ continue;
+
+ rb_erase(&entry->offset_index, &ctl->free_space_offset);
++ rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes);
+ ret = tree_insert_offset(&cluster->root, entry->offset,
+ &entry->offset_index, 0);
+ total_size += entry->bytes;
+diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
+index 1f23088d43f9..dd982d204d2d 100644
+--- a/fs/btrfs/free-space-cache.h
++++ b/fs/btrfs/free-space-cache.h
+@@ -22,6 +22,7 @@ enum btrfs_trim_state {
+
+ struct btrfs_free_space {
+ struct rb_node offset_index;
++ struct rb_node bytes_index;
+ u64 offset;
+ u64 bytes;
+ u64 max_extent_size;
+@@ -45,6 +46,7 @@ static inline bool btrfs_free_space_trimming_bitmap(
+ struct btrfs_free_space_ctl {
+ spinlock_t tree_lock;
+ struct rb_root free_space_offset;
++ struct rb_root_cached free_space_bytes;
+ u64 free_space;
+ int extents_thresh;
+ int free_extents;
+diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
+index 37f36ffdaf6b..56755ce9a907 100644
+--- a/fs/btrfs/inode-item.c
++++ b/fs/btrfs/inode-item.c
+@@ -19,7 +19,7 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
+ u32 cur_offset = 0;
+ int len;
+
+- item_size = btrfs_item_size_nr(leaf, slot);
++ item_size = btrfs_item_size(leaf, slot);
+ ptr = btrfs_item_ptr_offset(leaf, slot);
+ while (cur_offset < item_size) {
+ ref = (struct btrfs_inode_ref *)(ptr + cur_offset);
+@@ -45,7 +45,7 @@ struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
+ u32 cur_offset = 0;
+ int ref_name_len;
+
+- item_size = btrfs_item_size_nr(leaf, slot);
++ item_size = btrfs_item_size(leaf, slot);
+ ptr = btrfs_item_ptr_offset(leaf, slot);
+
+ /*
+@@ -139,7 +139,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
+ }
+
+ leaf = path->nodes[0];
+- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
++ item_size = btrfs_item_size(leaf, path->slots[0]);
+ if (index)
+ *index = btrfs_inode_extref_index(leaf, extref);
+
+@@ -208,7 +208,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
+ goto out;
+ }
+ leaf = path->nodes[0];
+- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
++ item_size = btrfs_item_size(leaf, path->slots[0]);
+
+ if (index)
+ *index = btrfs_inode_ref_index(leaf, ref);
+@@ -256,7 +256,6 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+- struct btrfs_item *item;
+
+ key.objectid = inode_objectid;
+ key.type = BTRFS_INODE_EXTREF_KEY;
+@@ -282,9 +281,8 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
+ goto out;
+
+ leaf = path->nodes[0];
+- item = btrfs_item_nr(path->slots[0]);
+ ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char);
+- ptr += btrfs_item_size(leaf, item) - ins_len;
++ ptr += btrfs_item_size(leaf, path->slots[0]) - ins_len;
+ extref = (struct btrfs_inode_extref *)ptr;
+
+ btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len);
+@@ -332,7 +330,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
+ if (ref)
+ goto out;
+
+- old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
++ old_size = btrfs_item_size(path->nodes[0], path->slots[0]);
+ btrfs_extend_item(path, ins_len);
+ ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_ref);
+diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
+index 39a674543461..0ed8cc6afa37 100644
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -61,8 +61,6 @@ struct btrfs_iget_args {
+ };
+
+ struct btrfs_dio_data {
+- u64 reserve;
+- loff_t length;
+ ssize_t submitted;
+ struct extent_changeset *data_reserved;
+ };
+@@ -625,7 +623,6 @@ static noinline int compress_file_range(struct async_chunk *async_chunk)
+ again:
+ will_compress = 0;
+ nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
+- BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0);
+ nr_pages = min_t(unsigned long, nr_pages,
+ BTRFS_MAX_COMPRESSED / PAGE_SIZE);
+
+@@ -5950,21 +5947,17 @@ static struct inode *new_simple_dir(struct super_block *s,
+ return inode;
+ }
+
++static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN);
++static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE);
++static_assert(BTRFS_FT_DIR == FT_DIR);
++static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV);
++static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV);
++static_assert(BTRFS_FT_FIFO == FT_FIFO);
++static_assert(BTRFS_FT_SOCK == FT_SOCK);
++static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK);
++
+ static inline u8 btrfs_inode_type(struct inode *inode)
+ {
+- /*
+- * Compile-time asserts that generic FT_* types still match
+- * BTRFS_FT_* types
+- */
+- BUILD_BUG_ON(BTRFS_FT_UNKNOWN != FT_UNKNOWN);
+- BUILD_BUG_ON(BTRFS_FT_REG_FILE != FT_REG_FILE);
+- BUILD_BUG_ON(BTRFS_FT_DIR != FT_DIR);
+- BUILD_BUG_ON(BTRFS_FT_CHRDEV != FT_CHRDEV);
+- BUILD_BUG_ON(BTRFS_FT_BLKDEV != FT_BLKDEV);
+- BUILD_BUG_ON(BTRFS_FT_FIFO != FT_FIFO);
+- BUILD_BUG_ON(BTRFS_FT_SOCK != FT_SOCK);
+- BUILD_BUG_ON(BTRFS_FT_SYMLINK != FT_SYMLINK);
+-
+ return fs_umode_to_ftype(inode->i_mode);
+ }
+
+@@ -6998,8 +6991,7 @@ static noinline int uncompress_inline(struct btrfs_path *path,
+ WARN_ON(pg_offset != 0);
+ compress_type = btrfs_file_extent_compression(leaf, item);
+ max_size = btrfs_file_extent_ram_bytes(leaf, item);
+- inline_size = btrfs_file_extent_inline_item_len(leaf,
+- btrfs_item_nr(path->slots[0]));
++ inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
+ tmp = kmalloc(inline_size, GFP_NOFS);
+ if (!tmp)
+ return -ENOMEM;
+@@ -7773,6 +7765,10 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
+ {
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct extent_map *em = *map;
++ int type;
++ u64 block_start, orig_start, orig_block_len, ram_bytes;
++ bool can_nocow = false;
++ bool space_reserved = false;
+ int ret = 0;
+
+ /*
+@@ -7787,9 +7783,6 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+ ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+ em->block_start != EXTENT_MAP_HOLE)) {
+- int type;
+- u64 block_start, orig_start, orig_block_len, ram_bytes;
+-
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ type = BTRFS_ORDERED_PREALLOC;
+ else
+@@ -7799,53 +7792,92 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
+
+ if (can_nocow_extent(inode, start, &len, &orig_start,
+ &orig_block_len, &ram_bytes, false) == 1 &&
+- btrfs_inc_nocow_writers(fs_info, block_start)) {
+- struct extent_map *em2;
++ btrfs_inc_nocow_writers(fs_info, block_start))
++ can_nocow = true;
++ }
+
+- em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len,
+- orig_start, block_start,
+- len, orig_block_len,
+- ram_bytes, type);
++ if (can_nocow) {
++ struct extent_map *em2;
++
++ /* We can NOCOW, so only need to reserve metadata space. */
++ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
++ if (ret < 0) {
++ /* Our caller expects us to free the input extent map. */
++ free_extent_map(em);
++ *map = NULL;
+ btrfs_dec_nocow_writers(fs_info, block_start);
+- if (type == BTRFS_ORDERED_PREALLOC) {
+- free_extent_map(em);
+- *map = em = em2;
+- }
+-
+- if (em2 && IS_ERR(em2)) {
+- ret = PTR_ERR(em2);
+- goto out;
+- }
+- /*
+- * For inode marked NODATACOW or extent marked PREALLOC,
+- * use the existing or preallocated extent, so does not
+- * need to adjust btrfs_space_info's bytes_may_use.
+- */
+- btrfs_free_reserved_data_space_noquota(fs_info, len);
+- goto skip_cow;
++ goto out;
+ }
++ space_reserved = true;
++
++ em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len,
++ orig_start, block_start,
++ len, orig_block_len,
++ ram_bytes, type);
++ btrfs_dec_nocow_writers(fs_info, block_start);
++ if (type == BTRFS_ORDERED_PREALLOC) {
++ free_extent_map(em);
++ *map = em = em2;
++ }
++
++ if (IS_ERR(em2)) {
++ ret = PTR_ERR(em2);
++ goto out;
++ }
++ } else {
++ const u64 prev_len = len;
++
++ /* Our caller expects us to free the input extent map. */
++ free_extent_map(em);
++ *map = NULL;
++
++ /* We have to COW, so need to reserve metadata and data space. */
++ ret = btrfs_delalloc_reserve_space(BTRFS_I(inode),
++ &dio_data->data_reserved,
++ start, len);
++ if (ret < 0)
++ goto out;
++ space_reserved = true;
++
++ em = btrfs_new_extent_direct(BTRFS_I(inode), start, len);
++ if (IS_ERR(em)) {
++ ret = PTR_ERR(em);
++ goto out;
++ }
++ *map = em;
++ len = min(len, em->len - (start - em->start));
++ if (len < prev_len)
++ btrfs_delalloc_release_space(BTRFS_I(inode),
++ dio_data->data_reserved,
++ start + len, prev_len - len,
++ true);
+ }
+
+- /* this will cow the extent */
+- free_extent_map(em);
+- *map = em = btrfs_new_extent_direct(BTRFS_I(inode), start, len);
+- if (IS_ERR(em)) {
+- ret = PTR_ERR(em);
+- goto out;
+- }
++ /*
++ * We have created our ordered extent, so we can now release our reservation
++ * for an outstanding extent.
++ */
++ btrfs_delalloc_release_extents(BTRFS_I(inode), len);
+
+- len = min(len, em->len - (start - em->start));
+-
+-skip_cow:
+ /*
+ * Need to update the i_size under the extent lock so buffered
+ * readers will get the updated i_size when we unlock.
+ */
+ if (start + len > i_size_read(inode))
+ i_size_write(inode, start + len);
+-
+- dio_data->reserve -= len;
+ out:
++ if (ret && space_reserved) {
++ btrfs_delalloc_release_extents(BTRFS_I(inode), len);
++ if (can_nocow) {
++ btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
++ } else {
++ btrfs_delalloc_release_space(BTRFS_I(inode),
++ dio_data->data_reserved,
++ start, len, true);
++ extent_changeset_free(dio_data->data_reserved);
++ dio_data->data_reserved = NULL;
++ }
++ }
+ return ret;
+ }
+
+@@ -7887,18 +7919,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
+ if (!dio_data)
+ return -ENOMEM;
+
+- dio_data->length = length;
+- if (write) {
+- dio_data->reserve = round_up(length, fs_info->sectorsize);
+- ret = btrfs_delalloc_reserve_space(BTRFS_I(inode),
+- &dio_data->data_reserved,
+- start, dio_data->reserve);
+- if (ret) {
+- extent_changeset_free(dio_data->data_reserved);
+- kfree(dio_data);
+- return ret;
+- }
+- }
+ iomap->private = dio_data;
+
+
+@@ -7991,14 +8011,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state);
+ err:
+- if (dio_data) {
+- btrfs_delalloc_release_space(BTRFS_I(inode),
+- dio_data->data_reserved, start,
+- dio_data->reserve, true);
+- btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve);
+- extent_changeset_free(dio_data->data_reserved);
+- kfree(dio_data);
+- }
++ kfree(dio_data);
++
+ return ret;
+ }
+
+@@ -8028,14 +8042,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+ ret = -ENOTBLK;
+ }
+
+- if (write) {
+- if (dio_data->reserve)
+- btrfs_delalloc_release_space(BTRFS_I(inode),
+- dio_data->data_reserved, pos,
+- dio_data->reserve, true);
+- btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length);
++ if (write)
+ extent_changeset_free(dio_data->data_reserved);
+- }
+ out:
+ kfree(dio_data);
+ iomap->private = NULL;
+diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
+index 48e03e176f31..5de240144273 100644
+--- a/fs/btrfs/ioctl.c
++++ b/fs/btrfs/ioctl.c
+@@ -1020,23 +1020,37 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
+ return em;
+ }
+
++static u32 get_extent_max_capacity(const struct extent_map *em)
++{
++ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
++ return BTRFS_MAX_COMPRESSED;
++ return BTRFS_MAX_EXTENT_SIZE;
++}
++
+ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
+ bool locked)
+ {
+ struct extent_map *next;
+- bool ret = true;
++ bool ret = false;
+
+ /* this is the last extent */
+ if (em->start + em->len >= i_size_read(inode))
+- return false;
++ return ret;
+
+ next = defrag_lookup_extent(inode, em->start + em->len, locked);
++ /* No more em or hole */
+ if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
+- ret = false;
+- else if ((em->block_start + em->block_len == next->block_start) &&
+- (em->block_len > SZ_128K && next->block_len > SZ_128K))
+- ret = false;
+-
++ goto out;
++ if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
++ goto out;
++ /*
++ * If the next extent is at its max capacity, defragging the current extent
++ * makes no sense, as the total number of extents won't change.
++ */
++ if (next->len >= get_extent_max_capacity(em))
++ goto out;
++ ret = true;
++out:
+ free_extent_map(next);
+ return ret;
+ }
+@@ -1146,22 +1160,21 @@ struct defrag_target_range {
+ /*
+ * Collect all valid target extents.
+ *
++ * @ctrl: extra defrag policy control
+ * @start: file offset to lookup
+ * @len: length to lookup
+- * @extent_thresh: file extent size threshold, any extent size >= this value
+- * will be ignored
+- * @newer_than: only defrag extents newer than this value
+- * @do_compress: whether the defrag is doing compression
+- * if true, @extent_thresh will be ignored and all regular
+- * file extents meeting @newer_than will be targets.
+ * @locked: if the range has already held extent lock
+ * @target_list: list of targets file extents
++ *
++ * Will update ctrl::last_scanned.
+ */
+ static int defrag_collect_targets(struct btrfs_inode *inode,
+- u64 start, u64 len, u32 extent_thresh,
+- u64 newer_than, bool do_compress,
+- bool locked, struct list_head *target_list)
++ struct btrfs_defrag_ctrl *ctrl,
++ u64 start, u32 len, bool locked,
++ struct list_head *target_list)
+ {
++ bool do_compress = ctrl->flags & BTRFS_DEFRAG_RANGE_COMPRESS;
++ bool last_is_target = false;
+ u64 cur = start;
+ int ret = 0;
+
+@@ -1171,6 +1184,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
+ bool next_mergeable = true;
+ u64 range_len;
+
++ last_is_target = false;
+ em = defrag_lookup_extent(&inode->vfs_inode, cur, locked);
+ if (!em)
+ break;
+@@ -1181,7 +1195,11 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
+ goto next;
+
+ /* Skip older extent */
+- if (em->generation < newer_than)
++ if (em->generation < ctrl->newer_than)
++ goto next;
++
++ /* This em is under writeback, no need to defrag */
++ if (em->generation == (u64)-1)
+ goto next;
+
+ /*
+@@ -1221,7 +1239,14 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
+ goto add;
+
+ /* Skip too large extent */
+- if (range_len >= extent_thresh)
++ if (range_len >= ctrl->extent_thresh)
++ goto next;
++
++ /*
++ * Skip extents already at their max capacity; this is mostly for
++ * compressed extents, whose max capacity is only 128K.
++ */
++ if (em->len >= get_extent_max_capacity(em))
+ goto next;
+
+ next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
+@@ -1242,6 +1267,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
+ }
+
+ add:
++ last_is_target = true;
+ range_len = min(extent_map_end(em), start + len) - cur;
+ /*
+ * This one is a good target, check if it can be merged into
+@@ -1285,10 +1311,27 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
+ kfree(entry);
+ }
+ }
++ if (!ret) {
++ /*
++ * If the last extent is not a target, the caller can skip to
++ * the end of that extent.
++ * Otherwise, we can only go to the end of the specified range.
++ *
++ * And we may get a range smaller than the current
++ * ctrl->last_scanned (e.g. when executed in the defrag_one_range
++ * call), so we have to ensure we don't decrease
++ * ctrl->last_scanned.
++ */
++ if (!last_is_target)
++ ctrl->last_scanned = max(cur, ctrl->last_scanned);
++ else
++ ctrl->last_scanned = max(start + len, ctrl->last_scanned);
++ }
+ return ret;
+ }
+
+ #define CLUSTER_SIZE (SZ_256K)
++static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
+
+ /*
+ * Defrag one contiguous target range.
+@@ -1342,8 +1385,8 @@ static int defrag_one_locked_target(struct btrfs_inode *inode,
+ return ret;
+ }
+
+-static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
+- u32 extent_thresh, u64 newer_than, bool do_compress)
++static int defrag_one_range(struct btrfs_inode *inode,
++ struct btrfs_defrag_ctrl *ctrl, u64 start, u32 len)
+ {
+ struct extent_state *cached_state = NULL;
+ struct defrag_target_range *entry;
+@@ -1387,8 +1430,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
+ * And this time we have extent locked already, pass @locked = true
+ * so that we won't relock the extent range and cause deadlock.
+ */
+- ret = defrag_collect_targets(inode, start, len, extent_thresh,
+- newer_than, do_compress, true,
++ ret = defrag_collect_targets(inode, ctrl, start, len, true,
+ &target_list);
+ if (ret < 0)
+ goto unlock_extent;
+@@ -1398,6 +1440,8 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
+ &cached_state);
+ if (ret < 0)
+ break;
++ ctrl->sectors_defragged += entry->len >>
++ inode->root->fs_info->sectorsize_bits;
+ }
+
+ list_for_each_entry_safe(entry, tmp, &target_list, list) {
+@@ -1419,12 +1463,17 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
+ return ret;
+ }
+
++/*
++ * Return <0 for error.
++ * Return >0 if we hit the ctrl->max_sectors_to_defrag limit
++ * Return 0 if we finished the range without error.
++ *
++ * For >= 0 case, ctrl->last_scanned and ctrl->sectors_defragged will be updated.
++ */
+ static int defrag_one_cluster(struct btrfs_inode *inode,
+ struct file_ra_state *ra,
+- u64 start, u32 len, u32 extent_thresh,
+- u64 newer_than, bool do_compress,
+- unsigned long *sectors_defragged,
+- unsigned long max_sectors)
++ struct btrfs_defrag_ctrl *ctrl,
++ u64 start, u32 len)
+ {
+ const u32 sectorsize = inode->root->fs_info->sectorsize;
+ struct defrag_target_range *entry;
+@@ -1432,9 +1481,7 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
+ LIST_HEAD(target_list);
+ int ret;
+
+- BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
+- ret = defrag_collect_targets(inode, start, len, extent_thresh,
+- newer_than, do_compress, false,
++ ret = defrag_collect_targets(inode, ctrl, start, len, false,
+ &target_list);
+ if (ret < 0)
+ goto out;
+@@ -1443,32 +1490,25 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
+ u32 range_len = entry->len;
+
+ /* Reached or beyond the limit */
+- if (max_sectors && *sectors_defragged >= max_sectors) {
++ if (ctrl->max_sectors_to_defrag &&
++ ctrl->sectors_defragged >= ctrl->max_sectors_to_defrag) {
+ ret = 1;
+ break;
+ }
+
+- if (max_sectors)
++ if (ctrl->max_sectors_to_defrag)
+ range_len = min_t(u32, range_len,
+- (max_sectors - *sectors_defragged) * sectorsize);
++ (ctrl->max_sectors_to_defrag -
++ ctrl->sectors_defragged) * sectorsize);
+
+ if (ra)
+ page_cache_sync_readahead(inode->vfs_inode.i_mapping,
+ ra, NULL, entry->start >> PAGE_SHIFT,
+ ((entry->start + range_len - 1) >> PAGE_SHIFT) -
+ (entry->start >> PAGE_SHIFT) + 1);
+- /*
+- * Here we may not defrag any range if holes are punched before
+- * we locked the pages.
+- * But that's fine, it only affects the @sectors_defragged
+- * accounting.
+- */
+- ret = defrag_one_range(inode, entry->start, range_len,
+- extent_thresh, newer_than, do_compress);
++ ret = defrag_one_range(inode, ctrl, entry->start, range_len);
+ if (ret < 0)
+ break;
+- *sectors_defragged += range_len >>
+- inode->root->fs_info->sectorsize_bits;
+ }
+ out:
+ list_for_each_entry_safe(entry, tmp, &target_list, list) {
+@@ -1478,64 +1518,93 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
+ return ret;
+ }
+
++/*
++ * Convert the old ioctl format to the new btrfs_defrag_ctrl structure.
++ *
++ * Will also do basic tasks like setting default values and sanity checks.
++ */
++int btrfs_defrag_ioctl_args_to_ctrl(struct btrfs_fs_info *fs_info,
++ struct btrfs_ioctl_defrag_range_args *args,
++ struct btrfs_defrag_ctrl *ctrl,
++ u64 max_sectors_to_defrag, u64 newer_than)
++{
++ u64 range_end;
++
++ if (args->flags & ~BTRFS_DEFRAG_RANGE_FLAGS_MASK)
++ return -EOPNOTSUPP;
++ if (args->compress_type >= BTRFS_NR_COMPRESS_TYPES)
++ return -EOPNOTSUPP;
++
++ ctrl->start = round_down(args->start, fs_info->sectorsize);
++ /*
++ * If @len does not overflow with @start nor is -1, align the length.
++ * Otherwise set it to (u64)-1 so later btrfs_defrag_file() will
++ * determine the length using isize.
++ */
++ if (!check_add_overflow(args->start, args->len, &range_end) &&
++ args->len != (u64)-1)
++ ctrl->len = round_up(range_end, fs_info->sectorsize) -
++ ctrl->start;
++ else
++ ctrl->len = -1;
++ ctrl->flags = args->flags;
++ ctrl->compress = args->compress_type;
++ if (args->extent_thresh == 0)
++ ctrl->extent_thresh = SZ_256K;
++ else
++ ctrl->extent_thresh = args->extent_thresh;
++ ctrl->newer_than = newer_than;
++ ctrl->last_scanned = 0;
++ ctrl->sectors_defragged = 0;
++ return 0;
++}
++
+ /*
+ * Entry point to file defragmentation.
+ *
+ * @inode: inode to be defragged
+ * @ra: readahead state (can be NUL)
+- * @range: defrag options including range and flags
+- * @newer_than: minimum transid to defrag
+- * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
+- * will be defragged.
++ * @ctrl: defrag options including range and various policy parameters
+ *
+ * Return <0 for error.
+- * Return >=0 for the number of sectors defragged, and range->start will be updated
+- * to indicate the file offset where next defrag should be started at.
+- * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without
+- * defragging all the range).
++ * Return 0 if the defrag is done without error; ctrl->last_scanned and
++ * ctrl->sectors_defragged will be updated.
+ */
+ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
+- struct btrfs_ioctl_defrag_range_args *range,
+- u64 newer_than, unsigned long max_to_defrag)
++ struct btrfs_defrag_ctrl *ctrl)
+ {
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+- unsigned long sectors_defragged = 0;
+ u64 isize = i_size_read(inode);
+ u64 cur;
+ u64 last_byte;
+- bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS;
++ bool do_compress = ctrl->flags & BTRFS_DEFRAG_RANGE_COMPRESS;
+ bool ra_allocated = false;
+- int compress_type = BTRFS_COMPRESS_ZLIB;
+ int ret = 0;
+- u32 extent_thresh = range->extent_thresh;
+ pgoff_t start_index;
+
+ if (isize == 0)
+ return 0;
+
+- if (range->start >= isize)
++ if (ctrl->start >= isize)
+ return -EINVAL;
+
+- if (do_compress) {
+- if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
+- return -EINVAL;
+- if (range->compress_type)
+- compress_type = range->compress_type;
+- }
++ if (do_compress)
++ ASSERT(ctrl->compress < BTRFS_NR_COMPRESS_TYPES);
+
+- if (extent_thresh == 0)
+- extent_thresh = SZ_256K;
++ if (ctrl->extent_thresh == 0)
++ ctrl->extent_thresh = SZ_256K;
+
+- if (range->start + range->len > range->start) {
++ if (ctrl->start + ctrl->len > ctrl->start) {
+ /* Got a specific range */
+- last_byte = min(isize, range->start + range->len);
++ last_byte = min(isize, ctrl->start + ctrl->len);
+ } else {
+ /* Defrag until file end */
+ last_byte = isize;
+ }
+
+ /* Align the range */
+- cur = round_down(range->start, fs_info->sectorsize);
++ cur = round_down(ctrl->start, fs_info->sectorsize);
++ ctrl->last_scanned = cur;
+ last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
+
+ /*
+@@ -1559,12 +1628,9 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
+ inode->i_mapping->writeback_index = start_index;
+
+ while (cur < last_byte) {
+- const unsigned long prev_sectors_defragged = sectors_defragged;
++ const unsigned long prev_sectors_defragged = ctrl->sectors_defragged;
+ u64 cluster_end;
+
+- /* The cluster size 256K should always be page aligned */
+- BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
+-
+ if (btrfs_defrag_cancelled(fs_info)) {
+ ret = -EAGAIN;
+ break;
+@@ -1586,48 +1652,41 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
+ break;
+ }
+ if (do_compress)
+- BTRFS_I(inode)->defrag_compress = compress_type;
+- ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
+- cluster_end + 1 - cur, extent_thresh,
+- newer_than, do_compress,
+- &sectors_defragged, max_to_defrag);
++ BTRFS_I(inode)->defrag_compress = ctrl->compress;
++ ret = defrag_one_cluster(BTRFS_I(inode), ra, ctrl, cur,
++ cluster_end + 1 - cur);
+
+- if (sectors_defragged > prev_sectors_defragged)
++ if (ctrl->sectors_defragged > prev_sectors_defragged)
+ balance_dirty_pages_ratelimited(inode->i_mapping);
+
+ btrfs_inode_unlock(inode, 0);
+ if (ret < 0)
+ break;
+- cur = cluster_end + 1;
++ cur = max(cluster_end + 1, ctrl->last_scanned);
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
++ cond_resched();
+ }
+
+ if (ra_allocated)
+ kfree(ra);
+- /*
+- * Update range.start for autodefrag, this will indicate where to start
+- * in next run.
+- */
+- range->start = cur;
+- if (sectors_defragged) {
++ if (ctrl->sectors_defragged) {
+ /*
+ * We have defragged some sectors, for compression case they
+ * need to be written back immediately.
+ */
+- if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
++ if (ctrl->flags & BTRFS_DEFRAG_RANGE_START_IO) {
+ filemap_flush(inode->i_mapping);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_flush(inode->i_mapping);
+ }
+- if (range->compress_type == BTRFS_COMPRESS_LZO)
++ if (ctrl->compress == BTRFS_COMPRESS_LZO)
+ btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
+- else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
++ else if (ctrl->compress == BTRFS_COMPRESS_ZSTD)
+ btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
+- ret = sectors_defragged;
+ }
+ if (do_compress) {
+ btrfs_inode_lock(inode, 0);
+@@ -2147,7 +2206,7 @@ static noinline int copy_to_sk(struct btrfs_path *path,
+
+ for (i = slot; i < nritems; i++) {
+ item_off = btrfs_item_ptr_offset(leaf, i);
+- item_len = btrfs_item_size_nr(leaf, i);
++ item_len = btrfs_item_size(leaf, i);
+
+ btrfs_item_key_to_cpu(leaf, key, i);
+ if (!key_in_sk(key, sk))
+@@ -2601,7 +2660,7 @@ static int btrfs_search_path_in_tree_user(struct user_namespace *mnt_userns,
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ item_off = btrfs_item_ptr_offset(leaf, slot);
+- item_len = btrfs_item_size_nr(leaf, slot);
++ item_len = btrfs_item_size(leaf, slot);
+ /* Check if dirid in ROOT_REF corresponds to passed dirid */
+ rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
+ if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) {
+@@ -2803,7 +2862,7 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)
+
+ item_off = btrfs_item_ptr_offset(leaf, slot)
+ + sizeof(struct btrfs_root_ref);
+- item_len = btrfs_item_size_nr(leaf, slot)
++ item_len = btrfs_item_size(leaf, slot)
+ - sizeof(struct btrfs_root_ref);
+ read_extent_buffer(leaf, subvol_info->name,
+ item_off, item_len);
+@@ -3148,6 +3207,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
+ struct inode *inode = file_inode(file);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_ioctl_defrag_range_args range = {0};
++ struct btrfs_defrag_ctrl ctrl = {0};
+ int ret;
+
+ ret = mnt_want_write_file(file);
+@@ -3193,8 +3253,11 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
+ /* the rest are all set to zero by kzalloc */
+ range.len = (u64)-1;
+ }
+- ret = btrfs_defrag_file(file_inode(file), &file->f_ra,
+- &range, BTRFS_OLDEST_GENERATION, 0);
++ ret = btrfs_defrag_ioctl_args_to_ctrl(root->fs_info, &range,
++ &ctrl, 0, BTRFS_OLDEST_GENERATION);
++ if (ret < 0)
++ break;
++ ret = btrfs_defrag_file(file_inode(file), &file->f_ra, &ctrl);
+ if (ret > 0)
+ ret = 0;
+ break;
+@@ -3683,7 +3746,6 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
+ {
+ struct btrfs_trans_handle *trans;
+ u64 transid;
+- int ret;
+
+ trans = btrfs_attach_transaction_barrier(root);
+ if (IS_ERR(trans)) {
+@@ -3695,11 +3757,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
+ goto out;
+ }
+ transid = trans->transid;
+- ret = btrfs_commit_transaction_async(trans);
+- if (ret) {
+- btrfs_end_transaction(trans);
+- return ret;
+- }
++ btrfs_commit_transaction_async(trans);
+ out:
+ if (argp)
+ if (copy_to_user(argp, &transid, sizeof(transid)))
+diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
+index 0fb90cbe7669..430ad36b8b08 100644
+--- a/fs/btrfs/lzo.c
++++ b/fs/btrfs/lzo.c
+@@ -55,6 +55,9 @@
+ * 0x1000 | SegHdr N+1| Data payload N+1 ... |
+ */
+
++#define WORKSPACE_BUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE))
++#define WORKSPACE_CBUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE))
++
+ struct workspace {
+ void *mem;
+ void *buf; /* where decompressed data goes */
+@@ -83,8 +86,8 @@ struct list_head *lzo_alloc_workspace(unsigned int level)
+ return ERR_PTR(-ENOMEM);
+
+ workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
+- workspace->buf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL);
+- workspace->cbuf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL);
++ workspace->buf = kvmalloc(WORKSPACE_BUF_LENGTH, GFP_KERNEL);
++ workspace->cbuf = kvmalloc(WORKSPACE_CBUF_LENGTH, GFP_KERNEL);
+ if (!workspace->mem || !workspace->buf || !workspace->cbuf)
+ goto fail;
+
+@@ -380,6 +383,17 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
+ kunmap(cur_page);
+ cur_in += LZO_LEN;
+
++ if (seg_len > WORKSPACE_CBUF_LENGTH) {
++ /*
++ * seg_len shouldn't be larger than what we have allocated
++ * for workspace->cbuf.
++ */
++ btrfs_err(fs_info, "unexpectedly large lzo segment len %u",
++ seg_len);
++ ret = -EIO;
++ goto out;
++ }
++
+ /* Copy the compressed segment payload into workspace */
+ copy_compressed_segment(cb, workspace->cbuf, seg_len, &cur_in);
+
+@@ -422,7 +436,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in,
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
+ size_t in_len;
+ size_t out_len;
+- size_t max_segment_len = lzo1x_worst_compress(PAGE_SIZE);
++ size_t max_segment_len = WORKSPACE_BUF_LENGTH;
+ int ret = 0;
+ char *kaddr;
+ unsigned long bytes;
+diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
+index aae1027bd76a..0775ae9f4419 100644
+--- a/fs/btrfs/print-tree.c
++++ b/fs/btrfs/print-tree.c
+@@ -85,7 +85,7 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type)
+ struct btrfs_disk_key key;
+ unsigned long end;
+ unsigned long ptr;
+- u32 item_size = btrfs_item_size_nr(eb, slot);
++ u32 item_size = btrfs_item_size(eb, slot);
+ u64 flags;
+ u64 offset;
+ int ref_index = 0;
+@@ -200,7 +200,6 @@ void btrfs_print_leaf(struct extent_buffer *l)
+ struct btrfs_fs_info *fs_info;
+ int i;
+ u32 type, nr;
+- struct btrfs_item *item;
+ struct btrfs_root_item *ri;
+ struct btrfs_dir_item *di;
+ struct btrfs_inode_item *ii;
+@@ -224,12 +223,11 @@ void btrfs_print_leaf(struct extent_buffer *l)
+ btrfs_leaf_free_space(l), btrfs_header_owner(l));
+ print_eb_refs_lock(l);
+ for (i = 0 ; i < nr ; i++) {
+- item = btrfs_item_nr(i);
+ btrfs_item_key_to_cpu(l, &key, i);
+ type = key.type;
+ pr_info("\titem %d key (%llu %u %llu) itemoff %d itemsize %d\n",
+ i, key.objectid, type, key.offset,
+- btrfs_item_offset(l, item), btrfs_item_size(l, item));
++ btrfs_item_offset(l, i), btrfs_item_size(l, i));
+ switch (type) {
+ case BTRFS_INODE_ITEM_KEY:
+ ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
+@@ -347,7 +345,7 @@ void btrfs_print_leaf(struct extent_buffer *l)
+ case BTRFS_UUID_KEY_SUBVOL:
+ case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
+ print_uuid_item(l, btrfs_item_ptr_offset(l, i),
+- btrfs_item_size_nr(l, i));
++ btrfs_item_size(l, i));
+ break;
+ }
+ }
+diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
+index b1cb5a8c2999..a978676aa627 100644
+--- a/fs/btrfs/props.c
++++ b/fs/btrfs/props.c
+@@ -158,7 +158,7 @@ static int iterate_object_props(struct btrfs_root *root,
+
+ di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+ cur = 0;
+- total_len = btrfs_item_size_nr(leaf, slot);
++ total_len = btrfs_item_size(leaf, slot);
+
+ while (cur < total_len) {
+ u32 name_len = btrfs_dir_name_len(leaf, di);
+diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
+index 26134b7476a2..3712cd5fdbfe 100644
+--- a/fs/btrfs/qgroup.c
++++ b/fs/btrfs/qgroup.c
+@@ -258,16 +258,19 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
+ return 0;
+ }
+
+-/* must be called with qgroup_lock held */
+-static int add_relation_rb(struct btrfs_fs_info *fs_info,
+- u64 memberid, u64 parentid)
++/*
++ * Add relation specified by two qgroups.
++ *
++ * Must be called with qgroup_lock held.
++ *
++ * Return: 0 on success
++ * -ENOENT if one of the qgroups is NULL
++ * <0 other errors
++ */
++static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *parent)
+ {
+- struct btrfs_qgroup *member;
+- struct btrfs_qgroup *parent;
+ struct btrfs_qgroup_list *list;
+
+- member = find_qgroup_rb(fs_info, memberid);
+- parent = find_qgroup_rb(fs_info, parentid);
+ if (!member || !parent)
+ return -ENOENT;
+
+@@ -283,7 +286,27 @@ static int add_relation_rb(struct btrfs_fs_info *fs_info,
+ return 0;
+ }
+
+-/* must be called with qgroup_lock held */
++/*
++ * Add relation specified by two qgoup ids.
++ *
++ * Must be called with qgroup_lock held.
++ *
++ * Return: 0 on success
++ * -ENOENT if one of the ids does not exist
++ * <0 other errors
++ */
++static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid)
++{
++ struct btrfs_qgroup *member;
++ struct btrfs_qgroup *parent;
++
++ member = find_qgroup_rb(fs_info, memberid);
++ parent = find_qgroup_rb(fs_info, parentid);
++
++ return __add_relation_rb(member, parent);
++}
++
++/* Must be called with qgroup_lock held */
+ static int del_relation_rb(struct btrfs_fs_info *fs_info,
+ u64 memberid, u64 parentid)
+ {
+@@ -1444,7 +1467,7 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
+ }
+
+ spin_lock(&fs_info->qgroup_lock);
+- ret = add_relation_rb(fs_info, src, dst);
++ ret = __add_relation_rb(member, parent);
+ if (ret < 0) {
+ spin_unlock(&fs_info->qgroup_lock);
+ goto out;
+diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
+index e2b9f8616501..f34130d90dee 100644
+--- a/fs/btrfs/ref-verify.c
++++ b/fs/btrfs/ref-verify.c
+@@ -435,7 +435,7 @@ static int process_extent_item(struct btrfs_fs_info *fs_info,
+ struct btrfs_extent_data_ref *dref;
+ struct btrfs_shared_data_ref *sref;
+ struct extent_buffer *leaf = path->nodes[0];
+- u32 item_size = btrfs_item_size_nr(leaf, slot);
++ u32 item_size = btrfs_item_size(leaf, slot);
+ unsigned long end, ptr;
+ u64 offset, flags, count;
+ int type, ret;
+diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
+index e0f93b357548..a3930da4eb3f 100644
+--- a/fs/btrfs/reflink.c
++++ b/fs/btrfs/reflink.c
+@@ -439,7 +439,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
+ break;
+ }
+ next_key_min_offset = key.offset + datal;
+- size = btrfs_item_size_nr(leaf, slot);
++ size = btrfs_item_size(leaf, slot);
+ read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
+ size);
+
+diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
+index 33a0ee7ac590..ee0a0efc7efd 100644
+--- a/fs/btrfs/relocation.c
++++ b/fs/btrfs/relocation.c
+@@ -3149,7 +3149,7 @@ static int add_tree_block(struct reloc_control *rc,
+ u64 owner = 0;
+
+ eb = path->nodes[0];
+- item_size = btrfs_item_size_nr(eb, path->slots[0]);
++ item_size = btrfs_item_size(eb, path->slots[0]);
+
+ if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
+ item_size >= sizeof(*ei) + sizeof(*bi)) {
+diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
+index d20166336557..3297368aa359 100644
+--- a/fs/btrfs/root-tree.c
++++ b/fs/btrfs/root-tree.c
+@@ -25,7 +25,7 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
+ u32 len;
+ int need_reset = 0;
+
+- len = btrfs_item_size_nr(eb, slot);
++ len = btrfs_item_size(eb, slot);
+ read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot),
+ min_t(u32, len, sizeof(*item)));
+ if (len < sizeof(*item))
+@@ -146,7 +146,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
+ l = path->nodes[0];
+ slot = path->slots[0];
+ ptr = btrfs_item_ptr_offset(l, slot);
+- old_len = btrfs_item_size_nr(l, slot);
++ old_len = btrfs_item_size(l, slot);
+
+ /*
+ * If this is the first time we update the root item which originated
+diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
+index 8f6ceea33969..d175c5ab1134 100644
+--- a/fs/btrfs/scrub.c
++++ b/fs/btrfs/scrub.c
+@@ -758,7 +758,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
+
+ eb = path->nodes[0];
+ ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
+- item_size = btrfs_item_size_nr(eb, path->slots[0]);
++ item_size = btrfs_item_size(eb, path->slots[0]);
+
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ do {
+diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
+index 040324d71118..93b9fe2dca67 100644
+--- a/fs/btrfs/send.c
++++ b/fs/btrfs/send.c
+@@ -898,7 +898,6 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
+ iterate_inode_ref_t iterate, void *ctx)
+ {
+ struct extent_buffer *eb = path->nodes[0];
+- struct btrfs_item *item;
+ struct btrfs_inode_ref *iref;
+ struct btrfs_inode_extref *extref;
+ struct btrfs_path *tmp_path;
+@@ -930,12 +929,11 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
+ if (found_key->type == BTRFS_INODE_REF_KEY) {
+ ptr = (unsigned long)btrfs_item_ptr(eb, slot,
+ struct btrfs_inode_ref);
+- item = btrfs_item_nr(slot);
+- total = btrfs_item_size(eb, item);
++ total = btrfs_item_size(eb, slot);
+ elem_size = sizeof(*iref);
+ } else {
+ ptr = btrfs_item_ptr_offset(eb, slot);
+- total = btrfs_item_size_nr(eb, slot);
++ total = btrfs_item_size(eb, slot);
+ elem_size = sizeof(*extref);
+ }
+
+@@ -1018,7 +1016,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
+ {
+ int ret = 0;
+ struct extent_buffer *eb;
+- struct btrfs_item *item;
+ struct btrfs_dir_item *di;
+ struct btrfs_key di_key;
+ char *buf = NULL;
+@@ -1047,11 +1044,10 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
+
+ eb = path->nodes[0];
+ slot = path->slots[0];
+- item = btrfs_item_nr(slot);
+ di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
+ cur = 0;
+ len = 0;
+- total = btrfs_item_size(eb, item);
++ total = btrfs_item_size(eb, slot);
+
+ num = 0;
+ while (cur < total) {
+@@ -3622,7 +3618,7 @@ static int is_ancestor(struct btrfs_root *root,
+ key.type != BTRFS_INODE_EXTREF_KEY)
+ break;
+
+- item_size = btrfs_item_size_nr(leaf, slot);
++ item_size = btrfs_item_size(leaf, slot);
+ while (cur_offset < item_size) {
+ u64 parent;
+ u64 parent_gen;
+@@ -4983,6 +4979,10 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ unlock_page(page);
++ btrfs_err(fs_info,
++ "send: IO error at offset %llu for inode %llu root %llu",
++ page_offset(page), sctx->cur_ino,
++ sctx->send_root->root_key.objectid);
+ put_page(page);
+ ret = -EIO;
+ break;
+@@ -6566,7 +6566,7 @@ static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path,
+ }
+
+ leaf = path->nodes[0];
+- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
++ item_size = btrfs_item_size(leaf, path->slots[0]);
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ while (cur_offset < item_size) {
+ extref = (struct btrfs_inode_extref *)(ptr +
+@@ -6791,8 +6791,8 @@ static int tree_compare_item(struct btrfs_path *left_path,
+ int len1, len2;
+ unsigned long off1, off2;
+
+- len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
+- len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
++ len1 = btrfs_item_size(left_path->nodes[0], left_path->slots[0]);
++ len2 = btrfs_item_size(right_path->nodes[0], right_path->slots[0]);
+ if (len1 != len2)
+ return 1;
+
+diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
+index f9eff3b0f77c..836a20fdfca1 100644
+--- a/fs/btrfs/sysfs.c
++++ b/fs/btrfs/sysfs.c
+@@ -1104,6 +1104,11 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
+ static char btrfs_unknown_feature_names[FEAT_MAX][NUM_FEATURE_BITS][BTRFS_FEATURE_NAME_MAX];
+ static struct btrfs_feature_attr btrfs_feature_attrs[FEAT_MAX][NUM_FEATURE_BITS];
+
++static_assert(ARRAY_SIZE(btrfs_unknown_feature_names) ==
++ ARRAY_SIZE(btrfs_feature_attrs));
++static_assert(ARRAY_SIZE(btrfs_unknown_feature_names[0]) ==
++ ARRAY_SIZE(btrfs_feature_attrs[0]));
++
+ static const u64 supported_feature_masks[FEAT_MAX] = {
+ [FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP,
+ [FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP,
+@@ -1272,11 +1277,6 @@ static void init_feature_attrs(void)
+ struct btrfs_feature_attr *fa;
+ int set, i;
+
+- BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names) !=
+- ARRAY_SIZE(btrfs_feature_attrs));
+- BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names[0]) !=
+- ARRAY_SIZE(btrfs_feature_attrs[0]));
+-
+ memset(btrfs_feature_attrs, 0, sizeof(btrfs_feature_attrs));
+ memset(btrfs_unknown_feature_names, 0,
+ sizeof(btrfs_unknown_feature_names));
+diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
+index 2a95f7224e18..51a8b075c259 100644
+--- a/fs/btrfs/tests/extent-buffer-tests.c
++++ b/fs/btrfs/tests/extent-buffer-tests.c
+@@ -15,7 +15,6 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
+ struct btrfs_path *path = NULL;
+ struct btrfs_root *root = NULL;
+ struct extent_buffer *eb;
+- struct btrfs_item *item;
+ char *value = "mary had a little lamb";
+ char *split1 = "mary had a little";
+ char *split2 = " lamb";
+@@ -61,7 +60,6 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
+ key.offset = 0;
+
+ btrfs_setup_item_for_insert(root, path, &key, value_len);
+- item = btrfs_item_nr(0);
+ write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0),
+ value_len);
+
+@@ -90,8 +88,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
+ goto out;
+ }
+
+- item = btrfs_item_nr(0);
+- if (btrfs_item_size(eb, item) != strlen(split1)) {
++ if (btrfs_item_size(eb, 0) != strlen(split1)) {
+ test_err("invalid len in the first split");
+ ret = -EINVAL;
+ goto out;
+@@ -115,8 +112,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
+ goto out;
+ }
+
+- item = btrfs_item_nr(1);
+- if (btrfs_item_size(eb, item) != strlen(split2)) {
++ if (btrfs_item_size(eb, 1) != strlen(split2)) {
+ test_err("invalid len in the second split");
+ ret = -EINVAL;
+ goto out;
+@@ -147,8 +143,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
+ goto out;
+ }
+
+- item = btrfs_item_nr(0);
+- if (btrfs_item_size(eb, item) != strlen(split3)) {
++ if (btrfs_item_size(eb, 0) != strlen(split3)) {
+ test_err("invalid len in the first split");
+ ret = -EINVAL;
+ goto out;
+@@ -171,8 +166,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
+ goto out;
+ }
+
+- item = btrfs_item_nr(1);
+- if (btrfs_item_size(eb, item) != strlen(split4)) {
++ if (btrfs_item_size(eb, 1) != strlen(split4)) {
+ test_err("invalid len in the second split");
+ ret = -EINVAL;
+ goto out;
+@@ -195,8 +189,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
+ goto out;
+ }
+
+- item = btrfs_item_nr(2);
+- if (btrfs_item_size(eb, item) != strlen(split2)) {
++ if (btrfs_item_size(eb, 2) != strlen(split2)) {
+ test_err("invalid len in the second split");
+ ret = -EINVAL;
+ goto out;
+diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
+index 27b93a6c41bb..f3c094af9283 100644
+--- a/fs/btrfs/transaction.c
++++ b/fs/btrfs/transaction.c
+@@ -1861,50 +1861,14 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info)
+ return ret;
+ }
+
+-/*
+- * commit transactions asynchronously. once btrfs_commit_transaction_async
+- * returns, any subsequent transaction will not be allowed to join.
+- */
+-struct btrfs_async_commit {
+- struct btrfs_trans_handle *newtrans;
+- struct work_struct work;
+-};
+-
+-static void do_async_commit(struct work_struct *work)
+-{
+- struct btrfs_async_commit *ac =
+- container_of(work, struct btrfs_async_commit, work);
+-
+- /*
+- * We've got freeze protection passed with the transaction.
+- * Tell lockdep about it.
+- */
+- if (ac->newtrans->type & __TRANS_FREEZABLE)
+- __sb_writers_acquired(ac->newtrans->fs_info->sb, SB_FREEZE_FS);
+-
+- current->journal_info = ac->newtrans;
+-
+- btrfs_commit_transaction(ac->newtrans);
+- kfree(ac);
+-}
+-
+-int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
++void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
+ {
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+- struct btrfs_async_commit *ac;
+ struct btrfs_transaction *cur_trans;
+
+- ac = kmalloc(sizeof(*ac), GFP_NOFS);
+- if (!ac)
+- return -ENOMEM;
+-
+- INIT_WORK(&ac->work, do_async_commit);
+- ac->newtrans = btrfs_join_transaction(trans->root);
+- if (IS_ERR(ac->newtrans)) {
+- int err = PTR_ERR(ac->newtrans);
+- kfree(ac);
+- return err;
+- }
++ /* Kick the transaction kthread. */
++ set_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags);
++ wake_up_process(fs_info->transaction_kthread);
+
+ /* take transaction reference */
+ cur_trans = trans->transaction;
+@@ -1912,14 +1876,6 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
+
+ btrfs_end_transaction(trans);
+
+- /*
+- * Tell lockdep we've released the freeze rwsem, since the
+- * async commit thread will be the one to unlock it.
+- */
+- if (ac->newtrans->type & __TRANS_FREEZABLE)
+- __sb_writers_release(fs_info->sb, SB_FREEZE_FS);
+-
+- schedule_work(&ac->work);
+ /*
+ * Wait for the current transaction commit to start and block
+ * subsequent transaction joins
+@@ -1927,14 +1883,9 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
+ wait_event(fs_info->transaction_blocked_wait,
+ cur_trans->state >= TRANS_STATE_COMMIT_START ||
+ TRANS_ABORTED(cur_trans));
+- if (current->journal_info == trans)
+- current->journal_info = NULL;
+-
+ btrfs_put_transaction(cur_trans);
+- return 0;
+ }
+
+-
+ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
+ {
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+@@ -2013,16 +1964,24 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
+ static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
+ {
+ /*
+- * We use writeback_inodes_sb here because if we used
++ * We use try_to_writeback_inodes_sb() here because if we used
+ * btrfs_start_delalloc_roots we would deadlock with fs freeze.
+ * Currently are holding the fs freeze lock, if we do an async flush
+ * we'll do btrfs_join_transaction() and deadlock because we need to
+ * wait for the fs freeze lock. Using the direct flushing we benefit
+ * from already being in a transaction and our join_transaction doesn't
+ * have to re-take the fs freeze lock.
++ *
++ * Note that try_to_writeback_inodes_sb() will only trigger writeback
++ * if it can read lock sb->s_umount. It will always be able to lock it,
++ * except when the filesystem is being unmounted or being frozen, but in
++ * those cases sync_filesystem() is called, which results in calling
++ * writeback_inodes_sb() while holding a write lock on sb->s_umount.
++ * Note that we don't call writeback_inodes_sb() directly, because it
++ * will emit a warning if sb->s_umount is not locked.
+ */
+ if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
+- writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC);
++ try_to_writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC);
+ return 0;
+ }
+
+@@ -2224,6 +2183,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
+ wait_event(cur_trans->writer_wait,
+ atomic_read(&cur_trans->num_writers) == 1);
+
++ /*
++ * We've started the commit, clear the flag in case we were triggered to
++ * do an async commit but somebody else started before the transaction
++ * kthread could do the work.
++ */
++ clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags);
++
+ if (TRANS_ABORTED(cur_trans)) {
+ ret = cur_trans->aborted;
+ goto scrub_continue;
+diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
+index eba07b8119bb..d0705485f5c8 100644
+--- a/fs/btrfs/transaction.h
++++ b/fs/btrfs/transaction.h
+@@ -219,7 +219,7 @@ void btrfs_add_dead_root(struct btrfs_root *root);
+ int btrfs_defrag_root(struct btrfs_root *root);
+ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
+ int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
+-int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans);
++void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans);
+ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans);
+ bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans);
+ void btrfs_throttle(struct btrfs_fs_info *fs_info);
+diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
+index 7733e8ac0a69..72e1c942197d 100644
+--- a/fs/btrfs/tree-checker.c
++++ b/fs/btrfs/tree-checker.c
+@@ -202,7 +202,7 @@ static int check_extent_data_item(struct extent_buffer *leaf,
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
+ struct btrfs_file_extent_item *fi;
+ u32 sectorsize = fs_info->sectorsize;
+- u32 item_size = btrfs_item_size_nr(leaf, slot);
++ u32 item_size = btrfs_item_size(leaf, slot);
+ u64 extent_end;
+
+ if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) {
+@@ -354,17 +354,17 @@ static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key,
+ key->offset, sectorsize);
+ return -EUCLEAN;
+ }
+- if (unlikely(!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize))) {
++ if (unlikely(!IS_ALIGNED(btrfs_item_size(leaf, slot), csumsize))) {
+ generic_err(leaf, slot,
+ "unaligned item size for csum item, have %u should be aligned to %u",
+- btrfs_item_size_nr(leaf, slot), csumsize);
++ btrfs_item_size(leaf, slot), csumsize);
+ return -EUCLEAN;
+ }
+ if (slot > 0 && prev_key->type == BTRFS_EXTENT_CSUM_KEY) {
+ u64 prev_csum_end;
+ u32 prev_item_size;
+
+- prev_item_size = btrfs_item_size_nr(leaf, slot - 1);
++ prev_item_size = btrfs_item_size(leaf, slot - 1);
+ prev_csum_end = (prev_item_size / csumsize) * sectorsize;
+ prev_csum_end += prev_key->offset;
+ if (unlikely(prev_csum_end > key->offset)) {
+@@ -483,7 +483,7 @@ static int check_dir_item(struct extent_buffer *leaf,
+ {
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
+ struct btrfs_dir_item *di;
+- u32 item_size = btrfs_item_size_nr(leaf, slot);
++ u32 item_size = btrfs_item_size(leaf, slot);
+ u32 cur = 0;
+
+ if (unlikely(!check_prev_ino(leaf, key, slot, prev_key)))
+@@ -640,7 +640,7 @@ static int check_block_group_item(struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot)
+ {
+ struct btrfs_block_group_item bgi;
+- u32 item_size = btrfs_item_size_nr(leaf, slot);
++ u32 item_size = btrfs_item_size(leaf, slot);
+ u64 flags;
+ u64 type;
+
+@@ -912,10 +912,10 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf,
+ {
+ int num_stripes;
+
+- if (unlikely(btrfs_item_size_nr(leaf, slot) < sizeof(struct btrfs_chunk))) {
++ if (unlikely(btrfs_item_size(leaf, slot) < sizeof(struct btrfs_chunk))) {
+ chunk_err(leaf, chunk, key->offset,
+ "invalid chunk item size: have %u expect [%zu, %u)",
+- btrfs_item_size_nr(leaf, slot),
++ btrfs_item_size(leaf, slot),
+ sizeof(struct btrfs_chunk),
+ BTRFS_LEAF_DATA_SIZE(leaf->fs_info));
+ return -EUCLEAN;
+@@ -927,10 +927,10 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf,
+ goto out;
+
+ if (unlikely(btrfs_chunk_item_size(num_stripes) !=
+- btrfs_item_size_nr(leaf, slot))) {
++ btrfs_item_size(leaf, slot))) {
+ chunk_err(leaf, chunk, key->offset,
+ "invalid chunk item size: have %u expect %lu",
+- btrfs_item_size_nr(leaf, slot),
++ btrfs_item_size(leaf, slot),
+ btrfs_chunk_item_size(num_stripes));
+ return -EUCLEAN;
+ }
+@@ -1095,12 +1095,12 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
+ if (unlikely(ret < 0))
+ return ret;
+
+- if (unlikely(btrfs_item_size_nr(leaf, slot) != sizeof(ri) &&
+- btrfs_item_size_nr(leaf, slot) !=
++ if (unlikely(btrfs_item_size(leaf, slot) != sizeof(ri) &&
++ btrfs_item_size(leaf, slot) !=
+ btrfs_legacy_root_item_size())) {
+ generic_err(leaf, slot,
+ "invalid root item size, have %u expect %zu or %u",
+- btrfs_item_size_nr(leaf, slot), sizeof(ri),
++ btrfs_item_size(leaf, slot), sizeof(ri),
+ btrfs_legacy_root_item_size());
+ return -EUCLEAN;
+ }
+@@ -1111,7 +1111,7 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
+ * And since we allow geneartion_v2 as 0, it will still pass the check.
+ */
+ read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot),
+- btrfs_item_size_nr(leaf, slot));
++ btrfs_item_size(leaf, slot));
+
+ /* Generation related */
+ if (unlikely(btrfs_root_generation(&ri) >
+@@ -1208,7 +1208,7 @@ static int check_extent_item(struct extent_buffer *leaf,
+ bool is_tree_block = false;
+ unsigned long ptr; /* Current pointer inside inline refs */
+ unsigned long end; /* Extent item end */
+- const u32 item_size = btrfs_item_size_nr(leaf, slot);
++ const u32 item_size = btrfs_item_size(leaf, slot);
+ u64 flags;
+ u64 generation;
+ u64 total_refs; /* Total refs in btrfs_extent_item */
+@@ -1432,10 +1432,10 @@ static int check_simple_keyed_refs(struct extent_buffer *leaf,
+ if (key->type == BTRFS_SHARED_DATA_REF_KEY)
+ expect_item_size = sizeof(struct btrfs_shared_data_ref);
+
+- if (unlikely(btrfs_item_size_nr(leaf, slot) != expect_item_size)) {
++ if (unlikely(btrfs_item_size(leaf, slot) != expect_item_size)) {
+ generic_err(leaf, slot,
+ "invalid item size, have %u expect %u for key type %u",
+- btrfs_item_size_nr(leaf, slot),
++ btrfs_item_size(leaf, slot),
+ expect_item_size, key->type);
+ return -EUCLEAN;
+ }
+@@ -1460,12 +1460,12 @@ static int check_extent_data_ref(struct extent_buffer *leaf,
+ {
+ struct btrfs_extent_data_ref *dref;
+ unsigned long ptr = btrfs_item_ptr_offset(leaf, slot);
+- const unsigned long end = ptr + btrfs_item_size_nr(leaf, slot);
++ const unsigned long end = ptr + btrfs_item_size(leaf, slot);
+
+- if (unlikely(btrfs_item_size_nr(leaf, slot) % sizeof(*dref) != 0)) {
++ if (unlikely(btrfs_item_size(leaf, slot) % sizeof(*dref) != 0)) {
+ generic_err(leaf, slot,
+ "invalid item size, have %u expect aligned to %zu for key type %u",
+- btrfs_item_size_nr(leaf, slot),
++ btrfs_item_size(leaf, slot),
+ sizeof(*dref), key->type);
+ return -EUCLEAN;
+ }
+@@ -1507,16 +1507,16 @@ static int check_inode_ref(struct extent_buffer *leaf,
+ if (unlikely(!check_prev_ino(leaf, key, slot, prev_key)))
+ return -EUCLEAN;
+ /* namelen can't be 0, so item_size == sizeof() is also invalid */
+- if (unlikely(btrfs_item_size_nr(leaf, slot) <= sizeof(*iref))) {
++ if (unlikely(btrfs_item_size(leaf, slot) <= sizeof(*iref))) {
+ inode_ref_err(leaf, slot,
+ "invalid item size, have %u expect (%zu, %u)",
+- btrfs_item_size_nr(leaf, slot),
++ btrfs_item_size(leaf, slot),
+ sizeof(*iref), BTRFS_LEAF_DATA_SIZE(leaf->fs_info));
+ return -EUCLEAN;
+ }
+
+ ptr = btrfs_item_ptr_offset(leaf, slot);
+- end = ptr + btrfs_item_size_nr(leaf, slot);
++ end = ptr + btrfs_item_size(leaf, slot);
+ while (ptr < end) {
+ u16 namelen;
+
+@@ -1689,12 +1689,12 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
+ if (slot == 0)
+ item_end_expected = BTRFS_LEAF_DATA_SIZE(fs_info);
+ else
+- item_end_expected = btrfs_item_offset_nr(leaf,
++ item_end_expected = btrfs_item_offset(leaf,
+ slot - 1);
+- if (unlikely(btrfs_item_end_nr(leaf, slot) != item_end_expected)) {
++ if (unlikely(btrfs_item_data_end(leaf, slot) != item_end_expected)) {
+ generic_err(leaf, slot,
+ "unexpected item end, have %u expect %u",
+- btrfs_item_end_nr(leaf, slot),
++ btrfs_item_data_end(leaf, slot),
+ item_end_expected);
+ return -EUCLEAN;
+ }
+@@ -1704,11 +1704,11 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
+ * just in case all the items are consistent to each other, but
+ * all point outside of the leaf.
+ */
+- if (unlikely(btrfs_item_end_nr(leaf, slot) >
++ if (unlikely(btrfs_item_data_end(leaf, slot) >
+ BTRFS_LEAF_DATA_SIZE(fs_info))) {
+ generic_err(leaf, slot,
+ "slot end outside of leaf, have %u expect range [0, %u]",
+- btrfs_item_end_nr(leaf, slot),
++ btrfs_item_data_end(leaf, slot),
+ BTRFS_LEAF_DATA_SIZE(fs_info));
+ return -EUCLEAN;
+ }
+diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
+index 6993dcdba6f1..cc3a8d8a3841 100644
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -386,7 +386,7 @@ static int do_overwrite_item(struct btrfs_trans_handle *trans,
+ if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+ overwrite_root = 1;
+
+- item_size = btrfs_item_size_nr(eb, slot);
++ item_size = btrfs_item_size(eb, slot);
+ src_ptr = btrfs_item_ptr_offset(eb, slot);
+
+ /* Our caller must have done a search for the key for us. */
+@@ -409,7 +409,7 @@ static int do_overwrite_item(struct btrfs_trans_handle *trans,
+ if (ret == 0) {
+ char *src_copy;
+ char *dst_copy;
+- u32 dst_size = btrfs_item_size_nr(path->nodes[0],
++ u32 dst_size = btrfs_item_size(path->nodes[0],
+ path->slots[0]);
+ if (dst_size != item_size)
+ goto insert;
+@@ -503,7 +503,7 @@ static int do_overwrite_item(struct btrfs_trans_handle *trans,
+ /* make sure any existing item is the correct size */
+ if (ret == -EEXIST || ret == -EOVERFLOW) {
+ u32 found_size;
+- found_size = btrfs_item_size_nr(path->nodes[0],
++ found_size = btrfs_item_size(path->nodes[0],
+ path->slots[0]);
+ if (found_size > item_size)
+ btrfs_truncate_item(path, item_size, 1);
+@@ -1096,7 +1096,7 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
+ * otherwise they must be unlinked as a conflict
+ */
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+- ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
++ ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
+ while (ptr < ptr_end) {
+ victim_ref = (struct btrfs_inode_ref *)ptr;
+ victim_name_len = btrfs_inode_ref_name_len(leaf,
+@@ -1155,7 +1155,7 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
+
+ leaf = path->nodes[0];
+
+- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
++ item_size = btrfs_item_size(leaf, path->slots[0]);
+ base = btrfs_item_ptr_offset(leaf, path->slots[0]);
+
+ while (cur_offset < item_size) {
+@@ -1318,7 +1318,7 @@ static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
+
+ eb = path->nodes[0];
+ ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
+- ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]);
++ ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]);
+ while (ref_ptr < ref_end) {
+ char *name = NULL;
+ int namelen;
+@@ -1504,7 +1504,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
+ int ref_struct_size;
+
+ ref_ptr = btrfs_item_ptr_offset(eb, slot);
+- ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
++ ref_end = ref_ptr + btrfs_item_size(eb, slot);
+
+ if (key->type == BTRFS_INODE_EXTREF_KEY) {
+ struct btrfs_inode_extref *r;
+@@ -1678,7 +1678,7 @@ static int count_inode_extrefs(struct btrfs_root *root,
+ break;
+
+ leaf = path->nodes[0];
+- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
++ item_size = btrfs_item_size(leaf, path->slots[0]);
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ cur_offset = 0;
+
+@@ -1732,7 +1732,7 @@ static int count_inode_refs(struct btrfs_root *root,
+ key.type != BTRFS_INODE_REF_KEY)
+ break;
+ ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+- ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
++ ptr_end = ptr + btrfs_item_size(path->nodes[0],
+ path->slots[0]);
+ while (ptr < ptr_end) {
+ struct btrfs_inode_ref *ref;
+@@ -1950,6 +1950,34 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
+ return ret;
+ }
+
++static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
++ struct btrfs_inode *dir,
++ struct btrfs_path *path,
++ struct btrfs_dir_item *dst_di,
++ const struct btrfs_key *log_key,
++ u8 log_type,
++ bool exists)
++{
++ struct btrfs_key found_key;
++
++ btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
++ /* The existing dentry points to the same inode, don't delete it. */
++ if (found_key.objectid == log_key->objectid &&
++ found_key.type == log_key->type &&
++ found_key.offset == log_key->offset &&
++ btrfs_dir_type(path->nodes[0], dst_di) == log_type)
++ return 1;
++
++ /*
++ * Don't drop the conflicting directory entry if the inode for the new
++ * entry doesn't exist.
++ */
++ if (!exists)
++ return 0;
++
++ return drop_one_dir_item(trans, path, dir, dst_di);
++}
++
+ /*
+ * take a single entry in a log directory item and replay it into
+ * the subvolume.
+@@ -1975,14 +2003,17 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
+ {
+ char *name;
+ int name_len;
+- struct btrfs_dir_item *dst_di;
+- struct btrfs_key found_key;
++ struct btrfs_dir_item *dir_dst_di;
++ struct btrfs_dir_item *index_dst_di;
++ bool dir_dst_matches = false;
++ bool index_dst_matches = false;
+ struct btrfs_key log_key;
++ struct btrfs_key search_key;
+ struct inode *dir;
+ u8 log_type;
+ bool exists;
+ int ret;
+- bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
++ bool update_size = true;
+ bool name_added = false;
+
+ dir = read_one_inode(root, key->objectid);
+@@ -2008,76 +2039,53 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
+ exists = (ret == 0);
+ ret = 0;
+
+- if (key->type == BTRFS_DIR_ITEM_KEY) {
+- dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
+- name, name_len, 1);
+- } else if (key->type == BTRFS_DIR_INDEX_KEY) {
+- dst_di = btrfs_lookup_dir_index_item(trans, root, path,
+- key->objectid,
+- key->offset, name,
+- name_len, 1);
+- } else {
+- /* Corruption */
+- ret = -EINVAL;
++ dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
++ name, name_len, 1);
++ if (IS_ERR(dir_dst_di)) {
++ ret = PTR_ERR(dir_dst_di);
+ goto out;
+- }
+-
+- if (IS_ERR(dst_di)) {
+- ret = PTR_ERR(dst_di);
+- goto out;
+- } else if (!dst_di) {
+- /* we need a sequence number to insert, so we only
+- * do inserts for the BTRFS_DIR_INDEX_KEY types
+- */
+- if (key->type != BTRFS_DIR_INDEX_KEY)
++ } else if (dir_dst_di) {
++ ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
++ dir_dst_di, &log_key, log_type,
++ exists);
++ if (ret < 0)
+ goto out;
+- goto insert;
++ dir_dst_matches = (ret == 1);
+ }
+
+- btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
+- /* the existing item matches the logged item */
+- if (found_key.objectid == log_key.objectid &&
+- found_key.type == log_key.type &&
+- found_key.offset == log_key.offset &&
+- btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
++ btrfs_release_path(path);
++
++ index_dst_di = btrfs_lookup_dir_index_item(trans, root, path,
++ key->objectid, key->offset,
++ name, name_len, 1);
++ if (IS_ERR(index_dst_di)) {
++ ret = PTR_ERR(index_dst_di);
++ goto out;
++ } else if (index_dst_di) {
++ ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
++ index_dst_di, &log_key,
++ log_type, exists);
++ if (ret < 0)
++ goto out;
++ index_dst_matches = (ret == 1);
++ }
++
++ btrfs_release_path(path);
++
++ if (dir_dst_matches && index_dst_matches) {
++ ret = 0;
+ update_size = false;
+ goto out;
+ }
+
+- /*
+- * don't drop the conflicting directory entry if the inode
+- * for the new entry doesn't exist
+- */
+- if (!exists)
+- goto out;
+-
+- ret = drop_one_dir_item(trans, path, BTRFS_I(dir), dst_di);
+- if (ret)
+- goto out;
+-
+- if (key->type == BTRFS_DIR_INDEX_KEY)
+- goto insert;
+-out:
+- btrfs_release_path(path);
+- if (!ret && update_size) {
+- btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
+- ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
+- }
+- kfree(name);
+- iput(dir);
+- if (!ret && name_added)
+- ret = 1;
+- return ret;
+-
+-insert:
+ /*
+ * Check if the inode reference exists in the log for the given name,
+ * inode and parent inode
+ */
+- found_key.objectid = log_key.objectid;
+- found_key.type = BTRFS_INODE_REF_KEY;
+- found_key.offset = key->objectid;
+- ret = backref_in_log(root->log_root, &found_key, 0, name, name_len);
++ search_key.objectid = log_key.objectid;
++ search_key.type = BTRFS_INODE_REF_KEY;
++ search_key.offset = key->objectid;
++ ret = backref_in_log(root->log_root, &search_key, 0, name, name_len);
+ if (ret < 0) {
+ goto out;
+ } else if (ret) {
+@@ -2087,10 +2095,10 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
+ goto out;
+ }
+
+- found_key.objectid = log_key.objectid;
+- found_key.type = BTRFS_INODE_EXTREF_KEY;
+- found_key.offset = key->objectid;
+- ret = backref_in_log(root->log_root, &found_key, key->objectid, name,
++ search_key.objectid = log_key.objectid;
++ search_key.type = BTRFS_INODE_EXTREF_KEY;
++ search_key.offset = key->objectid;
++ ret = backref_in_log(root->log_root, &search_key, key->objectid, name,
+ name_len);
+ if (ret < 0) {
+ goto out;
+@@ -2109,87 +2117,76 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
+ name_added = true;
+ update_size = false;
+ ret = 0;
+- goto out;
++
++out:
++ if (!ret && update_size) {
++ btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
++ ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
++ }
++ kfree(name);
++ iput(dir);
++ if (!ret && name_added)
++ ret = 1;
++ return ret;
+ }
+
+-/*
+- * find all the names in a directory item and reconcile them into
+- * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
+- * one name in a directory item, but the same code gets used for
+- * both directory index types
+- */
++/* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */
+ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_key *key)
+ {
+- int ret = 0;
+- u32 item_size = btrfs_item_size_nr(eb, slot);
++ int ret;
+ struct btrfs_dir_item *di;
+- int name_len;
+- unsigned long ptr;
+- unsigned long ptr_end;
+- struct btrfs_path *fixup_path = NULL;
+
+- ptr = btrfs_item_ptr_offset(eb, slot);
+- ptr_end = ptr + item_size;
+- while (ptr < ptr_end) {
+- di = (struct btrfs_dir_item *)ptr;
+- name_len = btrfs_dir_name_len(eb, di);
+- ret = replay_one_name(trans, root, path, eb, di, key);
+- if (ret < 0)
+- break;
+- ptr = (unsigned long)(di + 1);
+- ptr += name_len;
++ /* We only log dir index keys, which only contain a single dir item. */
++ ASSERT(key->type == BTRFS_DIR_INDEX_KEY);
+
+- /*
+- * If this entry refers to a non-directory (directories can not
+- * have a link count > 1) and it was added in the transaction
+- * that was not committed, make sure we fixup the link count of
+- * the inode it the entry points to. Otherwise something like
+- * the following would result in a directory pointing to an
+- * inode with a wrong link that does not account for this dir
+- * entry:
+- *
+- * mkdir testdir
+- * touch testdir/foo
+- * touch testdir/bar
+- * sync
+- *
+- * ln testdir/bar testdir/bar_link
+- * ln testdir/foo testdir/foo_link
+- * xfs_io -c "fsync" testdir/bar
+- *
+- * <power failure>
+- *
+- * mount fs, log replay happens
+- *
+- * File foo would remain with a link count of 1 when it has two
+- * entries pointing to it in the directory testdir. This would
+- * make it impossible to ever delete the parent directory has
+- * it would result in stale dentries that can never be deleted.
+- */
+- if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
+- struct btrfs_key di_key;
++ di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
++ ret = replay_one_name(trans, root, path, eb, di, key);
++ if (ret < 0)
++ return ret;
+
+- if (!fixup_path) {
+- fixup_path = btrfs_alloc_path();
+- if (!fixup_path) {
+- ret = -ENOMEM;
+- break;
+- }
+- }
++ /*
++ * If this entry refers to a non-directory (directories can not have a
++ * link count > 1) and it was added in the transaction that was not
++ * committed, make sure we fixup the link count of the inode the entry
++ * points to. Otherwise something like the following would result in a
++ * directory pointing to an inode with a wrong link count that does not account
++ * for this dir entry:
++ *
++ * mkdir testdir
++ * touch testdir/foo
++ * touch testdir/bar
++ * sync
++ *
++ * ln testdir/bar testdir/bar_link
++ * ln testdir/foo testdir/foo_link
++ * xfs_io -c "fsync" testdir/bar
++ *
++ * <power failure>
++ *
++ * mount fs, log replay happens
++ *
++ * File foo would remain with a link count of 1 when it has two entries
++ * pointing to it in the directory testdir. This would make it impossible
++ * to ever delete the parent directory as it would result in stale
++ * dentries that can never be deleted.
++ */
++ if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
++ struct btrfs_path *fixup_path;
++ struct btrfs_key di_key;
+
+- btrfs_dir_item_key_to_cpu(eb, di, &di_key);
+- ret = link_to_fixup_dir(trans, root, fixup_path,
+- di_key.objectid);
+- if (ret)
+- break;
+- }
+- ret = 0;
++ fixup_path = btrfs_alloc_path();
++ if (!fixup_path)
++ return -ENOMEM;
++
++ btrfs_dir_item_key_to_cpu(eb, di, &di_key);
++ ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid);
++ btrfs_free_path(fixup_path);
+ }
+- btrfs_free_path(fixup_path);
++
+ return ret;
+ }
+
+@@ -2206,7 +2203,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
+ */
+ static noinline int find_dir_range(struct btrfs_root *root,
+ struct btrfs_path *path,
+- u64 dirid, int key_type,
++ u64 dirid,
+ u64 *start_ret, u64 *end_ret)
+ {
+ struct btrfs_key key;
+@@ -2219,7 +2216,7 @@ static noinline int find_dir_range(struct btrfs_root *root,
+ return 1;
+
+ key.objectid = dirid;
+- key.type = key_type;
++ key.type = BTRFS_DIR_LOG_INDEX_KEY;
+ key.offset = *start_ret;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+@@ -2233,7 +2230,7 @@ static noinline int find_dir_range(struct btrfs_root *root,
+ if (ret != 0)
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+- if (key.type != key_type || key.objectid != dirid) {
++ if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
+ ret = 1;
+ goto next;
+ }
+@@ -2260,7 +2257,7 @@ static noinline int find_dir_range(struct btrfs_root *root,
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+- if (key.type != key_type || key.objectid != dirid) {
++ if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
+ ret = 1;
+ goto out;
+ }
+@@ -2291,95 +2288,82 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
+ int ret;
+ struct extent_buffer *eb;
+ int slot;
+- u32 item_size;
+ struct btrfs_dir_item *di;
+- struct btrfs_dir_item *log_di;
+ int name_len;
+- unsigned long ptr;
+- unsigned long ptr_end;
+ char *name;
+- struct inode *inode;
++ struct inode *inode = NULL;
+ struct btrfs_key location;
+
+-again:
++ /*
++ * Currently we only log dir index keys. Even if we replay a log created
++ * by an older kernel that logged both dir index and dir item keys, all
++ * we need to do is process the dir index keys; we (and our caller) can
++ * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
++ */
++ ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY);
++
+ eb = path->nodes[0];
+ slot = path->slots[0];
+- item_size = btrfs_item_size_nr(eb, slot);
+- ptr = btrfs_item_ptr_offset(eb, slot);
+- ptr_end = ptr + item_size;
+- while (ptr < ptr_end) {
+- di = (struct btrfs_dir_item *)ptr;
+- name_len = btrfs_dir_name_len(eb, di);
+- name = kmalloc(name_len, GFP_NOFS);
+- if (!name) {
+- ret = -ENOMEM;
+- goto out;
+- }
+- read_extent_buffer(eb, name, (unsigned long)(di + 1),
+- name_len);
+- log_di = NULL;
+- if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
+- log_di = btrfs_lookup_dir_item(trans, log, log_path,
+- dir_key->objectid,
+- name, name_len, 0);
+- } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
+- log_di = btrfs_lookup_dir_index_item(trans, log,
+- log_path,
++ di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
++ name_len = btrfs_dir_name_len(eb, di);
++ name = kmalloc(name_len, GFP_NOFS);
++ if (!name) {
++ ret = -ENOMEM;
++ goto out;
++ }
++
++ read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len);
++
++ if (log) {
++ struct btrfs_dir_item *log_di;
++
++ log_di = btrfs_lookup_dir_index_item(trans, log, log_path,
+ dir_key->objectid,
+ dir_key->offset,
+ name, name_len, 0);
+- }
+- if (!log_di) {
+- btrfs_dir_item_key_to_cpu(eb, di, &location);
+- btrfs_release_path(path);
+- btrfs_release_path(log_path);
+- inode = read_one_inode(root, location.objectid);
+- if (!inode) {
+- kfree(name);
+- return -EIO;
+- }
+-
+- ret = link_to_fixup_dir(trans, root,
+- path, location.objectid);
+- if (ret) {
+- kfree(name);
+- iput(inode);
+- goto out;
+- }
+-
+- inc_nlink(inode);
+- ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
+- BTRFS_I(inode), name, name_len);
+- if (!ret)
+- ret = btrfs_run_delayed_items(trans);
+- kfree(name);
+- iput(inode);
+- if (ret)
+- goto out;
+-
+- /* there might still be more names under this key
+- * check and repeat if required
+- */
+- ret = btrfs_search_slot(NULL, root, dir_key, path,
+- 0, 0);
+- if (ret == 0)
+- goto again;
++ if (IS_ERR(log_di)) {
++ ret = PTR_ERR(log_di);
++ goto out;
++ } else if (log_di) {
++ /* The dentry exists in the log, we have nothing to do. */
+ ret = 0;
+ goto out;
+- } else if (IS_ERR(log_di)) {
+- kfree(name);
+- return PTR_ERR(log_di);
+ }
+- btrfs_release_path(log_path);
+- kfree(name);
+-
+- ptr = (unsigned long)(di + 1);
+- ptr += name_len;
+ }
+- ret = 0;
++
++ btrfs_dir_item_key_to_cpu(eb, di, &location);
++ btrfs_release_path(path);
++ btrfs_release_path(log_path);
++ inode = read_one_inode(root, location.objectid);
++ if (!inode) {
++ ret = -EIO;
++ goto out;
++ }
++
++ ret = link_to_fixup_dir(trans, root, path, location.objectid);
++ if (ret)
++ goto out;
++
++ inc_nlink(inode);
++ ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(inode), name,
++ name_len);
++ if (ret)
++ goto out;
++
++ ret = btrfs_run_delayed_items(trans);
++ if (ret)
++ goto out;
++
++ /*
++ * Unlike dir item keys, dir index keys can only have one name (entry) in
++ * them, as there are no key collisions since each key has a unique offset
++ * (an index number), so we're done.
++ */
+ out:
+ btrfs_release_path(path);
+ btrfs_release_path(log_path);
++ kfree(name);
++ iput(inode);
+ return ret;
+ }
+
+@@ -2422,7 +2406,7 @@ static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
+ }
+
+ di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
+- total_size = btrfs_item_size_nr(path->nodes[0], i);
++ total_size = btrfs_item_size(path->nodes[0], i);
+ cur = 0;
+ while (cur < total_size) {
+ u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
+@@ -2499,7 +2483,6 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
+ {
+ u64 range_start;
+ u64 range_end;
+- int key_type = BTRFS_DIR_LOG_ITEM_KEY;
+ int ret = 0;
+ struct btrfs_key dir_key;
+ struct btrfs_key found_key;
+@@ -2507,7 +2490,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
+ struct inode *dir;
+
+ dir_key.objectid = dirid;
+- dir_key.type = BTRFS_DIR_ITEM_KEY;
++ dir_key.type = BTRFS_DIR_INDEX_KEY;
+ log_path = btrfs_alloc_path();
+ if (!log_path)
+ return -ENOMEM;
+@@ -2521,14 +2504,14 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
+ btrfs_free_path(log_path);
+ return 0;
+ }
+-again:
++
+ range_start = 0;
+ range_end = 0;
+ while (1) {
+ if (del_all)
+ range_end = (u64)-1;
+ else {
+- ret = find_dir_range(log, path, dirid, key_type,
++ ret = find_dir_range(log, path, dirid,
+ &range_start, &range_end);
+ if (ret < 0)
+ goto out;
+@@ -2555,8 +2538,10 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ path->slots[0]);
+ if (found_key.objectid != dirid ||
+- found_key.type != dir_key.type)
+- goto next_type;
++ found_key.type != dir_key.type) {
++ ret = 0;
++ goto out;
++ }
+
+ if (found_key.offset > range_end)
+ break;
+@@ -2575,15 +2560,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
+ break;
+ range_start = range_end + 1;
+ }
+-
+-next_type:
+ ret = 0;
+- if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
+- key_type = BTRFS_DIR_LOG_INDEX_KEY;
+- dir_key.type = BTRFS_DIR_INDEX_KEY;
+- btrfs_release_path(path);
+- goto again;
+- }
+ out:
+ btrfs_release_path(path);
+ btrfs_free_path(log_path);
+@@ -2743,12 +2720,13 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
+ eb, i, &key);
+ if (ret)
+ break;
+- } else if (key.type == BTRFS_DIR_ITEM_KEY) {
+- ret = replay_one_dir_item(wc->trans, root, path,
+- eb, i, &key);
+- if (ret)
+- break;
+ }
++ /*
++ * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the
++ * BTRFS_DIR_INDEX_KEY items which we use to derive the
++ * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an
++ * older kernel with such keys, ignore them.
++ */
+ }
+ btrfs_free_path(path);
+ return ret;
+@@ -3551,20 +3529,10 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+ goto out_unlock;
+ }
+
+- di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
+- name, name_len, -1);
+- if (IS_ERR(di)) {
+- err = PTR_ERR(di);
+- goto fail;
+- }
+- if (di) {
+- ret = btrfs_delete_one_dir_name(trans, log, path, di);
+- if (ret) {
+- err = ret;
+- goto fail;
+- }
+- }
+- btrfs_release_path(path);
++ /*
++ * We only log dir index items of a directory, so we don't need to look
++ * for dir item keys.
++ */
+ di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
+ index, name, name_len, -1);
+ if (IS_ERR(di)) {
+@@ -3628,7 +3596,7 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+- int key_type, u64 dirid,
++ u64 dirid,
+ u64 first_offset, u64 last_offset)
+ {
+ int ret;
+@@ -3637,10 +3605,7 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
+
+ key.objectid = dirid;
+ key.offset = first_offset;
+- if (key_type == BTRFS_DIR_ITEM_KEY)
+- key.type = BTRFS_DIR_LOG_ITEM_KEY;
+- else
+- key.type = BTRFS_DIR_LOG_INDEX_KEY;
++ key.type = BTRFS_DIR_LOG_INDEX_KEY;
+ ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
+ if (ret)
+ return ret;
+@@ -3675,7 +3640,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
+
+ if (count == 1) {
+ btrfs_item_key_to_cpu(src, &key, start_slot);
+- item_size = btrfs_item_size_nr(src, start_slot);
++ item_size = btrfs_item_size(src, start_slot);
+ batch.keys = &key;
+ batch.data_sizes = &item_size;
+ batch.total_data_size = item_size;
+@@ -3698,7 +3663,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
+ const int slot = start_slot + i;
+
+ btrfs_item_key_to_cpu(src, &ins_keys[i], slot);
+- ins_sizes[i] = btrfs_item_size_nr(src, slot);
++ ins_sizes[i] = btrfs_item_size(src, slot);
+ batch.total_data_size += ins_sizes[i];
+ }
+ }
+@@ -3732,7 +3697,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_path *dst_path,
+- int key_type,
+ struct btrfs_log_ctx *ctx)
+ {
+ struct btrfs_root *log = inode->root->log_root;
+@@ -3740,24 +3704,18 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
+ const int nritems = btrfs_header_nritems(src);
+ const u64 ino = btrfs_ino(inode);
+ const bool inode_logged_before = inode_logged(trans, inode);
+- u64 last_logged_key_offset;
+ bool last_found = false;
+ int batch_start = 0;
+ int batch_size = 0;
+ int i;
+
+- if (key_type == BTRFS_DIR_ITEM_KEY)
+- last_logged_key_offset = inode->last_dir_item_offset;
+- else
+- last_logged_key_offset = inode->last_dir_index_offset;
+-
+ for (i = path->slots[0]; i < nritems; i++) {
+ struct btrfs_key key;
+ int ret;
+
+ btrfs_item_key_to_cpu(src, &key, i);
+
+- if (key.objectid != ino || key.type != key_type) {
++ if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) {
+ last_found = true;
+ break;
+ }
+@@ -3806,7 +3764,7 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
+ * we logged is in the log tree, saving time and avoiding adding
+ * contention on the log tree.
+ */
+- if (key.offset > last_logged_key_offset)
++ if (key.offset > inode->last_dir_index_offset)
+ goto add_to_batch;
+ /*
+ * Check if the key was already logged before. If not we can add
+@@ -3865,7 +3823,7 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
+ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+- struct btrfs_path *dst_path, int key_type,
++ struct btrfs_path *dst_path,
+ struct btrfs_log_ctx *ctx,
+ u64 min_offset, u64 *last_offset_ret)
+ {
+@@ -3879,7 +3837,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
+ u64 ino = btrfs_ino(inode);
+
+ min_key.objectid = ino;
+- min_key.type = key_type;
++ min_key.type = BTRFS_DIR_INDEX_KEY;
+ min_key.offset = min_offset;
+
+ ret = btrfs_search_forward(root, &min_key, path, trans->transid);
+@@ -3888,9 +3846,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
+ * we didn't find anything from this transaction, see if there
+ * is anything at all
+ */
+- if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
++ if (ret != 0 || min_key.objectid != ino ||
++ min_key.type != BTRFS_DIR_INDEX_KEY) {
+ min_key.objectid = ino;
+- min_key.type = key_type;
++ min_key.type = BTRFS_DIR_INDEX_KEY;
+ min_key.offset = (u64)-1;
+ btrfs_release_path(path);
+ ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
+@@ -3898,7 +3857,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
+ btrfs_release_path(path);
+ return ret;
+ }
+- ret = btrfs_previous_item(root, path, ino, key_type);
++ ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
+
+ /* if ret == 0 there are items for this type,
+ * create a range to tell us the last key of this type.
+@@ -3909,18 +3868,18 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
+ struct btrfs_key tmp;
+ btrfs_item_key_to_cpu(path->nodes[0], &tmp,
+ path->slots[0]);
+- if (key_type == tmp.type)
++ if (tmp.type == BTRFS_DIR_INDEX_KEY)
+ first_offset = max(min_offset, tmp.offset) + 1;
+ }
+ goto done;
+ }
+
+ /* go backward to find any previous key */
+- ret = btrfs_previous_item(root, path, ino, key_type);
++ ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
+ if (ret == 0) {
+ struct btrfs_key tmp;
+ btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
+- if (key_type == tmp.type) {
++ if (tmp.type == BTRFS_DIR_INDEX_KEY) {
+ first_offset = tmp.offset;
+ ret = overwrite_item(trans, log, dst_path,
+ path->nodes[0], path->slots[0],
+@@ -3951,8 +3910,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
+ * from our directory
+ */
+ while (1) {
+- ret = process_dir_items_leaf(trans, inode, path, dst_path,
+- key_type, ctx);
++ ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx);
+ if (ret != 0) {
+ if (ret < 0)
+ err = ret;
+@@ -3973,7 +3931,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
+ goto done;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]);
+- if (min_key.objectid != ino || min_key.type != key_type) {
++ if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) {
+ last_offset = (u64)-1;
+ goto done;
+ }
+@@ -4004,8 +3962,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
+ * insert the log range keys to indicate where the log
+ * is valid
+ */
+- ret = insert_dir_log_key(trans, log, path, key_type,
+- ino, first_offset, last_offset);
++ ret = insert_dir_log_key(trans, log, path, ino, first_offset,
++ last_offset);
+ if (ret)
+ err = ret;
+ }
+@@ -4033,35 +3991,28 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
+ u64 min_key;
+ u64 max_key;
+ int ret;
+- int key_type = BTRFS_DIR_ITEM_KEY;
+
+ /*
+ * If this is the first time we are being logged in the current
+ * transaction, or we were logged before but the inode was evicted and
+- * reloaded later, in which case its logged_trans is 0, reset the values
+- * of the last logged key offsets. Note that we don't use the helper
++ * reloaded later, in which case its logged_trans is 0, reset the value
++ * of the last logged key offset. Note that we don't use the helper
+ * function inode_logged() here - that is because the function returns
+ * true after an inode eviction, assuming the worst case as it can not
+ * know for sure if the inode was logged before. So we can not skip key
+ * searches in the case the inode was evicted, because it may not have
+ * been logged in this transaction and may have been logged in a past
+- * transaction, so we need to reset the last dir item and index offsets
+- * to (u64)-1.
++ * transaction, so we need to reset the last dir index offset to (u64)-1.
+ */
+- if (inode->logged_trans != trans->transid) {
+- inode->last_dir_item_offset = (u64)-1;
++ if (inode->logged_trans != trans->transid)
+ inode->last_dir_index_offset = (u64)-1;
+- }
+-again:
++
+ min_key = 0;
+ max_key = 0;
+- if (key_type == BTRFS_DIR_ITEM_KEY)
+- ctx->last_dir_item_offset = inode->last_dir_item_offset;
+- else
+- ctx->last_dir_item_offset = inode->last_dir_index_offset;
++ ctx->last_dir_item_offset = inode->last_dir_index_offset;
+
+ while (1) {
+- ret = log_dir_items(trans, inode, path, dst_path, key_type,
++ ret = log_dir_items(trans, inode, path, dst_path,
+ ctx, min_key, &max_key);
+ if (ret)
+ return ret;
+@@ -4070,13 +4021,8 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
+ min_key = max_key + 1;
+ }
+
+- if (key_type == BTRFS_DIR_ITEM_KEY) {
+- inode->last_dir_item_offset = ctx->last_dir_item_offset;
+- key_type = BTRFS_DIR_INDEX_KEY;
+- goto again;
+- } else {
+- inode->last_dir_index_offset = ctx->last_dir_item_offset;
+- }
++ inode->last_dir_index_offset = ctx->last_dir_item_offset;
++
+ return 0;
+ }
+
+@@ -4350,7 +4296,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
+ batch.nr = nr;
+
+ for (i = 0; i < nr; i++) {
+- ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
++ ins_sizes[i] = btrfs_item_size(src, i + start_slot);
+ batch.total_data_size += ins_sizes[i];
+ btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
+ }
+@@ -4573,14 +4519,34 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
+ {
+ struct btrfs_drop_extents_args drop_args = { 0 };
+ struct btrfs_root *log = inode->root->log_root;
+- struct btrfs_file_extent_item *fi;
++ struct btrfs_file_extent_item fi = { 0 };
+ struct extent_buffer *leaf;
+- struct btrfs_map_token token;
+ struct btrfs_key key;
+ u64 extent_offset = em->start - em->orig_start;
+ u64 block_len;
+ int ret;
+
++ btrfs_set_stack_file_extent_generation(&fi, trans->transid);
++ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
++ btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC);
++ else
++ btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG);
++
++ block_len = max(em->block_len, em->orig_block_len);
++ if (em->compress_type != BTRFS_COMPRESS_NONE) {
++ btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start);
++ btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
++ } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
++ btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start -
++ extent_offset);
++ btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
++ }
++
++ btrfs_set_stack_file_extent_offset(&fi, extent_offset);
++ btrfs_set_stack_file_extent_num_bytes(&fi, em->len);
++ btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes);
++ btrfs_set_stack_file_extent_compression(&fi, em->compress_type);
++
+ ret = log_extent_csums(trans, inode, log, em, ctx);
+ if (ret)
+ return ret;
+@@ -4599,7 +4565,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
+ drop_args.start = em->start;
+ drop_args.end = em->start + em->len;
+ drop_args.replace_extent = true;
+- drop_args.extent_item_size = sizeof(*fi);
++ drop_args.extent_item_size = sizeof(fi);
+ ret = btrfs_drop_extents(trans, log, inode, &drop_args);
+ if (ret)
+ return ret;
+@@ -4611,44 +4577,14 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
+ key.offset = em->start;
+
+ ret = btrfs_insert_empty_item(trans, log, path, &key,
+- sizeof(*fi));
++ sizeof(fi));
+ if (ret)
+ return ret;
+ }
+ leaf = path->nodes[0];
+- btrfs_init_map_token(&token, leaf);
+- fi = btrfs_item_ptr(leaf, path->slots[0],
+- struct btrfs_file_extent_item);
+-
+- btrfs_set_token_file_extent_generation(&token, fi, trans->transid);
+- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+- btrfs_set_token_file_extent_type(&token, fi,
+- BTRFS_FILE_EXTENT_PREALLOC);
+- else
+- btrfs_set_token_file_extent_type(&token, fi,
+- BTRFS_FILE_EXTENT_REG);
+-
+- block_len = max(em->block_len, em->orig_block_len);
+- if (em->compress_type != BTRFS_COMPRESS_NONE) {
+- btrfs_set_token_file_extent_disk_bytenr(&token, fi,
+- em->block_start);
+- btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len);
+- } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+- btrfs_set_token_file_extent_disk_bytenr(&token, fi,
+- em->block_start -
+- extent_offset);
+- btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len);
+- } else {
+- btrfs_set_token_file_extent_disk_bytenr(&token, fi, 0);
+- btrfs_set_token_file_extent_disk_num_bytes(&token, fi, 0);
+- }
+-
+- btrfs_set_token_file_extent_offset(&token, fi, extent_offset);
+- btrfs_set_token_file_extent_num_bytes(&token, fi, em->len);
+- btrfs_set_token_file_extent_ram_bytes(&token, fi, em->ram_bytes);
+- btrfs_set_token_file_extent_compression(&token, fi, em->compress_type);
+- btrfs_set_token_file_extent_encryption(&token, fi, 0);
+- btrfs_set_token_file_extent_other_encoding(&token, fi, 0);
++ write_extent_buffer(leaf, &fi,
++ btrfs_item_ptr_offset(leaf, path->slots[0]),
++ sizeof(fi));
+ btrfs_mark_buffer_dirty(leaf);
+
+ btrfs_release_path(path);
+@@ -4862,7 +4798,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
+ WARN_ON(!list_empty(&extents));
+ write_unlock(&tree->lock);
+
+- btrfs_release_path(path);
+ if (!ret)
+ ret = btrfs_log_prealloc_extents(trans, inode, path);
+ if (ret)
+@@ -5166,7 +5101,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
+ struct btrfs_path *search_path;
+ char *name = NULL;
+ u32 name_len = 0;
+- u32 item_size = btrfs_item_size_nr(eb, slot);
++ u32 item_size = btrfs_item_size(eb, slot);
+ u32 cur_offset = 0;
+ unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
+
+@@ -5899,18 +5834,12 @@ struct btrfs_dir_list {
+ * link_to_fixup_dir());
+ *
+ * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
+- * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
+- * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
++ * while logging the inode's items new index items (key type
++ * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item
+ * has a size that doesn't match the sum of the lengths of all the logged
+- * names. This does not result in a problem because if a dir_item key is
+- * logged but its matching dir_index key is not logged, at log replay time we
+- * don't use it to replay the respective name (see replay_one_name()). On the
+- * other hand if only the dir_index key ends up being logged, the respective
+- * name is added to the fs/subvol tree with both the dir_item and dir_index
+- * keys created (see replay_one_name()).
+- * The directory's inode item with a wrong i_size is not a problem as well,
+- * since we don't use it at log replay time to set the i_size in the inode
+- * item of the fs/subvol tree (see overwrite_item()).
++ * names - this is ok, not a problem, because at log replay time we set the
++ * directory's i_size to the correct value (see replay_one_name() and
++ * do_overwrite_item()).
+ */
+ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+@@ -5956,7 +5885,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
+ goto next_dir_inode;
+
+ min_key.objectid = dir_elem->ino;
+- min_key.type = BTRFS_DIR_ITEM_KEY;
++ min_key.type = BTRFS_DIR_INDEX_KEY;
+ min_key.offset = 0;
+ again:
+ btrfs_release_path(path);
+@@ -5981,7 +5910,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
+
+ btrfs_item_key_to_cpu(leaf, &min_key, i);
+ if (min_key.objectid != dir_elem->ino ||
+- min_key.type != BTRFS_DIR_ITEM_KEY)
++ min_key.type != BTRFS_DIR_INDEX_KEY)
+ goto next_dir_inode;
+
+ di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
+@@ -6093,7 +6022,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
+ if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
+ break;
+
+- item_size = btrfs_item_size_nr(leaf, slot);
++ item_size = btrfs_item_size(leaf, slot);
+ ptr = btrfs_item_ptr_offset(leaf, slot);
+ while (cur_offset < item_size) {
+ struct btrfs_key inode_key;
+@@ -6795,15 +6724,14 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
+ * was previously logged, make sure the next log attempt on the directory
+ * is not skipped and logs the inode again. This is because the log may
+ * not currently be authoritative for a range including the old
+- * BTRFS_DIR_ITEM_KEY and BTRFS_DIR_INDEX_KEY keys, so we want to make
+- * sure after a log replay we do not end up with both the new and old
+- * dentries around (in case the inode is a directory we would have a
+- * directory with two hard links and 2 inode references for different
+- * parents). The next log attempt of old_dir will happen at
+- * btrfs_log_all_parents(), called through btrfs_log_inode_parent()
+- * below, because we have previously set inode->last_unlink_trans to the
+- * current transaction ID, either here or at btrfs_record_unlink_dir() in
+- * case inode is a directory.
++ * BTRFS_DIR_INDEX_KEY key, so we want to make sure after a log replay we
++ * do not end up with both the new and old dentries around (in case the
++ * inode is a directory we would have a directory with two hard links and
++ * 2 inode references for different parents). The next log attempt of
++ * old_dir will happen at btrfs_log_all_parents(), called through
++ * btrfs_log_inode_parent() below, because we have previously set
++ * inode->last_unlink_trans to the current transaction ID, either here or
++ * at btrfs_record_unlink_dir() in case the inode is a directory.
+ */
+ if (old_dir)
+ old_dir->logged_trans = 0;
+diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
+index 74023c8a783f..b458452a1aaf 100644
+--- a/fs/btrfs/uuid-tree.c
++++ b/fs/btrfs/uuid-tree.c
+@@ -52,7 +52,7 @@ static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, u8 *uuid,
+
+ eb = path->nodes[0];
+ slot = path->slots[0];
+- item_size = btrfs_item_size_nr(eb, slot);
++ item_size = btrfs_item_size(eb, slot);
+ offset = btrfs_item_ptr_offset(eb, slot);
+ ret = -ENOENT;
+
+@@ -125,7 +125,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ offset = btrfs_item_ptr_offset(eb, slot);
+- offset += btrfs_item_size_nr(eb, slot) - sizeof(subid_le);
++ offset += btrfs_item_size(eb, slot) - sizeof(subid_le);
+ } else {
+ btrfs_warn(fs_info,
+ "insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!",
+@@ -186,7 +186,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ offset = btrfs_item_ptr_offset(eb, slot);
+- item_size = btrfs_item_size_nr(eb, slot);
++ item_size = btrfs_item_size(eb, slot);
+ if (!IS_ALIGNED(item_size, sizeof(u64))) {
+ btrfs_warn(fs_info, "uuid item with illegal size %lu!",
+ (unsigned long)item_size);
+@@ -208,7 +208,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
+ goto out;
+ }
+
+- item_size = btrfs_item_size_nr(eb, slot);
++ item_size = btrfs_item_size(eb, slot);
+ if (item_size == sizeof(subid)) {
+ ret = btrfs_del_item(trans, uuid_root, path);
+ goto out;
+@@ -331,7 +331,7 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info)
+ goto skip;
+
+ offset = btrfs_item_ptr_offset(leaf, slot);
+- item_size = btrfs_item_size_nr(leaf, slot);
++ item_size = btrfs_item_size(leaf, slot);
+ if (!IS_ALIGNED(item_size, sizeof(u64))) {
+ btrfs_warn(fs_info,
+ "uuid item with illegal size %lu!",
+diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
+index 4968535dfff0..90eb5c2830a9 100644
+--- a/fs/btrfs/verity.c
++++ b/fs/btrfs/verity.c
+@@ -333,7 +333,7 @@ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
+ if (key.objectid != btrfs_ino(inode) || key.type != key_type)
+ break;
+
+- item_end = btrfs_item_size_nr(leaf, path->slots[0]) + key.offset;
++ item_end = btrfs_item_size(leaf, path->slots[0]) + key.offset;
+
+ if (copied > 0) {
+ /*
+diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
+index 42391d4aeb11..5f4ac1a2e1f3 100644
+--- a/fs/btrfs/volumes.c
++++ b/fs/btrfs/volumes.c
+@@ -34,6 +34,10 @@
+ #include "discard.h"
+ #include "zoned.h"
+
++#define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
++ BTRFS_BLOCK_GROUP_RAID10 | \
++ BTRFS_BLOCK_GROUP_RAID56_MASK)
++
+ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
+ [BTRFS_RAID_RAID10] = {
+ .sub_stripes = 2,
+@@ -4643,7 +4647,7 @@ int btrfs_uuid_scan_kthread(void *data)
+
+ eb = path->nodes[0];
+ slot = path->slots[0];
+- item_size = btrfs_item_size_nr(eb, slot);
++ item_size = btrfs_item_size(eb, slot);
+ if (item_size < sizeof(root_item))
+ goto skip;
+
+@@ -6314,7 +6318,8 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
+ stripe_offset = offset - stripe_offset;
+ data_stripes = nr_data_stripes(map);
+
+- if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
++ /* Only stripe based profiles need to check against stripe length. */
++ if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) {
+ u64 max_len = stripe_len - stripe_offset;
+
+ /*
+@@ -7730,7 +7735,7 @@ static int btrfs_device_init_dev_stats(struct btrfs_device *device,
+ }
+ slot = path->slots[0];
+ eb = path->nodes[0];
+- item_size = btrfs_item_size_nr(eb, slot);
++ item_size = btrfs_item_size(eb, slot);
+
+ ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
+
+@@ -7808,7 +7813,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
+ }
+
+ if (ret == 0 &&
+- btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
++ btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
+ /* need to delete old one and insert a new one */
+ ret = btrfs_del_item(trans, dev_root, path);
+ if (ret != 0) {
+diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
+index 2837b4c8424d..99abf41b89b9 100644
+--- a/fs/btrfs/xattr.c
++++ b/fs/btrfs/xattr.c
+@@ -168,9 +168,8 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
+ const int slot = path->slots[0];
+ struct extent_buffer *leaf = path->nodes[0];
+ const u16 old_data_len = btrfs_dir_data_len(leaf, di);
+- const u32 item_size = btrfs_item_size_nr(leaf, slot);
++ const u32 item_size = btrfs_item_size(leaf, slot);
+ const u32 data_size = sizeof(*di) + name_len + size;
+- struct btrfs_item *item;
+ unsigned long data_ptr;
+ char *ptr;
+
+@@ -196,9 +195,8 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
+ btrfs_extend_item(path, data_size);
+ }
+
+- item = btrfs_item_nr(slot);
+ ptr = btrfs_item_ptr(leaf, slot, char);
+- ptr += btrfs_item_size(leaf, item) - data_size;
++ ptr += btrfs_item_size(leaf, slot) - data_size;
+ di = (struct btrfs_dir_item *)ptr;
+ btrfs_set_dir_data_len(leaf, di, size);
+ data_ptr = ((unsigned long)(di + 1)) + name_len;
+@@ -335,7 +333,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
+ goto next_item;
+
+ di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+- item_size = btrfs_item_size_nr(leaf, slot);
++ item_size = btrfs_item_size(leaf, slot);
+ cur = 0;
+ while (cur < item_size) {
+ u16 name_len = btrfs_dir_name_len(leaf, di);
+diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
+index 738619994e26..012a71ab5d8e 100644
+--- a/include/uapi/linux/btrfs.h
++++ b/include/uapi/linux/btrfs.h
+@@ -575,8 +575,10 @@ struct btrfs_ioctl_clone_range_args {
+ * Used by:
+ * struct btrfs_ioctl_defrag_range_args.flags
+ */
+-#define BTRFS_DEFRAG_RANGE_COMPRESS 1
+-#define BTRFS_DEFRAG_RANGE_START_IO 2
++#define BTRFS_DEFRAG_RANGE_COMPRESS (1UL << 0)
++#define BTRFS_DEFRAG_RANGE_START_IO (1UL << 1)
++#define BTRFS_DEFRAG_RANGE_FLAGS_MASK (BTRFS_DEFRAG_RANGE_COMPRESS |\
++ BTRFS_DEFRAG_RANGE_START_IO)
+ struct btrfs_ioctl_defrag_range_args {
+ /* start of the defrag operation */
+ __u64 start;
+diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
+index e1c4c732aaba..5416f1f1a77a 100644
+--- a/include/uapi/linux/btrfs_tree.h
++++ b/include/uapi/linux/btrfs_tree.h
+@@ -146,7 +146,9 @@
+
+ /*
+ * dir items are the name -> inode pointers in a directory. There is one
+- * for every name in a directory.
++ * for every name in a directory. BTRFS_DIR_LOG_ITEM_KEY is no longer used
++ * but it's still defined here for documentation purposes and to help avoid
++ * having its numerical value reused in the future.
+ */
+ #define BTRFS_DIR_LOG_ITEM_KEY 60
+ #define BTRFS_DIR_LOG_INDEX_KEY 72
+--
+2.35.1
+
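
Editor's note (not part of the patch or the package sources): the uapi hunk above redefines the BTRFS_DEFRAG_RANGE_* flags as explicit bit shifts and adds a validity mask. Purely as an illustration of where those bits end up being used, the sketch below shows a minimal userspace program passing them to the BTRFS_IOC_DEFRAG_RANGE ioctl. The program name, the whole-file range, the 256 KiB extent threshold, and the flag combination are arbitrary choices for the example; the ioctl, struct, and flag names come from the kernel's uapi linux/btrfs.h.

/* defrag_one.c - illustrative sketch only; build with: cc -o defrag_one defrag_one.c */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>   /* BTRFS_IOC_DEFRAG_RANGE, BTRFS_DEFRAG_RANGE_* flags */

int main(int argc, char **argv)
{
        struct btrfs_ioctl_defrag_range_args args;
        int fd;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <file-on-btrfs>\n", argv[0]);
                return 1;
        }

        fd = open(argv[1], O_RDWR);
        if (fd < 0) {
                perror("open");
                return 1;
        }

        memset(&args, 0, sizeof(args));
        args.start = 0;                  /* defrag from offset 0 ...            */
        args.len = (__u64)-1;            /* ... through the end of the file     */
        args.extent_thresh = 256 * 1024; /* skip extents larger than 256 KiB    */
        /* Request compression and start writeback of the defragged range.      */
        args.flags = BTRFS_DEFRAG_RANGE_COMPRESS | BTRFS_DEFRAG_RANGE_START_IO;
        /* compress_type left at 0: the uapi header documents zlib as default.  */

        if (ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &args) < 0) {
                perror("BTRFS_IOC_DEFRAG_RANGE");
                close(fd);
                return 1;
        }

        close(fd);
        return 0;
}

Run it against a file on a btrfs mount, e.g. ./defrag_one /mnt/data/somefile; the kernel rejects unknown flag bits (per the new BTRFS_DEFRAG_RANGE_FLAGS_MASK), so only the two defined flags above are meaningful here.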