author     Scott B  2022-02-10 22:57:20 -0800
committer  Scott B  2022-02-12 00:57:42 -0800
commit     44db0f40320d2895d9f2438145152e329fb6dfb1 (patch)
tree       a2eecf6c2d0248a9a18806f9b4136f586e754c3b
parent     248c3c289b71536ece4f14f7bf753f14ce637696 (diff)
download   aur-44db0f40320d2895d9f2438145152e329fb6dfb1.tar.gz
hotfix: resolve btrfs autodefrag high utilization
-rw-r--r--  .SRCINFO                              |    2
-rw-r--r--  PKGBUILD                              |    4
-rw-r--r--  btrfs-fix-autodefrag-on-5.16.9.patch  | 6417
3 files changed, 6423 insertions, 0 deletions
@@ -25,6 +25,7 @@ pkgbase = linux-xanmod-rog
 	source = Bluetooth-btintel-Fix-bdaddress-comparison-with-garb.patch
 	source = Bluetooth-Read-codec-capabilities-only-if-supported.patch
 	source = Bluetooth-fix-deadlock-for-RFCOMM-sk-state-change.patch
+	source = btrfs-fix-autodefrag-on-5.16.9.patch
 	source = Revert-XANMOD-fair-Remove-all-energy-efficiency-functions.patch
 	source = cpufreq-CPPC-Fix-performance-frequency-conversion.patch
 	source = udp-ipv6-optimisations-v2-net-next.patch
@@ -56,6 +57,7 @@ pkgbase = linux-xanmod-rog
 	sha256sums = 241f01f06849fcec462d72355ca3ab6bd34931731dec89876d785912ac532398
 	sha256sums = dd01bd3f774c3a9af42b6d89f534f39c4a5f200db32cd6d4b72a29325645100e
 	sha256sums = a9647897e59b04cb883dcf649b3108e9397d5a6c672bc545ea0c6bb7bb30d5a9
+	sha256sums = cd2795ab2c355eb0182cba2940712552ff46eee95b04abb41327c208f7f3e546
 	sha256sums = 3bb1cf422c64b4eea324b71048d0bdee04b5f9132136c6a4774e5205e45c46f1
 	sha256sums = 5c6c7778bc2d873657a885272956e232138b8b4935c3a3d6b11ef1619d344b20
 	sha256sums = 56f8f93a38ed7236c2504c79645a33123ee7bdf3c0cbb97dfd90600df06be7dd
@@ -114,6 +114,9 @@ source=("https://cdn.kernel.org/pub/linux/kernel/v${_branch}/linux-${_major}.tar
         "Bluetooth-Read-codec-capabilities-only-if-supported.patch"
         "Bluetooth-fix-deadlock-for-RFCOMM-sk-state-change.patch"
 
+        # hotfix: address btrfs autodefrag excessive utilization
+        "btrfs-fix-autodefrag-on-5.16.9.patch"
+
         # Revert Xanmod scheduler power efficiency removal
         "Revert-XANMOD-fair-Remove-all-energy-efficiency-functions.patch"
 
@@ -180,6 +183,7 @@ sha256sums=('027d7e8988bb69ac12ee92406c3be1fe13f990b1ca2249e226225cd1573308bb'
             '241f01f06849fcec462d72355ca3ab6bd34931731dec89876d785912ac532398'
             'dd01bd3f774c3a9af42b6d89f534f39c4a5f200db32cd6d4b72a29325645100e'
             'a9647897e59b04cb883dcf649b3108e9397d5a6c672bc545ea0c6bb7bb30d5a9'
+            'cd2795ab2c355eb0182cba2940712552ff46eee95b04abb41327c208f7f3e546'
             '3bb1cf422c64b4eea324b71048d0bdee04b5f9132136c6a4774e5205e45c46f1'
             '5c6c7778bc2d873657a885272956e232138b8b4935c3a3d6b11ef1619d344b20'
             '56f8f93a38ed7236c2504c79645a33123ee7bdf3c0cbb97dfd90600df06be7dd'
diff --git a/btrfs-fix-autodefrag-on-5.16.9.patch b/btrfs-fix-autodefrag-on-5.16.9.patch
new file mode 100644
index 000000000000..33053ea7b449
--- /dev/null
+++ b/btrfs-fix-autodefrag-on-5.16.9.patch
@@ -0,0 +1,6417 @@
+From 6c67e14b140aba83be3aee93961ade179dbc2473 Mon Sep 17 00:00:00 2001
+From: Scott B <arglebargle@arglebargle.dev>
+Date: Fri, 11 Feb 2022 23:52:12 -0800
+Subject: [PATCH] btrfs fix autodefrag on 5.16.9
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Squashed commit of the following:
+
+commit 7af5a9b695e62bdb82b55cb255c448e3af3ac587
+Author: Qu Wenruo <wqu@suse.com>
+Date:   Fri Feb 11 14:41:43 2022 +0800
+
+    btrfs: defrag: make btrfs_defrag_file() report an accurate number of defragged sectors
+
+    The previous rework of btrfs_defrag_file() could only report the
+    number of sectors from the first run of defrag_collect_targets().
+
+    This number is not accurate: if holes are punched after the first
+    defrag_collect_targets() call, we will not choose to defrag the holes.
+
+    Originally this was done to avoid passing @sectors_defragged to every
+    involved function.
+
+    But now that we have btrfs_defrag_ctrl there is no need for such
+    inaccurate accounting; just update btrfs_defrag_ctrl::sectors_defragged
+    after a successful defrag_one_locked_target() call.
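As a plain-C illustration of the accounting pattern this first commit describes (a single control struct whose counter is bumped only after a range is successfully defragged), consider the following sketch. All names are simplified stand-ins for the kernel structures, not the actual btrfs code:

    #include <stdint.h>

    /* Hypothetical, simplified analogue of btrfs_defrag_ctrl. */
    struct defrag_ctrl {
        uint64_t last_scanned;      /* exclusive end of the scanned range */
        uint64_t sectors_defragged; /* accurate running total */
    };

    static int defrag_one_locked_target(struct defrag_ctrl *ctrl,
                                        uint64_t len, uint32_t sectorsize)
    {
        /* ... lock the pages and mark the range for defrag here ... */

        /* Count sectors only after the range really was processed, so
         * punched holes and skipped ranges are never counted. */
        ctrl->sectors_defragged += len / sectorsize;
        return 0;
    }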
+
+    Signed-off-by: Qu Wenruo <wqu@suse.com>
+    Reviewed-by: Filipe Manana <fdmanana@suse.com>
+
+commit 7d6ad9ac62135f86c190f4ccf1ea1e8bb2e13480
+Author: Qu Wenruo <wqu@suse.com>
+Date:   Fri Feb 11 14:41:42 2022 +0800
+
+    btrfs: defrag: use btrfs_defrag_ctrl to replace btrfs_ioctl_defrag_range_args for btrfs_defrag_file()
+
+    This brings the following benefits:
+
+    - No more strange range->start updates to indicate the last scanned
+      bytenr. We have btrfs_defrag_ctrl::last_scanned (exclusive) for it
+      directly.
+
+    - No more return value to indicate defragged sectors. Now
+      btrfs_defrag_file() will just return 0 if no error happened, and
+      btrfs_defrag_ctrl::sectors_defragged will hold that value.
+
+    - Fewer parameters to carry around. Now most defrag_* functions only
+      need to fetch their policy parameters from btrfs_defrag_ctrl
+      directly.
+
+    Signed-off-by: Qu Wenruo <wqu@suse.com>
+
+commit b19878cde4728eeb3b5e017a6718ffd9e263c1a2
+Author: Qu Wenruo <wqu@suse.com>
+Date:   Fri Feb 11 14:41:41 2022 +0800
+
+    btrfs: defrag: introduce btrfs_defrag_ctrl structure for later usage
+
+    Currently btrfs_defrag_file() accepts not only
+    btrfs_ioctl_defrag_range_args but also other parameters like
+    @newer_than and @max_sectors_to_defrag for extra policies.
+
+    Those extra values are hidden from the defrag ioctl and even caused
+    bugs in the past due to different behaviors based on those extra
+    values.
+
+    Here we introduce a new structure, btrfs_defrag_ctrl, to include:
+
+    - all members of btrfs_ioctl_defrag_range_args
+
+    - @max_sectors_to_defrag and @newer_than
+
+    - extra values which callers of btrfs_defrag_file() may care about,
+      like @sectors_defragged and @last_scanned
+
+    With the new structure, also introduce a new helper,
+    btrfs_defrag_ioctl_args_to_ctrl(), to:
+
+    - do an extra sanity check on @compress and @flags
+
+    - do range alignment when possible
+
+    - set default values
+
+    Signed-off-by: Qu Wenruo <wqu@suse.com>
+    Reviewed-by: Filipe Manana <fdmanana@suse.com>
+
+commit a510f6c16dbfead2fcf0b04489d676d16851ba9e
+Author: Qu Wenruo <wqu@suse.com>
+Date:   Fri Feb 11 14:41:39 2022 +0800
+
+    btrfs: defrag: allow defrag_one_cluster() to skip large extents which are not targets
+
+    In the rework of btrfs_defrag_file(), we always call
+    defrag_one_cluster() and increase the offset by the cluster size,
+    which is only 256K.
+
+    But there are cases where we have a large extent (e.g. 128M) which
+    doesn't need to be defragged at all.
+
+    Before the refactor, we could directly skip the range, but now we
+    have to scan that extent map again and again until the cluster moves
+    past the non-target extent.
+
+    Fix the problem by allowing defrag_one_cluster() to increase
+    btrfs_defrag_ctrl::last_scanned to the end of an extent, if and only
+    if the last extent of the cluster is not a target.
+
+    The test script looks like this:
+
+      mkfs.btrfs -f $dev > /dev/null
+
+      mount $dev $mnt
+
+      # As the btrfs ioctl uses 32M as the extent_threshold
+      xfs_io -f -c "pwrite 0 64M" $mnt/file1
+      sync
+      # Some fragmented range to defrag
+      xfs_io -s -c "pwrite 65548k 4k" \
+             -c "pwrite 65544k 4k" \
+             -c "pwrite 65540k 4k" \
+             -c "pwrite 65536k 4k" \
+             $mnt/file1
+      sync
+
+      echo "=== before ==="
+      xfs_io -c "fiemap -v" $mnt/file1
+      echo "=== after ==="
+      btrfs fi defrag $mnt/file1
+      sync
+      xfs_io -c "fiemap -v" $mnt/file1
+      umount $mnt
+
+    With extra ftrace put into defrag_one_cluster(), before the patch it
+    would result in tons of loops:
+
+    (As defrag_one_cluster() is inlined, the function name is its caller)
+
+      btrfs-126062 [005] .....  4682.816026: btrfs_defrag_file: r/i=5/257 start=0 len=262144
+      btrfs-126062 [005] .....  4682.816027: btrfs_defrag_file: r/i=5/257 start=262144 len=262144
+      btrfs-126062 [005] .....  4682.816028: btrfs_defrag_file: r/i=5/257 start=524288 len=262144
+      btrfs-126062 [005] .....  4682.816028: btrfs_defrag_file: r/i=5/257 start=786432 len=262144
+      btrfs-126062 [005] .....  4682.816028: btrfs_defrag_file: r/i=5/257 start=1048576 len=262144
+      ...
+      btrfs-126062 [005] .....  4682.816043: btrfs_defrag_file: r/i=5/257 start=67108864 len=262144
+
+    But with this patch there is just one loop, which then jumps directly
+    to the end of the extent:
+
+      btrfs-130471 [014] .....  5434.029558: defrag_one_cluster: r/i=5/257 start=0 len=262144
+      btrfs-130471 [014] .....  5434.029559: defrag_one_cluster: r/i=5/257 start=67108864 len=16384
+
+    Cc: stable@vger.kernel.org # 5.16
+    Signed-off-by: Qu Wenruo <wqu@suse.com>
+    Reviewed-by: Filipe Manana <fdmanana@suse.com>
+
+commit 3f2d69fc4a7a4ce3f389b9e84fa3c830f6a8b5c5
+Author: Qu Wenruo <wqu@suse.com>
+Date:   Fri Feb 11 14:41:40 2022 +0800
+
+    btrfs: uapi: introduce BTRFS_DEFRAG_RANGE_MASK for later sanity check
+
+    And since we're here, replace the hardcoded bit flags (1, 2) with
+    (1UL << 0) and (1UL << 1), respectively.
+
+    Signed-off-by: Qu Wenruo <wqu@suse.com>
+    Reviewed-by: Filipe Manana <fdmanana@suse.com>
+
+commit b6c665523425451af94eb3f044d4474c81f94b1e
+Author: Qu Wenruo <wqu@suse.com>
+Date:   Fri Jan 28 15:21:22 2022 +0800
+
+    btrfs: defrag: remove an ambiguous condition for rejection
+
+    From the very beginning of btrfs defrag, there has been a check to
+    reject extents which meet both of these conditions:
+
+    - Physically adjacent
+
+      We may want to defrag physically adjacent extents to reduce the
+      number of extents or the size of the subvolume tree.
+
+    - Larger than 128K
+
+      This may be there for compressed extents, but unfortunately 128K
+      is exactly the max capacity for compressed extents, and since the
+      check is > 128K, it never rejects compressed extents.
+
+    Furthermore, the compressed extent capacity bug is fixed by the
+    previous patch, so there is no reason for that check anymore.
+
+    The original check had only a very small range to reject (the target
+    extent size is > 128K, and the default extent threshold is 256K),
+    and for compressed extents it didn't work at all.
+
+    So it's better to just remove the rejection and allow us to defrag
+    physically adjacent extents.
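For clarity, the removed rejection amounted to a check of roughly this shape. This is a hedged editorial reconstruction in plain C with hypothetical names, not the kernel source:

    #include <stdbool.h>
    #include <stdint.h>

    #define SZ_128K (128 * 1024)

    /* Old behavior: skip defragging when the next extent was physically
     * adjacent AND larger than 128K. Compressed extents are capped at
     * exactly 128K, so the "> 128K" test could never reject them. */
    static bool old_should_reject(uint64_t cur_phys_end,
                                  uint64_t next_phys_start,
                                  uint64_t next_len)
    {
        return next_phys_start == cur_phys_end && next_len > SZ_128K;
    }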
+
+    Reviewed-by: Filipe Manana <fdmanana@suse.com>
+    Signed-off-by: Qu Wenruo <wqu@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit cb53ba48a2b6c9126d128f301b4ed8085dcbce7b
+Author: Qu Wenruo <wqu@suse.com>
+Date:   Fri Jan 28 15:21:21 2022 +0800
+
+    btrfs: defrag: don't defrag extents which are already at their max capacity
+
+    [BUG]
+    For compressed extents, the defrag ioctl will always try to defrag
+    them, wasting not only IO but also CPU time on compression and
+    decompression:
+
+      mkfs.btrfs -f $DEV
+      mount -o compress $DEV $MNT
+      xfs_io -f -c "pwrite -S 0xab 0 128K" $MNT/foobar
+      sync
+      xfs_io -f -c "pwrite -S 0xcd 128K 128K" $MNT/foobar
+      sync
+      echo "=== before ==="
+      xfs_io -c "fiemap -v" $MNT/foobar
+      btrfs filesystem defrag $MNT/foobar
+      sync
+      echo "=== after ==="
+      xfs_io -c "fiemap -v" $MNT/foobar
+
+    This shows the two 128K extents just getting COWed for no extra
+    benefit, with extra IO/CPU spent:
+
+      === before ===
+      /mnt/btrfs/file1:
+       EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
+         0: [0..255]:        26624..26879       256   0x8
+         1: [256..511]:      26632..26887       256   0x9
+      === after ===
+      /mnt/btrfs/file1:
+       EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
+         0: [0..255]:        26640..26895       256   0x8
+         1: [256..511]:      26648..26903       256   0x9
+
+    This affects not only v5.16 (after the defrag rework), but also
+    v5.15 (before the defrag rework).
+
+    [CAUSE]
+    From the very beginning, btrfs defrag never checks if an extent is
+    already at its max capacity (128K for compressed extents, 128M
+    otherwise).
+
+    And the default extent size threshold is 256K, which is already
+    beyond the compressed extent max size.
+
+    This means that, by default, the btrfs defrag ioctl will mark every
+    compressed extent which is not adjacent to a hole/preallocated range
+    for defrag.
+
+    [FIX]
+    Introduce a helper to grab the maximum extent size, and then in
+    defrag_collect_targets() and defrag_check_next_extent() reject
+    extents which are already at their max capacity.
+
+    Reported-by: Filipe Manana <fdmanana@suse.com>
+    Reviewed-by: Filipe Manana <fdmanana@suse.com>
+    Signed-off-by: Qu Wenruo <wqu@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit e5cf566d32d6f7b244d87ab0c0797b43d54b4c37
+Author: Qu Wenruo <wqu@suse.com>
+Date:   Fri Jan 28 15:21:20 2022 +0800
+
+    btrfs: defrag: don't try to merge regular extents with preallocated extents
+
+    [BUG]
+    Older kernels (before v5.16) will defrag preallocated extents. Newer
+    kernels (v5.16 and later) will not defrag preallocated extents, but
+    they will defrag the extent just before the preallocated extent,
+    even if it's just a single sector.
+
+    This can be exposed by the following small script:
+
+      mkfs.btrfs -f $dev > /dev/null
+
+      mount $dev $mnt
+      xfs_io -f -c "pwrite 0 4k" -c sync -c "falloc 4k 16K" $mnt/file
+      xfs_io -c "fiemap -v" $mnt/file
+      btrfs fi defrag $mnt/file
+      sync
+      xfs_io -c "fiemap -v" $mnt/file
+
+    The output looks like this on older kernels:
+
+      /mnt/btrfs/file:
+       EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
+         0: [0..7]:          26624..26631         8   0x0
+         1: [8..39]:         26632..26663        32 0x801
+      /mnt/btrfs/file:
+       EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
+         0: [0..39]:         26664..26703        40   0x1
+
+    This defrags the single sector along with the preallocated extent
+    and replaces them with a regular extent in a new location (caused by
+    data COW), wasting most of the data IO just for the preallocated
+    range.
+
+    On the other hand, v5.16 is slightly better:
+
+      /mnt/btrfs/file:
+       EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
+         0: [0..7]:          26624..26631         8   0x0
+         1: [8..39]:         26632..26663        32 0x801
+      /mnt/btrfs/file:
+       EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
+         0: [0..7]:          26664..26671         8   0x0
+         1: [8..39]:         26632..26663        32 0x801
+
+    The preallocated range is not defragged, but the sector before it
+    still gets defragged, needlessly.
+
+    [CAUSE]
+    One of the functions reused by both the old and new behavior is
+    defrag_check_next_extent(); it determines whether we should defrag
+    the current extent by checking the next one.
+
+    It only checks if the next extent is a hole or inlined, but it
+    doesn't check if it's preallocated.
+
+    Outside of that function, on the other hand, both old and new
+    kernels reject preallocated extents.
+
+    Such inconsistent behavior causes the bug above.
+
+    [FIX]
+    - Also check if the next extent is preallocated; if so, don't defrag
+      the current extent.
+
+    - Add comments to each branch explaining why we reject the extent.
+
+    This will reduce the IO caused by the defrag ioctl and autodefrag.
+
+    Reviewed-by: Filipe Manana <fdmanana@suse.com>
+    Signed-off-by: Qu Wenruo <wqu@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit f9649fa5a78f6e27a8bb4ec026efe3b4c1d64bc8
+Author: Sidong Yang <realwakka@gmail.com>
+Date:   Sun Feb 6 12:52:48 2022 +0000
+
+    btrfs: qgroup: remove duplicated check in adding qgroup relations
+
+    Remove a duplicated check made when adding qgroup relations.
+    btrfs_add_qgroup_relation() adds relations by calling
+    add_relation_rb(), which checks that the member/parent id exists in
+    the current qgroup_tree. But that was already checked before calling
+    the function, so there is no need to check it twice.
+
+    Add a new function, __add_relation_rb(), that adds relations given
+    the qgroup structures, and make the old function use it. This lets
+    btrfs_add_qgroup_relation() work without the double check by calling
+    the new function.
+
+    Signed-off-by: Sidong Yang <realwakka@gmail.com>
+    Reviewed-by: David Sterba <dsterba@suse.com>
+    [ add comments ]
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 843263d23fb3348d562a0e410d3e8e552e829ef3
+Author: Dāvis Mosāns <davispuh@gmail.com>
+Date:   Wed Feb 2 23:44:54 2022 +0200
+
+    btrfs: add lzo workspace buffer length constants
+
+    This makes the length checking more readable, and the constants are
+    used repeatedly.
+
+    Signed-off-by: Dāvis Mosāns <davispuh@gmail.com>
+    Reviewed-by: David Sterba <dsterba@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit b1794acfaaef72cc21c3ec3f92d63b1da0842f54
+Author: Dāvis Mosāns <davispuh@gmail.com>
+Date:   Wed Feb 2 23:44:55 2022 +0200
+
+    btrfs: prevent copying too big compressed lzo segment
+
+    The compressed length can be corrupted to be a lot larger than the
+    memory we have allocated for the buffer. This will cause the memcpy
+    in copy_compressed_segment() to write outside of the allocated
+    memory.
+
+    This mostly results in a stuck read syscall, but sometimes, when
+    using btrfs send, it can trigger a #GP:
+
+      kernel: general protection fault, probably for non-canonical address 0x841551d5c1000: 0000 [#1] PREEMPT SMP NOPTI
+      kernel: CPU: 17 PID: 264 Comm: kworker/u256:7 Tainted: P           OE     5.17.0-rc2-1 #12
+      kernel: Workqueue: btrfs-endio btrfs_work_helper [btrfs]
+      kernel: RIP: 0010:lzo_decompress_bio (./include/linux/fortify-string.h:225 fs/btrfs/lzo.c:322 fs/btrfs/lzo.c:394) btrfs
+      Code starting with the faulting instruction
+      ===========================================
+         0:* 48 8b 06              mov    (%rsi),%rax     <-- trapping instruction
+         3:  48 8d 79 08           lea    0x8(%rcx),%rdi
+         7:  48 83 e7 f8           and    $0xfffffffffffffff8,%rdi
+         b:  48 89 01              mov    %rax,(%rcx)
+         e:  44 89 f0              mov    %r14d,%eax
+        11:  48 8b 54 06 f8        mov    -0x8(%rsi,%rax,1),%rdx
+      kernel: RSP: 0018:ffffb110812efd50 EFLAGS: 00010212
+      kernel: RAX: 0000000000001000 RBX: 000000009ca264c8 RCX: ffff98996e6d8ff8
+      kernel: RDX: 0000000000000064 RSI: 000841551d5c1000 RDI: ffffffff9500435d
+      kernel: RBP: ffff989a3be856c0 R08: 0000000000000000 R09: 0000000000000000
+      kernel: R10: 0000000000000000 R11: 0000000000001000 R12: ffff98996e6d8000
+      kernel: R13: 0000000000000008 R14: 0000000000001000 R15: 000841551d5c1000
+      kernel: FS:  0000000000000000(0000) GS:ffff98a09d640000(0000) knlGS:0000000000000000
+      kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+      kernel: CR2: 00001e9f984d9ea8 CR3: 000000014971a000 CR4: 00000000003506e0
+      kernel: Call Trace:
+      kernel:  <TASK>
+      kernel:  end_compressed_bio_read (fs/btrfs/compression.c:104 fs/btrfs/compression.c:1363 fs/btrfs/compression.c:323) btrfs
+      kernel:  end_workqueue_fn (fs/btrfs/disk-io.c:1923) btrfs
+      kernel:  btrfs_work_helper (fs/btrfs/async-thread.c:326) btrfs
+      kernel:  process_one_work (./arch/x86/include/asm/jump_label.h:27 ./include/linux/jump_label.h:212 ./include/trace/events/workqueue.h:108 kernel/workqueue.c:2312)
+      kernel:  worker_thread (./include/linux/list.h:292 kernel/workqueue.c:2455)
+      kernel:  ? process_one_work (kernel/workqueue.c:2397)
+      kernel:  kthread (kernel/kthread.c:377)
+      kernel:  ? kthread_complete_and_exit (kernel/kthread.c:332)
+      kernel:  ret_from_fork (arch/x86/entry/entry_64.S:301)
+      kernel:  </TASK>
+
+    CC: stable@vger.kernel.org # 4.9+
+    Signed-off-by: Dāvis Mosāns <davispuh@gmail.com>
+    Reviewed-by: David Sterba <dsterba@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 09ad560a431c83f3741f1e545924c7fbb8957dd4
+Author: Dāvis Mosāns <davispuh@gmail.com>
+Date:   Sat Feb 5 20:48:23 2022 +0200
+
+    btrfs: send: log the error in case of IO failure
+
+    Currently, if we get an IO error while doing a send, we abort without
+    logging which file caused the issue. Log it to help with debugging.
+
+    CC: stable@vger.kernel.org # 4.9+
+    Signed-off-by: Dāvis Mosāns <davispuh@gmail.com>
+    Reviewed-by: David Sterba <dsterba@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 43b4cb906eef17f9a7ca8e660f3e9e44176082f6
+Author: Filipe Manana <fdmanana@suse.com>
+Date:   Wed Feb 2 15:26:09 2022 +0000
+
+    btrfs: get rid of warning on transaction commit when using flushoncommit
+
+    When using the flushoncommit mount option, during almost every
+    transaction commit we trigger a warning from
+    __writeback_inodes_sb_nr():
+
+      $ cat fs/fs-writeback.c:
+      (...)
+      static void __writeback_inodes_sb_nr(struct super_block *sb, ...
+      {
+      (...)
+              WARN_ON(!rwsem_is_locked(&sb->s_umount));
+      (...)
+      }
+      (...)
+
+    The trace produced in dmesg looks like the following:
+
+      [947.473890] WARNING: CPU: 5 PID: 930 at fs/fs-writeback.c:2610 __writeback_inodes_sb_nr+0x7e/0xb3
+      [947.481623] Modules linked in: nfsd nls_cp437 cifs asn1_decoder cifs_arc4 fscache cifs_md4 ipmi_ssif
+      [947.489571] CPU: 5 PID: 930 Comm: btrfs-transacti Not tainted 5.16.3-srb-asrock-00001-g36437ad63879 #186
+      [947.497969] RIP: 0010:__writeback_inodes_sb_nr+0x7e/0xb3
+      [947.502097] Code: 24 10 4c 89 44 24 18 c6 (...)
+      [947.519760] RSP: 0018:ffffc90000777e10 EFLAGS: 00010246
+      [947.523818] RAX: 0000000000000000 RBX: 0000000000963300 RCX: 0000000000000000
+      [947.529765] RDX: 0000000000000000 RSI: 000000000000fa51 RDI: ffffc90000777e50
+      [947.535740] RBP: ffff888101628a90 R08: ffff888100955800 R09: ffff888100956000
+      [947.541701] R10: 0000000000000002 R11: 0000000000000001 R12: ffff888100963488
+      [947.547645] R13: ffff888100963000 R14: ffff888112fb7200 R15: ffff888100963460
+      [947.553621] FS:  0000000000000000(0000) GS:ffff88841fd40000(0000) knlGS:0000000000000000
+      [947.560537] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+      [947.565122] CR2: 0000000008be50c4 CR3: 000000000220c000 CR4: 00000000001006e0
+      [947.571072] Call Trace:
+      [947.572354]  <TASK>
+      [947.573266]  btrfs_commit_transaction+0x1f1/0x998
+      [947.576785]  ? start_transaction+0x3ab/0x44e
+      [947.579867]  ? schedule_timeout+0x8a/0xdd
+      [947.582716]  transaction_kthread+0xe9/0x156
+      [947.585721]  ? btrfs_cleanup_transaction.isra.0+0x407/0x407
+      [947.590104]  kthread+0x131/0x139
+      [947.592168]  ? set_kthread_struct+0x32/0x32
+      [947.595174]  ret_from_fork+0x22/0x30
+      [947.597561]  </TASK>
+      [947.598553] ---[ end trace 644721052755541c ]---
+
+    This is because we started using writeback_inodes_sb() to flush
+    delalloc when committing a transaction (when using -o flushoncommit)
+    in order to avoid deadlocks with filesystem freeze operations. This
+    change was made by commit ce8ea7cc6eb313 ("btrfs: don't call
+    btrfs_start_delalloc_roots in flushoncommit"). After that change we
+    started producing that warning, and every now and then a user
+    reports it, since the warning happens too often, spams dmesg/syslog,
+    and the user is unsure whether it reflects a problem that might
+    compromise the filesystem's reliability.
+
+    We cannot just lock the sb->s_umount semaphore before calling
+    writeback_inodes_sb(), because that would at least deadlock with
+    filesystem freezing, since at fs/super.c:freeze_super()
+    sync_filesystem() is called while we are holding that semaphore in
+    write mode, and that can trigger a transaction commit, resulting in
+    a deadlock. It would also trigger the same type of deadlock in the
+    unmount path. Possibly, it could also introduce some other locking
+    dependencies that lockdep would report.
+
+    To fix this, call try_to_writeback_inodes_sb() instead of
+    writeback_inodes_sb(): it will try to read lock sb->s_umount and
+    then only call writeback_inodes_sb() if it was able to lock it. This
+    is fine because the only cases where it can't read lock sb->s_umount
+    are during a filesystem unmount or a filesystem freeze; in those
+    cases sb->s_umount is write locked and sync_filesystem() is called,
+    which calls writeback_inodes_sb(). In other words, in all cases
+    where we can't take a read lock on sb->s_umount, writeback is
+    already being triggered elsewhere.
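The try-lock idea at the heart of that fix can be sketched in userspace C, with a pthread rwlock standing in for sb->s_umount. This is an editorial illustration, not the kernel implementation:

    #include <pthread.h>

    static pthread_rwlock_t s_umount = PTHREAD_RWLOCK_INITIALIZER;

    static void writeback_inodes(void)
    {
        /* ... flush dirty inodes / delalloc here ... */
    }

    static void try_to_writeback_inodes(void)
    {
        /* Only flush if no freeze/unmount holds the lock in write
         * mode; otherwise sync_filesystem() will do the writeback. */
        if (pthread_rwlock_tryrdlock(&s_umount) == 0) {
            writeback_inodes();
            pthread_rwlock_unlock(&s_umount);
        }
    }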
+
+    An alternative would be to call btrfs_start_delalloc_roots() with a
+    number of pages different from LONG_MAX, for example matching the
+    number of delalloc bytes we currently have, in which case we would
+    end up starting all delalloc with filemap_fdatawrite_wbc() and not
+    with an async flush via filemap_flush(); that is only possible
+    after the rather recent commit e076ab2a2ca70a ("btrfs: shrink
+    delalloc pages instead of full inodes"). However, that creates a
+    whole new can of worms due to new lock dependencies, about which
+    lockdep complains, like for example:
+
+      [ 8948.247280] ======================================================
+      [ 8948.247823] WARNING: possible circular locking dependency detected
+      [ 8948.248353] 5.17.0-rc1-btrfs-next-111 #1 Not tainted
+      [ 8948.248786] ------------------------------------------------------
+      [ 8948.249320] kworker/u16:18/933570 is trying to acquire lock:
+      [ 8948.249812] ffff9b3de1591690 (sb_internal#2){.+.+}-{0:0}, at: find_free_extent+0x141e/0x1590 [btrfs]
+      [ 8948.250638]
+                     but task is already holding lock:
+      [ 8948.251140] ffff9b3e09c717d8 (&root->delalloc_mutex){+.+.}-{3:3}, at: start_delalloc_inodes+0x78/0x400 [btrfs]
+      [ 8948.252018]
+                     which lock already depends on the new lock.
+
+      [ 8948.252710]
+                     the existing dependency chain (in reverse order) is:
+      [ 8948.253343]
+                     -> #2 (&root->delalloc_mutex){+.+.}-{3:3}:
+      [ 8948.253950]        __mutex_lock+0x90/0x900
+      [ 8948.254354]        start_delalloc_inodes+0x78/0x400 [btrfs]
+      [ 8948.254859]        btrfs_start_delalloc_roots+0x194/0x2a0 [btrfs]
+      [ 8948.255408]        btrfs_commit_transaction+0x32f/0xc00 [btrfs]
+      [ 8948.255942]        btrfs_mksubvol+0x380/0x570 [btrfs]
+      [ 8948.256406]        btrfs_mksnapshot+0x81/0xb0 [btrfs]
+      [ 8948.256870]        __btrfs_ioctl_snap_create+0x17f/0x190 [btrfs]
+      [ 8948.257413]        btrfs_ioctl_snap_create_v2+0xbb/0x140 [btrfs]
+      [ 8948.257961]        btrfs_ioctl+0x1196/0x3630 [btrfs]
+      [ 8948.258418]        __x64_sys_ioctl+0x83/0xb0
+      [ 8948.258793]        do_syscall_64+0x3b/0xc0
+      [ 8948.259146]        entry_SYSCALL_64_after_hwframe+0x44/0xae
+      [ 8948.259709]
+                     -> #1 (&fs_info->delalloc_root_mutex){+.+.}-{3:3}:
+      [ 8948.260330]        __mutex_lock+0x90/0x900
+      [ 8948.260692]        btrfs_start_delalloc_roots+0x97/0x2a0 [btrfs]
+      [ 8948.261234]        btrfs_commit_transaction+0x32f/0xc00 [btrfs]
+      [ 8948.261766]        btrfs_set_free_space_cache_v1_active+0x38/0x60 [btrfs]
+      [ 8948.262379]        btrfs_start_pre_rw_mount+0x119/0x180 [btrfs]
+      [ 8948.262909]        open_ctree+0x1511/0x171e [btrfs]
+      [ 8948.263359]        btrfs_mount_root.cold+0x12/0xde [btrfs]
+      [ 8948.263863]        legacy_get_tree+0x30/0x50
+      [ 8948.264242]        vfs_get_tree+0x28/0xc0
+      [ 8948.264594]        vfs_kern_mount.part.0+0x71/0xb0
+      [ 8948.265017]        btrfs_mount+0x11d/0x3a0 [btrfs]
+      [ 8948.265462]        legacy_get_tree+0x30/0x50
+      [ 8948.265851]        vfs_get_tree+0x28/0xc0
+      [ 8948.266203]        path_mount+0x2d4/0xbe0
+      [ 8948.266554]        __x64_sys_mount+0x103/0x140
+      [ 8948.266940]        do_syscall_64+0x3b/0xc0
+      [ 8948.267300]        entry_SYSCALL_64_after_hwframe+0x44/0xae
+      [ 8948.267790]
+                     -> #0 (sb_internal#2){.+.+}-{0:0}:
+      [ 8948.268322]        __lock_acquire+0x12e8/0x2260
+      [ 8948.268733]        lock_acquire+0xd7/0x310
+      [ 8948.269092]        start_transaction+0x44c/0x6e0 [btrfs]
+      [ 8948.269591]        find_free_extent+0x141e/0x1590 [btrfs]
+      [ 8948.270087]        btrfs_reserve_extent+0x14b/0x280 [btrfs]
+      [ 8948.270588]        cow_file_range+0x17e/0x490 [btrfs]
+      [ 8948.271051]        btrfs_run_delalloc_range+0x345/0x7a0 [btrfs]
+      [ 8948.271586]        writepage_delalloc+0xb5/0x170 [btrfs]
+      [ 8948.272071]        __extent_writepage+0x156/0x3c0 [btrfs]
+      [ 8948.272579]        extent_write_cache_pages+0x263/0x460 [btrfs]
+      [ 8948.273113]        extent_writepages+0x76/0x130 [btrfs]
+      [ 8948.273573]        do_writepages+0xd2/0x1c0
+      [ 8948.273942]        filemap_fdatawrite_wbc+0x68/0x90
+      [ 8948.274371]        start_delalloc_inodes+0x17f/0x400 [btrfs]
+      [ 8948.274876]        btrfs_start_delalloc_roots+0x194/0x2a0 [btrfs]
+      [ 8948.275417]        flush_space+0x1f2/0x630 [btrfs]
+      [ 8948.275863]        btrfs_async_reclaim_data_space+0x108/0x1b0 [btrfs]
+      [ 8948.276438]        process_one_work+0x252/0x5a0
+      [ 8948.276829]        worker_thread+0x55/0x3b0
+      [ 8948.277189]        kthread+0xf2/0x120
+      [ 8948.277506]        ret_from_fork+0x22/0x30
+      [ 8948.277868]
+                     other info that might help us debug this:
+
+      [ 8948.278548] Chain exists of:
+                       sb_internal#2 --> &fs_info->delalloc_root_mutex --> &root->delalloc_mutex
+
+      [ 8948.279601]  Possible unsafe locking scenario:
+
+      [ 8948.280102]        CPU0                    CPU1
+      [ 8948.280508]        ----                    ----
+      [ 8948.280915]   lock(&root->delalloc_mutex);
+      [ 8948.281271]                                lock(&fs_info->delalloc_root_mutex);
+      [ 8948.281915]                                lock(&root->delalloc_mutex);
+      [ 8948.282487]   lock(sb_internal#2);
+      [ 8948.282800]
+                      *** DEADLOCK ***
+
+      [ 8948.283333] 4 locks held by kworker/u16:18/933570:
+      [ 8948.283750]  #0: ffff9b3dc00a9d48 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work+0x1d2/0x5a0
+      [ 8948.284609]  #1: ffffa90349dafe70 ((work_completion)(&fs_info->async_data_reclaim_work)){+.+.}-{0:0}, at: process_one_work+0x1d2/0x5a0
+      [ 8948.285637]  #2: ffff9b3e14db5040 (&fs_info->delalloc_root_mutex){+.+.}-{3:3}, at: btrfs_start_delalloc_roots+0x97/0x2a0 [btrfs]
+      [ 8948.286674]  #3: ffff9b3e09c717d8 (&root->delalloc_mutex){+.+.}-{3:3}, at: start_delalloc_inodes+0x78/0x400 [btrfs]
+      [ 8948.287596]
+                     stack backtrace:
+      [ 8948.287975] CPU: 3 PID: 933570 Comm: kworker/u16:18 Not tainted 5.17.0-rc1-btrfs-next-111 #1
+      [ 8948.288677] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
+      [ 8948.289649] Workqueue: events_unbound btrfs_async_reclaim_data_space [btrfs]
+      [ 8948.290298] Call Trace:
+      [ 8948.290517]  <TASK>
+      [ 8948.290700]  dump_stack_lvl+0x59/0x73
+      [ 8948.291026]  check_noncircular+0xf3/0x110
+      [ 8948.291375]  ? start_transaction+0x228/0x6e0 [btrfs]
+      [ 8948.291826]  __lock_acquire+0x12e8/0x2260
+      [ 8948.292241]  lock_acquire+0xd7/0x310
+      [ 8948.292714]  ? find_free_extent+0x141e/0x1590 [btrfs]
+      [ 8948.293241]  ? lock_is_held_type+0xea/0x140
+      [ 8948.293601]  start_transaction+0x44c/0x6e0 [btrfs]
+      [ 8948.294055]  ? find_free_extent+0x141e/0x1590 [btrfs]
+      [ 8948.294518]  find_free_extent+0x141e/0x1590 [btrfs]
+      [ 8948.294957]  ? _raw_spin_unlock+0x29/0x40
+      [ 8948.295312]  ? btrfs_get_alloc_profile+0x124/0x290 [btrfs]
+      [ 8948.295813]  btrfs_reserve_extent+0x14b/0x280 [btrfs]
+      [ 8948.296270]  cow_file_range+0x17e/0x490 [btrfs]
+      [ 8948.296691]  btrfs_run_delalloc_range+0x345/0x7a0 [btrfs]
+      [ 8948.297175]  ? find_lock_delalloc_range+0x247/0x270 [btrfs]
+      [ 8948.297678]  writepage_delalloc+0xb5/0x170 [btrfs]
+      [ 8948.298123]  __extent_writepage+0x156/0x3c0 [btrfs]
+      [ 8948.298570]  extent_write_cache_pages+0x263/0x460 [btrfs]
+      [ 8948.299061]  extent_writepages+0x76/0x130 [btrfs]
+      [ 8948.299495]  do_writepages+0xd2/0x1c0
+      [ 8948.299817]  ? sched_clock_cpu+0xd/0x110
+      [ 8948.300160]  ? lock_release+0x155/0x4a0
+      [ 8948.300494]  filemap_fdatawrite_wbc+0x68/0x90
+      [ 8948.300874]  ? do_raw_spin_unlock+0x4b/0xa0
+      [ 8948.301243]  start_delalloc_inodes+0x17f/0x400 [btrfs]
+      [ 8948.301706]  ? lock_release+0x155/0x4a0
+      [ 8948.302055]  btrfs_start_delalloc_roots+0x194/0x2a0 [btrfs]
+      [ 8948.302564]  flush_space+0x1f2/0x630 [btrfs]
+      [ 8948.302970]  btrfs_async_reclaim_data_space+0x108/0x1b0 [btrfs]
+      [ 8948.303510]  process_one_work+0x252/0x5a0
+      [ 8948.303860]  ? process_one_work+0x5a0/0x5a0
+      [ 8948.304221]  worker_thread+0x55/0x3b0
+      [ 8948.304543]  ? process_one_work+0x5a0/0x5a0
+      [ 8948.304904]  kthread+0xf2/0x120
+      [ 8948.305184]  ? kthread_complete_and_exit+0x20/0x20
+      [ 8948.305598]  ret_from_fork+0x22/0x30
+      [ 8948.305921]  </TASK>
+
+    It all comes from the fact that btrfs_start_delalloc_roots() takes
+    the delalloc_root_mutex while, in the transaction commit path, we
+    are holding a read lock on one of the superblock's freeze semaphores
+    (via sb_start_intwrite()). The async reclaim task can also call
+    btrfs_start_delalloc_roots(), which ends up triggering writeback
+    with calls to filemap_fdatawrite_wbc(), resulting in extent
+    allocation, which in turn can call btrfs_start_transaction(), which
+    takes the freeze semaphore via sb_start_intwrite(), forming a nasty
+    dependency on all those locks, which can be taken in different
+    orders by different code paths.
+
+    So just adopt the simple approach of calling
+    try_to_writeback_inodes_sb() at btrfs_start_delalloc_flush().
+
+    Link: https://lore.kernel.org/linux-btrfs/20220130005258.GA7465@cuci.nl/
+    Link: https://lore.kernel.org/linux-btrfs/43acc426-d683-d1b6-729d-c6bc4a2fff4d@gmail.com/
+    Link: https://lore.kernel.org/linux-btrfs/6833930a-08d7-6fbc-0141-eb9cdfd6bb4d@gmail.com/
+    Link: https://lore.kernel.org/linux-btrfs/20190322041731.GF16651@hungrycats.org/
+    Reviewed-by: Omar Sandoval <osandov@fb.com>
+    Signed-off-by: Filipe Manana <fdmanana@suse.com>
+    [ add more link reports ]
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit b805349fbdc9a47199d96bc193f64b9399ec6761
+Author: Qu Wenruo <wqu@suse.com>
+Date:   Tue Feb 8 14:54:05 2022 +0800
+
+    btrfs: defrag: don't try to defrag extents which are under writeback
+
+    Once we start writeback (have called btrfs_run_delalloc_range()), we
+    allocate an extent, create an extent map pointing to that extent
+    with a generation of (u64)-1, create the ordered extent, and then
+    clear the DELALLOC bit from the range in the inode's io tree.
+
+    Such an extent map can pass the first call of
+    defrag_collect_targets(), as its generation of (u64)-1 meets any
+    possible minimal generation check. And the range no longer has the
+    DELALLOC bit, so it also passes the DELALLOC bit check.
+
+    It will only be re-checked in the second call of
+    defrag_collect_targets(), which will wait for writeback.
+
+    But at that stage we have already spent our time waiting for some IO
+    we may or may not want to defrag.
+
+    Let's reject such extents early so we won't waste our time.
+
+    Reviewed-by: Filipe Manana <fdmanana@suse.com>
+    Signed-off-by: Qu Wenruo <wqu@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit e8a7717c71287a11dc81098199d7116d6a4b6006
+Author: Qu Wenruo <wqu@suse.com>
+Date:   Tue Feb 8 13:31:19 2022 +0800
+
+    btrfs: populate extent_map::generation when reading from disk
+
+    When btrfs_get_extent() tries to get some file extent from disk, it
+    never populates extent_map::generation, leaving the value at 0.
+
+    On the other hand, an extent map generated by IO will get its
+    generation properly set at finish_ordered_io():
+
+      finish_ordered_io()
+      |- unpin_extent_cache(gen = trans->transid)
+         |- em->generation = gen;
+
+    [CAUSE]
+    extent_map::generation is mostly used by the fsync code, which only
+    cares about modified extents, and those all have em::generation > 0.
+
+    Thus it's fine for fsync not to populate extent maps read from disk.
+
+    [CORNER CASE]
+    However autodefrag also relies on em::generation to determine if one
+    extent needs to be defragged.
+
+    This unpopulated extent_map::generation can prevent the following
+    autodefrag case from working:
+
+      mkfs.btrfs -f $dev
+      mount $dev $mnt -o autodefrag
+
+      # initial write to queue the inode for autodefrag
+      xfs_io -f -c "pwrite 0 4k" $mnt/file
+      sync
+
+      # Real fragmented write
+      xfs_io -f -s -c "pwrite -b 4096 0 32k" $mnt/file
+      sync
+      echo "=== before autodefrag ==="
+      xfs_io -c "fiemap -v" $mnt/file
+
+      # Drop cache to force the em to be read from disk
+      echo 3 > /proc/sys/vm/drop_caches
+      mount -o remount,commit=1 $mnt
+      sleep 3
+      sync
+
+      echo "=== After autodefrag ==="
+      xfs_io -c "fiemap -v" $mnt/file
+      umount $mnt
+
+    The result looks like this:
+
+      === before autodefrag ===
+      /mnt/btrfs/file:
+       EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
+         0: [0..15]:         26672..26687        16   0x0
+         1: [16..31]:        26656..26671        16   0x0
+         2: [32..47]:        26640..26655        16   0x0
+         3: [48..63]:        26624..26639        16   0x1
+      === After autodefrag ===
+      /mnt/btrfs/file:
+       EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
+         0: [0..15]:         26672..26687        16   0x0
+         1: [16..31]:        26656..26671        16   0x0
+         2: [32..47]:        26640..26655        16   0x0
+         3: [48..63]:        26624..26639        16   0x1
+
+    The fragmented 32K is not defragged by autodefrag.
+
+    [FIX]
+    To make things less weird, just populate extent_map::generation when
+    reading file extents from disk.
+
+    This makes the above fragmented extents get properly defragged:
+
+      === before autodefrag ===
+      /mnt/btrfs/file:
+       EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
+         0: [0..15]:         26672..26687        16   0x0
+         1: [16..31]:        26656..26671        16   0x0
+         2: [32..47]:        26640..26655        16   0x0
+         3: [48..63]:        26624..26639        16   0x1
+      === After autodefrag ===
+      /mnt/btrfs/file:
+       EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
+         0: [0..63]:         26688..26751        64   0x1
+
+    Reviewed-by: Filipe Manana <fdmanana@suse.com>
+    Signed-off-by: Qu Wenruo <wqu@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit df6d916f35305a85d6636256fbc9708a78df7465
+Author: Filipe Manana <fdmanana@suse.com>
+Date:   Thu Feb 3 14:55:50 2022 +0000
+
+    btrfs: prepare extents to be logged before locking a log tree path
+
+    When we want to log an extent, in the fast fsync path, we obtain a
+    path to the leaf that will hold the file extent item either through
+    a deletion search, via btrfs_drop_extents(), or through an insertion
+    search using btrfs_insert_empty_item(). After that we fill the file
+    extent item's fields one by one directly on the leaf.
+
+    Instead of doing that, we could prepare the file extent item before
+    obtaining a btree path, and then copy the prepared extent item with
+    a single operation once we get the path. This helps avoid some
+    contention on the log tree, since we are otherwise holding write
+    locks for longer than necessary, especially in the case where the
+    path is obtained via btrfs_drop_extents() through a deletion search,
+    which always keeps a write lock on the nodes at levels 1 and 2
+    (besides the leaf).
+
+    This change does that: we prepare the file extent item that is
+    going to be inserted before acquiring a path, and then copy it into
+    a leaf using a single copy operation once we get the path.
+
+    This change is part of a patchset comprised of the following
+    patches:
+
+      1/6 btrfs: remove unnecessary leaf free space checks when pushing items
+      2/6 btrfs: avoid unnecessary COW of leaves when deleting items from a leaf
+      3/6 btrfs: avoid unnecessary computation when deleting items from a leaf
+      4/6 btrfs: remove constraint on number of visited leaves when replacing extents
+      5/6 btrfs: remove useless path release in the fast fsync path
+      6/6 btrfs: prepare extents to be logged before locking a log tree path
+
+    The following test was run to measure the impact of the whole
+    patchset:
+
+      $ cat test.sh
+      #!/bin/bash
+
+      DEV=/dev/sdi
+      MNT=/mnt/sdi
+      MOUNT_OPTIONS="-o ssd"
+      MKFS_OPTIONS="-R free-space-tree -O no-holes"
+
+      NUM_JOBS=8
+      FILE_SIZE=128M
+      RUN_TIME=200
+
+      cat <<EOF > /tmp/fio-job.ini
+      [writers]
+      rw=randwrite
+      fsync=1
+      fallocate=none
+      group_reporting=1
+      direct=0
+      bssplit=4k/20:8k/20:16k/20:32k/10:64k/10:128k/5:256k/5:512k/5:1m/5
+      ioengine=sync
+      filesize=$FILE_SIZE
+      runtime=$RUN_TIME
+      time_based
+      directory=$MNT
+      numjobs=$NUM_JOBS
+      thread
+      EOF
+
+      echo "performance" | \
+          tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
+
+      echo
+      echo "Using config:"
+      echo
+      cat /tmp/fio-job.ini
+      echo
+
+      umount $MNT &> /dev/null
+      mkfs.btrfs -f $MKFS_OPTIONS $DEV
+      mount $MOUNT_OPTIONS $DEV $MNT
+
+      fio /tmp/fio-job.ini
+
+      umount $MNT
+
+    The test ran inside a VM (8 cores, 32G of RAM) with the target disk
+    mapping to a raw NVMe device, and using a non-debug kernel config
+    (Debian's default config).
+
+    Before the patchset:
+
+      WRITE: bw=116MiB/s (122MB/s), 116MiB/s-116MiB/s (122MB/s-122MB/s), io=22.7GiB (24.4GB), run=200013-200013msec
+
+    After the patchset:
+
+      WRITE: bw=125MiB/s (131MB/s), 125MiB/s-125MiB/s (131MB/s-131MB/s), io=24.3GiB (26.1GB), run=200007-200007msec
+
+    That is a 7.8% gain in throughput and 7.0% more IO done in the same
+    period of time (200 seconds).
+
+    Signed-off-by: Filipe Manana <fdmanana@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 901ebe7172fa1fd03e4cc43d9d5f6a191d2e6428
+Author: Filipe Manana <fdmanana@suse.com>
+Date:   Thu Feb 3 14:55:49 2022 +0000
+
+    btrfs: remove useless path release in the fast fsync path
+
+    There's no point in calling btrfs_release_path() after finishing the
+    loop that logs the modified extents, since log_one_extent() returns
+    with the path released. In case the list of extents is empty, the
+    path is already released, so there's no need for it in that case
+    either. So just remove that unnecessary btrfs_release_path() call.
+
+    This change is part of a patchset comprised of the following
+    patches:
+
+      1/6 btrfs: remove unnecessary leaf free space checks when pushing items
+      2/6 btrfs: avoid unnecessary COW of leaves when deleting items from a leaf
+      3/6 btrfs: avoid unnecessary computation when deleting items from a leaf
+      4/6 btrfs: remove constraint on number of visited leaves when replacing extents
+      5/6 btrfs: remove useless path release in the fast fsync path
+      6/6 btrfs: prepare extents to be logged before locking a log tree path
+
+    The last patch in the series has some performance test results in
+    its changelog.
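The prepare-before-locking idea from patch 6/6 above boils down to shrinking the critical section to a single copy. A minimal userspace sketch of that pattern (hypothetical names and types, not the btrfs code):

    #include <pthread.h>
    #include <stdint.h>
    #include <string.h>

    struct file_extent_item {      /* simplified stand-in */
        uint64_t disk_bytenr, disk_num_bytes, offset, num_bytes;
    };

    static pthread_mutex_t leaf_lock = PTHREAD_MUTEX_INITIALIZER;

    static void log_extent(void *leaf_dest,
                           const struct file_extent_item *prepared)
    {
        /* The item was fully filled in before taking the lock, so the
         * critical section is one memcpy instead of many field writes
         * performed while holding btree locks. */
        pthread_mutex_lock(&leaf_lock);
        memcpy(leaf_dest, prepared, sizeof(*prepared));
        pthread_mutex_unlock(&leaf_lock);
    }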
+
+    Signed-off-by: Filipe Manana <fdmanana@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 12d0362b8dad82e240e37aa43f7d344f7206c009
+Author: Filipe Manana <fdmanana@suse.com>
+Date:   Thu Feb 3 14:55:48 2022 +0000
+
+    btrfs: remove constraint on number of visited leaves when replacing extents
+
+    At btrfs_drop_extents(), we try to replace a range of file extent
+    items with a new file extent in a single btree search, to avoid the
+    need to do a search for deletion, followed by a path release,
+    followed by yet another search for insertion.
+
+    When I originally added that optimization, in commit 1acae57b161ef1
+    ("Btrfs: faster file extent item replace operations"), I left a
+    constraint to do the fast replace only if we visited a single leaf.
+    That was because in the most common case we find all file extent
+    items that need to be deleted (or trimmed) in a single leaf.
+    However, it can also work for other common cases, like when we need
+    to delete a few file extent items located at the end of a leaf and
+    a few more located at the beginning of the next leaf. The key for
+    the new file extent item is greater than the key of any deleted or
+    trimmed file extent item from previous leaves, so we are fine to use
+    the last leaf that we found as long as we are holding a write lock
+    on it; even if the new key ends up at slot 0, the btree search has
+    obtained a write lock on any upper nodes that need to have a key
+    pointer updated.
+
+    So remove the constraint that limits the optimization to the case
+    where we visited only a single leaf.
+
+    This change is part of a patchset comprised of the following
+    patches:
+
+      1/6 btrfs: remove unnecessary leaf free space checks when pushing items
+      2/6 btrfs: avoid unnecessary COW of leaves when deleting items from a leaf
+      3/6 btrfs: avoid unnecessary computation when deleting items from a leaf
+      4/6 btrfs: remove constraint on number of visited leaves when replacing extents
+      5/6 btrfs: remove useless path release in the fast fsync path
+      6/6 btrfs: prepare extents to be logged before locking a log tree path
+
+    The last patch in the series has some performance test results in
+    its changelog.
+
+    Signed-off-by: Filipe Manana <fdmanana@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 967aa565ee63760ea7f1c00be743f8e24ee83aa6
+Author: Filipe Manana <fdmanana@suse.com>
+Date:   Thu Feb 3 14:55:47 2022 +0000
+
+    btrfs: avoid unnecessary computation when deleting items from a leaf
+
+    When deleting items from a leaf, we always compute the sum of the
+    data sizes of the items that are going to be deleted. However we
+    only use that sum when the last item to delete is behind the last
+    item in the leaf. This unnecessarily wastes CPU time when we are
+    deleting either the whole leaf or from some slot > 0 up to the last
+    item in the leaf, and both of these cases are common (e.g. a
+    truncation operation, either as a result of truncate(2) or when
+    logging inodes, deleting checksums after removing a large enough
+    extent, etc).
+
+    So compute the sum of the data sizes only if the last item to be
+    deleted does not match the last item in the leaf.
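The optimization in that last commit is essentially a guard around the summation. A simplified C sketch with a hypothetical leaf layout, not the kernel structures:

    #include <stdint.h>

    struct leaf {
        uint32_t nritems;
        uint32_t item_size[256];
    };

    /* Sum the data sizes of items [slot, slot + nr) only when some item
     * data actually has to be moved, i.e. when the deletion does not
     * already end at the last item of the leaf. */
    static uint32_t del_data_size(const struct leaf *l, uint32_t slot,
                                  uint32_t nr)
    {
        uint32_t sum = 0;

        if (slot + nr == l->nritems)
            return 0; /* nothing after the deleted range: sum unused */
        for (uint32_t i = slot; i < slot + nr; i++)
            sum += l->item_size[i];
        return sum;
    }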
+
+    This change is part of a patchset comprised of the following
+    patches:
+
+      1/6 btrfs: remove unnecessary leaf free space checks when pushing items
+      2/6 btrfs: avoid unnecessary COW of leaves when deleting items from a leaf
+      3/6 btrfs: avoid unnecessary computation when deleting items from a leaf
+      4/6 btrfs: remove constraint on number of visited leaves when replacing extents
+      5/6 btrfs: remove useless path release in the fast fsync path
+      6/6 btrfs: prepare extents to be logged before locking a log tree path
+
+    The last patch in the series has some performance test results in
+    its changelog.
+
+    Signed-off-by: Filipe Manana <fdmanana@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 4fd3e4a94b70dfab57ce617a2c8196a77e8cc29d
+Author: Filipe Manana <fdmanana@suse.com>
+Date:   Thu Feb 3 14:55:46 2022 +0000
+
+    btrfs: avoid unnecessary COW of leaves when deleting items from a leaf
+
+    When we delete items from a leaf, if we end up with more than two
+    thirds of unused leaf space, we try to delete the leaf by moving all
+    its items into its left and right neighbour leaves. Sometimes that
+    is not possible because there is not enough free space in the left
+    and right leaves, and in that case we end up not deleting our leaf.
+
+    The way we are doing this is not ideal and can be improved in the
+    following ways:
+
+    1) When we call push_leaf_left(), we pass a value of 1 byte to its
+       data size parameter. This is not a realistic value because no
+       item can have a size less than 25 bytes, which is the size of
+       struct btrfs_item. This means that if the left leaf does not
+       have enough free space to push even one item, we end up COWing
+       it even though we do not change its content at all.
+
+       COWing that leaf means allocating a new metadata extent, marking
+       it dirty and doing more IO when committing a transaction or when
+       syncing a log tree. For the log tree case it's particularly
+       important to avoid the useless COW operation, as more IO can
+       imply higher latency for an fsync operation.
+
+       So instead of passing 1 as the minimum data size for
+       push_leaf_left(), pass the size of the first item in our leaf,
+       as we don't want to COW the left leaf if we can't push at least
+       the first item of our leaf;
+
+    2) When we call push_leaf_right(), we also pass a value of 1 byte
+       as the data size parameter. Like the previous case, this will
+       result in COWing the right leaf even if we are not able to move
+       any items into it, since there can't be any item with a size
+       smaller than 25 bytes (the size of struct btrfs_item).
+
+       So instead of passing 1 as the minimum data size to
+       push_leaf_right(), pass a size that corresponds to the sum of
+       the sizes of all the remaining items in our leaf. We are not
+       interested in moving less than that, because if we do, we are
+       not able to delete our leaf and we have COWed the right leaf
+       for nothing. Plus, moving only some of the items of our leaf
+       means an even less balanced tree.
+
+       Just like the previous case, we want to avoid the useless COW
+       of the right leaf; this way we don't have to spend time
+       allocating a new metadata extent and doing more IO when
+       committing a transaction or syncing a log tree. For the log
+       tree case it's especially important, because more IO can result
+       in higher latency for an fsync operation.
+
+    So adjust the minimum data size passed to push_leaf_left() and
+    push_leaf_right() as mentioned above.
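Those two adjusted minimums can be expressed compactly. A hedged C sketch with hypothetical types (the real push_leaf_left()/push_leaf_right() receive these values through their data size parameter):

    #include <stdint.h>

    struct leaf {
        uint32_t nritems;
        uint32_t item_size[256];
    };

    /* Pushing left is pointless unless at least our first item fits. */
    static uint32_t min_push_left(const struct leaf *l)
    {
        return l->item_size[0];
    }

    /* To delete the leaf we must move *all* remaining items right, so
     * anything less than the full sum is not worth a COW. */
    static uint32_t min_push_right(const struct leaf *l, uint32_t from)
    {
        uint32_t sum = 0;

        for (uint32_t i = from; i < l->nritems; i++)
            sum += l->item_size[i];
        return sum;
    }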
+
+    This change is part of a patchset comprised of the following
+    patches:
+
+      1/6 btrfs: remove unnecessary leaf free space checks when pushing items
+      2/6 btrfs: avoid unnecessary COW of leaves when deleting items from a leaf
+      3/6 btrfs: avoid unnecessary computation when deleting items from a leaf
+      4/6 btrfs: remove constraint on number of visited leaves when replacing extents
+      5/6 btrfs: remove useless path release in the fast fsync path
+      6/6 btrfs: prepare extents to be logged before locking a log tree path
+
+    Not being able to delete a leaf that became less than 1/3 full after
+    deleting items from it is actually common. For example, for the fio
+    test mentioned in the changelog of patch 6/6, we are only able to
+    delete a leaf at btrfs_del_items() about 5.3% of the time, due to
+    its left and right neighbour leaves not having enough free space to
+    push all the remaining items into them.
+
+    The last patch in the series has some performance test results in
+    its changelog.
+
+    Signed-off-by: Filipe Manana <fdmanana@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 1b956d9978687f9d463a39c0d66a5eab958b9f3a
+Author: Filipe Manana <fdmanana@suse.com>
+Date:   Thu Feb 3 14:55:45 2022 +0000
+
+    btrfs: remove unnecessary leaf free space checks when pushing items
+
+    When trying to push items from a leaf into its left and right
+    neighbours, we lock the left or right leaf, check if it has the
+    required minimum free space, COW the leaf and then check again if
+    it has the minimum required free space. This second check is
+    pointless:
+
+    1) First and foremost, it's not needed. We have a write lock on the
+       leaf and on its parent node, so no one can come in and change
+       either the pre-COW or post-COW version of the leaf for the whole
+       duration of the push_leaf_left() and push_leaf_right() calls;
+
+    2) The call to btrfs_leaf_free_space() is not trivial: it has a
+       fair amount of arithmetic and accesses fields in the leaf's
+       header and items, so it's not very cheap.
+
+    So remove the duplicated free space checks.
+
+    This change is part of a patchset comprised of the following
+    patches:
+
+      1/6 btrfs: remove unnecessary leaf free space checks when pushing items
+      2/6 btrfs: avoid unnecessary COW of leaves when deleting items from a leaf
+      3/6 btrfs: avoid unnecessary computation when deleting items from a leaf
+      4/6 btrfs: remove constraint on number of visited leaves when replacing extents
+      5/6 btrfs: remove useless path release in the fast fsync path
+      6/6 btrfs: prepare extents to be logged before locking a log tree path
+
+    The last patch in the series has some performance test results in
+    its changelog.
+
+    Signed-off-by: Filipe Manana <fdmanana@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 784a4d85814dea9bf1096e864d9368d032c0112a
+Author: David Sterba <dsterba@suse.com>
+Date:   Tue Feb 1 15:42:07 2022 +0100
+
+    btrfs: replace BUILD_BUG_ON by static_assert
+
+    The static_assert introduced in 6bab69c65013 ("build_bug.h: add
+    wrapper for _Static_assert") has been supported by compilers for a
+    long time (gcc 4.6, clang 3.0) and can be used in header files. We
+    don't need to put BUILD_BUG_ON into random functions; the assertion
+    can instead be kept next to the definition it guards.
+
+    The exception here is the UAPI header btrfs_tree.h, which could
+    potentially be included by userspace code where the static assert
+    is not defined (nor is it used in any other header).
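As a small self-contained example of the pattern this commit adopts: C11's static_assert (the _Static_assert keyword) can sit at file scope right next to the definition it guards, which is exactly what BUILD_BUG_ON inside a function body cannot do. Illustrative only; the struct below is made up:

    #include <assert.h>   /* C11 static_assert */
    #include <stdint.h>

    struct on_disk_key {          /* hypothetical on-disk structure */
        uint64_t objectid;
        uint8_t  type;
        uint64_t offset;
    } __attribute__((packed));

    /* Checked at compile time, right next to the definition. */
    static_assert(sizeof(struct on_disk_key) == 17,
                  "on-disk key must be exactly 17 bytes");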
+
+    Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 2e0a36dc72d2a11fad03763dbd2ac4da106cfd1f
+Author: Qu Wenruo <wqu@suse.com>
+Date:   Sun Jan 30 20:53:15 2022 +0800
+
+    btrfs: don't hold the CPU for too long when defragging a file
+
+    There is a user report about "btrfs filesystem defrag" causing a
+    120s timeout problem.
+
+    btrfs_defrag_file() will iterate over all file extents when called
+    from the defrag ioctl, so it can take a long time.
+
+    There is no reason not to release the CPU during such a long
+    operation.
+
+    Add cond_resched() after defragging one cluster.
+
+    CC: stable@vger.kernel.org # 5.16
+    Link: https://lore.kernel.org/linux-btrfs/10e51417-2203-f0a4-2021-86c8511cc367@gmx.com
+    Signed-off-by: Qu Wenruo <wqu@suse.com>
+    Reviewed-by: David Sterba <dsterba@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit cb792362499dacbdd3986d10ad109d0efd875eab
+Author: Josef Bacik <josef@toxicpanda.com>
+Date:   Fri Nov 5 16:45:28 2021 -0400
+
+    btrfs: rework async transaction committing
+
+    Currently we do this awful thing where we get another ref on a trans
+    handle, async off that handle and commit the transaction from that
+    work. Because we do this we have to mess with current->journal_info
+    and the freeze counting stuff.
+
+    We already have an async thing to kick for the transaction commit:
+    the transaction kthread. Replace this work struct with a flag on the
+    fs_info to tell the kthread to go ahead and commit even if it's
+    before our timeout. Then we can drastically simplify the async
+    transaction commit path.
+
+    Note: this can be simplified and functionality based on the pending
+    operation COMMIT.
+
+    Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+    [ add note ]
+    Reviewed-by: David Sterba <dsterba@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 42786287b2e3443e3a3c90d7305fdc9b4287f00b
+Author: Nikolay Borisov <nborisov@suse.com>
+Date:   Tue Nov 23 09:23:42 2021 +0200
+
+    btrfs: eliminate if in main loop in tree_search_offset
+
+    Reshuffle the code inside the first loop of tree_search_offset() so
+    that one if() is eliminated and the code becomes more linear.
+
+    Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+    Signed-off-by: Nikolay Borisov <nborisov@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit e8bbc534dbd1d0b0d17bf68d7615e644513e652a
+Author: Qu Wenruo <wqu@suse.com>
+Date:   Fri Nov 19 14:19:33 2021 +0800
+
+    btrfs: don't check stripe length if the profile is not stripe based
+
+    [BUG]
+    When debugging calc_bio_boundaries(), I found that even for RAID1
+    metadata we're following the stripe length to calculate the stripe
+    boundary.
+
+      # mkfs.btrfs -m raid1 -d raid1 /dev/test/scratch[12]
+      # mount /dev/test/scratch /mnt/btrfs
+      # xfs_io -f -c "pwrite 0 64K" /mnt/btrfs/file
+      # umount
+
+    These very basic operations make calc_bio_boundaries() report the
+    following result:
+
+      submit_extent_page: r/i=1/1 file_offset=22036480 len_to_stripe_boundary=49152
+      submit_extent_page: r/i=1/1 file_offset=30474240 len_to_stripe_boundary=65536
+      ...
+      submit_extent_page: r/i=1/1 file_offset=30523392 len_to_stripe_boundary=16384
+      submit_extent_page: r/i=1/1 file_offset=30457856 len_to_stripe_boundary=16384
+      submit_extent_page: r/i=5/257 file_offset=0 len_to_stripe_boundary=65536
+      submit_extent_page: r/i=5/257 file_offset=65536 len_to_stripe_boundary=65536
+      submit_extent_page: r/i=1/1 file_offset=30490624 len_to_stripe_boundary=49152
+      submit_extent_page: r/i=1/1 file_offset=30507008 len_to_stripe_boundary=32768
+
+    Here "r/i" is the rootid and the inode number; 1/1 means metadata.
+    The remaining names match the members used in the kernel.
+
+    Even though all data/metadata here uses RAID1, we're still following
+    the stripe length.
+
+    [CAUSE]
+    This behavior is caused by a wrong condition in
+    btrfs_get_io_geometry():
+
+      if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+              /* Fill using stripe_len */
+              len = min_t(u64, em->len - offset, max_len);
+      } else {
+              len = em->len - offset;
+      }
+
+    This means that only for SINGLE do we not follow stripe_len.
+
+    However, profiles like RAID1* and DUP don't need to bother with
+    stripe_len either.
+
+    This can lead to unnecessary bio splits for RAID1*/DUP profiles, and
+    can even be a blockage for future zoned RAID support.
+
+    [FIX]
+    Introduce a single-use macro, BTRFS_BLOCK_GROUP_STRIPE_MASK, and
+    change the condition to calculate the length using the stripe length
+    only for stripe-based profiles.
+
+    Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+    Reviewed-by: Anand Jain <anand.jain@oracle.com>
+    Signed-off-by: Qu Wenruo <wqu@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit b8cb209bdc55ae144881b2ae67dd36941813f970
+Author: Nikolay Borisov <nborisov@suse.com>
+Date:   Mon Nov 22 17:16:46 2021 +0200
+
+    btrfs: get next entry in tree_search_offset before doing checks
+
+    This is a small optimisation, since the current 'entry' is already
+    checked in the if () {} else if {} construct above the loop. In
+    essence, the first iteration of the final while loop is redundant.
+    To eliminate this extra check, simply get the next entry at the
+    beginning of the loop.
+
+    Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+    Signed-off-by: Nikolay Borisov <nborisov@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 0fe871af1b0056f912e654d5455486c9f76b0c5e
+Author: Josef Bacik <josef@toxicpanda.com>
+Date:   Thu Nov 18 16:33:15 2021 -0500
+
+    btrfs: index free space entries on size
+
+    Currently we index free space on offset only, because usually we
+    have a hint from the allocator that we want to honor for locality
+    reasons. However, if we fail to use this hint we have to go back to
+    a brute force search through the free space entries to find a large
+    enough extent.
+
+    With sufficiently fragmented free space this becomes quite
+    expensive, as we have to linearly search all of the free space
+    entries to find if we have a part that's long enough.
+
+    To fix this, add a cached rb tree indexed by free space entry bytes.
+    This will allow us to quickly look up the largest chunk in the free
+    space tree for this block group, and stop searching once we've
+    found an entry that is too small to satisfy our allocation. We
+    simply choose to use this tree if we're searching from the
+    beginning of the block group, as we know we do not care about
+    locality at that point.
+
+    I wrote an allocator test that creates a 10TiB RAM-backed null
+    block device and then fallocates random files until the file system
+    is full. I then go through and delete all of the odd files.
+
+    Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+    Reviewed-by: Anand Jain <anand.jain@oracle.com>
+    Signed-off-by: Qu Wenruo <wqu@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit b8cb209bdc55ae144881b2ae67dd36941813f970
+Author: Nikolay Borisov <nborisov@suse.com>
+Date:   Mon Nov 22 17:16:46 2021 +0200
+
+    btrfs: get next entry in tree_search_offset before doing checks
+
+    This is a small optimisation, since the current 'entry' is already
+    checked in the if () {} else if {} construct above the loop. In essence
+    the first iteration of the final while loop is redundant. To eliminate
+    this extra check, simply get the next entry at the beginning of the loop.
+
+    Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+    Signed-off-by: Nikolay Borisov <nborisov@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 0fe871af1b0056f912e654d5455486c9f76b0c5e
+Author: Josef Bacik <josef@toxicpanda.com>
+Date:   Thu Nov 18 16:33:15 2021 -0500
+
+    btrfs: index free space entries on size
+
+    Currently we index free space on offset only, because usually we have a
+    hint from the allocator that we want to honor for locality reasons.
+    However if we fail to use this hint we have to go back to a brute force
+    search through the free space entries to find a large enough extent.
+
+    With sufficiently fragmented free space this becomes quite expensive, as
+    we have to linearly search all of the free space entries to find if we
+    have a part that's long enough.
+
+    To fix this add a cached rb tree to index based on free space entry
+    bytes.  This will allow us to quickly look up the largest chunk in the
+    free space tree for this block group, and stop searching once we've
+    found an entry that is too small to satisfy our allocation.  We simply
+    choose to use this tree if we're searching from the beginning of the
+    block group, as we know we do not care about locality at that point.
+
+    I wrote an allocator test that creates a 10TiB ram backed null block
+    device and then fallocates random files until the file system is full.
+    I then go through and delete all of the odd files.  Then I spawn 8
+    threads that fallocate 64MiB files (1/2 our extent size cap) until the
+    file system is full again.  I use bcc's funclatency to measure the
+    latency of find_free_extent.  The baseline results are
+
+         nsecs               : count     distribution
+             0 -> 1          : 0        |                                        |
+             2 -> 3          : 0        |                                        |
+             4 -> 7          : 0        |                                        |
+             8 -> 15         : 0        |                                        |
+            16 -> 31         : 0        |                                        |
+            32 -> 63         : 0        |                                        |
+            64 -> 127        : 0        |                                        |
+           128 -> 255        : 0        |                                        |
+           256 -> 511        : 10356    |****                                    |
+           512 -> 1023       : 58242    |*************************               |
+          1024 -> 2047       : 74418    |********************************        |
+          2048 -> 4095       : 90393    |****************************************|
+          4096 -> 8191       : 79119    |***********************************     |
+          8192 -> 16383      : 35614    |***************                         |
+         16384 -> 32767      : 13418    |*****                                   |
+         32768 -> 65535      : 12811    |*****                                   |
+         65536 -> 131071     : 17090    |*******                                 |
+        131072 -> 262143     : 26465    |***********                             |
+        262144 -> 524287     : 40179    |*****************                       |
+        524288 -> 1048575    : 55469    |************************                |
+       1048576 -> 2097151    : 48807    |*********************                   |
+       2097152 -> 4194303    : 26744    |***********                             |
+       4194304 -> 8388607    : 35351    |***************                         |
+       8388608 -> 16777215   : 13918    |******                                  |
+      16777216 -> 33554431   : 21       |                                        |
+
+    avg = 908079 nsecs, total: 580889071441 nsecs, count: 639690
+
+    And the patch results are
+
+         nsecs               : count     distribution
+             0 -> 1          : 0        |                                        |
+             2 -> 3          : 0        |                                        |
+             4 -> 7          : 0        |                                        |
+             8 -> 15         : 0        |                                        |
+            16 -> 31         : 0        |                                        |
+            32 -> 63         : 0        |                                        |
+            64 -> 127        : 0        |                                        |
+           128 -> 255        : 0        |                                        |
+           256 -> 511        : 6883     |**                                      |
+           512 -> 1023       : 54346    |*********************                   |
+          1024 -> 2047       : 79170    |********************************        |
+          2048 -> 4095       : 98890    |****************************************|
+          4096 -> 8191       : 81911    |*********************************       |
+          8192 -> 16383      : 27075    |**********                              |
+         16384 -> 32767      : 14668    |*****                                   |
+         32768 -> 65535      : 13251    |*****                                   |
+         65536 -> 131071     : 15340    |******                                  |
+        131072 -> 262143     : 26715    |**********                              |
+        262144 -> 524287     : 43274    |*****************                       |
+        524288 -> 1048575    : 53870    |*********************                   |
+       1048576 -> 2097151    : 55368    |**********************                  |
+       2097152 -> 4194303    : 41036    |****************                        |
+       4194304 -> 8388607    : 24927    |**********                              |
+       8388608 -> 16777215   : 33       |                                        |
+      16777216 -> 33554431   : 9        |                                        |
+
+    avg = 623599 nsecs, total: 397259314759 nsecs, count: 637042
+
+    There's a little variation in the number of calls done because of the
+    timing of the threads with metadata requirements, but the avg, total,
+    and count are relatively consistent between runs (usually within 2-5% of
+    each other).  As you can see here we have around a 30% decrease in
+    average latency with a 30% decrease in overall time spent in
+    find_free_extent.
+
+    Reviewed-by: Filipe Manana <fdmanana@suse.com>
+    Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit db7a62b00a8ff463466ceb7c68728d8bfcc2d65d
+Author: Josef Bacik <josef@toxicpanda.com>
+Date:   Thu Nov 18 16:33:14 2021 -0500
+
+    btrfs: only use ->max_extent_size if it is set in the bitmap
+
+    While adding self tests for my space index change I was hitting a
+    problem where the space indexed tree wasn't returning the expected
+    ->max_extent_size.  This is because we will skip searching any entry
+    that doesn't have ->bytes >= the amount of bytes we want.  However we'll
+    still set the max_extent_size based on that entry.  The problem is if we
+    don't search the bitmap we won't have ->max_extent_size set properly, so
+    we can't really trust it.
+
+    This doesn't really result in a problem per se, it can just result in us
+    not finding a contiguous area that may exist.  Fix the max_extent_size
+    helper to return ->bytes if ->max_extent_size isn't set, and add a big
+    comment explaining why we're doing this.
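+
+    A minimal sketch of the fixed helper, assuming it keeps the
+    get_max_extent_size() name used in free-space-cache.c (the real hunk,
+    together with the comment mentioned above, appears later in this
+    patch):
+
+        static inline u64 get_max_extent_size(const struct btrfs_free_space *entry)
+        {
+                /* Only trust ->max_extent_size if the bitmap was searched. */
+                if (entry->bitmap && entry->max_extent_size)
+                        return entry->max_extent_size;
+                return entry->bytes;
+        }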
+
+    Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 1796c46d66a36d108c3d13292dd47020dd4e02e1
+Author: Josef Bacik <josef@toxicpanda.com>
+Date:   Thu Oct 21 14:58:37 2021 -0400
+
+    btrfs: rename btrfs_item_end_nr to btrfs_item_data_end
+
+    The name btrfs_item_end_nr() is a bit of a misnomer, as it's actually
+    the offset of the end of the data the item points to.  In fact all of
+    the helpers that we use btrfs_item_end_nr() with use data in their
+    name, like BTRFS_LEAF_DATA_SIZE() and leaf_data().  Rename to
+    btrfs_item_data_end() to make it clear what this helper is giving us.
+
+    Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+    Reviewed-by: David Sterba <dsterba@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 89b1779e5b64958f44df12119436d01621a6f87a
+Author: Josef Bacik <josef@toxicpanda.com>
+Date:   Thu Oct 21 14:58:36 2021 -0400
+
+    btrfs: remove the btrfs_item_end() helper
+
+    We're only using btrfs_item_end() from btrfs_item_end_nr(), so this can
+    be collapsed.
+
+    Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+    Reviewed-by: David Sterba <dsterba@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 8b3c0a73f1c9ae9506fb95f8581d35432f2bb8e8
+Author: Josef Bacik <josef@toxicpanda.com>
+Date:   Thu Oct 21 14:58:35 2021 -0400
+
+    btrfs: drop the _nr from the item helpers
+
+    Now that all call sites are using the slot number to modify item values,
+    rename the SETGET helpers to raw_item_*(), and then rework the _nr()
+    helpers to be the btrfs_item_*() and btrfs_set_item_*() helpers, and
+    then rename all of the callers to the new helpers.
+
+    Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+    Reviewed-by: David Sterba <dsterba@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit cb7d27e5e1205b9ca8f512a48f6772b2eb2b84f9
+Author: Josef Bacik <josef@toxicpanda.com>
+Date:   Thu Oct 21 14:58:34 2021 -0400
+
+    btrfs: introduce item_nr token variant helpers
+
+    The last remaining place where we have the pattern of
+
+        item = btrfs_item_nr(slot)
+        <do something with the item>
+
+    is the token helpers.  Handle this by introducing token helpers that
+    will do the btrfs_item_nr() work inside of the helper itself, and then
+    convert all users of the btrfs_item token helpers to the new _nr()
+    variants.
+
+    Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+    Reviewed-by: David Sterba <dsterba@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 18ba83dcd99b9619cfc1a246cfb84b1c9b530097
+Author: Josef Bacik <josef@toxicpanda.com>
+Date:   Thu Oct 21 14:58:33 2021 -0400
+
+    btrfs: make btrfs_file_extent_inline_item_len take a slot
+
+    Instead of getting the btrfs_item for this, simply pass in the slot of
+    the item and then use the btrfs_item_size_nr() helper inside of
+    btrfs_file_extent_inline_item_len().
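+
+    Call sites then go from the old pattern
+
+        btrfs_file_extent_inline_item_len(leaf, btrfs_item_nr(slot))
+
+    to simply
+
+        btrfs_file_extent_inline_item_len(leaf, slot)
+
+    (an illustration of the new calling convention; the reworked helper
+    itself is visible in the ctree.h hunk below).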
+
+    Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+    Reviewed-by: David Sterba <dsterba@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 898ecfdac6314335d8135f741414b0b6867ce1ab
+Author: Josef Bacik <josef@toxicpanda.com>
+Date:   Thu Oct 21 14:58:32 2021 -0400
+
+    btrfs: add btrfs_set_item_*_nr() helpers
+
+    We have the pattern of
+
+        item = btrfs_item_nr(slot);
+        btrfs_set_item_*(leaf, item);
+
+    in a bunch of places in our code.  Fix this by adding
+    btrfs_set_item_*_nr() helpers which will do the appropriate work, and
+    replace those calls with
+
+        btrfs_set_item_*_nr(leaf, slot);
+
+    Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+    Reviewed-by: David Sterba <dsterba@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 35a9546592ae8d0f3348cd0f776a12dbf3b81aa8
+Author: Josef Bacik <josef@toxicpanda.com>
+Date:   Thu Oct 21 14:58:31 2021 -0400
+
+    btrfs: use btrfs_item_size_nr/btrfs_item_offset_nr everywhere
+
+    We have this pattern in a lot of places
+
+        item = btrfs_item_nr(slot);
+        btrfs_item_size(leaf, item);
+
+    when we could simply use
+
+        btrfs_item_size(leaf, slot);
+
+    Fix all callers of btrfs_item_size() and btrfs_item_offset() to use the
+    _nr variation of the helpers.
+
+    Reviewed-by: Qu Wenruo <wqu@suse.com>
+    Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+    Reviewed-by: David Sterba <dsterba@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 8e279d85c2ac593b8caf53f3ac72d0b7047d96f5
+Author: Filipe Manana <fdmanana@suse.com>
+Date:   Mon Oct 25 17:31:54 2021 +0100
+
+    btrfs: remove no longer needed logic for replaying directory deletes
+
+    Now that we log only dir index keys when logging a directory, we no longer
+    need to deal with dir item keys in the log replay code for replaying
+    directory deletes.  This is also true for the case when we replay a log
+    tree created by a kernel that still logs dir items.
+
+    So remove the remaining code of the replay of directory deletes algorithm
+    that deals with dir item keys.
+
+    Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+    Signed-off-by: Filipe Manana <fdmanana@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit b8ac7a8b1dd9bba3b30bd034d754bb4932c4a970
+Author: Filipe Manana <fdmanana@suse.com>
+Date:   Mon Oct 25 17:31:53 2021 +0100
+
+    btrfs: only copy dir index keys when logging a directory
+
+    Currently, when logging a directory, we copy both dir items and dir index
+    items from the fs/subvolume tree to the log tree.  Both items have exactly
+    the same data (same struct btrfs_dir_item); the difference lies in the key
+    values, where a dir index key contains the index number of a directory
+    entry while the dir item key does not, as the latter is used for doing
+    fast lookups of an entry by name, while the former is used for sorting
+    entries when listing a directory.
+
+    We can exploit that and log only the dir index items, since they contain
+    all the information needed to correctly add, replace and delete directory
+    entries when replaying a log tree.  Logging only the dir index items is
+    also backward and forward compatible: an unpatched kernel (without this
+    change) can correctly replay a log tree generated by a patched kernel
+    (with this patch), and a patched kernel can correctly replay a log tree
+    generated by an unpatched kernel.
+
+    The backward compatibility is ensured because:
+
+    1) For inserting a new dentry: a dentry is only inserted when we find a
+       new dir index key - we can only insert if we know the dir index offset,
+       which is encoded in the dir index key's offset;
+
+    2) For deleting dentries: during log replay, before adding or replacing
+       dentries, we first replay dentry deletions.  Whenever we find a dir item
+       key or a dir index key in the subvolume/fs tree that is not logged in
+       a range for which the log tree is authoritative, we do the unlink of
+       the dentry, which removes both the existing dir item key and the dir
+       index key.  Therefore logging just dir index keys is enough to ensure
+       dentry deletions are correctly replayed;
+
+    3) For dentry replacements: they work when we log only dir index keys
+       and this is mostly due to a combination of 1) and 2).  If we replace a
+       dentry with name "foobar" to point from inode A to inode B, then we
+       know the dir index key for the new dentry is different from the old
+       one, as it has an index number (key offset) larger than the old one.
+       This results in replaying a deletion, through replay_dir_deletes(),
+       that causes the old dentry to be removed, both the dir item key and
+       the dir index key, as mentioned at 2).  Then when processing the new
+       dir index key, we add the new dentry, adding both a new dir item key
+       and a new index key pointing to inode B, as stated in 1).
+
+    The forward compatibility, the ability for a patched kernel to replay a
+    log created by an older, unpatched kernel, comes from the changes required
+    for making sure we are able to replay a log that only contains dir index
+    keys - we simply ignore every dir item key we find.
+
+    So modify directory logging to log only dir index items, and modify the
+    log replay process to ignore dir item keys (from log trees created by an
+    unpatched kernel) and process only dir index keys.  This reduces the
+    amount of logged metadata by about half, and therefore the time spent
+    logging or fsyncing large directories (less CPU time and less IO).
+
+    The following test script was used to measure this change:
+
+      #!/bin/bash
+
+      DEV=/dev/nvme0n1
+      MNT=/mnt/nvme0n1
+
+      NUM_NEW_FILES=1000000
+      NUM_FILE_DELETES=10000
+
+      mkfs.btrfs -f $DEV
+      mount -o ssd $DEV $MNT
+
+      mkdir $MNT/testdir
+
+      for ((i = 1; i <= $NUM_NEW_FILES; i++)); do
+          echo -n > $MNT/testdir/file_$i
+      done
+
+      start=$(date +%s%N)
+      xfs_io -c "fsync" $MNT/testdir
+      end=$(date +%s%N)
+
+      dur=$(( (end - start) / 1000000 ))
+      echo "dir fsync took $dur ms after adding $NUM_NEW_FILES files"
+
+      # sync to force transaction commit and wipe out the log.
+      sync
+
+      del_inc=$(( $NUM_NEW_FILES / $NUM_FILE_DELETES ))
+      for ((i = 1; i <= $NUM_NEW_FILES; i += $del_inc)); do
+          rm -f $MNT/testdir/file_$i
+      done
+
+      start=$(date +%s%N)
+      xfs_io -c "fsync" $MNT/testdir
+      end=$(date +%s%N)
+
+      dur=$(( (end - start) / 1000000 ))
+      echo "dir fsync took $dur ms after deleting $NUM_FILE_DELETES files"
+      echo
+
+      umount $MNT
+
+    The tests were run on a physical machine, with a non-debug kernel (Debian's
+    default kernel config), for different values of $NUM_NEW_FILES and
+    $NUM_FILE_DELETES, and the results were the following:
+
+    ** Before patch, NUM_NEW_FILES = 1 000 000, NUM_FILE_DELETES = 10 000 **
+
+    dir fsync took 8412 ms after adding 1000000 files
+    dir fsync took 500 ms after deleting 10000 files
+
+    ** After patch, NUM_NEW_FILES = 1 000 000, NUM_FILE_DELETES = 10 000 **
+
+    dir fsync took 4252 ms after adding 1000000 files (-49.5%)
+    dir fsync took 269 ms after deleting 10000 files (-46.2%)
+
+    ** Before patch, NUM_NEW_FILES = 100 000, NUM_FILE_DELETES = 1 000 **
+
+    dir fsync took 745 ms after adding 100000 files
+    dir fsync took 59 ms after deleting 1000 files
+
+    ** After patch, NUM_NEW_FILES = 100 000, NUM_FILE_DELETES = 1 000 **
+
+    dir fsync took 404 ms after adding 100000 files (-45.8%)
+    dir fsync took 31 ms after deleting 1000 files (-47.5%)
+
+    ** Before patch, NUM_NEW_FILES = 10 000, NUM_FILE_DELETES = 1 000 **
+
+    dir fsync took 67 ms after adding 10000 files
+    dir fsync took 9 ms after deleting 1000 files
+
+    ** After patch, NUM_NEW_FILES = 10 000, NUM_FILE_DELETES = 1 000 **
+
+    dir fsync took 36 ms after adding 10000 files (-46.3%)
+    dir fsync took 5 ms after deleting 1000 files (-44.4%)
+
+    ** Before patch, NUM_NEW_FILES = 1 000, NUM_FILE_DELETES = 100 **
+
+    dir fsync took 9 ms after adding 1000 files
+    dir fsync took 4 ms after deleting 100 files
+
+    ** After patch, NUM_NEW_FILES = 1 000, NUM_FILE_DELETES = 100 **
+
+    dir fsync took 7 ms after adding 1000 files (-22.2%)
+    dir fsync took 3 ms after deleting 100 files (-25.0%)
+
+    Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+    Signed-off-by: Filipe Manana <fdmanana@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 793440833c94fa896424e2ceef71d376a2ae2454
+Author: Nikolay Borisov <nborisov@suse.com>
+Date:   Thu Oct 14 10:03:11 2021 +0300
+
+    btrfs: remove spurious unlock/lock of unused_bgs_lock
+
+    Since both the unused block groups and reclaim bgs lists are protected
+    by unused_bgs_lock, free them in the same critical section without
+    doing an extra unlock/lock pair.
+
+    Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+    Signed-off-by: Nikolay Borisov <nborisov@suse.com>
+    Reviewed-by: David Sterba <dsterba@suse.com>
+    Signed-off-by: David Sterba <dsterba@suse.com>
+
+commit 9f4889dbcf40db16d5cfd02dae54143ecfcf036a
+Author: Filipe Manana <fdmanana@suse.com>
+Date:   Thu Oct 28 16:03:41 2021 +0100
+
+    btrfs: fix ENOSPC failure when attempting direct IO write into NOCOW range
+
+    When doing a direct IO write against a file range that either has
+    preallocated extents in that range or has regular extents and the file
+    has the NOCOW attribute set, the write fails with -ENOSPC when all of
+    the following conditions are met:
+
+    1) There are no data block groups with enough free space matching
+       the size of the write;
+
+    2) There's not enough unallocated space for allocating a new data block
+       group;
+
+    3) The extents in the target file range are not shared, whether through
+       snapshots or through reflinks.
+
+    This is wrong because a NOCOW write can be done in such a case, and in
+    fact it's possible to do it using a buffered IO write, since when failing
+    to allocate data space, the buffered IO path checks if a NOCOW write is
+    possible.
+
+    The failure in the direct IO write path comes from the fact that early
+    on, at btrfs_dio_iomap_begin(), we try to allocate data space for the
+    write and if that fails we return the error and stop - we never check
+    if we can do NOCOW.  But later, at btrfs_get_blocks_direct_write(), we
+    check if we can do a NOCOW write into the range, or a subset of the
+    range, and then release the previously reserved data space.
+
+    Fix this by doing the data reservation only if needed, when we must COW,
+    at btrfs_get_blocks_direct_write() instead of doing it at
+    btrfs_dio_iomap_begin().  This also simplifies the logic a bit and
+    removes the inefficiency of doing unnecessary data reservations.
+
+    The following example test script reproduces the problem:
+
+      $ cat dio-nocow-enospc.sh
+      #!/bin/bash
+
+      DEV=/dev/sdj
+      MNT=/mnt/sdj
+
+      # Use a small fixed size (1G) filesystem so that it's quick to fill
+      # it up.
+      # Make sure the mixed block groups feature is not enabled because we
+      # later want to not have more space available for allocating data
+      # extents but still have enough metadata space free for the file writes.
+      mkfs.btrfs -f -b $((1024 * 1024 * 1024)) -O ^mixed-bg $DEV
+      mount $DEV $MNT
+
+      # Create our test file with the NOCOW attribute set.
+      touch $MNT/foobar
+      chattr +C $MNT/foobar
+
+      # Now fill in all unallocated space with data for our test file.
+      # This will allocate a data block group that will be full and leave
+      # no (or a very small amount of) unallocated space in the device, so
+      # that it will not be possible to allocate a new block group later.
+      echo
+      echo "Creating test file with initial data..."
+      xfs_io -c "pwrite -S 0xab -b 1M 0 900M" $MNT/foobar
+
+      # Now try a direct IO write against file range [0, 10M[.
+      # This should succeed since this is a NOCOW file and an extent for the
+      # range was previously allocated.
+      echo
+      echo "Trying direct IO write over allocated space..."
+      xfs_io -d -c "pwrite -S 0xcd -b 10M 0 10M" $MNT/foobar
+
+      umount $MNT
+
+    When running the test:
+
+      $ ./dio-nocow-enospc.sh
+      (...)
+
+      Creating test file with initial data...
+      wrote 943718400/943718400 bytes at offset 0
+      900 MiB, 900 ops; 0:00:01.43 (625.526 MiB/sec and 625.5265 ops/sec)
+
+      Trying direct IO write over allocated space...
+      pwrite: No space left on device
+
+    A test case for fstests will follow, testing both this direct IO write
+    scenario as well as the buffered IO write scenario to make it less likely
+    to get future regressions on the buffered IO case.
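+
+    A rough sketch of the reordered flow (an illustration, not the verbatim
+    hunk - the real change is in the inode.c hunks later in this patch, and
+    the NOCOW check below is a placeholder for the actual helpers):
+
+        /* In btrfs_get_blocks_direct_write(): */
+        if (<range can be written NOCOW>) {
+                /* NOCOW write: no data space reservation is needed. */
+        } else {
+                /* We must COW: reserve data space only now. */
+                ret = btrfs_delalloc_reserve_space(BTRFS_I(inode),
+                                                   &data_reserved,
+                                                   start, len);
+        }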
+ + Reviewed-by: Josef Bacik <josef@toxicpanda.com> + Signed-off-by: Filipe Manana <fdmanana@suse.com> + Signed-off-by: David Sterba <dsterba@suse.com> +--- + fs/btrfs/backref.c | 16 +- + fs/btrfs/block-group.c | 2 - + fs/btrfs/btrfs_inode.h | 18 +- + fs/btrfs/compression.h | 2 + + fs/btrfs/ctree.c | 208 ++++----- + fs/btrfs/ctree.h | 96 ++-- + fs/btrfs/dev-replace.c | 4 +- + fs/btrfs/dir-item.c | 12 +- + fs/btrfs/disk-io.c | 3 +- + fs/btrfs/extent-tree.c | 14 +- + fs/btrfs/file-item.c | 25 +- + fs/btrfs/file.c | 26 +- + fs/btrfs/free-space-cache.c | 177 ++++++-- + fs/btrfs/free-space-cache.h | 2 + + fs/btrfs/inode-item.c | 14 +- + fs/btrfs/inode.c | 172 +++---- + fs/btrfs/ioctl.c | 258 +++++++---- + fs/btrfs/lzo.c | 20 +- + fs/btrfs/print-tree.c | 8 +- + fs/btrfs/props.c | 2 +- + fs/btrfs/qgroup.c | 41 +- + fs/btrfs/ref-verify.c | 2 +- + fs/btrfs/reflink.c | 2 +- + fs/btrfs/relocation.c | 2 +- + fs/btrfs/root-tree.c | 4 +- + fs/btrfs/scrub.c | 2 +- + fs/btrfs/send.c | 22 +- + fs/btrfs/sysfs.c | 10 +- + fs/btrfs/tests/extent-buffer-tests.c | 17 +- + fs/btrfs/transaction.c | 76 +--- + fs/btrfs/transaction.h | 2 +- + fs/btrfs/tree-checker.c | 56 +-- + fs/btrfs/tree-log.c | 656 ++++++++++++--------------- + fs/btrfs/uuid-tree.c | 10 +- + fs/btrfs/verity.c | 2 +- + fs/btrfs/volumes.c | 13 +- + fs/btrfs/xattr.c | 8 +- + include/uapi/linux/btrfs.h | 6 +- + include/uapi/linux/btrfs_tree.h | 4 +- + 39 files changed, 1062 insertions(+), 952 deletions(-) + +diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c +index 8b090c40daf7..3b0c4bed242e 100644 +--- a/fs/btrfs/backref.c ++++ b/fs/btrfs/backref.c +@@ -950,7 +950,7 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, + leaf = path->nodes[0]; + slot = path->slots[0]; + +- item_size = btrfs_item_size_nr(leaf, slot); ++ item_size = btrfs_item_size(leaf, slot); + BUG_ON(item_size < sizeof(*ei)); + + ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); +@@ -1792,7 +1792,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, + } + + eb = path->nodes[0]; +- item_size = btrfs_item_size_nr(eb, path->slots[0]); ++ item_size = btrfs_item_size(eb, path->slots[0]); + BUG_ON(item_size < sizeof(*ei)); + + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); +@@ -2071,7 +2071,6 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, + u64 parent = 0; + int found = 0; + struct extent_buffer *eb; +- struct btrfs_item *item; + struct btrfs_inode_ref *iref; + struct btrfs_key found_key; + +@@ -2097,10 +2096,9 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, + } + btrfs_release_path(path); + +- item = btrfs_item_nr(slot); + iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); + +- for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) { ++ for (cur = 0; cur < btrfs_item_size(eb, slot); cur += len) { + name_len = btrfs_inode_ref_name_len(eb, iref); + /* path must be released before calling iterate()! 
*/ + btrfs_debug(fs_root->fs_info, +@@ -2156,7 +2154,7 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, + } + btrfs_release_path(path); + +- item_size = btrfs_item_size_nr(eb, slot); ++ item_size = btrfs_item_size(eb, slot); + ptr = btrfs_item_ptr_offset(eb, slot); + cur_offset = 0; + +@@ -2377,7 +2375,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) + iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]); + iter->end_ptr = (u32)(iter->item_ptr + +- btrfs_item_size_nr(path->nodes[0], path->slots[0])); ++ btrfs_item_size(path->nodes[0], path->slots[0])); + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + +@@ -2417,7 +2415,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) + iter->cur_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]); + iter->item_ptr = iter->cur_ptr; +- iter->end_ptr = (u32)(iter->item_ptr + btrfs_item_size_nr( ++ iter->end_ptr = (u32)(iter->item_ptr + btrfs_item_size( + path->nodes[0], path->slots[0])); + } + +@@ -2482,7 +2480,7 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter) + iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]); + iter->cur_ptr = iter->item_ptr; +- iter->end_ptr = iter->item_ptr + (u32)btrfs_item_size_nr(path->nodes[0], ++ iter->end_ptr = iter->item_ptr + (u32)btrfs_item_size(path->nodes[0], + path->slots[0]); + return 0; + } +diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c +index b67c965725ea..27da1dfbd626 100644 +--- a/fs/btrfs/block-group.c ++++ b/fs/btrfs/block-group.c +@@ -3924,9 +3924,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) + list_del_init(&block_group->bg_list); + btrfs_put_block_group(block_group); + } +- spin_unlock(&info->unused_bgs_lock); + +- spin_lock(&info->unused_bgs_lock); + while (!list_empty(&info->reclaim_bgs)) { + block_group = list_first_entry(&info->reclaim_bgs, + struct btrfs_block_group, +diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h +index ab2a4a52e0bb..b3e46aabc3d8 100644 +--- a/fs/btrfs/btrfs_inode.h ++++ b/fs/btrfs/btrfs_inode.h +@@ -138,19 +138,11 @@ struct btrfs_inode { + /* a local copy of root's last_log_commit */ + int last_log_commit; + +- union { +- /* +- * Total number of bytes pending delalloc, used by stat to +- * calculate the real block usage of the file. This is used +- * only for files. +- */ +- u64 delalloc_bytes; +- /* +- * The offset of the last dir item key that was logged. +- * This is used only for directories. +- */ +- u64 last_dir_item_offset; +- }; ++ /* ++ * Total number of bytes pending delalloc, used by stat to calculate the ++ * real block usage of the file. This is used only for files. 
++ */ ++ u64 delalloc_bytes; + + union { + /* +diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h +index 56eef0821e3e..7dbd14caab01 100644 +--- a/fs/btrfs/compression.h ++++ b/fs/btrfs/compression.h +@@ -22,6 +22,8 @@ struct btrfs_inode; + + /* Maximum length of compressed data stored on disk */ + #define BTRFS_MAX_COMPRESSED (SZ_128K) ++static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); ++ + /* Maximum size of data before compression */ + #define BTRFS_MAX_UNCOMPRESSED (SZ_128K) + +diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c +index 35660791e084..5ca7a535d109 100644 +--- a/fs/btrfs/ctree.c ++++ b/fs/btrfs/ctree.c +@@ -2627,19 +2627,14 @@ static noinline int split_node(struct btrfs_trans_handle *trans, + */ + static int leaf_space_used(struct extent_buffer *l, int start, int nr) + { +- struct btrfs_item *start_item; +- struct btrfs_item *end_item; + int data_len; + int nritems = btrfs_header_nritems(l); + int end = min(nritems, start + nr) - 1; + + if (!nr) + return 0; +- start_item = btrfs_item_nr(start); +- end_item = btrfs_item_nr(end); +- data_len = btrfs_item_offset(l, start_item) + +- btrfs_item_size(l, start_item); +- data_len = data_len - btrfs_item_offset(l, end_item); ++ data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start); ++ data_len = data_len - btrfs_item_offset(l, end); + data_len += sizeof(struct btrfs_item) * nr; + WARN_ON(data_len < 0); + return data_len; +@@ -2686,7 +2681,6 @@ static noinline int __push_leaf_right(struct btrfs_path *path, + u32 i; + int push_space = 0; + int push_items = 0; +- struct btrfs_item *item; + u32 nr; + u32 right_nritems; + u32 data_end; +@@ -2703,8 +2697,6 @@ static noinline int __push_leaf_right(struct btrfs_path *path, + slot = path->slots[1]; + i = left_nritems - 1; + while (i >= nr) { +- item = btrfs_item_nr(i); +- + if (!empty && push_items > 0) { + if (path->slots[0] > i) + break; +@@ -2719,12 +2711,13 @@ static noinline int __push_leaf_right(struct btrfs_path *path, + if (path->slots[0] == i) + push_space += data_size; + +- this_item_size = btrfs_item_size(left, item); +- if (this_item_size + sizeof(*item) + push_space > free_space) ++ this_item_size = btrfs_item_size(left, i); ++ if (this_item_size + sizeof(struct btrfs_item) + ++ push_space > free_space) + break; + + push_items++; +- push_space += this_item_size + sizeof(*item); ++ push_space += this_item_size + sizeof(struct btrfs_item); + if (i == 0) + break; + i--; +@@ -2738,7 +2731,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path, + /* push left to right */ + right_nritems = btrfs_header_nritems(right); + +- push_space = btrfs_item_end_nr(left, left_nritems - push_items); ++ push_space = btrfs_item_data_end(left, left_nritems - push_items); + push_space -= leaf_data_end(left); + + /* make room in the right data area */ +@@ -2769,9 +2762,8 @@ static noinline int __push_leaf_right(struct btrfs_path *path, + btrfs_set_header_nritems(right, right_nritems); + push_space = BTRFS_LEAF_DATA_SIZE(fs_info); + for (i = 0; i < right_nritems; i++) { +- item = btrfs_item_nr(i); +- push_space -= btrfs_token_item_size(&token, item); +- btrfs_set_token_item_offset(&token, item, push_space); ++ push_space -= btrfs_token_item_size(&token, i); ++ btrfs_set_token_item_offset(&token, i, push_space); + } + + left_nritems -= push_items; +@@ -2856,16 +2848,11 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root + if (free_space < data_size) + goto out_unlock; + +- /* cow and double check */ + ret = btrfs_cow_block(trans, root, 
right, upper, + slot + 1, &right, BTRFS_NESTING_RIGHT_COW); + if (ret) + goto out_unlock; + +- free_space = btrfs_leaf_free_space(right); +- if (free_space < data_size) +- goto out_unlock; +- + left_nritems = btrfs_header_nritems(left); + if (left_nritems == 0) + goto out_unlock; +@@ -2916,7 +2903,6 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, + int i; + int push_space = 0; + int push_items = 0; +- struct btrfs_item *item; + u32 old_left_nritems; + u32 nr; + int ret = 0; +@@ -2930,8 +2916,6 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, + nr = min(right_nritems - 1, max_slot); + + for (i = 0; i < nr; i++) { +- item = btrfs_item_nr(i); +- + if (!empty && push_items > 0) { + if (path->slots[0] < i) + break; +@@ -2946,12 +2930,13 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, + if (path->slots[0] == i) + push_space += data_size; + +- this_item_size = btrfs_item_size(right, item); +- if (this_item_size + sizeof(*item) + push_space > free_space) ++ this_item_size = btrfs_item_size(right, i); ++ if (this_item_size + sizeof(struct btrfs_item) + push_space > ++ free_space) + break; + + push_items++; +- push_space += this_item_size + sizeof(*item); ++ push_space += this_item_size + sizeof(struct btrfs_item); + } + + if (push_items == 0) { +@@ -2967,25 +2952,23 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, + push_items * sizeof(struct btrfs_item)); + + push_space = BTRFS_LEAF_DATA_SIZE(fs_info) - +- btrfs_item_offset_nr(right, push_items - 1); ++ btrfs_item_offset(right, push_items - 1); + + copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET + + leaf_data_end(left) - push_space, + BTRFS_LEAF_DATA_OFFSET + +- btrfs_item_offset_nr(right, push_items - 1), ++ btrfs_item_offset(right, push_items - 1), + push_space); + old_left_nritems = btrfs_header_nritems(left); + BUG_ON(old_left_nritems <= 0); + + btrfs_init_map_token(&token, left); +- old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1); ++ old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1); + for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { + u32 ioff; + +- item = btrfs_item_nr(i); +- +- ioff = btrfs_token_item_offset(&token, item); +- btrfs_set_token_item_offset(&token, item, ++ ioff = btrfs_token_item_offset(&token, i); ++ btrfs_set_token_item_offset(&token, i, + ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size)); + } + btrfs_set_header_nritems(left, old_left_nritems + push_items); +@@ -2996,7 +2979,7 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, + right_nritems); + + if (push_items < right_nritems) { +- push_space = btrfs_item_offset_nr(right, push_items - 1) - ++ push_space = btrfs_item_offset(right, push_items - 1) - + leaf_data_end(right); + memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET + + BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, +@@ -3014,10 +2997,8 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, + btrfs_set_header_nritems(right, right_nritems); + push_space = BTRFS_LEAF_DATA_SIZE(fs_info); + for (i = 0; i < right_nritems; i++) { +- item = btrfs_item_nr(i); +- +- push_space = push_space - btrfs_token_item_size(&token, item); +- btrfs_set_token_item_offset(&token, item, push_space); ++ push_space = push_space - btrfs_token_item_size(&token, i); ++ btrfs_set_token_item_offset(&token, i, push_space); + } + + btrfs_mark_buffer_dirty(left); +@@ -3096,7 +3077,6 @@ static int 
push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root + goto out; + } + +- /* cow and double check */ + ret = btrfs_cow_block(trans, root, left, + path->nodes[1], slot - 1, &left, + BTRFS_NESTING_LEFT_COW); +@@ -3107,12 +3087,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root + goto out; + } + +- free_space = btrfs_leaf_free_space(left); +- if (free_space < data_size) { +- ret = 1; +- goto out; +- } +- + if (check_sibling_keys(left, right)) { + ret = -EUCLEAN; + goto out; +@@ -3145,7 +3119,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, + + nritems = nritems - mid; + btrfs_set_header_nritems(right, nritems); +- data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l); ++ data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l); + + copy_extent_buffer(right, l, btrfs_item_nr_offset(0), + btrfs_item_nr_offset(mid), +@@ -3156,15 +3130,14 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, + data_copy_size, BTRFS_LEAF_DATA_OFFSET + + leaf_data_end(l), data_copy_size); + +- rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid); ++ rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid); + + btrfs_init_map_token(&token, right); + for (i = 0; i < nritems; i++) { +- struct btrfs_item *item = btrfs_item_nr(i); + u32 ioff; + +- ioff = btrfs_token_item_offset(&token, item); +- btrfs_set_token_item_offset(&token, item, ioff + rt_data_off); ++ ioff = btrfs_token_item_offset(&token, i); ++ btrfs_set_token_item_offset(&token, i, ioff + rt_data_off); + } + + btrfs_set_header_nritems(l, mid); +@@ -3280,7 +3253,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, + + l = path->nodes[0]; + slot = path->slots[0]; +- if (extend && data_size + btrfs_item_size_nr(l, slot) + ++ if (extend && data_size + btrfs_item_size(l, slot) + + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info)) + return -EOVERFLOW; + +@@ -3449,7 +3422,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, + if (btrfs_leaf_free_space(leaf) >= ins_len) + return 0; + +- item_size = btrfs_item_size_nr(leaf, path->slots[0]); ++ item_size = btrfs_item_size(leaf, path->slots[0]); + if (key.type == BTRFS_EXTENT_DATA_KEY) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); +@@ -3469,7 +3442,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, + ret = -EAGAIN; + leaf = path->nodes[0]; + /* if our item isn't there, return now */ +- if (item_size != btrfs_item_size_nr(leaf, path->slots[0])) ++ if (item_size != btrfs_item_size(leaf, path->slots[0])) + goto err; + + /* the leaf has changed, it now has room. 
return now */ +@@ -3500,9 +3473,7 @@ static noinline int split_item(struct btrfs_path *path, + unsigned long split_offset) + { + struct extent_buffer *leaf; +- struct btrfs_item *item; +- struct btrfs_item *new_item; +- int slot; ++ int orig_slot, slot; + char *buf; + u32 nritems; + u32 item_size; +@@ -3512,9 +3483,9 @@ static noinline int split_item(struct btrfs_path *path, + leaf = path->nodes[0]; + BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)); + +- item = btrfs_item_nr(path->slots[0]); +- orig_offset = btrfs_item_offset(leaf, item); +- item_size = btrfs_item_size(leaf, item); ++ orig_slot = path->slots[0]; ++ orig_offset = btrfs_item_offset(leaf, path->slots[0]); ++ item_size = btrfs_item_size(leaf, path->slots[0]); + + buf = kmalloc(item_size, GFP_NOFS); + if (!buf) +@@ -3535,14 +3506,12 @@ static noinline int split_item(struct btrfs_path *path, + btrfs_cpu_key_to_disk(&disk_key, new_key); + btrfs_set_item_key(leaf, &disk_key, slot); + +- new_item = btrfs_item_nr(slot); ++ btrfs_set_item_offset(leaf, slot, orig_offset); ++ btrfs_set_item_size(leaf, slot, item_size - split_offset); + +- btrfs_set_item_offset(leaf, new_item, orig_offset); +- btrfs_set_item_size(leaf, new_item, item_size - split_offset); +- +- btrfs_set_item_offset(leaf, item, +- orig_offset + item_size - split_offset); +- btrfs_set_item_size(leaf, item, split_offset); ++ btrfs_set_item_offset(leaf, orig_slot, ++ orig_offset + item_size - split_offset); ++ btrfs_set_item_size(leaf, orig_slot, split_offset); + + btrfs_set_header_nritems(leaf, nritems + 1); + +@@ -3603,7 +3572,6 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) + { + int slot; + struct extent_buffer *leaf; +- struct btrfs_item *item; + u32 nritems; + unsigned int data_end; + unsigned int old_data_start; +@@ -3615,14 +3583,14 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) + leaf = path->nodes[0]; + slot = path->slots[0]; + +- old_size = btrfs_item_size_nr(leaf, slot); ++ old_size = btrfs_item_size(leaf, slot); + if (old_size == new_size) + return; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(leaf); + +- old_data_start = btrfs_item_offset_nr(leaf, slot); ++ old_data_start = btrfs_item_offset(leaf, slot); + + size_diff = old_size - new_size; + +@@ -3636,10 +3604,9 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) + btrfs_init_map_token(&token, leaf); + for (i = slot; i < nritems; i++) { + u32 ioff; +- item = btrfs_item_nr(i); + +- ioff = btrfs_token_item_offset(&token, item); +- btrfs_set_token_item_offset(&token, item, ioff + size_diff); ++ ioff = btrfs_token_item_offset(&token, i); ++ btrfs_set_token_item_offset(&token, i, ioff + size_diff); + } + + /* shift the data */ +@@ -3682,8 +3649,7 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) + fixup_low_keys(path, &disk_key, 1); + } + +- item = btrfs_item_nr(slot); +- btrfs_set_item_size(leaf, item, new_size); ++ btrfs_set_item_size(leaf, slot, new_size); + btrfs_mark_buffer_dirty(leaf); + + if (btrfs_leaf_free_space(leaf) < 0) { +@@ -3699,7 +3665,6 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) + { + int slot; + struct extent_buffer *leaf; +- struct btrfs_item *item; + u32 nritems; + unsigned int data_end; + unsigned int old_data; +@@ -3717,7 +3682,7 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) + BUG(); + } + slot = path->slots[0]; +- old_data = btrfs_item_end_nr(leaf, slot); ++ old_data = 
btrfs_item_data_end(leaf, slot); + + BUG_ON(slot < 0); + if (slot >= nritems) { +@@ -3734,10 +3699,9 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) + btrfs_init_map_token(&token, leaf); + for (i = slot; i < nritems; i++) { + u32 ioff; +- item = btrfs_item_nr(i); + +- ioff = btrfs_token_item_offset(&token, item); +- btrfs_set_token_item_offset(&token, item, ioff - data_size); ++ ioff = btrfs_token_item_offset(&token, i); ++ btrfs_set_token_item_offset(&token, i, ioff - data_size); + } + + /* shift the data */ +@@ -3746,9 +3710,8 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) + data_end, old_data - data_end); + + data_end = old_data; +- old_size = btrfs_item_size_nr(leaf, slot); +- item = btrfs_item_nr(slot); +- btrfs_set_item_size(leaf, item, old_size + data_size); ++ old_size = btrfs_item_size(leaf, slot); ++ btrfs_set_item_size(leaf, slot, old_size + data_size); + btrfs_mark_buffer_dirty(leaf); + + if (btrfs_leaf_free_space(leaf) < 0) { +@@ -3770,7 +3733,6 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p + const struct btrfs_item_batch *batch) + { + struct btrfs_fs_info *fs_info = root->fs_info; +- struct btrfs_item *item; + int i; + u32 nritems; + unsigned int data_end; +@@ -3807,7 +3769,7 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p + + btrfs_init_map_token(&token, leaf); + if (slot != nritems) { +- unsigned int old_data = btrfs_item_end_nr(leaf, slot); ++ unsigned int old_data = btrfs_item_data_end(leaf, slot); + + if (old_data < data_end) { + btrfs_print_leaf(leaf); +@@ -3823,10 +3785,9 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p + for (i = slot; i < nritems; i++) { + u32 ioff; + +- item = btrfs_item_nr(i); +- ioff = btrfs_token_item_offset(&token, item); +- btrfs_set_token_item_offset(&token, item, +- ioff - batch->total_data_size); ++ ioff = btrfs_token_item_offset(&token, i); ++ btrfs_set_token_item_offset(&token, i, ++ ioff - batch->total_data_size); + } + /* shift the items */ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr), +@@ -3845,10 +3806,9 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p + for (i = 0; i < batch->nr; i++) { + btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]); + btrfs_set_item_key(leaf, &disk_key, slot + i); +- item = btrfs_item_nr(slot + i); + data_end -= batch->data_sizes[i]; +- btrfs_set_token_item_offset(&token, item, data_end); +- btrfs_set_token_item_size(&token, item, batch->data_sizes[i]); ++ btrfs_set_token_item_offset(&token, slot + i, data_end); ++ btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]); + } + + btrfs_set_header_nritems(leaf, nritems + batch->nr); +@@ -3955,7 +3915,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, + u32 item_size; + + leaf = path->nodes[0]; +- item_size = btrfs_item_size_nr(leaf, path->slots[0]); ++ item_size = btrfs_item_size(leaf, path->slots[0]); + ret = setup_leaf_for_split(trans, root, path, + item_size + sizeof(struct btrfs_item)); + if (ret) +@@ -4056,25 +4016,22 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, + { + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_buffer *leaf; +- struct btrfs_item *item; +- u32 last_off; +- u32 dsize = 0; + int ret = 0; + int wret; +- int i; + u32 nritems; + + leaf = path->nodes[0]; +- last_off = btrfs_item_offset_nr(leaf, slot + nr - 1); +- +- for (i = 0; i < nr; i++) +- dsize += 
btrfs_item_size_nr(leaf, slot + i); +- + nritems = btrfs_header_nritems(leaf); + + if (slot + nr != nritems) { +- int data_end = leaf_data_end(leaf); ++ const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1); ++ const int data_end = leaf_data_end(leaf); + struct btrfs_map_token token; ++ u32 dsize = 0; ++ int i; ++ ++ for (i = 0; i < nr; i++) ++ dsize += btrfs_item_size(leaf, slot + i); + + memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + + data_end + dsize, +@@ -4085,9 +4042,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, + for (i = slot + nr; i < nritems; i++) { + u32 ioff; + +- item = btrfs_item_nr(i); +- ioff = btrfs_token_item_offset(&token, item); +- btrfs_set_token_item_offset(&token, item, ioff + dsize); ++ ioff = btrfs_token_item_offset(&token, i); ++ btrfs_set_token_item_offset(&token, i, ioff + dsize); + } + + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), +@@ -4115,24 +4071,50 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, + fixup_low_keys(path, &disk_key, 1); + } + +- /* delete the leaf if it is mostly empty */ ++ /* ++ * Try to delete the leaf if it is mostly empty. We do this by ++ * trying to move all its items into its left and right neighbours. ++ * If we can't move all the items, then we don't delete it - it's ++ * not ideal, but future insertions might fill the leaf with more ++ * items, or items from other leaves might be moved later into our ++ * leaf due to deletions on those leaves. ++ */ + if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) { ++ u32 min_push_space; ++ + /* push_leaf_left fixes the path. + * make sure the path still points to our leaf + * for possible call to del_ptr below + */ + slot = path->slots[1]; + atomic_inc(&leaf->refs); +- +- wret = push_leaf_left(trans, root, path, 1, 1, +- 1, (u32)-1); ++ /* ++ * We want to be able to at least push one item to the ++ * left neighbour leaf, and that's the first item. ++ */ ++ min_push_space = sizeof(struct btrfs_item) + ++ btrfs_item_size(leaf, 0); ++ wret = push_leaf_left(trans, root, path, 0, ++ min_push_space, 1, (u32)-1); + if (wret < 0 && wret != -ENOSPC) + ret = wret; + + if (path->nodes[0] == leaf && + btrfs_header_nritems(leaf)) { +- wret = push_leaf_right(trans, root, path, 1, +- 1, 1, 0); ++ /* ++ * If we were not able to push all items from our ++ * leaf to its left neighbour, then attempt to ++ * either push all the remaining items to the ++ * right neighbour or none. There's no advantage ++ * in pushing only some items, instead of all, as ++ * it's pointless to end up with a leaf having ++ * too few items while the neighbours can be full ++ * or nearly full. ++ */ ++ nritems = btrfs_header_nritems(leaf); ++ min_push_space = leaf_space_used(leaf, 0, nritems); ++ wret = push_leaf_right(trans, root, path, 0, ++ min_push_space, 1, 0); + if (wret < 0 && wret != -ENOSPC) + ret = wret; + } +diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h +index 5fe5eccb3c87..223e9d9e1b8b 100644 +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -601,6 +601,9 @@ enum { + /* Indicate whether there are any tree modification log users */ + BTRFS_FS_TREE_MOD_LOG_USERS, + ++ /* Indicate that we want the transaction kthread to commit right now. 
*/ ++ BTRFS_FS_COMMIT_TRANS, ++ + #if BITS_PER_LONG == 32 + /* Indicate if we have error/warn message printed on 32bit systems */ + BTRFS_FS_32BIT_ERROR, +@@ -1603,25 +1606,25 @@ DECLARE_BTRFS_SETGET_BITS(64) + static inline u##bits btrfs_##name(const struct extent_buffer *eb, \ + const type *s) \ + { \ +- BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ ++ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + return btrfs_get_##bits(eb, s, offsetof(type, member)); \ + } \ + static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \ + u##bits val) \ + { \ +- BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ ++ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + btrfs_set_##bits(eb, s, offsetof(type, member), val); \ + } \ + static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \ + const type *s) \ + { \ +- BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ ++ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + return btrfs_get_token_##bits(token, s, offsetof(type, member));\ + } \ + static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ + type *s, u##bits val) \ + { \ +- BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ ++ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + btrfs_set_token_##bits(token, s, offsetof(type, member), val); \ + } + +@@ -1652,8 +1655,8 @@ static inline void btrfs_set_##name(type *s, u##bits val) \ + static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb, + struct btrfs_dev_item *s) + { +- BUILD_BUG_ON(sizeof(u64) != +- sizeof(((struct btrfs_dev_item *)0))->total_bytes); ++ static_assert(sizeof(u64) == ++ sizeof(((struct btrfs_dev_item *)0))->total_bytes); + return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, + total_bytes)); + } +@@ -1661,8 +1664,8 @@ static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb, + struct btrfs_dev_item *s, + u64 val) + { +- BUILD_BUG_ON(sizeof(u64) != +- sizeof(((struct btrfs_dev_item *)0))->total_bytes); ++ static_assert(sizeof(u64) == ++ sizeof(((struct btrfs_dev_item *)0))->total_bytes); + WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize)); + btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val); + } +@@ -1960,8 +1963,8 @@ static inline void btrfs_set_node_key(const struct extent_buffer *eb, + } + + /* struct btrfs_item */ +-BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32); +-BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32); ++BTRFS_SETGET_FUNCS(raw_item_offset, struct btrfs_item, offset, 32); ++BTRFS_SETGET_FUNCS(raw_item_size, struct btrfs_item, size, 32); + BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32); + BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32); + +@@ -1976,25 +1979,36 @@ static inline struct btrfs_item *btrfs_item_nr(int nr) + return (struct btrfs_item *)btrfs_item_nr_offset(nr); + } + +-static inline u32 btrfs_item_end(const struct extent_buffer *eb, +- struct btrfs_item *item) +-{ +- return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item); ++#define BTRFS_ITEM_SETGET_FUNCS(member) \ ++static inline u32 btrfs_item_##member(const struct extent_buffer *eb, \ ++ int slot) \ ++{ \ ++ return btrfs_raw_item_##member(eb, btrfs_item_nr(slot)); \ ++} \ ++static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \ ++ int slot, u32 val) \ ++{ \ ++ btrfs_set_raw_item_##member(eb, 
btrfs_item_nr(slot), val); \ ++} \ ++static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \ ++ int slot) \ ++{ \ ++ struct btrfs_item *item = btrfs_item_nr(slot); \ ++ return btrfs_token_raw_item_##member(token, item); \ ++} \ ++static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \ ++ int slot, u32 val) \ ++{ \ ++ struct btrfs_item *item = btrfs_item_nr(slot); \ ++ btrfs_set_token_raw_item_##member(token, item, val); \ + } + +-static inline u32 btrfs_item_end_nr(const struct extent_buffer *eb, int nr) +-{ +- return btrfs_item_end(eb, btrfs_item_nr(nr)); +-} ++BTRFS_ITEM_SETGET_FUNCS(offset) ++BTRFS_ITEM_SETGET_FUNCS(size); + +-static inline u32 btrfs_item_offset_nr(const struct extent_buffer *eb, int nr) ++static inline u32 btrfs_item_data_end(const struct extent_buffer *eb, int nr) + { +- return btrfs_item_offset(eb, btrfs_item_nr(nr)); +-} +- +-static inline u32 btrfs_item_size_nr(const struct extent_buffer *eb, int nr) +-{ +- return btrfs_item_size(eb, btrfs_item_nr(nr)); ++ return btrfs_item_offset(eb, nr) + btrfs_item_size(eb, nr); + } + + static inline void btrfs_item_key(const struct extent_buffer *eb, +@@ -2463,7 +2477,7 @@ static inline unsigned int leaf_data_end(const struct extent_buffer *leaf) + + if (nr == 0) + return BTRFS_LEAF_DATA_SIZE(leaf->fs_info); +- return btrfs_item_offset_nr(leaf, nr - 1); ++ return btrfs_item_offset(leaf, nr - 1); + } + + /* struct btrfs_file_extent_item */ +@@ -2522,9 +2536,9 @@ BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, + */ + static inline u32 btrfs_file_extent_inline_item_len( + const struct extent_buffer *eb, +- struct btrfs_item *e) ++ int nr) + { +- return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START; ++ return btrfs_item_size(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START; + } + + /* btrfs_qgroup_status_item */ +@@ -2616,11 +2630,11 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, + /* helper function to cast into the data area of the leaf. 
*/ + #define btrfs_item_ptr(leaf, slot, type) \ + ((type *)(BTRFS_LEAF_DATA_OFFSET + \ +- btrfs_item_offset_nr(leaf, slot))) ++ btrfs_item_offset(leaf, slot))) + + #define btrfs_item_ptr_offset(leaf, slot) \ + ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \ +- btrfs_item_offset_nr(leaf, slot))) ++ btrfs_item_offset(leaf, slot))) + + static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length) + { +@@ -3297,9 +3311,27 @@ int btrfs_fileattr_set(struct user_namespace *mnt_userns, + int btrfs_ioctl_get_supported_features(void __user *arg); + void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); + int __pure btrfs_is_empty_uuid(u8 *uuid); ++ ++struct btrfs_defrag_ctrl { ++ /* Input, read-only fields */ ++ u64 start; ++ u64 len; ++ u32 extent_thresh; ++ u64 newer_than; ++ u64 max_sectors_to_defrag; ++ u8 compress; ++ u8 flags; ++ ++ /* Output fields */ ++ u64 sectors_defragged; ++ u64 last_scanned; /* Exclusive bytenr */ ++}; ++int btrfs_defrag_ioctl_args_to_ctrl(struct btrfs_fs_info *fs_info, ++ struct btrfs_ioctl_defrag_range_args *args, ++ struct btrfs_defrag_ctrl *ctrl, ++ u64 max_sectors_to_defrag, u64 newer_than); + int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, +- struct btrfs_ioctl_defrag_range_args *range, +- u64 newer_than, unsigned long max_to_defrag); ++ struct btrfs_defrag_ctrl *ctrl); + void btrfs_get_block_group_info(struct list_head *groups_list, + struct btrfs_ioctl_space_info *space); + void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, +diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c +index e0238dd5f2f2..66fa61cb3f23 100644 +--- a/fs/btrfs/dev-replace.c ++++ b/fs/btrfs/dev-replace.c +@@ -128,7 +128,7 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) + } + slot = path->slots[0]; + eb = path->nodes[0]; +- item_size = btrfs_item_size_nr(eb, slot); ++ item_size = btrfs_item_size(eb, slot); + ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item); + + if (item_size != sizeof(struct btrfs_dev_replace_item)) { +@@ -381,7 +381,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) + } + + if (ret == 0 && +- btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { ++ btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { + /* + * need to delete old one and insert a new one. 
+ * Since no attempt is made to recover any old state, if the +diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c +index 7721ce0c0604..3b532bab0755 100644 +--- a/fs/btrfs/dir-item.c ++++ b/fs/btrfs/dir-item.c +@@ -27,7 +27,6 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + char *ptr; +- struct btrfs_item *item; + struct extent_buffer *leaf; + + ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); +@@ -41,10 +40,9 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle + return ERR_PTR(ret); + WARN_ON(ret > 0); + leaf = path->nodes[0]; +- item = btrfs_item_nr(path->slots[0]); + ptr = btrfs_item_ptr(leaf, path->slots[0], char); +- BUG_ON(data_size > btrfs_item_size(leaf, item)); +- ptr += btrfs_item_size(leaf, item) - data_size; ++ ASSERT(data_size <= btrfs_item_size(leaf, path->slots[0])); ++ ptr += btrfs_item_size(leaf, path->slots[0]) - data_size; + return (struct btrfs_dir_item *)ptr; + } + +@@ -271,7 +269,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, + data_size = sizeof(*di) + name_len; + leaf = path->nodes[0]; + slot = path->slots[0]; +- if (data_size + btrfs_item_size_nr(leaf, slot) + ++ if (data_size + btrfs_item_size(leaf, slot) + + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root->fs_info)) { + ret = -EOVERFLOW; + } else { +@@ -409,7 +407,7 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, + leaf = path->nodes[0]; + dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); + +- total_len = btrfs_item_size_nr(leaf, path->slots[0]); ++ total_len = btrfs_item_size(leaf, path->slots[0]); + while (cur < total_len) { + this_len = sizeof(*dir_item) + + btrfs_dir_name_len(leaf, dir_item) + +@@ -445,7 +443,7 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, + leaf = path->nodes[0]; + sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) + + btrfs_dir_data_len(leaf, di); +- item_len = btrfs_item_size_nr(leaf, path->slots[0]); ++ item_len = btrfs_item_size(leaf, path->slots[0]); + if (sub_item_len == item_len) { + ret = btrfs_del_item(trans, root, path); + } else { +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c +index 5f0a879c1043..e4275da0572c 100644 +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -1935,7 +1935,8 @@ static int transaction_kthread(void *arg) + } + + delta = ktime_get_seconds() - cur->start_time; +- if (cur->state < TRANS_STATE_COMMIT_START && ++ if (!test_and_clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags) && ++ cur->state < TRANS_STATE_COMMIT_START && + delta < fs_info->commit_interval) { + spin_unlock(&fs_info->trans_lock); + delay -= msecs_to_jiffies((delta - 1) * 1000); +diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c +index 7b4ee1b2d5d8..91ca32c9459a 100644 +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -171,7 +171,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, + + if (ret == 0) { + leaf = path->nodes[0]; +- item_size = btrfs_item_size_nr(leaf, path->slots[0]); ++ item_size = btrfs_item_size(leaf, path->slots[0]); + if (item_size >= sizeof(*ei)) { + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); +@@ -865,7 +865,7 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, + } + + leaf = path->nodes[0]; +- item_size = btrfs_item_size_nr(leaf, path->slots[0]); ++ item_size = btrfs_item_size(leaf, path->slots[0]); + if (unlikely(item_size 
< sizeof(*ei))) { + err = -EINVAL; + btrfs_print_v0_err(fs_info); +@@ -1007,7 +1007,7 @@ void setup_inline_extent_backref(struct btrfs_fs_info *fs_info, + __run_delayed_extent_op(extent_op, leaf, ei); + + ptr = (unsigned long)ei + item_offset; +- end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]); ++ end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]); + if (ptr < end - size) + memmove_extent_buffer(leaf, ptr + size, ptr, + end - size - ptr); +@@ -1119,7 +1119,7 @@ void update_inline_extent_backref(struct btrfs_path *path, + } else { + *last_ref = 1; + size = btrfs_extent_inline_ref_size(type); +- item_size = btrfs_item_size_nr(leaf, path->slots[0]); ++ item_size = btrfs_item_size(leaf, path->slots[0]); + ptr = (unsigned long)iref; + end = (unsigned long)ei + item_size; + if (ptr + size < end) +@@ -1634,7 +1634,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, + } + + leaf = path->nodes[0]; +- item_size = btrfs_item_size_nr(leaf, path->slots[0]); ++ item_size = btrfs_item_size(leaf, path->slots[0]); + + if (unlikely(item_size < sizeof(*ei))) { + err = -EINVAL; +@@ -2316,7 +2316,7 @@ static noinline int check_committed_ref(struct btrfs_root *root, + goto out; + + ret = 1; +- item_size = btrfs_item_size_nr(leaf, path->slots[0]); ++ item_size = btrfs_item_size(leaf, path->slots[0]); + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + + /* If extent item has more than 1 inline ref then it's shared */ +@@ -3068,7 +3068,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + } + + leaf = path->nodes[0]; +- item_size = btrfs_item_size_nr(leaf, extent_slot); ++ item_size = btrfs_item_size(leaf, extent_slot); + if (unlikely(item_size < sizeof(*ei))) { + ret = -EINVAL; + btrfs_print_v0_err(info); +diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c +index d1cbb64a78f3..107d6557ebc3 100644 +--- a/fs/btrfs/file-item.c ++++ b/fs/btrfs/file-item.c +@@ -208,7 +208,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans, + + csum_offset = (bytenr - found_key.offset) >> + fs_info->sectorsize_bits; +- csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]); ++ csums_in_item = btrfs_item_size(leaf, path->slots[0]); + csums_in_item /= csum_size; + + if (csum_offset == csums_in_item) { +@@ -274,7 +274,7 @@ static int search_csum_tree(struct btrfs_fs_info *fs_info, + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_csum_item); + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); +- itemsize = btrfs_item_size_nr(path->nodes[0], path->slots[0]); ++ itemsize = btrfs_item_size(path->nodes[0], path->slots[0]); + + csum_start = key.offset; + csum_len = (itemsize / csum_size) * sectorsize; +@@ -291,7 +291,7 @@ static int search_csum_tree(struct btrfs_fs_info *fs_info, + goto out; + } + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); +- itemsize = btrfs_item_size_nr(path->nodes[0], path->slots[0]); ++ itemsize = btrfs_item_size(path->nodes[0], path->slots[0]); + + csum_start = key.offset; + csum_len = (itemsize / csum_size) * sectorsize; +@@ -534,7 +534,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, + key.type == BTRFS_EXTENT_CSUM_KEY) { + offset = (start - key.offset) >> fs_info->sectorsize_bits; + if (offset * csum_size < +- btrfs_item_size_nr(leaf, path->slots[0] - 1)) ++ btrfs_item_size(leaf, path->slots[0] - 1)) + path->slots[0]--; + } + } +@@ -559,7 +559,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, + if (key.offset 
> start) + start = key.offset; + +- size = btrfs_item_size_nr(leaf, path->slots[0]); ++ size = btrfs_item_size(leaf, path->slots[0]); + csum_end = key.offset + (size / csum_size) * fs_info->sectorsize; + if (csum_end <= start) { + path->slots[0]++; +@@ -750,7 +750,7 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, + u32 blocksize_bits = fs_info->sectorsize_bits; + + leaf = path->nodes[0]; +- csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; ++ csum_end = btrfs_item_size(leaf, path->slots[0]) / csum_size; + csum_end <<= blocksize_bits; + csum_end += key->offset; + +@@ -834,7 +834,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, + if (key.offset >= end_byte) + break; + +- csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; ++ csum_end = btrfs_item_size(leaf, path->slots[0]) / csum_size; + csum_end <<= blocksize_bits; + csum_end += key.offset; + +@@ -1002,7 +1002,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + item_end = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_csum_item); + item_end = (struct btrfs_csum_item *)((char *)item_end + +- btrfs_item_size_nr(leaf, path->slots[0])); ++ btrfs_item_size(leaf, path->slots[0])); + goto found; + } + ret = PTR_ERR(item); +@@ -1013,7 +1013,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + u32 item_size; + /* we found one, but it isn't big enough yet */ + leaf = path->nodes[0]; +- item_size = btrfs_item_size_nr(leaf, path->slots[0]); ++ item_size = btrfs_item_size(leaf, path->slots[0]); + if ((item_size / csum_size) >= + MAX_CSUM_ITEMS(fs_info, csum_size)) { + /* already at max size, make a new one */ +@@ -1070,7 +1070,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + } + + extend_csum: +- if (csum_offset == btrfs_item_size_nr(leaf, path->slots[0]) / ++ if (csum_offset == btrfs_item_size(leaf, path->slots[0]) / + csum_size) { + int extend_nr; + u64 tmp; +@@ -1125,7 +1125,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + diff = min(diff, + MAX_CSUM_ITEMS(fs_info, csum_size) * csum_size); + +- diff = diff - btrfs_item_size_nr(leaf, path->slots[0]); ++ diff = diff - btrfs_item_size(leaf, path->slots[0]); + diff = min_t(u32, btrfs_leaf_free_space(leaf), diff); + diff /= csum_size; + diff *= csum_size; +@@ -1162,7 +1162,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + csum: + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); + item_end = (struct btrfs_csum_item *)((unsigned char *)item + +- btrfs_item_size_nr(leaf, path->slots[0])); ++ btrfs_item_size(leaf, path->slots[0])); + item = (struct btrfs_csum_item *)((unsigned char *)item + + csum_offset * csum_size); + found: +@@ -1208,6 +1208,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, + extent_start = key.offset; + extent_end = btrfs_file_extent_end(path); + em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); ++ em->generation = btrfs_file_extent_generation(leaf, fi); + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + em->start = extent_start; +diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c +index 11204dbbe053..12e63be6a35b 100644 +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -277,8 +277,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, + { + struct btrfs_root *inode_root; + struct inode *inode; +- struct btrfs_ioctl_defrag_range_args range; +- int num_defrag; ++ struct btrfs_defrag_ctrl ctrl = {0}; + int ret; + + /* get the inode */ +@@ -297,21 
+296,23 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, + + /* do a chunk of defrag */ + clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); +- memset(&range, 0, sizeof(range)); +- range.len = (u64)-1; +- range.start = defrag->last_offset; ++ ctrl.len = (u64)-1; ++ ctrl.start = defrag->last_offset; ++ ctrl.newer_than = defrag->transid; ++ ctrl.max_sectors_to_defrag = BTRFS_DEFRAG_BATCH; + + sb_start_write(fs_info->sb); +- num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, +- BTRFS_DEFRAG_BATCH); ++ ret = btrfs_defrag_file(inode, NULL, &ctrl); + sb_end_write(fs_info->sb); ++ if (ret < 0) ++ goto out; + /* + * if we filled the whole defrag batch, there + * must be more work to do. Queue this defrag + * again + */ +- if (num_defrag == BTRFS_DEFRAG_BATCH) { +- defrag->last_offset = range.start; ++ if (ctrl.sectors_defragged == BTRFS_DEFRAG_BATCH) { ++ defrag->last_offset = ctrl.last_scanned; + btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag); + } else if (defrag->last_offset && !defrag->cycled) { + /* +@@ -325,7 +326,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, + } else { + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + } +- ++out: + iput(inode); + return 0; + cleanup: +@@ -718,7 +719,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, + int modify_tree = -1; + int update_refs; + int found = 0; +- int leafs_visited = 0; + struct btrfs_path *path = args->path; + + args->bytes_found = 0; +@@ -756,7 +756,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, + path->slots[0]--; + } + ret = 0; +- leafs_visited++; + next_slot: + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { +@@ -768,7 +767,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, + ret = 0; + break; + } +- leafs_visited++; + leaf = path->nodes[0]; + recow = 1; + } +@@ -1014,7 +1012,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, + * which case it unlocked our path, so check path->locks[0] matches a + * write lock. + */ +- if (!ret && args->replace_extent && leafs_visited == 1 && ++ if (!ret && args->replace_extent && + path->locks[0] == BTRFS_WRITE_LOCK && + btrfs_leaf_free_space(leaf) >= + sizeof(struct btrfs_item) + args->extent_item_size) { +diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c +index f3fee88c8ee0..a45017b12185 100644 +--- a/fs/btrfs/free-space-cache.c ++++ b/fs/btrfs/free-space-cache.c +@@ -1580,6 +1580,50 @@ static int tree_insert_offset(struct rb_root *root, u64 offset, + return 0; + } + ++/* ++ * This is a little subtle. We *only* have ->max_extent_size set if we actually ++ * searched through the bitmap and figured out the largest ->max_extent_size, ++ * otherwise it's 0. In the case that it's 0 we don't want to tell the ++ * allocator the wrong thing, we want to use the actual real max_extent_size ++ * we've found already if it's larger, or we want to use ->bytes. ++ * ++ * This matters because find_free_space() will skip entries whose ->bytes is ++ * less than the required bytes. So if we didn't search down this bitmap, we ++ * may pick some previous entry that has a smaller ->max_extent_size than we ++ * have. For example, assume we have two entries, one that has ++ * ->max_extent_size set to 4K and ->bytes set to 1M. A second entry hasn't set ++ * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous.
We will ++ * call into find_free_space(), and return with max_extent_size == 4K, because ++ * that first bitmap entry had ->max_extent_size set, but the second one did ++ * not. If instead we returned 8K we'd come in searching for 8K, and find the ++ * 8K contiguous range. ++ * ++ * Consider the other case, we have 2 8K chunks in that second entry and still ++ * don't have ->max_extent_size set. We'll return 16K, and the next time the ++ * allocator comes in it'll fully search our second bitmap, and this time it'll ++ * get an uptodate value of 8K as the maximum chunk size. Then we'll get the ++ * right allocation the next loop through. ++ */ ++static inline u64 get_max_extent_size(const struct btrfs_free_space *entry) ++{ ++ if (entry->bitmap && entry->max_extent_size) ++ return entry->max_extent_size; ++ return entry->bytes; ++} ++ ++/* ++ * We want the largest entry to be leftmost, so this is inverted from what you'd ++ * normally expect. ++ */ ++static bool entry_less(struct rb_node *node, const struct rb_node *parent) ++{ ++ const struct btrfs_free_space *entry, *exist; ++ ++ entry = rb_entry(node, struct btrfs_free_space, bytes_index); ++ exist = rb_entry(parent, struct btrfs_free_space, bytes_index); ++ return get_max_extent_size(exist) < get_max_extent_size(entry); ++} ++ + /* + * searches the tree for the given offset. + * +@@ -1592,15 +1636,10 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, + u64 offset, int bitmap_only, int fuzzy) + { + struct rb_node *n = ctl->free_space_offset.rb_node; +- struct btrfs_free_space *entry, *prev = NULL; ++ struct btrfs_free_space *entry = NULL, *prev = NULL; + + /* find entry that is closest to the 'offset' */ +- while (1) { +- if (!n) { +- entry = NULL; +- break; +- } +- ++ while (n) { + entry = rb_entry(n, struct btrfs_free_space, offset_index); + prev = entry; + +@@ -1610,6 +1649,8 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, + n = n->rb_right; + else + break; ++ ++ entry = NULL; + } + + if (bitmap_only) { +@@ -1686,6 +1727,10 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, + return NULL; + + while (1) { ++ n = rb_next(&entry->offset_index); ++ if (!n) ++ return NULL; ++ entry = rb_entry(n, struct btrfs_free_space, offset_index); + if (entry->bitmap) { + if (entry->offset + BITS_PER_BITMAP * + ctl->unit > offset) +@@ -1694,11 +1739,6 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, + if (entry->offset + entry->bytes > offset) + break; + } +- +- n = rb_next(&entry->offset_index); +- if (!n) +- return NULL; +- entry = rb_entry(n, struct btrfs_free_space, offset_index); + } + return entry; + } +@@ -1708,6 +1748,7 @@ __unlink_free_space(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) + { + rb_erase(&info->offset_index, &ctl->free_space_offset); ++ rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); + ctl->free_extents--; + + if (!info->bitmap && !btrfs_free_space_trimmed(info)) { +@@ -1734,6 +1775,8 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, + if (ret) + return ret; + ++ rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); ++ + if (!info->bitmap && !btrfs_free_space_trimmed(info)) { + ctl->discardable_extents[BTRFS_STAT_CURR]++; + ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; +@@ -1744,6 +1787,22 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, + return ret; + } + ++static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl, ++ struct btrfs_free_space *info) ++{ ++ ASSERT(info->bitmap); ++ ++ /* ++ * If our entry is 
empty it's because we're on a cluster and we don't ++ * want to re-link it into our ctl bytes index. ++ */ ++ if (RB_EMPTY_NODE(&info->bytes_index)) ++ return; ++ ++ rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); ++ rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); ++} ++ + static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, + u64 offset, u64 bytes) +@@ -1762,6 +1821,8 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, + if (info->max_extent_size > ctl->unit) + info->max_extent_size = 0; + ++ relink_bitmap_entry(ctl, info); ++ + if (start && test_bit(start - 1, info->bitmap)) + extent_delta++; + +@@ -1797,9 +1858,16 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, + + bitmap_set(info->bitmap, start, count); + ++ /* ++ * We set some bytes, we have no idea what the max extent size is ++ * anymore. ++ */ ++ info->max_extent_size = 0; + info->bytes += bytes; + ctl->free_space += bytes; + ++ relink_bitmap_entry(ctl, info); ++ + if (start && test_bit(start - 1, info->bitmap)) + extent_delta--; + +@@ -1867,20 +1935,14 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, + + *bytes = (u64)(max_bits) * ctl->unit; + bitmap_info->max_extent_size = *bytes; ++ relink_bitmap_entry(ctl, bitmap_info); + return -1; + } + +-static inline u64 get_max_extent_size(struct btrfs_free_space *entry) +-{ +- if (entry->bitmap) +- return entry->max_extent_size; +- return entry->bytes; +-} +- + /* Cache the size of the max extent in bytes */ + static struct btrfs_free_space * + find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, +- unsigned long align, u64 *max_extent_size) ++ unsigned long align, u64 *max_extent_size, bool use_bytes_index) + { + struct btrfs_free_space *entry; + struct rb_node *node; +@@ -1890,16 +1952,38 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, + + if (!ctl->free_space_offset.rb_node) + goto out; ++again: ++ if (use_bytes_index) { ++ node = rb_first_cached(&ctl->free_space_bytes); ++ } else { ++ entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), ++ 0, 1); ++ if (!entry) ++ goto out; ++ node = &entry->offset_index; ++ } + +- entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1); +- if (!entry) +- goto out; ++ for (; node; node = rb_next(node)) { ++ if (use_bytes_index) ++ entry = rb_entry(node, struct btrfs_free_space, ++ bytes_index); ++ else ++ entry = rb_entry(node, struct btrfs_free_space, ++ offset_index); + +- for (node = &entry->offset_index; node; node = rb_next(node)) { +- entry = rb_entry(node, struct btrfs_free_space, offset_index); ++ /* ++ * If we are using the bytes index then all subsequent entries ++ * in this tree are going to be < bytes, so simply set the max ++ * extent size and exit the loop. ++ * ++ * If we're using the offset index then we need to keep going ++ * through the rest of the tree. ++ */ + if (entry->bytes < *bytes) { + *max_extent_size = max(get_max_extent_size(entry), + *max_extent_size); ++ if (use_bytes_index) ++ break; + continue; + } + +@@ -1916,6 +2000,13 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, + tmp = entry->offset; + } + ++ /* ++ * We don't break here if we're using the bytes index because we ++ * may have another entry that has the correct alignment that is ++ * the right size, so we don't want to miss that possibility. 
++ * At worst this adds another loop through the logic, but if we ++ * broke here we could prematurely ENOSPC. ++ */ + if (entry->bytes < *bytes + align_off) { + *max_extent_size = max(get_max_extent_size(entry), + *max_extent_size); +@@ -1923,6 +2014,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, + } + + if (entry->bitmap) { ++ struct rb_node *old_next = rb_next(node); + u64 size = *bytes; + + ret = search_bitmap(ctl, entry, &tmp, &size, true); +@@ -1935,6 +2027,15 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, + max(get_max_extent_size(entry), + *max_extent_size); + } ++ ++ /* ++ * The bitmap may have gotten re-arranged in the space ++ * index here because the max_extent_size may have been ++ * updated. Start from the beginning again if this ++ * happened. ++ */ ++ if (use_bytes_index && old_next != rb_next(node)) ++ goto again; + continue; + } + +@@ -2083,12 +2184,6 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, + + bitmap_set_bits(ctl, info, offset, bytes_to_set); + +- /* +- * We set some bytes, we have no idea what the max extent size is +- * anymore. +- */ +- info->max_extent_size = 0; +- + return bytes_to_set; + + } +@@ -2486,6 +2581,7 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, + info->bytes = bytes; + info->trim_state = trim_state; + RB_CLEAR_NODE(&info->offset_index); ++ RB_CLEAR_NODE(&info->bytes_index); + + spin_lock(&ctl->tree_lock); + +@@ -2799,6 +2895,7 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, + ctl->start = block_group->start; + ctl->private = block_group; + ctl->op = &free_space_op; ++ ctl->free_space_bytes = RB_ROOT_CACHED; + INIT_LIST_HEAD(&ctl->trimming_ranges); + mutex_init(&ctl->cache_writeout_mutex); + +@@ -2864,6 +2961,8 @@ static void __btrfs_return_cluster_to_free_space( + } + tree_insert_offset(&ctl->free_space_offset, + entry->offset, &entry->offset_index, bitmap); ++ rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes, ++ entry_less); + } + cluster->root = RB_ROOT; + spin_unlock(&cluster->lock); +@@ -2965,12 +3064,14 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, + u64 align_gap = 0; + u64 align_gap_len = 0; + enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED; ++ bool use_bytes_index = (offset == block_group->start); + + ASSERT(!btrfs_is_zoned(block_group->fs_info)); + + spin_lock(&ctl->tree_lock); + entry = find_free_space(ctl, &offset, &bytes_search, +- block_group->full_stripe_len, max_extent_size); ++ block_group->full_stripe_len, max_extent_size, ++ use_bytes_index); + if (!entry) + goto out; + +@@ -3254,6 +3355,17 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group, + + cluster->window_start = start * ctl->unit + entry->offset; + rb_erase(&entry->offset_index, &ctl->free_space_offset); ++ rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); ++ ++ /* ++ * We need to know if we're currently on the normal space index when we ++ * manipulate the bitmap so that we know we need to remove and re-insert ++ * it into the space_index tree. Clear the bytes_index node here so the ++ * bitmap manipulation helpers know not to mess with the space_index ++ * until this bitmap entry is added back into the normal cache. 
++ */ ++ RB_CLEAR_NODE(&entry->bytes_index); ++ + ret = tree_insert_offset(&cluster->root, entry->offset, + &entry->offset_index, 1); + ASSERT(!ret); /* -EEXIST; Logic error */ +@@ -3344,6 +3456,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group *block_group, + continue; + + rb_erase(&entry->offset_index, &ctl->free_space_offset); ++ rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); + ret = tree_insert_offset(&cluster->root, entry->offset, + &entry->offset_index, 0); + total_size += entry->bytes; +diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h +index 1f23088d43f9..dd982d204d2d 100644 +--- a/fs/btrfs/free-space-cache.h ++++ b/fs/btrfs/free-space-cache.h +@@ -22,6 +22,7 @@ enum btrfs_trim_state { + + struct btrfs_free_space { + struct rb_node offset_index; ++ struct rb_node bytes_index; + u64 offset; + u64 bytes; + u64 max_extent_size; +@@ -45,6 +46,7 @@ static inline bool btrfs_free_space_trimming_bitmap( + struct btrfs_free_space_ctl { + spinlock_t tree_lock; + struct rb_root free_space_offset; ++ struct rb_root_cached free_space_bytes; + u64 free_space; + int extents_thresh; + int free_extents; +diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c +index 37f36ffdaf6b..56755ce9a907 100644 +--- a/fs/btrfs/inode-item.c ++++ b/fs/btrfs/inode-item.c +@@ -19,7 +19,7 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, + u32 cur_offset = 0; + int len; + +- item_size = btrfs_item_size_nr(leaf, slot); ++ item_size = btrfs_item_size(leaf, slot); + ptr = btrfs_item_ptr_offset(leaf, slot); + while (cur_offset < item_size) { + ref = (struct btrfs_inode_ref *)(ptr + cur_offset); +@@ -45,7 +45,7 @@ struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( + u32 cur_offset = 0; + int ref_name_len; + +- item_size = btrfs_item_size_nr(leaf, slot); ++ item_size = btrfs_item_size(leaf, slot); + ptr = btrfs_item_ptr_offset(leaf, slot); + + /* +@@ -139,7 +139,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, + } + + leaf = path->nodes[0]; +- item_size = btrfs_item_size_nr(leaf, path->slots[0]); ++ item_size = btrfs_item_size(leaf, path->slots[0]); + if (index) + *index = btrfs_inode_extref_index(leaf, extref); + +@@ -208,7 +208,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, + goto out; + } + leaf = path->nodes[0]; +- item_size = btrfs_item_size_nr(leaf, path->slots[0]); ++ item_size = btrfs_item_size(leaf, path->slots[0]); + + if (index) + *index = btrfs_inode_ref_index(leaf, ref); +@@ -256,7 +256,6 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *leaf; +- struct btrfs_item *item; + + key.objectid = inode_objectid; + key.type = BTRFS_INODE_EXTREF_KEY; +@@ -282,9 +281,8 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, + goto out; + + leaf = path->nodes[0]; +- item = btrfs_item_nr(path->slots[0]); + ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char); +- ptr += btrfs_item_size(leaf, item) - ins_len; ++ ptr += btrfs_item_size(leaf, path->slots[0]) - ins_len; + extref = (struct btrfs_inode_extref *)ptr; + + btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len); +@@ -332,7 +330,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, + if (ref) + goto out; + +- old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); ++ old_size = btrfs_item_size(path->nodes[0], path->slots[0]); + btrfs_extend_item(path, ins_len); + ref = 
btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_ref); +diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c +index 39a674543461..0ed8cc6afa37 100644 +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -61,8 +61,6 @@ struct btrfs_iget_args { + }; + + struct btrfs_dio_data { +- u64 reserve; +- loff_t length; + ssize_t submitted; + struct extent_changeset *data_reserved; + }; +@@ -625,7 +623,6 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) + again: + will_compress = 0; + nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; +- BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0); + nr_pages = min_t(unsigned long, nr_pages, + BTRFS_MAX_COMPRESSED / PAGE_SIZE); + +@@ -5950,21 +5947,17 @@ static struct inode *new_simple_dir(struct super_block *s, + return inode; + } + ++static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN); ++static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE); ++static_assert(BTRFS_FT_DIR == FT_DIR); ++static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV); ++static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV); ++static_assert(BTRFS_FT_FIFO == FT_FIFO); ++static_assert(BTRFS_FT_SOCK == FT_SOCK); ++static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK); ++ + static inline u8 btrfs_inode_type(struct inode *inode) + { +- /* +- * Compile-time asserts that generic FT_* types still match +- * BTRFS_FT_* types +- */ +- BUILD_BUG_ON(BTRFS_FT_UNKNOWN != FT_UNKNOWN); +- BUILD_BUG_ON(BTRFS_FT_REG_FILE != FT_REG_FILE); +- BUILD_BUG_ON(BTRFS_FT_DIR != FT_DIR); +- BUILD_BUG_ON(BTRFS_FT_CHRDEV != FT_CHRDEV); +- BUILD_BUG_ON(BTRFS_FT_BLKDEV != FT_BLKDEV); +- BUILD_BUG_ON(BTRFS_FT_FIFO != FT_FIFO); +- BUILD_BUG_ON(BTRFS_FT_SOCK != FT_SOCK); +- BUILD_BUG_ON(BTRFS_FT_SYMLINK != FT_SYMLINK); +- + return fs_umode_to_ftype(inode->i_mode); + } + +@@ -6998,8 +6991,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, + WARN_ON(pg_offset != 0); + compress_type = btrfs_file_extent_compression(leaf, item); + max_size = btrfs_file_extent_ram_bytes(leaf, item); +- inline_size = btrfs_file_extent_inline_item_len(leaf, +- btrfs_item_nr(path->slots[0])); ++ inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]); + tmp = kmalloc(inline_size, GFP_NOFS); + if (!tmp) + return -ENOMEM; +@@ -7773,6 +7765,10 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, + { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct extent_map *em = *map; ++ int type; ++ u64 block_start, orig_start, orig_block_len, ram_bytes; ++ bool can_nocow = false; ++ bool space_reserved = false; + int ret = 0; + + /* +@@ -7787,9 +7783,6 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || + ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + em->block_start != EXTENT_MAP_HOLE)) { +- int type; +- u64 block_start, orig_start, orig_block_len, ram_bytes; +- + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + type = BTRFS_ORDERED_PREALLOC; + else +@@ -7799,53 +7792,92 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, + + if (can_nocow_extent(inode, start, &len, &orig_start, + &orig_block_len, &ram_bytes, false) == 1 && +- btrfs_inc_nocow_writers(fs_info, block_start)) { +- struct extent_map *em2; ++ btrfs_inc_nocow_writers(fs_info, block_start)) ++ can_nocow = true; ++ } + +- em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len, +- orig_start, block_start, +- len, orig_block_len, +- ram_bytes, type); ++ if (can_nocow) { ++ struct extent_map *em2; ++ ++ /* We can NOCOW, so only need to 
reserve metadata space. */ ++ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); ++ if (ret < 0) { ++ /* Our caller expects us to free the input extent map. */ ++ free_extent_map(em); ++ *map = NULL; + btrfs_dec_nocow_writers(fs_info, block_start); +- if (type == BTRFS_ORDERED_PREALLOC) { +- free_extent_map(em); +- *map = em = em2; +- } +- +- if (em2 && IS_ERR(em2)) { +- ret = PTR_ERR(em2); +- goto out; +- } +- /* +- * For inode marked NODATACOW or extent marked PREALLOC, +- * use the existing or preallocated extent, so does not +- * need to adjust btrfs_space_info's bytes_may_use. +- */ +- btrfs_free_reserved_data_space_noquota(fs_info, len); +- goto skip_cow; ++ goto out; + } ++ space_reserved = true; ++ ++ em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len, ++ orig_start, block_start, ++ len, orig_block_len, ++ ram_bytes, type); ++ btrfs_dec_nocow_writers(fs_info, block_start); ++ if (type == BTRFS_ORDERED_PREALLOC) { ++ free_extent_map(em); ++ *map = em = em2; ++ } ++ ++ if (IS_ERR(em2)) { ++ ret = PTR_ERR(em2); ++ goto out; ++ } ++ } else { ++ const u64 prev_len = len; ++ ++ /* Our caller expects us to free the input extent map. */ ++ free_extent_map(em); ++ *map = NULL; ++ ++ /* We have to COW, so need to reserve metadata and data space. */ ++ ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), ++ &dio_data->data_reserved, ++ start, len); ++ if (ret < 0) ++ goto out; ++ space_reserved = true; ++ ++ em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); ++ if (IS_ERR(em)) { ++ ret = PTR_ERR(em); ++ goto out; ++ } ++ *map = em; ++ len = min(len, em->len - (start - em->start)); ++ if (len < prev_len) ++ btrfs_delalloc_release_space(BTRFS_I(inode), ++ dio_data->data_reserved, ++ start + len, prev_len - len, ++ true); + } + +- /* this will cow the extent */ +- free_extent_map(em); +- *map = em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); +- if (IS_ERR(em)) { +- ret = PTR_ERR(em); +- goto out; +- } ++ /* ++ * We have created our ordered extent, so we can now release our reservation ++ * for an outstanding extent. ++ */ ++ btrfs_delalloc_release_extents(BTRFS_I(inode), len); + +- len = min(len, em->len - (start - em->start)); +- +-skip_cow: + /* + * Need to update the i_size under the extent lock so buffered + * readers will get the updated i_size when we unlock. 
+ */ + if (start + len > i_size_read(inode)) + i_size_write(inode, start + len); +- +- dio_data->reserve -= len; + out: ++ if (ret && space_reserved) { ++ btrfs_delalloc_release_extents(BTRFS_I(inode), len); ++ if (can_nocow) { ++ btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); ++ } else { ++ btrfs_delalloc_release_space(BTRFS_I(inode), ++ dio_data->data_reserved, ++ start, len, true); ++ extent_changeset_free(dio_data->data_reserved); ++ dio_data->data_reserved = NULL; ++ } ++ } + return ret; + } + +@@ -7887,18 +7919,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, + if (!dio_data) + return -ENOMEM; + +- dio_data->length = length; +- if (write) { +- dio_data->reserve = round_up(length, fs_info->sectorsize); +- ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), +- &dio_data->data_reserved, +- start, dio_data->reserve); +- if (ret) { +- extent_changeset_free(dio_data->data_reserved); +- kfree(dio_data); +- return ret; +- } +- } + iomap->private = dio_data; + + +@@ -7991,14 +8011,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state); + err: +- if (dio_data) { +- btrfs_delalloc_release_space(BTRFS_I(inode), +- dio_data->data_reserved, start, +- dio_data->reserve, true); +- btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve); +- extent_changeset_free(dio_data->data_reserved); +- kfree(dio_data); +- } ++ kfree(dio_data); ++ + return ret; + } + +@@ -8028,14 +8042,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, + ret = -ENOTBLK; + } + +- if (write) { +- if (dio_data->reserve) +- btrfs_delalloc_release_space(BTRFS_I(inode), +- dio_data->data_reserved, pos, +- dio_data->reserve, true); +- btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length); ++ if (write) + extent_changeset_free(dio_data->data_reserved); +- } + out: + kfree(dio_data); + iomap->private = NULL; +diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c +index 48e03e176f31..5de240144273 100644 +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -1020,23 +1020,37 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, + return em; + } + ++static u32 get_extent_max_capacity(const struct extent_map *em) ++{ ++ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) ++ return BTRFS_MAX_COMPRESSED; ++ return BTRFS_MAX_EXTENT_SIZE; ++} ++ + static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, + bool locked) + { + struct extent_map *next; +- bool ret = true; ++ bool ret = false; + + /* this is the last extent */ + if (em->start + em->len >= i_size_read(inode)) +- return false; ++ return ret; + + next = defrag_lookup_extent(inode, em->start + em->len, locked); ++ /* No more em or hole */ + if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) +- ret = false; +- else if ((em->block_start + em->block_len == next->block_start) && +- (em->block_len > SZ_128K && next->block_len > SZ_128K)) +- ret = false; +- ++ goto out; ++ if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags)) ++ goto out; ++ /* ++ * If the next extent is at its max capacity, defragging the current ++ * extent makes no sense, as the total number of extents won't change. ++ */ ++ if (next->len >= get_extent_max_capacity(em)) ++ goto out; ++ ret = true; ++out: + free_extent_map(next); + return ret; + } +@@ -1146,22 +1160,21 @@ struct defrag_target_range { + /* + * Collect all valid target extents.
+ * ++ * @ctrl: extra defrag policy control + * @start: file offset to lookup + * @len: length to lookup +- * @extent_thresh: file extent size threshold, any extent size >= this value +- * will be ignored +- * @newer_than: only defrag extents newer than this value +- * @do_compress: whether the defrag is doing compression +- * if true, @extent_thresh will be ignored and all regular +- * file extents meeting @newer_than will be targets. + * @locked: if the range has already held extent lock + * @target_list: list of targets file extents ++ * ++ * Will update ctrl::last_scanned. + */ + static int defrag_collect_targets(struct btrfs_inode *inode, +- u64 start, u64 len, u32 extent_thresh, +- u64 newer_than, bool do_compress, +- bool locked, struct list_head *target_list) ++ struct btrfs_defrag_ctrl *ctrl, ++ u64 start, u32 len, bool locked, ++ struct list_head *target_list) + { ++ bool do_compress = ctrl->flags & BTRFS_DEFRAG_RANGE_COMPRESS; ++ bool last_is_target = false; + u64 cur = start; + int ret = 0; + +@@ -1171,6 +1184,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, + bool next_mergeable = true; + u64 range_len; + ++ last_is_target = false; + em = defrag_lookup_extent(&inode->vfs_inode, cur, locked); + if (!em) + break; +@@ -1181,7 +1195,11 @@ static int defrag_collect_targets(struct btrfs_inode *inode, + goto next; + + /* Skip older extent */ +- if (em->generation < newer_than) ++ if (em->generation < ctrl->newer_than) ++ goto next; ++ ++ /* This em is under writeback, no need to defrag */ ++ if (em->generation == (u64)-1) + goto next; + + /* +@@ -1221,7 +1239,14 @@ static int defrag_collect_targets(struct btrfs_inode *inode, + goto add; + + /* Skip too large extent */ +- if (range_len >= extent_thresh) ++ if (range_len >= ctrl->extent_thresh) ++ goto next; ++ ++ /* ++ * Skip extents already at their max capacity; this is mostly for ++ * compressed extents, whose max capacity is only 128K. ++ */ ++ if (em->len >= get_extent_max_capacity(em)) + goto next; + + next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em, +@@ -1242,6 +1267,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, + } + + add: ++ last_is_target = true; + range_len = min(extent_map_end(em), start + len) - cur; + /* + * This one is a good target, check if it can be merged into +@@ -1285,10 +1311,27 @@ static int defrag_collect_targets(struct btrfs_inode *inode, + kfree(entry); + } + } ++ if (!ret) { ++ /* ++ * If the last extent is not a target, the caller can skip to ++ * the end of that extent. ++ * Otherwise, we can only go to the end of the specified range. ++ * ++ * And we may get a range smaller than the current ++ * ctrl->last_scanned (e.g. when executed in the defrag_one_range ++ * call), so we have to ensure we don't decrease ++ * ctrl->last_scanned. ++ */ ++ if (!last_is_target) ++ ctrl->last_scanned = max(cur, ctrl->last_scanned); ++ else ++ ctrl->last_scanned = max(start + len, ctrl->last_scanned); ++ } + return ret; + } + + #define CLUSTER_SIZE (SZ_256K) ++static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); + + /* + * Defrag one contiguous target range.
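The per-extent policy the hunks above give defrag_collect_targets() reduces to a short chain of skip checks: preallocated extents, extents older than the requested transid, extents still under writeback (generation == (u64)-1), extents already at or above the size threshold, and extents that already sit at their maximum possible size are all rejected. A minimal standalone C sketch of that decision follows; struct em_model, struct ctrl_model and GEN_WRITEBACK are simplified stand-ins for the kernel's extent_map and btrfs_defrag_ctrl, not the real definitions:

/*
 * Simplified model of the per-extent skip logic in defrag_collect_targets().
 * All types and names here are illustrative stand-ins, not kernel code.
 */
#include <stdbool.h>
#include <stdint.h>

#define SZ_128K                ((uint64_t)128 * 1024)
#define BTRFS_MAX_COMPRESSED   SZ_128K
#define BTRFS_MAX_EXTENT_SIZE  ((uint64_t)128 * 1024 * 1024)
#define GEN_WRITEBACK          ((uint64_t)-1)  /* generation while under writeback */

struct em_model {                /* stand-in for struct extent_map */
	uint64_t len;
	uint64_t generation;
	bool compressed;
	bool preallocated;
};

struct ctrl_model {              /* stand-in for struct btrfs_defrag_ctrl */
	uint64_t newer_than;     /* only defrag extents newer than this transid */
	uint32_t extent_thresh;  /* extents at least this large are left alone */
};

/*
 * Compressed extents can never grow past 128K, so one that is already
 * full cannot be merged into fewer extents and is pointless to defrag.
 */
static uint64_t max_capacity(const struct em_model *em)
{
	return em->compressed ? BTRFS_MAX_COMPRESSED : BTRFS_MAX_EXTENT_SIZE;
}

static bool is_defrag_target(const struct ctrl_model *ctrl,
			     const struct em_model *em)
{
	if (em->preallocated)                   /* no real data to relocate */
		return false;
	if (em->generation < ctrl->newer_than)  /* too old for this pass */
		return false;
	if (em->generation == GEN_WRITEBACK)    /* being written back right now */
		return false;
	if (em->len >= ctrl->extent_thresh)     /* already large enough */
		return false;
	if (em->len >= max_capacity(em))        /* cannot get any bigger */
		return false;
	return true;
}

In the patch itself the same decision additionally consults the neighbouring extent via defrag_check_next_extent() before a small extent is accepted, since merging is only worthwhile when the next extent can still grow; the sketch leaves that lookup out for brevity.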
+@@ -1342,8 +1385,8 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, + return ret; + } + +-static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, +- u32 extent_thresh, u64 newer_than, bool do_compress) ++static int defrag_one_range(struct btrfs_inode *inode, ++ struct btrfs_defrag_ctrl *ctrl, u64 start, u32 len) + { + struct extent_state *cached_state = NULL; + struct defrag_target_range *entry; +@@ -1387,8 +1430,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, + * And this time we have extent locked already, pass @locked = true + * so that we won't relock the extent range and cause deadlock. + */ +- ret = defrag_collect_targets(inode, start, len, extent_thresh, +- newer_than, do_compress, true, ++ ret = defrag_collect_targets(inode, ctrl, start, len, true, + &target_list); + if (ret < 0) + goto unlock_extent; +@@ -1398,6 +1440,8 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, + &cached_state); + if (ret < 0) + break; ++ ctrl->sectors_defragged += entry->len >> ++ inode->root->fs_info->sectorsize_bits; + } + + list_for_each_entry_safe(entry, tmp, &target_list, list) { +@@ -1419,12 +1463,17 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, + return ret; + } + ++/* ++ * Return <0 for error. ++ * Return >0 if we hit the ctrl->max_sectors_to_defrag limit ++ * Return 0 if we finished the range without error. ++ * ++ * For >= 0 case, ctrl->last_scanned and ctrl->sectors_defragged will be updated. ++ */ + static int defrag_one_cluster(struct btrfs_inode *inode, + struct file_ra_state *ra, +- u64 start, u32 len, u32 extent_thresh, +- u64 newer_than, bool do_compress, +- unsigned long *sectors_defragged, +- unsigned long max_sectors) ++ struct btrfs_defrag_ctrl *ctrl, ++ u64 start, u32 len) + { + const u32 sectorsize = inode->root->fs_info->sectorsize; + struct defrag_target_range *entry; +@@ -1432,9 +1481,7 @@ static int defrag_one_cluster(struct btrfs_inode *inode, + LIST_HEAD(target_list); + int ret; + +- BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); +- ret = defrag_collect_targets(inode, start, len, extent_thresh, +- newer_than, do_compress, false, ++ ret = defrag_collect_targets(inode, ctrl, start, len, false, + &target_list); + if (ret < 0) + goto out; +@@ -1443,32 +1490,25 @@ static int defrag_one_cluster(struct btrfs_inode *inode, + u32 range_len = entry->len; + + /* Reached or beyond the limit */ +- if (max_sectors && *sectors_defragged >= max_sectors) { ++ if (ctrl->max_sectors_to_defrag && ++ ctrl->sectors_defragged >= ctrl->max_sectors_to_defrag) { + ret = 1; + break; + } + +- if (max_sectors) ++ if (ctrl->max_sectors_to_defrag) + range_len = min_t(u32, range_len, +- (max_sectors - *sectors_defragged) * sectorsize); ++ (ctrl->max_sectors_to_defrag - ++ ctrl->sectors_defragged) * sectorsize); + + if (ra) + page_cache_sync_readahead(inode->vfs_inode.i_mapping, + ra, NULL, entry->start >> PAGE_SHIFT, + ((entry->start + range_len - 1) >> PAGE_SHIFT) - + (entry->start >> PAGE_SHIFT) + 1); +- /* +- * Here we may not defrag any range if holes are punched before +- * we locked the pages. +- * But that's fine, it only affects the @sectors_defragged +- * accounting. 
+- */ +- ret = defrag_one_range(inode, entry->start, range_len, +- extent_thresh, newer_than, do_compress); ++ ret = defrag_one_range(inode, ctrl, entry->start, range_len); + if (ret < 0) + break; +- *sectors_defragged += range_len >> +- inode->root->fs_info->sectorsize_bits; + } + out: + list_for_each_entry_safe(entry, tmp, &target_list, list) { +@@ -1478,64 +1518,93 @@ static int defrag_one_cluster(struct btrfs_inode *inode, + return ret; + } + ++/* ++ * Convert the old ioctl format to the new btrfs_defrag_ctrl structure. ++ * ++ * Will also do basic tasks like setting default values and sanity checks. ++ */ ++int btrfs_defrag_ioctl_args_to_ctrl(struct btrfs_fs_info *fs_info, ++ struct btrfs_ioctl_defrag_range_args *args, ++ struct btrfs_defrag_ctrl *ctrl, ++ u64 max_sectors_to_defrag, u64 newer_than) ++{ ++ u64 range_end; ++ ++ if (args->flags & ~BTRFS_DEFRAG_RANGE_FLAGS_MASK) ++ return -EOPNOTSUPP; ++ if (args->compress_type >= BTRFS_NR_COMPRESS_TYPES) ++ return -EOPNOTSUPP; ++ ++ ctrl->start = round_down(args->start, fs_info->sectorsize); ++ /* ++ * If @len does not overflow with @start nor is -1, align the length. ++ * Otherwise set it to (u64)-1 so later btrfs_defrag_file() will ++ * determine the length using isize. ++ */ ++ if (!check_add_overflow(args->start, args->len, &range_end) && ++ args->len != (u64)-1) ++ ctrl->len = round_up(range_end, fs_info->sectorsize) - ++ ctrl->start; ++ else ++ ctrl->len = -1; ++ ctrl->flags = args->flags; ++ ctrl->compress = args->compress_type; ++ if (args->extent_thresh == 0) ++ ctrl->extent_thresh = SZ_256K; ++ else ++ ctrl->extent_thresh = args->extent_thresh; ++ ctrl->newer_than = newer_than; ++ ctrl->last_scanned = 0; ++ ctrl->sectors_defragged = 0; ++ return 0; ++} ++ + /* + * Entry point to file defragmentation. + * + * @inode: inode to be defragged + * @ra: readahead state (can be NULL) +- * @range: defrag options including range and flags +- * @newer_than: minimum transid to defrag +- * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode +- * will be defragged. ++ * @ctrl: defrag options including range and various policy parameters + * + * Return <0 for error. +- * Return >=0 for the number of sectors defragged, and range->start will be updated +- * to indicate the file offset where next defrag should be started at. +- * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without +- * defragging all the range). ++ * Return 0 if the defrag is done without error; ctrl->last_scanned and ++ * ctrl->sectors_defragged will be updated.
+ */ + int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, +- struct btrfs_ioctl_defrag_range_args *range, +- u64 newer_than, unsigned long max_to_defrag) ++ struct btrfs_defrag_ctrl *ctrl) + { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); +- unsigned long sectors_defragged = 0; + u64 isize = i_size_read(inode); + u64 cur; + u64 last_byte; +- bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS; ++ bool do_compress = ctrl->flags & BTRFS_DEFRAG_RANGE_COMPRESS; + bool ra_allocated = false; +- int compress_type = BTRFS_COMPRESS_ZLIB; + int ret = 0; +- u32 extent_thresh = range->extent_thresh; + pgoff_t start_index; + + if (isize == 0) + return 0; + +- if (range->start >= isize) ++ if (ctrl->start >= isize) + return -EINVAL; + +- if (do_compress) { +- if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES) +- return -EINVAL; +- if (range->compress_type) +- compress_type = range->compress_type; +- } ++ if (do_compress) ++ ASSERT(ctrl->compress < BTRFS_NR_COMPRESS_TYPES); + +- if (extent_thresh == 0) +- extent_thresh = SZ_256K; ++ if (ctrl->extent_thresh == 0) ++ ctrl->extent_thresh = SZ_256K; + +- if (range->start + range->len > range->start) { ++ if (ctrl->start + ctrl->len > ctrl->start) { + /* Got a specific range */ +- last_byte = min(isize, range->start + range->len); ++ last_byte = min(isize, ctrl->start + ctrl->len); + } else { + /* Defrag until file end */ + last_byte = isize; + } + + /* Align the range */ +- cur = round_down(range->start, fs_info->sectorsize); ++ cur = round_down(ctrl->start, fs_info->sectorsize); ++ ctrl->last_scanned = cur; + last_byte = round_up(last_byte, fs_info->sectorsize) - 1; + + /* +@@ -1559,12 +1628,9 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, + inode->i_mapping->writeback_index = start_index; + + while (cur < last_byte) { +- const unsigned long prev_sectors_defragged = sectors_defragged; ++ const unsigned long prev_sectors_defragged = ctrl->sectors_defragged; + u64 cluster_end; + +- /* The cluster size 256K should always be page aligned */ +- BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); +- + if (btrfs_defrag_cancelled(fs_info)) { + ret = -EAGAIN; + break; +@@ -1586,48 +1652,41 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, + break; + } + if (do_compress) +- BTRFS_I(inode)->defrag_compress = compress_type; +- ret = defrag_one_cluster(BTRFS_I(inode), ra, cur, +- cluster_end + 1 - cur, extent_thresh, +- newer_than, do_compress, +- &sectors_defragged, max_to_defrag); ++ BTRFS_I(inode)->defrag_compress = ctrl->compress; ++ ret = defrag_one_cluster(BTRFS_I(inode), ra, ctrl, cur, ++ cluster_end + 1 - cur); + +- if (sectors_defragged > prev_sectors_defragged) ++ if (ctrl->sectors_defragged > prev_sectors_defragged) + balance_dirty_pages_ratelimited(inode->i_mapping); + + btrfs_inode_unlock(inode, 0); + if (ret < 0) + break; +- cur = cluster_end + 1; ++ cur = max(cluster_end + 1, ctrl->last_scanned); + if (ret > 0) { + ret = 0; + break; + } ++ cond_resched(); + } + + if (ra_allocated) + kfree(ra); +- /* +- * Update range.start for autodefrag, this will indicate where to start +- * in next run. +- */ +- range->start = cur; +- if (sectors_defragged) { ++ if (ctrl->sectors_defragged) { + /* + * We have defragged some sectors, for compression case they + * need to be written back immediately.
+ */ +- if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) { ++ if (ctrl->flags & BTRFS_DEFRAG_RANGE_START_IO) { + filemap_flush(inode->i_mapping); + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + filemap_flush(inode->i_mapping); + } +- if (range->compress_type == BTRFS_COMPRESS_LZO) ++ if (ctrl->compress == BTRFS_COMPRESS_LZO) + btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); +- else if (range->compress_type == BTRFS_COMPRESS_ZSTD) ++ else if (ctrl->compress == BTRFS_COMPRESS_ZSTD) + btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); +- ret = sectors_defragged; + } + if (do_compress) { + btrfs_inode_lock(inode, 0); +@@ -2147,7 +2206,7 @@ static noinline int copy_to_sk(struct btrfs_path *path, + + for (i = slot; i < nritems; i++) { + item_off = btrfs_item_ptr_offset(leaf, i); +- item_len = btrfs_item_size_nr(leaf, i); ++ item_len = btrfs_item_size(leaf, i); + + btrfs_item_key_to_cpu(leaf, key, i); + if (!key_in_sk(key, sk)) +@@ -2601,7 +2660,7 @@ static int btrfs_search_path_in_tree_user(struct user_namespace *mnt_userns, + btrfs_item_key_to_cpu(leaf, &key, slot); + + item_off = btrfs_item_ptr_offset(leaf, slot); +- item_len = btrfs_item_size_nr(leaf, slot); ++ item_len = btrfs_item_size(leaf, slot); + /* Check if dirid in ROOT_REF corresponds to passed dirid */ + rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref); + if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) { +@@ -2803,7 +2862,7 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) + + item_off = btrfs_item_ptr_offset(leaf, slot) + + sizeof(struct btrfs_root_ref); +- item_len = btrfs_item_size_nr(leaf, slot) ++ item_len = btrfs_item_size(leaf, slot) + - sizeof(struct btrfs_root_ref); + read_extent_buffer(leaf, subvol_info->name, + item_off, item_len); +@@ -3148,6 +3207,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) + struct inode *inode = file_inode(file); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_ioctl_defrag_range_args range = {0}; ++ struct btrfs_defrag_ctrl ctrl = {0}; + int ret; + + ret = mnt_want_write_file(file); +@@ -3193,8 +3253,11 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) + /* the rest are all set to zero by kzalloc */ + range.len = (u64)-1; + } +- ret = btrfs_defrag_file(file_inode(file), &file->f_ra, +- &range, BTRFS_OLDEST_GENERATION, 0); ++ ret = btrfs_defrag_ioctl_args_to_ctrl(root->fs_info, &range, ++ &ctrl, 0, BTRFS_OLDEST_GENERATION); ++ if (ret < 0) ++ break; ++ ret = btrfs_defrag_file(file_inode(file), &file->f_ra, &ctrl); + if (ret > 0) + ret = 0; + break; +@@ -3683,7 +3746,6 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, + { + struct btrfs_trans_handle *trans; + u64 transid; +- int ret; + + trans = btrfs_attach_transaction_barrier(root); + if (IS_ERR(trans)) { +@@ -3695,11 +3757,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, + goto out; + } + transid = trans->transid; +- ret = btrfs_commit_transaction_async(trans); +- if (ret) { +- btrfs_end_transaction(trans); +- return ret; +- } ++ btrfs_commit_transaction_async(trans); + out: + if (argp) + if (copy_to_user(argp, &transid, sizeof(transid))) +diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c +index 0fb90cbe7669..430ad36b8b08 100644 +--- a/fs/btrfs/lzo.c ++++ b/fs/btrfs/lzo.c +@@ -55,6 +55,9 @@ + * 0x1000 | SegHdr N+1| Data payload N+1 ... 
| + */ + ++#define WORKSPACE_BUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE)) ++#define WORKSPACE_CBUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE)) ++ + struct workspace { + void *mem; + void *buf; /* where decompressed data goes */ +@@ -83,8 +86,8 @@ struct list_head *lzo_alloc_workspace(unsigned int level) + return ERR_PTR(-ENOMEM); + + workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); +- workspace->buf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL); +- workspace->cbuf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL); ++ workspace->buf = kvmalloc(WORKSPACE_BUF_LENGTH, GFP_KERNEL); ++ workspace->cbuf = kvmalloc(WORKSPACE_CBUF_LENGTH, GFP_KERNEL); + if (!workspace->mem || !workspace->buf || !workspace->cbuf) + goto fail; + +@@ -380,6 +383,17 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) + kunmap(cur_page); + cur_in += LZO_LEN; + ++ if (seg_len > WORKSPACE_CBUF_LENGTH) { ++ /* ++ * seg_len shouldn't be larger than we have allocated ++ * for workspace->cbuf ++ */ ++ btrfs_err(fs_info, "unexpectedly large lzo segment len %u", ++ seg_len); ++ ret = -EIO; ++ goto out; ++ } ++ + /* Copy the compressed segment payload into workspace */ + copy_compressed_segment(cb, workspace->cbuf, seg_len, &cur_in); + +@@ -422,7 +436,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in, + struct workspace *workspace = list_entry(ws, struct workspace, list); + size_t in_len; + size_t out_len; +- size_t max_segment_len = lzo1x_worst_compress(PAGE_SIZE); ++ size_t max_segment_len = WORKSPACE_BUF_LENGTH; + int ret = 0; + char *kaddr; + unsigned long bytes; +diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c +index aae1027bd76a..0775ae9f4419 100644 +--- a/fs/btrfs/print-tree.c ++++ b/fs/btrfs/print-tree.c +@@ -85,7 +85,7 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type) + struct btrfs_disk_key key; + unsigned long end; + unsigned long ptr; +- u32 item_size = btrfs_item_size_nr(eb, slot); ++ u32 item_size = btrfs_item_size(eb, slot); + u64 flags; + u64 offset; + int ref_index = 0; +@@ -200,7 +200,6 @@ void btrfs_print_leaf(struct extent_buffer *l) + struct btrfs_fs_info *fs_info; + int i; + u32 type, nr; +- struct btrfs_item *item; + struct btrfs_root_item *ri; + struct btrfs_dir_item *di; + struct btrfs_inode_item *ii; +@@ -224,12 +223,11 @@ void btrfs_print_leaf(struct extent_buffer *l) + btrfs_leaf_free_space(l), btrfs_header_owner(l)); + print_eb_refs_lock(l); + for (i = 0 ; i < nr ; i++) { +- item = btrfs_item_nr(i); + btrfs_item_key_to_cpu(l, &key, i); + type = key.type; + pr_info("\titem %d key (%llu %u %llu) itemoff %d itemsize %d\n", + i, key.objectid, type, key.offset, +- btrfs_item_offset(l, item), btrfs_item_size(l, item)); ++ btrfs_item_offset(l, i), btrfs_item_size(l, i)); + switch (type) { + case BTRFS_INODE_ITEM_KEY: + ii = btrfs_item_ptr(l, i, struct btrfs_inode_item); +@@ -347,7 +345,7 @@ void btrfs_print_leaf(struct extent_buffer *l) + case BTRFS_UUID_KEY_SUBVOL: + case BTRFS_UUID_KEY_RECEIVED_SUBVOL: + print_uuid_item(l, btrfs_item_ptr_offset(l, i), +- btrfs_item_size_nr(l, i)); ++ btrfs_item_size(l, i)); + break; + } + } +diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c +index b1cb5a8c2999..a978676aa627 100644 +--- a/fs/btrfs/props.c ++++ b/fs/btrfs/props.c +@@ -158,7 +158,7 @@ static int iterate_object_props(struct btrfs_root *root, + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + cur = 0; +- total_len = btrfs_item_size_nr(leaf, slot); ++ total_len = btrfs_item_size(leaf, slot); + + 
while (cur < total_len) { + u32 name_len = btrfs_dir_name_len(leaf, di); +diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c +index 26134b7476a2..3712cd5fdbfe 100644 +--- a/fs/btrfs/qgroup.c ++++ b/fs/btrfs/qgroup.c +@@ -258,16 +258,19 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) + return 0; + } + +-/* must be called with qgroup_lock held */ +-static int add_relation_rb(struct btrfs_fs_info *fs_info, +- u64 memberid, u64 parentid) ++/* ++ * Add relation specified by two qgroups. ++ * ++ * Must be called with qgroup_lock held. ++ * ++ * Return: 0 on success ++ * -ENOENT if one of the qgroups is NULL ++ * <0 other errors ++ */ ++static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *parent) + { +- struct btrfs_qgroup *member; +- struct btrfs_qgroup *parent; + struct btrfs_qgroup_list *list; + +- member = find_qgroup_rb(fs_info, memberid); +- parent = find_qgroup_rb(fs_info, parentid); + if (!member || !parent) + return -ENOENT; + +@@ -283,7 +286,27 @@ static int add_relation_rb(struct btrfs_fs_info *fs_info, + return 0; + } + +-/* must be called with qgroup_lock held */ ++/* ++ * Add relation specified by two qgroup ids. ++ * ++ * Must be called with qgroup_lock held. ++ * ++ * Return: 0 on success ++ * -ENOENT if one of the ids does not exist ++ * <0 other errors ++ */ ++static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid) ++{ ++ struct btrfs_qgroup *member; ++ struct btrfs_qgroup *parent; ++ ++ member = find_qgroup_rb(fs_info, memberid); ++ parent = find_qgroup_rb(fs_info, parentid); ++ ++ return __add_relation_rb(member, parent); ++} ++ ++/* Must be called with qgroup_lock held */ + static int del_relation_rb(struct btrfs_fs_info *fs_info, + u64 memberid, u64 parentid) + { +@@ -1444,7 +1467,7 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, + } + + spin_lock(&fs_info->qgroup_lock); +- ret = add_relation_rb(fs_info, src, dst); ++ ret = __add_relation_rb(member, parent); + if (ret < 0) { + spin_unlock(&fs_info->qgroup_lock); + goto out; + } +diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c +index e2b9f8616501..f34130d90dee 100644 +--- a/fs/btrfs/ref-verify.c ++++ b/fs/btrfs/ref-verify.c +@@ -435,7 +435,7 @@ static int process_extent_item(struct btrfs_fs_info *fs_info, + struct btrfs_extent_data_ref *dref; + struct btrfs_shared_data_ref *sref; + struct extent_buffer *leaf = path->nodes[0]; +- u32 item_size = btrfs_item_size_nr(leaf, slot); ++ u32 item_size = btrfs_item_size(leaf, slot); + unsigned long end, ptr; + u64 offset, flags, count; + int type, ret; +diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c +index e0f93b357548..a3930da4eb3f 100644 +--- a/fs/btrfs/reflink.c ++++ b/fs/btrfs/reflink.c +@@ -439,7 +439,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, + break; + } + next_key_min_offset = key.offset + datal; +- size = btrfs_item_size_nr(leaf, slot); ++ size = btrfs_item_size(leaf, slot); + read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot), + size); + +diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c +index 33a0ee7ac590..ee0a0efc7efd 100644 +--- a/fs/btrfs/relocation.c ++++ b/fs/btrfs/relocation.c +@@ -3149,7 +3149,7 @@ static int add_tree_block(struct reloc_control *rc, + u64 owner = 0; + + eb = path->nodes[0]; +- item_size = btrfs_item_size_nr(eb, path->slots[0]); ++ item_size = btrfs_item_size(eb, path->slots[0]); + + if (extent_key->type == BTRFS_METADATA_ITEM_KEY || + item_size >= sizeof(*ei) + sizeof(*bi)) { +diff
--git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c +index d20166336557..3297368aa359 100644 +--- a/fs/btrfs/root-tree.c ++++ b/fs/btrfs/root-tree.c +@@ -25,7 +25,7 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot, + u32 len; + int need_reset = 0; + +- len = btrfs_item_size_nr(eb, slot); ++ len = btrfs_item_size(eb, slot); + read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot), + min_t(u32, len, sizeof(*item))); + if (len < sizeof(*item)) +@@ -146,7 +146,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root + l = path->nodes[0]; + slot = path->slots[0]; + ptr = btrfs_item_ptr_offset(l, slot); +- old_len = btrfs_item_size_nr(l, slot); ++ old_len = btrfs_item_size(l, slot); + + /* + * If this is the first time we update the root item which originated +diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c +index 8f6ceea33969..d175c5ab1134 100644 +--- a/fs/btrfs/scrub.c ++++ b/fs/btrfs/scrub.c +@@ -758,7 +758,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) + + eb = path->nodes[0]; + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); +- item_size = btrfs_item_size_nr(eb, path->slots[0]); ++ item_size = btrfs_item_size(eb, path->slots[0]); + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + do { +diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c +index 040324d71118..93b9fe2dca67 100644 +--- a/fs/btrfs/send.c ++++ b/fs/btrfs/send.c +@@ -898,7 +898,6 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, + iterate_inode_ref_t iterate, void *ctx) + { + struct extent_buffer *eb = path->nodes[0]; +- struct btrfs_item *item; + struct btrfs_inode_ref *iref; + struct btrfs_inode_extref *extref; + struct btrfs_path *tmp_path; +@@ -930,12 +929,11 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, + if (found_key->type == BTRFS_INODE_REF_KEY) { + ptr = (unsigned long)btrfs_item_ptr(eb, slot, + struct btrfs_inode_ref); +- item = btrfs_item_nr(slot); +- total = btrfs_item_size(eb, item); ++ total = btrfs_item_size(eb, slot); + elem_size = sizeof(*iref); + } else { + ptr = btrfs_item_ptr_offset(eb, slot); +- total = btrfs_item_size_nr(eb, slot); ++ total = btrfs_item_size(eb, slot); + elem_size = sizeof(*extref); + } + +@@ -1018,7 +1016,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, + { + int ret = 0; + struct extent_buffer *eb; +- struct btrfs_item *item; + struct btrfs_dir_item *di; + struct btrfs_key di_key; + char *buf = NULL; +@@ -1047,11 +1044,10 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, + + eb = path->nodes[0]; + slot = path->slots[0]; +- item = btrfs_item_nr(slot); + di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); + cur = 0; + len = 0; +- total = btrfs_item_size(eb, item); ++ total = btrfs_item_size(eb, slot); + + num = 0; + while (cur < total) { +@@ -3622,7 +3618,7 @@ static int is_ancestor(struct btrfs_root *root, + key.type != BTRFS_INODE_EXTREF_KEY) + break; + +- item_size = btrfs_item_size_nr(leaf, slot); ++ item_size = btrfs_item_size(leaf, slot); + while (cur_offset < item_size) { + u64 parent; + u64 parent_gen; +@@ -4983,6 +4979,10 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); ++ btrfs_err(fs_info, ++ "send: IO error at offset %llu for inode %llu root %llu", ++ page_offset(page), sctx->cur_ino, ++ sctx->send_root->root_key.objectid); + put_page(page); + ret = -EIO; + 
break; +@@ -6566,7 +6566,7 @@ static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path, + } + + leaf = path->nodes[0]; +- item_size = btrfs_item_size_nr(leaf, path->slots[0]); ++ item_size = btrfs_item_size(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + while (cur_offset < item_size) { + extref = (struct btrfs_inode_extref *)(ptr + +@@ -6791,8 +6791,8 @@ static int tree_compare_item(struct btrfs_path *left_path, + int len1, len2; + unsigned long off1, off2; + +- len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]); +- len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]); ++ len1 = btrfs_item_size(left_path->nodes[0], left_path->slots[0]); ++ len2 = btrfs_item_size(right_path->nodes[0], right_path->slots[0]); + if (len1 != len2) + return 1; + +diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c +index f9eff3b0f77c..836a20fdfca1 100644 +--- a/fs/btrfs/sysfs.c ++++ b/fs/btrfs/sysfs.c +@@ -1104,6 +1104,11 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) + static char btrfs_unknown_feature_names[FEAT_MAX][NUM_FEATURE_BITS][BTRFS_FEATURE_NAME_MAX]; + static struct btrfs_feature_attr btrfs_feature_attrs[FEAT_MAX][NUM_FEATURE_BITS]; + ++static_assert(ARRAY_SIZE(btrfs_unknown_feature_names) == ++ ARRAY_SIZE(btrfs_feature_attrs)); ++static_assert(ARRAY_SIZE(btrfs_unknown_feature_names[0]) == ++ ARRAY_SIZE(btrfs_feature_attrs[0])); ++ + static const u64 supported_feature_masks[FEAT_MAX] = { + [FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP, + [FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP, +@@ -1272,11 +1277,6 @@ static void init_feature_attrs(void) + struct btrfs_feature_attr *fa; + int set, i; + +- BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names) != +- ARRAY_SIZE(btrfs_feature_attrs)); +- BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names[0]) != +- ARRAY_SIZE(btrfs_feature_attrs[0])); +- + memset(btrfs_feature_attrs, 0, sizeof(btrfs_feature_attrs)); + memset(btrfs_unknown_feature_names, 0, + sizeof(btrfs_unknown_feature_names)); +diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c +index 2a95f7224e18..51a8b075c259 100644 +--- a/fs/btrfs/tests/extent-buffer-tests.c ++++ b/fs/btrfs/tests/extent-buffer-tests.c +@@ -15,7 +15,6 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) + struct btrfs_path *path = NULL; + struct btrfs_root *root = NULL; + struct extent_buffer *eb; +- struct btrfs_item *item; + char *value = "mary had a little lamb"; + char *split1 = "mary had a little"; + char *split2 = " lamb"; +@@ -61,7 +60,6 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) + key.offset = 0; + + btrfs_setup_item_for_insert(root, path, &key, value_len); +- item = btrfs_item_nr(0); + write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0), + value_len); + +@@ -90,8 +88,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) + goto out; + } + +- item = btrfs_item_nr(0); +- if (btrfs_item_size(eb, item) != strlen(split1)) { ++ if (btrfs_item_size(eb, 0) != strlen(split1)) { + test_err("invalid len in the first split"); + ret = -EINVAL; + goto out; +@@ -115,8 +112,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) + goto out; + } + +- item = btrfs_item_nr(1); +- if (btrfs_item_size(eb, item) != strlen(split2)) { ++ if (btrfs_item_size(eb, 1) != strlen(split2)) { + test_err("invalid len in the second split"); + ret = -EINVAL; + goto out; +@@ -147,8 +143,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) 
+ goto out; + } + +- item = btrfs_item_nr(0); +- if (btrfs_item_size(eb, item) != strlen(split3)) { ++ if (btrfs_item_size(eb, 0) != strlen(split3)) { + test_err("invalid len in the first split"); + ret = -EINVAL; + goto out; +@@ -171,8 +166,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) + goto out; + } + +- item = btrfs_item_nr(1); +- if (btrfs_item_size(eb, item) != strlen(split4)) { ++ if (btrfs_item_size(eb, 1) != strlen(split4)) { + test_err("invalid len in the second split"); + ret = -EINVAL; + goto out; +@@ -195,8 +189,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) + goto out; + } + +- item = btrfs_item_nr(2); +- if (btrfs_item_size(eb, item) != strlen(split2)) { ++ if (btrfs_item_size(eb, 2) != strlen(split2)) { + test_err("invalid len in the second split"); + ret = -EINVAL; + goto out; +diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c +index 27b93a6c41bb..f3c094af9283 100644 +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -1861,50 +1861,14 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info) + return ret; + } + +-/* +- * commit transactions asynchronously. once btrfs_commit_transaction_async +- * returns, any subsequent transaction will not be allowed to join. +- */ +-struct btrfs_async_commit { +- struct btrfs_trans_handle *newtrans; +- struct work_struct work; +-}; +- +-static void do_async_commit(struct work_struct *work) +-{ +- struct btrfs_async_commit *ac = +- container_of(work, struct btrfs_async_commit, work); +- +- /* +- * We've got freeze protection passed with the transaction. +- * Tell lockdep about it. +- */ +- if (ac->newtrans->type & __TRANS_FREEZABLE) +- __sb_writers_acquired(ac->newtrans->fs_info->sb, SB_FREEZE_FS); +- +- current->journal_info = ac->newtrans; +- +- btrfs_commit_transaction(ac->newtrans); +- kfree(ac); +-} +- +-int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) ++void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) + { + struct btrfs_fs_info *fs_info = trans->fs_info; +- struct btrfs_async_commit *ac; + struct btrfs_transaction *cur_trans; + +- ac = kmalloc(sizeof(*ac), GFP_NOFS); +- if (!ac) +- return -ENOMEM; +- +- INIT_WORK(&ac->work, do_async_commit); +- ac->newtrans = btrfs_join_transaction(trans->root); +- if (IS_ERR(ac->newtrans)) { +- int err = PTR_ERR(ac->newtrans); +- kfree(ac); +- return err; +- } ++ /* Kick the transaction kthread. */ ++ set_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags); ++ wake_up_process(fs_info->transaction_kthread); + + /* take transaction reference */ + cur_trans = trans->transaction; +@@ -1912,14 +1876,6 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) + + btrfs_end_transaction(trans); + +- /* +- * Tell lockdep we've released the freeze rwsem, since the +- * async commit thread will be the one to unlock it. 
+- */ +- if (ac->newtrans->type & __TRANS_FREEZABLE) +- __sb_writers_release(fs_info->sb, SB_FREEZE_FS); +- +- schedule_work(&ac->work); + /* + * Wait for the current transaction commit to start and block + * subsequent transaction joins +@@ -1927,14 +1883,9 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) + wait_event(fs_info->transaction_blocked_wait, + cur_trans->state >= TRANS_STATE_COMMIT_START || + TRANS_ABORTED(cur_trans)); +- if (current->journal_info == trans) +- current->journal_info = NULL; +- + btrfs_put_transaction(cur_trans); +- return 0; + } + +- + static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) + { + struct btrfs_fs_info *fs_info = trans->fs_info; +@@ -2013,16 +1964,24 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans) + static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) + { + /* +- * We use writeback_inodes_sb here because if we used ++ * We use try_to_writeback_inodes_sb() here because if we used + * btrfs_start_delalloc_roots we would deadlock with fs freeze. + * Currently are holding the fs freeze lock, if we do an async flush + * we'll do btrfs_join_transaction() and deadlock because we need to + * wait for the fs freeze lock. Using the direct flushing we benefit + * from already being in a transaction and our join_transaction doesn't + * have to re-take the fs freeze lock. ++ * ++ * Note that try_to_writeback_inodes_sb() will only trigger writeback ++ * if it can read lock sb->s_umount. It will always be able to lock it, ++ * except when the filesystem is being unmounted or being frozen, but in ++ * those cases sync_filesystem() is called, which results in calling ++ * writeback_inodes_sb() while holding a write lock on sb->s_umount. ++ * Note that we don't call writeback_inodes_sb() directly, because it ++ * will emit a warning if sb->s_umount is not locked. + */ + if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) +- writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC); ++ try_to_writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC); + return 0; + } + +@@ -2224,6 +2183,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) + wait_event(cur_trans->writer_wait, + atomic_read(&cur_trans->num_writers) == 1); + ++ /* ++ * We've started the commit, clear the flag in case we were triggered to ++ * do an async commit but somebody else started before the transaction ++ * kthread could do the work. 
++ */ ++ clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags); ++ + if (TRANS_ABORTED(cur_trans)) { + ret = cur_trans->aborted; + goto scrub_continue; +diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h +index eba07b8119bb..d0705485f5c8 100644 +--- a/fs/btrfs/transaction.h ++++ b/fs/btrfs/transaction.h +@@ -219,7 +219,7 @@ void btrfs_add_dead_root(struct btrfs_root *root); + int btrfs_defrag_root(struct btrfs_root *root); + int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); + int btrfs_commit_transaction(struct btrfs_trans_handle *trans); +-int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans); ++void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans); + int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans); + bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans); + void btrfs_throttle(struct btrfs_fs_info *fs_info); +diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c +index 7733e8ac0a69..72e1c942197d 100644 +--- a/fs/btrfs/tree-checker.c ++++ b/fs/btrfs/tree-checker.c +@@ -202,7 +202,7 @@ static int check_extent_data_item(struct extent_buffer *leaf, + struct btrfs_fs_info *fs_info = leaf->fs_info; + struct btrfs_file_extent_item *fi; + u32 sectorsize = fs_info->sectorsize; +- u32 item_size = btrfs_item_size_nr(leaf, slot); ++ u32 item_size = btrfs_item_size(leaf, slot); + u64 extent_end; + + if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) { +@@ -354,17 +354,17 @@ static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key, + key->offset, sectorsize); + return -EUCLEAN; + } +- if (unlikely(!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize))) { ++ if (unlikely(!IS_ALIGNED(btrfs_item_size(leaf, slot), csumsize))) { + generic_err(leaf, slot, + "unaligned item size for csum item, have %u should be aligned to %u", +- btrfs_item_size_nr(leaf, slot), csumsize); ++ btrfs_item_size(leaf, slot), csumsize); + return -EUCLEAN; + } + if (slot > 0 && prev_key->type == BTRFS_EXTENT_CSUM_KEY) { + u64 prev_csum_end; + u32 prev_item_size; + +- prev_item_size = btrfs_item_size_nr(leaf, slot - 1); ++ prev_item_size = btrfs_item_size(leaf, slot - 1); + prev_csum_end = (prev_item_size / csumsize) * sectorsize; + prev_csum_end += prev_key->offset; + if (unlikely(prev_csum_end > key->offset)) { +@@ -483,7 +483,7 @@ static int check_dir_item(struct extent_buffer *leaf, + { + struct btrfs_fs_info *fs_info = leaf->fs_info; + struct btrfs_dir_item *di; +- u32 item_size = btrfs_item_size_nr(leaf, slot); ++ u32 item_size = btrfs_item_size(leaf, slot); + u32 cur = 0; + + if (unlikely(!check_prev_ino(leaf, key, slot, prev_key))) +@@ -640,7 +640,7 @@ static int check_block_group_item(struct extent_buffer *leaf, + struct btrfs_key *key, int slot) + { + struct btrfs_block_group_item bgi; +- u32 item_size = btrfs_item_size_nr(leaf, slot); ++ u32 item_size = btrfs_item_size(leaf, slot); + u64 flags; + u64 type; + +@@ -912,10 +912,10 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf, + { + int num_stripes; + +- if (unlikely(btrfs_item_size_nr(leaf, slot) < sizeof(struct btrfs_chunk))) { ++ if (unlikely(btrfs_item_size(leaf, slot) < sizeof(struct btrfs_chunk))) { + chunk_err(leaf, chunk, key->offset, + "invalid chunk item size: have %u expect [%zu, %u)", +- btrfs_item_size_nr(leaf, slot), ++ btrfs_item_size(leaf, slot), + sizeof(struct btrfs_chunk), + BTRFS_LEAF_DATA_SIZE(leaf->fs_info)); + return -EUCLEAN; +@@ -927,10 +927,10 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf, + goto out; + 
+ if (unlikely(btrfs_chunk_item_size(num_stripes) != +- btrfs_item_size_nr(leaf, slot))) { ++ btrfs_item_size(leaf, slot))) { + chunk_err(leaf, chunk, key->offset, + "invalid chunk item size: have %u expect %lu", +- btrfs_item_size_nr(leaf, slot), ++ btrfs_item_size(leaf, slot), + btrfs_chunk_item_size(num_stripes)); + return -EUCLEAN; + } +@@ -1095,12 +1095,12 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, + if (unlikely(ret < 0)) + return ret; + +- if (unlikely(btrfs_item_size_nr(leaf, slot) != sizeof(ri) && +- btrfs_item_size_nr(leaf, slot) != ++ if (unlikely(btrfs_item_size(leaf, slot) != sizeof(ri) && ++ btrfs_item_size(leaf, slot) != + btrfs_legacy_root_item_size())) { + generic_err(leaf, slot, + "invalid root item size, have %u expect %zu or %u", +- btrfs_item_size_nr(leaf, slot), sizeof(ri), ++ btrfs_item_size(leaf, slot), sizeof(ri), + btrfs_legacy_root_item_size()); + return -EUCLEAN; + } +@@ -1111,7 +1111,7 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, + * And since we allow geneartion_v2 as 0, it will still pass the check. + */ + read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot), +- btrfs_item_size_nr(leaf, slot)); ++ btrfs_item_size(leaf, slot)); + + /* Generation related */ + if (unlikely(btrfs_root_generation(&ri) > +@@ -1208,7 +1208,7 @@ static int check_extent_item(struct extent_buffer *leaf, + bool is_tree_block = false; + unsigned long ptr; /* Current pointer inside inline refs */ + unsigned long end; /* Extent item end */ +- const u32 item_size = btrfs_item_size_nr(leaf, slot); ++ const u32 item_size = btrfs_item_size(leaf, slot); + u64 flags; + u64 generation; + u64 total_refs; /* Total refs in btrfs_extent_item */ +@@ -1432,10 +1432,10 @@ static int check_simple_keyed_refs(struct extent_buffer *leaf, + if (key->type == BTRFS_SHARED_DATA_REF_KEY) + expect_item_size = sizeof(struct btrfs_shared_data_ref); + +- if (unlikely(btrfs_item_size_nr(leaf, slot) != expect_item_size)) { ++ if (unlikely(btrfs_item_size(leaf, slot) != expect_item_size)) { + generic_err(leaf, slot, + "invalid item size, have %u expect %u for key type %u", +- btrfs_item_size_nr(leaf, slot), ++ btrfs_item_size(leaf, slot), + expect_item_size, key->type); + return -EUCLEAN; + } +@@ -1460,12 +1460,12 @@ static int check_extent_data_ref(struct extent_buffer *leaf, + { + struct btrfs_extent_data_ref *dref; + unsigned long ptr = btrfs_item_ptr_offset(leaf, slot); +- const unsigned long end = ptr + btrfs_item_size_nr(leaf, slot); ++ const unsigned long end = ptr + btrfs_item_size(leaf, slot); + +- if (unlikely(btrfs_item_size_nr(leaf, slot) % sizeof(*dref) != 0)) { ++ if (unlikely(btrfs_item_size(leaf, slot) % sizeof(*dref) != 0)) { + generic_err(leaf, slot, + "invalid item size, have %u expect aligned to %zu for key type %u", +- btrfs_item_size_nr(leaf, slot), ++ btrfs_item_size(leaf, slot), + sizeof(*dref), key->type); + return -EUCLEAN; + } +@@ -1507,16 +1507,16 @@ static int check_inode_ref(struct extent_buffer *leaf, + if (unlikely(!check_prev_ino(leaf, key, slot, prev_key))) + return -EUCLEAN; + /* namelen can't be 0, so item_size == sizeof() is also invalid */ +- if (unlikely(btrfs_item_size_nr(leaf, slot) <= sizeof(*iref))) { ++ if (unlikely(btrfs_item_size(leaf, slot) <= sizeof(*iref))) { + inode_ref_err(leaf, slot, + "invalid item size, have %u expect (%zu, %u)", +- btrfs_item_size_nr(leaf, slot), ++ btrfs_item_size(leaf, slot), + sizeof(*iref), BTRFS_LEAF_DATA_SIZE(leaf->fs_info)); + return -EUCLEAN; + } + + ptr = 
btrfs_item_ptr_offset(leaf, slot); +- end = ptr + btrfs_item_size_nr(leaf, slot); ++ end = ptr + btrfs_item_size(leaf, slot); + while (ptr < end) { + u16 namelen; + +@@ -1689,12 +1689,12 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) + if (slot == 0) + item_end_expected = BTRFS_LEAF_DATA_SIZE(fs_info); + else +- item_end_expected = btrfs_item_offset_nr(leaf, ++ item_end_expected = btrfs_item_offset(leaf, + slot - 1); +- if (unlikely(btrfs_item_end_nr(leaf, slot) != item_end_expected)) { ++ if (unlikely(btrfs_item_data_end(leaf, slot) != item_end_expected)) { + generic_err(leaf, slot, + "unexpected item end, have %u expect %u", +- btrfs_item_end_nr(leaf, slot), ++ btrfs_item_data_end(leaf, slot), + item_end_expected); + return -EUCLEAN; + } +@@ -1704,11 +1704,11 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) + * just in case all the items are consistent to each other, but + * all point outside of the leaf. + */ +- if (unlikely(btrfs_item_end_nr(leaf, slot) > ++ if (unlikely(btrfs_item_data_end(leaf, slot) > + BTRFS_LEAF_DATA_SIZE(fs_info))) { + generic_err(leaf, slot, + "slot end outside of leaf, have %u expect range [0, %u]", +- btrfs_item_end_nr(leaf, slot), ++ btrfs_item_data_end(leaf, slot), + BTRFS_LEAF_DATA_SIZE(fs_info)); + return -EUCLEAN; + } +diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c +index 6993dcdba6f1..cc3a8d8a3841 100644 +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -386,7 +386,7 @@ static int do_overwrite_item(struct btrfs_trans_handle *trans, + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + overwrite_root = 1; + +- item_size = btrfs_item_size_nr(eb, slot); ++ item_size = btrfs_item_size(eb, slot); + src_ptr = btrfs_item_ptr_offset(eb, slot); + + /* Our caller must have done a search for the key for us. 
*/ +@@ -409,7 +409,7 @@ static int do_overwrite_item(struct btrfs_trans_handle *trans, + if (ret == 0) { + char *src_copy; + char *dst_copy; +- u32 dst_size = btrfs_item_size_nr(path->nodes[0], ++ u32 dst_size = btrfs_item_size(path->nodes[0], + path->slots[0]); + if (dst_size != item_size) + goto insert; +@@ -503,7 +503,7 @@ static int do_overwrite_item(struct btrfs_trans_handle *trans, + /* make sure any existing item is the correct size */ + if (ret == -EEXIST || ret == -EOVERFLOW) { + u32 found_size; +- found_size = btrfs_item_size_nr(path->nodes[0], ++ found_size = btrfs_item_size(path->nodes[0], + path->slots[0]); + if (found_size > item_size) + btrfs_truncate_item(path, item_size, 1); +@@ -1096,7 +1096,7 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, + * otherwise they must be unlinked as a conflict + */ + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); +- ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); ++ ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]); + while (ptr < ptr_end) { + victim_ref = (struct btrfs_inode_ref *)ptr; + victim_name_len = btrfs_inode_ref_name_len(leaf, +@@ -1155,7 +1155,7 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, + + leaf = path->nodes[0]; + +- item_size = btrfs_item_size_nr(leaf, path->slots[0]); ++ item_size = btrfs_item_size(leaf, path->slots[0]); + base = btrfs_item_ptr_offset(leaf, path->slots[0]); + + while (cur_offset < item_size) { +@@ -1318,7 +1318,7 @@ static int unlink_old_inode_refs(struct btrfs_trans_handle *trans, + + eb = path->nodes[0]; + ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]); +- ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]); ++ ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]); + while (ref_ptr < ref_end) { + char *name = NULL; + int namelen; +@@ -1504,7 +1504,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, + int ref_struct_size; + + ref_ptr = btrfs_item_ptr_offset(eb, slot); +- ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); ++ ref_end = ref_ptr + btrfs_item_size(eb, slot); + + if (key->type == BTRFS_INODE_EXTREF_KEY) { + struct btrfs_inode_extref *r; +@@ -1678,7 +1678,7 @@ static int count_inode_extrefs(struct btrfs_root *root, + break; + + leaf = path->nodes[0]; +- item_size = btrfs_item_size_nr(leaf, path->slots[0]); ++ item_size = btrfs_item_size(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + cur_offset = 0; + +@@ -1732,7 +1732,7 @@ static int count_inode_refs(struct btrfs_root *root, + key.type != BTRFS_INODE_REF_KEY) + break; + ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); +- ptr_end = ptr + btrfs_item_size_nr(path->nodes[0], ++ ptr_end = ptr + btrfs_item_size(path->nodes[0], + path->slots[0]); + while (ptr < ptr_end) { + struct btrfs_inode_ref *ref; +@@ -1950,6 +1950,34 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, + return ret; + } + ++static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, ++ struct btrfs_inode *dir, ++ struct btrfs_path *path, ++ struct btrfs_dir_item *dst_di, ++ const struct btrfs_key *log_key, ++ u8 log_type, ++ bool exists) ++{ ++ struct btrfs_key found_key; ++ ++ btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); ++ /* The existing dentry points to the same inode, don't delete it. 
*/ ++ if (found_key.objectid == log_key->objectid && ++ found_key.type == log_key->type && ++ found_key.offset == log_key->offset && ++ btrfs_dir_type(path->nodes[0], dst_di) == log_type) ++ return 1; ++ ++ /* ++ * Don't drop the conflicting directory entry if the inode for the new ++ * entry doesn't exist. ++ */ ++ if (!exists) ++ return 0; ++ ++ return drop_one_dir_item(trans, path, dir, dst_di); ++} ++ + /* + * take a single entry in a log directory item and replay it into + * the subvolume. +@@ -1975,14 +2003,17 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, + { + char *name; + int name_len; +- struct btrfs_dir_item *dst_di; +- struct btrfs_key found_key; ++ struct btrfs_dir_item *dir_dst_di; ++ struct btrfs_dir_item *index_dst_di; ++ bool dir_dst_matches = false; ++ bool index_dst_matches = false; + struct btrfs_key log_key; ++ struct btrfs_key search_key; + struct inode *dir; + u8 log_type; + bool exists; + int ret; +- bool update_size = (key->type == BTRFS_DIR_INDEX_KEY); ++ bool update_size = true; + bool name_added = false; + + dir = read_one_inode(root, key->objectid); +@@ -2008,76 +2039,53 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, + exists = (ret == 0); + ret = 0; + +- if (key->type == BTRFS_DIR_ITEM_KEY) { +- dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, +- name, name_len, 1); +- } else if (key->type == BTRFS_DIR_INDEX_KEY) { +- dst_di = btrfs_lookup_dir_index_item(trans, root, path, +- key->objectid, +- key->offset, name, +- name_len, 1); +- } else { +- /* Corruption */ +- ret = -EINVAL; ++ dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, ++ name, name_len, 1); ++ if (IS_ERR(dir_dst_di)) { ++ ret = PTR_ERR(dir_dst_di); + goto out; +- } +- +- if (IS_ERR(dst_di)) { +- ret = PTR_ERR(dst_di); +- goto out; +- } else if (!dst_di) { +- /* we need a sequence number to insert, so we only +- * do inserts for the BTRFS_DIR_INDEX_KEY types +- */ +- if (key->type != BTRFS_DIR_INDEX_KEY) ++ } else if (dir_dst_di) { ++ ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, ++ dir_dst_di, &log_key, log_type, ++ exists); ++ if (ret < 0) + goto out; +- goto insert; ++ dir_dst_matches = (ret == 1); + } + +- btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); +- /* the existing item matches the logged item */ +- if (found_key.objectid == log_key.objectid && +- found_key.type == log_key.type && +- found_key.offset == log_key.offset && +- btrfs_dir_type(path->nodes[0], dst_di) == log_type) { ++ btrfs_release_path(path); ++ ++ index_dst_di = btrfs_lookup_dir_index_item(trans, root, path, ++ key->objectid, key->offset, ++ name, name_len, 1); ++ if (IS_ERR(index_dst_di)) { ++ ret = PTR_ERR(index_dst_di); ++ goto out; ++ } else if (index_dst_di) { ++ ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, ++ index_dst_di, &log_key, ++ log_type, exists); ++ if (ret < 0) ++ goto out; ++ index_dst_matches = (ret == 1); ++ } ++ ++ btrfs_release_path(path); ++ ++ if (dir_dst_matches && index_dst_matches) { ++ ret = 0; + update_size = false; + goto out; + } + +- /* +- * don't drop the conflicting directory entry if the inode +- * for the new entry doesn't exist +- */ +- if (!exists) +- goto out; +- +- ret = drop_one_dir_item(trans, path, BTRFS_I(dir), dst_di); +- if (ret) +- goto out; +- +- if (key->type == BTRFS_DIR_INDEX_KEY) +- goto insert; +-out: +- btrfs_release_path(path); +- if (!ret && update_size) { +- btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2); +- ret = 
btrfs_update_inode(trans, root, BTRFS_I(dir)); +- } +- kfree(name); +- iput(dir); +- if (!ret && name_added) +- ret = 1; +- return ret; +- +-insert: + /* + * Check if the inode reference exists in the log for the given name, + * inode and parent inode + */ +- found_key.objectid = log_key.objectid; +- found_key.type = BTRFS_INODE_REF_KEY; +- found_key.offset = key->objectid; +- ret = backref_in_log(root->log_root, &found_key, 0, name, name_len); ++ search_key.objectid = log_key.objectid; ++ search_key.type = BTRFS_INODE_REF_KEY; ++ search_key.offset = key->objectid; ++ ret = backref_in_log(root->log_root, &search_key, 0, name, name_len); + if (ret < 0) { + goto out; + } else if (ret) { +@@ -2087,10 +2095,10 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, + goto out; + } + +- found_key.objectid = log_key.objectid; +- found_key.type = BTRFS_INODE_EXTREF_KEY; +- found_key.offset = key->objectid; +- ret = backref_in_log(root->log_root, &found_key, key->objectid, name, ++ search_key.objectid = log_key.objectid; ++ search_key.type = BTRFS_INODE_EXTREF_KEY; ++ search_key.offset = key->objectid; ++ ret = backref_in_log(root->log_root, &search_key, key->objectid, name, + name_len); + if (ret < 0) { + goto out; +@@ -2109,87 +2117,76 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, + name_added = true; + update_size = false; + ret = 0; +- goto out; ++ ++out: ++ if (!ret && update_size) { ++ btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2); ++ ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); ++ } ++ kfree(name); ++ iput(dir); ++ if (!ret && name_added) ++ ret = 1; ++ return ret; + } + +-/* +- * find all the names in a directory item and reconcile them into +- * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than +- * one name in a directory item, but the same code gets used for +- * both directory index types +- */ ++/* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */ + static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) + { +- int ret = 0; +- u32 item_size = btrfs_item_size_nr(eb, slot); ++ int ret; + struct btrfs_dir_item *di; +- int name_len; +- unsigned long ptr; +- unsigned long ptr_end; +- struct btrfs_path *fixup_path = NULL; + +- ptr = btrfs_item_ptr_offset(eb, slot); +- ptr_end = ptr + item_size; +- while (ptr < ptr_end) { +- di = (struct btrfs_dir_item *)ptr; +- name_len = btrfs_dir_name_len(eb, di); +- ret = replay_one_name(trans, root, path, eb, di, key); +- if (ret < 0) +- break; +- ptr = (unsigned long)(di + 1); +- ptr += name_len; ++ /* We only log dir index keys, which only contain a single dir item. */ ++ ASSERT(key->type == BTRFS_DIR_INDEX_KEY); + +- /* +- * If this entry refers to a non-directory (directories can not +- * have a link count > 1) and it was added in the transaction +- * that was not committed, make sure we fixup the link count of +- * the inode it the entry points to. 
Otherwise something like +- * the following would result in a directory pointing to an +- * inode with a wrong link that does not account for this dir +- * entry: +- * +- * mkdir testdir +- * touch testdir/foo +- * touch testdir/bar +- * sync +- * +- * ln testdir/bar testdir/bar_link +- * ln testdir/foo testdir/foo_link +- * xfs_io -c "fsync" testdir/bar +- * +- * <power failure> +- * +- * mount fs, log replay happens +- * +- * File foo would remain with a link count of 1 when it has two +- * entries pointing to it in the directory testdir. This would +- * make it impossible to ever delete the parent directory has +- * it would result in stale dentries that can never be deleted. +- */ +- if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) { +- struct btrfs_key di_key; ++ di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); ++ ret = replay_one_name(trans, root, path, eb, di, key); ++ if (ret < 0) ++ return ret; + +- if (!fixup_path) { +- fixup_path = btrfs_alloc_path(); +- if (!fixup_path) { +- ret = -ENOMEM; +- break; +- } +- } ++ /* ++ * If this entry refers to a non-directory (directories can not have a ++ * link count > 1) and it was added in the transaction that was not ++ * committed, make sure we fixup the link count of the inode the entry ++ * points to. Otherwise something like the following would result in a ++ * directory pointing to an inode with a wrong link that does not account ++ * for this dir entry: ++ * ++ * mkdir testdir ++ * touch testdir/foo ++ * touch testdir/bar ++ * sync ++ * ++ * ln testdir/bar testdir/bar_link ++ * ln testdir/foo testdir/foo_link ++ * xfs_io -c "fsync" testdir/bar ++ * ++ * <power failure> ++ * ++ * mount fs, log replay happens ++ * ++ * File foo would remain with a link count of 1 when it has two entries ++ * pointing to it in the directory testdir. This would make it impossible ++ * to ever delete the parent directory has it would result in stale ++ * dentries that can never be deleted. 
++ */ ++ if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) { ++ struct btrfs_path *fixup_path; ++ struct btrfs_key di_key; + +- btrfs_dir_item_key_to_cpu(eb, di, &di_key); +- ret = link_to_fixup_dir(trans, root, fixup_path, +- di_key.objectid); +- if (ret) +- break; +- } +- ret = 0; ++ fixup_path = btrfs_alloc_path(); ++ if (!fixup_path) ++ return -ENOMEM; ++ ++ btrfs_dir_item_key_to_cpu(eb, di, &di_key); ++ ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid); ++ btrfs_free_path(fixup_path); + } +- btrfs_free_path(fixup_path); ++ + return ret; + } + +@@ -2206,7 +2203,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, + */ + static noinline int find_dir_range(struct btrfs_root *root, + struct btrfs_path *path, +- u64 dirid, int key_type, ++ u64 dirid, + u64 *start_ret, u64 *end_ret) + { + struct btrfs_key key; +@@ -2219,7 +2216,7 @@ static noinline int find_dir_range(struct btrfs_root *root, + return 1; + + key.objectid = dirid; +- key.type = key_type; ++ key.type = BTRFS_DIR_LOG_INDEX_KEY; + key.offset = *start_ret; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); +@@ -2233,7 +2230,7 @@ static noinline int find_dir_range(struct btrfs_root *root, + if (ret != 0) + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + +- if (key.type != key_type || key.objectid != dirid) { ++ if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) { + ret = 1; + goto next; + } +@@ -2260,7 +2257,7 @@ static noinline int find_dir_range(struct btrfs_root *root, + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + +- if (key.type != key_type || key.objectid != dirid) { ++ if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) { + ret = 1; + goto out; + } +@@ -2291,95 +2288,82 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, + int ret; + struct extent_buffer *eb; + int slot; +- u32 item_size; + struct btrfs_dir_item *di; +- struct btrfs_dir_item *log_di; + int name_len; +- unsigned long ptr; +- unsigned long ptr_end; + char *name; +- struct inode *inode; ++ struct inode *inode = NULL; + struct btrfs_key location; + +-again: ++ /* ++ * Currenly we only log dir index keys. Even if we replay a log created ++ * by an older kernel that logged both dir index and dir item keys, all ++ * we need to do is process the dir index keys, we (and our caller) can ++ * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY). 
++ */ ++ ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY); ++ + eb = path->nodes[0]; + slot = path->slots[0]; +- item_size = btrfs_item_size_nr(eb, slot); +- ptr = btrfs_item_ptr_offset(eb, slot); +- ptr_end = ptr + item_size; +- while (ptr < ptr_end) { +- di = (struct btrfs_dir_item *)ptr; +- name_len = btrfs_dir_name_len(eb, di); +- name = kmalloc(name_len, GFP_NOFS); +- if (!name) { +- ret = -ENOMEM; +- goto out; +- } +- read_extent_buffer(eb, name, (unsigned long)(di + 1), +- name_len); +- log_di = NULL; +- if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { +- log_di = btrfs_lookup_dir_item(trans, log, log_path, +- dir_key->objectid, +- name, name_len, 0); +- } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { +- log_di = btrfs_lookup_dir_index_item(trans, log, +- log_path, ++ di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); ++ name_len = btrfs_dir_name_len(eb, di); ++ name = kmalloc(name_len, GFP_NOFS); ++ if (!name) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len); ++ ++ if (log) { ++ struct btrfs_dir_item *log_di; ++ ++ log_di = btrfs_lookup_dir_index_item(trans, log, log_path, + dir_key->objectid, + dir_key->offset, + name, name_len, 0); +- } +- if (!log_di) { +- btrfs_dir_item_key_to_cpu(eb, di, &location); +- btrfs_release_path(path); +- btrfs_release_path(log_path); +- inode = read_one_inode(root, location.objectid); +- if (!inode) { +- kfree(name); +- return -EIO; +- } +- +- ret = link_to_fixup_dir(trans, root, +- path, location.objectid); +- if (ret) { +- kfree(name); +- iput(inode); +- goto out; +- } +- +- inc_nlink(inode); +- ret = btrfs_unlink_inode(trans, BTRFS_I(dir), +- BTRFS_I(inode), name, name_len); +- if (!ret) +- ret = btrfs_run_delayed_items(trans); +- kfree(name); +- iput(inode); +- if (ret) +- goto out; +- +- /* there might still be more names under this key +- * check and repeat if required +- */ +- ret = btrfs_search_slot(NULL, root, dir_key, path, +- 0, 0); +- if (ret == 0) +- goto again; ++ if (IS_ERR(log_di)) { ++ ret = PTR_ERR(log_di); ++ goto out; ++ } else if (log_di) { ++ /* The dentry exists in the log, we have nothing to do. */ + ret = 0; + goto out; +- } else if (IS_ERR(log_di)) { +- kfree(name); +- return PTR_ERR(log_di); + } +- btrfs_release_path(log_path); +- kfree(name); +- +- ptr = (unsigned long)(di + 1); +- ptr += name_len; + } +- ret = 0; ++ ++ btrfs_dir_item_key_to_cpu(eb, di, &location); ++ btrfs_release_path(path); ++ btrfs_release_path(log_path); ++ inode = read_one_inode(root, location.objectid); ++ if (!inode) { ++ ret = -EIO; ++ goto out; ++ } ++ ++ ret = link_to_fixup_dir(trans, root, path, location.objectid); ++ if (ret) ++ goto out; ++ ++ inc_nlink(inode); ++ ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(inode), name, ++ name_len); ++ if (ret) ++ goto out; ++ ++ ret = btrfs_run_delayed_items(trans); ++ if (ret) ++ goto out; ++ ++ /* ++ * Unlike dir item keys, dir index keys can only have one name (entry) in ++ * them, as there are no key collisions since each key has a unique offset ++ * (an index number), so we're done. 
++ */ + out: + btrfs_release_path(path); + btrfs_release_path(log_path); ++ kfree(name); ++ iput(inode); + return ret; + } + +@@ -2422,7 +2406,7 @@ static int replay_xattr_deletes(struct btrfs_trans_handle *trans, + } + + di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); +- total_size = btrfs_item_size_nr(path->nodes[0], i); ++ total_size = btrfs_item_size(path->nodes[0], i); + cur = 0; + while (cur < total_size) { + u16 name_len = btrfs_dir_name_len(path->nodes[0], di); +@@ -2499,7 +2483,6 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, + { + u64 range_start; + u64 range_end; +- int key_type = BTRFS_DIR_LOG_ITEM_KEY; + int ret = 0; + struct btrfs_key dir_key; + struct btrfs_key found_key; +@@ -2507,7 +2490,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, + struct inode *dir; + + dir_key.objectid = dirid; +- dir_key.type = BTRFS_DIR_ITEM_KEY; ++ dir_key.type = BTRFS_DIR_INDEX_KEY; + log_path = btrfs_alloc_path(); + if (!log_path) + return -ENOMEM; +@@ -2521,14 +2504,14 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, + btrfs_free_path(log_path); + return 0; + } +-again: ++ + range_start = 0; + range_end = 0; + while (1) { + if (del_all) + range_end = (u64)-1; + else { +- ret = find_dir_range(log, path, dirid, key_type, ++ ret = find_dir_range(log, path, dirid, + &range_start, &range_end); + if (ret < 0) + goto out; +@@ -2555,8 +2538,10 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (found_key.objectid != dirid || +- found_key.type != dir_key.type) +- goto next_type; ++ found_key.type != dir_key.type) { ++ ret = 0; ++ goto out; ++ } + + if (found_key.offset > range_end) + break; +@@ -2575,15 +2560,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, + break; + range_start = range_end + 1; + } +- +-next_type: + ret = 0; +- if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { +- key_type = BTRFS_DIR_LOG_INDEX_KEY; +- dir_key.type = BTRFS_DIR_INDEX_KEY; +- btrfs_release_path(path); +- goto again; +- } + out: + btrfs_release_path(path); + btrfs_free_path(log_path); +@@ -2743,12 +2720,13 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, + eb, i, &key); + if (ret) + break; +- } else if (key.type == BTRFS_DIR_ITEM_KEY) { +- ret = replay_one_dir_item(wc->trans, root, path, +- eb, i, &key); +- if (ret) +- break; + } ++ /* ++ * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the ++ * BTRFS_DIR_INDEX_KEY items which we use to derive the ++ * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an ++ * older kernel with such keys, ignore them. ++ */ + } + btrfs_free_path(path); + return ret; +@@ -3551,20 +3529,10 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, + goto out_unlock; + } + +- di = btrfs_lookup_dir_item(trans, log, path, dir_ino, +- name, name_len, -1); +- if (IS_ERR(di)) { +- err = PTR_ERR(di); +- goto fail; +- } +- if (di) { +- ret = btrfs_delete_one_dir_name(trans, log, path, di); +- if (ret) { +- err = ret; +- goto fail; +- } +- } +- btrfs_release_path(path); ++ /* ++ * We only log dir index items of a directory, so we don't need to look ++ * for dir item keys. 
++ */ + di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, + index, name, name_len, -1); + if (IS_ERR(di)) { +@@ -3628,7 +3596,7 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, + static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *path, +- int key_type, u64 dirid, ++ u64 dirid, + u64 first_offset, u64 last_offset) + { + int ret; +@@ -3637,10 +3605,7 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, + + key.objectid = dirid; + key.offset = first_offset; +- if (key_type == BTRFS_DIR_ITEM_KEY) +- key.type = BTRFS_DIR_LOG_ITEM_KEY; +- else +- key.type = BTRFS_DIR_LOG_INDEX_KEY; ++ key.type = BTRFS_DIR_LOG_INDEX_KEY; + ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); + if (ret) + return ret; +@@ -3675,7 +3640,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, + + if (count == 1) { + btrfs_item_key_to_cpu(src, &key, start_slot); +- item_size = btrfs_item_size_nr(src, start_slot); ++ item_size = btrfs_item_size(src, start_slot); + batch.keys = &key; + batch.data_sizes = &item_size; + batch.total_data_size = item_size; +@@ -3698,7 +3663,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, + const int slot = start_slot + i; + + btrfs_item_key_to_cpu(src, &ins_keys[i], slot); +- ins_sizes[i] = btrfs_item_size_nr(src, slot); ++ ins_sizes[i] = btrfs_item_size(src, slot); + batch.total_data_size += ins_sizes[i]; + } + } +@@ -3732,7 +3697,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path, +- int key_type, + struct btrfs_log_ctx *ctx) + { + struct btrfs_root *log = inode->root->log_root; +@@ -3740,24 +3704,18 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, + const int nritems = btrfs_header_nritems(src); + const u64 ino = btrfs_ino(inode); + const bool inode_logged_before = inode_logged(trans, inode); +- u64 last_logged_key_offset; + bool last_found = false; + int batch_start = 0; + int batch_size = 0; + int i; + +- if (key_type == BTRFS_DIR_ITEM_KEY) +- last_logged_key_offset = inode->last_dir_item_offset; +- else +- last_logged_key_offset = inode->last_dir_index_offset; +- + for (i = path->slots[0]; i < nritems; i++) { + struct btrfs_key key; + int ret; + + btrfs_item_key_to_cpu(src, &key, i); + +- if (key.objectid != ino || key.type != key_type) { ++ if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) { + last_found = true; + break; + } +@@ -3806,7 +3764,7 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, + * we logged is in the log tree, saving time and avoiding adding + * contention on the log tree. + */ +- if (key.offset > last_logged_key_offset) ++ if (key.offset > inode->last_dir_index_offset) + goto add_to_batch; + /* + * Check if the key was already logged before. 
If not we can add +@@ -3865,7 +3823,7 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, + static noinline int log_dir_items(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, +- struct btrfs_path *dst_path, int key_type, ++ struct btrfs_path *dst_path, + struct btrfs_log_ctx *ctx, + u64 min_offset, u64 *last_offset_ret) + { +@@ -3879,7 +3837,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, + u64 ino = btrfs_ino(inode); + + min_key.objectid = ino; +- min_key.type = key_type; ++ min_key.type = BTRFS_DIR_INDEX_KEY; + min_key.offset = min_offset; + + ret = btrfs_search_forward(root, &min_key, path, trans->transid); +@@ -3888,9 +3846,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, + * we didn't find anything from this transaction, see if there + * is anything at all + */ +- if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { ++ if (ret != 0 || min_key.objectid != ino || ++ min_key.type != BTRFS_DIR_INDEX_KEY) { + min_key.objectid = ino; +- min_key.type = key_type; ++ min_key.type = BTRFS_DIR_INDEX_KEY; + min_key.offset = (u64)-1; + btrfs_release_path(path); + ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); +@@ -3898,7 +3857,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, + btrfs_release_path(path); + return ret; + } +- ret = btrfs_previous_item(root, path, ino, key_type); ++ ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); + + /* if ret == 0 there are items for this type, + * create a range to tell us the last key of this type. +@@ -3909,18 +3868,18 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, + struct btrfs_key tmp; + btrfs_item_key_to_cpu(path->nodes[0], &tmp, + path->slots[0]); +- if (key_type == tmp.type) ++ if (tmp.type == BTRFS_DIR_INDEX_KEY) + first_offset = max(min_offset, tmp.offset) + 1; + } + goto done; + } + + /* go backward to find any previous key */ +- ret = btrfs_previous_item(root, path, ino, key_type); ++ ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); + if (ret == 0) { + struct btrfs_key tmp; + btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); +- if (key_type == tmp.type) { ++ if (tmp.type == BTRFS_DIR_INDEX_KEY) { + first_offset = tmp.offset; + ret = overwrite_item(trans, log, dst_path, + path->nodes[0], path->slots[0], +@@ -3951,8 +3910,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, + * from our directory + */ + while (1) { +- ret = process_dir_items_leaf(trans, inode, path, dst_path, +- key_type, ctx); ++ ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx); + if (ret != 0) { + if (ret < 0) + err = ret; +@@ -3973,7 +3931,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, + goto done; + } + btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]); +- if (min_key.objectid != ino || min_key.type != key_type) { ++ if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) { + last_offset = (u64)-1; + goto done; + } +@@ -4004,8 +3962,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, + * insert the log range keys to indicate where the log + * is valid + */ +- ret = insert_dir_log_key(trans, log, path, key_type, +- ino, first_offset, last_offset); ++ ret = insert_dir_log_key(trans, log, path, ino, first_offset, ++ last_offset); + if (ret) + err = ret; + } +@@ -4033,35 +3991,28 @@ static noinline int log_directory_changes(struct btrfs_trans_handle 
*trans, + u64 min_key; + u64 max_key; + int ret; +- int key_type = BTRFS_DIR_ITEM_KEY; + + /* + * If this is the first time we are being logged in the current + * transaction, or we were logged before but the inode was evicted and +- * reloaded later, in which case its logged_trans is 0, reset the values +- * of the last logged key offsets. Note that we don't use the helper ++ * reloaded later, in which case its logged_trans is 0, reset the value ++ * of the last logged key offset. Note that we don't use the helper + * function inode_logged() here - that is because the function returns + * true after an inode eviction, assuming the worst case as it can not + * know for sure if the inode was logged before. So we can not skip key + * searches in the case the inode was evicted, because it may not have + * been logged in this transaction and may have been logged in a past +- * transaction, so we need to reset the last dir item and index offsets +- * to (u64)-1. ++ * transaction, so we need to reset the last dir index offset to (u64)-1. + */ +- if (inode->logged_trans != trans->transid) { +- inode->last_dir_item_offset = (u64)-1; ++ if (inode->logged_trans != trans->transid) + inode->last_dir_index_offset = (u64)-1; +- } +-again: ++ + min_key = 0; + max_key = 0; +- if (key_type == BTRFS_DIR_ITEM_KEY) +- ctx->last_dir_item_offset = inode->last_dir_item_offset; +- else +- ctx->last_dir_item_offset = inode->last_dir_index_offset; ++ ctx->last_dir_item_offset = inode->last_dir_index_offset; + + while (1) { +- ret = log_dir_items(trans, inode, path, dst_path, key_type, ++ ret = log_dir_items(trans, inode, path, dst_path, + ctx, min_key, &max_key); + if (ret) + return ret; +@@ -4070,13 +4021,8 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, + min_key = max_key + 1; + } + +- if (key_type == BTRFS_DIR_ITEM_KEY) { +- inode->last_dir_item_offset = ctx->last_dir_item_offset; +- key_type = BTRFS_DIR_INDEX_KEY; +- goto again; +- } else { +- inode->last_dir_index_offset = ctx->last_dir_item_offset; +- } ++ inode->last_dir_index_offset = ctx->last_dir_item_offset; ++ + return 0; + } + +@@ -4350,7 +4296,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, + batch.nr = nr; + + for (i = 0; i < nr; i++) { +- ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); ++ ins_sizes[i] = btrfs_item_size(src, i + start_slot); + batch.total_data_size += ins_sizes[i]; + btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); + } +@@ -4573,14 +4519,34 @@ static int log_one_extent(struct btrfs_trans_handle *trans, + { + struct btrfs_drop_extents_args drop_args = { 0 }; + struct btrfs_root *log = inode->root->log_root; +- struct btrfs_file_extent_item *fi; ++ struct btrfs_file_extent_item fi = { 0 }; + struct extent_buffer *leaf; +- struct btrfs_map_token token; + struct btrfs_key key; + u64 extent_offset = em->start - em->orig_start; + u64 block_len; + int ret; + ++ btrfs_set_stack_file_extent_generation(&fi, trans->transid); ++ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) ++ btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC); ++ else ++ btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG); ++ ++ block_len = max(em->block_len, em->orig_block_len); ++ if (em->compress_type != BTRFS_COMPRESS_NONE) { ++ btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start); ++ btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); ++ } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { ++ btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start - ++ 
extent_offset); ++ btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); ++ } ++ ++ btrfs_set_stack_file_extent_offset(&fi, extent_offset); ++ btrfs_set_stack_file_extent_num_bytes(&fi, em->len); ++ btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes); ++ btrfs_set_stack_file_extent_compression(&fi, em->compress_type); ++ + ret = log_extent_csums(trans, inode, log, em, ctx); + if (ret) + return ret; +@@ -4599,7 +4565,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, + drop_args.start = em->start; + drop_args.end = em->start + em->len; + drop_args.replace_extent = true; +- drop_args.extent_item_size = sizeof(*fi); ++ drop_args.extent_item_size = sizeof(fi); + ret = btrfs_drop_extents(trans, log, inode, &drop_args); + if (ret) + return ret; +@@ -4611,44 +4577,14 @@ static int log_one_extent(struct btrfs_trans_handle *trans, + key.offset = em->start; + + ret = btrfs_insert_empty_item(trans, log, path, &key, +- sizeof(*fi)); ++ sizeof(fi)); + if (ret) + return ret; + } + leaf = path->nodes[0]; +- btrfs_init_map_token(&token, leaf); +- fi = btrfs_item_ptr(leaf, path->slots[0], +- struct btrfs_file_extent_item); +- +- btrfs_set_token_file_extent_generation(&token, fi, trans->transid); +- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) +- btrfs_set_token_file_extent_type(&token, fi, +- BTRFS_FILE_EXTENT_PREALLOC); +- else +- btrfs_set_token_file_extent_type(&token, fi, +- BTRFS_FILE_EXTENT_REG); +- +- block_len = max(em->block_len, em->orig_block_len); +- if (em->compress_type != BTRFS_COMPRESS_NONE) { +- btrfs_set_token_file_extent_disk_bytenr(&token, fi, +- em->block_start); +- btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len); +- } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { +- btrfs_set_token_file_extent_disk_bytenr(&token, fi, +- em->block_start - +- extent_offset); +- btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len); +- } else { +- btrfs_set_token_file_extent_disk_bytenr(&token, fi, 0); +- btrfs_set_token_file_extent_disk_num_bytes(&token, fi, 0); +- } +- +- btrfs_set_token_file_extent_offset(&token, fi, extent_offset); +- btrfs_set_token_file_extent_num_bytes(&token, fi, em->len); +- btrfs_set_token_file_extent_ram_bytes(&token, fi, em->ram_bytes); +- btrfs_set_token_file_extent_compression(&token, fi, em->compress_type); +- btrfs_set_token_file_extent_encryption(&token, fi, 0); +- btrfs_set_token_file_extent_other_encoding(&token, fi, 0); ++ write_extent_buffer(leaf, &fi, ++ btrfs_item_ptr_offset(leaf, path->slots[0]), ++ sizeof(fi)); + btrfs_mark_buffer_dirty(leaf); + + btrfs_release_path(path); +@@ -4862,7 +4798,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, + WARN_ON(!list_empty(&extents)); + write_unlock(&tree->lock); + +- btrfs_release_path(path); + if (!ret) + ret = btrfs_log_prealloc_extents(trans, inode, path); + if (ret) +@@ -5166,7 +5101,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, + struct btrfs_path *search_path; + char *name = NULL; + u32 name_len = 0; +- u32 item_size = btrfs_item_size_nr(eb, slot); ++ u32 item_size = btrfs_item_size(eb, slot); + u32 cur_offset = 0; + unsigned long ptr = btrfs_item_ptr_offset(eb, slot); + +@@ -5899,18 +5834,12 @@ struct btrfs_dir_list { + * link_to_fixup_dir()); + * + * 2) For directories we log with a mode of LOG_INODE_ALL. 
It's possible that +- * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and +- * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item ++ * while logging the inode's items new index items (key type ++ * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item + * has a size that doesn't match the sum of the lengths of all the logged +- * names. This does not result in a problem because if a dir_item key is +- * logged but its matching dir_index key is not logged, at log replay time we +- * don't use it to replay the respective name (see replay_one_name()). On the +- * other hand if only the dir_index key ends up being logged, the respective +- * name is added to the fs/subvol tree with both the dir_item and dir_index +- * keys created (see replay_one_name()). +- * The directory's inode item with a wrong i_size is not a problem as well, +- * since we don't use it at log replay time to set the i_size in the inode +- * item of the fs/subvol tree (see overwrite_item()). ++ * names - this is ok, not a problem, because at log replay time we set the ++ * directory's i_size to the correct value (see replay_one_name() and ++ * do_overwrite_item()). + */ + static int log_new_dir_dentries(struct btrfs_trans_handle *trans, + struct btrfs_root *root, +@@ -5956,7 +5885,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, + goto next_dir_inode; + + min_key.objectid = dir_elem->ino; +- min_key.type = BTRFS_DIR_ITEM_KEY; ++ min_key.type = BTRFS_DIR_INDEX_KEY; + min_key.offset = 0; + again: + btrfs_release_path(path); +@@ -5981,7 +5910,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, + + btrfs_item_key_to_cpu(leaf, &min_key, i); + if (min_key.objectid != dir_elem->ino || +- min_key.type != BTRFS_DIR_ITEM_KEY) ++ min_key.type != BTRFS_DIR_INDEX_KEY) + goto next_dir_inode; + + di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); +@@ -6093,7 +6022,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, + if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) + break; + +- item_size = btrfs_item_size_nr(leaf, slot); ++ item_size = btrfs_item_size(leaf, slot); + ptr = btrfs_item_ptr_offset(leaf, slot); + while (cur_offset < item_size) { + struct btrfs_key inode_key; +@@ -6795,15 +6724,14 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, + * was previously logged, make sure the next log attempt on the directory + * is not skipped and logs the inode again. This is because the log may + * not currently be authoritative for a range including the old +- * BTRFS_DIR_ITEM_KEY and BTRFS_DIR_INDEX_KEY keys, so we want to make +- * sure after a log replay we do not end up with both the new and old +- * dentries around (in case the inode is a directory we would have a +- * directory with two hard links and 2 inode references for different +- * parents). The next log attempt of old_dir will happen at +- * btrfs_log_all_parents(), called through btrfs_log_inode_parent() +- * below, because we have previously set inode->last_unlink_trans to the +- * current transaction ID, either here or at btrfs_record_unlink_dir() in +- * case inode is a directory. ++ * BTRFS_DIR_INDEX_KEY key, so we want to make sure after a log replay we ++ * do not end up with both the new and old dentries around (in case the ++ * inode is a directory we would have a directory with two hard links and ++ * 2 inode references for different parents). 
The next log attempt of ++ * old_dir will happen at btrfs_log_all_parents(), called through ++ * btrfs_log_inode_parent() below, because we have previously set ++ * inode->last_unlink_trans to the current transaction ID, either here or ++ * at btrfs_record_unlink_dir() in case the inode is a directory. + */ + if (old_dir) + old_dir->logged_trans = 0; +diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c +index 74023c8a783f..b458452a1aaf 100644 +--- a/fs/btrfs/uuid-tree.c ++++ b/fs/btrfs/uuid-tree.c +@@ -52,7 +52,7 @@ static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, u8 *uuid, + + eb = path->nodes[0]; + slot = path->slots[0]; +- item_size = btrfs_item_size_nr(eb, slot); ++ item_size = btrfs_item_size(eb, slot); + offset = btrfs_item_ptr_offset(eb, slot); + ret = -ENOENT; + +@@ -125,7 +125,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, + eb = path->nodes[0]; + slot = path->slots[0]; + offset = btrfs_item_ptr_offset(eb, slot); +- offset += btrfs_item_size_nr(eb, slot) - sizeof(subid_le); ++ offset += btrfs_item_size(eb, slot) - sizeof(subid_le); + } else { + btrfs_warn(fs_info, + "insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!", +@@ -186,7 +186,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, + eb = path->nodes[0]; + slot = path->slots[0]; + offset = btrfs_item_ptr_offset(eb, slot); +- item_size = btrfs_item_size_nr(eb, slot); ++ item_size = btrfs_item_size(eb, slot); + if (!IS_ALIGNED(item_size, sizeof(u64))) { + btrfs_warn(fs_info, "uuid item with illegal size %lu!", + (unsigned long)item_size); +@@ -208,7 +208,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, + goto out; + } + +- item_size = btrfs_item_size_nr(eb, slot); ++ item_size = btrfs_item_size(eb, slot); + if (item_size == sizeof(subid)) { + ret = btrfs_del_item(trans, uuid_root, path); + goto out; +@@ -331,7 +331,7 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info) + goto skip; + + offset = btrfs_item_ptr_offset(leaf, slot); +- item_size = btrfs_item_size_nr(leaf, slot); ++ item_size = btrfs_item_size(leaf, slot); + if (!IS_ALIGNED(item_size, sizeof(u64))) { + btrfs_warn(fs_info, + "uuid item with illegal size %lu!", +diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c +index 4968535dfff0..90eb5c2830a9 100644 +--- a/fs/btrfs/verity.c ++++ b/fs/btrfs/verity.c +@@ -333,7 +333,7 @@ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, + if (key.objectid != btrfs_ino(inode) || key.type != key_type) + break; + +- item_end = btrfs_item_size_nr(leaf, path->slots[0]) + key.offset; ++ item_end = btrfs_item_size(leaf, path->slots[0]) + key.offset; + + if (copied > 0) { + /* +diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c +index 42391d4aeb11..5f4ac1a2e1f3 100644 +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -34,6 +34,10 @@ + #include "discard.h" + #include "zoned.h" + ++#define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ ++ BTRFS_BLOCK_GROUP_RAID10 | \ ++ BTRFS_BLOCK_GROUP_RAID56_MASK) ++ + const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { + [BTRFS_RAID_RAID10] = { + .sub_stripes = 2, +@@ -4643,7 +4647,7 @@ int btrfs_uuid_scan_kthread(void *data) + + eb = path->nodes[0]; + slot = path->slots[0]; +- item_size = btrfs_item_size_nr(eb, slot); ++ item_size = btrfs_item_size(eb, slot); + if (item_size < sizeof(root_item)) + goto skip; + +@@ -6314,7 +6318,8 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct 
extent_map *em, + stripe_offset = offset - stripe_offset; + data_stripes = nr_data_stripes(map); + +- if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { ++ /* Only stripe based profiles needs to check against stripe length. */ ++ if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) { + u64 max_len = stripe_len - stripe_offset; + + /* +@@ -7730,7 +7735,7 @@ static int btrfs_device_init_dev_stats(struct btrfs_device *device, + } + slot = path->slots[0]; + eb = path->nodes[0]; +- item_size = btrfs_item_size_nr(eb, slot); ++ item_size = btrfs_item_size(eb, slot); + + ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item); + +@@ -7808,7 +7813,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, + } + + if (ret == 0 && +- btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { ++ btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { + /* need to delete old one and insert a new one */ + ret = btrfs_del_item(trans, dev_root, path); + if (ret != 0) { +diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c +index 2837b4c8424d..99abf41b89b9 100644 +--- a/fs/btrfs/xattr.c ++++ b/fs/btrfs/xattr.c +@@ -168,9 +168,8 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, + const int slot = path->slots[0]; + struct extent_buffer *leaf = path->nodes[0]; + const u16 old_data_len = btrfs_dir_data_len(leaf, di); +- const u32 item_size = btrfs_item_size_nr(leaf, slot); ++ const u32 item_size = btrfs_item_size(leaf, slot); + const u32 data_size = sizeof(*di) + name_len + size; +- struct btrfs_item *item; + unsigned long data_ptr; + char *ptr; + +@@ -196,9 +195,8 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, + btrfs_extend_item(path, data_size); + } + +- item = btrfs_item_nr(slot); + ptr = btrfs_item_ptr(leaf, slot, char); +- ptr += btrfs_item_size(leaf, item) - data_size; ++ ptr += btrfs_item_size(leaf, slot) - data_size; + di = (struct btrfs_dir_item *)ptr; + btrfs_set_dir_data_len(leaf, di, size); + data_ptr = ((unsigned long)(di + 1)) + name_len; +@@ -335,7 +333,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) + goto next_item; + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); +- item_size = btrfs_item_size_nr(leaf, slot); ++ item_size = btrfs_item_size(leaf, slot); + cur = 0; + while (cur < item_size) { + u16 name_len = btrfs_dir_name_len(leaf, di); +diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h +index 738619994e26..012a71ab5d8e 100644 +--- a/include/uapi/linux/btrfs.h ++++ b/include/uapi/linux/btrfs.h +@@ -575,8 +575,10 @@ struct btrfs_ioctl_clone_range_args { + * Used by: + * struct btrfs_ioctl_defrag_range_args.flags + */ +-#define BTRFS_DEFRAG_RANGE_COMPRESS 1 +-#define BTRFS_DEFRAG_RANGE_START_IO 2 ++#define BTRFS_DEFRAG_RANGE_COMPRESS (1UL << 0) ++#define BTRFS_DEFRAG_RANGE_START_IO (1UL << 1) ++#define BTRFS_DEFRAG_RANGE_FLAGS_MASK (BTRFS_DEFRAG_RANGE_COMPRESS |\ ++ BTRFS_DEFRAG_RANGE_START_IO) + struct btrfs_ioctl_defrag_range_args { + /* start of the defrag operation */ + __u64 start; +diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h +index e1c4c732aaba..5416f1f1a77a 100644 +--- a/include/uapi/linux/btrfs_tree.h ++++ b/include/uapi/linux/btrfs_tree.h +@@ -146,7 +146,9 @@ + + /* + * dir items are the name -> inode pointers in a directory. There is one +- * for every name in a directory. ++ * for every name in a directory. 
BTRFS_DIR_LOG_ITEM_KEY is no longer used ++ * but it's still defined here for documentation purposes and to help avoid ++ * having its numerical value reused in the future. + */ + #define BTRFS_DIR_LOG_ITEM_KEY 60 + #define BTRFS_DIR_LOG_INDEX_KEY 72 +-- +2.35.1 + |