author     Bazaah  2023-08-26 14:08:48 +0000
committer  Bazaah  2023-09-09 17:10:45 +0000
commit     216795bb003540d295ee7b6c9884b7979d60cc41 (patch)
tree       5a790c90f7359bbf859671a52f7d61bf121419f5
parent     9e5e0ad72c252a9a96b189a4b19baa69e8a99494 (diff)
download   aur-216795bb003540d295ee7b6c9884b7979d60cc41.tar.gz
repo: add backport-log-runway-expansion.patch
This fixes failures found in test_concurrent_dir_link_and_compact_log_56210,
where the underlying machinery would attempt to write into a too-small
on-disk segment. We simply backport the entire patch from `main`.

Upstream-ref: https://github.com/ceph/ceph/commit/93961385d79a9fef44266181ce6a2c2b805093d4
References: https://github.com/ceph/ceph/blob/main/src/test/objectstore/test_bluefs.cc#L1359
References: https://github.com/ceph/ceph/pull/50216
References: https://tracker.ceph.com/issues/58759
-rw-r--r--  ceph-18.2.0-backport-log-runway-expansion.patch | 577
1 file changed, 577 insertions(+), 0 deletions(-)
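
The heart of the backported change is a runway check performed before a log
transaction is written: if the pending transaction would not fit in the space
remaining in the on-disk log segment, or if that space has dropped below the
configured minimum, the log is extended first. The following is a minimal
standalone sketch of that decision only; the types and names (LogState, Conf,
extra_allocation_needed) are illustrative and are not Ceph's actual BlueFS API.

#include <cstdint>

struct LogState {
  uint64_t allocated;        // bytes currently allocated to the log file
  uint64_t write_pos;        // effective write position of the log writer
  uint64_t pending_txn_size; // size estimate of the queued transaction
};

struct Conf {
  uint64_t min_log_runway;   // e.g. 32768 in the new tests
  uint64_t max_log_runway;   // e.g. 65536 in the new tests
};

// Returns how many extra bytes to allocate, or 0 if the runway is sufficient.
uint64_t extra_allocation_needed(const LogState& log, const Conf& conf) {
  uint64_t runway = log.allocated - log.write_pos;
  if (log.pending_txn_size + conf.min_log_runway > runway) {
    // The queued transaction alone may exceed the usual runway (e.g. a very
    // long directory name), so size the extension to the transaction itself,
    // not just to the configured cap.
    return log.pending_txn_size + conf.max_log_runway;
  }
  if (runway < conf.min_log_runway) {
    return conf.max_log_runway;
  }
  return 0;
}
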
diff --git a/ceph-18.2.0-backport-log-runway-expansion.patch b/ceph-18.2.0-backport-log-runway-expansion.patch
new file mode 100644
index 000000000000..92bea1a2c633
--- /dev/null
+++ b/ceph-18.2.0-backport-log-runway-expansion.patch
@@ -0,0 +1,577 @@
+From a5509f93a7b02e31950d1aba0625dd9996c9608e Mon Sep 17 00:00:00 2001
+From: Pere Diaz Bou <pere-altea@hotmail.com>
+Date: Tue, 25 Jul 2023 17:27:14 +0200
+Subject: [PATCH 1/2] os/bluestore: test log runway expansion error
+
+Signed-off-by: Pere Diaz Bou <pere-altea@hotmail.com>
+---
+ src/test/objectstore/test_bluefs.cc | 153 ++++++++++++++++++++++++++++
+ 1 file changed, 153 insertions(+)
+
+diff --git a/src/test/objectstore/test_bluefs.cc b/src/test/objectstore/test_bluefs.cc
+index 4f77d8597ae1d..75496a89d2c39 100644
+--- a/src/test/objectstore/test_bluefs.cc
++++ b/src/test/objectstore/test_bluefs.cc
+@@ -1401,6 +1401,159 @@ TEST(BlueFS, test_concurrent_dir_link_and_compact_log_56210) {
+ }
+ }
+
++TEST(BlueFS, test_log_runway) {
++ uint64_t max_log_runway = 65536;
++ ConfSaver conf(g_ceph_context->_conf);
++ conf.SetVal("bluefs_compact_log_sync", "false");
++ conf.SetVal("bluefs_min_log_runway", "32768");
++ conf.SetVal("bluefs_max_log_runway", std::to_string(max_log_runway).c_str());
++ conf.ApplyChanges();
++
++ uint64_t size = 1048576 * 128;
++ TempBdev bdev{size};
++ BlueFS fs(g_ceph_context);
++ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576));
++ uuid_d fsid;
++ ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
++ ASSERT_EQ(0, fs.mount());
++ ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
++ // longer transaction than current runway
++ std::string longdir(max_log_runway, 'a');
++ fs.mkdir(longdir);
++ {
++ BlueFS::FileWriter *h;
++ ASSERT_EQ(0, fs.mkdir("dir"));
++ ASSERT_EQ(0, fs.open_for_write("dir", "file", &h, false));
++ h->append("foo", 3);
++ h->append("bar", 3);
++ h->append("baz", 3);
++ fs.fsync(h);
++ fs.close_writer(h);
++ }
++ fs.umount(true);
++ fs.mount();
++
++ std::vector<std::string> ls;
++ fs.readdir("dir", &ls);
++ ASSERT_EQ(ls.front(), "file");
++ uint64_t file_size = 0;
++ utime_t mtime;
++ fs.stat("dir", "file", &file_size, &mtime);
++ ASSERT_EQ(file_size, 9);
++}
++
++TEST(BlueFS, test_log_runway_2) {
++ uint64_t max_log_runway = 65536;
++ ConfSaver conf(g_ceph_context->_conf);
++ conf.SetVal("bluefs_compact_log_sync", "false");
++ conf.SetVal("bluefs_min_log_runway", "32768");
++ conf.SetVal("bluefs_max_log_runway", std::to_string(max_log_runway).c_str());
++ conf.ApplyChanges();
++
++ uint64_t size = 1048576 * 128;
++ TempBdev bdev{size};
++ BlueFS fs(g_ceph_context);
++ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576));
++ uuid_d fsid;
++ ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
++ ASSERT_EQ(0, fs.mount());
++ ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
++ // longer transaction than current runway
++ std::string longdir(max_log_runway * 2, 'a');
++ std::string longfile(max_log_runway * 2, 'b');
++ {
++ BlueFS::FileWriter *h;
++ ASSERT_EQ(0, fs.mkdir(longdir));
++ ASSERT_EQ(0, fs.open_for_write(longdir, longfile, &h, false));
++ h->append("canary", 6);
++ fs.fsync(h);
++ fs.close_writer(h);
++ fs.sync_metadata(true);
++ }
++ {
++ BlueFS::FileWriter *h;
++ ASSERT_EQ(0, fs.mkdir("dir"));
++ ASSERT_EQ(0, fs.open_for_write("dir", "file", &h, false));
++ h->append("foo", 3);
++ h->append("bar", 3);
++ h->append("baz", 3);
++ fs.fsync(h);
++ fs.close_writer(h);
++ }
++ fs.umount(true);
++ fs.mount();
++
++ std::vector<std::string> ls;
++ fs.readdir("dir", &ls);
++ ASSERT_EQ(ls.front(), "file");
++ uint64_t file_size = 0;
++ utime_t mtime;
++ fs.stat("dir", "file", &file_size, &mtime);
++ ASSERT_EQ(file_size, 9);
++ fs.stat(longdir, longfile, &file_size, &mtime);
++ ASSERT_EQ(file_size, 6);
++}
++
++TEST(BlueFS, test_log_runway_3) {
++ uint64_t max_log_runway = 65536;
++ ConfSaver conf(g_ceph_context->_conf);
++ conf.SetVal("bluefs_alloc_size", "4096");
++ conf.SetVal("bluefs_shared_alloc_size", "4096");
++ conf.SetVal("bluefs_compact_log_sync", "false");
++ conf.SetVal("bluefs_min_log_runway", "32768");
++ conf.SetVal("bluefs_max_log_runway", std::to_string(max_log_runway).c_str());
++ conf.ApplyChanges();
++
++ uint64_t size = 1048576 * 128;
++ TempBdev bdev{size};
++ BlueFS fs(g_ceph_context);
++ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576));
++ uuid_d fsid;
++ ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
++ ASSERT_EQ(0, fs.mount());
++ ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
++ // longer transaction than current runway
++ for (size_t m = 0; m < 40; m++) {
++ std::string longdir(max_log_runway + m, 'A' + m);
++ std::string longfile(max_log_runway + m, 'A' + m);
++ BlueFS::FileWriter *h;
++ ASSERT_EQ(0, fs.mkdir(longdir));
++ ASSERT_EQ(0, fs.open_for_write(longdir, longfile, &h, false));
++ h->append("canary", 6);
++ fs.fsync(h);
++ fs.close_writer(h);
++ fs.sync_metadata(true);
++ }
++ {
++ BlueFS::FileWriter *h;
++ ASSERT_EQ(0, fs.mkdir("dir"));
++ ASSERT_EQ(0, fs.open_for_write("dir", "file", &h, false));
++ h->append("foo", 3);
++ h->append("bar", 3);
++ h->append("baz", 3);
++ fs.fsync(h);
++ fs.close_writer(h);
++ }
++ fs.umount(true);
++ fs.mount();
++
++ std::vector<std::string> ls;
++ fs.readdir("dir", &ls);
++ ASSERT_EQ(ls.front(), "file");
++ uint64_t file_size = 0;
++ utime_t mtime;
++ fs.stat("dir", "file", &file_size, &mtime);
++ ASSERT_EQ(file_size, 9);
++ for (size_t m = 0; m < 40; m++) {
++ uint64_t file_size = 0;
++ utime_t mtime;
++ std::string longdir(max_log_runway + m, 'A' + m);
++ std::string longfile(max_log_runway + m, 'A' + m);
++ fs.stat(longdir, longfile, &file_size, &mtime);
++ ASSERT_EQ(file_size, 6);
++ }
++}
++
+ int main(int argc, char **argv) {
+ auto args = argv_to_vec(argc, argv);
+ map<string,string> defaults = {
+
+From e97aa857c241a9694871de3d3a4079c4d9a120d3 Mon Sep 17 00:00:00 2001
+From: Pere Diaz Bou <pere-altea@hotmail.com>
+Date: Tue, 25 Jul 2023 17:28:14 +0200
+Subject: [PATCH 2/2] os/bluestore: fix bluefs log runway enospc
+
+With these changes, every call to log compaction will try to expand its
+runway in case of insufficient log space. async compaction will ignore
+the `log_forbidden_to_expand` atomic since we know it should't be
+harmful. In any other case, expansion of log will wait until compaction
+is completed.
+
+in order to ensure op_file_update_inc fits on disk we increase the size
+of logs as previously used in _maybe_extend_log. This means we too bring
+back _maybe_extend_log with a different usage.
+
+_maybe_extend_log increases the size of the log if the runway is less
+than the min runway and if the current transaction is too big to fit.
+
+Fixes: https://tracker.ceph.com/issues/58759
+Signed-off-by: Pere Diaz Bou <pere-altea@hotmail.com>
+---
+ src/os/bluestore/BlueFS.cc | 157 ++++++++++++++--------------
+ src/os/bluestore/BlueFS.h | 7 +-
+ src/os/bluestore/bluefs_types.cc | 19 ++++
+ src/os/bluestore/bluefs_types.h | 2 +
+ src/test/objectstore/test_bluefs.cc | 9 +-
+ 5 files changed, 110 insertions(+), 84 deletions(-)
+
+diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc
+index 8454ddaf8078c..19953b130d62a 100644
+--- a/src/os/bluestore/BlueFS.cc
++++ b/src/os/bluestore/BlueFS.cc
+@@ -20,15 +20,12 @@ using TOPNSPC::common::cmd_getval;
+
+ using std::byte;
+ using std::list;
+-using std::make_pair;
+ using std::map;
+ using std::ostream;
+-using std::pair;
+ using std::set;
+ using std::string;
+ using std::to_string;
+ using std::vector;
+-using std::chrono::duration;
+ using std::chrono::seconds;
+
+ using ceph::bufferlist;
+@@ -2745,14 +2742,19 @@ void BlueFS::_compact_log_async_LD_LNF_D() //also locks FW for new_writer
+ // Part 0.
+ // Lock the log and forbid its expansion and other compactions
+
++ // lock log's run-time structures for a while
++ log.lock.lock();
++
++ // Extend log in case of having a big transaction waiting before starting compaction.
++ _maybe_extend_log();
++
+ // only one compaction allowed at one time
+ bool old_is_comp = std::atomic_exchange(&log_is_compacting, true);
+ if (old_is_comp) {
+ dout(10) << __func__ << " ongoing" <<dendl;
++ log.lock.unlock();
+ return;
+ }
+- // lock log's run-time structures for a while
+- log.lock.lock();
+ auto t0 = mono_clock::now();
+
+ // Part 1.
+@@ -2763,7 +2765,7 @@ void BlueFS::_compact_log_async_LD_LNF_D() //also locks FW for new_writer
+ // During that, no one else can write to log, otherwise we risk jumping backwards.
+ // We need to sync log, because we are injecting discontinuity, and writer is not prepared for that.
+
+- //signal _maybe_extend_log that expansion of log is temporary inacceptable
++ //signal _extend_log that expansion of log is temporary inacceptable
+ bool old_forbidden = atomic_exchange(&log_forbidden_to_expand, true);
+ ceph_assert(old_forbidden == false);
+
+@@ -2779,9 +2781,9 @@ void BlueFS::_compact_log_async_LD_LNF_D() //also locks FW for new_writer
+
+ // 1.1 allocate new log extents and store them at fnode_tail
+ File *log_file = log.writer->file.get();
++
+ old_log_jump_to = log_file->fnode.get_allocated();
+ bluefs_fnode_t fnode_tail;
+- uint64_t runway = log_file->fnode.get_allocated() - log.writer->get_effective_write_pos();
+ dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to
+ << " need 0x" << cct->_conf->bluefs_max_log_runway << std::dec << dendl;
+ int r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
+@@ -2809,7 +2811,7 @@ void BlueFS::_compact_log_async_LD_LNF_D() //also locks FW for new_writer
+ // TODO - think - if _flush_and_sync_log_jump will not add dirty files nor release pending allocations
+ // then flush_bdev() will not be necessary
+ _flush_bdev();
+- _flush_and_sync_log_jump_D(old_log_jump_to, runway);
++ _flush_and_sync_log_jump_D(old_log_jump_to);
+
+ //
+ // Part 2.
+@@ -3054,54 +3056,68 @@ void BlueFS::_consume_dirty(uint64_t seq)
+ }
+ }
+
+-// Extends log if its free space is smaller then bluefs_min_log_runway.
+-// Returns space available *BEFORE* adding new space. Signed for additional <0 detection.
+-int64_t BlueFS::_maybe_extend_log()
+-{
++int64_t BlueFS::_maybe_extend_log() {
++ uint64_t runway = log.writer->file->fnode.get_allocated() - log.writer->get_effective_write_pos();
++ // increasing the size of the log involves adding a OP_FILE_UPDATE_INC which its size will
++ // increase with respect the number of extents. bluefs_min_log_runway should cover the max size
++ // a log can get.
++ // inject new allocation in case log is too big
++ size_t expected_log_size = 0;
++ log.t.bound_encode(expected_log_size);
++ if (expected_log_size + cct->_conf->bluefs_min_log_runway > runway) {
++ _extend_log(expected_log_size + cct->_conf->bluefs_max_log_runway);
++ } else if (runway < cct->_conf->bluefs_min_log_runway) {
++ _extend_log(cct->_conf->bluefs_max_log_runway);
++ }
++ runway = log.writer->file->fnode.get_allocated() - log.writer->get_effective_write_pos();
++ return runway;
++}
++
++void BlueFS::_extend_log(uint64_t amount) {
+ ceph_assert(ceph_mutex_is_locked(log.lock));
+- // allocate some more space (before we run out)?
+- // BTW: this triggers `flush()` in the `page_aligned_appender` of `log.writer`.
+- int64_t runway = log.writer->file->fnode.get_allocated() -
+- log.writer->get_effective_write_pos();
+- if (runway < (int64_t)cct->_conf->bluefs_min_log_runway) {
+- dout(10) << __func__ << " allocating more log runway (0x"
+- << std::hex << runway << std::dec << " remaining)" << dendl;
+- /*
+- * Usually, when we are low on space in log, we just allocate new extent,
+- * put update op(log) to log and we are fine.
+- * Problem - it interferes with log compaction:
+- * New log produced in compaction will include - as last op - jump into some offset (anchor) of current log.
+- * It is assumed that log region (anchor - end) will contain all changes made by bluefs since
+- * full state capture into new log.
+- * Putting log update into (anchor - end) region is illegal, because any update there must be compatible with
+- * both logs, but old log is different then new log.
+- *
+- * Possible solutions:
+- * - stall extending log until we finish compacting and switch log (CURRENT)
+- * - re-run compaction with more runway for old log
+- * - add OP_FILE_ADDEXT that adds extent; will be compatible with both logs
+- */
+- if (log_forbidden_to_expand.load() == true) {
+- return -EWOULDBLOCK;
+- }
+- vselector->sub_usage(log.writer->file->vselector_hint, log.writer->file->fnode);
+- int r = _allocate(
++ std::unique_lock<ceph::mutex> ll(log.lock, std::adopt_lock);
++ while (log_forbidden_to_expand.load() == true) {
++ log_cond.wait(ll);
++ }
++ ll.release();
++ uint64_t allocated_before_extension = log.writer->file->fnode.get_allocated();
++ vselector->sub_usage(log.writer->file->vselector_hint, log.writer->file->fnode);
++ amount = round_up_to(amount, super.block_size);
++ int r = _allocate(
+ vselector->select_prefer_bdev(log.writer->file->vselector_hint),
+- cct->_conf->bluefs_max_log_runway,
++ amount,
+ 0,
+ &log.writer->file->fnode);
+- ceph_assert(r == 0);
+- vselector->add_usage(log.writer->file->vselector_hint, log.writer->file->fnode);
+- log.t.op_file_update_inc(log.writer->file->fnode);
++ ceph_assert(r == 0);
++ dout(10) << "extended log by 0x" << std::hex << amount << " bytes " << dendl;
++ vselector->add_usage(log.writer->file->vselector_hint, log.writer->file->fnode);
++
++ bluefs_transaction_t log_extend_transaction;
++ log_extend_transaction.seq = log.t.seq;
++ log_extend_transaction.uuid = log.t.uuid;
++ log_extend_transaction.op_file_update_inc(log.writer->file->fnode);
++
++ bufferlist bl;
++ bl.reserve(super.block_size);
++ encode(log_extend_transaction, bl);
++ _pad_bl(bl, super.block_size);
++ log.writer->append(bl);
++ ceph_assert(allocated_before_extension >= log.writer->get_effective_write_pos());
++ log.t.seq = log.seq_live;
++
++ // before sync_core we advance the seq
++ {
++ std::unique_lock<ceph::mutex> l(dirty.lock);
++ _log_advance_seq();
+ }
+- return runway;
+ }
+
+-void BlueFS::_flush_and_sync_log_core(int64_t runway)
++void BlueFS::_flush_and_sync_log_core()
+ {
+ ceph_assert(ceph_mutex_is_locked(log.lock));
+ dout(10) << __func__ << " " << log.t << dendl;
+
++
+ bufferlist bl;
+ bl.reserve(super.block_size);
+ encode(log.t, bl);
+@@ -3113,10 +3129,11 @@ void BlueFS::_flush_and_sync_log_core(int64_t runway)
+ logger->inc(l_bluefs_log_write_count, 1);
+ logger->inc(l_bluefs_logged_bytes, bl.length());
+
+- if (true) {
+- ceph_assert(bl.length() <= runway); // if we write this, we will have an unrecoverable data loss
+- // transaction will not fit extents before growth -> data loss on _replay
+- }
++ uint64_t runway = log.writer->file->fnode.get_allocated() - log.writer->get_effective_write_pos();
++ // ensure runway is big enough, this should be taken care of by _maybe_extend_log,
++ // but let's keep this here just in case.
++ ceph_assert(bl.length() <= runway);
++
+
+ log.writer->append(bl);
+
+@@ -3185,31 +3202,15 @@ void BlueFS::_release_pending_allocations(vector<interval_set<uint64_t>>& to_rel
+
+ int BlueFS::_flush_and_sync_log_LD(uint64_t want_seq)
+ {
+- int64_t available_runway;
+- do {
+- log.lock.lock();
+- dirty.lock.lock();
+- if (want_seq && want_seq <= dirty.seq_stable) {
+- dout(10) << __func__ << " want_seq " << want_seq << " <= seq_stable "
+- << dirty.seq_stable << ", done" << dendl;
+- dirty.lock.unlock();
+- log.lock.unlock();
+- return 0;
+- }
+-
+- available_runway = _maybe_extend_log();
+- if (available_runway == -EWOULDBLOCK) {
+- // we are in need of adding runway, but we are during log-switch from compaction
+- dirty.lock.unlock();
+- //instead log.lock.unlock() do move ownership
+- std::unique_lock<ceph::mutex> ll(log.lock, std::adopt_lock);
+- while (log_forbidden_to_expand.load()) {
+- log_cond.wait(ll);
+- }
+- } else {
+- ceph_assert(available_runway >= 0);
+- }
+- } while (available_runway < 0);
++ log.lock.lock();
++ dirty.lock.lock();
++ if (want_seq && want_seq <= dirty.seq_stable) {
++ dout(10) << __func__ << " want_seq " << want_seq << " <= seq_stable "
++ << dirty.seq_stable << ", done" << dendl;
++ dirty.lock.unlock();
++ log.lock.unlock();
++ return 0;
++ }
+
+ ceph_assert(want_seq == 0 || want_seq <= dirty.seq_live); // illegal to request seq that was not created yet
+ uint64_t seq =_log_advance_seq();
+@@ -3218,7 +3219,8 @@ int BlueFS::_flush_and_sync_log_LD(uint64_t want_seq)
+ to_release.swap(dirty.pending_release);
+ dirty.lock.unlock();
+
+- _flush_and_sync_log_core(available_runway);
++ _maybe_extend_log();
++ _flush_and_sync_log_core();
+ _flush_bdev(log.writer);
+ logger->set(l_bluefs_log_bytes, log.writer->file->fnode.size);
+ //now log.lock is no longer needed
+@@ -3232,8 +3234,7 @@ int BlueFS::_flush_and_sync_log_LD(uint64_t want_seq)
+ }
+
+ // Flushes log and immediately adjusts log_writer pos.
+-int BlueFS::_flush_and_sync_log_jump_D(uint64_t jump_to,
+- int64_t available_runway)
++int BlueFS::_flush_and_sync_log_jump_D(uint64_t jump_to)
+ {
+ ceph_assert(ceph_mutex_is_locked(log.lock));
+
+@@ -3246,7 +3247,7 @@ int BlueFS::_flush_and_sync_log_jump_D(uint64_t jump_to,
+ vector<interval_set<uint64_t>> to_release(dirty.pending_release.size());
+ to_release.swap(dirty.pending_release);
+ dirty.lock.unlock();
+- _flush_and_sync_log_core(available_runway);
++ _flush_and_sync_log_core();
+
+ dout(10) << __func__ << " jumping log offset from 0x" << std::hex
+ << log.writer->pos << " -> 0x" << jump_to << std::dec << dendl;
+diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h
+index adfc8eb0a235b..4c89baea3a6c1 100644
+--- a/src/os/bluestore/BlueFS.h
++++ b/src/os/bluestore/BlueFS.h
+@@ -453,15 +453,14 @@ class BlueFS {
+ #endif
+
+ int64_t _maybe_extend_log();
+- void _extend_log();
++ void _extend_log(uint64_t amount);
+ uint64_t _log_advance_seq();
+ void _consume_dirty(uint64_t seq);
+ void _clear_dirty_set_stable_D(uint64_t seq_stable);
+ void _release_pending_allocations(std::vector<interval_set<uint64_t>>& to_release);
+
+- void _flush_and_sync_log_core(int64_t available_runway);
+- int _flush_and_sync_log_jump_D(uint64_t jump_to,
+- int64_t available_runway);
++ void _flush_and_sync_log_core();
++ int _flush_and_sync_log_jump_D(uint64_t jump_to);
+ int _flush_and_sync_log_LD(uint64_t want_seq = 0);
+
+ uint64_t _estimate_transaction_size(bluefs_transaction_t* t);
+diff --git a/src/os/bluestore/bluefs_types.cc b/src/os/bluestore/bluefs_types.cc
+index c8d2ede7bed92..70c8a4fbf1c56 100644
+--- a/src/os/bluestore/bluefs_types.cc
++++ b/src/os/bluestore/bluefs_types.cc
+@@ -4,6 +4,7 @@
+ #include <algorithm>
+ #include "bluefs_types.h"
+ #include "common/Formatter.h"
++#include "include/denc.h"
+ #include "include/uuid.h"
+ #include "include/stringify.h"
+
+@@ -218,6 +219,23 @@ std::ostream& operator<<(std::ostream& out, const bluefs_fnode_delta_t& delta)
+
+ // bluefs_transaction_t
+
++DENC_HELPERS
++void bluefs_transaction_t::bound_encode(size_t &s) const {
++ uint32_t crc = op_bl.crc32c(-1);
++ DENC_START(1, 1, s);
++ denc(uuid, s);
++ denc_varint(seq, s);
++ // not using bufferlist encode method, as it merely copies the bufferptr and not
++ // contents, meaning we're left with fragmented target bl
++ __u32 len = op_bl.length();
++ denc(len, s);
++ for (auto& it : op_bl.buffers()) {
++ s += it.length();
++ }
++ denc(crc, s);
++ DENC_FINISH(s);
++}
++
+ void bluefs_transaction_t::encode(bufferlist& bl) const
+ {
+ uint32_t crc = op_bl.crc32c(-1);
+@@ -282,3 +300,4 @@ ostream& operator<<(ostream& out, const bluefs_transaction_t& t)
+ << " crc 0x" << t.op_bl.crc32c(-1)
+ << std::dec << ")";
+ }
++
+diff --git a/src/os/bluestore/bluefs_types.h b/src/os/bluestore/bluefs_types.h
+index d5d8ee5a62826..b0ce7c5c9d38d 100644
+--- a/src/os/bluestore/bluefs_types.h
++++ b/src/os/bluestore/bluefs_types.h
+@@ -308,6 +308,7 @@ struct bluefs_transaction_t {
+ encode(delta, op_bl);
+ file.reset_delta();
+ }
++
+ void op_file_remove(uint64_t ino) {
+ using ceph::encode;
+ encode((__u8)OP_FILE_REMOVE, op_bl);
+@@ -328,6 +329,7 @@ struct bluefs_transaction_t {
+ op_bl.claim_append(from.op_bl);
+ }
+
++ void bound_encode(size_t &s) const;
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& p);
+ void dump(ceph::Formatter *f) const;
+diff --git a/src/test/objectstore/test_bluefs.cc b/src/test/objectstore/test_bluefs.cc
+index 75496a89d2c39..6d3ff1218a437 100644
+--- a/src/test/objectstore/test_bluefs.cc
++++ b/src/test/objectstore/test_bluefs.cc
+@@ -1459,8 +1459,9 @@ TEST(BlueFS, test_log_runway_2) {
+ ASSERT_EQ(0, fs.mount());
+ ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
+ // longer transaction than current runway
+- std::string longdir(max_log_runway * 2, 'a');
+- std::string longfile(max_log_runway * 2, 'b');
++ size_t name_length = max_log_runway * 2;
++ std::string longdir(name_length, 'a');
++ std::string longfile(name_length, 'b');
+ {
+ BlueFS::FileWriter *h;
+ ASSERT_EQ(0, fs.mkdir(longdir));
+@@ -1492,6 +1493,10 @@ TEST(BlueFS, test_log_runway_2) {
+ ASSERT_EQ(file_size, 9);
+ fs.stat(longdir, longfile, &file_size, &mtime);
+ ASSERT_EQ(file_size, 6);
++
++ std::vector<std::string> ls_longdir;
++ fs.readdir(longdir, &ls_longdir);
++ ASSERT_EQ(ls_longdir.front(), longfile);
+ }
+
+ TEST(BlueFS, test_log_runway_3) {
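
The second patch's statement that "expansion of log will wait until compaction
is completed" corresponds to a small waiting pattern inside _extend_log: the
caller already holds the log mutex, the mutex is adopted into a unique_lock so
a condition variable can release and reacquire it while the compaction flag is
set, and release() afterwards hands the still-held mutex back to the caller.
Below is a standalone sketch of that pattern only, using plain std:: types in
place of Ceph's wrappers; Log and extend_log_locked are illustrative names,
not the actual BlueFS symbols.

#include <atomic>
#include <condition_variable>
#include <mutex>

struct Log {
  std::mutex lock;                              // held by the caller
  std::condition_variable cond;                 // signalled when compaction ends
  std::atomic<bool> forbidden_to_expand{false}; // set while logs are switched
};

void extend_log_locked(Log& log) {
  // Adopt the caller's lock so cond.wait() can temporarily release it.
  std::unique_lock<std::mutex> ll(log.lock, std::adopt_lock);
  while (log.forbidden_to_expand.load()) {
    log.cond.wait(ll);
  }
  ll.release();  // keep the mutex locked; ownership stays with the caller
  // ... allocate more extents and record the update here ...
}
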