Diffstat (limited to 'block.patch')
-rw-r--r-- | block.patch | 2650 |
1 files changed, 0 insertions, 2650 deletions
diff --git a/block.patch b/block.patch deleted file mode 100644 index 1f99d7869a0d..000000000000 --- a/block.patch +++ /dev/null @@ -1,2650 +0,0 @@ -To: LKML <linux-kernel@vger.kernel.org>, Jens Axboe <axboe@fb.com> -From: =?UTF-8?Q?Holger_Hoffst=c3=a4tte?= <holger.hoffstaette@googlemail.com> -Subject: [PATCH] loop: properly observe rotational flag of underlying device -Organization: Applied Asynchrony, Inc. -Date: Wed, 11 Nov 2015 16:21:51 +0100 - -The loop driver always declares the rotational flag of its device as -rotational, even when the device of the mapped file is nonrotational, -as is the case with SSDs or on tmpfs. This can confuse filesystem tools -which are SSD-aware; in my case I frequently forget to tell mkfs.btrfs -that my loop device on tmpfs is nonrotational, and that I really don't -need any automatic metadata redundancy. - -The attached patch fixes this by introspecting the rotational flag of the -mapped file's underlying block device, if it exists. If the mapped file's -filesystem has no associated block device - as is the case on e.g. tmpfs - -we assume nonrotational storage. If there is a better way to identify such -non-devices I'd love to hear them. - -Signed-off-by: Holger Hoffstätte <holger.hoffstaette@googlemail.com> ---- - drivers/block/loop.c | 19 +++++++++++++++++++ - 1 file changed, 19 insertions(+) - -diff --git a/drivers/block/loop.c b/drivers/block/loop.c -index 423f4ca..2984aca 100644 ---- a/drivers/block/loop.c -+++ b/drivers/block/loop.c -@@ -843,6 +843,24 @@ static void loop_config_discard(struct loop_device *lo) - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); - } - -+static void loop_update_rotational(struct loop_device *lo) -+{ -+ struct file *file = lo->lo_backing_file; -+ struct inode *file_inode = file->f_mapping->host; -+ struct block_device *file_bdev = file_inode->i_sb->s_bdev; -+ struct request_queue *q = lo->lo_queue; -+ bool nonrot = true; -+ -+ /* not all filesystems (e.g. tmpfs) have a sb->s_bdev */ -+ if (file_bdev) -+ nonrot = blk_queue_nonrot(bdev_get_queue(file_bdev)); -+ -+ if (nonrot) -+ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); -+ else -+ queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, q); -+} -+ - static void loop_unprepare_queue(struct loop_device *lo) - { - flush_kthread_worker(&lo->worker); -@@ -939,6 +957,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, - if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) - blk_queue_flush(lo->lo_queue, REQ_FLUSH); - -+ loop_update_rotational(lo); - loop_update_dio(lo); - set_capacity(lo->lo_disk, size); - bd_set_size(bdev, size << 9); --- -2.6.3 -From 273d4cb9fc3d75b6b7f147d1a064f75a5412a76c Mon Sep 17 00:00:00 2001 -From: Jens Axboe <axboe@fb.com> -Date: Wed, 27 Jul 2016 15:30:35 -0600 -Subject: block: add WRITE_BG - -This adds a new request flag, REQ_BG, that callers can use to tell -the block layer that this is background (non-urgent) IO. 
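As a rough illustration of what the new flag expresses, the standalone C sketch below (ordinary userspace code; the bit values are made-up stand-ins, not the kernel's real rq_flag_bits layout) shows how a WRITE_BG-style write type composes from individual request flags, mirroring the WRITE_BG = (REQ_NOIDLE | REQ_BG) definition this patch adds, and how a lower layer can key off REQ_BG to treat the IO as non-urgent:

#include <stdio.h>

/* Stand-in flag bits -- illustrative only, not the kernel's real values. */
#define REQ_SYNC   (1UL << 0)
#define REQ_NOIDLE (1UL << 1)
#define REQ_BG     (1UL << 2)   /* background activity, as introduced by this patch */

#define WRITE_SYNC (REQ_SYNC | REQ_NOIDLE)
#define WRITE_BG   (REQ_NOIDLE | REQ_BG)

static const char *classify(unsigned long flags)
{
	if (flags & REQ_BG)
		return "background (non-urgent) write";
	if (flags & REQ_SYNC)
		return "sync write";
	return "regular write";
}

int main(void)
{
	printf("WRITE_BG   -> %s\n", classify(WRITE_BG));
	printf("WRITE_SYNC -> %s\n", classify(WRITE_SYNC));
	return 0;
}

The actual flag plumbing added to blk_types.h and fs.h follows in the hunks below.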
- -Signed-off-by: Jens Axboe <axboe@fb.com> ---- - include/linux/blk_types.h | 4 +++- - include/linux/fs.h | 3 +++ - 2 files changed, 6 insertions(+), 1 deletion(-) - -diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h -index 436f43f..be4409b 100644 ---- a/include/linux/blk_types.h -+++ b/include/linux/blk_types.h -@@ -155,6 +155,7 @@ enum rq_flag_bits { - __REQ_INTEGRITY, /* I/O includes block integrity payload */ - __REQ_FUA, /* forced unit access */ - __REQ_PREFLUSH, /* request for cache flush */ -+ __REQ_BG, /* background activity */ - - /* bio only flags */ - __REQ_RAHEAD, /* read ahead, can fail anytime */ -@@ -198,7 +199,7 @@ enum rq_flag_bits { - (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) - #define REQ_COMMON_MASK \ - (REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | REQ_NOIDLE | \ -- REQ_PREFLUSH | REQ_FUA | REQ_INTEGRITY | REQ_NOMERGE) -+ REQ_PREFLUSH | REQ_FUA | REQ_INTEGRITY | REQ_NOMERGE | REQ_BG) - #define REQ_CLONE_MASK REQ_COMMON_MASK - - /* This mask is used for both bio and request merge checking */ -@@ -223,6 +224,7 @@ enum rq_flag_bits { - #define REQ_COPY_USER (1ULL << __REQ_COPY_USER) - #define REQ_PREFLUSH (1ULL << __REQ_PREFLUSH) - #define REQ_FLUSH_SEQ (1ULL << __REQ_FLUSH_SEQ) -+#define REQ_BG (1ULL << __REQ_BG) - #define REQ_IO_STAT (1ULL << __REQ_IO_STAT) - #define REQ_MIXED_MERGE (1ULL << __REQ_MIXED_MERGE) - #define REQ_PM (1ULL << __REQ_PM) -diff --git a/include/linux/fs.h b/include/linux/fs.h -index 901e25d..7c7951f 100644 ---- a/include/linux/fs.h -+++ b/include/linux/fs.h -@@ -189,6 +189,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, - * WRITE_FLUSH_FUA Combination of WRITE_FLUSH and FUA. The IO is preceded - * by a cache flush and data is guaranteed to be on - * non-volatile media on completion. -+ * WRITE_BG Background write. This is for background activity like -+ * the periodic flush and background threshold writeback - * - */ - #define RW_MASK REQ_OP_WRITE -@@ -202,6 +204,7 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, - #define WRITE_FLUSH (REQ_SYNC | REQ_NOIDLE | REQ_PREFLUSH) - #define WRITE_FUA (REQ_SYNC | REQ_NOIDLE | REQ_FUA) - #define WRITE_FLUSH_FUA (REQ_SYNC | REQ_NOIDLE | REQ_PREFLUSH | REQ_FUA) -+#define WRITE_BG (REQ_NOIDLE | REQ_BG) - - /* - * Attribute flags. These should be or-ed together to figure out what --- -cgit v0.11.2 - -From 33a170c4f076584bc05feb19efa7beb0ee099318 Mon Sep 17 00:00:00 2001 -From: Jens Axboe <axboe@fb.com> -Date: Wed, 27 Jul 2016 15:24:08 -0600 -Subject: writeback: add wbc_to_write_flags() - -Add wbc_to_write_flags(), which returns the write modifier flags to use, -based on a struct writeback_control. No functional changes in this -patch, but it prepares us for factoring other wbc fields for write type. - -Signed-off-by: Jens Axboe <axboe@fb.com> -Reviewed-by: Jan Kara <jack@suse.cz> ---- - fs/buffer.c | 2 +- - fs/f2fs/data.c | 2 +- - fs/f2fs/node.c | 2 +- - fs/gfs2/meta_io.c | 3 +-- - fs/mpage.c | 2 +- - fs/xfs/xfs_aops.c | 7 +++---- - include/linux/writeback.h | 8 ++++++++ - 7 files changed, 16 insertions(+), 10 deletions(-) - -diff --git a/fs/buffer.c b/fs/buffer.c -index 9c8eb9b..6a5f1a0 100644 ---- a/fs/buffer.c -+++ b/fs/buffer.c -@@ -1698,7 +1698,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, - struct buffer_head *bh, *head; - unsigned int blocksize, bbits; - int nr_underway = 0; -- int write_flags = (wbc->sync_mode == WB_SYNC_ALL ? 
WRITE_SYNC : 0); -+ int write_flags = wbc_to_write_flags(wbc); - - head = create_page_buffers(page, inode, - (1 << BH_Dirty)|(1 << BH_Uptodate)); -diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c -index ccb401e..cb0528b 100644 ---- a/fs/f2fs/data.c -+++ b/fs/f2fs/data.c -@@ -1240,7 +1240,7 @@ static int f2fs_write_data_page(struct page *page, - .sbi = sbi, - .type = DATA, - .op = REQ_OP_WRITE, -- .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0, -+ .op_flags = wbc_to_write_flags(wbc), - .page = page, - .encrypted_page = NULL, - }; -diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c -index f75d197..c1713da 100644 ---- a/fs/f2fs/node.c -+++ b/fs/f2fs/node.c -@@ -1561,7 +1561,7 @@ static int f2fs_write_node_page(struct page *page, - .sbi = sbi, - .type = NODE, - .op = REQ_OP_WRITE, -- .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0, -+ .op_flags = wbc_to_write_flags(wbc), - .page = page, - .encrypted_page = NULL, - }; -diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c -index 950b8be..7991c62 100644 ---- a/fs/gfs2/meta_io.c -+++ b/fs/gfs2/meta_io.c -@@ -37,8 +37,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb - { - struct buffer_head *bh, *head; - int nr_underway = 0; -- int write_flags = REQ_META | REQ_PRIO | -- (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0); -+ int write_flags = REQ_META | REQ_PRIO | wbc_to_write_flags(wbc); - - BUG_ON(!PageLocked(page)); - BUG_ON(!page_has_buffers(page)); -diff --git a/fs/mpage.c b/fs/mpage.c -index d2413af..d6f1afe 100644 ---- a/fs/mpage.c -+++ b/fs/mpage.c -@@ -489,7 +489,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, - struct buffer_head map_bh; - loff_t i_size = i_size_read(inode); - int ret = 0; -- int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0); -+ int op_flags = wbc_to_write_flags(wbc); - - if (page_has_buffers(page)) { - struct buffer_head *head = page_buffers(page); -diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c -index 7575cfc..a68645a 100644 ---- a/fs/xfs/xfs_aops.c -+++ b/fs/xfs/xfs_aops.c -@@ -447,8 +447,8 @@ xfs_submit_ioend( - - ioend->io_bio->bi_private = ioend; - ioend->io_bio->bi_end_io = xfs_end_bio; -- bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE, -- (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0); -+ bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE, wbc_to_write_flags(wbc)); -+ - /* - * If we are failing the IO now, just mark the ioend with an - * error and finish it. This will run IO completion immediately -@@ -519,8 +519,7 @@ xfs_chain_bio( - - bio_chain(ioend->io_bio, new); - bio_get(ioend->io_bio); /* for xfs_destroy_ioend */ -- bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE, -- (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0); -+ bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE, wbc_to_write_flags(wbc)); - submit_bio(ioend->io_bio); - ioend->io_bio = new; - } -diff --git a/include/linux/writeback.h b/include/linux/writeback.h -index fc1e16c..608afd3 100644 ---- a/include/linux/writeback.h -+++ b/include/linux/writeback.h -@@ -100,6 +100,14 @@ struct writeback_control { - #endif - }; - -+static inline int wbc_to_write_flags(struct writeback_control *wbc) -+{ -+ if (wbc->sync_mode == WB_SYNC_ALL) -+ return WRITE_SYNC; -+ -+ return 0; -+} -+ - /* - * A wb_domain represents a domain that wb's (bdi_writeback's) belong to - * and are measured against each other in. 
There always is one global --- -cgit v0.11.2 - -From d6cf7bfd4d627114ba3e2cce96fa9468042a6fba Mon Sep 17 00:00:00 2001 -From: Jens Axboe <axboe@fb.com> -Date: Thu, 14 Apr 2016 09:53:24 -0600 -Subject: writeback: use WRITE_BG for kupdate and background writeback - -If we're doing background type writes, then use the appropriate -write command for that. - -Signed-off-by: Jens Axboe <axboe@fb.com> ---- - include/linux/writeback.h | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/include/linux/writeback.h b/include/linux/writeback.h -index 608afd3..e53abf2 100644 ---- a/include/linux/writeback.h -+++ b/include/linux/writeback.h -@@ -104,6 +104,8 @@ static inline int wbc_to_write_flags(struct writeback_control *wbc) - { - if (wbc->sync_mode == WB_SYNC_ALL) - return WRITE_SYNC; -+ else if (wbc->for_kupdate || wbc->for_background) -+ return WRITE_BG; - - return 0; - } --- -cgit v0.11.2 - -From cd38cff40da34de0bf78f8305c89bdfafc606e7f Mon Sep 17 00:00:00 2001 -From: Jens Axboe <axboe@fb.com> -Date: Thu, 1 Sep 2016 10:20:33 -0600 -Subject: writeback: track if we're sleeping on progress in - balance_dirty_pages() - -Note in the bdi_writeback structure whenever a task ends up sleeping -waiting for progress. We can use that information in the lower layers -to increase the priority of writes. - -Signed-off-by: Jens Axboe <axboe@fb.com> ---- - include/linux/backing-dev-defs.h | 2 ++ - mm/backing-dev.c | 1 + - mm/page-writeback.c | 1 + - 3 files changed, 4 insertions(+) - -diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h -index c357f27..dc5f76d 100644 ---- a/include/linux/backing-dev-defs.h -+++ b/include/linux/backing-dev-defs.h -@@ -116,6 +116,8 @@ struct bdi_writeback { - struct list_head work_list; - struct delayed_work dwork; /* work item used for writeback */ - -+ unsigned long dirty_sleep; /* last wait */ -+ - struct list_head bdi_node; /* anchored at bdi->wb_list */ - - #ifdef CONFIG_CGROUP_WRITEBACK -diff --git a/mm/backing-dev.c b/mm/backing-dev.c -index 8fde443..3bfed5ab 100644 ---- a/mm/backing-dev.c -+++ b/mm/backing-dev.c -@@ -310,6 +310,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, - spin_lock_init(&wb->work_lock); - INIT_LIST_HEAD(&wb->work_list); - INIT_DELAYED_WORK(&wb->dwork, wb_workfn); -+ wb->dirty_sleep = jiffies; - - wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp); - if (!wb->congested) -diff --git a/mm/page-writeback.c b/mm/page-writeback.c -index f4cd7d8..98bc3fc 100644 ---- a/mm/page-writeback.c -+++ b/mm/page-writeback.c -@@ -1778,6 +1778,7 @@ pause: - pause, - start_time); - __set_current_state(TASK_KILLABLE); -+ wb->dirty_sleep = now; - io_schedule_timeout(pause); - - current->dirty_paused_when = now + pause; --- -cgit v0.11.2 - -From a98f5ab3840c2e6008c478aafe5df055404acdd1 Mon Sep 17 00:00:00 2001 -From: Jens Axboe <axboe@fb.com> -Date: Wed, 30 Mar 2016 10:21:08 -0600 -Subject: block: add code to track actual device queue depth - -For blk-mq, ->nr_requests does track queue depth, at least at init -time. But for the older queue paths, it's simply a soft setting. -On top of that, it's generally larger than the hardware setting -on purpose, to allow backup of requests for merging. - -Fill a hole in struct request with a 'queue_depth' member, that -drivers can call to more closely inform the block layer of the -real queue depth. 
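To make the fallback behaviour concrete, here is a small userspace sketch of the accessor this patch adds (the struct and field names are stand-ins, not the real struct request_queue): prefer the driver-reported depth if one was set via blk_set_queue_depth(), otherwise fall back to the softer nr_requests setting:

#include <stdio.h>

/* Stand-in for struct request_queue; only the two fields relevant here. */
struct fake_queue {
	unsigned int queue_depth;   /* set by the driver; 0 if it never reported one */
	unsigned int nr_requests;   /* soft setting, typically larger than the HW depth */
};

static unsigned int queue_depth(const struct fake_queue *q)
{
	return q->queue_depth ? q->queue_depth : q->nr_requests;
}

int main(void)
{
	struct fake_queue scsi_dev = { .queue_depth = 31, .nr_requests = 128 };
	struct fake_queue legacy   = { .queue_depth = 0,  .nr_requests = 128 };

	printf("scsi: %u, legacy: %u\n", queue_depth(&scsi_dev), queue_depth(&legacy));
	return 0;
}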
- -Signed-off-by: Jens Axboe <axboe@fb.com> ---- - block/blk-settings.c | 12 ++++++++++++ - drivers/scsi/scsi.c | 3 +++ - include/linux/blkdev.h | 11 +++++++++++ - 3 files changed, 26 insertions(+) - -diff --git a/block/blk-settings.c b/block/blk-settings.c -index f679ae1..f7e122e 100644 ---- a/block/blk-settings.c -+++ b/block/blk-settings.c -@@ -832,6 +832,18 @@ void blk_queue_flush_queueable(struct request_queue *q, bool queueable) - EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); - - /** -+ * blk_set_queue_depth - tell the block layer about the device queue depth -+ * @q: the request queue for the device -+ * @depth: queue depth -+ * -+ */ -+void blk_set_queue_depth(struct request_queue *q, unsigned int depth) -+{ -+ q->queue_depth = depth; -+} -+EXPORT_SYMBOL(blk_set_queue_depth); -+ -+/** - * blk_queue_write_cache - configure queue's write cache - * @q: the request queue for the device - * @wc: write back cache on or off -diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c -index 1f36aca..f3de98a 100644 ---- a/drivers/scsi/scsi.c -+++ b/drivers/scsi/scsi.c -@@ -621,6 +621,9 @@ int scsi_change_queue_depth(struct scsi_device *sdev, int depth) - wmb(); - } - -+ if (sdev->request_queue) -+ blk_set_queue_depth(sdev->request_queue, depth); -+ - return sdev->queue_depth; - } - EXPORT_SYMBOL(scsi_change_queue_depth); -diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index e79055c..1d12aa6 100644 ---- a/include/linux/blkdev.h -+++ b/include/linux/blkdev.h -@@ -327,6 +327,8 @@ struct request_queue { - struct blk_mq_ctx __percpu *queue_ctx; - unsigned int nr_queues; - -+ unsigned int queue_depth; -+ - /* hw dispatch queues */ - struct blk_mq_hw_ctx **queue_hw_ctx; - unsigned int nr_hw_queues; -@@ -683,6 +685,14 @@ static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b) - return false; - } - -+static inline unsigned int blk_queue_depth(struct request_queue *q) -+{ -+ if (q->queue_depth) -+ return q->queue_depth; -+ -+ return q->nr_requests; -+} -+ - /* - * q->prep_rq_fn return values - */ -@@ -999,6 +1009,7 @@ extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); - extern void blk_queue_io_min(struct request_queue *q, unsigned int min); - extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt); - extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt); -+extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth); - extern void blk_set_default_limits(struct queue_limits *lim); - extern void blk_set_stacking_limits(struct queue_limits *lim); - extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, --- -cgit v0.11.2 - -From a13cc5885ddd5582129869c1837821d6af6d48bb Mon Sep 17 00:00:00 2001 -From: Jens Axboe <axboe@fb.com> -Date: Thu, 1 Sep 2016 10:22:41 -0600 -Subject: block: add scalable completion tracking of requests - -For legacy block, we simply track them in the request queue. For -blk-mq, we track them on a per-sw queue basis, which we can then -sum up through the hardware queues and finally to a per device -state. - -The stats are tracked in, roughly, 0.1s interval windows. - -Add sysfs files to display the stats. 
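The core of the tracking can be pictured with a simplified userspace model (the batching and the per-context summation of the real blk-stat.c code are omitted; the window size mirrors BLK_STAT_NSEC below): each sample is binned into a power-of-2 time window, and entering a new window resets the running min/mean/max:

#include <stdio.h>
#include <stdint.h>

#define WIN_NSEC (1ULL << 27)          /* ~134ms window, as in BLK_STAT_NSEC */
#define WIN_MASK (~(WIN_NSEC - 1))

struct win_stat {
	uint64_t window;               /* start of the current window */
	uint64_t min, max, sum, nr;
};

static void stat_add(struct win_stat *s, uint64_t now_ns, uint64_t lat_ns)
{
	if ((now_ns & WIN_MASK) != s->window) {  /* new window: reset the stats */
		s->window = now_ns & WIN_MASK;
		s->min = UINT64_MAX;
		s->max = s->sum = s->nr = 0;
	}
	if (lat_ns < s->min) s->min = lat_ns;
	if (lat_ns > s->max) s->max = lat_ns;
	s->sum += lat_ns;
	s->nr++;
}

int main(void)
{
	struct win_stat s = { 0 };
	uint64_t now = 1000 * WIN_NSEC;

	stat_add(&s, now, 250000);
	stat_add(&s, now + 1000, 900000);
	printf("samples=%llu mean=%llu min=%llu max=%llu\n",
	       (unsigned long long)s.nr,
	       (unsigned long long)(s.sum / s.nr),
	       (unsigned long long)s.min,
	       (unsigned long long)s.max);
	return 0;
}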
- -Signed-off-by: Jens Axboe <axboe@fb.com> ---- - block/Makefile | 2 +- - block/blk-core.c | 4 + - block/blk-mq-sysfs.c | 47 ++++++++++ - block/blk-mq.c | 14 +++ - block/blk-mq.h | 3 + - block/blk-stat.c | 220 ++++++++++++++++++++++++++++++++++++++++++++++ - block/blk-stat.h | 18 ++++ - block/blk-sysfs.c | 26 ++++++ - include/linux/blk_types.h | 12 +++ - include/linux/blkdev.h | 4 + - 10 files changed, 349 insertions(+), 1 deletion(-) - create mode 100644 block/blk-stat.c - create mode 100644 block/blk-stat.h - -diff --git a/block/Makefile b/block/Makefile -index 9eda232..3446e04 100644 ---- a/block/Makefile -+++ b/block/Makefile -@@ -5,7 +5,7 @@ - obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ - blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ - blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ -- blk-lib.o blk-mq.o blk-mq-tag.o \ -+ blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ - blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ - genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ - badblocks.o partitions/ -diff --git a/block/blk-core.c b/block/blk-core.c -index 36c7ac3..4075cbe 100644 ---- a/block/blk-core.c -+++ b/block/blk-core.c -@@ -2475,6 +2475,8 @@ void blk_start_request(struct request *req) - { - blk_dequeue_request(req); - -+ req->issue_time = ktime_to_ns(ktime_get()); -+ - /* - * We are now handing the request to the hardware, initialize - * resid_len to full count and add the timeout handler. -@@ -2542,6 +2544,8 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) - - trace_block_rq_complete(req->q, req, nr_bytes); - -+ blk_stat_add(&req->q->rq_stats[rq_data_dir(req)], req); -+ - if (!req->bio) - return false; - -diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c -index fe822aa..b66bbf1 100644 ---- a/block/blk-mq-sysfs.c -+++ b/block/blk-mq-sysfs.c -@@ -247,6 +247,47 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) - return ret; - } - -+static void blk_mq_stat_clear(struct blk_mq_hw_ctx *hctx) -+{ -+ struct blk_mq_ctx *ctx; -+ unsigned int i; -+ -+ hctx_for_each_ctx(hctx, ctx, i) { -+ blk_stat_init(&ctx->stat[0]); -+ blk_stat_init(&ctx->stat[1]); -+ } -+} -+ -+static ssize_t blk_mq_hw_sysfs_stat_store(struct blk_mq_hw_ctx *hctx, -+ const char *page, size_t count) -+{ -+ blk_mq_stat_clear(hctx); -+ return count; -+} -+ -+static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre) -+{ -+ return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n", -+ pre, (long long) stat->nr_samples, -+ (long long) stat->mean, (long long) stat->min, -+ (long long) stat->max); -+} -+ -+static ssize_t blk_mq_hw_sysfs_stat_show(struct blk_mq_hw_ctx *hctx, char *page) -+{ -+ struct blk_rq_stat stat[2]; -+ ssize_t ret; -+ -+ blk_stat_init(&stat[0]); -+ blk_stat_init(&stat[1]); -+ -+ blk_hctx_stat_get(hctx, stat); -+ -+ ret = print_stat(page, &stat[0], "read :"); -+ ret += print_stat(page + ret, &stat[1], "write:"); -+ return ret; -+} -+ - static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = { - .attr = {.name = "dispatched", .mode = S_IRUGO }, - .show = blk_mq_sysfs_dispatched_show, -@@ -304,6 +345,11 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = { - .attr = {.name = "io_poll", .mode = S_IRUGO }, - .show = blk_mq_hw_sysfs_poll_show, - }; -+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_stat = { -+ .attr = {.name = "stats", .mode = S_IRUGO | S_IWUSR }, -+ .show = blk_mq_hw_sysfs_stat_show, -+ .store = 
blk_mq_hw_sysfs_stat_store, -+}; - - static struct attribute *default_hw_ctx_attrs[] = { - &blk_mq_hw_sysfs_queued.attr, -@@ -314,6 +360,7 @@ static struct attribute *default_hw_ctx_attrs[] = { - &blk_mq_hw_sysfs_cpus.attr, - &blk_mq_hw_sysfs_active.attr, - &blk_mq_hw_sysfs_poll.attr, -+ &blk_mq_hw_sysfs_stat.attr, - NULL, - }; - -diff --git a/block/blk-mq.c b/block/blk-mq.c -index 13f5a6c..712f141 100644 ---- a/block/blk-mq.c -+++ b/block/blk-mq.c -@@ -29,6 +29,7 @@ - #include "blk.h" - #include "blk-mq.h" - #include "blk-mq-tag.h" -+#include "blk-stat.h" - - static DEFINE_MUTEX(all_q_mutex); - static LIST_HEAD(all_q_list); -@@ -400,10 +401,19 @@ static void blk_mq_ipi_complete_request(struct request *rq) - put_cpu(); - } - -+static void blk_mq_stat_add(struct request *rq) -+{ -+ struct blk_rq_stat *stat = &rq->mq_ctx->stat[rq_data_dir(rq)]; -+ -+ blk_stat_add(stat, rq); -+} -+ - static void __blk_mq_complete_request(struct request *rq) - { - struct request_queue *q = rq->q; - -+ blk_mq_stat_add(rq); -+ - if (!q->softirq_done_fn) - blk_mq_end_request(rq, rq->errors); - else -@@ -447,6 +457,8 @@ void blk_mq_start_request(struct request *rq) - if (unlikely(blk_bidi_rq(rq))) - rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq); - -+ rq->issue_time = ktime_to_ns(ktime_get()); -+ - blk_add_timer(rq); - - /* -@@ -1795,6 +1807,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, - spin_lock_init(&__ctx->lock); - INIT_LIST_HEAD(&__ctx->rq_list); - __ctx->queue = q; -+ blk_stat_init(&__ctx->stat[0]); -+ blk_stat_init(&__ctx->stat[1]); - - /* If the cpu isn't online, the cpu is mapped to first hctx */ - if (!cpu_online(i)) -diff --git a/block/blk-mq.h b/block/blk-mq.h -index 9087b11..e107f70 100644 ---- a/block/blk-mq.h -+++ b/block/blk-mq.h -@@ -1,6 +1,8 @@ - #ifndef INT_BLK_MQ_H - #define INT_BLK_MQ_H - -+#include "blk-stat.h" -+ - struct blk_mq_tag_set; - - struct blk_mq_ctx { -@@ -20,6 +22,7 @@ struct blk_mq_ctx { - - /* incremented at completion time */ - unsigned long ____cacheline_aligned_in_smp rq_completed[2]; -+ struct blk_rq_stat stat[2]; - - struct request_queue *queue; - struct kobject kobj; -diff --git a/block/blk-stat.c b/block/blk-stat.c -new file mode 100644 -index 0000000..3965e8a ---- /dev/null -+++ b/block/blk-stat.c -@@ -0,0 +1,220 @@ -+/* -+ * Block stat tracking code -+ * -+ * Copyright (C) 2016 Jens Axboe -+ */ -+#include <linux/kernel.h> -+#include <linux/blk-mq.h> -+ -+#include "blk-stat.h" -+#include "blk-mq.h" -+ -+static void blk_stat_flush_batch(struct blk_rq_stat *stat) -+{ -+ if (!stat->nr_batch) -+ return; -+ if (!stat->nr_samples) -+ stat->mean = div64_s64(stat->batch, stat->nr_batch); -+ else { -+ stat->mean = div64_s64((stat->mean * stat->nr_samples) + -+ stat->batch, -+ stat->nr_samples + stat->nr_batch); -+ } -+ -+ stat->nr_samples += stat->nr_batch; -+ stat->nr_batch = stat->batch = 0; -+} -+ -+void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) -+{ -+ if (!src->nr_samples) -+ return; -+ -+ blk_stat_flush_batch(src); -+ -+ dst->min = min(dst->min, src->min); -+ dst->max = max(dst->max, src->max); -+ -+ if (!dst->nr_samples) -+ dst->mean = src->mean; -+ else { -+ dst->mean = div64_s64((src->mean * src->nr_samples) + -+ (dst->mean * dst->nr_samples), -+ dst->nr_samples + src->nr_samples); -+ } -+ dst->nr_samples += src->nr_samples; -+} -+ -+static void blk_mq_stat_get(struct request_queue *q, struct blk_rq_stat *dst) -+{ -+ struct blk_mq_hw_ctx *hctx; -+ struct blk_mq_ctx *ctx; -+ uint64_t latest = 0; -+ int i, j, nr; -+ -+ 
blk_stat_init(&dst[0]); -+ blk_stat_init(&dst[1]); -+ -+ nr = 0; -+ do { -+ uint64_t newest = 0; -+ -+ queue_for_each_hw_ctx(q, hctx, i) { -+ hctx_for_each_ctx(hctx, ctx, j) { -+ if (!ctx->stat[0].nr_samples && -+ !ctx->stat[1].nr_samples) -+ continue; -+ if (ctx->stat[0].time > newest) -+ newest = ctx->stat[0].time; -+ if (ctx->stat[1].time > newest) -+ newest = ctx->stat[1].time; -+ } -+ } -+ -+ /* -+ * No samples -+ */ -+ if (!newest) -+ break; -+ -+ if (newest > latest) -+ latest = newest; -+ -+ queue_for_each_hw_ctx(q, hctx, i) { -+ hctx_for_each_ctx(hctx, ctx, j) { -+ if (ctx->stat[0].time == newest) { -+ blk_stat_sum(&dst[0], &ctx->stat[0]); -+ nr++; -+ } -+ if (ctx->stat[1].time == newest) { -+ blk_stat_sum(&dst[1], &ctx->stat[1]); -+ nr++; -+ } -+ } -+ } -+ /* -+ * If we race on finding an entry, just loop back again. -+ * Should be very rare. -+ */ -+ } while (!nr); -+ -+ dst[0].time = dst[1].time = latest; -+} -+ -+void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst) -+{ -+ if (q->mq_ops) -+ blk_mq_stat_get(q, dst); -+ else { -+ memcpy(&dst[0], &q->rq_stats[0], sizeof(struct blk_rq_stat)); -+ memcpy(&dst[1], &q->rq_stats[1], sizeof(struct blk_rq_stat)); -+ } -+} -+ -+void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst) -+{ -+ struct blk_mq_ctx *ctx; -+ unsigned int i, nr; -+ -+ nr = 0; -+ do { -+ uint64_t newest = 0; -+ -+ hctx_for_each_ctx(hctx, ctx, i) { -+ if (!ctx->stat[0].nr_samples && -+ !ctx->stat[1].nr_samples) -+ continue; -+ -+ if (ctx->stat[0].time > newest) -+ newest = ctx->stat[0].time; -+ if (ctx->stat[1].time > newest) -+ newest = ctx->stat[1].time; -+ } -+ -+ if (!newest) -+ break; -+ -+ hctx_for_each_ctx(hctx, ctx, i) { -+ if (ctx->stat[0].time == newest) { -+ blk_stat_sum(&dst[0], &ctx->stat[0]); -+ nr++; -+ } -+ if (ctx->stat[1].time == newest) { -+ blk_stat_sum(&dst[1], &ctx->stat[1]); -+ nr++; -+ } -+ } -+ /* -+ * If we race on finding an entry, just loop back again. 
-+ * Should be very rare, as the window is only updated -+ * occasionally -+ */ -+ } while (!nr); -+} -+ -+static void __blk_stat_init(struct blk_rq_stat *stat, s64 time_now) -+{ -+ stat->min = -1ULL; -+ stat->max = stat->nr_samples = stat->mean = 0; -+ stat->batch = stat->nr_batch = 0; -+ stat->time = time_now & BLK_STAT_MASK; -+} -+ -+void blk_stat_init(struct blk_rq_stat *stat) -+{ -+ __blk_stat_init(stat, ktime_to_ns(ktime_get())); -+} -+ -+static bool __blk_stat_is_current(struct blk_rq_stat *stat, s64 now) -+{ -+ return (now & BLK_STAT_MASK) == (stat->time & BLK_STAT_MASK); -+} -+ -+bool blk_stat_is_current(struct blk_rq_stat *stat) -+{ -+ return __blk_stat_is_current(stat, ktime_to_ns(ktime_get())); -+} -+ -+void blk_stat_add(struct blk_rq_stat *stat, struct request *rq) -+{ -+ s64 now, value; -+ -+ now = ktime_to_ns(ktime_get()); -+ if (now < rq->issue_time) -+ return; -+ -+ if (!__blk_stat_is_current(stat, now)) -+ __blk_stat_init(stat, now); -+ -+ value = now - rq->issue_time; -+ if (value > stat->max) -+ stat->max = value; -+ if (value < stat->min) -+ stat->min = value; -+ -+ if (stat->batch + value < stat->batch || -+ stat->nr_batch + 1 == BLK_RQ_STAT_BATCH) -+ blk_stat_flush_batch(stat); -+ -+ stat->batch += value; -+ stat->nr_batch++; -+} -+ -+void blk_stat_clear(struct request_queue *q) -+{ -+ if (q->mq_ops) { -+ struct blk_mq_hw_ctx *hctx; -+ struct blk_mq_ctx *ctx; -+ int i, j; -+ -+ queue_for_each_hw_ctx(q, hctx, i) { -+ hctx_for_each_ctx(hctx, ctx, j) { -+ blk_stat_init(&ctx->stat[0]); -+ blk_stat_init(&ctx->stat[1]); -+ } -+ } -+ } else { -+ blk_stat_init(&q->rq_stats[0]); -+ blk_stat_init(&q->rq_stats[1]); -+ } -+} -diff --git a/block/blk-stat.h b/block/blk-stat.h -new file mode 100644 -index 0000000..376a6cc ---- /dev/null -+++ b/block/blk-stat.h -@@ -0,0 +1,18 @@ -+#ifndef BLK_STAT_H -+#define BLK_STAT_H -+ -+/* -+ * ~0.13s window as a power-of-2 (2^27 nsecs) -+ */ -+#define BLK_STAT_NSEC 134217728ULL -+#define BLK_STAT_MASK ~(BLK_STAT_NSEC - 1) -+ -+void blk_stat_add(struct blk_rq_stat *, struct request *); -+void blk_hctx_stat_get(struct blk_mq_hw_ctx *, struct blk_rq_stat *); -+void blk_queue_stat_get(struct request_queue *, struct blk_rq_stat *); -+void blk_stat_clear(struct request_queue *q); -+void blk_stat_init(struct blk_rq_stat *); -+void blk_stat_sum(struct blk_rq_stat *, struct blk_rq_stat *); -+bool blk_stat_is_current(struct blk_rq_stat *); -+ -+#endif -diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c -index f87a7e7..0b9e435 100644 ---- a/block/blk-sysfs.c -+++ b/block/blk-sysfs.c -@@ -384,6 +384,26 @@ static ssize_t queue_dax_show(struct request_queue *q, char *page) - return queue_var_show(blk_queue_dax(q), page); - } - -+static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre) -+{ -+ return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n", -+ pre, (long long) stat->nr_samples, -+ (long long) stat->mean, (long long) stat->min, -+ (long long) stat->max); -+} -+ -+static ssize_t queue_stats_show(struct request_queue *q, char *page) -+{ -+ struct blk_rq_stat stat[2]; -+ ssize_t ret; -+ -+ blk_queue_stat_get(q, stat); -+ -+ ret = print_stat(page, &stat[0], "read :"); -+ ret += print_stat(page + ret, &stat[1], "write:"); -+ return ret; -+} -+ - static struct queue_sysfs_entry queue_requests_entry = { - .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, - .show = queue_requests_show, -@@ -526,6 +546,11 @@ static struct queue_sysfs_entry queue_dax_entry = { - .show = queue_dax_show, - }; - -+static struct 
queue_sysfs_entry queue_stats_entry = { -+ .attr = {.name = "stats", .mode = S_IRUGO }, -+ .show = queue_stats_show, -+}; -+ - static struct attribute *default_attrs[] = { - &queue_requests_entry.attr, - &queue_ra_entry.attr, -@@ -553,6 +578,7 @@ static struct attribute *default_attrs[] = { - &queue_poll_entry.attr, - &queue_wc_entry.attr, - &queue_dax_entry.attr, -+ &queue_stats_entry.attr, - NULL, - }; - -diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h -index be4409b..95fbfa1 100644 ---- a/include/linux/blk_types.h -+++ b/include/linux/blk_types.h -@@ -266,4 +266,16 @@ static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie) - return cookie & ((1u << BLK_QC_T_SHIFT) - 1); - } - -+#define BLK_RQ_STAT_BATCH 64 -+ -+struct blk_rq_stat { -+ s64 mean; -+ u64 min; -+ u64 max; -+ s32 nr_samples; -+ s32 nr_batch; -+ u64 batch; -+ s64 time; -+}; -+ - #endif /* __LINUX_BLK_TYPES_H */ -diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index 1d12aa6..259eba8 100644 ---- a/include/linux/blkdev.h -+++ b/include/linux/blkdev.h -@@ -151,6 +151,7 @@ struct request { - struct gendisk *rq_disk; - struct hd_struct *part; - unsigned long start_time; -+ s64 issue_time; - #ifdef CONFIG_BLK_CGROUP - struct request_list *rl; /* rl this rq is alloced from */ - unsigned long long start_time_ns; -@@ -414,6 +415,9 @@ struct request_queue { - - unsigned int nr_sorted; - unsigned int in_flight[2]; -+ -+ struct blk_rq_stat rq_stats[2]; -+ - /* - * Number of active block driver functions for which blk_drain_queue() - * must wait. Must be incremented around functions that unlock the --- -cgit v0.11.2 - -From 9a38b8e46f9f759dbb3fd81810579ac1013bf814 Mon Sep 17 00:00:00 2001 -From: Jens Axboe <axboe@fb.com> -Date: Thu, 8 Sep 2016 11:07:16 -0600 -Subject: wbt: add general throttling mechanism - -We can hook this up to the block layer, to help throttle buffered -writes. Or NFS can tap into it, to accomplish the same. - -wbt registers a few trace points that can be used to track what is -happening in the system: - -wbt_lat: 259:0: latency 2446318 -wbt_stat: 259:0: rmean=2446318, rmin=2446318, rmax=2446318, rsamples=1, - wmean=518866, wmin=15522, wmax=5330353, wsamples=57 -wbt_step: 259:0: step down: step=1, window=72727272, background=8, normal=16, max=32 - -This shows a sync issue event (wbt_lat) that exceeded its time. wbt_stat -dumps the current read/write stats for that window, and wbt_step shows a -step down event where we now scale back writes. Each trace includes the -device, 259:0 in this case. 
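The window scaling visible in the wbt_step trace above (the default 100ms window shrinking to 72727272 ns at step 1) comes from dividing the window by sqrt(scale_step + 1). The standalone sketch below mirrors the fixed-point calculation in rwb_arm_timer() from the lib/wbt.c code further down; int_sqrt64() here is a userspace stand-in for the kernel's int_sqrt():

#include <stdio.h>
#include <stdint.h>
#include <math.h>

#define WIN_NSEC (100ULL * 1000 * 1000)   /* 100ms default monitoring window */

static uint64_t int_sqrt64(uint64_t x)    /* stand-in for the kernel's int_sqrt() */
{
	return (uint64_t)sqrt((double)x);
}

int main(void)
{
	for (int step = 0; step <= 4; step++) {
		/* window = win_nsec / sqrt(step + 1), done with shifts as in the patch */
		uint64_t win = (WIN_NSEC << 4) / int_sqrt64(((uint64_t)step + 1) << 8);
		printf("scale_step=%d window=%llu ns\n",
		       step, (unsigned long long)win);
	}
	return 0;
}

Step 1 reproduces the 72727272 ns window shown in the wbt_step trace above; higher steps keep shrinking the window so that latency violations are detected faster while throttled.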
- -Signed-off-by: Jens Axboe <axboe@fb.com> ---- - include/linux/wbt.h | 120 ++++++++ - include/trace/events/wbt.h | 153 ++++++++++ - lib/Kconfig | 3 + - lib/Makefile | 1 + - lib/wbt.c | 681 +++++++++++++++++++++++++++++++++++++++++++++ - 5 files changed, 958 insertions(+) - create mode 100644 include/linux/wbt.h - create mode 100644 include/trace/events/wbt.h - create mode 100644 lib/wbt.c - -diff --git a/include/linux/wbt.h b/include/linux/wbt.h -new file mode 100644 -index 0000000..5ffcd14 ---- /dev/null -+++ b/include/linux/wbt.h -@@ -0,0 +1,120 @@ -+#ifndef WB_THROTTLE_H -+#define WB_THROTTLE_H -+ -+#include <linux/atomic.h> -+#include <linux/wait.h> -+#include <linux/timer.h> -+#include <linux/ktime.h> -+ -+enum { -+ ISSUE_STAT_TRACKED = 1ULL << 63, -+ ISSUE_STAT_READ = 1ULL << 62, -+ ISSUE_STAT_MASK = ISSUE_STAT_TRACKED | ISSUE_STAT_READ, -+ ISSUE_STAT_TIME_MASK = ~ISSUE_STAT_MASK, -+ -+ WBT_TRACKED = 1, -+ WBT_READ = 2, -+}; -+ -+struct wb_issue_stat { -+ u64 time; -+}; -+ -+static inline void wbt_issue_stat_set_time(struct wb_issue_stat *stat) -+{ -+ stat->time = (stat->time & ISSUE_STAT_MASK) | -+ (ktime_to_ns(ktime_get()) & ISSUE_STAT_TIME_MASK); -+} -+ -+static inline u64 wbt_issue_stat_get_time(struct wb_issue_stat *stat) -+{ -+ return stat->time & ISSUE_STAT_TIME_MASK; -+} -+ -+static inline void wbt_mark_tracked(struct wb_issue_stat *stat) -+{ -+ stat->time |= ISSUE_STAT_TRACKED; -+} -+ -+static inline void wbt_clear_state(struct wb_issue_stat *stat) -+{ -+ stat->time &= ~(ISSUE_STAT_TRACKED | ISSUE_STAT_READ); -+} -+ -+static inline bool wbt_tracked(struct wb_issue_stat *stat) -+{ -+ return (stat->time & ISSUE_STAT_TRACKED) != 0; -+} -+ -+static inline void wbt_mark_read(struct wb_issue_stat *stat) -+{ -+ stat->time |= ISSUE_STAT_READ; -+} -+ -+static inline bool wbt_is_read(struct wb_issue_stat *stat) -+{ -+ return (stat->time & ISSUE_STAT_READ) != 0; -+} -+ -+struct wb_stat_ops { -+ void (*get)(void *, struct blk_rq_stat *); -+ bool (*is_current)(struct blk_rq_stat *); -+ void (*clear)(void *); -+}; -+ -+struct rq_wb { -+ /* -+ * Settings that govern how we throttle -+ */ -+ unsigned int wb_background; /* background writeback */ -+ unsigned int wb_normal; /* normal writeback */ -+ unsigned int wb_max; /* max throughput writeback */ -+ int scale_step; -+ bool scaled_max; -+ -+ u64 win_nsec; /* default window size */ -+ u64 cur_win_nsec; /* current window size */ -+ -+ /* -+ * Number of consecutive periods where we don't have enough -+ * information to make a firm scale up/down decision. 
-+ */ -+ unsigned int unknown_cnt; -+ -+ struct timer_list window_timer; -+ -+ s64 sync_issue; -+ void *sync_cookie; -+ -+ unsigned int wc; -+ unsigned int queue_depth; -+ -+ unsigned long last_issue; /* last non-throttled issue */ -+ unsigned long last_comp; /* last non-throttled comp */ -+ unsigned long min_lat_nsec; -+ struct backing_dev_info *bdi; -+ struct request_queue *q; -+ wait_queue_head_t wait; -+ atomic_t inflight; -+ -+ struct wb_stat_ops *stat_ops; -+ void *ops_data; -+}; -+ -+struct backing_dev_info; -+ -+void __wbt_done(struct rq_wb *); -+void wbt_done(struct rq_wb *, struct wb_issue_stat *); -+unsigned int wbt_wait(struct rq_wb *, unsigned int, spinlock_t *); -+struct rq_wb *wbt_init(struct backing_dev_info *, struct wb_stat_ops *, void *); -+void wbt_exit(struct rq_wb *); -+void wbt_update_limits(struct rq_wb *); -+void wbt_requeue(struct rq_wb *, struct wb_issue_stat *); -+void wbt_issue(struct rq_wb *, struct wb_issue_stat *); -+void wbt_disable(struct rq_wb *); -+void wbt_track(struct wb_issue_stat *, unsigned int); -+ -+void wbt_set_queue_depth(struct rq_wb *, unsigned int); -+void wbt_set_write_cache(struct rq_wb *, bool); -+ -+#endif -diff --git a/include/trace/events/wbt.h b/include/trace/events/wbt.h -new file mode 100644 -index 0000000..926c7ee ---- /dev/null -+++ b/include/trace/events/wbt.h -@@ -0,0 +1,153 @@ -+#undef TRACE_SYSTEM -+#define TRACE_SYSTEM wbt -+ -+#if !defined(_TRACE_WBT_H) || defined(TRACE_HEADER_MULTI_READ) -+#define _TRACE_WBT_H -+ -+#include <linux/tracepoint.h> -+#include <linux/wbt.h> -+ -+/** -+ * wbt_stat - trace stats for blk_wb -+ * @stat: array of read/write stats -+ */ -+TRACE_EVENT(wbt_stat, -+ -+ TP_PROTO(struct backing_dev_info *bdi, struct blk_rq_stat *stat), -+ -+ TP_ARGS(bdi, stat), -+ -+ TP_STRUCT__entry( -+ __array(char, name, 32) -+ __field(s64, rmean) -+ __field(u64, rmin) -+ __field(u64, rmax) -+ __field(s64, rnr_samples) -+ __field(s64, rtime) -+ __field(s64, wmean) -+ __field(u64, wmin) -+ __field(u64, wmax) -+ __field(s64, wnr_samples) -+ __field(s64, wtime) -+ ), -+ -+ TP_fast_assign( -+ strncpy(__entry->name, dev_name(bdi->dev), 32); -+ __entry->rmean = stat[0].mean; -+ __entry->rmin = stat[0].min; -+ __entry->rmax = stat[0].max; -+ __entry->rnr_samples = stat[0].nr_samples; -+ __entry->wmean = stat[1].mean; -+ __entry->wmin = stat[1].min; -+ __entry->wmax = stat[1].max; -+ __entry->wnr_samples = stat[1].nr_samples; -+ ), -+ -+ TP_printk("%s: rmean=%llu, rmin=%llu, rmax=%llu, rsamples=%llu, " -+ "wmean=%llu, wmin=%llu, wmax=%llu, wsamples=%llu\n", -+ __entry->name, __entry->rmean, __entry->rmin, __entry->rmax, -+ __entry->rnr_samples, __entry->wmean, __entry->wmin, -+ __entry->wmax, __entry->wnr_samples) -+); -+ -+/** -+ * wbt_lat - trace latency event -+ * @lat: latency trigger -+ */ -+TRACE_EVENT(wbt_lat, -+ -+ TP_PROTO(struct backing_dev_info *bdi, unsigned long lat), -+ -+ TP_ARGS(bdi, lat), -+ -+ TP_STRUCT__entry( -+ __array(char, name, 32) -+ __field(unsigned long, lat) -+ ), -+ -+ TP_fast_assign( -+ strncpy(__entry->name, dev_name(bdi->dev), 32); -+ __entry->lat = div_u64(lat, 1000); -+ ), -+ -+ TP_printk("%s: latency %lluus\n", __entry->name, -+ (unsigned long long) __entry->lat) -+); -+ -+/** -+ * wbt_step - trace wb event step -+ * @msg: context message -+ * @step: the current scale step count -+ * @window: the current monitoring window -+ * @bg: the current background queue limit -+ * @normal: the current normal writeback limit -+ * @max: the current max throughput writeback limit -+ */ 
-+TRACE_EVENT(wbt_step, -+ -+ TP_PROTO(struct backing_dev_info *bdi, const char *msg, -+ int step, unsigned long window, unsigned int bg, -+ unsigned int normal, unsigned int max), -+ -+ TP_ARGS(bdi, msg, step, window, bg, normal, max), -+ -+ TP_STRUCT__entry( -+ __array(char, name, 32) -+ __field(const char *, msg) -+ __field(int, step) -+ __field(unsigned long, window) -+ __field(unsigned int, bg) -+ __field(unsigned int, normal) -+ __field(unsigned int, max) -+ ), -+ -+ TP_fast_assign( -+ strncpy(__entry->name, dev_name(bdi->dev), 32); -+ __entry->msg = msg; -+ __entry->step = step; -+ __entry->window = div_u64(window, 1000); -+ __entry->bg = bg; -+ __entry->normal = normal; -+ __entry->max = max; -+ ), -+ -+ TP_printk("%s: %s: step=%d, window=%luus, background=%u, normal=%u, max=%u\n", -+ __entry->name, __entry->msg, __entry->step, __entry->window, -+ __entry->bg, __entry->normal, __entry->max) -+); -+ -+/** -+ * wbt_timer - trace wb timer event -+ * @status: timer state status -+ * @step: the current scale step count -+ * @inflight: tracked writes inflight -+ */ -+TRACE_EVENT(wbt_timer, -+ -+ TP_PROTO(struct backing_dev_info *bdi, unsigned int status, -+ int step, unsigned int inflight), -+ -+ TP_ARGS(bdi, status, step, inflight), -+ -+ TP_STRUCT__entry( -+ __array(char, name, 32) -+ __field(unsigned int, status) -+ __field(int, step) -+ __field(unsigned int, inflight) -+ ), -+ -+ TP_fast_assign( -+ strncpy(__entry->name, dev_name(bdi->dev), 32); -+ __entry->status = status; -+ __entry->step = step; -+ __entry->inflight = inflight; -+ ), -+ -+ TP_printk("%s: status=%u, step=%d, inflight=%u\n", __entry->name, -+ __entry->status, __entry->step, __entry->inflight) -+); -+ -+#endif /* _TRACE_WBT_H */ -+ -+/* This part must be outside protection */ -+#include <trace/define_trace.h> -diff --git a/lib/Kconfig b/lib/Kconfig -index d79909d..c585e4c 100644 ---- a/lib/Kconfig -+++ b/lib/Kconfig -@@ -550,4 +550,7 @@ config STACKDEPOT - bool - select STACKTRACE - -+config WBT -+ bool -+ - endmenu -diff --git a/lib/Makefile b/lib/Makefile -index 5dc77a8..23afd63 100644 ---- a/lib/Makefile -+++ b/lib/Makefile -@@ -177,6 +177,7 @@ obj-$(CONFIG_SG_SPLIT) += sg_split.o - obj-$(CONFIG_SG_POOL) += sg_pool.o - obj-$(CONFIG_STMP_DEVICE) += stmp_device.o - obj-$(CONFIG_IRQ_POLL) += irq_poll.o -+obj-$(CONFIG_WBT) += wbt.o - - obj-$(CONFIG_STACKDEPOT) += stackdepot.o - KASAN_SANITIZE_stackdepot.o := n -diff --git a/lib/wbt.c b/lib/wbt.c -new file mode 100644 -index 0000000..a995703 ---- /dev/null -+++ b/lib/wbt.c -@@ -0,0 +1,681 @@ -+/* -+ * buffered writeback throttling. losely based on CoDel. We can't drop -+ * packets for IO scheduling, so the logic is something like this: -+ * -+ * - Monitor latencies in a defined window of time. -+ * - If the minimum latency in the above window exceeds some target, increment -+ * scaling step and scale down queue depth by a factor of 2x. The monitoring -+ * window is then shrunk to 100 / sqrt(scaling step + 1). -+ * - For any window where we don't have solid data on what the latencies -+ * look like, retain status quo. -+ * - If latencies look good, decrement scaling step. -+ * - If we're only doing writes, allow the scaling step to go negative. This -+ * will temporarily boost write performance, snapping back to a stable -+ * scaling step of 0 if reads show up or the heavy writers finish. Unlike -+ * positive scaling steps where we shrink the monitoring window, a negative -+ * scaling step retains the default step==0 window size. 
-+ * -+ * Copyright (C) 2016 Jens Axboe -+ * -+ */ -+#include <linux/kernel.h> -+#include <linux/blk_types.h> -+#include <linux/slab.h> -+#include <linux/backing-dev.h> -+#include <linux/wbt.h> -+ -+#define CREATE_TRACE_POINTS -+#include <trace/events/wbt.h> -+ -+enum { -+ /* -+ * Default setting, we'll scale up (to 75% of QD max) or down (min 1) -+ * from here depending on device stats -+ */ -+ RWB_DEF_DEPTH = 16, -+ -+ /* -+ * 100msec window -+ */ -+ RWB_WINDOW_NSEC = 100 * 1000 * 1000ULL, -+ -+ /* -+ * Disregard stats, if we don't meet this minimum -+ */ -+ RWB_MIN_WRITE_SAMPLES = 3, -+ -+ /* -+ * If we have this number of consecutive windows with not enough -+ * information to scale up or down, scale up. -+ */ -+ RWB_UNKNOWN_BUMP = 5, -+}; -+ -+static inline bool rwb_enabled(struct rq_wb *rwb) -+{ -+ return rwb && rwb->wb_normal != 0; -+} -+ -+/* -+ * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded, -+ * false if 'v' + 1 would be bigger than 'below'. -+ */ -+static bool atomic_inc_below(atomic_t *v, int below) -+{ -+ int cur = atomic_read(v); -+ -+ for (;;) { -+ int old; -+ -+ if (cur >= below) -+ return false; -+ old = atomic_cmpxchg(v, cur, cur + 1); -+ if (old == cur) -+ break; -+ cur = old; -+ } -+ -+ return true; -+} -+ -+static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) -+{ -+ if (rwb_enabled(rwb)) { -+ const unsigned long cur = jiffies; -+ -+ if (cur != *var) -+ *var = cur; -+ } -+} -+ -+/* -+ * If a task was rate throttled in balance_dirty_pages() within the last -+ * second or so, use that to indicate a higher cleaning rate. -+ */ -+static bool wb_recent_wait(struct rq_wb *rwb) -+{ -+ struct bdi_writeback *wb = &rwb->bdi->wb; -+ -+ return time_before(jiffies, wb->dirty_sleep + HZ); -+} -+ -+void __wbt_done(struct rq_wb *rwb) -+{ -+ int inflight, limit; -+ -+ inflight = atomic_dec_return(&rwb->inflight); -+ -+ /* -+ * wbt got disabled with IO in flight. Wake up any potential -+ * waiters, we don't have to do more than that. -+ */ -+ if (unlikely(!rwb_enabled(rwb))) { -+ wake_up_all(&rwb->wait); -+ return; -+ } -+ -+ /* -+ * If the device does write back caching, drop further down -+ * before we wake people up. -+ */ -+ if (rwb->wc && !wb_recent_wait(rwb)) -+ limit = 0; -+ else -+ limit = rwb->wb_normal; -+ -+ /* -+ * Don't wake anyone up if we are above the normal limit. -+ */ -+ if (inflight && inflight >= limit) -+ return; -+ -+ if (waitqueue_active(&rwb->wait)) { -+ int diff = limit - inflight; -+ -+ if (!inflight || diff >= rwb->wb_background / 2) -+ wake_up(&rwb->wait); -+ } -+} -+ -+/* -+ * Called on completion of a request. Note that it's also called when -+ * a request is merged, when the request gets freed. -+ */ -+void wbt_done(struct rq_wb *rwb, struct wb_issue_stat *stat) -+{ -+ if (!rwb) -+ return; -+ -+ if (!wbt_tracked(stat)) { -+ if (rwb->sync_cookie == stat) { -+ rwb->sync_issue = 0; -+ rwb->sync_cookie = NULL; -+ } -+ -+ if (wbt_is_read(stat)) -+ wb_timestamp(rwb, &rwb->last_comp); -+ wbt_clear_state(stat); -+ } else { -+ WARN_ON_ONCE(stat == rwb->sync_cookie); -+ __wbt_done(rwb); -+ wbt_clear_state(stat); -+ } -+} -+ -+/* -+ * Return true, if we can't increase the depth further by scaling -+ */ -+static bool calc_wb_limits(struct rq_wb *rwb) -+{ -+ unsigned int depth; -+ bool ret = false; -+ -+ if (!rwb->min_lat_nsec) { -+ rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0; -+ return false; -+ } -+ -+ /* -+ * For QD=1 devices, this is a special case. 
It's important for those -+ * to have one request ready when one completes, so force a depth of -+ * 2 for those devices. On the backend, it'll be a depth of 1 anyway, -+ * since the device can't have more than that in flight. If we're -+ * scaling down, then keep a setting of 1/1/1. -+ */ -+ if (rwb->queue_depth == 1) { -+ if (rwb->scale_step > 0) -+ rwb->wb_max = rwb->wb_normal = 1; -+ else { -+ rwb->wb_max = rwb->wb_normal = 2; -+ ret = true; -+ } -+ rwb->wb_background = 1; -+ } else { -+ /* -+ * scale_step == 0 is our default state. If we have suffered -+ * latency spikes, step will be > 0, and we shrink the -+ * allowed write depths. If step is < 0, we're only doing -+ * writes, and we allow a temporarily higher depth to -+ * increase performance. -+ */ -+ depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth); -+ if (rwb->scale_step > 0) -+ depth = 1 + ((depth - 1) >> min(31, rwb->scale_step)); -+ else if (rwb->scale_step < 0) { -+ unsigned int maxd = 3 * rwb->queue_depth / 4; -+ -+ depth = 1 + ((depth - 1) << -rwb->scale_step); -+ if (depth > maxd) { -+ depth = maxd; -+ ret = true; -+ } -+ } -+ -+ /* -+ * Set our max/normal/bg queue depths based on how far -+ * we have scaled down (->scale_step). -+ */ -+ rwb->wb_max = depth; -+ rwb->wb_normal = (rwb->wb_max + 1) / 2; -+ rwb->wb_background = (rwb->wb_max + 3) / 4; -+ } -+ -+ return ret; -+} -+ -+static bool inline stat_sample_valid(struct blk_rq_stat *stat) -+{ -+ /* -+ * We need at least one read sample, and a minimum of -+ * RWB_MIN_WRITE_SAMPLES. We require some write samples to know -+ * that it's writes impacting us, and not just some sole read on -+ * a device that is in a lower power state. -+ */ -+ return stat[0].nr_samples >= 1 && -+ stat[1].nr_samples >= RWB_MIN_WRITE_SAMPLES; -+} -+ -+static u64 rwb_sync_issue_lat(struct rq_wb *rwb) -+{ -+ u64 now, issue = ACCESS_ONCE(rwb->sync_issue); -+ -+ if (!issue || !rwb->sync_cookie) -+ return 0; -+ -+ now = ktime_to_ns(ktime_get()); -+ return now - issue; -+} -+ -+enum { -+ LAT_OK = 1, -+ LAT_UNKNOWN, -+ LAT_UNKNOWN_WRITES, -+ LAT_EXCEEDED, -+}; -+ -+static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) -+{ -+ u64 thislat; -+ -+ /* -+ * If our stored sync issue exceeds the window size, or it -+ * exceeds our min target AND we haven't logged any entries, -+ * flag the latency as exceeded. wbt works off completion latencies, -+ * but for a flooded device, a single sync IO can take a long time -+ * to complete after being issued. If this time exceeds our -+ * monitoring window AND we didn't see any other completions in that -+ * window, then count that sync IO as a violation of the latency. -+ */ -+ thislat = rwb_sync_issue_lat(rwb); -+ if (thislat > rwb->cur_win_nsec || -+ (thislat > rwb->min_lat_nsec && !stat[0].nr_samples)) { -+ trace_wbt_lat(rwb->bdi, thislat); -+ return LAT_EXCEEDED; -+ } -+ -+ /* -+ * No read/write mix, if stat isn't valid -+ */ -+ if (!stat_sample_valid(stat)) { -+ /* -+ * If we had writes in this stat window and the window is -+ * current, we're only doing writes. If a task recently -+ * waited or still has writes in flights, consider us doing -+ * just writes as well. -+ */ -+ if ((stat[1].nr_samples && rwb->stat_ops->is_current(stat)) || -+ wb_recent_wait(rwb) || atomic_read(&rwb->inflight)) -+ return LAT_UNKNOWN_WRITES; -+ return LAT_UNKNOWN; -+ } -+ -+ /* -+ * If the 'min' latency exceeds our target, step down. 
-+ */ -+ if (stat[0].min > rwb->min_lat_nsec) { -+ trace_wbt_lat(rwb->bdi, stat[0].min); -+ trace_wbt_stat(rwb->bdi, stat); -+ return LAT_EXCEEDED; -+ } -+ -+ if (rwb->scale_step) -+ trace_wbt_stat(rwb->bdi, stat); -+ -+ return LAT_OK; -+} -+ -+static int latency_exceeded(struct rq_wb *rwb) -+{ -+ struct blk_rq_stat stat[2]; -+ -+ rwb->stat_ops->get(rwb->ops_data, stat); -+ return __latency_exceeded(rwb, stat); -+} -+ -+static void rwb_trace_step(struct rq_wb *rwb, const char *msg) -+{ -+ trace_wbt_step(rwb->bdi, msg, rwb->scale_step, rwb->cur_win_nsec, -+ rwb->wb_background, rwb->wb_normal, rwb->wb_max); -+} -+ -+static void scale_up(struct rq_wb *rwb) -+{ -+ /* -+ * Hit max in previous round, stop here -+ */ -+ if (rwb->scaled_max) -+ return; -+ -+ rwb->scale_step--; -+ rwb->unknown_cnt = 0; -+ rwb->stat_ops->clear(rwb->ops_data); -+ -+ rwb->scaled_max = calc_wb_limits(rwb); -+ -+ if (waitqueue_active(&rwb->wait)) -+ wake_up_all(&rwb->wait); -+ -+ rwb_trace_step(rwb, "step up"); -+} -+ -+/* -+ * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we -+ * had a latency violation. -+ */ -+static void scale_down(struct rq_wb *rwb, bool hard_throttle) -+{ -+ /* -+ * Stop scaling down when we've hit the limit. This also prevents -+ * ->scale_step from going to crazy values, if the device can't -+ * keep up. -+ */ -+ if (rwb->wb_max == 1) -+ return; -+ -+ if (rwb->scale_step < 0 && hard_throttle) -+ rwb->scale_step = 0; -+ else -+ rwb->scale_step++; -+ -+ rwb->scaled_max = false; -+ rwb->unknown_cnt = 0; -+ rwb->stat_ops->clear(rwb->ops_data); -+ calc_wb_limits(rwb); -+ rwb_trace_step(rwb, "step down"); -+} -+ -+static void rwb_arm_timer(struct rq_wb *rwb) -+{ -+ unsigned long expires; -+ -+ if (rwb->scale_step > 0) { -+ /* -+ * We should speed this up, using some variant of a fast -+ * integer inverse square root calculation. Since we only do -+ * this for every window expiration, it's not a huge deal, -+ * though. -+ */ -+ rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4, -+ int_sqrt((rwb->scale_step + 1) << 8)); -+ } else { -+ /* -+ * For step < 0, we don't want to increase/decrease the -+ * window size. -+ */ -+ rwb->cur_win_nsec = rwb->win_nsec; -+ } -+ -+ expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec); -+ mod_timer(&rwb->window_timer, expires); -+} -+ -+static void wb_timer_fn(unsigned long data) -+{ -+ struct rq_wb *rwb = (struct rq_wb *) data; -+ int status, inflight; -+ -+ inflight = atomic_read(&rwb->inflight); -+ -+ status = latency_exceeded(rwb); -+ -+ trace_wbt_timer(rwb->bdi, status, rwb->scale_step, inflight); -+ -+ /* -+ * If we exceeded the latency target, step down. If we did not, -+ * step one level up. If we don't know enough to say either exceeded -+ * or ok, then don't do anything. -+ */ -+ switch (status) { -+ case LAT_EXCEEDED: -+ scale_down(rwb, true); -+ break; -+ case LAT_OK: -+ scale_up(rwb); -+ break; -+ case LAT_UNKNOWN_WRITES: -+ scale_up(rwb); -+ break; -+ case LAT_UNKNOWN: -+ if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP) -+ break; -+ /* -+ * We get here for two reasons: -+ * -+ * 1) We previously scaled reduced depth, and we currently -+ * don't have a valid read/write sample. For that case, -+ * slowly return to center state (step == 0). -+ * 2) We started a the center step, but don't have a valid -+ * read/write sample, but we do have writes going on. -+ * Allow step to go negative, to increase write perf. 
-+ */ -+ if (rwb->scale_step > 0) -+ scale_up(rwb); -+ else if (rwb->scale_step < 0) -+ scale_down(rwb, false); -+ break; -+ default: -+ break; -+ } -+ -+ /* -+ * Re-arm timer, if we have IO in flight -+ */ -+ if (rwb->scale_step || inflight) -+ rwb_arm_timer(rwb); -+} -+ -+void wbt_update_limits(struct rq_wb *rwb) -+{ -+ rwb->scale_step = 0; -+ rwb->scaled_max = false; -+ calc_wb_limits(rwb); -+ -+ if (waitqueue_active(&rwb->wait)) -+ wake_up_all(&rwb->wait); -+} -+ -+static bool close_io(struct rq_wb *rwb) -+{ -+ const unsigned long now = jiffies; -+ -+ return time_before(now, rwb->last_issue + HZ / 10) || -+ time_before(now, rwb->last_comp + HZ / 10); -+} -+ -+#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO) -+ -+static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) -+{ -+ unsigned int limit; -+ -+ /* -+ * At this point we know it's a buffered write. If REQ_SYNC is -+ * set, then it's WB_SYNC_ALL writeback, and we'll use the max -+ * limit for that. If the write is marked as a background write, -+ * then use the idle limit, or go to normal if we haven't had -+ * competing IO for a bit. -+ */ -+ if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb)) -+ limit = rwb->wb_max; -+ else if ((rw & REQ_BG) || close_io(rwb)) { -+ /* -+ * If less than 100ms since we completed unrelated IO, -+ * limit us to half the depth for background writeback. -+ */ -+ limit = rwb->wb_background; -+ } else -+ limit = rwb->wb_normal; -+ -+ return limit; -+} -+ -+static inline bool may_queue(struct rq_wb *rwb, unsigned long rw) -+{ -+ /* -+ * inc it here even if disabled, since we'll dec it at completion. -+ * this only happens if the task was sleeping in __wbt_wait(), -+ * and someone turned it off at the same time. -+ */ -+ if (!rwb_enabled(rwb)) { -+ atomic_inc(&rwb->inflight); -+ return true; -+ } -+ -+ return atomic_inc_below(&rwb->inflight, get_limit(rwb, rw)); -+} -+ -+/* -+ * Block if we will exceed our limit, or if we are currently waiting for -+ * the timer to kick off queuing again. -+ */ -+static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock) -+{ -+ DEFINE_WAIT(wait); -+ -+ if (may_queue(rwb, rw)) -+ return; -+ -+ do { -+ prepare_to_wait_exclusive(&rwb->wait, &wait, -+ TASK_UNINTERRUPTIBLE); -+ -+ if (may_queue(rwb, rw)) -+ break; -+ -+ if (lock) -+ spin_unlock_irq(lock); -+ -+ io_schedule(); -+ -+ if (lock) -+ spin_lock_irq(lock); -+ } while (1); -+ -+ finish_wait(&rwb->wait, &wait); -+} -+ -+static inline bool wbt_should_throttle(struct rq_wb *rwb, unsigned int rw) -+{ -+ const int op = rw >> BIO_OP_SHIFT; -+ -+ /* -+ * If not a WRITE (or a discard), do nothing -+ */ -+ if (!(op == REQ_OP_WRITE || op == REQ_OP_DISCARD)) -+ return false; -+ -+ /* -+ * Don't throttle WRITE_ODIRECT -+ */ -+ if ((rw & (REQ_SYNC | REQ_NOIDLE)) == REQ_SYNC) -+ return false; -+ -+ return true; -+} -+ -+/* -+ * Returns true if the IO request should be accounted, false if not. -+ * May sleep, if we have exceeded the writeback limits. Caller can pass -+ * in an irq held spinlock, if it holds one when calling this function. -+ * If we do sleep, we'll release and re-grab it. 
-+ */ -+unsigned int wbt_wait(struct rq_wb *rwb, unsigned int rw, spinlock_t *lock) -+{ -+ unsigned int ret; -+ -+ if (!rwb_enabled(rwb)) -+ return 0; -+ -+ if ((rw >> BIO_OP_SHIFT) == REQ_OP_READ) -+ ret = WBT_READ; -+ -+ if (!wbt_should_throttle(rwb, rw)) { -+ if (ret & WBT_READ) -+ wb_timestamp(rwb, &rwb->last_issue); -+ return ret; -+ } -+ -+ __wbt_wait(rwb, rw, lock); -+ -+ if (!timer_pending(&rwb->window_timer)) -+ rwb_arm_timer(rwb); -+ -+ return ret | WBT_TRACKED; -+} -+ -+void wbt_issue(struct rq_wb *rwb, struct wb_issue_stat *stat) -+{ -+ if (!rwb_enabled(rwb)) -+ return; -+ -+ wbt_issue_stat_set_time(stat); -+ -+ /* -+ * Track sync issue, in case it takes a long time to complete. Allows -+ * us to react quicker, if a sync IO takes a long time to complete. -+ * Note that this is just a hint. 'stat' can go away when the -+ * request completes, so it's important we never dereference it. We -+ * only use the address to compare with, which is why we store the -+ * sync_issue time locally. -+ */ -+ if (wbt_is_read(stat) && !rwb->sync_issue) { -+ rwb->sync_cookie = stat; -+ rwb->sync_issue = wbt_issue_stat_get_time(stat); -+ } -+} -+ -+void wbt_track(struct wb_issue_stat *stat, unsigned int wb_acct) -+{ -+ if (wb_acct & WBT_TRACKED) -+ wbt_mark_tracked(stat); -+ else if (wb_acct & WBT_READ) -+ wbt_mark_read(stat); -+} -+ -+void wbt_requeue(struct rq_wb *rwb, struct wb_issue_stat *stat) -+{ -+ if (!rwb_enabled(rwb)) -+ return; -+ if (stat == rwb->sync_cookie) { -+ rwb->sync_issue = 0; -+ rwb->sync_cookie = NULL; -+ } -+} -+ -+void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth) -+{ -+ if (rwb) { -+ rwb->queue_depth = depth; -+ wbt_update_limits(rwb); -+ } -+} -+ -+void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on) -+{ -+ if (rwb) -+ rwb->wc = write_cache_on; -+} -+ -+void wbt_disable(struct rq_wb *rwb) -+{ -+ if (rwb) { -+ del_timer_sync(&rwb->window_timer); -+ rwb->win_nsec = rwb->min_lat_nsec = 0; -+ wbt_update_limits(rwb); -+ } -+} -+EXPORT_SYMBOL_GPL(wbt_disable); -+ -+struct rq_wb *wbt_init(struct backing_dev_info *bdi, struct wb_stat_ops *ops, -+ void *ops_data) -+{ -+ struct rq_wb *rwb; -+ -+ if (!ops->get || !ops->is_current || !ops->clear) -+ return ERR_PTR(-EINVAL); -+ -+ rwb = kzalloc(sizeof(*rwb), GFP_KERNEL); -+ if (!rwb) -+ return ERR_PTR(-ENOMEM); -+ -+ atomic_set(&rwb->inflight, 0); -+ init_waitqueue_head(&rwb->wait); -+ setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb); -+ rwb->wc = 1; -+ rwb->queue_depth = RWB_DEF_DEPTH; -+ rwb->last_comp = rwb->last_issue = jiffies; -+ rwb->bdi = bdi; -+ rwb->win_nsec = RWB_WINDOW_NSEC; -+ rwb->stat_ops = ops, -+ rwb->ops_data = ops_data; -+ wbt_update_limits(rwb); -+ return rwb; -+} -+ -+void wbt_exit(struct rq_wb *rwb) -+{ -+ if (rwb) { -+ del_timer_sync(&rwb->window_timer); -+ kfree(rwb); -+ } -+} --- -cgit v0.11.2 - -From db3de07314ef350fceb90ade08474fe4eea5e665 Mon Sep 17 00:00:00 2001 -From: Jens Axboe <axboe@fb.com> -Date: Thu, 8 Sep 2016 11:08:17 -0600 -Subject: writeback: throttle buffered writeback - -Test patch that throttles buffered writeback to make it a lot -more smooth, and has way less impact on other system activity. -Background writeback should be, by definition, background -activity. The fact that we flush huge bundles of it at the time -means that it potentially has heavy impacts on foreground workloads, -which isn't ideal. We can't easily limit the sizes of writes that -we do, since that would impact file system layout in the presence -of delayed allocation. 
-unless someone is waiting for it.
-
-The algorithm for when to throttle takes its inspiration from the
-CoDel networking scheduling algorithm. Like CoDel, blk-wb monitors
-the minimum latencies of requests over a window of time. In that
-window of time, if the minimum latency of any request exceeds a
-given target, then a scale count is incremented and the queue depth
-is shrunk. The next monitoring window is shrunk accordingly. Unlike
-CoDel, if we hit a window that exhibits good behavior, then we
-simply decrement the scale count and re-calculate the limits for that
-scale value. This prevents us from oscillating between a
-close-to-ideal value and max all the time, instead remaining in the
-windows where we get good behavior.
-
-Unlike CoDel, blk-wb allows the scale count to go negative. This
-happens if we primarily have writes going on. Unlike positive
-scale counts, this doesn't change the size of the monitoring window.
-When the heavy writers finish, blk-wb quickly snaps back to its
-stable state of a zero scale count.
-
-The patch registers two sysfs entries. The first one, 'wb_window_usec',
-defines the window of monitoring. The second one, 'wb_lat_usec',
-sets the latency target for the window. It defaults to 2 msec for
-non-rotational storage, and 75 msec for rotational storage. Setting
-this value to '0' disables blk-wb. Generally, a user would not have
-to touch these settings.
-
-We don't enable WBT on devices that are managed with CFQ and have
-a non-root block cgroup attached. If we have a proportional share setup
-on this particular disk, then the wbt throttling will interfere with
-that. We don't have a strong need for wbt in that case, since we will
-rely on CFQ doing that for us.
-
-Signed-off-by: Jens Axboe <axboe@fb.com>
----
- Documentation/block/queue-sysfs.txt |  13 ++++
- block/Kconfig                       |   1 +
- block/blk-core.c                    |  20 +++++-
- block/blk-mq.c                      |  30 ++++++++-
- block/blk-settings.c                |   3 +
- block/blk-stat.c                    |   5 +-
- block/blk-sysfs.c                   | 125 ++++++++++++++++++++++++++++++++++++
- block/cfq-iosched.c                 |  13 ++++
- include/linux/blkdev.h              |   6 +-
- 9 files changed, 207 insertions(+), 9 deletions(-)
-
-diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
-index 2a39040..2847219 100644
---- a/Documentation/block/queue-sysfs.txt
-+++ b/Documentation/block/queue-sysfs.txt
-@@ -169,5 +169,18 @@ This is the number of bytes the device can write in a single write-same
- command. A value of '0' means write-same is not supported by this
- device.
-
-+wb_lat_usec (RW)
-+----------------
-+If the device is registered for writeback throttling, then this file shows
-+the target minimum read latency. If this latency is exceeded in a given
-+window of time (see wb_window_usec), then the writeback throttling will start
-+scaling back writes.
-+
-+wb_window_usec (RW)
-+-------------------
-+If the device is registered for writeback throttling, then this file shows
-+the value of the monitoring window in which we'll look at the target
-+latency. See wb_lat_usec.
-+
-
-
- Jens Axboe <jens.axboe@oracle.com>, February 2009
-diff --git a/block/Kconfig b/block/Kconfig
-index 161491d..6da79e6 100644
---- a/block/Kconfig
-+++ b/block/Kconfig
-@@ -4,6 +4,7 @@
- menuconfig BLOCK
- 	bool "Enable the block layer" if EXPERT
- 	default y
-+	select WBT
- 	help
- 	  Provide block layer support for the kernel.
- -diff --git a/block/blk-core.c b/block/blk-core.c -index 4075cbe..4f4ce05 100644 ---- a/block/blk-core.c -+++ b/block/blk-core.c -@@ -33,6 +33,7 @@ - #include <linux/ratelimit.h> - #include <linux/pm_runtime.h> - #include <linux/blk-cgroup.h> -+#include <linux/wbt.h> - - #define CREATE_TRACE_POINTS - #include <trace/events/block.h> -@@ -882,6 +883,8 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, - - fail: - blk_free_flush_queue(q->fq); -+ wbt_exit(q->rq_wb); -+ q->rq_wb = NULL; - return NULL; - } - EXPORT_SYMBOL(blk_init_allocated_queue); -@@ -1346,6 +1349,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq) - blk_delete_timer(rq); - blk_clear_rq_complete(rq); - trace_block_rq_requeue(q, rq); -+ wbt_requeue(q->rq_wb, &rq->wb_stat); - - if (rq->cmd_flags & REQ_QUEUED) - blk_queue_end_tag(q, rq); -@@ -1436,6 +1440,8 @@ void __blk_put_request(struct request_queue *q, struct request *req) - /* this is a bio leak */ - WARN_ON(req->bio != NULL); - -+ wbt_done(q->rq_wb, &req->wb_stat); -+ - /* - * Request may not have originated from ll_rw_blk. if not, - * it didn't come out of our reserved rq pools -@@ -1667,6 +1673,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) - int el_ret, rw_flags = 0, where = ELEVATOR_INSERT_SORT; - struct request *req; - unsigned int request_count = 0; -+ unsigned int wb_acct; - - /* - * low level driver can indicate that it wants pages above a -@@ -1719,6 +1726,8 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) - } - - get_rq: -+ wb_acct = wbt_wait(q->rq_wb, bio->bi_opf, q->queue_lock); -+ - /* - * This sync check and mask will be re-done in init_request_from_bio(), - * but we need to set it earlier to expose the sync flag to the -@@ -1738,11 +1747,15 @@ get_rq: - */ - req = get_request(q, bio_data_dir(bio), rw_flags, bio, GFP_NOIO); - if (IS_ERR(req)) { -+ if (wb_acct & WBT_TRACKED) -+ __wbt_done(q->rq_wb); - bio->bi_error = PTR_ERR(req); - bio_endio(bio); - goto out_unlock; - } - -+ wbt_track(&req->wb_stat, wb_acct); -+ - /* - * After dropping the lock and possibly sleeping here, our request - * may now be mergeable after it had proven unmergeable (above). 
-@@ -2475,7 +2488,7 @@ void blk_start_request(struct request *req) - { - blk_dequeue_request(req); - -- req->issue_time = ktime_to_ns(ktime_get()); -+ wbt_issue(req->q->rq_wb, &req->wb_stat); - - /* - * We are now handing the request to the hardware, initialize -@@ -2713,9 +2726,10 @@ void blk_finish_request(struct request *req, int error) - - blk_account_io_done(req); - -- if (req->end_io) -+ if (req->end_io) { -+ wbt_done(req->q->rq_wb, &req->wb_stat); - req->end_io(req, error); -- else { -+ } else { - if (blk_bidi_rq(req)) - __blk_put_request(req->next_rq->q, req->next_rq); - -diff --git a/block/blk-mq.c b/block/blk-mq.c -index 712f141..511289a 100644 ---- a/block/blk-mq.c -+++ b/block/blk-mq.c -@@ -22,6 +22,7 @@ - #include <linux/sched/sysctl.h> - #include <linux/delay.h> - #include <linux/crash_dump.h> -+#include <linux/wbt.h> - - #include <trace/events/block.h> - -@@ -319,6 +320,8 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, - - if (rq->cmd_flags & REQ_MQ_INFLIGHT) - atomic_dec(&hctx->nr_active); -+ -+ wbt_done(q->rq_wb, &rq->wb_stat); - rq->cmd_flags = 0; - - clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); -@@ -351,6 +354,7 @@ inline void __blk_mq_end_request(struct request *rq, int error) - blk_account_io_done(rq); - - if (rq->end_io) { -+ wbt_done(rq->q->rq_wb, &rq->wb_stat); - rq->end_io(rq, error); - } else { - if (unlikely(blk_bidi_rq(rq))) -@@ -457,7 +461,7 @@ void blk_mq_start_request(struct request *rq) - if (unlikely(blk_bidi_rq(rq))) - rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq); - -- rq->issue_time = ktime_to_ns(ktime_get()); -+ wbt_issue(q->rq_wb, &rq->wb_stat); - - blk_add_timer(rq); - -@@ -494,6 +498,7 @@ static void __blk_mq_requeue_request(struct request *rq) - struct request_queue *q = rq->q; - - trace_block_rq_requeue(q, rq); -+ wbt_requeue(q->rq_wb, &rq->wb_stat); - - if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { - if (q->dma_drain_size && blk_rq_bytes(rq)) -@@ -1312,6 +1317,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) - struct blk_plug *plug; - struct request *same_queue_rq = NULL; - blk_qc_t cookie; -+ unsigned int wb_acct; - - blk_queue_bounce(q, &bio); - -@@ -1326,9 +1332,16 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) - blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) - return BLK_QC_T_NONE; - -+ wb_acct = wbt_wait(q->rq_wb, bio->bi_opf, NULL); -+ - rq = blk_mq_map_request(q, bio, &data); -- if (unlikely(!rq)) -+ if (unlikely(!rq)) { -+ if (wb_acct & WBT_TRACKED) -+ __wbt_done(q->rq_wb); - return BLK_QC_T_NONE; -+ } -+ -+ wbt_track(&rq->wb_stat, wb_acct); - - cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); - -@@ -1405,6 +1418,7 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) - struct blk_map_ctx data; - struct request *rq; - blk_qc_t cookie; -+ unsigned int wb_acct; - - blk_queue_bounce(q, &bio); - -@@ -1421,9 +1435,16 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) - } else - request_count = blk_plug_queued_count(q); - -+ wb_acct = wbt_wait(q->rq_wb, bio->bi_opf, NULL); -+ - rq = blk_mq_map_request(q, bio, &data); -- if (unlikely(!rq)) -+ if (unlikely(!rq)) { -+ if (wb_acct & WBT_TRACKED) -+ __wbt_done(q->rq_wb); - return BLK_QC_T_NONE; -+ } -+ -+ wbt_track(&rq->wb_stat, wb_acct); - - cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); - -@@ -2147,6 +2168,9 @@ void blk_mq_free_queue(struct request_queue *q) - list_del_init(&q->all_q_node); - 
mutex_unlock(&all_q_mutex); - -+ wbt_exit(q->rq_wb); -+ q->rq_wb = NULL; -+ - blk_mq_del_queue_tag_set(q); - - blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); -diff --git a/block/blk-settings.c b/block/blk-settings.c -index f7e122e..746dc9f 100644 ---- a/block/blk-settings.c -+++ b/block/blk-settings.c -@@ -840,6 +840,7 @@ EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); - void blk_set_queue_depth(struct request_queue *q, unsigned int depth) - { - q->queue_depth = depth; -+ wbt_set_queue_depth(q->rq_wb, depth); - } - EXPORT_SYMBOL(blk_set_queue_depth); - -@@ -863,6 +864,8 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) - else - queue_flag_clear(QUEUE_FLAG_FUA, q); - spin_unlock_irq(q->queue_lock); -+ -+ wbt_set_write_cache(q->rq_wb, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); - } - EXPORT_SYMBOL_GPL(blk_queue_write_cache); - -diff --git a/block/blk-stat.c b/block/blk-stat.c -index 3965e8a..bdb16d8 100644 ---- a/block/blk-stat.c -+++ b/block/blk-stat.c -@@ -178,15 +178,16 @@ bool blk_stat_is_current(struct blk_rq_stat *stat) - void blk_stat_add(struct blk_rq_stat *stat, struct request *rq) - { - s64 now, value; -+ u64 rq_time = wbt_issue_stat_get_time(&rq->wb_stat); - - now = ktime_to_ns(ktime_get()); -- if (now < rq->issue_time) -+ if (now < rq_time) - return; - - if (!__blk_stat_is_current(stat, now)) - __blk_stat_init(stat, now); - -- value = now - rq->issue_time; -+ value = now - rq_time; - if (value > stat->max) - stat->max = value; - if (value < stat->min) -diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c -index 0b9e435..85c3dc2 100644 ---- a/block/blk-sysfs.c -+++ b/block/blk-sysfs.c -@@ -10,6 +10,7 @@ - #include <linux/blktrace_api.h> - #include <linux/blk-mq.h> - #include <linux/blk-cgroup.h> -+#include <linux/wbt.h> - - #include "blk.h" - #include "blk-mq.h" -@@ -41,6 +42,19 @@ queue_var_store(unsigned long *var, const char *page, size_t count) - return count; - } - -+static ssize_t queue_var_store64(u64 *var, const char *page) -+{ -+ int err; -+ u64 v; -+ -+ err = kstrtou64(page, 10, &v); -+ if (err < 0) -+ return err; -+ -+ *var = v; -+ return 0; -+} -+ - static ssize_t queue_requests_show(struct request_queue *q, char *page) - { - return queue_var_show(q->nr_requests, (page)); -@@ -347,6 +361,58 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page, - return ret; - } - -+static ssize_t queue_wb_win_show(struct request_queue *q, char *page) -+{ -+ if (!q->rq_wb) -+ return -EINVAL; -+ -+ return sprintf(page, "%llu\n", div_u64(q->rq_wb->win_nsec, 1000)); -+} -+ -+static ssize_t queue_wb_win_store(struct request_queue *q, const char *page, -+ size_t count) -+{ -+ ssize_t ret; -+ u64 val; -+ -+ if (!q->rq_wb) -+ return -EINVAL; -+ -+ ret = queue_var_store64(&val, page); -+ if (ret < 0) -+ return ret; -+ -+ q->rq_wb->win_nsec = val * 1000ULL; -+ wbt_update_limits(q->rq_wb); -+ return count; -+} -+ -+static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) -+{ -+ if (!q->rq_wb) -+ return -EINVAL; -+ -+ return sprintf(page, "%llu\n", div_u64(q->rq_wb->min_lat_nsec, 1000)); -+} -+ -+static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, -+ size_t count) -+{ -+ ssize_t ret; -+ u64 val; -+ -+ if (!q->rq_wb) -+ return -EINVAL; -+ -+ ret = queue_var_store64(&val, page); -+ if (ret < 0) -+ return ret; -+ -+ q->rq_wb->min_lat_nsec = val * 1000ULL; -+ wbt_update_limits(q->rq_wb); -+ return count; -+} -+ - static ssize_t queue_wc_show(struct request_queue *q, char *page) - { - if (test_bit(QUEUE_FLAG_WC, 
&q->queue_flags)) -@@ -551,6 +617,18 @@ static struct queue_sysfs_entry queue_stats_entry = { - .show = queue_stats_show, - }; - -+static struct queue_sysfs_entry queue_wb_lat_entry = { -+ .attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR }, -+ .show = queue_wb_lat_show, -+ .store = queue_wb_lat_store, -+}; -+ -+static struct queue_sysfs_entry queue_wb_win_entry = { -+ .attr = {.name = "wbt_window_usec", .mode = S_IRUGO | S_IWUSR }, -+ .show = queue_wb_win_show, -+ .store = queue_wb_win_store, -+}; -+ - static struct attribute *default_attrs[] = { - &queue_requests_entry.attr, - &queue_ra_entry.attr, -@@ -579,6 +657,8 @@ static struct attribute *default_attrs[] = { - &queue_wc_entry.attr, - &queue_dax_entry.attr, - &queue_stats_entry.attr, -+ &queue_wb_lat_entry.attr, -+ &queue_wb_win_entry.attr, - NULL, - }; - -@@ -693,6 +773,49 @@ struct kobj_type blk_queue_ktype = { - .release = blk_release_queue, - }; - -+static void blk_wb_stat_get(void *data, struct blk_rq_stat *stat) -+{ -+ blk_queue_stat_get(data, stat); -+} -+ -+static void blk_wb_stat_clear(void *data) -+{ -+ blk_stat_clear(data); -+} -+ -+static bool blk_wb_stat_is_current(struct blk_rq_stat *stat) -+{ -+ return blk_stat_is_current(stat); -+} -+ -+static struct wb_stat_ops wb_stat_ops = { -+ .get = blk_wb_stat_get, -+ .is_current = blk_wb_stat_is_current, -+ .clear = blk_wb_stat_clear, -+}; -+ -+static void blk_wb_init(struct request_queue *q) -+{ -+ struct rq_wb *rwb; -+ -+ rwb = wbt_init(&q->backing_dev_info, &wb_stat_ops, q); -+ -+ /* -+ * If this fails, we don't get throttling -+ */ -+ if (IS_ERR(rwb)) -+ return; -+ -+ if (blk_queue_nonrot(q)) -+ rwb->min_lat_nsec = 2000000ULL; -+ else -+ rwb->min_lat_nsec = 75000000ULL; -+ -+ wbt_set_queue_depth(rwb, blk_queue_depth(q)); -+ wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); -+ q->rq_wb = rwb; -+} -+ - int blk_register_queue(struct gendisk *disk) - { - int ret; -@@ -732,6 +855,8 @@ int blk_register_queue(struct gendisk *disk) - if (q->mq_ops) - blk_mq_register_disk(disk); - -+ blk_wb_init(q); -+ - if (!q->request_fn) - return 0; - -diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c -index cc2f6db..fdcd5999 100644 ---- a/block/cfq-iosched.c -+++ b/block/cfq-iosched.c -@@ -3764,9 +3764,11 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) - struct cfq_data *cfqd = cic_to_cfqd(cic); - struct cfq_queue *cfqq; - uint64_t serial_nr; -+ bool nonroot_cg; - - rcu_read_lock(); - serial_nr = bio_blkcg(bio)->css.serial_nr; -+ nonroot_cg = bio_blkcg(bio) != &blkcg_root; - rcu_read_unlock(); - - /* -@@ -3777,6 +3779,17 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) - return; - - /* -+ * If we have a non-root cgroup, we can depend on that to -+ * do proper throttling of writes. Turn off wbt for that -+ * case. -+ */ -+ if (nonroot_cg) { -+ struct request_queue *q = cfqd->queue; -+ -+ wbt_disable(q->rq_wb); -+ } -+ -+ /* - * Drop reference to queues. New queues will be assigned in new - * group upon arrival of fresh requests. 
- */ -diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index 259eba8..45256d7 100644 ---- a/include/linux/blkdev.h -+++ b/include/linux/blkdev.h -@@ -24,6 +24,7 @@ - #include <linux/rcupdate.h> - #include <linux/percpu-refcount.h> - #include <linux/scatterlist.h> -+#include <linux/wbt.h> - - struct module; - struct scsi_ioctl_command; -@@ -37,6 +38,7 @@ struct bsg_job; - struct blkcg_gq; - struct blk_flush_queue; - struct pr_ops; -+struct rq_wb; - - #define BLKDEV_MIN_RQ 4 - #define BLKDEV_MAX_RQ 128 /* Default maximum */ -@@ -151,7 +153,7 @@ struct request { - struct gendisk *rq_disk; - struct hd_struct *part; - unsigned long start_time; -- s64 issue_time; -+ struct wb_issue_stat wb_stat; - #ifdef CONFIG_BLK_CGROUP - struct request_list *rl; /* rl this rq is alloced from */ - unsigned long long start_time_ns; -@@ -303,6 +305,8 @@ struct request_queue { - int nr_rqs[2]; /* # allocated [a]sync rqs */ - int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */ - -+ struct rq_wb *rq_wb; -+ - /* - * If blkcg is not used, @q->root_rl serves all requests. If blkcg - * is used, root blkg allocates from @q->root_rl and all other --- -cgit v0.11.2 - -From 21c990f3ab1d3324ad3152cb94f86e6e0772b73c Mon Sep 17 00:00:00 2001 -From: Jens Axboe <axboe@fb.com> -Date: Sat, 10 Sep 2016 10:06:26 -0600 -Subject: wbt: spelling check fix - -Signed-off-by: Jens Axboe <axboe@fb.com> ---- - lib/wbt.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/lib/wbt.c b/lib/wbt.c -index a995703..5c507e5 100644 ---- a/lib/wbt.c -+++ b/lib/wbt.c -@@ -1,5 +1,5 @@ - /* -- * buffered writeback throttling. losely based on CoDel. We can't drop -+ * buffered writeback throttling. loosely based on CoDel. We can't drop - * packets for IO scheduling, so the logic is something like this: - * - * - Monitor latencies in a defined window of time. --- -cgit v0.11.2 - |
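
A note on the throttling patch above: the CoDel-like scaling it describes can be hard to picture from the diff alone. The stand-alone user-space sketch below is not part of the patch; every name, constant and the depth/window arithmetic in it is invented purely to illustrate the policy the commit message describes: bump the scale step and shrink the monitoring window when a window's minimum read latency misses the target, let the step go negative while only writes are in flight, drift back toward the stable zero step once behaviour looks good again, and re-derive the allowed write depth after every change.

#include <stdio.h>
#include <stdint.h>

#define DEF_WINDOW_USEC	100000ULL	/* monitoring window; made-up default */
#define TARGET_LAT_USEC	2000ULL		/* read latency target, SSD-like 2 msec */
#define QUEUE_DEPTH	32		/* device queue depth assumed for the demo */

struct wb_state {
	int scale_step;		/* > 0: throttled harder, < 0: opened up */
	uint64_t window_usec;	/* current monitoring window */
	unsigned int max_depth;	/* write depth allowed at this step */
};

/* Re-derive the allowed background-write depth from the current scale step. */
static void recalc_limits(struct wb_state *s)
{
	int depth = QUEUE_DEPTH;

	if (s->scale_step > 0)
		depth >>= s->scale_step;	/* shrink while throttling */
	else if (s->scale_step < 0)
		depth <<= -s->scale_step;	/* grow during write-only phases */

	s->max_depth = depth > 1 ? depth : 1;
}

/*
 * One monitoring window has ended. min_read_lat_usec is the smallest read
 * latency observed in it; had_reads says whether any reads completed at all.
 */
static void window_done(struct wb_state *s, uint64_t min_read_lat_usec,
			int had_reads)
{
	if (had_reads && min_read_lat_usec > TARGET_LAT_USEC) {
		/* Target missed: throttle harder and shrink the next window. */
		s->scale_step++;
		if (s->window_usec > DEF_WINDOW_USEC / 8)
			s->window_usec /= 2;
	} else if (!had_reads) {
		/* Only writes in flight: the step is allowed to go negative. */
		s->scale_step--;
	} else if (s->scale_step != 0) {
		/* Good behaviour: step back toward the stable zero state. */
		s->scale_step += s->scale_step > 0 ? -1 : 1;
		s->window_usec = DEF_WINDOW_USEC;
	}
	recalc_limits(s);
}

int main(void)
{
	struct wb_state s = { 0, DEF_WINDOW_USEC, 0 };
	/* simulated per-window input: minimum read latency, "saw any reads" */
	const uint64_t lat[]   = { 1500, 9000, 12000, 1800, 1700, 0, 0, 1600 };
	const int have_reads[] = {    1,    1,     1,    1,    1, 0, 0,    1 };
	unsigned int i;

	recalc_limits(&s);
	for (i = 0; i < sizeof(lat) / sizeof(lat[0]); i++) {
		window_done(&s, lat[i], have_reads[i]);
		printf("window %u: scale_step=%d window=%lluus max_depth=%u\n",
		       i, s.scale_step,
		       (unsigned long long)s.window_usec, s.max_depth);
	}
	return 0;
}

Built with any C compiler, this only prints how the step, window and allowed depth evolve over a few simulated windows; the real mechanism lives in lib/wbt.c above and is fed by the block layer's request latency statistics rather than hand-made samples.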