-rw-r--r--   .SRCINFO      |   38
-rw-r--r--   PKGBUILD      |   47
-rw-r--r--   block.patch   | 2650
-rw-r--r--   config.x86_64 |   16
-rw-r--r--   init.patch    |   38
-rw-r--r--   kconfig.patch |  426
-rw-r--r--   xattr.patch   |   69
-rw-r--r--   xfs.patch     |  137
8 files changed, 3372 insertions, 49 deletions
diff --git a/.SRCINFO b/.SRCINFO
@@ -1,6 +1,6 @@
 pkgbase = linux-surfacepro3-rt
-	pkgver = 4.8.11
-	pkgrel = 2.1
+	pkgver = 4.8.14
+	pkgrel = 2.2
 	url = https://github.com/alyptik/linux-surfacepro3-rt
 	arch = i686
 	arch = x86_64
@@ -12,36 +12,36 @@ pkgbase = linux-surfacepro3-rt
 	makedepends = bc
 	makedepends = elfutils
 	options = !strip
-	source = https://www.kernel.org/pub/linux/kernel/v4.x/linux-4.8.11.tar.xz
-	source = https://www.kernel.org/pub/linux/kernel/v4.x/linux-4.8.11.tar.sign
-	source = https://www.kernel.org/pub/linux/kernel/projects/rt/4.8/older/patch-4.8.11-rt7.patch.xz
-	source = https://www.kernel.org/pub/linux/kernel/projects/rt/4.8/older/patch-4.8.11-rt7.patch.sign
+	source = https://www.kernel.org/pub/linux/kernel/v4.x/linux-4.8.14.tar.xz
+	source = https://www.kernel.org/pub/linux/kernel/v4.x/linux-4.8.14.tar.sign
+	source = https://www.kernel.org/pub/linux/kernel/projects/rt/4.8/older/patch-4.8.14-rt9.patch.xz
+	source = https://www.kernel.org/pub/linux/kernel/projects/rt/4.8/older/patch-4.8.14-rt9.patch.sign
 	source = https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/bfq.patch
 	source = https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/bfs.patch
 	source = https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/bfs-fixes1.patch
 	source = https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/bfs-fixes2.patch
 	source = https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/bfs-fixes3.patch
-	source = https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/block.patch
-	source = https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/init.patch
-	source = https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/kconfig.patch
-	source = https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/xattr.patch
-	source = https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/xfs.patch
-	source = multitouch.patch
+	source = block.patch
+	source = init.patch
+	source = kconfig.patch
+	source = xattr.patch
+	source = xfs.patch
 	source = touchscreen_multitouch_fixes1.patch
 	source = touchscreen_multitouch_fixes2.patch
 	source = wifi.patch
+	source = multitouch.patch
+	source = change-default-console-loglevel.patch
 	source = config
 	source = config.x86_64
 	source = config.sp3
 	source = linux.preset
-	source = change-default-console-loglevel.patch
 	validpgpkeys = ABAF11C65A2970B130ABE3C479BE3E4300411886
 	validpgpkeys = 647F28654894E3BD457199BE38DBBDC86092693E
 	validpgpkeys = 64254695FFF0AA4466CC19E67B96E8162A8CF5D1
 	validpgpkeys = 8D633C480C2247466051B7ADE314F17E08EF006D
-	sha256sums = cc0f42f408ba3e51f8b0e93e3d8050ff18569456d286cb2a1aca3327dd06890f
+	sha256sums = 81e344d7852128a80fe54f659b2c87bb1b1bde560cd80c52c79c99f568fd5acf
 	sha256sums = SKIP
-	sha256sums = f258a256ebdb51ceabbe1e482706756437c7113c6d987025203468bfb8601f9a
+	sha256sums = 157492d303dd0504181e55cdcfe65471c578a3a540527c1c11b0a39297b23de5
 	sha256sums = SKIP
 	sha256sums = 242d32d0fe819852e74d93b8a044cf24a40a9474d6f00ca93a19aa98298dcefa
 	sha256sums = 51f91681b708149fe91e565f5c40811477428e2aa86f8726a20e0e7c55c5407c
@@ -53,15 +53,15 @@ pkgbase = linux-surfacepro3-rt
 	sha256sums = f479a5ca6abe4d50ca4c09e6e83a027369fcd3efff8d5ce60f0699d8fa47beb8
 	sha256sums = 4633ae19b9a9871a3cfffba98ec7c3cd240f64bef8a0eebcf1212219c80972fd
 	sha256sums = 6618ef72495a6f7c7e50ecfba4a897f78668a3cbaabb93e97ad3d276e7abc52c
-	sha256sums = 3a4722981f689225a0ad550e45d829fcc3ca29d4258df3c6c989a916199e1c08
 	sha256sums = cc78e8844d9ec4bd29cce392a3e4683061646e1ad7c100c4958a5cadabb25b52
 	sha256sums = 34b4e00ffcf9efc43ab47444d14febb94432d340d0f1d5bcd56153879d1be113
 	sha256sums = 52e7c895aeb505bc8d3b5321a346fcdbb749f8035cacc97a237c24c1f527adbc
+	sha256sums = 3a4722981f689225a0ad550e45d829fcc3ca29d4258df3c6c989a916199e1c08
+	sha256sums = 1256b241cd477b265a3c2d64bdc19ffe3c9bbcee82ea3994c590c2c76e767d99
 	sha256sums = 0fcd0b22fe9ec58ba41b81b463f68d619b6898a5c405fb26c85237a183240371
-	sha256sums = 577a3c4c211e6946fb8c1448d6a325861b41c8c8660203ae7d63a58f3af0d279
-	sha256sums = 5c92eb5febe5bafcc76f19aa3d4aaf723cfbb465615cd68ecfaea54a2f773994
+	sha256sums = ed9b9e6efaf4f23e7ae3406322b4d1d3080e8dbc7ab3f03bcbf728ca2010e21b
+	sha256sums = f0c70a988490189ac3869ef948db2ca28704f1d85a1519d5b5f99afa7b10f741
 	sha256sums = f0d90e756f14533ee67afda280500511a62465b4f76adcc5effa95a40045179c
-	sha256sums = 1256b241cd477b265a3c2d64bdc19ffe3c9bbcee82ea3994c590c2c76e767d99
 
 pkgname = linux-surfacepro3-rt
 	pkgdesc = The Linux-surfacepro3-rt kernel and modules
diff --git a/PKGBUILD b/PKGBUILD
@@ -2,9 +2,10 @@
 # Contributor: Matthew Wardrop <mister.wardrop@gmail.com>
 pkgbase=linux-surfacepro3-rt
-_srcname=linux-4.8.11
-pkgver=4.8.11
-pkgrel=2.1
+_srcname=linux-4.8.14
+pkgver=${_srcname#linux-}
+_rtver=rt9
+pkgrel=2.2
 arch=('i686' 'x86_64')
 url="https://github.com/alyptik/linux-surfacepro3-rt"
 license=('GPL2')
@@ -12,33 +13,28 @@ makedepends=('xmlto' 'docbook-xsl' 'kmod' 'inetutils' 'bc' 'elfutils')
 options=('!strip')
 source=("https://www.kernel.org/pub/linux/kernel/v4.x/${_srcname}.tar.xz"
 	"https://www.kernel.org/pub/linux/kernel/v4.x/${_srcname}.tar.sign"
-	"https://www.kernel.org/pub/linux/kernel/projects/rt/${pkgver%.*}/older/patch-${pkgver}-rt7.patch.xz"
-	"https://www.kernel.org/pub/linux/kernel/projects/rt/${pkgver%.*}/older/patch-${pkgver}-rt7.patch.sign"
+	"https://www.kernel.org/pub/linux/kernel/projects/rt/${pkgver%.*}/older/patch-${pkgver}-${_rtver}.patch.xz"
+	"https://www.kernel.org/pub/linux/kernel/projects/rt/${pkgver%.*}/older/patch-${pkgver}-${_rtver}.patch.sign"
 	# Brain Fuck Scheduler & other personal patches
 	'https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/bfq.patch'
 	'https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/bfs.patch'
 	'https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/bfs-fixes1.patch'
 	'https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/bfs-fixes2.patch'
 	'https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/bfs-fixes3.patch'
-	'https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/block.patch'
-	'https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/init.patch'
-	'https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/kconfig.patch'
-	'https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/xattr.patch'
-	'https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/xfs.patch'
-	'multitouch.patch'
-	'touchscreen_multitouch_fixes1.patch'
-	'touchscreen_multitouch_fixes2.patch'
+	'block.patch' 'init.patch' 'kconfig.patch' 'xattr.patch' 'xfs.patch'
+	'touchscreen_multitouch_fixes1.patch' 'touchscreen_multitouch_fixes2.patch'
 	'wifi.patch'
+	'multitouch.patch'
+	'change-default-console-loglevel.patch'
 	# the main kernel config files
 	'config' 'config.x86_64' 'config.sp3'
 	# standard config files for mkinitcpio ramdisk
 	'linux.preset'
-	'change-default-console-loglevel.patch'
 )
-sha256sums=('cc0f42f408ba3e51f8b0e93e3d8050ff18569456d286cb2a1aca3327dd06890f'
+sha256sums=('81e344d7852128a80fe54f659b2c87bb1b1bde560cd80c52c79c99f568fd5acf'
             'SKIP'
-            'f258a256ebdb51ceabbe1e482706756437c7113c6d987025203468bfb8601f9a'
+            '157492d303dd0504181e55cdcfe65471c578a3a540527c1c11b0a39297b23de5'
             'SKIP'
             '242d32d0fe819852e74d93b8a044cf24a40a9474d6f00ca93a19aa98298dcefa'
             '51f91681b708149fe91e565f5c40811477428e2aa86f8726a20e0e7c55c5407c'
@@ -50,15 +46,15 @@ sha256sums=('cc0f42f408ba3e51f8b0e93e3d8050ff18569456d286cb2a1aca3327dd06890f'
             'f479a5ca6abe4d50ca4c09e6e83a027369fcd3efff8d5ce60f0699d8fa47beb8'
             '4633ae19b9a9871a3cfffba98ec7c3cd240f64bef8a0eebcf1212219c80972fd'
             '6618ef72495a6f7c7e50ecfba4a897f78668a3cbaabb93e97ad3d276e7abc52c'
-            '3a4722981f689225a0ad550e45d829fcc3ca29d4258df3c6c989a916199e1c08'
             'cc78e8844d9ec4bd29cce392a3e4683061646e1ad7c100c4958a5cadabb25b52'
             '34b4e00ffcf9efc43ab47444d14febb94432d340d0f1d5bcd56153879d1be113'
             '52e7c895aeb505bc8d3b5321a346fcdbb749f8035cacc97a237c24c1f527adbc'
+            '3a4722981f689225a0ad550e45d829fcc3ca29d4258df3c6c989a916199e1c08'
+            '1256b241cd477b265a3c2d64bdc19ffe3c9bbcee82ea3994c590c2c76e767d99'
             '0fcd0b22fe9ec58ba41b81b463f68d619b6898a5c405fb26c85237a183240371'
-            '577a3c4c211e6946fb8c1448d6a325861b41c8c8660203ae7d63a58f3af0d279'
-            '5c92eb5febe5bafcc76f19aa3d4aaf723cfbb465615cd68ecfaea54a2f773994'
-            'f0d90e756f14533ee67afda280500511a62465b4f76adcc5effa95a40045179c'
-            '1256b241cd477b265a3c2d64bdc19ffe3c9bbcee82ea3994c590c2c76e767d99')
+            'ed9b9e6efaf4f23e7ae3406322b4d1d3080e8dbc7ab3f03bcbf728ca2010e21b'
+            'f0c70a988490189ac3869ef948db2ca28704f1d85a1519d5b5f99afa7b10f741'
+            'f0d90e756f14533ee67afda280500511a62465b4f76adcc5effa95a40045179c')
 validpgpkeys=(
              'ABAF11C65A2970B130ABE3C479BE3E4300411886' # Linus Torvalds
@@ -67,11 +63,11 @@ validpgpkeys=(
              '8D633C480C2247466051B7ADE314F17E08EF006D' # Joey Pabalinas
 )
 multitouch='y'
-sp3config='y'
 bcache='n'
 bfs='n'
 bfq='n'
 personal='y'
+sp3config='n'
 
 _kernelname=${pkgbase#linux}
 
@@ -82,7 +78,8 @@ prepare() {
 	if [ "$bfq" = 'y' ]; then patch -p1 -i "${srcdir}/bfq.patch"; fi
 	if [ "$bfs" = 'y' ]; then for i in bfs bfs-fixes{1..3}; do patch -p1 -i "${srcdir}/${i}.patch"; done; fi
 	if [ "$bcache" = 'y' ]; then
-		sed -i '\%^diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig$%,+11 d' "${srcdir}/patch-${pkgver}-rt2.patch"
+		sed -i '\%^diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig$%,+11 d' \
+			"${srcdir}/patch-${pkgver}-${_rtver}.patch"
 		cp "${srcdir}/linux-${pkgver}/include/linux/rwsem.h" "${srcdir}/linux-${pkgver}/drivers/md/bcache/"
 		sed -i '/#include "bcache.h"/i #include "rwsem.h"\n' "${srcdir}/linux-${pkgver}/drivers/md/bcache/request.c"
 	fi
@@ -91,7 +88,7 @@ prepare() {
 	if [ "$personal" = 'y' ]; then for i in block init kconfig xattr xfs; do patch -p1 -i "${srcdir}/${i}.patch"; done; fi
 
 	# Add RT patches
-	patch -p1 -i ${srcdir}/patch-${pkgver}*.patch
+	patch -p1 -i ${srcdir}/patch-${pkgver}-${_rtver}.patch
 
 	# set DEFAULT_CONSOLE_LOGLEVEL to 4 (same value as the 'quiet' kernel param)
 	# remove this when a Kconfig knob is made available by upstream
@@ -113,7 +110,7 @@ prepare() {
 	## If sp3config='y' use personal config as a base
 	if [ "$sp3config" = 'y' ]; then
 		cat "${srcdir}/config.sp3" >./.config
-	elif [ "${CARCH}" = "x86_64" ]; then
+	elif [ "$CARCH" = "x86_64" ]; then
 		cat "${srcdir}/config.x86_64" >./.config
 	else
 		cat "${srcdir}/config" > ./.config
diff --git a/block.patch b/block.patch
new file mode 100644
index 000000000000..1f99d7869a0d
--- /dev/null
+++ b/block.patch
@@ -0,0 +1,2650 @@
+To: LKML <linux-kernel@vger.kernel.org>, Jens Axboe <axboe@fb.com>
+From: =?UTF-8?Q?Holger_Hoffst=c3=a4tte?= <holger.hoffstaette@googlemail.com>
+Subject: [PATCH] loop: properly observe rotational flag of underlying device
+Organization: Applied Asynchrony, Inc.
+Date: Wed, 11 Nov 2015 16:21:51 +0100
+
+The loop driver always declares the rotational flag of its device as
+rotational, even when the device of the mapped file is nonrotational,
+as is the case with SSDs or on tmpfs. This can confuse filesystem tools
+which are SSD-aware; in my case I frequently forget to tell mkfs.btrfs
+that my loop device on tmpfs is nonrotational, and that I really don't
+need any automatic metadata redundancy.
+
+The attached patch fixes this by introspecting the rotational flag of the
+mapped file's underlying block device, if it exists. If the mapped file's
+filesystem has no associated block device - as is the case on e.g. tmpfs -
+we assume nonrotational storage. If there is a better way to identify such
+non-devices I'd love to hear it.
+
+Signed-off-by: Holger Hoffstätte <holger.hoffstaette@googlemail.com>
+---
+ drivers/block/loop.c | 19 +++++++++++++++++++
+ 1 file changed, 19 insertions(+)
+
+diff --git a/drivers/block/loop.c b/drivers/block/loop.c
+index 423f4ca..2984aca 100644
+--- a/drivers/block/loop.c
++++ b/drivers/block/loop.c
+@@ -843,6 +843,24 @@ static void loop_config_discard(struct loop_device *lo)
+ 	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+ }
+ 
++static void loop_update_rotational(struct loop_device *lo)
++{
++	struct file *file = lo->lo_backing_file;
++	struct inode *file_inode = file->f_mapping->host;
++	struct block_device *file_bdev = file_inode->i_sb->s_bdev;
++	struct request_queue *q = lo->lo_queue;
++	bool nonrot = true;
++
++	/* not all filesystems (e.g. tmpfs) have a sb->s_bdev */
++	if (file_bdev)
++		nonrot = blk_queue_nonrot(bdev_get_queue(file_bdev));
++
++	if (nonrot)
++		queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
++	else
++		queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, q);
++}
++
+ static void loop_unprepare_queue(struct loop_device *lo)
+ {
+ 	flush_kthread_worker(&lo->worker);
+@@ -939,6 +957,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
+ 	if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
+ 		blk_queue_flush(lo->lo_queue, REQ_FLUSH);
+ 
++	loop_update_rotational(lo);
+ 	loop_update_dio(lo);
+ 	set_capacity(lo->lo_disk, size);
+ 	bd_set_size(bdev, size << 9);
+--
+2.6.3
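The effect of the loop patch above is easy to verify from userspace once a file is attached to a loop device, since the flag it maintains is exported through the queue's sysfs directory. A minimal sketch (the /sys path and loop0 are assumed examples, not taken from the patch):

#include <stdio.h>

/* Read back the rotational flag that loop_update_rotational() above now
 * maintains. "/sys/block/loop0/queue/rotational" is an assumed example
 * path; 0 = nonrotational (SSD- or tmpfs-backed), 1 = rotational. */
int main(void)
{
	FILE *f = fopen("/sys/block/loop0/queue/rotational", "r");
	int rot = -1;

	if (!f) {
		perror("open");
		return 1;
	}
	if (fscanf(f, "%d", &rot) != 1)
		rot = -1;
	fclose(f);
	printf("loop0 rotational flag: %d\n", rot);
	return rot < 0;
}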
+From 273d4cb9fc3d75b6b7f147d1a064f75a5412a76c Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@fb.com>
+Date: Wed, 27 Jul 2016 15:30:35 -0600
+Subject: block: add WRITE_BG
+
+This adds a new request flag, REQ_BG, that callers can use to tell
+the block layer that this is background (non-urgent) IO.
+
+Signed-off-by: Jens Axboe <axboe@fb.com>
+---
+ include/linux/blk_types.h | 4 +++-
+ include/linux/fs.h        | 3 +++
+ 2 files changed, 6 insertions(+), 1 deletion(-)
+
+diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
+index 436f43f..be4409b 100644
+--- a/include/linux/blk_types.h
++++ b/include/linux/blk_types.h
+@@ -155,6 +155,7 @@ enum rq_flag_bits {
+ 	__REQ_INTEGRITY,	/* I/O includes block integrity payload */
+ 	__REQ_FUA,		/* forced unit access */
+ 	__REQ_PREFLUSH,		/* request for cache flush */
++	__REQ_BG,		/* background activity */
+ 
+ 	/* bio only flags */
+ 	__REQ_RAHEAD,		/* read ahead, can fail anytime */
+@@ -198,7 +199,7 @@ enum rq_flag_bits {
+ 	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
+ #define REQ_COMMON_MASK \
+ 	(REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | REQ_NOIDLE | \
+-	 REQ_PREFLUSH | REQ_FUA | REQ_INTEGRITY | REQ_NOMERGE)
++	 REQ_PREFLUSH | REQ_FUA | REQ_INTEGRITY | REQ_NOMERGE | REQ_BG)
+ #define REQ_CLONE_MASK		REQ_COMMON_MASK
+ 
+ /* This mask is used for both bio and request merge checking */
+@@ -223,6 +224,7 @@ enum rq_flag_bits {
+ #define REQ_COPY_USER		(1ULL << __REQ_COPY_USER)
+ #define REQ_PREFLUSH		(1ULL << __REQ_PREFLUSH)
+ #define REQ_FLUSH_SEQ		(1ULL << __REQ_FLUSH_SEQ)
++#define REQ_BG			(1ULL << __REQ_BG)
+ #define REQ_IO_STAT		(1ULL << __REQ_IO_STAT)
+ #define REQ_MIXED_MERGE		(1ULL << __REQ_MIXED_MERGE)
+ #define REQ_PM			(1ULL << __REQ_PM)
+diff --git a/include/linux/fs.h b/include/linux/fs.h
+index 901e25d..7c7951f 100644
+--- a/include/linux/fs.h
++++ b/include/linux/fs.h
+@@ -189,6 +189,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
+  * WRITE_FLUSH_FUA	Combination of WRITE_FLUSH and FUA. The IO is preceded
+  *			by a cache flush and data is guaranteed to be on
+  *			non-volatile media on completion.
++ * WRITE_BG		Background write. This is for background activity like
++ *			the periodic flush and background threshold writeback
+  *
+  */
+ #define RW_MASK			REQ_OP_WRITE
+@@ -202,6 +204,7 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
+ #define WRITE_FLUSH		(REQ_SYNC | REQ_NOIDLE | REQ_PREFLUSH)
+ #define WRITE_FUA		(REQ_SYNC | REQ_NOIDLE | REQ_FUA)
+ #define WRITE_FLUSH_FUA		(REQ_SYNC | REQ_NOIDLE | REQ_PREFLUSH | REQ_FUA)
++#define WRITE_BG		(REQ_NOIDLE | REQ_BG)
+ 
+ /*
+  * Attribute flags.  These should be or-ed together to figure out what
+--
+cgit v0.11.2
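To see why a single new bit is enough here, the following is a simplified, userspace-only model of the flag scheme the hunks above extend; the names mirror the patch, but the bit positions are illustrative:

#include <stdio.h>

/* Simplified model of rq_flag_bits above: every flag is one bit of a
 * 64-bit word, and composite write types such as WRITE_BG are just
 * OR-ed flag sets. Bit positions here are illustrative only. */
enum { __REQ_SYNC, __REQ_NOIDLE, __REQ_BG };
#define REQ_SYNC	(1ULL << __REQ_SYNC)
#define REQ_NOIDLE	(1ULL << __REQ_NOIDLE)
#define REQ_BG		(1ULL << __REQ_BG)
#define WRITE_BG	(REQ_NOIDLE | REQ_BG)

int main(void)
{
	unsigned long long flags = WRITE_BG;

	/* lower layers test individual bits, never the composite */
	printf("background: %d, sync: %d\n",
	       !!(flags & REQ_BG), !!(flags & REQ_SYNC));
	return 0;
}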
+From 33a170c4f076584bc05feb19efa7beb0ee099318 Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@fb.com>
+Date: Wed, 27 Jul 2016 15:24:08 -0600
+Subject: writeback: add wbc_to_write_flags()
+
+Add wbc_to_write_flags(), which returns the write modifier flags to use,
+based on a struct writeback_control. No functional changes in this
+patch, but it prepares us for factoring other wbc fields for write type.
+
+Signed-off-by: Jens Axboe <axboe@fb.com>
+Reviewed-by: Jan Kara <jack@suse.cz>
+---
+ fs/buffer.c               | 2 +-
+ fs/f2fs/data.c            | 2 +-
+ fs/f2fs/node.c            | 2 +-
+ fs/gfs2/meta_io.c         | 3 +--
+ fs/mpage.c                | 2 +-
+ fs/xfs/xfs_aops.c         | 7 +++----
+ include/linux/writeback.h | 8 ++++++++
+ 7 files changed, 16 insertions(+), 10 deletions(-)
+
+diff --git a/fs/buffer.c b/fs/buffer.c
+index 9c8eb9b..6a5f1a0 100644
+--- a/fs/buffer.c
++++ b/fs/buffer.c
+@@ -1698,7 +1698,7 @@ int __block_write_full_page(struct inode *inode, struct page *page,
+ 	struct buffer_head *bh, *head;
+ 	unsigned int blocksize, bbits;
+ 	int nr_underway = 0;
+-	int write_flags = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0);
++	int write_flags = wbc_to_write_flags(wbc);
+ 
+ 	head = create_page_buffers(page, inode,
+ 					(1 << BH_Dirty)|(1 << BH_Uptodate));
+diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
+index ccb401e..cb0528b 100644
+--- a/fs/f2fs/data.c
++++ b/fs/f2fs/data.c
+@@ -1240,7 +1240,7 @@ static int f2fs_write_data_page(struct page *page,
+ 		.sbi = sbi,
+ 		.type = DATA,
+ 		.op = REQ_OP_WRITE,
+-		.op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0,
++		.op_flags = wbc_to_write_flags(wbc),
+ 		.page = page,
+ 		.encrypted_page = NULL,
+ 	};
+diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
+index f75d197..c1713da 100644
+--- a/fs/f2fs/node.c
++++ b/fs/f2fs/node.c
+@@ -1561,7 +1561,7 @@ static int f2fs_write_node_page(struct page *page,
+ 		.sbi = sbi,
+ 		.type = NODE,
+ 		.op = REQ_OP_WRITE,
+-		.op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0,
++		.op_flags = wbc_to_write_flags(wbc),
+ 		.page = page,
+ 		.encrypted_page = NULL,
+ 	};
+diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
+index 950b8be..7991c62 100644
+--- a/fs/gfs2/meta_io.c
++++ b/fs/gfs2/meta_io.c
+@@ -37,8 +37,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
+ {
+ 	struct buffer_head *bh, *head;
+ 	int nr_underway = 0;
+-	int write_flags = REQ_META | REQ_PRIO |
+-			  (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0);
++	int write_flags = REQ_META | REQ_PRIO | wbc_to_write_flags(wbc);
+ 
+ 	BUG_ON(!PageLocked(page));
+ 	BUG_ON(!page_has_buffers(page));
+diff --git a/fs/mpage.c b/fs/mpage.c
+index d2413af..d6f1afe 100644
+--- a/fs/mpage.c
++++ b/fs/mpage.c
+@@ -489,7 +489,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
+ 	struct buffer_head map_bh;
+ 	loff_t i_size = i_size_read(inode);
+ 	int ret = 0;
+-	int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0);
++	int op_flags = wbc_to_write_flags(wbc);
+ 
+ 	if (page_has_buffers(page)) {
+ 		struct buffer_head *head = page_buffers(page);
+diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
+index 7575cfc..a68645a 100644
+--- a/fs/xfs/xfs_aops.c
++++ b/fs/xfs/xfs_aops.c
+@@ -447,8 +447,8 @@ xfs_submit_ioend(
+ 
+ 	ioend->io_bio->bi_private = ioend;
+ 	ioend->io_bio->bi_end_io = xfs_end_bio;
+-	bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE,
+-			 (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0);
++	bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE, wbc_to_write_flags(wbc));
++
+ 	/*
+ 	 * If we are failing the IO now, just mark the ioend with an
+ 	 * error and finish it. This will run IO completion immediately
+@@ -519,8 +519,7 @@ xfs_chain_bio(
+ 
+ 	bio_chain(ioend->io_bio, new);
+ 	bio_get(ioend->io_bio);		/* for xfs_destroy_ioend */
+-	bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE,
+-			 (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0);
++	bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE, wbc_to_write_flags(wbc));
+ 	submit_bio(ioend->io_bio);
+ 	ioend->io_bio = new;
+ }
+diff --git a/include/linux/writeback.h b/include/linux/writeback.h
+index fc1e16c..608afd3 100644
+--- a/include/linux/writeback.h
++++ b/include/linux/writeback.h
+@@ -100,6 +100,14 @@ struct writeback_control {
+ #endif
+ };
+ 
++static inline int wbc_to_write_flags(struct writeback_control *wbc)
++{
++	if (wbc->sync_mode == WB_SYNC_ALL)
++		return WRITE_SYNC;
++
++	return 0;
++}
++
+ /*
+  * A wb_domain represents a domain that wb's (bdi_writeback's) belong to
+  * and are measured against each other in. There always is one global
+--
+cgit v0.11.2
+
+From d6cf7bfd4d627114ba3e2cce96fa9468042a6fba Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@fb.com>
+Date: Thu, 14 Apr 2016 09:53:24 -0600
+Subject: writeback: use WRITE_BG for kupdate and background writeback
+
+If we're doing background type writes, then use the appropriate
+write command for that.
+
+Signed-off-by: Jens Axboe <axboe@fb.com>
+---
+ include/linux/writeback.h | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/include/linux/writeback.h b/include/linux/writeback.h
+index 608afd3..e53abf2 100644
+--- a/include/linux/writeback.h
++++ b/include/linux/writeback.h
+@@ -104,6 +104,8 @@ static inline int wbc_to_write_flags(struct writeback_control *wbc)
+ {
+ 	if (wbc->sync_mode == WB_SYNC_ALL)
+ 		return WRITE_SYNC;
++	else if (wbc->for_kupdate || wbc->for_background)
++		return WRITE_BG;
+ 
+ 	return 0;
+ }
+--
+cgit v0.11.2
+
+From cd38cff40da34de0bf78f8305c89bdfafc606e7f Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@fb.com>
+Date: Thu, 1 Sep 2016 10:20:33 -0600
+Subject: writeback: track if we're sleeping on progress in
+ balance_dirty_pages()
+
+Note in the bdi_writeback structure whenever a task ends up sleeping
+waiting for progress. We can use that information in the lower layers
+to increase the priority of writes.
+
+Signed-off-by: Jens Axboe <axboe@fb.com>
+---
+ include/linux/backing-dev-defs.h | 2 ++
+ mm/backing-dev.c                 | 1 +
+ mm/page-writeback.c              | 1 +
+ 3 files changed, 4 insertions(+)
+
+diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
+index c357f27..dc5f76d 100644
+--- a/include/linux/backing-dev-defs.h
++++ b/include/linux/backing-dev-defs.h
+@@ -116,6 +116,8 @@ struct bdi_writeback {
+ 	struct list_head work_list;
+ 	struct delayed_work dwork;	/* work item used for writeback */
+ 
++	unsigned long dirty_sleep;	/* last wait */
++
+ 	struct list_head bdi_node;	/* anchored at bdi->wb_list */
+ 
+ #ifdef CONFIG_CGROUP_WRITEBACK
+diff --git a/mm/backing-dev.c b/mm/backing-dev.c
+index 8fde443..3bfed5ab 100644
+--- a/mm/backing-dev.c
++++ b/mm/backing-dev.c
+@@ -310,6 +310,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
+ 	spin_lock_init(&wb->work_lock);
+ 	INIT_LIST_HEAD(&wb->work_list);
+ 	INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
++	wb->dirty_sleep = jiffies;
+ 
+ 	wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp);
+ 	if (!wb->congested)
+diff --git a/mm/page-writeback.c b/mm/page-writeback.c
+index f4cd7d8..98bc3fc 100644
+--- a/mm/page-writeback.c
++++ b/mm/page-writeback.c
+@@ -1778,6 +1778,7 @@ pause:
+ 					  pause,
+ 					  start_time);
+ 		__set_current_state(TASK_KILLABLE);
++		wb->dirty_sleep = now;
+ 		io_schedule_timeout(pause);
+ 
+ 		current->dirty_paused_when = now + pause;
+--
+cgit v0.11.2
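The single timestamp added above is consumed later in this series (lib/wbt.c, below) as a "did a writer recently stall in balance_dirty_pages()?" test. A standalone sketch of that comparison, with plain integers standing in for jiffies and ignoring the counter wraparound that the kernel's time_before() handles:

#include <stdbool.h>
#include <stdio.h>

/* Sketch of the wb_recent_wait()-style check built on dirty_sleep:
 * a writer throttled in balance_dirty_pages() stamps dirty_sleep, and
 * anything within the last HZ ticks counts as "recent". Plain integers
 * stand in for jiffies; the kernel uses time_before() to survive
 * wraparound. */
#define HZ 100

static bool recent_wait(unsigned long now, unsigned long dirty_sleep)
{
	return now < dirty_sleep + HZ;
}

int main(void)
{
	unsigned long dirty_sleep = 1000;	/* stamped at the last stall */

	printf("%d %d\n", recent_wait(1050, dirty_sleep),
	       recent_wait(1200, dirty_sleep));	/* prints: 1 0 */
	return 0;
}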
+From a98f5ab3840c2e6008c478aafe5df055404acdd1 Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@fb.com>
+Date: Wed, 30 Mar 2016 10:21:08 -0600
+Subject: block: add code to track actual device queue depth
+
+For blk-mq, ->nr_requests does track queue depth, at least at init
+time. But for the older queue paths, it's simply a soft setting.
+On top of that, it's generally larger than the hardware setting
+on purpose, to allow backup of requests for merging.
+
+Fill a hole in struct request with a 'queue_depth' member, that
+drivers can call to more closely inform the block layer of the
+real queue depth.
+
+Signed-off-by: Jens Axboe <axboe@fb.com>
+---
+ block/blk-settings.c   | 12 ++++++++++++
+ drivers/scsi/scsi.c    |  3 +++
+ include/linux/blkdev.h | 11 +++++++++++
+ 3 files changed, 26 insertions(+)
+
+diff --git a/block/blk-settings.c b/block/blk-settings.c
+index f679ae1..f7e122e 100644
+--- a/block/blk-settings.c
++++ b/block/blk-settings.c
+@@ -832,6 +832,18 @@ void blk_queue_flush_queueable(struct request_queue *q, bool queueable)
+ EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
+ 
+ /**
++ * blk_set_queue_depth - tell the block layer about the device queue depth
++ * @q:		the request queue for the device
++ * @depth:	queue depth
++ *
++ */
++void blk_set_queue_depth(struct request_queue *q, unsigned int depth)
++{
++	q->queue_depth = depth;
++}
++EXPORT_SYMBOL(blk_set_queue_depth);
++
++/**
+  * blk_queue_write_cache - configure queue's write cache
+  * @q:		the request queue for the device
+  * @wc:		write back cache on or off
+diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
+index 1f36aca..f3de98a 100644
+--- a/drivers/scsi/scsi.c
++++ b/drivers/scsi/scsi.c
+@@ -621,6 +621,9 @@ int scsi_change_queue_depth(struct scsi_device *sdev, int depth)
+ 		wmb();
+ 	}
+ 
++	if (sdev->request_queue)
++		blk_set_queue_depth(sdev->request_queue, depth);
++
+ 	return sdev->queue_depth;
+ }
+ EXPORT_SYMBOL(scsi_change_queue_depth);
+diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
+index e79055c..1d12aa6 100644
+--- a/include/linux/blkdev.h
++++ b/include/linux/blkdev.h
+@@ -327,6 +327,8 @@ struct request_queue {
+ 	struct blk_mq_ctx __percpu	*queue_ctx;
+ 	unsigned int		nr_queues;
+ 
++	unsigned int		queue_depth;
++
+ 	/* hw dispatch queues */
+ 	struct blk_mq_hw_ctx	**queue_hw_ctx;
+ 	unsigned int		nr_hw_queues;
+@@ -683,6 +685,14 @@ static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b)
+ 	return false;
+ }
+ 
++static inline unsigned int blk_queue_depth(struct request_queue *q)
++{
++	if (q->queue_depth)
++		return q->queue_depth;
++
++	return q->nr_requests;
++}
++
+ /*
+  * q->prep_rq_fn return values
+  */
+@@ -999,6 +1009,7 @@ extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
+ extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
+ extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt);
+ extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt);
++extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth);
+ extern void blk_set_default_limits(struct queue_limits *lim);
+ extern void blk_set_stacking_limits(struct queue_limits *lim);
+ extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
+--
+cgit v0.11.2
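The fallback that blk_queue_depth() implements is simple enough to model outside the kernel; this sketch re-creates just that logic with a stripped-down struct:

#include <stdio.h>

/* Userspace model of the fallback added above: prefer the depth a
 * driver reported via blk_set_queue_depth(), else fall back to the
 * soft nr_requests setting. */
struct request_queue {
	unsigned int queue_depth;	/* 0 until a driver reports one */
	unsigned int nr_requests;
};

static unsigned int blk_queue_depth(const struct request_queue *q)
{
	return q->queue_depth ? q->queue_depth : q->nr_requests;
}

int main(void)
{
	struct request_queue q = { .queue_depth = 0, .nr_requests = 128 };

	printf("%u\n", blk_queue_depth(&q));	/* 128: soft setting */
	q.queue_depth = 32;	/* e.g. SCSI: blk_set_queue_depth(q, 32) */
	printf("%u\n", blk_queue_depth(&q));	/* 32: real HW depth */
	return 0;
}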
+From a13cc5885ddd5582129869c1837821d6af6d48bb Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@fb.com>
+Date: Thu, 1 Sep 2016 10:22:41 -0600
+Subject: block: add scalable completion tracking of requests
+
+For legacy block, we simply track them in the request queue. For
+blk-mq, we track them on a per-sw queue basis, which we can then
+sum up through the hardware queues and finally to a per device
+state.
+
+The stats are tracked in, roughly, 0.1s interval windows.
+
+Add sysfs files to display the stats.
+
+Signed-off-by: Jens Axboe <axboe@fb.com>
+---
+ block/Makefile            |   2 +-
+ block/blk-core.c          |   4 +
+ block/blk-mq-sysfs.c      |  47 ++++++++++
+ block/blk-mq.c            |  14 +++
+ block/blk-mq.h            |   3 +
+ block/blk-stat.c          | 220 ++++++++++++++++++++++++++++++++++++++++
+ block/blk-stat.h          |  18 ++++
+ block/blk-sysfs.c         |  26 ++++++
+ include/linux/blk_types.h |  12 +++
+ include/linux/blkdev.h    |   4 +
+ 10 files changed, 349 insertions(+), 1 deletion(-)
+ create mode 100644 block/blk-stat.c
+ create mode 100644 block/blk-stat.h
+
+diff --git a/block/Makefile b/block/Makefile
+index 9eda232..3446e04 100644
+--- a/block/Makefile
++++ b/block/Makefile
+@@ -5,7 +5,7 @@
+ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
+ 			blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
+ 			blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
+-			blk-lib.o blk-mq.o blk-mq-tag.o \
++			blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
+ 			blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
+ 			genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
+ 			badblocks.o partitions/
+diff --git a/block/blk-core.c b/block/blk-core.c
+index 36c7ac3..4075cbe 100644
+--- a/block/blk-core.c
++++ b/block/blk-core.c
+@@ -2475,6 +2475,8 @@ void blk_start_request(struct request *req)
+ {
+ 	blk_dequeue_request(req);
+ 
++	req->issue_time = ktime_to_ns(ktime_get());
++
+ 	/*
+ 	 * We are now handing the request to the hardware, initialize
+ 	 * resid_len to full count and add the timeout handler.
+@@ -2542,6 +2544,8 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
+ 
+ 	trace_block_rq_complete(req->q, req, nr_bytes);
+ 
++	blk_stat_add(&req->q->rq_stats[rq_data_dir(req)], req);
++
+ 	if (!req->bio)
+ 		return false;
+ 
+diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
+index fe822aa..b66bbf1 100644
+--- a/block/blk-mq-sysfs.c
++++ b/block/blk-mq-sysfs.c
+@@ -247,6 +247,47 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
+ 	return ret;
+ }
+ 
++static void blk_mq_stat_clear(struct blk_mq_hw_ctx *hctx)
++{
++	struct blk_mq_ctx *ctx;
++	unsigned int i;
++
++	hctx_for_each_ctx(hctx, ctx, i) {
++		blk_stat_init(&ctx->stat[0]);
++		blk_stat_init(&ctx->stat[1]);
++	}
++}
++
++static ssize_t blk_mq_hw_sysfs_stat_store(struct blk_mq_hw_ctx *hctx,
++					  const char *page, size_t count)
++{
++	blk_mq_stat_clear(hctx);
++	return count;
++}
++
++static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
++{
++	return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
++			pre, (long long) stat->nr_samples,
++			(long long) stat->mean, (long long) stat->min,
++			(long long) stat->max);
++}
++
++static ssize_t blk_mq_hw_sysfs_stat_show(struct blk_mq_hw_ctx *hctx, char *page)
++{
++	struct blk_rq_stat stat[2];
++	ssize_t ret;
++
++	blk_stat_init(&stat[0]);
++	blk_stat_init(&stat[1]);
++
++	blk_hctx_stat_get(hctx, stat);
++
++	ret = print_stat(page, &stat[0], "read :");
++	ret += print_stat(page + ret, &stat[1], "write:");
++	return ret;
++}
++
+ static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = {
+ 	.attr = {.name = "dispatched", .mode = S_IRUGO },
+ 	.show = blk_mq_sysfs_dispatched_show,
+@@ -304,6 +345,11 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = {
+ 	.attr = {.name = "io_poll", .mode = S_IRUGO },
+ 	.show = blk_mq_hw_sysfs_poll_show,
+ };
++static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_stat = {
++	.attr = {.name = "stats", .mode = S_IRUGO | S_IWUSR },
++	.show = blk_mq_hw_sysfs_stat_show,
++	.store = blk_mq_hw_sysfs_stat_store,
++};
+ 
+ static struct attribute *default_hw_ctx_attrs[] = {
+ 	&blk_mq_hw_sysfs_queued.attr,
+@@ -314,6 +360,7 @@ static struct attribute *default_hw_ctx_attrs[] = {
+ 	&blk_mq_hw_sysfs_cpus.attr,
+ 	&blk_mq_hw_sysfs_active.attr,
+ 	&blk_mq_hw_sysfs_poll.attr,
++	&blk_mq_hw_sysfs_stat.attr,
+ 	NULL,
+ };
+ 
+diff --git a/block/blk-mq.c b/block/blk-mq.c
+index 13f5a6c..712f141 100644
+--- a/block/blk-mq.c
++++ b/block/blk-mq.c
+@@ -29,6 +29,7 @@
+ #include "blk.h"
+ #include "blk-mq.h"
+ #include "blk-mq-tag.h"
++#include "blk-stat.h"
+ 
+ static DEFINE_MUTEX(all_q_mutex);
+ static LIST_HEAD(all_q_list);
+@@ -400,10 +401,19 @@ static void blk_mq_ipi_complete_request(struct request *rq)
+ 	put_cpu();
+ }
+ 
++static void blk_mq_stat_add(struct request *rq)
++{
++	struct blk_rq_stat *stat = &rq->mq_ctx->stat[rq_data_dir(rq)];
++
++	blk_stat_add(stat, rq);
++}
++
+ static void __blk_mq_complete_request(struct request *rq)
+ {
+ 	struct request_queue *q = rq->q;
+ 
++	blk_mq_stat_add(rq);
++
+ 	if (!q->softirq_done_fn)
+ 		blk_mq_end_request(rq, rq->errors);
+ 	else
+@@ -447,6 +457,8 @@ void blk_mq_start_request(struct request *rq)
+ 	if (unlikely(blk_bidi_rq(rq)))
+ 		rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
+ 
++	rq->issue_time = ktime_to_ns(ktime_get());
++
+ 	blk_add_timer(rq);
+ 
+ 	/*
+@@ -1795,6 +1807,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
+ 		spin_lock_init(&__ctx->lock);
+ 		INIT_LIST_HEAD(&__ctx->rq_list);
+ 		__ctx->queue = q;
++		blk_stat_init(&__ctx->stat[0]);
++		blk_stat_init(&__ctx->stat[1]);
+ 
+ 		/* If the cpu isn't online, the cpu is mapped to first hctx */
+ 		if (!cpu_online(i))
+diff --git a/block/blk-mq.h b/block/blk-mq.h
+index 9087b11..e107f70 100644
+--- a/block/blk-mq.h
++++ b/block/blk-mq.h
+@@ -1,6 +1,8 @@
+ #ifndef INT_BLK_MQ_H
+ #define INT_BLK_MQ_H
+ 
++#include "blk-stat.h"
++
+ struct blk_mq_tag_set;
+ 
+ struct blk_mq_ctx {
+@@ -20,6 +22,7 @@ struct blk_mq_ctx {
+ 
+ 	/* incremented at completion time */
+ 	unsigned long		____cacheline_aligned_in_smp rq_completed[2];
++	struct blk_rq_stat	stat[2];
+ 
+ 	struct request_queue	*queue;
+ 	struct kobject		kobj;
+diff --git a/block/blk-stat.c b/block/blk-stat.c
+new file mode 100644
+index 0000000..3965e8a
+--- /dev/null
++++ b/block/blk-stat.c
+@@ -0,0 +1,220 @@
++/*
++ * Block stat tracking code
++ *
++ * Copyright (C) 2016 Jens Axboe
++ */
++#include <linux/kernel.h>
++#include <linux/blk-mq.h>
++
++#include "blk-stat.h"
++#include "blk-mq.h"
++
++static void blk_stat_flush_batch(struct blk_rq_stat *stat)
++{
++	if (!stat->nr_batch)
++		return;
++	if (!stat->nr_samples)
++		stat->mean = div64_s64(stat->batch, stat->nr_batch);
++	else {
++		stat->mean = div64_s64((stat->mean * stat->nr_samples) +
++					stat->batch,
++					stat->nr_samples + stat->nr_batch);
++	}
++
++	stat->nr_samples += stat->nr_batch;
++	stat->nr_batch = stat->batch = 0;
++}
++
++void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
++{
++	if (!src->nr_samples)
++		return;
++
++	blk_stat_flush_batch(src);
++
++	dst->min = min(dst->min, src->min);
++	dst->max = max(dst->max, src->max);
++
++	if (!dst->nr_samples)
++		dst->mean = src->mean;
++	else {
++		dst->mean = div64_s64((src->mean * src->nr_samples) +
++					(dst->mean * dst->nr_samples),
++					dst->nr_samples + src->nr_samples);
++	}
++	dst->nr_samples += src->nr_samples;
++}
++
++static void blk_mq_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
++{
++	struct blk_mq_hw_ctx *hctx;
++	struct blk_mq_ctx *ctx;
++	uint64_t latest = 0;
++	int i, j, nr;
++
++	blk_stat_init(&dst[0]);
++	blk_stat_init(&dst[1]);
++
++	nr = 0;
++	do {
++		uint64_t newest = 0;
++
++		queue_for_each_hw_ctx(q, hctx, i) {
++			hctx_for_each_ctx(hctx, ctx, j) {
++				if (!ctx->stat[0].nr_samples &&
++				    !ctx->stat[1].nr_samples)
++					continue;
++				if (ctx->stat[0].time > newest)
++					newest = ctx->stat[0].time;
++				if (ctx->stat[1].time > newest)
++					newest = ctx->stat[1].time;
++			}
++		}
++
++		/*
++		 * No samples
++		 */
++		if (!newest)
++			break;
++
++		if (newest > latest)
++			latest = newest;
++
++		queue_for_each_hw_ctx(q, hctx, i) {
++			hctx_for_each_ctx(hctx, ctx, j) {
++				if (ctx->stat[0].time == newest) {
++					blk_stat_sum(&dst[0], &ctx->stat[0]);
++					nr++;
++				}
++				if (ctx->stat[1].time == newest) {
++					blk_stat_sum(&dst[1], &ctx->stat[1]);
++					nr++;
++				}
++			}
++		}
++		/*
++		 * If we race on finding an entry, just loop back again.
++		 * Should be very rare.
++		 */
++	} while (!nr);
++
++	dst[0].time = dst[1].time = latest;
++}
++
++void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
++{
++	if (q->mq_ops)
++		blk_mq_stat_get(q, dst);
++	else {
++		memcpy(&dst[0], &q->rq_stats[0], sizeof(struct blk_rq_stat));
++		memcpy(&dst[1], &q->rq_stats[1], sizeof(struct blk_rq_stat));
++	}
++}
++
++void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst)
++{
++	struct blk_mq_ctx *ctx;
++	unsigned int i, nr;
++
++	nr = 0;
++	do {
++		uint64_t newest = 0;
++
++		hctx_for_each_ctx(hctx, ctx, i) {
++			if (!ctx->stat[0].nr_samples &&
++			    !ctx->stat[1].nr_samples)
++				continue;
++
++			if (ctx->stat[0].time > newest)
++				newest = ctx->stat[0].time;
++			if (ctx->stat[1].time > newest)
++				newest = ctx->stat[1].time;
++		}
++
++		if (!newest)
++			break;
++
++		hctx_for_each_ctx(hctx, ctx, i) {
++			if (ctx->stat[0].time == newest) {
++				blk_stat_sum(&dst[0], &ctx->stat[0]);
++				nr++;
++			}
++			if (ctx->stat[1].time == newest) {
++				blk_stat_sum(&dst[1], &ctx->stat[1]);
++				nr++;
++			}
++		}
++		/*
++		 * If we race on finding an entry, just loop back again.
++		 * Should be very rare, as the window is only updated
++		 * occasionally
++		 */
++	} while (!nr);
++}
++
++static void __blk_stat_init(struct blk_rq_stat *stat, s64 time_now)
++{
++	stat->min = -1ULL;
++	stat->max = stat->nr_samples = stat->mean = 0;
++	stat->batch = stat->nr_batch = 0;
++	stat->time = time_now & BLK_STAT_MASK;
++}
++
++void blk_stat_init(struct blk_rq_stat *stat)
++{
++	__blk_stat_init(stat, ktime_to_ns(ktime_get()));
++}
++
++static bool __blk_stat_is_current(struct blk_rq_stat *stat, s64 now)
++{
++	return (now & BLK_STAT_MASK) == (stat->time & BLK_STAT_MASK);
++}
++
++bool blk_stat_is_current(struct blk_rq_stat *stat)
++{
++	return __blk_stat_is_current(stat, ktime_to_ns(ktime_get()));
++}
++
++void blk_stat_add(struct blk_rq_stat *stat, struct request *rq)
++{
++	s64 now, value;
++
++	now = ktime_to_ns(ktime_get());
++	if (now < rq->issue_time)
++		return;
++
++	if (!__blk_stat_is_current(stat, now))
++		__blk_stat_init(stat, now);
++
++	value = now - rq->issue_time;
++	if (value > stat->max)
++		stat->max = value;
++	if (value < stat->min)
++		stat->min = value;
++
++	if (stat->batch + value < stat->batch ||
++	    stat->nr_batch + 1 == BLK_RQ_STAT_BATCH)
++		blk_stat_flush_batch(stat);
++
++	stat->batch += value;
++	stat->nr_batch++;
++}
++
++void blk_stat_clear(struct request_queue *q)
++{
++	if (q->mq_ops) {
++		struct blk_mq_hw_ctx *hctx;
++		struct blk_mq_ctx *ctx;
++		int i, j;
++
++		queue_for_each_hw_ctx(q, hctx, i) {
++			hctx_for_each_ctx(hctx, ctx, j) {
++				blk_stat_init(&ctx->stat[0]);
++				blk_stat_init(&ctx->stat[1]);
++			}
++		}
++	} else {
++		blk_stat_init(&q->rq_stats[0]);
++		blk_stat_init(&q->rq_stats[1]);
++	}
++}
+diff --git a/block/blk-stat.h b/block/blk-stat.h
+new file mode 100644
+index 0000000..376a6cc
+--- /dev/null
++++ b/block/blk-stat.h
+@@ -0,0 +1,18 @@
++#ifndef BLK_STAT_H
++#define BLK_STAT_H
++
++/*
++ * ~0.13s window as a power-of-2 (2^27 nsecs)
++ */
++#define BLK_STAT_NSEC		134217728ULL
++#define BLK_STAT_MASK		~(BLK_STAT_NSEC - 1)
++
++void blk_stat_add(struct blk_rq_stat *, struct request *);
++void blk_hctx_stat_get(struct blk_mq_hw_ctx *, struct blk_rq_stat *);
++void blk_queue_stat_get(struct request_queue *, struct blk_rq_stat *);
++void blk_stat_clear(struct request_queue *q);
++void blk_stat_init(struct blk_rq_stat *);
++void blk_stat_sum(struct blk_rq_stat *, struct blk_rq_stat *);
++bool blk_stat_is_current(struct blk_rq_stat *);
++
++#endif
+diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
+index f87a7e7..0b9e435 100644
+--- a/block/blk-sysfs.c
++++ b/block/blk-sysfs.c
+@@ -384,6 +384,26 @@ static ssize_t queue_dax_show(struct request_queue *q, char *page)
+ 	return queue_var_show(blk_queue_dax(q), page);
+ }
+ 
++static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
++{
++	return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
++			pre, (long long) stat->nr_samples,
++			(long long) stat->mean, (long long) stat->min,
++			(long long) stat->max);
++}
++
++static ssize_t queue_stats_show(struct request_queue *q, char *page)
++{
++	struct blk_rq_stat stat[2];
++	ssize_t ret;
++
++	blk_queue_stat_get(q, stat);
++
++	ret = print_stat(page, &stat[0], "read :");
++	ret += print_stat(page + ret, &stat[1], "write:");
++	return ret;
++}
++
+ static struct queue_sysfs_entry queue_requests_entry = {
+ 	.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
+ 	.show = queue_requests_show,
+@@ -526,6 +546,11 @@ static struct queue_sysfs_entry queue_dax_entry = {
+ 	.show = queue_dax_show,
+ };
+ 
++static struct queue_sysfs_entry queue_stats_entry = {
++	.attr = {.name = "stats", .mode = S_IRUGO },
++	.show = queue_stats_show,
++};
++
+ static struct attribute *default_attrs[] = {
+ 	&queue_requests_entry.attr,
+ 	&queue_ra_entry.attr,
+@@ -553,6 +578,7 @@ static struct attribute *default_attrs[] = {
+ 	&queue_poll_entry.attr,
+ 	&queue_wc_entry.attr,
+ 	&queue_dax_entry.attr,
++	&queue_stats_entry.attr,
+ 	NULL,
+ };
+ 
+diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
+index be4409b..95fbfa1 100644
+--- a/include/linux/blk_types.h
++++ b/include/linux/blk_types.h
+@@ -266,4 +266,16 @@ static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie)
+ 	return cookie & ((1u << BLK_QC_T_SHIFT) - 1);
+ }
+ 
++#define BLK_RQ_STAT_BATCH	64
++
++struct blk_rq_stat {
++	s64 mean;
++	u64 min;
++	u64 max;
++	s32 nr_samples;
++	s32 nr_batch;
++	u64 batch;
++	s64 time;
++};
++
+ #endif /* __LINUX_BLK_TYPES_H */
+diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
+index 1d12aa6..259eba8 100644
+--- a/include/linux/blkdev.h
++++ b/include/linux/blkdev.h
+@@ -151,6 +151,7 @@ struct request {
+ 	struct gendisk *rq_disk;
+ 	struct hd_struct *part;
+ 	unsigned long start_time;
++	s64 issue_time;
+ #ifdef CONFIG_BLK_CGROUP
+ 	struct request_list *rl;		/* rl this rq is alloced from */
+ 	unsigned long long start_time_ns;
+@@ -414,6 +415,9 @@ struct request_queue {
+ 
+ 	unsigned int		nr_sorted;
+ 	unsigned int		in_flight[2];
++
++	struct blk_rq_stat	rq_stats[2];
++
+ 	/*
+ 	 * Number of active block driver functions for which blk_drain_queue()
+ 	 * must wait. Must be incremented around functions that unlock the
+--
+cgit v0.11.2
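The "roughly 0.1s interval windows" from the commit message come from the power-of-two mask in blk-stat.h above: two samples belong to the same window exactly when their masked timestamps match. A small demonstration using the constants from the patch (the sample times are made up):

#include <stdio.h>
#include <stdint.h>

/* BLK_STAT_NSEC/BLK_STAT_MASK are copied from blk-stat.h above. */
#define BLK_STAT_NSEC	134217728ULL		/* 2^27 ns, ~0.13 s */
#define BLK_STAT_MASK	(~(BLK_STAT_NSEC - 1))

static int same_window(uint64_t a, uint64_t b)
{
	return (a & BLK_STAT_MASK) == (b & BLK_STAT_MASK);
}

int main(void)
{
	uint64_t t1 = 10 * BLK_STAT_NSEC + 1000;	/* inside window 10 */
	uint64_t t2 = t1 + BLK_STAT_NSEC / 2;		/* still window 10 */
	uint64_t t3 = t1 + 2 * BLK_STAT_NSEC;		/* window 12 */

	printf("%d %d\n", same_window(t1, t2), same_window(t1, t3));	/* 1 0 */
	return 0;
}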
+From 9a38b8e46f9f759dbb3fd81810579ac1013bf814 Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@fb.com>
+Date: Thu, 8 Sep 2016 11:07:16 -0600
+Subject: wbt: add general throttling mechanism
+
+We can hook this up to the block layer, to help throttle buffered
+writes. Or NFS can tap into it, to accomplish the same.
+
+wbt registers a few trace points that can be used to track what is
+happening in the system:
+
+wbt_lat: 259:0: latency 2446318
+wbt_stat: 259:0: rmean=2446318, rmin=2446318, rmax=2446318, rsamples=1,
+	wmean=518866, wmin=15522, wmax=5330353, wsamples=57
+wbt_step: 259:0: step down: step=1, window=72727272, background=8, normal=16, max=32
+
+This shows a sync issue event (wbt_lat) that exceeded its time. wbt_stat
+dumps the current read/write stats for that window, and wbt_step shows a
+step down event where we now scale back writes. Each trace includes the
+device, 259:0 in this case.
+
+Signed-off-by: Jens Axboe <axboe@fb.com>
+---
+ include/linux/wbt.h        | 120 ++++++++
+ include/trace/events/wbt.h | 153 ++++++++++
+ lib/Kconfig                |   3 +
+ lib/Makefile               |   1 +
+ lib/wbt.c                  | 681 +++++++++++++++++++++++++++++++++++++++++++++
+ 5 files changed, 958 insertions(+)
+ create mode 100644 include/linux/wbt.h
+ create mode 100644 include/trace/events/wbt.h
+ create mode 100644 lib/wbt.c
+
+diff --git a/include/linux/wbt.h b/include/linux/wbt.h
+new file mode 100644
+index 0000000..5ffcd14
+--- /dev/null
++++ b/include/linux/wbt.h
+@@ -0,0 +1,120 @@
++#ifndef WB_THROTTLE_H
++#define WB_THROTTLE_H
++
++#include <linux/atomic.h>
++#include <linux/wait.h>
++#include <linux/timer.h>
++#include <linux/ktime.h>
++
++enum {
++	ISSUE_STAT_TRACKED	= 1ULL << 63,
++	ISSUE_STAT_READ		= 1ULL << 62,
++	ISSUE_STAT_MASK		= ISSUE_STAT_TRACKED | ISSUE_STAT_READ,
++	ISSUE_STAT_TIME_MASK	= ~ISSUE_STAT_MASK,
++
++	WBT_TRACKED		= 1,
++	WBT_READ		= 2,
++};
++
++struct wb_issue_stat {
++	u64 time;
++};
++
++static inline void wbt_issue_stat_set_time(struct wb_issue_stat *stat)
++{
++	stat->time = (stat->time & ISSUE_STAT_MASK) |
++			(ktime_to_ns(ktime_get()) & ISSUE_STAT_TIME_MASK);
++}
++
++static inline u64 wbt_issue_stat_get_time(struct wb_issue_stat *stat)
++{
++	return stat->time & ISSUE_STAT_TIME_MASK;
++}
++
++static inline void wbt_mark_tracked(struct wb_issue_stat *stat)
++{
++	stat->time |= ISSUE_STAT_TRACKED;
++}
++
++static inline void wbt_clear_state(struct wb_issue_stat *stat)
++{
++	stat->time &= ~(ISSUE_STAT_TRACKED | ISSUE_STAT_READ);
++}
++
++static inline bool wbt_tracked(struct wb_issue_stat *stat)
++{
++	return (stat->time & ISSUE_STAT_TRACKED) != 0;
++}
++
++static inline void wbt_mark_read(struct wb_issue_stat *stat)
++{
++	stat->time |= ISSUE_STAT_READ;
++}
++
++static inline bool wbt_is_read(struct wb_issue_stat *stat)
++{
++	return (stat->time & ISSUE_STAT_READ) != 0;
++}
++
++struct wb_stat_ops {
++	void (*get)(void *, struct blk_rq_stat *);
++	bool (*is_current)(struct blk_rq_stat *);
++	void (*clear)(void *);
++};
++
++struct rq_wb {
++	/*
++	 * Settings that govern how we throttle
++	 */
++	unsigned int wb_background;		/* background writeback */
++	unsigned int wb_normal;			/* normal writeback */
++	unsigned int wb_max;			/* max throughput writeback */
++	int scale_step;
++	bool scaled_max;
++
++	u64 win_nsec;				/* default window size */
++	u64 cur_win_nsec;			/* current window size */
++
++	/*
++	 * Number of consecutive periods where we don't have enough
++	 * information to make a firm scale up/down decision.
++	 */
++	unsigned int unknown_cnt;
++
++	struct timer_list window_timer;
++
++	s64 sync_issue;
++	void *sync_cookie;
++
++	unsigned int wc;
++	unsigned int queue_depth;
++
++	unsigned long last_issue;		/* last non-throttled issue */
++	unsigned long last_comp;		/* last non-throttled comp */
++	unsigned long min_lat_nsec;
++	struct backing_dev_info *bdi;
++	struct request_queue *q;
++	wait_queue_head_t wait;
++	atomic_t inflight;
++
++	struct wb_stat_ops *stat_ops;
++	void *ops_data;
++};
++
++struct backing_dev_info;
++
++void __wbt_done(struct rq_wb *);
++void wbt_done(struct rq_wb *, struct wb_issue_stat *);
++unsigned int wbt_wait(struct rq_wb *, unsigned int, spinlock_t *);
++struct rq_wb *wbt_init(struct backing_dev_info *, struct wb_stat_ops *, void *);
++void wbt_exit(struct rq_wb *);
++void wbt_update_limits(struct rq_wb *);
++void wbt_requeue(struct rq_wb *, struct wb_issue_stat *);
++void wbt_issue(struct rq_wb *, struct wb_issue_stat *);
++void wbt_disable(struct rq_wb *);
++void wbt_track(struct wb_issue_stat *, unsigned int);
++
++void wbt_set_queue_depth(struct rq_wb *, unsigned int);
++void wbt_set_write_cache(struct rq_wb *, bool);
++
++#endif
+diff --git a/include/trace/events/wbt.h b/include/trace/events/wbt.h
+new file mode 100644
+index 0000000..926c7ee
+--- /dev/null
++++ b/include/trace/events/wbt.h
+@@ -0,0 +1,153 @@
++#undef TRACE_SYSTEM
++#define TRACE_SYSTEM wbt
++
++#if !defined(_TRACE_WBT_H) || defined(TRACE_HEADER_MULTI_READ)
++#define _TRACE_WBT_H
++
++#include <linux/tracepoint.h>
++#include <linux/wbt.h>
++
++/**
++ * wbt_stat - trace stats for blk_wb
++ * @stat: array of read/write stats
++ */
++TRACE_EVENT(wbt_stat,
++
++	TP_PROTO(struct backing_dev_info *bdi, struct blk_rq_stat *stat),
++
++	TP_ARGS(bdi, stat),
++
++	TP_STRUCT__entry(
++		__array(char, name, 32)
++		__field(s64, rmean)
++		__field(u64, rmin)
++		__field(u64, rmax)
++		__field(s64, rnr_samples)
++		__field(s64, rtime)
++		__field(s64, wmean)
++		__field(u64, wmin)
++		__field(u64, wmax)
++		__field(s64, wnr_samples)
++		__field(s64, wtime)
++	),
++
++	TP_fast_assign(
++		strncpy(__entry->name, dev_name(bdi->dev), 32);
++		__entry->rmean		= stat[0].mean;
++		__entry->rmin		= stat[0].min;
++		__entry->rmax		= stat[0].max;
++		__entry->rnr_samples	= stat[0].nr_samples;
++		__entry->wmean		= stat[1].mean;
++		__entry->wmin		= stat[1].min;
++		__entry->wmax		= stat[1].max;
++		__entry->wnr_samples	= stat[1].nr_samples;
++	),
++
++	TP_printk("%s: rmean=%llu, rmin=%llu, rmax=%llu, rsamples=%llu, "
++		  "wmean=%llu, wmin=%llu, wmax=%llu, wsamples=%llu\n",
++		  __entry->name, __entry->rmean, __entry->rmin, __entry->rmax,
++		  __entry->rnr_samples, __entry->wmean, __entry->wmin,
++		  __entry->wmax, __entry->wnr_samples)
++);
++
++/**
++ * wbt_lat - trace latency event
++ * @lat: latency trigger
++ */
++TRACE_EVENT(wbt_lat,
++
++	TP_PROTO(struct backing_dev_info *bdi, unsigned long lat),
++
++	TP_ARGS(bdi, lat),
++
++	TP_STRUCT__entry(
++		__array(char, name, 32)
++		__field(unsigned long, lat)
++	),
++
++	TP_fast_assign(
++		strncpy(__entry->name, dev_name(bdi->dev), 32);
++		__entry->lat = div_u64(lat, 1000);
++	),
++
++	TP_printk("%s: latency %lluus\n", __entry->name,
++			(unsigned long long) __entry->lat)
++);
++
++/**
++ * wbt_step - trace wb event step
++ * @msg: context message
++ * @step: the current scale step count
++ * @window: the current monitoring window
++ * @bg: the current background queue limit
++ * @normal: the current normal writeback limit
++ * @max: the current max throughput writeback limit
++ */
++TRACE_EVENT(wbt_step,
++
++	TP_PROTO(struct backing_dev_info *bdi, const char *msg,
++		 int step, unsigned long window, unsigned int bg,
++		 unsigned int normal, unsigned int max),
++
++	TP_ARGS(bdi, msg, step, window, bg, normal, max),
++
++	TP_STRUCT__entry(
++		__array(char, name, 32)
++		__field(const char *, msg)
++		__field(int, step)
++		__field(unsigned long, window)
++		__field(unsigned int, bg)
++		__field(unsigned int, normal)
++		__field(unsigned int, max)
++	),
++
++	TP_fast_assign(
++		strncpy(__entry->name, dev_name(bdi->dev), 32);
++		__entry->msg	= msg;
++		__entry->step	= step;
++		__entry->window	= div_u64(window, 1000);
++		__entry->bg	= bg;
++		__entry->normal	= normal;
++		__entry->max	= max;
++	),
++
++	TP_printk("%s: %s: step=%d, window=%luus, background=%u, normal=%u, max=%u\n",
++		  __entry->name, __entry->msg, __entry->step, __entry->window,
++		  __entry->bg, __entry->normal, __entry->max)
++);
++
++/**
++ * wbt_timer - trace wb timer event
++ * @status: timer state status
++ * @step: the current scale step count
++ * @inflight: tracked writes inflight
++ */
++TRACE_EVENT(wbt_timer,
++
++	TP_PROTO(struct backing_dev_info *bdi, unsigned int status,
++		 int step, unsigned int inflight),
++
++	TP_ARGS(bdi, status, step, inflight),
++
++	TP_STRUCT__entry(
++		__array(char, name, 32)
++		__field(unsigned int, status)
++		__field(int, step)
++		__field(unsigned int, inflight)
++	),
++
++	TP_fast_assign(
++		strncpy(__entry->name, dev_name(bdi->dev), 32);
++		__entry->status		= status;
++		__entry->step		= step;
++		__entry->inflight	= inflight;
++	),
++
++	TP_printk("%s: status=%u, step=%d, inflight=%u\n", __entry->name,
++		  __entry->status, __entry->step, __entry->inflight)
++);
++
++#endif /* _TRACE_WBT_H */
++
++/* This part must be outside protection */
++#include <trace/define_trace.h>
+diff --git a/lib/Kconfig b/lib/Kconfig
+index d79909d..c585e4c 100644
+--- a/lib/Kconfig
++++ b/lib/Kconfig
+@@ -550,4 +550,7 @@ config STACKDEPOT
+ 	bool
+ 	select STACKTRACE
+ 
++config WBT
++	bool
++
+ endmenu
+diff --git a/lib/Makefile b/lib/Makefile
+index 5dc77a8..23afd63 100644
+--- a/lib/Makefile
++++ b/lib/Makefile
+@@ -177,6 +177,7 @@ obj-$(CONFIG_SG_SPLIT) += sg_split.o
+ obj-$(CONFIG_SG_POOL) += sg_pool.o
+ obj-$(CONFIG_STMP_DEVICE) += stmp_device.o
+ obj-$(CONFIG_IRQ_POLL) += irq_poll.o
++obj-$(CONFIG_WBT) += wbt.o
+ 
+ obj-$(CONFIG_STACKDEPOT) += stackdepot.o
+ KASAN_SANITIZE_stackdepot.o := n
+diff --git a/lib/wbt.c b/lib/wbt.c
+new file mode 100644
+index 0000000..a995703
+--- /dev/null
++++ b/lib/wbt.c
+@@ -0,0 +1,681 @@
++/*
++ * buffered writeback throttling. loosely based on CoDel. We can't drop
++ * packets for IO scheduling, so the logic is something like this:
++ *
++ * - Monitor latencies in a defined window of time.
++ * - If the minimum latency in the above window exceeds some target, increment
++ *   scaling step and scale down queue depth by a factor of 2x. The monitoring
++ *   window is then shrunk to 100 / sqrt(scaling step + 1).
++ * - For any window where we don't have solid data on what the latencies
++ *   look like, retain status quo.
++ * - If latencies look good, decrement scaling step.
++ * - If we're only doing writes, allow the scaling step to go negative. This
++ *   will temporarily boost write performance, snapping back to a stable
++ *   scaling step of 0 if reads show up or the heavy writers finish. Unlike
++ *   positive scaling steps where we shrink the monitoring window, a negative
++ *   scaling step retains the default step==0 window size.
++ *
++ * Copyright (C) 2016 Jens Axboe
++ *
++ */
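As an illustration of the scaling rules this header comment describes (not part of the patch itself), the following computes the allowed write depth and monitoring window for the first few positive scaling steps, using the RWB_DEF_DEPTH = 16 default and the 100ms base window defined in the code below:

#include <stdio.h>
#include <math.h>

/* Model of the scaling rules described above: each positive step
 * roughly halves the allowed write depth (the shift calc_wb_limits()
 * below uses) and shrinks the 100ms monitoring window by a factor of
 * 1/sqrt(step + 1). Build with -lm. */
int main(void)
{
	const unsigned int def_depth = 16;	/* RWB_DEF_DEPTH */
	const double win_ms = 100.0;		/* RWB_WINDOW_NSEC */

	for (int step = 0; step <= 4; step++) {
		unsigned int depth = 1 + ((def_depth - 1) >> step);

		printf("step=%d depth=%u window=%.1fms\n",
		       step, depth, win_ms / sqrt(step + 1.0));
	}
	return 0;
}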
++#include <linux/kernel.h>
++#include <linux/blk_types.h>
++#include <linux/slab.h>
++#include <linux/backing-dev.h>
++#include <linux/wbt.h>
++
++#define CREATE_TRACE_POINTS
++#include <trace/events/wbt.h>
++
++enum {
++	/*
++	 * Default setting, we'll scale up (to 75% of QD max) or down (min 1)
++	 * from here depending on device stats
++	 */
++	RWB_DEF_DEPTH	= 16,
++
++	/*
++	 * 100msec window
++	 */
++	RWB_WINDOW_NSEC		= 100 * 1000 * 1000ULL,
++
++	/*
++	 * Disregard stats, if we don't meet this minimum
++	 */
++	RWB_MIN_WRITE_SAMPLES	= 3,
++
++	/*
++	 * If we have this number of consecutive windows with not enough
++	 * information to scale up or down, scale up.
++	 */
++	RWB_UNKNOWN_BUMP	= 5,
++};
++
++static inline bool rwb_enabled(struct rq_wb *rwb)
++{
++	return rwb && rwb->wb_normal != 0;
++}
++
++/*
++ * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
++ * false if 'v' + 1 would be bigger than 'below'.
++ */
++static bool atomic_inc_below(atomic_t *v, int below)
++{
++	int cur = atomic_read(v);
++
++	for (;;) {
++		int old;
++
++		if (cur >= below)
++			return false;
++		old = atomic_cmpxchg(v, cur, cur + 1);
++		if (old == cur)
++			break;
++		cur = old;
++	}
++
++	return true;
++}
++
++static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
++{
++	if (rwb_enabled(rwb)) {
++		const unsigned long cur = jiffies;
++
++		if (cur != *var)
++			*var = cur;
++	}
++}
++
++/*
++ * If a task was rate throttled in balance_dirty_pages() within the last
++ * second or so, use that to indicate a higher cleaning rate.
++ */
++static bool wb_recent_wait(struct rq_wb *rwb)
++{
++	struct bdi_writeback *wb = &rwb->bdi->wb;
++
++	return time_before(jiffies, wb->dirty_sleep + HZ);
++}
++
++void __wbt_done(struct rq_wb *rwb)
++{
++	int inflight, limit;
++
++	inflight = atomic_dec_return(&rwb->inflight);
++
++	/*
++	 * wbt got disabled with IO in flight. Wake up any potential
++	 * waiters, we don't have to do more than that.
++	 */
++	if (unlikely(!rwb_enabled(rwb))) {
++		wake_up_all(&rwb->wait);
++		return;
++	}
++
++	/*
++	 * If the device does write back caching, drop further down
++	 * before we wake people up.
++	 */
++	if (rwb->wc && !wb_recent_wait(rwb))
++		limit = 0;
++	else
++		limit = rwb->wb_normal;
++
++	/*
++	 * Don't wake anyone up if we are above the normal limit.
++	 */
++	if (inflight && inflight >= limit)
++		return;
++
++	if (waitqueue_active(&rwb->wait)) {
++		int diff = limit - inflight;
++
++		if (!inflight || diff >= rwb->wb_background / 2)
++			wake_up(&rwb->wait);
++	}
++}
++
++/*
++ * Called on completion of a request. Note that it's also called when
++ * a request is merged, when the request gets freed.
++ */
++void wbt_done(struct rq_wb *rwb, struct wb_issue_stat *stat)
++{
++	if (!rwb)
++		return;
++
++	if (!wbt_tracked(stat)) {
++		if (rwb->sync_cookie == stat) {
++			rwb->sync_issue = 0;
++			rwb->sync_cookie = NULL;
++		}
++
++		if (wbt_is_read(stat))
++			wb_timestamp(rwb, &rwb->last_comp);
++		wbt_clear_state(stat);
++	} else {
++		WARN_ON_ONCE(stat == rwb->sync_cookie);
++		__wbt_done(rwb);
++		wbt_clear_state(stat);
++	}
++}
++
++/*
++ * Return true, if we can't increase the depth further by scaling
++ */
++static bool calc_wb_limits(struct rq_wb *rwb)
++{
++	unsigned int depth;
++	bool ret = false;
++
++	if (!rwb->min_lat_nsec) {
++		rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0;
++		return false;
++	}
++
++	/*
++	 * For QD=1 devices, this is a special case. It's important for those
++	 * to have one request ready when one completes, so force a depth of
++	 * 2 for those devices. On the backend, it'll be a depth of 1 anyway,
++	 * since the device can't have more than that in flight. If we're
++	 * scaling down, then keep a setting of 1/1/1.
++	 */
++	if (rwb->queue_depth == 1) {
++		if (rwb->scale_step > 0)
++			rwb->wb_max = rwb->wb_normal = 1;
++		else {
++			rwb->wb_max = rwb->wb_normal = 2;
++			ret = true;
++		}
++		rwb->wb_background = 1;
++	} else {
++		/*
++		 * scale_step == 0 is our default state. If we have suffered
++		 * latency spikes, step will be > 0, and we shrink the
++		 * allowed write depths. If step is < 0, we're only doing
++		 * writes, and we allow a temporarily higher depth to
++		 * increase performance.
++		 */
++		depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth);
++		if (rwb->scale_step > 0)
++			depth = 1 + ((depth - 1) >> min(31, rwb->scale_step));
++		else if (rwb->scale_step < 0) {
++			unsigned int maxd = 3 * rwb->queue_depth / 4;
++
++			depth = 1 + ((depth - 1) << -rwb->scale_step);
++			if (depth > maxd) {
++				depth = maxd;
++				ret = true;
++			}
++		}
++
++		/*
++		 * Set our max/normal/bg queue depths based on how far
++		 * we have scaled down (->scale_step).
++		 */
++		rwb->wb_max = depth;
++		rwb->wb_normal = (rwb->wb_max + 1) / 2;
++		rwb->wb_background = (rwb->wb_max + 3) / 4;
++	}
++
++	return ret;
++}
++
++static bool inline stat_sample_valid(struct blk_rq_stat *stat)
++{
++	/*
++	 * We need at least one read sample, and a minimum of
++	 * RWB_MIN_WRITE_SAMPLES. We require some write samples to know
++	 * that it's writes impacting us, and not just some sole read on
++	 * a device that is in a lower power state.
++	 */
++	return stat[0].nr_samples >= 1 &&
++		stat[1].nr_samples >= RWB_MIN_WRITE_SAMPLES;
++}
++
++static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
++{
++	u64 now, issue = ACCESS_ONCE(rwb->sync_issue);
++
++	if (!issue || !rwb->sync_cookie)
++		return 0;
++
++	now = ktime_to_ns(ktime_get());
++	return now - issue;
++}
++
++enum {
++	LAT_OK = 1,
++	LAT_UNKNOWN,
++	LAT_UNKNOWN_WRITES,
++	LAT_EXCEEDED,
++};
++
++static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
++{
++	u64 thislat;
++
++	/*
++	 * If our stored sync issue exceeds the window size, or it
++	 * exceeds our min target AND we haven't logged any entries,
++	 * flag the latency as exceeded. wbt works off completion latencies,
++	 * but for a flooded device, a single sync IO can take a long time
++	 * to complete after being issued. If this time exceeds our
++	 * monitoring window AND we didn't see any other completions in that
++	 * window, then count that sync IO as a violation of the latency.
++	 */
++	thislat = rwb_sync_issue_lat(rwb);
++	if (thislat > rwb->cur_win_nsec ||
++	    (thislat > rwb->min_lat_nsec && !stat[0].nr_samples)) {
++		trace_wbt_lat(rwb->bdi, thislat);
++		return LAT_EXCEEDED;
++	}
++
++	/*
++	 * No read/write mix, if stat isn't valid
++	 */
++	if (!stat_sample_valid(stat)) {
++		/*
++		 * If we had writes in this stat window and the window is
++		 * current, we're only doing writes. If a task recently
++		 * waited or still has writes in flights, consider us doing
++		 * just writes as well.
++		 */
++		if ((stat[1].nr_samples && rwb->stat_ops->is_current(stat)) ||
++		    wb_recent_wait(rwb) || atomic_read(&rwb->inflight))
++			return LAT_UNKNOWN_WRITES;
++		return LAT_UNKNOWN;
++	}
++
++	/*
++	 * If the 'min' latency exceeds our target, step down.
++ */ ++ if (stat[0].min > rwb->min_lat_nsec) { ++ trace_wbt_lat(rwb->bdi, stat[0].min); ++ trace_wbt_stat(rwb->bdi, stat); ++ return LAT_EXCEEDED; ++ } ++ ++ if (rwb->scale_step) ++ trace_wbt_stat(rwb->bdi, stat); ++ ++ return LAT_OK; ++} ++ ++static int latency_exceeded(struct rq_wb *rwb) ++{ ++ struct blk_rq_stat stat[2]; ++ ++ rwb->stat_ops->get(rwb->ops_data, stat); ++ return __latency_exceeded(rwb, stat); ++} ++ ++static void rwb_trace_step(struct rq_wb *rwb, const char *msg) ++{ ++ trace_wbt_step(rwb->bdi, msg, rwb->scale_step, rwb->cur_win_nsec, ++ rwb->wb_background, rwb->wb_normal, rwb->wb_max); ++} ++ ++static void scale_up(struct rq_wb *rwb) ++{ ++ /* ++ * Hit max in previous round, stop here ++ */ ++ if (rwb->scaled_max) ++ return; ++ ++ rwb->scale_step--; ++ rwb->unknown_cnt = 0; ++ rwb->stat_ops->clear(rwb->ops_data); ++ ++ rwb->scaled_max = calc_wb_limits(rwb); ++ ++ if (waitqueue_active(&rwb->wait)) ++ wake_up_all(&rwb->wait); ++ ++ rwb_trace_step(rwb, "step up"); ++} ++ ++/* ++ * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we ++ * had a latency violation. ++ */ ++static void scale_down(struct rq_wb *rwb, bool hard_throttle) ++{ ++ /* ++ * Stop scaling down when we've hit the limit. This also prevents ++ * ->scale_step from going to crazy values, if the device can't ++ * keep up. ++ */ ++ if (rwb->wb_max == 1) ++ return; ++ ++ if (rwb->scale_step < 0 && hard_throttle) ++ rwb->scale_step = 0; ++ else ++ rwb->scale_step++; ++ ++ rwb->scaled_max = false; ++ rwb->unknown_cnt = 0; ++ rwb->stat_ops->clear(rwb->ops_data); ++ calc_wb_limits(rwb); ++ rwb_trace_step(rwb, "step down"); ++} ++ ++static void rwb_arm_timer(struct rq_wb *rwb) ++{ ++ unsigned long expires; ++ ++ if (rwb->scale_step > 0) { ++ /* ++ * We should speed this up, using some variant of a fast ++ * integer inverse square root calculation. Since we only do ++ * this for every window expiration, it's not a huge deal, ++ * though. ++ */ ++ rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4, ++ int_sqrt((rwb->scale_step + 1) << 8)); ++ } else { ++ /* ++ * For step < 0, we don't want to increase/decrease the ++ * window size. ++ */ ++ rwb->cur_win_nsec = rwb->win_nsec; ++ } ++ ++ expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec); ++ mod_timer(&rwb->window_timer, expires); ++} ++ ++static void wb_timer_fn(unsigned long data) ++{ ++ struct rq_wb *rwb = (struct rq_wb *) data; ++ int status, inflight; ++ ++ inflight = atomic_read(&rwb->inflight); ++ ++ status = latency_exceeded(rwb); ++ ++ trace_wbt_timer(rwb->bdi, status, rwb->scale_step, inflight); ++ ++ /* ++ * If we exceeded the latency target, step down. If we did not, ++ * step one level up. If we don't know enough to say either exceeded ++ * or ok, then don't do anything. ++ */ ++ switch (status) { ++ case LAT_EXCEEDED: ++ scale_down(rwb, true); ++ break; ++ case LAT_OK: ++ scale_up(rwb); ++ break; ++ case LAT_UNKNOWN_WRITES: ++ scale_up(rwb); ++ break; ++ case LAT_UNKNOWN: ++ if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP) ++ break; ++ /* ++ * We get here for two reasons: ++ * ++ * 1) We previously scaled reduced depth, and we currently ++ * don't have a valid read/write sample. For that case, ++ * slowly return to center state (step == 0). ++ * 2) We started a the center step, but don't have a valid ++ * read/write sample, but we do have writes going on. ++ * Allow step to go negative, to increase write perf. 
++ */ ++ if (rwb->scale_step > 0) ++ scale_up(rwb); ++ else if (rwb->scale_step < 0) ++ scale_down(rwb, false); ++ break; ++ default: ++ break; ++ } ++ ++ /* ++ * Re-arm timer, if we have IO in flight ++ */ ++ if (rwb->scale_step || inflight) ++ rwb_arm_timer(rwb); ++} ++ ++void wbt_update_limits(struct rq_wb *rwb) ++{ ++ rwb->scale_step = 0; ++ rwb->scaled_max = false; ++ calc_wb_limits(rwb); ++ ++ if (waitqueue_active(&rwb->wait)) ++ wake_up_all(&rwb->wait); ++} ++ ++static bool close_io(struct rq_wb *rwb) ++{ ++ const unsigned long now = jiffies; ++ ++ return time_before(now, rwb->last_issue + HZ / 10) || ++ time_before(now, rwb->last_comp + HZ / 10); ++} ++ ++#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO) ++ ++static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) ++{ ++ unsigned int limit; ++ ++ /* ++ * At this point we know it's a buffered write. If REQ_SYNC is ++ * set, then it's WB_SYNC_ALL writeback, and we'll use the max ++ * limit for that. If the write is marked as a background write, ++ * then use the idle limit, or go to normal if we haven't had ++ * competing IO for a bit. ++ */ ++ if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb)) ++ limit = rwb->wb_max; ++ else if ((rw & REQ_BG) || close_io(rwb)) { ++ /* ++ * If less than 100ms since we completed unrelated IO, ++ * limit us to half the depth for background writeback. ++ */ ++ limit = rwb->wb_background; ++ } else ++ limit = rwb->wb_normal; ++ ++ return limit; ++} ++ ++static inline bool may_queue(struct rq_wb *rwb, unsigned long rw) ++{ ++ /* ++ * inc it here even if disabled, since we'll dec it at completion. ++ * this only happens if the task was sleeping in __wbt_wait(), ++ * and someone turned it off at the same time. ++ */ ++ if (!rwb_enabled(rwb)) { ++ atomic_inc(&rwb->inflight); ++ return true; ++ } ++ ++ return atomic_inc_below(&rwb->inflight, get_limit(rwb, rw)); ++} ++ ++/* ++ * Block if we will exceed our limit, or if we are currently waiting for ++ * the timer to kick off queuing again. ++ */ ++static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock) ++{ ++ DEFINE_WAIT(wait); ++ ++ if (may_queue(rwb, rw)) ++ return; ++ ++ do { ++ prepare_to_wait_exclusive(&rwb->wait, &wait, ++ TASK_UNINTERRUPTIBLE); ++ ++ if (may_queue(rwb, rw)) ++ break; ++ ++ if (lock) ++ spin_unlock_irq(lock); ++ ++ io_schedule(); ++ ++ if (lock) ++ spin_lock_irq(lock); ++ } while (1); ++ ++ finish_wait(&rwb->wait, &wait); ++} ++ ++static inline bool wbt_should_throttle(struct rq_wb *rwb, unsigned int rw) ++{ ++ const int op = rw >> BIO_OP_SHIFT; ++ ++ /* ++ * If not a WRITE (or a discard), do nothing ++ */ ++ if (!(op == REQ_OP_WRITE || op == REQ_OP_DISCARD)) ++ return false; ++ ++ /* ++ * Don't throttle WRITE_ODIRECT ++ */ ++ if ((rw & (REQ_SYNC | REQ_NOIDLE)) == REQ_SYNC) ++ return false; ++ ++ return true; ++} ++ ++/* ++ * Returns true if the IO request should be accounted, false if not. ++ * May sleep, if we have exceeded the writeback limits. Caller can pass ++ * in an irq held spinlock, if it holds one when calling this function. ++ * If we do sleep, we'll release and re-grab it. 
++ */
++unsigned int wbt_wait(struct rq_wb *rwb, unsigned int rw, spinlock_t *lock)
++{
++	unsigned int ret = 0;
++
++	if (!rwb_enabled(rwb))
++		return 0;
++
++	if ((rw >> BIO_OP_SHIFT) == REQ_OP_READ)
++		ret = WBT_READ;
++
++	if (!wbt_should_throttle(rwb, rw)) {
++		if (ret & WBT_READ)
++			wb_timestamp(rwb, &rwb->last_issue);
++		return ret;
++	}
++
++	__wbt_wait(rwb, rw, lock);
++
++	if (!timer_pending(&rwb->window_timer))
++		rwb_arm_timer(rwb);
++
++	return ret | WBT_TRACKED;
++}
++
++void wbt_issue(struct rq_wb *rwb, struct wb_issue_stat *stat)
++{
++	if (!rwb_enabled(rwb))
++		return;
++
++	wbt_issue_stat_set_time(stat);
++
++	/*
++	 * Track sync issue, in case it takes a long time to complete. Allows
++	 * us to react quicker, if a sync IO takes a long time to complete.
++	 * Note that this is just a hint. 'stat' can go away when the
++	 * request completes, so it's important we never dereference it. We
++	 * only use the address to compare with, which is why we store the
++	 * sync_issue time locally.
++	 */
++	if (wbt_is_read(stat) && !rwb->sync_issue) {
++		rwb->sync_cookie = stat;
++		rwb->sync_issue = wbt_issue_stat_get_time(stat);
++	}
++}
++
++void wbt_track(struct wb_issue_stat *stat, unsigned int wb_acct)
++{
++	if (wb_acct & WBT_TRACKED)
++		wbt_mark_tracked(stat);
++	else if (wb_acct & WBT_READ)
++		wbt_mark_read(stat);
++}
++
++void wbt_requeue(struct rq_wb *rwb, struct wb_issue_stat *stat)
++{
++	if (!rwb_enabled(rwb))
++		return;
++	if (stat == rwb->sync_cookie) {
++		rwb->sync_issue = 0;
++		rwb->sync_cookie = NULL;
++	}
++}
++
++void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
++{
++	if (rwb) {
++		rwb->queue_depth = depth;
++		wbt_update_limits(rwb);
++	}
++}
++
++void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
++{
++	if (rwb)
++		rwb->wc = write_cache_on;
++}
++
++void wbt_disable(struct rq_wb *rwb)
++{
++	if (rwb) {
++		del_timer_sync(&rwb->window_timer);
++		rwb->win_nsec = rwb->min_lat_nsec = 0;
++		wbt_update_limits(rwb);
++	}
++}
++EXPORT_SYMBOL_GPL(wbt_disable);
++
++struct rq_wb *wbt_init(struct backing_dev_info *bdi, struct wb_stat_ops *ops,
++		       void *ops_data)
++{
++	struct rq_wb *rwb;
++
++	if (!ops->get || !ops->is_current || !ops->clear)
++		return ERR_PTR(-EINVAL);
++
++	rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
++	if (!rwb)
++		return ERR_PTR(-ENOMEM);
++
++	atomic_set(&rwb->inflight, 0);
++	init_waitqueue_head(&rwb->wait);
++	setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb);
++	rwb->wc = 1;
++	rwb->queue_depth = RWB_DEF_DEPTH;
++	rwb->last_comp = rwb->last_issue = jiffies;
++	rwb->bdi = bdi;
++	rwb->win_nsec = RWB_WINDOW_NSEC;
++	rwb->stat_ops = ops;
++	rwb->ops_data = ops_data;
++	wbt_update_limits(rwb);
++	return rwb;
++}
++
++void wbt_exit(struct rq_wb *rwb)
++{
++	if (rwb) {
++		del_timer_sync(&rwb->window_timer);
++		kfree(rwb);
++	}
++}
+--
+cgit v0.11.2
+
+From db3de07314ef350fceb90ade08474fe4eea5e665 Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@fb.com>
+Date: Thu, 8 Sep 2016 11:08:17 -0600
+Subject: writeback: throttle buffered writeback
+
+Test patch that throttles buffered writeback to make it a lot
+smoother, and has way less impact on other system activity.
+Background writeback should be, by definition, background
+activity. The fact that we flush huge bundles of it at a time
+means that it potentially has heavy impacts on foreground workloads,
+which isn't ideal. We can't easily limit the sizes of writes that
+we do, since that would impact file system layout in the presence
+of delayed allocation. So just throttle back buffered writeback,
+unless someone is waiting for it.
+
+The algorithm for when to throttle takes its inspiration from the
+CoDel networking scheduling algorithm. Like CoDel, blk-wb monitors
+the minimum latencies of requests over a window of time. In that
+window of time, if the minimum latency of any request exceeds a
+given target, then a scale count is incremented and the queue depth
+is shrunk. The next monitoring window is shrunk accordingly. Unlike
+CoDel, if we hit a window that exhibits good behavior, then we
+simply decrement the scale count and re-calculate the limits for that
+scale value. This prevents us from oscillating between a
+close-to-ideal value and max all the time, instead remaining in the
+windows where we get good behavior.
+
+Unlike CoDel, blk-wb allows the scale count to go negative. This
+happens if we primarily have writes going on. Unlike positive
+scale counts, this doesn't change the size of the monitoring window.
+When the heavy writers finish, blk-wb quickly snaps back to its
+stable state of a zero scale count.
+
+The patch registers two sysfs entries. The first one, 'wbt_window_usec',
+defines the window of monitoring. The second one, 'wbt_lat_usec',
+sets the latency target for the window. It defaults to 2 msec for
+non-rotational storage, and 75 msec for rotational storage. Setting
+this value to '0' disables blk-wb. Generally, a user would not have
+to touch these settings.
+
+We don't enable WBT on devices that are managed with CFQ, and have
+a non-root block cgroup attached. If we have a proportional share setup
+on this particular disk, then the wbt throttling will interfere with
+that. We don't have a strong need for wbt for that case, since we will
+rely on CFQ doing that for us.
+
+Signed-off-by: Jens Axboe <axboe@fb.com>
+---
+ Documentation/block/queue-sysfs.txt |  13 ++++
+ block/Kconfig                       |   1 +
+ block/blk-core.c                    |  20 +++++-
+ block/blk-mq.c                      |  30 ++++++++-
+ block/blk-settings.c                |   3 +
+ block/blk-stat.c                    |   5 +-
+ block/blk-sysfs.c                   | 125 ++++++++++++++++++++++++++++++++++++
+ block/cfq-iosched.c                 |  13 ++++
+ include/linux/blkdev.h              |   6 +-
+ 9 files changed, 207 insertions(+), 9 deletions(-)
+
+diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
+index 2a39040..2847219 100644
+--- a/Documentation/block/queue-sysfs.txt
++++ b/Documentation/block/queue-sysfs.txt
+@@ -169,5 +169,18 @@ This is the number of bytes the device can write in a single write-same
+ command. A value of '0' means write-same is not supported by this
+ device.
+ 
++wbt_lat_usec (RW)
++-----------------
++If the device is registered for writeback throttling, then this file shows
++the target minimum read latency. If this latency is exceeded in a given
++window of time (see wbt_window_usec), then the writeback throttling will start
++scaling back writes.
++
++wbt_window_usec (RW)
++--------------------
++If the device is registered for writeback throttling, then this file shows
++the value of the monitoring window in which we'll look at the target
++latency. See wbt_lat_usec.
++
+ 
+ Jens Axboe <jens.axboe@oracle.com>, February 2009
+diff --git a/block/Kconfig b/block/Kconfig
+index 161491d..6da79e6 100644
+--- a/block/Kconfig
++++ b/block/Kconfig
+@@ -4,6 +4,7 @@
+ menuconfig BLOCK
+ 	bool "Enable the block layer" if EXPERT
+ 	default y
++	select WBT
+ 	help
+ 	  Provide block layer support for the kernel.
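The following is an editorial illustration, not part of the patch: the depth and window
scaling that calc_wb_limits() and rwb_arm_timer() above implement, re-derived in a
stand-alone program. It assumes a hypothetical device queue depth of 32 and the 100 msec
default window; build with 'cc -o wbt-scale wbt-scale.c -lm'.

	/* wbt-scale.c: mirrors the scale_step math from the wbt code above.
	 * Positive steps roughly halve the allowed write depth per step and
	 * shrink the monitoring window by a factor of sqrt(scale_step + 1). */
	#include <stdio.h>
	#include <math.h>

	#define RWB_DEF_DEPTH	16
	#define RWB_WINDOW_NSEC	(100 * 1000 * 1000ULL)

	int main(void)
	{
		unsigned int queue_depth = 32;	/* assumed device queue depth */
		int step;

		for (step = 0; step <= 3; step++) {
			unsigned int depth = queue_depth < RWB_DEF_DEPTH ?
						queue_depth : RWB_DEF_DEPTH;
			unsigned long long win = RWB_WINDOW_NSEC;

			if (step > 0) {
				/* calc_wb_limits(): depth = 1 + ((depth - 1) >> step) */
				depth = 1 + ((depth - 1) >> step);
				/* rwb_arm_timer(): (win << 4) / int_sqrt((step + 1) << 8),
				 * i.e. roughly win / sqrt(step + 1) */
				win = (win << 4) /
					(unsigned long long)sqrt((double)((step + 1) << 8));
			}
			printf("step %d: wb_max %2u wb_normal %2u wb_background %2u window %5.1f msec\n",
			       step, depth, (depth + 1) / 2, (depth + 3) / 4,
			       (double)win / 1e6);
		}
		return 0;
	}

For this configuration it prints depths of 16/8/4 at step 0 shrinking to 2/1/1 at step 3,
with the window contracting from 100 msec toward 50 msec. The wbt_lat_usec and
wbt_window_usec files added by the blk-sysfs.c hunk below expose the two underlying tunables.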
+ +diff --git a/block/blk-core.c b/block/blk-core.c +index 4075cbe..4f4ce05 100644 +--- a/block/blk-core.c ++++ b/block/blk-core.c +@@ -33,6 +33,7 @@ + #include <linux/ratelimit.h> + #include <linux/pm_runtime.h> + #include <linux/blk-cgroup.h> ++#include <linux/wbt.h> + + #define CREATE_TRACE_POINTS + #include <trace/events/block.h> +@@ -882,6 +883,8 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, + + fail: + blk_free_flush_queue(q->fq); ++ wbt_exit(q->rq_wb); ++ q->rq_wb = NULL; + return NULL; + } + EXPORT_SYMBOL(blk_init_allocated_queue); +@@ -1346,6 +1349,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq) + blk_delete_timer(rq); + blk_clear_rq_complete(rq); + trace_block_rq_requeue(q, rq); ++ wbt_requeue(q->rq_wb, &rq->wb_stat); + + if (rq->cmd_flags & REQ_QUEUED) + blk_queue_end_tag(q, rq); +@@ -1436,6 +1440,8 @@ void __blk_put_request(struct request_queue *q, struct request *req) + /* this is a bio leak */ + WARN_ON(req->bio != NULL); + ++ wbt_done(q->rq_wb, &req->wb_stat); ++ + /* + * Request may not have originated from ll_rw_blk. if not, + * it didn't come out of our reserved rq pools +@@ -1667,6 +1673,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) + int el_ret, rw_flags = 0, where = ELEVATOR_INSERT_SORT; + struct request *req; + unsigned int request_count = 0; ++ unsigned int wb_acct; + + /* + * low level driver can indicate that it wants pages above a +@@ -1719,6 +1726,8 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) + } + + get_rq: ++ wb_acct = wbt_wait(q->rq_wb, bio->bi_opf, q->queue_lock); ++ + /* + * This sync check and mask will be re-done in init_request_from_bio(), + * but we need to set it earlier to expose the sync flag to the +@@ -1738,11 +1747,15 @@ get_rq: + */ + req = get_request(q, bio_data_dir(bio), rw_flags, bio, GFP_NOIO); + if (IS_ERR(req)) { ++ if (wb_acct & WBT_TRACKED) ++ __wbt_done(q->rq_wb); + bio->bi_error = PTR_ERR(req); + bio_endio(bio); + goto out_unlock; + } + ++ wbt_track(&req->wb_stat, wb_acct); ++ + /* + * After dropping the lock and possibly sleeping here, our request + * may now be mergeable after it had proven unmergeable (above). 
+@@ -2475,7 +2488,7 @@ void blk_start_request(struct request *req) + { + blk_dequeue_request(req); + +- req->issue_time = ktime_to_ns(ktime_get()); ++ wbt_issue(req->q->rq_wb, &req->wb_stat); + + /* + * We are now handing the request to the hardware, initialize +@@ -2713,9 +2726,10 @@ void blk_finish_request(struct request *req, int error) + + blk_account_io_done(req); + +- if (req->end_io) ++ if (req->end_io) { ++ wbt_done(req->q->rq_wb, &req->wb_stat); + req->end_io(req, error); +- else { ++ } else { + if (blk_bidi_rq(req)) + __blk_put_request(req->next_rq->q, req->next_rq); + +diff --git a/block/blk-mq.c b/block/blk-mq.c +index 712f141..511289a 100644 +--- a/block/blk-mq.c ++++ b/block/blk-mq.c +@@ -22,6 +22,7 @@ + #include <linux/sched/sysctl.h> + #include <linux/delay.h> + #include <linux/crash_dump.h> ++#include <linux/wbt.h> + + #include <trace/events/block.h> + +@@ -319,6 +320,8 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, + + if (rq->cmd_flags & REQ_MQ_INFLIGHT) + atomic_dec(&hctx->nr_active); ++ ++ wbt_done(q->rq_wb, &rq->wb_stat); + rq->cmd_flags = 0; + + clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); +@@ -351,6 +354,7 @@ inline void __blk_mq_end_request(struct request *rq, int error) + blk_account_io_done(rq); + + if (rq->end_io) { ++ wbt_done(rq->q->rq_wb, &rq->wb_stat); + rq->end_io(rq, error); + } else { + if (unlikely(blk_bidi_rq(rq))) +@@ -457,7 +461,7 @@ void blk_mq_start_request(struct request *rq) + if (unlikely(blk_bidi_rq(rq))) + rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq); + +- rq->issue_time = ktime_to_ns(ktime_get()); ++ wbt_issue(q->rq_wb, &rq->wb_stat); + + blk_add_timer(rq); + +@@ -494,6 +498,7 @@ static void __blk_mq_requeue_request(struct request *rq) + struct request_queue *q = rq->q; + + trace_block_rq_requeue(q, rq); ++ wbt_requeue(q->rq_wb, &rq->wb_stat); + + if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { + if (q->dma_drain_size && blk_rq_bytes(rq)) +@@ -1312,6 +1317,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) + struct blk_plug *plug; + struct request *same_queue_rq = NULL; + blk_qc_t cookie; ++ unsigned int wb_acct; + + blk_queue_bounce(q, &bio); + +@@ -1326,9 +1332,16 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) + blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) + return BLK_QC_T_NONE; + ++ wb_acct = wbt_wait(q->rq_wb, bio->bi_opf, NULL); ++ + rq = blk_mq_map_request(q, bio, &data); +- if (unlikely(!rq)) ++ if (unlikely(!rq)) { ++ if (wb_acct & WBT_TRACKED) ++ __wbt_done(q->rq_wb); + return BLK_QC_T_NONE; ++ } ++ ++ wbt_track(&rq->wb_stat, wb_acct); + + cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); + +@@ -1405,6 +1418,7 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) + struct blk_map_ctx data; + struct request *rq; + blk_qc_t cookie; ++ unsigned int wb_acct; + + blk_queue_bounce(q, &bio); + +@@ -1421,9 +1435,16 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) + } else + request_count = blk_plug_queued_count(q); + ++ wb_acct = wbt_wait(q->rq_wb, bio->bi_opf, NULL); ++ + rq = blk_mq_map_request(q, bio, &data); +- if (unlikely(!rq)) ++ if (unlikely(!rq)) { ++ if (wb_acct & WBT_TRACKED) ++ __wbt_done(q->rq_wb); + return BLK_QC_T_NONE; ++ } ++ ++ wbt_track(&rq->wb_stat, wb_acct); + + cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); + +@@ -2147,6 +2168,9 @@ void blk_mq_free_queue(struct request_queue *q) + list_del_init(&q->all_q_node); + 
mutex_unlock(&all_q_mutex); + ++ wbt_exit(q->rq_wb); ++ q->rq_wb = NULL; ++ + blk_mq_del_queue_tag_set(q); + + blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); +diff --git a/block/blk-settings.c b/block/blk-settings.c +index f7e122e..746dc9f 100644 +--- a/block/blk-settings.c ++++ b/block/blk-settings.c +@@ -840,6 +840,7 @@ EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); + void blk_set_queue_depth(struct request_queue *q, unsigned int depth) + { + q->queue_depth = depth; ++ wbt_set_queue_depth(q->rq_wb, depth); + } + EXPORT_SYMBOL(blk_set_queue_depth); + +@@ -863,6 +864,8 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) + else + queue_flag_clear(QUEUE_FLAG_FUA, q); + spin_unlock_irq(q->queue_lock); ++ ++ wbt_set_write_cache(q->rq_wb, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); + } + EXPORT_SYMBOL_GPL(blk_queue_write_cache); + +diff --git a/block/blk-stat.c b/block/blk-stat.c +index 3965e8a..bdb16d8 100644 +--- a/block/blk-stat.c ++++ b/block/blk-stat.c +@@ -178,15 +178,16 @@ bool blk_stat_is_current(struct blk_rq_stat *stat) + void blk_stat_add(struct blk_rq_stat *stat, struct request *rq) + { + s64 now, value; ++ u64 rq_time = wbt_issue_stat_get_time(&rq->wb_stat); + + now = ktime_to_ns(ktime_get()); +- if (now < rq->issue_time) ++ if (now < rq_time) + return; + + if (!__blk_stat_is_current(stat, now)) + __blk_stat_init(stat, now); + +- value = now - rq->issue_time; ++ value = now - rq_time; + if (value > stat->max) + stat->max = value; + if (value < stat->min) +diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c +index 0b9e435..85c3dc2 100644 +--- a/block/blk-sysfs.c ++++ b/block/blk-sysfs.c +@@ -10,6 +10,7 @@ + #include <linux/blktrace_api.h> + #include <linux/blk-mq.h> + #include <linux/blk-cgroup.h> ++#include <linux/wbt.h> + + #include "blk.h" + #include "blk-mq.h" +@@ -41,6 +42,19 @@ queue_var_store(unsigned long *var, const char *page, size_t count) + return count; + } + ++static ssize_t queue_var_store64(u64 *var, const char *page) ++{ ++ int err; ++ u64 v; ++ ++ err = kstrtou64(page, 10, &v); ++ if (err < 0) ++ return err; ++ ++ *var = v; ++ return 0; ++} ++ + static ssize_t queue_requests_show(struct request_queue *q, char *page) + { + return queue_var_show(q->nr_requests, (page)); +@@ -347,6 +361,58 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page, + return ret; + } + ++static ssize_t queue_wb_win_show(struct request_queue *q, char *page) ++{ ++ if (!q->rq_wb) ++ return -EINVAL; ++ ++ return sprintf(page, "%llu\n", div_u64(q->rq_wb->win_nsec, 1000)); ++} ++ ++static ssize_t queue_wb_win_store(struct request_queue *q, const char *page, ++ size_t count) ++{ ++ ssize_t ret; ++ u64 val; ++ ++ if (!q->rq_wb) ++ return -EINVAL; ++ ++ ret = queue_var_store64(&val, page); ++ if (ret < 0) ++ return ret; ++ ++ q->rq_wb->win_nsec = val * 1000ULL; ++ wbt_update_limits(q->rq_wb); ++ return count; ++} ++ ++static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) ++{ ++ if (!q->rq_wb) ++ return -EINVAL; ++ ++ return sprintf(page, "%llu\n", div_u64(q->rq_wb->min_lat_nsec, 1000)); ++} ++ ++static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, ++ size_t count) ++{ ++ ssize_t ret; ++ u64 val; ++ ++ if (!q->rq_wb) ++ return -EINVAL; ++ ++ ret = queue_var_store64(&val, page); ++ if (ret < 0) ++ return ret; ++ ++ q->rq_wb->min_lat_nsec = val * 1000ULL; ++ wbt_update_limits(q->rq_wb); ++ return count; ++} ++ + static ssize_t queue_wc_show(struct request_queue *q, char *page) + { + if (test_bit(QUEUE_FLAG_WC, 
&q->queue_flags)) +@@ -551,6 +617,18 @@ static struct queue_sysfs_entry queue_stats_entry = { + .show = queue_stats_show, + }; + ++static struct queue_sysfs_entry queue_wb_lat_entry = { ++ .attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR }, ++ .show = queue_wb_lat_show, ++ .store = queue_wb_lat_store, ++}; ++ ++static struct queue_sysfs_entry queue_wb_win_entry = { ++ .attr = {.name = "wbt_window_usec", .mode = S_IRUGO | S_IWUSR }, ++ .show = queue_wb_win_show, ++ .store = queue_wb_win_store, ++}; ++ + static struct attribute *default_attrs[] = { + &queue_requests_entry.attr, + &queue_ra_entry.attr, +@@ -579,6 +657,8 @@ static struct attribute *default_attrs[] = { + &queue_wc_entry.attr, + &queue_dax_entry.attr, + &queue_stats_entry.attr, ++ &queue_wb_lat_entry.attr, ++ &queue_wb_win_entry.attr, + NULL, + }; + +@@ -693,6 +773,49 @@ struct kobj_type blk_queue_ktype = { + .release = blk_release_queue, + }; + ++static void blk_wb_stat_get(void *data, struct blk_rq_stat *stat) ++{ ++ blk_queue_stat_get(data, stat); ++} ++ ++static void blk_wb_stat_clear(void *data) ++{ ++ blk_stat_clear(data); ++} ++ ++static bool blk_wb_stat_is_current(struct blk_rq_stat *stat) ++{ ++ return blk_stat_is_current(stat); ++} ++ ++static struct wb_stat_ops wb_stat_ops = { ++ .get = blk_wb_stat_get, ++ .is_current = blk_wb_stat_is_current, ++ .clear = blk_wb_stat_clear, ++}; ++ ++static void blk_wb_init(struct request_queue *q) ++{ ++ struct rq_wb *rwb; ++ ++ rwb = wbt_init(&q->backing_dev_info, &wb_stat_ops, q); ++ ++ /* ++ * If this fails, we don't get throttling ++ */ ++ if (IS_ERR(rwb)) ++ return; ++ ++ if (blk_queue_nonrot(q)) ++ rwb->min_lat_nsec = 2000000ULL; ++ else ++ rwb->min_lat_nsec = 75000000ULL; ++ ++ wbt_set_queue_depth(rwb, blk_queue_depth(q)); ++ wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); ++ q->rq_wb = rwb; ++} ++ + int blk_register_queue(struct gendisk *disk) + { + int ret; +@@ -732,6 +855,8 @@ int blk_register_queue(struct gendisk *disk) + if (q->mq_ops) + blk_mq_register_disk(disk); + ++ blk_wb_init(q); ++ + if (!q->request_fn) + return 0; + +diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c +index cc2f6db..fdcd5999 100644 +--- a/block/cfq-iosched.c ++++ b/block/cfq-iosched.c +@@ -3764,9 +3764,11 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) + struct cfq_data *cfqd = cic_to_cfqd(cic); + struct cfq_queue *cfqq; + uint64_t serial_nr; ++ bool nonroot_cg; + + rcu_read_lock(); + serial_nr = bio_blkcg(bio)->css.serial_nr; ++ nonroot_cg = bio_blkcg(bio) != &blkcg_root; + rcu_read_unlock(); + + /* +@@ -3777,6 +3779,17 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) + return; + + /* ++ * If we have a non-root cgroup, we can depend on that to ++ * do proper throttling of writes. Turn off wbt for that ++ * case. ++ */ ++ if (nonroot_cg) { ++ struct request_queue *q = cfqd->queue; ++ ++ wbt_disable(q->rq_wb); ++ } ++ ++ /* + * Drop reference to queues. New queues will be assigned in new + * group upon arrival of fresh requests. 
+ */ +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index 259eba8..45256d7 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -24,6 +24,7 @@ + #include <linux/rcupdate.h> + #include <linux/percpu-refcount.h> + #include <linux/scatterlist.h> ++#include <linux/wbt.h> + + struct module; + struct scsi_ioctl_command; +@@ -37,6 +38,7 @@ struct bsg_job; + struct blkcg_gq; + struct blk_flush_queue; + struct pr_ops; ++struct rq_wb; + + #define BLKDEV_MIN_RQ 4 + #define BLKDEV_MAX_RQ 128 /* Default maximum */ +@@ -151,7 +153,7 @@ struct request { + struct gendisk *rq_disk; + struct hd_struct *part; + unsigned long start_time; +- s64 issue_time; ++ struct wb_issue_stat wb_stat; + #ifdef CONFIG_BLK_CGROUP + struct request_list *rl; /* rl this rq is alloced from */ + unsigned long long start_time_ns; +@@ -303,6 +305,8 @@ struct request_queue { + int nr_rqs[2]; /* # allocated [a]sync rqs */ + int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */ + ++ struct rq_wb *rq_wb; ++ + /* + * If blkcg is not used, @q->root_rl serves all requests. If blkcg + * is used, root blkg allocates from @q->root_rl and all other +-- +cgit v0.11.2 + +From 21c990f3ab1d3324ad3152cb94f86e6e0772b73c Mon Sep 17 00:00:00 2001 +From: Jens Axboe <axboe@fb.com> +Date: Sat, 10 Sep 2016 10:06:26 -0600 +Subject: wbt: spelling check fix + +Signed-off-by: Jens Axboe <axboe@fb.com> +--- + lib/wbt.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/lib/wbt.c b/lib/wbt.c +index a995703..5c507e5 100644 +--- a/lib/wbt.c ++++ b/lib/wbt.c +@@ -1,5 +1,5 @@ + /* +- * buffered writeback throttling. losely based on CoDel. We can't drop ++ * buffered writeback throttling. loosely based on CoDel. We can't drop + * packets for IO scheduling, so the logic is something like this: + * + * - Monitor latencies in a defined window of time. +-- +cgit v0.11.2 + diff --git a/config.x86_64 b/config.x86_64 index 63c2f3f9a3af..ba7715200831 100644 --- a/config.x86_64 +++ b/config.x86_64 @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. 
-# Linux/x86 4.6.0 Kernel Configuration +# Linux/x86 4.8.11-2.1 Kernel Configuration # CONFIG_64BIT=y CONFIG_X86_64=y @@ -441,9 +441,15 @@ CONFIG_IOMMU_HELPER=y CONFIG_NR_CPUS=128 CONFIG_SCHED_SMT=y CONFIG_SCHED_MC=y +CONFIG_PREEMPT=y +CONFIG_PREEMPT_RT_BASE=y +CONFIG_HAVE_PREEMPT_LAZY=y +CONFIG_PREEMPT_LAZY=y # CONFIG_PREEMPT_NONE is not set # CONFIG_PREEMPT_VOLUNTARY is not set -CONFIG_PREEMPT=y +# CONFIG_PREEMPT__LL is not set +# CONFIG_PREEMPT_RTB is not set +CONFIG_PREEMPT_RT_FULL=y CONFIG_PREEMPT_COUNT=y CONFIG_X86_LOCAL_APIC=y CONFIG_X86_IO_APIC=y @@ -663,7 +669,7 @@ CONFIG_SFI=y # CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_GOV_COMMON=y -CONFIG_CPU_FREQ_STAT=y +CONFIG_CPU_FREQ_STAT=m CONFIG_CPU_FREQ_STAT_DETAILS=y # CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set # CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set @@ -1611,7 +1617,7 @@ CONFIG_BT_ATH3K=m CONFIG_BT_WILINK=m CONFIG_AF_RXRPC=m # CONFIG_AF_RXRPC_DEBUG is not set -CONFIG_RXKAD=y +CONFIG_RXKAD=m CONFIG_AF_KCM=m CONFIG_FIB_RULES=y CONFIG_WIRELESS=y @@ -7708,7 +7714,7 @@ CONFIG_CRYPTO_DEV_QAT_C62X=m CONFIG_CRYPTO_DEV_QAT_DH895xCCVF=m CONFIG_CRYPTO_DEV_QAT_C3XXXVF=m CONFIG_CRYPTO_DEV_QAT_C62XVF=m -CONFIG_ASYMMETRIC_KEY_TYPE=y +CONFIG_ASYMMETRIC_KEY_TYPE=m CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=m CONFIG_X509_CERTIFICATE_PARSER=m CONFIG_PKCS7_MESSAGE_PARSER=m diff --git a/init.patch b/init.patch new file mode 100644 index 000000000000..60af1eb9412a --- /dev/null +++ b/init.patch @@ -0,0 +1,38 @@ +--- a/init/do_mounts.c 2015-08-19 10:27:16.753852576 -0400 ++++ b/init/do_mounts.c 2015-08-19 10:34:25.473850353 -0400 +@@ -490,7 +490,11 @@ void __init change_floppy(char *fmt, ... + va_start(args, fmt); + vsprintf(buf, fmt, args); + va_end(args); +- fd = sys_open("/dev/root", O_RDWR | O_NDELAY, 0); ++ if (saved_root_name[0]) ++ fd = sys_open(saved_root_name, O_RDWR | O_NDELAY, 0); ++ else ++ fd = sys_open("/dev/root", O_RDWR | O_NDELAY, 0); ++ + if (fd >= 0) { + sys_ioctl(fd, FDEJECT, 0); + sys_close(fd); +@@ -534,11 +538,17 @@ void __init mount_root(void) + #endif + #ifdef CONFIG_BLOCK + { +- int err = create_dev("/dev/root", ROOT_DEV); +- +- if (err < 0) +- pr_emerg("Failed to create /dev/root: %d\n", err); +- mount_block_root("/dev/root", root_mountflags); ++ if (saved_root_name[0] == '/') { ++ int err = create_dev(saved_root_name, ROOT_DEV); ++ if (err < 0) ++ pr_emerg("Failed to create %s: %d\n", saved_root_name, err); ++ mount_block_root(saved_root_name, root_mountflags); ++ } else { ++ int err = create_dev("/dev/root", ROOT_DEV); ++ if (err < 0) ++ pr_emerg("Failed to create /dev/root: %d\n", err); ++ mount_block_root("/dev/root", root_mountflags); ++ } + } + #endif + } diff --git a/kconfig.patch b/kconfig.patch new file mode 100644 index 000000000000..d9729b23c3e6 --- /dev/null +++ b/kconfig.patch @@ -0,0 +1,426 @@ +WARNING - this version of the patch works with version 4.9+ of gcc and with +kernel version 3.15.x+ and should NOT be applied when compiling on older +versions due to name changes of the flags with the 4.9 release of gcc. +Use the older version of this patch hosted on the same github for older +versions of gcc. For example: + +corei7 --> nehalem +corei7-avx --> sandybridge +core-avx-i --> ivybridge +core-avx2 --> haswell + +For more, see: https://gcc.gnu.org/gcc-4.9/changes.html + +It also changes 'atom' to 'bonnell' in accordance with the gcc v4.9 changes. +Note that upstream is using the deprecated 'match=atom' flags when I believe it +should use the newer 'march=bonnell' flag for atom processors. 
+
+I have made that change to this patch set as well. See the following kernel
+bug report to see if I'm right: https://bugzilla.kernel.org/show_bug.cgi?id=77461
+
+This patch will expand the number of microarchitectures to include newer
+processors including: AMD K10-family, AMD Family 10h (Barcelona), AMD Family
+14h (Bobcat), AMD Family 15h (Bulldozer), AMD Family 15h (Piledriver), AMD
+Family 15h (Steamroller), Family 16h (Jaguar), Intel 1st Gen Core i3/i5/i7
+(Nehalem), Intel 1.5 Gen Core i3/i5/i7 (Westmere), Intel 2nd Gen Core i3/i5/i7
+(Sandybridge), Intel 3rd Gen Core i3/i5/i7 (Ivybridge), Intel 4th Gen Core
+i3/i5/i7 (Haswell), Intel 5th Gen Core i3/i5/i7 (Broadwell), and the low power
+Silvermont series of Atom processors (Silvermont). It also offers the compiler
+the 'native' flag.
+
+Small but real speed increases are measurable using a make endpoint comparing
+a generic kernel to one built with one of the respective microarchs.
+
+See the following experimental evidence supporting this statement:
+https://github.com/graysky2/kernel_gcc_patch
+
+REQUIREMENTS
+linux version >=3.15
+gcc version >=4.9
+
+--- a/arch/x86/include/asm/module.h	2015-08-30 14:34:09.000000000 -0400
++++ b/arch/x86/include/asm/module.h	2015-11-06 14:18:24.234941036 -0500
+@@ -15,6 +15,24 @@
+ #define MODULE_PROC_FAMILY "586MMX "
+ #elif defined CONFIG_MCORE2
+ #define MODULE_PROC_FAMILY "CORE2 "
++#elif defined CONFIG_MNATIVE
++#define MODULE_PROC_FAMILY "NATIVE "
++#elif defined CONFIG_MNEHALEM
++#define MODULE_PROC_FAMILY "NEHALEM "
++#elif defined CONFIG_MWESTMERE
++#define MODULE_PROC_FAMILY "WESTMERE "
++#elif defined CONFIG_MSILVERMONT
++#define MODULE_PROC_FAMILY "SILVERMONT "
++#elif defined CONFIG_MSANDYBRIDGE
++#define MODULE_PROC_FAMILY "SANDYBRIDGE "
++#elif defined CONFIG_MIVYBRIDGE
++#define MODULE_PROC_FAMILY "IVYBRIDGE "
++#elif defined CONFIG_MHASWELL
++#define MODULE_PROC_FAMILY "HASWELL "
++#elif defined CONFIG_MBROADWELL
++#define MODULE_PROC_FAMILY "BROADWELL "
++#elif defined CONFIG_MSKYLAKE
++#define MODULE_PROC_FAMILY "SKYLAKE "
+ #elif defined CONFIG_MATOM
+ #define MODULE_PROC_FAMILY "ATOM "
+ #elif defined CONFIG_M686
+@@ -33,6 +51,22 @@
+ #define MODULE_PROC_FAMILY "K7 "
+ #elif defined CONFIG_MK8
+ #define MODULE_PROC_FAMILY "K8 "
++#elif defined CONFIG_MK8SSE3
++#define MODULE_PROC_FAMILY "K8SSE3 "
++#elif defined CONFIG_MK10
++#define MODULE_PROC_FAMILY "K10 "
++#elif defined CONFIG_MBARCELONA
++#define MODULE_PROC_FAMILY "BARCELONA "
++#elif defined CONFIG_MBOBCAT
++#define MODULE_PROC_FAMILY "BOBCAT "
++#elif defined CONFIG_MBULLDOZER
++#define MODULE_PROC_FAMILY "BULLDOZER "
++#elif defined CONFIG_MPILEDRIVER
++#define MODULE_PROC_FAMILY "PILEDRIVER "
++#elif defined CONFIG_MSTEAMROLLER
++#define MODULE_PROC_FAMILY "STEAMROLLER "
++#elif defined CONFIG_MJAGUAR
++#define MODULE_PROC_FAMILY "JAGUAR "
+ #elif defined CONFIG_MELAN
+ #define MODULE_PROC_FAMILY "ELAN "
+ #elif defined CONFIG_MCRUSOE
+--- a/arch/x86/Kconfig.cpu	2015-08-30 14:34:09.000000000 -0400
++++ b/arch/x86/Kconfig.cpu	2015-11-06 14:20:14.948369244 -0500
+@@ -137,9 +137,8 @@ config MPENTIUM4
+ 	  -Paxville
+ 	  -Dempsey
+ 
+-
+ config MK6
+-	bool "K6/K6-II/K6-III"
++	bool "AMD K6/K6-II/K6-III"
+ 	depends on X86_32
+ 	---help---
+ 	  Select this for an AMD K6-family processor. Enables use of
+@@ -147,7 +146,7 @@ config MK6
+ 	  flags to GCC.
+ 
+ config MK7
+-	bool "Athlon/Duron/K7"
++	bool "AMD Athlon/Duron/K7"
+ 	depends on X86_32
+ 	---help---
+ 	  Select this for an AMD Athlon K7-family processor.
Enables use of +@@ -155,12 +154,69 @@ config MK7 + flags to GCC. + + config MK8 +- bool "Opteron/Athlon64/Hammer/K8" ++ bool "AMD Opteron/Athlon64/Hammer/K8" + ---help--- + Select this for an AMD Opteron or Athlon64 Hammer-family processor. + Enables use of some extended instructions, and passes appropriate + optimization flags to GCC. + ++config MK8SSE3 ++ bool "AMD Opteron/Athlon64/Hammer/K8 with SSE3" ++ ---help--- ++ Select this for improved AMD Opteron or Athlon64 Hammer-family processors. ++ Enables use of some extended instructions, and passes appropriate ++ optimization flags to GCC. ++ ++config MK10 ++ bool "AMD 61xx/7x50/PhenomX3/X4/II/K10" ++ ---help--- ++ Select this for an AMD 61xx Eight-Core Magny-Cours, Athlon X2 7x50, ++ Phenom X3/X4/II, Athlon II X2/X3/X4, or Turion II-family processor. ++ Enables use of some extended instructions, and passes appropriate ++ optimization flags to GCC. ++ ++config MBARCELONA ++ bool "AMD Barcelona" ++ ---help--- ++ Select this for AMD Barcelona and newer processors. ++ ++ Enables -march=barcelona ++ ++config MBOBCAT ++ bool "AMD Bobcat" ++ ---help--- ++ Select this for AMD Bobcat processors. ++ ++ Enables -march=btver1 ++ ++config MBULLDOZER ++ bool "AMD Bulldozer" ++ ---help--- ++ Select this for AMD Bulldozer processors. ++ ++ Enables -march=bdver1 ++ ++config MPILEDRIVER ++ bool "AMD Piledriver" ++ ---help--- ++ Select this for AMD Piledriver processors. ++ ++ Enables -march=bdver2 ++ ++config MSTEAMROLLER ++ bool "AMD Steamroller" ++ ---help--- ++ Select this for AMD Steamroller processors. ++ ++ Enables -march=bdver3 ++ ++config MJAGUAR ++ bool "AMD Jaguar" ++ ---help--- ++ Select this for AMD Jaguar processors. ++ ++ Enables -march=btver2 ++ + config MCRUSOE + bool "Crusoe" + depends on X86_32 +@@ -251,8 +307,17 @@ config MPSC + using the cpu family field + in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one. + ++config MATOM ++ bool "Intel Atom" ++ ---help--- ++ ++ Select this for the Intel Atom platform. Intel Atom CPUs have an ++ in-order pipelining architecture and thus can benefit from ++ accordingly optimized code. Use a recent GCC with specific Atom ++ support in order to fully benefit from selecting this option. ++ + config MCORE2 +- bool "Core 2/newer Xeon" ++ bool "Intel Core 2" + ---help--- + + Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and +@@ -260,14 +325,71 @@ config MCORE2 + family in /proc/cpuinfo. Newer ones have 6 and older ones 15 + (not a typo) + +-config MATOM +- bool "Intel Atom" ++ Enables -march=core2 ++ ++config MNEHALEM ++ bool "Intel Nehalem" + ---help--- + +- Select this for the Intel Atom platform. Intel Atom CPUs have an +- in-order pipelining architecture and thus can benefit from +- accordingly optimized code. Use a recent GCC with specific Atom +- support in order to fully benefit from selecting this option. ++ Select this for 1st Gen Core processors in the Nehalem family. ++ ++ Enables -march=nehalem ++ ++config MWESTMERE ++ bool "Intel Westmere" ++ ---help--- ++ ++ Select this for the Intel Westmere formerly Nehalem-C family. ++ ++ Enables -march=westmere ++ ++config MSILVERMONT ++ bool "Intel Silvermont" ++ ---help--- ++ ++ Select this for the Intel Silvermont platform. ++ ++ Enables -march=silvermont ++ ++config MSANDYBRIDGE ++ bool "Intel Sandy Bridge" ++ ---help--- ++ ++ Select this for 2nd Gen Core processors in the Sandy Bridge family. 
++ ++ Enables -march=sandybridge ++ ++config MIVYBRIDGE ++ bool "Intel Ivy Bridge" ++ ---help--- ++ ++ Select this for 3rd Gen Core processors in the Ivy Bridge family. ++ ++ Enables -march=ivybridge ++ ++config MHASWELL ++ bool "Intel Haswell" ++ ---help--- ++ ++ Select this for 4th Gen Core processors in the Haswell family. ++ ++ Enables -march=haswell ++ ++config MBROADWELL ++ bool "Intel Broadwell" ++ ---help--- ++ ++ Select this for 5th Gen Core processors in the Broadwell family. ++ ++ Enables -march=broadwell ++ ++config MSKYLAKE ++ bool "Intel Skylake" ++ ---help--- ++ ++ Select this for 6th Gen Core processors in the Skylake family. ++ ++ Enables -march=skylake + + config GENERIC_CPU + bool "Generic-x86-64" +@@ -276,6 +398,19 @@ config GENERIC_CPU + Generic x86-64 CPU. + Run equally well on all x86-64 CPUs. + ++config MNATIVE ++ bool "Native optimizations autodetected by GCC" ++ ---help--- ++ ++ GCC 4.2 and above support -march=native, which automatically detects ++ the optimum settings to use based on your processor. -march=native ++ also detects and applies additional settings beyond -march specific ++ to your CPU, (eg. -msse4). Unless you have a specific reason not to ++ (e.g. distcc cross-compiling), you should probably be using ++ -march=native rather than anything listed below. ++ ++ Enables -march=native ++ + endchoice + + config X86_GENERIC +@@ -300,7 +435,7 @@ config X86_INTERNODE_CACHE_SHIFT + config X86_L1_CACHE_SHIFT + int + default "7" if MPENTIUM4 || MPSC +- default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU ++ default "6" if MK7 || MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MJAGUAR || MPENTIUMM || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MNATIVE || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU + default "4" if MELAN || M486 || MGEODEGX1 + default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX + +@@ -331,11 +466,11 @@ config X86_ALIGNMENT_16 + + config X86_INTEL_USERCOPY + def_bool y +- depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 ++ depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK8SSE3 || MK7 || MEFFICEON || MCORE2 || MK10 || MBARCELONA || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MNATIVE + + config X86_USE_PPRO_CHECKSUM + def_bool y +- depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM ++ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MK10 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MK8SSE3 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MATOM || MNATIVE + + config X86_USE_3DNOW + def_bool y +@@ -359,17 +494,17 @@ config X86_P6_NOP + + config X86_TSC + def_bool y +- depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 
|| MGEODE_LX || MCORE2 || MATOM) || X86_64 ++ depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MK8SSE3 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MNATIVE || MATOM) || X86_64 + + config X86_CMPXCHG64 + def_bool y +- depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM ++ depends on X86_PAE || X86_64 || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM || MNATIVE + + # this should be set for all -march=.. options where the compiler + # generates cmov. + config X86_CMOV + def_bool y +- depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX) ++ depends on (MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MJAGUAR || MK7 || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MNATIVE || MATOM || MGEODE_LX) + + config X86_MINIMUM_CPU_FAMILY + int +--- a/arch/x86/Makefile 2015-08-30 14:34:09.000000000 -0400 ++++ b/arch/x86/Makefile 2015-11-06 14:21:05.708983344 -0500 +@@ -94,13 +94,38 @@ else + KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup) + + # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) ++ cflags-$(CONFIG_MNATIVE) += $(call cc-option,-march=native) + cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) ++ cflags-$(CONFIG_MK8SSE3) += $(call cc-option,-march=k8-sse3,-mtune=k8) ++ cflags-$(CONFIG_MK10) += $(call cc-option,-march=amdfam10) ++ cflags-$(CONFIG_MBARCELONA) += $(call cc-option,-march=barcelona) ++ cflags-$(CONFIG_MBOBCAT) += $(call cc-option,-march=btver1) ++ cflags-$(CONFIG_MBULLDOZER) += $(call cc-option,-march=bdver1) ++ cflags-$(CONFIG_MPILEDRIVER) += $(call cc-option,-march=bdver2) ++ cflags-$(CONFIG_MSTEAMROLLER) += $(call cc-option,-march=bdver3) ++ cflags-$(CONFIG_MJAGUAR) += $(call cc-option,-march=btver2) + cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) + + cflags-$(CONFIG_MCORE2) += \ +- $(call cc-option,-march=core2,$(call cc-option,-mtune=generic)) +- cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom) \ +- $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) ++ $(call cc-option,-march=core2,$(call cc-option,-mtune=core2)) ++ cflags-$(CONFIG_MNEHALEM) += \ ++ $(call cc-option,-march=nehalem,$(call cc-option,-mtune=nehalem)) ++ cflags-$(CONFIG_MWESTMERE) += \ ++ $(call cc-option,-march=westmere,$(call cc-option,-mtune=westmere)) ++ cflags-$(CONFIG_MSILVERMONT) += \ ++ $(call cc-option,-march=silvermont,$(call cc-option,-mtune=silvermont)) ++ cflags-$(CONFIG_MSANDYBRIDGE) += \ ++ $(call cc-option,-march=sandybridge,$(call cc-option,-mtune=sandybridge)) ++ cflags-$(CONFIG_MIVYBRIDGE) += \ ++ $(call cc-option,-march=ivybridge,$(call cc-option,-mtune=ivybridge)) ++ cflags-$(CONFIG_MHASWELL) += \ ++ $(call cc-option,-march=haswell,$(call cc-option,-mtune=haswell)) ++ cflags-$(CONFIG_MBROADWELL) += \ ++ $(call cc-option,-march=broadwell,$(call 
cc-option,-mtune=broadwell)) ++ cflags-$(CONFIG_MSKYLAKE) += \ ++ $(call cc-option,-march=skylake,$(call cc-option,-mtune=skylake)) ++ cflags-$(CONFIG_MATOM) += $(call cc-option,-march=bonnell) \ ++ $(call cc-option,-mtune=bonnell,$(call cc-option,-mtune=generic)) + cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic) + KBUILD_CFLAGS += $(cflags-y) + +--- a/arch/x86/Makefile_32.cpu 2015-08-30 14:34:09.000000000 -0400 ++++ b/arch/x86/Makefile_32.cpu 2015-11-06 14:21:43.604429077 -0500 +@@ -23,7 +23,16 @@ cflags-$(CONFIG_MK6) += -march=k6 + # Please note, that patches that add -march=athlon-xp and friends are pointless. + # They make zero difference whatsosever to performance at this time. + cflags-$(CONFIG_MK7) += -march=athlon ++cflags-$(CONFIG_MNATIVE) += $(call cc-option,-march=native) + cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon) ++cflags-$(CONFIG_MK8SSE3) += $(call cc-option,-march=k8-sse3,-march=athlon) ++cflags-$(CONFIG_MK10) += $(call cc-option,-march=amdfam10,-march=athlon) ++cflags-$(CONFIG_MBARCELONA) += $(call cc-option,-march=barcelona,-march=athlon) ++cflags-$(CONFIG_MBOBCAT) += $(call cc-option,-march=btver1,-march=athlon) ++cflags-$(CONFIG_MBULLDOZER) += $(call cc-option,-march=bdver1,-march=athlon) ++cflags-$(CONFIG_MPILEDRIVER) += $(call cc-option,-march=bdver2,-march=athlon) ++cflags-$(CONFIG_MSTEAMROLLER) += $(call cc-option,-march=bdver3,-march=athlon) ++cflags-$(CONFIG_MJAGUAR) += $(call cc-option,-march=btver2,-march=athlon) + cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 + cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 + cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) +@@ -32,8 +41,16 @@ cflags-$(CONFIG_MCYRIXIII) += $(call cc- + cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) + cflags-$(CONFIG_MVIAC7) += -march=i686 + cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2) +-cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march=core2,-march=i686)) \ +- $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) ++cflags-$(CONFIG_MNEHALEM) += -march=i686 $(call tune,nehalem) ++cflags-$(CONFIG_MWESTMERE) += -march=i686 $(call tune,westmere) ++cflags-$(CONFIG_MSILVERMONT) += -march=i686 $(call tune,silvermont) ++cflags-$(CONFIG_MSANDYBRIDGE) += -march=i686 $(call tune,sandybridge) ++cflags-$(CONFIG_MIVYBRIDGE) += -march=i686 $(call tune,ivybridge) ++cflags-$(CONFIG_MHASWELL) += -march=i686 $(call tune,haswell) ++cflags-$(CONFIG_MBROADWELL) += -march=i686 $(call tune,broadwell) ++cflags-$(CONFIG_MSKYLAKE) += -march=i686 $(call tune,skylake) ++cflags-$(CONFIG_MATOM) += $(call cc-option,-march=bonnell,$(call cc-option,-march=core2,-march=i686)) \ ++ $(call cc-option,-mtune=bonnell,$(call cc-option,-mtune=generic)) + + # AMD Elan support + cflags-$(CONFIG_MELAN) += -march=i486 diff --git a/xattr.patch b/xattr.patch new file mode 100644 index 000000000000..bacd0322989b --- /dev/null +++ b/xattr.patch @@ -0,0 +1,69 @@ +From: Anthony G. Basile <blueness@gentoo.org> + +This patch adds support for a restricted user-controlled namespace on +tmpfs filesystem used to house PaX flags. The namespace must be of the +form user.pax.* and its value cannot exceed a size of 8 bytes. 
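To make the described semantics concrete, here is a minimal userspace sketch
(editorial, not part of the patch). The path is hypothetical and assumes /tmp is a
tmpfs mount on a kernel carrying this change; setxattr(2) is the standard libc wrapper.

	/* pax-xattr.c: exercises the restricted user.pax.* namespace on tmpfs. */
	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/xattr.h>
	#include <unistd.h>

	int main(void)
	{
		const char *path = "/tmp/pax-test";	/* hypothetical tmpfs file */
		int fd = open(path, O_CREAT | O_WRONLY, 0644);

		if (fd >= 0)
			close(fd);

		/* Accepted: user.pax.flags with a value of 8 bytes or less */
		if (setxattr(path, "user.pax.flags", "em", 2, 0) != 0)
			perror("user.pax.flags");

		/* Rejected with EOPNOTSUPP: any other user.* attribute */
		if (setxattr(path, "user.comment", "x", 1, 0) != 0)
			perror("user.comment");

		/* Rejected with EINVAL: a value longer than 8 bytes */
		if (setxattr(path, "user.pax.flags", "123456789", 9, 0) != 0)
			perror("oversized value");

		return 0;
	}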
+
+This is needed on all Gentoo systems, even non-hardened ones, so that
+XATTR_PAX flags are preserved for users who might build packages using
+portage on a tmpfs system with a non-hardened kernel and then switch to a
+hardened kernel with XATTR_PAX enabled.
+
+The namespace is added to any user with Extended Attribute support
+enabled for tmpfs. Users who do not enable xattrs will not have
+the XATTR_PAX flags preserved.
+
+diff --git a/include/uapi/linux/xattr.h b/include/uapi/linux/xattr.h
+index 1590c49..5eab462 100644
+--- a/include/uapi/linux/xattr.h
++++ b/include/uapi/linux/xattr.h
+@@ -73,5 +73,9 @@
+ #define XATTR_POSIX_ACL_DEFAULT  "posix_acl_default"
+ #define XATTR_NAME_POSIX_ACL_DEFAULT XATTR_SYSTEM_PREFIX XATTR_POSIX_ACL_DEFAULT
+ 
++/* User namespace */
++#define XATTR_PAX_PREFIX XATTR_USER_PREFIX "pax."
++#define XATTR_PAX_FLAGS_SUFFIX "flags"
++#define XATTR_NAME_PAX_FLAGS XATTR_PAX_PREFIX XATTR_PAX_FLAGS_SUFFIX
+ 
+ #endif /* _UAPI_LINUX_XATTR_H */
+diff --git a/mm/shmem.c b/mm/shmem.c
+index 440e2a7..c377172 100644
+--- a/mm/shmem.c
++++ b/mm/shmem.c
+@@ -2667,6 +2667,14 @@ static int shmem_xattr_handler_set(const struct xattr_handler *handler,
+ 	struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
+ 
+ 	name = xattr_full_name(handler, name);
++
++	if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) {
++		if (strcmp(name, XATTR_NAME_PAX_FLAGS))
++			return -EOPNOTSUPP;
++		if (size > 8)
++			return -EINVAL;
++	}
++
+ 	return simple_xattr_set(&info->xattrs, name, value, size, flags);
+ }
+ 
+@@ -2682,6 +2690,12 @@ static const struct xattr_handler shmem_trusted_xattr_handler = {
+ 	.set = shmem_xattr_handler_set,
+ };
+ 
++static const struct xattr_handler shmem_user_xattr_handler = {
++	.prefix = XATTR_USER_PREFIX,
++	.get = shmem_xattr_handler_get,
++	.set = shmem_xattr_handler_set,
++};
++
+ static const struct xattr_handler *shmem_xattr_handlers[] = {
+ #ifdef CONFIG_TMPFS_POSIX_ACL
+ 	&posix_acl_access_xattr_handler,
+@@ -2689,6 +2703,7 @@ static const struct xattr_handler *shmem_xattr_handlers[] = {
+ #endif
+ 	&shmem_security_xattr_handler,
+ 	&shmem_trusted_xattr_handler,
++	&shmem_user_xattr_handler,
+ 	NULL
+ };
+ 
diff --git a/xfs.patch b/xfs.patch
new file mode 100644
index 000000000000..1020cdcb1e53
--- /dev/null
+++ b/xfs.patch
@@ -0,0 +1,137 @@
+From: Dave Chinner <david@fromorbit.com>
+To: linux-xfs@vger.kernel.org
+Cc: xfs@oss.sgi.com
+Subject: [PATCH] xfs: quiesce the filesystem after recovery on readonly mount
+Date: Fri, 23 Sep 2016 10:11:40 +1000
+
+From: Dave Chinner <dchinner@redhat.com>
+
+Recently we've had a number of reports where log recovery on a v5
+filesystem has reported corruptions that looked to be caused by
+recovery being re-run over the top of already-recovered
+metadata. This has uncovered a bug in recovery (fixed elsewhere)
+but the vector that caused this was largely unknown.
+
+A kdump test started tripping over this problem - the system
+would be crashed, the kdump kernel and environment would boot and
+dump the kernel core image, and then the system would reboot. After
+reboot, the root filesystem was triggering log recovery and
+corruptions were being detected. The metadumps indicated the above
+log recovery issue.
+
+What is happening is that the kdump kernel and environment is
+mounting the root device read-only to find the binaries needed to do
+its work. The result of this is that it is running log recovery.
+However, because there were unlinked files and EFIs to be processed
+by recovery, the completion of phase 1 of log recovery could not
+mark the log clean. And because it's a read-only mount, the unmount
+process does not write records to the log to mark it clean, either.
+Hence on the next mount of the filesystem, log recovery was run
+again across all the metadata that had already been recovered and
+this is what triggered corruption warnings.
+
+To avoid this problem, we need to ensure that a read-only mount
+always updates the log when it completes the second phase of
+recovery. We already handle this sort of issue with rw->ro remount
+transitions, so the solution is as simple as quiescing the
+filesystem at the appropriate time during the mount process. This
+results in the log being marked clean so the mount behaviour
+recorded in the logs on repeated RO mounts will change (i.e. log
+recovery will no longer be run on every mount until a RW mount is
+done). This is a user visible change in behaviour, but it is
+harmless.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+---
+ fs/xfs/xfs_mount.c | 14 ++++++++++++++
+ fs/xfs/xfs_super.c |  2 +-
+ fs/xfs/xfs_super.h |  1 +
+ 3 files changed, 16 insertions(+), 1 deletion(-)
+
+diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
+index faeead6..56e85a6 100644
+--- a/fs/xfs/xfs_mount.c
++++ b/fs/xfs/xfs_mount.c
+@@ -934,6 +934,20 @@ xfs_mountfs(
+ 	}
+ 
+ 	/*
++	 * Now the log is fully replayed, we can transition to full read-only
++	 * mode for read-only mounts. This will sync all the metadata and clean
++	 * the log so that the recovery we just performed does not have to be
++	 * replayed again on the next mount.
++	 *
++	 * We use the same quiesce mechanism as the rw->ro remount, as they are
++	 * semantically identical operations.
++	 */
++	if ((mp->m_flags & (XFS_MOUNT_RDONLY|XFS_MOUNT_NORECOVERY)) ==
++			XFS_MOUNT_RDONLY) {
++		xfs_quiesce_attr(mp);
++	}
++
++	/*
+ 	 * Complete the quota initialisation, post-log-replay component.
+ 	 */
+ 	if (quotamount) {
+diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
+index 3409753..2d092f9 100644
+--- a/fs/xfs/xfs_super.c
++++ b/fs/xfs/xfs_super.c
+@@ -1137,7 +1137,7 @@ xfs_restore_resvblks(struct xfs_mount *mp)
+  * Note: xfs_log_quiesce() stops background log work - the callers must ensure
+  * it is started again when appropriate.
+  */
+-static void
++void
+ xfs_quiesce_attr(
+ 	struct xfs_mount *mp)
+ {
+diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
+index 529bce9..b6418ab 100644
+--- a/fs/xfs/xfs_super.h
++++ b/fs/xfs/xfs_super.h
+@@ -61,6 +61,7 @@ struct xfs_mount;
+ struct xfs_buftarg;
+ struct block_device;
+ 
++extern void xfs_quiesce_attr(struct xfs_mount *mp);
+ extern void xfs_flush_inodes(struct xfs_mount *mp);
+ extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
+ extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *,
+From: Eryu Guan <eguan@redhat.com>
+To: linux-xfs@vger.kernel.org
+Cc: xfs@oss.sgi.com, Eryu Guan <eguan@redhat.com>
+Subject: [PATCH v2] xfs: undo block reservation correctly in xfs_trans_reserve()
+Date: Tue, 6 Sep 2016 20:14:40 +0800
+
+"blocks" should be added back to fdblocks at undo time, not taken
+away, i.e. the minus sign should not be used.
+
+This is a regression introduced by commit 0d485ada404b ("xfs: use
+generic percpu counters for free block counter"). And it's found by
+code inspection; I didn't hit it in the real world, so there's no
+reproducer.
+
+Signed-off-by: Eryu Guan <eguan@redhat.com>
+---
+v2:
+- Remove "Fixes:" tag and describe relevant commit in commit log
+- Update commit log to mention that it's found by code inspection
+- Remove outer () from the "int64_t" cast
+
+ fs/xfs/xfs_trans.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
+index 5f3d33d..836eb80 100644
+--- a/fs/xfs/xfs_trans.c
++++ b/fs/xfs/xfs_trans.c
+@@ -217,7 +217,7 @@ undo_log:
+ 
+ undo_blocks:
+ 	if (blocks > 0) {
+-		xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd);
++		xfs_mod_fdblocks(tp->t_mountp, (int64_t)blocks, rsvd);
+ 		tp->t_blk_res = 0;
+ 	}
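A worked illustration of the sign error fixed above, with hypothetical numbers:

	/* Suppose xfs_trans_reserve() took blocks = 100 out of the free-block
	 * counter (fdblocks -= 100) and a later part of the reservation failed,
	 * sending us to the undo_blocks path. The undo must give the blocks back:
	 *
	 *   fixed:   xfs_mod_fdblocks(tp->t_mountp, (int64_t)blocks, rsvd);
	 *            fdblocks += 100, counter restored
	 *   broken:  xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd);
	 *            fdblocks -= 100 again, counter now 200 too low
	 *
	 * So with the minus sign, every failed reservation silently leaked
	 * twice the reserved block count from the free-space accounting. */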