-rw-r--r--   .SRCINFO      |   38
-rw-r--r--   PKGBUILD      |   47
-rw-r--r--   block.patch   | 2650
-rw-r--r--   config.x86_64 |   16
-rw-r--r--   init.patch    |   38
-rw-r--r--   kconfig.patch |  426
-rw-r--r--   xattr.patch   |   69
-rw-r--r--   xfs.patch     |  137
8 files changed, 3372 insertions, 49 deletions
diff --git a/.SRCINFO b/.SRCINFO
@@ -1,6 +1,6 @@
 pkgbase = linux-surfacepro3-rt
-	pkgver = 4.8.11
-	pkgrel = 2.1
+	pkgver = 4.8.14
+	pkgrel = 2.2
 	url = https://github.com/alyptik/linux-surfacepro3-rt
 	arch = i686
 	arch = x86_64
@@ -12,36 +12,36 @@ pkgbase = linux-surfacepro3-rt
 	makedepends = bc
 	makedepends = elfutils
 	options = !strip
-	source = https://www.kernel.org/pub/linux/kernel/v4.x/linux-4.8.11.tar.xz
-	source = https://www.kernel.org/pub/linux/kernel/v4.x/linux-4.8.11.tar.sign
-	source = https://www.kernel.org/pub/linux/kernel/projects/rt/4.8/older/patch-4.8.11-rt7.patch.xz
-	source = https://www.kernel.org/pub/linux/kernel/projects/rt/4.8/older/patch-4.8.11-rt7.patch.sign
+	source = https://www.kernel.org/pub/linux/kernel/v4.x/linux-4.8.14.tar.xz
+	source = https://www.kernel.org/pub/linux/kernel/v4.x/linux-4.8.14.tar.sign
+	source = https://www.kernel.org/pub/linux/kernel/projects/rt/4.8/older/patch-4.8.14-rt9.patch.xz
+	source = https://www.kernel.org/pub/linux/kernel/projects/rt/4.8/older/patch-4.8.14-rt9.patch.sign
 	source = https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/bfq.patch
 	source = https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/bfs.patch
 	source = https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/bfs-fixes1.patch
 	source = https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/bfs-fixes2.patch
 	source = https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/bfs-fixes3.patch
-	source = https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/block.patch
-	source = https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/init.patch
-	source = https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/kconfig.patch
-	source = https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/xattr.patch
-	source = https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/xfs.patch
-	source = multitouch.patch
+	source = block.patch
+	source = init.patch
+	source = kconfig.patch
+	source = xattr.patch
+	source = xfs.patch
 	source = touchscreen_multitouch_fixes1.patch
 	source = touchscreen_multitouch_fixes2.patch
 	source = wifi.patch
+	source = multitouch.patch
+	source = change-default-console-loglevel.patch
 	source = config
 	source = config.x86_64
 	source = config.sp3
 	source = linux.preset
-	source = change-default-console-loglevel.patch
 	validpgpkeys = ABAF11C65A2970B130ABE3C479BE3E4300411886
 	validpgpkeys = 647F28654894E3BD457199BE38DBBDC86092693E
 	validpgpkeys = 64254695FFF0AA4466CC19E67B96E8162A8CF5D1
 	validpgpkeys = 8D633C480C2247466051B7ADE314F17E08EF006D
-	sha256sums = cc0f42f408ba3e51f8b0e93e3d8050ff18569456d286cb2a1aca3327dd06890f
+	sha256sums = 81e344d7852128a80fe54f659b2c87bb1b1bde560cd80c52c79c99f568fd5acf
 	sha256sums = SKIP
-	sha256sums = f258a256ebdb51ceabbe1e482706756437c7113c6d987025203468bfb8601f9a
+	sha256sums = 157492d303dd0504181e55cdcfe65471c578a3a540527c1c11b0a39297b23de5
 	sha256sums = SKIP
 	sha256sums = 242d32d0fe819852e74d93b8a044cf24a40a9474d6f00ca93a19aa98298dcefa
 	sha256sums = 51f91681b708149fe91e565f5c40811477428e2aa86f8726a20e0e7c55c5407c
@@ -53,15 +53,15 @@ pkgbase = linux-surfacepro3-rt
 	sha256sums = f479a5ca6abe4d50ca4c09e6e83a027369fcd3efff8d5ce60f0699d8fa47beb8
 	sha256sums = 4633ae19b9a9871a3cfffba98ec7c3cd240f64bef8a0eebcf1212219c80972fd
 	sha256sums = 6618ef72495a6f7c7e50ecfba4a897f78668a3cbaabb93e97ad3d276e7abc52c
-	sha256sums = 3a4722981f689225a0ad550e45d829fcc3ca29d4258df3c6c989a916199e1c08
 	sha256sums = cc78e8844d9ec4bd29cce392a3e4683061646e1ad7c100c4958a5cadabb25b52
 	sha256sums = 34b4e00ffcf9efc43ab47444d14febb94432d340d0f1d5bcd56153879d1be113
 	sha256sums = 52e7c895aeb505bc8d3b5321a346fcdbb749f8035cacc97a237c24c1f527adbc
+	sha256sums = 3a4722981f689225a0ad550e45d829fcc3ca29d4258df3c6c989a916199e1c08
+	sha256sums = 1256b241cd477b265a3c2d64bdc19ffe3c9bbcee82ea3994c590c2c76e767d99
 	sha256sums = 0fcd0b22fe9ec58ba41b81b463f68d619b6898a5c405fb26c85237a183240371
-	sha256sums = 577a3c4c211e6946fb8c1448d6a325861b41c8c8660203ae7d63a58f3af0d279
-	sha256sums = 5c92eb5febe5bafcc76f19aa3d4aaf723cfbb465615cd68ecfaea54a2f773994
+	sha256sums = ed9b9e6efaf4f23e7ae3406322b4d1d3080e8dbc7ab3f03bcbf728ca2010e21b
+	sha256sums = f0c70a988490189ac3869ef948db2ca28704f1d85a1519d5b5f99afa7b10f741
 	sha256sums = f0d90e756f14533ee67afda280500511a62465b4f76adcc5effa95a40045179c
-	sha256sums = 1256b241cd477b265a3c2d64bdc19ffe3c9bbcee82ea3994c590c2c76e767d99
 
 pkgname = linux-surfacepro3-rt
 	pkgdesc = The Linux-surfacepro3-rt kernel and modules
diff --git a/PKGBUILD b/PKGBUILD
@@ -2,9 +2,10 @@
 # Contributor: Matthew Wardrop <mister.wardrop@gmail.com>
 pkgbase=linux-surfacepro3-rt
-_srcname=linux-4.8.11
-pkgver=4.8.11
-pkgrel=2.1
+_srcname=linux-4.8.14
+pkgver=${_srcname#linux-}
+_rtver=rt9
+pkgrel=2.2
 arch=('i686' 'x86_64')
 url="https://github.com/alyptik/linux-surfacepro3-rt"
 license=('GPL2')
@@ -12,33 +13,28 @@ makedepends=('xmlto' 'docbook-xsl' 'kmod' 'inetutils' 'bc' 'elfutils')
 options=('!strip')
 source=("https://www.kernel.org/pub/linux/kernel/v4.x/${_srcname}.tar.xz"
 	"https://www.kernel.org/pub/linux/kernel/v4.x/${_srcname}.tar.sign"
-	"https://www.kernel.org/pub/linux/kernel/projects/rt/${pkgver%.*}/older/patch-${pkgver}-rt7.patch.xz"
-	"https://www.kernel.org/pub/linux/kernel/projects/rt/${pkgver%.*}/older/patch-${pkgver}-rt7.patch.sign"
+	"https://www.kernel.org/pub/linux/kernel/projects/rt/${pkgver%.*}/older/patch-${pkgver}-${_rtver}.patch.xz"
+	"https://www.kernel.org/pub/linux/kernel/projects/rt/${pkgver%.*}/older/patch-${pkgver}-${_rtver}.patch.sign"
 	# Brain Fuck Scheduler & other personal patches
 	'https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/bfq.patch'
 	'https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/bfs.patch'
 	'https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/bfs-fixes1.patch'
 	'https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/bfs-fixes2.patch'
 	'https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/bfs-fixes3.patch'
-	'https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/block.patch'
-	'https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/init.patch'
-	'https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/kconfig.patch'
-	'https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/xattr.patch'
-	'https://raw.githubusercontent.com/alyptik/linux-surfacepro3-rt/github/xfs.patch'
-	'multitouch.patch'
-	'touchscreen_multitouch_fixes1.patch'
-	'touchscreen_multitouch_fixes2.patch'
+	'block.patch' 'init.patch' 'kconfig.patch' 'xattr.patch' 'xfs.patch'
+	'touchscreen_multitouch_fixes1.patch' 'touchscreen_multitouch_fixes2.patch'
 	'wifi.patch'
+	'multitouch.patch'
+	'change-default-console-loglevel.patch'
 	# the main kernel config files
 	'config' 'config.x86_64' 'config.sp3'
 	# standard config files for mkinitcpio ramdisk
 	'linux.preset'
-	'change-default-console-loglevel.patch'
 )
-sha256sums=('cc0f42f408ba3e51f8b0e93e3d8050ff18569456d286cb2a1aca3327dd06890f'
+sha256sums=('81e344d7852128a80fe54f659b2c87bb1b1bde560cd80c52c79c99f568fd5acf'
             'SKIP'
-            'f258a256ebdb51ceabbe1e482706756437c7113c6d987025203468bfb8601f9a'
+            '157492d303dd0504181e55cdcfe65471c578a3a540527c1c11b0a39297b23de5'
             'SKIP'
             '242d32d0fe819852e74d93b8a044cf24a40a9474d6f00ca93a19aa98298dcefa'
             '51f91681b708149fe91e565f5c40811477428e2aa86f8726a20e0e7c55c5407c'
@@ -50,15 +46,15 @@ sha256sums=('cc0f42f408ba3e51f8b0e93e3d8050ff18569456d286cb2a1aca3327dd06890f'
             'f479a5ca6abe4d50ca4c09e6e83a027369fcd3efff8d5ce60f0699d8fa47beb8'
             '4633ae19b9a9871a3cfffba98ec7c3cd240f64bef8a0eebcf1212219c80972fd'
             '6618ef72495a6f7c7e50ecfba4a897f78668a3cbaabb93e97ad3d276e7abc52c'
-            '3a4722981f689225a0ad550e45d829fcc3ca29d4258df3c6c989a916199e1c08'
             'cc78e8844d9ec4bd29cce392a3e4683061646e1ad7c100c4958a5cadabb25b52'
             '34b4e00ffcf9efc43ab47444d14febb94432d340d0f1d5bcd56153879d1be113'
             '52e7c895aeb505bc8d3b5321a346fcdbb749f8035cacc97a237c24c1f527adbc'
+            '3a4722981f689225a0ad550e45d829fcc3ca29d4258df3c6c989a916199e1c08'
+            '1256b241cd477b265a3c2d64bdc19ffe3c9bbcee82ea3994c590c2c76e767d99'
             '0fcd0b22fe9ec58ba41b81b463f68d619b6898a5c405fb26c85237a183240371'
-            '577a3c4c211e6946fb8c1448d6a325861b41c8c8660203ae7d63a58f3af0d279'
-            '5c92eb5febe5bafcc76f19aa3d4aaf723cfbb465615cd68ecfaea54a2f773994'
-            'f0d90e756f14533ee67afda280500511a62465b4f76adcc5effa95a40045179c'
-            '1256b241cd477b265a3c2d64bdc19ffe3c9bbcee82ea3994c590c2c76e767d99')
+            'ed9b9e6efaf4f23e7ae3406322b4d1d3080e8dbc7ab3f03bcbf728ca2010e21b'
+            'f0c70a988490189ac3869ef948db2ca28704f1d85a1519d5b5f99afa7b10f741'
+            'f0d90e756f14533ee67afda280500511a62465b4f76adcc5effa95a40045179c')
 validpgpkeys=(
              'ABAF11C65A2970B130ABE3C479BE3E4300411886' # Linus Torvalds
@@ -67,11 +63,11 @@ validpgpkeys=(
              '8D633C480C2247466051B7ADE314F17E08EF006D' # Joey Pabalinas
 )
 multitouch='y'
-sp3config='y'
 bcache='n'
 bfs='n'
 bfq='n'
 personal='y'
+sp3config='n'
 
 _kernelname=${pkgbase#linux}
 
@@ -82,7 +78,8 @@ prepare() {
 	if [ "$bfq" = 'y' ]; then patch -p1 -i "${srcdir}/bfq.patch"; fi
 	if [ "$bfs" = 'y' ]; then for i in bfs bfs-fixes{1..3}; do patch -p1 -i "${srcdir}/${i}.patch"; done; fi
 	if [ "$bcache" = 'y' ]; then
-		sed -i '\%^diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig$%,+11 d' "${srcdir}/patch-${pkgver}-rt2.patch"
+		sed -i '\%^diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig$%,+11 d' \
+			"${srcdir}/patch-${pkgver}-${_rtver}.patch"
 		cp "${srcdir}/linux-${pkgver}/include/linux/rwsem.h" "${srcdir}/linux-${pkgver}/drivers/md/bcache/"
 		sed -i '/#include "bcache.h"/i #include "rwsem.h"\n' "${srcdir}/linux-${pkgver}/drivers/md/bcache/request.c"
 	fi
@@ -91,7 +88,7 @@ prepare() {
 	if [ "$personal" = 'y' ]; then for i in block init kconfig xattr xfs; do patch -p1 -i "${srcdir}/${i}.patch"; done; fi
 
 	# Add RT patches
-	patch -p1 -i ${srcdir}/patch-${pkgver}*.patch
+	patch -p1 -i ${srcdir}/patch-${pkgver}-${_rtver}.patch
 
 	# set DEFAULT_CONSOLE_LOGLEVEL to 4 (same value as the 'quiet' kernel param)
 	# remove this when a Kconfig knob is made available by upstream
@@ -113,7 +110,7 @@ prepare() {
 	## If sp3config='y' use personal config as a base
 	if [ "$sp3config" = 'y' ]; then
 		cat "${srcdir}/config.sp3" >./.config
-	elif [ "${CARCH}" = "x86_64" ]; then
+	elif [ "$CARCH" = "x86_64" ]; then
 		cat "${srcdir}/config.x86_64" >./.config
 	else
 		cat "${srcdir}/config" > ./.config
diff --git a/block.patch b/block.patch
new file mode 100644
index 000000000000..1f99d7869a0d
--- /dev/null
+++ b/block.patch
@@ -0,0 +1,2650 @@
+To: LKML <linux-kernel@vger.kernel.org>, Jens Axboe <axboe@fb.com>
+From: =?UTF-8?Q?Holger_Hoffst=c3=a4tte?= <holger.hoffstaette@googlemail.com>
+Subject: [PATCH] loop: properly observe rotational flag of underlying device
+Organization: Applied Asynchrony, Inc.
+Date: Wed, 11 Nov 2015 16:21:51 +0100
+
+The loop driver always declares the rotational flag of its device as
+rotational, even when the device of the mapped file is nonrotational,
+as is the case with SSDs or on tmpfs. This can confuse filesystem tools
+which are SSD-aware; in my case I frequently forget to tell mkfs.btrfs
+that my loop device on tmpfs is nonrotational, and that I really don't
+need any automatic metadata redundancy.
+
+The attached patch fixes this by introspecting the rotational flag of the
+mapped file's underlying block device, if it exists. If the mapped file's
+filesystem has no associated block device - as is the case on e.g. tmpfs -
+we assume nonrotational storage. If there is a better way to identify such
+non-devices I'd love to hear it.
+
+Signed-off-by: Holger Hoffstätte <holger.hoffstaette@googlemail.com>
+---
+ drivers/block/loop.c | 19 +++++++++++++++++++
+ 1 file changed, 19 insertions(+)
+
+diff --git a/drivers/block/loop.c b/drivers/block/loop.c
+index 423f4ca..2984aca 100644
+--- a/drivers/block/loop.c
++++ b/drivers/block/loop.c
+@@ -843,6 +843,24 @@ static void loop_config_discard(struct loop_device *lo)
+ 	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+ }
+ 
++static void loop_update_rotational(struct loop_device *lo)
++{
++	struct file *file = lo->lo_backing_file;
++	struct inode *file_inode = file->f_mapping->host;
++	struct block_device *file_bdev = file_inode->i_sb->s_bdev;
++	struct request_queue *q = lo->lo_queue;
++	bool nonrot = true;
++
++	/* not all filesystems (e.g. tmpfs) have a sb->s_bdev */
++	if (file_bdev)
++		nonrot = blk_queue_nonrot(bdev_get_queue(file_bdev));
++
++	if (nonrot)
++		queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
++	else
++		queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, q);
++}
++
+ static void loop_unprepare_queue(struct loop_device *lo)
+ {
+ 	flush_kthread_worker(&lo->worker);
+@@ -939,6 +957,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
+ 	if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
+ 		blk_queue_flush(lo->lo_queue, REQ_FLUSH);
+ 
++	loop_update_rotational(lo);
+ 	loop_update_dio(lo);
+ 	set_capacity(lo->lo_disk, size);
+ 	bd_set_size(bdev, size << 9);
+--
+2.6.3
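The effect of the loop patch above is easy to verify from userspace once a file is attached to a loop device, since the flag it maintains is exported through the queue's sysfs directory. A minimal sketch (the /sys path and loop0 are assumed examples, not taken from the patch):

#include <stdio.h>

/* Read back the rotational flag that loop_update_rotational() above now
 * maintains. "/sys/block/loop0/queue/rotational" is an assumed example
 * path; 0 = nonrotational (SSD- or tmpfs-backed), 1 = rotational. */
int main(void)
{
	FILE *f = fopen("/sys/block/loop0/queue/rotational", "r");
	int rot = -1;

	if (!f) {
		perror("open");
		return 1;
	}
	if (fscanf(f, "%d", &rot) != 1)
		rot = -1;
	fclose(f);
	printf("loop0 rotational flag: %d\n", rot);
	return rot < 0;
}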
+From 273d4cb9fc3d75b6b7f147d1a064f75a5412a76c Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@fb.com>
+Date: Wed, 27 Jul 2016 15:30:35 -0600
+Subject: block: add WRITE_BG
+
+This adds a new request flag, REQ_BG, that callers can use to tell
+the block layer that this is background (non-urgent) IO.
+
+Signed-off-by: Jens Axboe <axboe@fb.com>
+---
+ include/linux/blk_types.h | 4 +++-
+ include/linux/fs.h        | 3 +++
+ 2 files changed, 6 insertions(+), 1 deletion(-)
+
+diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
+index 436f43f..be4409b 100644
+--- a/include/linux/blk_types.h
++++ b/include/linux/blk_types.h
+@@ -155,6 +155,7 @@ enum rq_flag_bits {
+ 	__REQ_INTEGRITY,	/* I/O includes block integrity payload */
+ 	__REQ_FUA,		/* forced unit access */
+ 	__REQ_PREFLUSH,		/* request for cache flush */
++	__REQ_BG,		/* background activity */
+ 
+ 	/* bio only flags */
+ 	__REQ_RAHEAD,		/* read ahead, can fail anytime */
+@@ -198,7 +199,7 @@ enum rq_flag_bits {
+ 	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
+ #define REQ_COMMON_MASK \
+ 	(REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | REQ_NOIDLE | \
+-	 REQ_PREFLUSH | REQ_FUA | REQ_INTEGRITY | REQ_NOMERGE)
++	 REQ_PREFLUSH | REQ_FUA | REQ_INTEGRITY | REQ_NOMERGE | REQ_BG)
+ #define REQ_CLONE_MASK		REQ_COMMON_MASK
+ 
+ /* This mask is used for both bio and request merge checking */
+@@ -223,6 +224,7 @@ enum rq_flag_bits {
+ #define REQ_COPY_USER		(1ULL << __REQ_COPY_USER)
+ #define REQ_PREFLUSH		(1ULL << __REQ_PREFLUSH)
+ #define REQ_FLUSH_SEQ		(1ULL << __REQ_FLUSH_SEQ)
++#define REQ_BG			(1ULL << __REQ_BG)
+ #define REQ_IO_STAT		(1ULL << __REQ_IO_STAT)
+ #define REQ_MIXED_MERGE		(1ULL << __REQ_MIXED_MERGE)
+ #define REQ_PM			(1ULL << __REQ_PM)
+diff --git a/include/linux/fs.h b/include/linux/fs.h
+index 901e25d..7c7951f 100644
+--- a/include/linux/fs.h
++++ b/include/linux/fs.h
+@@ -189,6 +189,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
+  * WRITE_FLUSH_FUA	Combination of WRITE_FLUSH and FUA. The IO is preceded
+  *			by a cache flush and data is guaranteed to be on
+  *			non-volatile media on completion.
++ * WRITE_BG		Background write. This is for background activity like
++ *			the periodic flush and background threshold writeback
+  *
+  */
+ #define RW_MASK			REQ_OP_WRITE
+@@ -202,6 +204,7 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
+ #define WRITE_FLUSH		(REQ_SYNC | REQ_NOIDLE | REQ_PREFLUSH)
+ #define WRITE_FUA		(REQ_SYNC | REQ_NOIDLE | REQ_FUA)
+ #define WRITE_FLUSH_FUA		(REQ_SYNC | REQ_NOIDLE | REQ_PREFLUSH | REQ_FUA)
++#define WRITE_BG		(REQ_NOIDLE | REQ_BG)
+ 
+ /*
+  * Attribute flags.  These should be or-ed together to figure out what
+--
+cgit v0.11.2
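To see why a single new bit is enough here, the following is a simplified, userspace-only model of the flag scheme the hunks above extend; the names mirror the patch, but the bit positions are illustrative:

#include <stdio.h>

/* Simplified model of rq_flag_bits above: every flag is one bit of a
 * 64-bit word, and composite write types such as WRITE_BG are just
 * OR-ed flag sets. Bit positions here are illustrative only. */
enum { __REQ_SYNC, __REQ_NOIDLE, __REQ_BG };
#define REQ_SYNC	(1ULL << __REQ_SYNC)
#define REQ_NOIDLE	(1ULL << __REQ_NOIDLE)
#define REQ_BG		(1ULL << __REQ_BG)
#define WRITE_BG	(REQ_NOIDLE | REQ_BG)

int main(void)
{
	unsigned long long flags = WRITE_BG;

	/* lower layers test individual bits, never the composite */
	printf("background: %d, sync: %d\n",
	       !!(flags & REQ_BG), !!(flags & REQ_SYNC));
	return 0;
}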
+From 33a170c4f076584bc05feb19efa7beb0ee099318 Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@fb.com>
+Date: Wed, 27 Jul 2016 15:24:08 -0600
+Subject: writeback: add wbc_to_write_flags()
+
+Add wbc_to_write_flags(), which returns the write modifier flags to use,
+based on a struct writeback_control. No functional changes in this
+patch, but it prepares us for factoring other wbc fields for write type.
+
+Signed-off-by: Jens Axboe <axboe@fb.com>
+Reviewed-by: Jan Kara <jack@suse.cz>
+---
+ fs/buffer.c               | 2 +-
+ fs/f2fs/data.c            | 2 +-
+ fs/f2fs/node.c            | 2 +-
+ fs/gfs2/meta_io.c         | 3 +--
+ fs/mpage.c                | 2 +-
+ fs/xfs/xfs_aops.c         | 7 +++----
+ include/linux/writeback.h | 8 ++++++++
+ 7 files changed, 16 insertions(+), 10 deletions(-)
+
+diff --git a/fs/buffer.c b/fs/buffer.c
+index 9c8eb9b..6a5f1a0 100644
+--- a/fs/buffer.c
++++ b/fs/buffer.c
+@@ -1698,7 +1698,7 @@ int __block_write_full_page(struct inode *inode, struct page *page,
+ 	struct buffer_head *bh, *head;
+ 	unsigned int blocksize, bbits;
+ 	int nr_underway = 0;
+-	int write_flags = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0);
++	int write_flags = wbc_to_write_flags(wbc);
+ 
+ 	head = create_page_buffers(page, inode,
+ 					(1 << BH_Dirty)|(1 << BH_Uptodate));
+diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
+index ccb401e..cb0528b 100644
+--- a/fs/f2fs/data.c
++++ b/fs/f2fs/data.c
+@@ -1240,7 +1240,7 @@ static int f2fs_write_data_page(struct page *page,
+ 		.sbi = sbi,
+ 		.type = DATA,
+ 		.op = REQ_OP_WRITE,
+-		.op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0,
++		.op_flags = wbc_to_write_flags(wbc),
+ 		.page = page,
+ 		.encrypted_page = NULL,
+ 	};
+diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
+index f75d197..c1713da 100644
+--- a/fs/f2fs/node.c
++++ b/fs/f2fs/node.c
+@@ -1561,7 +1561,7 @@ static int f2fs_write_node_page(struct page *page,
+ 		.sbi = sbi,
+ 		.type = NODE,
+ 		.op = REQ_OP_WRITE,
+-		.op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0,
++		.op_flags = wbc_to_write_flags(wbc),
+ 		.page = page,
+ 		.encrypted_page = NULL,
+ 	};
+diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
+index 950b8be..7991c62 100644
+--- a/fs/gfs2/meta_io.c
++++ b/fs/gfs2/meta_io.c
+@@ -37,8 +37,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
+ {
+ 	struct buffer_head *bh, *head;
+ 	int nr_underway = 0;
+-	int write_flags = REQ_META | REQ_PRIO |
+-			  (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0);
++	int write_flags = REQ_META | REQ_PRIO | wbc_to_write_flags(wbc);
+ 
+ 	BUG_ON(!PageLocked(page));
+ 	BUG_ON(!page_has_buffers(page));
+diff --git a/fs/mpage.c b/fs/mpage.c
+index d2413af..d6f1afe 100644
+--- a/fs/mpage.c
++++ b/fs/mpage.c
+@@ -489,7 +489,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
+ 	struct buffer_head map_bh;
+ 	loff_t i_size = i_size_read(inode);
+ 	int ret = 0;
+-	int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0);
++	int op_flags = wbc_to_write_flags(wbc);
+ 
+ 	if (page_has_buffers(page)) {
+ 		struct buffer_head *head = page_buffers(page);
+diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
+index 7575cfc..a68645a 100644
+--- a/fs/xfs/xfs_aops.c
++++ b/fs/xfs/xfs_aops.c
+@@ -447,8 +447,8 @@ xfs_submit_ioend(
+ 
+ 	ioend->io_bio->bi_private = ioend;
+ 	ioend->io_bio->bi_end_io = xfs_end_bio;
+-	bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE,
+-			 (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0);
++	bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE, wbc_to_write_flags(wbc));
++
+ 	/*
+ 	 * If we are failing the IO now, just mark the ioend with an
+ 	 * error and finish it. This will run IO completion immediately
+@@ -519,8 +519,7 @@ xfs_chain_bio(
+ 
+ 	bio_chain(ioend->io_bio, new);
+ 	bio_get(ioend->io_bio);		/* for xfs_destroy_ioend */
+-	bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE,
+-			 (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0);
++	bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE, wbc_to_write_flags(wbc));
+ 	submit_bio(ioend->io_bio);
+ 	ioend->io_bio = new;
+ }
+diff --git a/include/linux/writeback.h b/include/linux/writeback.h
+index fc1e16c..608afd3 100644
+--- a/include/linux/writeback.h
++++ b/include/linux/writeback.h
+@@ -100,6 +100,14 @@ struct writeback_control {
+ #endif
+ };
+ 
++static inline int wbc_to_write_flags(struct writeback_control *wbc)
++{
++	if (wbc->sync_mode == WB_SYNC_ALL)
++		return WRITE_SYNC;
++
++	return 0;
++}
++
+ /*
+  * A wb_domain represents a domain that wb's (bdi_writeback's) belong to
+  * and are measured against each other in. There always is one global
+--
+cgit v0.11.2
+
+From d6cf7bfd4d627114ba3e2cce96fa9468042a6fba Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@fb.com>
+Date: Thu, 14 Apr 2016 09:53:24 -0600
+Subject: writeback: use WRITE_BG for kupdate and background writeback
+
+If we're doing background type writes, then use the appropriate
+write command for that.
+
+Signed-off-by: Jens Axboe <axboe@fb.com>
+---
+ include/linux/writeback.h | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/include/linux/writeback.h b/include/linux/writeback.h
+index 608afd3..e53abf2 100644
+--- a/include/linux/writeback.h
++++ b/include/linux/writeback.h
+@@ -104,6 +104,8 @@ static inline int wbc_to_write_flags(struct writeback_control *wbc)
+ {
+ 	if (wbc->sync_mode == WB_SYNC_ALL)
+ 		return WRITE_SYNC;
++	else if (wbc->for_kupdate || wbc->for_background)
++		return WRITE_BG;
+ 
+ 	return 0;
+ }
+--
+cgit v0.11.2
+
+From cd38cff40da34de0bf78f8305c89bdfafc606e7f Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@fb.com>
+Date: Thu, 1 Sep 2016 10:20:33 -0600
+Subject: writeback: track if we're sleeping on progress in
+ balance_dirty_pages()
+
+Note in the bdi_writeback structure whenever a task ends up sleeping
+waiting for progress. We can use that information in the lower layers
+to increase the priority of writes.
+
+Signed-off-by: Jens Axboe <axboe@fb.com>
+---
+ include/linux/backing-dev-defs.h | 2 ++
+ mm/backing-dev.c                 | 1 +
+ mm/page-writeback.c              | 1 +
+ 3 files changed, 4 insertions(+)
+
+diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
+index c357f27..dc5f76d 100644
+--- a/include/linux/backing-dev-defs.h
++++ b/include/linux/backing-dev-defs.h
+@@ -116,6 +116,8 @@ struct bdi_writeback {
+ 	struct list_head work_list;
+ 	struct delayed_work dwork;	/* work item used for writeback */
+ 
++	unsigned long dirty_sleep;	/* last wait */
++
+ 	struct list_head bdi_node;	/* anchored at bdi->wb_list */
+ 
+ #ifdef CONFIG_CGROUP_WRITEBACK
+diff --git a/mm/backing-dev.c b/mm/backing-dev.c
+index 8fde443..3bfed5ab 100644
+--- a/mm/backing-dev.c
++++ b/mm/backing-dev.c
+@@ -310,6 +310,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
+ 	spin_lock_init(&wb->work_lock);
+ 	INIT_LIST_HEAD(&wb->work_list);
+ 	INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
++	wb->dirty_sleep = jiffies;
+ 
+ 	wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp);
+ 	if (!wb->congested)
+diff --git a/mm/page-writeback.c b/mm/page-writeback.c
+index f4cd7d8..98bc3fc 100644
+--- a/mm/page-writeback.c
++++ b/mm/page-writeback.c
+@@ -1778,6 +1778,7 @@ pause:
+ 					  pause,
+ 					  start_time);
+ 		__set_current_state(TASK_KILLABLE);
++		wb->dirty_sleep = now;
+ 		io_schedule_timeout(pause);
+ 
+ 		current->dirty_paused_when = now + pause;
+--
+cgit v0.11.2
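The single timestamp added above is consumed later in this series (lib/wbt.c, below) as a "did a writer recently stall in balance_dirty_pages()?" test. A standalone sketch of that comparison, with plain integers standing in for jiffies and ignoring the counter wraparound that the kernel's time_before() handles:

#include <stdbool.h>
#include <stdio.h>

/* Sketch of the wb_recent_wait()-style check built on dirty_sleep:
 * a writer throttled in balance_dirty_pages() stamps dirty_sleep, and
 * anything within the last HZ ticks counts as "recent". Plain integers
 * stand in for jiffies; the kernel uses time_before() to survive
 * wraparound. */
#define HZ 100

static bool recent_wait(unsigned long now, unsigned long dirty_sleep)
{
	return now < dirty_sleep + HZ;
}

int main(void)
{
	unsigned long dirty_sleep = 1000;	/* stamped at the last stall */

	printf("%d %d\n", recent_wait(1050, dirty_sleep),
	       recent_wait(1200, dirty_sleep));	/* prints: 1 0 */
	return 0;
}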
+From a98f5ab3840c2e6008c478aafe5df055404acdd1 Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@fb.com>
+Date: Wed, 30 Mar 2016 10:21:08 -0600
+Subject: block: add code to track actual device queue depth
+
+For blk-mq, ->nr_requests does track queue depth, at least at init
+time. But for the older queue paths, it's simply a soft setting.
+On top of that, it's generally larger than the hardware setting
+on purpose, to allow backup of requests for merging.
+
+Fill a hole in struct request with a 'queue_depth' member, that
+drivers can call to more closely inform the block layer of the
+real queue depth.
+
+Signed-off-by: Jens Axboe <axboe@fb.com>
+---
+ block/blk-settings.c   | 12 ++++++++++++
+ drivers/scsi/scsi.c    |  3 +++
+ include/linux/blkdev.h | 11 +++++++++++
+ 3 files changed, 26 insertions(+)
+
+diff --git a/block/blk-settings.c b/block/blk-settings.c
+index f679ae1..f7e122e 100644
+--- a/block/blk-settings.c
++++ b/block/blk-settings.c
+@@ -832,6 +832,18 @@ void blk_queue_flush_queueable(struct request_queue *q, bool queueable)
+ EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
+ 
+ /**
++ * blk_set_queue_depth - tell the block layer about the device queue depth
++ * @q:		the request queue for the device
++ * @depth:	queue depth
++ *
++ */
++void blk_set_queue_depth(struct request_queue *q, unsigned int depth)
++{
++	q->queue_depth = depth;
++}
++EXPORT_SYMBOL(blk_set_queue_depth);
++
++/**
+  * blk_queue_write_cache - configure queue's write cache
+  * @q:		the request queue for the device
+  * @wc:		write back cache on or off
+diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
+index 1f36aca..f3de98a 100644
+--- a/drivers/scsi/scsi.c
++++ b/drivers/scsi/scsi.c
+@@ -621,6 +621,9 @@ int scsi_change_queue_depth(struct scsi_device *sdev, int depth)
+ 		wmb();
+ 	}
+ 
++	if (sdev->request_queue)
++		blk_set_queue_depth(sdev->request_queue, depth);
++
+ 	return sdev->queue_depth;
+ }
+ EXPORT_SYMBOL(scsi_change_queue_depth);
+diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
+index e79055c..1d12aa6 100644
+--- a/include/linux/blkdev.h
++++ b/include/linux/blkdev.h
+@@ -327,6 +327,8 @@ struct request_queue {
+ 	struct blk_mq_ctx __percpu	*queue_ctx;
+ 	unsigned int		nr_queues;
+ 
++	unsigned int		queue_depth;
++
+ 	/* hw dispatch queues */
+ 	struct blk_mq_hw_ctx	**queue_hw_ctx;
+ 	unsigned int		nr_hw_queues;
+@@ -683,6 +685,14 @@ static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b)
+ 	return false;
+ }
+ 
++static inline unsigned int blk_queue_depth(struct request_queue *q)
++{
++	if (q->queue_depth)
++		return q->queue_depth;
++
++	return q->nr_requests;
++}
++
+ /*
+  * q->prep_rq_fn return values
+  */
+@@ -999,6 +1009,7 @@ extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
+ extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
+ extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt);
+ extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt);
++extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth);
+ extern void blk_set_default_limits(struct queue_limits *lim);
+ extern void blk_set_stacking_limits(struct queue_limits *lim);
+ extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
+--
+cgit v0.11.2
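The fallback that blk_queue_depth() implements is simple enough to model outside the kernel; this sketch re-creates just that logic with a stripped-down struct:

#include <stdio.h>

/* Userspace model of the fallback added above: prefer the depth a
 * driver reported via blk_set_queue_depth(), else fall back to the
 * soft nr_requests setting. */
struct request_queue {
	unsigned int queue_depth;	/* 0 until a driver reports one */
	unsigned int nr_requests;
};

static unsigned int blk_queue_depth(const struct request_queue *q)
{
	return q->queue_depth ? q->queue_depth : q->nr_requests;
}

int main(void)
{
	struct request_queue q = { .queue_depth = 0, .nr_requests = 128 };

	printf("%u\n", blk_queue_depth(&q));	/* 128: soft setting */
	q.queue_depth = 32;	/* e.g. SCSI: blk_set_queue_depth(q, 32) */
	printf("%u\n", blk_queue_depth(&q));	/* 32: real HW depth */
	return 0;
}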
+From a13cc5885ddd5582129869c1837821d6af6d48bb Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@fb.com>
+Date: Thu, 1 Sep 2016 10:22:41 -0600
+Subject: block: add scalable completion tracking of requests
+
+For legacy block, we simply track them in the request queue. For
+blk-mq, we track them on a per-sw queue basis, which we can then
+sum up through the hardware queues and finally to a per device
+state.
+
+The stats are tracked in, roughly, 0.1s interval windows.
+
+Add sysfs files to display the stats.
+
+Signed-off-by: Jens Axboe <axboe@fb.com>
+---
+ block/Makefile            |   2 +-
+ block/blk-core.c          |   4 +
+ block/blk-mq-sysfs.c      |  47 ++++++++++
+ block/blk-mq.c            |  14 +++
+ block/blk-mq.h            |   3 +
+ block/blk-stat.c          | 220 ++++++++++++++++++++++++++++++++++++++++
+ block/blk-stat.h          |  18 ++++
+ block/blk-sysfs.c         |  26 ++++++
+ include/linux/blk_types.h |  12 +++
+ include/linux/blkdev.h    |   4 +
+ 10 files changed, 349 insertions(+), 1 deletion(-)
+ create mode 100644 block/blk-stat.c
+ create mode 100644 block/blk-stat.h
+
+diff --git a/block/Makefile b/block/Makefile
+index 9eda232..3446e04 100644
+--- a/block/Makefile
++++ b/block/Makefile
+@@ -5,7 +5,7 @@
+ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
+ 			blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
+ 			blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
+-			blk-lib.o blk-mq.o blk-mq-tag.o \
++			blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
+ 			blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
+ 			genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
+ 			badblocks.o partitions/
+diff --git a/block/blk-core.c b/block/blk-core.c
+index 36c7ac3..4075cbe 100644
+--- a/block/blk-core.c
++++ b/block/blk-core.c
+@@ -2475,6 +2475,8 @@ void blk_start_request(struct request *req)
+ {
+ 	blk_dequeue_request(req);
+ 
++	req->issue_time = ktime_to_ns(ktime_get());
++
+ 	/*
+ 	 * We are now handing the request to the hardware, initialize
+ 	 * resid_len to full count and add the timeout handler.
+@@ -2542,6 +2544,8 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
+ 
+ 	trace_block_rq_complete(req->q, req, nr_bytes);
+ 
++	blk_stat_add(&req->q->rq_stats[rq_data_dir(req)], req);
++
+ 	if (!req->bio)
+ 		return false;
+ 
+diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
+index fe822aa..b66bbf1 100644
+--- a/block/blk-mq-sysfs.c
++++ b/block/blk-mq-sysfs.c
+@@ -247,6 +247,47 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
+ 	return ret;
+ }
+ 
++static void blk_mq_stat_clear(struct blk_mq_hw_ctx *hctx)
++{
++	struct blk_mq_ctx *ctx;
++	unsigned int i;
++
++	hctx_for_each_ctx(hctx, ctx, i) {
++		blk_stat_init(&ctx->stat[0]);
++		blk_stat_init(&ctx->stat[1]);
++	}
++}
++
++static ssize_t blk_mq_hw_sysfs_stat_store(struct blk_mq_hw_ctx *hctx,
++					  const char *page, size_t count)
++{
++	blk_mq_stat_clear(hctx);
++	return count;
++}
++
++static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
++{
++	return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
++			pre, (long long) stat->nr_samples,
++			(long long) stat->mean, (long long) stat->min,
++			(long long) stat->max);
++}
++
++static ssize_t blk_mq_hw_sysfs_stat_show(struct blk_mq_hw_ctx *hctx, char *page)
++{
++	struct blk_rq_stat stat[2];
++	ssize_t ret;
++
++	blk_stat_init(&stat[0]);
++	blk_stat_init(&stat[1]);
++
++	blk_hctx_stat_get(hctx, stat);
++
++	ret = print_stat(page, &stat[0], "read :");
++	ret += print_stat(page + ret, &stat[1], "write:");
++	return ret;
++}
++
+ static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = {
+ 	.attr = {.name = "dispatched", .mode = S_IRUGO },
+ 	.show = blk_mq_sysfs_dispatched_show,
+@@ -304,6 +345,11 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = {
+ 	.attr = {.name = "io_poll", .mode = S_IRUGO },
+ 	.show = blk_mq_hw_sysfs_poll_show,
+ };
++static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_stat = {
++	.attr = {.name = "stats", .mode = S_IRUGO | S_IWUSR },
++	.show = blk_mq_hw_sysfs_stat_show,
++	.store = blk_mq_hw_sysfs_stat_store,
++};
+ 
+ static struct attribute *default_hw_ctx_attrs[] = {
+ 	&blk_mq_hw_sysfs_queued.attr,
+@@ -314,6 +360,7 @@ static struct attribute *default_hw_ctx_attrs[] = {
+ 	&blk_mq_hw_sysfs_cpus.attr,
+ 	&blk_mq_hw_sysfs_active.attr,
+ 	&blk_mq_hw_sysfs_poll.attr,
++	&blk_mq_hw_sysfs_stat.attr,
+ 	NULL,
+ };
+ 
+diff --git a/block/blk-mq.c b/block/blk-mq.c
+index 13f5a6c..712f141 100644
+--- a/block/blk-mq.c
++++ b/block/blk-mq.c
+@@ -29,6 +29,7 @@
+ #include "blk.h"
+ #include "blk-mq.h"
+ #include "blk-mq-tag.h"
++#include "blk-stat.h"
+ 
+ static DEFINE_MUTEX(all_q_mutex);
+ static LIST_HEAD(all_q_list);
+@@ -400,10 +401,19 @@ static void blk_mq_ipi_complete_request(struct request *rq)
+ 	put_cpu();
+ }
+ 
++static void blk_mq_stat_add(struct request *rq)
++{
++	struct blk_rq_stat *stat = &rq->mq_ctx->stat[rq_data_dir(rq)];
++
++	blk_stat_add(stat, rq);
++}
++
+ static void __blk_mq_complete_request(struct request *rq)
+ {
+ 	struct request_queue *q = rq->q;
+ 
++	blk_mq_stat_add(rq);
++
+ 	if (!q->softirq_done_fn)
+ 		blk_mq_end_request(rq, rq->errors);
+ 	else
+@@ -447,6 +457,8 @@ void blk_mq_start_request(struct request *rq)
+ 	if (unlikely(blk_bidi_rq(rq)))
+ 		rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
+ 
++	rq->issue_time = ktime_to_ns(ktime_get());
++
+ 	blk_add_timer(rq);
+ 
+ 	/*
+@@ -1795,6 +1807,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
+ 		spin_lock_init(&__ctx->lock);
+ 		INIT_LIST_HEAD(&__ctx->rq_list);
+ 		__ctx->queue = q;
++		blk_stat_init(&__ctx->stat[0]);
++		blk_stat_init(&__ctx->stat[1]);
+ 
+ 		/* If the cpu isn't online, the cpu is mapped to first hctx */
+ 		if (!cpu_online(i))
+diff --git a/block/blk-mq.h b/block/blk-mq.h
+index 9087b11..e107f70 100644
+--- a/block/blk-mq.h
++++ b/block/blk-mq.h
+@@ -1,6 +1,8 @@
+ #ifndef INT_BLK_MQ_H
+ #define INT_BLK_MQ_H
+ 
++#include "blk-stat.h"
++
+ struct blk_mq_tag_set;
+ 
+ struct blk_mq_ctx {
+@@ -20,6 +22,7 @@ struct blk_mq_ctx {
+ 
+ 	/* incremented at completion time */
+ 	unsigned long		____cacheline_aligned_in_smp rq_completed[2];
++	struct blk_rq_stat	stat[2];
+ 
+ 	struct request_queue	*queue;
+ 	struct kobject		kobj;
+diff --git a/block/blk-stat.c b/block/blk-stat.c
+new file mode 100644
+index 0000000..3965e8a
+--- /dev/null
++++ b/block/blk-stat.c
+@@ -0,0 +1,220 @@
++/*
++ * Block stat tracking code
++ *
++ * Copyright (C) 2016 Jens Axboe
++ */
++#include <linux/kernel.h>
++#include <linux/blk-mq.h>
++
++#include "blk-stat.h"
++#include "blk-mq.h"
++
++static void blk_stat_flush_batch(struct blk_rq_stat *stat)
++{
++	if (!stat->nr_batch)
++		return;
++	if (!stat->nr_samples)
++		stat->mean = div64_s64(stat->batch, stat->nr_batch);
++	else {
++		stat->mean = div64_s64((stat->mean * stat->nr_samples) +
++					stat->batch,
++					stat->nr_samples + stat->nr_batch);
++	}
++
++	stat->nr_samples += stat->nr_batch;
++	stat->nr_batch = stat->batch = 0;
++}
++
++void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
++{
++	if (!src->nr_samples)
++		return;
++
++	blk_stat_flush_batch(src);
++
++	dst->min = min(dst->min, src->min);
++	dst->max = max(dst->max, src->max);
++
++	if (!dst->nr_samples)
++		dst->mean = src->mean;
++	else {
++		dst->mean = div64_s64((src->mean * src->nr_samples) +
++					(dst->mean * dst->nr_samples),
++					dst->nr_samples + src->nr_samples);
++	}
++	dst->nr_samples += src->nr_samples;
++}
++
++static void blk_mq_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
++{
++	struct blk_mq_hw_ctx *hctx;
++	struct blk_mq_ctx *ctx;
++	uint64_t latest = 0;
++	int i, j, nr;
++
++	blk_stat_init(&dst[0]);
++	blk_stat_init(&dst[1]);
++
++	nr = 0;
++	do {
++		uint64_t newest = 0;
++
++		queue_for_each_hw_ctx(q, hctx, i) {
++			hctx_for_each_ctx(hctx, ctx, j) {
++				if (!ctx->stat[0].nr_samples &&
++				    !ctx->stat[1].nr_samples)
++					continue;
++				if (ctx->stat[0].time > newest)
++					newest = ctx->stat[0].time;
++				if (ctx->stat[1].time > newest)
++					newest = ctx->stat[1].time;
++			}
++		}
++
++		/*
++		 * No samples
++		 */
++		if (!newest)
++			break;
++
++		if (newest > latest)
++			latest = newest;
++
++		queue_for_each_hw_ctx(q, hctx, i) {
++			hctx_for_each_ctx(hctx, ctx, j) {
++				if (ctx->stat[0].time == newest) {
++					blk_stat_sum(&dst[0], &ctx->stat[0]);
++					nr++;
++				}
++				if (ctx->stat[1].time == newest) {
++					blk_stat_sum(&dst[1], &ctx->stat[1]);
++					nr++;
++				}
++			}
++		}
++		/*
++		 * If we race on finding an entry, just loop back again.
++		 * Should be very rare.
++		 */
++	} while (!nr);
++
++	dst[0].time = dst[1].time = latest;
++}
++
++void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
++{
++	if (q->mq_ops)
++		blk_mq_stat_get(q, dst);
++	else {
++		memcpy(&dst[0], &q->rq_stats[0], sizeof(struct blk_rq_stat));
++		memcpy(&dst[1], &q->rq_stats[1], sizeof(struct blk_rq_stat));
++	}
++}
++
++void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst)
++{
++	struct blk_mq_ctx *ctx;
++	unsigned int i, nr;
++
++	nr = 0;
++	do {
++		uint64_t newest = 0;
++
++		hctx_for_each_ctx(hctx, ctx, i) {
++			if (!ctx->stat[0].nr_samples &&
++			    !ctx->stat[1].nr_samples)
++				continue;
++
++			if (ctx->stat[0].time > newest)
++				newest = ctx->stat[0].time;
++			if (ctx->stat[1].time > newest)
++				newest = ctx->stat[1].time;
++		}
++
++		if (!newest)
++			break;
++
++		hctx_for_each_ctx(hctx, ctx, i) {
++			if (ctx->stat[0].time == newest) {
++				blk_stat_sum(&dst[0], &ctx->stat[0]);
++				nr++;
++			}
++			if (ctx->stat[1].time == newest) {
++				blk_stat_sum(&dst[1], &ctx->stat[1]);
++				nr++;
++			}
++		}
++		/*
++		 * If we race on finding an entry, just loop back again.
++		 * Should be very rare, as the window is only updated
++		 * occasionally
++		 */
++	} while (!nr);
++}
++
++static void __blk_stat_init(struct blk_rq_stat *stat, s64 time_now)
++{
++	stat->min = -1ULL;
++	stat->max = stat->nr_samples = stat->mean = 0;
++	stat->batch = stat->nr_batch = 0;
++	stat->time = time_now & BLK_STAT_MASK;
++}
++
++void blk_stat_init(struct blk_rq_stat *stat)
++{
++	__blk_stat_init(stat, ktime_to_ns(ktime_get()));
++}
++
++static bool __blk_stat_is_current(struct blk_rq_stat *stat, s64 now)
++{
++	return (now & BLK_STAT_MASK) == (stat->time & BLK_STAT_MASK);
++}
++
++bool blk_stat_is_current(struct blk_rq_stat *stat)
++{
++	return __blk_stat_is_current(stat, ktime_to_ns(ktime_get()));
++}
++
++void blk_stat_add(struct blk_rq_stat *stat, struct request *rq)
++{
++	s64 now, value;
++
++	now = ktime_to_ns(ktime_get());
++	if (now < rq->issue_time)
++		return;
++
++	if (!__blk_stat_is_current(stat, now))
++		__blk_stat_init(stat, now);
++
++	value = now - rq->issue_time;
++	if (value > stat->max)
++		stat->max = value;
++	if (value < stat->min)
++		stat->min = value;
++
++	if (stat->batch + value < stat->batch ||
++	    stat->nr_batch + 1 == BLK_RQ_STAT_BATCH)
++		blk_stat_flush_batch(stat);
++
++	stat->batch += value;
++	stat->nr_batch++;
++}
++
++void blk_stat_clear(struct request_queue *q)
++{
++	if (q->mq_ops) {
++		struct blk_mq_hw_ctx *hctx;
++		struct blk_mq_ctx *ctx;
++		int i, j;
++
++		queue_for_each_hw_ctx(q, hctx, i) {
++			hctx_for_each_ctx(hctx, ctx, j) {
++				blk_stat_init(&ctx->stat[0]);
++				blk_stat_init(&ctx->stat[1]);
++			}
++		}
++	} else {
++		blk_stat_init(&q->rq_stats[0]);
++		blk_stat_init(&q->rq_stats[1]);
++	}
++}
+diff --git a/block/blk-stat.h b/block/blk-stat.h
+new file mode 100644
+index 0000000..376a6cc
+--- /dev/null
++++ b/block/blk-stat.h
+@@ -0,0 +1,18 @@
++#ifndef BLK_STAT_H
++#define BLK_STAT_H
++
++/*
++ * ~0.13s window as a power-of-2 (2^27 nsecs)
++ */
++#define BLK_STAT_NSEC		134217728ULL
++#define BLK_STAT_MASK		~(BLK_STAT_NSEC - 1)
++
++void blk_stat_add(struct blk_rq_stat *, struct request *);
++void blk_hctx_stat_get(struct blk_mq_hw_ctx *, struct blk_rq_stat *);
++void blk_queue_stat_get(struct request_queue *, struct blk_rq_stat *);
++void blk_stat_clear(struct request_queue *q);
++void blk_stat_init(struct blk_rq_stat *);
++void blk_stat_sum(struct blk_rq_stat *, struct blk_rq_stat *);
++bool blk_stat_is_current(struct blk_rq_stat *);
++
++#endif
+diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
+index f87a7e7..0b9e435 100644
+--- a/block/blk-sysfs.c
++++ b/block/blk-sysfs.c
+@@ -384,6 +384,26 @@ static ssize_t queue_dax_show(struct request_queue *q, char *page)
+ 	return queue_var_show(blk_queue_dax(q), page);
+ }
+ 
++static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
++{
++	return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
++			pre, (long long) stat->nr_samples,
++			(long long) stat->mean, (long long) stat->min,
++			(long long) stat->max);
++}
++
++static ssize_t queue_stats_show(struct request_queue *q, char *page)
++{
++	struct blk_rq_stat stat[2];
++	ssize_t ret;
++
++	blk_queue_stat_get(q, stat);
++
++	ret = print_stat(page, &stat[0], "read :");
++	ret += print_stat(page + ret, &stat[1], "write:");
++	return ret;
++}
++
+ static struct queue_sysfs_entry queue_requests_entry = {
+ 	.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
+ 	.show = queue_requests_show,
+@@ -526,6 +546,11 @@ static struct queue_sysfs_entry queue_dax_entry = {
+ 	.show = queue_dax_show,
+ };
+ 
++static struct queue_sysfs_entry queue_stats_entry = {
++	.attr = {.name = "stats", .mode = S_IRUGO },
++	.show = queue_stats_show,
++};
++
+ static struct attribute *default_attrs[] = {
+ 	&queue_requests_entry.attr,
+ 	&queue_ra_entry.attr,
+@@ -553,6 +578,7 @@ static struct attribute *default_attrs[] = {
+ 	&queue_poll_entry.attr,
+ 	&queue_wc_entry.attr,
+ 	&queue_dax_entry.attr,
++	&queue_stats_entry.attr,
+ 	NULL,
+ };
+ 
+diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
+index be4409b..95fbfa1 100644
+--- a/include/linux/blk_types.h
++++ b/include/linux/blk_types.h
+@@ -266,4 +266,16 @@ static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie)
+ 	return cookie & ((1u << BLK_QC_T_SHIFT) - 1);
+ }
+ 
++#define BLK_RQ_STAT_BATCH	64
++
++struct blk_rq_stat {
++	s64 mean;
++	u64 min;
++	u64 max;
++	s32 nr_samples;
++	s32 nr_batch;
++	u64 batch;
++	s64 time;
++};
++
+ #endif /* __LINUX_BLK_TYPES_H */
+diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
+index 1d12aa6..259eba8 100644
+--- a/include/linux/blkdev.h
++++ b/include/linux/blkdev.h
+@@ -151,6 +151,7 @@ struct request {
+ 	struct gendisk *rq_disk;
+ 	struct hd_struct *part;
+ 	unsigned long start_time;
++	s64 issue_time;
+ #ifdef CONFIG_BLK_CGROUP
+ 	struct request_list *rl;		/* rl this rq is alloced from */
+ 	unsigned long long start_time_ns;
+@@ -414,6 +415,9 @@ struct request_queue {
+ 
+ 	unsigned int		nr_sorted;
+ 	unsigned int		in_flight[2];
++
++	struct blk_rq_stat	rq_stats[2];
++
+ 	/*
+ 	 * Number of active block driver functions for which blk_drain_queue()
+ 	 * must wait. Must be incremented around functions that unlock the
+--
+cgit v0.11.2
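The "roughly 0.1s interval windows" from the commit message come from the power-of-two mask in blk-stat.h above: two samples belong to the same window exactly when their masked timestamps match. A small demonstration using the constants from the patch (the sample times are made up):

#include <stdio.h>
#include <stdint.h>

/* BLK_STAT_NSEC/BLK_STAT_MASK are copied from blk-stat.h above. */
#define BLK_STAT_NSEC	134217728ULL		/* 2^27 ns, ~0.13 s */
#define BLK_STAT_MASK	(~(BLK_STAT_NSEC - 1))

static int same_window(uint64_t a, uint64_t b)
{
	return (a & BLK_STAT_MASK) == (b & BLK_STAT_MASK);
}

int main(void)
{
	uint64_t t1 = 10 * BLK_STAT_NSEC + 1000;	/* inside window 10 */
	uint64_t t2 = t1 + BLK_STAT_NSEC / 2;		/* still window 10 */
	uint64_t t3 = t1 + 2 * BLK_STAT_NSEC;		/* window 12 */

	printf("%d %d\n", same_window(t1, t2), same_window(t1, t3));	/* 1 0 */
	return 0;
}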
+From 9a38b8e46f9f759dbb3fd81810579ac1013bf814 Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@fb.com>
+Date: Thu, 8 Sep 2016 11:07:16 -0600
+Subject: wbt: add general throttling mechanism
+
+We can hook this up to the block layer, to help throttle buffered
+writes. Or NFS can tap into it, to accomplish the same.
+
+wbt registers a few trace points that can be used to track what is
+happening in the system:
+
+wbt_lat: 259:0: latency 2446318
+wbt_stat: 259:0: rmean=2446318, rmin=2446318, rmax=2446318, rsamples=1,
+	wmean=518866, wmin=15522, wmax=5330353, wsamples=57
+wbt_step: 259:0: step down: step=1, window=72727272, background=8, normal=16, max=32
+
+This shows a sync issue event (wbt_lat) that exceeded its time. wbt_stat
+dumps the current read/write stats for that window, and wbt_step shows a
+step down event where we now scale back writes. Each trace includes the
+device, 259:0 in this case.
+
+Signed-off-by: Jens Axboe <axboe@fb.com>
+---
+ include/linux/wbt.h        | 120 ++++++++
+ include/trace/events/wbt.h | 153 ++++++++++
+ lib/Kconfig                |   3 +
+ lib/Makefile               |   1 +
+ lib/wbt.c                  | 681 +++++++++++++++++++++++++++++++++++++++++++++
+ 5 files changed, 958 insertions(+)
+ create mode 100644 include/linux/wbt.h
+ create mode 100644 include/trace/events/wbt.h
+ create mode 100644 lib/wbt.c
+
+diff --git a/include/linux/wbt.h b/include/linux/wbt.h
+new file mode 100644
+index 0000000..5ffcd14
+--- /dev/null
++++ b/include/linux/wbt.h
+@@ -0,0 +1,120 @@
++#ifndef WB_THROTTLE_H
++#define WB_THROTTLE_H
++
++#include <linux/atomic.h>
++#include <linux/wait.h>
++#include <linux/timer.h>
++#include <linux/ktime.h>
++
++enum {
++	ISSUE_STAT_TRACKED	= 1ULL << 63,
++	ISSUE_STAT_READ		= 1ULL << 62,
++	ISSUE_STAT_MASK		= ISSUE_STAT_TRACKED | ISSUE_STAT_READ,
++	ISSUE_STAT_TIME_MASK	= ~ISSUE_STAT_MASK,
++
++	WBT_TRACKED		= 1,
++	WBT_READ		= 2,
++};
++
++struct wb_issue_stat {
++	u64 time;
++};
++
++static inline void wbt_issue_stat_set_time(struct wb_issue_stat *stat)
++{
++	stat->time = (stat->time & ISSUE_STAT_MASK) |
++			(ktime_to_ns(ktime_get()) & ISSUE_STAT_TIME_MASK);
++}
++
++static inline u64 wbt_issue_stat_get_time(struct wb_issue_stat *stat)
++{
++	return stat->time & ISSUE_STAT_TIME_MASK;
++}
++
++static inline void wbt_mark_tracked(struct wb_issue_stat *stat)
++{
++	stat->time |= ISSUE_STAT_TRACKED;
++}
++
++static inline void wbt_clear_state(struct wb_issue_stat *stat)
++{
++	stat->time &= ~(ISSUE_STAT_TRACKED | ISSUE_STAT_READ);
++}
++
++static inline bool wbt_tracked(struct wb_issue_stat *stat)
++{
++	return (stat->time & ISSUE_STAT_TRACKED) != 0;
++}
++
++static inline void wbt_mark_read(struct wb_issue_stat *stat)
++{
++	stat->time |= ISSUE_STAT_READ;
++}
++
++static inline bool wbt_is_read(struct wb_issue_stat *stat)
++{
++	return (stat->time & ISSUE_STAT_READ) != 0;
++}
++
++struct wb_stat_ops {
++	void (*get)(void *, struct blk_rq_stat *);
++	bool (*is_current)(struct blk_rq_stat *);
++	void (*clear)(void *);
++};
++
++struct rq_wb {
++	/*
++	 * Settings that govern how we throttle
++	 */
++	unsigned int wb_background;		/* background writeback */
++	unsigned int wb_normal;			/* normal writeback */
++	unsigned int wb_max;			/* max throughput writeback */
++	int scale_step;
++	bool scaled_max;
++
++	u64 win_nsec;				/* default window size */
++	u64 cur_win_nsec;			/* current window size */
++
++	/*
++	 * Number of consecutive periods where we don't have enough
++	 * information to make a firm scale up/down decision.
++	 */
++	unsigned int unknown_cnt;
++
++	struct timer_list window_timer;
++
++	s64 sync_issue;
++	void *sync_cookie;
++
++	unsigned int wc;
++	unsigned int queue_depth;
++
++	unsigned long last_issue;		/* last non-throttled issue */
++	unsigned long last_comp;		/* last non-throttled comp */
++	unsigned long min_lat_nsec;
++	struct backing_dev_info *bdi;
++	struct request_queue *q;
++	wait_queue_head_t wait;
++	atomic_t inflight;
++
++	struct wb_stat_ops *stat_ops;
++	void *ops_data;
++};
++
++struct backing_dev_info;
++
++void __wbt_done(struct rq_wb *);
++void wbt_done(struct rq_wb *, struct wb_issue_stat *);
++unsigned int wbt_wait(struct rq_wb *, unsigned int, spinlock_t *);
++struct rq_wb *wbt_init(struct backing_dev_info *, struct wb_stat_ops *, void *);
++void wbt_exit(struct rq_wb *);
++void wbt_update_limits(struct rq_wb *);
++void wbt_requeue(struct rq_wb *, struct wb_issue_stat *);
++void wbt_issue(struct rq_wb *, struct wb_issue_stat *);
++void wbt_disable(struct rq_wb *);
++void wbt_track(struct wb_issue_stat *, unsigned int);
++
++void wbt_set_queue_depth(struct rq_wb *, unsigned int);
++void wbt_set_write_cache(struct rq_wb *, bool);
++
++#endif
+diff --git a/include/trace/events/wbt.h b/include/trace/events/wbt.h
+new file mode 100644
+index 0000000..926c7ee
+--- /dev/null
++++ b/include/trace/events/wbt.h
+@@ -0,0 +1,153 @@
++#undef TRACE_SYSTEM
++#define TRACE_SYSTEM wbt
++
++#if !defined(_TRACE_WBT_H) || defined(TRACE_HEADER_MULTI_READ)
++#define _TRACE_WBT_H
++
++#include <linux/tracepoint.h>
++#include <linux/wbt.h>
++
++/**
++ * wbt_stat - trace stats for blk_wb
++ * @stat: array of read/write stats
++ */
++TRACE_EVENT(wbt_stat,
++
++	TP_PROTO(struct backing_dev_info *bdi, struct blk_rq_stat *stat),
++
++	TP_ARGS(bdi, stat),
++
++	TP_STRUCT__entry(
++		__array(char, name, 32)
++		__field(s64, rmean)
++		__field(u64, rmin)
++		__field(u64, rmax)
++		__field(s64, rnr_samples)
++		__field(s64, rtime)
++		__field(s64, wmean)
++		__field(u64, wmin)
++		__field(u64, wmax)
++		__field(s64, wnr_samples)
++		__field(s64, wtime)
++	),
++
++	TP_fast_assign(
++		strncpy(__entry->name, dev_name(bdi->dev), 32);
++		__entry->rmean		= stat[0].mean;
++		__entry->rmin		= stat[0].min;
++		__entry->rmax		= stat[0].max;
++		__entry->rnr_samples	= stat[0].nr_samples;
++		__entry->wmean		= stat[1].mean;
++		__entry->wmin		= stat[1].min;
++		__entry->wmax		= stat[1].max;
++		__entry->wnr_samples	= stat[1].nr_samples;
++	),
++
++	TP_printk("%s: rmean=%llu, rmin=%llu, rmax=%llu, rsamples=%llu, "
++		  "wmean=%llu, wmin=%llu, wmax=%llu, wsamples=%llu\n",
++		  __entry->name, __entry->rmean, __entry->rmin, __entry->rmax,
++		  __entry->rnr_samples, __entry->wmean, __entry->wmin,
++		  __entry->wmax, __entry->wnr_samples)
++);
++
++/**
++ * wbt_lat - trace latency event
++ * @lat: latency trigger
++ */
++TRACE_EVENT(wbt_lat,
++
++	TP_PROTO(struct backing_dev_info *bdi, unsigned long lat),
++
++	TP_ARGS(bdi, lat),
++
++	TP_STRUCT__entry(
++		__array(char, name, 32)
++		__field(unsigned long, lat)
++	),
++
++	TP_fast_assign(
++		strncpy(__entry->name, dev_name(bdi->dev), 32);
++		__entry->lat = div_u64(lat, 1000);
++	),
++
++	TP_printk("%s: latency %lluus\n", __entry->name,
++			(unsigned long long) __entry->lat)
++);
++
++/**
++ * wbt_step - trace wb event step
++ * @msg: context message
++ * @step: the current scale step count
++ * @window: the current monitoring window
++ * @bg: the current background queue limit
++ * @normal: the current normal writeback limit
++ * @max: the current max throughput writeback limit
++ */
++TRACE_EVENT(wbt_step,
++
++	TP_PROTO(struct backing_dev_info *bdi, const char *msg,
++		 int step, unsigned long window, unsigned int bg,
++		 unsigned int normal, unsigned int max),
++
++	TP_ARGS(bdi, msg, step, window, bg, normal, max),
++
++	TP_STRUCT__entry(
++		__array(char, name, 32)
++		__field(const char *, msg)
++		__field(int, step)
++		__field(unsigned long, window)
++		__field(unsigned int, bg)
++		__field(unsigned int, normal)
++		__field(unsigned int, max)
++	),
++
++	TP_fast_assign(
++		strncpy(__entry->name, dev_name(bdi->dev), 32);
++		__entry->msg	= msg;
++		__entry->step	= step;
++		__entry->window	= div_u64(window, 1000);
++		__entry->bg	= bg;
++		__entry->normal	= normal;
++		__entry->max	= max;
++	),
++
++	TP_printk("%s: %s: step=%d, window=%luus, background=%u, normal=%u, max=%u\n",
++		  __entry->name, __entry->msg, __entry->step, __entry->window,
++		  __entry->bg, __entry->normal, __entry->max)
++);
++
++/**
++ * wbt_timer - trace wb timer event
++ * @status: timer state status
++ * @step: the current scale step count
++ * @inflight: tracked writes inflight
++ */
++TRACE_EVENT(wbt_timer,
++
++	TP_PROTO(struct backing_dev_info *bdi, unsigned int status,
++		 int step, unsigned int inflight),
++
++	TP_ARGS(bdi, status, step, inflight),
++
++	TP_STRUCT__entry(
++		__array(char, name, 32)
++		__field(unsigned int, status)
++		__field(int, step)
++		__field(unsigned int, inflight)
++	),
++
++	TP_fast_assign(
++		strncpy(__entry->name, dev_name(bdi->dev), 32);
++		__entry->status		= status;
++		__entry->step		= step;
++		__entry->inflight	= inflight;
++	),
++
++	TP_printk("%s: status=%u, step=%d, inflight=%u\n", __entry->name,
++		  __entry->status, __entry->step, __entry->inflight)
++);
++
++#endif /* _TRACE_WBT_H */
++
++/* This part must be outside protection */
++#include <trace/define_trace.h>
+diff --git a/lib/Kconfig b/lib/Kconfig
+index d79909d..c585e4c 100644
+--- a/lib/Kconfig
++++ b/lib/Kconfig
+@@ -550,4 +550,7 @@ config STACKDEPOT
+ 	bool
+ 	select STACKTRACE
+ 
++config WBT
++	bool
++
+ endmenu
+diff --git a/lib/Makefile b/lib/Makefile
+index 5dc77a8..23afd63 100644
+--- a/lib/Makefile
++++ b/lib/Makefile
+@@ -177,6 +177,7 @@ obj-$(CONFIG_SG_SPLIT) += sg_split.o
+ obj-$(CONFIG_SG_POOL) += sg_pool.o
+ obj-$(CONFIG_STMP_DEVICE) += stmp_device.o
+ obj-$(CONFIG_IRQ_POLL) += irq_poll.o
++obj-$(CONFIG_WBT) += wbt.o
+ 
+ obj-$(CONFIG_STACKDEPOT) += stackdepot.o
+ KASAN_SANITIZE_stackdepot.o := n
+diff --git a/lib/wbt.c b/lib/wbt.c
+new file mode 100644
+index 0000000..a995703
+--- /dev/null
++++ b/lib/wbt.c
+@@ -0,0 +1,681 @@
++/*
++ * buffered writeback throttling. loosely based on CoDel. We can't drop
++ * packets for IO scheduling, so the logic is something like this:
++ *
++ * - Monitor latencies in a defined window of time.
++ * - If the minimum latency in the above window exceeds some target, increment
++ *   scaling step and scale down queue depth by a factor of 2x. The monitoring
++ *   window is then shrunk to 100 / sqrt(scaling step + 1).
++ * - For any window where we don't have solid data on what the latencies
++ *   look like, retain status quo.
++ * - If latencies look good, decrement scaling step.
++ * - If we're only doing writes, allow the scaling step to go negative. This
++ *   will temporarily boost write performance, snapping back to a stable
++ *   scaling step of 0 if reads show up or the heavy writers finish. Unlike
++ *   positive scaling steps where we shrink the monitoring window, a negative
++ *   scaling step retains the default step==0 window size.
++ *
++ * Copyright (C) 2016 Jens Axboe
++ *
++ */
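As an illustration of the scaling rules this header comment describes (not part of the patch itself), the following computes the allowed write depth and monitoring window for the first few positive scaling steps, using the RWB_DEF_DEPTH = 16 default and the 100ms base window defined in the code below:

#include <stdio.h>
#include <math.h>

/* Model of the scaling rules described above: each positive step
 * roughly halves the allowed write depth (the shift calc_wb_limits()
 * below uses) and shrinks the 100ms monitoring window by a factor of
 * 1/sqrt(step + 1). Build with -lm. */
int main(void)
{
	const unsigned int def_depth = 16;	/* RWB_DEF_DEPTH */
	const double win_ms = 100.0;		/* RWB_WINDOW_NSEC */

	for (int step = 0; step <= 4; step++) {
		unsigned int depth = 1 + ((def_depth - 1) >> step);

		printf("step=%d depth=%u window=%.1fms\n",
		       step, depth, win_ms / sqrt(step + 1.0));
	}
	return 0;
}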
++#include <linux/kernel.h>
++#include <linux/blk_types.h>
++#include <linux/slab.h>
++#include <linux/backing-dev.h>
++#include <linux/wbt.h>
++
++#define CREATE_TRACE_POINTS
++#include <trace/events/wbt.h>
++
++enum {
++	/*
++	 * Default setting, we'll scale up (to 75% of QD max) or down (min 1)
++	 * from here depending on device stats
++	 */
++	RWB_DEF_DEPTH	= 16,
++
++	/*
++	 * 100msec window
++	 */
++	RWB_WINDOW_NSEC		= 100 * 1000 * 1000ULL,
++
++	/*
++	 * Disregard stats, if we don't meet this minimum
++	 */
++	RWB_MIN_WRITE_SAMPLES	= 3,
++
++	/*
++	 * If we have this number of consecutive windows with not enough
++	 * information to scale up or down, scale up.
++	 */
++	RWB_UNKNOWN_BUMP	= 5,
++};
++
++static inline bool rwb_enabled(struct rq_wb *rwb)
++{
++	return rwb && rwb->wb_normal != 0;
++}
++
++/*
++ * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
++ * false if 'v' + 1 would be bigger than 'below'.
++ */
++static bool atomic_inc_below(atomic_t *v, int below)
++{
++	int cur = atomic_read(v);
++
++	for (;;) {
++		int old;
++
++		if (cur >= below)
++			return false;
++		old = atomic_cmpxchg(v, cur, cur + 1);
++		if (old == cur)
++			break;
++		cur = old;
++	}
++
++	return true;
++}
++
++static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
++{
++	if (rwb_enabled(rwb)) {
++		const unsigned long cur = jiffies;
++
++		if (cur != *var)
++			*var = cur;
++	}
++}
++
++/*
++ * If a task was rate throttled in balance_dirty_pages() within the last
++ * second or so, use that to indicate a higher cleaning rate.
++ */
++static bool wb_recent_wait(struct rq_wb *rwb)
++{
++	struct bdi_writeback *wb = &rwb->bdi->wb;
++
++	return time_before(jiffies, wb->dirty_sleep + HZ);
++}
++
++void __wbt_done(struct rq_wb *rwb)
++{
++	int inflight, limit;
++
++	inflight = atomic_dec_return(&rwb->inflight);
++
++	/*
++	 * wbt got disabled with IO in flight. Wake up any potential
++	 * waiters, we don't have to do more than that.
++	 */
++	if (unlikely(!rwb_enabled(rwb))) {
++		wake_up_all(&rwb->wait);
++		return;
++	}
++
++	/*
++	 * If the device does write back caching, drop further down
++	 * before we wake people up.
++	 */
++	if (rwb->wc && !wb_recent_wait(rwb))
++		limit = 0;
++	else
++		limit = rwb->wb_normal;
++
++	/*
++	 * Don't wake anyone up if we are above the normal limit.
++	 */
++	if (inflight && inflight >= limit)
++		return;
++
++	if (waitqueue_active(&rwb->wait)) {
++		int diff = limit - inflight;
++
++		if (!inflight || diff >= rwb->wb_background / 2)
++			wake_up(&rwb->wait);
++	}
++}
++
++/*
++ * Called on completion of a request. Note that it's also called when
++ * a request is merged, when the request gets freed.
++ */
++void wbt_done(struct rq_wb *rwb, struct wb_issue_stat *stat)
++{
++	if (!rwb)
++		return;
++
++	if (!wbt_tracked(stat)) {
++		if (rwb->sync_cookie == stat) {
++			rwb->sync_issue = 0;
++			rwb->sync_cookie = NULL;
++		}
++
++		if (wbt_is_read(stat))
++			wb_timestamp(rwb, &rwb->last_comp);
++		wbt_clear_state(stat);
++	} else {
++		WARN_ON_ONCE(stat == rwb->sync_cookie);
++		__wbt_done(rwb);
++		wbt_clear_state(stat);
++	}
++}
++
++/*
++ * Return true, if we can't increase the depth further by scaling
++ */
++static bool calc_wb_limits(struct rq_wb *rwb)
++{
++	unsigned int depth;
++	bool ret = false;
++
++	if (!rwb->min_lat_nsec) {
++		rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0;
++		return false;
++	}
++
++	/*
++	 * For QD=1 devices, this is a special case. It's important for those
++	 * to have one request ready when one completes, so force a depth of
++	 * 2 for those devices. On the backend, it'll be a depth of 1 anyway,
++	 * since the device can't have more than that in flight. If we're
++	 * scaling down, then keep a setting of 1/1/1.
++	 */
++	if (rwb->queue_depth == 1) {
++		if (rwb->scale_step > 0)
++			rwb->wb_max = rwb->wb_normal = 1;
++		else {
++			rwb->wb_max = rwb->wb_normal = 2;
++			ret = true;
++		}
++		rwb->wb_background = 1;
++	} else {
++		/*
++		 * scale_step == 0 is our default state. If we have suffered
++		 * latency spikes, step will be > 0, and we shrink the
++		 * allowed write depths. If step is < 0, we're only doing
++		 * writes, and we allow a temporarily higher depth to
++		 * increase performance.
++		 */
++		depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth);
++		if (rwb->scale_step > 0)
++			depth = 1 + ((depth - 1) >> min(31, rwb->scale_step));
++		else if (rwb->scale_step < 0) {
++			unsigned int maxd = 3 * rwb->queue_depth / 4;
++
++			depth = 1 + ((depth - 1) << -rwb->scale_step);
++			if (depth > maxd) {
++				depth = maxd;
++				ret = true;
++			}
++		}
++
++		/*
++		 * Set our max/normal/bg queue depths based on how far
++		 * we have scaled down (->scale_step).
++		 */
++		rwb->wb_max = depth;
++		rwb->wb_normal = (rwb->wb_max + 1) / 2;
++		rwb->wb_background = (rwb->wb_max + 3) / 4;
++	}
++
++	return ret;
++}
++
++static bool inline stat_sample_valid(struct blk_rq_stat *stat)
++{
++	/*
++	 * We need at least one read sample, and a minimum of
++	 * RWB_MIN_WRITE_SAMPLES. We require some write samples to know
++	 * that it's writes impacting us, and not just some sole read on
++	 * a device that is in a lower power state.
++	 */
++	return stat[0].nr_samples >= 1 &&
++		stat[1].nr_samples >= RWB_MIN_WRITE_SAMPLES;
++}
++
++static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
++{
++	u64 now, issue = ACCESS_ONCE(rwb->sync_issue);
++
++	if (!issue || !rwb->sync_cookie)
++		return 0;
++
++	now = ktime_to_ns(ktime_get());
++	return now - issue;
++}
++
++enum {
++	LAT_OK = 1,
++	LAT_UNKNOWN,
++	LAT_UNKNOWN_WRITES,
++	LAT_EXCEEDED,
++};
++
++static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
++{
++	u64 thislat;
++
++	/*
++	 * If our stored sync issue exceeds the window size, or it
++	 * exceeds our min target AND we haven't logged any entries,
++	 * flag the latency as exceeded. wbt works off completion latencies,
++	 * but for a flooded device, a single sync IO can take a long time
++	 * to complete after being issued. If this time exceeds our
++	 * monitoring window AND we didn't see any other completions in that
++	 * window, then count that sync IO as a violation of the latency.
++	 */
++	thislat = rwb_sync_issue_lat(rwb);
++	if (thislat > rwb->cur_win_nsec ||
++	    (thislat > rwb->min_lat_nsec && !stat[0].nr_samples)) {
++		trace_wbt_lat(rwb->bdi, thislat);
++		return LAT_EXCEEDED;
++	}
++
++	/*
++	 * No read/write mix, if stat isn't valid
++	 */
++	if (!stat_sample_valid(stat)) {
++		/*
++		 * If we had writes in this stat window and the window is
++		 * current, we're only doing writes. If a task recently
++		 * waited or still has writes in flights, consider us doing
++		 * just writes as well.
++		 */
++		if ((stat[1].nr_samples && rwb->stat_ops->is_current(stat)) ||
++		    wb_recent_wait(rwb) || atomic_read(&rwb->inflight))
++			return LAT_UNKNOWN_WRITES;
++		return LAT_UNKNOWN;
++	}
++
++	/*
++	 * If the 'min' latency exceeds our target, step down.
++ */ ++ if (stat[0].min > rwb->min_lat_nsec) { ++ trace_wbt_lat(rwb->bdi, stat[0].min); ++ trace_wbt_stat(rwb->bdi, stat); ++ return LAT_EXCEEDED; ++ } ++ ++ if (rwb->scale_step) ++ trace_wbt_stat(rwb->bdi, stat); ++ ++ return LAT_OK; ++} ++ ++static int latency_exceeded(struct rq_wb *rwb) ++{ ++ struct blk_rq_stat stat[2]; ++ ++ rwb->stat_ops->get(rwb->ops_data, stat); ++ return __latency_exceeded(rwb, stat); ++} ++ ++static void rwb_trace_step(struct rq_wb *rwb, const char *msg) ++{ ++ trace_wbt_step(rwb->bdi, msg, rwb->scale_step, rwb->cur_win_nsec, ++ rwb->wb_background, rwb->wb_normal, rwb->wb_max); ++} ++ ++static void scale_up(struct rq_wb *rwb) ++{ ++ /* ++ * Hit max in previous round, stop here ++ */ ++ if (rwb->scaled_max) ++ return; ++ ++ rwb->scale_step--; ++ rwb->unknown_cnt = 0; ++ rwb->stat_ops->clear(rwb->ops_data); ++ ++ rwb->scaled_max = calc_wb_limits(rwb); ++ ++ if (waitqueue_active(&rwb->wait)) ++ wake_up_all(&rwb->wait); ++ ++ rwb_trace_step(rwb, "step up"); ++} ++ ++/* ++ * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we ++ * had a latency violation. ++ */ ++static void scale_down(struct rq_wb *rwb, bool hard_throttle) ++{ ++ /* ++ * Stop scaling down when we've hit the limit. This also prevents ++ * ->scale_step from going to crazy values, if the device can't ++ * keep up. ++ */ ++ if (rwb->wb_max == 1) ++ return; ++ ++ if (rwb->scale_step < 0 && hard_throttle) ++ rwb->scale_step = 0; ++ else ++ rwb->scale_step++; ++ ++ rwb->scaled_max = false; ++ rwb->unknown_cnt = 0; ++ rwb->stat_ops->clear(rwb->ops_data); ++ calc_wb_limits(rwb); ++ rwb_trace_step(rwb, "step down"); ++} ++ ++static void rwb_arm_timer(struct rq_wb *rwb) ++{ ++ unsigned long expires; ++ ++ if (rwb->scale_step > 0) { ++ /* ++ * We should speed this up, using some variant of a fast ++ * integer inverse square root calculation. Since we only do ++ * this for every window expiration, it's not a huge deal, ++ * though. ++ */ ++ rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4, ++ int_sqrt((rwb->scale_step + 1) << 8)); ++ } else { ++ /* ++ * For step < 0, we don't want to increase/decrease the ++ * window size. ++ */ ++ rwb->cur_win_nsec = rwb->win_nsec; ++ } ++ ++ expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec); ++ mod_timer(&rwb->window_timer, expires); ++} ++ ++static void wb_timer_fn(unsigned long data) ++{ ++ struct rq_wb *rwb = (struct rq_wb *) data; ++ int status, inflight; ++ ++ inflight = atomic_read(&rwb->inflight); ++ ++ status = latency_exceeded(rwb); ++ ++ trace_wbt_timer(rwb->bdi, status, rwb->scale_step, inflight); ++ ++ /* ++ * If we exceeded the latency target, step down. If we did not, ++ * step one level up. If we don't know enough to say either exceeded ++ * or ok, then don't do anything. ++ */ ++ switch (status) { ++ case LAT_EXCEEDED: ++ scale_down(rwb, true); ++ break; ++ case LAT_OK: ++ scale_up(rwb); ++ break; ++ case LAT_UNKNOWN_WRITES: ++ scale_up(rwb); ++ break; ++ case LAT_UNKNOWN: ++ if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP) ++ break; ++ /* ++ * We get here for two reasons: ++ * ++ * 1) We previously scaled reduced depth, and we currently ++ * don't have a valid read/write sample. For that case, ++ * slowly return to center state (step == 0). ++ * 2) We started a the center step, but don't have a valid ++ * read/write sample, but we do have writes going on. ++ * Allow step to go negative, to increase write perf. 
++ */ ++ if (rwb->scale_step > 0) ++ scale_up(rwb); ++ else if (rwb->scale_step < 0) ++ scale_down(rwb, false); ++ break; ++ default: ++ break; ++ } ++ ++ /* ++ * Re-arm timer, if we have IO in flight ++ */ ++ if (rwb->scale_step || inflight) ++ rwb_arm_timer(rwb); ++} ++ ++void wbt_update_limits(struct rq_wb *rwb) ++{ ++ rwb->scale_step = 0; ++ rwb->scaled_max = false; ++ calc_wb_limits(rwb); ++ ++ if (waitqueue_active(&rwb->wait)) ++ wake_up_all(&rwb->wait); ++} ++ ++static bool close_io(struct rq_wb *rwb) ++{ ++ const unsigned long now = jiffies; ++ ++ return time_before(now, rwb->last_issue + HZ / 10) || ++ time_before(now, rwb->last_comp + HZ / 10); ++} ++ ++#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO) ++ ++static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) ++{ ++ unsigned int limit; ++ ++ /* ++ * At this point we know it's a buffered write. If REQ_SYNC is ++ * set, then it's WB_SYNC_ALL writeback, and we'll use the max ++ * limit for that. If the write is marked as a background write, ++ * then use the idle limit, or go to normal if we haven't had ++ * competing IO for a bit. ++ */ ++ if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb)) ++ limit = rwb->wb_max; ++ else if ((rw & REQ_BG) || close_io(rwb)) { ++ /* ++ * If less than 100ms since we completed unrelated IO, ++ * limit us to half the depth for background writeback. ++ */ ++ limit = rwb->wb_background; ++ } else ++ limit = rwb->wb_normal; ++ ++ return limit; ++} ++ ++static inline bool may_queue(struct rq_wb *rwb, unsigned long rw) ++{ ++ /* ++ * inc it here even if disabled, since we'll dec it at completion. ++ * this only happens if the task was sleeping in __wbt_wait(), ++ * and someone turned it off at the same time. ++ */ ++ if (!rwb_enabled(rwb)) { ++ atomic_inc(&rwb->inflight); ++ return true; ++ } ++ ++ return atomic_inc_below(&rwb->inflight, get_limit(rwb, rw)); ++} ++ ++/* ++ * Block if we will exceed our limit, or if we are currently waiting for ++ * the timer to kick off queuing again. ++ */ ++static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock) ++{ ++ DEFINE_WAIT(wait); ++ ++ if (may_queue(rwb, rw)) ++ return; ++ ++ do { ++ prepare_to_wait_exclusive(&rwb->wait, &wait, ++ TASK_UNINTERRUPTIBLE); ++ ++ if (may_queue(rwb, rw)) ++ break; ++ ++ if (lock) ++ spin_unlock_irq(lock); ++ ++ io_schedule(); ++ ++ if (lock) ++ spin_lock_irq(lock); ++ } while (1); ++ ++ finish_wait(&rwb->wait, &wait); ++} ++ ++static inline bool wbt_should_throttle(struct rq_wb *rwb, unsigned int rw) ++{ ++ const int op = rw >> BIO_OP_SHIFT; ++ ++ /* ++ * If not a WRITE (or a discard), do nothing ++ */ ++ if (!(op == REQ_OP_WRITE || op == REQ_OP_DISCARD)) ++ return false; ++ ++ /* ++ * Don't throttle WRITE_ODIRECT ++ */ ++ if ((rw & (REQ_SYNC | REQ_NOIDLE)) == REQ_SYNC) ++ return false; ++ ++ return true; ++} ++ ++/* ++ * Returns true if the IO request should be accounted, false if not. ++ * May sleep, if we have exceeded the writeback limits. Caller can pass ++ * in an irq held spinlock, if it holds one when calling this function. ++ * If we do sleep, we'll release and re-grab it. 
++ */
++unsigned int wbt_wait(struct rq_wb *rwb, unsigned int rw, spinlock_t *lock)
++{
++	unsigned int ret = 0;
++
++	if (!rwb_enabled(rwb))
++		return 0;
++
++	if ((rw >> BIO_OP_SHIFT) == REQ_OP_READ)
++		ret = WBT_READ;
++
++	if (!wbt_should_throttle(rwb, rw)) {
++		if (ret & WBT_READ)
++			wb_timestamp(rwb, &rwb->last_issue);
++		return ret;
++	}
++
++	__wbt_wait(rwb, rw, lock);
++
++	if (!timer_pending(&rwb->window_timer))
++		rwb_arm_timer(rwb);
++
++	return ret | WBT_TRACKED;
++}
++
++void wbt_issue(struct rq_wb *rwb, struct wb_issue_stat *stat)
++{
++	if (!rwb_enabled(rwb))
++		return;
++
++	wbt_issue_stat_set_time(stat);
++
++	/*
++	 * Track sync issue, in case it takes a long time to complete. Allows
++	 * us to react quicker, if a sync IO takes a long time to complete.
++	 * Note that this is just a hint. 'stat' can go away when the
++	 * request completes, so it's important we never dereference it. We
++	 * only use the address to compare with, which is why we store the
++	 * sync_issue time locally.
++	 */
++	if (wbt_is_read(stat) && !rwb->sync_issue) {
++		rwb->sync_cookie = stat;
++		rwb->sync_issue = wbt_issue_stat_get_time(stat);
++	}
++}
++
++void wbt_track(struct wb_issue_stat *stat, unsigned int wb_acct)
++{
++	if (wb_acct & WBT_TRACKED)
++		wbt_mark_tracked(stat);
++	else if (wb_acct & WBT_READ)
++		wbt_mark_read(stat);
++}
++
++void wbt_requeue(struct rq_wb *rwb, struct wb_issue_stat *stat)
++{
++	if (!rwb_enabled(rwb))
++		return;
++	if (stat == rwb->sync_cookie) {
++		rwb->sync_issue = 0;
++		rwb->sync_cookie = NULL;
++	}
++}
++
++void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
++{
++	if (rwb) {
++		rwb->queue_depth = depth;
++		wbt_update_limits(rwb);
++	}
++}
++
++void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
++{
++	if (rwb)
++		rwb->wc = write_cache_on;
++}
++
++void wbt_disable(struct rq_wb *rwb)
++{
++	if (rwb) {
++		del_timer_sync(&rwb->window_timer);
++		rwb->win_nsec = rwb->min_lat_nsec = 0;
++		wbt_update_limits(rwb);
++	}
++}
++EXPORT_SYMBOL_GPL(wbt_disable);
++
++struct rq_wb *wbt_init(struct backing_dev_info *bdi, struct wb_stat_ops *ops,
++		       void *ops_data)
++{
++	struct rq_wb *rwb;
++
++	if (!ops->get || !ops->is_current || !ops->clear)
++		return ERR_PTR(-EINVAL);
++
++	rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
++	if (!rwb)
++		return ERR_PTR(-ENOMEM);
++
++	atomic_set(&rwb->inflight, 0);
++	init_waitqueue_head(&rwb->wait);
++	setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb);
++	rwb->wc = 1;
++	rwb->queue_depth = RWB_DEF_DEPTH;
++	rwb->last_comp = rwb->last_issue = jiffies;
++	rwb->bdi = bdi;
++	rwb->win_nsec = RWB_WINDOW_NSEC;
++	rwb->stat_ops = ops;
++	rwb->ops_data = ops_data;
++	wbt_update_limits(rwb);
++	return rwb;
++}
++
++void wbt_exit(struct rq_wb *rwb)
++{
++	if (rwb) {
++		del_timer_sync(&rwb->window_timer);
++		kfree(rwb);
++	}
++}
+--
+cgit v0.11.2
+
+From db3de07314ef350fceb90ade08474fe4eea5e665 Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@fb.com>
+Date: Thu, 8 Sep 2016 11:08:17 -0600
+Subject: writeback: throttle buffered writeback
+
+Test patch that throttles buffered writeback to make it a lot
+smoother, and has way less impact on other system activity.
+Background writeback should be, by definition, background
+activity. The fact that we flush huge bundles of it at a time
+means that it potentially has heavy impacts on foreground workloads,
+which isn't ideal. We can't easily limit the sizes of writes that
+we do, since that would impact file system layout in the presence
+of delayed allocation. So just throttle back buffered writeback,
+unless someone is waiting for it.
+
+The algorithm for when to throttle takes its inspiration from the
+CoDel networking scheduling algorithm. Like CoDel, blk-wb monitors
+the minimum latencies of requests over a window of time. In that
+window of time, if the minimum latency of any request exceeds a
+given target, then a scale count is incremented and the queue depth
+is shrunk. The next monitoring window is shrunk accordingly. Unlike
+CoDel, if we hit a window that exhibits good behavior, then we
+simply decrement the scale count and re-calculate the limits for that
+scale value. This prevents us from oscillating between a
+close-to-ideal value and max all the time, instead remaining in the
+windows where we get good behavior.
+
+Unlike CoDel, blk-wb allows the scale count to go negative. This
+happens if we primarily have writes going on. Unlike positive
+scale counts, this doesn't change the size of the monitoring window.
+When the heavy writers finish, blk-wb quickly snaps back to its
+stable state of a zero scale count.
+
+The patch registers two sysfs entries. The first one, 'wbt_window_usec',
+defines the window of monitoring. The second one, 'wbt_lat_usec',
+sets the latency target for the window. It defaults to 2 msec for
+non-rotational storage, and 75 msec for rotational storage. Setting
+this value to '0' disables blk-wb. Generally, a user would not have
+to touch these settings.
+
+We don't enable WBT on devices that are managed with CFQ, and have
+a non-root block cgroup attached. If we have a proportional share setup
+on this particular disk, then the wbt throttling will interfere with
+that. We don't have a strong need for wbt for that case, since we will
+rely on CFQ doing that for us.
+
+Signed-off-by: Jens Axboe <axboe@fb.com>
+---
+ Documentation/block/queue-sysfs.txt |  13 ++++
+ block/Kconfig                       |   1 +
+ block/blk-core.c                    |  20 +++++-
+ block/blk-mq.c                      |  30 ++++++++-
+ block/blk-settings.c                |   3 +
+ block/blk-stat.c                    |   5 +-
+ block/blk-sysfs.c                   | 125 ++++++++++++++++++++++++++++++++++++
+ block/cfq-iosched.c                 |  13 ++++
+ include/linux/blkdev.h              |   6 +-
+ 9 files changed, 207 insertions(+), 9 deletions(-)
+
+diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
+index 2a39040..2847219 100644
+--- a/Documentation/block/queue-sysfs.txt
++++ b/Documentation/block/queue-sysfs.txt
+@@ -169,5 +169,18 @@ This is the number of bytes the device can write in a single write-same
+ command. A value of '0' means write-same is not supported by this
+ device.
+ 
++wbt_lat_usec (RW)
++-----------------
++If the device is registered for writeback throttling, then this file shows
++the target minimum read latency. If this latency is exceeded in a given
++window of time (see wbt_window_usec), then the writeback throttling will start
++scaling back writes.
++
++wbt_window_usec (RW)
++--------------------
++If the device is registered for writeback throttling, then this file shows
++the value of the monitoring window in which we'll look at the target
++latency. See wbt_lat_usec.
++
+ 
+ Jens Axboe <jens.axboe@oracle.com>, February 2009
+diff --git a/block/Kconfig b/block/Kconfig
+index 161491d..6da79e6 100644
+--- a/block/Kconfig
++++ b/block/Kconfig
+@@ -4,6 +4,7 @@
+ menuconfig BLOCK
+ 	bool "Enable the block layer" if EXPERT
+ 	default y
++	select WBT
+ 	help
+ 	  Provide block layer support for the kernel.
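The following is an editorial illustration, not part of the patch: the depth and window
scaling that calc_wb_limits() and rwb_arm_timer() above implement, re-derived in a
stand-alone program. It assumes a hypothetical device queue depth of 32 and the 100 msec
default window; build with 'cc -o wbt-scale wbt-scale.c -lm'.

	/* wbt-scale.c: mirrors the scale_step math from the wbt code above.
	 * Positive steps roughly halve the allowed write depth per step and
	 * shrink the monitoring window by a factor of sqrt(scale_step + 1). */
	#include <stdio.h>
	#include <math.h>

	#define RWB_DEF_DEPTH	16
	#define RWB_WINDOW_NSEC	(100 * 1000 * 1000ULL)

	int main(void)
	{
		unsigned int queue_depth = 32;	/* assumed device queue depth */
		int step;

		for (step = 0; step <= 3; step++) {
			unsigned int depth = queue_depth < RWB_DEF_DEPTH ?
						queue_depth : RWB_DEF_DEPTH;
			unsigned long long win = RWB_WINDOW_NSEC;

			if (step > 0) {
				/* calc_wb_limits(): depth = 1 + ((depth - 1) >> step) */
				depth = 1 + ((depth - 1) >> step);
				/* rwb_arm_timer(): (win << 4) / int_sqrt((step + 1) << 8),
				 * i.e. roughly win / sqrt(step + 1) */
				win = (win << 4) /
					(unsigned long long)sqrt((double)((step + 1) << 8));
			}
			printf("step %d: wb_max %2u wb_normal %2u wb_background %2u window %5.1f msec\n",
			       step, depth, (depth + 1) / 2, (depth + 3) / 4,
			       (double)win / 1e6);
		}
		return 0;
	}

For this configuration it prints depths of 16/8/4 at step 0 shrinking to 2/1/1 at step 3,
with the window contracting from 100 msec toward 50 msec. The wbt_lat_usec and
wbt_window_usec files added by the blk-sysfs.c hunk below expose the two underlying tunables.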
+ +diff --git a/block/blk-core.c b/block/blk-core.c +index 4075cbe..4f4ce05 100644 +--- a/block/blk-core.c ++++ b/block/blk-core.c +@@ -33,6 +33,7 @@ + #include <linux/ratelimit.h> + #include <linux/pm_runtime.h> + #include <linux/blk-cgroup.h> ++#include <linux/wbt.h> + + #define CREATE_TRACE_POINTS + #include <trace/events/block.h> +@@ -882,6 +883,8 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, + + fail: + blk_free_flush_queue(q->fq); ++ wbt_exit(q->rq_wb); ++ q->rq_wb = NULL; + return NULL; + } + EXPORT_SYMBOL(blk_init_allocated_queue); +@@ -1346,6 +1349,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq) + blk_delete_timer(rq); + blk_clear_rq_complete(rq); + trace_block_rq_requeue(q, rq); ++ wbt_requeue(q->rq_wb, &rq->wb_stat); + + if (rq->cmd_flags & REQ_QUEUED) + blk_queue_end_tag(q, rq); +@@ -1436,6 +1440,8 @@ void __blk_put_request(struct request_queue *q, struct request *req) + /* this is a bio leak */ + WARN_ON(req->bio != NULL); + ++ wbt_done(q->rq_wb, &req->wb_stat); ++ + /* + * Request may not have originated from ll_rw_blk. if not, + * it didn't come out of our reserved rq pools +@@ -1667,6 +1673,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) + int el_ret, rw_flags = 0, where = ELEVATOR_INSERT_SORT; + struct request *req; + unsigned int request_count = 0; ++ unsigned int wb_acct; + + /* + * low level driver can indicate that it wants pages above a +@@ -1719,6 +1726,8 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) + } + + get_rq: ++ wb_acct = wbt_wait(q->rq_wb, bio->bi_opf, q->queue_lock); ++ + /* + * This sync check and mask will be re-done in init_request_from_bio(), + * but we need to set it earlier to expose the sync flag to the +@@ -1738,11 +1747,15 @@ get_rq: + */ + req = get_request(q, bio_data_dir(bio), rw_flags, bio, GFP_NOIO); + if (IS_ERR(req)) { ++ if (wb_acct & WBT_TRACKED) ++ __wbt_done(q->rq_wb); + bio->bi_error = PTR_ERR(req); + bio_endio(bio); + goto out_unlock; + } + ++ wbt_track(&req->wb_stat, wb_acct); ++ + /* + * After dropping the lock and possibly sleeping here, our request + * may now be mergeable after it had proven unmergeable (above). 
+@@ -2475,7 +2488,7 @@ void blk_start_request(struct request *req) + { + blk_dequeue_request(req); + +- req->issue_time = ktime_to_ns(ktime_get()); ++ wbt_issue(req->q->rq_wb, &req->wb_stat); + + /* + * We are now handing the request to the hardware, initialize +@@ -2713,9 +2726,10 @@ void blk_finish_request(struct request *req, int error) + + blk_account_io_done(req); + +- if (req->end_io) ++ if (req->end_io) { ++ wbt_done(req->q->rq_wb, &req->wb_stat); + req->end_io(req, error); +- else { ++ } else { + if (blk_bidi_rq(req)) + __blk_put_request(req->next_rq->q, req->next_rq); + +diff --git a/block/blk-mq.c b/block/blk-mq.c +index 712f141..511289a 100644 +--- a/block/blk-mq.c ++++ b/block/blk-mq.c +@@ -22,6 +22,7 @@ + #include <linux/sched/sysctl.h> + #include <linux/delay.h> + #include <linux/crash_dump.h> ++#include <linux/wbt.h> + + #include <trace/events/block.h> + +@@ -319,6 +320,8 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, + + if (rq->cmd_flags & REQ_MQ_INFLIGHT) + atomic_dec(&hctx->nr_active); ++ ++ wbt_done(q->rq_wb, &rq->wb_stat); + rq->cmd_flags = 0; + + clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); +@@ -351,6 +354,7 @@ inline void __blk_mq_end_request(struct request *rq, int error) + blk_account_io_done(rq); + + if (rq->end_io) { ++ wbt_done(rq->q->rq_wb, &rq->wb_stat); + rq->end_io(rq, error); + } else { + if (unlikely(blk_bidi_rq(rq))) +@@ -457,7 +461,7 @@ void blk_mq_start_request(struct request *rq) + if (unlikely(blk_bidi_rq(rq))) + rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq); + +- rq->issue_time = ktime_to_ns(ktime_get()); ++ wbt_issue(q->rq_wb, &rq->wb_stat); + + blk_add_timer(rq); + +@@ -494,6 +498,7 @@ static void __blk_mq_requeue_request(struct request *rq) + struct request_queue *q = rq->q; + + trace_block_rq_requeue(q, rq); ++ wbt_requeue(q->rq_wb, &rq->wb_stat); + + if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { + if (q->dma_drain_size && blk_rq_bytes(rq)) +@@ -1312,6 +1317,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) + struct blk_plug *plug; + struct request *same_queue_rq = NULL; + blk_qc_t cookie; ++ unsigned int wb_acct; + + blk_queue_bounce(q, &bio); + +@@ -1326,9 +1332,16 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) + blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) + return BLK_QC_T_NONE; + ++ wb_acct = wbt_wait(q->rq_wb, bio->bi_opf, NULL); ++ + rq = blk_mq_map_request(q, bio, &data); +- if (unlikely(!rq)) ++ if (unlikely(!rq)) { ++ if (wb_acct & WBT_TRACKED) ++ __wbt_done(q->rq_wb); + return BLK_QC_T_NONE; ++ } ++ ++ wbt_track(&rq->wb_stat, wb_acct); + + cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); + +@@ -1405,6 +1418,7 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) + struct blk_map_ctx data; + struct request *rq; + blk_qc_t cookie; ++ unsigned int wb_acct; + + blk_queue_bounce(q, &bio); + +@@ -1421,9 +1435,16 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) + } else + request_count = blk_plug_queued_count(q); + ++ wb_acct = wbt_wait(q->rq_wb, bio->bi_opf, NULL); ++ + rq = blk_mq_map_request(q, bio, &data); +- if (unlikely(!rq)) ++ if (unlikely(!rq)) { ++ if (wb_acct & WBT_TRACKED) ++ __wbt_done(q->rq_wb); + return BLK_QC_T_NONE; ++ } ++ ++ wbt_track(&rq->wb_stat, wb_acct); + + cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); + +@@ -2147,6 +2168,9 @@ void blk_mq_free_queue(struct request_queue *q) + list_del_init(&q->all_q_node); + 
mutex_unlock(&all_q_mutex); + ++ wbt_exit(q->rq_wb); ++ q->rq_wb = NULL; ++ + blk_mq_del_queue_tag_set(q); + + blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); +diff --git a/block/blk-settings.c b/block/blk-settings.c +index f7e122e..746dc9f 100644 +--- a/block/blk-settings.c ++++ b/block/blk-settings.c +@@ -840,6 +840,7 @@ EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); + void blk_set_queue_depth(struct request_queue *q, unsigned int depth) + { + q->queue_depth = depth; ++ wbt_set_queue_depth(q->rq_wb, depth); + } + EXPORT_SYMBOL(blk_set_queue_depth); + +@@ -863,6 +864,8 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) + else + queue_flag_clear(QUEUE_FLAG_FUA, q); + spin_unlock_irq(q->queue_lock); ++ ++ wbt_set_write_cache(q->rq_wb, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); + } + EXPORT_SYMBOL_GPL(blk_queue_write_cache); + +diff --git a/block/blk-stat.c b/block/blk-stat.c +index 3965e8a..bdb16d8 100644 +--- a/block/blk-stat.c ++++ b/block/blk-stat.c +@@ -178,15 +178,16 @@ bool blk_stat_is_current(struct blk_rq_stat *stat) + void blk_stat_add(struct blk_rq_stat *stat, struct request *rq) + { + s64 now, value; ++ u64 rq_time = wbt_issue_stat_get_time(&rq->wb_stat); + + now = ktime_to_ns(ktime_get()); +- if (now < rq->issue_time) ++ if (now < rq_time) + return; + + if (!__blk_stat_is_current(stat, now)) + __blk_stat_init(stat, now); + +- value = now - rq->issue_time; ++ value = now - rq_time; + if (value > stat->max) + stat->max = value; + if (value < stat->min) +diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c +index 0b9e435..85c3dc2 100644 +--- a/block/blk-sysfs.c ++++ b/block/blk-sysfs.c +@@ -10,6 +10,7 @@ + #include <linux/blktrace_api.h> + #include <linux/blk-mq.h> + #include <linux/blk-cgroup.h> ++#include <linux/wbt.h> + + #include "blk.h" + #include "blk-mq.h" +@@ -41,6 +42,19 @@ queue_var_store(unsigned long *var, const char *page, size_t count) + return count; + } + ++static ssize_t queue_var_store64(u64 *var, const char *page) ++{ ++ int err; ++ u64 v; ++ ++ err = kstrtou64(page, 10, &v); ++ if (err < 0) ++ return err; ++ ++ *var = v; ++ return 0; ++} ++ + static ssize_t queue_requests_show(struct request_queue *q, char *page) + { + return queue_var_show(q->nr_requests, (page)); +@@ -347,6 +361,58 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page, + return ret; + } + ++static ssize_t queue_wb_win_show(struct request_queue *q, char *page) ++{ ++ if (!q->rq_wb) ++ return -EINVAL; ++ ++ return sprintf(page, "%llu\n", div_u64(q->rq_wb->win_nsec, 1000)); ++} ++ ++static ssize_t queue_wb_win_store(struct request_queue *q, const char *page, ++ size_t count) ++{ ++ ssize_t ret; ++ u64 val; ++ ++ if (!q->rq_wb) ++ return -EINVAL; ++ ++ ret = queue_var_store64(&val, page); ++ if (ret < 0) ++ return ret; ++ ++ q->rq_wb->win_nsec = val * 1000ULL; ++ wbt_update_limits(q->rq_wb); ++ return count; ++} ++ ++static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) ++{ ++ if (!q->rq_wb) ++ return -EINVAL; ++ ++ return sprintf(page, "%llu\n", div_u64(q->rq_wb->min_lat_nsec, 1000)); ++} ++ ++static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, ++ size_t count) ++{ ++ ssize_t ret; ++ u64 val; ++ ++ if (!q->rq_wb) ++ return -EINVAL; ++ ++ ret = queue_var_store64(&val, page); ++ if (ret < 0) ++ return ret; ++ ++ q->rq_wb->min_lat_nsec = val * 1000ULL; ++ wbt_update_limits(q->rq_wb); ++ return count; ++} ++ + static ssize_t queue_wc_show(struct request_queue *q, char *page) + { + if (test_bit(QUEUE_FLAG_WC, 
&q->queue_flags)) +@@ -551,6 +617,18 @@ static struct queue_sysfs_entry queue_stats_entry = { + .show = queue_stats_show, + }; + ++static struct queue_sysfs_entry queue_wb_lat_entry = { ++ .attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR }, ++ .show = queue_wb_lat_show, ++ .store = queue_wb_lat_store, ++}; ++ ++static struct queue_sysfs_entry queue_wb_win_entry = { ++ .attr = {.name = "wbt_window_usec", .mode = S_IRUGO | S_IWUSR }, ++ .show = queue_wb_win_show, ++ .store = queue_wb_win_store, ++}; ++ + static struct attribute *default_attrs[] = { + &queue_requests_entry.attr, + &queue_ra_entry.attr, +@@ -579,6 +657,8 @@ static struct attribute *default_attrs[] = { + &queue_wc_entry.attr, + &queue_dax_entry.attr, + &queue_stats_entry.attr, ++ &queue_wb_lat_entry.attr, ++ &queue_wb_win_entry.attr, + NULL, + }; + +@@ -693,6 +773,49 @@ struct kobj_type blk_queue_ktype = { + .release = blk_release_queue, + }; + ++static void blk_wb_stat_get(void *data, struct blk_rq_stat *stat) ++{ ++ blk_queue_stat_get(data, stat); ++} ++ ++static void blk_wb_stat_clear(void *data) ++{ ++ blk_stat_clear(data); ++} ++ ++static bool blk_wb_stat_is_current(struct blk_rq_stat *stat) ++{ ++ return blk_stat_is_current(stat); ++} ++ ++static struct wb_stat_ops wb_stat_ops = { ++ .get = blk_wb_stat_get, ++ .is_current = blk_wb_stat_is_current, ++ .clear = blk_wb_stat_clear, ++}; ++ ++static void blk_wb_init(struct request_queue *q) ++{ ++ struct rq_wb *rwb; ++ ++ rwb = wbt_init(&q->backing_dev_info, &wb_stat_ops, q); ++ ++ /* ++ * If this fails, we don't get throttling ++ */ ++ if (IS_ERR(rwb)) ++ return; ++ ++ if (blk_queue_nonrot(q)) ++ rwb->min_lat_nsec = 2000000ULL; ++ else ++ rwb->min_lat_nsec = 75000000ULL; ++ ++ wbt_set_queue_depth(rwb, blk_queue_depth(q)); ++ wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); ++ q->rq_wb = rwb; ++} ++ + int blk_register_queue(struct gendisk *disk) + { + int ret; +@@ -732,6 +855,8 @@ int blk_register_queue(struct gendisk *disk) + if (q->mq_ops) + blk_mq_register_disk(disk); + ++ blk_wb_init(q); ++ + if (!q->request_fn) + return 0; + +diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c +index cc2f6db..fdcd5999 100644 +--- a/block/cfq-iosched.c ++++ b/block/cfq-iosched.c +@@ -3764,9 +3764,11 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) + struct cfq_data *cfqd = cic_to_cfqd(cic); + struct cfq_queue *cfqq; + uint64_t serial_nr; ++ bool nonroot_cg; + + rcu_read_lock(); + serial_nr = bio_blkcg(bio)->css.serial_nr; ++ nonroot_cg = bio_blkcg(bio) != &blkcg_root; + rcu_read_unlock(); + + /* +@@ -3777,6 +3779,17 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) + return; + + /* ++ * If we have a non-root cgroup, we can depend on that to ++ * do proper throttling of writes. Turn off wbt for that ++ * case. ++ */ ++ if (nonroot_cg) { ++ struct request_queue *q = cfqd->queue; ++ ++ wbt_disable(q->rq_wb); ++ } ++ ++ /* + * Drop reference to queues. New queues will be assigned in new + * group upon arrival of fresh requests. 
+ */ +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index 259eba8..45256d7 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -24,6 +24,7 @@ + #include <linux/rcupdate.h> + #include <linux/percpu-refcount.h> + #include <linux/scatterlist.h> ++#include <linux/wbt.h> + + struct module; + struct scsi_ioctl_command; +@@ -37,6 +38,7 @@ struct bsg_job; + struct blkcg_gq; + struct blk_flush_queue; + struct pr_ops; ++struct rq_wb; + + #define BLKDEV_MIN_RQ 4 + #define BLKDEV_MAX_RQ 128 /* Default maximum */ +@@ -151,7 +153,7 @@ struct request { + struct gendisk *rq_disk; + struct hd_struct *part; + unsigned long start_time; +- s64 issue_time; ++ struct wb_issue_stat wb_stat; + #ifdef CONFIG_BLK_CGROUP + struct request_list *rl; /* rl this rq is alloced from */ + unsigned long long start_time_ns; +@@ -303,6 +305,8 @@ struct request_queue { + int nr_rqs[2]; /* # allocated [a]sync rqs */ + int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */ + ++ struct rq_wb *rq_wb; ++ + /* + * If blkcg is not used, @q->root_rl serves all requests. If blkcg + * is used, root blkg allocates from @q->root_rl and all other +-- +cgit v0.11.2 + +From 21c990f3ab1d3324ad3152cb94f86e6e0772b73c Mon Sep 17 00:00:00 2001 +From: Jens Axboe <axboe@fb.com> +Date: Sat, 10 Sep 2016 10:06:26 -0600 +Subject: wbt: spelling check fix + +Signed-off-by: Jens Axboe <axboe@fb.com> +--- + lib/wbt.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/lib/wbt.c b/lib/wbt.c +index a995703..5c507e5 100644 +--- a/lib/wbt.c ++++ b/lib/wbt.c +@@ -1,5 +1,5 @@ + /* +- * buffered writeback throttling. losely based on CoDel. We can't drop ++ * buffered writeback throttling. loosely based on CoDel. We can't drop + * packets for IO scheduling, so the logic is something like this: + * + * - Monitor latencies in a defined window of time. +-- +cgit v0.11.2 + diff --git a/config.x86_64 b/config.x86_64 index 63c2f3f9a3af..ba7715200831 100644 --- a/config.x86_64 +++ b/config.x86_64 @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. 
-# Linux/x86 4.6.0 Kernel Configuration +# Linux/x86 4.8.11-2.1 Kernel Configuration # CONFIG_64BIT=y CONFIG_X86_64=y @@ -441,9 +441,15 @@ CONFIG_IOMMU_HELPER=y CONFIG_NR_CPUS=128 CONFIG_SCHED_SMT=y CONFIG_SCHED_MC=y +CONFIG_PREEMPT=y +CONFIG_PREEMPT_RT_BASE=y +CONFIG_HAVE_PREEMPT_LAZY=y +CONFIG_PREEMPT_LAZY=y # CONFIG_PREEMPT_NONE is not set # CONFIG_PREEMPT_VOLUNTARY is not set -CONFIG_PREEMPT=y +# CONFIG_PREEMPT__LL is not set +# CONFIG_PREEMPT_RTB is not set +CONFIG_PREEMPT_RT_FULL=y CONFIG_PREEMPT_COUNT=y CONFIG_X86_LOCAL_APIC=y CONFIG_X86_IO_APIC=y @@ -663,7 +669,7 @@ CONFIG_SFI=y # CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_GOV_COMMON=y -CONFIG_CPU_FREQ_STAT=y +CONFIG_CPU_FREQ_STAT=m CONFIG_CPU_FREQ_STAT_DETAILS=y # CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set # CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set @@ -1611,7 +1617,7 @@ CONFIG_BT_ATH3K=m CONFIG_BT_WILINK=m CONFIG_AF_RXRPC=m # CONFIG_AF_RXRPC_DEBUG is not set -CONFIG_RXKAD=y +CONFIG_RXKAD=m CONFIG_AF_KCM=m CONFIG_FIB_RULES=y CONFIG_WIRELESS=y @@ -7708,7 +7714,7 @@ CONFIG_CRYPTO_DEV_QAT_C62X=m CONFIG_CRYPTO_DEV_QAT_DH895xCCVF=m CONFIG_CRYPTO_DEV_QAT_C3XXXVF=m CONFIG_CRYPTO_DEV_QAT_C62XVF=m -CONFIG_ASYMMETRIC_KEY_TYPE=y +CONFIG_ASYMMETRIC_KEY_TYPE=m CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=m CONFIG_X509_CERTIFICATE_PARSER=m CONFIG_PKCS7_MESSAGE_PARSER=m diff --git a/init.patch b/init.patch new file mode 100644 index 000000000000..60af1eb9412a --- /dev/null +++ b/init.patch @@ -0,0 +1,38 @@ +--- a/init/do_mounts.c 2015-08-19 10:27:16.753852576 -0400 ++++ b/init/do_mounts.c 2015-08-19 10:34:25.473850353 -0400 +@@ -490,7 +490,11 @@ void __init change_floppy(char *fmt, ... + va_start(args, fmt); + vsprintf(buf, fmt, args); + va_end(args); +- fd = sys_open("/dev/root", O_RDWR | O_NDELAY, 0); ++ if (saved_root_name[0]) ++ fd = sys_open(saved_root_name, O_RDWR | O_NDELAY, 0); ++ else ++ fd = sys_open("/dev/root", O_RDWR | O_NDELAY, 0); ++ + if (fd >= 0) { + sys_ioctl(fd, FDEJECT, 0); + sys_close(fd); +@@ -534,11 +538,17 @@ void __init mount_root(void) + #endif + #ifdef CONFIG_BLOCK + { +- int err = create_dev("/dev/root", ROOT_DEV); +- +- if (err < 0) +- pr_emerg("Failed to create /dev/root: %d\n", err); +- mount_block_root("/dev/root", root_mountflags); ++ if (saved_root_name[0] == '/') { ++ int err = create_dev(saved_root_name, ROOT_DEV); ++ if (err < 0) ++ pr_emerg("Failed to create %s: %d\n", saved_root_name, err); ++ mount_block_root(saved_root_name, root_mountflags); ++ } else { ++ int err = create_dev("/dev/root", ROOT_DEV); ++ if (err < 0) ++ pr_emerg("Failed to create /dev/root: %d\n", err); ++ mount_block_root("/dev/root", root_mountflags); ++ } + } + #endif + } diff --git a/kconfig.patch b/kconfig.patch new file mode 100644 index 000000000000..d9729b23c3e6 --- /dev/null +++ b/kconfig.patch @@ -0,0 +1,426 @@ +WARNING - this version of the patch works with version 4.9+ of gcc and with +kernel version 3.15.x+ and should NOT be applied when compiling on older +versions due to name changes of the flags with the 4.9 release of gcc. +Use the older version of this patch hosted on the same github for older +versions of gcc. For example: + +corei7 --> nehalem +corei7-avx --> sandybridge +core-avx-i --> ivybridge +core-avx2 --> haswell + +For more, see: https://gcc.gnu.org/gcc-4.9/changes.html + +It also changes 'atom' to 'bonnell' in accordance with the gcc v4.9 changes. +Note that upstream is using the deprecated 'match=atom' flags when I believe it +should use the newer 'march=bonnell' flag for atom processors. 
+
+I have made that change to this patch set as well. See the following kernel
+bug report to see if I'm right: https://bugzilla.kernel.org/show_bug.cgi?id=77461
+
+This patch will expand the number of microarchitectures to include newer
+processors including: AMD K10-family, AMD Family 10h (Barcelona), AMD Family
+14h (Bobcat), AMD Family 15h (Bulldozer), AMD Family 15h (Piledriver), AMD
+Family 15h (Steamroller), Family 16h (Jaguar), Intel 1st Gen Core i3/i5/i7
+(Nehalem), Intel 1.5 Gen Core i3/i5/i7 (Westmere), Intel 2nd Gen Core i3/i5/i7
+(Sandybridge), Intel 3rd Gen Core i3/i5/i7 (Ivybridge), Intel 4th Gen Core
+i3/i5/i7 (Haswell), Intel 5th Gen Core i3/i5/i7 (Broadwell), and the low power
+Silvermont series of Atom processors (Silvermont). It also offers the compiler
+the 'native' flag.
+
+Small but real speed increases are measurable using a make endpoint comparing
+a generic kernel to one built with one of the respective microarchs.
+
+See the following experimental evidence supporting this statement:
+https://github.com/graysky2/kernel_gcc_patch
+
+REQUIREMENTS
+linux version >=3.15
+gcc version >=4.9
+
+--- a/arch/x86/include/asm/module.h	2015-08-30 14:34:09.000000000 -0400
++++ b/arch/x86/include/asm/module.h	2015-11-06 14:18:24.234941036 -0500
+@@ -15,6 +15,24 @@
+ #define MODULE_PROC_FAMILY "586MMX "
+ #elif defined CONFIG_MCORE2
+ #define MODULE_PROC_FAMILY "CORE2 "
++#elif defined CONFIG_MNATIVE
++#define MODULE_PROC_FAMILY "NATIVE "
++#elif defined CONFIG_MNEHALEM
++#define MODULE_PROC_FAMILY "NEHALEM "
++#elif defined CONFIG_MWESTMERE
++#define MODULE_PROC_FAMILY "WESTMERE "
++#elif defined CONFIG_MSILVERMONT
++#define MODULE_PROC_FAMILY "SILVERMONT "
++#elif defined CONFIG_MSANDYBRIDGE
++#define MODULE_PROC_FAMILY "SANDYBRIDGE "
++#elif defined CONFIG_MIVYBRIDGE
++#define MODULE_PROC_FAMILY "IVYBRIDGE "
++#elif defined CONFIG_MHASWELL
++#define MODULE_PROC_FAMILY "HASWELL "
++#elif defined CONFIG_MBROADWELL
++#define MODULE_PROC_FAMILY "BROADWELL "
++#elif defined CONFIG_MSKYLAKE
++#define MODULE_PROC_FAMILY "SKYLAKE "
+ #elif defined CONFIG_MATOM
+ #define MODULE_PROC_FAMILY "ATOM "
+ #elif defined CONFIG_M686
+@@ -33,6 +51,22 @@
+ #define MODULE_PROC_FAMILY "K7 "
+ #elif defined CONFIG_MK8
+ #define MODULE_PROC_FAMILY "K8 "
++#elif defined CONFIG_MK8SSE3
++#define MODULE_PROC_FAMILY "K8SSE3 "
++#elif defined CONFIG_MK10
++#define MODULE_PROC_FAMILY "K10 "
++#elif defined CONFIG_MBARCELONA
++#define MODULE_PROC_FAMILY "BARCELONA "
++#elif defined CONFIG_MBOBCAT
++#define MODULE_PROC_FAMILY "BOBCAT "
++#elif defined CONFIG_MBULLDOZER
++#define MODULE_PROC_FAMILY "BULLDOZER "
++#elif defined CONFIG_MPILEDRIVER
++#define MODULE_PROC_FAMILY "PILEDRIVER "
++#elif defined CONFIG_MSTEAMROLLER
++#define MODULE_PROC_FAMILY "STEAMROLLER "
++#elif defined CONFIG_MJAGUAR
++#define MODULE_PROC_FAMILY "JAGUAR "
+ #elif defined CONFIG_MELAN
+ #define MODULE_PROC_FAMILY "ELAN "
+ #elif defined CONFIG_MCRUSOE
+--- a/arch/x86/Kconfig.cpu	2015-08-30 14:34:09.000000000 -0400
++++ b/arch/x86/Kconfig.cpu	2015-11-06 14:20:14.948369244 -0500
+@@ -137,9 +137,8 @@ config MPENTIUM4
+ 	  -Paxville
+ 	  -Dempsey
+ 
+-
+ config MK6
+-	bool "K6/K6-II/K6-III"
++	bool "AMD K6/K6-II/K6-III"
+ 	depends on X86_32
+ 	---help---
+ 	  Select this for an AMD K6-family processor. Enables use of
+@@ -147,7 +146,7 @@ config MK6
+ 	  flags to GCC.
+ 
+ config MK7
+-	bool "Athlon/Duron/K7"
++	bool "AMD Athlon/Duron/K7"
+ 	depends on X86_32
+ 	---help---
+ 	  Select this for an AMD Athlon K7-family processor.
Enables use of +@@ -155,12 +154,69 @@ config MK7 + flags to GCC. + + config MK8 +- bool "Opteron/Athlon64/Hammer/K8" ++ bool "AMD Opteron/Athlon64/Hammer/K8" + ---help--- + Select this for an AMD Opteron or Athlon64 Hammer-family processor. + Enables use of some extended instructions, and passes appropriate + optimization flags to GCC. + ++config MK8SSE3 ++ bool "AMD Opteron/Athlon64/Hammer/K8 with SSE3" ++ ---help--- ++ Select this for improved AMD Opteron or Athlon64 Hammer-family processors. ++ Enables use of some extended instructions, and passes appropriate ++ optimization flags to GCC. ++ ++config MK10 ++ bool "AMD 61xx/7x50/PhenomX3/X4/II/K10" ++ ---help--- ++ Select this for an AMD 61xx Eight-Core Magny-Cours, Athlon X2 7x50, ++ Phenom X3/X4/II, Athlon II X2/X3/X4, or Turion II-family processor. ++ Enables use of some extended instructions, and passes appropriate ++ optimization flags to GCC. ++ ++config MBARCELONA ++ bool "AMD Barcelona" ++ ---help--- ++ Select this for AMD Barcelona and newer processors. ++ ++ Enables -march=barcelona ++ ++config MBOBCAT ++ bool "AMD Bobcat" ++ ---help--- ++ Select this for AMD Bobcat processors. ++ ++ Enables -march=btver1 ++ ++config MBULLDOZER ++ bool "AMD Bulldozer" ++ ---help--- ++ Select this for AMD Bulldozer processors. ++ ++ Enables -march=bdver1 ++ ++config MPILEDRIVER ++ bool "AMD Piledriver" ++ ---help--- ++ Select this for AMD Piledriver processors. ++ ++ Enables -march=bdver2 ++ ++config MSTEAMROLLER ++ bool "AMD Steamroller" ++ ---help--- ++ Select this for AMD Steamroller processors. ++ ++ Enables -march=bdver3 ++ ++config MJAGUAR ++ bool "AMD Jaguar" ++ ---help--- ++ Select this for AMD Jaguar processors. ++ ++ Enables -march=btver2 ++ + config MCRUSOE + bool "Crusoe" + depends on X86_32 +@@ -251,8 +307,17 @@ config MPSC + using the cpu family field + in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one. + ++config MATOM ++ bool "Intel Atom" ++ ---help--- ++ ++ Select this for the Intel Atom platform. Intel Atom CPUs have an ++ in-order pipelining architecture and thus can benefit from ++ accordingly optimized code. Use a recent GCC with specific Atom ++ support in order to fully benefit from selecting this option. ++ + config MCORE2 +- bool "Core 2/newer Xeon" ++ bool "Intel Core 2" + ---help--- + + Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and +@@ -260,14 +325,71 @@ config MCORE2 + family in /proc/cpuinfo. Newer ones have 6 and older ones 15 + (not a typo) + +-config MATOM +- bool "Intel Atom" ++ Enables -march=core2 ++ ++config MNEHALEM ++ bool "Intel Nehalem" + ---help--- + +- Select this for the Intel Atom platform. Intel Atom CPUs have an +- in-order pipelining architecture and thus can benefit from +- accordingly optimized code. Use a recent GCC with specific Atom +- support in order to fully benefit from selecting this option. ++ Select this for 1st Gen Core processors in the Nehalem family. ++ ++ Enables -march=nehalem ++ ++config MWESTMERE ++ bool "Intel Westmere" ++ ---help--- ++ ++ Select this for the Intel Westmere formerly Nehalem-C family. ++ ++ Enables -march=westmere ++ ++config MSILVERMONT ++ bool "Intel Silvermont" ++ ---help--- ++ ++ Select this for the Intel Silvermont platform. ++ ++ Enables -march=silvermont ++ ++config MSANDYBRIDGE ++ bool "Intel Sandy Bridge" ++ ---help--- ++ ++ Select this for 2nd Gen Core processors in the Sandy Bridge family. 
++ ++ Enables -march=sandybridge ++ ++config MIVYBRIDGE ++ bool "Intel Ivy Bridge" ++ ---help--- ++ ++ Select this for 3rd Gen Core processors in the Ivy Bridge family. ++ ++ Enables -march=ivybridge ++ ++config MHASWELL ++ bool "Intel Haswell" ++ ---help--- ++ ++ Select this for 4th Gen Core processors in the Haswell family. ++ ++ Enables -march=haswell ++ ++config MBROADWELL ++ bool "Intel Broadwell" ++ ---help--- ++ ++ Select this for 5th Gen Core processors in the Broadwell family. ++ ++ Enables -march=broadwell ++ ++config MSKYLAKE ++ bool "Intel Skylake" ++ ---help--- ++ ++ Select this for 6th Gen Core processors in the Skylake family. ++ ++ Enables -march=skylake + + config GENERIC_CPU + bool "Generic-x86-64" +@@ -276,6 +398,19 @@ config GENERIC_CPU + Generic x86-64 CPU. + Run equally well on all x86-64 CPUs. + ++config MNATIVE ++ bool "Native optimizations autodetected by GCC" ++ ---help--- ++ ++ GCC 4.2 and above support -march=native, which automatically detects ++ the optimum settings to use based on your processor. -march=native ++ also detects and applies additional settings beyond -march specific ++ to your CPU, (eg. -msse4). Unless you have a specific reason not to ++ (e.g. distcc cross-compiling), you should probably be using ++ -march=native rather than anything listed below. ++ ++ Enables -march=native ++ + endchoice + + config X86_GENERIC +@@ -300,7 +435,7 @@ config X86_INTERNODE_CACHE_SHIFT + config X86_L1_CACHE_SHIFT + int + default "7" if MPENTIUM4 || MPSC +- default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU ++ default "6" if MK7 || MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MJAGUAR || MPENTIUMM || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MNATIVE || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU + default "4" if MELAN || M486 || MGEODEGX1 + default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX + +@@ -331,11 +466,11 @@ config X86_ALIGNMENT_16 + + config X86_INTEL_USERCOPY + def_bool y +- depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 ++ depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK8SSE3 || MK7 || MEFFICEON || MCORE2 || MK10 || MBARCELONA || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MNATIVE + + config X86_USE_PPRO_CHECKSUM + def_bool y +- depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM ++ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MK10 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MK8SSE3 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MATOM || MNATIVE + + config X86_USE_3DNOW + def_bool y +@@ -359,17 +494,17 @@ config X86_P6_NOP + + config X86_TSC + def_bool y +- depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 
|| MGEODE_LX || MCORE2 || MATOM) || X86_64 ++ depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MK8SSE3 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MNATIVE || MATOM) || X86_64 + + config X86_CMPXCHG64 + def_bool y +- depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM ++ depends on X86_PAE || X86_64 || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM || MNATIVE + + # this should be set for all -march=.. options where the compiler + # generates cmov. + config X86_CMOV + def_bool y +- depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX) ++ depends on (MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MJAGUAR || MK7 || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MNATIVE || MATOM || MGEODE_LX) + + config X86_MINIMUM_CPU_FAMILY + int +--- a/arch/x86/Makefile 2015-08-30 14:34:09.000000000 -0400 ++++ b/arch/x86/Makefile 2015-11-06 14:21:05.708983344 -0500 +@@ -94,13 +94,38 @@ else + KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup) + + # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) ++ cflags-$(CONFIG_MNATIVE) += $(call cc-option,-march=native) + cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) ++ cflags-$(CONFIG_MK8SSE3) += $(call cc-option,-march=k8-sse3,-mtune=k8) ++ cflags-$(CONFIG_MK10) += $(call cc-option,-march=amdfam10) ++ cflags-$(CONFIG_MBARCELONA) += $(call cc-option,-march=barcelona) ++ cflags-$(CONFIG_MBOBCAT) += $(call cc-option,-march=btver1) ++ cflags-$(CONFIG_MBULLDOZER) += $(call cc-option,-march=bdver1) ++ cflags-$(CONFIG_MPILEDRIVER) += $(call cc-option,-march=bdver2) ++ cflags-$(CONFIG_MSTEAMROLLER) += $(call cc-option,-march=bdver3) ++ cflags-$(CONFIG_MJAGUAR) += $(call cc-option,-march=btver2) + cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) + + cflags-$(CONFIG_MCORE2) += \ +- $(call cc-option,-march=core2,$(call cc-option,-mtune=generic)) +- cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom) \ +- $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) ++ $(call cc-option,-march=core2,$(call cc-option,-mtune=core2)) ++ cflags-$(CONFIG_MNEHALEM) += \ ++ $(call cc-option,-march=nehalem,$(call cc-option,-mtune=nehalem)) ++ cflags-$(CONFIG_MWESTMERE) += \ ++ $(call cc-option,-march=westmere,$(call cc-option,-mtune=westmere)) ++ cflags-$(CONFIG_MSILVERMONT) += \ ++ $(call cc-option,-march=silvermont,$(call cc-option,-mtune=silvermont)) ++ cflags-$(CONFIG_MSANDYBRIDGE) += \ ++ $(call cc-option,-march=sandybridge,$(call cc-option,-mtune=sandybridge)) ++ cflags-$(CONFIG_MIVYBRIDGE) += \ ++ $(call cc-option,-march=ivybridge,$(call cc-option,-mtune=ivybridge)) ++ cflags-$(CONFIG_MHASWELL) += \ ++ $(call cc-option,-march=haswell,$(call cc-option,-mtune=haswell)) ++ cflags-$(CONFIG_MBROADWELL) += \ ++ $(call cc-option,-march=broadwell,$(call 
cc-option,-mtune=broadwell)) ++ cflags-$(CONFIG_MSKYLAKE) += \ ++ $(call cc-option,-march=skylake,$(call cc-option,-mtune=skylake)) ++ cflags-$(CONFIG_MATOM) += $(call cc-option,-march=bonnell) \ ++ $(call cc-option,-mtune=bonnell,$(call cc-option,-mtune=generic)) + cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic) + KBUILD_CFLAGS += $(cflags-y) + +--- a/arch/x86/Makefile_32.cpu 2015-08-30 14:34:09.000000000 -0400 ++++ b/arch/x86/Makefile_32.cpu 2015-11-06 14:21:43.604429077 -0500 +@@ -23,7 +23,16 @@ cflags-$(CONFIG_MK6) += -march=k6 + # Please note, that patches that add -march=athlon-xp and friends are pointless. + # They make zero difference whatsosever to performance at this time. + cflags-$(CONFIG_MK7) += -march=athlon ++cflags-$(CONFIG_MNATIVE) += $(call cc-option,-march=native) + cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon) ++cflags-$(CONFIG_MK8SSE3) += $(call cc-option,-march=k8-sse3,-march=athlon) ++cflags-$(CONFIG_MK10) += $(call cc-option,-march=amdfam10,-march=athlon) ++cflags-$(CONFIG_MBARCELONA) += $(call cc-option,-march=barcelona,-march=athlon) ++cflags-$(CONFIG_MBOBCAT) += $(call cc-option,-march=btver1,-march=athlon) ++cflags-$(CONFIG_MBULLDOZER) += $(call cc-option,-march=bdver1,-march=athlon) ++cflags-$(CONFIG_MPILEDRIVER) += $(call cc-option,-march=bdver2,-march=athlon) ++cflags-$(CONFIG_MSTEAMROLLER) += $(call cc-option,-march=bdver3,-march=athlon) ++cflags-$(CONFIG_MJAGUAR) += $(call cc-option,-march=btver2,-march=athlon) + cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 + cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 + cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) +@@ -32,8 +41,16 @@ cflags-$(CONFIG_MCYRIXIII) += $(call cc- + cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) + cflags-$(CONFIG_MVIAC7) += -march=i686 + cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2) +-cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march=core2,-march=i686)) \ +- $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) ++cflags-$(CONFIG_MNEHALEM) += -march=i686 $(call tune,nehalem) ++cflags-$(CONFIG_MWESTMERE) += -march=i686 $(call tune,westmere) ++cflags-$(CONFIG_MSILVERMONT) += -march=i686 $(call tune,silvermont) ++cflags-$(CONFIG_MSANDYBRIDGE) += -march=i686 $(call tune,sandybridge) ++cflags-$(CONFIG_MIVYBRIDGE) += -march=i686 $(call tune,ivybridge) ++cflags-$(CONFIG_MHASWELL) += -march=i686 $(call tune,haswell) ++cflags-$(CONFIG_MBROADWELL) += -march=i686 $(call tune,broadwell) ++cflags-$(CONFIG_MSKYLAKE) += -march=i686 $(call tune,skylake) ++cflags-$(CONFIG_MATOM) += $(call cc-option,-march=bonnell,$(call cc-option,-march=core2,-march=i686)) \ ++ $(call cc-option,-mtune=bonnell,$(call cc-option,-mtune=generic)) + + # AMD Elan support + cflags-$(CONFIG_MELAN) += -march=i486 diff --git a/xattr.patch b/xattr.patch new file mode 100644 index 000000000000..bacd0322989b --- /dev/null +++ b/xattr.patch @@ -0,0 +1,69 @@ +From: Anthony G. Basile <blueness@gentoo.org> + +This patch adds support for a restricted user-controlled namespace on +tmpfs filesystem used to house PaX flags. The namespace must be of the +form user.pax.* and its value cannot exceed a size of 8 bytes. 
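To make the described semantics concrete, here is a minimal userspace sketch
(editorial, not part of the patch). The path is hypothetical and assumes /tmp is a
tmpfs mount on a kernel carrying this change; setxattr(2) is the standard libc wrapper.

	/* pax-xattr.c: exercises the restricted user.pax.* namespace on tmpfs. */
	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/xattr.h>
	#include <unistd.h>

	int main(void)
	{
		const char *path = "/tmp/pax-test";	/* hypothetical tmpfs file */
		int fd = open(path, O_CREAT | O_WRONLY, 0644);

		if (fd >= 0)
			close(fd);

		/* Accepted: user.pax.flags with a value of 8 bytes or less */
		if (setxattr(path, "user.pax.flags", "em", 2, 0) != 0)
			perror("user.pax.flags");

		/* Rejected with EOPNOTSUPP: any other user.* attribute */
		if (setxattr(path, "user.comment", "x", 1, 0) != 0)
			perror("user.comment");

		/* Rejected with EINVAL: a value longer than 8 bytes */
		if (setxattr(path, "user.pax.flags", "123456789", 9, 0) != 0)
			perror("oversized value");

		return 0;
	}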
+
+This is needed on all Gentoo systems, even non-hardened ones, so that
+XATTR_PAX flags are preserved for users who might build packages using
+portage on a tmpfs system with a non-hardened kernel and then switch to a
+hardened kernel with XATTR_PAX enabled.
+
+The namespace is added to any user with Extended Attribute support
+enabled for tmpfs. Users who do not enable xattrs will not have
+the XATTR_PAX flags preserved.
+
+diff --git a/include/uapi/linux/xattr.h b/include/uapi/linux/xattr.h
+index 1590c49..5eab462 100644
+--- a/include/uapi/linux/xattr.h
++++ b/include/uapi/linux/xattr.h
+@@ -73,5 +73,9 @@
+ #define XATTR_POSIX_ACL_DEFAULT  "posix_acl_default"
+ #define XATTR_NAME_POSIX_ACL_DEFAULT XATTR_SYSTEM_PREFIX XATTR_POSIX_ACL_DEFAULT
+ 
++/* User namespace */
++#define XATTR_PAX_PREFIX XATTR_USER_PREFIX "pax."
++#define XATTR_PAX_FLAGS_SUFFIX "flags"
++#define XATTR_NAME_PAX_FLAGS XATTR_PAX_PREFIX XATTR_PAX_FLAGS_SUFFIX
+ 
+ #endif /* _UAPI_LINUX_XATTR_H */
+diff --git a/mm/shmem.c b/mm/shmem.c
+index 440e2a7..c377172 100644
+--- a/mm/shmem.c
++++ b/mm/shmem.c
+@@ -2667,6 +2667,14 @@ static int shmem_xattr_handler_set(const struct xattr_handler *handler,
+ 	struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
+ 
+ 	name = xattr_full_name(handler, name);
++
++	if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) {
++		if (strcmp(name, XATTR_NAME_PAX_FLAGS))
++			return -EOPNOTSUPP;
++		if (size > 8)
++			return -EINVAL;
++	}
++
+ 	return simple_xattr_set(&info->xattrs, name, value, size, flags);
+ }
+ 
+@@ -2682,6 +2690,12 @@ static const struct xattr_handler shmem_trusted_xattr_handler = {
+ 	.set = shmem_xattr_handler_set,
+ };
+ 
++static const struct xattr_handler shmem_user_xattr_handler = {
++	.prefix = XATTR_USER_PREFIX,
++	.get = shmem_xattr_handler_get,
++	.set = shmem_xattr_handler_set,
++};
++
+ static const struct xattr_handler *shmem_xattr_handlers[] = {
+ #ifdef CONFIG_TMPFS_POSIX_ACL
+ 	&posix_acl_access_xattr_handler,
+@@ -2689,6 +2703,7 @@ static const struct xattr_handler *shmem_xattr_handlers[] = {
+ #endif
+ 	&shmem_security_xattr_handler,
+ 	&shmem_trusted_xattr_handler,
++	&shmem_user_xattr_handler,
+ 	NULL
+ };
+ 
diff --git a/xfs.patch b/xfs.patch
new file mode 100644
index 000000000000..1020cdcb1e53
--- /dev/null
+++ b/xfs.patch
@@ -0,0 +1,137 @@
+From: Dave Chinner <david@fromorbit.com>
+To: linux-xfs@vger.kernel.org
+Cc: xfs@oss.sgi.com
+Subject: [PATCH] xfs: quiesce the filesystem after recovery on readonly mount
+Date: Fri, 23 Sep 2016 10:11:40 +1000
+
+From: Dave Chinner <dchinner@redhat.com>
+
+Recently we've had a number of reports where log recovery on a v5
+filesystem has reported corruptions that looked to be caused by
+recovery being re-run over the top of already-recovered
+metadata. This has uncovered a bug in recovery (fixed elsewhere)
+but the vector that caused this was largely unknown.
+
+A kdump test started tripping over this problem - the system
+would be crashed, the kdump kernel and environment would boot and
+dump the kernel core image, and then the system would reboot. After
+reboot, the root filesystem was triggering log recovery and
+corruptions were being detected. The metadumps indicated the above
+log recovery issue.
+
+What is happening is that the kdump kernel and environment is
+mounting the root device read-only to find the binaries needed to do
+its work. The result of this is that it is running log recovery.
+However, because there were unlinked files and EFIs to be processed
+by recovery, the completion of phase 1 of log recovery could not
+mark the log clean. And because it's a read-only mount, the unmount
+process does not write records to the log to mark it clean, either.
+Hence on the next mount of the filesystem, log recovery was run
+again across all the metadata that had already been recovered and
+this is what triggered corruption warnings.
+
+To avoid this problem, we need to ensure that a read-only mount
+always updates the log when it completes the second phase of
+recovery. We already handle this sort of issue with rw->ro remount
+transitions, so the solution is as simple as quiescing the
+filesystem at the appropriate time during the mount process. This
+results in the log being marked clean so the mount behaviour
+recorded in the logs on repeated RO mounts will change (i.e. log
+recovery will no longer be run on every mount until a RW mount is
+done). This is a user visible change in behaviour, but it is
+harmless.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+---
+ fs/xfs/xfs_mount.c | 14 ++++++++++++++
+ fs/xfs/xfs_super.c |  2 +-
+ fs/xfs/xfs_super.h |  1 +
+ 3 files changed, 16 insertions(+), 1 deletion(-)
+
+diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
+index faeead6..56e85a6 100644
+--- a/fs/xfs/xfs_mount.c
++++ b/fs/xfs/xfs_mount.c
+@@ -934,6 +934,20 @@ xfs_mountfs(
+ 	}
+ 
+ 	/*
++	 * Now the log is fully replayed, we can transition to full read-only
++	 * mode for read-only mounts. This will sync all the metadata and clean
++	 * the log so that the recovery we just performed does not have to be
++	 * replayed again on the next mount.
++	 *
++	 * We use the same quiesce mechanism as the rw->ro remount, as they are
++	 * semantically identical operations.
++	 */
++	if ((mp->m_flags & (XFS_MOUNT_RDONLY|XFS_MOUNT_NORECOVERY)) ==
++			XFS_MOUNT_RDONLY) {
++		xfs_quiesce_attr(mp);
++	}
++
++	/*
+ 	 * Complete the quota initialisation, post-log-replay component.
+ 	 */
+ 	if (quotamount) {
+diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
+index 3409753..2d092f9 100644
+--- a/fs/xfs/xfs_super.c
++++ b/fs/xfs/xfs_super.c
+@@ -1137,7 +1137,7 @@ xfs_restore_resvblks(struct xfs_mount *mp)
+  * Note: xfs_log_quiesce() stops background log work - the callers must ensure
+  * it is started again when appropriate.
+  */
+-static void
++void
+ xfs_quiesce_attr(
+ 	struct xfs_mount *mp)
+ {
+diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
+index 529bce9..b6418ab 100644
+--- a/fs/xfs/xfs_super.h
++++ b/fs/xfs/xfs_super.h
+@@ -61,6 +61,7 @@ struct xfs_mount;
+ struct xfs_buftarg;
+ struct block_device;
+ 
++extern void xfs_quiesce_attr(struct xfs_mount *mp);
+ extern void xfs_flush_inodes(struct xfs_mount *mp);
+ extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
+ extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *,
+From: Eryu Guan <eguan@redhat.com>
+To: linux-xfs@vger.kernel.org
+Cc: xfs@oss.sgi.com, Eryu Guan <eguan@redhat.com>
+Subject: [PATCH v2] xfs: undo block reservation correctly in xfs_trans_reserve()
+Date: Tue, 6 Sep 2016 20:14:40 +0800
+
+"blocks" should be added back to fdblocks at undo time, not taken
+away, i.e. the minus sign should not be used.
+
+This is a regression introduced by commit 0d485ada404b ("xfs: use
+generic percpu counters for free block counter"). And it's found by
+code inspection; I didn't hit it in the real world, so there's no
+reproducer.
+
+Signed-off-by: Eryu Guan <eguan@redhat.com>
+---
+v2:
+- Remove "Fixes:" tag and describe relevant commit in commit log
+- Update commit log to mention that it's found by code inspection
+- Remove outer () from the "int64_t" cast
+
+ fs/xfs/xfs_trans.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
+index 5f3d33d..836eb80 100644
+--- a/fs/xfs/xfs_trans.c
++++ b/fs/xfs/xfs_trans.c
+@@ -217,7 +217,7 @@ undo_log:
+ 
+ undo_blocks:
+ 	if (blocks > 0) {
+-		xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd);
++		xfs_mod_fdblocks(tp->t_mountp, (int64_t)blocks, rsvd);
+ 		tp->t_blk_res = 0;
+ 	}
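A worked illustration of the sign error fixed above, with hypothetical numbers:

	/* Suppose xfs_trans_reserve() took blocks = 100 out of the free-block
	 * counter (fdblocks -= 100) and a later part of the reservation failed,
	 * sending us to the undo_blocks path. The undo must give the blocks back:
	 *
	 *   fixed:   xfs_mod_fdblocks(tp->t_mountp, (int64_t)blocks, rsvd);
	 *            fdblocks += 100, counter restored
	 *   broken:  xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd);
	 *            fdblocks -= 100 again, counter now 200 too low
	 *
	 * So with the minus sign, every failed reservation silently leaked
	 * twice the reserved block count from the free-space accounting. */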