Blob Blame History Raw
From: Dmitry Monakhov <dmonakhov@xxxxxxxx>
Date: Thu, 9 Oct 2014 15:14:47 +0400
Subject: [PATCH] ext4: fix race between write and fcntl(F_SETFL)

O_DIRECT flags can be toggeled via fcntl(F_SETFL).
But this value checked twice inside ext4_file_write_iter() and __generic_file_write()
which result in BUG_ON (see typical stack trace below)
In order to fix this we have to use our own copy of __generic_file_write and
pass o_direct status explicitly.

TESTCASE: xfstest:generic/326  (http://patchwork.ozlabs.org/patch/397949/)

kernel BUG at fs/ext4/inode.c:2960!
invalid opcode: 0000 [#1] SMP
Modules linked in: brd iTCO_wdt lpc_ich mfd_core igb ptp dm_mirror dm_region_hash dm_log dm_mod
CPU: 6 PID: 5505 Comm: aio-dio-fcntl-r Not tainted 3.17.0-rc2-00176-gff5c017 #161
Hardware name: Intel Corporation W2600CR/W2600CR, BIOS SE5C600.86B.99.99.x028.061320111235 06/13/2011
task: ffff88080e95a7c0 ti: ffff88080f908000 task.ti: ffff88080f908000
RIP: 0010:[<ffffffff811fabf2>]  [<ffffffff811fabf2>] ext4_direct_IO+0x162/0x3d0
RSP: 0018:ffff88080f90bb58  EFLAGS: 00010246
RAX: 0000000000000400 RBX: ffff88080fdb2a28 RCX: 00000000a802c818
RDX: 0000040000080000 RSI: ffff88080d8aeb80 RDI: 0000000000000001
RBP: ffff88080f90bbc8 R08: 0000000000000000 R09: 0000000000001581
R10: 0000000000000000 R11: 0000000000000000 R12: ffff88080d8aeb80
R13: ffff88080f90bbf8 R14: ffff88080fdb28c8 R15: ffff88080fdb2a28
FS:  00007f23b2055700(0000) GS:ffff880818400000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f23b2045000 CR3: 000000080cedf000 CR4: 00000000000407e0
Stack:
 ffff88080f90bb98 0000000000000000 7ffffffffffffffe ffff88080fdb2c30
 0000000000000200 0000000000000200 0000000000000001 0000000000000200
 ffff88080f90bbc8 ffff88080fdb2c30 ffff88080f90be08 0000000000000200
Call Trace:
 [<ffffffff8112ca9d>] generic_file_direct_write+0xed/0x180
 [<ffffffff8112f2b2>] __generic_file_write_iter+0x222/0x370
 [<ffffffff811f495b>] ext4_file_write_iter+0x34b/0x400
 [<ffffffff811bd709>] ? aio_run_iocb+0x239/0x410
 [<ffffffff811bd709>] ? aio_run_iocb+0x239/0x410
 [<ffffffff810990e5>] ? local_clock+0x25/0x30
 [<ffffffff810abd94>] ? __lock_acquire+0x274/0x700
 [<ffffffff811f4610>] ? ext4_unwritten_wait+0xb0/0xb0
 [<ffffffff811bd756>] aio_run_iocb+0x286/0x410
 [<ffffffff810990e5>] ? local_clock+0x25/0x30
 [<ffffffff810ac359>] ? lock_release_holdtime+0x29/0x190
 [<ffffffff811bc05b>] ? lookup_ioctx+0x4b/0xf0
 [<ffffffff811bde3b>] do_io_submit+0x55b/0x740
 [<ffffffff811bdcaa>] ? do_io_submit+0x3ca/0x740
 [<ffffffff811be030>] SyS_io_submit+0x10/0x20
 [<ffffffff815ce192>] system_call_fastpath+0x16/0x1b
Code: 01 48 8b 80 f0 01 00 00 48 8b 18 49 8b 45 10 0f 85 f1 01 00 00 48 03 45 c8 48 3b 43 48 0f 8f e3 01 00 00 49 83 7c 24 18 00 75 04 <0f> 0b eb fe f0 ff 83 ec 01 00 00 49 8b 44 24 18 8b 00 85 c0 89
RIP  [<ffffffff811fabf2>] ext4_direct_IO+0x162/0x3d0
 RSP <ffff88080f90bb58>

Upstream-status: Submitted but likely not accepted
Bugzilla: 1152608

Reported-by: Sasha Levin <sasha.levin@xxxxxxxxxx>
Signed-off-by: Dmitry Monakhov <dmonakhov@xxxxxxxxxx>
---
 fs/ext4/file.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 95 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index aca7b24a4432..8477eb259809 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -88,6 +88,100 @@ ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
 	return 0;
 }
 
+/**
+ * copy of __generic_file_write_iter with explicit O_DIRECT status
+ * @iocb:	IO state structure (file, offset, etc.)
+ * @from:	iov_iter with data to write
+ * @direct:	perform O_DIRECT IO
+ */
+static ssize_t
+__ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from, int direct)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct inode	*inode = mapping->host;
+	loff_t		pos = iocb->ki_pos;
+	ssize_t		written = 0;
+	ssize_t		err;
+	ssize_t		status;
+	size_t		count = iov_iter_count(from);
+
+	/* We can write back this queue in page reclaim */
+	current->backing_dev_info = mapping->backing_dev_info;
+	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
+	if (err)
+		goto out;
+
+	if (count == 0)
+		goto out;
+
+	iov_iter_truncate(from, count);
+
+	err = file_remove_suid(file);
+	if (err)
+		goto out;
+
+	err = file_update_time(file);
+	if (err)
+		goto out;
+
+	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
+	if (unlikely(direct)) {
+		loff_t endbyte;
+
+		written = generic_file_direct_write(iocb, from, pos);
+		if (written < 0 || written == count)
+			goto out;
+
+		/*
+		 * direct-io write to a hole: fall through to buffered I/O
+		 * for completing the rest of the request.
+		 */
+		pos += written;
+		count -= written;
+
+		status = generic_perform_write(file, from, pos);
+		/*
+		 * If generic_perform_write() returned a synchronous error
+		 * then we want to return the number of bytes which were
+		 * direct-written, or the error code if that was zero.  Note
+		 * that this differs from normal direct-io semantics, which
+		 * will return -EFOO even if some bytes were written.
+		 */
+		if (unlikely(status < 0)) {
+			err = status;
+			goto out;
+		}
+		iocb->ki_pos = pos + status;
+		/*
+		 * We need to ensure that the page cache pages are written to
+		 * disk and invalidated to preserve the expected O_DIRECT
+		 * semantics.
+		 */
+		endbyte = pos + status - 1;
+		err = filemap_write_and_wait_range(file->f_mapping, pos,
+						   endbyte);
+		if (err == 0) {
+			written += status;
+			invalidate_mapping_pages(mapping,
+						 pos >> PAGE_CACHE_SHIFT,
+						 endbyte >> PAGE_CACHE_SHIFT);
+		} else {
+			/*
+			 * We don't know how much we wrote, so just return
+			 * the number of bytes which were direct-written
+			 */
+		}
+	} else {
+		written = generic_perform_write(file, from, pos);
+		if (likely(written >= 0))
+			iocb->ki_pos = pos + written;
+	}
+out:
+	current->backing_dev_info = NULL;
+	return written ? written : err;
+}
+
 static ssize_t
 ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
@@ -172,7 +266,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		}
 	}
 
-	ret = __generic_file_write_iter(iocb, from);
+	ret = __ext4_file_write_iter(iocb, from, o_direct);
 	mutex_unlock(&inode->i_mutex);
 
 	if (ret > 0) {
-- 
1.9.3