From ea3d7209ca01da209cda6f0dea8be9cc4b7a933b Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.com>
Date: Mon, 7 Dec 2015 14:28:03 -0500
Subject: [PATCH 1/4] ext4: fix races between page faults and hole punching

Currently, page faults and hole punching are completely unsynchronized.
This can result in page fault faulting in a page into a range that we
are punching after truncate_pagecache_range() has been called and thus
we can end up with a page mapped to disk blocks that will be shortly
freed. Filesystem corruption will shortly follow. Note that the same
race is avoided for truncate by checking page fault offset against
i_size but there isn't similar mechanism available for punching holes.

Fix the problem by creating new rw semaphore i_mmap_sem in inode and
grab it for writing over truncate, hole punching, and other functions
removing blocks from extent tree and for read over page faults. We
cannot easily use i_data_sem for this since that ranks below transaction
start and we need something ranking above it so that it can be held over
the whole truncate / hole punching operation. Also remove various
workarounds we had in the code to reduce race window when page fault
could have created pages with stale mapping information.

Signed-off-by: Jan Kara <jack@suse.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h     | 10 +++++++++
 fs/ext4/extents.c  | 54 ++++++++++++++++++++++++--------------------
 fs/ext4/file.c     | 66 ++++++++++++++++++++++++++++++++++++++++++++++--------
 fs/ext4/inode.c    | 36 +++++++++++++++++++++--------
 fs/ext4/super.c    |  1 +
 fs/ext4/truncate.h |  2 ++
 6 files changed, 127 insertions(+), 42 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index cc7ca4e87144..348a5ff4a0e2 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -910,6 +910,15 @@ struct ext4_inode_info {
 	 * by other means, so we have i_data_sem.
 	 */
 	struct rw_semaphore i_data_sem;
+	/*
+	 * i_mmap_sem is for serializing page faults with truncate / punch hole
+	 * operations. We have to make sure that new page cannot be faulted in
+	 * a section of the inode that is being punched. We cannot easily use
+	 * i_data_sem for this since we need protection for the whole punch
+	 * operation and i_data_sem ranks below transaction start so we have
+	 * to occasionally drop it.
+	 */
+	struct rw_semaphore i_mmap_sem;
 	struct inode vfs_inode;
 	struct jbd2_inode *jinode;
 
@@ -2484,6 +2493,7 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
 			     loff_t lstart, loff_t lend);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
+extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
 					int used, int quota_claim);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 551353b1b17a..5be9ca5a8a7a 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4770,7 +4770,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
 	int partial_begin, partial_end;
 	loff_t start, end;
 	ext4_lblk_t lblk;
-	struct address_space *mapping = inode->i_mapping;
 	unsigned int blkbits = inode->i_blkbits;
 
 	trace_ext4_zero_range(inode, offset, len, mode);
@@ -4786,17 +4785,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
 	}
 
 	/*
-	 * Write out all dirty pages to avoid race conditions
-	 * Then release them.
-	 */
-	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
-		ret = filemap_write_and_wait_range(mapping, offset,
-						   offset + len - 1);
-		if (ret)
-			return ret;
-	}
-
-	/*
 	 * Round up offset. This is not fallocate, we neet to zero out
 	 * blocks, so convert interior block aligned part of the range to
 	 * unwritten and possibly manually zero out unaligned parts of the
@@ -4856,16 +4844,22 @@ static long ext4_zero_range(struct file *file, loff_t offset,
 		flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
 			  EXT4_EX_NOCACHE);
 
-		/* Now release the pages and zero block aligned part of pages*/
-		truncate_pagecache_range(inode, start, end - 1);
-		inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
-
 		/* Wait all existing dio workers, newcomers will block on i_mutex */
 		ext4_inode_block_unlocked_dio(inode);
 		inode_dio_wait(inode);
 
+		/*
+		 * Prevent page faults from reinstantiating pages we have
+		 * released from page cache.
+		 */
+		down_write(&EXT4_I(inode)->i_mmap_sem);
+		/* Now release the pages and zero block aligned part of pages */
+		truncate_pagecache_range(inode, start, end - 1);
+		inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+
 		ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
 					     flags, mode);
+		up_write(&EXT4_I(inode)->i_mmap_sem);
 		if (ret)
 			goto out_dio;
 	}
@@ -5524,17 +5518,22 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
 		goto out_mutex;
 	}
 
-	truncate_pagecache(inode, ioffset);
-
 	/* Wait for existing dio to complete */
 	ext4_inode_block_unlocked_dio(inode);
 	inode_dio_wait(inode);
 
+	/*
+	 * Prevent page faults from reinstantiating pages we have released from
+	 * page cache.
+	 */
+	down_write(&EXT4_I(inode)->i_mmap_sem);
+	truncate_pagecache(inode, ioffset);
+
 	credits = ext4_writepage_trans_blocks(inode);
 	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
-		goto out_dio;
+		goto out_mmap;
 	}
 
 	down_write(&EXT4_I(inode)->i_data_sem);
@@ -5573,7 +5572,8 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
 
 out_stop:
 	ext4_journal_stop(handle);
-out_dio:
+out_mmap:
+	up_write(&EXT4_I(inode)->i_mmap_sem);
 	ext4_inode_resume_unlocked_dio(inode);
 out_mutex:
 	mutex_unlock(&inode->i_mutex);
@@ -5660,17 +5660,22 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
 		goto out_mutex;
 	}
 
-	truncate_pagecache(inode, ioffset);
-
 	/* Wait for existing dio to complete */
 	ext4_inode_block_unlocked_dio(inode);
 	inode_dio_wait(inode);
 
+	/*
+	 * Prevent page faults from reinstantiating pages we have released from
+	 * page cache.
+	 */
+	down_write(&EXT4_I(inode)->i_mmap_sem);
+	truncate_pagecache(inode, ioffset);
+
 	credits = ext4_writepage_trans_blocks(inode);
 	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
-		goto out_dio;
+		goto out_mmap;
 	}
 
 	/* Expand file to avoid data loss if there is error while shifting */
@@ -5741,7 +5746,8 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
 
out_stop:
 	ext4_journal_stop(handle);
-out_dio:
+out_mmap:
+	up_write(&EXT4_I(inode)->i_mmap_sem);
 	ext4_inode_resume_unlocked_dio(inode);
 out_mutex:
 	mutex_unlock(&inode->i_mutex);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 113837e7ba98..0d24ebcd7c9e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -209,15 +209,18 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	int result;
 	handle_t *handle = NULL;
-	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+	struct inode *inode = file_inode(vma->vm_file);
+	struct super_block *sb = inode->i_sb;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
 
 	if (write) {
 		sb_start_pagefault(sb);
 		file_update_time(vma->vm_file);
+		down_read(&EXT4_I(inode)->i_mmap_sem);
 		handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
 						EXT4_DATA_TRANS_BLOCKS(sb));
-	}
+	} else
+		down_read(&EXT4_I(inode)->i_mmap_sem);
 
 	if (IS_ERR(handle))
 		result = VM_FAULT_SIGBUS;
@@ -228,8 +231,10 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (write) {
 		if (!IS_ERR(handle))
 			ext4_journal_stop(handle);
+		up_read(&EXT4_I(inode)->i_mmap_sem);
 		sb_end_pagefault(sb);
-	}
+	} else
+		up_read(&EXT4_I(inode)->i_mmap_sem);
 
 	return result;
 }
@@ -246,10 +251,12 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
 	if (write) {
 		sb_start_pagefault(sb);
 		file_update_time(vma->vm_file);
+		down_read(&EXT4_I(inode)->i_mmap_sem);
 		handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
 				ext4_chunk_trans_blocks(inode,
 							PMD_SIZE / PAGE_SIZE));
-	}
+	} else
+		down_read(&EXT4_I(inode)->i_mmap_sem);
 
 	if (IS_ERR(handle))
 		result = VM_FAULT_SIGBUS;
@@ -260,30 +267,71 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
 	if (write) {
 		if (!IS_ERR(handle))
 			ext4_journal_stop(handle);
+		up_read(&EXT4_I(inode)->i_mmap_sem);
 		sb_end_pagefault(sb);
-	}
+	} else
+		up_read(&EXT4_I(inode)->i_mmap_sem);
 
 	return result;
 }
 
 static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	return dax_mkwrite(vma, vmf, ext4_get_block_dax,
-				ext4_end_io_unwritten);
+	int err;
+	struct inode *inode = file_inode(vma->vm_file);
+
+	sb_start_pagefault(inode->i_sb);
+	file_update_time(vma->vm_file);
+	down_read(&EXT4_I(inode)->i_mmap_sem);
+	err = __dax_mkwrite(vma, vmf, ext4_get_block_dax,
+			    ext4_end_io_unwritten);
+	up_read(&EXT4_I(inode)->i_mmap_sem);
+	sb_end_pagefault(inode->i_sb);
+
+	return err;
+}
+
+/*
+ * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_mkwrite()
+ * handler we check for races agaist truncate. Note that since we cycle through
+ * i_mmap_sem, we are sure that also any hole punching that began before we
+ * were called is finished by now and so if it included part of the file we
+ * are working on, our pte will get unmapped and the check for pte_same() in
+ * wp_pfn_shared() fails. Thus fault gets retried and things work out as
+ * desired.
+ */
+static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma,
+				struct vm_fault *vmf)
+{
+	struct inode *inode = file_inode(vma->vm_file);
+	struct super_block *sb = inode->i_sb;
+	int ret = VM_FAULT_NOPAGE;
+	loff_t size;
+
+	sb_start_pagefault(sb);
+	file_update_time(vma->vm_file);
+	down_read(&EXT4_I(inode)->i_mmap_sem);
+	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	if (vmf->pgoff >= size)
+		ret = VM_FAULT_SIGBUS;
+	up_read(&EXT4_I(inode)->i_mmap_sem);
+	sb_end_pagefault(sb);
+
+	return ret;
 }
 
 static const struct vm_operations_struct ext4_dax_vm_ops = {
 	.fault		= ext4_dax_fault,
 	.pmd_fault	= ext4_dax_pmd_fault,
 	.page_mkwrite	= ext4_dax_mkwrite,
-	.pfn_mkwrite	= dax_pfn_mkwrite,
+	.pfn_mkwrite	= ext4_dax_pfn_mkwrite,
 };
 #else
 #define ext4_dax_vm_ops	ext4_file_vm_ops
 #endif
 
 static const struct vm_operations_struct ext4_file_vm_ops = {
-	.fault		= filemap_fault,
+	.fault		= ext4_filemap_fault,
 	.map_pages	= filemap_map_pages,
 	.page_mkwrite   = ext4_page_mkwrite,
 };
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ea433a7f4bca..d1207d03c961 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3623,6 +3623,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 
 	}
 
+	/* Wait all existing dio workers, newcomers will block on i_mutex */
+	ext4_inode_block_unlocked_dio(inode);
+	inode_dio_wait(inode);
+
+	/*
+	 * Prevent page faults from reinstantiating pages we have released from
+	 * page cache.
+	 */
+	down_write(&EXT4_I(inode)->i_mmap_sem);
 	first_block_offset = round_up(offset, sb->s_blocksize);
 	last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
 
@@ -3631,10 +3640,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 		truncate_pagecache_range(inode, first_block_offset,
 					 last_block_offset);
 
-	/* Wait all existing dio workers, newcomers will block on i_mutex */
-	ext4_inode_block_unlocked_dio(inode);
-	inode_dio_wait(inode);
-
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 		credits = ext4_writepage_trans_blocks(inode);
 	else
@@ -3680,16 +3685,12 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 	if (IS_SYNC(inode))
 		ext4_handle_sync(handle);
 
-	/* Now release the pages again to reduce race window */
-	if (last_block_offset > first_block_offset)
-		truncate_pagecache_range(inode, first_block_offset,
-					 last_block_offset);
-
 	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
 out_stop:
 	ext4_journal_stop(handle);
 out_dio:
+	up_write(&EXT4_I(inode)->i_mmap_sem);
 	ext4_inode_resume_unlocked_dio(inode);
 out_mutex:
 	mutex_unlock(&inode->i_mutex);
@@ -4823,6 +4824,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 			} else
 				ext4_wait_for_tail_page_commit(inode);
 		}
+		down_write(&EXT4_I(inode)->i_mmap_sem);
 		/*
 		 * Truncate pagecache after we've waited for commit
 		 * in data=journal mode to make pages freeable.
@@ -4830,6 +4832,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 		truncate_pagecache(inode, inode->i_size);
 		if (shrink)
 			ext4_truncate(inode);
+		up_write(&EXT4_I(inode)->i_mmap_sem);
 	}
 
 	if (!rc) {
@@ -5278,6 +5281,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 
 	sb_start_pagefault(inode->i_sb);
 	file_update_time(vma->vm_file);
+
+	down_read(&EXT4_I(inode)->i_mmap_sem);
 	/* Delalloc case is easy... */
 	if (test_opt(inode->i_sb, DELALLOC) &&
 	    !ext4_should_journal_data(inode) &&
@@ -5347,6 +5352,19 @@ retry_alloc:
 out_ret:
 	ret = block_page_mkwrite_return(ret);
 out:
+	up_read(&EXT4_I(inode)->i_mmap_sem);
 	sb_end_pagefault(inode->i_sb);
 	return ret;
 }
+
+int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct inode *inode = file_inode(vma->vm_file);
+	int err;
+
+	down_read(&EXT4_I(inode)->i_mmap_sem);
+	err = filemap_fault(vma, vmf);
+	up_read(&EXT4_I(inode)->i_mmap_sem);
+
+	return err;
+}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c9ab67da6e5a..493370e6590e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -958,6 +958,7 @@ static void init_once(void *foo)
 	INIT_LIST_HEAD(&ei->i_orphan);
 	init_rwsem(&ei->xattr_sem);
 	init_rwsem(&ei->i_data_sem);
+	init_rwsem(&ei->i_mmap_sem);
 	inode_init_once(&ei->vfs_inode);
 }
 
diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h
index 011ba6670d99..c70d06a383e2 100644
--- a/fs/ext4/truncate.h
+++ b/fs/ext4/truncate.h
@@ -10,8 +10,10 @@
  */
 static inline void ext4_truncate_failed_write(struct inode *inode)
 {
+	down_write(&EXT4_I(inode)->i_mmap_sem);
 	truncate_inode_pages(inode->i_mapping, inode->i_size);
 	ext4_truncate(inode);
+	up_write(&EXT4_I(inode)->i_mmap_sem);
 }
 
 /*
-- 
2.5.5