diff --git a/ext4-fix-races-between-buffered-IO-and-collapse-inse.patch b/ext4-fix-races-between-buffered-IO-and-collapse-inse.patch new file mode 100644 index 0000000..0c89ea7 --- /dev/null +++ b/ext4-fix-races-between-buffered-IO-and-collapse-inse.patch @@ -0,0 +1,119 @@ +From 32ebffd3bbb4162da5ff88f9a35dd32d0a28ea70 Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Mon, 7 Dec 2015 14:31:11 -0500 +Subject: [PATCH 3/4] ext4: fix races between buffered IO and collapse / insert + range + +Current code implementing FALLOC_FL_COLLAPSE_RANGE and +FALLOC_FL_INSERT_RANGE is prone to races with buffered writes and page +faults. If buffered write or write via mmap manages to squeeze between +filemap_write_and_wait_range() and truncate_pagecache() in the fallocate +implementations, the written data is simply discarded by +truncate_pagecache() although it should have been shifted. + +Fix the problem by moving filemap_write_and_wait_range() call inside +i_mutex and i_mmap_sem. That way we are protected against races with +both buffered writes and page faults. + +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +--- + fs/ext4/extents.c | 59 +++++++++++++++++++++++++++++-------------------------- + 1 file changed, 31 insertions(+), 28 deletions(-) + +diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c +index 65b5ada2833f..4b105c96df08 100644 +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -5487,21 +5487,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) + return ret; + } + +- /* +- * Need to round down offset to be aligned with page size boundary +- * for page size > block size. +- */ +- ioffset = round_down(offset, PAGE_SIZE); +- +- /* Write out all dirty pages */ +- ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, +- LLONG_MAX); +- if (ret) +- return ret; +- +- /* Take mutex lock */ + mutex_lock(&inode->i_mutex); +- + /* + * There is no need to overlap collapse range with EOF, in which case + * it is effectively a truncate operation +@@ -5526,6 +5512,27 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) + * page cache. + */ + down_write(&EXT4_I(inode)->i_mmap_sem); ++ /* ++ * Need to round down offset to be aligned with page size boundary ++ * for page size > block size. ++ */ ++ ioffset = round_down(offset, PAGE_SIZE); ++ /* ++ * Write tail of the last page before removed range since it will get ++ * removed from the page cache below. ++ */ ++ ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset); ++ if (ret) ++ goto out_mmap; ++ /* ++ * Write data that will be shifted to preserve them when discarding ++ * page cache below. We are also protected from pages becoming dirty ++ * by i_mmap_sem. ++ */ ++ ret = filemap_write_and_wait_range(inode->i_mapping, offset + len, ++ LLONG_MAX); ++ if (ret) ++ goto out_mmap; + truncate_pagecache(inode, ioffset); + + credits = ext4_writepage_trans_blocks(inode); +@@ -5626,21 +5633,7 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) + return ret; + } + +- /* +- * Need to round down to align start offset to page size boundary +- * for page size > block size. +- */ +- ioffset = round_down(offset, PAGE_SIZE); +- +- /* Write out all dirty pages */ +- ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, +- LLONG_MAX); +- if (ret) +- return ret; +- +- /* Take mutex lock */ + mutex_lock(&inode->i_mutex); +- + /* Currently just for extent based files */ + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + ret = -EOPNOTSUPP; +@@ -5668,6 +5661,16 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) + * page cache. + */ + down_write(&EXT4_I(inode)->i_mmap_sem); ++ /* ++ * Need to round down to align start offset to page size boundary ++ * for page size > block size. ++ */ ++ ioffset = round_down(offset, PAGE_SIZE); ++ /* Write out all dirty pages */ ++ ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, ++ LLONG_MAX); ++ if (ret) ++ goto out_mmap; + truncate_pagecache(inode, ioffset); + + credits = ext4_writepage_trans_blocks(inode); +-- +2.5.5 + diff --git a/ext4-fix-races-between-page-faults-and-hole-punching.patch b/ext4-fix-races-between-page-faults-and-hole-punching.patch new file mode 100644 index 0000000..9034b95 --- /dev/null +++ b/ext4-fix-races-between-page-faults-and-hole-punching.patch @@ -0,0 +1,442 @@ +From ea3d7209ca01da209cda6f0dea8be9cc4b7a933b Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Mon, 7 Dec 2015 14:28:03 -0500 +Subject: [PATCH 1/4] ext4: fix races between page faults and hole punching + +Currently, page faults and hole punching are completely unsynchronized. +This can result in page fault faulting in a page into a range that we +are punching after truncate_pagecache_range() has been called and thus +we can end up with a page mapped to disk blocks that will be shortly +freed. Filesystem corruption will shortly follow. Note that the same +race is avoided for truncate by checking page fault offset against +i_size but there isn't similar mechanism available for punching holes. + +Fix the problem by creating new rw semaphore i_mmap_sem in inode and +grab it for writing over truncate, hole punching, and other functions +removing blocks from extent tree and for read over page faults. We +cannot easily use i_data_sem for this since that ranks below transaction +start and we need something ranking above it so that it can be held over +the whole truncate / hole punching operation. Also remove various +workarounds we had in the code to reduce race window when page fault +could have created pages with stale mapping information. + +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +--- + fs/ext4/ext4.h | 10 +++++++++ + fs/ext4/extents.c | 54 ++++++++++++++++++++++++-------------------- + fs/ext4/file.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++-------- + fs/ext4/inode.c | 36 +++++++++++++++++++++-------- + fs/ext4/super.c | 1 + + fs/ext4/truncate.h | 2 ++ + 6 files changed, 127 insertions(+), 42 deletions(-) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index cc7ca4e87144..348a5ff4a0e2 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -910,6 +910,15 @@ struct ext4_inode_info { + * by other means, so we have i_data_sem. + */ + struct rw_semaphore i_data_sem; ++ /* ++ * i_mmap_sem is for serializing page faults with truncate / punch hole ++ * operations. We have to make sure that new page cannot be faulted in ++ * a section of the inode that is being punched. We cannot easily use ++ * i_data_sem for this since we need protection for the whole punch ++ * operation and i_data_sem ranks below transaction start so we have ++ * to occasionally drop it. ++ */ ++ struct rw_semaphore i_mmap_sem; + struct inode vfs_inode; + struct jbd2_inode *jinode; + +@@ -2484,6 +2493,7 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); + extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, + loff_t lstart, loff_t lend); + extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); ++extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf); + extern qsize_t *ext4_get_reserved_space(struct inode *inode); + extern void ext4_da_update_reserve_space(struct inode *inode, + int used, int quota_claim); +diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c +index 551353b1b17a..5be9ca5a8a7a 100644 +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -4770,7 +4770,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, + int partial_begin, partial_end; + loff_t start, end; + ext4_lblk_t lblk; +- struct address_space *mapping = inode->i_mapping; + unsigned int blkbits = inode->i_blkbits; + + trace_ext4_zero_range(inode, offset, len, mode); +@@ -4786,17 +4785,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, + } + + /* +- * Write out all dirty pages to avoid race conditions +- * Then release them. +- */ +- if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { +- ret = filemap_write_and_wait_range(mapping, offset, +- offset + len - 1); +- if (ret) +- return ret; +- } +- +- /* + * Round up offset. This is not fallocate, we neet to zero out + * blocks, so convert interior block aligned part of the range to + * unwritten and possibly manually zero out unaligned parts of the +@@ -4856,16 +4844,22 @@ static long ext4_zero_range(struct file *file, loff_t offset, + flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | + EXT4_EX_NOCACHE); + +- /* Now release the pages and zero block aligned part of pages*/ +- truncate_pagecache_range(inode, start, end - 1); +- inode->i_mtime = inode->i_ctime = ext4_current_time(inode); +- + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + ++ /* ++ * Prevent page faults from reinstantiating pages we have ++ * released from page cache. ++ */ ++ down_write(&EXT4_I(inode)->i_mmap_sem); ++ /* Now release the pages and zero block aligned part of pages */ ++ truncate_pagecache_range(inode, start, end - 1); ++ inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ++ + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, + flags, mode); ++ up_write(&EXT4_I(inode)->i_mmap_sem); + if (ret) + goto out_dio; + } +@@ -5524,17 +5518,22 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) + goto out_mutex; + } + +- truncate_pagecache(inode, ioffset); +- + /* Wait for existing dio to complete */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + ++ /* ++ * Prevent page faults from reinstantiating pages we have released from ++ * page cache. ++ */ ++ down_write(&EXT4_I(inode)->i_mmap_sem); ++ truncate_pagecache(inode, ioffset); ++ + credits = ext4_writepage_trans_blocks(inode); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); +- goto out_dio; ++ goto out_mmap; + } + + down_write(&EXT4_I(inode)->i_data_sem); +@@ -5573,7 +5572,8 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) + + out_stop: + ext4_journal_stop(handle); +-out_dio: ++out_mmap: ++ up_write(&EXT4_I(inode)->i_mmap_sem); + ext4_inode_resume_unlocked_dio(inode); + out_mutex: + mutex_unlock(&inode->i_mutex); +@@ -5660,17 +5660,22 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) + goto out_mutex; + } + +- truncate_pagecache(inode, ioffset); +- + /* Wait for existing dio to complete */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + ++ /* ++ * Prevent page faults from reinstantiating pages we have released from ++ * page cache. ++ */ ++ down_write(&EXT4_I(inode)->i_mmap_sem); ++ truncate_pagecache(inode, ioffset); ++ + credits = ext4_writepage_trans_blocks(inode); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); +- goto out_dio; ++ goto out_mmap; + } + + /* Expand file to avoid data loss if there is error while shifting */ +@@ -5741,7 +5746,8 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) + + out_stop: + ext4_journal_stop(handle); +-out_dio: ++out_mmap: ++ up_write(&EXT4_I(inode)->i_mmap_sem); + ext4_inode_resume_unlocked_dio(inode); + out_mutex: + mutex_unlock(&inode->i_mutex); +diff --git a/fs/ext4/file.c b/fs/ext4/file.c +index 113837e7ba98..0d24ebcd7c9e 100644 +--- a/fs/ext4/file.c ++++ b/fs/ext4/file.c +@@ -209,15 +209,18 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) + { + int result; + handle_t *handle = NULL; +- struct super_block *sb = file_inode(vma->vm_file)->i_sb; ++ struct inode *inode = file_inode(vma->vm_file); ++ struct super_block *sb = inode->i_sb; + bool write = vmf->flags & FAULT_FLAG_WRITE; + + if (write) { + sb_start_pagefault(sb); + file_update_time(vma->vm_file); ++ down_read(&EXT4_I(inode)->i_mmap_sem); + handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, + EXT4_DATA_TRANS_BLOCKS(sb)); +- } ++ } else ++ down_read(&EXT4_I(inode)->i_mmap_sem); + + if (IS_ERR(handle)) + result = VM_FAULT_SIGBUS; +@@ -228,8 +231,10 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) + if (write) { + if (!IS_ERR(handle)) + ext4_journal_stop(handle); ++ up_read(&EXT4_I(inode)->i_mmap_sem); + sb_end_pagefault(sb); +- } ++ } else ++ up_read(&EXT4_I(inode)->i_mmap_sem); + + return result; + } +@@ -246,10 +251,12 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, + if (write) { + sb_start_pagefault(sb); + file_update_time(vma->vm_file); ++ down_read(&EXT4_I(inode)->i_mmap_sem); + handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, + ext4_chunk_trans_blocks(inode, + PMD_SIZE / PAGE_SIZE)); +- } ++ } else ++ down_read(&EXT4_I(inode)->i_mmap_sem); + + if (IS_ERR(handle)) + result = VM_FAULT_SIGBUS; +@@ -260,30 +267,71 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, + if (write) { + if (!IS_ERR(handle)) + ext4_journal_stop(handle); ++ up_read(&EXT4_I(inode)->i_mmap_sem); + sb_end_pagefault(sb); +- } ++ } else ++ up_read(&EXT4_I(inode)->i_mmap_sem); + + return result; + } + + static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) + { +- return dax_mkwrite(vma, vmf, ext4_get_block_dax, +- ext4_end_io_unwritten); ++ int err; ++ struct inode *inode = file_inode(vma->vm_file); ++ ++ sb_start_pagefault(inode->i_sb); ++ file_update_time(vma->vm_file); ++ down_read(&EXT4_I(inode)->i_mmap_sem); ++ err = __dax_mkwrite(vma, vmf, ext4_get_block_dax, ++ ext4_end_io_unwritten); ++ up_read(&EXT4_I(inode)->i_mmap_sem); ++ sb_end_pagefault(inode->i_sb); ++ ++ return err; ++} ++ ++/* ++ * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_mkwrite() ++ * handler we check for races agaist truncate. Note that since we cycle through ++ * i_mmap_sem, we are sure that also any hole punching that began before we ++ * were called is finished by now and so if it included part of the file we ++ * are working on, our pte will get unmapped and the check for pte_same() in ++ * wp_pfn_shared() fails. Thus fault gets retried and things work out as ++ * desired. ++ */ ++static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma, ++ struct vm_fault *vmf) ++{ ++ struct inode *inode = file_inode(vma->vm_file); ++ struct super_block *sb = inode->i_sb; ++ int ret = VM_FAULT_NOPAGE; ++ loff_t size; ++ ++ sb_start_pagefault(sb); ++ file_update_time(vma->vm_file); ++ down_read(&EXT4_I(inode)->i_mmap_sem); ++ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; ++ if (vmf->pgoff >= size) ++ ret = VM_FAULT_SIGBUS; ++ up_read(&EXT4_I(inode)->i_mmap_sem); ++ sb_end_pagefault(sb); ++ ++ return ret; + } + + static const struct vm_operations_struct ext4_dax_vm_ops = { + .fault = ext4_dax_fault, + .pmd_fault = ext4_dax_pmd_fault, + .page_mkwrite = ext4_dax_mkwrite, +- .pfn_mkwrite = dax_pfn_mkwrite, ++ .pfn_mkwrite = ext4_dax_pfn_mkwrite, + }; + #else + #define ext4_dax_vm_ops ext4_file_vm_ops + #endif + + static const struct vm_operations_struct ext4_file_vm_ops = { +- .fault = filemap_fault, ++ .fault = ext4_filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = ext4_page_mkwrite, + }; +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index ea433a7f4bca..d1207d03c961 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -3623,6 +3623,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) + + } + ++ /* Wait all existing dio workers, newcomers will block on i_mutex */ ++ ext4_inode_block_unlocked_dio(inode); ++ inode_dio_wait(inode); ++ ++ /* ++ * Prevent page faults from reinstantiating pages we have released from ++ * page cache. ++ */ ++ down_write(&EXT4_I(inode)->i_mmap_sem); + first_block_offset = round_up(offset, sb->s_blocksize); + last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; + +@@ -3631,10 +3640,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) + truncate_pagecache_range(inode, first_block_offset, + last_block_offset); + +- /* Wait all existing dio workers, newcomers will block on i_mutex */ +- ext4_inode_block_unlocked_dio(inode); +- inode_dio_wait(inode); +- + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + credits = ext4_writepage_trans_blocks(inode); + else +@@ -3680,16 +3685,12 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + +- /* Now release the pages again to reduce race window */ +- if (last_block_offset > first_block_offset) +- truncate_pagecache_range(inode, first_block_offset, +- last_block_offset); +- + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + out_stop: + ext4_journal_stop(handle); + out_dio: ++ up_write(&EXT4_I(inode)->i_mmap_sem); + ext4_inode_resume_unlocked_dio(inode); + out_mutex: + mutex_unlock(&inode->i_mutex); +@@ -4823,6 +4824,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) + } else + ext4_wait_for_tail_page_commit(inode); + } ++ down_write(&EXT4_I(inode)->i_mmap_sem); + /* + * Truncate pagecache after we've waited for commit + * in data=journal mode to make pages freeable. +@@ -4830,6 +4832,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) + truncate_pagecache(inode, inode->i_size); + if (shrink) + ext4_truncate(inode); ++ up_write(&EXT4_I(inode)->i_mmap_sem); + } + + if (!rc) { +@@ -5278,6 +5281,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) + + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); ++ ++ down_read(&EXT4_I(inode)->i_mmap_sem); + /* Delalloc case is easy... */ + if (test_opt(inode->i_sb, DELALLOC) && + !ext4_should_journal_data(inode) && +@@ -5347,6 +5352,19 @@ retry_alloc: + out_ret: + ret = block_page_mkwrite_return(ret); + out: ++ up_read(&EXT4_I(inode)->i_mmap_sem); + sb_end_pagefault(inode->i_sb); + return ret; + } ++ ++int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) ++{ ++ struct inode *inode = file_inode(vma->vm_file); ++ int err; ++ ++ down_read(&EXT4_I(inode)->i_mmap_sem); ++ err = filemap_fault(vma, vmf); ++ up_read(&EXT4_I(inode)->i_mmap_sem); ++ ++ return err; ++} +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index c9ab67da6e5a..493370e6590e 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -958,6 +958,7 @@ static void init_once(void *foo) + INIT_LIST_HEAD(&ei->i_orphan); + init_rwsem(&ei->xattr_sem); + init_rwsem(&ei->i_data_sem); ++ init_rwsem(&ei->i_mmap_sem); + inode_init_once(&ei->vfs_inode); + } + +diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h +index 011ba6670d99..c70d06a383e2 100644 +--- a/fs/ext4/truncate.h ++++ b/fs/ext4/truncate.h +@@ -10,8 +10,10 @@ + */ + static inline void ext4_truncate_failed_write(struct inode *inode) + { ++ down_write(&EXT4_I(inode)->i_mmap_sem); + truncate_inode_pages(inode->i_mapping, inode->i_size); + ext4_truncate(inode); ++ up_write(&EXT4_I(inode)->i_mmap_sem); + } + + /* +-- +2.5.5 + diff --git a/ext4-fix-races-of-writeback-with-punch-hole-and-zero.patch b/ext4-fix-races-of-writeback-with-punch-hole-and-zero.patch new file mode 100644 index 0000000..9ff9e27 --- /dev/null +++ b/ext4-fix-races-of-writeback-with-punch-hole-and-zero.patch @@ -0,0 +1,110 @@ +From 011278485ecc3cd2a3954b5d4c73101d919bf1fa Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Mon, 7 Dec 2015 14:34:49 -0500 +Subject: [PATCH 4/4] ext4: fix races of writeback with punch hole and zero + range + +When doing delayed allocation, update of on-disk inode size is postponed +until IO submission time. However hole punch or zero range fallocate +calls can end up discarding the tail page cache page and thus on-disk +inode size would never be properly updated. + +Make sure the on-disk inode size is updated before truncating page +cache. + +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +--- + fs/ext4/ext4.h | 3 +++ + fs/ext4/extents.c | 5 +++++ + fs/ext4/inode.c | 35 ++++++++++++++++++++++++++++++++++- + 3 files changed, 42 insertions(+), 1 deletion(-) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index 348a5ff4a0e2..80f76f092079 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -2858,6 +2858,9 @@ static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize) + return changed; + } + ++int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, ++ loff_t len); ++ + struct ext4_group_info { + unsigned long bb_state; + struct rb_root bb_free_root; +diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c +index 4b105c96df08..3578b25fccfd 100644 +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -4847,6 +4847,11 @@ static long ext4_zero_range(struct file *file, loff_t offset, + * released from page cache. + */ + down_write(&EXT4_I(inode)->i_mmap_sem); ++ ret = ext4_update_disksize_before_punch(inode, offset, len); ++ if (ret) { ++ up_write(&EXT4_I(inode)->i_mmap_sem); ++ goto out_dio; ++ } + /* Now release the pages and zero block aligned part of pages */ + truncate_pagecache_range(inode, start, end - 1); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index d1207d03c961..472e608da13d 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -3559,6 +3559,35 @@ int ext4_can_truncate(struct inode *inode) + } + + /* ++ * We have to make sure i_disksize gets properly updated before we truncate ++ * page cache due to hole punching or zero range. Otherwise i_disksize update ++ * can get lost as it may have been postponed to submission of writeback but ++ * that will never happen after we truncate page cache. ++ */ ++int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, ++ loff_t len) ++{ ++ handle_t *handle; ++ loff_t size = i_size_read(inode); ++ ++ WARN_ON(!mutex_is_locked(&inode->i_mutex)); ++ if (offset > size || offset + len < size) ++ return 0; ++ ++ if (EXT4_I(inode)->i_disksize >= size) ++ return 0; ++ ++ handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ext4_update_i_disksize(inode, size); ++ ext4_mark_inode_dirty(handle, inode); ++ ext4_journal_stop(handle); ++ ++ return 0; ++} ++ ++/* + * ext4_punch_hole: punches a hole in a file by releaseing the blocks + * associated with the given offset and length + * +@@ -3636,9 +3665,13 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) + last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; + + /* Now release the pages and zero block aligned part of pages*/ +- if (last_block_offset > first_block_offset) ++ if (last_block_offset > first_block_offset) { ++ ret = ext4_update_disksize_before_punch(inode, offset, length); ++ if (ret) ++ goto out_dio; + truncate_pagecache_range(inode, first_block_offset, + last_block_offset); ++ } + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + credits = ext4_writepage_trans_blocks(inode); +-- +2.5.5 + diff --git a/ext4-move-unlocked-dio-protection-from-ext4_alloc_fi.patch b/ext4-move-unlocked-dio-protection-from-ext4_alloc_fi.patch new file mode 100644 index 0000000..3bf6765 --- /dev/null +++ b/ext4-move-unlocked-dio-protection-from-ext4_alloc_fi.patch @@ -0,0 +1,93 @@ +From 17048e8a083fec7ad841d88ef0812707fbc7e39f Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Mon, 7 Dec 2015 14:29:17 -0500 +Subject: [PATCH 2/4] ext4: move unlocked dio protection from + ext4_alloc_file_blocks() + +Currently ext4_alloc_file_blocks() was handling protection against +unlocked DIO. However we now need to sometimes call it under i_mmap_sem +and sometimes not and DIO protection ranks above it (although strictly +speaking this cannot currently create any deadlocks). Also +ext4_zero_range() was actually getting & releasing unlocked DIO +protection twice in some cases. Luckily it didn't introduce any real bug +but it was a land mine waiting to be stepped on. So move DIO protection +out from ext4_alloc_file_blocks() into the two callsites. + +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +--- + fs/ext4/extents.c | 21 ++++++++++----------- + 1 file changed, 10 insertions(+), 11 deletions(-) + +diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c +index 5be9ca5a8a7a..65b5ada2833f 100644 +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -4685,10 +4685,6 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, + if (len <= EXT_UNWRITTEN_MAX_LEN) + flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; + +- /* Wait all existing dio workers, newcomers will block on i_mutex */ +- ext4_inode_block_unlocked_dio(inode); +- inode_dio_wait(inode); +- + /* + * credits to insert 1 extent into extent tree + */ +@@ -4752,8 +4748,6 @@ retry: + goto retry; + } + +- ext4_inode_resume_unlocked_dio(inode); +- + return ret > 0 ? ret2 : ret; + } + +@@ -4827,6 +4821,10 @@ static long ext4_zero_range(struct file *file, loff_t offset, + if (mode & FALLOC_FL_KEEP_SIZE) + flags |= EXT4_GET_BLOCKS_KEEP_SIZE; + ++ /* Wait all existing dio workers, newcomers will block on i_mutex */ ++ ext4_inode_block_unlocked_dio(inode); ++ inode_dio_wait(inode); ++ + /* Preallocate the range including the unaligned edges */ + if (partial_begin || partial_end) { + ret = ext4_alloc_file_blocks(file, +@@ -4835,7 +4833,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, + round_down(offset, 1 << blkbits)) >> blkbits, + new_size, flags, mode); + if (ret) +- goto out_mutex; ++ goto out_dio; + + } + +@@ -4844,10 +4842,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, + flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | + EXT4_EX_NOCACHE); + +- /* Wait all existing dio workers, newcomers will block on i_mutex */ +- ext4_inode_block_unlocked_dio(inode); +- inode_dio_wait(inode); +- + /* + * Prevent page faults from reinstantiating pages we have + * released from page cache. +@@ -4992,8 +4986,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) + goto out; + } + ++ /* Wait all existing dio workers, newcomers will block on i_mutex */ ++ ext4_inode_block_unlocked_dio(inode); ++ inode_dio_wait(inode); ++ + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, + flags, mode); ++ ext4_inode_resume_unlocked_dio(inode); + if (ret) + goto out; + +-- +2.5.5 + diff --git a/kernel.spec b/kernel.spec index f45fb30..ab91030 100644 --- a/kernel.spec +++ b/kernel.spec @@ -681,6 +681,12 @@ Patch689: x86-iopl-64-Properly-context-switch-IOPL-on-Xen-PV.patch # CVE-2016-3672 rhbz 1324749 1324750 Patch690: x86-mm-32-Enable-full-randomization-on-i386-and-X86_.patch +#CVE-2015-8839 rhbz 1323577 1323579 +Patch691: ext4-fix-races-between-page-faults-and-hole-punching.patch +Patch692: ext4-move-unlocked-dio-protection-from-ext4_alloc_fi.patch +Patch693: ext4-fix-races-between-buffered-IO-and-collapse-inse.patch +Patch694: ext4-fix-races-of-writeback-with-punch-hole-and-zero.patch + # END OF PATCH DEFINITIONS %endif @@ -2124,6 +2130,9 @@ fi # # %changelog +* Mon Apr 11 2016 Josh Boyer +- CVE-2015-8839 ext4: data corruption due to punch hole races (rhbz 1323577 1323579) + * Thu Apr 07 2016 Justin M. Forbes - Enable Full Randomization on 32bit x86 CVE-2016-3672 (rhbz 1324749 1324750)