From b0cdc7c3c52a170c28fb4f5a98debcda391ee84e Mon Sep 17 00:00:00 2001
From: Josh Boyer
Date: Nov 19 2012 16:04:45 +0000
Subject: Apply patches from Jeff Moyer to fix direct-io oops (rhbz 812129)

---

diff --git a/block-fix-a-crash-when-block-device-is.patch b/block-fix-a-crash-when-block-device-is.patch
new file mode 100644
index 0000000..af99283
--- /dev/null
+++ b/block-fix-a-crash-when-block-device-is.patch
@@ -0,0 +1,214 @@
+Fix a crash when block device is read and block size is changed at the same time
+
+commit b87570f5d349661814b262dd5fc40787700f80d6
+Author: Mikulas Patocka
+Date: Wed Sep 26 07:46:40 2012 +0200
+
+ Fix a crash when block device is read and block size is changed at the same time
+
+ The kernel may crash when the block size is changed and I/O is issued
+ simultaneously.
+
+ Because some subsystems (udev or lvm) may read any block device at any
+ time, the bug actually puts any code that changes a block device size in
+ jeopardy.
+
+ The crash can be reproduced if you place "msleep(1000)" in
+ blkdev_get_blocks just before "bh->b_size = max_blocks <<
+ inode->i_blkbits;".
+ Then, run "dd if=/dev/ram0 of=/dev/null bs=4k count=1 iflag=direct".
+ While it is waiting in msleep, run "blockdev --setbsz 2048 /dev/ram0".
+ You get a BUG.
+
+ The direct and non-direct I/O code is written with the assumption that
+ the block size does not change. It doesn't seem practical to fix these
+ crashes one by one; there may be many crash possibilities when the block
+ size changes at a certain place, and it is impossible to find them all
+ and verify the code.
+
+ This patch introduces a new rw-lock, bd_block_size_semaphore. The lock
+ is taken for read during I/O. It is taken for write when changing the
+ block size. Consequently, the block size cannot be changed while I/O is
+ being submitted.
+
+ For asynchronous I/O, the patch only prevents block size changes while
+ the I/O is being submitted. The block size can change while the I/O is
+ in progress or while the I/O is being finished. This is acceptable
+ because there are no accesses to the block size while asynchronous I/O
+ is being finished.
+
+ The patch also prevents the block size from changing while the device
+ is mapped with mmap.
+
+ Signed-off-by: Mikulas Patocka
+ Signed-off-by: Jens Axboe
+
+Index: linux-3.6.x86_64/drivers/char/raw.c
+===================================================================
+--- linux-3.6.x86_64.orig/drivers/char/raw.c 2012-11-16 17:12:35.127010280 -0500
++++ linux-3.6.x86_64/drivers/char/raw.c 2012-11-16 17:12:37.381002516 -0500
+@@ -285,7 +285,7 @@
+
+ static const struct file_operations raw_fops = {
+ .read = do_sync_read,
+- .aio_read = generic_file_aio_read,
++ .aio_read = blkdev_aio_read,
+ .write = do_sync_write,
+ .aio_write = blkdev_aio_write,
+ .fsync = blkdev_fsync,
+Index: linux-3.6.x86_64/fs/block_dev.c
+===================================================================
+--- linux-3.6.x86_64.orig/fs/block_dev.c 2012-11-16 17:12:35.127010280 -0500
++++ linux-3.6.x86_64/fs/block_dev.c 2012-11-16 17:12:37.381002516 -0500
+@@ -116,6 +116,8 @@
+
+ int set_blocksize(struct block_device *bdev, int size)
+ {
++ struct address_space *mapping;
++
+ /* Size must be a power of two, and between 512 and PAGE_SIZE */
+ if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
+ return -EINVAL;
+@@ -124,6 +126,20 @@
+ if (size < bdev_logical_block_size(bdev))
+ return -EINVAL;
+
++ /* Prevent starting I/O or mapping the device */
++ down_write(&bdev->bd_block_size_semaphore);
++
++ /* Check that the block device is not memory mapped */
++ mapping = bdev->bd_inode->i_mapping;
++ mutex_lock(&mapping->i_mmap_mutex);
++ if (!prio_tree_empty(&mapping->i_mmap) ||
++ !list_empty(&mapping->i_mmap_nonlinear)) {
++ mutex_unlock(&mapping->i_mmap_mutex);
++ up_write(&bdev->bd_block_size_semaphore);
++ return -EBUSY;
++ }
++ mutex_unlock(&mapping->i_mmap_mutex);
++
+ /* Don't change the size if it is same as current */
+ if (bdev->bd_block_size != size) {
+ sync_blockdev(bdev);
+@@ -131,6 +147,9 @@
+ bdev->bd_inode->i_blkbits = blksize_bits(size);
+ kill_bdev(bdev);
+ }
++
++ up_write(&bdev->bd_block_size_semaphore);
++
+ return 0;
+ }
+
+@@ -472,6 +491,7 @@
+ inode_init_once(&ei->vfs_inode);
+ /* Initialize mutex for freeze. */
+ mutex_init(&bdev->bd_fsfreeze_mutex);
++ init_rwsem(&bdev->bd_block_size_semaphore);
+ }
+
+ static inline void __bd_forget(struct inode *inode)
+@@ -1567,6 +1587,22 @@
+ return blkdev_ioctl(bdev, mode, cmd, arg);
+ }
+
++ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
++ unsigned long nr_segs, loff_t pos)
++{
++ ssize_t ret;
++ struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
++
++ down_read(&bdev->bd_block_size_semaphore);
++
++ ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
++
++ up_read(&bdev->bd_block_size_semaphore);
++
++ return ret;
++}
++EXPORT_SYMBOL_GPL(blkdev_aio_read);
++
+ /*
+ * Write data to the block device. Only intended for the block device itself
+ * and the raw driver which basically is a fake block device.
+@@ -1578,12 +1614,16 @@
+ unsigned long nr_segs, loff_t pos)
+ {
+ struct file *file = iocb->ki_filp;
++ struct block_device *bdev = I_BDEV(file->f_mapping->host);
+ struct blk_plug plug;
+ ssize_t ret;
+
+ BUG_ON(iocb->ki_pos != pos);
+
+ blk_start_plug(&plug);
++
++ down_read(&bdev->bd_block_size_semaphore);
++
+ ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
+ if (ret > 0 || ret == -EIOCBQUEUED) {
+ ssize_t err;
+@@ -1592,11 +1632,29 @@
+ if (err < 0 && ret > 0)
+ ret = err;
+ }
++
++ up_read(&bdev->bd_block_size_semaphore);
++
+ blk_finish_plug(&plug);
++
+ return ret;
+ }
+ EXPORT_SYMBOL_GPL(blkdev_aio_write);
+
++int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
++{
++ int ret;
++ struct block_device *bdev = I_BDEV(file->f_mapping->host);
++
++ down_read(&bdev->bd_block_size_semaphore);
++
++ ret = generic_file_mmap(file, vma);
++
++ up_read(&bdev->bd_block_size_semaphore);
++
++ return ret;
++}
++
+ /*
+ * Try to release a page associated with block device when the system
+ * is under memory pressure.
+@@ -1627,9 +1685,9 @@
+ .llseek = block_llseek,
+ .read = do_sync_read,
+ .write = do_sync_write,
+- .aio_read = generic_file_aio_read,
++ .aio_read = blkdev_aio_read,
+ .aio_write = blkdev_aio_write,
+- .mmap = generic_file_mmap,
++ .mmap = blkdev_mmap,
+ .fsync = blkdev_fsync,
+ .unlocked_ioctl = block_ioctl,
+ #ifdef CONFIG_COMPAT
+Index: linux-3.6.x86_64/include/linux/fs.h
+===================================================================
+--- linux-3.6.x86_64.orig/include/linux/fs.h 2012-11-16 17:12:35.127010280 -0500
++++ linux-3.6.x86_64/include/linux/fs.h 2012-11-16 17:12:37.424002387 -0500
+@@ -724,6 +724,8 @@
+ int bd_fsfreeze_count;
+ /* Mutex for freeze */
+ struct mutex bd_fsfreeze_mutex;
++ /* A semaphore that prevents I/O while block size is being changed */
++ struct rw_semaphore bd_block_size_semaphore;
+ };
+
+ /*
+@@ -2564,6 +2566,8 @@
+ unsigned long *nr_segs, size_t *count, int access_flags);
+
+ /* fs/block_dev.c */
++extern ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
++ unsigned long nr_segs, loff_t pos);
+ extern ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos);
+extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
diff --git a/blockdev-turn-a-rw-semaphore-into-a-percpu-rw-sem.patch b/blockdev-turn-a-rw-semaphore-into-a-percpu-rw-sem.patch
new file mode 100644
index 0000000..82caa6b
--- /dev/null
+++ b/blockdev-turn-a-rw-semaphore-into-a-percpu-rw-sem.patch
@@ -0,0 +1,290 @@
+blockdev: turn a rw semaphore into a percpu rw semaphore
+
+commit 62ac665ff9fc07497ca524bd20d6a96893d11071
+Author: Mikulas Patocka
+Date: Wed Sep 26 07:46:43 2012 +0200
+
+ blockdev: turn a rw semaphore into a percpu rw semaphore
+
+ This avoids cache line bouncing when many processes lock the semaphore
+ for read.
+
+ New percpu lock implementation:
+
+ The lock consists of an array of percpu unsigned integers, a boolean
+ variable and a mutex.
+
+ When we take the lock for read, we enter an rcu read section and check
+ the "locked" variable. If it is false, we increase a percpu counter on
+ the current cpu and exit the rcu section. If "locked" is true, we exit
+ the rcu section, take the mutex and drop it (this waits until a writer
+ has finished) and retry.
+
+ Unlocking for read just decreases the percpu variable. Note that we can
+ unlock on a different cpu than the one where we locked; in this case the
+ counter underflows. The sum of all percpu counters represents the number
+ of processes that hold the lock for read.
+
+ When we need to lock for write, we take the mutex, set the "locked"
+ variable to true and synchronize rcu. Since RCU has been synchronized,
+ no processes can create new read locks. We wait until the sum of the
+ percpu counters is zero; when it is, there are no readers in the
+ critical section.
+
+ Signed-off-by: Mikulas Patocka
+ Signed-off-by: Jens Axboe
+
+Index: linux-3.6.x86_64/Documentation/percpu-rw-semaphore.txt
+===================================================================
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ linux-3.6.x86_64/Documentation/percpu-rw-semaphore.txt 2012-11-16 17:12:57.351936583 -0500
+@@ -0,0 +1,27 @@
++Percpu rw semaphores
++--------------------
++
++Percpu rw semaphores are a new read-write semaphore design that is
++optimized for locking for reading.
++
++The problem with traditional read-write semaphores is that when multiple
++cores take the lock for reading, the cache line containing the semaphore
++is bouncing between L1 caches of the cores, causing performance
++degradation.
++
++Locking for reading is very fast; it uses RCU and avoids any atomic
++instruction in the lock and unlock path. On the other hand, locking for
++writing is very expensive; it calls synchronize_rcu(), which can take
++hundreds of microseconds.
++
++The lock is declared with the "struct percpu_rw_semaphore" type.
++The lock is initialized with percpu_init_rwsem; it returns 0 on success
++and -ENOMEM on allocation failure.
++The lock must be freed with percpu_free_rwsem to avoid a memory leak.
++
++The lock is taken for read with percpu_down_read and released with
++percpu_up_read; for write, use percpu_down_write and percpu_up_write.
++
++The idea of using RCU for an optimized rw-lock was introduced by
++Eric Dumazet.
++The code was written by Mikulas Patocka.
+Index: linux-3.6.x86_64/fs/block_dev.c
+===================================================================
+--- linux-3.6.x86_64.orig/fs/block_dev.c 2012-11-16 17:12:37.381002516 -0500
++++ linux-3.6.x86_64/fs/block_dev.c 2012-11-16 17:27:41.217005828 -0500
+@@ -127,7 +127,7 @@
+ return -EINVAL;
+
+ /* Prevent starting I/O or mapping the device */
+- down_write(&bdev->bd_block_size_semaphore);
++ percpu_down_write(&bdev->bd_block_size_semaphore);
+
+ /* Check that the block device is not memory mapped */
+ mapping = bdev->bd_inode->i_mapping;
+@@ -135,7 +135,7 @@
+ if (!prio_tree_empty(&mapping->i_mmap) ||
+ !list_empty(&mapping->i_mmap_nonlinear)) {
+ mutex_unlock(&mapping->i_mmap_mutex);
+- up_write(&bdev->bd_block_size_semaphore);
++ percpu_up_write(&bdev->bd_block_size_semaphore);
+ return -EBUSY;
+ }
+ mutex_unlock(&mapping->i_mmap_mutex);
+@@ -148,7 +148,7 @@
+ kill_bdev(bdev);
+ }
+
+- up_write(&bdev->bd_block_size_semaphore);
++ percpu_up_write(&bdev->bd_block_size_semaphore);
+
+ return 0;
+ }
+@@ -460,6 +460,12 @@
+ struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
+ if (!ei)
+ return NULL;
++
++ if (unlikely(percpu_init_rwsem(&ei->bdev.bd_block_size_semaphore))) {
++ kmem_cache_free(bdev_cachep, ei);
++ return NULL;
++ }
++
+ return &ei->vfs_inode;
+ }
+
+@@ -468,6 +474,8 @@
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+ struct bdev_inode *bdi = BDEV_I(inode);
+
++ percpu_free_rwsem(&bdi->bdev.bd_block_size_semaphore);
++
+ kmem_cache_free(bdev_cachep, bdi);
+ }
+
+@@ -491,7 +499,6 @@
+ inode_init_once(&ei->vfs_inode);
+ /* Initialize mutex for freeze. */
+ mutex_init(&bdev->bd_fsfreeze_mutex);
+- init_rwsem(&bdev->bd_block_size_semaphore);
+ }
+
+ static inline void __bd_forget(struct inode *inode)
+@@ -1593,11 +1600,11 @@
+ ssize_t ret;
+ struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
+
+- down_read(&bdev->bd_block_size_semaphore);
++ percpu_down_read(&bdev->bd_block_size_semaphore);
+
+ ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
+
+- up_read(&bdev->bd_block_size_semaphore);
++ percpu_up_read(&bdev->bd_block_size_semaphore);
+
+ return ret;
+ }
+@@ -1622,7 +1629,7 @@
+
+ blk_start_plug(&plug);
+
+- down_read(&bdev->bd_block_size_semaphore);
++ percpu_down_read(&bdev->bd_block_size_semaphore);
+
+ ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
+ if (ret > 0 || ret == -EIOCBQUEUED) {
+@@ -1633,7 +1640,7 @@
+ ret = err;
+ }
+
+- up_read(&bdev->bd_block_size_semaphore);
++ percpu_up_read(&bdev->bd_block_size_semaphore);
+
+ blk_finish_plug(&plug);
+
+@@ -1646,11 +1653,11 @@
+ int ret;
+ struct block_device *bdev = I_BDEV(file->f_mapping->host);
+
+- down_read(&bdev->bd_block_size_semaphore);
++ percpu_down_read(&bdev->bd_block_size_semaphore);
+
+ ret = generic_file_mmap(file, vma);
+
+- up_read(&bdev->bd_block_size_semaphore);
++ percpu_up_read(&bdev->bd_block_size_semaphore);
+
+ return ret;
+ }
+Index: linux-3.6.x86_64/include/linux/fs.h
+===================================================================
+--- linux-3.6.x86_64.orig/include/linux/fs.h 2012-11-16 17:12:37.424002387 -0500
++++ linux-3.6.x86_64/include/linux/fs.h 2012-11-16 17:28:12.578901349 -0500
+@@ -415,6 +415,7 @@
+ #include
+ #include
+ #include
++#include <linux/percpu-rwsem.h>
+
+ #include
+
+@@ -725,7 +726,7 @@
+ /* Mutex for freeze */
+ struct mutex bd_fsfreeze_mutex;
+ /* A semaphore that prevents I/O while block size is being changed */
+- struct rw_semaphore bd_block_size_semaphore;
++ struct percpu_rw_semaphore bd_block_size_semaphore;
+ };
+
+ /*
+Index: linux-3.6.x86_64/include/linux/percpu-rwsem.h
+===================================================================
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ linux-3.6.x86_64/include/linux/percpu-rwsem.h 2012-11-16 17:12:57.354936574 -0500
+@@ -0,0 +1,89 @@
++#ifndef _LINUX_PERCPU_RWSEM_H
++#define _LINUX_PERCPU_RWSEM_H
++
++#include <linux/mutex.h>
++#include <linux/percpu.h>
++#include <linux/rcupdate.h>
++#include <linux/delay.h>
++
++struct percpu_rw_semaphore {
++ unsigned __percpu *counters;
++ bool locked;
++ struct mutex mtx;
++};
++
++static inline void percpu_down_read(struct percpu_rw_semaphore *p)
++{
++ rcu_read_lock();
++ if (unlikely(p->locked)) {
++ rcu_read_unlock();
++ mutex_lock(&p->mtx);
++ this_cpu_inc(*p->counters);
++ mutex_unlock(&p->mtx);
++ return;
++ }
++ this_cpu_inc(*p->counters);
++ rcu_read_unlock();
++}
++
++static inline void percpu_up_read(struct percpu_rw_semaphore *p)
++{
++ /*
++ * On X86, write operation in this_cpu_dec serves as a memory unlock
++ * barrier (i.e. memory accesses may be moved before the write, but
++ * no memory accesses are moved past the write).
++ * On other architectures this may not be the case, so we need smp_mb()
++ * there.
++ */
++#if defined(CONFIG_X86) && (!defined(CONFIG_X86_PPRO_FENCE) && !defined(CONFIG_X86_OOSTORE))
++ barrier();
++#else
++ smp_mb();
++#endif
++ this_cpu_dec(*p->counters);
++}
++
++static inline unsigned __percpu_count(unsigned __percpu *counters)
++{
++ unsigned total = 0;
++ int cpu;
++
++ for_each_possible_cpu(cpu)
++ total += ACCESS_ONCE(*per_cpu_ptr(counters, cpu));
++
++ return total;
++}
++
++static inline void percpu_down_write(struct percpu_rw_semaphore *p)
++{
++ mutex_lock(&p->mtx);
++ p->locked = true;
++ synchronize_rcu();
++ while (__percpu_count(p->counters))
++ msleep(1);
++ smp_rmb(); /* paired with smp_mb() in percpu_up_read() */
++}
++
++static inline void percpu_up_write(struct percpu_rw_semaphore *p)
++{
++ p->locked = false;
++ mutex_unlock(&p->mtx);
++}
++
++static inline int percpu_init_rwsem(struct percpu_rw_semaphore *p)
++{
++ p->counters = alloc_percpu(unsigned);
++ if (unlikely(!p->counters))
++ return -ENOMEM;
++ p->locked = false;
++ mutex_init(&p->mtx);
++ return 0;
++}
++
++static inline void percpu_free_rwsem(struct percpu_rw_semaphore *p)
++{
++ free_percpu(p->counters);
++ p->counters = NULL; /* catch use after free bugs */
++}
++
++#endif
diff --git a/fs-lock-splice_read-and-splice_write-functions.patch b/fs-lock-splice_read-and-splice_write-functions.patch
new file mode 100644
index 0000000..938cbd9
--- /dev/null
+++ b/fs-lock-splice_read-and-splice_write-functions.patch
@@ -0,0 +1,74 @@
+Lock splice_read and splice_write functions
+
+commit 1a25b1c4ce189e3926f2981f3302352a930086db
+Author: Mikulas Patocka
+Date: Mon Oct 15 17:20:17 2012 -0400
+
+ Lock splice_read and splice_write functions
+
+ Functions generic_file_splice_read and generic_file_splice_write access
+ the pagecache directly. For block devices these functions must be locked
+ so that the block size is not changed while they are in progress.
+
+ This patch is an additional fix for commit b87570f5d349 ("Fix a crash
+ when block device is read and block size is changed at the same time")
+ that locked aio_read, aio_write and mmap against block size change.
+
+ Signed-off-by: Mikulas Patocka
+ Signed-off-by: Linus Torvalds
+
+Index: linux-3.6.x86_64/fs/block_dev.c
+===================================================================
+--- linux-3.6.x86_64.orig/fs/block_dev.c 2012-11-16 17:12:57.352936580 -0500
++++ linux-3.6.x86_64/fs/block_dev.c 2012-11-16 17:13:11.908887989 -0500
+@@ -1662,6 +1662,39 @@
+ return ret;
+ }
+
++static ssize_t blkdev_splice_read(struct file *file, loff_t *ppos,
++ struct pipe_inode_info *pipe, size_t len,
++ unsigned int flags)
++{
++ ssize_t ret;
++ struct block_device *bdev = I_BDEV(file->f_mapping->host);
++
++ percpu_down_read(&bdev->bd_block_size_semaphore);
++
++ ret = generic_file_splice_read(file, ppos, pipe, len, flags);
++
++ percpu_up_read(&bdev->bd_block_size_semaphore);
++
++ return ret;
++}
++
++static ssize_t blkdev_splice_write(struct pipe_inode_info *pipe,
++ struct file *file, loff_t *ppos, size_t len,
++ unsigned int flags)
++{
++ ssize_t ret;
++ struct block_device *bdev = I_BDEV(file->f_mapping->host);
++
++ percpu_down_read(&bdev->bd_block_size_semaphore);
++
++ ret = generic_file_splice_write(pipe, file, ppos, len, flags);
++
++ percpu_up_read(&bdev->bd_block_size_semaphore);
++
++ return ret;
++}
++
++
+ /*
+ * Try to release a page associated with block device when the system
+ * is under memory pressure.
+@@ -1700,8 +1733,8 @@
+ #ifdef CONFIG_COMPAT
+ .compat_ioctl = compat_blkdev_ioctl,
+ #endif
+- .splice_read = generic_file_splice_read,
+- .splice_write = generic_file_splice_write,
++ .splice_read = blkdev_splice_read,
++ .splice_write = blkdev_splice_write,
+ };
+
+ int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
diff --git a/kernel.spec b/kernel.spec
index 854f6ec..e584991 100644
--- a/kernel.spec
+++ b/kernel.spec
@@ -62,7 +62,7 @@ Summary: The Linux kernel
 # For non-released -rc kernels, this will be appended after the rcX and
 # gitX tags, so a 3 here would become part of release "0.rcX.gitX.3"
 #
-%global baserelease 1
+%global baserelease 2
 %global fedora_build %{baserelease}
 
 # base_sublevel is the kernel version we're starting with and patching
@@ -802,6 +802,11 @@
 Patch22114: iwlwifi-remove-queue-empty-warn-3.6.patch
 
 #rhbz 870562
 Patch22115: keyspan.patch
 
+#rhbz 812129
+Patch22120: block-fix-a-crash-when-block-device-is.patch
+Patch22121: blockdev-turn-a-rw-semaphore-into-a-percpu-rw-sem.patch
+Patch22122: fs-lock-splice_read-and-splice_write-functions.patch
+
 # END OF PATCH DEFINITIONS
 %endif
@@ -1551,6 +1556,11 @@
 ApplyPatch iwlwifi-remove-queue-empty-warn-3.6.patch
 
 #rhbz 870562
 ApplyPatch keyspan.patch
 
+#rhbz 812129
+ApplyPatch block-fix-a-crash-when-block-device-is.patch
+ApplyPatch blockdev-turn-a-rw-semaphore-into-a-percpu-rw-sem.patch
+ApplyPatch fs-lock-splice_read-and-splice_write-functions.patch
+
 # END OF PATCH APPLICATIONS
 %endif
@@ -2416,6 +2426,9 @@
 fi
 # ||----w |
 # || ||
 %changelog
+* Mon Nov 19 2012 Josh Boyer
+- Apply patches from Jeff Moyer to fix direct-io oops (rhbz 812129)
+
 * Sat Nov 17 2012 Justin M. Forbes - 3.6.7-1
 - linux 3.6.7
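
For anyone reviewing the three patches above, the locking contract they add around block-device
I/O reduces to the sketch below. This is an illustrative sketch only, not code from the patches:
do_io() and change_block_size() are hypothetical stand-ins for the real I/O entry points
(aio_read, aio_write, mmap, splice_read, splice_write) and for the body of set_blocksize().

	/*
	 * Illustrative sketch of the bd_block_size_semaphore contract,
	 * assuming the percpu-rwsem.h introduced above.  do_io() and
	 * change_block_size() are hypothetical stand-ins, not kernel APIs.
	 */
	#include <linux/percpu-rwsem.h>

	static struct percpu_rw_semaphore bs_sem; /* percpu_init_rwsem(&bs_sem) at setup */

	static void do_io(void) { }               /* stand-in: submit block I/O */
	static void change_block_size(void) { }   /* stand-in: set_blocksize() body */

	static void reader_path(void)             /* pattern used by every I/O entry point */
	{
		percpu_down_read(&bs_sem);        /* fast path: percpu increment under RCU */
		do_io();                          /* block size cannot change here */
		percpu_up_read(&bs_sem);
	}

	static void writer_path(void)             /* pattern used by set_blocksize() */
	{
		percpu_down_write(&bs_sem);       /* slow path: synchronize_rcu(), drain readers */
		change_block_size();              /* no read-side I/O can be in flight */
		percpu_up_write(&bs_sem);
	}

The asymmetry is deliberate: readers (every I/O) pay only a per-cpu increment, while the writer
(a rare "blockdev --setbsz") absorbs the synchronize_rcu() latency and the wait for readers to drain.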