cd39414
From e23394806df0768ed2dac87484590d2f3a730d55 Mon Sep 17 00:00:00 2001
cd39414
From: Dmitry Monakhov <dmonakhov@openvz.org>
cd39414
Date: Sat, 29 Sep 2012 00:14:55 -0400
cd39414
Subject: [PATCH 04/13] ext4: completed_io locking cleanup
cd39414
cd39414
The current unwritten extent conversion state machine is very fuzzy.
cd39414
- For an unknown reason it performs the conversion under i_mutex. What for?
cd39414
  My diagnosis:
cd39414
  We already protect extent tree with i_data_sem, truncate and punch_hole
cd39414
  should wait for DIO, so the only data we have to protect is end_io->flags
cd39414
  modification, but only flush_completed_IO and end_io_work modify these
cd39414
  flags and we can serialize them via i_completed_io_lock.
cd39414
cd39414
  Currently all these games with mutex_trylock result in the following deadlock
cd39414
   truncate:                          kworker:
cd39414
    ext4_setattr                       ext4_end_io_work
cd39414
    mutex_lock(i_mutex)
cd39414
    inode_dio_wait(inode)  ->BLOCK
cd39414
                             DEADLOCK<- mutex_trylock()
cd39414
                                        inode_dio_done()
cd39414
  #TEST_CASE1_BEGIN
cd39414
  MNT=/mnt_scrach
cd39414
  unlink $MNT/file
cd39414
  fallocate -l $((1024*1024*1024)) $MNT/file
cd39414
  aio-stress -I 100000 -O -s 100m -n -t 1 -c 10 -o 2 -o 3 $MNT/file
cd39414
  sleep 2
cd39414
  truncate -s 0 $MNT/file
cd39414
  #TEST_CASE1_END
cd39414
cd39414
Or use xfstests test 286: https://github.com/dmonakhov/xfstests/blob/devel/286
cd39414
cd39414
This patch makes state machine simple and clean:
cd39414
cd39414
(1) xxx_end_io schedules the final extent conversion simply by calling
cd39414
    ext4_add_complete_io(), which appends it to ei->i_completed_io_list
cd39414
    NOTE1: because of (2A) work should be queued only if
cd39414
    ->i_completed_io_list was empty, otherwise the work is scheduled already.
cd39414
cd39414
(2) ext4_flush_completed_IO is responsible for handling all pending
cd39414
    end_io from ei->i_completed_io_list
cd39414
    The flushing sequence consists of the following stages:
cd39414
    A) LOCKED: Atomically drain completed_io_list to local_list
cd39414
    B) Perform extents conversion
cd39414
    C) LOCKED: move converted io's to to_free list for final deletion
cd39414
       	     This logic depends on the context we were called from.
cd39414
    D) Final end_io context destruction
cd39414
    NOTE2: i_mutex is no longer required because end_io->flags modification
cd39414
    is protected by ei->i_completed_io_lock
cd39414
cd39414
Full list of changes:
cd39414
- Move all completion end_io related routines to page-io.c in order to improve
cd39414
  logic locality
cd39414
- Move open coded logic from various xx_end_xx routines to ext4_add_complete_io()
cd39414
- Remove EXT4_IO_END_IN_FSYNC
cd39414
- Improve SMP scalability by removing useless i_mutex which does not
cd39414
  protect io->flags anymore.
cd39414
- Reduce lock contention on i_completed_io_lock by optimizing list walk.
cd39414
- Rename ext4_end_io_nolock to ext4_end_io and make it static
cd39414
- Check flush completion status in ext4_ext_punch_hole(), because it is
cd39414
  not a good idea to punch blocks from a corrupted inode.
cd39414
cd39414
Changes since V3 (in response to Jan's comments):
cd39414
  Fall back to active flush_completed_IO() approach in order to prevent
cd39414
  performance issues with nolocked DIO reads.
cd39414
Changes since V2:
cd39414
  Fix use-after-free caused by a race between truncate and end_io_work
cd39414
cd39414
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
cd39414
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
cd39414
(cherry picked from commit 28a535f9a0df060569dcc786e5bc2e1de43d7dc7)
cd39414
---
cd39414
 fs/ext4/ext4.h     |   3 +-
cd39414
 fs/ext4/extents.c  |   4 +-
cd39414
 fs/ext4/fsync.c    |  81 -------------------------
cd39414
 fs/ext4/indirect.c |   6 +-
cd39414
 fs/ext4/inode.c    |  25 +-------
cd39414
 fs/ext4/page-io.c  | 171 +++++++++++++++++++++++++++++++++++------------------
cd39414
 6 files changed, 121 insertions(+), 169 deletions(-)
cd39414
cd39414
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
cd39414
index 28dfd9b..7687d15 100644
cd39414
--- a/fs/ext4/ext4.h
cd39414
+++ b/fs/ext4/ext4.h
cd39414
@@ -186,7 +186,6 @@ struct mpage_da_data {
cd39414
 #define EXT4_IO_END_ERROR	0x0002
cd39414
 #define EXT4_IO_END_QUEUED	0x0004
cd39414
 #define EXT4_IO_END_DIRECT	0x0008
cd39414
-#define EXT4_IO_END_IN_FSYNC	0x0010
cd39414
 
cd39414
 struct ext4_io_page {
cd39414
 	struct page	*p_page;
cd39414
@@ -2408,11 +2407,11 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
cd39414
 
cd39414
 /* page-io.c */
cd39414
 extern int __init ext4_init_pageio(void);
cd39414
+extern void ext4_add_complete_io(ext4_io_end_t *io_end);
cd39414
 extern void ext4_exit_pageio(void);
cd39414
 extern void ext4_ioend_wait(struct inode *);
cd39414
 extern void ext4_free_io_end(ext4_io_end_t *io);
cd39414
 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
cd39414
-extern int ext4_end_io_nolock(ext4_io_end_t *io);
cd39414
 extern void ext4_io_submit(struct ext4_io_submit *io);
cd39414
 extern int ext4_bio_write_page(struct ext4_io_submit *io,
cd39414
 			       struct page *page,
cd39414
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
cd39414
index e04eb4f..1fbf2ff 100644
cd39414
--- a/fs/ext4/extents.c
cd39414
+++ b/fs/ext4/extents.c
cd39414
@@ -4815,7 +4815,9 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
cd39414
 	}
cd39414
 
cd39414
 	/* finish any pending end_io work */
cd39414
-	ext4_flush_completed_IO(inode);
cd39414
+	err = ext4_flush_completed_IO(inode);
cd39414
+	if (err)
cd39414
+		return err;
cd39414
 
cd39414
 	credits = ext4_writepage_trans_blocks(inode);
cd39414
 	handle = ext4_journal_start(inode, credits);
cd39414
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
cd39414
index 2a1dcea..520b058 100644
cd39414
--- a/fs/ext4/fsync.c
cd39414
+++ b/fs/ext4/fsync.c
cd39414
@@ -34,87 +34,6 @@
cd39414
 
cd39414
 #include <trace/events/ext4.h>
cd39414
 
cd39414
-static void dump_completed_IO(struct inode * inode)
cd39414
-{
cd39414
-#ifdef	EXT4FS_DEBUG
cd39414
-	struct list_head *cur, *before, *after;
cd39414
-	ext4_io_end_t *io, *io0, *io1;
cd39414
-	unsigned long flags;
cd39414
-
cd39414
-	if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
cd39414
-		ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
cd39414
-		return;
cd39414
-	}
cd39414
-
cd39414
-	ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
cd39414
-	spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
cd39414
-	list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
cd39414
-		cur = &io->list;
cd39414
-		before = cur->prev;
cd39414
-		io0 = container_of(before, ext4_io_end_t, list);
cd39414
-		after = cur->next;
cd39414
-		io1 = container_of(after, ext4_io_end_t, list);
cd39414
-
cd39414
-		ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
cd39414
-			    io, inode->i_ino, io0, io1);
cd39414
-	}
cd39414
-	spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
cd39414
-#endif
cd39414
-}
cd39414
-
cd39414
-/*
cd39414
- * This function is called from ext4_sync_file().
cd39414
- *
cd39414
- * When IO is completed, the work to convert unwritten extents to
cd39414
- * written is queued on workqueue but may not get immediately
cd39414
- * scheduled. When fsync is called, we need to ensure the
cd39414
- * conversion is complete before fsync returns.
cd39414
- * The inode keeps track of a list of pending/completed IO that
cd39414
- * might needs to do the conversion. This function walks through
cd39414
- * the list and convert the related unwritten extents for completed IO
cd39414
- * to written.
cd39414
- * The function return the number of pending IOs on success.
cd39414
- */
cd39414
-int ext4_flush_completed_IO(struct inode *inode)
cd39414
-{
cd39414
-	ext4_io_end_t *io;
cd39414
-	struct ext4_inode_info *ei = EXT4_I(inode);
cd39414
-	unsigned long flags;
cd39414
-	int ret = 0;
cd39414
-	int ret2 = 0;
cd39414
-
cd39414
-	dump_completed_IO(inode);
cd39414
-	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
cd39414
-	while (!list_empty(&ei->i_completed_io_list)){
cd39414
-		io = list_entry(ei->i_completed_io_list.next,
cd39414
-				ext4_io_end_t, list);
cd39414
-		list_del_init(&io->list);
cd39414
-		io->flag |= EXT4_IO_END_IN_FSYNC;
cd39414
-		/*
cd39414
-		 * Calling ext4_end_io_nolock() to convert completed
cd39414
-		 * IO to written.
cd39414
-		 *
cd39414
-		 * When ext4_sync_file() is called, run_queue() may already
cd39414
-		 * about to flush the work corresponding to this io structure.
cd39414
-		 * It will be upset if it founds the io structure related
cd39414
-		 * to the work-to-be schedule is freed.
cd39414
-		 *
cd39414
-		 * Thus we need to keep the io structure still valid here after
cd39414
-		 * conversion finished. The io structure has a flag to
cd39414
-		 * avoid double converting from both fsync and background work
cd39414
-		 * queue work.
cd39414
-		 */
cd39414
-		spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
cd39414
-		ret = ext4_end_io_nolock(io);
cd39414
-		if (ret < 0)
cd39414
-			ret2 = ret;
cd39414
-		spin_lock_irqsave(&ei->i_completed_io_lock, flags);
cd39414
-		io->flag &= ~EXT4_IO_END_IN_FSYNC;
cd39414
-	}
cd39414
-	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
cd39414
-	return (ret2 < 0) ? ret2 : 0;
cd39414
-}
cd39414
-
cd39414
 /*
cd39414
  * If we're not journaling and this is a just-created file, we have to
cd39414
  * sync our parent directory (if it was freshly created) since
cd39414
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
cd39414
index 830e1b2..61f13e5 100644
cd39414
--- a/fs/ext4/indirect.c
cd39414
+++ b/fs/ext4/indirect.c
cd39414
@@ -807,11 +807,9 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
cd39414
 
cd39414
 retry:
cd39414
 	if (rw == READ && ext4_should_dioread_nolock(inode)) {
cd39414
-		if (unlikely(!list_empty(&ei->i_completed_io_list))) {
cd39414
-			mutex_lock(&inode->i_mutex);
cd39414
+		if (unlikely(!list_empty(&ei->i_completed_io_list)))
cd39414
 			ext4_flush_completed_IO(inode);
cd39414
-			mutex_unlock(&inode->i_mutex);
cd39414
-		}
cd39414
+
cd39414
 		ret = __blockdev_direct_IO(rw, iocb, inode,
cd39414
 				 inode->i_sb->s_bdev, iov,
cd39414
 				 offset, nr_segs,
cd39414
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
cd39414
index acadd2b..dd3fd23 100644
cd39414
--- a/fs/ext4/inode.c
cd39414
+++ b/fs/ext4/inode.c
cd39414
@@ -2879,9 +2879,6 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
cd39414
 {
cd39414
 	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
cd39414
         ext4_io_end_t *io_end = iocb->private;
cd39414
-	struct workqueue_struct *wq;
cd39414
-	unsigned long flags;
cd39414
-	struct ext4_inode_info *ei;
cd39414
 
cd39414
 	/* if not async direct IO or dio with 0 bytes write, just return */
cd39414
 	if (!io_end || !size)
cd39414
@@ -2910,24 +2907,14 @@ out:
cd39414
 		io_end->iocb = iocb;
cd39414
 		io_end->result = ret;
cd39414
 	}
cd39414
-	wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
cd39414
-
cd39414
-	/* Add the io_end to per-inode completed aio dio list*/
cd39414
-	ei = EXT4_I(io_end->inode);
cd39414
-	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
cd39414
-	list_add_tail(&io_end->list, &ei->i_completed_io_list);
cd39414
-	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
cd39414
 
cd39414
-	/* queue the work to convert unwritten extents to written */
cd39414
-	queue_work(wq, &io_end->work);
cd39414
+	ext4_add_complete_io(io_end);
cd39414
 }
cd39414
 
cd39414
 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
cd39414
 {
cd39414
 	ext4_io_end_t *io_end = bh->b_private;
cd39414
-	struct workqueue_struct *wq;
cd39414
 	struct inode *inode;
cd39414
-	unsigned long flags;
cd39414
 
cd39414
 	if (!test_clear_buffer_uninit(bh) || !io_end)
cd39414
 		goto out;
cd39414
@@ -2946,15 +2933,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
cd39414
 	 */
cd39414
 	inode = io_end->inode;
cd39414
 	ext4_set_io_unwritten_flag(inode, io_end);
cd39414
-
cd39414
-	/* Add the io_end to per-inode completed io list*/
cd39414
-	spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
cd39414
-	list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
cd39414
-	spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
cd39414
-
cd39414
-	wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
cd39414
-	/* queue the work to convert unwritten extents to written */
cd39414
-	queue_work(wq, &io_end->work);
cd39414
+	ext4_add_complete_io(io_end);
cd39414
 out:
cd39414
 	bh->b_private = NULL;
cd39414
 	bh->b_end_io = NULL;
cd39414
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
cd39414
index 9970022..5b24c40 100644
cd39414
--- a/fs/ext4/page-io.c
cd39414
+++ b/fs/ext4/page-io.c
cd39414
@@ -71,6 +71,7 @@ void ext4_free_io_end(ext4_io_end_t *io)
cd39414
 	int i;
cd39414
 
cd39414
 	BUG_ON(!io);
cd39414
+	BUG_ON(!list_empty(&io->list));
cd39414
 	BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
cd39414
 
cd39414
 	if (io->page)
cd39414
@@ -83,21 +84,14 @@ void ext4_free_io_end(ext4_io_end_t *io)
cd39414
 	kmem_cache_free(io_end_cachep, io);
cd39414
 }
cd39414
 
cd39414
-/*
cd39414
- * check a range of space and convert unwritten extents to written.
cd39414
- *
cd39414
- * Called with inode->i_mutex; we depend on this when we manipulate
cd39414
- * io->flag, since we could otherwise race with ext4_flush_completed_IO()
cd39414
- */
cd39414
-int ext4_end_io_nolock(ext4_io_end_t *io)
cd39414
+/* check a range of space and convert unwritten extents to written. */
cd39414
+static int ext4_end_io(ext4_io_end_t *io)
cd39414
 {
cd39414
 	struct inode *inode = io->inode;
cd39414
 	loff_t offset = io->offset;
cd39414
 	ssize_t size = io->size;
cd39414
 	int ret = 0;
cd39414
 
cd39414
-	BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
cd39414
-
cd39414
 	ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
cd39414
 		   "list->prev 0x%p\n",
cd39414
 		   io, inode->i_ino, io->list.next, io->list.prev);
cd39414
@@ -110,7 +104,6 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
cd39414
 			 "(inode %lu, offset %llu, size %zd, error %d)",
cd39414
 			 inode->i_ino, offset, size, ret);
cd39414
 	}
cd39414
-	io->flag &= ~EXT4_IO_END_UNWRITTEN;
cd39414
 	if (io->iocb)
cd39414
 		aio_complete(io->iocb, io->result, 0);
cd39414
 
cd39414
@@ -122,51 +115,122 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
cd39414
 	return ret;
cd39414
 }
cd39414
 
cd39414
-/*
cd39414
- * work on completed aio dio IO, to convert unwritten extents to extents
cd39414
- */
cd39414
-static void ext4_end_io_work(struct work_struct *work)
cd39414
+static void dump_completed_IO(struct inode *inode)
cd39414
+{
cd39414
+#ifdef	EXT4FS_DEBUG
cd39414
+	struct list_head *cur, *before, *after;
cd39414
+	ext4_io_end_t *io, *io0, *io1;
cd39414
+	unsigned long flags;
cd39414
+
cd39414
+	if (list_empty(&EXT4_I(inode)->i_completed_io_list)) {
cd39414
+		ext4_debug("inode %lu completed_io list is empty\n",
cd39414
+			   inode->i_ino);
cd39414
+		return;
cd39414
+	}
cd39414
+
cd39414
+	ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino);
cd39414
+	list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) {
cd39414
+		cur = &io->list;
cd39414
+		before = cur->prev;
cd39414
+		io0 = container_of(before, ext4_io_end_t, list);
cd39414
+		after = cur->next;
cd39414
+		io1 = container_of(after, ext4_io_end_t, list);
cd39414
+
cd39414
+		ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
cd39414
+			    io, inode->i_ino, io0, io1);
cd39414
+	}
cd39414
+#endif
cd39414
+}
cd39414
+
cd39414
+/* Add the io_end to per-inode completed end_io list. */
cd39414
+void ext4_add_complete_io(ext4_io_end_t *io_end)
cd39414
 {
cd39414
-	ext4_io_end_t		*io = container_of(work, ext4_io_end_t, work);
cd39414
-	struct inode		*inode = io->inode;
cd39414
-	struct ext4_inode_info	*ei = EXT4_I(inode);
cd39414
-	unsigned long		flags;
cd39414
+	struct ext4_inode_info *ei = EXT4_I(io_end->inode);
cd39414
+	struct workqueue_struct *wq;
cd39414
+	unsigned long flags;
cd39414
+
cd39414
+	BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
cd39414
+	wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
cd39414
 
cd39414
 	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
cd39414
-	if (io->flag & EXT4_IO_END_IN_FSYNC)
cd39414
-		goto requeue;
cd39414
-	if (list_empty(&io->list)) {
cd39414
-		spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
cd39414
-		goto free;
cd39414
+	if (list_empty(&ei->i_completed_io_list)) {
cd39414
+		io_end->flag |= EXT4_IO_END_QUEUED;
cd39414
+		queue_work(wq, &io_end->work);
cd39414
 	}
cd39414
+	list_add_tail(&io_end->list, &ei->i_completed_io_list);
cd39414
+	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
cd39414
+}
cd39414
 
cd39414
-	if (!mutex_trylock(&inode->i_mutex)) {
cd39414
-		bool was_queued;
cd39414
-requeue:
cd39414
-		was_queued = !!(io->flag & EXT4_IO_END_QUEUED);
cd39414
-		io->flag |= EXT4_IO_END_QUEUED;
cd39414
-		spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
cd39414
-		/*
cd39414
-		 * Requeue the work instead of waiting so that the work
cd39414
-		 * items queued after this can be processed.
cd39414
-		 */
cd39414
-		queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work);
cd39414
-		/*
cd39414
-		 * To prevent the ext4-dio-unwritten thread from keeping
cd39414
-		 * requeueing end_io requests and occupying cpu for too long,
cd39414
-		 * yield the cpu if it sees an end_io request that has already
cd39414
-		 * been requeued.
cd39414
-		 */
cd39414
-		if (was_queued)
cd39414
-			yield();
cd39414
-		return;
cd39414
+static int ext4_do_flush_completed_IO(struct inode *inode,
cd39414
+				      ext4_io_end_t *work_io)
cd39414
+{
cd39414
+	ext4_io_end_t *io;
cd39414
+	struct list_head unwritten, complete, to_free;
cd39414
+	unsigned long flags;
cd39414
+	struct ext4_inode_info *ei = EXT4_I(inode);
cd39414
+	int err, ret = 0;
cd39414
+
cd39414
+	INIT_LIST_HEAD(&complete);
cd39414
+	INIT_LIST_HEAD(&to_free);
cd39414
+
cd39414
+	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
cd39414
+	dump_completed_IO(inode);
cd39414
+	list_replace_init(&ei->i_completed_io_list, &unwritten);
cd39414
+	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
cd39414
+
cd39414
+	while (!list_empty(&unwritten)) {
cd39414
+		io = list_entry(unwritten.next, ext4_io_end_t, list);
cd39414
+		BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
cd39414
+		list_del_init(&io->list);
cd39414
+
cd39414
+		err = ext4_end_io(io);
cd39414
+		if (unlikely(!ret && err))
cd39414
+			ret = err;
cd39414
+
cd39414
+		list_add_tail(&io->list, &complete);
cd39414
+	}
cd39414
+	/* It is important to update all flags for all end_io in one shot w/o
cd39414
+	 * dropping the lock.*/
cd39414
+	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
cd39414
+	while (!list_empty(&complete)) {
cd39414
+		io = list_entry(complete.next, ext4_io_end_t, list);
cd39414
+		io->flag &= ~EXT4_IO_END_UNWRITTEN;
cd39414
+		/* end_io context can not be destroyed now because it still
cd39414
+		 * used by queued worker. Worker thread will destroy it later */
cd39414
+		if (io->flag & EXT4_IO_END_QUEUED)
cd39414
+			list_del_init(&io->list);
cd39414
+		else
cd39414
+			list_move(&io->list, &to_free);
cd39414
+	}
cd39414
+	/* If we are called from worker context, it is time to clear queued
cd39414
+	 * flag, and destroy it's end_io if it was converted already */
cd39414
+	if (work_io) {
cd39414
+		work_io->flag &= ~EXT4_IO_END_QUEUED;
cd39414
+		if (!(work_io->flag & EXT4_IO_END_UNWRITTEN))
cd39414
+			list_add_tail(&work_io->list, &to_free);
cd39414
 	}
cd39414
-	list_del_init(&io->list);
cd39414
 	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
cd39414
-	(void) ext4_end_io_nolock(io);
cd39414
-	mutex_unlock(&inode->i_mutex);
cd39414
-free:
cd39414
-	ext4_free_io_end(io);
cd39414
+
cd39414
+	while (!list_empty(&to_free)) {
cd39414
+		io = list_entry(to_free.next, ext4_io_end_t, list);
cd39414
+		list_del_init(&io->list);
cd39414
+		ext4_free_io_end(io);
cd39414
+	}
cd39414
+	return ret;
cd39414
+}
cd39414
+
cd39414
+/*
cd39414
+ * work on completed aio dio IO, to convert unwritten extents to extents
cd39414
+ */
cd39414
+static void ext4_end_io_work(struct work_struct *work)
cd39414
+{
cd39414
+	ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
cd39414
+	ext4_do_flush_completed_IO(io->inode, io);
cd39414
+}
cd39414
+
cd39414
+int ext4_flush_completed_IO(struct inode *inode)
cd39414
+{
cd39414
+	return ext4_do_flush_completed_IO(inode, NULL);
cd39414
 }
cd39414
 
cd39414
 ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
cd39414
@@ -199,9 +263,7 @@ static void buffer_io_error(struct buffer_head *bh)
cd39414
 static void ext4_end_bio(struct bio *bio, int error)
cd39414
 {
cd39414
 	ext4_io_end_t *io_end = bio->bi_private;
cd39414
-	struct workqueue_struct *wq;
cd39414
 	struct inode *inode;
cd39414
-	unsigned long flags;
cd39414
 	int i;
cd39414
 	sector_t bi_sector = bio->bi_sector;
cd39414
 
cd39414
@@ -259,14 +321,7 @@ static void ext4_end_bio(struct bio *bio, int error)
cd39414
 		return;
cd39414
 	}
cd39414
 
cd39414
-	/* Add the io_end to per-inode completed io list*/
cd39414
-	spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
cd39414
-	list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
cd39414
-	spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
cd39414
-
cd39414
-	wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
cd39414
-	/* queue the work to convert unwritten extents to written */
cd39414
-	queue_work(wq, &io_end->work);
cd39414
+	ext4_add_complete_io(io_end);
cd39414
 }
cd39414
 
cd39414
 void ext4_io_submit(struct ext4_io_submit *io)
cd39414
-- 
cd39414
1.7.12.rc0.22.gcdd159b
cd39414