diff -rupN xen-unstable.hg-3.0.5-pre-14797.orig/tools/blktap/drivers/block-aio.c xen-unstable.hg-3.0.5-pre-14797.new/tools/blktap/drivers/block-aio.c
--- xen-unstable.hg-3.0.5-pre-14797.orig/tools/blktap/drivers/block-aio.c	2007-04-11 19:10:30.000000000 -0400
+++ xen-unstable.hg-3.0.5-pre-14797.new/tools/blktap/drivers/block-aio.c	2007-04-13 11:17:08.000000000 -0400
@@ -44,14 +44,6 @@
 #include <linux/fs.h>
 #include "tapdisk.h"
 
-
-/**
- * We used a kernel patch to return an fd associated with the AIO context
- * so that we can concurrently poll on synchronous and async descriptors.
- * This is signalled by passing 1 as the io context to io_setup.
- */
-#define REQUEST_ASYNC_FD 1
-
 #define MAX_AIO_REQS (MAX_REQUESTS * MAX_SEGMENTS_PER_REQ)
 
 struct pending_aio {
@@ -72,12 +64,19 @@ struct tdaio_state {
 	int                iocb_free_count;
 	struct iocb       *iocb_queue[MAX_AIO_REQS];
 	int                iocb_queued;
-	int                poll_fd; /* NB: we require aio_poll support */
 	struct io_event    aio_events[MAX_AIO_REQS];
+
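+	/* thread handling aio completions; see tdaio_completion_thread() */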
+	pthread_t          aio_thread;
+	/* pipe fds for communication with the aio completion thread */
+	int                command_fd[2];
+	int                completion_fd[2];
 };
 
 #define IOCB_IDX(_s, _io) ((_io) - (_s)->iocb_list)
 
+static void *tdaio_completion_thread(void *);
+
 /*Get Image size, secsize*/
 static int get_image_info(struct td_state *s, int fd)
 {
@@ -109,7 +108,6 @@ static int get_image_info(struct td_stat
 		/*Get the sector size*/
 #if defined(BLKSSZGET)
 		{
-			int arg;
 			s->sector_size = DEFAULT_SECTOR_SIZE;
 			ioctl(fd, BLKSSZGET, &s->sector_size);
 			
@@ -148,7 +146,7 @@ static inline void init_fds(struct disk_
 	for(i = 0; i < MAX_IOFD; i++) 
 		dd->io_fd[i] = 0;
 
-	dd->io_fd[0] = prv->poll_fd;
+	dd->io_fd[0] = prv->completion_fd[0];
 }
 
 /* Open the disk file and initialize aio state. */
@@ -163,11 +161,11 @@ int tdaio_open (struct disk_driver *dd, 
 	prv->iocb_free_count = MAX_AIO_REQS;
 	prv->iocb_queued     = 0;
 	
-	prv->aio_ctx = (io_context_t) REQUEST_ASYNC_FD;
-	prv->poll_fd = io_setup(MAX_AIO_REQS, &prv->aio_ctx);
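+	/* io_setup() requires the io context to be initialised to zero */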
+	prv->aio_ctx = (io_context_t) 0;
+	ret = io_setup(MAX_AIO_REQS, &prv->aio_ctx);
 
-	if (prv->poll_fd < 0) {
-		ret = prv->poll_fd;
+	if (ret < 0) {
                 if (ret == -EAGAIN) {
                         DPRINTF("Couldn't setup AIO context.  If you are "
                                 "trying to concurrently use a large number "
@@ -176,9 +174,7 @@ int tdaio_open (struct disk_driver *dd, 
                                 "(e.g. 'echo echo 1048576 > /proc/sys/fs/"
                                 "aio-max-nr')\n");
                 } else {
-                        DPRINTF("Couldn't get fd for AIO poll support.  This "
-                                "is probably because your kernel does not "
-                                "have the aio-poll patch applied.\n");
+                        DPRINTF("Couldn't setup AIO context.\n");
                 }
 		goto done;
 	}
@@ -209,6 +205,18 @@ int tdaio_open (struct disk_driver *dd, 
 
         prv->fd = fd;
 
+	/* Create the pipes used to hand work between the main thread
+	 * and the aio completion thread; see tdaio_completion_thread(). */
+	pipe(prv->command_fd);
+	pipe(prv->completion_fd);
+
+	ret = pthread_create(&prv->aio_thread, NULL,
+			     tdaio_completion_thread, prv);
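+	/* Prime the handshake: send a first command so that the
+	 * completion thread starts waiting for aio events. */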
+	ret = 0;
+	write(prv->command_fd[1], &ret, sizeof(ret));
+
 	init_fds(dd);
 	ret = get_image_info(s, fd);
 
@@ -306,17 +314,62 @@ int tdaio_close(struct disk_driver *dd)
 	return 0;
 }
 
+/*
+ * We don't have any way to do epoll on aio events in a normal kernel, so
+ * wait for aio events in a separate thread and return the completion
+ * status via a pipe that can be waited on normally.
+ *
+ * To keep locking problems between the completion thread and the submit
+ * thread to a minimum, there's a handshake which allows only one thread
+ * to be doing work on the completion queue at a time:
+ *
+ * 1) main thread sends completion thread a command via the command pipe;
+ * 2) completion thread waits for aio events and returns the number
+ *    received on the completion pipe;
+ * 3) main thread processes the received prv->aio_events events;
+ * 4) loop back to 1) to let the completion thread refill the aio_events
+ *    buffer.
+ *
+ * This workaround needs to disappear once the kernel provides a single
+ * mechanism for waiting on both aio and normal fd wakeups.
+ */
+
+static void *tdaio_completion_thread(void *arg)
+{
+	struct tdaio_state *prv = (struct tdaio_state *) arg;
+	int command;
+	int nr_events;
+	int rc;
+
+	while (1) {
+		rc = read(prv->command_fd[0], &command, sizeof(command));
+
+		do {
+			/* Blocking wait for at least one completed io. */
+			rc = io_getevents(prv->aio_ctx, 1,
+					  MAX_AIO_REQS, prv->aio_events,
+					  NULL);
+			if (rc) {
+				nr_events = rc;
+				rc = write(prv->completion_fd[1], &nr_events,
+					   sizeof(nr_events));
+			}
+		} while (!rc);
+	}
+}
+
+
 int tdaio_do_callbacks(struct disk_driver *dd, int sid)
 {
-	int ret, i, rsp = 0;
+	int ret, i, nr_events, rsp = 0;
 	struct io_event *ep;
 	struct tdaio_state *prv = (struct tdaio_state *)dd->private;
 
-	/* Non-blocking test for completed io. */
-	ret = io_getevents(prv->aio_ctx, 0, MAX_AIO_REQS, prv->aio_events,
-			   NULL);
-			
-	for (ep=prv->aio_events,i=ret; i-->0; ep++) {
+	/* Get the number of completed events from the completion thread. */
+	ret = read(prv->completion_fd[0], &nr_events, sizeof(nr_events));
+
+repeat:
+	for (ep=prv->aio_events,i=nr_events; i-->0; ep++) {
 		struct iocb        *io  = ep->obj;
 		struct pending_aio *pio;
 		
@@ -327,6 +380,20 @@ int tdaio_do_callbacks(struct disk_drive
 
 		prv->iocb_free[prv->iocb_free_count++] = io;
 	}
+
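+	/* Pick up any events that completed while this batch was being
+	 * processed and loop round to handle those as well. */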
+	if (nr_events) {
+		nr_events = io_getevents(prv->aio_ctx, 0,
+					 MAX_AIO_REQS, prv->aio_events,
+					 NULL);
+		goto repeat;
+	}
+
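+	/* Done with prv->aio_events: send the next command so the
+	 * completion thread resumes waiting for aio events. */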
+	write(prv->command_fd[1], &nr_events, sizeof(nr_events));
+
 	return rsp;
 }
 