diff -rupN xen-unstable.hg-3.0.5-pre-14797.orig/tools/blktap/drivers/block-aio.c xen-unstable.hg-3.0.5-pre-14797.new/tools/blktap/drivers/block-aio.c
--- xen-unstable.hg-3.0.5-pre-14797.orig/tools/blktap/drivers/block-aio.c	2007-04-11 19:10:30.000000000 -0400
+++ xen-unstable.hg-3.0.5-pre-14797.new/tools/blktap/drivers/block-aio.c	2007-04-13 11:17:08.000000000 -0400
@@ -44,14 +44,6 @@
 
 #include <linux/fs.h>
 #include "tapdisk.h"
-
-/**
- * We used a kernel patch to return an fd associated with the AIO context
- * so that we can concurrently poll on synchronous and async descriptors.
- * This is signalled by passing 1 as the io context to io_setup.
- */
-#define REQUEST_ASYNC_FD 1
-
 #define MAX_AIO_REQS (MAX_REQUESTS * MAX_SEGMENTS_PER_REQ)
 
 struct pending_aio {
@@ -72,12 +64,18 @@ struct tdaio_state {
 	int iocb_free_count;
 	struct iocb *iocb_queue[MAX_AIO_REQS];
 	int iocb_queued;
-	int poll_fd; /* NB: we require aio_poll support */
 	struct io_event aio_events[MAX_AIO_REQS];
+
+	pthread_t aio_thread;
+	/* pipe fds for communication with the aio completion thread */
+	int command_fd[2];
+	int completion_fd[2];
 };
 
 #define IOCB_IDX(_s, _io) ((_io) - (_s)->iocb_list)
 
+static void *tdaio_completion_thread(void *);
+
 /*Get Image size, secsize*/
 static int get_image_info(struct td_state *s, int fd)
 {
@@ -109,7 +107,6 @@ static int get_image_info(struct td_stat
 	/*Get the sector size*/
 #if defined(BLKSSZGET)
 	{
-		int arg;
 		s->sector_size = DEFAULT_SECTOR_SIZE;
 		ioctl(fd, BLKSSZGET, &s->sector_size);
 
@@ -148,7 +145,7 @@ static inline void init_fds(struct disk_
 	for(i = 0; i < MAX_IOFD; i++)
 		dd->io_fd[i] = 0;
 
-	dd->io_fd[0] = prv->poll_fd;
+	dd->io_fd[0] = prv->completion_fd[0];
 }
 
 /* Open the disk file and initialize aio state. */
@@ -163,11 +160,10 @@ int tdaio_open (struct disk_driver *dd, 
 	prv->iocb_free_count = MAX_AIO_REQS;
 	prv->iocb_queued     = 0;
 
-	prv->aio_ctx = (io_context_t) REQUEST_ASYNC_FD;
-	prv->poll_fd = io_setup(MAX_AIO_REQS, &prv->aio_ctx);
+	prv->aio_ctx = (io_context_t) 0;
+	ret = io_setup(MAX_AIO_REQS, &prv->aio_ctx);
 
-	if (prv->poll_fd < 0) {
-		ret = prv->poll_fd;
+	if (ret < 0) {
 		if (ret == -EAGAIN) {
 			DPRINTF("Couldn't setup AIO context. If you are "
 				"trying to concurrently use a large number "
@@ -176,9 +172,7 @@ int tdaio_open (struct disk_driver *dd, 
 				"(e.g. 'echo echo 1048576 > /proc/sys/fs/"
 				"aio-max-nr')\n");
 		} else {
-			DPRINTF("Couldn't get fd for AIO poll support. This "
-				"is probably because your kernel does not "
-				"have the aio-poll patch applied.\n");
+			DPRINTF("Couldn't setup AIO context.\n");
 		}
 		goto done;
 	}
@@ -209,6 +203,14 @@ int tdaio_open (struct disk_driver *dd, 
 
 	prv->fd = fd;
 
+	pipe(prv->command_fd);
+	pipe(prv->completion_fd);
+
+	ret = pthread_create(&prv->aio_thread, NULL,
+			     tdaio_completion_thread, prv);
+	ret = 0;
+	write(prv->command_fd[1], &ret, sizeof(ret));
+
 	init_fds(dd);
 
 	ret = get_image_info(s, fd);
@@ -306,17 +308,62 @@ int tdaio_close(struct disk_driver *dd)
 	return 0;
 }
 
+/*
+ * We don't have any way to do epoll on aio events in a normal kernel, so
+ * wait for aio events in a separate thread and return completion status
+ * via a pipe that can be waited on normally.
+ *
+ * To keep locking problems between the completion thread and the submit
+ * thread to a minimum, there's a handshake which allows only one thread
+ * to be doing work on the completion queue at a time:
+ *
+ * 1) main thread sends completion thread a command via the command pipe;
+ * 2) completion thread waits for aio events and returns the number
+ *    received on the completion pipe;
+ * 3) main thread processes the received prv->aio_events events;
+ * 4) loop back to 1) to let the completion thread refill the aio_events
+ *    buffer.
+ *
+ * This workaround needs to disappear once the kernel provides a single
+ * mechanism for waiting on both aio and normal fd wakeups.
+ */
+static void *tdaio_completion_thread(void *arg)
+{
+	struct tdaio_state *prv = (struct tdaio_state *) arg;
+	int command;
+	int nr_events;
+	int rc;
+
+	while (1) {
+		rc = read(prv->command_fd[0], &command, sizeof(command));
+
+		do {
+			/* Non-blocking test for completed io. */
+			rc = io_getevents(prv->aio_ctx, 1,
+					  MAX_AIO_REQS, prv->aio_events,
+					  NULL);
+			if (rc) {
+				nr_events = rc;
+				rc = write(prv->completion_fd[1], &nr_events,
+					   sizeof(nr_events));
+			}
+		} while (!rc);
+	}
+}
+
+
 int tdaio_do_callbacks(struct disk_driver *dd, int sid)
 {
-	int ret, i, rsp = 0;
+	int ret, i, nr_events, rsp = 0;
 	struct io_event *ep;
 	struct tdaio_state *prv = (struct tdaio_state *)dd->private;
 
 	/* Non-blocking test for completed io. */
-	ret = io_getevents(prv->aio_ctx, 0, MAX_AIO_REQS, prv->aio_events,
-			   NULL);
-
-	for (ep=prv->aio_events,i=ret; i-->0; ep++) {
+	ret = read(prv->completion_fd[0], &nr_events, sizeof(nr_events));
+
+repeat:
+	for (ep=prv->aio_events,i=nr_events; i-->0; ep++) {
 		struct iocb *io = ep->obj;
 		struct pending_aio *pio;
 
@@ -327,6 +374,16 @@ int tdaio_do_callbacks(struct disk_drive
 
 		prv->iocb_free[prv->iocb_free_count++] = io;
 	}
+
+	if (nr_events) {
+		nr_events = io_getevents(prv->aio_ctx, 0,
+					 MAX_AIO_REQS, prv->aio_events,
+					 NULL);
+		goto repeat;
+	}
+
+	write(prv->command_fd[1], &nr_events, sizeof(nr_events));
+
 	return rsp;
 }
 
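
For anyone who wants to try the handshake outside of tapdisk, here is a minimal standalone sketch of the same pattern. It assumes Linux with libaio and pthreads (build with 'gcc aio_pipe_demo.c -laio -lpthread'); the file name, aio_pipe_state, completion_thread, NR_EVENTS, and the demo read of /etc/passwd are illustrative assumptions and appear nowhere in the patch itself.

/* aio_pipe_demo.c -- standalone sketch of the pipe handshake (illustrative). */
#include <fcntl.h>
#include <libaio.h>
#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <sys/select.h>
#include <unistd.h>

#define NR_EVENTS 64

struct aio_pipe_state {
	io_context_t ctx;
	struct io_event events[NR_EVENTS];
	int command_fd[2];	/* main thread -> completion thread */
	int completion_fd[2];	/* completion thread -> main thread */
};

/* Wait for a command, block in io_getevents(), report the event count.
 * The handshake guarantees only one thread touches events[] at a time. */
static void *completion_thread(void *arg)
{
	struct aio_pipe_state *s = arg;
	int cmd, nr;

	while (read(s->command_fd[0], &cmd, sizeof(cmd)) == sizeof(cmd)) {
		nr = io_getevents(s->ctx, 1, NR_EVENTS, s->events, NULL);
		if (nr > 0)
			write(s->completion_fd[1], &nr, sizeof(nr));
	}
	return NULL;
}

int main(void)
{
	struct aio_pipe_state s;
	struct iocb cb, *cbs[1] = { &cb };
	pthread_t tid;
	char buf[512];
	fd_set rfds;
	int fd, nr, kick = 0;

	memset(&s, 0, sizeof(s));
	if (io_setup(NR_EVENTS, &s.ctx) < 0 ||
	    pipe(s.command_fd) < 0 || pipe(s.completion_fd) < 0) {
		perror("setup");
		return 1;
	}
	pthread_create(&tid, NULL, completion_thread, &s);
	write(s.command_fd[1], &kick, sizeof(kick));	/* arm the thread */

	/* Submit one async read so there is something to complete. */
	fd = open("/etc/passwd", O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	io_prep_pread(&cb, fd, buf, sizeof(buf), 0);
	io_submit(s.ctx, 1, cbs);

	/* completion_fd[0] is an ordinary fd, so a plain select() works --
	 * the property the aio-poll kernel patch used to provide. */
	FD_ZERO(&rfds);
	FD_SET(s.completion_fd[0], &rfds);
	select(s.completion_fd[0] + 1, &rfds, NULL, NULL, NULL);

	read(s.completion_fd[0], &nr, sizeof(nr));
	printf("%d aio event(s), first res = %ld\n", nr, (long) s.events[0].res);

	write(s.command_fd[1], &kick, sizeof(kick));	/* re-arm, as in step 4 */
	return 0;
}

Note the min_nr asymmetry the patch relies on as well: the completion thread blocks in io_getevents() with min_nr = 1, while tdaio_do_callbacks() refills with min_nr = 0, so the main thread never blocks once the pipe has signalled it.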