diff -urN v2.4.2-ac28/Makefile aio-v2.4.2-ac28/Makefile --- v2.4.2-ac28/Makefile Fri Mar 30 18:49:14 2001 +++ aio-v2.4.2-ac28/Makefile Fri Mar 30 18:54:51 2001 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 4 SUBLEVEL = 2 -EXTRAVERSION = -ac28 +EXTRAVERSION = -ac28-bcrl1 KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION) diff -urN v2.4.2-ac28/arch/i386/Makefile aio-v2.4.2-ac28/arch/i386/Makefile --- v2.4.2-ac28/arch/i386/Makefile Fri Mar 30 18:49:16 2001 +++ aio-v2.4.2-ac28/arch/i386/Makefile Fri Mar 30 18:53:41 2001 @@ -21,7 +21,7 @@ LDFLAGS=-e stext LINKFLAGS =-T $(TOPDIR)/arch/i386/vmlinux.lds $(LDFLAGS) -CFLAGS += -pipe +#CFLAGS += -pipe # prevent gcc from keeping the stack 16 byte aligned CFLAGS += $(shell if $(CC) -mpreferred-stack-boundary=2 -S -o /dev/null -xc /dev/null >/dev/null 2>&1; then echo "-mpreferred-stack-boundary=2"; fi) diff -urN v2.4.2-ac28/arch/i386/kernel/entry.S aio-v2.4.2-ac28/arch/i386/kernel/entry.S --- v2.4.2-ac28/arch/i386/kernel/entry.S Wed Nov 8 20:09:50 2000 +++ aio-v2.4.2-ac28/arch/i386/kernel/entry.S Fri Mar 30 18:53:41 2001 @@ -646,6 +646,11 @@ .long SYMBOL_NAME(sys_getdents64) /* 220 */ .long SYMBOL_NAME(sys_fcntl64) .long SYMBOL_NAME(sys_ni_syscall) /* reserved for TUX */ + .long SYMBOL_NAME(sys_ni_syscall) /* 223 */ + .long SYMBOL_NAME(sys___io_cancel) + .long SYMBOL_NAME(sys___io_wait) + .long SYMBOL_NAME(sys___io_getevents) + .long SYMBOL_NAME(sys_submit_ios) /* * NOTE!! This doesn't have to be exact - we just have diff -urN v2.4.2-ac28/drivers/char/mem.c aio-v2.4.2-ac28/drivers/char/mem.c --- v2.4.2-ac28/drivers/char/mem.c Fri Mar 30 18:49:30 2001 +++ aio-v2.4.2-ac28/drivers/char/mem.c Fri Mar 30 18:53:41 2001 @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -568,6 +569,9 @@ case 9: filp->f_op = &urandom_fops; break; + case 10: + filp->f_op = &aio_fops; + break; default: return -ENXIO; } @@ -592,7 +596,8 @@ {5, "zero", S_IRUGO | S_IWUGO, &zero_fops}, {7, "full", S_IRUGO | S_IWUGO, &full_fops}, {8, "random", S_IRUGO | S_IWUSR, &random_fops}, - {9, "urandom", S_IRUGO | S_IWUSR, &urandom_fops} + {9, "urandom", S_IRUGO | S_IWUSR, &urandom_fops}, + {10,"aio", S_IRUGO | S_IWUSR, &aio_fops}, }; int i; @@ -614,6 +619,7 @@ memory_devfs_register(); rand_initialize(); raw_init(); + aio_setup(); #ifdef CONFIG_I2C i2c_init_all(); #endif diff -urN v2.4.2-ac28/drivers/char/raw.c aio-v2.4.2-ac28/drivers/char/raw.c --- v2.4.2-ac28/drivers/char/raw.c Fri Mar 30 18:49:31 2001 +++ aio-v2.4.2-ac28/drivers/char/raw.c Fri Mar 30 18:59:58 2001 @@ -16,6 +16,8 @@ #include #include #include +#include +#include #define dprintk(x...) @@ -31,13 +33,14 @@ int raw_open(struct inode *, struct file *); int raw_release(struct inode *, struct file *); int raw_ctl_ioctl(struct inode *, struct file *, unsigned int, unsigned long); - +int raw_rw_kiovec(struct file *filp, int rw, int nr, struct kiobuf **kiovec, int flags, size_t size, loff_t pos); static struct file_operations raw_fops = { read: raw_read, write: raw_write, open: raw_open, release: raw_release, + rw_kiovec: raw_rw_kiovec, }; static struct file_operations raw_ctl_fops = { @@ -102,7 +105,8 @@ * the blocksize on a device which is already mounted. 
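The entry.S hunk above reserves five new syscall slots (223 stays unused; 224 through 227 become __io_cancel, __io_wait, __io_getevents and submit_ios), and the mem.c hunk exposes the AIO file operations as minor 10 of the memory character device, i.e. a "mknod /dev/aio c 1 10" node. As a rough, hypothetical illustration (not part of the patch), userspace could reach the new entry point directly with syscall(2) until a libc wrapper exists; the number used matches the __NR_submit_ios definition the patch adds to include/asm-i386/unistd.h further down:

#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_submit_ios
#define __NR_submit_ios 227	/* i386 number added by this patch */
#endif

struct iocb;			/* layout defined in include/linux/aio.h below */

long submit_ios(int aio_fd, int nr, struct iocb **iocbs)
{
	/* aio_fd is the descriptor obtained by opening /dev/aio (char 1,10) */
	return syscall(__NR_submit_ios, aio_fd, nr, iocbs);
}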
*/ - sector_size = 512; + //sector_size = 512; + sector_size = 2048; if (get_super(rdev) != NULL) { if (blksize_size[MAJOR(rdev)]) sector_size = blksize_size[MAJOR(rdev)][MINOR(rdev)]; @@ -224,7 +228,6 @@ } - ssize_t raw_read(struct file *filp, char * buf, size_t size, loff_t *offp) { @@ -318,7 +321,7 @@ for (i=0; i < blocks; i++) b[i] = blocknr++; - err = brw_kiovec(rw, 1, &iobuf, dev, b, sector_size); + err = brw_kiovec(rw, 1, &iobuf, dev, blocks, b, sector_size); if (rw == READ && err > 0) mark_dirty_kiobuf(iobuf, err); @@ -343,3 +346,92 @@ return err; } + +int raw_rw_kiovec(struct file *filp, int rw, int nr, struct kiobuf **kiovec, int flags, size_t size, loff_t pos) +{ + int err; + unsigned long blocknr, blocks; + unsigned long __b[KIO_MAX_SECTORS]; + unsigned long *b = __b; + int i; + int minor; + kdev_t dev; + unsigned long limit; + + int sector_size, sector_bits, sector_mask; + int max_sectors; + +#if 0 /* FIXME: this is wrong. */ + err = 0; + if (!size) + goto out_complete; +#endif + + pr_debug("raw_rw_kiovec: %p %d %d %p %d %d %Lu\n", filp, rw, nr, kiovec, flags, size, pos); + /* + * First, a few checks on device size limits + */ + + minor = MINOR(filp->f_dentry->d_inode->i_rdev); + dev = to_kdev_t(raw_device_bindings[minor]->bd_dev); + sector_size = raw_device_sector_size[minor]; + sector_bits = raw_device_sector_bits[minor]; + sector_mask = sector_size- 1; + max_sectors = 25000; //KIO_MAX_SECTORS >> (sector_bits - 9); + + if (blk_size[MAJOR(dev)]) + limit = (((loff_t) blk_size[MAJOR(dev)][MINOR(dev)]) << BLOCK_SIZE_BITS) >> sector_bits; + else + limit = INT_MAX; + dprintk ("rw_raw_dev_async: dev %d:%d (+%d)\n", + MAJOR(dev), MINOR(dev), limit); + + err = -EINVAL; + if ((pos < 0) || (pos & sector_mask) || (size & sector_mask)) { + printk("pos/size wrong\n"); + goto out; + } + + err = -ENXIO; + if ((pos >> sector_bits) >= limit) { + printk("raw: %Lu > %lu, %d\n", pos >> sector_bits, limit, sector_bits); + goto out; + } + + /* + * Split the IO into KIO_MAX_SECTORS chunks, mapping and + * unmapping the single kiobuf as we go to perform each chunk of + * IO. + */ + + blocknr = pos >> sector_bits; + blocks = size >> sector_bits; + if (blocks > max_sectors) + blocks = max_sectors; + if (blocks > limit - blocknr) + blocks = limit - blocknr; + err = -ENXIO; + pr_debug("raw: !blocks %d %ld %ld\n", max_sectors, limit, blocknr); + if (!blocks) + goto out; + + if (blocks > KIO_MAX_SECTORS) { + err = -ENOMEM; + b = kmalloc(sizeof(*b) * blocks, GFP_KERNEL); + if (!b) + goto out; + } + + for (i=0; i < blocks; i++) + b[i] = blocknr++; + + err = brw_kiovec_async(rw, nr, kiovec, dev, blocks, b, sector_size); + pr_debug("brw_kiovec_async: %d\n", err); + + if (b != __b) + kfree(b); +out: + pr_debug("brw_kiovec_async: ret is %d\n", err); + return err; +} + diff -urN v2.4.2-ac28/fs/Makefile aio-v2.4.2-ac28/fs/Makefile --- v2.4.2-ac28/fs/Makefile Fri Mar 30 18:50:15 2001 +++ aio-v2.4.2-ac28/fs/Makefile Sun Apr 1 21:16:44 2001 @@ -12,7 +12,7 @@ obj-y := open.o read_write.o devices.o file_table.o buffer.o \ super.o block_dev.o stat.o exec.o pipe.o namei.o fcntl.o \ - ioctl.o readdir.o select.o fifo.o locks.o \ + ioctl.o readdir.o select.o fifo.o locks.o aio.o \ dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \ filesystems.o diff -urN v2.4.2-ac28/fs/aio.c aio-v2.4.2-ac28/fs/aio.c --- v2.4.2-ac28/fs/aio.c Wed Dec 31 19:00:00 1969 +++ aio-v2.4.2-ac28/fs/aio.c Fri Mar 30 18:53:41 2001 @@ -0,0 +1,913 @@ +/* drivers/char/aio.c + * Copyright 2000 Red Hat, Inc. All Rights Reserved. 
+ * + * An async IO implementation for Linux + * Written by Benjamin LaHaise + * + * Implements /dev/aio, something on top of which it should be possible + * to write a POSIX AIO library. + * + * Notes on interface: + * - aiocbs are submitted by doing a submit_ios syscall + * on an array of aiocbs to the /dev/aio fd + * - on completion, the aiocb, events are placed in + * a ringbuffer + * - the contents of the ring buffer can be read via the + * __io_getevents syscall. + * - each open(/dev/aio) instance provides a unique aio + * control space + */ +//#define DEBUG 1 + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#undef KERN_DEBUG +#define KERN_DEBUG "" + +static spinlock_t aio_read_lock = SPIN_LOCK_UNLOCKED; +static spinlock_t aio_req_lock = SPIN_LOCK_UNLOCKED; + +static kmem_cache_t *kiocb_cachep; +static kmem_cache_t *kiogrp_cachep; +static kmem_cache_t *kioctx_cachep; + +/* aio_setup + * Creates the slab caches used by the aio routines, panic on + * failure as this is done early during the boot sequence. + */ +void __init aio_setup(void) +{ + kiocb_cachep = kmem_cache_create("kiocb", sizeof(struct kiocb), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!kiocb_cachep) + panic("unable to create kiocb cache"); + + kiogrp_cachep = kmem_cache_create("kiogrp", sizeof(struct kiogrp), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!kiogrp_cachep) + panic("unable to create kiogrp cache"); + + kioctx_cachep = kmem_cache_create("kioctx", sizeof(struct kioctx), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!kioctx_cachep) + panic("unable to create kioctx cache"); + + printk(KERN_NOTICE "aio_setup: okay!\n"); + printk(KERN_NOTICE "aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); +} + +/* ioctx_alloc + * Allocates and initializes an aioctx. Returns an ERR_PTR if it failed. 
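The interface notes above describe the whole round trip: submit an array of iocbs against the fd obtained by opening /dev/aio, then reap io_events from that open instance's ring. A minimal, hypothetical userspace sketch of that flow, assuming the struct iocb and struct io_event layouts from include/linux/aio.h in this patch are visible to userspace and reusing the submit_ios() wrapper sketched earlier; error handling is omitted:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <linux/aio.h>		/* struct iocb, struct io_event, IOCB_CMD_* */

extern long submit_ios(int aio_fd, int nr, struct iocb **iocbs);

int aio_read_example(const char *path)
{
	static char buf[4096];
	struct iocb cb, *list[1] = { &cb };
	struct io_event ev;
	int aio_fd, data_fd;

	aio_fd  = open("/dev/aio", O_RDWR);	/* private kioctx per open() */
	data_fd = open(path, O_RDONLY);

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes     = data_fd;
	cb.aio_lio_opcode = IOCB_CMD_READ;
	cb.aio_buf        = buf;
	cb.aio_nbytes     = sizeof(buf);
	cb.aio_offset     = 0;

	submit_ios(aio_fd, 1, list);		/* queue the iocb */

	/* read() on /dev/aio blocks for the next completion event; the kernel
	 * also writes the result into cb.__aio_return and sets cb.__aio_key
	 * to -1 once the request has been reaped. */
	read(aio_fd, &ev, sizeof(ev));

	printf("event type %ld key %ld, aio_return %ld\n",
	       ev.type, ev.key, (long)cb.__aio_return);

	close(data_fd);
	close(aio_fd);
	return 0;
}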
+ */ +static struct kioctx *ioctx_alloc(void) +{ + struct kioctx *ctx; + + ctx = kmem_cache_alloc(kioctx_cachep, GFP_KERNEL); + if (ctx) { + memset(ctx, 0, sizeof(*ctx)); + atomic_set(&ctx->users, 1); + spin_lock_init(&ctx->done_lock); + init_waitqueue_head(&ctx->wait); + + ctx->max_reqs = AIO_RING_SIZE; + ctx->reqs = kmalloc(sizeof(struct iocb *) * ctx->max_reqs, GFP_KERNEL); + if (ctx->reqs) { + memset(ctx->reqs, 0, sizeof(struct iocb *) * ctx->max_reqs); + ctx->ring = kmalloc(sizeof(*ctx->ring), GFP_KERNEL); + if (ctx->ring) { + memset(ctx->ring, 0, sizeof(*ctx->ring)); + printk("aio: allocated aioctx %p\n", ctx); + return ctx; + } + kfree(ctx->reqs); + ctx->reqs = NULL; + } + kmem_cache_free(kioctx_cachep, ctx); + ctx = ERR_PTR(-ENOMEM); + } + + printk("aio: error allocating aioctx %p\n", ctx); + return ctx; +} + +struct kiogrp *kiogrp_alloc(struct kioctx *ctx) +{ + struct kiogrp *iogrp; + + iogrp = kmem_cache_alloc(kiogrp_cachep, GFP_KERNEL); + if (iogrp) { + memset(iogrp, 0, sizeof(*iogrp)); + aioctx_get(ctx); + iogrp->ctx = ctx; + iogrp->idx = -1; + } + return iogrp; +} + +void kiocb_free(struct kiocb *iocb) +{ + int i; + + for (i=0; inr_kiovec; i++) + unmap_kiobuf(iocb->kiovec[i]); + + free_kiovec(iocb->nr_kiovec, iocb->kiovec); + iocb->nr_kiovec = 0; + fput(iocb->filp); + iocb->filp = NULL; + kmem_cache_free(kiocb_cachep, iocb); +} + +void kiogrp_free(struct kiogrp *iogrp) +{ + struct kioctx *ctx = iogrp->ctx; + int i; + pr_debug("kio_free: %p/%d\n", iogrp, iogrp->idx); + + if ((i=atomic_read(&iogrp->count))) { + printk("kiogrp_free: %d/%p/%d still active!!!\n", i, iogrp, iogrp->idx); + return; + } + + if ((iogrp->idx >= 0) && (iogrp->idx < ctx->max_reqs)) + ctx->reqs[iogrp->idx] = NULL; + + for (i=0; inr_iocbs; i++) { + kiocb_free(iogrp->iocbs[i]); + } + kmem_cache_free(kiogrp_cachep, iogrp); + aioctx_put(ctx); +} + +/* iogrp_putio + * Called when the io count on iogrp is decremented. Checks + * to see if the kiogrp the request belongs to has finished, + * and if so sends the completion notice to its context. + */ +static void iogrp_putio(struct kiogrp *iogrp) +{ + struct kioctx *ctx = iogrp->ctx; + struct aio_ring *ring = ctx->ring; + unsigned long flags; + unsigned long tail; + + /* Is this the last io to complete in the group? */ + if (!atomic_dec_and_test(&iogrp->count)) { + if (atomic_read(&iogrp->count) < 0) + BUG(); + return; + } + + /* Yes we are, go ahead with completion */ + aioctx_get(ctx); + + /* add a completion event to the ring buffer. + * must be done holding done_lock to prevent + * other code from messing with the tail + * pointer since we might be called from irq + * context. + */ + spin_lock_irqsave(&ctx->done_lock, flags); + + tail = (ring->tail + 1) % AIO_RING_SIZE; + + ring->io_events[tail].data = iogrp->user_data; + ring->io_events[tail].key = iogrp->idx; + ring->io_events[tail].type = IO_EVENT_IOCB_DONE; + + /* after flagging the request as done, we + * must never even look at it again + */ + barrier(); + + ring->tail = tail; + + wmb(); + if (!ring->woke) + ring->woke = 1; + + spin_unlock_irqrestore(&ctx->done_lock, flags); + + pr_debug("added to ring %p at [%lu]\n", iogrp, tail); +#if 0 + if (!wake) { + printk("kio_complete: should send user of %p a signal...\n", ctx); + } +#endif + + wake_up(&ctx->wait); + + aioctx_put(ctx); +} + +/* aio_kiobuf_endio + * Called when io on a given kiobuf is complete. 
+ */ +static void aio_kiobuf_endio(struct kiobuf *iobuf) +{ + struct kiogrp *iogrp = iobuf->end_io_data; + + /* TODO: possibly put the return code into the iocb + * here. This only really makes sense if it's being + * put into the user's iocb, which would mean pinning + * it down in memory. Maybe. + */ + pr_debug("aio_kiobuf_endio: %p %p/%d\n", iobuf, iogrp, iogrp->idx); + iogrp_putio(iogrp); +} + +/* kio_submit: + * Submits an actual aiocb + */ +static inline int kio_submit(struct kiogrp *iogrp, struct kiocb *iocb, + struct iocb *aiocb) +{ + int (*rw_kiovec)(struct file *, int, int, struct kiobuf **, int, size_t, loff_t); + int ret = -ENOSYS; + int rw; + + switch(aiocb->aio_lio_opcode) { + case IOCB_CMD_WRITE: + rw = WRITE; + break; + case IOCB_CMD_READ: + rw = READ; + break; + default: + printk("kio_submit: lio_opcode = %d\n", aiocb->aio_lio_opcode); + goto out; + } + + rw_kiovec = iocb->filp->f_op->rw_kiovec; + if (rw_kiovec) + ret = rw_kiovec(iocb->filp, rw, iocb->nr_kiovec, iocb->kiovec, /*flags*/ 0, aiocb->aio_nbytes, aiocb->aio_offset); + else { + iocb->kiovec[0]->transferred = 0; + iocb->kiovec[0]->errno = -ENOSYS; + aio_kiobuf_endio(iocb->kiovec[0]); + ret = 0; + } + +out: + if (ret) { + static int count; + if (count < 10) { + count++; + printk("kio_submit: failed!\n"); + } + atomic_dec(&iogrp->count); + if (atomic_read(&iogrp->count) < 0) + BUG(); + } + + return ret; +} + +/*----------------- /dev/aio interface ----------------------- */ +static inline struct kiocb *aio_convert_user_aiocb(struct kiogrp *iogrp, + struct iocb *uaiocb, struct iocb *user_aiocb) +{ + struct kiocb *iocb; + int rw = WRITE; + int ret = -ENOMEM; + int i; + + iocb = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); + if (!iocb) + goto out; + + atomic_inc(&iogrp->count); /* FIXME: should be according to number of iobufs in this iocb */ + + memset(iocb, 0, sizeof(*iocb)); + + iocb->user_aiocb = user_aiocb; + iocb->filp = fget(uaiocb->aio_fildes); + ret = -EBADF; + if (!iocb->filp) + goto out_err; + + iocb->nr_kiovec = 1; + ret = alloc_kiovec(1, iocb->kiovec); + if (ret) + goto out_err; + + for (i=0; i < iocb->nr_kiovec; i++) { + iocb->kiovec[i]->end_io = aio_kiobuf_endio; + iocb->kiovec[i]->end_io_data = iogrp; + } + + switch (uaiocb->aio_lio_opcode) { + case IOCB_CMD_READ: rw = READ; + case IOCB_CMD_WRITE: + pr_debug("aio: map_user_kiobuf(%d, %p, %lu, %lu) = ", + rw, iocb->kiovec[0], (unsigned long)uaiocb->aio_buf, + (unsigned long)uaiocb->aio_nbytes); + ret = map_user_kiobuf(rw, iocb->kiovec[0], + (unsigned long)uaiocb->aio_buf, + uaiocb->aio_nbytes); + pr_debug("%d\n", ret); + if (ret) + goto out_kiobuf_err; + break; + default: + ret = -EINVAL; + printk("aio_convert_user_aiocb: lio_opcode = %d\n", uaiocb->aio_lio_opcode); + goto out_kiobuf_err; + } + + pr_debug("kio_convert_user_aiocb: (%p, %p) / %p\n", iogrp, uaiocb, iocb); + + return iocb; + +out_kiobuf_err: +out_err: + kiocb_free(iocb); +out: + return ERR_PTR(ret); +} + +/* aio_open + * Open method for /dev/aio. Allocates an aioctx for this open()er + * and places it in the file's private_data field. Can fail because + * of memory allocation failure. + */ +int aio_open(struct inode *inode, struct file *filp) +{ + struct kioctx *ctx = ioctx_alloc(); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + filp->private_data = ctx; + ctx->filp = filp; + return 0; +} + +/* aio_release + * Free the aioctx associated with the file. FIXME! 
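kio_submit() above hands the request to the file's new rw_kiovec method and falls back to completing the kiobuf immediately with -ENOSYS when no method is provided. The contract a provider has to honour, sketched very roughly here (not part of the patch), is simply to start the transfer and make sure every kiobuf's end_io() callback eventually fires with ->transferred and ->errno filled in, since that callback is what feeds aio_kiobuf_endio() and the completion ring; raw_rw_kiovec above follows the same convention:

#include <linux/fs.h>
#include <linux/iobuf.h>

static int null_rw_kiovec(struct file *filp, int rw, int nr,
			  struct kiobuf **kiovec, int flags,
			  size_t size, loff_t pos)
{
	int i;

	for (i = 0; i < nr; i++) {
		struct kiobuf *iobuf = kiovec[i];

		/* pretend the whole request transferred instantly */
		iobuf->transferred = iobuf->length;
		iobuf->errno = 0;
		if (iobuf->end_io)
			iobuf->end_io(iobuf);	/* signal completion to aio */
	}
	return 0;
}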
+ */ +int aio_release(struct inode *inode, struct file *filp) +{ + struct kioctx *ioctx = filp->private_data; + printk("aio_release(%p)\n", filp->private_data); + aioctx_put(ioctx); + filp->private_data = NULL; + return 0; +} + +/* kiocb_get + * + */ +static inline struct kiogrp *kiogrp_get(struct kioctx *ctx, int idx, void *key) +{ + struct kiogrp *iogrp; + + spin_lock(&aio_req_lock); + iogrp = ctx->reqs[idx]; + if (iogrp && iogrp->user_data == key) { + if (!iogrp->locked) + iogrp->locked = 1; + else + iogrp = ERR_PTR(-EBUSY); + } else + iogrp = ERR_PTR(-ENOENT); + spin_unlock(&aio_req_lock); + return iogrp; +} + +/* aio_complete + * Checks if the kiogrp in ctx at idx is finished. If so, copies the + * completion codes into userspace, and then releases the kiogrp. + */ +static int aio_complete(struct kioctx *ctx, int idx, void *key, int please_wait) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + struct kiogrp *iogrp; + int ret = -EINVAL; + unsigned i; + + pr_debug("aio_complete: %p %d %p %d\n", ctx, idx, key, please_wait); + if (idx < 0 || idx >= ctx->max_reqs) { + printk("aio_complete: idx(%d) is invalid\n", idx); + goto out; + } + + ret = -EBUSY; + + if (please_wait) { + add_wait_queue(&ctx->wait, &wait); + + do { + set_task_state(tsk, TASK_INTERRUPTIBLE); + iogrp = kiogrp_get(ctx, idx, key); + if (iogrp == ERR_PTR(-EBUSY)) { + schedule(); + + /* interrupted due to a signal? */ + iogrp = ERR_PTR(-EINTR); + if (signal_pending(tsk)) + break; + iogrp = kiogrp_get(ctx, idx, key); + } + } while (iogrp == ERR_PTR(-EBUSY)); + + set_task_state(tsk, TASK_RUNNING); + remove_wait_queue(&ctx->wait, &wait); + } else + iogrp = kiogrp_get(ctx, idx, key); + + ret = PTR_ERR(iogrp); + if (IS_ERR(iogrp)) { + printk("aio_complete: ERR: %d [%d, %p] from %p\n", ret, idx, key, __builtin_return_address(0)); + goto out; + } + + pr_debug("aio_complete: [%d] = %p\n", idx, iogrp); + + ret = -EFAULT; + for (i=0; inr_iocbs; i++) { + struct kiocb *iocb = iogrp->iocbs[i]; + + /* FIXME: decide kiovec vs iocb interaction, this is a KLUDGE */ + iocb->aio_return = iocb->kiovec[0]->transferred ? + iocb->kiovec[0]->transferred : + iocb->kiovec[0]->errno; + + if (put_user(iocb->aio_return, &iocb->user_aiocb->__aio_return)) + goto out_undo; + if (put_user(-1, &iocb->user_aiocb->__aio_key)) + goto out_undo; + } + + /* everything turned out well, dispose of the aiocb. */ + kiogrp_free(iogrp); + + return 0; + +out_undo: +printk("out_undo\n"); + /* unlock and wakeup so anyone else waiting can attempt this iocb */ + iogrp->locked = 0; + wake_up(&ctx->wait); + +out: + return ret; +} + +/* aio_read_evt + * Pull an event off of the aioctx's event ring. + * FIXME: make this use cmpxchg. + * TODO: make the ringbuffer user mmap()able (requires FIXME). 
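aio_complete() above copies the outcome back into the submitter's struct iocb: __aio_return receives the byte count (or a negative errno taken from the kiovec) and __aio_key is set to -1 once the request has been reaped. A small, hypothetical userspace helper built on that convention (illustrative only; the -EINPROGRESS return is our own marker, not something the patch defines):

#include <errno.h>
#include <linux/aio.h>

long iocb_result(const struct iocb *cb)
{
	if (cb->__aio_key != -1)
		return -EINPROGRESS;	/* not yet reaped via read() on /dev/aio */
	return cb->__aio_return;	/* >= 0 bytes, or negative errno */
}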
+ */ +static int aio_read_evt(struct aio_ring *ring, struct io_event *ent) +{ + unsigned long head; + int ret = -EAGAIN; + + pr_debug("in aio_read_evt h%lu t%lu\n", ring->head, ring->tail); + barrier(); + if (ring->head == ring->tail) + goto out; + + spin_lock(&aio_read_lock); /* investigate the value of making this per-ctx */ + + head = ring->head; + if (head != ring->tail) { + head = (head + 1) % AIO_RING_SIZE; + *ent = ring->io_events[head]; + barrier(); + ring->head = head; + ret = 0; + } + spin_unlock(&aio_read_lock); + +out: + pr_debug("leaving aio_read_evt: %d h%lu t%lu\n", ret, ring->head, ring->tail); + return ret; +} + +struct timeout { + struct timer_list timer; + int timed_out; + wait_queue_head_t wait; +}; + +static void timeout_func(unsigned long data) +{ + struct timeout *to = (struct timeout *)data; + + to->timed_out = 1; + wake_up(&to->wait); +} + +static inline void init_timeout(struct timeout *to) +{ + init_timer(&to->timer); + to->timer.data = (unsigned long)to; + to->timer.function = timeout_func; + to->timed_out = 0; + init_waitqueue_head(&to->wait); +} + +static inline void set_timeout(struct timeout *to, struct timespec *ts) +{ + unsigned long how_long; + + if (!ts->tv_sec && !ts->tv_nsec) { + to->timed_out = 1; + return; + } + + how_long = ts->tv_sec * HZ; +#define HZ_NS (1000000000 / HZ) + how_long += (ts->tv_nsec + HZ_NS - 1) / HZ_NS; + + to->timer.expires = jiffies + how_long; + add_timer(&to->timer); +} + +static inline void clear_timeout(struct timeout *to) +{ + del_timer_sync(&to->timer); +} + +static int read_events(struct kioctx *ctx, struct io_event *event, int max_nr, + struct timespec *timeout) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + DECLARE_WAITQUEUE(to_wait, tsk); + int ret = -EINVAL; + int nr = 0; + struct io_event ent; + struct timespec ts; + struct timeout to; + + init_timeout(&to); + + if (timeout) { + ret = -EFAULT; + if (copy_from_user(&ts, timeout, sizeof(ts))) + goto out; + + set_timeout(&to, &ts); + } + + memset(&ent, 0, sizeof(ent)); + ret = 0; + + while (nr < max_nr) { + ret = aio_read_evt(ctx->ring, &ent); + if (ret) { + if (nr) + break; + + add_wait_queue(&ctx->wait, &wait); + add_wait_queue(&to.wait, &to_wait); + do { + set_task_state(tsk, TASK_INTERRUPTIBLE); + + ret = aio_read_evt(ctx->ring, &ent); + if (!ret) + break; + ret = -ETIMEDOUT; + if (to.timed_out) + break; + schedule(); + if (to.timed_out) + break; + if (signal_pending(tsk)) { + ret = -EINTR; + break; + } + ret = aio_read_evt(ctx->ring, &ent); + } while (ret) ; + + set_task_state(tsk, TASK_RUNNING); + remove_wait_queue(&ctx->wait, &wait); + remove_wait_queue(&to.wait, &to_wait); + } + + if (ret) + break; + + /* FIXME: split checks in two */ + ret = -EFAULT; + if (copy_to_user(event, &ent, sizeof(ent))) { + /* FIXME: we lose an event here. */ + printk(KERN_DEBUG "aio: lost an event due to EFAULT.\n"); + break; + } + + /* Now complete the aio request and copy the result codes to userland. */ + ret = aio_complete(ctx, ent.key, ent.data, 0); + if (ret) { + printk(KERN_DEBUG "aio: lost an event -- aio_complete: %d.\n", ret); + break; /* FIXME: we lose an event here */ + } + + event ++; + nr ++; + } + + if (timeout) + clear_timeout(&to); +out: + return nr ? nr : ret; +} + +/* __aioctx_put + * Called when the last user of an aio context has gone away, + * and the struct needs to be freed. 
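set_timeout() above converts the user-supplied timespec into jiffies, rounding the nanosecond part up so a short timeout never collapses to zero. A tiny worked example of that arithmetic (userspace, illustrative, assuming HZ = 100):

#include <stdio.h>

int main(void)
{
	long hz = 100, hz_ns = 1000000000 / hz;	/* 10,000,000 ns per jiffy */
	long tv_sec = 0, tv_nsec = 25000000;	/* 25 ms requested */

	/* same formula as set_timeout(): whole seconds plus nanoseconds
	 * rounded up to the next jiffy */
	long how_long = tv_sec * hz + (tv_nsec + hz_ns - 1) / hz_ns;

	printf("%ld jiffies (~%ld ms)\n", how_long, how_long * 1000 / hz);
	return 0;	/* prints: 3 jiffies (~30 ms) */
}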
+ */ +void __aioctx_put(struct kioctx *ctx) +{ + struct io_event ent; + printk("aio: free aioctx %p\n", ctx); + + /* release any io requests that were not reaped by the user process */ + while (!aio_read_evt(ctx->ring, &ent)) { + struct kiogrp *iogrp = kiogrp_get(ctx, ent.key, ent.data); + if (!IS_ERR(iogrp)) + kiogrp_free(iogrp); + } + + kfree(ctx->ring); + kfree(ctx->reqs); + kmem_cache_free(kioctx_cachep, ctx); +} + +/* aio_read + * read() method for /dev/aio. Reads the next iogrp completion + * event off of the queue and then copies the iocb's return codes + * back into the userspace aiocbs. + * FIXME: error handling isn't complete. Bummer. + * TODO: implement O_NONBLOCK. + */ +static ssize_t aio_read(struct file *filp, char *buf, size_t size, loff_t *offp) +{ + struct kioctx *ctx; + int ret; + + if (size < 0) + return -EINVAL; + + size /= sizeof(struct io_event); + ctx = filp->private_data; + + ret = read_events(ctx, (struct io_event *)buf, size, NULL); + + return (ret > 0) ? ret * sizeof(struct io_event) : ret; +} + +/* iogrp_setup + * Allocate and initialize a kiogrp in the given + * context at idx. For positive values of idx, + * attempts to install the iogrp at idx, negative + * means allocate one. + * Error returns are by means of ERR_PTR's. + */ +static inline struct kiogrp *iogrp_setup(struct kioctx *ctx, int idx) +{ + struct kiogrp *iogrp; + + iogrp = ERR_PTR(-EINVAL); + if (idx >= ctx->max_reqs) + goto out; + + iogrp = kiogrp_alloc(ctx); + if (IS_ERR(iogrp)) + goto out; + + /* Get a reference to ze iogrp so that it isn't reported + * as complete before we're done queuing it. + */ + //atomic_inc(&iogrp->count); + + /* Assign the iogrp an id. */ + + /* FIXME: use cmpxchg instead of spin_lock? */ + spin_lock(&aio_req_lock); + if (idx < 0) { + for (idx=0; (idxmax_reqs) && (ctx->reqs[idx]); idx++) + ; + if (idx < ctx->max_reqs) + ctx->reqs[idx] = iogrp; + else { + printk("iogrp_setup: -EAGAIN\n"); + idx = -EAGAIN; + } + } else if (idx < ctx->max_reqs) { + if (!ctx->reqs[idx]) + ctx->reqs[idx] = iogrp; + else { + printk("iogrp_setup: -EBUSY\n"); + idx = -EBUSY; + } + } else + idx = -EINVAL; + + spin_unlock(&aio_req_lock); + + iogrp->idx = idx; /* side effect on error: kiogrp_free notices idx < 0 */ + if (idx < 0) { + //atomic_dec(&iogrp->count); + kiogrp_free(iogrp); + iogrp = ERR_PTR(idx); + } + +out: + return iogrp; +} + +static inline struct kioctx *get_ioctx(int ctx_id) +{ + struct file *filp; + + filp = fget(ctx_id); + if (filp) { + if (filp->f_op == &aio_fops) + return filp->private_data; + fput(filp); + } + + return NULL; +} + +static inline void put_ioctx(struct kioctx *ctx) +{ + fput(ctx->filp); +} + + +/* __submit_io + * Copies the aiocb from userspace into the kernel and sets up the + * request. Returns 0 if the request is successfully queued, -errno + * otherwise. + */ +static inline long __submit_io(struct kioctx *ctx, struct iocb *uaiocbp) +{ + struct iocb uaiocb; + long ret; + struct kiogrp *iogrp; + struct kiocb *kiocb; + + iogrp = iogrp_setup(ctx, -1); + ret = PTR_ERR(iogrp); + if (IS_ERR(iogrp)) + goto out_nofree; + + pr_debug("aio: submit %p %p\n", uaiocbp, &uaiocb); + ret = -EFAULT; + if (copy_from_user(&uaiocb, uaiocbp, sizeof(uaiocb))) + goto out; + + kiocb = aio_convert_user_aiocb(iogrp, &uaiocb, uaiocbp); + pr_debug("aio: kiocb = %p\n", kiocb); + ret = PTR_ERR(kiocb); + if (IS_ERR(kiocb)) + goto out; + + /* we don't do scatter gather... 
yet */ + iogrp->nr_iocbs = 1; + iogrp->iocbs = iogrp->atomic_iocbs; + iogrp->iocbs[0] = kiocb; + iogrp->user_data = uaiocbp; + + ret = -EFAULT; + if (put_user((int)iogrp->idx, &uaiocbp->__aio_key)) + goto out; + + /* kio_submit will free the kiocb if it fails. */ + ret = kio_submit(iogrp, kiocb, &uaiocb); + if (!ret) + return 0; + + if (atomic_read(&iogrp->count) != 0) + BUG(); + kiogrp_free(iogrp); + + return ret; + +out: + /* Shoot, something went wrong. Discard the iogrp we allocated. */ + kiogrp_free(iogrp); +out_nofree: + return ret; +} + +/* sys_submit_ios + * Copy an aiocb from userspace into kernel space, then convert it to + * a kiocb, submit and repeat until done. Error codes on copy/submit + * only get returned for the first aiocb copied as otherwise the size + * of aiocbs copied is returned (standard write sematics). + */ +long sys_submit_ios(int ctx_id, int nr, struct iocb **uaiocbpp) +{ + struct kioctx *ctx; + struct iocb *uaiocbp; + int i; + long ret = 0; + + if (ctx_id < 0 || nr <= 0) + goto out_inval; + + ctx = get_ioctx(ctx_id); + if (!ctx) + goto out_inval; + + for (i=0; iprivate_data; + + switch (command) { + case AIO_IOCTL_SUBMIT_AIOCB: + return aio_submit(ctx, (struct iocb *)arg); +#if 0 /* FIXME */ + case AIO_IOCTL_SUBMIT_LIST: + return aio_submit_group(ctx, (struct io_group_list *)arg); +#endif + case AIO_IOCTL_COMPLETE: + return aio_complete(ctx, arg, 0); + case AIO_IOCTL_COMPLETE_WAIT: + return aio_complete(ctx, arg, 1); + } + return -ENOSYS; +} +#endif + +struct file_operations aio_fops = { + //ioctl: aio_ioctl, + open: aio_open, + release: aio_release, + read: aio_read, +}; + diff -urN v2.4.2-ac28/fs/buffer.c aio-v2.4.2-ac28/fs/buffer.c --- v2.4.2-ac28/fs/buffer.c Fri Mar 30 18:50:15 2001 +++ aio-v2.4.2-ac28/fs/buffer.c Fri Mar 30 21:18:16 2001 @@ -45,6 +45,13 @@ #include #include #include +#include + +struct brw_cb { + struct kiobuf *kiobuf; + int nr; + struct buffer_head *bh[1]; +}; #include #include @@ -1059,6 +1066,10 @@ if (state < 0) return; + + if (state && (!dev || MAJOR(dev) == LOOP_MAJOR)) + state = 0; + wakeup_bdflush(state); } @@ -1992,64 +2003,53 @@ return tmp.b_blocknr; } +static inline void brw_kio_put_iobuf(struct brw_cb *brw_cb, struct kiobuf *kiobuf) +{ + if (atomic_dec_and_test(&kiobuf->io_count)) { + int nr; + + /* Walk the buffer heads associated with this kiobuf + * checking for errors and freeing them as we go. + */ + for (nr=0; nr < brw_cb->nr; nr++) { + struct buffer_head *bh = brw_cb->bh[nr]; + if (buffer_uptodate(bh) && !kiobuf->errno) + kiobuf->transferred += bh->b_size; + else if (!kiobuf->errno) + kiobuf->errno = -EIO; + kmem_cache_free(bh_cachep, bh); + } + + if (kiobuf->end_io) + kiobuf->end_io(kiobuf); + wake_up(&kiobuf->wait_queue); + + kfree(brw_cb); + } +} + /* * IO completion routine for a buffer_head being used for kiobuf IO: we * can't dispatch the kiobuf callback until io_count reaches 0. */ -static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate) +static void end_buffer_io_kiobuf_async(struct buffer_head *bh, int uptodate) { + struct brw_cb *brw_cb; struct kiobuf *kiobuf; mark_buffer_uptodate(bh, uptodate); - kiobuf = bh->b_private; + brw_cb = bh->b_private; unlock_buffer(bh); - end_kio_request(kiobuf, uptodate); -} - - -/* - * For brw_kiovec: submit a set of buffer_head temporary IOs and wait - * for them to complete. Clean up the buffer_heads afterwards. 
- */ - -static int wait_kio(int rw, int nr, struct buffer_head *bh[], int size) -{ - int iosize, err; - int i; - struct buffer_head *tmp; - iosize = 0; - err = 0; - spin_lock(&unused_list_lock); - - for (i = nr; --i >= 0; ) { - iosize += size; - tmp = bh[i]; - if (buffer_locked(tmp)) { - spin_unlock(&unused_list_lock); - wait_on_buffer(tmp); - spin_lock(&unused_list_lock); - } - - if (!buffer_uptodate(tmp)) { - /* We are traversing bh'es in reverse order so - clearing iosize on error calculates the - amount of IO before the first error. */ - iosize = 0; - err = -EIO; - } - __put_unused_buffer_head(tmp); - } - - spin_unlock(&unused_list_lock); - - if (iosize) - return iosize; - return err; + kiobuf = brw_cb->kiobuf; + if (!uptodate && !kiobuf->errno) + brw_cb->kiobuf->errno = -EIO; + brw_kio_put_iobuf(brw_cb, kiobuf); } + /* * Start I/O on a physical range of kernel memory, defined by a vector * of kiobuf structs (much like a user-space iovec list). @@ -2062,13 +2062,11 @@ * passed in to completely map the iobufs to disk. */ -int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], - kdev_t dev, unsigned long b[], int size) +int brw_kiovec_async(int rw, int nr, struct kiobuf *iovec[], + kdev_t dev, int nr_blocks, unsigned long b[], int sector_size) { int err; int length; - int transferred; - int i; int bufind; int pageind; int bhind; @@ -2076,19 +2074,33 @@ unsigned long blocknr; struct kiobuf * iobuf = NULL; struct page * map; - struct buffer_head *tmp, *bh[KIO_MAX_SECTORS]; + struct buffer_head *tmp; + int bh_nr; + int i; + +#define MAX_KIOVEC_NR 8 + struct brw_cb *brw_cb_table[MAX_KIOVEC_NR]; + struct brw_cb *brw_cb; if (!nr) return 0; - + + if (nr > MAX_KIOVEC_NR) { + printk("kiovec too large: %d\n", nr); + BUG(); + } + /* * First, do some alignment and validity checks */ for (i = 0; i < nr; i++) { iobuf = iovec[i]; - if ((iobuf->offset & (size-1)) || - (iobuf->length & (size-1))) + if ((iobuf->offset & (sector_size-1)) || + (iobuf->length & (sector_size-1))) { + printk("brw_kiovec_async: iobuf->offset=0x%x length=0x%x sector_size: 0x%x\n", iobuf->offset, iobuf->length, sector_size); return -EINVAL; + } + if (!iobuf->nr_pages) panic("brw_kiovec: iobuf not initialised"); } @@ -2096,62 +2108,67 @@ /* * OK to walk down the iovec doing page IO on each page we find. 
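The replacement submission path drops the synchronous wait_kio() loop in favour of per-kiobuf reference counting: io_count is raised once for the setup phase plus once per buffer_head, and brw_kio_put_iobuf() tears the request down and fires end_io() on the final decrement. A userspace analogue of that pattern (illustrative only; a plain int stands in for the kernel's atomic_t):

#include <stdio.h>

struct fake_kiobuf {
	int io_count;			/* stands in for atomic_t io_count */
	int transferred;
	void (*end_io)(struct fake_kiobuf *);
};

static void fake_end_io(struct fake_kiobuf *k)
{
	printf("end_io: %d bytes done\n", k->transferred);
}

static void put_iobuf(struct fake_kiobuf *k)
{
	if (--k->io_count == 0)		/* kernel uses atomic_dec_and_test() */
		k->end_io(k);
}

int main(void)
{
	struct fake_kiobuf k = { .io_count = 1, .end_io = fake_end_io };
	int blocks = 4, i;

	for (i = 0; i < blocks; i++)
		k.io_count++;		/* one reference per buffer_head */

	/* drop the setup reference once everything is queued; the IO may
	 * still be in flight, so this must not be the last reference */
	put_iobuf(&k);

	for (i = 0; i < blocks; i++) {	/* completions arrive; the last one fires end_io */
		k.transferred += 512;
		put_iobuf(&k);
	}
	return 0;
}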
*/ - bufind = bhind = transferred = err = 0; + bufind = bhind = err = 0; for (i = 0; i < nr; i++) { iobuf = iovec[i]; offset = iobuf->offset; length = iobuf->length; iobuf->errno = 0; - + iobuf->transferred = 0; + atomic_inc(&iobuf->io_count); + + bh_nr = ((iobuf->nr_pages * PAGE_SIZE) - offset) / sector_size; + if (!bh_nr) { + printk("brw_kiovec_async: !bh_nr\n"); + return -EINVAL; + } + + /* FIXME: tie into userbeans here */ + brw_cb = kmalloc(sizeof(*brw_cb) + (bh_nr * sizeof(struct buffer_head *)), GFP_KERNEL); + if (!brw_cb) + return -ENOMEM; + + brw_cb_table[i] = brw_cb; + brw_cb->kiobuf = iobuf; + brw_cb->nr = 0; + for (pageind = 0; pageind < iobuf->nr_pages; pageind++) { map = iobuf->maplist[pageind]; - if (!map) { - err = -EFAULT; + err = -EFAULT; + if (!map) goto error; - } - - while (length > 0) { + + while (length > 0 && (bufind < nr_blocks)) { blocknr = b[bufind++]; - tmp = get_unused_buffer_head(0); - if (!tmp) { - err = -ENOMEM; + tmp = kmem_cache_alloc(bh_cachep, SLAB_BUFFER); + err = -ENOMEM; + if (!tmp) goto error; - } - + + memset(tmp, 0, sizeof(*tmp)); + init_waitqueue_head(&tmp->b_wait); tmp->b_dev = B_FREE; - tmp->b_size = size; + tmp->b_size = sector_size; set_bh_page(tmp, map, offset); tmp->b_this_page = tmp; - init_buffer(tmp, end_buffer_io_kiobuf, iobuf); + init_buffer(tmp, end_buffer_io_kiobuf_async, NULL); tmp->b_dev = dev; tmp->b_blocknr = blocknr; tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req); + tmp->b_private = brw_cb; if (rw == WRITE) { set_bit(BH_Uptodate, &tmp->b_state); clear_bit(BH_Dirty, &tmp->b_state); } - bh[bhind++] = tmp; - length -= size; - offset += size; + brw_cb->bh[brw_cb->nr++] = tmp; + length -= sector_size; + offset += sector_size; atomic_inc(&iobuf->io_count); - submit_bh(rw, tmp); - /* - * Wait for IO if we have got too much - */ - if (bhind >= KIO_MAX_SECTORS) { - err = wait_kio(rw, bhind, bh, size); - if (err >= 0) - transferred += err; - else - goto finished; - bhind = 0; - } - if (offset >= PAGE_SIZE) { offset = 0; break; @@ -2160,22 +2177,72 @@ } /* End of page loop */ } /* End of iovec loop */ + /* okay, we've setup all our io requests, now fire them off! */ + for (i = 0; i < nr; i++) { + int j; + brw_cb = brw_cb_table[i]; +#if 1 + for (j=0; jnr; j++) + submit_bh(rw, brw_cb->bh[j]); + //ll_rw_block(rw, brw_cb->nr, brw_cb->bh); +#else + generic_make_requests(dev, rw, brw_cb->bh, brw_cb->nr); +#endif + brw_kio_put_iobuf(brw_cb, brw_cb->kiobuf); + } + + return 0; + error: - /* Is there any IO still left to submit? */ - if (bhind) { - int tmp_err; - tmp_err = wait_kio(rw, bhind, bh, size); - if (tmp_err >= 0) - transferred += tmp_err; - else + /* Walk brw_cb_table freeing all the goop associated with each kiobuf */ + do { + brw_cb = brw_cb_table[i]; + if (brw_cb) { + /* We got an error allocating the bh'es. Just free the current + buffer_heads and exit. 
*/ + for (bhind = brw_cb->nr; bhind--; ) + kmem_cache_free(bh_cachep, brw_cb->bh[bhind]); + atomic_dec(&brw_cb->kiobuf->io_count); + kfree(brw_cb); + } + } while (i--) ; + + return err; +} + +int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], + kdev_t dev, int nr_blocks, unsigned long b[], int sector_size) +{ + int i; + int transferred = 0; + int err = 0; + + if (!nr) + return 0; + + /* queue up and trigger the io */ + err = brw_kiovec_async(rw, nr, iovec, dev, nr_blocks, b, sector_size); + if (err) + goto out; + + /* wait on the last iovec first -- it's more likely to finish last */ + for (i=nr; --i >= 0; ) + kiobuf_wait_for_io(iovec[i]); + + run_task_queue(&tq_disk); + + /* okay, how much data actually got through? */ + for (i=0; ierrno) { if (!err) - err = tmp_err; + err = iovec[i]->errno; + break; + } + transferred += iovec[i]->length; } - finished: - if (transferred) - return transferred; - return err; +out: + return transferred ? transferred : err; } /* diff -urN v2.4.2-ac28/fs/ext2/file.c aio-v2.4.2-ac28/fs/ext2/file.c --- v2.4.2-ac28/fs/ext2/file.c Fri Mar 30 18:50:16 2001 +++ aio-v2.4.2-ac28/fs/ext2/file.c Fri Mar 30 18:53:41 2001 @@ -41,6 +41,7 @@ struct file_operations ext2_file_operations = { read: generic_file_read, write: generic_file_write, + rw_kiovec: generic_file_rw_kiovec, ioctl: ext2_ioctl, mmap: generic_file_mmap, open: generic_file_open, diff -urN v2.4.2-ac28/include/asm-i386/unistd.h aio-v2.4.2-ac28/include/asm-i386/unistd.h --- v2.4.2-ac28/include/asm-i386/unistd.h Fri Aug 11 17:39:23 2000 +++ aio-v2.4.2-ac28/include/asm-i386/unistd.h Fri Mar 30 18:53:41 2001 @@ -227,6 +227,11 @@ #define __NR_madvise1 219 /* delete when C lib stub is removed */ #define __NR_getdents64 220 #define __NR_fcntl64 221 +/* reserved for tux 222 */ +#define __NR___io_cancel 224 +#define __NR___io_wait 225 +#define __NR___io_getevents 226 +#define __NR_submit_ios 227 /* user-visible error numbers are in the range -1 - -124: see */ diff -urN v2.4.2-ac28/include/linux/aio.h aio-v2.4.2-ac28/include/linux/aio.h --- v2.4.2-ac28/include/linux/aio.h Wed Dec 31 19:00:00 1969 +++ aio-v2.4.2-ac28/include/linux/aio.h Fri Mar 30 18:53:41 2001 @@ -0,0 +1,138 @@ +/* linux/aio.h + * Written by Benjamin LaHaise + */ +#ifndef __AIO_H__ +#define __AIO_H__ + +#define IOCB_CMD_FINISHING -3 /* kernel internal */ + +#define IOCB_CMD_READ 0 +#define IOCB_CMD_WRITE 1 +#define IOCB_CMD_NOP 2 +#define IOCB_CMD_CANCEL 3 +#define IOCB_CMD_FSYNC 4 +#define IOCB_CMD_FDSYNC 5 +#define IOCB_CMD_RUNNING 6 +#define IOCB_CMD_DONE 7 + +#define AIO_RING_SIZE 8000 + +/* FIXME: get real ioctl values */ +#define AIO_IOCTL_SUBMIT_AIOCB 0x10c11000 /* (struct iocb *), submits a single aio request as its own group */ +#define AIO_IOCTL_SUBMIT_LIST 0x10c11001 /* NOT IMPL. (struct aio_group_list *), submits a series of aio requests */ +#define AIO_IOCTL_COMPLETE 0x10c11002 /* int grp_idx, runs aio_complete on the given group. If still in progress, returns EBUSY */ +#define AIO_IOCTL_COMPLETE_WAIT 0x10c11003 /* int grp_idx, runs aio_complete on the given group. If still in progress, waits for completion. */ +#define AIO_IOCTL_CANCEL 0x10c11004 + +/* Notification method. Not implemented yet. */ +#define AIO_IOCTL_SET_NOTIFY_SIGNAL 0x10c11005 + +struct io_group { + int nr; + void *data; + struct iocb **list; +}; + +struct io_group_list { + int nr; + struct io_group *list; +}; + +/* read() from /dev/aio returns these structures. 
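As the comment above says, read() on the /dev/aio fd returns these structures: aio_read() treats the user buffer as an array of struct io_event and returns a byte count that is a multiple of its size. A hypothetical helper that drains several completions with one read() (illustrative only):

#include <stdio.h>
#include <unistd.h>
#include <linux/aio.h>		/* struct io_event, IO_EVENT_IOCB_DONE */

void drain_events(int aio_fd)
{
	struct io_event evs[16];
	ssize_t n;
	int i, count;

	n = read(aio_fd, evs, sizeof(evs));	/* blocks until >= 1 event */
	if (n <= 0)
		return;

	count = n / sizeof(struct io_event);
	for (i = 0; i < count; i++)
		if (evs[i].type == IO_EVENT_IOCB_DONE)
			printf("iocb %p (slot %ld) completed\n",
			       evs[i].data, evs[i].key);
}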
*/ +enum io_event_types { + IO_EVENT_NONE, + IO_EVENT_IOCB_DONE, +}; + +struct io_event { + long type; + long flags; + long key; + void *data; +}; + +struct aio_ring { + unsigned long head; + unsigned long tail; + unsigned long woke; + unsigned long __reserved; + struct io_event io_events[AIO_RING_SIZE]; +}; + +/* + * we always use a 64bit off_t when communicating + * with userland. its up to libraries to do the + * proper padding and aio_error abstraction + * + * FIXME: this must change from glibc's definition + * as we do *not* use the sigevent structure which + * is big and bloated. + */ + +struct iocb { + int aio_fildes; + short aio_lio_opcode; + short aio_reqprio; + void *aio_buf; + size_t aio_nbytes; + loff_t aio_offset; + + /* these are internal to the kernel/libc. */ + ssize_t __aio_return; /* the kernel writes the return code here */ + long __aio_key; /* the kernel sets this to -1 when completed, + * otherwise is the >= 0 iogrp #. */ +}; /* 32 bytes on 32 bit machines, 48 on 64 */ + +#ifdef __KERNEL__ +#define AIO_MAXSEGS 4 +#define AIO_KIOGRP_NR_ATOMIC 8 + +struct kiocb { + int nr_kiovec; + struct kiobuf *kiovec[AIO_MAXSEGS]; + struct iocb *user_aiocb; + struct file *filp; + long aio_return; +}; + +#define IOGRP_STATE_SETUP 0 +#define IOGRP_STATE_DONE 1 + +struct kiogrp { + int locked:1; + atomic_t count; /* ios left */ + void *user_data; + struct kioctx *ctx; + int idx; + int nr_iocbs; + struct kiocb **iocbs; + struct kiocb *atomic_iocbs[AIO_KIOGRP_NR_ATOMIC]; +}; + +struct kioctx { + atomic_t users; + + wait_queue_head_t wait; + + int max_reqs; + struct kiogrp **reqs; + + spinlock_t done_lock; + + int pid; /* pid to send wakeups to */ + struct aio_ring *ring; + struct file *filp; +}; + +extern struct file_operations aio_fops; + +extern void __aioctx_put(struct kioctx *ctx); +extern void aio_setup(void); + +#define aioctx_get(kioctx) atomic_inc(&(kioctx)->users) +#define aioctx_put(kioctx) do { if (atomic_dec_and_test(&(kioctx)->users)) __aioctx_put(kioctx); } while (0) + +#endif /*__KERNEL__*/ + +#endif /* __AIO_H__ */ + diff -urN v2.4.2-ac28/include/linux/blkdev.h aio-v2.4.2-ac28/include/linux/blkdev.h --- v2.4.2-ac28/include/linux/blkdev.h Fri Mar 30 18:50:24 2001 +++ aio-v2.4.2-ac28/include/linux/blkdev.h Fri Mar 30 21:00:16 2001 @@ -149,7 +149,7 @@ extern struct blk_dev_struct blk_dev[MAX_BLKDEV]; extern void grok_partitions(struct gendisk *dev, int drive, unsigned minors, long size); extern void register_disk(struct gendisk *dev, kdev_t first, unsigned minors, struct block_device_operations *ops, long size); -extern void generic_make_request(int rw, struct buffer_head * bh); +extern void generic_make_request(int rw, struct buffer_head *bh); extern request_queue_t *blk_get_queue(kdev_t dev); extern inline request_queue_t *__blk_get_queue(kdev_t dev); extern void blkdev_release_request(struct request *); diff -urN v2.4.2-ac28/include/linux/event.h aio-v2.4.2-ac28/include/linux/event.h --- v2.4.2-ac28/include/linux/event.h Wed Dec 31 19:00:00 1969 +++ aio-v2.4.2-ac28/include/linux/event.h Fri Mar 30 18:53:41 2001 @@ -0,0 +1,21 @@ +#ifndef _LINUX_KEVENTQ_H +#define _LINUX_KEVENTQ_H + +typedef struct file *keventq_t; + +keventq_t keventq_get(int qid); +#define keventq_put(evq) fput(evq) + +keventq_t keventq_get(int qid) +{ + struct file *filp = fget(qid); + if (filp) { + if (&keventq_fops == filp->f_op) + return filp; + fput(filp); + } + return NULL; +} + + +#endif diff -urN v2.4.2-ac28/include/linux/fs.h aio-v2.4.2-ac28/include/linux/fs.h --- v2.4.2-ac28/include/linux/fs.h Fri 
Mar 30 18:50:25 2001 +++ aio-v2.4.2-ac28/include/linux/fs.h Fri Mar 30 20:55:07 2001 @@ -20,7 +20,6 @@ #include #include #include -#include #include @@ -757,7 +756,13 @@ * NOTE: * read, write, poll, fsync, readv, writev can be called * without the big kernel lock held in all filesystems. + * + * rw_kiovec returns the number of bytes that will actually + * be transferred into the kiovec, or an error that occurred + * during queueing. */ +struct kiobuf; + struct file_operations { struct module *owner; loff_t (*llseek) (struct file *, loff_t, int); @@ -776,6 +781,7 @@ ssize_t (*readv) (struct file *, const struct iovec *, unsigned long, loff_t *); ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *); ssize_t (*writepage) (struct file *, struct page *, int, size_t, loff_t *, int); + int (*rw_kiovec)(struct file *filp, int rw, int nr, struct kiobuf **kiovec, int flags, size_t size, loff_t pos); }; struct inode_operations { @@ -1312,6 +1318,7 @@ extern ssize_t generic_file_read(struct file *, char *, size_t, loff_t *); extern ssize_t generic_file_write(struct file *, const char *, size_t, loff_t *); extern void do_generic_file_read(struct file *, loff_t *, read_descriptor_t *, read_actor_t); +extern int generic_file_rw_kiovec(struct file *filp, int rw, int nr, struct kiobuf **kiovec, int flags, size_t size, loff_t pos); extern ssize_t generic_read_dir(struct file *, char *, size_t, loff_t *); extern int generic_file_open(struct inode *, struct file *); diff -urN v2.4.2-ac28/include/linux/iobuf.h aio-v2.4.2-ac28/include/linux/iobuf.h --- v2.4.2-ac28/include/linux/iobuf.h Fri Mar 30 18:50:25 2001 +++ aio-v2.4.2-ac28/include/linux/iobuf.h Fri Mar 30 21:01:57 2001 @@ -52,8 +52,10 @@ /* Dynamic state for IO completion: */ atomic_t io_count; /* IOs still in progress */ + int transferred; /* Number of bytes of completed IO at the beginning of the buffer */ int errno; /* Status of completed IO */ void (*end_io) (struct kiobuf *); /* Completion callback */ + void *end_io_data; wait_queue_head_t wait_queue; }; @@ -79,7 +81,9 @@ /* fs/buffer.c */ +int brw_kiovec_async(int rw, int nr, struct kiobuf *iovec[], + kdev_t dev, int nr_blocks, unsigned long b[], int size); int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], - kdev_t dev, unsigned long b[], int size); + kdev_t dev, int nr_blocks, unsigned long b[], int size); #endif /* __LINUX_IOBUF_H */ diff -urN v2.4.2-ac28/include/linux/locks.h aio-v2.4.2-ac28/include/linux/locks.h --- v2.4.2-ac28/include/linux/locks.h Wed Feb 21 19:10:26 2001 +++ aio-v2.4.2-ac28/include/linux/locks.h Fri Mar 30 21:00:16 2001 @@ -30,8 +30,7 @@ { clear_bit(BH_Lock, &bh->b_state); smp_mb__after_clear_bit(); - if (waitqueue_active(&bh->b_wait)) - wake_up(&bh->b_wait); + wake_up(&bh->b_wait); } /* @@ -61,8 +60,7 @@ * No need of any barrier, we're protected by * the big kernel lock here... 
unfortunately :) */ - if (waitqueue_active(&sb->s_wait)) - wake_up(&sb->s_wait); + wake_up(&sb->s_wait); } #endif /* _LINUX_LOCKS_H */ diff -urN v2.4.2-ac28/include/linux/mm.h aio-v2.4.2-ac28/include/linux/mm.h --- v2.4.2-ac28/include/linux/mm.h Fri Mar 30 18:50:25 2001 +++ aio-v2.4.2-ac28/include/linux/mm.h Fri Mar 30 20:55:08 2001 @@ -315,8 +315,7 @@ smp_mb__before_clear_bit(); \ if (!test_and_clear_bit(PG_locked, &(page)->flags)) BUG(); \ smp_mb__after_clear_bit(); \ - if (waitqueue_active(&(page)->wait)) \ - wake_up(&(page)->wait); \ + wake_up(&(page)->wait); \ } while (0) #define PageError(page) test_bit(PG_error, &(page)->flags) #define SetPageError(page) set_bit(PG_error, &(page)->flags) diff -urN v2.4.2-ac28/include/linux/sched.h aio-v2.4.2-ac28/include/linux/sched.h --- v2.4.2-ac28/include/linux/sched.h Fri Mar 30 18:50:26 2001 +++ aio-v2.4.2-ac28/include/linux/sched.h Fri Mar 30 20:55:08 2001 @@ -758,6 +758,7 @@ extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)); +extern void FASTCALL(add_wait_queue_exclusive_lifo(wait_queue_head_t *q, wait_queue_t * wait)); extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); #define __wait_event(wq, condition) \ diff -urN v2.4.2-ac28/include/linux/tqueue.h aio-v2.4.2-ac28/include/linux/tqueue.h --- v2.4.2-ac28/include/linux/tqueue.h Fri Mar 30 18:50:27 2001 +++ aio-v2.4.2-ac28/include/linux/tqueue.h Fri Mar 30 20:55:07 2001 @@ -67,6 +67,7 @@ #define TQ_ACTIVE(q) (!list_empty(&q)) extern task_queue tq_timer, tq_immediate, tq_disk; +extern struct tq_struct run_disk_tq; /* * To implement your own list of active bottom halfs, use the following diff -urN v2.4.2-ac28/include/linux/wait.h aio-v2.4.2-ac28/include/linux/wait.h --- v2.4.2-ac28/include/linux/wait.h Fri Mar 30 18:50:27 2001 +++ aio-v2.4.2-ac28/include/linux/wait.h Fri Mar 30 20:55:07 2001 @@ -28,17 +28,20 @@ #define WAITQUEUE_DEBUG 0 #endif +typedef struct __wait_queue wait_queue_t; +typedef void (*wait_queue_func_t)(wait_queue_t *wait); + struct __wait_queue { unsigned int flags; #define WQ_FLAG_EXCLUSIVE 0x01 struct task_struct * task; struct list_head task_list; + wait_queue_func_t func; #if WAITQUEUE_DEBUG long __magic; long __waker; #endif }; -typedef struct __wait_queue wait_queue_t; /* * 'dual' spinlock architecture. 
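The wait.h change above gives every wait queue entry an optional callback: an entry set up with init_waitqueue_func_entry() (added in the next hunk) carries no task, and the __wake_up_common() change in kernel/sched.c further down invokes the callback instead of waking a process. A rough kernel-side sketch of the idiom (not part of the patch); real users embed the wait_queue_t in a larger structure, as struct worktodo below does, so the callback can recover its state from the pointer it is handed:

#include <linux/kernel.h>
#include <linux/wait.h>

static wait_queue_t my_wait;

static void my_wake_function(wait_queue_t *wait)
{
	/* runs in the context of whoever calls wake_up() on the queue,
	 * possibly from irq context, so it must not sleep */
	printk(KERN_DEBUG "wake_up fired for entry %p\n", wait);
}

static void watch_queue(wait_queue_head_t *q)
{
	init_waitqueue_func_entry(&my_wait, my_wake_function);
	add_wait_queue(q, &my_wait);	/* callback fires on the next wake_up(q) */
}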
Can be switched between spinlock_t and @@ -137,6 +140,7 @@ #endif #define __WAITQUEUE_INITIALIZER(name, tsk) { \ + func: NULL, \ task: tsk, \ task_list: { NULL, NULL }, \ __WAITQUEUE_DEBUG_INIT(name)} @@ -174,6 +178,22 @@ #endif q->flags = 0; q->task = p; + q->func = NULL; +#if WAITQUEUE_DEBUG + q->__magic = (long)&q->__magic; +#endif +} + +static inline void init_waitqueue_func_entry(wait_queue_t *q, + wait_queue_func_t func) +{ +#if WAITQUEUE_DEBUG + if (!q || !p) + WQ_BUG(); +#endif + q->flags = 0; + q->task = NULL; + q->func = func; #if WAITQUEUE_DEBUG q->__magic = (long)&q->__magic; #endif @@ -230,6 +250,19 @@ #endif list_del(&old->task_list); } + +#define add_wait_queue_cond(q, wait, cond, fail) \ + do { \ + unsigned long flags; \ + wq_write_lock_irqsave(&(q)->lock, flags); \ + (wait)->flags = 0; \ + if (cond) \ + __add_wait_queue((q), (wait)); \ + else { \ + fail; \ + } \ + wq_write_unlock_irqrestore(&(q)->lock, flags); \ + } while (0) #endif /* __KERNEL__ */ diff -urN v2.4.2-ac28/include/linux/worktodo.h aio-v2.4.2-ac28/include/linux/worktodo.h --- v2.4.2-ac28/include/linux/worktodo.h Wed Dec 31 19:00:00 1969 +++ aio-v2.4.2-ac28/include/linux/worktodo.h Fri Mar 30 21:10:26 2001 @@ -0,0 +1,40 @@ +#ifndef _LINUX_WORKTODO_H +#define _LINUX_WORKTODO_H + +#ifndef _LINUX_WAIT_H +#include +#endif +#ifndef _LINUX_TQUEUE_H +#include +#endif + +struct worktodo { + wait_queue_t wait; + struct tq_struct tq; + + void *data; /* for use by the wtd_ primatives */ +}; + +/* FIXME NOTE: factor from kernel/context.c */ +#define wtd_queue(wtd) schedule_task(&(wtd)->tq) + +#define wtd_set_action(wtd, action, wtddata) \ + do { \ + (wtd)->tq.routine = (action); \ + (wtd)->tq.data = (wtddata); \ + } while (0) + +struct page; +extern void wtd_wait_page(struct worktodo *wtd, struct page *page); +extern void wtd_lock_page(struct worktodo *wtd, struct page *page); +struct buffer_head; +extern void wtd_wait_on_buffer(struct worktodo *wtd, struct buffer_head *bh); + +#if 0 /* not implemented yet */ +extern void wtd_down(struct worktodo *wtd, struct semaphore *sem); +extern void wtd_down_write(struct worktodo *wtd, struct rw_semaphore *sem); +extern void wtd_down_read(struct worktodo *wtd, struct rw_semaphore *sem); +#endif + +#endif /* _LINUX_WORKTODO_H */ + diff -urN v2.4.2-ac28/init/main.c aio-v2.4.2-ac28/init/main.c --- v2.4.2-ac28/init/main.c Fri Mar 30 18:50:27 2001 +++ aio-v2.4.2-ac28/init/main.c Fri Mar 30 18:53:42 2001 @@ -824,8 +824,13 @@ if (initrd_start && mount_initrd) root_mountflags &= ~MS_RDONLY; else mount_initrd =0; #endif - - start_context_thread(); + { + int i = smp_num_cpus; + if (i < 2) + i = 2; + for (; i>0; i--) + start_context_thread(); + } do_initcalls(); /* .. filesystems .. 
*/ diff -urN v2.4.2-ac28/kernel/context.c aio-v2.4.2-ac28/kernel/context.c --- v2.4.2-ac28/kernel/context.c Fri Jan 12 12:52:41 2001 +++ aio-v2.4.2-ac28/kernel/context.c Fri Mar 30 18:53:42 2001 @@ -91,12 +91,18 @@ */ for (;;) { set_task_state(curtask, TASK_INTERRUPTIBLE); - add_wait_queue(&context_task_wq, &wait); - if (TQ_ACTIVE(tq_context)) + add_wait_queue_exclusive_lifo(&context_task_wq, &wait); + if (spin_is_locked(&tqueue_lock) || TQ_ACTIVE(tq_context)) set_task_state(curtask, TASK_RUNNING); - schedule(); + else + schedule(); remove_wait_queue(&context_task_wq, &wait); run_task_queue(&tq_context); + while (TQ_ACTIVE(tq_context)) { + if (current->need_resched) + schedule(); + run_task_queue(&tq_context); + } wake_up(&context_task_done); if (signal_pending(curtask)) { while (waitpid(-1, (unsigned int *)0, __WALL|WNOHANG) > 0) diff -urN v2.4.2-ac28/kernel/fork.c aio-v2.4.2-ac28/kernel/fork.c --- v2.4.2-ac28/kernel/fork.c Fri Mar 30 18:50:28 2001 +++ aio-v2.4.2-ac28/kernel/fork.c Fri Mar 30 18:53:42 2001 @@ -44,6 +44,16 @@ wq_write_unlock_irqrestore(&q->lock, flags); } +void add_wait_queue_exclusive_lifo(wait_queue_head_t *q, wait_queue_t * wait) +{ + unsigned long flags; + + wq_write_lock_irqsave(&q->lock, flags); + wait->flags = WQ_FLAG_EXCLUSIVE; + __add_wait_queue(q, wait); + wq_write_unlock_irqrestore(&q->lock, flags); +} + void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait) { unsigned long flags; diff -urN v2.4.2-ac28/kernel/sched.c aio-v2.4.2-ac28/kernel/sched.c --- v2.4.2-ac28/kernel/sched.c Fri Mar 30 18:50:28 2001 +++ aio-v2.4.2-ac28/kernel/sched.c Fri Mar 30 18:53:42 2001 @@ -736,10 +736,19 @@ tmp = head->next; while (tmp != head) { unsigned int state; - wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); + wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); + wait_queue_func_t func; tmp = tmp->next; CHECK_MAGIC(curr->__magic); + func = curr->func; + if (func) { + unsigned flags = curr->flags; + func(curr); + if (flags & WQ_FLAG_EXCLUSIVE && !--nr_exclusive) + break; + continue; + } p = curr->task; state = p->state; if (state & mode) { diff -urN v2.4.2-ac28/kernel/softirq.c aio-v2.4.2-ac28/kernel/softirq.c --- v2.4.2-ac28/kernel/softirq.c Fri Dec 29 17:07:24 2000 +++ aio-v2.4.2-ac28/kernel/softirq.c Fri Mar 30 18:53:42 2001 @@ -311,6 +311,7 @@ data = p->data; wmb(); p->sync = 0; + smp_mb(); if (f) f(data); } diff -urN v2.4.2-ac28/mm/filemap.c aio-v2.4.2-ac28/mm/filemap.c --- v2.4.2-ac28/mm/filemap.c Fri Mar 30 18:50:28 2001 +++ aio-v2.4.2-ac28/mm/filemap.c Fri Mar 30 18:53:42 2001 @@ -21,12 +21,14 @@ #include #include #include +#include #include #include #include #include +#include /* * Shared mappings implemented 30.11.1994. It's not fully working yet, @@ -2694,3 +2696,729 @@ panic("Failed to allocate page hash table\n"); memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *)); } + +/* address_space_map + * Maps a series of pages from the page cache into the given array. 
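include/linux/worktodo.h above is the continuation glue the asynchronous pagecache code below is written in: record the next step with wtd_set_action(), then park the worktodo on a busy page or buffer; when it becomes available the action runs via schedule_task() rather than in a sleeping process. A rough sketch of the idiom (hypothetical names, not part of the patch):

#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/worktodo.h>

struct my_iostate {
	struct worktodo wtd;		/* must stay alive until the action runs */
	struct buffer_head *bh;
};

static void my_next_step(void *data)
{
	struct my_iostate *st = data;

	/* reached once st->bh has been unlocked */
	printk(KERN_DEBUG "bh %p ready, continuing\n", st->bh);
}

static void my_start(struct my_iostate *st)
{
	wtd_set_action(&st->wtd, my_next_step, st);
	wtd_wait_on_buffer(&st->wtd, st->bh);	/* does not sleep; the action
						 * runs when the buffer unlocks */
}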
+ */ +static int address_space_map(struct address_space *as, unsigned long index, + int nr, struct page **pages, + int *nr_newp, struct page **new_pages) +{ + struct page *cached_page = NULL; + int nr_new = 0; + int ret; + + ret = -EINVAL; + if (nr <= 0) + goto out; + + ret = 0; + + spin_lock(&pagecache_lock); + + while (nr > 0) { + struct page **hash = page_hash(as, index); + struct page *page; + + page = __find_page_nolock(as, index, *hash); + if (page) { + page_cache_get(page); +got_page: + pages[ret++] = page; + index++; + nr--; + continue; + } + + if (cached_page) { + __add_to_page_cache(cached_page, as, index, hash); + nr_new++; + *new_pages++ = page = cached_page; + cached_page = NULL; + goto got_page; + } + spin_unlock(&pagecache_lock); + + cached_page = page_cache_alloc(as); + if (!cached_page) + goto out; + + /* Okay, we now have an allocated page. Retry + * the search and add. */ + spin_lock(&pagecache_lock); + } + + spin_unlock(&pagecache_lock); + +out: + if (cached_page) + page_cache_free(cached_page); + + *nr_newp = nr_new; + return ret ? ret : -ENOMEM; +} + +struct iodesc { + struct worktodo wtd; + + struct page *good_page; /* the highest Uptodate page */ + int good_idx; + int err; + int did_read; + int rw; + + struct page **pages; + struct page **new_pages; + struct page **cur_pagep; + struct page **src_pagep; + int nr_pages; + int nr_new_pages; + + struct address_space *as; + struct file *file; + struct kiobuf *kiovec[8]; + int kio_nr; + + size_t size; + unsigned long transferred; + unsigned offset; + unsigned src_offset; + struct kiobuf *iobuf; + + int sync; + +#define READDESC_NR_DEF 3 + struct page *def_pages[READDESC_NR_DEF]; + struct page *def_new_pages[READDESC_NR_DEF]; +}; + +static void __iodesc_free(struct iodesc *io) +{ + int i; + + for (i=0; inr_pages; i++) + page_cache_release(io->pages[i]); + + if (io->new_pages != io->def_new_pages) + kfree(io->new_pages); + if (io->pages != io->def_pages) + kfree(io->pages); + kfree(io); +} + +/* By the time this function is called, all of the pages prior to + * the current good_idx have been released appropriately. The remaining + * duties are to release any remaining pages and to honour O_SYNC. 
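address_space_map() above works in units of page-cache pages, so a caller that starts from a byte position and length (as the rw_kiovec entry points declared in fs.h do) has to translate first. An illustrative helper for that translation (not part of the patch; the real generic_file_rw_kiovec caller is not shown in this excerpt):

#include <linux/pagemap.h>

static void byte_range_to_pages(loff_t pos, size_t size,
				unsigned long *index, unsigned *offset, int *nr)
{
	*index  = pos >> PAGE_CACHE_SHIFT;		/* first page to map */
	*offset = pos & (PAGE_CACHE_SIZE - 1);		/* byte offset within it */
	*nr     = (*offset + size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
}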
+ */
+static void __iodesc_finish_write(struct iodesc *io)
+{
+	int i;
+
+	pr_debug("__iodesc_finish_write(%p)\n", io);
+
+	if (WRITE == io->rw)
+		for (i=0; i<io->nr_pages; i++) {
+			struct page *page = io->pages[i];
+			UnlockPage(page);
+			deactivate_page(page);
+			//page_cache_release(page);
+		}
+
+	/* FIXME: this is buggy */
+	{
+		struct kiobuf *iobuf = io->kiovec[0];
+		iobuf->transferred = io->transferred;
+		iobuf->errno = io->err;
+		iobuf->end_io(iobuf);
+	}
+
+	__iodesc_free(io);
+}
+
+/* This is mostly ripped from generic_file_write */
+static int __iodesc_write_page(struct iodesc *io, struct page *page)
+{
+	unsigned long bytes;
+	unsigned long offset, src_offset;
+	struct page *src_page;
+	long status;
+	char *kaddr;
+	int src_bytes;
+	char *src;
+	int done = 0;
+	unsigned left;
+
+	src_bytes = PAGE_CACHE_SIZE - io->src_offset;
+	src_page = *io->src_pagep;
+	src = kmap(src_page) + io->src_offset;
+
+	offset = io->offset;
+	src_offset = io->src_offset;
+	kaddr = kmap(page);
+	kaddr += offset;
+
+	bytes = PAGE_CACHE_SIZE - offset;
+	if (io->size < bytes)
+		bytes = io->size;
+
+	pr_debug("__iodesc_write_page(%p (%lu), %lu %lu %lu)\n", page, page->index, offset, bytes, src_offset);
+
+	io->err = io->as->a_ops->prepare_write(io->file, page,
+				offset, offset + bytes);
+	if (io->err) {
+printk("prepare_write: %d\n", io->err);
+		goto unlock;
+	}
+
+	left = bytes;
+	for (;;) {
+		if (left < src_bytes)
+			src_bytes = left;
+
+		memcpy(kaddr, src, src_bytes);
+		kaddr += src_bytes;
+		src += src_bytes;
+		left -= src_bytes;
+		src_offset += src_bytes;
+		src_offset &= PAGE_SIZE - 1;
+		if (!src_offset)
+			io->src_pagep++;
+
+		if (left <= 0)
+			break;
+
+		if (!src_offset) {
+			kunmap(src_page);
+			src_page = *io->src_pagep;
+			src = kmap(src_page);
+			src_bytes = PAGE_SIZE;
+		}
+	}
+	flush_dcache_page(page);
+	status = io->as->a_ops->commit_write(io->file, page,
+				offset, offset+bytes);
+
+	/* We don't handle short writes */
+	if (status > 0 && status != bytes)
+		done = 1;
+
+	if (!status)
+		status = bytes;
+else
+printk("commit_write: %ld\n", status);
+
+	if (status > 0) {
+		io->transferred += status;
+		io->size -= status;
+		io->offset = (offset + status) & (PAGE_CACHE_SIZE - 1);
+
+		if (io->offset)
+			done = 1;
+
+		io->src_offset += status;
+		io->src_offset &= PAGE_CACHE_SIZE - 1;
+	} else {
+		io->err = status;
+		done = 1;
+	}
+
+unlock:
+	kunmap(page);
+	kunmap(src_page);
+
+	//UnlockPage(page);
+	//deactivate_page(page);
+	//page_cache_release(page);
+
+	return done;
+}
+
+void __iodesc_sync_wait_page(void *data)
+{
+	struct iodesc *io = data;
+
+	do {
+		struct buffer_head *bh, *head = io->pages[io->good_idx]->buffers;
+
+		if (!head)
+			continue;
+
+		bh = head;
+		do {
+			if (buffer_locked(bh)) {
+//printk("waiting on bh=%pi io=%p\n", bh, io);
+				wtd_wait_on_buffer(&io->wtd, bh);
+				return;
+			}
+			if (buffer_req(bh) && !buffer_uptodate(bh)) {
+//printk("io err bh=%p (%p)\n", bh, io);
+				io->err = -EIO;
+				break;
+			}
+		} while ((bh = bh->b_this_page) != head);
+	} while (!io->err && ++io->good_idx < io->nr_pages) ;
+
+//printk("finish_write(%p)\n", io);
+	__iodesc_finish_write(io);
+}
+
+static void __iodesc_do_write(void *data)
+{
+	struct iodesc *io = data;
+	unsigned i;
+
+	up(&io->file->f_dentry->d_inode->i_sem);
+
+	for (i=0; i<io->nr_pages; i++)
+		if (__iodesc_write_page(io, io->pages[i]))
+			break;
+
+	if (io->sync) {
+		io->good_idx = 0;
+
+//printk("writing out pages(%p)\n", io);
+		for (i=0; i<io->nr_pages; i++) {
+			if (io->pages[i]->buffers)
+				writeout_one_page(io->pages[i]);
+		}
+
+//printk("calling __iodesc_sync_wait_page(%p)\n", io);
+		wtd_set_action(&io->wtd, __iodesc_sync_wait_page, io);
+		__iodesc_sync_wait_page(io);
+		return;
+	}
+
+	__iodesc_finish_write(io);
+}
+
+static void __iodesc_write_lock_next_page(void *data)
+{
+	struct iodesc *io = data;
+	pr_debug("__iodesc_write_next_page(%p)\n", io);
+
+	while (io->good_idx < io->nr_pages) {
+		io->good_page = io->pages[io->good_idx++];
+		if (io->good_page == *io->cur_pagep)
+			io->cur_pagep++;
+		else {
+			wtd_lock_page(&io->wtd, io->good_page);
+			return;
+		}
+	}
+
+	//__iodesc_do_write(io);
+	wtd_set_action(&io->wtd, __iodesc_do_write, io);
+	wtd_queue(&io->wtd);
+}
+
+static
+void __generic_file_write_iodesc(struct iodesc *io)
+{
+	struct inode *inode = io->file->f_dentry->d_inode;
+	time_t now = CURRENT_TIME;
+
+	remove_suid(inode);
+	if (inode->i_ctime != now || inode->i_mtime != now) {
+		inode->i_ctime = inode->i_mtime = now;
+		mark_inode_dirty_sync(inode);
+	}
+
+	wtd_set_action(&io->wtd, __iodesc_write_lock_next_page, io);
+	io->sync = !!(io->file->f_flags & O_SYNC);
+	io->good_idx = 0;
+	io->cur_pagep = io->new_pages;
+	io->src_offset = io->kiovec[0]->offset;
+	io->src_pagep = io->kiovec[0]->maplist;
+	__iodesc_write_lock_next_page(io);
+}
+
+static void __iodesc_read_finish(struct iodesc *io)
+{
+	char *dst_addr, *src_addr;
+	int src_off, i;
+	size_t size;
+	size_t valid;
+
+	struct page **src_pagep;
+
+	pr_debug("__iodesc_read_finish: good_idx = %d\n", io->good_idx);
+	if (io->good_idx <= 0)
+		goto no_data;
+
+	size = io->size;
+	src_off = io->offset;
+	src_pagep = io->pages;
+	src_addr = kmap(*src_pagep);
+
+	valid = (size_t)io->good_idx << PAGE_CACHE_SHIFT;
+	valid -= src_off;
+	pr_debug("size=%d valid=%d src_off=%d\n", size, valid, src_off);
+
+	if (valid < size)
+		size = valid;
+
+	for (i=0; i<io->kio_nr; i++) {
+		struct kiobuf *iobuf = io->kiovec[i];
+		int dst_len = iobuf->length;
+		int dst_off = iobuf->offset;
+		struct page **dst_pagep = iobuf->maplist;
+
+		dst_addr = kmap(*dst_pagep);
+		iobuf->transferred = 0;
+
+		while (size > 0) {
+			int this = PAGE_CACHE_SIZE - src_off;
+			if ((PAGE_SIZE - dst_off) < this)
+				this = PAGE_SIZE - dst_off;
+			if (size < this)
+				this = size;
+			pr_debug("this=%d src_off=%d dst_off=%d dst_len=%d\n",
+				this, src_off, dst_off, dst_len);
+			memcpy(dst_addr + dst_off, src_addr + src_off, this);
+
+			src_off += this;
+			dst_off += this;
+			dst_len -= this;
+			size -= this;
+			iobuf->transferred += this;
+			pr_debug("read_finish: this=%d transferred=%d\n", this, iobuf->transferred);
+
+			if (dst_len <= 0)
+				break;
+
+			if (size <= 0)
+				break;
+
+			if (dst_off >= PAGE_SIZE) {
+				kunmap(*dst_pagep);
+				dst_pagep++;
+				dst_addr = kmap(*dst_pagep);
+				dst_off = 0;
+			}
+
+			if (src_off >= PAGE_SIZE) { /* FIXME: PAGE_CACHE_SIZE */
+				kunmap(*src_pagep);
+pr_debug("page(%lu)->count = %d\n", (*src_pagep)->index, atomic_read(&(*src_pagep)->count));
+				src_pagep++;
+				src_addr = kmap(*src_pagep);
+				src_off = 0;
+			}
+		}
+		kunmap(*dst_pagep);
+
+		iobuf->errno = iobuf->transferred ? 0 : io->err;
+		if (iobuf->errno && i)
+			iobuf->errno = -EAGAIN;
+		iobuf->end_io(iobuf);
+	}
+
+	kunmap(*src_pagep);
+	__iodesc_free(io);
+
+	return;
+
+no_data:
+	io->kiovec[0]->errno = io->err;
+	io->kiovec[0]->transferred = 0;
+	io->kiovec[0]->end_io(io->kiovec[0]);
+
+	for (i=1; i<io->kio_nr; i++) {
+		struct kiobuf *iobuf = io->kiovec[i];
+
+		iobuf->errno = -EAGAIN;
+		iobuf->transferred = 0;
+		iobuf->end_io(iobuf);
+	}
+	__iodesc_free(io);
+}
+
+static void __iodesc_make_uptodate(void *data)
+{
+	struct iodesc *io = data;
+	struct page *page = io->good_page;
+	int locked = 1;
+
+	pr_debug("__iodesc_make_uptodate: io=%p index=%lu\n", io, page->index);
+	while (Page_Uptodate(page)) {
+again:
+		pr_debug("page index %lu uptodate\n", page->index);
+		if (locked) {
+			UnlockPage(page);
+			locked = 0;
+		}
+		io->did_read = 0;
+		io->good_idx++;
+		if (io->good_idx >= io->nr_pages) {
+			__iodesc_read_finish(io);
+			return;
+		}
+		page = io->good_page = io->pages[io->good_idx];
+		pr_debug("__iodesc_make_uptodate: index=%lu\n", page->index);
+	}
+
+	if (!locked) {
+		wtd_lock_page(&io->wtd, page);
+		return;
+	}
+
+	if (!io->did_read) {
+		/* We haven't tried reading this page before, give it a go. */
+		printk("attempting to read %lu\n", page->index);
+		io->did_read = 1;
+		io->err = page->mapping->a_ops->readpage(io->file, page);
+		if (!io->err) {
+			if (Page_Uptodate(page))
+				goto again;
+			wtd_lock_page(&io->wtd, page);
+			return;
+		}
+	}
+
+	if (locked)
+		UnlockPage(page);
+
+	/* We've already read this page before.  Set err to EIO and quit */
+	if (!io->err)
+		io->err = -EIO;
+	__iodesc_read_finish(io);
+}
+
+static void __wtdgeneric_file_read_iodesc(void *data);
+
+static void __generic_file_read_iodesc(struct iodesc *io, int mayblock)
+{
+	int (*readpage)(struct file *, struct page *);
+	int i;
+
+	wtd_set_action(&io->wtd, __iodesc_make_uptodate, io);
+	readpage = io->as->a_ops->readpage;
+	for (i=0; i<io->nr_new_pages; i++) {
+		int foo;
+		if (!mayblock)
+			goto do_wtd;
+		foo = readpage(io->file, io->new_pages[i]);
+		if (foo)
+			printk(KERN_DEBUG "__generic_file_read_kiovec: readpage(%lu) = %d\n", io->new_pages[i]->index, foo);
+	}
+
+	for (i=0; i<io->nr_pages; i++) {
+		struct page *page = io->pages[i];
+		if (Page_Uptodate(page)) {
+			pr_debug("__generic_file_read_iodesc: %lu is uptodate\n", page->index);
+			continue;
+		}
+
+		if (!mayblock)
+			goto do_wtd;
+		if (!TryLockPage(page)) {
+			int foo = readpage(io->file, page);
+			if (foo)
+				printk(KERN_DEBUG "__generic_file_read_iodesc: readpage(%lu): %d\n", page->index, foo);
+		}
+
+		if (!Page_Uptodate(page) && io->good_idx == -1) {
+			pr_debug("first good_idx=%d (%lu)\n", i, page->index);
+			io->good_idx = i;
+			io->good_page = page;
+		}
+	}
+
+	/* Whee, all the pages are uptodate! */
+	if (!io->good_page) {
+		do {static int zoo; if (!mayblock && zoo++ < 5) printk("all uptodate\n");} while(0);
+		pr_debug("all pages uptodate!\n");
+		io->good_idx = io->nr_pages;
+		__iodesc_read_finish(io);
+		return;
+	}
+
+	pr_debug("locking good_page\n");
+	wtd_lock_page(&io->wtd, io->good_page);
+	return;
+
+do_wtd:
+	do {static int zoo; if (zoo++ < 5) printk("read sleep\n");} while(0);
+	wtd_set_action(&io->wtd, __wtdgeneric_file_read_iodesc, io);
+	wtd_queue(&io->wtd);
+}
+
+static void __wtdgeneric_file_read_iodesc(void *data)
+{
+	struct iodesc *io = data;
+	__generic_file_read_iodesc(io, 1);
+}
+
+int generic_file_rw_kiovec(struct file *file, int rw,
+	int kio_nr, struct kiobuf **kiovec, int flags, size_t size, loff_t pos)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct address_space *as = inode->i_mapping;
+	unsigned long index;
+	unsigned long eindex;
+	unsigned long nr_pages;
+	struct iodesc *io = NULL;
+	int ret;
+
+	ret = -EINVAL;
+	if (rw != READ && rw != WRITE)
+		goto out;
+
+	ret = -ENOMEM;
+	io = kmalloc(sizeof(*io), GFP_KERNEL);
+	if (!io)
+		goto out;
+
+	memset(io, 0, sizeof(*io));
+	io->size = size;
+
+	if (READ == rw) {
+		pr_debug("pos=%Ld i_size=%Ld\n", pos, inode->i_size);
+
+		if (pos > inode->i_size)
+			size = 0;
+		else if ((pos + size) > inode->i_size)
+			size = inode->i_size - pos;
+
+		if (io->size < size)
+			size = io->size;
+		else if (size < io->size)
+			io->size = size;
+
+		pr_debug("io->size=%d size=%d\n", io->size, size);
+	}
+
+	index = pos >> PAGE_CACHE_SHIFT;
+	eindex = (pos + size - 1) >> PAGE_CACHE_SHIFT;
+	nr_pages = eindex - index + 1;
+
+	pr_debug("nr_pages: %lu\n", nr_pages);
+
+	io->good_idx = -1;
+	io->good_page = NULL;
+	io->did_read = 0;
+	io->err = 0;
+	io->rw = rw;
+	io->as = as;
+	io->offset = (unsigned long)pos & (PAGE_CACHE_SIZE - 1);
+	io->file = file;
+	io->kio_nr = kio_nr;
+	if (kio_nr > 8)
+		BUG();
+	memcpy(io->kiovec, kiovec, sizeof(struct kiobuf *) * kio_nr);
+	if (nr_pages < READDESC_NR_DEF) {
+		io->pages = io->def_pages;
+		io->new_pages = io->def_new_pages;
+	} else {
+		io->pages = kmalloc(sizeof(*io->pages) * (nr_pages + 1), GFP_KERNEL);
+		if (!io->pages)
+			goto out_io;
+
+		io->new_pages = kmalloc(sizeof(*io->new_pages) * (nr_pages + 1), GFP_KERNEL);
+		if (!io->new_pages)
+			goto out_pages;
+	}
+
+	/* FIXME: make the down a WTD_op */
+	if (rw == WRITE)
+		down(&io->file->f_dentry->d_inode->i_sem);
+
+	ret = address_space_map(as, index, nr_pages, io->pages,
+			&io->nr_new_pages, io->new_pages);
+	pr_debug("as_map: %d (%d new)\n", ret, io->nr_new_pages);
+	if (ret <= 0)
+		goto out_new_pages;
+
+	io->nr_pages = ret;
+	io->pages[io->nr_pages] = NULL;
+	io->new_pages[io->nr_new_pages] = NULL;
+
+	if (rw == READ)
+		__generic_file_read_iodesc(io, 0);
+	else if (rw == WRITE)
+		__generic_file_write_iodesc(io);
+
+	return 0;
+
+out_new_pages:
+	if (io->new_pages != io->def_new_pages)
+		kfree(io->new_pages);
+out_pages:
+	if (io->pages != io->def_pages)
+		kfree(io->pages);
+out_io:
+	kfree(io);
+out:
+	return ret;
+}
+
+static void __wtd_lock_page_waiter(wait_queue_t *wait)
+{
+	struct worktodo *wtd = (struct worktodo *)wait;
+	struct page *page = (struct page *)wtd->data;
+
+	if (!TryLockPage(page)) {
+		__remove_wait_queue(&page->wait, &wtd->wait);
+		wtd_queue(wtd);
+	} else {
+		schedule_task(&run_disk_tq);
+	}
+}
+
+void wtd_lock_page(struct worktodo *wtd, struct page *page)
+{
+	if (TryLockPage(page)) {
+		int raced = 0;
+		wtd->data = page;
+		init_waitqueue_func_entry(&wtd->wait, __wtd_lock_page_waiter);
+		add_wait_queue_cond(&page->wait, &wtd->wait, TryLockPage(page), raced = 1);
+
+		if (!raced) {
+			run_task_queue(&tq_disk);
+			return;
+		}
+	}
+
+	wtd->tq.routine(wtd->tq.data);
+}
+
+static void __wtd_bh_waiter(wait_queue_t *wait)
+{
+	struct worktodo *wtd = (struct worktodo *)wait;
+	struct buffer_head *bh = (struct buffer_head *)wtd->data;
+
+	if (!buffer_locked(bh)) {
+		__remove_wait_queue(&bh->b_wait, &wtd->wait);
+		wtd_queue(wtd);
+	} else {
+		schedule_task(&run_disk_tq);
+	}
+}
+
+void wtd_wait_on_buffer(struct worktodo *wtd, struct buffer_head *bh)
+{
+	int raced = 0;
+
+	if (!buffer_locked(bh)) {
+		wtd->tq.routine(wtd->tq.data);
+		return;
+	}
+	wtd->data = bh;
+	init_waitqueue_func_entry(&wtd->wait, __wtd_bh_waiter);
+	add_wait_queue_cond(&bh->b_wait, &wtd->wait, buffer_locked(bh), raced = 1);
+
+	if (raced)
+		wtd->tq.routine(wtd->tq.data);
+	else
+		run_task_queue(&tq_disk);
+}
+
+void do_run_tq_disk(void *data)
+{
+	run_task_queue(&tq_disk);
+}
+
+struct tq_struct run_disk_tq = {
+	routine: do_run_tq_disk,
+	data: NULL
+};
+
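Note (not part of the patch): the entry point this chunk adds is generic_file_rw_kiovec(),
which a page-cache backed filesystem can expose through the new rw_kiovec member of
struct file_operations so the aio core can drive its reads and writes asynchronously.
A minimal sketch of that wiring is below, assuming a filesystem whose data path already
uses the generic page-cache helpers; only rw_kiovec and generic_file_rw_kiovec() come
from this patch, the rest of the entries are illustrative.

#include <linux/fs.h>

/* Sketch only: hook the asynchronous kiovec path into an otherwise
 * ordinary set of file operations.  generic_file_read/write/mmap are
 * the stock 2.4 helpers; rw_kiovec is the hook introduced by this patch. */
static struct file_operations example_aio_file_operations = {
	read:		generic_file_read,
	write:		generic_file_write,
	mmap:		generic_file_mmap,
	rw_kiovec:	generic_file_rw_kiovec,
};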