arch/i386/Kconfig | 4 arch/i386/kernel/semaphore.c | 185 +----------- arch/um/Kconfig_i386 | 4 arch/um/Kconfig_x86_64 | 4 arch/x86_64/Kconfig | 4 arch/x86_64/kernel/Makefile | 2 arch/x86_64/kernel/semaphore.c | 180 ----------- arch/x86_64/lib/thunk.S | 1 description | 5 drivers/usb/gadget/inode.c | 7 fs/aio.c | 623 +++++++++++++++++++++++++++++++++++------ fs/bad_inode.c | 2 fs/block_dev.c | 9 fs/buffer.c | 2 fs/ext2/file.c | 2 fs/ext3/file.c | 16 - fs/inode.c | 2 fs/jfs/file.c | 2 fs/ntfs/file.c | 2 fs/pipe.c | 176 +++++++++-- fs/read_write.c | 182 +++++++---- fs/reiserfs/file.c | 10 include/asm-i386/semaphore.h | 42 ++ include/asm-x86_64/semaphore.h | 44 ++ include/linux/aio.h | 37 ++ include/linux/aio_abi.h | 13 include/linux/fs.h | 9 include/linux/net.h | 4 include/linux/pagemap.h | 29 + include/linux/sched.h | 13 include/linux/wait.h | 42 ++ include/linux/writeback.h | 2 kernel/exit.c | 2 kernel/fork.c | 7 kernel/sched.c | 14 kernel/wait.c | 40 +- lib/Makefile | 1 lib/semaphore-sleepers.c | 253 ++++++++++++++++ mm/filemap.c | 164 +++++++--- net/socket.c | 78 +++-- 41 files changed, 1567 insertions(+), 652 deletions(-) diff -purN 00_v2.6.13-rc6/arch/i386/Kconfig test-84.diff/arch/i386/Kconfig --- 00_v2.6.13-rc6/arch/i386/Kconfig 2005-08-08 15:45:47.000000000 -0400 +++ test-84.diff/arch/i386/Kconfig 2005-08-08 17:15:28.000000000 -0400 @@ -14,6 +14,10 @@ config X86 486, 586, Pentiums, and various instruction-set-compatible chips by AMD, Cyrix, and others. +config SEMAPHORE_SLEEPERS + bool + default y + config MMU bool default y diff -purN 00_v2.6.13-rc6/arch/i386/kernel/semaphore.c test-84.diff/arch/i386/kernel/semaphore.c --- 00_v2.6.13-rc6/arch/i386/kernel/semaphore.c 2005-06-20 13:33:10.000000000 -0400 +++ test-84.diff/arch/i386/kernel/semaphore.c 2005-08-08 17:15:30.000000000 -0400 @@ -13,171 +13,10 @@ * rw semaphores implemented November 1999 by Benjamin LaHaise */ #include -#include -#include -#include +#include #include /* - * Semaphores are implemented using a two-way counter: - * The "count" variable is decremented for each process - * that tries to acquire the semaphore, while the "sleeping" - * variable is a count of such acquires. - * - * Notably, the inline "up()" and "down()" functions can - * efficiently test if they need to do any extra work (up - * needs to do something only if count was negative before - * the increment operation. - * - * "sleeping" and the contention routine ordering is protected - * by the spinlock in the semaphore's waitqueue head. - * - * Note that these functions are only called when there is - * contention on the lock, and as such all this is the - * "non-critical" part of the whole semaphore business. The - * critical part is the inline stuff in - * where we want to avoid any extra jumps and calls. - */ - -/* - * Logic: - * - only on a boundary condition do we need to care. When we go - * from a negative count to a non-negative, we wake people up. - * - when we go from a non-negative count to a negative do we - * (a) synchronize with the "sleeper" count and (b) make sure - * that we're on the wakeup list before we synchronize so that - * we cannot lose wakeup events. 
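(Aside: the two-way counter described above is easier to see in isolation. Below is a small user-space C model of the two pieces it relies on, atomic_add_negative() and the sleeper-folding step. The names mirror the kernel's for readability, but this is only an illustration of the algorithm that this patch consolidates into lib/semaphore-sleepers.c per the diffstat, not the patch code itself.)

        #include <stdatomic.h>
        #include <stdbool.h>

        /* Model of the kernel's atomic_add_negative(): atomically add i to *v
         * and report whether the result went negative. */
        static bool atomic_add_negative(int i, atomic_int *v)
        {
                return atomic_fetch_add(v, i) + i < 0;
        }

        struct sem_model {
                atomic_int count;       /* fast path: down() decrements, up() increments */
                int sleepers;           /* contended acquirers, protected by the waitqueue lock */
        };

        /* One pass of the contention loop: fold "everybody else" back into
         * count.  A non-negative result means we own the semaphore; otherwise
         * we remain recorded as the single sleeper and the caller schedules. */
        static bool sleeper_try_to_take(struct sem_model *s)
        {
                int sleepers = s->sleepers;

                if (!atomic_add_negative(sleepers - 1, &s->count)) {
                        s->sleepers = 0;
                        return true;
                }
                s->sleepers = 1;        /* us - see the -1 above */
                return false;
        }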
- */ - -static fastcall void __attribute_used__ __up(struct semaphore *sem) -{ - wake_up(&sem->wait); -} - -static fastcall void __attribute_used__ __sched __down(struct semaphore * sem) -{ - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - unsigned long flags; - - tsk->state = TASK_UNINTERRUPTIBLE; - spin_lock_irqsave(&sem->wait.lock, flags); - add_wait_queue_exclusive_locked(&sem->wait, &wait); - - sem->sleepers++; - for (;;) { - int sleepers = sem->sleepers; - - /* - * Add "everybody else" into it. They aren't - * playing, because we own the spinlock in - * the wait_queue_head. - */ - if (!atomic_add_negative(sleepers - 1, &sem->count)) { - sem->sleepers = 0; - break; - } - sem->sleepers = 1; /* us - see -1 above */ - spin_unlock_irqrestore(&sem->wait.lock, flags); - - schedule(); - - spin_lock_irqsave(&sem->wait.lock, flags); - tsk->state = TASK_UNINTERRUPTIBLE; - } - remove_wait_queue_locked(&sem->wait, &wait); - wake_up_locked(&sem->wait); - spin_unlock_irqrestore(&sem->wait.lock, flags); - tsk->state = TASK_RUNNING; -} - -static fastcall int __attribute_used__ __sched __down_interruptible(struct semaphore * sem) -{ - int retval = 0; - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - unsigned long flags; - - tsk->state = TASK_INTERRUPTIBLE; - spin_lock_irqsave(&sem->wait.lock, flags); - add_wait_queue_exclusive_locked(&sem->wait, &wait); - - sem->sleepers++; - for (;;) { - int sleepers = sem->sleepers; - - /* - * With signals pending, this turns into - * the trylock failure case - we won't be - * sleeping, and we* can't get the lock as - * it has contention. Just correct the count - * and exit. - */ - if (signal_pending(current)) { - retval = -EINTR; - sem->sleepers = 0; - atomic_add(sleepers, &sem->count); - break; - } - - /* - * Add "everybody else" into it. They aren't - * playing, because we own the spinlock in - * wait_queue_head. The "-1" is because we're - * still hoping to get the semaphore. - */ - if (!atomic_add_negative(sleepers - 1, &sem->count)) { - sem->sleepers = 0; - break; - } - sem->sleepers = 1; /* us - see -1 above */ - spin_unlock_irqrestore(&sem->wait.lock, flags); - - schedule(); - - spin_lock_irqsave(&sem->wait.lock, flags); - tsk->state = TASK_INTERRUPTIBLE; - } - remove_wait_queue_locked(&sem->wait, &wait); - wake_up_locked(&sem->wait); - spin_unlock_irqrestore(&sem->wait.lock, flags); - - tsk->state = TASK_RUNNING; - return retval; -} - -/* - * Trylock failed - make sure we correct for - * having decremented the count. - * - * We could have done the trylock with a - * single "cmpxchg" without failure cases, - * but then it wouldn't work on a 386. - */ -static fastcall int __attribute_used__ __down_trylock(struct semaphore * sem) -{ - int sleepers; - unsigned long flags; - - spin_lock_irqsave(&sem->wait.lock, flags); - sleepers = sem->sleepers + 1; - sem->sleepers = 0; - - /* - * Add "everybody else" and us into it. They aren't - * playing, because we own the spinlock in the - * wait_queue_head. - */ - if (!atomic_add_negative(sleepers, &sem->count)) { - wake_up_locked(&sem->wait); - } - - spin_unlock_irqrestore(&sem->wait.lock, flags); - return 1; -} - - -/* * The semaphore operations have a special calling sequence that * allow us to do a simpler in-line version of them. 
These routines * need to convert that sequence back into the C sequence when @@ -211,6 +50,28 @@ asm( asm( ".section .sched.text\n" ".align 4\n" +".globl __aio_down_failed\n" +"__aio_down_failed:\n\t" +#if defined(CONFIG_FRAME_POINTER) + "pushl %ebp\n\t" + "movl %esp,%ebp\n\t" +#endif + "pushl %edx\n\t" + "pushl %ecx\n\t" + "call __aio_down\n\t" + "popl %ecx\n\t" + "popl %edx\n\t" +#if defined(CONFIG_FRAME_POINTER) + "movl %ebp,%esp\n\t" + "popl %ebp\n\t" +#endif + "ret" +); +EXPORT_SYMBOL(__aio_down_failed); + +asm( +".section .sched.text\n" +".align 4\n" ".globl __down_failed_interruptible\n" "__down_failed_interruptible:\n\t" #if defined(CONFIG_FRAME_POINTER) diff -purN 00_v2.6.13-rc6/arch/um/Kconfig_i386 test-84.diff/arch/um/Kconfig_i386 --- 00_v2.6.13-rc6/arch/um/Kconfig_i386 2005-08-04 15:54:42.000000000 -0400 +++ test-84.diff/arch/um/Kconfig_i386 2005-08-08 17:15:28.000000000 -0400 @@ -6,6 +6,10 @@ config 64BIT bool default n +config SEMAPHORE_SLEEPERS + bool + default y + config TOP_ADDR hex default 0xc0000000 if !HOST_2G_2G diff -purN 00_v2.6.13-rc6/arch/um/Kconfig_x86_64 test-84.diff/arch/um/Kconfig_x86_64 --- 00_v2.6.13-rc6/arch/um/Kconfig_x86_64 2005-08-04 15:54:42.000000000 -0400 +++ test-84.diff/arch/um/Kconfig_x86_64 2005-08-08 17:15:28.000000000 -0400 @@ -6,6 +6,10 @@ config 64BIT bool default y +config SEMAPHORE_SLEEPERS + bool + default y + config TOP_ADDR hex default 0x80000000 diff -purN 00_v2.6.13-rc6/arch/x86_64/Kconfig test-84.diff/arch/x86_64/Kconfig --- 00_v2.6.13-rc6/arch/x86_64/Kconfig 2005-08-04 15:54:44.000000000 -0400 +++ test-84.diff/arch/x86_64/Kconfig 2005-08-08 17:15:28.000000000 -0400 @@ -24,6 +24,10 @@ config X86 bool default y +config SEMAPHORE_SLEEPERS + bool + default y + config MMU bool default y diff -purN 00_v2.6.13-rc6/arch/x86_64/kernel/Makefile test-84.diff/arch/x86_64/kernel/Makefile --- 00_v2.6.13-rc6/arch/x86_64/kernel/Makefile 2005-08-04 15:54:44.000000000 -0400 +++ test-84.diff/arch/x86_64/kernel/Makefile 2005-08-08 17:15:28.000000000 -0400 @@ -4,7 +4,7 @@ extra-y := head.o head64.o init_task.o vmlinux.lds EXTRA_AFLAGS := -traditional -obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \ +obj-y := process.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ x8664_ksyms.o i387.o syscall.o vsyscall.o \ setup64.o bootflag.o e820.o reboot.o quirks.o diff -purN 00_v2.6.13-rc6/arch/x86_64/kernel/semaphore.c test-84.diff/arch/x86_64/kernel/semaphore.c --- 00_v2.6.13-rc6/arch/x86_64/kernel/semaphore.c 2005-06-20 13:33:15.000000000 -0400 +++ test-84.diff/arch/x86_64/kernel/semaphore.c 1969-12-31 19:00:00.000000000 -0500 @@ -1,180 +0,0 @@ -/* - * x86_64 semaphore implementation. - * - * (C) Copyright 1999 Linus Torvalds - * - * Portions Copyright 1999 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * rw semaphores implemented November 1999 by Benjamin LaHaise - */ -#include -#include -#include -#include - -#include - -/* - * Semaphores are implemented using a two-way counter: - * The "count" variable is decremented for each process - * that tries to acquire the semaphore, while the "sleeping" - * variable is a count of such acquires. 
- * - * Notably, the inline "up()" and "down()" functions can - * efficiently test if they need to do any extra work (up - * needs to do something only if count was negative before - * the increment operation. - * - * "sleeping" and the contention routine ordering is protected - * by the spinlock in the semaphore's waitqueue head. - * - * Note that these functions are only called when there is - * contention on the lock, and as such all this is the - * "non-critical" part of the whole semaphore business. The - * critical part is the inline stuff in - * where we want to avoid any extra jumps and calls. - */ - -/* - * Logic: - * - only on a boundary condition do we need to care. When we go - * from a negative count to a non-negative, we wake people up. - * - when we go from a non-negative count to a negative do we - * (a) synchronize with the "sleeper" count and (b) make sure - * that we're on the wakeup list before we synchronize so that - * we cannot lose wakeup events. - */ - -void __up(struct semaphore *sem) -{ - wake_up(&sem->wait); -} - -void __sched __down(struct semaphore * sem) -{ - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - unsigned long flags; - - tsk->state = TASK_UNINTERRUPTIBLE; - spin_lock_irqsave(&sem->wait.lock, flags); - add_wait_queue_exclusive_locked(&sem->wait, &wait); - - sem->sleepers++; - for (;;) { - int sleepers = sem->sleepers; - - /* - * Add "everybody else" into it. They aren't - * playing, because we own the spinlock in - * the wait_queue_head. - */ - if (!atomic_add_negative(sleepers - 1, &sem->count)) { - sem->sleepers = 0; - break; - } - sem->sleepers = 1; /* us - see -1 above */ - spin_unlock_irqrestore(&sem->wait.lock, flags); - - schedule(); - - spin_lock_irqsave(&sem->wait.lock, flags); - tsk->state = TASK_UNINTERRUPTIBLE; - } - remove_wait_queue_locked(&sem->wait, &wait); - wake_up_locked(&sem->wait); - spin_unlock_irqrestore(&sem->wait.lock, flags); - tsk->state = TASK_RUNNING; -} - -int __sched __down_interruptible(struct semaphore * sem) -{ - int retval = 0; - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - unsigned long flags; - - tsk->state = TASK_INTERRUPTIBLE; - spin_lock_irqsave(&sem->wait.lock, flags); - add_wait_queue_exclusive_locked(&sem->wait, &wait); - - sem->sleepers++; - for (;;) { - int sleepers = sem->sleepers; - - /* - * With signals pending, this turns into - * the trylock failure case - we won't be - * sleeping, and we* can't get the lock as - * it has contention. Just correct the count - * and exit. - */ - if (signal_pending(current)) { - retval = -EINTR; - sem->sleepers = 0; - atomic_add(sleepers, &sem->count); - break; - } - - /* - * Add "everybody else" into it. They aren't - * playing, because we own the spinlock in - * wait_queue_head. The "-1" is because we're - * still hoping to get the semaphore. - */ - if (!atomic_add_negative(sleepers - 1, &sem->count)) { - sem->sleepers = 0; - break; - } - sem->sleepers = 1; /* us - see -1 above */ - spin_unlock_irqrestore(&sem->wait.lock, flags); - - schedule(); - - spin_lock_irqsave(&sem->wait.lock, flags); - tsk->state = TASK_INTERRUPTIBLE; - } - remove_wait_queue_locked(&sem->wait, &wait); - wake_up_locked(&sem->wait); - spin_unlock_irqrestore(&sem->wait.lock, flags); - - tsk->state = TASK_RUNNING; - return retval; -} - -/* - * Trylock failed - make sure we correct for - * having decremented the count. - * - * We could have done the trylock with a - * single "cmpxchg" without failure cases, - * but then it wouldn't work on a 386. 
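(The cmpxchg alternative that the comment above rules out for 386-class CPUs needs no fixup path at all, because a failed compare-and-swap never decrements the count. A minimal sketch in user-space C11, purely for illustration:)

        #include <stdatomic.h>
        #include <stdbool.h>

        static bool sem_trylock_cmpxchg(atomic_int *count)
        {
                int old = atomic_load(count);

                while (old > 0) {
                        /* on failure, 'old' is reloaded with the current value */
                        if (atomic_compare_exchange_weak(count, &old, old - 1))
                                return true;    /* acquired; nothing to undo */
                }
                return false;                   /* contended; count never went negative */
        }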
- */ -int __down_trylock(struct semaphore * sem) -{ - int sleepers; - unsigned long flags; - - spin_lock_irqsave(&sem->wait.lock, flags); - sleepers = sem->sleepers + 1; - sem->sleepers = 0; - - /* - * Add "everybody else" and us into it. They aren't - * playing, because we own the spinlock in the - * wait_queue_head. - */ - if (!atomic_add_negative(sleepers, &sem->count)) { - wake_up_locked(&sem->wait); - } - - spin_unlock_irqrestore(&sem->wait.lock, flags); - return 1; -} - - diff -purN 00_v2.6.13-rc6/arch/x86_64/lib/thunk.S test-84.diff/arch/x86_64/lib/thunk.S --- 00_v2.6.13-rc6/arch/x86_64/lib/thunk.S 2004-12-24 16:34:44.000000000 -0500 +++ test-84.diff/arch/x86_64/lib/thunk.S 2005-08-08 17:15:30.000000000 -0400 @@ -47,6 +47,7 @@ thunk __down_failed,__down thunk_retrax __down_failed_interruptible,__down_interruptible thunk_retrax __down_failed_trylock,__down_trylock + thunk_retrax __aio_down_failed,__aio_down thunk __up_wakeup,__up /* SAVE_ARGS below is used only for the .cfi directives it contains. */ diff -purN 00_v2.6.13-rc6/description test-84.diff/description --- 00_v2.6.13-rc6/description 1969-12-31 19:00:00.000000000 -0500 +++ test-84.diff/description 2005-08-09 20:12:11.000000000 -0400 @@ -0,0 +1,5 @@ +[AIO] add thread based fallback for aio_{read,write}v + +Add thread based fallbacks for aio readv and writev operations. + +Signed-off-by: Benjamin LaHaise diff -purN 00_v2.6.13-rc6/drivers/usb/gadget/inode.c test-84.diff/drivers/usb/gadget/inode.c --- 00_v2.6.13-rc6/drivers/usb/gadget/inode.c 2005-08-04 15:55:46.000000000 -0400 +++ test-84.diff/drivers/usb/gadget/inode.c 2005-08-08 17:15:34.000000000 -0400 @@ -540,7 +540,6 @@ static int ep_aio_cancel(struct kiocb *i local_irq_disable(); epdata = priv->epdata; // spin_lock(&epdata->dev->lock); - kiocbSetCancelled(iocb); if (likely(epdata && epdata->ep && priv->req)) value = usb_ep_dequeue (epdata->ep, priv->req); else @@ -548,7 +547,6 @@ static int ep_aio_cancel(struct kiocb *i // spin_unlock(&epdata->dev->lock); local_irq_enable(); - aio_put_req(iocb); return value; } @@ -586,10 +584,7 @@ static void ep_aio_complete(struct usb_e kfree(priv); iocb->private = NULL; /* aio_complete() reports bytes-transferred _and_ faults */ - if (unlikely(kiocbIsCancelled(iocb))) - aio_put_req(iocb); - else - aio_complete(iocb, + aio_complete(iocb, req->actual ? 
req->actual : req->status, req->status); } else { diff -purN 00_v2.6.13-rc6/fs/aio.c test-84.diff/fs/aio.c --- 00_v2.6.13-rc6/fs/aio.c 2005-08-04 15:55:50.000000000 -0400 +++ test-84.diff/fs/aio.c 2005-08-11 21:35:40.000000000 -0400 @@ -29,6 +29,9 @@ #include #include #include +#include +#include +#include #include #include @@ -59,6 +62,7 @@ static LIST_HEAD(fput_head); static void aio_kick_handler(void *); static void aio_queue_work(struct kioctx *); +static long aio_cancel_kiocb(struct kiocb *iocb, struct io_event __user *event); /* aio_setup * Creates the slab caches used by the aio routines, panic on @@ -135,7 +139,7 @@ static int aio_setup_ring(struct kioctx 0); if (IS_ERR((void *)info->mmap_base)) { up_write(&ctx->mm->mmap_sem); - printk("mmap err: %ld\n", -info->mmap_base); + dprintk("mmap err: %ld\n", -info->mmap_base); info->mmap_size = 0; aio_free_ring(ctx); return -EAGAIN; @@ -268,22 +272,20 @@ out_freectx: */ static void aio_cancel_all(struct kioctx *ctx) { - int (*cancel)(struct kiocb *, struct io_event *); - struct io_event res; + dprintk("aio_cancel_all(%p)\n", ctx); spin_lock_irq(&ctx->ctx_lock); ctx->dead = 1; while (!list_empty(&ctx->active_reqs)) { struct list_head *pos = ctx->active_reqs.next; struct kiocb *iocb = list_kiocb(pos); list_del_init(&iocb->ki_list); - cancel = iocb->ki_cancel; kiocbSetCancelled(iocb); - if (cancel) { - iocb->ki_users++; - spin_unlock_irq(&ctx->ctx_lock); - cancel(iocb, &res); - spin_lock_irq(&ctx->ctx_lock); - } + iocb->ki_users++; + spin_unlock_irq(&ctx->ctx_lock); + dprintk("aio_cancel_all(%p)\n", iocb); + aio_cancel_kiocb(iocb, NULL); + aio_put_req(iocb); + spin_lock_irq(&ctx->ctx_lock); } spin_unlock_irq(&ctx->ctx_lock); } @@ -404,7 +406,9 @@ static struct kiocb fastcall *__aio_get_ req->ki_cancel = NULL; req->ki_retry = NULL; req->ki_dtor = NULL; + req->ki_signo = 0; req->private = NULL; + req->ki_iovec = NULL; INIT_LIST_HEAD(&req->ki_run_list); /* Check if the completion queue has enough free space to @@ -448,9 +452,11 @@ static inline void really_put_req(struct { if (req->ki_dtor) req->ki_dtor(req); + kfree(req->ki_iovec); kmem_cache_free(kiocb_cachep, req); ctx->reqs_active--; + dprintk("really_put_req(%p) active=%d\n", req, ctx->reqs_active); if (unlikely(!ctx->reqs_active && ctx->dead)) wake_up(&ctx->wait); } @@ -492,6 +498,7 @@ static int __aio_put_req(struct kioctx * BUG(); if (likely(req->ki_users)) return 0; + BUG_ON(!list_empty(&req->ki_wait.wait.task_list)); list_del(&req->ki_list); /* remove from active_reqs */ req->ki_cancel = NULL; req->ki_retry = NULL; @@ -546,6 +553,26 @@ struct kioctx *lookup_ioctx(unsigned lon return ioctx; } +static int lock_kiocb_action(void *param, wait_queue_t *wait) +{ + schedule(); + return 0; +} + +static inline void lock_kiocb(struct kiocb *iocb) +{ + dprintk("lock_kiocb(%p) %p\n", iocb, current); + wait_on_bit_lock(&iocb->ki_flags, KIF_LOCKED, lock_kiocb_action, + TASK_UNINTERRUPTIBLE); +} + +static inline void unlock_kiocb(struct kiocb *iocb) +{ + dprintk("unlock_kiocb(%p) %p\n", iocb, current); + kiocbClearLocked(iocb); + wake_up_bit(&iocb->ki_flags, KIF_LOCKED); +} + /* * use_mm * Makes the calling kernel thread take on the specified @@ -649,7 +676,7 @@ static ssize_t aio_run_iocb(struct kiocb ssize_t ret; if (iocb->ki_retried++ > 1024*1024) { - printk("Maximal retry count. Bytes done %Zd\n", + printk(KERN_DEBUG "Maximal retry count. 
Bytes done %Zd\n", iocb->ki_nbytes - iocb->ki_left); return -EAGAIN; } @@ -660,7 +687,7 @@ static ssize_t aio_run_iocb(struct kiocb } if (!(retry = iocb->ki_retry)) { - printk("aio_run_iocb: iocb->ki_retry = NULL\n"); + printk(KERN_DEBUG "aio_run_iocb: iocb->ki_retry = NULL\n"); return 0; } @@ -712,14 +739,14 @@ static ssize_t aio_run_iocb(struct kiocb * cause the iocb to be kicked for continuation (through * the aio_wake_function callback). */ - BUG_ON(current->io_wait != NULL); - current->io_wait = &iocb->ki_wait; + BUG_ON(!is_sync_wait(current->io_wait)); + current->io_wait = &iocb->ki_wait.wait; ret = retry(iocb); - current->io_wait = NULL; + current->io_wait = ¤t->__wait.wait; if (-EIOCBRETRY != ret) { if (-EIOCBQUEUED != ret) { - BUG_ON(!list_empty(&iocb->ki_wait.task_list)); + BUG_ON(!list_empty(&iocb->ki_wait.wait.task_list)); aio_complete(iocb, ret, 0); /* must not access the iocb after this */ } @@ -728,7 +755,7 @@ static ssize_t aio_run_iocb(struct kiocb * Issue an additional retry to avoid waiting forever if * no waits were queued (e.g. in case of a short read). */ - if (list_empty(&iocb->ki_wait.task_list)) + if (list_empty(&iocb->ki_wait.wait.task_list)) kiocbSetKicked(iocb); } out: @@ -782,7 +809,9 @@ static int __aio_run_iocbs(struct kioctx * Hold an extra reference while retrying i/o. */ iocb->ki_users++; /* grab extra reference */ + lock_kiocb(iocb); aio_run_iocb(iocb); + unlock_kiocb(iocb); if (__aio_put_req(ctx, iocb)) /* drop extra ref */ put_ioctx(ctx); } @@ -879,7 +908,7 @@ static void queue_kicked_iocb(struct kio unsigned long flags; int run = 0; - WARN_ON((!list_empty(&iocb->ki_wait.task_list))); + WARN_ON((!list_empty(&iocb->ki_wait.wait.task_list))); spin_lock_irqsave(&ctx->ctx_lock, flags); run = __queue_kicked_iocb(iocb); @@ -912,6 +941,97 @@ void fastcall kick_iocb(struct kiocb *io } EXPORT_SYMBOL(kick_iocb); +static void __aio_send_signal(struct kiocb *iocb) +{ + struct siginfo info; + struct task_struct *p; + unsigned long flags; + int ret = -1; + + memset(&info, 0, sizeof(struct siginfo)); + + info.si_signo = iocb->ki_signo; + info.si_errno = 0; + info.si_code = SI_ASYNCIO; + info.si_pid = 0; + info.si_uid = 0; + info.si_value = iocb->ki_sigev_value; + + read_lock(&tasklist_lock); + p = find_task_by_pid(iocb->ki_pid); + if (!p || !p->sighand) + goto out_unlock; + + /* Do we have permission to signal this task? */ + if ((iocb->ki_euid ^ p->suid) && (iocb->ki_euid ^ p->uid) + && (iocb->ki_uid ^ p->suid) && (iocb->ki_uid ^ p->uid)) + goto out_unlock; /* No. */ + + spin_lock_irqsave(&p->sighand->siglock, flags); + + switch(iocb->ki_notify) { + case IO_NOTIFY_SIGNAL: + ret = __group_send_sig_info(iocb->ki_signo, &info, p); + break; + case IO_NOTIFY_THREAD_ID: + //ret = specific_send_sig_info(iocb->ki_signo, &info, p); + ret = __group_send_sig_info(iocb->ki_signo, &info, p); + break; + } + + spin_unlock_irqrestore(&p->sighand->siglock, flags); + + if (ret) + printk(KERN_DEBUG "__aio_send_signal: failed to send signal %d to %d\n", + iocb->ki_signo, iocb->ki_pid); + +out_unlock: + read_unlock(&tasklist_lock); +} + +static void __aio_write_evt(struct kioctx *ctx, struct io_event *event) +{ + struct aio_ring_info *info; + struct aio_ring *ring; + struct io_event *ring_event; + unsigned long tail; + + info = &ctx->ring_info; + + /* add a completion event to the ring buffer. + * must be done holding ctx->ctx_lock to prevent + * other code from messing with the tail + * pointer since we might be called from irq + * context. 
+ */ + + ring = kmap_atomic(info->ring_pages[0], KM_IRQ1); + + tail = info->tail; + ring_event = aio_ring_event(info, tail, KM_IRQ0); + if (++tail >= info->nr) + tail = 0; + + *ring_event = *event; + + dprintk("aio_write_evt: %p[%lu]: %Lx %Lx %Lx %Lx\n", + ctx, tail, event->obj, event->data, event->res, event->res2); + + /* after flagging the request as done, we + * must never even look at it again + */ + + smp_wmb(); /* make event visible before updating tail */ + + info->tail = tail; + ring->tail = tail; + + put_aio_ring_event(ring_event, KM_IRQ0); + kunmap_atomic(ring, KM_IRQ1); + + pr_debug("added to ring at [%lu]\n", tail); +} + /* aio_complete * Called when the io request on the given iocb is complete. * Returns true if this is the last user of the request. The @@ -920,11 +1040,8 @@ EXPORT_SYMBOL(kick_iocb); int fastcall aio_complete(struct kiocb *iocb, long res, long res2) { struct kioctx *ctx = iocb->ki_ctx; - struct aio_ring_info *info; - struct aio_ring *ring; - struct io_event *event; + struct io_event event; unsigned long flags; - unsigned long tail; int ret; /* Special case handling for sync iocbs: events go directly @@ -951,14 +1068,13 @@ int fastcall aio_complete(struct kiocb * return ret; } - info = &ctx->ring_info; + /* insert event in the event ring */ + + event.obj = (u64)(unsigned long)iocb->ki_obj.user; + event.data = iocb->ki_user_data; + event.res = res; + event.res2 = res2; - /* add a completion event to the ring buffer. - * must be done holding ctx->ctx_lock to prevent - * other code from messing with the tail - * pointer since we might be called from irq - * context. - */ spin_lock_irqsave(&ctx->ctx_lock, flags); if (iocb->ki_run_list.prev && !list_empty(&iocb->ki_run_list)) @@ -971,34 +1087,10 @@ int fastcall aio_complete(struct kiocb * if (kiocbIsCancelled(iocb)) goto put_rq; - ring = kmap_atomic(info->ring_pages[0], KM_IRQ1); - - tail = info->tail; - event = aio_ring_event(info, tail, KM_IRQ0); - if (++tail >= info->nr) - tail = 0; + __aio_write_evt(ctx, &event); - event->obj = (u64)(unsigned long)iocb->ki_obj.user; - event->data = iocb->ki_user_data; - event->res = res; - event->res2 = res2; - - dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n", - ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data, - res, res2); - - /* after flagging the request as done, we - * must never even look at it again - */ - smp_wmb(); /* make event visible before updating tail */ - - info->tail = tail; - ring->tail = tail; - - put_aio_ring_event(event, KM_IRQ0); - kunmap_atomic(ring, KM_IRQ1); - - pr_debug("added to ring %p at [%lu]\n", iocb, tail); + if (iocb->ki_signo) + __aio_send_signal(iocb); pr_debug("%ld retries: %d of %d\n", iocb->ki_retried, iocb->ki_nbytes - iocb->ki_left, iocb->ki_nbytes); @@ -1295,11 +1387,70 @@ asmlinkage long sys_io_destroy(aio_conte return -EINVAL; } +static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret) +{ + struct iovec *iov = &iocb->ki_iovec[iocb->ki_cur_seg]; + + BUG_ON(ret <= 0); + + while (iocb->ki_cur_seg < iocb->ki_nr_segs && ret > 0) { + ssize_t this = min(iov->iov_len, (size_t)ret); + iov->iov_base += this; + iov->iov_len -= this; + iocb->ki_left -= this; + ret -= this; + if (iov->iov_len == 0) { + iocb->ki_cur_seg++; + iov++; + } + } + + /* the caller should not have done more io than what fit in + * the remaining iovecs */ + BUG_ON(ret > 0 && iocb->ki_left == 0); +} + +static ssize_t aio_rw_vect_retry(struct kiocb *iocb) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct 
inode *inode = mapping->host; + ssize_t ret = 0; + + if (iocb->ki_opcode == IOCB_CMD_PREADV) + ret = file->f_op->aio_readv(iocb, + &iocb->ki_iovec[iocb->ki_cur_seg], + iocb->ki_nr_segs - iocb->ki_cur_seg, + iocb->ki_pos); + else + ret = file->f_op->aio_writev(iocb, + &iocb->ki_iovec[iocb->ki_cur_seg], + iocb->ki_nr_segs - iocb->ki_cur_seg, + iocb->ki_pos); + + if (ret > 0) { + aio_advance_iovec(iocb, ret); + /* turn partial completion into retries. full completion + * gets ret = 0 below. only retry partial reads if they + * were to a regular file. */ + if (iocb->ki_opcode == IOCB_CMD_PWRITEV || + (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode))) + ret = -EIOCBRETRY; + } + + /* This means we must have transferred all that we could */ + /* No need to retry anymore */ + if ((ret == 0) || (iocb->ki_left == 0)) + ret = iocb->ki_nbytes - iocb->ki_left; + + return ret; +} + /* * Default retry method for aio_read (also used for first time submit) * Responsible for updating iocb state as retries progress */ -static ssize_t aio_pread(struct kiocb *iocb) +ssize_t aio_pread(struct kiocb *iocb) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -1338,7 +1489,7 @@ static ssize_t aio_pread(struct kiocb *i * Default retry method for aio_write (also used for first time submit) * Responsible for updating iocb state as retries progress */ -static ssize_t aio_pwrite(struct kiocb *iocb) +ssize_t aio_pwrite(struct kiocb *iocb) { struct file *file = iocb->ki_filp; ssize_t ret = 0; @@ -1361,6 +1512,136 @@ static ssize_t aio_pwrite(struct kiocb * return ret; } +static int aio_thread_cancel(struct kiocb *iocb, struct io_event *res) +{ + dprintk("aio_thread_cancel(%p)\n", iocb); + force_sig_specific(SIGKILL, iocb->private); + return -EAGAIN; +} + +static int aio_thread_rw_v_fn(void *arg) +{ + struct kiocb *iocb = arg; + long ret; + + current->exit_signal = -1; + set_fs(USER_DS); + + /* setup cancellation */ + iocb->private = current; + iocb->ki_cancel = aio_thread_cancel; + unlock_kiocb(iocb); + + ret = -EINTR; + if (kiocbIsCancelled(iocb)) + goto out; + /* We don't count against the aio usage count. 
*/ + if (atomic_dec_and_test(&iocb->ki_ctx->mm->default_kioctx.users)) + goto out; + + dprintk("aio_thread_rw_v_fn(%p)\n", iocb); + switch (iocb->ki_opcode) { + case IOCB_CMD_PREAD: + ret = iocb->ki_filp->f_op->read(iocb->ki_filp, iocb->ki_buf, + iocb->ki_left, &iocb->ki_pos); + break; + case IOCB_CMD_PREADV: + ret = vfs_readv(iocb->ki_filp, + &iocb->ki_iovec[iocb->ki_cur_seg], + iocb->ki_nr_segs - iocb->ki_cur_seg, + &iocb->ki_pos); + break; + case IOCB_CMD_PWRITE: + ret = iocb->ki_filp->f_op->write(iocb->ki_filp, iocb->ki_buf, + iocb->ki_left, &iocb->ki_pos); + break; + case IOCB_CMD_PWRITEV: + ret = vfs_writev(iocb->ki_filp, + &iocb->ki_iovec[iocb->ki_cur_seg], + iocb->ki_nr_segs - iocb->ki_cur_seg, + &iocb->ki_pos); + break; + case IOCB_CMD_SENDMSG: + ret = -EINVAL; + if (iocb->ki_nbytes != sizeof(struct msghdr)) + break; + ret = vfs_sendmsg(iocb->ki_filp->private_data, + (void __user *)iocb->ki_buf, iocb->ki_pos); + break; + case IOCB_CMD_RECVMSG: + ret = -EINVAL; + if (iocb->ki_nbytes != sizeof(struct msghdr)) + break; + ret = vfs_recvmsg(iocb->ki_filp->private_data, + (void __user *)iocb->ki_buf, iocb->ki_pos); + break; + default: + ret = -EINVAL; + break; + } +out: + current->aio_mm = NULL; /* prevent mmput from dec'ing aio_mm */ + aio_complete(iocb, ret, 0); + dprintk("aio_thread_rw_v_fn(%p) complete %ld\n", iocb, ret); + return 0; +} + +static int aio_thread_fsync_fn(void *arg) +{ + struct kiocb *iocb = arg; + long ret; + + ret = iocb->ki_filp->f_op->fsync(iocb->ki_filp, + iocb->ki_filp->f_dentry, 0); + aio_complete(iocb, ret, 0); + + return 0; +} + +static int aio_thread_fdsync_fn(void *arg) +{ + struct kiocb *iocb = arg; + long ret; + + ret = iocb->ki_filp->f_op->fsync(iocb->ki_filp, + iocb->ki_filp->f_dentry, 1); + aio_complete(iocb, ret, 0); + + return 0; +} + +static ssize_t aio_kernel_thread(struct kiocb *iocb, int (*fn)(void *)) +{ + long ret; + + /* Setting up cancellation depends on actions of the thread, so + * we need to prevent the submit path from unlocking the iocb + * until our cancel method is in place. 
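(Aside on the IOCB_CMD_SENDMSG/IOCB_CMD_RECVMSG cases in the thread function above: userspace points aio_buf at a struct msghdr and must set aio_nbytes to sizeof(struct msghdr), otherwise the kernel returns -EINVAL; the switch passes ki_pos as the third argument to vfs_sendmsg()/vfs_recvmsg(), which suggests the iocb's offset field doubles as the MSG_* flags word. A hypothetical submission helper under those assumptions; the opcodes and field usage come from this series, not the mainline ABI:)

        #include <stdint.h>
        #include <string.h>
        #include <sys/socket.h>
        #include <linux/aio_abi.h>

        /* Hypothetical helper: prepare an asynchronous sendmsg() on sockfd. */
        static void iocb_prep_sendmsg(struct iocb *cb, int sockfd,
                                      struct msghdr *msg, unsigned int flags)
        {
                memset(cb, 0, sizeof(*cb));
                cb->aio_lio_opcode = IOCB_CMD_SENDMSG;
                cb->aio_fildes     = sockfd;
                cb->aio_buf        = (uint64_t)(uintptr_t)msg;
                cb->aio_nbytes     = sizeof(*msg);  /* must match, or -EINVAL */
                cb->aio_offset     = flags;         /* assumed to reach vfs_sendmsg() via ki_pos */
        }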
+ */ + kiocbSetDontUnlock(iocb); + + ret = kernel_thread(fn, iocb, CLONE_VM|CLONE_FS|CLONE_FILES); + if (ret < 0) + return ret; + + return -EIOCBQUEUED; +} + +static ssize_t aio_thread_rw_v(struct kiocb *iocb) +{ + return aio_kernel_thread(iocb, aio_thread_rw_v_fn); +} + +static ssize_t aio_thread_fsync(struct kiocb *iocb) +{ + return aio_kernel_thread(iocb, aio_thread_fsync_fn); +} + +static ssize_t aio_thread_fdsync(struct kiocb *iocb) +{ + return aio_kernel_thread(iocb, aio_thread_fdsync_fn); +} + static ssize_t aio_fdsync(struct kiocb *iocb) { struct file *file = iocb->ki_filp; @@ -1381,6 +1662,27 @@ static ssize_t aio_fsync(struct kiocb *i return ret; } +static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb, int type) +{ + ssize_t ret; + + ret = rw_copy_check_uvector(type, (struct iovec __user *)kiocb->ki_buf, + kiocb->ki_nbytes, 0, NULL, + &kiocb->ki_iovec); + if (ret < 0) + goto out; + + kiocb->ki_nr_segs = kiocb->ki_nbytes; + kiocb->ki_cur_seg = 0; + /* ki_nbytes/left now reflect bytes instead of segs */ + kiocb->ki_nbytes = ret; + kiocb->ki_left = ret; + + ret = 0; +out: + return ret; +} + /* * aio_setup_iocb: * Performs the initial checks and aio retry method @@ -1403,6 +1705,8 @@ static ssize_t aio_setup_iocb(struct kio ret = -EINVAL; if (file->f_op->aio_read) kiocb->ki_retry = aio_pread; + else if (file->f_op->read) + kiocb->ki_retry = aio_thread_rw_v; break; case IOCB_CMD_PWRITE: ret = -EBADF; @@ -1415,16 +1719,52 @@ static ssize_t aio_setup_iocb(struct kio ret = -EINVAL; if (file->f_op->aio_write) kiocb->ki_retry = aio_pwrite; + else if (file->f_op->write) + kiocb->ki_retry = aio_thread_rw_v; + break; + case IOCB_CMD_PREADV: + ret = -EBADF; + if (unlikely(!(file->f_mode & FMODE_READ))) + break; + ret = aio_setup_vectored_rw(kiocb, READ); + if (ret) + break; + ret = EINVAL; + if (file->f_op->aio_readv) + kiocb->ki_retry = aio_rw_vect_retry; + else if (file->f_op->readv || file->f_op->read) + kiocb->ki_retry = aio_thread_rw_v; + break; + case IOCB_CMD_PWRITEV: + ret = -EBADF; + if (unlikely(!(file->f_mode & FMODE_WRITE))) + break; + ret = aio_setup_vectored_rw(kiocb, WRITE); + if (ret) + break; + ret = EINVAL; + if (file->f_op->aio_writev) + kiocb->ki_retry = aio_rw_vect_retry; + else if (file->f_op->writev || file->f_op->write) + kiocb->ki_retry = aio_thread_rw_v; break; case IOCB_CMD_FDSYNC: ret = -EINVAL; if (file->f_op->aio_fsync) kiocb->ki_retry = aio_fdsync; + else if (file->f_op->fsync) + kiocb->ki_retry = aio_thread_fdsync; break; case IOCB_CMD_FSYNC: ret = -EINVAL; if (file->f_op->aio_fsync) kiocb->ki_retry = aio_fsync; + else if (file->f_op->fsync) + kiocb->ki_retry = aio_thread_fsync; + break; + case IOCB_CMD_SENDMSG: + case IOCB_CMD_RECVMSG: + kiocb->ki_retry = aio_thread_rw_v; break; default: dprintk("EINVAL: io_submit: no operation provided\n"); @@ -1455,14 +1795,93 @@ static ssize_t aio_setup_iocb(struct kio * because this callback isn't used for wait queues which * are nested inside ioctx lock (i.e. 
ctx->wait) */ -static int aio_wake_function(wait_queue_t *wait, unsigned mode, - int sync, void *key) +int aio_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) { - struct kiocb *iocb = container_of(wait, struct kiocb, ki_wait); + struct wait_bit_queue *wait_bit + = container_of(wait, struct wait_bit_queue, wait); + struct kiocb *iocb = container_of(wait_bit, struct kiocb, ki_wait); + + /* Assumes that a non-NULL key implies wait bit filtering */ + if (key && !test_wait_bit_key(wait, key)) + return 0; list_del_init(&wait->task_list); kick_iocb(iocb); - return 1; + /* + * Avoid exclusive wakeups with retries since an exclusive wakeup + * may involve implicit expectations of waking up the next waiter + * and there is no guarantee that the retry will take a path that + * would do so. For example if a page has become up-to-date, then + * a retried read may end up straightaway performing a copyout + * and not go through a lock_page - unlock_page that would have + * passed the baton to the next waiter. + */ + return 0; +} + +static long iocb_setup_sigevent(struct kiocb *req, + struct sigevent __user *user_event) +{ + int notify = 0; + pid_t aio_pid; + int aio_signo = 0; + + req->ki_uid = current->uid; + req->ki_euid = current->euid; + + if (!access_ok(VERIFY_READ, user_event, sizeof(struct sigevent))) + return -EFAULT; + /* + * We avoid copying the whole sigevent bunch and only get the + * needed fields. + */ + if (unlikely(__get_user(aio_pid, &user_event->sigev_notify_thread_id))) + return -EFAULT; + + if (unlikely(__get_user(aio_signo, &user_event->sigev_signo))) + return -EFAULT; + + if (unlikely(__copy_from_user(&req->ki_sigev_value, + &user_event->sigev_value, + sizeof(sigval_t)))) + return -EFAULT; + + if (!aio_signo) + return 0; /* no signal number, we do nothing */ + + if (aio_pid == 0) { + /* notify itself */ + aio_pid = current->pid; + notify = IO_NOTIFY_SIGNAL; + } else { + pid_t group_id; + task_t *ptask; + /* notify given thread */ + + /* caller thread and target thread must be in same + * thread group + */ + read_lock(&tasklist_lock); + + ptask = find_task_by_pid(req->ki_pid); + if (ptask) + group_id = ptask->tgid; + read_unlock(&tasklist_lock); + + if (unlikely (ptask == NULL)) + return -ESRCH; + + if (group_id != current->tgid) + return -EINVAL; + + notify = IO_NOTIFY_THREAD_ID; + } + + req->ki_pid = aio_pid; + req->ki_signo = aio_signo; + req->ki_notify = notify; + + return 0; } int fastcall io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, @@ -1473,8 +1892,7 @@ int fastcall io_submit_one(struct kioctx ssize_t ret; /* enforce forwards compatibility on users */ - if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2 || - iocb->aio_reserved3)) { + if (unlikely(iocb->aio_reserved1)) { pr_debug("EINVAL: io_submit: reserve field set\n"); return -EINVAL; } @@ -1483,6 +1901,7 @@ int fastcall io_submit_one(struct kioctx if (unlikely( (iocb->aio_buf != (unsigned long)iocb->aio_buf) || (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) || + (iocb->aio_sigeventp != (unsigned long)iocb->aio_sigeventp) || ((ssize_t)iocb->aio_nbytes < 0) )) { pr_debug("EINVAL: io_submit: overflow check\n"); @@ -1513,20 +1932,29 @@ int fastcall io_submit_one(struct kioctx req->ki_buf = (char __user *)(unsigned long)iocb->aio_buf; req->ki_left = req->ki_nbytes = iocb->aio_nbytes; req->ki_opcode = iocb->aio_lio_opcode; - init_waitqueue_func_entry(&req->ki_wait, aio_wake_function); - INIT_LIST_HEAD(&req->ki_wait.task_list); + init_waitqueue_func_entry(&req->ki_wait.wait, 
aio_wake_function); + INIT_LIST_HEAD(&req->ki_wait.wait.task_list); + req->ki_run_list.next = req->ki_run_list.prev = NULL; req->ki_retried = 0; - ret = aio_setup_iocb(req); + /* handle setting up the sigevent for POSIX AIO signals */ + if (iocb->aio_sigeventp) { + ret = iocb_setup_sigevent(req, + (struct sigevent __user *)(unsigned long) + iocb->aio_sigeventp); + if (ret) + goto out_put_req; + } + ret = aio_setup_iocb(req); if (ret) goto out_put_req; spin_lock_irq(&ctx->ctx_lock); - if (likely(list_empty(&ctx->run_list))) { - aio_run_iocb(req); - } else { - list_add_tail(&req->ki_run_list, &ctx->run_list); + aio_run_iocb(req); + if (!kiocbIsDontUnlock(req)) + unlock_kiocb(req); + if (!list_empty(&ctx->run_list)) { /* drain the run list */ while (__aio_run_iocbs(ctx)) ; @@ -1590,6 +2018,12 @@ asmlinkage long sys_io_submit(aio_contex break; } + /* Check user_iocb access */ + if (unlikely(!access_ok(VERIFY_READ, user_iocb, sizeof(struct iocb)))) { + ret = -EFAULT; + break; + } + ret = io_submit_one(ctx, user_iocb, &tmp); if (ret) break; @@ -1616,6 +2050,38 @@ static struct kiocb *lookup_kiocb(struct return NULL; } +static long aio_cancel_kiocb(struct kiocb *kiocb, struct io_event __user *result) +{ + int (*cancel)(struct kiocb *iocb, struct io_event *res); + struct io_event tmp; + long ret = -EAGAIN; + + pr_debug("calling cancel(%p) %p\n", kiocb, kiocb->ki_cancel); + memset(&tmp, 0, sizeof(tmp)); + tmp.obj = (u64)(unsigned long)kiocb->ki_obj.user; + tmp.data = kiocb->ki_user_data; + + lock_kiocb(kiocb); + cancel = kiocb->ki_cancel; + if (cancel) { + ret = cancel(kiocb, &tmp); + if (!ret) { + /* Cancellation succeeded -- copy the result + * into the user's buffer. + */ + if (result && copy_to_user(result, &tmp, sizeof(tmp))) + ret = -EFAULT; + } + } + unlock_kiocb(kiocb); + /* If the cancellation was successful, we must discard the + * reference held for completion of the iocb. + */ + if (!ret) + aio_put_req(kiocb); + return ret; +} + /* sys_io_cancel: * Attempts to cancel an iocb previously passed to io_submit. If * the operation is successfully cancelled, the resulting event is @@ -1655,19 +2121,8 @@ asmlinkage long sys_io_cancel(aio_contex spin_unlock_irq(&ctx->ctx_lock); if (NULL != cancel) { - struct io_event tmp; - pr_debug("calling cancel\n"); - memset(&tmp, 0, sizeof(tmp)); - tmp.obj = (u64)(unsigned long)kiocb->ki_obj.user; - tmp.data = kiocb->ki_user_data; - ret = cancel(kiocb, &tmp); - if (!ret) { - /* Cancellation succeeded -- copy the result - * into the user's buffer. 
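(Userspace view of the sigevent plumbing set up by iocb_setup_sigevent() and io_submit_one() above: a submitter opts in by pointing the iocb's new aio_sigeventp field at a struct sigevent; only sigev_signo, sigev_value and sigev_notify_thread_id are consulted, and a thread id of zero means "signal the submitting task". A hypothetical setup helper, assuming the aio_sigeventp field added by this series and glibc's sigev_notify_thread_id accessor:)

        #include <signal.h>
        #include <stdint.h>
        #include <string.h>
        #include <linux/aio_abi.h>

        /* Hypothetical helper: request SIGUSR1 on completion of 'cb', carrying
         * the iocb pointer back to the handler in si_value. */
        static void iocb_request_signal(struct iocb *cb, struct sigevent *sev)
        {
                memset(sev, 0, sizeof(*sev));
                sev->sigev_signo = SIGUSR1;
                sev->sigev_value.sival_ptr = cb;
                sev->sigev_notify_thread_id = 0;    /* 0: notify the submitting task */

                cb->aio_sigeventp = (uint64_t)(uintptr_t)sev;
        }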
- */ - if (copy_to_user(result, &tmp, sizeof(tmp))) - ret = -EFAULT; - } + ret = aio_cancel_kiocb(kiocb, result); + aio_put_req(kiocb); } else printk(KERN_DEBUG "iocb has no cancel operation\n"); diff -purN 00_v2.6.13-rc6/fs/bad_inode.c test-84.diff/fs/bad_inode.c --- 00_v2.6.13-rc6/fs/bad_inode.c 2005-06-20 13:33:31.000000000 -0400 +++ test-84.diff/fs/bad_inode.c 2005-08-09 19:13:07.000000000 -0400 @@ -26,9 +26,11 @@ static struct file_operations bad_file_o { .llseek = EIO_ERROR, .aio_read = EIO_ERROR, + .aio_readv = EIO_ERROR, .read = EIO_ERROR, .write = EIO_ERROR, .aio_write = EIO_ERROR, + .aio_writev = EIO_ERROR, .readdir = EIO_ERROR, .poll = EIO_ERROR, .ioctl = EIO_ERROR, diff -purN 00_v2.6.13-rc6/fs/block_dev.c test-84.diff/fs/block_dev.c --- 00_v2.6.13-rc6/fs/block_dev.c 2005-08-04 15:55:50.000000000 -0400 +++ test-84.diff/fs/block_dev.c 2005-08-09 19:15:45.000000000 -0400 @@ -777,6 +777,13 @@ static ssize_t blkdev_file_aio_write(str return generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos); } +static ssize_t blkdev_file_aio_writev(struct kiocb *iocb, + const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + return generic_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos); +} + static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) { return blkdev_ioctl(file->f_mapping->host, file, cmd, arg); @@ -799,7 +806,9 @@ struct file_operations def_blk_fops = { .read = generic_file_read, .write = blkdev_file_write, .aio_read = generic_file_aio_read, + .aio_readv = generic_file_aio_readv, .aio_write = blkdev_file_aio_write, + .aio_writev = blkdev_file_aio_writev, .mmap = generic_file_mmap, .fsync = block_fsync, .unlocked_ioctl = block_ioctl, diff -purN 00_v2.6.13-rc6/fs/buffer.c test-84.diff/fs/buffer.c --- 00_v2.6.13-rc6/fs/buffer.c 2005-08-04 15:55:50.000000000 -0400 +++ test-84.diff/fs/buffer.c 2005-08-08 17:15:51.000000000 -0400 @@ -53,7 +53,7 @@ init_buffer(struct buffer_head *bh, bh_e bh->b_private = private; } -static int sync_buffer(void *word) +static int sync_buffer(void *word, wait_queue_t *wait) { struct block_device *bd; struct buffer_head *bh diff -purN 00_v2.6.13-rc6/fs/ext2/file.c test-84.diff/fs/ext2/file.c --- 00_v2.6.13-rc6/fs/ext2/file.c 2005-08-04 15:55:50.000000000 -0400 +++ test-84.diff/fs/ext2/file.c 2005-08-09 19:13:07.000000000 -0400 @@ -44,7 +44,9 @@ struct file_operations ext2_file_operati .read = generic_file_read, .write = generic_file_write, .aio_read = generic_file_aio_read, + .aio_readv = generic_file_aio_readv, .aio_write = generic_file_aio_write, + .aio_writev = generic_file_aio_writev, .ioctl = ext2_ioctl, .mmap = generic_file_mmap, .open = generic_file_open, diff -purN 00_v2.6.13-rc6/fs/ext3/file.c test-84.diff/fs/ext3/file.c --- 00_v2.6.13-rc6/fs/ext3/file.c 2005-08-04 15:55:51.000000000 -0400 +++ test-84.diff/fs/ext3/file.c 2005-08-09 19:13:07.000000000 -0400 @@ -23,6 +23,7 @@ #include #include #include +#include #include "xattr.h" #include "acl.h" @@ -48,14 +49,15 @@ static int ext3_release_file (struct ino } static ssize_t -ext3_file_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos) +ext3_file_aio_writev(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_dentry->d_inode; ssize_t ret; int err; - ret = generic_file_aio_write(iocb, buf, count, pos); + ret = generic_file_aio_writev(iocb, iov, nr_segs, pos); /* * Skip flushing if there was an error, or if nothing was written. 
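(With filesystems gaining .aio_readv/.aio_writev methods here, the matching submission from userspace looks roughly as follows. Per aio_setup_vectored_rw() earlier in this patch, aio_buf carries the iovec array and aio_nbytes the segment count, which the kernel then converts to a byte count; IOCB_CMD_PREADV/PWRITEV are opcodes this series introduces, so treat the sketch as illustrative rather than a stable ABI. The prepared iocb is then handed to io_submit(2) as usual.)

        #include <stdint.h>
        #include <string.h>
        #include <sys/uio.h>
        #include <linux/aio_abi.h>

        /* Sketch: prepare a vectored asynchronous read. */
        static void iocb_prep_preadv(struct iocb *cb, int fd,
                                     const struct iovec *iov, unsigned int nr_segs,
                                     int64_t offset)
        {
                memset(cb, 0, sizeof(*cb));
                cb->aio_lio_opcode = IOCB_CMD_PREADV;
                cb->aio_fildes     = fd;
                cb->aio_buf        = (uint64_t)(uintptr_t)iov;
                cb->aio_nbytes     = nr_segs;       /* number of segments, not bytes */
                cb->aio_offset     = offset;
        }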
@@ -105,12 +107,22 @@ force_commit: return ret; } +static ssize_t +ext3_file_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos) +{ + struct iovec local_iov = { .iov_base = (void __user *)buf, + .iov_len = count }; + return ext3_file_aio_writev(iocb, &local_iov, 1, pos); +} + struct file_operations ext3_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, .write = do_sync_write, .aio_read = generic_file_aio_read, + .aio_readv = generic_file_aio_readv, .aio_write = ext3_file_write, + .aio_writev = ext3_file_aio_writev, .readv = generic_file_readv, .writev = generic_file_writev, .ioctl = ext3_ioctl, diff -purN 00_v2.6.13-rc6/fs/inode.c test-84.diff/fs/inode.c --- 00_v2.6.13-rc6/fs/inode.c 2005-08-04 15:55:51.000000000 -0400 +++ test-84.diff/fs/inode.c 2005-08-08 17:15:51.000000000 -0400 @@ -1279,7 +1279,7 @@ void remove_dquot_ref(struct super_block #endif -int inode_wait(void *word) +int inode_wait(void *word, wait_queue_t *wait) { schedule(); return 0; diff -purN 00_v2.6.13-rc6/fs/jfs/file.c test-84.diff/fs/jfs/file.c --- 00_v2.6.13-rc6/fs/jfs/file.c 2005-08-04 15:55:52.000000000 -0400 +++ test-84.diff/fs/jfs/file.c 2005-08-09 19:13:07.000000000 -0400 @@ -106,7 +106,9 @@ struct file_operations jfs_file_operatio .write = generic_file_write, .read = generic_file_read, .aio_read = generic_file_aio_read, + .aio_readv = generic_file_aio_readv, .aio_write = generic_file_aio_write, + .aio_writev = generic_file_aio_writev, .mmap = generic_file_mmap, .readv = generic_file_readv, .writev = generic_file_writev, diff -purN 00_v2.6.13-rc6/fs/ntfs/file.c test-84.diff/fs/ntfs/file.c --- 00_v2.6.13-rc6/fs/ntfs/file.c 2005-08-04 15:55:56.000000000 -0400 +++ test-84.diff/fs/ntfs/file.c 2005-08-09 19:13:07.000000000 -0400 @@ -111,10 +111,12 @@ struct file_operations ntfs_file_ops = { .llseek = generic_file_llseek, /* Seek inside file. */ .read = generic_file_read, /* Read from file. */ .aio_read = generic_file_aio_read, /* Async read from file. */ + .aio_readv = generic_file_aio_readv, /* Async readv from file. */ .readv = generic_file_readv, /* Read from file. */ #ifdef NTFS_RW .write = generic_file_write, /* Write to file. */ .aio_write = generic_file_aio_write, /* Async write to file. */ + .aio_writev = generic_file_aio_writev,/* Async writev to file. */ .writev = generic_file_writev, /* Write to file. */ /*.release = ,*/ /* Last file is closed. See fs/ext2/file.c:: diff -purN 00_v2.6.13-rc6/fs/pipe.c test-84.diff/fs/pipe.c --- 00_v2.6.13-rc6/fs/pipe.c 2005-06-20 13:33:32.000000000 -0400 +++ test-84.diff/fs/pipe.c 2005-08-08 17:15:58.000000000 -0400 @@ -46,6 +46,49 @@ void pipe_wait(struct inode * inode) down(PIPE_SEM(*inode)); } +static int pipe_aio_waiter(wait_queue_t *wait, unsigned mode, int sync, + void *key) +{ + struct kiocb *iocb = io_wait_to_kiocb(wait); + + list_del_init(&wait->task_list); + iocb->ki_cancel = NULL; /* We're removed from the wait queue, so our + * cancellation code no longer applies. 
+ */ + kick_iocb(iocb); + return 1; +} + +static int pipe_aio_cancel(struct kiocb *kiocb, struct io_event *event) +{ + struct inode *inode = kiocb->ki_filp->f_dentry->d_inode; + wait_queue_head_t *wq = PIPE_WAIT(*inode); + int ret = 0; + + spin_lock_irq(&wq->lock); + if (kiocb->ki_cancel == pipe_aio_cancel) { + kiocb->ki_cancel = NULL; + list_del_init(&kiocb->ki_wait.wait.task_list); + if (event) { + event->res = -EINTR; + event->res2 = 0; + } + } else + ret = -EAGAIN; + spin_unlock_irq(&wq->lock); + return ret; +} + +static long pipe_aio_wait(struct kiocb *kiocb, struct inode *inode) +{ + kiocb->ki_wait.wait.func = pipe_aio_waiter; + kiocb->ki_cancel = pipe_aio_cancel; + add_wait_queue(PIPE_WAIT(*inode), &kiocb->ki_wait.wait); + aio_up(kiocb, PIPE_SEM(*inode)); + kiocbSetIntr(kiocb); + return -EIOCBRETRY; +} + static inline int pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len) { @@ -115,9 +158,12 @@ static struct pipe_buf_operations anon_p }; static ssize_t -pipe_readv(struct file *filp, const struct iovec *_iov, - unsigned long nr_segs, loff_t *ppos) +pipe_aio_read(struct kiocb *kiocb, char __user *buf, size_t len, loff_t pos) { + struct iovec _iov[2] = {{ .iov_base = (void __user *)buf, .iov_len = len }}; + unsigned long nr_segs = 1; + struct file *filp = kiocb->ki_filp; + struct inode *inode = filp->f_dentry->d_inode; struct pipe_inode_info *info; int do_wakeup; @@ -125,14 +171,25 @@ pipe_readv(struct file *filp, const stru struct iovec *iov = (struct iovec *)_iov; size_t total_len; + /* In retries we need to remove ourself from the wait queue at this + * point. Checking ki_cancel is a convenient way of checking for + * this case, as we clear the cancel operation when the iocb is + * removed from the wait queue. + */ + if (kiocb->ki_cancel == pipe_aio_cancel) + pipe_aio_cancel(kiocb, NULL); + total_len = iov_length(iov, nr_segs); /* Null read succeeds. */ if (unlikely(total_len == 0)) return 0; do_wakeup = 0; - ret = 0; - down(PIPE_SEM(*inode)); + ret = aio_down(kiocb, PIPE_SEM(*inode)); + if (ret) + return ret; + + ret = kiocb->ki_nbytes - kiocb->ki_left; info = inode->i_pipe; for (;;) { int bufs = info->nrbufs; @@ -155,6 +212,8 @@ pipe_readv(struct file *filp, const stru break; } ret += chars; + kiocb->ki_left -= chars; + kiocb->ki_buf += chars; buf->offset += chars; buf->len -= chars; if (!buf->len) { @@ -186,7 +245,7 @@ pipe_readv(struct file *filp, const stru break; } } - if (signal_pending(current)) { + if (is_sync_kiocb(kiocb) && signal_pending(current)) { if (!ret) ret = -ERESTARTSYS; break; } @@ -194,9 +253,9 @@ pipe_readv(struct file *filp, const stru wake_up_interruptible_sync(PIPE_WAIT(*inode)); kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); } - pipe_wait(inode); + return pipe_aio_wait(kiocb, inode); } - up(PIPE_SEM(*inode)); + aio_up(kiocb, PIPE_SEM(*inode)); /* Signal writers asynchronously that there is more room. 
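(The control flow above generalizes into a pattern for any aio-aware method: take the semaphore with aio_down(), which for an async kiocb is expected to queue the iocb and return -EIOCBRETRY rather than sleep (the __aio_down slow path presumably lives with the other sleepers in lib/semaphore-sleepers.c, outside this excerpt), and have every blocking point arm a wakeup, drop the semaphore and return -EIOCBRETRY so the retry re-enters from the top. A schematic of that shape using this patch's names; some_aio_read, sem, wq and the two predicates are placeholders, not patch code:)

        static ssize_t some_aio_read(struct kiocb *kiocb, char __user *buf,
                                     size_t len, loff_t pos)
        {
                ssize_t ret;

                ret = aio_down(kiocb, &sem);    /* -EIOCBRETRY or error when it cannot be taken now */
                if (ret)
                        return ret;

                ret = kiocb->ki_nbytes - kiocb->ki_left;        /* progress from earlier passes */

                while (more_to_do()) {
                        if (nothing_available()) {
                                /* arm a wakeup, release, and bounce back to the retry path */
                                add_wait_queue(&wq, &kiocb->ki_wait.wait);
                                aio_up(kiocb, &sem);
                                return -EIOCBRETRY;
                        }
                        /* ... copy data, updating ret, ki_left and ki_buf ... */
                }

                aio_up(kiocb, &sem);
                return ret;
        }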
*/ if (do_wakeup) { wake_up_interruptible(PIPE_WAIT(*inode)); @@ -208,16 +267,12 @@ pipe_readv(struct file *filp, const stru } static ssize_t -pipe_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) +pipe_aio_write(struct kiocb *kiocb, const char __user *buf, size_t len, loff_t pos) { - struct iovec iov = { .iov_base = buf, .iov_len = count }; - return pipe_readv(filp, &iov, 1, ppos); -} + struct iovec _iov[2] = {{ .iov_base = (void __user *)buf, .iov_len = len }}; + unsigned long nr_segs = 1; -static ssize_t -pipe_writev(struct file *filp, const struct iovec *_iov, - unsigned long nr_segs, loff_t *ppos) -{ + struct file *filp = kiocb->ki_filp; struct inode *inode = filp->f_dentry->d_inode; struct pipe_inode_info *info; ssize_t ret; @@ -231,13 +286,33 @@ pipe_writev(struct file *filp, const str if (unlikely(total_len == 0)) return 0; + /* In retries we need to remove ourself from the wait queue at this + * point. Checking ki_cancel is a convenient way of checking for + * this case, as we clear the cancel operation when the iocb is + * removed from the wait queue. + */ + if (kiocb->ki_cancel == pipe_aio_cancel) + pipe_aio_cancel(kiocb, NULL); + do_wakeup = 0; - ret = 0; - down(PIPE_SEM(*inode)); + ret = aio_down(kiocb, PIPE_SEM(*inode)); + if (ret) + return ret; + + /* Undo the WRITERS++ done below where we are queued. We use + * kiocb->private to flag if we were waiting, as the higher layers + * initialize it to NULL at the beginning of a request's life. + */ + if (kiocb->ki_user_data) { + PIPE_WAITING_WRITERS(*inode)--; + kiocb->ki_user_data = 0; + } + info = inode->i_pipe; if (!PIPE_READERS(*inode)) { - send_sig(SIGPIPE, current, 0); + if (is_sync_kiocb(kiocb)) + send_sig(SIGPIPE, current, 0); ret = -EPIPE; goto out; } @@ -257,6 +332,8 @@ pipe_writev(struct file *filp, const str do_wakeup = 1; if (error) goto out; + iov->iov_base += chars; + iov->iov_len -= chars; buf->len += chars; total_len -= chars; ret = chars; @@ -267,8 +344,10 @@ pipe_writev(struct file *filp, const str for (;;) { int bufs; + if (!PIPE_READERS(*inode)) { - send_sig(SIGPIPE, current, 0); + if (is_sync_kiocb(kiocb)) + send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } @@ -304,6 +383,8 @@ pipe_writev(struct file *filp, const str break; } ret += chars; + kiocb->ki_left -= chars; + kiocb->ki_buf += chars; /* Insert it into the buffer array */ buf->page = page; @@ -323,7 +404,7 @@ pipe_writev(struct file *filp, const str if (!ret) ret = -EAGAIN; break; } - if (signal_pending(current)) { + if (is_sync_kiocb(kiocb) && signal_pending(current)) { if (!ret) ret = -ERESTARTSYS; break; } @@ -333,11 +414,11 @@ pipe_writev(struct file *filp, const str do_wakeup = 0; } PIPE_WAITING_WRITERS(*inode)++; - pipe_wait(inode); - PIPE_WAITING_WRITERS(*inode)--; + kiocb->ki_user_data = 1; /* Flag for retry. 
*/ + return pipe_aio_wait(kiocb, inode); } out: - up(PIPE_SEM(*inode)); + aio_up(kiocb, PIPE_SEM(*inode)); if (do_wakeup) { wake_up_interruptible(PIPE_WAIT(*inode)); kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); @@ -347,6 +428,7 @@ out: return ret; } +#if 0 static ssize_t pipe_write(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) @@ -354,6 +436,7 @@ pipe_write(struct file *filp, const char struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; return pipe_writev(filp, &iov, 1, ppos); } +#endif static ssize_t bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos) @@ -362,11 +445,23 @@ bad_pipe_r(struct file *filp, char __use } static ssize_t +bad_pipe_aio_r(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos) +{ + return -EBADF; +} + +static ssize_t bad_pipe_w(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) { return -EBADF; } +static ssize_t +bad_pipe_aio_w(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos) +{ + return -EBADF; +} + static int pipe_ioctl(struct inode *pino, struct file *filp, unsigned int cmd, unsigned long arg) @@ -565,8 +660,8 @@ pipe_rdwr_open(struct inode *inode, stru */ struct file_operations read_fifo_fops = { .llseek = no_llseek, - .read = pipe_read, - .readv = pipe_readv, + .read = do_sync_read, + .aio_read = pipe_aio_read, .write = bad_pipe_w, .poll = fifo_poll, .ioctl = pipe_ioctl, @@ -578,8 +673,9 @@ struct file_operations read_fifo_fops = struct file_operations write_fifo_fops = { .llseek = no_llseek, .read = bad_pipe_r, - .write = pipe_write, - .writev = pipe_writev, + .write = do_sync_write, + .aio_read = bad_pipe_aio_r, + .aio_write = pipe_aio_write, .poll = fifo_poll, .ioctl = pipe_ioctl, .open = pipe_write_open, @@ -589,10 +685,10 @@ struct file_operations write_fifo_fops = struct file_operations rdwr_fifo_fops = { .llseek = no_llseek, - .read = pipe_read, - .readv = pipe_readv, - .write = pipe_write, - .writev = pipe_writev, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = pipe_aio_read, + .aio_write = pipe_aio_write, .poll = fifo_poll, .ioctl = pipe_ioctl, .open = pipe_rdwr_open, @@ -602,9 +698,10 @@ struct file_operations rdwr_fifo_fops = struct file_operations read_pipe_fops = { .llseek = no_llseek, - .read = pipe_read, - .readv = pipe_readv, + .read = do_sync_read, .write = bad_pipe_w, + .aio_read = pipe_aio_read, + .aio_write = bad_pipe_aio_w, .poll = pipe_poll, .ioctl = pipe_ioctl, .open = pipe_read_open, @@ -615,8 +712,9 @@ struct file_operations read_pipe_fops = struct file_operations write_pipe_fops = { .llseek = no_llseek, .read = bad_pipe_r, - .write = pipe_write, - .writev = pipe_writev, + .write = do_sync_write, + .aio_read = bad_pipe_aio_r, + .aio_write = pipe_aio_write, .poll = pipe_poll, .ioctl = pipe_ioctl, .open = pipe_write_open, @@ -626,10 +724,10 @@ struct file_operations write_pipe_fops = struct file_operations rdwr_pipe_fops = { .llseek = no_llseek, - .read = pipe_read, - .readv = pipe_readv, - .write = pipe_write, - .writev = pipe_writev, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = pipe_aio_read, + .aio_write = pipe_aio_write, .poll = pipe_poll, .ioctl = pipe_ioctl, .open = pipe_rdwr_open, diff -purN 00_v2.6.13-rc6/fs/read_write.c test-84.diff/fs/read_write.c --- 00_v2.6.13-rc6/fs/read_write.c 2005-08-04 15:55:57.000000000 -0400 +++ test-84.diff/fs/read_write.c 2005-08-09 19:41:35.000000000 -0400 @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ 
-203,14 +204,36 @@ Einval: return -EINVAL; } -static void wait_on_retry_sync_kiocb(struct kiocb *iocb) +static long wait_on_retry_sync_kiocb(struct kiocb *iocb) { - set_current_state(TASK_UNINTERRUPTIBLE); + int (*cancel)(struct kiocb *, struct io_event *); + long ret = 0; + set_current_state(kiocbIsIntr(iocb) ? TASK_INTERRUPTIBLE + : TASK_UNINTERRUPTIBLE); if (!kiocbIsKicked(iocb)) schedule(); else kiocbClearKicked(iocb); + + /* If we were interrupted by a signal, issue a cancel to allow the + * operation to clean up. + */ + if (kiocbIsIntr(iocb) && signal_pending(current) && + (cancel = iocb->ki_cancel)) { + struct io_event dummy_event; + dummy_event.res = 0; + if (!cancel(iocb, &dummy_event)) { + ret = dummy_event.res; + if (!ret) + printk(KERN_DEBUG "wait_on_retry_sync_kiocb: ki_cancel method %p is buggy\n", cancel); + goto out; + } + } + kiocbClearIntr(iocb); + ret = iocb->ki_retry(iocb); +out: __set_current_state(TASK_RUNNING); + return ret; } ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) @@ -220,12 +243,15 @@ ssize_t do_sync_read(struct file *filp, init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - while (-EIOCBRETRY == - (ret = filp->f_op->aio_read(&kiocb, buf, len, kiocb.ki_pos))) - wait_on_retry_sync_kiocb(&kiocb); + kiocb.ki_retry = aio_pread; + kiocb.ki_buf = buf; + kiocb.ki_nbytes = len; + kiocb.ki_left = len; + ret = filp->f_op->aio_read(&kiocb, buf, len, *ppos); + while (ret == -EIOCBRETRY) + ret = wait_on_retry_sync_kiocb(&kiocb); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&kiocb); + BUG_ON(!list_empty(&kiocb.ki_wait.wait.task_list)); *ppos = kiocb.ki_pos; return ret; } @@ -271,12 +297,15 @@ ssize_t do_sync_write(struct file *filp, init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - while (-EIOCBRETRY == - (ret = filp->f_op->aio_write(&kiocb, buf, len, kiocb.ki_pos))) - wait_on_retry_sync_kiocb(&kiocb); + kiocb.ki_retry = aio_pwrite; + kiocb.ki_buf = (void *)buf; + kiocb.ki_nbytes = len; + kiocb.ki_left = len; + ret = filp->f_op->aio_write(&kiocb, buf, len, kiocb.ki_pos); + while (ret == -EIOCBRETRY) + ret = wait_on_retry_sync_kiocb(&kiocb); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&kiocb); + BUG_ON(!list_empty(&kiocb.ki_wait.wait.task_list)); *ppos = kiocb.ki_pos; return ret; } @@ -427,6 +456,77 @@ EXPORT_SYMBOL(iov_shorten); /* A write operation does a read from user space and vice versa */ #define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ) +ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, + unsigned long nr_segs, unsigned long fast_segs, + struct iovec *fast_pointer, + struct iovec **ret_pointer) + { + unsigned long seg; + ssize_t ret; + struct iovec *iov = fast_pointer; + + /* + * SuS says "The readv() function *may* fail if the iovcnt argument + * was less than or equal to 0, or greater than {IOV_MAX}. Linux has + * traditionally returned zero for zero segments, so... 
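(Calling convention for the helper being added here, for reference: the caller passes a small on-stack iovec array as the fast path, the helper kmalloc()s a larger one only when nr_segs exceeds fast_segs, and the caller frees iov afterwards only if it no longer points at the stack array. Schematically, mirroring the do_readv_writev() usage later in this patch; error handling is elided:)

        struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
        ssize_t tot_len;

        tot_len = rw_copy_check_uvector(READ, uvector, nr_segs,
                                        ARRAY_SIZE(iovstack), iovstack, &iov);
        if (tot_len >= 0) {
                /* ... perform the I/O over iov[0..nr_segs) ... */
        }
        if (iov != iovstack)
                kfree(iov);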
+ */ + if (nr_segs == 0) { + ret = 0; + goto out; + } + + /* + * First get the "struct iovec" from user memory and + * verify all the pointers + */ + if ((nr_segs > UIO_MAXIOV) || (nr_segs <= 0)) { + ret = -EINVAL; + goto out; + } + if (nr_segs > fast_segs) { + iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); + if (iov == NULL) { + ret = -ENOMEM; + goto out; + } + } + if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) { + ret = -EFAULT; + goto out; + } + + /* + * According to the Single Unix Specification we should return EINVAL + * if an element length is < 0 when cast to ssize_t or if the + * total length would overflow the ssize_t return value of the + * system call. + */ + ret = 0; + for (seg = 0; seg < nr_segs; seg++) { + void __user *buf = iov[seg].iov_base; + ssize_t len = (ssize_t)iov[seg].iov_len; + + /* see if we're about to use an invalid len or if + * it's about to overflow ssize_t */ + if (len < 0 || (ret + len < ret)) { + ret = -EINVAL; + goto out; + } + if (unlikely(!access_ok(vrfy_dir(type), buf, len))) { + ret = -EFAULT; + goto out; + } + + ret += len; + } +out: + *ret_pointer = iov; + return ret; +} + +/* A write operation does a read from user space and vice versa */ +#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ) + static ssize_t do_readv_writev(int type, struct file *file, const struct iovec __user * uvector, unsigned long nr_segs, loff_t *pos) @@ -434,62 +534,23 @@ static ssize_t do_readv_writev(int type, typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *); typedef ssize_t (*iov_fn_t)(struct file *, const struct iovec *, unsigned long, loff_t *); - size_t tot_len; + ssize_t tot_len; struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov=iovstack, *vector; ssize_t ret; - int seg; io_fn_t fn; iov_fn_t fnv; - /* - * SuS says "The readv() function *may* fail if the iovcnt argument - * was less than or equal to 0, or greater than {IOV_MAX}. Linux has - * traditionally returned zero for zero segments, so... - */ - ret = 0; - if (nr_segs == 0) - goto out; - - /* - * First get the "struct iovec" from user memory and - * verify all the pointers - */ - ret = -EINVAL; - if ((nr_segs > UIO_MAXIOV) || (nr_segs <= 0)) + if (!file->f_op) { + ret = -EINVAL; goto out; - if (!file->f_op) - goto out; - if (nr_segs > UIO_FASTIOV) { - ret = -ENOMEM; - iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); - if (!iov) - goto out; + } - ret = -EFAULT; - if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) - goto out; - /* - * Single unix specification: - * We should -EINVAL if an element length is not >= 0 and fitting an - * ssize_t. The total length is fitting an ssize_t - * - * Be careful here because iov_len is a size_t not an ssize_t - */ - tot_len = 0; - ret = -EINVAL; - for (seg = 0; seg < nr_segs; seg++) { - void __user *buf = iov[seg].iov_base; - ssize_t len = (ssize_t)iov[seg].iov_len; - - if (len < 0) /* size_t not fitting an ssize_t ..
*/ - goto out; - if (unlikely(!access_ok(vrfy_dir(type), buf, len))) - goto Efault; - tot_len += len; - if ((ssize_t)tot_len < 0) /* maths overflow on the ssize_t */ - goto out; + tot_len = rw_copy_check_uvector(type, uvector, nr_segs, + ARRAY_SIZE(iovstack), iovstack, &iov); + if (tot_len < 0) { + ret = tot_len; + goto out; } if (tot_len == 0) { ret = 0; @@ -546,9 +607,6 @@ out: fsnotify_modify(file->f_dentry); } return ret; -Efault: - ret = -EFAULT; - goto out; } ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, diff -purN 00_v2.6.13-rc6/fs/reiserfs/file.c test-84.diff/fs/reiserfs/file.c --- 00_v2.6.13-rc6/fs/reiserfs/file.c 2005-08-04 15:55:57.000000000 -0400 +++ test-84.diff/fs/reiserfs/file.c 2005-08-09 19:29:57.000000000 -0400 @@ -1540,12 +1540,6 @@ static ssize_t reiserfs_file_write(struc return res; } -static ssize_t reiserfs_aio_write(struct kiocb *iocb, const char __user * buf, - size_t count, loff_t pos) -{ - return generic_file_aio_write(iocb, buf, count, pos); -} - struct file_operations reiserfs_file_operations = { .read = generic_file_read, .write = reiserfs_file_write, @@ -1555,7 +1549,9 @@ struct file_operations reiserfs_file_ope .fsync = reiserfs_sync_file, .sendfile = generic_file_sendfile, .aio_read = generic_file_aio_read, - .aio_write = reiserfs_aio_write, + .aio_readv = generic_file_aio_readv, + .aio_write = generic_file_aio_write, + .aio_writev = generic_file_aio_writev, }; struct inode_operations reiserfs_file_inode_operations = { diff -purN 00_v2.6.13-rc6/include/asm-i386/semaphore.h test-84.diff/include/asm-i386/semaphore.h --- 00_v2.6.13-rc6/include/asm-i386/semaphore.h 2005-06-20 13:33:36.000000000 -0400 +++ test-84.diff/include/asm-i386/semaphore.h 2005-08-08 17:15:30.000000000 -0400 @@ -41,10 +41,12 @@ #include #include +struct kiocb; struct semaphore { atomic_t count; int sleepers; wait_queue_head_t wait; + struct kiocb *aio_owner; }; @@ -52,7 +54,8 @@ struct semaphore { { \ .count = ATOMIC_INIT(n), \ .sleepers = 0, \ - .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \ + .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait), \ + .aio_owner = NULL \ } #define __MUTEX_INITIALIZER(name) \ @@ -75,6 +78,7 @@ static inline void sema_init (struct sem atomic_set(&sem->count, val); sem->sleepers = 0; init_waitqueue_head(&sem->wait); + sem->aio_owner = NULL; } static inline void init_MUTEX (struct semaphore *sem) @@ -87,6 +91,7 @@ static inline void init_MUTEX_LOCKED (st sema_init(sem, 0); } +fastcall void __aio_down_failed(void /* special register calling convention */); fastcall void __down_failed(void /* special register calling convention */); fastcall int __down_failed_interruptible(void /* params in registers */); fastcall int __down_failed_trylock(void /* params in registers */); @@ -142,6 +147,32 @@ static inline int down_interruptible(str } /* + * Non-blockingly attempt to down() a semaphore for use with aio. + * Returns zero if we acquired it + */ +static inline int aio_down(struct kiocb *iocb, struct semaphore * sem) +{ + int result; + + __asm__ __volatile__( + "# atomic aio down operation\n\t" + LOCK "decl %1\n\t" /* --sem->count */ + "js 2f\n\t" + "movl %3,%2\n" + "xorl %0,%0\n" + "1:\n" + LOCK_SECTION_START("") + "2:\tlea %1,%%edx\n\t" + "call __aio_down_failed\n\t" + "jmp 1b\n" + LOCK_SECTION_END + :"=a" (result), "+m" (sem->count), "=m" (sem->aio_owner) + :"0" (iocb) + :"memory","cc","dx"); + return result; +} + +/* * Non-blockingly attempt to down() a semaphore. 
* Returns zero if we acquired it */ @@ -190,5 +221,14 @@ static inline void up(struct semaphore * :"memory","ax"); } +static inline void aio_up(struct kiocb *iocb, struct semaphore *sem) +{ +#ifdef CONFIG_DEBUG_KERNEL + BUG_ON(sem->aio_owner != iocb); +#endif + sem->aio_owner = NULL; + up(sem); +} + #endif #endif diff -purN 00_v2.6.13-rc6/include/asm-x86_64/semaphore.h test-84.diff/include/asm-x86_64/semaphore.h --- 00_v2.6.13-rc6/include/asm-x86_64/semaphore.h 2004-12-24 16:33:48.000000000 -0500 +++ test-84.diff/include/asm-x86_64/semaphore.h 2005-08-08 17:15:30.000000000 -0400 @@ -43,17 +43,20 @@ #include #include +struct kiocb; struct semaphore { atomic_t count; int sleepers; wait_queue_head_t wait; + struct kiocb *aio_owner; }; #define __SEMAPHORE_INITIALIZER(name, n) \ { \ .count = ATOMIC_INIT(n), \ .sleepers = 0, \ - .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \ + .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait), \ + .aio_owner = NULL \ } #define __MUTEX_INITIALIZER(name) \ @@ -76,6 +79,7 @@ static inline void sema_init (struct sem atomic_set(&sem->count, val); sem->sleepers = 0; init_waitqueue_head(&sem->wait); + sem->aio_owner = NULL; } static inline void init_MUTEX (struct semaphore *sem) @@ -88,11 +92,13 @@ static inline void init_MUTEX_LOCKED (st sema_init(sem, 0); } +asmlinkage long __aio_down_failed(void /* special register calling convention */); asmlinkage void __down_failed(void /* special register calling convention */); asmlinkage int __down_failed_interruptible(void /* params in registers */); asmlinkage int __down_failed_trylock(void /* params in registers */); asmlinkage void __up_wakeup(void /* special register calling convention */); +asmlinkage long __aio_down(struct kiocb *iocb, struct semaphore * sem); asmlinkage void __down(struct semaphore * sem); asmlinkage int __down_interruptible(struct semaphore * sem); asmlinkage int __down_trylock(struct semaphore * sem); @@ -148,6 +154,32 @@ static inline int down_interruptible(str } /* + * Non-blockingly attempt to down() a semaphore for use with aio. + * Returns zero if we acquired it, -EIOCBRETRY if the operation was + * queued and the iocb will receive a kick_iocb() on completion. + */ +static inline long aio_down(struct kiocb *iocb, struct semaphore * sem) +{ + long result; + + __asm__ __volatile__( + "# atomic aio_down operation\n\t" + LOCK "decl %1\n\t" /* --sem->count */ + "js 2f\n\t" + "movq %3,%2\n" /* sem->aio_owner = iocb */ + "xorq %0,%0\n\t" + "1:\n" + LOCK_SECTION_START("") + "2:\tcall __aio_down_failed\n\t" + "jmp 1b\n" + LOCK_SECTION_END + :"=a" (result), "+m" (sem->count), "=m" (sem->aio_owner) + : "D" (iocb), "S" (sem) + :"memory"); + return result; +} + +/* * Non-blockingly attempt to down() a semaphore. 
* Returns zero if we acquired it */ @@ -192,5 +224,15 @@ static inline void up(struct semaphore * :"D" (sem) :"memory"); } + +static inline void aio_up(struct kiocb *iocb, struct semaphore *sem) +{ +#ifdef CONFIG_DEBUG_KERNEL + BUG_ON(sem->aio_owner != iocb); +#endif + sem->aio_owner = NULL; + up(sem); +} + #endif /* __KERNEL__ */ #endif diff -purN 00_v2.6.13-rc6/include/linux/aio.h test-84.diff/include/linux/aio.h --- 00_v2.6.13-rc6/include/linux/aio.h 2004-12-24 16:35:50.000000000 -0500 +++ test-84.diff/include/linux/aio.h 2005-08-11 15:20:33.000000000 -0400 @@ -7,6 +7,11 @@ #include +enum { + IO_NOTIFY_SIGNAL = 0, /* send signal to a process */ + IO_NOTIFY_THREAD_ID = 1, /* send signal to a specific thread */ +}; + #define AIO_MAXSEGS 4 #define AIO_KIOGRP_NR_ATOMIC 8 @@ -27,21 +32,33 @@ struct kioctx; #define KIF_LOCKED 0 #define KIF_KICKED 1 #define KIF_CANCELLED 2 +#define KIF_INTR 3 /* use TASK_INTERRUPTIBLE waits */ +#define KIF_SYNCED 4 +#define KIF_DONT_UNLOCK 5 /* don't unlock the iocb after submit */ #define kiocbTryLock(iocb) test_and_set_bit(KIF_LOCKED, &(iocb)->ki_flags) #define kiocbTryKick(iocb) test_and_set_bit(KIF_KICKED, &(iocb)->ki_flags) +#define kiocbTrySync(iocb) test_and_set_bit(KIF_SYNCED, &(iocb)->ki_flags) #define kiocbSetLocked(iocb) set_bit(KIF_LOCKED, &(iocb)->ki_flags) #define kiocbSetKicked(iocb) set_bit(KIF_KICKED, &(iocb)->ki_flags) #define kiocbSetCancelled(iocb) set_bit(KIF_CANCELLED, &(iocb)->ki_flags) +#define kiocbSetIntr(iocb) set_bit(KIF_INTR, &(iocb)->ki_flags) +#define kiocbSetSynced(iocb) set_bit(KIF_SYNCED, &(iocb)->ki_flags) +#define kiocbSetDontUnlock(iocb) set_bit(KIF_DONT_UNLOCK, &(iocb)->ki_flags) #define kiocbClearLocked(iocb) clear_bit(KIF_LOCKED, &(iocb)->ki_flags) #define kiocbClearKicked(iocb) clear_bit(KIF_KICKED, &(iocb)->ki_flags) #define kiocbClearCancelled(iocb) clear_bit(KIF_CANCELLED, &(iocb)->ki_flags) +#define kiocbClearIntr(iocb) clear_bit(KIF_INTR, &(iocb)->ki_flags) +#define kiocbClearSynced(iocb) clear_bit(KIF_SYNCED, &(iocb)->ki_flags) #define kiocbIsLocked(iocb) test_bit(KIF_LOCKED, &(iocb)->ki_flags) #define kiocbIsKicked(iocb) test_bit(KIF_KICKED, &(iocb)->ki_flags) #define kiocbIsCancelled(iocb) test_bit(KIF_CANCELLED, &(iocb)->ki_flags) +#define kiocbIsIntr(iocb) test_bit(KIF_INTR, &(iocb)->ki_flags) +#define kiocbIsSynced(iocb) test_bit(KIF_SYNCED, &(iocb)->ki_flags) +#define kiocbIsDontUnlock(iocb) test_bit(KIF_DONT_UNLOCK, &(iocb)->ki_flags) struct kiocb { struct list_head ki_run_list; @@ -69,12 +86,22 @@ struct kiocb { size_t ki_nbytes; /* copy of iocb->aio_nbytes */ char __user *ki_buf; /* remaining iocb->aio_buf */ size_t ki_left; /* remaining bytes */ - wait_queue_t ki_wait; + struct wait_bit_queue ki_wait; long ki_retried; /* just for testing */ long ki_kicked; /* just for testing */ long ki_queued; /* just for testing */ + struct iovec *ki_iovec; + unsigned long ki_nr_segs; + unsigned long ki_cur_seg; void *private; + + /* to notify a process on I/O event only valid if ki_signo != 0 */ + pid_t ki_pid; + __u16 ki_signo; + __u16 ki_notify; + uid_t ki_uid, ki_euid; + sigval_t ki_sigev_value; }; #define is_sync_kiocb(iocb) ((iocb)->ki_key == KIOCB_SYNC_KEY) @@ -90,7 +117,7 @@ struct kiocb { (x)->ki_dtor = NULL; \ (x)->ki_obj.tsk = tsk; \ (x)->ki_user_data = 0; \ - init_wait((&(x)->ki_wait)); \ + init_wait_bit_task((&(x)->ki_wait), current);\ } while (0) #define AIO_RING_MAGIC 0xa10a10a1 @@ -154,6 +181,7 @@ struct kioctx { /* prototypes */ extern unsigned aio_max_size; +extern int aio_wake_function(wait_queue_t 
*wait, unsigned mode, int sync, void *key); extern ssize_t FASTCALL(wait_on_sync_kiocb(struct kiocb *iocb)); extern int FASTCALL(aio_put_req(struct kiocb *iocb)); extern void FASTCALL(kick_iocb(struct kiocb *iocb)); @@ -164,6 +192,8 @@ extern void FASTCALL(exit_aio(struct mm_ extern struct kioctx *lookup_ioctx(unsigned long ctx_id); extern int FASTCALL(io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, struct iocb *iocb)); +extern ssize_t aio_pread(struct kiocb *iocb); +extern ssize_t aio_pwrite(struct kiocb *iocb); /* semi private, but used by the 32bit emulations: */ struct kioctx *lookup_ioctx(unsigned long ctx_id); @@ -184,7 +214,8 @@ do { \ } \ } while (0) -#define io_wait_to_kiocb(wait) container_of(wait, struct kiocb, ki_wait) +#define io_wait_to_kiocb(io_wait) container_of(container_of(io_wait, \ + struct wait_bit_queue, wait), struct kiocb, ki_wait) #define is_retried_kiocb(iocb) ((iocb)->ki_retried > 1) #include diff -purN 00_v2.6.13-rc6/include/linux/aio_abi.h test-84.diff/include/linux/aio_abi.h --- 00_v2.6.13-rc6/include/linux/aio_abi.h 2005-06-20 13:33:38.000000000 -0400 +++ test-84.diff/include/linux/aio_abi.h 2005-08-10 10:47:36.000000000 -0400 @@ -41,6 +41,16 @@ enum { * IOCB_CMD_POLL = 5, */ IOCB_CMD_NOOP = 6, + IOCB_CMD_PREADV = 7, + IOCB_CMD_PWRITEV = 8, + + /* For these network operations: + * aio_buf points to struct msghdr *, + * aio_nbytes is sizeof(struct msghdr) + * aio_offset is the flags value. + */ + IOCB_CMD_SENDMSG = 9, + IOCB_CMD_RECVMSG = 10, }; /* read() from /dev/aio returns these structures. */ @@ -80,8 +90,9 @@ struct iocb { __u64 aio_nbytes; __s64 aio_offset; + __u64 aio_sigeventp; /* pointer to struct sigevent */ + /* extra parameters */ - __u64 aio_reserved2; /* TODO: use this for a (struct sigevent *) */ __u64 aio_reserved3; }; /* 64 bytes */ diff -purN 00_v2.6.13-rc6/include/linux/fs.h test-84.diff/include/linux/fs.h --- 00_v2.6.13-rc6/include/linux/fs.h 2005-08-04 15:56:06.000000000 -0400 +++ test-84.diff/include/linux/fs.h 2005-08-09 19:38:19.000000000 -0400 @@ -956,8 +956,10 @@ struct file_operations { loff_t (*llseek) (struct file *, loff_t, int); ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); ssize_t (*aio_read) (struct kiocb *, char __user *, size_t, loff_t); + ssize_t (*aio_readv) (struct kiocb *, const struct iovec *, unsigned long, loff_t); ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); ssize_t (*aio_write) (struct kiocb *, const char __user *, size_t, loff_t); + ssize_t (*aio_writev) (struct kiocb *, const struct iovec *, unsigned long, loff_t); int (*readdir) (struct file *, void *, filldir_t); unsigned int (*poll) (struct file *, struct poll_table_struct *); int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long); @@ -1007,6 +1009,11 @@ struct inode_operations { struct seq_file; +ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, + unsigned long nr_segs, unsigned long fast_segs, + struct iovec *fast_pointer, + struct iovec **ret_pointer); + extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t vfs_readv(struct file *, const struct iovec __user *, @@ -1498,8 +1505,10 @@ extern ssize_t generic_file_read(struct int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); extern ssize_t generic_file_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t generic_file_aio_read(struct 
kiocb *, char __user *, size_t, loff_t); +extern ssize_t generic_file_aio_readv(struct kiocb *, const struct iovec *, unsigned long, loff_t); extern ssize_t __generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t *); extern ssize_t generic_file_aio_write(struct kiocb *, const char __user *, size_t, loff_t); +extern ssize_t generic_file_aio_writev(struct kiocb *, const struct iovec *, unsigned long, loff_t); extern ssize_t generic_file_aio_write_nolock(struct kiocb *, const struct iovec *, unsigned long, loff_t *); extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *, diff -purN 00_v2.6.13-rc6/include/linux/net.h test-84.diff/include/linux/net.h --- 00_v2.6.13-rc6/include/linux/net.h 2005-06-20 13:33:40.000000000 -0400 +++ test-84.diff/include/linux/net.h 2005-08-10 15:17:32.000000000 -0400 @@ -186,6 +186,10 @@ extern int sock_sendmsg(struct s size_t len); extern int sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); +extern long vfs_sendmsg(struct socket *sock, struct msghdr __user *msg, + unsigned flags); +extern long vfs_recvmsg(struct socket *sock, struct msghdr __user *msg, + unsigned flags); extern int sock_map_fd(struct socket *sock); extern struct socket *sockfd_lookup(int fd, int *err); #define sockfd_put(sock) fput(sock->file) diff -purN 00_v2.6.13-rc6/include/linux/pagemap.h test-84.diff/include/linux/pagemap.h --- 00_v2.6.13-rc6/include/linux/pagemap.h 2005-08-04 15:56:06.000000000 -0400 +++ test-84.diff/include/linux/pagemap.h 2005-08-08 17:16:00.000000000 -0400 @@ -159,21 +159,25 @@ static inline pgoff_t linear_page_index( return pgoff >> (PAGE_CACHE_SHIFT - PAGE_SHIFT); } -extern void FASTCALL(__lock_page(struct page *page)); +extern int FASTCALL(lock_page_slow(struct page *page, wait_queue_t *wait)); extern void FASTCALL(unlock_page(struct page *page)); -static inline void lock_page(struct page *page) +static inline int __lock_page(struct page *page, wait_queue_t *wait) { might_sleep(); if (TestSetPageLocked(page)) - __lock_page(page); + return lock_page_slow(page, wait); + return 0; } + +#define lock_page(page) __lock_page(page, &current->__wait.wait) /* * This is exported only for wait_on_page_locked/wait_on_page_writeback. * Never use this directly! */ -extern void FASTCALL(wait_on_page_bit(struct page *page, int bit_nr)); +extern int FASTCALL(wait_on_page_bit(struct page *page, int bit_nr, + wait_queue_t *wait)); /* * Wait for a page to be unlocked. @@ -182,21 +186,30 @@ extern void FASTCALL(wait_on_page_bit(st * ie with increased "page->count" so that the page won't * go away during the wait.. 
*/ -static inline void wait_on_page_locked(struct page *page) +static inline int __wait_on_page_locked(struct page *page, wait_queue_t *wait) { if (PageLocked(page)) - wait_on_page_bit(page, PG_locked); + return wait_on_page_bit(page, PG_locked, wait); + return 0; } +#define wait_on_page_locked(page) \ + __wait_on_page_locked(page, &current->__wait.wait) + /* * Wait for a page to complete writeback */ -static inline void wait_on_page_writeback(struct page *page) +static inline int __wait_on_page_writeback(struct page *page, + wait_queue_t *wait) { if (PageWriteback(page)) - wait_on_page_bit(page, PG_writeback); + return wait_on_page_bit(page, PG_writeback, wait); + return 0; } +#define wait_on_page_writeback(page) \ + __wait_on_page_writeback(page, &current->__wait.wait) + extern void end_page_writeback(struct page *page); /* diff -purN 00_v2.6.13-rc6/include/linux/sched.h test-84.diff/include/linux/sched.h --- 00_v2.6.13-rc6/include/linux/sched.h 2005-08-04 15:56:07.000000000 -0400 +++ test-84.diff/include/linux/sched.h 2005-08-11 16:38:27.000000000 -0400 @@ -170,6 +170,7 @@ extern void show_stack(struct task_struc void io_schedule(void); long io_schedule_timeout(long timeout); +int io_wait_schedule(wait_queue_t *wait); extern void cpu_init (void); extern void trap_init(void); @@ -636,6 +637,7 @@ struct task_struct { struct list_head ptrace_list; struct mm_struct *mm, *active_mm; + struct mm_struct *aio_mm; /* task state */ struct linux_binfmt *binfmt; @@ -746,11 +748,14 @@ struct task_struct { unsigned long ptrace_message; siginfo_t *last_siginfo; /* For ptrace use. */ + +/* Space for default IO wait bit entry used for synchronous IO waits */ + struct wait_bit_queue __wait; /* - * current io wait handle: wait queue entry to use for io waits - * If this thread is processing aio, this points at the waitqueue - * inside the currently handled kiocb. It may be NULL (i.e. default - * to a stack based synchronous wait) if its doing sync IO. + * Current IO wait handle: wait queue entry to use for IO waits + * If this thread is processing AIO, this points at the waitqueue + * inside the currently handled kiocb. Otherwise, points to the + * default IO wait field (i.e &__wait.wait above). 
*/ wait_queue_t *io_wait; /* i/o counters(bytes read/written, #syscalls */ diff -purN 00_v2.6.13-rc6/include/linux/wait.h test-84.diff/include/linux/wait.h --- 00_v2.6.13-rc6/include/linux/wait.h 2005-08-04 15:56:07.000000000 -0400 +++ test-84.diff/include/linux/wait.h 2005-08-08 17:15:55.000000000 -0400 @@ -103,6 +103,17 @@ static inline int waitqueue_active(wait_ return !list_empty(&q->task_list); } +static inline int test_wait_bit_key(wait_queue_t *wait, + struct wait_bit_key *key) +{ + struct wait_bit_queue *wait_bit + = container_of(wait, struct wait_bit_queue, wait); + + return (wait_bit->key.flags == key->flags && + wait_bit->key.bit_nr == key->bit_nr && + !test_bit(key->bit_nr, key->flags)); +} + /* * Used to distinguish between sync and async io wait context: * sync i/o typically specifies a NULL wait queue entry or a wait @@ -140,11 +151,15 @@ void FASTCALL(__wake_up(wait_queue_head_ extern void FASTCALL(__wake_up_locked(wait_queue_head_t *q, unsigned int mode)); extern void FASTCALL(__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr)); void FASTCALL(__wake_up_bit(wait_queue_head_t *, void *, int)); -int FASTCALL(__wait_on_bit(wait_queue_head_t *, struct wait_bit_queue *, int (*)(void *), unsigned)); -int FASTCALL(__wait_on_bit_lock(wait_queue_head_t *, struct wait_bit_queue *, int (*)(void *), unsigned)); +int FASTCALL(__wait_on_bit(wait_queue_head_t *, struct wait_bit_queue *, + int (*)(void *, wait_queue_t *), unsigned)); +int FASTCALL(__wait_on_bit_lock(wait_queue_head_t *, struct wait_bit_queue *, + int (*)(void *, wait_queue_t *), unsigned)); void FASTCALL(wake_up_bit(void *, int)); -int FASTCALL(out_of_line_wait_on_bit(void *, int, int (*)(void *), unsigned)); -int FASTCALL(out_of_line_wait_on_bit_lock(void *, int, int (*)(void *), unsigned)); +int FASTCALL(out_of_line_wait_on_bit(void *, int, int (*)(void *, + wait_queue_t *), unsigned)); +int FASTCALL(out_of_line_wait_on_bit_lock(void *, int, int (*)(void *, + wait_queue_t *), unsigned)); wait_queue_head_t *FASTCALL(bit_waitqueue(void *, int)); #define wake_up(x) __wake_up(x, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1, NULL) @@ -407,6 +422,19 @@ int wake_bit_function(wait_queue_t *wait INIT_LIST_HEAD(&(wait)->task_list); \ } while (0) +#define init_wait_bit_key(waitbit, word, bit) \ + do { \ + (waitbit)->key.flags = word; \ + (waitbit)->key.bit_nr = bit; \ + } while (0) + +#define init_wait_bit_task(waitbit, tsk) \ + do { \ + (waitbit)->wait.private = tsk; \ + (waitbit)->wait.func = wake_bit_function; \ + INIT_LIST_HEAD(&(waitbit)->wait.task_list); \ + } while (0) + /** * wait_on_bit - wait for a bit to be cleared * @word: the word being waited on, a kernel virtual address @@ -422,7 +450,8 @@ int wake_bit_function(wait_queue_t *wait * but has no intention of setting it. */ static inline int wait_on_bit(void *word, int bit, - int (*action)(void *), unsigned mode) + int (*action)(void *, wait_queue_t *), + unsigned mode) { if (!test_bit(bit, word)) return 0; @@ -446,7 +475,8 @@ static inline int wait_on_bit(void *word * clear with the intention of setting it, and when done, clearing it. 
*/ static inline int wait_on_bit_lock(void *word, int bit, - int (*action)(void *), unsigned mode) + int (*action)(void *, wait_queue_t *), + unsigned mode) { if (!test_and_set_bit(bit, word)) return 0; diff -purN 00_v2.6.13-rc6/include/linux/writeback.h test-84.diff/include/linux/writeback.h --- 00_v2.6.13-rc6/include/linux/writeback.h 2005-08-04 15:56:07.000000000 -0400 +++ test-84.diff/include/linux/writeback.h 2005-08-08 17:15:51.000000000 -0400 @@ -70,7 +70,7 @@ struct writeback_control { */ void writeback_inodes(struct writeback_control *wbc); void wake_up_inode(struct inode *inode); -int inode_wait(void *); +int inode_wait(void *, wait_queue_t *); void sync_inodes_sb(struct super_block *, int wait); void sync_inodes(int wait); diff -purN 00_v2.6.13-rc6/kernel/exit.c test-84.diff/kernel/exit.c --- 00_v2.6.13-rc6/kernel/exit.c 2005-08-08 15:45:49.000000000 -0400 +++ test-84.diff/kernel/exit.c 2005-08-11 21:43:05.000000000 -0400 @@ -485,6 +485,8 @@ static void exit_mm(struct task_struct * { struct mm_struct *mm = tsk->mm; + if (tsk->aio_mm && atomic_dec_and_test(&mm->default_kioctx.users)) + exit_aio(mm); mm_release(tsk, mm); if (!mm) return; diff -purN 00_v2.6.13-rc6/kernel/fork.c test-84.diff/kernel/fork.c --- 00_v2.6.13-rc6/kernel/fork.c 2005-08-04 15:56:08.000000000 -0400 +++ test-84.diff/kernel/fork.c 2005-08-11 16:37:48.000000000 -0400 @@ -370,7 +370,6 @@ void fastcall __mmdrop(struct mm_struct void mmput(struct mm_struct *mm) { if (atomic_dec_and_test(&mm->mm_users)) { - exit_aio(mm); exit_mmap(mm); if (!list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); @@ -457,6 +456,7 @@ static int copy_mm(unsigned long clone_f tsk->mm = NULL; tsk->active_mm = NULL; + tsk->aio_mm = NULL; /* * Are we cloning a kernel thread? @@ -469,6 +469,7 @@ static int copy_mm(unsigned long clone_f if (clone_flags & CLONE_VM) { atomic_inc(&oldmm->mm_users); + atomic_inc(&oldmm->default_kioctx.users); mm = oldmm; /* * There are cases where the PTL is held to ensure no @@ -502,6 +503,7 @@ static int copy_mm(unsigned long clone_f good_mm: tsk->mm = mm; + tsk->aio_mm = mm; tsk->active_mm = mm; return 0; @@ -943,7 +945,8 @@ static task_t *copy_process(unsigned lon do_posix_clock_monotonic_gettime(&p->start_time); p->security = NULL; p->io_context = NULL; - p->io_wait = NULL; + init_wait_bit_task(&p->__wait, p); + p->io_wait = &p->__wait.wait; p->audit_context = NULL; #ifdef CONFIG_NUMA p->mempolicy = mpol_copy(p->mempolicy); diff -purN 00_v2.6.13-rc6/kernel/sched.c test-84.diff/kernel/sched.c --- 00_v2.6.13-rc6/kernel/sched.c 2005-08-04 15:56:08.000000000 -0400 +++ test-84.diff/kernel/sched.c 2005-08-08 17:16:00.000000000 -0400 @@ -3993,6 +3993,20 @@ long __sched io_schedule_timeout(long ti return ret; } +/* + * Sleep only if the wait context passed is not async, + * otherwise return so that a retry can be issued later. + */ +int __sched io_wait_schedule(wait_queue_t *wait) +{ + if (!is_sync_wait(wait)) + return -EIOCBRETRY; + io_schedule(); + return 0; +} + +EXPORT_SYMBOL(io_wait_schedule); + /** * sys_sched_get_priority_max - return maximum RT priority. * @policy: scheduling class. 
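A note on the changed wait interface (editorial sketch, not part of the patch): together with io_wait_schedule() added to kernel/sched.c above, the kernel/wait.c hunk below widens the wait-on-bit action callback to take the waiter's wait_queue_t, so an action can return -EIOCBRETRY for an async, iocb-backed waiter instead of sleeping. The in-tree example is sync_page() in the mm/filemap.c hunk further down; a minimal callback under the new signature (example_bit_action is a made-up name) would look like this:

/*
 * Sketch only.  For a synchronous waiter this blocks in io_schedule()
 * and returns 0; for an AIO retry waiter io_wait_schedule() returns
 * -EIOCBRETRY so the caller unwinds and is re-driven by kick_iocb()
 * when the bit waitqueue is woken.
 */
static int example_bit_action(void *word, wait_queue_t *wait)
{
	return io_wait_schedule(wait);
}

Because such a waiter must stay queued for the later wakeup, __wait_on_bit() and __wait_on_bit_lock() below skip finish_wait() when the action returned -EIOCBRETRY.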
diff -purN 00_v2.6.13-rc6/kernel/wait.c test-84.diff/kernel/wait.c --- 00_v2.6.13-rc6/kernel/wait.c 2004-12-24 16:35:27.000000000 -0500 +++ test-84.diff/kernel/wait.c 2005-08-08 17:15:58.000000000 -0400 @@ -132,16 +132,10 @@ EXPORT_SYMBOL(autoremove_wake_function); int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) { - struct wait_bit_key *key = arg; - struct wait_bit_queue *wait_bit - = container_of(wait, struct wait_bit_queue, wait); - - if (wait_bit->key.flags != key->flags || - wait_bit->key.bit_nr != key->bit_nr || - test_bit(key->bit_nr, key->flags)) + /* Assumes that a non-NULL key implies wait bit filtering */ + if (arg && !test_wait_bit_key(wait, arg)) return 0; - else - return autoremove_wake_function(wait, mode, sync, key); + return autoremove_wake_function(wait, mode, sync, arg); } EXPORT_SYMBOL(wake_bit_function); @@ -152,22 +146,28 @@ EXPORT_SYMBOL(wake_bit_function); */ int __sched fastcall __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, - int (*action)(void *), unsigned mode) + int (*action)(void *, wait_queue_t *), unsigned mode) { int ret = 0; do { prepare_to_wait(wq, &q->wait, mode); if (test_bit(q->key.bit_nr, q->key.flags)) - ret = (*action)(q->key.flags); + ret = (*action)(q->key.flags, &q->wait); } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); - finish_wait(wq, &q->wait); + /* + * AIO retries require the wait queue entry to remain queued + * for async notification + */ + if (ret != -EIOCBRETRY) + finish_wait(wq, &q->wait); return ret; } EXPORT_SYMBOL(__wait_on_bit); int __sched fastcall out_of_line_wait_on_bit(void *word, int bit, - int (*action)(void *), unsigned mode) + int (*action)(void *, wait_queue_t *), + unsigned mode) { wait_queue_head_t *wq = bit_waitqueue(word, bit); DEFINE_WAIT_BIT(wait, word, bit); @@ -178,24 +178,30 @@ EXPORT_SYMBOL(out_of_line_wait_on_bit); int __sched fastcall __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, - int (*action)(void *), unsigned mode) + int (*action)(void *, wait_queue_t *), unsigned mode) { int ret = 0; do { prepare_to_wait_exclusive(wq, &q->wait, mode); if (test_bit(q->key.bit_nr, q->key.flags)) { - if ((ret = (*action)(q->key.flags))) + if ((ret = (*action)(q->key.flags, &q->wait))) break; } } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); - finish_wait(wq, &q->wait); + /* + * AIO retries require the wait queue entry to remain queued + * for async notification + */ + if (ret != -EIOCBRETRY) + finish_wait(wq, &q->wait); return ret; } EXPORT_SYMBOL(__wait_on_bit_lock); int __sched fastcall out_of_line_wait_on_bit_lock(void *word, int bit, - int (*action)(void *), unsigned mode) + int (*action)(void *, wait_queue_t *wait), + unsigned mode) { wait_queue_head_t *wq = bit_waitqueue(word, bit); DEFINE_WAIT_BIT(wait, word, bit); diff -purN 00_v2.6.13-rc6/lib/Makefile test-84.diff/lib/Makefile --- 00_v2.6.13-rc6/lib/Makefile 2005-08-04 15:56:09.000000000 -0400 +++ test-84.diff/lib/Makefile 2005-08-08 17:15:28.000000000 -0400 @@ -18,6 +18,7 @@ endif lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o +lib-$(CONFIG_SEMAPHORE_SLEEPERS) += semaphore-sleepers.o lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o diff -purN 00_v2.6.13-rc6/lib/semaphore-sleepers.c test-84.diff/lib/semaphore-sleepers.c --- 00_v2.6.13-rc6/lib/semaphore-sleepers.c 1969-12-31 19:00:00.000000000 -0500 +++ 
test-84.diff/lib/semaphore-sleepers.c 2005-08-08 17:15:58.000000000 -0400 @@ -0,0 +1,253 @@ +/* + * i386 and x86-64 semaphore implementation. + * + * (C) Copyright 1999 Linus Torvalds + * + * Portions Copyright 1999 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * rw semaphores implemented November 1999 by Benjamin LaHaise + */ +#include +#include +#include +#include +#include +#include + +/* + * Semaphores are implemented using a two-way counter: + * The "count" variable is decremented for each process + * that tries to acquire the semaphore, while the "sleeping" + * variable is a count of such acquires. + * + * Notably, the inline "up()" and "down()" functions can + * efficiently test if they need to do any extra work (up + * needs to do something only if count was negative before + * the increment operation. + * + * "sleeping" and the contention routine ordering is protected + * by the spinlock in the semaphore's waitqueue head. + * + * Note that these functions are only called when there is + * contention on the lock, and as such all this is the + * "non-critical" part of the whole semaphore business. The + * critical part is the inline stuff in + * where we want to avoid any extra jumps and calls. + */ + +/* + * Logic: + * - only on a boundary condition do we need to care. When we go + * from a negative count to a non-negative, we wake people up. + * - when we go from a non-negative count to a negative do we + * (a) synchronize with the "sleeper" count and (b) make sure + * that we're on the wakeup list before we synchronize so that + * we cannot lose wakeup events. + */ + +fastcall void __up(struct semaphore *sem) +{ + wake_up(&sem->wait); +} + +fastcall void __sched __down(struct semaphore * sem) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + unsigned long flags; + + tsk->state = TASK_UNINTERRUPTIBLE; + spin_lock_irqsave(&sem->wait.lock, flags); + add_wait_queue_exclusive_locked(&sem->wait, &wait); + + sem->sleepers++; + for (;;) { + int sleepers = sem->sleepers; + + /* + * Add "everybody else" into it. They aren't + * playing, because we own the spinlock in + * the wait_queue_head. + */ + if (!atomic_add_negative(sleepers - 1, &sem->count)) { + sem->sleepers = 0; + break; + } + sem->sleepers = 1; /* us - see -1 above */ + spin_unlock_irqrestore(&sem->wait.lock, flags); + + schedule(); + + spin_lock_irqsave(&sem->wait.lock, flags); + tsk->state = TASK_UNINTERRUPTIBLE; + } + remove_wait_queue_locked(&sem->wait, &wait); + wake_up_locked(&sem->wait); + spin_unlock_irqrestore(&sem->wait.lock, flags); + tsk->state = TASK_RUNNING; +} + +static int aio_down_wait(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + struct kiocb *iocb = io_wait_to_kiocb(wait); + struct semaphore *sem = wait->private; + int sleepers = sem->sleepers; + + /* + * Add "everybody else" into it. They aren't + * playing, because we own the spinlock in + * the wait_queue_head. 
+ */ + if (!atomic_add_negative(sleepers - 1, &sem->count)) { + iocb->ki_wait.wait.func = aio_wake_function; + iocb->ki_cancel = NULL; + sem->sleepers = 0; + sem->aio_owner = iocb; + list_del_init(&wait->task_list); + wake_up_locked(&sem->wait); + kick_iocb(iocb); + return 1; + } + sem->sleepers = 1; /* us - see -1 above */ + + return 1; +} + +static void fixup_down_trylock_locked(struct semaphore *sem); +static int cancel_aio_down(struct kiocb *iocb, struct io_event *event) +{ + /* At this point, the kiocb is locked and even if we have kicked + * it, the pointer to the semaphore is still valid. + */ + struct semaphore *sem = iocb->ki_wait.wait.private; + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&sem->wait.lock, flags); + if (!list_empty(&iocb->ki_wait.wait.task_list)) { + /* Ensure aio_down_wait() can no longer be called. */ + list_del_init(&iocb->ki_wait.wait.task_list); + fixup_down_trylock_locked(sem); + event->res = is_sync_kiocb(iocb) ? -ERESTARTSYS : -EINTR; + } else + ret = -EAGAIN; /* we lost the race with aio_down_wait(). */ + spin_unlock_irqrestore(&sem->wait.lock, flags); + + return ret; +} + +fastcall long __sched __aio_down(struct kiocb *iocb, struct semaphore * sem) +{ + unsigned long flags; + + if (sem->aio_owner == iocb) { + atomic_inc(&sem->count); /* undo dec in aio_down() */ + return 0; + } + + iocb->ki_wait.wait.private = sem; + iocb->ki_wait.wait.func = aio_down_wait; + spin_lock_irqsave(&sem->wait.lock, flags); + add_wait_queue_exclusive_locked(&sem->wait, &iocb->ki_wait.wait); + + sem->sleepers++; + + iocb->ki_cancel = cancel_aio_down; + + aio_down_wait(&iocb->ki_wait.wait, 0, 0, NULL); + spin_unlock_irqrestore(&sem->wait.lock, flags); + return -EIOCBRETRY; +} + +fastcall int __sched __down_interruptible(struct semaphore * sem) +{ + int retval = 0; + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + unsigned long flags; + + tsk->state = TASK_INTERRUPTIBLE; + spin_lock_irqsave(&sem->wait.lock, flags); + add_wait_queue_exclusive_locked(&sem->wait, &wait); + + sem->sleepers++; + for (;;) { + int sleepers = sem->sleepers; + + /* + * With signals pending, this turns into + * the trylock failure case - we won't be + * sleeping, and we* can't get the lock as + * it has contention. Just correct the count + * and exit. + */ + if (signal_pending(current)) { + retval = -EINTR; + sem->sleepers = 0; + atomic_add(sleepers, &sem->count); + break; + } + + /* + * Add "everybody else" into it. They aren't + * playing, because we own the spinlock in + * wait_queue_head. The "-1" is because we're + * still hoping to get the semaphore. + */ + if (!atomic_add_negative(sleepers - 1, &sem->count)) { + sem->sleepers = 0; + break; + } + sem->sleepers = 1; /* us - see -1 above */ + spin_unlock_irqrestore(&sem->wait.lock, flags); + + schedule(); + + spin_lock_irqsave(&sem->wait.lock, flags); + tsk->state = TASK_INTERRUPTIBLE; + } + remove_wait_queue_locked(&sem->wait, &wait); + wake_up_locked(&sem->wait); + spin_unlock_irqrestore(&sem->wait.lock, flags); + + tsk->state = TASK_RUNNING; + return retval; +} + +/* + * Trylock failed - make sure we correct for + * having decremented the count. + * + * We could have done the trylock with a + * single "cmpxchg" without failure cases, + * but then it wouldn't work on a 386. + */ +static void fixup_down_trylock_locked(struct semaphore *sem) +{ + int sleepers; + sleepers = sem->sleepers + 1; + sem->sleepers = 0; + + /* + * Add "everybody else" and us into it. 
They aren't + * playing, because we own the spinlock in the + * wait_queue_head. + */ + if (!atomic_add_negative(sleepers, &sem->count)) + wake_up_locked(&sem->wait); +} + +fastcall int __down_trylock(struct semaphore * sem) +{ + unsigned long flags; + + spin_lock_irqsave(&sem->wait.lock, flags); + fixup_down_trylock_locked(sem); + spin_unlock_irqrestore(&sem->wait.lock, flags); + return 1; +} diff -purN 00_v2.6.13-rc6/mm/filemap.c test-84.diff/mm/filemap.c --- 00_v2.6.13-rc6/mm/filemap.c 2005-08-04 15:56:09.000000000 -0400 +++ test-84.diff/mm/filemap.c 2005-08-09 19:34:31.000000000 -0400 @@ -126,7 +126,7 @@ void remove_from_page_cache(struct page write_unlock_irq(&mapping->tree_lock); } -static int sync_page(void *word) +static int sync_page(void *word, wait_queue_t *wait) { struct address_space *mapping; struct page *page; @@ -158,8 +158,7 @@ static int sync_page(void *word) mapping = page_mapping(page); if (mapping && mapping->a_ops && mapping->a_ops->sync_page) mapping->a_ops->sync_page(page); - io_schedule(); - return 0; + return io_wait_schedule(wait); } /** @@ -223,10 +222,11 @@ EXPORT_SYMBOL(filemap_flush); /* * Wait for writeback to complete against pages indexed by start->end - * inclusive + * inclusive. In AIO context, this may queue an async notification + * and retry callback and return, instead of blocking the caller. */ -static int wait_on_page_writeback_range(struct address_space *mapping, - pgoff_t start, pgoff_t end) +static int __wait_on_page_writeback_range(struct address_space *mapping, + pgoff_t start, pgoff_t end, wait_queue_t *wait) { struct pagevec pvec; int nr_pages; @@ -238,20 +238,20 @@ static int wait_on_page_writeback_range( pagevec_init(&pvec, 0); index = start; - while ((index <= end) && + while (!ret && (index <= end) && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_WRITEBACK, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { unsigned i; - for (i = 0; i < nr_pages; i++) { + for (i = 0; !ret && (i < nr_pages); i++) { struct page *page = pvec.pages[i]; /* until radix tree lookup accepts end_index */ if (page->index > end) continue; - wait_on_page_writeback(page); + ret = __wait_on_page_writeback(page, wait); if (PageError(page)) ret = -EIO; } @@ -268,6 +268,14 @@ static int wait_on_page_writeback_range( return ret; } +static inline int wait_on_page_writeback_range(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ + return __wait_on_page_writeback_range(mapping, start, end, + ¤t->__wait.wait); +} + + /* * Write and wait upon all the pages in the passed range. This is a "data * integrity" operation. It waits upon in-flight writeout before starting and @@ -281,18 +289,27 @@ int sync_page_range(struct inode *inode, { pgoff_t start = pos >> PAGE_CACHE_SHIFT; pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; - int ret; + int ret = 0; if (!mapping_cap_writeback_dirty(mapping) || !count) return 0; + if (in_aio()) { + /* Already issued writeouts for this iocb ? 
*/ + if (kiocbTrySync(io_wait_to_kiocb(current->io_wait))) + goto do_wait; /* just need to check if done */ + } ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); - if (ret == 0) { + + if (ret >= 0) { down(&inode->i_sem); ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); up(&inode->i_sem); } - if (ret == 0) - ret = wait_on_page_writeback_range(mapping, start, end); +do_wait: + if (ret >= 0) { + ret = __wait_on_page_writeback_range(mapping, start, end, + current->io_wait); + } return ret; } EXPORT_SYMBOL(sync_page_range); @@ -307,15 +324,23 @@ int sync_page_range_nolock(struct inode { pgoff_t start = pos >> PAGE_CACHE_SHIFT; pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; - int ret; + int ret = 0; if (!mapping_cap_writeback_dirty(mapping) || !count) return 0; + if (in_aio()) { + /* Already issued writeouts for this iocb ? */ + if (kiocbTrySync(io_wait_to_kiocb(current->io_wait))) + goto do_wait; /* just need to check if done */ + } ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); - if (ret == 0) + if (ret >= 0) ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); - if (ret == 0) - ret = wait_on_page_writeback_range(mapping, start, end); +do_wait: + if (ret >= 0) { + ret = __wait_on_page_writeback_range(mapping, start, end, + current->io_wait); + } return ret; } EXPORT_SYMBOL(sync_page_range_nolock); @@ -428,13 +453,17 @@ static inline void wake_up_page(struct p __wake_up_bit(page_waitqueue(page), &page->flags, bit); } -void fastcall wait_on_page_bit(struct page *page, int bit_nr) +int fastcall wait_on_page_bit(struct page *page, int bit_nr, + wait_queue_t *wait) { - DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); - - if (test_bit(bit_nr, &page->flags)) - __wait_on_bit(page_waitqueue(page), &wait, sync_page, + if (test_bit(bit_nr, &page->flags)) { + struct wait_bit_queue *wait_bit + = container_of(wait, struct wait_bit_queue, wait); + init_wait_bit_key(wait_bit, &page->flags, bit_nr); + return __wait_on_bit(page_waitqueue(page), wait_bit, sync_page, TASK_UNINTERRUPTIBLE); + } + return 0; } EXPORT_SYMBOL(wait_on_page_bit); @@ -478,21 +507,23 @@ void end_page_writeback(struct page *pag EXPORT_SYMBOL(end_page_writeback); /* - * Get a lock on the page, assuming we need to sleep to get it. + * Get a lock on the page, assuming we need to wait to get it. * * Ugly: running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some * random driver's requestfn sets TASK_RUNNING, we could busywait. However * chances are that on the second loop, the block layer's plug list is empty, * so sync_page() will then return in state TASK_UNINTERRUPTIBLE. 
*/ -void fastcall __lock_page(struct page *page) +int fastcall lock_page_slow(struct page *page, wait_queue_t *wait) { - DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); + struct wait_bit_queue *wait_bit + = container_of(wait, struct wait_bit_queue, wait); - __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page, + init_wait_bit_key(wait_bit, &page->flags, PG_locked); + return __wait_on_bit_lock(page_waitqueue(page), wait_bit, sync_page, TASK_UNINTERRUPTIBLE); } -EXPORT_SYMBOL(__lock_page); +EXPORT_SYMBOL(lock_page_slow); /* * a rather lightweight function, finding and getting a reference to a @@ -740,6 +771,11 @@ void do_generic_mapping_read(struct addr if (!isize) goto out; + if (in_aio()) { + /* Avoid repeat readahead */ + if (is_retried_kiocb(io_wait_to_kiocb(current->io_wait))) + next_index = last_index; + } end_index = (isize - 1) >> PAGE_CACHE_SHIFT; for (;;) { struct page *page; @@ -809,7 +845,11 @@ page_ok: page_not_up_to_date: /* Get exclusive access to the page ... */ - lock_page(page); + + if ((error = __lock_page(page, current->io_wait))) { + pr_debug("queued lock page \n"); + goto readpage_error; + } /* Did it get unhashed before we got the lock? */ if (!page->mapping) { @@ -832,7 +872,8 @@ readpage: goto readpage_error; if (!PageUptodate(page)) { - lock_page(page); + if ((error = __lock_page(page, current->io_wait))) + goto readpage_error; if (!PageUptodate(page)) { if (page->mapping == NULL) { /* @@ -877,7 +918,11 @@ readpage: goto page_ok; readpage_error: - /* UHHUH! A synchronous read error occurred. Report it */ + /* We don't have uptodate data in the page yet */ + /* Could be due to an error or because we need to + * retry when we get an async i/o notification. + * Report the reason. + */ desc->error = error; page_cache_release(page); goto out; @@ -1051,6 +1096,15 @@ generic_file_aio_read(struct kiocb *iocb EXPORT_SYMBOL(generic_file_aio_read); ssize_t +generic_file_aio_readv(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + BUG_ON(iocb->ki_pos != pos); + return __generic_file_aio_read(iocb, iov, nr_segs, &iocb->ki_pos); +} +EXPORT_SYMBOL(generic_file_aio_readv); + +ssize_t generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { struct iovec local_iov = { .iov_base = buf, .iov_len = count }; @@ -1983,7 +2037,7 @@ generic_file_buffered_write(struct kiocb */ if (likely(status >= 0)) { if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - if (!a_ops->writepage || !is_sync_kiocb(iocb)) + if (!a_ops->writepage) status = generic_osync_inode(inode, mapping, OSYNC_METADATA|OSYNC_DATA); } @@ -2090,14 +2144,23 @@ generic_file_aio_write_nolock(struct kio ssize_t ret; loff_t pos = *ppos; + if (!is_sync_kiocb(iocb) && kiocbIsSynced(iocb)) { + /* nothing to transfer, may just need to sync data */ + ret = iov->iov_len; /* vector AIO not supported yet */ + goto osync; + } + ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, ppos); +osync: if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { int err; err = sync_page_range_nolock(inode, mapping, pos, ret); - if (err < 0) - ret = err; + if (err < 0) { + ret = err; + *ppos = pos; + } } return ret; } @@ -2131,32 +2194,47 @@ generic_file_write_nolock(struct file *f } EXPORT_SYMBOL(generic_file_write_nolock); -ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf, - size_t count, loff_t pos) +ssize_t generic_file_aio_writev(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { struct file *file = 
iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; ssize_t ret; - struct iovec local_iov = { .iov_base = (void __user *)buf, - .iov_len = count }; - BUG_ON(iocb->ki_pos != pos); + if (!is_sync_kiocb(iocb) && kiocbIsSynced(iocb)) { + /* nothing to transfer, may just need to sync data */ + ret = iocb->ki_left; + goto osync; + } - down(&inode->i_sem); - ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1, - &iocb->ki_pos); - up(&inode->i_sem); + ret = aio_down(iocb, &inode->i_sem); + if (ret) + return ret; + ret = generic_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos); + aio_up(iocb, &inode->i_sem); +osync: if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { ssize_t err; err = sync_page_range(inode, mapping, pos, ret); - if (err < 0) + if (err < 0) { ret = err; + iocb->ki_pos = pos; + } } return ret; } +EXPORT_SYMBOL(generic_file_aio_writev); + +ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf, + size_t count, loff_t pos) +{ + struct iovec local_iov = { .iov_base = (void __user *)buf, + .iov_len = count }; + return generic_file_aio_writev(iocb, &local_iov, 1, pos); +} EXPORT_SYMBOL(generic_file_aio_write); ssize_t generic_file_write(struct file *file, const char __user *buf, diff -purN 00_v2.6.13-rc6/net/socket.c test-84.diff/net/socket.c --- 00_v2.6.13-rc6/net/socket.c 2005-08-04 15:56:11.000000000 -0400 +++ test-84.diff/net/socket.c 2005-08-10 15:29:53.000000000 -0400 @@ -108,6 +108,10 @@ static unsigned int sock_poll(struct fil static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg); static int sock_fasync(int fd, struct file *filp, int on); +static ssize_t sock_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos); +static ssize_t sock_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos); static ssize_t sock_readv(struct file *file, const struct iovec *vector, unsigned long count, loff_t *ppos); static ssize_t sock_writev(struct file *file, const struct iovec *vector, @@ -124,8 +128,12 @@ static ssize_t sock_sendpage(struct file static struct file_operations socket_file_ops = { .owner = THIS_MODULE, .llseek = no_llseek, +#if 0 .aio_read = sock_aio_read, .aio_write = sock_aio_write, +#endif + .read = sock_read, + .write = sock_write, .poll = sock_poll, .unlocked_ioctl = sock_ioctl, .mmap = sock_mmap, @@ -404,6 +412,7 @@ int sock_map_fd(struct socket *sock) file->f_mode = FMODE_READ | FMODE_WRITE; file->f_flags = O_RDWR; file->f_pos = 0; + file->private_data = sock; fd_install(fd, file); } @@ -436,6 +445,9 @@ struct socket *sockfd_lookup(int fd, int return NULL; } + if (file->f_op == &socket_file_ops) + return file->private_data; + inode = file->f_dentry->d_inode; if (!S_ISSOCK(inode->i_mode)) { *err = -ENOTSOCK; @@ -638,7 +650,6 @@ static void sock_aio_dtor(struct kiocb * * Read data from a socket. ubuf is a user mode pointer. We make sure the user * area ubuf...ubuf+size-1 is writable before asking the protocol. 
*/ - static ssize_t sock_aio_read(struct kiocb *iocb, char __user *ubuf, size_t size, loff_t pos) { @@ -772,7 +783,13 @@ static ssize_t sock_readv(struct file *f return sock_readv_writev(VERIFY_WRITE, file->f_dentry->d_inode, file, vector, count, tot_len); } - + +static ssize_t sock_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) +{ + struct iovec local_iov = { .iov_base = buf, .iov_len = len }; + return sock_readv(file, &local_iov, 1, ppos); +} + static ssize_t sock_writev(struct file *file, const struct iovec *vector, unsigned long count, loff_t *ppos) { @@ -784,6 +801,13 @@ static ssize_t sock_writev(struct file * file, vector, count, tot_len); } +static ssize_t sock_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos) +{ + struct iovec local_iov = { .iov_base = (void __user *)buf, + .iov_len = len }; + return sock_writev(file, &local_iov, 1, ppos); +} + /* * Atomic setting of ioctl hooks to avoid race @@ -1690,8 +1714,21 @@ asmlinkage long sys_shutdown(int fd, int asmlinkage long sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags) { - struct compat_msghdr __user *msg_compat = (struct compat_msghdr __user *)msg; struct socket *sock; + int err; + + sock = sockfd_lookup(fd, &err); + if (!sock) + goto out; + err = vfs_sendmsg(sock, msg, flags); + sockfd_put(sock); +out: + return err; +} + +long vfs_sendmsg(struct socket *sock, struct msghdr __user *msg, unsigned flags) +{ + struct compat_msghdr __user *msg_compat = (struct compat_msghdr __user *)msg; char address[MAX_SOCK_ADDR]; struct iovec iovstack[UIO_FASTIOV], *iov = iovstack; unsigned char ctl[sizeof(struct cmsghdr) + 20]; /* 20 is size of ipv6_pktinfo */ @@ -1706,14 +1743,10 @@ asmlinkage long sys_sendmsg(int fd, stru } else if (copy_from_user(&msg_sys, msg, sizeof(struct msghdr))) return -EFAULT; - sock = sockfd_lookup(fd, &err); - if (!sock) - goto out; - /* do not move before msg_sys is valid */ err = -EMSGSIZE; if (msg_sys.msg_iovlen > UIO_MAXIOV) - goto out_put; + goto out; /* Check whether to allocate the iovec area*/ err = -ENOMEM; @@ -1721,7 +1754,7 @@ asmlinkage long sys_sendmsg(int fd, stru if (msg_sys.msg_iovlen > UIO_FASTIOV) { iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL); if (!iov) - goto out_put; + goto out; } /* This will also move the address data into kernel space */ @@ -1772,8 +1805,6 @@ out_freectl: out_freeiov: if (iov != iovstack) sock_kfree_s(sock->sk, iov, iov_size); -out_put: - sockfd_put(sock); out: return err; } @@ -1784,8 +1815,21 @@ out: asmlinkage long sys_recvmsg(int fd, struct msghdr __user *msg, unsigned int flags) { - struct compat_msghdr __user *msg_compat = (struct compat_msghdr __user *)msg; struct socket *sock; + int err; + + sock = sockfd_lookup(fd, &err); + if (!sock) + goto out; + err = vfs_recvmsg(sock, msg, flags); + sockfd_put(sock); +out: + return err; +} + +long vfs_recvmsg(struct socket *sock, struct msghdr __user *msg, unsigned int flags) +{ + struct compat_msghdr __user *msg_compat = (struct compat_msghdr __user *)msg; struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov=iovstack; struct msghdr msg_sys; @@ -1806,13 +1850,9 @@ asmlinkage long sys_recvmsg(int fd, stru if (copy_from_user(&msg_sys,msg,sizeof(struct msghdr))) return -EFAULT; - sock = sockfd_lookup(fd, &err); - if (!sock) - goto out; - err = -EMSGSIZE; if (msg_sys.msg_iovlen > UIO_MAXIOV) - goto out_put; + goto out; /* Check whether to allocate the iovec area*/ err = -ENOMEM; @@ -1820,7 +1860,7 @@ asmlinkage long sys_recvmsg(int fd, stru if (msg_sys.msg_iovlen > 
UIO_FASTIOV) { iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL); if (!iov) - goto out_put; + goto out; } /* @@ -1871,8 +1911,6 @@ asmlinkage long sys_recvmsg(int fd, stru out_freeiov: if (iov != iovstack) sock_kfree_s(sock->sk, iov, iov_size); -out_put: - sockfd_put(sock); out: return err; }
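Usage note (editorial sketch, not part of the patch): the aio_down()/aio_up() pair introduced in the semaphore headers and lib/semaphore-sleepers.c is what lets generic_file_aio_writev() above take i_sem without blocking the submitting task. A hypothetical aio_write-style method (my_aio_write() and my_do_write() are made-up names) would use it roughly as follows:

/*
 * Sketch of the aio_down()/aio_up() pattern.  On contention aio_down()
 * queues the iocb on the semaphore's waitqueue and returns -EIOCBRETRY;
 * when aio_down_wait() later grants ownership it kicks the iocb, and on
 * the retry pass aio_down() finds sem->aio_owner == iocb and succeeds
 * without sleeping.
 */
static ssize_t my_do_write(struct kiocb *iocb);	/* hypothetical worker */

static ssize_t my_aio_write(struct kiocb *iocb, const char __user *buf,
			    size_t count, loff_t pos)
{
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	ssize_t ret;

	ret = aio_down(iocb, &inode->i_sem);
	if (ret)
		return ret;		/* -EIOCBRETRY: retried later */
	ret = my_do_write(iocb);
	aio_up(iocb, &inode->i_sem);	/* clears aio_owner, then up() */
	return ret;
}

Compare with generic_file_aio_writev() in the mm/filemap.c hunk, which additionally short-circuits to the O_SYNC path via kiocbIsSynced() when the data was already transferred on an earlier pass.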