Index: linux-2.1.86-mm/arch/alpha/kernel/osf_sys.c diff -u linux-2.1.86-mm/arch/alpha/kernel/osf_sys.c:1.1.1.3 linux-2.1.86-mm/arch/alpha/kernel/osf_sys.c:1.2 --- linux-2.1.86-mm/arch/alpha/kernel/osf_sys.c:1.1.1.3 Fri Feb 27 01:53:10 1998 +++ linux-2.1.86-mm/arch/alpha/kernel/osf_sys.c Fri Mar 6 03:51:23 1998 @@ -1163,7 +1163,7 @@ r.ru_stime.tv_usec = CT_TO_USECS(current->times.tms_stime); r.ru_minflt = current->min_flt; r.ru_majflt = current->maj_flt; - r.ru_nswap = current->nswap; + r.ru_nswap = current->mm->nswap; break; case RUSAGE_CHILDREN: r.ru_utime.tv_sec = CT_TO_SECS(current->times.tms_cutime); Index: linux-2.1.86-mm/arch/i386/kernel/traps.c diff -u linux-2.1.86-mm/arch/i386/kernel/traps.c:1.1.1.2 linux-2.1.86-mm/arch/i386/kernel/traps.c:1.2 --- linux-2.1.86-mm/arch/i386/kernel/traps.c:1.1.1.2 Sat Feb 21 18:55:43 1998 +++ linux-2.1.86-mm/arch/i386/kernel/traps.c Wed Mar 11 20:01:28 1998 @@ -190,6 +190,8 @@ printk("%s: %04lx\n", str, err & 0xffff); show_registers(regs); spin_unlock_irq(&die_lock); + printk("dead.\n"); + for (;;) ; do_exit(SIGSEGV); } Index: linux-2.1.86-mm/drivers/char/mem.c diff -u linux-2.1.86-mm/drivers/char/mem.c:1.1.1.2 linux-2.1.86-mm/drivers/char/mem.c:1.2 --- linux-2.1.86-mm/drivers/char/mem.c:1.1.1.2 Mon Mar 2 22:28:42 1998 +++ linux-2.1.86-mm/drivers/char/mem.c Fri Mar 6 03:51:32 1998 @@ -260,12 +260,15 @@ struct vm_area_struct * vma; unsigned long addr=(unsigned long)buf; + /* Oops, this was forgotten before. -ben */ + down(¤t->mm->mmap_sem); + /* For private mappings, just map in zero pages. */ for (vma = find_vma(current->mm, addr); vma; vma = vma->vm_next) { unsigned long count; if (vma->vm_start > addr || (vma->vm_flags & VM_WRITE) == 0) - return size; + goto out_up; if (vma->vm_flags & VM_SHARED) break; count = vma->vm_end - addr; @@ -273,16 +276,18 @@ count = size; flush_cache_range(current->mm, addr, addr + count); - zap_page_range(current->mm, addr, count); - zeromap_page_range(addr, count, PAGE_COPY); + zap_page_range(vma, addr, count); + zeromap_page_range(vma, addr, count, PAGE_COPY); flush_tlb_range(current->mm, addr, addr + count); size -= count; buf += count; addr += count; if (size == 0) - return 0; + goto out_up; } + + up(¤t->mm->mmap_sem); /* The shared case is hard. Lets do the conventional zeroing. 
*/ do { @@ -296,6 +301,9 @@ } while (size); return size; +out_up: + up(¤t->mm->mmap_sem); + return size; } static ssize_t read_zero(struct file * file, char * buf, @@ -340,7 +348,7 @@ { if (vma->vm_flags & VM_SHARED) return -EINVAL; - if (zeromap_page_range(vma->vm_start, vma->vm_end - vma->vm_start, vma->vm_page_prot)) + if (zeromap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, vma->vm_page_prot)) return -EAGAIN; return 0; } Index: linux-2.1.86-mm/fs/exec.c diff -u linux-2.1.86-mm/fs/exec.c:1.1.1.2 linux-2.1.86-mm/fs/exec.c:1.2 --- linux-2.1.86-mm/fs/exec.c:1.1.1.2 Mon Mar 2 22:21:57 1998 +++ linux-2.1.86-mm/fs/exec.c Mon Mar 9 03:08:42 1998 @@ -324,14 +324,15 @@ mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); if (mpnt) { mpnt->vm_mm = current->mm; - mpnt->vm_start = PAGE_MASK & (unsigned long) p; + mpnt->vm_offset = mpnt->vm_start = PAGE_MASK & (unsigned long) p; mpnt->vm_end = STACK_TOP; mpnt->vm_page_prot = PAGE_COPY; mpnt->vm_flags = VM_STACK_FLAGS; mpnt->vm_ops = NULL; - mpnt->vm_offset = 0; mpnt->vm_file = NULL; mpnt->vm_pte = 0; + mpnt->vm_prev_private = NULL; + mpnt->vm_private_count = 0; insert_vm_struct(current->mm, mpnt); current->mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; } @@ -339,7 +340,7 @@ for (i = 0 ; i < MAX_ARG_PAGES ; i++) { if (bprm->page[i]) { current->mm->rss++; - put_dirty_page(current,bprm->page[i],stack_base); + put_dirty_page(current, mpnt,bprm->page[i],stack_base); } stack_base += PAGE_SIZE; } Index: linux-2.1.86-mm/fs/proc/array.c diff -u linux-2.1.86-mm/fs/proc/array.c:1.1.1.2 linux-2.1.86-mm/fs/proc/array.c:1.2 --- linux-2.1.86-mm/fs/proc/array.c:1.1.1.2 Mon Mar 2 22:22:07 1998 +++ linux-2.1.86-mm/fs/proc/array.c Fri Mar 6 03:51:38 1998 @@ -851,7 +851,7 @@ sigign_str, sigcatch_str, wchan, - tsk->nswap, + tsk->mm->nswap, tsk->cnswap); } Index: linux-2.1.86-mm/include/linux/mm.h diff -u linux-2.1.86-mm/include/linux/mm.h:1.1.1.3 linux-2.1.86-mm/include/linux/mm.h:1.7 --- linux-2.1.86-mm/include/linux/mm.h:1.1.1.3 Mon Mar 2 22:23:20 1998 +++ linux-2.1.86-mm/include/linux/mm.h Thu Mar 12 16:01:36 1998 @@ -46,10 +46,17 @@ struct vm_area_struct *vm_next_share; struct vm_area_struct **vm_pprev_share; + /* Mappings that are private, yet shared. Derived from fork()ing. + */ + struct vm_area_struct *vm_next_private; + struct vm_area_struct *vm_prev_private; + struct vm_operations_struct * vm_ops; unsigned long vm_offset; struct file * vm_file; unsigned long vm_pte; /* shared mem */ + + unsigned long vm_private_count; /* some more santity checking for now. -ben */ }; /* @@ -100,6 +107,7 @@ unsigned long page); int (*swapout)(struct vm_area_struct *, unsigned long, pte_t *); pte_t (*swapin)(struct vm_area_struct *, unsigned long, unsigned long); + int (*unuse)(struct vm_area_struct *, struct page *, pte_t *); /* must not block, must be irq safe */ }; /* @@ -112,25 +120,58 @@ */ typedef struct page { /* these must be first (free area handling) */ - struct page *next; - struct page *prev; + union { + struct { + struct page *next; + struct page *prev; + } normal; /* used by inode pages and free area handling */ + struct { + struct vm_area_struct *vma; + unsigned long vm_offset; + } private; /* used for private mappings. 
*/ + } u; + struct inode *inode; unsigned long offset; + struct page *next_hash; atomic_t count; - unsigned int age; unsigned long flags; /* atomic flags, some possibly updated asynchronously */ struct wait_queue *wait; + struct page **pprev_hash; struct buffer_head * buffers; +#if 0 + /* page on one of the circular page_queues */ + struct page *pgq_next; + struct page *pgq_prev; +#endif unsigned long map_nr; /* page->map_nr == page - mem_map */ + unsigned int age; /* this should disappear soon */ } mem_map_t; +/* uses a dummy struct page so we get next & prev for beginning/end of lists */ +extern struct page page_queues[]; +extern atomic_t page_queues_cnt[]; + +#define PgQ_Locked 0 /* page is unswappable - mlock()'d */ +#define PgQ_Active 1 /* page is mapped and active -> young */ +#define PgQ_Inactive 2 /* page is mapped, but hasn't been referenced recently -> old */ +#define PgQ_Swappable 3 /* page has no mappings, is dirty */ +#define PgQ_Swapping 4 /* page is being swapped */ +#define PgQ_Dumpable 5 /* page has no mappings, is not dirty, but is still in the page cache */ + +#define NR_PAGE_QUEUE (PgQ_Dumpable+1) + +/* The low 3 bits of page->flag have been snarfed to index into page_queues */ +#define PGmask_pgq 0x7 + /* Page flag bit values */ -#define PG_locked 0 -#define PG_error 1 -#define PG_referenced 2 -#define PG_uptodate 3 +#define PG_on_queue 3 +#define PG_locked 10 +#define PG_error 11 +#define PG_referenced 12 +#define PG_uptodate 13 #define PG_free_after 4 #define PG_decr_after 5 /* Unused 6 */ @@ -140,6 +181,7 @@ #define PG_reserved 31 /* Make it prettier to test the above... */ +#define PageOnQueue(page) (test_bit(PG_on_queue, &(page)->flags)) #define PageLocked(page) (test_bit(PG_locked, &(page)->flags)) #define PageError(page) (test_bit(PG_error, &(page)->flags)) #define PageReferenced(page) (test_bit(PG_referenced, &(page)->flags)) @@ -258,18 +300,21 @@ extern void FASTCALL(__free_page(struct page *)); extern void show_free_areas(void); -extern unsigned long put_dirty_page(struct task_struct * tsk,unsigned long page, - unsigned long address); +extern unsigned long put_dirty_page(struct task_struct * tsk, struct vm_area_struct *vma, + unsigned long page, unsigned long address); + +extern int is_vma_on_private_list(struct vm_area_struct *vma, struct vm_area_struct *head); +extern void remove_vma_from_page(struct vm_area_struct *vma, struct page *page); extern void free_page_tables(struct mm_struct * mm); extern void clear_page_tables(struct task_struct * tsk); extern int new_page_tables(struct task_struct * tsk); extern int copy_page_tables(struct task_struct * to); -extern void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size); +extern void zap_page_range(struct vm_area_struct *mm, unsigned long address, unsigned long size); extern int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); extern int remap_page_range(unsigned long from, unsigned long to, unsigned long size, pgprot_t prot); -extern int zeromap_page_range(unsigned long from, unsigned long size, pgprot_t prot); +extern int zeromap_page_range(struct vm_area_struct *vma, unsigned long from, unsigned long size, pgprot_t prot); extern void vmtruncate(struct inode * inode, unsigned long offset); extern void handle_mm_fault(struct task_struct *tsk,struct vm_area_struct *vma, unsigned long address, int write_access); @@ -286,9 +331,11 @@ unsigned long prot, unsigned long flags, unsigned long off); extern void merge_segments(struct mm_struct *, 
unsigned long, unsigned long); extern void insert_vm_struct(struct mm_struct *, struct vm_area_struct *); +extern void remove_shared_vm_struct(struct vm_area_struct *); extern void exit_mmap(struct mm_struct *); extern int do_munmap(unsigned long, size_t); extern unsigned long get_unmapped_area(unsigned long, unsigned long); +extern void change_page_vmas_on_split(struct vm_area_struct *from, struct vm_area_struct *to); /* filemap.c */ extern unsigned long page_unuse(unsigned long); Index: linux-2.1.86-mm/include/linux/pagemap.h diff -u linux-2.1.86-mm/include/linux/pagemap.h:1.1.1.1 linux-2.1.86-mm/include/linux/pagemap.h:1.2 --- linux-2.1.86-mm/include/linux/pagemap.h:1.1.1.1 Wed Feb 18 23:35:42 1998 +++ linux-2.1.86-mm/include/linux/pagemap.h Fri Mar 6 03:52:22 1998 @@ -103,13 +103,13 @@ page->inode = NULL; inode->i_nrpages--; if (inode->i_pages == page) - inode->i_pages = page->next; - if (page->next) - page->next->prev = page->prev; - if (page->prev) - page->prev->next = page->next; - page->next = NULL; - page->prev = NULL; + inode->i_pages = page->u.normal.next; + if (page->u.normal.next) + page->u.normal.next->u.normal.prev = page->u.normal.prev; + if (page->u.normal.prev) + page->u.normal.prev->u.normal.next = page->u.normal.next; + page->u.normal.next = NULL; + page->u.normal.prev = NULL; } static inline void add_page_to_inode_queue(struct inode * inode, struct page * page) @@ -118,9 +118,9 @@ inode->i_nrpages++; page->inode = inode; - page->prev = NULL; - if ((page->next = *p) != NULL) - page->next->prev = page; + page->u.normal.prev = NULL; + if ((page->u.normal.next = *p) != NULL) + page->u.normal.next->u.normal.prev = page; *p = page; } Index: linux-2.1.86-mm/include/linux/sched.h diff -u linux-2.1.86-mm/include/linux/sched.h:1.1.1.1 linux-2.1.86-mm/include/linux/sched.h:1.2 --- linux-2.1.86-mm/include/linux/sched.h:1.1.1.1 Wed Feb 18 23:35:39 1998 +++ linux-2.1.86-mm/include/linux/sched.h Fri Mar 6 03:52:22 1998 @@ -155,6 +155,7 @@ unsigned long rss, total_vm, locked_vm; unsigned long def_flags; unsigned long cpu_vm_mask; + unsigned long nswap; /* new swapper doesn't access page tables via tasks */ }; #define INIT_MM { \ @@ -236,7 +237,7 @@ unsigned long start_time; long per_cpu_utime[NR_CPUS], per_cpu_stime[NR_CPUS]; /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ - unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap; + unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt, cnswap; int swappable:1; unsigned long swap_address; unsigned long old_maj_flt; /* old value of maj_flt */ @@ -333,7 +334,7 @@ /* timer */ { NULL, NULL, 0, 0, it_real_fn }, \ /* utime */ {0,0,0,0},0, \ /* per cpu times */ {0, }, {0, }, \ -/* flt */ 0,0,0,0,0,0, \ +/* flt */ 0,0,0,0,0, \ /* swp */ 0,0,0,0,0, \ /* rlimits */ INIT_RLIMITS, \ /* math */ 0, \ Index: linux-2.1.86-mm/include/linux/swap.h diff -u linux-2.1.86-mm/include/linux/swap.h:1.1.1.3 linux-2.1.86-mm/include/linux/swap.h:1.2 --- linux-2.1.86-mm/include/linux/swap.h:1.1.1.3 Fri Feb 27 01:49:51 1998 +++ linux-2.1.86-mm/include/linux/swap.h Wed Mar 11 20:01:44 1998 @@ -57,7 +57,7 @@ extern void rw_swap_page_nocache(int, unsigned long, char *); /* linux/mm/page_alloc.c */ -extern void swap_in(struct task_struct *, struct vm_area_struct *, +extern void swap_in(struct task_struct *, struct vm_area_struct *, unsigned long, pte_t *, unsigned long, int); Index: linux-2.1.86-mm/include/linux/swapctl.h diff -u linux-2.1.86-mm/include/linux/swapctl.h:1.1.1.1 
linux-2.1.86-mm/include/linux/swapctl.h:1.3 --- linux-2.1.86-mm/include/linux/swapctl.h:1.1.1.1 Wed Feb 18 23:35:42 1998 +++ linux-2.1.86-mm/include/linux/swapctl.h Thu Mar 12 00:28:50 1998 @@ -38,6 +38,11 @@ unsigned int pages_shm; unsigned int pages_mmap; unsigned int pages_swap; + + struct { + unsigned int min; + unsigned int max; + } pgq[NR_PAGE_QUEUE]; } swapstat_v1; typedef swapstat_v1 swapstat_t; extern swapstat_t swapstats; @@ -90,12 +95,53 @@ return n; } +#if 0 +static inline void pgq_remove ( struct page *page ) +{ + __label__ here; + if (!test_bit(PG_on_queue, &page->flags)) { /* debugging */ + printk(KERN_DEBUG "doh! pgq_remove[%p]: page %lu not on queue\n", &&here, page->map_nr); + return; + } +here: ; + clear_bit(PG_on_queue, &page->flags); + page->flags &= ~PGmask_pgq; /* FIXME!!! */ + atomic_dec(&page_queues_cnt[page->flags & PGmask_pgq]); + page->pgq_prev->pgq_next = page->pgq_next; + page->pgq_next->pgq_prev = page->pgq_prev; + page->pgq_prev = page->pgq_next = NULL; /* debugging */ +} + +static inline void pgq_insert ( struct page *page, int pgq ) +{ + __label__ here; + if (!test_bit(PG_on_queue, &page->flags)) { /* debugging */ + printk(KERN_DEBUG "doh! pgq_insert[%p]: page %lu already on queue %lu\n", &&here, page->map_nr, page->flags & PGmask_pgq); + pgq_remove(page); + } +here: ; + page->pgq_next = page_queues[pgq].pgq_next; + page_queues[pgq].pgq_next->pgq_prev = page; + page_queues[pgq].pgq_next = page; + page->pgq_prev = &page_queues[pgq]; + page->flags |= pgq; /* FIXME!!! */ + set_bit(PG_on_queue, &page->flags); +} +#endif + static inline void touch_page(struct page *page) { if (page->age < (MAX_PAGE_AGE - PAGE_ADVANCE)) page->age += PAGE_ADVANCE; else page->age = MAX_PAGE_AGE; +#if 0 + int pgq; + if (PgQ_Active < (pgq = page->flags & PGmask_pgq)) { + pgq_remove(page); + pgq_insert(page, PgQ_Active); + } +#endif } static inline void age_page(struct page *page) Index: linux-2.1.86-mm/ipc/shm.c diff -u linux-2.1.86-mm/ipc/shm.c:1.1.1.3 linux-2.1.86-mm/ipc/shm.c:1.3 --- linux-2.1.86-mm/ipc/shm.c:1.1.1.3 Mon Mar 2 22:27:21 1998 +++ linux-2.1.86-mm/ipc/shm.c Wed Mar 11 16:07:56 1998 @@ -564,6 +564,8 @@ shmd->vm_file = NULL; shmd->vm_offset = 0; shmd->vm_ops = &shm_vm_ops; + shmd->vm_prev_private = NULL; + shmd->vm_private_count = 0; shp->shm_nattch++; /* prevent destruction */ if ((err = shm_map (shmd))) { Index: linux-2.1.86-mm/kernel/acct.c diff -u linux-2.1.86-mm/kernel/acct.c:1.1.1.2 linux-2.1.86-mm/kernel/acct.c:1.2 --- linux-2.1.86-mm/kernel/acct.c:1.1.1.2 Mon Mar 2 22:23:12 1998 +++ linux-2.1.86-mm/kernel/acct.c Fri Mar 6 03:52:29 1998 @@ -307,7 +307,7 @@ ac.ac_rw = encode_comp_t(ac.ac_io / 1024); ac.ac_minflt = encode_comp_t(current->min_flt); ac.ac_majflt = encode_comp_t(current->maj_flt); - ac.ac_swaps = encode_comp_t(current->nswap); + ac.ac_swaps = encode_comp_t(current->mm->nswap); ac.ac_exitcode = exitcode; /* Index: linux-2.1.86-mm/kernel/exit.c diff -u linux-2.1.86-mm/kernel/exit.c:1.1.1.2 linux-2.1.86-mm/kernel/exit.c:1.2 --- linux-2.1.86-mm/kernel/exit.c:1.1.1.2 Mon Mar 2 22:23:06 1998 +++ linux-2.1.86-mm/kernel/exit.c Fri Mar 6 03:52:29 1998 @@ -49,7 +49,7 @@ release_thread(p); current->cmin_flt += p->min_flt + p->cmin_flt; current->cmaj_flt += p->maj_flt + p->cmaj_flt; - current->cnswap += p->nswap + p->cnswap; + current->cnswap += p->mm->nswap + p->cnswap; free_task_struct(p); } else { printk("task releasing itself\n"); Index: linux-2.1.86-mm/kernel/fork.c diff -u linux-2.1.86-mm/kernel/fork.c:1.1.1.2 linux-2.1.86-mm/kernel/fork.c:1.4 --- 
linux-2.1.86-mm/kernel/fork.c:1.1.1.2 Mon Mar 2 22:23:07 1998 +++ linux-2.1.86-mm/kernel/fork.c Thu Mar 12 16:01:39 1998 @@ -236,6 +236,12 @@ tmp->vm_pprev_share = &mpnt->vm_next_share; } + tmp->vm_private_count = 0; + tmp->vm_prev_private = mpnt; + tmp->vm_next_private = mpnt->vm_next_private; + tmp->vm_next_private->vm_prev_private = tmp; + mpnt->vm_next_private = tmp; + /* Copy the pages, but defer checking for errors */ retval = copy_page_range(mm, current->mm, tmp); if (!retval && tmp->vm_ops && tmp->vm_ops->open) @@ -321,11 +327,13 @@ tsk->mm = mm; tsk->min_flt = tsk->maj_flt = 0; tsk->cmin_flt = tsk->cmaj_flt = 0; - tsk->nswap = tsk->cnswap = 0; + tsk->cnswap = 0; retval = new_page_tables(tsk); if (retval) goto free_mm; + down(&mm->mmap_sem); retval = dup_mmap(mm); + up(&mm->mmap_sem); if (retval) goto free_pt; return 0; Index: linux-2.1.86-mm/kernel/sys.c diff -u linux-2.1.86-mm/kernel/sys.c:1.1.1.1 linux-2.1.86-mm/kernel/sys.c:1.2 --- linux-2.1.86-mm/kernel/sys.c:1.1.1.1 Wed Feb 18 23:35:39 1998 +++ linux-2.1.86-mm/kernel/sys.c Fri Mar 6 03:52:30 1998 @@ -831,6 +831,11 @@ * either stopped or zombied. In the zombied case the task won't get * reaped till shortly after the call to getrusage(), in both cases the * task being examined is in a frozen state so the counters won't change. + * + * FIXME: I broke the smp safeness when I moved nswap into the mm_struct, + * but we don't actually have a spinlock for the mm_struct right now, and + * I think that doing an up/down on the mm's semaphore would be a bit + * heavy.. Argh! -bcrl */ int getrusage(struct task_struct *p, int who, struct rusage *ru) { @@ -845,7 +850,7 @@ r.ru_stime.tv_usec = CT_TO_USECS(p->times.tms_stime); r.ru_minflt = p->min_flt; r.ru_majflt = p->maj_flt; - r.ru_nswap = p->nswap; + r.ru_nswap = p->mm->nswap; break; case RUSAGE_CHILDREN: r.ru_utime.tv_sec = CT_TO_SECS(p->times.tms_cutime); @@ -863,7 +868,7 @@ r.ru_stime.tv_usec = CT_TO_USECS(p->times.tms_stime + p->times.tms_cstime); r.ru_minflt = p->min_flt + p->cmin_flt; r.ru_majflt = p->maj_flt + p->cmaj_flt; - r.ru_nswap = p->nswap + p->cnswap; + r.ru_nswap = p->mm->nswap + p->cnswap; break; } return copy_to_user(ru, &r, sizeof(r)) ? 
-EFAULT : 0; Index: linux-2.1.86-mm/mm/filemap.c diff -u linux-2.1.86-mm/mm/filemap.c:1.1.1.3 linux-2.1.86-mm/mm/filemap.c:1.4 --- linux-2.1.86-mm/mm/filemap.c:1.1.1.3 Mon Mar 2 22:23:15 1998 +++ linux-2.1.86-mm/mm/filemap.c Wed Mar 11 20:01:44 1998 @@ -60,14 +60,14 @@ p = &inode->i_pages; while ((page = *p) != NULL) { if (PageLocked(page)) { - p = &page->next; + p = &page->u.normal.next; continue; } inode->i_nrpages--; - if ((*p = page->next) != NULL) - (*p)->prev = page->prev; - page->next = NULL; - page->prev = NULL; + if ((*p = page->u.normal.next) != NULL) + (*p)->u.normal.prev = page->u.normal.prev; + page->u.normal.next = NULL; + page->u.normal.prev = NULL; remove_page_from_hash_queue(page); page->inode = NULL; __free_page(page); @@ -96,16 +96,16 @@ goto repeat; } inode->i_nrpages--; - if ((*p = page->next) != NULL) - (*p)->prev = page->prev; - page->next = NULL; - page->prev = NULL; + if ((*p = page->u.normal.next) != NULL) + (*p)->u.normal.prev = page->u.normal.prev; + page->u.normal.next = NULL; + page->u.normal.prev = NULL; remove_page_from_hash_queue(page); page->inode = NULL; __free_page(page); continue; } - p = &page->next; + p = &page->u.normal.next; offset = start - offset; /* partial truncate, clear end of page */ if (offset < PAGE_SIZE) { @@ -169,6 +169,10 @@ delete_from_swap_cache(page); return 1; } + if (page->inode == &swapper_inode) { + printk("shrink_mmap: page->inode == swapper_inode!!!\n"); + break; + } remove_page_from_hash_queue(page); remove_page_from_inode_queue(page); __free_page(page); @@ -216,7 +220,9 @@ if (PageSwapCache(p)) panic ("Doing a normal page_unuse of a swap cache page"); remove_page_from_hash_queue(p); - remove_page_from_inode_queue(p); + /* only remove non-anonymous pages. Should actually have a bit in page->flags for this. -ben */ + if (p->inode && p->inode != &swapper_inode) + remove_page_from_inode_queue(p); free_page(page); return 1; } @@ -835,6 +841,9 @@ copy_page(new_page, old_page); flush_page_to_ram(new_page); release_page(page); + mem_map[MAP_NR(new_page)].u.private.vma = area; + mem_map[MAP_NR(new_page)].u.private.vm_offset = offset; + area->vm_private_count++; return new_page; no_cached_page: Index: linux-2.1.86-mm/mm/memory.c diff -u linux-2.1.86-mm/mm/memory.c:1.1.1.3 linux-2.1.86-mm/mm/memory.c:1.6 --- linux-2.1.86-mm/mm/memory.c:1.1.1.3 Fri Feb 27 01:49:41 1998 +++ linux-2.1.86-mm/mm/memory.c Thu Mar 12 16:01:47 1998 @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -293,15 +294,90 @@ return -ENOMEM; } +int is_vma_on_private_list(struct vm_area_struct *vma, struct vm_area_struct *head) +{ + struct vm_area_struct *node = head; + int i = 0; + while (node && node->vm_next_private != head && node != vma && ++i < 1000) + node = node->vm_next_private; + if (1000 == i) + printk("is_vma_on_private_list: private list destroyed. (%p, %p)\n", vma, head); + return node == vma; +} + +static inline pte_t *lookup_pte_p(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + pmd_t *pmd; + + pgd = pgd_offset(mm, addr); + if (pgd_present(*pgd)) { + pmd = pmd_offset(pgd, addr); + if (pmd_present(*pmd)) + return pte_offset(pmd, addr); + } + return NULL; +} + +/* FIXME: Make this function inline once everything is working. + * The debugging code here is far too clever. 
-ben + */ +void remove_vma_from_page(struct vm_area_struct *vma, struct page *page) +{ + if (page->inode && page->inode != &swapper_inode) + return; + if (!is_page_shared(page)) { + if (vma != page->u.private.vma) + printk("remove_vma_from_page: vma != page->vma (%p, %p, %lu)\n", vma, page->u.private.vma, + page->map_nr); + page->u.private.vma->vm_private_count--; + page->u.private.vma = NULL; + return; + } +#if 1 + if (!is_vma_on_private_list(vma, page->u.private.vma)) { + if (MAP_NR(ZERO_PAGE) != page->map_nr) + printk("remove_vma_from_page: vma not on page's private list! (%p, %p, %lu)\n", vma, + page->u.private.vma, page->map_nr); + } else +#endif + if (vma == page->u.private.vma) { + struct vm_area_struct *tmp; + unsigned long addr = PAGE_OFFSET + (page->map_nr << PAGE_SHIFT); /* should be macro */ + for (tmp=vma->vm_next_private; tmp != vma; tmp=tmp->vm_next_private) { + pte_t *pte = lookup_pte_p(tmp->vm_mm, page->u.private.vm_offset - tmp->vm_offset + tmp->vm_start); + if (pte && pte_present(*pte) && pte_page(*pte) == addr) + break; + } +#if 1 + if (tmp == vma) { + printk("remove_vma_from_page: shared and no eligable vmas (%p, %p, %lu, c=%d)\n", vma, page->u.private.vma, + page->map_nr, atomic_read(&page->count)); + for (addr=0,tmp=vma->vm_next_private; tmp != vma; addr++,tmp=tmp->vm_next_private) { + pte_t *pte = lookup_pte_p(tmp->vm_mm, page->u.private.vm_offset - tmp->vm_offset + tmp->vm_start); + if (pte && pte_present(*pte)) + printk("#%lu: %p / %08lx\n", addr, tmp, pte_page(*pte)); + } + tmp = NULL; + } else +#endif + tmp->vm_private_count++; + page->u.private.vma = tmp; + vma->vm_private_count--; + } +} + /* * Return indicates whether a page was freed so caller can adjust rss */ -static inline int free_pte(pte_t page) +static inline int free_pte(struct vm_area_struct *vma, pte_t entry) { - if (pte_present(page)) { - unsigned long addr = pte_page(page); - if (MAP_NR(addr) >= max_mapnr || PageReserved(mem_map+MAP_NR(addr))) + if (pte_present(entry)) { + unsigned long addr = pte_page(entry); + struct page *page = mem_map + MAP_NR(addr); + if (MAP_NR(addr) >= max_mapnr || PageReserved(page)) return 0; + remove_vma_from_page(vma, page); /* * free_page() used to be able to clear swap cache * entries. We may now have to do it manually. 
@@ -309,20 +385,21 @@ free_page_and_swap_cache(addr); return 1; } - swap_free(pte_val(page)); + swap_free(pte_val(entry)); return 0; } -static inline void forget_pte(pte_t page) +static inline void forget_pte(struct vm_area_struct *vma, pte_t page) { if (!pte_none(page)) { printk("forget_pte: old mapping existed!\n"); - free_pte(page); + free_pte(vma, page); } } -static inline int zap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size) +static inline int zap_pte_range(struct vm_area_struct *vma, pmd_t * pmd, unsigned long address, unsigned long size) { +/* unsigned long real_addr = address; */ pte_t * pte; int freed; @@ -349,12 +426,29 @@ if (pte_none(page)) continue; pte_clear(pte-1); - freed += free_pte(page); +#if 0 + if (real_addr >= vma->vm_end) + printk("zap_pte_range: crossing vmas!\n"); + if (pte_present(page) && (MAP_NR(pte_page(page)) < max_mapnr)) { + struct page *page_map = mem_map + MAP_NR(pte_page(page)); + static int last_jiffies; + if ((page_map->u.private.vm_offset != (real_addr - vma->vm_start + vma->vm_offset)) && + (jiffies - last_jiffies) > 5*HZ) { + last_jiffies = jiffies; + printk("zap_pte_range: page's vm_offset is wrong (is %08lx vs %08lx at %08lx on %08lx/%08lx)!!!\n", + page_map->u.private.vm_offset, real_addr - vma->vm_start + vma->vm_offset, + real_addr, vma->vm_start, vma->vm_offset); + } + } + real_addr += PAGE_SIZE; +#endif + freed += free_pte(vma, page); + address += PAGE_SIZE; } return freed; } -static inline int zap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size) +static inline int zap_pmd_range(struct vm_area_struct *vma, pgd_t * dir, unsigned long address, unsigned long size) { pmd_t * pmd; unsigned long end; @@ -374,7 +468,7 @@ end = PGDIR_SIZE; freed = 0; do { - freed += zap_pte_range(pmd, address, end - address); + freed += zap_pte_range(vma, pmd, address, end - address); address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address < end); @@ -384,29 +478,30 @@ /* * remove user pages in a given range. 
*/ -void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size) +void zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size) { pgd_t * dir; unsigned long end = address + size; int freed = 0; - dir = pgd_offset(mm, address); + dir = pgd_offset(vma->vm_mm, address); while (address < end) { - freed += zap_pmd_range(dir, address, end - address); + freed += zap_pmd_range(vma, dir, address, end - address); address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } /* * Update rss for the mm_struct (not necessarily current->mm) */ - if (mm->rss > 0) { - mm->rss -= freed; - if (mm->rss < 0) - mm->rss = 0; + if (vma->vm_mm->rss > 0) { + vma->vm_mm->rss -= freed; + if (vma->vm_mm->rss < 0) + vma->vm_mm->rss = 0; } } -static inline void zeromap_pte_range(pte_t * pte, unsigned long address, unsigned long size, pte_t zero_pte) +static inline void zeromap_pte_range(struct vm_area_struct *vma, pte_t * pte, unsigned long address, unsigned long size, + pte_t zero_pte) { unsigned long end; @@ -417,13 +512,14 @@ do { pte_t oldpage = *pte; set_pte(pte, zero_pte); - forget_pte(oldpage); + forget_pte(vma, oldpage); address += PAGE_SIZE; pte++; } while (address < end); } -static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, pte_t zero_pte) +static inline int zeromap_pmd_range(struct vm_area_struct *vma, pmd_t * pmd, unsigned long address, unsigned long size, + pte_t zero_pte) { unsigned long end; @@ -435,14 +531,14 @@ pte_t * pte = pte_alloc(pmd, address); if (!pte) return -ENOMEM; - zeromap_pte_range(pte, address, end - address, zero_pte); + zeromap_pte_range(vma, pte, address, end - address, zero_pte); address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address < end); return 0; } -int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot) +int zeromap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size, pgprot_t prot) { int error = 0; pgd_t * dir; @@ -451,20 +547,20 @@ pte_t zero_pte; zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot)); - dir = pgd_offset(current->mm, address); - flush_cache_range(current->mm, beg, end); + dir = pgd_offset(vma->vm_mm, address); + flush_cache_range(vma->vm_mm, beg, end); while (address < end) { pmd_t *pmd = pmd_alloc(dir, address); error = -ENOMEM; if (!pmd) break; - error = zeromap_pmd_range(pmd, address, end - address, zero_pte); + error = zeromap_pmd_range(vma, pmd, address, end - address, zero_pte); if (error) break; address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } - flush_tlb_range(current->mm, beg, end); + flush_tlb_range(vma->vm_mm, beg, end); return error; } @@ -490,7 +586,7 @@ mapnr = MAP_NR(__va(phys_addr)); if (mapnr >= max_mapnr || PageReserved(mem_map+mapnr)) set_pte(pte, mk_pte_phys(phys_addr, prot)); - forget_pte(oldpage); + forget_pte(NULL, oldpage); address += PAGE_SIZE; phys_addr += PAGE_SIZE; pte++; @@ -560,7 +656,7 @@ * This routine is used to map in a page into an address space: needed by * execve() for the initial stack and environment pages. 
*/ -unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address) +unsigned long put_dirty_page(struct task_struct * tsk, struct vm_area_struct *vma, unsigned long page, unsigned long address) { pgd_t * pgd; pmd_t * pmd; @@ -589,6 +685,9 @@ return 0; } flush_page_to_ram(page); + mem_map[MAP_NR(page)].u.private.vma = vma; + mem_map[MAP_NR(page)].u.private.vm_offset = address - vma->vm_start + vma->vm_offset; + vma->vm_private_count++; set_pte(pte, pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY)))); /* no need for flush_tlb */ return page; @@ -637,9 +736,15 @@ * Do we need to copy? */ if (is_page_shared(page_map)) { + remove_vma_from_page(vma, page_map); if (new_page) { - if (PageReserved(mem_map + MAP_NR(old_page))) + struct page *new_map; + if (PageReserved(page_map)) ++vma->vm_mm->rss; + new_map = mem_map + MAP_NR(new_page); + new_map->u.private.vma = vma; + new_map->u.private.vm_offset = (address & PAGE_MASK) - vma->vm_start + vma->vm_offset; + vma->vm_private_count++; copy_cow_page(old_page,new_page); flush_page_to_ram(old_page); flush_page_to_ram(new_page); @@ -658,6 +763,15 @@ } if (PageSwapCache(page_map)) delete_from_swap_cache(page_map); + + /* these assertions can be removed soon. -ben */ + if (vma != page_map->u.private.vma) + printk("do_wp_page(%08lx/%lu): page->vma != vma (%p vs %p)!!! (vm_flags = %04x)\n", + address, page_map->map_nr, page_map->u.private.vma, vma, vma->vm_flags); + if (page_map->u.private.vm_offset != ((address & PAGE_MASK) - vma->vm_start + vma->vm_offset)) + printk("do_wp_page: page's vm_offset is wrong (is %08lx vs %08lx at %08lx on %08lx/%08lx)!!!\n", + page_map->u.private.vm_offset, (address & PAGE_MASK) - vma->vm_start + vma->vm_offset, address, vma->vm_start, vma->vm_offset); + flush_cache_page(vma, address); set_pte(page_table, pte_mkdirty(pte_mkwrite(pte))); flush_tlb_page(vma, address); @@ -737,7 +851,7 @@ /* mapping wholly truncated? 
*/ if (mpnt->vm_offset >= offset) { flush_cache_range(mm, start, end); - zap_page_range(mm, start, len); + zap_page_range(mpnt, start, len); flush_tlb_range(mm, start, end); continue; } @@ -753,7 +867,7 @@ start = (start + ~PAGE_MASK) & PAGE_MASK; } flush_cache_range(mm, start, end); - zap_page_range(mm, start, len); + zap_page_range(mpnt, start, len); flush_tlb_range(mm, start, end); } while ((mpnt = mpnt->vm_next_share) != NULL); } @@ -766,7 +880,7 @@ pte_t page; if (!vma->vm_ops || !vma->vm_ops->swapin) { - swap_in(tsk, vma, page_table, pte_val(entry), write_access); + swap_in(tsk, vma, address - vma->vm_start + vma->vm_offset, page_table, pte_val(entry), write_access); flush_page_to_ram(pte_page(*page_table)); return; } @@ -840,8 +954,13 @@ entry = pte_wrprotect(mk_pte(ZERO_PAGE, vma->vm_page_prot)); if (write_access) { unsigned long page = __get_free_page(GFP_KERNEL); + struct page *map; if (!page) goto sigbus; + map = mem_map + MAP_NR(page); + map->u.private.vma = vma; + map->u.private.vm_offset = (address & PAGE_MASK) - vma->vm_start + vma->vm_offset; + vma->vm_private_count++; clear_page(page); entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); vma->vm_mm->rss++; @@ -884,8 +1003,18 @@ entry = pte_mkyoung(entry); set_pte(pte, entry); flush_tlb_page(vma, address); - if (!write_access) + if (!write_access) { +#if 0 + unsigned nr = MAP_NR(pte_page(entry)); + if (nr < max_mapnr) { + struct page *page = mem_map + nr; + if (!PageOnQueue(page)) + printk(KERN_ERR "handle_pte_fault: page %u not on queue!\n", nr); + touch_page(page); + } +#endif return; + } if (pte_write(entry)) { entry = pte_mkdirty(entry); set_pte(pte, entry); Index: linux-2.1.86-mm/mm/mlock.c diff -u linux-2.1.86-mm/mm/mlock.c:1.1.1.2 linux-2.1.86-mm/mm/mlock.c:1.3 --- linux-2.1.86-mm/mm/mlock.c:1.1.1.2 Mon Mar 2 22:23:16 1998 +++ linux-2.1.86-mm/mm/mlock.c Thu Mar 12 16:01:47 1998 @@ -38,10 +38,13 @@ n->vm_end = end; vma->vm_offset += vma->vm_start - n->vm_start; n->vm_flags = newflags; + n->vm_private_count = 0; if (n->vm_file) n->vm_file->f_count++; if (n->vm_ops && n->vm_ops->open) n->vm_ops->open(n); + n->vm_prev_private = (vma != vma->vm_prev_private) ? vma : NULL; + change_page_vmas_on_split(vma, n); insert_vm_struct(current->mm, n); return 0; } @@ -59,10 +62,13 @@ n->vm_start = start; n->vm_offset += n->vm_start - vma->vm_start; n->vm_flags = newflags; + n->vm_private_count = 0; if (n->vm_file) n->vm_file->f_count++; if (n->vm_ops && n->vm_ops->open) n->vm_ops->open(n); + n->vm_prev_private = (vma != vma->vm_prev_private) ? vma : NULL; + change_page_vmas_on_split(vma, n); insert_vm_struct(current->mm, n); return 0; } @@ -89,6 +95,8 @@ vma->vm_offset += vma->vm_start - left->vm_start; right->vm_offset += right->vm_start - left->vm_start; vma->vm_flags = newflags; + left->vm_private_count = 0; + right->vm_private_count = 0; if (vma->vm_file) vma->vm_file->f_count += 2; @@ -96,6 +104,9 @@ vma->vm_ops->open(left); vma->vm_ops->open(right); } + left->vm_prev_private = right->vm_prev_private = (vma != vma->vm_prev_private) ? vma : NULL; + change_page_vmas_on_split(vma, left); + change_page_vmas_on_split(vma, right); insert_vm_struct(current->mm, left); insert_vm_struct(current->mm, right); return 0; Index: linux-2.1.86-mm/mm/mmap.c diff -u linux-2.1.86-mm/mm/mmap.c:1.1.1.2 linux-2.1.86-mm/mm/mmap.c:1.6 --- linux-2.1.86-mm/mm/mmap.c:1.1.1.2 Mon Mar 2 22:23:14 1998 +++ linux-2.1.86-mm/mm/mmap.c Thu Mar 12 16:01:48 1998 @@ -73,7 +73,7 @@ } /* Remove one vm structure from the inode's i_mmap ring. 
*/ -static inline void remove_shared_vm_struct(struct vm_area_struct *vma) +void remove_shared_vm_struct(struct vm_area_struct *vma) { struct file * file = vma->vm_file; @@ -84,6 +84,23 @@ vma->vm_next_share->vm_pprev_share = vma->vm_pprev_share; *vma->vm_pprev_share = vma->vm_next_share; } + + /* unconditionally remove the vma from its private ring */ + vma->vm_prev_private->vm_next_private = vma->vm_next_private; + vma->vm_next_private->vm_prev_private = vma->vm_prev_private; + + vma->vm_prev_private = vma->vm_next_private = NULL; + + if (vma->vm_private_count) { + static int last_jiffies; + /*if ((jiffies - last_jiffies) > 5*HZ)*/{ + __label__ here; +here: + last_jiffies = jiffies; + printk("remove_shared_vm_struct(%p): vma still pointed to by %d pages - prepare for meltdown...\n", + &&here, vma->vm_private_count); + } + } } asmlinkage unsigned long sys_brk(unsigned long brk) @@ -257,13 +274,20 @@ if (!(file->f_mode & 2)) vma->vm_flags &= ~(VM_MAYWRITE | VM_SHARED); } - } else + } else { vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + /* By giving private mappings an initial offset of their virtual address, + * we're able to merge remapped vmas later on. -ben + */ + off = addr; + } vma->vm_page_prot = protection_map[vma->vm_flags & 0x0f]; vma->vm_ops = NULL; vma->vm_offset = off; vma->vm_file = NULL; vma->vm_pte = 0; + vma->vm_prev_private = NULL; + vma->vm_private_count = 0; /* Clear old maps */ error = -ENOMEM; @@ -298,8 +322,8 @@ } if (!error) error = file->f_op->mmap(file, vma); - } + /* Fix up the count if necessary, then check for an error */ if (correct_wcount) file->f_dentry->d_inode->i_writecount++; @@ -358,6 +382,55 @@ } } +/* Change each page previously owned by `from' to `to'. + * Note that we're only concerned with the area bounded by `to'. + */ +void change_page_vmas_on_split(struct vm_area_struct *from, struct vm_area_struct *to) +{ + unsigned long addr = to->vm_start; + pgd_t *pgd = pgd_offset(to->vm_mm, addr); + pmd_t *pmd; + pte_t *pte; + + for (;;) { + if (!pgd_present(*pgd)) + goto cont_pgd; + + pmd = pmd_offset(pgd, addr); + do { + if (!pmd_present(*pmd)) + goto cont_pmd; + + pte = pte_offset(pmd, addr); + do { + if (pte_present(*pte)) { + struct page *page = mem_map + MAP_NR(pte_page(*pte)); + + /* note that we skip verifying this is an anon page, but that's + * okay as u.private.vma overlaps another pointer, which can't + * point to a vma. The check of !PageReserved could even go. -ben + */ + if (MAP_NR(pte_page(*pte)) < max_mapnr && !PageReserved(page) && + (page->u.private.vma == from)) { + page->u.private.vma = to; + to->vm_private_count++; + from->vm_private_count--; + } + } + + pte ++; + addr += PAGE_SIZE; + if (addr >= to->vm_end) + return; + } while (addr & (PMD_SIZE - 1)) ; +cont_pmd: + pmd++; + } while (addr & (PGDIR_SIZE - 1)) ; +cont_pgd: + pgd++; + } +} + /* Normal function to fix up a mapping * This function is the default for when an area has no specific * function. This may be used as part of a more specific routine. @@ -393,6 +466,7 @@ /* Unmapping the whole area. */ if (addr == area->vm_start && end == area->vm_end) { + remove_shared_vm_struct(area); if (area->vm_ops && area->vm_ops->close) area->vm_ops->close(area); if (area->vm_file) @@ -424,10 +498,29 @@ mpnt->vm_file->f_count++; if (mpnt->vm_ops && mpnt->vm_ops->open) mpnt->vm_ops->open(mpnt); + mpnt->vm_prev_private = (area->vm_prev_private == area) ? 
NULL : area->vm_prev_private; + mpnt->vm_private_count = 0; + change_page_vmas_on_split(area, mpnt); /* must be done before vm_end is changed */ + remove_shared_vm_struct(area); + area->vm_prev_private = mpnt->vm_prev_private; area->vm_end = addr; /* Truncate area */ insert_vm_struct(current->mm, mpnt); + + goto no_remove; } + /* NOTE: if open/close blocks, Bad Things will happen to the vm_private list. -ben */ + mpnt = area->vm_prev_private; + /* ignore the ugly trick-the-debug-code hack here. This is valid. -ben */ + { unsigned long count = area->vm_private_count; area->vm_private_count = 0; +#if 0 + if (area->vm_private_count != 0) + printk("unmap_fixup(%p): vma still pointed to by %d pages...\n", area, area->vm_private_count); +#endif + remove_shared_vm_struct(area); + area->vm_prev_private = (mpnt == area) ? NULL : mpnt; + area->vm_private_count = count; } +no_remove: /* Close the current area ... */ if (area->vm_ops && area->vm_ops->close) { end = area->vm_end; /* save new end */ @@ -525,7 +618,6 @@ freed = 1; mm->map_count--; - remove_shared_vm_struct(mpnt); st = addr < mpnt->vm_start ? mpnt->vm_start : addr; end = addr+len; @@ -536,7 +628,7 @@ mpnt->vm_ops->unmap(mpnt, st, size); flush_cache_range(mm, st, end); - zap_page_range(mm, st, size); + zap_page_range(mpnt, st, size); flush_tlb_range(mm, st, end); /* @@ -578,8 +670,8 @@ mpnt->vm_ops->close(mpnt); } mm->map_count--; + zap_page_range(mpnt, start, size); remove_shared_vm_struct(mpnt); - zap_page_range(mm, start, size); if (mpnt->vm_file) fput(mpnt->vm_file); kmem_cache_free(vm_area_cachep, mpnt); @@ -593,6 +685,7 @@ /* Insert vm structure into process list sorted by address * and into the inode's i_mmap ring. + * Also, insert this mapping into the vm's private ring if applicable. */ void insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vmp) { @@ -623,6 +716,18 @@ inode->i_mmap = vmp; vmp->vm_pprev_share = &inode->i_mmap; } + + /* Note: only use prev to link into the private ring, as multiple vma's might be + * copies of the original. -ben + */ + if (vmp->vm_prev_private) { + vmp->vm_next_private = vmp->vm_prev_private->vm_next_private; + vmp->vm_prev_private->vm_next_private = vmp; + vmp->vm_next_private->vm_prev_private = vmp; + } else { + vmp->vm_next_private = vmp; + vmp->vm_prev_private = vmp; + } } /* Merge the list of memory segments if possible. @@ -633,6 +738,11 @@ */ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr) { +#if 0 + /* FIXME: with the vm_private list stuff, this becomes a bit heavy, and is + * severely broken right now, as I haven't written the code to walk the page tables + * looking for and adjusting page->u.private.vma == vma. + */ struct vm_area_struct *prev, *mpnt, *next; down(&mm->mmap_sem); @@ -702,6 +812,7 @@ mm->mmap_cache = NULL; /* Kill the cache. */ no_vma: up(&mm->mmap_sem); +#endif } __initfunc(void vma_init(void)) Index: linux-2.1.86-mm/mm/mprotect.c diff -u linux-2.1.86-mm/mm/mprotect.c:1.1.1.2 linux-2.1.86-mm/mm/mprotect.c:1.3 --- linux-2.1.86-mm/mm/mprotect.c:1.1.1.2 Mon Mar 2 22:23:14 1998 +++ linux-2.1.86-mm/mm/mprotect.c Thu Mar 12 16:01:48 1998 @@ -110,10 +110,13 @@ vma->vm_offset += vma->vm_start - n->vm_start; n->vm_flags = newflags; n->vm_page_prot = prot; + n->vm_private_count = 0; if (n->vm_file) n->vm_file->f_count++; if (n->vm_ops && n->vm_ops->open) n->vm_ops->open(n); + n->vm_prev_private = (vma != vma->vm_prev_private) ? 
vma : NULL; + change_page_vmas_on_split(vma, n); insert_vm_struct(current->mm, n); return 0; } @@ -133,10 +136,13 @@ n->vm_offset += n->vm_start - vma->vm_start; n->vm_flags = newflags; n->vm_page_prot = prot; + n->vm_private_count = 0; if (n->vm_file) n->vm_file->f_count++; if (n->vm_ops && n->vm_ops->open) n->vm_ops->open(n); + n->vm_prev_private = (vma != vma->vm_prev_private) ? vma : NULL; + change_page_vmas_on_split(vma, n); insert_vm_struct(current->mm, n); return 0; } @@ -165,12 +171,17 @@ right->vm_offset += right->vm_start - left->vm_start; vma->vm_flags = newflags; vma->vm_page_prot = prot; + left->vm_private_count = 0; + right->vm_private_count = 0; if (vma->vm_file) vma->vm_file->f_count += 2; if (vma->vm_ops && vma->vm_ops->open) { vma->vm_ops->open(left); vma->vm_ops->open(right); } + left->vm_prev_private = right->vm_prev_private = (vma != vma->vm_prev_private) ? vma : NULL; + change_page_vmas_on_split(vma, left); + change_page_vmas_on_split(vma, right); insert_vm_struct(current->mm, left); insert_vm_struct(current->mm, right); return 0; Index: linux-2.1.86-mm/mm/mremap.c diff -u linux-2.1.86-mm/mm/mremap.c:1.1.1.2 linux-2.1.86-mm/mm/mremap.c:1.5 --- linux-2.1.86-mm/mm/mremap.c:1.1.1.2 Mon Mar 2 22:23:17 1998 +++ linux-2.1.86-mm/mm/mremap.c Thu Mar 12 16:01:48 1998 @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -63,7 +64,7 @@ return pte; } -static inline int copy_one_pte(pte_t * src, pte_t * dst) +static inline int move_one_pte(struct vm_area_struct *from, struct vm_area_struct *to, pte_t * src, pte_t * dst) { int error = 0; pte_t pte = *src; @@ -73,26 +74,32 @@ if (dst) { pte_clear(src); set_pte(dst, pte); + if (pte_present(pte)) { + struct page *page = mem_map + MAP_NR(pte_page(pte)); + if ((pte_page(pte) < max_mapnr) && (page->u.private.vma == from)) + page->u.private.vma = to; + } error--; } } return error; } -static int move_one_page(struct mm_struct *mm, unsigned long old_addr, unsigned long new_addr) +static int move_one_page(struct vm_area_struct *from, struct vm_area_struct *to, unsigned long old_addr, unsigned long new_addr) { int error = 0; pte_t * src; - src = get_one_pte(mm, old_addr); + src = get_one_pte(from->vm_mm, old_addr); if (src) - error = copy_one_pte(src, alloc_one_pte(mm, new_addr)); + error = move_one_pte(from, to, src, alloc_one_pte(from->vm_mm, new_addr)); return error; } -static int move_page_tables(struct mm_struct * mm, +static int move_page_tables(struct vm_area_struct *to, struct vm_area_struct *from, unsigned long new_addr, unsigned long old_addr, unsigned long len) { + struct mm_struct *mm = from->vm_mm; unsigned long offset = len; flush_cache_range(mm, old_addr, old_addr + len); @@ -105,7 +112,7 @@ */ while (offset) { offset -= PAGE_SIZE; - if (move_one_page(mm, old_addr + offset, new_addr + offset)) + if (move_one_page(from, to, old_addr + offset, new_addr + offset)) goto oops_we_failed; } return 0; @@ -120,8 +127,8 @@ oops_we_failed: flush_cache_range(mm, new_addr, new_addr + len); while ((offset += PAGE_SIZE) < len) - move_one_page(mm, new_addr + offset, old_addr + offset); - zap_page_range(mm, new_addr, new_addr + len); + move_one_page(to, from, new_addr + offset, old_addr + offset); + zap_page_range(from, new_addr, new_addr + len); flush_tlb_range(mm, new_addr, new_addr + len); return -1; } @@ -135,22 +142,34 @@ if (new_vma) { unsigned long new_addr = get_unmapped_area(addr, new_len); - if (new_addr && !move_page_tables(current->mm, new_addr, addr, old_len)) { - *new_vma = *vma; - new_vma->vm_start = 
new_addr; - new_vma->vm_end = new_addr+new_len; - new_vma->vm_offset = vma->vm_offset + (addr - vma->vm_start); - new_vma->vm_file = vma->vm_file; - if (new_vma->vm_file) - new_vma->vm_file->f_count++; - if (new_vma->vm_ops && new_vma->vm_ops->open) - new_vma->vm_ops->open(new_vma); - insert_vm_struct(current->mm, new_vma); + /* We have to initialize new_vma before move_page_tables so that another task + * that looks at a page's vma sees a consistent vma. You see, move_page_tables + * might sleep... -ben + */ + *new_vma = *vma; + new_vma->vm_start = new_addr; + new_vma->vm_end = new_addr+new_len; + new_vma->vm_offset = vma->vm_offset + (addr - vma->vm_start); + new_vma->vm_file = vma->vm_file; + new_vma->vm_private_count = 0; + new_vma->vm_prev_private = (vma->vm_next_private == vma) ? NULL : vma->vm_prev_private; + if (new_vma->vm_file) + new_vma->vm_file->f_count++; + if (new_vma->vm_ops && new_vma->vm_ops->open) + new_vma->vm_ops->open(new_vma); + insert_vm_struct(current->mm, new_vma); + + if (new_addr && !move_page_tables(new_vma, vma, new_addr, addr, old_len)) { merge_segments(current->mm, new_vma->vm_start, new_vma->vm_end); do_munmap(addr, old_len); current->mm->total_vm += new_len >> PAGE_SHIFT; return new_addr; } + remove_shared_vm_struct(new_vma); + if (new_vma->vm_ops && new_vma->vm_ops->close) + new_vma->vm_ops->close(new_vma); + if (new_vma->vm_file) + fput(new_vma->vm_file); kmem_cache_free(vm_area_cachep, new_vma); } return -ENOMEM; Index: linux-2.1.86-mm/mm/page_alloc.c diff -u linux-2.1.86-mm/mm/page_alloc.c:1.1.1.2 linux-2.1.86-mm/mm/page_alloc.c:1.3 --- linux-2.1.86-mm/mm/page_alloc.c:1.1.1.2 Fri Feb 27 01:49:41 1998 +++ linux-2.1.86-mm/mm/page_alloc.c Wed Mar 11 20:01:45 1998 @@ -67,18 +67,18 @@ { struct page * next = head->next; - entry->prev = memory_head(head); - entry->next = next; - next->prev = entry; + entry->u.normal.prev = memory_head(head); + entry->u.normal.next = next; + next->u.normal.prev = entry; head->next = entry; } static inline void remove_mem_queue(struct page * entry) { - struct page * next = entry->next; - struct page * prev = entry->prev; - next->prev = prev; - prev->next = next; + struct page * next = entry->u.normal.next; + struct page * prev = entry->u.normal.prev; + next->u.normal.prev = prev; + prev->u.normal.next = next; } /* @@ -121,7 +121,7 @@ struct free_area_struct * last = free_area + NR_MEM_LISTS - 1; spin_lock_irqsave(&page_alloc_lock, flags); - retval = (last->next != memory_head(last)) && (last->next->next != memory_head(last)); + retval = (last->next != memory_head(last)) && (last->next->u.normal.next != memory_head(last)); spin_unlock_irqrestore(&page_alloc_lock, flags); return retval; } @@ -160,6 +160,10 @@ if (!PageReserved(page) && atomic_dec_and_test(&page->count)) { if (PageSwapCache(page)) panic ("Freeing swap cache page"); + if ((!page->inode || page->inode == &swapper_inode) && page->u.private.vma) { /* can be removed later. -ben */ + printk("page still has vma set! 
Forcing Oops...\n"); + *(char *)0 = 0; + } free_pages_ok(page->map_nr, 0); } if (PageSwapCache(page) && atomic_read(&page->count) == 1) @@ -196,13 +200,13 @@ #define RMQUEUE(order, maxorder, dma) \ do { struct free_area_struct * area = free_area+order; \ unsigned long new_order = order; \ - do { struct page *prev = memory_head(area), *ret = prev->next; \ + do { struct page *prev = memory_head(area), *ret = prev->u.normal.next; \ while (memory_head(area) != ret) { \ - if (new_order >= maxorder && ret->next == prev) \ + if (new_order >= maxorder && ret->u.normal.next == prev) \ break; \ if (!dma || CAN_DMA(ret)) { \ unsigned long map_nr = ret->map_nr; \ - (prev->next = ret->next)->prev = prev; \ + (prev->u.normal.next = ret->u.normal.next)->u.normal.prev = prev; \ MARK_USED(map_nr, new_order, area); \ nr_free_pages -= 1 << order; \ EXPAND(ret, map_nr, order, new_order, area); \ @@ -210,7 +214,7 @@ return ADDRESS(map_nr); \ } \ prev = ret; \ - ret = ret->next; \ + ret = ret->u.normal.next; \ } \ new_order++; area++; \ } while (new_order < NR_MEM_LISTS); \ @@ -227,6 +231,7 @@ } \ atomic_set(&map->count, 1); \ map->age = PAGE_INITIAL_AGE; \ + map->u.normal.next = map->u.normal.prev = NULL; \ } while (0) unsigned long __get_free_pages(int gfp_mask, unsigned long order) @@ -280,7 +285,7 @@ for (order=0 ; order < NR_MEM_LISTS; order++) { struct page * tmp; unsigned long nr = 0; - for (tmp = free_area[order].next ; tmp != memory_head(free_area+order) ; tmp = tmp->next) { + for (tmp = free_area[order].next ; tmp != memory_head(free_area+order) ; tmp = tmp->u.normal.next) { nr ++; } total += nr * ((PAGE_SIZE>>10) << order); @@ -350,7 +355,7 @@ * Also, don't bother to add to the swap cache if this page-in * was due to a write access. */ -void swap_in(struct task_struct * tsk, struct vm_area_struct * vma, +void swap_in(struct task_struct * tsk, struct vm_area_struct * vma, unsigned long offset, pte_t * page_table, unsigned long entry, int write_access) { unsigned long page; @@ -374,6 +379,17 @@ vma->vm_mm->rss++; tsk->min_flt++; swap_free(entry); + + /* if not already, link the page to a vma */ + if (!page_map->u.private.vma) { + vma->vm_private_count++; + page_map->u.private.vma = vma; + page_map->u.private.vm_offset = offset & PAGE_MASK; + } +#if 1 + else if (!is_vma_on_private_list(vma, page_map->u.private.vma)) + printk("swap_in: vma not on page's private list! (%p, %p, %lu)\n", vma, page_map->u.private.vma, page_map->map_nr); +#endif if (!write_access || is_page_shared(page_map)) { set_pte(page_table, mk_pte(page, vma->vm_page_prot)); Index: linux-2.1.86-mm/mm/slab.c diff -u linux-2.1.86-mm/mm/slab.c:1.1.1.1 linux-2.1.86-mm/mm/slab.c:1.3 --- linux-2.1.86-mm/mm/slab.c:1.1.1.1 Wed Feb 18 23:35:39 1998 +++ linux-2.1.86-mm/mm/slab.c Wed Mar 11 20:01:45 1998 @@ -320,10 +320,10 @@ * slab an obj belongs to. With kmalloc(), and kfree(), these are used * to find the cache which an obj belongs to. */ -#define SLAB_SET_PAGE_CACHE(pg, x) ((pg)->next = (struct page *)(x)) -#define SLAB_GET_PAGE_CACHE(pg) ((kmem_cache_t *)(pg)->next) -#define SLAB_SET_PAGE_SLAB(pg, x) ((pg)->prev = (struct page *)(x)) -#define SLAB_GET_PAGE_SLAB(pg) ((kmem_slab_t *)(pg)->prev) +#define SLAB_SET_PAGE_CACHE(pg, x) ((pg)->u.normal.next = (struct page *)(x)) +#define SLAB_GET_PAGE_CACHE(pg) ((kmem_cache_t *)(pg)->u.normal.next) +#define SLAB_SET_PAGE_SLAB(pg, x) ((pg)->u.normal.prev = (struct page *)(x)) +#define SLAB_GET_PAGE_SLAB(pg) ((kmem_slab_t *)(pg)->u.normal.prev) /* Size description struct for general-caches. 
typedef struct cache_sizes { @@ -1271,6 +1271,7 @@ SLAB_STATS_INC_ERR(cachep); /* this is atomic */ printk(KERN_ERR "kmem_alloc: %s (name=%s)\n", str, cachep ? cachep->c_name : "unknown"); + *(char *)0 = 0; } static void Index: linux-2.1.86-mm/mm/swap_state.c diff -u linux-2.1.86-mm/mm/swap_state.c:1.1.1.2 linux-2.1.86-mm/mm/swap_state.c:1.3 --- linux-2.1.86-mm/mm/swap_state.c:1.1.1.2 Fri Feb 27 01:49:42 1998 +++ linux-2.1.86-mm/mm/swap_state.c Wed Mar 11 20:01:45 1998 @@ -77,7 +77,7 @@ page->inode = &swapper_inode; page->offset = entry; add_page_to_hash_queue(page, &swapper_inode, entry); - add_page_to_inode_queue(&swapper_inode, page); + /*add_page_to_inode_queue(&swapper_inode, page); * No more... -ben */ #ifdef SWAP_CACHE_INFO swap_cache_add_success++; #endif @@ -160,7 +160,10 @@ page_address(page), atomic_read(&page->count)); #endif remove_page_from_hash_queue (page); +#if 0 /* No more, inode queue links are overloaded with page->u.private.vma/vm_offset. -ben */ remove_page_from_inode_queue (page); +#endif + page->inode = NULL; PageClearSwapCache (page); __free_page (page); } Index: linux-2.1.86-mm/mm/vmscan.c diff -u linux-2.1.86-mm/mm/vmscan.c:1.1.1.2 linux-2.1.86-mm/mm/vmscan.c:1.3 --- linux-2.1.86-mm/mm/vmscan.c:1.1.1.2 Fri Feb 27 01:49:41 1998 +++ linux-2.1.86-mm/mm/vmscan.c Wed Mar 11 20:01:45 1998 @@ -156,8 +156,9 @@ } vma->vm_mm->rss--; - tsk->nswap++; + tsk->mm->nswap++; flush_cache_page(vma, address); + remove_vma_from_page(vma, page_map); set_pte(page_table, __pte(entry)); flush_tlb_page(vma, address); swap_duplicate(entry); @@ -203,6 +204,7 @@ if ((entry = in_swap_cache(page_map))) { vma->vm_mm->rss--; flush_cache_page(vma, address); + remove_vma_from_page(vma, page_map); set_pte(page_table, __pte(entry)); flush_tlb_page(vma, address); swap_duplicate(entry); @@ -220,6 +222,7 @@ } vma->vm_mm->rss--; flush_cache_page(vma, address); + remove_vma_from_page(vma, page_map); pte_clear(page_table); flush_tlb_page(vma, address); entry = page_unuse(page);
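The bookkeeping that most of the hunks above repeat — vm_prev_private, vm_next_private and vm_private_count — keeps every anonymous mapping on a circular ring of fork()-related vmas, so that a page's u.private.vma back-pointer can be handed off (via remove_vma_from_page) to a relative that still maps the page instead of dangling when a vma is unmapped or split. Below is a minimal standalone sketch of that ring maintenance; it reuses the patch's field names but is written for illustration, not lifted from the patch, and the helper names link_private/unlink_private are invented here:

/* Sketch of the "private" vma ring, assuming the field names from the patch. */
#include <stddef.h>

struct vm_area_struct {
	struct vm_area_struct *vm_next_private;
	struct vm_area_struct *vm_prev_private;
	unsigned long vm_private_count;	/* pages whose u.private.vma points at this vma */
};

/* Splice vmp into the ring of an existing relative (what fork()'s dup_mmap and
 * insert_vm_struct do when vm_prev_private was seeded by the caller), or make
 * it a ring of one when it has no relatives. */
static void link_private(struct vm_area_struct *vmp, struct vm_area_struct *rel)
{
	if (rel) {
		vmp->vm_prev_private = rel;
		vmp->vm_next_private = rel->vm_next_private;
		rel->vm_next_private->vm_prev_private = vmp;
		rel->vm_next_private = vmp;
	} else {
		vmp->vm_next_private = vmp;
		vmp->vm_prev_private = vmp;
	}
}

/* Unconditionally drop vmp from its ring (what remove_shared_vm_struct does);
 * the patch warns if vm_private_count is still non-zero at this point. */
static void unlink_private(struct vm_area_struct *vmp)
{
	vmp->vm_prev_private->vm_next_private = vmp->vm_next_private;
	vmp->vm_next_private->vm_prev_private = vmp->vm_prev_private;
	vmp->vm_next_private = vmp->vm_prev_private = NULL;
}

The point of the ring is that the swapper can reach a pte for an anonymous page through page->u.private.vma alone; when that vma goes away, walking its ring of fork() relatives finds another vma whose page tables still reference the page, without scanning every process.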