Index: linux/linux/Documentation/filesystems/vfs.txt
diff -u linux/linux/Documentation/filesystems/vfs.txt:1.1.1.1 linux/linux/Documentation/filesystems/vfs.txt:1.2
--- linux/linux/Documentation/filesystems/vfs.txt:1.1.1.1	Fri Jul 25 18:33:45 1997
+++ linux/linux/Documentation/filesystems/vfs.txt	Sun Jul 27 05:52:28 1997
@@ -39,10 +39,14 @@
   Name points to a string that the system will know the filesystem by.
 
-  int requires_dev;
+  int fs_flags;
 
-  Set this flag to 1 if the filesystem requires a block device to be mounted
-  on.
+  A bitwise OR'd collection of flags from include/linux/fs.h:
+
+	FS_REQUIRES_DEV
+	FS_NO_DCACHE
+	FS_NO_PRELIM
+	FS_IBASKET
 
   struct file_system_type * next;
 
@@ -56,7 +60,7 @@
   super_block structure.
 
   void (*read_inode) (struct inode *inode);
-  [optional - doesn't quite make sense]
+  [not optional]
 
   read_inode is called by the VFS when iget is called requesting an inode
   not already present in the inode table. i_ino is set to the number of the
   inode requested.
@@ -65,18 +69,24 @@
   structure. Typically filesystems have separate inode_operations for
   directories, files and symlinks. i_op can be NULL.
 
-  int (*notify_change) (struct inode *, struct iattr *);
+  int (*notify_change) (struct inode *inode, struct iattr *iattr);
   [optional]
 
+  Called by the VFS when the given attributes of the inode have changed.
+
   void (*write_inode) (struct inode *);
   [optional]
 
+  Called by the VFS to write the inode to its backing store - typically
+  when syncing an inode.
+
   int (*put_inode) (struct inode *inode);
+  [optional] [not serialized]
+  Called by the VFS whenever an iput() is done on an inode.
+
+  int (*delete_inode) (struct inode *inode);
   [optional]
-  put_inode is called by the VFS when the last instance of inode is released
-  with a call to iput. The only special consideration that should be made
-  is that iget may reuse inode without calling read_inode unless clear_inode
-  is called. put_inode MUST return 1 if it called clear_inode on the inode,
-  otherwise zero.
+  delete_inode is called by the VFS when the last instance of inode is
+  released and inode has been deleted. This is where the filesystem should
+  release any resources held by the inode.
 
   void (*put_super) (struct super_block *);
   [optional]
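The put_inode/delete_inode split is the behavioural change to note here: put_inode now fires on every iput() and is not serialized, while delete_inode fires exactly once, when the last reference to an already-deleted inode goes away. A rough standalone sketch of how a filesystem might divide the work under the new rules - the myfs_* names and the forward-declared types are stand-ins for illustration, not the real 1997 declarations:

    struct inode;		/* stand-ins for the kernel types, used as pointers only */
    struct iattr;

    /* [not optional] fill *inode in from backing store; i_ino is already set */
    void myfs_read_inode(struct inode *inode) { (void)inode; }

    /* [not serialized] runs on every iput(): keep it cheap and idempotent -
     * freeing on-disk resources no longer belongs here */
    void myfs_put_inode(struct inode *inode) { (void)inode; }

    /* runs once, when the last reference to a deleted inode is dropped:
     * now the one place to release everything the inode held */
    void myfs_delete_inode(struct inode *inode) { (void)inode; }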
Index: linux/linux/arch/i386/kernel/head.S
diff -u linux/linux/arch/i386/kernel/head.S:1.1.1.2 linux/linux/arch/i386/kernel/head.S:1.2
--- linux/linux/arch/i386/kernel/head.S:1.1.1.2	Fri Jul 25 18:32:46 1997
+++ linux/linux/arch/i386/kernel/head.S	Tue Jul 29 19:13:54 1997
@@ -494,15 +494,23 @@
 .long 0x3f8007,0x3f9007,0x3fa007,0x3fb007,0x3fc007,0x3fd007,0x3fe007,0x3ff007
 
 .org 0x3000
-ENTRY(empty_bad_page)
+ENTRY(pg0_links)
+	.fill 1024,4,0
 
 .org 0x4000
-ENTRY(empty_bad_page_table)
+ENTRY(empty_bad_page)
 
 .org 0x5000
 ENTRY(empty_zero_page)
 
 .org 0x6000
+ENTRY(empty_bad_page_table)
+
+.org 0x6000
+ENTRY(empty_bad_page_table_links)
+	.fill 1024,4,0
+
+.org 0x8000
 ENTRY(this_must_match_init_task)
 
 /*
Index: linux/linux/fs/buffer.c
diff -u linux/linux/fs/buffer.c:1.1.1.5 linux/linux/fs/buffer.c:1.2
--- linux/linux/fs/buffer.c:1.1.1.5	Fri Jul 25 18:28:57 1997
+++ linux/linux/fs/buffer.c	Mon Jul 28 23:32:14 1997
@@ -1147,7 +1147,7 @@
 	if (test_and_clear_bit(PG_decr_after, &page->flags))
 		atomic_dec(&nr_async_pages);
 	if (test_and_clear_bit(PG_swap_unlock_after, &page->flags))
-		swap_after_unlock_page(page->pg_swap_entry);
+		swap_after_unlock_page(page->u.d.pg_swap_entry);
 	if (test_and_clear_bit(PG_free_after, &page->flags))
 		__free_page(page);
 }
Index: linux/linux/fs/nfs/write.c
diff -u linux/linux/fs/nfs/write.c:1.1.1.1 linux/linux/fs/nfs/write.c:1.2
--- linux/linux/fs/nfs/write.c:1.1.1.1	Fri Jul 25 18:29:04 1997
+++ linux/linux/fs/nfs/write.c	Mon Jul 28 23:32:20 1997
@@ -133,7 +133,7 @@
 	if (test_and_clear_bit(PG_decr_after, &page->flags))
 		atomic_dec(&page->count);
 	if (test_and_clear_bit(PG_swap_unlock_after, &page->flags))
-		swap_after_unlock_page(page->pg_swap_entry);
+		swap_after_unlock_page(page->u.d.pg_swap_entry);
 #endif
 }
Index: linux/linux/fs/proc/array.c
diff -u linux/linux/fs/proc/array.c:1.1.1.3 linux/linux/fs/proc/array.c:1.2
--- linux/linux/fs/proc/array.c:1.1.1.3	Fri Jul 25 18:29:01 1997
+++ linux/linux/fs/proc/array.c	Wed Jul 30 22:41:53 1997
@@ -762,7 +762,7 @@
 		sigignore,
 		sigcatch,
 		wchan,
-		tsk->nswap,
+		tsk->mm->nswap,
 		tsk->cnswap);
 }
Index: linux/linux/fs/proc/mem.c
diff -u linux/linux/fs/proc/mem.c:1.1.1.3 linux/linux/fs/proc/mem.c:1.2
--- linux/linux/fs/proc/mem.c:1.1.1.3	Fri Jul 25 18:29:00 1997
+++ linux/linux/fs/proc/mem.c	Sun Jul 27 05:52:43 1997
@@ -293,6 +293,7 @@
 		set_pte(src_table, pte_mkdirty(*src_table));
 		set_pte(dest_table, *src_table);
 		atomic_inc(&mem_map[MAP_NR(pte_page(*src_table))].count);
+		pte_link(dest_table, pte_page(*dest_table));
 
 		stmp += PAGE_SIZE;
 		dtmp += PAGE_SIZE;
Index: linux/linux/include/asm-i386/pgtable.h
diff -u linux/linux/include/asm-i386/pgtable.h:1.1.1.4 linux/linux/include/asm-i386/pgtable.h:1.16
--- linux/linux/include/asm-i386/pgtable.h:1.1.1.4	Fri Jul 25 18:29:48 1997
+++ linux/linux/include/asm-i386/pgtable.h	Sun Aug 3 11:58:47 1997
@@ -3,6 +3,8 @@
 
 #include
 
+#define PGTABLE_LINK_DEBUG
+
 /*
  * The Linux memory management assumes a three-level page table setup. On
  * the i386, we use that, but "fold" the mid level into the top-level page
@@ -296,6 +298,7 @@
 #define pmd_bad(x)	((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
 #define pmd_present(x)	(pmd_val(x) & _PAGE_PRESENT)
 #define pmd_clear(xp)	do { pmd_val(*(xp)) = 0; } while (0)
+#define pmd_next(x)	(NULL)
 
 /*
  * The "pgd_xxx()" functions here are trivial for a folded two-level
@@ -372,7 +375,7 @@
 */
 extern inline void pte_free_kernel(pte_t * pte)
 {
-	free_page((unsigned long) pte);
+	free_pages((unsigned long) pte, 1);
 }
 
 extern const char bad_pmd_string[];
@@ -381,16 +384,19 @@
 {
 	address = (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
 	if (pmd_none(*pmd)) {
-		pte_t * page = (pte_t *) get_free_page(GFP_KERNEL);
+		pte_t * page = (pte_t *) __get_free_pages(GFP_KERNEL, 1, 0);
 		if (pmd_none(*pmd)) {
 			if (page) {
+				mem_map[MAP_NR(page)].type = PGtype_pte;
+				mem_map[MAP_NR(page)].u.pte.pmd = NULL;
+				memset((void *) page, 0, PAGE_SIZE*2);
 				pmd_val(*pmd) = _KERNPG_TABLE + __pa(page);
 				return page + address;
 			}
 			pmd_val(*pmd) = _KERNPG_TABLE + __pa(BAD_PAGETABLE);
 			return NULL;
 		}
-		free_page((unsigned long) page);
+		free_pages((unsigned long) page, 1);
 	}
 	if (pmd_bad(*pmd)) {
 		printk(bad_pmd_string, pmd_val(*pmd));
@@ -414,9 +420,19 @@
 	return (pmd_t *) pgd;
 }
 
+#define pte_next(pte_p) (*(pte_t **)((unsigned long)(pte_p) + PAGE_SIZE))
 extern inline void pte_free(pte_t * pte)
 {
-	free_page((unsigned long) pte);
+	int i, ow = 0;
+	for (i = 0; i < 1024; i++) {
+		if (NULL != pte_next(pte+i)) {
+			printk("Akk!! pte not unlinked - %p (%08lx)\n", pte+i, pte_val(pte[i]));
+			ow = 1;
+		}
+	}
+	if (ow)
+		for (;;) ;
+	free_pages((unsigned long) pte, 1);
 }
 
 extern inline pte_t * pte_alloc(pmd_t * pmd, unsigned long address)
@@ -432,16 +448,18 @@
 
 getnew:
 	{
-		unsigned long page = __get_free_page(GFP_KERNEL);
+		unsigned long page = __get_free_pages(GFP_KERNEL, 1, 0);
 		if (!pmd_none(*pmd))
 			goto freenew;
 		if (!page)
 			goto oom;
-		memset((void *) page, 0, PAGE_SIZE);
+		mem_map[MAP_NR(page)].type = PGtype_pte;
+		mem_map[MAP_NR(page)].u.pte.pmd = pmd;
+		memset((void *) page, 0, PAGE_SIZE*2);
 		pmd_val(*pmd) = _PAGE_TABLE + __pa(page);
 		return (pte_t *) (page + address);
 freenew:
-		free_page(page);
+		free_pages(page, 1);
 		goto repeat;
 	}
 
@@ -452,6 +470,89 @@
 	return NULL;
 }
 
+#define pgd_getmm(pgd_p)	(mem_map[MAP_NR((unsigned long)pgd_p & PAGE_MASK)].u.pgd.mm)
+#define pgd_getoffset(pgd_p)	(((unsigned long)pgd_p & PAGE_SIZE) / sizeof(pgd_t))
+
+#define pmd_getpgd(pmd_p)	((pgd_t *)(pmd_p))
+#define pmd_getoffset(pmd_p)	(0)
+
+#define pte_getpmd(pte_p)	(mem_map[MAP_NR((unsigned long)pte_p & PAGE_MASK)].u.pte.pmd)
+#define pte_getoffset(pte_p)	(((unsigned long)pte_p & PAGE_SIZE) / sizeof(pte_t))
+
+static inline unsigned long ptepmdpgd_getaddr(pte_t *pte_p, pmd_t *pmd_p, pgd_t *pgd_p)
+{
+	return (pte_getoffset(pte_p) << PAGE_SHIFT)
+	     | (pmd_getoffset(pmd_p) << PMD_SHIFT)
+	     | (pgd_getoffset(pgd_p) << PGDIR_SHIFT);
+}
+
+extern unsigned long nr_pte_links, nr_zero_pte_links;
+extern unsigned long nr_pte_unlinks, nr_zero_pte_unlinks;
+
+static inline void pte_link(pte_t *pte_p, unsigned long addr)
+{
+	struct page *page = mem_map + MAP_NR(addr);
+#ifdef PGTABLE_LINK_DEBUG
+	if (MAP_NR(addr) >= max_mapnr || PageReserved(page)) {
+		printk("Aieee! pte_link(%p, %08lx) on reserved page!\n", pte_p, addr);
+		for (;;) ;
+	}
+	if (PGtype_free == page->type) {
+		printk("Aieee! pte_link(%p, %08lx) on free page! - %p\n", pte_p, addr, page->u.d.pte_head);
+		for (;;) ;
+	}
+	if (PGtype_data != page->type) {
+		printk("Aieee! pte_link(%p, %08lx) on page->type(%d) != PGtype_data!\n", pte_p, addr, page->type);
+		for (;;) ;
+	}
+	if (NULL != pte_next(pte_p)) {
+		printk("pte_link(%p, %08lx): pte already on list!\n", pte_p, addr);
+		for (;;) ;
+	}
+#endif
+	if (NULL == page->u.d.pte_head)
+		page->u.d.pte_head = pte_p;
+	pte_next(pte_p) = pte_next(page->u.d.pte_head);
+	pte_next(page->u.d.pte_head) = pte_p;
+	nr_pte_links++;
+}
+
+static inline void pte_unlink(pte_t *pte_p, unsigned long addr)
+{
+	struct page *page = mem_map + MAP_NR(addr);
+	pte_t **pte_pp;
+#ifdef PGTABLE_LINK_DEBUG
+	if (MAP_NR(ZERO_PAGE) == MAP_NR(addr)) {
+		printk("Aieee! pte_unlink(%p, %08lx) on zero page!\n", pte_p, addr);
+		for (;;) ;
+	}
+	if (MAP_NR(addr) >= max_mapnr || PageReserved(page)) {
+		printk("Aieee! pte_unlink(%p, %08lx) on reserved page!\n", pte_p, addr);
+		for (;;) ;
+	}
+	if (PGtype_free == page->type) {
+		printk("Aieee! pte_unlink(%p, %08lx) on free page! - %p\n", pte_p, addr, page->u.d.pte_head);
+		for (;;) ;
+	}
+	if (NULL == pte_next(pte_p) || NULL == page->u.d.pte_head) {
+notfound:
+		printk("pte_unlink(%p, %08lx): pte not on list(%p)! next(%p)\n", pte_p, addr, page->u.d.pte_head, pte_next(pte_p));
+		for (;;) ;
+	}
+	pte_pp = &pte_next(page->u.d.pte_head);
+	while (pte_p != *pte_pp) {
+		if (page->u.d.pte_head == *pte_pp || NULL == *pte_pp)
+			goto notfound;
+		pte_pp = &pte_next(*pte_pp);
+	}
+#else
+	pte_pp = &pte_next(page->u.d.pte_head);
+	while (pte_p != *pte_pp)
+		pte_pp = &pte_next(*pte_pp);
+#endif
+	*pte_pp = pte_next(*pte_pp);
+	pte_next(pte_p) = NULL;
+	if (pte_p == page->u.d.pte_head)
+		page->u.d.pte_head = *pte_pp;
+	nr_pte_unlinks++;
+}
+
 /*
  * allocating and freeing a pmd is trivial: the 1-entry pmd is
  * inside the pgd, so has no extra memory associated with it.
@@ -468,12 +569,20 @@
 
 extern inline void pgd_free(pgd_t * pgd)
 {
-	free_page((unsigned long) pgd);
+	PageSetPgd(&mem_map[MAP_NR((unsigned long) pgd)]);
+	free_pages((unsigned long) pgd, 1);
 }
 
-extern inline pgd_t * pgd_alloc(void)
+extern inline pgd_t * pgd_alloc(struct mm_struct *mm)
 {
-	return (pgd_t *) get_free_page(GFP_KERNEL);
+	unsigned long pgd = __get_free_pages(GFP_KERNEL, 1, 0);
+	if (pgd) {
+		memset((void *)pgd, 0, PAGE_SIZE*2);
+		PageSetPgd(&mem_map[MAP_NR(pgd)]);
+		mem_map[MAP_NR(pgd)].type = PGtype_pgd;
+		mem_map[MAP_NR(pgd)].u.pgd.mm = mm;
+	}
+	return (pgd_t *) pgd;
 }
 
 extern pgd_t swapper_pg_dir[1024];
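pte_next() is the heart of this pgtable.h change: page tables are now allocated two pages at a time, and the word sitting exactly PAGE_SIZE above each pte holds the next pte in a circular list rooted at page->u.d.pte_head - a reverse mapping from each physical page to every pte that maps it. A self-contained userspace model of the link/unlink discipline, with the table shrunk to 8 entries, the debug guards omitted, and all names as stand-ins:

    #include <assert.h>
    #include <string.h>

    #define NPTE 8				/* 1024 in the patch */

    typedef unsigned long pte_t;

    /* Model of an order-1 page-table allocation: ptes in the first page,
     * link pointers in the second page at the same offset. */
    struct pte_block {
    	pte_t entry[NPTE];
    	pte_t *next[NPTE];		/* pte_next(&entry[i]) == next[i] */
    };

    static struct pte_block blk;
    static pte_t *pte_head;			/* models page->u.d.pte_head */

    #define pte_next(p) (blk.next[(p) - blk.entry])

    static void pte_link(pte_t *pte_p)
    {
    	if (pte_head == NULL)
    		pte_head = pte_p;	/* first mapper: chain points at itself */
    	pte_next(pte_p) = pte_next(pte_head);
    	pte_next(pte_head) = pte_p;
    }

    static void pte_unlink(pte_t *pte_p)
    {
    	pte_t **pp = &pte_next(pte_head);
    	while (*pp != pte_p)
    		pp = &pte_next(*pp);	/* walk the circle to the predecessor */
    	*pp = pte_next(pte_p);
    	pte_next(pte_p) = NULL;
    	if (pte_p == pte_head)
    		pte_head = *pp;		/* head leaves: successor becomes head */
    }

    int main(void)
    {
    	memset(&blk, 0, sizeof(blk));
    	pte_link(&blk.entry[0]);
    	pte_link(&blk.entry[3]);
    	assert(pte_next(&blk.entry[0]) == &blk.entry[3]);
    	assert(pte_next(&blk.entry[3]) == &blk.entry[0]);	/* circular */
    	pte_unlink(&blk.entry[0]);
    	assert(pte_head == &blk.entry[3]);
    	pte_unlink(&blk.entry[3]);
    	assert(pte_head == NULL);			/* list drains cleanly */
    	return 0;
    }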
Index: linux/linux/include/linux/mm.h
diff -u linux/linux/include/linux/mm.h:1.1.1.3 linux/linux/include/linux/mm.h:1.4
--- linux/linux/include/linux/mm.h:1.1.1.3	Fri Jul 25 18:29:25 1997
+++ linux/linux/include/linux/mm.h	Tue Jul 29 02:01:37 1997
@@ -120,14 +120,37 @@
 	atomic_t count;
 	unsigned long flags;	/* atomic flags, some possibly updated asynchronously */
 	unsigned dirty:16,
+		 type:4,
+		 order:4,
		 age:8;
 	struct wait_queue *wait;
 	struct page **pprev_hash;
 	struct buffer_head * buffers;
-	unsigned long pg_swap_entry;
 	unsigned long map_nr;	/* page->map_nr == page - mem_map */
+	union {
+		struct {
+			unsigned long pg_swap_entry;
+			pte_t *pte_head;
+		} d;	/* normal 'managed' pages */
+		struct {
+			pmd_t *pmd;
+		} pte;
+		struct {
+			pgd_t *pgd;
+		} pmd;
+		struct {
+			struct mm_struct *mm;
+		} pgd;
+	} u;
 } mem_map_t;
 
+#define PGtype_other	0	/* internal unspecified kernel use */
+#define PGtype_data	1	/* standard vm managed page */
+#define PGtype_pte	2	/* pte_frompage(page) is valid */
+#define PGtype_pmd	3	/* pmd_frompage(page) is valid */
+#define PGtype_pgd	4	/* pgd_frompage(page) is valid */
+#define PGtype_free	5	/* a free page ;-) */
+
 /* Page flag bit values */
 #define PG_locked		 0
 #define PG_error		 1
@@ -139,6 +162,8 @@
 #define PG_DMA			 7
 #define PG_Slab			 8
 #define PG_swap_cache		 9
+#define PG_pgd			10
+#define PG_dirty		11
 #define PG_reserved		31
 
 /* Make it prettier to test the above... */
@@ -159,9 +184,11 @@
 #define PageSetSwapCache(page)	(set_bit(PG_swap_cache, &(page)->flags))
 #define PageTestandSetSwapCache(page) \
 			(test_and_set_bit(PG_swap_cache, &(page)->flags))
+#define PageSetPgd(page)	(set_bit(PG_pgd, &(page)->flags))
 
 #define PageClearSlab(page)	(clear_bit(PG_Slab, &(page)->flags))
 #define PageClearSwapCache(page)(clear_bit(PG_swap_cache, &(page)->flags))
+#define PageClearPgd(page)	(clear_bit(PG_pgd, &(page)->flags))
 
 #define PageTestandClearSwapCache(page) \
 			(test_and_clear_bit(PG_swap_cache, &(page)->flags))
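The union above is discriminated by the new page->type nibble: u.d is only meaningful while the page is PGtype_data, u.pte.pmd only while it is PGtype_pte, and so on. A trimmed-down model of that discipline - stand-in types, not the kernel's declarations:

    #include <stdio.h>

    /* Cut-down stand-in for the patched mem_map_t tagged union. */
    enum pgtype { PGtype_other, PGtype_data, PGtype_pte,
    	      PGtype_pmd, PGtype_pgd, PGtype_free };

    struct page {
    	unsigned type:4;
    	union {
    		struct { unsigned long pg_swap_entry; void *pte_head; } d;
    		struct { void *pmd; } pte;
    		struct { void *mm; } pgd;
    	} u;
    };

    /* Only read a union member after checking the discriminator. */
    static unsigned long swap_entry_of(const struct page *page)
    {
    	return page->type == PGtype_data ? page->u.d.pg_swap_entry : 0;
    }

    int main(void)
    {
    	struct page p = { .type = PGtype_data };
    	p.u.d.pg_swap_entry = 0x1234;
    	printf("%#lx\n", swap_entry_of(&p));	/* prints 0x1234 */
    	return 0;
    }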
Index: linux/linux/include/linux/sched.h
diff -u linux/linux/include/linux/sched.h:1.1.1.3 linux/linux/include/linux/sched.h:1.3
--- linux/linux/include/linux/sched.h:1.1.1.3	Fri Jul 25 18:29:25 1997
+++ linux/linux/include/linux/sched.h	Wed Jul 30 23:01:25 1997
@@ -151,6 +151,7 @@
 	unsigned long rss, total_vm, locked_vm;
 	unsigned long def_flags;
 	unsigned long cpu_vm_mask;
+	unsigned long nswap;
 };
 
 #define INIT_MM {	\
@@ -161,7 +162,8 @@
 	0, 0, 0, 0,	\
 	0, 0, 0, 0,	\
 	0, 0, 0,	\
-	0, 0 }
+	0, 0,		\
+	0 }
 
 struct signal_struct {
 	atomic_t count;
@@ -228,7 +230,7 @@
 	struct tms times;
 	unsigned long start_time;
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
-	unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;
+	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt, cnswap;
 	int swappable:1;
 	unsigned long swap_address;
 	unsigned long old_maj_flt;	/* old value of maj_flt */
@@ -323,7 +325,7 @@
/* timeout */	0,SCHED_OTHER,0,0,0,0,0,0,0, \
/* timer */	{ NULL, NULL, 0, 0, it_real_fn }, \
/* utime */	{0,0,0,0},0, \
-/* flt */	0,0,0,0,0,0, \
+/* flt */	0,0,0,0,0, \
/* swp */	0,0,0,0,0, \
/* rlimits */	INIT_RLIMITS, \
/* math */	0, \
Index: linux/linux/include/linux/swap.h
diff -u linux/linux/include/linux/swap.h:1.1.1.3 linux/linux/include/linux/swap.h:1.2
--- linux/linux/include/linux/swap.h:1.1.1.3	Fri Jul 25 18:29:37 1997
+++ linux/linux/include/linux/swap.h	Mon Jul 28 23:32:26 1997
@@ -100,7 +100,7 @@
 extern inline unsigned long in_swap_cache(struct page *page)
 {
 	if (PageSwapCache(page))
-		return page->pg_swap_entry;
+		return page->u.d.pg_swap_entry;
 	return 0;
 }
 
@@ -113,7 +113,7 @@
 #ifdef SWAP_CACHE_INFO
 		swap_cache_find_success++;
 #endif
-		return page->pg_swap_entry;
+		return page->u.d.pg_swap_entry;
 	}
 	return 0;
 }
@@ -127,7 +127,7 @@
 #ifdef SWAP_CACHE_INFO
 		swap_cache_del_success++;
 #endif
-		swap_free(page->pg_swap_entry);
+		swap_free(page->u.d.pg_swap_entry);
 		return 1;
 	}
 	return 0;
Index: linux/linux/ipc/shm.c
diff -u linux/linux/ipc/shm.c:1.1.1.4 linux/linux/ipc/shm.c:1.2
--- linux/linux/ipc/shm.c:1.1.1.4	Fri Jul 25 18:30:48 1997
+++ linux/linux/ipc/shm.c	Sun Jul 27 05:52:47 1997
@@ -465,6 +465,7 @@
 			break;
 		}
 		set_pte(page_table, __pte(shm_sgn));
+		pte_link(page_table, pte_page(__pte(shm_sgn)));
 	}
 	flush_tlb_range(shmd->vm_mm, shmd->vm_start, shmd->vm_end);
 	return error;
@@ -801,6 +802,7 @@
 	set_pte(page_table,
 		__pte(shmd->vm_pte + SWP_ENTRY(0, idx << SHM_IDX_SHIFT)));
 	atomic_dec(&mem_map[MAP_NR(pte_page(pte))].count);
+	pte_unlink(page_table, pte_page(page));
 	if (shmd->vm_mm->rss > 0)
 		shmd->vm_mm->rss--;
 	flush_tlb_page(shmd, tmp);
Index: linux/linux/kernel/exit.c
diff -u linux/linux/kernel/exit.c:1.1.1.3 linux/linux/kernel/exit.c:1.2
--- linux/linux/kernel/exit.c:1.1.1.3	Fri Jul 25 18:29:20 1997
+++ linux/linux/kernel/exit.c	Wed Jul 30 22:41:34 1997
@@ -144,7 +144,7 @@
 		release_thread(p);
 		current->cmin_flt += p->min_flt + p->cmin_flt;
 		current->cmaj_flt += p->maj_flt + p->cmaj_flt;
-		current->cnswap += p->nswap + p->cnswap;
+		current->cnswap += p->mm->nswap + p->cnswap;
 		free_task_struct(p);
 	} else {
 		printk("task releasing itself\n");
Index: linux/linux/kernel/fork.c
diff -u linux/linux/kernel/fork.c:1.1.1.4 linux/linux/kernel/fork.c:1.2
--- linux/linux/kernel/fork.c:1.1.1.4	Fri Jul 25 18:29:20 1997
+++ linux/linux/kernel/fork.c	Wed Jul 30 22:41:34 1997
@@ -275,7 +275,7 @@
 	tsk->mm = mm;
 	tsk->min_flt = tsk->maj_flt = 0;
 	tsk->cmin_flt = tsk->cmaj_flt = 0;
-	tsk->nswap = tsk->cnswap = 0;
+	tsk->cnswap = 0;
 	if (new_page_tables(tsk))
 		goto free_mm;
 	if (dup_mmap(mm)) {
Index: linux/linux/kernel/sys.c
diff -u linux/linux/kernel/sys.c:1.1.1.4 linux/linux/kernel/sys.c:1.2
--- linux/linux/kernel/sys.c:1.1.1.4	Fri Jul 25 18:29:20 1997
+++ linux/linux/kernel/sys.c	Wed Jul 30 22:41:34 1997
@@ -1084,7 +1084,7 @@
 			r.ru_stime.tv_usec = CT_TO_USECS(p->times.tms_stime);
 			r.ru_minflt = p->min_flt;
 			r.ru_majflt = p->maj_flt;
-			r.ru_nswap = p->nswap;
+			r.ru_nswap = p->mm->nswap;
 			break;
 		case RUSAGE_CHILDREN:
 			r.ru_utime.tv_sec = CT_TO_SECS(p->times.tms_cutime);
@@ -1102,7 +1102,7 @@
 			r.ru_stime.tv_usec = CT_TO_USECS(p->times.tms_stime + p->times.tms_cstime);
 			r.ru_minflt = p->min_flt + p->cmin_flt;
 			r.ru_majflt = p->maj_flt + p->cmaj_flt;
-			r.ru_nswap = p->nswap + p->cnswap;
+			r.ru_nswap = p->mm->nswap + p->cnswap;
 			break;
 	}
 	return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
Index: linux/linux/mm/filemap.c
diff -u linux/linux/mm/filemap.c:1.1.1.5 linux/linux/mm/filemap.c:1.4
--- linux/linux/mm/filemap.c:1.1.1.5	Fri Jul 25 18:29:24 1997
+++ linux/linux/mm/filemap.c	Tue Jul 29 21:17:09 1997
@@ -28,7 +28,7 @@
 #include
 #include
 #include
-
+#define inline /**/
 /*
  * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
@@ -250,6 +250,7 @@
 				    struct page **hash)
 {
 	atomic_inc(&page->count);
+	page->type = PGtype_data;
 	page->flags &= ~((1 << PG_uptodate) | (1 << PG_error));
 	page->offset = offset;
 	add_page_to_inode_queue(inode, page);
@@ -821,6 +822,7 @@
 	copy_page(new_page, old_page);
 	flush_page_to_ram(new_page);
 	release_page(page);
+	mem_map[MAP_NR(new_page)].type = PGtype_data;
 	return new_page;
 
 no_cached_page:
@@ -983,6 +985,7 @@
 	flush_cache_page(vma, (offset + vma->vm_start - vma->vm_offset));
 	set_pte(page_table, __pte(entry));
+	pte_unlink(page_table, page);
 	flush_tlb_page(vma, (offset + vma->vm_start - vma->vm_offset));
 	error = filemap_write_page(vma, offset, page);
 	if (pte_val(*page_table) == entry)
@@ -1037,6 +1040,7 @@
 		return 0;
 	}
 	page = pte_page(pte);
+	pte_unlink(ptep, page);
 	if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
 		free_page(page);
 		return 0;
Index: linux/linux/mm/memory.c
diff -u linux/linux/mm/memory.c:1.1.1.3 linux/linux/mm/memory.c:1.9
--- linux/linux/mm/memory.c:1.1.1.3	Fri Jul 25 18:29:22 1997
+++ linux/linux/mm/memory.c	Sun Aug 3 04:06:43 1997
@@ -50,6 +50,8 @@
 #include
 #include
 
+#define inline /**/
+
 unsigned long max_mapnr = 0;
 unsigned long num_physpages = 0;
 void * high_memory = NULL;
@@ -166,7 +168,7 @@
 {
 	pgd_t * page_dir, * new_pg;
 
-	if (!(new_pg = pgd_alloc()))
+	if (!(new_pg = pgd_alloc(tsk->mm)))
 		return -ENOMEM;
 	page_dir = pgd_offset(&init_mm, 0);
 	memcpy(new_pg + USER_PTRS_PER_PGD, page_dir + USER_PTRS_PER_PGD,
@@ -200,6 +202,7 @@
 	set_pte(new_pte, pte_mkold(pte));
 	set_pte(old_pte, pte);
 	atomic_inc(&mem_map[page_nr].count);
+	pte_link(new_pte, pte_page(pte));
 }
 
 static inline int copy_pte_range(pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long address, unsigned long size, int cow)
@@ -291,12 +294,13 @@
 	return error;
 }
 
-static inline void free_pte(pte_t page)
+static inline void free_pte(pte_t *page_table, pte_t page)
 {
 	if (pte_present(page)) {
 		unsigned long addr = pte_page(page);
 		if (MAP_NR(addr) >= max_mapnr || PageReserved(mem_map+MAP_NR(addr)))
 			return;
+		pte_unlink(page_table, addr);
 		free_page(addr);
 		if (current->mm->rss <= 0)
 			return;
@@ -306,11 +310,11 @@
 	swap_free(pte_val(page));
 }
 
-static inline void forget_pte(pte_t page)
+static inline void forget_pte(pte_t *page_table, pte_t page)
 {
 	if (!pte_none(page)) {
 		printk("forget_pte: old mapping existed!\n");
-		free_pte(page);
+		free_pte(page_table, page);
 	}
 }
 
@@ -340,7 +344,7 @@
 		if (pte_none(page))
 			continue;
 		pte_clear(pte-1);
-		free_pte(page);
+		free_pte(pte-1, page);
 	}
 }
 
@@ -395,7 +399,7 @@
 	do {
 		pte_t oldpage = *pte;
 		set_pte(pte, zero_pte);
-		forget_pte(oldpage);
+		forget_pte(pte, oldpage);
 		address += PAGE_SIZE;
 		pte++;
 	} while (address < end);
@@ -468,7 +472,7 @@
 		mapnr = MAP_NR(__va(phys_addr));
 		if (mapnr >= max_mapnr || PageReserved(mem_map+mapnr))
 			set_pte(pte, mk_pte_phys(phys_addr, prot));
-		forget_pte(oldpage);
+		forget_pte(pte, oldpage);
 		address += PAGE_SIZE;
 		phys_addr += PAGE_SIZE;
 		pte++;
@@ -568,6 +572,8 @@
 	}
 	flush_page_to_ram(page);
 	set_pte(pte, pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY))));
+	mem_map[MAP_NR(page)].type = PGtype_data;
+	pte_link(pte, page);
/* no need for flush_tlb */
 	return page;
 }
@@ -617,12 +623,21 @@
 		flush_page_to_ram(new_page);
 		flush_cache_page(vma, address);
 		set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
+		if (!PageReserved(mem_map + MAP_NR(old_page))) {
+			static int foo = 0;
+			if (++foo < 5)
+				printk("do_wp_page: pte_unlink: addr = %08lx\n", address);
+			pte_unlink(page_table, old_page);
+		}
+		mem_map[MAP_NR(new_page)].type = PGtype_data;
+		pte_link(page_table, new_page);
 		free_page(old_page);
 		flush_tlb_page(vma, address);
 		return;
 	}
 	flush_cache_page(vma, address);
 	set_pte(page_table, BAD_PAGE);
+	pte_unlink(page_table, old_page);
 	flush_tlb_page(vma, address);
 	free_page(old_page);
 	oom(tsk);
@@ -752,6 +767,7 @@
 		++tsk->maj_flt;
 		flush_page_to_ram(pte_page(page));
 		set_pte(page_table, page);
+		pte_link(page_table, pte_page(page));
 		return;
 	}
 
@@ -803,6 +819,7 @@
 		    !(vma->vm_flags & VM_SHARED))
 			entry = pte_wrprotect(entry);
 		put_page(page_table, entry);
+		pte_link(page_table, page);
 		/* no need to invalidate: a not-present page shouldn't be cached */
 		return;
 
@@ -817,6 +834,8 @@
 		vma->vm_mm->rss++;
 		tsk->min_flt++;
 		flush_page_to_ram(page);
+		mem_map[MAP_NR(page)].type = PGtype_data;
+		pte_link(page_table, page);
 	}
 	put_page(page_table, entry);
 	return;
Index: linux/linux/mm/mremap.c
diff -u linux/linux/mm/mremap.c:1.1.1.3 linux/linux/mm/mremap.c:1.2
--- linux/linux/mm/mremap.c:1.1.1.3	Fri Jul 25 18:29:24 1997
+++ linux/linux/mm/mremap.c	Sun Jul 27 05:52:48 1997
@@ -73,6 +73,12 @@
 	if (dst) {
 		pte_clear(src);
 		set_pte(dst, pte);
+		if (pte_present(pte) &&
+		    MAP_NR(pte_page(pte)) <= max_mapnr &&
+		    !PageReserved(mem_map+MAP_NR(pte_page(pte)))) {
+			pte_unlink(src, pte_page(pte));
+			pte_link(dst, pte_page(pte));
+		}
 		error--;
 	}
 }
Index: linux/linux/mm/page_alloc.c
diff -u linux/linux/mm/page_alloc.c:1.1.1.4 linux/linux/mm/page_alloc.c:1.20
--- linux/linux/mm/page_alloc.c:1.1.1.4	Fri Jul 25 18:29:24 1997
+++ linux/linux/mm/page_alloc.c	Tue Aug 5 22:08:09 1997
@@ -26,6 +26,12 @@
 #include
 #include
 #include
+#define inline /**/
+
+unsigned long nr_pte_links = 0;
+unsigned long nr_zero_pte_links = 0;
+unsigned long nr_pte_unlinks = 0;
+unsigned long nr_zero_pte_unlinks = 0;
 
 int nr_swap_pages = 0;
 int nr_free_pages = 0;
@@ -50,6 +56,7 @@
 	struct page *next;
 	struct page *prev;
 	unsigned int * map;
+	unsigned long inuse;
 };
 
 #define memory_head(x) ((struct page *)(x))
@@ -100,7 +107,7 @@
 #ifdef __SMP__
 static spinlock_t page_alloc_lock;
 #endif
-
+void *caller;
 static inline void free_pages_ok(unsigned long map_nr, unsigned long order)
 {
 	struct free_area_struct *area = free_area + order;
@@ -109,9 +116,28 @@
 	unsigned long flags;
 
 	spin_lock_irqsave(&page_alloc_lock, flags);
+#ifdef PGTABLE_LINK_DEBUG
+	if (NULL != mem_map[map_nr].u.d.pte_head) {
+		printk("Aieee! freeing page %08lx with pte_head = %p!\n", (PAGE_OFFSET + (map_nr << PAGE_SHIFT)), mem_map[map_nr].u.d.pte_head);
+		*(char *)0 = 0; sti();
+		for (;;) ;
+	}
+	if (PGtype_free == mem_map[map_nr].type) {
+		printk("Aiee: freeing free page(%lu) from %p/%p!\n", map_nr, caller, mem_map[map_nr].u.d.pte_head);
+		goto out;
+	}
+	mem_map[map_nr].u.d.pte_head = caller;
+#endif
+	if (order != mem_map[map_nr].order) {
+		printk("Aiee: freeing page(%lu) with wrong order (%luv%d)! from %p\n", map_nr, order, mem_map[map_nr].order, caller);
+		goto out;
+	}
+	mem_map[map_nr].type = PGtype_free;
 #define list(x) (mem_map+(x))
 
+	if (area->inuse)
+		area->inuse--;
 	map_nr &= mask;
 	nr_free_pages -= mask;
 	while (mask + (1 << (NR_MEM_LISTS-1))) {
@@ -127,6 +153,7 @@
 
 #undef list
 
+out:
 	spin_unlock_irqrestore(&page_alloc_lock, flags);
 }
 
@@ -134,6 +161,7 @@
 {
 	if (!PageReserved(page) && atomic_dec_and_test(&page->count)) {
 		delete_from_swap_cache(page);
+		caller = ((void **)&page)[-1];
 		free_pages_ok(page->map_nr, 0);
 	}
 }
@@ -148,6 +176,7 @@
 			return;
 		if (atomic_dec_and_test(&map->count)) {
 			delete_from_swap_cache(map);
+			caller = ((void **)&addr)[-1];
 			free_pages_ok(map_nr, order);
 			return;
 		}
@@ -172,6 +201,10 @@
 			MARK_USED(map_nr, new_order, area); \
 			nr_free_pages -= 1 << order; \
 			EXPAND(ret, map_nr, order, new_order, area); \
+			free_area[order].inuse++; \
+			mem_map[map_nr].order = order; \
+			mem_map[map_nr].type = PGtype_other; \
+			mem_map[map_nr].u.d.pte_head = NULL; \
 			spin_unlock_irqrestore(&page_alloc_lock, flags); \
 			return ADDRESS(map_nr); \
 		} \
@@ -213,7 +246,7 @@
 		reserved_pages = 5;
 	if (priority != GFP_NFS)
-		reserved_pages = min_free_pages;
+		reserved_pages = (priority == GFP_KERNEL) ? free_pages_low : min_free_pages;
repeat:
 	spin_lock_irqsave(&page_alloc_lock, flags);
 	if ((priority==GFP_ATOMIC) || nr_free_pages > reserved_pages) {
@@ -222,7 +255,7 @@
 		return 0;
 	}
 	spin_unlock_irqrestore(&page_alloc_lock, flags);
-	if (priority != GFP_BUFFER && try_to_free_page(priority, dma, 1))
+	if (priority != GFP_BUFFER && try_to_free_page(priority, dma, order))
 		goto repeat;
 	return 0;
 }
@@ -236,20 +269,28 @@
 {
 	unsigned long order, flags;
 	unsigned long total = 0;
+	unsigned long total_data = 0;
+	unsigned long nr;
 
 	printk("Free pages:      %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10));
 	spin_lock_irqsave(&page_alloc_lock, flags);
 	for (order=0 ; order < NR_MEM_LISTS; order++) {
 		struct page * tmp;
-		unsigned long nr = 0;
+		nr = 0;
 		for (tmp = free_area[order].next ; tmp != memory_head(free_area+order) ; tmp = tmp->next) {
 			nr ++;
 		}
 		total += nr * ((PAGE_SIZE>>10) << order);
-		printk("%lu*%lukB ", nr, (unsigned long)((PAGE_SIZE>>10) << order));
+		printk("%lu/%lu*%lukB ", nr, free_area[order].inuse, (unsigned long)((PAGE_SIZE>>10) << order));
 	}
+	for (nr=0; nr>10));
+#ifdef PGTABLE_LINK_DEBUG
+	printk("pte_links: %lu of zero: %lu\n", nr_pte_links, nr_zero_pte_links);
+	printk("pte_unlinks: %lu of zero: %lu\n", nr_pte_unlinks, nr_zero_pte_unlinks);
+#endif
 #ifdef SWAP_CACHE_INFO
 	show_swap_cache_info();
 #endif
@@ -288,7 +329,7 @@
 		atomic_set(&p->count, 0);
 		p->flags = (1 << PG_DMA) | (1 << PG_reserved);
 		p->map_nr = p - mem_map;
-	} while (p > mem_map);
+	} while (p >= mem_map);
 
 	for (i = 0 ; i < NR_MEM_LISTS ; i++) {
 		unsigned long bitmap_size;
@@ -327,6 +368,7 @@
 		oom(tsk);
 		return;
 	}
+	mem_map[MAP_NR(page)].type = PGtype_data;
 	read_swap_page(entry, (char *) page);
 	if (pte_val(*page_table) != entry) {
 		free_page(page);
@@ -334,6 +376,8 @@
 	}
 	vma->vm_mm->rss++;
 	tsk->maj_flt++;
+	mem_map[MAP_NR(page)].type = PGtype_data;
+	pte_link(page_table, page);
 	if (!write_access && add_to_swap_cache(&mem_map[MAP_NR(page)], entry)) {
 		/* keep swap page allocated for the moment (swap cache) */
 		set_pte(page_table, mk_pte(page, vma->vm_page_prot));
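free_page() and __free_page() now stash a 'caller' breadcrumb for the debug printks by reading ((void **)&page)[-1] - the return address that happens to sit one word below the first argument in an i386/cdecl stack frame, a trick that only works with the frame layouts of the day. With GCC the same breadcrumb is available portably; a minimal sketch, with debug_free as a hypothetical stand-in:

    #include <stdio.h>

    static void *caller;	/* mirrors the patch's global */

    static void debug_free(void *obj)
    {
    	/* portable equivalent of the ((void **)&obj)[-1] stack peek */
    	caller = __builtin_return_address(0);
    	(void)obj;		/* real freeing work would go here */
    }

    int main(void)
    {
    	int x;
    	debug_free(&x);
    	printf("freed from %p\n", caller);
    	return 0;
    }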
Index: linux/linux/mm/page_io.c
diff -u linux/linux/mm/page_io.c:1.1.1.3 linux/linux/mm/page_io.c:1.2
--- linux/linux/mm/page_io.c:1.1.1.3	Fri Jul 25 18:29:24 1997
+++ linux/linux/mm/page_io.c	Mon Jul 28 23:32:27 1997
@@ -85,7 +85,7 @@
 			set_bit(PG_swap_unlock_after, &page->flags);
 			/* swap-cache shouldn't be set, but play safe */
 			PageClearSwapCache(page);
-			page->pg_swap_entry = entry;
+			page->u.d.pg_swap_entry = entry;
 			atomic_inc(&nr_async_pages);
 		}
 		ll_rw_page(rw,p->swap_device,offset,buf);
Index: linux/linux/mm/swap_state.c
diff -u linux/linux/mm/swap_state.c:1.1.1.3 linux/linux/mm/swap_state.c:1.2
--- linux/linux/mm/swap_state.c:1.1.1.3	Fri Jul 25 18:29:24 1997
+++ linux/linux/mm/swap_state.c	Mon Jul 28 23:32:27 1997
@@ -49,7 +49,7 @@
 	swap_cache_add_total++;
 #endif
 	if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
-		page->pg_swap_entry = entry;
+		page->u.d.pg_swap_entry = entry;
 		if (PageTestandSetSwapCache(page))
 			printk("swap_cache: replacing non-empty entry\n");
 #ifdef SWAP_CACHE_INFO
Index: linux/linux/mm/swapfile.c
diff -u linux/linux/mm/swapfile.c:1.1.1.4 linux/linux/mm/swapfile.c:1.3
--- linux/linux/mm/swapfile.c:1.1.1.4	Fri Jul 25 18:29:24 1997
+++ linux/linux/mm/swapfile.c	Tue Jul 29 01:56:00 1997
@@ -169,7 +169,7 @@
 * from the beginning for this process..
 */
 static inline int unuse_pte(struct vm_area_struct * vma, unsigned long address,
-	pte_t *dir, unsigned int type, unsigned long page)
+	pte_t *dir, unsigned int type)
 {
 	pte_t pte = *dir;
 
@@ -191,21 +191,13 @@
 	}
 	if (SWP_TYPE(pte_val(pte)) != type)
 		return 0;
-	read_swap_page(pte_val(pte), (char *) page);
-	if (pte_val(*dir) != pte_val(pte)) {
-		free_page(page);
-		return 1;
-	}
-	set_pte(dir, pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))));
-	flush_tlb_page(vma, address);
-	++vma->vm_mm->rss;
-	swap_free(pte_val(pte));
+	swap_in(current, vma, dir, pte_val(pte), 0);
 	return 1;
 }
 
 static inline int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
 	unsigned long address, unsigned long size, unsigned long offset,
-	unsigned int type, unsigned long page)
+	unsigned int type)
 {
 	pte_t * pte;
 	unsigned long end;
@@ -224,7 +216,7 @@
 	if (end > PMD_SIZE)
 		end = PMD_SIZE;
 	do {
-		if (unuse_pte(vma, offset+address-vma->vm_start, pte, type, page))
+		if (unuse_pte(vma, offset+address-vma->vm_start, pte, type))
 			return 1;
 		address += PAGE_SIZE;
 		pte++;
@@ -234,7 +226,7 @@
 
 static inline int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
 	unsigned long address, unsigned long size,
-	unsigned int type, unsigned long page)
+	unsigned int type)
 {
 	pmd_t * pmd;
 	unsigned long offset, end;
@@ -253,7 +245,7 @@
 	if (end > PGDIR_SIZE)
 		end = PGDIR_SIZE;
 	do {
-		if (unuse_pmd(vma, pmd, address, end - address, offset, type, page))
+		if (unuse_pmd(vma, pmd, address, end - address, offset, type))
 			return 1;
 		address = (address + PMD_SIZE) & PMD_MASK;
 		pmd++;
@@ -263,10 +255,10 @@
 
 static int unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
 	unsigned long start, unsigned long end,
-	unsigned int type, unsigned long page)
+	unsigned int type)
 {
 	while (start < end) {
-		if (unuse_pgd(vma, pgdir, start, end - start, type, page))
+		if (unuse_pgd(vma, pgdir, start, end - start, type))
 			return 1;
 		start = (start + PGDIR_SIZE) & PGDIR_MASK;
 		pgdir++;
@@ -274,7 +266,7 @@
 	return 0;
 }
 
-static int unuse_process(struct mm_struct * mm, unsigned int type, unsigned long page)
+static int unuse_process(struct mm_struct * mm, unsigned int type)
 {
 	struct vm_area_struct* vma;
 
@@ -286,7 +278,7 @@
 	vma = mm->mmap;
 	while (vma) {
 		pgd_t * pgd = pgd_offset(mm, vma->vm_start);
-		if (unuse_vma(vma, pgd, vma->vm_start, vma->vm_end, type, page))
+		if (unuse_vma(vma, pgd, vma->vm_start, vma->vm_end, type))
 			return 1;
 		vma = vma->vm_next;
 	}
@@ -305,21 +297,22 @@
 	if (!page)
 		return -ENOMEM;
+	free_page(page);
again:
 	read_lock(&tasklist_lock);
 	for_each_task(p) {
 		read_unlock(&tasklist_lock);
-		if(unuse_process(p->mm, type, page)) {
+		if(unuse_process(p->mm, type)) {
 			page = get_free_page(GFP_KERNEL);
 			if(!page)
 				return -ENOMEM;
+			free_page(page);
 			goto again;
 		}
 		read_lock(&tasklist_lock);
 	}
 	read_unlock(&tasklist_lock);
-	free_page(page);
 	return 0;
 }
Index: linux/linux/mm/vmalloc.c
diff -u linux/linux/mm/vmalloc.c:1.1.1.3 linux/linux/mm/vmalloc.c:1.3
--- linux/linux/mm/vmalloc.c:1.1.1.3	Fri Jul 25 18:29:22 1997
+++ linux/linux/mm/vmalloc.c	Sun Aug 3 04:18:56 1997
@@ -37,6 +37,9 @@
 		if (pte_none(page))
 			continue;
 		if (pte_present(page)) {
+#if 0
+			pte_unlink(pte, pte_page(page));
+#endif
 			free_page(pte_page(page));
 			continue;
 		}
@@ -99,6 +102,10 @@
 		if (!page)
 			return -ENOMEM;
 		set_pte(pte, mk_pte(page, PAGE_KERNEL));
+#if 0
+		mem_map[MAP_NR(page)].type = PGtype_data;
+		pte_link(pte, page);
+#endif
 		address += PAGE_SIZE;
 		pte++;
 	}
Index: linux/linux/mm/vmscan.c
diff -u linux/linux/mm/vmscan.c:1.1.1.4 linux/linux/mm/vmscan.c:1.48
--- linux/linux/mm/vmscan.c:1.1.1.4	Fri Jul 25 18:29:24 1997
+++ linux/linux/mm/vmscan.c	Tue Aug 5 22:59:02 1997
@@ -7,7 +7,8 @@
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to free_pages_high: 2.4.97, Rik van Riel.
- *  Version: $Id: vmscan.c,v 1.23 1997/04/12 04:31:05 davem Exp $
+ *  Major rewrite in honour of The Great Pumpkin 28.7.97, Benjamin LaHaise.
+ *  Version: $Id: vmscan.c,v 1.47 1997/08/06 02:55:28 blah Exp $
 */
 
 #include
@@ -23,6 +24,7 @@
 #include
 #include
 #include
+#include
 
 #include
 #include	/* for cli()/sti() */
@@ -30,10 +32,13 @@
 #include
 #include
 
+#define inline /**/
+
 /*
 * When are we next due for a page scan?
 */
-static int next_swap_jiffies = 0;
+static int last_swap_jiffies = 0;
+static int last_age_jiffies = 0;
 
 /*
 * How often do we do a pageout scan during normal conditions?
@@ -53,6 +58,17 @@
 
 static void init_swap_timer(void);
 
+static void kill_mm(struct mm_struct *mm)
+{
+	struct task_struct *t;
+	read_lock(&tasklist_lock);
+	for_each_task(t) {
+		if (t->mm == mm)
+			send_sig(SIGBUS, t, 1);
+	}
+	read_unlock(&tasklist_lock);
+}
+
 /*
 * The swap-out functions return 1 if they successfully
 * threw something out, and we got a free page. It returns
@@ -64,58 +80,76 @@
 * using a process that no longer actually exists (it might
 * have died while we slept).
 */
-static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma,
-	unsigned long address, pte_t * page_table, int dma, int wait)
+static inline int swap_out_pte (pte_t *page_table, struct page *page_map, int wait)
 {
 	pte_t pte;
 	unsigned long entry;
 	unsigned long page;
-	struct page * page_map;
+	struct vm_area_struct *vma;
+	unsigned long address;
+	pmd_t *pmd_p;
+	pgd_t *pgd_p;
+	struct mm_struct *mm;
 
 	pte = *page_table;
-	if (!pte_present(pte))
-		return 0;
 	page = pte_page(pte);
-	if (MAP_NR(page) >= max_mapnr)
+#ifdef PGTABLE_LINK_DEBUG
+	if (!pte_present(pte)) {
+		printk("try_to_swap_out: Aiee! page(%lu) not present! pte=%08lx\n", page_map->map_nr, pte_val(pte));
 		return 0;
-
-	page_map = mem_map + MAP_NR(page);
-	if (PageReserved(page_map)
-	    || PageLocked(page_map)
-	    || (dma && !PageDMA(page_map)))
+	}
+	if (page_map->map_nr != (page_map - mem_map)) {
+		printk("try_to_swap_out: Aiee! page_map->map_nr(%lu) disagrees with page_map(%u)\n", page_map->map_nr, page_map-mem_map);
 		return 0;
-	/* Deal with page aging.  Pages age from being unused; they
-	 * rejuvenate on being accessed.  Only swap old pages (age==0
-	 * is oldest). */
-	if ((pte_dirty(pte) && delete_from_swap_cache(page_map))
-	    || pte_young(pte)) {
-		set_pte(page_table, pte_mkold(pte));
-		touch_page(page_map);
+	}
+#endif
+	pmd_p = pte_getpmd(page_table);
+	if (!pmd_p) {
+		printk("swap_out_pte: no pmd!\n");
 		return 0;
 	}
-	age_page(page_map);
-	if (page_map->age)
+	pgd_p = pmd_getpgd(pmd_p);
+	if (!pgd_p) {
+		printk("swap_out_pte: no pgd!\n");
+		return 0;
+	}
+	mm = pgd_getmm(pgd_p);
+	if (!mm) {
+		printk("swap_out_pte: no mm!\n");
+		return 0;
+	}
+	address = ptepmdpgd_getaddr(page_table, pmd_p, pgd_p);
+	vma = find_vma(mm, address);
+	/* Don't swap out areas like shared memory which have their
+	   own separate swapping mechanism or areas which are locked down */
+	if (vma->vm_flags & (VM_SHM | VM_LOCKED))
 		return 0;
 	if (pte_dirty(pte)) {
 		if (vma->vm_ops && vma->vm_ops->swapout) {
-			pid_t pid = tsk->pid;
 			vma->vm_mm->rss--;
 			if (vma->vm_ops->swapout(vma, address - vma->vm_start + vma->vm_offset, page_table))
-				kill_proc(pid, SIGBUS, 1);
+				kill_mm(vma->vm_mm);
 		} else {
-			if (atomic_read(&page_map->count) != 1)
+			/* this isn't right: can only happen with a private map that's shared and dirty */
+#if 0
+			if (atomic_read(&page_map->count) != 1) {
+				printk("swap_out_pte: pte_dirty, page shared(%d), but no swapout operation!\n", atomic_read(&page_map->count));
 				return 0;
+			}
+#endif
 			if (!(entry = get_swap_page()))
 				return 0;
 			vma->vm_mm->rss--;
 			flush_cache_page(vma, address);
 			set_pte(page_table, __pte(entry));
+			pte_unlink(page_table, page);
 			flush_tlb_page(vma, address);
-			tsk->nswap++;
+			mm->nswap++;
 			rw_swap_page(WRITE, entry, (char *) page, wait);
 		}
+		entry = atomic_read(&page_map->count);
 		free_page(page);
-		return 1;	/* we slept: the process may not exist any more */
+		return entry;	/* we slept: the process may not exist any more */
 	}
 	if ((entry = find_in_swap_cache(page_map))) {
 		if (atomic_read(&page_map->count) != 1) {
@@ -126,266 +160,231 @@
 		vma->vm_mm->rss--;
 		flush_cache_page(vma, address);
 		set_pte(page_table, __pte(entry));
+		pte_unlink(page_table, page);
 		flush_tlb_page(vma, address);
+		entry = atomic_read(&page_map->count);
 		free_page(page);
-		return 1;
+		return entry;
 	}
 	vma->vm_mm->rss--;
 	flush_cache_page(vma, address);
 	pte_clear(page_table);
+	pte_unlink(page_table, page);
 	flush_tlb_page(vma, address);
 	entry = page_unuse(page);
 	free_page(page);
 	return entry;
 }
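swap_out_pte() above runs the page tables backwards: from a bare pte pointer it recovers the pmd, the pgd, the owning mm and finally the faulting virtual address, using the back-pointers stashed in mem_map plus each pointer's offset within its table page. A userspace model of the address arithmetic, folded to two levels as on i386 - the constants mirror 4k pages and 1024-entry tables, everything else is a stand-in:

    #include <assert.h>
    #include <stdio.h>

    #define PAGE_SHIFT	12
    #define PGDIR_SHIFT	22
    #define PTRS_PER_PTE	1024

    typedef unsigned long pte_t;

    struct pte_page {			/* one page-table page plus metadata */
    	pte_t entry[PTRS_PER_PTE];
    	unsigned long pgd_index;	/* back-pointer: which pgd slot maps us */
    };

    /* Given a pointer into a page table, recover the virtual address it maps. */
    static unsigned long pte_to_vaddr(const struct pte_page *tbl, const pte_t *pte_p)
    {
    	unsigned long pte_index = pte_p - tbl->entry;	/* offset within the table */
    	return (tbl->pgd_index << PGDIR_SHIFT) | (pte_index << PAGE_SHIFT);
    }

    int main(void)
    {
    	static struct pte_page tbl = { .pgd_index = 3 };
    	/* entry 5 of pgd slot 3 maps (3<<22) | (5<<12) == 0x00c05000 */
    	assert(pte_to_vaddr(&tbl, &tbl.entry[5]) == 0x00c05000UL);
    	printf("%#lx\n", pte_to_vaddr(&tbl, &tbl.entry[5]));
    	return 0;
    }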
-/*
- * A new implementation of swap_out().  We do not swap complete processes,
- * but only a small number of blocks, before we continue with the next
- * process.  The number of blocks actually swapped is determined on the
- * number of page faults, that this process actually had in the last time,
- * so we won't swap heavily used processes all the time ...
- *
- * Note: the priority argument is a hint on much CPU to waste with the
- *       swap block search, not a hint, of how much blocks to swap with
- *       each process.
- *
- * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
- */
-
-static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * vma,
-	pmd_t *dir, unsigned long address, unsigned long end, int dma, int wait)
+static inline int try_to_swap_page (struct page *page, int wait)
 {
-	pte_t * pte;
-	unsigned long pmd_end;
-
-	if (pmd_none(*dir))
-		return 0;
-	if (pmd_bad(*dir)) {
-		printk("swap_out_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
-		pmd_clear(dir);
-		return 0;
+	pte_t *pte_p;
+	int tries = atomic_read(&page->count)+1;
+	if (PGtype_data != page->type)
+		goto buffers;
+#ifdef DEBUG
+	printk(KERN_DEBUG "kswapd: try_to_swap_page(NR=%lu,%d) tries=%d\n", page->map_nr, wait, tries);
+#endif
+	while ((pte_p = page->u.d.pte_head) && (--tries > 0)) {
+#ifdef PGTABLE_LINK_DEBUG
+		/*
+		 * Hack Alert!!!: by going to the next pte in the chain,
+		 * we make pte_unlink(pte_p) O(1) rather than O(page->count)
+		 * The marvels of Singly Linked Lists - quack!
+		 */
+		pte_p = pte_next(pte_p);
+#endif
+		switch (swap_out_pte(pte_p, page, wait)) {
+		case 0:	/* something prevented the unuse of this page... uh-oh */
+			goto out_fail;
+		case 1:	/* page was freed */
+			goto out_good;
+		}
+		if (PageLocked(page)) {
+			printk("try_to_swap_page(%lu,%d): page now locked\n", page->map_nr, wait);
+			goto out_fail;
+		}
 	}
-
-	pte = pte_offset(dir, address);
-
-	pmd_end = (address + PMD_SIZE) & PMD_MASK;
-	if (end > pmd_end)
-		end = pmd_end;
-
-	do {
-		int result;
-		tsk->swap_address = address + PAGE_SIZE;
-		result = try_to_swap_out(tsk, vma, address, pte, dma, wait);
-		if (result)
-			return result;
-		address += PAGE_SIZE;
-		pte++;
-	} while (address < end);
+	if (1 != atomic_read(&page->count))
+		goto out_fail;
+	if (page->inode) {
+#ifdef DEBUG
+		printk(KERN_DEBUG "kswapd: try_to_swap_page: freeing page cache page\n");
+#endif
+		remove_page_from_hash_queue(page);
+		remove_page_from_inode_queue(page);
+		__free_page(page);
+		goto out_good;
+	}
+buffers:
+	if (!test_and_clear_bit(PG_referenced, &page->flags)) {
+		struct buffer_head *bh = page->buffers;
+		if (bh && try_to_free_buffer(bh, &bh, 6))
+			goto out_good;
+	}
+out_fail:
+#ifdef DEBUG
+	printk(KERN_DEBUG "kswapd: try_to_swap_page: failed\n");
+#endif
+	return 1;
+out_good:
 	return 0;
 }
 
-static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * vma,
-	pgd_t *dir, unsigned long address, unsigned long end, int dma, int wait)
+static inline void do_age_page(struct page *page)
 {
-	pmd_t * pmd;
-	unsigned long pgd_end;
-
-	if (pgd_none(*dir))
-		return 0;
-	if (pgd_bad(*dir)) {
-		printk("swap_out_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
-		pgd_clear(dir);
-		return 0;
+	struct buffer_head *bh;
+	pte_t *pte_p;
+	int touches = 0;
+
+	if (page->map_nr != (page - mem_map)) {
+		printk("do_age_page: Aiee! page-mem_map(%u) disagrees with page->map_nr(%lu)\n", (page-mem_map), page->map_nr);
+		return;
 	}
+	if (PGtype_data != page->type)
+		goto buffers;
+	/*
+	 * lock the page while mucking with it, as otherwise
+	 * interrupts can steal it out from under us.
+	 */
+	if (test_and_set_bit(PG_locked, &page->flags))
+		return;
+	pte_p = page->u.d.pte_head;
+	if (!pte_p)
+		goto buffers_unlock;
 
-	pmd = pmd_offset(dir, address);
-
-	pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
-	if (end > pgd_end)
-		end = pgd_end;
-
 	do {
-		int result = swap_out_pmd(tsk, vma, pmd, address, end, dma, wait);
-		if (result)
-			return result;
-		address = (address + PMD_SIZE) & PMD_MASK;
-		pmd++;
-	} while (address < end);
-	return 0;
+		pte_t pte = *pte_p;
+		if (!pte_present(pte)) {
+			printk("update_page: Aiee - pte not present!\n");
+			goto next;
+		}
+		if ((pte_dirty(pte) && delete_from_swap_cache(page))
+		    ||pte_young(pte)) {
+			set_pte(pte_p, pte_mkold(pte));
+			touches++;
+		}
+next:
+		pte_p = pte_next(pte_p);
+	} while (pte_p != page->u.d.pte_head) ;
+buffers_unlock:
+	clear_bit(PG_locked, &page->flags);
+buffers:
+	bh = page->buffers;
+	if (bh) {
+		struct buffer_head *tmp = bh;
+		int ref = 0;
+		do {
+			if (buffer_touched(tmp)) {
+				clear_bit(BH_Touched, &tmp->b_state);
+				set_bit(PG_referenced, &page->flags);
+				ref = 1;
+			}
+			tmp = tmp->b_this_page;
+		} while (tmp != bh);
+		if (ref)
+			touches++;
+	}
+
+	if (touches)
+		touch_page(page);
+	else
+		age_page(page);
 }
 
-static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
-	pgd_t *pgdir, unsigned long start, int dma, int wait)
+static int lowest_age = 0;
+
+static void age_sweep(void)
 {
-	unsigned long end;
+	unsigned long active = 0;
+	struct page *page_map;
+	const struct page *max_map = mem_map+max_mapnr;
+
+	lowest_age = MAX_PAGE_AGE;
+	for (page_map = mem_map; page_map < max_map;
+	     page_map += 1 << page_map->order ) {
+		if (
+		    page_map->type == PGtype_free
+		    || PageReserved(page_map)
+		    || PageLocked(page_map))
+			continue;
+		active++;
+		do_age_page(page_map);
+		if (page_map->age < lowest_age)
+			lowest_age = page_map->age;
+	}
+#ifdef DEBUG
+	printk(KERN_DEBUG "kswapd: age_sweep finished (%lu active)\n", active);
+#endif
+}
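do_age_page() gives every pte on the page's reverse list and every buffer on the page a vote: any sign of youth rejuvenates the page (touch_page), silence lets it decay (age_page), and only the oldest pages are eligible in the sweep that follows. A toy model of that policy - the increment and ceiling here are stand-ins, not the kernel's touch_page()/age_page() internals:

    #include <stdio.h>

    #define MAX_PAGE_AGE	16	/* stand-in ceiling, not the kernel constant */

    struct toy_page { int age; };

    /* One aging round: 'touches' counts ptes/buffers that showed activity. */
    static void toy_age_round(struct toy_page *p, int touches)
    {
    	if (touches) {			/* rejuvenate, like touch_page() */
    		p->age += 3;
    		if (p->age > MAX_PAGE_AGE)
    			p->age = MAX_PAGE_AGE;
    	} else if (p->age > 0) {	/* decay, like age_page() */
    		p->age--;
    	}
    }

    int main(void)
    {
    	struct toy_page p = { .age = 2 };
    	toy_age_round(&p, 0);		/* idle: 2 -> 1 */
    	toy_age_round(&p, 0);		/* idle: 1 -> 0, now a swap candidate */
    	printf("age=%d swappable=%d\n", p.age, p.age == 0);
    	return 0;
    }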
-	/* Don't swap out areas like shared memory which have their
-	   own separate swapping mechanism or areas which are locked down */
-	if (vma->vm_flags & (VM_SHM | VM_LOCKED))
+static int swap_out(int num, int dma, int wait)
+{
+	static unsigned long clock = 0;
+	unsigned long limit = max_mapnr;
+	int swapped = 0;
+	unsigned long reps = max_mapnr<<1;
+	struct page *page;
+#ifdef DEBUG
+	printk(KERN_DEBUG "kswapd: swap_out(%d,%d,%d) started lowest_age=%d\n", num, dma, wait, lowest_age);
+#endif
+	for (page = mem_map+clock;
+	     swapped < num && --reps > 0;
+	     page = mem_map+clock) {
+		{ register int order = 1 << page->order; clock += order; }
+		if (clock >= limit)
+			clock = 0;
+		if (
+		    page->type == PGtype_free
+		    || PageReserved(page)
+		    || PageLocked(page)
+		    || (dma && !PageDMA(page)))
+			goto cont;
+		do_age_page(page);
+		if (page->age <= lowest_age &&
+		    !try_to_swap_page(page, wait))
+			swapped++;
+		else if (!test_and_clear_bit(PG_referenced, &page->flags)) {
+			struct buffer_head *bh = page->buffers;
+			if (bh && try_to_free_buffer(bh, &bh, 6))
+				swapped++;
+		}
+cont:
+	}
+	if (swapped < num) {
+		printk(KERN_DEBUG "swapd(%d,%d): only swapped %d\n", num, dma, swapped);
 		return 0;
-
-	end = vma->vm_end;
-	while (start < end) {
-		int result = swap_out_pgd(tsk, vma, pgdir, start, end, dma, wait);
-		if (result)
-			return result;
-		start = (start + PGDIR_SIZE) & PGDIR_MASK;
-		pgdir++;
-	}
-	return 0;
+	}
+	return 1;
 }
 
-static int swap_out_process(struct task_struct * p, int dma, int wait)
+
+/*
+ * This will change big time... =)
+ */
+static inline int do_try_to_free_page(int priority, int dma, int order)
 {
-	unsigned long address;
-	struct vm_area_struct* vma;
+	static int foo = 0;
+#ifdef DEBUG
+	printk(KERN_DEBUG "do_try_to_free_page(%d,%d,%d)\n", priority, dma, order);
+#endif
+	if (0 == (++foo & 3))
+		shrink_dcache();
 
 	/*
-	 * Go through process' page directory.
+	 * We shouldn't have a priority here:
+	 * If we're low on memory we should
+	 * unconditionally throw away _all_
+	 * kmalloc caches!
 	 */
-	address = p->swap_address;
-	p->swap_address = 0;
+	if ((2 == foo) && kmem_cache_reap(priority, dma, 1))
+		return 1;
 
-	/*
-	 * Find the proper vm-area
+	/* try to swap out 2* the amount desired... will use order to add intelligence
+	 * in the near future.
 	 */
-	vma = find_vma(p->mm, address);
-	if (!vma)
-		return 0;
-	if (address < vma->vm_start)
-		address = vma->vm_start;
-
-	for (;;) {
-		int result = swap_out_vma(p, vma, pgd_offset(p->mm, address), address, dma, wait);
-		if (result)
-			return result;
-		vma = vma->vm_next;
-		if (!vma)
-			break;
-		address = vma->vm_start;
-	}
-	p->swap_address = 0;
-	return 0;
-}
-
-static int swap_out(unsigned int priority, int dma, int wait)
-{
-	static int skip_factor = 0;
-	int limit = nr_tasks - 1;
-	int loop, counter, i;
-	struct task_struct *p;
-
-	counter = ((PAGEOUT_WEIGHT * nr_tasks) >> 10) >> priority;
-	if(skip_factor > nr_tasks)
-		skip_factor = 0;
-
-	read_lock(&tasklist_lock);
-	p = init_task.next_task;
-	i = skip_factor;
-	while(i--)
-		p = p->next_task;
-	for(; counter >= 0; counter--) {
-		/* Check if task is suitable for swapping. */
-		loop = 0;
-		while(1) {
-			if(!--limit) {
-				limit = nr_tasks - 1;
-				/* See if all processes are unswappable or
-				 * already swapped out.
-				 */
-				if (loop)
-					goto out;
-				loop = 1;
-			}
-			if (p->swappable && p->mm->rss)
-				break;
-			if((p = p->next_task) == &init_task)
-				p = p->next_task;
-		}
-		skip_factor++;
-
-		/* Determine the number of pages to swap from this process. */
-		if (!p->swap_cnt) {
-			/* Normalise the number of pages swapped by
-			   multiplying by (RSS / 1MB) */
-			p->swap_cnt = AGE_CLUSTER_SIZE(p->mm->rss);
-		}
-		if (!--p->swap_cnt)
-			skip_factor++;
-		read_unlock(&tasklist_lock);
-
-		switch (swap_out_process(p, dma, wait)) {
-		case 0:
-			if (p->swap_cnt)
-				skip_factor++;
-			break;
-		case 1:
-			return 1;
-		default:
-			break;
-		};
-
-		/* Whoever we swapped may not even exist now, in fact we cannot
-		 * assume anything about the list we were searching previously.
-		 */
-		read_lock(&tasklist_lock);
-		p = init_task.next_task;
-		i = skip_factor;
-		while(i--)
-			p = p->next_task;
-	}
-out:
-	read_unlock(&tasklist_lock);
-	return 0;
-}
-
-/*
- * We are much more aggressive about trying to swap out than we used
- * to be.  This works out OK, because we now do proper aging on page
- * contents.
- */
-static inline int do_try_to_free_page(int priority, int dma, int wait)
-{
-	static int state = 0;
-	int i=6;
-	int stop;
-
-	/* we don't try as hard if we're not waiting.. */
-	stop = 3;
-	if (wait)
-		stop = 0;
-	switch (state) {
-		do {
-		case 0:
-			if (shrink_mmap(i, dma))
-				return 1;
-			state = 1;
-		case 1:
-			shrink_dcache();
-			state = 2;
-		case 2:
-			/*
-			 * We shouldn't have a priority here:
-			 * If we're low on memory we should
-			 * unconditionally throw away _all_
-			 * kmalloc caches!
-			 */
-			if (kmem_cache_reap(0, dma, wait))
-				return 1;
-			state = 3;
-		case 3:
-			if (shm_swap(i, dma))
-				return 1;
-			state = 4;
-		default:
-			if (swap_out(i, dma, wait))
-				return 1;
-			state = 0;
-			i--;
-		} while ((i - stop) >= 0);
-	}
-	return 0;
+	return swap_out(2 << order, dma, 1);
 }
 
 /*
@@ -395,12 +394,12 @@
 * now we need this so that we can do page allocations
 * without holding the kernel lock etc.
 */
-int try_to_free_page(int priority, int dma, int wait)
+int try_to_free_page(int priority, int dma, int order)
 {
 	int retval;
 
 	lock_kernel();
-	retval = do_try_to_free_page(priority,dma,wait);
+	retval = do_try_to_free_page(priority,dma,order);
 	unlock_kernel();
 	return retval;
 }
@@ -413,15 +412,15 @@
 */
 void kswapd_setup(void)
 {
-       int i;
-       char *revision="$Revision: 1.23 $", *s, *e;
+	int i;
+	char *revision="$Revision: 1.47 $", *s, *e;
 
-       if ((s = strchr(revision, ':')) &&
-           (e = strchr(s, '$')))
-               s++, i = e - s;
-       else
-               s = revision, i = -1;
-       printk ("Starting kswapd v%.*s\n", i, s);
+	if ((s = strchr(revision, ':')) &&
+	    (e = strchr(s, '$')))
+		s++, i = e - s;
+	else
+		s = revision, i = -1;
+	printk ("Starting kswapd v%.*s\n", i, s);
 }
 
 /*
@@ -430,6 +429,7 @@
 */
 int kswapd(void *unused)
 {
+	int num, did_age;
 	current->session = 1;
 	current->pgrp = 1;
 	sprintf(current->comm, "kswapd");
@@ -451,23 +451,28 @@
 	init_swap_timer();
 	while (1) {
+		last_swap_jiffies = jiffies;
 		kswapd_awake = 0;
 		current->signal = 0;
-		run_task_queue(&tq_disk);
 		interruptible_sleep_on(&kswapd_wait);
 		kswapd_awake = 1;
 		swapstats.wakeups++;
-		/* Do the background pageout:
-		 * We now only swap out as many pages as needed.
-		 * When we are truly low on memory, we swap out
-		 * synchronously (WAIT == 1).  -- Rik.
+		if (lowest_age > 0 ||
+		    (jiffies - last_age_jiffies) >= HZ/4) {
+			age_sweep();
+			last_age_jiffies = jiffies;
+			did_age = 1;
+		} else
+			did_age = 0;
+		/*
+		 * This needs to be better tuned... we can end up with a
+		 * 'thundering herd' of pages being pushed off to disk.
 		 */
-		while(nr_free_pages < min_free_pages)
-			try_to_free_page(GFP_KERNEL, 0, 1);
-		while((nr_free_pages + atomic_read(&nr_async_pages)) < free_pages_low)
-			try_to_free_page(GFP_KERNEL, 0, 1);
-		while((nr_free_pages + atomic_read(&nr_async_pages)) < free_pages_high)
-			try_to_free_page(GFP_KERNEL, 0, 0);
+		num = free_pages_high - nr_free_pages - atomic_read(&nr_async_pages);
+		if (num <= 0)
+			num = did_age ? 4 : 8;
+		swap_out(num, 0, 0);
+		run_task_queue(&tq_disk);
 	}
 }
 
@@ -477,31 +482,21 @@
 
 void swap_tick(void)
 {
-	int	want_wakeup = 0;
-	static int	last_wakeup_low = 0;
-
-	if ((nr_free_pages + atomic_read(&nr_async_pages)) < free_pages_low) {
-		if (last_wakeup_low)
-			want_wakeup = jiffies >= next_swap_jiffies;
-		else
-			last_wakeup_low = want_wakeup = 1;
-	}
-	else if (((nr_free_pages + atomic_read(&nr_async_pages)) < free_pages_high) &&
-	         jiffies >= next_swap_jiffies) {
-		last_wakeup_low = 0;
-		want_wakeup = 1;
-	}
-
-	if (want_wakeup) {
-		if (!kswapd_awake) {
-			wake_up(&kswapd_wait);
-			need_resched = 1;
-		}
-		/* low on memory, we need to start swapping soon */
-		if(last_wakeup_low)
-			next_swap_jiffies = jiffies;
-		else
-			next_swap_jiffies = jiffies + swapout_interval;
+	int free_pages = nr_free_pages + atomic_read(&nr_async_pages);
+	if ( !kswapd_awake && (
+#if 1
+	    (free_pages <= free_pages_low && (long)(jiffies - last_swap_jiffies) >= HZ/25) ||
+#endif
+	    (lowest_age > 0 && (long)(jiffies - last_swap_jiffies) >= HZ/6) ||
+	    (free_pages < free_pages_high*2 && (long)(jiffies - last_age_jiffies) >= 4*HZ) ||
+	    (free_pages < free_pages_high && (long)(jiffies - last_swap_jiffies) >= swapout_interval) ||
+	    ((long)(jiffies - last_age_jiffies) >= HZ*40)
+	   )) {
+#if 0
+		printk("Awakening kswapd, jiffies=%lu last_swap_jiffies=%lu last_age_jiffies=%lu, free_pages=%d async=%d nr_free_pages=%d\n", jiffies, last_swap_jiffies, last_age_jiffies, free_pages, atomic_read(&nr_async_pages), nr_free_pages);
+#endif
+		wake_up(&kswapd_wait);
+		need_resched = 1;
 	}
 	timer_active |= (1<<SWAP_TIMER);
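Every wakeup test above compares time the same way: (long)(jiffies - last) >= interval. Subtracting the unsigned counters first and then comparing signed keeps the test correct across jiffies wraparound, which the old absolute form 'jiffies >= next_swap_jiffies' was not. A standalone illustration of the idiom, with tick_t and elapsed_at_least as stand-in names:

    #include <assert.h>

    typedef unsigned long tick_t;

    /* Wraparound-safe "has at least 'interval' ticks elapsed since 'last'?" */
    static int elapsed_at_least(tick_t now, tick_t last, long interval)
    {
    	return (long)(now - last) >= interval;
    }

    int main(void)
    {
    	tick_t last = (tick_t)-2;	/* two ticks before the counter wraps */
    	tick_t now = 3;			/* five ticks later, counter has wrapped */
    	assert(elapsed_at_least(now, last, 5));		/* exactly 5 elapsed */
    	assert(!elapsed_at_least(now, last, 6));	/* not yet 6 */
    	assert(!elapsed_at_least(last, now, 1));	/* 'now' is after 'last' */
    	return 0;
    }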