diff -ur 2.1.66/Documentation/filesystems/vfs.txt 2.1.66-pte_list/Documentation/filesystems/vfs.txt --- 2.1.66/Documentation/filesystems/vfs.txt Sun Feb 2 08:18:29 1997 +++ 2.1.66-pte_list/Documentation/filesystems/vfs.txt Wed Nov 26 23:10:34 1997 @@ -39,10 +39,14 @@ Name points to a string that the system will know the filesystem by. - int requires_dev; + int fs_flags; - Set this flag to 1 if the filesystem requires a block device to be mounted - on. + A bitwise or'd collection of flags from include/linux/fs.h: + + FS_REQUIRES_DEV + FS_NO_DCACHE + FS_NO_PRELIM + FS_IBASKET struct file_system_type * next; @@ -56,7 +60,7 @@ super_block structure. void (*read_inode) (struct inode *inode); - [optional - doesn't quite make sense] + [not optional] read_inode is called by the VFS when iget is called requesting an inode not already present in the inode table. i_ino is set to the number of the inode requested. @@ -65,18 +69,24 @@ structure. Typically filesystems have separate inode_operations for directories, files and symlinks. i_op can be NULL. - int (*notify_change) (struct inode *, struct iattr *); + int (*notify_change) (struct inode *inode, struct iattr *iattr); [optional] + Called by the VFS when the given attributes of the inode have changed. + void (*write_inode) (struct inode *); [optional] + Called by the VFS to write the inode to its backing store - typically + when syncing an inode. int (*put_inode) (struct inode *inode); + [optional][not serialized] + Called by the VFS whenever an iput() is done on an inode. + + int (*delete_inode) (struct inode *inode); [optional] - put_inode is called by the VFS when the last instance of inode is released - with a call to iput. The only special consideration that should be made - is that iget may reuse inode without calling read_inode unless clear_inode - is called. put_inode MUST return 1 if it called clear_inode on the inode, - otherwise zero. + delete_inode is called by the VFS when the last instance of the inode is released + and the inode has been deleted. This is where the filesystem should release any + resources held by the inode.
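To make the read_inode/put_inode/delete_inode split described above concrete, here is a minimal sketch of the hooks a filesystem would supply (not part of the patch - "myfs" and its helper functions are invented for illustration, with signatures taken from the prototypes above):

        static void myfs_read_inode(struct inode *inode)
        {
                /* [not optional] - called from iget() with i_ino already set;
                 * fill in i_mode, i_size, i_op, ... from the backing store. */
                myfs_fetch_raw_inode(inode->i_sb, inode->i_ino, inode);
        }

        static int myfs_put_inode(struct inode *inode)
        {
                /* [optional][not serialized] - runs on every iput(), so keep
                 * it cheap. */
                return 0;
        }

        static int myfs_delete_inode(struct inode *inode)
        {
                /* The last reference is gone and the inode has been deleted:
                 * give back data blocks and the on-disk inode itself. */
                myfs_free_data_blocks(inode);
                myfs_release_on_disk_inode(inode);
                return 0;
        }

These routines would be wired into the corresponding slots of the filesystem's struct super_operations.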
void (*put_super) (struct super_block *); [optional] diff -ur 2.1.66/arch/i386/kernel/head.S 2.1.66-pte_list/arch/i386/kernel/head.S --- 2.1.66/arch/i386/kernel/head.S Fri Nov 14 20:52:12 1997 +++ 2.1.66-pte_list/arch/i386/kernel/head.S Wed Nov 26 23:10:34 1997 @@ -505,15 +505,23 @@ .long 0x3f8007,0x3f9007,0x3fa007,0x3fb007,0x3fc007,0x3fd007,0x3fe007,0x3ff007 .org 0x3000 -ENTRY(empty_bad_page) +ENTRY(pg0_links) + .fill 1024,4,0 .org 0x4000 -ENTRY(empty_bad_page_table) +ENTRY(empty_bad_page) .org 0x5000 ENTRY(empty_zero_page) .org 0x6000 +ENTRY(empty_bad_page_table) + +.org 0x7000 +ENTRY(empty_bad_page_table_links) + .fill 1024,4,0 + +.org 0x8000 ENTRY(this_must_match_init_task) /* diff -ur 2.1.66/fs/buffer.c 2.1.66-pte_list/fs/buffer.c --- 2.1.66/fs/buffer.c Mon Nov 24 16:02:51 1997 +++ 2.1.66-pte_list/fs/buffer.c Wed Nov 26 23:10:34 1997 @@ -1273,7 +1273,7 @@ if (test_and_clear_bit(PG_decr_after, &page->flags)) atomic_dec(&nr_async_pages); if (test_and_clear_bit(PG_swap_unlock_after, &page->flags)) - swap_after_unlock_page(page->pg_swap_entry); + swap_after_unlock_page(page->u.d.pg_swap_entry); if (test_and_clear_bit(PG_free_after, &page->flags)) __free_page(page); } diff -ur 2.1.66/fs/nfs/write.c 2.1.66-pte_list/fs/nfs/write.c --- 2.1.66/fs/nfs/write.c Wed Oct 29 17:07:39 1997 +++ 2.1.66-pte_list/fs/nfs/write.c Wed Nov 26 23:10:34 1997 @@ -146,7 +146,7 @@ if (test_and_clear_bit(PG_decr_after, &page->flags)) atomic_dec(&page->count); if (test_and_clear_bit(PG_swap_unlock_after, &page->flags)) - swap_after_unlock_page(page->pg_swap_entry); + swap_after_unlock_page(page->u.d.pg_swap_entry); #endif } diff -ur 2.1.66/fs/proc/array.c 2.1.66-pte_list/fs/proc/array.c --- 2.1.66/fs/proc/array.c Tue Oct 21 11:57:29 1997 +++ 2.1.66-pte_list/fs/proc/array.c Wed Nov 26 23:10:34 1997 @@ -782,7 +782,7 @@ sigignore, sigcatch, wchan, - tsk->nswap, + tsk->mm->nswap, tsk->cnswap); } diff -ur 2.1.66/fs/proc/mem.c 2.1.66-pte_list/fs/proc/mem.c --- 2.1.66/fs/proc/mem.c Tue Oct 21 11:57:29 1997 +++ 2.1.66-pte_list/fs/proc/mem.c Wed Nov 26 23:10:34 1997 @@ -294,6 +294,7 @@ set_pte(src_table, pte_mkdirty(*src_table)); set_pte(dest_table, *src_table); atomic_inc(&mem_map[MAP_NR(pte_page(*src_table))].count); + pte_link(dest_table, pte_page(*dest_table)); stmp += PAGE_SIZE; dtmp += PAGE_SIZE; diff -ur 2.1.66/include/asm-i386/pgtable.h 2.1.66-pte_list/include/asm-i386/pgtable.h --- 2.1.66/include/asm-i386/pgtable.h Tue Nov 25 18:55:31 1997 +++ 2.1.66-pte_list/include/asm-i386/pgtable.h Thu Nov 27 00:57:19 1997 @@ -3,6 +3,8 @@ #include +#define PGTABLE_LINK_DEBUG + /* * The Linux memory management assumes a three-level page table setup. 
On * the i386, we use that, but "fold" the mid level into the top-level page @@ -298,6 +300,7 @@ #define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) #define pmd_clear(xp) do { pmd_val(*(xp)) = 0; } while (0) +#define pmd_next(x) (NULL) /* * The "pgd_xxx()" functions here are trivial for a folded two-level @@ -374,7 +377,7 @@ */ extern inline void pte_free_kernel(pte_t * pte) { - free_page((unsigned long) pte); + free_pages((unsigned long) pte, 1); } extern const char bad_pmd_string[]; @@ -383,16 +386,19 @@ { address = (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); if (pmd_none(*pmd)) { - pte_t * page = (pte_t *) get_free_page(GFP_KERNEL); + pte_t * page = (pte_t *) __get_free_pages(GFP_KERNEL, 1, 0); if (pmd_none(*pmd)) { if (page) { + mem_map[MAP_NR(page)].type = PGtype_pte; + mem_map[MAP_NR(page)].u.pte.pmd = NULL; + memset((void *) page, 0, PAGE_SIZE*2); pmd_val(*pmd) = _KERNPG_TABLE + __pa(page); return page + address; } pmd_val(*pmd) = _KERNPG_TABLE + __pa(BAD_PAGETABLE); return NULL; } - free_page((unsigned long) page); + free_pages((unsigned long) page, 1); } if (pmd_bad(*pmd)) { printk(bad_pmd_string, pmd_val(*pmd)); @@ -416,9 +422,19 @@ return (pmd_t *) pgd; } +#define pte_next(pte_p) (*(pte_t **)((unsigned long)(pte_p) + PAGE_SIZE)) extern inline void pte_free(pte_t * pte) { - free_page((unsigned long) pte); + int i, ow=0; + for (i=0; i<1024; i++) { + if (NULL != pte_next(pte+i)) { + printk("Akk!! pte not unlinked - %p (%08lx)\n", pte+i, pte_val(pte[i])); + ow = 1; + } + } + if (ow) + for (;;) ; + free_pages((unsigned long) pte, 1); } extern inline pte_t * pte_alloc(pmd_t * pmd, unsigned long address) @@ -434,16 +450,18 @@ getnew: { - unsigned long page = __get_free_page(GFP_KERNEL); + unsigned long page = __get_free_pages(GFP_KERNEL, 1, 0); if (!pmd_none(*pmd)) goto freenew; if (!page) goto oom; - memset((void *) page, 0, PAGE_SIZE); + mem_map[MAP_NR(page)].type = PGtype_pte; + mem_map[MAP_NR(page)].u.pte.pmd = pmd; + memset((void *) page, 0, PAGE_SIZE*2); pmd_val(*pmd) = _PAGE_TABLE + __pa(page); return (pte_t *) (page + address); freenew: - free_page(page); + free_pages(page, 1); goto repeat; } @@ -454,6 +472,89 @@ return NULL; } +#define pgd_getmm(pgd_p) (mem_map[MAP_NR((unsigned long)pgd_p & PAGE_MASK)].u.pgd.mm) +#define pgd_getoffset(pgd_p) (((unsigned long)pgd_p & ~PAGE_MASK) / sizeof(pgd_t)) + +#define pmd_getpgd(pmd_p) ((pgd_t *)(pmd_p)) +#define pmd_getoffset(pmd_p) (0) + +#define pte_getpmd(pte_p) (mem_map[MAP_NR((unsigned long)pte_p & PAGE_MASK)].u.pte.pmd) +#define pte_getoffset(pte_p) (((unsigned long)pte_p & ~PAGE_MASK) / sizeof(pte_t)) + +static inline unsigned long ptepmdpgd_getaddr(pte_t *pte_p, pmd_t *pmd_p, pgd_t *pgd_p) +{ return (pte_getoffset(pte_p) << PAGE_SHIFT) | (pmd_getoffset(pmd_p) << PMD_SHIFT) | (pgd_getoffset(pgd_p) << PGDIR_SHIFT); } + +extern unsigned long nr_pte_links, nr_zero_pte_links; +extern unsigned long nr_pte_unlinks, nr_zero_pte_unlinks; + +static inline void pte_link(pte_t *pte_p, unsigned long addr) +{ + struct page *page = mem_map + MAP_NR(addr); +#ifdef PGTABLE_LINK_DEBUG + if (MAP_NR(addr) >= max_mapnr || PageReserved(page)) { + printk("Aieee! pte_link(%p, %08lx) on reserved page!\n", pte_p, addr); + for(;;) ; + } + if (PGtype_free == page->type) { + printk("Aieee! pte_link(%p, %08lx) on free page! - %p\n", pte_p, addr, page->u.d.pte_head); + for(;;) ; + } + if (PGtype_data != page->type) { + printk("Aieee! 
pte_link(%p, %08lx) on page->type(%d) != PGtype_data!\n", pte_p, addr, page->type); + for(;;) ; + } + if (NULL != pte_next(pte_p)) { + printk("pte_link(%p, %08lx): pte already on list!\n", pte_p, addr); + for(;;) ; + } +#endif + if (NULL == page->u.d.pte_head) + page->u.d.pte_head = pte_p; + pte_next(pte_p) = pte_next(page->u.d.pte_head); + pte_next(page->u.d.pte_head) = pte_p; + nr_pte_links++; +} + +static inline void pte_unlink(pte_t *pte_p, unsigned long addr) +{ + struct page *page = mem_map + MAP_NR(addr); + pte_t **pte_pp; +#ifdef PGTABLE_LINK_DEBUG + if (MAP_NR(ZERO_PAGE) == MAP_NR(addr)) { + printk("Aieee! pte_unlink(%p, %08lx) on zero page!\n", pte_p, addr); + for(;;) ; + } + if (MAP_NR(addr) >= max_mapnr || PageReserved(page)) { + printk("Aieee! pte_unlink(%p, %08lx) on reserved page!\n", pte_p, addr); + for(;;) ; + } + if (PGtype_free == page->type) { + printk("Aieee! pte_unlink(%p, %08lx) on free page! - %p\n", pte_p, addr, page->u.d.pte_head); + for(;;) ; + } + if (NULL == pte_next(pte_p) || NULL == page->u.d.pte_head) { +notfound: + printk("pte_unlink(%p, %08lx): pte not on list(%p)! next(%p)\n", pte_p, addr, page->u.d.pte_head, pte_next(pte_p)); + for(;;) ; + } + pte_pp = &pte_next(page->u.d.pte_head); + while (pte_p != *pte_pp) { + if (page->u.d.pte_head == *pte_pp || NULL == *pte_pp) + goto notfound; + pte_pp = &pte_next(*pte_pp); + } +#else + pte_pp = &pte_next(page->u.d.pte_head); + while (pte_p != *pte_pp) + pte_pp = &pte_next(*pte_pp); +#endif + *pte_pp = pte_next(*pte_pp); + pte_next(pte_p) = NULL; + if (pte_p == page->u.d.pte_head) + page->u.d.pte_head = *pte_pp; + nr_pte_unlinks++; +} + /* * allocating and freeing a pmd is trivial: the 1-entry pmd is * inside the pgd, so has no extra memory associated with it. @@ -470,12 +571,20 @@ extern inline void pgd_free(pgd_t * pgd) { - free_page((unsigned long) pgd); + PageSetPgd(&mem_map[MAP_NR((unsigned long) pgd)]); + free_pages((unsigned long) pgd, 1); } -extern inline pgd_t * pgd_alloc(void) +extern inline pgd_t * pgd_alloc(struct mm_struct *mm) { - return (pgd_t *) get_free_page(GFP_KERNEL); + unsigned long pgd = __get_free_pages(GFP_KERNEL, 1, 0); + if (pgd) { + memset((void *)pgd, 0, PAGE_SIZE*2); + PageSetPgd(&mem_map[MAP_NR(pgd)]); + mem_map[MAP_NR(pgd)].type = PGtype_pgd; + mem_map[MAP_NR(pgd)].u.pgd.mm = mm; + } + return (pgd_t *) pgd; } extern pgd_t swapper_pg_dir[1024]; diff -ur 2.1.66/include/linux/mm.h 2.1.66-pte_list/include/linux/mm.h --- 2.1.66/include/linux/mm.h Tue Nov 25 18:55:31 1997 +++ 2.1.66-pte_list/include/linux/mm.h Thu Nov 27 00:57:18 1997 @@ -118,15 +118,40 @@ unsigned long offset; struct page *next_hash; atomic_t count; - unsigned int age; unsigned long flags; /* atomic flags, some possibly updated asynchronously */ struct wait_queue *wait; struct page **pprev_hash; struct buffer_head * buffers; - unsigned long pg_swap_entry; unsigned long map_nr; /* page->map_nr == page - mem_map */ + + /* these should be rethought. 
For that matter, the entire structure is on the large side */ + unsigned short age; + unsigned char type; + unsigned char order; + union { + struct { + unsigned long pg_swap_entry; + pte_t *pte_head; + } d; /* normal 'managed' pages */ + struct { + pmd_t *pmd; + } pte; + struct { + pgd_t *pgd; + } pmd; + struct { + struct mm_struct *mm; + } pgd; + } u; } mem_map_t; +#define PGtype_other 0 /* internal unspecified kernel use */ +#define PGtype_data 1 /* standard vm managed pages */ +#define PGtype_pte 2 /* pte_frompage(page) is valid */ +#define PGtype_pmd 3 /* pmd_frompage(page) is valid */ +#define PGtype_pgd 4 /* pgd_frompage(page) is valid */ +#define PGtype_free 5 /* a free page ;-) */ + /* Page flag bit values */ #define PG_locked 0 #define PG_error 1 @@ -138,6 +163,8 @@ #define PG_DMA 7 #define PG_Slab 8 #define PG_swap_cache 9 +#define PG_pgd 10 +#define PG_dirty 11 #define PG_reserved 31 /* Make it prettier to test the above... */ @@ -158,9 +185,11 @@ #define PageSetSwapCache(page) (set_bit(PG_swap_cache, &(page)->flags)) #define PageTestandSetSwapCache(page) \ (test_and_set_bit(PG_swap_cache, &(page)->flags)) +#define PageSetPgd(page) (set_bit(PG_pgd, &(page)->flags)) #define PageClearSlab(page) (clear_bit(PG_Slab, &(page)->flags)) #define PageClearSwapCache(page)(clear_bit(PG_swap_cache, &(page)->flags)) +#define PageClearPgd(page) (clear_bit(PG_pgd, &(page)->flags)) #define PageTestandClearSwapCache(page) \ (test_and_clear_bit(PG_swap_cache, &(page)->flags)) diff -ur 2.1.66/include/linux/sched.h 2.1.66-pte_list/include/linux/sched.h --- 2.1.66/include/linux/sched.h Tue Nov 25 18:55:31 1997 +++ 2.1.66-pte_list/include/linux/sched.h Thu Nov 27 00:57:18 1997 @@ -151,6 +151,7 @@ unsigned long rss, total_vm, locked_vm; unsigned long def_flags; unsigned long cpu_vm_mask; + unsigned long nswap; }; #define INIT_MM { \ @@ -161,7 +162,8 @@ 0, 0, 0, 0, \ 0, 0, 0, 0, \ 0, 0, 0, \ - 0, 0 } + 0, 0, \ + 0 } struct signal_struct { atomic_t count; @@ -229,7 +231,7 @@ struct tms times; unsigned long start_time; /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ - unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap; + unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt, cnswap; int swappable:1; unsigned long swap_address; unsigned long old_maj_flt; /* old value of maj_flt */ @@ -324,7 +326,7 @@ /* timeout */ 0,SCHED_OTHER,0,0,0,0,0,0,0, \ /* timer */ { NULL, NULL, 0, 0, it_real_fn }, \ /* utime */ {0,0,0,0},0, \ -/* flt */ 0,0,0,0,0,0, \ +/* flt */ 0,0,0,0,0, \ /* swp */ 0,0,0,0,0, \ /* rlimits */ INIT_RLIMITS, \ /* math */ 0, \ diff -ur 2.1.66/include/linux/swap.h 2.1.66-pte_list/include/linux/swap.h --- 2.1.66/include/linux/swap.h Fri Oct 17 18:49:15 1997 +++ 2.1.66-pte_list/include/linux/swap.h Wed Nov 26 23:10:34 1997 @@ -100,7 +100,7 @@ extern inline unsigned long in_swap_cache(struct page *page) { if (PageSwapCache(page)) - return page->pg_swap_entry; + return page->u.d.pg_swap_entry; return 0; } @@ -113,7 +113,7 @@ #ifdef SWAP_CACHE_INFO swap_cache_find_success++; #endif - return page->pg_swap_entry; + return page->u.d.pg_swap_entry; } return 0; } @@ -127,7 +127,7 @@ #ifdef SWAP_CACHE_INFO swap_cache_del_success++; #endif - swap_free(page->pg_swap_entry); + swap_free(page->u.d.pg_swap_entry); return 1; } return 0; diff -ur 2.1.66/ipc/shm.c 2.1.66-pte_list/ipc/shm.c --- 2.1.66/ipc/shm.c Sat Sep 6 13:12:16 1997 +++ 2.1.66-pte_list/ipc/shm.c Wed Nov 26 23:10:34 1997 @@ -465,6 +465,7 @@ break; } set_pte(page_table, __pte(shm_sgn)); + 
pte_link(page_table, pte_page(__pte(shm_sgn))); } flush_tlb_range(shmd->vm_mm, shmd->vm_start, shmd->vm_end); return error; @@ -807,6 +808,7 @@ set_pte(page_table, __pte(shmd->vm_pte + SWP_ENTRY(0, idx << SHM_IDX_SHIFT))); atomic_dec(&mem_map[MAP_NR(pte_page(pte))].count); + pte_unlink(page_table, pte_page(page)); if (shmd->vm_mm->rss > 0) shmd->vm_mm->rss--; flush_tlb_page(shmd, tmp); diff -ur 2.1.66/kernel/exit.c 2.1.66-pte_list/kernel/exit.c --- 2.1.66/kernel/exit.c Mon Sep 22 17:55:59 1997 +++ 2.1.66-pte_list/kernel/exit.c Wed Nov 26 23:10:34 1997 @@ -145,7 +145,7 @@ release_thread(p); current->cmin_flt += p->min_flt + p->cmin_flt; current->cmaj_flt += p->maj_flt + p->cmaj_flt; - current->cnswap += p->nswap + p->cnswap; + current->cnswap += p->mm->nswap + p->cnswap; free_task_struct(p); } else { printk("task releasing itself\n"); diff -ur 2.1.66/kernel/fork.c 2.1.66-pte_list/kernel/fork.c --- 2.1.66/kernel/fork.c Sun Oct 12 13:12:38 1997 +++ 2.1.66-pte_list/kernel/fork.c Wed Nov 26 23:20:23 1997 @@ -319,7 +319,7 @@ tsk->mm = mm; tsk->min_flt = tsk->maj_flt = 0; tsk->cmin_flt = tsk->cmaj_flt = 0; - tsk->nswap = tsk->cnswap = 0; + tsk->cnswap = 0; retval = new_page_tables(tsk); if (retval) goto free_mm; diff -ur 2.1.66/kernel/sys.c 2.1.66-pte_list/kernel/sys.c --- 2.1.66/kernel/sys.c Tue Oct 21 11:57:30 1997 +++ 2.1.66-pte_list/kernel/sys.c Wed Nov 26 23:10:34 1997 @@ -970,7 +970,7 @@ r.ru_stime.tv_usec = CT_TO_USECS(p->times.tms_stime); r.ru_minflt = p->min_flt; r.ru_majflt = p->maj_flt; - r.ru_nswap = p->nswap; + r.ru_nswap = p->mm->nswap; break; case RUSAGE_CHILDREN: r.ru_utime.tv_sec = CT_TO_SECS(p->times.tms_cutime); @@ -988,7 +988,7 @@ r.ru_stime.tv_usec = CT_TO_USECS(p->times.tms_stime + p->times.tms_cstime); r.ru_minflt = p->min_flt + p->cmin_flt; r.ru_majflt = p->maj_flt + p->cmaj_flt; - r.ru_nswap = p->nswap + p->cnswap; + r.ru_nswap = p->mm->nswap + p->cnswap; break; } return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; diff -ur 2.1.66/mm/filemap.c 2.1.66-pte_list/mm/filemap.c --- 2.1.66/mm/filemap.c Tue Nov 25 17:45:26 1997 +++ 2.1.66-pte_list/mm/filemap.c Wed Nov 26 23:10:34 1997 @@ -29,7 +29,7 @@ #include #include #include - +#define inline /**/ /* * Shared mappings implemented 30.11.1994. It's not fully working yet, * though. 
@@ -249,6 +249,7 @@ struct page **hash) { atomic_inc(&page->count); + page->type = PGtype_data; page->flags &= ~((1 << PG_uptodate) | (1 << PG_error)); page->offset = offset; add_page_to_inode_queue(inode, page); @@ -830,6 +831,7 @@ copy_page(new_page, old_page); flush_page_to_ram(new_page); release_page(page); + mem_map[MAP_NR(new_page)].type = PGtype_data; return new_page; no_cached_page: @@ -997,6 +999,7 @@ flush_cache_page(vma, (offset + vma->vm_start - vma->vm_offset)); set_pte(page_table, __pte(entry)); + pte_unlink(page_table, page); flush_tlb_page(vma, (offset + vma->vm_start - vma->vm_offset)); error = filemap_write_page(vma, offset, page); if (pte_val(*page_table) == entry) @@ -1051,6 +1054,7 @@ return 0; } page = pte_page(pte); + pte_unlink(ptep, page); if (!pte_dirty(pte) || flags == MS_INVALIDATE) { free_page(page); return 0; diff -ur 2.1.66/mm/memory.c 2.1.66-pte_list/mm/memory.c --- 2.1.66/mm/memory.c Mon Sep 1 15:10:04 1997 +++ 2.1.66-pte_list/mm/memory.c Wed Nov 26 23:23:12 1997 @@ -50,6 +50,8 @@ #include #include +#define inline /**/ + unsigned long max_mapnr = 0; unsigned long num_physpages = 0; void * high_memory = NULL; @@ -167,7 +169,7 @@ { pgd_t * page_dir, * new_pg; - if (!(new_pg = pgd_alloc())) + if (!(new_pg = pgd_alloc(tsk->mm))) return -ENOMEM; page_dir = pgd_offset(&init_mm, 0); memcpy(new_pg + USER_PTRS_PER_PGD, page_dir + USER_PTRS_PER_PGD, @@ -201,6 +203,7 @@ set_pte(new_pte, pte_mkold(pte)); set_pte(old_pte, pte); atomic_inc(&mem_map[page_nr].count); + pte_link(new_pte, pte_page(pte)); } static inline int copy_pte_range(pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long address, unsigned long size, int cow) @@ -295,12 +298,13 @@ /* * Return indicates whether a page was freed so caller can adjust rss */ -static inline int free_pte(pte_t page) +static inline int free_pte(pte_t *page_table, pte_t page) { if (pte_present(page)) { unsigned long addr = pte_page(page); if (MAP_NR(addr) >= max_mapnr || PageReserved(mem_map+MAP_NR(addr))) return 0; + pte_unlink(page_table, addr); free_page(addr); return 1; } @@ -308,11 +312,11 @@ return 0; } -static inline void forget_pte(pte_t page) +static inline void forget_pte(pte_t *page_table, pte_t page) { if (!pte_none(page)) { printk("forget_pte: old mapping existed!\n"); - free_pte(page); + free_pte(page_table, page); } } @@ -344,7 +348,7 @@ if (pte_none(page)) continue; pte_clear(pte-1); - freed += free_pte(page); + freed += free_pte(pte-1, page); } return freed; } @@ -412,7 +416,7 @@ do { pte_t oldpage = *pte; set_pte(pte, zero_pte); - forget_pte(oldpage); + forget_pte(pte, oldpage); address += PAGE_SIZE; pte++; } while (address < end); @@ -485,7 +489,7 @@ mapnr = MAP_NR(__va(phys_addr)); if (mapnr >= max_mapnr || PageReserved(mem_map+mapnr)) set_pte(pte, mk_pte_phys(phys_addr, prot)); - forget_pte(oldpage); + forget_pte(pte, oldpage); address += PAGE_SIZE; phys_addr += PAGE_SIZE; pte++; @@ -585,6 +589,8 @@ } flush_page_to_ram(page); set_pte(pte, pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY)))); + mem_map[MAP_NR(page)].type = PGtype_data; + pte_link(pte, page); /* no need for flush_tlb */ return page; } @@ -634,12 +640,21 @@ flush_page_to_ram(new_page); flush_cache_page(vma, address); set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)))); + if (!PageReserved(mem_map + MAP_NR(old_page))) { + static int foo = 0; + if (++foo < 5) + printk("do_wp_page: pte_unlink: addr = %08lx\n", address); + pte_unlink(page_table, old_page); + } + mem_map[MAP_NR(new_page)].type = PGtype_data; + 
pte_link(page_table, new_page); free_page(old_page); flush_tlb_page(vma, address); return; } flush_cache_page(vma, address); set_pte(page_table, BAD_PAGE); + pte_unlink(page_table, old_page); flush_tlb_page(vma, address); free_page(old_page); oom(tsk); @@ -769,6 +784,7 @@ ++tsk->maj_flt; flush_page_to_ram(pte_page(page)); set_pte(page_table, page); + pte_link(page_table, pte_page(page)); return; } @@ -820,6 +836,7 @@ !(vma->vm_flags & VM_SHARED)) entry = pte_wrprotect(entry); put_page(page_table, entry); + pte_link(page_table, page); /* no need to invalidate: a not-present page shouldn't be cached */ return; @@ -834,6 +851,8 @@ vma->vm_mm->rss++; tsk->min_flt++; flush_page_to_ram(page); + mem_map[MAP_NR(page)].type = PGtype_data; + pte_link(page_table, page); } put_page(page_table, entry); return; diff -ur 2.1.66/mm/mremap.c 2.1.66-pte_list/mm/mremap.c --- 2.1.66/mm/mremap.c Mon Jul 14 00:20:11 1997 +++ 2.1.66-pte_list/mm/mremap.c Wed Nov 26 23:10:34 1997 @@ -73,6 +73,12 @@ if (dst) { pte_clear(src); set_pte(dst, pte); + if (pte_present(pte) && + MAP_NR(pte_page(pte)) < max_mapnr && + !PageReserved(mem_map+MAP_NR(pte_page(pte)))) { + pte_unlink(src, pte_page(pte)); + pte_link(dst, pte_page(pte)); + } error--; } } diff -ur 2.1.66/mm/page_alloc.c 2.1.66-pte_list/mm/page_alloc.c --- 2.1.66/mm/page_alloc.c Mon Jun 16 19:36:01 1997 +++ 2.1.66-pte_list/mm/page_alloc.c Wed Nov 26 23:10:34 1997 @@ -26,6 +26,12 @@ #include #include #include +#define inline /**/ + +unsigned long nr_pte_links = 0; +unsigned long nr_zero_pte_links = 0; +unsigned long nr_pte_unlinks = 0; +unsigned long nr_zero_pte_unlinks = 0; int nr_swap_pages = 0; int nr_free_pages = 0; @@ -50,6 +56,7 @@ struct page *next; struct page *prev; unsigned int * map; + unsigned long inuse; }; #define memory_head(x) ((struct page *)(x)) @@ -100,7 +107,7 @@ #ifdef __SMP__ static spinlock_t page_alloc_lock; #endif - +void *caller; static inline void free_pages_ok(unsigned long map_nr, unsigned long order) { struct free_area_struct *area = free_area + order; @@ -109,9 +116,28 @@ unsigned long flags; spin_lock_irqsave(&page_alloc_lock, flags); +#ifdef PGTABLE_LINK_DEBUG + if (NULL != mem_map[map_nr].u.d.pte_head) { + printk("Aieee! freeing page %08lx with pte_head = %p!\n", (PAGE_OFFSET + (map_nr << PAGE_SHIFT)), mem_map[map_nr].u.d.pte_head); + *(char *)0 = 0; sti(); + for (;;) ; + } + if (PGtype_free == mem_map[map_nr].type) { + printk("Aiee: freeing free page(%lu) from %p/%p!\n", map_nr, caller, mem_map[map_nr].u.d.pte_head); + goto out; + } + mem_map[map_nr].u.d.pte_head = caller; +#endif + if (order != mem_map[map_nr].order) { + printk("Aiee: freeing page(%lu) with wrong order (%luv%d)! 
from %p\n", map_nr, order, mem_map[map_nr].order, caller); + goto out; + } + mem_map[map_nr].type = PGtype_free; #define list(x) (mem_map+(x)) + if (area->inuse) + area->inuse--; map_nr &= mask; nr_free_pages -= mask; while (mask + (1 << (NR_MEM_LISTS-1))) { @@ -127,6 +153,7 @@ #undef list +out: spin_unlock_irqrestore(&page_alloc_lock, flags); } @@ -134,6 +161,7 @@ { if (!PageReserved(page) && atomic_dec_and_test(&page->count)) { delete_from_swap_cache(page); + caller = ((void **)&page)[-1]; free_pages_ok(page->map_nr, 0); } } @@ -148,6 +176,7 @@ return; if (atomic_dec_and_test(&map->count)) { delete_from_swap_cache(map); + caller = ((void **)&addr)[-1]; free_pages_ok(map_nr, order); return; } @@ -172,6 +201,10 @@ MARK_USED(map_nr, new_order, area); \ nr_free_pages -= 1 << order; \ EXPAND(ret, map_nr, order, new_order, area); \ + free_area[order].inuse++; \ + mem_map[map_nr].order = order; \ + mem_map[map_nr].type = PGtype_other; \ + mem_map[map_nr].u.d.pte_head = NULL; \ spin_unlock_irqrestore(&page_alloc_lock, flags); \ return ADDRESS(map_nr); \ } \ @@ -213,7 +246,7 @@ reserved_pages = 5; if (priority != GFP_NFS) - reserved_pages = min_free_pages; + reserved_pages = (priority == GFP_KERNEL) ? free_pages_low : min_free_pages; repeat: spin_lock_irqsave(&page_alloc_lock, flags); if ((priority==GFP_ATOMIC) || nr_free_pages > reserved_pages) { @@ -222,7 +255,7 @@ return 0; } spin_unlock_irqrestore(&page_alloc_lock, flags); - if (priority != GFP_BUFFER && try_to_free_page(priority, dma, 1)) + if (priority != GFP_BUFFER && try_to_free_page(priority, dma, order)) goto repeat; return 0; } @@ -236,20 +269,28 @@ { unsigned long order, flags; unsigned long total = 0; + unsigned long total_data = 0; + unsigned long nr; printk("Free pages: %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10)); spin_lock_irqsave(&page_alloc_lock, flags); for (order=0 ; order < NR_MEM_LISTS; order++) { struct page * tmp; - unsigned long nr = 0; + nr = 0; for (tmp = free_area[order].next ; tmp != memory_head(free_area+order) ; tmp = tmp->next) { nr ++; } total += nr * ((PAGE_SIZE>>10) << order); - printk("%lu*%lukB ", nr, (unsigned long)((PAGE_SIZE>>10) << order)); + printk("%lu/%lu*%lukB ", nr, free_area[order].inuse, (unsigned long)((PAGE_SIZE>>10) << order)); } + for (nr=0; nr<max_mapnr; nr++) + if (PGtype_data == mem_map[nr].type) + total_data++; + printk("= %lukB, data = %lukB)\n", total, ((total_data << PAGE_SHIFT)>>10)); +#ifdef PGTABLE_LINK_DEBUG + printk("pte_links: %lu of zero: %lu\n", nr_pte_links, nr_zero_pte_links); + printk("pte_unlinks: %lu of zero: %lu\n", nr_pte_unlinks, nr_zero_pte_unlinks); +#endif #ifdef SWAP_CACHE_INFO show_swap_cache_info(); #endif @@ -288,7 +329,7 @@ atomic_set(&p->count, 0); p->flags = (1 << PG_DMA) | (1 << PG_reserved); p->map_nr = p - mem_map; - } while (p > mem_map); + } while (p >= mem_map); for (i = 0 ; i < NR_MEM_LISTS ; i++) { unsigned long bitmap_size; @@ -327,6 +368,7 @@ oom(tsk); return; } + mem_map[MAP_NR(page)].type = PGtype_data; read_swap_page(entry, (char *) page); if (pte_val(*page_table) != entry) { free_page(page); @@ -334,6 +376,8 @@ } vma->vm_mm->rss++; tsk->maj_flt++; + mem_map[MAP_NR(page)].type = PGtype_data; + pte_link(page_table, page); if (!write_access && add_to_swap_cache(&mem_map[MAP_NR(page)], entry)) { /* keep swap page allocated for the moment (swap cache) */ set_pte(page_table, mk_pte(page, vma->vm_page_prot)); diff -ur 2.1.66/mm/page_io.c 2.1.66-pte_list/mm/page_io.c --- 2.1.66/mm/page_io.c Mon Jul 14 00:20:11 1997 +++ 2.1.66-pte_list/mm/page_io.c Wed Nov 26 23:10:34 1997 @@ -85,7 +85,7 @@ set_bit(PG_swap_unlock_after, &page->flags); /* swap-cache shouldn't be set, but play safe */ 
PageClearSwapCache(page); - page->pg_swap_entry = entry; + page->u.d.pg_swap_entry = entry; atomic_inc(&nr_async_pages); } ll_rw_page(rw,p->swap_device,offset,buf); diff -ur 2.1.66/mm/swap_state.c 2.1.66-pte_list/mm/swap_state.c --- 2.1.66/mm/swap_state.c Thu Oct 30 13:52:30 1997 +++ 2.1.66-pte_list/mm/swap_state.c Wed Nov 26 23:10:34 1997 @@ -46,7 +46,7 @@ swap_cache_add_total++; #endif if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) { - page->pg_swap_entry = entry; + page->u.d.pg_swap_entry = entry; if (PageTestandSetSwapCache(page)) printk("swap_cache: replacing non-empty entry\n"); #ifdef SWAP_CACHE_INFO diff -ur 2.1.66/mm/swapfile.c 2.1.66-pte_list/mm/swapfile.c --- 2.1.66/mm/swapfile.c Thu Oct 30 13:52:30 1997 +++ 2.1.66-pte_list/mm/swapfile.c Thu Nov 27 00:14:50 1997 @@ -201,6 +201,7 @@ } if (pte_val(pte) != entry) return 0; + pte_link(dir, page); set_pte(dir, pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)))); ++vma->vm_mm->rss; swap_free(entry); diff -ur 2.1.66/mm/vmalloc.c 2.1.66-pte_list/mm/vmalloc.c --- 2.1.66/mm/vmalloc.c Mon Jul 7 11:18:56 1997 +++ 2.1.66-pte_list/mm/vmalloc.c Wed Nov 26 23:10:34 1997 @@ -37,6 +37,9 @@ if (pte_none(page)) continue; if (pte_present(page)) { +#if 0 + pte_unlink(pte, pte_page(page)); +#endif free_page(pte_page(page)); continue; } @@ -99,6 +102,10 @@ if (!page) return -ENOMEM; set_pte(pte, mk_pte(page, PAGE_KERNEL)); +#if 0 + mem_map[MAP_NR(page)].type = PGtype_data; + pte_link(pte, page); +#endif address += PAGE_SIZE; pte++; } diff -ur 2.1.66/mm/vmscan.c 2.1.66-pte_list/mm/vmscan.c --- 2.1.66/mm/vmscan.c Thu Oct 23 16:30:25 1997 +++ 2.1.66-pte_list/mm/vmscan.c Thu Nov 27 01:04:19 1997 @@ -7,7 +7,8 @@ * kswapd added: 7.1.96 sct * Removed kswapd_ctl limits, and swap out as many pages as needed * to bring the system back to free_pages_high: 2.4.97, Rik van Riel. - * Version: $Id: vmscan.c,v 1.23 1997/04/12 04:31:05 davem Exp $ + * Major rewrite in honour of The Great Pumpkin 28.7.97, Benjamin LaHaise. + * Version: $Id: vmscan.c,v 1.47 1997/08/06 02:55:28 blah Exp $ */ #include @@ -22,14 +23,18 @@ #include #include #include +#include #include #include +#define inline /**/ + /* * When are we next due for a page scan? */ -static int next_swap_jiffies = 0; +static int last_swap_jiffies = 0; +static int last_age_jiffies = 0; /* * How often do we do a pageout scan during normal conditions? @@ -49,6 +54,17 @@ static void init_swap_timer(void); +static void kill_mm(struct mm_struct *mm) +{ + struct task_struct *t; + read_lock(&tasklist_lock); + for_each_task(t) { + if (t->mm == mm) + send_sig(SIGBUS, t, 1); + } + read_unlock(&tasklist_lock); +} + /* * The swap-out functions return 1 if they successfully * threw something out, and we got a free page. It returns @@ -60,60 +76,78 @@ * using a process that no longer actually exists (it might * have died while we slept). */ -static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma, - unsigned long address, pte_t * page_table, int dma, int wait) +static inline int swap_out_pte (pte_t *page_table, struct page *page_map, int wait) { pte_t pte; unsigned long entry; unsigned long page; - struct page * page_map; + struct vm_area_struct *vma; + unsigned long address; + pmd_t *pmd_p; + pgd_t *pgd_p; + struct mm_struct *mm; pte = *page_table; - if (!pte_present(pte)) - return 0; page = pte_page(pte); - if (MAP_NR(page) >= max_mapnr) +#ifdef PGTABLE_LINK_DEBUG + if (!pte_present(pte)) { + printk("try_to_swap_out: Aiee! page(%lu) not present! 
pte=%08lx\n", page_map->map_nr, pte_val(pte)); return 0; - - page_map = mem_map + MAP_NR(page); - if (PageReserved(page_map) - || PageLocked(page_map) - || (dma && !PageDMA(page_map))) + } + if (page_map->map_nr != (page_map - mem_map)) { + printk("try_to_swap_out: Aiee! page_map->map_nr(%lu) disagrees with page_map(%u)\n", page_map->map_nr, page_map-mem_map); + return 0; + } +#endif + pmd_p = pte_getpmd(page_table); + if (!pmd_p) { + printk("swap_out_pte: no pmd!\n"); + return 0; + } + pgd_p = pmd_getpgd(pmd_p); + if (!pgd_p) { + printk("swap_out_pte: no pgd!\n"); return 0; - /* Deal with page aging. Pages age from being unused; they - * rejuvenate on being accessed. Only swap old pages (age==0 - * is oldest). */ - if ((pte_dirty(pte) && delete_from_swap_cache(page_map)) - || pte_young(pte)) { - set_pte(page_table, pte_mkold(pte)); - touch_page(page_map); + } + mm = pgd_getmm(pgd_p); + if (!mm) { + printk("swap_out_pte: no mm!\n"); return 0; } - age_page(page_map); - if (page_map->age) + address = ptepmdpgd_getaddr(page_table, pmd_p, pgd_p); + vma = find_vma(mm, address); + /* Don't swap out areas like shared memory which have their + own separate swapping mechanism or areas which are locked down */ + if (vma->vm_flags & (VM_SHM | VM_LOCKED)) return 0; if (pte_dirty(pte)) { if (vma->vm_ops && vma->vm_ops->swapout) { - pid_t pid = tsk->pid; vma->vm_mm->rss--; if (vma->vm_ops->swapout(vma, address - vma->vm_start + vma->vm_offset, page_table)) - kill_proc(pid, SIGBUS, 1); + kill_mm(vma->vm_mm); } else { - if (atomic_read(&page_map->count) != 1) + /* this isn't right: can only happen with a private map that's shared and dirty */ +#if 0 + if (atomic_read(&page_map->count) != 1) { + printk("swap_out_pte: pte_dirty, page shared(%d), but no swapout operation!\n", atomic_read(&page_map->count)); return 0; + } +#endif if (!(entry = get_swap_page())) return 0; vma->vm_mm->rss--; flush_cache_page(vma, address); set_pte(page_table, __pte(entry)); + pte_unlink(page_table, page); flush_tlb_page(vma, address); - tsk->nswap++; + mm->nswap++; rw_swap_page(WRITE, entry, (char *) page, wait); } + entry = atomic_read(&page_map->count); free_page(page); - return 1; /* we slept: the process may not exist any more */ + return entry; /* we slept: the process may not exist any more */ } - if ((entry = find_in_swap_cache(page_map))) { + if ((entry = find_in_swap_cache(page_map))) { if (atomic_read(&page_map->count) != 1) { set_pte(page_table, pte_mkdirty(pte)); printk("Aiee.. duplicated cached swap-cache entry\n"); @@ -122,262 +156,223 @@ vma->vm_mm->rss--; flush_cache_page(vma, address); set_pte(page_table, __pte(entry)); + pte_unlink(page_table, page); flush_tlb_page(vma, address); + entry = atomic_read(&page_map->count); free_page(page); - return 1; + return entry; } vma->vm_mm->rss--; flush_cache_page(vma, address); pte_clear(page_table); + pte_unlink(page_table, page); flush_tlb_page(vma, address); - entry = page_unuse(page); + entry = page_unuse(page); free_page(page); return entry; } -/* - * A new implementation of swap_out(). We do not swap complete processes, - * but only a small number of blocks, before we continue with the next - * process. The number of blocks actually swapped is determined on the - * number of page faults, that this process actually had in the last time, - * so we won't swap heavily used processes all the time ... - * - * Note: the priority argument is a hint on much CPU to waste with the - * swap block search, not a hint, of how much blocks to swap with - * each process. 
- * - * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de - */ - -static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * vma, - pmd_t *dir, unsigned long address, unsigned long end, int dma, int wait) +static inline int try_to_swap_page (struct page *page, int wait) { - pte_t * pte; - unsigned long pmd_end; - - if (pmd_none(*dir)) - return 0; - if (pmd_bad(*dir)) { - printk("swap_out_pmd: bad pmd (%08lx)\n", pmd_val(*dir)); - pmd_clear(dir); - return 0; + pte_t *pte_p; + int tries = atomic_read(&page->count)+1; + if (PGtype_data != page->type) + goto buffers; +#ifdef DEBUG + printk(KERN_DEBUG "kswapd: try_to_swap_page(NR=%lu,%d) tries=%d\n", page->map_nr, wait, tries); +#endif + while ((pte_p = page->u.d.pte_head) && (--tries > 0)) { +#ifdef PGTABLE_LINK_DEBUG + /* + * Hack Alert!!!: by going to the next pte in the chain, + * we make pte_unlink(pte_p) O(1) rather than O(page->count) + * The marvels of Singly Linked Lists - quack! + */ + pte_p = pte_next(pte_p); +#endif + switch (swap_out_pte(pte_p, page, wait)) { + case 0: /* something prevented the unuse of this page... uh-oh */ + goto out_fail; + case 1: /* page was freed */ + goto out_good; + } + if (PageLocked(page)) { + printk("try_to_swap_page(%lu,%d): page now locked\n", page->map_nr, wait); + goto out_fail; + } } - - pte = pte_offset(dir, address); - - pmd_end = (address + PMD_SIZE) & PMD_MASK; - if (end > pmd_end) - end = pmd_end; - - do { - int result; - tsk->swap_address = address + PAGE_SIZE; - result = try_to_swap_out(tsk, vma, address, pte, dma, wait); - if (result) - return result; - address += PAGE_SIZE; - pte++; - } while (address < end); + if (1 != atomic_read(&page->count)) + goto out_fail; + if (page->inode) { +#ifdef DEBUG + printk(KERN_DEBUG "kswapd: try_to_swap_page: freeing page cache page\n"); +#endif + remove_page_from_hash_queue(page); + remove_page_from_inode_queue(page); + __free_page(page); + goto out_good; + } +buffers: + if (!test_and_clear_bit(PG_referenced, &page->flags)) { + struct buffer_head *bh = page->buffers; + if (bh && try_to_free_buffer(bh, &bh, 6)) + goto out_good; + } +out_fail: +#ifdef DEBUG + printk(KERN_DEBUG "kswapd: try_to_swap_page: failed\n"); +#endif + return 1; +out_good: return 0; } -static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * vma, - pgd_t *dir, unsigned long address, unsigned long end, int dma, int wait) +static inline void do_age_page(struct page *page) { - pmd_t * pmd; - unsigned long pgd_end; - - if (pgd_none(*dir)) - return 0; - if (pgd_bad(*dir)) { - printk("swap_out_pgd: bad pgd (%08lx)\n", pgd_val(*dir)); - pgd_clear(dir); - return 0; + struct buffer_head *bh; + pte_t *pte_p; + int touches = 0; + + if (page->map_nr != (page - mem_map)) { + printk("do_age_page: Aiee! page-mem_map(%u) disagrees with page->map_nr(%lu)\n", (page-mem_map), page->map_nr); + return; } + if (PGtype_data != page->type) + goto buffers; + /* + * lock the page while mucking with it, as otherwise + * interrupts can steal it out from under us. 
+ */ + if (test_and_set_bit(PG_locked, &page->flags)) + return; + pte_p = page->u.d.pte_head; + if (!pte_p) + goto buffers_unlock; - pmd = pmd_offset(dir, address); - - pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK; - if (end > pgd_end) - end = pgd_end; - do { - int result = swap_out_pmd(tsk, vma, pmd, address, end, dma, wait); - if (result) - return result; - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address < end); - return 0; -} + pte_t pte = *pte_p; + if (!pte_present(pte)) { + printk("update_page: Aiee - pte not present!\n"); + goto next; + } + if ((pte_dirty(pte) && delete_from_swap_cache(page)) + ||pte_young(pte)) { + set_pte(pte_p, pte_mkold(pte)); + touches++; + } +next: + pte_p = pte_next(pte_p); + } while (pte_p != page->u.d.pte_head) ; +buffers_unlock: + clear_bit(PG_locked, &page->flags); +buffers: + bh = page->buffers; + if (bh) { + struct buffer_head *tmp = bh; + int ref = 0; + do { + if (buffer_touched(tmp)) { + clear_bit(BH_Touched, &tmp->b_state); + set_bit(PG_referenced, &page->flags); + ref = 1; + } + tmp = tmp->b_this_page; + } while (tmp != bh); + if (ref) + touches++; + } -static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma, - pgd_t *pgdir, unsigned long start, int dma, int wait) -{ - unsigned long end; + if (touches) + touch_page(page); + else + age_page(page); +} - /* Don't swap out areas like shared memory which have their - own separate swapping mechanism or areas which are locked down */ - if (vma->vm_flags & (VM_SHM | VM_LOCKED)) - return 0; +static int lowest_age = 0; - end = vma->vm_end; - while (start < end) { - int result = swap_out_pgd(tsk, vma, pgdir, start, end, dma, wait); - if (result) - return result; - start = (start + PGDIR_SIZE) & PGDIR_MASK; - pgdir++; +static void age_sweep(void) +{ + unsigned long active = 0; + struct page *page_map; + const struct page *max_map = mem_map+max_mapnr; + + lowest_age = MAX_PAGE_AGE; + for (page_map = mem_map; page_map < max_map; + page_map += 1 << page_map->order ) { + if ( + page_map->type == PGtype_free + || PageReserved(page_map) + || PageLocked(page_map)) + continue; + active++; + do_age_page(page_map); + if (page_map->age < lowest_age) + lowest_age = page_map->age; } - return 0; +#ifdef DEBUG + printk(KERN_DEBUG "kswapd: age_sweep finished (%lu active)\n", active); +#endif } -static int swap_out_process(struct task_struct * p, int dma, int wait) +static int swap_out(int num, int dma, int wait) { - unsigned long address; - struct vm_area_struct* vma; - - /* - * Go through process' page directory. 
- */ - address = p->swap_address; - p->swap_address = 0; - - /* - * Find the proper vm-area - */ - vma = find_vma(p->mm, address); - if (!vma) + static unsigned long clock = 0; + unsigned long limit = max_mapnr; + int swapped = 0; + unsigned long reps = max_mapnr<<1; + struct page *page; +#ifdef DEBUG + printk(KERN_DEBUG "kswapd: swap_out(%d,%d,%d) started lowest_age=%d\n", num, dma, wait, lowest_age); +#endif + for (page = mem_map+clock; + swapped < num && --reps > 0; + page = mem_map+clock) { + { register int order = 1 << page->order; + clock += order; } + if (clock >= limit) + clock = 0; + if ( + page->type == PGtype_free + || PageReserved(page) + || PageLocked(page) + || (dma && !PageDMA(page))) + goto cont; + do_age_page(page); + if (page->age <= lowest_age && + !try_to_swap_page(page, wait)) + swapped++; + else if (!test_and_clear_bit(PG_referenced, &page->flags)) { + struct buffer_head *bh = page->buffers; + if (bh && try_to_free_buffer(bh, &bh, 6)) + swapped++; + } +cont: + } + if (swapped < num) { + printk(KERN_DEBUG "swapd(%d,%d): only swapped %d\n", num, dma, swapped); return 0; - if (address < vma->vm_start) - address = vma->vm_start; - - for (;;) { - int result = swap_out_vma(p, vma, pgd_offset(p->mm, address), address, dma, wait); - if (result) - return result; - vma = vma->vm_next; - if (!vma) - break; - address = vma->vm_start; } - p->swap_address = 0; - return 0; + return 1; } -/* - * Select the task with maximal swap_cnt and try to swap out a page. - * N.B. This function returns only 0 or 1. Return values != 1 from - * the lower level routines result in continued processing. - */ -static int swap_out(unsigned int priority, int dma, int wait) -{ - struct task_struct * p, * pbest; - int counter, assign, max_cnt; - - /* - * We make one or two passes through the task list, indexed by - * assign = {0, 1}: - * Pass 1: select the swappable task with maximal swap_cnt. - * Pass 2: assign new swap_cnt values, then select as above. - * With this approach, there's no need to remember the last task - * swapped out. If the swap-out fails, we clear swap_cnt so the - * task won't be selected again until all others have been tried. - */ - counter = ((PAGEOUT_WEIGHT * nr_tasks) >> 10) >> priority; - for (; counter >= 0; counter--) { - assign = 0; - max_cnt = 0; - pbest = NULL; - select: - read_lock(&tasklist_lock); - p = init_task.next_task; - for (; p != &init_task; p = p->next_task) { - if (!p->swappable) - continue; - if (p->mm->rss <= 0) - continue; - if (assign) { - /* - * If we didn't select a task on pass 1, - * assign each task a new swap_cnt. - * Normalise the number of pages swapped - * by multiplying by (RSS / 1MB) - */ - p->swap_cnt = AGE_CLUSTER_SIZE(p->mm->rss); - } - if (p->swap_cnt > max_cnt) { - max_cnt = p->swap_cnt; - pbest = p; - } - } - read_unlock(&tasklist_lock); - if (!pbest) { - if (!assign) { - assign = 1; - goto select; - } - goto out; - } - pbest->swap_cnt--; - - switch (swap_out_process(pbest, dma, wait)) { - case 0: - /* - * Clear swap_cnt so we don't look at this task - * again until we've tried all of the others. - * (We didn't block, so the task is still here.) - */ - pbest->swap_cnt = 0; - break; - case 1: - return 1; - default: - break; - }; - } -out: - return 0; -} /* - * We are much more aggressive about trying to swap out than we used - * to be. This works out OK, because we now do proper aging on page - * contents. 
- */ -static inline int do_try_to_free_page(int priority, int dma, int wait) -{ - static int state = 0; - int i=6; - int stop; - + /* + * This will change big time... =) + */ +static inline int do_try_to_free_page(int priority, int dma, int order) +{ +#ifdef DEBUG + printk(KERN_DEBUG "do_try_to_free_page(%d,%d,%d)\n", priority, dma, order); +#endif /* Let the dcache know we're looking for memory ... */ shrink_dcache_memory(); /* Always trim SLAB caches when memory gets low. */ - (void) kmem_cache_reap(0, dma, wait); + kmem_cache_reap(priority, dma, 1); - /* we don't try as hard if we're not waiting.. */ - stop = 3; - if (wait) - stop = 0; - switch (state) { - do { - case 0: - if (shrink_mmap(i, dma)) - return 1; - state = 1; - case 1: - if (shm_swap(i, dma)) - return 1; - state = 2; - default: - if (swap_out(i, dma, wait)) - return 1; - state = 0; - i--; - } while ((i - stop) >= 0); - } - return 0; + /* try to swap out 2* the amount desired... will use order to add intelligence + * in the near future. + */ + return swap_out(2 << order, dma, 1); } /* @@ -387,12 +382,12 @@ * now we need this so that we can do page allocations * without holding the kernel lock etc. */ -int try_to_free_page(int priority, int dma, int wait) +int try_to_free_page(int priority, int dma, int order) { int retval; lock_kernel(); - retval = do_try_to_free_page(priority,dma,wait); + retval = do_try_to_free_page(priority,dma,order); unlock_kernel(); return retval; } @@ -405,15 +400,15 @@ */ void kswapd_setup(void) { - int i; - char *revision="$Revision: 1.23 $", *s, *e; + int i; + char *revision="$Revision: 1.47 $", *s, *e; - if ((s = strchr(revision, ':')) && - (e = strchr(s, '$'))) - s++, i = e - s; - else - s = revision, i = -1; - printk ("Starting kswapd v%.*s\n", i, s); + if ((s = strchr(revision, ':')) && + (e = strchr(s, '$'))) + s++, i = e - s; + else + s = revision, i = -1; + printk ("Starting kswapd v%.*s\n", i, s); } #define MAX_SWAP_FAIL 3 @@ -423,6 +418,7 @@ */ int kswapd(void *unused) { + int num, did_age; current->session = 1; current->pgrp = 1; sprintf(current->comm, "kswapd"); @@ -444,39 +440,28 @@ init_swap_timer(); while (1) { - int fail; - + last_swap_jiffies = jiffies; kswapd_awake = 0; current->signal = 0; - run_task_queue(&tq_disk); interruptible_sleep_on(&kswapd_wait); kswapd_awake = 1; swapstats.wakeups++; - /* Do the background pageout: - * We now only swap out as many pages as needed. - * When we are truly low on memory, we swap out - * synchronously (WAIT == 1). -- Rik. - * If we've had too many consecutive failures, - * go back to sleep to let other tasks run. - */ - for (fail = 0; fail++ < MAX_SWAP_FAIL;) { - int pages, wait; - - pages = nr_free_pages; - if (nr_free_pages >= min_free_pages) - pages += atomic_read(&nr_async_pages); - if (pages >= free_pages_high) - break; - wait = (pages < free_pages_low); - if (try_to_free_page(GFP_KERNEL, 0, wait)) - fail = 0; - } + if (lowest_age > 0 || + (jiffies - last_age_jiffies) >= HZ/4) { + age_sweep(); + last_age_jiffies = jiffies; + did_age = 1; + } else + did_age = 0; /* - * Report failure if we couldn't reach the minimum goal. + * This needs to be better tuned... we can end up with a + * 'thundering herd' of pages being pushed off to disk. */ - if (nr_free_pages < min_free_pages) - printk("kswapd: failed, got %d of %d\n", - nr_free_pages, min_free_pages); + num = free_pages_high - nr_free_pages - atomic_read(&nr_async_pages); + if (num <= 0) + num = did_age ? 
4 : 8; + swap_out(num, 0, 0); + run_task_queue(&tq_disk); } } @@ -486,23 +471,21 @@ void swap_tick(void) { - int want_wakeup = 0, memory_low = 0; - int pages = nr_free_pages + atomic_read(&nr_async_pages); - - if (pages < free_pages_low) - memory_low = want_wakeup = 1; - else if (pages < free_pages_high && jiffies >= next_swap_jiffies) - want_wakeup = 1; - - if (want_wakeup) { - if (!kswapd_awake) { - wake_up(&kswapd_wait); - need_resched = 1; - } - /* Set the next wake-up time */ - next_swap_jiffies = jiffies; - if (!memory_low) - next_swap_jiffies += swapout_interval; + int free_pages = nr_free_pages + atomic_read(&nr_async_pages); + if ( !kswapd_awake && ( +#if 1 + (free_pages <= free_pages_low && (long)(jiffies - last_swap_jiffies) >= HZ/25) || +#endif + (lowest_age > 0 && (long)(jiffies - last_swap_jiffies) >= HZ/6) || + (free_pages < free_pages_high*2 && (long)(jiffies - last_age_jiffies) >= 4*HZ) || + (free_pages < free_pages_high && (long)(jiffies - last_swap_jiffies) >= swapout_interval) || + ((long)(jiffies - last_age_jiffies) >= HZ*40) + )) { +#if 0 + printk("Awakening kswapd, jiffies=%lu last_swap_jiffies=%lu last_age_jiffies=%lu, free_pages=%d async=%d nr_free_pages=%d\n", jiffies, last_swap_jiffies, last_age_jiffies, free_pages, atomic_read(&nr_async_pages), nr_free_pages); +#endif + wake_up(&kswapd_wait); + need_resched = 1; } timer_active |= (1<<SWAP_TIMER); }
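A closing note on the pte_list machinery itself: page tables (and the pgd) are now allocated as order-1 pairs, with the second page acting as a shadow array of link pointers - pte_next(pte_p) simply dereferences pte_p + PAGE_SIZE. Each physical page's mem_map entry carries a pte_head, and all ptes mapping that page sit on a circular, singly linked list threaded through the shadow pages; that is what lets swap_out() start from a physical page and reach every pte mapping it without walking process page tables. The standalone model below (illustrative only - it mirrors the patch's pte_link/pte_unlink, with a plain array standing in for the shadow page) shows the list discipline:

        #include <assert.h>
        #include <stddef.h>

        #define NPTES 16

        typedef unsigned long pte_t;

        static pte_t ptes[NPTES];        /* stands in for a pte page */
        static pte_t *next_tab[NPTES];   /* stands in for the shadow page at +PAGE_SIZE */

        #define pte_next(p) (next_tab[(p) - ptes])

        struct page { pte_t *pte_head; };  /* the relevant piece of mem_map_t */

        static void pte_link(struct page *page, pte_t *pte_p)
        {
                /* Empty list: pte_p becomes the head, and the second store
                 * below then points it at itself (a one-element circle). */
                if (page->pte_head == NULL)
                        page->pte_head = pte_p;
                pte_next(pte_p) = pte_next(page->pte_head);
                pte_next(page->pte_head) = pte_p;
        }

        static void pte_unlink(struct page *page, pte_t *pte_p)
        {
                pte_t **pp = &pte_next(page->pte_head);
                while (*pp != pte_p)             /* walk the circle to pte_p */
                        pp = &pte_next(*pp);
                *pp = pte_next(*pp);             /* splice it out */
                pte_next(pte_p) = NULL;
                if (page->pte_head == pte_p)     /* head removed: successor or NULL */
                        page->pte_head = *pp;
        }

        int main(void)
        {
                struct page pg = { NULL };
                pte_t *p;
                int n = 0;

                pte_link(&pg, &ptes[0]);         /* three ptes map the same page */
                pte_link(&pg, &ptes[5]);
                pte_link(&pg, &ptes[9]);

                p = pg.pte_head;                 /* walk as swap_out() would */
                do {
                        n++;
                        p = pte_next(p);
                } while (p != pg.pte_head);
                assert(n == 3);

                pte_unlink(&pg, &ptes[5]);
                pte_unlink(&pg, &ptes[0]);
                pte_unlink(&pg, &ptes[9]);
                assert(pg.pte_head == NULL);     /* fully unwound */
                return 0;
        }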