From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from penguin.e-mind.com (penguin.e-mind.com [195.223.140.120]) by kvack.org (8.8.7/8.8.7) with ESMTP id TAA10005 for ; Mon, 4 Jan 1999 19:33:58 -0500 Date: Tue, 5 Jan 1999 01:32:17 +0100 (CET) From: Andrea Arcangeli Subject: Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm improvement , [Re: 2.2.0 Bug summary]] In-Reply-To: Message-ID: MIME-Version: 1.0 Content-Type: TEXT/PLAIN; charset=US-ASCII Sender: owner-linux-mm@kvack.org To: Linus Torvalds Cc: Alan Cox , steve@netplus.net, bredelin@ucsd.edu, sct@redhat.com, linux-kernel@vger.rutgers.edu, H.H.vanRiel@phys.uu.nl, linux-mm@kvack.org List-ID: On Mon, 4 Jan 1999, Andrea Arcangeli wrote: > I am going to do something like that right now... Here a new patch (arca-vm-7). It pratically removes kswapd for all places except the ATOMIC memory allocation if there aren't process that are just freeing memory. It returns also to the stock/arca-vm-3 shrink_mmap() (even if it seems slower). Index: linux/mm/vmscan.c diff -u linux/mm/vmscan.c:1.1.1.9 linux/mm/vmscan.c:1.1.1.1.2.64 --- linux/mm/vmscan.c:1.1.1.9 Sat Jan 2 15:46:20 1999 +++ linux/mm/vmscan.c Tue Jan 5 01:02:43 1999 @@ -10,6 +10,14 @@ * Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $ */ +/* + * Developed the balanced page freeing algorithm (do_free_user_and_cache). + * Developed a smart mechanism to handle the swapout weight. + * Allowed the process to swapout async and only then get the credit from + * the bank. This has doubled swapout performances and fluidness. + * Copyright (C) 1998 Andrea Arcangeli + */ + #include #include #include @@ -21,12 +29,15 @@ #include /* + * When are we next due for a page scan? + */ +static atomic_t nr_tasks_freeing_memory = ATOMIC_INIT(0); + +/* * The wait queue for waking up the pageout daemon: */ static struct task_struct * kswapd_task = NULL; -static void init_swap_timer(void); - /* * The swap-out functions return 1 if they successfully * threw something out, and we got a free page. It returns @@ -163,7 +174,7 @@ * cache. */ if (PageSwapCache(page_map)) { __free_page(page_map); - return (atomic_read(&page_map->count) == 0); + return 1; } add_to_swap_cache(page_map, entry); /* We checked we were unlocked way up above, and we @@ -195,7 +206,7 @@ flush_tlb_page(vma, address); swap_duplicate(entry); __free_page(page_map); - return (atomic_read(&page_map->count) == 0); + return 1; } /* * A clean page to be discarded? Must be mmap()ed from @@ -210,9 +221,8 @@ flush_cache_page(vma, address); pte_clear(page_table); flush_tlb_page(vma, address); - entry = (atomic_read(&page_map->count) == 1); __free_page(page_map); - return entry; + return 1; } /* @@ -230,7 +240,7 @@ */ static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * vma, - pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask) + pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask, unsigned long * counter) { pte_t * pte; unsigned long pmd_end; @@ -251,18 +261,20 @@ do { int result; - tsk->swap_address = address + PAGE_SIZE; result = try_to_swap_out(tsk, vma, address, pte, gfp_mask); + address += PAGE_SIZE; + tsk->swap_address = address; if (result) return result; - address += PAGE_SIZE; + if (!--*counter) + return 0; pte++; } while (address < end); return 0; } static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * vma, - pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask) + pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask, unsigned long * counter) { pmd_t * pmd; unsigned long pgd_end; @@ -282,9 +294,11 @@ end = pgd_end; do { - int result = swap_out_pmd(tsk, vma, pmd, address, end, gfp_mask); + int result = swap_out_pmd(tsk, vma, pmd, address, end, gfp_mask, counter); if (result) return result; + if (!*counter) + return 0; address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address < end); @@ -292,7 +306,7 @@ } static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma, - unsigned long address, int gfp_mask) + unsigned long address, int gfp_mask, unsigned long * counter) { pgd_t *pgdir; unsigned long end; @@ -306,16 +320,19 @@ end = vma->vm_end; while (address < end) { - int result = swap_out_pgd(tsk, vma, pgdir, address, end, gfp_mask); + int result = swap_out_pgd(tsk, vma, pgdir, address, end, gfp_mask, counter); if (result) return result; + if (!*counter) + return 0; address = (address + PGDIR_SIZE) & PGDIR_MASK; pgdir++; } return 0; } -static int swap_out_process(struct task_struct * p, int gfp_mask) +static int swap_out_process(struct task_struct * p, int gfp_mask, + unsigned long * counter) { unsigned long address; struct vm_area_struct* vma; @@ -334,9 +351,12 @@ address = vma->vm_start; for (;;) { - int result = swap_out_vma(p, vma, address, gfp_mask); + int result = swap_out_vma(p, vma, address, gfp_mask, + counter); if (result) return result; + if (!*counter) + return 0; vma = vma->vm_next; if (!vma) break; @@ -350,6 +370,25 @@ return 0; } +static inline unsigned long calc_swapout_weight(int priority) +{ + struct task_struct * p; + unsigned long total_vm = 0; + + read_lock(&tasklist_lock); + for_each_task(p) + { + if (!p->swappable) + continue; + if (p->mm->rss == 0) + continue; + total_vm += p->mm->total_vm; + } + read_unlock(&tasklist_lock); + + return total_vm / (priority+1); +} + /* * Select the task with maximal swap_cnt and try to swap out a page. * N.B. This function returns only 0 or 1. Return values != 1 from @@ -358,7 +397,10 @@ static int swap_out(unsigned int priority, int gfp_mask) { struct task_struct * p, * pbest; - int counter, assign, max_cnt; + int assign; + unsigned long counter, max_cnt; + + counter = calc_swapout_weight(priority); /* * We make one or two passes through the task list, indexed by @@ -374,23 +416,17 @@ * Think of swap_cnt as a "shadow rss" - it tells us which process * we want to page out (always try largest first). */ - counter = nr_tasks / (priority+1); - if (counter < 1) - counter = 1; - if (counter > nr_tasks) - counter = nr_tasks; - - for (; counter >= 0; counter--) { + while (counter != 0) { assign = 0; max_cnt = 0; pbest = NULL; select: read_lock(&tasklist_lock); - p = init_task.next_task; - for (; p != &init_task; p = p->next_task) { + for_each_task(p) + { if (!p->swappable) continue; - if (p->mm->rss <= 0) + if (p->mm->rss == 0) continue; /* Refresh swap_cnt? */ if (assign) @@ -410,10 +446,11 @@ } /* - * Nonzero means we cleared out something, but only "1" means - * that we actually free'd up a page as a result. + * Nonzero means we cleared out something, and "1" means + * that we actually moved a page from the process memory + * to the swap cache (it's not been freed yet). */ - if (swap_out_process(pbest, gfp_mask) == 1) + if (swap_out_process(pbest, gfp_mask, &counter)) return 1; } out: @@ -441,39 +478,62 @@ printk ("Starting kswapd v%.*s\n", i, s); } -#define free_memory(fn) \ - count++; do { if (!--count) goto done; } while (fn) +static int do_free_user_and_cache(int priority, int gfp_mask) +{ + if (shrink_mmap(priority, gfp_mask)) + return 1; -static int kswapd_free_pages(int kswapd_state) + /* + * NOTE: Here we allow also the process to do async swapout + * because the swapout is really only a credit at the bank of + * free memory right now. So we don't care to have it _now_. + * Allowing async I/O we are going to improve drammatically + * swapout performance -arca (discovered this afternoon ;) 980105 + */ + if (swap_out(priority, gfp_mask & ~__GFP_WAIT)) + /* + * We done at least some swapping progress so return 1 in + * this case. -arca + */ + return 1; + + return 0; +} + +static int do_free_page(int * state, int gfp_mask) { - unsigned long end_time; + int priority = 8; - /* Always trim SLAB caches when memory gets low. */ - kmem_cache_reap(0); + switch (*state) { + do { + case 0: + if (do_free_user_and_cache(priority, gfp_mask)) + return 1; + *state = 1; + case 1: + if (shm_swap(priority, gfp_mask)) + return 1; + *state = 0; - /* max one hundreth of a second */ - end_time = jiffies + (HZ-1)/100; - do { - int priority = 8; - int count = pager_daemon.swap_cluster; + shrink_dcache_memory(priority, gfp_mask); + kmem_cache_reap(gfp_mask); + } while (--priority >= 0); + } + return 0; +} - switch (kswapd_state) { - do { - default: - free_memory(shrink_mmap(priority, 0)); - free_memory(swap_out(priority, 0)); - kswapd_state++; - case 1: - free_memory(shm_swap(priority, 0)); - shrink_dcache_memory(priority, 0); - kswapd_state = 0; - } while (--priority >= 0); - return kswapd_state; - } -done: - if (nr_free_pages > freepages.high + pager_daemon.swap_cluster) +static int kswapd_free_pages(int kswapd_state) +{ + for(;;) + { + do_free_page(&kswapd_state, 0); + if (nr_free_pages > freepages.high) + break; + if (atomic_read(&nr_tasks_freeing_memory)) break; - } while (time_before_eq(jiffies,end_time)); + if (kswapd_task->need_resched) + schedule(); + }; return kswapd_state; } @@ -496,13 +556,6 @@ lock_kernel(); /* - * Set the base priority to something smaller than a - * regular process. We will scale up the priority - * dynamically depending on how much memory we need. - */ - current->priority = (DEF_PRIORITY * 2) / 3; - - /* * Tell the memory management that we're a "memory allocator", * and that if we need more memory we should get access to it * regardless (see "try_to_free_pages()"). "kswapd" should @@ -516,7 +569,6 @@ */ current->flags |= PF_MEMALLOC; - init_swap_timer(); kswapd_task = current; while (1) { int state = 0; @@ -543,107 +595,37 @@ * if we need more memory as part of a swap-out effort we * will just silently return "success" to tell the page * allocator to accept the allocation. - * - * We want to try to free "count" pages, and we need to - * cluster them so that we get good swap-out behaviour. See - * the "free_memory()" macro for details. */ int try_to_free_pages(unsigned int gfp_mask, int count) { - int retval; - + int retval = 1; lock_kernel(); - /* Always trim SLAB caches when memory gets low. */ - kmem_cache_reap(gfp_mask); - - retval = 1; if (!(current->flags & PF_MEMALLOC)) { - int priority; - current->flags |= PF_MEMALLOC; - - priority = 8; - do { - free_memory(shrink_mmap(priority, gfp_mask)); - free_memory(shm_swap(priority, gfp_mask)); - free_memory(swap_out(priority, gfp_mask)); - shrink_dcache_memory(priority, gfp_mask); - } while (--priority >= 0); - retval = 0; -done: + atomic_inc(&nr_tasks_freeing_memory); + while (count--) + { + static int state = 0; + if (!do_free_page(&state, gfp_mask)) + { + retval = 0; + break; + } + } + atomic_dec(&nr_tasks_freeing_memory); current->flags &= ~PF_MEMALLOC; } unlock_kernel(); return retval; } - -/* - * Wake up kswapd according to the priority - * 0 - no wakeup - * 1 - wake up as a low-priority process - * 2 - wake up as a normal process - * 3 - wake up as an almost real-time process - * - * This plays mind-games with the "goodness()" - * function in kernel/sched.c. - */ -static inline void kswapd_wakeup(struct task_struct *p, int priority) -{ - if (priority) { - p->counter = p->priority << priority; - wake_up_process(p); - } -} -/* - * The swap_tick function gets called on every clock tick. - */ -void swap_tick(void) +void kswapd_wakeup(void) { - struct task_struct *p = kswapd_task; - - /* - * Only bother to try to wake kswapd up - * if the task exists and can be woken. - */ - if (p && (p->state & TASK_INTERRUPTIBLE)) { - unsigned int pages; - int want_wakeup; - - /* - * Schedule for wakeup if there isn't lots - * of free memory or if there is too much - * of it used for buffers or pgcache. - * - * "want_wakeup" is our priority: 0 means - * not to wake anything up, while 3 means - * that we'd better give kswapd a realtime - * priority. - */ - want_wakeup = 0; - pages = nr_free_pages; - if (pages < freepages.high) - want_wakeup = 1; - if (pages < freepages.low) - want_wakeup = 2; - if (pages < freepages.min) - want_wakeup = 3; - - kswapd_wakeup(p,want_wakeup); - } - - timer_active |= (1<state & TASK_INTERRUPTIBLE) && + !atomic_read(&nr_tasks_freeing_memory)) + wake_up_process(p); } Index: linux/mm/page_alloc.c diff -u linux/mm/page_alloc.c:1.1.1.5 linux/mm/page_alloc.c:1.1.1.1.2.15 --- linux/mm/page_alloc.c:1.1.1.5 Sun Jan 3 20:42:44 1999 +++ linux/mm/page_alloc.c Tue Jan 5 01:13:00 1999 @@ -151,7 +151,6 @@ if (!PageReserved(page) && atomic_dec_and_test(&page->count)) { if (PageSwapCache(page)) panic ("Freeing swap cache page"); - page->flags &= ~(1 << PG_referenced); free_pages_ok(page->map_nr, 0); return; } @@ -173,7 +172,6 @@ if (atomic_dec_and_test(&map->count)) { if (PageSwapCache(map)) panic ("Freeing swap cache pages"); - map->flags &= ~(1 << PG_referenced); free_pages_ok(map_nr, order); return; } @@ -260,7 +258,8 @@ if (nr_free_pages > freepages.min) { if (!current->trashing_memory) goto ok_to_allocate; - if (nr_free_pages > freepages.low) { + if (!(current->flags & PF_MEMALLOC) && + nr_free_pages > freepages.low) { current->trashing_memory = 0; goto ok_to_allocate; } @@ -271,8 +270,11 @@ * memory. */ current->trashing_memory = 1; - if (!try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX) && !(gfp_mask & (__GFP_MED | __GFP_HIGH))) + if (!try_to_free_pages(gfp_mask, freepages.high - nr_free_pages + 1<count); if (PageSwapCache(page)) { +#if 0 /* PARANOID */ if (page->inode != &swapper_inode) panic("swap cache page has wrong inode\n"); +#endif count += swap_count(page->offset) - 2; } if (PageFreeAfter(page)) Andrea Arcangeli -- This is a majordomo managed list. To unsubscribe, send a message with the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org